From 02db1228ed9ca3728ae0685a5e1602fe7299ae50 Mon Sep 17 00:00:00 2001
From: Ville Kallioniemi <ville.kallioniemi@gmail.com>
Date: Tue, 26 Jan 2016 23:41:01 -0700
Subject: [PATCH 01/24] Add constructor for long types.

---
 unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h | 6 ++++++
 1 file changed, 6 insertions(+)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
index 4f2adb671..19352eb5e 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
@@ -40,6 +40,12 @@ struct TensorUInt128
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
   TensorUInt128(unsigned int x) : high(0), low(x) { }
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+  TensorUInt128(long x) : high(0), low(x) {
+    eigen_assert(x >= 0);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+  TensorUInt128(unsigned long x) : high(0), low(x) { }
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
   TensorUInt128(int64_t x) : high(0), low(x) {
     eigen_assert(x >= 0);
   }

From 6b5dff875e4ba2235f255b7cf0a86b7abed21df0 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Mon, 1 Feb 2016 12:46:32 -0800
Subject: [PATCH 02/24] Made it possible to limit the number of blocks that
 will be used to evaluate a tensor expression on a CUDA device. This makesit
 possible to set aside streaming multiprocessors for other computations.

---
 .../Eigen/CXX11/src/Tensor/TensorDeviceCuda.h        | 12 +++++++++---
 unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h  |  4 ++--
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
index 5abdc489b..e684ab8f7 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
@@ -109,10 +109,12 @@ class CudaStreamDevice : public StreamInterface {
 struct GpuDevice {
   // The StreamInterface is not owned: the caller is
   // responsible for its initialization and eventual destruction.
-  explicit GpuDevice(const StreamInterface* stream) : stream_(stream) {
+  explicit GpuDevice(const StreamInterface* stream) : stream_(stream), max_blocks_(INT_MAX) {
+    eigen_assert(stream);
+  }
+  explicit GpuDevice(const StreamInterface* stream, int num_blocks) : stream_(stream), max_blocks_(num_blocks) {
     eigen_assert(stream);
   }
-
   // TODO(bsteiner): This is an internal API, we should not expose it.
   EIGEN_STRONG_INLINE const cudaStream_t& stream() const {
     return stream_->stream();
@@ -246,6 +248,10 @@ struct GpuDevice {
 #endif
   }
 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxBlocks() const {
+    return max_blocks_;
+  }
+
   // This function checks if the CUDA runtime recorded an error for the
   // underlying stream device.
   inline bool ok() const {
@@ -259,7 +265,7 @@ struct GpuDevice {
 
  private:
   const StreamInterface* stream_;
-
+  int max_blocks_;
 };
 
 #ifndef __CUDA_ARCH__
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index d2ab70f2b..df15c6204 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -220,7 +220,7 @@ EIGEN_DEVICE_FUNC inline void TensorExecutor<Expression, GpuDevice, false>::run(
   if (needs_assign)
   {
     const int block_size = device.maxCudaThreadsPerBlock();
-    const int max_blocks = device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size;
+    const int max_blocks = numext::maxi<int>(device.maxBlocks(), device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size);
     const Index size = array_prod(evaluator.dimensions());
     // Create a least one block to ensure we won't crash if we're called with tensors of size 0.
     const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, (size + block_size - 1) / block_size), 1);
@@ -239,7 +239,7 @@ EIGEN_DEVICE_FUNC inline void TensorExecutor<Expression, GpuDevice, true>::run(c
   if (needs_assign)
   {
     const int block_size = device.maxCudaThreadsPerBlock();
-    const int max_blocks = device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size;
+    const int max_blocks = numext::maxi<int>(device.maxBlocks(), device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size);
     const Index size = array_prod(evaluator.dimensions());
     // Create a least one block to ensure we won't crash if we're called with tensors of size 0.
     const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, (size + block_size - 1) / block_size), 1);

From 922b5f527b56ba2fcf1e5a4da5216a29afdbb312 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Mon, 1 Feb 2016 13:30:49 -0800
Subject: [PATCH 03/24] Silenced a few compilation warnings

---
 unsupported/test/cxx11_tensor_device.cu | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/unsupported/test/cxx11_tensor_device.cu b/unsupported/test/cxx11_tensor_device.cu
index 80cf9ffba..cbe9e6449 100644
--- a/unsupported/test/cxx11_tensor_device.cu
+++ b/unsupported/test/cxx11_tensor_device.cu
@@ -109,19 +109,19 @@ struct GPUContext {
 
 // The actual expression to evaluate
 template <typename Context>
-static void test_contextual_eval(Context* context)
+void test_contextual_eval(Context* context)
 {
   context->out().device(context->device()) = context->in1() + context->in2() * 3.14f + context->in1().constant(2.718f);
 }
 
 template <typename Context>
-static void test_forced_contextual_eval(Context* context)
+void test_forced_contextual_eval(Context* context)
 {
   context->out().device(context->device()) = (context->in1() + context->in2()).eval() * 3.14f + context->in1().constant(2.718f);
 }
 
 template <typename Context>
-static void test_compound_assignment(Context* context)
+void test_compound_assignment(Context* context)
 {
   context->out().device(context->device()) = context->in1().constant(2.718f);
   context->out().device(context->device()) += context->in1() + context->in2() * 3.14f;
@@ -129,7 +129,7 @@ static void test_compound_assignment(Context* context)
 
 
 template <typename Context>
-static void test_contraction(Context* context)
+void test_contraction(Context* context)
 {
   Eigen::array<std::pair<int, int>, 2> dims;
   dims[0] = std::make_pair(1, 1);
@@ -145,7 +145,7 @@ static void test_contraction(Context* context)
 
 
 template <typename Context>
-static void test_1d_convolution(Context* context)
+void test_1d_convolution(Context* context)
 {
   Eigen::DSizes<int, 3> indices(0,0,0);
   Eigen::DSizes<int, 3> sizes(40,49,70);
@@ -155,7 +155,7 @@ static void test_1d_convolution(Context* context)
 }
 
 template <typename Context>
-static void test_2d_convolution(Context* context)
+void test_2d_convolution(Context* context)
 {
   Eigen::DSizes<int, 3> indices(0,0,0);
   Eigen::DSizes<int, 3> sizes(40,49,69);
@@ -165,7 +165,7 @@ static void test_2d_convolution(Context* context)
 }
 
 template <typename Context>
-static void test_3d_convolution(Context* context)
+void test_3d_convolution(Context* context)
 {
   Eigen::DSizes<int, 3> indices(0,0,0);
   Eigen::DSizes<int, 3> sizes(39,49,69);
@@ -175,7 +175,7 @@ static void test_3d_convolution(Context* context)
 }
 
 
-static void test_cpu() {
+void test_cpu() {
   Eigen::Tensor<float, 3> in1(40,50,70);
   Eigen::Tensor<float, 3> in2(40,50,70);
   Eigen::Tensor<float, 3> out(40,50,70);
@@ -267,7 +267,7 @@ static void test_cpu() {
   }
 }
 
-static void test_gpu() {
+void test_gpu() {
   Eigen::Tensor<float, 3> in1(40,50,70);
   Eigen::Tensor<float, 3> in2(40,50,70);
   Eigen::Tensor<float, 3> out(40,50,70);

From 0ce5d32be583c0a2592158ad59ce7ad11125d645 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Mon, 1 Feb 2016 13:33:23 -0800
Subject: [PATCH 04/24] Sharded the cxx11_tensor_contract_cuda test

---
 .../test/cxx11_tensor_contract_cuda.cu        | 48 ++++++++++---------
 1 file changed, 25 insertions(+), 23 deletions(-)

diff --git a/unsupported/test/cxx11_tensor_contract_cuda.cu b/unsupported/test/cxx11_tensor_contract_cuda.cu
index cbd902d6a..2c3cf64a9 100644
--- a/unsupported/test/cxx11_tensor_contract_cuda.cu
+++ b/unsupported/test/cxx11_tensor_contract_cuda.cu
@@ -22,7 +22,7 @@ using Eigen::Tensor;
 typedef Tensor<float, 1>::DimensionPair DimPair;
 
 template<int DataLayout>
-static void test_cuda_contraction(int m_size, int k_size, int n_size)
+void test_cuda_contraction(int m_size, int k_size, int n_size)
 {
   std::cout << "Calling with (" << m_size << "," << k_size << "," << n_size << ")" << std::endl;
   // with these dimensions, the output has 300 * 140 elements, which is
@@ -88,37 +88,39 @@ static void test_cuda_contraction(int m_size, int k_size, int n_size)
 void test_cxx11_tensor_cuda()
 {
   std::cout << "Calling contraction tests" << std::endl;
-  CALL_SUBTEST(test_cuda_contraction<ColMajor>(128, 128, 128));
-  CALL_SUBTEST(test_cuda_contraction<RowMajor>(128, 128, 128));
+  CALL_SUBTEST_1(test_cuda_contraction<ColMajor>(128, 128, 128));
+  CALL_SUBTEST_1(test_cuda_contraction<RowMajor>(128, 128, 128));
   for (int k = 32; k < 256; k++) {
-    CALL_SUBTEST(test_cuda_contraction<ColMajor>(128, k, 128));
-    CALL_SUBTEST(test_cuda_contraction<RowMajor>(128, k, 128));
+    CALL_SUBTEST_2(test_cuda_contraction<ColMajor>(128, k, 128));
+    CALL_SUBTEST_3(test_cuda_contraction<RowMajor>(128, k, 128));
   }
   for (int k = 32; k < 256; k++) {
-    CALL_SUBTEST(test_cuda_contraction<ColMajor>(128, 128, k));
-    CALL_SUBTEST(test_cuda_contraction<RowMajor>(128, 128, k));
+    CALL_SUBTEST_4(test_cuda_contraction<ColMajor>(128, 128, k));
+    CALL_SUBTEST_5(test_cuda_contraction<RowMajor>(128, 128, k));
   }
   for (int k = 32; k < 256; k++) {
-    CALL_SUBTEST(test_cuda_contraction<ColMajor>(k, 128, 128));
-    CALL_SUBTEST(test_cuda_contraction<RowMajor>(k, 128, 128));
+    CALL_SUBTEST_6(test_cuda_contraction<ColMajor>(k, 128, 128));
+    CALL_SUBTEST_7(test_cuda_contraction<RowMajor>(k, 128, 128));
   }
 
-  int m_sizes[] = {31,   39,   63,   64,  65,
-                   127, 129,  255,  257, 511,
-                   512, 513, 1023, 1024, 1025 };
-  int n_sizes[] = {31,   39,   63,   64,  65,
-                   127, 129,  255,  257, 511,
-                   512, 513, 1023, 1024, 1025 };
+  static const int m_sizes[] = {31,   39,   63,   64,  65,
+                                127, 129,  255,  257, 511,
+                                512, 513, 1023, 1024, 1025};
+  static const int n_sizes[] = {31,   39,   63,   64,  65,
+                                127, 129,  255,  257, 511,
+                                512, 513, 1023, 1024, 1025};
 
-  int k_sizes[] = { 31,  39,  63, 64,    65,
-                    95,  96, 127, 129,  255,
-                   257, 511, 512, 513, 1023,
-                  1024, 1025};
+  static const int k_sizes[] = {31,  39,  63, 64,    65,
+                                95,  96, 127, 129,  255,
+                                257, 511, 512, 513, 1023,
+                                1024, 1025};
 
-  for (int i = 0; i <15; i++)
-    for (int j = 0; j < 15; j++)
+  for (int i = 0; i <15; i++) {
+    for (int j = 0; j < 15; j++) {
       for (int k = 0; k < 17; k++) {
-        CALL_SUBTEST(test_cuda_contraction<ColMajor>(m_sizes[i], n_sizes[j], k_sizes[k]));
-        CALL_SUBTEST(test_cuda_contraction<RowMajor>(m_sizes[i], n_sizes[j], k_sizes[k]));
+        CALL_SUBTEST_8(test_cuda_contraction<ColMajor>(m_sizes[i], n_sizes[j], k_sizes[k]));
+        CALL_SUBTEST_9(test_cuda_contraction<RowMajor>(m_sizes[i], n_sizes[j], k_sizes[k]));
       }
+    }
+  }
 }

From 64ce78c2ec52aa2fd2e408c7c4160b06e8fc1a03 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Mon, 1 Feb 2016 13:57:41 -0800
Subject: [PATCH 05/24] Cleaned up a tensor contraction test

---
 .../test/cxx11_tensor_contract_cuda.cu        | 86 ++++++++++++-------
 1 file changed, 56 insertions(+), 30 deletions(-)

diff --git a/unsupported/test/cxx11_tensor_contract_cuda.cu b/unsupported/test/cxx11_tensor_contract_cuda.cu
index 2c3cf64a9..6d1ef07f9 100644
--- a/unsupported/test/cxx11_tensor_contract_cuda.cu
+++ b/unsupported/test/cxx11_tensor_contract_cuda.cu
@@ -24,14 +24,14 @@ typedef Tensor<float, 1>::DimensionPair DimPair;
 template<int DataLayout>
 void test_cuda_contraction(int m_size, int k_size, int n_size)
 {
-  std::cout << "Calling with (" << m_size << "," << k_size << "," << n_size << ")" << std::endl;
+  std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl;
   // with these dimensions, the output has 300 * 140 elements, which is
   // more than 30 * 1024, which is the number of threads in blocks on
   // a 15 SM GK110 GPU
-  Tensor<float, 2, DataLayout> t_left(Eigen::array<int, 2>(m_size, k_size));
-  Tensor<float, 2, DataLayout> t_right(Eigen::array<int, 2>(k_size, n_size));
-  Tensor<float, 2, DataLayout> t_result(Eigen::array<int, 2>(m_size, n_size));
-  Tensor<float, 2, DataLayout> t_result_gpu(Eigen::array<int, 2>(m_size, n_size));
+  Tensor<float, 2, DataLayout> t_left(m_size, k_size);
+  Tensor<float, 2, DataLayout> t_right(k_size, n_size);
+  Tensor<float, 2, DataLayout> t_result(m_size, n_size);
+  Tensor<float, 2, DataLayout> t_result_gpu(m_size, n_size);
   Eigen::array<DimPair, 1> dims(DimPair(1, 0));
 
   t_left.setRandom();
@@ -84,43 +84,69 @@ void test_cuda_contraction(int m_size, int k_size, int n_size)
   cudaFree((void*)d_t_result);
 }
 
-
-void test_cxx11_tensor_cuda()
-{
-  std::cout << "Calling contraction tests" << std::endl;
-  CALL_SUBTEST_1(test_cuda_contraction<ColMajor>(128, 128, 128));
-  CALL_SUBTEST_1(test_cuda_contraction<RowMajor>(128, 128, 128));
+template<int DataLayout>
+void test_cuda_contraction_m() {
   for (int k = 32; k < 256; k++) {
-    CALL_SUBTEST_2(test_cuda_contraction<ColMajor>(128, k, 128));
-    CALL_SUBTEST_3(test_cuda_contraction<RowMajor>(128, k, 128));
+    test_cuda_contraction<ColMajor>(k, 128, 128);
+    test_cuda_contraction<RowMajor>(k, 128, 128);
   }
+}
+
+template<int DataLayout>
+void test_cuda_contraction_k() {
   for (int k = 32; k < 256; k++) {
-    CALL_SUBTEST_4(test_cuda_contraction<ColMajor>(128, 128, k));
-    CALL_SUBTEST_5(test_cuda_contraction<RowMajor>(128, 128, k));
+    test_cuda_contraction<ColMajor>(128, k, 128);
+    test_cuda_contraction<RowMajor>(128, k, 128);
   }
+}
+
+template<int DataLayout>
+void test_cuda_contraction_n() {
   for (int k = 32; k < 256; k++) {
-    CALL_SUBTEST_6(test_cuda_contraction<ColMajor>(k, 128, 128));
-    CALL_SUBTEST_7(test_cuda_contraction<RowMajor>(k, 128, 128));
+    test_cuda_contraction<ColMajor>(128, 128, k);
+    test_cuda_contraction<RowMajor>(128, 128, k);
   }
+}
 
-  static const int m_sizes[] = {31,   39,   63,   64,  65,
-                                127, 129,  255,  257, 511,
-                                512, 513, 1023, 1024, 1025};
-  static const int n_sizes[] = {31,   39,   63,   64,  65,
-                                127, 129,  255,  257, 511,
-                                512, 513, 1023, 1024, 1025};
 
-  static const int k_sizes[] = {31,  39,  63, 64,    65,
-                                95,  96, 127, 129,  255,
-                                257, 511, 512, 513, 1023,
-                                1024, 1025};
+template<int DataLayout>
+void test_cuda_contraction_sizes() {
+  int m_sizes[] = { 31,  39,   63,   64,   65,
+                   127, 129,  255,  257 , 511,
+                   512, 513, 1023, 1024, 1025};
 
-  for (int i = 0; i <15; i++) {
+  int n_sizes[] = { 31,  39,   63,   64,   65,
+                   127, 129,  255,  257,  511,
+                   512, 513, 1023, 1024, 1025};
+
+  int k_sizes[] = {  31,   39,  63,  64,   65,
+                     95,   96, 127, 129,  255,
+                    257,  511, 512, 513, 1023,
+                   1024, 1025};
+
+  for (int i = 0; i < 15; i++) {
     for (int j = 0; j < 15; j++) {
       for (int k = 0; k < 17; k++) {
-        CALL_SUBTEST_8(test_cuda_contraction<ColMajor>(m_sizes[i], n_sizes[j], k_sizes[k]));
-        CALL_SUBTEST_9(test_cuda_contraction<RowMajor>(m_sizes[i], n_sizes[j], k_sizes[k]));
+        test_cuda_contraction<DataLayout>(m_sizes[i], n_sizes[j], k_sizes[k]);
       }
     }
   }
 }
+
+void test_cxx11_tensor_cuda()
+{
+  CALL_SUBTEST_1(test_cuda_contraction<ColMajor>(128, 128, 128));
+  CALL_SUBTEST_1(test_cuda_contraction<RowMajor>(128, 128, 128));
+
+  CALL_SUBTEST_2(test_cuda_contraction_m<ColMajor>());
+  CALL_SUBTEST_3(test_cuda_contraction_m<RowMajor>());
+
+  CALL_SUBTEST_4(test_cuda_contraction_k<ColMajor>());
+  CALL_SUBTEST_5(test_cuda_contraction_k<RowMajor>());
+
+  CALL_SUBTEST_6(test_cuda_contraction_n<ColMajor>());
+  CALL_SUBTEST_7(test_cuda_contraction_n<RowMajor>());
+
+  CALL_SUBTEST_8(test_cuda_contraction_sizes<ColMajor>());
+  CALL_SUBTEST_9(test_cuda_contraction_sizes<RowMajor>());
+}

From aedea349aabb44d51a4e64cd2c96242f0cea95ba Mon Sep 17 00:00:00 2001
From: Ville Kallioniemi <ville.kallioniemi@oracle.com>
Date: Mon, 1 Feb 2016 20:25:02 -0700
Subject: [PATCH 06/24] Replace separate low word constructors with a single
 templated constructor.

---
 .../Eigen/CXX11/src/Tensor/TensorUInt128.h    | 25 ++++++++-----------
 unsupported/test/cxx11_tensor_uint128.cpp     |  2 +-
 2 files changed, 11 insertions(+), 16 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
index 0d34f7ee6..981515f4b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
@@ -33,24 +33,19 @@ struct TensorUInt128
   HIGH high;
   LOW low;
 
+  template<typename OTHER_HIGH, typename OTHER_LOW>
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-  TensorUInt128(int32_t x) : high(0), low(x) {
+  TensorUInt128(const TensorUInt128<OTHER_HIGH, OTHER_LOW>& other) : high(other.high), low(other.low) {
+    static_assert(sizeof(OTHER_HIGH) <= sizeof(HIGH), "high too wide");
+    static_assert(sizeof(OTHER_LOW) <= sizeof(LOW), "low too wide");
+  }
+
+  template<typename T>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+  explicit TensorUInt128(const T& x) : high(0), low(x) {
     eigen_assert(x >= 0);
   }
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-  TensorUInt128(uint32_t x) : high(0), low(x) { }
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-  TensorUInt128(long x) : high(0), low(x) {
-    eigen_assert(x >= 0);
-  }
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-  TensorUInt128(unsigned long x) : high(0), low(x) { }
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-  TensorUInt128(int64_t x) : high(0), low(x) {
-    eigen_assert(x >= 0);
-  }
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-  TensorUInt128(uint64_t x) : high(0), low(x) { }
+
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
   TensorUInt128(uint64_t y, uint64_t x) : high(y), low(x) { }
 
diff --git a/unsupported/test/cxx11_tensor_uint128.cpp b/unsupported/test/cxx11_tensor_uint128.cpp
index ee3767e58..424c70197 100644
--- a/unsupported/test/cxx11_tensor_uint128.cpp
+++ b/unsupported/test/cxx11_tensor_uint128.cpp
@@ -127,7 +127,7 @@ void test_misc2() {
 
       TensorUInt128<uint64_t, uint64_t> result = (TensorUInt128<uint64_t, static_val<0> >(shift, 0) / TensorUInt128<static_val<0>, uint64_t>(divider) - TensorUInt128<static_val<1>, static_val<0> >(1, 0) + TensorUInt128<static_val<0>, static_val<1> >(1));
       uint64_t actual = static_cast<uint64_t>(result);
-      VERIFY_EQUAL(actual, expected);
+      VERIFY_IS_EQUAL(actual, expected);
     }
   }
 }

From 99cde88341145c43fc4134af07556d8c6ff12066 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Tue, 2 Feb 2016 11:06:53 -0800
Subject: [PATCH 07/24] Don't try to use direct offsets when computing a tensor
 product, since the required stride isn't available.

---
 unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h
index 63c8ae126..392aa6d37 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h
@@ -344,7 +344,7 @@ class TensorContractionSubMapper {
   enum {
     // We can use direct offsets iff the parent mapper supports then and we can compute the strides.
     // TODO: we should also enable direct offsets for the Rhs case.
-    UseDirectOffsets = (side == Lhs) && inner_dim_contiguous && ParentMapper::DirectOffsets
+    UseDirectOffsets = ParentMapper::DirectOffsets && (side == Lhs) && inner_dim_contiguous && (array_size<contract_t>::value > 0)
   };
 
   EIGEN_DEVICE_FUNC TensorContractionSubMapper(const ParentMapper& base_mapper, Index vert_offset, Index horiz_offset)

From 783018d8f65faeec0fc6f795bc2630240ecdd051 Mon Sep 17 00:00:00 2001
From: Ville Kallioniemi <ville.kallioniemi@gmail.com>
Date: Tue, 2 Feb 2016 16:45:12 -0700
Subject: [PATCH 08/24] Use EIGEN_STATIC_ASSERT for backward compatibility.

---
 unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
index 981515f4b..52d5b7b1a 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
@@ -36,8 +36,8 @@ struct TensorUInt128
   template<typename OTHER_HIGH, typename OTHER_LOW>
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
   TensorUInt128(const TensorUInt128<OTHER_HIGH, OTHER_LOW>& other) : high(other.high), low(other.low) {
-    static_assert(sizeof(OTHER_HIGH) <= sizeof(HIGH), "high too wide");
-    static_assert(sizeof(OTHER_LOW) <= sizeof(LOW), "low too wide");
+    EIGEN_STATIC_ASSERT(sizeof(OTHER_HIGH) <= sizeof(HIGH), "high too wide");
+    EIGEN_STATIC_ASSERT(sizeof(OTHER_LOW) <= sizeof(LOW), "low too wide");
   }
 
   template<typename T>

From c85fbfd0b747b9af48144bab9a79127ab2b6257b Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Wed, 3 Feb 2016 16:08:43 +0100
Subject: [PATCH 09/24] Clarify documentation on the restrictions of writable
 sparse block expressions.

---
 doc/SparseQuickReference.dox | 62 +++++++++++++++++++++---------------
 doc/TutorialSparse.dox       | 24 ++++++++++++--
 2 files changed, 58 insertions(+), 28 deletions(-)

diff --git a/doc/SparseQuickReference.dox b/doc/SparseQuickReference.dox
index d04ac35c5..e0a30edcc 100644
--- a/doc/SparseQuickReference.dox
+++ b/doc/SparseQuickReference.dox
@@ -21,7 +21,7 @@ i.e either row major or column major. The default is column major. Most arithmet
 <td> Resize/Reserve</td>
 <td> 
  \code
-    sm1.resize(m,n);      //Change sm1 to a m x n matrix. 
+    sm1.resize(m,n);      // Change sm1 to a m x n matrix.
     sm1.reserve(nnz);     // Allocate room for nnz nonzeros elements.   
   \endcode 
 </td>
@@ -151,10 +151,10 @@ It is easy to perform arithmetic operations on sparse matrices provided that the
 <td> Permutation </td>
 <td> 
 \code 
-perm.indices(); // Reference to the vector of indices
+perm.indices();      // Reference to the vector of indices
 sm1.twistedBy(perm); // Permute rows and columns
-sm2 = sm1 * perm; //Permute the columns
-sm2 = perm * sm1; // Permute the columns
+sm2 = sm1 * perm;    // Permute the columns
+sm2 = perm * sm1;    // Permute the columns
 \endcode 
 </td>
 <td> 
@@ -181,9 +181,9 @@ sm2 = perm * sm1; // Permute the columns
 
 \section sparseotherops Other supported operations
 <table class="manual">
-<tr><th>Operations</th> <th> Code </th> <th> Notes</th> </tr>
+<tr><th style="min-width:initial"> Code </th> <th> Notes</th> </tr>
+<tr><td colspan="2">Sub-matrices</td></tr>
 <tr>
-<td>Sub-matrices</td> 
 <td> 
 \code 
   sm1.block(startRow, startCol, rows, cols); 
@@ -193,25 +193,31 @@ sm2 = perm * sm1; // Permute the columns
   sm1.bottomLeftCorner( rows, cols);
   sm1.bottomRightCorner( rows, cols);
   \endcode
-</td> <td>  </td>
+</td><td>
+Contrary to dense matrices, here <strong>all these methods are read-only</strong>.\n
+See \ref TutorialSparse_SubMatrices and below for read-write sub-matrices.
+</td>
 </tr>
-<tr> 
-<td> Range </td>
+<tr class="alt"><td colspan="2"> Range </td></tr>
+<tr class="alt">
 <td> 
 \code 
-  sm1.innerVector(outer); 
-  sm1.innerVectors(start, size);
-  sm1.leftCols(size);
-  sm2.rightCols(size);
-  sm1.middleRows(start, numRows);
-  sm1.middleCols(start, numCols);
-  sm1.col(j);
+  sm1.innerVector(outer);           // RW
+  sm1.innerVectors(start, size);    // RW
+  sm1.leftCols(size);               // RW
+  sm2.rightCols(size);              // RO because sm2 is row-major
+  sm1.middleRows(start, numRows);   // RO becasue sm1 is column-major
+  sm1.middleCols(start, numCols);   // RW
+  sm1.col(j);                       // RW
 \endcode
 </td>
-<td>A inner vector is either a row (for row-major) or a column (for column-major). As stated earlier, the evaluation can be done in a matrix with different storage order </td>
+<td>
+A inner vector is either a row (for row-major) or a column (for column-major).\n
+As stated earlier, for a read-write sub-matrix (RW), the evaluation can be done in a matrix with different storage order.
+</td>
 </tr>
+<tr><td colspan="2"> Triangular and selfadjoint views</td></tr>
 <tr>
-<td> Triangular and selfadjoint views</td>
 <td> 
 \code
   sm2 = sm1.triangularview<Lower>();
@@ -222,26 +228,30 @@ sm2 = perm * sm1; // Permute the columns
 \code 
   \endcode </td>
 </tr>
-<tr> 
-<td>Triangular solve </td>
+<tr class="alt"><td colspan="2">Triangular solve </td></tr>
+<tr class="alt">
 <td> 
 \code 
  dv2 = sm1.triangularView<Upper>().solve(dv1);
- dv2 = sm1.topLeftCorner(size, size).triangularView<Lower>().solve(dv1);
+ dv2 = sm1.topLeftCorner(size, size)
+          .triangularView<Lower>().solve(dv1);
 \endcode 
 </td>
 <td> For general sparse solve, Use any suitable module described at \ref TopicSparseSystems </td>
 </tr>
+<tr><td colspan="2"> Low-level API</td></tr>
 <tr>
-<td> Low-level API</td>
 <td>
 \code
-sm1.valuePtr(); // Pointer to the values
-sm1.innerIndextr(); // Pointer to the indices.
-sm1.outerIndexPtr(); //Pointer to the beginning of each inner vector
+sm1.valuePtr();      // Pointer to the values
+sm1.innerIndextr();  // Pointer to the indices.
+sm1.outerIndexPtr(); // Pointer to the beginning of each inner vector
 \endcode
 </td>
-<td> If the matrix is not in compressed form, makeCompressed() should be called before. Note that these functions are mostly provided for interoperability purposes with external libraries. A better access to the values of the matrix is done by using the InnerIterator class as described in \link TutorialSparse the Tutorial Sparse \endlink section</td>
+<td>
+If the matrix is not in compressed form, makeCompressed() should be called before.\n
+Note that these functions are mostly provided for interoperability purposes with external libraries.\n
+A better access to the values of the matrix is done by using the InnerIterator class as described in \link TutorialSparse the Tutorial Sparse \endlink section</td>
 </tr>
 </table>
 */
diff --git a/doc/TutorialSparse.dox b/doc/TutorialSparse.dox
index 1f0be387d..352907408 100644
--- a/doc/TutorialSparse.dox
+++ b/doc/TutorialSparse.dox
@@ -241,11 +241,11 @@ In the following \em sm denotes a sparse matrix, \em sv a sparse vector, \em dm
 sm1.real()   sm1.imag()   -sm1                    0.5*sm1
 sm1+sm2      sm1-sm2      sm1.cwiseProduct(sm2)
 \endcode
-However, a strong restriction is that the storage orders must match. For instance, in the following example:
+However, <strong>a strong restriction is that the storage orders must match</strong>. For instance, in the following example:
 \code
 sm4 = sm1 + sm2 + sm3;
 \endcode
-sm1, sm2, and sm3 must all be row-major or all column major.
+sm1, sm2, and sm3 must all be row-major or all column-major.
 On the other hand, there is no restriction on the target matrix sm4.
 For instance, this means that for computing \f$ A^T + A \f$, the matrix \f$ A^T \f$ must be evaluated into a temporary matrix of compatible storage order:
 \code
@@ -311,6 +311,26 @@ sm2 = sm1.transpose() * P;
     \endcode
 
 
+\subsection TutorialSparse_SubMatrices Block operations
+
+Regarding read-access, sparse matrices expose the same API than for dense matrices to access to sub-matrices such as blocks, columns, and rows. See \ref TutorialBlockOperations for a detailed introduction.
+However, for performance reasons, writing to a sub-sparse-matrix is much more limited, and currently only contiguous sets of columns (resp. rows) of a column-major (resp. row-major) SparseMatrix are writable. Moreover, this information has to be known at compile-time, leaving out methods such as <tt>block(...)</tt> and <tt>corner*(...)</tt>. The available API for write-access to a SparseMatrix are summarized below:
+\code
+SparseMatrix<double,ColMajor> sm1;
+sm1.col(j) = ...;
+sm1.leftCols(ncols) = ...;
+sm1.middleCols(j,ncols) = ...;
+sm1.rightCols(ncols) = ...;
+
+SparseMatrix<double,RowMajor> sm2;
+sm2.row(i) = ...;
+sm2.topRows(nrows) = ...;
+sm2.middleRows(i,nrows) = ...;
+sm2.bottomRows(nrows) = ...;
+\endcode
+
+In addition, sparse matrices expose the SparseMatrixBase::innerVector() and SparseMatrixBase::innerVectors() methods, which are aliases to the col/middleCols methods for a column-major storage, and to the row/middleRows methods for a row-major storage.
+
 \subsection TutorialSparse_TriangularSelfadjoint Triangular and selfadjoint views
 
 Just as with dense matrices, the triangularView() function can be used to address a triangular part of the matrix, and perform triangular solves with a dense right hand side:

From eb6d9aea0e27c3161db3c57d415a3996d60b80bc Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Wed, 3 Feb 2016 16:58:23 +0100
Subject: [PATCH 10/24] Clarify error message when writing to a read-only
 sparse-sub-matrix.

---
 Eigen/src/SparseCore/SparseBlock.h | 23 +++++++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)

diff --git a/Eigen/src/SparseCore/SparseBlock.h b/Eigen/src/SparseCore/SparseBlock.h
index 43fe788d9..3a811113f 100644
--- a/Eigen/src/SparseCore/SparseBlock.h
+++ b/Eigen/src/SparseCore/SparseBlock.h
@@ -73,8 +73,15 @@ public:
     Index m_outerStart;
     const internal::variable_if_dynamic<Index, OuterSize> m_outerSize;
   
-  public:
-    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(BlockImpl)
+  protected:
+    // Disable assignment with clear error message.
+    // Note that simply removing operator= yields compilation errors with ICC+MSVC
+    template<typename T>
+    BlockImpl& operator=(const T&)
+    {
+      EIGEN_STATIC_ASSERT(sizeof(T)==0, THIS_SPARSE_BLOCK_SUBEXPRESSION_IS_READ_ONLY);
+      return *this;
+    }
 };
 
 
@@ -424,8 +431,6 @@ public:
     friend struct internal::unary_evaluator<Block<XprType,BlockRows,BlockCols,InnerPanel>, internal::IteratorBased, Scalar >;
     
     Index nonZeros() const { return Dynamic; }
-    
-    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(BlockImpl)
 
     typename internal::ref_selector<XprType>::non_const_type m_matrix;
     const internal::variable_if_dynamic<Index, XprType::RowsAtCompileTime == 1 ? 0 : Dynamic> m_startRow;
@@ -433,6 +438,16 @@ public:
     const internal::variable_if_dynamic<Index, RowsAtCompileTime> m_blockRows;
     const internal::variable_if_dynamic<Index, ColsAtCompileTime> m_blockCols;
 
+  protected:
+    // Disable assignment with clear error message.
+    // Note that simply removing operator= yields compilation errors with ICC+MSVC
+    template<typename T>
+    BlockImpl& operator=(const T&)
+    {
+      EIGEN_STATIC_ASSERT(sizeof(T)==0, THIS_SPARSE_BLOCK_SUBEXPRESSION_IS_READ_ONLY);
+      return *this;
+    }
+
 };
 
 namespace internal {

From c301f99208d8aa9985c6a029d602622a60fb5e7b Mon Sep 17 00:00:00 2001
From: Damien R <anonymous@invalid.net>
Date: Wed, 3 Feb 2016 18:07:25 +0100
Subject: [PATCH 11/24] bug #1164: fix list and deque specializations such that
 our aligned allocator is automatically activatived only when the user did not
 specified an allocator (or specified the default std::allocator).

---
 Eigen/src/StlSupport/StdDeque.h |  18 +--
 Eigen/src/StlSupport/StdList.h  |  18 +--
 test/CMakeLists.txt             |   2 +
 test/stddeque_overload.cpp      | 159 ++++++++++++++++++++++++++
 test/stdlist_overload.cpp       | 192 ++++++++++++++++++++++++++++++++
 5 files changed, 363 insertions(+), 26 deletions(-)
 create mode 100644 test/stddeque_overload.cpp
 create mode 100644 test/stdlist_overload.cpp

diff --git a/Eigen/src/StlSupport/StdDeque.h b/Eigen/src/StlSupport/StdDeque.h
index 25930cb85..cf1fedf92 100644
--- a/Eigen/src/StlSupport/StdDeque.h
+++ b/Eigen/src/StlSupport/StdDeque.h
@@ -13,32 +13,24 @@
 
 #include "details.h"
 
-// Define the explicit instantiation (e.g. necessary for the Intel compiler)
-#if EIGEN_COMP_GNUC || EIGEN_COMP_ICC
-  #define EIGEN_EXPLICIT_STL_DEQUE_INSTANTIATION(...) template class std::deque<__VA_ARGS__, EIGEN_ALIGNED_ALLOCATOR<__VA_ARGS__> >;
-#else
-  #define EIGEN_EXPLICIT_STL_DEQUE_INSTANTIATION(...)
-#endif
-
 /**
  * This section contains a convenience MACRO which allows an easy specialization of
  * std::deque such that for data types with alignment issues the correct allocator
  * is used automatically.
  */
 #define EIGEN_DEFINE_STL_DEQUE_SPECIALIZATION(...) \
-EIGEN_EXPLICIT_STL_DEQUE_INSTANTIATION(__VA_ARGS__) \
 namespace std \
 { \
-  template<typename _Ay> \
-  class deque<__VA_ARGS__, _Ay>  \
+  template<> \
+  class deque<__VA_ARGS__, std::allocator<__VA_ARGS__> >           \
     : public deque<__VA_ARGS__, EIGEN_ALIGNED_ALLOCATOR<__VA_ARGS__> > \
   { \
     typedef deque<__VA_ARGS__, EIGEN_ALIGNED_ALLOCATOR<__VA_ARGS__> > deque_base; \
   public: \
     typedef __VA_ARGS__ value_type; \
-    typedef typename deque_base::allocator_type allocator_type; \
-    typedef typename deque_base::size_type size_type;  \
-    typedef typename deque_base::iterator iterator;  \
+    typedef deque_base::allocator_type allocator_type; \
+    typedef deque_base::size_type size_type;  \
+    typedef deque_base::iterator iterator;  \
     explicit deque(const allocator_type& a = allocator_type()) : deque_base(a) {}  \
     template<typename InputIterator> \
     deque(InputIterator first, InputIterator last, const allocator_type& a = allocator_type()) : deque_base(first, last, a) {} \
diff --git a/Eigen/src/StlSupport/StdList.h b/Eigen/src/StlSupport/StdList.h
index 7412b50aa..e1eba4985 100644
--- a/Eigen/src/StlSupport/StdList.h
+++ b/Eigen/src/StlSupport/StdList.h
@@ -12,32 +12,24 @@
 
 #include "details.h"
 
-// Define the explicit instantiation (e.g. necessary for the Intel compiler)
-#if EIGEN_COMP_GNUC || EIGEN_COMP_ICC
-  #define EIGEN_EXPLICIT_STL_LIST_INSTANTIATION(...) template class std::list<__VA_ARGS__, EIGEN_ALIGNED_ALLOCATOR<__VA_ARGS__> >;
-#else
-  #define EIGEN_EXPLICIT_STL_LIST_INSTANTIATION(...)
-#endif
-
 /**
  * This section contains a convenience MACRO which allows an easy specialization of
  * std::list such that for data types with alignment issues the correct allocator
  * is used automatically.
  */
 #define EIGEN_DEFINE_STL_LIST_SPECIALIZATION(...) \
-EIGEN_EXPLICIT_STL_LIST_INSTANTIATION(__VA_ARGS__) \
 namespace std \
 { \
-  template<typename _Ay> \
-  class list<__VA_ARGS__, _Ay>  \
+  template<> \
+  class list<__VA_ARGS__, std::allocator<__VA_ARGS__> >           \
     : public list<__VA_ARGS__, EIGEN_ALIGNED_ALLOCATOR<__VA_ARGS__> > \
   { \
     typedef list<__VA_ARGS__, EIGEN_ALIGNED_ALLOCATOR<__VA_ARGS__> > list_base; \
   public: \
     typedef __VA_ARGS__ value_type; \
-    typedef typename list_base::allocator_type allocator_type; \
-    typedef typename list_base::size_type size_type;  \
-    typedef typename list_base::iterator iterator;  \
+    typedef list_base::allocator_type allocator_type; \
+    typedef list_base::size_type size_type;  \
+    typedef list_base::iterator iterator;  \
     explicit list(const allocator_type& a = allocator_type()) : list_base(a) {}  \
     template<typename InputIterator> \
     list(InputIterator first, InputIterator last, const allocator_type& a = allocator_type()) : list_base(first, last, a) {} \
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index bbebf29cd..4420e0c51 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -226,7 +226,9 @@ ei_add_test(geo_homogeneous)
 ei_add_test(stdvector)
 ei_add_test(stdvector_overload)
 ei_add_test(stdlist)
+ei_add_test(stdlist_overload)
 ei_add_test(stddeque)
+ei_add_test(stddeque_overload)
 ei_add_test(sparse_basic)
 ei_add_test(sparse_block)
 ei_add_test(sparse_vector)
diff --git a/test/stddeque_overload.cpp b/test/stddeque_overload.cpp
new file mode 100644
index 000000000..d887e35ba
--- /dev/null
+++ b/test/stddeque_overload.cpp
@@ -0,0 +1,159 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2010 Hauke Heibel <hauke.heibel@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/StdDeque>
+#include <Eigen/Geometry>
+
+EIGEN_DEFINE_STL_DEQUE_SPECIALIZATION(Vector4f)
+
+EIGEN_DEFINE_STL_DEQUE_SPECIALIZATION(Matrix2f)
+EIGEN_DEFINE_STL_DEQUE_SPECIALIZATION(Matrix4f)
+EIGEN_DEFINE_STL_DEQUE_SPECIALIZATION(Matrix4d)
+
+EIGEN_DEFINE_STL_DEQUE_SPECIALIZATION(Affine3f)
+EIGEN_DEFINE_STL_DEQUE_SPECIALIZATION(Affine3d)
+
+EIGEN_DEFINE_STL_DEQUE_SPECIALIZATION(Quaternionf)
+EIGEN_DEFINE_STL_DEQUE_SPECIALIZATION(Quaterniond)
+
+template<typename MatrixType>
+void check_stddeque_matrix(const MatrixType& m)
+{
+  typename MatrixType::Index rows = m.rows();
+  typename MatrixType::Index cols = m.cols();
+  MatrixType x = MatrixType::Random(rows,cols), y = MatrixType::Random(rows,cols);
+  std::deque<MatrixType> v(10, MatrixType(rows,cols)), w(20, y);
+  v[5] = x;
+  w[6] = v[5];
+  VERIFY_IS_APPROX(w[6], v[5]);
+  v = w;
+  for(int i = 0; i < 20; i++)
+  {
+    VERIFY_IS_APPROX(w[i], v[i]);
+  }
+
+  v.resize(21);
+  v[20] = x;
+  VERIFY_IS_APPROX(v[20], x);
+  v.resize(22,y);
+  VERIFY_IS_APPROX(v[21], y);
+  v.push_back(x);
+  VERIFY_IS_APPROX(v[22], x);
+  VERIFY((size_t)&(v[22]) == (size_t)&(v[21]) + sizeof(MatrixType));
+
+  // do a lot of push_back such that the deque gets internally resized
+  // (with memory reallocation)
+  MatrixType* ref = &w[0];
+  for(int i=0; i<30 || ((ref==&w[0]) && i<300); ++i)
+    v.push_back(w[i%w.size()]);
+  for(unsigned int i=23; i<v.size(); ++i)
+  {
+    VERIFY(v[i]==w[(i-23)%w.size()]);
+  }
+}
+
+template<typename TransformType>
+void check_stddeque_transform(const TransformType&)
+{
+  typedef typename TransformType::MatrixType MatrixType;
+  TransformType x(MatrixType::Random()), y(MatrixType::Random());
+  std::deque<TransformType> v(10), w(20, y);
+  v[5] = x;
+  w[6] = v[5];
+  VERIFY_IS_APPROX(w[6], v[5]);
+  v = w;
+  for(int i = 0; i < 20; i++)
+  {
+    VERIFY_IS_APPROX(w[i], v[i]);
+  }
+
+  v.resize(21);
+  v[20] = x;
+  VERIFY_IS_APPROX(v[20], x);
+  v.resize(22,y);
+  VERIFY_IS_APPROX(v[21], y);
+  v.push_back(x);
+  VERIFY_IS_APPROX(v[22], x);
+
+  // do a lot of push_back such that the deque gets internally resized
+  // (with memory reallocation)
+  TransformType* ref = &w[0];
+  for(int i=0; i<30 || ((ref==&w[0]) && i<300); ++i)
+    v.push_back(w[i%w.size()]);
+  for(unsigned int i=23; i<v.size(); ++i)
+  {
+    VERIFY(v[i].matrix()==w[(i-23)%w.size()].matrix());
+  }
+}
+
+template<typename QuaternionType>
+void check_stddeque_quaternion(const QuaternionType&)
+{
+  typedef typename QuaternionType::Coefficients Coefficients;
+  QuaternionType x(Coefficients::Random()), y(Coefficients::Random());
+  std::deque<QuaternionType> v(10), w(20, y);
+  v[5] = x;
+  w[6] = v[5];
+  VERIFY_IS_APPROX(w[6], v[5]);
+  v = w;
+  for(int i = 0; i < 20; i++)
+  {
+    VERIFY_IS_APPROX(w[i], v[i]);
+  }
+
+  v.resize(21);
+  v[20] = x;
+  VERIFY_IS_APPROX(v[20], x);
+  v.resize(22,y);
+  VERIFY_IS_APPROX(v[21], y);
+  v.push_back(x);
+  VERIFY_IS_APPROX(v[22], x);
+
+  // do a lot of push_back such that the deque gets internally resized
+  // (with memory reallocation)
+  QuaternionType* ref = &w[0];
+  for(int i=0; i<30 || ((ref==&w[0]) && i<300); ++i)
+    v.push_back(w[i%w.size()]);
+  for(unsigned int i=23; i<v.size(); ++i)
+  {
+    VERIFY(v[i].coeffs()==w[(i-23)%w.size()].coeffs());
+  }
+}
+
+void test_stddeque_overload()
+{
+  // some non vectorizable fixed sizes
+  CALL_SUBTEST_1(check_stddeque_matrix(Vector2f()));
+  CALL_SUBTEST_1(check_stddeque_matrix(Matrix3f()));
+  CALL_SUBTEST_2(check_stddeque_matrix(Matrix3d()));
+
+  // some vectorizable fixed sizes
+  CALL_SUBTEST_1(check_stddeque_matrix(Matrix2f()));
+  CALL_SUBTEST_1(check_stddeque_matrix(Vector4f()));
+  CALL_SUBTEST_1(check_stddeque_matrix(Matrix4f()));
+  CALL_SUBTEST_2(check_stddeque_matrix(Matrix4d()));
+
+  // some dynamic sizes
+  CALL_SUBTEST_3(check_stddeque_matrix(MatrixXd(1,1)));
+  CALL_SUBTEST_3(check_stddeque_matrix(VectorXd(20)));
+  CALL_SUBTEST_3(check_stddeque_matrix(RowVectorXf(20)));
+  CALL_SUBTEST_3(check_stddeque_matrix(MatrixXcf(10,10)));
+
+  // some Transform
+  CALL_SUBTEST_4(check_stddeque_transform(Affine2f())); // does not need the specialization (2+1)^2 = 9
+  CALL_SUBTEST_4(check_stddeque_transform(Affine3f()));
+  CALL_SUBTEST_4(check_stddeque_transform(Affine3d()));
+
+  // some Quaternion
+  CALL_SUBTEST_5(check_stddeque_quaternion(Quaternionf()));
+  CALL_SUBTEST_5(check_stddeque_quaternion(Quaterniond()));
+}
diff --git a/test/stdlist_overload.cpp b/test/stdlist_overload.cpp
new file mode 100644
index 000000000..bb910bd43
--- /dev/null
+++ b/test/stdlist_overload.cpp
@@ -0,0 +1,192 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2010 Hauke Heibel <hauke.heibel@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/StdList>
+#include <Eigen/Geometry>
+
+EIGEN_DEFINE_STL_LIST_SPECIALIZATION(Vector4f)
+
+EIGEN_DEFINE_STL_LIST_SPECIALIZATION(Matrix2f)
+EIGEN_DEFINE_STL_LIST_SPECIALIZATION(Matrix4f)
+EIGEN_DEFINE_STL_LIST_SPECIALIZATION(Matrix4d)
+
+EIGEN_DEFINE_STL_LIST_SPECIALIZATION(Affine3f)
+EIGEN_DEFINE_STL_LIST_SPECIALIZATION(Affine3d)
+
+EIGEN_DEFINE_STL_LIST_SPECIALIZATION(Quaternionf)
+EIGEN_DEFINE_STL_LIST_SPECIALIZATION(Quaterniond)
+
+template <class Container, class Position>
+typename Container::iterator get(Container & c, Position position)
+{
+  typename Container::iterator it = c.begin();
+  std::advance(it, position);
+  return it;
+}
+
+template <class Container, class Position, class Value>
+void set(Container & c, Position position, const Value & value)
+{
+  typename Container::iterator it = c.begin();
+  std::advance(it, position);
+  *it = value;
+}
+
+template<typename MatrixType>
+void check_stdlist_matrix(const MatrixType& m)
+{
+  typename MatrixType::Index rows = m.rows();
+  typename MatrixType::Index cols = m.cols();
+  MatrixType x = MatrixType::Random(rows,cols), y = MatrixType::Random(rows,cols);
+  std::list<MatrixType> v(10, MatrixType(rows,cols)), w(20, y);
+  typename std::list<MatrixType>::iterator itv = get(v, 5);
+  typename std::list<MatrixType>::iterator itw = get(w, 6);
+  *itv = x;
+  *itw = *itv;
+  VERIFY_IS_APPROX(*itw, *itv);
+  v = w;
+  itv = v.begin();
+  itw = w.begin();
+  for(int i = 0; i < 20; i++)
+  {
+    VERIFY_IS_APPROX(*itw, *itv);
+    ++itv;
+    ++itw;
+  }
+
+  v.resize(21);
+  set(v, 20, x);
+  VERIFY_IS_APPROX(*get(v, 20), x);
+  v.resize(22,y);
+  VERIFY_IS_APPROX(*get(v, 21), y);
+  v.push_back(x);
+  VERIFY_IS_APPROX(*get(v, 22), x);
+
+  // do a lot of push_back such that the list gets internally resized
+  // (with memory reallocation)
+  MatrixType* ref = &(*get(w, 0));
+  for(int i=0; i<30 || ((ref==&(*get(w, 0))) && i<300); ++i)
+    v.push_back(*get(w, i%w.size()));
+  for(unsigned int i=23; i<v.size(); ++i)
+  {
+    VERIFY((*get(v, i))==(*get(w, (i-23)%w.size())));
+  }
+}
+
+template<typename TransformType>
+void check_stdlist_transform(const TransformType&)
+{
+  typedef typename TransformType::MatrixType MatrixType;
+  TransformType x(MatrixType::Random()), y(MatrixType::Random());
+  std::list<TransformType> v(10), w(20, y);
+  typename std::list<TransformType>::iterator itv = get(v, 5);
+  typename std::list<TransformType>::iterator itw = get(w, 6);
+  *itv = x;
+  *itw = *itv;
+  VERIFY_IS_APPROX(*itw, *itv);
+  v = w;
+  itv = v.begin();
+  itw = w.begin();
+  for(int i = 0; i < 20; i++)
+  {
+    VERIFY_IS_APPROX(*itw, *itv);
+    ++itv;
+    ++itw;
+  }
+
+  v.resize(21);
+  set(v, 20, x);
+  VERIFY_IS_APPROX(*get(v, 20), x);
+  v.resize(22,y);
+  VERIFY_IS_APPROX(*get(v, 21), y);
+  v.push_back(x);
+  VERIFY_IS_APPROX(*get(v, 22), x);
+
+  // do a lot of push_back such that the list gets internally resized
+  // (with memory reallocation)
+  TransformType* ref = &(*get(w, 0));
+  for(int i=0; i<30 || ((ref==&(*get(w, 0))) && i<300); ++i)
+    v.push_back(*get(w, i%w.size()));
+  for(unsigned int i=23; i<v.size(); ++i)
+  {
+    VERIFY(get(v, i)->matrix()==get(w, (i-23)%w.size())->matrix());
+  }
+}
+
+template<typename QuaternionType>
+void check_stdlist_quaternion(const QuaternionType&)
+{
+  typedef typename QuaternionType::Coefficients Coefficients;
+  QuaternionType x(Coefficients::Random()), y(Coefficients::Random());
+  std::list<QuaternionType> v(10), w(20, y);
+  typename std::list<QuaternionType>::iterator itv = get(v, 5);
+  typename std::list<QuaternionType>::iterator itw = get(w, 6);
+  *itv = x;
+  *itw = *itv;
+  VERIFY_IS_APPROX(*itw, *itv);
+  v = w;
+  itv = v.begin();
+  itw = w.begin();
+  for(int i = 0; i < 20; i++)
+  {
+    VERIFY_IS_APPROX(*itw, *itv);
+    ++itv;
+    ++itw;
+  }
+
+  v.resize(21);
+  set(v, 20, x);
+  VERIFY_IS_APPROX(*get(v, 20), x);
+  v.resize(22,y);
+  VERIFY_IS_APPROX(*get(v, 21), y);
+  v.push_back(x);
+  VERIFY_IS_APPROX(*get(v, 22), x);
+
+  // do a lot of push_back such that the list gets internally resized
+  // (with memory reallocation)
+  QuaternionType* ref = &(*get(w, 0));
+  for(int i=0; i<30 || ((ref==&(*get(w, 0))) && i<300); ++i)
+    v.push_back(*get(w, i%w.size()));
+  for(unsigned int i=23; i<v.size(); ++i)
+  {
+    VERIFY(get(v, i)->coeffs()==get(w, (i-23)%w.size())->coeffs());
+  }
+}
+
+void test_stdlist_overload()
+{
+  // some non vectorizable fixed sizes
+  CALL_SUBTEST_1(check_stdlist_matrix(Vector2f()));
+  CALL_SUBTEST_1(check_stdlist_matrix(Matrix3f()));
+  CALL_SUBTEST_2(check_stdlist_matrix(Matrix3d()));
+
+  // some vectorizable fixed sizes
+  CALL_SUBTEST_1(check_stdlist_matrix(Matrix2f()));
+  CALL_SUBTEST_1(check_stdlist_matrix(Vector4f()));
+  CALL_SUBTEST_1(check_stdlist_matrix(Matrix4f()));
+  CALL_SUBTEST_2(check_stdlist_matrix(Matrix4d()));
+
+  // some dynamic sizes
+  CALL_SUBTEST_3(check_stdlist_matrix(MatrixXd(1,1)));
+  CALL_SUBTEST_3(check_stdlist_matrix(VectorXd(20)));
+  CALL_SUBTEST_3(check_stdlist_matrix(RowVectorXf(20)));
+  CALL_SUBTEST_3(check_stdlist_matrix(MatrixXcf(10,10)));
+
+  // some Transform
+  CALL_SUBTEST_4(check_stdlist_transform(Affine2f())); // does not need the specialization (2+1)^2 = 9
+  CALL_SUBTEST_4(check_stdlist_transform(Affine3f()));
+  CALL_SUBTEST_4(check_stdlist_transform(Affine3d()));
+
+  // some Quaternion
+  CALL_SUBTEST_5(check_stdlist_quaternion(Quaternionf()));
+  CALL_SUBTEST_5(check_stdlist_quaternion(Quaterniond()));
+}

From 70dc14e4e10c08bdfe77d8a32cf14b81af9022fa Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Wed, 3 Feb 2016 18:25:41 +0100
Subject: [PATCH 12/24] bug #1161: fix division by zero for huge scalar types

---
 Eigen/src/Core/products/GeneralBlockPanelKernel.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index d2e6f26c8..54e118395 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -178,7 +178,7 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
     // We also include a register-level block of the result (mx x nr).
     // (In an ideal world only the lhs panel would stay in L1)
     // Moreover, kc has to be a multiple of 8 to be compatible with loop peeling, leading to a maximum blocking size of:
-    const Index max_kc = ((l1-k_sub)/k_div) & (~(k_peeling-1));
+    const Index max_kc = std::max<Index>(((l1-k_sub)/k_div) & (~(k_peeling-1)),1);
     const Index old_k = k;
     if(k>max_kc)
     {

From 492fe7ce02f69693a53a018024920c36911aa9a5 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Wed, 3 Feb 2016 12:51:19 -0800
Subject: [PATCH 13/24] Silenced some unhelpful warnings generated by nvcc.

---
 Eigen/src/Core/util/DisableStupidWarnings.h  | 5 +++++
 Eigen/src/Core/util/ReenableStupidWarnings.h | 3 +++
 2 files changed, 8 insertions(+)

diff --git a/Eigen/src/Core/util/DisableStupidWarnings.h b/Eigen/src/Core/util/DisableStupidWarnings.h
index 91c61fcf2..686a7c9ce 100755
--- a/Eigen/src/Core/util/DisableStupidWarnings.h
+++ b/Eigen/src/Core/util/DisableStupidWarnings.h
@@ -20,6 +20,7 @@
     #pragma warning( push )
   #endif
   #pragma warning( disable : 4100 4101 4127 4181 4211 4244 4273 4324 4503 4512 4522 4700 4717 4800)
+
 #elif defined __INTEL_COMPILER
   // 2196 - routine is both "inline" and "noinline" ("noinline" assumed)
   //        ICC 12 generates this warning even without any inline keyword, when defining class methods 'inline' i.e. inside of class body
@@ -32,6 +33,7 @@
     #pragma warning push
   #endif
   #pragma warning disable 2196 279 1684 2259
+
 #elif defined __clang__
   // -Wconstant-logical-operand - warning: use of logical && with constant operand; switch to bitwise & or remove constant
   //     this is really a stupid warning as it warns on compile-time expressions involving enums
@@ -39,6 +41,9 @@
     #pragma clang diagnostic push
   #endif
   #pragma clang diagnostic ignored "-Wconstant-logical-operand"
+
+#elif defined __NVCC__
+  #pragma diag_suppress code_is_unreachable
 #endif
 
 #endif // not EIGEN_WARNINGS_DISABLED
diff --git a/Eigen/src/Core/util/ReenableStupidWarnings.h b/Eigen/src/Core/util/ReenableStupidWarnings.h
index 5ddfbd4aa..4a2e0d54a 100644
--- a/Eigen/src/Core/util/ReenableStupidWarnings.h
+++ b/Eigen/src/Core/util/ReenableStupidWarnings.h
@@ -8,6 +8,9 @@
     #pragma warning pop
   #elif defined __clang__
     #pragma clang diagnostic pop
+  #elif defined __NVCC__
+    #pragma diag_warning code_is_unreachable
+    #pragma diag_warning initialization_not_reachable
   #endif
 #endif
 

From d7742d22e4a1787dc16dac938b4f26601af7b488 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Wed, 3 Feb 2016 13:47:28 -0800
Subject: [PATCH 14/24] Revert the nvcc messages to their default severity
 instead of the forcing them to be warnings

---
 Eigen/src/Core/util/ReenableStupidWarnings.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Eigen/src/Core/util/ReenableStupidWarnings.h b/Eigen/src/Core/util/ReenableStupidWarnings.h
index 4a2e0d54a..a45d17f9e 100644
--- a/Eigen/src/Core/util/ReenableStupidWarnings.h
+++ b/Eigen/src/Core/util/ReenableStupidWarnings.h
@@ -9,8 +9,8 @@
   #elif defined __clang__
     #pragma clang diagnostic pop
   #elif defined __NVCC__
-    #pragma diag_warning code_is_unreachable
-    #pragma diag_warning initialization_not_reachable
+    #pragma diag_default code_is_unreachable
+    #pragma diag_default initialization_not_reachable
   #endif
 #endif
 

From af8436b19694901b580ef58283a6e8b64ccde7d2 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Wed, 3 Feb 2016 13:48:36 -0800
Subject: [PATCH 15/24] Silenced the "calling a __host__ function from a
 __host__ __device__ function is not allowed" messages

---
 unsupported/test/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index eed724bcf..33d31c098 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -158,7 +158,7 @@ if(CUDA_FOUND)
   if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
     set(CUDA_NVCC_FLAGS "-ccbin /usr/bin/clang" CACHE STRING "nvcc flags" FORCE)
   endif()
-  set(CUDA_NVCC_FLAGS "-std=c++11 --relaxed-constexpr -arch compute_30")
+  set(CUDA_NVCC_FLAGS "-std=c++11 --relaxed-constexpr -arch compute_30 -Xcudafe \"--diag_suppress 2651 --diag_suppress 2653\"")
   cuda_include_directories("${CMAKE_CURRENT_BINARY_DIR}" "${CUDA_TOOLKIT_ROOT_DIR}/include")
   set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
 

From 5d82e47ef68d0293086227d4abe7a1c05d4f4fd8 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Wed, 3 Feb 2016 14:10:06 -0800
Subject: [PATCH 16/24] Properly disable nvcc warning messages in user code.

---
 Eigen/src/Core/util/DisableStupidWarnings.h  | 4 ++++
 Eigen/src/Core/util/ReenableStupidWarnings.h | 9 +++++++--
 unsupported/test/CMakeLists.txt              | 2 +-
 3 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/Eigen/src/Core/util/DisableStupidWarnings.h b/Eigen/src/Core/util/DisableStupidWarnings.h
index 686a7c9ce..d325bc062 100755
--- a/Eigen/src/Core/util/DisableStupidWarnings.h
+++ b/Eigen/src/Core/util/DisableStupidWarnings.h
@@ -44,6 +44,10 @@
 
 #elif defined __NVCC__
   #pragma diag_suppress code_is_unreachable
+  #pragma diag_suppress initialization_not_reachable
+  #pragma diag_suppress 2651
+  #pragma diag_suppress 2653
+
 #endif
 
 #endif // not EIGEN_WARNINGS_DISABLED
diff --git a/Eigen/src/Core/util/ReenableStupidWarnings.h b/Eigen/src/Core/util/ReenableStupidWarnings.h
index a45d17f9e..ea88e226c 100644
--- a/Eigen/src/Core/util/ReenableStupidWarnings.h
+++ b/Eigen/src/Core/util/ReenableStupidWarnings.h
@@ -9,8 +9,13 @@
   #elif defined __clang__
     #pragma clang diagnostic pop
   #elif defined __NVCC__
-    #pragma diag_default code_is_unreachable
-    #pragma diag_default initialization_not_reachable
+//    Don't reenable the diagnostic messages, as it turns out these messages need
+//    to be disabled at the point of the template instantiation (i.e the user code)
+//    otherwise they'll be triggeredby nvcc.
+//    #pragma diag_default code_is_unreachable
+//    #pragma diag_default initialization_not_reachable
+//    #pragma diag_default 2651
+//    #pragma diag_default 2653
   #endif
 #endif
 
diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index 33d31c098..42e0189a4 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -158,7 +158,7 @@ if(CUDA_FOUND)
   if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
     set(CUDA_NVCC_FLAGS "-ccbin /usr/bin/clang" CACHE STRING "nvcc flags" FORCE)
   endif()
-  set(CUDA_NVCC_FLAGS "-std=c++11 --relaxed-constexpr -arch compute_30 -Xcudafe \"--diag_suppress 2651 --diag_suppress 2653\"")
+  set(CUDA_NVCC_FLAGS "-std=c++11 --relaxed-constexpr -arch compute_30 -Xcudafe \"--display_error_number\"")
   cuda_include_directories("${CMAKE_CURRENT_BINARY_DIR}" "${CUDA_TOOLKIT_ROOT_DIR}/include")
   set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
 

From f933f69021438af1a42f8dff9451cde0dce2a460 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Wed, 3 Feb 2016 14:12:18 -0800
Subject: [PATCH 17/24] Added a few comments

---
 Eigen/src/Core/util/DisableStupidWarnings.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/Eigen/src/Core/util/DisableStupidWarnings.h b/Eigen/src/Core/util/DisableStupidWarnings.h
index d325bc062..3ed931855 100755
--- a/Eigen/src/Core/util/DisableStupidWarnings.h
+++ b/Eigen/src/Core/util/DisableStupidWarnings.h
@@ -43,8 +43,11 @@
   #pragma clang diagnostic ignored "-Wconstant-logical-operand"
 
 #elif defined __NVCC__
+  // Disable the "statement is unreachable" message
   #pragma diag_suppress code_is_unreachable
+  // Disable the "dynamic initialization in unreachable code" message
   #pragma diag_suppress initialization_not_reachable
+  // Disable the "calling a __host__ function from a __host__ __device__ function is not allowed" messages
   #pragma diag_suppress 2651
   #pragma diag_suppress 2653
 

From bcbde37a1154ef0da3587709af888af54e8b9720 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Wed, 3 Feb 2016 14:53:08 -0800
Subject: [PATCH 18/24] Made sure the code compiles when EIGEN_HAS_C99_MATH
 isn't defined

---
 Eigen/src/Core/SpecialFunctions.h | 27 ++++++++++++++++++---------
 1 file changed, 18 insertions(+), 9 deletions(-)

diff --git a/Eigen/src/Core/SpecialFunctions.h b/Eigen/src/Core/SpecialFunctions.h
index 9f89e184d..6c6b21f98 100644
--- a/Eigen/src/Core/SpecialFunctions.h
+++ b/Eigen/src/Core/SpecialFunctions.h
@@ -134,7 +134,24 @@ struct lgamma_impl<double> {
  * Implementation of digamma (psi)                                          *
  ****************************************************************************/
 
-#ifdef EIGEN_HAS_C99_MATH
+template <typename Scalar>
+struct digamma_retval {
+  typedef Scalar type;
+};
+
+#ifndef EIGEN_HAS_C99_MATH
+
+template <typename Scalar>
+struct digamma_impl {
+  EIGEN_DEVICE_FUNC
+  static Scalar run(Scalar x) {
+    EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
+                        THIS_TYPE_IS_NOT_SUPPORTED);
+    return Scalar(0);
+  }
+};
+
+#else
 
 /*
  *
@@ -202,14 +219,6 @@ struct digamma_impl_maybe_poly<double> {
   }
 };
 
-#endif  // EIGEN_HAS_C99_MATH
-
-template <typename Scalar>
-struct digamma_retval {
-  typedef Scalar type;
-};
-
-#ifdef EIGEN_HAS_C99_MATH
 template <typename Scalar>
 struct digamma_impl {
   EIGEN_DEVICE_FUNC

From 1cbb79cdfd4a43faf43f4095df456731d98961c0 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Wed, 3 Feb 2016 15:58:26 -0800
Subject: [PATCH 19/24] Made sure the dummy element of size 0 array is always
 intialized to silence some compiler warnings

---
 unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h b/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h
index 89aeb03e7..4df0165b9 100644
--- a/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h
+++ b/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h
@@ -165,10 +165,10 @@ template <typename T> class array<T, 0> {
   static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::size_t size() { return 0; }
 
   EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE array() { }
+  EIGEN_STRONG_INLINE array() : dummy(static_cast<T>(0)) { }
 
 #ifdef EIGEN_HAS_VARIADIC_TEMPLATES
-  EIGEN_DEVICE_FUNC array(std::initializer_list<T> l) {
+  EIGEN_DEVICE_FUNC array(std::initializer_list<T> l) : dummy(static_cast<T>(0)) {
     eigen_assert(l.size() == 0);
   }
 #endif

From 727ff2696087fbd96815c33addc649086d31287c Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Wed, 3 Feb 2016 16:01:37 -0800
Subject: [PATCH 20/24] Disable 2 more nvcc warning messages

---
 Eigen/src/Core/util/DisableStupidWarnings.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/Eigen/src/Core/util/DisableStupidWarnings.h b/Eigen/src/Core/util/DisableStupidWarnings.h
index 3ed931855..829b23ac8 100755
--- a/Eigen/src/Core/util/DisableStupidWarnings.h
+++ b/Eigen/src/Core/util/DisableStupidWarnings.h
@@ -47,10 +47,11 @@
   #pragma diag_suppress code_is_unreachable
   // Disable the "dynamic initialization in unreachable code" message
   #pragma diag_suppress initialization_not_reachable
-  // Disable the "calling a __host__ function from a __host__ __device__ function is not allowed" messages
+  // Disable the "calling a __host__ function from a __host__ __device__ function is not allowed" messages (yes, there are 4 of them)
   #pragma diag_suppress 2651
   #pragma diag_suppress 2653
-
+  #pragma diag_suppress 2668
+  #pragma diag_suppress 2670
 #endif
 
 #endif // not EIGEN_WARNINGS_DISABLED

From 4ab63a3f6f6b57243a6016b29361a241a47cce46 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Wed, 3 Feb 2016 17:23:07 -0800
Subject: [PATCH 21/24] Fixed the initialization of the dummy member of the
 array class to make it compatible with pairs of element.

---
 unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h b/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h
index 4df0165b9..56e2b8afc 100644
--- a/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h
+++ b/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h
@@ -165,10 +165,10 @@ template <typename T> class array<T, 0> {
   static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::size_t size() { return 0; }
 
   EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE array() : dummy(static_cast<T>(0)) { }
+  EIGEN_STRONG_INLINE array() : dummy() { }
 
 #ifdef EIGEN_HAS_VARIADIC_TEMPLATES
-  EIGEN_DEVICE_FUNC array(std::initializer_list<T> l) : dummy(static_cast<T>(0)) {
+  EIGEN_DEVICE_FUNC array(std::initializer_list<T> l) : dummy() {
     eigen_assert(l.size() == 0);
   }
 #endif

From f53537899573d8463985a33906a82e6c05a7aff9 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Wed, 3 Feb 2016 18:58:29 -0800
Subject: [PATCH 22/24] Added support for vectorized type casting of int to
 char.

---
 Eigen/src/Core/GenericPacketMath.h            |  5 +++++
 .../Eigen/CXX11/src/Tensor/TensorConversion.h | 20 +++++++++++++++++++
 2 files changed, 25 insertions(+)

diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h
index 8f63af7cb..02882bdea 100644
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@@ -134,6 +134,11 @@ pcast(const SrcPacket& a, const SrcPacket& /*b*/) {
   return static_cast<TgtPacket>(a);
 }
 
+template <typename SrcPacket, typename TgtPacket>
+EIGEN_DEVICE_FUNC inline TgtPacket
+pcast(const SrcPacket& a, const SrcPacket& /*b*/, const SrcPacket& /*c*/, const SrcPacket& /*d*/) {
+  return static_cast<TgtPacket>(a);
+}
 
 /** \internal \returns a + b (coeff-wise) */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
index 877bcd0df..d2defcaf4 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
@@ -86,6 +86,26 @@ struct PacketConverter<TensorEvaluator, SrcPacket, TgtPacket, 2, 1> {
   const TensorEvaluator& m_impl;
 };
 
+template <typename TensorEvaluator, typename SrcPacket, typename TgtPacket>
+struct PacketConverter<TensorEvaluator, SrcPacket, TgtPacket, 4, 1> {
+  PacketConverter(const TensorEvaluator& impl)
+      : m_impl(impl) {}
+
+  template<int LoadMode, typename Index>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const {
+    const int SrcPacketSize = internal::unpacket_traits<SrcPacket>::size;
+
+    SrcPacket src1 = m_impl.template packet<LoadMode>(index);
+    SrcPacket src2 = m_impl.template packet<LoadMode>(index + SrcPacketSize);
+    SrcPacket src3 = m_impl.template packet<LoadMode>(index + 2 * SrcPacketSize);
+    SrcPacket src4 = m_impl.template packet<LoadMode>(index + 3 * SrcPacketSize);
+    TgtPacket result = internal::pcast<SrcPacket, TgtPacket>(src1, src2, src3, src4);
+    return result;
+  }
+
+ private:
+  const TensorEvaluator& m_impl;
+};
 
 template <typename TensorEvaluator, typename SrcPacket, typename TgtPacket>
 struct PacketConverter<TensorEvaluator, SrcPacket, TgtPacket, 1, 2> {

From d5d7798b9d8e1a25aa8928a7e79d36ff7d72b7d7 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Thu, 4 Feb 2016 09:53:47 +0100
Subject: [PATCH 23/24] Improve heuritics for switching between coeff-based and
 general matrix product implementation.

---
 Eigen/src/Core/GeneralProduct.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Eigen/src/Core/GeneralProduct.h b/Eigen/src/Core/GeneralProduct.h
index fe8204ac3..d2290241c 100644
--- a/Eigen/src/Core/GeneralProduct.h
+++ b/Eigen/src/Core/GeneralProduct.h
@@ -125,8 +125,8 @@ template<>              struct product_type_selector<Small,Small,Large>  { enum
 template<>              struct product_type_selector<Large,Small,Large>  { enum { ret = GemmProduct }; };
 template<>              struct product_type_selector<Small,Large,Large>  { enum { ret = GemmProduct }; };
 template<>              struct product_type_selector<Large,Large,Large>  { enum { ret = GemmProduct }; };
-template<>              struct product_type_selector<Large,Small,Small>  { enum { ret = GemmProduct }; };
-template<>              struct product_type_selector<Small,Large,Small>  { enum { ret = GemmProduct }; };
+template<>              struct product_type_selector<Large,Small,Small>  { enum { ret = CoeffBasedProductMode }; };
+template<>              struct product_type_selector<Small,Large,Small>  { enum { ret = CoeffBasedProductMode }; };
 template<>              struct product_type_selector<Large,Large,Small>  { enum { ret = GemmProduct }; };
 
 } // end namespace internal

From 659fc9c1593e0bfd1b886557699573873198fb61 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Thu, 4 Feb 2016 09:55:09 +0100
Subject: [PATCH 24/24] Remove dead code

---
 Eigen/src/Core/GeneralProduct.h           | 26 -----------------------
 Eigen/src/Core/util/ForwardDeclarations.h |  4 ----
 2 files changed, 30 deletions(-)

diff --git a/Eigen/src/Core/GeneralProduct.h b/Eigen/src/Core/GeneralProduct.h
index d2290241c..0769a212e 100644
--- a/Eigen/src/Core/GeneralProduct.h
+++ b/Eigen/src/Core/GeneralProduct.h
@@ -76,32 +76,6 @@ public:
 #endif
 };
 
-// template<typename Lhs, typename Rhs> struct product_tag
-// {
-// private:
-//   
-//   typedef typename remove_all<Lhs>::type _Lhs;
-//   typedef typename remove_all<Rhs>::type _Rhs;
-//   enum {
-//     Rows  = _Lhs::RowsAtCompileTime,
-//     Cols  = _Rhs::ColsAtCompileTime,
-//     Depth = EIGEN_SIZE_MIN_PREFER_FIXED(_Lhs::ColsAtCompileTime, _Rhs::RowsAtCompileTime)
-//   };
-// 
-//   enum {
-//     rows_select = Rows==1 ? int(Rows) : int(Large),
-//     cols_select = Cols==1 ? int(Cols) : int(Large),
-//     depth_select = Depth==1 ? int(Depth) : int(Large)
-//   };
-//   typedef product_type_selector<rows_select, cols_select, depth_select> selector;
-// 
-// public:
-//   enum {
-//     ret = selector::ret
-//   };
-// 
-// };
-
 /* The following allows to select the kind of product at compile time
  * based on the three dimensions of the product.
  * This is a compile time mapping from {1,Small,Large}^3 -> {product types} */
diff --git a/Eigen/src/Core/util/ForwardDeclarations.h b/Eigen/src/Core/util/ForwardDeclarations.h
index 483af876f..31c7088e7 100644
--- a/Eigen/src/Core/util/ForwardDeclarations.h
+++ b/Eigen/src/Core/util/ForwardDeclarations.h
@@ -94,10 +94,6 @@ template<typename BinaryOp,  typename Lhs, typename Rhs>  class CwiseBinaryOp;
 template<typename Decomposition, typename Rhstype>        class Solve;
 template<typename XprType>                                class Inverse;
 
-namespace internal {
-  template<typename Lhs, typename Rhs> struct product_tag;
-}
-
 template<typename Lhs, typename Rhs, int Option = DefaultProduct> class Product;
          
 template<typename Derived> class DiagonalBase;

Operations	Code	Notes
Code	Notes
Sub-matrices
Sub-matrices	\code sm1.block(startRow, startCol, rows, cols); @@ -193,25 +193,31 @@ sm2 = perm * sm1; // Permute the columns sm1.bottomLeftCorner( rows, cols); sm1.bottomRightCorner( rows, cols); \endcode -		+Contrary to dense matrices, here all these methods are read-only.\n +See \ref TutorialSparse_SubMatrices and below for read-write sub-matrices. +
Range
Range
\code - sm1.innerVector(outer); - sm1.innerVectors(start, size); - sm1.leftCols(size); - sm2.rightCols(size); - sm1.middleRows(start, numRows); - sm1.middleCols(start, numCols); - sm1.col(j); + sm1.innerVector(outer); // RW + sm1.innerVectors(start, size); // RW + sm1.leftCols(size); // RW + sm2.rightCols(size); // RO because sm2 is row-major + sm1.middleRows(start, numRows); // RO becasue sm1 is column-major + sm1.middleCols(start, numCols); // RW + sm1.col(j); // RW \endcode	A inner vector is either a row (for row-major) or a column (for column-major). As stated earlier, the evaluation can be done in a matrix with different storage order	+A inner vector is either a row (for row-major) or a column (for column-major).\n +As stated earlier, for a read-write sub-matrix (RW), the evaluation can be done in a matrix with different storage order. +
Triangular and selfadjoint views
Triangular and selfadjoint views	\code sm2 = sm1.triangularview(); @@ -222,26 +228,30 @@ sm2 = perm * sm1; // Permute the columns \code \endcode
Triangular solve
Triangular solve
\code dv2 = sm1.triangularView().solve(dv1); - dv2 = sm1.topLeftCorner(size, size).triangularView().solve(dv1); + dv2 = sm1.topLeftCorner(size, size) + .triangularView().solve(dv1); \endcode	For general sparse solve, Use any suitable module described at \ref TopicSparseSystems
Low-level API
Low-level API	\code -sm1.valuePtr(); // Pointer to the values -sm1.innerIndextr(); // Pointer to the indices. -sm1.outerIndexPtr(); //Pointer to the beginning of each inner vector +sm1.valuePtr(); // Pointer to the values +sm1.innerIndextr(); // Pointer to the indices. +sm1.outerIndexPtr(); // Pointer to the beginning of each inner vector \endcode	If the matrix is not in compressed form, makeCompressed() should be called before. Note that these functions are mostly provided for interoperability purposes with external libraries. A better access to the values of the matrix is done by using the InnerIterator class as described in \link TutorialSparse the Tutorial Sparse \endlink section	+If the matrix is not in compressed form, makeCompressed() should be called before.\n +Note that these functions are mostly provided for interoperability purposes with external libraries.\n +A better access to the values of the matrix is done by using the InnerIterator class as described in \link TutorialSparse the Tutorial Sparse \endlink section