From 07ac4f7e027cddd3457a34295420480f7e541ac5 Mon Sep 17 00:00:00 2001
From: Rasmus Munk Larsen <rmlarsen@google.com>
Date: Thu, 14 Apr 2016 18:28:23 -0700
Subject: [PATCH 01/27] Eigen Tensor cost model part 2: Thread scheduling for
 standard evaluators and reductions. The cost model is turned off by default.

---
 .../Eigen/CXX11/src/Tensor/TensorCostModel.h  |   6 +-
 .../Eigen/CXX11/src/Tensor/TensorEvaluator.h  |   5 +
 .../Eigen/CXX11/src/Tensor/TensorExecutor.h   |  91 ++++++++++-----
 .../Eigen/CXX11/src/Tensor/TensorReduction.h  | 104 +++++++-----------
 4 files changed, 110 insertions(+), 96 deletions(-)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
index 32bc5d0b2..4e8f86674 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
@@ -10,9 +10,9 @@
 #ifndef EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
 #define EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
 
-#if !defined(EIGEN_USE_GPU)
-#define EIGEN_USE_COST_MODEL
-#endif
+//#if !defined(EIGEN_USE_GPU)
+//#define EIGEN_USE_COST_MODEL
+//#endif
 
 namespace Eigen {
 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
index f1f9a90df..293012646 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
@@ -189,6 +189,11 @@ struct TensorEvaluator<const Derived, Device>
     return loadConstant(m_data+index);
   }
 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+    return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized,
+                        internal::unpacket_traits<PacketReturnType>::size);
+  }
+
   EIGEN_DEVICE_FUNC const Scalar* data() const { return m_data; }
 
  protected:
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index eabfd91fe..df9cc0998 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -59,9 +59,16 @@ class TensorExecutor<Expression, DefaultDevice, true>
     {
       const Index size = array_prod(evaluator.dimensions());
       const int PacketSize = unpacket_traits<typename TensorEvaluator<Expression, DefaultDevice>::PacketReturnType>::size;
+      // Manually unroll this loop since compilers don't do it.
+      const Index UnrolledSize = (size / (4 * PacketSize)) * 4 * PacketSize;
+      for (Index i = 0; i < UnrolledSize; i += 4*PacketSize) {
+        evaluator.evalPacket(i);
+        evaluator.evalPacket(i+PacketSize);
+        evaluator.evalPacket(i+2*PacketSize);
+        evaluator.evalPacket(i+3*PacketSize);
+      }
       const Index VectorizedSize = (size / PacketSize) * PacketSize;
-
-      for (Index i = 0; i < VectorizedSize; i += PacketSize) {
+      for (Index i = UnrolledSize; i < VectorizedSize; i += PacketSize) {
         evaluator.evalPacket(i);
       }
       for (Index i = VectorizedSize; i < size; ++i) {
@@ -78,8 +85,9 @@ class TensorExecutor<Expression, DefaultDevice, true>
 #ifdef EIGEN_USE_THREADS
 template <typename Evaluator, typename Index, bool Vectorizable>
 struct EvalRange {
-  static void run(Evaluator evaluator, const Index first, const Index last) {
-    eigen_assert(last > first);
+  static void run(void* evaluator_in, const Index first, const Index last) {
+    Evaluator evaluator(*static_cast<Evaluator*>(evaluator_in));
+    eigen_assert(last >= first);
     for (Index i = first; i < last; ++i) {
       evaluator.evalScalar(i);
     }
@@ -88,28 +96,45 @@ struct EvalRange {
 
 template <typename Evaluator, typename Index>
 struct EvalRange<Evaluator, Index, true> {
-  static void run(Evaluator evaluator, const Index first, const Index last) {
-    eigen_assert(last > first);
+  static void run(void* evaluator_in, const Index first, const Index last) {
+    Evaluator evaluator(*static_cast<Evaluator*>(evaluator_in));
+    eigen_assert(last >= first);
 
     Index i = first;
-    static const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
+    const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
     if (last - first >= PacketSize) {
       eigen_assert(first % PacketSize == 0);
-      Index lastPacket = last - (last % PacketSize);
-      for (; i < lastPacket; i += PacketSize) {
+      Index last_chunk_offset = last - 4 * PacketSize;
+      // Manually unroll this loop since compilers don't do it.
+      for (; i <= last_chunk_offset; i += 4*PacketSize) {
+        evaluator.evalPacket(i);
+        evaluator.evalPacket(i+PacketSize);
+        evaluator.evalPacket(i+2*PacketSize);
+        evaluator.evalPacket(i+3*PacketSize);
+      }
+      last_chunk_offset = last - PacketSize;
+      for (; i <= last_chunk_offset; i += PacketSize) {
         evaluator.evalPacket(i);
       }
     }
-
     for (; i < last; ++i) {
       evaluator.evalScalar(i);
     }
   }
 };
 
-template<typename Expression, bool Vectorizable>
-class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable>
-{
+// Used to make an std::function to add to the ThreadPool with less templating
+// than EvalRange::Run.
+// This requires that this and EvalRange takes a void* to the evaluator that can
+// be downcast to the right type by the EvalRange.
+template <typename Index>
+inline void InvokeEvalRange(void (*run_fn)(void*, const Index, const Index),
+                      void* evaluator, const Index first, const Index last) {
+  run_fn(evaluator, first, last);
+}
+
+template <typename Expression, bool Vectorizable>
+class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable> {
  public:
   typedef typename Expression::Index Index;
   static inline void run(const Expression& expr, const ThreadPoolDevice& device)
@@ -119,24 +144,35 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable>
     const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
     if (needs_assign)
     {
+      const Index PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1;
       const Index size = array_prod(evaluator.dimensions());
-
-      static const int PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1;
-
-      int blocksz = std::ceil<int>(static_cast<float>(size)/device.numThreads()) + PacketSize - 1;
-      const Index blocksize = numext::maxi<Index>(PacketSize, (blocksz - (blocksz % PacketSize)));
-      const unsigned int numblocks = static_cast<unsigned int>(size / blocksize);
-
-      Barrier barrier(numblocks);
-      for (unsigned int i = 0; i < numblocks; ++i) {
-        device.enqueue_with_barrier(&barrier, &EvalRange<Evaluator, Index, Vectorizable>::run, evaluator, i*blocksize, (i+1)*blocksize);
+      int num_threads = device.numThreads();
+#ifdef EIGEN_USE_COST_MODEL
+      if (num_threads > 1) {
+        num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
+            size, evaluator.costPerCoeff(Vectorizable), num_threads);
       }
+#endif
+      if (num_threads == 1) {
+        EvalRange<Evaluator, Index, Vectorizable>::run(&evaluator, 0, size);
+      } else {
+        Index blocksz = std::ceil<Index>(static_cast<float>(size)/num_threads) + PacketSize - 1;
+        const Index blocksize = numext::maxi<Index>(PacketSize, (blocksz - (blocksz % PacketSize)));
+        const Index numblocks = size / blocksize;
 
-      if (static_cast<Index>(numblocks) * blocksize < size) {
-        EvalRange<Evaluator, Index, Vectorizable>::run(evaluator, numblocks * blocksize, size);
+        Barrier barrier(numblocks);
+        for (int i = 0; i < numblocks; ++i) {
+          device.enqueue_with_barrier(
+              &barrier, &InvokeEvalRange<Index>,
+              &EvalRange<Evaluator, Index, Vectorizable>::run,
+              static_cast<void*>(&evaluator), i * blocksize,
+              (i + 1) * blocksize);
+        }
+        if (numblocks * blocksize < size) {
+          EvalRange<Evaluator, Index, Vectorizable>::run(&evaluator, numblocks * blocksize, size);
+        }
+        barrier.Wait();
       }
-
-      barrier.Wait();
     }
     evaluator.cleanup();
   }
@@ -226,7 +262,6 @@ inline void TensorExecutor<Expression, GpuDevice, Vectorizable>::run(
 #endif  // __CUDACC__
 #endif  // EIGEN_USE_GPU
 
-
 } // end namespace internal
 
 } // end namespace Eigen
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index 1c9e7ab66..885295f0a 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -214,7 +214,7 @@ struct FullReducer {
 
   static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const Device&, typename Self::CoeffReturnType* output) {
     const typename Self::Index num_coeffs = array_prod(self.m_impl.dimensions());
-    *output = InnerMostDimReducer<Self, Op>::reduce(self, 0, num_coeffs, reducer);
+    *output = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer);
   }
 };
 
@@ -222,18 +222,19 @@ struct FullReducer {
 #ifdef EIGEN_USE_THREADS
 // Multithreaded full reducers
 template <typename Self, typename Op,
-          bool vectorizable = (Self::InputPacketAccess & Op::PacketAccess)>
+          bool Vectorizable = (Self::InputPacketAccess & Op::PacketAccess)>
 struct FullReducerShard {
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Self& self, typename Self::Index firstIndex,
                   typename Self::Index numValuesToReduce, Op& reducer,
                   typename Self::CoeffReturnType* output) {
-    *output = InnerMostDimReducer<Self, Op, vectorizable>::reduce(
+    *output = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(
         self, firstIndex, numValuesToReduce, reducer);
   }
 };
 
-template <typename Self, typename Op>
-struct FullReducer<Self, Op, ThreadPoolDevice, false> {
+// Multithreaded full reducer
+template <typename Self, typename Op, bool Vectorizable>
+struct FullReducer<Self, Op, ThreadPoolDevice, Vectorizable> {
   static const bool HasOptimizedImplementation = !Op::IsStateful;
   static const int PacketSize =
       unpacket_traits<typename Self::PacketReturnType>::size;
@@ -247,79 +248,44 @@ struct FullReducer<Self, Op, ThreadPoolDevice, false> {
       *output = reducer.finalize(reducer.initialize());
       return;
     }
-    const std::size_t num_threads = device.numThreads();
+#ifdef EIGEN_USE_COST_MODEL
+    const TensorOpCost cost =
+        self.m_impl.costPerCoeff(Vectorizable) +
+        TensorOpCost(0, 0, internal::functor_traits<Op>::Cost, Vectorizable,
+                     PacketSize);
+    const int num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
+        num_coeffs, cost, device.numThreads());
+#else
+    const int num_threads = device.numThreads();
+#endif
     if (num_threads == 1) {
-      *output = InnerMostDimReducer<Self, Op, false>::reduce(self, 0, num_coeffs, reducer);
-      return;
-    } else {
-      const Index blocksize = std::floor<Index>(static_cast<float>(num_coeffs) / num_threads);
-      const unsigned int numblocks = blocksize > 0 ? static_cast<unsigned int>(num_coeffs / blocksize) : 0;
-      eigen_assert(num_coeffs >= static_cast<Index>(numblocks) * blocksize);
-
-      Barrier barrier(numblocks);
-      MaxSizeVector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize());
-      for (unsigned int i = 0; i < numblocks; ++i) {
-        device.enqueue_with_barrier(&barrier, &FullReducerShard<Self, Op, false>::run, self,
-                                    i * blocksize, blocksize, reducer, &shards[i]);
-      }
-
-      typename Self::CoeffReturnType finalShard;
-      if (static_cast<Index>(numblocks) * blocksize < num_coeffs) {
-        finalShard = InnerMostDimReducer<Self, Op, false>::reduce(
-            self, numblocks * blocksize, num_coeffs - numblocks * blocksize, reducer);
-      } else {
-        finalShard = reducer.initialize();
-      }
-      barrier.Wait();
-      for (unsigned int i = 0; i < numblocks; ++i) {
-        reducer.reduce(shards[i], &finalShard);
-      }
-      *output = reducer.finalize(finalShard);
-    }
-  }
-};
-
-template <typename Self, typename Op>
-struct FullReducer<Self, Op, ThreadPoolDevice, true> {
-  static const bool HasOptimizedImplementation = !Op::IsStateful;
-  static const int PacketSize =
-      unpacket_traits<typename Self::PacketReturnType>::size;
-
-  // launch one reducer per thread and accumulate the result.
-  static void run(const Self& self, Op& reducer, const ThreadPoolDevice& device,
-                  typename Self::CoeffReturnType* output) {
-    typedef typename Self::Index Index;
-    const Index num_coeffs = array_prod(self.m_impl.dimensions());
-    if (num_coeffs == 0) {
-      *output = reducer.finalize(reducer.initialize());
+      *output =
+          InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer);
       return;
     }
-    const std::size_t num_threads = device.numThreads();
-    if (num_threads == 1) {
-      *output = InnerMostDimReducer<Self, Op, true>::reduce(self, 0, num_coeffs, reducer);
-      return;
-    }
-    const Index blocksize = std::floor<Index>(static_cast<float>(num_coeffs) / num_threads);
-    const unsigned int numblocks = blocksize > 0 ? static_cast<unsigned int>(num_coeffs / blocksize) : 0;
-    eigen_assert(num_coeffs >= static_cast<Index>(numblocks) * blocksize);
+    const Index blocksize =
+        std::floor<Index>(static_cast<float>(num_coeffs) / num_threads);
+    const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0;
+    eigen_assert(num_coeffs >= numblocks * blocksize);
 
     Barrier barrier(numblocks);
     MaxSizeVector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize());
-    for (unsigned int i = 0; i < numblocks; ++i) {
-      device.enqueue_with_barrier(&barrier, &FullReducerShard<Self, Op, true>::run,
+    for (Index i = 0; i < numblocks; ++i) {
+      device.enqueue_with_barrier(&barrier, &FullReducerShard<Self, Op, Vectorizable>::run,
                                   self, i * blocksize, blocksize, reducer,
                                   &shards[i]);
     }
     typename Self::CoeffReturnType finalShard;
-    if (static_cast<Index>(numblocks) * blocksize < num_coeffs) {
-      finalShard = InnerMostDimReducer<Self, Op, true>::reduce(
-          self, numblocks * blocksize, num_coeffs - numblocks * blocksize, reducer);
+    if (numblocks * blocksize < num_coeffs) {
+      finalShard = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(
+          self, numblocks * blocksize, num_coeffs - numblocks * blocksize,
+          reducer);
     } else {
       finalShard = reducer.initialize();
     }
-
     barrier.Wait();
-    for (unsigned int i = 0; i < numblocks; ++i) {
+
+    for (Index i = 0; i < numblocks; ++i) {
       reducer.reduce(shards[i], &finalShard);
     }
     *output = reducer.finalize(finalShard);
@@ -498,13 +464,21 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
+  static bool size_large_enough(Index total_size) {
+#ifndef EIGEN_USE_COST_MODEL
+    return total_size > 1024 * 1024;
+#else
+    return true || total_size;
+#endif
+  }
+
   EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool evalSubExprsIfNeeded(CoeffReturnType* data) {
     m_impl.evalSubExprsIfNeeded(NULL);
 
     // Use the FullReducer if possible.
     if (RunningFullReduction && internal::FullReducer<Self, Op, Device>::HasOptimizedImplementation &&
         ((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) ||
-         (!RunningOnGPU && (internal::array_prod(m_impl.dimensions()) > 1024 * 1024)))) {
+         (!RunningOnGPU && size_large_enough(internal::array_prod(m_impl.dimensions()))))) {
 
       bool need_assign = false;
       if (!data) {

From 1e80bddde3756ac7cd36a0db5e7d2493a7b93066 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Fri, 15 Apr 2016 17:58:36 +0200
Subject: [PATCH 02/27] Fix trmv for mixing types.

---
 Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h | 2 +-
 Eigen/src/Core/products/TriangularMatrixVector.h        | 4 ++--
 test/mixingtypes.cpp                                    | 8 ++++++--
 3 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h
index 831089dee..80ba89465 100644
--- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h
@@ -43,7 +43,7 @@ struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,
   typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
   static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* lhs, Index lhsStride,
                                       const RhsScalar* rhs, Index rhsStride, ResScalar* res, Index resStride,
-                                      const ResScalar& alpha, level3_blocking<LhsScalar,RhsScalar>& blocking)
+                                      const ResScalar& alpha, level3_blocking<RhsScalar,LhsScalar>& blocking)
   {
     general_matrix_matrix_triangular_product<Index,
         RhsScalar, RhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateRhs,
diff --git a/Eigen/src/Core/products/TriangularMatrixVector.h b/Eigen/src/Core/products/TriangularMatrixVector.h
index 7c014b72a..f79840aa7 100644
--- a/Eigen/src/Core/products/TriangularMatrixVector.h
+++ b/Eigen/src/Core/products/TriangularMatrixVector.h
@@ -27,13 +27,13 @@ struct triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsScalar,C
     HasZeroDiag = (Mode & ZeroDiag)==ZeroDiag
   };
   static EIGEN_DONT_INLINE  void run(Index _rows, Index _cols, const LhsScalar* _lhs, Index lhsStride,
-                                     const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const ResScalar& alpha);
+                                     const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const RhsScalar& alpha);
 };
 
 template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs, int Version>
 EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsScalar,ConjRhs,ColMajor,Version>
   ::run(Index _rows, Index _cols, const LhsScalar* _lhs, Index lhsStride,
-        const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const ResScalar& alpha)
+        const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const RhsScalar& alpha)
   {
     static const Index PanelWidth = EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH;
     Index size = (std::min)(_rows,_cols);
diff --git a/test/mixingtypes.cpp b/test/mixingtypes.cpp
index a3b469af8..0b381ec6c 100644
--- a/test/mixingtypes.cpp
+++ b/test/mixingtypes.cpp
@@ -148,10 +148,14 @@ template<int SizeAtCompileType> void mixingtypes(int size = SizeAtCompileType)
   VERIFY_IS_APPROX(sd*vd.adjoint()*mcd,  sd*vd.adjoint().template cast<CD>().eval()*mcd);
   VERIFY_IS_APPROX(scd*vd.adjoint()*mcd, scd*vd.adjoint().template cast<CD>().eval()*mcd);
 
-  VERIFY_IS_APPROX(sd*vcd.adjoint()*md.template triangularView<Upper>(),  sd*vcd.adjoint()*md.template cast<CD>().eval().template triangularView<Upper>());
+  VERIFY_IS_APPROX( sd*vcd.adjoint()*md.template triangularView<Upper>(),  sd*vcd.adjoint()*md.template cast<CD>().eval().template triangularView<Upper>());
   VERIFY_IS_APPROX(scd*vcd.adjoint()*md.template triangularView<Lower>(), scd*vcd.adjoint()*md.template cast<CD>().eval().template triangularView<Lower>());
-  VERIFY_IS_APPROX(sd*vd.adjoint()*mcd.template triangularView<Lower>(),  sd*vd.adjoint().template cast<CD>().eval()*mcd.template triangularView<Lower>());
+  VERIFY_IS_APPROX( sd*vcd.adjoint()*md.transpose().template triangularView<Upper>(),  sd*vcd.adjoint()*md.transpose().template cast<CD>().eval().template triangularView<Upper>());
+  VERIFY_IS_APPROX(scd*vcd.adjoint()*md.transpose().template triangularView<Lower>(), scd*vcd.adjoint()*md.transpose().template cast<CD>().eval().template triangularView<Lower>());
+  VERIFY_IS_APPROX( sd*vd.adjoint()*mcd.template triangularView<Lower>(),  sd*vd.adjoint().template cast<CD>().eval()*mcd.template triangularView<Lower>());
   VERIFY_IS_APPROX(scd*vd.adjoint()*mcd.template triangularView<Upper>(), scd*vd.adjoint().template cast<CD>().eval()*mcd.template triangularView<Upper>());
+  VERIFY_IS_APPROX( sd*vd.adjoint()*mcd.transpose().template triangularView<Lower>(),  sd*vd.adjoint().template cast<CD>().eval()*mcd.transpose().template triangularView<Lower>());
+  VERIFY_IS_APPROX(scd*vd.adjoint()*mcd.transpose().template triangularView<Upper>(), scd*vd.adjoint().template cast<CD>().eval()*mcd.transpose().template triangularView<Upper>());
 
   // Not supported yet: trmm
 //   VERIFY_IS_APPROX(sd*mcd*md.template triangularView<Lower>(),  sd*mcd*md.template cast<CD>().eval().template triangularView<Lower>());

From 1d2343062805edb86113e2aef5ebcbe5030a57a5 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Fri, 15 Apr 2016 10:53:31 -0700
Subject: [PATCH 03/27] Improved the matrix multiplication blocking in the case
 where mr is not a power of 2 (e.g. on Haswell CPUs).

---
 .../Core/products/GeneralBlockPanelKernel.h   | 38 +++++++++----------
 1 file changed, 17 insertions(+), 21 deletions(-)

diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index 267ac1de9..3d35c8d46 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -11,8 +11,8 @@
 #define EIGEN_GENERAL_BLOCK_PANEL_H
 
 
-namespace Eigen { 
-  
+namespace Eigen {
+
 namespace internal {
 
 template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs=false, bool _ConjRhs=false>
@@ -36,7 +36,7 @@ const std::ptrdiff_t defaultL3CacheSize = 512*1024;
 #endif
 
 /** \internal */
-struct CacheSizes { 
+struct CacheSizes {
   CacheSizes(): m_l1(-1),m_l2(-1),m_l3(-1) {
     int l1CacheSize, l2CacheSize, l3CacheSize;
     queryCacheSizes(l1CacheSize, l2CacheSize, l3CacheSize);
@@ -107,13 +107,9 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
     enum {
       kdiv = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
       ksub = Traits::mr * Traits::nr * sizeof(ResScalar),
-      k_mask = -8,
-
+      kr = 8,
       mr = Traits::mr,
-      mr_mask = -mr,
-
       nr = Traits::nr,
-      nr_mask = -nr
     };
     // Increasing k gives us more time to prefetch the content of the "C"
     // registers. However once the latency is hidden there is no point in
@@ -121,7 +117,7 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
     // experimentally).
     const Index k_cache = (std::min<Index>)((l1-ksub)/kdiv, 320);
     if (k_cache < k) {
-      k = k_cache & k_mask;
+      k = k_cache - (k_cache % kr);
       eigen_internal_assert(k > 0);
     }
 
@@ -130,10 +126,10 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
     if (n_cache <= n_per_thread) {
       // Don't exceed the capacity of the l2 cache.
       eigen_internal_assert(n_cache >= static_cast<Index>(nr));
-      n = n_cache & nr_mask;
+      n = n_cache - (n_cache % nr);
       eigen_internal_assert(n > 0);
     } else {
-      n = (std::min<Index>)(n, (n_per_thread + nr - 1) & nr_mask);
+      n = (std::min<Index>)(n, (n_per_thread + nr - 1) - ((n_per_thread + nr - 1) % nr));
     }
 
     if (l3 > l2) {
@@ -141,10 +137,10 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
       const Index m_cache = (l3-l2) / (sizeof(LhsScalar) * k * num_threads);
       const Index m_per_thread = numext::div_ceil(m, num_threads);
       if(m_cache < m_per_thread && m_cache >= static_cast<Index>(mr)) {
-        m = m_cache & mr_mask;
+        m = m_cache - (m_cache % mr);
         eigen_internal_assert(m > 0);
       } else {
-        m = (std::min<Index>)(m, (m_per_thread + mr - 1) & mr_mask);
+        m = (std::min<Index>)(m, (m_per_thread + mr - 1) - ((m_per_thread + mr - 1) % mr));
       }
     }
   }
@@ -156,23 +152,23 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
     l2 = 32*1024;
     l3 = 512*1024;
 #endif
-    
+
     // Early return for small problems because the computation below are time consuming for small problems.
     // Perhaps it would make more sense to consider k*n*m??
     // Note that for very tiny problem, this function should be bypassed anyway
     // because we use the coefficient-based implementation for them.
     if((std::max)(k,(std::max)(m,n))<48)
       return;
-    
+
     typedef typename Traits::ResScalar ResScalar;
     enum {
       k_peeling = 8,
       k_div = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
       k_sub = Traits::mr * Traits::nr * sizeof(ResScalar)
     };
-    
+
     // ---- 1st level of blocking on L1, yields kc ----
-    
+
     // Blocking on the third dimension (i.e., k) is chosen so that an horizontal panel
     // of size mr x kc of the lhs plus a vertical panel of kc x nr of the rhs both fits within L1 cache.
     // We also include a register-level block of the result (mx x nr).
@@ -187,12 +183,12 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
       //    while keeping the same number of sweeps over the result.
       k = (k%max_kc)==0 ? max_kc
                         : max_kc - k_peeling * ((max_kc-1-(k%max_kc))/(k_peeling*(k/max_kc+1)));
-                        
+
       eigen_internal_assert(((old_k/k) == (old_k/max_kc)) && "the number of sweeps has to remain the same");
     }
-    
+
     // ---- 2nd level of blocking on max(L2,L3), yields nc ----
-    
+
     // TODO find a reliable way to get the actual amount of cache per core to use for 2nd level blocking, that is:
     //      actual_l2 = max(l2, l3/nb_core_sharing_l3)
     // The number below is quite conservative: it is better to underestimate the cache size rather than overestimating it)
@@ -202,7 +198,7 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
     #else
     const Index actual_l2 = 1572864; // == 1.5 MB
     #endif
-    
+
     // Here, nc is chosen such that a block of kc x nc of the rhs fit within half of L2.
     // The second half is implicitly reserved to access the result and lhs coefficients.
     // When k<max_kc, then nc can arbitrarily growth. In practice, it seems to be fruitful

From 40c9923a8aa84bb273e003a92e7cbd644bdecc4c Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Fri, 15 Apr 2016 11:27:52 -0700
Subject: [PATCH 04/27] Fixed compilation errors with MSVC

---
 unsupported/test/cxx11_eventcount.cpp | 4 ++--
 unsupported/test/cxx11_runqueue.cpp   | 3 ++-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/unsupported/test/cxx11_eventcount.cpp b/unsupported/test/cxx11_eventcount.cpp
index 271e0f66d..ebd019e2d 100644
--- a/unsupported/test/cxx11_eventcount.cpp
+++ b/unsupported/test/cxx11_eventcount.cpp
@@ -67,8 +67,8 @@ const int TestQueue::kQueueSize;
 static void test_stress_eventcount()
 {
   const int kThreads = std::thread::hardware_concurrency();
-  const int kEvents = 1 << 16;
-  const int kQueues = 10;
+  static const int kEvents = 1 << 16;
+  static const int kQueues = 10;
 
   std::vector<EventCount::Waiter> waiters(kThreads);
   EventCount ec(waiters);
diff --git a/unsupported/test/cxx11_runqueue.cpp b/unsupported/test/cxx11_runqueue.cpp
index f1217a01b..b1e2dbd6d 100644
--- a/unsupported/test/cxx11_runqueue.cpp
+++ b/unsupported/test/cxx11_runqueue.cpp
@@ -9,6 +9,7 @@
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
 #define EIGEN_USE_THREADS
+#include <cstdlib>
 #include "main.h"
 #include <Eigen/CXX11/ThreadPool>
 
@@ -138,7 +139,7 @@ void test_empty_runqueue()
 // PopBack. Ensure that we don't crash, deadlock, and all sanity checks pass.
 void test_stress_runqueue()
 {
-  const int kEvents = 1 << 18;
+  static const int kEvents = 1 << 18;
   RunQueue<int, 8> q;
   std::atomic<int> total(0);
   std::vector<std::unique_ptr<std::thread>> threads;

From 3718bf654bd173ae05f396f5d0cff1a4e15ef72d Mon Sep 17 00:00:00 2001
From: Rasmus Munk Larsen <rmlarsen@google.com>
Date: Fri, 15 Apr 2016 12:51:33 -0700
Subject: [PATCH 05/27] Get rid of void* casting when calling EvalRange::run.

---
 .../Eigen/CXX11/src/Tensor/TensorExecutor.h   | 28 ++++++-------------
 1 file changed, 8 insertions(+), 20 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index df9cc0998..7a54f7a23 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -85,8 +85,8 @@ class TensorExecutor<Expression, DefaultDevice, true>
 #ifdef EIGEN_USE_THREADS
 template <typename Evaluator, typename Index, bool Vectorizable>
 struct EvalRange {
-  static void run(void* evaluator_in, const Index first, const Index last) {
-    Evaluator evaluator(*static_cast<Evaluator*>(evaluator_in));
+  static void run(Evaluator* evaluator_in, const Index first, const Index last) {
+    Evaluator evaluator = *evaluator_in;
     eigen_assert(last >= first);
     for (Index i = first; i < last; ++i) {
       evaluator.evalScalar(i);
@@ -96,10 +96,9 @@ struct EvalRange {
 
 template <typename Evaluator, typename Index>
 struct EvalRange<Evaluator, Index, true> {
-  static void run(void* evaluator_in, const Index first, const Index last) {
-    Evaluator evaluator(*static_cast<Evaluator*>(evaluator_in));
+  static void run(Evaluator* evaluator_in, const Index first, const Index last) {
+    Evaluator evaluator = *evaluator_in;
     eigen_assert(last >= first);
-
     Index i = first;
     const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
     if (last - first >= PacketSize) {
@@ -123,16 +122,6 @@ struct EvalRange<Evaluator, Index, true> {
   }
 };
 
-// Used to make an std::function to add to the ThreadPool with less templating
-// than EvalRange::Run.
-// This requires that this and EvalRange takes a void* to the evaluator that can
-// be downcast to the right type by the EvalRange.
-template <typename Index>
-inline void InvokeEvalRange(void (*run_fn)(void*, const Index, const Index),
-                      void* evaluator, const Index first, const Index last) {
-  run_fn(evaluator, first, last);
-}
-
 template <typename Expression, bool Vectorizable>
 class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable> {
  public:
@@ -163,13 +152,12 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable> {
         Barrier barrier(numblocks);
         for (int i = 0; i < numblocks; ++i) {
           device.enqueue_with_barrier(
-              &barrier, &InvokeEvalRange<Index>,
-              &EvalRange<Evaluator, Index, Vectorizable>::run,
-              static_cast<void*>(&evaluator), i * blocksize,
-              (i + 1) * blocksize);
+              &barrier, &EvalRange<Evaluator, Index, Vectorizable>::run,
+              &evaluator, i * blocksize, (i + 1) * blocksize);
         }
         if (numblocks * blocksize < size) {
-          EvalRange<Evaluator, Index, Vectorizable>::run(&evaluator, numblocks * blocksize, size);
+          EvalRange<Evaluator, Index, Vectorizable>::run(
+              &evaluator, numblocks * blocksize, size);
         }
         barrier.Wait();
       }

From 2a7115daca3e36e1486c4e22f05ad11f8a055030 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Fri, 15 Apr 2016 22:34:11 +0200
Subject: [PATCH 06/27] bug #1203: by-pass large stack-allocation in stableNorm
 if EIGEN_STACK_ALLOCATION_LIMIT is too small

---
 Eigen/src/Core/StableNorm.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/Eigen/src/Core/StableNorm.h b/Eigen/src/Core/StableNorm.h
index 7fe39808b..d2fe1e199 100644
--- a/Eigen/src/Core/StableNorm.h
+++ b/Eigen/src/Core/StableNorm.h
@@ -168,11 +168,12 @@ MatrixBase<Derived>::stableNorm() const
   DerivedCopy copy(derived());
   
   enum {
-    CanAlign = (int(Flags)&DirectAccessBit) || (int(internal::evaluator<DerivedCopyClean>::Alignment)>0) // FIXME
+    CanAlign = (   (int(DerivedCopyClean::Flags)&DirectAccessBit)
+                || (int(internal::evaluator<DerivedCopyClean>::Alignment)>0) // FIXME Alignment)>0 might not be enough
+               ) && (blockSize*sizeof(Scalar)*2<EIGEN_STACK_ALLOCATION_LIMIT) // if we cannot allocate on the stack, then let's not bother about this optimization
   };
   typedef typename internal::conditional<CanAlign, Ref<const Matrix<Scalar,Dynamic,1,0,blockSize,1>, internal::evaluator<DerivedCopyClean>::Alignment>,
-                                                   typename DerivedCopyClean
-                                                   ::ConstSegmentReturnType>::type SegmentWrapper;
+                                                   typename DerivedCopyClean::ConstSegmentReturnType>::type SegmentWrapper;
   Index n = size();
   
   if(n==1)

From 6c43c49e4aaf833e97a3c759b79d952bfd0a4d9c Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Fri, 15 Apr 2016 15:34:34 -0700
Subject: [PATCH 07/27] Fixed a few compilation warnings

---
 unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h | 2 +-
 unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h   | 2 +-
 unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h  | 1 -
 unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h | 1 -
 4 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
index 1627d4870..2671f4357 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
@@ -440,7 +440,7 @@ struct TensorContractionEvaluatorBase
     return m_result[index];
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool) const {
     return TensorOpCost(sizeof(CoeffReturnType), 0, 0);
   }
 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h
index 9e5791bd3..8ff7d5815 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h
@@ -146,7 +146,7 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
-  costPerCoeff(bool vectorized) const {
+  costPerCoeff(bool) const {
     // TODO(rmlarsen): This is just a placeholder. Define interface to make
     // generators return their cost.
     return TensorOpCost(0, 0, TensorOpCost::AddCost<Scalar>() +
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h
index 0b769f278..bafcc67bd 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h
@@ -448,7 +448,6 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
  protected:
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const
   {
-    const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
     EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
     for (int i = 0; i < PacketSize; ++i) {
       values[i] = coeff(index+i);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
index f8db43dbe..e735fc76f 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
@@ -525,7 +525,6 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
  protected:
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const
   {
-    const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
     EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
     for (int i = 0; i < PacketSize; ++i) {
       values[i] = coeff(index+i);

From 7cff898e0a2a576a17168f36ba3576b764bbf326 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Fri, 15 Apr 2016 15:46:14 -0700
Subject: [PATCH 08/27] Deleted unnecessary variable

---
 unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
index 293012646..ae4ce3c90 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
@@ -459,7 +459,6 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
   template<int LoadMode>
   EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const
   {
-    const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
     internal::Selector<PacketSize> select;
     for (Index i = 0; i < PacketSize; ++i) {
       select.select[i] = m_condImpl.coeff(index+i);

From 1a16fb1532552c87fb1cb67ef40679b7a4744613 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Fri, 15 Apr 2016 15:50:13 -0700
Subject: [PATCH 09/27] Deleted extraneous comma.

---
 Eigen/src/Core/products/GeneralBlockPanelKernel.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index 3d35c8d46..bd559dc6a 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -109,7 +109,7 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
       ksub = Traits::mr * Traits::nr * sizeof(ResScalar),
       kr = 8,
       mr = Traits::mr,
-      nr = Traits::nr,
+      nr = Traits::nr
     };
     // Increasing k gives us more time to prefetch the content of the "C"
     // registers. However once the latency is hidden there is no point in

From c8e8f93d6ce9ed02a40ea08b5ccc1ef6e82de0f2 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Fri, 15 Apr 2016 16:48:10 -0700
Subject: [PATCH 10/27] Move the evalGemm method into the
 TensorContractionEvaluatorBase class to make it accessible from both the
 single and multithreaded contraction evaluators.

---
 .../CXX11/src/Tensor/TensorContraction.h      | 188 +++++++++---------
 1 file changed, 94 insertions(+), 94 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
index 2671f4357..97182258d 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
@@ -426,6 +426,99 @@ struct TensorContractionEvaluatorBase
         buffer, resIncr, alpha);
   }
 
+  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
+  EIGEN_DEVICE_FUNC void evalGemm(Scalar* buffer) const {
+    // columns in left side, rows in right side
+    const Index k = this->m_k_size;
+
+    // rows in left side
+    const Index m = this->m_i_size;
+
+    // columns in right side
+    const Index n = this->m_j_size;
+
+    // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar))
+    this->m_device.memset(buffer, 0, m * n * sizeof(Scalar));
+
+    // define mr, nr, and all of my data mapper types
+    typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
+    typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;
+    typedef typename internal::gebp_traits<LhsScalar, RhsScalar> Traits;
+
+    const Index nr = Traits::nr;
+    const Index mr = Traits::mr;
+
+    typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
+    typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
+
+    const Index lhs_packet_size = internal::unpacket_traits<typename LeftEvaluator::PacketReturnType>::size;
+    const Index rhs_packet_size = internal::unpacket_traits<typename RightEvaluator::PacketReturnType>::size;
+
+    typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs,
+                                                   LeftEvaluator, left_nocontract_t,
+                                                   contract_t, lhs_packet_size,
+                                                   lhs_inner_dim_contiguous,
+                                                   false, Unaligned> LhsMapper;
+
+    typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs,
+                                                   RightEvaluator, right_nocontract_t,
+                                                   contract_t, rhs_packet_size,
+                                                   rhs_inner_dim_contiguous,
+                                                   rhs_inner_dim_reordered, Unaligned> RhsMapper;
+
+    typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
+
+    // Declare GEBP packing and kernel structs
+    internal::gemm_pack_lhs<LhsScalar, Index, typename LhsMapper::SubMapper, mr, Traits::LhsProgress, ColMajor> pack_lhs;
+    internal::gemm_pack_rhs<RhsScalar, Index, typename RhsMapper::SubMapper, nr, ColMajor> pack_rhs;
+
+    internal::gebp_kernel<LhsScalar, RhsScalar, Index, OutputMapper, mr, nr, false, false> gebp;
+
+    // initialize data mappers
+    LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides,
+                  this->m_left_contracting_strides, this->m_k_strides);
+
+    RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides,
+                  this->m_right_contracting_strides, this->m_k_strides);
+
+    OutputMapper output(buffer, m);
+
+    // Sizes of the blocks to load in cache. See the Goto paper for details.
+    internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index, internal::ShardByCol> blocking(k, m, n, 1);
+    const Index kc = blocking.kc();
+    const Index mc = numext::mini(m, blocking.mc());
+    const Index nc = numext::mini(n, blocking.nc());
+    const Index sizeA = mc * kc;
+    const Index sizeB = kc * nc;
+
+    LhsScalar* blockA = static_cast<LhsScalar *>(this->m_device.allocate(sizeA * sizeof(LhsScalar)));
+    RhsScalar* blockB = static_cast<RhsScalar *>(this->m_device.allocate(sizeB * sizeof(RhsScalar)));
+
+    for(Index i2=0; i2<m; i2+=mc)
+    {
+      const Index actual_mc = numext::mini(i2+mc,m)-i2;
+      for (Index k2 = 0; k2 < k; k2 += kc) {
+        // make sure we don't overshoot right edge of left matrix, then pack vertical panel
+        const Index actual_kc = numext::mini(k2 + kc, k) - k2;
+        pack_lhs(blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc, 0, 0);
+
+        // series of horizontal blocks
+        for (Index j2 = 0; j2 < n; j2 += nc) {
+          // make sure we don't overshoot right edge of right matrix, then pack block
+          const Index actual_nc = numext::mini(j2 + nc, n) - j2;
+          pack_rhs(blockB, rhs.getSubMapper(k2, j2), actual_kc, actual_nc, 0, 0);
+
+          // call gebp (matrix kernel)
+          // The parameters here are copied from Eigen's GEMM implementation
+          gebp(output.getSubMapper(i2, j2), blockA, blockB, actual_mc, actual_kc, actual_nc, 1.0, -1, -1, 0, 0);
+        }
+      }
+    }
+
+    this->m_device.deallocate(blockA);
+    this->m_device.deallocate(blockB);
+  }
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
     m_leftImpl.cleanup();
     m_rightImpl.cleanup();
@@ -533,100 +626,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
       return;
     }
 
-    evalGemm<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer);
-  }
-
-  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
-  EIGEN_DEVICE_FUNC void evalGemm(Scalar* buffer) const {
-    // columns in left side, rows in right side
-    const Index k = this->m_k_size;
-
-    // rows in left side
-    const Index m = this->m_i_size;
-
-    // columns in right side
-    const Index n = this->m_j_size;
-
-    // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar)
-    this->m_device.memset(buffer, 0, m * n * sizeof(Scalar));
-
-    // define mr, nr, and all of my data mapper types
-    typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
-    typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;
-    typedef typename internal::gebp_traits<LhsScalar, RhsScalar> Traits;
-
-    const Index nr = Traits::nr;
-    const Index mr = Traits::mr;
-
-    typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
-    typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
-
-    const Index lhs_packet_size = internal::unpacket_traits<typename LeftEvaluator::PacketReturnType>::size;
-    const Index rhs_packet_size = internal::unpacket_traits<typename RightEvaluator::PacketReturnType>::size;
-
-    typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs,
-                                                   LeftEvaluator, left_nocontract_t,
-                                                   contract_t, lhs_packet_size,
-                                                   lhs_inner_dim_contiguous,
-                                                   false, Unaligned> LhsMapper;
-
-    typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs,
-                                                   RightEvaluator, right_nocontract_t,
-                                                   contract_t, rhs_packet_size,
-                                                   rhs_inner_dim_contiguous,
-                                                   rhs_inner_dim_reordered, Unaligned> RhsMapper;
-
-    typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
-
-    // Declare GEBP packing and kernel structs
-    internal::gemm_pack_lhs<LhsScalar, Index, typename LhsMapper::SubMapper, mr, Traits::LhsProgress, ColMajor> pack_lhs;
-    internal::gemm_pack_rhs<RhsScalar, Index, typename RhsMapper::SubMapper, nr, ColMajor> pack_rhs;
-
-    internal::gebp_kernel<LhsScalar, RhsScalar, Index, OutputMapper, mr, nr, false, false> gebp;
-
-    // initialize data mappers
-    LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides,
-                  this->m_left_contracting_strides, this->m_k_strides);
-
-    RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides,
-                  this->m_right_contracting_strides, this->m_k_strides);
-
-    OutputMapper output(buffer, m);
-
-    // Sizes of the blocks to load in cache. See the Goto paper for details.
-    internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index, internal::ShardByCol> blocking(k, m, n, 1);
-    const Index kc = blocking.kc();
-    const Index mc = numext::mini(m, blocking.mc());
-    const Index nc = numext::mini(n, blocking.nc());
-    const Index sizeA = mc * kc;
-    const Index sizeB = kc * nc;
-
-    LhsScalar* blockA = static_cast<LhsScalar *>(this->m_device.allocate(sizeA * sizeof(LhsScalar)));
-    RhsScalar* blockB = static_cast<RhsScalar *>(this->m_device.allocate(sizeB * sizeof(RhsScalar)));
-
-    for(Index i2=0; i2<m; i2+=mc)
-    {
-      const Index actual_mc = numext::mini(i2+mc,m)-i2;
-      for (Index k2 = 0; k2 < k; k2 += kc) {
-        // make sure we don't overshoot right edge of left matrix, then pack vertical panel
-        const Index actual_kc = numext::mini(k2 + kc, k) - k2;
-        pack_lhs(blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc, 0, 0);
-
-        // series of horizontal blocks
-        for (Index j2 = 0; j2 < n; j2 += nc) {
-          // make sure we don't overshoot right edge of right matrix, then pack block
-          const Index actual_nc = numext::mini(j2 + nc, n) - j2;
-          pack_rhs(blockB, rhs.getSubMapper(k2, j2), actual_kc, actual_nc, 0, 0);
-
-          // call gebp (matrix kernel)
-          // The parameters here are copied from Eigen's GEMM implementation
-          gebp(output.getSubMapper(i2, j2), blockA, blockB, actual_mc, actual_kc, actual_nc, 1.0, -1, -1, 0, 0);
-        }
-      }
-    }
-
-    this->m_device.deallocate(blockA);
-    this->m_device.deallocate(blockB);
+    this->template evalGemm<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer);
   }
 };
 

From e4fe611e2c9cf7b2853016c39898a224b3ec51a3 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Sat, 16 Apr 2016 15:17:39 +0200
Subject: [PATCH 11/27] Enable lazy-coeff-based-product for vector*(1x1)
 products

---
 Eigen/src/Core/GeneralProduct.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Eigen/src/Core/GeneralProduct.h b/Eigen/src/Core/GeneralProduct.h
index 53f934999..f7c5f4276 100644
--- a/Eigen/src/Core/GeneralProduct.h
+++ b/Eigen/src/Core/GeneralProduct.h
@@ -81,6 +81,8 @@ public:
  * This is a compile time mapping from {1,Small,Large}^3 -> {product types} */
 // FIXME I'm not sure the current mapping is the ideal one.
 template<int M, int N>  struct product_type_selector<M,N,1>              { enum { ret = OuterProduct }; };
+template<int M>         struct product_type_selector<M, 1, 1>            { enum { ret = LazyCoeffBasedProductMode }; };
+template<int N>         struct product_type_selector<1, N, 1>            { enum { ret = LazyCoeffBasedProductMode }; };
 template<int Depth>     struct product_type_selector<1,    1,    Depth>  { enum { ret = InnerProduct }; };
 template<>              struct product_type_selector<1,    1,    1>      { enum { ret = InnerProduct }; };
 template<>              struct product_type_selector<Small,1,    Small>  { enum { ret = CoeffBasedProductMode }; };

From 5fbcfe5eb45a946230c06b7a7cac2e6ec6088457 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Sun, 17 Apr 2016 18:42:31 -0700
Subject: [PATCH 12/27] Worked around the lack of a rand_r function on windows
 systems

---
 unsupported/test/cxx11_runqueue.cpp | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/unsupported/test/cxx11_runqueue.cpp b/unsupported/test/cxx11_runqueue.cpp
index b1e2dbd6d..f8552b76d 100644
--- a/unsupported/test/cxx11_runqueue.cpp
+++ b/unsupported/test/cxx11_runqueue.cpp
@@ -13,6 +13,15 @@
 #include "main.h"
 #include <Eigen/CXX11/ThreadPool>
 
+
+#ifdef EIGEN_COMP_MSVC_STRICT
+// Visual studio doesn't implementan rand_r() function since its
+// implementation of rand()is already thread safe
+int rand_r(unsigned int*) {
+  return rand();
+}
+#endif
+
 void test_basic_runqueue()
 {
   RunQueue<int, 4> q;

From 84543c8be2076b747bbc730a760e7693ab6caae4 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Sun, 17 Apr 2016 19:29:27 -0700
Subject: [PATCH 13/27] Worked around the lack of a rand_r function on windows
 systems

---
 unsupported/test/cxx11_eventcount.cpp | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/unsupported/test/cxx11_eventcount.cpp b/unsupported/test/cxx11_eventcount.cpp
index ebd019e2d..59039dae9 100644
--- a/unsupported/test/cxx11_eventcount.cpp
+++ b/unsupported/test/cxx11_eventcount.cpp
@@ -12,6 +12,14 @@
 #include "main.h"
 #include <Eigen/CXX11/ThreadPool>
 
+#ifdef EIGEN_COMP_MSVC_STRICT
+// Visual studio doesn't implementan rand_r() function since its
+// implementation of rand()is already thread safe
+int rand_r(unsigned int*) {
+  return rand();
+}
+#endif
+
 static void test_basic_eventcount()
 {
   std::vector<EventCount::Waiter> waiters(1);

From 50968a0a3ed2686b25f6df1687f4cf7fc6b66da1 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Tue, 19 Apr 2016 11:53:58 -0700
Subject: [PATCH 14/27] Use DenseIndex in the MeanReducer to avoid overflows
 when processing very large tensors.

---
 unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
index 44dc2d730..33cd00391 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
@@ -158,8 +158,8 @@ template <typename T> struct MeanReducer
   }
 
   protected:
-    int scalarCount_;
-    int packetCount_;
+    DenseIndex scalarCount_;
+    DenseIndex packetCount_;
 };
 
 template <typename T> struct MaxReducer

From f953c607058efd7c3508e888ac4bf2c526336b87 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Tue, 19 Apr 2016 12:57:39 -0700
Subject: [PATCH 15/27] Fixed 2 recent regression tests

---
 unsupported/test/cxx11_eventcount.cpp | 25 ++++++++++++++-----------
 unsupported/test/cxx11_runqueue.cpp   | 27 +++++++++++++++------------
 2 files changed, 29 insertions(+), 23 deletions(-)

diff --git a/unsupported/test/cxx11_eventcount.cpp b/unsupported/test/cxx11_eventcount.cpp
index 59039dae9..898c4c278 100644
--- a/unsupported/test/cxx11_eventcount.cpp
+++ b/unsupported/test/cxx11_eventcount.cpp
@@ -12,11 +12,13 @@
 #include "main.h"
 #include <Eigen/CXX11/ThreadPool>
 
+// Visual studio doesn't implement a rand_r() function since its
+// implementation of rand() is already thread safe
+int rand_reentrant(unsigned int* s) {
 #ifdef EIGEN_COMP_MSVC_STRICT
-// Visual studio doesn't implementan rand_r() function since its
-// implementation of rand()is already thread safe
-int rand_r(unsigned int*) {
   return rand();
+#else
+  return rand_r(s);
+#endif
 }
-#endif
 
@@ -85,15 +88,15 @@ static void test_stress_eventcount()
   std::vector<std::unique_ptr<std::thread>> producers;
   for (int i = 0; i < kThreads; i++) {
     producers.emplace_back(new std::thread([&ec, &queues]() {
-      unsigned rnd = std::hash<std::thread::id>()(std::this_thread::get_id());
-      for (int i = 0; i < kEvents; i++) {
-        unsigned idx = rand_r(&rnd) % kQueues;
+      unsigned int rnd = static_cast<unsigned int>(std::hash<std::thread::id>()(std::this_thread::get_id()));
+      for (int j = 0; j < kEvents; j++) {
+        unsigned idx = rand_reentrant(&rnd) % kQueues;
         if (queues[idx].Push()) {
           ec.Notify(false);
           continue;
         }
         std::this_thread::yield();
-        i--;
+        j--;
       }
     }));
   }
@@ -102,11 +105,11 @@ static void test_stress_eventcount()
   for (int i = 0; i < kThreads; i++) {
     consumers.emplace_back(new std::thread([&ec, &queues, &waiters, i]() {
       EventCount::Waiter& w = waiters[i];
-      unsigned rnd = std::hash<std::thread::id>()(std::this_thread::get_id());
-      for (int i = 0; i < kEvents; i++) {
-        unsigned idx = rand_r(&rnd) % kQueues;
+      unsigned int rnd = static_cast<unsigned int>(std::hash<std::thread::id>()(std::this_thread::get_id()));
+      for (int j = 0; j < kEvents; j++) {
+        unsigned idx = rand_reentrant(&rnd) % kQueues;
         if (queues[idx].Pop()) continue;
-        i--;
+        j--;
         ec.Prewait(&w);
         bool empty = true;
         for (int q = 0; q < kQueues; q++) {
diff --git a/unsupported/test/cxx11_runqueue.cpp b/unsupported/test/cxx11_runqueue.cpp
index f8552b76d..5cfb38b2e 100644
--- a/unsupported/test/cxx11_runqueue.cpp
+++ b/unsupported/test/cxx11_runqueue.cpp
@@ -14,11 +14,13 @@
 #include <Eigen/CXX11/ThreadPool>
 
 
+// Visual studio doesn't implement a rand_r() function since its
+// implementation of rand() is already thread safe
+int rand_reentrant(unsigned int* s) {
 #ifdef EIGEN_COMP_MSVC_STRICT
-// Visual studio doesn't implementan rand_r() function since its
-// implementation of rand()is already thread safe
-int rand_r(unsigned int*) {
   return rand();
+#else
+  return rand_r(s);
+#endif
 }
-#endif
 
@@ -115,11 +118,11 @@ void test_empty_runqueue()
     unsigned rnd = 0;
     std::vector<int> stolen;
     for (int i = 0; i < 1 << 18; i++) {
-      if (rand_r(&rnd) % 2)
+      if (rand_reentrant(&rnd) % 2)
         VERIFY_IS_EQUAL(0, q.PushFront(1));
       else
         VERIFY_IS_EQUAL(0, q.PushBack(1));
-      if (rand_r(&rnd) % 2)
+      if (rand_reentrant(&rnd) % 2)
         VERIFY_IS_EQUAL(1, q.PopFront());
       else {
         for (;;) {
@@ -176,30 +179,30 @@ void test_stress_runqueue()
   for (int i = 0; i < 2; i++) {
     threads.emplace_back(new std::thread([&q, &total]() {
       int sum = 0;
-      for (int i = 1; i < kEvents; i++) {
-        if (q.PushBack(i) == 0) {
-          sum += i;
+      for (int j = 1; j < kEvents; j++) {
+        if (q.PushBack(j) == 0) {
+          sum += j;
           continue;
         }
         std::this_thread::yield();
-        i--;
+        j--;
       }
       total += sum;
     }));
     threads.emplace_back(new std::thread([&q, &total]() {
       int sum = 0;
       std::vector<int> stolen;
-      for (int i = 1; i < kEvents;) {
+      for (int j = 1; j < kEvents;) {
         if (q.PopBackHalf(&stolen) == 0) {
           std::this_thread::yield();
           continue;
         }
-        while (stolen.size() && i < kEvents) {
+        while (stolen.size() && j < kEvents) {
           int v = stolen.back();
           stolen.pop_back();
           VERIFY_IS_NOT_EQUAL(v, 0);
           sum += v;
-          i++;
+          j++;
         }
       }
       while (stolen.size()) {

From a278414d1b1220d56f574a80027955c55542cd95 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Tue, 19 Apr 2016 13:54:28 -0700
Subject: [PATCH 16/27] Avoid an unnecessary copy of the evaluator.

---
 unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index 7a54f7a23..907da9446 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -183,7 +183,7 @@ class TensorExecutor<Expression, GpuDevice, Vectorizable> {
 template <typename Evaluator, typename Index, bool Vectorizable>
 struct EigenMetaKernelEval {
   static __device__ EIGEN_ALWAYS_INLINE
-  void run(Evaluator eval, Index first, Index last, Index step_size) {
+  void run(Evaluator& eval, Index first, Index last, Index step_size) {
     for (Index i = first; i < last; i += step_size) {
       eval.evalScalar(i);
     }

From 884c07505868c0167467c5c3de207724b24f12ab Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Tue, 19 Apr 2016 14:33:30 -0700
Subject: [PATCH 17/27] Use numext::ceil instead of std::ceil

---
 unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
index c33d54d6e..02193f263 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
@@ -136,7 +136,7 @@ struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
     const Index num_coeffs = array_prod(self.m_impl.dimensions());
     const int block_size = 256;
     const int num_per_thread = 128;
-    const int num_blocks = std::ceil(static_cast<float>(num_coeffs) / (block_size * num_per_thread));
+    const int num_blocks = numext::ceil(static_cast<float>(num_coeffs) / (block_size * num_per_thread));
 
     if (num_blocks > 1) {
       // We initialize the outputs outside the reduction kernel when we can't be sure that there

From b9ea40c30d1d32d0f31b047aa681c384fd1a2c98 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Tue, 19 Apr 2016 14:35:11 -0700
Subject: [PATCH 18/27] Don't take the address of a kernel on CUDA devices that
 don't support this feature.

---
 unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
index 821835cf3..8e7f5dddb 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
@@ -291,14 +291,17 @@ struct GpuDevice {
   int max_blocks_;
 };
 
-#ifndef __CUDA_ARCH__
+#if !defined(__CUDA_ARCH__)
 #define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...)             \
   (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__);   \
   assert(cudaGetLastError() == cudaSuccess);
-#else
+#elif __CUDA_ARCH__ >= 350
 #define LAUNCH_CUDA_KERNEL(kernel, ...)                                                     \
   { const auto __attribute__((__unused__)) __makeTheKernelInstantiate = &(kernel); }        \
-  eigen_assert(false && "Cannot launch a kernel from another kernel" __CUDA_ARCH__);
+  eigen_assert(false && "Cannot launch a kernel from another kernel" __CUDA_ARCH__ kernel);
+#else
+#define LAUNCH_CUDA_KERNEL(kernel, ...)                                                     \
+  eigen_assert(false && "Cannot launch a kernel from another kernel" __CUDA_ARCH__ kernel);
 #endif
 
 

From 7129d998db0a8dd74125ad7081f3d220cbce96f0 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Tue, 19 Apr 2016 14:55:21 -0700
Subject: [PATCH 19/27] Simplified the code that launches cuda kernels.

---
 .../Eigen/CXX11/src/Tensor/TensorDeviceCuda.h        |  9 ---------
 unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h  |  2 +-
 .../Eigen/CXX11/src/Tensor/TensorReductionCuda.h     | 12 ++++++------
 3 files changed, 7 insertions(+), 16 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
index 8e7f5dddb..1d2d162dc 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
@@ -291,18 +291,9 @@ struct GpuDevice {
   int max_blocks_;
 };
 
-#if !defined(__CUDA_ARCH__)
 #define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...)             \
   (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__);   \
   assert(cudaGetLastError() == cudaSuccess);
-#elif __CUDA_ARCH__ >= 350
-#define LAUNCH_CUDA_KERNEL(kernel, ...)                                                     \
-  { const auto __attribute__((__unused__)) __makeTheKernelInstantiate = &(kernel); }        \
-  eigen_assert(false && "Cannot launch a kernel from another kernel" __CUDA_ARCH__ kernel);
-#else
-#define LAUNCH_CUDA_KERNEL(kernel, ...)                                                     \
-  eigen_assert(false && "Cannot launch a kernel from another kernel" __CUDA_ARCH__ kernel);
-#endif
 
 
 // FIXME: Should be device and kernel specific.
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index 907da9446..bf6e10a7b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -193,7 +193,7 @@ struct EigenMetaKernelEval {
 template <typename Evaluator, typename Index>
 struct EigenMetaKernelEval<Evaluator, Index, true> {
   static __device__ EIGEN_ALWAYS_INLINE
-  void run(Evaluator eval, Index first, Index last, Index step_size) {
+  void run(Evaluator& eval, Index first, Index last, Index step_size) {
     const Index PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
     const Index vectorized_size = (last / PacketSize) * PacketSize;
     const Index vectorized_step_size = step_size * PacketSize;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
index 02193f263..d80436326 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
@@ -126,11 +126,11 @@ struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
                                                  internal::is_same<typename Self::CoeffReturnType, float>::value;
 
   template <typename OutputType>
-  static EIGEN_DEVICE_FUNC void run(const Self&, Op&, const GpuDevice&, OutputType*) {
+  static void run(const Self&, Op&, const GpuDevice&, OutputType*) {
     assert(false && "Should only be called on floats");
   }
 
-  static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const GpuDevice& device, float* output) {
+  static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output) {
     typedef typename Self::Index Index;
 
     const Index num_coeffs = array_prod(self.m_impl.dimensions());
@@ -226,12 +226,12 @@ struct InnerReducer<Self, Op, GpuDevice> {
                                                  internal::is_same<typename Self::CoeffReturnType, float>::value;
 
   template <typename Device, typename OutputType>
-  static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) {
+  static bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) {
     assert(false && "Should only be called to reduce floats on a gpu device");
     return true;
   }
 
-  static EIGEN_DEVICE_FUNC bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
+  static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
     typedef typename Self::Index Index;
 
     // It's faster to use the usual code.
@@ -305,12 +305,12 @@ struct OuterReducer<Self, Op, GpuDevice> {
                                                  internal::is_same<typename Self::CoeffReturnType, float>::value;
 
   template <typename Device, typename OutputType>
-  static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) {
+  static bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) {
     assert(false && "Should only be called to reduce floats on a gpu device");
     return true;
   }
 
-  static EIGEN_DEVICE_FUNC bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
+  static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
     typedef typename Self::Index Index;
 
     // It's faster to use the usual code.

From 5b1106c56b64e4691c2849184d416631c689348a Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Tue, 19 Apr 2016 14:57:57 -0700
Subject: [PATCH 20/27] Fixed a compilation error with nvcc 7.

---
 unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
index d80436326..afa5a257a 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
@@ -126,7 +126,7 @@ struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
                                                  internal::is_same<typename Self::CoeffReturnType, float>::value;
 
   template <typename OutputType>
-  static void run(const Self&, Op&, const GpuDevice&, OutputType*) {
+  static EIGEN_DEVICE_FUNC void run(const Self&, Op&, const GpuDevice&, OutputType*) {
     assert(false && "Should only be called on floats");
   }
 
@@ -226,7 +226,7 @@ struct InnerReducer<Self, Op, GpuDevice> {
                                                  internal::is_same<typename Self::CoeffReturnType, float>::value;
 
   template <typename Device, typename OutputType>
-  static bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) {
+  static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) {
     assert(false && "Should only be called to reduce floats on a gpu device");
     return true;
   }
@@ -305,7 +305,7 @@ struct OuterReducer<Self, Op, GpuDevice> {
                                                  internal::is_same<typename Self::CoeffReturnType, float>::value;
 
   template <typename Device, typename OutputType>
-  static bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) {
+  static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) {
     assert(false && "Should only be called to reduce floats on a gpu device");
     return true;
   }

From 04f954956d407fbf6cf7c2be3a993460aa608177 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Tue, 19 Apr 2016 15:27:09 -0700
Subject: [PATCH 21/27] Fixed a few typos

---
 unsupported/test/cxx11_eventcount.cpp | 6 +++---
 unsupported/test/cxx11_runqueue.cpp   | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/unsupported/test/cxx11_eventcount.cpp b/unsupported/test/cxx11_eventcount.cpp
index 898c4c278..2f250338c 100644
--- a/unsupported/test/cxx11_eventcount.cpp
+++ b/unsupported/test/cxx11_eventcount.cpp
@@ -16,12 +16,12 @@
 // implementation of rand() is already thread safe
 int rand_reentrant(unsigned int* s) {
 #ifdef EIGEN_COMP_MSVC_STRICT
+  EIGEN_UNUSED_VARIABLE(s);
   return rand();
 #else
   return rand_r(s);
-endif
-}
 #endif
+}
 
 static void test_basic_eventcount()
 {
@@ -106,7 +106,7 @@ static void test_stress_eventcount()
     consumers.emplace_back(new std::thread([&ec, &queues, &waiters, i]() {
       EventCount::Waiter& w = waiters[i];
       unsigned int rnd = static_cast<unsigned int>(std::hash<std::thread::id>()(std::this_thread::get_id()));
-      for (int j = 0; j < kEvents; k++) {
+      for (int j = 0; j < kEvents; j++) {
         unsigned idx = rand_reentrant(&rnd) % kQueues;
         if (queues[idx].Pop()) continue;
         j--;
diff --git a/unsupported/test/cxx11_runqueue.cpp b/unsupported/test/cxx11_runqueue.cpp
index 5cfb38b2e..4207824bf 100644
--- a/unsupported/test/cxx11_runqueue.cpp
+++ b/unsupported/test/cxx11_runqueue.cpp
@@ -18,12 +18,12 @@
 // implementation of rand() is already thread safe
 int rand_reentrant(unsigned int* s) {
 #ifdef EIGEN_COMP_MSVC_STRICT
+  EIGEN_UNUSED_VARIABLE(s);
   return rand();
 #else
   return rand_r(s);
-endif
-}
 #endif
+}
 
 void test_basic_runqueue()
 {

From 2b7216302864715eb6950448fc71b0e4b51d617c Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Tue, 19 Apr 2016 15:56:02 -0700
Subject: [PATCH 22/27] Implemented a more portable version of thread local
 variables

---
 unsupported/Eigen/CXX11/ThreadPool            |  1 +
 .../src/ThreadPool/NonBlockingThreadPool.h    |  2 +-
 .../Eigen/CXX11/src/ThreadPool/ThreadLocal.h  | 22 +++++++++++++++++++
 3 files changed, 24 insertions(+), 1 deletion(-)
 create mode 100644 unsupported/Eigen/CXX11/src/ThreadPool/ThreadLocal.h

diff --git a/unsupported/Eigen/CXX11/ThreadPool b/unsupported/Eigen/CXX11/ThreadPool
index 6f360ae2f..6cd057b4b 100644
--- a/unsupported/Eigen/CXX11/ThreadPool
+++ b/unsupported/Eigen/CXX11/ThreadPool
@@ -47,6 +47,7 @@
 
 #include "src/ThreadPool/EventCount.h"
 #include "src/ThreadPool/RunQueue.h"
+#include "src/ThreadPool/ThreadLocal.h"
 #include "src/ThreadPool/ThreadPoolInterface.h"
 #include "src/ThreadPool/ThreadEnvironment.h"
 #include "src/ThreadPool/SimpleThreadPool.h"
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h
index 18dec5393..1c471a19f 100644
--- a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h
@@ -212,7 +212,7 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
   }
 
   PerThread* GetPerThread() {
-    static thread_local PerThread per_thread_;
+    EIGEN_THREAD_LOCAL PerThread per_thread_;
     PerThread* pt = &per_thread_;
     if (pt->inited) return pt;
     pt->inited = true;
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/ThreadLocal.h b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadLocal.h
new file mode 100644
index 000000000..cfa221732
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadLocal.h
@@ -0,0 +1,22 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_THREADPOOL_THREAD_LOCAL_H
+#define EIGEN_CXX11_THREADPOOL_THREAD_LOCAL_H
+
+// EIGEN_THREAD_LOCAL: specifier for a static thread-local variable; `static __thread` on GCC at most 4.7 and on clang, `static thread_local` otherwise.
+#if EIGEN_COMP_GNUC && EIGEN_GNUC_AT_MOST(4, 7)
+#define EIGEN_THREAD_LOCAL static __thread
+#elif EIGEN_COMP_CLANG
+#define EIGEN_THREAD_LOCAL static __thread
+#else
+#define EIGEN_THREAD_LOCAL static thread_local
+#endif
+
+#endif  // EIGEN_CXX11_THREADPOOL_THREAD_LOCAL_H

From 1d0238375d53e168eda40139979f8d128e72f4e8 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Tue, 19 Apr 2016 17:44:12 -0700
Subject: [PATCH 23/27] Made sure all the required header files are included
 when trying to use fp16

---
 Eigen/Core | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/Eigen/Core b/Eigen/Core
index c7192b037..ac58f10c3 100644
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -214,10 +214,14 @@
   #include <vector_types.h>
   #if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
     #define EIGEN_HAS_CUDA_FP16
-    #include <cuda_fp16.h>
   #endif
 #endif
 
+#if defined EIGEN_HAS_CUDA_FP16
+  #include <host_defines.h>
+  #include <cuda_fp16.h>
+#endif
+
 #if (defined _OPENMP) && (!defined EIGEN_DONT_PARALLELIZE)
   #define EIGEN_HAS_OPENMP
 #endif

From c7c2054bb519ad01039560254f9dda8698cb0cd9 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Tue, 19 Apr 2016 17:59:58 -0700
Subject: [PATCH 24/27] Started to implement a portable way to yield.

---
 unsupported/Eigen/CXX11/ThreadPool            |  3 ++-
 .../Eigen/CXX11/src/ThreadPool/EventCount.h   |  4 ++--
 .../Eigen/CXX11/src/ThreadPool/ThreadYield.h  | 20 +++++++++++++++++++
 3 files changed, 24 insertions(+), 3 deletions(-)
 create mode 100644 unsupported/Eigen/CXX11/src/ThreadPool/ThreadYield.h

diff --git a/unsupported/Eigen/CXX11/ThreadPool b/unsupported/Eigen/CXX11/ThreadPool
index 6cd057b4b..fe00a0b65 100644
--- a/unsupported/Eigen/CXX11/ThreadPool
+++ b/unsupported/Eigen/CXX11/ThreadPool
@@ -45,9 +45,10 @@
 #include <functional>
 #include <memory>
 
+#include "src/ThreadPool/ThreadLocal.h"
+#include "src/ThreadPool/ThreadYield.h"
 #include "src/ThreadPool/EventCount.h"
 #include "src/ThreadPool/RunQueue.h"
-#include "src/ThreadPool/ThreadLocal.h"
 #include "src/ThreadPool/ThreadPoolInterface.h"
 #include "src/ThreadPool/ThreadEnvironment.h"
 #include "src/ThreadPool/SimpleThreadPool.h"
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h b/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h
index 16eee1a41..6dd64f185 100644
--- a/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h
@@ -81,7 +81,7 @@ class EventCount {
       if (int64_t((state & kEpochMask) - epoch) < 0) {
         // The preceeding waiter has not decided on its fate. Wait until it
         // calls either CancelWait or CommitWait, or is notified.
-        std::this_thread::yield();
+        EIGEN_THREAD_YIELD();
         state = state_.load(std::memory_order_seq_cst);
         continue;
       }
@@ -112,7 +112,7 @@ class EventCount {
       if (int64_t((state & kEpochMask) - epoch) < 0) {
         // The preceeding waiter has not decided on its fate. Wait until it
         // calls either CancelWait or CommitWait, or is notified.
-        std::this_thread::yield();
+        EIGEN_THREAD_YIELD();
         state = state_.load(std::memory_order_relaxed);
         continue;
       }
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/ThreadYield.h b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadYield.h
new file mode 100644
index 000000000..a859c7ba3
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadYield.h
@@ -0,0 +1,20 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_THREADPOOL_THREAD_YIELD_H
+#define EIGEN_CXX11_THREADPOOL_THREAD_YIELD_H
+
+// EIGEN_THREAD_YIELD(): yield the calling thread. NOTE(review): the sched_yield() branch needs <sched.h>, which this header does not include.
+#if EIGEN_COMP_GNUC && EIGEN_GNUC_AT_MOST(4, 7)
+#define EIGEN_THREAD_YIELD() sched_yield()
+#else
+#define EIGEN_THREAD_YIELD() std::this_thread::yield()
+#endif
+
+#endif  // EIGEN_CXX11_THREADPOOL_THREAD_YIELD_H

From 80200a182862bf7c19b51645ab8e7b9e15b65a90 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Wed, 20 Apr 2016 12:10:27 -0700
Subject: [PATCH 25/27] Don't attempt to leverage the _cvtss_sh and _cvtsh_ss
 instructions when compiling with clang since it's unclear which versions of
 clang actually support these instructions.

---
 Eigen/Core | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Eigen/Core b/Eigen/Core
index ac58f10c3..50040135f 100644
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -204,7 +204,7 @@
   #endif
 #endif
 
-#if defined(__F16C__)
+#if defined(__F16C__) && !EIGEN_COMP_CLANG
   // We can use the optimized fp16 to float and float to fp16 conversion routines
   #define EIGEN_HAS_FP16_C
 #endif

From a792cd357d31f0a4fce62ed1fa4cc0334cf2f143 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Wed, 20 Apr 2016 17:33:58 -0700
Subject: [PATCH 26/27] Added more tests

---
 unsupported/test/cxx11_tensor_fixed_size.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/unsupported/test/cxx11_tensor_fixed_size.cpp b/unsupported/test/cxx11_tensor_fixed_size.cpp
index 1c33fefb3..5fe164859 100644
--- a/unsupported/test/cxx11_tensor_fixed_size.cpp
+++ b/unsupported/test/cxx11_tensor_fixed_size.cpp
@@ -20,6 +20,8 @@ static void test_0d()
   TensorFixedSize<float, Sizes<> > scalar1;
   TensorFixedSize<float, Sizes<>, RowMajor> scalar2;
   VERIFY_IS_EQUAL(scalar1.rank(), 0);
+  VERIFY_IS_EQUAL(scalar1.size(), 1);
+  VERIFY_IS_EQUAL(array_prod(scalar1.dimensions()), 1);
 
   scalar1() = 7.0;
   scalar2() = 13.0;

From 2dde1b102866e1928e925678951463f2a7051af1 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Wed, 20 Apr 2016 18:08:20 -0700
Subject: [PATCH 27/27] Don't crash when attempting to reduce empty tensors.

---
 unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h  |  2 +-
 unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h      | 12 ++++++++++--
 .../Eigen/CXX11/src/Tensor/TensorReductionCuda.h     |  7 ++++++-
 3 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index bf6e10a7b..c3edae477 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -238,7 +238,7 @@ inline void TensorExecutor<Expression, GpuDevice, Vectorizable>::run(
                            device.maxCudaThreadsPerMultiProcessor() / block_size;
     const Index size = array_prod(evaluator.dimensions());
     // Create a least one block to ensure we won't crash when tensorflow calls with tensors of size 0.
-    const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, (size + block_size - 1) / block_size), 1);
+    const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, divup<int>(size, block_size)), 1);
 
     LAUNCH_CUDA_KERNEL(
         (EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, Index>),
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
index 6af2d45d4..cd04716bd 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
@@ -24,9 +24,17 @@ const T2& choose(Cond<false>, const T1&, const T2& second) {
   return second;
 }
 
-template <typename T> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+
+template <typename T, typename X, typename Y>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T divup(const X x, const Y y) {
+  return static_cast<T>((x + y - 1) / y);
+}
+
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 T divup(const T x, const T y) {
-  return (x + y - 1) / y;
+  return static_cast<T>((x + y - 1) / y);
 }
 
 template <size_t n> struct max_n_1 {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
index afa5a257a..fd2587dd5 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
@@ -134,9 +134,14 @@ struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
     typedef typename Self::Index Index;
 
     const Index num_coeffs = array_prod(self.m_impl.dimensions());
+    // Don't crash when we're called with an input tensor of size 0.
+    if (num_coeffs == 0) {
+      return;
+    }
+
     const int block_size = 256;
     const int num_per_thread = 128;
-    const int num_blocks = numext::ceil(static_cast<float>(num_coeffs) / (block_size * num_per_thread));
+    const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
 
     if (num_blocks > 1) {
       // We initialize the outputs outside the reduction kernel when we can't be sure that there