From 07ac4f7e027cddd3457a34295420480f7e541ac5 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Thu, 14 Apr 2016 18:28:23 -0700 Subject: [PATCH 01/27] Eigen Tensor cost model part 2: Thread scheduling for standard evaluators and reductions. The cost model is turned off by default. --- .../Eigen/CXX11/src/Tensor/TensorCostModel.h | 6 +- .../Eigen/CXX11/src/Tensor/TensorEvaluator.h | 5 + .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 91 ++++++++++----- .../Eigen/CXX11/src/Tensor/TensorReduction.h | 104 +++++++----------- 4 files changed, 110 insertions(+), 96 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h index 32bc5d0b2..4e8f86674 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h @@ -10,9 +10,9 @@ #ifndef EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H #define EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H -#if !defined(EIGEN_USE_GPU) -#define EIGEN_USE_COST_MODEL -#endif +//#if !defined(EIGEN_USE_GPU) +//#define EIGEN_USE_COST_MODEL +//#endif namespace Eigen { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index f1f9a90df..293012646 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -189,6 +189,11 @@ struct TensorEvaluator return loadConstant(m_data+index); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, + internal::unpacket_traits::size); + } + EIGEN_DEVICE_FUNC const Scalar* data() const { return m_data; } protected: diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index eabfd91fe..df9cc0998 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -59,9 +59,16 @@ class TensorExecutor { const Index size = array_prod(evaluator.dimensions()); const int PacketSize = unpacket_traits::PacketReturnType>::size; + // Manually unroll this loop since compilers don't do it. + const Index UnrolledSize = (size / (4 * PacketSize)) * 4 * PacketSize; + for (Index i = 0; i < UnrolledSize; i += 4*PacketSize) { + evaluator.evalPacket(i); + evaluator.evalPacket(i+PacketSize); + evaluator.evalPacket(i+2*PacketSize); + evaluator.evalPacket(i+3*PacketSize); + } const Index VectorizedSize = (size / PacketSize) * PacketSize; - - for (Index i = 0; i < VectorizedSize; i += PacketSize) { + for (Index i = UnrolledSize; i < VectorizedSize; i += PacketSize) { evaluator.evalPacket(i); } for (Index i = VectorizedSize; i < size; ++i) { @@ -78,8 +85,9 @@ class TensorExecutor #ifdef EIGEN_USE_THREADS template struct EvalRange { - static void run(Evaluator evaluator, const Index first, const Index last) { - eigen_assert(last > first); + static void run(void* evaluator_in, const Index first, const Index last) { + Evaluator evaluator(*static_cast(evaluator_in)); + eigen_assert(last >= first); for (Index i = first; i < last; ++i) { evaluator.evalScalar(i); } @@ -88,28 +96,45 @@ struct EvalRange { template struct EvalRange { - static void run(Evaluator evaluator, const Index first, const Index last) { - eigen_assert(last > first); + static void run(void* evaluator_in, const Index first, const Index last) { + Evaluator evaluator(*static_cast(evaluator_in)); + eigen_assert(last >= first); Index i = first; - static const int PacketSize = unpacket_traits::size; + const int PacketSize = unpacket_traits::size; if (last - first >= PacketSize) { eigen_assert(first % PacketSize == 0); - Index lastPacket = last - (last % PacketSize); - for (; i < lastPacket; i += PacketSize) { + Index last_chunk_offset = last - 4 * PacketSize; + // Manually unroll this loop since compilers don't do it. + for (; i <= last_chunk_offset; i += 4*PacketSize) { + evaluator.evalPacket(i); + evaluator.evalPacket(i+PacketSize); + evaluator.evalPacket(i+2*PacketSize); + evaluator.evalPacket(i+3*PacketSize); + } + last_chunk_offset = last - PacketSize; + for (; i <= last_chunk_offset; i += PacketSize) { evaluator.evalPacket(i); } } - for (; i < last; ++i) { evaluator.evalScalar(i); } } }; -template -class TensorExecutor -{ +// Used to make an std::function to add to the ThreadPool with less templating +// than EvalRange::Run. +// This requires that this and EvalRange takes a void* to the evaluator that can +// be downcast to the right type by the EvalRange. +template +inline void InvokeEvalRange(void (*run_fn)(void*, const Index, const Index), + void* evaluator, const Index first, const Index last) { + run_fn(evaluator, first, last); +} + +template +class TensorExecutor { public: typedef typename Expression::Index Index; static inline void run(const Expression& expr, const ThreadPoolDevice& device) @@ -119,24 +144,35 @@ class TensorExecutor const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); if (needs_assign) { + const Index PacketSize = Vectorizable ? unpacket_traits::size : 1; const Index size = array_prod(evaluator.dimensions()); - - static const int PacketSize = Vectorizable ? unpacket_traits::size : 1; - - int blocksz = std::ceil(static_cast(size)/device.numThreads()) + PacketSize - 1; - const Index blocksize = numext::maxi(PacketSize, (blocksz - (blocksz % PacketSize))); - const unsigned int numblocks = static_cast(size / blocksize); - - Barrier barrier(numblocks); - for (unsigned int i = 0; i < numblocks; ++i) { - device.enqueue_with_barrier(&barrier, &EvalRange::run, evaluator, i*blocksize, (i+1)*blocksize); + int num_threads = device.numThreads(); +#ifdef EIGEN_USE_COST_MODEL + if (num_threads > 1) { + num_threads = TensorCostModel::numThreads( + size, evaluator.costPerCoeff(Vectorizable), num_threads); } +#endif + if (num_threads == 1) { + EvalRange::run(&evaluator, 0, size); + } else { + Index blocksz = std::ceil(static_cast(size)/num_threads) + PacketSize - 1; + const Index blocksize = numext::maxi(PacketSize, (blocksz - (blocksz % PacketSize))); + const Index numblocks = size / blocksize; - if (static_cast(numblocks) * blocksize < size) { - EvalRange::run(evaluator, numblocks * blocksize, size); + Barrier barrier(numblocks); + for (int i = 0; i < numblocks; ++i) { + device.enqueue_with_barrier( + &barrier, &InvokeEvalRange, + &EvalRange::run, + static_cast(&evaluator), i * blocksize, + (i + 1) * blocksize); + } + if (numblocks * blocksize < size) { + EvalRange::run(&evaluator, numblocks * blocksize, size); + } + barrier.Wait(); } - - barrier.Wait(); } evaluator.cleanup(); } @@ -226,7 +262,6 @@ inline void TensorExecutor::run( #endif // __CUDACC__ #endif // EIGEN_USE_GPU - } // end namespace internal } // end namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 1c9e7ab66..885295f0a 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -214,7 +214,7 @@ struct FullReducer { static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const Device&, typename Self::CoeffReturnType* output) { const typename Self::Index num_coeffs = array_prod(self.m_impl.dimensions()); - *output = InnerMostDimReducer::reduce(self, 0, num_coeffs, reducer); + *output = InnerMostDimReducer::reduce(self, 0, num_coeffs, reducer); } }; @@ -222,18 +222,19 @@ struct FullReducer { #ifdef EIGEN_USE_THREADS // Multithreaded full reducers template + bool Vectorizable = (Self::InputPacketAccess & Op::PacketAccess)> struct FullReducerShard { static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer, typename Self::CoeffReturnType* output) { - *output = InnerMostDimReducer::reduce( + *output = InnerMostDimReducer::reduce( self, firstIndex, numValuesToReduce, reducer); } }; -template -struct FullReducer { +// Multithreaded full reducer +template +struct FullReducer { static const bool HasOptimizedImplementation = !Op::IsStateful; static const int PacketSize = unpacket_traits::size; @@ -247,79 +248,44 @@ struct FullReducer { *output = reducer.finalize(reducer.initialize()); return; } - const std::size_t num_threads = device.numThreads(); +#ifdef EIGEN_USE_COST_MODEL + const TensorOpCost cost = + self.m_impl.costPerCoeff(Vectorizable) + + TensorOpCost(0, 0, internal::functor_traits::Cost, Vectorizable, + PacketSize); + const int num_threads = TensorCostModel::numThreads( + num_coeffs, cost, device.numThreads()); +#else + const int num_threads = device.numThreads(); +#endif if (num_threads == 1) { - *output = InnerMostDimReducer::reduce(self, 0, num_coeffs, reducer); - return; - } else { - const Index blocksize = std::floor(static_cast(num_coeffs) / num_threads); - const unsigned int numblocks = blocksize > 0 ? static_cast(num_coeffs / blocksize) : 0; - eigen_assert(num_coeffs >= static_cast(numblocks) * blocksize); - - Barrier barrier(numblocks); - MaxSizeVector shards(numblocks, reducer.initialize()); - for (unsigned int i = 0; i < numblocks; ++i) { - device.enqueue_with_barrier(&barrier, &FullReducerShard::run, self, - i * blocksize, blocksize, reducer, &shards[i]); - } - - typename Self::CoeffReturnType finalShard; - if (static_cast(numblocks) * blocksize < num_coeffs) { - finalShard = InnerMostDimReducer::reduce( - self, numblocks * blocksize, num_coeffs - numblocks * blocksize, reducer); - } else { - finalShard = reducer.initialize(); - } - barrier.Wait(); - for (unsigned int i = 0; i < numblocks; ++i) { - reducer.reduce(shards[i], &finalShard); - } - *output = reducer.finalize(finalShard); - } - } -}; - -template -struct FullReducer { - static const bool HasOptimizedImplementation = !Op::IsStateful; - static const int PacketSize = - unpacket_traits::size; - - // launch one reducer per thread and accumulate the result. - static void run(const Self& self, Op& reducer, const ThreadPoolDevice& device, - typename Self::CoeffReturnType* output) { - typedef typename Self::Index Index; - const Index num_coeffs = array_prod(self.m_impl.dimensions()); - if (num_coeffs == 0) { - *output = reducer.finalize(reducer.initialize()); + *output = + InnerMostDimReducer::reduce(self, 0, num_coeffs, reducer); return; } - const std::size_t num_threads = device.numThreads(); - if (num_threads == 1) { - *output = InnerMostDimReducer::reduce(self, 0, num_coeffs, reducer); - return; - } - const Index blocksize = std::floor(static_cast(num_coeffs) / num_threads); - const unsigned int numblocks = blocksize > 0 ? static_cast(num_coeffs / blocksize) : 0; - eigen_assert(num_coeffs >= static_cast(numblocks) * blocksize); + const Index blocksize = + std::floor(static_cast(num_coeffs) / num_threads); + const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0; + eigen_assert(num_coeffs >= numblocks * blocksize); Barrier barrier(numblocks); MaxSizeVector shards(numblocks, reducer.initialize()); - for (unsigned int i = 0; i < numblocks; ++i) { - device.enqueue_with_barrier(&barrier, &FullReducerShard::run, + for (Index i = 0; i < numblocks; ++i) { + device.enqueue_with_barrier(&barrier, &FullReducerShard::run, self, i * blocksize, blocksize, reducer, &shards[i]); } typename Self::CoeffReturnType finalShard; - if (static_cast(numblocks) * blocksize < num_coeffs) { - finalShard = InnerMostDimReducer::reduce( - self, numblocks * blocksize, num_coeffs - numblocks * blocksize, reducer); + if (numblocks * blocksize < num_coeffs) { + finalShard = InnerMostDimReducer::reduce( + self, numblocks * blocksize, num_coeffs - numblocks * blocksize, + reducer); } else { finalShard = reducer.initialize(); } - barrier.Wait(); - for (unsigned int i = 0; i < numblocks; ++i) { + + for (Index i = 0; i < numblocks; ++i) { reducer.reduce(shards[i], &finalShard); } *output = reducer.finalize(finalShard); @@ -498,13 +464,21 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + static bool size_large_enough(Index total_size) { +#ifndef EIGEN_USE_COST_MODEL + return total_size > 1024 * 1024; +#else + return true || total_size; +#endif + } + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool evalSubExprsIfNeeded(CoeffReturnType* data) { m_impl.evalSubExprsIfNeeded(NULL); // Use the FullReducer if possible. if (RunningFullReduction && internal::FullReducer::HasOptimizedImplementation && ((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) || - (!RunningOnGPU && (internal::array_prod(m_impl.dimensions()) > 1024 * 1024)))) { + (!RunningOnGPU && size_large_enough(internal::array_prod(m_impl.dimensions()))))) { bool need_assign = false; if (!data) { From 1e80bddde3756ac7cd36a0db5e7d2493a7b93066 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 15 Apr 2016 17:58:36 +0200 Subject: [PATCH 02/27] Fix trmv for mixing types. --- Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h | 2 +- Eigen/src/Core/products/TriangularMatrixVector.h | 4 ++-- test/mixingtypes.cpp | 8 ++++++-- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h index 831089dee..80ba89465 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h @@ -43,7 +43,7 @@ struct general_matrix_matrix_triangular_product::ReturnType ResScalar; static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* lhs, Index lhsStride, const RhsScalar* rhs, Index rhsStride, ResScalar* res, Index resStride, - const ResScalar& alpha, level3_blocking& blocking) + const ResScalar& alpha, level3_blocking& blocking) { general_matrix_matrix_triangular_product EIGEN_DONT_INLINE void triangular_matrix_vector_product ::run(Index _rows, Index _cols, const LhsScalar* _lhs, Index lhsStride, - const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const ResScalar& alpha) + const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const RhsScalar& alpha) { static const Index PanelWidth = EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH; Index size = (std::min)(_rows,_cols); diff --git a/test/mixingtypes.cpp b/test/mixingtypes.cpp index a3b469af8..0b381ec6c 100644 --- a/test/mixingtypes.cpp +++ b/test/mixingtypes.cpp @@ -148,10 +148,14 @@ template void mixingtypes(int size = SizeAtCompileType) VERIFY_IS_APPROX(sd*vd.adjoint()*mcd, sd*vd.adjoint().template cast().eval()*mcd); VERIFY_IS_APPROX(scd*vd.adjoint()*mcd, scd*vd.adjoint().template cast().eval()*mcd); - VERIFY_IS_APPROX(sd*vcd.adjoint()*md.template triangularView(), sd*vcd.adjoint()*md.template cast().eval().template triangularView()); + VERIFY_IS_APPROX( sd*vcd.adjoint()*md.template triangularView(), sd*vcd.adjoint()*md.template cast().eval().template triangularView()); VERIFY_IS_APPROX(scd*vcd.adjoint()*md.template triangularView(), scd*vcd.adjoint()*md.template cast().eval().template triangularView()); - VERIFY_IS_APPROX(sd*vd.adjoint()*mcd.template triangularView(), sd*vd.adjoint().template cast().eval()*mcd.template triangularView()); + VERIFY_IS_APPROX( sd*vcd.adjoint()*md.transpose().template triangularView(), sd*vcd.adjoint()*md.transpose().template cast().eval().template triangularView()); + VERIFY_IS_APPROX(scd*vcd.adjoint()*md.transpose().template triangularView(), scd*vcd.adjoint()*md.transpose().template cast().eval().template triangularView()); + VERIFY_IS_APPROX( sd*vd.adjoint()*mcd.template triangularView(), sd*vd.adjoint().template cast().eval()*mcd.template triangularView()); VERIFY_IS_APPROX(scd*vd.adjoint()*mcd.template triangularView(), scd*vd.adjoint().template cast().eval()*mcd.template triangularView()); + VERIFY_IS_APPROX( sd*vd.adjoint()*mcd.transpose().template triangularView(), sd*vd.adjoint().template cast().eval()*mcd.transpose().template triangularView()); + VERIFY_IS_APPROX(scd*vd.adjoint()*mcd.transpose().template triangularView(), scd*vd.adjoint().template cast().eval()*mcd.transpose().template triangularView()); // Not supported yet: trmm // VERIFY_IS_APPROX(sd*mcd*md.template triangularView(), sd*mcd*md.template cast().eval().template triangularView()); From 1d2343062805edb86113e2aef5ebcbe5030a57a5 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 15 Apr 2016 10:53:31 -0700 Subject: [PATCH 03/27] Improved the matrix multiplication blocking in the case where mr is not a power of 2 (e.g on Haswell CPUs). --- .../Core/products/GeneralBlockPanelKernel.h | 38 +++++++++---------- 1 file changed, 17 insertions(+), 21 deletions(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 267ac1de9..3d35c8d46 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -11,8 +11,8 @@ #define EIGEN_GENERAL_BLOCK_PANEL_H -namespace Eigen { - +namespace Eigen { + namespace internal { template @@ -36,7 +36,7 @@ const std::ptrdiff_t defaultL3CacheSize = 512*1024; #endif /** \internal */ -struct CacheSizes { +struct CacheSizes { CacheSizes(): m_l1(-1),m_l2(-1),m_l3(-1) { int l1CacheSize, l2CacheSize, l3CacheSize; queryCacheSizes(l1CacheSize, l2CacheSize, l3CacheSize); @@ -107,13 +107,9 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n enum { kdiv = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)), ksub = Traits::mr * Traits::nr * sizeof(ResScalar), - k_mask = -8, - + kr = 8, mr = Traits::mr, - mr_mask = -mr, - nr = Traits::nr, - nr_mask = -nr }; // Increasing k gives us more time to prefetch the content of the "C" // registers. However once the latency is hidden there is no point in @@ -121,7 +117,7 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n // experimentally). const Index k_cache = (std::min)((l1-ksub)/kdiv, 320); if (k_cache < k) { - k = k_cache & k_mask; + k = k_cache - (k_cache % kr); eigen_internal_assert(k > 0); } @@ -130,10 +126,10 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n if (n_cache <= n_per_thread) { // Don't exceed the capacity of the l2 cache. eigen_internal_assert(n_cache >= static_cast(nr)); - n = n_cache & nr_mask; + n = n_cache - (n_cache % nr); eigen_internal_assert(n > 0); } else { - n = (std::min)(n, (n_per_thread + nr - 1) & nr_mask); + n = (std::min)(n, (n_per_thread + nr - 1) - ((n_per_thread + nr - 1) % nr)); } if (l3 > l2) { @@ -141,10 +137,10 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n const Index m_cache = (l3-l2) / (sizeof(LhsScalar) * k * num_threads); const Index m_per_thread = numext::div_ceil(m, num_threads); if(m_cache < m_per_thread && m_cache >= static_cast(mr)) { - m = m_cache & mr_mask; + m = m_cache - (m_cache % mr); eigen_internal_assert(m > 0); } else { - m = (std::min)(m, (m_per_thread + mr - 1) & mr_mask); + m = (std::min)(m, (m_per_thread + mr - 1) - ((m_per_thread + mr - 1) % mr)); } } } @@ -156,23 +152,23 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n l2 = 32*1024; l3 = 512*1024; #endif - + // Early return for small problems because the computation below are time consuming for small problems. // Perhaps it would make more sense to consider k*n*m?? // Note that for very tiny problem, this function should be bypassed anyway // because we use the coefficient-based implementation for them. if((std::max)(k,(std::max)(m,n))<48) return; - + typedef typename Traits::ResScalar ResScalar; enum { k_peeling = 8, k_div = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)), k_sub = Traits::mr * Traits::nr * sizeof(ResScalar) }; - + // ---- 1st level of blocking on L1, yields kc ---- - + // Blocking on the third dimension (i.e., k) is chosen so that an horizontal panel // of size mr x kc of the lhs plus a vertical panel of kc x nr of the rhs both fits within L1 cache. // We also include a register-level block of the result (mx x nr). @@ -187,12 +183,12 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n // while keeping the same number of sweeps over the result. k = (k%max_kc)==0 ? max_kc : max_kc - k_peeling * ((max_kc-1-(k%max_kc))/(k_peeling*(k/max_kc+1))); - + eigen_internal_assert(((old_k/k) == (old_k/max_kc)) && "the number of sweeps has to remain the same"); } - + // ---- 2nd level of blocking on max(L2,L3), yields nc ---- - + // TODO find a reliable way to get the actual amount of cache per core to use for 2nd level blocking, that is: // actual_l2 = max(l2, l3/nb_core_sharing_l3) // The number below is quite conservative: it is better to underestimate the cache size rather than overestimating it) @@ -202,7 +198,7 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n #else const Index actual_l2 = 1572864; // == 1.5 MB #endif - + // Here, nc is chosen such that a block of kc x nc of the rhs fit within half of L2. // The second half is implicitly reserved to access the result and lhs coefficients. // When k Date: Fri, 15 Apr 2016 11:27:52 -0700 Subject: [PATCH 04/27] Fixed compilation errors with msvc --- unsupported/test/cxx11_eventcount.cpp | 4 ++-- unsupported/test/cxx11_runqueue.cpp | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/unsupported/test/cxx11_eventcount.cpp b/unsupported/test/cxx11_eventcount.cpp index 271e0f66d..ebd019e2d 100644 --- a/unsupported/test/cxx11_eventcount.cpp +++ b/unsupported/test/cxx11_eventcount.cpp @@ -67,8 +67,8 @@ const int TestQueue::kQueueSize; static void test_stress_eventcount() { const int kThreads = std::thread::hardware_concurrency(); - const int kEvents = 1 << 16; - const int kQueues = 10; + static const int kEvents = 1 << 16; + static const int kQueues = 10; std::vector waiters(kThreads); EventCount ec(waiters); diff --git a/unsupported/test/cxx11_runqueue.cpp b/unsupported/test/cxx11_runqueue.cpp index f1217a01b..b1e2dbd6d 100644 --- a/unsupported/test/cxx11_runqueue.cpp +++ b/unsupported/test/cxx11_runqueue.cpp @@ -9,6 +9,7 @@ // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. #define EIGEN_USE_THREADS +#include #include "main.h" #include @@ -138,7 +139,7 @@ void test_empty_runqueue() // PopBack. Ensure that we don't crash, deadlock, and all sanity checks pass. void test_stress_runqueue() { - const int kEvents = 1 << 18; + static const int kEvents = 1 << 18; RunQueue q; std::atomic total(0); std::vector> threads; From 3718bf654bd173ae05f396f5d0cff1a4e15ef72d Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Fri, 15 Apr 2016 12:51:33 -0700 Subject: [PATCH 05/27] Get rid of void* casting when calling EvalRange::run. --- .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 28 ++++++------------- 1 file changed, 8 insertions(+), 20 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index df9cc0998..7a54f7a23 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -85,8 +85,8 @@ class TensorExecutor #ifdef EIGEN_USE_THREADS template struct EvalRange { - static void run(void* evaluator_in, const Index first, const Index last) { - Evaluator evaluator(*static_cast(evaluator_in)); + static void run(Evaluator* evaluator_in, const Index first, const Index last) { + Evaluator evaluator = *evaluator_in; eigen_assert(last >= first); for (Index i = first; i < last; ++i) { evaluator.evalScalar(i); @@ -96,10 +96,9 @@ struct EvalRange { template struct EvalRange { - static void run(void* evaluator_in, const Index first, const Index last) { - Evaluator evaluator(*static_cast(evaluator_in)); + static void run(Evaluator* evaluator_in, const Index first, const Index last) { + Evaluator evaluator = *evaluator_in; eigen_assert(last >= first); - Index i = first; const int PacketSize = unpacket_traits::size; if (last - first >= PacketSize) { @@ -123,16 +122,6 @@ struct EvalRange { } }; -// Used to make an std::function to add to the ThreadPool with less templating -// than EvalRange::Run. -// This requires that this and EvalRange takes a void* to the evaluator that can -// be downcast to the right type by the EvalRange. -template -inline void InvokeEvalRange(void (*run_fn)(void*, const Index, const Index), - void* evaluator, const Index first, const Index last) { - run_fn(evaluator, first, last); -} - template class TensorExecutor { public: @@ -163,13 +152,12 @@ class TensorExecutor { Barrier barrier(numblocks); for (int i = 0; i < numblocks; ++i) { device.enqueue_with_barrier( - &barrier, &InvokeEvalRange, - &EvalRange::run, - static_cast(&evaluator), i * blocksize, - (i + 1) * blocksize); + &barrier, &EvalRange::run, + &evaluator, i * blocksize, (i + 1) * blocksize); } if (numblocks * blocksize < size) { - EvalRange::run(&evaluator, numblocks * blocksize, size); + EvalRange::run( + &evaluator, numblocks * blocksize, size); } barrier.Wait(); } From 2a7115daca3e36e1486c4e22f05ad11f8a055030 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 15 Apr 2016 22:34:11 +0200 Subject: [PATCH 06/27] bug #1203: by-pass large stack-allocation in stableNorm if EIGEN_STACK_ALLOCATION_LIMIT is too small --- Eigen/src/Core/StableNorm.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/Eigen/src/Core/StableNorm.h b/Eigen/src/Core/StableNorm.h index 7fe39808b..d2fe1e199 100644 --- a/Eigen/src/Core/StableNorm.h +++ b/Eigen/src/Core/StableNorm.h @@ -168,11 +168,12 @@ MatrixBase::stableNorm() const DerivedCopy copy(derived()); enum { - CanAlign = (int(Flags)&DirectAccessBit) || (int(internal::evaluator::Alignment)>0) // FIXME + CanAlign = ( (int(DerivedCopyClean::Flags)&DirectAccessBit) + || (int(internal::evaluator::Alignment)>0) // FIXME Alignment)>0 might not be enough + ) && (blockSize*sizeof(Scalar)*2, internal::evaluator::Alignment>, - typename DerivedCopyClean - ::ConstSegmentReturnType>::type SegmentWrapper; + typename DerivedCopyClean::ConstSegmentReturnType>::type SegmentWrapper; Index n = size(); if(n==1) From 6c43c49e4aaf833e97a3c759b79d952bfd0a4d9c Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 15 Apr 2016 15:34:34 -0700 Subject: [PATCH 07/27] Fixed a few compilation warnings --- unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h | 2 +- unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h | 2 +- unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h | 1 - unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h | 1 - 4 files changed, 2 insertions(+), 4 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 1627d4870..2671f4357 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -440,7 +440,7 @@ struct TensorContractionEvaluatorBase return m_result[index]; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool) const { return TensorOpCost(sizeof(CoeffReturnType), 0, 0); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h index 9e5791bd3..8ff7d5815 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h @@ -146,7 +146,7 @@ struct TensorEvaluator, Device> } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost - costPerCoeff(bool vectorized) const { + costPerCoeff(bool) const { // TODO(rmlarsen): This is just a placeholder. Define interface to make // generators return their cost. return TensorOpCost(0, 0, TensorOpCost::AddCost() + diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h index 0b769f278..bafcc67bd 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h @@ -448,7 +448,6 @@ struct TensorEvaluator, Device> protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const { - const int PacketSize = internal::unpacket_traits::size; EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; for (int i = 0; i < PacketSize; ++i) { values[i] = coeff(index+i); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h index f8db43dbe..e735fc76f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h @@ -525,7 +525,6 @@ struct TensorEvaluator, D protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const { - const int PacketSize = internal::unpacket_traits::size; EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; for (int i = 0; i < PacketSize; ++i) { values[i] = coeff(index+i); From 7cff898e0a2a576a17168f36ba3576b764bbf326 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 15 Apr 2016 15:46:14 -0700 Subject: [PATCH 08/27] Deleted unnecessary variable --- unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h | 1 - 1 file changed, 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index 293012646..ae4ce3c90 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -459,7 +459,6 @@ struct TensorEvaluator template EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const { - const int PacketSize = internal::unpacket_traits::size; internal::Selector select; for (Index i = 0; i < PacketSize; ++i) { select.select[i] = m_condImpl.coeff(index+i); From 1a16fb1532552c87fb1cb67ef40679b7a4744613 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 15 Apr 2016 15:50:13 -0700 Subject: [PATCH 09/27] Deleted extraneous comma. --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 3d35c8d46..bd559dc6a 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -109,7 +109,7 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n ksub = Traits::mr * Traits::nr * sizeof(ResScalar), kr = 8, mr = Traits::mr, - nr = Traits::nr, + nr = Traits::nr }; // Increasing k gives us more time to prefetch the content of the "C" // registers. However once the latency is hidden there is no point in From c8e8f93d6ce9ed02a40ea08b5ccc1ef6e82de0f2 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 15 Apr 2016 16:48:10 -0700 Subject: [PATCH 10/27] Move the evalGemm method into the TensorContractionEvaluatorBase class to make it accessible from both the single and multithreaded contraction evaluators. --- .../CXX11/src/Tensor/TensorContraction.h | 188 +++++++++--------- 1 file changed, 94 insertions(+), 94 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 2671f4357..97182258d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -426,6 +426,99 @@ struct TensorContractionEvaluatorBase buffer, resIncr, alpha); } + template + EIGEN_DEVICE_FUNC void evalGemm(Scalar* buffer) const { + // columns in left side, rows in right side + const Index k = this->m_k_size; + + // rows in left side + const Index m = this->m_i_size; + + // columns in right side + const Index n = this->m_j_size; + + // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar) + this->m_device.memset(buffer, 0, m * n * sizeof(Scalar)); + + // define mr, nr, and all of my data mapper types + typedef typename internal::remove_const::type LhsScalar; + typedef typename internal::remove_const::type RhsScalar; + typedef typename internal::gebp_traits Traits; + + const Index nr = Traits::nr; + const Index mr = Traits::mr; + + typedef TensorEvaluator LeftEvaluator; + typedef TensorEvaluator RightEvaluator; + + const Index lhs_packet_size = internal::unpacket_traits::size; + const Index rhs_packet_size = internal::unpacket_traits::size; + + typedef internal::TensorContractionInputMapper LhsMapper; + + typedef internal::TensorContractionInputMapper RhsMapper; + + typedef internal::blas_data_mapper OutputMapper; + + // Declare GEBP packing and kernel structs + internal::gemm_pack_lhs pack_lhs; + internal::gemm_pack_rhs pack_rhs; + + internal::gebp_kernel gebp; + + // initialize data mappers + LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides, + this->m_left_contracting_strides, this->m_k_strides); + + RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides, + this->m_right_contracting_strides, this->m_k_strides); + + OutputMapper output(buffer, m); + + // Sizes of the blocks to load in cache. See the Goto paper for details. + internal::TensorContractionBlocking blocking(k, m, n, 1); + const Index kc = blocking.kc(); + const Index mc = numext::mini(m, blocking.mc()); + const Index nc = numext::mini(n, blocking.nc()); + const Index sizeA = mc * kc; + const Index sizeB = kc * nc; + + LhsScalar* blockA = static_cast(this->m_device.allocate(sizeA * sizeof(LhsScalar))); + RhsScalar* blockB = static_cast(this->m_device.allocate(sizeB * sizeof(RhsScalar))); + + for(Index i2=0; i2m_device.deallocate(blockA); + this->m_device.deallocate(blockB); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { m_leftImpl.cleanup(); m_rightImpl.cleanup(); @@ -533,100 +626,7 @@ struct TensorEvaluator(buffer); - } - - template - EIGEN_DEVICE_FUNC void evalGemm(Scalar* buffer) const { - // columns in left side, rows in right side - const Index k = this->m_k_size; - - // rows in left side - const Index m = this->m_i_size; - - // columns in right side - const Index n = this->m_j_size; - - // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar) - this->m_device.memset(buffer, 0, m * n * sizeof(Scalar)); - - // define mr, nr, and all of my data mapper types - typedef typename internal::remove_const::type LhsScalar; - typedef typename internal::remove_const::type RhsScalar; - typedef typename internal::gebp_traits Traits; - - const Index nr = Traits::nr; - const Index mr = Traits::mr; - - typedef TensorEvaluator LeftEvaluator; - typedef TensorEvaluator RightEvaluator; - - const Index lhs_packet_size = internal::unpacket_traits::size; - const Index rhs_packet_size = internal::unpacket_traits::size; - - typedef internal::TensorContractionInputMapper LhsMapper; - - typedef internal::TensorContractionInputMapper RhsMapper; - - typedef internal::blas_data_mapper OutputMapper; - - // Declare GEBP packing and kernel structs - internal::gemm_pack_lhs pack_lhs; - internal::gemm_pack_rhs pack_rhs; - - internal::gebp_kernel gebp; - - // initialize data mappers - LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides, - this->m_left_contracting_strides, this->m_k_strides); - - RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides, - this->m_right_contracting_strides, this->m_k_strides); - - OutputMapper output(buffer, m); - - // Sizes of the blocks to load in cache. See the Goto paper for details. - internal::TensorContractionBlocking blocking(k, m, n, 1); - const Index kc = blocking.kc(); - const Index mc = numext::mini(m, blocking.mc()); - const Index nc = numext::mini(n, blocking.nc()); - const Index sizeA = mc * kc; - const Index sizeB = kc * nc; - - LhsScalar* blockA = static_cast(this->m_device.allocate(sizeA * sizeof(LhsScalar))); - RhsScalar* blockB = static_cast(this->m_device.allocate(sizeB * sizeof(RhsScalar))); - - for(Index i2=0; i2m_device.deallocate(blockA); - this->m_device.deallocate(blockB); + this->template evalGemm(buffer); } }; From e4fe611e2c9cf7b2853016c39898a224b3ec51a3 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sat, 16 Apr 2016 15:17:39 +0200 Subject: [PATCH 11/27] Enable lazy-coeff-based-product for vector*(1x1) products --- Eigen/src/Core/GeneralProduct.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Eigen/src/Core/GeneralProduct.h b/Eigen/src/Core/GeneralProduct.h index 53f934999..f7c5f4276 100644 --- a/Eigen/src/Core/GeneralProduct.h +++ b/Eigen/src/Core/GeneralProduct.h @@ -81,6 +81,8 @@ public: * This is a compile time mapping from {1,Small,Large}^3 -> {product types} */ // FIXME I'm not sure the current mapping is the ideal one. template struct product_type_selector { enum { ret = OuterProduct }; }; +template struct product_type_selector { enum { ret = LazyCoeffBasedProductMode }; }; +template struct product_type_selector<1, N, 1> { enum { ret = LazyCoeffBasedProductMode }; }; template struct product_type_selector<1, 1, Depth> { enum { ret = InnerProduct }; }; template<> struct product_type_selector<1, 1, 1> { enum { ret = InnerProduct }; }; template<> struct product_type_selector { enum { ret = CoeffBasedProductMode }; }; From 5fbcfe5eb45a946230c06b7a7cac2e6ec6088457 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Sun, 17 Apr 2016 18:42:31 -0700 Subject: [PATCH 12/27] Worked around the lack of a rand_r function on windows systems --- unsupported/test/cxx11_runqueue.cpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/unsupported/test/cxx11_runqueue.cpp b/unsupported/test/cxx11_runqueue.cpp index b1e2dbd6d..f8552b76d 100644 --- a/unsupported/test/cxx11_runqueue.cpp +++ b/unsupported/test/cxx11_runqueue.cpp @@ -13,6 +13,15 @@ #include "main.h" #include + +#ifdef EIGEN_COMP_MSVC_STRICT +// Visual studio doesn't implementan rand_r() function since its +// implementation of rand()is already thread safe +int rand_r(unsigned int*) { + return rand(); +} +#endif + void test_basic_runqueue() { RunQueue q; From 84543c8be2076b747bbc730a760e7693ab6caae4 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Sun, 17 Apr 2016 19:29:27 -0700 Subject: [PATCH 13/27] Worked around the lack of a rand_r function on windows systems --- unsupported/test/cxx11_eventcount.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/unsupported/test/cxx11_eventcount.cpp b/unsupported/test/cxx11_eventcount.cpp index ebd019e2d..59039dae9 100644 --- a/unsupported/test/cxx11_eventcount.cpp +++ b/unsupported/test/cxx11_eventcount.cpp @@ -12,6 +12,14 @@ #include "main.h" #include +#ifdef EIGEN_COMP_MSVC_STRICT +// Visual studio doesn't implementan rand_r() function since its +// implementation of rand()is already thread safe +int rand_r(unsigned int*) { + return rand(); +} +#endif + static void test_basic_eventcount() { std::vector waiters(1); From 50968a0a3ed2686b25f6df1687f4cf7fc6b66da1 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 19 Apr 2016 11:53:58 -0700 Subject: [PATCH 14/27] Use DenseIndex in the MeanReducer to avoid overflows when processing very large tensors. --- unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h index 44dc2d730..33cd00391 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h @@ -158,8 +158,8 @@ template struct MeanReducer } protected: - int scalarCount_; - int packetCount_; + DenseIndex scalarCount_; + DenseIndex packetCount_; }; template struct MaxReducer From f953c607058efd7c3508e888ac4bf2c526336b87 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 19 Apr 2016 12:57:39 -0700 Subject: [PATCH 15/27] Fixed 2 recent regression tests --- unsupported/test/cxx11_eventcount.cpp | 25 ++++++++++++++----------- unsupported/test/cxx11_runqueue.cpp | 27 +++++++++++++++------------ 2 files changed, 29 insertions(+), 23 deletions(-) diff --git a/unsupported/test/cxx11_eventcount.cpp b/unsupported/test/cxx11_eventcount.cpp index 59039dae9..898c4c278 100644 --- a/unsupported/test/cxx11_eventcount.cpp +++ b/unsupported/test/cxx11_eventcount.cpp @@ -12,11 +12,14 @@ #include "main.h" #include +// Visual studio doesn't implement a rand_r() function since its +// implementation of rand() is already thread safe +int rand_reentrant(unsigned int* s) { #ifdef EIGEN_COMP_MSVC_STRICT -// Visual studio doesn't implementan rand_r() function since its -// implementation of rand()is already thread safe -int rand_r(unsigned int*) { return rand(); +#else + return rand_r(s); +endif } #endif @@ -85,15 +88,15 @@ static void test_stress_eventcount() std::vector> producers; for (int i = 0; i < kThreads; i++) { producers.emplace_back(new std::thread([&ec, &queues]() { - unsigned rnd = std::hash()(std::this_thread::get_id()); - for (int i = 0; i < kEvents; i++) { - unsigned idx = rand_r(&rnd) % kQueues; + unsigned int rnd = static_cast(std::hash()(std::this_thread::get_id())); + for (int j = 0; j < kEvents; j++) { + unsigned idx = rand_reentrant(&rnd) % kQueues; if (queues[idx].Push()) { ec.Notify(false); continue; } std::this_thread::yield(); - i--; + j--; } })); } @@ -102,11 +105,11 @@ static void test_stress_eventcount() for (int i = 0; i < kThreads; i++) { consumers.emplace_back(new std::thread([&ec, &queues, &waiters, i]() { EventCount::Waiter& w = waiters[i]; - unsigned rnd = std::hash()(std::this_thread::get_id()); - for (int i = 0; i < kEvents; i++) { - unsigned idx = rand_r(&rnd) % kQueues; + unsigned int rnd = static_cast(std::hash()(std::this_thread::get_id())); + for (int j = 0; j < kEvents; k++) { + unsigned idx = rand_reentrant(&rnd) % kQueues; if (queues[idx].Pop()) continue; - i--; + j--; ec.Prewait(&w); bool empty = true; for (int q = 0; q < kQueues; q++) { diff --git a/unsupported/test/cxx11_runqueue.cpp b/unsupported/test/cxx11_runqueue.cpp index f8552b76d..5cfb38b2e 100644 --- a/unsupported/test/cxx11_runqueue.cpp +++ b/unsupported/test/cxx11_runqueue.cpp @@ -14,11 +14,14 @@ #include +// Visual studio doesn't implement a rand_r() function since its +// implementation of rand() is already thread safe +int rand_reentrant(unsigned int* s) { #ifdef EIGEN_COMP_MSVC_STRICT -// Visual studio doesn't implementan rand_r() function since its -// implementation of rand()is already thread safe -int rand_r(unsigned int*) { return rand(); +#else + return rand_r(s); +endif } #endif @@ -115,11 +118,11 @@ void test_empty_runqueue() unsigned rnd = 0; std::vector stolen; for (int i = 0; i < 1 << 18; i++) { - if (rand_r(&rnd) % 2) + if (rand_reentrant(&rnd) % 2) VERIFY_IS_EQUAL(0, q.PushFront(1)); else VERIFY_IS_EQUAL(0, q.PushBack(1)); - if (rand_r(&rnd) % 2) + if (rand_reentrant(&rnd) % 2) VERIFY_IS_EQUAL(1, q.PopFront()); else { for (;;) { @@ -176,30 +179,30 @@ void test_stress_runqueue() for (int i = 0; i < 2; i++) { threads.emplace_back(new std::thread([&q, &total]() { int sum = 0; - for (int i = 1; i < kEvents; i++) { - if (q.PushBack(i) == 0) { - sum += i; + for (int j = 1; j < kEvents; j++) { + if (q.PushBack(j) == 0) { + sum += j; continue; } std::this_thread::yield(); - i--; + j--; } total += sum; })); threads.emplace_back(new std::thread([&q, &total]() { int sum = 0; std::vector stolen; - for (int i = 1; i < kEvents;) { + for (int j = 1; j < kEvents;) { if (q.PopBackHalf(&stolen) == 0) { std::this_thread::yield(); continue; } - while (stolen.size() && i < kEvents) { + while (stolen.size() && j < kEvents) { int v = stolen.back(); stolen.pop_back(); VERIFY_IS_NOT_EQUAL(v, 0); sum += v; - i++; + j++; } } while (stolen.size()) { From a278414d1b1220d56f574a80027955c55542cd95 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 19 Apr 2016 13:54:28 -0700 Subject: [PATCH 16/27] Avoid an unnecessary copy of the evaluator. --- unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index 7a54f7a23..907da9446 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -183,7 +183,7 @@ class TensorExecutor { template struct EigenMetaKernelEval { static __device__ EIGEN_ALWAYS_INLINE - void run(Evaluator eval, Index first, Index last, Index step_size) { + void run(Evaluator& eval, Index first, Index last, Index step_size) { for (Index i = first; i < last; i += step_size) { eval.evalScalar(i); } From 884c07505868c0167467c5c3de207724b24f12ab Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 19 Apr 2016 14:33:30 -0700 Subject: [PATCH 17/27] Use numext::ceil instead of std::ceil --- unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index c33d54d6e..02193f263 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -136,7 +136,7 @@ struct FullReducer { const Index num_coeffs = array_prod(self.m_impl.dimensions()); const int block_size = 256; const int num_per_thread = 128; - const int num_blocks = std::ceil(static_cast(num_coeffs) / (block_size * num_per_thread)); + const int num_blocks = numext::ceil(static_cast(num_coeffs) / (block_size * num_per_thread)); if (num_blocks > 1) { // We initialize the outputs outside the reduction kernel when we can't be sure that there From b9ea40c30d1d32d0f31b047aa681c384fd1a2c98 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 19 Apr 2016 14:35:11 -0700 Subject: [PATCH 18/27] Don't take the address of a kernel on CUDA devices that don't support this feature. --- unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h index 821835cf3..8e7f5dddb 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h @@ -291,14 +291,17 @@ struct GpuDevice { int max_blocks_; }; -#ifndef __CUDA_ARCH__ +#if !defined(__CUDA_ARCH__) #define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \ (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \ assert(cudaGetLastError() == cudaSuccess); -#else +#elif __CUDA_ARCH__ >= 350 #define LAUNCH_CUDA_KERNEL(kernel, ...) \ { const auto __attribute__((__unused__)) __makeTheKernelInstantiate = &(kernel); } \ - eigen_assert(false && "Cannot launch a kernel from another kernel" __CUDA_ARCH__); + eigen_assert(false && "Cannot launch a kernel from another kernel" __CUDA_ARCH__ kernel); +#else +#define LAUNCH_CUDA_KERNEL(kernel, ...) \ + eigen_assert(false && "Cannot launch a kernel from another kernel" __CUDA_ARCH__ kernel); #endif From 7129d998db0a8dd74125ad7081f3d220cbce96f0 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 19 Apr 2016 14:55:21 -0700 Subject: [PATCH 19/27] Simplified the code that launches cuda kernels. --- .../Eigen/CXX11/src/Tensor/TensorDeviceCuda.h | 9 --------- unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h | 2 +- .../Eigen/CXX11/src/Tensor/TensorReductionCuda.h | 12 ++++++------ 3 files changed, 7 insertions(+), 16 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h index 8e7f5dddb..1d2d162dc 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h @@ -291,18 +291,9 @@ struct GpuDevice { int max_blocks_; }; -#if !defined(__CUDA_ARCH__) #define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \ (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \ assert(cudaGetLastError() == cudaSuccess); -#elif __CUDA_ARCH__ >= 350 -#define LAUNCH_CUDA_KERNEL(kernel, ...) \ - { const auto __attribute__((__unused__)) __makeTheKernelInstantiate = &(kernel); } \ - eigen_assert(false && "Cannot launch a kernel from another kernel" __CUDA_ARCH__ kernel); -#else -#define LAUNCH_CUDA_KERNEL(kernel, ...) \ - eigen_assert(false && "Cannot launch a kernel from another kernel" __CUDA_ARCH__ kernel); -#endif // FIXME: Should be device and kernel specific. diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index 907da9446..bf6e10a7b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -193,7 +193,7 @@ struct EigenMetaKernelEval { template struct EigenMetaKernelEval { static __device__ EIGEN_ALWAYS_INLINE - void run(Evaluator eval, Index first, Index last, Index step_size) { + void run(Evaluator& eval, Index first, Index last, Index step_size) { const Index PacketSize = unpacket_traits::size; const Index vectorized_size = (last / PacketSize) * PacketSize; const Index vectorized_step_size = step_size * PacketSize; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index 02193f263..d80436326 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -126,11 +126,11 @@ struct FullReducer { internal::is_same::value; template - static EIGEN_DEVICE_FUNC void run(const Self&, Op&, const GpuDevice&, OutputType*) { + static void run(const Self&, Op&, const GpuDevice&, OutputType*) { assert(false && "Should only be called on floats"); } - static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const GpuDevice& device, float* output) { + static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output) { typedef typename Self::Index Index; const Index num_coeffs = array_prod(self.m_impl.dimensions()); @@ -226,12 +226,12 @@ struct InnerReducer { internal::is_same::value; template - static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) { + static bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) { assert(false && "Should only be called to reduce floats on a gpu device"); return true; } - static EIGEN_DEVICE_FUNC bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { + static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { typedef typename Self::Index Index; // It's faster to use the usual code. @@ -305,12 +305,12 @@ struct OuterReducer { internal::is_same::value; template - static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) { + static bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) { assert(false && "Should only be called to reduce floats on a gpu device"); return true; } - static EIGEN_DEVICE_FUNC bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { + static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { typedef typename Self::Index Index; // It's faster to use the usual code. From 5b1106c56b64e4691c2849184d416631c689348a Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 19 Apr 2016 14:57:57 -0700 Subject: [PATCH 20/27] Fixed a compilation error with nvcc 7. --- unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index d80436326..afa5a257a 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -126,7 +126,7 @@ struct FullReducer { internal::is_same::value; template - static void run(const Self&, Op&, const GpuDevice&, OutputType*) { + static EIGEN_DEVICE_FUNC void run(const Self&, Op&, const GpuDevice&, OutputType*) { assert(false && "Should only be called on floats"); } @@ -226,7 +226,7 @@ struct InnerReducer { internal::is_same::value; template - static bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) { + static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) { assert(false && "Should only be called to reduce floats on a gpu device"); return true; } @@ -305,7 +305,7 @@ struct OuterReducer { internal::is_same::value; template - static bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) { + static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) { assert(false && "Should only be called to reduce floats on a gpu device"); return true; } From 04f954956d407fbf6cf7c2be3a993460aa608177 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 19 Apr 2016 15:27:09 -0700 Subject: [PATCH 21/27] Fixed a few typos --- unsupported/test/cxx11_eventcount.cpp | 6 +++--- unsupported/test/cxx11_runqueue.cpp | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/unsupported/test/cxx11_eventcount.cpp b/unsupported/test/cxx11_eventcount.cpp index 898c4c278..2f250338c 100644 --- a/unsupported/test/cxx11_eventcount.cpp +++ b/unsupported/test/cxx11_eventcount.cpp @@ -16,12 +16,12 @@ // implementation of rand() is already thread safe int rand_reentrant(unsigned int* s) { #ifdef EIGEN_COMP_MSVC_STRICT + EIGEN_UNUSED_VARIABLE(s); return rand(); #else return rand_r(s); -endif -} #endif +} static void test_basic_eventcount() { @@ -106,7 +106,7 @@ static void test_stress_eventcount() consumers.emplace_back(new std::thread([&ec, &queues, &waiters, i]() { EventCount::Waiter& w = waiters[i]; unsigned int rnd = static_cast(std::hash()(std::this_thread::get_id())); - for (int j = 0; j < kEvents; k++) { + for (int j = 0; j < kEvents; j++) { unsigned idx = rand_reentrant(&rnd) % kQueues; if (queues[idx].Pop()) continue; j--; diff --git a/unsupported/test/cxx11_runqueue.cpp b/unsupported/test/cxx11_runqueue.cpp index 5cfb38b2e..4207824bf 100644 --- a/unsupported/test/cxx11_runqueue.cpp +++ b/unsupported/test/cxx11_runqueue.cpp @@ -18,12 +18,12 @@ // implementation of rand() is already thread safe int rand_reentrant(unsigned int* s) { #ifdef EIGEN_COMP_MSVC_STRICT + EIGEN_UNUSED_VARIABLE(s); return rand(); #else return rand_r(s); -endif -} #endif +} void test_basic_runqueue() { From 2b7216302864715eb6950448fc71b0e4b51d617c Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 19 Apr 2016 15:56:02 -0700 Subject: [PATCH 22/27] Implemented a more portable version of thread local variables --- unsupported/Eigen/CXX11/ThreadPool | 1 + .../src/ThreadPool/NonBlockingThreadPool.h | 2 +- .../Eigen/CXX11/src/ThreadPool/ThreadLocal.h | 22 +++++++++++++++++++ 3 files changed, 24 insertions(+), 1 deletion(-) create mode 100644 unsupported/Eigen/CXX11/src/ThreadPool/ThreadLocal.h diff --git a/unsupported/Eigen/CXX11/ThreadPool b/unsupported/Eigen/CXX11/ThreadPool index 6f360ae2f..6cd057b4b 100644 --- a/unsupported/Eigen/CXX11/ThreadPool +++ b/unsupported/Eigen/CXX11/ThreadPool @@ -47,6 +47,7 @@ #include "src/ThreadPool/EventCount.h" #include "src/ThreadPool/RunQueue.h" +#include "src/ThreadPool/ThreadLocal.h" #include "src/ThreadPool/ThreadPoolInterface.h" #include "src/ThreadPool/ThreadEnvironment.h" #include "src/ThreadPool/SimpleThreadPool.h" diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h index 18dec5393..1c471a19f 100644 --- a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h +++ b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h @@ -212,7 +212,7 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface { } PerThread* GetPerThread() { - static thread_local PerThread per_thread_; + EIGEN_THREAD_LOCAL PerThread per_thread_; PerThread* pt = &per_thread_; if (pt->inited) return pt; pt->inited = true; diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/ThreadLocal.h b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadLocal.h new file mode 100644 index 000000000..cfa221732 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadLocal.h @@ -0,0 +1,22 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_THREADPOOL_THREAD_LOCAL_H +#define EIGEN_CXX11_THREADPOOL_THREAD_LOCAL_H + +// Try to come up with a portable implementation of thread local variables +#if EIGEN_COMP_GNUC && EIGEN_GNUC_AT_MOST(4, 7) +#define EIGEN_THREAD_LOCAL static __thread +#elif EIGEN_COMP_CLANG +#define EIGEN_THREAD_LOCAL static __thread +#else +#define EIGEN_THREAD_LOCAL static thread_local +#endif + +#endif // EIGEN_CXX11_THREADPOOL_THREAD_LOCAL_H From 1d0238375d53e168eda40139979f8d128e72f4e8 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 19 Apr 2016 17:44:12 -0700 Subject: [PATCH 23/27] Made sure all the required header files are included when trying to use fp16 --- Eigen/Core | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/Eigen/Core b/Eigen/Core index c7192b037..ac58f10c3 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -214,10 +214,14 @@ #include #if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500 #define EIGEN_HAS_CUDA_FP16 - #include #endif #endif +#if defined EIGEN_HAS_CUDA_FP16 + #include + #include +#endif + #if (defined _OPENMP) && (!defined EIGEN_DONT_PARALLELIZE) #define EIGEN_HAS_OPENMP #endif From c7c2054bb519ad01039560254f9dda8698cb0cd9 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 19 Apr 2016 17:59:58 -0700 Subject: [PATCH 24/27] Started to implement a portable way to yield. --- unsupported/Eigen/CXX11/ThreadPool | 3 ++- .../Eigen/CXX11/src/ThreadPool/EventCount.h | 4 ++-- .../Eigen/CXX11/src/ThreadPool/ThreadYield.h | 20 +++++++++++++++++++ 3 files changed, 24 insertions(+), 3 deletions(-) create mode 100644 unsupported/Eigen/CXX11/src/ThreadPool/ThreadYield.h diff --git a/unsupported/Eigen/CXX11/ThreadPool b/unsupported/Eigen/CXX11/ThreadPool index 6cd057b4b..fe00a0b65 100644 --- a/unsupported/Eigen/CXX11/ThreadPool +++ b/unsupported/Eigen/CXX11/ThreadPool @@ -45,9 +45,10 @@ #include #include +#include "src/ThreadPool/ThreadLocal.h" +#include "src/ThreadPool/ThreadYield.h" #include "src/ThreadPool/EventCount.h" #include "src/ThreadPool/RunQueue.h" -#include "src/ThreadPool/ThreadLocal.h" #include "src/ThreadPool/ThreadPoolInterface.h" #include "src/ThreadPool/ThreadEnvironment.h" #include "src/ThreadPool/SimpleThreadPool.h" diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h b/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h index 16eee1a41..6dd64f185 100644 --- a/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h +++ b/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h @@ -81,7 +81,7 @@ class EventCount { if (int64_t((state & kEpochMask) - epoch) < 0) { // The preceeding waiter has not decided on its fate. Wait until it // calls either CancelWait or CommitWait, or is notified. - std::this_thread::yield(); + EIGEN_THREAD_YIELD(); state = state_.load(std::memory_order_seq_cst); continue; } @@ -112,7 +112,7 @@ class EventCount { if (int64_t((state & kEpochMask) - epoch) < 0) { // The preceeding waiter has not decided on its fate. Wait until it // calls either CancelWait or CommitWait, or is notified. - std::this_thread::yield(); + EIGEN_THREAD_YIELD(); state = state_.load(std::memory_order_relaxed); continue; } diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/ThreadYield.h b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadYield.h new file mode 100644 index 000000000..a859c7ba3 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadYield.h @@ -0,0 +1,20 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_THREADPOOL_THREAD_YIELD_H +#define EIGEN_CXX11_THREADPOOL_THREAD_YIELD_H + +// Try to come up with a portable way to yield +#if EIGEN_COMP_GNUC && EIGEN_GNUC_AT_MOST(4, 7) +#define EIGEN_THREAD_YIELD() sched_yield() +#else +#define EIGEN_THREAD_YIELD() std::this_thread::yield() +#endif + +#endif // EIGEN_CXX11_THREADPOOL_THREAD_YIELD_H From 80200a182862bf7c19b51645ab8e7b9e15b65a90 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 20 Apr 2016 12:10:27 -0700 Subject: [PATCH 25/27] Don't attempt to leverage the _cvtss_sh and _cvtsh_ss instructions when compiling with clang since it's unclear which versions of clang actually support these instruction. --- Eigen/Core | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/Core b/Eigen/Core index ac58f10c3..50040135f 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -204,7 +204,7 @@ #endif #endif -#if defined(__F16C__) +#if defined(__F16C__) && !defined(EIGEN_COMP_CLANG) // We can use the optimized fp16 to float and float to fp16 conversion routines #define EIGEN_HAS_FP16_C #endif From a792cd357d31f0a4fce62ed1fa4cc0334cf2f143 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 20 Apr 2016 17:33:58 -0700 Subject: [PATCH 26/27] Added more tests --- unsupported/test/cxx11_tensor_fixed_size.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/unsupported/test/cxx11_tensor_fixed_size.cpp b/unsupported/test/cxx11_tensor_fixed_size.cpp index 1c33fefb3..5fe164859 100644 --- a/unsupported/test/cxx11_tensor_fixed_size.cpp +++ b/unsupported/test/cxx11_tensor_fixed_size.cpp @@ -20,6 +20,8 @@ static void test_0d() TensorFixedSize > scalar1; TensorFixedSize, RowMajor> scalar2; VERIFY_IS_EQUAL(scalar1.rank(), 0); + VERIFY_IS_EQUAL(scalar1.size(), 1); + VERIFY_IS_EQUAL(array_prod(scalar1.dimensions()), 1); scalar1() = 7.0; scalar2() = 13.0; From 2dde1b102866e1928e925678951463f2a7051af1 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 20 Apr 2016 18:08:20 -0700 Subject: [PATCH 27/27] Don't crash when attempting to reduce empty tensors. --- unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h | 2 +- unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h | 12 ++++++++++-- .../Eigen/CXX11/src/Tensor/TensorReductionCuda.h | 7 ++++++- 3 files changed, 17 insertions(+), 4 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index bf6e10a7b..c3edae477 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -238,7 +238,7 @@ inline void TensorExecutor::run( device.maxCudaThreadsPerMultiProcessor() / block_size; const Index size = array_prod(evaluator.dimensions()); // Create a least one block to ensure we won't crash when tensorflow calls with tensors of size 0. - const int num_blocks = numext::maxi(numext::mini(max_blocks, (size + block_size - 1) / block_size), 1); + const int num_blocks = numext::maxi(numext::mini(max_blocks, divup(size, block_size)), 1); LAUNCH_CUDA_KERNEL( (EigenMetaKernel, Index>), diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h index 6af2d45d4..cd04716bd 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h @@ -24,9 +24,17 @@ const T2& choose(Cond, const T1&, const T2& second) { return second; } -template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +T divup(const X x, const Y y) { + return static_cast((x + y - 1) / y); +} + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T divup(const T x, const T y) { - return (x + y - 1) / y; + return static_cast((x + y - 1) / y); } template struct max_n_1 { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index afa5a257a..fd2587dd5 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -134,9 +134,14 @@ struct FullReducer { typedef typename Self::Index Index; const Index num_coeffs = array_prod(self.m_impl.dimensions()); + // Don't crash when we're called with an input tensor of size 0. + if (num_coeffs == 0) { + return; + } + const int block_size = 256; const int num_per_thread = 128; - const int num_blocks = numext::ceil(static_cast(num_coeffs) / (block_size * num_per_thread)); + const int num_blocks = divup(num_coeffs, block_size * num_per_thread); if (num_blocks > 1) { // We initialize the outputs outside the reduction kernel when we can't be sure that there