diff --git a/unsupported/Eigen/src/Tensor/TensorContraction.h b/unsupported/Eigen/src/Tensor/TensorContraction.h index f6cf8402c..c3f9135f0 100644 --- a/unsupported/Eigen/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/src/Tensor/TensorContraction.h @@ -40,7 +40,7 @@ struct traits::val, typename traits::PointerType, typename traits::PointerType>; - enum { Flags = 0 }; + static constexpr int Flags = 0; }; template @@ -168,7 +168,7 @@ template +EIGEN_STRONG_INLINE void tensor_contraction_dispatch(Func&& fn, bool lhs_inner_dim_contiguous, + bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered) { + if (lhs_inner_dim_contiguous) { + if (rhs_inner_dim_contiguous) { + if (rhs_inner_dim_reordered) + fn(bool_constant{}, bool_constant{}, bool_constant{}); + else + fn(bool_constant{}, bool_constant{}, bool_constant{}); + } else { + if (rhs_inner_dim_reordered) + fn(bool_constant{}, bool_constant{}, bool_constant{}); + else + fn(bool_constant{}, bool_constant{}, bool_constant{}); + } + } else { + if (rhs_inner_dim_contiguous) { + if (rhs_inner_dim_reordered) + fn(bool_constant{}, bool_constant{}, bool_constant{}); + else + fn(bool_constant{}, bool_constant{}, bool_constant{}); + } else { + if (rhs_inner_dim_reordered) + fn(bool_constant{}, bool_constant{}, bool_constant{}); + else + fn(bool_constant{}, bool_constant{}, bool_constant{}); + } + } +} + } // end namespace internal +// Legacy macros kept for backward compatibility with code that overrides them +// (e.g. TensorFlow Lite restricts template instantiations for binary size). +// New Eigen code should use internal::tensor_contraction_dispatch() instead. 
+#ifndef TENSOR_CONTRACTION_DISPATCH +#define TENSOR_CONTRACTION_DISPATCH(METHOD, ALIGNMENT, ARGS) \ + ::Eigen::internal::tensor_contraction_dispatch( \ + [&](auto lhs_c, auto rhs_c, auto rhs_r) { METHOD ARGS; }, \ + this->m_lhs_inner_dim_contiguous, this->m_rhs_inner_dim_contiguous, this->m_rhs_inner_dim_reordered) +#endif + +#ifndef TENSOR_CONTRACTION_ASYNC_DISPATCH +#define TENSOR_CONTRACTION_ASYNC_DISPATCH(METHOD, DONE, ALIGNMENT, ARGS, FN) \ + ::Eigen::internal::tensor_contraction_dispatch( \ + [&](auto lhs_c, auto rhs_c, auto rhs_r) { (new METHOD ARGS)->FN; }, \ + this->m_lhs_inner_dim_contiguous, this->m_rhs_inner_dim_contiguous, this->m_rhs_inner_dim_reordered) +#endif + // Tensor contraction params that should enable to get from output matrix // 2-dimensional coordinates to the output tensor dimensions. struct TensorContractionParams { @@ -281,16 +332,8 @@ struct NoOpOutputKernel { * \param[in] num_cols Number of available columns */ template - EIGEN_ALWAYS_INLINE void operator()(const internal::blas_data_mapper& output_mapper, - const TensorContractionParams& params, Index i, Index j, Index num_rows, - Index num_cols) const { - EIGEN_UNUSED_VARIABLE(output_mapper); - EIGEN_UNUSED_VARIABLE(params); - EIGEN_UNUSED_VARIABLE(i); - EIGEN_UNUSED_VARIABLE(j); - EIGEN_UNUSED_VARIABLE(num_rows); - EIGEN_UNUSED_VARIABLE(num_cols); - } + EIGEN_ALWAYS_INLINE void operator()(const internal::blas_data_mapper&, + const TensorContractionParams&, Index, Index, Index, Index) const {} }; /** Tensor contraction class. 
@@ -350,14 +393,12 @@ struct TensorContractionEvaluatorBase { using EvaluatorPointerType = typename Storage::Type; static constexpr int Layout = TensorEvaluator::Layout; - enum { - IsAligned = true, - PacketAccess = (PacketType::size > 1), - BlockAccess = false, - PreferBlockAccess = false, - CoordAccess = false, // to be implemented - RawAccess = true - }; + static constexpr bool IsAligned = true; + static constexpr bool PacketAccess = (PacketType::size > 1); + static constexpr bool BlockAccess = false; + static constexpr bool PreferBlockAccess = false; + static constexpr bool CoordAccess = false; // to be implemented + static constexpr bool RawAccess = true; //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// using TensorBlock = internal::TensorBlockNotImplemented; @@ -397,7 +438,7 @@ struct TensorContractionEvaluatorBase { device), m_device(device), m_output_kernel(op.outputKernel()), - m_result(NULL) { + m_result(nullptr) { EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); @@ -581,8 +622,8 @@ struct TensorContractionEvaluatorBase { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { - m_leftImpl.evalSubExprsIfNeeded(NULL); - m_rightImpl.evalSubExprsIfNeeded(NULL); + m_leftImpl.evalSubExprsIfNeeded(nullptr); + m_rightImpl.evalSubExprsIfNeeded(nullptr); if (data) { evalTo(data); return false; @@ -609,72 +650,6 @@ struct TensorContractionEvaluatorBase { } #endif // EIGEN_USE_THREADS -#ifndef TENSOR_CONTRACTION_DISPATCH -#define TENSOR_CONTRACTION_DISPATCH(METHOD, ALIGNMENT, ARGS) \ - if (this->m_lhs_inner_dim_contiguous) { \ - if (this->m_rhs_inner_dim_contiguous) { \ - if (this->m_rhs_inner_dim_reordered) { \ - METHOD ARGS; \ - } else { \ - METHOD ARGS; \ - } \ - } else { \ - if (this->m_rhs_inner_dim_reordered) { \ - METHOD ARGS; \ - 
} else { \ - METHOD ARGS; \ - } \ - } \ - } else { \ - if (this->m_rhs_inner_dim_contiguous) { \ - if (this->m_rhs_inner_dim_reordered) { \ - METHOD ARGS; \ - } else { \ - METHOD ARGS; \ - } \ - } else { \ - if (this->m_rhs_inner_dim_reordered) { \ - METHOD ARGS; \ - } else { \ - METHOD ARGS; \ - } \ - } \ - } -#endif - -#ifndef TENSOR_CONTRACTION_ASYNC_DISPATCH -#define TENSOR_CONTRACTION_ASYNC_DISPATCH(METHOD, DONE, ALIGNMENT, ARGS, FN) \ - if (this->m_lhs_inner_dim_contiguous) { \ - if (this->m_rhs_inner_dim_contiguous) { \ - if (this->m_rhs_inner_dim_reordered) { \ - (new METHOD ARGS)->FN; \ - } else { \ - (new METHOD ARGS)->FN; \ - } \ - } else { \ - if (this->m_rhs_inner_dim_reordered) { \ - (new METHOD ARGS)->FN; \ - } else { \ - (new METHOD ARGS)->FN; \ - } \ - } \ - } else { \ - if (this->m_rhs_inner_dim_contiguous) { \ - if (this->m_rhs_inner_dim_reordered) { \ - (new METHOD ARGS)->FN; \ - } else { \ - (new METHOD ARGS)->FN; \ - } \ - } else { \ - if (this->m_rhs_inner_dim_reordered) { \ - (new METHOD ARGS)->FN; \ - } else { \ - (new METHOD ARGS)->FN; \ - } \ - } \ - } -#endif - EIGEN_DEVICE_FUNC void evalTo(Scalar* buffer) const { static_cast(this)->template evalProduct(buffer); } @@ -867,9 +842,9 @@ struct TensorContractionEvaluatorBase { m_leftImpl.cleanup(); m_rightImpl.cleanup(); - if (m_result != NULL) { + if (m_result != nullptr) { m_device.deallocate(m_result); - m_result = NULL; + m_result = nullptr; } } @@ -957,7 +932,11 @@ struct TensorEvaluator void evalProduct(Scalar* buffer) const { - TENSOR_CONTRACTION_DISPATCH(this->template evalProductSequential, Alignment, (buffer)); + internal::tensor_contraction_dispatch( + [&](auto lhs_c, auto rhs_c, auto rhs_r) { + this->template evalProductSequential(buffer); + }, + this->m_lhs_inner_dim_contiguous, this->m_rhs_inner_dim_contiguous, this->m_rhs_inner_dim_reordered); } }; diff --git a/unsupported/Eigen/src/Tensor/TensorContractionBlocking.h b/unsupported/Eigen/src/Tensor/TensorContractionBlocking.h 
index 7fbe30a9f..7fd21fbd9 100644 --- a/unsupported/Eigen/src/Tensor/TensorContractionBlocking.h +++ b/unsupported/Eigen/src/Tensor/TensorContractionBlocking.h @@ -16,7 +16,8 @@ namespace Eigen { namespace internal { -enum { ShardByRow = 0, ShardByCol = 1 }; +constexpr int ShardByRow = 0; +constexpr int ShardByCol = 1; // Default Blocking Strategy template class MakePointer_> struct CoeffLoader { - enum { DirectOffsets = false }; + static constexpr bool DirectOffsets = false; EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffLoader(const Tensor& tensor) : m_tensor(tensor) {} @@ -44,7 +45,7 @@ struct CoeffLoader { EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const typename MakePointer_::Type data() const { eigen_assert(false && "unsupported"); - return NULL; + return nullptr; } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename Tensor::Scalar coeff(typename Tensor::Index index) const { @@ -62,7 +63,7 @@ struct CoeffLoader { template class MakePointer_> struct CoeffLoader { - enum { DirectOffsets = true }; + static constexpr bool DirectOffsets = true; EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffLoader(const Tensor& tensor) : m_data(tensor.data()) {} @@ -100,7 +101,7 @@ class SimpleTensorContractionMapper { m_contract_strides(contract_strides), m_k_strides(k_strides) {} - enum { DirectOffsets = CoeffLoader::DirectOffsets }; + static constexpr bool DirectOffsets = CoeffLoader::DirectOffsets; EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void offsetBuffer(typename Tensor::Index offset) { m_tensor.offsetBuffer(offset); @@ -123,7 +124,6 @@ class SimpleTensorContractionMapper { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index computeIndex(Index row, Index col) const { const bool left = (side == Lhs); - EIGEN_UNUSED_VARIABLE(left); // annoying bug in g++8.1: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85963 Index nocontract_val = left ? 
row : col; Index linidx = 0; EIGEN_UNROLL_LOOP @@ -164,7 +164,6 @@ class SimpleTensorContractionMapper { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexPair computeIndexPair(Index row, Index col, const Index distance) const { const bool left = (side == Lhs); - EIGEN_UNUSED_VARIABLE(left); // annoying bug in g++8.1: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85963 Index nocontract_val[2] = {left ? row : col, left ? row + distance : col}; Index linidx[2] = {0, 0}; if (array_size::value > array_size::value) { @@ -363,12 +362,10 @@ class TensorContractionSubMapper { using LinearMapper = Self; using SubMapper = Self; - enum { - // We can use direct offsets iff the parent mapper supports then and we can compute the strides. - // TODO: we should also enable direct offsets for the Rhs case. - UseDirectOffsets = - ParentMapper::DirectOffsets && (side == Lhs) && inner_dim_contiguous && (array_size::value > 0) - }; + // We can use direct offsets iff the parent mapper supports them and we can compute the strides. + // TODO: we should also enable direct offsets for the Rhs case. 
+ static constexpr bool UseDirectOffsets = + ParentMapper::DirectOffsets && (side == Lhs) && inner_dim_contiguous && (array_size::value > 0); EIGEN_DEVICE_FUNC TensorContractionSubMapper(const ParentMapper& base_mapper, Index vert_offset, Index horiz_offset) : m_base_mapper(base_mapper), m_vert_offset(vert_offset), m_horiz_offset(horiz_offset) { @@ -429,8 +426,9 @@ class TensorContractionSubMapper { EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, const PacketT& p) const { if (UseDirectOffsets) { m_base_mapper.storePacket(i, 0, p); + } else { + m_base_mapper.storePacket(i + m_vert_offset, m_horiz_offset, p); } - m_base_mapper.storePacket(i + m_vert_offset, m_horiz_offset, p); } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const { @@ -451,7 +449,7 @@ class TensorContractionSubMapper { template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT load(Index i) const { - EIGEN_STATIC_ASSERT((internal::is_same::value), YOU_MADE_A_PROGRAMMING_MISTAKE); + static_assert(std::is_same::value, "YOU_MADE_A_PROGRAMMING_MISTAKE"); const int ActualAlignment = (AlignmentType == Aligned) && (Alignment == Aligned) ? 
Aligned : Unaligned; if (UseDirectOffsets) { return m_base_mapper.template loadPacket(i, 0); diff --git a/unsupported/Eigen/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/src/Tensor/TensorContractionThreadPool.h index 5dd5d6036..0e102c2c3 100644 --- a/unsupported/Eigen/src/Tensor/TensorContractionThreadPool.h +++ b/unsupported/Eigen/src/Tensor/TensorContractionThreadPool.h @@ -99,7 +99,7 @@ struct TensorEvaluator::value; + static constexpr bool IsEvalInSyncMode = std::is_same::value; const Index m = this->m_i_size; const Index n = this->m_j_size; @@ -176,7 +176,11 @@ struct TensorEvaluatortemplate evalProductSequential, Unaligned, (buffer)); + internal::tensor_contraction_dispatch( + [&](auto lhs_c, auto rhs_c, auto rhs_r) { + this->template evalProductSequential(buffer); + }, + this->m_lhs_inner_dim_contiguous, this->m_rhs_inner_dim_contiguous, this->m_rhs_inner_dim_reordered); if (!IsEvalInSyncMode) done(); return; } @@ -258,22 +262,22 @@ struct TensorEvaluator ctx( + this, num_threads, buffer, m, n, k, bm, bn, bk, nm, nn, nk, gm, gn, nm0, nn0, shard_by_col, + parallel_pack, parallelize_by_sharding_dim_only, NoCallback()); + ctx.run(); + } + else { + auto* ctx = new EvalParallelContext( + this, num_threads, buffer, m, n, k, bm, bn, bk, nm, nn, nk, gm, gn, nm0, nn0, shard_by_col, + parallel_pack, parallelize_by_sharding_dim_only, std::move(done)); + ctx->run(); + } + }, + this->m_lhs_inner_dim_contiguous, this->m_rhs_inner_dim_contiguous, this->m_rhs_inner_dim_reordered); } // ------------------------------------------------------------------------ // @@ -432,11 +436,11 @@ struct TensorEvaluator class ThreadLocalBlocksInitialize { static constexpr bool kIsLhs = !is_rhs && std::is_same::value; - static const bool kIsRhs = is_rhs && std::is_same::value; + static constexpr bool kIsRhs = is_rhs && std::is_same::value; static_assert(kIsLhs || kIsRhs, "Unknown block type"); using Blocks = ThreadLocalBlocks; @@ -668,10 +672,8 @@ struct TensorEvaluator 
struct ThreadLocalBlocksAllocator; @@ -684,7 +686,6 @@ struct TensorEvaluator(std::move(mem_handle), std::move(rhs_blocks)); } @@ -703,7 +704,6 @@ struct TensorEvaluator(std::move(mem_handle), std::move(lhs_blocks)); } @@ -1015,10 +1015,6 @@ struct TensorEvaluator - using SyncEvalParallelContext = EvalParallelContext; - // ------------------------------------------------------------------------ // // EvalShardedByInnerDimContext orchestrates sync/async contraction @@ -1086,11 +1082,11 @@ struct TensorEvaluator::size; + static constexpr Index packet_size = internal::packet_traits::size; const Self* evaluator; // TensorContraction evaluator - // These fields required fromTENSOR_CONTRACTION_DISPATCH macro. + // These fields cache values from the evaluator for use in processBlock dispatch. bool m_lhs_inner_dim_contiguous; bool m_rhs_inner_dim_contiguous; bool m_rhs_inner_dim_reordered; @@ -1131,7 +1127,7 @@ struct TensorEvaluator= ~128? - static const Index l0_size = 4; + static constexpr Index l0_size = 4; Index l0_ranges; // Keep count of pending gemm tasks for each l0 range. @@ -1144,9 +1140,12 @@ struct TensorEvaluatortemplate evalGemmPartialWithoutOutputKernel, Alignment, - (buf, begin, end, - /*num_threads=*/internal::convert_index(num_blocks))); + internal::tensor_contraction_dispatch( + [&](auto lhs_c, auto rhs_c, auto rhs_r) { + evaluator->template evalGemmPartialWithoutOutputKernel( + buf, begin, end, /*num_threads=*/internal::convert_index(num_blocks)); + }, + m_lhs_inner_dim_contiguous, m_rhs_inner_dim_contiguous, m_rhs_inner_dim_reordered); // Check if it was the last task in l0 range. 
const Index l0_index = block_idx / l0_size; diff --git a/unsupported/benchmarks/Tensor/bench_contraction.cpp b/unsupported/benchmarks/Tensor/bench_contraction.cpp index faf648f6d..83b1f19be 100644 --- a/unsupported/benchmarks/Tensor/bench_contraction.cpp +++ b/unsupported/benchmarks/Tensor/bench_contraction.cpp @@ -131,7 +131,7 @@ static void ContractionSizes(::benchmark::Benchmark* b) { static void ThreadPoolSizes(::benchmark::Benchmark* b) { for (int size : {64, 256, 512, 1024}) { - for (int threads : {2, 4, 8}) { + for (int threads : {1, 2, 4, 8, 16}) { b->Args({size, size, size, threads}); } }