From bf5326c3cad4b92082bb347a09e9a9ab6d50075b Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 8 May 2008 08:12:52 +0000 Subject: [PATCH] * Added ReferencableBit flag to known if coeffRef is available. (needed by the new product implementation) * Make the packet* members template to support aligned and unaligned access. This makes Block vectorizable. Combined with ReferencableBit, we should be able to determine at runtime (in some specific cases) if an aligned vectorization is possible or not. * Improved the new product implementation to robustly handle all cases, it now passes all the tests. * Renamed the packet version ei_predux to ei_preduxp to avoid name collision. --- Eigen/Core | 5 + Eigen/src/Core/Block.h | 6 +- Eigen/src/Core/Map.h | 2 +- Eigen/src/Core/PacketMath.h | 4 +- Eigen/src/Core/PacketMath_SSE.h | 40 +++++- Eigen/src/Core/Product.h | 6 + Eigen/src/Core/ProductWIP.h | 235 +++++++++++++++++--------------- Eigen/src/Core/util/Constants.h | 6 +- Eigen/src/Core/util/Macros.h | 7 + 9 files changed, 191 insertions(+), 120 deletions(-) diff --git a/Eigen/Core b/Eigen/Core index 999bb96f2..cc05c308e 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -1,13 +1,18 @@ #ifndef EIGEN_CORE_H #define EIGEN_CORE_H +#define EIGEN_WIP_PRODUCT + #ifndef EIGEN_DONT_VECTORIZE #if ((defined __SSE2__) && ( (!defined __GNUC__) || (__GNUC__>=4 && __GNUC_MINOR__>=2))) #define EIGEN_VECTORIZE #define EIGEN_VECTORIZE_SSE #include #include +// SSE3: #include +// SSSE3: +//#include #endif #ifdef __ALTIVEC__ // There are zero chances of both __SSE2__ AND __ALTIVEC__ been defined #define EIGEN_VECTORIZE diff --git a/Eigen/src/Core/Block.h b/Eigen/src/Core/Block.h index 247a46b40..00bb375a9 100644 --- a/Eigen/src/Core/Block.h +++ b/Eigen/src/Core/Block.h @@ -71,7 +71,7 @@ struct ei_traits > || (ColsAtCompileTime != Dynamic && MatrixType::ColsAtCompileTime == Dynamic)) ? ~LargeBit : ~(unsigned int)0, - Flags = MatrixType::Flags & DefaultLostFlagMask & FlagsMaskLargeBit, + Flags = MatrixType::Flags & (DefaultLostFlagMask | VectorizableBit | ReferencableBit) & FlagsMaskLargeBit, CoeffReadCost = MatrixType::CoeffReadCost }; }; @@ -146,13 +146,13 @@ template class Block template PacketScalar _packetCoeff(int row, int col) const { - return m_matrix.packetCoeff(row + m_startRow.value(), col + m_startCol.value()); + return m_matrix.template packetCoeff(row + m_startRow.value(), col + m_startCol.value()); } template void _writePacketCoeff(int row, int col, const PacketScalar& x) { - m_matrix.const_cast_derived().writePacketCoeff(row + m_startRow.value(), col + m_startCol.value(), x); + m_matrix.const_cast_derived().template writePacketCoeff(row + m_startRow.value(), col + m_startCol.value(), x); } protected: diff --git a/Eigen/src/Core/Map.h b/Eigen/src/Core/Map.h index 266da0cfe..e9f65cd7a 100644 --- a/Eigen/src/Core/Map.h +++ b/Eigen/src/Core/Map.h @@ -47,7 +47,7 @@ struct ei_traits > ColsAtCompileTime = MatrixType::ColsAtCompileTime, MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime, - Flags = MatrixType::Flags & DefaultLostFlagMask, + Flags = MatrixType::Flags & (DefaultLostFlagMask | ReferencableBit), CoeffReadCost = NumTraits::ReadCost }; }; diff --git a/Eigen/src/Core/PacketMath.h b/Eigen/src/Core/PacketMath.h index 59c143ede..267192482 100644 --- a/Eigen/src/Core/PacketMath.h +++ b/Eigen/src/Core/PacketMath.h @@ -71,10 +71,10 @@ template inline void ei_pstoreu(Scalar* to, const Scalar& from template inline Scalar ei_pfirst(const Scalar& a) { return a; } /** \internal \returns a packet where the element i contains the sum of the packet of \a vec[i] */ -// template inline Scalar ei_predux(const Scalar* vecs) { return vecs[0]; } +template inline Scalar ei_preduxp(const Scalar* vecs) { return vecs[0]; } /** \internal \returns the sum of the elements of \a a*/ -// template inline Scalar ei_predux(const Scalar& a) { return a; } +template inline Scalar ei_predux(const Scalar& a) { return a; } #endif // EIGEN_PACKET_MATH_H diff --git a/Eigen/src/Core/PacketMath_SSE.h b/Eigen/src/Core/PacketMath_SSE.h index 1872affea..dd467f52a 100644 --- a/Eigen/src/Core/PacketMath_SSE.h +++ b/Eigen/src/Core/PacketMath_SSE.h @@ -102,14 +102,19 @@ inline int ei_pfirst(const __m128i& a) { return _mm_cvtsi128_si32(a); } #ifdef __SSE3__ // TODO implement SSE2 versions as well as integer versions -inline __m128 ei_predux(const __m128* vecs) +inline __m128 ei_preduxp(const __m128* vecs) { return _mm_hadd_ps(_mm_hadd_ps(vecs[0], vecs[1]),_mm_hadd_ps(vecs[2], vecs[3])); } -inline __m128d ei_predux(const __m128d* vecs) +inline __m128d ei_preduxp(const __m128d* vecs) { return _mm_hadd_pd(vecs[0], vecs[1]); } +// SSSE3 version: +// inline __m128i ei_preduxp(const __m128i* vecs) +// { +// return _mm_hadd_epi32(_mm_hadd_epi32(vecs[0], vecs[1]),_mm_hadd_epi32(vecs[2], vecs[3])); +// } inline float ei_predux(const __m128& a) { @@ -118,6 +123,13 @@ inline float ei_predux(const __m128& a) } inline double ei_predux(const __m128d& a) { return ei_pfirst(_mm_hadd_pd(a, a)); } + +// SSSE3 version: +// inline float ei_predux(const __m128i& a) +// { +// __m128i tmp0 = _mm_hadd_epi32(a,a); +// return ei_pfirst(_mm_hadd_epi32(tmp0, tmp0)); +// } #else // SSE2 versions inline float ei_predux(const __m128& a) @@ -130,7 +142,7 @@ inline double ei_predux(const __m128d& a) return ei_pfirst(_mm_add_sd(a, _mm_unpackhi_pd(a,a))); } -inline __m128 ei_predux(const __m128* vecs) +inline __m128 ei_preduxp(const __m128* vecs) { __m128 tmp0, tmp1, tmp2; tmp0 = _mm_unpacklo_ps(vecs[0], vecs[1]); @@ -144,11 +156,31 @@ inline __m128 ei_predux(const __m128* vecs) return _mm_add_ps(tmp0, tmp2); } -inline __m128d ei_predux(const __m128d* vecs) +inline __m128d ei_preduxp(const __m128d* vecs) { return _mm_add_pd(_mm_unpacklo_pd(vecs[0], vecs[1]), _mm_unpackhi_pd(vecs[0], vecs[1])); } #endif // SSE3 +inline int ei_predux(const __m128i& a) +{ + __m128i tmp = _mm_add_epi32(a, _mm_unpackhi_epi64(a,a)); + return ei_pfirst(tmp) + ei_pfirst(_mm_shuffle_epi32(tmp, 1)); +} + +inline __m128i ei_preduxp(const __m128i* vecs) +{ + __m128i tmp0, tmp1, tmp2; + tmp0 = _mm_unpacklo_epi32(vecs[0], vecs[1]); + tmp1 = _mm_unpackhi_epi32(vecs[0], vecs[1]); + tmp2 = _mm_unpackhi_epi32(vecs[2], vecs[3]); + tmp0 = _mm_add_epi32(tmp0, tmp1); + tmp1 = _mm_unpacklo_epi32(vecs[2], vecs[3]); + tmp1 = _mm_add_epi32(tmp1, tmp2); + tmp2 = _mm_unpacklo_epi64(tmp0, tmp1); + tmp0 = _mm_unpackhi_epi64(tmp0, tmp1); + return _mm_add_epi32(tmp0, tmp2); +} + #endif // EIGEN_PACKET_MATH_SSE_H diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h index c1b2f5457..66c6d4d5b 100644 --- a/Eigen/src/Core/Product.h +++ b/Eigen/src/Core/Product.h @@ -107,6 +107,12 @@ struct ei_packet_product_unroller +struct ei_packet_product_unroller +{ + static void run(int, int, const Lhs&, const Rhs&, PacketScalar&) {} +}; + template struct ProductPacketCoeffImpl { inline static typename Product::PacketScalar execute(const Product& product, int row, int col) { return product._packetCoeffRowMajor(row,col); } diff --git a/Eigen/src/Core/ProductWIP.h b/Eigen/src/Core/ProductWIP.h index 57bb899d6..a5fc9d298 100644 --- a/Eigen/src/Core/ProductWIP.h +++ b/Eigen/src/Core/ProductWIP.h @@ -73,7 +73,7 @@ struct ei_packet_product_unroller static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res) { ei_packet_product_unroller::run(row, col, lhs, rhs, res); - res = ei_pmadd(ei_pset1(lhs.coeff(row, Index)), rhs.packetCoeff(Index, col), res); + res = ei_pmadd(ei_pset1(lhs.coeff(row, Index)), rhs.template packetCoeff(Index, col), res); } }; @@ -83,7 +83,7 @@ struct ei_packet_product_unroller static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res) { ei_packet_product_unroller::run(row, col, lhs, rhs, res); - res = ei_pmadd(lhs.packetCoeff(row, Index), ei_pset1(rhs.coeff(Index, col)), res); + res = ei_pmadd(lhs.template packetCoeff(row, Index), ei_pset1(rhs.coeff(Index, col)), res); } }; @@ -92,7 +92,7 @@ struct ei_packet_product_unroller { static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res) { - res = ei_pmul(ei_pset1(lhs.coeff(row, 0)),rhs.packetCoeff(0, col)); + res = ei_pmul(ei_pset1(lhs.coeff(row, 0)),rhs.template packetCoeff(0, col)); } }; @@ -101,7 +101,7 @@ struct ei_packet_product_unroller { static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res) { - res = ei_pmul(lhs.packetCoeff(row, 0), ei_pset1(rhs.coeff(0, col))); + res = ei_pmul(lhs.template packetCoeff(row, 0), ei_pset1(rhs.coeff(0, col))); } }; @@ -111,6 +111,12 @@ struct ei_packet_product_unroller +struct ei_packet_product_unroller +{ + static void run(int, int, const Lhs&, const Rhs&, PacketScalar&) {} +}; + template struct ProductPacketCoeffImpl { inline static typename Product::PacketScalar execute(const Product& product, int row, int col) { return product._packetCoeffRowMajor(row,col); } @@ -139,16 +145,53 @@ template struct ei_product_eval_mode { enum{ value = Lhs::MaxRowsAtCompileTime >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD && Rhs::MaxColsAtCompileTime >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD - && (!( (Lhs::Flags&RowMajorBit) && ((Rhs::Flags&RowMajorBit) ^ RowMajorBit))) ? CacheOptimalProduct : NormalProduct }; }; +template class ei_product_eval_to_column_major +{ + typedef typename ei_traits::Scalar _Scalar; + enum {_MaxRows = ei_traits::MaxRowsAtCompileTime, + _MaxCols = ei_traits::MaxColsAtCompileTime, + _Flags = ei_traits::Flags + }; + + public: + typedef Matrix<_Scalar, + ei_traits::RowsAtCompileTime, + ei_traits::ColsAtCompileTime, + ei_corrected_matrix_flags<_Scalar, ei_size_at_compile_time<_MaxRows,_MaxCols>::ret, _Flags>::ret & ~RowMajorBit, + ei_traits::MaxRowsAtCompileTime, + ei_traits::MaxColsAtCompileTime> type; +}; + +template struct ei_product_nested_rhs +{ + typedef typename ei_meta_if< + ei_is_temporary::ret && !(ei_traits::Flags & RowMajorBit), + T, + typename ei_meta_if< + (ei_traits::Flags & EvalBeforeNestingBit) + || (ei_traits::Flags & RowMajorBit) + || (!(ei_traits::Flags & ReferencableBit)) + || (n+1) * NumTraits::Scalar>::ReadCost < (n-1) * T::CoeffReadCost, + typename ei_product_eval_to_column_major::type, + const T& + >::ret + >::ret type; +}; + template struct ei_traits > { typedef typename Lhs::Scalar Scalar; - typedef typename ei_nested::type LhsNested; - typedef typename ei_nested::type RhsNested; + // the cache friendly product evals lhs once only + // FIXME what to do if we chose to dynamically call the normal product from the cache friendly one for small matrices ? + typedef typename ei_nested::type LhsNested; + // NOTE that rhs must be ColumnMajor, so we might need a special nested type calculation + typedef typename ei_meta_if::type, + typename ei_nested::type>::ret RhsNested; typedef typename ei_unref::type _LhsNested; typedef typename ei_unref::type _RhsNested; enum { @@ -160,16 +203,20 @@ struct ei_traits > ColsAtCompileTime = Rhs::ColsAtCompileTime, MaxRowsAtCompileTime = Lhs::MaxRowsAtCompileTime, MaxColsAtCompileTime = Rhs::MaxColsAtCompileTime, + // the vectorization flags are only used by the normal product, + // the other one is always vectorized ! _RhsVectorizable = (RhsFlags & RowMajorBit) && (RhsFlags & VectorizableBit) && (ColsAtCompileTime % ei_packet_traits::size == 0), _LhsVectorizable = (!(LhsFlags & RowMajorBit)) && (LhsFlags & VectorizableBit) && (RowsAtCompileTime % ei_packet_traits::size == 0), - _Vectorizable = (_LhsVectorizable || _RhsVectorizable) ? 1 : 0, + _Vectorizable = (_LhsVectorizable || _RhsVectorizable) ? 0 : 0, _RowMajor = (RhsFlags & RowMajorBit) && (EvalMode==(int)CacheOptimalProduct ? (int)LhsFlags & RowMajorBit : (!_LhsVectorizable)), _LostBits = DefaultLostFlagMask & ~( (_RowMajor ? 0 : RowMajorBit) | ((RowsAtCompileTime == Dynamic || ColsAtCompileTime == Dynamic) ? 0 : LargeBit)), Flags = ((unsigned int)(LhsFlags | RhsFlags) & _LostBits) -// | EvalBeforeAssigningBit //FIXME + #ifndef EIGEN_WIP_PRODUCT_DIRTY + | EvalBeforeAssigningBit //FIXME + #endif | EvalBeforeNestingBit | (_Vectorizable ? VectorizableBit : 0), CoeffReadCost @@ -197,11 +244,16 @@ template class Product : ei_no_assignm PacketSize = ei_packet_traits::size, #if (defined __i386__) // i386 architectures provides only 8 xmmm register, - // so let's reduce the max number of rows processed at once + // so let's reduce the max number of rows processed at once. + // NOTE that so far the maximal supported value is 8. MaxBlockRows = 4, + MaxBlockRows_ClampingMask = 0xFFFFFC, #else MaxBlockRows = 8, + MaxBlockRows_ClampingMask = 0xFFFFF8, #endif + // maximal size of the blocks fitted in L2 cache + MaxL2BlockSize = EIGEN_TUNE_FOR_L2_CACHE_SIZE / sizeof(Scalar) }; Product(const Lhs& lhs, const Rhs& rhs) @@ -214,16 +266,6 @@ template class Product : ei_no_assignm template void _cacheFriendlyEval(DestDerived& res) const; - /** \internal */ - template - void _cacheFriendlyEvalImpl(DestDerived& res) const __attribute__ ((noinline)); - - /** \internal */ - template - void _cacheFriendlyEvalKernel(DestDerived& res, - int l2i, int l2j, int l2k, int l1i, - int l2blockRowEnd, int l2blockColEnd, int l2blockSizeEnd, const Scalar* block) const EIGEN_DONT_INLINE; - private: int _rows() const { return m_lhs.rows(); } @@ -255,7 +297,7 @@ template class Product : ei_no_assignm if(Lhs::ColsAtCompileTime <= EIGEN_UNROLLING_LIMIT) { PacketScalar res; - ei_packet_product_unroller @@ -269,21 +311,30 @@ template class Product : ei_no_assignm PacketScalar _packetCoeffRowMajor(int row, int col) const { PacketScalar res; - res = ei_pmul(ei_pset1(m_lhs.coeff(row, 0)),m_rhs.packetCoeff(0, col)); + res = ei_pmul(ei_pset1(m_lhs.coeff(row, 0)),m_rhs.template packetCoeff(0, col)); for(int i = 1; i < m_lhs.cols(); i++) - res = ei_pmadd(ei_pset1(m_lhs.coeff(row, i)), m_rhs.packetCoeff(i, col), res); + res = ei_pmadd(ei_pset1(m_lhs.coeff(row, i)), m_rhs.template packetCoeff(i, col), res); return res; } PacketScalar _packetCoeffColumnMajor(int row, int col) const { PacketScalar res; - res = ei_pmul(m_lhs.packetCoeff(row, 0), ei_pset1(m_rhs.coeff(0, col))); + res = ei_pmul(m_lhs.template packetCoeff(row, 0), ei_pset1(m_rhs.coeff(0, col))); for(int i = 1; i < m_lhs.cols(); i++) - res = ei_pmadd(m_lhs.packetCoeff(row, i), ei_pset1(m_rhs.coeff(i, col)), res); + res = ei_pmadd(m_lhs.template packetCoeff(row, i), ei_pset1(m_rhs.coeff(i, col)), res); return res; } + /** \internal */ + template + void _cacheFriendlyEvalImpl(DestDerived& res) const __attribute__ ((noinline)); + + /** \internal */ + template + void _cacheFriendlyEvalKernel(DestDerived& res, + int l2i, int l2j, int l2k, int l1i, + int l2blockRowEnd, int l2blockColEnd, int l2blockSizeEnd, const Scalar* block) const EIGEN_DONT_INLINE; protected: const LhsNested m_lhs; @@ -361,7 +412,8 @@ void Product::_cacheFriendlyEvalKernel(DestDerived& res, int offsetblock = l2k * (l2blockRowEnd-l2i) + (l1i-l2i)*(l2blockSizeEnd-l2k) - l2k*BlockRows; const Scalar* localB = &block[offsetblock]; - int l1jsize = l1j * m_lhs.cols(); //TODO find a better way to optimize address computation ? +// int l1jsize = l1j * m_lhs.cols(); //TODO find a better way to optimize address computation ? + Scalar* rhsColumn = &(m_rhs.const_cast_derived().coeffRef(0, l1j)); // don't worry, dst is a set of registers PacketScalar dst[BlockRows]; @@ -383,7 +435,8 @@ void Product::_cacheFriendlyEvalKernel(DestDerived& res, // unaligned loads are expensive, therefore let's preload the next element in advance if (RhsAlignment==UnAligned) - tmp1 = ei_ploadu(&m_rhs.derived().data()[l1jsize+l2k]); + //tmp1 = ei_ploadu(&m_rhs.data()[l1jsize+l2k]); + tmp1 = ei_ploadu(&rhsColumn[l2k]); for(int k=l2k; k::_cacheFriendlyEvalKernel(DestDerived& res, //PacketScalar tmp = m_rhs.template packetCoeff(k, l1j); if (RhsAlignment==Aligned) { - tmp = ei_pload(&m_rhs.data()[l1jsize + k]); + //tmp = ei_pload(&m_rhs.data()[l1jsize + k]); + tmp = ei_pload(&rhsColumn[k]); } else { tmp = tmp1; if (k+PacketSize::_cacheFriendlyEvalKernel(DestDerived& res, // we have up to 4 packets (for doubles: 8 rows / 2) if (PacketRows>=1) res.template writePacketCoeff(l1i, l1j, - ei_padd(res.template packetCoeff(l1i, l1j), ei_predux(&(dst[0])))); + ei_padd(res.template packetCoeff(l1i, l1j), ei_preduxp(&(dst[0])))); if (PacketRows>=2) res.template writePacketCoeff(l1i+PacketSize, l1j, - ei_padd(res.template packetCoeff(l1i+PacketSize, l1j), ei_predux(&(dst[PacketSize])))); + ei_padd(res.template packetCoeff(l1i+PacketSize, l1j), ei_preduxp(&(dst[PacketSize])))); if (PacketRows>=3) res.template writePacketCoeff(l1i+2*PacketSize, l1j, - ei_padd(res.template packetCoeff(l1i+2*PacketSize, l1j), ei_predux(&(dst[2*PacketSize])))); + ei_padd(res.template packetCoeff(l1i+2*PacketSize, l1j), ei_preduxp(&(dst[2*PacketSize])))); if (PacketRows>=4) res.template writePacketCoeff(l1i+3*PacketSize, l1j, - ei_padd(res.template packetCoeff(l1i+3*PacketSize, l1j), ei_predux(&(dst[3*PacketSize])))); + ei_padd(res.template packetCoeff(l1i+3*PacketSize, l1j), ei_preduxp(&(dst[3*PacketSize])))); // process the remaining rows one at a time if (RemainingStart<=0 && BlockRows>=1) res.coeffRef(l1i+0, l1j) += ei_predux(dst[0]); @@ -450,38 +505,37 @@ template template void Product::_cacheFriendlyEvalImpl(DestDerived& res) const { - // allow direct access to data for benchmark purpose - const Scalar* __restrict__ a = m_lhs.derived().data(); - const Scalar* __restrict__ b = m_rhs.derived().data(); - Scalar* __restrict__ c = res.derived().data(); - // FIXME find a way to optimize: (an_xpr) + (a * b) // then we don't need to clear res and avoid and additional mat-mat sum -// res.setZero(); + #ifndef EIGEN_WIP_PRODUCT_DIRTY +// std::cout << "wip product\n"; + res.setZero(); + #endif - const int bs = PacketSize * MaxBlockRows; // total number of elements treated at once const int rows = _rows(); const int cols = _cols(); const int remainingSize = m_lhs.cols()%PacketSize; const int size = m_lhs.cols() - remainingSize; // third dimension of the product clamped to packet boundaries - const int l2blocksize = 256 > _cols() ? _cols() : 256; - // FIXME use calloca ?? (allocation on the stack) - Scalar* __restrict__ block = new Scalar[l2blocksize*size]; + const int l2BlockRows = MaxL2BlockSize > _rows() ? _rows() : MaxL2BlockSize; + const int l2BlockCols = MaxL2BlockSize > _cols() ? _cols() : MaxL2BlockSize; + const int l2BlockSize = MaxL2BlockSize > size ? size : MaxL2BlockSize; + //Scalar* __restrict__ block = new Scalar[l2blocksize*size];; + Scalar* __restrict__ block = (Scalar*)alloca(sizeof(Scalar)*l2BlockRows*size); // loops on each L2 cache friendly blocks of the result - for(int l2i=0; l2i<_rows(); l2i+=l2blocksize) + for(int l2i=0; l2i<_rows(); l2i+=l2BlockRows) { - const int l2blockRowEnd = std::min(l2i+l2blocksize, rows); - const int l2blockRowEndBW = l2blockRowEnd & 0xFFFFF8; // end of the rows aligned to bw - const int l2blockRowRemaining = l2blockRowEnd - l2blockRowEndBW; // number of remaining rows + const int l2blockRowEnd = std::min(l2i+l2BlockRows, rows); + const int l2blockRowEndBW = l2blockRowEnd & MaxBlockRows_ClampingMask; // end of the rows aligned to bw + const int l2blockRemainingRows = l2blockRowEnd - l2blockRowEndBW; // number of remaining rows // build a cache friendly block int count = 0; // copy l2blocksize rows of m_lhs to blocks of ps x bw - for(int l2k=0; l2k::_cacheFriendlyEvalImpl(DestDerived& res) const block[count++] = m_lhs.coeff(i+w,k+s); } } - if (l2blockRowRemaining>0) + if (l2blockRemainingRows>0) { for (int k=l2k; k::_cacheFriendlyEvalImpl(DestDerived& res) const #if 0 for(int l1j=l2j; l1j::_cacheFriendlyEvalImpl(DestDerived& res) const dst[1] = dst[0]; dst[2] = dst[0]; dst[3] = dst[0]; - if (bw==8) + if (MaxBlockRows==8) { dst[4] = dst[0]; dst[5] = dst[0]; @@ -543,7 +597,7 @@ void Product::_cacheFriendlyEvalImpl(DestDerived& res) const // TODO in unaligned mode, preload the next element // PacketScalar tmp1 = _mm_load_ps(&m_rhs.derived().data()[l1jsize+l2k]); asm("#eigen begincore"); - for(int k=l2k; k(k, l1j); // TODO make this branching compile time (costly for doubles) @@ -559,10 +613,10 @@ void Product::_cacheFriendlyEvalImpl(DestDerived& res) const dst[1] = ei_pmadd(tmp, b1, dst[1]); b1 = ei_pload(&(localB[k*bw+3*ps])); dst[2] = ei_pmadd(tmp, b0, dst[2]); - if (bw==8) + if (MaxBlockRows==8) b0 = ei_pload(&(localB[k*bw+4*ps])); dst[3] = ei_pmadd(tmp, b1, dst[3]); - if (bw==8) + if (MaxBlockRows==8) { b1 = ei_pload(&(localB[k*bw+5*ps])); dst[4] = ei_pmadd(tmp, b0, dst[4]); @@ -576,19 +630,20 @@ void Product::_cacheFriendlyEvalImpl(DestDerived& res) const // if (resIsAligned) { - res.template writePacketCoeff(l1i, l1j, ei_padd(res.template packetCoeff(l1i, l1j), ei_predux(dst))); - if (ps==2) - res.template writePacketCoeff(l1i+2,l1j, ei_padd(res.template packetCoeff(l1i+2,l1j), ei_predux(&(dst[2])))); - if (bw==8) + res.template writePacketCoeff(l1i, l1j, ei_padd(res.template packetCoeff(l1i, l1j), ei_preduxp(dst))); + if (PacketSize==2) + res.template writePacketCoeff(l1i+2,l1j, ei_padd(res.template packetCoeff(l1i+2,l1j), ei_preduxp(&(dst[2])))); + if (MaxBlockRows==8) { - res.template writePacketCoeff(l1i+4,l1j, ei_padd(res.template packetCoeff(l1i+4,l1j), ei_predux(&(dst[4])))); - if (ps==2) - res.template writePacketCoeff(l1i+6,l1j, ei_padd(res.template packetCoeff(l1i+6,l1j), ei_predux(&(dst[6])))); + res.template writePacketCoeff(l1i+4,l1j, ei_padd(res.template packetCoeff(l1i+4,l1j), ei_preduxp(&(dst[4])))); + if (PacketSize==2) + res.template writePacketCoeff(l1i+6,l1j, ei_padd(res.template packetCoeff(l1i+6,l1j), ei_preduxp(&(dst[6])))); } } // else // { // // TODO uncommenting this code kill the perf, even though it is never called !! +// // this is because dst cannot be a set of registers only // // TODO optimize this loop // // TODO is it better to do one redux at once or packet reduxes + unaligned store ? // for (int w = 0; w::_cacheFriendlyEvalImpl(DestDerived& res) const } #endif } - if (l2blockRowRemaining>0) + if (l2blockRemainingRows>0) { // this is an attempt to build an array of kernels, but I did not manage to get it compiles // typedef void (*Kernel)(DestDerived& , int, int, int, int, int, int, int, const Scalar*); // Kernel kernels[8]; // kernels[0] = (Kernel)(&Product::template _cacheFriendlyEvalKernel); -// kernels[l2blockRowRemaining](res, l2i, l2j, l2k, l2blockRowEndBW, l2blockRowEnd, l2blockColEnd, l2blockSizeEnd, block); +// kernels[l2blockRemainingRows](res, l2i, l2j, l2k, l2blockRowEndBW, l2blockRowEnd, l2blockColEnd, l2blockSizeEnd, block); - switch(l2blockRowRemaining) + switch(l2blockRemainingRows) { case 1:_cacheFriendlyEvalKernel( res, l2i, l2j, l2k, l2blockRowEndBW, l2blockRowEnd, l2blockColEnd, l2blockSizeEnd, block); break; @@ -627,43 +682,6 @@ void Product::_cacheFriendlyEvalImpl(DestDerived& res) const default: ei_internal_assert(false && "internal error"); break; } - -#if 0 - // TODO optimize this part using a generic templated function that processes N rows - // here we process the remaining l2blockRowRemaining rows - for(int l1j=l2j; l1j::_cacheFriendlyEvalImpl(DestDerived& res) const NormalProduct>( m_lhs.block(0,size, _rows(), remainingSize), m_rhs.block(size,0, remainingSize, _cols())).lazy(); +// res += m_lhs.block(0,size, _rows(), remainingSize)._lazyProduct(m_rhs.block(size,0, remainingSize, _cols())); } - delete[] block; +// delete[] block; } #endif // EIGEN_PRODUCT_H diff --git a/Eigen/src/Core/util/Constants.h b/Eigen/src/Core/util/Constants.h index c74d8f87b..48498bfae 100644 --- a/Eigen/src/Core/util/Constants.h +++ b/Eigen/src/Core/util/Constants.h @@ -43,16 +43,18 @@ const unsigned int NullDiagBit = 0x40; ///< means all diagonal coefficients const unsigned int UnitDiagBit = 0x80; ///< means all diagonal coefficients are equal to 1 const unsigned int NullLowerBit = 0x200; ///< means the strictly triangular lower part is 0 const unsigned int NullUpperBit = 0x400; ///< means the strictly triangular upper part is 0 +const unsigned int ReferencableBit = 0x800; ///< means the expression is writable through MatrixBase::coeffRef(int,int) enum { Upper=NullLowerBit, Lower=NullUpperBit }; enum { Aligned=0, UnAligned=1 }; // list of flags that are lost by default -const unsigned int DefaultLostFlagMask = ~(VectorizableBit | Like1DArrayBit | NullDiagBit | UnitDiagBit | NullLowerBit | NullUpperBit); +const unsigned int DefaultLostFlagMask = ~(VectorizableBit | Like1DArrayBit | ReferencableBit + | NullDiagBit | UnitDiagBit | NullLowerBit | NullUpperBit); enum { ConditionalJumpCost = 5 }; enum CornerType { TopLeft, TopRight, BottomLeft, BottomRight }; enum DirectionType { Vertical, Horizontal }; -enum ProductEvaluationMode { NormalProduct, CacheOptimalProduct }; +enum ProductEvaluationMode { NormalProduct, CacheOptimalProduct, LazyProduct}; #endif // EIGEN_CONSTANTS_H diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index 613ab9617..f703d159a 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -45,6 +45,13 @@ #define EIGEN_DEFAULT_MATRIX_FLAGS EIGEN_DEFAULT_MATRIX_STORAGE_ORDER +/** Define a hint size when dealling with large matrices and L2 cache friendlyness + * More precisely, its square value represents the amount of bytes which can be assumed to stay in L2 cache. + */ +#ifndef EIGEN_TUNE_FOR_L2_CACHE_SIZE +#define EIGEN_TUNE_FOR_L2_CACHE_SIZE 1024 +#endif + #define USING_PART_OF_NAMESPACE_EIGEN \ EIGEN_USING_MATRIX_TYPEDEFS \ using Eigen::Matrix; \