From e3f613cbd42cc629b08dd2e39ea192af1996e124 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 5 Dec 2016 13:02:01 +0100 Subject: [PATCH] Improve performance of row-major-dense-matrix * vector products for recent CPUs. This revised version does not bother about aligned loads/stores, and rather processes 8 rows at ones for better instruction pipelining. --- Eigen/src/Core/products/GeneralMatrixVector.h | 342 +++++++----------- 1 file changed, 125 insertions(+), 217 deletions(-) diff --git a/Eigen/src/Core/products/GeneralMatrixVector.h b/Eigen/src/Core/products/GeneralMatrixVector.h index 098b7c4e1..ce9fcaf03 100644 --- a/Eigen/src/Core/products/GeneralMatrixVector.h +++ b/Eigen/src/Core/products/GeneralMatrixVector.h @@ -242,253 +242,161 @@ EIGEN_DONT_INLINE static void run( template EIGEN_DONT_INLINE void general_matrix_vector_product::run( Index rows, Index cols, - const LhsMapper& lhs, + const LhsMapper& alhs, const RhsMapper& rhs, ResScalar* res, Index resIncr, ResScalar alpha) { + // The following copy tells the compiler that lhs's attributes are not modified outside this function + // This helps GCC to generate propoer code. + LhsMapper lhs(alhs); + eigen_internal_assert(rhs.stride()==1); - - #ifdef _EIGEN_ACCUMULATE_PACKETS - #error _EIGEN_ACCUMULATE_PACKETS has already been defined - #endif - - #define _EIGEN_ACCUMULATE_PACKETS(Alignment0,Alignment13,Alignment2) {\ - RhsPacket b = rhs.getVectorMapper(j, 0).template load(0); \ - ptmp0 = pcj.pmadd(lhs0.template load(j), b, ptmp0); \ - ptmp1 = pcj.pmadd(lhs1.template load(j), b, ptmp1); \ - ptmp2 = pcj.pmadd(lhs2.template load(j), b, ptmp2); \ - ptmp3 = pcj.pmadd(lhs3.template load(j), b, ptmp3); } - conj_helper cj; conj_helper pcj; - typedef typename LhsMapper::VectorMapper LhsScalars; + // TODO: fine tune the following heuristic. The rationale is that if the matrix is very large, + // processing 8 rows at once might be counter productive wrt cache. + const Index n8 = lhs.stride()*sizeof(LhsScalar)>32000 ? 0 : rows-7; + const Index n4 = rows-3; + const Index n2 = rows-1; - enum { AllAligned=0, EvenAligned=1, FirstAligned=2, NoneAligned=3 }; - const Index rowsAtOnce = 4; - const Index peels = 2; - const Index RhsPacketAlignedMask = RhsPacketSize-1; - const Index LhsPacketAlignedMask = LhsPacketSize-1; - const Index depth = cols; - const Index lhsStride = lhs.stride(); + // TODO: for padded aligned inputs, we could enable aligned reads + enum { LhsAlignment = Unaligned }; - // How many coeffs of the result do we have to skip to be aligned. - // Here we assume data are at least aligned on the base scalar type - // if that's not the case then vectorization is discarded, see below. - Index alignedStart = rhs.firstAligned(depth); - Index alignedSize = RhsPacketSize>1 ? alignedStart + ((depth-alignedStart) & ~RhsPacketAlignedMask) : 0; - const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1; - - const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0; - Index alignmentPattern = alignmentStep==0 ? AllAligned - : alignmentStep==(LhsPacketSize/2) ? EvenAligned - : FirstAligned; - - // we cannot assume the first element is aligned because of sub-matrices - const Index lhsAlignmentOffset = lhs.firstAligned(depth); - const Index rhsAlignmentOffset = rhs.firstAligned(rows); - - // find how many rows do we have to skip to be aligned with rhs (if possible) - Index skipRows = 0; - // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats) - if( (sizeof(LhsScalar)!=sizeof(RhsScalar)) || - (lhsAlignmentOffset < 0) || (lhsAlignmentOffset == depth) || - (rhsAlignmentOffset < 0) || (rhsAlignmentOffset == rows) ) + Index i=0; + for(; i 4) - { - // TODO: extend the code to support aligned loads whenever possible when LhsPacketSize > 4. - alignmentPattern = NoneAligned; - } - else if (LhsPacketSize>1) - { - // eigen_internal_assert(size_t(firstLhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || depth(ResScalar(0)), + c1 = pset1(ResScalar(0)), + c2 = pset1(ResScalar(0)), + c3 = pset1(ResScalar(0)), + c4 = pset1(ResScalar(0)), + c5 = pset1(ResScalar(0)), + c6 = pset1(ResScalar(0)), + c7 = pset1(ResScalar(0)); - while (skipRows(j,0); + + c0 = pcj.pmadd(lhs.template load(i+0,j),b0,c0); + c1 = pcj.pmadd(lhs.template load(i+1,j),b0,c1); + c2 = pcj.pmadd(lhs.template load(i+2,j),b0,c2); + c3 = pcj.pmadd(lhs.template load(i+3,j),b0,c3); + c4 = pcj.pmadd(lhs.template load(i+4,j),b0,c4); + c5 = pcj.pmadd(lhs.template load(i+5,j),b0,c5); + c6 = pcj.pmadd(lhs.template load(i+6,j),b0,c6); + c7 = pcj.pmadd(lhs.template load(i+7,j),b0,c7); } - else + ResScalar cc0 = predux(c0); + ResScalar cc1 = predux(c1); + ResScalar cc2 = predux(c2); + ResScalar cc3 = predux(c3); + ResScalar cc4 = predux(c4); + ResScalar cc5 = predux(c5); + ResScalar cc6 = predux(c6); + ResScalar cc7 = predux(c7); + for(; j= rows) - || LhsPacketSize > depth - || (size_t(firstLhs+alignedStart+lhsStride*skipRows)%sizeof(LhsPacket))==0);*/ + res[(i+0)*resIncr] += alpha*cc0; + res[(i+1)*resIncr] += alpha*cc1; + res[(i+2)*resIncr] += alpha*cc2; + res[(i+3)*resIncr] += alpha*cc3; + res[(i+4)*resIncr] += alpha*cc4; + res[(i+5)*resIncr] += alpha*cc5; + res[(i+6)*resIncr] += alpha*cc6; + res[(i+7)*resIncr] += alpha*cc7; } - else if(Vectorizable) + for(; i(ResScalar(0)), + c1 = pset1(ResScalar(0)), + c2 = pset1(ResScalar(0)), + c3 = pset1(ResScalar(0)); + + Index j=0; + for(; j+LhsPacketSize<=cols; j+=LhsPacketSize) + { + RhsPacket b0 = rhs.template load(j,0); + + c0 = pcj.pmadd(lhs.template load(i+0,j),b0,c0); + c1 = pcj.pmadd(lhs.template load(i+1,j),b0,c1); + c2 = pcj.pmadd(lhs.template load(i+2,j),b0,c2); + c3 = pcj.pmadd(lhs.template load(i+3,j),b0,c3); + } + ResScalar cc0 = predux(c0); + ResScalar cc1 = predux(c1); + ResScalar cc2 = predux(c2); + ResScalar cc3 = predux(c3); + for(; j(ResScalar(0)), + c1 = pset1(ResScalar(0)); - // this helps the compiler generating good binary code - const LhsScalars lhs0 = lhs.getVectorMapper(i+0, 0), lhs1 = lhs.getVectorMapper(i+offset1, 0), - lhs2 = lhs.getVectorMapper(i+2, 0), lhs3 = lhs.getVectorMapper(i+offset3, 0); - - if (Vectorizable) + Index j=0; + for(; j+LhsPacketSize<=cols; j+=LhsPacketSize) { - /* explicit vectorization */ - ResPacket ptmp0 = pset1(ResScalar(0)), ptmp1 = pset1(ResScalar(0)), - ptmp2 = pset1(ResScalar(0)), ptmp3 = pset1(ResScalar(0)); + RhsPacket b0 = rhs.template load(j,0); - // process initial unaligned coeffs - // FIXME this loop get vectorized by the compiler ! - for (Index j=0; jalignedStart) - { - switch(alignmentPattern) - { - case AllAligned: - for (Index j = alignedStart; j1) - { - /* Here we proccess 4 rows with with two peeled iterations to hide - * the overhead of unaligned loads. Moreover unaligned loads are handled - * using special shift/move operations between the two aligned packets - * overlaping the desired unaligned packet. This is *much* more efficient - * than basic unaligned loads. - */ - LhsPacket A01, A02, A03, A11, A12, A13; - A01 = lhs1.template load(alignedStart-1); - A02 = lhs2.template load(alignedStart-2); - A03 = lhs3.template load(alignedStart-3); - - for (; j(0); - A11 = lhs1.template load(j-1+LhsPacketSize); palign<1>(A01,A11); - A12 = lhs2.template load(j-2+LhsPacketSize); palign<2>(A02,A12); - A13 = lhs3.template load(j-3+LhsPacketSize); palign<3>(A03,A13); - - ptmp0 = pcj.pmadd(lhs0.template load(j), b, ptmp0); - ptmp1 = pcj.pmadd(A01, b, ptmp1); - A01 = lhs1.template load(j-1+2*LhsPacketSize); palign<1>(A11,A01); - ptmp2 = pcj.pmadd(A02, b, ptmp2); - A02 = lhs2.template load(j-2+2*LhsPacketSize); palign<2>(A12,A02); - ptmp3 = pcj.pmadd(A03, b, ptmp3); - A03 = lhs3.template load(j-3+2*LhsPacketSize); palign<3>(A13,A03); - - b = rhs.getVectorMapper(j+RhsPacketSize, 0).template load(0); - ptmp0 = pcj.pmadd(lhs0.template load(j+LhsPacketSize), b, ptmp0); - ptmp1 = pcj.pmadd(A11, b, ptmp1); - ptmp2 = pcj.pmadd(A12, b, ptmp2); - ptmp3 = pcj.pmadd(A13, b, ptmp3); - } - } - for (; j(i+0,j),b0,c0); + c1 = pcj.pmadd(lhs.template load(i+1,j),b0,c1); } - res[i*resIncr] += alpha*tmp0; - res[(i+offset1)*resIncr] += alpha*tmp1; - res[(i+2)*resIncr] += alpha*tmp2; - res[(i+offset3)*resIncr] += alpha*tmp3; + ResScalar cc0 = predux(c0); + ResScalar cc1 = predux(c1); + for(; j(ResScalar(0)); + Index j=0; + for(; j+LhsPacketSize<=cols; j+=LhsPacketSize) { - EIGEN_ALIGN_MAX ResScalar tmp0 = ResScalar(0); - ResPacket ptmp0 = pset1(tmp0); - const LhsScalars lhs0 = lhs.getVectorMapper(i, 0); - // process first unaligned result's coeffs - // FIXME this loop get vectorized by the compiler ! - for (Index j=0; jalignedStart) - { - // process aligned rhs coeffs - if (lhs0.template aligned(alignedStart)) - for (Index j = alignedStart;j(j), rhs.getVectorMapper(j, 0).template load(0), ptmp0); - else - for (Index j = alignedStart;j(j), rhs.getVectorMapper(j, 0).template load(0), ptmp0); - tmp0 += predux(ptmp0); - } - - // process remaining scalars - // FIXME this loop get vectorized by the compiler ! - for (Index j=alignedSize; j(j,0); + c0 = pcj.pmadd(lhs.template load(i,j),b0,c0); } - if (skipRows) + ResScalar cc0 = predux(c0); + for(; j