From 8be011e7765cf5d30cad86edd55850b0b1277741 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 26 Mar 2014 23:14:44 +0100 Subject: [PATCH 1/5] Remove remaining bits of the dead working buffer --- Eigen/src/Core/products/GeneralMatrixMatrix.h | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h index 6776cf5a8..19991fa3f 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -278,13 +278,11 @@ class gemm_blocking_space Traits; enum { SizeA = ActualRows * MaxDepth, - SizeB = ActualCols * MaxDepth, - SizeW = MaxDepth * Traits::WorkSpaceFactor + SizeB = ActualCols * MaxDepth }; EIGEN_ALIGN16 LhsScalar m_staticA[SizeA]; EIGEN_ALIGN16 RhsScalar m_staticB[SizeB]; - EIGEN_ALIGN16 RhsScalar m_staticW[SizeW]; public: @@ -295,12 +293,10 @@ class gemm_blocking_spacem_kc = MaxDepth; this->m_blockA = m_staticA; this->m_blockB = m_staticB; - this->m_blockW = m_staticW; } inline void allocateA() {} inline void allocateB() {} - inline void allocateW() {} inline void allocateAll() {} }; @@ -319,7 +315,6 @@ class gemm_blocking_space(this->m_kc, this->m_mc, this->m_nc); m_sizeA = this->m_mc * this->m_kc; m_sizeB = this->m_kc * this->m_nc; - m_sizeW = this->m_kc*Traits::WorkSpaceFactor; } void allocateA() @@ -347,24 +341,16 @@ class gemm_blocking_spacem_blockB = aligned_new(m_sizeB); } - void allocateW() - { - if(this->m_blockW==0) - this->m_blockW = aligned_new(m_sizeW); - } - void allocateAll() { allocateA(); allocateB(); - allocateW(); } ~gemm_blocking_space() { aligned_delete(this->m_blockA, m_sizeA); aligned_delete(this->m_blockB, m_sizeB); - aligned_delete(this->m_blockW, m_sizeW); } }; From f0a4c9d5abe94703d8b4e5ec49cba20df014d0ca Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 26 Mar 2014 23:22:36 +0100 Subject: [PATCH 2/5] Update gebp kernel to process a panle of 4 columns at once for the remaining ones. --- .../Core/products/GeneralBlockPanelKernel.h | 464 +++++++++++------- .../Core/products/SelfadjointMatrixMatrix.h | 153 +++--- 2 files changed, 380 insertions(+), 237 deletions(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 8a398d912..2b520383b 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -167,8 +167,6 @@ public: // register block size along the M direction (currently, this one cannot be modified) mr = LhsPacketSize, - WorkSpaceFactor = nr * RhsPacketSize, - LhsProgress = LhsPacketSize, RhsProgress = 1 }; @@ -242,7 +240,6 @@ public: NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, nr = NumberOfRegisters/2, mr = LhsPacketSize, - WorkSpaceFactor = nr*RhsPacketSize, LhsProgress = LhsPacketSize, RhsProgress = 1 @@ -327,7 +324,6 @@ public: // FIXME: should depend on NumberOfRegisters nr = 4, mr = ResPacketSize, - WorkSpaceFactor = Vectorizable ? 2*nr*RealPacketSize : nr, LhsProgress = ResPacketSize, RhsProgress = 1 @@ -459,7 +455,6 @@ public: // FIXME: should depend on NumberOfRegisters nr = 4, mr = ResPacketSize, - WorkSpaceFactor = nr*RhsPacketSize, LhsProgress = ResPacketSize, RhsProgress = 1 @@ -564,63 +559,45 @@ void gebp_kernel if(strideA==-1) strideA = depth; if(strideB==-1) strideB = depth; conj_helper cj; - Index packet_cols = (cols/nr) * nr; + Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0; + Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0; // Here we assume that mr==LhsProgress const Index peeled_mc = (rows/mr)*mr; const Index peeled_kc = (depth/4)*4; // loops on each micro vertical panel of rhs (depth x nr) - for(Index j2=0; j2=8) { - // loops on each largest micro horizontal panel of lhs (mr x depth) - // => we select a mr x nr micro block of res which is entirely - // stored into mr/packet_size x nr registers. - for(Index i=0; i we select a mr x nr micro block of res which is entirely + // stored into mr/packet_size x nr registers. + for(Index i=0; i EIGEN_GEBGP_ONESTEP8(1,A0,A1); EIGEN_GEBGP_ONESTEP8(2,A1,A0); EIGEN_GEBGP_ONESTEP8(3,A0,A1); - } - blB += 4*nr*RhsProgress; - blA += 4*mr; - } - // process remaining peeled loop - for(Index k=peeled_kc; k(alpha); @@ -718,60 +683,20 @@ void gebp_kernel pstoreu(r0+5*resStride, R5); pstoreu(r0+6*resStride, R6); pstoreu(r0+7*resStride, R0); - } - else // nr==4 - { - ResPacket R0, R1, R2; - ResPacket alphav = pset1(alpha); - - R0 = ploadu(r0+0*resStride); - R1 = ploadu(r0+1*resStride); - R2 = ploadu(r0+2*resStride); - traits.acc(C0, alphav, R0); - pstoreu(r0+0*resStride, R0); - R0 = ploadu(r0+3*resStride); - - traits.acc(C1, alphav, R1); - traits.acc(C2, alphav, R2); - traits.acc(C3, alphav, R0); - pstoreu(r0+1*resStride, R1); - pstoreu(r0+2*resStride, R2); - pstoreu(r0+3*resStride, R0); } - } - - for(Index i=peeled_mc; i B_1 = blB[7]; MADD(cj,A0,B_0,C6, B_0); MADD(cj,A0,B_1,C7, B_1); - } - blB += nr; + blB += 8; + } + res[(j2+0)*resStride + i] += alpha*C0; + res[(j2+1)*resStride + i] += alpha*C1; + res[(j2+2)*resStride + i] += alpha*C2; + res[(j2+3)*resStride + i] += alpha*C3; + res[(j2+4)*resStride + i] += alpha*C4; + res[(j2+5)*resStride + i] += alpha*C5; + res[(j2+6)*resStride + i] += alpha*C6; + res[(j2+7)*resStride + i] += alpha*C7; + } + } + } + + // Second pass using depth x 4 panels + // If nr==8, then we have at most one such panel + if(nr>=4) + { + for(Index j2=packet_cols8; j2 we select a mr x 4 micro block of res which is entirely + // stored into mr/packet_size x 4 registers. + for(Index i=0; i(alpha); + + R0 = ploadu(r0+0*resStride); + R1 = ploadu(r0+1*resStride); + R2 = ploadu(r0+2*resStride); + traits.acc(C0, alphav, R0); + pstoreu(r0+0*resStride, R0); + R0 = ploadu(r0+3*resStride); + + traits.acc(C1, alphav, R1); + traits.acc(C2, alphav, R2); + traits.acc(C3, alphav, R0); + + pstoreu(r0+1*resStride, R1); + pstoreu(r0+2*resStride, R2); + pstoreu(r0+3*resStride, R0); + } + + for(Index i=peeled_mc; i do the same but with nr==1 - for(Index j2=packet_cols; j2=depth && offset<=stride)); conj_if::IsComplex && Conjugate> cj; - Index packet_cols = (cols/nr) * nr; + Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0; + Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0; Index count = 0; - for(Index j2=0; j2=8) { - // skip what we have before - if(PanelMode) count += nr * offset; - const Scalar* b0 = &rhs[(j2+0)*rhsStride]; - const Scalar* b1 = &rhs[(j2+1)*rhsStride]; - const Scalar* b2 = &rhs[(j2+2)*rhsStride]; - const Scalar* b3 = &rhs[(j2+3)*rhsStride]; - const Scalar* b4 = &rhs[(j2+4)*rhsStride]; - const Scalar* b5 = &rhs[(j2+5)*rhsStride]; - const Scalar* b6 = &rhs[(j2+6)*rhsStride]; - const Scalar* b7 = &rhs[(j2+7)*rhsStride]; - for(Index k=0; k=4) blockB[count+2] = cj(b2[k]); - if(nr>=4) blockB[count+3] = cj(b3[k]); - if(nr>=8) blockB[count+4] = cj(b4[k]); - if(nr>=8) blockB[count+5] = cj(b5[k]); - if(nr>=8) blockB[count+6] = cj(b6[k]); - if(nr>=8) blockB[count+7] = cj(b7[k]); - count += nr; + // skip what we have before + if(PanelMode) count += 8 * offset; + const Scalar* b0 = &rhs[(j2+0)*rhsStride]; + const Scalar* b1 = &rhs[(j2+1)*rhsStride]; + const Scalar* b2 = &rhs[(j2+2)*rhsStride]; + const Scalar* b3 = &rhs[(j2+3)*rhsStride]; + const Scalar* b4 = &rhs[(j2+4)*rhsStride]; + const Scalar* b5 = &rhs[(j2+5)*rhsStride]; + const Scalar* b6 = &rhs[(j2+6)*rhsStride]; + const Scalar* b7 = &rhs[(j2+7)*rhsStride]; + for(Index k=0; k=4) + { + for(Index j2=packet_cols8; j2=depth && offset<=stride)); conj_if::IsComplex && Conjugate> cj; - Index packet_cols = (cols/nr) * nr; + Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0; + Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0; Index count = 0; - for(Index j2=0; j2=8) { - // skip what we have before - if(PanelMode) count += nr * offset; - for(Index k=0; k(&rhs[k*rhsStride + j2]); - pstoreu(blockB+count, cj.pconj(A)); - count += PacketSize; - } else { - const Scalar* b0 = &rhs[k*rhsStride + j2]; - blockB[count+0] = cj(b0[0]); - blockB[count+1] = cj(b0[1]); - if(nr>=4) blockB[count+2] = cj(b0[2]); - if(nr>=4) blockB[count+3] = cj(b0[3]); - if(nr>=8) blockB[count+4] = cj(b0[4]); - if(nr>=8) blockB[count+5] = cj(b0[5]); - if(nr>=8) blockB[count+6] = cj(b0[6]); - if(nr>=8) blockB[count+7] = cj(b0[7]); - count += nr; + // skip what we have before + if(PanelMode) count += 8 * offset; + for(Index k=0; k(&rhs[k*rhsStride + j2]); + pstoreu(blockB+count, cj.pconj(A)); + count += PacketSize; + } else { + const Scalar* b0 = &rhs[k*rhsStride + j2]; + blockB[count+0] = cj(b0[0]); + blockB[count+1] = cj(b0[1]); + blockB[count+2] = cj(b0[2]); + blockB[count+3] = cj(b0[3]); + blockB[count+4] = cj(b0[4]); + blockB[count+5] = cj(b0[5]); + blockB[count+6] = cj(b0[6]); + blockB[count+7] = cj(b0[7]); + count += 8; + } } + // skip what we have after + if(PanelMode) count += 8 * (stride-offset-depth); + } + } + if(nr>=4) + { + for(Index j2=packet_cols8; j2(&rhs[k*rhsStride + j2]); + pstoreu(blockB+count, cj.pconj(A)); + count += PacketSize; + } else { + const Scalar* b0 = &rhs[k*rhsStride + j2]; + blockB[count+0] = cj(b0[0]); + blockB[count+1] = cj(b0[1]); + blockB[count+2] = cj(b0[2]); + blockB[count+3] = cj(b0[3]); + count += 4; + } + } + // skip what we have after + if(PanelMode) count += 4 * (stride-offset-depth); } - // skip what we have after - if(PanelMode) count += nr * (stride-offset-depth); } // copy the remaining columns one at a time (nr==1) - for(Index j2=packet_cols; j2 rhs(_rhs,rhsStride); - Index packet_cols = (cols/nr)*nr; + Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0; + Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0; // first part: normal case for(Index j2=0; j2=8 ? (std::min)(k2+rows,packet_cols8) : k2; + if(nr>=8) { - // again we can split vertically in three different parts (transpose, symmetric, normal) - // transpose - for(Index k=k2; k=4) + // again we can split vertically in three different parts (transpose, symmetric, normal) + // transpose + for(Index k=k2; k=8) - { blockB[count+4] = numext::conj(rhs(j2+4,k)); blockB[count+5] = numext::conj(rhs(j2+5,k)); blockB[count+6] = numext::conj(rhs(j2+6,k)); blockB[count+7] = numext::conj(rhs(j2+7,k)); + count += 8; } - count += nr; - } - // symmetric - Index h = 0; - for(Index k=j2; k=4) + // symmetric + Index h = 0; + for(Index k=j2; k=8) - { blockB[count+4] = rhs(k,j2+4); blockB[count+5] = rhs(k,j2+5); blockB[count+6] = rhs(k,j2+6); blockB[count+7] = rhs(k,j2+7); + count += 8; + } + } + } + if(nr>=4) + { + for(Index j2=end8; j2<(std::min)(k2+rows,packet_cols4); j2+=4) + { + // again we can split vertically in three different parts (transpose, symmetric, normal) + // transpose + for(Index k=k2; k=8) { - for(Index k=k2; k=4) + for(Index k=k2; k=8) - { blockB[count+4] = numext::conj(rhs(j2+4,k)); blockB[count+5] = numext::conj(rhs(j2+5,k)); blockB[count+6] = numext::conj(rhs(j2+6,k)); blockB[count+7] = numext::conj(rhs(j2+7,k)); + count += 8; + } + } + } + if(nr>=4) + { + for(Index j2=(std::max)(packet_cols8,k2+rows); j2 the same with nr==1) - for(Index j2=packet_cols; j2 gebp_kernel; symm_pack_lhs pack_lhs; @@ -376,11 +420,10 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix(kc, mc, nc); - std::size_t sizeW = kc*Traits::WorkSpaceFactor; - std::size_t sizeB = sizeW + kc*cols; + std::size_t sizeB = kc*cols; ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0); ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0); - Scalar* blockB = allocatedBlockB + sizeW; + Scalar* blockB = allocatedBlockB; gebp_kernel gebp_kernel; gemm_pack_lhs pack_lhs; From ba3457cab2349dd03585435571ff1a8f90cf7403 Mon Sep 17 00:00:00 2001 From: Abhijit Kundu Date: Wed, 26 Mar 2014 22:02:48 -0400 Subject: [PATCH 3/5] Fixed compilation error due to obsolete internal::abs and internal::sqrt function calls --- demos/opengl/quaternion_demo.cpp | 2 +- demos/opengl/trackball.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/demos/opengl/quaternion_demo.cpp b/demos/opengl/quaternion_demo.cpp index 04165619b..dd323a4c9 100644 --- a/demos/opengl/quaternion_demo.cpp +++ b/demos/opengl/quaternion_demo.cpp @@ -234,7 +234,7 @@ void RenderingWidget::drawScene() gpu.drawVector(Vector3f::Zero(), length*Vector3f::UnitZ(), Color(0,0,1,1)); // draw the fractal object - float sqrt3 = internal::sqrt(3.); + float sqrt3 = std::sqrt(3.); glLightfv(GL_LIGHT0, GL_AMBIENT, Vector4f(0.5,0.5,0.5,1).data()); glLightfv(GL_LIGHT0, GL_DIFFUSE, Vector4f(0.5,1,0.5,1).data()); glLightfv(GL_LIGHT0, GL_SPECULAR, Vector4f(1,1,1,1).data()); diff --git a/demos/opengl/trackball.cpp b/demos/opengl/trackball.cpp index 77ac790c8..7c2da8e96 100644 --- a/demos/opengl/trackball.cpp +++ b/demos/opengl/trackball.cpp @@ -23,7 +23,7 @@ void Trackball::track(const Vector2i& point2D) { Vector3f axis = mLastPoint3D.cross(newPoint3D).normalized(); float cos_angle = mLastPoint3D.dot(newPoint3D); - if ( internal::abs(cos_angle) < 1.0 ) + if ( std::abs(cos_angle) < 1.0 ) { float angle = 2. * acos(cos_angle); if (mMode==Around) From 9ce0d785137c4034f19193c6a6781bbbb38e29fb Mon Sep 17 00:00:00 2001 From: Mark Borgerding Date: Wed, 26 Mar 2014 22:26:07 -0400 Subject: [PATCH 4/5] immintrin.h did not come until intel version 11 --- Eigen/Core | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/Core b/Eigen/Core index 468ae0c76..412409497 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -123,7 +123,7 @@ extern "C" { // In theory we should only include immintrin.h and not the other *mmintrin.h header files directly. // Doing so triggers some issues with ICC. However old gcc versions seems to not have this file, thus: - #ifdef __INTEL_COMPILER + #if defined(__INTEL_COMPILER) && __INTEL_COMPILER >= 1110 #include #else #include From fb03b56647e299428c03e3b1c3a01f3318e1af7e Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 27 Mar 2014 11:38:35 +0100 Subject: [PATCH 5/5] Fix warning --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 2b520383b..cba76edfe 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -594,8 +594,9 @@ void gebp_kernel // performs "inner" products const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr]; - LhsPacket A0, A1; + LhsPacket A0; // uncomment for register prefetching + // LhsPacket A1; // traits.loadLhs(blA, A0); for(Index k=0; k // performs "inner" products const RhsScalar* blB = &blockB[j2*strideB+offsetB*4]; - LhsPacket A0, A1; + LhsPacket A0; // uncomment for register prefetching + // LhsPacket A1; // traits.loadLhs(blA, A0); for(Index k=0; k