diff --git a/Eigen/Core b/Eigen/Core index b99f49a01..1e38ecf69 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -145,7 +145,9 @@ namespace Eigen { #include "src/Core/Swap.h" #include "src/Core/CommaInitializer.h" #include "src/Core/Part.h" -#include "src/Core/CacheFriendlyProduct.h" +#include "src/Core/products/GeneralMatrixMatrix.h" +#include "src/Core/products/GeneralMatrixVector.h" +#include "src/Core/products/SelfadjointMatrixVector.h" } // namespace Eigen diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h new file mode 100644 index 000000000..1041688b4 --- /dev/null +++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -0,0 +1,409 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. Eigen itself is part of the KDE project. +// +// Copyright (C) 2008 Gael Guennebaud +// +// Eigen is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 3 of the License, or (at your option) any later version. +// +// Alternatively, you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of +// the License, or (at your option) any later version. +// +// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License and a copy of the GNU General Public License along with +// Eigen. If not, see . + +#ifndef EIGEN_GENERAL_MATRIX_MATRIX_H +#define EIGEN_GENERAL_MATRIX_MATRIX_H + +template +struct ei_L2_block_traits { + enum {width = 8 * ei_meta_sqrt::ret }; +}; + +#ifndef EIGEN_EXTERN_INSTANTIATIONS + +template +static void ei_cache_friendly_product( + int _rows, int _cols, int depth, + bool _lhsRowMajor, const Scalar* _lhs, int _lhsStride, + bool _rhsRowMajor, const Scalar* _rhs, int _rhsStride, + bool resRowMajor, Scalar* res, int resStride) +{ + const Scalar* EIGEN_RESTRICT lhs; + const Scalar* EIGEN_RESTRICT rhs; + int lhsStride, rhsStride, rows, cols; + bool lhsRowMajor; + + if (resRowMajor) + { + lhs = _rhs; + rhs = _lhs; + lhsStride = _rhsStride; + rhsStride = _lhsStride; + cols = _rows; + rows = _cols; + lhsRowMajor = !_rhsRowMajor; + ei_assert(_lhsRowMajor); + } + else + { + lhs = _lhs; + rhs = _rhs; + lhsStride = _lhsStride; + rhsStride = _rhsStride; + rows = _rows; + cols = _cols; + lhsRowMajor = _lhsRowMajor; + ei_assert(!_rhsRowMajor); + } + + typedef typename ei_packet_traits::type PacketType; + + enum { + PacketSize = sizeof(PacketType)/sizeof(Scalar), + #if (defined __i386__) + // i386 architecture provides only 8 xmm registers, + // so let's reduce the max number of rows processed at once. + MaxBlockRows = 4, + MaxBlockRows_ClampingMask = 0xFFFFFC, + #else + MaxBlockRows = 8, + MaxBlockRows_ClampingMask = 0xFFFFF8, + #endif + // maximal size of the blocks fitted in L2 cache + MaxL2BlockSize = ei_L2_block_traits::width + }; + + const bool resIsAligned = (PacketSize==1) || (((resStride%PacketSize) == 0) && (size_t(res)%16==0)); + + const int remainingSize = depth % PacketSize; + const int size = depth - remainingSize; // third dimension of the product clamped to packet boundaries + const int l2BlockRows = MaxL2BlockSize > rows ? rows : MaxL2BlockSize; + const int l2BlockCols = MaxL2BlockSize > cols ? cols : MaxL2BlockSize; + const int l2BlockSize = MaxL2BlockSize > size ? size : MaxL2BlockSize; + const int l2BlockSizeAligned = (1 + std::max(l2BlockSize,l2BlockCols)/PacketSize)*PacketSize; + const bool needRhsCopy = (PacketSize>1) && ((rhsStride%PacketSize!=0) || (size_t(rhs)%16!=0)); + Scalar* EIGEN_RESTRICT block = 0; + const int allocBlockSize = l2BlockRows*size; + block = ei_aligned_stack_new(Scalar, allocBlockSize); + Scalar* EIGEN_RESTRICT rhsCopy + = ei_aligned_stack_new(Scalar, l2BlockSizeAligned*l2BlockSizeAligned); + +#ifndef EIGEN_USE_NEW_PRODUCT + // loops on each L2 cache friendly blocks of the result + for(int l2i=0; l2i0) + { + for (int k=l2k; k1 && resIsAligned) + { + // the result is aligned: let's do packet reduction + ei_pstore(&(localRes[0]), ei_padd(ei_pload(&(localRes[0])), ei_preduxp(&dst[0]))); + if (PacketSize==2) + ei_pstore(&(localRes[2]), ei_padd(ei_pload(&(localRes[2])), ei_preduxp(&(dst[2])))); + if (MaxBlockRows==8) + { + ei_pstore(&(localRes[4]), ei_padd(ei_pload(&(localRes[4])), ei_preduxp(&(dst[4])))); + if (PacketSize==2) + ei_pstore(&(localRes[6]), ei_padd(ei_pload(&(localRes[6])), ei_preduxp(&(dst[6])))); + } + } + else + { + // not aligned => per coeff packet reduction + localRes[0] += ei_predux(dst[0]); + localRes[1] += ei_predux(dst[1]); + localRes[2] += ei_predux(dst[2]); + localRes[3] += ei_predux(dst[3]); + if (MaxBlockRows==8) + { + localRes[4] += ei_predux(dst[4]); + localRes[5] += ei_predux(dst[5]); + localRes[6] += ei_predux(dst[6]); + localRes[7] += ei_predux(dst[7]); + } + } + } + } + if (l2blockRemainingRows>0) + { + int offsetblock = l2k * (l2blockRowEnd-l2i) + (l2blockRowEndBW-l2i)*(l2blockSizeEnd-l2k) - l2k*l2blockRemainingRows; + const Scalar* localB = &block[offsetblock]; + + for(int l1j=l2j; l1j=2) dst[1] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+ PacketSize])), dst[1]); + if (l2blockRemainingRows>=3) dst[2] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+2*PacketSize])), dst[2]); + if (l2blockRemainingRows>=4) dst[3] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+3*PacketSize])), dst[3]); + if (MaxBlockRows==8) + { + if (l2blockRemainingRows>=5) dst[4] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+4*PacketSize])), dst[4]); + if (l2blockRemainingRows>=6) dst[5] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+5*PacketSize])), dst[5]); + if (l2blockRemainingRows>=7) dst[6] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+6*PacketSize])), dst[6]); + if (l2blockRemainingRows>=8) dst[7] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+7*PacketSize])), dst[7]); + } + } + + Scalar* EIGEN_RESTRICT localRes = &(res[l2blockRowEndBW + l1j*resStride]); + + // process the remaining rows once at a time + localRes[0] += ei_predux(dst[0]); + if (l2blockRemainingRows>=2) localRes[1] += ei_predux(dst[1]); + if (l2blockRemainingRows>=3) localRes[2] += ei_predux(dst[2]); + if (l2blockRemainingRows>=4) localRes[3] += ei_predux(dst[3]); + if (MaxBlockRows==8) + { + if (l2blockRemainingRows>=5) localRes[4] += ei_predux(dst[4]); + if (l2blockRemainingRows>=6) localRes[5] += ei_predux(dst[5]); + if (l2blockRemainingRows>=7) localRes[6] += ei_predux(dst[6]); + if (l2blockRemainingRows>=8) localRes[7] += ei_predux(dst[7]); + } + + } + } + } + } + } + if (PacketSize>1 && remainingSize) + { + if (lhsRowMajor) + { + for (int j=0; j. -#ifndef EIGEN_CACHE_FRIENDLY_PRODUCT_H -#define EIGEN_CACHE_FRIENDLY_PRODUCT_H - -template -struct ei_L2_block_traits { - enum {width = 8 * ei_meta_sqrt::ret }; -}; - -#ifndef EIGEN_EXTERN_INSTANTIATIONS - -template -static void ei_cache_friendly_product( - int _rows, int _cols, int depth, - bool _lhsRowMajor, const Scalar* _lhs, int _lhsStride, - bool _rhsRowMajor, const Scalar* _rhs, int _rhsStride, - bool resRowMajor, Scalar* res, int resStride) -{ - const Scalar* EIGEN_RESTRICT lhs; - const Scalar* EIGEN_RESTRICT rhs; - int lhsStride, rhsStride, rows, cols; - bool lhsRowMajor; - - if (resRowMajor) - { - lhs = _rhs; - rhs = _lhs; - lhsStride = _rhsStride; - rhsStride = _lhsStride; - cols = _rows; - rows = _cols; - lhsRowMajor = !_rhsRowMajor; - ei_assert(_lhsRowMajor); - } - else - { - lhs = _lhs; - rhs = _rhs; - lhsStride = _lhsStride; - rhsStride = _rhsStride; - rows = _rows; - cols = _cols; - lhsRowMajor = _lhsRowMajor; - ei_assert(!_rhsRowMajor); - } - - typedef typename ei_packet_traits::type PacketType; - - enum { - PacketSize = sizeof(PacketType)/sizeof(Scalar), - #if (defined __i386__) - // i386 architecture provides only 8 xmm registers, - // so let's reduce the max number of rows processed at once. - MaxBlockRows = 4, - MaxBlockRows_ClampingMask = 0xFFFFFC, - #else - MaxBlockRows = 8, - MaxBlockRows_ClampingMask = 0xFFFFF8, - #endif - // maximal size of the blocks fitted in L2 cache - MaxL2BlockSize = ei_L2_block_traits::width - }; - - const bool resIsAligned = (PacketSize==1) || (((resStride%PacketSize) == 0) && (size_t(res)%16==0)); - - const int remainingSize = depth % PacketSize; - const int size = depth - remainingSize; // third dimension of the product clamped to packet boundaries - const int l2BlockRows = MaxL2BlockSize > rows ? rows : MaxL2BlockSize; - const int l2BlockCols = MaxL2BlockSize > cols ? cols : MaxL2BlockSize; - const int l2BlockSize = MaxL2BlockSize > size ? size : MaxL2BlockSize; - const int l2BlockSizeAligned = (1 + std::max(l2BlockSize,l2BlockCols)/PacketSize)*PacketSize; - const bool needRhsCopy = (PacketSize>1) && ((rhsStride%PacketSize!=0) || (size_t(rhs)%16!=0)); - Scalar* EIGEN_RESTRICT block = 0; - const int allocBlockSize = l2BlockRows*size; - block = ei_aligned_stack_new(Scalar, allocBlockSize); - Scalar* EIGEN_RESTRICT rhsCopy - = ei_aligned_stack_new(Scalar, l2BlockSizeAligned*l2BlockSizeAligned); - -#ifndef EIGEN_USE_NEW_PRODUCT - // loops on each L2 cache friendly blocks of the result - for(int l2i=0; l2i0) - { - for (int k=l2k; k1 && resIsAligned) - { - // the result is aligned: let's do packet reduction - ei_pstore(&(localRes[0]), ei_padd(ei_pload(&(localRes[0])), ei_preduxp(&dst[0]))); - if (PacketSize==2) - ei_pstore(&(localRes[2]), ei_padd(ei_pload(&(localRes[2])), ei_preduxp(&(dst[2])))); - if (MaxBlockRows==8) - { - ei_pstore(&(localRes[4]), ei_padd(ei_pload(&(localRes[4])), ei_preduxp(&(dst[4])))); - if (PacketSize==2) - ei_pstore(&(localRes[6]), ei_padd(ei_pload(&(localRes[6])), ei_preduxp(&(dst[6])))); - } - } - else - { - // not aligned => per coeff packet reduction - localRes[0] += ei_predux(dst[0]); - localRes[1] += ei_predux(dst[1]); - localRes[2] += ei_predux(dst[2]); - localRes[3] += ei_predux(dst[3]); - if (MaxBlockRows==8) - { - localRes[4] += ei_predux(dst[4]); - localRes[5] += ei_predux(dst[5]); - localRes[6] += ei_predux(dst[6]); - localRes[7] += ei_predux(dst[7]); - } - } - } - } - if (l2blockRemainingRows>0) - { - int offsetblock = l2k * (l2blockRowEnd-l2i) + (l2blockRowEndBW-l2i)*(l2blockSizeEnd-l2k) - l2k*l2blockRemainingRows; - const Scalar* localB = &block[offsetblock]; - - for(int l1j=l2j; l1j=2) dst[1] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+ PacketSize])), dst[1]); - if (l2blockRemainingRows>=3) dst[2] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+2*PacketSize])), dst[2]); - if (l2blockRemainingRows>=4) dst[3] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+3*PacketSize])), dst[3]); - if (MaxBlockRows==8) - { - if (l2blockRemainingRows>=5) dst[4] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+4*PacketSize])), dst[4]); - if (l2blockRemainingRows>=6) dst[5] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+5*PacketSize])), dst[5]); - if (l2blockRemainingRows>=7) dst[6] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+6*PacketSize])), dst[6]); - if (l2blockRemainingRows>=8) dst[7] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+7*PacketSize])), dst[7]); - } - } - - Scalar* EIGEN_RESTRICT localRes = &(res[l2blockRowEndBW + l1j*resStride]); - - // process the remaining rows once at a time - localRes[0] += ei_predux(dst[0]); - if (l2blockRemainingRows>=2) localRes[1] += ei_predux(dst[1]); - if (l2blockRemainingRows>=3) localRes[2] += ei_predux(dst[2]); - if (l2blockRemainingRows>=4) localRes[3] += ei_predux(dst[3]); - if (MaxBlockRows==8) - { - if (l2blockRemainingRows>=5) localRes[4] += ei_predux(dst[4]); - if (l2blockRemainingRows>=6) localRes[5] += ei_predux(dst[5]); - if (l2blockRemainingRows>=7) localRes[6] += ei_predux(dst[6]); - if (l2blockRemainingRows>=8) localRes[7] += ei_predux(dst[7]); - } - - } - } - } - } - } - if (PacketSize>1 && remainingSize) - { - if (lhsRowMajor) - { - for (int j=0; j +// +// Eigen is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 3 of the License, or (at your option) any later version. +// +// Alternatively, you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of +// the License, or (at your option) any later version. +// +// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License and a copy of the GNU General Public License along with +// Eigen. If not, see . + +#ifndef EIGEN_SELFADJOINT_MATRIX_VECTOR_H +#define EIGEN_SELFADJOINT_MATRIX_VECTOR_H + +template struct ei_conj_if { + template Scalar operator() (const Scalar& x) const { return ei_conj(x); } +}; + +template<> struct ei_conj_if { + template Scalar& operator() (Scalar& x) const { return x; } +}; + +/* Optimized col-major selfadjoint matrix * vector product: + * This algorithm processes 2 columns at onces that allows to both reduce + * the number of load/stores of the result by a factor 2 and to reduce + * the instruction dependency. + */ +template +static EIGEN_DONT_INLINE void ei_product_selfadjoint_vector( + int size, + const Scalar* lhs, int lhsStride, + const Scalar* rhs, //int rhsIncr, + Scalar* res) +{ + typedef typename ei_packet_traits::type Packet; + const int PacketSize = sizeof(Packet)/sizeof(Scalar); + + enum { + IsRowMajor = StorageOrder==RowMajorBit ? 1 : 0, + IsLower = UpLo == LowerTriangularBit ? 1 : 0, + FirstTriangular = IsRowMajor == IsLower + }; + + ei_conj_if::IsComplex && IsRowMajor> conj0; + ei_conj_if::IsComplex && !IsRowMajor> conj1; + + for (int i=0;i +// +// Eigen is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 3 of the License, or (at your option) any later version. +// +// Alternatively, you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of +// the License, or (at your option) any later version. +// +// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License and a copy of the GNU General Public License along with +// Eigen. If not, see . + +#include "main.h" + +template void product_selfadjoint(const MatrixType& m) +{ + typedef typename MatrixType::Scalar Scalar; + typedef typename NumTraits::Real RealScalar; + typedef Matrix VectorType; + + int rows = m.rows(); + int cols = m.cols(); + + MatrixType m1 = MatrixType::Random(rows, cols), + m2 = MatrixType::Random(rows, cols); + VectorType v1 = VectorType::Random(rows), + v2 = VectorType::Random(rows); + + m1 = m1.adjoint()*m1; + + // col-lower + m2.setZero(); + m2.template part() = m1; + ei_product_selfadjoint_vector + (cols,m2.data(),cols, v1.data(), v2.data()); + VERIFY_IS_APPROX(v2, m1 * v1); + + // col-upper + m2.setZero(); + m2.template part() = m1; + ei_product_selfadjoint_vector(cols,m2.data(),cols, v1.data(), v2.data()); + VERIFY_IS_APPROX(v2, m1 * v1); + +} + +void test_product_selfadjoint() +{ + for(int i = 0; i < g_repeat ; i++) { + CALL_SUBTEST( product_selfadjoint(Matrix()) ); + CALL_SUBTEST( product_selfadjoint(Matrix()) ); + CALL_SUBTEST( product_selfadjoint(Matrix3d()) ); + CALL_SUBTEST( product_selfadjoint(MatrixXcf(4, 4)) ); + CALL_SUBTEST( product_selfadjoint(MatrixXcd(21,21)) ); + CALL_SUBTEST( product_selfadjoint(MatrixXd(17,17)) ); + CALL_SUBTEST( product_selfadjoint(Matrix(18,18)) ); + CALL_SUBTEST( product_selfadjoint(Matrix,Dynamic,Dynamic,RowMajor>(19, 19)) ); + } +}