diff --git a/Eigen/Householder b/Eigen/Householder index 5070e070e..719edaffe 100644 --- a/Eigen/Householder +++ b/Eigen/Householder @@ -22,8 +22,8 @@ // IWYU pragma: begin_exports #include "src/Householder/Householder.h" -#include "src/Householder/HouseholderSequence.h" #include "src/Householder/BlockHouseholder.h" +#include "src/Householder/HouseholderSequence.h" // IWYU pragma: end_exports #include "src/Core/util/ReenableStupidWarnings.h" diff --git a/Eigen/src/Householder/BlockHouseholder.h b/Eigen/src/Householder/BlockHouseholder.h index 6a4bc675a..faa5594a0 100644 --- a/Eigen/src/Householder/BlockHouseholder.h +++ b/Eigen/src/Householder/BlockHouseholder.h @@ -36,14 +36,8 @@ void make_block_householder_triangular_factor(TriangularFactorType& triFactor, c triFactor.row(i).tail(rt).noalias() = -hCoeffs(i) * vectors.col(i).tail(rs).adjoint() * vectors.bottomRightCorner(rs, rt).template triangularView(); - // FIXME: use the following line with .noalias() once triangular product supports in-place operation. - // triFactor.row(i).tail(rt) = triFactor.row(i).tail(rt) * triFactor.bottomRightCorner(rt,rt).template - // triangularView(); - for (Index j = nbVecs - 1; j > i; --j) { - typename TriangularFactorType::Scalar z = triFactor(i, j); - triFactor(i, j) = z * triFactor(j, j); - if (nbVecs - j - 1 > 0) triFactor.row(i).tail(nbVecs - j - 1) += z * triFactor.row(j).tail(nbVecs - j - 1); - } + triFactor.row(i).tail(rt) = + (triFactor.row(i).tail(rt) * triFactor.bottomRightCorner(rt, rt).template triangularView()).eval(); } triFactor(i, i) = hCoeffs(i); } @@ -71,14 +65,43 @@ void apply_block_householder_on_the_left(MatrixType& mat, const VectorsType& vec (VectorsType::MaxColsAtCompileTime == 1 && MatrixType::MaxColsAtCompileTime != 1) ? RowMajor : ColMajor, VectorsType::MaxColsAtCompileTime, MatrixType::MaxColsAtCompileTime> tmp = V.adjoint() * mat; - // FIXME: add .noalias() once triangular product supports in-place operation. if (forward) - tmp = T.template triangularView() * tmp; + tmp = (T.template triangularView() * tmp).eval(); else - tmp = T.template triangularView().adjoint() * tmp; + tmp = (T.template triangularView().adjoint() * tmp).eval(); mat.noalias() -= V * tmp; } +/** \internal + * if forward then perform mat = mat * H0 * H1 * H2 + * otherwise perform mat = mat * H2 * H1 * H0 + */ +template +void apply_block_householder_on_the_right(MatrixType& mat, const VectorsType& vectors, const CoeffsType& hCoeffs, + bool forward) { + enum { TFactorSize = VectorsType::ColsAtCompileTime }; + Index nbVecs = vectors.cols(); + Matrix T(nbVecs, nbVecs); + + if (forward) + make_block_householder_triangular_factor(T, vectors, hCoeffs); + else + make_block_householder_triangular_factor(T, vectors, hCoeffs.conjugate()); + const TriangularView V(vectors); + + // A -= (A * V) * T * V^* (forward) + // A -= (A * V) * T^* * V^* (backward) + Matrix + tmp = mat * V; + if (forward) + tmp = (tmp * T.template triangularView()).eval(); + else + tmp = (tmp * T.template triangularView().adjoint()).eval(); + mat.noalias() -= tmp * V.adjoint(); +} + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Householder/HouseholderSequence.h b/Eigen/src/Householder/HouseholderSequence.h index d49c96156..d7eb47712 100644 --- a/Eigen/src/Householder/HouseholderSequence.h +++ b/Eigen/src/Householder/HouseholderSequence.h @@ -281,10 +281,7 @@ class HouseholderSequence : public EigenBase BlockSize) { dst.setIdentity(rows(), rows()); - if (m_reverse) - applyThisOnTheLeft(dst, workspace, true); - else - applyThisOnTheLeft(dst, workspace, true); + applyThisOnTheLeft(dst, workspace, true); } else { dst.setIdentity(rows(), rows()); for (Index k = vecs - 1; k >= 0; --k) { @@ -309,14 +306,49 @@ class HouseholderSequence : public EigenBase inline void applyThisOnTheRight(Dest& dst, Workspace& workspace) const { - workspace.resize(dst.rows()); - for (Index k = 0; k < m_length; ++k) { - Index actual_k = m_reverse ? m_length - k - 1 : k; - dst.rightCols(rows() - m_shift - actual_k) - .applyHouseholderOnTheRight(essentialVector(actual_k), m_coeffs.coeff(actual_k), workspace.data()); + // Use the block path when the reflectors are long enough for GEMM to outperform GEMV. + // The threshold on rows() (the reflector length) is higher than for the left-side path because + // the right-side block application has more overhead from the tmp = mat * V product layout. + if (m_length >= BlockSize && rows() - m_shift >= 4 * BlockSize) { + applyBlockOnTheRight(dst); + } else { + workspace.resize(dst.rows()); + for (Index k = 0; k < m_length; ++k) { + Index actual_k = m_reverse ? m_length - k - 1 : k; + dst.rightCols(rows() - m_shift - actual_k) + .applyHouseholderOnTheRight(essentialVector(actual_k), m_coeffs.coeff(actual_k), workspace.data()); + } } } + private: + /** \internal Block Householder application on the right, kept out-of-line + * to avoid template bloat pessimizing the scalar path above. */ + template + EIGEN_DONT_INLINE void applyBlockOnTheRight(Dest& dst) const { + // Make sure we have at least 2 useful blocks, otherwise it is point-less: + Index blockSize = m_length < Index(2 * BlockSize) ? (m_length + 1) / 2 : Index(BlockSize); + for (Index i = 0; i < m_length; i += blockSize) { + // Right-side application processes blocks in opposite order to left-side: + // forward (m_reverse=false): first block first; reversed: last block first. + Index end = m_reverse ? m_length - i : (std::min)(m_length, i + blockSize); + Index k = m_reverse ? (std::max)(Index(0), end - blockSize) : i; + Index bs = end - k; + Index start = k + m_shift; + + typedef Block, Dynamic, Dynamic> SubVectorsType; + SubVectorsType sub_vecs1(m_vectors.const_cast_derived(), Side == OnTheRight ? k : start, + Side == OnTheRight ? start : k, Side == OnTheRight ? bs : m_vectors.rows() - start, + Side == OnTheRight ? m_vectors.cols() - start : bs); + std::conditional_t, SubVectorsType&> sub_vecs(sub_vecs1); + + Index dstCols = rows() - m_shift - k; + auto sub_dst = dst.rightCols(dstCols); + internal::apply_block_householder_on_the_right(sub_dst, sub_vecs, m_coeffs.segment(k, bs), !m_reverse); + } + } + + public: /** \internal */ template inline void applyThisOnTheLeft(Dest& dst, bool inputIsIdentity = false) const { diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index 7e1b349b1..c414e66b3 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -35,5 +35,6 @@ add_subdirectory(Eigenvalues) add_subdirectory(Geometry) add_subdirectory(Sparse) add_subdirectory(FFT) +add_subdirectory(Householder) add_subdirectory(Solvers) add_subdirectory(Tuning) diff --git a/benchmarks/Householder/CMakeLists.txt b/benchmarks/Householder/CMakeLists.txt new file mode 100644 index 000000000..57c317878 --- /dev/null +++ b/benchmarks/Householder/CMakeLists.txt @@ -0,0 +1 @@ +eigen_add_benchmark(bench_householder bench_householder.cpp) diff --git a/benchmarks/Householder/bench_householder.cpp b/benchmarks/Householder/bench_householder.cpp new file mode 100644 index 000000000..61d39eddd --- /dev/null +++ b/benchmarks/Householder/bench_householder.cpp @@ -0,0 +1,370 @@ +// Benchmarks for Householder reflections. +// +// Tests makeHouseholder, makeHouseholderInPlace, applyHouseholderOnTheLeft, +// applyHouseholderOnTheRight, HouseholderSequence evaluation, and block +// Householder operations. + +#include +#include +#include + +using namespace Eigen; + +// ============================================================================= +// makeHouseholderInPlace: compute Householder reflector in-place. +// ============================================================================= + +template +static void BM_MakeHouseholderInPlace(benchmark::State& state) { + const Index n = state.range(0); + using Vec = Matrix; + using RealScalar = typename NumTraits::Real; + + Vec v = Vec::Random(n); + Vec v_copy = v; + Scalar tau; + RealScalar beta; + for (auto _ : state) { + v = v_copy; + v.makeHouseholderInPlace(tau, beta); + benchmark::DoNotOptimize(tau); + benchmark::DoNotOptimize(beta); + } + state.SetItemsProcessed(state.iterations()); +} + +// ============================================================================= +// makeHouseholder: compute Householder reflector, storing essential part +// separately. +// ============================================================================= + +template +static void BM_MakeHouseholder(benchmark::State& state) { + const Index n = state.range(0); + using Vec = Matrix; + using RealScalar = typename NumTraits::Real; + + Vec v = Vec::Random(n); + Vec essential(n - 1); + Scalar tau; + RealScalar beta; + for (auto _ : state) { + v.makeHouseholder(essential, tau, beta); + benchmark::DoNotOptimize(essential.data()); + } + state.SetItemsProcessed(state.iterations()); +} + +// ============================================================================= +// applyHouseholderOnTheLeft: apply H = I - tau * v * v^* from the left. +// ============================================================================= + +template +static void BM_ApplyHouseholderOnTheLeft(benchmark::State& state) { + const Index rows = state.range(0); + const Index cols = state.range(1); + using Mat = Matrix; + using Vec = Matrix; + using RealScalar = typename NumTraits::Real; + + // Create a Householder reflector from a random vector. + Vec v = Vec::Random(rows); + Vec essential(rows - 1); + Scalar tau; + RealScalar beta; + v.makeHouseholder(essential, tau, beta); + + Mat A = Mat::Random(rows, cols); + Mat A_copy = A; + Vec workspace(cols); + for (auto _ : state) { + A = A_copy; + A.applyHouseholderOnTheLeft(essential, tau, workspace.data()); + benchmark::DoNotOptimize(A.data()); + } + state.SetItemsProcessed(state.iterations()); +} + +// ============================================================================= +// applyHouseholderOnTheRight: apply H = I - tau * v * v^* from the right. +// ============================================================================= + +template +static void BM_ApplyHouseholderOnTheRight(benchmark::State& state) { + const Index rows = state.range(0); + const Index cols = state.range(1); + using Mat = Matrix; + using Vec = Matrix; + using RealScalar = typename NumTraits::Real; + + // Create a Householder reflector from a random vector. + Vec v = Vec::Random(cols); + Vec essential(cols - 1); + Scalar tau; + RealScalar beta; + v.makeHouseholder(essential, tau, beta); + + Mat A = Mat::Random(rows, cols); + Mat A_copy = A; + Vec workspace(rows); + for (auto _ : state) { + A = A_copy; + A.applyHouseholderOnTheRight(essential, tau, workspace.data()); + benchmark::DoNotOptimize(A.data()); + } + state.SetItemsProcessed(state.iterations()); +} + +// ============================================================================= +// HouseholderSequence evalTo: materialize Q = H_0 * H_1 * ... * H_{k-1} +// as a dense matrix. +// ============================================================================= + +template +static void BM_HouseholderSequence_EvalTo(benchmark::State& state) { + const Index rows = state.range(0); + const Index cols = state.range(1); + using Mat = Matrix; + + // Build a Householder sequence via QR factorization. + Mat A = Mat::Random(rows, cols); + HouseholderQR qr(A); + Mat Q(rows, rows); + for (auto _ : state) { + Q = qr.householderQ(); + benchmark::DoNotOptimize(Q.data()); + } + state.SetItemsProcessed(state.iterations()); +} + +// ============================================================================= +// HouseholderSequence applyOnTheLeft: apply Q from the left to a matrix +// without materializing Q. +// ============================================================================= + +template +static void BM_HouseholderSequence_ApplyLeft(benchmark::State& state) { + const Index rows = state.range(0); + const Index cols = state.range(1); + using Mat = Matrix; + + Mat A = Mat::Random(rows, cols); + HouseholderQR qr(A); + Mat B = Mat::Random(rows, cols); + Mat C(rows, cols); + for (auto _ : state) { + C.noalias() = qr.householderQ() * B; + benchmark::DoNotOptimize(C.data()); + } + state.SetItemsProcessed(state.iterations()); +} + +// ============================================================================= +// HouseholderSequence applyOnTheRight: apply Q from the right to a matrix +// without materializing Q. +// ============================================================================= + +template +static void BM_HouseholderSequence_ApplyRight(benchmark::State& state) { + const Index rows = state.range(0); + const Index cols = state.range(1); + using Mat = Matrix; + + Mat A = Mat::Random(rows, cols); + HouseholderQR qr(A); + Mat B = Mat::Random(cols, rows); + Mat C(cols, rows); + for (auto _ : state) { + C.noalias() = B * qr.householderQ(); + benchmark::DoNotOptimize(C.data()); + } + state.SetItemsProcessed(state.iterations()); +} + +// ============================================================================= +// HouseholderSequence adjoint apply: apply Q^* from the left (common in +// least-squares solves: Q^* * b). +// ============================================================================= + +template +static void BM_HouseholderSequence_AdjointApplyLeft(benchmark::State& state) { + const Index rows = state.range(0); + const Index cols = state.range(1); + using Mat = Matrix; + using Vec = Matrix; + + Mat A = Mat::Random(rows, cols); + HouseholderQR qr(A); + Vec b = Vec::Random(rows); + Vec c(rows); + for (auto _ : state) { + c.noalias() = qr.householderQ().adjoint() * b; + benchmark::DoNotOptimize(c.data()); + } + state.SetItemsProcessed(state.iterations()); +} + +// ============================================================================= +// Block Householder: make_block_householder_triangular_factor. +// ============================================================================= + +template +static void BM_BlockHouseholder_TriangularFactor(benchmark::State& state) { + const Index n = state.range(0); + using Mat = Matrix; + using Vec = Matrix; + + // Build Householder vectors via QR. + Mat A = Mat::Random(n, n); + HouseholderQR qr(A); + const Mat& qrMat = qr.matrixQR(); + Vec hCoeffs = qr.hCoeffs(); + + // Use the full set of vectors. + Index nbVecs = (std::min)(n, n); + Mat vectors = qrMat.leftCols(nbVecs); + Mat T(nbVecs, nbVecs); + for (auto _ : state) { + internal::make_block_householder_triangular_factor(T, vectors, hCoeffs.head(nbVecs)); + benchmark::DoNotOptimize(T.data()); + } + state.SetItemsProcessed(state.iterations()); +} + +// ============================================================================= +// Block Householder: apply_block_householder_on_the_left. +// ============================================================================= + +template +static void BM_BlockHouseholder_ApplyLeft(benchmark::State& state) { + const Index n = state.range(0); + const Index rhs_cols = state.range(1); + using Mat = Matrix; + using Vec = Matrix; + + // Build Householder vectors via QR. + Mat A = Mat::Random(n, n); + HouseholderQR qr(A); + const Mat& qrMat = qr.matrixQR(); + Vec hCoeffs = qr.hCoeffs(); + + // Use a block of reflectors. + Index nbVecs = (std::min)(n, Index(48)); + Mat vectors = qrMat.block(0, 0, n, nbVecs); + + Mat B = Mat::Random(n, rhs_cols); + Mat B_copy = B; + for (auto _ : state) { + B = B_copy; + internal::apply_block_householder_on_the_left(B, vectors, hCoeffs.head(nbVecs), true); + benchmark::DoNotOptimize(B.data()); + } + state.SetItemsProcessed(state.iterations()); +} + +// ============================================================================= +// Size configurations +// ============================================================================= + +static void VectorSizes(::benchmark::Benchmark* b) { + for (int n : {8, 16, 32, 64, 128, 256, 512, 1024, 4096}) b->Arg(n); +} + +static void SquareSizes(::benchmark::Benchmark* b) { + for (int n : {32, 48, 64, 80, 96, 112, 128, 160, 192, 256, 384, 512, 768, 1024}) b->Args({n, n}); +} + +// Fine-grained sizes around the blocking threshold to find the crossover point. +static void SquareSizesFine(::benchmark::Benchmark* b) { + for (int n : {32, 40, 48, 56, 64, 72, 80, 88, 96, 112, 128, 160, 192, 256}) b->Args({n, n}); +} + +// Rectangular: many rows, fewer columns (m_length = cols, dst is rows x rows). +static void RectApplyRight(::benchmark::Benchmark* b) { + // Square + for (int n : {48, 64, 96, 128, 256, 512, 1024}) b->Args({n, n}); + // Wide dst * narrow Q: dst is (rows x rows), Q is (cols x cols), so rows > cols. + b->Args({256, 64}); + b->Args({256, 128}); + b->Args({512, 64}); + b->Args({512, 128}); + b->Args({1024, 64}); + b->Args({1024, 128}); + b->Args({1024, 256}); +} + +static void RectSizes(::benchmark::Benchmark* b) { + // Square + for (int n : {32, 64, 128, 256, 512, 1024}) b->Args({n, n}); + // Tall-thin + b->Args({1000, 32}); + b->Args({1000, 100}); + b->Args({10000, 32}); + b->Args({10000, 100}); +} + +static void BlockSizes(::benchmark::Benchmark* b) { + for (int n : {64, 128, 256, 512, 1024}) { + b->Args({n, n}); + b->Args({n, 32}); + } +} + +// ============================================================================= +// Register benchmarks: float +// ============================================================================= + +BENCHMARK(BM_MakeHouseholderInPlace)->Apply(VectorSizes)->Name("MakeHouseholderInPlace_float"); +BENCHMARK(BM_MakeHouseholder)->Apply(VectorSizes)->Name("MakeHouseholder_float"); +BENCHMARK(BM_ApplyHouseholderOnTheLeft)->Apply(RectSizes)->Name("ApplyHouseholderOnTheLeft_float"); +BENCHMARK(BM_ApplyHouseholderOnTheRight)->Apply(RectSizes)->Name("ApplyHouseholderOnTheRight_float"); +BENCHMARK(BM_HouseholderSequence_EvalTo)->Apply(SquareSizesFine)->Name("HouseholderSequence_EvalTo_float"); +BENCHMARK(BM_HouseholderSequence_ApplyLeft)->Apply(RectSizes)->Name("HouseholderSequence_ApplyLeft_float"); +BENCHMARK(BM_HouseholderSequence_ApplyRight) + ->Apply(RectApplyRight) + ->Name("HouseholderSequence_ApplyRight_float"); +BENCHMARK(BM_HouseholderSequence_AdjointApplyLeft) + ->Apply(RectSizes) + ->Name("HouseholderSequence_AdjointApplyLeft_float"); +BENCHMARK(BM_BlockHouseholder_TriangularFactor) + ->Apply(VectorSizes) + ->Name("BlockHouseholder_TriangularFactor_float"); +BENCHMARK(BM_BlockHouseholder_ApplyLeft)->Apply(BlockSizes)->Name("BlockHouseholder_ApplyLeft_float"); + +// ============================================================================= +// Register benchmarks: double +// ============================================================================= + +BENCHMARK(BM_MakeHouseholderInPlace)->Apply(VectorSizes)->Name("MakeHouseholderInPlace_double"); +BENCHMARK(BM_MakeHouseholder)->Apply(VectorSizes)->Name("MakeHouseholder_double"); +BENCHMARK(BM_ApplyHouseholderOnTheLeft)->Apply(RectSizes)->Name("ApplyHouseholderOnTheLeft_double"); +BENCHMARK(BM_ApplyHouseholderOnTheRight)->Apply(RectSizes)->Name("ApplyHouseholderOnTheRight_double"); +BENCHMARK(BM_HouseholderSequence_EvalTo)->Apply(SquareSizesFine)->Name("HouseholderSequence_EvalTo_double"); +BENCHMARK(BM_HouseholderSequence_ApplyLeft)->Apply(RectSizes)->Name("HouseholderSequence_ApplyLeft_double"); +BENCHMARK(BM_HouseholderSequence_ApplyRight) + ->Apply(RectApplyRight) + ->Name("HouseholderSequence_ApplyRight_double"); +BENCHMARK(BM_HouseholderSequence_AdjointApplyLeft) + ->Apply(RectSizes) + ->Name("HouseholderSequence_AdjointApplyLeft_double"); +BENCHMARK(BM_BlockHouseholder_TriangularFactor) + ->Apply(VectorSizes) + ->Name("BlockHouseholder_TriangularFactor_double"); +BENCHMARK(BM_BlockHouseholder_ApplyLeft)->Apply(BlockSizes)->Name("BlockHouseholder_ApplyLeft_double"); + +// ============================================================================= +// Register benchmarks: std::complex +// ============================================================================= + +BENCHMARK(BM_MakeHouseholderInPlace>) + ->Apply(VectorSizes) + ->Name("MakeHouseholderInPlace_complexdouble"); +BENCHMARK(BM_ApplyHouseholderOnTheLeft>) + ->Apply(RectSizes) + ->Name("ApplyHouseholderOnTheLeft_complexdouble"); +BENCHMARK(BM_HouseholderSequence_EvalTo>) + ->Apply(SquareSizes) + ->Name("HouseholderSequence_EvalTo_complexdouble"); +BENCHMARK(BM_HouseholderSequence_ApplyLeft>) + ->Apply(SquareSizes) + ->Name("HouseholderSequence_ApplyLeft_complexdouble");