From a31de4778de4911ce4838a4cc042ae0276451499 Mon Sep 17 00:00:00 2001
From: Rasmus Munk Larsen <4643818-rmlarsen1@users.noreply.gitlab.com>
Date: Wed, 25 Feb 2026 10:03:05 -0800
Subject: [PATCH] Blocked Jacobi SVD sweep with L2-cache-adaptive threshold

libeigen/eigen!2206

Co-authored-by: Rasmus Munk Larsen
---
 Eigen/src/SVD/JacobiSVD.h             | 320 +++++++++++++++++++++++---
 benchmarks/SVD/bench_jacobi_sweep.cpp |  62 +++++
 2 files changed, 349 insertions(+), 33 deletions(-)
 create mode 100644 benchmarks/SVD/bench_jacobi_sweep.cpp
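Note for reviewers: the change is transparent to callers; no new public API is
introduced. A minimal caller-side sketch (standard JacobiSVD usage; the blocked
path is selected internally once n reaches the L2-derived threshold):

  #include <Eigen/Dense>
  #include <iostream>

  int main() {
    // For n at or above the L2-derived threshold, compute() takes the blocked
    // sweep path; smaller problems use the scalar sweep. Same results either way.
    Eigen::MatrixXd A = Eigen::MatrixXd::Random(512, 512);
    Eigen::JacobiSVD<Eigen::MatrixXd, Eigen::ComputeThinU | Eigen::ComputeThinV> svd(A);
    std::cout << "largest singular value: " << svd.singularValues()(0) << "\n";
  }
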
diff --git a/Eigen/src/SVD/JacobiSVD.h b/Eigen/src/SVD/JacobiSVD.h
index 1b08bb392..e9c0f0df2 100644
--- a/Eigen/src/SVD/JacobiSVD.h
+++ b/Eigen/src/SVD/JacobiSVD.h
@@ -372,8 +372,8 @@ struct svd_precondition_2x2_block_to_be_real<MatrixType, Options, true> {
   typedef typename MatrixType::Scalar Scalar;
   typedef typename MatrixType::RealScalar RealScalar;
   static bool run(typename SVD::WorkMatrixType& work_matrix, SVD& svd, Index p, Index q, RealScalar& maxDiagEntry) {
-    using std::abs;
-    using std::sqrt;
+    using numext::abs;
+    using numext::sqrt;
     Scalar z;
     JacobiRotation<Scalar> rot;
     RealScalar n = sqrt(numext::abs2(work_matrix.coeff(p, p)) + numext::abs2(work_matrix.coeff(q, p)));
@@ -653,6 +653,12 @@ class JacobiSVD : public SVDBase<JacobiSVD<MatrixType_, Options_> > {
   template <typename Derived>
   JacobiSVD& compute_impl(const MatrixBase<Derived>& matrix, unsigned int computationOptions);
 
+  // Blocked sweep for the Jacobi SVD (works for both real and complex scalars).
+  // Extracted into a separate EIGEN_DONT_INLINE method to prevent the blocking
+  // code from interfering with the compiler's optimization of the non-blocking
+  // scalar sweep.
+  EIGEN_DONT_INLINE bool blocked_sweep(RealScalar considerAsZero, RealScalar precision, RealScalar& maxDiagEntry);
+
 protected:
   using Base::m_computationOptions;
   using Base::m_computeFullU;
@@ -686,6 +692,14 @@ class JacobiSVD : public SVDBase<JacobiSVD<MatrixType_, Options_> > {
   internal::qr_preconditioner_impl<MatrixType, Options, QRPreconditioner, internal::PreconditionIfMoreRowsThanCols>
       m_qr_precond_morerows;
   WorkMatrixType m_workMatrix;
+
+  // Blocking parameters for the Jacobi SVD sweep.
+#ifdef EIGEN_JACOBI_SVD_BLOCK_SIZE
+  static constexpr Index kBlockSize = EIGEN_JACOBI_SVD_BLOCK_SIZE;
+#else
+  static constexpr Index kBlockSize = 32;
+#endif
+  Matrix<Scalar, kBlockSize + 1, kBlockSize + 1> m_blockBuffer;
 };
 
 template <typename MatrixType, int Options>
@@ -703,7 +717,7 @@ JacobiSVD<MatrixType, Options>& JacobiSVD<MatrixType, Options>::compute_impl(co
   EIGEN_STATIC_ASSERT((std::is_same<typename Derived::Scalar, Scalar>::value),
                       Input matrix must have the same Scalar type as the JacobiSVD object.);
 
-  using std::abs;
+  using numext::abs;
 
   allocate(matrix.rows(), matrix.cols(), computationOptions);
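Tuning note: both knobs introduced here are compile-time macros and take effect
only if defined before the first Eigen header is included. A minimal sketch, with
illustrative values (the macro names are the ones added by this patch; 16 and 128
are examples, not recommendations):

  // Hypothetical tuning translation unit; values are illustrative only.
  #define EIGEN_JACOBI_SVD_BLOCK_SIZE 16           // kBlockSize (library default: 32)
  #define EIGEN_JACOBI_SVD_BLOCKING_THRESHOLD 128  // minimum n for the blocked path
  #include <Eigen/Dense>
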
@@ -746,37 +760,73 @@ JacobiSVD<MatrixType, Options>& JacobiSVD<MatrixType, Options>::compute_impl(co
   while (!finished) {
     finished = true;
 
-    // do a sweep: for all index pairs (p,q), perform SVD of the corresponding 2x2 sub-matrix
-    // Threshold is hoisted before the double loop; it only needs updating when maxDiagEntry
-    // increases (which only happens inside the rotation block). Since maxDiagEntry is
-    // monotonically non-decreasing, a slightly stale threshold is conservative.
-    RealScalar threshold = numext::maxi<RealScalar>(considerAsZero, precision * maxDiagEntry);
-
-    for (Index p = 1; p < diagSize(); ++p) {
-      for (Index q = 0; q < p; ++q) {
-        // if this 2x2 sub-matrix is not diagonal already...
-        // notice that this comparison will evaluate to false if any NaN is involved, ensuring that NaN's don't
-        // keep us iterating forever. Similarly, small denormal numbers are considered zero.
-        if (abs(m_workMatrix.coeff(p, q)) > threshold || abs(m_workMatrix.coeff(q, p)) > threshold) {
-          finished = false;
-          // perform SVD decomposition of 2x2 sub-matrix corresponding to indices p,q to make it diagonal
-          // the complex to real operation returns true if the updated 2x2 block is not already diagonal
-          if (internal::svd_precondition_2x2_block_to_be_real<MatrixType, Options>::run(m_workMatrix, *this, p, q,
-                                                                                        maxDiagEntry)) {
-            JacobiRotation<RealScalar> j_left, j_right;
-            internal::real_2x2_jacobi_svd(m_workMatrix, p, q, &j_left, &j_right);
-
-            // accumulate resulting Jacobi rotations
-            m_workMatrix.applyOnTheLeft(p, q, j_left);
-            if (computeU()) m_matrixU.applyOnTheRight(p, q, j_left.transpose());
-
-            m_workMatrix.applyOnTheRight(p, q, j_right);
-            if (computeV()) m_matrixV.applyOnTheRight(p, q, j_right);
-
-            // keep track of the largest diagonal coefficient
-            maxDiagEntry = numext::maxi<RealScalar>(
-                maxDiagEntry, numext::maxi<RealScalar>(abs(m_workMatrix.coeff(p, p)), abs(m_workMatrix.coeff(q, q))));
-            threshold = numext::maxi<RealScalar>(considerAsZero, precision * maxDiagEntry);
-          }
-        }
-      }
-    }
+    {
+      // Sweep with optional blocking for large matrices.
+      // Use blocking when the matrix is large enough that individual left rotations
+      // (strided row operations on column-major data) cause significant cache misses.
+      // The threshold is derived from the L2 cache size: blocking becomes worthwhile
+      // when n exceeds sqrt(L2 / 4). We divide by sizeof(float) rather than
+      // sizeof(RealScalar) because the cache-miss pattern depends on the number of
+      // columns accessed (one cache line per column), not on the scalar size. This
+      // also makes the threshold appropriately more conservative for larger types,
+      // where GEMM overhead is higher.
+      const Index n = diagSize();
+#ifdef EIGEN_JACOBI_SVD_BLOCKING_THRESHOLD
+      const Index blockingThreshold = EIGEN_JACOBI_SVD_BLOCKING_THRESHOLD;
+#else
+      const Index blockingThreshold =
+          static_cast<Index>(numext::sqrt(static_cast<float>(l2CacheSize() / sizeof(float))));
+#endif
+
+      if (n >= blockingThreshold) {
+        // The blocked sweep is in a separate EIGEN_DONT_INLINE method to prevent
+        // the blocking code from interfering with the compiler's optimization of
+        // the non-blocking scalar sweeps below.
+        finished = !blocked_sweep(considerAsZero, precision, maxDiagEntry);
+      }
+      // Non-blocking paths: apply rotations individually. The real and complex
+      // paths are kept separate to avoid any codegen impact from the complex
+      // preconditioner on GCC's optimization of the real inner loop.
+      else
+        EIGEN_IF_CONSTEXPR(NumTraits<Scalar>::IsComplex) {
+          // Complex non-blocking sweep: condition each 2x2 block to be real before diagonalizing.
+          for (Index p = 1; p < n; ++p) {
+            for (Index q = 0; q < p; ++q) {
+              RealScalar threshold = numext::maxi<RealScalar>(considerAsZero, precision * maxDiagEntry);
+              if (abs(m_workMatrix.coeff(p, q)) > threshold || abs(m_workMatrix.coeff(q, p)) > threshold) {
+                finished = false;
+                if (internal::svd_precondition_2x2_block_to_be_real<MatrixType, Options>::run(m_workMatrix, *this,
+                                                                                              p, q, maxDiagEntry)) {
+                  JacobiRotation<RealScalar> j_left, j_right;
+                  internal::real_2x2_jacobi_svd(m_workMatrix, p, q, &j_left, &j_right);
+                  m_workMatrix.applyOnTheLeft(p, q, j_left);
+                  if (computeU()) m_matrixU.applyOnTheRight(p, q, j_left.transpose());
+                  m_workMatrix.applyOnTheRight(p, q, j_right);
+                  if (computeV()) m_matrixV.applyOnTheRight(p, q, j_right);
+                  maxDiagEntry = numext::maxi<RealScalar>(
+                      maxDiagEntry,
+                      numext::maxi<RealScalar>(abs(m_workMatrix.coeff(p, p)), abs(m_workMatrix.coeff(q, q))));
+                }
+              }
+            }
+          }
+        }
+      else {
+        // Real non-blocking sweep: diagonalize each 2x2 block directly. The threshold
+        // is hoisted out of the double loop; since maxDiagEntry is monotonically
+        // non-decreasing, a slightly stale threshold is conservative.
+        RealScalar threshold = numext::maxi<RealScalar>(considerAsZero, precision * maxDiagEntry);
+        for (Index p = 1; p < n; ++p) {
+          for (Index q = 0; q < p; ++q) {
+            if (abs(m_workMatrix.coeff(p, q)) > threshold || abs(m_workMatrix.coeff(q, p)) > threshold) {
+              finished = false;
+              JacobiRotation<RealScalar> j_left, j_right;
+              internal::real_2x2_jacobi_svd(m_workMatrix, p, q, &j_left, &j_right);
+              m_workMatrix.applyOnTheLeft(p, q, j_left);
+              if (computeU()) m_matrixU.applyOnTheRight(p, q, j_left.transpose());
+              m_workMatrix.applyOnTheRight(p, q, j_right);
+              if (computeV()) m_matrixV.applyOnTheRight(p, q, j_right);
+              maxDiagEntry = numext::maxi<RealScalar>(
+                  maxDiagEntry,
+                  numext::maxi<RealScalar>(abs(m_workMatrix.coeff(p, p)), abs(m_workMatrix.coeff(q, q))));
+              threshold = numext::maxi<RealScalar>(considerAsZero, precision * maxDiagEntry);
+            }
+          }
+        }
+      }
+    }
   }
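To make the threshold formula concrete: with a 1 MiB L2, blocking kicks in at
n >= sqrt(2^20 / 4) = 512; with 256 KiB, at n >= 256. A standalone sketch of the
same arithmetic (l2CacheSize() is Eigen's existing cache-size query; the reported
value is machine-dependent):

  #include <Eigen/Core>
  #include <cmath>
  #include <cstdio>

  int main() {
    // Mirrors the in-library computation: one cache line is touched per column,
    // so the byte budget is divided by sizeof(float) regardless of Scalar.
    std::ptrdiff_t l2 = Eigen::l2CacheSize();
    double budget = static_cast<double>(l2) / sizeof(float);
    Eigen::Index threshold = static_cast<Eigen::Index>(std::sqrt(budget));
    std::printf("L2 = %td bytes -> blocked sweep for n >= %td\n", l2, threshold);
  }
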
@@ -826,6 +876,210 @@ JacobiSVD<MatrixType, Options>& JacobiSVD<MatrixType, Options>::compute_impl(co
   return *this;
 }
 
+// Blocked Jacobi SVD sweep for both real and complex scalar types. For large n,
+// applying left rotations (row operations on column-major data) causes cache
+// misses due to strided access. To mitigate this, we accumulate kBlockSize left
+// rotations into a small dense matrix and apply them via a single GEMM to the
+// contiguous row block q..q+kBlockSize-1 and the (possibly distant) row p.
+// Right rotations and column scalings act on columns (contiguous in column-major
+// storage) and are applied individually.
+//
+// For complex types, the 2x2 preconditioning (making the block real) involves
+// complex left rotations and row scalings, which are also accumulated into the
+// block matrix. Column scalings from preconditioning are applied directly.
+//
+// The accumulated rotation matrix has lower-triangular structure in its top-left
+// kBlockSize x kBlockSize corner, which we exploit with triangularView.
+//
+// Returns true if any off-diagonal element exceeded the threshold (i.e. the
+// sweep has not yet converged).
+template <typename MatrixType, int Options>
+EIGEN_DONT_INLINE bool JacobiSVD<MatrixType, Options>::blocked_sweep(RealScalar considerAsZero, RealScalar precision,
+                                                                     RealScalar& maxDiagEntry) {
+  using numext::abs;
+  using numext::sqrt;
+  const Index n = diagSize();
+  RealScalar threshold = numext::maxi<RealScalar>(considerAsZero, precision * maxDiagEntry);
+  bool notFinished = false;
+
+  for (Index p = 1; p < n; ++p) {
+    Index q = 0;
+
+    // Blocked loop: process kBlockSize pairs (p, q+qq) for qq = 0..kBlockSize-1.
+    // We extract the relevant (kBlockSize+1) x (kBlockSize+1) submatrix of W
+    // into a small buffer, compute all rotations on the buffer, accumulate the
+    // left transformations into `accum`, and apply them in one GEMM at the end.
+    for (; q + kBlockSize <= p; q += kBlockSize) {
+      // Buffer = [ W(q:q+k, q:q+k)  W(q:q+k, p) ]
+      //          [ W(p, q:q+k)      W(p, p)     ]
+      m_blockBuffer.template topLeftCorner<kBlockSize, kBlockSize>() =
+          m_workMatrix.template block<kBlockSize, kBlockSize>(q, q);
+      m_blockBuffer.col(kBlockSize).template head<kBlockSize>() = m_workMatrix.col(p).template segment<kBlockSize>(q);
+      m_blockBuffer.row(kBlockSize).template head<kBlockSize>() = m_workMatrix.row(p).template segment<kBlockSize>(q);
+      m_blockBuffer(kBlockSize, kBlockSize) = m_workMatrix(p, p);
+
+      // Accumulator for left transformations: W <- accum * W.
+      // After processing qq pairs, accum's top-left kBlockSize x kBlockSize
+      // block is lower-triangular (each rotation only mixes row qq with row
+      // kBlockSize, so rows 0..qq-1 are unchanged).
+      Matrix<Scalar, kBlockSize + 1, kBlockSize + 1> accum;
+      accum.setIdentity(kBlockSize + 1, kBlockSize + 1);
+      bool blockDirty = false;
+
+      for (Index qq = 0; qq < kBlockSize; ++qq) {
+        if (abs(m_blockBuffer.coeff(kBlockSize, qq)) > threshold ||
+            abs(m_blockBuffer.coeff(qq, kBlockSize)) > threshold) {
+          notFinished = true;
+          blockDirty = true;
+
+          // Complex preconditioning: transform the 2x2 block
+          //   [w_pp w_pq]   [buffer(kBlockSize, kBlockSize)  buffer(kBlockSize, qq)]
+          //   [w_qp w_qq] = [buffer(qq, kBlockSize)          buffer(qq, qq)        ]
+          // to have real entries via unitary row/column operations, so that
+          // real_2x2_jacobi_svd can be applied.
+          //
+          // Left operations (complex rotation, row scaling by e^{i*theta}) are
+          // accumulated into `accum` for deferred GEMM application.
+          // Right operations (column scaling) are applied directly, since column
+          // ops are contiguous in column-major layout.
+          bool doRealSvd = true;
+          EIGEN_IF_CONSTEXPR(NumTraits<Scalar>::IsComplex) {
+            Scalar z;
+            // nn = ||(w_pp, w_qp)||_2, the norm of the first column of the 2x2 block.
+            RealScalar nn = sqrt(numext::abs2(m_blockBuffer.coeff(kBlockSize, kBlockSize)) +
+                                 numext::abs2(m_blockBuffer.coeff(qq, kBlockSize)));
+
+            if (numext::is_exactly_zero(nn)) {
+              // The first column is zero => the block is already upper triangular.
+              m_blockBuffer.coeffRef(kBlockSize, kBlockSize) = Scalar(0);
+              m_blockBuffer.coeffRef(qq, kBlockSize) = Scalar(0);
+
+              // Scale rows by z = e^{-i*arg(w)} to make the remaining entries real.
+              if (abs(numext::imag(m_blockBuffer.coeff(kBlockSize, qq))) > considerAsZero) {
+                z = abs(m_blockBuffer.coeff(kBlockSize, qq)) / m_blockBuffer.coeff(kBlockSize, qq);
+                m_blockBuffer.row(kBlockSize) *= z;
+                accum.row(kBlockSize) *= z;  // accumulate left op
+                if (computeU()) m_matrixU.col(p) *= numext::conj(z);
+              }
+              if (abs(numext::imag(m_blockBuffer.coeff(qq, qq))) > considerAsZero) {
+                z = abs(m_blockBuffer.coeff(qq, qq)) / m_blockBuffer.coeff(qq, qq);
+                m_blockBuffer.row(qq) *= z;
+                accum.row(qq) *= z;  // accumulate left op
+                if (computeU()) m_matrixU.col(q + qq) *= numext::conj(z);
+              }
+            } else {
+              // Apply a complex Givens rotation to zero out w_qp:
+              //   [ c  s] [w_pp]   [nn]       conj(w_pp)       w_qp
+              //   [-s  c] [w_qp] = [ 0],  c = ----------,  s = ----
+              //                                   nn             nn
+              JacobiRotation<Scalar> rot;
+              rot.c() = numext::conj(m_blockBuffer.coeff(kBlockSize, kBlockSize)) / nn;
+              rot.s() = m_blockBuffer.coeff(qq, kBlockSize) / nn;
+              m_blockBuffer.applyOnTheLeft(kBlockSize, qq, rot);
+              accum.applyOnTheLeft(kBlockSize, qq, rot);  // accumulate left op
+              if (computeU()) m_matrixU.applyOnTheRight(p, q + qq, rot.adjoint());
+
+              // Scale column qq by z = e^{-i*arg(w_pq)} to make w_pq real.
+              if (abs(numext::imag(m_blockBuffer.coeff(kBlockSize, qq))) > considerAsZero) {
+                z = abs(m_blockBuffer.coeff(kBlockSize, qq)) / m_blockBuffer.coeff(kBlockSize, qq);
+                m_blockBuffer.col(qq) *= z;
+                m_workMatrix.col(q + qq) *= z;  // right op: apply directly
+                if (computeV()) m_matrixV.col(q + qq) *= z;
+              }
+              // Scale row qq by z = e^{-i*arg(w_qq)} to make w_qq real.
+              if (abs(numext::imag(m_blockBuffer.coeff(qq, qq))) > considerAsZero) {
+                z = abs(m_blockBuffer.coeff(qq, qq)) / m_blockBuffer.coeff(qq, qq);
+                m_blockBuffer.row(qq) *= z;
+                accum.row(qq) *= z;  // accumulate left op
+                if (computeU()) m_matrixU.col(q + qq) *= numext::conj(z);
+              }
+            }
+            // Update maxDiagEntry from the preconditioning.
+            maxDiagEntry = numext::maxi<RealScalar>(
+                maxDiagEntry, numext::maxi<RealScalar>(abs(m_blockBuffer.coeff(kBlockSize, kBlockSize)),
+                                                       abs(m_blockBuffer.coeff(qq, qq))));
+            // Check whether the 2x2 block still needs diagonalizing.
+            RealScalar precondThreshold = numext::maxi<RealScalar>(considerAsZero, precision * maxDiagEntry);
+            doRealSvd = abs(m_blockBuffer.coeff(kBlockSize, qq)) > precondThreshold ||
+                        abs(m_blockBuffer.coeff(qq, kBlockSize)) > precondThreshold;
+          }
+
+          if (doRealSvd) {
+            // Compute the real 2x2 SVD: buffer_2x2 = j_left * diag * j_right^T.
+            JacobiRotation<RealScalar> j_left, j_right;
+            internal::real_2x2_jacobi_svd(m_blockBuffer, kBlockSize, qq, &j_left, &j_right);
+            m_blockBuffer.applyOnTheLeft(kBlockSize, qq, j_left);
+            m_blockBuffer.applyOnTheRight(kBlockSize, qq, j_right);
+
+            // Accumulate the left rotation for the deferred GEMM.
+            accum.applyOnTheLeft(kBlockSize, qq, j_left);
+
+            // The right rotation is a column op (contiguous): apply it directly.
+            m_workMatrix.applyOnTheRight(p, q + qq, j_right);
+            if (computeU()) m_matrixU.applyOnTheRight(p, q + qq, j_left.transpose());
+            if (computeV()) m_matrixV.applyOnTheRight(p, q + qq, j_right);
+
+            maxDiagEntry = numext::maxi<RealScalar>(
+                maxDiagEntry, numext::maxi<RealScalar>(abs(m_blockBuffer.coeff(kBlockSize, kBlockSize)),
+                                                       abs(m_blockBuffer.coeff(qq, qq))));
+          }
+          threshold = numext::maxi<RealScalar>(considerAsZero, precision * maxDiagEntry);
+        }
+      }
+
+      // Apply the accumulated left rotations: W <- accum * W, via GEMM.
+      // When p == q + kBlockSize, all kBlockSize+1 rows are contiguous.
+      // Otherwise, rows q..q+kBlockSize-1 and row p are non-adjacent; we split:
+      //   [Mq]    [L11 l12] [Mq]
+      //   [Mp] <- [l21 l22] [Mp]
+      // L11 is lower-triangular (exploited via triangularView).
+      if (blockDirty) {
+        if (p == q + kBlockSize) {
+          m_workMatrix.template middleRows<kBlockSize + 1>(q) =
+              accum * m_workMatrix.template middleRows<kBlockSize + 1>(q);
+        } else {
+          const auto L11 = accum.template topLeftCorner<kBlockSize, kBlockSize>();
+          const auto l12 = accum.col(kBlockSize).template head<kBlockSize>();
+          const auto l21 = accum.row(kBlockSize).template head<kBlockSize>();
+          const Scalar l22 = accum(kBlockSize, kBlockSize);
+          auto Mq = m_workMatrix.template middleRows<kBlockSize>(q);
+          auto Mp = m_workMatrix.row(p);
+          Matrix<Scalar, 1, Dynamic> Mp_save = Mp;
+          Mp.noalias() = l21 * Mq + l22 * Mp_save;
+          Mq = L11.template triangularView<Lower>() * Mq + l12 * Mp_save;
+        }
+      }
+    }
+
+    // Scalar loop for the remaining pairs after blocked processing.
+    for (; q < p; ++q) {
+      if (abs(m_workMatrix.coeff(p, q)) > threshold || abs(m_workMatrix.coeff(q, p)) > threshold) {
+        notFinished = true;
+
+        bool doRealSvd = true;
+        EIGEN_IF_CONSTEXPR(NumTraits<Scalar>::IsComplex) {
+          doRealSvd = internal::svd_precondition_2x2_block_to_be_real<MatrixType, Options>::run(m_workMatrix, *this,
+                                                                                                p, q, maxDiagEntry);
+        }
+
+        if (doRealSvd) {
+          JacobiRotation<RealScalar> j_left, j_right;
+          internal::real_2x2_jacobi_svd(m_workMatrix, p, q, &j_left, &j_right);
+          m_workMatrix.applyOnTheLeft(p, q, j_left);
+          if (computeU()) m_matrixU.applyOnTheRight(p, q, j_left.transpose());
+          m_workMatrix.applyOnTheRight(p, q, j_right);
+          if (computeV()) m_matrixV.applyOnTheRight(p, q, j_right);
+          maxDiagEntry = numext::maxi<RealScalar>(
+              maxDiagEntry, numext::maxi<RealScalar>(abs(m_workMatrix.coeff(p, p)), abs(m_workMatrix.coeff(q, q))));
+        }
+        threshold = numext::maxi<RealScalar>(considerAsZero, precision * maxDiagEntry);
+      }
+    }
+  }
+
+  return notFinished;
+}
+
 /** \svd_module
   *
   * \return the singular value decomposition of \c *this computed by two-sided
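The two-block GEMM split above can be exercised in isolation. A self-contained
sketch (sizes k and cols are illustrative; accum, Mq, Mp mirror the names in the
comment, not Eigen internals) checking that the split form reproduces the plain
product accum * [Mq; Mp]:

  #include <Eigen/Dense>
  #include <iostream>

  int main() {
    using namespace Eigen;
    const int k = 4, cols = 7;  // illustrative; the patch uses kBlockSize
    // Build an accum with the same structure: lower-triangular top-left
    // corner plus a dense last row and column.
    MatrixXd accum = MatrixXd::Identity(k + 1, k + 1);
    MatrixXd L = MatrixXd::Random(k, k);
    accum.topLeftCorner(k, k) = L.triangularView<Lower>();
    accum.col(k).setRandom();
    accum.row(k).setRandom();

    MatrixXd Mq = MatrixXd::Random(k, cols);
    RowVectorXd Mp = RowVectorXd::Random(cols);

    // Reference: apply accum to the stacked rows directly.
    MatrixXd stacked(k + 1, cols);
    stacked << Mq, Mp;
    MatrixXd ref = accum * stacked;

    // Split form, as in blocked_sweep: row p is updated from a saved copy.
    RowVectorXd Mp_save = Mp;
    Mp = accum.row(k).head(k) * Mq + accum(k, k) * Mp_save;
    Mq = accum.topLeftCorner(k, k).triangularView<Lower>() * Mq + accum.col(k).head(k) * Mp_save;

    std::cout << "max error: "
              << (ref.topRows(k) - Mq).cwiseAbs().maxCoeff() +
                     (ref.row(k) - Mp).cwiseAbs().maxCoeff()
              << "\n";
  }
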
diff --git a/benchmarks/SVD/bench_jacobi_sweep.cpp b/benchmarks/SVD/bench_jacobi_sweep.cpp
new file mode 100644
index 000000000..45d0a7c55
--- /dev/null
+++ b/benchmarks/SVD/bench_jacobi_sweep.cpp
@@ -0,0 +1,62 @@
+// Standalone JacobiSVD benchmark for block-size and threshold sweeps.
+// Compile-time parameters via -D flags:
+//   BLOCK_SIZE, BLOCKING_THRESHOLD
+// When a flag is unset, Eigen's built-in defaults apply (kBlockSize = 32 and
+// the L2-cache-derived blocking threshold).
+//
+// Build example:
+//   g++ -O3 -DNDEBUG -march=native -DBLOCK_SIZE=16 -DBLOCKING_THRESHOLD=128 \
+//       -I../.. bench_jacobi_sweep.cpp -lbenchmark -lbenchmark_main -lpthread
+
+#include <benchmark/benchmark.h>
+
+// Override the block size and threshold before including Eigen headers.
+#ifdef BLOCK_SIZE
+#define EIGEN_JACOBI_SVD_BLOCK_SIZE BLOCK_SIZE
+#endif
+#ifdef BLOCKING_THRESHOLD
+#define EIGEN_JACOBI_SVD_BLOCKING_THRESHOLD BLOCKING_THRESHOLD
+#endif
+
+#include <Eigen/Dense>
+
+using namespace Eigen;
+
+template <typename Scalar>
+using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+
+template <typename SVD>
+EIGEN_DONT_INLINE void do_compute(SVD& svd, const typename SVD::MatrixType& A) {
+  svd.compute(A);
+}
+
+template <typename Scalar, int Options>
+static void BM_JacobiSVD(benchmark::State& state) {
+  const Index n = state.range(0);
+  Mat<Scalar> A = Mat<Scalar>::Random(n, n);
+  JacobiSVD<Mat<Scalar>, Options> svd(n, n);
+  for (auto _ : state) {
+    do_compute(svd, A);
+    benchmark::DoNotOptimize(svd.singularValues().data());
+  }
+  state.SetItemsProcessed(state.iterations());
+}
+
+static void Sizes(::benchmark::Benchmark* b) {
+  for (int s : {32, 64, 128, 192, 256, 384, 512}) b->Args({s});
+}
+
+// float ValuesOnly
+BENCHMARK((BM_JacobiSVD<float, 0>))->Apply(Sizes)->Name("Jacobi_float_VO");
+// float ThinUV
+BENCHMARK((BM_JacobiSVD<float, ComputeThinU | ComputeThinV>))->Apply(Sizes)->Name("Jacobi_float_UV");
+// double ValuesOnly
+BENCHMARK((BM_JacobiSVD<double, 0>))->Apply(Sizes)->Name("Jacobi_double_VO");
+// double ThinUV
+BENCHMARK((BM_JacobiSVD<double, ComputeThinU | ComputeThinV>))->Apply(Sizes)->Name("Jacobi_double_UV");
+// complex<float> ValuesOnly
+BENCHMARK((BM_JacobiSVD<std::complex<float>, 0>))->Apply(Sizes)->Name("Jacobi_cfloat_VO");
+// complex<float> ThinUV
+BENCHMARK((BM_JacobiSVD<std::complex<float>, ComputeThinU | ComputeThinV>))->Apply(Sizes)->Name("Jacobi_cfloat_UV");
+// complex<double> ValuesOnly
+BENCHMARK((BM_JacobiSVD<std::complex<double>, 0>))->Apply(Sizes)->Name("Jacobi_cdouble_VO");
+// complex<double> ThinUV
+BENCHMARK((BM_JacobiSVD<std::complex<double>, ComputeThinU | ComputeThinV>))->Apply(Sizes)->Name("Jacobi_cdouble_UV");
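A companion sanity check one might run alongside the timing sweep (not part of
the patch; a single-TU sketch that compares the blocked Jacobi path against
Eigen's BDCSVD on the same matrix, rather than a true A/B of the two sweep
paths, which would require separate translation units built with different
flags):

  #include <Eigen/Dense>
  #include <iostream>

  int main() {
    using namespace Eigen;
    const Index n = 512;  // large enough for the blocked sweep on typical L2 sizes
    MatrixXd A = MatrixXd::Random(n, n);
    JacobiSVD<MatrixXd> jsvd(A);  // values only
    BDCSVD<MatrixXd> bsvd(A);
    double err = (jsvd.singularValues() - bsvd.singularValues()).norm() /
                 bsvd.singularValues().norm();
    std::cout << "relative difference vs BDCSVD: " << err << "\n";
  }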