Blocked Jacobi SVD sweep with L2-cache-adaptive threshold

libeigen/eigen!2206

Co-authored-by: Rasmus Munk Larsen <rmlarsen@gmail.com>
Co-authored-by: Rasmus Munk Larsen <rmlarsen@google.com>
Rasmus Munk Larsen
2026-02-25 10:03:05 -08:00
parent 647e0009ba
commit a31de4778d
2 changed files with 349 additions and 33 deletions

Eigen/src/SVD/JacobiSVD.h

@@ -372,8 +372,8 @@ struct svd_precondition_2x2_block_to_be_real<MatrixType, Options, true> {
typedef typename MatrixType::Scalar Scalar;
typedef typename MatrixType::RealScalar RealScalar;
static bool run(typename SVD::WorkMatrixType& work_matrix, SVD& svd, Index p, Index q, RealScalar& maxDiagEntry) {
- using std::abs;
- using std::sqrt;
+ using numext::abs;
+ using numext::sqrt;
Scalar z;
JacobiRotation<Scalar> rot;
RealScalar n = sqrt(numext::abs2(work_matrix.coeff(p, p)) + numext::abs2(work_matrix.coeff(q, p)));
@@ -653,6 +653,12 @@ class JacobiSVD : public SVDBase<JacobiSVD<MatrixType_, Options_> > {
template <typename Derived>
JacobiSVD& compute_impl(const MatrixBase<Derived>& matrix, unsigned int computationOptions);
// Blocked sweep for the Jacobi SVD (works for both real and complex scalars).
// Extracted into a separate EIGEN_DONT_INLINE method to prevent the blocking
// code from interfering with the compiler's optimization of the non-blocking
// scalar sweep.
EIGEN_DONT_INLINE bool blocked_sweep(RealScalar considerAsZero, RealScalar precision, RealScalar& maxDiagEntry);
protected:
using Base::m_computationOptions;
using Base::m_computeFullU;
@@ -686,6 +692,14 @@ class JacobiSVD : public SVDBase<JacobiSVD<MatrixType_, Options_> > {
internal::qr_preconditioner_impl<MatrixType, Options, QRPreconditioner, internal::PreconditionIfMoreRowsThanCols>
m_qr_precond_morerows;
WorkMatrixType m_workMatrix;
// Blocking parameters for the Jacobi SVD sweep.
#ifdef EIGEN_JACOBI_SVD_BLOCK_SIZE
static constexpr Index kBlockSize = EIGEN_JACOBI_SVD_BLOCK_SIZE;
#else
static constexpr Index kBlockSize = 32;
#endif
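// Fixed-size scratch for the blocked sweep: (kBlockSize + 1)^2 scalars,
// i.e. roughly 4.3 KiB for float at the default kBlockSize of 32.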
Matrix<Scalar, kBlockSize + 1, kBlockSize + 1> m_blockBuffer;
};
template <typename MatrixType, int Options>
@@ -703,7 +717,7 @@ JacobiSVD<MatrixType, Options>& JacobiSVD<MatrixType, Options>::compute_impl(con
EIGEN_STATIC_ASSERT((std::is_same<typename Derived::Scalar, typename MatrixType::Scalar>::value),
"Input matrix must have the same Scalar type as the JacobiSVD object.");
- using std::abs;
+ using numext::abs;
allocate(matrix.rows(), matrix.cols(), computationOptions);
@@ -746,37 +760,73 @@ JacobiSVD<MatrixType, Options>& JacobiSVD<MatrixType, Options>::compute_impl(con
while (!finished) {
finished = true;
// do a sweep: for all index pairs (p,q), perform SVD of the corresponding 2x2 sub-matrix
// Threshold is hoisted before the double loop; it only needs updating when maxDiagEntry
// increases (which only happens inside the rotation block). Since maxDiagEntry is
// monotonically non-decreasing, a slightly stale threshold is conservative.
RealScalar threshold = numext::maxi<RealScalar>(considerAsZero, precision * maxDiagEntry);
{
// Sweep with optional blocking for large matrices.
// Use blocking when the matrix is large enough that individual left rotations
// (strided row operations on column-major data) cause significant cache misses.
// The threshold is derived from the L2 cache size: blocking becomes worthwhile
// when n exceeds sqrt(L2 / 4). We divide by sizeof(float) rather than sizeof(RealScalar)
// because the cache miss pattern depends on the number of columns accessed (one cache
// line per column), not the scalar size. This also makes the threshold appropriately
// more conservative for larger types where GEMM overhead is higher.
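// For example, a 256 KiB L2 gives sqrt(262144 / 4) = 256, so blocking
// engages for matrices with n >= 256.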
const Index n = diagSize();
#ifdef EIGEN_JACOBI_SVD_BLOCKING_THRESHOLD
const Index blockingThreshold = EIGEN_JACOBI_SVD_BLOCKING_THRESHOLD;
#else
const Index blockingThreshold =
static_cast<Index>(numext::sqrt(static_cast<RealScalar>(l2CacheSize() / sizeof(float))));
#endif
for (Index p = 1; p < diagSize(); ++p) {
for (Index q = 0; q < p; ++q) {
// if this 2x2 sub-matrix is not diagonal already...
// notice that this comparison will evaluate to false if any NaN is involved, ensuring that NaN's don't
// keep us iterating forever. Similarly, small denormal numbers are considered zero.
if (abs(m_workMatrix.coeff(p, q)) > threshold || abs(m_workMatrix.coeff(q, p)) > threshold) {
finished = false;
// perform SVD decomposition of 2x2 sub-matrix corresponding to indices p,q to make it diagonal
// the complex to real operation returns true if the updated 2x2 block is not already diagonal
if (internal::svd_precondition_2x2_block_to_be_real<MatrixType, Options>::run(m_workMatrix, *this, p, q,
maxDiagEntry)) {
JacobiRotation<RealScalar> j_left, j_right;
internal::real_2x2_jacobi_svd(m_workMatrix, p, q, &j_left, &j_right);
// accumulate resulting Jacobi rotations
m_workMatrix.applyOnTheLeft(p, q, j_left);
if (computeU()) m_matrixU.applyOnTheRight(p, q, j_left.transpose());
m_workMatrix.applyOnTheRight(p, q, j_right);
if (computeV()) m_matrixV.applyOnTheRight(p, q, j_right);
// keep track of the largest diagonal coefficient
maxDiagEntry = numext::maxi<RealScalar>(
maxDiagEntry, numext::maxi<RealScalar>(abs(m_workMatrix.coeff(p, p)), abs(m_workMatrix.coeff(q, q))));
threshold = numext::maxi<RealScalar>(considerAsZero, precision * maxDiagEntry);
if (n >= blockingThreshold) {
// The blocked sweep is in a separate EIGEN_DONT_INLINE method to prevent
// the blocking code from interfering with the compiler's optimization of
// the non-blocking scalar sweep below.
finished = !blocked_sweep(considerAsZero, precision, maxDiagEntry);
}
// Non-blocking paths: apply rotations individually. The real and complex
// paths are kept separate to avoid any codegen impact from the complex
// preconditioner on GCC's optimization of the real inner loop.
else
EIGEN_IF_CONSTEXPR(NumTraits<Scalar>::IsComplex) {
// Complex non-blocking sweep: condition each 2x2 block to be real before diagonalizing.
for (Index p = 1; p < n; ++p) {
for (Index q = 0; q < p; ++q) {
RealScalar threshold = numext::maxi<RealScalar>(considerAsZero, precision * maxDiagEntry);
if (abs(m_workMatrix.coeff(p, q)) > threshold || abs(m_workMatrix.coeff(q, p)) > threshold) {
finished = false;
if (internal::svd_precondition_2x2_block_to_be_real<MatrixType, Options>::run(m_workMatrix, *this, p, q,
maxDiagEntry)) {
JacobiRotation<RealScalar> j_left, j_right;
internal::real_2x2_jacobi_svd(m_workMatrix, p, q, &j_left, &j_right);
m_workMatrix.applyOnTheLeft(p, q, j_left);
if (computeU()) m_matrixU.applyOnTheRight(p, q, j_left.transpose());
m_workMatrix.applyOnTheRight(p, q, j_right);
if (computeV()) m_matrixV.applyOnTheRight(p, q, j_right);
maxDiagEntry = numext::maxi<RealScalar>(
maxDiagEntry,
numext::maxi<RealScalar>(abs(m_workMatrix.coeff(p, p)), abs(m_workMatrix.coeff(q, q))));
}
}
}
}
}
else {
// Real non-blocking sweep: diagonalize each 2x2 block directly.
RealScalar threshold = numext::maxi<RealScalar>(considerAsZero, precision * maxDiagEntry);
for (Index p = 1; p < n; ++p) {
for (Index q = 0; q < p; ++q) {
if (abs(m_workMatrix.coeff(p, q)) > threshold || abs(m_workMatrix.coeff(q, p)) > threshold) {
finished = false;
JacobiRotation<RealScalar> j_left, j_right;
internal::real_2x2_jacobi_svd(m_workMatrix, p, q, &j_left, &j_right);
m_workMatrix.applyOnTheLeft(p, q, j_left);
if (computeU()) m_matrixU.applyOnTheRight(p, q, j_left.transpose());
m_workMatrix.applyOnTheRight(p, q, j_right);
if (computeV()) m_matrixV.applyOnTheRight(p, q, j_right);
maxDiagEntry = numext::maxi<RealScalar>(
maxDiagEntry, numext::maxi<RealScalar>(abs(m_workMatrix.coeff(p, p)), abs(m_workMatrix.coeff(q, q))));
threshold = numext::maxi<RealScalar>(considerAsZero, precision * maxDiagEntry);
}
}
}
}
@@ -826,6 +876,210 @@ JacobiSVD<MatrixType, Options>& JacobiSVD<MatrixType, Options>::compute_impl(con
return *this;
}
// Blocked Jacobi SVD sweep for both real and complex scalar types. For large n,
// applying left rotations (row operations on column-major data) causes cache
// misses due to strided access. To mitigate this, we accumulate kBlockSize left
// rotations into a small dense matrix and apply them via a single GEMM to the
// contiguous row block q..q+kBlockSize-1 and the (possibly distant) row p.
// Right rotations and column scalings act on columns (contiguous in column-major)
// and are applied individually.
//
// For complex types, the 2x2 preconditioning (making the block real) involves
// complex left rotations and row scalings, which are also accumulated into the
// block matrix. Column scalings from preconditioning are applied directly.
//
// The accumulated rotation matrix has lower-triangular structure in its top-left
// kBlockSize x kBlockSize corner, which we exploit with triangularView.
//
// Returns true if any off-diagonal element exceeded the threshold (i.e. sweep
// is not yet converged).
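// As an illustration of the accumulation: each processed pair (p, q+qq)
// contributes a rotation J_qq acting on rows (qq, kBlockSize) of the buffer;
// the product accum = J_{kBlockSize-1} * ... * J_1 * J_0 (interleaved with any
// complex row scalings) is then applied once as W_rows <- accum * W_rows,
// replacing O(kBlockSize) strided row passes over W with a single GEMM.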
template <typename MatrixType, int Options>
EIGEN_DONT_INLINE bool JacobiSVD<MatrixType, Options>::blocked_sweep(RealScalar considerAsZero, RealScalar precision,
RealScalar& maxDiagEntry) {
using numext::abs;
using numext::sqrt;
const Index n = diagSize();
RealScalar threshold = numext::maxi<RealScalar>(considerAsZero, precision * maxDiagEntry);
bool notFinished = false;
for (Index p = 1; p < n; ++p) {
Index q = 0;
// Blocked loop: process kBlockSize pairs (p,q+qq) for qq=0..kBlockSize-1.
// We extract the relevant (kBlockSize+1) x (kBlockSize+1) submatrix of W
// into a small buffer, compute all rotations on the buffer, accumulate the
// left transformations into `accum`, and apply them in one GEMM at the end.
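// For example, with kBlockSize = 32 and p = 70, this loop handles q = 0 and
// q = 32 (covering pairs (70, 0..63)); the scalar loop below finishes q = 64..69.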
for (; q + kBlockSize <= p; q += kBlockSize) {
// Buffer = [ W(q:q+k, q:q+k)  W(q:q+k, p) ]
//          [ W(p, q:q+k)      W(p, p)     ]
m_blockBuffer.template topLeftCorner<kBlockSize, kBlockSize>() =
m_workMatrix.template block<kBlockSize, kBlockSize>(q, q);
m_blockBuffer.col(kBlockSize).template head<kBlockSize>() = m_workMatrix.col(p).template segment<kBlockSize>(q);
m_blockBuffer.row(kBlockSize).template head<kBlockSize>() = m_workMatrix.row(p).template segment<kBlockSize>(q);
m_blockBuffer(kBlockSize, kBlockSize) = m_workMatrix(p, p);
// Accumulator for left transformations: W <- accum * W.
// After processing qq pairs, accum's top-left kBlockSize x kBlockSize
// block is lower-triangular (each rotation only mixes row qq with row
// kBlockSize, so rows 0..qq-1 are unchanged).
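// For example, with kBlockSize = 2 a fully processed accumulator has the
// sparsity pattern
//   [ x 0 x ]
//   [ x x x ]
//   [ x x x ]
// whose top-left block is lower-triangular, which the GEMM below exploits
// via triangularView.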
Matrix<Scalar, kBlockSize + 1, kBlockSize + 1> accum;
accum.setIdentity(kBlockSize + 1, kBlockSize + 1);
bool blockDirty = false;
for (Index qq = 0; qq < kBlockSize; ++qq) {
if (abs(m_blockBuffer.coeff(kBlockSize, qq)) > threshold ||
abs(m_blockBuffer.coeff(qq, kBlockSize)) > threshold) {
notFinished = true;
blockDirty = true;
// Complex preconditioning: transform the 2x2 block
// [w_pp w_pq]   [buffer(kBlockSize, kBlockSize)  buffer(kBlockSize, qq)]
// [w_qp w_qq] = [buffer(qq, kBlockSize)          buffer(qq, qq)        ]
// to have real entries via unitary row/column operations, so
// real_2x2_jacobi_svd can be applied.
//
// Left operations (complex rotation, row scaling by e^{i*theta}) are
// accumulated into `accum` for deferred GEMM application.
// Right operations (column scaling) are applied directly since column
// ops are contiguous in column-major layout.
bool doRealSvd = true;
EIGEN_IF_CONSTEXPR(NumTraits<Scalar>::IsComplex) {
Scalar z;
// nn = ||(w_pp, w_qp)||_2, the norm of the first column of the 2x2 block.
RealScalar nn = sqrt(numext::abs2(m_blockBuffer.coeff(kBlockSize, kBlockSize)) +
numext::abs2(m_blockBuffer.coeff(qq, kBlockSize)));
if (numext::is_exactly_zero(nn)) {
// First column is zero => block is already upper triangular.
m_blockBuffer.coeffRef(kBlockSize, kBlockSize) = Scalar(0);
m_blockBuffer.coeffRef(qq, kBlockSize) = Scalar(0);
// Scale rows by z = e^{-i*arg(w)} to make remaining entries real.
if (abs(numext::imag(m_blockBuffer.coeff(kBlockSize, qq))) > considerAsZero) {
z = abs(m_blockBuffer.coeff(kBlockSize, qq)) / m_blockBuffer.coeff(kBlockSize, qq);
m_blockBuffer.row(kBlockSize) *= z;
accum.row(kBlockSize) *= z; // accumulate left op
if (computeU()) m_matrixU.col(p) *= numext::conj(z);
}
if (abs(numext::imag(m_blockBuffer.coeff(qq, qq))) > considerAsZero) {
z = abs(m_blockBuffer.coeff(qq, qq)) / m_blockBuffer.coeff(qq, qq);
m_blockBuffer.row(qq) *= z;
accum.row(qq) *= z; // accumulate left op
if (computeU()) m_matrixU.col(q + qq) *= numext::conj(z);
}
} else {
// Apply complex Givens rotation to zero out w_qp:
//   [ c  s ] [w_pp]   [nn]           conj(w_pp)        w_qp
//   [-s  c ] [w_qp] = [0 ]  with c = ----------,  s = ------
//                                        nn              nn
JacobiRotation<Scalar> rot;
rot.c() = numext::conj(m_blockBuffer.coeff(kBlockSize, kBlockSize)) / nn;
rot.s() = m_blockBuffer.coeff(qq, kBlockSize) / nn;
m_blockBuffer.applyOnTheLeft(kBlockSize, qq, rot);
accum.applyOnTheLeft(kBlockSize, qq, rot); // accumulate left op
if (computeU()) m_matrixU.applyOnTheRight(p, q + qq, rot.adjoint());
// Scale column qq by z = e^{-i*arg(w_pq)} to make w_pq real.
if (abs(numext::imag(m_blockBuffer.coeff(kBlockSize, qq))) > considerAsZero) {
z = abs(m_blockBuffer.coeff(kBlockSize, qq)) / m_blockBuffer.coeff(kBlockSize, qq);
m_blockBuffer.col(qq) *= z;
m_workMatrix.col(q + qq) *= z; // right op: apply directly
if (computeV()) m_matrixV.col(q + qq) *= z;
}
// Scale row qq by z = e^{-i*arg(w_qq)} to make w_qq real.
if (abs(numext::imag(m_blockBuffer.coeff(qq, qq))) > considerAsZero) {
z = abs(m_blockBuffer.coeff(qq, qq)) / m_blockBuffer.coeff(qq, qq);
m_blockBuffer.row(qq) *= z;
accum.row(qq) *= z; // accumulate left op
if (computeU()) m_matrixU.col(q + qq) *= numext::conj(z);
}
}
// Update maxDiagEntry from preconditioning.
maxDiagEntry = numext::maxi<RealScalar>(
maxDiagEntry, numext::maxi<RealScalar>(abs(m_blockBuffer.coeff(kBlockSize, kBlockSize)),
abs(m_blockBuffer.coeff(qq, qq))));
// Check if 2x2 block still needs diagonalizing.
RealScalar precondThreshold = numext::maxi<RealScalar>(considerAsZero, precision * maxDiagEntry);
doRealSvd = abs(m_blockBuffer.coeff(kBlockSize, qq)) > precondThreshold ||
abs(m_blockBuffer.coeff(qq, kBlockSize)) > precondThreshold;
}
if (doRealSvd) {
// Compute real 2x2 SVD: buffer_2x2 = j_left * diag * j_right^T.
JacobiRotation<RealScalar> j_left, j_right;
internal::real_2x2_jacobi_svd(m_blockBuffer, kBlockSize, qq, &j_left, &j_right);
m_blockBuffer.applyOnTheLeft(kBlockSize, qq, j_left);
m_blockBuffer.applyOnTheRight(kBlockSize, qq, j_right);
// Accumulate left rotation for deferred GEMM.
accum.applyOnTheLeft(kBlockSize, qq, j_left);
// Right rotation is a column op (contiguous): apply directly.
m_workMatrix.applyOnTheRight(p, q + qq, j_right);
if (computeU()) m_matrixU.applyOnTheRight(p, q + qq, j_left.transpose());
if (computeV()) m_matrixV.applyOnTheRight(p, q + qq, j_right);
maxDiagEntry = numext::maxi<RealScalar>(
maxDiagEntry, numext::maxi<RealScalar>(abs(m_blockBuffer.coeff(kBlockSize, kBlockSize)),
abs(m_blockBuffer.coeff(qq, qq))));
}
threshold = numext::maxi<RealScalar>(considerAsZero, precision * maxDiagEntry);
}
}
// Apply accumulated left rotations: W <- accum * W, via GEMM.
// When p == q + kBlockSize, all kBlockSize+1 rows are contiguous.
// Otherwise, rows q..q+k-1 and row p are non-adjacent; we split:
//   [Mq]      [L11  l12] [Mq]
//   [Mp]  <-  [l21  l22] [Mp]
// L11 is lower-triangular (exploited via triangularView).
if (blockDirty) {
if (p == q + kBlockSize) {
m_workMatrix.template middleRows<kBlockSize + 1>(q) =
accum * m_workMatrix.template middleRows<kBlockSize + 1>(q);
} else {
const auto L11 = accum.template topLeftCorner<kBlockSize, kBlockSize>();
const auto l12 = accum.col(kBlockSize).template head<kBlockSize>();
const auto l21 = accum.row(kBlockSize).template head<kBlockSize>();
const Scalar l22 = accum(kBlockSize, kBlockSize);
auto Mq = m_workMatrix.template middleRows<kBlockSize>(q);
auto Mp = m_workMatrix.row(p);
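// Copy row p first: both updates below must read the pre-update value of Mp.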
Matrix<Scalar, 1, Dynamic> Mp_save = Mp;
Mp.noalias() = l21 * Mq + l22 * Mp_save;
Mq = L11.template triangularView<Lower>() * Mq + l12 * Mp_save;
}
}
}
// Scalar loop for remaining pairs after blocked processing.
for (; q < p; ++q) {
if (abs(m_workMatrix.coeff(p, q)) > threshold || abs(m_workMatrix.coeff(q, p)) > threshold) {
notFinished = true;
bool doRealSvd = true;
EIGEN_IF_CONSTEXPR(NumTraits<Scalar>::IsComplex) {
doRealSvd = internal::svd_precondition_2x2_block_to_be_real<MatrixType, Options>::run(m_workMatrix, *this, p,
q, maxDiagEntry);
}
if (doRealSvd) {
JacobiRotation<RealScalar> j_left, j_right;
internal::real_2x2_jacobi_svd(m_workMatrix, p, q, &j_left, &j_right);
m_workMatrix.applyOnTheLeft(p, q, j_left);
if (computeU()) m_matrixU.applyOnTheRight(p, q, j_left.transpose());
m_workMatrix.applyOnTheRight(p, q, j_right);
if (computeV()) m_matrixV.applyOnTheRight(p, q, j_right);
maxDiagEntry = numext::maxi<RealScalar>(
maxDiagEntry, numext::maxi<RealScalar>(abs(m_workMatrix.coeff(p, p)), abs(m_workMatrix.coeff(q, q))));
}
threshold = numext::maxi<RealScalar>(considerAsZero, precision * maxDiagEntry);
}
}
}
return notFinished;
}
/** \svd_module
*
* \return the singular value decomposition of \c *this computed by two-sided

bench_jacobi_sweep.cpp

@@ -0,0 +1,62 @@
// Standalone JacobiSVD benchmark for block-size and threshold sweeps.
// Compile-time parameters via -D flags:
//   BLOCK_SIZE and BLOCKING_THRESHOLD (when not given, Eigen's built-in
//   defaults apply: a block size of 32 and an L2-cache-derived threshold).
//
// Build example:
// g++ -O3 -DNDEBUG -march=native -DBLOCK_SIZE=16 -DBLOCKING_THRESHOLD=128
// -I../.. bench_jacobi_sweep.cpp -lbenchmark -lbenchmark_main -lpthread
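// Run example (filter one scalar type):
// ./a.out --benchmark_filter=Jacobi_float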
#include <benchmark/benchmark.h>
// Override the blocking parameters before including Eigen headers.
#ifdef BLOCK_SIZE
#define EIGEN_JACOBI_SVD_BLOCK_SIZE BLOCK_SIZE
#endif
#ifdef BLOCKING_THRESHOLD
#define EIGEN_JACOBI_SVD_BLOCKING_THRESHOLD BLOCKING_THRESHOLD
#endif
#include <Eigen/Dense>
using namespace Eigen;
template <typename Scalar>
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
template <typename SVD>
EIGEN_DONT_INLINE void do_compute(SVD& svd, const typename SVD::MatrixType& A) {
svd.compute(A);
}
template <typename Scalar, int Options>
static void BM_JacobiSVD(benchmark::State& state) {
const Index n = state.range(0);
Mat<Scalar> A = Mat<Scalar>::Random(n, n);
JacobiSVD<Mat<Scalar>, Options> svd(n, n);
for (auto _ : state) {
do_compute(svd, A);
benchmark::DoNotOptimize(svd.singularValues().data());
}
state.SetItemsProcessed(state.iterations());
}
static void Sizes(::benchmark::Benchmark* b) {
for (int s : {32, 64, 128, 192, 256, 384, 512}) b->Args({s});
}
// float ValuesOnly
BENCHMARK((BM_JacobiSVD<float, 0>))->Apply(Sizes)->Name("Jacobi_float_VO");
// float ThinUV
BENCHMARK((BM_JacobiSVD<float, ComputeThinU | ComputeThinV>))->Apply(Sizes)->Name("Jacobi_float_UV");
// double ValuesOnly
BENCHMARK((BM_JacobiSVD<double, 0>))->Apply(Sizes)->Name("Jacobi_double_VO");
// double ThinUV
BENCHMARK((BM_JacobiSVD<double, ComputeThinU | ComputeThinV>))->Apply(Sizes)->Name("Jacobi_double_UV");
// complex<float> ValuesOnly
BENCHMARK((BM_JacobiSVD<std::complex<float>, 0>))->Apply(Sizes)->Name("Jacobi_cfloat_VO");
// complex<float> ThinUV
BENCHMARK((BM_JacobiSVD<std::complex<float>, ComputeThinU | ComputeThinV>))->Apply(Sizes)->Name("Jacobi_cfloat_UV");
// complex<double> ValuesOnly
BENCHMARK((BM_JacobiSVD<std::complex<double>, 0>))->Apply(Sizes)->Name("Jacobi_cdouble_VO");
// complex<double> ThinUV
BENCHMARK((BM_JacobiSVD<std::complex<double>, ComputeThinU | ComputeThinV>))->Apply(Sizes)->Name("Jacobi_cdouble_UV");