From a31de4778de4911ce4838a4cc042ae0276451499 Mon Sep 17 00:00:00 2001
From: Rasmus Munk Larsen <4643818-rmlarsen1@users.noreply.gitlab.com>
Date: Wed, 25 Feb 2026 10:03:05 -0800
Subject: [PATCH] Blocked Jacobi SVD sweep with L2-cache-adaptive threshold

libeigen/eigen!2206

Co-authored-by: Rasmus Munk Larsen
---
 Eigen/src/SVD/JacobiSVD.h             | 320 +++++++++++++++++++++++---
 benchmarks/SVD/bench_jacobi_sweep.cpp |  62 +++++
 2 files changed, 349 insertions(+), 33 deletions(-)
 create mode 100644 benchmarks/SVD/bench_jacobi_sweep.cpp
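Note for reviewers: the change is transparent to callers; no new public API is
introduced. A minimal caller-side sketch (standard JacobiSVD usage; the blocked
path is selected internally once n reaches the L2-derived threshold):

  #include <Eigen/Dense>
  #include <iostream>

  int main() {
    // For n at or above the L2-derived threshold, compute() takes the blocked
    // sweep path; smaller problems use the scalar sweep. Same results either way.
    Eigen::MatrixXd A = Eigen::MatrixXd::Random(512, 512);
    Eigen::JacobiSVD<Eigen::MatrixXd, Eigen::ComputeThinU | Eigen::ComputeThinV> svd(A);
    std::cout << "largest singular value: " << svd.singularValues()(0) << "\n";
  }
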
diff --git a/Eigen/src/SVD/JacobiSVD.h b/Eigen/src/SVD/JacobiSVD.h
index 1b08bb392..e9c0f0df2 100644
--- a/Eigen/src/SVD/JacobiSVD.h
+++ b/Eigen/src/SVD/JacobiSVD.h
@@ -372,8 +372,8 @@ struct svd_precondition_2x2_block_to_be_real<MatrixType, Options, true> {
   typedef typename MatrixType::Scalar Scalar;
   typedef typename MatrixType::RealScalar RealScalar;
   static bool run(typename SVD::WorkMatrixType& work_matrix, SVD& svd, Index p, Index q, RealScalar& maxDiagEntry) {
-    using std::abs;
-    using std::sqrt;
+    using numext::abs;
+    using numext::sqrt;
     Scalar z;
     JacobiRotation<Scalar> rot;
     RealScalar n = sqrt(numext::abs2(work_matrix.coeff(p, p)) + numext::abs2(work_matrix.coeff(q, p)));
@@ -653,6 +653,12 @@ class JacobiSVD : public SVDBase<JacobiSVD<MatrixType_, Options_> > {
   template <typename Derived>
   JacobiSVD& compute_impl(const MatrixBase<Derived>& matrix, unsigned int computationOptions);
 
+  // Blocked sweep for the Jacobi SVD (works for both real and complex scalars).
+  // Extracted into a separate EIGEN_DONT_INLINE method to prevent the blocking
+  // code from interfering with the compiler's optimization of the non-blocking
+  // scalar sweep.
+  EIGEN_DONT_INLINE bool blocked_sweep(RealScalar considerAsZero, RealScalar precision, RealScalar& maxDiagEntry);
+
 protected:
   using Base::m_computationOptions;
   using Base::m_computeFullU;
@@ -686,6 +692,14 @@ class JacobiSVD : public SVDBase<JacobiSVD<MatrixType_, Options_> > {
   internal::qr_preconditioner_impl<MatrixType, Options, QRPreconditioner, internal::PreconditionIfMoreRowsThanCols>
       m_qr_precond_morerows;
   WorkMatrixType m_workMatrix;
+
+  // Blocking parameters for the Jacobi SVD sweep.
+#ifdef EIGEN_JACOBI_SVD_BLOCK_SIZE
+  static constexpr Index kBlockSize = EIGEN_JACOBI_SVD_BLOCK_SIZE;
+#else
+  static constexpr Index kBlockSize = 32;
+#endif
+  Matrix<Scalar, kBlockSize + 1, kBlockSize + 1> m_blockBuffer;
 };
 
 template <typename MatrixType, int Options>
@@ -703,7 +717,7 @@ JacobiSVD<MatrixType, Options>& JacobiSVD<MatrixType, Options>::compute_impl(co
   EIGEN_STATIC_ASSERT((std::is_same<typename Derived::Scalar, Scalar>::value),
                       Input matrix must have the same Scalar type as the JacobiSVD object.);
 
-  using std::abs;
+  using numext::abs;
 
   allocate(matrix.rows(), matrix.cols(), computationOptions);
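Tuning note: both knobs introduced here are compile-time macros and take effect
only if defined before the first Eigen header is included. A minimal sketch, with
illustrative values (the macro names are the ones added by this patch; 16 and 128
are examples, not recommendations):

  // Hypothetical tuning translation unit; values are illustrative only.
  #define EIGEN_JACOBI_SVD_BLOCK_SIZE 16           // kBlockSize (library default: 32)
  #define EIGEN_JACOBI_SVD_BLOCKING_THRESHOLD 128  // minimum n for the blocked path
  #include <Eigen/Dense>
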
@@ -746,37 +760,73 @@ JacobiSVD<MatrixType, Options>& JacobiSVD<MatrixType, Options>::compute_impl(co
   while (!finished) {
     finished = true;
 
-    // do a sweep: for all index pairs (p,q), perform SVD of the corresponding 2x2 sub-matrix
-    // Threshold is hoisted before the double loop; it only needs updating when maxDiagEntry
-    // increases (which only happens inside the rotation block). Since maxDiagEntry is
-    // monotonically non-decreasing, a slightly stale threshold is conservative.
-    RealScalar threshold = numext::maxi<RealScalar>(considerAsZero, precision * maxDiagEntry);
-
-    for (Index p = 1; p < diagSize(); ++p) {
-      for (Index q = 0; q < p; ++q) {
-        // if this 2x2 sub-matrix is not diagonal already...
-        // notice that this comparison will evaluate to false if any NaN is involved, ensuring that NaN's don't
-        // keep us iterating forever. Similarly, small denormal numbers are considered zero.
-        if (abs(m_workMatrix.coeff(p, q)) > threshold || abs(m_workMatrix.coeff(q, p)) > threshold) {
-          finished = false;
-          // perform SVD decomposition of 2x2 sub-matrix corresponding to indices p,q to make it diagonal
-          // the complex to real operation returns true if the updated 2x2 block is not already diagonal
-          if (internal::svd_precondition_2x2_block_to_be_real<MatrixType, Options>::run(m_workMatrix, *this, p, q,
-                                                                                        maxDiagEntry)) {
-            JacobiRotation<RealScalar> j_left, j_right;
-            internal::real_2x2_jacobi_svd(m_workMatrix, p, q, &j_left, &j_right);
-
-            // accumulate resulting Jacobi rotations
-            m_workMatrix.applyOnTheLeft(p, q, j_left);
-            if (computeU()) m_matrixU.applyOnTheRight(p, q, j_left.transpose());
-
-            m_workMatrix.applyOnTheRight(p, q, j_right);
-            if (computeV()) m_matrixV.applyOnTheRight(p, q, j_right);
-
-            // keep track of the largest diagonal coefficient
-            maxDiagEntry = numext::maxi<RealScalar>(
-                maxDiagEntry, numext::maxi<RealScalar>(abs(m_workMatrix.coeff(p, p)), abs(m_workMatrix.coeff(q, q))));
-            threshold = numext::maxi<RealScalar>(considerAsZero, precision * maxDiagEntry);
-          }
-        }
-      }
-    }
+    {
+      // Sweep with optional blocking for large matrices.
+      // Use blocking when the matrix is large enough that individual left rotations
+      // (strided row operations on column-major data) cause significant cache misses.
+      // The threshold is derived from the L2 cache size: blocking becomes worthwhile
+      // when n exceeds sqrt(L2 / 4). We divide by sizeof(float) rather than
+      // sizeof(RealScalar) because the cache-miss pattern depends on the number of
+      // columns accessed (one cache line per column), not on the scalar size. This
+      // also makes the threshold appropriately more conservative for larger types,
+      // where GEMM overhead is higher.
+      const Index n = diagSize();
+#ifdef EIGEN_JACOBI_SVD_BLOCKING_THRESHOLD
+      const Index blockingThreshold = EIGEN_JACOBI_SVD_BLOCKING_THRESHOLD;
+#else
+      const Index blockingThreshold =
+          static_cast<Index>(numext::sqrt(static_cast<float>(l2CacheSize() / sizeof(float))));
+#endif
+
+      if (n >= blockingThreshold) {
+        // The blocked sweep is in a separate EIGEN_DONT_INLINE method to prevent
+        // the blocking code from interfering with the compiler's optimization of
+        // the non-blocking scalar sweeps below.
+        finished = !blocked_sweep(considerAsZero, precision, maxDiagEntry);
+      }
+      // Non-blocking paths: apply rotations individually. The real and complex
+      // paths are kept separate to avoid any codegen impact from the complex
+      // preconditioner on GCC's optimization of the real inner loop.
+      else
+        EIGEN_IF_CONSTEXPR(NumTraits<Scalar>::IsComplex) {
+          // Complex non-blocking sweep: condition each 2x2 block to be real before diagonalizing.
+          for (Index p = 1; p < n; ++p) {
+            for (Index q = 0; q < p; ++q) {
+              RealScalar threshold = numext::maxi<RealScalar>(considerAsZero, precision * maxDiagEntry);
+              if (abs(m_workMatrix.coeff(p, q)) > threshold || abs(m_workMatrix.coeff(q, p)) > threshold) {
+                finished = false;
+                if (internal::svd_precondition_2x2_block_to_be_real<MatrixType, Options>::run(m_workMatrix, *this,
+                                                                                              p, q, maxDiagEntry)) {
+                  JacobiRotation<RealScalar> j_left, j_right;
+                  internal::real_2x2_jacobi_svd(m_workMatrix, p, q, &j_left, &j_right);
+                  m_workMatrix.applyOnTheLeft(p, q, j_left);
+                  if (computeU()) m_matrixU.applyOnTheRight(p, q, j_left.transpose());
+                  m_workMatrix.applyOnTheRight(p, q, j_right);
+                  if (computeV()) m_matrixV.applyOnTheRight(p, q, j_right);
+                  maxDiagEntry = numext::maxi<RealScalar>(
+                      maxDiagEntry,
+                      numext::maxi<RealScalar>(abs(m_workMatrix.coeff(p, p)), abs(m_workMatrix.coeff(q, q))));
+                }
+              }
+            }
+          }
+        }
+      else {
+        // Real non-blocking sweep: diagonalize each 2x2 block directly. The threshold
+        // is hoisted out of the double loop; since maxDiagEntry is monotonically
+        // non-decreasing, a slightly stale threshold is conservative.
+        RealScalar threshold = numext::maxi<RealScalar>(considerAsZero, precision * maxDiagEntry);
+        for (Index p = 1; p < n; ++p) {
+          for (Index q = 0; q < p; ++q) {
+            if (abs(m_workMatrix.coeff(p, q)) > threshold || abs(m_workMatrix.coeff(q, p)) > threshold) {
+              finished = false;
+              JacobiRotation<RealScalar> j_left, j_right;
+              internal::real_2x2_jacobi_svd(m_workMatrix, p, q, &j_left, &j_right);
+              m_workMatrix.applyOnTheLeft(p, q, j_left);
+              if (computeU()) m_matrixU.applyOnTheRight(p, q, j_left.transpose());
+              m_workMatrix.applyOnTheRight(p, q, j_right);
+              if (computeV()) m_matrixV.applyOnTheRight(p, q, j_right);
+              maxDiagEntry = numext::maxi<RealScalar>(
+                  maxDiagEntry,
+                  numext::maxi<RealScalar>(abs(m_workMatrix.coeff(p, p)), abs(m_workMatrix.coeff(q, q))));
+              threshold = numext::maxi<RealScalar>(considerAsZero, precision * maxDiagEntry);
+            }
+          }
+        }
+      }
+    }
   }
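To make the threshold formula concrete: with a 1 MiB L2, blocking kicks in at
n >= sqrt(2^20 / 4) = 512; with 256 KiB, at n >= 256. A standalone sketch of the
same arithmetic (l2CacheSize() is Eigen's existing cache-size query; the reported
value is machine-dependent):

  #include <Eigen/Core>
  #include <cmath>
  #include <cstdio>

  int main() {
    // Mirrors the in-library computation: one cache line is touched per column,
    // so the byte budget is divided by sizeof(float) regardless of Scalar.
    std::ptrdiff_t l2 = Eigen::l2CacheSize();
    double budget = static_cast<double>(l2) / sizeof(float);
    Eigen::Index threshold = static_cast<Eigen::Index>(std::sqrt(budget));
    std::printf("L2 = %td bytes -> blocked sweep for n >= %td\n", l2, threshold);
  }
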
@@ -826,6 +876,210 @@ JacobiSVD<MatrixType, Options>& JacobiSVD<MatrixType, Options>::compute_impl(co
   return *this;
 }
 
+// Blocked Jacobi SVD sweep for both real and complex scalar types. For large n,
+// applying left rotations (row operations on column-major data) causes cache
+// misses due to strided access. To mitigate this, we accumulate kBlockSize left
+// rotations into a small dense matrix and apply them via a single GEMM to the
+// contiguous row block q..q+kBlockSize-1 and the (possibly distant) row p.
+// Right rotations and column scalings act on columns (contiguous in column-major
+// storage) and are applied individually.
+//
+// For complex types, the 2x2 preconditioning (making the block real) involves
+// complex left rotations and row scalings, which are also accumulated into the
+// block matrix. Column scalings from preconditioning are applied directly.
+//
+// The accumulated rotation matrix has lower-triangular structure in its top-left
+// kBlockSize x kBlockSize corner, which we exploit with triangularView.
+//
+// Returns true if any off-diagonal element exceeded the threshold (i.e. the
+// sweep has not yet converged).
+template <typename MatrixType, int Options>
+EIGEN_DONT_INLINE bool JacobiSVD<MatrixType, Options>::blocked_sweep(RealScalar considerAsZero, RealScalar precision,
+                                                                     RealScalar& maxDiagEntry) {
+  using numext::abs;
+  using numext::sqrt;
+  const Index n = diagSize();
+  RealScalar threshold = numext::maxi<RealScalar>(considerAsZero, precision * maxDiagEntry);
+  bool notFinished = false;
+
+  for (Index p = 1; p < n; ++p) {
+    Index q = 0;
+
+    // Blocked loop: process kBlockSize pairs (p, q+qq) for qq = 0..kBlockSize-1.
+    // We extract the relevant (kBlockSize+1) x (kBlockSize+1) submatrix of W
+    // into a small buffer, compute all rotations on the buffer, accumulate the
+    // left transformations into `accum`, and apply them in one GEMM at the end.
+    for (; q + kBlockSize <= p; q += kBlockSize) {
+      // Buffer = [ W(q:q+k, q:q+k)  W(q:q+k, p) ]
+      //          [ W(p, q:q+k)      W(p, p)     ]
+      m_blockBuffer.template topLeftCorner<kBlockSize, kBlockSize>() =
+          m_workMatrix.template block<kBlockSize, kBlockSize>(q, q);
+      m_blockBuffer.col(kBlockSize).template head<kBlockSize>() = m_workMatrix.col(p).template segment<kBlockSize>(q);
+      m_blockBuffer.row(kBlockSize).template head<kBlockSize>() = m_workMatrix.row(p).template segment<kBlockSize>(q);
+      m_blockBuffer(kBlockSize, kBlockSize) = m_workMatrix(p, p);
+
+      // Accumulator for left transformations: W <- accum * W.
+      // After processing qq pairs, accum's top-left kBlockSize x kBlockSize
+      // block is lower-triangular (each rotation only mixes row qq with row
+      // kBlockSize, so rows 0..qq-1 are unchanged).
+      Matrix<Scalar, kBlockSize + 1, kBlockSize + 1> accum;
+      accum.setIdentity(kBlockSize + 1, kBlockSize + 1);
+      bool blockDirty = false;
+
+      for (Index qq = 0; qq < kBlockSize; ++qq) {
+        if (abs(m_blockBuffer.coeff(kBlockSize, qq)) > threshold ||
+            abs(m_blockBuffer.coeff(qq, kBlockSize)) > threshold) {
+          notFinished = true;
+          blockDirty = true;
+
+          // Complex preconditioning: transform the 2x2 block
+          //   [w_pp w_pq]   [buffer(kBlockSize, kBlockSize)  buffer(kBlockSize, qq)]
+          //   [w_qp w_qq] = [buffer(qq, kBlockSize)          buffer(qq, qq)        ]
+          // to have real entries via unitary row/column operations, so that
+          // real_2x2_jacobi_svd can be applied.
+          //
+          // Left operations (complex rotation, row scaling by e^{i*theta}) are
+          // accumulated into `accum` for deferred GEMM application.
+          // Right operations (column scaling) are applied directly, since column
+          // ops are contiguous in column-major layout.
+          bool doRealSvd = true;
+          EIGEN_IF_CONSTEXPR(NumTraits<Scalar>::IsComplex) {
+            Scalar z;
+            // nn = ||(w_pp, w_qp)||_2, the norm of the first column of the 2x2 block.
+            RealScalar nn = sqrt(numext::abs2(m_blockBuffer.coeff(kBlockSize, kBlockSize)) +
+                                 numext::abs2(m_blockBuffer.coeff(qq, kBlockSize)));
+
+            if (numext::is_exactly_zero(nn)) {
+              // The first column is zero => the block is already upper triangular.
+              m_blockBuffer.coeffRef(kBlockSize, kBlockSize) = Scalar(0);
+              m_blockBuffer.coeffRef(qq, kBlockSize) = Scalar(0);
+
+              // Scale rows by z = e^{-i*arg(w)} to make the remaining entries real.
+              if (abs(numext::imag(m_blockBuffer.coeff(kBlockSize, qq))) > considerAsZero) {
+                z = abs(m_blockBuffer.coeff(kBlockSize, qq)) / m_blockBuffer.coeff(kBlockSize, qq);
+                m_blockBuffer.row(kBlockSize) *= z;
+                accum.row(kBlockSize) *= z;  // accumulate left op
+                if (computeU()) m_matrixU.col(p) *= numext::conj(z);
+              }
+              if (abs(numext::imag(m_blockBuffer.coeff(qq, qq))) > considerAsZero) {
+                z = abs(m_blockBuffer.coeff(qq, qq)) / m_blockBuffer.coeff(qq, qq);
+                m_blockBuffer.row(qq) *= z;
+                accum.row(qq) *= z;  // accumulate left op
+                if (computeU()) m_matrixU.col(q + qq) *= numext::conj(z);
+              }
+            } else {
+              // Apply a complex Givens rotation to zero out w_qp:
+              //   [ c  s] [w_pp]   [nn]       conj(w_pp)       w_qp
+              //   [-s  c] [w_qp] = [ 0],  c = ----------,  s = ----
+              //                                   nn             nn
+              JacobiRotation<Scalar> rot;
+              rot.c() = numext::conj(m_blockBuffer.coeff(kBlockSize, kBlockSize)) / nn;
+              rot.s() = m_blockBuffer.coeff(qq, kBlockSize) / nn;
+              m_blockBuffer.applyOnTheLeft(kBlockSize, qq, rot);
+              accum.applyOnTheLeft(kBlockSize, qq, rot);  // accumulate left op
+              if (computeU()) m_matrixU.applyOnTheRight(p, q + qq, rot.adjoint());
+
+              // Scale column qq by z = e^{-i*arg(w_pq)} to make w_pq real.
+              if (abs(numext::imag(m_blockBuffer.coeff(kBlockSize, qq))) > considerAsZero) {
+                z = abs(m_blockBuffer.coeff(kBlockSize, qq)) / m_blockBuffer.coeff(kBlockSize, qq);
+                m_blockBuffer.col(qq) *= z;
+                m_workMatrix.col(q + qq) *= z;  // right op: apply directly
+                if (computeV()) m_matrixV.col(q + qq) *= z;
+              }
+              // Scale row qq by z = e^{-i*arg(w_qq)} to make w_qq real.
+              if (abs(numext::imag(m_blockBuffer.coeff(qq, qq))) > considerAsZero) {
+                z = abs(m_blockBuffer.coeff(qq, qq)) / m_blockBuffer.coeff(qq, qq);
+                m_blockBuffer.row(qq) *= z;
+                accum.row(qq) *= z;  // accumulate left op
+                if (computeU()) m_matrixU.col(q + qq) *= numext::conj(z);
+              }
+            }
+            // Update maxDiagEntry from the preconditioning.
+            maxDiagEntry = numext::maxi<RealScalar>(
+                maxDiagEntry, numext::maxi<RealScalar>(abs(m_blockBuffer.coeff(kBlockSize, kBlockSize)),
+                                                       abs(m_blockBuffer.coeff(qq, qq))));
+            // Check whether the 2x2 block still needs diagonalizing.
+            RealScalar precondThreshold = numext::maxi<RealScalar>(considerAsZero, precision * maxDiagEntry);
+            doRealSvd = abs(m_blockBuffer.coeff(kBlockSize, qq)) > precondThreshold ||
+                        abs(m_blockBuffer.coeff(qq, kBlockSize)) > precondThreshold;
+          }
+
+          if (doRealSvd) {
+            // Compute the real 2x2 SVD: buffer_2x2 = j_left * diag * j_right^T.
+            JacobiRotation<RealScalar> j_left, j_right;
+            internal::real_2x2_jacobi_svd(m_blockBuffer, kBlockSize, qq, &j_left, &j_right);
+            m_blockBuffer.applyOnTheLeft(kBlockSize, qq, j_left);
+            m_blockBuffer.applyOnTheRight(kBlockSize, qq, j_right);
+
+            // Accumulate the left rotation for the deferred GEMM.
+            accum.applyOnTheLeft(kBlockSize, qq, j_left);
+
+            // The right rotation is a column op (contiguous): apply it directly.
+            m_workMatrix.applyOnTheRight(p, q + qq, j_right);
+            if (computeU()) m_matrixU.applyOnTheRight(p, q + qq, j_left.transpose());
+            if (computeV()) m_matrixV.applyOnTheRight(p, q + qq, j_right);
+
+            maxDiagEntry = numext::maxi<RealScalar>(
+                maxDiagEntry, numext::maxi<RealScalar>(abs(m_blockBuffer.coeff(kBlockSize, kBlockSize)),
+                                                       abs(m_blockBuffer.coeff(qq, qq))));
+          }
+          threshold = numext::maxi<RealScalar>(considerAsZero, precision * maxDiagEntry);
+        }
+      }
+
+      // Apply the accumulated left rotations: W <- accum * W, via GEMM.
+      // When p == q + kBlockSize, all kBlockSize+1 rows are contiguous.
+      // Otherwise, rows q..q+kBlockSize-1 and row p are non-adjacent; we split:
+      //   [Mq]    [L11 l12] [Mq]
+      //   [Mp] <- [l21 l22] [Mp]
+      // L11 is lower-triangular (exploited via triangularView).
+      if (blockDirty) {
+        if (p == q + kBlockSize) {
+          m_workMatrix.template middleRows<kBlockSize + 1>(q) =
+              accum * m_workMatrix.template middleRows<kBlockSize + 1>(q);
+        } else {
+          const auto L11 = accum.template topLeftCorner<kBlockSize, kBlockSize>();
+          const auto l12 = accum.col(kBlockSize).template head<kBlockSize>();
+          const auto l21 = accum.row(kBlockSize).template head<kBlockSize>();
+          const Scalar l22 = accum(kBlockSize, kBlockSize);
+          auto Mq = m_workMatrix.template middleRows<kBlockSize>(q);
+          auto Mp = m_workMatrix.row(p);
+          Matrix<Scalar, 1, Dynamic> Mp_save = Mp;
+          Mp.noalias() = l21 * Mq + l22 * Mp_save;
+          Mq = L11.template triangularView<Lower>() * Mq + l12 * Mp_save;
+        }
+      }
+    }
+
+    // Scalar loop for the remaining pairs after blocked processing.
+    for (; q < p; ++q) {
+      if (abs(m_workMatrix.coeff(p, q)) > threshold || abs(m_workMatrix.coeff(q, p)) > threshold) {
+        notFinished = true;
+
+        bool doRealSvd = true;
+        EIGEN_IF_CONSTEXPR(NumTraits<Scalar>::IsComplex) {
+          doRealSvd = internal::svd_precondition_2x2_block_to_be_real<MatrixType, Options>::run(m_workMatrix, *this,
+                                                                                                p, q, maxDiagEntry);
+        }
+
+        if (doRealSvd) {
+          JacobiRotation<RealScalar> j_left, j_right;
+          internal::real_2x2_jacobi_svd(m_workMatrix, p, q, &j_left, &j_right);
+          m_workMatrix.applyOnTheLeft(p, q, j_left);
+          if (computeU()) m_matrixU.applyOnTheRight(p, q, j_left.transpose());
+          m_workMatrix.applyOnTheRight(p, q, j_right);
+          if (computeV()) m_matrixV.applyOnTheRight(p, q, j_right);
+          maxDiagEntry = numext::maxi<RealScalar>(
+              maxDiagEntry, numext::maxi<RealScalar>(abs(m_workMatrix.coeff(p, p)), abs(m_workMatrix.coeff(q, q))));
+        }
+        threshold = numext::maxi<RealScalar>(considerAsZero, precision * maxDiagEntry);
+      }
+    }
+  }
+
+  return notFinished;
+}
+
 /** \svd_module
   *
   * \return the singular value decomposition of \c *this computed by two-sided
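The two-block GEMM split above can be exercised in isolation. A self-contained
sketch (sizes k and cols are illustrative; accum, Mq, Mp mirror the names in the
comment, not Eigen internals) checking that the split form reproduces the plain
product accum * [Mq; Mp]:

  #include <Eigen/Dense>
  #include <iostream>

  int main() {
    using namespace Eigen;
    const int k = 4, cols = 7;  // illustrative; the patch uses kBlockSize
    // Build an accum with the same structure: lower-triangular top-left
    // corner plus a dense last row and column.
    MatrixXd accum = MatrixXd::Identity(k + 1, k + 1);
    MatrixXd L = MatrixXd::Random(k, k);
    accum.topLeftCorner(k, k) = L.triangularView<Lower>();
    accum.col(k).setRandom();
    accum.row(k).setRandom();

    MatrixXd Mq = MatrixXd::Random(k, cols);
    RowVectorXd Mp = RowVectorXd::Random(cols);

    // Reference: apply accum to the stacked rows directly.
    MatrixXd stacked(k + 1, cols);
    stacked << Mq, Mp;
    MatrixXd ref = accum * stacked;

    // Split form, as in blocked_sweep: row p is updated from a saved copy.
    RowVectorXd Mp_save = Mp;
    Mp = accum.row(k).head(k) * Mq + accum(k, k) * Mp_save;
    Mq = accum.topLeftCorner(k, k).triangularView<Lower>() * Mq + accum.col(k).head(k) * Mp_save;

    std::cout << "max error: "
              << (ref.topRows(k) - Mq).cwiseAbs().maxCoeff() +
                     (ref.row(k) - Mp).cwiseAbs().maxCoeff()
              << "\n";
  }
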
diff --git a/benchmarks/SVD/bench_jacobi_sweep.cpp b/benchmarks/SVD/bench_jacobi_sweep.cpp
new file mode 100644
index 000000000..45d0a7c55
--- /dev/null
+++ b/benchmarks/SVD/bench_jacobi_sweep.cpp
@@ -0,0 +1,62 @@
+// Standalone JacobiSVD benchmark for block-size and threshold sweeps.
+// Compile-time parameters via -D flags:
+//   BLOCK_SIZE, BLOCKING_THRESHOLD
+// When a flag is unset, Eigen's built-in defaults apply (kBlockSize = 32 and
+// the L2-cache-derived blocking threshold).
+//
+// Build example:
+//   g++ -O3 -DNDEBUG -march=native -DBLOCK_SIZE=16 -DBLOCKING_THRESHOLD=128 \
+//       -I../.. bench_jacobi_sweep.cpp -lbenchmark -lbenchmark_main -lpthread
+
+#include <benchmark/benchmark.h>
+
+// Override the block size and threshold before including Eigen headers.
+#ifdef BLOCK_SIZE
+#define EIGEN_JACOBI_SVD_BLOCK_SIZE BLOCK_SIZE
+#endif
+#ifdef BLOCKING_THRESHOLD
+#define EIGEN_JACOBI_SVD_BLOCKING_THRESHOLD BLOCKING_THRESHOLD
+#endif
+
+#include <Eigen/Dense>
+
+using namespace Eigen;
+
+template <typename Scalar>
+using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+
+template <typename SVD>
+EIGEN_DONT_INLINE void do_compute(SVD& svd, const typename SVD::MatrixType& A) {
+  svd.compute(A);
+}
+
+template <typename Scalar, int Options>
+static void BM_JacobiSVD(benchmark::State& state) {
+  const Index n = state.range(0);
+  Mat<Scalar> A = Mat<Scalar>::Random(n, n);
+  JacobiSVD<Mat<Scalar>, Options> svd(n, n);
+  for (auto _ : state) {
+    do_compute(svd, A);
+    benchmark::DoNotOptimize(svd.singularValues().data());
+  }
+  state.SetItemsProcessed(state.iterations());
+}
+
+static void Sizes(::benchmark::Benchmark* b) {
+  for (int s : {32, 64, 128, 192, 256, 384, 512}) b->Args({s});
+}
+
+// float ValuesOnly
+BENCHMARK((BM_JacobiSVD<float, 0>))->Apply(Sizes)->Name("Jacobi_float_VO");
+// float ThinUV
+BENCHMARK((BM_JacobiSVD<float, ComputeThinU | ComputeThinV>))->Apply(Sizes)->Name("Jacobi_float_UV");
+// double ValuesOnly
+BENCHMARK((BM_JacobiSVD<double, 0>))->Apply(Sizes)->Name("Jacobi_double_VO");
+// double ThinUV
+BENCHMARK((BM_JacobiSVD<double, ComputeThinU | ComputeThinV>))->Apply(Sizes)->Name("Jacobi_double_UV");
+// complex<float> ValuesOnly
+BENCHMARK((BM_JacobiSVD<std::complex<float>, 0>))->Apply(Sizes)->Name("Jacobi_cfloat_VO");
+// complex<float> ThinUV
+BENCHMARK((BM_JacobiSVD<std::complex<float>, ComputeThinU | ComputeThinV>))->Apply(Sizes)->Name("Jacobi_cfloat_UV");
+// complex<double> ValuesOnly
+BENCHMARK((BM_JacobiSVD<std::complex<double>, 0>))->Apply(Sizes)->Name("Jacobi_cdouble_VO");
+// complex<double> ThinUV
+BENCHMARK((BM_JacobiSVD<std::complex<double>, ComputeThinU | ComputeThinV>))->Apply(Sizes)->Name("Jacobi_cdouble_UV");
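A companion sanity check one might run alongside the timing sweep (not part of
the patch; a single-TU sketch that compares the blocked Jacobi path against
Eigen's BDCSVD on the same matrix, rather than a true A/B of the two sweep
paths, which would require separate translation units built with different
flags):

  #include <Eigen/Dense>
  #include <iostream>

  int main() {
    using namespace Eigen;
    const Index n = 512;  // large enough for the blocked sweep on typical L2 sizes
    MatrixXd A = MatrixXd::Random(n, n);
    JacobiSVD<MatrixXd> jsvd(A);  // values only
    BDCSVD<MatrixXd> bsvd(A);
    double err = (jsvd.singularValues() - bsvd.singularValues()).norm() /
                 bsvd.singularValues().norm();
    std::cout << "relative difference vs BDCSVD: " << err << "\n";
  }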