Add benchmarks for unsupported modules and extend supported benchmarks

libeigen/eigen!2179

Closes #3036

Co-authored-by: Rasmus Munk Larsen <rmlarsen@gmail.com>
This commit is contained in:
Rasmus Munk Larsen
2026-02-24 17:12:33 -08:00
parent fa567f6bcd
commit 16da0279f1
33 changed files with 2320 additions and 10 deletions

View File

@@ -1 +1,2 @@
eigen_add_benchmark(bench_cholesky bench_cholesky.cpp)
eigen_add_benchmark(bench_cholesky_double bench_cholesky.cpp DEFINITIONS SCALAR=double)

View File

@@ -4,7 +4,11 @@
using namespace Eigen;
#ifndef SCALAR
#define SCALAR float
#endif
typedef SCALAR Scalar;
static void BM_LDLT(benchmark::State& state) {
  int n = state.range(0);

View File

@@ -15,3 +15,5 @@ eigen_add_benchmark(bench_diagonal bench_diagonal.cpp)
eigen_add_benchmark(bench_triangular_product bench_triangular_product.cpp)
eigen_add_benchmark(bench_selfadjoint_product bench_selfadjoint_product.cpp)
eigen_add_benchmark(bench_construction bench_construction.cpp)
eigen_add_benchmark(bench_fixed_size bench_fixed_size.cpp)
eigen_add_benchmark(bench_fixed_size_double bench_fixed_size.cpp DEFINITIONS SCALAR=double)

View File

@@ -0,0 +1,123 @@
// Benchmarks for fixed-size matrix operations (2x2, 3x3, 4x4).
// Critical for PCL, ROS, Sophus, Drake which use small matrices extensively.
#include <benchmark/benchmark.h>
#include <Eigen/Core>
#include <Eigen/LU>
using namespace Eigen;
#ifndef SCALAR
#define SCALAR float
#endif
typedef SCALAR Scalar;
// --- Fixed-size GEMM ---
// Measures an N x N fixed-size matrix product, reporting throughput in GFLOPS.
template <int N>
static void BM_FixedGemm(benchmark::State& state) {
  using Mat = Matrix<Scalar, N, N>;
  const Mat lhs = Mat::Random();
  const Mat rhs = Mat::Random();
  Mat dst;
  for (auto _ : state) {
    dst.noalias() = lhs * rhs;
    benchmark::DoNotOptimize(dst.data());
    benchmark::ClobberMemory();
  }
  // 2*N^3 flops per product (N^3 multiplies + N^3 adds).
  state.counters["GFLOPS"] =
      benchmark::Counter(2.0 * N * N * N, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
}
// --- Fixed-size inverse ---
// Measures inversion of a small fixed-size matrix.
template <int N>
static void BM_FixedInverse(benchmark::State& state) {
  using Mat = Matrix<Scalar, N, N>;
  Mat m = Mat::Random();
  // Symmetrize and shift the spectrum away from zero so the inverse exists.
  m = m * m.transpose() + Mat::Identity();
  Mat inv;
  for (auto _ : state) {
    inv = m.inverse();
    benchmark::DoNotOptimize(inv.data());
    benchmark::ClobberMemory();
  }
}
// --- Fixed-size determinant ---
// Measures determinant computation for a small fixed-size matrix.
template <int N>
static void BM_FixedDeterminant(benchmark::State& state) {
  typedef Matrix<Scalar, N, N> Mat;
  Mat a = Mat::Random();
  Scalar result;
  for (auto _ : state) {
    result = a.determinant();
    // Pass the value itself (not its address) so the computed determinant is
    // what the optimizer barrier observes each iteration, matching the
    // data()-based barriers used by the other benchmarks in this file.
    benchmark::DoNotOptimize(result);
    benchmark::ClobberMemory();
  }
}
// --- Batch transform: Matrix4 * Matrix<4,N> ---
// Applies one 4x4 transform to N column vectors at once (a 4xN matrix), the
// homogeneous-coordinates batch-transform pattern.
static void BM_BatchTransform4xN(benchmark::State& state) {
  const int numPoints = state.range(0);
  using Mat4 = Matrix<Scalar, 4, 4>;
  using Mat4X = Matrix<Scalar, 4, Dynamic>;
  const Mat4 xform = Mat4::Random();
  const Mat4X pts = Mat4X::Random(4, numPoints);
  Mat4X out(4, numPoints);
  for (auto _ : state) {
    out.noalias() = xform * pts;
    benchmark::DoNotOptimize(out.data());
    benchmark::ClobberMemory();
  }
  // 2 flops per multiply-add across a 4x4 * 4xN product.
  state.counters["GFLOPS"] = benchmark::Counter(
      2.0 * 4 * 4 * numPoints, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
}
// --- Fixed 3x3 batch operations (common in point cloud processing) ---
// Multiplies many independent 3x3 matrix pairs per iteration.
static void BM_Batch3x3Gemm(benchmark::State& state) {
  const int batch = state.range(0);
  using Mat3 = Matrix<Scalar, 3, 3>;
  std::vector<Mat3> lhs(batch), rhs(batch), dst(batch);
  for (int i = 0; i < batch; ++i) {
    lhs[i] = Mat3::Random();
    rhs[i] = Mat3::Random();
  }
  for (auto _ : state) {
    for (int i = 0; i < batch; ++i) {
      dst[i].noalias() = lhs[i] * rhs[i];
    }
    benchmark::DoNotOptimize(dst.data());
    benchmark::ClobberMemory();
  }
  // 2*27 flops per 3x3 product.
  state.counters["GFLOPS"] =
      benchmark::Counter(2.0 * 27 * batch, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
}
// --- Benchmark registrations ---
// Fixed-size GEMM
BENCHMARK(BM_FixedGemm<2>)->Name("FixedGemm_2x2");
BENCHMARK(BM_FixedGemm<3>)->Name("FixedGemm_3x3");
BENCHMARK(BM_FixedGemm<4>)->Name("FixedGemm_4x4");
// Fixed-size inverse
BENCHMARK(BM_FixedInverse<2>)->Name("FixedInverse_2x2");
BENCHMARK(BM_FixedInverse<3>)->Name("FixedInverse_3x3");
BENCHMARK(BM_FixedInverse<4>)->Name("FixedInverse_4x4");
// Fixed-size determinant
BENCHMARK(BM_FixedDeterminant<2>)->Name("FixedDet_2x2");
BENCHMARK(BM_FixedDeterminant<3>)->Name("FixedDet_3x3");
BENCHMARK(BM_FixedDeterminant<4>)->Name("FixedDet_4x4");
// Batch 4xN transform (argument = number of points)
BENCHMARK(BM_BatchTransform4xN)->Arg(1)->Arg(4)->Arg(8)->Arg(16)->Arg(64);
// Batch 3x3 GEMM (argument = batch size)
BENCHMARK(BM_Batch3x3Gemm)->Arg(100)->Arg(1000)->Arg(10000);

View File

@@ -1,2 +1,3 @@
eigen_add_benchmark(bench_eigensolver bench_eigensolver.cpp)
eigen_add_benchmark(bench_eigensolver_double bench_eigensolver.cpp DEFINITIONS SCALAR=double)
eigen_add_benchmark(bench_eig33 bench_eig33.cpp)

View File

@@ -5,7 +5,11 @@
using namespace Eigen;
#ifndef SCALAR
#define SCALAR float
#endif
typedef SCALAR Scalar;
static void BM_SelfAdjointEigenSolver(benchmark::State& state) {
  int n = state.range(0);

View File

@@ -33,11 +33,20 @@ static void BM_FFT(benchmark::State& state) {
      benchmark::Counter(mflops_per_iter, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
}
static void FFTSizes(::benchmark::Benchmark* b) {
  for (int n : {64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 65536}) {
    b->Arg(n);
  }
  // Non-power-of-2 sizes.
  b->Arg(1000);
  b->Arg(5000);
}
BENCHMARK(BM_FFT<std::complex<float>, true>)->Apply(FFTSizes);
BENCHMARK(BM_FFT<std::complex<float>, false>)->Apply(FFTSizes);
BENCHMARK(BM_FFT<float, true>)->Apply(FFTSizes);
BENCHMARK(BM_FFT<float, false>)->Apply(FFTSizes);
BENCHMARK(BM_FFT<std::complex<double>, true>)->Apply(FFTSizes);
BENCHMARK(BM_FFT<std::complex<double>, false>)->Apply(FFTSizes);
BENCHMARK(BM_FFT<double, true>)->Apply(FFTSizes);
BENCHMARK(BM_FFT<double, false>)->Apply(FFTSizes);

View File

@@ -1,3 +1,4 @@
eigen_add_benchmark(bench_spmv bench_spmv.cpp)
eigen_add_benchmark(bench_spmm bench_spmm.cpp)
eigen_add_benchmark(bench_sparse_transpose bench_sparse_transpose.cpp)
eigen_add_benchmark(bench_sparse_solvers bench_sparse_solvers.cpp)

View File

@@ -0,0 +1,182 @@
// Benchmarks for sparse decomposition solvers.
// Tests SimplicialLLT, SimplicialLDLT, SparseQR, SparseLU, CG, BiCGSTAB.
#include <benchmark/benchmark.h>
#include <Eigen/Sparse>
#include <Eigen/SparseCholesky>
#include <Eigen/SparseLU>
#include <Eigen/SparseQR>
#include <Eigen/IterativeLinearSolvers>
#include <Eigen/OrderingMethods>
using namespace Eigen;
typedef double Scalar;
typedef SparseMatrix<Scalar> SpMat;
typedef Matrix<Scalar, Dynamic, 1> Vec;
// Generate a SPD banded matrix (Laplacian-like).
// Builds an n x n symmetric banded matrix (Laplacian-like): off-diagonal
// entries -1/(1+|i-j|) within `bandwidth` of the diagonal, and a diagonal set
// to the negated off-diagonal row sum plus 1, which makes the matrix strictly
// diagonally dominant and therefore symmetric positive definite.
static SpMat generateSPD(int n, int bandwidth) {
SpMat A(n, n);
std::vector<Triplet<Scalar>> trips;
// Upper bound on nonzeros: the full band plus the diagonal for each row.
trips.reserve(n * (2 * bandwidth + 1));
for (int i = 0; i < n; ++i) {
Scalar diag = 0;
for (int j = std::max(0, i - bandwidth); j < std::min(n, i + bandwidth + 1); ++j) {
if (i != j) {
Scalar val = -1.0 / (1 + std::abs(i - j));
trips.emplace_back(i, j, val);
diag -= val;
}
}
// +1 keeps the diagonal strictly larger than the off-diagonal magnitude sum.
trips.emplace_back(i, i, diag + 1.0);
}
A.setFromTriplets(trips.begin(), trips.end());
return A;
}
// Generate a general (non-symmetric) sparse matrix with diagonal dominance.
// Builds an n x n non-symmetric banded matrix: upper-triangular band entries
// are scaled by 1.5 relative to the lower ones (breaking symmetry), and the
// diagonal is the sum of off-diagonal magnitudes plus 1, so the matrix stays
// strictly diagonally dominant (hence nonsingular).
static SpMat generateGeneral(int n, int bandwidth) {
SpMat A(n, n);
std::vector<Triplet<Scalar>> trips;
// Upper bound on nonzeros: the full band plus the diagonal for each row.
trips.reserve(n * (2 * bandwidth + 1));
for (int i = 0; i < n; ++i) {
Scalar diag = 0;
for (int j = std::max(0, i - bandwidth); j < std::min(n, i + bandwidth + 1); ++j) {
if (i != j) {
Scalar val = -0.5 / (1 + std::abs(i - j));
// Asymmetry: entries above the diagonal are 1.5x those below.
if (j > i) val *= 1.5;
trips.emplace_back(i, j, val);
diag += std::abs(val);
}
}
trips.emplace_back(i, i, diag + 1.0);
}
A.setFromTriplets(trips.begin(), trips.end());
return A;
}
// --- SimplicialLLT ---
static void BM_SimplicialLLT(benchmark::State& state) {
int n = state.range(0);
int bw = state.range(1);
SpMat A = generateSPD(n, bw);
Vec b = Vec::Random(n);
for (auto _ : state) {
SimplicialLLT<SpMat> solver(A);
Vec x = solver.solve(b);
benchmark::DoNotOptimize(x.data());
benchmark::ClobberMemory();
}
}
// --- SimplicialLDLT ---
static void BM_SimplicialLDLT(benchmark::State& state) {
int n = state.range(0);
int bw = state.range(1);
SpMat A = generateSPD(n, bw);
Vec b = Vec::Random(n);
for (auto _ : state) {
SimplicialLDLT<SpMat> solver(A);
Vec x = solver.solve(b);
benchmark::DoNotOptimize(x.data());
benchmark::ClobberMemory();
}
}
// --- SparseLU ---
static void BM_SparseLU(benchmark::State& state) {
int n = state.range(0);
int bw = state.range(1);
SpMat A = generateGeneral(n, bw);
Vec b = Vec::Random(n);
for (auto _ : state) {
SparseLU<SpMat, COLAMDOrdering<int>> solver;
solver.compute(A);
Vec x = solver.solve(b);
benchmark::DoNotOptimize(x.data());
benchmark::ClobberMemory();
}
}
// --- SparseQR ---
static void BM_SparseQR(benchmark::State& state) {
int n = state.range(0);
int bw = state.range(1);
SpMat A = generateGeneral(n, bw);
Vec b = Vec::Random(n);
for (auto _ : state) {
SparseQR<SpMat, COLAMDOrdering<int>> solver;
solver.compute(A);
Vec x = solver.solve(b);
benchmark::DoNotOptimize(x.data());
benchmark::ClobberMemory();
}
}
// --- ConjugateGradient (SPD) ---
static void BM_CG(benchmark::State& state) {
int n = state.range(0);
int bw = state.range(1);
SpMat A = generateSPD(n, bw);
Vec b = Vec::Random(n);
ConjugateGradient<SpMat> solver;
solver.setMaxIterations(1000);
solver.setTolerance(1e-10);
solver.compute(A);
for (auto _ : state) {
Vec x = solver.solve(b);
benchmark::DoNotOptimize(x.data());
benchmark::ClobberMemory();
}
state.counters["iterations"] = solver.iterations();
}
// --- BiCGSTAB (general) ---
static void BM_BiCGSTAB(benchmark::State& state) {
int n = state.range(0);
int bw = state.range(1);
SpMat A = generateGeneral(n, bw);
Vec b = Vec::Random(n);
BiCGSTAB<SpMat> solver;
solver.setMaxIterations(1000);
solver.setTolerance(1e-10);
solver.compute(A);
for (auto _ : state) {
Vec x = solver.solve(b);
benchmark::DoNotOptimize(x.data());
benchmark::ClobberMemory();
}
state.counters["iterations"] = solver.iterations();
}
// Problem sizes as {matrix dimension, half-bandwidth} pairs.
static void DirectSolverSizes(::benchmark::Benchmark* b) {
  const int dims[] = {1000, 5000, 10000, 50000};
  const int bands[] = {5, 20};
  for (int n : dims)
    for (int bw : bands) b->Args({n, bw});
}
// Iterative solvers use a reduced dimension grid.
static void IterativeSolverSizes(::benchmark::Benchmark* b) {
  const int dims[] = {1000, 10000, 50000};
  const int bands[] = {5, 20};
  for (int n : dims)
    for (int bw : bands) b->Args({n, bw});
}
// Direct factorizations get the larger size grid; iterative solvers the
// reduced one (see DirectSolverSizes / IterativeSolverSizes above).
BENCHMARK(BM_SimplicialLLT)->Apply(DirectSolverSizes);
BENCHMARK(BM_SimplicialLDLT)->Apply(DirectSolverSizes);
BENCHMARK(BM_SparseLU)->Apply(DirectSolverSizes);
BENCHMARK(BM_SparseQR)->Apply(DirectSolverSizes);
BENCHMARK(BM_CG)->Apply(IterativeSolverSizes);
BENCHMARK(BM_BiCGSTAB)->Apply(IterativeSolverSizes);

View File

@@ -0,0 +1 @@
eigen_add_benchmark(bench_autodiff bench_autodiff.cpp)

View File

@@ -0,0 +1,177 @@
// Benchmarks for Eigen AutoDiff module.
// Compares AutoDiff Jacobian computation against NumericalDiff and hand-coded Jacobians.
#include <benchmark/benchmark.h>
#include <Eigen/Core>
#include <unsupported/Eigen/AutoDiff>
#include <unsupported/Eigen/NumericalDiff>
using namespace Eigen;
// --- Small functor: Rosenbrock-like (2 inputs -> 2 outputs) ---
// Rosenbrock-style residual: f0 = 1 - x0, f1 = 10*(x1 - x0^2).
// operator() is templated on the scalar type so the same code can be
// evaluated with AutoDiffScalar as well as plain double; the typedefs and
// compile-time enums follow the functor conventions used by AutoDiffJacobian.
struct SmallFunctor {
typedef Matrix<double, 2, 1> InputType;
typedef Matrix<double, 2, 1> ValueType;
typedef Matrix<double, 2, 2> JacobianType;
enum { InputsAtCompileTime = 2, ValuesAtCompileTime = 2 };
template <typename T>
void operator()(const Matrix<T, 2, 1>& x, Matrix<T, 2, 1>* v) const {
(*v)(0) = T(1) - x(0);
(*v)(1) = T(10) * (x(1) - x(0) * x(0));
}
};
// --- Medium functor: chain of operations (6 inputs -> 6 outputs) ---
// 6-input/6-output functor mixing transcendental (sin, cos, exp, sqrt, log)
// and polynomial terms, so the derivative chain exercises a variety of
// elementary-function rules. Templated on the scalar for AutoDiff evaluation.
struct MediumFunctor {
typedef Matrix<double, 6, 1> InputType;
typedef Matrix<double, 6, 1> ValueType;
typedef Matrix<double, 6, 6> JacobianType;
enum { InputsAtCompileTime = 6, ValuesAtCompileTime = 6 };
template <typename T>
void operator()(const Matrix<T, 6, 1>& x, Matrix<T, 6, 1>* v) const {
(*v)(0) = sin(x(0)) * cos(x(1)) + x(2) * x(2);
(*v)(1) = exp(x(1) * T(0.1)) + x(3);
(*v)(2) = x(0) * x(2) - x(4) * x(5);
// sqrt(x^2 + 1) is smooth everywhere (no domain issues for random inputs).
(*v)(3) = sqrt(x(3) * x(3) + T(1)) + x(0);
(*v)(4) = x(4) * x(4) + x(5) * x(5) + x(0) * x(1);
// log(x^2 + 1) likewise stays in-domain for any real input.
(*v)(5) = log(x(2) * x(2) + T(1)) + x(3) * x(4);
}
};
// --- Dynamic-size functor (N inputs -> N outputs) ---
// Runtime-sized chained Rosenbrock residual: f0 = 1 - x0 and
// fi = 10*(xi - x_{i-1}^2) for i >= 1. inputs()/values() report the runtime
// dimension n_, as required for Dynamic-size functors.
struct DynamicFunctor {
typedef Matrix<double, Dynamic, 1> InputType;
typedef Matrix<double, Dynamic, 1> ValueType;
typedef Matrix<double, Dynamic, Dynamic> JacobianType;
// Problem dimension, fixed at construction.
const int n_;
DynamicFunctor(int n) : n_(n) {}
enum { InputsAtCompileTime = Dynamic, ValuesAtCompileTime = Dynamic };
int inputs() const { return n_; }
int values() const { return n_; }
template <typename T>
void operator()(const Matrix<T, Dynamic, 1>& x, Matrix<T, Dynamic, 1>* v) const {
// Output may arrive unsized; make it n_-dimensional before writing.
v->resize(n_);
(*v)(0) = T(1) - x(0);
for (int i = 1; i < n_; ++i) {
(*v)(i) = T(10) * (x(i) - x(i - 1) * x(i - 1));
}
}
};
// Wrappers for NumericalDiff compatibility: NumericalDiff expects a Scalar
// typedef, inputs()/values() accessors, and an int-returning operator()
// taking the value by reference. These adapters forward to the base
// functor's templated operator() and always report success (0).
struct SmallFunctorND : SmallFunctor {
typedef double Scalar;
int inputs() const { return 2; }
int values() const { return 2; }
int operator()(const InputType& x, ValueType& v) const {
SmallFunctor::operator()(x, &v);
return 0;
}
};
struct MediumFunctorND : MediumFunctor {
typedef double Scalar;
int inputs() const { return 6; }
int values() const { return 6; }
int operator()(const InputType& x, ValueType& v) const {
MediumFunctor::operator()(x, &v);
return 0;
}
};
// --- AutoDiff Jacobian benchmarks ---
// Times forward-mode AutoDiff evaluation of value + dense Jacobian at a
// random point.
template <typename Functor>
static void BM_AutoDiffJacobian(benchmark::State& state, Functor func) {
  AutoDiffJacobian<Functor> autodiff(func);
  const typename Functor::InputType input = Functor::InputType::Random();
  typename Functor::ValueType value;
  typename Functor::JacobianType jacobian;
  for (auto _ : state) {
    autodiff(input, &value, &jacobian);
    benchmark::DoNotOptimize(jacobian.data());
    benchmark::ClobberMemory();
  }
}
// --- Dynamic AutoDiff Jacobian ---
// Times AutoDiff Jacobian evaluation for the runtime-sized chained
// Rosenbrock functor at dimension n.
static void BM_AutoDiffJacobian_Dynamic(benchmark::State& state) {
  const int n = state.range(0);
  DynamicFunctor functor(n);
  AutoDiffJacobian<DynamicFunctor> autodiff(functor);
  const VectorXd input = VectorXd::Random(n);
  VectorXd value(n);
  MatrixXd jacobian(n, n);
  for (auto _ : state) {
    autodiff(input, &value, &jacobian);
    benchmark::DoNotOptimize(jacobian.data());
    benchmark::ClobberMemory();
  }
}
// --- NumericalDiff benchmarks ---
// Times finite-difference Jacobian computation via NumericalDiff at a random
// point, for comparison against the AutoDiff variants above.
template <typename Functor>
static void BM_NumericalDiffJacobian(benchmark::State& state, Functor func) {
  NumericalDiff<Functor> numdiff(func);
  const typename Functor::InputType input = Functor::InputType::Random();
  typename Functor::JacobianType jacobian;
  for (auto _ : state) {
    numdiff.df(input, jacobian);
    benchmark::DoNotOptimize(jacobian.data());
    benchmark::ClobberMemory();
  }
}
// --- Hand-coded Jacobian (Rosenbrock) for comparison ---
// Baseline: analytic Jacobian of the SmallFunctor residual.
// Row 0: d(1 - x0)        = [-1, 0]
// Row 1: d(10*(x1-x0^2))  = [-20*x0, 10]
static void BM_HandCoded_Small(benchmark::State& state) {
Vector2d x = Vector2d::Random();
Matrix2d jac;
for (auto _ : state) {
jac(0, 0) = -1;
jac(0, 1) = 0;
jac(1, 0) = -20 * x(0);
jac(1, 1) = 10;
benchmark::DoNotOptimize(jac.data());
benchmark::ClobberMemory();
}
}
// --- Scalar AutoDiff evaluation (no Jacobian, just forward pass) ---
// Measures scalar forward-mode AutoDiff overhead with no Jacobian assembly:
// accumulates sum_i (x_i^2 + sin(x_i)) while carrying an n-dimensional
// derivative vector through every operation. NOTE: the accumulator (and its
// derivative vector) is re-created each iteration, so per-iteration
// allocation is part of the measurement.
static void BM_AutoDiffScalar_Eval(benchmark::State& state) {
int n = state.range(0);
using ADScalar = AutoDiffScalar<VectorXd>;
VectorXd x = VectorXd::Random(n);
for (auto _ : state) {
ADScalar sum(0.0, VectorXd::Zero(n));
for (int i = 0; i < n; ++i) {
// xi carries value x(i) with unit derivative in direction i.
ADScalar xi(x(i), n, i);
sum += xi * xi + sin(xi);
}
benchmark::DoNotOptimize(sum.value());
benchmark::DoNotOptimize(sum.derivatives().data());
benchmark::ClobberMemory();
}
}
// Registrations: AutoDiff vs. NumericalDiff vs. hand-coded baseline.
BENCHMARK_CAPTURE(BM_AutoDiffJacobian, Small, SmallFunctor());
BENCHMARK_CAPTURE(BM_AutoDiffJacobian, Medium, MediumFunctor());
BENCHMARK(BM_AutoDiffJacobian_Dynamic)->Arg(2)->Arg(6)->Arg(20)->Arg(50)->Arg(100);
BENCHMARK_CAPTURE(BM_NumericalDiffJacobian, Small, SmallFunctorND());
BENCHMARK_CAPTURE(BM_NumericalDiffJacobian, Medium, MediumFunctorND());
BENCHMARK(BM_HandCoded_Small);
BENCHMARK(BM_AutoDiffScalar_Eval)->Arg(2)->Arg(6)->Arg(20)->Arg(50)->Arg(100);

View File

@@ -0,0 +1,35 @@
# Standalone build for benchmarks of Eigen's unsupported modules.
cmake_minimum_required(VERSION 3.10)
project(EigenUnsupportedBenchmarks CXX)
find_package(benchmark REQUIRED)
find_package(Threads REQUIRED)
# Root of the Eigen source tree (two levels up from this directory).
set(EIGEN_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../..")
# Helper: add a Google Benchmark target (mirrors benchmarks/CMakeLists.txt).
# eigen_add_benchmark(name source [LIBRARIES lib1 lib2 ...] [DEFINITIONS def1 def2 ...])
function(eigen_add_benchmark name source)
  cmake_parse_arguments(BENCH "" "" "LIBRARIES;DEFINITIONS" ${ARGN})
  if(NOT IS_ABSOLUTE "${source}")
    set(source "${CMAKE_CURRENT_SOURCE_DIR}/${source}")
  endif()
  add_executable(${name} ${source})
  target_include_directories(${name} PRIVATE ${EIGEN_SOURCE_DIR})
  target_link_libraries(${name} PRIVATE benchmark::benchmark benchmark::benchmark_main
                                        Threads::Threads)
  if(BENCH_LIBRARIES)
    target_link_libraries(${name} PRIVATE ${BENCH_LIBRARIES})
  endif()
  # NDEBUG is a preprocessor definition, not a compiler flag, so declare it as
  # one; -O3 is GCC/Clang-specific, so skip it for MSVC.
  target_compile_definitions(${name} PRIVATE NDEBUG)
  if(NOT MSVC)
    target_compile_options(${name} PRIVATE -O3)
  endif()
  if(BENCH_DEFINITIONS)
    target_compile_definitions(${name} PRIVATE ${BENCH_DEFINITIONS})
  endif()
endfunction()
# One subdirectory per benchmarked unsupported module.
add_subdirectory(Tensor)
add_subdirectory(MatrixFunctions)
add_subdirectory(SpecialFunctions)
add_subdirectory(AutoDiff)
add_subdirectory(Splines)
add_subdirectory(IterativeSolvers)
add_subdirectory(KroneckerProduct)

View File

@@ -0,0 +1 @@
eigen_add_benchmark(bench_iterative_solvers bench_iterative_solvers.cpp)

View File

@@ -0,0 +1,209 @@
// Benchmarks for unsupported iterative solvers: GMRES, MINRES, IDRS, IDRSTABL, BiCGSTABL, DGMRES.
#include <benchmark/benchmark.h>
#include <Eigen/Sparse>
#include <Eigen/IterativeLinearSolvers>
#include <unsupported/Eigen/IterativeSolvers>
using namespace Eigen;
typedef double Scalar;
typedef SparseMatrix<Scalar> SpMat;
typedef Matrix<Scalar, Dynamic, 1> Vec;
// Generate a SPD sparse matrix (Laplacian-like with diagonal dominance).
// Builds an n x n symmetric banded matrix (Laplacian-like): off-diagonal
// entries -1/(1+|i-j|) within `bandwidth` of the diagonal, and a diagonal set
// to the negated off-diagonal row sum plus 1, making the matrix strictly
// diagonally dominant and therefore symmetric positive definite.
static SpMat generateSPD(int n, int bandwidth) {
SpMat A(n, n);
std::vector<Triplet<Scalar>> trips;
// Upper bound on nonzeros: the full band plus the diagonal for each row.
trips.reserve(n * (2 * bandwidth + 1));
for (int i = 0; i < n; ++i) {
Scalar diag = 0;
for (int j = std::max(0, i - bandwidth); j < std::min(n, i + bandwidth + 1); ++j) {
if (i != j) {
Scalar val = -1.0 / (1 + std::abs(i - j));
trips.emplace_back(i, j, val);
diag -= val;
}
}
trips.emplace_back(i, i, diag + 1.0);
}
A.setFromTriplets(trips.begin(), trips.end());
return A;
}
// Generate a general (non-symmetric) sparse matrix.
// Builds an n x n non-symmetric banded matrix: upper-triangular band entries
// are scaled by 1.5 relative to the lower ones, and the diagonal is the sum
// of off-diagonal magnitudes plus 1 so the matrix stays strictly diagonally
// dominant (hence nonsingular).
static SpMat generateGeneral(int n, int bandwidth) {
SpMat A(n, n);
std::vector<Triplet<Scalar>> trips;
// Upper bound on nonzeros: the full band plus the diagonal for each row.
trips.reserve(n * (2 * bandwidth + 1));
for (int i = 0; i < n; ++i) {
Scalar diag = 0;
for (int j = std::max(0, i - bandwidth); j < std::min(n, i + bandwidth + 1); ++j) {
if (i != j) {
Scalar val = -0.5 / (1 + std::abs(i - j));
if (j > i) val *= 1.5; // asymmetry
trips.emplace_back(i, j, val);
diag += std::abs(val);
}
}
trips.emplace_back(i, i, diag + 1.0); // diagonal dominance
}
A.setFromTriplets(trips.begin(), trips.end());
return A;
}
// --- GMRES ---
static void BM_GMRES(benchmark::State& state) {
int n = state.range(0);
int bw = state.range(1);
SpMat A = generateGeneral(n, bw);
Vec b = Vec::Random(n);
GMRES<SpMat> solver;
solver.setMaxIterations(1000);
solver.setTolerance(1e-10);
solver.compute(A);
for (auto _ : state) {
Vec x = solver.solve(b);
benchmark::DoNotOptimize(x.data());
benchmark::ClobberMemory();
}
state.counters["iterations"] = solver.iterations();
}
// --- DGMRES ---
static void BM_DGMRES(benchmark::State& state) {
int n = state.range(0);
int bw = state.range(1);
SpMat A = generateGeneral(n, bw);
Vec b = Vec::Random(n);
DGMRES<SpMat> solver;
solver.setMaxIterations(1000);
solver.setTolerance(1e-10);
solver.compute(A);
for (auto _ : state) {
Vec x = solver.solve(b);
benchmark::DoNotOptimize(x.data());
benchmark::ClobberMemory();
}
state.counters["iterations"] = solver.iterations();
}
// --- MINRES (SPD matrices) ---
static void BM_MINRES(benchmark::State& state) {
int n = state.range(0);
int bw = state.range(1);
SpMat A = generateSPD(n, bw);
Vec b = Vec::Random(n);
MINRES<SpMat> solver;
solver.setMaxIterations(1000);
solver.setTolerance(1e-10);
solver.compute(A);
for (auto _ : state) {
Vec x = solver.solve(b);
benchmark::DoNotOptimize(x.data());
benchmark::ClobberMemory();
}
state.counters["iterations"] = solver.iterations();
}
// --- IDRS ---
static void BM_IDRS(benchmark::State& state) {
int n = state.range(0);
int bw = state.range(1);
SpMat A = generateGeneral(n, bw);
Vec b = Vec::Random(n);
IDRS<SpMat> solver;
solver.setMaxIterations(1000);
solver.setTolerance(1e-10);
solver.compute(A);
for (auto _ : state) {
Vec x = solver.solve(b);
benchmark::DoNotOptimize(x.data());
benchmark::ClobberMemory();
}
state.counters["iterations"] = solver.iterations();
}
// --- BiCGSTABL ---
static void BM_BiCGSTABL(benchmark::State& state) {
int n = state.range(0);
int bw = state.range(1);
SpMat A = generateGeneral(n, bw);
Vec b = Vec::Random(n);
BiCGSTABL<SpMat> solver;
solver.setMaxIterations(1000);
solver.setTolerance(1e-10);
solver.compute(A);
for (auto _ : state) {
Vec x = solver.solve(b);
benchmark::DoNotOptimize(x.data());
benchmark::ClobberMemory();
}
state.counters["iterations"] = solver.iterations();
}
// --- Compare with CG (supported module, SPD only) ---
static void BM_CG_Reference(benchmark::State& state) {
int n = state.range(0);
int bw = state.range(1);
SpMat A = generateSPD(n, bw);
Vec b = Vec::Random(n);
ConjugateGradient<SpMat> solver;
solver.setMaxIterations(1000);
solver.setTolerance(1e-10);
solver.compute(A);
for (auto _ : state) {
Vec x = solver.solve(b);
benchmark::DoNotOptimize(x.data());
benchmark::ClobberMemory();
}
state.counters["iterations"] = solver.iterations();
}
// --- Compare with BiCGSTAB (supported module, general) ---
static void BM_BiCGSTAB_Reference(benchmark::State& state) {
int n = state.range(0);
int bw = state.range(1);
SpMat A = generateGeneral(n, bw);
Vec b = Vec::Random(n);
BiCGSTAB<SpMat> solver;
solver.setMaxIterations(1000);
solver.setTolerance(1e-10);
solver.compute(A);
for (auto _ : state) {
Vec x = solver.solve(b);
benchmark::DoNotOptimize(x.data());
benchmark::ClobberMemory();
}
state.counters["iterations"] = solver.iterations();
}
// Size grid shared by every solver: {dimension, half-bandwidth} pairs.
static void SolverSizes(::benchmark::Benchmark* b) {
  const int dims[] = {1000, 10000, 100000};
  const int bands[] = {5, 20};
  for (int n : dims)
    for (int bw : bands) b->Args({n, bw});
}
// Register every solver (unsupported plus the two supported references) over
// the same size grid so results are directly comparable.
BENCHMARK(BM_GMRES)->Apply(SolverSizes);
BENCHMARK(BM_DGMRES)->Apply(SolverSizes);
BENCHMARK(BM_MINRES)->Apply(SolverSizes);
BENCHMARK(BM_IDRS)->Apply(SolverSizes);
BENCHMARK(BM_BiCGSTABL)->Apply(SolverSizes);
BENCHMARK(BM_CG_Reference)->Apply(SolverSizes);
BENCHMARK(BM_BiCGSTAB_Reference)->Apply(SolverSizes);

View File

@@ -0,0 +1 @@
eigen_add_benchmark(bench_kronecker bench_kronecker.cpp)

View File

@@ -0,0 +1,83 @@
// Benchmarks for Kronecker product (dense and sparse).
#include <benchmark/benchmark.h>
#include <Eigen/Core>
#include <Eigen/Sparse>
#include <unsupported/Eigen/KroneckerProduct>
using namespace Eigen;
typedef double Scalar;
typedef Matrix<Scalar, Dynamic, Dynamic> Mat;
typedef SparseMatrix<Scalar> SpMat;
// --- Dense Kronecker product ---
// Dense Kronecker product of an na x na and an nb x nb matrix; the result is
// (na*nb) x (na*nb) and is materialized each iteration via eval().
static void BM_KroneckerDense(benchmark::State& state) {
  const int na = state.range(0);
  const int nb = state.range(1);
  const Mat lhs = Mat::Random(na, na);
  const Mat rhs = Mat::Random(nb, nb);
  for (auto _ : state) {
    Mat prod = kroneckerProduct(lhs, rhs).eval();
    benchmark::DoNotOptimize(prod.data());
    benchmark::ClobberMemory();
  }
  // Reported as the output dimension (na*nb), not the element count.
  const int outSize = na * nb;
  state.counters["output_size"] = outSize;
}
// --- Sparse Kronecker product ---
static void BM_KroneckerSparse(benchmark::State& state) {
int na = state.range(0);
int nb = state.range(1);
// Create sparse identity-like matrices with some fill.
SpMat A(na, na);
SpMat B(nb, nb);
std::vector<Triplet<Scalar>> tripsA, tripsB;
for (int i = 0; i < na; ++i) {
tripsA.emplace_back(i, i, 2.0);
if (i + 1 < na) {
tripsA.emplace_back(i, i + 1, -1.0);
tripsA.emplace_back(i + 1, i, -1.0);
}
}
for (int i = 0; i < nb; ++i) {
tripsB.emplace_back(i, i, 2.0);
if (i + 1 < nb) {
tripsB.emplace_back(i, i + 1, -1.0);
tripsB.emplace_back(i + 1, i, -1.0);
}
}
A.setFromTriplets(tripsA.begin(), tripsA.end());
B.setFromTriplets(tripsB.begin(), tripsB.end());
for (auto _ : state) {
SpMat C = kroneckerProduct(A, B).eval();
benchmark::DoNotOptimize(C.valuePtr());
benchmark::ClobberMemory();
}
state.counters["output_size"] = na * nb;
}
// Dense factor sizes stay small: the dense output has (na*nb)^2 elements.
static void KroneckerSizes(::benchmark::Benchmark* b) {
  const int sizes[] = {4, 8, 16};
  for (int na : sizes)
    for (int nb : sizes) b->Args({na, nb});
}
// Sparse inputs are tridiagonal, so larger factors remain affordable.
static void KroneckerSparseSizes(::benchmark::Benchmark* b) {
  const int sizes[] = {16, 32, 64, 128};
  for (int na : sizes)
    for (int nb : sizes) b->Args({na, nb});
}
BENCHMARK(BM_KroneckerDense)->Apply(KroneckerSizes);
BENCHMARK(BM_KroneckerSparse)->Apply(KroneckerSparseSizes);

View File

@@ -0,0 +1,3 @@
eigen_add_benchmark(bench_matrix_exponential bench_matrix_exponential.cpp)
eigen_add_benchmark(bench_matrix_logarithm bench_matrix_logarithm.cpp)
eigen_add_benchmark(bench_matrix_power bench_matrix_power.cpp)

View File

@@ -0,0 +1,52 @@
// Benchmarks for matrix exponential.
// Critical for Sophus Lie group operations (SLAM, visual odometry).
#include <benchmark/benchmark.h>
#include <Eigen/Core>
#include <unsupported/Eigen/MatrixFunctions>
using namespace Eigen;
#ifndef SCALAR
#define SCALAR double
#endif
typedef SCALAR Scalar;
// Times the matrix exponential of an n x n dynamic-size matrix. Entries are
// scaled by 1/n to keep the spectral radius moderate.
static void BM_MatrixExp(benchmark::State& state) {
  const int n = state.range(0);
  typedef Matrix<Scalar, Dynamic, Dynamic> MatrixType;
  const MatrixType A = MatrixType::Random(n, n) / Scalar(n);
  MatrixType result(n, n);
  for (auto _ : state) {
    result = A.exp();
    benchmark::DoNotOptimize(result.data());
    benchmark::ClobberMemory();
  }
}
// Fixed-size specializations for Lie group sizes, letting Eigen use its
// compile-time-size code paths for the 2x2/3x3/4x4 cases.
template <int N>
static void BM_MatrixExp_Fixed(benchmark::State& state) {
typedef Matrix<Scalar, N, N> MatrixType;
// Scale by 1/N to keep entries (and the spectral radius) small.
MatrixType A = MatrixType::Random() / Scalar(N);
MatrixType result;
for (auto _ : state) {
result = A.exp();
benchmark::DoNotOptimize(result.data());
benchmark::ClobberMemory();
}
}
// Dynamic sizes: Lie groups (2,3,4) plus larger.
BENCHMARK(BM_MatrixExp)->Arg(2)->Arg(3)->Arg(4)->Arg(8)->Arg(16)->Arg(32)->Arg(64)->Arg(128);
// Fixed-size Lie group dimensions.
BENCHMARK(BM_MatrixExp_Fixed<2>);
BENCHMARK(BM_MatrixExp_Fixed<3>);
BENCHMARK(BM_MatrixExp_Fixed<4>);

View File

@@ -0,0 +1,51 @@
// Benchmarks for matrix logarithm.
// Inverse of matrix exponential, used for Lie group log maps.
#include <benchmark/benchmark.h>
#include <Eigen/Core>
#include <unsupported/Eigen/MatrixFunctions>
using namespace Eigen;
#ifndef SCALAR
#define SCALAR double
#endif
typedef SCALAR Scalar;
// Times the matrix logarithm of an n x n dynamic-size matrix.
static void BM_MatrixLog(benchmark::State& state) {
  int n = state.range(0);
  typedef Matrix<Scalar, Dynamic, Dynamic> MatrixType;
  // Build A = exp(S) for a small random S, so A lies in the principal branch
  // and log(A) is well-defined and stable. (The previous identity-plus-noise
  // initialization was dead code: it was immediately overwritten by this.)
  MatrixType A = (MatrixType::Random(n, n) / Scalar(n * 4)).exp();
  MatrixType result(n, n);
  for (auto _ : state) {
    result = A.log();
    benchmark::DoNotOptimize(result.data());
    benchmark::ClobberMemory();
  }
}
// Fixed-size variant for Lie group dimensions.
template <int N>
static void BM_MatrixLog_Fixed(benchmark::State& state) {
typedef Matrix<Scalar, N, N> MatrixType;
// Input is exp(S) for a small random S, keeping the log in the principal
// branch.
MatrixType A = (MatrixType::Random() / Scalar(N * 4)).exp();
MatrixType result;
for (auto _ : state) {
result = A.log();
benchmark::DoNotOptimize(result.data());
benchmark::ClobberMemory();
}
}
// Dynamic sizes, then fixed-size Lie group dimensions (2x2, 3x3, 4x4).
BENCHMARK(BM_MatrixLog)->Arg(2)->Arg(3)->Arg(4)->Arg(8)->Arg(16)->Arg(32)->Arg(64);
BENCHMARK(BM_MatrixLog_Fixed<2>);
BENCHMARK(BM_MatrixLog_Fixed<3>);
BENCHMARK(BM_MatrixLog_Fixed<4>);

View File

@@ -0,0 +1,99 @@
// Benchmarks for matrix power functions: sqrt, pow, cos, sin, cosh, sinh.
#include <benchmark/benchmark.h>
#include <Eigen/Core>
#include <unsupported/Eigen/MatrixFunctions>
using namespace Eigen;
typedef double Scalar;
typedef Matrix<Scalar, Dynamic, Dynamic> Mat;
static void BM_MatrixSqrt(benchmark::State& state) {
int n = state.range(0);
// SPD matrix has well-defined sqrt.
Mat tmp = Mat::Random(n, n);
Mat A = tmp * tmp.transpose() + Mat::Identity(n, n);
Mat result(n, n);
for (auto _ : state) {
result = A.sqrt();
benchmark::DoNotOptimize(result.data());
benchmark::ClobberMemory();
}
}
static void BM_MatrixPow(benchmark::State& state) {
int n = state.range(0);
Mat tmp = Mat::Random(n, n);
Mat A = tmp * tmp.transpose() + Mat::Identity(n, n);
Mat result(n, n);
Scalar p = 2.5;
for (auto _ : state) {
result = A.pow(p);
benchmark::DoNotOptimize(result.data());
benchmark::ClobberMemory();
}
}
static void BM_MatrixCos(benchmark::State& state) {
int n = state.range(0);
Mat A = Mat::Random(n, n) / Scalar(n);
Mat result(n, n);
for (auto _ : state) {
result = A.cos();
benchmark::DoNotOptimize(result.data());
benchmark::ClobberMemory();
}
}
static void BM_MatrixSin(benchmark::State& state) {
int n = state.range(0);
Mat A = Mat::Random(n, n) / Scalar(n);
Mat result(n, n);
for (auto _ : state) {
result = A.sin();
benchmark::DoNotOptimize(result.data());
benchmark::ClobberMemory();
}
}
static void BM_MatrixCosh(benchmark::State& state) {
int n = state.range(0);
Mat A = Mat::Random(n, n) / Scalar(n);
Mat result(n, n);
for (auto _ : state) {
result = A.cosh();
benchmark::DoNotOptimize(result.data());
benchmark::ClobberMemory();
}
}
static void BM_MatrixSinh(benchmark::State& state) {
int n = state.range(0);
Mat A = Mat::Random(n, n) / Scalar(n);
Mat result(n, n);
for (auto _ : state) {
result = A.sinh();
benchmark::DoNotOptimize(result.data());
benchmark::ClobberMemory();
}
}
// Matrix dimensions shared by all matrix-function benchmarks in this file.
static void MatPowerSizes(::benchmark::Benchmark* b) {
  const int sizes[] = {4, 8, 16, 32, 64};
  for (int n : sizes) b->Arg(n);
}
BENCHMARK(BM_MatrixSqrt)->Apply(MatPowerSizes);
BENCHMARK(BM_MatrixPow)->Apply(MatPowerSizes);
BENCHMARK(BM_MatrixCos)->Apply(MatPowerSizes);
BENCHMARK(BM_MatrixSin)->Apply(MatPowerSizes);
BENCHMARK(BM_MatrixCosh)->Apply(MatPowerSizes);
BENCHMARK(BM_MatrixSinh)->Apply(MatPowerSizes);

View File

@@ -0,0 +1 @@
eigen_add_benchmark(bench_special_functions bench_special_functions.cpp)

View File

@@ -0,0 +1,127 @@
// Benchmarks for special functions beyond what bench_cwise_math.cpp covers.
// Includes Bessel functions, two-argument functions (igamma, betainc),
// and additional functions (lgamma, digamma, zeta, polygamma).
#include <benchmark/benchmark.h>
#include <Eigen/Core>
#include <unsupported/Eigen/SpecialFunctions>
using namespace Eigen;
// Macro for unary special functions on arrays. Expands to a template
// `BM_<NAME><Scalar>` where:
//   EXPR  — array expression written in terms of the input array `a`;
//   LO/HI — inputs are mapped affinely from Random()'s [-1, 1] into [LO, HI].
// Reports element throughput and bytes processed (one Scalar read plus one
// Scalar write per element).
#define BENCH_SPECIAL_UNARY(NAME, EXPR, LO, HI) \
template <typename Scalar> \
static void BM_##NAME(benchmark::State& state) { \
const Index n = state.range(0); \
using Arr = Array<Scalar, Dynamic, 1>; \
Arr a = (Arr::Random(n) + Scalar(1)) * Scalar((double(HI) - double(LO)) / 2.0) + Scalar(LO); \
Arr b(n); \
for (auto _ : state) { \
b = EXPR; \
benchmark::DoNotOptimize(b.data()); \
} \
state.counters["Elements/s"] = benchmark::Counter(n, benchmark::Counter::kIsIterationInvariantRate); \
state.SetBytesProcessed(state.iterations() * n * sizeof(Scalar) * 2); \
}
// Macro for binary special functions on arrays.
// Expands to BM_<NAME>: `a` is sampled from [LO_A, HI_A] and `b` from
// [LO_B, HI_B] (same Random()-based mapping as the unary macro), then EXPR
// is evaluated element-wise once per iteration. Bytes processed counts two
// reads + one write per element.
#define BENCH_SPECIAL_BINARY(NAME, EXPR, LO_A, HI_A, LO_B, HI_B) \
  template <typename Scalar> \
  static void BM_##NAME(benchmark::State& state) { \
    const Index n = state.range(0); \
    using Arr = Array<Scalar, Dynamic, 1>; \
    Arr a = (Arr::Random(n) + Scalar(1)) * Scalar((double(HI_A) - double(LO_A)) / 2.0) + Scalar(LO_A); \
    Arr b = (Arr::Random(n) + Scalar(1)) * Scalar((double(HI_B) - double(LO_B)) / 2.0) + Scalar(LO_B); \
    Arr c(n); \
    for (auto _ : state) { \
      c = EXPR; \
      benchmark::DoNotOptimize(c.data()); \
    } \
    state.counters["Elements/s"] = benchmark::Counter(n, benchmark::Counter::kIsIterationInvariantRate); \
    state.SetBytesProcessed(state.iterations() * n * sizeof(Scalar) * 3); \
  }
// --- Unary special functions ---
// Domains chosen to keep each function in its valid/non-trivial range.
BENCH_SPECIAL_UNARY(Lgamma, Eigen::lgamma(a), 0.1, 20)
BENCH_SPECIAL_UNARY(Digamma, Eigen::digamma(a), 0.1, 20)
// --- Bessel functions (first kind) ---
// The e-suffixed variants are exponentially scaled, so larger arguments
// remain representable; hence their wider [0, 100] range.
BENCH_SPECIAL_UNARY(BesselI0, Eigen::bessel_i0(a), 0, 10)
BENCH_SPECIAL_UNARY(BesselI1, Eigen::bessel_i1(a), 0, 10)
BENCH_SPECIAL_UNARY(BesselI0e, Eigen::bessel_i0e(a), 0, 100)
BENCH_SPECIAL_UNARY(BesselI1e, Eigen::bessel_i1e(a), 0, 100)
BENCH_SPECIAL_UNARY(BesselJ0, Eigen::bessel_j0(a), 0, 20)
BENCH_SPECIAL_UNARY(BesselJ1, Eigen::bessel_j1(a), 0, 20)
// --- Bessel functions (second kind) ---
// Y and K diverge at 0, so the lower bound is 0.1.
BENCH_SPECIAL_UNARY(BesselY0, Eigen::bessel_y0(a), 0.1, 20)
BENCH_SPECIAL_UNARY(BesselY1, Eigen::bessel_y1(a), 0.1, 20)
BENCH_SPECIAL_UNARY(BesselK0, Eigen::bessel_k0(a), 0.1, 20)
BENCH_SPECIAL_UNARY(BesselK1, Eigen::bessel_k1(a), 0.1, 20)
BENCH_SPECIAL_UNARY(BesselK0e, Eigen::bessel_k0e(a), 0.1, 100)
BENCH_SPECIAL_UNARY(BesselK1e, Eigen::bessel_k1e(a), 0.1, 100)
// --- Two-argument functions ---
BENCH_SPECIAL_BINARY(Igamma, Eigen::igamma(a, b), 0.1, 10, 0.1, 10)
BENCH_SPECIAL_BINARY(Igammac, Eigen::igammac(a, b), 0.1, 10, 0.1, 10)
// zeta(a, b) requires a > 1 for convergence, hence the 1.1 lower bound.
BENCH_SPECIAL_BINARY(Zeta, Eigen::zeta(a, b), 1.1, 10, 0.1, 10)
// NOTE(review): polygamma's first argument is the derivative order, normally
// a non-negative integer; `a` here is sampled continuously in [1, 4], so
// non-integer orders may be evaluated (possibly hitting a NaN path) —
// confirm this is the intended workload.
BENCH_SPECIAL_BINARY(Polygamma, Eigen::polygamma(a, b), 1, 4, 0.1, 10)
// --- Ternary: betainc ---
// Benchmark the regularized incomplete beta function I_x(a, b).
template <typename Scalar>
static void BM_Betainc(benchmark::State& state) {
  const Index len = state.range(0);
  using Arr = Array<Scalar, Dynamic, 1>;
  // Shape parameters in [0.5, 5.5]; evaluation points in [0, 1].
  Arr alpha = (Arr::Random(len) + Scalar(1)) * Scalar(2.5) + Scalar(0.5);
  Arr beta = (Arr::Random(len) + Scalar(1)) * Scalar(2.5) + Scalar(0.5);
  Arr xs = (Arr::Random(len) + Scalar(1)) * Scalar(0.5);
  Arr out(len);
  for (auto _ : state) {
    out = Eigen::betainc(alpha, beta, xs);
    benchmark::DoNotOptimize(out.data());
  }
  state.counters["Elements/s"] = benchmark::Counter(len, benchmark::Counter::kIsIterationInvariantRate);
  // Three reads + one write per element.
  state.SetBytesProcessed(state.iterations() * len * sizeof(Scalar) * 4);
}
// Array lengths spanning cache-resident to memory-bound workloads.
static void SpecialSizes(::benchmark::Benchmark* b) {
  static const int kLengths[] = {256, 4096, 65536, 1048576};
  for (int len : kLengths) b->Arg(len);
}
// --- Register float ---
BENCHMARK(BM_Lgamma<float>)->Apply(SpecialSizes)->Name("Lgamma_float");
BENCHMARK(BM_Digamma<float>)->Apply(SpecialSizes)->Name("Digamma_float");
BENCHMARK(BM_BesselI0<float>)->Apply(SpecialSizes)->Name("BesselI0_float");
BENCHMARK(BM_BesselI1<float>)->Apply(SpecialSizes)->Name("BesselI1_float");
BENCHMARK(BM_BesselI0e<float>)->Apply(SpecialSizes)->Name("BesselI0e_float");
BENCHMARK(BM_BesselI1e<float>)->Apply(SpecialSizes)->Name("BesselI1e_float");
BENCHMARK(BM_BesselJ0<float>)->Apply(SpecialSizes)->Name("BesselJ0_float");
BENCHMARK(BM_BesselJ1<float>)->Apply(SpecialSizes)->Name("BesselJ1_float");
BENCHMARK(BM_BesselY0<float>)->Apply(SpecialSizes)->Name("BesselY0_float");
BENCHMARK(BM_BesselY1<float>)->Apply(SpecialSizes)->Name("BesselY1_float");
BENCHMARK(BM_BesselK0<float>)->Apply(SpecialSizes)->Name("BesselK0_float");
BENCHMARK(BM_BesselK1<float>)->Apply(SpecialSizes)->Name("BesselK1_float");
BENCHMARK(BM_BesselK0e<float>)->Apply(SpecialSizes)->Name("BesselK0e_float");
BENCHMARK(BM_BesselK1e<float>)->Apply(SpecialSizes)->Name("BesselK1e_float");
BENCHMARK(BM_Igamma<float>)->Apply(SpecialSizes)->Name("Igamma_float");
BENCHMARK(BM_Igammac<float>)->Apply(SpecialSizes)->Name("Igammac_float");
BENCHMARK(BM_Betainc<float>)->Apply(SpecialSizes)->Name("Betainc_float");
BENCHMARK(BM_Zeta<float>)->Apply(SpecialSizes)->Name("Zeta_float");
BENCHMARK(BM_Polygamma<float>)->Apply(SpecialSizes)->Name("Polygamma_float");
// --- Register double ---
// NOTE(review): this list omits the exponentially-scaled Bessel variants
// (I0e/I1e/K0e/K1e) that the float list registers — confirm whether the
// omission is intentional or whether they are registered elsewhere.
BENCHMARK(BM_Lgamma<double>)->Apply(SpecialSizes)->Name("Lgamma_double");
BENCHMARK(BM_Digamma<double>)->Apply(SpecialSizes)->Name("Digamma_double");
BENCHMARK(BM_BesselI0<double>)->Apply(SpecialSizes)->Name("BesselI0_double");
BENCHMARK(BM_BesselI1<double>)->Apply(SpecialSizes)->Name("BesselI1_double");
BENCHMARK(BM_BesselJ0<double>)->Apply(SpecialSizes)->Name("BesselJ0_double");
BENCHMARK(BM_BesselJ1<double>)->Apply(SpecialSizes)->Name("BesselJ1_double");
BENCHMARK(BM_BesselY0<double>)->Apply(SpecialSizes)->Name("BesselY0_double");
BENCHMARK(BM_BesselY1<double>)->Apply(SpecialSizes)->Name("BesselY1_double");
BENCHMARK(BM_BesselK0<double>)->Apply(SpecialSizes)->Name("BesselK0_double");
BENCHMARK(BM_BesselK1<double>)->Apply(SpecialSizes)->Name("BesselK1_double");
BENCHMARK(BM_Igamma<double>)->Apply(SpecialSizes)->Name("Igamma_double");
BENCHMARK(BM_Igammac<double>)->Apply(SpecialSizes)->Name("Igammac_double");
BENCHMARK(BM_Betainc<double>)->Apply(SpecialSizes)->Name("Betainc_double");
BENCHMARK(BM_Zeta<double>)->Apply(SpecialSizes)->Name("Zeta_double");
BENCHMARK(BM_Polygamma<double>)->Apply(SpecialSizes)->Name("Polygamma_double");

View File

@@ -0,0 +1 @@
# Spline-module benchmarks (fitting, evaluation, derivatives).
eigen_add_benchmark(bench_splines bench_splines.cpp)

View File

@@ -0,0 +1,98 @@
// Benchmarks for Eigen Spline module.
// Tests fitting, evaluation, and derivative computation.
#include <benchmark/benchmark.h>
#include <Eigen/Core>
#include <unsupported/Eigen/Splines>
using namespace Eigen;
typedef double Scalar;
// --- Spline fitting (interpolation) ---
// Benchmark spline interpolation (knot selection + control-point solve).
template <int Dim, int Degree>
static void BM_SplineFit(benchmark::State& state) {
  const int num_pts = state.range(0);
  typedef Spline<Scalar, Dim> SplineType;
  // Random points to interpolate through.
  Matrix<Scalar, Dim, Dynamic> points(Dim, num_pts);
  points.setRandom();
  for (auto _ : state) {
    SplineType spline = SplineFitting<SplineType>::Interpolate(points, Degree);
    benchmark::DoNotOptimize(spline.knots().data());
    benchmark::ClobberMemory();
  }
}
// --- Spline evaluation ---
// Benchmark point evaluation of a pre-fitted spline at many parameters.
template <int Dim, int Degree>
static void BM_SplineEval(benchmark::State& state) {
  const int num_pts = state.range(0);  // control points used for the fit
  const int kNumEval = 1000;           // evaluation sites per iteration
  typedef Spline<Scalar, Dim> SplineType;
  Matrix<Scalar, Dim, Dynamic> points(Dim, num_pts);
  points.setRandom();
  SplineType spline = SplineFitting<SplineType>::Interpolate(points, Degree);
  // Evaluation parameters uniformly spaced over [0, 1].
  VectorXd params = VectorXd::LinSpaced(kNumEval, 0, 1);
  for (auto _ : state) {
    for (int k = 0; k < kNumEval; ++k) {
      auto pt = spline(params(k));
      benchmark::DoNotOptimize(pt.data());
    }
    benchmark::ClobberMemory();
  }
  state.counters["Evals/s"] = benchmark::Counter(kNumEval, benchmark::Counter::kIsIterationInvariantRate);
}
// --- Spline derivative evaluation ---
// Benchmark first-derivative evaluation of a pre-fitted spline.
template <int Dim, int Degree>
static void BM_SplineDerivatives(benchmark::State& state) {
  const int num_pts = state.range(0);
  const int kNumEval = 1000;
  typedef Spline<Scalar, Dim> SplineType;
  Matrix<Scalar, Dim, Dynamic> points(Dim, num_pts);
  points.setRandom();
  SplineType spline = SplineFitting<SplineType>::Interpolate(points, Degree);
  VectorXd params = VectorXd::LinSpaced(kNumEval, 0, 1);
  for (auto _ : state) {
    for (int k = 0; k < kNumEval; ++k) {
      // Order 1: value plus first derivative.
      auto derivs = spline.derivatives(params(k), 1);
      benchmark::DoNotOptimize(derivs.data());
    }
    benchmark::ClobberMemory();
  }
  state.counters["Evals/s"] = benchmark::Counter(kNumEval, benchmark::Counter::kIsIterationInvariantRate);
}
// Control-point counts for all spline benchmarks.
static void SplineSizes(::benchmark::Benchmark* b) {
  static const int kCounts[] = {10, 50, 200, 1000};
  for (int n : kCounts) b->Arg(n);
}
// 2D cubic splines
BENCHMARK(BM_SplineFit<2, 3>)->Apply(SplineSizes)->Name("SplineFit_2D_Cubic");
BENCHMARK(BM_SplineEval<2, 3>)->Apply(SplineSizes)->Name("SplineEval_2D_Cubic");
BENCHMARK(BM_SplineDerivatives<2, 3>)->Apply(SplineSizes)->Name("SplineDerivatives_2D_Cubic");
// 3D cubic splines
BENCHMARK(BM_SplineFit<3, 3>)->Apply(SplineSizes)->Name("SplineFit_3D_Cubic");
BENCHMARK(BM_SplineEval<3, 3>)->Apply(SplineSizes)->Name("SplineEval_3D_Cubic");
BENCHMARK(BM_SplineDerivatives<3, 3>)->Apply(SplineSizes)->Name("SplineDerivatives_3D_Cubic");
// 2D quintic splines
// NOTE(review): no SplineDerivatives registration for the quintic case is
// visible here — confirm whether it is registered below or omitted.
BENCHMARK(BM_SplineFit<2, 5>)->Apply(SplineSizes)->Name("SplineFit_2D_Quintic");
BENCHMARK(BM_SplineEval<2, 5>)->Apply(SplineSizes)->Name("SplineEval_2D_Quintic");

View File

@@ -0,0 +1,8 @@
# Tensor-module benchmarks; each target exercises one Tensor feature area.
eigen_add_benchmark(bench_contraction bench_contraction.cpp)
eigen_add_benchmark(bench_convolution bench_convolution.cpp)
eigen_add_benchmark(bench_reduction bench_reduction.cpp)
eigen_add_benchmark(bench_broadcasting bench_broadcasting.cpp)
eigen_add_benchmark(bench_shuffling bench_shuffling.cpp)
eigen_add_benchmark(bench_tensor_fft bench_tensor_fft.cpp)
eigen_add_benchmark(bench_morphing bench_morphing.cpp)
eigen_add_benchmark(bench_coefficient_wise bench_coefficient_wise.cpp)

View File

@@ -0,0 +1,111 @@
// Benchmarks for Eigen Tensor broadcasting.
// Tests broadcasting along various dimensions and ranks.
#include <benchmark/benchmark.h>
#include <unsupported/Eigen/CXX11/Tensor>
using namespace Eigen;
typedef float Scalar;
// --- Broadcast row vector {1,N} -> {M,N} ---
// Materialize a {1,N} row broadcast into an {M,N} result.
static void BM_BroadcastRow(benchmark::State& state) {
  const int rows = state.range(0);
  const int cols = state.range(1);
  Tensor<Scalar, 2> src(1, cols);
  src.setRandom();
  Tensor<Scalar, 2> dst(rows, cols);
  // Replicate the single row `rows` times along dimension 0.
  Eigen::array<int, 2> factors = {rows, 1};
  for (auto _ : state) {
    dst = src.broadcast(factors);
    benchmark::DoNotOptimize(dst.data());
    benchmark::ClobberMemory();
  }
  state.SetBytesProcessed(state.iterations() * rows * cols * sizeof(Scalar));
}
// --- Broadcast col vector {M,1} -> {M,N} ---
// Materialize an {M,1} column broadcast into an {M,N} result.
static void BM_BroadcastCol(benchmark::State& state) {
  const int rows = state.range(0);
  const int cols = state.range(1);
  Tensor<Scalar, 2> src(rows, 1);
  src.setRandom();
  Tensor<Scalar, 2> dst(rows, cols);
  // Replicate the single column `cols` times along dimension 1.
  Eigen::array<int, 2> factors = {1, cols};
  for (auto _ : state) {
    dst = src.broadcast(factors);
    benchmark::DoNotOptimize(dst.data());
    benchmark::ClobberMemory();
  }
  state.SetBytesProcessed(state.iterations() * rows * cols * sizeof(Scalar));
}
// --- Broadcast + element-wise add (bias addition pattern) ---
// Bias-addition pattern: matrix + broadcast row bias (common in ML layers).
static void BM_BroadcastAdd(benchmark::State& state) {
  const int rows = state.range(0);
  const int cols = state.range(1);
  Tensor<Scalar, 2> input(rows, cols);
  input.setRandom();
  Tensor<Scalar, 2> row_bias(1, cols);
  row_bias.setRandom();
  Tensor<Scalar, 2> dst(rows, cols);
  Eigen::array<int, 2> factors = {rows, 1};
  for (auto _ : state) {
    dst = input + row_bias.broadcast(factors);
    benchmark::DoNotOptimize(dst.data());
    benchmark::ClobberMemory();
  }
  // Counts the full-size read and write; the bias read is negligible.
  state.SetBytesProcessed(state.iterations() * rows * cols * sizeof(Scalar) * 2);
}
// --- Rank-4 broadcast (batch x channels x 1 x 1) -> (batch x channels x H x W) ---
// Rank-4 broadcast: per-channel bias (batch, C, 1, 1) -> (batch, C, H, H).
static void BM_BroadcastRank4(benchmark::State& state) {
  const int batches = state.range(0);
  const int channels = state.range(1);
  const int hw = state.range(2);
  Tensor<Scalar, 4> src(batches, channels, 1, 1);
  src.setRandom();
  Tensor<Scalar, 4> dst(batches, channels, hw, hw);
  // Replicate only along the two trailing (spatial) dimensions.
  Eigen::array<int, 4> factors = {1, 1, hw, hw};
  for (auto _ : state) {
    dst = src.broadcast(factors);
    benchmark::DoNotOptimize(dst.data());
    benchmark::ClobberMemory();
  }
  state.SetBytesProcessed(state.iterations() * batches * channels * hw * hw * sizeof(Scalar));
}
// Full grid of {rows, cols} combinations for the rank-2 broadcasts.
static void BroadcastSizes(::benchmark::Benchmark* b) {
  static const int kDims[] = {64, 256, 1024};
  for (int rows : kDims)
    for (int cols : kDims) b->Args({rows, cols});
}
// (batch, channels, spatial) combinations for the rank-4 broadcast.
static void Rank4Sizes(::benchmark::Benchmark* b) {
  for (int batch : {1, 8})
    for (int channels : {64, 256})
      for (int hw : {16, 32}) b->Args({batch, channels, hw});
}
// Register broadcasting benchmarks over the shared size grids.
BENCHMARK(BM_BroadcastRow)->Apply(BroadcastSizes);
BENCHMARK(BM_BroadcastCol)->Apply(BroadcastSizes);
BENCHMARK(BM_BroadcastAdd)->Apply(BroadcastSizes);
BENCHMARK(BM_BroadcastRank4)->Apply(Rank4Sizes);

View File

@@ -0,0 +1,131 @@
// Benchmarks for Eigen Tensor coefficient-wise operations.
// Covers activation functions, normalization, and element-wise arithmetic.
#include <benchmark/benchmark.h>
#include <unsupported/Eigen/CXX11/Tensor>
using namespace Eigen;
typedef float Scalar;
// Macro to define a benchmark for a unary tensor operation.
// Expands to BM_<NAME>, which applies EXPR to a random MxN rank-2 tensor
// `a`, writing the result into `b` each iteration. Bytes processed counts
// one read + one write per element.
#define BENCH_TENSOR_UNARY(NAME, EXPR) \
  static void BM_##NAME(benchmark::State& state) { \
    const int M = state.range(0); \
    const int N = state.range(1); \
    Tensor<Scalar, 2> a(M, N); \
    a.setRandom(); \
    Tensor<Scalar, 2> b(M, N); \
    for (auto _ : state) { \
      b = EXPR; \
      benchmark::DoNotOptimize(b.data()); \
      benchmark::ClobberMemory(); \
    } \
    state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar) * 2); \
  }
// Instantiate the unary benchmarks: activations and basic transcendentals.
// Log and Sqrt apply abs() first so inputs stay in the valid domain.
BENCH_TENSOR_UNARY(Exp, a.exp())
BENCH_TENSOR_UNARY(Log, a.abs().log())
BENCH_TENSOR_UNARY(Tanh, a.tanh())
BENCH_TENSOR_UNARY(Sigmoid, a.sigmoid())
BENCH_TENSOR_UNARY(ReLU, a.cwiseMax(Scalar(0)))
BENCH_TENSOR_UNARY(Sqrt, a.abs().sqrt())
// --- Element-wise binary operations ---
// Element-wise addition of two rank-2 tensors.
static void BM_Add(benchmark::State& state) {
  const int rows = state.range(0);
  const int cols = state.range(1);
  Tensor<Scalar, 2> lhs(rows, cols);
  lhs.setRandom();
  Tensor<Scalar, 2> rhs(rows, cols);
  rhs.setRandom();
  Tensor<Scalar, 2> out(rows, cols);
  for (auto _ : state) {
    out = lhs + rhs;
    benchmark::DoNotOptimize(out.data());
    benchmark::ClobberMemory();
  }
  // Two reads + one write per element.
  state.SetBytesProcessed(state.iterations() * rows * cols * sizeof(Scalar) * 3);
}
// Element-wise (Hadamard) product of two rank-2 tensors.
static void BM_Mul(benchmark::State& state) {
  const int rows = state.range(0);
  const int cols = state.range(1);
  Tensor<Scalar, 2> lhs(rows, cols);
  lhs.setRandom();
  Tensor<Scalar, 2> rhs(rows, cols);
  rhs.setRandom();
  Tensor<Scalar, 2> out(rows, cols);
  for (auto _ : state) {
    out = lhs * rhs;
    benchmark::DoNotOptimize(out.data());
    benchmark::ClobberMemory();
  }
  // Two reads + one write per element.
  state.SetBytesProcessed(state.iterations() * rows * cols * sizeof(Scalar) * 3);
}
// --- Fused multiply-add ---
// Fused multiply-add pattern: d = a * b + c (element-wise).
static void BM_FMA(benchmark::State& state) {
  const int rows = state.range(0);
  const int cols = state.range(1);
  Tensor<Scalar, 2> x(rows, cols);
  x.setRandom();
  Tensor<Scalar, 2> y(rows, cols);
  y.setRandom();
  Tensor<Scalar, 2> z(rows, cols);
  z.setRandom();
  Tensor<Scalar, 2> out(rows, cols);
  for (auto _ : state) {
    out = x * y + z;
    benchmark::DoNotOptimize(out.data());
    benchmark::ClobberMemory();
  }
  // Three reads + one write per element.
  state.SetBytesProcessed(state.iterations() * rows * cols * sizeof(Scalar) * 4);
}
// --- Rank-4 coefficient-wise (CNN feature maps) ---
// ReLU over a rank-4 tensor (CNN feature-map shape: batch x C x H x H).
static void BM_ReLU_Rank4(benchmark::State& state) {
  const int batches = state.range(0);
  const int channels = state.range(1);
  const int hw = state.range(2);
  Tensor<Scalar, 4> src(batches, channels, hw, hw);
  src.setRandom();
  Tensor<Scalar, 4> dst(batches, channels, hw, hw);
  for (auto _ : state) {
    dst = src.cwiseMax(Scalar(0));
    benchmark::DoNotOptimize(dst.data());
    benchmark::ClobberMemory();
  }
  state.SetBytesProcessed(state.iterations() * batches * channels * hw * hw * sizeof(Scalar) * 2);
}
// Square rank-2 sizes for the coefficient-wise benchmarks.
static void CwiseSizes(::benchmark::Benchmark* b) {
  for (int dim : {256, 1024}) b->Args({dim, dim});
}
// (batch, channels, spatial) triples representative of CNN feature maps.
static void Rank4Sizes(::benchmark::Benchmark* b) {
  static const int64_t kCombos[3][3] = {{32, 64, 16}, {8, 128, 32}, {1, 256, 64}};
  for (const auto& c : kCombos) b->Args({c[0], c[1], c[2]});
}
// Register coefficient-wise benchmarks over the shared size grids.
BENCHMARK(BM_Exp)->Apply(CwiseSizes);
BENCHMARK(BM_Log)->Apply(CwiseSizes);
BENCHMARK(BM_Tanh)->Apply(CwiseSizes);
BENCHMARK(BM_Sigmoid)->Apply(CwiseSizes);
BENCHMARK(BM_ReLU)->Apply(CwiseSizes);
BENCHMARK(BM_Sqrt)->Apply(CwiseSizes);
BENCHMARK(BM_Add)->Apply(CwiseSizes);
BENCHMARK(BM_Mul)->Apply(CwiseSizes);
BENCHMARK(BM_FMA)->Apply(CwiseSizes);
BENCHMARK(BM_ReLU_Rank4)->Apply(Rank4Sizes);

View File

@@ -0,0 +1,148 @@
// Benchmarks for Eigen Tensor contraction (generalized GEMM).
// Tests single-threaded (DefaultDevice) and multi-threaded (ThreadPoolDevice) variants.
#define EIGEN_USE_THREADS
#include <benchmark/benchmark.h>
#include <unsupported/Eigen/CXX11/Tensor>
#include <unsupported/Eigen/CXX11/ThreadPool>
using namespace Eigen;
#ifndef SCALAR
#define SCALAR float
#endif
typedef SCALAR Scalar;
// --- DefaultDevice contraction (rank-2, equivalent to matrix multiply) ---
// Rank-2 tensor contraction (equivalent to GEMM) on the DefaultDevice.
static void BM_Contraction(benchmark::State& state) {
  const int m = state.range(0);
  const int n = state.range(1);
  const int k = state.range(2);
  Tensor<Scalar, 2> lhs(m, k);
  lhs.setRandom();
  Tensor<Scalar, 2> rhs(k, n);
  rhs.setRandom();
  Tensor<Scalar, 2> out(m, n);
  using ContractDims = Tensor<Scalar, 2>::DimensionPair;
  // Contract lhs dim 1 against rhs dim 0: the standard matrix product.
  Eigen::array<ContractDims, 1> pairs = {ContractDims(1, 0)};
  for (auto _ : state) {
    out = lhs.contract(rhs, pairs);
    benchmark::DoNotOptimize(out.data());
    benchmark::ClobberMemory();
  }
  state.counters["GFLOPS"] =
      benchmark::Counter(2.0 * m * n * k, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
}
// --- ThreadPoolDevice contraction ---
// Rank-2 contraction dispatched through a ThreadPoolDevice.
static void BM_Contraction_ThreadPool(benchmark::State& state) {
  const int m = state.range(0);
  const int n = state.range(1);
  const int k = state.range(2);
  const int nthreads = state.range(3);
  Tensor<Scalar, 2> lhs(m, k);
  lhs.setRandom();
  Tensor<Scalar, 2> rhs(k, n);
  rhs.setRandom();
  Tensor<Scalar, 2> out(m, n);
  ThreadPool pool(nthreads);
  ThreadPoolDevice device(&pool, nthreads);
  using ContractDims = Tensor<Scalar, 2>::DimensionPair;
  Eigen::array<ContractDims, 1> pairs = {ContractDims(1, 0)};
  for (auto _ : state) {
    out.device(device) = lhs.contract(rhs, pairs);
    benchmark::DoNotOptimize(out.data());
    benchmark::ClobberMemory();
  }
  state.counters["GFLOPS"] =
      benchmark::Counter(2.0 * m * n * k, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
  state.counters["threads"] = nthreads;
}
// --- Rank-3 batch contraction ---
// Batched matrix multiply: for each batch index i, C_i = A_i * B_i.
// A single contract() over pair (2,1) is NOT a batch product: contracting
// two rank-3 tensors over one dimension pair yields a rank-4 result
// (batch, M, batch, N), which cannot be assigned to the rank-3 output.
// Contract each batch slice (extracted with chip) instead.
static void BM_BatchContraction(benchmark::State& state) {
  const int batch = state.range(0);
  const int M = state.range(1);
  const int N = state.range(2);
  const int K = state.range(3);
  Tensor<Scalar, 3> A(batch, M, K);
  Tensor<Scalar, 3> B(batch, K, N);
  Tensor<Scalar, 3> C(batch, M, N);
  A.setRandom();
  B.setRandom();
  using ContractDims = Tensor<Scalar, 2>::DimensionPair;
  Eigen::array<ContractDims, 1> contract_dims = {ContractDims(1, 0)};
  for (auto _ : state) {
    for (int i = 0; i < batch; ++i) {
      // chip(i, 0) yields the rank-2 (M,K), (K,N) and (M,N) slices of batch i.
      C.chip(i, 0) = A.chip(i, 0).contract(B.chip(i, 0), contract_dims);
    }
    benchmark::DoNotOptimize(C.data());
    benchmark::ClobberMemory();
  }
  state.counters["GFLOPS"] = benchmark::Counter(2.0 * batch * M * N * K, benchmark::Counter::kIsIterationInvariantRate,
                                                benchmark::Counter::kIs1000);
}
// --- RowMajor contraction ---
// Rank-2 contraction with RowMajor tensor layout.
static void BM_Contraction_RowMajor(benchmark::State& state) {
  const int m = state.range(0);
  const int n = state.range(1);
  const int k = state.range(2);
  Tensor<Scalar, 2, RowMajor> lhs(m, k);
  lhs.setRandom();
  Tensor<Scalar, 2, RowMajor> rhs(k, n);
  rhs.setRandom();
  Tensor<Scalar, 2, RowMajor> out(m, n);
  using ContractDims = Tensor<Scalar, 2, RowMajor>::DimensionPair;
  Eigen::array<ContractDims, 1> pairs = {ContractDims(1, 0)};
  for (auto _ : state) {
    out = lhs.contract(rhs, pairs);
    benchmark::DoNotOptimize(out.data());
    benchmark::ClobberMemory();
  }
  state.counters["GFLOPS"] =
      benchmark::Counter(2.0 * m * n * k, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
}
// Square {M,N,K} triples plus two non-square shapes.
static void ContractionSizes(::benchmark::Benchmark* b) {
  static const int kSquare[] = {32, 64, 128, 256, 512, 1024};
  for (int s : kSquare) b->Args({s, s, s});
  // Non-square
  b->Args({256, 256, 1024});
  b->Args({1024, 64, 64});
}
// {M,N,K,threads} grid for the thread-pool contraction.
static void ThreadPoolSizes(::benchmark::Benchmark* b) {
  for (int s : {64, 256, 512, 1024})
    for (int nthreads : {2, 4, 8}) b->Args({s, s, s, nthreads});
}
// {batch,M,N,K} grid for the batched contraction.
static void BatchSizes(::benchmark::Benchmark* b) {
  for (int nbatch : {1, 8, 32})
    for (int s : {64, 256}) b->Args({nbatch, s, s, s});
}
// Register contraction benchmarks over the shared size grids.
BENCHMARK(BM_Contraction)->Apply(ContractionSizes);
BENCHMARK(BM_Contraction_RowMajor)->Apply(ContractionSizes);
BENCHMARK(BM_Contraction_ThreadPool)->Apply(ThreadPoolSizes);
BENCHMARK(BM_BatchContraction)->Apply(BatchSizes);

View File

@@ -0,0 +1,151 @@
// Benchmarks for Eigen Tensor convolution (1D and 2D).
#define EIGEN_USE_THREADS
#include <benchmark/benchmark.h>
#include <unsupported/Eigen/CXX11/Tensor>
#include <unsupported/Eigen/CXX11/ThreadPool>
using namespace Eigen;
typedef float Scalar;
// --- 1D convolution ---
// 1D valid convolution; output length = input_size - kernel_size + 1.
static void BM_Convolve1D(benchmark::State& state) {
  const int len = state.range(0);
  const int klen = state.range(1);
  Tensor<Scalar, 1> signal(len);
  signal.setRandom();
  Tensor<Scalar, 1> filter(klen);
  filter.setRandom();
  Eigen::array<int, 1> conv_dims = {0};
  for (auto _ : state) {
    Tensor<Scalar, 1> out = signal.convolve(filter, conv_dims);
    benchmark::DoNotOptimize(out.data());
    benchmark::ClobberMemory();
  }
  // One multiply-add per kernel tap per output sample.
  double flops = 2.0 * (len - klen + 1) * klen;
  state.counters["GFLOPS"] =
      benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
}
// --- 2D convolution ---
// 2D valid convolution over both dimensions of a rank-2 tensor.
static void BM_Convolve2D(benchmark::State& state) {
  const int rows = state.range(0);
  const int cols = state.range(1);
  const int krows = state.range(2);
  const int kcols = state.range(3);
  Tensor<Scalar, 2> image(rows, cols);
  image.setRandom();
  Tensor<Scalar, 2> filter(krows, kcols);
  filter.setRandom();
  Eigen::array<int, 2> conv_dims = {0, 1};
  for (auto _ : state) {
    Tensor<Scalar, 2> out = image.convolve(filter, conv_dims);
    benchmark::DoNotOptimize(out.data());
    benchmark::ClobberMemory();
  }
  // One multiply-add per kernel tap per output pixel.
  double flops = 2.0 * (rows - krows + 1) * (cols - kcols + 1) * krows * kcols;
  state.counters["GFLOPS"] =
      benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
}
// --- 2D convolution with channels (rank-3: C x H x W, convolve on H,W) ---
// Rank-3 input (C x H x W): the same 2D kernel convolves each channel's
// spatial dimensions (dims 1 and 2).
static void BM_Convolve2D_Channels(benchmark::State& state) {
  const int channels = state.range(0);
  const int hw = state.range(1);
  const int k = state.range(2);
  Tensor<Scalar, 3> image(channels, hw, hw);
  image.setRandom();
  Tensor<Scalar, 2> filter(k, k);
  filter.setRandom();
  Eigen::array<int, 2> conv_dims = {1, 2};
  for (auto _ : state) {
    Tensor<Scalar, 3> out = image.convolve(filter, conv_dims);
    benchmark::DoNotOptimize(out.data());
    benchmark::ClobberMemory();
  }
  const int out_hw = hw - k + 1;
  double flops = 2.0 * channels * out_hw * out_hw * k * k;
  state.counters["GFLOPS"] =
      benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
}
// --- 2D convolution with ThreadPool ---
// Square 2D convolution dispatched through a ThreadPoolDevice, writing into
// a pre-sized output so allocation is excluded from the timed region.
static void BM_Convolve2D_ThreadPool(benchmark::State& state) {
  const int hw = state.range(0);
  const int k = state.range(1);
  const int nthreads = state.range(2);
  Tensor<Scalar, 2> image(hw, hw);
  image.setRandom();
  Tensor<Scalar, 2> filter(k, k);
  filter.setRandom();
  Tensor<Scalar, 2> out(hw - k + 1, hw - k + 1);
  ThreadPool pool(nthreads);
  ThreadPoolDevice device(&pool, nthreads);
  Eigen::array<int, 2> conv_dims = {0, 1};
  for (auto _ : state) {
    out.device(device) = image.convolve(filter, conv_dims);
    benchmark::DoNotOptimize(out.data());
    benchmark::ClobberMemory();
  }
  const int out_hw = hw - k + 1;
  double flops = 2.0 * out_hw * out_hw * k * k;
  state.counters["GFLOPS"] =
      benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
  state.counters["threads"] = nthreads;
}
// {input_size, kernel_size} pairs for the 1D convolution.
static void Conv1DSizes(::benchmark::Benchmark* b) {
  for (int input : {128, 512, 2048}) {
    for (int kernel : {3, 5, 11}) {
      b->Args({input, kernel});
    }
  }
}
// Square {H, W, kH, kW} combinations for the 2D convolution.
static void Conv2DSizes(::benchmark::Benchmark* b) {
  for (int hw : {32, 64, 128, 224}) {
    for (int k : {3, 5, 7}) {
      b->Args({hw, hw, k, k});
    }
  }
}
// {channels, spatial, kernel} combinations for the channel variant.
static void Conv2DChannelSizes(::benchmark::Benchmark* b) {
  for (int c : {3, 64, 128}) {
    for (int hw : {16, 32, 56}) {
      for (int k : {3, 5}) {
        b->Args({c, hw, k});
      }
    }
  }
}
// {spatial, kernel, threads} combinations for the thread-pool variant.
static void Conv2DThreadPoolSizes(::benchmark::Benchmark* b) {
  for (int hw : {64, 128, 224}) {
    for (int k : {3, 5}) {
      for (int threads : {2, 4, 8}) {
        b->Args({hw, k, threads});
      }
    }
  }
}
// Register convolution benchmarks over the shared size grids.
BENCHMARK(BM_Convolve1D)->Apply(Conv1DSizes);
BENCHMARK(BM_Convolve2D)->Apply(Conv2DSizes);
BENCHMARK(BM_Convolve2D_Channels)->Apply(Conv2DChannelSizes);
BENCHMARK(BM_Convolve2D_ThreadPool)->Apply(Conv2DThreadPoolSizes);

View File

@@ -0,0 +1,142 @@
// Benchmarks for Eigen Tensor morphing operations: reshape, slice, chip, pad, stride.
#include <benchmark/benchmark.h>
#include <unsupported/Eigen/CXX11/Tensor>
using namespace Eigen;
typedef float Scalar;
// --- Reshape (zero-cost if no evaluation needed; force eval via assignment) ---
// Flatten an MxN tensor into a length-M*N vector; assignment forces a copy.
static void BM_Reshape(benchmark::State& state) {
  const int rows = state.range(0);
  const int cols = state.range(1);
  Tensor<Scalar, 2> src(rows, cols);
  src.setRandom();
  Eigen::array<Index, 1> flat_shape = {rows * cols};
  for (auto _ : state) {
    Tensor<Scalar, 1> dst = src.reshape(flat_shape);
    benchmark::DoNotOptimize(dst.data());
    benchmark::ClobberMemory();
  }
  state.SetBytesProcessed(state.iterations() * rows * cols * sizeof(Scalar));
}
// --- Slice ---
// Copy out the top-left quarter of an MxN tensor via slice().
static void BM_Slice(benchmark::State& state) {
  const int rows = state.range(0);
  const int cols = state.range(1);
  Tensor<Scalar, 2> src(rows, cols);
  src.setRandom();
  const int half_rows = rows / 2;
  const int half_cols = cols / 2;
  Eigen::array<Index, 2> start = {0, 0};
  Eigen::array<Index, 2> sizes = {half_rows, half_cols};
  for (auto _ : state) {
    Tensor<Scalar, 2> dst = src.slice(start, sizes);
    benchmark::DoNotOptimize(dst.data());
    benchmark::ClobberMemory();
  }
  state.SetBytesProcessed(state.iterations() * half_rows * half_cols * sizeof(Scalar));
}
// --- Chip (extract a sub-tensor along one dimension) ---
// Extract the first rank-2 slice along dimension 0 of a rank-3 tensor.
static void BM_Chip(benchmark::State& state) {
  const int d0 = state.range(0);
  const int d1 = state.range(1);
  const int d2 = state.range(2);
  Tensor<Scalar, 3> src(d0, d1, d2);
  src.setRandom();
  for (auto _ : state) {
    Tensor<Scalar, 2> slice = src.chip(0, 0);
    benchmark::DoNotOptimize(slice.data());
    benchmark::ClobberMemory();
  }
  state.SetBytesProcessed(state.iterations() * d1 * d2 * sizeof(Scalar));
}
// --- Pad ---
// Zero-pad both dimensions by `pad` elements on each side.
static void BM_Pad(benchmark::State& state) {
  const int rows = state.range(0);
  const int cols = state.range(1);
  const int pad = state.range(2);
  Tensor<Scalar, 2> src(rows, cols);
  src.setRandom();
  Eigen::array<std::pair<int, int>, 2> pads;
  pads[0] = {pad, pad};
  pads[1] = {pad, pad};
  for (auto _ : state) {
    Tensor<Scalar, 2> dst = src.pad(pads);
    benchmark::DoNotOptimize(dst.data());
    benchmark::ClobberMemory();
  }
  const int out_rows = rows + 2 * pad;
  const int out_cols = cols + 2 * pad;
  state.SetBytesProcessed(state.iterations() * out_rows * out_cols * sizeof(Scalar));
}
// --- Stride ---
// Down-sample both dimensions by a fixed stride.
static void BM_Stride(benchmark::State& state) {
  const int rows = state.range(0);
  const int cols = state.range(1);
  const int step = state.range(2);
  Tensor<Scalar, 2> src(rows, cols);
  src.setRandom();
  Eigen::array<Index, 2> steps = {step, step};
  for (auto _ : state) {
    Tensor<Scalar, 2> dst = src.stride(steps);
    benchmark::DoNotOptimize(dst.data());
    benchmark::ClobberMemory();
  }
  // ceil(dim / step) elements survive in each dimension.
  const int out_rows = (rows + step - 1) / step;
  const int out_cols = (cols + step - 1) / step;
  state.SetBytesProcessed(state.iterations() * out_rows * out_cols * sizeof(Scalar));
}
// Square {rows, cols} pairs for reshape and slice.
static void MorphSizes(::benchmark::Benchmark* b) {
  for (int size : {256, 1024}) {
    b->Args({size, size});
  }
}
// {D0, D1, D2} triples for chip.
static void ChipSizes(::benchmark::Benchmark* b) {
  b->Args({32, 256, 256});
  b->Args({64, 128, 128});
  b->Args({8, 512, 512});
}
// {rows, cols, pad} combinations for pad.
static void PadSizes(::benchmark::Benchmark* b) {
  for (int size : {256, 1024}) {
    for (int pad : {1, 4, 16}) {
      b->Args({size, size, pad});
    }
  }
}
// {rows, cols, stride} combinations for stride.
static void StrideSizes(::benchmark::Benchmark* b) {
  for (int size : {256, 1024}) {
    for (int stride : {2, 4}) {
      b->Args({size, size, stride});
    }
  }
}
// Register morphing benchmarks over the shared size grids.
BENCHMARK(BM_Reshape)->Apply(MorphSizes);
BENCHMARK(BM_Slice)->Apply(MorphSizes);
BENCHMARK(BM_Chip)->Apply(ChipSizes);
BENCHMARK(BM_Pad)->Apply(PadSizes);
BENCHMARK(BM_Stride)->Apply(StrideSizes);

View File

@@ -0,0 +1,158 @@
// Benchmarks for Eigen Tensor reductions (sum, maximum, mean).
// Tests full and partial reductions, inner vs outer dimension, DefaultDevice and ThreadPoolDevice.
#define EIGEN_USE_THREADS
#include <benchmark/benchmark.h>
#include <unsupported/Eigen/CXX11/Tensor>
#include <unsupported/Eigen/CXX11/ThreadPool>
using namespace Eigen;
#ifndef SCALAR
#define SCALAR float
#endif
typedef SCALAR Scalar;
// --- Full reduction (rank-2) ---
// Reduce every element of an MxN tensor to a scalar using ReduceOp.
template <typename ReduceOp>
static void BM_FullReduction(benchmark::State& state) {
  const int rows = state.range(0);
  const int cols = state.range(1);
  Tensor<Scalar, 2> src(rows, cols);
  src.setRandom();
  for (auto _ : state) {
    // Reduce over both dimensions; the result is a rank-0 tensor.
    Tensor<Scalar, 0> acc = src.reduce(Eigen::array<int, 2>{0, 1}, ReduceOp());
    benchmark::DoNotOptimize(acc.data());
    benchmark::ClobberMemory();
  }
  state.SetBytesProcessed(state.iterations() * rows * cols * sizeof(Scalar));
}
// --- Partial reduction along dim 0 (inner dim, ColMajor) ---
// Sum along dimension 0 — the contiguous (inner) dimension in ColMajor.
static void BM_ReduceInner(benchmark::State& state) {
  const int rows = state.range(0);
  const int cols = state.range(1);
  Tensor<Scalar, 2> src(rows, cols);
  src.setRandom();
  Eigen::array<int, 1> dims = {0};
  for (auto _ : state) {
    Tensor<Scalar, 1> acc = src.sum(dims);
    benchmark::DoNotOptimize(acc.data());
    benchmark::ClobberMemory();
  }
  state.SetBytesProcessed(state.iterations() * rows * cols * sizeof(Scalar));
}
// --- Partial reduction along dim 1 (outer dim, ColMajor) ---
// Sum along dimension 1 — the strided (outer) dimension in ColMajor.
static void BM_ReduceOuter(benchmark::State& state) {
  const int rows = state.range(0);
  const int cols = state.range(1);
  Tensor<Scalar, 2> src(rows, cols);
  src.setRandom();
  Eigen::array<int, 1> dims = {1};
  for (auto _ : state) {
    Tensor<Scalar, 1> acc = src.sum(dims);
    benchmark::DoNotOptimize(acc.data());
    benchmark::ClobberMemory();
  }
  state.SetBytesProcessed(state.iterations() * rows * cols * sizeof(Scalar));
}
// --- Rank-4 partial reduction (batch x channels x H x W), reduce along spatial dims ---
// Global-average-pool pattern: reduce a rank-4 (batch, C, H, H) tensor over
// its two spatial dimensions, leaving a (batch, C) result.
static void BM_ReduceSpatial(benchmark::State& state) {
  const int batches = state.range(0);
  const int channels = state.range(1);
  const int hw = state.range(2);
  Tensor<Scalar, 4> src(batches, channels, hw, hw);
  src.setRandom();
  Eigen::array<int, 2> dims = {2, 3};
  for (auto _ : state) {
    Tensor<Scalar, 2> acc = src.sum(dims);
    benchmark::DoNotOptimize(acc.data());
    benchmark::ClobberMemory();
  }
  state.SetBytesProcessed(state.iterations() * batches * channels * hw * hw * sizeof(Scalar));
}
// --- Full reduction with ThreadPoolDevice ---
// Full sum reduction dispatched through a ThreadPoolDevice.
static void BM_FullReduction_ThreadPool(benchmark::State& state) {
  const int rows = state.range(0);
  const int cols = state.range(1);
  const int nthreads = state.range(2);
  Tensor<Scalar, 2> src(rows, cols);
  src.setRandom();
  Tensor<Scalar, 0> acc;
  ThreadPool pool(nthreads);
  ThreadPoolDevice device(&pool, nthreads);
  for (auto _ : state) {
    acc.device(device) = src.sum();
    benchmark::DoNotOptimize(acc.data());
    benchmark::ClobberMemory();
  }
  state.SetBytesProcessed(state.iterations() * rows * cols * sizeof(Scalar));
  state.counters["threads"] = nthreads;
}
// --- Maximum reduction (rank-2) ---
// Full maximum reduction of an MxN tensor via the maximum() shortcut.
static void BM_MaxReduction(benchmark::State& state) {
  const int rows = state.range(0);
  const int cols = state.range(1);
  Tensor<Scalar, 2> src(rows, cols);
  src.setRandom();
  for (auto _ : state) {
    Tensor<Scalar, 0> acc = src.maximum();
    benchmark::DoNotOptimize(acc.data());
    benchmark::ClobberMemory();
  }
  state.SetBytesProcessed(state.iterations() * rows * cols * sizeof(Scalar));
}
// Square {rows, cols} pairs for the rank-2 reductions.
static void ReductionSizes(::benchmark::Benchmark* b) {
  for (int size : {64, 256, 1024}) {
    b->Args({size, size});
  }
}
// {rows, cols, threads} combinations for the thread-pool reduction.
static void ThreadPoolReductionSizes(::benchmark::Benchmark* b) {
  for (int size : {256, 1024}) {
    for (int threads : {2, 4, 8}) {
      b->Args({size, size, threads});
    }
  }
}
// {batch, channels, spatial} combinations for the rank-4 spatial reduction.
static void SpatialSizes(::benchmark::Benchmark* b) {
  for (int batch : {1, 8, 32}) {
    for (int c : {64, 128}) {
      for (int h : {16, 32}) {
        b->Args({batch, c, h});
      }
    }
  }
}
// Register reduction benchmarks over the shared size grids.
BENCHMARK(BM_FullReduction<internal::SumReducer<Scalar>>)->Apply(ReductionSizes)->Name("SumReduction");
BENCHMARK(BM_FullReduction<internal::MaxReducer<Scalar>>)->Apply(ReductionSizes)->Name("MaxReduction_Full");
BENCHMARK(BM_MaxReduction)->Apply(ReductionSizes);
BENCHMARK(BM_ReduceInner)->Apply(ReductionSizes);
BENCHMARK(BM_ReduceOuter)->Apply(ReductionSizes);
BENCHMARK(BM_ReduceSpatial)->Apply(SpatialSizes);
BENCHMARK(BM_FullReduction_ThreadPool)->Apply(ThreadPoolReductionSizes);

View File

@@ -0,0 +1,115 @@
// Benchmarks for Eigen Tensor shuffling (transpose / permutation).
#include <benchmark/benchmark.h>
#include <unsupported/Eigen/CXX11/Tensor>
using namespace Eigen;
typedef float Scalar;
// --- Rank-2 transpose ---
// Out-of-place rank-2 transpose via shuffle.
static void BM_Shuffle2D(benchmark::State& state) {
  const int rows = state.range(0);
  const int cols = state.range(1);
  Tensor<Scalar, 2> src(rows, cols);
  src.setRandom();
  Tensor<Scalar, 2> dst(cols, rows);
  Eigen::array<int, 2> transpose = {1, 0};
  for (auto _ : state) {
    dst = src.shuffle(transpose);
    benchmark::DoNotOptimize(dst.data());
    benchmark::ClobberMemory();
  }
  state.SetBytesProcessed(state.iterations() * rows * cols * sizeof(Scalar) * 2);
}
// --- Identity shuffle (no permutation, measures overhead) ---
// Same as BM_Shuffle2D but with the identity permutation {0, 1}, isolating
// the shuffle expression's bookkeeping cost from actual data reordering.
static void BM_ShuffleIdentity(benchmark::State& state) {
  const int rows = state.range(0);
  const int cols = state.range(1);
  Tensor<Scalar, 2> src(rows, cols);
  Tensor<Scalar, 2> dst(rows, cols);
  src.setRandom();
  const Eigen::array<int, 2> identity = {0, 1};
  for (auto _ : state) {
    dst = src.shuffle(identity);
    benchmark::DoNotOptimize(dst.data());
    benchmark::ClobberMemory();
  }
  state.SetBytesProcessed(state.iterations() * rows * cols * sizeof(Scalar) * 2);
}
// --- Rank-3 permutation ---
// Applies the cyclic permutation (2, 0, 1) to a rank-3 tensor. The result
// tensor is constructed inside the loop, so allocation is part of the
// measured cost (matching the rank-4 benchmark below).
static void BM_Shuffle3D(benchmark::State& state) {
  const int d0 = state.range(0);
  const int d1 = state.range(1);
  const int d2 = state.range(2);
  Tensor<Scalar, 3> src(d0, d1, d2);
  src.setRandom();
  const Eigen::array<int, 3> cycle = {2, 0, 1};
  for (auto _ : state) {
    Tensor<Scalar, 3> dst = src.shuffle(cycle);
    benchmark::DoNotOptimize(dst.data());
    benchmark::ClobberMemory();
  }
  state.SetBytesProcessed(state.iterations() * d0 * d1 * d2 * sizeof(Scalar) * 2);
}
// --- Rank-4 permutation (NCHW -> NHWC layout conversion) ---
// Converts a square-image batch (width == height == H) between the two
// common deep-learning layouts via the permutation (0, 2, 3, 1).
static void BM_Shuffle4D_NCHW_to_NHWC(benchmark::State& state) {
  const int batch = state.range(0);
  const int channels = state.range(1);
  const int side = state.range(2);
  Tensor<Scalar, 4> src(batch, channels, side, side);
  src.setRandom();
  const Eigen::array<int, 4> nchw_to_nhwc = {0, 2, 3, 1};
  for (auto _ : state) {
    Tensor<Scalar, 4> dst = src.shuffle(nchw_to_nhwc);
    benchmark::DoNotOptimize(dst.data());
    benchmark::ClobberMemory();
  }
  state.SetBytesProcessed(state.iterations() * batch * channels * side * side * sizeof(Scalar) * 2);
}
// Rank-2 shuffle sizes: two squares plus two skewed aspect ratios that
// stress row-major vs. column-major access patterns.
static void Shuffle2DSizes(::benchmark::Benchmark* b) {
  const int kSquares[] = {256, 1024};
  for (int n : kSquares) b->Args({n, n});
  b->Args({64, 4096});
  b->Args({4096, 64});
}
// Rank-3 shuffle shapes: a cube plus two asymmetric boxes.
static void Shuffle3DSizes(::benchmark::Benchmark* b) {
  const long kShapes[][3] = {{64, 64, 64}, {128, 128, 64}, {32, 256, 256}};
  for (const auto& shape : kShapes) b->Args({shape[0], shape[1], shape[2]});
}
// Rank-4 layout-conversion grid. Arg layout: {batch, channels, side};
// the benchmark uses `side` for both spatial dimensions.
static void Shuffle4DSizes(::benchmark::Benchmark* b) {
  const int kBatches[] = {1, 8};
  const int kChannels[] = {3, 64};  // RGB input vs. a typical feature map
  const int kSides[] = {32, 64};
  for (int batch : kBatches)
    for (int channels : kChannels)
      for (int side : kSides) b->Args({batch, channels, side});
}
// Benchmark registrations for the shuffle/permutation benchmarks.
BENCHMARK(BM_Shuffle2D)->Apply(Shuffle2DSizes);
BENCHMARK(BM_ShuffleIdentity)->Apply(Shuffle2DSizes);
BENCHMARK(BM_Shuffle3D)->Apply(Shuffle3DSizes);
BENCHMARK(BM_Shuffle4D_NCHW_to_NHWC)->Apply(Shuffle4DSizes);

View File

@@ -0,0 +1,80 @@
// Benchmarks for Eigen Tensor FFT.
#include <benchmark/benchmark.h>
#include <unsupported/Eigen/CXX11/Tensor>
using namespace Eigen;
// Scalar type is configurable from the build system (e.g. -DSCALAR=double);
// defaults to float.
#ifndef SCALAR
#define SCALAR float
#endif
typedef SCALAR Scalar;
// --- 1D FFT ---
// Forward transform of a real-valued length-N tensor to a complex result.
// Reports an MFLOPS counter using the 5 N log2(N) convention, halved for
// the real-valued input.
static void BM_TensorFFT_1D(benchmark::State& state) {
  const int n = state.range(0);
  Tensor<Scalar, 1> signal(n);
  signal.setRandom();
  const Eigen::array<int, 1> dims = {0};
  for (auto _ : state) {
    Tensor<std::complex<Scalar>, 1> spectrum = signal.template fft<BothParts, FFT_FORWARD>(dims);
    benchmark::DoNotOptimize(spectrum.data());
    benchmark::ClobberMemory();
  }
  const double flops = 5.0 * n * std::log2(static_cast<double>(n)) / 2.0;  // real->complex
  state.counters["MFLOPS"] =
      benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
}
// --- 2D FFT ---
// Forward transform over both dimensions of an NxN real-valued tensor.
static void BM_TensorFFT_2D(benchmark::State& state) {
  const int N = state.range(0);
  Tensor<Scalar, 2> input(N, N);
  input.setRandom();
  Eigen::array<int, 2> fft_dims = {0, 1};
  for (auto _ : state) {
    Tensor<std::complex<Scalar>, 2> result = input.template fft<BothParts, FFT_FORWARD>(fft_dims);
    benchmark::DoNotOptimize(result.data());
    benchmark::ClobberMemory();
  }
  // A 2D NxN transform performs N row FFTs plus N column FFTs, i.e. 2N
  // length-N transforms at ~5 N log2(N) flops each, giving
  // 10 N^2 log2(N) == 5 * total * log2(total). The previous estimate of
  // 5 * total * log2(N) undercounted by a factor of two.
  double total = static_cast<double>(N) * N;
  double mflops = 5.0 * total * std::log2(total);
  state.counters["MFLOPS"] =
      benchmark::Counter(mflops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
}
// --- 1D inverse FFT ---
// Inverse transform of a complex length-N tensor; MFLOPS counter uses the
// full 5 N log2(N) complex-transform convention.
static void BM_TensorIFFT_1D(benchmark::State& state) {
  const int n = state.range(0);
  Tensor<std::complex<Scalar>, 1> spectrum(n);
  spectrum.setRandom();
  const Eigen::array<int, 1> dims = {0};
  for (auto _ : state) {
    Tensor<std::complex<Scalar>, 1> signal = spectrum.template fft<BothParts, FFT_REVERSE>(dims);
    benchmark::DoNotOptimize(signal.data());
    benchmark::ClobberMemory();
  }
  const double flops = 5.0 * n * std::log2(static_cast<double>(n));
  state.counters["MFLOPS"] =
      benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
}
// Power-of-two transform lengths for all FFT benchmarks.
static void FFTSizes(::benchmark::Benchmark* b) {
  const int kLengths[] = {64, 256, 1024, 4096};
  for (int n : kLengths) b->Arg(n);
}
// Benchmark registrations for the tensor FFT benchmarks.
BENCHMARK(BM_TensorFFT_1D)->Apply(FFTSizes);
BENCHMARK(BM_TensorFFT_2D)->Apply(FFTSizes);
BENCHMARK(BM_TensorIFFT_1D)->Apply(FFTSizes);