diff --git a/benchmarks/Cholesky/CMakeLists.txt b/benchmarks/Cholesky/CMakeLists.txt
index 8cf6a3c01..1565448be 100644
--- a/benchmarks/Cholesky/CMakeLists.txt
+++ b/benchmarks/Cholesky/CMakeLists.txt
@@ -1 +1,2 @@
 eigen_add_benchmark(bench_cholesky bench_cholesky.cpp)
+eigen_add_benchmark(bench_cholesky_double bench_cholesky.cpp DEFINITIONS SCALAR=double)
diff --git a/benchmarks/Cholesky/bench_cholesky.cpp b/benchmarks/Cholesky/bench_cholesky.cpp
index cbce03175..5990e82c1 100644
--- a/benchmarks/Cholesky/bench_cholesky.cpp
+++ b/benchmarks/Cholesky/bench_cholesky.cpp
@@ -4,7 +4,11 @@
 
 using namespace Eigen;
 
-typedef float Scalar;
+#ifndef SCALAR
+#define SCALAR float
+#endif
+
+typedef SCALAR Scalar;
 
 static void BM_LDLT(benchmark::State& state) {
   int n = state.range(0);
diff --git a/benchmarks/Core/CMakeLists.txt b/benchmarks/Core/CMakeLists.txt
index 7cc62d1d7..a95e9bb2a 100644
--- a/benchmarks/Core/CMakeLists.txt
+++ b/benchmarks/Core/CMakeLists.txt
@@ -15,3 +15,5 @@ eigen_add_benchmark(bench_diagonal bench_diagonal.cpp)
 eigen_add_benchmark(bench_triangular_product bench_triangular_product.cpp)
 eigen_add_benchmark(bench_selfadjoint_product bench_selfadjoint_product.cpp)
 eigen_add_benchmark(bench_construction bench_construction.cpp)
+eigen_add_benchmark(bench_fixed_size bench_fixed_size.cpp)
+eigen_add_benchmark(bench_fixed_size_double bench_fixed_size.cpp DEFINITIONS SCALAR=double)
diff --git a/benchmarks/Core/bench_fixed_size.cpp b/benchmarks/Core/bench_fixed_size.cpp
new file mode 100644
index 000000000..f7b674d3a
--- /dev/null
+++ b/benchmarks/Core/bench_fixed_size.cpp
@@ -0,0 +1,123 @@
+// Benchmarks for fixed-size matrix operations (2x2, 3x3, 4x4).
+// Critical for PCL, ROS, Sophus, Drake which use small matrices extensively.
+
+#include <benchmark/benchmark.h>
+#include <Eigen/Core>
+#include <Eigen/LU>
+
+using namespace Eigen;
+
+#ifndef SCALAR
+#define SCALAR float
+#endif
+
+typedef SCALAR Scalar;
+
+// --- Fixed-size GEMM ---
+template <int N>
+static void BM_FixedGemm(benchmark::State& state) {
+  typedef Matrix<Scalar, N, N> Mat;
+  Mat a = Mat::Random();
+  Mat b = Mat::Random();
+  Mat c;
+
+  for (auto _ : state) {
+    c.noalias() = a * b;
+    benchmark::DoNotOptimize(c.data());
+    benchmark::ClobberMemory();
+  }
+  state.counters["GFLOPS"] =
+      benchmark::Counter(2.0 * N * N * N, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
+}
+
+// --- Fixed-size inverse ---
+template <int N>
+static void BM_FixedInverse(benchmark::State& state) {
+  typedef Matrix<Scalar, N, N> Mat;
+  Mat a = Mat::Random();
+  // Make well-conditioned.
+  a = a * a.transpose() + Mat::Identity();
+  Mat result;
+
+  for (auto _ : state) {
+    result = a.inverse();
+    benchmark::DoNotOptimize(result.data());
+    benchmark::ClobberMemory();
+  }
+}
+
+// --- Fixed-size determinant ---
+template <int N>
+static void BM_FixedDeterminant(benchmark::State& state) {
+  typedef Matrix<Scalar, N, N> Mat;
+  Mat a = Mat::Random();
+  Scalar result;
+
+  for (auto _ : state) {
+    result = a.determinant();
+    benchmark::DoNotOptimize(&result);
+    benchmark::ClobberMemory();
+  }
+}
+
+// --- Batch transform: Matrix4 * Matrix<4,N> ---
+static void BM_BatchTransform4xN(benchmark::State& state) {
+  int N = state.range(0);
+  typedef Matrix<Scalar, 4, 4> Mat4;
+  typedef Matrix<Scalar, 4, Dynamic> MatXN;
+
+  Mat4 transform = Mat4::Random();
+  MatXN points = MatXN::Random(4, N);
+  MatXN result(4, N);
+
+  for (auto _ : state) {
+    result.noalias() = transform * points;
+    benchmark::DoNotOptimize(result.data());
+    benchmark::ClobberMemory();
+  }
+  state.counters["GFLOPS"] =
+      benchmark::Counter(2.0 * 4 * 4 * N, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
+}
+
+// --- Fixed 3x3 batch operations (common in point cloud processing) ---
+static void BM_Batch3x3Gemm(benchmark::State& state) {
+  int count = state.range(0);
+  typedef Matrix<Scalar, 3, 3> Mat3;
+
+  std::vector<Mat3> a(count), b(count), c(count);
+  for (int i = 0; i < count; ++i) {
+    a[i] = Mat3::Random();
+    b[i] = Mat3::Random();
+  }
+
+  for (auto _ : state) {
+    for (int i = 0; i < count; ++i) {
+      c[i].noalias() = a[i] * b[i];
+    }
+    benchmark::DoNotOptimize(c.data());
+    benchmark::ClobberMemory();
+  }
+  state.counters["GFLOPS"] =
+      benchmark::Counter(2.0 * 27 * count, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
+}
+
+// Fixed-size GEMM
+BENCHMARK(BM_FixedGemm<2>)->Name("FixedGemm_2x2");
+BENCHMARK(BM_FixedGemm<3>)->Name("FixedGemm_3x3");
+BENCHMARK(BM_FixedGemm<4>)->Name("FixedGemm_4x4");
+
+// Fixed-size inverse
+BENCHMARK(BM_FixedInverse<2>)->Name("FixedInverse_2x2");
+BENCHMARK(BM_FixedInverse<3>)->Name("FixedInverse_3x3");
+BENCHMARK(BM_FixedInverse<4>)->Name("FixedInverse_4x4");
+
+// Fixed-size determinant
+BENCHMARK(BM_FixedDeterminant<2>)->Name("FixedDet_2x2");
+BENCHMARK(BM_FixedDeterminant<3>)->Name("FixedDet_3x3");
+BENCHMARK(BM_FixedDeterminant<4>)->Name("FixedDet_4x4");
+
+// Batch 4xN transform
+BENCHMARK(BM_BatchTransform4xN)->Arg(1)->Arg(4)->Arg(8)->Arg(16)->Arg(64);
+
+// Batch 3x3 GEMM
+BENCHMARK(BM_Batch3x3Gemm)->Arg(100)->Arg(1000)->Arg(10000);
diff --git a/benchmarks/Eigenvalues/CMakeLists.txt b/benchmarks/Eigenvalues/CMakeLists.txt
index 0f4398bf1..67604770a 100644
--- a/benchmarks/Eigenvalues/CMakeLists.txt
+++ b/benchmarks/Eigenvalues/CMakeLists.txt
@@ -1,2 +1,3 @@
 eigen_add_benchmark(bench_eigensolver bench_eigensolver.cpp)
+eigen_add_benchmark(bench_eigensolver_double bench_eigensolver.cpp DEFINITIONS SCALAR=double)
 eigen_add_benchmark(bench_eig33 bench_eig33.cpp)
diff --git a/benchmarks/Eigenvalues/bench_eigensolver.cpp b/benchmarks/Eigenvalues/bench_eigensolver.cpp
index 4cb1ccb2f..2a1d853f8 100644
--- a/benchmarks/Eigenvalues/bench_eigensolver.cpp
+++ b/benchmarks/Eigenvalues/bench_eigensolver.cpp
@@ -5,7 +5,11 @@
 
 using namespace Eigen;
 
-typedef float Scalar;
+#ifndef SCALAR
+#define SCALAR float
+#endif
+
+typedef SCALAR Scalar;
 
 static void BM_SelfAdjointEigenSolver(benchmark::State& state) {
   int n = state.range(0);
diff --git a/benchmarks/FFT/bench_fft.cpp b/benchmarks/FFT/bench_fft.cpp
index 391178faa..4934ccaa3 100644
--- a/benchmarks/FFT/bench_fft.cpp
+++ b/benchmarks/FFT/bench_fft.cpp
@@ -33,11 +33,20 @@ static void BM_FFT(benchmark::State& state) {
       benchmark::Counter(mflops_per_iter, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
 }
 
-BENCHMARK(BM_FFT<std::complex<float>, true>)->Arg(1024)->Arg(4096);
-BENCHMARK(BM_FFT<std::complex<float>, false>)->Arg(1024)->Arg(4096);
-BENCHMARK(BM_FFT<float, true>)->Arg(1024)->Arg(4096);
-BENCHMARK(BM_FFT<float, false>)->Arg(1024)->Arg(4096);
-BENCHMARK(BM_FFT<std::complex<double>, true>)->Arg(1024)->Arg(4096);
-BENCHMARK(BM_FFT<std::complex<double>, false>)->Arg(1024)->Arg(4096);
-BENCHMARK(BM_FFT<double, true>)->Arg(1024)->Arg(4096);
-BENCHMARK(BM_FFT<double, false>)->Arg(1024)->Arg(4096);
+static void FFTSizes(::benchmark::Benchmark* b) {
+  for (int n : {64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 65536}) {
+    b->Arg(n);
+  }
+  // Non-power-of-2 sizes.
+  b->Arg(1000);
+  b->Arg(5000);
+}
+
+BENCHMARK(BM_FFT<std::complex<float>, true>)->Apply(FFTSizes);
+BENCHMARK(BM_FFT<std::complex<float>, false>)->Apply(FFTSizes);
+BENCHMARK(BM_FFT<float, true>)->Apply(FFTSizes);
+BENCHMARK(BM_FFT<float, false>)->Apply(FFTSizes);
+BENCHMARK(BM_FFT<std::complex<double>, true>)->Apply(FFTSizes);
+BENCHMARK(BM_FFT<std::complex<double>, false>)->Apply(FFTSizes);
+BENCHMARK(BM_FFT<double, true>)->Apply(FFTSizes);
+BENCHMARK(BM_FFT<double, false>)->Apply(FFTSizes);
diff --git a/benchmarks/Sparse/CMakeLists.txt b/benchmarks/Sparse/CMakeLists.txt
index 4ebb7db95..93168aca8 100644
--- a/benchmarks/Sparse/CMakeLists.txt
+++ b/benchmarks/Sparse/CMakeLists.txt
@@ -1,3 +1,4 @@
 eigen_add_benchmark(bench_spmv bench_spmv.cpp)
 eigen_add_benchmark(bench_spmm bench_spmm.cpp)
 eigen_add_benchmark(bench_sparse_transpose bench_sparse_transpose.cpp)
+eigen_add_benchmark(bench_sparse_solvers bench_sparse_solvers.cpp)
diff --git a/benchmarks/Sparse/bench_sparse_solvers.cpp b/benchmarks/Sparse/bench_sparse_solvers.cpp
new file mode 100644
index 000000000..37817c933
--- /dev/null
+++ b/benchmarks/Sparse/bench_sparse_solvers.cpp
@@ -0,0 +1,182 @@
+// Benchmarks for sparse decomposition solvers.
+// Tests SimplicialLLT, SimplicialLDLT, SparseQR, SparseLU, CG, BiCGSTAB.
+
+#include <benchmark/benchmark.h>
+#include <Eigen/Sparse>
+#include <Eigen/SparseCholesky>
+#include <Eigen/SparseLU>
+#include <Eigen/SparseQR>
+#include <Eigen/IterativeLinearSolvers>
+#include <Eigen/OrderingMethods>
+
+using namespace Eigen;
+
+typedef double Scalar;
+typedef SparseMatrix<Scalar> SpMat;
+typedef Matrix<Scalar, Dynamic, 1> Vec;
+
+// Generate a SPD banded matrix (Laplacian-like).
+static SpMat generateSPD(int n, int bandwidth) {
+  SpMat A(n, n);
+  std::vector<Triplet<Scalar>> trips;
+  trips.reserve(n * (2 * bandwidth + 1));
+  for (int i = 0; i < n; ++i) {
+    Scalar diag = 0;
+    for (int j = std::max(0, i - bandwidth); j < std::min(n, i + bandwidth + 1); ++j) {
+      if (i != j) {
+        Scalar val = -1.0 / (1 + std::abs(i - j));
+        trips.emplace_back(i, j, val);
+        diag -= val;
+      }
+    }
+    trips.emplace_back(i, i, diag + 1.0);
+  }
+  A.setFromTriplets(trips.begin(), trips.end());
+  return A;
+}
+
+// Generate a general (non-symmetric) sparse matrix with diagonal dominance.
+static SpMat generateGeneral(int n, int bandwidth) {
+  SpMat A(n, n);
+  std::vector<Triplet<Scalar>> trips;
+  trips.reserve(n * (2 * bandwidth + 1));
+  for (int i = 0; i < n; ++i) {
+    Scalar diag = 0;
+    for (int j = std::max(0, i - bandwidth); j < std::min(n, i + bandwidth + 1); ++j) {
+      if (i != j) {
+        Scalar val = -0.5 / (1 + std::abs(i - j));
+        if (j > i) val *= 1.5;
+        trips.emplace_back(i, j, val);
+        diag += std::abs(val);
+      }
+    }
+    trips.emplace_back(i, i, diag + 1.0);
+  }
+  A.setFromTriplets(trips.begin(), trips.end());
+  return A;
+}
+
+// --- SimplicialLLT ---
+static void BM_SimplicialLLT(benchmark::State& state) {
+  int n = state.range(0);
+  int bw = state.range(1);
+  SpMat A = generateSPD(n, bw);
+  Vec b = Vec::Random(n);
+
+  for (auto _ : state) {
+    SimplicialLLT<SpMat> solver(A);
+    Vec x = solver.solve(b);
+    benchmark::DoNotOptimize(x.data());
+    benchmark::ClobberMemory();
+  }
+}
+
+// --- SimplicialLDLT ---
+static void BM_SimplicialLDLT(benchmark::State& state) {
+  int n = state.range(0);
+  int bw = state.range(1);
+  SpMat A = generateSPD(n, bw);
+  Vec b = Vec::Random(n);
+
+  for (auto _ : state) {
+    SimplicialLDLT<SpMat> solver(A);
+    Vec x = solver.solve(b);
+    benchmark::DoNotOptimize(x.data());
+    benchmark::ClobberMemory();
+  }
+}
+
+// --- SparseLU ---
+static void BM_SparseLU(benchmark::State& state) {
+  int n = state.range(0);
+  int bw = state.range(1);
+  SpMat A = generateGeneral(n, bw);
+  Vec b = Vec::Random(n);
+
+  for (auto _ : state) {
+    SparseLU<SpMat, COLAMDOrdering<int>> solver;
+    solver.compute(A);
+    Vec x = solver.solve(b);
+    benchmark::DoNotOptimize(x.data());
+    benchmark::ClobberMemory();
+  }
+}
+
+// --- SparseQR ---
+static void BM_SparseQR(benchmark::State& state) {
+  int n = state.range(0);
+  int bw = state.range(1);
+  SpMat A = generateGeneral(n, bw);
+  Vec b = Vec::Random(n);
+
+  for (auto _ : state) {
+    SparseQR<SpMat, COLAMDOrdering<int>> solver;
+    solver.compute(A);
+    Vec x = solver.solve(b);
+    benchmark::DoNotOptimize(x.data());
+    benchmark::ClobberMemory();
+  }
+}
+
+// --- ConjugateGradient (SPD) ---
+static void BM_CG(benchmark::State& state) {
+  int n = state.range(0);
+  int bw = state.range(1);
+  SpMat A = generateSPD(n, bw);
+  Vec b = Vec::Random(n);
+
+  ConjugateGradient<SpMat> solver;
+  solver.setMaxIterations(1000);
+  solver.setTolerance(1e-10);
+  solver.compute(A);
+
+  for (auto _ : state) {
+    Vec x = solver.solve(b);
+    benchmark::DoNotOptimize(x.data());
+    benchmark::ClobberMemory();
+  }
+  state.counters["iterations"] = solver.iterations();
+}
+
+// --- BiCGSTAB (general) ---
+static void BM_BiCGSTAB(benchmark::State& state) {
+  int n = state.range(0);
+  int bw = state.range(1);
+  SpMat A = generateGeneral(n, bw);
+  Vec b = Vec::Random(n);
+
+  BiCGSTAB<SpMat> solver;
+  solver.setMaxIterations(1000);
+  solver.setTolerance(1e-10);
+  solver.compute(A);
+
+  for (auto _ : state) {
+    Vec x = solver.solve(b);
+    benchmark::DoNotOptimize(x.data());
+    benchmark::ClobberMemory();
+  }
+  state.counters["iterations"] = solver.iterations();
+}
+
+static void DirectSolverSizes(::benchmark::Benchmark* b) {
+  for (int n : {1000, 5000, 10000, 50000}) {
+    for (int bw : {5, 20}) {
+      b->Args({n, bw});
+    }
+  }
+}
+
+static void IterativeSolverSizes(::benchmark::Benchmark* b) {
+  for (int n : {1000, 10000, 50000}) {
+    for (int bw : {5, 20}) {
+      b->Args({n, bw});
+    }
+  }
+}
+
+BENCHMARK(BM_SimplicialLLT)->Apply(DirectSolverSizes);
+BENCHMARK(BM_SimplicialLDLT)->Apply(DirectSolverSizes);
+BENCHMARK(BM_SparseLU)->Apply(DirectSolverSizes);
+BENCHMARK(BM_SparseQR)->Apply(DirectSolverSizes);
+BENCHMARK(BM_CG)->Apply(IterativeSolverSizes);
+BENCHMARK(BM_BiCGSTAB)->Apply(IterativeSolverSizes);
diff --git a/unsupported/benchmarks/AutoDiff/CMakeLists.txt b/unsupported/benchmarks/AutoDiff/CMakeLists.txt
new file mode 100644
index 000000000..cb5b98687
--- /dev/null
+++ b/unsupported/benchmarks/AutoDiff/CMakeLists.txt
@@ -0,0 +1 @@
+eigen_add_benchmark(bench_autodiff bench_autodiff.cpp)
diff --git a/unsupported/benchmarks/AutoDiff/bench_autodiff.cpp b/unsupported/benchmarks/AutoDiff/bench_autodiff.cpp
new file mode 100644
index 000000000..74c334f97
--- /dev/null
+++ b/unsupported/benchmarks/AutoDiff/bench_autodiff.cpp
@@ -0,0 +1,177 @@
+// Benchmarks for Eigen AutoDiff module.
+// Compares AutoDiff Jacobian computation against NumericalDiff and hand-coded Jacobians.
+
+#include <benchmark/benchmark.h>
+#include <Eigen/Core>
+#include <unsupported/Eigen/AutoDiff>
+#include <unsupported/Eigen/NumericalDiff>
+
+using namespace Eigen;
+
+// --- Small functor: Rosenbrock-like (2 inputs -> 2 outputs) ---
+struct SmallFunctor {
+  typedef Matrix<double, 2, 1> InputType;
+  typedef Matrix<double, 2, 1> ValueType;
+  typedef Matrix<double, 2, 2> JacobianType;
+
+  enum { InputsAtCompileTime = 2, ValuesAtCompileTime = 2 };
+
+  template <typename T>
+  void operator()(const Matrix<T, 2, 1>& x, Matrix<T, 2, 1>* v) const {
+    (*v)(0) = T(1) - x(0);
+    (*v)(1) = T(10) * (x(1) - x(0) * x(0));
+  }
+};
+
+// --- Medium functor: chain of operations (6 inputs -> 6 outputs) ---
+struct MediumFunctor {
+  typedef Matrix<double, 6, 1> InputType;
+  typedef Matrix<double, 6, 1> ValueType;
+  typedef Matrix<double, 6, 6> JacobianType;
+
+  enum { InputsAtCompileTime = 6, ValuesAtCompileTime = 6 };
+
+  template <typename T>
+  void operator()(const Matrix<T, 6, 1>& x, Matrix<T, 6, 1>* v) const {
+    (*v)(0) = sin(x(0)) * cos(x(1)) + x(2) * x(2);
+    (*v)(1) = exp(x(1) * T(0.1)) + x(3);
+    (*v)(2) = x(0) * x(2) - x(4) * x(5);
+    (*v)(3) = sqrt(x(3) * x(3) + T(1)) + x(0);
+    (*v)(4) = x(4) * x(4) + x(5) * x(5) + x(0) * x(1);
+    (*v)(5) = log(x(2) * x(2) + T(1)) + x(3) * x(4);
+  }
+};
+
+// --- Dynamic-size functor (N inputs -> N outputs) ---
+struct DynamicFunctor {
+  typedef Matrix<double, Dynamic, 1> InputType;
+  typedef Matrix<double, Dynamic, 1> ValueType;
+  typedef Matrix<double, Dynamic, Dynamic> JacobianType;
+
+  const int n_;
+  DynamicFunctor(int n) : n_(n) {}
+
+  enum { InputsAtCompileTime = Dynamic, ValuesAtCompileTime = Dynamic };
+
+  int inputs() const { return n_; }
+  int values() const { return n_; }
+
+  template <typename T>
+  void operator()(const Matrix<T, Dynamic, 1>& x, Matrix<T, Dynamic, 1>* v) const {
+    v->resize(n_);
+    (*v)(0) = T(1) - x(0);
+    for (int i = 1; i < n_; ++i) {
+      (*v)(i) = T(10) * (x(i) - x(i - 1) * x(i - 1));
+    }
+  }
+};
+
+// Wrapper for NumericalDiff compatibility.
+struct SmallFunctorND : SmallFunctor {
+  typedef double Scalar;
+  int inputs() const { return 2; }
+  int values() const { return 2; }
+  int operator()(const InputType& x, ValueType& v) const {
+    SmallFunctor::operator()(x, &v);
+    return 0;
+  }
+};
+
+struct MediumFunctorND : MediumFunctor {
+  typedef double Scalar;
+  int inputs() const { return 6; }
+  int values() const { return 6; }
+  int operator()(const InputType& x, ValueType& v) const {
+    MediumFunctor::operator()(x, &v);
+    return 0;
+  }
+};
+
+// --- AutoDiff Jacobian benchmarks ---
+template <typename Functor>
+static void BM_AutoDiffJacobian(benchmark::State& state, Functor func) {
+  AutoDiffJacobian<Functor> adf(func);
+  typename Functor::InputType x = Functor::InputType::Random();
+  typename Functor::ValueType v;
+  typename Functor::JacobianType jac;
+
+  for (auto _ : state) {
+    adf(x, &v, &jac);
+    benchmark::DoNotOptimize(jac.data());
+    benchmark::ClobberMemory();
+  }
+}
+
+// --- Dynamic AutoDiff Jacobian ---
+static void BM_AutoDiffJacobian_Dynamic(benchmark::State& state) {
+  int n = state.range(0);
+  DynamicFunctor func(n);
+  AutoDiffJacobian<DynamicFunctor> adf(func);
+
+  VectorXd x = VectorXd::Random(n);
+  VectorXd v(n);
+  MatrixXd jac(n, n);
+
+  for (auto _ : state) {
+    adf(x, &v, &jac);
+    benchmark::DoNotOptimize(jac.data());
+    benchmark::ClobberMemory();
+  }
+}
+
+// --- NumericalDiff benchmarks ---
+template <typename Functor>
+static void BM_NumericalDiffJacobian(benchmark::State& state, Functor func) {
+  NumericalDiff<Functor> ndf(func);
+  typename Functor::InputType x = Functor::InputType::Random();
+  typename Functor::JacobianType jac;
+
+  for (auto _ : state) {
+    ndf.df(x, jac);
+    benchmark::DoNotOptimize(jac.data());
+    benchmark::ClobberMemory();
+  }
+}
+
+// --- Hand-coded Jacobian (Rosenbrock) for comparison ---
+static void BM_HandCoded_Small(benchmark::State& state) {
+  Vector2d x = Vector2d::Random();
+  Matrix2d jac;
+
+  for (auto _ : state) {
+    jac(0, 0) = -1;
+    jac(0, 1) = 0;
+    jac(1, 0) = -20 * x(0);
+    jac(1, 1) = 10;
+    benchmark::DoNotOptimize(jac.data());
+    benchmark::ClobberMemory();
+  }
+}
+
+// --- Scalar AutoDiff evaluation (no Jacobian, just forward pass) ---
+static void BM_AutoDiffScalar_Eval(benchmark::State& state) {
+  int n = state.range(0);
+  using ADScalar = AutoDiffScalar<VectorXd>;
+  VectorXd x = VectorXd::Random(n);
+
+  for (auto _ : state) {
+    ADScalar sum(0.0, VectorXd::Zero(n));
+    for (int i = 0; i < n; ++i) {
+      ADScalar xi(x(i), n, i);
+      sum += xi * xi + sin(xi);
+    }
+    benchmark::DoNotOptimize(sum.value());
+    benchmark::DoNotOptimize(sum.derivatives().data());
+    benchmark::ClobberMemory();
+  }
+}
+
+BENCHMARK_CAPTURE(BM_AutoDiffJacobian, Small, SmallFunctor());
+BENCHMARK_CAPTURE(BM_AutoDiffJacobian, Medium, MediumFunctor());
+BENCHMARK(BM_AutoDiffJacobian_Dynamic)->Arg(2)->Arg(6)->Arg(20)->Arg(50)->Arg(100);
+
+BENCHMARK_CAPTURE(BM_NumericalDiffJacobian, Small, SmallFunctorND());
+BENCHMARK_CAPTURE(BM_NumericalDiffJacobian, Medium, MediumFunctorND());
+
+BENCHMARK(BM_HandCoded_Small);
+BENCHMARK(BM_AutoDiffScalar_Eval)->Arg(2)->Arg(6)->Arg(20)->Arg(50)->Arg(100);
diff --git a/unsupported/benchmarks/CMakeLists.txt b/unsupported/benchmarks/CMakeLists.txt
new file mode 100644
index 000000000..ed8c88ea0
--- /dev/null
+++ b/unsupported/benchmarks/CMakeLists.txt
@@ -0,0 +1,35 @@
+cmake_minimum_required(VERSION 3.10)
+project(EigenUnsupportedBenchmarks CXX)
+
+find_package(benchmark REQUIRED)
+find_package(Threads REQUIRED)
+
+set(EIGEN_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../..")
+
+# Helper: add a Google Benchmark target (mirrors benchmarks/CMakeLists.txt).
+#   eigen_add_benchmark(name source [LIBRARIES lib1 lib2 ...] [DEFINITIONS def1 def2 ...])
+function(eigen_add_benchmark name source)
+  cmake_parse_arguments(BENCH "" "" "LIBRARIES;DEFINITIONS" ${ARGN})
+  if(NOT IS_ABSOLUTE "${source}")
+    set(source "${CMAKE_CURRENT_SOURCE_DIR}/${source}")
+  endif()
+  add_executable(${name} ${source})
+  target_include_directories(${name} PRIVATE ${EIGEN_SOURCE_DIR})
+  target_link_libraries(${name} PRIVATE benchmark::benchmark benchmark::benchmark_main
+                                        Threads::Threads)
+  if(BENCH_LIBRARIES)
+    target_link_libraries(${name} PRIVATE ${BENCH_LIBRARIES})
+  endif()
+  target_compile_options(${name} PRIVATE -O3 -DNDEBUG)
+  if(BENCH_DEFINITIONS)
+    target_compile_definitions(${name} PRIVATE ${BENCH_DEFINITIONS})
+  endif()
+endfunction()
+
+add_subdirectory(Tensor)
+add_subdirectory(MatrixFunctions)
+add_subdirectory(SpecialFunctions)
+add_subdirectory(AutoDiff)
+add_subdirectory(Splines)
+add_subdirectory(IterativeSolvers)
+add_subdirectory(KroneckerProduct)
diff --git a/unsupported/benchmarks/IterativeSolvers/CMakeLists.txt b/unsupported/benchmarks/IterativeSolvers/CMakeLists.txt
new file mode 100644
index 000000000..4ae536a31
--- /dev/null
+++ b/unsupported/benchmarks/IterativeSolvers/CMakeLists.txt
@@ -0,0 +1 @@
+eigen_add_benchmark(bench_iterative_solvers bench_iterative_solvers.cpp)
diff --git a/unsupported/benchmarks/IterativeSolvers/bench_iterative_solvers.cpp b/unsupported/benchmarks/IterativeSolvers/bench_iterative_solvers.cpp
new file mode 100644
index 000000000..5105d40e2
--- /dev/null
+++ b/unsupported/benchmarks/IterativeSolvers/bench_iterative_solvers.cpp
@@ -0,0 +1,209 @@
+// Benchmarks for unsupported iterative solvers: GMRES, MINRES, IDRS, IDRSTABL, BiCGSTABL, DGMRES.
+
+#include <benchmark/benchmark.h>
+#include <Eigen/Sparse>
+#include <Eigen/IterativeLinearSolvers>
+#include <unsupported/Eigen/IterativeSolvers>
+
+using namespace Eigen;
+
+typedef double Scalar;
+typedef SparseMatrix<Scalar> SpMat;
+typedef Matrix<Scalar, Dynamic, 1> Vec;
+
+// Generate a SPD sparse matrix (Laplacian-like with diagonal dominance).
+static SpMat generateSPD(int n, int bandwidth) {
+  SpMat A(n, n);
+  std::vector<Triplet<Scalar>> trips;
+  trips.reserve(n * (2 * bandwidth + 1));
+  for (int i = 0; i < n; ++i) {
+    Scalar diag = 0;
+    for (int j = std::max(0, i - bandwidth); j < std::min(n, i + bandwidth + 1); ++j) {
+      if (i != j) {
+        Scalar val = -1.0 / (1 + std::abs(i - j));
+        trips.emplace_back(i, j, val);
+        diag -= val;
+      }
+    }
+    trips.emplace_back(i, i, diag + 1.0);
+  }
+  A.setFromTriplets(trips.begin(), trips.end());
+  return A;
+}
+
+// Generate a general (non-symmetric) sparse matrix.
+static SpMat generateGeneral(int n, int bandwidth) {
+  SpMat A(n, n);
+  std::vector<Triplet<Scalar>> trips;
+  trips.reserve(n * (2 * bandwidth + 1));
+  for (int i = 0; i < n; ++i) {
+    Scalar diag = 0;
+    for (int j = std::max(0, i - bandwidth); j < std::min(n, i + bandwidth + 1); ++j) {
+      if (i != j) {
+        Scalar val = -0.5 / (1 + std::abs(i - j));
+        if (j > i) val *= 1.5;  // asymmetry
+        trips.emplace_back(i, j, val);
+        diag += std::abs(val);
+      }
+    }
+    trips.emplace_back(i, i, diag + 1.0);  // diagonal dominance
+  }
+  A.setFromTriplets(trips.begin(), trips.end());
+  return A;
+}
+
+// --- GMRES ---
+static void BM_GMRES(benchmark::State& state) {
+  int n = state.range(0);
+  int bw = state.range(1);
+  SpMat A = generateGeneral(n, bw);
+  Vec b = Vec::Random(n);
+
+  GMRES<SpMat> solver;
+  solver.setMaxIterations(1000);
+  solver.setTolerance(1e-10);
+  solver.compute(A);
+
+  for (auto _ : state) {
+    Vec x = solver.solve(b);
+    benchmark::DoNotOptimize(x.data());
+    benchmark::ClobberMemory();
+  }
+  state.counters["iterations"] = solver.iterations();
+}
+
+// --- DGMRES ---
+static void BM_DGMRES(benchmark::State& state) {
+  int n = state.range(0);
+  int bw = state.range(1);
+  SpMat A = generateGeneral(n, bw);
+  Vec b = Vec::Random(n);
+
+  DGMRES<SpMat> solver;
+  solver.setMaxIterations(1000);
+  solver.setTolerance(1e-10);
+  solver.compute(A);
+
+  for (auto _ : state) {
+    Vec x = solver.solve(b);
+    benchmark::DoNotOptimize(x.data());
+    benchmark::ClobberMemory();
+  }
+  state.counters["iterations"] = solver.iterations();
+}
+
+// --- MINRES (SPD matrices) ---
+static void BM_MINRES(benchmark::State& state) {
+  int n = state.range(0);
+  int bw = state.range(1);
+  SpMat A = generateSPD(n, bw);
+  Vec b = Vec::Random(n);
+
+  MINRES<SpMat> solver;
+  solver.setMaxIterations(1000);
+  solver.setTolerance(1e-10);
+  solver.compute(A);
+
+  for (auto _ : state) {
+    Vec x = solver.solve(b);
+    benchmark::DoNotOptimize(x.data());
+    benchmark::ClobberMemory();
+  }
+  state.counters["iterations"] = solver.iterations();
+}
+
+// --- IDRS ---
+static void BM_IDRS(benchmark::State& state) {
+  int n = state.range(0);
+  int bw = state.range(1);
+  SpMat A = generateGeneral(n, bw);
+  Vec b = Vec::Random(n);
+
+  IDRS<SpMat> solver;
+  solver.setMaxIterations(1000);
+  solver.setTolerance(1e-10);
+  solver.compute(A);
+
+  for (auto _ : state) {
+    Vec x = solver.solve(b);
+    benchmark::DoNotOptimize(x.data());
+    benchmark::ClobberMemory();
+  }
+  state.counters["iterations"] = solver.iterations();
+}
+
+// --- BiCGSTABL ---
+static void BM_BiCGSTABL(benchmark::State& state) {
+  int n = state.range(0);
+  int bw = state.range(1);
+  SpMat A = generateGeneral(n, bw);
+  Vec b = Vec::Random(n);
+
+  BiCGSTABL<SpMat> solver;
+  solver.setMaxIterations(1000);
+  solver.setTolerance(1e-10);
+  solver.compute(A);
+
+  for (auto _ : state) {
+    Vec x = solver.solve(b);
+    benchmark::DoNotOptimize(x.data());
+    benchmark::ClobberMemory();
+  }
+  state.counters["iterations"] = solver.iterations();
+}
+
+// --- Compare with CG (supported module, SPD only) ---
+static void BM_CG_Reference(benchmark::State& state) {
+  int n = state.range(0);
+  int bw = state.range(1);
+  SpMat A = generateSPD(n, bw);
+  Vec b = Vec::Random(n);
+
+  ConjugateGradient<SpMat> solver;
+  solver.setMaxIterations(1000);
+  solver.setTolerance(1e-10);
+  solver.compute(A);
+
+  for (auto _ : state) {
+    Vec x = solver.solve(b);
+    benchmark::DoNotOptimize(x.data());
+    benchmark::ClobberMemory();
+  }
+  state.counters["iterations"] = solver.iterations();
+}
+
+// --- Compare with BiCGSTAB (supported module, general) ---
+static void BM_BiCGSTAB_Reference(benchmark::State& state) {
+  int n = state.range(0);
+  int bw = state.range(1);
+  SpMat A = generateGeneral(n, bw);
+  Vec b = Vec::Random(n);
+
+  BiCGSTAB<SpMat> solver;
+  solver.setMaxIterations(1000);
+  solver.setTolerance(1e-10);
+  solver.compute(A);
+
+  for (auto _ : state) {
+    Vec x = solver.solve(b);
+    benchmark::DoNotOptimize(x.data());
+    benchmark::ClobberMemory();
+  }
+  state.counters["iterations"] = solver.iterations();
+}
+
+static void SolverSizes(::benchmark::Benchmark* b) {
+  for (int n : {1000, 10000, 100000}) {
+    for (int bw : {5, 20}) {
+      b->Args({n, bw});
+    }
+  }
+}
+
+BENCHMARK(BM_GMRES)->Apply(SolverSizes);
+BENCHMARK(BM_DGMRES)->Apply(SolverSizes);
+BENCHMARK(BM_MINRES)->Apply(SolverSizes);
+BENCHMARK(BM_IDRS)->Apply(SolverSizes);
+BENCHMARK(BM_BiCGSTABL)->Apply(SolverSizes);
+BENCHMARK(BM_CG_Reference)->Apply(SolverSizes);
+BENCHMARK(BM_BiCGSTAB_Reference)->Apply(SolverSizes);
diff --git a/unsupported/benchmarks/KroneckerProduct/CMakeLists.txt b/unsupported/benchmarks/KroneckerProduct/CMakeLists.txt
new file mode 100644
index 000000000..ff0d28969
--- /dev/null
+++ b/unsupported/benchmarks/KroneckerProduct/CMakeLists.txt
@@ -0,0 +1 @@
+eigen_add_benchmark(bench_kronecker bench_kronecker.cpp)
diff --git a/unsupported/benchmarks/KroneckerProduct/bench_kronecker.cpp b/unsupported/benchmarks/KroneckerProduct/bench_kronecker.cpp
new file mode 100644
index 000000000..882887e2c
--- /dev/null
+++ b/unsupported/benchmarks/KroneckerProduct/bench_kronecker.cpp
@@ -0,0 +1,83 @@
+// Benchmarks for Kronecker product (dense and sparse).
+
+#include <benchmark/benchmark.h>
+#include <Eigen/Core>
+#include <Eigen/Sparse>
+#include <unsupported/Eigen/KroneckerProduct>
+
+using namespace Eigen;
+
+typedef double Scalar;
+typedef Matrix<Scalar, Dynamic, Dynamic> Mat;
+typedef SparseMatrix<Scalar> SpMat;
+
+// --- Dense Kronecker product ---
+static void BM_KroneckerDense(benchmark::State& state) {
+  int na = state.range(0);
+  int nb = state.range(1);
+
+  Mat A = Mat::Random(na, na);
+  Mat B = Mat::Random(nb, nb);
+
+  for (auto _ : state) {
+    Mat C = kroneckerProduct(A, B).eval();
+    benchmark::DoNotOptimize(C.data());
+    benchmark::ClobberMemory();
+  }
+  int outSize = na * nb;
+  state.counters["output_size"] = outSize;
+}
+
+// --- Sparse Kronecker product ---
+static void BM_KroneckerSparse(benchmark::State& state) {
+  int na = state.range(0);
+  int nb = state.range(1);
+
+  // Create sparse identity-like matrices with some fill.
+  SpMat A(na, na);
+  SpMat B(nb, nb);
+
+  std::vector<Triplet<Scalar>> tripsA, tripsB;
+  for (int i = 0; i < na; ++i) {
+    tripsA.emplace_back(i, i, 2.0);
+    if (i + 1 < na) {
+      tripsA.emplace_back(i, i + 1, -1.0);
+      tripsA.emplace_back(i + 1, i, -1.0);
+    }
+  }
+  for (int i = 0; i < nb; ++i) {
+    tripsB.emplace_back(i, i, 2.0);
+    if (i + 1 < nb) {
+      tripsB.emplace_back(i, i + 1, -1.0);
+      tripsB.emplace_back(i + 1, i, -1.0);
+    }
+  }
+  A.setFromTriplets(tripsA.begin(), tripsA.end());
+  B.setFromTriplets(tripsB.begin(), tripsB.end());
+
+  for (auto _ : state) {
+    SpMat C = kroneckerProduct(A, B).eval();
+    benchmark::DoNotOptimize(C.valuePtr());
+    benchmark::ClobberMemory();
+  }
+  state.counters["output_size"] = na * nb;
+}
+
+static void KroneckerSizes(::benchmark::Benchmark* b) {
+  for (int na : {4, 8, 16}) {
+    for (int nb : {4, 8, 16}) {
+      b->Args({na, nb});
+    }
+  }
+}
+
+static void KroneckerSparseSizes(::benchmark::Benchmark* b) {
+  for (int na : {16, 32, 64, 128}) {
+    for (int nb : {16, 32, 64, 128}) {
+      b->Args({na, nb});
+    }
+  }
+}
+
+BENCHMARK(BM_KroneckerDense)->Apply(KroneckerSizes);
+BENCHMARK(BM_KroneckerSparse)->Apply(KroneckerSparseSizes);
diff --git a/unsupported/benchmarks/MatrixFunctions/CMakeLists.txt b/unsupported/benchmarks/MatrixFunctions/CMakeLists.txt
new file mode 100644
index 000000000..2a612ed3c
--- /dev/null
+++ b/unsupported/benchmarks/MatrixFunctions/CMakeLists.txt
@@ -0,0 +1,3 @@
+eigen_add_benchmark(bench_matrix_exponential bench_matrix_exponential.cpp)
+eigen_add_benchmark(bench_matrix_logarithm bench_matrix_logarithm.cpp)
+eigen_add_benchmark(bench_matrix_power bench_matrix_power.cpp)
diff --git a/unsupported/benchmarks/MatrixFunctions/bench_matrix_exponential.cpp b/unsupported/benchmarks/MatrixFunctions/bench_matrix_exponential.cpp
new file mode 100644
index 000000000..3778b769c
--- /dev/null
+++ b/unsupported/benchmarks/MatrixFunctions/bench_matrix_exponential.cpp
@@ -0,0 +1,52 @@
+// Benchmarks for matrix exponential.
+// Critical for Sophus Lie group operations (SLAM, visual odometry).
+
+#include <benchmark/benchmark.h>
+#include <Eigen/Core>
+#include <unsupported/Eigen/MatrixFunctions>
+
+using namespace Eigen;
+
+#ifndef SCALAR
+#define SCALAR double
+#endif
+
+typedef SCALAR Scalar;
+
+static void BM_MatrixExp(benchmark::State& state) {
+  int n = state.range(0);
+  typedef Matrix<Scalar, Dynamic, Dynamic> MatrixType;
+
+  // Generate a random matrix with reasonable spectral radius.
+  MatrixType A = MatrixType::Random(n, n) / Scalar(n);
+  MatrixType result(n, n);
+
+  for (auto _ : state) {
+    result = A.exp();
+    benchmark::DoNotOptimize(result.data());
+    benchmark::ClobberMemory();
+  }
+}
+
+// Fixed-size specializations for Lie group sizes.
+template <int N>
+static void BM_MatrixExp_Fixed(benchmark::State& state) {
+  typedef Matrix<Scalar, N, N> MatrixType;
+
+  MatrixType A = MatrixType::Random() / Scalar(N);
+  MatrixType result;
+
+  for (auto _ : state) {
+    result = A.exp();
+    benchmark::DoNotOptimize(result.data());
+    benchmark::ClobberMemory();
+  }
+}
+
+// Dynamic sizes: Lie groups (2,3,4) plus larger.
+BENCHMARK(BM_MatrixExp)->Arg(2)->Arg(3)->Arg(4)->Arg(8)->Arg(16)->Arg(32)->Arg(64)->Arg(128);
+
+// Fixed-size Lie group dimensions.
+BENCHMARK(BM_MatrixExp_Fixed<2>);
+BENCHMARK(BM_MatrixExp_Fixed<3>);
+BENCHMARK(BM_MatrixExp_Fixed<4>);
diff --git a/unsupported/benchmarks/MatrixFunctions/bench_matrix_logarithm.cpp b/unsupported/benchmarks/MatrixFunctions/bench_matrix_logarithm.cpp
new file mode 100644
index 000000000..4d1c8eadf
--- /dev/null
+++ b/unsupported/benchmarks/MatrixFunctions/bench_matrix_logarithm.cpp
@@ -0,0 +1,51 @@
+// Benchmarks for matrix logarithm.
+// Inverse of matrix exponential, used for Lie group log maps.
+
+#include <benchmark/benchmark.h>
+#include <Eigen/Core>
+#include <unsupported/Eigen/MatrixFunctions>
+
+using namespace Eigen;
+
+#ifndef SCALAR
+#define SCALAR double
+#endif
+
+typedef SCALAR Scalar;
+
+static void BM_MatrixLog(benchmark::State& state) {
+  int n = state.range(0);
+  typedef Matrix<Scalar, Dynamic, Dynamic> MatrixType;
+
+  // Generate a matrix close to identity for stable log computation.
+  MatrixType A = MatrixType::Identity(n, n) + MatrixType::Random(n, n) / Scalar(n * 2);
+  // Ensure A is in the principal branch by computing exp(small matrix).
+  A = (MatrixType::Random(n, n) / Scalar(n * 4)).exp();
+  MatrixType result(n, n);
+
+  for (auto _ : state) {
+    result = A.log();
+    benchmark::DoNotOptimize(result.data());
+    benchmark::ClobberMemory();
+  }
+}
+
+template <int N>
+static void BM_MatrixLog_Fixed(benchmark::State& state) {
+  typedef Matrix<Scalar, N, N> MatrixType;
+
+  MatrixType A = (MatrixType::Random() / Scalar(N * 4)).exp();
+  MatrixType result;
+
+  for (auto _ : state) {
+    result = A.log();
+    benchmark::DoNotOptimize(result.data());
+    benchmark::ClobberMemory();
+  }
+}
+
+BENCHMARK(BM_MatrixLog)->Arg(2)->Arg(3)->Arg(4)->Arg(8)->Arg(16)->Arg(32)->Arg(64);
+
+BENCHMARK(BM_MatrixLog_Fixed<2>);
+BENCHMARK(BM_MatrixLog_Fixed<3>);
+BENCHMARK(BM_MatrixLog_Fixed<4>);
diff --git a/unsupported/benchmarks/MatrixFunctions/bench_matrix_power.cpp b/unsupported/benchmarks/MatrixFunctions/bench_matrix_power.cpp
new file mode 100644
index 000000000..a1ebea074
--- /dev/null
+++ b/unsupported/benchmarks/MatrixFunctions/bench_matrix_power.cpp
@@ -0,0 +1,99 @@
+// Benchmarks for matrix power functions: sqrt, pow, cos, sin, cosh, sinh.
+
+#include <benchmark/benchmark.h>
+#include <Eigen/Core>
+#include <unsupported/Eigen/MatrixFunctions>
+
+using namespace Eigen;
+
+typedef double Scalar;
+typedef Matrix<Scalar, Dynamic, Dynamic> Mat;
+
+static void BM_MatrixSqrt(benchmark::State& state) {
+  int n = state.range(0);
+  // SPD matrix has well-defined sqrt.
+  Mat tmp = Mat::Random(n, n);
+  Mat A = tmp * tmp.transpose() + Mat::Identity(n, n);
+  Mat result(n, n);
+
+  for (auto _ : state) {
+    result = A.sqrt();
+    benchmark::DoNotOptimize(result.data());
+    benchmark::ClobberMemory();
+  }
+}
+
+static void BM_MatrixPow(benchmark::State& state) {
+  int n = state.range(0);
+  Mat tmp = Mat::Random(n, n);
+  Mat A = tmp * tmp.transpose() + Mat::Identity(n, n);
+  Mat result(n, n);
+  Scalar p = 2.5;
+
+  for (auto _ : state) {
+    result = A.pow(p);
+    benchmark::DoNotOptimize(result.data());
+    benchmark::ClobberMemory();
+  }
+}
+
+static void BM_MatrixCos(benchmark::State& state) {
+  int n = state.range(0);
+  Mat A = Mat::Random(n, n) / Scalar(n);
+  Mat result(n, n);
+
+  for (auto _ : state) {
+    result = A.cos();
+    benchmark::DoNotOptimize(result.data());
+    benchmark::ClobberMemory();
+  }
+}
+
+static void BM_MatrixSin(benchmark::State& state) {
+  int n = state.range(0);
+  Mat A = Mat::Random(n, n) / Scalar(n);
+  Mat result(n, n);
+
+  for (auto _ : state) {
+    result = A.sin();
+    benchmark::DoNotOptimize(result.data());
+    benchmark::ClobberMemory();
+  }
+}
+
+static void BM_MatrixCosh(benchmark::State& state) {
+  int n = state.range(0);
+  Mat A = Mat::Random(n, n) / Scalar(n);
+  Mat result(n, n);
+
+  for (auto _ : state) {
+    result = A.cosh();
+    benchmark::DoNotOptimize(result.data());
+    benchmark::ClobberMemory();
+  }
+}
+
+static void BM_MatrixSinh(benchmark::State& state) {
+  int n = state.range(0);
+  Mat A = Mat::Random(n, n) / Scalar(n);
+  Mat result(n, n);
+
+  for (auto _ : state) {
+    result = A.sinh();
+    benchmark::DoNotOptimize(result.data());
+    benchmark::ClobberMemory();
+  }
+}
+
+static void MatPowerSizes(::benchmark::Benchmark* b) {
+  for (int n : {4, 8, 16, 32, 64}) {
+    b->Arg(n);
+  }
+}
+
+BENCHMARK(BM_MatrixSqrt)->Apply(MatPowerSizes);
+BENCHMARK(BM_MatrixPow)->Apply(MatPowerSizes);
+BENCHMARK(BM_MatrixCos)->Apply(MatPowerSizes);
+BENCHMARK(BM_MatrixSin)->Apply(MatPowerSizes);
+BENCHMARK(BM_MatrixCosh)->Apply(MatPowerSizes);
+BENCHMARK(BM_MatrixSinh)->Apply(MatPowerSizes);
diff --git a/unsupported/benchmarks/SpecialFunctions/CMakeLists.txt b/unsupported/benchmarks/SpecialFunctions/CMakeLists.txt
new file mode 100644
index 000000000..a6734137a
--- /dev/null
+++ b/unsupported/benchmarks/SpecialFunctions/CMakeLists.txt
@@ -0,0 +1 @@
+eigen_add_benchmark(bench_special_functions bench_special_functions.cpp)
diff --git a/unsupported/benchmarks/SpecialFunctions/bench_special_functions.cpp b/unsupported/benchmarks/SpecialFunctions/bench_special_functions.cpp
new file mode 100644
index 000000000..dc6505de5
--- /dev/null
+++ b/unsupported/benchmarks/SpecialFunctions/bench_special_functions.cpp
@@ -0,0 +1,127 @@
+// Benchmarks for special functions beyond what bench_cwise_math.cpp covers.
+// Includes Bessel functions, two-argument functions (igamma, betainc),
+// and additional functions (lgamma, digamma, zeta, polygamma).
+
+#include <benchmark/benchmark.h>
+#include <Eigen/Core>
+#include <unsupported/Eigen/SpecialFunctions>
+
+using namespace Eigen;
+
+// Macro for unary special functions on arrays.
+#define BENCH_SPECIAL_UNARY(NAME, EXPR, LO, HI)                                                          \
+  template <typename Scalar>                                                                             \
+  static void BM_##NAME(benchmark::State& state) {                                                       \
+    const Index n = state.range(0);                                                                      \
+    using Arr = Array<Scalar, Dynamic, 1>;                                                               \
+    Arr a = (Arr::Random(n) + Scalar(1)) * Scalar((double(HI) - double(LO)) / 2.0) + Scalar(LO);         \
+    Arr b(n);                                                                                            \
+    for (auto _ : state) {                                                                               \
+      b = EXPR;                                                                                          \
+      benchmark::DoNotOptimize(b.data());                                                                \
+    }                                                                                                    \
+    state.counters["Elements/s"] = benchmark::Counter(n, benchmark::Counter::kIsIterationInvariantRate); \
+    state.SetBytesProcessed(state.iterations() * n * sizeof(Scalar) * 2);                                \
+  }
+
+// Macro for binary special functions on arrays.
+#define BENCH_SPECIAL_BINARY(NAME, EXPR, LO_A, HI_A, LO_B, HI_B)                                         \
+  template <typename Scalar>                                                                             \
+  static void BM_##NAME(benchmark::State& state) {                                                       \
+    const Index n = state.range(0);                                                                      \
+    using Arr = Array<Scalar, Dynamic, 1>;                                                               \
+    Arr a = (Arr::Random(n) + Scalar(1)) * Scalar((double(HI_A) - double(LO_A)) / 2.0) + Scalar(LO_A);   \
+    Arr b = (Arr::Random(n) + Scalar(1)) * Scalar((double(HI_B) - double(LO_B)) / 2.0) + Scalar(LO_B);   \
+    Arr c(n);                                                                                            \
+    for (auto _ : state) {                                                                               \
+      c = EXPR;                                                                                          \
+      benchmark::DoNotOptimize(c.data());                                                                \
+    }                                                                                                    \
+    state.counters["Elements/s"] = benchmark::Counter(n, benchmark::Counter::kIsIterationInvariantRate); \
+    state.SetBytesProcessed(state.iterations() * n * sizeof(Scalar) * 3);                                \
+  }
+
+// --- Unary special functions ---
+BENCH_SPECIAL_UNARY(Lgamma, Eigen::lgamma(a), 0.1, 20)
+BENCH_SPECIAL_UNARY(Digamma, Eigen::digamma(a), 0.1, 20)
+
+// --- Bessel functions (first kind) ---
+BENCH_SPECIAL_UNARY(BesselI0, Eigen::bessel_i0(a), 0, 10)
+BENCH_SPECIAL_UNARY(BesselI1, Eigen::bessel_i1(a), 0, 10)
+BENCH_SPECIAL_UNARY(BesselI0e, Eigen::bessel_i0e(a), 0, 100)
+BENCH_SPECIAL_UNARY(BesselI1e, Eigen::bessel_i1e(a), 0, 100)
+BENCH_SPECIAL_UNARY(BesselJ0, Eigen::bessel_j0(a), 0, 20)
+BENCH_SPECIAL_UNARY(BesselJ1, Eigen::bessel_j1(a), 0, 20)
+
+// --- Bessel functions (second kind) ---
+BENCH_SPECIAL_UNARY(BesselY0, Eigen::bessel_y0(a), 0.1, 20)
+BENCH_SPECIAL_UNARY(BesselY1, Eigen::bessel_y1(a), 0.1, 20)
+BENCH_SPECIAL_UNARY(BesselK0, Eigen::bessel_k0(a), 0.1, 20)
+BENCH_SPECIAL_UNARY(BesselK1, Eigen::bessel_k1(a), 0.1, 20)
+BENCH_SPECIAL_UNARY(BesselK0e, Eigen::bessel_k0e(a), 0.1, 100)
+BENCH_SPECIAL_UNARY(BesselK1e, Eigen::bessel_k1e(a), 0.1, 100)
+
+// --- Two-argument functions ---
+BENCH_SPECIAL_BINARY(Igamma, Eigen::igamma(a, b), 0.1, 10, 0.1, 10)
+BENCH_SPECIAL_BINARY(Igammac, Eigen::igammac(a, b), 0.1, 10, 0.1, 10)
+BENCH_SPECIAL_BINARY(Zeta, Eigen::zeta(a, b), 1.1, 10, 0.1, 10)
+BENCH_SPECIAL_BINARY(Polygamma, Eigen::polygamma(a, b), 1, 4, 0.1, 10)
+
+// --- Ternary: betainc ---
+template <typename Scalar>
+static void BM_Betainc(benchmark::State& state) {
+  const Index n = state.range(0);
+  using Arr = Array<Scalar, Dynamic, 1>;
+  Arr a = (Arr::Random(n) + Scalar(1)) * Scalar(2.5) + Scalar(0.5);  // [0.5, 5.5]
+  Arr b = (Arr::Random(n) + Scalar(1)) * Scalar(2.5) + Scalar(0.5);
+  Arr x = (Arr::Random(n) + Scalar(1)) * Scalar(0.5);  // [0, 1]
+  Arr result(n);
+  for (auto _ : state) {
+    result = Eigen::betainc(a, b, x);
+    benchmark::DoNotOptimize(result.data());
+  }
+  state.counters["Elements/s"] = benchmark::Counter(n, benchmark::Counter::kIsIterationInvariantRate);
+  state.SetBytesProcessed(state.iterations() * n * sizeof(Scalar) * 4);
+}
+
+static void SpecialSizes(::benchmark::Benchmark* b) {
+  for (int n : {256, 4096, 65536, 1048576}) b->Arg(n);
+}
+
+// --- Register float ---
+BENCHMARK(BM_Lgamma<float>)->Apply(SpecialSizes)->Name("Lgamma_float");
+BENCHMARK(BM_Digamma<float>)->Apply(SpecialSizes)->Name("Digamma_float");
+BENCHMARK(BM_BesselI0<float>)->Apply(SpecialSizes)->Name("BesselI0_float");
+BENCHMARK(BM_BesselI1<float>)->Apply(SpecialSizes)->Name("BesselI1_float");
+BENCHMARK(BM_BesselI0e<float>)->Apply(SpecialSizes)->Name("BesselI0e_float");
+BENCHMARK(BM_BesselI1e<float>)->Apply(SpecialSizes)->Name("BesselI1e_float");
+BENCHMARK(BM_BesselJ0<float>)->Apply(SpecialSizes)->Name("BesselJ0_float");
+BENCHMARK(BM_BesselJ1<float>)->Apply(SpecialSizes)->Name("BesselJ1_float");
+BENCHMARK(BM_BesselY0<float>)->Apply(SpecialSizes)->Name("BesselY0_float");
+BENCHMARK(BM_BesselY1<float>)->Apply(SpecialSizes)->Name("BesselY1_float");
+BENCHMARK(BM_BesselK0<float>)->Apply(SpecialSizes)->Name("BesselK0_float");
+BENCHMARK(BM_BesselK1<float>)->Apply(SpecialSizes)->Name("BesselK1_float");
+BENCHMARK(BM_BesselK0e<float>)->Apply(SpecialSizes)->Name("BesselK0e_float");
+BENCHMARK(BM_BesselK1e<float>)->Apply(SpecialSizes)->Name("BesselK1e_float");
+BENCHMARK(BM_Igamma<float>)->Apply(SpecialSizes)->Name("Igamma_float");
+BENCHMARK(BM_Igammac<float>)->Apply(SpecialSizes)->Name("Igammac_float");
+BENCHMARK(BM_Betainc<float>)->Apply(SpecialSizes)->Name("Betainc_float");
+BENCHMARK(BM_Zeta<float>)->Apply(SpecialSizes)->Name("Zeta_float");
+BENCHMARK(BM_Polygamma<float>)->Apply(SpecialSizes)->Name("Polygamma_float");
+
+// --- Register double ---
+BENCHMARK(BM_Lgamma<double>)->Apply(SpecialSizes)->Name("Lgamma_double");
+BENCHMARK(BM_Digamma<double>)->Apply(SpecialSizes)->Name("Digamma_double");
+BENCHMARK(BM_BesselI0<double>)->Apply(SpecialSizes)->Name("BesselI0_double");
+BENCHMARK(BM_BesselI1<double>)->Apply(SpecialSizes)->Name("BesselI1_double");
+BENCHMARK(BM_BesselJ0<double>)->Apply(SpecialSizes)->Name("BesselJ0_double");
+BENCHMARK(BM_BesselJ1<double>)->Apply(SpecialSizes)->Name("BesselJ1_double");
+BENCHMARK(BM_BesselY0<double>)->Apply(SpecialSizes)->Name("BesselY0_double");
+BENCHMARK(BM_BesselY1<double>)->Apply(SpecialSizes)->Name("BesselY1_double");
+BENCHMARK(BM_BesselK0<double>)->Apply(SpecialSizes)->Name("BesselK0_double");
+BENCHMARK(BM_BesselK1<double>)->Apply(SpecialSizes)->Name("BesselK1_double");
+BENCHMARK(BM_Igamma<double>)->Apply(SpecialSizes)->Name("Igamma_double");
+BENCHMARK(BM_Igammac<double>)->Apply(SpecialSizes)->Name("Igammac_double");
+BENCHMARK(BM_Betainc<double>)->Apply(SpecialSizes)->Name("Betainc_double");
+BENCHMARK(BM_Zeta<double>)->Apply(SpecialSizes)->Name("Zeta_double");
+BENCHMARK(BM_Polygamma<double>)->Apply(SpecialSizes)->Name("Polygamma_double");
diff --git a/unsupported/benchmarks/Splines/CMakeLists.txt b/unsupported/benchmarks/Splines/CMakeLists.txt
new file mode 100644
index 000000000..a54acc4b5
--- /dev/null
+++ b/unsupported/benchmarks/Splines/CMakeLists.txt
@@ -0,0 +1 @@
+eigen_add_benchmark(bench_splines bench_splines.cpp)
diff --git a/unsupported/benchmarks/Splines/bench_splines.cpp b/unsupported/benchmarks/Splines/bench_splines.cpp
new file mode 100644
index 000000000..e422a8f48
--- /dev/null
+++ b/unsupported/benchmarks/Splines/bench_splines.cpp
@@ -0,0 +1,98 @@
+// Benchmarks for Eigen Spline module.
+// Tests fitting, evaluation, and derivative computation.
+
+#include <benchmark/benchmark.h>
+#include <Eigen/Core>
+#include <unsupported/Eigen/Splines>
+
+using namespace Eigen;
+
+typedef double Scalar;
+
+// --- Spline fitting (interpolation) ---
+template <int Dim, int Degree>
+static void BM_SplineFit(benchmark::State& state) {
+  const int n = state.range(0);
+
+  typedef Spline<Scalar, Dim> SplineType;
+  typedef typename SplineType::PointType PointType;
+
+  // Generate random points.
+  Matrix<Scalar, Dim, Dynamic> pts(Dim, n);
+  pts.setRandom();
+
+  for (auto _ : state) {
+    SplineType spline = SplineFitting<SplineType>::Interpolate(pts, Degree);
+    benchmark::DoNotOptimize(spline.knots().data());
+    benchmark::ClobberMemory();
+  }
+}
+
+// --- Spline evaluation ---
+template <int Dim, int Degree>
+static void BM_SplineEval(benchmark::State& state) {
+  const int n = state.range(0);  // number of control points for fitting
+  const int neval = 1000;        // number of evaluation points
+
+  typedef Spline<Scalar, Dim> SplineType;
+
+  Matrix<Scalar, Dim, Dynamic> pts(Dim, n);
+  pts.setRandom();
+  SplineType spline = SplineFitting<SplineType>::Interpolate(pts, Degree);
+
+  // Generate evaluation parameters in [0, 1].
+  VectorXd u = VectorXd::LinSpaced(neval, 0, 1);
+
+  for (auto _ : state) {
+    for (int i = 0; i < neval; ++i) {
+      auto pt = spline(u(i));
+      benchmark::DoNotOptimize(pt.data());
+    }
+    benchmark::ClobberMemory();
+  }
+  state.counters["Evals/s"] = benchmark::Counter(neval, benchmark::Counter::kIsIterationInvariantRate);
+}
+
+// --- Spline derivative evaluation ---
+template <int Dim, int Degree>
+static void BM_SplineDerivatives(benchmark::State& state) {
+  const int n = state.range(0);
+  const int neval = 1000;
+
+  typedef Spline<Scalar, Dim> SplineType;
+
+  Matrix<Scalar, Dim, Dynamic> pts(Dim, n);
+  pts.setRandom();
+  SplineType spline = SplineFitting<SplineType>::Interpolate(pts, Degree);
+
+  VectorXd u = VectorXd::LinSpaced(neval, 0, 1);
+
+  for (auto _ : state) {
+    for (int i = 0; i < neval; ++i) {
+      auto derivs = spline.derivatives(u(i), 1);
+      benchmark::DoNotOptimize(derivs.data());
+    }
+    benchmark::ClobberMemory();
+  }
+  state.counters["Evals/s"] = benchmark::Counter(neval, benchmark::Counter::kIsIterationInvariantRate);
+}
+
+static void SplineSizes(::benchmark::Benchmark* b) {
+  for (int n : {10, 50, 200, 1000}) {
+    b->Arg(n);
+  }
+}
+
+// 2D cubic splines
+BENCHMARK(BM_SplineFit<2, 3>)->Apply(SplineSizes)->Name("SplineFit_2D_Cubic");
+BENCHMARK(BM_SplineEval<2, 3>)->Apply(SplineSizes)->Name("SplineEval_2D_Cubic");
+BENCHMARK(BM_SplineDerivatives<2, 3>)->Apply(SplineSizes)->Name("SplineDerivatives_2D_Cubic");
+
+// 3D cubic splines
+BENCHMARK(BM_SplineFit<3, 3>)->Apply(SplineSizes)->Name("SplineFit_3D_Cubic");
+BENCHMARK(BM_SplineEval<3, 3>)->Apply(SplineSizes)->Name("SplineEval_3D_Cubic");
+BENCHMARK(BM_SplineDerivatives<3, 3>)->Apply(SplineSizes)->Name("SplineDerivatives_3D_Cubic");
+
+// 2D quintic splines
+BENCHMARK(BM_SplineFit<2, 5>)->Apply(SplineSizes)->Name("SplineFit_2D_Quintic");
+BENCHMARK(BM_SplineEval<2, 5>)->Apply(SplineSizes)->Name("SplineEval_2D_Quintic");
diff --git a/unsupported/benchmarks/Tensor/CMakeLists.txt b/unsupported/benchmarks/Tensor/CMakeLists.txt
new file mode 100644
index 000000000..4dab17c48
--- /dev/null
+++ b/unsupported/benchmarks/Tensor/CMakeLists.txt
@@ -0,0 +1,8 @@
+eigen_add_benchmark(bench_contraction bench_contraction.cpp)
+eigen_add_benchmark(bench_convolution bench_convolution.cpp)
+eigen_add_benchmark(bench_reduction bench_reduction.cpp)
+eigen_add_benchmark(bench_broadcasting bench_broadcasting.cpp)
+eigen_add_benchmark(bench_shuffling bench_shuffling.cpp)
+eigen_add_benchmark(bench_tensor_fft bench_tensor_fft.cpp)
+eigen_add_benchmark(bench_morphing bench_morphing.cpp)
+eigen_add_benchmark(bench_coefficient_wise bench_coefficient_wise.cpp)
diff --git a/unsupported/benchmarks/Tensor/bench_broadcasting.cpp b/unsupported/benchmarks/Tensor/bench_broadcasting.cpp
new file mode 100644
index 000000000..01aed2048
--- /dev/null
+++ b/unsupported/benchmarks/Tensor/bench_broadcasting.cpp
@@ -0,0 +1,111 @@
+// Benchmarks for Eigen Tensor broadcasting.
+// Tests broadcasting along various dimensions and ranks.
+
+#include <benchmark/benchmark.h>
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using namespace Eigen;
+
+typedef float Scalar;
+
+// --- Broadcast row vector {1,N} -> {M,N} ---
+static void BM_BroadcastRow(benchmark::State& state) {
+  const int M = state.range(0);
+  const int N = state.range(1);
+
+  Tensor<Scalar, 2> row(1, N);
+  Tensor<Scalar, 2> result(M, N);
+  row.setRandom();
+
+  Eigen::array<int, 2> bcast = {M, 1};
+
+  for (auto _ : state) {
+    result = row.broadcast(bcast);
+    benchmark::DoNotOptimize(result.data());
+    benchmark::ClobberMemory();
+  }
+  state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar));
+}
+
+// --- Broadcast col vector {M,1} -> {M,N} ---
+static void BM_BroadcastCol(benchmark::State& state) {
+  const int M = state.range(0);
+  const int N = state.range(1);
+
+  Tensor<Scalar, 2> col(M, 1);
+  Tensor<Scalar, 2> result(M, N);
+  col.setRandom();
+
+  Eigen::array<int, 2> bcast = {1, N};
+
+  for (auto _ : state) {
+    result = col.broadcast(bcast);
+    benchmark::DoNotOptimize(result.data());
+    benchmark::ClobberMemory();
+  }
+  state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar));
+}
+
+// --- Broadcast + element-wise add (bias addition pattern) ---
+static void BM_BroadcastAdd(benchmark::State& state) {
+  const int M = state.range(0);
+  const int N = state.range(1);
+
+  Tensor<Scalar, 2> mat(M, N);
+  Tensor<Scalar, 2> bias(1, N);
+  Tensor<Scalar, 2> result(M, N);
+  mat.setRandom();
+  bias.setRandom();
+
+  Eigen::array<int, 2> bcast = {M, 1};
+
+  for (auto _ : state) {
+    result = mat + bias.broadcast(bcast);
+    benchmark::DoNotOptimize(result.data());
+    benchmark::ClobberMemory();
+  }
+  state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar) * 2);
+}
+
+// --- Rank-4 broadcast (batch x channels x 1 x 1) -> (batch x channels x H x W) ---
+static void BM_BroadcastRank4(benchmark::State& state) {
+  const int batch = state.range(0);
+  const int C = state.range(1);
+  const int H = state.range(2);
+
+  Tensor<Scalar, 4> bias(batch, C, 1, 1);
+  Tensor<Scalar, 4> result(batch, C, H, H);
+  bias.setRandom();
+
+  Eigen::array<int, 4> bcast = {1, 1, H, H};
+
+  for (auto _ : state) {
+    result = bias.broadcast(bcast);
+    benchmark::DoNotOptimize(result.data());
+    benchmark::ClobberMemory();
+  }
+  state.SetBytesProcessed(state.iterations() * batch * C * H * H * sizeof(Scalar));
+}
+
+static void BroadcastSizes(::benchmark::Benchmark* b) {
+  for (int m : {64, 256, 1024}) {
+    for (int n : {64, 256, 1024}) {
+      b->Args({m, n});
+    }
+  }
+}
+
+static void Rank4Sizes(::benchmark::Benchmark* b) {
+  for (int batch : {1, 8}) {
+    for (int c : {64, 256}) {
+      for (int h : {16, 32}) {
+        b->Args({batch, c, h});
+      }
+    }
+  }
+}
+
+BENCHMARK(BM_BroadcastRow)->Apply(BroadcastSizes);
+BENCHMARK(BM_BroadcastCol)->Apply(BroadcastSizes);
+BENCHMARK(BM_BroadcastAdd)->Apply(BroadcastSizes);
+BENCHMARK(BM_BroadcastRank4)->Apply(Rank4Sizes);
diff --git a/unsupported/benchmarks/Tensor/bench_coefficient_wise.cpp b/unsupported/benchmarks/Tensor/bench_coefficient_wise.cpp
new file mode 100644
index 000000000..aed482811
--- /dev/null
+++ b/unsupported/benchmarks/Tensor/bench_coefficient_wise.cpp
@@ -0,0 +1,131 @@
+// Benchmarks for Eigen Tensor coefficient-wise operations.
+// Covers activation functions, normalization, and element-wise arithmetic.
+
+#include <benchmark/benchmark.h>
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using namespace Eigen;
+
+typedef float Scalar;
+
+// Macro to define a benchmark for a unary tensor operation.
+#define BENCH_TENSOR_UNARY(NAME, EXPR)                                        \
+  static void BM_##NAME(benchmark::State& state) {                            \
+    const int M = state.range(0);                                             \
+    const int N = state.range(1);                                             \
+    Tensor<Scalar, 2> a(M, N);                                                \
+    a.setRandom();                                                            \
+    Tensor<Scalar, 2> b(M, N);                                                \
+    for (auto _ : state) {                                                    \
+      b = EXPR;                                                               \
+      benchmark::DoNotOptimize(b.data());                                     \
+      benchmark::ClobberMemory();                                             \
+    }                                                                         \
+    state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar) * 2); \
+  }
+
+BENCH_TENSOR_UNARY(Exp, a.exp())
+BENCH_TENSOR_UNARY(Log, a.abs().log())
+BENCH_TENSOR_UNARY(Tanh, a.tanh())
+BENCH_TENSOR_UNARY(Sigmoid, a.sigmoid())
+BENCH_TENSOR_UNARY(ReLU, a.cwiseMax(Scalar(0)))
+BENCH_TENSOR_UNARY(Sqrt, a.abs().sqrt())
+
+// --- Element-wise binary operations ---
+static void BM_Add(benchmark::State& state) {
+  const int M = state.range(0);
+  const int N = state.range(1);
+
+  Tensor<Scalar, 2> a(M, N);
+  Tensor<Scalar, 2> b(M, N);
+  Tensor<Scalar, 2> c(M, N);
+  a.setRandom();
+  b.setRandom();
+
+  for (auto _ : state) {
+    c = a + b;
+    benchmark::DoNotOptimize(c.data());
+    benchmark::ClobberMemory();
+  }
+  state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar) * 3);
+}
+
+static void BM_Mul(benchmark::State& state) {
+  const int M = state.range(0);
+  const int N = state.range(1);
+
+  Tensor<Scalar, 2> a(M, N);
+  Tensor<Scalar, 2> b(M, N);
+  Tensor<Scalar, 2> c(M, N);
+  a.setRandom();
+  b.setRandom();
+
+  for (auto _ : state) {
+    c = a * b;
+    benchmark::DoNotOptimize(c.data());
+    benchmark::ClobberMemory();
+  }
+  state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar) * 3);
+}
+
+// --- Fused multiply-add ---
+static void BM_FMA(benchmark::State& state) {
+  const int M = state.range(0);
+  const int N = state.range(1);
+
+  Tensor<Scalar, 2> a(M, N);
+  Tensor<Scalar, 2> b(M, N);
+  Tensor<Scalar, 2> c(M, N);
+  Tensor<Scalar, 2> d(M, N);
+  a.setRandom();
+  b.setRandom();
+  c.setRandom();
+
+  for (auto _ : state) {
+    d = a * b + c;
+    benchmark::DoNotOptimize(d.data());
+    benchmark::ClobberMemory();
+  }
+  state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar) * 4);
+}
+
+// --- Rank-4 coefficient-wise (CNN feature maps) ---
+static void BM_ReLU_Rank4(benchmark::State& state) {
+  const int batch = state.range(0);
+  const int C = state.range(1);
+  const int H = state.range(2);
+
+  Tensor<Scalar, 4> a(batch, C, H, H);
+  Tensor<Scalar, 4> b(batch, C, H, H);
+  a.setRandom();
+
+  for (auto _ : state) {
+    b = a.cwiseMax(Scalar(0));
+    benchmark::DoNotOptimize(b.data());
+    benchmark::ClobberMemory();
+  }
+  state.SetBytesProcessed(state.iterations() * batch * C * H * H * sizeof(Scalar) * 2);
+}
+
+static void CwiseSizes(::benchmark::Benchmark* b) {
+  for (int size : {256, 1024}) {
+    b->Args({size, size});
+  }
+}
+
+static void Rank4Sizes(::benchmark::Benchmark* b) {
+  b->Args({32, 64, 16});
+  b->Args({8, 128, 32});
+  b->Args({1, 256, 64});
+}
+
+BENCHMARK(BM_Exp)->Apply(CwiseSizes);
+BENCHMARK(BM_Log)->Apply(CwiseSizes);
+BENCHMARK(BM_Tanh)->Apply(CwiseSizes);
+BENCHMARK(BM_Sigmoid)->Apply(CwiseSizes);
+BENCHMARK(BM_ReLU)->Apply(CwiseSizes);
+BENCHMARK(BM_Sqrt)->Apply(CwiseSizes);
+BENCHMARK(BM_Add)->Apply(CwiseSizes);
+BENCHMARK(BM_Mul)->Apply(CwiseSizes);
+BENCHMARK(BM_FMA)->Apply(CwiseSizes);
+BENCHMARK(BM_ReLU_Rank4)->Apply(Rank4Sizes);
diff --git a/unsupported/benchmarks/Tensor/bench_contraction.cpp b/unsupported/benchmarks/Tensor/bench_contraction.cpp
new file mode 100644
index 000000000..0b71ebfe3
--- /dev/null
+++ b/unsupported/benchmarks/Tensor/bench_contraction.cpp
@@ -0,0 +1,148 @@
+// Benchmarks for Eigen Tensor contraction (generalized GEMM).
+// Tests single-threaded (DefaultDevice) and multi-threaded (ThreadPoolDevice) variants.
+
+#define EIGEN_USE_THREADS
+
+#include <benchmark/benchmark.h>
+#include <unsupported/Eigen/CXX11/Tensor>
+#include <unsupported/Eigen/CXX11/ThreadPool>
+
+using namespace Eigen;
+
+#ifndef SCALAR
+#define SCALAR float
+#endif
+
+typedef SCALAR Scalar;
+
+// --- DefaultDevice contraction (rank-2, equivalent to matrix multiply) ---
+static void BM_Contraction(benchmark::State& state) {
+  const int M = state.range(0);
+  const int N = state.range(1);
+  const int K = state.range(2);
+
+  Tensor<Scalar, 2> A(M, K);
+  Tensor<Scalar, 2> B(K, N);
+  Tensor<Scalar, 2> C(M, N);
+  A.setRandom();
+  B.setRandom();
+
+  using ContractDims = Tensor<Scalar, 2>::DimensionPair;
+  Eigen::array<ContractDims, 1> contract_dims = {ContractDims(1, 0)};
+
+  for (auto _ : state) {
+    C = A.contract(B, contract_dims);
+    benchmark::DoNotOptimize(C.data());
+    benchmark::ClobberMemory();
+  }
+  state.counters["GFLOPS"] =
+      benchmark::Counter(2.0 * M * N * K, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
+}
+
+// --- ThreadPoolDevice contraction ---
+static void BM_Contraction_ThreadPool(benchmark::State& state) {
+  const int M = state.range(0);
+  const int N = state.range(1);
+  const int K = state.range(2);
+  const int threads = state.range(3);
+
+  Tensor<Scalar, 2> A(M, K);
+  Tensor<Scalar, 2> B(K, N);
+  Tensor<Scalar, 2> C(M, N);
+  A.setRandom();
+  B.setRandom();
+
+  ThreadPool tp(threads);
+  ThreadPoolDevice dev(&tp, threads);
+
+  using ContractDims = Tensor<Scalar, 2>::DimensionPair;
+  Eigen::array<ContractDims, 1> contract_dims = {ContractDims(1, 0)};
+
+  for (auto _ : state) {
+    C.device(dev) = A.contract(B, contract_dims);
+    benchmark::DoNotOptimize(C.data());
+    benchmark::ClobberMemory();
+  }
+  state.counters["GFLOPS"] =
+      benchmark::Counter(2.0 * M * N * K, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
+  state.counters["threads"] = threads;
+}
+
+// --- Rank-3 batch contraction ---
+static void BM_BatchContraction(benchmark::State& state) {
+  const int batch = state.range(0);
+  const int M = state.range(1);
+  const int N = state.range(2);
+  const int K = state.range(3);
+
+  Tensor<Scalar, 3> A(batch, M, K);
+  Tensor<Scalar, 3> B(batch, K, N);
+  Tensor<Scalar, 3> C(batch, M, N);
+  A.setRandom();
+  B.setRandom();
+
+  using ContractDims = Tensor<Scalar, 3>::DimensionPair;
+  Eigen::array<ContractDims, 1> contract_dims = {ContractDims(2, 1)};
+
+  for (auto _ : state) {
+    C = A.contract(B, contract_dims);
+    benchmark::DoNotOptimize(C.data());
+    benchmark::ClobberMemory();
+  }
+  state.counters["GFLOPS"] = benchmark::Counter(2.0 * batch * M * N * K, benchmark::Counter::kIsIterationInvariantRate,
+                                                benchmark::Counter::kIs1000);
+}
+
+// --- RowMajor contraction ---
+static void BM_Contraction_RowMajor(benchmark::State& state) {
+  const int M = state.range(0);
+  const int N = state.range(1);
+  const int K = state.range(2);
+
+  Tensor<Scalar, 2, RowMajor> A(M, K);
+  Tensor<Scalar, 2, RowMajor> B(K, N);
+  Tensor<Scalar, 2, RowMajor> C(M, N);
+  A.setRandom();
+  B.setRandom();
+
+  using ContractDims = Tensor<Scalar, 2, RowMajor>::DimensionPair;
+  Eigen::array<ContractDims, 1> contract_dims = {ContractDims(1, 0)};
+
+  for (auto _ : state) {
+    C = A.contract(B, contract_dims);
+    benchmark::DoNotOptimize(C.data());
+    benchmark::ClobberMemory();
+  }
+  state.counters["GFLOPS"] =
+      benchmark::Counter(2.0 * M * N * K, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
+}
+
+static void ContractionSizes(::benchmark::Benchmark* b) {
+  for (int size : {32, 64, 128, 256, 512, 1024}) {
+    b->Args({size, size, size});
+  }
+  // Non-square
+  b->Args({256, 256, 1024});
+  b->Args({1024, 64, 64});
+}
+
+static void ThreadPoolSizes(::benchmark::Benchmark* b) {
+  for (int size : {64, 256, 512, 1024}) {
+    for (int threads : {2, 4, 8}) {
+      b->Args({size, size, size, threads});
+    }
+  }
+}
+
+static void BatchSizes(::benchmark::Benchmark* b) {
+  for (int batch : {1, 8, 32}) {
+    for (int size : {64, 256}) {
+      b->Args({batch, size, size, size});
+    }
+  }
+}
+
+BENCHMARK(BM_Contraction)->Apply(ContractionSizes);
+BENCHMARK(BM_Contraction_RowMajor)->Apply(ContractionSizes);
+BENCHMARK(BM_Contraction_ThreadPool)->Apply(ThreadPoolSizes);
+BENCHMARK(BM_BatchContraction)->Apply(BatchSizes);
diff --git a/unsupported/benchmarks/Tensor/bench_convolution.cpp b/unsupported/benchmarks/Tensor/bench_convolution.cpp
new file mode 100644
index 000000000..46e44eeb8
--- /dev/null
+++ b/unsupported/benchmarks/Tensor/bench_convolution.cpp
@@ -0,0 +1,151 @@
+// Benchmarks for Eigen Tensor convolution (1D and 2D).
+
+#define EIGEN_USE_THREADS
+
+#include <benchmark/benchmark.h>
+#include <unsupported/Eigen/CXX11/Tensor>
+#include <unsupported/Eigen/CXX11/ThreadPool>
+
+using namespace Eigen;
+
+typedef float Scalar;
+
+// --- 1D convolution ---
+static void BM_Convolve1D(benchmark::State& state) {
+  const int input_size = state.range(0);
+  const int kernel_size = state.range(1);
+
+  Tensor<Scalar, 1> input(input_size);
+  Tensor<Scalar, 1> kernel(kernel_size);
+  input.setRandom();
+  kernel.setRandom();
+
+  Eigen::array<int, 1> dims = {0};
+
+  for (auto _ : state) {
+    Tensor<Scalar, 1> result = input.convolve(kernel, dims);
+    benchmark::DoNotOptimize(result.data());
+    benchmark::ClobberMemory();
+  }
+  double flops = 2.0 * (input_size - kernel_size + 1) * kernel_size;
+  state.counters["GFLOPS"] =
+      benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
+}
+
+// --- 2D convolution ---
+static void BM_Convolve2D(benchmark::State& state) {
+  const int H = state.range(0);
+  const int W = state.range(1);
+  const int kH = state.range(2);
+  const int kW = state.range(3);
+
+  Tensor<Scalar, 2> input(H, W);
+  Tensor<Scalar, 2> kernel(kH, kW);
+  input.setRandom();
+  kernel.setRandom();
+
+  Eigen::array<int, 2> dims = {0, 1};
+
+  for (auto _ : state) {
+    Tensor<Scalar, 2> result = input.convolve(kernel, dims);
+    benchmark::DoNotOptimize(result.data());
+    benchmark::ClobberMemory();
+  }
+  double flops = 2.0 * (H - kH + 1) * (W - kW + 1) * kH * kW;
+  state.counters["GFLOPS"] =
+      benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
+}
+
+// --- 2D convolution with channels (rank-3: C x H x W, convolve on H,W) ---
+static void BM_Convolve2D_Channels(benchmark::State& state) {
+  const int C = state.range(0);
+  const int H = state.range(1);
+  const int kH = state.range(2);
+
+  Tensor<Scalar, 3> input(C, H, H);
+  Tensor<Scalar, 2> kernel(kH, kH);
+  input.setRandom();
+  kernel.setRandom();
+
+  Eigen::array<int, 2> dims = {1, 2};
+
+  for (auto _ : state) {
+    Tensor<Scalar, 3> result = input.convolve(kernel, dims);
+    benchmark::DoNotOptimize(result.data());
+    benchmark::ClobberMemory();
+  }
+  int outH = H - kH + 1;
+  double flops = 2.0 * C * outH * outH * kH * kH;
+  state.counters["GFLOPS"] =
+      benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
+}
+
+// --- 2D convolution with ThreadPool ---
+static void BM_Convolve2D_ThreadPool(benchmark::State& state) {
+  const int H = state.range(0);
+  const int kH = state.range(1);
+  const int threads = state.range(2);
+
+  Tensor<Scalar, 2> input(H, H);
+  Tensor<Scalar, 2> kernel(kH, kH);
+  Tensor<Scalar, 2> result(H - kH + 1, H - kH + 1);
+  input.setRandom();
+  kernel.setRandom();
+
+  ThreadPool tp(threads);
+  ThreadPoolDevice dev(&tp, threads);
+
+  Eigen::array<int, 2> dims = {0, 1};
+
+  for (auto _ : state) {
+    result.device(dev) = input.convolve(kernel, dims);
+    benchmark::DoNotOptimize(result.data());
+    benchmark::ClobberMemory();
+  }
+  int outH = H - kH + 1;
+  double flops = 2.0 * outH * outH * kH * kH;
+  state.counters["GFLOPS"] =
+      benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
+  state.counters["threads"] = threads;
+}
+
+static void Conv1DSizes(::benchmark::Benchmark* b) {
+  for (int input : {128, 512, 2048}) {
+    for (int kernel : {3, 5, 11}) {
+      b->Args({input, kernel});
+    }
+  }
+}
+
+static void Conv2DSizes(::benchmark::Benchmark* b) {
+  for (int hw : {32, 64, 128, 224}) {
+    for (int k : {3, 5, 7}) {
+      b->Args({hw, hw, k, k});
+    }
+  }
+}
+
+static void Conv2DChannelSizes(::benchmark::Benchmark* b) {
+  for (int c : {3, 64, 128}) {
+    for (int hw : {16, 32, 56}) {
+      for (int k : {3, 5}) {
+        b->Args({c, hw, k});
+      }
+    }
+  }
+}
+
+static void Conv2DThreadPoolSizes(::benchmark::Benchmark* b) {
+  for (int hw : {64, 128, 224}) {
+    for (int k : {3, 5}) {
+      for (int threads : {2, 4, 8}) {
+        b->Args({hw, k, threads});
+      }
+    }
+  }
+}
+
+BENCHMARK(BM_Convolve1D)->Apply(Conv1DSizes);
+BENCHMARK(BM_Convolve2D)->Apply(Conv2DSizes);
+BENCHMARK(BM_Convolve2D_Channels)->Apply(Conv2DChannelSizes);
+BENCHMARK(BM_Convolve2D_ThreadPool)->Apply(Conv2DThreadPoolSizes);
diff --git a/unsupported/benchmarks/Tensor/bench_morphing.cpp b/unsupported/benchmarks/Tensor/bench_morphing.cpp
new file mode 100644
index 000000000..8d226e757
--- /dev/null
+++ b/unsupported/benchmarks/Tensor/bench_morphing.cpp
@@ -0,0 +1,142 @@
+// Benchmarks for Eigen Tensor morphing operations: reshape, slice, chip, pad, stride.
+
+#include <benchmark/benchmark.h>
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using namespace Eigen;
+
+typedef float Scalar;
+
+// --- Reshape (zero-cost if no evaluation needed; force eval via assignment) ---
+static void BM_Reshape(benchmark::State& state) {
+  const int M = state.range(0);
+  const int N = state.range(1);
+
+  Tensor<Scalar, 2> A(M, N);
+  A.setRandom();
+
+  Eigen::array<Index, 1> new_shape = {M * N};
+
+  for (auto _ : state) {
+    Tensor<Scalar, 1> B = A.reshape(new_shape);
+    benchmark::DoNotOptimize(B.data());
+    benchmark::ClobberMemory();
+  }
+  state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar));
+}
+
+// --- Slice ---
+static void BM_Slice(benchmark::State& state) {
+  const int M = state.range(0);
+  const int N = state.range(1);
+
+  Tensor<Scalar, 2> A(M, N);
+  A.setRandom();
+
+  int sliceM = M / 2;
+  int sliceN = N / 2;
+  Eigen::array<Index, 2> offsets = {0, 0};
+  Eigen::array<Index, 2> extents = {sliceM, sliceN};
+
+  for (auto _ : state) {
+    Tensor<Scalar, 2> B = A.slice(offsets, extents);
+    benchmark::DoNotOptimize(B.data());
+    benchmark::ClobberMemory();
+  }
+  state.SetBytesProcessed(state.iterations() * sliceM * sliceN * sizeof(Scalar));
+}
+
+// --- Chip (extract a sub-tensor along one dimension) ---
+static void BM_Chip(benchmark::State& state) {
+  const int D0 = state.range(0);
+  const int D1 = state.range(1);
+  const int D2 = state.range(2);
+
+  Tensor<Scalar, 3> A(D0, D1, D2);
+  A.setRandom();
+
+  for (auto _ : state) {
+    Tensor<Scalar, 2> B = A.chip(0, 0);
+    benchmark::DoNotOptimize(B.data());
+    benchmark::ClobberMemory();
+  }
+  state.SetBytesProcessed(state.iterations() * D1 * D2 * sizeof(Scalar));
+}
+
+// --- Pad ---
+static void BM_Pad(benchmark::State& state) {
+  const int M = state.range(0);
+  const int N = state.range(1);
+  const int padSize = state.range(2);
+
+  Tensor<Scalar, 2> A(M, N);
+  A.setRandom();
+
+  Eigen::array<std::pair<int, int>, 2> paddings;
+  paddings[0] = {padSize, padSize};
+  paddings[1] = {padSize, padSize};
+
+  for (auto _ : state) {
+    Tensor<Scalar, 2> B = A.pad(paddings);
+    benchmark::DoNotOptimize(B.data());
+    benchmark::ClobberMemory();
+  }
+  int outM = M + 2 * padSize;
+  int outN = N + 2 * padSize;
+  state.SetBytesProcessed(state.iterations() * outM * outN * sizeof(Scalar));
+}
+
+// --- Stride ---
+static void BM_Stride(benchmark::State& state) {
+  const int M = state.range(0);
+  const int N = state.range(1);
+  const int stride = state.range(2);
+
+  Tensor<Scalar, 2> A(M, N);
+  A.setRandom();
+
+  Eigen::array<Index, 2> strides_arr = {stride, stride};
+
+  for (auto _ : state) {
+    Tensor<Scalar, 2> B = A.stride(strides_arr);
+    benchmark::DoNotOptimize(B.data());
+    benchmark::ClobberMemory();
+  }
+  int outM = (M + stride - 1) / stride;
+  int outN = (N + stride - 1) / stride;
+  state.SetBytesProcessed(state.iterations() * outM * outN * sizeof(Scalar));
+}
+
+static void MorphSizes(::benchmark::Benchmark* b) {
+  for (int size : {256, 1024}) {
+    b->Args({size, size});
+  }
+}
+
+static void ChipSizes(::benchmark::Benchmark* b) {
+  b->Args({32, 256, 256});
+  b->Args({64, 128, 128});
+  b->Args({8, 512, 512});
+}
+
+static void PadSizes(::benchmark::Benchmark* b) {
+  for (int size : {256, 1024}) {
+    for (int pad : {1, 4, 16}) {
+      b->Args({size, size, pad});
+    }
+  }
+}
+
+static void StrideSizes(::benchmark::Benchmark* b) {
+  for (int size : {256, 1024}) {
+    for (int stride : {2, 4}) {
+      b->Args({size, size, stride});
+    }
+  }
+}
+
+BENCHMARK(BM_Reshape)->Apply(MorphSizes);
+BENCHMARK(BM_Slice)->Apply(MorphSizes);
+BENCHMARK(BM_Chip)->Apply(ChipSizes);
+BENCHMARK(BM_Pad)->Apply(PadSizes);
+BENCHMARK(BM_Stride)->Apply(StrideSizes);
diff --git a/unsupported/benchmarks/Tensor/bench_reduction.cpp b/unsupported/benchmarks/Tensor/bench_reduction.cpp
new file mode 100644
index 000000000..795c95c72
--- /dev/null
+++ b/unsupported/benchmarks/Tensor/bench_reduction.cpp
@@ -0,0 +1,158 @@
+// Benchmarks for Eigen Tensor reductions (sum, maximum, mean).
+// Tests full and partial reductions, inner vs outer dimension, DefaultDevice and ThreadPoolDevice.
+
+#define EIGEN_USE_THREADS
+
+#include <benchmark/benchmark.h>
+#include <unsupported/Eigen/CXX11/Tensor>
+#include <unsupported/Eigen/CXX11/ThreadPool>
+
+using namespace Eigen;
+
+#ifndef SCALAR
+#define SCALAR float
+#endif
+
+typedef SCALAR Scalar;
+
+// --- Full reduction (rank-2) ---
+template <typename ReduceOp>
+static void BM_FullReduction(benchmark::State& state) {
+  const int M = state.range(0);
+  const int N = state.range(1);
+
+  Tensor<Scalar, 2> A(M, N);
+  A.setRandom();
+
+  for (auto _ : state) {
+    Tensor<Scalar, 0> result = A.reduce(Eigen::array<int, 2>{0, 1}, ReduceOp());
+    benchmark::DoNotOptimize(result.data());
+    benchmark::ClobberMemory();
+  }
+  state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar));
+}
+
+// --- Partial reduction along dim 0 (inner dim, ColMajor) ---
+static void BM_ReduceInner(benchmark::State& state) {
+  const int M = state.range(0);
+  const int N = state.range(1);
+
+  Tensor<Scalar, 2> A(M, N);
+  A.setRandom();
+
+  Eigen::array<int, 1> reduce_dims = {0};
+
+  for (auto _ : state) {
+    Tensor<Scalar, 1> result = A.sum(reduce_dims);
+    benchmark::DoNotOptimize(result.data());
+    benchmark::ClobberMemory();
+  }
+  state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar));
+}
+
+// --- Partial reduction along dim 1 (outer dim, ColMajor) ---
+static void BM_ReduceOuter(benchmark::State& state) {
+  const int M = state.range(0);
+  const int N = state.range(1);
+
+  Tensor<Scalar, 2> A(M, N);
+  A.setRandom();
+
+  Eigen::array<int, 1> reduce_dims = {1};
+
+  for (auto _ : state) {
+    Tensor<Scalar, 1> result = A.sum(reduce_dims);
+    benchmark::DoNotOptimize(result.data());
+    benchmark::ClobberMemory();
+  }
+  state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar));
+}
+
+// --- Rank-4 partial reduction (batch x channels x H x W), reduce along spatial dims ---
+static void BM_ReduceSpatial(benchmark::State& state) {
+  const int batch = state.range(0);
+  const int C = state.range(1);
+  const int H = state.range(2);
+
+  Tensor<Scalar, 4> A(batch, C, H, H);
+  A.setRandom();
+
+  Eigen::array<int, 2> reduce_dims = {2, 3};
+
+  for (auto _ : state) {
+    Tensor<Scalar, 2> result = A.sum(reduce_dims);
+    benchmark::DoNotOptimize(result.data());
+    benchmark::ClobberMemory();
+  }
+  state.SetBytesProcessed(state.iterations() * batch * C * H * H * sizeof(Scalar));
+}
+
+// --- Full reduction with ThreadPoolDevice ---
+static void BM_FullReduction_ThreadPool(benchmark::State& state) {
+  const int M = state.range(0);
+  const int N = state.range(1);
+  const int threads = state.range(2);
+
+  Tensor<Scalar, 2> A(M, N);
+  Tensor<Scalar, 0> result;
+  A.setRandom();
+
+  ThreadPool tp(threads);
+  ThreadPoolDevice dev(&tp, threads);
+
+  for (auto _ : state) {
+    result.device(dev) = A.sum();
+    benchmark::DoNotOptimize(result.data());
+    benchmark::ClobberMemory();
+  }
+  state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar));
+  state.counters["threads"] = threads;
+}
+
+// --- Maximum reduction (rank-2) ---
+static void BM_MaxReduction(benchmark::State& state) {
+  const int M = state.range(0);
+  const int N = state.range(1);
+
+  Tensor<Scalar, 2> A(M, N);
+  A.setRandom();
+
+  for (auto _ : state) {
+    Tensor<Scalar, 0> result = A.maximum();
+    benchmark::DoNotOptimize(result.data());
+    benchmark::ClobberMemory();
+  }
+  state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar));
+}
+
+static void ReductionSizes(::benchmark::Benchmark* b) {
+  for (int size : {64, 256, 1024}) {
+    b->Args({size, size});
+  }
+}
+
+static void ThreadPoolReductionSizes(::benchmark::Benchmark* b) {
+  for (int size : {256, 1024}) {
+    for (int threads : {2, 4, 8}) {
+      b->Args({size, size, threads});
+    }
+  }
+}
+
+static void SpatialSizes(::benchmark::Benchmark* b) {
+  for (int batch : {1, 8, 32}) {
+    for (int c : {64, 128}) {
+      for (int h : {16, 32}) {
+        b->Args({batch, c, h});
+      }
+    }
+  }
+}
+
+BENCHMARK(BM_FullReduction<internal::SumReducer<Scalar>>)->Apply(ReductionSizes)->Name("SumReduction");
+BENCHMARK(BM_FullReduction<internal::MaxReducer<Scalar>>)->Apply(ReductionSizes)->Name("MaxReduction_Full");
+BENCHMARK(BM_MaxReduction)->Apply(ReductionSizes);
+BENCHMARK(BM_ReduceInner)->Apply(ReductionSizes);
+BENCHMARK(BM_ReduceOuter)->Apply(ReductionSizes);
+BENCHMARK(BM_ReduceSpatial)->Apply(SpatialSizes);
+BENCHMARK(BM_FullReduction_ThreadPool)->Apply(ThreadPoolReductionSizes);
diff --git a/unsupported/benchmarks/Tensor/bench_shuffling.cpp b/unsupported/benchmarks/Tensor/bench_shuffling.cpp
new file mode 100644
index 000000000..6296accc8
--- /dev/null
+++ b/unsupported/benchmarks/Tensor/bench_shuffling.cpp
@@ -0,0 +1,115 @@
+// Benchmarks for Eigen Tensor shuffling (transpose / permutation).
+
+#include <benchmark/benchmark.h>
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using namespace Eigen;
+
+typedef float Scalar;
+
+// --- Rank-2 transpose ---
+static void BM_Shuffle2D(benchmark::State& state) {
+  const int M = state.range(0);
+  const int N = state.range(1);
+
+  Tensor<Scalar, 2> A(M, N);
+  Tensor<Scalar, 2> B(N, M);
+  A.setRandom();
+
+  Eigen::array<int, 2> perm = {1, 0};
+
+  for (auto _ : state) {
+    B = A.shuffle(perm);
+    benchmark::DoNotOptimize(B.data());
+    benchmark::ClobberMemory();
+  }
+  state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar) * 2);
+}
+
+// --- Identity shuffle (no permutation, measures overhead) ---
+static void BM_ShuffleIdentity(benchmark::State& state) {
+  const int M = state.range(0);
+  const int N = state.range(1);
+
+  Tensor<Scalar, 2> A(M, N);
+  Tensor<Scalar, 2> B(M, N);
+  A.setRandom();
+
+  Eigen::array<int, 2> perm = {0, 1};
+
+  for (auto _ : state) {
+    B = A.shuffle(perm);
+    benchmark::DoNotOptimize(B.data());
+    benchmark::ClobberMemory();
+  }
+  state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar) * 2);
+}
+
+// --- Rank-3 permutation ---
+static void BM_Shuffle3D(benchmark::State& state) {
+  const int D0 = state.range(0);
+  const int D1 = state.range(1);
+  const int D2 = state.range(2);
+
+  Tensor<Scalar, 3> A(D0, D1, D2);
+  A.setRandom();
+
+  // Permutation (2, 0, 1)
+  Eigen::array<int, 3> perm = {2, 0, 1};
+
+  for (auto _ : state) {
+    Tensor<Scalar, 3> B = A.shuffle(perm);
+    benchmark::DoNotOptimize(B.data());
+    benchmark::ClobberMemory();
+  }
+  state.SetBytesProcessed(state.iterations() * D0 * D1 * D2 * sizeof(Scalar) * 2);
+}
+
+// --- Rank-4 permutation (NCHW -> NHWC layout conversion) ---
+static void BM_Shuffle4D_NCHW_to_NHWC(benchmark::State& state) {
+  const int N = state.range(0);
+  const int C = state.range(1);
+  const int H = state.range(2);
+
+  Tensor<Scalar, 4> A(N, C, H, H);
+  A.setRandom();
+
+  // NCHW -> NHWC: permute (0, 2, 3, 1)
+  Eigen::array<int, 4> perm = {0, 2, 3, 1};
+
+  for (auto _ : state) {
+    Tensor<Scalar, 4> B = A.shuffle(perm);
+    benchmark::DoNotOptimize(B.data());
+    benchmark::ClobberMemory();
+  }
+  state.SetBytesProcessed(state.iterations() * N * C * H * H * sizeof(Scalar) * 2);
+}
+
+static void Shuffle2DSizes(::benchmark::Benchmark* b) {
+  for (int size : {256, 1024}) {
+    b->Args({size, size});
+  }
+  b->Args({64, 4096});
+  b->Args({4096, 64});
+}
+
+static void Shuffle3DSizes(::benchmark::Benchmark* b) {
+  b->Args({64, 64, 64});
+  b->Args({128, 128, 64});
+  b->Args({32, 256, 256});
+}
+
+static void Shuffle4DSizes(::benchmark::Benchmark* b) {
+  for (int batch : {1, 8}) {
+    for (int c : {3, 64}) {
+      for (int h : {32, 64}) {
+        b->Args({batch, c, h});
+      }
+    }
+  }
+}
+
+BENCHMARK(BM_Shuffle2D)->Apply(Shuffle2DSizes);
+BENCHMARK(BM_ShuffleIdentity)->Apply(Shuffle2DSizes);
+BENCHMARK(BM_Shuffle3D)->Apply(Shuffle3DSizes);
+BENCHMARK(BM_Shuffle4D_NCHW_to_NHWC)->Apply(Shuffle4DSizes);
diff --git a/unsupported/benchmarks/Tensor/bench_tensor_fft.cpp b/unsupported/benchmarks/Tensor/bench_tensor_fft.cpp
new file mode 100644
index 000000000..26ff64b1e
--- /dev/null
+++ b/unsupported/benchmarks/Tensor/bench_tensor_fft.cpp
@@ -0,0 +1,80 @@
+// Benchmarks for Eigen Tensor FFT.
+
+#include <benchmark/benchmark.h>
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using namespace Eigen;
+
+#ifndef SCALAR
+#define SCALAR float
+#endif
+
+typedef SCALAR Scalar;
+
+// --- 1D FFT ---
+static void BM_TensorFFT_1D(benchmark::State& state) {
+  const int N = state.range(0);
+
+  Tensor<Scalar, 1> input(N);
+  input.setRandom();
+
+  Eigen::array<int, 1> fft_dims = {0};
+
+  for (auto _ : state) {
+    Tensor<std::complex<Scalar>, 1> result = input.template fft<BothParts, FFT_FORWARD>(fft_dims);
+    benchmark::DoNotOptimize(result.data());
+    benchmark::ClobberMemory();
+  }
+  double mflops = 5.0 * N * std::log2(static_cast<double>(N)) / 2.0;  // real->complex
+  state.counters["MFLOPS"] =
+      benchmark::Counter(mflops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
+}
+
+// --- 2D FFT ---
+static void BM_TensorFFT_2D(benchmark::State& state) {
+  const int N = state.range(0);
+
+  Tensor<Scalar, 2> input(N, N);
+  input.setRandom();
+
+  Eigen::array<int, 2> fft_dims = {0, 1};
+
+  for (auto _ : state) {
+    Tensor<std::complex<Scalar>, 2> result = input.template fft<BothParts, FFT_FORWARD>(fft_dims);
+    benchmark::DoNotOptimize(result.data());
+    benchmark::ClobberMemory();
+  }
+  double total = N * N;
+  double mflops = 5.0 * total * std::log2(static_cast<double>(N));
+  state.counters["MFLOPS"] =
+      benchmark::Counter(mflops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
+}
+
+// --- 1D inverse FFT ---
+static void BM_TensorIFFT_1D(benchmark::State& state) {
+  const int N = state.range(0);
+
+  Tensor<std::complex<Scalar>, 1> input(N);
+  input.setRandom();
+
+  Eigen::array<int, 1> fft_dims = {0};
+
+  for (auto _ : state) {
+    Tensor<std::complex<Scalar>, 1> result = input.template fft<BothParts, FFT_REVERSE>(fft_dims);
+    benchmark::DoNotOptimize(result.data());
+    benchmark::ClobberMemory();
+  }
+  double mflops = 5.0 * N * std::log2(static_cast<double>(N));
+  state.counters["MFLOPS"] =
+      benchmark::Counter(mflops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
+}
+
+static void FFTSizes(::benchmark::Benchmark* b) {
+  for (int n : {64, 256, 1024, 4096}) {
+    b->Arg(n);
+  }
+}
+
+BENCHMARK(BM_TensorFFT_1D)->Apply(FFTSizes);
+BENCHMARK(BM_TensorFFT_2D)->Apply(FFTSizes);
+BENCHMARK(BM_TensorIFFT_1D)->Apply(FFTSizes);