Add benchmarks for unsupported modules and extend supported benchmarks

libeigen/eigen!2179

Closes #3036

Co-authored-by: Rasmus Munk Larsen <rmlarsen@gmail.com>
This commit is contained in:
Rasmus Munk Larsen
2026-02-24 17:12:33 -08:00
parent fa567f6bcd
commit 16da0279f1
33 changed files with 2320 additions and 10 deletions

View File

@@ -1 +1,2 @@
eigen_add_benchmark(bench_cholesky bench_cholesky.cpp)
eigen_add_benchmark(bench_cholesky_double bench_cholesky.cpp DEFINITIONS SCALAR=double)

View File

@@ -4,7 +4,11 @@
using namespace Eigen;
#ifndef SCALAR
#define SCALAR float
#endif
typedef SCALAR Scalar;
static void BM_LDLT(benchmark::State& state) {
  int n = state.range(0);

View File

@@ -15,3 +15,5 @@ eigen_add_benchmark(bench_diagonal bench_diagonal.cpp)
eigen_add_benchmark(bench_triangular_product bench_triangular_product.cpp)
eigen_add_benchmark(bench_selfadjoint_product bench_selfadjoint_product.cpp)
eigen_add_benchmark(bench_construction bench_construction.cpp)
eigen_add_benchmark(bench_fixed_size bench_fixed_size.cpp)
eigen_add_benchmark(bench_fixed_size_double bench_fixed_size.cpp DEFINITIONS SCALAR=double)

View File

@@ -0,0 +1,123 @@
// Benchmarks for fixed-size matrix operations (2x2, 3x3, 4x4).
// Critical for PCL, ROS, Sophus, Drake which use small matrices extensively.
#include <benchmark/benchmark.h>
#include <Eigen/Core>
#include <Eigen/LU>
using namespace Eigen;
#ifndef SCALAR
#define SCALAR float
#endif
typedef SCALAR Scalar;
// --- Fixed-size GEMM ---
// Measures an N x N fixed-size matrix product, reporting throughput in GFLOPS.
template <int N>
static void BM_FixedGemm(benchmark::State& state) {
  using Mat = Matrix<Scalar, N, N>;
  const Mat lhs = Mat::Random();
  const Mat rhs = Mat::Random();
  Mat dst;
  for (auto _ : state) {
    dst.noalias() = lhs * rhs;
    benchmark::DoNotOptimize(dst.data());
    benchmark::ClobberMemory();
  }
  // 2*N^3 flops per product (N^3 multiplies + N^3 adds).
  state.counters["GFLOPS"] =
      benchmark::Counter(2.0 * N * N * N, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
}
// --- Fixed-size inverse ---
// Measures inversion of a small fixed-size matrix.
template <int N>
static void BM_FixedInverse(benchmark::State& state) {
  using Mat = Matrix<Scalar, N, N>;
  Mat m = Mat::Random();
  // Symmetrize and shift the spectrum away from zero so the inverse exists.
  m = m * m.transpose() + Mat::Identity();
  Mat inv;
  for (auto _ : state) {
    inv = m.inverse();
    benchmark::DoNotOptimize(inv.data());
    benchmark::ClobberMemory();
  }
}
// --- Fixed-size determinant ---
// Measures determinant computation for a small fixed-size matrix.
template <int N>
static void BM_FixedDeterminant(benchmark::State& state) {
  typedef Matrix<Scalar, N, N> Mat;
  Mat a = Mat::Random();
  Scalar result;
  for (auto _ : state) {
    result = a.determinant();
    // Pass the value itself (not its address) so the computed determinant is
    // what the optimizer barrier observes each iteration, matching the
    // data()-based barriers used by the other benchmarks in this file.
    benchmark::DoNotOptimize(result);
    benchmark::ClobberMemory();
  }
}
// --- Batch transform: Matrix4 * Matrix<4,N> ---
// Applies one 4x4 transform to N column vectors at once (a 4xN matrix), the
// homogeneous-coordinates batch-transform pattern.
static void BM_BatchTransform4xN(benchmark::State& state) {
  const int numPoints = state.range(0);
  using Mat4 = Matrix<Scalar, 4, 4>;
  using Mat4X = Matrix<Scalar, 4, Dynamic>;
  const Mat4 xform = Mat4::Random();
  const Mat4X pts = Mat4X::Random(4, numPoints);
  Mat4X out(4, numPoints);
  for (auto _ : state) {
    out.noalias() = xform * pts;
    benchmark::DoNotOptimize(out.data());
    benchmark::ClobberMemory();
  }
  // 2 flops per multiply-add across a 4x4 * 4xN product.
  state.counters["GFLOPS"] = benchmark::Counter(
      2.0 * 4 * 4 * numPoints, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
}
// --- Fixed 3x3 batch operations (common in point cloud processing) ---
// Multiplies many independent 3x3 matrix pairs per iteration.
static void BM_Batch3x3Gemm(benchmark::State& state) {
  const int batch = state.range(0);
  using Mat3 = Matrix<Scalar, 3, 3>;
  std::vector<Mat3> lhs(batch), rhs(batch), dst(batch);
  for (int i = 0; i < batch; ++i) {
    lhs[i] = Mat3::Random();
    rhs[i] = Mat3::Random();
  }
  for (auto _ : state) {
    for (int i = 0; i < batch; ++i) {
      dst[i].noalias() = lhs[i] * rhs[i];
    }
    benchmark::DoNotOptimize(dst.data());
    benchmark::ClobberMemory();
  }
  // 2*27 flops per 3x3 product.
  state.counters["GFLOPS"] =
      benchmark::Counter(2.0 * 27 * batch, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
}
// --- Benchmark registrations ---
// Fixed-size GEMM
BENCHMARK(BM_FixedGemm<2>)->Name("FixedGemm_2x2");
BENCHMARK(BM_FixedGemm<3>)->Name("FixedGemm_3x3");
BENCHMARK(BM_FixedGemm<4>)->Name("FixedGemm_4x4");
// Fixed-size inverse
BENCHMARK(BM_FixedInverse<2>)->Name("FixedInverse_2x2");
BENCHMARK(BM_FixedInverse<3>)->Name("FixedInverse_3x3");
BENCHMARK(BM_FixedInverse<4>)->Name("FixedInverse_4x4");
// Fixed-size determinant
BENCHMARK(BM_FixedDeterminant<2>)->Name("FixedDet_2x2");
BENCHMARK(BM_FixedDeterminant<3>)->Name("FixedDet_3x3");
BENCHMARK(BM_FixedDeterminant<4>)->Name("FixedDet_4x4");
// Batch 4xN transform (argument = number of points)
BENCHMARK(BM_BatchTransform4xN)->Arg(1)->Arg(4)->Arg(8)->Arg(16)->Arg(64);
// Batch 3x3 GEMM (argument = batch size)
BENCHMARK(BM_Batch3x3Gemm)->Arg(100)->Arg(1000)->Arg(10000);

View File

@@ -1,2 +1,3 @@
eigen_add_benchmark(bench_eigensolver bench_eigensolver.cpp)
eigen_add_benchmark(bench_eigensolver_double bench_eigensolver.cpp DEFINITIONS SCALAR=double)
eigen_add_benchmark(bench_eig33 bench_eig33.cpp)

View File

@@ -5,7 +5,11 @@
using namespace Eigen;
#ifndef SCALAR
#define SCALAR float
#endif
typedef SCALAR Scalar;
static void BM_SelfAdjointEigenSolver(benchmark::State& state) {
  int n = state.range(0);

View File

@@ -33,11 +33,20 @@ static void BM_FFT(benchmark::State& state) {
      benchmark::Counter(mflops_per_iter, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
}
static void FFTSizes(::benchmark::Benchmark* b) {
  for (int n : {64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 65536}) {
    b->Arg(n);
  }
  // Non-power-of-2 sizes.
  b->Arg(1000);
  b->Arg(5000);
}
BENCHMARK(BM_FFT<std::complex<float>, true>)->Apply(FFTSizes);
BENCHMARK(BM_FFT<std::complex<float>, false>)->Apply(FFTSizes);
BENCHMARK(BM_FFT<float, true>)->Apply(FFTSizes);
BENCHMARK(BM_FFT<float, false>)->Apply(FFTSizes);
BENCHMARK(BM_FFT<std::complex<double>, true>)->Apply(FFTSizes);
BENCHMARK(BM_FFT<std::complex<double>, false>)->Apply(FFTSizes);
BENCHMARK(BM_FFT<double, true>)->Apply(FFTSizes);
BENCHMARK(BM_FFT<double, false>)->Apply(FFTSizes);

View File

@@ -1,3 +1,4 @@
eigen_add_benchmark(bench_spmv bench_spmv.cpp)
eigen_add_benchmark(bench_spmm bench_spmm.cpp)
eigen_add_benchmark(bench_sparse_transpose bench_sparse_transpose.cpp)
eigen_add_benchmark(bench_sparse_solvers bench_sparse_solvers.cpp)

View File

@@ -0,0 +1,182 @@
// Benchmarks for sparse decomposition solvers.
// Tests SimplicialLLT, SimplicialLDLT, SparseQR, SparseLU, CG, BiCGSTAB.
#include <benchmark/benchmark.h>
#include <Eigen/Sparse>
#include <Eigen/SparseCholesky>
#include <Eigen/SparseLU>
#include <Eigen/SparseQR>
#include <Eigen/IterativeLinearSolvers>
#include <Eigen/OrderingMethods>
using namespace Eigen;
typedef double Scalar;
typedef SparseMatrix<Scalar> SpMat;
typedef Matrix<Scalar, Dynamic, 1> Vec;
// Generate a SPD banded matrix (Laplacian-like).
// Builds an n x n symmetric banded matrix (Laplacian-like): off-diagonal
// entries -1/(1+|i-j|) within `bandwidth` of the diagonal, and a diagonal set
// to the negated off-diagonal row sum plus 1, which makes the matrix strictly
// diagonally dominant and therefore symmetric positive definite.
static SpMat generateSPD(int n, int bandwidth) {
SpMat A(n, n);
std::vector<Triplet<Scalar>> trips;
// Upper bound on nonzeros: the full band plus the diagonal for each row.
trips.reserve(n * (2 * bandwidth + 1));
for (int i = 0; i < n; ++i) {
Scalar diag = 0;
for (int j = std::max(0, i - bandwidth); j < std::min(n, i + bandwidth + 1); ++j) {
if (i != j) {
Scalar val = -1.0 / (1 + std::abs(i - j));
trips.emplace_back(i, j, val);
diag -= val;
}
}
// +1 keeps the diagonal strictly larger than the off-diagonal magnitude sum.
trips.emplace_back(i, i, diag + 1.0);
}
A.setFromTriplets(trips.begin(), trips.end());
return A;
}
// Generate a general (non-symmetric) sparse matrix with diagonal dominance.
// Builds an n x n non-symmetric banded matrix: upper-triangular band entries
// are scaled by 1.5 relative to the lower ones (breaking symmetry), and the
// diagonal is the sum of off-diagonal magnitudes plus 1, so the matrix stays
// strictly diagonally dominant (hence nonsingular).
static SpMat generateGeneral(int n, int bandwidth) {
SpMat A(n, n);
std::vector<Triplet<Scalar>> trips;
// Upper bound on nonzeros: the full band plus the diagonal for each row.
trips.reserve(n * (2 * bandwidth + 1));
for (int i = 0; i < n; ++i) {
Scalar diag = 0;
for (int j = std::max(0, i - bandwidth); j < std::min(n, i + bandwidth + 1); ++j) {
if (i != j) {
Scalar val = -0.5 / (1 + std::abs(i - j));
// Asymmetry: entries above the diagonal are 1.5x those below.
if (j > i) val *= 1.5;
trips.emplace_back(i, j, val);
diag += std::abs(val);
}
}
trips.emplace_back(i, i, diag + 1.0);
}
A.setFromTriplets(trips.begin(), trips.end());
return A;
}
// --- SimplicialLLT ---
static void BM_SimplicialLLT(benchmark::State& state) {
int n = state.range(0);
int bw = state.range(1);
SpMat A = generateSPD(n, bw);
Vec b = Vec::Random(n);
for (auto _ : state) {
SimplicialLLT<SpMat> solver(A);
Vec x = solver.solve(b);
benchmark::DoNotOptimize(x.data());
benchmark::ClobberMemory();
}
}
// --- SimplicialLDLT ---
static void BM_SimplicialLDLT(benchmark::State& state) {
int n = state.range(0);
int bw = state.range(1);
SpMat A = generateSPD(n, bw);
Vec b = Vec::Random(n);
for (auto _ : state) {
SimplicialLDLT<SpMat> solver(A);
Vec x = solver.solve(b);
benchmark::DoNotOptimize(x.data());
benchmark::ClobberMemory();
}
}
// --- SparseLU ---
static void BM_SparseLU(benchmark::State& state) {
int n = state.range(0);
int bw = state.range(1);
SpMat A = generateGeneral(n, bw);
Vec b = Vec::Random(n);
for (auto _ : state) {
SparseLU<SpMat, COLAMDOrdering<int>> solver;
solver.compute(A);
Vec x = solver.solve(b);
benchmark::DoNotOptimize(x.data());
benchmark::ClobberMemory();
}
}
// --- SparseQR ---
static void BM_SparseQR(benchmark::State& state) {
int n = state.range(0);
int bw = state.range(1);
SpMat A = generateGeneral(n, bw);
Vec b = Vec::Random(n);
for (auto _ : state) {
SparseQR<SpMat, COLAMDOrdering<int>> solver;
solver.compute(A);
Vec x = solver.solve(b);
benchmark::DoNotOptimize(x.data());
benchmark::ClobberMemory();
}
}
// --- ConjugateGradient (SPD) ---
static void BM_CG(benchmark::State& state) {
int n = state.range(0);
int bw = state.range(1);
SpMat A = generateSPD(n, bw);
Vec b = Vec::Random(n);
ConjugateGradient<SpMat> solver;
solver.setMaxIterations(1000);
solver.setTolerance(1e-10);
solver.compute(A);
for (auto _ : state) {
Vec x = solver.solve(b);
benchmark::DoNotOptimize(x.data());
benchmark::ClobberMemory();
}
state.counters["iterations"] = solver.iterations();
}
// --- BiCGSTAB (general) ---
static void BM_BiCGSTAB(benchmark::State& state) {
int n = state.range(0);
int bw = state.range(1);
SpMat A = generateGeneral(n, bw);
Vec b = Vec::Random(n);
BiCGSTAB<SpMat> solver;
solver.setMaxIterations(1000);
solver.setTolerance(1e-10);
solver.compute(A);
for (auto _ : state) {
Vec x = solver.solve(b);
benchmark::DoNotOptimize(x.data());
benchmark::ClobberMemory();
}
state.counters["iterations"] = solver.iterations();
}
// Problem sizes as {matrix dimension, half-bandwidth} pairs.
static void DirectSolverSizes(::benchmark::Benchmark* b) {
  const int dims[] = {1000, 5000, 10000, 50000};
  const int bands[] = {5, 20};
  for (int n : dims)
    for (int bw : bands) b->Args({n, bw});
}
// Iterative solvers use a reduced dimension grid.
static void IterativeSolverSizes(::benchmark::Benchmark* b) {
  const int dims[] = {1000, 10000, 50000};
  const int bands[] = {5, 20};
  for (int n : dims)
    for (int bw : bands) b->Args({n, bw});
}
// Direct factorizations get the larger size grid; iterative solvers the
// reduced one (see DirectSolverSizes / IterativeSolverSizes above).
BENCHMARK(BM_SimplicialLLT)->Apply(DirectSolverSizes);
BENCHMARK(BM_SimplicialLDLT)->Apply(DirectSolverSizes);
BENCHMARK(BM_SparseLU)->Apply(DirectSolverSizes);
BENCHMARK(BM_SparseQR)->Apply(DirectSolverSizes);
BENCHMARK(BM_CG)->Apply(IterativeSolverSizes);
BENCHMARK(BM_BiCGSTAB)->Apply(IterativeSolverSizes);

View File

@@ -0,0 +1 @@
eigen_add_benchmark(bench_autodiff bench_autodiff.cpp)

View File

@@ -0,0 +1,177 @@
// Benchmarks for Eigen AutoDiff module.
// Compares AutoDiff Jacobian computation against NumericalDiff and hand-coded Jacobians.
#include <benchmark/benchmark.h>
#include <Eigen/Core>
#include <unsupported/Eigen/AutoDiff>
#include <unsupported/Eigen/NumericalDiff>
using namespace Eigen;
// --- Small functor: Rosenbrock-like (2 inputs -> 2 outputs) ---
// Rosenbrock-style residual: f0 = 1 - x0, f1 = 10*(x1 - x0^2).
// operator() is templated on the scalar type so the same code can be
// evaluated with AutoDiffScalar as well as plain double; the typedefs and
// compile-time enums follow the functor conventions used by AutoDiffJacobian.
struct SmallFunctor {
typedef Matrix<double, 2, 1> InputType;
typedef Matrix<double, 2, 1> ValueType;
typedef Matrix<double, 2, 2> JacobianType;
enum { InputsAtCompileTime = 2, ValuesAtCompileTime = 2 };
template <typename T>
void operator()(const Matrix<T, 2, 1>& x, Matrix<T, 2, 1>* v) const {
(*v)(0) = T(1) - x(0);
(*v)(1) = T(10) * (x(1) - x(0) * x(0));
}
};
// --- Medium functor: chain of operations (6 inputs -> 6 outputs) ---
// 6-input/6-output functor mixing transcendental (sin, cos, exp, sqrt, log)
// and polynomial terms, so the derivative chain exercises a variety of
// elementary-function rules. Templated on the scalar for AutoDiff evaluation.
struct MediumFunctor {
typedef Matrix<double, 6, 1> InputType;
typedef Matrix<double, 6, 1> ValueType;
typedef Matrix<double, 6, 6> JacobianType;
enum { InputsAtCompileTime = 6, ValuesAtCompileTime = 6 };
template <typename T>
void operator()(const Matrix<T, 6, 1>& x, Matrix<T, 6, 1>* v) const {
(*v)(0) = sin(x(0)) * cos(x(1)) + x(2) * x(2);
(*v)(1) = exp(x(1) * T(0.1)) + x(3);
(*v)(2) = x(0) * x(2) - x(4) * x(5);
// sqrt(x^2 + 1) is smooth everywhere (no domain issues for random inputs).
(*v)(3) = sqrt(x(3) * x(3) + T(1)) + x(0);
(*v)(4) = x(4) * x(4) + x(5) * x(5) + x(0) * x(1);
// log(x^2 + 1) likewise stays in-domain for any real input.
(*v)(5) = log(x(2) * x(2) + T(1)) + x(3) * x(4);
}
};
// --- Dynamic-size functor (N inputs -> N outputs) ---
// Runtime-sized chained Rosenbrock residual: f0 = 1 - x0 and
// fi = 10*(xi - x_{i-1}^2) for i >= 1. inputs()/values() report the runtime
// dimension n_, as required for Dynamic-size functors.
struct DynamicFunctor {
typedef Matrix<double, Dynamic, 1> InputType;
typedef Matrix<double, Dynamic, 1> ValueType;
typedef Matrix<double, Dynamic, Dynamic> JacobianType;
// Problem dimension, fixed at construction.
const int n_;
DynamicFunctor(int n) : n_(n) {}
enum { InputsAtCompileTime = Dynamic, ValuesAtCompileTime = Dynamic };
int inputs() const { return n_; }
int values() const { return n_; }
template <typename T>
void operator()(const Matrix<T, Dynamic, 1>& x, Matrix<T, Dynamic, 1>* v) const {
// Output may arrive unsized; make it n_-dimensional before writing.
v->resize(n_);
(*v)(0) = T(1) - x(0);
for (int i = 1; i < n_; ++i) {
(*v)(i) = T(10) * (x(i) - x(i - 1) * x(i - 1));
}
}
};
// Wrappers for NumericalDiff compatibility: NumericalDiff expects a Scalar
// typedef, inputs()/values() accessors, and an int-returning operator()
// taking the value by reference. These adapters forward to the base
// functor's templated operator() and always report success (0).
struct SmallFunctorND : SmallFunctor {
typedef double Scalar;
int inputs() const { return 2; }
int values() const { return 2; }
int operator()(const InputType& x, ValueType& v) const {
SmallFunctor::operator()(x, &v);
return 0;
}
};
struct MediumFunctorND : MediumFunctor {
typedef double Scalar;
int inputs() const { return 6; }
int values() const { return 6; }
int operator()(const InputType& x, ValueType& v) const {
MediumFunctor::operator()(x, &v);
return 0;
}
};
// --- AutoDiff Jacobian benchmarks ---
// Times forward-mode AutoDiff evaluation of value + dense Jacobian at a
// random point.
template <typename Functor>
static void BM_AutoDiffJacobian(benchmark::State& state, Functor func) {
  AutoDiffJacobian<Functor> autodiff(func);
  const typename Functor::InputType input = Functor::InputType::Random();
  typename Functor::ValueType value;
  typename Functor::JacobianType jacobian;
  for (auto _ : state) {
    autodiff(input, &value, &jacobian);
    benchmark::DoNotOptimize(jacobian.data());
    benchmark::ClobberMemory();
  }
}
// --- Dynamic AutoDiff Jacobian ---
// Times AutoDiff Jacobian evaluation for the runtime-sized chained
// Rosenbrock functor at dimension n.
static void BM_AutoDiffJacobian_Dynamic(benchmark::State& state) {
  const int n = state.range(0);
  DynamicFunctor functor(n);
  AutoDiffJacobian<DynamicFunctor> autodiff(functor);
  const VectorXd input = VectorXd::Random(n);
  VectorXd value(n);
  MatrixXd jacobian(n, n);
  for (auto _ : state) {
    autodiff(input, &value, &jacobian);
    benchmark::DoNotOptimize(jacobian.data());
    benchmark::ClobberMemory();
  }
}
// --- NumericalDiff benchmarks ---
// Times finite-difference Jacobian computation via NumericalDiff at a random
// point, for comparison against the AutoDiff variants above.
template <typename Functor>
static void BM_NumericalDiffJacobian(benchmark::State& state, Functor func) {
  NumericalDiff<Functor> numdiff(func);
  const typename Functor::InputType input = Functor::InputType::Random();
  typename Functor::JacobianType jacobian;
  for (auto _ : state) {
    numdiff.df(input, jacobian);
    benchmark::DoNotOptimize(jacobian.data());
    benchmark::ClobberMemory();
  }
}
// --- Hand-coded Jacobian (Rosenbrock) for comparison ---
// Baseline: analytic Jacobian of the SmallFunctor residual.
// Row 0: d(1 - x0)        = [-1, 0]
// Row 1: d(10*(x1-x0^2))  = [-20*x0, 10]
static void BM_HandCoded_Small(benchmark::State& state) {
Vector2d x = Vector2d::Random();
Matrix2d jac;
for (auto _ : state) {
jac(0, 0) = -1;
jac(0, 1) = 0;
jac(1, 0) = -20 * x(0);
jac(1, 1) = 10;
benchmark::DoNotOptimize(jac.data());
benchmark::ClobberMemory();
}
}
// --- Scalar AutoDiff evaluation (no Jacobian, just forward pass) ---
// Measures scalar forward-mode AutoDiff overhead with no Jacobian assembly:
// accumulates sum_i (x_i^2 + sin(x_i)) while carrying an n-dimensional
// derivative vector through every operation. NOTE: the accumulator (and its
// derivative vector) is re-created each iteration, so per-iteration
// allocation is part of the measurement.
static void BM_AutoDiffScalar_Eval(benchmark::State& state) {
int n = state.range(0);
using ADScalar = AutoDiffScalar<VectorXd>;
VectorXd x = VectorXd::Random(n);
for (auto _ : state) {
ADScalar sum(0.0, VectorXd::Zero(n));
for (int i = 0; i < n; ++i) {
// xi carries value x(i) with unit derivative in direction i.
ADScalar xi(x(i), n, i);
sum += xi * xi + sin(xi);
}
benchmark::DoNotOptimize(sum.value());
benchmark::DoNotOptimize(sum.derivatives().data());
benchmark::ClobberMemory();
}
}
// Registrations: AutoDiff vs. NumericalDiff vs. hand-coded baseline.
BENCHMARK_CAPTURE(BM_AutoDiffJacobian, Small, SmallFunctor());
BENCHMARK_CAPTURE(BM_AutoDiffJacobian, Medium, MediumFunctor());
BENCHMARK(BM_AutoDiffJacobian_Dynamic)->Arg(2)->Arg(6)->Arg(20)->Arg(50)->Arg(100);
BENCHMARK_CAPTURE(BM_NumericalDiffJacobian, Small, SmallFunctorND());
BENCHMARK_CAPTURE(BM_NumericalDiffJacobian, Medium, MediumFunctorND());
BENCHMARK(BM_HandCoded_Small);
BENCHMARK(BM_AutoDiffScalar_Eval)->Arg(2)->Arg(6)->Arg(20)->Arg(50)->Arg(100);

View File

@@ -0,0 +1,35 @@
# Standalone build for benchmarks of Eigen's unsupported modules.
cmake_minimum_required(VERSION 3.10)
project(EigenUnsupportedBenchmarks CXX)
find_package(benchmark REQUIRED)
find_package(Threads REQUIRED)
# Root of the Eigen source tree (two levels up from this directory).
set(EIGEN_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../..")
# Helper: add a Google Benchmark target (mirrors benchmarks/CMakeLists.txt).
# eigen_add_benchmark(name source [LIBRARIES lib1 lib2 ...] [DEFINITIONS def1 def2 ...])
function(eigen_add_benchmark name source)
  cmake_parse_arguments(BENCH "" "" "LIBRARIES;DEFINITIONS" ${ARGN})
  if(NOT IS_ABSOLUTE "${source}")
    set(source "${CMAKE_CURRENT_SOURCE_DIR}/${source}")
  endif()
  add_executable(${name} ${source})
  target_include_directories(${name} PRIVATE ${EIGEN_SOURCE_DIR})
  target_link_libraries(${name} PRIVATE benchmark::benchmark benchmark::benchmark_main
                                        Threads::Threads)
  if(BENCH_LIBRARIES)
    target_link_libraries(${name} PRIVATE ${BENCH_LIBRARIES})
  endif()
  # NDEBUG is a preprocessor definition, not a compiler flag, so declare it as
  # one; -O3 is GCC/Clang-specific, so skip it for MSVC.
  target_compile_definitions(${name} PRIVATE NDEBUG)
  if(NOT MSVC)
    target_compile_options(${name} PRIVATE -O3)
  endif()
  if(BENCH_DEFINITIONS)
    target_compile_definitions(${name} PRIVATE ${BENCH_DEFINITIONS})
  endif()
endfunction()
# One subdirectory per benchmarked unsupported module.
add_subdirectory(Tensor)
add_subdirectory(MatrixFunctions)
add_subdirectory(SpecialFunctions)
add_subdirectory(AutoDiff)
add_subdirectory(Splines)
add_subdirectory(IterativeSolvers)
add_subdirectory(KroneckerProduct)

View File

@@ -0,0 +1 @@
eigen_add_benchmark(bench_iterative_solvers bench_iterative_solvers.cpp)

View File

@@ -0,0 +1,209 @@
// Benchmarks for unsupported iterative solvers: GMRES, MINRES, IDRS, IDRSTABL, BiCGSTABL, DGMRES.
#include <benchmark/benchmark.h>
#include <Eigen/Sparse>
#include <Eigen/IterativeLinearSolvers>
#include <unsupported/Eigen/IterativeSolvers>
using namespace Eigen;
typedef double Scalar;
typedef SparseMatrix<Scalar> SpMat;
typedef Matrix<Scalar, Dynamic, 1> Vec;
// Generate a SPD sparse matrix (Laplacian-like with diagonal dominance).
// Builds an n x n symmetric banded matrix (Laplacian-like): off-diagonal
// entries -1/(1+|i-j|) within `bandwidth` of the diagonal, and a diagonal set
// to the negated off-diagonal row sum plus 1, making the matrix strictly
// diagonally dominant and therefore symmetric positive definite.
static SpMat generateSPD(int n, int bandwidth) {
SpMat A(n, n);
std::vector<Triplet<Scalar>> trips;
// Upper bound on nonzeros: the full band plus the diagonal for each row.
trips.reserve(n * (2 * bandwidth + 1));
for (int i = 0; i < n; ++i) {
Scalar diag = 0;
for (int j = std::max(0, i - bandwidth); j < std::min(n, i + bandwidth + 1); ++j) {
if (i != j) {
Scalar val = -1.0 / (1 + std::abs(i - j));
trips.emplace_back(i, j, val);
diag -= val;
}
}
trips.emplace_back(i, i, diag + 1.0);
}
A.setFromTriplets(trips.begin(), trips.end());
return A;
}
// Generate a general (non-symmetric) sparse matrix.
// Builds an n x n non-symmetric banded matrix: upper-triangular band entries
// are scaled by 1.5 relative to the lower ones, and the diagonal is the sum
// of off-diagonal magnitudes plus 1 so the matrix stays strictly diagonally
// dominant (hence nonsingular).
static SpMat generateGeneral(int n, int bandwidth) {
SpMat A(n, n);
std::vector<Triplet<Scalar>> trips;
// Upper bound on nonzeros: the full band plus the diagonal for each row.
trips.reserve(n * (2 * bandwidth + 1));
for (int i = 0; i < n; ++i) {
Scalar diag = 0;
for (int j = std::max(0, i - bandwidth); j < std::min(n, i + bandwidth + 1); ++j) {
if (i != j) {
Scalar val = -0.5 / (1 + std::abs(i - j));
if (j > i) val *= 1.5; // asymmetry
trips.emplace_back(i, j, val);
diag += std::abs(val);
}
}
trips.emplace_back(i, i, diag + 1.0); // diagonal dominance
}
A.setFromTriplets(trips.begin(), trips.end());
return A;
}
// --- GMRES ---
static void BM_GMRES(benchmark::State& state) {
int n = state.range(0);
int bw = state.range(1);
SpMat A = generateGeneral(n, bw);
Vec b = Vec::Random(n);
GMRES<SpMat> solver;
solver.setMaxIterations(1000);
solver.setTolerance(1e-10);
solver.compute(A);
for (auto _ : state) {
Vec x = solver.solve(b);
benchmark::DoNotOptimize(x.data());
benchmark::ClobberMemory();
}
state.counters["iterations"] = solver.iterations();
}
// --- DGMRES ---
static void BM_DGMRES(benchmark::State& state) {
int n = state.range(0);
int bw = state.range(1);
SpMat A = generateGeneral(n, bw);
Vec b = Vec::Random(n);
DGMRES<SpMat> solver;
solver.setMaxIterations(1000);
solver.setTolerance(1e-10);
solver.compute(A);
for (auto _ : state) {
Vec x = solver.solve(b);
benchmark::DoNotOptimize(x.data());
benchmark::ClobberMemory();
}
state.counters["iterations"] = solver.iterations();
}
// --- MINRES (SPD matrices) ---
static void BM_MINRES(benchmark::State& state) {
int n = state.range(0);
int bw = state.range(1);
SpMat A = generateSPD(n, bw);
Vec b = Vec::Random(n);
MINRES<SpMat> solver;
solver.setMaxIterations(1000);
solver.setTolerance(1e-10);
solver.compute(A);
for (auto _ : state) {
Vec x = solver.solve(b);
benchmark::DoNotOptimize(x.data());
benchmark::ClobberMemory();
}
state.counters["iterations"] = solver.iterations();
}
// --- IDRS ---
static void BM_IDRS(benchmark::State& state) {
int n = state.range(0);
int bw = state.range(1);
SpMat A = generateGeneral(n, bw);
Vec b = Vec::Random(n);
IDRS<SpMat> solver;
solver.setMaxIterations(1000);
solver.setTolerance(1e-10);
solver.compute(A);
for (auto _ : state) {
Vec x = solver.solve(b);
benchmark::DoNotOptimize(x.data());
benchmark::ClobberMemory();
}
state.counters["iterations"] = solver.iterations();
}
// --- BiCGSTABL ---
static void BM_BiCGSTABL(benchmark::State& state) {
int n = state.range(0);
int bw = state.range(1);
SpMat A = generateGeneral(n, bw);
Vec b = Vec::Random(n);
BiCGSTABL<SpMat> solver;
solver.setMaxIterations(1000);
solver.setTolerance(1e-10);
solver.compute(A);
for (auto _ : state) {
Vec x = solver.solve(b);
benchmark::DoNotOptimize(x.data());
benchmark::ClobberMemory();
}
state.counters["iterations"] = solver.iterations();
}
// --- Compare with CG (supported module, SPD only) ---
static void BM_CG_Reference(benchmark::State& state) {
int n = state.range(0);
int bw = state.range(1);
SpMat A = generateSPD(n, bw);
Vec b = Vec::Random(n);
ConjugateGradient<SpMat> solver;
solver.setMaxIterations(1000);
solver.setTolerance(1e-10);
solver.compute(A);
for (auto _ : state) {
Vec x = solver.solve(b);
benchmark::DoNotOptimize(x.data());
benchmark::ClobberMemory();
}
state.counters["iterations"] = solver.iterations();
}
// --- Compare with BiCGSTAB (supported module, general) ---
static void BM_BiCGSTAB_Reference(benchmark::State& state) {
int n = state.range(0);
int bw = state.range(1);
SpMat A = generateGeneral(n, bw);
Vec b = Vec::Random(n);
BiCGSTAB<SpMat> solver;
solver.setMaxIterations(1000);
solver.setTolerance(1e-10);
solver.compute(A);
for (auto _ : state) {
Vec x = solver.solve(b);
benchmark::DoNotOptimize(x.data());
benchmark::ClobberMemory();
}
state.counters["iterations"] = solver.iterations();
}
// Size grid shared by every solver: {dimension, half-bandwidth} pairs.
static void SolverSizes(::benchmark::Benchmark* b) {
  const int dims[] = {1000, 10000, 100000};
  const int bands[] = {5, 20};
  for (int n : dims)
    for (int bw : bands) b->Args({n, bw});
}
// Register every solver (unsupported plus the two supported references) over
// the same size grid so results are directly comparable.
BENCHMARK(BM_GMRES)->Apply(SolverSizes);
BENCHMARK(BM_DGMRES)->Apply(SolverSizes);
BENCHMARK(BM_MINRES)->Apply(SolverSizes);
BENCHMARK(BM_IDRS)->Apply(SolverSizes);
BENCHMARK(BM_BiCGSTABL)->Apply(SolverSizes);
BENCHMARK(BM_CG_Reference)->Apply(SolverSizes);
BENCHMARK(BM_BiCGSTAB_Reference)->Apply(SolverSizes);

View File

@@ -0,0 +1 @@
eigen_add_benchmark(bench_kronecker bench_kronecker.cpp)

View File

@@ -0,0 +1,83 @@
// Benchmarks for Kronecker product (dense and sparse).
#include <benchmark/benchmark.h>
#include <Eigen/Core>
#include <Eigen/Sparse>
#include <unsupported/Eigen/KroneckerProduct>
using namespace Eigen;
typedef double Scalar;
typedef Matrix<Scalar, Dynamic, Dynamic> Mat;
typedef SparseMatrix<Scalar> SpMat;
// --- Dense Kronecker product ---
// Dense Kronecker product of an na x na and an nb x nb matrix; the result is
// (na*nb) x (na*nb) and is materialized each iteration via eval().
static void BM_KroneckerDense(benchmark::State& state) {
  const int na = state.range(0);
  const int nb = state.range(1);
  const Mat lhs = Mat::Random(na, na);
  const Mat rhs = Mat::Random(nb, nb);
  for (auto _ : state) {
    Mat prod = kroneckerProduct(lhs, rhs).eval();
    benchmark::DoNotOptimize(prod.data());
    benchmark::ClobberMemory();
  }
  // Reported as the output dimension (na*nb), not the element count.
  const int outSize = na * nb;
  state.counters["output_size"] = outSize;
}
// --- Sparse Kronecker product ---
static void BM_KroneckerSparse(benchmark::State& state) {
int na = state.range(0);
int nb = state.range(1);
// Create sparse identity-like matrices with some fill.
SpMat A(na, na);
SpMat B(nb, nb);
std::vector<Triplet<Scalar>> tripsA, tripsB;
for (int i = 0; i < na; ++i) {
tripsA.emplace_back(i, i, 2.0);
if (i + 1 < na) {
tripsA.emplace_back(i, i + 1, -1.0);
tripsA.emplace_back(i + 1, i, -1.0);
}
}
for (int i = 0; i < nb; ++i) {
tripsB.emplace_back(i, i, 2.0);
if (i + 1 < nb) {
tripsB.emplace_back(i, i + 1, -1.0);
tripsB.emplace_back(i + 1, i, -1.0);
}
}
A.setFromTriplets(tripsA.begin(), tripsA.end());
B.setFromTriplets(tripsB.begin(), tripsB.end());
for (auto _ : state) {
SpMat C = kroneckerProduct(A, B).eval();
benchmark::DoNotOptimize(C.valuePtr());
benchmark::ClobberMemory();
}
state.counters["output_size"] = na * nb;
}
// Dense factor sizes stay small: the dense output has (na*nb)^2 elements.
static void KroneckerSizes(::benchmark::Benchmark* b) {
  const int sizes[] = {4, 8, 16};
  for (int na : sizes)
    for (int nb : sizes) b->Args({na, nb});
}
// Sparse inputs are tridiagonal, so larger factors remain affordable.
static void KroneckerSparseSizes(::benchmark::Benchmark* b) {
  const int sizes[] = {16, 32, 64, 128};
  for (int na : sizes)
    for (int nb : sizes) b->Args({na, nb});
}
BENCHMARK(BM_KroneckerDense)->Apply(KroneckerSizes);
BENCHMARK(BM_KroneckerSparse)->Apply(KroneckerSparseSizes);

View File

@@ -0,0 +1,3 @@
eigen_add_benchmark(bench_matrix_exponential bench_matrix_exponential.cpp)
eigen_add_benchmark(bench_matrix_logarithm bench_matrix_logarithm.cpp)
eigen_add_benchmark(bench_matrix_power bench_matrix_power.cpp)

View File

@@ -0,0 +1,52 @@
// Benchmarks for matrix exponential.
// Critical for Sophus Lie group operations (SLAM, visual odometry).
#include <benchmark/benchmark.h>
#include <Eigen/Core>
#include <unsupported/Eigen/MatrixFunctions>
using namespace Eigen;
#ifndef SCALAR
#define SCALAR double
#endif
typedef SCALAR Scalar;
// Times the matrix exponential of an n x n dynamic-size matrix. Entries are
// scaled by 1/n to keep the spectral radius moderate.
static void BM_MatrixExp(benchmark::State& state) {
  const int n = state.range(0);
  typedef Matrix<Scalar, Dynamic, Dynamic> MatrixType;
  const MatrixType A = MatrixType::Random(n, n) / Scalar(n);
  MatrixType result(n, n);
  for (auto _ : state) {
    result = A.exp();
    benchmark::DoNotOptimize(result.data());
    benchmark::ClobberMemory();
  }
}
// Fixed-size specializations for Lie group sizes, letting Eigen use its
// compile-time-size code paths for the 2x2/3x3/4x4 cases.
template <int N>
static void BM_MatrixExp_Fixed(benchmark::State& state) {
typedef Matrix<Scalar, N, N> MatrixType;
// Scale by 1/N to keep entries (and the spectral radius) small.
MatrixType A = MatrixType::Random() / Scalar(N);
MatrixType result;
for (auto _ : state) {
result = A.exp();
benchmark::DoNotOptimize(result.data());
benchmark::ClobberMemory();
}
}
// Dynamic sizes: Lie groups (2,3,4) plus larger.
BENCHMARK(BM_MatrixExp)->Arg(2)->Arg(3)->Arg(4)->Arg(8)->Arg(16)->Arg(32)->Arg(64)->Arg(128);
// Fixed-size Lie group dimensions.
BENCHMARK(BM_MatrixExp_Fixed<2>);
BENCHMARK(BM_MatrixExp_Fixed<3>);
BENCHMARK(BM_MatrixExp_Fixed<4>);

View File

@@ -0,0 +1,51 @@
// Benchmarks for matrix logarithm.
// Inverse of matrix exponential, used for Lie group log maps.
#include <benchmark/benchmark.h>
#include <Eigen/Core>
#include <unsupported/Eigen/MatrixFunctions>
using namespace Eigen;
#ifndef SCALAR
#define SCALAR double
#endif
typedef SCALAR Scalar;
// Times the matrix logarithm of an n x n dynamic-size matrix.
static void BM_MatrixLog(benchmark::State& state) {
  int n = state.range(0);
  typedef Matrix<Scalar, Dynamic, Dynamic> MatrixType;
  // Build A = exp(S) for a small random S, so A lies in the principal branch
  // and log(A) is well-defined and stable. (The previous identity-plus-noise
  // initialization was dead code: it was immediately overwritten by this.)
  MatrixType A = (MatrixType::Random(n, n) / Scalar(n * 4)).exp();
  MatrixType result(n, n);
  for (auto _ : state) {
    result = A.log();
    benchmark::DoNotOptimize(result.data());
    benchmark::ClobberMemory();
  }
}
// Fixed-size variant for Lie group dimensions.
template <int N>
static void BM_MatrixLog_Fixed(benchmark::State& state) {
typedef Matrix<Scalar, N, N> MatrixType;
// Input is exp(S) for a small random S, keeping the log in the principal
// branch.
MatrixType A = (MatrixType::Random() / Scalar(N * 4)).exp();
MatrixType result;
for (auto _ : state) {
result = A.log();
benchmark::DoNotOptimize(result.data());
benchmark::ClobberMemory();
}
}
// Dynamic sizes, then fixed-size Lie group dimensions (2x2, 3x3, 4x4).
BENCHMARK(BM_MatrixLog)->Arg(2)->Arg(3)->Arg(4)->Arg(8)->Arg(16)->Arg(32)->Arg(64);
BENCHMARK(BM_MatrixLog_Fixed<2>);
BENCHMARK(BM_MatrixLog_Fixed<3>);
BENCHMARK(BM_MatrixLog_Fixed<4>);

View File

@@ -0,0 +1,99 @@
// Benchmarks for matrix power functions: sqrt, pow, cos, sin, cosh, sinh.
#include <benchmark/benchmark.h>
#include <Eigen/Core>
#include <unsupported/Eigen/MatrixFunctions>
using namespace Eigen;
typedef double Scalar;
typedef Matrix<Scalar, Dynamic, Dynamic> Mat;
static void BM_MatrixSqrt(benchmark::State& state) {
int n = state.range(0);
// SPD matrix has well-defined sqrt.
Mat tmp = Mat::Random(n, n);
Mat A = tmp * tmp.transpose() + Mat::Identity(n, n);
Mat result(n, n);
for (auto _ : state) {
result = A.sqrt();
benchmark::DoNotOptimize(result.data());
benchmark::ClobberMemory();
}
}
static void BM_MatrixPow(benchmark::State& state) {
int n = state.range(0);
Mat tmp = Mat::Random(n, n);
Mat A = tmp * tmp.transpose() + Mat::Identity(n, n);
Mat result(n, n);
Scalar p = 2.5;
for (auto _ : state) {
result = A.pow(p);
benchmark::DoNotOptimize(result.data());
benchmark::ClobberMemory();
}
}
static void BM_MatrixCos(benchmark::State& state) {
int n = state.range(0);
Mat A = Mat::Random(n, n) / Scalar(n);
Mat result(n, n);
for (auto _ : state) {
result = A.cos();
benchmark::DoNotOptimize(result.data());
benchmark::ClobberMemory();
}
}
static void BM_MatrixSin(benchmark::State& state) {
int n = state.range(0);
Mat A = Mat::Random(n, n) / Scalar(n);
Mat result(n, n);
for (auto _ : state) {
result = A.sin();
benchmark::DoNotOptimize(result.data());
benchmark::ClobberMemory();
}
}
static void BM_MatrixCosh(benchmark::State& state) {
int n = state.range(0);
Mat A = Mat::Random(n, n) / Scalar(n);
Mat result(n, n);
for (auto _ : state) {
result = A.cosh();
benchmark::DoNotOptimize(result.data());
benchmark::ClobberMemory();
}
}
static void BM_MatrixSinh(benchmark::State& state) {
int n = state.range(0);
Mat A = Mat::Random(n, n) / Scalar(n);
Mat result(n, n);
for (auto _ : state) {
result = A.sinh();
benchmark::DoNotOptimize(result.data());
benchmark::ClobberMemory();
}
}
// Matrix dimensions shared by all matrix-function benchmarks in this file.
static void MatPowerSizes(::benchmark::Benchmark* b) {
  const int sizes[] = {4, 8, 16, 32, 64};
  for (int n : sizes) b->Arg(n);
}
BENCHMARK(BM_MatrixSqrt)->Apply(MatPowerSizes);
BENCHMARK(BM_MatrixPow)->Apply(MatPowerSizes);
BENCHMARK(BM_MatrixCos)->Apply(MatPowerSizes);
BENCHMARK(BM_MatrixSin)->Apply(MatPowerSizes);
BENCHMARK(BM_MatrixCosh)->Apply(MatPowerSizes);
BENCHMARK(BM_MatrixSinh)->Apply(MatPowerSizes);

View File

@@ -0,0 +1 @@
eigen_add_benchmark(bench_special_functions bench_special_functions.cpp)

View File

@@ -0,0 +1,127 @@
// Benchmarks for special functions beyond what bench_cwise_math.cpp covers.
// Includes Bessel functions, two-argument functions (igamma, betainc),
// and additional functions (lgamma, digamma, zeta, polygamma).
#include <benchmark/benchmark.h>
#include <Eigen/Core>
#include <unsupported/Eigen/SpecialFunctions>
using namespace Eigen;
// Macro for unary special functions on arrays. Expands to a template
// `BM_<NAME><Scalar>` where:
//   EXPR  — array expression written in terms of the input array `a`;
//   LO/HI — inputs are mapped affinely from Random()'s [-1, 1] into [LO, HI].
// Reports element throughput and bytes processed (one Scalar read plus one
// Scalar write per element).
#define BENCH_SPECIAL_UNARY(NAME, EXPR, LO, HI) \
template <typename Scalar> \
static void BM_##NAME(benchmark::State& state) { \
const Index n = state.range(0); \
using Arr = Array<Scalar, Dynamic, 1>; \
Arr a = (Arr::Random(n) + Scalar(1)) * Scalar((double(HI) - double(LO)) / 2.0) + Scalar(LO); \
Arr b(n); \
for (auto _ : state) { \
b = EXPR; \
benchmark::DoNotOptimize(b.data()); \
} \
state.counters["Elements/s"] = benchmark::Counter(n, benchmark::Counter::kIsIterationInvariantRate); \
state.SetBytesProcessed(state.iterations() * n * sizeof(Scalar) * 2); \
}
// Macro for binary special functions on arrays.
// Expands to BM_<NAME>: `a` is sampled from [LO_A, HI_A] and `b` from
// [LO_B, HI_B] (same Random()-based mapping as the unary macro), then EXPR
// is evaluated element-wise once per iteration. Bytes processed counts two
// reads + one write per element.
#define BENCH_SPECIAL_BINARY(NAME, EXPR, LO_A, HI_A, LO_B, HI_B) \
  template <typename Scalar> \
  static void BM_##NAME(benchmark::State& state) { \
    const Index n = state.range(0); \
    using Arr = Array<Scalar, Dynamic, 1>; \
    Arr a = (Arr::Random(n) + Scalar(1)) * Scalar((double(HI_A) - double(LO_A)) / 2.0) + Scalar(LO_A); \
    Arr b = (Arr::Random(n) + Scalar(1)) * Scalar((double(HI_B) - double(LO_B)) / 2.0) + Scalar(LO_B); \
    Arr c(n); \
    for (auto _ : state) { \
      c = EXPR; \
      benchmark::DoNotOptimize(c.data()); \
    } \
    state.counters["Elements/s"] = benchmark::Counter(n, benchmark::Counter::kIsIterationInvariantRate); \
    state.SetBytesProcessed(state.iterations() * n * sizeof(Scalar) * 3); \
  }
// --- Unary special functions ---
// Domains chosen to keep each function in its valid/non-trivial range.
BENCH_SPECIAL_UNARY(Lgamma, Eigen::lgamma(a), 0.1, 20)
BENCH_SPECIAL_UNARY(Digamma, Eigen::digamma(a), 0.1, 20)
// --- Bessel functions (first kind) ---
// The e-suffixed variants are exponentially scaled, so larger arguments
// remain representable; hence their wider [0, 100] range.
BENCH_SPECIAL_UNARY(BesselI0, Eigen::bessel_i0(a), 0, 10)
BENCH_SPECIAL_UNARY(BesselI1, Eigen::bessel_i1(a), 0, 10)
BENCH_SPECIAL_UNARY(BesselI0e, Eigen::bessel_i0e(a), 0, 100)
BENCH_SPECIAL_UNARY(BesselI1e, Eigen::bessel_i1e(a), 0, 100)
BENCH_SPECIAL_UNARY(BesselJ0, Eigen::bessel_j0(a), 0, 20)
BENCH_SPECIAL_UNARY(BesselJ1, Eigen::bessel_j1(a), 0, 20)
// --- Bessel functions (second kind) ---
// Y and K diverge at 0, so the lower bound is 0.1.
BENCH_SPECIAL_UNARY(BesselY0, Eigen::bessel_y0(a), 0.1, 20)
BENCH_SPECIAL_UNARY(BesselY1, Eigen::bessel_y1(a), 0.1, 20)
BENCH_SPECIAL_UNARY(BesselK0, Eigen::bessel_k0(a), 0.1, 20)
BENCH_SPECIAL_UNARY(BesselK1, Eigen::bessel_k1(a), 0.1, 20)
BENCH_SPECIAL_UNARY(BesselK0e, Eigen::bessel_k0e(a), 0.1, 100)
BENCH_SPECIAL_UNARY(BesselK1e, Eigen::bessel_k1e(a), 0.1, 100)
// --- Two-argument functions ---
BENCH_SPECIAL_BINARY(Igamma, Eigen::igamma(a, b), 0.1, 10, 0.1, 10)
BENCH_SPECIAL_BINARY(Igammac, Eigen::igammac(a, b), 0.1, 10, 0.1, 10)
// zeta(a, b) requires a > 1 for convergence, hence the 1.1 lower bound.
BENCH_SPECIAL_BINARY(Zeta, Eigen::zeta(a, b), 1.1, 10, 0.1, 10)
// NOTE(review): polygamma's first argument is the derivative order, normally
// a non-negative integer; `a` here is sampled continuously in [1, 4], so
// non-integer orders may be evaluated (possibly hitting a NaN path) —
// confirm this is the intended workload.
BENCH_SPECIAL_BINARY(Polygamma, Eigen::polygamma(a, b), 1, 4, 0.1, 10)
// --- Ternary: betainc ---
// Benchmark the regularized incomplete beta function I_x(a, b).
template <typename Scalar>
static void BM_Betainc(benchmark::State& state) {
  const Index len = state.range(0);
  using Arr = Array<Scalar, Dynamic, 1>;
  // Shape parameters in [0.5, 5.5]; evaluation points in [0, 1].
  Arr alpha = (Arr::Random(len) + Scalar(1)) * Scalar(2.5) + Scalar(0.5);
  Arr beta = (Arr::Random(len) + Scalar(1)) * Scalar(2.5) + Scalar(0.5);
  Arr xs = (Arr::Random(len) + Scalar(1)) * Scalar(0.5);
  Arr out(len);
  for (auto _ : state) {
    out = Eigen::betainc(alpha, beta, xs);
    benchmark::DoNotOptimize(out.data());
  }
  state.counters["Elements/s"] = benchmark::Counter(len, benchmark::Counter::kIsIterationInvariantRate);
  // Three reads + one write per element.
  state.SetBytesProcessed(state.iterations() * len * sizeof(Scalar) * 4);
}
// Array lengths spanning cache-resident to memory-bound workloads.
static void SpecialSizes(::benchmark::Benchmark* b) {
  static const int kLengths[] = {256, 4096, 65536, 1048576};
  for (int len : kLengths) b->Arg(len);
}
// --- Register float ---
BENCHMARK(BM_Lgamma<float>)->Apply(SpecialSizes)->Name("Lgamma_float");
BENCHMARK(BM_Digamma<float>)->Apply(SpecialSizes)->Name("Digamma_float");
BENCHMARK(BM_BesselI0<float>)->Apply(SpecialSizes)->Name("BesselI0_float");
BENCHMARK(BM_BesselI1<float>)->Apply(SpecialSizes)->Name("BesselI1_float");
BENCHMARK(BM_BesselI0e<float>)->Apply(SpecialSizes)->Name("BesselI0e_float");
BENCHMARK(BM_BesselI1e<float>)->Apply(SpecialSizes)->Name("BesselI1e_float");
BENCHMARK(BM_BesselJ0<float>)->Apply(SpecialSizes)->Name("BesselJ0_float");
BENCHMARK(BM_BesselJ1<float>)->Apply(SpecialSizes)->Name("BesselJ1_float");
BENCHMARK(BM_BesselY0<float>)->Apply(SpecialSizes)->Name("BesselY0_float");
BENCHMARK(BM_BesselY1<float>)->Apply(SpecialSizes)->Name("BesselY1_float");
BENCHMARK(BM_BesselK0<float>)->Apply(SpecialSizes)->Name("BesselK0_float");
BENCHMARK(BM_BesselK1<float>)->Apply(SpecialSizes)->Name("BesselK1_float");
BENCHMARK(BM_BesselK0e<float>)->Apply(SpecialSizes)->Name("BesselK0e_float");
BENCHMARK(BM_BesselK1e<float>)->Apply(SpecialSizes)->Name("BesselK1e_float");
BENCHMARK(BM_Igamma<float>)->Apply(SpecialSizes)->Name("Igamma_float");
BENCHMARK(BM_Igammac<float>)->Apply(SpecialSizes)->Name("Igammac_float");
BENCHMARK(BM_Betainc<float>)->Apply(SpecialSizes)->Name("Betainc_float");
BENCHMARK(BM_Zeta<float>)->Apply(SpecialSizes)->Name("Zeta_float");
BENCHMARK(BM_Polygamma<float>)->Apply(SpecialSizes)->Name("Polygamma_float");
// --- Register double ---
// NOTE(review): this list omits the exponentially-scaled Bessel variants
// (I0e/I1e/K0e/K1e) that the float list registers — confirm whether the
// omission is intentional or whether they are registered elsewhere.
BENCHMARK(BM_Lgamma<double>)->Apply(SpecialSizes)->Name("Lgamma_double");
BENCHMARK(BM_Digamma<double>)->Apply(SpecialSizes)->Name("Digamma_double");
BENCHMARK(BM_BesselI0<double>)->Apply(SpecialSizes)->Name("BesselI0_double");
BENCHMARK(BM_BesselI1<double>)->Apply(SpecialSizes)->Name("BesselI1_double");
BENCHMARK(BM_BesselJ0<double>)->Apply(SpecialSizes)->Name("BesselJ0_double");
BENCHMARK(BM_BesselJ1<double>)->Apply(SpecialSizes)->Name("BesselJ1_double");
BENCHMARK(BM_BesselY0<double>)->Apply(SpecialSizes)->Name("BesselY0_double");
BENCHMARK(BM_BesselY1<double>)->Apply(SpecialSizes)->Name("BesselY1_double");
BENCHMARK(BM_BesselK0<double>)->Apply(SpecialSizes)->Name("BesselK0_double");
BENCHMARK(BM_BesselK1<double>)->Apply(SpecialSizes)->Name("BesselK1_double");
BENCHMARK(BM_Igamma<double>)->Apply(SpecialSizes)->Name("Igamma_double");
BENCHMARK(BM_Igammac<double>)->Apply(SpecialSizes)->Name("Igammac_double");
BENCHMARK(BM_Betainc<double>)->Apply(SpecialSizes)->Name("Betainc_double");
BENCHMARK(BM_Zeta<double>)->Apply(SpecialSizes)->Name("Zeta_double");
BENCHMARK(BM_Polygamma<double>)->Apply(SpecialSizes)->Name("Polygamma_double");

View File

@@ -0,0 +1 @@
# Spline-module benchmarks (fitting, evaluation, derivatives).
eigen_add_benchmark(bench_splines bench_splines.cpp)

View File

@@ -0,0 +1,98 @@
// Benchmarks for Eigen Spline module.
// Tests fitting, evaluation, and derivative computation.
#include <benchmark/benchmark.h>
#include <Eigen/Core>
#include <unsupported/Eigen/Splines>
using namespace Eigen;
typedef double Scalar;
// --- Spline fitting (interpolation) ---
// Benchmark spline interpolation (knot selection + control-point solve).
template <int Dim, int Degree>
static void BM_SplineFit(benchmark::State& state) {
  const int num_pts = state.range(0);
  typedef Spline<Scalar, Dim> SplineType;
  // Random points to interpolate through.
  Matrix<Scalar, Dim, Dynamic> points(Dim, num_pts);
  points.setRandom();
  for (auto _ : state) {
    SplineType spline = SplineFitting<SplineType>::Interpolate(points, Degree);
    benchmark::DoNotOptimize(spline.knots().data());
    benchmark::ClobberMemory();
  }
}
// --- Spline evaluation ---
// Benchmark point evaluation of a pre-fitted spline at many parameters.
template <int Dim, int Degree>
static void BM_SplineEval(benchmark::State& state) {
  const int num_pts = state.range(0);  // control points used for the fit
  const int kNumEval = 1000;           // evaluation sites per iteration
  typedef Spline<Scalar, Dim> SplineType;
  Matrix<Scalar, Dim, Dynamic> points(Dim, num_pts);
  points.setRandom();
  SplineType spline = SplineFitting<SplineType>::Interpolate(points, Degree);
  // Evaluation parameters uniformly spaced over [0, 1].
  VectorXd params = VectorXd::LinSpaced(kNumEval, 0, 1);
  for (auto _ : state) {
    for (int k = 0; k < kNumEval; ++k) {
      auto pt = spline(params(k));
      benchmark::DoNotOptimize(pt.data());
    }
    benchmark::ClobberMemory();
  }
  state.counters["Evals/s"] = benchmark::Counter(kNumEval, benchmark::Counter::kIsIterationInvariantRate);
}
// --- Spline derivative evaluation ---
// Benchmark first-derivative evaluation of a pre-fitted spline.
template <int Dim, int Degree>
static void BM_SplineDerivatives(benchmark::State& state) {
  const int num_pts = state.range(0);
  const int kNumEval = 1000;
  typedef Spline<Scalar, Dim> SplineType;
  Matrix<Scalar, Dim, Dynamic> points(Dim, num_pts);
  points.setRandom();
  SplineType spline = SplineFitting<SplineType>::Interpolate(points, Degree);
  VectorXd params = VectorXd::LinSpaced(kNumEval, 0, 1);
  for (auto _ : state) {
    for (int k = 0; k < kNumEval; ++k) {
      // Order 1: value plus first derivative.
      auto derivs = spline.derivatives(params(k), 1);
      benchmark::DoNotOptimize(derivs.data());
    }
    benchmark::ClobberMemory();
  }
  state.counters["Evals/s"] = benchmark::Counter(kNumEval, benchmark::Counter::kIsIterationInvariantRate);
}
// Control-point counts for all spline benchmarks.
static void SplineSizes(::benchmark::Benchmark* b) {
  static const int kCounts[] = {10, 50, 200, 1000};
  for (int n : kCounts) b->Arg(n);
}
// 2D cubic splines
BENCHMARK(BM_SplineFit<2, 3>)->Apply(SplineSizes)->Name("SplineFit_2D_Cubic");
BENCHMARK(BM_SplineEval<2, 3>)->Apply(SplineSizes)->Name("SplineEval_2D_Cubic");
BENCHMARK(BM_SplineDerivatives<2, 3>)->Apply(SplineSizes)->Name("SplineDerivatives_2D_Cubic");
// 3D cubic splines
BENCHMARK(BM_SplineFit<3, 3>)->Apply(SplineSizes)->Name("SplineFit_3D_Cubic");
BENCHMARK(BM_SplineEval<3, 3>)->Apply(SplineSizes)->Name("SplineEval_3D_Cubic");
BENCHMARK(BM_SplineDerivatives<3, 3>)->Apply(SplineSizes)->Name("SplineDerivatives_3D_Cubic");
// 2D quintic splines
// NOTE(review): no SplineDerivatives registration for the quintic case is
// visible here — confirm whether it is registered below or omitted.
BENCHMARK(BM_SplineFit<2, 5>)->Apply(SplineSizes)->Name("SplineFit_2D_Quintic");
BENCHMARK(BM_SplineEval<2, 5>)->Apply(SplineSizes)->Name("SplineEval_2D_Quintic");

View File

@@ -0,0 +1,8 @@
# Tensor-module benchmarks; each target exercises one Tensor feature area.
eigen_add_benchmark(bench_contraction bench_contraction.cpp)
eigen_add_benchmark(bench_convolution bench_convolution.cpp)
eigen_add_benchmark(bench_reduction bench_reduction.cpp)
eigen_add_benchmark(bench_broadcasting bench_broadcasting.cpp)
eigen_add_benchmark(bench_shuffling bench_shuffling.cpp)
eigen_add_benchmark(bench_tensor_fft bench_tensor_fft.cpp)
eigen_add_benchmark(bench_morphing bench_morphing.cpp)
eigen_add_benchmark(bench_coefficient_wise bench_coefficient_wise.cpp)

View File

@@ -0,0 +1,111 @@
// Benchmarks for Eigen Tensor broadcasting.
// Tests broadcasting along various dimensions and ranks.
#include <benchmark/benchmark.h>
#include <unsupported/Eigen/CXX11/Tensor>
using namespace Eigen;
typedef float Scalar;
// --- Broadcast row vector {1,N} -> {M,N} ---
// Materialize a {1,N} row broadcast into an {M,N} result.
static void BM_BroadcastRow(benchmark::State& state) {
  const int rows = state.range(0);
  const int cols = state.range(1);
  Tensor<Scalar, 2> src(1, cols);
  src.setRandom();
  Tensor<Scalar, 2> dst(rows, cols);
  // Replicate the single row `rows` times along dimension 0.
  Eigen::array<int, 2> factors = {rows, 1};
  for (auto _ : state) {
    dst = src.broadcast(factors);
    benchmark::DoNotOptimize(dst.data());
    benchmark::ClobberMemory();
  }
  state.SetBytesProcessed(state.iterations() * rows * cols * sizeof(Scalar));
}
// --- Broadcast col vector {M,1} -> {M,N} ---
// Materialize an {M,1} column broadcast into an {M,N} result.
static void BM_BroadcastCol(benchmark::State& state) {
  const int rows = state.range(0);
  const int cols = state.range(1);
  Tensor<Scalar, 2> src(rows, 1);
  src.setRandom();
  Tensor<Scalar, 2> dst(rows, cols);
  // Replicate the single column `cols` times along dimension 1.
  Eigen::array<int, 2> factors = {1, cols};
  for (auto _ : state) {
    dst = src.broadcast(factors);
    benchmark::DoNotOptimize(dst.data());
    benchmark::ClobberMemory();
  }
  state.SetBytesProcessed(state.iterations() * rows * cols * sizeof(Scalar));
}
// --- Broadcast + element-wise add (bias addition pattern) ---
// Bias-addition pattern: matrix + broadcast row bias (common in ML layers).
static void BM_BroadcastAdd(benchmark::State& state) {
  const int rows = state.range(0);
  const int cols = state.range(1);
  Tensor<Scalar, 2> input(rows, cols);
  input.setRandom();
  Tensor<Scalar, 2> row_bias(1, cols);
  row_bias.setRandom();
  Tensor<Scalar, 2> dst(rows, cols);
  Eigen::array<int, 2> factors = {rows, 1};
  for (auto _ : state) {
    dst = input + row_bias.broadcast(factors);
    benchmark::DoNotOptimize(dst.data());
    benchmark::ClobberMemory();
  }
  // Counts the full-size read and write; the bias read is negligible.
  state.SetBytesProcessed(state.iterations() * rows * cols * sizeof(Scalar) * 2);
}
// --- Rank-4 broadcast (batch x channels x 1 x 1) -> (batch x channels x H x W) ---
// Rank-4 broadcast: per-channel bias (batch, C, 1, 1) -> (batch, C, H, H).
static void BM_BroadcastRank4(benchmark::State& state) {
  const int batches = state.range(0);
  const int channels = state.range(1);
  const int hw = state.range(2);
  Tensor<Scalar, 4> src(batches, channels, 1, 1);
  src.setRandom();
  Tensor<Scalar, 4> dst(batches, channels, hw, hw);
  // Replicate only along the two trailing (spatial) dimensions.
  Eigen::array<int, 4> factors = {1, 1, hw, hw};
  for (auto _ : state) {
    dst = src.broadcast(factors);
    benchmark::DoNotOptimize(dst.data());
    benchmark::ClobberMemory();
  }
  state.SetBytesProcessed(state.iterations() * batches * channels * hw * hw * sizeof(Scalar));
}
// Full grid of {rows, cols} combinations for the rank-2 broadcasts.
static void BroadcastSizes(::benchmark::Benchmark* b) {
  static const int kDims[] = {64, 256, 1024};
  for (int rows : kDims)
    for (int cols : kDims) b->Args({rows, cols});
}
// (batch, channels, spatial) combinations for the rank-4 broadcast.
static void Rank4Sizes(::benchmark::Benchmark* b) {
  for (int batch : {1, 8})
    for (int channels : {64, 256})
      for (int hw : {16, 32}) b->Args({batch, channels, hw});
}
// Register broadcasting benchmarks over the shared size grids.
BENCHMARK(BM_BroadcastRow)->Apply(BroadcastSizes);
BENCHMARK(BM_BroadcastCol)->Apply(BroadcastSizes);
BENCHMARK(BM_BroadcastAdd)->Apply(BroadcastSizes);
BENCHMARK(BM_BroadcastRank4)->Apply(Rank4Sizes);

View File

@@ -0,0 +1,131 @@
// Benchmarks for Eigen Tensor coefficient-wise operations.
// Covers activation functions, normalization, and element-wise arithmetic.
#include <benchmark/benchmark.h>
#include <unsupported/Eigen/CXX11/Tensor>
using namespace Eigen;
typedef float Scalar;
// Macro to define a benchmark for a unary tensor operation.
// Expands to BM_<NAME>, which applies EXPR to a random MxN rank-2 tensor
// `a`, writing the result into `b` each iteration. Bytes processed counts
// one read + one write per element.
#define BENCH_TENSOR_UNARY(NAME, EXPR) \
  static void BM_##NAME(benchmark::State& state) { \
    const int M = state.range(0); \
    const int N = state.range(1); \
    Tensor<Scalar, 2> a(M, N); \
    a.setRandom(); \
    Tensor<Scalar, 2> b(M, N); \
    for (auto _ : state) { \
      b = EXPR; \
      benchmark::DoNotOptimize(b.data()); \
      benchmark::ClobberMemory(); \
    } \
    state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar) * 2); \
  }
// Instantiate the unary benchmarks: activations and basic transcendentals.
// Log and Sqrt apply abs() first so inputs stay in the valid domain.
BENCH_TENSOR_UNARY(Exp, a.exp())
BENCH_TENSOR_UNARY(Log, a.abs().log())
BENCH_TENSOR_UNARY(Tanh, a.tanh())
BENCH_TENSOR_UNARY(Sigmoid, a.sigmoid())
BENCH_TENSOR_UNARY(ReLU, a.cwiseMax(Scalar(0)))
BENCH_TENSOR_UNARY(Sqrt, a.abs().sqrt())
// --- Element-wise binary operations ---
// Element-wise addition of two rank-2 tensors.
static void BM_Add(benchmark::State& state) {
  const int rows = state.range(0);
  const int cols = state.range(1);
  Tensor<Scalar, 2> lhs(rows, cols);
  lhs.setRandom();
  Tensor<Scalar, 2> rhs(rows, cols);
  rhs.setRandom();
  Tensor<Scalar, 2> out(rows, cols);
  for (auto _ : state) {
    out = lhs + rhs;
    benchmark::DoNotOptimize(out.data());
    benchmark::ClobberMemory();
  }
  // Two reads + one write per element.
  state.SetBytesProcessed(state.iterations() * rows * cols * sizeof(Scalar) * 3);
}
// Element-wise (Hadamard) product of two rank-2 tensors.
static void BM_Mul(benchmark::State& state) {
  const int rows = state.range(0);
  const int cols = state.range(1);
  Tensor<Scalar, 2> lhs(rows, cols);
  lhs.setRandom();
  Tensor<Scalar, 2> rhs(rows, cols);
  rhs.setRandom();
  Tensor<Scalar, 2> out(rows, cols);
  for (auto _ : state) {
    out = lhs * rhs;
    benchmark::DoNotOptimize(out.data());
    benchmark::ClobberMemory();
  }
  // Two reads + one write per element.
  state.SetBytesProcessed(state.iterations() * rows * cols * sizeof(Scalar) * 3);
}
// --- Fused multiply-add ---
// Fused multiply-add pattern: d = a * b + c (element-wise).
static void BM_FMA(benchmark::State& state) {
  const int rows = state.range(0);
  const int cols = state.range(1);
  Tensor<Scalar, 2> x(rows, cols);
  x.setRandom();
  Tensor<Scalar, 2> y(rows, cols);
  y.setRandom();
  Tensor<Scalar, 2> z(rows, cols);
  z.setRandom();
  Tensor<Scalar, 2> out(rows, cols);
  for (auto _ : state) {
    out = x * y + z;
    benchmark::DoNotOptimize(out.data());
    benchmark::ClobberMemory();
  }
  // Three reads + one write per element.
  state.SetBytesProcessed(state.iterations() * rows * cols * sizeof(Scalar) * 4);
}
// --- Rank-4 coefficient-wise (CNN feature maps) ---
// ReLU over a rank-4 tensor (CNN feature-map shape: batch x C x H x H).
static void BM_ReLU_Rank4(benchmark::State& state) {
  const int batches = state.range(0);
  const int channels = state.range(1);
  const int hw = state.range(2);
  Tensor<Scalar, 4> src(batches, channels, hw, hw);
  src.setRandom();
  Tensor<Scalar, 4> dst(batches, channels, hw, hw);
  for (auto _ : state) {
    dst = src.cwiseMax(Scalar(0));
    benchmark::DoNotOptimize(dst.data());
    benchmark::ClobberMemory();
  }
  state.SetBytesProcessed(state.iterations() * batches * channels * hw * hw * sizeof(Scalar) * 2);
}
// Square rank-2 sizes for the coefficient-wise benchmarks.
static void CwiseSizes(::benchmark::Benchmark* b) {
  for (int dim : {256, 1024}) b->Args({dim, dim});
}
// (batch, channels, spatial) triples representative of CNN feature maps.
static void Rank4Sizes(::benchmark::Benchmark* b) {
  static const int64_t kCombos[3][3] = {{32, 64, 16}, {8, 128, 32}, {1, 256, 64}};
  for (const auto& c : kCombos) b->Args({c[0], c[1], c[2]});
}
// Register coefficient-wise benchmarks over the shared size grids.
BENCHMARK(BM_Exp)->Apply(CwiseSizes);
BENCHMARK(BM_Log)->Apply(CwiseSizes);
BENCHMARK(BM_Tanh)->Apply(CwiseSizes);
BENCHMARK(BM_Sigmoid)->Apply(CwiseSizes);
BENCHMARK(BM_ReLU)->Apply(CwiseSizes);
BENCHMARK(BM_Sqrt)->Apply(CwiseSizes);
BENCHMARK(BM_Add)->Apply(CwiseSizes);
BENCHMARK(BM_Mul)->Apply(CwiseSizes);
BENCHMARK(BM_FMA)->Apply(CwiseSizes);
BENCHMARK(BM_ReLU_Rank4)->Apply(Rank4Sizes);

View File

@@ -0,0 +1,148 @@
// Benchmarks for Eigen Tensor contraction (generalized GEMM).
// Tests single-threaded (DefaultDevice) and multi-threaded (ThreadPoolDevice) variants.
#define EIGEN_USE_THREADS
#include <benchmark/benchmark.h>
#include <unsupported/Eigen/CXX11/Tensor>
#include <unsupported/Eigen/CXX11/ThreadPool>
using namespace Eigen;
#ifndef SCALAR
#define SCALAR float
#endif
typedef SCALAR Scalar;
// --- DefaultDevice contraction (rank-2, equivalent to matrix multiply) ---
// Rank-2 tensor contraction (equivalent to GEMM) on the DefaultDevice.
static void BM_Contraction(benchmark::State& state) {
  const int m = state.range(0);
  const int n = state.range(1);
  const int k = state.range(2);
  Tensor<Scalar, 2> lhs(m, k);
  lhs.setRandom();
  Tensor<Scalar, 2> rhs(k, n);
  rhs.setRandom();
  Tensor<Scalar, 2> out(m, n);
  using ContractDims = Tensor<Scalar, 2>::DimensionPair;
  // Contract lhs dim 1 against rhs dim 0: the standard matrix product.
  Eigen::array<ContractDims, 1> pairs = {ContractDims(1, 0)};
  for (auto _ : state) {
    out = lhs.contract(rhs, pairs);
    benchmark::DoNotOptimize(out.data());
    benchmark::ClobberMemory();
  }
  state.counters["GFLOPS"] =
      benchmark::Counter(2.0 * m * n * k, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
}
// --- ThreadPoolDevice contraction ---
// Rank-2 contraction dispatched through a ThreadPoolDevice.
static void BM_Contraction_ThreadPool(benchmark::State& state) {
  const int m = state.range(0);
  const int n = state.range(1);
  const int k = state.range(2);
  const int nthreads = state.range(3);
  Tensor<Scalar, 2> lhs(m, k);
  lhs.setRandom();
  Tensor<Scalar, 2> rhs(k, n);
  rhs.setRandom();
  Tensor<Scalar, 2> out(m, n);
  ThreadPool pool(nthreads);
  ThreadPoolDevice device(&pool, nthreads);
  using ContractDims = Tensor<Scalar, 2>::DimensionPair;
  Eigen::array<ContractDims, 1> pairs = {ContractDims(1, 0)};
  for (auto _ : state) {
    out.device(device) = lhs.contract(rhs, pairs);
    benchmark::DoNotOptimize(out.data());
    benchmark::ClobberMemory();
  }
  state.counters["GFLOPS"] =
      benchmark::Counter(2.0 * m * n * k, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
  state.counters["threads"] = nthreads;
}
// --- Rank-3 batch contraction ---
// Batched matrix multiply: for each batch index i, C_i = A_i * B_i.
// A single contract() over pair (2,1) is NOT a batch product: contracting
// two rank-3 tensors over one dimension pair yields a rank-4 result
// (batch, M, batch, N), which cannot be assigned to the rank-3 output.
// Contract each batch slice (extracted with chip) instead.
static void BM_BatchContraction(benchmark::State& state) {
  const int batch = state.range(0);
  const int M = state.range(1);
  const int N = state.range(2);
  const int K = state.range(3);
  Tensor<Scalar, 3> A(batch, M, K);
  Tensor<Scalar, 3> B(batch, K, N);
  Tensor<Scalar, 3> C(batch, M, N);
  A.setRandom();
  B.setRandom();
  using ContractDims = Tensor<Scalar, 2>::DimensionPair;
  Eigen::array<ContractDims, 1> contract_dims = {ContractDims(1, 0)};
  for (auto _ : state) {
    for (int i = 0; i < batch; ++i) {
      // chip(i, 0) yields the rank-2 (M,K), (K,N) and (M,N) slices of batch i.
      C.chip(i, 0) = A.chip(i, 0).contract(B.chip(i, 0), contract_dims);
    }
    benchmark::DoNotOptimize(C.data());
    benchmark::ClobberMemory();
  }
  state.counters["GFLOPS"] = benchmark::Counter(2.0 * batch * M * N * K, benchmark::Counter::kIsIterationInvariantRate,
                                                benchmark::Counter::kIs1000);
}
// --- RowMajor contraction ---
// Rank-2 contraction with RowMajor tensor layout.
static void BM_Contraction_RowMajor(benchmark::State& state) {
  const int m = state.range(0);
  const int n = state.range(1);
  const int k = state.range(2);
  Tensor<Scalar, 2, RowMajor> lhs(m, k);
  lhs.setRandom();
  Tensor<Scalar, 2, RowMajor> rhs(k, n);
  rhs.setRandom();
  Tensor<Scalar, 2, RowMajor> out(m, n);
  using ContractDims = Tensor<Scalar, 2, RowMajor>::DimensionPair;
  Eigen::array<ContractDims, 1> pairs = {ContractDims(1, 0)};
  for (auto _ : state) {
    out = lhs.contract(rhs, pairs);
    benchmark::DoNotOptimize(out.data());
    benchmark::ClobberMemory();
  }
  state.counters["GFLOPS"] =
      benchmark::Counter(2.0 * m * n * k, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
}
// Square {M,N,K} triples plus two non-square shapes.
static void ContractionSizes(::benchmark::Benchmark* b) {
  static const int kSquare[] = {32, 64, 128, 256, 512, 1024};
  for (int s : kSquare) b->Args({s, s, s});
  // Non-square
  b->Args({256, 256, 1024});
  b->Args({1024, 64, 64});
}
// {M,N,K,threads} grid for the thread-pool contraction.
static void ThreadPoolSizes(::benchmark::Benchmark* b) {
  for (int s : {64, 256, 512, 1024})
    for (int nthreads : {2, 4, 8}) b->Args({s, s, s, nthreads});
}
// {batch,M,N,K} grid for the batched contraction.
static void BatchSizes(::benchmark::Benchmark* b) {
  for (int nbatch : {1, 8, 32})
    for (int s : {64, 256}) b->Args({nbatch, s, s, s});
}
// Register contraction benchmarks over the shared size grids.
BENCHMARK(BM_Contraction)->Apply(ContractionSizes);
BENCHMARK(BM_Contraction_RowMajor)->Apply(ContractionSizes);
BENCHMARK(BM_Contraction_ThreadPool)->Apply(ThreadPoolSizes);
BENCHMARK(BM_BatchContraction)->Apply(BatchSizes);

View File

@@ -0,0 +1,151 @@
// Benchmarks for Eigen Tensor convolution (1D and 2D).
#define EIGEN_USE_THREADS
#include <benchmark/benchmark.h>
#include <unsupported/Eigen/CXX11/Tensor>
#include <unsupported/Eigen/CXX11/ThreadPool>
using namespace Eigen;
typedef float Scalar;
// --- 1D convolution ---
// 1D valid convolution; output length = input_size - kernel_size + 1.
static void BM_Convolve1D(benchmark::State& state) {
  const int len = state.range(0);
  const int klen = state.range(1);
  Tensor<Scalar, 1> signal(len);
  signal.setRandom();
  Tensor<Scalar, 1> filter(klen);
  filter.setRandom();
  Eigen::array<int, 1> conv_dims = {0};
  for (auto _ : state) {
    Tensor<Scalar, 1> out = signal.convolve(filter, conv_dims);
    benchmark::DoNotOptimize(out.data());
    benchmark::ClobberMemory();
  }
  // One multiply-add per kernel tap per output sample.
  double flops = 2.0 * (len - klen + 1) * klen;
  state.counters["GFLOPS"] =
      benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
}
// --- 2D convolution ---
// 2D valid convolution over both dimensions of a rank-2 tensor.
static void BM_Convolve2D(benchmark::State& state) {
  const int rows = state.range(0);
  const int cols = state.range(1);
  const int krows = state.range(2);
  const int kcols = state.range(3);
  Tensor<Scalar, 2> image(rows, cols);
  image.setRandom();
  Tensor<Scalar, 2> filter(krows, kcols);
  filter.setRandom();
  Eigen::array<int, 2> conv_dims = {0, 1};
  for (auto _ : state) {
    Tensor<Scalar, 2> out = image.convolve(filter, conv_dims);
    benchmark::DoNotOptimize(out.data());
    benchmark::ClobberMemory();
  }
  // One multiply-add per kernel tap per output pixel.
  double flops = 2.0 * (rows - krows + 1) * (cols - kcols + 1) * krows * kcols;
  state.counters["GFLOPS"] =
      benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
}
// --- 2D convolution with channels (rank-3: C x H x W, convolve on H,W) ---
// Rank-3 input (C x H x W): the same 2D kernel convolves each channel's
// spatial dimensions (dims 1 and 2).
static void BM_Convolve2D_Channels(benchmark::State& state) {
  const int channels = state.range(0);
  const int hw = state.range(1);
  const int k = state.range(2);
  Tensor<Scalar, 3> image(channels, hw, hw);
  image.setRandom();
  Tensor<Scalar, 2> filter(k, k);
  filter.setRandom();
  Eigen::array<int, 2> conv_dims = {1, 2};
  for (auto _ : state) {
    Tensor<Scalar, 3> out = image.convolve(filter, conv_dims);
    benchmark::DoNotOptimize(out.data());
    benchmark::ClobberMemory();
  }
  const int out_hw = hw - k + 1;
  double flops = 2.0 * channels * out_hw * out_hw * k * k;
  state.counters["GFLOPS"] =
      benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
}
// --- 2D convolution with ThreadPool ---
// Square 2D convolution dispatched through a ThreadPoolDevice, writing into
// a pre-sized output so allocation is excluded from the timed region.
static void BM_Convolve2D_ThreadPool(benchmark::State& state) {
  const int hw = state.range(0);
  const int k = state.range(1);
  const int nthreads = state.range(2);
  Tensor<Scalar, 2> image(hw, hw);
  image.setRandom();
  Tensor<Scalar, 2> filter(k, k);
  filter.setRandom();
  Tensor<Scalar, 2> out(hw - k + 1, hw - k + 1);
  ThreadPool pool(nthreads);
  ThreadPoolDevice device(&pool, nthreads);
  Eigen::array<int, 2> conv_dims = {0, 1};
  for (auto _ : state) {
    out.device(device) = image.convolve(filter, conv_dims);
    benchmark::DoNotOptimize(out.data());
    benchmark::ClobberMemory();
  }
  const int out_hw = hw - k + 1;
  double flops = 2.0 * out_hw * out_hw * k * k;
  state.counters["GFLOPS"] =
      benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
  state.counters["threads"] = nthreads;
}
// {input_size, kernel_size} pairs for the 1D convolution.
static void Conv1DSizes(::benchmark::Benchmark* b) {
  for (int input : {128, 512, 2048}) {
    for (int kernel : {3, 5, 11}) {
      b->Args({input, kernel});
    }
  }
}
// Square {H, W, kH, kW} combinations for the 2D convolution.
static void Conv2DSizes(::benchmark::Benchmark* b) {
  for (int hw : {32, 64, 128, 224}) {
    for (int k : {3, 5, 7}) {
      b->Args({hw, hw, k, k});
    }
  }
}
// {channels, spatial, kernel} combinations for the channel variant.
static void Conv2DChannelSizes(::benchmark::Benchmark* b) {
  for (int c : {3, 64, 128}) {
    for (int hw : {16, 32, 56}) {
      for (int k : {3, 5}) {
        b->Args({c, hw, k});
      }
    }
  }
}
// {spatial, kernel, threads} combinations for the thread-pool variant.
static void Conv2DThreadPoolSizes(::benchmark::Benchmark* b) {
  for (int hw : {64, 128, 224}) {
    for (int k : {3, 5}) {
      for (int threads : {2, 4, 8}) {
        b->Args({hw, k, threads});
      }
    }
  }
}
// Register convolution benchmarks over the shared size grids.
BENCHMARK(BM_Convolve1D)->Apply(Conv1DSizes);
BENCHMARK(BM_Convolve2D)->Apply(Conv2DSizes);
BENCHMARK(BM_Convolve2D_Channels)->Apply(Conv2DChannelSizes);
BENCHMARK(BM_Convolve2D_ThreadPool)->Apply(Conv2DThreadPoolSizes);

View File

@@ -0,0 +1,142 @@
// Benchmarks for Eigen Tensor morphing operations: reshape, slice, chip, pad, stride.
#include <benchmark/benchmark.h>
#include <unsupported/Eigen/CXX11/Tensor>
using namespace Eigen;
typedef float Scalar;
// --- Reshape (zero-cost if no evaluation needed; force eval via assignment) ---
// Flatten an MxN tensor into a length-M*N vector; assignment forces a copy.
static void BM_Reshape(benchmark::State& state) {
  const int rows = state.range(0);
  const int cols = state.range(1);
  Tensor<Scalar, 2> src(rows, cols);
  src.setRandom();
  Eigen::array<Index, 1> flat_shape = {rows * cols};
  for (auto _ : state) {
    Tensor<Scalar, 1> dst = src.reshape(flat_shape);
    benchmark::DoNotOptimize(dst.data());
    benchmark::ClobberMemory();
  }
  state.SetBytesProcessed(state.iterations() * rows * cols * sizeof(Scalar));
}
// --- Slice ---
// Copy out the top-left quarter of an MxN tensor via slice().
static void BM_Slice(benchmark::State& state) {
  const int rows = state.range(0);
  const int cols = state.range(1);
  Tensor<Scalar, 2> src(rows, cols);
  src.setRandom();
  const int half_rows = rows / 2;
  const int half_cols = cols / 2;
  Eigen::array<Index, 2> start = {0, 0};
  Eigen::array<Index, 2> sizes = {half_rows, half_cols};
  for (auto _ : state) {
    Tensor<Scalar, 2> dst = src.slice(start, sizes);
    benchmark::DoNotOptimize(dst.data());
    benchmark::ClobberMemory();
  }
  state.SetBytesProcessed(state.iterations() * half_rows * half_cols * sizeof(Scalar));
}
// --- Chip (extract a sub-tensor along one dimension) ---
// Extract the first rank-2 slice along dimension 0 of a rank-3 tensor.
static void BM_Chip(benchmark::State& state) {
  const int d0 = state.range(0);
  const int d1 = state.range(1);
  const int d2 = state.range(2);
  Tensor<Scalar, 3> src(d0, d1, d2);
  src.setRandom();
  for (auto _ : state) {
    Tensor<Scalar, 2> slice = src.chip(0, 0);
    benchmark::DoNotOptimize(slice.data());
    benchmark::ClobberMemory();
  }
  state.SetBytesProcessed(state.iterations() * d1 * d2 * sizeof(Scalar));
}
// --- Pad ---
// Zero-pad both dimensions by `pad` elements on each side.
static void BM_Pad(benchmark::State& state) {
  const int rows = state.range(0);
  const int cols = state.range(1);
  const int pad = state.range(2);
  Tensor<Scalar, 2> src(rows, cols);
  src.setRandom();
  Eigen::array<std::pair<int, int>, 2> pads;
  pads[0] = {pad, pad};
  pads[1] = {pad, pad};
  for (auto _ : state) {
    Tensor<Scalar, 2> dst = src.pad(pads);
    benchmark::DoNotOptimize(dst.data());
    benchmark::ClobberMemory();
  }
  const int out_rows = rows + 2 * pad;
  const int out_cols = cols + 2 * pad;
  state.SetBytesProcessed(state.iterations() * out_rows * out_cols * sizeof(Scalar));
}
// --- Stride ---
// Down-sample both dimensions by a fixed stride.
static void BM_Stride(benchmark::State& state) {
  const int rows = state.range(0);
  const int cols = state.range(1);
  const int step = state.range(2);
  Tensor<Scalar, 2> src(rows, cols);
  src.setRandom();
  Eigen::array<Index, 2> steps = {step, step};
  for (auto _ : state) {
    Tensor<Scalar, 2> dst = src.stride(steps);
    benchmark::DoNotOptimize(dst.data());
    benchmark::ClobberMemory();
  }
  // ceil(dim / step) elements survive in each dimension.
  const int out_rows = (rows + step - 1) / step;
  const int out_cols = (cols + step - 1) / step;
  state.SetBytesProcessed(state.iterations() * out_rows * out_cols * sizeof(Scalar));
}
// Square {rows, cols} pairs for reshape and slice.
static void MorphSizes(::benchmark::Benchmark* b) {
  for (int size : {256, 1024}) {
    b->Args({size, size});
  }
}
// {D0, D1, D2} triples for chip.
static void ChipSizes(::benchmark::Benchmark* b) {
  b->Args({32, 256, 256});
  b->Args({64, 128, 128});
  b->Args({8, 512, 512});
}
// {rows, cols, pad} combinations for pad.
static void PadSizes(::benchmark::Benchmark* b) {
  for (int size : {256, 1024}) {
    for (int pad : {1, 4, 16}) {
      b->Args({size, size, pad});
    }
  }
}
// {rows, cols, stride} combinations for stride.
static void StrideSizes(::benchmark::Benchmark* b) {
  for (int size : {256, 1024}) {
    for (int stride : {2, 4}) {
      b->Args({size, size, stride});
    }
  }
}
// Register morphing benchmarks over the shared size grids.
BENCHMARK(BM_Reshape)->Apply(MorphSizes);
BENCHMARK(BM_Slice)->Apply(MorphSizes);
BENCHMARK(BM_Chip)->Apply(ChipSizes);
BENCHMARK(BM_Pad)->Apply(PadSizes);
BENCHMARK(BM_Stride)->Apply(StrideSizes);

View File

@@ -0,0 +1,158 @@
// Benchmarks for Eigen Tensor reductions (sum, maximum, mean).
// Tests full and partial reductions, inner vs outer dimension, DefaultDevice and ThreadPoolDevice.
#define EIGEN_USE_THREADS
#include <benchmark/benchmark.h>
#include <unsupported/Eigen/CXX11/Tensor>
#include <unsupported/Eigen/CXX11/ThreadPool>
using namespace Eigen;
#ifndef SCALAR
#define SCALAR float
#endif
typedef SCALAR Scalar;
// --- Full reduction (rank-2) ---
// Reduce every element of an MxN tensor to a scalar using ReduceOp.
template <typename ReduceOp>
static void BM_FullReduction(benchmark::State& state) {
  const int rows = state.range(0);
  const int cols = state.range(1);
  Tensor<Scalar, 2> src(rows, cols);
  src.setRandom();
  for (auto _ : state) {
    // Reduce over both dimensions; the result is a rank-0 tensor.
    Tensor<Scalar, 0> acc = src.reduce(Eigen::array<int, 2>{0, 1}, ReduceOp());
    benchmark::DoNotOptimize(acc.data());
    benchmark::ClobberMemory();
  }
  state.SetBytesProcessed(state.iterations() * rows * cols * sizeof(Scalar));
}
// --- Partial reduction along dim 0 (inner dim, ColMajor) ---
// Sum along dimension 0 — the contiguous (inner) dimension in ColMajor.
static void BM_ReduceInner(benchmark::State& state) {
  const int rows = state.range(0);
  const int cols = state.range(1);
  Tensor<Scalar, 2> src(rows, cols);
  src.setRandom();
  Eigen::array<int, 1> dims = {0};
  for (auto _ : state) {
    Tensor<Scalar, 1> acc = src.sum(dims);
    benchmark::DoNotOptimize(acc.data());
    benchmark::ClobberMemory();
  }
  state.SetBytesProcessed(state.iterations() * rows * cols * sizeof(Scalar));
}
// --- Partial reduction along dim 1 (outer dim, ColMajor) ---
// Sum along dimension 1 — the strided (outer) dimension in ColMajor.
static void BM_ReduceOuter(benchmark::State& state) {
  const int rows = state.range(0);
  const int cols = state.range(1);
  Tensor<Scalar, 2> src(rows, cols);
  src.setRandom();
  Eigen::array<int, 1> dims = {1};
  for (auto _ : state) {
    Tensor<Scalar, 1> acc = src.sum(dims);
    benchmark::DoNotOptimize(acc.data());
    benchmark::ClobberMemory();
  }
  state.SetBytesProcessed(state.iterations() * rows * cols * sizeof(Scalar));
}
// --- Rank-4 partial reduction (batch x channels x H x W), reduce along spatial dims ---
// Global-average-pool pattern: reduce a rank-4 (batch, C, H, H) tensor over
// its two spatial dimensions, leaving a (batch, C) result.
static void BM_ReduceSpatial(benchmark::State& state) {
  const int batches = state.range(0);
  const int channels = state.range(1);
  const int hw = state.range(2);
  Tensor<Scalar, 4> src(batches, channels, hw, hw);
  src.setRandom();
  Eigen::array<int, 2> dims = {2, 3};
  for (auto _ : state) {
    Tensor<Scalar, 2> acc = src.sum(dims);
    benchmark::DoNotOptimize(acc.data());
    benchmark::ClobberMemory();
  }
  state.SetBytesProcessed(state.iterations() * batches * channels * hw * hw * sizeof(Scalar));
}
// --- Full reduction with ThreadPoolDevice ---
// Full sum reduction dispatched through a ThreadPoolDevice.
static void BM_FullReduction_ThreadPool(benchmark::State& state) {
  const int rows = state.range(0);
  const int cols = state.range(1);
  const int nthreads = state.range(2);
  Tensor<Scalar, 2> src(rows, cols);
  src.setRandom();
  Tensor<Scalar, 0> acc;
  ThreadPool pool(nthreads);
  ThreadPoolDevice device(&pool, nthreads);
  for (auto _ : state) {
    acc.device(device) = src.sum();
    benchmark::DoNotOptimize(acc.data());
    benchmark::ClobberMemory();
  }
  state.SetBytesProcessed(state.iterations() * rows * cols * sizeof(Scalar));
  state.counters["threads"] = nthreads;
}
// --- Maximum reduction (rank-2) ---
// Full maximum reduction of an MxN tensor via the maximum() shortcut.
static void BM_MaxReduction(benchmark::State& state) {
  const int rows = state.range(0);
  const int cols = state.range(1);
  Tensor<Scalar, 2> src(rows, cols);
  src.setRandom();
  for (auto _ : state) {
    Tensor<Scalar, 0> acc = src.maximum();
    benchmark::DoNotOptimize(acc.data());
    benchmark::ClobberMemory();
  }
  state.SetBytesProcessed(state.iterations() * rows * cols * sizeof(Scalar));
}
// Square {rows, cols} pairs for the rank-2 reductions.
static void ReductionSizes(::benchmark::Benchmark* b) {
  for (int size : {64, 256, 1024}) {
    b->Args({size, size});
  }
}
// {rows, cols, threads} combinations for the thread-pool reduction.
static void ThreadPoolReductionSizes(::benchmark::Benchmark* b) {
  for (int size : {256, 1024}) {
    for (int threads : {2, 4, 8}) {
      b->Args({size, size, threads});
    }
  }
}
// {batch, channels, spatial} combinations for the rank-4 spatial reduction.
static void SpatialSizes(::benchmark::Benchmark* b) {
  for (int batch : {1, 8, 32}) {
    for (int c : {64, 128}) {
      for (int h : {16, 32}) {
        b->Args({batch, c, h});
      }
    }
  }
}
// Register reduction benchmarks over the shared size grids.
BENCHMARK(BM_FullReduction<internal::SumReducer<Scalar>>)->Apply(ReductionSizes)->Name("SumReduction");
BENCHMARK(BM_FullReduction<internal::MaxReducer<Scalar>>)->Apply(ReductionSizes)->Name("MaxReduction_Full");
BENCHMARK(BM_MaxReduction)->Apply(ReductionSizes);
BENCHMARK(BM_ReduceInner)->Apply(ReductionSizes);
BENCHMARK(BM_ReduceOuter)->Apply(ReductionSizes);
BENCHMARK(BM_ReduceSpatial)->Apply(SpatialSizes);
BENCHMARK(BM_FullReduction_ThreadPool)->Apply(ThreadPoolReductionSizes);

View File

@@ -0,0 +1,115 @@
// Benchmarks for Eigen Tensor shuffling (transpose / permutation).
#include <benchmark/benchmark.h>
#include <unsupported/Eigen/CXX11/Tensor>
using namespace Eigen;
typedef float Scalar;
// --- Rank-2 transpose ---
// Out-of-place rank-2 transpose via shuffle.
static void BM_Shuffle2D(benchmark::State& state) {
  const int rows = state.range(0);
  const int cols = state.range(1);
  Tensor<Scalar, 2> src(rows, cols);
  src.setRandom();
  Tensor<Scalar, 2> dst(cols, rows);
  Eigen::array<int, 2> transpose = {1, 0};
  for (auto _ : state) {
    dst = src.shuffle(transpose);
    benchmark::DoNotOptimize(dst.data());
    benchmark::ClobberMemory();
  }
  state.SetBytesProcessed(state.iterations() * rows * cols * sizeof(Scalar) * 2);
}
// --- Identity shuffle (no permutation, measures overhead) ---
// Same as BM_Shuffle2D but with the identity permutation {0, 1}, isolating
// the shuffle expression's bookkeeping cost from actual data reordering.
static void BM_ShuffleIdentity(benchmark::State& state) {
  const int rows = state.range(0);
  const int cols = state.range(1);
  Tensor<Scalar, 2> src(rows, cols);
  Tensor<Scalar, 2> dst(rows, cols);
  src.setRandom();
  const Eigen::array<int, 2> identity = {0, 1};
  for (auto _ : state) {
    dst = src.shuffle(identity);
    benchmark::DoNotOptimize(dst.data());
    benchmark::ClobberMemory();
  }
  state.SetBytesProcessed(state.iterations() * rows * cols * sizeof(Scalar) * 2);
}
// --- Rank-3 permutation ---
// Applies the cyclic permutation (2, 0, 1) to a rank-3 tensor. The result
// tensor is constructed inside the loop, so allocation is part of the
// measured cost (matching the rank-4 benchmark below).
static void BM_Shuffle3D(benchmark::State& state) {
  const int d0 = state.range(0);
  const int d1 = state.range(1);
  const int d2 = state.range(2);
  Tensor<Scalar, 3> src(d0, d1, d2);
  src.setRandom();
  const Eigen::array<int, 3> cycle = {2, 0, 1};
  for (auto _ : state) {
    Tensor<Scalar, 3> dst = src.shuffle(cycle);
    benchmark::DoNotOptimize(dst.data());
    benchmark::ClobberMemory();
  }
  state.SetBytesProcessed(state.iterations() * d0 * d1 * d2 * sizeof(Scalar) * 2);
}
// --- Rank-4 permutation (NCHW -> NHWC layout conversion) ---
// Converts a square-image batch (width == height == H) between the two
// common deep-learning layouts via the permutation (0, 2, 3, 1).
static void BM_Shuffle4D_NCHW_to_NHWC(benchmark::State& state) {
  const int batch = state.range(0);
  const int channels = state.range(1);
  const int side = state.range(2);
  Tensor<Scalar, 4> src(batch, channels, side, side);
  src.setRandom();
  const Eigen::array<int, 4> nchw_to_nhwc = {0, 2, 3, 1};
  for (auto _ : state) {
    Tensor<Scalar, 4> dst = src.shuffle(nchw_to_nhwc);
    benchmark::DoNotOptimize(dst.data());
    benchmark::ClobberMemory();
  }
  state.SetBytesProcessed(state.iterations() * batch * channels * side * side * sizeof(Scalar) * 2);
}
// Rank-2 shuffle sizes: two squares plus two skewed aspect ratios that
// stress row-major vs. column-major access patterns.
static void Shuffle2DSizes(::benchmark::Benchmark* b) {
  const int kSquares[] = {256, 1024};
  for (int n : kSquares) b->Args({n, n});
  b->Args({64, 4096});
  b->Args({4096, 64});
}
// Rank-3 shuffle shapes: a cube plus two asymmetric boxes.
static void Shuffle3DSizes(::benchmark::Benchmark* b) {
  const long kShapes[][3] = {{64, 64, 64}, {128, 128, 64}, {32, 256, 256}};
  for (const auto& shape : kShapes) b->Args({shape[0], shape[1], shape[2]});
}
// Rank-4 layout-conversion grid. Arg layout: {batch, channels, side};
// the benchmark uses `side` for both spatial dimensions.
static void Shuffle4DSizes(::benchmark::Benchmark* b) {
  const int kBatches[] = {1, 8};
  const int kChannels[] = {3, 64};  // RGB input vs. a typical feature map
  const int kSides[] = {32, 64};
  for (int batch : kBatches)
    for (int channels : kChannels)
      for (int side : kSides) b->Args({batch, channels, side});
}
// Benchmark registrations for the shuffle/permutation benchmarks.
BENCHMARK(BM_Shuffle2D)->Apply(Shuffle2DSizes);
BENCHMARK(BM_ShuffleIdentity)->Apply(Shuffle2DSizes);
BENCHMARK(BM_Shuffle3D)->Apply(Shuffle3DSizes);
BENCHMARK(BM_Shuffle4D_NCHW_to_NHWC)->Apply(Shuffle4DSizes);

View File

@@ -0,0 +1,80 @@
// Benchmarks for Eigen Tensor FFT.
#include <benchmark/benchmark.h>
#include <unsupported/Eigen/CXX11/Tensor>
using namespace Eigen;
// Scalar type is configurable from the build system (e.g. -DSCALAR=double);
// defaults to float.
#ifndef SCALAR
#define SCALAR float
#endif
typedef SCALAR Scalar;
// --- 1D FFT ---
// Forward transform of a real-valued length-N tensor to a complex result.
// Reports an MFLOPS counter using the 5 N log2(N) convention, halved for
// the real-valued input.
static void BM_TensorFFT_1D(benchmark::State& state) {
  const int n = state.range(0);
  Tensor<Scalar, 1> signal(n);
  signal.setRandom();
  const Eigen::array<int, 1> dims = {0};
  for (auto _ : state) {
    Tensor<std::complex<Scalar>, 1> spectrum = signal.template fft<BothParts, FFT_FORWARD>(dims);
    benchmark::DoNotOptimize(spectrum.data());
    benchmark::ClobberMemory();
  }
  const double flops = 5.0 * n * std::log2(static_cast<double>(n)) / 2.0;  // real->complex
  state.counters["MFLOPS"] =
      benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
}
// --- 2D FFT ---
// Forward transform over both dimensions of an NxN real-valued tensor.
static void BM_TensorFFT_2D(benchmark::State& state) {
  const int N = state.range(0);
  Tensor<Scalar, 2> input(N, N);
  input.setRandom();
  Eigen::array<int, 2> fft_dims = {0, 1};
  for (auto _ : state) {
    Tensor<std::complex<Scalar>, 2> result = input.template fft<BothParts, FFT_FORWARD>(fft_dims);
    benchmark::DoNotOptimize(result.data());
    benchmark::ClobberMemory();
  }
  // A 2D NxN transform performs N row FFTs plus N column FFTs, i.e. 2N
  // length-N transforms at ~5 N log2(N) flops each, giving
  // 10 N^2 log2(N) == 5 * total * log2(total). The previous estimate of
  // 5 * total * log2(N) undercounted by a factor of two.
  double total = static_cast<double>(N) * N;
  double mflops = 5.0 * total * std::log2(total);
  state.counters["MFLOPS"] =
      benchmark::Counter(mflops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
}
// --- 1D inverse FFT ---
// Inverse transform of a complex length-N tensor; MFLOPS counter uses the
// full 5 N log2(N) complex-transform convention.
static void BM_TensorIFFT_1D(benchmark::State& state) {
  const int n = state.range(0);
  Tensor<std::complex<Scalar>, 1> spectrum(n);
  spectrum.setRandom();
  const Eigen::array<int, 1> dims = {0};
  for (auto _ : state) {
    Tensor<std::complex<Scalar>, 1> signal = spectrum.template fft<BothParts, FFT_REVERSE>(dims);
    benchmark::DoNotOptimize(signal.data());
    benchmark::ClobberMemory();
  }
  const double flops = 5.0 * n * std::log2(static_cast<double>(n));
  state.counters["MFLOPS"] =
      benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
}
// Power-of-two transform lengths for all FFT benchmarks.
static void FFTSizes(::benchmark::Benchmark* b) {
  const int kLengths[] = {64, 256, 1024, 4096};
  for (int n : kLengths) b->Arg(n);
}
// Benchmark registrations for the tensor FFT benchmarks.
BENCHMARK(BM_TensorFFT_1D)->Apply(FFTSizes);
BENCHMARK(BM_TensorFFT_2D)->Apply(FFTSizes);
BENCHMARK(BM_TensorIFFT_1D)->Apply(FFTSizes);