diff --git a/benchmarks/Cholesky/CMakeLists.txt b/benchmarks/Cholesky/CMakeLists.txt index 8cf6a3c01..1565448be 100644 --- a/benchmarks/Cholesky/CMakeLists.txt +++ b/benchmarks/Cholesky/CMakeLists.txt @@ -1 +1,2 @@ eigen_add_benchmark(bench_cholesky bench_cholesky.cpp) +eigen_add_benchmark(bench_cholesky_double bench_cholesky.cpp DEFINITIONS SCALAR=double) diff --git a/benchmarks/Cholesky/bench_cholesky.cpp b/benchmarks/Cholesky/bench_cholesky.cpp index cbce03175..5990e82c1 100644 --- a/benchmarks/Cholesky/bench_cholesky.cpp +++ b/benchmarks/Cholesky/bench_cholesky.cpp @@ -4,7 +4,11 @@ using namespace Eigen; -typedef float Scalar; +#ifndef SCALAR +#define SCALAR float +#endif + +typedef SCALAR Scalar; static void BM_LDLT(benchmark::State& state) { int n = state.range(0); diff --git a/benchmarks/Core/CMakeLists.txt b/benchmarks/Core/CMakeLists.txt index 7cc62d1d7..a95e9bb2a 100644 --- a/benchmarks/Core/CMakeLists.txt +++ b/benchmarks/Core/CMakeLists.txt @@ -15,3 +15,5 @@ eigen_add_benchmark(bench_diagonal bench_diagonal.cpp) eigen_add_benchmark(bench_triangular_product bench_triangular_product.cpp) eigen_add_benchmark(bench_selfadjoint_product bench_selfadjoint_product.cpp) eigen_add_benchmark(bench_construction bench_construction.cpp) +eigen_add_benchmark(bench_fixed_size bench_fixed_size.cpp) +eigen_add_benchmark(bench_fixed_size_double bench_fixed_size.cpp DEFINITIONS SCALAR=double) diff --git a/benchmarks/Core/bench_fixed_size.cpp b/benchmarks/Core/bench_fixed_size.cpp new file mode 100644 index 000000000..f7b674d3a --- /dev/null +++ b/benchmarks/Core/bench_fixed_size.cpp @@ -0,0 +1,123 @@ +// Benchmarks for fixed-size matrix operations (2x2, 3x3, 4x4). +// Critical for PCL, ROS, Sophus, Drake which use small matrices extensively. + +#include +#include +#include + +using namespace Eigen; + +#ifndef SCALAR +#define SCALAR float +#endif + +typedef SCALAR Scalar; + +// --- Fixed-size GEMM --- +template +static void BM_FixedGemm(benchmark::State& state) { + typedef Matrix Mat; + Mat a = Mat::Random(); + Mat b = Mat::Random(); + Mat c; + + for (auto _ : state) { + c.noalias() = a * b; + benchmark::DoNotOptimize(c.data()); + benchmark::ClobberMemory(); + } + state.counters["GFLOPS"] = + benchmark::Counter(2.0 * N * N * N, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000); +} + +// --- Fixed-size inverse --- +template +static void BM_FixedInverse(benchmark::State& state) { + typedef Matrix Mat; + Mat a = Mat::Random(); + // Make well-conditioned. + a = a * a.transpose() + Mat::Identity(); + Mat result; + + for (auto _ : state) { + result = a.inverse(); + benchmark::DoNotOptimize(result.data()); + benchmark::ClobberMemory(); + } +} + +// --- Fixed-size determinant --- +template +static void BM_FixedDeterminant(benchmark::State& state) { + typedef Matrix Mat; + Mat a = Mat::Random(); + Scalar result; + + for (auto _ : state) { + result = a.determinant(); + benchmark::DoNotOptimize(&result); + benchmark::ClobberMemory(); + } +} + +// --- Batch transform: Matrix4 * Matrix<4,N> --- +static void BM_BatchTransform4xN(benchmark::State& state) { + int N = state.range(0); + typedef Matrix Mat4; + typedef Matrix MatXN; + + Mat4 transform = Mat4::Random(); + MatXN points = MatXN::Random(4, N); + MatXN result(4, N); + + for (auto _ : state) { + result.noalias() = transform * points; + benchmark::DoNotOptimize(result.data()); + benchmark::ClobberMemory(); + } + state.counters["GFLOPS"] = + benchmark::Counter(2.0 * 4 * 4 * N, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000); +} + +// --- Fixed 3x3 batch operations (common in point cloud processing) --- +static void BM_Batch3x3Gemm(benchmark::State& state) { + int count = state.range(0); + typedef Matrix Mat3; + + std::vector a(count), b(count), c(count); + for (int i = 0; i < count; ++i) { + a[i] = Mat3::Random(); + b[i] = Mat3::Random(); + } + + for (auto _ : state) { + for (int i = 0; i < count; ++i) { + c[i].noalias() = a[i] * b[i]; + } + benchmark::DoNotOptimize(c.data()); + benchmark::ClobberMemory(); + } + state.counters["GFLOPS"] = + benchmark::Counter(2.0 * 27 * count, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000); +} + +// Fixed-size GEMM +BENCHMARK(BM_FixedGemm<2>)->Name("FixedGemm_2x2"); +BENCHMARK(BM_FixedGemm<3>)->Name("FixedGemm_3x3"); +BENCHMARK(BM_FixedGemm<4>)->Name("FixedGemm_4x4"); + +// Fixed-size inverse +BENCHMARK(BM_FixedInverse<2>)->Name("FixedInverse_2x2"); +BENCHMARK(BM_FixedInverse<3>)->Name("FixedInverse_3x3"); +BENCHMARK(BM_FixedInverse<4>)->Name("FixedInverse_4x4"); + +// Fixed-size determinant +BENCHMARK(BM_FixedDeterminant<2>)->Name("FixedDet_2x2"); +BENCHMARK(BM_FixedDeterminant<3>)->Name("FixedDet_3x3"); +BENCHMARK(BM_FixedDeterminant<4>)->Name("FixedDet_4x4"); + +// Batch 4xN transform +BENCHMARK(BM_BatchTransform4xN)->Arg(1)->Arg(4)->Arg(8)->Arg(16)->Arg(64); + +// Batch 3x3 GEMM +BENCHMARK(BM_Batch3x3Gemm)->Arg(100)->Arg(1000)->Arg(10000); diff --git a/benchmarks/Eigenvalues/CMakeLists.txt b/benchmarks/Eigenvalues/CMakeLists.txt index 0f4398bf1..67604770a 100644 --- a/benchmarks/Eigenvalues/CMakeLists.txt +++ b/benchmarks/Eigenvalues/CMakeLists.txt @@ -1,2 +1,3 @@ eigen_add_benchmark(bench_eigensolver bench_eigensolver.cpp) +eigen_add_benchmark(bench_eigensolver_double bench_eigensolver.cpp DEFINITIONS SCALAR=double) eigen_add_benchmark(bench_eig33 bench_eig33.cpp) diff --git a/benchmarks/Eigenvalues/bench_eigensolver.cpp b/benchmarks/Eigenvalues/bench_eigensolver.cpp index 4cb1ccb2f..2a1d853f8 100644 --- a/benchmarks/Eigenvalues/bench_eigensolver.cpp +++ b/benchmarks/Eigenvalues/bench_eigensolver.cpp @@ -5,7 +5,11 @@ using namespace Eigen; -typedef float Scalar; +#ifndef SCALAR +#define SCALAR float +#endif + +typedef SCALAR Scalar; static void BM_SelfAdjointEigenSolver(benchmark::State& state) { int n = state.range(0); diff --git a/benchmarks/FFT/bench_fft.cpp b/benchmarks/FFT/bench_fft.cpp index 391178faa..4934ccaa3 100644 --- a/benchmarks/FFT/bench_fft.cpp +++ b/benchmarks/FFT/bench_fft.cpp @@ -33,11 +33,20 @@ static void BM_FFT(benchmark::State& state) { benchmark::Counter(mflops_per_iter, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000); } -BENCHMARK(BM_FFT, true>)->Arg(1024)->Arg(4096); -BENCHMARK(BM_FFT, false>)->Arg(1024)->Arg(4096); -BENCHMARK(BM_FFT)->Arg(1024)->Arg(4096); -BENCHMARK(BM_FFT)->Arg(1024)->Arg(4096); -BENCHMARK(BM_FFT, true>)->Arg(1024)->Arg(4096); -BENCHMARK(BM_FFT, false>)->Arg(1024)->Arg(4096); -BENCHMARK(BM_FFT)->Arg(1024)->Arg(4096); -BENCHMARK(BM_FFT)->Arg(1024)->Arg(4096); +static void FFTSizes(::benchmark::Benchmark* b) { + for (int n : {64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 65536}) { + b->Arg(n); + } + // Non-power-of-2 sizes. + b->Arg(1000); + b->Arg(5000); +} + +BENCHMARK(BM_FFT, true>)->Apply(FFTSizes); +BENCHMARK(BM_FFT, false>)->Apply(FFTSizes); +BENCHMARK(BM_FFT)->Apply(FFTSizes); +BENCHMARK(BM_FFT)->Apply(FFTSizes); +BENCHMARK(BM_FFT, true>)->Apply(FFTSizes); +BENCHMARK(BM_FFT, false>)->Apply(FFTSizes); +BENCHMARK(BM_FFT)->Apply(FFTSizes); +BENCHMARK(BM_FFT)->Apply(FFTSizes); diff --git a/benchmarks/Sparse/CMakeLists.txt b/benchmarks/Sparse/CMakeLists.txt index 4ebb7db95..93168aca8 100644 --- a/benchmarks/Sparse/CMakeLists.txt +++ b/benchmarks/Sparse/CMakeLists.txt @@ -1,3 +1,4 @@ eigen_add_benchmark(bench_spmv bench_spmv.cpp) eigen_add_benchmark(bench_spmm bench_spmm.cpp) eigen_add_benchmark(bench_sparse_transpose bench_sparse_transpose.cpp) +eigen_add_benchmark(bench_sparse_solvers bench_sparse_solvers.cpp) diff --git a/benchmarks/Sparse/bench_sparse_solvers.cpp b/benchmarks/Sparse/bench_sparse_solvers.cpp new file mode 100644 index 000000000..37817c933 --- /dev/null +++ b/benchmarks/Sparse/bench_sparse_solvers.cpp @@ -0,0 +1,182 @@ +// Benchmarks for sparse decomposition solvers. +// Tests SimplicialLLT, SimplicialLDLT, SparseQR, SparseLU, CG, BiCGSTAB. + +#include +#include +#include +#include +#include +#include +#include + +using namespace Eigen; + +typedef double Scalar; +typedef SparseMatrix SpMat; +typedef Matrix Vec; + +// Generate a SPD banded matrix (Laplacian-like). +static SpMat generateSPD(int n, int bandwidth) { + SpMat A(n, n); + std::vector> trips; + trips.reserve(n * (2 * bandwidth + 1)); + for (int i = 0; i < n; ++i) { + Scalar diag = 0; + for (int j = std::max(0, i - bandwidth); j < std::min(n, i + bandwidth + 1); ++j) { + if (i != j) { + Scalar val = -1.0 / (1 + std::abs(i - j)); + trips.emplace_back(i, j, val); + diag -= val; + } + } + trips.emplace_back(i, i, diag + 1.0); + } + A.setFromTriplets(trips.begin(), trips.end()); + return A; +} + +// Generate a general (non-symmetric) sparse matrix with diagonal dominance. +static SpMat generateGeneral(int n, int bandwidth) { + SpMat A(n, n); + std::vector> trips; + trips.reserve(n * (2 * bandwidth + 1)); + for (int i = 0; i < n; ++i) { + Scalar diag = 0; + for (int j = std::max(0, i - bandwidth); j < std::min(n, i + bandwidth + 1); ++j) { + if (i != j) { + Scalar val = -0.5 / (1 + std::abs(i - j)); + if (j > i) val *= 1.5; + trips.emplace_back(i, j, val); + diag += std::abs(val); + } + } + trips.emplace_back(i, i, diag + 1.0); + } + A.setFromTriplets(trips.begin(), trips.end()); + return A; +} + +// --- SimplicialLLT --- +static void BM_SimplicialLLT(benchmark::State& state) { + int n = state.range(0); + int bw = state.range(1); + SpMat A = generateSPD(n, bw); + Vec b = Vec::Random(n); + + for (auto _ : state) { + SimplicialLLT solver(A); + Vec x = solver.solve(b); + benchmark::DoNotOptimize(x.data()); + benchmark::ClobberMemory(); + } +} + +// --- SimplicialLDLT --- +static void BM_SimplicialLDLT(benchmark::State& state) { + int n = state.range(0); + int bw = state.range(1); + SpMat A = generateSPD(n, bw); + Vec b = Vec::Random(n); + + for (auto _ : state) { + SimplicialLDLT solver(A); + Vec x = solver.solve(b); + benchmark::DoNotOptimize(x.data()); + benchmark::ClobberMemory(); + } +} + +// --- SparseLU --- +static void BM_SparseLU(benchmark::State& state) { + int n = state.range(0); + int bw = state.range(1); + SpMat A = generateGeneral(n, bw); + Vec b = Vec::Random(n); + + for (auto _ : state) { + SparseLU> solver; + solver.compute(A); + Vec x = solver.solve(b); + benchmark::DoNotOptimize(x.data()); + benchmark::ClobberMemory(); + } +} + +// --- SparseQR --- +static void BM_SparseQR(benchmark::State& state) { + int n = state.range(0); + int bw = state.range(1); + SpMat A = generateGeneral(n, bw); + Vec b = Vec::Random(n); + + for (auto _ : state) { + SparseQR> solver; + solver.compute(A); + Vec x = solver.solve(b); + benchmark::DoNotOptimize(x.data()); + benchmark::ClobberMemory(); + } +} + +// --- ConjugateGradient (SPD) --- +static void BM_CG(benchmark::State& state) { + int n = state.range(0); + int bw = state.range(1); + SpMat A = generateSPD(n, bw); + Vec b = Vec::Random(n); + + ConjugateGradient solver; + solver.setMaxIterations(1000); + solver.setTolerance(1e-10); + solver.compute(A); + + for (auto _ : state) { + Vec x = solver.solve(b); + benchmark::DoNotOptimize(x.data()); + benchmark::ClobberMemory(); + } + state.counters["iterations"] = solver.iterations(); +} + +// --- BiCGSTAB (general) --- +static void BM_BiCGSTAB(benchmark::State& state) { + int n = state.range(0); + int bw = state.range(1); + SpMat A = generateGeneral(n, bw); + Vec b = Vec::Random(n); + + BiCGSTAB solver; + solver.setMaxIterations(1000); + solver.setTolerance(1e-10); + solver.compute(A); + + for (auto _ : state) { + Vec x = solver.solve(b); + benchmark::DoNotOptimize(x.data()); + benchmark::ClobberMemory(); + } + state.counters["iterations"] = solver.iterations(); +} + +static void DirectSolverSizes(::benchmark::Benchmark* b) { + for (int n : {1000, 5000, 10000, 50000}) { + for (int bw : {5, 20}) { + b->Args({n, bw}); + } + } +} + +static void IterativeSolverSizes(::benchmark::Benchmark* b) { + for (int n : {1000, 10000, 50000}) { + for (int bw : {5, 20}) { + b->Args({n, bw}); + } + } +} + +BENCHMARK(BM_SimplicialLLT)->Apply(DirectSolverSizes); +BENCHMARK(BM_SimplicialLDLT)->Apply(DirectSolverSizes); +BENCHMARK(BM_SparseLU)->Apply(DirectSolverSizes); +BENCHMARK(BM_SparseQR)->Apply(DirectSolverSizes); +BENCHMARK(BM_CG)->Apply(IterativeSolverSizes); +BENCHMARK(BM_BiCGSTAB)->Apply(IterativeSolverSizes); diff --git a/unsupported/benchmarks/AutoDiff/CMakeLists.txt b/unsupported/benchmarks/AutoDiff/CMakeLists.txt new file mode 100644 index 000000000..cb5b98687 --- /dev/null +++ b/unsupported/benchmarks/AutoDiff/CMakeLists.txt @@ -0,0 +1 @@ +eigen_add_benchmark(bench_autodiff bench_autodiff.cpp) diff --git a/unsupported/benchmarks/AutoDiff/bench_autodiff.cpp b/unsupported/benchmarks/AutoDiff/bench_autodiff.cpp new file mode 100644 index 000000000..74c334f97 --- /dev/null +++ b/unsupported/benchmarks/AutoDiff/bench_autodiff.cpp @@ -0,0 +1,177 @@ +// Benchmarks for Eigen AutoDiff module. +// Compares AutoDiff Jacobian computation against NumericalDiff and hand-coded Jacobians. + +#include +#include +#include +#include + +using namespace Eigen; + +// --- Small functor: Rosenbrock-like (2 inputs -> 2 outputs) --- +struct SmallFunctor { + typedef Matrix InputType; + typedef Matrix ValueType; + typedef Matrix JacobianType; + + enum { InputsAtCompileTime = 2, ValuesAtCompileTime = 2 }; + + template + void operator()(const Matrix& x, Matrix* v) const { + (*v)(0) = T(1) - x(0); + (*v)(1) = T(10) * (x(1) - x(0) * x(0)); + } +}; + +// --- Medium functor: chain of operations (6 inputs -> 6 outputs) --- +struct MediumFunctor { + typedef Matrix InputType; + typedef Matrix ValueType; + typedef Matrix JacobianType; + + enum { InputsAtCompileTime = 6, ValuesAtCompileTime = 6 }; + + template + void operator()(const Matrix& x, Matrix* v) const { + (*v)(0) = sin(x(0)) * cos(x(1)) + x(2) * x(2); + (*v)(1) = exp(x(1) * T(0.1)) + x(3); + (*v)(2) = x(0) * x(2) - x(4) * x(5); + (*v)(3) = sqrt(x(3) * x(3) + T(1)) + x(0); + (*v)(4) = x(4) * x(4) + x(5) * x(5) + x(0) * x(1); + (*v)(5) = log(x(2) * x(2) + T(1)) + x(3) * x(4); + } +}; + +// --- Dynamic-size functor (N inputs -> N outputs) --- +struct DynamicFunctor { + typedef Matrix InputType; + typedef Matrix ValueType; + typedef Matrix JacobianType; + + const int n_; + DynamicFunctor(int n) : n_(n) {} + + enum { InputsAtCompileTime = Dynamic, ValuesAtCompileTime = Dynamic }; + + int inputs() const { return n_; } + int values() const { return n_; } + + template + void operator()(const Matrix& x, Matrix* v) const { + v->resize(n_); + (*v)(0) = T(1) - x(0); + for (int i = 1; i < n_; ++i) { + (*v)(i) = T(10) * (x(i) - x(i - 1) * x(i - 1)); + } + } +}; + +// Wrapper for NumericalDiff compatibility. +struct SmallFunctorND : SmallFunctor { + typedef double Scalar; + int inputs() const { return 2; } + int values() const { return 2; } + int operator()(const InputType& x, ValueType& v) const { + SmallFunctor::operator()(x, &v); + return 0; + } +}; + +struct MediumFunctorND : MediumFunctor { + typedef double Scalar; + int inputs() const { return 6; } + int values() const { return 6; } + int operator()(const InputType& x, ValueType& v) const { + MediumFunctor::operator()(x, &v); + return 0; + } +}; + +// --- AutoDiff Jacobian benchmarks --- +template +static void BM_AutoDiffJacobian(benchmark::State& state, Functor func) { + AutoDiffJacobian adf(func); + typename Functor::InputType x = Functor::InputType::Random(); + typename Functor::ValueType v; + typename Functor::JacobianType jac; + + for (auto _ : state) { + adf(x, &v, &jac); + benchmark::DoNotOptimize(jac.data()); + benchmark::ClobberMemory(); + } +} + +// --- Dynamic AutoDiff Jacobian --- +static void BM_AutoDiffJacobian_Dynamic(benchmark::State& state) { + int n = state.range(0); + DynamicFunctor func(n); + AutoDiffJacobian adf(func); + + VectorXd x = VectorXd::Random(n); + VectorXd v(n); + MatrixXd jac(n, n); + + for (auto _ : state) { + adf(x, &v, &jac); + benchmark::DoNotOptimize(jac.data()); + benchmark::ClobberMemory(); + } +} + +// --- NumericalDiff benchmarks --- +template +static void BM_NumericalDiffJacobian(benchmark::State& state, Functor func) { + NumericalDiff ndf(func); + typename Functor::InputType x = Functor::InputType::Random(); + typename Functor::JacobianType jac; + + for (auto _ : state) { + ndf.df(x, jac); + benchmark::DoNotOptimize(jac.data()); + benchmark::ClobberMemory(); + } +} + +// --- Hand-coded Jacobian (Rosenbrock) for comparison --- +static void BM_HandCoded_Small(benchmark::State& state) { + Vector2d x = Vector2d::Random(); + Matrix2d jac; + + for (auto _ : state) { + jac(0, 0) = -1; + jac(0, 1) = 0; + jac(1, 0) = -20 * x(0); + jac(1, 1) = 10; + benchmark::DoNotOptimize(jac.data()); + benchmark::ClobberMemory(); + } +} + +// --- Scalar AutoDiff evaluation (no Jacobian, just forward pass) --- +static void BM_AutoDiffScalar_Eval(benchmark::State& state) { + int n = state.range(0); + using ADScalar = AutoDiffScalar; + VectorXd x = VectorXd::Random(n); + + for (auto _ : state) { + ADScalar sum(0.0, VectorXd::Zero(n)); + for (int i = 0; i < n; ++i) { + ADScalar xi(x(i), n, i); + sum += xi * xi + sin(xi); + } + benchmark::DoNotOptimize(sum.value()); + benchmark::DoNotOptimize(sum.derivatives().data()); + benchmark::ClobberMemory(); + } +} + +BENCHMARK_CAPTURE(BM_AutoDiffJacobian, Small, SmallFunctor()); +BENCHMARK_CAPTURE(BM_AutoDiffJacobian, Medium, MediumFunctor()); +BENCHMARK(BM_AutoDiffJacobian_Dynamic)->Arg(2)->Arg(6)->Arg(20)->Arg(50)->Arg(100); + +BENCHMARK_CAPTURE(BM_NumericalDiffJacobian, Small, SmallFunctorND()); +BENCHMARK_CAPTURE(BM_NumericalDiffJacobian, Medium, MediumFunctorND()); + +BENCHMARK(BM_HandCoded_Small); +BENCHMARK(BM_AutoDiffScalar_Eval)->Arg(2)->Arg(6)->Arg(20)->Arg(50)->Arg(100); diff --git a/unsupported/benchmarks/CMakeLists.txt b/unsupported/benchmarks/CMakeLists.txt new file mode 100644 index 000000000..ed8c88ea0 --- /dev/null +++ b/unsupported/benchmarks/CMakeLists.txt @@ -0,0 +1,35 @@ +cmake_minimum_required(VERSION 3.10) +project(EigenUnsupportedBenchmarks CXX) + +find_package(benchmark REQUIRED) +find_package(Threads REQUIRED) + +set(EIGEN_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../..") + +# Helper: add a Google Benchmark target (mirrors benchmarks/CMakeLists.txt). +# eigen_add_benchmark(name source [LIBRARIES lib1 lib2 ...] [DEFINITIONS def1 def2 ...]) +function(eigen_add_benchmark name source) + cmake_parse_arguments(BENCH "" "" "LIBRARIES;DEFINITIONS" ${ARGN}) + if(NOT IS_ABSOLUTE "${source}") + set(source "${CMAKE_CURRENT_SOURCE_DIR}/${source}") + endif() + add_executable(${name} ${source}) + target_include_directories(${name} PRIVATE ${EIGEN_SOURCE_DIR}) + target_link_libraries(${name} PRIVATE benchmark::benchmark benchmark::benchmark_main + Threads::Threads) + if(BENCH_LIBRARIES) + target_link_libraries(${name} PRIVATE ${BENCH_LIBRARIES}) + endif() + target_compile_options(${name} PRIVATE -O3 -DNDEBUG) + if(BENCH_DEFINITIONS) + target_compile_definitions(${name} PRIVATE ${BENCH_DEFINITIONS}) + endif() +endfunction() + +add_subdirectory(Tensor) +add_subdirectory(MatrixFunctions) +add_subdirectory(SpecialFunctions) +add_subdirectory(AutoDiff) +add_subdirectory(Splines) +add_subdirectory(IterativeSolvers) +add_subdirectory(KroneckerProduct) diff --git a/unsupported/benchmarks/IterativeSolvers/CMakeLists.txt b/unsupported/benchmarks/IterativeSolvers/CMakeLists.txt new file mode 100644 index 000000000..4ae536a31 --- /dev/null +++ b/unsupported/benchmarks/IterativeSolvers/CMakeLists.txt @@ -0,0 +1 @@ +eigen_add_benchmark(bench_iterative_solvers bench_iterative_solvers.cpp) diff --git a/unsupported/benchmarks/IterativeSolvers/bench_iterative_solvers.cpp b/unsupported/benchmarks/IterativeSolvers/bench_iterative_solvers.cpp new file mode 100644 index 000000000..5105d40e2 --- /dev/null +++ b/unsupported/benchmarks/IterativeSolvers/bench_iterative_solvers.cpp @@ -0,0 +1,209 @@ +// Benchmarks for unsupported iterative solvers: GMRES, MINRES, IDRS, IDRSTABL, BiCGSTABL, DGMRES. + +#include +#include +#include +#include + +using namespace Eigen; + +typedef double Scalar; +typedef SparseMatrix SpMat; +typedef Matrix Vec; + +// Generate a SPD sparse matrix (Laplacian-like with diagonal dominance). +static SpMat generateSPD(int n, int bandwidth) { + SpMat A(n, n); + std::vector> trips; + trips.reserve(n * (2 * bandwidth + 1)); + for (int i = 0; i < n; ++i) { + Scalar diag = 0; + for (int j = std::max(0, i - bandwidth); j < std::min(n, i + bandwidth + 1); ++j) { + if (i != j) { + Scalar val = -1.0 / (1 + std::abs(i - j)); + trips.emplace_back(i, j, val); + diag -= val; + } + } + trips.emplace_back(i, i, diag + 1.0); + } + A.setFromTriplets(trips.begin(), trips.end()); + return A; +} + +// Generate a general (non-symmetric) sparse matrix. +static SpMat generateGeneral(int n, int bandwidth) { + SpMat A(n, n); + std::vector> trips; + trips.reserve(n * (2 * bandwidth + 1)); + for (int i = 0; i < n; ++i) { + Scalar diag = 0; + for (int j = std::max(0, i - bandwidth); j < std::min(n, i + bandwidth + 1); ++j) { + if (i != j) { + Scalar val = -0.5 / (1 + std::abs(i - j)); + if (j > i) val *= 1.5; // asymmetry + trips.emplace_back(i, j, val); + diag += std::abs(val); + } + } + trips.emplace_back(i, i, diag + 1.0); // diagonal dominance + } + A.setFromTriplets(trips.begin(), trips.end()); + return A; +} + +// --- GMRES --- +static void BM_GMRES(benchmark::State& state) { + int n = state.range(0); + int bw = state.range(1); + SpMat A = generateGeneral(n, bw); + Vec b = Vec::Random(n); + + GMRES solver; + solver.setMaxIterations(1000); + solver.setTolerance(1e-10); + solver.compute(A); + + for (auto _ : state) { + Vec x = solver.solve(b); + benchmark::DoNotOptimize(x.data()); + benchmark::ClobberMemory(); + } + state.counters["iterations"] = solver.iterations(); +} + +// --- DGMRES --- +static void BM_DGMRES(benchmark::State& state) { + int n = state.range(0); + int bw = state.range(1); + SpMat A = generateGeneral(n, bw); + Vec b = Vec::Random(n); + + DGMRES solver; + solver.setMaxIterations(1000); + solver.setTolerance(1e-10); + solver.compute(A); + + for (auto _ : state) { + Vec x = solver.solve(b); + benchmark::DoNotOptimize(x.data()); + benchmark::ClobberMemory(); + } + state.counters["iterations"] = solver.iterations(); +} + +// --- MINRES (SPD matrices) --- +static void BM_MINRES(benchmark::State& state) { + int n = state.range(0); + int bw = state.range(1); + SpMat A = generateSPD(n, bw); + Vec b = Vec::Random(n); + + MINRES solver; + solver.setMaxIterations(1000); + solver.setTolerance(1e-10); + solver.compute(A); + + for (auto _ : state) { + Vec x = solver.solve(b); + benchmark::DoNotOptimize(x.data()); + benchmark::ClobberMemory(); + } + state.counters["iterations"] = solver.iterations(); +} + +// --- IDRS --- +static void BM_IDRS(benchmark::State& state) { + int n = state.range(0); + int bw = state.range(1); + SpMat A = generateGeneral(n, bw); + Vec b = Vec::Random(n); + + IDRS solver; + solver.setMaxIterations(1000); + solver.setTolerance(1e-10); + solver.compute(A); + + for (auto _ : state) { + Vec x = solver.solve(b); + benchmark::DoNotOptimize(x.data()); + benchmark::ClobberMemory(); + } + state.counters["iterations"] = solver.iterations(); +} + +// --- BiCGSTABL --- +static void BM_BiCGSTABL(benchmark::State& state) { + int n = state.range(0); + int bw = state.range(1); + SpMat A = generateGeneral(n, bw); + Vec b = Vec::Random(n); + + BiCGSTABL solver; + solver.setMaxIterations(1000); + solver.setTolerance(1e-10); + solver.compute(A); + + for (auto _ : state) { + Vec x = solver.solve(b); + benchmark::DoNotOptimize(x.data()); + benchmark::ClobberMemory(); + } + state.counters["iterations"] = solver.iterations(); +} + +// --- Compare with CG (supported module, SPD only) --- +static void BM_CG_Reference(benchmark::State& state) { + int n = state.range(0); + int bw = state.range(1); + SpMat A = generateSPD(n, bw); + Vec b = Vec::Random(n); + + ConjugateGradient solver; + solver.setMaxIterations(1000); + solver.setTolerance(1e-10); + solver.compute(A); + + for (auto _ : state) { + Vec x = solver.solve(b); + benchmark::DoNotOptimize(x.data()); + benchmark::ClobberMemory(); + } + state.counters["iterations"] = solver.iterations(); +} + +// --- Compare with BiCGSTAB (supported module, general) --- +static void BM_BiCGSTAB_Reference(benchmark::State& state) { + int n = state.range(0); + int bw = state.range(1); + SpMat A = generateGeneral(n, bw); + Vec b = Vec::Random(n); + + BiCGSTAB solver; + solver.setMaxIterations(1000); + solver.setTolerance(1e-10); + solver.compute(A); + + for (auto _ : state) { + Vec x = solver.solve(b); + benchmark::DoNotOptimize(x.data()); + benchmark::ClobberMemory(); + } + state.counters["iterations"] = solver.iterations(); +} + +static void SolverSizes(::benchmark::Benchmark* b) { + for (int n : {1000, 10000, 100000}) { + for (int bw : {5, 20}) { + b->Args({n, bw}); + } + } +} + +BENCHMARK(BM_GMRES)->Apply(SolverSizes); +BENCHMARK(BM_DGMRES)->Apply(SolverSizes); +BENCHMARK(BM_MINRES)->Apply(SolverSizes); +BENCHMARK(BM_IDRS)->Apply(SolverSizes); +BENCHMARK(BM_BiCGSTABL)->Apply(SolverSizes); +BENCHMARK(BM_CG_Reference)->Apply(SolverSizes); +BENCHMARK(BM_BiCGSTAB_Reference)->Apply(SolverSizes); diff --git a/unsupported/benchmarks/KroneckerProduct/CMakeLists.txt b/unsupported/benchmarks/KroneckerProduct/CMakeLists.txt new file mode 100644 index 000000000..ff0d28969 --- /dev/null +++ b/unsupported/benchmarks/KroneckerProduct/CMakeLists.txt @@ -0,0 +1 @@ +eigen_add_benchmark(bench_kronecker bench_kronecker.cpp) diff --git a/unsupported/benchmarks/KroneckerProduct/bench_kronecker.cpp b/unsupported/benchmarks/KroneckerProduct/bench_kronecker.cpp new file mode 100644 index 000000000..882887e2c --- /dev/null +++ b/unsupported/benchmarks/KroneckerProduct/bench_kronecker.cpp @@ -0,0 +1,83 @@ +// Benchmarks for Kronecker product (dense and sparse). + +#include +#include +#include +#include + +using namespace Eigen; + +typedef double Scalar; +typedef Matrix Mat; +typedef SparseMatrix SpMat; + +// --- Dense Kronecker product --- +static void BM_KroneckerDense(benchmark::State& state) { + int na = state.range(0); + int nb = state.range(1); + + Mat A = Mat::Random(na, na); + Mat B = Mat::Random(nb, nb); + + for (auto _ : state) { + Mat C = kroneckerProduct(A, B).eval(); + benchmark::DoNotOptimize(C.data()); + benchmark::ClobberMemory(); + } + int outSize = na * nb; + state.counters["output_size"] = outSize; +} + +// --- Sparse Kronecker product --- +static void BM_KroneckerSparse(benchmark::State& state) { + int na = state.range(0); + int nb = state.range(1); + + // Create sparse identity-like matrices with some fill. + SpMat A(na, na); + SpMat B(nb, nb); + + std::vector> tripsA, tripsB; + for (int i = 0; i < na; ++i) { + tripsA.emplace_back(i, i, 2.0); + if (i + 1 < na) { + tripsA.emplace_back(i, i + 1, -1.0); + tripsA.emplace_back(i + 1, i, -1.0); + } + } + for (int i = 0; i < nb; ++i) { + tripsB.emplace_back(i, i, 2.0); + if (i + 1 < nb) { + tripsB.emplace_back(i, i + 1, -1.0); + tripsB.emplace_back(i + 1, i, -1.0); + } + } + A.setFromTriplets(tripsA.begin(), tripsA.end()); + B.setFromTriplets(tripsB.begin(), tripsB.end()); + + for (auto _ : state) { + SpMat C = kroneckerProduct(A, B).eval(); + benchmark::DoNotOptimize(C.valuePtr()); + benchmark::ClobberMemory(); + } + state.counters["output_size"] = na * nb; +} + +static void KroneckerSizes(::benchmark::Benchmark* b) { + for (int na : {4, 8, 16}) { + for (int nb : {4, 8, 16}) { + b->Args({na, nb}); + } + } +} + +static void KroneckerSparseSizes(::benchmark::Benchmark* b) { + for (int na : {16, 32, 64, 128}) { + for (int nb : {16, 32, 64, 128}) { + b->Args({na, nb}); + } + } +} + +BENCHMARK(BM_KroneckerDense)->Apply(KroneckerSizes); +BENCHMARK(BM_KroneckerSparse)->Apply(KroneckerSparseSizes); diff --git a/unsupported/benchmarks/MatrixFunctions/CMakeLists.txt b/unsupported/benchmarks/MatrixFunctions/CMakeLists.txt new file mode 100644 index 000000000..2a612ed3c --- /dev/null +++ b/unsupported/benchmarks/MatrixFunctions/CMakeLists.txt @@ -0,0 +1,3 @@ +eigen_add_benchmark(bench_matrix_exponential bench_matrix_exponential.cpp) +eigen_add_benchmark(bench_matrix_logarithm bench_matrix_logarithm.cpp) +eigen_add_benchmark(bench_matrix_power bench_matrix_power.cpp) diff --git a/unsupported/benchmarks/MatrixFunctions/bench_matrix_exponential.cpp b/unsupported/benchmarks/MatrixFunctions/bench_matrix_exponential.cpp new file mode 100644 index 000000000..3778b769c --- /dev/null +++ b/unsupported/benchmarks/MatrixFunctions/bench_matrix_exponential.cpp @@ -0,0 +1,52 @@ +// Benchmarks for matrix exponential. +// Critical for Sophus Lie group operations (SLAM, visual odometry). + +#include +#include +#include + +using namespace Eigen; + +#ifndef SCALAR +#define SCALAR double +#endif + +typedef SCALAR Scalar; + +static void BM_MatrixExp(benchmark::State& state) { + int n = state.range(0); + typedef Matrix MatrixType; + + // Generate a random matrix with reasonable spectral radius. + MatrixType A = MatrixType::Random(n, n) / Scalar(n); + MatrixType result(n, n); + + for (auto _ : state) { + result = A.exp(); + benchmark::DoNotOptimize(result.data()); + benchmark::ClobberMemory(); + } +} + +// Fixed-size specializations for Lie group sizes. +template +static void BM_MatrixExp_Fixed(benchmark::State& state) { + typedef Matrix MatrixType; + + MatrixType A = MatrixType::Random() / Scalar(N); + MatrixType result; + + for (auto _ : state) { + result = A.exp(); + benchmark::DoNotOptimize(result.data()); + benchmark::ClobberMemory(); + } +} + +// Dynamic sizes: Lie groups (2,3,4) plus larger. +BENCHMARK(BM_MatrixExp)->Arg(2)->Arg(3)->Arg(4)->Arg(8)->Arg(16)->Arg(32)->Arg(64)->Arg(128); + +// Fixed-size Lie group dimensions. +BENCHMARK(BM_MatrixExp_Fixed<2>); +BENCHMARK(BM_MatrixExp_Fixed<3>); +BENCHMARK(BM_MatrixExp_Fixed<4>); diff --git a/unsupported/benchmarks/MatrixFunctions/bench_matrix_logarithm.cpp b/unsupported/benchmarks/MatrixFunctions/bench_matrix_logarithm.cpp new file mode 100644 index 000000000..4d1c8eadf --- /dev/null +++ b/unsupported/benchmarks/MatrixFunctions/bench_matrix_logarithm.cpp @@ -0,0 +1,51 @@ +// Benchmarks for matrix logarithm. +// Inverse of matrix exponential, used for Lie group log maps. + +#include +#include +#include + +using namespace Eigen; + +#ifndef SCALAR +#define SCALAR double +#endif + +typedef SCALAR Scalar; + +static void BM_MatrixLog(benchmark::State& state) { + int n = state.range(0); + typedef Matrix MatrixType; + + // Generate a matrix close to identity for stable log computation. + MatrixType A = MatrixType::Identity(n, n) + MatrixType::Random(n, n) / Scalar(n * 2); + // Ensure A is in the principal branch by computing exp(small matrix). + A = (MatrixType::Random(n, n) / Scalar(n * 4)).exp(); + MatrixType result(n, n); + + for (auto _ : state) { + result = A.log(); + benchmark::DoNotOptimize(result.data()); + benchmark::ClobberMemory(); + } +} + +template +static void BM_MatrixLog_Fixed(benchmark::State& state) { + typedef Matrix MatrixType; + + MatrixType A = (MatrixType::Random() / Scalar(N * 4)).exp(); + MatrixType result; + + for (auto _ : state) { + result = A.log(); + benchmark::DoNotOptimize(result.data()); + benchmark::ClobberMemory(); + } +} + +BENCHMARK(BM_MatrixLog)->Arg(2)->Arg(3)->Arg(4)->Arg(8)->Arg(16)->Arg(32)->Arg(64); + +BENCHMARK(BM_MatrixLog_Fixed<2>); +BENCHMARK(BM_MatrixLog_Fixed<3>); +BENCHMARK(BM_MatrixLog_Fixed<4>); diff --git a/unsupported/benchmarks/MatrixFunctions/bench_matrix_power.cpp b/unsupported/benchmarks/MatrixFunctions/bench_matrix_power.cpp new file mode 100644 index 000000000..a1ebea074 --- /dev/null +++ b/unsupported/benchmarks/MatrixFunctions/bench_matrix_power.cpp @@ -0,0 +1,99 @@ +// Benchmarks for matrix power functions: sqrt, pow, cos, sin, cosh, sinh. + +#include +#include +#include + +using namespace Eigen; + +typedef double Scalar; +typedef Matrix Mat; + +static void BM_MatrixSqrt(benchmark::State& state) { + int n = state.range(0); + // SPD matrix has well-defined sqrt. + Mat tmp = Mat::Random(n, n); + Mat A = tmp * tmp.transpose() + Mat::Identity(n, n); + Mat result(n, n); + + for (auto _ : state) { + result = A.sqrt(); + benchmark::DoNotOptimize(result.data()); + benchmark::ClobberMemory(); + } +} + +static void BM_MatrixPow(benchmark::State& state) { + int n = state.range(0); + Mat tmp = Mat::Random(n, n); + Mat A = tmp * tmp.transpose() + Mat::Identity(n, n); + Mat result(n, n); + Scalar p = 2.5; + + for (auto _ : state) { + result = A.pow(p); + benchmark::DoNotOptimize(result.data()); + benchmark::ClobberMemory(); + } +} + +static void BM_MatrixCos(benchmark::State& state) { + int n = state.range(0); + Mat A = Mat::Random(n, n) / Scalar(n); + Mat result(n, n); + + for (auto _ : state) { + result = A.cos(); + benchmark::DoNotOptimize(result.data()); + benchmark::ClobberMemory(); + } +} + +static void BM_MatrixSin(benchmark::State& state) { + int n = state.range(0); + Mat A = Mat::Random(n, n) / Scalar(n); + Mat result(n, n); + + for (auto _ : state) { + result = A.sin(); + benchmark::DoNotOptimize(result.data()); + benchmark::ClobberMemory(); + } +} + +static void BM_MatrixCosh(benchmark::State& state) { + int n = state.range(0); + Mat A = Mat::Random(n, n) / Scalar(n); + Mat result(n, n); + + for (auto _ : state) { + result = A.cosh(); + benchmark::DoNotOptimize(result.data()); + benchmark::ClobberMemory(); + } +} + +static void BM_MatrixSinh(benchmark::State& state) { + int n = state.range(0); + Mat A = Mat::Random(n, n) / Scalar(n); + Mat result(n, n); + + for (auto _ : state) { + result = A.sinh(); + benchmark::DoNotOptimize(result.data()); + benchmark::ClobberMemory(); + } +} + +static void MatPowerSizes(::benchmark::Benchmark* b) { + for (int n : {4, 8, 16, 32, 64}) { + b->Arg(n); + } +} + +BENCHMARK(BM_MatrixSqrt)->Apply(MatPowerSizes); +BENCHMARK(BM_MatrixPow)->Apply(MatPowerSizes); +BENCHMARK(BM_MatrixCos)->Apply(MatPowerSizes); +BENCHMARK(BM_MatrixSin)->Apply(MatPowerSizes); +BENCHMARK(BM_MatrixCosh)->Apply(MatPowerSizes); +BENCHMARK(BM_MatrixSinh)->Apply(MatPowerSizes); diff --git a/unsupported/benchmarks/SpecialFunctions/CMakeLists.txt b/unsupported/benchmarks/SpecialFunctions/CMakeLists.txt new file mode 100644 index 000000000..a6734137a --- /dev/null +++ b/unsupported/benchmarks/SpecialFunctions/CMakeLists.txt @@ -0,0 +1 @@ +eigen_add_benchmark(bench_special_functions bench_special_functions.cpp) diff --git a/unsupported/benchmarks/SpecialFunctions/bench_special_functions.cpp b/unsupported/benchmarks/SpecialFunctions/bench_special_functions.cpp new file mode 100644 index 000000000..dc6505de5 --- /dev/null +++ b/unsupported/benchmarks/SpecialFunctions/bench_special_functions.cpp @@ -0,0 +1,127 @@ +// Benchmarks for special functions beyond what bench_cwise_math.cpp covers. +// Includes Bessel functions, two-argument functions (igamma, betainc), +// and additional functions (lgamma, digamma, zeta, polygamma). + +#include +#include +#include + +using namespace Eigen; + +// Macro for unary special functions on arrays. +#define BENCH_SPECIAL_UNARY(NAME, EXPR, LO, HI) \ + template \ + static void BM_##NAME(benchmark::State& state) { \ + const Index n = state.range(0); \ + using Arr = Array; \ + Arr a = (Arr::Random(n) + Scalar(1)) * Scalar((double(HI) - double(LO)) / 2.0) + Scalar(LO); \ + Arr b(n); \ + for (auto _ : state) { \ + b = EXPR; \ + benchmark::DoNotOptimize(b.data()); \ + } \ + state.counters["Elements/s"] = benchmark::Counter(n, benchmark::Counter::kIsIterationInvariantRate); \ + state.SetBytesProcessed(state.iterations() * n * sizeof(Scalar) * 2); \ + } + +// Macro for binary special functions on arrays. +#define BENCH_SPECIAL_BINARY(NAME, EXPR, LO_A, HI_A, LO_B, HI_B) \ + template \ + static void BM_##NAME(benchmark::State& state) { \ + const Index n = state.range(0); \ + using Arr = Array; \ + Arr a = (Arr::Random(n) + Scalar(1)) * Scalar((double(HI_A) - double(LO_A)) / 2.0) + Scalar(LO_A); \ + Arr b = (Arr::Random(n) + Scalar(1)) * Scalar((double(HI_B) - double(LO_B)) / 2.0) + Scalar(LO_B); \ + Arr c(n); \ + for (auto _ : state) { \ + c = EXPR; \ + benchmark::DoNotOptimize(c.data()); \ + } \ + state.counters["Elements/s"] = benchmark::Counter(n, benchmark::Counter::kIsIterationInvariantRate); \ + state.SetBytesProcessed(state.iterations() * n * sizeof(Scalar) * 3); \ + } + +// --- Unary special functions --- +BENCH_SPECIAL_UNARY(Lgamma, Eigen::lgamma(a), 0.1, 20) +BENCH_SPECIAL_UNARY(Digamma, Eigen::digamma(a), 0.1, 20) + +// --- Bessel functions (first kind) --- +BENCH_SPECIAL_UNARY(BesselI0, Eigen::bessel_i0(a), 0, 10) +BENCH_SPECIAL_UNARY(BesselI1, Eigen::bessel_i1(a), 0, 10) +BENCH_SPECIAL_UNARY(BesselI0e, Eigen::bessel_i0e(a), 0, 100) +BENCH_SPECIAL_UNARY(BesselI1e, Eigen::bessel_i1e(a), 0, 100) +BENCH_SPECIAL_UNARY(BesselJ0, Eigen::bessel_j0(a), 0, 20) +BENCH_SPECIAL_UNARY(BesselJ1, Eigen::bessel_j1(a), 0, 20) + +// --- Bessel functions (second kind) --- +BENCH_SPECIAL_UNARY(BesselY0, Eigen::bessel_y0(a), 0.1, 20) +BENCH_SPECIAL_UNARY(BesselY1, Eigen::bessel_y1(a), 0.1, 20) +BENCH_SPECIAL_UNARY(BesselK0, Eigen::bessel_k0(a), 0.1, 20) +BENCH_SPECIAL_UNARY(BesselK1, Eigen::bessel_k1(a), 0.1, 20) +BENCH_SPECIAL_UNARY(BesselK0e, Eigen::bessel_k0e(a), 0.1, 100) +BENCH_SPECIAL_UNARY(BesselK1e, Eigen::bessel_k1e(a), 0.1, 100) + +// --- Two-argument functions --- +BENCH_SPECIAL_BINARY(Igamma, Eigen::igamma(a, b), 0.1, 10, 0.1, 10) +BENCH_SPECIAL_BINARY(Igammac, Eigen::igammac(a, b), 0.1, 10, 0.1, 10) +BENCH_SPECIAL_BINARY(Zeta, Eigen::zeta(a, b), 1.1, 10, 0.1, 10) +BENCH_SPECIAL_BINARY(Polygamma, Eigen::polygamma(a, b), 1, 4, 0.1, 10) + +// --- Ternary: betainc --- +template +static void BM_Betainc(benchmark::State& state) { + const Index n = state.range(0); + using Arr = Array; + Arr a = (Arr::Random(n) + Scalar(1)) * Scalar(2.5) + Scalar(0.5); // [0.5, 5.5] + Arr b = (Arr::Random(n) + Scalar(1)) * Scalar(2.5) + Scalar(0.5); + Arr x = (Arr::Random(n) + Scalar(1)) * Scalar(0.5); // [0, 1] + Arr result(n); + for (auto _ : state) { + result = Eigen::betainc(a, b, x); + benchmark::DoNotOptimize(result.data()); + } + state.counters["Elements/s"] = benchmark::Counter(n, benchmark::Counter::kIsIterationInvariantRate); + state.SetBytesProcessed(state.iterations() * n * sizeof(Scalar) * 4); +} + +static void SpecialSizes(::benchmark::Benchmark* b) { + for (int n : {256, 4096, 65536, 1048576}) b->Arg(n); +} + +// --- Register float --- +BENCHMARK(BM_Lgamma)->Apply(SpecialSizes)->Name("Lgamma_float"); +BENCHMARK(BM_Digamma)->Apply(SpecialSizes)->Name("Digamma_float"); +BENCHMARK(BM_BesselI0)->Apply(SpecialSizes)->Name("BesselI0_float"); +BENCHMARK(BM_BesselI1)->Apply(SpecialSizes)->Name("BesselI1_float"); +BENCHMARK(BM_BesselI0e)->Apply(SpecialSizes)->Name("BesselI0e_float"); +BENCHMARK(BM_BesselI1e)->Apply(SpecialSizes)->Name("BesselI1e_float"); +BENCHMARK(BM_BesselJ0)->Apply(SpecialSizes)->Name("BesselJ0_float"); +BENCHMARK(BM_BesselJ1)->Apply(SpecialSizes)->Name("BesselJ1_float"); +BENCHMARK(BM_BesselY0)->Apply(SpecialSizes)->Name("BesselY0_float"); +BENCHMARK(BM_BesselY1)->Apply(SpecialSizes)->Name("BesselY1_float"); +BENCHMARK(BM_BesselK0)->Apply(SpecialSizes)->Name("BesselK0_float"); +BENCHMARK(BM_BesselK1)->Apply(SpecialSizes)->Name("BesselK1_float"); +BENCHMARK(BM_BesselK0e)->Apply(SpecialSizes)->Name("BesselK0e_float"); +BENCHMARK(BM_BesselK1e)->Apply(SpecialSizes)->Name("BesselK1e_float"); +BENCHMARK(BM_Igamma)->Apply(SpecialSizes)->Name("Igamma_float"); +BENCHMARK(BM_Igammac)->Apply(SpecialSizes)->Name("Igammac_float"); +BENCHMARK(BM_Betainc)->Apply(SpecialSizes)->Name("Betainc_float"); +BENCHMARK(BM_Zeta)->Apply(SpecialSizes)->Name("Zeta_float"); +BENCHMARK(BM_Polygamma)->Apply(SpecialSizes)->Name("Polygamma_float"); + +// --- Register double --- +BENCHMARK(BM_Lgamma)->Apply(SpecialSizes)->Name("Lgamma_double"); +BENCHMARK(BM_Digamma)->Apply(SpecialSizes)->Name("Digamma_double"); +BENCHMARK(BM_BesselI0)->Apply(SpecialSizes)->Name("BesselI0_double"); +BENCHMARK(BM_BesselI1)->Apply(SpecialSizes)->Name("BesselI1_double"); +BENCHMARK(BM_BesselJ0)->Apply(SpecialSizes)->Name("BesselJ0_double"); +BENCHMARK(BM_BesselJ1)->Apply(SpecialSizes)->Name("BesselJ1_double"); +BENCHMARK(BM_BesselY0)->Apply(SpecialSizes)->Name("BesselY0_double"); +BENCHMARK(BM_BesselY1)->Apply(SpecialSizes)->Name("BesselY1_double"); +BENCHMARK(BM_BesselK0)->Apply(SpecialSizes)->Name("BesselK0_double"); +BENCHMARK(BM_BesselK1)->Apply(SpecialSizes)->Name("BesselK1_double"); +BENCHMARK(BM_Igamma)->Apply(SpecialSizes)->Name("Igamma_double"); +BENCHMARK(BM_Igammac)->Apply(SpecialSizes)->Name("Igammac_double"); +BENCHMARK(BM_Betainc)->Apply(SpecialSizes)->Name("Betainc_double"); +BENCHMARK(BM_Zeta)->Apply(SpecialSizes)->Name("Zeta_double"); +BENCHMARK(BM_Polygamma)->Apply(SpecialSizes)->Name("Polygamma_double"); diff --git a/unsupported/benchmarks/Splines/CMakeLists.txt b/unsupported/benchmarks/Splines/CMakeLists.txt new file mode 100644 index 000000000..a54acc4b5 --- /dev/null +++ b/unsupported/benchmarks/Splines/CMakeLists.txt @@ -0,0 +1 @@ +eigen_add_benchmark(bench_splines bench_splines.cpp) diff --git a/unsupported/benchmarks/Splines/bench_splines.cpp b/unsupported/benchmarks/Splines/bench_splines.cpp new file mode 100644 index 000000000..e422a8f48 --- /dev/null +++ b/unsupported/benchmarks/Splines/bench_splines.cpp @@ -0,0 +1,98 @@ +// Benchmarks for Eigen Spline module. +// Tests fitting, evaluation, and derivative computation. + +#include +#include +#include + +using namespace Eigen; + +typedef double Scalar; + +// --- Spline fitting (interpolation) --- +template +static void BM_SplineFit(benchmark::State& state) { + const int n = state.range(0); + + typedef Spline SplineType; + typedef typename SplineType::PointType PointType; + + // Generate random points. + Matrix pts(Dim, n); + pts.setRandom(); + + for (auto _ : state) { + SplineType spline = SplineFitting::Interpolate(pts, Degree); + benchmark::DoNotOptimize(spline.knots().data()); + benchmark::ClobberMemory(); + } +} + +// --- Spline evaluation --- +template +static void BM_SplineEval(benchmark::State& state) { + const int n = state.range(0); // number of control points for fitting + const int neval = 1000; // number of evaluation points + + typedef Spline SplineType; + + Matrix pts(Dim, n); + pts.setRandom(); + SplineType spline = SplineFitting::Interpolate(pts, Degree); + + // Generate evaluation parameters in [0, 1]. + VectorXd u = VectorXd::LinSpaced(neval, 0, 1); + + for (auto _ : state) { + for (int i = 0; i < neval; ++i) { + auto pt = spline(u(i)); + benchmark::DoNotOptimize(pt.data()); + } + benchmark::ClobberMemory(); + } + state.counters["Evals/s"] = benchmark::Counter(neval, benchmark::Counter::kIsIterationInvariantRate); +} + +// --- Spline derivative evaluation --- +template +static void BM_SplineDerivatives(benchmark::State& state) { + const int n = state.range(0); + const int neval = 1000; + + typedef Spline SplineType; + + Matrix pts(Dim, n); + pts.setRandom(); + SplineType spline = SplineFitting::Interpolate(pts, Degree); + + VectorXd u = VectorXd::LinSpaced(neval, 0, 1); + + for (auto _ : state) { + for (int i = 0; i < neval; ++i) { + auto derivs = spline.derivatives(u(i), 1); + benchmark::DoNotOptimize(derivs.data()); + } + benchmark::ClobberMemory(); + } + state.counters["Evals/s"] = benchmark::Counter(neval, benchmark::Counter::kIsIterationInvariantRate); +} + +static void SplineSizes(::benchmark::Benchmark* b) { + for (int n : {10, 50, 200, 1000}) { + b->Arg(n); + } +} + +// 2D cubic splines +BENCHMARK(BM_SplineFit<2, 3>)->Apply(SplineSizes)->Name("SplineFit_2D_Cubic"); +BENCHMARK(BM_SplineEval<2, 3>)->Apply(SplineSizes)->Name("SplineEval_2D_Cubic"); +BENCHMARK(BM_SplineDerivatives<2, 3>)->Apply(SplineSizes)->Name("SplineDerivatives_2D_Cubic"); + +// 3D cubic splines +BENCHMARK(BM_SplineFit<3, 3>)->Apply(SplineSizes)->Name("SplineFit_3D_Cubic"); +BENCHMARK(BM_SplineEval<3, 3>)->Apply(SplineSizes)->Name("SplineEval_3D_Cubic"); +BENCHMARK(BM_SplineDerivatives<3, 3>)->Apply(SplineSizes)->Name("SplineDerivatives_3D_Cubic"); + +// 2D quintic splines +BENCHMARK(BM_SplineFit<2, 5>)->Apply(SplineSizes)->Name("SplineFit_2D_Quintic"); +BENCHMARK(BM_SplineEval<2, 5>)->Apply(SplineSizes)->Name("SplineEval_2D_Quintic"); diff --git a/unsupported/benchmarks/Tensor/CMakeLists.txt b/unsupported/benchmarks/Tensor/CMakeLists.txt new file mode 100644 index 000000000..4dab17c48 --- /dev/null +++ b/unsupported/benchmarks/Tensor/CMakeLists.txt @@ -0,0 +1,8 @@ +eigen_add_benchmark(bench_contraction bench_contraction.cpp) +eigen_add_benchmark(bench_convolution bench_convolution.cpp) +eigen_add_benchmark(bench_reduction bench_reduction.cpp) +eigen_add_benchmark(bench_broadcasting bench_broadcasting.cpp) +eigen_add_benchmark(bench_shuffling bench_shuffling.cpp) +eigen_add_benchmark(bench_tensor_fft bench_tensor_fft.cpp) +eigen_add_benchmark(bench_morphing bench_morphing.cpp) +eigen_add_benchmark(bench_coefficient_wise bench_coefficient_wise.cpp) diff --git a/unsupported/benchmarks/Tensor/bench_broadcasting.cpp b/unsupported/benchmarks/Tensor/bench_broadcasting.cpp new file mode 100644 index 000000000..01aed2048 --- /dev/null +++ b/unsupported/benchmarks/Tensor/bench_broadcasting.cpp @@ -0,0 +1,111 @@ +// Benchmarks for Eigen Tensor broadcasting. +// Tests broadcasting along various dimensions and ranks. + +#include +#include + +using namespace Eigen; + +typedef float Scalar; + +// --- Broadcast row vector {1,N} -> {M,N} --- +static void BM_BroadcastRow(benchmark::State& state) { + const int M = state.range(0); + const int N = state.range(1); + + Tensor row(1, N); + Tensor result(M, N); + row.setRandom(); + + Eigen::array bcast = {M, 1}; + + for (auto _ : state) { + result = row.broadcast(bcast); + benchmark::DoNotOptimize(result.data()); + benchmark::ClobberMemory(); + } + state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar)); +} + +// --- Broadcast col vector {M,1} -> {M,N} --- +static void BM_BroadcastCol(benchmark::State& state) { + const int M = state.range(0); + const int N = state.range(1); + + Tensor col(M, 1); + Tensor result(M, N); + col.setRandom(); + + Eigen::array bcast = {1, N}; + + for (auto _ : state) { + result = col.broadcast(bcast); + benchmark::DoNotOptimize(result.data()); + benchmark::ClobberMemory(); + } + state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar)); +} + +// --- Broadcast + element-wise add (bias addition pattern) --- +static void BM_BroadcastAdd(benchmark::State& state) { + const int M = state.range(0); + const int N = state.range(1); + + Tensor mat(M, N); + Tensor bias(1, N); + Tensor result(M, N); + mat.setRandom(); + bias.setRandom(); + + Eigen::array bcast = {M, 1}; + + for (auto _ : state) { + result = mat + bias.broadcast(bcast); + benchmark::DoNotOptimize(result.data()); + benchmark::ClobberMemory(); + } + state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar) * 2); +} + +// --- Rank-4 broadcast (batch x channels x 1 x 1) -> (batch x channels x H x W) --- +static void BM_BroadcastRank4(benchmark::State& state) { + const int batch = state.range(0); + const int C = state.range(1); + const int H = state.range(2); + + Tensor bias(batch, C, 1, 1); + Tensor result(batch, C, H, H); + bias.setRandom(); + + Eigen::array bcast = {1, 1, H, H}; + + for (auto _ : state) { + result = bias.broadcast(bcast); + benchmark::DoNotOptimize(result.data()); + benchmark::ClobberMemory(); + } + state.SetBytesProcessed(state.iterations() * batch * C * H * H * sizeof(Scalar)); +} + +static void BroadcastSizes(::benchmark::Benchmark* b) { + for (int m : {64, 256, 1024}) { + for (int n : {64, 256, 1024}) { + b->Args({m, n}); + } + } +} + +static void Rank4Sizes(::benchmark::Benchmark* b) { + for (int batch : {1, 8}) { + for (int c : {64, 256}) { + for (int h : {16, 32}) { + b->Args({batch, c, h}); + } + } + } +} + +BENCHMARK(BM_BroadcastRow)->Apply(BroadcastSizes); +BENCHMARK(BM_BroadcastCol)->Apply(BroadcastSizes); +BENCHMARK(BM_BroadcastAdd)->Apply(BroadcastSizes); +BENCHMARK(BM_BroadcastRank4)->Apply(Rank4Sizes); diff --git a/unsupported/benchmarks/Tensor/bench_coefficient_wise.cpp b/unsupported/benchmarks/Tensor/bench_coefficient_wise.cpp new file mode 100644 index 000000000..aed482811 --- /dev/null +++ b/unsupported/benchmarks/Tensor/bench_coefficient_wise.cpp @@ -0,0 +1,131 @@ +// Benchmarks for Eigen Tensor coefficient-wise operations. +// Covers activation functions, normalization, and element-wise arithmetic. + +#include +#include + +using namespace Eigen; + +typedef float Scalar; + +// Macro to define a benchmark for a unary tensor operation. +#define BENCH_TENSOR_UNARY(NAME, EXPR) \ + static void BM_##NAME(benchmark::State& state) { \ + const int M = state.range(0); \ + const int N = state.range(1); \ + Tensor a(M, N); \ + a.setRandom(); \ + Tensor b(M, N); \ + for (auto _ : state) { \ + b = EXPR; \ + benchmark::DoNotOptimize(b.data()); \ + benchmark::ClobberMemory(); \ + } \ + state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar) * 2); \ + } + +BENCH_TENSOR_UNARY(Exp, a.exp()) +BENCH_TENSOR_UNARY(Log, a.abs().log()) +BENCH_TENSOR_UNARY(Tanh, a.tanh()) +BENCH_TENSOR_UNARY(Sigmoid, a.sigmoid()) +BENCH_TENSOR_UNARY(ReLU, a.cwiseMax(Scalar(0))) +BENCH_TENSOR_UNARY(Sqrt, a.abs().sqrt()) + +// --- Element-wise binary operations --- +static void BM_Add(benchmark::State& state) { + const int M = state.range(0); + const int N = state.range(1); + + Tensor a(M, N); + Tensor b(M, N); + Tensor c(M, N); + a.setRandom(); + b.setRandom(); + + for (auto _ : state) { + c = a + b; + benchmark::DoNotOptimize(c.data()); + benchmark::ClobberMemory(); + } + state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar) * 3); +} + +static void BM_Mul(benchmark::State& state) { + const int M = state.range(0); + const int N = state.range(1); + + Tensor a(M, N); + Tensor b(M, N); + Tensor c(M, N); + a.setRandom(); + b.setRandom(); + + for (auto _ : state) { + c = a * b; + benchmark::DoNotOptimize(c.data()); + benchmark::ClobberMemory(); + } + state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar) * 3); +} + +// --- Fused multiply-add --- +static void BM_FMA(benchmark::State& state) { + const int M = state.range(0); + const int N = state.range(1); + + Tensor a(M, N); + Tensor b(M, N); + Tensor c(M, N); + Tensor d(M, N); + a.setRandom(); + b.setRandom(); + c.setRandom(); + + for (auto _ : state) { + d = a * b + c; + benchmark::DoNotOptimize(d.data()); + benchmark::ClobberMemory(); + } + state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar) * 4); +} + +// --- Rank-4 coefficient-wise (CNN feature maps) --- +static void BM_ReLU_Rank4(benchmark::State& state) { + const int batch = state.range(0); + const int C = state.range(1); + const int H = state.range(2); + + Tensor a(batch, C, H, H); + Tensor b(batch, C, H, H); + a.setRandom(); + + for (auto _ : state) { + b = a.cwiseMax(Scalar(0)); + benchmark::DoNotOptimize(b.data()); + benchmark::ClobberMemory(); + } + state.SetBytesProcessed(state.iterations() * batch * C * H * H * sizeof(Scalar) * 2); +} + +static void CwiseSizes(::benchmark::Benchmark* b) { + for (int size : {256, 1024}) { + b->Args({size, size}); + } +} + +static void Rank4Sizes(::benchmark::Benchmark* b) { + b->Args({32, 64, 16}); + b->Args({8, 128, 32}); + b->Args({1, 256, 64}); +} + +BENCHMARK(BM_Exp)->Apply(CwiseSizes); +BENCHMARK(BM_Log)->Apply(CwiseSizes); +BENCHMARK(BM_Tanh)->Apply(CwiseSizes); +BENCHMARK(BM_Sigmoid)->Apply(CwiseSizes); +BENCHMARK(BM_ReLU)->Apply(CwiseSizes); +BENCHMARK(BM_Sqrt)->Apply(CwiseSizes); +BENCHMARK(BM_Add)->Apply(CwiseSizes); +BENCHMARK(BM_Mul)->Apply(CwiseSizes); +BENCHMARK(BM_FMA)->Apply(CwiseSizes); +BENCHMARK(BM_ReLU_Rank4)->Apply(Rank4Sizes); diff --git a/unsupported/benchmarks/Tensor/bench_contraction.cpp b/unsupported/benchmarks/Tensor/bench_contraction.cpp new file mode 100644 index 000000000..0b71ebfe3 --- /dev/null +++ b/unsupported/benchmarks/Tensor/bench_contraction.cpp @@ -0,0 +1,148 @@ +// Benchmarks for Eigen Tensor contraction (generalized GEMM). +// Tests single-threaded (DefaultDevice) and multi-threaded (ThreadPoolDevice) variants. + +#define EIGEN_USE_THREADS + +#include +#include +#include + +using namespace Eigen; + +#ifndef SCALAR +#define SCALAR float +#endif + +typedef SCALAR Scalar; + +// --- DefaultDevice contraction (rank-2, equivalent to matrix multiply) --- +static void BM_Contraction(benchmark::State& state) { + const int M = state.range(0); + const int N = state.range(1); + const int K = state.range(2); + + Tensor A(M, K); + Tensor B(K, N); + Tensor C(M, N); + A.setRandom(); + B.setRandom(); + + using ContractDims = Tensor::DimensionPair; + Eigen::array contract_dims = {ContractDims(1, 0)}; + + for (auto _ : state) { + C = A.contract(B, contract_dims); + benchmark::DoNotOptimize(C.data()); + benchmark::ClobberMemory(); + } + state.counters["GFLOPS"] = + benchmark::Counter(2.0 * M * N * K, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000); +} + +// --- ThreadPoolDevice contraction --- +static void BM_Contraction_ThreadPool(benchmark::State& state) { + const int M = state.range(0); + const int N = state.range(1); + const int K = state.range(2); + const int threads = state.range(3); + + Tensor A(M, K); + Tensor B(K, N); + Tensor C(M, N); + A.setRandom(); + B.setRandom(); + + ThreadPool tp(threads); + ThreadPoolDevice dev(&tp, threads); + + using ContractDims = Tensor::DimensionPair; + Eigen::array contract_dims = {ContractDims(1, 0)}; + + for (auto _ : state) { + C.device(dev) = A.contract(B, contract_dims); + benchmark::DoNotOptimize(C.data()); + benchmark::ClobberMemory(); + } + state.counters["GFLOPS"] = + benchmark::Counter(2.0 * M * N * K, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000); + state.counters["threads"] = threads; +} + +// --- Rank-3 batch contraction --- +static void BM_BatchContraction(benchmark::State& state) { + const int batch = state.range(0); + const int M = state.range(1); + const int N = state.range(2); + const int K = state.range(3); + + Tensor A(batch, M, K); + Tensor B(batch, K, N); + Tensor C(batch, M, N); + A.setRandom(); + B.setRandom(); + + using ContractDims = Tensor::DimensionPair; + Eigen::array contract_dims = {ContractDims(2, 1)}; + + for (auto _ : state) { + C = A.contract(B, contract_dims); + benchmark::DoNotOptimize(C.data()); + benchmark::ClobberMemory(); + } + state.counters["GFLOPS"] = benchmark::Counter(2.0 * batch * M * N * K, benchmark::Counter::kIsIterationInvariantRate, + benchmark::Counter::kIs1000); +} + +// --- RowMajor contraction --- +static void BM_Contraction_RowMajor(benchmark::State& state) { + const int M = state.range(0); + const int N = state.range(1); + const int K = state.range(2); + + Tensor A(M, K); + Tensor B(K, N); + Tensor C(M, N); + A.setRandom(); + B.setRandom(); + + using ContractDims = Tensor::DimensionPair; + Eigen::array contract_dims = {ContractDims(1, 0)}; + + for (auto _ : state) { + C = A.contract(B, contract_dims); + benchmark::DoNotOptimize(C.data()); + benchmark::ClobberMemory(); + } + state.counters["GFLOPS"] = + benchmark::Counter(2.0 * M * N * K, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000); +} + +static void ContractionSizes(::benchmark::Benchmark* b) { + for (int size : {32, 64, 128, 256, 512, 1024}) { + b->Args({size, size, size}); + } + // Non-square + b->Args({256, 256, 1024}); + b->Args({1024, 64, 64}); +} + +static void ThreadPoolSizes(::benchmark::Benchmark* b) { + for (int size : {64, 256, 512, 1024}) { + for (int threads : {2, 4, 8}) { + b->Args({size, size, size, threads}); + } + } +} + +static void BatchSizes(::benchmark::Benchmark* b) { + for (int batch : {1, 8, 32}) { + for (int size : {64, 256}) { + b->Args({batch, size, size, size}); + } + } +} + +BENCHMARK(BM_Contraction)->Apply(ContractionSizes); +BENCHMARK(BM_Contraction_RowMajor)->Apply(ContractionSizes); +BENCHMARK(BM_Contraction_ThreadPool)->Apply(ThreadPoolSizes); +BENCHMARK(BM_BatchContraction)->Apply(BatchSizes); diff --git a/unsupported/benchmarks/Tensor/bench_convolution.cpp b/unsupported/benchmarks/Tensor/bench_convolution.cpp new file mode 100644 index 000000000..46e44eeb8 --- /dev/null +++ b/unsupported/benchmarks/Tensor/bench_convolution.cpp @@ -0,0 +1,151 @@ +// Benchmarks for Eigen Tensor convolution (1D and 2D). + +#define EIGEN_USE_THREADS + +#include +#include +#include + +using namespace Eigen; + +typedef float Scalar; + +// --- 1D convolution --- +static void BM_Convolve1D(benchmark::State& state) { + const int input_size = state.range(0); + const int kernel_size = state.range(1); + + Tensor input(input_size); + Tensor kernel(kernel_size); + input.setRandom(); + kernel.setRandom(); + + Eigen::array dims = {0}; + + for (auto _ : state) { + Tensor result = input.convolve(kernel, dims); + benchmark::DoNotOptimize(result.data()); + benchmark::ClobberMemory(); + } + double flops = 2.0 * (input_size - kernel_size + 1) * kernel_size; + state.counters["GFLOPS"] = + benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000); +} + +// --- 2D convolution --- +static void BM_Convolve2D(benchmark::State& state) { + const int H = state.range(0); + const int W = state.range(1); + const int kH = state.range(2); + const int kW = state.range(3); + + Tensor input(H, W); + Tensor kernel(kH, kW); + input.setRandom(); + kernel.setRandom(); + + Eigen::array dims = {0, 1}; + + for (auto _ : state) { + Tensor result = input.convolve(kernel, dims); + benchmark::DoNotOptimize(result.data()); + benchmark::ClobberMemory(); + } + double flops = 2.0 * (H - kH + 1) * (W - kW + 1) * kH * kW; + state.counters["GFLOPS"] = + benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000); +} + +// --- 2D convolution with channels (rank-3: C x H x W, convolve on H,W) --- +static void BM_Convolve2D_Channels(benchmark::State& state) { + const int C = state.range(0); + const int H = state.range(1); + const int kH = state.range(2); + + Tensor input(C, H, H); + Tensor kernel(kH, kH); + input.setRandom(); + kernel.setRandom(); + + Eigen::array dims = {1, 2}; + + for (auto _ : state) { + Tensor result = input.convolve(kernel, dims); + benchmark::DoNotOptimize(result.data()); + benchmark::ClobberMemory(); + } + int outH = H - kH + 1; + double flops = 2.0 * C * outH * outH * kH * kH; + state.counters["GFLOPS"] = + benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000); +} + +// --- 2D convolution with ThreadPool --- +static void BM_Convolve2D_ThreadPool(benchmark::State& state) { + const int H = state.range(0); + const int kH = state.range(1); + const int threads = state.range(2); + + Tensor input(H, H); + Tensor kernel(kH, kH); + Tensor result(H - kH + 1, H - kH + 1); + input.setRandom(); + kernel.setRandom(); + + ThreadPool tp(threads); + ThreadPoolDevice dev(&tp, threads); + + Eigen::array dims = {0, 1}; + + for (auto _ : state) { + result.device(dev) = input.convolve(kernel, dims); + benchmark::DoNotOptimize(result.data()); + benchmark::ClobberMemory(); + } + int outH = H - kH + 1; + double flops = 2.0 * outH * outH * kH * kH; + state.counters["GFLOPS"] = + benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000); + state.counters["threads"] = threads; +} + +static void Conv1DSizes(::benchmark::Benchmark* b) { + for (int input : {128, 512, 2048}) { + for (int kernel : {3, 5, 11}) { + b->Args({input, kernel}); + } + } +} + +static void Conv2DSizes(::benchmark::Benchmark* b) { + for (int hw : {32, 64, 128, 224}) { + for (int k : {3, 5, 7}) { + b->Args({hw, hw, k, k}); + } + } +} + +static void Conv2DChannelSizes(::benchmark::Benchmark* b) { + for (int c : {3, 64, 128}) { + for (int hw : {16, 32, 56}) { + for (int k : {3, 5}) { + b->Args({c, hw, k}); + } + } + } +} + +static void Conv2DThreadPoolSizes(::benchmark::Benchmark* b) { + for (int hw : {64, 128, 224}) { + for (int k : {3, 5}) { + for (int threads : {2, 4, 8}) { + b->Args({hw, k, threads}); + } + } + } +} + +BENCHMARK(BM_Convolve1D)->Apply(Conv1DSizes); +BENCHMARK(BM_Convolve2D)->Apply(Conv2DSizes); +BENCHMARK(BM_Convolve2D_Channels)->Apply(Conv2DChannelSizes); +BENCHMARK(BM_Convolve2D_ThreadPool)->Apply(Conv2DThreadPoolSizes); diff --git a/unsupported/benchmarks/Tensor/bench_morphing.cpp b/unsupported/benchmarks/Tensor/bench_morphing.cpp new file mode 100644 index 000000000..8d226e757 --- /dev/null +++ b/unsupported/benchmarks/Tensor/bench_morphing.cpp @@ -0,0 +1,142 @@ +// Benchmarks for Eigen Tensor morphing operations: reshape, slice, chip, pad, stride. + +#include +#include + +using namespace Eigen; + +typedef float Scalar; + +// --- Reshape (zero-cost if no evaluation needed; force eval via assignment) --- +static void BM_Reshape(benchmark::State& state) { + const int M = state.range(0); + const int N = state.range(1); + + Tensor A(M, N); + A.setRandom(); + + Eigen::array new_shape = {M * N}; + + for (auto _ : state) { + Tensor B = A.reshape(new_shape); + benchmark::DoNotOptimize(B.data()); + benchmark::ClobberMemory(); + } + state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar)); +} + +// --- Slice --- +static void BM_Slice(benchmark::State& state) { + const int M = state.range(0); + const int N = state.range(1); + + Tensor A(M, N); + A.setRandom(); + + int sliceM = M / 2; + int sliceN = N / 2; + Eigen::array offsets = {0, 0}; + Eigen::array extents = {sliceM, sliceN}; + + for (auto _ : state) { + Tensor B = A.slice(offsets, extents); + benchmark::DoNotOptimize(B.data()); + benchmark::ClobberMemory(); + } + state.SetBytesProcessed(state.iterations() * sliceM * sliceN * sizeof(Scalar)); +} + +// --- Chip (extract a sub-tensor along one dimension) --- +static void BM_Chip(benchmark::State& state) { + const int D0 = state.range(0); + const int D1 = state.range(1); + const int D2 = state.range(2); + + Tensor A(D0, D1, D2); + A.setRandom(); + + for (auto _ : state) { + Tensor B = A.chip(0, 0); + benchmark::DoNotOptimize(B.data()); + benchmark::ClobberMemory(); + } + state.SetBytesProcessed(state.iterations() * D1 * D2 * sizeof(Scalar)); +} + +// --- Pad --- +static void BM_Pad(benchmark::State& state) { + const int M = state.range(0); + const int N = state.range(1); + const int padSize = state.range(2); + + Tensor A(M, N); + A.setRandom(); + + Eigen::array, 2> paddings; + paddings[0] = {padSize, padSize}; + paddings[1] = {padSize, padSize}; + + for (auto _ : state) { + Tensor B = A.pad(paddings); + benchmark::DoNotOptimize(B.data()); + benchmark::ClobberMemory(); + } + int outM = M + 2 * padSize; + int outN = N + 2 * padSize; + state.SetBytesProcessed(state.iterations() * outM * outN * sizeof(Scalar)); +} + +// --- Stride --- +static void BM_Stride(benchmark::State& state) { + const int M = state.range(0); + const int N = state.range(1); + const int stride = state.range(2); + + Tensor A(M, N); + A.setRandom(); + + Eigen::array strides_arr = {stride, stride}; + + for (auto _ : state) { + Tensor B = A.stride(strides_arr); + benchmark::DoNotOptimize(B.data()); + benchmark::ClobberMemory(); + } + int outM = (M + stride - 1) / stride; + int outN = (N + stride - 1) / stride; + state.SetBytesProcessed(state.iterations() * outM * outN * sizeof(Scalar)); +} + +static void MorphSizes(::benchmark::Benchmark* b) { + for (int size : {256, 1024}) { + b->Args({size, size}); + } +} + +static void ChipSizes(::benchmark::Benchmark* b) { + b->Args({32, 256, 256}); + b->Args({64, 128, 128}); + b->Args({8, 512, 512}); +} + +static void PadSizes(::benchmark::Benchmark* b) { + for (int size : {256, 1024}) { + for (int pad : {1, 4, 16}) { + b->Args({size, size, pad}); + } + } +} + +static void StrideSizes(::benchmark::Benchmark* b) { + for (int size : {256, 1024}) { + for (int stride : {2, 4}) { + b->Args({size, size, stride}); + } + } +} + +BENCHMARK(BM_Reshape)->Apply(MorphSizes); +BENCHMARK(BM_Slice)->Apply(MorphSizes); +BENCHMARK(BM_Chip)->Apply(ChipSizes); +BENCHMARK(BM_Pad)->Apply(PadSizes); +BENCHMARK(BM_Stride)->Apply(StrideSizes); diff --git a/unsupported/benchmarks/Tensor/bench_reduction.cpp b/unsupported/benchmarks/Tensor/bench_reduction.cpp new file mode 100644 index 000000000..795c95c72 --- /dev/null +++ b/unsupported/benchmarks/Tensor/bench_reduction.cpp @@ -0,0 +1,158 @@ +// Benchmarks for Eigen Tensor reductions (sum, maximum, mean). +// Tests full and partial reductions, inner vs outer dimension, DefaultDevice and ThreadPoolDevice. + +#define EIGEN_USE_THREADS + +#include +#include +#include + +using namespace Eigen; + +#ifndef SCALAR +#define SCALAR float +#endif + +typedef SCALAR Scalar; + +// --- Full reduction (rank-2) --- +template +static void BM_FullReduction(benchmark::State& state) { + const int M = state.range(0); + const int N = state.range(1); + + Tensor A(M, N); + A.setRandom(); + + for (auto _ : state) { + Tensor result = A.reduce(Eigen::array{0, 1}, ReduceOp()); + benchmark::DoNotOptimize(result.data()); + benchmark::ClobberMemory(); + } + state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar)); +} + +// --- Partial reduction along dim 0 (inner dim, ColMajor) --- +static void BM_ReduceInner(benchmark::State& state) { + const int M = state.range(0); + const int N = state.range(1); + + Tensor A(M, N); + A.setRandom(); + + Eigen::array reduce_dims = {0}; + + for (auto _ : state) { + Tensor result = A.sum(reduce_dims); + benchmark::DoNotOptimize(result.data()); + benchmark::ClobberMemory(); + } + state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar)); +} + +// --- Partial reduction along dim 1 (outer dim, ColMajor) --- +static void BM_ReduceOuter(benchmark::State& state) { + const int M = state.range(0); + const int N = state.range(1); + + Tensor A(M, N); + A.setRandom(); + + Eigen::array reduce_dims = {1}; + + for (auto _ : state) { + Tensor result = A.sum(reduce_dims); + benchmark::DoNotOptimize(result.data()); + benchmark::ClobberMemory(); + } + state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar)); +} + +// --- Rank-4 partial reduction (batch x channels x H x W), reduce along spatial dims --- +static void BM_ReduceSpatial(benchmark::State& state) { + const int batch = state.range(0); + const int C = state.range(1); + const int H = state.range(2); + + Tensor A(batch, C, H, H); + A.setRandom(); + + Eigen::array reduce_dims = {2, 3}; + + for (auto _ : state) { + Tensor result = A.sum(reduce_dims); + benchmark::DoNotOptimize(result.data()); + benchmark::ClobberMemory(); + } + state.SetBytesProcessed(state.iterations() * batch * C * H * H * sizeof(Scalar)); +} + +// --- Full reduction with ThreadPoolDevice --- +static void BM_FullReduction_ThreadPool(benchmark::State& state) { + const int M = state.range(0); + const int N = state.range(1); + const int threads = state.range(2); + + Tensor A(M, N); + Tensor result; + A.setRandom(); + + ThreadPool tp(threads); + ThreadPoolDevice dev(&tp, threads); + + for (auto _ : state) { + result.device(dev) = A.sum(); + benchmark::DoNotOptimize(result.data()); + benchmark::ClobberMemory(); + } + state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar)); + state.counters["threads"] = threads; +} + +// --- Maximum reduction (rank-2) --- +static void BM_MaxReduction(benchmark::State& state) { + const int M = state.range(0); + const int N = state.range(1); + + Tensor A(M, N); + A.setRandom(); + + for (auto _ : state) { + Tensor result = A.maximum(); + benchmark::DoNotOptimize(result.data()); + benchmark::ClobberMemory(); + } + state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar)); +} + +static void ReductionSizes(::benchmark::Benchmark* b) { + for (int size : {64, 256, 1024}) { + b->Args({size, size}); + } +} + +static void ThreadPoolReductionSizes(::benchmark::Benchmark* b) { + for (int size : {256, 1024}) { + for (int threads : {2, 4, 8}) { + b->Args({size, size, threads}); + } + } +} + +static void SpatialSizes(::benchmark::Benchmark* b) { + for (int batch : {1, 8, 32}) { + for (int c : {64, 128}) { + for (int h : {16, 32}) { + b->Args({batch, c, h}); + } + } + } +} + +BENCHMARK(BM_FullReduction>)->Apply(ReductionSizes)->Name("SumReduction"); +BENCHMARK(BM_FullReduction>)->Apply(ReductionSizes)->Name("MaxReduction_Full"); +BENCHMARK(BM_MaxReduction)->Apply(ReductionSizes); +BENCHMARK(BM_ReduceInner)->Apply(ReductionSizes); +BENCHMARK(BM_ReduceOuter)->Apply(ReductionSizes); +BENCHMARK(BM_ReduceSpatial)->Apply(SpatialSizes); +BENCHMARK(BM_FullReduction_ThreadPool)->Apply(ThreadPoolReductionSizes); diff --git a/unsupported/benchmarks/Tensor/bench_shuffling.cpp b/unsupported/benchmarks/Tensor/bench_shuffling.cpp new file mode 100644 index 000000000..6296accc8 --- /dev/null +++ b/unsupported/benchmarks/Tensor/bench_shuffling.cpp @@ -0,0 +1,115 @@ +// Benchmarks for Eigen Tensor shuffling (transpose / permutation). + +#include +#include + +using namespace Eigen; + +typedef float Scalar; + +// --- Rank-2 transpose --- +static void BM_Shuffle2D(benchmark::State& state) { + const int M = state.range(0); + const int N = state.range(1); + + Tensor A(M, N); + Tensor B(N, M); + A.setRandom(); + + Eigen::array perm = {1, 0}; + + for (auto _ : state) { + B = A.shuffle(perm); + benchmark::DoNotOptimize(B.data()); + benchmark::ClobberMemory(); + } + state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar) * 2); +} + +// --- Identity shuffle (no permutation, measures overhead) --- +static void BM_ShuffleIdentity(benchmark::State& state) { + const int M = state.range(0); + const int N = state.range(1); + + Tensor A(M, N); + Tensor B(M, N); + A.setRandom(); + + Eigen::array perm = {0, 1}; + + for (auto _ : state) { + B = A.shuffle(perm); + benchmark::DoNotOptimize(B.data()); + benchmark::ClobberMemory(); + } + state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar) * 2); +} + +// --- Rank-3 permutation --- +static void BM_Shuffle3D(benchmark::State& state) { + const int D0 = state.range(0); + const int D1 = state.range(1); + const int D2 = state.range(2); + + Tensor A(D0, D1, D2); + A.setRandom(); + + // Permutation (2, 0, 1) + Eigen::array perm = {2, 0, 1}; + + for (auto _ : state) { + Tensor B = A.shuffle(perm); + benchmark::DoNotOptimize(B.data()); + benchmark::ClobberMemory(); + } + state.SetBytesProcessed(state.iterations() * D0 * D1 * D2 * sizeof(Scalar) * 2); +} + +// --- Rank-4 permutation (NCHW -> NHWC layout conversion) --- +static void BM_Shuffle4D_NCHW_to_NHWC(benchmark::State& state) { + const int N = state.range(0); + const int C = state.range(1); + const int H = state.range(2); + + Tensor A(N, C, H, H); + A.setRandom(); + + // NCHW -> NHWC: permute (0, 2, 3, 1) + Eigen::array perm = {0, 2, 3, 1}; + + for (auto _ : state) { + Tensor B = A.shuffle(perm); + benchmark::DoNotOptimize(B.data()); + benchmark::ClobberMemory(); + } + state.SetBytesProcessed(state.iterations() * N * C * H * H * sizeof(Scalar) * 2); +} + +static void Shuffle2DSizes(::benchmark::Benchmark* b) { + for (int size : {256, 1024}) { + b->Args({size, size}); + } + b->Args({64, 4096}); + b->Args({4096, 64}); +} + +static void Shuffle3DSizes(::benchmark::Benchmark* b) { + b->Args({64, 64, 64}); + b->Args({128, 128, 64}); + b->Args({32, 256, 256}); +} + +static void Shuffle4DSizes(::benchmark::Benchmark* b) { + for (int batch : {1, 8}) { + for (int c : {3, 64}) { + for (int h : {32, 64}) { + b->Args({batch, c, h}); + } + } + } +} + +BENCHMARK(BM_Shuffle2D)->Apply(Shuffle2DSizes); +BENCHMARK(BM_ShuffleIdentity)->Apply(Shuffle2DSizes); +BENCHMARK(BM_Shuffle3D)->Apply(Shuffle3DSizes); +BENCHMARK(BM_Shuffle4D_NCHW_to_NHWC)->Apply(Shuffle4DSizes); diff --git a/unsupported/benchmarks/Tensor/bench_tensor_fft.cpp b/unsupported/benchmarks/Tensor/bench_tensor_fft.cpp new file mode 100644 index 000000000..26ff64b1e --- /dev/null +++ b/unsupported/benchmarks/Tensor/bench_tensor_fft.cpp @@ -0,0 +1,80 @@ +// Benchmarks for Eigen Tensor FFT. + +#include +#include + +using namespace Eigen; + +#ifndef SCALAR +#define SCALAR float +#endif + +typedef SCALAR Scalar; + +// --- 1D FFT --- +static void BM_TensorFFT_1D(benchmark::State& state) { + const int N = state.range(0); + + Tensor input(N); + input.setRandom(); + + Eigen::array fft_dims = {0}; + + for (auto _ : state) { + Tensor, 1> result = input.template fft(fft_dims); + benchmark::DoNotOptimize(result.data()); + benchmark::ClobberMemory(); + } + double mflops = 5.0 * N * std::log2(static_cast(N)) / 2.0; // real->complex + state.counters["MFLOPS"] = + benchmark::Counter(mflops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000); +} + +// --- 2D FFT --- +static void BM_TensorFFT_2D(benchmark::State& state) { + const int N = state.range(0); + + Tensor input(N, N); + input.setRandom(); + + Eigen::array fft_dims = {0, 1}; + + for (auto _ : state) { + Tensor, 2> result = input.template fft(fft_dims); + benchmark::DoNotOptimize(result.data()); + benchmark::ClobberMemory(); + } + double total = N * N; + double mflops = 5.0 * total * std::log2(static_cast(N)); + state.counters["MFLOPS"] = + benchmark::Counter(mflops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000); +} + +// --- 1D inverse FFT --- +static void BM_TensorIFFT_1D(benchmark::State& state) { + const int N = state.range(0); + + Tensor, 1> input(N); + input.setRandom(); + + Eigen::array fft_dims = {0}; + + for (auto _ : state) { + Tensor, 1> result = input.template fft(fft_dims); + benchmark::DoNotOptimize(result.data()); + benchmark::ClobberMemory(); + } + double mflops = 5.0 * N * std::log2(static_cast(N)); + state.counters["MFLOPS"] = + benchmark::Counter(mflops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000); +} + +static void FFTSizes(::benchmark::Benchmark* b) { + for (int n : {64, 256, 1024, 4096}) { + b->Arg(n); + } +} + +BENCHMARK(BM_TensorFFT_1D)->Apply(FFTSizes); +BENCHMARK(BM_TensorFFT_2D)->Apply(FFTSizes); +BENCHMARK(BM_TensorIFFT_1D)->Apply(FFTSizes);