mirror of
https://gitlab.com/libeigen/eigen.git
synced 2026-04-10 11:34:33 +08:00
Add benchmarks for unsupported modules and extend supported benchmarks
libeigen/eigen!2179 Closes #3036 Co-authored-by: Rasmus Munk Larsen <rmlarsen@gmail.com>
This commit is contained in:
@@ -1 +1,2 @@
|
|||||||
eigen_add_benchmark(bench_cholesky bench_cholesky.cpp)
|
eigen_add_benchmark(bench_cholesky bench_cholesky.cpp)
|
||||||
|
eigen_add_benchmark(bench_cholesky_double bench_cholesky.cpp DEFINITIONS SCALAR=double)
|
||||||
|
|||||||
@@ -4,7 +4,11 @@
|
|||||||
|
|
||||||
using namespace Eigen;
|
using namespace Eigen;
|
||||||
|
|
||||||
typedef float Scalar;
|
#ifndef SCALAR
|
||||||
|
#define SCALAR float
|
||||||
|
#endif
|
||||||
|
|
||||||
|
typedef SCALAR Scalar;
|
||||||
|
|
||||||
static void BM_LDLT(benchmark::State& state) {
|
static void BM_LDLT(benchmark::State& state) {
|
||||||
int n = state.range(0);
|
int n = state.range(0);
|
||||||
|
|||||||
@@ -15,3 +15,5 @@ eigen_add_benchmark(bench_diagonal bench_diagonal.cpp)
|
|||||||
eigen_add_benchmark(bench_triangular_product bench_triangular_product.cpp)
|
eigen_add_benchmark(bench_triangular_product bench_triangular_product.cpp)
|
||||||
eigen_add_benchmark(bench_selfadjoint_product bench_selfadjoint_product.cpp)
|
eigen_add_benchmark(bench_selfadjoint_product bench_selfadjoint_product.cpp)
|
||||||
eigen_add_benchmark(bench_construction bench_construction.cpp)
|
eigen_add_benchmark(bench_construction bench_construction.cpp)
|
||||||
|
eigen_add_benchmark(bench_fixed_size bench_fixed_size.cpp)
|
||||||
|
eigen_add_benchmark(bench_fixed_size_double bench_fixed_size.cpp DEFINITIONS SCALAR=double)
|
||||||
|
|||||||
123
benchmarks/Core/bench_fixed_size.cpp
Normal file
123
benchmarks/Core/bench_fixed_size.cpp
Normal file
@@ -0,0 +1,123 @@
|
|||||||
|
// Benchmarks for fixed-size matrix operations (2x2, 3x3, 4x4).
|
||||||
|
// Critical for PCL, ROS, Sophus, Drake which use small matrices extensively.
|
||||||
|
|
||||||
|
#include <benchmark/benchmark.h>
|
||||||
|
#include <Eigen/Core>
|
||||||
|
#include <Eigen/LU>
|
||||||
|
|
||||||
|
using namespace Eigen;
|
||||||
|
|
||||||
|
#ifndef SCALAR
|
||||||
|
#define SCALAR float
|
||||||
|
#endif
|
||||||
|
|
||||||
|
typedef SCALAR Scalar;
|
||||||
|
|
||||||
|
// --- Fixed-size GEMM ---
|
||||||
|
template <int N>
|
||||||
|
static void BM_FixedGemm(benchmark::State& state) {
|
||||||
|
typedef Matrix<Scalar, N, N> Mat;
|
||||||
|
Mat a = Mat::Random();
|
||||||
|
Mat b = Mat::Random();
|
||||||
|
Mat c;
|
||||||
|
|
||||||
|
for (auto _ : state) {
|
||||||
|
c.noalias() = a * b;
|
||||||
|
benchmark::DoNotOptimize(c.data());
|
||||||
|
benchmark::ClobberMemory();
|
||||||
|
}
|
||||||
|
state.counters["GFLOPS"] =
|
||||||
|
benchmark::Counter(2.0 * N * N * N, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
|
||||||
|
}
|
||||||
|
|
||||||
|
// --- Fixed-size inverse ---
|
||||||
|
template <int N>
|
||||||
|
static void BM_FixedInverse(benchmark::State& state) {
|
||||||
|
typedef Matrix<Scalar, N, N> Mat;
|
||||||
|
Mat a = Mat::Random();
|
||||||
|
// Make well-conditioned.
|
||||||
|
a = a * a.transpose() + Mat::Identity();
|
||||||
|
Mat result;
|
||||||
|
|
||||||
|
for (auto _ : state) {
|
||||||
|
result = a.inverse();
|
||||||
|
benchmark::DoNotOptimize(result.data());
|
||||||
|
benchmark::ClobberMemory();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// --- Fixed-size determinant ---
|
||||||
|
template <int N>
|
||||||
|
static void BM_FixedDeterminant(benchmark::State& state) {
|
||||||
|
typedef Matrix<Scalar, N, N> Mat;
|
||||||
|
Mat a = Mat::Random();
|
||||||
|
Scalar result;
|
||||||
|
|
||||||
|
for (auto _ : state) {
|
||||||
|
result = a.determinant();
|
||||||
|
benchmark::DoNotOptimize(&result);
|
||||||
|
benchmark::ClobberMemory();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// --- Batch transform: Matrix4 * Matrix<4,N> ---
|
||||||
|
static void BM_BatchTransform4xN(benchmark::State& state) {
|
||||||
|
int N = state.range(0);
|
||||||
|
typedef Matrix<Scalar, 4, 4> Mat4;
|
||||||
|
typedef Matrix<Scalar, 4, Dynamic> MatXN;
|
||||||
|
|
||||||
|
Mat4 transform = Mat4::Random();
|
||||||
|
MatXN points = MatXN::Random(4, N);
|
||||||
|
MatXN result(4, N);
|
||||||
|
|
||||||
|
for (auto _ : state) {
|
||||||
|
result.noalias() = transform * points;
|
||||||
|
benchmark::DoNotOptimize(result.data());
|
||||||
|
benchmark::ClobberMemory();
|
||||||
|
}
|
||||||
|
state.counters["GFLOPS"] =
|
||||||
|
benchmark::Counter(2.0 * 4 * 4 * N, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
|
||||||
|
}
|
||||||
|
|
||||||
|
// --- Fixed 3x3 batch operations (common in point cloud processing) ---
|
||||||
|
static void BM_Batch3x3Gemm(benchmark::State& state) {
|
||||||
|
int count = state.range(0);
|
||||||
|
typedef Matrix<Scalar, 3, 3> Mat3;
|
||||||
|
|
||||||
|
std::vector<Mat3> a(count), b(count), c(count);
|
||||||
|
for (int i = 0; i < count; ++i) {
|
||||||
|
a[i] = Mat3::Random();
|
||||||
|
b[i] = Mat3::Random();
|
||||||
|
}
|
||||||
|
|
||||||
|
for (auto _ : state) {
|
||||||
|
for (int i = 0; i < count; ++i) {
|
||||||
|
c[i].noalias() = a[i] * b[i];
|
||||||
|
}
|
||||||
|
benchmark::DoNotOptimize(c.data());
|
||||||
|
benchmark::ClobberMemory();
|
||||||
|
}
|
||||||
|
state.counters["GFLOPS"] =
|
||||||
|
benchmark::Counter(2.0 * 27 * count, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fixed-size GEMM
|
||||||
|
BENCHMARK(BM_FixedGemm<2>)->Name("FixedGemm_2x2");
|
||||||
|
BENCHMARK(BM_FixedGemm<3>)->Name("FixedGemm_3x3");
|
||||||
|
BENCHMARK(BM_FixedGemm<4>)->Name("FixedGemm_4x4");
|
||||||
|
|
||||||
|
// Fixed-size inverse
|
||||||
|
BENCHMARK(BM_FixedInverse<2>)->Name("FixedInverse_2x2");
|
||||||
|
BENCHMARK(BM_FixedInverse<3>)->Name("FixedInverse_3x3");
|
||||||
|
BENCHMARK(BM_FixedInverse<4>)->Name("FixedInverse_4x4");
|
||||||
|
|
||||||
|
// Fixed-size determinant
|
||||||
|
BENCHMARK(BM_FixedDeterminant<2>)->Name("FixedDet_2x2");
|
||||||
|
BENCHMARK(BM_FixedDeterminant<3>)->Name("FixedDet_3x3");
|
||||||
|
BENCHMARK(BM_FixedDeterminant<4>)->Name("FixedDet_4x4");
|
||||||
|
|
||||||
|
// Batch 4xN transform
|
||||||
|
BENCHMARK(BM_BatchTransform4xN)->Arg(1)->Arg(4)->Arg(8)->Arg(16)->Arg(64);
|
||||||
|
|
||||||
|
// Batch 3x3 GEMM
|
||||||
|
BENCHMARK(BM_Batch3x3Gemm)->Arg(100)->Arg(1000)->Arg(10000);
|
||||||
@@ -1,2 +1,3 @@
|
|||||||
eigen_add_benchmark(bench_eigensolver bench_eigensolver.cpp)
|
eigen_add_benchmark(bench_eigensolver bench_eigensolver.cpp)
|
||||||
|
eigen_add_benchmark(bench_eigensolver_double bench_eigensolver.cpp DEFINITIONS SCALAR=double)
|
||||||
eigen_add_benchmark(bench_eig33 bench_eig33.cpp)
|
eigen_add_benchmark(bench_eig33 bench_eig33.cpp)
|
||||||
|
|||||||
@@ -5,7 +5,11 @@
|
|||||||
|
|
||||||
using namespace Eigen;
|
using namespace Eigen;
|
||||||
|
|
||||||
typedef float Scalar;
|
#ifndef SCALAR
|
||||||
|
#define SCALAR float
|
||||||
|
#endif
|
||||||
|
|
||||||
|
typedef SCALAR Scalar;
|
||||||
|
|
||||||
static void BM_SelfAdjointEigenSolver(benchmark::State& state) {
|
static void BM_SelfAdjointEigenSolver(benchmark::State& state) {
|
||||||
int n = state.range(0);
|
int n = state.range(0);
|
||||||
|
|||||||
@@ -33,11 +33,20 @@ static void BM_FFT(benchmark::State& state) {
|
|||||||
benchmark::Counter(mflops_per_iter, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
|
benchmark::Counter(mflops_per_iter, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
|
||||||
}
|
}
|
||||||
|
|
||||||
BENCHMARK(BM_FFT<std::complex<float>, true>)->Arg(1024)->Arg(4096);
|
static void FFTSizes(::benchmark::Benchmark* b) {
|
||||||
BENCHMARK(BM_FFT<std::complex<float>, false>)->Arg(1024)->Arg(4096);
|
for (int n : {64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 65536}) {
|
||||||
BENCHMARK(BM_FFT<float, true>)->Arg(1024)->Arg(4096);
|
b->Arg(n);
|
||||||
BENCHMARK(BM_FFT<float, false>)->Arg(1024)->Arg(4096);
|
}
|
||||||
BENCHMARK(BM_FFT<std::complex<double>, true>)->Arg(1024)->Arg(4096);
|
// Non-power-of-2 sizes.
|
||||||
BENCHMARK(BM_FFT<std::complex<double>, false>)->Arg(1024)->Arg(4096);
|
b->Arg(1000);
|
||||||
BENCHMARK(BM_FFT<double, true>)->Arg(1024)->Arg(4096);
|
b->Arg(5000);
|
||||||
BENCHMARK(BM_FFT<double, false>)->Arg(1024)->Arg(4096);
|
}
|
||||||
|
|
||||||
|
BENCHMARK(BM_FFT<std::complex<float>, true>)->Apply(FFTSizes);
|
||||||
|
BENCHMARK(BM_FFT<std::complex<float>, false>)->Apply(FFTSizes);
|
||||||
|
BENCHMARK(BM_FFT<float, true>)->Apply(FFTSizes);
|
||||||
|
BENCHMARK(BM_FFT<float, false>)->Apply(FFTSizes);
|
||||||
|
BENCHMARK(BM_FFT<std::complex<double>, true>)->Apply(FFTSizes);
|
||||||
|
BENCHMARK(BM_FFT<std::complex<double>, false>)->Apply(FFTSizes);
|
||||||
|
BENCHMARK(BM_FFT<double, true>)->Apply(FFTSizes);
|
||||||
|
BENCHMARK(BM_FFT<double, false>)->Apply(FFTSizes);
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
eigen_add_benchmark(bench_spmv bench_spmv.cpp)
|
eigen_add_benchmark(bench_spmv bench_spmv.cpp)
|
||||||
eigen_add_benchmark(bench_spmm bench_spmm.cpp)
|
eigen_add_benchmark(bench_spmm bench_spmm.cpp)
|
||||||
eigen_add_benchmark(bench_sparse_transpose bench_sparse_transpose.cpp)
|
eigen_add_benchmark(bench_sparse_transpose bench_sparse_transpose.cpp)
|
||||||
|
eigen_add_benchmark(bench_sparse_solvers bench_sparse_solvers.cpp)
|
||||||
|
|||||||
182
benchmarks/Sparse/bench_sparse_solvers.cpp
Normal file
182
benchmarks/Sparse/bench_sparse_solvers.cpp
Normal file
@@ -0,0 +1,182 @@
|
|||||||
|
// Benchmarks for sparse decomposition solvers.
|
||||||
|
// Tests SimplicialLLT, SimplicialLDLT, SparseQR, SparseLU, CG, BiCGSTAB.
|
||||||
|
|
||||||
|
#include <benchmark/benchmark.h>
|
||||||
|
#include <Eigen/Sparse>
|
||||||
|
#include <Eigen/SparseCholesky>
|
||||||
|
#include <Eigen/SparseLU>
|
||||||
|
#include <Eigen/SparseQR>
|
||||||
|
#include <Eigen/IterativeLinearSolvers>
|
||||||
|
#include <Eigen/OrderingMethods>
|
||||||
|
|
||||||
|
using namespace Eigen;
|
||||||
|
|
||||||
|
typedef double Scalar;
|
||||||
|
typedef SparseMatrix<Scalar> SpMat;
|
||||||
|
typedef Matrix<Scalar, Dynamic, 1> Vec;
|
||||||
|
|
||||||
|
// Generate a SPD banded matrix (Laplacian-like).
|
||||||
|
static SpMat generateSPD(int n, int bandwidth) {
|
||||||
|
SpMat A(n, n);
|
||||||
|
std::vector<Triplet<Scalar>> trips;
|
||||||
|
trips.reserve(n * (2 * bandwidth + 1));
|
||||||
|
for (int i = 0; i < n; ++i) {
|
||||||
|
Scalar diag = 0;
|
||||||
|
for (int j = std::max(0, i - bandwidth); j < std::min(n, i + bandwidth + 1); ++j) {
|
||||||
|
if (i != j) {
|
||||||
|
Scalar val = -1.0 / (1 + std::abs(i - j));
|
||||||
|
trips.emplace_back(i, j, val);
|
||||||
|
diag -= val;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
trips.emplace_back(i, i, diag + 1.0);
|
||||||
|
}
|
||||||
|
A.setFromTriplets(trips.begin(), trips.end());
|
||||||
|
return A;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Generate a general (non-symmetric) sparse matrix with diagonal dominance.
|
||||||
|
static SpMat generateGeneral(int n, int bandwidth) {
|
||||||
|
SpMat A(n, n);
|
||||||
|
std::vector<Triplet<Scalar>> trips;
|
||||||
|
trips.reserve(n * (2 * bandwidth + 1));
|
||||||
|
for (int i = 0; i < n; ++i) {
|
||||||
|
Scalar diag = 0;
|
||||||
|
for (int j = std::max(0, i - bandwidth); j < std::min(n, i + bandwidth + 1); ++j) {
|
||||||
|
if (i != j) {
|
||||||
|
Scalar val = -0.5 / (1 + std::abs(i - j));
|
||||||
|
if (j > i) val *= 1.5;
|
||||||
|
trips.emplace_back(i, j, val);
|
||||||
|
diag += std::abs(val);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
trips.emplace_back(i, i, diag + 1.0);
|
||||||
|
}
|
||||||
|
A.setFromTriplets(trips.begin(), trips.end());
|
||||||
|
return A;
|
||||||
|
}
|
||||||
|
|
||||||
|
// --- SimplicialLLT ---
|
||||||
|
static void BM_SimplicialLLT(benchmark::State& state) {
|
||||||
|
int n = state.range(0);
|
||||||
|
int bw = state.range(1);
|
||||||
|
SpMat A = generateSPD(n, bw);
|
||||||
|
Vec b = Vec::Random(n);
|
||||||
|
|
||||||
|
for (auto _ : state) {
|
||||||
|
SimplicialLLT<SpMat> solver(A);
|
||||||
|
Vec x = solver.solve(b);
|
||||||
|
benchmark::DoNotOptimize(x.data());
|
||||||
|
benchmark::ClobberMemory();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// --- SimplicialLDLT ---
|
||||||
|
static void BM_SimplicialLDLT(benchmark::State& state) {
|
||||||
|
int n = state.range(0);
|
||||||
|
int bw = state.range(1);
|
||||||
|
SpMat A = generateSPD(n, bw);
|
||||||
|
Vec b = Vec::Random(n);
|
||||||
|
|
||||||
|
for (auto _ : state) {
|
||||||
|
SimplicialLDLT<SpMat> solver(A);
|
||||||
|
Vec x = solver.solve(b);
|
||||||
|
benchmark::DoNotOptimize(x.data());
|
||||||
|
benchmark::ClobberMemory();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// --- SparseLU ---
|
||||||
|
static void BM_SparseLU(benchmark::State& state) {
|
||||||
|
int n = state.range(0);
|
||||||
|
int bw = state.range(1);
|
||||||
|
SpMat A = generateGeneral(n, bw);
|
||||||
|
Vec b = Vec::Random(n);
|
||||||
|
|
||||||
|
for (auto _ : state) {
|
||||||
|
SparseLU<SpMat, COLAMDOrdering<int>> solver;
|
||||||
|
solver.compute(A);
|
||||||
|
Vec x = solver.solve(b);
|
||||||
|
benchmark::DoNotOptimize(x.data());
|
||||||
|
benchmark::ClobberMemory();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// --- SparseQR ---
|
||||||
|
static void BM_SparseQR(benchmark::State& state) {
|
||||||
|
int n = state.range(0);
|
||||||
|
int bw = state.range(1);
|
||||||
|
SpMat A = generateGeneral(n, bw);
|
||||||
|
Vec b = Vec::Random(n);
|
||||||
|
|
||||||
|
for (auto _ : state) {
|
||||||
|
SparseQR<SpMat, COLAMDOrdering<int>> solver;
|
||||||
|
solver.compute(A);
|
||||||
|
Vec x = solver.solve(b);
|
||||||
|
benchmark::DoNotOptimize(x.data());
|
||||||
|
benchmark::ClobberMemory();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// --- ConjugateGradient (SPD) ---
|
||||||
|
static void BM_CG(benchmark::State& state) {
|
||||||
|
int n = state.range(0);
|
||||||
|
int bw = state.range(1);
|
||||||
|
SpMat A = generateSPD(n, bw);
|
||||||
|
Vec b = Vec::Random(n);
|
||||||
|
|
||||||
|
ConjugateGradient<SpMat> solver;
|
||||||
|
solver.setMaxIterations(1000);
|
||||||
|
solver.setTolerance(1e-10);
|
||||||
|
solver.compute(A);
|
||||||
|
|
||||||
|
for (auto _ : state) {
|
||||||
|
Vec x = solver.solve(b);
|
||||||
|
benchmark::DoNotOptimize(x.data());
|
||||||
|
benchmark::ClobberMemory();
|
||||||
|
}
|
||||||
|
state.counters["iterations"] = solver.iterations();
|
||||||
|
}
|
||||||
|
|
||||||
|
// --- BiCGSTAB (general) ---
|
||||||
|
static void BM_BiCGSTAB(benchmark::State& state) {
|
||||||
|
int n = state.range(0);
|
||||||
|
int bw = state.range(1);
|
||||||
|
SpMat A = generateGeneral(n, bw);
|
||||||
|
Vec b = Vec::Random(n);
|
||||||
|
|
||||||
|
BiCGSTAB<SpMat> solver;
|
||||||
|
solver.setMaxIterations(1000);
|
||||||
|
solver.setTolerance(1e-10);
|
||||||
|
solver.compute(A);
|
||||||
|
|
||||||
|
for (auto _ : state) {
|
||||||
|
Vec x = solver.solve(b);
|
||||||
|
benchmark::DoNotOptimize(x.data());
|
||||||
|
benchmark::ClobberMemory();
|
||||||
|
}
|
||||||
|
state.counters["iterations"] = solver.iterations();
|
||||||
|
}
|
||||||
|
|
||||||
|
static void DirectSolverSizes(::benchmark::Benchmark* b) {
|
||||||
|
for (int n : {1000, 5000, 10000, 50000}) {
|
||||||
|
for (int bw : {5, 20}) {
|
||||||
|
b->Args({n, bw});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void IterativeSolverSizes(::benchmark::Benchmark* b) {
|
||||||
|
for (int n : {1000, 10000, 50000}) {
|
||||||
|
for (int bw : {5, 20}) {
|
||||||
|
b->Args({n, bw});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
BENCHMARK(BM_SimplicialLLT)->Apply(DirectSolverSizes);
|
||||||
|
BENCHMARK(BM_SimplicialLDLT)->Apply(DirectSolverSizes);
|
||||||
|
BENCHMARK(BM_SparseLU)->Apply(DirectSolverSizes);
|
||||||
|
BENCHMARK(BM_SparseQR)->Apply(DirectSolverSizes);
|
||||||
|
BENCHMARK(BM_CG)->Apply(IterativeSolverSizes);
|
||||||
|
BENCHMARK(BM_BiCGSTAB)->Apply(IterativeSolverSizes);
|
||||||
1
unsupported/benchmarks/AutoDiff/CMakeLists.txt
Normal file
1
unsupported/benchmarks/AutoDiff/CMakeLists.txt
Normal file
@@ -0,0 +1 @@
|
|||||||
|
eigen_add_benchmark(bench_autodiff bench_autodiff.cpp)
|
||||||
177
unsupported/benchmarks/AutoDiff/bench_autodiff.cpp
Normal file
177
unsupported/benchmarks/AutoDiff/bench_autodiff.cpp
Normal file
@@ -0,0 +1,177 @@
|
|||||||
|
// Benchmarks for Eigen AutoDiff module.
|
||||||
|
// Compares AutoDiff Jacobian computation against NumericalDiff and hand-coded Jacobians.
|
||||||
|
|
||||||
|
#include <benchmark/benchmark.h>
|
||||||
|
#include <Eigen/Core>
|
||||||
|
#include <unsupported/Eigen/AutoDiff>
|
||||||
|
#include <unsupported/Eigen/NumericalDiff>
|
||||||
|
|
||||||
|
using namespace Eigen;
|
||||||
|
|
||||||
|
// --- Small functor: Rosenbrock-like (2 inputs -> 2 outputs) ---
|
||||||
|
struct SmallFunctor {
|
||||||
|
typedef Matrix<double, 2, 1> InputType;
|
||||||
|
typedef Matrix<double, 2, 1> ValueType;
|
||||||
|
typedef Matrix<double, 2, 2> JacobianType;
|
||||||
|
|
||||||
|
enum { InputsAtCompileTime = 2, ValuesAtCompileTime = 2 };
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
void operator()(const Matrix<T, 2, 1>& x, Matrix<T, 2, 1>* v) const {
|
||||||
|
(*v)(0) = T(1) - x(0);
|
||||||
|
(*v)(1) = T(10) * (x(1) - x(0) * x(0));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// --- Medium functor: chain of operations (6 inputs -> 6 outputs) ---
|
||||||
|
struct MediumFunctor {
|
||||||
|
typedef Matrix<double, 6, 1> InputType;
|
||||||
|
typedef Matrix<double, 6, 1> ValueType;
|
||||||
|
typedef Matrix<double, 6, 6> JacobianType;
|
||||||
|
|
||||||
|
enum { InputsAtCompileTime = 6, ValuesAtCompileTime = 6 };
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
void operator()(const Matrix<T, 6, 1>& x, Matrix<T, 6, 1>* v) const {
|
||||||
|
(*v)(0) = sin(x(0)) * cos(x(1)) + x(2) * x(2);
|
||||||
|
(*v)(1) = exp(x(1) * T(0.1)) + x(3);
|
||||||
|
(*v)(2) = x(0) * x(2) - x(4) * x(5);
|
||||||
|
(*v)(3) = sqrt(x(3) * x(3) + T(1)) + x(0);
|
||||||
|
(*v)(4) = x(4) * x(4) + x(5) * x(5) + x(0) * x(1);
|
||||||
|
(*v)(5) = log(x(2) * x(2) + T(1)) + x(3) * x(4);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// --- Dynamic-size functor (N inputs -> N outputs) ---
|
||||||
|
struct DynamicFunctor {
|
||||||
|
typedef Matrix<double, Dynamic, 1> InputType;
|
||||||
|
typedef Matrix<double, Dynamic, 1> ValueType;
|
||||||
|
typedef Matrix<double, Dynamic, Dynamic> JacobianType;
|
||||||
|
|
||||||
|
const int n_;
|
||||||
|
DynamicFunctor(int n) : n_(n) {}
|
||||||
|
|
||||||
|
enum { InputsAtCompileTime = Dynamic, ValuesAtCompileTime = Dynamic };
|
||||||
|
|
||||||
|
int inputs() const { return n_; }
|
||||||
|
int values() const { return n_; }
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
void operator()(const Matrix<T, Dynamic, 1>& x, Matrix<T, Dynamic, 1>* v) const {
|
||||||
|
v->resize(n_);
|
||||||
|
(*v)(0) = T(1) - x(0);
|
||||||
|
for (int i = 1; i < n_; ++i) {
|
||||||
|
(*v)(i) = T(10) * (x(i) - x(i - 1) * x(i - 1));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// Wrapper for NumericalDiff compatibility.
|
||||||
|
struct SmallFunctorND : SmallFunctor {
|
||||||
|
typedef double Scalar;
|
||||||
|
int inputs() const { return 2; }
|
||||||
|
int values() const { return 2; }
|
||||||
|
int operator()(const InputType& x, ValueType& v) const {
|
||||||
|
SmallFunctor::operator()(x, &v);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
struct MediumFunctorND : MediumFunctor {
|
||||||
|
typedef double Scalar;
|
||||||
|
int inputs() const { return 6; }
|
||||||
|
int values() const { return 6; }
|
||||||
|
int operator()(const InputType& x, ValueType& v) const {
|
||||||
|
MediumFunctor::operator()(x, &v);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// --- AutoDiff Jacobian benchmarks ---
|
||||||
|
template <typename Functor>
|
||||||
|
static void BM_AutoDiffJacobian(benchmark::State& state, Functor func) {
|
||||||
|
AutoDiffJacobian<Functor> adf(func);
|
||||||
|
typename Functor::InputType x = Functor::InputType::Random();
|
||||||
|
typename Functor::ValueType v;
|
||||||
|
typename Functor::JacobianType jac;
|
||||||
|
|
||||||
|
for (auto _ : state) {
|
||||||
|
adf(x, &v, &jac);
|
||||||
|
benchmark::DoNotOptimize(jac.data());
|
||||||
|
benchmark::ClobberMemory();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// --- Dynamic AutoDiff Jacobian ---
|
||||||
|
static void BM_AutoDiffJacobian_Dynamic(benchmark::State& state) {
|
||||||
|
int n = state.range(0);
|
||||||
|
DynamicFunctor func(n);
|
||||||
|
AutoDiffJacobian<DynamicFunctor> adf(func);
|
||||||
|
|
||||||
|
VectorXd x = VectorXd::Random(n);
|
||||||
|
VectorXd v(n);
|
||||||
|
MatrixXd jac(n, n);
|
||||||
|
|
||||||
|
for (auto _ : state) {
|
||||||
|
adf(x, &v, &jac);
|
||||||
|
benchmark::DoNotOptimize(jac.data());
|
||||||
|
benchmark::ClobberMemory();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// --- NumericalDiff benchmarks ---
|
||||||
|
template <typename Functor>
|
||||||
|
static void BM_NumericalDiffJacobian(benchmark::State& state, Functor func) {
|
||||||
|
NumericalDiff<Functor> ndf(func);
|
||||||
|
typename Functor::InputType x = Functor::InputType::Random();
|
||||||
|
typename Functor::JacobianType jac;
|
||||||
|
|
||||||
|
for (auto _ : state) {
|
||||||
|
ndf.df(x, jac);
|
||||||
|
benchmark::DoNotOptimize(jac.data());
|
||||||
|
benchmark::ClobberMemory();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// --- Hand-coded Jacobian (Rosenbrock) for comparison ---
|
||||||
|
static void BM_HandCoded_Small(benchmark::State& state) {
|
||||||
|
Vector2d x = Vector2d::Random();
|
||||||
|
Matrix2d jac;
|
||||||
|
|
||||||
|
for (auto _ : state) {
|
||||||
|
jac(0, 0) = -1;
|
||||||
|
jac(0, 1) = 0;
|
||||||
|
jac(1, 0) = -20 * x(0);
|
||||||
|
jac(1, 1) = 10;
|
||||||
|
benchmark::DoNotOptimize(jac.data());
|
||||||
|
benchmark::ClobberMemory();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// --- Scalar AutoDiff evaluation (no Jacobian, just forward pass) ---
|
||||||
|
static void BM_AutoDiffScalar_Eval(benchmark::State& state) {
|
||||||
|
int n = state.range(0);
|
||||||
|
using ADScalar = AutoDiffScalar<VectorXd>;
|
||||||
|
VectorXd x = VectorXd::Random(n);
|
||||||
|
|
||||||
|
for (auto _ : state) {
|
||||||
|
ADScalar sum(0.0, VectorXd::Zero(n));
|
||||||
|
for (int i = 0; i < n; ++i) {
|
||||||
|
ADScalar xi(x(i), n, i);
|
||||||
|
sum += xi * xi + sin(xi);
|
||||||
|
}
|
||||||
|
benchmark::DoNotOptimize(sum.value());
|
||||||
|
benchmark::DoNotOptimize(sum.derivatives().data());
|
||||||
|
benchmark::ClobberMemory();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
BENCHMARK_CAPTURE(BM_AutoDiffJacobian, Small, SmallFunctor());
|
||||||
|
BENCHMARK_CAPTURE(BM_AutoDiffJacobian, Medium, MediumFunctor());
|
||||||
|
BENCHMARK(BM_AutoDiffJacobian_Dynamic)->Arg(2)->Arg(6)->Arg(20)->Arg(50)->Arg(100);
|
||||||
|
|
||||||
|
BENCHMARK_CAPTURE(BM_NumericalDiffJacobian, Small, SmallFunctorND());
|
||||||
|
BENCHMARK_CAPTURE(BM_NumericalDiffJacobian, Medium, MediumFunctorND());
|
||||||
|
|
||||||
|
BENCHMARK(BM_HandCoded_Small);
|
||||||
|
BENCHMARK(BM_AutoDiffScalar_Eval)->Arg(2)->Arg(6)->Arg(20)->Arg(50)->Arg(100);
|
||||||
35
unsupported/benchmarks/CMakeLists.txt
Normal file
35
unsupported/benchmarks/CMakeLists.txt
Normal file
@@ -0,0 +1,35 @@
|
|||||||
|
cmake_minimum_required(VERSION 3.10)
|
||||||
|
project(EigenUnsupportedBenchmarks CXX)
|
||||||
|
|
||||||
|
find_package(benchmark REQUIRED)
|
||||||
|
find_package(Threads REQUIRED)
|
||||||
|
|
||||||
|
set(EIGEN_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../..")
|
||||||
|
|
||||||
|
# Helper: add a Google Benchmark target (mirrors benchmarks/CMakeLists.txt).
|
||||||
|
# eigen_add_benchmark(name source [LIBRARIES lib1 lib2 ...] [DEFINITIONS def1 def2 ...])
|
||||||
|
function(eigen_add_benchmark name source)
|
||||||
|
cmake_parse_arguments(BENCH "" "" "LIBRARIES;DEFINITIONS" ${ARGN})
|
||||||
|
if(NOT IS_ABSOLUTE "${source}")
|
||||||
|
set(source "${CMAKE_CURRENT_SOURCE_DIR}/${source}")
|
||||||
|
endif()
|
||||||
|
add_executable(${name} ${source})
|
||||||
|
target_include_directories(${name} PRIVATE ${EIGEN_SOURCE_DIR})
|
||||||
|
target_link_libraries(${name} PRIVATE benchmark::benchmark benchmark::benchmark_main
|
||||||
|
Threads::Threads)
|
||||||
|
if(BENCH_LIBRARIES)
|
||||||
|
target_link_libraries(${name} PRIVATE ${BENCH_LIBRARIES})
|
||||||
|
endif()
|
||||||
|
target_compile_options(${name} PRIVATE -O3 -DNDEBUG)
|
||||||
|
if(BENCH_DEFINITIONS)
|
||||||
|
target_compile_definitions(${name} PRIVATE ${BENCH_DEFINITIONS})
|
||||||
|
endif()
|
||||||
|
endfunction()
|
||||||
|
|
||||||
|
add_subdirectory(Tensor)
|
||||||
|
add_subdirectory(MatrixFunctions)
|
||||||
|
add_subdirectory(SpecialFunctions)
|
||||||
|
add_subdirectory(AutoDiff)
|
||||||
|
add_subdirectory(Splines)
|
||||||
|
add_subdirectory(IterativeSolvers)
|
||||||
|
add_subdirectory(KroneckerProduct)
|
||||||
1
unsupported/benchmarks/IterativeSolvers/CMakeLists.txt
Normal file
1
unsupported/benchmarks/IterativeSolvers/CMakeLists.txt
Normal file
@@ -0,0 +1 @@
|
|||||||
|
eigen_add_benchmark(bench_iterative_solvers bench_iterative_solvers.cpp)
|
||||||
@@ -0,0 +1,209 @@
|
|||||||
|
// Benchmarks for unsupported iterative solvers: GMRES, MINRES, IDRS, IDRSTABL, BiCGSTABL, DGMRES.
|
||||||
|
|
||||||
|
#include <benchmark/benchmark.h>
|
||||||
|
#include <Eigen/Sparse>
|
||||||
|
#include <Eigen/IterativeLinearSolvers>
|
||||||
|
#include <unsupported/Eigen/IterativeSolvers>
|
||||||
|
|
||||||
|
using namespace Eigen;
|
||||||
|
|
||||||
|
typedef double Scalar;
|
||||||
|
typedef SparseMatrix<Scalar> SpMat;
|
||||||
|
typedef Matrix<Scalar, Dynamic, 1> Vec;
|
||||||
|
|
||||||
|
// Generate a SPD sparse matrix (Laplacian-like with diagonal dominance).
|
||||||
|
static SpMat generateSPD(int n, int bandwidth) {
|
||||||
|
SpMat A(n, n);
|
||||||
|
std::vector<Triplet<Scalar>> trips;
|
||||||
|
trips.reserve(n * (2 * bandwidth + 1));
|
||||||
|
for (int i = 0; i < n; ++i) {
|
||||||
|
Scalar diag = 0;
|
||||||
|
for (int j = std::max(0, i - bandwidth); j < std::min(n, i + bandwidth + 1); ++j) {
|
||||||
|
if (i != j) {
|
||||||
|
Scalar val = -1.0 / (1 + std::abs(i - j));
|
||||||
|
trips.emplace_back(i, j, val);
|
||||||
|
diag -= val;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
trips.emplace_back(i, i, diag + 1.0);
|
||||||
|
}
|
||||||
|
A.setFromTriplets(trips.begin(), trips.end());
|
||||||
|
return A;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Generate a general (non-symmetric) sparse matrix.
|
||||||
|
static SpMat generateGeneral(int n, int bandwidth) {
|
||||||
|
SpMat A(n, n);
|
||||||
|
std::vector<Triplet<Scalar>> trips;
|
||||||
|
trips.reserve(n * (2 * bandwidth + 1));
|
||||||
|
for (int i = 0; i < n; ++i) {
|
||||||
|
Scalar diag = 0;
|
||||||
|
for (int j = std::max(0, i - bandwidth); j < std::min(n, i + bandwidth + 1); ++j) {
|
||||||
|
if (i != j) {
|
||||||
|
Scalar val = -0.5 / (1 + std::abs(i - j));
|
||||||
|
if (j > i) val *= 1.5; // asymmetry
|
||||||
|
trips.emplace_back(i, j, val);
|
||||||
|
diag += std::abs(val);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
trips.emplace_back(i, i, diag + 1.0); // diagonal dominance
|
||||||
|
}
|
||||||
|
A.setFromTriplets(trips.begin(), trips.end());
|
||||||
|
return A;
|
||||||
|
}
|
||||||
|
|
||||||
|
// --- GMRES ---
|
||||||
|
static void BM_GMRES(benchmark::State& state) {
|
||||||
|
int n = state.range(0);
|
||||||
|
int bw = state.range(1);
|
||||||
|
SpMat A = generateGeneral(n, bw);
|
||||||
|
Vec b = Vec::Random(n);
|
||||||
|
|
||||||
|
GMRES<SpMat> solver;
|
||||||
|
solver.setMaxIterations(1000);
|
||||||
|
solver.setTolerance(1e-10);
|
||||||
|
solver.compute(A);
|
||||||
|
|
||||||
|
for (auto _ : state) {
|
||||||
|
Vec x = solver.solve(b);
|
||||||
|
benchmark::DoNotOptimize(x.data());
|
||||||
|
benchmark::ClobberMemory();
|
||||||
|
}
|
||||||
|
state.counters["iterations"] = solver.iterations();
|
||||||
|
}
|
||||||
|
|
||||||
|
// --- DGMRES ---
|
||||||
|
static void BM_DGMRES(benchmark::State& state) {
|
||||||
|
int n = state.range(0);
|
||||||
|
int bw = state.range(1);
|
||||||
|
SpMat A = generateGeneral(n, bw);
|
||||||
|
Vec b = Vec::Random(n);
|
||||||
|
|
||||||
|
DGMRES<SpMat> solver;
|
||||||
|
solver.setMaxIterations(1000);
|
||||||
|
solver.setTolerance(1e-10);
|
||||||
|
solver.compute(A);
|
||||||
|
|
||||||
|
for (auto _ : state) {
|
||||||
|
Vec x = solver.solve(b);
|
||||||
|
benchmark::DoNotOptimize(x.data());
|
||||||
|
benchmark::ClobberMemory();
|
||||||
|
}
|
||||||
|
state.counters["iterations"] = solver.iterations();
|
||||||
|
}
|
||||||
|
|
||||||
|
// --- MINRES (SPD matrices) ---
|
||||||
|
static void BM_MINRES(benchmark::State& state) {
|
||||||
|
int n = state.range(0);
|
||||||
|
int bw = state.range(1);
|
||||||
|
SpMat A = generateSPD(n, bw);
|
||||||
|
Vec b = Vec::Random(n);
|
||||||
|
|
||||||
|
MINRES<SpMat> solver;
|
||||||
|
solver.setMaxIterations(1000);
|
||||||
|
solver.setTolerance(1e-10);
|
||||||
|
solver.compute(A);
|
||||||
|
|
||||||
|
for (auto _ : state) {
|
||||||
|
Vec x = solver.solve(b);
|
||||||
|
benchmark::DoNotOptimize(x.data());
|
||||||
|
benchmark::ClobberMemory();
|
||||||
|
}
|
||||||
|
state.counters["iterations"] = solver.iterations();
|
||||||
|
}
|
||||||
|
|
||||||
|
// --- IDRS ---
|
||||||
|
static void BM_IDRS(benchmark::State& state) {
|
||||||
|
int n = state.range(0);
|
||||||
|
int bw = state.range(1);
|
||||||
|
SpMat A = generateGeneral(n, bw);
|
||||||
|
Vec b = Vec::Random(n);
|
||||||
|
|
||||||
|
IDRS<SpMat> solver;
|
||||||
|
solver.setMaxIterations(1000);
|
||||||
|
solver.setTolerance(1e-10);
|
||||||
|
solver.compute(A);
|
||||||
|
|
||||||
|
for (auto _ : state) {
|
||||||
|
Vec x = solver.solve(b);
|
||||||
|
benchmark::DoNotOptimize(x.data());
|
||||||
|
benchmark::ClobberMemory();
|
||||||
|
}
|
||||||
|
state.counters["iterations"] = solver.iterations();
|
||||||
|
}
|
||||||
|
|
||||||
|
// --- BiCGSTABL ---
|
||||||
|
static void BM_BiCGSTABL(benchmark::State& state) {
|
||||||
|
int n = state.range(0);
|
||||||
|
int bw = state.range(1);
|
||||||
|
SpMat A = generateGeneral(n, bw);
|
||||||
|
Vec b = Vec::Random(n);
|
||||||
|
|
||||||
|
BiCGSTABL<SpMat> solver;
|
||||||
|
solver.setMaxIterations(1000);
|
||||||
|
solver.setTolerance(1e-10);
|
||||||
|
solver.compute(A);
|
||||||
|
|
||||||
|
for (auto _ : state) {
|
||||||
|
Vec x = solver.solve(b);
|
||||||
|
benchmark::DoNotOptimize(x.data());
|
||||||
|
benchmark::ClobberMemory();
|
||||||
|
}
|
||||||
|
state.counters["iterations"] = solver.iterations();
|
||||||
|
}
|
||||||
|
|
||||||
|
// --- Compare with CG (supported module, SPD only) ---
|
||||||
|
static void BM_CG_Reference(benchmark::State& state) {
|
||||||
|
int n = state.range(0);
|
||||||
|
int bw = state.range(1);
|
||||||
|
SpMat A = generateSPD(n, bw);
|
||||||
|
Vec b = Vec::Random(n);
|
||||||
|
|
||||||
|
ConjugateGradient<SpMat> solver;
|
||||||
|
solver.setMaxIterations(1000);
|
||||||
|
solver.setTolerance(1e-10);
|
||||||
|
solver.compute(A);
|
||||||
|
|
||||||
|
for (auto _ : state) {
|
||||||
|
Vec x = solver.solve(b);
|
||||||
|
benchmark::DoNotOptimize(x.data());
|
||||||
|
benchmark::ClobberMemory();
|
||||||
|
}
|
||||||
|
state.counters["iterations"] = solver.iterations();
|
||||||
|
}
|
||||||
|
|
||||||
|
// --- Compare with BiCGSTAB (supported module, general) ---
|
||||||
|
// --- Compare with BiCGSTAB (supported module, general) ---
// Supported-module BiCGSTAB baseline on a banded general system, configured
// identically to the unsupported solvers above for a fair comparison.
static void BM_BiCGSTAB_Reference(benchmark::State& state) {
  const int size = state.range(0);
  const int bandwidth = state.range(1);
  SpMat mat = generateGeneral(size, bandwidth);
  Vec rhs = Vec::Random(size);

  BiCGSTAB<SpMat> bicgstab;
  bicgstab.setMaxIterations(1000);
  bicgstab.setTolerance(1e-10);
  bicgstab.compute(mat);

  for (auto _ : state) {
    Vec sol = bicgstab.solve(rhs);
    benchmark::DoNotOptimize(sol.data());
    benchmark::ClobberMemory();
  }
  state.counters["iterations"] = bicgstab.iterations();
}
|
||||||
|
|
||||||
|
// Registers the common (matrix size, bandwidth) argument grid shared by all
// iterative-solver benchmarks in this file.
static void SolverSizes(::benchmark::Benchmark* b) {
  const int sizes[] = {1000, 10000, 100000};
  const int bandwidths[] = {5, 20};
  for (int n : sizes)
    for (int bw : bandwidths) b->Args({n, bw});
}
|
||||||
|
|
||||||
|
// Register all solver benchmarks over the shared (n, bandwidth) grid.
// Unsupported-module solvers:
BENCHMARK(BM_GMRES)->Apply(SolverSizes);
BENCHMARK(BM_DGMRES)->Apply(SolverSizes);
BENCHMARK(BM_MINRES)->Apply(SolverSizes);
BENCHMARK(BM_IDRS)->Apply(SolverSizes);
BENCHMARK(BM_BiCGSTABL)->Apply(SolverSizes);
// Supported-module baselines for comparison:
BENCHMARK(BM_CG_Reference)->Apply(SolverSizes);
BENCHMARK(BM_BiCGSTAB_Reference)->Apply(SolverSizes);
|
||||||
1
unsupported/benchmarks/KroneckerProduct/CMakeLists.txt
Normal file
1
unsupported/benchmarks/KroneckerProduct/CMakeLists.txt
Normal file
@@ -0,0 +1 @@
|
|||||||
|
eigen_add_benchmark(bench_kronecker bench_kronecker.cpp)
|
||||||
83
unsupported/benchmarks/KroneckerProduct/bench_kronecker.cpp
Normal file
83
unsupported/benchmarks/KroneckerProduct/bench_kronecker.cpp
Normal file
@@ -0,0 +1,83 @@
|
|||||||
|
// Benchmarks for Kronecker product (dense and sparse).
|
||||||
|
|
||||||
|
#include <benchmark/benchmark.h>
|
||||||
|
#include <Eigen/Core>
|
||||||
|
#include <Eigen/Sparse>
|
||||||
|
#include <unsupported/Eigen/KroneckerProduct>
|
||||||
|
|
||||||
|
using namespace Eigen;
|
||||||
|
|
||||||
|
typedef double Scalar;
|
||||||
|
typedef Matrix<Scalar, Dynamic, Dynamic> Mat;
|
||||||
|
typedef SparseMatrix<Scalar> SpMat;
|
||||||
|
|
||||||
|
// --- Dense Kronecker product ---
|
||||||
|
// --- Dense Kronecker product ---
// Evaluates kroneckerProduct of an (na x na) and an (nb x nb) dense random
// matrix into an (na*nb x na*nb) result; the output dimension is exported
// as a counter.
static void BM_KroneckerDense(benchmark::State& state) {
  const int na = state.range(0);
  const int nb = state.range(1);
  const Mat A = Mat::Random(na, na);
  const Mat B = Mat::Random(nb, nb);

  for (auto _ : state) {
    Mat C = kroneckerProduct(A, B).eval();
    benchmark::DoNotOptimize(C.data());
    benchmark::ClobberMemory();
  }
  const int outSize = na * nb;
  state.counters["output_size"] = outSize;
}
|
||||||
|
|
||||||
|
// --- Sparse Kronecker product ---
|
||||||
|
static void BM_KroneckerSparse(benchmark::State& state) {
|
||||||
|
int na = state.range(0);
|
||||||
|
int nb = state.range(1);
|
||||||
|
|
||||||
|
// Create sparse identity-like matrices with some fill.
|
||||||
|
SpMat A(na, na);
|
||||||
|
SpMat B(nb, nb);
|
||||||
|
|
||||||
|
std::vector<Triplet<Scalar>> tripsA, tripsB;
|
||||||
|
for (int i = 0; i < na; ++i) {
|
||||||
|
tripsA.emplace_back(i, i, 2.0);
|
||||||
|
if (i + 1 < na) {
|
||||||
|
tripsA.emplace_back(i, i + 1, -1.0);
|
||||||
|
tripsA.emplace_back(i + 1, i, -1.0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (int i = 0; i < nb; ++i) {
|
||||||
|
tripsB.emplace_back(i, i, 2.0);
|
||||||
|
if (i + 1 < nb) {
|
||||||
|
tripsB.emplace_back(i, i + 1, -1.0);
|
||||||
|
tripsB.emplace_back(i + 1, i, -1.0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
A.setFromTriplets(tripsA.begin(), tripsA.end());
|
||||||
|
B.setFromTriplets(tripsB.begin(), tripsB.end());
|
||||||
|
|
||||||
|
for (auto _ : state) {
|
||||||
|
SpMat C = kroneckerProduct(A, B).eval();
|
||||||
|
benchmark::DoNotOptimize(C.valuePtr());
|
||||||
|
benchmark::ClobberMemory();
|
||||||
|
}
|
||||||
|
state.counters["output_size"] = na * nb;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Argument grid for the dense Kronecker benchmark: all pairs of small sizes
// (the dense result grows as (na*nb)^2, so sizes are kept modest).
static void KroneckerSizes(::benchmark::Benchmark* b) {
  const int sizes[] = {4, 8, 16};
  for (int na : sizes)
    for (int nb : sizes) b->Args({na, nb});
}
|
||||||
|
|
||||||
|
// Argument grid for the sparse Kronecker benchmark; sparse results allow
// larger operand sizes than the dense grid.
static void KroneckerSparseSizes(::benchmark::Benchmark* b) {
  const int sizes[] = {16, 32, 64, 128};
  for (int na : sizes)
    for (int nb : sizes) b->Args({na, nb});
}
|
||||||
|
|
||||||
|
// Register dense and sparse Kronecker-product benchmarks on their size grids.
BENCHMARK(BM_KroneckerDense)->Apply(KroneckerSizes);
BENCHMARK(BM_KroneckerSparse)->Apply(KroneckerSparseSizes);
|
||||||
3
unsupported/benchmarks/MatrixFunctions/CMakeLists.txt
Normal file
3
unsupported/benchmarks/MatrixFunctions/CMakeLists.txt
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
eigen_add_benchmark(bench_matrix_exponential bench_matrix_exponential.cpp)
|
||||||
|
eigen_add_benchmark(bench_matrix_logarithm bench_matrix_logarithm.cpp)
|
||||||
|
eigen_add_benchmark(bench_matrix_power bench_matrix_power.cpp)
|
||||||
@@ -0,0 +1,52 @@
|
|||||||
|
// Benchmarks for matrix exponential.
|
||||||
|
// Critical for Sophus Lie group operations (SLAM, visual odometry).
|
||||||
|
|
||||||
|
#include <benchmark/benchmark.h>
|
||||||
|
#include <Eigen/Core>
|
||||||
|
#include <unsupported/Eigen/MatrixFunctions>
|
||||||
|
|
||||||
|
using namespace Eigen;
|
||||||
|
|
||||||
|
#ifndef SCALAR
|
||||||
|
#define SCALAR double
|
||||||
|
#endif
|
||||||
|
|
||||||
|
typedef SCALAR Scalar;
|
||||||
|
|
||||||
|
// Benchmarks dense matrix exponential for dynamic-size n x n matrices.
// The input is scaled by 1/n to keep the spectral radius reasonable so the
// Pade/scaling-and-squaring algorithm runs in its typical regime.
static void BM_MatrixExp(benchmark::State& state) {
  using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
  const int n = state.range(0);

  const MatrixType A = MatrixType::Random(n, n) / Scalar(n);
  MatrixType result(n, n);

  for (auto _ : state) {
    result = A.exp();
    benchmark::DoNotOptimize(result.data());
    benchmark::ClobberMemory();
  }
}
|
||||||
|
|
||||||
|
// Fixed-size specializations for Lie group sizes.
|
||||||
|
template <int N>
|
||||||
|
static void BM_MatrixExp_Fixed(benchmark::State& state) {
|
||||||
|
typedef Matrix<Scalar, N, N> MatrixType;
|
||||||
|
|
||||||
|
MatrixType A = MatrixType::Random() / Scalar(N);
|
||||||
|
MatrixType result;
|
||||||
|
|
||||||
|
for (auto _ : state) {
|
||||||
|
result = A.exp();
|
||||||
|
benchmark::DoNotOptimize(result.data());
|
||||||
|
benchmark::ClobberMemory();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Dynamic sizes: Lie-group dimensions (2, 3, 4) plus progressively larger
// matrices to expose the O(n^3) scaling of the algorithm.
BENCHMARK(BM_MatrixExp)->Arg(2)->Arg(3)->Arg(4)->Arg(8)->Arg(16)->Arg(32)->Arg(64)->Arg(128);

// Fixed-size Lie group dimensions (compile-time sizes enable unrolling).
BENCHMARK(BM_MatrixExp_Fixed<2>);
BENCHMARK(BM_MatrixExp_Fixed<3>);
BENCHMARK(BM_MatrixExp_Fixed<4>);
|
||||||
@@ -0,0 +1,51 @@
|
|||||||
|
// Benchmarks for matrix logarithm.
|
||||||
|
// Inverse of matrix exponential, used for Lie group log maps.
|
||||||
|
|
||||||
|
#include <benchmark/benchmark.h>
|
||||||
|
#include <Eigen/Core>
|
||||||
|
#include <unsupported/Eigen/MatrixFunctions>
|
||||||
|
|
||||||
|
using namespace Eigen;
|
||||||
|
|
||||||
|
#ifndef SCALAR
|
||||||
|
#define SCALAR double
|
||||||
|
#endif
|
||||||
|
|
||||||
|
typedef SCALAR Scalar;
|
||||||
|
|
||||||
|
// Benchmarks dense matrix logarithm for dynamic-size n x n matrices.
//
// The input is constructed as exp(small random matrix): this guarantees A is
// in the principal branch, so log() is well-defined and numerically stable.
// (A previous Identity + Random initialization of A was dead code — it was
// immediately overwritten by the exp-based initialization — and has been
// removed.)
static void BM_MatrixLog(benchmark::State& state) {
  using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
  const int n = state.range(0);

  const MatrixType A = (MatrixType::Random(n, n) / Scalar(n * 4)).exp();
  MatrixType result(n, n);

  for (auto _ : state) {
    result = A.log();
    benchmark::DoNotOptimize(result.data());
    benchmark::ClobberMemory();
  }
}
|
||||||
|
|
||||||
|
template <int N>
|
||||||
|
static void BM_MatrixLog_Fixed(benchmark::State& state) {
|
||||||
|
typedef Matrix<Scalar, N, N> MatrixType;
|
||||||
|
|
||||||
|
MatrixType A = (MatrixType::Random() / Scalar(N * 4)).exp();
|
||||||
|
MatrixType result;
|
||||||
|
|
||||||
|
for (auto _ : state) {
|
||||||
|
result = A.log();
|
||||||
|
benchmark::DoNotOptimize(result.data());
|
||||||
|
benchmark::ClobberMemory();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Dynamic sizes: Lie-group dimensions plus larger matrices.
BENCHMARK(BM_MatrixLog)->Arg(2)->Arg(3)->Arg(4)->Arg(8)->Arg(16)->Arg(32)->Arg(64);

// Fixed-size Lie group dimensions.
BENCHMARK(BM_MatrixLog_Fixed<2>);
BENCHMARK(BM_MatrixLog_Fixed<3>);
BENCHMARK(BM_MatrixLog_Fixed<4>);
|
||||||
@@ -0,0 +1,99 @@
|
|||||||
|
// Benchmarks for matrix power functions: sqrt, pow, cos, sin, cosh, sinh.
|
||||||
|
|
||||||
|
#include <benchmark/benchmark.h>
|
||||||
|
#include <Eigen/Core>
|
||||||
|
#include <unsupported/Eigen/MatrixFunctions>
|
||||||
|
|
||||||
|
using namespace Eigen;
|
||||||
|
|
||||||
|
typedef double Scalar;
|
||||||
|
typedef Matrix<Scalar, Dynamic, Dynamic> Mat;
|
||||||
|
|
||||||
|
// Benchmarks matrix square root on an SPD input (X*X^T + I), which has a
// unique, well-defined principal square root.
static void BM_MatrixSqrt(benchmark::State& state) {
  const int n = state.range(0);
  const Mat seed = Mat::Random(n, n);
  const Mat A = seed * seed.transpose() + Mat::Identity(n, n);
  Mat result(n, n);

  for (auto _ : state) {
    result = A.sqrt();
    benchmark::DoNotOptimize(result.data());
    benchmark::ClobberMemory();
  }
}
|
||||||
|
|
||||||
|
// Benchmarks fractional matrix power A^2.5 on an SPD input (X*X^T + I),
// for which real powers are well-defined.
static void BM_MatrixPow(benchmark::State& state) {
  const int n = state.range(0);
  const Mat seed = Mat::Random(n, n);
  const Mat A = seed * seed.transpose() + Mat::Identity(n, n);
  const Scalar exponent = 2.5;
  Mat result(n, n);

  for (auto _ : state) {
    result = A.pow(exponent);
    benchmark::DoNotOptimize(result.data());
    benchmark::ClobberMemory();
  }
}
|
||||||
|
|
||||||
|
// Benchmarks matrix cosine on a random matrix scaled by 1/n to keep the
// spectral radius modest.
static void BM_MatrixCos(benchmark::State& state) {
  const int n = state.range(0);
  const Mat A = Mat::Random(n, n) / Scalar(n);
  Mat result(n, n);

  for (auto _ : state) {
    result = A.cos();
    benchmark::DoNotOptimize(result.data());
    benchmark::ClobberMemory();
  }
}
|
||||||
|
|
||||||
|
// Benchmarks matrix sine on a random matrix scaled by 1/n to keep the
// spectral radius modest.
static void BM_MatrixSin(benchmark::State& state) {
  const int n = state.range(0);
  const Mat A = Mat::Random(n, n) / Scalar(n);
  Mat result(n, n);

  for (auto _ : state) {
    result = A.sin();
    benchmark::DoNotOptimize(result.data());
    benchmark::ClobberMemory();
  }
}
|
||||||
|
|
||||||
|
// Benchmarks matrix hyperbolic cosine on a random matrix scaled by 1/n.
static void BM_MatrixCosh(benchmark::State& state) {
  const int n = state.range(0);
  const Mat A = Mat::Random(n, n) / Scalar(n);
  Mat result(n, n);

  for (auto _ : state) {
    result = A.cosh();
    benchmark::DoNotOptimize(result.data());
    benchmark::ClobberMemory();
  }
}
|
||||||
|
|
||||||
|
// Benchmarks matrix hyperbolic sine on a random matrix scaled by 1/n.
static void BM_MatrixSinh(benchmark::State& state) {
  const int n = state.range(0);
  const Mat A = Mat::Random(n, n) / Scalar(n);
  Mat result(n, n);

  for (auto _ : state) {
    result = A.sinh();
    benchmark::DoNotOptimize(result.data());
    benchmark::ClobberMemory();
  }
}
|
||||||
|
|
||||||
|
// Size grid shared by all matrix-function benchmarks in this file.
static void MatPowerSizes(::benchmark::Benchmark* b) {
  const int sizes[] = {4, 8, 16, 32, 64};
  for (int n : sizes) b->Arg(n);
}
|
||||||
|
|
||||||
|
// Register every matrix-function benchmark over the shared size grid.
BENCHMARK(BM_MatrixSqrt)->Apply(MatPowerSizes);
BENCHMARK(BM_MatrixPow)->Apply(MatPowerSizes);
BENCHMARK(BM_MatrixCos)->Apply(MatPowerSizes);
BENCHMARK(BM_MatrixSin)->Apply(MatPowerSizes);
BENCHMARK(BM_MatrixCosh)->Apply(MatPowerSizes);
BENCHMARK(BM_MatrixSinh)->Apply(MatPowerSizes);
|
||||||
1
unsupported/benchmarks/SpecialFunctions/CMakeLists.txt
Normal file
1
unsupported/benchmarks/SpecialFunctions/CMakeLists.txt
Normal file
@@ -0,0 +1 @@
|
|||||||
|
eigen_add_benchmark(bench_special_functions bench_special_functions.cpp)
|
||||||
@@ -0,0 +1,127 @@
|
|||||||
|
// Benchmarks for special functions beyond what bench_cwise_math.cpp covers.
|
||||||
|
// Includes Bessel functions, two-argument functions (igamma, betainc),
|
||||||
|
// and additional functions (lgamma, digamma, zeta, polygamma).
|
||||||
|
|
||||||
|
#include <benchmark/benchmark.h>
|
||||||
|
#include <Eigen/Core>
|
||||||
|
#include <unsupported/Eigen/SpecialFunctions>
|
||||||
|
|
||||||
|
using namespace Eigen;
|
||||||
|
|
||||||
|
// Macro for unary special functions on arrays.
// Expands to a templated benchmark BM_<NAME> that evaluates EXPR element-wise
// on an array `a` of state.range(0) elements, with inputs uniformly mapped
// from Random's [-1, 1] into [LO, HI]. Reports element throughput and counts
// one read plus one write per element as bytes processed.
// NOTE: comments cannot appear inside the backslash-continued macro body, so
// all documentation lives here.
#define BENCH_SPECIAL_UNARY(NAME, EXPR, LO, HI)                                                       \
  template <typename Scalar>                                                                          \
  static void BM_##NAME(benchmark::State& state) {                                                    \
    const Index n = state.range(0);                                                                   \
    using Arr = Array<Scalar, Dynamic, 1>;                                                            \
    Arr a = (Arr::Random(n) + Scalar(1)) * Scalar((double(HI) - double(LO)) / 2.0) + Scalar(LO);      \
    Arr b(n);                                                                                         \
    for (auto _ : state) {                                                                            \
      b = EXPR;                                                                                       \
      benchmark::DoNotOptimize(b.data());                                                             \
    }                                                                                                 \
    state.counters["Elements/s"] = benchmark::Counter(n, benchmark::Counter::kIsIterationInvariantRate); \
    state.SetBytesProcessed(state.iterations() * n * sizeof(Scalar) * 2);                             \
  }
|
||||||
|
|
||||||
|
// Macro for binary special functions on arrays.
// Expands to a templated benchmark BM_<NAME> that evaluates EXPR on two input
// arrays `a` and `b` of state.range(0) elements, each uniformly mapped from
// Random's [-1, 1] into its own [LO, HI] range. Reports element throughput
// and counts two reads plus one write per element as bytes processed.
// NOTE: comments cannot appear inside the backslash-continued macro body, so
// all documentation lives here.
#define BENCH_SPECIAL_BINARY(NAME, EXPR, LO_A, HI_A, LO_B, HI_B)                                          \
  template <typename Scalar>                                                                              \
  static void BM_##NAME(benchmark::State& state) {                                                        \
    const Index n = state.range(0);                                                                       \
    using Arr = Array<Scalar, Dynamic, 1>;                                                                \
    Arr a = (Arr::Random(n) + Scalar(1)) * Scalar((double(HI_A) - double(LO_A)) / 2.0) + Scalar(LO_A);    \
    Arr b = (Arr::Random(n) + Scalar(1)) * Scalar((double(HI_B) - double(LO_B)) / 2.0) + Scalar(LO_B);    \
    Arr c(n);                                                                                             \
    for (auto _ : state) {                                                                                \
      c = EXPR;                                                                                           \
      benchmark::DoNotOptimize(c.data());                                                                 \
    }                                                                                                     \
    state.counters["Elements/s"] = benchmark::Counter(n, benchmark::Counter::kIsIterationInvariantRate);  \
    state.SetBytesProcessed(state.iterations() * n * sizeof(Scalar) * 3);                                 \
  }
|
||||||
|
|
||||||
|
// --- Unary special functions ---
// Input ranges are chosen to keep arguments in each function's valid domain
// (e.g. lgamma/digamma need positive input).
BENCH_SPECIAL_UNARY(Lgamma, Eigen::lgamma(a), 0.1, 20)
BENCH_SPECIAL_UNARY(Digamma, Eigen::digamma(a), 0.1, 20)

// --- Bessel functions (first kind) ---
// The exponentially-scaled variants (i0e/i1e) stay finite for large
// arguments, so they are exercised on a wider range.
BENCH_SPECIAL_UNARY(BesselI0, Eigen::bessel_i0(a), 0, 10)
BENCH_SPECIAL_UNARY(BesselI1, Eigen::bessel_i1(a), 0, 10)
BENCH_SPECIAL_UNARY(BesselI0e, Eigen::bessel_i0e(a), 0, 100)
BENCH_SPECIAL_UNARY(BesselI1e, Eigen::bessel_i1e(a), 0, 100)
BENCH_SPECIAL_UNARY(BesselJ0, Eigen::bessel_j0(a), 0, 20)
BENCH_SPECIAL_UNARY(BesselJ1, Eigen::bessel_j1(a), 0, 20)

// --- Bessel functions (second kind) ---
// y0/y1/k0/k1 are singular at 0, so the lower bound is kept at 0.1.
BENCH_SPECIAL_UNARY(BesselY0, Eigen::bessel_y0(a), 0.1, 20)
BENCH_SPECIAL_UNARY(BesselY1, Eigen::bessel_y1(a), 0.1, 20)
BENCH_SPECIAL_UNARY(BesselK0, Eigen::bessel_k0(a), 0.1, 20)
BENCH_SPECIAL_UNARY(BesselK1, Eigen::bessel_k1(a), 0.1, 20)
BENCH_SPECIAL_UNARY(BesselK0e, Eigen::bessel_k0e(a), 0.1, 100)
BENCH_SPECIAL_UNARY(BesselK1e, Eigen::bessel_k1e(a), 0.1, 100)

// --- Two-argument functions ---
// zeta requires its first argument > 1; polygamma's order is kept small.
BENCH_SPECIAL_BINARY(Igamma, Eigen::igamma(a, b), 0.1, 10, 0.1, 10)
BENCH_SPECIAL_BINARY(Igammac, Eigen::igammac(a, b), 0.1, 10, 0.1, 10)
BENCH_SPECIAL_BINARY(Zeta, Eigen::zeta(a, b), 1.1, 10, 0.1, 10)
BENCH_SPECIAL_BINARY(Polygamma, Eigen::polygamma(a, b), 1, 4, 0.1, 10)
|
||||||
|
|
||||||
|
// --- Ternary: betainc ---
|
||||||
|
template <typename Scalar>
|
||||||
|
static void BM_Betainc(benchmark::State& state) {
|
||||||
|
const Index n = state.range(0);
|
||||||
|
using Arr = Array<Scalar, Dynamic, 1>;
|
||||||
|
Arr a = (Arr::Random(n) + Scalar(1)) * Scalar(2.5) + Scalar(0.5); // [0.5, 5.5]
|
||||||
|
Arr b = (Arr::Random(n) + Scalar(1)) * Scalar(2.5) + Scalar(0.5);
|
||||||
|
Arr x = (Arr::Random(n) + Scalar(1)) * Scalar(0.5); // [0, 1]
|
||||||
|
Arr result(n);
|
||||||
|
for (auto _ : state) {
|
||||||
|
result = Eigen::betainc(a, b, x);
|
||||||
|
benchmark::DoNotOptimize(result.data());
|
||||||
|
}
|
||||||
|
state.counters["Elements/s"] = benchmark::Counter(n, benchmark::Counter::kIsIterationInvariantRate);
|
||||||
|
state.SetBytesProcessed(state.iterations() * n * sizeof(Scalar) * 4);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Element counts shared by all special-function benchmarks (256 up to 1M).
static void SpecialSizes(::benchmark::Benchmark* b) {
  const int counts[] = {256, 4096, 65536, 1048576};
  for (int n : counts) b->Arg(n);
}
|
||||||
|
|
||||||
|
// --- Register float ---
// One registration per special function; Name() disambiguates the scalar
// type since the benchmark functions are templates.
BENCHMARK(BM_Lgamma<float>)->Apply(SpecialSizes)->Name("Lgamma_float");
BENCHMARK(BM_Digamma<float>)->Apply(SpecialSizes)->Name("Digamma_float");
BENCHMARK(BM_BesselI0<float>)->Apply(SpecialSizes)->Name("BesselI0_float");
BENCHMARK(BM_BesselI1<float>)->Apply(SpecialSizes)->Name("BesselI1_float");
BENCHMARK(BM_BesselI0e<float>)->Apply(SpecialSizes)->Name("BesselI0e_float");
BENCHMARK(BM_BesselI1e<float>)->Apply(SpecialSizes)->Name("BesselI1e_float");
BENCHMARK(BM_BesselJ0<float>)->Apply(SpecialSizes)->Name("BesselJ0_float");
BENCHMARK(BM_BesselJ1<float>)->Apply(SpecialSizes)->Name("BesselJ1_float");
BENCHMARK(BM_BesselY0<float>)->Apply(SpecialSizes)->Name("BesselY0_float");
BENCHMARK(BM_BesselY1<float>)->Apply(SpecialSizes)->Name("BesselY1_float");
BENCHMARK(BM_BesselK0<float>)->Apply(SpecialSizes)->Name("BesselK0_float");
BENCHMARK(BM_BesselK1<float>)->Apply(SpecialSizes)->Name("BesselK1_float");
BENCHMARK(BM_BesselK0e<float>)->Apply(SpecialSizes)->Name("BesselK0e_float");
BENCHMARK(BM_BesselK1e<float>)->Apply(SpecialSizes)->Name("BesselK1e_float");
BENCHMARK(BM_Igamma<float>)->Apply(SpecialSizes)->Name("Igamma_float");
BENCHMARK(BM_Igammac<float>)->Apply(SpecialSizes)->Name("Igammac_float");
BENCHMARK(BM_Betainc<float>)->Apply(SpecialSizes)->Name("Betainc_float");
BENCHMARK(BM_Zeta<float>)->Apply(SpecialSizes)->Name("Zeta_float");
BENCHMARK(BM_Polygamma<float>)->Apply(SpecialSizes)->Name("Polygamma_float");
|
||||||
|
|
||||||
|
// --- Register double ---
|
||||||
|
BENCHMARK(BM_Lgamma<double>)->Apply(SpecialSizes)->Name("Lgamma_double");
|
||||||
|
BENCHMARK(BM_Digamma<double>)->Apply(SpecialSizes)->Name("Digamma_double");
|
||||||
|
BENCHMARK(BM_BesselI0<double>)->Apply(SpecialSizes)->Name("BesselI0_double");
|
||||||
|
BENCHMARK(BM_BesselI1<double>)->Apply(SpecialSizes)->Name("BesselI1_double");
|
||||||
|
BENCHMARK(BM_BesselJ0<double>)->Apply(SpecialSizes)->Name("BesselJ0_double");
|
||||||
|
BENCHMARK(BM_BesselJ1<double>)->Apply(SpecialSizes)->Name("BesselJ1_double");
|
||||||
|
BENCHMARK(BM_BesselY0<double>)->Apply(SpecialSizes)->Name("BesselY0_double");
|
||||||
|
BENCHMARK(BM_BesselY1<double>)->Apply(SpecialSizes)->Name("BesselY1_double");
|
||||||
|
BENCHMARK(BM_BesselK0<double>)->Apply(SpecialSizes)->Name("BesselK0_double");
|
||||||
|
BENCHMARK(BM_BesselK1<double>)->Apply(SpecialSizes)->Name("BesselK1_double");
|
||||||
|
BENCHMARK(BM_Igamma<double>)->Apply(SpecialSizes)->Name("Igamma_double");
|
||||||
|
BENCHMARK(BM_Igammac<double>)->Apply(SpecialSizes)->Name("Igammac_double");
|
||||||
|
BENCHMARK(BM_Betainc<double>)->Apply(SpecialSizes)->Name("Betainc_double");
|
||||||
|
BENCHMARK(BM_Zeta<double>)->Apply(SpecialSizes)->Name("Zeta_double");
|
||||||
|
BENCHMARK(BM_Polygamma<double>)->Apply(SpecialSizes)->Name("Polygamma_double");
|
||||||
1
unsupported/benchmarks/Splines/CMakeLists.txt
Normal file
1
unsupported/benchmarks/Splines/CMakeLists.txt
Normal file
@@ -0,0 +1 @@
|
|||||||
|
eigen_add_benchmark(bench_splines bench_splines.cpp)
|
||||||
98
unsupported/benchmarks/Splines/bench_splines.cpp
Normal file
98
unsupported/benchmarks/Splines/bench_splines.cpp
Normal file
@@ -0,0 +1,98 @@
|
|||||||
|
// Benchmarks for Eigen Spline module.
|
||||||
|
// Tests fitting, evaluation, and derivative computation.
|
||||||
|
|
||||||
|
#include <benchmark/benchmark.h>
|
||||||
|
#include <Eigen/Core>
|
||||||
|
#include <unsupported/Eigen/Splines>
|
||||||
|
|
||||||
|
using namespace Eigen;
|
||||||
|
|
||||||
|
typedef double Scalar;
|
||||||
|
|
||||||
|
// --- Spline fitting (interpolation) ---
|
||||||
|
template <int Dim, int Degree>
|
||||||
|
static void BM_SplineFit(benchmark::State& state) {
|
||||||
|
const int n = state.range(0);
|
||||||
|
|
||||||
|
typedef Spline<Scalar, Dim> SplineType;
|
||||||
|
typedef typename SplineType::PointType PointType;
|
||||||
|
|
||||||
|
// Generate random points.
|
||||||
|
Matrix<Scalar, Dim, Dynamic> pts(Dim, n);
|
||||||
|
pts.setRandom();
|
||||||
|
|
||||||
|
for (auto _ : state) {
|
||||||
|
SplineType spline = SplineFitting<SplineType>::Interpolate(pts, Degree);
|
||||||
|
benchmark::DoNotOptimize(spline.knots().data());
|
||||||
|
benchmark::ClobberMemory();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// --- Spline evaluation ---
|
||||||
|
template <int Dim, int Degree>
|
||||||
|
static void BM_SplineEval(benchmark::State& state) {
|
||||||
|
const int n = state.range(0); // number of control points for fitting
|
||||||
|
const int neval = 1000; // number of evaluation points
|
||||||
|
|
||||||
|
typedef Spline<Scalar, Dim> SplineType;
|
||||||
|
|
||||||
|
Matrix<Scalar, Dim, Dynamic> pts(Dim, n);
|
||||||
|
pts.setRandom();
|
||||||
|
SplineType spline = SplineFitting<SplineType>::Interpolate(pts, Degree);
|
||||||
|
|
||||||
|
// Generate evaluation parameters in [0, 1].
|
||||||
|
VectorXd u = VectorXd::LinSpaced(neval, 0, 1);
|
||||||
|
|
||||||
|
for (auto _ : state) {
|
||||||
|
for (int i = 0; i < neval; ++i) {
|
||||||
|
auto pt = spline(u(i));
|
||||||
|
benchmark::DoNotOptimize(pt.data());
|
||||||
|
}
|
||||||
|
benchmark::ClobberMemory();
|
||||||
|
}
|
||||||
|
state.counters["Evals/s"] = benchmark::Counter(neval, benchmark::Counter::kIsIterationInvariantRate);
|
||||||
|
}
|
||||||
|
|
||||||
|
// --- Spline derivative evaluation ---
|
||||||
|
template <int Dim, int Degree>
|
||||||
|
static void BM_SplineDerivatives(benchmark::State& state) {
|
||||||
|
const int n = state.range(0);
|
||||||
|
const int neval = 1000;
|
||||||
|
|
||||||
|
typedef Spline<Scalar, Dim> SplineType;
|
||||||
|
|
||||||
|
Matrix<Scalar, Dim, Dynamic> pts(Dim, n);
|
||||||
|
pts.setRandom();
|
||||||
|
SplineType spline = SplineFitting<SplineType>::Interpolate(pts, Degree);
|
||||||
|
|
||||||
|
VectorXd u = VectorXd::LinSpaced(neval, 0, 1);
|
||||||
|
|
||||||
|
for (auto _ : state) {
|
||||||
|
for (int i = 0; i < neval; ++i) {
|
||||||
|
auto derivs = spline.derivatives(u(i), 1);
|
||||||
|
benchmark::DoNotOptimize(derivs.data());
|
||||||
|
}
|
||||||
|
benchmark::ClobberMemory();
|
||||||
|
}
|
||||||
|
state.counters["Evals/s"] = benchmark::Counter(neval, benchmark::Counter::kIsIterationInvariantRate);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Control-point counts shared by all spline benchmarks.
static void SplineSizes(::benchmark::Benchmark* b) {
  const int counts[] = {10, 50, 200, 1000};
  for (int n : counts) b->Arg(n);
}
|
||||||
|
|
||||||
|
// 2D cubic splines
BENCHMARK(BM_SplineFit<2, 3>)->Apply(SplineSizes)->Name("SplineFit_2D_Cubic");
BENCHMARK(BM_SplineEval<2, 3>)->Apply(SplineSizes)->Name("SplineEval_2D_Cubic");
BENCHMARK(BM_SplineDerivatives<2, 3>)->Apply(SplineSizes)->Name("SplineDerivatives_2D_Cubic");

// 3D cubic splines
BENCHMARK(BM_SplineFit<3, 3>)->Apply(SplineSizes)->Name("SplineFit_3D_Cubic");
BENCHMARK(BM_SplineEval<3, 3>)->Apply(SplineSizes)->Name("SplineEval_3D_Cubic");
BENCHMARK(BM_SplineDerivatives<3, 3>)->Apply(SplineSizes)->Name("SplineDerivatives_3D_Cubic");

// 2D quintic splines (higher degree; no derivative variant registered here)
BENCHMARK(BM_SplineFit<2, 5>)->Apply(SplineSizes)->Name("SplineFit_2D_Quintic");
BENCHMARK(BM_SplineEval<2, 5>)->Apply(SplineSizes)->Name("SplineEval_2D_Quintic");
|
||||||
8
unsupported/benchmarks/Tensor/CMakeLists.txt
Normal file
8
unsupported/benchmarks/Tensor/CMakeLists.txt
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
eigen_add_benchmark(bench_contraction bench_contraction.cpp)
|
||||||
|
eigen_add_benchmark(bench_convolution bench_convolution.cpp)
|
||||||
|
eigen_add_benchmark(bench_reduction bench_reduction.cpp)
|
||||||
|
eigen_add_benchmark(bench_broadcasting bench_broadcasting.cpp)
|
||||||
|
eigen_add_benchmark(bench_shuffling bench_shuffling.cpp)
|
||||||
|
eigen_add_benchmark(bench_tensor_fft bench_tensor_fft.cpp)
|
||||||
|
eigen_add_benchmark(bench_morphing bench_morphing.cpp)
|
||||||
|
eigen_add_benchmark(bench_coefficient_wise bench_coefficient_wise.cpp)
|
||||||
111
unsupported/benchmarks/Tensor/bench_broadcasting.cpp
Normal file
111
unsupported/benchmarks/Tensor/bench_broadcasting.cpp
Normal file
@@ -0,0 +1,111 @@
|
|||||||
|
// Benchmarks for Eigen Tensor broadcasting.
|
||||||
|
// Tests broadcasting along various dimensions and ranks.
|
||||||
|
|
||||||
|
#include <benchmark/benchmark.h>
|
||||||
|
#include <unsupported/Eigen/CXX11/Tensor>
|
||||||
|
|
||||||
|
using namespace Eigen;
|
||||||
|
|
||||||
|
typedef float Scalar;
|
||||||
|
|
||||||
|
// --- Broadcast row vector {1,N} -> {M,N} ---
|
||||||
|
// --- Broadcast row vector {1,N} -> {M,N} ---
// Replicates a single row M times; measures pure broadcast materialization
// bandwidth (one output write per element).
static void BM_BroadcastRow(benchmark::State& state) {
  const int rows = state.range(0);
  const int cols = state.range(1);

  Tensor<Scalar, 2> source(1, cols);
  source.setRandom();
  Tensor<Scalar, 2> dest(rows, cols);

  const Eigen::array<int, 2> reps = {rows, 1};

  for (auto _ : state) {
    dest = source.broadcast(reps);
    benchmark::DoNotOptimize(dest.data());
    benchmark::ClobberMemory();
  }
  state.SetBytesProcessed(state.iterations() * rows * cols * sizeof(Scalar));
}
|
||||||
|
|
||||||
|
// --- Broadcast col vector {M,1} -> {M,N} ---
|
||||||
|
// --- Broadcast col vector {M,1} -> {M,N} ---
// Replicates a single column N times; complements BM_BroadcastRow to expose
// any direction-dependent cost in the broadcast evaluator.
static void BM_BroadcastCol(benchmark::State& state) {
  const int rows = state.range(0);
  const int cols = state.range(1);

  Tensor<Scalar, 2> source(rows, 1);
  source.setRandom();
  Tensor<Scalar, 2> dest(rows, cols);

  const Eigen::array<int, 2> reps = {1, cols};

  for (auto _ : state) {
    dest = source.broadcast(reps);
    benchmark::DoNotOptimize(dest.data());
    benchmark::ClobberMemory();
  }
  state.SetBytesProcessed(state.iterations() * rows * cols * sizeof(Scalar));
}
|
||||||
|
|
||||||
|
// --- Broadcast + element-wise add (bias addition pattern) ---
|
||||||
|
// --- Broadcast + element-wise add (bias addition pattern) ---
// Fuses a row broadcast with an addition — the classic neural-net bias-add;
// counts one read of the matrix and one write of the result per element.
static void BM_BroadcastAdd(benchmark::State& state) {
  const int rows = state.range(0);
  const int cols = state.range(1);

  Tensor<Scalar, 2> input(rows, cols);
  input.setRandom();
  Tensor<Scalar, 2> bias(1, cols);
  bias.setRandom();
  Tensor<Scalar, 2> output(rows, cols);

  const Eigen::array<int, 2> reps = {rows, 1};

  for (auto _ : state) {
    output = input + bias.broadcast(reps);
    benchmark::DoNotOptimize(output.data());
    benchmark::ClobberMemory();
  }
  state.SetBytesProcessed(state.iterations() * rows * cols * sizeof(Scalar) * 2);
}
|
||||||
|
|
||||||
|
// --- Rank-4 broadcast (batch x channels x 1 x 1) -> (batch x channels x H x W) ---
|
||||||
|
// --- Rank-4 broadcast (batch x channels x 1 x 1) -> (batch x channels x H x W) ---
// Per-channel bias replicated over an HxH spatial grid (CNN bias pattern).
static void BM_BroadcastRank4(benchmark::State& state) {
  const int batch = state.range(0);
  const int channels = state.range(1);
  const int spatial = state.range(2);

  Tensor<Scalar, 4> bias(batch, channels, 1, 1);
  bias.setRandom();
  Tensor<Scalar, 4> dest(batch, channels, spatial, spatial);

  const Eigen::array<int, 4> reps = {1, 1, spatial, spatial};

  for (auto _ : state) {
    dest = bias.broadcast(reps);
    benchmark::DoNotOptimize(dest.data());
    benchmark::ClobberMemory();
  }
  state.SetBytesProcessed(state.iterations() * batch * channels * spatial * spatial * sizeof(Scalar));
}
|
||||||
|
|
||||||
|
// (rows, cols) grid for the rank-2 broadcast benchmarks.
static void BroadcastSizes(::benchmark::Benchmark* b) {
  const int dims[] = {64, 256, 1024};
  for (int m : dims)
    for (int n : dims) b->Args({m, n});
}
|
||||||
|
|
||||||
|
// (batch, channels, spatial) grid for the rank-4 broadcast benchmark.
static void Rank4Sizes(::benchmark::Benchmark* b) {
  const int batches[] = {1, 8};
  const int channels[] = {64, 256};
  const int spatials[] = {16, 32};
  for (int batch : batches)
    for (int c : channels)
      for (int h : spatials) b->Args({batch, c, h});
}
|
||||||
|
|
||||||
|
// Register the rank-2 broadcasts on the 2D grid and the rank-4 broadcast on
// its CNN-shaped grid.
BENCHMARK(BM_BroadcastRow)->Apply(BroadcastSizes);
BENCHMARK(BM_BroadcastCol)->Apply(BroadcastSizes);
BENCHMARK(BM_BroadcastAdd)->Apply(BroadcastSizes);
BENCHMARK(BM_BroadcastRank4)->Apply(Rank4Sizes);
|
||||||
131
unsupported/benchmarks/Tensor/bench_coefficient_wise.cpp
Normal file
131
unsupported/benchmarks/Tensor/bench_coefficient_wise.cpp
Normal file
@@ -0,0 +1,131 @@
|
|||||||
|
// Benchmarks for Eigen Tensor coefficient-wise operations.
|
||||||
|
// Covers activation functions, normalization, and element-wise arithmetic.
|
||||||
|
|
||||||
|
#include <benchmark/benchmark.h>
|
||||||
|
#include <unsupported/Eigen/CXX11/Tensor>
|
||||||
|
|
||||||
|
using namespace Eigen;
|
||||||
|
|
||||||
|
typedef float Scalar;
|
||||||
|
|
||||||
|
// Macro to define a benchmark for a unary tensor operation.
// Expands to BM_<NAME> operating element-wise on an MxN rank-2 tensor `a`
// (random-initialized) into `b`; EXPR references `a`. Bytes processed counts
// one read plus one write per element.
// NOTE: comments cannot appear inside the backslash-continued macro body, so
// all documentation lives here.
#define BENCH_TENSOR_UNARY(NAME, EXPR)                                      \
  static void BM_##NAME(benchmark::State& state) {                          \
    const int M = state.range(0);                                           \
    const int N = state.range(1);                                           \
    Tensor<Scalar, 2> a(M, N);                                              \
    a.setRandom();                                                          \
    Tensor<Scalar, 2> b(M, N);                                              \
    for (auto _ : state) {                                                  \
      b = EXPR;                                                             \
      benchmark::DoNotOptimize(b.data());                                   \
      benchmark::ClobberMemory();                                           \
    }                                                                       \
    state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar) * 2); \
  }
|
||||||
|
|
||||||
|
// Activation-style unary ops. Log and Sqrt take abs() first so random
// (possibly negative) inputs stay in the functions' domains.
BENCH_TENSOR_UNARY(Exp, a.exp())
BENCH_TENSOR_UNARY(Log, a.abs().log())
BENCH_TENSOR_UNARY(Tanh, a.tanh())
BENCH_TENSOR_UNARY(Sigmoid, a.sigmoid())
BENCH_TENSOR_UNARY(ReLU, a.cwiseMax(Scalar(0)))
BENCH_TENSOR_UNARY(Sqrt, a.abs().sqrt())
|
||||||
|
|
||||||
|
// --- Element-wise binary operations ---
|
||||||
|
// --- Element-wise binary operations ---
// Tensor addition c = a + b; two reads plus one write per element.
static void BM_Add(benchmark::State& state) {
  const int rows = state.range(0);
  const int cols = state.range(1);

  Tensor<Scalar, 2> lhs(rows, cols);
  lhs.setRandom();
  Tensor<Scalar, 2> rhs(rows, cols);
  rhs.setRandom();
  Tensor<Scalar, 2> out(rows, cols);

  for (auto _ : state) {
    out = lhs + rhs;
    benchmark::DoNotOptimize(out.data());
    benchmark::ClobberMemory();
  }
  state.SetBytesProcessed(state.iterations() * rows * cols * sizeof(Scalar) * 3);
}
|
||||||
|
|
||||||
|
// Element-wise (Hadamard) product c = a * b; two reads plus one write per
// element.
static void BM_Mul(benchmark::State& state) {
  const int rows = state.range(0);
  const int cols = state.range(1);

  Tensor<Scalar, 2> lhs(rows, cols);
  lhs.setRandom();
  Tensor<Scalar, 2> rhs(rows, cols);
  rhs.setRandom();
  Tensor<Scalar, 2> out(rows, cols);

  for (auto _ : state) {
    out = lhs * rhs;
    benchmark::DoNotOptimize(out.data());
    benchmark::ClobberMemory();
  }
  state.SetBytesProcessed(state.iterations() * rows * cols * sizeof(Scalar) * 3);
}
|
||||||
|
|
||||||
|
// --- Fused multiply-add ---
|
||||||
|
// Measures a fused multiply-add expression: d = a * b + c, element-wise.
static void BM_FMA(benchmark::State& state) {
  const int rows = state.range(0);
  const int cols = state.range(1);

  Tensor<Scalar, 2> x(rows, cols);
  Tensor<Scalar, 2> y(rows, cols);
  Tensor<Scalar, 2> z(rows, cols);
  Tensor<Scalar, 2> out(rows, cols);
  x.setRandom();
  y.setRandom();
  z.setRandom();

  for (auto _ : state) {
    out = x * y + z;
    benchmark::DoNotOptimize(out.data());
    benchmark::ClobberMemory();
  }
  // Three streams read, one written per iteration.
  state.SetBytesProcessed(state.iterations() * rows * cols * sizeof(Scalar) * 4);
}
|
||||||
|
|
||||||
|
// --- Rank-4 coefficient-wise (CNN feature maps) ---
|
||||||
|
// ReLU over a rank-4 activation tensor shaped (batch, channels, H, H).
static void BM_ReLU_Rank4(benchmark::State& state) {
  const int batch = state.range(0);
  const int channels = state.range(1);
  const int side = state.range(2);

  Tensor<Scalar, 4> in(batch, channels, side, side);
  Tensor<Scalar, 4> out(batch, channels, side, side);
  in.setRandom();

  for (auto _ : state) {
    out = in.cwiseMax(Scalar(0));
    benchmark::DoNotOptimize(out.data());
    benchmark::ClobberMemory();
  }
  // One stream read, one written per iteration.
  state.SetBytesProcessed(state.iterations() * batch * channels * side * side * sizeof(Scalar) * 2);
}
|
||||||
|
|
||||||
|
// Registers square problem sizes for the coefficient-wise benchmarks.
static void CwiseSizes(::benchmark::Benchmark* b) {
  b->Args({256, 256});
  b->Args({1024, 1024});
}
|
||||||
|
|
||||||
|
// Batch x channels x spatial-side combinations for the rank-4 ReLU benchmark.
static void Rank4Sizes(::benchmark::Benchmark* b) {
  b->Args({32, 64, 16});
  b->Args({8, 128, 32});
  b->Args({1, 256, 64});
}
|
||||||
|
|
||||||
|
BENCHMARK(BM_Exp)->Apply(CwiseSizes);
|
||||||
|
BENCHMARK(BM_Log)->Apply(CwiseSizes);
|
||||||
|
BENCHMARK(BM_Tanh)->Apply(CwiseSizes);
|
||||||
|
BENCHMARK(BM_Sigmoid)->Apply(CwiseSizes);
|
||||||
|
BENCHMARK(BM_ReLU)->Apply(CwiseSizes);
|
||||||
|
BENCHMARK(BM_Sqrt)->Apply(CwiseSizes);
|
||||||
|
BENCHMARK(BM_Add)->Apply(CwiseSizes);
|
||||||
|
BENCHMARK(BM_Mul)->Apply(CwiseSizes);
|
||||||
|
BENCHMARK(BM_FMA)->Apply(CwiseSizes);
|
||||||
|
BENCHMARK(BM_ReLU_Rank4)->Apply(Rank4Sizes);
|
||||||
148
unsupported/benchmarks/Tensor/bench_contraction.cpp
Normal file
148
unsupported/benchmarks/Tensor/bench_contraction.cpp
Normal file
@@ -0,0 +1,148 @@
|
|||||||
|
// Benchmarks for Eigen Tensor contraction (generalized GEMM).
|
||||||
|
// Tests single-threaded (DefaultDevice) and multi-threaded (ThreadPoolDevice) variants.
|
||||||
|
|
||||||
|
#define EIGEN_USE_THREADS
|
||||||
|
|
||||||
|
#include <benchmark/benchmark.h>
|
||||||
|
#include <unsupported/Eigen/CXX11/Tensor>
|
||||||
|
#include <unsupported/Eigen/CXX11/ThreadPool>
|
||||||
|
|
||||||
|
using namespace Eigen;
|
||||||
|
|
||||||
|
#ifndef SCALAR
|
||||||
|
#define SCALAR float
|
||||||
|
#endif
|
||||||
|
|
||||||
|
typedef SCALAR Scalar;
|
||||||
|
|
||||||
|
// --- DefaultDevice contraction (rank-2, equivalent to matrix multiply) ---
|
||||||
|
// Rank-2 contraction C(M,N) = A(M,K) * B(K,N) on the default
// (single-threaded) device; equivalent to a GEMM. Reports GFLOPS.
static void BM_Contraction(benchmark::State& state) {
  const int M = state.range(0);
  const int N = state.range(1);
  const int K = state.range(2);

  Tensor<Scalar, 2> A(M, K);
  Tensor<Scalar, 2> B(K, N);
  Tensor<Scalar, 2> C(M, N);
  A.setRandom();
  B.setRandom();

  // Contract dim 1 of A (length K) with dim 0 of B (length K).
  using ContractDims = Tensor<Scalar, 2>::DimensionPair;
  Eigen::array<ContractDims, 1> contract_dims = {ContractDims(1, 0)};

  for (auto _ : state) {
    C = A.contract(B, contract_dims);
    benchmark::DoNotOptimize(C.data());
    benchmark::ClobberMemory();
  }
  // 2*M*N*K flops (one multiply + one add per inner-product term).
  state.counters["GFLOPS"] =
      benchmark::Counter(2.0 * M * N * K, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
}
|
||||||
|
|
||||||
|
// --- ThreadPoolDevice contraction ---
|
||||||
|
// Same GEMM-shaped contraction as BM_Contraction, but evaluated on a
// ThreadPoolDevice so Eigen shards the work across `threads` workers.
static void BM_Contraction_ThreadPool(benchmark::State& state) {
  const int M = state.range(0);
  const int N = state.range(1);
  const int K = state.range(2);
  const int threads = state.range(3);

  Tensor<Scalar, 2> A(M, K);
  Tensor<Scalar, 2> B(K, N);
  Tensor<Scalar, 2> C(M, N);
  A.setRandom();
  B.setRandom();

  // Pool and device are built outside the timing loop, so their
  // construction cost is not measured.
  ThreadPool tp(threads);
  ThreadPoolDevice dev(&tp, threads);

  // Contract dim 1 of A with dim 0 of B (standard matrix product).
  using ContractDims = Tensor<Scalar, 2>::DimensionPair;
  Eigen::array<ContractDims, 1> contract_dims = {ContractDims(1, 0)};

  for (auto _ : state) {
    // .device(dev) routes evaluation through the thread pool.
    C.device(dev) = A.contract(B, contract_dims);
    benchmark::DoNotOptimize(C.data());
    benchmark::ClobberMemory();
  }
  state.counters["GFLOPS"] =
      benchmark::Counter(2.0 * M * N * K, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
  state.counters["threads"] = threads;
}
|
||||||
|
|
||||||
|
// --- Rank-3 batch contraction ---
|
||||||
|
// Batched matrix multiply: for each b, C[b] = A[b] * B[b] with
// A[b] (M,K), B[b] (K,N), C[b] (M,N). Reports aggregate GFLOPS.
//
// Fix: the original contracted dim 2 of the rank-3 A with dim 1 of the
// rank-3 B directly. A single Tensor::contract over one index pair of two
// rank-3 operands produces a rank-4 result of shape (batch, M, batch, N)
// — an all-pairs product, not a batched one — and cannot be assigned to
// the rank-3 tensor C. Eigen's contraction has no built-in batch
// dimension, so we contract matching rank-2 slices via chip().
static void BM_BatchContraction(benchmark::State& state) {
  const int batch = state.range(0);
  const int M = state.range(1);
  const int N = state.range(2);
  const int K = state.range(3);

  Tensor<Scalar, 3> A(batch, M, K);
  Tensor<Scalar, 3> B(batch, K, N);
  Tensor<Scalar, 3> C(batch, M, N);
  A.setRandom();
  B.setRandom();

  // Per-slice GEMM pairing: dim 1 of A[b] (K) with dim 0 of B[b] (K).
  using ContractDims = Tensor<Scalar, 2>::DimensionPair;
  Eigen::array<ContractDims, 1> contract_dims = {ContractDims(1, 0)};

  for (auto _ : state) {
    for (int b = 0; b < batch; ++b) {
      C.chip(b, 0) = A.chip(b, 0).contract(B.chip(b, 0), contract_dims);
    }
    benchmark::DoNotOptimize(C.data());
    benchmark::ClobberMemory();
  }
  state.counters["GFLOPS"] = benchmark::Counter(2.0 * batch * M * N * K, benchmark::Counter::kIsIterationInvariantRate,
                                                benchmark::Counter::kIs1000);
}
|
||||||
|
|
||||||
|
// --- RowMajor contraction ---
|
||||||
|
// Same GEMM-shaped contraction as BM_Contraction, but with RowMajor
// tensor storage on every operand.
static void BM_Contraction_RowMajor(benchmark::State& state) {
  const int M = state.range(0);
  const int N = state.range(1);
  const int K = state.range(2);

  Tensor<Scalar, 2, RowMajor> lhs(M, K);
  Tensor<Scalar, 2, RowMajor> rhs(K, N);
  Tensor<Scalar, 2, RowMajor> out(M, N);
  lhs.setRandom();
  rhs.setRandom();

  // Pair the K dimensions: dim 1 of lhs with dim 0 of rhs.
  using ContractDims = Tensor<Scalar, 2, RowMajor>::DimensionPair;
  Eigen::array<ContractDims, 1> pairs = {ContractDims(1, 0)};

  for (auto _ : state) {
    out = lhs.contract(rhs, pairs);
    benchmark::DoNotOptimize(out.data());
    benchmark::ClobberMemory();
  }
  state.counters["GFLOPS"] =
      benchmark::Counter(2.0 * M * N * K, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
}
|
||||||
|
|
||||||
|
// Square problems from cache-resident to memory-bound, plus two
// non-square shapes.
static void ContractionSizes(::benchmark::Benchmark* b) {
  static const int kSquare[] = {32, 64, 128, 256, 512, 1024};
  for (int s : kSquare) {
    b->Args({s, s, s});
  }
  // Non-square
  b->Args({256, 256, 1024});
  b->Args({1024, 64, 64});
}
|
||||||
|
|
||||||
|
static void ThreadPoolSizes(::benchmark::Benchmark* b) {
|
||||||
|
for (int size : {64, 256, 512, 1024}) {
|
||||||
|
for (int threads : {2, 4, 8}) {
|
||||||
|
b->Args({size, size, size, threads});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void BatchSizes(::benchmark::Benchmark* b) {
|
||||||
|
for (int batch : {1, 8, 32}) {
|
||||||
|
for (int size : {64, 256}) {
|
||||||
|
b->Args({batch, size, size, size});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
BENCHMARK(BM_Contraction)->Apply(ContractionSizes);
|
||||||
|
BENCHMARK(BM_Contraction_RowMajor)->Apply(ContractionSizes);
|
||||||
|
BENCHMARK(BM_Contraction_ThreadPool)->Apply(ThreadPoolSizes);
|
||||||
|
BENCHMARK(BM_BatchContraction)->Apply(BatchSizes);
|
||||||
151
unsupported/benchmarks/Tensor/bench_convolution.cpp
Normal file
151
unsupported/benchmarks/Tensor/bench_convolution.cpp
Normal file
@@ -0,0 +1,151 @@
|
|||||||
|
// Benchmarks for Eigen Tensor convolution (1D and 2D).
|
||||||
|
|
||||||
|
#define EIGEN_USE_THREADS
|
||||||
|
|
||||||
|
#include <benchmark/benchmark.h>
|
||||||
|
#include <unsupported/Eigen/CXX11/Tensor>
|
||||||
|
#include <unsupported/Eigen/CXX11/ThreadPool>
|
||||||
|
|
||||||
|
using namespace Eigen;
|
||||||
|
|
||||||
|
typedef float Scalar;
|
||||||
|
|
||||||
|
// --- 1D convolution ---
|
||||||
|
// 1D valid-mode convolution of a random signal with a random kernel.
static void BM_Convolve1D(benchmark::State& state) {
  const int signal_len = state.range(0);
  const int taps = state.range(1);

  Tensor<Scalar, 1> signal(signal_len);
  Tensor<Scalar, 1> filter(taps);
  signal.setRandom();
  filter.setRandom();

  Eigen::array<int, 1> conv_dims = {0};

  for (auto _ : state) {
    Tensor<Scalar, 1> out = signal.convolve(filter, conv_dims);
    benchmark::DoNotOptimize(out.data());
    benchmark::ClobberMemory();
  }
  // One multiply-add per tap per valid output sample.
  double flops = 2.0 * (signal_len - taps + 1) * taps;
  state.counters["GFLOPS"] =
      benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
}
|
||||||
|
|
||||||
|
// --- 2D convolution ---
|
||||||
|
// 2D valid-mode convolution of an HxW input with a kHxkW kernel,
// convolving over both dimensions. Reports GFLOPS.
static void BM_Convolve2D(benchmark::State& state) {
  const int H = state.range(0);
  const int W = state.range(1);
  const int kH = state.range(2);
  const int kW = state.range(3);

  Tensor<Scalar, 2> input(H, W);
  Tensor<Scalar, 2> kernel(kH, kW);
  input.setRandom();
  kernel.setRandom();

  // Convolve along both tensor dimensions.
  Eigen::array<int, 2> dims = {0, 1};

  for (auto _ : state) {
    // Result allocation is inside the loop and therefore measured.
    Tensor<Scalar, 2> result = input.convolve(kernel, dims);
    benchmark::DoNotOptimize(result.data());
    benchmark::ClobberMemory();
  }
  // One multiply-add per kernel tap per valid output pixel.
  double flops = 2.0 * (H - kH + 1) * (W - kW + 1) * kH * kW;
  state.counters["GFLOPS"] =
      benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
}
|
||||||
|
|
||||||
|
// --- 2D convolution with channels (rank-3: C x H x W, convolve on H,W) ---
|
||||||
|
// 2D convolution applied independently to each of C channels of a
// rank-3 (C, H, H) input: convolve over dims 1 and 2 with one shared
// kHxkH kernel. Mimics a depth-shared spatial filter.
static void BM_Convolve2D_Channels(benchmark::State& state) {
  const int C = state.range(0);
  const int H = state.range(1);
  const int kH = state.range(2);

  Tensor<Scalar, 3> input(C, H, H);
  Tensor<Scalar, 2> kernel(kH, kH);
  input.setRandom();
  kernel.setRandom();

  // Spatial dimensions only; dim 0 (channels) is untouched.
  Eigen::array<int, 2> dims = {1, 2};

  for (auto _ : state) {
    Tensor<Scalar, 3> result = input.convolve(kernel, dims);
    benchmark::DoNotOptimize(result.data());
    benchmark::ClobberMemory();
  }
  int outH = H - kH + 1;
  // Per channel: one multiply-add per tap per valid output pixel.
  double flops = 2.0 * C * outH * outH * kH * kH;
  state.counters["GFLOPS"] =
      benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
}
|
||||||
|
|
||||||
|
// --- 2D convolution with ThreadPool ---
|
||||||
|
// 2D square convolution evaluated on a ThreadPoolDevice. The output
// tensor is preallocated outside the loop, so (unlike BM_Convolve2D)
// allocation cost is excluded from the measurement.
static void BM_Convolve2D_ThreadPool(benchmark::State& state) {
  const int H = state.range(0);
  const int kH = state.range(1);
  const int threads = state.range(2);

  Tensor<Scalar, 2> input(H, H);
  Tensor<Scalar, 2> kernel(kH, kH);
  // Valid-mode output shape: (H - kH + 1) per dimension.
  Tensor<Scalar, 2> result(H - kH + 1, H - kH + 1);
  input.setRandom();
  kernel.setRandom();

  // Pool/device construction is outside the timing loop.
  ThreadPool tp(threads);
  ThreadPoolDevice dev(&tp, threads);

  Eigen::array<int, 2> dims = {0, 1};

  for (auto _ : state) {
    result.device(dev) = input.convolve(kernel, dims);
    benchmark::DoNotOptimize(result.data());
    benchmark::ClobberMemory();
  }
  int outH = H - kH + 1;
  double flops = 2.0 * outH * outH * kH * kH;
  state.counters["GFLOPS"] =
      benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
  state.counters["threads"] = threads;
}
|
||||||
|
|
||||||
|
static void Conv1DSizes(::benchmark::Benchmark* b) {
|
||||||
|
for (int input : {128, 512, 2048}) {
|
||||||
|
for (int kernel : {3, 5, 11}) {
|
||||||
|
b->Args({input, kernel});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void Conv2DSizes(::benchmark::Benchmark* b) {
|
||||||
|
for (int hw : {32, 64, 128, 224}) {
|
||||||
|
for (int k : {3, 5, 7}) {
|
||||||
|
b->Args({hw, hw, k, k});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void Conv2DChannelSizes(::benchmark::Benchmark* b) {
|
||||||
|
for (int c : {3, 64, 128}) {
|
||||||
|
for (int hw : {16, 32, 56}) {
|
||||||
|
for (int k : {3, 5}) {
|
||||||
|
b->Args({c, hw, k});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void Conv2DThreadPoolSizes(::benchmark::Benchmark* b) {
|
||||||
|
for (int hw : {64, 128, 224}) {
|
||||||
|
for (int k : {3, 5}) {
|
||||||
|
for (int threads : {2, 4, 8}) {
|
||||||
|
b->Args({hw, k, threads});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
BENCHMARK(BM_Convolve1D)->Apply(Conv1DSizes);
|
||||||
|
BENCHMARK(BM_Convolve2D)->Apply(Conv2DSizes);
|
||||||
|
BENCHMARK(BM_Convolve2D_Channels)->Apply(Conv2DChannelSizes);
|
||||||
|
BENCHMARK(BM_Convolve2D_ThreadPool)->Apply(Conv2DThreadPoolSizes);
|
||||||
142
unsupported/benchmarks/Tensor/bench_morphing.cpp
Normal file
142
unsupported/benchmarks/Tensor/bench_morphing.cpp
Normal file
@@ -0,0 +1,142 @@
|
|||||||
|
// Benchmarks for Eigen Tensor morphing operations: reshape, slice, chip, pad, stride.
|
||||||
|
|
||||||
|
#include <benchmark/benchmark.h>
|
||||||
|
#include <unsupported/Eigen/CXX11/Tensor>
|
||||||
|
|
||||||
|
using namespace Eigen;
|
||||||
|
|
||||||
|
typedef float Scalar;
|
||||||
|
|
||||||
|
// --- Reshape (zero-cost if no evaluation needed; force eval via assignment) ---
|
||||||
|
// Reshape is a zero-cost view; assigning it into a fresh rank-1 tensor
// forces full evaluation, so this measures a linear copy through the
// reshape expression (allocation included).
static void BM_Reshape(benchmark::State& state) {
  const int M = state.range(0);
  const int N = state.range(1);

  Tensor<Scalar, 2> A(M, N);
  A.setRandom();

  // Flatten (M, N) -> (M*N).
  Eigen::array<Index, 1> new_shape = {M * N};

  for (auto _ : state) {
    Tensor<Scalar, 1> B = A.reshape(new_shape);
    benchmark::DoNotOptimize(B.data());
    benchmark::ClobberMemory();
  }
  state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar));
}
|
||||||
|
|
||||||
|
// --- Slice ---
|
||||||
|
// Materializes the top-left quadrant of A via slice().
static void BM_Slice(benchmark::State& state) {
  const int rows = state.range(0);
  const int cols = state.range(1);

  Tensor<Scalar, 2> src(rows, cols);
  src.setRandom();

  int halfRows = rows / 2;
  int halfCols = cols / 2;
  Eigen::array<Index, 2> offsets = {0, 0};
  Eigen::array<Index, 2> extents = {halfRows, halfCols};

  for (auto _ : state) {
    Tensor<Scalar, 2> quadrant = src.slice(offsets, extents);
    benchmark::DoNotOptimize(quadrant.data());
    benchmark::ClobberMemory();
  }
  // Only the copied quadrant counts toward throughput.
  state.SetBytesProcessed(state.iterations() * halfRows * halfCols * sizeof(Scalar));
}
|
||||||
|
|
||||||
|
// --- Chip (extract a sub-tensor along one dimension) ---
|
||||||
|
// Materializes the first slab of a rank-3 tensor along dimension 0.
static void BM_Chip(benchmark::State& state) {
  const int d0 = state.range(0);
  const int d1 = state.range(1);
  const int d2 = state.range(2);

  Tensor<Scalar, 3> src(d0, d1, d2);
  src.setRandom();

  for (auto _ : state) {
    Tensor<Scalar, 2> slab = src.chip(0, 0);
    benchmark::DoNotOptimize(slab.data());
    benchmark::ClobberMemory();
  }
  // A chip along dim 0 copies d1 * d2 elements.
  state.SetBytesProcessed(state.iterations() * d1 * d2 * sizeof(Scalar));
}
|
||||||
|
|
||||||
|
// --- Pad ---
|
||||||
|
// Pads a 2D tensor with `padSize` elements (default pad value) on every
// side of both dimensions and materializes the enlarged result.
static void BM_Pad(benchmark::State& state) {
  const int M = state.range(0);
  const int N = state.range(1);
  const int padSize = state.range(2);

  Tensor<Scalar, 2> A(M, N);
  A.setRandom();

  // Symmetric (before, after) padding on each dimension.
  Eigen::array<std::pair<int, int>, 2> paddings;
  paddings[0] = {padSize, padSize};
  paddings[1] = {padSize, padSize};

  for (auto _ : state) {
    Tensor<Scalar, 2> B = A.pad(paddings);
    benchmark::DoNotOptimize(B.data());
    benchmark::ClobberMemory();
  }
  // Throughput is based on the padded output size, not the input.
  int outM = M + 2 * padSize;
  int outN = N + 2 * padSize;
  state.SetBytesProcessed(state.iterations() * outM * outN * sizeof(Scalar));
}
|
||||||
|
|
||||||
|
// --- Stride ---
|
||||||
|
// Gathers every `stride`-th element along both dimensions via stride()
// and materializes the downsampled result.
static void BM_Stride(benchmark::State& state) {
  const int M = state.range(0);
  const int N = state.range(1);
  const int stride = state.range(2);

  Tensor<Scalar, 2> A(M, N);
  A.setRandom();

  Eigen::array<Index, 2> strides_arr = {stride, stride};

  for (auto _ : state) {
    Tensor<Scalar, 2> B = A.stride(strides_arr);
    benchmark::DoNotOptimize(B.data());
    benchmark::ClobberMemory();
  }
  // Output extent per dimension is ceil(dim / stride).
  int outM = (M + stride - 1) / stride;
  int outN = (N + stride - 1) / stride;
  state.SetBytesProcessed(state.iterations() * outM * outN * sizeof(Scalar));
}
|
||||||
|
|
||||||
|
static void MorphSizes(::benchmark::Benchmark* b) {
|
||||||
|
for (int size : {256, 1024}) {
|
||||||
|
b->Args({size, size});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void ChipSizes(::benchmark::Benchmark* b) {
|
||||||
|
b->Args({32, 256, 256});
|
||||||
|
b->Args({64, 128, 128});
|
||||||
|
b->Args({8, 512, 512});
|
||||||
|
}
|
||||||
|
|
||||||
|
static void PadSizes(::benchmark::Benchmark* b) {
|
||||||
|
for (int size : {256, 1024}) {
|
||||||
|
for (int pad : {1, 4, 16}) {
|
||||||
|
b->Args({size, size, pad});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void StrideSizes(::benchmark::Benchmark* b) {
|
||||||
|
for (int size : {256, 1024}) {
|
||||||
|
for (int stride : {2, 4}) {
|
||||||
|
b->Args({size, size, stride});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
BENCHMARK(BM_Reshape)->Apply(MorphSizes);
|
||||||
|
BENCHMARK(BM_Slice)->Apply(MorphSizes);
|
||||||
|
BENCHMARK(BM_Chip)->Apply(ChipSizes);
|
||||||
|
BENCHMARK(BM_Pad)->Apply(PadSizes);
|
||||||
|
BENCHMARK(BM_Stride)->Apply(StrideSizes);
|
||||||
158
unsupported/benchmarks/Tensor/bench_reduction.cpp
Normal file
158
unsupported/benchmarks/Tensor/bench_reduction.cpp
Normal file
@@ -0,0 +1,158 @@
|
|||||||
|
// Benchmarks for Eigen Tensor reductions (sum, maximum, mean).
|
||||||
|
// Tests full and partial reductions, inner vs outer dimension, DefaultDevice and ThreadPoolDevice.
|
||||||
|
|
||||||
|
#define EIGEN_USE_THREADS
|
||||||
|
|
||||||
|
#include <benchmark/benchmark.h>
|
||||||
|
#include <unsupported/Eigen/CXX11/Tensor>
|
||||||
|
#include <unsupported/Eigen/CXX11/ThreadPool>
|
||||||
|
|
||||||
|
using namespace Eigen;
|
||||||
|
|
||||||
|
#ifndef SCALAR
|
||||||
|
#define SCALAR float
|
||||||
|
#endif
|
||||||
|
|
||||||
|
typedef SCALAR Scalar;
|
||||||
|
|
||||||
|
// --- Full reduction (rank-2) ---
|
||||||
|
// Full reduction of a rank-2 tensor to a scalar using the reducer functor
// supplied as the template parameter (e.g. internal::SumReducer,
// internal::MaxReducer).
template <typename ReduceOp>
static void BM_FullReduction(benchmark::State& state) {
  const int M = state.range(0);
  const int N = state.range(1);

  Tensor<Scalar, 2> A(M, N);
  A.setRandom();

  for (auto _ : state) {
    // Reduce over both dimensions, producing a rank-0 (scalar) tensor.
    Tensor<Scalar, 0> result = A.reduce(Eigen::array<int, 2>{0, 1}, ReduceOp());
    benchmark::DoNotOptimize(result.data());
    benchmark::ClobberMemory();
  }
  state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar));
}
|
||||||
|
|
||||||
|
// --- Partial reduction along dim 0 (inner dim, ColMajor) ---
|
||||||
|
// Sum-reduces over dimension 0 — the contiguous (inner) dimension for
// the default ColMajor layout.
static void BM_ReduceInner(benchmark::State& state) {
  const int rows = state.range(0);
  const int cols = state.range(1);

  Tensor<Scalar, 2> src(rows, cols);
  src.setRandom();

  Eigen::array<int, 1> dims = {0};

  for (auto _ : state) {
    Tensor<Scalar, 1> sums = src.sum(dims);
    benchmark::DoNotOptimize(sums.data());
    benchmark::ClobberMemory();
  }
  state.SetBytesProcessed(state.iterations() * rows * cols * sizeof(Scalar));
}
|
||||||
|
|
||||||
|
// --- Partial reduction along dim 1 (outer dim, ColMajor) ---
|
||||||
|
// Sum-reduces over dimension 1 — the strided (outer) dimension for the
// default ColMajor layout.
static void BM_ReduceOuter(benchmark::State& state) {
  const int rows = state.range(0);
  const int cols = state.range(1);

  Tensor<Scalar, 2> src(rows, cols);
  src.setRandom();

  Eigen::array<int, 1> dims = {1};

  for (auto _ : state) {
    Tensor<Scalar, 1> sums = src.sum(dims);
    benchmark::DoNotOptimize(sums.data());
    benchmark::ClobberMemory();
  }
  state.SetBytesProcessed(state.iterations() * rows * cols * sizeof(Scalar));
}
|
||||||
|
|
||||||
|
// --- Rank-4 partial reduction (batch x channels x H x W), reduce along spatial dims ---
|
||||||
|
// Partial reduction of a rank-4 (batch, channels, H, H) tensor over the
// two trailing spatial dimensions, leaving a (batch, channels) result —
// the access pattern of global average/sum pooling.
static void BM_ReduceSpatial(benchmark::State& state) {
  const int batch = state.range(0);
  const int C = state.range(1);
  const int H = state.range(2);

  Tensor<Scalar, 4> A(batch, C, H, H);
  A.setRandom();

  // Dims 2 and 3 are the spatial axes.
  Eigen::array<int, 2> reduce_dims = {2, 3};

  for (auto _ : state) {
    Tensor<Scalar, 2> result = A.sum(reduce_dims);
    benchmark::DoNotOptimize(result.data());
    benchmark::ClobberMemory();
  }
  state.SetBytesProcessed(state.iterations() * batch * C * H * H * sizeof(Scalar));
}
|
||||||
|
|
||||||
|
// --- Full reduction with ThreadPoolDevice ---
|
||||||
|
// Full sum reduction to a scalar, evaluated on a ThreadPoolDevice with a
// configurable worker count. Output tensor and pool are preallocated
// outside the timing loop.
static void BM_FullReduction_ThreadPool(benchmark::State& state) {
  const int M = state.range(0);
  const int N = state.range(1);
  const int threads = state.range(2);

  Tensor<Scalar, 2> A(M, N);
  Tensor<Scalar, 0> result;
  A.setRandom();

  ThreadPool tp(threads);
  ThreadPoolDevice dev(&tp, threads);

  for (auto _ : state) {
    // No-argument sum() reduces over all dimensions.
    result.device(dev) = A.sum();
    benchmark::DoNotOptimize(result.data());
    benchmark::ClobberMemory();
  }
  state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar));
  state.counters["threads"] = threads;
}
|
||||||
|
|
||||||
|
// --- Maximum reduction (rank-2) ---
|
||||||
|
// Full maximum reduction of a rank-2 tensor to a scalar.
static void BM_MaxReduction(benchmark::State& state) {
  const int rows = state.range(0);
  const int cols = state.range(1);

  Tensor<Scalar, 2> src(rows, cols);
  src.setRandom();

  for (auto _ : state) {
    Tensor<Scalar, 0> peak = src.maximum();
    benchmark::DoNotOptimize(peak.data());
    benchmark::ClobberMemory();
  }
  state.SetBytesProcessed(state.iterations() * rows * cols * sizeof(Scalar));
}
|
||||||
|
|
||||||
|
static void ReductionSizes(::benchmark::Benchmark* b) {
|
||||||
|
for (int size : {64, 256, 1024}) {
|
||||||
|
b->Args({size, size});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void ThreadPoolReductionSizes(::benchmark::Benchmark* b) {
|
||||||
|
for (int size : {256, 1024}) {
|
||||||
|
for (int threads : {2, 4, 8}) {
|
||||||
|
b->Args({size, size, threads});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void SpatialSizes(::benchmark::Benchmark* b) {
|
||||||
|
for (int batch : {1, 8, 32}) {
|
||||||
|
for (int c : {64, 128}) {
|
||||||
|
for (int h : {16, 32}) {
|
||||||
|
b->Args({batch, c, h});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
BENCHMARK(BM_FullReduction<internal::SumReducer<Scalar>>)->Apply(ReductionSizes)->Name("SumReduction");
|
||||||
|
BENCHMARK(BM_FullReduction<internal::MaxReducer<Scalar>>)->Apply(ReductionSizes)->Name("MaxReduction_Full");
|
||||||
|
BENCHMARK(BM_MaxReduction)->Apply(ReductionSizes);
|
||||||
|
BENCHMARK(BM_ReduceInner)->Apply(ReductionSizes);
|
||||||
|
BENCHMARK(BM_ReduceOuter)->Apply(ReductionSizes);
|
||||||
|
BENCHMARK(BM_ReduceSpatial)->Apply(SpatialSizes);
|
||||||
|
BENCHMARK(BM_FullReduction_ThreadPool)->Apply(ThreadPoolReductionSizes);
|
||||||
115
unsupported/benchmarks/Tensor/bench_shuffling.cpp
Normal file
115
unsupported/benchmarks/Tensor/bench_shuffling.cpp
Normal file
@@ -0,0 +1,115 @@
|
|||||||
|
// Benchmarks for Eigen Tensor shuffling (transpose / permutation).
|
||||||
|
|
||||||
|
#include <benchmark/benchmark.h>
|
||||||
|
#include <unsupported/Eigen/CXX11/Tensor>
|
||||||
|
|
||||||
|
using namespace Eigen;
|
||||||
|
|
||||||
|
typedef float Scalar;
|
||||||
|
|
||||||
|
// --- Rank-2 transpose ---
|
||||||
|
// Full 2D transpose via shuffle(): every element is read once and
// written once into the pre-sized destination.
static void BM_Shuffle2D(benchmark::State& state) {
  const int rows = state.range(0);
  const int cols = state.range(1);

  Tensor<Scalar, 2> src(rows, cols);
  Tensor<Scalar, 2> dst(cols, rows);
  src.setRandom();

  Eigen::array<int, 2> transpose = {1, 0};

  for (auto _ : state) {
    dst = src.shuffle(transpose);
    benchmark::DoNotOptimize(dst.data());
    benchmark::ClobberMemory();
  }
  state.SetBytesProcessed(state.iterations() * rows * cols * sizeof(Scalar) * 2);
}
|
||||||
|
|
||||||
|
// --- Identity shuffle (no permutation, measures overhead) ---
|
||||||
|
// Identity shuffle ({0, 1}): no actual permutation, so this isolates the
// fixed overhead of the shuffle expression machinery against the
// transposing variant above.
static void BM_ShuffleIdentity(benchmark::State& state) {
  const int M = state.range(0);
  const int N = state.range(1);

  Tensor<Scalar, 2> A(M, N);
  Tensor<Scalar, 2> B(M, N);
  A.setRandom();

  Eigen::array<int, 2> perm = {0, 1};

  for (auto _ : state) {
    B = A.shuffle(perm);
    benchmark::DoNotOptimize(B.data());
    benchmark::ClobberMemory();
  }
  state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar) * 2);
}
|
||||||
|
|
||||||
|
// --- Rank-3 permutation ---
|
||||||
|
// Rank-3 cyclic permutation (2, 0, 1); allocation of the result tensor
// happens inside the loop and is therefore measured.
static void BM_Shuffle3D(benchmark::State& state) {
  const int d0 = state.range(0);
  const int d1 = state.range(1);
  const int d2 = state.range(2);

  Tensor<Scalar, 3> src(d0, d1, d2);
  src.setRandom();

  Eigen::array<int, 3> cycle = {2, 0, 1};

  for (auto _ : state) {
    Tensor<Scalar, 3> dst = src.shuffle(cycle);
    benchmark::DoNotOptimize(dst.data());
    benchmark::ClobberMemory();
  }
  state.SetBytesProcessed(state.iterations() * d0 * d1 * d2 * sizeof(Scalar) * 2);
}
|
||||||
|
|
||||||
|
// --- Rank-4 permutation (NCHW -> NHWC layout conversion) ---
|
||||||
|
// Rank-4 permutation converting NCHW storage to NHWC — a common layout
// conversion between deep-learning frameworks.
static void BM_Shuffle4D_NCHW_to_NHWC(benchmark::State& state) {
  const int N = state.range(0);
  const int C = state.range(1);
  const int H = state.range(2);

  Tensor<Scalar, 4> A(N, C, H, H);
  A.setRandom();

  // NCHW -> NHWC: permute (0, 2, 3, 1).
  Eigen::array<int, 4> perm = {0, 2, 3, 1};

  for (auto _ : state) {
    Tensor<Scalar, 4> B = A.shuffle(perm);
    benchmark::DoNotOptimize(B.data());
    benchmark::ClobberMemory();
  }
  state.SetBytesProcessed(state.iterations() * N * C * H * H * sizeof(Scalar) * 2);
}
|
||||||
|
|
||||||
|
static void Shuffle2DSizes(::benchmark::Benchmark* b) {
|
||||||
|
for (int size : {256, 1024}) {
|
||||||
|
b->Args({size, size});
|
||||||
|
}
|
||||||
|
b->Args({64, 4096});
|
||||||
|
b->Args({4096, 64});
|
||||||
|
}
|
||||||
|
|
||||||
|
static void Shuffle3DSizes(::benchmark::Benchmark* b) {
|
||||||
|
b->Args({64, 64, 64});
|
||||||
|
b->Args({128, 128, 64});
|
||||||
|
b->Args({32, 256, 256});
|
||||||
|
}
|
||||||
|
|
||||||
|
static void Shuffle4DSizes(::benchmark::Benchmark* b) {
|
||||||
|
for (int batch : {1, 8}) {
|
||||||
|
for (int c : {3, 64}) {
|
||||||
|
for (int h : {32, 64}) {
|
||||||
|
b->Args({batch, c, h});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
BENCHMARK(BM_Shuffle2D)->Apply(Shuffle2DSizes);
|
||||||
|
BENCHMARK(BM_ShuffleIdentity)->Apply(Shuffle2DSizes);
|
||||||
|
BENCHMARK(BM_Shuffle3D)->Apply(Shuffle3DSizes);
|
||||||
|
BENCHMARK(BM_Shuffle4D_NCHW_to_NHWC)->Apply(Shuffle4DSizes);
|
||||||
80
unsupported/benchmarks/Tensor/bench_tensor_fft.cpp
Normal file
80
unsupported/benchmarks/Tensor/bench_tensor_fft.cpp
Normal file
@@ -0,0 +1,80 @@
|
|||||||
|
// Benchmarks for Eigen Tensor FFT.
|
||||||
|
|
||||||
|
#include <benchmark/benchmark.h>
|
||||||
|
#include <unsupported/Eigen/CXX11/Tensor>
|
||||||
|
|
||||||
|
using namespace Eigen;
|
||||||
|
|
||||||
|
#ifndef SCALAR
|
||||||
|
#define SCALAR float
|
||||||
|
#endif
|
||||||
|
|
||||||
|
typedef SCALAR Scalar;
|
||||||
|
|
||||||
|
// --- 1D FFT ---
|
||||||
|
// Forward FFT of a real rank-1 tensor, producing a full complex
// spectrum (BothParts).
static void BM_TensorFFT_1D(benchmark::State& state) {
  const int N = state.range(0);

  Tensor<Scalar, 1> input(N);
  input.setRandom();

  Eigen::array<int, 1> fft_dims = {0};

  for (auto _ : state) {
    Tensor<std::complex<Scalar>, 1> result = input.template fft<BothParts, FFT_FORWARD>(fft_dims);
    benchmark::DoNotOptimize(result.data());
    benchmark::ClobberMemory();
  }
  // Standard 5*N*log2(N) FFT flop estimate, halved for real input.
  double mflops = 5.0 * N * std::log2(static_cast<double>(N)) / 2.0;  // real->complex
  state.counters["MFLOPS"] =
      benchmark::Counter(mflops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
}
|
||||||
|
|
||||||
|
// --- 2D FFT ---
|
||||||
|
// Forward 2D FFT of a real NxN tensor over both axes.
static void BM_TensorFFT_2D(benchmark::State& state) {
  const int N = state.range(0);

  Tensor<Scalar, 2> signal(N, N);
  signal.setRandom();

  Eigen::array<int, 2> axes = {0, 1};

  for (auto _ : state) {
    Tensor<std::complex<Scalar>, 2> spectrum = signal.template fft<BothParts, FFT_FORWARD>(axes);
    benchmark::DoNotOptimize(spectrum.data());
    benchmark::ClobberMemory();
  }
  // 5 * (N*N) * log2(N) flop estimate for the 2D transform.
  double points = N * N;
  double mflops = 5.0 * points * std::log2(static_cast<double>(N));
  state.counters["MFLOPS"] =
      benchmark::Counter(mflops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
}
|
||||||
|
|
||||||
|
// --- 1D inverse FFT ---
|
||||||
|
// Inverse FFT of a complex rank-1 tensor (complex -> complex).
static void BM_TensorIFFT_1D(benchmark::State& state) {
  const int N = state.range(0);

  Tensor<std::complex<Scalar>, 1> input(N);
  input.setRandom();

  Eigen::array<int, 1> fft_dims = {0};

  for (auto _ : state) {
    Tensor<std::complex<Scalar>, 1> result = input.template fft<BothParts, FFT_REVERSE>(fft_dims);
    benchmark::DoNotOptimize(result.data());
    benchmark::ClobberMemory();
  }
  // Full 5*N*log2(N) estimate (no real-input halving here).
  double mflops = 5.0 * N * std::log2(static_cast<double>(N));
  state.counters["MFLOPS"] =
      benchmark::Counter(mflops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
}
|
||||||
|
|
||||||
|
// Power-of-four transform lengths: 64, 256, 1024, 4096.
static void FFTSizes(::benchmark::Benchmark* b) {
  for (int n = 64; n <= 4096; n *= 4) {
    b->Arg(n);
  }
}
|
||||||
|
|
||||||
|
BENCHMARK(BM_TensorFFT_1D)->Apply(FFTSizes);
|
||||||
|
BENCHMARK(BM_TensorFFT_2D)->Apply(FFTSizes);
|
||||||
|
BENCHMARK(BM_TensorIFFT_1D)->Apply(FFTSizes);
|
||||||
Reference in New Issue
Block a user