From 3108f6360e5311a39551c8f043111b207d957fee Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen <4643818-rmlarsen1@users.noreply.gitlab.com> Date: Tue, 17 Feb 2026 20:51:36 -0800 Subject: [PATCH] Migrate Eigen benchmarks to the Google benchmark framework libeigen/eigen!2132 Closes #3025 Co-authored-by: Rasmus Munk Larsen --- benchmarks/CMakeLists.txt | 60 +++++++++ benchmarks/benchBlasGemm.cpp | 73 +++++++++++ benchmarks/benchCholesky.cpp | 48 +++++++ benchmarks/benchEigenSolver.cpp | 42 ++++++ benchmarks/benchFFT.cpp | 43 ++++++ benchmarks/benchGemm.cpp | 81 ++++++++++++ benchmarks/benchGemv.cpp | 160 +++++++++++++++++++++++ benchmarks/benchGeometry.cpp | 43 ++++++ benchmarks/benchVecAdd.cpp | 28 ++++ benchmarks/bench_move_semantics.cpp | 41 ++++++ benchmarks/bench_reverse.cpp | 30 +++++ benchmarks/bench_svd.cpp | 103 +++++++++++++++ benchmarks/bench_trsm.cpp | 99 ++++++++++++++ benchmarks/benchmark_aocl.cpp | 123 +++++++++++++++++ benchmarks/benchmark_blocking_sizes.cpp | 76 +++++++++++ benchmarks/dense_solvers.cpp | 167 ++++++++++++++++++++++++ benchmarks/eig33.cpp | 97 ++++++++++++++ benchmarks/geometry.cpp | 71 ++++++++++ benchmarks/print_blocking.cpp | 51 ++++++++ benchmarks/quatmul.cpp | 38 ++++++ benchmarks/sparse_dense_product.cpp | 64 +++++++++ benchmarks/sparse_product.cpp | 49 +++++++ benchmarks/sparse_transpose.cpp | 45 +++++++ 23 files changed, 1632 insertions(+) create mode 100644 benchmarks/CMakeLists.txt create mode 100644 benchmarks/benchBlasGemm.cpp create mode 100644 benchmarks/benchCholesky.cpp create mode 100644 benchmarks/benchEigenSolver.cpp create mode 100644 benchmarks/benchFFT.cpp create mode 100644 benchmarks/benchGemm.cpp create mode 100644 benchmarks/benchGemv.cpp create mode 100644 benchmarks/benchGeometry.cpp create mode 100644 benchmarks/benchVecAdd.cpp create mode 100644 benchmarks/bench_move_semantics.cpp create mode 100644 benchmarks/bench_reverse.cpp create mode 100644 benchmarks/bench_svd.cpp create mode 100644 benchmarks/bench_trsm.cpp create mode 100644 benchmarks/benchmark_aocl.cpp create mode 100644 benchmarks/benchmark_blocking_sizes.cpp create mode 100644 benchmarks/dense_solvers.cpp create mode 100644 benchmarks/eig33.cpp create mode 100644 benchmarks/geometry.cpp create mode 100644 benchmarks/print_blocking.cpp create mode 100644 benchmarks/quatmul.cpp create mode 100644 benchmarks/sparse_dense_product.cpp create mode 100644 benchmarks/sparse_product.cpp create mode 100644 benchmarks/sparse_transpose.cpp diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt new file mode 100644 index 000000000..9156bbdcc --- /dev/null +++ b/benchmarks/CMakeLists.txt @@ -0,0 +1,60 @@ +cmake_minimum_required(VERSION 3.10) +project(EigenBenchmarks CXX) + +find_package(benchmark REQUIRED) + +# Eigen is a header-only library; find it relative to this directory. +set(EIGEN_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/..") + +# Helper: add a Google Benchmark target. +# eigen_add_benchmark(name source [LIBRARIES lib1 lib2 ...] [DEFINITIONS def1 def2 ...]) +function(eigen_add_benchmark name source) + cmake_parse_arguments(BENCH "" "" "LIBRARIES;DEFINITIONS" ${ARGN}) + add_executable(${name} ${source}) + target_include_directories(${name} PRIVATE ${EIGEN_SOURCE_DIR}) + target_link_libraries(${name} PRIVATE benchmark::benchmark benchmark::benchmark_main) + if(BENCH_LIBRARIES) + target_link_libraries(${name} PRIVATE ${BENCH_LIBRARIES}) + endif() + target_compile_options(${name} PRIVATE -O3 -DNDEBUG) + if(BENCH_DEFINITIONS) + target_compile_definitions(${name} PRIVATE ${BENCH_DEFINITIONS}) + endif() +endfunction() + +# --- Dense benchmarks --- +eigen_add_benchmark(bench_cholesky benchCholesky.cpp) +eigen_add_benchmark(bench_eigensolver benchEigenSolver.cpp) +eigen_add_benchmark(bench_fft benchFFT.cpp) +eigen_add_benchmark(bench_geometry_transforms benchGeometry.cpp) +eigen_add_benchmark(bench_vecadd benchVecAdd.cpp) +eigen_add_benchmark(bench_gemm benchGemm.cpp) +eigen_add_benchmark(bench_gemm_double benchGemm.cpp DEFINITIONS SCALAR=double) +eigen_add_benchmark(bench_gemv benchGemv.cpp) +eigen_add_benchmark(bench_move_semantics bench_move_semantics.cpp) +eigen_add_benchmark(bench_reverse bench_reverse.cpp) +eigen_add_benchmark(bench_dense_solvers dense_solvers.cpp) +eigen_add_benchmark(bench_trsm bench_trsm.cpp) +eigen_add_benchmark(bench_svd bench_svd.cpp) +eigen_add_benchmark(bench_eig33 eig33.cpp) +eigen_add_benchmark(bench_geometry geometry.cpp) +eigen_add_benchmark(bench_quatmul quatmul.cpp) + +# --- Sparse benchmarks --- +eigen_add_benchmark(bench_sparse_dense_product sparse_dense_product.cpp) +eigen_add_benchmark(bench_sparse_product sparse_product.cpp) +eigen_add_benchmark(bench_sparse_transpose sparse_transpose.cpp) + +# --- GEMM blocking parameter sweep --- +eigen_add_benchmark(bench_blocking_sizes benchmark_blocking_sizes.cpp) + +# --- AOCL benchmark (AOCL optional) --- +eigen_add_benchmark(bench_aocl benchmark_aocl.cpp) + +# Optional: BLAS GEMM comparison benchmark (requires CBLAS) +find_package(BLAS QUIET) +if(BLAS_FOUND) + eigen_add_benchmark(bench_blas_gemm benchBlasGemm.cpp + LIBRARIES ${BLAS_LIBRARIES} + DEFINITIONS HAVE_BLAS) +endif() diff --git a/benchmarks/benchBlasGemm.cpp b/benchmarks/benchBlasGemm.cpp new file mode 100644 index 000000000..33b34da65 --- /dev/null +++ b/benchmarks/benchBlasGemm.cpp @@ -0,0 +1,73 @@ +// Benchmark: Eigen GEMM vs CBLAS GEMM +// Requires CBLAS: compile with -DHAVE_BLAS and link -lcblas +// +// Based on bench/benchBlasGemm.cpp + +#include +#include + +using namespace Eigen; + +#ifndef SCALAR +#define SCALAR float +#endif + +typedef SCALAR Scalar; +typedef Matrix MyMatrix; + +static void BM_EigenGemm(benchmark::State& state) { + int M = state.range(0); + int N = state.range(1); + int K = state.range(2); + MyMatrix a = MyMatrix::Random(M, K); + MyMatrix b = MyMatrix::Random(K, N); + MyMatrix c = MyMatrix::Random(M, N); + for (auto _ : state) { + c.noalias() += a * b; + benchmark::DoNotOptimize(c.data()); + } + state.counters["GFLOPS"] = + benchmark::Counter(2.0 * M * N * K, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000); +} + +#ifdef HAVE_BLAS +extern "C" { +#include +} + +#ifdef _FLOAT +#define CBLAS_GEMM cblas_sgemm +#else +#define CBLAS_GEMM cblas_dgemm +#endif + +static void BM_CblasGemm(benchmark::State& state) { + int M = state.range(0); + int N = state.range(1); + int K = state.range(2); + MyMatrix a = MyMatrix::Random(M, K); + MyMatrix b = MyMatrix::Random(K, N); + MyMatrix c = MyMatrix::Random(M, N); + Scalar alpha = 1, beta = 1; + for (auto _ : state) { + CBLAS_GEMM(CblasColMajor, CblasNoTrans, CblasNoTrans, M, N, K, alpha, a.data(), M, b.data(), K, beta, c.data(), M); + benchmark::DoNotOptimize(c.data()); + } + state.counters["GFLOPS"] = + benchmark::Counter(2.0 * M * N * K, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000); +} +#endif + +static void GemmSizes(::benchmark::Benchmark* b) { + for (int s : {32, 64, 128, 256, 512, 1024, 2048}) { + b->Args({s, s, s}); + } + // Rectangular + b->Args({1000, 100, 1000}); + b->Args({100, 1000, 100}); +} + +BENCHMARK(BM_EigenGemm)->Apply(GemmSizes); +#ifdef HAVE_BLAS +BENCHMARK(BM_CblasGemm)->Apply(GemmSizes); +#endif diff --git a/benchmarks/benchCholesky.cpp b/benchmarks/benchCholesky.cpp new file mode 100644 index 000000000..cbce03175 --- /dev/null +++ b/benchmarks/benchCholesky.cpp @@ -0,0 +1,48 @@ +#include +#include +#include + +using namespace Eigen; + +typedef float Scalar; + +static void BM_LDLT(benchmark::State& state) { + int n = state.range(0); + typedef Matrix MatrixType; + typedef Matrix SquareMatrixType; + MatrixType a = MatrixType::Random(n, n); + SquareMatrixType covMat = a * a.adjoint(); + int r = internal::random(0, n - 1); + int c = internal::random(0, n - 1); + Scalar acc = 0; + for (auto _ : state) { + LDLT cholnosqrt(covMat); + acc += cholnosqrt.matrixL().coeff(r, c); + benchmark::DoNotOptimize(acc); + } +} +BENCHMARK(BM_LDLT)->RangeMultiplier(2)->Range(4, 1500); + +static void BM_LLT(benchmark::State& state) { + int n = state.range(0); + typedef Matrix MatrixType; + typedef Matrix SquareMatrixType; + MatrixType a = MatrixType::Random(n, n); + SquareMatrixType covMat = a * a.adjoint(); + int r = internal::random(0, n - 1); + int c = internal::random(0, n - 1); + Scalar acc = 0; + for (auto _ : state) { + LLT chol(covMat); + acc += chol.matrixL().coeff(r, c); + benchmark::DoNotOptimize(acc); + } + double cost = 0; + for (int j = 0; j < n; ++j) { + int rem = std::max(n - j - 1, 0); + cost += 2 * (rem * j + rem + j); + } + state.counters["GFLOPS"] = + benchmark::Counter(cost, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000); +} +BENCHMARK(BM_LLT)->RangeMultiplier(2)->Range(4, 1500); diff --git a/benchmarks/benchEigenSolver.cpp b/benchmarks/benchEigenSolver.cpp new file mode 100644 index 000000000..4cb1ccb2f --- /dev/null +++ b/benchmarks/benchEigenSolver.cpp @@ -0,0 +1,42 @@ +#include +#include +#include +#include + +using namespace Eigen; + +typedef float Scalar; + +static void BM_SelfAdjointEigenSolver(benchmark::State& state) { + int n = state.range(0); + typedef Matrix MatrixType; + MatrixType a = MatrixType::Random(n, n); + MatrixType covMat = a * a.adjoint(); + int r = internal::random(0, n - 1); + int c = internal::random(0, n - 1); + Scalar acc = 0; + SelfAdjointEigenSolver ei(covMat); + for (auto _ : state) { + ei.compute(covMat); + acc += ei.eigenvectors().coeff(r, c); + benchmark::DoNotOptimize(acc); + } +} +BENCHMARK(BM_SelfAdjointEigenSolver)->RangeMultiplier(2)->Range(4, 512); + +static void BM_EigenSolver(benchmark::State& state) { + int n = state.range(0); + typedef Matrix MatrixType; + MatrixType a = MatrixType::Random(n, n); + MatrixType covMat = a * a.adjoint(); + int r = internal::random(0, n - 1); + int c = internal::random(0, n - 1); + Scalar acc = 0; + EigenSolver ei(covMat); + for (auto _ : state) { + ei.compute(covMat); + acc += std::norm(ei.eigenvectors().coeff(r, c)); + benchmark::DoNotOptimize(acc); + } +} +BENCHMARK(BM_EigenSolver)->RangeMultiplier(2)->Range(4, 512); diff --git a/benchmarks/benchFFT.cpp b/benchmarks/benchFFT.cpp new file mode 100644 index 000000000..391178faa --- /dev/null +++ b/benchmarks/benchFFT.cpp @@ -0,0 +1,43 @@ +#include +#include +#include +#include +#include + +using namespace Eigen; + +template +static void BM_FFT(benchmark::State& state) { + typedef typename NumTraits::Real ScalarType; + typedef std::complex Complex; + int nfft = state.range(0); + std::vector inbuf(nfft); + std::vector outbuf(nfft); + FFT fft; + if (Unscaled) fft.SetFlag(fft.Unscaled); + if (HalfSpec) fft.SetFlag(fft.HalfSpectrum); + std::fill(inbuf.begin(), inbuf.end(), T(0)); + fft.fwd(outbuf, inbuf); + + for (auto _ : state) { + if (Forward) + fft.fwd(outbuf, inbuf); + else + fft.inv(inbuf, outbuf); + benchmark::DoNotOptimize(outbuf.data()); + benchmark::DoNotOptimize(inbuf.data()); + } + double mflops_per_iter = 5.0 * nfft * std::log2(static_cast(nfft)); + if (!NumTraits::IsComplex) mflops_per_iter /= 2; + state.counters["MFLOPS"] = + benchmark::Counter(mflops_per_iter, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000); +} + +BENCHMARK(BM_FFT, true>)->Arg(1024)->Arg(4096); +BENCHMARK(BM_FFT, false>)->Arg(1024)->Arg(4096); +BENCHMARK(BM_FFT)->Arg(1024)->Arg(4096); +BENCHMARK(BM_FFT)->Arg(1024)->Arg(4096); +BENCHMARK(BM_FFT, true>)->Arg(1024)->Arg(4096); +BENCHMARK(BM_FFT, false>)->Arg(1024)->Arg(4096); +BENCHMARK(BM_FFT)->Arg(1024)->Arg(4096); +BENCHMARK(BM_FFT)->Arg(1024)->Arg(4096); diff --git a/benchmarks/benchGemm.cpp b/benchmarks/benchGemm.cpp new file mode 100644 index 000000000..db061b7fe --- /dev/null +++ b/benchmarks/benchGemm.cpp @@ -0,0 +1,81 @@ +#include +#include + +using namespace Eigen; + +#ifndef SCALAR +#define SCALAR float +#endif + +typedef SCALAR Scalar; +typedef Matrix Mat; + +template +EIGEN_DONT_INLINE void gemm(const A& a, const B& b, C& c) { + c.noalias() += a * b; +} + +static void BM_EigenGemm(benchmark::State& state) { + int m = state.range(0); + int n = state.range(1); + int p = state.range(2); + Mat a(m, p); + a.setRandom(); + Mat b(p, n); + b.setRandom(); + Mat c = Mat::Zero(m, n); + for (auto _ : state) { + c.setZero(); + gemm(a, b, c); + benchmark::DoNotOptimize(c.data()); + benchmark::ClobberMemory(); + } + state.counters["GFLOPS"] = + benchmark::Counter(2.0 * m * n * p, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000); +} + +static void GemmSizes(::benchmark::Benchmark* b) { + for (int size : {8, 16, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 384, 448, 512, 768, 1024, 1536, 2048}) { + b->Args({size, size, size}); + } + // Non-square sizes + b->Args({64, 64, 1024}); + b->Args({1024, 64, 64}); + b->Args({64, 1024, 64}); + b->Args({256, 256, 1024}); + b->Args({1024, 256, 256}); +} + +BENCHMARK(BM_EigenGemm)->Apply(GemmSizes); + +#ifdef HAVE_BLAS +extern "C" { +#include +} + +static void BM_BlasGemm(benchmark::State& state) { + int m = state.range(0); + int n = state.range(1); + int p = state.range(2); + Mat a(m, p); + a.setRandom(); + Mat b(p, n); + b.setRandom(); + Mat c = Mat::Zero(m, n); + char notrans = 'N'; + Scalar one = 1, zero = 0; + for (auto _ : state) { + c.setZero(); + if constexpr (std::is_same_v) { + sgemm_(¬rans, ¬rans, &m, &n, &p, &one, a.data(), &m, b.data(), &p, &one, c.data(), &m); + } else { + dgemm_(¬rans, ¬rans, &m, &n, &p, &one, a.data(), &m, b.data(), &p, &one, c.data(), &m); + } + benchmark::DoNotOptimize(c.data()); + benchmark::ClobberMemory(); + } + state.counters["GFLOPS"] = + benchmark::Counter(2.0 * m * n * p, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000); +} +BENCHMARK(BM_BlasGemm)->Apply(GemmSizes); +#endif diff --git a/benchmarks/benchGemv.cpp b/benchmarks/benchGemv.cpp new file mode 100644 index 000000000..cc03973a6 --- /dev/null +++ b/benchmarks/benchGemv.cpp @@ -0,0 +1,160 @@ +// Benchmark for dense general matrix-vector multiplication (GEMV). +// +// Tests performance of y += op(A) * x for various matrix sizes, aspect ratios, +// scalar types, and operation variants (transpose, conjugate, adjoint). +// +// The Eigen GEMV kernel (Eigen/src/Core/products/GeneralMatrixVector.h) has +// two main specializations: +// - ColMajor kernel: used for y += A * x with column-major A. +// Processes vertical panels, vectorizes along rows. +// - RowMajor kernel: used for y += A^T * x with column-major A. +// Processes groups of rows, vectorizes the dot product along columns. +// +// For complex scalars, conjugation flags (ConjugateLhs, ConjugateRhs) select +// additional code paths within each kernel via conj_helper. +// +// Operation mapping (for column-major stored A): +// Gemv y += A * x -> ColMajor kernel, no conjugation +// GemvTrans y += A^T * x -> RowMajor kernel, no conjugation +// GemvConj y += conj(A) * x -> ColMajor kernel, ConjugateLhs=true +// GemvAdj y += A^H * x -> RowMajor kernel, ConjugateLhs=true + +#include +#include + +using namespace Eigen; + +// ---------- Benchmark helpers ---------- + +// GEMV flop count: 2*m*n for real, 8*m*n for complex. +template +double gemvFlops(Index m, Index n) { + return (NumTraits::IsComplex ? 8.0 : 2.0) * m * n; +} + +// ---------- y += A * x (ColMajor GEMV kernel, no conjugation) ---------- + +template +static void BM_Gemv(benchmark::State& state) { + using Mat = Matrix; + using Vec = Matrix; + const Index m = state.range(0); + const Index n = state.range(1); + Mat A = Mat::Random(m, n); + Vec x = Vec::Random(n); + Vec y = Vec::Random(m); + for (auto _ : state) { + y.noalias() += A * x; + benchmark::DoNotOptimize(y.data()); + benchmark::ClobberMemory(); + } + state.counters["GFLOPS"] = benchmark::Counter(gemvFlops(m, n), benchmark::Counter::kIsIterationInvariantRate, + benchmark::Counter::kIs1000); +} + +// ---------- y += A^T * x (RowMajor GEMV kernel, no conjugation) ---------- + +template +static void BM_GemvTrans(benchmark::State& state) { + using Mat = Matrix; + using Vec = Matrix; + const Index m = state.range(0); + const Index n = state.range(1); + Mat A = Mat::Random(m, n); + Vec x = Vec::Random(m); + Vec y = Vec::Random(n); + for (auto _ : state) { + y.noalias() += A.transpose() * x; + benchmark::DoNotOptimize(y.data()); + benchmark::ClobberMemory(); + } + state.counters["GFLOPS"] = benchmark::Counter(gemvFlops(m, n), benchmark::Counter::kIsIterationInvariantRate, + benchmark::Counter::kIs1000); +} + +// ---------- y += conj(A) * x (ColMajor kernel, ConjugateLhs=true) ---------- + +template +static void BM_GemvConj(benchmark::State& state) { + using Mat = Matrix; + using Vec = Matrix; + const Index m = state.range(0); + const Index n = state.range(1); + Mat A = Mat::Random(m, n); + Vec x = Vec::Random(n); + Vec y = Vec::Random(m); + for (auto _ : state) { + y.noalias() += A.conjugate() * x; + benchmark::DoNotOptimize(y.data()); + benchmark::ClobberMemory(); + } + state.counters["GFLOPS"] = benchmark::Counter(gemvFlops(m, n), benchmark::Counter::kIsIterationInvariantRate, + benchmark::Counter::kIs1000); +} + +// ---------- y += A^H * x (RowMajor kernel, ConjugateLhs=true) ---------- + +template +static void BM_GemvAdj(benchmark::State& state) { + using Mat = Matrix; + using Vec = Matrix; + const Index m = state.range(0); + const Index n = state.range(1); + Mat A = Mat::Random(m, n); + Vec x = Vec::Random(m); + Vec y = Vec::Random(n); + for (auto _ : state) { + y.noalias() += A.adjoint() * x; + benchmark::DoNotOptimize(y.data()); + benchmark::ClobberMemory(); + } + state.counters["GFLOPS"] = benchmark::Counter(gemvFlops(m, n), benchmark::Counter::kIsIterationInvariantRate, + benchmark::Counter::kIs1000); +} + +// ---------- Size configurations ---------- +// All sizes refer to the stored matrix A (m rows, n cols). + +static void GemvSizes(::benchmark::Benchmark* b) { + // Square matrices: exercises balanced kernel behavior. + for (int size : {8, 16, 32, 64, 128, 256, 512, 1024, 4096}) { + b->Args({size, size}); + } + // Tall-thin (m >> n): in ColMajor kernel, the inner vectorized loop over rows + // is long while the outer column loop is short. In RowMajor kernel (transpose), + // there are many rows to process but short dot products. + for (int n : {1, 4, 16, 64}) { + for (int m : {256, 1024, 4096}) { + if (m != n) b->Args({m, n}); + } + } + // Short-wide (m << n): in ColMajor kernel, the outer column loop is long but + // the inner vectorized loop over rows is short. In RowMajor kernel (transpose), + // there are few rows but long dot products. + for (int m : {1, 4, 16, 64}) { + for (int n : {256, 1024, 4096}) { + if (m != n) b->Args({m, n}); + } + } +} + +// ---------- Register benchmarks ---------- + +// Real types: Gemv and GemvTrans exercise the two kernel specializations. +// Conjugation is a no-op for real scalars. + +BENCHMARK(BM_Gemv)->Apply(GemvSizes)->Name("Gemv_float"); +BENCHMARK(BM_Gemv)->Apply(GemvSizes)->Name("Gemv_double"); +BENCHMARK(BM_GemvTrans)->Apply(GemvSizes)->Name("GemvTrans_float"); +BENCHMARK(BM_GemvTrans)->Apply(GemvSizes)->Name("GemvTrans_double"); + +// Complex types: all four variants exercise distinct kernel code paths. + +BENCHMARK(BM_Gemv>)->Apply(GemvSizes)->Name("Gemv_cfloat"); +BENCHMARK(BM_Gemv>)->Apply(GemvSizes)->Name("Gemv_cdouble"); +BENCHMARK(BM_GemvTrans>)->Apply(GemvSizes)->Name("GemvTrans_cfloat"); +BENCHMARK(BM_GemvTrans>)->Apply(GemvSizes)->Name("GemvTrans_cdouble"); +BENCHMARK(BM_GemvConj>)->Apply(GemvSizes)->Name("GemvConj_cfloat"); +BENCHMARK(BM_GemvConj>)->Apply(GemvSizes)->Name("GemvConj_cdouble"); +BENCHMARK(BM_GemvAdj>)->Apply(GemvSizes)->Name("GemvAdj_cfloat"); +BENCHMARK(BM_GemvAdj>)->Apply(GemvSizes)->Name("GemvAdj_cdouble"); diff --git a/benchmarks/benchGeometry.cpp b/benchmarks/benchGeometry.cpp new file mode 100644 index 000000000..c64f731aa --- /dev/null +++ b/benchmarks/benchGeometry.cpp @@ -0,0 +1,43 @@ +#include +#include +#include + +using namespace Eigen; + +template +static void BM_TransformVec(benchmark::State& state) { + typedef Transform Trans; + typedef Matrix Vec; + Trans t; + t.setIdentity(); + Vec v; + v.setRandom(); + for (auto _ : state) { + v = t * v; + benchmark::DoNotOptimize(v.data()); + } +} + +template +static void BM_TransformTransform(benchmark::State& state) { + typedef Transform Trans; + Trans t1, t2; + t1.setIdentity(); + t2.setIdentity(); + for (auto _ : state) { + t2 = Trans(t1 * t2); + benchmark::DoNotOptimize(t2.data()); + } +} + +BENCHMARK(BM_TransformVec); +BENCHMARK(BM_TransformVec); +BENCHMARK(BM_TransformVec); +BENCHMARK(BM_TransformVec); +BENCHMARK(BM_TransformVec); +BENCHMARK(BM_TransformVec); + +BENCHMARK(BM_TransformTransform); +BENCHMARK(BM_TransformTransform); +BENCHMARK(BM_TransformTransform); +BENCHMARK(BM_TransformTransform); diff --git a/benchmarks/benchVecAdd.cpp b/benchmarks/benchVecAdd.cpp new file mode 100644 index 000000000..b4db14b72 --- /dev/null +++ b/benchmarks/benchVecAdd.cpp @@ -0,0 +1,28 @@ +#include +#include + +using namespace Eigen; + +static void BM_VecAdd(benchmark::State& state) { + int size = state.range(0); + VectorXf a = VectorXf::Random(size); + VectorXf b = VectorXf::Random(size); + for (auto _ : state) { + a = a + b; + benchmark::DoNotOptimize(a.data()); + } + state.SetBytesProcessed(state.iterations() * size * sizeof(float) * 3); +} +BENCHMARK(BM_VecAdd)->RangeMultiplier(4)->Range(64, 1 << 20); + +static void BM_MatAdd(benchmark::State& state) { + int n = state.range(0); + MatrixXf a = MatrixXf::Random(n, n); + MatrixXf b = MatrixXf::Random(n, n); + for (auto _ : state) { + a = a + b; + benchmark::DoNotOptimize(a.data()); + } + state.SetBytesProcessed(state.iterations() * n * n * sizeof(float) * 3); +} +BENCHMARK(BM_MatAdd)->RangeMultiplier(2)->Range(8, 512); diff --git a/benchmarks/bench_move_semantics.cpp b/benchmarks/bench_move_semantics.cpp new file mode 100644 index 000000000..be3e99e8c --- /dev/null +++ b/benchmarks/bench_move_semantics.cpp @@ -0,0 +1,41 @@ +#include +#include +#include "../test/MovableScalar.h" +#include + +template +void copy_matrix(MatrixType& m) { + MatrixType tmp(m); + m = tmp; +} + +template +void move_matrix(MatrixType&& m) { + MatrixType tmp(std::move(m)); + m = std::move(tmp); +} + +template +static void BM_CopySemantics(benchmark::State& state) { + using MatrixType = Eigen::Matrix, 1, 10>; + MatrixType data = MatrixType::Random().eval(); + for (auto _ : state) { + copy_matrix(data); + benchmark::DoNotOptimize(data.data()); + } +} + +template +static void BM_MoveSemantics(benchmark::State& state) { + using MatrixType = Eigen::Matrix, 1, 10>; + MatrixType data = MatrixType::Random().eval(); + for (auto _ : state) { + move_matrix(std::move(data)); + benchmark::DoNotOptimize(data.data()); + } +} + +BENCHMARK(BM_CopySemantics); +BENCHMARK(BM_MoveSemantics); +BENCHMARK(BM_CopySemantics); +BENCHMARK(BM_MoveSemantics); diff --git a/benchmarks/bench_reverse.cpp b/benchmarks/bench_reverse.cpp new file mode 100644 index 000000000..808df18b3 --- /dev/null +++ b/benchmarks/bench_reverse.cpp @@ -0,0 +1,30 @@ +#include +#include + +using namespace Eigen; + +static void BM_MatrixReverse(benchmark::State& state) { + int n = state.range(0); + typedef Matrix MatrixType; + MatrixType a = MatrixType::Random(n, n); + MatrixType b(n, n); + for (auto _ : state) { + b = a.reverse(); + benchmark::DoNotOptimize(b.data()); + } + state.SetBytesProcessed(state.iterations() * n * n * sizeof(double)); +} +BENCHMARK(BM_MatrixReverse)->RangeMultiplier(2)->Range(4, 512); + +static void BM_VectorReverse(benchmark::State& state) { + int n = state.range(0); + typedef Matrix VectorType; + VectorType a = VectorType::Random(n); + VectorType b(n); + for (auto _ : state) { + b = a.reverse(); + benchmark::DoNotOptimize(b.data()); + } + state.SetBytesProcessed(state.iterations() * n * sizeof(double)); +} +BENCHMARK(BM_VectorReverse)->RangeMultiplier(4)->Range(16, 1 << 18); diff --git a/benchmarks/bench_svd.cpp b/benchmarks/bench_svd.cpp new file mode 100644 index 000000000..b7e6da38c --- /dev/null +++ b/benchmarks/bench_svd.cpp @@ -0,0 +1,103 @@ +#include +#include + +using namespace Eigen; + +// Benchmark JacobiSVD and BDCSVD for various scalar types, matrix shapes, +// and computation options. + +// ---------- helpers ---------- + +template +using Mat = Matrix; + +template +EIGEN_DONT_INLINE void do_compute(SVD& svd, const typename SVD::MatrixType& A) { + svd.compute(A); +} + +// ---------- JacobiSVD ---------- + +template +static void BM_JacobiSVD(benchmark::State& state) { + const Index rows = state.range(0); + const Index cols = state.range(1); + Mat A = Mat::Random(rows, cols); + JacobiSVD, Options> svd(rows, cols); + for (auto _ : state) { + do_compute(svd, A); + benchmark::DoNotOptimize(svd.singularValues().data()); + } + state.SetItemsProcessed(state.iterations()); +} + +// ---------- BDCSVD ---------- + +template +static void BM_BDCSVD(benchmark::State& state) { + const Index rows = state.range(0); + const Index cols = state.range(1); + Mat A = Mat::Random(rows, cols); + BDCSVD, Options> svd(rows, cols); + for (auto _ : state) { + do_compute(svd, A); + benchmark::DoNotOptimize(svd.singularValues().data()); + } + state.SetItemsProcessed(state.iterations()); +} + +// ---------- Size configurations ---------- + +// Sizes suitable for JacobiSVD (O(n^2 p), expensive for large n). +static void JacobiSizes(::benchmark::Benchmark* b) { + // Square + for (int s : {4, 8, 16, 32, 64, 128, 256, 512}) b->Args({s, s}); + // Tall-skinny + b->Args({100, 4}); + b->Args({1000, 4}); + b->Args({1000, 10}); +} + +// Sizes suitable for BDCSVD (divide-and-conquer, faster for large matrices). +static void BDCSizes(::benchmark::Benchmark* b) { + // Square + for (int s : {4, 8, 16, 32, 64, 128, 256, 512, 1024}) b->Args({s, s}); + // Tall-skinny (triggers R-bidiagonalization when aspect ratio > 4) + b->Args({100, 4}); + b->Args({1000, 4}); + b->Args({1000, 10}); + b->Args({1000, 100}); + b->Args({10000, 10}); + b->Args({10000, 100}); +} + +// ---------- Register benchmarks ---------- + +// JacobiSVD — float +BENCHMARK(BM_JacobiSVD)->Apply(JacobiSizes)->Name("JacobiSVD_float_ThinUV"); +BENCHMARK(BM_JacobiSVD)->Apply(JacobiSizes)->Name("JacobiSVD_float_ValuesOnly"); + +// JacobiSVD — double +BENCHMARK(BM_JacobiSVD)->Apply(JacobiSizes)->Name("JacobiSVD_double_ThinUV"); +BENCHMARK(BM_JacobiSVD)->Apply(JacobiSizes)->Name("JacobiSVD_double_ValuesOnly"); + +// BDCSVD — float +BENCHMARK(BM_BDCSVD)->Apply(BDCSizes)->Name("BDCSVD_float_ThinUV"); +BENCHMARK(BM_BDCSVD)->Apply(BDCSizes)->Name("BDCSVD_float_ValuesOnly"); + +// BDCSVD — double +BENCHMARK(BM_BDCSVD)->Apply(BDCSizes)->Name("BDCSVD_double_ThinUV"); +BENCHMARK(BM_BDCSVD)->Apply(BDCSizes)->Name("BDCSVD_double_ValuesOnly"); + +// JacobiSVD — QR preconditioner comparison (double, 64x64, ThinUV) +BENCHMARK(BM_JacobiSVD) + ->Args({64, 64}) + ->Args({1000, 10}) + ->Name("JacobiSVD_double_ColPivQR"); +BENCHMARK(BM_JacobiSVD) + ->Args({64, 64}) + ->Args({1000, 10}) + ->Name("JacobiSVD_double_HouseholderQR"); +BENCHMARK(BM_JacobiSVD) + ->Args({64, 64}) + ->Name("JacobiSVD_double_FullPivQR"); diff --git a/benchmarks/bench_trsm.cpp b/benchmarks/bench_trsm.cpp new file mode 100644 index 000000000..3da3a90c9 --- /dev/null +++ b/benchmarks/bench_trsm.cpp @@ -0,0 +1,99 @@ +#include +#include + +using namespace Eigen; + +// ---------- TRSV: triangular solve with single RHS vector ---------- + +template +static void BM_TRSV(benchmark::State& state) { + using Mat = Matrix; + using Vec = Matrix; + const Index n = state.range(0); + Mat A = Mat::Random(n, n); + // Make diagonally dominant to ensure well-conditioned triangular part. + A.diagonal().array() += Scalar(n); + Vec x = Vec::Random(n); + Vec b = x; + for (auto _ : state) { + x = b; + A.template triangularView().solveInPlace(x); + benchmark::DoNotOptimize(x.data()); + } + state.SetItemsProcessed(state.iterations() * n * n); +} + +// ---------- TRSM: triangular solve with multiple RHS (OnTheLeft) ---------- + +template +static void BM_TRSM_Left(benchmark::State& state) { + using Mat = Matrix; + const Index n = state.range(0); + const Index nrhs = state.range(1); + Mat A = Mat::Random(n, n); + A.diagonal().array() += Scalar(n); + Mat X = Mat::Random(n, nrhs); + Mat B = X; + for (auto _ : state) { + X = B; + A.template triangularView().solveInPlace(X); + benchmark::DoNotOptimize(X.data()); + } + state.SetItemsProcessed(state.iterations() * n * n * nrhs); +} + +// ---------- TRSM: triangular solve with multiple RHS (OnTheRight) ---------- + +template +static void BM_TRSM_Right(benchmark::State& state) { + using Mat = Matrix; + const Index n = state.range(0); + const Index nrhs = state.range(1); + Mat A = Mat::Random(n, n); + A.diagonal().array() += Scalar(n); + Mat X = Mat::Random(nrhs, n); + Mat B = X; + for (auto _ : state) { + X = B; + A.template triangularView().template solveInPlace(X); + benchmark::DoNotOptimize(X.data()); + } + state.SetItemsProcessed(state.iterations() * n * n * nrhs); +} + +// ---------- Size configurations ---------- + +static void TrsvSizes(::benchmark::Benchmark* b) { + for (int n : {32, 64, 128, 256, 512, 1024}) { + b->Args({n}); + } +} + +static void TrsmSizes(::benchmark::Benchmark* b) { + for (int n : {32, 64, 128, 256, 512, 1024}) { + for (int nrhs : {1, 4, 16, 64, 256}) { + b->Args({n, nrhs}); + } + } +} + +// ---------- TRSV benchmarks ---------- + +BENCHMARK(BM_TRSV)->Apply(TrsvSizes)->Name("TRSV_float_Lower"); +BENCHMARK(BM_TRSV)->Apply(TrsvSizes)->Name("TRSV_float_Upper"); +BENCHMARK(BM_TRSV)->Apply(TrsvSizes)->Name("TRSV_double_Lower"); +BENCHMARK(BM_TRSV)->Apply(TrsvSizes)->Name("TRSV_double_Upper"); + +// ---------- TRSM Left benchmarks ---------- + +BENCHMARK(BM_TRSM_Left)->Apply(TrsmSizes)->Name("TRSM_Left_float_Lower"); +BENCHMARK(BM_TRSM_Left)->Apply(TrsmSizes)->Name("TRSM_Left_float_Upper"); +BENCHMARK(BM_TRSM_Left)->Apply(TrsmSizes)->Name("TRSM_Left_double_Lower"); +BENCHMARK(BM_TRSM_Left)->Apply(TrsmSizes)->Name("TRSM_Left_double_Upper"); + +// ---------- TRSM Right benchmarks ---------- + +BENCHMARK(BM_TRSM_Right)->Apply(TrsmSizes)->Name("TRSM_Right_float_Lower"); +BENCHMARK(BM_TRSM_Right)->Apply(TrsmSizes)->Name("TRSM_Right_float_Upper"); +BENCHMARK(BM_TRSM_Right)->Apply(TrsmSizes)->Name("TRSM_Right_double_Lower"); +BENCHMARK(BM_TRSM_Right)->Apply(TrsmSizes)->Name("TRSM_Right_double_Upper"); diff --git a/benchmarks/benchmark_aocl.cpp b/benchmarks/benchmark_aocl.cpp new file mode 100644 index 000000000..5850cfe5c --- /dev/null +++ b/benchmarks/benchmark_aocl.cpp @@ -0,0 +1,123 @@ +#include +#include +#include +#include + +using namespace Eigen; + +static void BM_VectorExp(benchmark::State& state) { + int n = state.range(0); + VectorXd v = VectorXd::LinSpaced(n, 0.1, 10.0); + VectorXd result(n); + for (auto _ : state) { + result = v.array().exp(); + benchmark::DoNotOptimize(result.data()); + } + state.SetBytesProcessed(state.iterations() * n * sizeof(double)); +} + +static void BM_VectorSin(benchmark::State& state) { + int n = state.range(0); + VectorXd v = VectorXd::LinSpaced(n, 0.1, 10.0); + VectorXd result(n); + for (auto _ : state) { + result = v.array().sin(); + benchmark::DoNotOptimize(result.data()); + } + state.SetBytesProcessed(state.iterations() * n * sizeof(double)); +} + +static void BM_VectorCos(benchmark::State& state) { + int n = state.range(0); + VectorXd v = VectorXd::LinSpaced(n, 0.1, 10.0); + VectorXd result(n); + for (auto _ : state) { + result = v.array().cos(); + benchmark::DoNotOptimize(result.data()); + } + state.SetBytesProcessed(state.iterations() * n * sizeof(double)); +} + +static void BM_VectorSqrt(benchmark::State& state) { + int n = state.range(0); + VectorXd v = VectorXd::LinSpaced(n, 0.1, 10.0); + VectorXd result(n); + for (auto _ : state) { + result = v.array().sqrt(); + benchmark::DoNotOptimize(result.data()); + } + state.SetBytesProcessed(state.iterations() * n * sizeof(double)); +} + +static void BM_VectorLog(benchmark::State& state) { + int n = state.range(0); + VectorXd v = VectorXd::LinSpaced(n, 0.1, 10.0); + VectorXd result(n); + for (auto _ : state) { + result = v.array().log(); + benchmark::DoNotOptimize(result.data()); + } + state.SetBytesProcessed(state.iterations() * n * sizeof(double)); +} + +static void BM_VectorTanh(benchmark::State& state) { + int n = state.range(0); + VectorXd v = VectorXd::LinSpaced(n, 0.1, 10.0); + VectorXd result(n); + for (auto _ : state) { + result = v.array().tanh(); + benchmark::DoNotOptimize(result.data()); + } + state.SetBytesProcessed(state.iterations() * n * sizeof(double)); +} + +static void VectorSizes(::benchmark::Benchmark* b) { + for (int n : {10000, 100000, 1000000, 5000000}) { + b->Arg(n); + } +} + +BENCHMARK(BM_VectorExp)->Apply(VectorSizes); +BENCHMARK(BM_VectorSin)->Apply(VectorSizes); +BENCHMARK(BM_VectorCos)->Apply(VectorSizes); +BENCHMARK(BM_VectorSqrt)->Apply(VectorSizes); +BENCHMARK(BM_VectorLog)->Apply(VectorSizes); +BENCHMARK(BM_VectorTanh)->Apply(VectorSizes); + +static void BM_DGEMM(benchmark::State& state) { + int n = state.range(0); + MatrixXd A = MatrixXd::Random(n, n); + MatrixXd B = MatrixXd::Random(n, n); + MatrixXd C(n, n); + for (auto _ : state) { + C.noalias() = A * B; + benchmark::DoNotOptimize(C.data()); + } + state.counters["GFLOPS"] = + benchmark::Counter(2.0 * n * n * n, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000); +} +BENCHMARK(BM_DGEMM)->Arg(256)->Arg(512)->Arg(1024)->Arg(2048); + +static void BM_EigenDecomposition(benchmark::State& state) { + int n = state.range(0); + MatrixXd M = MatrixXd::Random(n, n); + M = (M + M.transpose()) * 0.5; + SelfAdjointEigenSolver solver; + for (auto _ : state) { + solver.compute(M); + benchmark::DoNotOptimize(solver.eigenvalues().data()); + } +} +BENCHMARK(BM_EigenDecomposition)->Arg(256)->Arg(512)->Arg(1024); + +static void BM_FSI_Risk(benchmark::State& state) { + int numPeriods = state.range(0); + int numAssets = state.range(1); + MatrixXd returns = MatrixXd::Random(numPeriods, numAssets); + for (auto _ : state) { + MatrixXd cov = (returns.transpose() * returns) / (numPeriods - 1); + SelfAdjointEigenSolver solver(cov); + benchmark::DoNotOptimize(solver.eigenvalues().data()); + } +} +BENCHMARK(BM_FSI_Risk)->Args({10000, 500}); diff --git a/benchmarks/benchmark_blocking_sizes.cpp b/benchmarks/benchmark_blocking_sizes.cpp new file mode 100644 index 000000000..ac9ba7371 --- /dev/null +++ b/benchmarks/benchmark_blocking_sizes.cpp @@ -0,0 +1,76 @@ +#include + +#include + +bool eigen_use_specific_block_size; +int eigen_block_size_k, eigen_block_size_m, eigen_block_size_n; +#define EIGEN_TEST_SPECIFIC_BLOCKING_SIZES eigen_use_specific_block_size +#define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K eigen_block_size_k +#define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M eigen_block_size_m +#define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N eigen_block_size_n +#include + +using namespace Eigen; + +typedef MatrixXf MatrixType; +typedef MatrixType::Scalar Scalar; + +static void BM_GemmDefaultBlocking(benchmark::State& state) { + int k = state.range(0); + int m = state.range(1); + int n = state.range(2); + eigen_use_specific_block_size = false; + MatrixType lhs = MatrixType::Random(m, k); + MatrixType rhs = MatrixType::Random(k, n); + MatrixType dst = MatrixType::Zero(m, n); + for (auto _ : state) { + dst.noalias() = lhs * rhs; + benchmark::DoNotOptimize(dst.data()); + } + state.counters["GFLOPS"] = + benchmark::Counter(2.0 * k * m * n, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000); +} + +static void BM_GemmCustomBlocking(benchmark::State& state) { + int k = state.range(0); + int m = state.range(1); + int n = state.range(2); + int bk = state.range(3); + int bm = state.range(4); + int bn = state.range(5); + eigen_use_specific_block_size = true; + eigen_block_size_k = bk; + eigen_block_size_m = bm; + eigen_block_size_n = bn; + MatrixType lhs = MatrixType::Random(m, k); + MatrixType rhs = MatrixType::Random(k, n); + MatrixType dst = MatrixType::Zero(m, n); + for (auto _ : state) { + dst.noalias() = lhs * rhs; + benchmark::DoNotOptimize(dst.data()); + } + state.counters["GFLOPS"] = + benchmark::Counter(2.0 * k * m * n, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000); +} + +static void DefaultBlockingSizes(::benchmark::Benchmark* b) { + for (int s : {64, 128, 256, 512, 1024, 2048}) { + b->Args({s, s, s}); + } +} + +static void CustomBlockingSizes(::benchmark::Benchmark* b) { + // Test a few product sizes with varying block sizes + for (int s : {256, 512, 1024}) { + for (int bk : {16, 32, 64, 128, 256}) { + if (bk > s) continue; + for (int bm : {16, 32, 64, 128, 256}) { + if (bm > s) continue; + b->Args({s, s, s, bk, bm, s}); + } + } + } +} + +BENCHMARK(BM_GemmDefaultBlocking)->Apply(DefaultBlockingSizes); +BENCHMARK(BM_GemmCustomBlocking)->Apply(CustomBlockingSizes); diff --git a/benchmarks/dense_solvers.cpp b/benchmarks/dense_solvers.cpp new file mode 100644 index 000000000..62223f067 --- /dev/null +++ b/benchmarks/dense_solvers.cpp @@ -0,0 +1,167 @@ +#include +#include + +using namespace Eigen; + +typedef float Scalar; +typedef Matrix Mat; + +template +EIGEN_DONT_INLINE void compute_norm_equation(Solver& solver, const Mat& A) { + if (A.rows() != A.cols()) + solver.compute(A.transpose() * A); + else + solver.compute(A); +} + +template +EIGEN_DONT_INLINE void compute(Solver& solver, const Mat& A) { + solver.compute(A); +} + +static void BM_LLT(benchmark::State& state) { + int rows = state.range(0); + int cols = state.range(1); + int size = cols; + Mat A(rows, cols); + A.setRandom(); + if (rows == cols) A = A * A.adjoint(); + LLT solver(size); + for (auto _ : state) { + compute_norm_equation(solver, A); + benchmark::DoNotOptimize(solver.matrixLLT().data()); + } +} + +static void BM_LDLT(benchmark::State& state) { + int rows = state.range(0); + int cols = state.range(1); + int size = cols; + Mat A(rows, cols); + A.setRandom(); + if (rows == cols) A = A * A.adjoint(); + LDLT solver(size); + for (auto _ : state) { + compute_norm_equation(solver, A); + benchmark::DoNotOptimize(solver.matrixLDLT().data()); + } +} + +static void BM_PartialPivLU(benchmark::State& state) { + int rows = state.range(0); + int cols = state.range(1); + int size = cols; + Mat A(rows, cols); + A.setRandom(); + if (rows == cols) A = A * A.adjoint(); + PartialPivLU solver(size); + for (auto _ : state) { + compute_norm_equation(solver, A); + benchmark::DoNotOptimize(solver.matrixLU().data()); + } +} + +static void BM_FullPivLU(benchmark::State& state) { + int rows = state.range(0); + int cols = state.range(1); + int size = cols; + Mat A(rows, cols); + A.setRandom(); + if (rows == cols) A = A * A.adjoint(); + FullPivLU solver(size, size); + for (auto _ : state) { + compute_norm_equation(solver, A); + benchmark::DoNotOptimize(solver.matrixLU().data()); + } +} + +static void BM_HouseholderQR(benchmark::State& state) { + int rows = state.range(0); + int cols = state.range(1); + Mat A = Mat::Random(rows, cols); + HouseholderQR solver(rows, cols); + for (auto _ : state) { + compute(solver, A); + benchmark::DoNotOptimize(solver.matrixQR().data()); + } +} + +static void BM_ColPivHouseholderQR(benchmark::State& state) { + int rows = state.range(0); + int cols = state.range(1); + Mat A = Mat::Random(rows, cols); + ColPivHouseholderQR solver(rows, cols); + for (auto _ : state) { + compute(solver, A); + benchmark::DoNotOptimize(solver.matrixQR().data()); + } +} + +static void BM_COD(benchmark::State& state) { + int rows = state.range(0); + int cols = state.range(1); + Mat A = Mat::Random(rows, cols); + CompleteOrthogonalDecomposition solver(rows, cols); + for (auto _ : state) { + compute(solver, A); + benchmark::DoNotOptimize(solver.matrixQTZ().data()); + } +} + +static void BM_FullPivHouseholderQR(benchmark::State& state) { + int rows = state.range(0); + int cols = state.range(1); + Mat A = Mat::Random(rows, cols); + FullPivHouseholderQR solver(rows, cols); + for (auto _ : state) { + compute(solver, A); + benchmark::DoNotOptimize(solver.matrixQR().data()); + } +} + +static void BM_JacobiSVD(benchmark::State& state) { + int rows = state.range(0); + int cols = state.range(1); + Mat A = Mat::Random(rows, cols); + JacobiSVD solver(rows, cols); + for (auto _ : state) { + solver.compute(A); + benchmark::DoNotOptimize(solver.singularValues().data()); + } +} + +static void BM_BDCSVD(benchmark::State& state) { + int rows = state.range(0); + int cols = state.range(1); + Mat A = Mat::Random(rows, cols); + BDCSVD solver(rows, cols); + for (auto _ : state) { + solver.compute(A); + benchmark::DoNotOptimize(solver.singularValues().data()); + } +} + +static void DenseSolverSizes(::benchmark::Benchmark* b) { + // Square sizes + for (int s : {8, 100, 1000}) { + b->Args({s, s}); + } + // Tall-skinny sizes + b->Args({10000, 8}); + b->Args({10000, 100}); +} + +BENCHMARK(BM_LLT)->Apply(DenseSolverSizes); +BENCHMARK(BM_LDLT)->Apply(DenseSolverSizes); +BENCHMARK(BM_PartialPivLU)->Apply(DenseSolverSizes); +BENCHMARK(BM_FullPivLU)->Apply(DenseSolverSizes); +BENCHMARK(BM_HouseholderQR)->Apply(DenseSolverSizes); +BENCHMARK(BM_ColPivHouseholderQR)->Apply(DenseSolverSizes); +BENCHMARK(BM_COD)->Apply(DenseSolverSizes); +BENCHMARK(BM_FullPivHouseholderQR)->Apply(DenseSolverSizes); +BENCHMARK(BM_JacobiSVD)->Apply([](::benchmark::Benchmark* b) { + // JacobiSVD is very slow for large matrices + for (int s : {8, 100}) b->Args({s, s}); + b->Args({10000, 8}); +}); +BENCHMARK(BM_BDCSVD)->Apply(DenseSolverSizes); diff --git a/benchmarks/eig33.cpp b/benchmarks/eig33.cpp new file mode 100644 index 000000000..2b63aaa4a --- /dev/null +++ b/benchmarks/eig33.cpp @@ -0,0 +1,97 @@ +#include +#include +#include +#include + +using namespace Eigen; + +template +inline void computeRoots(const Matrix& m, Roots& roots) { + typedef typename Matrix::Scalar Scalar; + const Scalar s_inv3 = 1.0 / 3.0; + const Scalar s_sqrt3 = std::sqrt(Scalar(3.0)); + Scalar c0 = m(0, 0) * m(1, 1) * m(2, 2) + Scalar(2) * m(0, 1) * m(0, 2) * m(1, 2) - m(0, 0) * m(1, 2) * m(1, 2) - + m(1, 1) * m(0, 2) * m(0, 2) - m(2, 2) * m(0, 1) * m(0, 1); + Scalar c1 = m(0, 0) * m(1, 1) - m(0, 1) * m(0, 1) + m(0, 0) * m(2, 2) - m(0, 2) * m(0, 2) + m(1, 1) * m(2, 2) - + m(1, 2) * m(1, 2); + Scalar c2 = m(0, 0) + m(1, 1) + m(2, 2); + Scalar c2_over_3 = c2 * s_inv3; + Scalar a_over_3 = (c1 - c2 * c2_over_3) * s_inv3; + if (a_over_3 > Scalar(0)) a_over_3 = Scalar(0); + Scalar half_b = Scalar(0.5) * (c0 + c2_over_3 * (Scalar(2) * c2_over_3 * c2_over_3 - c1)); + Scalar q = half_b * half_b + a_over_3 * a_over_3 * a_over_3; + if (q > Scalar(0)) q = Scalar(0); + Scalar rho = std::sqrt(-a_over_3); + Scalar theta = std::atan2(std::sqrt(-q), half_b) * s_inv3; + Scalar cos_theta = std::cos(theta); + Scalar sin_theta = std::sin(theta); + roots(2) = c2_over_3 + Scalar(2) * rho * cos_theta; + roots(0) = c2_over_3 - rho * (cos_theta + s_sqrt3 * sin_theta); + roots(1) = c2_over_3 - rho * (cos_theta - s_sqrt3 * sin_theta); +} + +template +void eigen33(const Matrix& mat, Matrix& evecs, Vector& evals) { + typedef typename Matrix::Scalar Scalar; + Scalar shift = mat.trace() / 3; + Matrix scaledMat = mat; + scaledMat.diagonal().array() -= shift; + Scalar scale = scaledMat.cwiseAbs().maxCoeff(); + scale = std::max(scale, Scalar(1)); + scaledMat /= scale; + computeRoots(scaledMat, evals); + if ((evals(2) - evals(0)) <= Eigen::NumTraits::epsilon()) { + evecs.setIdentity(); + } else { + Matrix tmp; + tmp = scaledMat; + tmp.diagonal().array() -= evals(2); + evecs.col(2) = tmp.row(0).cross(tmp.row(1)).normalized(); + tmp = scaledMat; + tmp.diagonal().array() -= evals(1); + evecs.col(1) = tmp.row(0).cross(tmp.row(1)); + Scalar n1 = evecs.col(1).norm(); + if (n1 <= Eigen::NumTraits::epsilon()) + evecs.col(1) = evecs.col(2).unitOrthogonal(); + else + evecs.col(1) /= n1; + evecs.col(1) = evecs.col(2).cross(evecs.col(1).cross(evecs.col(2))).normalized(); + evecs.col(0) = evecs.col(2).cross(evecs.col(1)); + } + evals *= scale; + evals.array() += shift; +} + +static void BM_Eig33_Iterative(benchmark::State& state) { + Matrix3d A = Matrix3d::Random(); + A = A.adjoint() * A; + SelfAdjointEigenSolver eig(A); + for (auto _ : state) { + eig.compute(A); + benchmark::DoNotOptimize(eig.eigenvalues().data()); + } +} +BENCHMARK(BM_Eig33_Iterative); + +static void BM_Eig33_Direct(benchmark::State& state) { + Matrix3d A = Matrix3d::Random(); + A = A.adjoint() * A; + SelfAdjointEigenSolver eig(A); + for (auto _ : state) { + eig.computeDirect(A); + benchmark::DoNotOptimize(eig.eigenvalues().data()); + } +} +BENCHMARK(BM_Eig33_Direct); + +static void BM_Eig33_Custom(benchmark::State& state) { + Matrix3d A = Matrix3d::Random(); + A = A.adjoint() * A; + Matrix3d evecs; + Vector3d evals; + for (auto _ : state) { + eigen33(A, evecs, evals); + benchmark::DoNotOptimize(evals.data()); + } +} +BENCHMARK(BM_Eig33_Custom); diff --git a/benchmarks/geometry.cpp b/benchmarks/geometry.cpp new file mode 100644 index 000000000..88b2f1051 --- /dev/null +++ b/benchmarks/geometry.cpp @@ -0,0 +1,71 @@ +#include +#include + +using namespace Eigen; + +// Helper to get dimension from various transform types +template +struct TransformDim { + static constexpr int value = T::Dim; +}; + +template +struct TransformDim> { + static constexpr int value = Rows; +}; + +template +static void BM_TransformData(benchmark::State& state) { + typedef typename Transformation::Scalar Scalar; + constexpr int Dim = TransformDim::value; + Transformation t; + if constexpr (std::is_same_v>) { + t.setRandom(); + } else { + Matrix mat; + mat.setRandom(); + t = Transformation(mat); + } + Matrix data; + data.setRandom(); + for (auto _ : state) { + data = t * data; + benchmark::DoNotOptimize(data.data()); + } +} + +// For quaternion: apply per-column +template +static void BM_QuatTransform(benchmark::State& state) { + Quaternion q; + q.setIdentity(); + Matrix data; + data.setRandom(); + for (auto _ : state) { + for (int i = 0; i < N; ++i) data.col(i) = q * data.col(i); + benchmark::DoNotOptimize(data.data()); + } +} + +// Use typedefs to avoid commas in macro arguments +typedef Transform Isometry3f_t; +typedef Transform Affine3f_t; +typedef Transform AffineCompact3f_t; +typedef Matrix Matrix3f_t; + +BENCHMARK(BM_TransformData); +BENCHMARK(BM_TransformData); +BENCHMARK(BM_TransformData); +BENCHMARK(BM_TransformData); +BENCHMARK(BM_TransformData); +BENCHMARK(BM_TransformData); +BENCHMARK(BM_TransformData); +BENCHMARK(BM_TransformData); +BENCHMARK(BM_TransformData); +BENCHMARK(BM_TransformData); +BENCHMARK(BM_TransformData); +BENCHMARK(BM_TransformData); + +BENCHMARK(BM_QuatTransform); +BENCHMARK(BM_QuatTransform); +BENCHMARK(BM_QuatTransform); diff --git a/benchmarks/print_blocking.cpp b/benchmarks/print_blocking.cpp new file mode 100644 index 000000000..3f5a25692 --- /dev/null +++ b/benchmarks/print_blocking.cpp @@ -0,0 +1,51 @@ +#include +#include + +using namespace Eigen; +using namespace Eigen::internal; + +int main() { + printf("%-8s %-8s %-8s | %-8s %-8s %-8s | %-8s %-8s %-8s | %-8s %-8s %-8s\n", "m", "n", "k", "kc_f", "mc_f", "nc_f", + "kc_d", "mc_d", "nc_d", "mr_f", "nr_f", "LhsPr_f"); + + // Print gebp_traits info + { + using Traits = gebp_traits; + printf("Float traits: mr=%d, nr=%d, LhsProgress=%d, RhsProgress=%d, NumberOfRegisters=%d\n", Traits::mr, Traits::nr, + Traits::LhsProgress, Traits::RhsProgress, Traits::NumberOfRegisters); + } + { + using Traits = gebp_traits; + printf("Double traits: mr=%d, nr=%d, LhsProgress=%d, RhsProgress=%d, NumberOfRegisters=%d\n", Traits::mr, + Traits::nr, Traits::LhsProgress, Traits::RhsProgress, Traits::NumberOfRegisters); + } + + // Print cache sizes + std::ptrdiff_t l1, l2, l3; + manage_caching_sizes(GetAction, &l1, &l2, &l3); + printf("Cache sizes: L1=%ld, L2=%ld, L3=%ld\n", (long)l1, (long)l2, (long)l3); + + for (int size : {8, 16, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 384, 448, 512, 768, 1024, 1536, 2048}) { + { + Index kf = size, mf = size, nf = size; + computeProductBlockingSizes(kf, mf, nf); + Index kd = size, md = size, nd = size; + computeProductBlockingSizes(kd, md, nd); + printf("%-8d %-8d %-8d | %-8ld %-8ld %-8ld | %-8ld %-8ld %-8ld\n", size, size, size, (long)kf, (long)mf, (long)nf, + (long)kd, (long)md, (long)nd); + } + } + + // Non-square + for (auto [m, n, k] : + std::initializer_list>{{64, 64, 1024}, {1024, 64, 64}, {64, 1024, 64}}) { + Index kf = k, mf = m, nf = n; + computeProductBlockingSizes(kf, mf, nf); + Index kd = k, md = m, nd = n; + computeProductBlockingSizes(kd, md, nd); + printf("%-8d %-8d %-8d | %-8ld %-8ld %-8ld | %-8ld %-8ld %-8ld\n", m, n, k, (long)kf, (long)mf, (long)nf, (long)kd, + (long)md, (long)nd); + } + + return 0; +} diff --git a/benchmarks/quatmul.cpp b/benchmarks/quatmul.cpp new file mode 100644 index 000000000..a35ecba3f --- /dev/null +++ b/benchmarks/quatmul.cpp @@ -0,0 +1,38 @@ +#include +#include +#include + +using namespace Eigen; + +template +EIGEN_DONT_INLINE void quatmul_default(const Quat& a, const Quat& b, Quat& c) { + c = a * b; +} + +template +EIGEN_DONT_INLINE void quatmul_novec(const Quat& a, const Quat& b, Quat& c) { + c = internal::quat_product<0, Quat, Quat, typename Quat::Scalar>::run(a, b); +} + +template +static void BM_QuatMul_Default(benchmark::State& state) { + Quat a(4, 1, 2, 3), b(2, 3, 4, 5), c; + for (auto _ : state) { + quatmul_default(a, b, c); + benchmark::DoNotOptimize(c.coeffs().data()); + } +} + +template +static void BM_QuatMul_NoVec(benchmark::State& state) { + Quat a(4, 1, 2, 3), b(2, 3, 4, 5), c; + for (auto _ : state) { + quatmul_novec(a, b, c); + benchmark::DoNotOptimize(c.coeffs().data()); + } +} + +BENCHMARK(BM_QuatMul_Default); +BENCHMARK(BM_QuatMul_NoVec); +BENCHMARK(BM_QuatMul_Default); +BENCHMARK(BM_QuatMul_NoVec); diff --git a/benchmarks/sparse_dense_product.cpp b/benchmarks/sparse_dense_product.cpp new file mode 100644 index 000000000..61f42eabc --- /dev/null +++ b/benchmarks/sparse_dense_product.cpp @@ -0,0 +1,64 @@ +#include +#include + +using namespace Eigen; + +typedef double Scalar; +typedef SparseMatrix SpMat; +typedef Matrix DenseVec; + +static void fillMatrix(int nnzPerCol, int rows, int cols, SpMat& dst) { + dst.resize(rows, cols); + dst.reserve(VectorXi::Constant(cols, nnzPerCol)); + for (int j = 0; j < cols; ++j) { + std::set used; + for (int i = 0; i < nnzPerCol; ++i) { + int row; + do { + row = internal::random(0, rows - 1); + } while (used.count(row)); + used.insert(row); + dst.insert(row, j) = internal::random(); + } + } + dst.makeCompressed(); +} + +static void BM_SpMV(benchmark::State& state) { + int n = state.range(0); + int nnzPerCol = state.range(1); + SpMat sm(n, n); + fillMatrix(nnzPerCol, n, n, sm); + DenseVec v = DenseVec::Random(n); + DenseVec res(n); + for (auto _ : state) { + res.noalias() = sm * v; + benchmark::DoNotOptimize(res.data()); + } + state.counters["nnz"] = sm.nonZeros(); +} + +static void BM_SpMV_Transpose(benchmark::State& state) { + int n = state.range(0); + int nnzPerCol = state.range(1); + SpMat sm(n, n); + fillMatrix(nnzPerCol, n, n, sm); + DenseVec v = DenseVec::Random(n); + DenseVec res(n); + for (auto _ : state) { + res.noalias() = sm.transpose() * v; + benchmark::DoNotOptimize(res.data()); + } + state.counters["nnz"] = sm.nonZeros(); +} + +static void SpMVSizes(::benchmark::Benchmark* b) { + for (int n : {1000, 10000, 100000}) { + for (int nnz : {7, 20, 50}) { + b->Args({n, nnz}); + } + } +} + +BENCHMARK(BM_SpMV)->Apply(SpMVSizes); +BENCHMARK(BM_SpMV_Transpose)->Apply(SpMVSizes); diff --git a/benchmarks/sparse_product.cpp b/benchmarks/sparse_product.cpp new file mode 100644 index 000000000..82044e6b2 --- /dev/null +++ b/benchmarks/sparse_product.cpp @@ -0,0 +1,49 @@ +#include +#include +#include + +using namespace Eigen; + +typedef double Scalar; +typedef SparseMatrix SpMat; + +static void fillMatrix(int nnzPerCol, int rows, int cols, SpMat& dst) { + dst.resize(rows, cols); + dst.reserve(VectorXi::Constant(cols, nnzPerCol)); + for (int j = 0; j < cols; ++j) { + std::set used; + for (int i = 0; i < nnzPerCol; ++i) { + int row; + do { + row = internal::random(0, rows - 1); + } while (used.count(row)); + used.insert(row); + dst.insert(row, j) = internal::random(); + } + } + dst.makeCompressed(); +} + +static void BM_SparseMM(benchmark::State& state) { + int n = state.range(0); + int nnzPerCol = state.range(1); + SpMat sm1(n, n), sm2(n, n), sm3(n, n); + fillMatrix(nnzPerCol, n, n, sm1); + fillMatrix(nnzPerCol, n, n, sm2); + for (auto _ : state) { + sm3 = sm1 * sm2; + benchmark::DoNotOptimize(sm3.valuePtr()); + } + state.counters["nnz_A"] = sm1.nonZeros(); + state.counters["nnz_B"] = sm2.nonZeros(); +} + +static void SpMMSizes(::benchmark::Benchmark* b) { + for (int n : {1000, 10000}) { + for (int nnz : {4, 6, 10}) { + b->Args({n, nnz}); + } + } +} + +BENCHMARK(BM_SparseMM)->Apply(SpMMSizes); diff --git a/benchmarks/sparse_transpose.cpp b/benchmarks/sparse_transpose.cpp new file mode 100644 index 000000000..fa57c5289 --- /dev/null +++ b/benchmarks/sparse_transpose.cpp @@ -0,0 +1,45 @@ +#include +#include +#include + +using namespace Eigen; + +typedef double Scalar; +typedef SparseMatrix SpMat; + +static void fillMatrix(float density, int rows, int cols, SpMat& dst) { + dst.resize(rows, cols); + dst.reserve(static_cast(rows * cols * density)); + for (int j = 0; j < cols; ++j) { + for (int i = 0; i < rows; ++i) { + if (internal::random(0, 1) < density) { + dst.insert(i, j) = internal::random(); + } + } + } + dst.makeCompressed(); +} + +static void BM_SparseTranspose(benchmark::State& state) { + int n = state.range(0); + float density = state.range(1) / 10000.0f; + SpMat sm(n, n), result(n, n); + fillMatrix(density, n, n, sm); + for (auto _ : state) { + result = sm.transpose(); + benchmark::DoNotOptimize(result.valuePtr()); + } + state.counters["nnz"] = sm.nonZeros(); + state.counters["density%"] = density * 100; +} + +static void TransposeSizes(::benchmark::Benchmark* b) { + // Args: {size, density*10000} + for (int n : {1000, 10000}) { + for (int d : {100, 50, 10, 4}) { // 1%, 0.5%, 0.1%, 0.04% + b->Args({n, d}); + } + } +} + +BENCHMARK(BM_SparseTranspose)->Apply(TransposeSizes);