mirror of
https://gitlab.com/libeigen/eigen.git
synced 2026-04-10 11:34:33 +08:00
159 lines
4.4 KiB
C++
159 lines
4.4 KiB
C++
// Benchmarks for Eigen Tensor reductions (sum, maximum).
|
|
// Tests full and partial reductions, inner vs outer dimension, DefaultDevice and ThreadPoolDevice.
|
|
|
|
#define EIGEN_USE_THREADS
|
|
|
|
#include <benchmark/benchmark.h>
|
|
#include <unsupported/Eigen/CXX11/Tensor>
|
|
#include <unsupported/Eigen/CXX11/ThreadPool>
|
|
|
|
using namespace Eigen;
|
|
|
|
// Element type used by every tensor in this file. SCALAR defaults to float
// unless it is already defined before this point (for example on the compiler
// command line).
#ifndef SCALAR
#define SCALAR float
#endif

using Scalar = SCALAR;
|
|
|
|
// --- Full reduction (rank-2) ---
|
|
template <typename ReduceOp>
|
|
static void BM_FullReduction(benchmark::State& state) {
|
|
const int M = state.range(0);
|
|
const int N = state.range(1);
|
|
|
|
Tensor<Scalar, 2> A(M, N);
|
|
A.setRandom();
|
|
|
|
for (auto _ : state) {
|
|
Tensor<Scalar, 0> result = A.reduce(Eigen::array<int, 2>{0, 1}, ReduceOp());
|
|
benchmark::DoNotOptimize(result.data());
|
|
benchmark::ClobberMemory();
|
|
}
|
|
state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar));
|
|
}
|
|
|
|
// --- Partial reduction along dim 0 (inner dim, ColMajor) ---
|
|
static void BM_ReduceInner(benchmark::State& state) {
|
|
const int M = state.range(0);
|
|
const int N = state.range(1);
|
|
|
|
Tensor<Scalar, 2> A(M, N);
|
|
A.setRandom();
|
|
|
|
Eigen::array<int, 1> reduce_dims = {0};
|
|
|
|
for (auto _ : state) {
|
|
Tensor<Scalar, 1> result = A.sum(reduce_dims);
|
|
benchmark::DoNotOptimize(result.data());
|
|
benchmark::ClobberMemory();
|
|
}
|
|
state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar));
|
|
}
|
|
|
|
// --- Partial reduction along dim 1 (outer dim, ColMajor) ---
|
|
static void BM_ReduceOuter(benchmark::State& state) {
|
|
const int M = state.range(0);
|
|
const int N = state.range(1);
|
|
|
|
Tensor<Scalar, 2> A(M, N);
|
|
A.setRandom();
|
|
|
|
Eigen::array<int, 1> reduce_dims = {1};
|
|
|
|
for (auto _ : state) {
|
|
Tensor<Scalar, 1> result = A.sum(reduce_dims);
|
|
benchmark::DoNotOptimize(result.data());
|
|
benchmark::ClobberMemory();
|
|
}
|
|
state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar));
|
|
}
|
|
|
|
// --- Rank-4 partial reduction (batch x channels x H x W), reduce along spatial dims ---
|
|
static void BM_ReduceSpatial(benchmark::State& state) {
|
|
const int batch = state.range(0);
|
|
const int C = state.range(1);
|
|
const int H = state.range(2);
|
|
|
|
Tensor<Scalar, 4> A(batch, C, H, H);
|
|
A.setRandom();
|
|
|
|
Eigen::array<int, 2> reduce_dims = {2, 3};
|
|
|
|
for (auto _ : state) {
|
|
Tensor<Scalar, 2> result = A.sum(reduce_dims);
|
|
benchmark::DoNotOptimize(result.data());
|
|
benchmark::ClobberMemory();
|
|
}
|
|
state.SetBytesProcessed(state.iterations() * batch * C * H * H * sizeof(Scalar));
|
|
}
|
|
|
|
// --- Full reduction with ThreadPoolDevice ---
|
|
static void BM_FullReduction_ThreadPool(benchmark::State& state) {
|
|
const int M = state.range(0);
|
|
const int N = state.range(1);
|
|
const int threads = state.range(2);
|
|
|
|
Tensor<Scalar, 2> A(M, N);
|
|
Tensor<Scalar, 0> result;
|
|
A.setRandom();
|
|
|
|
ThreadPool tp(threads);
|
|
ThreadPoolDevice dev(&tp, threads);
|
|
|
|
for (auto _ : state) {
|
|
result.device(dev) = A.sum();
|
|
benchmark::DoNotOptimize(result.data());
|
|
benchmark::ClobberMemory();
|
|
}
|
|
state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar));
|
|
state.counters["threads"] = threads;
|
|
}
|
|
|
|
// --- Maximum reduction (rank-2) ---
|
|
static void BM_MaxReduction(benchmark::State& state) {
|
|
const int M = state.range(0);
|
|
const int N = state.range(1);
|
|
|
|
Tensor<Scalar, 2> A(M, N);
|
|
A.setRandom();
|
|
|
|
for (auto _ : state) {
|
|
Tensor<Scalar, 0> result = A.maximum();
|
|
benchmark::DoNotOptimize(result.data());
|
|
benchmark::ClobberMemory();
|
|
}
|
|
state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar));
|
|
}
|
|
|
|
// Argument generator for the rank-2 benchmarks: registers one square
// (dim, dim) argument pair for each of 64, 256 and 1024.
static void ReductionSizes(::benchmark::Benchmark* b) {
  constexpr int kSquareDims[] = {64, 256, 1024};
  for (const int dim : kSquareDims) {
    b->Args({dim, dim});
  }
}
|
|
|
|
// Argument generator for the thread-pool benchmark: registers
// (dim, dim, threads) for every combination of dim in {256, 1024} and
// threads in {2, 4, 8}, with the thread count varying fastest.
static void ThreadPoolReductionSizes(::benchmark::Benchmark* b) {
  constexpr int kSquareDims[] = {256, 1024};
  constexpr int kThreadCounts[] = {2, 4, 8};
  for (const int dim : kSquareDims) {
    for (const int num_threads : kThreadCounts) {
      b->Args({dim, dim, num_threads});
    }
  }
}
|
|
|
|
// Argument generator for the rank-4 spatial benchmark: registers
// (batch, channels, extent) for every combination of batch in {1, 8, 32},
// channels in {64, 128} and extent in {16, 32}, with extent varying fastest.
static void SpatialSizes(::benchmark::Benchmark* b) {
  constexpr int kBatchSizes[] = {1, 8, 32};
  constexpr int kChannelCounts[] = {64, 128};
  constexpr int kExtents[] = {16, 32};
  for (const int batch_size : kBatchSizes) {
    for (const int channels : kChannelCounts) {
      for (const int extent : kExtents) {
        b->Args({batch_size, channels, extent});
      }
    }
  }
}
|
|
|
|
// Benchmark registrations. The two BM_FullReduction instantiations are given
// explicit names so the reports do not carry the full template spelling.
BENCHMARK(BM_FullReduction<internal::SumReducer<Scalar>>)->Apply(ReductionSizes)->Name("SumReduction");
BENCHMARK(BM_FullReduction<internal::MaxReducer<Scalar>>)->Apply(ReductionSizes)->Name("MaxReduction_Full");
BENCHMARK(BM_MaxReduction)->Apply(ReductionSizes);
BENCHMARK(BM_ReduceInner)->Apply(ReductionSizes);
BENCHMARK(BM_ReduceOuter)->Apply(ReductionSizes);
BENCHMARK(BM_ReduceSpatial)->Apply(SpatialSizes);
// This benchmark evaluates on a thread pool, and Google Benchmark's default
// CPU timer only covers the main thread. Measure wall-clock time instead so
// the iteration count and the reported bytes/s are based on elapsed time.
// Note: UseRealTime() appends "/real_time" to the reported benchmark name.
BENCHMARK(BM_FullReduction_ThreadPool)->Apply(ThreadPoolReductionSizes)->UseRealTime();
|