// Benchmarks for Eigen Tensor coefficient-wise operations. // Covers activation functions, normalization, and element-wise arithmetic. #include #include using namespace Eigen; typedef float Scalar; // Macro to define a benchmark for a unary tensor operation. #define BENCH_TENSOR_UNARY(NAME, EXPR) \ static void BM_##NAME(benchmark::State& state) { \ const int M = state.range(0); \ const int N = state.range(1); \ Tensor a(M, N); \ a.setRandom(); \ Tensor b(M, N); \ for (auto _ : state) { \ b = EXPR; \ benchmark::DoNotOptimize(b.data()); \ benchmark::ClobberMemory(); \ } \ state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar) * 2); \ } BENCH_TENSOR_UNARY(Exp, a.exp()) BENCH_TENSOR_UNARY(Log, a.abs().log()) BENCH_TENSOR_UNARY(Tanh, a.tanh()) BENCH_TENSOR_UNARY(Sigmoid, a.sigmoid()) BENCH_TENSOR_UNARY(ReLU, a.cwiseMax(Scalar(0))) BENCH_TENSOR_UNARY(Sqrt, a.abs().sqrt()) // --- Element-wise binary operations --- static void BM_Add(benchmark::State& state) { const int M = state.range(0); const int N = state.range(1); Tensor a(M, N); Tensor b(M, N); Tensor c(M, N); a.setRandom(); b.setRandom(); for (auto _ : state) { c = a + b; benchmark::DoNotOptimize(c.data()); benchmark::ClobberMemory(); } state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar) * 3); } static void BM_Mul(benchmark::State& state) { const int M = state.range(0); const int N = state.range(1); Tensor a(M, N); Tensor b(M, N); Tensor c(M, N); a.setRandom(); b.setRandom(); for (auto _ : state) { c = a * b; benchmark::DoNotOptimize(c.data()); benchmark::ClobberMemory(); } state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar) * 3); } // --- Fused multiply-add --- static void BM_FMA(benchmark::State& state) { const int M = state.range(0); const int N = state.range(1); Tensor a(M, N); Tensor b(M, N); Tensor c(M, N); Tensor d(M, N); a.setRandom(); b.setRandom(); c.setRandom(); for (auto _ : state) { d = a * b + c; benchmark::DoNotOptimize(d.data()); benchmark::ClobberMemory(); } state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar) * 4); } // --- Rank-4 coefficient-wise (CNN feature maps) --- static void BM_ReLU_Rank4(benchmark::State& state) { const int batch = state.range(0); const int C = state.range(1); const int H = state.range(2); Tensor a(batch, C, H, H); Tensor b(batch, C, H, H); a.setRandom(); for (auto _ : state) { b = a.cwiseMax(Scalar(0)); benchmark::DoNotOptimize(b.data()); benchmark::ClobberMemory(); } state.SetBytesProcessed(state.iterations() * batch * C * H * H * sizeof(Scalar) * 2); } static void CwiseSizes(::benchmark::Benchmark* b) { for (int size : {256, 1024}) { b->Args({size, size}); } } static void Rank4Sizes(::benchmark::Benchmark* b) { b->Args({32, 64, 16}); b->Args({8, 128, 32}); b->Args({1, 256, 64}); } BENCHMARK(BM_Exp)->Apply(CwiseSizes); BENCHMARK(BM_Log)->Apply(CwiseSizes); BENCHMARK(BM_Tanh)->Apply(CwiseSizes); BENCHMARK(BM_Sigmoid)->Apply(CwiseSizes); BENCHMARK(BM_ReLU)->Apply(CwiseSizes); BENCHMARK(BM_Sqrt)->Apply(CwiseSizes); BENCHMARK(BM_Add)->Apply(CwiseSizes); BENCHMARK(BM_Mul)->Apply(CwiseSizes); BENCHMARK(BM_FMA)->Apply(CwiseSizes); BENCHMARK(BM_ReLU_Rank4)->Apply(Rank4Sizes);