// unsupported/benchmarks/Tensor/bench_coefficient_wise.cpp

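// Benchmarks for Eigen Tensor coefficient-wise operations.
// Covers unary math/activation functions (exp, log, tanh, sigmoid, ReLU, sqrt)
// and element-wise arithmetic (add, multiply, multiply-add) on rank-2 tensors,
// plus ReLU on rank-4 tensors.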

#include <benchmark/benchmark.h>
#include <unsupported/Eigen/CXX11/Tensor>

using namespace Eigen;

typedef float Scalar;

// Macro to define a benchmark for a unary tensor operation.
//
// Expands to a function `BM_<NAME>(benchmark::State&)`.
//
//   NAME  suffix of the generated function name (BM_##NAME).
//   EXPR  expression assigned to the output tensor `b` on every iteration. It
//         is pasted into the function body, so it can refer to the rank-2
//         input tensor `a`, which has shape (M, N) and is filled once by
//         setRandom() before the timed loop.
//
// M and N are read from the benchmark's range arguments 0 and 1.
// DoNotOptimize(b.data()) and ClobberMemory() follow each assignment; they are
// intended to keep the compiler from discarding the computed result.
// Bytes processed are reported as iterations * M * N * sizeof(Scalar) * 2
// (presumably one tensor-sized read of `a` plus one tensor-sized write of `b`).
//
// NOTE: the body is a single line-continued macro, so `//` comments must not be
// placed inside it.
#define BENCH_TENSOR_UNARY(NAME, EXPR)                                        \
  static void BM_##NAME(benchmark::State& state) {                            \
    const int M = state.range(0);                                             \
    const int N = state.range(1);                                             \
    Tensor<Scalar, 2> a(M, N);                                                \
    a.setRandom();                                                            \
    Tensor<Scalar, 2> b(M, N);                                                \
    for (auto _ : state) {                                                    \
      b = EXPR;                                                               \
      benchmark::DoNotOptimize(b.data());                                     \
      benchmark::ClobberMemory();                                             \
    }                                                                         \
    state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar) * 2); \
  }

// Unary benchmarks generated with BENCH_TENSOR_UNARY. In each expression `a`
// names the input tensor declared inside the macro body.
BENCH_TENSOR_UNARY(Exp, a.exp())
// Log is applied to a.abs(), so the timed expression includes the abs() too
// (presumably to keep the logarithm's argument non-negative).
BENCH_TENSOR_UNARY(Log, a.abs().log())
BENCH_TENSOR_UNARY(Tanh, a.tanh())
BENCH_TENSOR_UNARY(Sigmoid, a.sigmoid())
// ReLU written as a coefficient-wise maximum with zero.
BENCH_TENSOR_UNARY(ReLU, a.cwiseMax(Scalar(0)))
// Sqrt is applied to a.abs(), same as Log above.
BENCH_TENSOR_UNARY(Sqrt, a.abs().sqrt())

// --- Element-wise binary operations ---
// Times the coefficient-wise sum of two rank-2 tensors.
// Range arguments: (0) first dimension, (1) second dimension.
static void BM_Add(benchmark::State& state) {
  const int rows = state.range(0);
  const int cols = state.range(1);

  // Inputs are filled once, outside the timed loop.
  Tensor<Scalar, 2> lhs(rows, cols);
  Tensor<Scalar, 2> rhs(rows, cols);
  Tensor<Scalar, 2> sum(rows, cols);
  lhs.setRandom();
  rhs.setRandom();

  for (auto _ : state) {
    sum = lhs + rhs;
    benchmark::DoNotOptimize(sum.data());
    benchmark::ClobberMemory();
  }

  // Three tensor-sized buffers are counted per iteration: two inputs and the
  // output.
  state.SetBytesProcessed(state.iterations() * rows * cols * sizeof(Scalar) * 3);
}

// Times the coefficient-wise product of two rank-2 tensors.
// Range arguments: (0) first dimension, (1) second dimension.
static void BM_Mul(benchmark::State& state) {
  const int rows = state.range(0);
  const int cols = state.range(1);

  // Inputs are filled once, outside the timed loop.
  Tensor<Scalar, 2> lhs(rows, cols);
  Tensor<Scalar, 2> rhs(rows, cols);
  Tensor<Scalar, 2> product(rows, cols);
  lhs.setRandom();
  rhs.setRandom();

  for (auto _ : state) {
    product = lhs * rhs;
    benchmark::DoNotOptimize(product.data());
    benchmark::ClobberMemory();
  }

  // Three tensor-sized buffers are counted per iteration: two inputs and the
  // output.
  state.SetBytesProcessed(state.iterations() * rows * cols * sizeof(Scalar) * 3);
}

// --- Fused multiply-add ---
// Times the single expression `x * y + z` over three rank-2 input tensors.
// Range arguments: (0) first dimension, (1) second dimension.
static void BM_FMA(benchmark::State& state) {
  const int rows = state.range(0);
  const int cols = state.range(1);

  // Inputs are filled once, outside the timed loop.
  Tensor<Scalar, 2> factor_a(rows, cols);
  Tensor<Scalar, 2> factor_b(rows, cols);
  Tensor<Scalar, 2> addend(rows, cols);
  Tensor<Scalar, 2> result(rows, cols);
  factor_a.setRandom();
  factor_b.setRandom();
  addend.setRandom();

  for (auto _ : state) {
    result = factor_a * factor_b + addend;
    benchmark::DoNotOptimize(result.data());
    benchmark::ClobberMemory();
  }

  // Four tensor-sized buffers are counted per iteration: three inputs and the
  // output.
  state.SetBytesProcessed(state.iterations() * rows * cols * sizeof(Scalar) * 4);
}

// --- Rank-4 coefficient-wise (CNN feature maps) ---
// Times ReLU (coefficient-wise max with zero) on a rank-4 tensor.
// Range arguments: (0) first dimension, (1) second dimension, (2) the extent
// used for both the third and fourth dimensions.
static void BM_ReLU_Rank4(benchmark::State& state) {
  const int n = state.range(0);
  const int channels = state.range(1);
  const int side = state.range(2);

  // Input is filled once, outside the timed loop.
  Tensor<Scalar, 4> input(n, channels, side, side);
  Tensor<Scalar, 4> output(n, channels, side, side);
  input.setRandom();

  for (auto _ : state) {
    output = input.cwiseMax(Scalar(0));
    benchmark::DoNotOptimize(output.data());
    benchmark::ClobberMemory();
  }

  // Two tensor-sized buffers are counted per iteration: the input and the
  // output.
  state.SetBytesProcessed(state.iterations() * n * channels * side * side * sizeof(Scalar) * 2);
}

// Argument generator for the rank-2 benchmarks: registers the square shapes
// 256 x 256 and 1024 x 1024, in that order.
static void CwiseSizes(::benchmark::Benchmark* b) {
  b->Args({256, 256});
  b->Args({1024, 1024});
}

// Argument generator for the rank-4 benchmark. Each triple supplies range
// arguments 0, 1 and 2 as read by BM_ReLU_Rank4.
static void Rank4Sizes(::benchmark::Benchmark* b) {
  b->Args({32, 64, 16})
      ->Args({8, 128, 32})
      ->Args({1, 256, 64});
}

// Benchmark registration. The rank-2 benchmarks take their (M, N) arguments
// from CwiseSizes; the rank-4 ReLU benchmark takes its three arguments from
// Rank4Sizes.
BENCHMARK(BM_Exp)->Apply(CwiseSizes);
BENCHMARK(BM_Log)->Apply(CwiseSizes);
BENCHMARK(BM_Tanh)->Apply(CwiseSizes);
BENCHMARK(BM_Sigmoid)->Apply(CwiseSizes);
BENCHMARK(BM_ReLU)->Apply(CwiseSizes);
BENCHMARK(BM_Sqrt)->Apply(CwiseSizes);
BENCHMARK(BM_Add)->Apply(CwiseSizes);
BENCHMARK(BM_Mul)->Apply(CwiseSizes);
BENCHMARK(BM_FMA)->Apply(CwiseSizes);
BENCHMARK(BM_ReLU_Rank4)->Apply(Rank4Sizes);
Add benchmarks for unsupported modules and extend supported benchmarks libeigen/eigen!2179 Closes #3036 Co-authored-by: Rasmus Munk Larsen <rmlarsen@gmail.com> 2026-02-24 17:12:33 -08:00			`// Benchmarks for Eigen Tensor coefficient-wise operations.`
			`// Covers activation functions, normalization, and element-wise arithmetic.`

			`#include <benchmark/benchmark.h>`
			`#include <unsupported/Eigen/CXX11/Tensor>`

			`using namespace Eigen;`

			`typedef float Scalar;`

			`// Macro to define a benchmark for a unary tensor operation.`
			`#define BENCH_TENSOR_UNARY(NAME, EXPR) \`
			`static void BM_##NAME(benchmark::State& state) { \`
			`const int M = state.range(0); \`
			`const int N = state.range(1); \`
			`Tensor<Scalar, 2> a(M, N); \`
			`a.setRandom(); \`
			`Tensor<Scalar, 2> b(M, N); \`
			`for (auto _ : state) { \`
			`b = EXPR; \`
			`benchmark::DoNotOptimize(b.data()); \`
			`benchmark::ClobberMemory(); \`
			`} \`
			`state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar) * 2); \`
			`}`

			`BENCH_TENSOR_UNARY(Exp, a.exp())`
			`BENCH_TENSOR_UNARY(Log, a.abs().log())`
			`BENCH_TENSOR_UNARY(Tanh, a.tanh())`
			`BENCH_TENSOR_UNARY(Sigmoid, a.sigmoid())`
			`BENCH_TENSOR_UNARY(ReLU, a.cwiseMax(Scalar(0)))`
			`BENCH_TENSOR_UNARY(Sqrt, a.abs().sqrt())`

			`// --- Element-wise binary operations ---`
			`static void BM_Add(benchmark::State& state) {`
			`const int M = state.range(0);`
			`const int N = state.range(1);`

			`Tensor<Scalar, 2> a(M, N);`
			`Tensor<Scalar, 2> b(M, N);`
			`Tensor<Scalar, 2> c(M, N);`
			`a.setRandom();`
			`b.setRandom();`

			`for (auto _ : state) {`
			`c = a + b;`
			`benchmark::DoNotOptimize(c.data());`
			`benchmark::ClobberMemory();`
			`}`
			`state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar) * 3);`
			`}`

			`static void BM_Mul(benchmark::State& state) {`
			`const int M = state.range(0);`
			`const int N = state.range(1);`

			`Tensor<Scalar, 2> a(M, N);`
			`Tensor<Scalar, 2> b(M, N);`
			`Tensor<Scalar, 2> c(M, N);`
			`a.setRandom();`
			`b.setRandom();`

			`for (auto _ : state) {`
			`c = a * b;`
			`benchmark::DoNotOptimize(c.data());`
			`benchmark::ClobberMemory();`
			`}`
			`state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar) * 3);`
			`}`

			`// --- Fused multiply-add ---`
			`static void BM_FMA(benchmark::State& state) {`
			`const int M = state.range(0);`
			`const int N = state.range(1);`

			`Tensor<Scalar, 2> a(M, N);`
			`Tensor<Scalar, 2> b(M, N);`
			`Tensor<Scalar, 2> c(M, N);`
			`Tensor<Scalar, 2> d(M, N);`
			`a.setRandom();`
			`b.setRandom();`
			`c.setRandom();`

			`for (auto _ : state) {`
			`d = a * b + c;`
			`benchmark::DoNotOptimize(d.data());`
			`benchmark::ClobberMemory();`
			`}`
			`state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar) * 4);`
			`}`

			`// --- Rank-4 coefficient-wise (CNN feature maps) ---`
			`static void BM_ReLU_Rank4(benchmark::State& state) {`
			`const int batch = state.range(0);`
			`const int C = state.range(1);`
			`const int H = state.range(2);`

			`Tensor<Scalar, 4> a(batch, C, H, H);`
			`Tensor<Scalar, 4> b(batch, C, H, H);`
			`a.setRandom();`

			`for (auto _ : state) {`
			`b = a.cwiseMax(Scalar(0));`
			`benchmark::DoNotOptimize(b.data());`
			`benchmark::ClobberMemory();`
			`}`
			`state.SetBytesProcessed(state.iterations() * batch * C * H * H * sizeof(Scalar) * 2);`
			`}`

			`static void CwiseSizes(::benchmark::Benchmark* b) {`
			`for (int size : {256, 1024}) {`
			`b->Args({size, size});`
			`}`
			`}`

			`static void Rank4Sizes(::benchmark::Benchmark* b) {`
			`b->Args({32, 64, 16});`
			`b->Args({8, 128, 32});`
			`b->Args({1, 256, 64});`
			`}`

			`BENCHMARK(BM_Exp)->Apply(CwiseSizes);`
			`BENCHMARK(BM_Log)->Apply(CwiseSizes);`
			`BENCHMARK(BM_Tanh)->Apply(CwiseSizes);`
			`BENCHMARK(BM_Sigmoid)->Apply(CwiseSizes);`
			`BENCHMARK(BM_ReLU)->Apply(CwiseSizes);`
			`BENCHMARK(BM_Sqrt)->Apply(CwiseSizes);`
			`BENCHMARK(BM_Add)->Apply(CwiseSizes);`
			`BENCHMARK(BM_Mul)->Apply(CwiseSizes);`
			`BENCHMARK(BM_FMA)->Apply(CwiseSizes);`
			`BENCHMARK(BM_ReLU_Rank4)->Apply(Rank4Sizes);`