// Benchmarks for Eigen Tensor broadcasting. // Tests broadcasting along various dimensions and ranks. #include #include using namespace Eigen; typedef float Scalar; // --- Broadcast row vector {1,N} -> {M,N} --- static void BM_BroadcastRow(benchmark::State& state) { const int M = state.range(0); const int N = state.range(1); Tensor row(1, N); Tensor result(M, N); row.setRandom(); Eigen::array bcast = {M, 1}; for (auto _ : state) { result = row.broadcast(bcast); benchmark::DoNotOptimize(result.data()); benchmark::ClobberMemory(); } state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar)); } // --- Broadcast col vector {M,1} -> {M,N} --- static void BM_BroadcastCol(benchmark::State& state) { const int M = state.range(0); const int N = state.range(1); Tensor col(M, 1); Tensor result(M, N); col.setRandom(); Eigen::array bcast = {1, N}; for (auto _ : state) { result = col.broadcast(bcast); benchmark::DoNotOptimize(result.data()); benchmark::ClobberMemory(); } state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar)); } // --- Broadcast + element-wise add (bias addition pattern) --- static void BM_BroadcastAdd(benchmark::State& state) { const int M = state.range(0); const int N = state.range(1); Tensor mat(M, N); Tensor bias(1, N); Tensor result(M, N); mat.setRandom(); bias.setRandom(); Eigen::array bcast = {M, 1}; for (auto _ : state) { result = mat + bias.broadcast(bcast); benchmark::DoNotOptimize(result.data()); benchmark::ClobberMemory(); } state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar) * 2); } // --- Rank-4 broadcast (batch x channels x 1 x 1) -> (batch x channels x H x W) --- static void BM_BroadcastRank4(benchmark::State& state) { const int batch = state.range(0); const int C = state.range(1); const int H = state.range(2); Tensor bias(batch, C, 1, 1); Tensor result(batch, C, H, H); bias.setRandom(); Eigen::array bcast = {1, 1, H, H}; for (auto _ : state) { result = bias.broadcast(bcast); benchmark::DoNotOptimize(result.data()); benchmark::ClobberMemory(); } state.SetBytesProcessed(state.iterations() * batch * C * H * H * sizeof(Scalar)); } static void BroadcastSizes(::benchmark::Benchmark* b) { for (int m : {64, 256, 1024}) { for (int n : {64, 256, 1024}) { b->Args({m, n}); } } } static void Rank4Sizes(::benchmark::Benchmark* b) { for (int batch : {1, 8}) { for (int c : {64, 256}) { for (int h : {16, 32}) { b->Args({batch, c, h}); } } } } BENCHMARK(BM_BroadcastRow)->Apply(BroadcastSizes); BENCHMARK(BM_BroadcastCol)->Apply(BroadcastSizes); BENCHMARK(BM_BroadcastAdd)->Apply(BroadcastSizes); BENCHMARK(BM_BroadcastRank4)->Apply(Rank4Sizes);