diff --git a/benchmarks/benchGemv.cpp b/benchmarks/benchGemv.cpp index cc03973a6..ad994d50a 100644 --- a/benchmarks/benchGemv.cpp +++ b/benchmarks/benchGemv.cpp @@ -117,23 +117,23 @@ static void BM_GemvAdj(benchmark::State& state) { static void GemvSizes(::benchmark::Benchmark* b) { // Square matrices: exercises balanced kernel behavior. - for (int size : {8, 16, 32, 64, 128, 256, 512, 1024, 4096}) { + for (int size : {8, 32, 128, 512, 1024}) { b->Args({size, size}); } // Tall-thin (m >> n): in ColMajor kernel, the inner vectorized loop over rows // is long while the outer column loop is short. In RowMajor kernel (transpose), // there are many rows to process but short dot products. - for (int n : {1, 4, 16, 64}) { - for (int m : {256, 1024, 4096}) { - if (m != n) b->Args({m, n}); + for (int n : {1, 16}) { + for (int m : {256, 1024}) { + b->Args({m, n}); } } // Short-wide (m << n): in ColMajor kernel, the outer column loop is long but // the inner vectorized loop over rows is short. In RowMajor kernel (transpose), // there are few rows but long dot products. - for (int m : {1, 4, 16, 64}) { - for (int n : {256, 1024, 4096}) { - if (m != n) b->Args({m, n}); + for (int m : {1, 16}) { + for (int n : {256, 1024}) { + b->Args({m, n}); } } } @@ -149,12 +149,9 @@ BENCHMARK(BM_GemvTrans)->Apply(GemvSizes)->Name("GemvTrans_float"); BENCHMARK(BM_GemvTrans)->Apply(GemvSizes)->Name("GemvTrans_double"); // Complex types: all four variants exercise distinct kernel code paths. +// Only cfloat is benchmarked since cdouble exercises the same paths but slower. BENCHMARK(BM_Gemv>)->Apply(GemvSizes)->Name("Gemv_cfloat"); -BENCHMARK(BM_Gemv>)->Apply(GemvSizes)->Name("Gemv_cdouble"); BENCHMARK(BM_GemvTrans>)->Apply(GemvSizes)->Name("GemvTrans_cfloat"); -BENCHMARK(BM_GemvTrans>)->Apply(GemvSizes)->Name("GemvTrans_cdouble"); BENCHMARK(BM_GemvConj>)->Apply(GemvSizes)->Name("GemvConj_cfloat"); -BENCHMARK(BM_GemvConj>)->Apply(GemvSizes)->Name("GemvConj_cdouble"); BENCHMARK(BM_GemvAdj>)->Apply(GemvSizes)->Name("GemvAdj_cfloat"); -BENCHMARK(BM_GemvAdj>)->Apply(GemvSizes)->Name("GemvAdj_cdouble"); diff --git a/benchmarks/bench_trsm.cpp b/benchmarks/bench_trsm.cpp index 3da3a90c9..7b050b53d 100644 --- a/benchmarks/bench_trsm.cpp +++ b/benchmarks/bench_trsm.cpp @@ -64,36 +64,31 @@ static void BM_TRSM_Right(benchmark::State& state) { // ---------- Size configurations ---------- static void TrsvSizes(::benchmark::Benchmark* b) { - for (int n : {32, 64, 128, 256, 512, 1024}) { + for (int n : {32, 128, 512}) { b->Args({n}); } } static void TrsmSizes(::benchmark::Benchmark* b) { - for (int n : {32, 64, 128, 256, 512, 1024}) { - for (int nrhs : {1, 4, 16, 64, 256}) { + for (int n : {64, 256, 512}) { + for (int nrhs : {1, 16, 64}) { b->Args({n, nrhs}); } } } // ---------- TRSV benchmarks ---------- +// Only Lower is benchmarked; Upper exercises the same kernel via transposed storage. BENCHMARK(BM_TRSV)->Apply(TrsvSizes)->Name("TRSV_float_Lower"); -BENCHMARK(BM_TRSV)->Apply(TrsvSizes)->Name("TRSV_float_Upper"); BENCHMARK(BM_TRSV)->Apply(TrsvSizes)->Name("TRSV_double_Lower"); -BENCHMARK(BM_TRSV)->Apply(TrsvSizes)->Name("TRSV_double_Upper"); // ---------- TRSM Left benchmarks ---------- BENCHMARK(BM_TRSM_Left)->Apply(TrsmSizes)->Name("TRSM_Left_float_Lower"); -BENCHMARK(BM_TRSM_Left)->Apply(TrsmSizes)->Name("TRSM_Left_float_Upper"); BENCHMARK(BM_TRSM_Left)->Apply(TrsmSizes)->Name("TRSM_Left_double_Lower"); -BENCHMARK(BM_TRSM_Left)->Apply(TrsmSizes)->Name("TRSM_Left_double_Upper"); // ---------- TRSM Right benchmarks ---------- BENCHMARK(BM_TRSM_Right)->Apply(TrsmSizes)->Name("TRSM_Right_float_Lower"); -BENCHMARK(BM_TRSM_Right)->Apply(TrsmSizes)->Name("TRSM_Right_float_Upper"); BENCHMARK(BM_TRSM_Right)->Apply(TrsmSizes)->Name("TRSM_Right_double_Lower"); -BENCHMARK(BM_TRSM_Right)->Apply(TrsmSizes)->Name("TRSM_Right_double_Upper");