benchmarks/GPU/bench_gpu_solvers.cpp

// GPU solver benchmarks: GpuLLT and GpuLU compute + solve throughput.
//
// Measures factorization and solve performance for the host-matrix and
// DeviceMatrix code paths across a range of matrix sizes.
//
// For Nsight Systems profiling:
//   nsys profile --trace=cuda,nvtx ./bench_gpu_solvers
//
// For Nsight Compute kernel analysis:
//   ncu --set full -o profile ./bench_gpu_solvers --benchmark_filter=BM_GpuLLT_Compute/4096

#include <benchmark/benchmark.h>

#include <Eigen/Cholesky>
#include <Eigen/GPU>
#include <Eigen/LU>

using namespace Eigen;

#ifndef SCALAR
#define SCALAR double
#endif

using Scalar = SCALAR;
using Mat = Matrix<Scalar, Dynamic, Dynamic>;

// --------------------------------------------------------------------------
// Helpers
// --------------------------------------------------------------------------

static Mat make_spd(Index n) {
  Mat M = Mat::Random(n, n);
  return M.adjoint() * M + Mat::Identity(n, n) * static_cast<Scalar>(n);
}

// CUDA warm-up: ensure the GPU is initialized before timing.
static void cuda_warmup() {
  static bool done = false;
  if (!done) {
    void* p;
    cudaMalloc(&p, 1);
    cudaFree(p);
    done = true;
  }
}

// --------------------------------------------------------------------------
// GpuLLT benchmarks
// --------------------------------------------------------------------------

// Factorize from host matrix (includes H2D upload).
static void BM_GpuLLT_Compute_Host(benchmark::State& state) {
  cuda_warmup();
  const Index n = state.range(0);
  Mat A = make_spd(n);
  GpuLLT<Scalar> llt;

  for (auto _ : state) {
    llt.compute(A);
    if (llt.info() != Success) state.SkipWithError("factorization failed");
  }

  double flops = static_cast<double>(n) * static_cast<double>(n) * static_cast<double>(n) / 3.0;
  state.counters["GFLOPS"] =
      benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
  state.counters["n"] = n;
}

// Factorize from DeviceMatrix (D2D copy path).
static void BM_GpuLLT_Compute_Device(benchmark::State& state) {
  cuda_warmup();
  const Index n = state.range(0);
  Mat A = make_spd(n);
  auto d_A = DeviceMatrix<Scalar>::fromHost(A);
  GpuLLT<Scalar> llt;

  for (auto _ : state) {
    llt.compute(d_A);
    if (llt.info() != Success) state.SkipWithError("factorization failed");
  }

  double flops = static_cast<double>(n) * static_cast<double>(n) * static_cast<double>(n) / 3.0;
  state.counters["GFLOPS"] =
      benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
  state.counters["n"] = n;
}

// Factorize from DeviceMatrix (move path, no copy).
static void BM_GpuLLT_Compute_DeviceMove(benchmark::State& state) {
  cuda_warmup();
  const Index n = state.range(0);
  Mat A = make_spd(n);
  GpuLLT<Scalar> llt;

  for (auto _ : state) {
    auto d_A = DeviceMatrix<Scalar>::fromHost(A);
    llt.compute(std::move(d_A));
    if (llt.info() != Success) state.SkipWithError("factorization failed");
  }

  double flops = static_cast<double>(n) * static_cast<double>(n) * static_cast<double>(n) / 3.0;
  state.counters["GFLOPS"] =
      benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
  state.counters["n"] = n;
}

// Solve from host matrix (H2D + potrs + D2H).
static void BM_GpuLLT_Solve_Host(benchmark::State& state) {
  cuda_warmup();
  const Index n = state.range(0);
  const Index nrhs = state.range(1);
  Mat A = make_spd(n);
  Mat B = Mat::Random(n, nrhs);
  GpuLLT<Scalar> llt(A);

  for (auto _ : state) {
    Mat X = llt.solve(B);
    benchmark::DoNotOptimize(X.data());
  }

  state.counters["n"] = n;
  state.counters["nrhs"] = nrhs;
}

// Solve from DeviceMatrix (D2D + potrs, async, toHost at end).
static void BM_GpuLLT_Solve_Device(benchmark::State& state) {
  cuda_warmup();
  const Index n = state.range(0);
  const Index nrhs = state.range(1);
  Mat A = make_spd(n);
  Mat B = Mat::Random(n, nrhs);
  GpuLLT<Scalar> llt(A);
  auto d_B = DeviceMatrix<Scalar>::fromHost(B);

  for (auto _ : state) {
    DeviceMatrix<Scalar> d_X = llt.solve(d_B);
    Mat X = d_X.toHost();
    benchmark::DoNotOptimize(X.data());
  }

  state.counters["n"] = n;
  state.counters["nrhs"] = nrhs;
}

// Solve staying entirely on device (no toHost — measures pure GPU time).
static void BM_GpuLLT_Solve_DeviceOnly(benchmark::State& state) {
  cuda_warmup();
  const Index n = state.range(0);
  const Index nrhs = state.range(1);
  Mat A = make_spd(n);
  Mat B = Mat::Random(n, nrhs);
  GpuLLT<Scalar> llt(A);
  auto d_B = DeviceMatrix<Scalar>::fromHost(B);

  for (auto _ : state) {
    DeviceMatrix<Scalar> d_X = llt.solve(d_B);
    // Force completion without D2H transfer.
    cudaStreamSynchronize(llt.stream());
    benchmark::DoNotOptimize(d_X.data());
  }

  state.counters["n"] = n;
  state.counters["nrhs"] = nrhs;
}

// --------------------------------------------------------------------------
// GpuLU benchmarks
// --------------------------------------------------------------------------

static void BM_GpuLU_Compute_Host(benchmark::State& state) {
  cuda_warmup();
  const Index n = state.range(0);
  Mat A = Mat::Random(n, n);
  GpuLU<Scalar> lu;

  for (auto _ : state) {
    lu.compute(A);
    if (lu.info() != Success) state.SkipWithError("factorization failed");
  }

  double flops = 2.0 / 3.0 * static_cast<double>(n) * static_cast<double>(n) * static_cast<double>(n);
  state.counters["GFLOPS"] =
      benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
  state.counters["n"] = n;
}

static void BM_GpuLU_Compute_Device(benchmark::State& state) {
  cuda_warmup();
  const Index n = state.range(0);
  Mat A = Mat::Random(n, n);
  auto d_A = DeviceMatrix<Scalar>::fromHost(A);
  GpuLU<Scalar> lu;

  for (auto _ : state) {
    lu.compute(d_A);
    if (lu.info() != Success) state.SkipWithError("factorization failed");
  }

  double flops = 2.0 / 3.0 * static_cast<double>(n) * static_cast<double>(n) * static_cast<double>(n);
  state.counters["GFLOPS"] =
      benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
  state.counters["n"] = n;
}

static void BM_GpuLU_Solve_Host(benchmark::State& state) {
  cuda_warmup();
  const Index n = state.range(0);
  const Index nrhs = state.range(1);
  Mat A = Mat::Random(n, n);
  Mat B = Mat::Random(n, nrhs);
  GpuLU<Scalar> lu(A);

  for (auto _ : state) {
    Mat X = lu.solve(B);
    benchmark::DoNotOptimize(X.data());
  }

  state.counters["n"] = n;
  state.counters["nrhs"] = nrhs;
}

static void BM_GpuLU_Solve_Device(benchmark::State& state) {
  cuda_warmup();
  const Index n = state.range(0);
  const Index nrhs = state.range(1);
  Mat A = Mat::Random(n, n);
  Mat B = Mat::Random(n, nrhs);
  GpuLU<Scalar> lu(A);
  auto d_B = DeviceMatrix<Scalar>::fromHost(B);

  for (auto _ : state) {
    DeviceMatrix<Scalar> d_X = lu.solve(d_B);
    Mat X = d_X.toHost();
    benchmark::DoNotOptimize(X.data());
  }

  state.counters["n"] = n;
  state.counters["nrhs"] = nrhs;
}

// --------------------------------------------------------------------------
// CPU baselines for comparison
// --------------------------------------------------------------------------

static void BM_CpuLLT_Compute(benchmark::State& state) {
  const Index n = state.range(0);
  Mat A = make_spd(n);
  LLT<Mat> llt;

  for (auto _ : state) {
    llt.compute(A);
    benchmark::DoNotOptimize(llt.matrixLLT().data());
  }

  double flops = static_cast<double>(n) * static_cast<double>(n) * static_cast<double>(n) / 3.0;
  state.counters["GFLOPS"] =
      benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
  state.counters["n"] = n;
}

static void BM_CpuLU_Compute(benchmark::State& state) {
  const Index n = state.range(0);
  Mat A = Mat::Random(n, n);
  PartialPivLU<Mat> lu;

  for (auto _ : state) {
    lu.compute(A);
    benchmark::DoNotOptimize(lu.matrixLU().data());
  }

  double flops = 2.0 / 3.0 * static_cast<double>(n) * static_cast<double>(n) * static_cast<double>(n);
  state.counters["GFLOPS"] =
      benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
  state.counters["n"] = n;
}

// --------------------------------------------------------------------------
// Registration
// --------------------------------------------------------------------------

// clang-format off
BENCHMARK(BM_GpuLLT_Compute_Host)->ArgsProduct({{64, 128, 256, 512, 1024, 2048, 4096}})->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_GpuLLT_Compute_Device)->ArgsProduct({{64, 128, 256, 512, 1024, 2048, 4096}})->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_GpuLLT_Compute_DeviceMove)->ArgsProduct({{64, 128, 256, 512, 1024, 2048, 4096}})->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_GpuLLT_Solve_Host)->ArgsProduct({{64, 256, 1024, 4096}, {1, 16}})->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_GpuLLT_Solve_Device)->ArgsProduct({{64, 256, 1024, 4096}, {1, 16}})->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_GpuLLT_Solve_DeviceOnly)->ArgsProduct({{64, 256, 1024, 4096}, {1, 16}})->Unit(benchmark::kMicrosecond);

BENCHMARK(BM_GpuLU_Compute_Host)->ArgsProduct({{64, 128, 256, 512, 1024, 2048, 4096}})->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_GpuLU_Compute_Device)->ArgsProduct({{64, 128, 256, 512, 1024, 2048, 4096}})->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_GpuLU_Solve_Host)->ArgsProduct({{64, 256, 1024, 4096}, {1, 16}})->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_GpuLU_Solve_Device)->ArgsProduct({{64, 256, 1024, 4096}, {1, 16}})->Unit(benchmark::kMicrosecond);

BENCHMARK(BM_CpuLLT_Compute)->ArgsProduct({{64, 128, 256, 512, 1024, 2048, 4096}})->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_CpuLU_Compute)->ArgsProduct({{64, 128, 256, 512, 1024, 2048, 4096}})->Unit(benchmark::kMicrosecond);
// clang-format on
GPU: Add library dispatch module (DeviceMatrix, cuBLAS, cuSOLVER) Add Eigen/GPU module: A standalone GPU library dispatch layer where DeviceMatrix<Scalar> operations map 1:1 to cuBLAS/cuSOLVER calls. CPU and GPU solvers coexist in the same binary with compatible syntax. Core infrastructure: - DeviceMatrix<Scalar>: RAII dense column-major GPU memory wrapper with async host transfer (fromHost/toHost) and CUDA event-based cross-stream synchronization. - GpuContext: Unified execution context owning a CUDA stream + cuBLAS handle + cuSOLVER handle. Thread-local default with explicit override via setThreadLocal(). Stream-borrowing constructor for integration. - DeviceBuffer: Typed RAII device allocation with move semantics. cuBLAS dispatch (expression syntax): - GEMM: d_C = d_A.adjoint() * d_B (cublasXgemm) - TRSM: d_X = d_A.triangularView<Lower>().solve(d_B) (cublasXtrsm) - SYMM/HEMM: d_C = d_A.selfadjointView<Lower>() * d_B (cublasXsymm) - SYRK/HERK: d_C = d_A * d_A.adjoint() (cublasXsyrk) cuSOLVER dispatch: - GpuLLT: Cached Cholesky factorization (cusolverDnXpotrf + Xpotrs) - GpuLU: Cached LU factorization (cusolverDnXgetrf + Xgetrs) - Solver chaining: auto x = d_A.llt().solve(d_B) - Solver expressions with .device(ctx) for explicit stream control. CI: Bump CUDA container to Ubuntu 22.04 (CMake 3.22), GCC 10->11, Clang 12->14. Bump cmake_minimum_required to 3.17 for FindCUDAToolkit. Tests: gpu_cublas.cpp, gpu_cusolver_llt.cpp, gpu_cusolver_lu.cpp, gpu_device_matrix.cpp, gpu_library_example.cu Benchmarks: bench_gpu_solvers.cpp, bench_gpu_chaining.cpp, bench_gpu_batching.cpp 2026-04-09 16:15:39 -07:00			`// GPU solver benchmarks: GpuLLT and GpuLU compute + solve throughput.`
			`//`
			`// Measures factorization and solve performance for the host-matrix and`
			`// DeviceMatrix code paths across a range of matrix sizes.`
			`//`
			`// For Nsight Systems profiling:`
			`// nsys profile --trace=cuda,nvtx ./bench_gpu_solvers`
			`//`
			`// For Nsight Compute kernel analysis:`
			`// ncu --set full -o profile ./bench_gpu_solvers --benchmark_filter=BM_GpuLLT_Compute/4096`

			`#include <benchmark/benchmark.h>`

			`#include <Eigen/Cholesky>`
			`#include <Eigen/GPU>`
			`#include <Eigen/LU>`

			`using namespace Eigen;`

			`#ifndef SCALAR`
			`#define SCALAR double`
			`#endif`

			`using Scalar = SCALAR;`
			`using Mat = Matrix<Scalar, Dynamic, Dynamic>;`

			`// --------------------------------------------------------------------------`
			`// Helpers`
			`// --------------------------------------------------------------------------`

			`static Mat make_spd(Index n) {`
			`Mat M = Mat::Random(n, n);`
			`return M.adjoint() * M + Mat::Identity(n, n) * static_cast<Scalar>(n);`
			`}`

			`// CUDA warm-up: ensure the GPU is initialized before timing.`
			`static void cuda_warmup() {`
			`static bool done = false;`
			`if (!done) {`
			`void* p;`
			`cudaMalloc(&p, 1);`
			`cudaFree(p);`
			`done = true;`
			`}`
			`}`

			`// --------------------------------------------------------------------------`
			`// GpuLLT benchmarks`
			`// --------------------------------------------------------------------------`

			`// Factorize from host matrix (includes H2D upload).`
			`static void BM_GpuLLT_Compute_Host(benchmark::State& state) {`
			`cuda_warmup();`
			`const Index n = state.range(0);`
			`Mat A = make_spd(n);`
			`GpuLLT<Scalar> llt;`

			`for (auto _ : state) {`
			`llt.compute(A);`
			`if (llt.info() != Success) state.SkipWithError("factorization failed");`
			`}`

			`double flops = static_cast<double>(n) * static_cast<double>(n) * static_cast<double>(n) / 3.0;`
			`state.counters["GFLOPS"] =`
			`benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);`
			`state.counters["n"] = n;`
			`}`

			`// Factorize from DeviceMatrix (D2D copy path).`
			`static void BM_GpuLLT_Compute_Device(benchmark::State& state) {`
			`cuda_warmup();`
			`const Index n = state.range(0);`
			`Mat A = make_spd(n);`
			`auto d_A = DeviceMatrix<Scalar>::fromHost(A);`
			`GpuLLT<Scalar> llt;`

			`for (auto _ : state) {`
			`llt.compute(d_A);`
			`if (llt.info() != Success) state.SkipWithError("factorization failed");`
			`}`

			`double flops = static_cast<double>(n) * static_cast<double>(n) * static_cast<double>(n) / 3.0;`
			`state.counters["GFLOPS"] =`
			`benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);`
			`state.counters["n"] = n;`
			`}`

			`// Factorize from DeviceMatrix (move path, no copy).`
			`static void BM_GpuLLT_Compute_DeviceMove(benchmark::State& state) {`
			`cuda_warmup();`
			`const Index n = state.range(0);`
			`Mat A = make_spd(n);`
			`GpuLLT<Scalar> llt;`

			`for (auto _ : state) {`
			`auto d_A = DeviceMatrix<Scalar>::fromHost(A);`
			`llt.compute(std::move(d_A));`
			`if (llt.info() != Success) state.SkipWithError("factorization failed");`
			`}`

			`double flops = static_cast<double>(n) * static_cast<double>(n) * static_cast<double>(n) / 3.0;`
			`state.counters["GFLOPS"] =`
			`benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);`
			`state.counters["n"] = n;`
			`}`

			`// Solve from host matrix (H2D + potrs + D2H).`
			`static void BM_GpuLLT_Solve_Host(benchmark::State& state) {`
			`cuda_warmup();`
			`const Index n = state.range(0);`
			`const Index nrhs = state.range(1);`
			`Mat A = make_spd(n);`
			`Mat B = Mat::Random(n, nrhs);`
			`GpuLLT<Scalar> llt(A);`

			`for (auto _ : state) {`
			`Mat X = llt.solve(B);`
			`benchmark::DoNotOptimize(X.data());`
			`}`

			`state.counters["n"] = n;`
			`state.counters["nrhs"] = nrhs;`
			`}`

			`// Solve from DeviceMatrix (D2D + potrs, async, toHost at end).`
			`static void BM_GpuLLT_Solve_Device(benchmark::State& state) {`
			`cuda_warmup();`
			`const Index n = state.range(0);`
			`const Index nrhs = state.range(1);`
			`Mat A = make_spd(n);`
			`Mat B = Mat::Random(n, nrhs);`
			`GpuLLT<Scalar> llt(A);`
			`auto d_B = DeviceMatrix<Scalar>::fromHost(B);`

			`for (auto _ : state) {`
			`DeviceMatrix<Scalar> d_X = llt.solve(d_B);`
			`Mat X = d_X.toHost();`
			`benchmark::DoNotOptimize(X.data());`
			`}`

			`state.counters["n"] = n;`
			`state.counters["nrhs"] = nrhs;`
			`}`

			`// Solve staying entirely on device (no toHost — measures pure GPU time).`
			`static void BM_GpuLLT_Solve_DeviceOnly(benchmark::State& state) {`
			`cuda_warmup();`
			`const Index n = state.range(0);`
			`const Index nrhs = state.range(1);`
			`Mat A = make_spd(n);`
			`Mat B = Mat::Random(n, nrhs);`
			`GpuLLT<Scalar> llt(A);`
			`auto d_B = DeviceMatrix<Scalar>::fromHost(B);`

			`for (auto _ : state) {`
			`DeviceMatrix<Scalar> d_X = llt.solve(d_B);`
			`// Force completion without D2H transfer.`
			`cudaStreamSynchronize(llt.stream());`
			`benchmark::DoNotOptimize(d_X.data());`
			`}`

			`state.counters["n"] = n;`
			`state.counters["nrhs"] = nrhs;`
			`}`

			`// --------------------------------------------------------------------------`
			`// GpuLU benchmarks`
			`// --------------------------------------------------------------------------`

			`static void BM_GpuLU_Compute_Host(benchmark::State& state) {`
			`cuda_warmup();`
			`const Index n = state.range(0);`
			`Mat A = Mat::Random(n, n);`
			`GpuLU<Scalar> lu;`

			`for (auto _ : state) {`
			`lu.compute(A);`
			`if (lu.info() != Success) state.SkipWithError("factorization failed");`
			`}`

			`double flops = 2.0 / 3.0 * static_cast<double>(n) * static_cast<double>(n) * static_cast<double>(n);`
			`state.counters["GFLOPS"] =`
			`benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);`
			`state.counters["n"] = n;`
			`}`

			`static void BM_GpuLU_Compute_Device(benchmark::State& state) {`
			`cuda_warmup();`
			`const Index n = state.range(0);`
			`Mat A = Mat::Random(n, n);`
			`auto d_A = DeviceMatrix<Scalar>::fromHost(A);`
			`GpuLU<Scalar> lu;`

			`for (auto _ : state) {`
			`lu.compute(d_A);`
			`if (lu.info() != Success) state.SkipWithError("factorization failed");`
			`}`

			`double flops = 2.0 / 3.0 * static_cast<double>(n) * static_cast<double>(n) * static_cast<double>(n);`
			`state.counters["GFLOPS"] =`
			`benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);`
			`state.counters["n"] = n;`
			`}`

			`static void BM_GpuLU_Solve_Host(benchmark::State& state) {`
			`cuda_warmup();`
			`const Index n = state.range(0);`
			`const Index nrhs = state.range(1);`
			`Mat A = Mat::Random(n, n);`
			`Mat B = Mat::Random(n, nrhs);`
			`GpuLU<Scalar> lu(A);`

			`for (auto _ : state) {`
			`Mat X = lu.solve(B);`
			`benchmark::DoNotOptimize(X.data());`
			`}`

			`state.counters["n"] = n;`
			`state.counters["nrhs"] = nrhs;`
			`}`

			`static void BM_GpuLU_Solve_Device(benchmark::State& state) {`
			`cuda_warmup();`
			`const Index n = state.range(0);`
			`const Index nrhs = state.range(1);`
			`Mat A = Mat::Random(n, n);`
			`Mat B = Mat::Random(n, nrhs);`
			`GpuLU<Scalar> lu(A);`
			`auto d_B = DeviceMatrix<Scalar>::fromHost(B);`

			`for (auto _ : state) {`
			`DeviceMatrix<Scalar> d_X = lu.solve(d_B);`
			`Mat X = d_X.toHost();`
			`benchmark::DoNotOptimize(X.data());`
			`}`

			`state.counters["n"] = n;`
			`state.counters["nrhs"] = nrhs;`
			`}`

			`// --------------------------------------------------------------------------`
			`// CPU baselines for comparison`
			`// --------------------------------------------------------------------------`

			`static void BM_CpuLLT_Compute(benchmark::State& state) {`
			`const Index n = state.range(0);`
			`Mat A = make_spd(n);`
			`LLT<Mat> llt;`

			`for (auto _ : state) {`
			`llt.compute(A);`
			`benchmark::DoNotOptimize(llt.matrixLLT().data());`
			`}`

			`double flops = static_cast<double>(n) * static_cast<double>(n) * static_cast<double>(n) / 3.0;`
			`state.counters["GFLOPS"] =`
			`benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);`
			`state.counters["n"] = n;`
			`}`

			`static void BM_CpuLU_Compute(benchmark::State& state) {`
			`const Index n = state.range(0);`
			`Mat A = Mat::Random(n, n);`
			`PartialPivLU<Mat> lu;`

			`for (auto _ : state) {`
			`lu.compute(A);`
			`benchmark::DoNotOptimize(lu.matrixLU().data());`
			`}`

			`double flops = 2.0 / 3.0 * static_cast<double>(n) * static_cast<double>(n) * static_cast<double>(n);`
			`state.counters["GFLOPS"] =`
			`benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);`
			`state.counters["n"] = n;`
			`}`

			`// --------------------------------------------------------------------------`
			`// Registration`
			`// --------------------------------------------------------------------------`

			`// clang-format off`
			`BENCHMARK(BM_GpuLLT_Compute_Host)->ArgsProduct({{64, 128, 256, 512, 1024, 2048, 4096}})->Unit(benchmark::kMicrosecond);`
			`BENCHMARK(BM_GpuLLT_Compute_Device)->ArgsProduct({{64, 128, 256, 512, 1024, 2048, 4096}})->Unit(benchmark::kMicrosecond);`
			`BENCHMARK(BM_GpuLLT_Compute_DeviceMove)->ArgsProduct({{64, 128, 256, 512, 1024, 2048, 4096}})->Unit(benchmark::kMicrosecond);`
			`BENCHMARK(BM_GpuLLT_Solve_Host)->ArgsProduct({{64, 256, 1024, 4096}, {1, 16}})->Unit(benchmark::kMicrosecond);`
			`BENCHMARK(BM_GpuLLT_Solve_Device)->ArgsProduct({{64, 256, 1024, 4096}, {1, 16}})->Unit(benchmark::kMicrosecond);`
			`BENCHMARK(BM_GpuLLT_Solve_DeviceOnly)->ArgsProduct({{64, 256, 1024, 4096}, {1, 16}})->Unit(benchmark::kMicrosecond);`

			`BENCHMARK(BM_GpuLU_Compute_Host)->ArgsProduct({{64, 128, 256, 512, 1024, 2048, 4096}})->Unit(benchmark::kMicrosecond);`
			`BENCHMARK(BM_GpuLU_Compute_Device)->ArgsProduct({{64, 128, 256, 512, 1024, 2048, 4096}})->Unit(benchmark::kMicrosecond);`
			`BENCHMARK(BM_GpuLU_Solve_Host)->ArgsProduct({{64, 256, 1024, 4096}, {1, 16}})->Unit(benchmark::kMicrosecond);`
			`BENCHMARK(BM_GpuLU_Solve_Device)->ArgsProduct({{64, 256, 1024, 4096}, {1, 16}})->Unit(benchmark::kMicrosecond);`

			`BENCHMARK(BM_CpuLLT_Compute)->ArgsProduct({{64, 128, 256, 512, 1024, 2048, 4096}})->Unit(benchmark::kMicrosecond);`
			`BENCHMARK(BM_CpuLU_Compute)->ArgsProduct({{64, 128, 256, 512, 1024, 2048, 4096}})->Unit(benchmark::kMicrosecond);`
			`// clang-format on`