test/gpu_cusolver_lu.cpp

// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Eigen Authors
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

// Tests for GpuLU: GPU partial-pivoting LU decomposition via cuSOLVER.
// Covers cusolverDnXgetrf (factorization) and cusolverDnXgetrs (solve)
// for float, double, complex<float>, complex<double>.
//
#define EIGEN_USE_GPU
#include "main.h"
#include <Eigen/LU>
#include <Eigen/GPU>

using namespace Eigen;

// ---- Test factorization + NoTrans solve: residual ||A*X - B|| / ||B|| -------

template <typename Scalar>
void test_getrf(Index n) {
  using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
  using RealScalar = typename NumTraits<Scalar>::Real;

  MatrixType A = MatrixType::Random(n, n);
  MatrixType B = MatrixType::Random(n, 4);

  GpuLU<Scalar> lu(A);
  VERIFY_IS_EQUAL(lu.info(), Success);

  MatrixType X = lu.solve(B);
  // Backward error bound for LU: ||A*X - B|| <= O(n*u) * ||A|| * ||X||.
  // Normalize by ||A||*||X|| rather than ||B|| to be condition-number agnostic.
  RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm());
  VERIFY(residual < RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon());
}

// ---- Test solve: A^T*X = B and A^H*X = B ------------------------------------

template <typename Scalar>
void test_getrs_trans(Index n) {
  using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
  using RealScalar = typename NumTraits<Scalar>::Real;

  MatrixType A = MatrixType::Random(n, n);
  MatrixType B = MatrixType::Random(n, 3);
  RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();

  GpuLU<Scalar> lu(A);
  VERIFY_IS_EQUAL(lu.info(), Success);

  MatrixType Xt = lu.solve(B, GpuLU<Scalar>::Transpose);
  VERIFY((A.transpose() * Xt - B).norm() / (A.norm() * Xt.norm()) < tol);

  MatrixType Xc = lu.solve(B, GpuLU<Scalar>::ConjugateTranspose);
  VERIFY((A.adjoint() * Xc - B).norm() / (A.norm() * Xc.norm()) < tol);
}

// ---- Test multiple solves reuse the device-resident LU ----------------------

template <typename Scalar>
void test_multiple_solves(Index n) {
  using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
  using RealScalar = typename NumTraits<Scalar>::Real;

  MatrixType A = MatrixType::Random(n, n);
  GpuLU<Scalar> lu(A);
  VERIFY_IS_EQUAL(lu.info(), Success);

  RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
  for (int k = 0; k < 5; ++k) {
    MatrixType B = MatrixType::Random(n, 3);
    MatrixType X = lu.solve(B);
    VERIFY((A * X - B).norm() / (A.norm() * X.norm()) < tol);
  }
}

// ---- Agreement with CPU PartialPivLU ----------------------------------------

template <typename Scalar>
void test_vs_cpu(Index n) {
  using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
  using RealScalar = typename NumTraits<Scalar>::Real;

  MatrixType A = MatrixType::Random(n, n);
  MatrixType B = MatrixType::Random(n, 5);

  GpuLU<Scalar> gpu_lu(A);
  VERIFY_IS_EQUAL(gpu_lu.info(), Success);

  MatrixType X_gpu = gpu_lu.solve(B);
  MatrixType X_cpu = PartialPivLU<MatrixType>(A).solve(B);

  RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
  VERIFY((X_gpu - X_cpu).norm() / X_cpu.norm() < tol);
}

// ---- Singular matrix detection ----------------------------------------------

void test_singular() {
  MatrixXd A = MatrixXd::Zero(8, 8);
  GpuLU<double> lu(A);
  VERIFY_IS_EQUAL(lu.info(), NumericalIssue);
}

// ---- DeviceMatrix integration tests -----------------------------------------

template <typename Scalar>
void test_device_matrix_solve(Index n) {
  using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
  using RealScalar = typename NumTraits<Scalar>::Real;

  MatrixType A = MatrixType::Random(n, n);
  MatrixType B = MatrixType::Random(n, 4);

  auto d_A = DeviceMatrix<Scalar>::fromHost(A);
  auto d_B = DeviceMatrix<Scalar>::fromHost(B);

  GpuLU<Scalar> lu;
  lu.compute(d_A);
  VERIFY_IS_EQUAL(lu.info(), Success);

  DeviceMatrix<Scalar> d_X = lu.solve(d_B);
  MatrixType X = d_X.toHost();

  RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm());
  VERIFY(residual < RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon());
}

template <typename Scalar>
void test_device_matrix_move_compute(Index n) {
  using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
  using RealScalar = typename NumTraits<Scalar>::Real;

  MatrixType A = MatrixType::Random(n, n);
  MatrixType B = MatrixType::Random(n, 1);

  auto d_A = DeviceMatrix<Scalar>::fromHost(A);
  GpuLU<Scalar> lu;
  lu.compute(std::move(d_A));
  VERIFY_IS_EQUAL(lu.info(), Success);
  VERIFY(d_A.empty());

  MatrixType X = lu.solve(B);
  RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm());
  VERIFY(residual < RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon());
}

template <typename Scalar>
void test_chaining(Index n) {
  using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
  using RealScalar = typename NumTraits<Scalar>::Real;

  MatrixType A = MatrixType::Random(n, n);
  MatrixType B = MatrixType::Random(n, 3);

  auto d_A = DeviceMatrix<Scalar>::fromHost(A);
  auto d_B = DeviceMatrix<Scalar>::fromHost(B);

  GpuLU<Scalar> lu;
  lu.compute(d_A);
  VERIFY_IS_EQUAL(lu.info(), Success);

  // Chain: solve → use result as RHS
  DeviceMatrix<Scalar> d_X = lu.solve(d_B);
  DeviceMatrix<Scalar> d_Y = lu.solve(d_X);
  MatrixType Y = d_Y.toHost();

  MatrixType X_ref = PartialPivLU<MatrixType>(A).solve(B);
  MatrixType Y_ref = PartialPivLU<MatrixType>(A).solve(X_ref);

  RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon() * Y_ref.norm();
  VERIFY((Y - Y_ref).norm() < tol);
}

// ---- Per-scalar driver -------------------------------------------------------

template <typename Scalar>
void test_scalar() {
  CALL_SUBTEST(test_getrf<Scalar>(1));
  CALL_SUBTEST(test_getrf<Scalar>(64));
  CALL_SUBTEST(test_getrf<Scalar>(256));

  CALL_SUBTEST(test_getrs_trans<Scalar>(64));
  CALL_SUBTEST(test_getrs_trans<Scalar>(128));

  CALL_SUBTEST(test_multiple_solves<Scalar>(128));

  CALL_SUBTEST(test_vs_cpu<Scalar>(64));
  CALL_SUBTEST(test_vs_cpu<Scalar>(256));

  CALL_SUBTEST(test_device_matrix_solve<Scalar>(64));
  CALL_SUBTEST(test_device_matrix_move_compute<Scalar>(64));
  CALL_SUBTEST(test_chaining<Scalar>(64));
}

EIGEN_DECLARE_TEST(gpu_cusolver_lu) {
  CALL_SUBTEST(test_scalar<float>());
  CALL_SUBTEST(test_scalar<double>());
  CALL_SUBTEST(test_scalar<std::complex<float>>());
  CALL_SUBTEST(test_scalar<std::complex<double>>());
  CALL_SUBTEST(test_singular());
}
GPU: Add library dispatch module (DeviceMatrix, cuBLAS, cuSOLVER) Add Eigen/GPU module: A standalone GPU library dispatch layer where DeviceMatrix<Scalar> operations map 1:1 to cuBLAS/cuSOLVER calls. CPU and GPU solvers coexist in the same binary with compatible syntax. Core infrastructure: - DeviceMatrix<Scalar>: RAII dense column-major GPU memory wrapper with async host transfer (fromHost/toHost) and CUDA event-based cross-stream synchronization. - GpuContext: Unified execution context owning a CUDA stream + cuBLAS handle + cuSOLVER handle. Thread-local default with explicit override via setThreadLocal(). Stream-borrowing constructor for integration. - DeviceBuffer: Typed RAII device allocation with move semantics. cuBLAS dispatch (expression syntax): - GEMM: d_C = d_A.adjoint() * d_B (cublasXgemm) - TRSM: d_X = d_A.triangularView<Lower>().solve(d_B) (cublasXtrsm) - SYMM/HEMM: d_C = d_A.selfadjointView<Lower>() * d_B (cublasXsymm) - SYRK/HERK: d_C = d_A * d_A.adjoint() (cublasXsyrk) cuSOLVER dispatch: - GpuLLT: Cached Cholesky factorization (cusolverDnXpotrf + Xpotrs) - GpuLU: Cached LU factorization (cusolverDnXgetrf + Xgetrs) - Solver chaining: auto x = d_A.llt().solve(d_B) - Solver expressions with .device(ctx) for explicit stream control. CI: Bump CUDA container to Ubuntu 22.04 (CMake 3.22), GCC 10->11, Clang 12->14. Bump cmake_minimum_required to 3.17 for FindCUDAToolkit. Tests: gpu_cublas.cpp, gpu_cusolver_llt.cpp, gpu_cusolver_lu.cpp, gpu_device_matrix.cpp, gpu_library_example.cu Benchmarks: bench_gpu_solvers.cpp, bench_gpu_chaining.cpp, bench_gpu_batching.cpp 2026-04-09 16:15:39 -07:00			`// This file is part of Eigen, a lightweight C++ template library`
			`// for linear algebra.`
			`//`
			`// Copyright (C) 2026 Eigen Authors`
			`//`
			`// This Source Code Form is subject to the terms of the Mozilla`
			`// Public License v. 2.0. If a copy of the MPL was not distributed`
			`// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.`

			`// Tests for GpuLU: GPU partial-pivoting LU decomposition via cuSOLVER.`
			`// Covers cusolverDnXgetrf (factorization) and cusolverDnXgetrs (solve)`
			`// for float, double, complex<float>, complex<double>.`
			`//`
			`#define EIGEN_USE_GPU`
			`#include "main.h"`
			`#include <Eigen/LU>`
			`#include <Eigen/GPU>`

			`using namespace Eigen;`

			`// ---- Test factorization + NoTrans solve: residual \|\|A*X - B\|\| / \|\|B\|\| -------`

			`template <typename Scalar>`
			`void test_getrf(Index n) {`
			`using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;`
			`using RealScalar = typename NumTraits<Scalar>::Real;`

			`MatrixType A = MatrixType::Random(n, n);`
			`MatrixType B = MatrixType::Random(n, 4);`

			`GpuLU<Scalar> lu(A);`
			`VERIFY_IS_EQUAL(lu.info(), Success);`

			`MatrixType X = lu.solve(B);`
			`// Backward error bound for LU: \|\|AX - B\|\| <= O(nu) * \|\|A\|\| * \|\|X\|\|.`
			`// Normalize by \|\|A\|\|*\|\|X\|\| rather than \|\|B\|\| to be condition-number agnostic.`
			`RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm());`
			`VERIFY(residual < RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon());`
			`}`

			`// ---- Test solve: A^TX = B and A^HX = B ------------------------------------`

			`template <typename Scalar>`
			`void test_getrs_trans(Index n) {`
			`using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;`
			`using RealScalar = typename NumTraits<Scalar>::Real;`

			`MatrixType A = MatrixType::Random(n, n);`
			`MatrixType B = MatrixType::Random(n, 3);`
			`RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();`

			`GpuLU<Scalar> lu(A);`
			`VERIFY_IS_EQUAL(lu.info(), Success);`

			`MatrixType Xt = lu.solve(B, GpuLU<Scalar>::Transpose);`
			`VERIFY((A.transpose() * Xt - B).norm() / (A.norm() * Xt.norm()) < tol);`

			`MatrixType Xc = lu.solve(B, GpuLU<Scalar>::ConjugateTranspose);`
			`VERIFY((A.adjoint() * Xc - B).norm() / (A.norm() * Xc.norm()) < tol);`
			`}`

			`// ---- Test multiple solves reuse the device-resident LU ----------------------`

			`template <typename Scalar>`
			`void test_multiple_solves(Index n) {`
			`using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;`
			`using RealScalar = typename NumTraits<Scalar>::Real;`

			`MatrixType A = MatrixType::Random(n, n);`
			`GpuLU<Scalar> lu(A);`
			`VERIFY_IS_EQUAL(lu.info(), Success);`

			`RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();`
			`for (int k = 0; k < 5; ++k) {`
			`MatrixType B = MatrixType::Random(n, 3);`
			`MatrixType X = lu.solve(B);`
			`VERIFY((A * X - B).norm() / (A.norm() * X.norm()) < tol);`
			`}`
			`}`

			`// ---- Agreement with CPU PartialPivLU ----------------------------------------`

			`template <typename Scalar>`
			`void test_vs_cpu(Index n) {`
			`using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;`
			`using RealScalar = typename NumTraits<Scalar>::Real;`

			`MatrixType A = MatrixType::Random(n, n);`
			`MatrixType B = MatrixType::Random(n, 5);`

			`GpuLU<Scalar> gpu_lu(A);`
			`VERIFY_IS_EQUAL(gpu_lu.info(), Success);`

			`MatrixType X_gpu = gpu_lu.solve(B);`
			`MatrixType X_cpu = PartialPivLU<MatrixType>(A).solve(B);`

			`RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();`
			`VERIFY((X_gpu - X_cpu).norm() / X_cpu.norm() < tol);`
			`}`

			`// ---- Singular matrix detection ----------------------------------------------`

			`void test_singular() {`
			`MatrixXd A = MatrixXd::Zero(8, 8);`
			`GpuLU<double> lu(A);`
			`VERIFY_IS_EQUAL(lu.info(), NumericalIssue);`
			`}`

			`// ---- DeviceMatrix integration tests -----------------------------------------`

			`template <typename Scalar>`
			`void test_device_matrix_solve(Index n) {`
			`using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;`
			`using RealScalar = typename NumTraits<Scalar>::Real;`

			`MatrixType A = MatrixType::Random(n, n);`
			`MatrixType B = MatrixType::Random(n, 4);`

			`auto d_A = DeviceMatrix<Scalar>::fromHost(A);`
			`auto d_B = DeviceMatrix<Scalar>::fromHost(B);`

			`GpuLU<Scalar> lu;`
			`lu.compute(d_A);`
			`VERIFY_IS_EQUAL(lu.info(), Success);`

			`DeviceMatrix<Scalar> d_X = lu.solve(d_B);`
			`MatrixType X = d_X.toHost();`

			`RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm());`
			`VERIFY(residual < RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon());`
			`}`

			`template <typename Scalar>`
			`void test_device_matrix_move_compute(Index n) {`
			`using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;`
			`using RealScalar = typename NumTraits<Scalar>::Real;`

			`MatrixType A = MatrixType::Random(n, n);`
			`MatrixType B = MatrixType::Random(n, 1);`

			`auto d_A = DeviceMatrix<Scalar>::fromHost(A);`
			`GpuLU<Scalar> lu;`
			`lu.compute(std::move(d_A));`
			`VERIFY_IS_EQUAL(lu.info(), Success);`
			`VERIFY(d_A.empty());`

			`MatrixType X = lu.solve(B);`
			`RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm());`
			`VERIFY(residual < RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon());`
			`}`

			`template <typename Scalar>`
			`void test_chaining(Index n) {`
			`using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;`
			`using RealScalar = typename NumTraits<Scalar>::Real;`

			`MatrixType A = MatrixType::Random(n, n);`
			`MatrixType B = MatrixType::Random(n, 3);`

			`auto d_A = DeviceMatrix<Scalar>::fromHost(A);`
			`auto d_B = DeviceMatrix<Scalar>::fromHost(B);`

			`GpuLU<Scalar> lu;`
			`lu.compute(d_A);`
			`VERIFY_IS_EQUAL(lu.info(), Success);`

			`// Chain: solve → use result as RHS`
			`DeviceMatrix<Scalar> d_X = lu.solve(d_B);`
			`DeviceMatrix<Scalar> d_Y = lu.solve(d_X);`
			`MatrixType Y = d_Y.toHost();`

			`MatrixType X_ref = PartialPivLU<MatrixType>(A).solve(B);`
			`MatrixType Y_ref = PartialPivLU<MatrixType>(A).solve(X_ref);`

			`RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon() * Y_ref.norm();`
			`VERIFY((Y - Y_ref).norm() < tol);`
			`}`

			`// ---- Per-scalar driver -------------------------------------------------------`

			`template <typename Scalar>`
			`void test_scalar() {`
			`CALL_SUBTEST(test_getrf<Scalar>(1));`
			`CALL_SUBTEST(test_getrf<Scalar>(64));`
			`CALL_SUBTEST(test_getrf<Scalar>(256));`

			`CALL_SUBTEST(test_getrs_trans<Scalar>(64));`
			`CALL_SUBTEST(test_getrs_trans<Scalar>(128));`

			`CALL_SUBTEST(test_multiple_solves<Scalar>(128));`

			`CALL_SUBTEST(test_vs_cpu<Scalar>(64));`
			`CALL_SUBTEST(test_vs_cpu<Scalar>(256));`

			`CALL_SUBTEST(test_device_matrix_solve<Scalar>(64));`
			`CALL_SUBTEST(test_device_matrix_move_compute<Scalar>(64));`
			`CALL_SUBTEST(test_chaining<Scalar>(64));`
			`}`

			`EIGEN_DECLARE_TEST(gpu_cusolver_lu) {`
			`CALL_SUBTEST(test_scalar<float>());`
			`CALL_SUBTEST(test_scalar<double>());`
			`CALL_SUBTEST(test_scalar<std::complex<float>>());`
			`CALL_SUBTEST(test_scalar<std::complex<double>>());`
			`CALL_SUBTEST(test_singular());`
			`}`