Files
eigen/test/gpu_cublas.cpp
Rasmus Munk Larsen 8593c7f5a1 GPU: Add dense cuSOLVER solvers (QR, SVD, EigenSolver)
Add QR (geqrf + ormqr + trsm), SVD (gesvd), and self-adjoint eigenvalue
decomposition (syevd) via cuSOLVER. All support host and DeviceMatrix input.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-09 19:11:34 -07:00

757 lines
23 KiB
C++
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// Tests for cuBLAS GEMM dispatch via DeviceMatrix expression syntax.
// Covers: d_C = d_A * d_B, adjoint, transpose, scaled, +=, .device(ctx).
#define EIGEN_USE_GPU
#include "main.h"
#include <Eigen/GPU>
using namespace Eigen;
// Unit roundoff for GPU GEMM compute precision.
// TF32 (opt-in via EIGEN_CUDA_TF32) has eps ~ 2^{-10}.
template <typename Scalar>
typename NumTraits<Scalar>::Real gpu_unit_roundoff() {
#if defined(EIGEN_CUDA_TF32) && !defined(EIGEN_NO_CUDA_TENSOR_OPS)
using RealScalar = typename NumTraits<Scalar>::Real;
if (std::is_same<RealScalar, float>::value) return RealScalar(9.8e-4);
#endif
return NumTraits<Scalar>::epsilon();
}
// Higham-Mary probabilistic error bound for GEMM:
// ||C - fl(C)||_F <= lambda * sqrt(k) * u * ||A||_F * ||B||_F
// where k is the inner dimension, u is the unit roundoff, and
// lambda = sqrt(2 * ln(2/delta)) with delta = failure probability.
// lambda = 5 corresponds to delta ~ 10^{-6}.
// Reference: Higham & Mary, "Probabilistic Error Analysis for Inner Products",
// SIAM J. Matrix Anal. Appl., 2019.
template <typename Scalar>
typename NumTraits<Scalar>::Real gemm_error_bound(Index k, typename NumTraits<Scalar>::Real normA,
typename NumTraits<Scalar>::Real normB) {
using RealScalar = typename NumTraits<Scalar>::Real;
constexpr RealScalar lambda = 5;
return lambda * std::sqrt(static_cast<RealScalar>(k)) * gpu_unit_roundoff<Scalar>() * normA * normB;
}
// ---- Basic GEMM: C = A * B -------------------------------------------------
template <typename Scalar>
void test_gemm_basic(Index m, Index n, Index k) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = Mat::Random(m, k);
Mat B = Mat::Random(k, n);
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
// Expression: d_C = d_A * d_B
DeviceMatrix<Scalar> d_C;
d_C = d_A * d_B;
Mat C = d_C.toHost();
Mat C_ref = A * B;
RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
VERIFY((C - C_ref).norm() < tol);
}
// ---- GEMM with adjoint: C = A^H * B ----------------------------------------
template <typename Scalar>
void test_gemm_adjoint_lhs(Index m, Index n, Index k) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = Mat::Random(k, m); // A is k×m, A^H is m×k
Mat B = Mat::Random(k, n);
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
DeviceMatrix<Scalar> d_C;
d_C = d_A.adjoint() * d_B;
Mat C = d_C.toHost();
Mat C_ref = A.adjoint() * B;
RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
VERIFY((C - C_ref).norm() < tol);
}
// ---- GEMM with transpose: C = A * B^T --------------------------------------
template <typename Scalar>
void test_gemm_transpose_rhs(Index m, Index n, Index k) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = Mat::Random(m, k);
Mat B = Mat::Random(n, k); // B is n×k, B^T is k×n
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
DeviceMatrix<Scalar> d_C;
d_C = d_A * d_B.transpose();
Mat C = d_C.toHost();
Mat C_ref = A * B.transpose();
RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
VERIFY((C - C_ref).norm() < tol);
}
// ---- GEMM with scaled: C = alpha * A * B ------------------------------------
template <typename Scalar>
void test_gemm_scaled(Index m, Index n, Index k) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = Mat::Random(m, k);
Mat B = Mat::Random(k, n);
Scalar alpha = Scalar(2.5);
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
DeviceMatrix<Scalar> d_C;
d_C = alpha * d_A * d_B;
Mat C = d_C.toHost();
Mat C_ref = alpha * A * B;
RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
VERIFY((C - C_ref).norm() < tol);
}
// ---- GEMM accumulate: C += A * B (beta=1) -----------------------------------
template <typename Scalar>
void test_gemm_accumulate(Index m, Index n, Index k) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = Mat::Random(m, k);
Mat B = Mat::Random(k, n);
Mat C_init = Mat::Random(m, n);
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
auto d_C = DeviceMatrix<Scalar>::fromHost(C_init);
d_C += d_A * d_B;
Mat C = d_C.toHost();
Mat C_ref = C_init + A * B;
RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
VERIFY((C - C_ref).norm() < tol);
}
// ---- GEMM accumulate into empty destination ---------------------------------
template <typename Scalar>
void test_gemm_accumulate_empty(Index m, Index n, Index k) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = Mat::Random(m, k);
Mat B = Mat::Random(k, n);
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
DeviceMatrix<Scalar> d_C;
d_C += d_A * d_B;
Mat C = d_C.toHost();
Mat C_ref = A * B;
RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
VERIFY((C - C_ref).norm() < tol);
}
// ---- GEMM subtract: C -= A * B (beta=1, alpha=-1) --------------------------
template <typename Scalar>
void test_gemm_subtract(Index m, Index n, Index k) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = Mat::Random(m, k);
Mat B = Mat::Random(k, n);
Mat C_init = Mat::Random(m, n);
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
auto d_C = DeviceMatrix<Scalar>::fromHost(C_init);
GpuContext ctx;
d_C.device(ctx) -= d_A * d_B;
Mat C = d_C.toHost();
Mat C_ref = C_init - A * B;
RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
VERIFY((C - C_ref).norm() < tol);
}
// ---- GEMM subtract from empty destination -----------------------------------
template <typename Scalar>
void test_gemm_subtract_empty(Index m, Index n, Index k) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = Mat::Random(m, k);
Mat B = Mat::Random(k, n);
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
GpuContext ctx;
DeviceMatrix<Scalar> d_C;
d_C.device(ctx) -= d_A * d_B;
Mat C = d_C.toHost();
Mat C_ref = -(A * B);
RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
VERIFY((C - C_ref).norm() < tol);
}
// ---- GEMM with scaled RHS: C = A * (alpha * B) -----------------------------
template <typename Scalar>
void test_gemm_scaled_rhs(Index m, Index n, Index k) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = Mat::Random(m, k);
Mat B = Mat::Random(k, n);
Scalar alpha = Scalar(3.0);
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
DeviceMatrix<Scalar> d_C;
d_C = d_A * (alpha * d_B);
Mat C = d_C.toHost();
Mat C_ref = A * (alpha * B);
RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
VERIFY((C - C_ref).norm() < tol);
}
// ---- GEMM dimension mismatch must assert ------------------------------------
template <typename Scalar>
void test_gemm_dimension_mismatch() {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
Mat A = Mat::Random(8, 5);
Mat B = Mat::Random(6, 7); // inner dimension mismatch
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
DeviceMatrix<Scalar> d_C;
VERIFY_RAISES_ASSERT(d_C = d_A * d_B);
}
// ---- GEMM with explicit GpuContext ------------------------------------------
template <typename Scalar>
void test_gemm_explicit_context(Index m, Index n, Index k) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = Mat::Random(m, k);
Mat B = Mat::Random(k, n);
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
GpuContext ctx;
DeviceMatrix<Scalar> d_C;
d_C.device(ctx) = d_A * d_B;
Mat C = d_C.toHost();
Mat C_ref = A * B;
RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
VERIFY((C - C_ref).norm() < tol);
}
// ---- GEMM cross-context reuse of the same destination -----------------------
template <typename Scalar>
void test_gemm_cross_context_reuse(Index n) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = Mat::Random(n, n);
Mat B = Mat::Random(n, n);
Mat D = Mat::Random(n, n);
Mat E = Mat::Random(n, n);
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
auto d_D = DeviceMatrix<Scalar>::fromHost(D);
auto d_E = DeviceMatrix<Scalar>::fromHost(E);
GpuContext ctx1;
GpuContext ctx2;
DeviceMatrix<Scalar> d_C;
d_C.device(ctx1) = d_A * d_B;
d_C.device(ctx2) += d_D * d_E;
Mat C = d_C.toHost();
Mat C_ref = A * B + D * E;
RealScalar tol = gemm_error_bound<Scalar>(n, A.norm(), B.norm()) + gemm_error_bound<Scalar>(n, D.norm(), E.norm());
VERIFY((C - C_ref).norm() < tol);
}
// ---- GEMM cross-context resize of the destination ---------------------------
template <typename Scalar>
void test_gemm_cross_context_resize() {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = Mat::Random(64, 64);
Mat B = Mat::Random(64, 64);
Mat D = Mat::Random(32, 16);
Mat E = Mat::Random(16, 8);
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
auto d_D = DeviceMatrix<Scalar>::fromHost(D);
auto d_E = DeviceMatrix<Scalar>::fromHost(E);
GpuContext ctx1;
GpuContext ctx2;
DeviceMatrix<Scalar> d_C;
d_C.device(ctx1) = d_A * d_B;
d_C.device(ctx2) = d_D * d_E;
Mat C = d_C.toHost();
Mat C_ref = D * E;
RealScalar tol = gemm_error_bound<Scalar>(16, D.norm(), E.norm());
VERIFY((C - C_ref).norm() < tol);
}
// ---- GEMM chaining: C = (A * B) then D = C * E -----------------------------
template <typename Scalar>
void test_gemm_chain(Index n) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = Mat::Random(n, n);
Mat B = Mat::Random(n, n);
Mat E = Mat::Random(n, n);
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
auto d_E = DeviceMatrix<Scalar>::fromHost(E);
DeviceMatrix<Scalar> d_C;
d_C = d_A * d_B;
DeviceMatrix<Scalar> d_D;
d_D = d_C * d_E;
Mat D = d_D.toHost();
Mat D_ref = (A * B) * E;
Mat C_ref = A * B;
RealScalar tol =
gemm_error_bound<Scalar>(n, A.norm(), B.norm()) * E.norm() + gemm_error_bound<Scalar>(n, C_ref.norm(), E.norm());
VERIFY((D - D_ref).norm() < tol);
}
// ---- Square identity check: A * I = A ---------------------------------------
template <typename Scalar>
void test_gemm_identity(Index n) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
Mat A = Mat::Random(n, n);
Mat eye = Mat::Identity(n, n);
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
auto d_I = DeviceMatrix<Scalar>::fromHost(eye);
DeviceMatrix<Scalar> d_C;
d_C = d_A * d_I;
Mat C = d_C.toHost();
VERIFY_IS_APPROX(C, A);
}
// ---- LLT solve expression: d_X = d_A.llt().solve(d_B) ----------------------
template <typename MatrixType>
MatrixType make_spd(Index n) {
using Scalar = typename MatrixType::Scalar;
MatrixType M = MatrixType::Random(n, n);
return M.adjoint() * M + MatrixType::Identity(n, n) * static_cast<Scalar>(n);
}
template <typename Scalar>
void test_llt_solve_expr(Index n, Index nrhs) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = make_spd<Mat>(n);
Mat B = Mat::Random(n, nrhs);
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
DeviceMatrix<Scalar> d_X;
d_X = d_A.llt().solve(d_B);
Mat X = d_X.toHost();
RealScalar residual = (A * X - B).norm() / B.norm();
VERIFY(residual < RealScalar(n) * gpu_unit_roundoff<Scalar>());
}
// ---- LLT solve with explicit context ----------------------------------------
template <typename Scalar>
void test_llt_solve_expr_context(Index n, Index nrhs) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = make_spd<Mat>(n);
Mat B = Mat::Random(n, nrhs);
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
GpuContext ctx;
DeviceMatrix<Scalar> d_X;
d_X.device(ctx) = d_A.llt().solve(d_B);
Mat X = d_X.toHost();
RealScalar residual = (A * X - B).norm() / B.norm();
VERIFY(residual < RealScalar(n) * gpu_unit_roundoff<Scalar>());
}
// ---- LU solve expression: d_X = d_A.lu().solve(d_B) ------------------------
template <typename Scalar>
void test_lu_solve_expr(Index n, Index nrhs) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = Mat::Random(n, n);
Mat B = Mat::Random(n, nrhs);
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
DeviceMatrix<Scalar> d_X;
d_X = d_A.lu().solve(d_B);
Mat X = d_X.toHost();
RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm());
VERIFY(residual < RealScalar(10) * RealScalar(n) * gpu_unit_roundoff<Scalar>());
}
// ---- GEMM + solver chain: C = A * B, X = C.llt().solve(D) ------------------
template <typename Scalar>
void test_gemm_then_solve(Index n) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = Mat::Random(n, n);
Mat D = Mat::Random(n, 1);
// Make SPD: C = A^H * A + n*I
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
DeviceMatrix<Scalar> d_C;
d_C = d_A.adjoint() * d_A;
// Add n*I on host (no element-wise ops on DeviceMatrix yet).
Mat C = d_C.toHost();
C += Mat::Identity(n, n) * static_cast<Scalar>(n);
d_C = DeviceMatrix<Scalar>::fromHost(C);
auto d_D = DeviceMatrix<Scalar>::fromHost(D);
DeviceMatrix<Scalar> d_X;
d_X = d_C.llt().solve(d_D);
Mat X = d_X.toHost();
RealScalar residual = (C * X - D).norm() / D.norm();
VERIFY(residual < RealScalar(n) * gpu_unit_roundoff<Scalar>());
}
// ---- LLT solve with Upper triangle -----------------------------------------
template <typename Scalar>
void test_llt_solve_upper(Index n, Index nrhs) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = make_spd<Mat>(n);
Mat B = Mat::Random(n, nrhs);
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
DeviceMatrix<Scalar> d_X;
d_X = d_A.template llt<Upper>().solve(d_B);
Mat X = d_X.toHost();
RealScalar residual = (A * X - B).norm() / B.norm();
VERIFY(residual < RealScalar(n) * gpu_unit_roundoff<Scalar>());
}
// ---- LU solve with explicit context -----------------------------------------
template <typename Scalar>
void test_lu_solve_expr_context(Index n, Index nrhs) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = Mat::Random(n, n);
Mat B = Mat::Random(n, nrhs);
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
GpuContext ctx;
DeviceMatrix<Scalar> d_X;
d_X.device(ctx) = d_A.lu().solve(d_B);
Mat X = d_X.toHost();
RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm());
VERIFY(residual < RealScalar(10) * RealScalar(n) * gpu_unit_roundoff<Scalar>());
}
// ---- Zero-nrhs solver expressions ------------------------------------------
template <typename Scalar>
void test_llt_solve_zero_nrhs(Index n) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
Mat A = make_spd<Mat>(n);
Mat B = Mat::Random(n, 0);
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
DeviceMatrix<Scalar> d_X;
d_X = d_A.llt().solve(d_B);
VERIFY_IS_EQUAL(d_X.rows(), n);
VERIFY_IS_EQUAL(d_X.cols(), 0);
}
template <typename Scalar>
void test_lu_solve_zero_nrhs(Index n) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
Mat A = Mat::Random(n, n);
Mat B = Mat::Random(n, 0);
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
DeviceMatrix<Scalar> d_X;
d_X = d_A.lu().solve(d_B);
VERIFY_IS_EQUAL(d_X.rows(), n);
VERIFY_IS_EQUAL(d_X.cols(), 0);
}
// ---- TRSM: triangularView<UpLo>().solve(B) ----------------------------------
template <typename Scalar, int UpLo>
void test_trsm(Index n, Index nrhs) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
// Build a well-conditioned triangular matrix.
Mat A = Mat::Random(n, n);
A.diagonal().array() += static_cast<Scalar>(n); // ensure non-singular
if (UpLo == Lower)
A = A.template triangularView<Lower>();
else
A = A.template triangularView<Upper>();
Mat B = Mat::Random(n, nrhs);
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
DeviceMatrix<Scalar> d_X;
d_X = d_A.template triangularView<UpLo>().solve(d_B);
Mat X = d_X.toHost();
RealScalar residual = (A * X - B).norm() / B.norm();
VERIFY(residual < RealScalar(n) * gpu_unit_roundoff<Scalar>());
}
// ---- SYMM/HEMM: selfadjointView<UpLo>() * B --------------------------------
template <typename Scalar, int UpLo>
void test_symm(Index n, Index nrhs) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = make_spd<Mat>(n); // SPD is also self-adjoint
Mat B = Mat::Random(n, nrhs);
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
DeviceMatrix<Scalar> d_C;
d_C = d_A.template selfadjointView<UpLo>() * d_B;
Mat C = d_C.toHost();
Mat C_ref = A * B; // A is symmetric, so full multiply == symm
RealScalar tol = gemm_error_bound<Scalar>(n, A.norm(), B.norm());
VERIFY((C - C_ref).norm() < tol);
}
// ---- SYRK/HERK: rankUpdate(A) → C = A * A^H --------------------------------
template <typename Scalar>
void test_syrk(Index n, Index k) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = Mat::Random(n, k);
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
DeviceMatrix<Scalar> d_C;
d_C.template selfadjointView<Lower>().rankUpdate(d_A);
Mat C = d_C.toHost();
// Only lower triangle is meaningful for SYRK. Compare lower triangle.
Mat C_ref = A * A.adjoint();
// Extract lower triangle for comparison.
Mat C_lower = C.template triangularView<Lower>();
Mat C_ref_lower = C_ref.template triangularView<Lower>();
RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), A.norm());
VERIFY((C_lower - C_ref_lower).norm() < tol);
}
// ---- Per-scalar driver ------------------------------------------------------
template <typename Scalar>
void test_scalar() {
CALL_SUBTEST(test_gemm_basic<Scalar>(64, 64, 64));
CALL_SUBTEST(test_gemm_basic<Scalar>(128, 64, 32));
CALL_SUBTEST(test_gemm_basic<Scalar>(1, 1, 1));
CALL_SUBTEST(test_gemm_basic<Scalar>(256, 256, 256));
CALL_SUBTEST(test_gemm_adjoint_lhs<Scalar>(64, 64, 64));
CALL_SUBTEST(test_gemm_adjoint_lhs<Scalar>(128, 32, 64));
CALL_SUBTEST(test_gemm_transpose_rhs<Scalar>(64, 64, 64));
CALL_SUBTEST(test_gemm_transpose_rhs<Scalar>(128, 32, 64));
CALL_SUBTEST(test_gemm_scaled<Scalar>(64, 64, 64));
CALL_SUBTEST(test_gemm_scaled_rhs<Scalar>(64, 64, 64));
CALL_SUBTEST(test_gemm_accumulate<Scalar>(64, 64, 64));
CALL_SUBTEST(test_gemm_accumulate_empty<Scalar>(64, 64, 64));
CALL_SUBTEST(test_gemm_subtract<Scalar>(64, 64, 64));
CALL_SUBTEST(test_gemm_subtract_empty<Scalar>(64, 64, 64));
CALL_SUBTEST(test_gemm_dimension_mismatch<Scalar>());
CALL_SUBTEST(test_gemm_explicit_context<Scalar>(64, 64, 64));
CALL_SUBTEST(test_gemm_cross_context_reuse<Scalar>(64));
CALL_SUBTEST(test_gemm_cross_context_resize<Scalar>());
CALL_SUBTEST(test_gemm_chain<Scalar>(64));
CALL_SUBTEST(test_gemm_identity<Scalar>(64));
// Solver expressions — zero-size edge cases (use dedicated tests, not residual-based)
// Solver expressions
CALL_SUBTEST(test_llt_solve_expr<Scalar>(64, 1));
CALL_SUBTEST(test_llt_solve_expr<Scalar>(64, 4));
CALL_SUBTEST(test_llt_solve_expr<Scalar>(256, 8));
CALL_SUBTEST(test_llt_solve_expr_context<Scalar>(64, 4));
CALL_SUBTEST(test_llt_solve_upper<Scalar>(64, 4));
CALL_SUBTEST(test_lu_solve_expr<Scalar>(64, 1));
CALL_SUBTEST(test_lu_solve_expr<Scalar>(64, 4));
CALL_SUBTEST(test_lu_solve_expr<Scalar>(256, 8));
CALL_SUBTEST(test_lu_solve_expr_context<Scalar>(64, 4));
CALL_SUBTEST(test_llt_solve_zero_nrhs<Scalar>(64));
CALL_SUBTEST(test_llt_solve_zero_nrhs<Scalar>(0));
CALL_SUBTEST(test_lu_solve_zero_nrhs<Scalar>(64));
CALL_SUBTEST(test_lu_solve_zero_nrhs<Scalar>(0));
CALL_SUBTEST(test_gemm_then_solve<Scalar>(64));
// TRSM
CALL_SUBTEST((test_trsm<Scalar, Lower>(64, 1)));
CALL_SUBTEST((test_trsm<Scalar, Lower>(64, 4)));
CALL_SUBTEST((test_trsm<Scalar, Upper>(64, 4)));
CALL_SUBTEST((test_trsm<Scalar, Lower>(256, 8)));
// SYMM/HEMM
CALL_SUBTEST((test_symm<Scalar, Lower>(64, 4)));
CALL_SUBTEST((test_symm<Scalar, Upper>(64, 4)));
CALL_SUBTEST((test_symm<Scalar, Lower>(128, 8)));
// SYRK/HERK
CALL_SUBTEST(test_syrk<Scalar>(64, 64));
CALL_SUBTEST(test_syrk<Scalar>(64, 32));
CALL_SUBTEST(test_syrk<Scalar>(128, 64));
}
// ---- Solver failure mode tests (not templated on Scalar) --------------------
void test_llt_not_spd() {
// Negative definite matrix — LLT factorization must fail.
MatrixXd A = -MatrixXd::Identity(8, 8);
MatrixXd B = MatrixXd::Random(8, 1);
auto d_A = DeviceMatrix<double>::fromHost(A);
auto d_B = DeviceMatrix<double>::fromHost(B);
DeviceMatrix<double> d_X;
VERIFY_RAISES_ASSERT(d_X = d_A.llt().solve(d_B));
}
void test_lu_singular() {
// Zero matrix — LU factorization must detect singularity.
MatrixXd A = MatrixXd::Zero(8, 8);
MatrixXd B = MatrixXd::Random(8, 1);
auto d_A = DeviceMatrix<double>::fromHost(A);
auto d_B = DeviceMatrix<double>::fromHost(B);
DeviceMatrix<double> d_X;
VERIFY_RAISES_ASSERT(d_X = d_A.lu().solve(d_B));
}
EIGEN_DECLARE_TEST(gpu_cublas) {
CALL_SUBTEST(test_scalar<float>());
CALL_SUBTEST(test_scalar<double>());
CALL_SUBTEST(test_scalar<std::complex<float>>());
CALL_SUBTEST(test_scalar<std::complex<double>>());
CALL_SUBTEST(test_llt_not_spd());
CALL_SUBTEST(test_lu_singular());
}