// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

// Tests for cuBLAS GEMM dispatch via DeviceMatrix expression syntax.
// Covers: d_C = d_A * d_B, adjoint, transpose, scaled, +=, .device(ctx).

#define EIGEN_USE_GPU
#include "main.h"
#include <Eigen/GPU>

using namespace Eigen;

// Unit roundoff for GPU GEMM compute precision.
|
|
|
|
|
|
// TF32 (opt-in via EIGEN_CUDA_TF32) has eps ~ 2^{-10}.
|
|
|
|
|
|
template <typename Scalar>
|
|
|
|
|
|
typename NumTraits<Scalar>::Real gpu_unit_roundoff() {
|
|
|
|
|
|
#if defined(EIGEN_CUDA_TF32) && !defined(EIGEN_NO_CUDA_TENSOR_OPS)
|
|
|
|
|
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
|
|
|
|
|
if (std::is_same<RealScalar, float>::value) return RealScalar(9.8e-4);
|
|
|
|
|
|
#endif
|
|
|
|
|
|
return NumTraits<Scalar>::epsilon();
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// Higham-Mary probabilistic error bound for GEMM:
|
|
|
|
|
|
// ||C - fl(C)||_F <= lambda * sqrt(k) * u * ||A||_F * ||B||_F
|
|
|
|
|
|
// where k is the inner dimension, u is the unit roundoff, and
|
|
|
|
|
|
// lambda = sqrt(2 * ln(2/delta)) with delta = failure probability.
|
|
|
|
|
|
// lambda = 5 corresponds to delta ~ 10^{-6}.
|
|
|
|
|
|
// Reference: Higham & Mary, "Probabilistic Error Analysis for Inner Products",
|
|
|
|
|
|
// SIAM J. Matrix Anal. Appl., 2019.
|
|
|
|
|
|
template <typename Scalar>
|
|
|
|
|
|
typename NumTraits<Scalar>::Real gemm_error_bound(Index k, typename NumTraits<Scalar>::Real normA,
|
|
|
|
|
|
typename NumTraits<Scalar>::Real normB) {
|
|
|
|
|
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
|
|
|
|
|
constexpr RealScalar lambda = 5;
|
|
|
|
|
|
return lambda * std::sqrt(static_cast<RealScalar>(k)) * gpu_unit_roundoff<Scalar>() * normA * normB;
|
|
|
|
|
|
}
// ---- Basic GEMM: C = A * B -------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
template <typename Scalar>
|
|
|
|
|
|
void test_gemm_basic(Index m, Index n, Index k) {
|
|
|
|
|
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
|
|
|
|
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
|
|
|
|
|
|
|
|
|
|
|
Mat A = Mat::Random(m, k);
|
|
|
|
|
|
Mat B = Mat::Random(k, n);
|
|
|
|
|
|
|
|
|
|
|
|
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
|
|
|
|
|
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
|
|
|
|
|
|
|
|
|
|
|
// Expression: d_C = d_A * d_B
|
|
|
|
|
|
DeviceMatrix<Scalar> d_C;
|
|
|
|
|
|
d_C = d_A * d_B;
|
|
|
|
|
|
|
|
|
|
|
|
Mat C = d_C.toHost();
|
|
|
|
|
|
Mat C_ref = A * B;
|
|
|
|
|
|
|
2026-04-09 19:11:25 -07:00
|
|
|
|
RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
|
GPU: Add library dispatch module (DeviceMatrix, cuBLAS, cuSOLVER)
Add Eigen/GPU module: A standalone GPU library dispatch layer where
DeviceMatrix<Scalar> operations map 1:1 to cuBLAS/cuSOLVER calls.
CPU and GPU solvers coexist in the same binary with compatible syntax.
Core infrastructure:
- DeviceMatrix<Scalar>: RAII dense column-major GPU memory wrapper with
async host transfer (fromHost/toHost) and CUDA event-based cross-stream
synchronization.
- GpuContext: Unified execution context owning a CUDA stream + cuBLAS
handle + cuSOLVER handle. Thread-local default with explicit override
via setThreadLocal(). Stream-borrowing constructor for integration.
- DeviceBuffer: Typed RAII device allocation with move semantics.
cuBLAS dispatch (expression syntax):
- GEMM: d_C = d_A.adjoint() * d_B (cublasXgemm)
- TRSM: d_X = d_A.triangularView<Lower>().solve(d_B) (cublasXtrsm)
- SYMM/HEMM: d_C = d_A.selfadjointView<Lower>() * d_B (cublasXsymm)
- SYRK/HERK: d_C = d_A * d_A.adjoint() (cublasXsyrk)
cuSOLVER dispatch:
- GpuLLT: Cached Cholesky factorization (cusolverDnXpotrf + Xpotrs)
- GpuLU: Cached LU factorization (cusolverDnXgetrf + Xgetrs)
- Solver chaining: auto x = d_A.llt().solve(d_B)
- Solver expressions with .device(ctx) for explicit stream control.
CI: Bump CUDA container to Ubuntu 22.04 (CMake 3.22), GCC 10->11,
Clang 12->14. Bump cmake_minimum_required to 3.17 for FindCUDAToolkit.
Tests: gpu_cublas.cpp, gpu_cusolver_llt.cpp, gpu_cusolver_lu.cpp,
gpu_device_matrix.cpp, gpu_library_example.cu
Benchmarks: bench_gpu_solvers.cpp, bench_gpu_chaining.cpp,
bench_gpu_batching.cpp
2026-04-09 16:15:39 -07:00
|
|
|
|
VERIFY((C - C_ref).norm() < tol);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// ---- GEMM with adjoint: C = A^H * B ----------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
template <typename Scalar>
|
|
|
|
|
|
void test_gemm_adjoint_lhs(Index m, Index n, Index k) {
|
|
|
|
|
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
|
|
|
|
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
|
|
|
|
|
|
|
|
|
|
|
Mat A = Mat::Random(k, m); // A is k×m, A^H is m×k
|
|
|
|
|
|
Mat B = Mat::Random(k, n);
|
|
|
|
|
|
|
|
|
|
|
|
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
|
|
|
|
|
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
|
|
|
|
|
|
|
|
|
|
|
DeviceMatrix<Scalar> d_C;
|
|
|
|
|
|
d_C = d_A.adjoint() * d_B;
|
|
|
|
|
|
|
|
|
|
|
|
Mat C = d_C.toHost();
|
|
|
|
|
|
Mat C_ref = A.adjoint() * B;
|
|
|
|
|
|
|
2026-04-09 19:11:25 -07:00
|
|
|
|
RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
|
GPU: Add library dispatch module (DeviceMatrix, cuBLAS, cuSOLVER)
Add Eigen/GPU module: A standalone GPU library dispatch layer where
DeviceMatrix<Scalar> operations map 1:1 to cuBLAS/cuSOLVER calls.
CPU and GPU solvers coexist in the same binary with compatible syntax.
Core infrastructure:
- DeviceMatrix<Scalar>: RAII dense column-major GPU memory wrapper with
async host transfer (fromHost/toHost) and CUDA event-based cross-stream
synchronization.
- GpuContext: Unified execution context owning a CUDA stream + cuBLAS
handle + cuSOLVER handle. Thread-local default with explicit override
via setThreadLocal(). Stream-borrowing constructor for integration.
- DeviceBuffer: Typed RAII device allocation with move semantics.
cuBLAS dispatch (expression syntax):
- GEMM: d_C = d_A.adjoint() * d_B (cublasXgemm)
- TRSM: d_X = d_A.triangularView<Lower>().solve(d_B) (cublasXtrsm)
- SYMM/HEMM: d_C = d_A.selfadjointView<Lower>() * d_B (cublasXsymm)
- SYRK/HERK: d_C = d_A * d_A.adjoint() (cublasXsyrk)
cuSOLVER dispatch:
- GpuLLT: Cached Cholesky factorization (cusolverDnXpotrf + Xpotrs)
- GpuLU: Cached LU factorization (cusolverDnXgetrf + Xgetrs)
- Solver chaining: auto x = d_A.llt().solve(d_B)
- Solver expressions with .device(ctx) for explicit stream control.
CI: Bump CUDA container to Ubuntu 22.04 (CMake 3.22), GCC 10->11,
Clang 12->14. Bump cmake_minimum_required to 3.17 for FindCUDAToolkit.
Tests: gpu_cublas.cpp, gpu_cusolver_llt.cpp, gpu_cusolver_lu.cpp,
gpu_device_matrix.cpp, gpu_library_example.cu
Benchmarks: bench_gpu_solvers.cpp, bench_gpu_chaining.cpp,
bench_gpu_batching.cpp
2026-04-09 16:15:39 -07:00
|
|
|
|
VERIFY((C - C_ref).norm() < tol);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// ---- GEMM with transpose: C = A * B^T --------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
template <typename Scalar>
|
|
|
|
|
|
void test_gemm_transpose_rhs(Index m, Index n, Index k) {
|
|
|
|
|
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
|
|
|
|
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
|
|
|
|
|
|
|
|
|
|
|
Mat A = Mat::Random(m, k);
|
|
|
|
|
|
Mat B = Mat::Random(n, k); // B is n×k, B^T is k×n
|
|
|
|
|
|
|
|
|
|
|
|
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
|
|
|
|
|
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
|
|
|
|
|
|
|
|
|
|
|
DeviceMatrix<Scalar> d_C;
|
|
|
|
|
|
d_C = d_A * d_B.transpose();
|
|
|
|
|
|
|
|
|
|
|
|
Mat C = d_C.toHost();
|
|
|
|
|
|
Mat C_ref = A * B.transpose();
|
|
|
|
|
|
|
2026-04-09 19:11:25 -07:00
|
|
|
|
RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
|
GPU: Add library dispatch module (DeviceMatrix, cuBLAS, cuSOLVER)
Add Eigen/GPU module: A standalone GPU library dispatch layer where
DeviceMatrix<Scalar> operations map 1:1 to cuBLAS/cuSOLVER calls.
CPU and GPU solvers coexist in the same binary with compatible syntax.
Core infrastructure:
- DeviceMatrix<Scalar>: RAII dense column-major GPU memory wrapper with
async host transfer (fromHost/toHost) and CUDA event-based cross-stream
synchronization.
- GpuContext: Unified execution context owning a CUDA stream + cuBLAS
handle + cuSOLVER handle. Thread-local default with explicit override
via setThreadLocal(). Stream-borrowing constructor for integration.
- DeviceBuffer: Typed RAII device allocation with move semantics.
cuBLAS dispatch (expression syntax):
- GEMM: d_C = d_A.adjoint() * d_B (cublasXgemm)
- TRSM: d_X = d_A.triangularView<Lower>().solve(d_B) (cublasXtrsm)
- SYMM/HEMM: d_C = d_A.selfadjointView<Lower>() * d_B (cublasXsymm)
- SYRK/HERK: d_C = d_A * d_A.adjoint() (cublasXsyrk)
cuSOLVER dispatch:
- GpuLLT: Cached Cholesky factorization (cusolverDnXpotrf + Xpotrs)
- GpuLU: Cached LU factorization (cusolverDnXgetrf + Xgetrs)
- Solver chaining: auto x = d_A.llt().solve(d_B)
- Solver expressions with .device(ctx) for explicit stream control.
CI: Bump CUDA container to Ubuntu 22.04 (CMake 3.22), GCC 10->11,
Clang 12->14. Bump cmake_minimum_required to 3.17 for FindCUDAToolkit.
Tests: gpu_cublas.cpp, gpu_cusolver_llt.cpp, gpu_cusolver_lu.cpp,
gpu_device_matrix.cpp, gpu_library_example.cu
Benchmarks: bench_gpu_solvers.cpp, bench_gpu_chaining.cpp,
bench_gpu_batching.cpp
2026-04-09 16:15:39 -07:00
|
|
|
|
VERIFY((C - C_ref).norm() < tol);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// ---- GEMM with scaled: C = alpha * A * B ------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
template <typename Scalar>
|
|
|
|
|
|
void test_gemm_scaled(Index m, Index n, Index k) {
|
|
|
|
|
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
|
|
|
|
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
|
|
|
|
|
|
|
|
|
|
|
Mat A = Mat::Random(m, k);
|
|
|
|
|
|
Mat B = Mat::Random(k, n);
|
|
|
|
|
|
Scalar alpha = Scalar(2.5);
|
|
|
|
|
|
|
|
|
|
|
|
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
|
|
|
|
|
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
|
|
|
|
|
|
|
|
|
|
|
DeviceMatrix<Scalar> d_C;
|
|
|
|
|
|
d_C = alpha * d_A * d_B;
|
|
|
|
|
|
|
|
|
|
|
|
Mat C = d_C.toHost();
|
|
|
|
|
|
Mat C_ref = alpha * A * B;
|
|
|
|
|
|
|
2026-04-09 19:11:25 -07:00
|
|
|
|
RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
|
GPU: Add library dispatch module (DeviceMatrix, cuBLAS, cuSOLVER)
Add Eigen/GPU module: A standalone GPU library dispatch layer where
DeviceMatrix<Scalar> operations map 1:1 to cuBLAS/cuSOLVER calls.
CPU and GPU solvers coexist in the same binary with compatible syntax.
Core infrastructure:
- DeviceMatrix<Scalar>: RAII dense column-major GPU memory wrapper with
async host transfer (fromHost/toHost) and CUDA event-based cross-stream
synchronization.
- GpuContext: Unified execution context owning a CUDA stream + cuBLAS
handle + cuSOLVER handle. Thread-local default with explicit override
via setThreadLocal(). Stream-borrowing constructor for integration.
- DeviceBuffer: Typed RAII device allocation with move semantics.
cuBLAS dispatch (expression syntax):
- GEMM: d_C = d_A.adjoint() * d_B (cublasXgemm)
- TRSM: d_X = d_A.triangularView<Lower>().solve(d_B) (cublasXtrsm)
- SYMM/HEMM: d_C = d_A.selfadjointView<Lower>() * d_B (cublasXsymm)
- SYRK/HERK: d_C = d_A * d_A.adjoint() (cublasXsyrk)
cuSOLVER dispatch:
- GpuLLT: Cached Cholesky factorization (cusolverDnXpotrf + Xpotrs)
- GpuLU: Cached LU factorization (cusolverDnXgetrf + Xgetrs)
- Solver chaining: auto x = d_A.llt().solve(d_B)
- Solver expressions with .device(ctx) for explicit stream control.
CI: Bump CUDA container to Ubuntu 22.04 (CMake 3.22), GCC 10->11,
Clang 12->14. Bump cmake_minimum_required to 3.17 for FindCUDAToolkit.
Tests: gpu_cublas.cpp, gpu_cusolver_llt.cpp, gpu_cusolver_lu.cpp,
gpu_device_matrix.cpp, gpu_library_example.cu
Benchmarks: bench_gpu_solvers.cpp, bench_gpu_chaining.cpp,
bench_gpu_batching.cpp
2026-04-09 16:15:39 -07:00
|
|
|
|
VERIFY((C - C_ref).norm() < tol);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// ---- GEMM accumulate: C += A * B (beta=1) -----------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
template <typename Scalar>
|
|
|
|
|
|
void test_gemm_accumulate(Index m, Index n, Index k) {
|
|
|
|
|
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
|
|
|
|
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
|
|
|
|
|
|
|
|
|
|
|
Mat A = Mat::Random(m, k);
|
|
|
|
|
|
Mat B = Mat::Random(k, n);
|
|
|
|
|
|
Mat C_init = Mat::Random(m, n);
|
|
|
|
|
|
|
|
|
|
|
|
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
|
|
|
|
|
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
|
|
|
|
|
auto d_C = DeviceMatrix<Scalar>::fromHost(C_init);
|
|
|
|
|
|
|
|
|
|
|
|
d_C += d_A * d_B;
|
|
|
|
|
|
|
|
|
|
|
|
Mat C = d_C.toHost();
|
|
|
|
|
|
Mat C_ref = C_init + A * B;
|
|
|
|
|
|
|
2026-04-09 19:11:25 -07:00
|
|
|
|
RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
|
GPU: Add library dispatch module (DeviceMatrix, cuBLAS, cuSOLVER)
Add Eigen/GPU module: A standalone GPU library dispatch layer where
DeviceMatrix<Scalar> operations map 1:1 to cuBLAS/cuSOLVER calls.
CPU and GPU solvers coexist in the same binary with compatible syntax.
Core infrastructure:
- DeviceMatrix<Scalar>: RAII dense column-major GPU memory wrapper with
async host transfer (fromHost/toHost) and CUDA event-based cross-stream
synchronization.
- GpuContext: Unified execution context owning a CUDA stream + cuBLAS
handle + cuSOLVER handle. Thread-local default with explicit override
via setThreadLocal(). Stream-borrowing constructor for integration.
- DeviceBuffer: Typed RAII device allocation with move semantics.
cuBLAS dispatch (expression syntax):
- GEMM: d_C = d_A.adjoint() * d_B (cublasXgemm)
- TRSM: d_X = d_A.triangularView<Lower>().solve(d_B) (cublasXtrsm)
- SYMM/HEMM: d_C = d_A.selfadjointView<Lower>() * d_B (cublasXsymm)
- SYRK/HERK: d_C = d_A * d_A.adjoint() (cublasXsyrk)
cuSOLVER dispatch:
- GpuLLT: Cached Cholesky factorization (cusolverDnXpotrf + Xpotrs)
- GpuLU: Cached LU factorization (cusolverDnXgetrf + Xgetrs)
- Solver chaining: auto x = d_A.llt().solve(d_B)
- Solver expressions with .device(ctx) for explicit stream control.
CI: Bump CUDA container to Ubuntu 22.04 (CMake 3.22), GCC 10->11,
Clang 12->14. Bump cmake_minimum_required to 3.17 for FindCUDAToolkit.
Tests: gpu_cublas.cpp, gpu_cusolver_llt.cpp, gpu_cusolver_lu.cpp,
gpu_device_matrix.cpp, gpu_library_example.cu
Benchmarks: bench_gpu_solvers.cpp, bench_gpu_chaining.cpp,
bench_gpu_batching.cpp
2026-04-09 16:15:39 -07:00
|
|
|
|
VERIFY((C - C_ref).norm() < tol);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// ---- GEMM accumulate into empty destination ---------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
template <typename Scalar>
|
|
|
|
|
|
void test_gemm_accumulate_empty(Index m, Index n, Index k) {
|
|
|
|
|
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
|
|
|
|
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
|
|
|
|
|
|
|
|
|
|
|
Mat A = Mat::Random(m, k);
|
|
|
|
|
|
Mat B = Mat::Random(k, n);
|
|
|
|
|
|
|
|
|
|
|
|
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
|
|
|
|
|
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
|
|
|
|
|
DeviceMatrix<Scalar> d_C;
|
|
|
|
|
|
|
|
|
|
|
|
d_C += d_A * d_B;
|
|
|
|
|
|
|
|
|
|
|
|
Mat C = d_C.toHost();
|
|
|
|
|
|
Mat C_ref = A * B;
|
|
|
|
|
|
|
2026-04-09 19:11:25 -07:00
|
|
|
|
RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
|
GPU: Add library dispatch module (DeviceMatrix, cuBLAS, cuSOLVER)
Add Eigen/GPU module: A standalone GPU library dispatch layer where
DeviceMatrix<Scalar> operations map 1:1 to cuBLAS/cuSOLVER calls.
CPU and GPU solvers coexist in the same binary with compatible syntax.
Core infrastructure:
- DeviceMatrix<Scalar>: RAII dense column-major GPU memory wrapper with
async host transfer (fromHost/toHost) and CUDA event-based cross-stream
synchronization.
- GpuContext: Unified execution context owning a CUDA stream + cuBLAS
handle + cuSOLVER handle. Thread-local default with explicit override
via setThreadLocal(). Stream-borrowing constructor for integration.
- DeviceBuffer: Typed RAII device allocation with move semantics.
cuBLAS dispatch (expression syntax):
- GEMM: d_C = d_A.adjoint() * d_B (cublasXgemm)
- TRSM: d_X = d_A.triangularView<Lower>().solve(d_B) (cublasXtrsm)
- SYMM/HEMM: d_C = d_A.selfadjointView<Lower>() * d_B (cublasXsymm)
- SYRK/HERK: d_C = d_A * d_A.adjoint() (cublasXsyrk)
cuSOLVER dispatch:
- GpuLLT: Cached Cholesky factorization (cusolverDnXpotrf + Xpotrs)
- GpuLU: Cached LU factorization (cusolverDnXgetrf + Xgetrs)
- Solver chaining: auto x = d_A.llt().solve(d_B)
- Solver expressions with .device(ctx) for explicit stream control.
CI: Bump CUDA container to Ubuntu 22.04 (CMake 3.22), GCC 10->11,
Clang 12->14. Bump cmake_minimum_required to 3.17 for FindCUDAToolkit.
Tests: gpu_cublas.cpp, gpu_cusolver_llt.cpp, gpu_cusolver_lu.cpp,
gpu_device_matrix.cpp, gpu_library_example.cu
Benchmarks: bench_gpu_solvers.cpp, bench_gpu_chaining.cpp,
bench_gpu_batching.cpp
2026-04-09 16:15:39 -07:00
|
|
|
|
VERIFY((C - C_ref).norm() < tol);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// ---- GEMM subtract: C -= A * B (beta=1, alpha=-1) --------------------------
|
|
|
|
|
|
|
|
|
|
|
|
template <typename Scalar>
|
|
|
|
|
|
void test_gemm_subtract(Index m, Index n, Index k) {
|
|
|
|
|
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
|
|
|
|
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
|
|
|
|
|
|
|
|
|
|
|
Mat A = Mat::Random(m, k);
|
|
|
|
|
|
Mat B = Mat::Random(k, n);
|
|
|
|
|
|
Mat C_init = Mat::Random(m, n);
|
|
|
|
|
|
|
|
|
|
|
|
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
|
|
|
|
|
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
|
|
|
|
|
auto d_C = DeviceMatrix<Scalar>::fromHost(C_init);
|
|
|
|
|
|
|
|
|
|
|
|
GpuContext ctx;
|
|
|
|
|
|
d_C.device(ctx) -= d_A * d_B;
|
|
|
|
|
|
|
|
|
|
|
|
Mat C = d_C.toHost();
|
|
|
|
|
|
Mat C_ref = C_init - A * B;
|
|
|
|
|
|
|
2026-04-09 19:11:25 -07:00
|
|
|
|
RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
|
GPU: Add library dispatch module (DeviceMatrix, cuBLAS, cuSOLVER)
Add Eigen/GPU module: A standalone GPU library dispatch layer where
DeviceMatrix<Scalar> operations map 1:1 to cuBLAS/cuSOLVER calls.
CPU and GPU solvers coexist in the same binary with compatible syntax.
Core infrastructure:
- DeviceMatrix<Scalar>: RAII dense column-major GPU memory wrapper with
async host transfer (fromHost/toHost) and CUDA event-based cross-stream
synchronization.
- GpuContext: Unified execution context owning a CUDA stream + cuBLAS
handle + cuSOLVER handle. Thread-local default with explicit override
via setThreadLocal(). Stream-borrowing constructor for integration.
- DeviceBuffer: Typed RAII device allocation with move semantics.
cuBLAS dispatch (expression syntax):
- GEMM: d_C = d_A.adjoint() * d_B (cublasXgemm)
- TRSM: d_X = d_A.triangularView<Lower>().solve(d_B) (cublasXtrsm)
- SYMM/HEMM: d_C = d_A.selfadjointView<Lower>() * d_B (cublasXsymm)
- SYRK/HERK: d_C = d_A * d_A.adjoint() (cublasXsyrk)
cuSOLVER dispatch:
- GpuLLT: Cached Cholesky factorization (cusolverDnXpotrf + Xpotrs)
- GpuLU: Cached LU factorization (cusolverDnXgetrf + Xgetrs)
- Solver chaining: auto x = d_A.llt().solve(d_B)
- Solver expressions with .device(ctx) for explicit stream control.
CI: Bump CUDA container to Ubuntu 22.04 (CMake 3.22), GCC 10->11,
Clang 12->14. Bump cmake_minimum_required to 3.17 for FindCUDAToolkit.
Tests: gpu_cublas.cpp, gpu_cusolver_llt.cpp, gpu_cusolver_lu.cpp,
gpu_device_matrix.cpp, gpu_library_example.cu
Benchmarks: bench_gpu_solvers.cpp, bench_gpu_chaining.cpp,
bench_gpu_batching.cpp
2026-04-09 16:15:39 -07:00
|
|
|
|
VERIFY((C - C_ref).norm() < tol);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// ---- GEMM subtract from empty destination -----------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
template <typename Scalar>
|
|
|
|
|
|
void test_gemm_subtract_empty(Index m, Index n, Index k) {
|
|
|
|
|
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
|
|
|
|
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
|
|
|
|
|
|
|
|
|
|
|
Mat A = Mat::Random(m, k);
|
|
|
|
|
|
Mat B = Mat::Random(k, n);
|
|
|
|
|
|
|
|
|
|
|
|
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
|
|
|
|
|
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
|
|
|
|
|
|
|
|
|
|
|
GpuContext ctx;
|
|
|
|
|
|
DeviceMatrix<Scalar> d_C;
|
|
|
|
|
|
d_C.device(ctx) -= d_A * d_B;
|
|
|
|
|
|
|
|
|
|
|
|
Mat C = d_C.toHost();
|
|
|
|
|
|
Mat C_ref = -(A * B);
|
|
|
|
|
|
|
2026-04-09 19:11:25 -07:00
|
|
|
|
RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
|
GPU: Add library dispatch module (DeviceMatrix, cuBLAS, cuSOLVER)
Add Eigen/GPU module: A standalone GPU library dispatch layer where
DeviceMatrix<Scalar> operations map 1:1 to cuBLAS/cuSOLVER calls.
CPU and GPU solvers coexist in the same binary with compatible syntax.
Core infrastructure:
- DeviceMatrix<Scalar>: RAII dense column-major GPU memory wrapper with
async host transfer (fromHost/toHost) and CUDA event-based cross-stream
synchronization.
- GpuContext: Unified execution context owning a CUDA stream + cuBLAS
handle + cuSOLVER handle. Thread-local default with explicit override
via setThreadLocal(). Stream-borrowing constructor for integration.
- DeviceBuffer: Typed RAII device allocation with move semantics.
cuBLAS dispatch (expression syntax):
- GEMM: d_C = d_A.adjoint() * d_B (cublasXgemm)
- TRSM: d_X = d_A.triangularView<Lower>().solve(d_B) (cublasXtrsm)
- SYMM/HEMM: d_C = d_A.selfadjointView<Lower>() * d_B (cublasXsymm)
- SYRK/HERK: d_C = d_A * d_A.adjoint() (cublasXsyrk)
cuSOLVER dispatch:
- GpuLLT: Cached Cholesky factorization (cusolverDnXpotrf + Xpotrs)
- GpuLU: Cached LU factorization (cusolverDnXgetrf + Xgetrs)
- Solver chaining: auto x = d_A.llt().solve(d_B)
- Solver expressions with .device(ctx) for explicit stream control.
CI: Bump CUDA container to Ubuntu 22.04 (CMake 3.22), GCC 10->11,
Clang 12->14. Bump cmake_minimum_required to 3.17 for FindCUDAToolkit.
Tests: gpu_cublas.cpp, gpu_cusolver_llt.cpp, gpu_cusolver_lu.cpp,
gpu_device_matrix.cpp, gpu_library_example.cu
Benchmarks: bench_gpu_solvers.cpp, bench_gpu_chaining.cpp,
bench_gpu_batching.cpp
2026-04-09 16:15:39 -07:00
|
|
|
|
VERIFY((C - C_ref).norm() < tol);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// ---- GEMM with scaled RHS: C = A * (alpha * B) -----------------------------
|
|
|
|
|
|
|
|
|
|
|
|
template <typename Scalar>
|
|
|
|
|
|
void test_gemm_scaled_rhs(Index m, Index n, Index k) {
|
|
|
|
|
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
|
|
|
|
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
|
|
|
|
|
|
|
|
|
|
|
Mat A = Mat::Random(m, k);
|
|
|
|
|
|
Mat B = Mat::Random(k, n);
|
|
|
|
|
|
Scalar alpha = Scalar(3.0);
|
|
|
|
|
|
|
|
|
|
|
|
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
|
|
|
|
|
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
|
|
|
|
|
|
|
|
|
|
|
DeviceMatrix<Scalar> d_C;
|
|
|
|
|
|
d_C = d_A * (alpha * d_B);
|
|
|
|
|
|
|
|
|
|
|
|
Mat C = d_C.toHost();
|
|
|
|
|
|
Mat C_ref = A * (alpha * B);
|
|
|
|
|
|
|
2026-04-09 19:11:25 -07:00
|
|
|
|
RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
|
GPU: Add library dispatch module (DeviceMatrix, cuBLAS, cuSOLVER)
Add Eigen/GPU module: A standalone GPU library dispatch layer where
DeviceMatrix<Scalar> operations map 1:1 to cuBLAS/cuSOLVER calls.
CPU and GPU solvers coexist in the same binary with compatible syntax.
Core infrastructure:
- DeviceMatrix<Scalar>: RAII dense column-major GPU memory wrapper with
async host transfer (fromHost/toHost) and CUDA event-based cross-stream
synchronization.
- GpuContext: Unified execution context owning a CUDA stream + cuBLAS
handle + cuSOLVER handle. Thread-local default with explicit override
via setThreadLocal(). Stream-borrowing constructor for integration.
- DeviceBuffer: Typed RAII device allocation with move semantics.
cuBLAS dispatch (expression syntax):
- GEMM: d_C = d_A.adjoint() * d_B (cublasXgemm)
- TRSM: d_X = d_A.triangularView<Lower>().solve(d_B) (cublasXtrsm)
- SYMM/HEMM: d_C = d_A.selfadjointView<Lower>() * d_B (cublasXsymm)
- SYRK/HERK: d_C = d_A * d_A.adjoint() (cublasXsyrk)
cuSOLVER dispatch:
- GpuLLT: Cached Cholesky factorization (cusolverDnXpotrf + Xpotrs)
- GpuLU: Cached LU factorization (cusolverDnXgetrf + Xgetrs)
- Solver chaining: auto x = d_A.llt().solve(d_B)
- Solver expressions with .device(ctx) for explicit stream control.
CI: Bump CUDA container to Ubuntu 22.04 (CMake 3.22), GCC 10->11,
Clang 12->14. Bump cmake_minimum_required to 3.17 for FindCUDAToolkit.
Tests: gpu_cublas.cpp, gpu_cusolver_llt.cpp, gpu_cusolver_lu.cpp,
gpu_device_matrix.cpp, gpu_library_example.cu
Benchmarks: bench_gpu_solvers.cpp, bench_gpu_chaining.cpp,
bench_gpu_batching.cpp
2026-04-09 16:15:39 -07:00
|
|
|
|
VERIFY((C - C_ref).norm() < tol);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// ---- GEMM dimension mismatch must assert ------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
template <typename Scalar>
|
|
|
|
|
|
void test_gemm_dimension_mismatch() {
|
|
|
|
|
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
|
|
|
|
|
|
|
|
|
|
|
Mat A = Mat::Random(8, 5);
|
|
|
|
|
|
Mat B = Mat::Random(6, 7); // inner dimension mismatch
|
|
|
|
|
|
|
|
|
|
|
|
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
|
|
|
|
|
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
|
|
|
|
|
DeviceMatrix<Scalar> d_C;
|
|
|
|
|
|
|
|
|
|
|
|
VERIFY_RAISES_ASSERT(d_C = d_A * d_B);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// ---- GEMM with explicit GpuContext ------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
template <typename Scalar>
|
|
|
|
|
|
void test_gemm_explicit_context(Index m, Index n, Index k) {
|
|
|
|
|
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
|
|
|
|
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
|
|
|
|
|
|
|
|
|
|
|
Mat A = Mat::Random(m, k);
|
|
|
|
|
|
Mat B = Mat::Random(k, n);
|
|
|
|
|
|
|
|
|
|
|
|
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
|
|
|
|
|
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
|
|
|
|
|
|
|
|
|
|
|
GpuContext ctx;
|
|
|
|
|
|
DeviceMatrix<Scalar> d_C;
|
|
|
|
|
|
d_C.device(ctx) = d_A * d_B;
|
|
|
|
|
|
|
|
|
|
|
|
Mat C = d_C.toHost();
|
|
|
|
|
|
Mat C_ref = A * B;
|
|
|
|
|
|
|
2026-04-09 19:11:25 -07:00
|
|
|
|
RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
|
GPU: Add library dispatch module (DeviceMatrix, cuBLAS, cuSOLVER)
Add Eigen/GPU module: A standalone GPU library dispatch layer where
DeviceMatrix<Scalar> operations map 1:1 to cuBLAS/cuSOLVER calls.
CPU and GPU solvers coexist in the same binary with compatible syntax.
Core infrastructure:
- DeviceMatrix<Scalar>: RAII dense column-major GPU memory wrapper with
async host transfer (fromHost/toHost) and CUDA event-based cross-stream
synchronization.
- GpuContext: Unified execution context owning a CUDA stream + cuBLAS
handle + cuSOLVER handle. Thread-local default with explicit override
via setThreadLocal(). Stream-borrowing constructor for integration.
- DeviceBuffer: Typed RAII device allocation with move semantics.
cuBLAS dispatch (expression syntax):
- GEMM: d_C = d_A.adjoint() * d_B (cublasXgemm)
- TRSM: d_X = d_A.triangularView<Lower>().solve(d_B) (cublasXtrsm)
- SYMM/HEMM: d_C = d_A.selfadjointView<Lower>() * d_B (cublasXsymm)
- SYRK/HERK: d_C = d_A * d_A.adjoint() (cublasXsyrk)
cuSOLVER dispatch:
- GpuLLT: Cached Cholesky factorization (cusolverDnXpotrf + Xpotrs)
- GpuLU: Cached LU factorization (cusolverDnXgetrf + Xgetrs)
- Solver chaining: auto x = d_A.llt().solve(d_B)
- Solver expressions with .device(ctx) for explicit stream control.
CI: Bump CUDA container to Ubuntu 22.04 (CMake 3.22), GCC 10->11,
Clang 12->14. Bump cmake_minimum_required to 3.17 for FindCUDAToolkit.
Tests: gpu_cublas.cpp, gpu_cusolver_llt.cpp, gpu_cusolver_lu.cpp,
gpu_device_matrix.cpp, gpu_library_example.cu
Benchmarks: bench_gpu_solvers.cpp, bench_gpu_chaining.cpp,
bench_gpu_batching.cpp
2026-04-09 16:15:39 -07:00
|
|
|
|
VERIFY((C - C_ref).norm() < tol);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// ---- GEMM cross-context reuse of the same destination -----------------------
|
|
|
|
|
|
|
|
|
|
|
|
template <typename Scalar>
|
|
|
|
|
|
void test_gemm_cross_context_reuse(Index n) {
|
|
|
|
|
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
|
|
|
|
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
|
|
|
|
|
|
|
|
|
|
|
Mat A = Mat::Random(n, n);
|
|
|
|
|
|
Mat B = Mat::Random(n, n);
|
|
|
|
|
|
Mat D = Mat::Random(n, n);
|
|
|
|
|
|
Mat E = Mat::Random(n, n);
|
|
|
|
|
|
|
|
|
|
|
|
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
|
|
|
|
|
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
|
|
|
|
|
auto d_D = DeviceMatrix<Scalar>::fromHost(D);
|
|
|
|
|
|
auto d_E = DeviceMatrix<Scalar>::fromHost(E);
|
|
|
|
|
|
|
|
|
|
|
|
GpuContext ctx1;
|
|
|
|
|
|
GpuContext ctx2;
|
|
|
|
|
|
DeviceMatrix<Scalar> d_C;
|
|
|
|
|
|
d_C.device(ctx1) = d_A * d_B;
|
|
|
|
|
|
d_C.device(ctx2) += d_D * d_E;
|
|
|
|
|
|
|
|
|
|
|
|
Mat C = d_C.toHost();
|
|
|
|
|
|
Mat C_ref = A * B + D * E;
|
|
|
|
|
|
|
2026-04-09 19:11:25 -07:00
|
|
|
|
RealScalar tol = gemm_error_bound<Scalar>(n, A.norm(), B.norm()) + gemm_error_bound<Scalar>(n, D.norm(), E.norm());
|
GPU: Add library dispatch module (DeviceMatrix, cuBLAS, cuSOLVER)
Add Eigen/GPU module: A standalone GPU library dispatch layer where
DeviceMatrix<Scalar> operations map 1:1 to cuBLAS/cuSOLVER calls.
CPU and GPU solvers coexist in the same binary with compatible syntax.
Core infrastructure:
- DeviceMatrix<Scalar>: RAII dense column-major GPU memory wrapper with
async host transfer (fromHost/toHost) and CUDA event-based cross-stream
synchronization.
- GpuContext: Unified execution context owning a CUDA stream + cuBLAS
handle + cuSOLVER handle. Thread-local default with explicit override
via setThreadLocal(). Stream-borrowing constructor for integration.
- DeviceBuffer: Typed RAII device allocation with move semantics.
cuBLAS dispatch (expression syntax):
- GEMM: d_C = d_A.adjoint() * d_B (cublasXgemm)
- TRSM: d_X = d_A.triangularView<Lower>().solve(d_B) (cublasXtrsm)
- SYMM/HEMM: d_C = d_A.selfadjointView<Lower>() * d_B (cublasXsymm)
- SYRK/HERK: d_C = d_A * d_A.adjoint() (cublasXsyrk)
cuSOLVER dispatch:
- GpuLLT: Cached Cholesky factorization (cusolverDnXpotrf + Xpotrs)
- GpuLU: Cached LU factorization (cusolverDnXgetrf + Xgetrs)
- Solver chaining: auto x = d_A.llt().solve(d_B)
- Solver expressions with .device(ctx) for explicit stream control.
CI: Bump CUDA container to Ubuntu 22.04 (CMake 3.22), GCC 10->11,
Clang 12->14. Bump cmake_minimum_required to 3.17 for FindCUDAToolkit.
Tests: gpu_cublas.cpp, gpu_cusolver_llt.cpp, gpu_cusolver_lu.cpp,
gpu_device_matrix.cpp, gpu_library_example.cu
Benchmarks: bench_gpu_solvers.cpp, bench_gpu_chaining.cpp,
bench_gpu_batching.cpp
2026-04-09 16:15:39 -07:00
|
|
|
|
VERIFY((C - C_ref).norm() < tol);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// ---- GEMM cross-context resize of the destination ---------------------------
|
|
|
|
|
|
|
|
|
|
|
|
template <typename Scalar>
|
|
|
|
|
|
void test_gemm_cross_context_resize() {
|
|
|
|
|
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
|
|
|
|
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
|
|
|
|
|
|
|
|
|
|
|
Mat A = Mat::Random(64, 64);
|
|
|
|
|
|
Mat B = Mat::Random(64, 64);
|
|
|
|
|
|
Mat D = Mat::Random(32, 16);
|
|
|
|
|
|
Mat E = Mat::Random(16, 8);
|
|
|
|
|
|
|
|
|
|
|
|
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
|
|
|
|
|
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
|
|
|
|
|
auto d_D = DeviceMatrix<Scalar>::fromHost(D);
|
|
|
|
|
|
auto d_E = DeviceMatrix<Scalar>::fromHost(E);
|
|
|
|
|
|
|
|
|
|
|
|
GpuContext ctx1;
|
|
|
|
|
|
GpuContext ctx2;
|
|
|
|
|
|
DeviceMatrix<Scalar> d_C;
|
|
|
|
|
|
d_C.device(ctx1) = d_A * d_B;
|
|
|
|
|
|
d_C.device(ctx2) = d_D * d_E;
|
|
|
|
|
|
|
|
|
|
|
|
Mat C = d_C.toHost();
|
|
|
|
|
|
Mat C_ref = D * E;
|
|
|
|
|
|
|
2026-04-09 19:11:25 -07:00
|
|
|
|
RealScalar tol = gemm_error_bound<Scalar>(16, D.norm(), E.norm());
|
GPU: Add library dispatch module (DeviceMatrix, cuBLAS, cuSOLVER)
Add Eigen/GPU module: A standalone GPU library dispatch layer where
DeviceMatrix<Scalar> operations map 1:1 to cuBLAS/cuSOLVER calls.
CPU and GPU solvers coexist in the same binary with compatible syntax.
Core infrastructure:
- DeviceMatrix<Scalar>: RAII dense column-major GPU memory wrapper with
async host transfer (fromHost/toHost) and CUDA event-based cross-stream
synchronization.
- GpuContext: Unified execution context owning a CUDA stream + cuBLAS
handle + cuSOLVER handle. Thread-local default with explicit override
via setThreadLocal(). Stream-borrowing constructor for integration.
- DeviceBuffer: Typed RAII device allocation with move semantics.
cuBLAS dispatch (expression syntax):
- GEMM: d_C = d_A.adjoint() * d_B (cublasXgemm)
- TRSM: d_X = d_A.triangularView<Lower>().solve(d_B) (cublasXtrsm)
- SYMM/HEMM: d_C = d_A.selfadjointView<Lower>() * d_B (cublasXsymm)
- SYRK/HERK: d_C = d_A * d_A.adjoint() (cublasXsyrk)
cuSOLVER dispatch:
- GpuLLT: Cached Cholesky factorization (cusolverDnXpotrf + Xpotrs)
- GpuLU: Cached LU factorization (cusolverDnXgetrf + Xgetrs)
- Solver chaining: auto x = d_A.llt().solve(d_B)
- Solver expressions with .device(ctx) for explicit stream control.
CI: Bump CUDA container to Ubuntu 22.04 (CMake 3.22), GCC 10->11,
Clang 12->14. Bump cmake_minimum_required to 3.17 for FindCUDAToolkit.
Tests: gpu_cublas.cpp, gpu_cusolver_llt.cpp, gpu_cusolver_lu.cpp,
gpu_device_matrix.cpp, gpu_library_example.cu
Benchmarks: bench_gpu_solvers.cpp, bench_gpu_chaining.cpp,
bench_gpu_batching.cpp
2026-04-09 16:15:39 -07:00
|
|
|
|
VERIFY((C - C_ref).norm() < tol);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// ---- GEMM chaining: C = (A * B) then D = C * E -----------------------------
|
|
|
|
|
|
|
|
|
|
|
|
template <typename Scalar>
|
|
|
|
|
|
void test_gemm_chain(Index n) {
|
|
|
|
|
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
|
|
|
|
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
|
|
|
|
|
|
|
|
|
|
|
Mat A = Mat::Random(n, n);
|
|
|
|
|
|
Mat B = Mat::Random(n, n);
|
|
|
|
|
|
Mat E = Mat::Random(n, n);
|
|
|
|
|
|
|
|
|
|
|
|
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
|
|
|
|
|
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
|
|
|
|
|
auto d_E = DeviceMatrix<Scalar>::fromHost(E);
|
|
|
|
|
|
|
|
|
|
|
|
DeviceMatrix<Scalar> d_C;
|
|
|
|
|
|
d_C = d_A * d_B;
|
|
|
|
|
|
DeviceMatrix<Scalar> d_D;
|
|
|
|
|
|
d_D = d_C * d_E;
|
|
|
|
|
|
|
|
|
|
|
|
Mat D = d_D.toHost();
|
|
|
|
|
|
Mat D_ref = (A * B) * E;
|
|
|
|
|
|
|
2026-04-09 19:11:25 -07:00
|
|
|
|
Mat C_ref = A * B;
|
|
|
|
|
|
RealScalar tol =
|
|
|
|
|
|
gemm_error_bound<Scalar>(n, A.norm(), B.norm()) * E.norm() + gemm_error_bound<Scalar>(n, C_ref.norm(), E.norm());
|
GPU: Add library dispatch module (DeviceMatrix, cuBLAS, cuSOLVER)
Add Eigen/GPU module: A standalone GPU library dispatch layer where
DeviceMatrix<Scalar> operations map 1:1 to cuBLAS/cuSOLVER calls.
CPU and GPU solvers coexist in the same binary with compatible syntax.
Core infrastructure:
- DeviceMatrix<Scalar>: RAII dense column-major GPU memory wrapper with
async host transfer (fromHost/toHost) and CUDA event-based cross-stream
synchronization.
- GpuContext: Unified execution context owning a CUDA stream + cuBLAS
handle + cuSOLVER handle. Thread-local default with explicit override
via setThreadLocal(). Stream-borrowing constructor for integration.
- DeviceBuffer: Typed RAII device allocation with move semantics.
cuBLAS dispatch (expression syntax):
- GEMM: d_C = d_A.adjoint() * d_B (cublasXgemm)
- TRSM: d_X = d_A.triangularView<Lower>().solve(d_B) (cublasXtrsm)
- SYMM/HEMM: d_C = d_A.selfadjointView<Lower>() * d_B (cublasXsymm)
- SYRK/HERK: d_C = d_A * d_A.adjoint() (cublasXsyrk)
cuSOLVER dispatch:
- GpuLLT: Cached Cholesky factorization (cusolverDnXpotrf + Xpotrs)
- GpuLU: Cached LU factorization (cusolverDnXgetrf + Xgetrs)
- Solver chaining: auto x = d_A.llt().solve(d_B)
- Solver expressions with .device(ctx) for explicit stream control.
CI: Bump CUDA container to Ubuntu 22.04 (CMake 3.22), GCC 10->11,
Clang 12->14. Bump cmake_minimum_required to 3.17 for FindCUDAToolkit.
Tests: gpu_cublas.cpp, gpu_cusolver_llt.cpp, gpu_cusolver_lu.cpp,
gpu_device_matrix.cpp, gpu_library_example.cu
Benchmarks: bench_gpu_solvers.cpp, bench_gpu_chaining.cpp,
bench_gpu_batching.cpp
2026-04-09 16:15:39 -07:00
|
|
|
|
VERIFY((D - D_ref).norm() < tol);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// ---- Square identity check: A * I = A ---------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
template <typename Scalar>
|
|
|
|
|
|
void test_gemm_identity(Index n) {
|
|
|
|
|
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
|
|
|
|
|
|
|
|
|
|
|
Mat A = Mat::Random(n, n);
|
|
|
|
|
|
Mat eye = Mat::Identity(n, n);
|
|
|
|
|
|
|
|
|
|
|
|
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
|
|
|
|
|
auto d_I = DeviceMatrix<Scalar>::fromHost(eye);
|
|
|
|
|
|
|
|
|
|
|
|
DeviceMatrix<Scalar> d_C;
|
|
|
|
|
|
d_C = d_A * d_I;
|
|
|
|
|
|
|
|
|
|
|
|
Mat C = d_C.toHost();
|
|
|
|
|
|
VERIFY_IS_APPROX(C, A);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// ---- LLT solve expression: d_X = d_A.llt().solve(d_B) ----------------------
|
|
|
|
|
|
|
|
|
|
|
|
template <typename MatrixType>
|
|
|
|
|
|
MatrixType make_spd(Index n) {
|
|
|
|
|
|
using Scalar = typename MatrixType::Scalar;
|
|
|
|
|
|
MatrixType M = MatrixType::Random(n, n);
|
|
|
|
|
|
return M.adjoint() * M + MatrixType::Identity(n, n) * static_cast<Scalar>(n);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
template <typename Scalar>
|
|
|
|
|
|
void test_llt_solve_expr(Index n, Index nrhs) {
|
|
|
|
|
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
|
|
|
|
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
|
|
|
|
|
|
|
|
|
|
|
Mat A = make_spd<Mat>(n);
|
|
|
|
|
|
Mat B = Mat::Random(n, nrhs);
|
|
|
|
|
|
|
|
|
|
|
|
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
|
|
|
|
|
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
|
|
|
|
|
|
|
|
|
|
|
DeviceMatrix<Scalar> d_X;
|
|
|
|
|
|
d_X = d_A.llt().solve(d_B);
|
|
|
|
|
|
|
|
|
|
|
|
Mat X = d_X.toHost();
|
|
|
|
|
|
RealScalar residual = (A * X - B).norm() / B.norm();
|
2026-04-09 19:11:25 -07:00
|
|
|
|
VERIFY(residual < RealScalar(n) * gpu_unit_roundoff<Scalar>());
|
GPU: Add library dispatch module (DeviceMatrix, cuBLAS, cuSOLVER)
Add Eigen/GPU module: A standalone GPU library dispatch layer where
DeviceMatrix<Scalar> operations map 1:1 to cuBLAS/cuSOLVER calls.
CPU and GPU solvers coexist in the same binary with compatible syntax.
Core infrastructure:
- DeviceMatrix<Scalar>: RAII dense column-major GPU memory wrapper with
async host transfer (fromHost/toHost) and CUDA event-based cross-stream
synchronization.
- GpuContext: Unified execution context owning a CUDA stream + cuBLAS
handle + cuSOLVER handle. Thread-local default with explicit override
via setThreadLocal(). Stream-borrowing constructor for integration.
- DeviceBuffer: Typed RAII device allocation with move semantics.
cuBLAS dispatch (expression syntax):
- GEMM: d_C = d_A.adjoint() * d_B (cublasXgemm)
- TRSM: d_X = d_A.triangularView<Lower>().solve(d_B) (cublasXtrsm)
- SYMM/HEMM: d_C = d_A.selfadjointView<Lower>() * d_B (cublasXsymm)
- SYRK/HERK: d_C = d_A * d_A.adjoint() (cublasXsyrk)
cuSOLVER dispatch:
- GpuLLT: Cached Cholesky factorization (cusolverDnXpotrf + Xpotrs)
- GpuLU: Cached LU factorization (cusolverDnXgetrf + Xgetrs)
- Solver chaining: auto x = d_A.llt().solve(d_B)
- Solver expressions with .device(ctx) for explicit stream control.
CI: Bump CUDA container to Ubuntu 22.04 (CMake 3.22), GCC 10->11,
Clang 12->14. Bump cmake_minimum_required to 3.17 for FindCUDAToolkit.
Tests: gpu_cublas.cpp, gpu_cusolver_llt.cpp, gpu_cusolver_lu.cpp,
gpu_device_matrix.cpp, gpu_library_example.cu
Benchmarks: bench_gpu_solvers.cpp, bench_gpu_chaining.cpp,
bench_gpu_batching.cpp
2026-04-09 16:15:39 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// ---- LLT solve with explicit context ----------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
template <typename Scalar>
|
|
|
|
|
|
void test_llt_solve_expr_context(Index n, Index nrhs) {
|
|
|
|
|
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
|
|
|
|
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
|
|
|
|
|
|
|
|
|
|
|
Mat A = make_spd<Mat>(n);
|
|
|
|
|
|
Mat B = Mat::Random(n, nrhs);
|
|
|
|
|
|
|
|
|
|
|
|
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
|
|
|
|
|
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
|
|
|
|
|
|
|
|
|
|
|
GpuContext ctx;
|
|
|
|
|
|
DeviceMatrix<Scalar> d_X;
|
|
|
|
|
|
d_X.device(ctx) = d_A.llt().solve(d_B);
|
|
|
|
|
|
|
|
|
|
|
|
Mat X = d_X.toHost();
|
|
|
|
|
|
RealScalar residual = (A * X - B).norm() / B.norm();
|
2026-04-09 19:11:25 -07:00
|
|
|
|
VERIFY(residual < RealScalar(n) * gpu_unit_roundoff<Scalar>());
|
GPU: Add library dispatch module (DeviceMatrix, cuBLAS, cuSOLVER)
Add Eigen/GPU module: A standalone GPU library dispatch layer where
DeviceMatrix<Scalar> operations map 1:1 to cuBLAS/cuSOLVER calls.
CPU and GPU solvers coexist in the same binary with compatible syntax.
Core infrastructure:
- DeviceMatrix<Scalar>: RAII dense column-major GPU memory wrapper with
async host transfer (fromHost/toHost) and CUDA event-based cross-stream
synchronization.
- GpuContext: Unified execution context owning a CUDA stream + cuBLAS
handle + cuSOLVER handle. Thread-local default with explicit override
via setThreadLocal(). Stream-borrowing constructor for integration.
- DeviceBuffer: Typed RAII device allocation with move semantics.
cuBLAS dispatch (expression syntax):
- GEMM: d_C = d_A.adjoint() * d_B (cublasXgemm)
- TRSM: d_X = d_A.triangularView<Lower>().solve(d_B) (cublasXtrsm)
- SYMM/HEMM: d_C = d_A.selfadjointView<Lower>() * d_B (cublasXsymm)
- SYRK/HERK: d_C = d_A * d_A.adjoint() (cublasXsyrk)
cuSOLVER dispatch:
- GpuLLT: Cached Cholesky factorization (cusolverDnXpotrf + Xpotrs)
- GpuLU: Cached LU factorization (cusolverDnXgetrf + Xgetrs)
- Solver chaining: auto x = d_A.llt().solve(d_B)
- Solver expressions with .device(ctx) for explicit stream control.
CI: Bump CUDA container to Ubuntu 22.04 (CMake 3.22), GCC 10->11,
Clang 12->14. Bump cmake_minimum_required to 3.17 for FindCUDAToolkit.
Tests: gpu_cublas.cpp, gpu_cusolver_llt.cpp, gpu_cusolver_lu.cpp,
gpu_device_matrix.cpp, gpu_library_example.cu
Benchmarks: bench_gpu_solvers.cpp, bench_gpu_chaining.cpp,
bench_gpu_batching.cpp
2026-04-09 16:15:39 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// ---- LU solve expression: d_X = d_A.lu().solve(d_B) ------------------------
|
|
|
|
|
|
|
|
|
|
|
|
template <typename Scalar>
|
|
|
|
|
|
void test_lu_solve_expr(Index n, Index nrhs) {
|
|
|
|
|
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
|
|
|
|
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
|
|
|
|
|
|
|
|
|
|
|
Mat A = Mat::Random(n, n);
|
|
|
|
|
|
Mat B = Mat::Random(n, nrhs);
|
|
|
|
|
|
|
|
|
|
|
|
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
|
|
|
|
|
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
|
|
|
|
|
|
|
|
|
|
|
DeviceMatrix<Scalar> d_X;
|
|
|
|
|
|
d_X = d_A.lu().solve(d_B);
|
|
|
|
|
|
|
|
|
|
|
|
Mat X = d_X.toHost();
|
|
|
|
|
|
RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm());
|
2026-04-09 19:11:25 -07:00
|
|
|
|
VERIFY(residual < RealScalar(10) * RealScalar(n) * gpu_unit_roundoff<Scalar>());
|
GPU: Add library dispatch module (DeviceMatrix, cuBLAS, cuSOLVER)
Add Eigen/GPU module: A standalone GPU library dispatch layer where
DeviceMatrix<Scalar> operations map 1:1 to cuBLAS/cuSOLVER calls.
CPU and GPU solvers coexist in the same binary with compatible syntax.
Core infrastructure:
- DeviceMatrix<Scalar>: RAII dense column-major GPU memory wrapper with
async host transfer (fromHost/toHost) and CUDA event-based cross-stream
synchronization.
- GpuContext: Unified execution context owning a CUDA stream + cuBLAS
handle + cuSOLVER handle. Thread-local default with explicit override
via setThreadLocal(). Stream-borrowing constructor for integration.
- DeviceBuffer: Typed RAII device allocation with move semantics.
cuBLAS dispatch (expression syntax):
- GEMM: d_C = d_A.adjoint() * d_B (cublasXgemm)
- TRSM: d_X = d_A.triangularView<Lower>().solve(d_B) (cublasXtrsm)
- SYMM/HEMM: d_C = d_A.selfadjointView<Lower>() * d_B (cublasXsymm)
- SYRK/HERK: d_C = d_A * d_A.adjoint() (cublasXsyrk)
cuSOLVER dispatch:
- GpuLLT: Cached Cholesky factorization (cusolverDnXpotrf + Xpotrs)
- GpuLU: Cached LU factorization (cusolverDnXgetrf + Xgetrs)
- Solver chaining: auto x = d_A.llt().solve(d_B)
- Solver expressions with .device(ctx) for explicit stream control.
CI: Bump CUDA container to Ubuntu 22.04 (CMake 3.22), GCC 10->11,
Clang 12->14. Bump cmake_minimum_required to 3.17 for FindCUDAToolkit.
Tests: gpu_cublas.cpp, gpu_cusolver_llt.cpp, gpu_cusolver_lu.cpp,
gpu_device_matrix.cpp, gpu_library_example.cu
Benchmarks: bench_gpu_solvers.cpp, bench_gpu_chaining.cpp,
bench_gpu_batching.cpp
2026-04-09 16:15:39 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// ---- GEMM + solver chain: C = A * B, X = C.llt().solve(D) ------------------
|
|
|
|
|
|
|
|
|
|
|
|
template <typename Scalar>
|
|
|
|
|
|
void test_gemm_then_solve(Index n) {
|
|
|
|
|
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
|
|
|
|
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
|
|
|
|
|
|
|
|
|
|
|
Mat A = Mat::Random(n, n);
|
|
|
|
|
|
Mat D = Mat::Random(n, 1);
|
|
|
|
|
|
|
|
|
|
|
|
// Make SPD: C = A^H * A + n*I
|
|
|
|
|
|
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
|
|
|
|
|
DeviceMatrix<Scalar> d_C;
|
|
|
|
|
|
d_C = d_A.adjoint() * d_A;
|
|
|
|
|
|
|
|
|
|
|
|
// Add n*I on host (no element-wise ops on DeviceMatrix yet).
|
|
|
|
|
|
Mat C = d_C.toHost();
|
|
|
|
|
|
C += Mat::Identity(n, n) * static_cast<Scalar>(n);
|
|
|
|
|
|
d_C = DeviceMatrix<Scalar>::fromHost(C);
|
|
|
|
|
|
|
|
|
|
|
|
auto d_D = DeviceMatrix<Scalar>::fromHost(D);
|
|
|
|
|
|
|
|
|
|
|
|
DeviceMatrix<Scalar> d_X;
|
|
|
|
|
|
d_X = d_C.llt().solve(d_D);
|
|
|
|
|
|
|
|
|
|
|
|
Mat X = d_X.toHost();
|
|
|
|
|
|
RealScalar residual = (C * X - D).norm() / D.norm();
|
2026-04-09 19:11:25 -07:00
|
|
|
|
VERIFY(residual < RealScalar(n) * gpu_unit_roundoff<Scalar>());
|
GPU: Add library dispatch module (DeviceMatrix, cuBLAS, cuSOLVER)
Add Eigen/GPU module: A standalone GPU library dispatch layer where
DeviceMatrix<Scalar> operations map 1:1 to cuBLAS/cuSOLVER calls.
CPU and GPU solvers coexist in the same binary with compatible syntax.
Core infrastructure:
- DeviceMatrix<Scalar>: RAII dense column-major GPU memory wrapper with
async host transfer (fromHost/toHost) and CUDA event-based cross-stream
synchronization.
- GpuContext: Unified execution context owning a CUDA stream + cuBLAS
handle + cuSOLVER handle. Thread-local default with explicit override
via setThreadLocal(). Stream-borrowing constructor for integration.
- DeviceBuffer: Typed RAII device allocation with move semantics.
cuBLAS dispatch (expression syntax):
- GEMM: d_C = d_A.adjoint() * d_B (cublasXgemm)
- TRSM: d_X = d_A.triangularView<Lower>().solve(d_B) (cublasXtrsm)
- SYMM/HEMM: d_C = d_A.selfadjointView<Lower>() * d_B (cublasXsymm)
- SYRK/HERK: d_C = d_A * d_A.adjoint() (cublasXsyrk)
cuSOLVER dispatch:
- GpuLLT: Cached Cholesky factorization (cusolverDnXpotrf + Xpotrs)
- GpuLU: Cached LU factorization (cusolverDnXgetrf + Xgetrs)
- Solver chaining: auto x = d_A.llt().solve(d_B)
- Solver expressions with .device(ctx) for explicit stream control.
CI: Bump CUDA container to Ubuntu 22.04 (CMake 3.22), GCC 10->11,
Clang 12->14. Bump cmake_minimum_required to 3.17 for FindCUDAToolkit.
Tests: gpu_cublas.cpp, gpu_cusolver_llt.cpp, gpu_cusolver_lu.cpp,
gpu_device_matrix.cpp, gpu_library_example.cu
Benchmarks: bench_gpu_solvers.cpp, bench_gpu_chaining.cpp,
bench_gpu_batching.cpp
2026-04-09 16:15:39 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// ---- LLT solve with Upper triangle -----------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
template <typename Scalar>
|
|
|
|
|
|
void test_llt_solve_upper(Index n, Index nrhs) {
|
|
|
|
|
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
|
|
|
|
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
|
|
|
|
|
|
|
|
|
|
|
Mat A = make_spd<Mat>(n);
|
|
|
|
|
|
Mat B = Mat::Random(n, nrhs);
|
|
|
|
|
|
|
|
|
|
|
|
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
|
|
|
|
|
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
|
|
|
|
|
|
|
|
|
|
|
DeviceMatrix<Scalar> d_X;
|
|
|
|
|
|
d_X = d_A.template llt<Upper>().solve(d_B);
|
|
|
|
|
|
|
|
|
|
|
|
Mat X = d_X.toHost();
|
|
|
|
|
|
RealScalar residual = (A * X - B).norm() / B.norm();
|
2026-04-09 19:11:25 -07:00
|
|
|
|
VERIFY(residual < RealScalar(n) * gpu_unit_roundoff<Scalar>());
|
GPU: Add library dispatch module (DeviceMatrix, cuBLAS, cuSOLVER)
Add Eigen/GPU module: A standalone GPU library dispatch layer where
DeviceMatrix<Scalar> operations map 1:1 to cuBLAS/cuSOLVER calls.
CPU and GPU solvers coexist in the same binary with compatible syntax.
Core infrastructure:
- DeviceMatrix<Scalar>: RAII dense column-major GPU memory wrapper with
async host transfer (fromHost/toHost) and CUDA event-based cross-stream
synchronization.
- GpuContext: Unified execution context owning a CUDA stream + cuBLAS
handle + cuSOLVER handle. Thread-local default with explicit override
via setThreadLocal(). Stream-borrowing constructor for integration.
- DeviceBuffer: Typed RAII device allocation with move semantics.
cuBLAS dispatch (expression syntax):
- GEMM: d_C = d_A.adjoint() * d_B (cublasXgemm)
- TRSM: d_X = d_A.triangularView<Lower>().solve(d_B) (cublasXtrsm)
- SYMM/HEMM: d_C = d_A.selfadjointView<Lower>() * d_B (cublasXsymm)
- SYRK/HERK: d_C = d_A * d_A.adjoint() (cublasXsyrk)
cuSOLVER dispatch:
- GpuLLT: Cached Cholesky factorization (cusolverDnXpotrf + Xpotrs)
- GpuLU: Cached LU factorization (cusolverDnXgetrf + Xgetrs)
- Solver chaining: auto x = d_A.llt().solve(d_B)
- Solver expressions with .device(ctx) for explicit stream control.
CI: Bump CUDA container to Ubuntu 22.04 (CMake 3.22), GCC 10->11,
Clang 12->14. Bump cmake_minimum_required to 3.17 for FindCUDAToolkit.
Tests: gpu_cublas.cpp, gpu_cusolver_llt.cpp, gpu_cusolver_lu.cpp,
gpu_device_matrix.cpp, gpu_library_example.cu
Benchmarks: bench_gpu_solvers.cpp, bench_gpu_chaining.cpp,
bench_gpu_batching.cpp
2026-04-09 16:15:39 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// ---- LU solve with explicit context -----------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
template <typename Scalar>
|
|
|
|
|
|
void test_lu_solve_expr_context(Index n, Index nrhs) {
|
|
|
|
|
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
|
|
|
|
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
|
|
|
|
|
|
|
|
|
|
|
Mat A = Mat::Random(n, n);
|
|
|
|
|
|
Mat B = Mat::Random(n, nrhs);
|
|
|
|
|
|
|
|
|
|
|
|
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
|
|
|
|
|
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
|
|
|
|
|
|
|
|
|
|
|
GpuContext ctx;
|
|
|
|
|
|
DeviceMatrix<Scalar> d_X;
|
|
|
|
|
|
d_X.device(ctx) = d_A.lu().solve(d_B);
|
|
|
|
|
|
|
|
|
|
|
|
Mat X = d_X.toHost();
|
|
|
|
|
|
RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm());
|
2026-04-09 19:11:25 -07:00
|
|
|
|
VERIFY(residual < RealScalar(10) * RealScalar(n) * gpu_unit_roundoff<Scalar>());
|
GPU: Add library dispatch module (DeviceMatrix, cuBLAS, cuSOLVER)
Add Eigen/GPU module: A standalone GPU library dispatch layer where
DeviceMatrix<Scalar> operations map 1:1 to cuBLAS/cuSOLVER calls.
CPU and GPU solvers coexist in the same binary with compatible syntax.
Core infrastructure:
- DeviceMatrix<Scalar>: RAII dense column-major GPU memory wrapper with
async host transfer (fromHost/toHost) and CUDA event-based cross-stream
synchronization.
- GpuContext: Unified execution context owning a CUDA stream + cuBLAS
handle + cuSOLVER handle. Thread-local default with explicit override
via setThreadLocal(). Stream-borrowing constructor for integration.
- DeviceBuffer: Typed RAII device allocation with move semantics.
cuBLAS dispatch (expression syntax):
- GEMM: d_C = d_A.adjoint() * d_B (cublasXgemm)
- TRSM: d_X = d_A.triangularView<Lower>().solve(d_B) (cublasXtrsm)
- SYMM/HEMM: d_C = d_A.selfadjointView<Lower>() * d_B (cublasXsymm)
- SYRK/HERK: d_C = d_A * d_A.adjoint() (cublasXsyrk)
cuSOLVER dispatch:
- GpuLLT: Cached Cholesky factorization (cusolverDnXpotrf + Xpotrs)
- GpuLU: Cached LU factorization (cusolverDnXgetrf + Xgetrs)
- Solver chaining: auto x = d_A.llt().solve(d_B)
- Solver expressions with .device(ctx) for explicit stream control.
CI: Bump CUDA container to Ubuntu 22.04 (CMake 3.22), GCC 10->11,
Clang 12->14. Bump cmake_minimum_required to 3.17 for FindCUDAToolkit.
Tests: gpu_cublas.cpp, gpu_cusolver_llt.cpp, gpu_cusolver_lu.cpp,
gpu_device_matrix.cpp, gpu_library_example.cu
Benchmarks: bench_gpu_solvers.cpp, bench_gpu_chaining.cpp,
bench_gpu_batching.cpp
2026-04-09 16:15:39 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// ---- Zero-nrhs solver expressions ------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
template <typename Scalar>
|
|
|
|
|
|
void test_llt_solve_zero_nrhs(Index n) {
|
|
|
|
|
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
|
|
|
|
|
|
|
|
|
|
|
Mat A = make_spd<Mat>(n);
|
|
|
|
|
|
Mat B = Mat::Random(n, 0);
|
|
|
|
|
|
|
|
|
|
|
|
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
|
|
|
|
|
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
|
|
|
|
|
|
|
|
|
|
|
DeviceMatrix<Scalar> d_X;
|
|
|
|
|
|
d_X = d_A.llt().solve(d_B);
|
|
|
|
|
|
|
|
|
|
|
|
VERIFY_IS_EQUAL(d_X.rows(), n);
|
|
|
|
|
|
VERIFY_IS_EQUAL(d_X.cols(), 0);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
template <typename Scalar>
|
|
|
|
|
|
void test_lu_solve_zero_nrhs(Index n) {
|
|
|
|
|
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
|
|
|
|
|
|
|
|
|
|
|
Mat A = Mat::Random(n, n);
|
|
|
|
|
|
Mat B = Mat::Random(n, 0);
|
|
|
|
|
|
|
|
|
|
|
|
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
|
|
|
|
|
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
|
|
|
|
|
|
|
|
|
|
|
DeviceMatrix<Scalar> d_X;
|
|
|
|
|
|
d_X = d_A.lu().solve(d_B);
|
|
|
|
|
|
|
|
|
|
|
|
VERIFY_IS_EQUAL(d_X.rows(), n);
|
|
|
|
|
|
VERIFY_IS_EQUAL(d_X.cols(), 0);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// ---- TRSM: triangularView<UpLo>().solve(B) ----------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
template <typename Scalar, int UpLo>
|
|
|
|
|
|
void test_trsm(Index n, Index nrhs) {
|
|
|
|
|
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
|
|
|
|
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
|
|
|
|
|
|
|
|
|
|
|
// Build a well-conditioned triangular matrix.
|
|
|
|
|
|
Mat A = Mat::Random(n, n);
|
|
|
|
|
|
A.diagonal().array() += static_cast<Scalar>(n); // ensure non-singular
|
|
|
|
|
|
if (UpLo == Lower)
|
|
|
|
|
|
A = A.template triangularView<Lower>();
|
|
|
|
|
|
else
|
|
|
|
|
|
A = A.template triangularView<Upper>();
|
|
|
|
|
|
|
|
|
|
|
|
Mat B = Mat::Random(n, nrhs);
|
|
|
|
|
|
|
|
|
|
|
|
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
|
|
|
|
|
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
|
|
|
|
|
|
|
|
|
|
|
DeviceMatrix<Scalar> d_X;
|
|
|
|
|
|
d_X = d_A.template triangularView<UpLo>().solve(d_B);
|
|
|
|
|
|
|
|
|
|
|
|
Mat X = d_X.toHost();
|
|
|
|
|
|
RealScalar residual = (A * X - B).norm() / B.norm();
|
2026-04-09 19:11:25 -07:00
|
|
|
|
VERIFY(residual < RealScalar(n) * gpu_unit_roundoff<Scalar>());
|
GPU: Add library dispatch module (DeviceMatrix, cuBLAS, cuSOLVER)
Add Eigen/GPU module: A standalone GPU library dispatch layer where
DeviceMatrix<Scalar> operations map 1:1 to cuBLAS/cuSOLVER calls.
CPU and GPU solvers coexist in the same binary with compatible syntax.
Core infrastructure:
- DeviceMatrix<Scalar>: RAII dense column-major GPU memory wrapper with
async host transfer (fromHost/toHost) and CUDA event-based cross-stream
synchronization.
- GpuContext: Unified execution context owning a CUDA stream + cuBLAS
handle + cuSOLVER handle. Thread-local default with explicit override
via setThreadLocal(). Stream-borrowing constructor for integration.
- DeviceBuffer: Typed RAII device allocation with move semantics.
cuBLAS dispatch (expression syntax):
- GEMM: d_C = d_A.adjoint() * d_B (cublasXgemm)
- TRSM: d_X = d_A.triangularView<Lower>().solve(d_B) (cublasXtrsm)
- SYMM/HEMM: d_C = d_A.selfadjointView<Lower>() * d_B (cublasXsymm)
- SYRK/HERK: d_C = d_A * d_A.adjoint() (cublasXsyrk)
cuSOLVER dispatch:
- GpuLLT: Cached Cholesky factorization (cusolverDnXpotrf + Xpotrs)
- GpuLU: Cached LU factorization (cusolverDnXgetrf + Xgetrs)
- Solver chaining: auto x = d_A.llt().solve(d_B)
- Solver expressions with .device(ctx) for explicit stream control.
CI: Bump CUDA container to Ubuntu 22.04 (CMake 3.22), GCC 10->11,
Clang 12->14. Bump cmake_minimum_required to 3.17 for FindCUDAToolkit.
Tests: gpu_cublas.cpp, gpu_cusolver_llt.cpp, gpu_cusolver_lu.cpp,
gpu_device_matrix.cpp, gpu_library_example.cu
Benchmarks: bench_gpu_solvers.cpp, bench_gpu_chaining.cpp,
bench_gpu_batching.cpp
2026-04-09 16:15:39 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// ---- SYMM/HEMM: selfadjointView<UpLo>() * B --------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
template <typename Scalar, int UpLo>
|
|
|
|
|
|
void test_symm(Index n, Index nrhs) {
|
|
|
|
|
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
|
|
|
|
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
|
|
|
|
|
|
|
|
|
|
|
Mat A = make_spd<Mat>(n); // SPD is also self-adjoint
|
|
|
|
|
|
Mat B = Mat::Random(n, nrhs);
|
|
|
|
|
|
|
|
|
|
|
|
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
|
|
|
|
|
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
|
|
|
|
|
|
|
|
|
|
|
DeviceMatrix<Scalar> d_C;
|
|
|
|
|
|
d_C = d_A.template selfadjointView<UpLo>() * d_B;
|
|
|
|
|
|
|
|
|
|
|
|
Mat C = d_C.toHost();
|
|
|
|
|
|
Mat C_ref = A * B; // A is symmetric, so full multiply == symm
|
|
|
|
|
|
|
2026-04-09 19:11:25 -07:00
|
|
|
|
RealScalar tol = gemm_error_bound<Scalar>(n, A.norm(), B.norm());
|
GPU: Add library dispatch module (DeviceMatrix, cuBLAS, cuSOLVER)
Add Eigen/GPU module: A standalone GPU library dispatch layer where
DeviceMatrix<Scalar> operations map 1:1 to cuBLAS/cuSOLVER calls.
CPU and GPU solvers coexist in the same binary with compatible syntax.
Core infrastructure:
- DeviceMatrix<Scalar>: RAII dense column-major GPU memory wrapper with
async host transfer (fromHost/toHost) and CUDA event-based cross-stream
synchronization.
- GpuContext: Unified execution context owning a CUDA stream + cuBLAS
handle + cuSOLVER handle. Thread-local default with explicit override
via setThreadLocal(). Stream-borrowing constructor for integration.
- DeviceBuffer: Typed RAII device allocation with move semantics.
cuBLAS dispatch (expression syntax):
- GEMM: d_C = d_A.adjoint() * d_B (cublasXgemm)
- TRSM: d_X = d_A.triangularView<Lower>().solve(d_B) (cublasXtrsm)
- SYMM/HEMM: d_C = d_A.selfadjointView<Lower>() * d_B (cublasXsymm)
- SYRK/HERK: d_C = d_A * d_A.adjoint() (cublasXsyrk)
cuSOLVER dispatch:
- GpuLLT: Cached Cholesky factorization (cusolverDnXpotrf + Xpotrs)
- GpuLU: Cached LU factorization (cusolverDnXgetrf + Xgetrs)
- Solver chaining: auto x = d_A.llt().solve(d_B)
- Solver expressions with .device(ctx) for explicit stream control.
CI: Bump CUDA container to Ubuntu 22.04 (CMake 3.22), GCC 10->11,
Clang 12->14. Bump cmake_minimum_required to 3.17 for FindCUDAToolkit.
Tests: gpu_cublas.cpp, gpu_cusolver_llt.cpp, gpu_cusolver_lu.cpp,
gpu_device_matrix.cpp, gpu_library_example.cu
Benchmarks: bench_gpu_solvers.cpp, bench_gpu_chaining.cpp,
bench_gpu_batching.cpp
2026-04-09 16:15:39 -07:00
|
|
|
|
VERIFY((C - C_ref).norm() < tol);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// ---- SYRK/HERK: rankUpdate(A) → C = A * A^H --------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
template <typename Scalar>
|
|
|
|
|
|
void test_syrk(Index n, Index k) {
|
|
|
|
|
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
|
|
|
|
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
|
|
|
|
|
|
|
|
|
|
|
Mat A = Mat::Random(n, k);
|
|
|
|
|
|
|
|
|
|
|
|
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
|
|
|
|
|
|
|
|
|
|
|
DeviceMatrix<Scalar> d_C;
|
|
|
|
|
|
d_C.template selfadjointView<Lower>().rankUpdate(d_A);
|
|
|
|
|
|
|
|
|
|
|
|
Mat C = d_C.toHost();
|
|
|
|
|
|
// Only lower triangle is meaningful for SYRK. Compare lower triangle.
|
|
|
|
|
|
Mat C_ref = A * A.adjoint();
|
|
|
|
|
|
|
|
|
|
|
|
// Extract lower triangle for comparison.
|
|
|
|
|
|
Mat C_lower = C.template triangularView<Lower>();
|
|
|
|
|
|
Mat C_ref_lower = C_ref.template triangularView<Lower>();
|
|
|
|
|
|
|
2026-04-09 19:11:25 -07:00
|
|
|
|
RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), A.norm());
|
GPU: Add library dispatch module (DeviceMatrix, cuBLAS, cuSOLVER)
Add Eigen/GPU module: A standalone GPU library dispatch layer where
DeviceMatrix<Scalar> operations map 1:1 to cuBLAS/cuSOLVER calls.
CPU and GPU solvers coexist in the same binary with compatible syntax.
Core infrastructure:
- DeviceMatrix<Scalar>: RAII dense column-major GPU memory wrapper with
async host transfer (fromHost/toHost) and CUDA event-based cross-stream
synchronization.
- GpuContext: Unified execution context owning a CUDA stream + cuBLAS
handle + cuSOLVER handle. Thread-local default with explicit override
via setThreadLocal(). Stream-borrowing constructor for integration.
- DeviceBuffer: Typed RAII device allocation with move semantics.
cuBLAS dispatch (expression syntax):
- GEMM: d_C = d_A.adjoint() * d_B (cublasXgemm)
- TRSM: d_X = d_A.triangularView<Lower>().solve(d_B) (cublasXtrsm)
- SYMM/HEMM: d_C = d_A.selfadjointView<Lower>() * d_B (cublasXsymm)
- SYRK/HERK: d_C = d_A * d_A.adjoint() (cublasXsyrk)
cuSOLVER dispatch:
- GpuLLT: Cached Cholesky factorization (cusolverDnXpotrf + Xpotrs)
- GpuLU: Cached LU factorization (cusolverDnXgetrf + Xgetrs)
- Solver chaining: auto x = d_A.llt().solve(d_B)
- Solver expressions with .device(ctx) for explicit stream control.
CI: Bump CUDA container to Ubuntu 22.04 (CMake 3.22), GCC 10->11,
Clang 12->14. Bump cmake_minimum_required to 3.17 for FindCUDAToolkit.
Tests: gpu_cublas.cpp, gpu_cusolver_llt.cpp, gpu_cusolver_lu.cpp,
gpu_device_matrix.cpp, gpu_library_example.cu
Benchmarks: bench_gpu_solvers.cpp, bench_gpu_chaining.cpp,
bench_gpu_batching.cpp
2026-04-09 16:15:39 -07:00
|
|
|
|
VERIFY((C_lower - C_ref_lower).norm() < tol);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// ---- Per-scalar driver ------------------------------------------------------

// Runs the full cuBLAS/cuSOLVER dispatch battery for one Scalar type:
// GEMM variants, cuSOLVER-backed solver expressions, TRSM, SYMM/HEMM and
// SYRK/HERK, over a mix of square, rectangular, degenerate and larger sizes.
template <typename Scalar>
void test_scalar() {
  // GEMM: plain products, including the 1x1x1 degenerate case and a larger
  // size to hit different cuBLAS kernel paths.
  CALL_SUBTEST(test_gemm_basic<Scalar>(64, 64, 64));
  CALL_SUBTEST(test_gemm_basic<Scalar>(128, 64, 32));
  CALL_SUBTEST(test_gemm_basic<Scalar>(1, 1, 1));
  CALL_SUBTEST(test_gemm_basic<Scalar>(256, 256, 256));

  // GEMM with transposed/adjointed operands (opA/opB dispatch).
  CALL_SUBTEST(test_gemm_adjoint_lhs<Scalar>(64, 64, 64));
  CALL_SUBTEST(test_gemm_adjoint_lhs<Scalar>(128, 32, 64));

  CALL_SUBTEST(test_gemm_transpose_rhs<Scalar>(64, 64, 64));
  CALL_SUBTEST(test_gemm_transpose_rhs<Scalar>(128, 32, 64));

  // GEMM with alpha/beta scaling, accumulation (+=/-=), shape checking and
  // explicit/cross-context execution.
  CALL_SUBTEST(test_gemm_scaled<Scalar>(64, 64, 64));
  CALL_SUBTEST(test_gemm_scaled_rhs<Scalar>(64, 64, 64));
  CALL_SUBTEST(test_gemm_accumulate<Scalar>(64, 64, 64));
  CALL_SUBTEST(test_gemm_accumulate_empty<Scalar>(64, 64, 64));
  CALL_SUBTEST(test_gemm_subtract<Scalar>(64, 64, 64));
  CALL_SUBTEST(test_gemm_subtract_empty<Scalar>(64, 64, 64));
  CALL_SUBTEST(test_gemm_dimension_mismatch<Scalar>());
  CALL_SUBTEST(test_gemm_explicit_context<Scalar>(64, 64, 64));
  CALL_SUBTEST(test_gemm_cross_context_reuse<Scalar>(64));
  CALL_SUBTEST(test_gemm_cross_context_resize<Scalar>());
  CALL_SUBTEST(test_gemm_chain<Scalar>(64));
  CALL_SUBTEST(test_gemm_identity<Scalar>(64));

  // Solver expressions (cuSOLVER LLT/LU), with single and multiple RHS,
  // explicit-context variants and the Upper-triangle LLT path.
  CALL_SUBTEST(test_llt_solve_expr<Scalar>(64, 1));
  CALL_SUBTEST(test_llt_solve_expr<Scalar>(64, 4));
  CALL_SUBTEST(test_llt_solve_expr<Scalar>(256, 8));
  CALL_SUBTEST(test_llt_solve_expr_context<Scalar>(64, 4));
  CALL_SUBTEST(test_llt_solve_upper<Scalar>(64, 4));
  CALL_SUBTEST(test_lu_solve_expr<Scalar>(64, 1));
  CALL_SUBTEST(test_lu_solve_expr<Scalar>(64, 4));
  CALL_SUBTEST(test_lu_solve_expr<Scalar>(256, 8));
  CALL_SUBTEST(test_lu_solve_expr_context<Scalar>(64, 4));
  // Zero-size edge cases (dedicated tests, not residual-based).
  CALL_SUBTEST(test_llt_solve_zero_nrhs<Scalar>(64));
  CALL_SUBTEST(test_llt_solve_zero_nrhs<Scalar>(0));
  CALL_SUBTEST(test_lu_solve_zero_nrhs<Scalar>(64));
  CALL_SUBTEST(test_lu_solve_zero_nrhs<Scalar>(0));
  CALL_SUBTEST(test_gemm_then_solve<Scalar>(64));

  // TRSM
  CALL_SUBTEST((test_trsm<Scalar, Lower>(64, 1)));
  CALL_SUBTEST((test_trsm<Scalar, Lower>(64, 4)));
  CALL_SUBTEST((test_trsm<Scalar, Upper>(64, 4)));
  CALL_SUBTEST((test_trsm<Scalar, Lower>(256, 8)));

  // SYMM/HEMM
  CALL_SUBTEST((test_symm<Scalar, Lower>(64, 4)));
  CALL_SUBTEST((test_symm<Scalar, Upper>(64, 4)));
  CALL_SUBTEST((test_symm<Scalar, Lower>(128, 8)));

  // SYRK/HERK
  CALL_SUBTEST(test_syrk<Scalar>(64, 64));
  CALL_SUBTEST(test_syrk<Scalar>(64, 32));
  CALL_SUBTEST(test_syrk<Scalar>(128, 64));
}
|
|
|
|
|
|
|
|
|
|
|
|
// ---- Solver failure mode tests (not templated on Scalar) --------------------
|
|
|
|
|
|
|
|
|
|
|
|
void test_llt_not_spd() {
|
|
|
|
|
|
// Negative definite matrix — LLT factorization must fail.
|
|
|
|
|
|
MatrixXd A = -MatrixXd::Identity(8, 8);
|
|
|
|
|
|
MatrixXd B = MatrixXd::Random(8, 1);
|
|
|
|
|
|
auto d_A = DeviceMatrix<double>::fromHost(A);
|
|
|
|
|
|
auto d_B = DeviceMatrix<double>::fromHost(B);
|
|
|
|
|
|
DeviceMatrix<double> d_X;
|
|
|
|
|
|
VERIFY_RAISES_ASSERT(d_X = d_A.llt().solve(d_B));
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
void test_lu_singular() {
|
|
|
|
|
|
// Zero matrix — LU factorization must detect singularity.
|
|
|
|
|
|
MatrixXd A = MatrixXd::Zero(8, 8);
|
|
|
|
|
|
MatrixXd B = MatrixXd::Random(8, 1);
|
|
|
|
|
|
auto d_A = DeviceMatrix<double>::fromHost(A);
|
|
|
|
|
|
auto d_B = DeviceMatrix<double>::fromHost(B);
|
|
|
|
|
|
DeviceMatrix<double> d_X;
|
|
|
|
|
|
VERIFY_RAISES_ASSERT(d_X = d_A.lu().solve(d_B));
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// Test entry point: run the per-scalar battery for all four scalar types
// supported by the cuBLAS dispatch layer, then the scalar-independent
// solver failure-mode checks.
EIGEN_DECLARE_TEST(gpu_cublas) {
  CALL_SUBTEST(test_scalar<float>());
  CALL_SUBTEST(test_scalar<double>());
  CALL_SUBTEST(test_scalar<std::complex<float>>());
  CALL_SUBTEST(test_scalar<std::complex<double>>());
  CALL_SUBTEST(test_llt_not_spd());
  CALL_SUBTEST(test_lu_singular());
}
|