Files
eigen/test/gpu_device_matrix.cpp
Rasmus Munk Larsen 58c44ef36d GPU: Add library dispatch module (DeviceMatrix, cuBLAS, cuSOLVER)
Add Eigen/GPU module: A standalone GPU library dispatch layer where
DeviceMatrix<Scalar> operations map 1:1 to cuBLAS/cuSOLVER calls.
CPU and GPU solvers coexist in the same binary with compatible syntax.

Core infrastructure:
- DeviceMatrix<Scalar>: RAII dense column-major GPU memory wrapper with
  async host transfer (fromHost/toHost) and CUDA event-based cross-stream
  synchronization.
- GpuContext: Unified execution context owning a CUDA stream + cuBLAS
  handle + cuSOLVER handle. Thread-local default with explicit override
  via setThreadLocal(). Stream-borrowing constructor for integration.
- DeviceBuffer: Typed RAII device allocation with move semantics.

cuBLAS dispatch (expression syntax):
- GEMM: d_C = d_A.adjoint() * d_B (cublasXgemm)
- TRSM: d_X = d_A.triangularView<Lower>().solve(d_B) (cublasXtrsm)
- SYMM/HEMM: d_C = d_A.selfadjointView<Lower>() * d_B (cublasXsymm)
- SYRK/HERK: d_C = d_A * d_A.adjoint() (cublasXsyrk)

cuSOLVER dispatch:
- GpuLLT: Cached Cholesky factorization (cusolverDnXpotrf + Xpotrs)
- GpuLU: Cached LU factorization (cusolverDnXgetrf + Xgetrs)
- Solver chaining: auto x = d_A.llt().solve(d_B)
- Solver expressions with .device(ctx) for explicit stream control.

CI: Bump CUDA container to Ubuntu 22.04 (CMake 3.22), GCC 10->11,
Clang 12->14. Bump cmake_minimum_required to 3.17 for FindCUDAToolkit.

Tests: gpu_cublas.cpp, gpu_cusolver_llt.cpp, gpu_cusolver_lu.cpp,
gpu_device_matrix.cpp, gpu_library_example.cu
Benchmarks: bench_gpu_solvers.cpp, bench_gpu_chaining.cpp,
bench_gpu_batching.cpp
2026-04-09 19:05:25 -07:00

248 lines
7.5 KiB
C++

// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// Tests for DeviceMatrix and HostTransfer: typed RAII GPU memory wrapper.
// No cuSOLVER dependency — only CUDA runtime.
#define EIGEN_USE_GPU
#include "main.h"
#include <Eigen/GPU>
using namespace Eigen;
// ---- Default construction ---------------------------------------------------
void test_default_construct() {
DeviceMatrix<double> dm;
VERIFY(dm.empty());
VERIFY_IS_EQUAL(dm.rows(), 0);
VERIFY_IS_EQUAL(dm.cols(), 0);
VERIFY(dm.data() == nullptr);
VERIFY_IS_EQUAL(dm.sizeInBytes(), size_t(0));
}
// ---- Allocate uninitialized -------------------------------------------------
// Verifies that sized construction allocates uninitialized device storage
// with the expected dense column-major geometry.
template <typename Scalar>
void test_allocate(Index rows, Index cols) {
  DeviceMatrix<Scalar> mat(rows, cols);
  const size_t expected_bytes = size_t(rows) * size_t(cols) * sizeof(Scalar);
  VERIFY(mat.data() != nullptr);
  VERIFY(!mat.empty());
  VERIFY_IS_EQUAL(mat.rows(), rows);
  VERIFY_IS_EQUAL(mat.cols(), cols);
  // Dense column-major layout: the outer stride equals the row count.
  VERIFY_IS_EQUAL(mat.outerStride(), rows);
  VERIFY_IS_EQUAL(mat.sizeInBytes(), expected_bytes);
}
// ---- fromHost / toHost roundtrip (synchronous) ------------------------------
// Uploads a random host matrix with fromHost(), downloads it with toHost(),
// and checks that contents and dimensions survive the roundtrip.
template <typename Scalar>
void test_roundtrip(Index rows, Index cols) {
  using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
  MatrixType source = MatrixType::Random(rows, cols);
  auto device = DeviceMatrix<Scalar>::fromHost(source);
  VERIFY(!device.empty());
  VERIFY_IS_EQUAL(device.rows(), rows);
  VERIFY_IS_EQUAL(device.cols(), cols);
  MatrixType downloaded = device.toHost();
  VERIFY_IS_EQUAL(downloaded.rows(), rows);
  VERIFY_IS_EQUAL(downloaded.cols(), cols);
  VERIFY_IS_APPROX(downloaded, source);
}
// ---- fromHostAsync / toHostAsync roundtrip -----------------------------------
// Exercises the asynchronous transfer path on a dedicated CUDA stream:
// fromHostAsync() from a raw pointer, toHostAsync() returning a
// HostTransfer future, and blocking completion via get().
template <typename Scalar>
void test_roundtrip_async(Index rows, Index cols) {
  using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
  MatrixType source = MatrixType::Random(rows, cols);
  cudaStream_t stream;
  EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream));
  // Enqueue the upload from a raw host pointer; the leading dimension of a
  // dense column-major Eigen matrix is its row count.
  auto device = DeviceMatrix<Scalar>::fromHostAsync(source.data(), rows, cols, rows, stream);
  VERIFY_IS_EQUAL(device.rows(), rows);
  VERIFY_IS_EQUAL(device.cols(), cols);
  // Enqueue the download on the same stream.
  auto pending = device.toHostAsync(stream);
  // get() blocks until the transfer completes and yields the matrix.
  MatrixType downloaded = pending.get();
  VERIFY_IS_APPROX(downloaded, source);
  EIGEN_CUDA_RUNTIME_CHECK(cudaStreamDestroy(stream));
}
// ---- HostTransfer::ready() and idempotent get() -----------------------------
void test_host_transfer_ready() {
using MatrixType = Matrix<double, Dynamic, Dynamic>;
MatrixType host = MatrixType::Random(100, 100);
auto dm = DeviceMatrix<double>::fromHost(host);
auto transfer = dm.toHostAsync();
// After get(), ready() must return true.
MatrixType result = transfer.get();
VERIFY(transfer.ready());
VERIFY_IS_APPROX(result, host);
// get() is idempotent.
MatrixType& result2 = transfer.get();
VERIFY_IS_APPROX(result2, host);
}
// ---- HostTransfer move ------------------------------------------------------
void test_host_transfer_move() {
using MatrixType = Matrix<double, Dynamic, Dynamic>;
MatrixType host = MatrixType::Random(50, 50);
auto dm = DeviceMatrix<double>::fromHost(host);
auto transfer = dm.toHostAsync();
HostTransfer<double> moved(std::move(transfer));
MatrixType result = moved.get();
VERIFY_IS_APPROX(result, host);
}
// ---- clone() produces independent copy --------------------------------------
// clone() must deep-copy the device allocation: overwriting the original
// afterwards may not affect the clone's contents.
template <typename Scalar>
void test_clone(Index rows, Index cols) {
  using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
  MatrixType original = MatrixType::Random(rows, cols);
  auto device = DeviceMatrix<Scalar>::fromHost(original);
  auto copy = device.clone();
  // Replace the source's contents with fresh data.
  MatrixType replacement = MatrixType::Random(rows, cols);
  device = DeviceMatrix<Scalar>::fromHost(replacement);
  // The clone owns a separate allocation and still holds the old data...
  MatrixType from_copy = copy.toHost();
  VERIFY_IS_APPROX(from_copy, original);
  // ...while the original now holds the replacement.
  MatrixType from_device = device.toHost();
  VERIFY_IS_APPROX(from_device, replacement);
}
// ---- Move construct ---------------------------------------------------------
// Move construction transfers ownership of the device allocation and leaves
// the donor empty with a null data pointer.
template <typename Scalar>
void test_move_construct(Index rows, Index cols) {
  using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
  MatrixType source = MatrixType::Random(rows, cols);
  auto donor = DeviceMatrix<Scalar>::fromHost(source);
  DeviceMatrix<Scalar> receiver(std::move(donor));
  // The receiver takes over dimensions and contents...
  VERIFY_IS_EQUAL(receiver.rows(), rows);
  VERIFY_IS_EQUAL(receiver.cols(), cols);
  MatrixType downloaded = receiver.toHost();
  VERIFY_IS_APPROX(downloaded, source);
  // ...while the moved-from object is empty and pointer-free.
  VERIFY(donor.empty());
  VERIFY(donor.data() == nullptr);
}
// ---- Move assign ------------------------------------------------------------
// Move assignment transfers ownership of the device allocation into an
// existing (empty) destination. Mirrors the checks in test_move_construct:
// the original only verified dest.rows() and dm.empty(); this also checks
// dest.cols() and that the moved-from data pointer is null, so the two
// move tests assert the same postconditions.
template <typename Scalar>
void test_move_assign(Index rows, Index cols) {
  using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
  MatrixType host = MatrixType::Random(rows, cols);
  auto dm = DeviceMatrix<Scalar>::fromHost(host);
  DeviceMatrix<Scalar> dest;
  dest = std::move(dm);
  // Moved-from object is left empty (consistent with move construction).
  VERIFY(dm.empty());
  VERIFY(dm.data() == nullptr);
  // Destination takes over both dimensions, not just the row count.
  VERIFY_IS_EQUAL(dest.rows(), rows);
  VERIFY_IS_EQUAL(dest.cols(), cols);
  MatrixType result = dest.toHost();
  VERIFY_IS_APPROX(result, host);
}
// ---- resize() ---------------------------------------------------------------
// resize() reallocates for a new shape but is a no-op (same pointer) when
// the requested dimensions already match.
void test_resize() {
  DeviceMatrix<double> dm(10, 20);
  VERIFY_IS_EQUAL(dm.rows(), 10);
  VERIFY_IS_EQUAL(dm.cols(), 20);
  // Grow to a different dense column-major shape.
  dm.resize(50, 30);
  VERIFY(dm.data() != nullptr);
  VERIFY_IS_EQUAL(dm.rows(), 50);
  VERIFY_IS_EQUAL(dm.cols(), 30);
  VERIFY_IS_EQUAL(dm.outerStride(), 50);
  // A same-shape resize must keep the existing allocation.
  double* before = dm.data();
  dm.resize(50, 30);
  VERIFY(dm.data() == before);
}
// ---- Empty / 0x0 matrix -----------------------------------------------------
void test_empty() {
using MatrixType = Matrix<double, Dynamic, Dynamic>;
MatrixType empty_mat(0, 0);
auto dm = DeviceMatrix<double>::fromHost(empty_mat);
VERIFY(dm.empty());
VERIFY_IS_EQUAL(dm.rows(), 0);
VERIFY_IS_EQUAL(dm.cols(), 0);
MatrixType result = dm.toHost();
VERIFY_IS_EQUAL(result.rows(), 0);
VERIFY_IS_EQUAL(result.cols(), 0);
}
// ---- Per-scalar driver ------------------------------------------------------
template <typename Scalar>
void test_scalar() {
// Square.
CALL_SUBTEST(test_roundtrip<Scalar>(1, 1));
CALL_SUBTEST(test_roundtrip<Scalar>(64, 64));
CALL_SUBTEST(test_roundtrip<Scalar>(256, 256));
// Rectangular.
CALL_SUBTEST(test_roundtrip<Scalar>(100, 7));
CALL_SUBTEST(test_roundtrip<Scalar>(7, 100));
// Async roundtrip.
CALL_SUBTEST(test_roundtrip_async<Scalar>(64, 64));
CALL_SUBTEST(test_roundtrip_async<Scalar>(100, 7));
CALL_SUBTEST(test_clone<Scalar>(64, 64));
CALL_SUBTEST(test_move_construct<Scalar>(64, 64));
CALL_SUBTEST(test_move_assign<Scalar>(64, 64));
}
// Test entry point. Registration order: scalar-independent checks first,
// then allocation for real scalars, then the full per-scalar driver for all
// four BLAS scalar types (S/D/C/Z).
EIGEN_DECLARE_TEST(gpu_device_matrix) {
// Scalar-independent: default construction, 0x0 handling, resize semantics,
// and the HostTransfer future API (ready/get/move).
CALL_SUBTEST(test_default_construct());
CALL_SUBTEST(test_empty());
CALL_SUBTEST(test_resize());
CALL_SUBTEST(test_host_transfer_ready());
CALL_SUBTEST(test_host_transfer_move());
// Raw allocation geometry for float and double.
CALL_SUBTEST((test_allocate<float>(100, 50)));
CALL_SUBTEST((test_allocate<double>(100, 50)));
// Roundtrip/ownership suite for each supported scalar type.
CALL_SUBTEST(test_scalar<float>());
CALL_SUBTEST(test_scalar<double>());
CALL_SUBTEST(test_scalar<std::complex<float>>());
CALL_SUBTEST(test_scalar<std::complex<double>>());
}