Files
eigen/test/gpu_library_example.cu
Rasmus Munk Larsen 58c44ef36d GPU: Add library dispatch module (DeviceMatrix, cuBLAS, cuSOLVER)
Add Eigen/GPU module: A standalone GPU library dispatch layer where
DeviceMatrix<Scalar> operations map 1:1 to cuBLAS/cuSOLVER calls.
CPU and GPU solvers coexist in the same binary with compatible syntax.

Core infrastructure:
- DeviceMatrix<Scalar>: RAII dense column-major GPU memory wrapper with
  async host transfer (fromHost/toHost) and CUDA event-based cross-stream
  synchronization.
- GpuContext: Unified execution context owning a CUDA stream + cuBLAS
  handle + cuSOLVER handle. Thread-local default with explicit override
  via setThreadLocal(). Stream-borrowing constructor for integration.
- DeviceBuffer: Typed RAII device allocation with move semantics.

cuBLAS dispatch (expression syntax):
- GEMM: d_C = d_A.adjoint() * d_B (cublasXgemm)
- TRSM: d_X = d_A.triangularView<Lower>().solve(d_B) (cublasXtrsm)
- SYMM/HEMM: d_C = d_A.selfadjointView<Lower>() * d_B (cublasXsymm)
- SYRK/HERK: d_C = d_A * d_A.adjoint() (cublasXsyrk)

cuSOLVER dispatch:
- GpuLLT: Cached Cholesky factorization (cusolverDnXpotrf + Xpotrs)
- GpuLU: Cached LU factorization (cusolverDnXgetrf + Xgetrs)
- Solver chaining: auto x = d_A.llt().solve(d_B)
- Solver expressions with .device(ctx) for explicit stream control.

CI: Bump CUDA container to Ubuntu 22.04 (CMake 3.22), GCC 10->11,
Clang 12->14. Bump cmake_minimum_required to 3.17 for FindCUDAToolkit.

Tests: gpu_cublas.cpp, gpu_cusolver_llt.cpp, gpu_cusolver_lu.cpp,
gpu_device_matrix.cpp, gpu_library_example.cu
Benchmarks: bench_gpu_solvers.cpp, bench_gpu_chaining.cpp,
bench_gpu_batching.cpp
2026-04-09 19:05:25 -07:00

111 lines
3.2 KiB
Plaintext

// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// Smoke test for GPU library test infrastructure.
// Verifies GpuContext, GpuBuffer, and host<->device matrix transfers
// without requiring any NVIDIA library (cuBLAS, cuSOLVER, etc.).
#define EIGEN_USE_GPU
#include "main.h"
#include "gpu_context.h"
#include "gpu_library_test_helper.h"
using namespace Eigen;
using namespace Eigen::test;
// Test that GpuContext initializes, reports valid device info, and owns a cuSOLVER handle.
// Sanity-check GpuContext: a default-constructed context should bind to a
// valid device of at least sm_70, own a live stream, and hold a cuSOLVER
// handle. Prints the detected GPU so CI logs show which hardware ran.
void test_gpu_context() {
  GpuContext ctx;
  const auto& props = ctx.deviceProperties();
  VERIFY(ctx.device() >= 0);
  VERIFY(props.major >= 7);  // library requires sm_70 or newer
  VERIFY(ctx.stream != nullptr);
  VERIFY(ctx.cusolver != nullptr);
  std::cout << " GPU: " << props.name << " (sm_" << props.major << props.minor << ")\n";
}
// Test dense matrix roundtrip: host -> device -> host.
// Round-trip a random dense matrix host -> device -> host and verify the
// data that comes back is bit-identical to what went in. Works for any
// dense MatrixType (scalar type and storage order are template-driven).
template <typename MatrixType>
void test_dense_roundtrip() {
  GpuContext ctx;
  const Index rows = 64;
  const Index cols = 32;
  MatrixType src = MatrixType::Random(rows, cols);
  auto dev = gpu_copy_to_device(ctx.stream, src);
  VERIFY(dev.data != nullptr);
  VERIFY(dev.size == rows * cols);
  MatrixType dst = MatrixType::Zero(rows, cols);
  gpu_copy_to_host(ctx.stream, dev, dst);
  // Copies are stream-ordered; wait for completion before comparing on host.
  ctx.synchronize();
  VERIFY_IS_EQUAL(src, dst);
}
// Test GpuBuffer RAII: move semantics, async zero-init.
// Exercise GpuBuffer ownership semantics (move construction and move
// assignment must transfer the allocation and null out the source) and the
// asynchronous zero-fill helper.
void test_gpu_buffer() {
  GpuContext ctx;
  const Index n = 128;
  GpuBuffer<float> first(n);
  VERIFY(first.data != nullptr);
  VERIFY(first.size == n);

  // Move construction: `second` takes ownership, `first` is emptied.
  GpuBuffer<float> second(std::move(first));
  VERIFY(first.data == nullptr);
  VERIFY(second.data != nullptr);
  VERIFY(second.size == n);

  // Move assignment behaves the same way.
  GpuBuffer<float> third;
  third = std::move(second);
  VERIFY(second.data == nullptr);
  VERIFY(third.data != nullptr);

  // setZeroAsync is stream-ordered: synchronize before reading back.
  third.setZeroAsync(ctx.stream);
  ctx.synchronize();
  std::vector<float> host(n);
  GPU_CHECK(cudaMemcpy(host.data(), third.data, n * sizeof(float), cudaMemcpyDeviceToHost));
  for (float value : host) {
    VERIFY_IS_EQUAL(value, 0.0f);
  }
}
// Test with vectors (1D).
// Round-trip a random column vector (1D case) host -> device -> host and
// verify bit-exact recovery, for an arbitrary Scalar type.
template <typename Scalar>
void test_vector_roundtrip() {
  typedef Matrix<Scalar, Dynamic, 1> VectorType;
  GpuContext ctx;
  const Index n = 256;
  VectorType v = VectorType::Random(n);
  auto dev = gpu_copy_to_device(ctx.stream, v);
  VectorType w = VectorType::Zero(n);
  gpu_copy_to_host(ctx.stream, dev, w);
  // Wait for the stream-ordered copies before comparing on the host.
  ctx.synchronize();
  VERIFY_IS_EQUAL(v, w);
}
// Test entry point. Subtests are not reordered: CALL_SUBTEST ordering
// determines Eigen's subtest-part numbering (EIGEN_TEST_PART selection).
EIGEN_DECLARE_TEST(gpu_library_example) {
// Infrastructure checks first: context creation and buffer RAII semantics.
CALL_SUBTEST(test_gpu_context());
CALL_SUBTEST(test_gpu_buffer());
// Dense roundtrips over both scalar types; the RowMajor instantiations
// also cover non-default host storage order. The extra parentheses keep
// the comma inside Matrix<...> from splitting the macro argument.
CALL_SUBTEST(test_dense_roundtrip<MatrixXf>());
CALL_SUBTEST(test_dense_roundtrip<MatrixXd>());
CALL_SUBTEST((test_dense_roundtrip<Matrix<float, Dynamic, Dynamic, RowMajor>>()));
CALL_SUBTEST((test_dense_roundtrip<Matrix<double, Dynamic, Dynamic, RowMajor>>()));
// 1D vector roundtrips, including a complex scalar type.
CALL_SUBTEST(test_vector_roundtrip<float>());
CALL_SUBTEST(test_vector_roundtrip<double>());
CALL_SUBTEST(test_vector_roundtrip<std::complex<float>>());
}