Mirror of https://gitlab.com/libeigen/eigen.git, synced 2026-04-10 11:34:33 +08:00
Add Eigen/GPU module: a standalone GPU library dispatch layer where
DeviceMatrix<Scalar> operations map 1:1 to cuBLAS/cuSOLVER calls. CPU and
GPU solvers coexist in the same binary with compatible syntax.

Core infrastructure:
- DeviceMatrix<Scalar>: RAII dense column-major GPU memory wrapper with
  async host transfer (fromHost/toHost) and CUDA event-based cross-stream
  synchronization.
- GpuContext: unified execution context owning a CUDA stream + cuBLAS
  handle + cuSOLVER handle. Thread-local default with explicit override
  via setThreadLocal(). Stream-borrowing constructor for integration.
- DeviceBuffer: typed RAII device allocation with move semantics.

cuBLAS dispatch (expression syntax):
- GEMM: d_C = d_A.adjoint() * d_B (cublasXgemm)
- TRSM: d_X = d_A.triangularView<Lower>().solve(d_B) (cublasXtrsm)
- SYMM/HEMM: d_C = d_A.selfadjointView<Lower>() * d_B (cublasXsymm)
- SYRK/HERK: d_C = d_A * d_A.adjoint() (cublasXsyrk)

cuSOLVER dispatch:
- GpuLLT: cached Cholesky factorization (cusolverDnXpotrf + Xpotrs)
- GpuLU: cached LU factorization (cusolverDnXgetrf + Xgetrs)
- Solver chaining: auto x = d_A.llt().solve(d_B)
- Solver expressions with .device(ctx) for explicit stream control.

CI: bump CUDA container to Ubuntu 22.04 (CMake 3.22), GCC 10->11,
Clang 12->14. Bump cmake_minimum_required to 3.17 for FindCUDAToolkit.

Tests: gpu_cublas.cpp, gpu_cusolver_llt.cpp, gpu_cusolver_lu.cpp,
gpu_device_matrix.cpp, gpu_library_example.cu

Benchmarks: bench_gpu_solvers.cpp, bench_gpu_chaining.cpp,
bench_gpu_batching.cpp
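To make the dispatch layer concrete before the test listing below, here is a
minimal usage sketch. It assumes only what the commit message states
(DeviceMatrix<Scalar>, GpuContext, fromHost/toHost, expression dispatch, and
llt().solve); the <Eigen/GPU> header name, the constructor shapes, and the
exact method signatures are assumptions, not confirmed API.

// Hedged usage sketch of the Eigen/GPU dispatch layer described above.
// Names follow the commit message; the module header and exact
// signatures are assumptions.
#define EIGEN_USE_GPU
#include <Eigen/Dense>
#include <Eigen/GPU>  // assumed module header

using namespace Eigen;

// Solve A * X = B on the GPU for a symmetric positive-definite A.
void gpu_solve_sketch(const MatrixXd& A, const MatrixXd& B, MatrixXd& X) {
  GpuContext ctx;  // owns a CUDA stream + cuBLAS/cuSOLVER handles

  DeviceMatrix<double> d_A(A.rows(), A.cols());
  DeviceMatrix<double> d_B(B.rows(), B.cols());
  d_A.fromHost(A);  // async host -> device transfer on ctx's stream
  d_B.fromHost(B);

  // Expression syntax dispatching to cublasDgemm: d_C = d_A^T * d_B.
  DeviceMatrix<double> d_C = d_A.adjoint() * d_B;

  // Solver chaining: cached Cholesky factorization + solve, dispatched
  // to cusolverDnXpotrf/Xpotrs. Per the commit message, .device(ctx) on
  // solver expressions gives explicit stream control.
  DeviceMatrix<double> d_X = d_A.llt().solve(d_B);

  d_X.toHost(X);      // async device -> host transfer
  ctx.synchronize();  // drain the stream before reading X on the host
}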
111 lines · 3.2 KiB · CUDA
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

// Smoke test for GPU library test infrastructure.
// Verifies GpuContext, GpuBuffer, and host<->device matrix transfers
// without requiring any NVIDIA library (cuBLAS, cuSOLVER, etc.).

#define EIGEN_USE_GPU
#include "main.h"
#include "gpu_context.h"
#include "gpu_library_test_helper.h"

using namespace Eigen;
using namespace Eigen::test;

// Test that GpuContext initializes, reports valid device info, and owns a cuSOLVER handle.
void test_gpu_context() {
  GpuContext ctx;
  VERIFY(ctx.device() >= 0);
  VERIFY(ctx.deviceProperties().major >= 7);  // sm_70 minimum
  VERIFY(ctx.stream != nullptr);
  VERIFY(ctx.cusolver != nullptr);
  std::cout << " GPU: " << ctx.deviceProperties().name << " (sm_" << ctx.deviceProperties().major
            << ctx.deviceProperties().minor << ")\n";
}

// Test dense matrix roundtrip: host -> device -> host.
template <typename MatrixType>
void test_dense_roundtrip() {
  GpuContext ctx;
  const Index rows = 64;
  const Index cols = 32;

  MatrixType A = MatrixType::Random(rows, cols);
  auto buf = gpu_copy_to_device(ctx.stream, A);
  VERIFY(buf.data != nullptr);
  VERIFY(buf.size == rows * cols);

  MatrixType B(rows, cols);
  B.setZero();
  gpu_copy_to_host(ctx.stream, buf, B);
  ctx.synchronize();

  VERIFY_IS_EQUAL(A, B);
}

// Test GpuBuffer RAII: move semantics, async zero-init.
void test_gpu_buffer() {
  GpuContext ctx;

  GpuBuffer<float> a(128);
  VERIFY(a.data != nullptr);
  VERIFY(a.size == 128);

  // Move construction.
  GpuBuffer<float> b(std::move(a));
  VERIFY(a.data == nullptr);
  VERIFY(b.data != nullptr);
  VERIFY(b.size == 128);

  // Move assignment.
  GpuBuffer<float> c;
  c = std::move(b);
  VERIFY(b.data == nullptr);
  VERIFY(c.data != nullptr);

  // setZeroAsync.
  c.setZeroAsync(ctx.stream);
  ctx.synchronize();

  std::vector<float> host(128);
  GPU_CHECK(cudaMemcpy(host.data(), c.data, 128 * sizeof(float), cudaMemcpyDeviceToHost));
  for (int i = 0; i < 128; ++i) {
    VERIFY_IS_EQUAL(host[i], 0.0f);
  }
}

// Test with vectors (1D).
template <typename Scalar>
void test_vector_roundtrip() {
  GpuContext ctx;
  const Index n = 256;
  Matrix<Scalar, Dynamic, 1> v = Matrix<Scalar, Dynamic, 1>::Random(n);
  auto buf = gpu_copy_to_device(ctx.stream, v);

  Matrix<Scalar, Dynamic, 1> w(n);
  w.setZero();
  gpu_copy_to_host(ctx.stream, buf, w);
  ctx.synchronize();

  VERIFY_IS_EQUAL(v, w);
}

EIGEN_DECLARE_TEST(gpu_library_example) {
  CALL_SUBTEST(test_gpu_context());
  CALL_SUBTEST(test_gpu_buffer());
  CALL_SUBTEST(test_dense_roundtrip<MatrixXf>());
  CALL_SUBTEST(test_dense_roundtrip<MatrixXd>());
  CALL_SUBTEST((test_dense_roundtrip<Matrix<float, Dynamic, Dynamic, RowMajor>>()));
  CALL_SUBTEST((test_dense_roundtrip<Matrix<double, Dynamic, Dynamic, RowMajor>>()));
  CALL_SUBTEST(test_vector_roundtrip<float>());
  CALL_SUBTEST(test_vector_roundtrip<double>());
  CALL_SUBTEST(test_vector_roundtrip<std::complex<float>>());
}