mirror of https://gitlab.com/libeigen/eigen.git
synced 2026-04-10 11:34:33 +08:00
Add Eigen/GPU module: A standalone GPU library dispatch layer where DeviceMatrix<Scalar> operations map 1:1 to cuBLAS/cuSOLVER calls. CPU and GPU solvers coexist in the same binary with compatible syntax. Core infrastructure: - DeviceMatrix<Scalar>: RAII dense column-major GPU memory wrapper with async host transfer (fromHost/toHost) and CUDA event-based cross-stream synchronization. - GpuContext: Unified execution context owning a CUDA stream + cuBLAS handle + cuSOLVER handle. Thread-local default with explicit override via setThreadLocal(). Stream-borrowing constructor for integration. - DeviceBuffer: Typed RAII device allocation with move semantics. cuBLAS dispatch (expression syntax): - GEMM: d_C = d_A.adjoint() * d_B (cublasXgemm) - TRSM: d_X = d_A.triangularView<Lower>().solve(d_B) (cublasXtrsm) - SYMM/HEMM: d_C = d_A.selfadjointView<Lower>() * d_B (cublasXsymm) - SYRK/HERK: d_C = d_A * d_A.adjoint() (cublasXsyrk) cuSOLVER dispatch: - GpuLLT: Cached Cholesky factorization (cusolverDnXpotrf + Xpotrs) - GpuLU: Cached LU factorization (cusolverDnXgetrf + Xgetrs) - Solver chaining: auto x = d_A.llt().solve(d_B) - Solver expressions with .device(ctx) for explicit stream control. CI: Bump CUDA container to Ubuntu 22.04 (CMake 3.22), GCC 10->11, Clang 12->14. Bump cmake_minimum_required to 3.17 for FindCUDAToolkit. Tests: gpu_cublas.cpp, gpu_cusolver_llt.cpp, gpu_cusolver_lu.cpp, gpu_device_matrix.cpp, gpu_library_example.cu Benchmarks: bench_gpu_solvers.cpp, bench_gpu_chaining.cpp, bench_gpu_batching.cpp
91 lines · 3.2 KiB · C++
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_TEST_GPU_LIBRARY_TEST_HELPER_H
#define EIGEN_TEST_GPU_LIBRARY_TEST_HELPER_H

// Helpers for GPU tests that call NVIDIA library APIs (cuBLAS, cuSOLVER, etc.)
// from the host side. Provides RAII GPU memory management and async matrix transfer.
//
// This is separate from gpu_common.h (element-parallel device kernels) and
// gpu_test_helper.h (serialization-based device kernels). Those patterns run
// user functors inside GPU kernels. This helper is for host-orchestrated tests
// that call library APIs which launch their own kernels internally.
//
// All transfers use an explicit stream and cudaMemcpyAsync. Callers must
// synchronize (ctx.synchronize() or cudaStreamSynchronize) before reading
// results back on the host.

#include "gpu_test_helper.h"

namespace Eigen {
namespace test {

|
// RAII wrapper for GPU device memory. Prevents leaks when VERIFY macros abort.
// Owns a device allocation of `size` elements of Scalar; move-only so exactly
// one owner ever frees the allocation.
template <typename Scalar>
struct GpuBuffer {
  Scalar* data = nullptr;  // device pointer; nullptr when empty or moved-from
  Index size = 0;          // element count, not bytes

  GpuBuffer() = default;

  // Allocates room for n Scalars on the current device. gpuMalloc is
  // synchronous, so allocate once and reuse across iterations where possible.
  explicit GpuBuffer(Index n) : size(n) {
    eigen_assert(n >= 0);
    GPU_CHECK(gpuMalloc(reinterpret_cast<void**>(&data), n * sizeof(Scalar)));
  }

  ~GpuBuffer() {
    if (data) GPU_CHECK(gpuFree(data));
  }

  // Move-only: transfer ownership and leave `other` empty.
  GpuBuffer(GpuBuffer&& other) noexcept : data(other.data), size(other.size) {
    other.data = nullptr;
    other.size = 0;
  }
  GpuBuffer& operator=(GpuBuffer&& other) noexcept {
    if (this != &other) {
      // Release our own allocation before adopting other's.
      if (data) GPU_CHECK(gpuFree(data));
      data = other.data;
      size = other.size;
      other.data = nullptr;
      other.size = 0;
    }
    return *this;
  }

  GpuBuffer(const GpuBuffer&) = delete;
  GpuBuffer& operator=(const GpuBuffer&) = delete;

  // Async zero the buffer on the given stream. This is a byte-wise memset,
  // which is only a correct fill for the value zero; a non-zero Scalar fill
  // would need a kernel. No-op on an empty or moved-from buffer, so we never
  // hand a null pointer to cudaMemsetAsync.
  void setZeroAsync(cudaStream_t stream) {
    if (data) GPU_CHECK(cudaMemsetAsync(data, 0, size * sizeof(Scalar), stream));
  }
};

// Copy a dense Eigen matrix to a new GPU buffer, async on the given stream.
// Caller must synchronize before the host matrix is freed or modified.
template <typename Derived>
GpuBuffer<typename Derived::Scalar> gpu_copy_to_device(cudaStream_t stream, const MatrixBase<Derived>& host_mat) {
  using Scalar = typename Derived::Scalar;
  const auto& src = host_mat.derived();
  const size_t bytes = src.size() * sizeof(Scalar);
  GpuBuffer<Scalar> device_buf(src.size());
  GPU_CHECK(cudaMemcpyAsync(device_buf.data, src.data(), bytes, cudaMemcpyHostToDevice, stream));
  return device_buf;
}

// Copy GPU buffer contents back to a dense Eigen matrix, async on the given stream.
// Caller must synchronize before reading from host_mat.
template <typename Scalar, typename Derived>
void gpu_copy_to_host(cudaStream_t stream, const GpuBuffer<Scalar>& buf, MatrixBase<Derived>& host_mat) {
  auto& dst = host_mat.derived();
  eigen_assert(buf.size == dst.size());
  const size_t bytes = dst.size() * sizeof(Scalar);
  GPU_CHECK(cudaMemcpyAsync(dst.data(), buf.data, bytes, cudaMemcpyDeviceToHost, stream));
}

} // namespace test
} // namespace Eigen

#endif  // EIGEN_TEST_GPU_LIBRARY_TEST_HELPER_H