mirror of
https://gitlab.com/libeigen/eigen.git
synced 2026-04-10 11:34:33 +08:00
Add Eigen/GPU module: A standalone GPU library dispatch layer where DeviceMatrix<Scalar> operations map 1:1 to cuBLAS/cuSOLVER calls. CPU and GPU solvers coexist in the same binary with compatible syntax. Core infrastructure: - DeviceMatrix<Scalar>: RAII dense column-major GPU memory wrapper with async host transfer (fromHost/toHost) and CUDA event-based cross-stream synchronization. - GpuContext: Unified execution context owning a CUDA stream + cuBLAS handle + cuSOLVER handle. Thread-local default with explicit override via setThreadLocal(). Stream-borrowing constructor for integration. - DeviceBuffer: Typed RAII device allocation with move semantics. cuBLAS dispatch (expression syntax): - GEMM: d_C = d_A.adjoint() * d_B (cublasXgemm) - TRSM: d_X = d_A.triangularView<Lower>().solve(d_B) (cublasXtrsm) - SYMM/HEMM: d_C = d_A.selfadjointView<Lower>() * d_B (cublasXsymm) - SYRK/HERK: d_C = d_A * d_A.adjoint() (cublasXsyrk) cuSOLVER dispatch: - GpuLLT: Cached Cholesky factorization (cusolverDnXpotrf + Xpotrs) - GpuLU: Cached LU factorization (cusolverDnXgetrf + Xgetrs) - Solver chaining: auto x = d_A.llt().solve(d_B) - Solver expressions with .device(ctx) for explicit stream control. CI: Bump CUDA container to Ubuntu 22.04 (CMake 3.22), GCC 10->11, Clang 12->14. Bump cmake_minimum_required to 3.17 for FindCUDAToolkit. Tests: gpu_cublas.cpp, gpu_cusolver_llt.cpp, gpu_cusolver_lu.cpp, gpu_device_matrix.cpp, gpu_library_example.cu Benchmarks: bench_gpu_solvers.cpp, bench_gpu_chaining.cpp, bench_gpu_batching.cpp
54 lines
2.2 KiB
CMake
# GPU benchmarks require CUDA runtime + cuSOLVER.
# Build separately from the main benchmark tree since they need the CUDA toolchain.
#
# Usage:
#   cmake -G Ninja -B build-bench-gpu -S benchmarks/GPU \
#     -DCMAKE_CUDA_ARCHITECTURES=89
#   cmake --build build-bench-gpu
#
# Profiling:
#   nsys profile --trace=cuda ./build-bench-gpu/bench_gpu_solvers
#   ncu --set full -o profile ./build-bench-gpu/bench_gpu_solvers --benchmark_filter=BM_GpuLLT_Compute/4096

cmake_minimum_required(VERSION 3.18)
|
|
project(EigenGpuBenchmarks CXX)
|
|
|
|
find_package(benchmark REQUIRED)
|
|
find_package(CUDAToolkit REQUIRED)
|
|
|
|
set(EIGEN_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../..")
|
|
|
|
function(eigen_add_gpu_benchmark name source)
|
|
cmake_parse_arguments(BENCH "" "" "LIBRARIES;DEFINITIONS" ${ARGN})
|
|
if(NOT IS_ABSOLUTE "${source}")
|
|
set(source "${CMAKE_CURRENT_SOURCE_DIR}/${source}")
|
|
endif()
|
|
add_executable(${name} ${source})
|
|
target_include_directories(${name} PRIVATE
|
|
${EIGEN_SOURCE_DIR}
|
|
${CUDAToolkit_INCLUDE_DIRS})
|
|
target_link_libraries(${name} PRIVATE
|
|
benchmark::benchmark benchmark::benchmark_main
|
|
CUDA::cudart CUDA::cusolver CUDA::cublas)
|
|
if(BENCH_LIBRARIES)
|
|
target_link_libraries(${name} PRIVATE ${BENCH_LIBRARIES})
|
|
endif()
|
|
target_compile_options(${name} PRIVATE -O3 -DNDEBUG)
|
|
target_compile_definitions(${name} PRIVATE EIGEN_USE_GPU)
|
|
if(BENCH_DEFINITIONS)
|
|
target_compile_definitions(${name} PRIVATE ${BENCH_DEFINITIONS})
|
|
endif()
|
|
endfunction()
|
|
|
|
# Solver benchmarks: LLT/LU compute + solve, host vs device paths, CPU baselines.
|
|
eigen_add_gpu_benchmark(bench_gpu_solvers bench_gpu_solvers.cpp)
|
|
eigen_add_gpu_benchmark(bench_gpu_solvers_float bench_gpu_solvers.cpp DEFINITIONS SCALAR=float)
|
|
|
|
# Chaining benchmarks: async pipeline efficiency, host-roundtrip vs device chain.
|
|
eigen_add_gpu_benchmark(bench_gpu_chaining bench_gpu_chaining.cpp)
|
|
eigen_add_gpu_benchmark(bench_gpu_chaining_float bench_gpu_chaining.cpp DEFINITIONS SCALAR=float)
|
|
|
|
# Batching benchmarks: multi-stream concurrency for many small systems.
|
|
eigen_add_gpu_benchmark(bench_gpu_batching bench_gpu_batching.cpp)
|
|
eigen_add_gpu_benchmark(bench_gpu_batching_float bench_gpu_batching.cpp DEFINITIONS SCALAR=float)
|