mirror of
https://gitlab.com/libeigen/eigen.git
synced 2026-04-10 11:34:33 +08:00
Add the operator interface needed for GPU iterative solvers: - BLAS Level-1 on DeviceMatrix: dot(), norm(), squaredNorm(), setZero(), noalias(), operator+=/-=/\*= dispatching to cuBLAS axpy/scal/dot/nrm2. - DeviceScalar<Scalar>: device-resident scalar returned by reductions. Defers host sync until value is read (implicit conversion). Device-side division via NPP for real types. - GpuContext: stream-borrowing constructor, setThreadLocal(), cublasLtHandle(), cusparseHandle(). - GEMM upgraded from cublasGemmEx to cublasLtMatmul with heuristic algorithm selection and plan caching. - GpuSparseContext: GpuContext& constructor for same-stream execution, deviceView() returning DeviceSparseView with operator* for device-resident SpMV (d_y = d_A * d_x). - geam expressions: d_C = d_A + alpha * d_B via cublasXgeam. - GpuSVD::matrixV() convenience wrapper. These additions make DeviceMatrix usable as a VectorType in Eigen algorithm templates. Conjugate gradient is the motivating example and is tested against CPU ConjugateGradient for correctness. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
472 lines
15 KiB
C++
// This file is part of Eigen, a lightweight C++ template library
|
|
// for linear algebra.
|
|
//
|
|
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
|
|
//
|
|
// This Source Code Form is subject to the terms of the Mozilla
|
|
// Public License v. 2.0. If a copy of the MPL was not distributed
|
|
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
|
|
|
// Tests for DeviceMatrix and HostTransfer: typed RAII GPU memory wrapper,
// plus the cuBLAS-backed BLAS-1 operations and DeviceScalar reductions
// exposed through GpuContext. No cuSOLVER dependency.
|
|
|
|
#define EIGEN_USE_GPU
|
|
#include "main.h"
|
|
#include <Eigen/Sparse>
|
|
#include <Eigen/GPU>
|
|
|
|
using namespace Eigen;
|
|
|
|
// ---- Default construction ---------------------------------------------------
|
|
|
|
void test_default_construct() {
|
|
DeviceMatrix<double> dm;
|
|
VERIFY(dm.empty());
|
|
VERIFY_IS_EQUAL(dm.rows(), 0);
|
|
VERIFY_IS_EQUAL(dm.cols(), 0);
|
|
VERIFY(dm.data() == nullptr);
|
|
VERIFY_IS_EQUAL(dm.sizeInBytes(), size_t(0));
|
|
}
|
|
|
|
// ---- Allocate uninitialized -------------------------------------------------
|
|
|
|
template <typename Scalar>
|
|
void test_allocate(Index rows, Index cols) {
|
|
DeviceMatrix<Scalar> dm(rows, cols);
|
|
VERIFY(!dm.empty());
|
|
VERIFY_IS_EQUAL(dm.rows(), rows);
|
|
VERIFY_IS_EQUAL(dm.cols(), cols);
|
|
VERIFY(dm.data() != nullptr);
|
|
VERIFY_IS_EQUAL(dm.sizeInBytes(), size_t(rows) * size_t(cols) * sizeof(Scalar));
|
|
}
|
|
|
|
// ---- fromHost / toHost roundtrip (synchronous) ------------------------------
|
|
|
|
template <typename Scalar>
|
|
void test_roundtrip(Index rows, Index cols) {
|
|
using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
|
|
MatrixType host = MatrixType::Random(rows, cols);
|
|
|
|
auto dm = DeviceMatrix<Scalar>::fromHost(host);
|
|
VERIFY_IS_EQUAL(dm.rows(), rows);
|
|
VERIFY_IS_EQUAL(dm.cols(), cols);
|
|
VERIFY(!dm.empty());
|
|
|
|
MatrixType result = dm.toHost();
|
|
VERIFY_IS_EQUAL(result.rows(), rows);
|
|
VERIFY_IS_EQUAL(result.cols(), cols);
|
|
VERIFY_IS_APPROX(result, host);
|
|
}
|
|
|
|
// ---- fromHostAsync / toHostAsync roundtrip -----------------------------------
|
|
|
|
template <typename Scalar>
|
|
void test_roundtrip_async(Index rows, Index cols) {
|
|
using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
|
|
MatrixType host = MatrixType::Random(rows, cols);
|
|
|
|
cudaStream_t stream;
|
|
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream));
|
|
|
|
// Async upload from raw pointer.
|
|
auto dm = DeviceMatrix<Scalar>::fromHostAsync(host.data(), rows, cols, stream);
|
|
VERIFY_IS_EQUAL(dm.rows(), rows);
|
|
VERIFY_IS_EQUAL(dm.cols(), cols);
|
|
|
|
// Async download via HostTransfer future.
|
|
auto transfer = dm.toHostAsync(stream);
|
|
|
|
// get() blocks and returns the matrix.
|
|
MatrixType result = transfer.get();
|
|
VERIFY_IS_APPROX(result, host);
|
|
|
|
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamDestroy(stream));
|
|
}
|
|
|
|
// ---- HostTransfer::ready() and idempotent get() -----------------------------
|
|
|
|
void test_host_transfer_ready() {
|
|
using MatrixType = Matrix<double, Dynamic, Dynamic>;
|
|
MatrixType host = MatrixType::Random(100, 100);
|
|
|
|
auto dm = DeviceMatrix<double>::fromHost(host);
|
|
auto transfer = dm.toHostAsync();
|
|
|
|
// After get(), ready() must return true.
|
|
MatrixType result = transfer.get();
|
|
VERIFY(transfer.ready());
|
|
VERIFY_IS_APPROX(result, host);
|
|
|
|
// get() is idempotent.
|
|
MatrixType& result2 = transfer.get();
|
|
VERIFY_IS_APPROX(result2, host);
|
|
}
|
|
|
|
// ---- HostTransfer move ------------------------------------------------------
|
|
|
|
void test_host_transfer_move() {
|
|
using MatrixType = Matrix<double, Dynamic, Dynamic>;
|
|
MatrixType host = MatrixType::Random(50, 50);
|
|
|
|
auto dm = DeviceMatrix<double>::fromHost(host);
|
|
auto transfer = dm.toHostAsync();
|
|
|
|
HostTransfer<double> moved(std::move(transfer));
|
|
MatrixType result = moved.get();
|
|
VERIFY_IS_APPROX(result, host);
|
|
}
|
|
|
|
// ---- clone() produces independent copy --------------------------------------
|
|
|
|
template <typename Scalar>
|
|
void test_clone(Index rows, Index cols) {
|
|
using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
|
|
MatrixType host = MatrixType::Random(rows, cols);
|
|
|
|
auto dm = DeviceMatrix<Scalar>::fromHost(host);
|
|
auto cloned = dm.clone();
|
|
|
|
// Overwrite original with different data.
|
|
MatrixType other = MatrixType::Random(rows, cols);
|
|
dm = DeviceMatrix<Scalar>::fromHost(other);
|
|
|
|
// Clone still holds the original data.
|
|
MatrixType clone_result = cloned.toHost();
|
|
VERIFY_IS_APPROX(clone_result, host);
|
|
|
|
// Original holds the new data.
|
|
MatrixType dm_result = dm.toHost();
|
|
VERIFY_IS_APPROX(dm_result, other);
|
|
}
|
|
|
|
// ---- Move construct ---------------------------------------------------------
|
|
|
|
template <typename Scalar>
|
|
void test_move_construct(Index rows, Index cols) {
|
|
using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
|
|
MatrixType host = MatrixType::Random(rows, cols);
|
|
|
|
auto dm = DeviceMatrix<Scalar>::fromHost(host);
|
|
DeviceMatrix<Scalar> moved(std::move(dm));
|
|
|
|
VERIFY(dm.empty());
|
|
VERIFY(dm.data() == nullptr);
|
|
|
|
VERIFY_IS_EQUAL(moved.rows(), rows);
|
|
VERIFY_IS_EQUAL(moved.cols(), cols);
|
|
MatrixType result = moved.toHost();
|
|
VERIFY_IS_APPROX(result, host);
|
|
}
|
|
|
|
// ---- Move assign ------------------------------------------------------------
|
|
|
|
template <typename Scalar>
|
|
void test_move_assign(Index rows, Index cols) {
|
|
using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
|
|
MatrixType host = MatrixType::Random(rows, cols);
|
|
|
|
auto dm = DeviceMatrix<Scalar>::fromHost(host);
|
|
DeviceMatrix<Scalar> dest;
|
|
dest = std::move(dm);
|
|
|
|
VERIFY(dm.empty());
|
|
VERIFY_IS_EQUAL(dest.rows(), rows);
|
|
MatrixType result = dest.toHost();
|
|
VERIFY_IS_APPROX(result, host);
|
|
}
|
|
|
|
// ---- resize() ---------------------------------------------------------------
|
|
|
|
void test_resize() {
|
|
DeviceMatrix<double> dm(10, 20);
|
|
VERIFY_IS_EQUAL(dm.rows(), 10);
|
|
VERIFY_IS_EQUAL(dm.cols(), 20);
|
|
|
|
dm.resize(50, 30);
|
|
VERIFY_IS_EQUAL(dm.rows(), 50);
|
|
VERIFY_IS_EQUAL(dm.cols(), 30);
|
|
VERIFY(dm.data() != nullptr);
|
|
|
|
// Resize to same dimensions is a no-op.
|
|
double* ptr_before = dm.data();
|
|
dm.resize(50, 30);
|
|
VERIFY(dm.data() == ptr_before);
|
|
}
|
|
|
|
// ---- Empty / 0x0 matrix -----------------------------------------------------
|
|
|
|
void test_empty() {
|
|
using MatrixType = Matrix<double, Dynamic, Dynamic>;
|
|
MatrixType empty_mat(0, 0);
|
|
|
|
auto dm = DeviceMatrix<double>::fromHost(empty_mat);
|
|
VERIFY(dm.empty());
|
|
VERIFY_IS_EQUAL(dm.rows(), 0);
|
|
VERIFY_IS_EQUAL(dm.cols(), 0);
|
|
|
|
MatrixType result = dm.toHost();
|
|
VERIFY_IS_EQUAL(result.rows(), 0);
|
|
VERIFY_IS_EQUAL(result.cols(), 0);
|
|
}
|
|
|
|
// ---- Per-scalar driver ------------------------------------------------------
|
|
|
|
template <typename Scalar>
|
|
void test_scalar() {
|
|
// Square.
|
|
CALL_SUBTEST(test_roundtrip<Scalar>(1, 1));
|
|
CALL_SUBTEST(test_roundtrip<Scalar>(64, 64));
|
|
CALL_SUBTEST(test_roundtrip<Scalar>(256, 256));
|
|
|
|
// Rectangular.
|
|
CALL_SUBTEST(test_roundtrip<Scalar>(100, 7));
|
|
CALL_SUBTEST(test_roundtrip<Scalar>(7, 100));
|
|
|
|
// Async roundtrip.
|
|
CALL_SUBTEST(test_roundtrip_async<Scalar>(64, 64));
|
|
CALL_SUBTEST(test_roundtrip_async<Scalar>(100, 7));
|
|
|
|
CALL_SUBTEST(test_clone<Scalar>(64, 64));
|
|
CALL_SUBTEST(test_move_construct<Scalar>(64, 64));
|
|
CALL_SUBTEST(test_move_assign<Scalar>(64, 64));
|
|
}
|
|
|
|
// ---- BLAS-1: dot product ----------------------------------------------------
|
|
|
|
template <typename Scalar>
|
|
void test_blas1(Index n) {
|
|
using Vec = Matrix<Scalar, Dynamic, 1>;
|
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
|
|
|
// All BLAS-1 ops share one GpuContext — same stream, zero event overhead.
|
|
GpuContext ctx;
|
|
|
|
RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
|
|
|
|
// dot
|
|
{
|
|
Vec a = Vec::Random(n);
|
|
Vec b = Vec::Random(n);
|
|
auto d_a = DeviceMatrix<Scalar>::fromHost(a, ctx.stream());
|
|
auto d_b = DeviceMatrix<Scalar>::fromHost(b, ctx.stream());
|
|
Scalar gpu_dot = d_a.dot(ctx, d_b);
|
|
Scalar cpu_dot = a.dot(b);
|
|
VERIFY(numext::abs(gpu_dot - cpu_dot) < tol * numext::abs(cpu_dot) + tol);
|
|
}
|
|
|
|
// norm / squaredNorm
|
|
{
|
|
Vec a = Vec::Random(n);
|
|
auto d_a = DeviceMatrix<Scalar>::fromHost(a, ctx.stream());
|
|
RealScalar gpu_norm = d_a.norm(ctx);
|
|
RealScalar cpu_norm = a.norm();
|
|
VERIFY(numext::abs(gpu_norm - cpu_norm) < tol * cpu_norm + tol);
|
|
RealScalar gpu_sqnorm = d_a.squaredNorm(ctx);
|
|
RealScalar cpu_sqnorm = a.squaredNorm();
|
|
VERIFY(numext::abs(gpu_sqnorm - cpu_sqnorm) < tol * cpu_sqnorm + tol);
|
|
}
|
|
|
|
// addScaled (axpy)
|
|
{
|
|
Vec x = Vec::Random(n);
|
|
Vec y = Vec::Random(n);
|
|
Scalar alpha(2.5);
|
|
Vec y_ref = y + alpha * x;
|
|
auto d_y = DeviceMatrix<Scalar>::fromHost(y, ctx.stream());
|
|
auto d_x = DeviceMatrix<Scalar>::fromHost(x, ctx.stream());
|
|
d_y.addScaled(ctx, alpha, d_x);
|
|
Vec y_gpu = d_y.toHost(ctx.stream());
|
|
VERIFY((y_gpu - y_ref).norm() < tol * y_ref.norm() + tol);
|
|
}
|
|
|
|
// scale (scal)
|
|
{
|
|
Vec x = Vec::Random(n);
|
|
Scalar alpha(3.0);
|
|
Vec x_ref = alpha * x;
|
|
auto d_x = DeviceMatrix<Scalar>::fromHost(x, ctx.stream());
|
|
d_x.scale(ctx, alpha);
|
|
Vec x_gpu = d_x.toHost(ctx.stream());
|
|
VERIFY((x_gpu - x_ref).norm() < tol * x_ref.norm() + tol);
|
|
}
|
|
|
|
// copyFrom
|
|
{
|
|
Vec x = Vec::Random(n);
|
|
auto d_x = DeviceMatrix<Scalar>::fromHost(x, ctx.stream());
|
|
DeviceMatrix<Scalar> d_y;
|
|
d_y.copyFrom(ctx, d_x);
|
|
Vec y = d_y.toHost(ctx.stream());
|
|
VERIFY_IS_APPROX(y, x);
|
|
}
|
|
|
|
// setZero
|
|
{
|
|
Vec x = Vec::Random(n);
|
|
auto d_x = DeviceMatrix<Scalar>::fromHost(x, ctx.stream());
|
|
d_x.setZero(ctx);
|
|
Vec result = d_x.toHost(ctx.stream());
|
|
VERIFY_IS_EQUAL(result, Vec::Zero(n));
|
|
}
|
|
}
|
|
|
|
// ---- BLAS-1 operator overloads (CG-style) -----------------------------------
|
|
|
|
template <typename Scalar>
|
|
void test_cg_operators(Index n) {
|
|
using Vec = Matrix<Scalar, Dynamic, 1>;
|
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
|
|
|
RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
|
|
|
|
Vec x = Vec::Random(n);
|
|
Vec p = Vec::Random(n);
|
|
Vec tmp = Vec::Random(n);
|
|
Vec z = Vec::Random(n);
|
|
Scalar alpha(2.5);
|
|
Scalar beta(0.7);
|
|
|
|
// Test: x += alpha * p
|
|
{
|
|
Vec x_ref = x + alpha * p;
|
|
auto d_x = DeviceMatrix<Scalar>::fromHost(x);
|
|
auto d_p = DeviceMatrix<Scalar>::fromHost(p);
|
|
d_x += alpha * d_p;
|
|
Vec x_gpu = d_x.toHost();
|
|
VERIFY((x_gpu - x_ref).norm() < tol * x_ref.norm() + tol);
|
|
}
|
|
|
|
// Test: r -= alpha * tmp
|
|
{
|
|
Vec r = Vec::Random(n);
|
|
Vec r_ref = r - alpha * tmp;
|
|
auto d_r = DeviceMatrix<Scalar>::fromHost(r);
|
|
auto d_tmp = DeviceMatrix<Scalar>::fromHost(tmp);
|
|
d_r -= alpha * d_tmp;
|
|
Vec r_gpu = d_r.toHost();
|
|
VERIFY((r_gpu - r_ref).norm() < tol * r_ref.norm() + tol);
|
|
}
|
|
|
|
// Test: p = z + beta * p (cuBLAS geam)
|
|
{
|
|
Vec p_copy = p;
|
|
Vec p_ref = z + beta * p_copy;
|
|
auto d_p = DeviceMatrix<Scalar>::fromHost(p_copy);
|
|
auto d_z = DeviceMatrix<Scalar>::fromHost(z);
|
|
d_p = d_z + beta * d_p;
|
|
Vec p_gpu = d_p.toHost();
|
|
VERIFY((p_gpu - p_ref).norm() < tol * p_ref.norm() + tol);
|
|
}
|
|
|
|
// Test: operator+= and operator-= with DeviceMatrix (no scalar)
|
|
{
|
|
Vec a = Vec::Random(n);
|
|
Vec b = Vec::Random(n);
|
|
Vec a_ref = a + b;
|
|
auto d_a = DeviceMatrix<Scalar>::fromHost(a);
|
|
auto d_b = DeviceMatrix<Scalar>::fromHost(b);
|
|
d_a += d_b;
|
|
VERIFY((d_a.toHost() - a_ref).norm() < tol * a_ref.norm() + tol);
|
|
}
|
|
}
|
|
|
|
// ---- DeviceScalar: deferred sync -------------------------------------------
|
|
|
|
template <typename Scalar>
|
|
void test_device_scalar() {
|
|
using Vec = Matrix<Scalar, Dynamic, 1>;
|
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
|
|
|
const Index n = 256;
|
|
Vec a = Vec::Random(n);
|
|
Vec b = Vec::Random(n);
|
|
|
|
GpuContext ctx;
|
|
auto d_a = DeviceMatrix<Scalar>::fromHost(a, ctx.stream());
|
|
auto d_b = DeviceMatrix<Scalar>::fromHost(b, ctx.stream());
|
|
|
|
// dot() returns DeviceScalar — implicit conversion to Scalar syncs.
|
|
Scalar gpu_dot = d_a.dot(ctx, d_b);
|
|
Scalar cpu_dot = a.dot(b);
|
|
RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
|
|
VERIFY(numext::abs(gpu_dot - cpu_dot) < tol * numext::abs(cpu_dot) + tol);
|
|
|
|
// squaredNorm() returns host RealScalar directly (syncs internally).
|
|
RealScalar gpu_sqnorm = d_a.squaredNorm(ctx);
|
|
RealScalar cpu_sqnorm = a.squaredNorm();
|
|
VERIFY(numext::abs(gpu_sqnorm - cpu_sqnorm) < tol * cpu_sqnorm + tol);
|
|
|
|
// norm() returns DeviceScalar<RealScalar> — implicit conversion syncs.
|
|
RealScalar gpu_norm = d_a.norm(ctx);
|
|
RealScalar cpu_norm = a.norm();
|
|
VERIFY(numext::abs(gpu_norm - cpu_norm) < tol * cpu_norm + tol);
|
|
|
|
// Convenience overloads (thread-local context).
|
|
GpuContext::setThreadLocal(&ctx);
|
|
Scalar gpu_dot2 = d_a.dot(d_b);
|
|
VERIFY(numext::abs(gpu_dot2 - cpu_dot) < tol * numext::abs(cpu_dot) + tol);
|
|
GpuContext::setThreadLocal(nullptr);
|
|
|
|
// Empty vectors: dot and norm must return zero.
|
|
{
|
|
DeviceMatrix<Scalar> d_empty(0, 1);
|
|
DeviceMatrix<Scalar> d_empty2(0, 1);
|
|
Scalar empty_dot = d_empty.dot(ctx, d_empty2);
|
|
VERIFY_IS_EQUAL(empty_dot, Scalar(0));
|
|
RealScalar empty_sqnorm = d_empty.squaredNorm(ctx);
|
|
VERIFY_IS_EQUAL(empty_sqnorm, RealScalar(0));
|
|
RealScalar empty_norm = d_empty.norm(ctx);
|
|
VERIFY_IS_EQUAL(empty_norm, RealScalar(0));
|
|
}
|
|
}
|
|
|
|
// ---- cwiseProduct -----------------------------------------------------------
|
|
|
|
template <typename Scalar>
|
|
void test_cwiseProduct() {
|
|
using Vec = Matrix<Scalar, Dynamic, 1>;
|
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
|
|
|
const Index n = 256;
|
|
Vec a = Vec::Random(n);
|
|
Vec b = Vec::Random(n);
|
|
Vec ref = a.array() * b.array();
|
|
|
|
GpuContext ctx;
|
|
auto d_a = DeviceMatrix<Scalar>::fromHost(a, ctx.stream());
|
|
auto d_b = DeviceMatrix<Scalar>::fromHost(b, ctx.stream());
|
|
auto d_c = d_a.cwiseProduct(ctx, d_b);
|
|
Vec result = d_c.toHost(ctx.stream());
|
|
|
|
RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
|
|
VERIFY((result - ref).norm() < tol * ref.norm() + tol);
|
|
}
|
|
|
|
EIGEN_DECLARE_TEST(gpu_device_matrix) {
|
|
CALL_SUBTEST(test_default_construct());
|
|
CALL_SUBTEST(test_empty());
|
|
CALL_SUBTEST(test_resize());
|
|
CALL_SUBTEST(test_host_transfer_ready());
|
|
CALL_SUBTEST(test_host_transfer_move());
|
|
CALL_SUBTEST((test_allocate<float>(100, 50)));
|
|
CALL_SUBTEST((test_allocate<double>(100, 50)));
|
|
CALL_SUBTEST(test_scalar<float>());
|
|
CALL_SUBTEST(test_scalar<double>());
|
|
CALL_SUBTEST(test_scalar<std::complex<float>>());
|
|
CALL_SUBTEST(test_scalar<std::complex<double>>());
|
|
CALL_SUBTEST(test_blas1<float>(256));
|
|
CALL_SUBTEST(test_blas1<double>(256));
|
|
CALL_SUBTEST(test_blas1<std::complex<float>>(256));
|
|
CALL_SUBTEST(test_blas1<std::complex<double>>(256));
|
|
CALL_SUBTEST(test_cg_operators<float>(256));
|
|
CALL_SUBTEST(test_cg_operators<double>(256));
|
|
CALL_SUBTEST(test_cg_operators<std::complex<float>>(256));
|
|
CALL_SUBTEST(test_cg_operators<std::complex<double>>(256));
|
|
CALL_SUBTEST(test_device_scalar<float>());
|
|
CALL_SUBTEST(test_device_scalar<double>());
|
|
CALL_SUBTEST(test_device_scalar<std::complex<float>>());
|
|
CALL_SUBTEST(test_device_scalar<std::complex<double>>());
|
|
CALL_SUBTEST(test_cwiseProduct<float>());
|
|
CALL_SUBTEST(test_cwiseProduct<double>());
|
|
}
|