// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

// Tests for DeviceMatrix and HostTransfer: typed RAII GPU memory wrapper.
// No cuSOLVER dependency — only CUDA runtime.

#define EIGEN_USE_GPU

#include "main.h"

#include <cuda_runtime.h>  // cudaStream_t, cudaStreamCreate/Destroy
#include <complex>         // std::complex scalar types

using namespace Eigen;

// ---- Default construction ---------------------------------------------------
void test_default_construct() {
  DeviceMatrix<double> dm;
  VERIFY(dm.empty());
  VERIFY_IS_EQUAL(dm.rows(), 0);
  VERIFY_IS_EQUAL(dm.cols(), 0);
  VERIFY(dm.data() == nullptr);
  VERIFY_IS_EQUAL(dm.sizeInBytes(), size_t(0));
}

// ---- Allocate uninitialized -------------------------------------------------
template <typename Scalar>
void test_allocate(Index rows, Index cols) {
  DeviceMatrix<Scalar> dm(rows, cols);
  VERIFY(!dm.empty());
  VERIFY_IS_EQUAL(dm.rows(), rows);
  VERIFY_IS_EQUAL(dm.cols(), cols);
  VERIFY(dm.data() != nullptr);
  VERIFY_IS_EQUAL(dm.sizeInBytes(), size_t(rows) * size_t(cols) * sizeof(Scalar));
}

// ---- fromHost / toHost roundtrip (synchronous) ------------------------------
template <typename Scalar>
void test_roundtrip(Index rows, Index cols) {
  using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
  MatrixType host = MatrixType::Random(rows, cols);

  auto dm = DeviceMatrix<Scalar>::fromHost(host);
  VERIFY_IS_EQUAL(dm.rows(), rows);
  VERIFY_IS_EQUAL(dm.cols(), cols);
  VERIFY(!dm.empty());

  MatrixType result = dm.toHost();
  VERIFY_IS_EQUAL(result.rows(), rows);
  VERIFY_IS_EQUAL(result.cols(), cols);
  VERIFY_IS_APPROX(result, host);
}

// ---- fromHostAsync / toHostAsync roundtrip ----------------------------------
template <typename Scalar>
void test_roundtrip_async(Index rows, Index cols) {
  using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
  MatrixType host = MatrixType::Random(rows, cols);

  cudaStream_t stream;
  EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream));

  // Async upload from raw pointer.
  auto dm = DeviceMatrix<Scalar>::fromHostAsync(host.data(), rows, cols, stream);
  VERIFY_IS_EQUAL(dm.rows(), rows);
  VERIFY_IS_EQUAL(dm.cols(), cols);

  // Async download via HostTransfer future.
  auto transfer = dm.toHostAsync(stream);
  // get() blocks and returns the matrix.
  MatrixType result = transfer.get();
  VERIFY_IS_APPROX(result, host);

  EIGEN_CUDA_RUNTIME_CHECK(cudaStreamDestroy(stream));
}

// ---- HostTransfer::ready() and idempotent get() -----------------------------
void test_host_transfer_ready() {
  using MatrixType = Matrix<double, Dynamic, Dynamic>;
  MatrixType host = MatrixType::Random(100, 100);
  auto dm = DeviceMatrix<double>::fromHost(host);
  auto transfer = dm.toHostAsync();

  // After get(), ready() must return true.
  MatrixType result = transfer.get();
  VERIFY(transfer.ready());
  VERIFY_IS_APPROX(result, host);

  // get() is idempotent.
  MatrixType& result2 = transfer.get();
  VERIFY_IS_APPROX(result2, host);
}
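// Usage sketch (never instantiated, not a test): toHostAsync() exists so the
// host can keep working while a download is in flight. The sketch uses only
// the API exercised above and assumes ready() may also be polled before
// get(); `do_host_work` stands in for arbitrary caller code and is
// hypothetical.
template <typename Scalar, typename Work>
Matrix<Scalar, Dynamic, Dynamic> download_while_working(DeviceMatrix<Scalar>& dm,
                                                        Work&& do_host_work) {
  auto transfer = dm.toHostAsync();  // copy is enqueued; the host is not blocked
  while (!transfer.ready()) {
    do_host_work();  // overlap host-side work with the device-to-host copy
  }
  return transfer.get();  // transfer already complete; returns without blocking
}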
// ---- HostTransfer move ------------------------------------------------------
void test_host_transfer_move() {
  using MatrixType = Matrix<double, Dynamic, Dynamic>;
  MatrixType host = MatrixType::Random(50, 50);
  auto dm = DeviceMatrix<double>::fromHost(host);
  auto transfer = dm.toHostAsync();

  HostTransfer<MatrixType> moved(std::move(transfer));
  MatrixType result = moved.get();
  VERIFY_IS_APPROX(result, host);
}

// ---- clone() produces independent copy --------------------------------------
template <typename Scalar>
void test_clone(Index rows, Index cols) {
  using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
  MatrixType host = MatrixType::Random(rows, cols);
  auto dm = DeviceMatrix<Scalar>::fromHost(host);
  auto cloned = dm.clone();

  // Overwrite original with different data.
  MatrixType other = MatrixType::Random(rows, cols);
  dm = DeviceMatrix<Scalar>::fromHost(other);

  // Clone still holds the original data.
  MatrixType clone_result = cloned.toHost();
  VERIFY_IS_APPROX(clone_result, host);

  // Original holds the new data.
  MatrixType dm_result = dm.toHost();
  VERIFY_IS_APPROX(dm_result, other);
}

// ---- Move construct ---------------------------------------------------------
template <typename Scalar>
void test_move_construct(Index rows, Index cols) {
  using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
  MatrixType host = MatrixType::Random(rows, cols);
  auto dm = DeviceMatrix<Scalar>::fromHost(host);

  DeviceMatrix<Scalar> moved(std::move(dm));
  VERIFY(dm.empty());
  VERIFY(dm.data() == nullptr);
  VERIFY_IS_EQUAL(moved.rows(), rows);
  VERIFY_IS_EQUAL(moved.cols(), cols);

  MatrixType result = moved.toHost();
  VERIFY_IS_APPROX(result, host);
}

// ---- Move assign ------------------------------------------------------------
template <typename Scalar>
void test_move_assign(Index rows, Index cols) {
  using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
  MatrixType host = MatrixType::Random(rows, cols);
  auto dm = DeviceMatrix<Scalar>::fromHost(host);

  DeviceMatrix<Scalar> dest;
  dest = std::move(dm);
  VERIFY(dm.empty());
  VERIFY_IS_EQUAL(dest.rows(), rows);

  MatrixType result = dest.toHost();
  VERIFY_IS_APPROX(result, host);
}

// ---- resize() ---------------------------------------------------------------
void test_resize() {
  DeviceMatrix<double> dm(10, 20);
  VERIFY_IS_EQUAL(dm.rows(), 10);
  VERIFY_IS_EQUAL(dm.cols(), 20);

  dm.resize(50, 30);
  VERIFY_IS_EQUAL(dm.rows(), 50);
  VERIFY_IS_EQUAL(dm.cols(), 30);
  VERIFY(dm.data() != nullptr);

  // Resize to the same dimensions is a no-op.
  double* ptr_before = dm.data();
  dm.resize(50, 30);
  VERIFY(dm.data() == ptr_before);
}

// ---- Empty / 0x0 matrix -----------------------------------------------------
void test_empty() {
  using MatrixType = Matrix<double, Dynamic, Dynamic>;
  MatrixType empty_mat(0, 0);
  auto dm = DeviceMatrix<double>::fromHost(empty_mat);
  VERIFY(dm.empty());
  VERIFY_IS_EQUAL(dm.rows(), 0);
  VERIFY_IS_EQUAL(dm.cols(), 0);

  MatrixType result = dm.toHost();
  VERIFY_IS_EQUAL(result.rows(), 0);
  VERIFY_IS_EQUAL(result.cols(), 0);
}

// ---- Per-scalar driver ------------------------------------------------------
template <typename Scalar>
void test_scalar() {
  // Square.
  CALL_SUBTEST(test_roundtrip<Scalar>(1, 1));
  CALL_SUBTEST(test_roundtrip<Scalar>(64, 64));
  CALL_SUBTEST(test_roundtrip<Scalar>(256, 256));
  // Rectangular.
  CALL_SUBTEST(test_roundtrip<Scalar>(100, 7));
  CALL_SUBTEST(test_roundtrip<Scalar>(7, 100));
  // Async roundtrip.
  CALL_SUBTEST(test_roundtrip_async<Scalar>(64, 64));
  CALL_SUBTEST(test_roundtrip_async<Scalar>(100, 7));

  CALL_SUBTEST(test_clone<Scalar>(64, 64));
  CALL_SUBTEST(test_move_construct<Scalar>(64, 64));
  CALL_SUBTEST(test_move_assign<Scalar>(64, 64));
}
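// Tolerance note: the numerical checks below use a mixed bound,
//   |gpu - cpu| < tol * |cpu| + tol,
// i.e. a relative test for O(1) references that degrades gracefully to an
// absolute test when the reference is near zero (e.g. the dot product of two
// nearly orthogonal random vectors), where a purely relative bound would be
// meaningless.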
// ---- BLAS-1 ops: dot, norm, addScaled, scale, copyFrom, setZero -------------
template <typename Scalar>
void test_blas1(Index n) {
  using Vec = Matrix<Scalar, Dynamic, 1>;
  using RealScalar = typename NumTraits<Scalar>::Real;

  // All BLAS-1 ops share one GpuContext — same stream, zero event overhead.
  GpuContext ctx;

  RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<RealScalar>::epsilon();

  // dot
  {
    Vec a = Vec::Random(n);
    Vec b = Vec::Random(n);
    auto d_a = DeviceMatrix<Scalar>::fromHost(a, ctx.stream());
    auto d_b = DeviceMatrix<Scalar>::fromHost(b, ctx.stream());
    Scalar gpu_dot = d_a.dot(ctx, d_b);
    Scalar cpu_dot = a.dot(b);
    VERIFY(numext::abs(gpu_dot - cpu_dot) < tol * numext::abs(cpu_dot) + tol);
  }

  // norm / squaredNorm
  {
    Vec a = Vec::Random(n);
    auto d_a = DeviceMatrix<Scalar>::fromHost(a, ctx.stream());
    RealScalar gpu_norm = d_a.norm(ctx);
    RealScalar cpu_norm = a.norm();
    VERIFY(numext::abs(gpu_norm - cpu_norm) < tol * cpu_norm + tol);

    RealScalar gpu_sqnorm = d_a.squaredNorm(ctx);
    RealScalar cpu_sqnorm = a.squaredNorm();
    VERIFY(numext::abs(gpu_sqnorm - cpu_sqnorm) < tol * cpu_sqnorm + tol);
  }

  // addScaled (axpy)
  {
    Vec x = Vec::Random(n);
    Vec y = Vec::Random(n);
    Scalar alpha(2.5);
    Vec y_ref = y + alpha * x;

    auto d_y = DeviceMatrix<Scalar>::fromHost(y, ctx.stream());
    auto d_x = DeviceMatrix<Scalar>::fromHost(x, ctx.stream());
    d_y.addScaled(ctx, alpha, d_x);
    Vec y_gpu = d_y.toHost(ctx.stream());
    VERIFY((y_gpu - y_ref).norm() < tol * y_ref.norm() + tol);
  }

  // scale (scal)
  {
    Vec x = Vec::Random(n);
    Scalar alpha(3.0);
    Vec x_ref = alpha * x;

    auto d_x = DeviceMatrix<Scalar>::fromHost(x, ctx.stream());
    d_x.scale(ctx, alpha);
    Vec x_gpu = d_x.toHost(ctx.stream());
    VERIFY((x_gpu - x_ref).norm() < tol * x_ref.norm() + tol);
  }

  // copyFrom
  {
    Vec x = Vec::Random(n);
    auto d_x = DeviceMatrix<Scalar>::fromHost(x, ctx.stream());
    DeviceMatrix<Scalar> d_y;
    d_y.copyFrom(ctx, d_x);
    Vec y = d_y.toHost(ctx.stream());
    VERIFY_IS_APPROX(y, x);
  }

  // setZero
  {
    Vec x = Vec::Random(n);
    auto d_x = DeviceMatrix<Scalar>::fromHost(x, ctx.stream());
    d_x.setZero(ctx);
    Vec result = d_x.toHost(ctx.stream());
    VERIFY_IS_EQUAL(result, Vec::Zero(n));
  }
}

// ---- BLAS-1 operator overloads (CG-style) -----------------------------------
template <typename Scalar>
void test_cg_operators(Index n) {
  using Vec = Matrix<Scalar, Dynamic, 1>;
  using RealScalar = typename NumTraits<Scalar>::Real;
  RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<RealScalar>::epsilon();

  Vec x = Vec::Random(n);
  Vec p = Vec::Random(n);
  Vec tmp = Vec::Random(n);
  Vec z = Vec::Random(n);
  Scalar alpha(2.5);
  Scalar beta(0.7);

  // Test: x += alpha * p
  {
    Vec x_ref = x + alpha * p;
    auto d_x = DeviceMatrix<Scalar>::fromHost(x);
    auto d_p = DeviceMatrix<Scalar>::fromHost(p);
    d_x += alpha * d_p;
    Vec x_gpu = d_x.toHost();
    VERIFY((x_gpu - x_ref).norm() < tol * x_ref.norm() + tol);
  }

  // Test: r -= alpha * tmp
  {
    Vec r = Vec::Random(n);
    Vec r_ref = r - alpha * tmp;
    auto d_r = DeviceMatrix<Scalar>::fromHost(r);
    auto d_tmp = DeviceMatrix<Scalar>::fromHost(tmp);
    d_r -= alpha * d_tmp;
    Vec r_gpu = d_r.toHost();
    VERIFY((r_gpu - r_ref).norm() < tol * r_ref.norm() + tol);
  }

  // Test: p = z + beta * p (cuBLAS geam)
  {
    Vec p_copy = p;
    Vec p_ref = z + beta * p_copy;
    auto d_p = DeviceMatrix<Scalar>::fromHost(p_copy);
    auto d_z = DeviceMatrix<Scalar>::fromHost(z);
    d_p = d_z + beta * d_p;
    Vec p_gpu = d_p.toHost();
    VERIFY((p_gpu - p_ref).norm() < tol * p_ref.norm() + tol);
  }

  // Test: operator+= with a DeviceMatrix (no scalar factor)
  {
    Vec a = Vec::Random(n);
    Vec b = Vec::Random(n);
    Vec a_ref = a + b;
    auto d_a = DeviceMatrix<Scalar>::fromHost(a);
    auto d_b = DeviceMatrix<Scalar>::fromHost(b);
    d_a += d_b;
    VERIFY((d_a.toHost() - a_ref).norm() < tol * a_ref.norm() + tol);
  }
}
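// The operator overloads exercised above exist so an iterative solver body can
// read like textbook pseudocode. Below is a minimal sketch (never
// instantiated, not a test) of unpreconditioned conjugate gradient built only
// from operations this file exercises; `matvec` is a hypothetical
// caller-supplied functor computing Ap = A * p on the device and is NOT part
// of the DeviceMatrix API. `squared_threshold` bounds ||r||^2.
template <typename Scalar, typename MatVec>
void cg_sketch(GpuContext& ctx, const MatVec& matvec,
               const DeviceMatrix<Scalar>& b, DeviceMatrix<Scalar>& x,
               Index max_iters, typename NumTraits<Scalar>::Real squared_threshold) {
  GpuContext::setThreadLocal(&ctx);  // operators use the thread-local context
  DeviceMatrix<Scalar> r;
  r.copyFrom(ctx, b);                // r = b - A*x0, with x0 = 0
  auto p = r.clone();
  DeviceMatrix<Scalar> Ap(b.rows(), b.cols());
  Scalar rsq = r.dot(r);             // converting DeviceScalar to Scalar syncs
  for (Index i = 0; i < max_iters && numext::abs(rsq) > squared_threshold; ++i) {
    matvec(ctx, p, Ap);              // Ap = A * p (hypothetical functor)
    Scalar alpha = rsq / Scalar(p.dot(Ap));
    x += alpha * p;                  // axpy via operator+=
    r -= alpha * Ap;
    Scalar rsq_new = r.dot(r);
    Scalar beta = rsq_new / rsq;
    p = r + beta * p;                // geam-style update, as tested above
    rsq = rsq_new;
  }
  GpuContext::setThreadLocal(nullptr);
}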
// ---- DeviceScalar: deferred sync --------------------------------------------
template <typename Scalar>
void test_device_scalar() {
  using Vec = Matrix<Scalar, Dynamic, 1>;
  using RealScalar = typename NumTraits<Scalar>::Real;
  const Index n = 256;
  Vec a = Vec::Random(n);
  Vec b = Vec::Random(n);

  GpuContext ctx;
  auto d_a = DeviceMatrix<Scalar>::fromHost(a, ctx.stream());
  auto d_b = DeviceMatrix<Scalar>::fromHost(b, ctx.stream());

  // dot() returns DeviceScalar — implicit conversion to Scalar syncs.
  Scalar gpu_dot = d_a.dot(ctx, d_b);
  Scalar cpu_dot = a.dot(b);
  RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<RealScalar>::epsilon();
  VERIFY(numext::abs(gpu_dot - cpu_dot) < tol * numext::abs(cpu_dot) + tol);

  // squaredNorm() returns host RealScalar directly (syncs internally).
  RealScalar gpu_sqnorm = d_a.squaredNorm(ctx);
  RealScalar cpu_sqnorm = a.squaredNorm();
  VERIFY(numext::abs(gpu_sqnorm - cpu_sqnorm) < tol * cpu_sqnorm + tol);

  // norm() returns DeviceScalar — implicit conversion syncs.
  RealScalar gpu_norm = d_a.norm(ctx);
  RealScalar cpu_norm = a.norm();
  VERIFY(numext::abs(gpu_norm - cpu_norm) < tol * cpu_norm + tol);

  // Convenience overloads (thread-local context).
  GpuContext::setThreadLocal(&ctx);
  Scalar gpu_dot2 = d_a.dot(d_b);
  VERIFY(numext::abs(gpu_dot2 - cpu_dot) < tol * numext::abs(cpu_dot) + tol);
  GpuContext::setThreadLocal(nullptr);

  // Empty vectors: dot and norm must return zero.
  {
    DeviceMatrix<Scalar> d_empty(0, 1);
    DeviceMatrix<Scalar> d_empty2(0, 1);
    Scalar empty_dot = d_empty.dot(ctx, d_empty2);
    VERIFY_IS_EQUAL(empty_dot, Scalar(0));
    RealScalar empty_sqnorm = d_empty.squaredNorm(ctx);
    VERIFY_IS_EQUAL(empty_sqnorm, RealScalar(0));
    RealScalar empty_norm = d_empty.norm(ctx);
    VERIFY_IS_EQUAL(empty_norm, RealScalar(0));
  }
}

// ---- cwiseProduct -----------------------------------------------------------
template <typename Scalar>
void test_cwiseProduct() {
  using Vec = Matrix<Scalar, Dynamic, 1>;
  using RealScalar = typename NumTraits<Scalar>::Real;
  const Index n = 256;
  Vec a = Vec::Random(n);
  Vec b = Vec::Random(n);
  Vec ref = a.array() * b.array();

  GpuContext ctx;
  auto d_a = DeviceMatrix<Scalar>::fromHost(a, ctx.stream());
  auto d_b = DeviceMatrix<Scalar>::fromHost(b, ctx.stream());
  auto d_c = d_a.cwiseProduct(ctx, d_b);
  Vec result = d_c.toHost(ctx.stream());

  RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<RealScalar>::epsilon();
  VERIFY((result - ref).norm() < tol * ref.norm() + tol);
}

EIGEN_DECLARE_TEST(gpu_device_matrix) {
  CALL_SUBTEST(test_default_construct());
  CALL_SUBTEST(test_empty());
  CALL_SUBTEST(test_resize());
  CALL_SUBTEST(test_host_transfer_ready());
  CALL_SUBTEST(test_host_transfer_move());
  CALL_SUBTEST((test_allocate<float>(100, 50)));
  CALL_SUBTEST((test_allocate<double>(100, 50)));

  CALL_SUBTEST(test_scalar<float>());
  CALL_SUBTEST(test_scalar<double>());
  CALL_SUBTEST(test_scalar<std::complex<float>>());
  CALL_SUBTEST(test_scalar<std::complex<double>>());

  CALL_SUBTEST(test_blas1<float>(256));
  CALL_SUBTEST(test_blas1<double>(256));
  CALL_SUBTEST(test_blas1<std::complex<float>>(256));
  CALL_SUBTEST(test_blas1<std::complex<double>>(256));

  CALL_SUBTEST(test_cg_operators<float>(256));
  CALL_SUBTEST(test_cg_operators<double>(256));
  CALL_SUBTEST(test_cg_operators<std::complex<float>>(256));
  CALL_SUBTEST(test_cg_operators<std::complex<double>>(256));

  CALL_SUBTEST(test_device_scalar<float>());
  CALL_SUBTEST(test_device_scalar<double>());
  CALL_SUBTEST(test_device_scalar<std::complex<float>>());
  CALL_SUBTEST(test_device_scalar<std::complex<double>>());

  CALL_SUBTEST(test_cwiseProduct<float>());
  CALL_SUBTEST(test_cwiseProduct<double>());
}