Mirror of https://gitlab.com/libeigen/eigen.git, synced 2026-04-10 11:34:33 +08:00

Compare commits: selfadjoin...gpu-sparse (13 commits)
| SHA1 |
|---|
| 43a95b62bb |
| 8593c7f5a1 |
| 58c44ef36d |
| 6a9405bf7a |
| e055e4e415 |
| b1d2ce4c85 |
| ab70739c9c |
| e778b5d22b |
| def45c5e1e |
| 110530a4d8 |
| bde3a68bae |
| 8eabfb5342 |
| 4ad90a60f1 |
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.10.0)
+cmake_minimum_required(VERSION 3.17)
 
 #==============================================================================
 # CMake Policy issues.
@@ -9,7 +9,7 @@ if (POLICY CMP0077)
 endif (POLICY CMP0077)
 
 # NOTE Remove setting the policy once the minimum required CMake version is
-# increased to at least 3.15. Retain enabling the export to package registry.
+# increased to at least 3.21. Retain enabling the export to package registry.
 if (POLICY CMP0090)
   # The export command does not populate package registry by default
   cmake_policy (SET CMP0090 NEW)
@@ -672,7 +672,7 @@ if (EIGEN_BUILD_TESTING)
 endif()
 
 set(EIGEN_CUDA_CXX_FLAGS "" CACHE STRING "Additional flags to pass to the cuda compiler.")
-set(EIGEN_CUDA_COMPUTE_ARCH 30 CACHE STRING "The CUDA compute architecture(s) to target when compiling CUDA code")
+set(EIGEN_CUDA_COMPUTE_ARCH 70 CACHE STRING "The CUDA compute architecture(s) to target when compiling CUDA code")
 
 option(EIGEN_TEST_SYCL "Add Sycl support." OFF)
 if(EIGEN_TEST_SYCL)
@@ -817,4 +817,3 @@ endif()
 message(STATUS "")
 message(STATUS "Configured Eigen ${EIGEN_VERSION_STRING}")
 message(STATUS "")
-
@@ -50,9 +50,9 @@
 #include "src/Core/util/AOCL_Support.h"
 
-#if defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)
-#define EIGEN_HAS_GPU_FP16
-#endif
+// EIGEN_HAS_GPU_FP16 is now always true when compiling with CUDA or HIP.
+// Use EIGEN_GPUCC (compile-time) or EIGEN_GPU_COMPILE_PHASE (device phase) instead.
+// TODO: Remove EIGEN_HAS_GPU_BF16 similarly once HIP bf16 guards are cleaned up.
 
 #if defined(EIGEN_HAS_CUDA_BF16) || defined(EIGEN_HAS_HIP_BF16)
 #define EIGEN_HAS_GPU_BF16
Eigen/GPU (new file, 69 lines)
@@ -0,0 +1,69 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_GPU_MODULE_H
+#define EIGEN_GPU_MODULE_H
+
+#include "Core"
+
+#include "src/Core/util/DisableStupidWarnings.h"
+
+/** \defgroup GPU_Module GPU module
+ *
+ * GPU-accelerated solvers and operations using NVIDIA CUDA libraries
+ * (cuSOLVER, cuBLAS, cuSPARSE, cuFFT, cuDSS).
+ *
+ * This module provides explicit GPU solver classes that coexist with Eigen's
+ * CPU solvers. Unlike the LAPACKE dispatch (which replaces the CPU
+ * implementation globally), GPU classes are separate types the user
+ * instantiates by choice:
+ *
+ * \code
+ * #define EIGEN_USE_GPU
+ * #include <Eigen/GPU>
+ *
+ * // CPU path (unchanged)
+ * Eigen::LLT<Eigen::MatrixXd> llt_cpu(A);
+ *
+ * // GPU path (explicit)
+ * Eigen::GpuLLT<double> llt_gpu(A);  // L stays on device
+ * auto X = llt_gpu.solve(B);         // only B transferred per solve
+ * \endcode
+ *
+ * Requires CUDA 11.4+. See CLAUDE.md.
+ */
+
+#ifdef EIGEN_USE_GPU
+// IWYU pragma: begin_exports
+#include "src/GPU/DeviceMatrix.h"
+#include "src/GPU/GpuContext.h"
+#include "src/GPU/DeviceExpr.h"
+#include "src/GPU/DeviceBlasExpr.h"
+#include "src/GPU/DeviceSolverExpr.h"
+#include "src/GPU/DeviceDispatch.h"
+#include "src/GPU/GpuLLT.h"
+#include "src/GPU/GpuLU.h"
+#include "src/GPU/GpuQR.h"
+#include "src/GPU/GpuSVD.h"
+#include "src/GPU/GpuEigenSolver.h"
+#include "src/GPU/CuFftSupport.h"
+#include "src/GPU/GpuFFT.h"
+#include "src/GPU/CuSparseSupport.h"
+#include "src/GPU/GpuSparseContext.h"
+#ifdef EIGEN_CUDSS
+#include "src/GPU/CuDssSupport.h"
+#include "src/GPU/GpuSparseSolverBase.h"
+#include "src/GPU/GpuSparseLLT.h"
+#include "src/GPU/GpuSparseLDLT.h"
+#include "src/GPU/GpuSparseLU.h"
+#endif
+// IWYU pragma: end_exports
+#endif
+
+#include "src/Core/util/ReenableStupidWarnings.h"
+
+#endif  // EIGEN_GPU_MODULE_H
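The doc comment's design point (factor once on the device, then pay per-solve transfer cost only for each right-hand side) is worth spelling out. A minimal usage sketch, assuming the GpuLLT API shown in the \code block above (the class name and solve() signature are taken from that comment, not verified against the headers):

#define EIGEN_USE_GPU
#include <Eigen/GPU>
#include <vector>

void solve_many(const Eigen::MatrixXd& A, const std::vector<Eigen::VectorXd>& rhs) {
  Eigen::GpuLLT<double> llt(A);  // factor once; the Cholesky factor stays on the GPU
  for (const auto& b : rhs) {
    Eigen::VectorXd x = llt.solve(b);  // per-solve cost: transfer b in and x out
    // ... use x ...
  }
}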
@@ -858,16 +858,8 @@ struct hash<Eigen::bfloat16> {
 }  // namespace std
 #endif
 
-// Add the missing shfl* intrinsics.
-// The __shfl* functions are only valid on HIP or _CUDA_ARCH_ >= 300.
-// CUDA defines them for (__CUDA_ARCH__ >= 300 || !defined(__CUDA_ARCH__))
-//
-// HIP and CUDA prior to SDK 9.0 define
-//    __shfl, __shfl_up, __shfl_down, __shfl_xor for int and float
-// CUDA since 9.0 deprecates those and instead defines
-//    __shfl_sync, __shfl_up_sync, __shfl_down_sync, __shfl_xor_sync,
-// with native support for __half and __nv_bfloat16
-//
+// Warp shuffle overloads for Eigen::bfloat16.
+// HIP uses non-sync __shfl variants; CUDA has native __nv_bfloat16 support in __shfl_sync.
 // Note that the following are __device__ - only functions.
 #if defined(EIGEN_HIPCC)
@@ -141,6 +141,140 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog2_float(const Pac
   return plog_impl_float<Packet, /* base2 */ true>(_x);
 }
 
+// -----------------------------------------------------------------------
+// Double logarithm: shared polynomial + two range-reduction backends
+// -----------------------------------------------------------------------
+
+// Cephes rational-polynomial approximation of log(1+f) for
+// f in [sqrt(0.5)-1, sqrt(2)-1].
+// Evaluates x - 0.5*x^2 + x^3 * P(x)/Q(x) where P and Q are degree-5.
+// See: http://www.netlib.org/cephes/
+template <typename Packet>
+EIGEN_STRONG_INLINE Packet plog_mantissa_double(const Packet x) {
+  const Packet cst_cephes_log_p0 = pset1<Packet>(1.01875663804580931796E-4);
+  const Packet cst_cephes_log_p1 = pset1<Packet>(4.97494994976747001425E-1);
+  const Packet cst_cephes_log_p2 = pset1<Packet>(4.70579119878881725854E0);
+  const Packet cst_cephes_log_p3 = pset1<Packet>(1.44989225341610930846E1);
+  const Packet cst_cephes_log_p4 = pset1<Packet>(1.79368678507819816313E1);
+  const Packet cst_cephes_log_p5 = pset1<Packet>(7.70838733755885391666E0);
+  // Q0 = 1.0; pmadd(1, x, q1) simplifies to padd(x, q1).
+  const Packet cst_cephes_log_q1 = pset1<Packet>(1.12873587189167450590E1);
+  const Packet cst_cephes_log_q2 = pset1<Packet>(4.52279145837532221105E1);
+  const Packet cst_cephes_log_q3 = pset1<Packet>(8.29875266912776603211E1);
+  const Packet cst_cephes_log_q4 = pset1<Packet>(7.11544750618563894466E1);
+  const Packet cst_cephes_log_q5 = pset1<Packet>(2.31251620126765340583E1);
+
+  Packet x2 = pmul(x, x);
+  Packet x3 = pmul(x2, x);
+
+  // Evaluate P and Q simultaneously for better ILP.
+  Packet y, y1, y_;
+  y = pmadd(cst_cephes_log_p0, x, cst_cephes_log_p1);
+  y1 = pmadd(cst_cephes_log_p3, x, cst_cephes_log_p4);
+  y = pmadd(y, x, cst_cephes_log_p2);
+  y1 = pmadd(y1, x, cst_cephes_log_p5);
+  y_ = pmadd(y, x3, y1);
+
+  y = padd(x, cst_cephes_log_q1);
+  y1 = pmadd(cst_cephes_log_q3, x, cst_cephes_log_q4);
+  y = pmadd(y, x, cst_cephes_log_q2);
+  y1 = pmadd(y1, x, cst_cephes_log_q5);
+  y = pmadd(y, x3, y1);
+
+  y_ = pmul(y_, x3);
+  y = pdiv(y_, y);
+  y = pnmadd(pset1<Packet>(0.5), x2, y);
+  return padd(x, y);
+}
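A scalar transcription of this rational approximation can serve as a reference when validating the packet version. A sketch (hypothetical helper name; the coefficients are the Cephes ones used above):

// log(1+f) ~= f - 0.5*f^2 + f^3 * P(f)/Q(f), valid for f in [sqrt(0.5)-1, sqrt(2)-1].
double log_mantissa_ref(double f) {
  const double P[6] = {1.01875663804580931796E-4, 4.97494994976747001425E-1,
                       4.70579119878881725854E0,  1.44989225341610930846E1,
                       1.79368678507819816313E1,  7.70838733755885391666E0};
  const double Q[6] = {1.0,                       1.12873587189167450590E1,
                       4.52279145837532221105E1,  8.29875266912776603211E1,
                       7.11544750618563894466E1,  2.31251620126765340583E1};
  double p = P[0], q = Q[0];
  for (int i = 1; i < 6; ++i) {  // Horner evaluation of both polynomials
    p = p * f + P[i];
    q = q * f + Q[i];
  }
  const double f2 = f * f;
  return f - 0.5 * f2 + f2 * f * (p / q);
}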
+
+// Detect whether unpacket_traits<Packet>::integer_packet is defined.
+template <typename Packet, typename = void>
+struct packet_has_integer_packet : std::false_type {};
+template <typename Packet>
+struct packet_has_integer_packet<Packet, void_t<typename unpacket_traits<Packet>::integer_packet>> : std::true_type {};
+
+// Dispatch struct for double-precision range reduction.
+// Primary template: pfrexp-based fallback (used when integer_packet is absent).
+template <typename Packet, bool UseIntegerPacket>
+struct plog_range_reduce_double {
+  EIGEN_STRONG_INLINE static void run(const Packet v, Packet& f, Packet& e) {
+    const Packet one = pset1<Packet>(1.0);
+    const Packet cst_cephes_SQRTHF = pset1<Packet>(0.70710678118654752440E0);
+    // pfrexp: f in [0.5, 1), e = unbiased exponent as double.
+    f = pfrexp(v, e);
+    // Shift [0.5,1) -> [sqrt(0.5)-1, sqrt(2)-1] with exponent correction:
+    //   if f < sqrt(0.5): f = f + f - 1, e -= 1  (giving f in [0, sqrt(2)-1))
+    //   else:             f = f - 1              (giving f in [sqrt(0.5)-1, 0))
+    Packet mask = pcmp_lt(f, cst_cephes_SQRTHF);
+    Packet tmp = pand(f, mask);
+    f = psub(f, one);
+    e = psub(e, pand(one, mask));
+    f = padd(f, tmp);
+  }
+};
+
+// Specialisation: fast integer-bit-manipulation path (musl-inspired).
+// Requires unpacket_traits<Packet>::integer_packet to be a 64-bit integer packet.
+template <typename Packet>
+struct plog_range_reduce_double<Packet, true> {
+  EIGEN_STRONG_INLINE static void run(const Packet v, Packet& f, Packet& e) {
+    typedef typename unpacket_traits<Packet>::integer_packet PacketI;
+    // 2^-1022: smallest positive normal double.
+    const PacketI cst_min_normal = pset1<PacketI>(static_cast<int64_t>(0x0010000000000000LL));
+    // Lower 52-bit mask (IEEE mantissa field).
+    const PacketI cst_mant_mask = pset1<PacketI>(static_cast<int64_t>(0x000FFFFFFFFFFFFFLL));
+    // Offset = 1.0_bits - sqrt(0.5)_bits. Adding this to the integer
+    // representation shifts the exponent field so that the [sqrt(0.5), sqrt(2))
+    // half-octave boundary falls on an exact biased-exponent boundary, letting
+    // us extract e with a single right shift. The constant is:
+    //   0x3FF0000000000000 - 0x3FE6A09E667F3BCD = 0x00095F619980C433
+    const PacketI cst_sqrt_half_offset =
+        pset1<PacketI>(static_cast<int64_t>(0x3FF0000000000000LL - 0x3FE6A09E667F3BCDLL));
+    // IEEE double exponent bias (1023).
+    const PacketI cst_exp_bias = pset1<PacketI>(static_cast<int64_t>(1023));
+    // sqrt(0.5) IEEE bits, used to reconstruct f from the biased mantissa.
+    const PacketI cst_half_mant = pset1<PacketI>(static_cast<int64_t>(0x3FE6A09E667F3BCDLL));
+
+    // Reinterpret v as a 64-bit integer vector.
+    PacketI vi = preinterpret<PacketI>(v);
+
+    // Normalise denormals: multiply by 2^52 and correct the exponent by -52.
+    PacketI is_denormal = pcmp_lt(vi, cst_min_normal);
+    // 2^52 via bit pattern: biased exponent = 52 + 1023 = 0x433, mantissa = 0.
+    Packet v_norm = pmul(v, pset1frombits<Packet>(static_cast<uint64_t>(int64_t(52 + 0x3ff) << 52)));
+    vi = pselect(is_denormal, preinterpret<PacketI>(v_norm), vi);
+    PacketI denorm_adj = pand(is_denormal, pset1<PacketI>(static_cast<int64_t>(52)));
+
+    // Bias the integer representation so the exponent field directly encodes
+    // the half-octave index.
+    PacketI vi_biased = padd(vi, cst_sqrt_half_offset);
+    // Extract unbiased exponent: shift out mantissa bits, subtract IEEE bias
+    // and denormal adjustment.
+    PacketI e_int = psub(psub(plogical_shift_right<52>(vi_biased), cst_exp_bias), denorm_adj);
+    // Convert integer exponent to floating-point.
+    e = pcast<PacketI, Packet>(e_int);
+
+    // Reconstruct mantissa in [sqrt(0.5), sqrt(2)) via integer arithmetic.
+    // The integer addition of the masked mantissa bits and the sqrt(0.5) bit
+    // pattern carries into the exponent field, yielding a value in that range.
+    // Then subtract 1 to centre on 0: f in [sqrt(0.5)-1, sqrt(2)-1].
+    f = psub(preinterpret<Packet>(padd(pand(vi_biased, cst_mant_mask), cst_half_mant)), pset1<Packet>(1.0));
+  }
+};
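The integer trick is easier to follow in scalar form. A sketch for normal (non-denormal) inputs, using the same bit constants as above (hypothetical helper, not part of the patch):

#include <cstdint>
#include <cstring>

// Splits v > 0 into v = m * 2^e with m in [sqrt(0.5), sqrt(2)), returning f = m - 1.
void log_reduce_ref(double v, double& f, int& e) {
  uint64_t vi;
  std::memcpy(&vi, &v, sizeof vi);
  // bits(1.0) - bits(sqrt(0.5)): aligns the half-octave boundary with the exponent field.
  const uint64_t offset = 0x3FF0000000000000ull - 0x3FE6A09E667F3BCDull;
  const uint64_t biased = vi + offset;
  e = static_cast<int>(biased >> 52) - 1023;  // unbiased exponent, one shift
  // Re-attach the sqrt(0.5) bit pattern to the masked mantissa bits; the integer
  // addition carries into the exponent field, landing m in [sqrt(0.5), sqrt(2)).
  const uint64_t mant = (biased & 0x000FFFFFFFFFFFFFull) + 0x3FE6A09E667F3BCDull;
  double m;
  std::memcpy(&m, &mant, sizeof m);
  f = m - 1.0;
}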
+
+// Core range reduction and polynomial for double logarithm.
+// Input: v > 0 (zero / negative / inf / nan are handled by the caller).
+// Output: log_mantissa ≈ log(mantissa of v in [sqrt(0.5), sqrt(2))),
+//         e = unbiased exponent of v as a double.
+// Selects the fast integer path when integer_packet is available, otherwise
+// falls back to pfrexp.
+template <typename Packet>
+EIGEN_STRONG_INLINE void plog_core_double(const Packet v, Packet& log_mantissa, Packet& e) {
+  Packet f;
+  plog_range_reduce_double<Packet, packet_has_integer_packet<Packet>::value>::run(v, f, e);
+  log_mantissa = plog_mantissa_double(f);
+}
 
 /* Returns the base e (2.718...) or base 2 logarithm of x.
  * The argument is separated into its exponent and fractional parts.
  * The logarithm of the fraction in the interval [sqrt(1/2), sqrt(2)],
@@ -152,87 +286,29 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog2_float(const Pac
  */
 template <typename Packet, bool base2>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog_impl_double(const Packet _x) {
-  Packet x = _x;
-
-  const Packet cst_1 = pset1<Packet>(1.0);
-  const Packet cst_neg_half = pset1<Packet>(-0.5);
   const Packet cst_minus_inf = pset1frombits<Packet>(static_cast<uint64_t>(0xfff0000000000000ull));
   const Packet cst_pos_inf = pset1frombits<Packet>(static_cast<uint64_t>(0x7ff0000000000000ull));
 
-  // Polynomial Coefficients for log(1+x) = x - x**2/2 + x**3 P(x)/Q(x)
-  //                             1/sqrt(2) <= x < sqrt(2)
-  const Packet cst_cephes_SQRTHF = pset1<Packet>(0.70710678118654752440E0);
-  const Packet cst_cephes_log_p0 = pset1<Packet>(1.01875663804580931796E-4);
-  const Packet cst_cephes_log_p1 = pset1<Packet>(4.97494994976747001425E-1);
-  const Packet cst_cephes_log_p2 = pset1<Packet>(4.70579119878881725854E0);
-  const Packet cst_cephes_log_p3 = pset1<Packet>(1.44989225341610930846E1);
-  const Packet cst_cephes_log_p4 = pset1<Packet>(1.79368678507819816313E1);
-  const Packet cst_cephes_log_p5 = pset1<Packet>(7.70838733755885391666E0);
+  Packet log_mantissa, e;
+  plog_core_double(_x, log_mantissa, e);
 
-  const Packet cst_cephes_log_q0 = pset1<Packet>(1.0);
-  const Packet cst_cephes_log_q1 = pset1<Packet>(1.12873587189167450590E1);
-  const Packet cst_cephes_log_q2 = pset1<Packet>(4.52279145837532221105E1);
-  const Packet cst_cephes_log_q3 = pset1<Packet>(8.29875266912776603211E1);
-  const Packet cst_cephes_log_q4 = pset1<Packet>(7.11544750618563894466E1);
-  const Packet cst_cephes_log_q5 = pset1<Packet>(2.31251620126765340583E1);
-
-  Packet e;
-  // extract significant in the range [0.5,1) and exponent
-  x = pfrexp(x, e);
-
-  // Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2))
-  // and shift by -1. The values are then centered around 0, which improves
-  // the stability of the polynomial evaluation.
-  //   if( x < SQRTHF ) {
-  //     e -= 1;
-  //     x = x + x - 1.0;
-  //   } else { x = x - 1.0; }
-  Packet mask = pcmp_lt(x, cst_cephes_SQRTHF);
-  Packet tmp = pand(x, mask);
-  x = psub(x, cst_1);
-  e = psub(e, pand(cst_1, mask));
-  x = padd(x, tmp);
-
-  Packet x2 = pmul(x, x);
-  Packet x3 = pmul(x2, x);
-
-  // Evaluate the polynomial in factored form for better instruction-level parallelism.
-  //   y = x - 0.5*x^2 + x^3 * polevl( x, P, 5 ) / p1evl( x, Q, 5 ) );
-  Packet y, y1, y_;
-  y = pmadd(cst_cephes_log_p0, x, cst_cephes_log_p1);
-  y1 = pmadd(cst_cephes_log_p3, x, cst_cephes_log_p4);
-  y = pmadd(y, x, cst_cephes_log_p2);
-  y1 = pmadd(y1, x, cst_cephes_log_p5);
-  y_ = pmadd(y, x3, y1);
-
-  y = pmadd(cst_cephes_log_q0, x, cst_cephes_log_q1);
-  y1 = pmadd(cst_cephes_log_q3, x, cst_cephes_log_q4);
-  y = pmadd(y, x, cst_cephes_log_q2);
-  y1 = pmadd(y1, x, cst_cephes_log_q5);
-  y = pmadd(y, x3, y1);
-
-  y_ = pmul(y_, x3);
-  y = pdiv(y_, y);
-
-  y = pmadd(cst_neg_half, x2, y);
-  x = padd(x, y);
-
-  // Add the logarithm of the exponent back to the result of the interpolation.
+  // Combine: log(x) = e * ln2 + log(mantissa), or log2(x) = log(mantissa)*log2e + e.
+  Packet x;
   if (base2) {
     const Packet cst_log2e = pset1<Packet>(static_cast<double>(EIGEN_LOG2E));
-    x = pmadd(x, cst_log2e, e);
+    x = pmadd(log_mantissa, cst_log2e, e);
   } else {
     const Packet cst_ln2 = pset1<Packet>(static_cast<double>(EIGEN_LN2));
-    x = pmadd(e, cst_ln2, x);
+    x = pmadd(e, cst_ln2, log_mantissa);
   }
 
   Packet invalid_mask = pcmp_lt_or_nan(_x, pzero(_x));
   Packet iszero_mask = pcmp_eq(_x, pzero(_x));
   Packet pos_inf_mask = pcmp_eq(_x, cst_pos_inf);
-  // Filter out invalid inputs, i.e.:
-  //  - negative arg will be NAN
-  //  - 0 will be -INF
-  //  - +INF will be +INF
+  // Filter out invalid inputs:
+  //  - negative arg → NAN
+  //  - 0 → -INF
+  //  - +INF → +INF
   return pselect(iszero_mask, cst_minus_inf, por(pselect(pos_inf_mask, cst_pos_inf, x), invalid_mask));
 }
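The structure of the rewritten function is the classic decomposition log(x) = e*ln2 + log(m). A scalar reference with the same special-case behaviour (hypothetical helper; std::frexp stands in for the packet range reduction):

#include <cmath>

double log_ref(double x) {
  if (x == 0.0) return -INFINITY;            // 0 -> -inf
  if (x < 0.0 || std::isnan(x)) return NAN;  // negative or NaN -> NaN
  if (std::isinf(x)) return INFINITY;        // +inf -> +inf
  int e;
  double m = std::frexp(x, &e);              // x = m * 2^e, m in [0.5, 1)
  if (m < 0.70710678118654752440) {          // shift m into [sqrt(0.5), sqrt(2))
    m += m;
    e -= 1;
  }
  return e * 0.693147180559945309417 + std::log(m);  // e*ln2 + log(m)
}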
@@ -286,8 +362,11 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_log1p_float(c
   return result;
 }
 
-/** \internal \returns log(1 + x) for double precision float.
-    Same direct approach as the float version.
+/** \internal \returns log(1 + x) for double precision.
+    Computes log(1+x) using plog_core_double for the core range reduction and
+    polynomial evaluation. The rounding error from forming u = fl(1+x) is
+    recovered as dx = x - (u - 1) and folded in as a first-order correction
+    dx/u after the polynomial evaluation.
 */
 template <typename Packet>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_log1p_double(const Packet& x) {
@@ -295,67 +374,31 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_log1p_double(
   const Packet cst_minus_inf = pset1frombits<Packet>(static_cast<uint64_t>(0xfff0000000000000ull));
   const Packet cst_pos_inf = pset1frombits<Packet>(static_cast<uint64_t>(0x7ff0000000000000ull));
 
   // u = 1 + x, with rounding. Recover the lost low bits: dx = x - (u - 1).
   Packet u = padd(one, x);
   Packet dx = psub(x, psub(u, one));
 
   // For |x| tiny enough that u rounds to 1, return x directly.
   Packet small_mask = pcmp_eq(u, one);
   // For u = +inf (x very large), return +inf.
   Packet inf_mask = pcmp_eq(u, cst_pos_inf);
 
-  const Packet cst_cephes_SQRTHF = pset1<Packet>(0.70710678118654752440E0);
-  Packet e;
-  Packet m = pfrexp(u, e);
-  Packet mask = pcmp_lt(m, cst_cephes_SQRTHF);
-  Packet tmp = pand(m, mask);
-  m = psub(m, one);
-  e = psub(e, pand(one, mask));
-  m = padd(m, tmp);
+  // Core range reduction and polynomial on u.
+  Packet log_u, e;
+  plog_core_double(u, log_u, e);
 
-  // Same polynomial as plog_double.
-  const Packet cst_neg_half = pset1<Packet>(-0.5);
-  const Packet cst_cephes_log_p0 = pset1<Packet>(1.01875663804580931796E-4);
-  const Packet cst_cephes_log_p1 = pset1<Packet>(4.97494994976747001425E-1);
-  const Packet cst_cephes_log_p2 = pset1<Packet>(4.70579119878881725854E0);
-  const Packet cst_cephes_log_p3 = pset1<Packet>(1.44989225341610930846E1);
-  const Packet cst_cephes_log_p4 = pset1<Packet>(1.79368678507819816313E1);
-  const Packet cst_cephes_log_p5 = pset1<Packet>(7.70838733755885391666E0);
-  const Packet cst_cephes_log_q0 = pset1<Packet>(1.0);
-  const Packet cst_cephes_log_q1 = pset1<Packet>(1.12873587189167450590E1);
-  const Packet cst_cephes_log_q2 = pset1<Packet>(4.52279145837532221105E1);
-  const Packet cst_cephes_log_q3 = pset1<Packet>(8.29875266912776603211E1);
-  const Packet cst_cephes_log_q4 = pset1<Packet>(7.11544750618563894466E1);
-  const Packet cst_cephes_log_q5 = pset1<Packet>(2.31251620126765340583E1);
-
-  Packet m2 = pmul(m, m);
-  Packet m3 = pmul(m2, m);
-
-  Packet y, y1, y_;
-  y = pmadd(cst_cephes_log_p0, m, cst_cephes_log_p1);
-  y1 = pmadd(cst_cephes_log_p3, m, cst_cephes_log_p4);
-  y = pmadd(y, m, cst_cephes_log_p2);
-  y1 = pmadd(y1, m, cst_cephes_log_p5);
-  y_ = pmadd(y, m3, y1);
-
-  y = pmadd(cst_cephes_log_q0, m, cst_cephes_log_q1);
-  y1 = pmadd(cst_cephes_log_q3, m, cst_cephes_log_q4);
-  y = pmadd(y, m, cst_cephes_log_q2);
-  y1 = pmadd(y1, m, cst_cephes_log_q5);
-  y = pmadd(y, m3, y1);
-
-  y_ = pmul(y_, m3);
-  Packet log_m = pdiv(y_, y);
-  log_m = pmadd(cst_neg_half, m2, log_m);
-  log_m = padd(m, log_m);
-
-  // result = e * ln2 + log(m) + dx/u.
+  // result = e * ln2 + log(u) + dx/u.
+  // The dx/u term corrects for the rounding error in u = fl(1+x).
   const Packet cst_ln2 = pset1<Packet>(static_cast<double>(EIGEN_LN2));
-  Packet result = pmadd(e, cst_ln2, padd(log_m, pdiv(dx, u)));
+  Packet result = pmadd(e, cst_ln2, padd(log_u, pdiv(dx, u)));
 
   // Handle special cases.
   Packet neg_mask = pcmp_lt(u, pzero(u));
   Packet zero_mask = pcmp_eq(x, pset1<Packet>(-1.0));
   result = pselect(small_mask, x, result);
   result = pselect(inf_mask, cst_pos_inf, result);
   result = pselect(zero_mask, cst_minus_inf, result);
-  result = por(neg_mask, result);
+  result = por(neg_mask, result);  // NaN for x < -1
   return result;
 }
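The dx/u correction is the whole point of the double-precision log1p path. A scalar illustration (hypothetical helper; std::log stands in for the plog_core_double polynomial):

#include <cmath>

double log1p_ref(double x) {
  const double u = 1.0 + x;         // rounded: the low bits of x can be lost here
  if (u == 1.0) return x;           // |x| tiny: log(1+x) ~= x to double precision
  const double dx = x - (u - 1.0);  // exactly the part of x lost when forming u
  // log(1+x) = log(u) + log(1 + dx/u) ~= log(u) + dx/u to first order.
  return std::log(u) + dx / u;
}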
@@ -230,40 +230,31 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet ptan_float(const Pack
   return psincos_float<TrigFunction::Tan>(x);
 }
 
-// Trigonometric argument reduction for double for inputs smaller than 15.
-// Reduces trigonometric arguments for double inputs where x < 15. Given an argument x and its corresponding quadrant
-// count n, the function computes and returns the reduced argument t such that x = n * pi/2 + t.
+// Pi/2 split into 3 double-precision parts (triple-double).
+// c1 + c2 + c3 = pi/2 to ~159 bits. Computed by Sollya.
+//   c1 = RD(pi/2), c2 = RD(pi/2 - c1), c3 = RD(pi/2 - c1 - c2).
 template <typename Packet>
-Packet trig_reduce_small_double(const Packet& x, const Packet& q) {
-  // Pi/2 split into 2 values
-  const Packet cst_pio2_a = pset1<Packet>(-1.570796325802803);
-  const Packet cst_pio2_b = pset1<Packet>(-9.920935184482005e-10);
-
-  Packet t;
-  t = pmadd(cst_pio2_a, q, x);
-  t = pmadd(cst_pio2_b, q, t);
-  return t;
+Packet cst_pio2_1() {
+  return pset1<Packet>(-1.5707963267948965579989817342720925807952880859375);  // -0x1.921fb54442d18p0
 }
+template <typename Packet>
+Packet cst_pio2_2() {
+  return pset1<Packet>(-6.12323399573676603586882014729198302312846062338790e-17);  // -0x1.1a62633145c07p-54
+}
+template <typename Packet>
+Packet cst_pio2_3() {
+  return pset1<Packet>(1.4973849048591698329435081771059920083527504761695190e-33);  // 0x1.f1976b7ed8fbcp-110
+}
 
-// Trigonometric argument reduction for double for inputs smaller than 1e14.
-// Reduces trigonometric arguments for double inputs where x < 1e14. Given an argument x and its corresponding quadrant
-// count n, the function computes and returns the reduced argument t such that x = n * pi/2 + t.
+// Trigonometric argument reduction for double, small inputs (|x| < small_th).
+// Reduces x to t such that x = q * pi/2 + t, where |t| <= pi/4.
+// Uses a triple-double split of pi/2 with FMA for high accuracy.
 template <typename Packet>
-Packet trig_reduce_medium_double(const Packet& x, const Packet& q_high, const Packet& q_low) {
-  // Pi/2 split into 4 values
-  const Packet cst_pio2_a = pset1<Packet>(-1.570796325802803);
-  const Packet cst_pio2_b = pset1<Packet>(-9.920935184482005e-10);
-  const Packet cst_pio2_c = pset1<Packet>(-6.123234014771656e-17);
-  const Packet cst_pio2_d = pset1<Packet>(1.903488962019325e-25);
-
+Packet trig_reduce_small_double(const Packet& x, const Packet& q) {
   Packet t;
-  t = pmadd(cst_pio2_a, q_high, x);
-  t = pmadd(cst_pio2_a, q_low, t);
-  t = pmadd(cst_pio2_b, q_high, t);
-  t = pmadd(cst_pio2_b, q_low, t);
-  t = pmadd(cst_pio2_c, q_high, t);
-  t = pmadd(cst_pio2_c, q_low, t);
-  t = pmadd(cst_pio2_d, padd(q_low, q_high), t);
+  t = pmadd(cst_pio2_1<Packet>(), q, x);
+  t = pmadd(cst_pio2_2<Packet>(), q, t);
+  t = pmadd(cst_pio2_3<Packet>(), q, t);
   return t;
 }
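This is Cody-Waite style reduction: each split constant is exact in double and q is an exactly representable integer, so every FMA step cancels almost exactly and the three constants together recover pi/2 to far beyond double precision. A scalar sketch using the same three constants (hypothetical helper):

#include <cmath>

// Returns t such that x = q * pi/2 + t. The constants are stored negated, so
// each step is a single fused multiply-add, as in the packet code above.
double trig_reduce_ref(double x, double q) {
  double t = std::fma(-1.5707963267948965579989817342720925807952880859375, q, x);
  t = std::fma(-6.12323399573676603586882014729198302312846062338790e-17, q, t);
  t = std::fma(1.4973849048591698329435081771059920083527504761695190e-33, q, t);
  return t;
}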
@@ -284,11 +275,13 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
   // If the argument is bigger than this value, use the non-vectorized std version
   const double huge_th = 1e14;
 
-  const Packet cst_2oPI = pset1<Packet>(0.63661977236758134307553505349006);  // 2/PI
+  // 2/PI as a double-word: hi + lo = 2/pi to ~107 bits. Computed by Sollya.
+  const Packet cst_2oPI_hi =
+      pset1<Packet>(0.63661977236758138243288840385503135621547698974609375);  // 0x1.45f306dc9c883p-1
+  const Packet cst_2oPI_lo =
+      pset1<Packet>(-3.9357353350364971763790381828183628368294820823718866e-17);  // -0x1.6b01ec5417056p-55
   // Integer Packet constants
   const PacketI cst_one = pset1<PacketI>(ScalarI(1));
-  // Constant for splitting
-  const Packet cst_split = pset1<Packet>(1 << 24);
 
   Packet x_abs = pabs(x);
@@ -298,76 +291,56 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 
   // TODO Implement huge angle argument reduction
   if (EIGEN_PREDICT_FALSE(predux_any(pcmp_le(pset1<Packet>(small_th), x_abs)))) {
-    Packet q_high = pmul(pfloor(pmul(x_abs, pdiv(cst_2oPI, cst_split))), cst_split);
-    Packet q_low_noround = psub(pmul(x_abs, cst_2oPI), q_high);
-    q_int = pcast<Packet, PacketI>(padd(q_low_noround, pset1<Packet>(0.5)));
-    Packet q_low = pcast<PacketI, Packet>(q_int);
-    s = trig_reduce_medium_double(x_abs, q_high, q_low);
+    // Medium path: use double-word product x * (2/pi) for precise quadrant computation.
+    Packet prod_hi, prod_lo;
+    twoprod(x_abs, cst_2oPI_hi, prod_hi, prod_lo);
+    // Correction for 2/pi truncation: add x * lo(2/pi)
+    prod_lo = pmadd(x_abs, cst_2oPI_lo, prod_lo);
+
+    // Round the double-word (prod_hi, prod_lo) to the nearest integer.
+    Packet q = pround(prod_hi);
+    // Compute exact fractional part to check if rounding was correct.
+    Packet frac = padd(psub(prod_hi, q), prod_lo);
+    // Correct if fractional part crossed +-0.5 boundary.
+    q = padd(q, pand(pcmp_lt(pset1<Packet>(0.5), frac), pset1<Packet>(1.0)));
+    q = padd(q, pand(pcmp_lt(frac, pset1<Packet>(-0.5)), pset1<Packet>(-1.0)));
+
+    q_int = pcast<Packet, PacketI>(q);
+    s = trig_reduce_small_double(x_abs, q);
   } else {
-    Packet qval_noround = pmul(x_abs, cst_2oPI);
+    // Small path: simple reduction with triple-double pi/2 split.
+    Packet qval_noround = pmul(x_abs, cst_2oPI_hi);
     q_int = pcast<Packet, PacketI>(padd(qval_noround, pset1<Packet>(0.5)));
     Packet q = pcast<PacketI, Packet>(q_int);
     s = trig_reduce_small_double(x_abs, q);
   }
 
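twoprod here is the FMA-based exact product: hi = fl(a*b) and lo = fma(a, b, -hi) satisfy hi + lo = a*b exactly, which is what makes the +-0.5 boundary check reliable. A scalar sketch of the medium-path quadrant computation (hypothetical helper; the packet code uses pround, whose tie-breaking may differ from std::nearbyint):

#include <cmath>

double quadrant_ref(double x) {
  const double hi = 0.63661977236758138243288840385503135621547698974609375;      // hi(2/pi)
  const double lo = -3.9357353350364971763790381828183628368294820823718866e-17;  // lo(2/pi)
  const double prod_hi = x * hi;
  double prod_lo = std::fma(x, hi, -prod_hi);   // exact rounding error of x*hi
  prod_lo = std::fma(x, lo, prod_lo);           // fold in the tail of 2/pi
  double q = std::nearbyint(prod_hi);
  const double frac = (prod_hi - q) + prod_lo;  // fractional part of the double-word
  if (frac > 0.5) q += 1.0;                     // rounding prod_hi alone was off by one
  if (frac < -0.5) q -= 1.0;
  return q;
}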
   // All the upcoming approximating polynomials have even exponents
   Packet ss = pmul(s, s);
 
-  // Padé approximant of cos(x)
-  // Assuring < 1 ULP error on the interval [-pi/4, pi/4]
-  // cos(x) ~= (80737373*x^8 - 13853547000*x^6 + 727718024880*x^4 - 11275015752000*x^2 + 23594700729600)/(147173*x^8 +
-  // 39328920*x^6 + 5772800880*x^4 + 522334612800*x^2 + 23594700729600)
-  // MATLAB code to compute those coefficients:
-  //    syms x;
-  //    cosf = @(x) cos(x);
-  //    pade_cosf = pade(cosf(x), x, 0, 'Order', 8)
-  const Packet cn4 = pset1<Packet>(80737373);
-  const Packet cn3 = pset1<Packet>(-13853547000);
-  const Packet cn2 = pset1<Packet>(727718024880);
-  const Packet cn1 = pset1<Packet>(-11275015752000);
-  const Packet cn0 = pset1<Packet>(23594700729600);  // shared with cd0
-  const Packet cd3 = pset1<Packet>(147173);
-  const Packet cd2 = pset1<Packet>(39328920);
-  const Packet cd1 = pset1<Packet>(5772800880);
-  const Packet cd0 = pset1<Packet>(522334612800);
-  Packet sc1_num = pmadd(ss, cn4, cn3);
-  Packet sc2_num = pmadd(sc1_num, ss, cn2);
-  Packet sc3_num = pmadd(sc2_num, ss, cn1);
-  Packet sc4_num = pmadd(sc3_num, ss, cn0);
-  Packet sc1_denum = pmadd(ss, cd3, cd2);
-  Packet sc2_denum = pmadd(sc1_denum, ss, cd1);
-  Packet sc3_denum = pmadd(sc2_denum, ss, cd0);
-  Packet sc4_denum = pmadd(sc3_denum, ss, cn0);
-  Packet scos = pdiv(sc4_num, sc4_denum);
+  // Minimax polynomial approximation of cos(x) on [-pi/4, pi/4].
+  // cos(x) = 1 + u * P(u), where u = x^2 and P is degree 6 (7 FMAs total).
+  // Coefficients computed by Sollya fpminimax. Max polynomial error ~1.3e-19.
+  Packet scos = pset1<Packet>(-1.1368926065317776472832699312119132152576472805094454088248312473297119140625e-11);
+  scos = pmadd(scos, ss, pset1<Packet>(2.0875905481768720039634091158002593413556269297259859740734100341796875e-09));
+  scos = pmadd(scos, ss, pset1<Packet>(-2.7557315712466412785356544880299711763882442028261721134185791015625e-07));
+  scos = pmadd(scos, ss, pset1<Packet>(2.480158729424286522739599714082459058772656135261058807373046875e-05));
+  scos = pmadd(scos, ss, pset1<Packet>(-1.388888888888178789471350427220386336557567119598388671875e-03));
+  scos = pmadd(scos, ss, pset1<Packet>(4.166666666666664353702032030923874117434024810791015625e-02));
+  scos = pmadd(scos, ss, pset1<Packet>(-0.5));
+  scos = pmadd(scos, ss, pset1<Packet>(1.0));
 
-  // Padé approximant of sin(x)
-  // Assuring < 1 ULP error on the interval [-pi/4, pi/4]
-  // sin(x) ~= (x*(4585922449*x^8 - 1066023933480*x^6 + 83284044283440*x^4 - 2303682236856000*x^2 +
-  // 15605159573203200))/(45*(1029037*x^8 + 345207016*x^6 + 61570292784*x^4 + 6603948711360*x^2 + 346781323848960))
-  // MATLAB code to compute those coefficients:
-  //    syms x;
-  //    sinf = @(x) sin(x);
-  //    pade_sinf = pade(sinf(x), x, 0, 'Order', 8, 'OrderMode', 'relative')
-  const Packet sn4 = pset1<Packet>(4585922449);
-  const Packet sn3 = pset1<Packet>(-1066023933480);
-  const Packet sn2 = pset1<Packet>(83284044283440);
-  const Packet sn1 = pset1<Packet>(-2303682236856000);
-  const Packet sn0 = pset1<Packet>(15605159573203200);
-  const Packet sd3 = pset1<Packet>(1029037);
-  const Packet sd2 = pset1<Packet>(345207016);
-  const Packet sd1 = pset1<Packet>(61570292784);
-  const Packet sd0_inner = pset1<Packet>(6603948711360);
-  const Packet sd0 = pset1<Packet>(346781323848960);
-  const Packet cst_45 = pset1<Packet>(45);
-  Packet ss1_num = pmadd(ss, sn4, sn3);
-  Packet ss2_num = pmadd(ss1_num, ss, sn2);
-  Packet ss3_num = pmadd(ss2_num, ss, sn1);
-  Packet ss4_num = pmadd(ss3_num, ss, sn0);
-  Packet ss1_denum = pmadd(ss, sd3, sd2);
-  Packet ss2_denum = pmadd(ss1_denum, ss, sd1);
-  Packet ss3_denum = pmadd(ss2_denum, ss, sd0_inner);
-  Packet ss4_denum = pmadd(ss3_denum, ss, sd0);
-  Packet ssin = pdiv(pmul(s, ss4_num), pmul(cst_45, ss4_denum));
+  // Minimax polynomial approximation of sin(x) on [-pi/4, pi/4].
+  // sin(x) = x * (1 + u * R(u)), where u = x^2 and R is degree 5.
+  // Computed as: x + x * u * R(u)  (6 FMAs + 1 mul).
+  // Coefficients computed by Sollya fpminimax. Max polynomial error ~1.0e-17.
+  Packet ssin = pset1<Packet>(1.59193066075142890698150587293845624470289834562208852730691432952880859375e-10);
+  ssin = pmadd(ssin, ss, pset1<Packet>(-2.50511517945670206974594627392927126408039839589037001132965087890625e-08));
+  ssin = pmadd(ssin, ss, pset1<Packet>(2.755731622544328228235042954619160582296899519860744476318359375e-06));
+  ssin = pmadd(ssin, ss, pset1<Packet>(-1.9841269837089632013978068858506276228581555187702178955078125e-04));
+  ssin = pmadd(ssin, ss, pset1<Packet>(8.333333333331312264835588621281203813850879669189453125e-03));
+  ssin = pmadd(ssin, ss, pset1<Packet>(-0.1666666666666666574148081281236954964697360992431640625));
+  ssin = pmul(ssin, ss);
+  ssin = pmadd(ssin, s, s);
 
   Packet poly_mask = preinterpret<Packet>(pcmp_eq(pand(q_int, cst_one), pzero(q_int)));
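The pmadd chains are plain Horner evaluation in u = x^2 with one FMA per coefficient. A scalar sketch of the cosine polynomial (coefficients abbreviated from the full-precision values above; agreement with std::cos on [-pi/4, pi/4] should be within a couple of ulps):

#include <cmath>

double cos_poly_ref(double x) {
  const double u = x * x;
  double c = -1.1368926065317776e-11;           // degree-6 coefficient
  c = std::fma(c, u, 2.0875905481768720e-09);
  c = std::fma(c, u, -2.7557315712466413e-07);
  c = std::fma(c, u, 2.4801587294242865e-05);
  c = std::fma(c, u, -1.3888888888881788e-03);
  c = std::fma(c, u, 4.1666666666666644e-02);
  c = std::fma(c, u, -0.5);
  return std::fma(c, u, 1.0);                   // cos(x) = 1 + u * P(u)
}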
@@ -45,7 +45,7 @@
 // Eigen with GPU support.
 // Any functions that require `numext::bit_cast` may also not be constexpr,
 // including any native types when setting via raw bit values.
-#if defined(EIGEN_HAS_GPU_FP16) || defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) || defined(EIGEN_HAS_BUILTIN_FLOAT16)
+#if defined(EIGEN_GPUCC) || defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) || defined(EIGEN_HAS_BUILTIN_FLOAT16)
 #define _EIGEN_MAYBE_CONSTEXPR
 #else
 #define _EIGEN_MAYBE_CONSTEXPR constexpr
@@ -121,12 +121,12 @@ namespace half_impl {
 //
 // Making the host side compile phase of hipcc use the same Eigen::half impl, as the gcc compile, resolves
 // this error, and hence the following convoluted #if condition
-#if !defined(EIGEN_HAS_GPU_FP16) || !defined(EIGEN_GPU_COMPILE_PHASE)
+#if !defined(EIGEN_GPUCC) || !defined(EIGEN_GPU_COMPILE_PHASE)
 
 // Make our own __half_raw definition that is similar to CUDA's.
 struct __half_raw {
   struct construct_from_rep_tag {};
-#if (defined(EIGEN_HAS_GPU_FP16) && !defined(EIGEN_GPU_COMPILE_PHASE))
+#if (defined(EIGEN_GPUCC) && !defined(EIGEN_GPU_COMPILE_PHASE))
   // Eigen::half can be used as the datatype for shared memory declarations (in Eigen and TF)
   // The element type for shared memory cannot have non-trivial constructors
   // and hence the following special casing (which skips the zero-initilization).
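The cleanup standardises on two macros, and the distinction matters for every guard below. A sketch of how they differ (illustrative only; both macros are Eigen's, the comments are assumptions drawn from the patch):

// EIGEN_GPUCC: a GPU compiler (nvcc or hipcc) is driving this translation unit.
// It is defined during *both* the host and the device compile phases.
#if defined(EIGEN_GPUCC)
// Safe: declare GPU-aware types and host/device function signatures.
#endif

// EIGEN_GPU_COMPILE_PHASE: we are in the *device* pass of that compiler.
#if defined(EIGEN_GPU_COMPILE_PHASE)
// Safe: call __device__ intrinsics such as __float2half() or __hisnan().
#endif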
@@ -152,16 +152,12 @@ struct __half_raw {
 #endif
 };
 
-#elif defined(EIGEN_HAS_HIP_FP16)
+#elif defined(EIGEN_HIPCC)
+// HIP GPU compile phase: nothing to do here.
 // HIP fp16 header file has a definition for __half_raw
 
-#elif defined(EIGEN_HAS_CUDA_FP16)
-#if EIGEN_CUDA_SDK_VER < 90000
-// In CUDA < 9.0, __half is the equivalent of CUDA 9's __half_raw
-typedef __half __half_raw;
-#endif  // defined(EIGEN_HAS_CUDA_FP16)
+#elif defined(EIGEN_CUDACC)
+// CUDA GPU compile phase.
 
 #elif defined(SYCL_DEVICE_ONLY)
 typedef cl::sycl::half __half_raw;
@@ -175,15 +171,13 @@ struct half_base : public __half_raw {
   EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half_base() {}
   EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half_base(const __half_raw& h) : __half_raw(h) {}
 
-#if defined(EIGEN_HAS_GPU_FP16)
-#if defined(EIGEN_HAS_HIP_FP16)
+#if defined(EIGEN_GPUCC)
+#if defined(EIGEN_HIPCC)
   EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half_base(const __half& h) { x = __half_as_ushort(h); }
-#elif defined(EIGEN_HAS_CUDA_FP16)
-#if EIGEN_CUDA_SDK_VER >= 90000
+#elif defined(EIGEN_CUDACC)
   EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half_base(const __half& h) : __half_raw(*(__half_raw*)&h) {}
 #endif
-#endif
 #endif
 };
 
 }  // namespace half_impl
@@ -192,36 +186,29 @@ struct half_base : public __half_raw {
 struct half : public half_impl::half_base {
   // Writing this out as separate #if-else blocks to make the code easier to follow
   // The same applies to most #if-else blocks in this file
-#if !defined(EIGEN_HAS_GPU_FP16) || !defined(EIGEN_GPU_COMPILE_PHASE)
+#if !defined(EIGEN_GPUCC) || !defined(EIGEN_GPU_COMPILE_PHASE)
   // Use the same base class for the following two scenarios
   // * when compiling without GPU support enabled
   // * during host compile phase when compiling with GPU support enabled
   typedef half_impl::__half_raw __half_raw;
-#elif defined(EIGEN_HAS_HIP_FP16)
-  // Nothing to do here
+#elif defined(EIGEN_HIPCC)
   // HIP fp16 header file has a definition for __half_raw
-#elif defined(EIGEN_HAS_CUDA_FP16)
-  // Note that EIGEN_CUDA_SDK_VER is set to 0 even when compiling with HIP, so
-  // (EIGEN_CUDA_SDK_VER < 90000) is true even for HIP! So keeping this within
-  // #if defined(EIGEN_HAS_CUDA_FP16) is needed
-#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
-  typedef half_impl::__half_raw __half_raw;
-#endif
+#elif defined(EIGEN_CUDACC)
+  // Nothing to do here.
 #endif
 
   EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half() {}
 
   EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half(const __half_raw& h) : half_impl::half_base(h) {}
 
-#if defined(EIGEN_HAS_GPU_FP16)
-#if defined(EIGEN_HAS_HIP_FP16)
+#if defined(EIGEN_GPUCC)
+#if defined(EIGEN_HIPCC)
   EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half(const __half& h) : half_impl::half_base(h) {}
-#elif defined(EIGEN_HAS_CUDA_FP16)
-#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000
+#elif defined(EIGEN_CUDACC)
   EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half(const __half& h) : half_impl::half_base(h) {}
 #endif
-#endif
 #endif
 
 #if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
   explicit EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half(__fp16 b)
@@ -248,7 +235,7 @@ struct half : public half_impl::half_base {
     return half_impl::half_to_float(*this);
   }
 
-#if defined(EIGEN_HAS_GPU_FP16) && !defined(EIGEN_GPU_COMPILE_PHASE)
+#if defined(EIGEN_GPUCC) && !defined(EIGEN_GPU_COMPILE_PHASE)
   EIGEN_DEVICE_FUNC operator __half() const {
     ::__half_raw hr;
    hr.x = x;
@@ -380,8 +367,7 @@ namespace Eigen {
 
 namespace half_impl {
 
-#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
-    (defined(EIGEN_HAS_HIP_FP16) && defined(HIP_DEVICE_COMPILE))
+#if defined(EIGEN_GPU_COMPILE_PHASE)
 // Note: We deliberately do *not* define this to 1 even if we have Arm's native
 // fp16 type since GPU half types are rather different from native CPU half types.
 #define EIGEN_HAS_NATIVE_GPU_FP16
@@ -393,24 +379,10 @@ namespace half_impl {
 // conversion steps back and forth.
 
 #if defined(EIGEN_HAS_NATIVE_GPU_FP16)
-EIGEN_STRONG_INLINE __device__ half operator+(const half& a, const half& b) {
-#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000
-  return __hadd(::__half(a), ::__half(b));
-#else
-  return __hadd(a, b);
-#endif
-}
+EIGEN_STRONG_INLINE __device__ half operator+(const half& a, const half& b) { return __hadd(::__half(a), ::__half(b)); }
 EIGEN_STRONG_INLINE __device__ half operator*(const half& a, const half& b) { return __hmul(a, b); }
 EIGEN_STRONG_INLINE __device__ half operator-(const half& a, const half& b) { return __hsub(a, b); }
-EIGEN_STRONG_INLINE __device__ half operator/(const half& a, const half& b) {
-#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000
-  return __hdiv(a, b);
-#else
-  float num = __half2float(a);
-  float denom = __half2float(b);
-  return __float2half(num / denom);
-#endif
-}
+EIGEN_STRONG_INLINE __device__ half operator/(const half& a, const half& b) { return __hdiv(a, b); }
 EIGEN_STRONG_INLINE __device__ half operator-(const half& a) { return __hneg(a); }
 EIGEN_STRONG_INLINE __device__ half& operator+=(half& a, const half& b) {
   a = a + b;
@@ -505,7 +477,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator>=(const half& a, const half&
 // We need to provide emulated *host-side* FP16 operators for clang.
 #pragma push_macro("EIGEN_DEVICE_FUNC")
 #undef EIGEN_DEVICE_FUNC
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_HAS_NATIVE_GPU_FP16)
+#if defined(EIGEN_CUDACC) && defined(EIGEN_HAS_NATIVE_GPU_FP16)
 #define EIGEN_DEVICE_FUNC __host__
 #else  // both host and device need emulated ops.
 #define EIGEN_DEVICE_FUNC __host__ __device__
@@ -636,7 +608,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR __half_raw raw_uint
 // because this is constexpr function.
 // Fortunately, since we need to disable EIGEN_CONSTEXPR for GPU anyway, we can get out
 // of this catch22 by having separate bodies for GPU / non GPU
-#if defined(EIGEN_HAS_GPU_FP16)
+#if defined(EIGEN_GPUCC)
   __half_raw h;
   h.x = x;
   return h;
@@ -661,8 +633,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC numext::uint16_t raw_half_as_uint16(const
 }
 
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) {
-#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
-    (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+#if defined(EIGEN_GPU_COMPILE_PHASE)
   __half tmp_ff = __float2half(ff);
   return *(__half_raw*)&tmp_ff;
@@ -735,8 +706,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) {
 }
 
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h) {
-#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
-    (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+#if defined(EIGEN_GPU_COMPILE_PHASE)
   return __half2float(h);
 #elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) || defined(EIGEN_HAS_BUILTIN_FLOAT16)
   return static_cast<float>(h.x);
@@ -778,8 +748,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool(isinf)(const half& a) {
 #endif
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool(isnan)(const half& a) {
-#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
-    (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+#if defined(EIGEN_GPU_COMPILE_PHASE)
   return __hisnan(a);
 #elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) || defined(EIGEN_HAS_BUILTIN_FLOAT16)
   return (numext::bit_cast<numext::uint16_t>(a.x) & 0x7fff) > 0x7c00;
@@ -810,16 +779,14 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half abs(const half& a) {
 #endif
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp(const half& a) {
-#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
-    defined(EIGEN_HIP_DEVICE_COMPILE)
+#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE)
   return half(hexp(a));
 #else
   return half(::expf(float(a)));
 #endif
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp2(const half& a) {
-#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
-    defined(EIGEN_HIP_DEVICE_COMPILE)
+#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE)
   return half(hexp2(a));
 #else
   return half(::exp2f(float(a)));
@@ -827,9 +794,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp2(const half& a) {
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half expm1(const half& a) { return half(numext::expm1(float(a))); }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log(const half& a) {
-#if (defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDA_SDK_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && \
-     EIGEN_CUDA_ARCH >= 530) ||                                                                 \
-    (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+#if defined(EIGEN_GPU_COMPILE_PHASE)
   return half(hlog(a));
 #else
   return half(::logf(float(a)));
@@ -842,8 +807,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log2(const half& a) {
 }
 
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sqrt(const half& a) {
-#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
-    defined(EIGEN_HIP_DEVICE_COMPILE)
+#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE)
   return half(hsqrt(a));
 #else
   return half(::sqrtf(float(a)));
@@ -864,16 +828,14 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half acos(const half& a) { return half(::a
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half atan(const half& a) { return half(::atanf(float(a))); }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half atanh(const half& a) { return half(::atanhf(float(a))); }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) {
-#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \
-    defined(EIGEN_HIP_DEVICE_COMPILE)
+#if (defined(EIGEN_CUDA_ARCH)) || defined(EIGEN_HIP_DEVICE_COMPILE)
   return half(hfloor(a));
 #else
   return half(::floorf(float(a)));
 #endif
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) {
-#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \
-    defined(EIGEN_HIP_DEVICE_COMPILE)
+#if (defined(EIGEN_CUDA_ARCH)) || defined(EIGEN_HIP_DEVICE_COMPILE)
   return half(hceil(a));
 #else
   return half(::ceilf(float(a)));
@@ -1007,20 +969,12 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half madd<Eigen::half>(const Eigen:
 }  // namespace numext
 }  // namespace Eigen
 
-// Add the missing shfl* intrinsics.
-// The __shfl* functions are only valid on HIP or _CUDA_ARCH_ >= 300.
-// CUDA defines them for (__CUDA_ARCH__ >= 300 || !defined(__CUDA_ARCH__))
-//
-// HIP and CUDA prior to SDK 9.0 define
-//    __shfl, __shfl_up, __shfl_down, __shfl_xor for int and float
-// CUDA since 9.0 deprecates those and instead defines
-//    __shfl_sync, __shfl_up_sync, __shfl_down_sync, __shfl_xor_sync,
-// with native support for __half and __nv_bfloat16
-//
+// Warp shuffle overloads for Eigen::half.
+// CUDA uses __shfl_*_sync (with mask); HIP uses __shfl_* (no mask).
 // Note that the following are __device__ - only functions.
-#if (defined(EIGEN_CUDACC) && (!defined(EIGEN_CUDA_ARCH) || EIGEN_CUDA_ARCH >= 300)) || defined(EIGEN_HIPCC)
+#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
 
-#if defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDA_SDK_VER >= 90000
+#if defined(EIGEN_CUDACC)
 
 __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_sync(unsigned mask, Eigen::half var, int srcLane,
                                                        int width = warpSize) {
@@ -1046,7 +1000,7 @@ __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor_sync(unsigned mask, Eigen:
   return static_cast<Eigen::half>(__shfl_xor_sync(mask, h, laneMask, width));
 }
 
-#else  // HIP or CUDA SDK < 9.0
+#else  // HIP
 
 __device__ EIGEN_STRONG_INLINE Eigen::half __shfl(Eigen::half var, int srcLane, int width = warpSize) {
   const int ivar = static_cast<int>(Eigen::numext::bit_cast<Eigen::numext::uint16_t>(var));
@@ -1072,7 +1026,7 @@ __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneM
 #endif  // __shfl*
 
 // ldg() has an overload for __half_raw, but we also need one for Eigen::half.
-#if (defined(EIGEN_CUDACC) && (!defined(EIGEN_CUDA_ARCH) || EIGEN_CUDA_ARCH >= 350)) || defined(EIGEN_HIPCC)
+#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
 EIGEN_STRONG_INLINE __device__ Eigen::half __ldg(const Eigen::half* ptr) {
   return Eigen::half_impl::raw_uint16_to_half(__ldg(reinterpret_cast<const Eigen::numext::uint16_t*>(ptr)));
 }
@@ -1095,8 +1049,7 @@ namespace internal {
 template <>
 struct cast_impl<float, half> {
   EIGEN_DEVICE_FUNC static inline half run(const float& a) {
-#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
-    (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+#if defined(EIGEN_GPU_COMPILE_PHASE)
     return __float2half(a);
 #else
     return half(a);
@@ -1107,8 +1060,7 @@ struct cast_impl<float, half> {
 template <>
 struct cast_impl<int, half> {
   EIGEN_DEVICE_FUNC static inline half run(const int& a) {
-#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
-    (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+#if defined(EIGEN_GPU_COMPILE_PHASE)
     return __float2half(static_cast<float>(a));
 #else
     return half(static_cast<float>(a));
@@ -1119,8 +1071,7 @@ struct cast_impl<int, half> {
 template <>
 struct cast_impl<half, float> {
   EIGEN_DEVICE_FUNC static inline float run(const half& a) {
-#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
-    (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+#if defined(EIGEN_GPU_COMPILE_PHASE)
     return __half2float(a);
 #else
     return static_cast<float>(a);
@@ -17,19 +17,8 @@ namespace Eigen {
 
 namespace internal {
 
-// Read-only data cached load available.
-#if defined(EIGEN_HIP_DEVICE_COMPILE) || (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350)
-#define EIGEN_GPU_HAS_LDG 1
-#endif
-
-// FP16 math available.
-#if (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530)
-#define EIGEN_CUDA_HAS_FP16_ARITHMETIC 1
-#endif
-
-#if defined(EIGEN_HIP_DEVICE_COMPILE) || defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
-#define EIGEN_GPU_HAS_FP16_ARITHMETIC 1
-#endif
+// Read-only data cached load (__ldg) and native FP16 arithmetic are available
+// on all supported GPU architectures (sm_70+ for CUDA, GFX906+ for HIP).
 
 // We need to distinguish 'clang as the CUDA compiler' from 'clang as the host compiler,
 // invoked by NVCC' (e.g. on MacOS). The former needs to see both host and device implementation
@@ -56,92 +45,84 @@ struct is_arithmetic<double2> {
 
 template <>
 struct packet_traits<float> : default_packet_traits {
-  typedef float4 type;
-  typedef float4 half;
-  enum {
-    Vectorizable = 1,
-    AlignedOnScalar = 1,
-    size = 4,
+  using type = float4;
+  using half = float4;
+  static constexpr int Vectorizable = 1;
+  static constexpr int AlignedOnScalar = 1;
+  static constexpr int size = 4;
 
-    HasDiv = 1,
-    HasSin = 0,
-    HasCos = 0,
-    HasLog = 1,
-    HasExp = 1,
-    HasSqrt = 1,
-    HasRsqrt = 1,
-    HasLGamma = 1,
-    HasDiGamma = 1,
-    HasZeta = 1,
-    HasPolygamma = 1,
-    HasErf = 1,
-    HasErfc = 1,
-    HasNdtri = 1,
-    HasBessel = 1,
-    HasIGamma = 1,
-    HasIGammaDerA = 1,
-    HasGammaSampleDerAlpha = 1,
-    HasIGammac = 1,
-    HasBetaInc = 1,
+  static constexpr int HasDiv = 1;
+  static constexpr int HasSin = 0;
+  static constexpr int HasCos = 0;
+  static constexpr int HasLog = 1;
+  static constexpr int HasExp = 1;
+  static constexpr int HasSqrt = 1;
+  static constexpr int HasRsqrt = 1;
+  static constexpr int HasLGamma = 1;
+  static constexpr int HasDiGamma = 1;
+  static constexpr int HasZeta = 1;
+  static constexpr int HasPolygamma = 1;
+  static constexpr int HasErf = 1;
+  static constexpr int HasErfc = 1;
+  static constexpr int HasNdtri = 1;
+  static constexpr int HasBessel = 1;
+  static constexpr int HasIGamma = 1;
+  static constexpr int HasIGammaDerA = 1;
+  static constexpr int HasGammaSampleDerAlpha = 1;
+  static constexpr int HasIGammac = 1;
+  static constexpr int HasBetaInc = 1;
 
-    HasFloor = 1,
-    HasCmp = EIGEN_HAS_GPU_DEVICE_FUNCTIONS
-  };
+  static constexpr int HasFloor = 1;
+  static constexpr int HasCmp = EIGEN_HAS_GPU_DEVICE_FUNCTIONS;
 };
 
 template <>
 struct packet_traits<double> : default_packet_traits {
-  typedef double2 type;
-  typedef double2 half;
-  enum {
-    Vectorizable = 1,
-    AlignedOnScalar = 1,
-    size = 2,
+  using type = double2;
+  using half = double2;
+  static constexpr int Vectorizable = 1;
+  static constexpr int AlignedOnScalar = 1;
+  static constexpr int size = 2;
 
-    HasDiv = 1,
-    HasLog = 1,
-    HasExp = 1,
-    HasSqrt = 1,
-    HasRsqrt = 1,
-    HasLGamma = 1,
-    HasDiGamma = 1,
-    HasZeta = 1,
-    HasPolygamma = 1,
-    HasErf = 1,
-    HasErfc = 1,
-    HasNdtri = 1,
-    HasBessel = 1,
-    HasIGamma = 1,
-    HasIGammaDerA = 1,
-    HasGammaSampleDerAlpha = 1,
-    HasIGammac = 1,
-    HasBetaInc = 1,
-  };
+  static constexpr int HasDiv = 1;
+  static constexpr int HasLog = 1;
+  static constexpr int HasExp = 1;
+  static constexpr int HasSqrt = 1;
+  static constexpr int HasRsqrt = 1;
+  static constexpr int HasLGamma = 1;
+  static constexpr int HasDiGamma = 1;
+  static constexpr int HasZeta = 1;
+  static constexpr int HasPolygamma = 1;
+  static constexpr int HasErf = 1;
+  static constexpr int HasErfc = 1;
+  static constexpr int HasNdtri = 1;
+  static constexpr int HasBessel = 1;
+  static constexpr int HasIGamma = 1;
+  static constexpr int HasIGammaDerA = 1;
+  static constexpr int HasGammaSampleDerAlpha = 1;
+  static constexpr int HasIGammac = 1;
+  static constexpr int HasBetaInc = 1;
 };
 
 template <>
 struct unpacket_traits<float4> {
-  typedef float type;
-  enum {
-    size = 4,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
-  typedef float4 half;
+  using type = float;
+  static constexpr int size = 4;
+  static constexpr int alignment = Aligned16;
+  static constexpr bool vectorizable = true;
+  static constexpr bool masked_load_available = false;
+  static constexpr bool masked_store_available = false;
+  using half = float4;
 };
 template <>
 struct unpacket_traits<double2> {
-  typedef double type;
-  enum {
-    size = 2,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
-  typedef double2 half;
+  using type = double;
+  static constexpr int size = 2;
+  static constexpr int alignment = Aligned16;
+  static constexpr bool vectorizable = true;
+  static constexpr bool masked_load_available = false;
+  static constexpr bool masked_store_available = false;
+  using half = double2;
 };
 
 template <>
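The enum-to-constexpr conversion is mechanical but not cosmetic: enum members all collapse to an unnamed integer type, while static constexpr members keep real types. A minimal sketch of the difference (illustrative, not from the patch):

struct traits_old {
  enum { size = 4, vectorizable = true };  // both members become plain ints
};
struct traits_new {
  static constexpr int size = 4;
  static constexpr bool vectorizable = true;  // genuinely bool
};
static_assert(traits_new::size == 4, "still a constant expression");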
@@ -403,7 +384,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const dou
 
 template <>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const float* from) {
-#if defined(EIGEN_GPU_HAS_LDG)
+#if defined(EIGEN_GPU_COMPILE_PHASE)
   return __ldg(reinterpret_cast<const float4*>(from));
 #else
   return make_float4(from[0], from[1], from[2], from[3]);
@@ -411,7 +392,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const fl
 }
 template <>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const double* from) {
-#if defined(EIGEN_GPU_HAS_LDG)
+#if defined(EIGEN_GPU_COMPILE_PHASE)
   return __ldg(reinterpret_cast<const double2*>(from));
 #else
   return make_double2(from[0], from[1]);
@@ -420,7 +401,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const
 
 template <>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Unaligned>(const float* from) {
-#if defined(EIGEN_GPU_HAS_LDG)
+#if defined(EIGEN_GPU_COMPILE_PHASE)
   return make_float4(__ldg(from + 0), __ldg(from + 1), __ldg(from + 2), __ldg(from + 3));
 #else
   return make_float4(from[0], from[1], from[2], from[3]);
@@ -428,7 +409,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Unaligned>(const
 }
 template <>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Unaligned>(const double* from) {
-#if defined(EIGEN_GPU_HAS_LDG)
+#if defined(EIGEN_GPU_COMPILE_PHASE)
   return make_double2(__ldg(from + 0), __ldg(from + 1));
 #else
   return make_double2(from[0], from[1]);
@@ -591,23 +572,20 @@ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<double2, 2>& kernel) {
|
||||
|
||||
#endif // defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)
|
||||
|
||||
// Half-packet functions are not available on the host for CUDA 9.0-9.2, only
|
||||
// on device. There is no benefit to using them on the host anyways, since they are
|
||||
// emulated.
|
||||
#if (defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)) && defined(EIGEN_GPU_COMPILE_PHASE)
|
||||
// Half-packet functions are only available in GPU device compilation — they use
|
||||
// intrinsics (__half2, etc.) that have no host-side benefit.
|
||||
#if defined(EIGEN_GPU_COMPILE_PHASE)
|
||||
|
||||
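// Illustrative sketch of what the guard above means: an nvcc/hipcc build
// compiles each translation unit once per phase, and only the device phase
// defines EIGEN_GPU_COMPILE_PHASE, so fp16 intrinsics can be used
// unconditionally below it. For example:
//
//   #if defined(EIGEN_GPU_COMPILE_PHASE)
//     __half2 s = __hadd2(a, b);  // device pass: native fp16 intrinsic
//   #else
//     // host pass: the half packets below are never instantiated here
//   #endif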

typedef ulonglong2 Packet4h2;
using Packet4h2 = ulonglong2;
template <>
struct unpacket_traits<Packet4h2> {
  typedef Eigen::half type;
  enum {
    size = 8,
    alignment = Aligned16,
    vectorizable = true,
    masked_load_available = false,
    masked_store_available = false
  };
  typedef Packet4h2 half;
  using type = Eigen::half;
  static constexpr int size = 8;
  static constexpr int alignment = Aligned16;
  static constexpr bool vectorizable = true;
  static constexpr bool masked_load_available = false;
  static constexpr bool masked_store_available = false;
  using half = Packet4h2;
};
template <>
struct is_arithmetic<Packet4h2> {
@@ -616,15 +594,13 @@ struct is_arithmetic<Packet4h2> {

template <>
struct unpacket_traits<half2> {
  typedef Eigen::half type;
  enum {
    size = 2,
    alignment = Aligned16,
    vectorizable = true,
    masked_load_available = false,
    masked_store_available = false
  };
  typedef half2 half;
  using type = Eigen::half;
  static constexpr int size = 2;
  static constexpr int alignment = Aligned16;
  static constexpr bool vectorizable = true;
  static constexpr bool masked_load_available = false;
  static constexpr bool masked_store_available = false;
  using half = half2;
};
template <>
struct is_arithmetic<half2> {
@@ -633,23 +609,21 @@ struct is_arithmetic<half2> {

template <>
struct packet_traits<Eigen::half> : default_packet_traits {
  typedef Packet4h2 type;
  typedef Packet4h2 half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 8,
    HasAdd = 1,
    HasSub = 1,
    HasMul = 1,
    HasDiv = 1,
    HasSqrt = 1,
    HasRsqrt = 1,
    HasExp = 1,
    HasExpm1 = 1,
    HasLog = 1,
    HasLog1p = 1
  };
  using type = Packet4h2;
  using half = Packet4h2;
  static constexpr int Vectorizable = 1;
  static constexpr int AlignedOnScalar = 1;
  static constexpr int size = 8;
  static constexpr int HasAdd = 1;
  static constexpr int HasSub = 1;
  static constexpr int HasMul = 1;
  static constexpr int HasDiv = 1;
  static constexpr int HasSqrt = 1;
  static constexpr int HasRsqrt = 1;
  static constexpr int HasExp = 1;
  static constexpr int HasExpm1 = 1;
  static constexpr int HasLog = 1;
  static constexpr int HasLog1p = 1;
};

template <>
@@ -690,7 +664,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const half2&
}

EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_aligned(const Eigen::half* from) {
#if defined(EIGEN_GPU_HAS_LDG)
#if defined(EIGEN_GPU_COMPILE_PHASE)
  // Input is guaranteed to be properly aligned.
  return __ldg(reinterpret_cast<const half2*>(from));
#else
@@ -699,7 +673,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_aligned(const Eigen::half*
}

EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_unaligned(const Eigen::half* from) {
#if defined(EIGEN_GPU_HAS_LDG)
#if defined(EIGEN_GPU_COMPILE_PHASE)
  return __halves2half2(__ldg(from + 0), __ldg(from + 1));
#else
  return __halves2half2(*(from + 0), *(from + 1));
@@ -745,12 +719,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<half2, 2>& ker
}

EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset(const Eigen::half& a) {
#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
  return __halves2half2(a, __hadd(a, __float2half(1.0f)));
#else
  float f = __half2float(a) + 1.0f;
  return __halves2half2(a, __float2half(f));
#endif
}

EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pselect(const half2& mask, const half2& a, const half2& b) {
@@ -837,89 +806,21 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pandnot(const half2& a, const half2&
  return __halves2half2(result1, result2);
}

EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd(const half2& a, const half2& b) {
#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
  return __hadd2(a, b);
#else
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  float b1 = __low2float(b);
  float b2 = __high2float(b);
  float r1 = a1 + b1;
  float r2 = a2 + b2;
  return __floats2half2_rn(r1, r2);
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd(const half2& a, const half2& b) { return __hadd2(a, b); }

EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub(const half2& a, const half2& b) {
#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
  return __hsub2(a, b);
#else
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  float b1 = __low2float(b);
  float b2 = __high2float(b);
  float r1 = a1 - b1;
  float r2 = a2 - b2;
  return __floats2half2_rn(r1, r2);
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub(const half2& a, const half2& b) { return __hsub2(a, b); }

EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
  return __hneg2(a);
#else
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  return __floats2half2_rn(-a1, -a2);
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(const half2& a) { return __hneg2(a); }

EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }

EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul(const half2& a, const half2& b) {
#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
  return __hmul2(a, b);
#else
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  float b1 = __low2float(b);
  float b2 = __high2float(b);
  float r1 = a1 * b1;
  float r2 = a2 * b2;
  return __floats2half2_rn(r1, r2);
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul(const half2& a, const half2& b) { return __hmul2(a, b); }

EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd(const half2& a, const half2& b, const half2& c) {
#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
  return __hfma2(a, b, c);
#else
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  float b1 = __low2float(b);
  float b2 = __high2float(b);
  float c1 = __low2float(c);
  float c2 = __high2float(c);
  float r1 = a1 * b1 + c1;
  float r2 = a2 * b2 + c2;
  return __floats2half2_rn(r1, r2);
#endif
}

EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv(const half2& a, const half2& b) {
#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
  return __h2div(a, b);
#else
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  float b1 = __low2float(b);
  float b2 = __high2float(b);
  float r1 = a1 / b1;
  float r2 = a2 / b2;
  return __floats2half2_rn(r1, r2);
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv(const half2& a, const half2& b) { return __h2div(a, b); }

EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin(const half2& a, const half2& b) {
  float a1 = __low2float(a);
@@ -942,47 +843,23 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax(const half2& a, const half2& b)
}

EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux(const half2& a) {
#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
  return __hadd(__low2half(a), __high2half(a));
#else
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  return Eigen::half(__float2half(a1 + a2));
#endif
}

EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max(const half2& a) {
#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
  __half first = __low2half(a);
  __half second = __high2half(a);
  return __hgt(first, second) ? first : second;
#else
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  return a1 > a2 ? __low2half(a) : __high2half(a);
#endif
}

EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min(const half2& a) {
#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
  __half first = __low2half(a);
  __half second = __high2half(a);
  return __hlt(first, second) ? first : second;
#else
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  return a1 < a2 ? __low2half(a) : __high2half(a);
#endif
}

EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_mul(const half2& a) {
#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
  return __hmul(__low2half(a), __high2half(a));
#else
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  return Eigen::half(__float2half(a1 * a2));
#endif
}

EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog1p(const half2& a) {
@@ -1001,8 +878,6 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexpm1(const half2& a) {
  return __floats2half2_rn(r1, r2);
}

#if (EIGEN_CUDA_SDK_VER >= 80000 && defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)) || defined(EIGEN_HIP_DEVICE_COMPILE)

EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog(const half2& a) { return h2log(a); }

EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexp(const half2& a) { return h2exp(a); }
@@ -1010,41 +885,6 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexp(const half2& a) { return h2exp(
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psqrt(const half2& a) { return h2sqrt(a); }

EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 prsqrt(const half2& a) { return h2rsqrt(a); }

#else

EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog(const half2& a) {
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  float r1 = logf(a1);
  float r2 = logf(a2);
  return __floats2half2_rn(r1, r2);
}

EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexp(const half2& a) {
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  float r1 = expf(a1);
  float r2 = expf(a2);
  return __floats2half2_rn(r1, r2);
}

EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psqrt(const half2& a) {
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  float r1 = sqrtf(a1);
  float r2 = sqrtf(a2);
  return __floats2half2_rn(r1, r2);
}

EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 prsqrt(const half2& a) {
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  float r1 = rsqrtf(a1);
  float r2 = rsqrtf(a2);
  return __floats2half2_rn(r1, r2);
}
#endif
}  // namespace

template <>
@@ -1091,19 +931,17 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to,

template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4h2 ploadt_ro<Packet4h2, Aligned>(const Eigen::half* from) {
#if defined(EIGEN_GPU_HAS_LDG)
  Packet4h2 r;
#if defined(EIGEN_GPU_COMPILE_PHASE)
  r = __ldg(reinterpret_cast<const Packet4h2*>(from));
  return r;
#else
  Packet4h2 r;
  half2* r_alias = reinterpret_cast<half2*>(&r);
  r_alias[0] = ploadt_ro_aligned(from + 0);
  r_alias[1] = ploadt_ro_aligned(from + 2);
  r_alias[2] = ploadt_ro_aligned(from + 4);
  r_alias[3] = ploadt_ro_aligned(from + 6);
  return r;
#endif
  return r;
}

template <>
@@ -1272,7 +1110,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 plset<Packet4h2>(const Eigen::ha
  p_alias[2] = __halves2half2(__hadd(a, __float2half(4.0f)), __hadd(a, __float2half(5.0f)));
  p_alias[3] = __halves2half2(__hadd(a, __float2half(6.0f)), __hadd(a, __float2half(7.0f)));
  return r;
#elif defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
#elif defined(EIGEN_CUDA_ARCH)
  Packet4h2 r;
  half2* r_alias = reinterpret_cast<half2*>(&r);

@@ -1290,16 +1128,6 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 plset<Packet4h2>(const Eigen::ha
  r_alias[3] = plset(__high2half(c));

  return r;

#else
  float f = __half2float(a);
  Packet4h2 r;
  half2* p_alias = reinterpret_cast<half2*>(&r);
  p_alias[0] = __halves2half2(a, __float2half(f + 1.0f));
  p_alias[1] = __halves2half2(__float2half(f + 2.0f), __float2half(f + 3.0f));
  p_alias[2] = __halves2half2(__float2half(f + 4.0f), __float2half(f + 5.0f));
  p_alias[3] = __halves2half2(__float2half(f + 6.0f), __float2half(f + 7.0f));
  return r;
#endif
}

@@ -1533,7 +1361,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max<Packet4h2>(const Pa
  half2 m1 = __halves2half2(predux_max(a_alias[2]), predux_max(a_alias[3]));
  __half first = predux_max(m0);
  __half second = predux_max(m1);
#if defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
#if defined(EIGEN_CUDA_ARCH)
  return (__hgt(first, second) ? first : second);
#else
  float ffirst = __half2float(first);
@@ -1549,7 +1377,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min<Packet4h2>(const Pa
  half2 m1 = __halves2half2(predux_min(a_alias[2]), predux_min(a_alias[3]));
  __half first = predux_min(m0);
  __half second = predux_min(m1);
#if defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
#if defined(EIGEN_CUDA_ARCH)
  return (__hlt(first, second) ? first : second);
#else
  float ffirst = __half2float(first);
@@ -1641,47 +1469,17 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 prsqrt<Packet4h2>(const Packet4h
// the implementation of GPU half reduction.
template <>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
  return __hadd2(a, b);
#else
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  float b1 = __low2float(b);
  float b2 = __high2float(b);
  float r1 = a1 + b1;
  float r2 = a2 + b2;
  return __floats2half2_rn(r1, r2);
#endif
}

template <>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) {
#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
  return __hmul2(a, b);
#else
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  float b1 = __low2float(b);
  float b2 = __high2float(b);
  float r1 = a1 * b1;
  float r2 = a2 * b2;
  return __floats2half2_rn(r1, r2);
#endif
}

template <>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) {
#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
  return __h2div(a, b);
#else
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  float b1 = __low2float(b);
  float b2 = __high2float(b);
  float r1 = a1 / b1;
  float r2 = a2 / b2;
  return __floats2half2_rn(r1, r2);
#endif
}

template <>
@@ -1706,11 +1504,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, const ha
  return __halves2half2(r1, r2);
}

#endif  // (defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)) && defined(EIGEN_GPU_COMPILE_PHASE)

#undef EIGEN_GPU_HAS_LDG
#undef EIGEN_CUDA_HAS_FP16_ARITHMETIC
#undef EIGEN_GPU_HAS_FP16_ARITHMETIC
#endif  // defined(EIGEN_GPU_COMPILE_PHASE)

} // end namespace internal

@@ -17,8 +17,7 @@ namespace Eigen {

namespace internal {

#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
    (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
#if defined(EIGEN_GPU_COMPILE_PHASE)

template <>
struct type_casting_traits<Eigen::half, float> {

@@ -541,12 +541,6 @@ extern "C" {
#if defined EIGEN_CUDACC
#define EIGEN_VECTORIZE_GPU
#include <vector_types.h>
#if EIGEN_CUDA_SDK_VER >= 70500
#define EIGEN_HAS_CUDA_FP16
#endif
#endif

#if defined(EIGEN_HAS_CUDA_FP16)
#include <cuda_runtime_api.h>
#include <cuda_fp16.h>
#endif
@@ -554,7 +548,6 @@ extern "C" {
#if defined(EIGEN_HIPCC)
#define EIGEN_VECTORIZE_GPU
#include <hip/hip_vector_types.h>
#define EIGEN_HAS_HIP_FP16
#include <hip/hip_fp16.h>
#define EIGEN_HAS_HIP_BF16
#include <hip/hip_bfloat16.h>

@@ -84,8 +84,7 @@
#endif

#if defined __NVCC__ && defined __CUDACC__
// MSVC 14.16 (required by CUDA 9.*) does not support the _Pragma keyword, so
// we instead use Microsoft's __pragma extension.
// MSVC does not support the _Pragma keyword, so we use Microsoft's __pragma extension.
#if defined _MSC_VER
#define EIGEN_MAKE_PRAGMA(X) __pragma(#X)
#else

@@ -148,13 +148,8 @@
#endif

#if defined(__NVCC__)
#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
// CUDA 11.4+ always defines __CUDACC_VER_MAJOR__.
#define EIGEN_COMP_NVCC ((__CUDACC_VER_MAJOR__ * 10000) + (__CUDACC_VER_MINOR__ * 100))
#elif defined(__CUDACC_VER__)
#define EIGEN_COMP_NVCC __CUDACC_VER__
#else
#error "NVCC did not define compiler version."
#endif
#else
#define EIGEN_COMP_NVCC 0
#endif
@@ -575,6 +570,10 @@
#define EIGEN_CUDA_SDK_VER 0
#endif

#if defined(EIGEN_CUDACC) && EIGEN_CUDA_SDK_VER > 0 && EIGEN_CUDA_SDK_VER < 110400
#error "Eigen requires CUDA 11.4 or later."
#endif

#if defined(__HIPCC__) && !defined(EIGEN_NO_HIP) && !defined(__SYCL_DEVICE_ONLY__)
// Means the compiler is HIPCC (analogous to EIGEN_CUDACC, but for HIP)
#define EIGEN_HIPCC __HIPCC__
@@ -584,22 +583,20 @@
// ++ host_defines.h which contains the defines for the __host__ and __device__ macros
#include <hip/hip_runtime.h>

// Eigen requires ROCm/HIP >= 5.6 (GFX906 minimum architecture).
// This floor exists to allow simplifying shared CUDA/HIP preprocessor guards —
// all __HIP_ARCH_HAS_WARP_SHUFFLE__, __HIP_ARCH_HAS_FP16__, etc. are always true on GFX906+.
#if defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR < 5 || (HIP_VERSION_MAJOR == 5 && HIP_VERSION_MINOR < 6))
#error "Eigen requires ROCm/HIP >= 5.6."
#endif

#if defined(__HIP_DEVICE_COMPILE__) && !defined(__SYCL_DEVICE_ONLY__)
// analogous to EIGEN_CUDA_ARCH, but for HIP
#define EIGEN_HIP_DEVICE_COMPILE __HIP_DEVICE_COMPILE__
#endif

// For HIP (ROCm 3.5 and higher), we need to explicitly set the launch_bounds attribute
// value to 1024. The compiler assigns a default value of 256 when the attribute is not
// specified. This results in failures on the HIP platform, for cases when a GPU kernel
// without an explicit launch_bounds attribute is called with a threads_per_block value
// greater than 256.
//
// This is a regression in functionality and is expected to be fixed within the next
// couple of ROCm releases (compiler will go back to using 1024 value as the default)
//
// In the meantime, we will use a "only enabled for HIP" macro to set the launch_bounds
// attribute.
// HIP compilers default to launch_bounds(256), which causes failures when kernels
// are called with more than 256 threads per block. Explicitly set to 1024 for HIP.

#define EIGEN_HIP_LAUNCH_BOUNDS_1024 __launch_bounds__(1024)
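// Usage sketch (illustrative): annotate any kernel that may be launched with
// up to 1024 threads per block. On non-HIP builds the macro is expected to
// expand to nothing, so the annotation is harmless in shared CUDA/HIP code.
//
//   __global__ void EIGEN_HIP_LAUNCH_BOUNDS_1024 scale_kernel(float* x, int n) {
//     int i = blockIdx.x * blockDim.x + threadIdx.x;
//     if (i < n) x[i] *= 2.0f;
//   }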

@@ -25,7 +25,7 @@ namespace internal {
template <typename SolverType, int Size, bool IsComplex>
struct direct_selfadjoint_eigenvalues;

template <typename MatrixType, typename DiagType, typename SubDiagType>
template <bool PerBlockScaling, typename MatrixType, typename DiagType, typename SubDiagType>
EIGEN_DEVICE_FUNC ComputationInfo computeFromTridiagonal_impl(DiagType& diag, SubDiagType& subdiag,
                                                              const Index maxIterations, bool computeEigenvectors,
                                                              MatrixType& eivec);
@@ -438,7 +438,7 @@ EIGEN_DEVICE_FUNC SelfAdjointEigenSolver<MatrixType>& SelfAdjointEigenSolver<Mat
    m_eivec = matrix;
    m_eivalues.coeffRef(0, 0) = numext::real(m_eivec.coeff(0, 0));
    if (computeEigenvectors) m_eivec.setOnes(n, n);
    m_info = Success;
    m_info = (numext::isfinite)(m_eivalues.coeffRef(0, 0)) ? Success : NoConvergence;
    m_isInitialized = true;
    m_eigenvectorsOk = computeEigenvectors;
    return *this;
@@ -448,7 +448,11 @@ EIGEN_DEVICE_FUNC SelfAdjointEigenSolver<MatrixType>& SelfAdjointEigenSolver<Mat
  RealVectorType& diag = m_eivalues;
  EigenvectorsType& mat = m_eivec;

  // map the matrix coefficients to [-1:1] to avoid over- and underflow.
  // Scale the matrix to [-1:1] to avoid overflow/underflow during tridiagonalization
  // and subsequent QR iteration. This uniform scaling ensures the tridiagonal output is
  // well-conditioned. Note: for block-diagonal matrices with widely separated scales, this
  // can underflow small blocks. Users with such matrices should tridiagonalize separately
  // and call computeFromTridiagonal(), which uses per-block scaling.
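  // For example (illustrative sketch), eigenvalues of a badly scaled symmetric
  // matrix can be routed through the per-block-scaling path by tridiagonalizing
  // first and handing the diagonals to computeFromTridiagonal():
  //
  //   Tridiagonalization<MatrixXd> tri(A);
  //   SelfAdjointEigenSolver<MatrixXd> es;
  //   es.computeFromTridiagonal(tri.diagonal(), tri.subDiagonal(), EigenvaluesOnly);
  //   // es.eigenvalues() holds the eigenvalues of A; for eigenvectors of A the
  //   // tridiagonal eigenvectors would have to be combined with tri.matrixQ().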
  mat = matrix.template triangularView<Lower>();
  RealScalar scale = mat.cwiseAbs().maxCoeff();
  if (!(numext::isfinite)(scale)) {
@@ -464,9 +468,9 @@ EIGEN_DEVICE_FUNC SelfAdjointEigenSolver<MatrixType>& SelfAdjointEigenSolver<Mat
  m_hcoeffs.resize(n - 1);
  internal::tridiagonalization_inplace(mat, diag, m_subdiag, m_hcoeffs, m_workspace, computeEigenvectors);

  m_info = internal::computeFromTridiagonal_impl(diag, m_subdiag, m_maxIterations, computeEigenvectors, m_eivec);
  m_info = internal::computeFromTridiagonal_impl<false>(diag, m_subdiag, m_maxIterations, computeEigenvectors, m_eivec);

  // scale back the eigen values
  // Scale back the eigenvalues.
  m_eivalues *= scale;

  m_isInitialized = true;
@@ -482,30 +486,26 @@ SelfAdjointEigenSolver<MatrixType>& SelfAdjointEigenSolver<MatrixType>::computeF
  m_eivalues = diag;
  m_subdiag = subdiag;

  // Scale the tridiagonal matrix to [-1:1] to avoid over- and underflow,
  // just like compute() does for the full matrix.
  RealScalar scale = m_eivalues.cwiseAbs().maxCoeff();
  if (m_subdiag.size() > 0) scale = numext::maxi(scale, m_subdiag.cwiseAbs().maxCoeff());
  if (!(numext::isfinite)(scale)) {
    m_info = NoConvergence;
    m_isInitialized = true;
    m_eigenvectorsOk = false;
    return *this;
  }
  if (numext::is_exactly_zero(scale)) scale = RealScalar(1);
  const bool needsScaling = scale != RealScalar(1);
  if (needsScaling) {
    m_eivalues /= scale;
    m_subdiag /= scale;
  // Check for Inf/NaN in the input.
  {
    RealScalar scale = RealScalar(0);
    if (m_eivalues.size() > 0) scale = m_eivalues.cwiseAbs().maxCoeff();
    if (m_subdiag.size() > 0) scale = numext::maxi(scale, m_subdiag.cwiseAbs().maxCoeff());
    if (!(numext::isfinite)(scale)) {
      m_info = NoConvergence;
      m_isInitialized = true;
      m_eigenvectorsOk = false;
      return *this;
    }
  }

  if (computeEigenvectors) {
    m_eivec.setIdentity(diag.size(), diag.size());
  }
  m_info = internal::computeFromTridiagonal_impl(m_eivalues, m_subdiag, m_maxIterations, computeEigenvectors, m_eivec);

  // Scale back the eigenvalues.
  if (needsScaling) m_eivalues *= scale;
  // Use per-deflation-block scaling (like LAPACK's DSTERF) to avoid losing
  // precision when the tridiagonal entries span a wide range of magnitudes.
  m_info =
      internal::computeFromTridiagonal_impl<true>(m_eivalues, m_subdiag, m_maxIterations, computeEigenvectors, m_eivec);

  m_isInitialized = true;
  m_eigenvectorsOk = computeEigenvectors;
@@ -517,6 +517,10 @@ namespace internal {
 * \internal
 * \brief Compute the eigendecomposition from a tridiagonal matrix
 *
 * \tparam PerBlockScaling If true, each deflation block is independently scaled to [-1,1] before
 * QR iteration, following LAPACK's DSTERF approach. This prevents precision loss when entries
 * span a wide range of magnitudes. When false, the caller is responsible for ensuring the
 * entries are in a safe range (e.g. by pre-scaling the dense matrix before tridiagonalization).
 * \param[in,out] diag : On input, the diagonal of the matrix, on output the eigenvalues
 * \param[in,out] subdiag : The subdiagonal part of the matrix (entries are modified during the decomposition)
 * \param[in] maxIterations : the maximum number of iterations
@@ -524,7 +528,7 @@ namespace internal {
 * \param[out] eivec : The matrix to store the eigenvectors if computeEigenvectors==true. Must be allocated on input.
 * \returns \c Success or \c NoConvergence
 */
template <typename MatrixType, typename DiagType, typename SubDiagType>
template <bool PerBlockScaling, typename MatrixType, typename DiagType, typename SubDiagType>
EIGEN_DEVICE_FUNC ComputationInfo computeFromTridiagonal_impl(DiagType& diag, SubDiagType& subdiag,
                                                              const Index maxIterations, bool computeEigenvectors,
                                                              MatrixType& eivec) {
@@ -539,21 +543,32 @@ EIGEN_DEVICE_FUNC ComputationInfo computeFromTridiagonal_impl(DiagType& diag, Su
  typedef typename DiagType::RealScalar RealScalar;
  const RealScalar considerAsZero = (std::numeric_limits<RealScalar>::min)();
  const RealScalar precision_inv = RealScalar(1) / NumTraits<RealScalar>::epsilon();
  while (end > 0) {
    for (Index i = start; i < end; ++i) {

  // Helper lambda for the deflation test.
  auto deflate = [&](Index lo, Index hi) {
    for (Index i = lo; i < hi; ++i) {
      if (numext::abs(subdiag[i]) < considerAsZero) {
        subdiag[i] = RealScalar(0);
      } else {
        // abs(subdiag[i]) <= epsilon * sqrt(abs(diag[i]) + abs(diag[i+1]))
        // Scaled to prevent underflows.
        const RealScalar scaled_subdiag = precision_inv * subdiag[i];
        if (scaled_subdiag * scaled_subdiag <= (numext::abs(diag[i]) + numext::abs(diag[i + 1]))) {
          subdiag[i] = RealScalar(0);
        }
      }
    }
  };
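  // In exact terms, the deflation test above is the standard criterion
  //   |subdiag[i]| <= eps * sqrt(|diag[i]| + |diag[i+1]|),
  // rewritten as (subdiag[i] / eps)^2 <= |diag[i]| + |diag[i+1]| so that
  // squaring a tiny subdiagonal entry cannot underflow before the comparison.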

  // find the largest unreduced block at the end of the matrix.
  // For per-block scaling, track the currently scaled block and its scale factor.
  // When the outer loop identifies a block outside the scaled region, unscale the old
  // block and scale the new one. This keeps the same outer loop structure (one QR step
  // per iteration) while ensuring each block is processed in scaled coordinates.
  Index scaled_start = -1, scaled_end = -1;
  RealScalar block_scale = RealScalar(1);

  while (end > 0) {
    deflate(start, end);

    // Find the largest unreduced block at the end of the matrix.
    while (end > 0 && numext::is_exactly_zero(subdiag[end - 1])) {
      end--;
    }
@@ -566,9 +581,42 @@ EIGEN_DEVICE_FUNC ComputationInfo computeFromTridiagonal_impl(DiagType& diag, Su
    start = end - 1;
    while (start > 0 && !numext::is_exactly_zero(subdiag[start - 1])) start--;

    if (PerBlockScaling) {
      // Check if we've moved to a different block than the one currently scaled.
      if (start != scaled_start || end != scaled_end) {
        // Unscale the previous block if it was scaled.
        if (block_scale != RealScalar(1)) {
          for (Index i = scaled_start; i <= scaled_end; ++i) diag[i] /= block_scale;
          for (Index i = scaled_start; i < scaled_end; ++i) {
            if (!numext::is_exactly_zero(subdiag[i])) subdiag[i] /= block_scale;
          }
          block_scale = RealScalar(1);
        }
        // Compute the norm and scale the new block to [-1:1].
        RealScalar block_norm = RealScalar(0);
        for (Index i = start; i <= end; ++i) block_norm = numext::maxi(block_norm, numext::abs(diag[i]));
        for (Index i = start; i < end; ++i) block_norm = numext::maxi(block_norm, numext::abs(subdiag[i]));
        if (block_norm > RealScalar(0) && block_norm != RealScalar(1)) {
          block_scale = RealScalar(1) / block_norm;
          for (Index i = start; i <= end; ++i) diag[i] *= block_scale;
          for (Index i = start; i < end; ++i) subdiag[i] *= block_scale;
        }
        scaled_start = start;
        scaled_end = end;
      }
    }

    internal::tridiagonal_qr_step<MatrixType::Flags & RowMajorBit ? RowMajor : ColMajor>(
        diag.data(), subdiag.data(), start, end, computeEigenvectors ? eivec.data() : (Scalar*)0, n);
  }

  // Unscale any remaining scaled block.
  if (PerBlockScaling && block_scale != RealScalar(1)) {
    for (Index i = scaled_start; i <= scaled_end; ++i) diag[i] /= block_scale;
    for (Index i = scaled_start; i < scaled_end; ++i) {
      if (!numext::is_exactly_zero(subdiag[i])) subdiag[i] /= block_scale;
    }
  }
  if (iter <= maxIterations * n)
    info = Success;
  else

233
Eigen/src/GPU/CuBlasSupport.h
Normal file
@@ -0,0 +1,233 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

// cuBLAS-specific support types:
// - Error-checking macro
// - Operation enum and mapping to cublasOperation_t
//
// Generic CUDA runtime utilities (DeviceBuffer, cuda_data_type) are in GpuSupport.h.

#ifndef EIGEN_GPU_CUBLAS_SUPPORT_H
#define EIGEN_GPU_CUBLAS_SUPPORT_H

// IWYU pragma: private
#include "./InternalHeaderCheck.h"

#include "./GpuSupport.h"
#include <cublas_v2.h>

namespace Eigen {
namespace internal {

// ---- Error-checking macro ---------------------------------------------------

#define EIGEN_CUBLAS_CHECK(expr)                                       \
  do {                                                                 \
    cublasStatus_t _s = (expr);                                        \
    eigen_assert(_s == CUBLAS_STATUS_SUCCESS && "cuBLAS call failed"); \
  } while (0)
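// Usage sketch (illustrative): wrap every cuBLAS call so failures trip an
// assertion in debug builds.
//
//   cublasHandle_t handle;
//   EIGEN_CUBLAS_CHECK(cublasCreate(&handle));
//   ...
//   EIGEN_CUBLAS_CHECK(cublasDestroy(handle));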

// ---- Operation enum ---------------------------------------------------------
// Maps transpose/adjoint flags to cublasOperation_t.

enum class GpuOp { NoTrans, Trans, ConjTrans };

constexpr cublasOperation_t to_cublas_op(GpuOp op) {
  switch (op) {
    case GpuOp::Trans:
      return CUBLAS_OP_T;
    case GpuOp::ConjTrans:
      return CUBLAS_OP_C;
    default:
      return CUBLAS_OP_N;
  }
}

// ---- Scalar → cublasComputeType_t -------------------------------------------
// cublasGemmEx requires a compute type (separate from the data type).
//
// Precision policy:
// - Default: tensor core algorithms enabled (CUBLAS_GEMM_DEFAULT_TENSOR_OP).
//   For double, cuBLAS may use Ozaki emulation on sm_80+ tensor cores.
// - EIGEN_CUDA_TF32: opt-in to TF32 for float (~2x faster, 10-bit mantissa).
// - EIGEN_NO_CUDA_TENSOR_OPS: disables all tensor core usage. Uses pedantic
//   compute types and CUBLAS_GEMM_DEFAULT algorithm. For bit-exact reproducibility.

template <typename Scalar>
struct cuda_compute_type;

template <>
struct cuda_compute_type<float> {
#if defined(EIGEN_NO_CUDA_TENSOR_OPS)
  static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_32F_PEDANTIC;
#elif defined(EIGEN_CUDA_TF32)
  static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_32F_FAST_TF32;
#else
  static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_32F;
#endif
};
template <>
struct cuda_compute_type<double> {
#ifdef EIGEN_NO_CUDA_TENSOR_OPS
  static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_64F_PEDANTIC;
#else
  static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_64F;
#endif
};
template <>
struct cuda_compute_type<std::complex<float>> {
#if defined(EIGEN_NO_CUDA_TENSOR_OPS)
  static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_32F_PEDANTIC;
#elif defined(EIGEN_CUDA_TF32)
  static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_32F_FAST_TF32;
#else
  static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_32F;
#endif
};
template <>
struct cuda_compute_type<std::complex<double>> {
#ifdef EIGEN_NO_CUDA_TENSOR_OPS
  static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_64F_PEDANTIC;
#else
  static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_64F;
#endif
};
// ---- GEMM algorithm hint ----------------------------------------------------

constexpr cublasGemmAlgo_t cuda_gemm_algo() {
#ifdef EIGEN_NO_CUDA_TENSOR_OPS
  return CUBLAS_GEMM_DEFAULT;
#else
  return CUBLAS_GEMM_DEFAULT_TENSOR_OP;
#endif
}
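// Usage sketch (illustrative): how these two traits plug into a cublasGemmEx
// call for float operands (handle, device pointers, and dimensions assumed to
// be set up elsewhere):
//
//   float alpha = 1.0f, beta = 0.0f;
//   EIGEN_CUBLAS_CHECK(cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k,
//                                   &alpha, dA, CUDA_R_32F, lda,
//                                   dB, CUDA_R_32F, ldb, &beta,
//                                   dC, CUDA_R_32F, ldc,
//                                   cuda_compute_type<float>::value,
//                                   cuda_gemm_algo()));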

// ---- Alpha/beta scalar type for cublasGemmEx --------------------------------
// For standard types, alpha/beta match the scalar type.

template <typename Scalar>
struct cuda_gemm_scalar {
  using type = Scalar;
};

// ---- Type-specific cuBLAS wrappers ------------------------------------------
// cuBLAS uses separate functions per type (Strsm, Dtrsm, etc.).
// These overloaded wrappers allow calling cublasXtrsm/cublasXsymm/cublasXsyrk
// with any supported scalar type.

// TRSM wrappers
inline cublasStatus_t cublasXtrsm(cublasHandle_t h, cublasSideMode_t side, cublasFillMode_t uplo,
                                  cublasOperation_t trans, cublasDiagType_t diag, int m, int n, const float* alpha,
                                  const float* A, int lda, float* B, int ldb) {
  return cublasStrsm(h, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb);
}
inline cublasStatus_t cublasXtrsm(cublasHandle_t h, cublasSideMode_t side, cublasFillMode_t uplo,
                                  cublasOperation_t trans, cublasDiagType_t diag, int m, int n, const double* alpha,
                                  const double* A, int lda, double* B, int ldb) {
  return cublasDtrsm(h, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb);
}
inline cublasStatus_t cublasXtrsm(cublasHandle_t h, cublasSideMode_t side, cublasFillMode_t uplo,
                                  cublasOperation_t trans, cublasDiagType_t diag, int m, int n,
                                  const std::complex<float>* alpha, const std::complex<float>* A, int lda,
                                  std::complex<float>* B, int ldb) {
  return cublasCtrsm(h, side, uplo, trans, diag, m, n, reinterpret_cast<const cuComplex*>(alpha),
                     reinterpret_cast<const cuComplex*>(A), lda, reinterpret_cast<cuComplex*>(B), ldb);
}
inline cublasStatus_t cublasXtrsm(cublasHandle_t h, cublasSideMode_t side, cublasFillMode_t uplo,
                                  cublasOperation_t trans, cublasDiagType_t diag, int m, int n,
                                  const std::complex<double>* alpha, const std::complex<double>* A, int lda,
                                  std::complex<double>* B, int ldb) {
  return cublasZtrsm(h, side, uplo, trans, diag, m, n, reinterpret_cast<const cuDoubleComplex*>(alpha),
                     reinterpret_cast<const cuDoubleComplex*>(A), lda, reinterpret_cast<cuDoubleComplex*>(B), ldb);
}
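// Because overload resolution picks the right variant from the pointer type,
// generic code can be written once for all four scalar types (illustrative):
//
//   template <typename Scalar>
//   void lower_triangular_solve(cublasHandle_t h, int m, int n,
//                               const Scalar* dA, int lda, Scalar* dB, int ldb) {
//     Scalar alpha(1);
//     EIGEN_CUBLAS_CHECK(cublasXtrsm(h, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER,
//                                    CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, m, n,
//                                    &alpha, dA, lda, dB, ldb));
//   }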

// SYMM wrappers (real → symm, complex → hemm)
inline cublasStatus_t cublasXsymm(cublasHandle_t h, cublasSideMode_t side, cublasFillMode_t uplo, int m, int n,
                                  const float* alpha, const float* A, int lda, const float* B, int ldb,
                                  const float* beta, float* C, int ldc) {
  return cublasSsymm(h, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc);
}
inline cublasStatus_t cublasXsymm(cublasHandle_t h, cublasSideMode_t side, cublasFillMode_t uplo, int m, int n,
                                  const double* alpha, const double* A, int lda, const double* B, int ldb,
                                  const double* beta, double* C, int ldc) {
  return cublasDsymm(h, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc);
}
inline cublasStatus_t cublasXsymm(cublasHandle_t h, cublasSideMode_t side, cublasFillMode_t uplo, int m, int n,
                                  const std::complex<float>* alpha, const std::complex<float>* A, int lda,
                                  const std::complex<float>* B, int ldb, const std::complex<float>* beta,
                                  std::complex<float>* C, int ldc) {
  return cublasChemm(h, side, uplo, m, n, reinterpret_cast<const cuComplex*>(alpha),
                     reinterpret_cast<const cuComplex*>(A), lda, reinterpret_cast<const cuComplex*>(B), ldb,
                     reinterpret_cast<const cuComplex*>(beta), reinterpret_cast<cuComplex*>(C), ldc);
}
inline cublasStatus_t cublasXsymm(cublasHandle_t h, cublasSideMode_t side, cublasFillMode_t uplo, int m, int n,
                                  const std::complex<double>* alpha, const std::complex<double>* A, int lda,
                                  const std::complex<double>* B, int ldb, const std::complex<double>* beta,
                                  std::complex<double>* C, int ldc) {
  return cublasZhemm(h, side, uplo, m, n, reinterpret_cast<const cuDoubleComplex*>(alpha),
                     reinterpret_cast<const cuDoubleComplex*>(A), lda, reinterpret_cast<const cuDoubleComplex*>(B), ldb,
                     reinterpret_cast<const cuDoubleComplex*>(beta), reinterpret_cast<cuDoubleComplex*>(C), ldc);
}

// SYRK wrappers (real → syrk, complex → herk)
inline cublasStatus_t cublasXsyrk(cublasHandle_t h, cublasFillMode_t uplo, cublasOperation_t trans, int n, int k,
                                  const float* alpha, const float* A, int lda, const float* beta, float* C, int ldc) {
  return cublasSsyrk(h, uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
}
inline cublasStatus_t cublasXsyrk(cublasHandle_t h, cublasFillMode_t uplo, cublasOperation_t trans, int n, int k,
                                  const double* alpha, const double* A, int lda, const double* beta, double* C,
                                  int ldc) {
  return cublasDsyrk(h, uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
}
inline cublasStatus_t cublasXsyrk(cublasHandle_t h, cublasFillMode_t uplo, cublasOperation_t trans, int n, int k,
                                  const float* alpha, const std::complex<float>* A, int lda, const float* beta,
                                  std::complex<float>* C, int ldc) {
  return cublasCherk(h, uplo, trans, n, k, alpha, reinterpret_cast<const cuComplex*>(A), lda, beta,
                     reinterpret_cast<cuComplex*>(C), ldc);
}
inline cublasStatus_t cublasXsyrk(cublasHandle_t h, cublasFillMode_t uplo, cublasOperation_t trans, int n, int k,
                                  const double* alpha, const std::complex<double>* A, int lda, const double* beta,
                                  std::complex<double>* C, int ldc) {
  return cublasZherk(h, uplo, trans, n, k, alpha, reinterpret_cast<const cuDoubleComplex*>(A), lda, beta,
                     reinterpret_cast<cuDoubleComplex*>(C), ldc);
}

// GEAM wrappers: C = alpha * op(A) + beta * op(B)
// Covers transpose, scale, matrix add/subtract in one call.
inline cublasStatus_t cublasXgeam(cublasHandle_t h, cublasOperation_t transa, cublasOperation_t transb, int m, int n,
                                  const float* alpha, const float* A, int lda, const float* beta, const float* B,
                                  int ldb, float* C, int ldc) {
  return cublasSgeam(h, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc);
}
inline cublasStatus_t cublasXgeam(cublasHandle_t h, cublasOperation_t transa, cublasOperation_t transb, int m, int n,
                                  const double* alpha, const double* A, int lda, const double* beta, const double* B,
                                  int ldb, double* C, int ldc) {
  return cublasDgeam(h, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc);
}
inline cublasStatus_t cublasXgeam(cublasHandle_t h, cublasOperation_t transa, cublasOperation_t transb, int m, int n,
                                  const std::complex<float>* alpha, const std::complex<float>* A, int lda,
                                  const std::complex<float>* beta, const std::complex<float>* B, int ldb,
                                  std::complex<float>* C, int ldc) {
  return cublasCgeam(h, transa, transb, m, n, reinterpret_cast<const cuComplex*>(alpha),
                     reinterpret_cast<const cuComplex*>(A), lda, reinterpret_cast<const cuComplex*>(beta),
                     reinterpret_cast<const cuComplex*>(B), ldb, reinterpret_cast<cuComplex*>(C), ldc);
}
inline cublasStatus_t cublasXgeam(cublasHandle_t h, cublasOperation_t transa, cublasOperation_t transb, int m, int n,
                                  const std::complex<double>* alpha, const std::complex<double>* A, int lda,
                                  const std::complex<double>* beta, const std::complex<double>* B, int ldb,
                                  std::complex<double>* C, int ldc) {
  return cublasZgeam(h, transa, transb, m, n, reinterpret_cast<const cuDoubleComplex*>(alpha),
                     reinterpret_cast<const cuDoubleComplex*>(A), lda, reinterpret_cast<const cuDoubleComplex*>(beta),
                     reinterpret_cast<const cuDoubleComplex*>(B), ldb, reinterpret_cast<cuDoubleComplex*>(C), ldc);
}
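// Usage sketch (illustrative): an out-of-place transpose C = A^T with geam,
// using beta = 0 so the B operand is ignored (the common cuBLAS idiom is to
// pass C itself as B):
//
//   float one = 1.0f, zero = 0.0f;
//   EIGEN_CUBLAS_CHECK(cublasXgeam(h, CUBLAS_OP_T, CUBLAS_OP_N, m, n,
//                                  &one, dA, lda, &zero, dC, ldc, dC, ldc));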

}  // namespace internal
}  // namespace Eigen

#endif  // EIGEN_GPU_CUBLAS_SUPPORT_H
134
Eigen/src/GPU/CuDssSupport.h
Normal file
@@ -0,0 +1,134 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

// cuDSS support utilities: error checking macro, type mapping.
//
// cuDSS is NVIDIA's sparse direct solver library, supporting Cholesky (LL^T),
// LDL^T, and LU factorization on GPU. It requires CUDA 12.0+ and is
// distributed separately from the CUDA Toolkit.

#ifndef EIGEN_GPU_CUDSS_SUPPORT_H
#define EIGEN_GPU_CUDSS_SUPPORT_H

// IWYU pragma: private
#include "./InternalHeaderCheck.h"

#include "./GpuSupport.h"
#include <cudss.h>

namespace Eigen {
namespace internal {

// ---- Error checking ---------------------------------------------------------

#define EIGEN_CUDSS_CHECK(x)                                              \
  do {                                                                    \
    cudssStatus_t _s = (x);                                               \
    eigen_assert(_s == CUDSS_STATUS_SUCCESS && "cuDSS call failed: " #x); \
    EIGEN_UNUSED_VARIABLE(_s);                                            \
  } while (0)

// ---- Scalar → cudssMatrixType_t for SPD/HPD ---------------------------------

template <typename Scalar>
struct cudss_spd_type;

template <>
struct cudss_spd_type<float> {
  static constexpr cudssMatrixType_t value = CUDSS_MTYPE_SPD;
};
template <>
struct cudss_spd_type<double> {
  static constexpr cudssMatrixType_t value = CUDSS_MTYPE_SPD;
};
template <>
struct cudss_spd_type<std::complex<float>> {
  static constexpr cudssMatrixType_t value = CUDSS_MTYPE_HPD;
};
template <>
struct cudss_spd_type<std::complex<double>> {
  static constexpr cudssMatrixType_t value = CUDSS_MTYPE_HPD;
};

// ---- Scalar → cudssMatrixType_t for symmetric/Hermitian ---------------------

template <typename Scalar>
struct cudss_symmetric_type;

template <>
struct cudss_symmetric_type<float> {
  static constexpr cudssMatrixType_t value = CUDSS_MTYPE_SYMMETRIC;
};
template <>
struct cudss_symmetric_type<double> {
  static constexpr cudssMatrixType_t value = CUDSS_MTYPE_SYMMETRIC;
};
template <>
struct cudss_symmetric_type<std::complex<float>> {
  static constexpr cudssMatrixType_t value = CUDSS_MTYPE_HERMITIAN;
};
template <>
struct cudss_symmetric_type<std::complex<double>> {
  static constexpr cudssMatrixType_t value = CUDSS_MTYPE_HERMITIAN;
};

// ---- StorageIndex → cudaDataType_t ------------------------------------------

template <typename StorageIndex>
struct cudss_index_type;

template <>
struct cudss_index_type<int> {
  static constexpr cudaDataType_t value = CUDA_R_32I;
};
template <>
struct cudss_index_type<int64_t> {
  static constexpr cudaDataType_t value = CUDA_R_64I;
};

// ---- UpLo → cudssMatrixViewType_t -------------------------------------------
// For symmetric matrices stored as CSC (ColMajor), cuDSS sees CSR of A^T.
// Since A = A^T, the data is the same, but the triangle view must be swapped.

template <int UpLo, int StorageOrder>
struct cudss_view_type;

// ColMajor (CSC) passed as CSR: lower ↔ upper swap.
template <>
struct cudss_view_type<Lower, ColMajor> {
  static constexpr cudssMatrixViewType_t value = CUDSS_MVIEW_UPPER;
};
template <>
struct cudss_view_type<Upper, ColMajor> {
  static constexpr cudssMatrixViewType_t value = CUDSS_MVIEW_LOWER;
};

// RowMajor (CSR) passed directly: no swap needed.
template <>
struct cudss_view_type<Lower, RowMajor> {
  static constexpr cudssMatrixViewType_t value = CUDSS_MVIEW_LOWER;
};
template <>
struct cudss_view_type<Upper, RowMajor> {
  static constexpr cudssMatrixViewType_t value = CUDSS_MVIEW_UPPER;
};
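// Worked example (illustrative): a symmetric matrix stored ColMajor (CSC) with
// its Lower triangle populated has exactly the same arrays as the CSR form of
// its transpose, whose populated triangle is Upper, hence the swap above:
//
//   static_assert(cudss_view_type<Lower, ColMajor>::value == CUDSS_MVIEW_UPPER, "");
//   static_assert(cudss_view_type<Lower, RowMajor>::value == CUDSS_MVIEW_LOWER, "");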

}  // namespace internal

// ---- Ordering enum ----------------------------------------------------------

enum class GpuSparseOrdering {
  AMD,    // Default fill-reducing ordering
  METIS,  // METIS nested dissection
  RCM     // Reverse Cuthill-McKee
};

}  // namespace Eigen

#endif  // EIGEN_GPU_CUDSS_SUPPORT_H
103
Eigen/src/GPU/CuFftSupport.h
Normal file
@@ -0,0 +1,103 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

// cuFFT support utilities: error checking macro, type mapping.

#ifndef EIGEN_GPU_CUFFT_SUPPORT_H
#define EIGEN_GPU_CUFFT_SUPPORT_H

// IWYU pragma: private
#include "./InternalHeaderCheck.h"

#include "./GpuSupport.h"
#include <cufft.h>

namespace Eigen {
namespace internal {

// ---- Error checking ---------------------------------------------------------

#define EIGEN_CUFFT_CHECK(x)                                       \
  do {                                                             \
    cufftResult _r = (x);                                          \
    eigen_assert(_r == CUFFT_SUCCESS && "cuFFT call failed: " #x); \
    EIGEN_UNUSED_VARIABLE(_r);                                     \
  } while (0)

// ---- Scalar → cufftType traits ----------------------------------------------

template <typename Scalar>
struct cufft_c2c_type;

template <>
struct cufft_c2c_type<float> {
  static constexpr cufftType value = CUFFT_C2C;
};
template <>
struct cufft_c2c_type<double> {
  static constexpr cufftType value = CUFFT_Z2Z;
};

template <typename Scalar>
struct cufft_r2c_type;

template <>
struct cufft_r2c_type<float> {
  static constexpr cufftType value = CUFFT_R2C;
};
template <>
struct cufft_r2c_type<double> {
  static constexpr cufftType value = CUFFT_D2Z;
};

template <typename Scalar>
struct cufft_c2r_type;

template <>
struct cufft_c2r_type<float> {
  static constexpr cufftType value = CUFFT_C2R;
};
template <>
struct cufft_c2r_type<double> {
  static constexpr cufftType value = CUFFT_Z2D;
};

// ---- Type-dispatched cuFFT execution ----------------------------------------

// C2C
inline cufftResult cufftExecC2C_dispatch(cufftHandle plan, std::complex<float>* in, std::complex<float>* out,
                                         int direction) {
  return cufftExecC2C(plan, reinterpret_cast<cufftComplex*>(in), reinterpret_cast<cufftComplex*>(out), direction);
}
inline cufftResult cufftExecC2C_dispatch(cufftHandle plan, std::complex<double>* in, std::complex<double>* out,
                                         int direction) {
  return cufftExecZ2Z(plan, reinterpret_cast<cufftDoubleComplex*>(in), reinterpret_cast<cufftDoubleComplex*>(out),
                      direction);
}

// R2C
inline cufftResult cufftExecR2C_dispatch(cufftHandle plan, float* in, std::complex<float>* out) {
  return cufftExecR2C(plan, in, reinterpret_cast<cufftComplex*>(out));
}
inline cufftResult cufftExecR2C_dispatch(cufftHandle plan, double* in, std::complex<double>* out) {
  return cufftExecD2Z(plan, in, reinterpret_cast<cufftDoubleComplex*>(out));
}

// C2R
inline cufftResult cufftExecC2R_dispatch(cufftHandle plan, std::complex<float>* in, float* out) {
  return cufftExecC2R(plan, reinterpret_cast<cufftComplex*>(in), out);
}
inline cufftResult cufftExecC2R_dispatch(cufftHandle plan, std::complex<double>* in, double* out) {
  return cufftExecZ2D(plan, reinterpret_cast<cufftDoubleComplex*>(in), out);
}
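// Usage sketch (illustrative): a forward single-precision C2C transform of
// length n, with device buffers d_in and d_out assumed to be allocated:
//
//   cufftHandle plan;
//   EIGEN_CUFFT_CHECK(cufftPlan1d(&plan, n, cufft_c2c_type<float>::value, /*batch=*/1));
//   EIGEN_CUFFT_CHECK(cufftExecC2C_dispatch(plan, d_in, d_out, CUFFT_FORWARD));
//   EIGEN_CUFFT_CHECK(cufftDestroy(plan));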

}  // namespace internal
}  // namespace Eigen

#endif  // EIGEN_GPU_CUFFT_SUPPORT_H
159
Eigen/src/GPU/CuSolverSupport.h
Normal file
@@ -0,0 +1,159 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

// cuSOLVER-specific support types:
// - cuSOLVER error-checking macro
// - RAII wrapper for cusolverDnParams
// - Scalar → cudaDataType_t mapping
// - (UpLo, StorageOrder) → cublasFillMode_t mapping
//
// Generic CUDA runtime utilities (DeviceBuffer, EIGEN_CUDA_RUNTIME_CHECK)
// are in GpuSupport.h.

#ifndef EIGEN_GPU_CUSOLVER_SUPPORT_H
#define EIGEN_GPU_CUSOLVER_SUPPORT_H

// IWYU pragma: private
#include "./InternalHeaderCheck.h"

#include "./GpuSupport.h"
#include <cusolverDn.h>

namespace Eigen {
namespace internal {

// ---- Error-checking macros --------------------------------------------------

#define EIGEN_CUSOLVER_CHECK(expr)                                         \
  do {                                                                     \
    cusolverStatus_t _s = (expr);                                          \
    eigen_assert(_s == CUSOLVER_STATUS_SUCCESS && "cuSOLVER call failed"); \
  } while (0)

// ---- RAII: cusolverDnParams -------------------------------------------------

struct CusolverParams {
  cusolverDnParams_t p = nullptr;

  CusolverParams() { EIGEN_CUSOLVER_CHECK(cusolverDnCreateParams(&p)); }

  ~CusolverParams() {
    if (p) (void)cusolverDnDestroyParams(p);  // destructor: can't propagate
  }

  // Move-only.
  CusolverParams(CusolverParams&& o) noexcept : p(o.p) { o.p = nullptr; }
  CusolverParams& operator=(CusolverParams&& o) noexcept {
    if (this != &o) {
      if (p) (void)cusolverDnDestroyParams(p);
      p = o.p;
      o.p = nullptr;
    }
    return *this;
  }

  CusolverParams(const CusolverParams&) = delete;
  CusolverParams& operator=(const CusolverParams&) = delete;
};
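// Usage sketch (illustrative): the params object feeds cuSOLVER's 64-bit "X"
// API; exact call arguments depend on the factorization being used.
//
//   CusolverParams params;
//   EIGEN_CUSOLVER_CHECK(cusolverDnXpotrf_bufferSize(
//       handle, params.p, CUBLAS_FILL_MODE_LOWER, n, CUDA_R_64F, dA, lda,
//       CUDA_R_64F, &workspaceInBytesOnDevice, &workspaceInBytesOnHost));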

// ---- Scalar → cudaDataType_t ------------------------------------------------
// Alias for backward compatibility. The canonical trait is cuda_data_type<> in GpuSupport.h.
template <typename Scalar>
using cusolver_data_type = cuda_data_type<Scalar>;

// ---- (UpLo, StorageOrder) → cublasFillMode_t --------------------------------
// cuSOLVER always interprets the matrix as column-major. A row-major matrix A
// appears as A^T to cuSOLVER, so the upper/lower triangle is swapped.

template <int UpLo, int StorageOrder>
struct cusolver_fill_mode;

template <>
struct cusolver_fill_mode<Lower, ColMajor> {
  static constexpr cublasFillMode_t value = CUBLAS_FILL_MODE_LOWER;
};
template <>
struct cusolver_fill_mode<Upper, ColMajor> {
  static constexpr cublasFillMode_t value = CUBLAS_FILL_MODE_UPPER;
};
template <>
struct cusolver_fill_mode<Lower, RowMajor> {
  static constexpr cublasFillMode_t value = CUBLAS_FILL_MODE_UPPER;
};
template <>
struct cusolver_fill_mode<Upper, RowMajor> {
  static constexpr cublasFillMode_t value = CUBLAS_FILL_MODE_LOWER;
};
|
||||
// ---- Type-specific cuSOLVER wrappers ----------------------------------------
|
||||
// cuSOLVER does not provide generic X variants for ormqr/unmqr. These overloaded
|
||||
// wrappers dispatch to the correct type-specific function.
|
||||
// For real types: ormqr (orthogonal Q). For complex types: unmqr (unitary Q).
|
||||
|
||||
inline cusolverStatus_t cusolverDnXormqr(cusolverDnHandle_t h, cublasSideMode_t side, cublasOperation_t trans, int m,
|
||||
int n, int k, const float* A, int lda, const float* tau, float* C, int ldc,
|
||||
float* work, int lwork, int* info) {
|
||||
return cusolverDnSormqr(h, side, trans, m, n, k, A, lda, tau, C, ldc, work, lwork, info);
|
||||
}
|
||||
inline cusolverStatus_t cusolverDnXormqr(cusolverDnHandle_t h, cublasSideMode_t side, cublasOperation_t trans, int m,
|
||||
int n, int k, const double* A, int lda, const double* tau, double* C, int ldc,
|
||||
double* work, int lwork, int* info) {
|
||||
return cusolverDnDormqr(h, side, trans, m, n, k, A, lda, tau, C, ldc, work, lwork, info);
|
||||
}
|
||||
inline cusolverStatus_t cusolverDnXormqr(cusolverDnHandle_t h, cublasSideMode_t side, cublasOperation_t trans, int m,
|
||||
int n, int k, const std::complex<float>* A, int lda,
|
||||
const std::complex<float>* tau, std::complex<float>* C, int ldc,
|
||||
std::complex<float>* work, int lwork, int* info) {
|
||||
return cusolverDnCunmqr(h, side, trans, m, n, k, reinterpret_cast<const cuComplex*>(A), lda,
|
||||
reinterpret_cast<const cuComplex*>(tau), reinterpret_cast<cuComplex*>(C), ldc,
|
||||
reinterpret_cast<cuComplex*>(work), lwork, info);
|
||||
}
|
||||
inline cusolverStatus_t cusolverDnXormqr(cusolverDnHandle_t h, cublasSideMode_t side, cublasOperation_t trans, int m,
|
||||
int n, int k, const std::complex<double>* A, int lda,
|
||||
const std::complex<double>* tau, std::complex<double>* C, int ldc,
|
||||
std::complex<double>* work, int lwork, int* info) {
|
||||
return cusolverDnZunmqr(h, side, trans, m, n, k, reinterpret_cast<const cuDoubleComplex*>(A), lda,
|
||||
reinterpret_cast<const cuDoubleComplex*>(tau), reinterpret_cast<cuDoubleComplex*>(C), ldc,
|
||||
reinterpret_cast<cuDoubleComplex*>(work), lwork, info);
|
||||
}
|
||||
|
||||
// Buffer size wrappers for ormqr/unmqr.
|
||||
inline cusolverStatus_t cusolverDnXormqr_bufferSize(cusolverDnHandle_t h, cublasSideMode_t side,
|
||||
cublasOperation_t trans, int m, int n, int k, const float* A,
|
||||
int lda, const float* tau, const float* C, int ldc, int* lwork) {
|
||||
return cusolverDnSormqr_bufferSize(h, side, trans, m, n, k, A, lda, tau, C, ldc, lwork);
|
||||
}
|
||||
inline cusolverStatus_t cusolverDnXormqr_bufferSize(cusolverDnHandle_t h, cublasSideMode_t side,
|
||||
cublasOperation_t trans, int m, int n, int k, const double* A,
|
||||
int lda, const double* tau, const double* C, int ldc, int* lwork) {
|
||||
return cusolverDnDormqr_bufferSize(h, side, trans, m, n, k, A, lda, tau, C, ldc, lwork);
|
||||
}
|
||||
inline cusolverStatus_t cusolverDnXormqr_bufferSize(cusolverDnHandle_t h, cublasSideMode_t side,
|
||||
cublasOperation_t trans, int m, int n, int k,
|
||||
const std::complex<float>* A, int lda,
|
||||
const std::complex<float>* tau, const std::complex<float>* C,
|
||||
int ldc, int* lwork) {
|
||||
return cusolverDnCunmqr_bufferSize(h, side, trans, m, n, k, reinterpret_cast<const cuComplex*>(A), lda,
|
||||
reinterpret_cast<const cuComplex*>(tau), reinterpret_cast<const cuComplex*>(C),
|
||||
ldc, lwork);
|
||||
}
|
||||
inline cusolverStatus_t cusolverDnXormqr_bufferSize(cusolverDnHandle_t h, cublasSideMode_t side,
|
||||
cublasOperation_t trans, int m, int n, int k,
|
||||
const std::complex<double>* A, int lda,
|
||||
const std::complex<double>* tau, const std::complex<double>* C,
|
||||
int ldc, int* lwork) {
|
||||
return cusolverDnZunmqr_bufferSize(h, side, trans, m, n, k, reinterpret_cast<const cuDoubleComplex*>(A), lda,
|
||||
reinterpret_cast<const cuDoubleComplex*>(tau),
|
||||
reinterpret_cast<const cuDoubleComplex*>(C), ldc, lwork);
|
||||
}
|
||||
|
||||
} // namespace internal
|
||||
} // namespace Eigen
|
||||
|
||||
#endif // EIGEN_GPU_CUSOLVER_SUPPORT_H
|
||||
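Two quick illustrations of the pieces above. The fill-mode trait is checkable at compile time, and the overloaded wrappers resolve purely on scalar type. The handle, dimensions, and device pointers below (handle, m, n, k, d_A, lda, d_tau, d_C, ldc, all float) are hypothetical:

  static_assert(Eigen::internal::cusolver_fill_mode<Eigen::Lower, Eigen::RowMajor>::value == CUBLAS_FILL_MODE_UPPER,
                "a row-major Lower triangle is seen as Upper by column-major cuSOLVER");

  // Workspace query for applying Q from a QR factorization; float resolves to cusolverDnSormqr_bufferSize.
  int lwork = 0;
  EIGEN_CUSOLVER_CHECK(Eigen::internal::cusolverDnXormqr_bufferSize(
      handle, CUBLAS_SIDE_LEFT, CUBLAS_OP_T, m, n, k, d_A, lda, d_tau, d_C, ldc, &lwork));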
34
Eigen/src/GPU/CuSparseSupport.h
Normal file
@@ -0,0 +1,34 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

// cuSPARSE support utilities: error checking macro.

#ifndef EIGEN_GPU_CUSPARSE_SUPPORT_H
#define EIGEN_GPU_CUSPARSE_SUPPORT_H

// IWYU pragma: private
#include "./InternalHeaderCheck.h"

#include "./GpuSupport.h"
#include <cusparse.h>

namespace Eigen {
namespace internal {

#define EIGEN_CUSPARSE_CHECK(x)                                                  \
  do {                                                                           \
    cusparseStatus_t _s = (x);                                                   \
    eigen_assert(_s == CUSPARSE_STATUS_SUCCESS && "cuSPARSE call failed: " #x);  \
    EIGEN_UNUSED_VARIABLE(_s);                                                   \
  } while (0)

}  // namespace internal
}  // namespace Eigen

#endif  // EIGEN_GPU_CUSPARSE_SUPPORT_H
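The macro stringifies the checked call into the assertion message via `#x`. A minimal sketch:

  cusparseHandle_t handle = nullptr;
  EIGEN_CUSPARSE_CHECK(cusparseCreate(&handle));
  // On failure the assertion reads: cuSPARSE call failed: cusparseCreate(&handle)
  EIGEN_CUSPARSE_CHECK(cusparseDestroy(handle));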
146
Eigen/src/GPU/DeviceBlasExpr.h
Normal file
@@ -0,0 +1,146 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

// BLAS Level 3 expression types for DeviceMatrix (beyond GEMM):
//   TrsmExpr → cublasXtrsm (triangular solve)
//   SymmExpr → cublasXsymm (symmetric multiply, real)
//            → cublasXhemm (Hermitian multiply, complex)
//   SyrkExpr → cublasXsyrk (symmetric rank-k update, real)
//            → cublasXherk (Hermitian rank-k update, complex)

#ifndef EIGEN_GPU_DEVICE_BLAS_EXPR_H
#define EIGEN_GPU_DEVICE_BLAS_EXPR_H

// IWYU pragma: private
#include "./InternalHeaderCheck.h"

namespace Eigen {

template <typename Scalar_>
class DeviceMatrix;

// Forward declaration: DeviceTriangularView::solve() names TrsmExpr as its
// return type before the full definition below.
template <typename Scalar_, int UpLo_>
class TrsmExpr;

// ---- DeviceTriangularView ---------------------------------------------------
// d_A.triangularView<Lower>() → view with .solve(d_B)

template <typename Scalar_, int UpLo_>
class DeviceTriangularView {
 public:
  using Scalar = Scalar_;
  enum { UpLo = UpLo_ };

  explicit DeviceTriangularView(const DeviceMatrix<Scalar>& m) : mat_(m) {}
  const DeviceMatrix<Scalar>& matrix() const { return mat_; }

  /** Build a TRSM solve expression. */
  TrsmExpr<Scalar, UpLo_> solve(const DeviceMatrix<Scalar>& rhs) const { return {mat_, rhs}; }

 private:
  const DeviceMatrix<Scalar>& mat_;
};

// ---- TrsmExpr: triangularView<UpLo>().solve(B) → cublasXtrsm ---------------

template <typename Scalar_, int UpLo_>
class TrsmExpr {
 public:
  using Scalar = Scalar_;
  enum { UpLo = UpLo_ };

  TrsmExpr(const DeviceMatrix<Scalar>& A, const DeviceMatrix<Scalar>& B) : A_(A), B_(B) {}
  const DeviceMatrix<Scalar>& matrix() const { return A_; }
  const DeviceMatrix<Scalar>& rhs() const { return B_; }

 private:
  const DeviceMatrix<Scalar>& A_;
  const DeviceMatrix<Scalar>& B_;
};

// ---- DeviceSelfAdjointView --------------------------------------------------
// d_A.selfadjointView<Lower>() → view that can multiply: view * d_B

template <typename Scalar_, int UpLo_>
class DeviceSelfAdjointView {
 public:
  using Scalar = Scalar_;
  using RealScalar = typename NumTraits<Scalar>::Real;
  enum { UpLo = UpLo_ };

  explicit DeviceSelfAdjointView(DeviceMatrix<Scalar>& m) : mat_(m) {}
  const DeviceMatrix<Scalar>& matrix() const { return mat_; }
  DeviceMatrix<Scalar>& matrix() { return mat_; }

  /** Rank-k update: C.selfadjointView<Lower>().rankUpdate(A, alpha)
    * computes C = alpha * A * A^H + C (lower triangle only).
    * Maps to cublasXsyrk (real) or cublasXherk (complex). */
  void rankUpdate(const DeviceMatrix<Scalar>& A, RealScalar alpha = RealScalar(1));

 private:
  DeviceMatrix<Scalar>& mat_;
};

// Const variant for multiplication only (no rankUpdate).
template <typename Scalar_, int UpLo_>
class ConstDeviceSelfAdjointView {
 public:
  using Scalar = Scalar_;
  enum { UpLo = UpLo_ };

  explicit ConstDeviceSelfAdjointView(const DeviceMatrix<Scalar>& m) : mat_(m) {}
  const DeviceMatrix<Scalar>& matrix() const { return mat_; }

 private:
  const DeviceMatrix<Scalar>& mat_;
};

// ---- SymmExpr: selfadjointView<UpLo>() * B → cublasXsymm/Xhemm ------------

template <typename Scalar_, int UpLo_>
class SymmExpr {
 public:
  using Scalar = Scalar_;
  enum { UpLo = UpLo_ };

  SymmExpr(const DeviceMatrix<Scalar>& A, const DeviceMatrix<Scalar>& B) : A_(A), B_(B) {}
  const DeviceMatrix<Scalar>& matrix() const { return A_; }
  const DeviceMatrix<Scalar>& rhs() const { return B_; }

 private:
  const DeviceMatrix<Scalar>& A_;
  const DeviceMatrix<Scalar>& B_;
};

// operator*: DeviceSelfAdjointView * DeviceMatrix → SymmExpr (mutable and const variants)
template <typename S, int UpLo>
SymmExpr<S, UpLo> operator*(const DeviceSelfAdjointView<S, UpLo>& a, const DeviceMatrix<S>& b) {
  return {a.matrix(), b};
}
template <typename S, int UpLo>
SymmExpr<S, UpLo> operator*(const ConstDeviceSelfAdjointView<S, UpLo>& a, const DeviceMatrix<S>& b) {
  return {a.matrix(), b};
}

// ---- SyrkExpr: rankUpdate(A) → cublasXsyrk/Xherk ---------------------------
// C.rankUpdate(A) computes C += A * A^H (or A^H * A depending on convention).

template <typename Scalar_, int UpLo_>
class SyrkExpr {
 public:
  using Scalar = Scalar_;
  enum { UpLo = UpLo_ };

  SyrkExpr(const DeviceMatrix<Scalar>& A) : A_(A) {}
  const DeviceMatrix<Scalar>& matrix() const { return A_; }

 private:
  const DeviceMatrix<Scalar>& A_;
};

}  // namespace Eigen

#endif  // EIGEN_GPU_DEVICE_BLAS_EXPR_H
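A sketch of the call sites these expression types enable, assuming DeviceMatrix exposes triangularView()/selfadjointView() factory methods analogous to its llt()/lu() views (d_A, d_B, d_C, d_X are populated DeviceMatrix<double> instances; all names illustrative):

  d_X = d_A.triangularView<Eigen::Lower>().solve(d_B);       // TrsmExpr → cublasXtrsm
  d_C = d_A.selfadjointView<Eigen::Lower>() * d_B;           // SymmExpr → cublasXsymm/Xhemm
  d_C.selfadjointView<Eigen::Lower>().rankUpdate(d_A, 1.0);  // SyrkExpr → cublasXsyrk/Xherk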
509
Eigen/src/GPU/DeviceDispatch.h
Normal file
@@ -0,0 +1,509 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

// Dispatch functions that map DeviceMatrix expressions to NVIDIA library calls.
//
//   dispatch_gemm()      — GemmExpr     → cublasGemmEx
//   dispatch_llt_solve() — LltSolveExpr → cusolverDnXpotrf/Xpotrs
//   dispatch_lu_solve()  — LuSolveExpr  → cusolverDnXgetrf/Xgetrs
//   dispatch_trsm()      — TrsmExpr     → cublasXtrsm
//   dispatch_symm()      — SymmExpr     → cublasXsymm/Xhemm
//   dispatch_syrk()      — SyrkExpr     → cublasXsyrk/Xherk
//
// Each function documents the exact library call and parameters.

#ifndef EIGEN_GPU_DEVICE_DISPATCH_H
#define EIGEN_GPU_DEVICE_DISPATCH_H

// IWYU pragma: private
#include "./InternalHeaderCheck.h"

#include "./DeviceExpr.h"
#include "./DeviceBlasExpr.h"
#include "./DeviceSolverExpr.h"
#include "./GpuContext.h"
#include "./CuSolverSupport.h"

namespace Eigen {
namespace internal {

// ---- GEMM dispatch ----------------------------------------------------------
// GemmExpr<Lhs, Rhs> → cublasGemmEx(transA, transB, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc)
//
// The generic API cublasGemmEx handles all scalar types (float, double,
// complex<float>, complex<double>) via cudaDataType_t.

template <typename Lhs, typename Rhs>
void dispatch_gemm(
    GpuContext& ctx, DeviceMatrix<typename device_expr_traits<Lhs>::scalar_type>& dst, const GemmExpr<Lhs, Rhs>& expr,
    typename device_expr_traits<Lhs>::scalar_type beta_val,
    typename device_expr_traits<Lhs>::scalar_type alpha_scale = typename device_expr_traits<Lhs>::scalar_type(1)) {
  using Scalar = typename device_expr_traits<Lhs>::scalar_type;
  using traits_lhs = device_expr_traits<Lhs>;
  using traits_rhs = device_expr_traits<Rhs>;

  const DeviceMatrix<Scalar>& A = traits_lhs::matrix(expr.lhs());
  const DeviceMatrix<Scalar>& B = traits_rhs::matrix(expr.rhs());

  constexpr cublasOperation_t transA = to_cublas_op(traits_lhs::op);
  constexpr cublasOperation_t transB = to_cublas_op(traits_rhs::op);

  // GEMM dimensions: C(m,n) = op(A)(m,k) * op(B)(k,n)
  // op(A) has dimensions (A.rows, A.cols) if NoTrans, (A.cols, A.rows) if Trans/ConjTrans.
  const int64_t m = (traits_lhs::op == GpuOp::NoTrans) ? A.rows() : A.cols();
  const int64_t k = (traits_lhs::op == GpuOp::NoTrans) ? A.cols() : A.rows();
  const int64_t n = (traits_rhs::op == GpuOp::NoTrans) ? B.cols() : B.rows();
  const int64_t rhs_k = (traits_rhs::op == GpuOp::NoTrans) ? B.rows() : B.cols();

  eigen_assert(k == rhs_k && "DeviceMatrix GEMM dimension mismatch");

  const int64_t lda = A.rows();
  const int64_t ldb = B.rows();

  // Serialize all accesses to the destination buffer on this stream.
  if (!dst.empty()) {
    dst.waitReady(ctx.stream());
  }

  // Allocate or resize destination.
  const bool resized = dst.empty() || dst.rows() != m || dst.cols() != n;
  if (resized) {
    dst.resize(m, n);
  }
  const int64_t ldc = dst.rows();

  // cuBLAS requires alpha/beta as float for half/bfloat16 inputs.
  using GemmScalar = typename cuda_gemm_scalar<Scalar>::type;
  GemmScalar alpha_gval =
      static_cast<GemmScalar>(alpha_scale * traits_lhs::alpha(expr.lhs()) * traits_rhs::alpha(expr.rhs()));
  GemmScalar beta_gval = static_cast<GemmScalar>(beta_val);

  // Wait for operands to be ready on this stream.
  A.waitReady(ctx.stream());
  B.waitReady(ctx.stream());

  // If there is no existing valid destination to accumulate into, treat it as
  // zero rather than reading uninitialized memory.
  if (resized && beta_gval != GemmScalar(0) && dst.sizeInBytes() > 0) {
    EIGEN_CUDA_RUNTIME_CHECK(cudaMemsetAsync(dst.data(), 0, dst.sizeInBytes(), ctx.stream()));
  }

  constexpr cudaDataType_t dtype = cuda_data_type<Scalar>::value;
  constexpr cublasComputeType_t compute = cuda_compute_type<Scalar>::value;

  EIGEN_CUBLAS_CHECK(cublasGemmEx(ctx.cublasHandle(), transA, transB, static_cast<int>(m), static_cast<int>(n),
                                  static_cast<int>(k), &alpha_gval, A.data(), dtype, static_cast<int>(lda), B.data(),
                                  dtype, static_cast<int>(ldb), &beta_gval, dst.data(), dtype, static_cast<int>(ldc),
                                  compute, cuda_gemm_algo()));

  dst.recordReady(ctx.stream());
}
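// Worked example of the dimension mapping above (illustrative only): for
// d_C = d_A.adjoint() * d_B with A sized 5x3 and B sized 5x4, traits_lhs::op is
// ConjTrans, so m = A.cols() = 3, k = A.rows() = 5, n = B.cols() = 4, and
// rhs_k = B.rows() = 5, so the assertion passes. cublasGemmEx then runs with
// (CUBLAS_OP_C, CUBLAS_OP_N, m=3, n=4, k=5, ...) and produces a 3x4 destination.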
// ---- LLT solve dispatch -----------------------------------------------------
// LltSolveExpr → cusolverDnXpotrf (factorize) + cusolverDnXpotrs (solve).
// No caching — factor and workspace are temporary. Syncs to check info.

template <typename Scalar, int UpLo>
void dispatch_llt_solve(GpuContext& ctx, DeviceMatrix<Scalar>& dst, const LltSolveExpr<Scalar, UpLo>& expr) {
  const DeviceMatrix<Scalar>& A = expr.matrix();
  const DeviceMatrix<Scalar>& B = expr.rhs();

  eigen_assert(A.rows() == A.cols() && "LLT requires a square matrix");
  eigen_assert(B.rows() == A.rows() && "LLT solve: RHS rows must match matrix size");

  const Index n = A.rows();
  const int64_t nrhs = static_cast<int64_t>(B.cols());

  // Zero-size fast paths: no work, just resize dst.
  // Wait on dst before resize to avoid freeing memory another stream is using.
  if (n == 0 || nrhs == 0) {
    if (!dst.empty()) dst.waitReady(ctx.stream());
    dst.resize(n == 0 ? 0 : n, B.cols());
    return;
  }

  A.waitReady(ctx.stream());
  B.waitReady(ctx.stream());
  if (!dst.empty()) dst.waitReady(ctx.stream());

  constexpr cudaDataType_t dtype = cuda_data_type<Scalar>::value;
  constexpr cublasFillMode_t uplo = cusolver_fill_mode<UpLo, ColMajor>::value;
  const int64_t lda = static_cast<int64_t>(A.rows());
  const int64_t ldb = static_cast<int64_t>(B.rows());

  const size_t mat_bytes = static_cast<size_t>(lda) * static_cast<size_t>(n) * sizeof(Scalar);
  const size_t rhs_bytes = static_cast<size_t>(ldb) * static_cast<size_t>(nrhs) * sizeof(Scalar);

  // D2D copy A → factor buffer (potrf is in-place).
  DeviceBuffer d_factor(mat_bytes);
  EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_factor.ptr, A.data(), mat_bytes, cudaMemcpyDeviceToDevice, ctx.stream()));

  // Query workspace and factorize.
  CusolverParams params;
  DeviceBuffer d_factorize_info(sizeof(int));
  size_t dev_ws = 0, host_ws = 0;
  EIGEN_CUSOLVER_CHECK(cusolverDnXpotrf_bufferSize(ctx.cusolverHandle(), params.p, uplo, static_cast<int64_t>(n), dtype,
                                                   d_factor.ptr, lda, dtype, &dev_ws, &host_ws));

  DeviceBuffer d_workspace(dev_ws);
  std::vector<char> h_workspace(host_ws);

  EIGEN_CUSOLVER_CHECK(cusolverDnXpotrf(
      ctx.cusolverHandle(), params.p, uplo, static_cast<int64_t>(n), dtype, d_factor.ptr, lda, dtype, d_workspace.ptr,
      dev_ws, host_ws > 0 ? h_workspace.data() : nullptr, host_ws, static_cast<int*>(d_factorize_info.ptr)));

  // Check factorization info before proceeding to solve.
  int factorize_info = 0;
  EIGEN_CUDA_RUNTIME_CHECK(
      cudaMemcpyAsync(&factorize_info, d_factorize_info.ptr, sizeof(int), cudaMemcpyDeviceToHost, ctx.stream()));
  EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(ctx.stream()));
  eigen_assert(factorize_info == 0 && "cuSOLVER LLT factorization failed (matrix not positive definite)");

  // D2D copy B → dst (potrs is in-place on the RHS).
  dst.resize(n, B.cols());
  EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(dst.data(), B.data(), rhs_bytes, cudaMemcpyDeviceToDevice, ctx.stream()));

  // Solve.
  DeviceBuffer d_solve_info(sizeof(int));
  EIGEN_CUSOLVER_CHECK(cusolverDnXpotrs(ctx.cusolverHandle(), params.p, uplo, static_cast<int64_t>(n), nrhs, dtype,
                                        d_factor.ptr, lda, dtype, dst.data(), static_cast<int64_t>(dst.rows()),
                                        static_cast<int*>(d_solve_info.ptr)));

  // Sync to ensure workspace locals can be freed safely.
  int solve_info = 0;
  EIGEN_CUDA_RUNTIME_CHECK(
      cudaMemcpyAsync(&solve_info, d_solve_info.ptr, sizeof(int), cudaMemcpyDeviceToHost, ctx.stream()));
  EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(ctx.stream()));
  eigen_assert(solve_info == 0 && "cuSOLVER LLT solve failed");

  dst.recordReady(ctx.stream());
}

// ---- LU solve dispatch ------------------------------------------------------
// LuSolveExpr → cusolverDnXgetrf (factorize) + cusolverDnXgetrs (solve).

template <typename Scalar>
void dispatch_lu_solve(GpuContext& ctx, DeviceMatrix<Scalar>& dst, const LuSolveExpr<Scalar>& expr) {
  const DeviceMatrix<Scalar>& A = expr.matrix();
  const DeviceMatrix<Scalar>& B = expr.rhs();

  eigen_assert(A.rows() == A.cols() && "LU requires a square matrix");
  eigen_assert(B.rows() == A.rows() && "LU solve: RHS rows must match matrix size");

  const Index n = A.rows();
  const int64_t nrhs = static_cast<int64_t>(B.cols());

  if (n == 0 || nrhs == 0) {
    if (!dst.empty()) dst.waitReady(ctx.stream());
    dst.resize(n == 0 ? 0 : n, B.cols());
    return;
  }

  A.waitReady(ctx.stream());
  B.waitReady(ctx.stream());
  if (!dst.empty()) dst.waitReady(ctx.stream());

  constexpr cudaDataType_t dtype = cuda_data_type<Scalar>::value;
  const int64_t lda = static_cast<int64_t>(A.rows());
  const int64_t ldb = static_cast<int64_t>(B.rows());

  const size_t mat_bytes = static_cast<size_t>(lda) * static_cast<size_t>(n) * sizeof(Scalar);
  const size_t rhs_bytes = static_cast<size_t>(ldb) * static_cast<size_t>(nrhs) * sizeof(Scalar);
  const size_t ipiv_bytes = static_cast<size_t>(n) * sizeof(int64_t);

  // D2D copy A → LU buffer (getrf is in-place).
  DeviceBuffer d_lu(mat_bytes);
  EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_lu.ptr, A.data(), mat_bytes, cudaMemcpyDeviceToDevice, ctx.stream()));

  DeviceBuffer d_ipiv(ipiv_bytes);

  // Query workspace and factorize.
  CusolverParams params;
  DeviceBuffer d_factorize_info(sizeof(int));
  size_t dev_ws = 0, host_ws = 0;
  EIGEN_CUSOLVER_CHECK(cusolverDnXgetrf_bufferSize(ctx.cusolverHandle(), params.p, static_cast<int64_t>(n),
                                                   static_cast<int64_t>(n), dtype, d_lu.ptr, lda, dtype, &dev_ws,
                                                   &host_ws));

  DeviceBuffer d_workspace(dev_ws);
  std::vector<char> h_workspace(host_ws);

  EIGEN_CUSOLVER_CHECK(
      cusolverDnXgetrf(ctx.cusolverHandle(), params.p, static_cast<int64_t>(n), static_cast<int64_t>(n), dtype,
                       d_lu.ptr, lda, static_cast<int64_t*>(d_ipiv.ptr), dtype, d_workspace.ptr, dev_ws,
                       host_ws > 0 ? h_workspace.data() : nullptr, host_ws, static_cast<int*>(d_factorize_info.ptr)));

  // Check factorization info before proceeding to solve.
  int factorize_info = 0;
  EIGEN_CUDA_RUNTIME_CHECK(
      cudaMemcpyAsync(&factorize_info, d_factorize_info.ptr, sizeof(int), cudaMemcpyDeviceToHost, ctx.stream()));
  EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(ctx.stream()));
  eigen_assert(factorize_info == 0 && "cuSOLVER LU factorization failed (singular matrix)");

  // D2D copy B → dst (getrs is in-place on the RHS).
  dst.resize(n, B.cols());
  EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(dst.data(), B.data(), rhs_bytes, cudaMemcpyDeviceToDevice, ctx.stream()));

  // Solve (NoTranspose).
  DeviceBuffer d_solve_info(sizeof(int));
  EIGEN_CUSOLVER_CHECK(cusolverDnXgetrs(ctx.cusolverHandle(), params.p, CUBLAS_OP_N, static_cast<int64_t>(n), nrhs,
                                        dtype, d_lu.ptr, lda, static_cast<const int64_t*>(d_ipiv.ptr), dtype,
                                        dst.data(), static_cast<int64_t>(dst.rows()),
                                        static_cast<int*>(d_solve_info.ptr)));

  // Sync to ensure workspace locals can be freed safely.
  int solve_info = 0;
  EIGEN_CUDA_RUNTIME_CHECK(
      cudaMemcpyAsync(&solve_info, d_solve_info.ptr, sizeof(int), cudaMemcpyDeviceToHost, ctx.stream()));
  EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(ctx.stream()));
  eigen_assert(solve_info == 0 && "cuSOLVER LU solve failed");

  dst.recordReady(ctx.stream());
}

// ---- TRSM dispatch ----------------------------------------------------------
// TrsmExpr → cublasXtrsm: solve op(A) * X = B where A is triangular.
// Side=Left, Diag=NonUnit. A is square, B is n×nrhs.

template <typename Scalar, int UpLo>
void dispatch_trsm(GpuContext& ctx, DeviceMatrix<Scalar>& dst, const TrsmExpr<Scalar, UpLo>& expr) {
  const DeviceMatrix<Scalar>& A = expr.matrix();
  const DeviceMatrix<Scalar>& B = expr.rhs();

  eigen_assert(A.rows() == A.cols() && "TRSM requires a square triangular matrix");
  eigen_assert(B.rows() == A.rows() && "TRSM: RHS rows must match matrix size");

  const int n = static_cast<int>(A.rows());
  const int nrhs = static_cast<int>(B.cols());

  if (n == 0 || nrhs == 0) {
    if (!dst.empty()) dst.waitReady(ctx.stream());
    dst.resize(n == 0 ? 0 : n, B.cols());
    return;
  }

  A.waitReady(ctx.stream());
  B.waitReady(ctx.stream());
  if (!dst.empty()) dst.waitReady(ctx.stream());

  // D2D copy B → dst (trsm is in-place on the RHS).
  dst.resize(n, B.cols());
  const size_t rhs_bytes = static_cast<size_t>(dst.rows()) * static_cast<size_t>(nrhs) * sizeof(Scalar);
  EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(dst.data(), B.data(), rhs_bytes, cudaMemcpyDeviceToDevice, ctx.stream()));

  constexpr cublasFillMode_t uplo = (UpLo == Lower) ? CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER;
  Scalar alpha(1);

  EIGEN_CUBLAS_CHECK(cublasXtrsm(ctx.cublasHandle(), CUBLAS_SIDE_LEFT, uplo, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, n, nrhs,
                                 &alpha, A.data(), static_cast<int>(A.rows()), dst.data(),
                                 static_cast<int>(dst.rows())));

  dst.recordReady(ctx.stream());
}

// ---- SYMM/HEMM dispatch -----------------------------------------------------
// SymmExpr → cublasXsymm (real) or cublasXhemm (complex).
// C = A * B where A is symmetric/Hermitian. Side=Left.

template <typename Scalar, int UpLo>
void dispatch_symm(GpuContext& ctx, DeviceMatrix<Scalar>& dst, const SymmExpr<Scalar, UpLo>& expr) {
  const DeviceMatrix<Scalar>& A = expr.matrix();
  const DeviceMatrix<Scalar>& B = expr.rhs();

  eigen_assert(A.rows() == A.cols() && "SYMM requires a square matrix");
  eigen_assert(B.rows() == A.rows() && "SYMM: RHS rows must match matrix size");

  const int m = static_cast<int>(A.rows());
  const int n = static_cast<int>(B.cols());

  if (m == 0 || n == 0) {
    if (!dst.empty()) dst.waitReady(ctx.stream());
    dst.resize(m == 0 ? 0 : m, B.cols());
    return;
  }

  A.waitReady(ctx.stream());
  B.waitReady(ctx.stream());
  if (!dst.empty()) dst.waitReady(ctx.stream());

  dst.resize(m, n);

  constexpr cublasFillMode_t uplo = (UpLo == Lower) ? CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER;
  Scalar alpha(1), beta(0);

  EIGEN_CUBLAS_CHECK(cublasXsymm(ctx.cublasHandle(), CUBLAS_SIDE_LEFT, uplo, m, n, &alpha, A.data(),
                                 static_cast<int>(A.rows()), B.data(), static_cast<int>(B.rows()), &beta, dst.data(),
                                 static_cast<int>(dst.rows())));

  dst.recordReady(ctx.stream());
}

// ---- SYRK/HERK dispatch -----------------------------------------------------
// SyrkExpr → cublasXsyrk (real) or cublasXherk (complex).
// C = alpha * A * A^H + beta * C. UpLo specifies which triangle of C is stored.

template <typename Scalar, int UpLo>
void dispatch_syrk(GpuContext& ctx, DeviceMatrix<Scalar>& dst, const SyrkExpr<Scalar, UpLo>& expr,
                   typename NumTraits<Scalar>::Real alpha_val, typename NumTraits<Scalar>::Real beta_val) {
  using RealScalar = typename NumTraits<Scalar>::Real;
  const DeviceMatrix<Scalar>& A = expr.matrix();

  const int n = static_cast<int>(A.rows());
  const int k = static_cast<int>(A.cols());

  if (n == 0) {
    if (!dst.empty()) dst.waitReady(ctx.stream());
    dst.resize(0, 0);
    return;
  }

  A.waitReady(ctx.stream());
  if (!dst.empty()) dst.waitReady(ctx.stream());

  if (dst.empty() || dst.rows() != n || dst.cols() != n) {
    dst.resize(n, n);
    if (beta_val != RealScalar(0)) {
      EIGEN_CUDA_RUNTIME_CHECK(cudaMemsetAsync(dst.data(), 0, dst.sizeInBytes(), ctx.stream()));
    }
  }

  constexpr cublasFillMode_t uplo = (UpLo == Lower) ? CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER;

  EIGEN_CUBLAS_CHECK(cublasXsyrk(ctx.cublasHandle(), uplo, CUBLAS_OP_N, n, k, &alpha_val, A.data(),
                                 static_cast<int>(A.rows()), &beta_val, dst.data(), static_cast<int>(dst.rows())));

  dst.recordReady(ctx.stream());
}

}  // namespace internal

// ---- DeviceAssignment: d_C.device(ctx) = expr ------------------------------
// Returned by DeviceMatrix::device(ctx). Dispatches expressions to library calls.

template <typename Scalar_>
class DeviceAssignment {
 public:
  using Scalar = Scalar_;

  DeviceAssignment(DeviceMatrix<Scalar>& dst, GpuContext& ctx) : dst_(dst), ctx_(ctx) {}

  // operator= dispatches GEMM with beta=0 (overwrite).
  template <typename Lhs, typename Rhs>
  DeviceMatrix<Scalar>& operator=(const GemmExpr<Lhs, Rhs>& expr) {
    internal::dispatch_gemm(ctx_, dst_, expr, Scalar(0));
    return dst_;
  }

  // operator+= dispatches GEMM with beta=1 (accumulate).
  template <typename Lhs, typename Rhs>
  DeviceMatrix<Scalar>& operator+=(const GemmExpr<Lhs, Rhs>& expr) {
    internal::dispatch_gemm(ctx_, dst_, expr, Scalar(1));
    return dst_;
  }

  // operator-= dispatches GEMM with negated alpha, beta=1: C = C - alpha*op(A)*op(B).
  template <typename Lhs, typename Rhs>
  DeviceMatrix<Scalar>& operator-=(const GemmExpr<Lhs, Rhs>& expr) {
    internal::dispatch_gemm(ctx_, dst_, expr, Scalar(1), Scalar(-1));
    return dst_;
  }

  // operator= dispatches LLT solve (potrf + potrs).
  template <int UpLo>
  DeviceMatrix<Scalar>& operator=(const LltSolveExpr<Scalar, UpLo>& expr) {
    internal::dispatch_llt_solve(ctx_, dst_, expr);
    return dst_;
  }

  // operator= dispatches LU solve (getrf + getrs).
  DeviceMatrix<Scalar>& operator=(const LuSolveExpr<Scalar>& expr) {
    internal::dispatch_lu_solve(ctx_, dst_, expr);
    return dst_;
  }

  // operator= dispatches TRSM (triangular solve).
  template <int UpLo>
  DeviceMatrix<Scalar>& operator=(const TrsmExpr<Scalar, UpLo>& expr) {
    internal::dispatch_trsm(ctx_, dst_, expr);
    return dst_;
  }

  // operator= dispatches SYMM/HEMM (symmetric/Hermitian multiply).
  template <int UpLo>
  DeviceMatrix<Scalar>& operator=(const SymmExpr<Scalar, UpLo>& expr) {
    internal::dispatch_symm(ctx_, dst_, expr);
    return dst_;
  }

  // Catch-all: static_assert for unsupported expressions.
  template <typename Expr>
  DeviceMatrix<Scalar>& operator=(const Expr&) {
    static_assert(sizeof(Expr) == 0,
                  "DeviceMatrix expression not supported: no cuBLAS/cuSOLVER mapping. "
                  "Supported: GEMM (A*B), TRSM (.triangularView().solve()), "
                  "SYMM (.selfadjointView()*B), LLT (.llt().solve()), LU (.lu().solve()).");
    return dst_;
  }

 private:
  DeviceMatrix<Scalar>& dst_;
  GpuContext& ctx_;
};

// ---- Out-of-line DeviceMatrix expression operator= definitions -------------
// These are declared in DeviceMatrix.h but defined here because they need
// GpuContext::threadLocal() which requires the full GpuContext definition.

template <typename Scalar_>
template <typename Lhs, typename Rhs>
DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator=(const GemmExpr<Lhs, Rhs>& expr) {
  device(GpuContext::threadLocal()) = expr;
  return *this;
}

template <typename Scalar_>
template <typename Lhs, typename Rhs>
DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator+=(const GemmExpr<Lhs, Rhs>& expr) {
  device(GpuContext::threadLocal()) += expr;
  return *this;
}

template <typename Scalar_>
template <int UpLo>
DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator=(const LltSolveExpr<Scalar_, UpLo>& expr) {
  device(GpuContext::threadLocal()) = expr;
  return *this;
}

template <typename Scalar_>
DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator=(const LuSolveExpr<Scalar_>& expr) {
  device(GpuContext::threadLocal()) = expr;
  return *this;
}

template <typename Scalar_>
template <int UpLo>
DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator=(const TrsmExpr<Scalar_, UpLo>& expr) {
  device(GpuContext::threadLocal()) = expr;
  return *this;
}

template <typename Scalar_>
template <int UpLo>
DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator=(const SymmExpr<Scalar_, UpLo>& expr) {
  device(GpuContext::threadLocal()) = expr;
  return *this;
}

// DeviceSelfAdjointView::rankUpdate — defined here because it needs GpuContext.
template <typename Scalar_, int UpLo_>
void DeviceSelfAdjointView<Scalar_, UpLo_>::rankUpdate(const DeviceMatrix<Scalar_>& A, RealScalar alpha) {
  SyrkExpr<Scalar_, UpLo_> expr(A);
  RealScalar beta = matrix().empty() ? RealScalar(0) : RealScalar(1);
  internal::dispatch_syrk(GpuContext::threadLocal(), matrix(), expr, alpha, beta);
}

}  // namespace Eigen

#endif  // EIGEN_GPU_DEVICE_DISPATCH_H
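An end-to-end sketch of the assignment paths wired up above, assuming a live GpuContext `ctx` and populated DeviceMatrix<float> operands d_A and d_B (names illustrative):

  Eigen::DeviceMatrix<float> d_C, d_X;
  d_C.device(ctx) = d_A * d_B;           // dispatch_gemm, beta = 0 (overwrite)
  d_C.device(ctx) += d_A * d_B;          // dispatch_gemm, beta = 1 (accumulate)
  d_C.device(ctx) -= 0.5f * d_A * d_B;   // dispatch_gemm, alpha = -0.5, beta = 1
  d_X = d_A.llt().solve(d_B);            // dispatch_llt_solve on the thread-local context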
224
Eigen/src/GPU/DeviceExpr.h
Normal file
@@ -0,0 +1,224 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

// Lightweight expression types for DeviceMatrix operations.
//
// These are NOT Eigen expression templates. Each type maps 1:1 to a single
// NVIDIA library call (cuBLAS or cuSOLVER). There is no coefficient-level
// evaluation, no lazy fusion, no packet operations.
//
// Expression types:
//   DeviceAdjointView<S>   — d_A.adjoint()   → marks ConjTrans for GEMM
//   DeviceTransposeView<S> — d_A.transpose() → marks Trans for GEMM
//   DeviceScaled<Expr>     — alpha * expr    → carries scalar factor
//   GemmExpr<Lhs, Rhs>     — lhs * rhs       → dispatches to cublasXgemm

#ifndef EIGEN_GPU_DEVICE_EXPR_H
#define EIGEN_GPU_DEVICE_EXPR_H

// IWYU pragma: private
#include "./InternalHeaderCheck.h"

#include "./CuBlasSupport.h"

namespace Eigen {

// Forward declaration.
template <typename Scalar_>
class DeviceMatrix;

namespace internal {

// ---- Traits: extract operation info from expression types -------------------

// Primary template: not a device expression.
template <typename T>
struct device_expr_traits {
  static constexpr bool is_device_expr = false;
};

// A plain DeviceMatrix is NoTrans.
template <typename Scalar>
struct device_expr_traits<DeviceMatrix<Scalar>> {
  using scalar_type = Scalar;
  static constexpr GpuOp op = GpuOp::NoTrans;
  static constexpr bool is_device_expr = true;
  static const DeviceMatrix<Scalar>& matrix(const DeviceMatrix<Scalar>& x) { return x; }
  static Scalar alpha(const DeviceMatrix<Scalar>&) { return Scalar(1); }
};

}  // namespace internal

// ---- DeviceAdjointView: marks ConjTrans ------------------------------------
// Returned by DeviceMatrix::adjoint(). Maps to cublasXgemm transA/B = C.

template <typename Scalar_>
class DeviceAdjointView {
 public:
  using Scalar = Scalar_;
  explicit DeviceAdjointView(const DeviceMatrix<Scalar>& m) : mat_(m) {}
  const DeviceMatrix<Scalar>& matrix() const { return mat_; }

 private:
  const DeviceMatrix<Scalar>& mat_;
};

namespace internal {
template <typename Scalar>
struct device_expr_traits<DeviceAdjointView<Scalar>> {
  using scalar_type = Scalar;
  static constexpr GpuOp op = GpuOp::ConjTrans;
  static constexpr bool is_device_expr = true;
  static const DeviceMatrix<Scalar>& matrix(const DeviceAdjointView<Scalar>& x) { return x.matrix(); }
  static Scalar alpha(const DeviceAdjointView<Scalar>&) { return Scalar(1); }
};
}  // namespace internal

// ---- DeviceTransposeView: marks Trans --------------------------------------
// Returned by DeviceMatrix::transpose(). Maps to cublasXgemm transA/B = T.

template <typename Scalar_>
class DeviceTransposeView {
 public:
  using Scalar = Scalar_;
  explicit DeviceTransposeView(const DeviceMatrix<Scalar>& m) : mat_(m) {}
  const DeviceMatrix<Scalar>& matrix() const { return mat_; }

 private:
  const DeviceMatrix<Scalar>& mat_;
};

namespace internal {
template <typename Scalar>
struct device_expr_traits<DeviceTransposeView<Scalar>> {
  using scalar_type = Scalar;
  static constexpr GpuOp op = GpuOp::Trans;
  static constexpr bool is_device_expr = true;
  static const DeviceMatrix<Scalar>& matrix(const DeviceTransposeView<Scalar>& x) { return x.matrix(); }
  static Scalar alpha(const DeviceTransposeView<Scalar>&) { return Scalar(1); }
};
}  // namespace internal

// ---- DeviceScaled: alpha * expr --------------------------------------------
// Returned by operator*(Scalar, DeviceMatrix/View). Carries the scalar factor.

template <typename Inner>
class DeviceScaled {
 public:
  using Scalar = typename internal::device_expr_traits<Inner>::scalar_type;
  DeviceScaled(Scalar alpha, const Inner& inner) : alpha_(alpha), inner_(inner) {}
  Scalar scalar() const { return alpha_; }
  const Inner& inner() const { return inner_; }

 private:
  Scalar alpha_;
  const Inner& inner_;
};

namespace internal {
template <typename Inner>
struct device_expr_traits<DeviceScaled<Inner>> {
  using scalar_type = typename device_expr_traits<Inner>::scalar_type;
  static constexpr GpuOp op = device_expr_traits<Inner>::op;
  static constexpr bool is_device_expr = true;
  static const DeviceMatrix<scalar_type>& matrix(const DeviceScaled<Inner>& x) {
    return device_expr_traits<Inner>::matrix(x.inner());
  }
  static scalar_type alpha(const DeviceScaled<Inner>& x) {
    return x.scalar() * device_expr_traits<Inner>::alpha(x.inner());
  }
};
}  // namespace internal

// ---- GemmExpr: lhs * rhs → cublasXgemm ------------------------------------
// Returned by operator*(lhs_expr, rhs_expr). Dispatches to cuBLAS GEMM.

template <typename Lhs, typename Rhs>
class GemmExpr {
 public:
  using Scalar = typename internal::device_expr_traits<Lhs>::scalar_type;
  static_assert(std::is_same<Scalar, typename internal::device_expr_traits<Rhs>::scalar_type>::value,
                "DeviceMatrix GEMM: LHS and RHS must have the same scalar type");

  GemmExpr(const Lhs& lhs, const Rhs& rhs) : lhs_(lhs), rhs_(rhs) {}
  const Lhs& lhs() const { return lhs_; }
  const Rhs& rhs() const { return rhs_; }

 private:
  // Stored by reference. Expression objects must not outlive their operands.
  // This is safe for the one-liner pattern (d_C = d_A * d_B) since all
  // temporaries live until the semicolon.
  const Lhs& lhs_;
  const Rhs& rhs_;
};

// ---- Free operator* overloads that produce GemmExpr ------------------------
// These cover: DM*DM, Adj*DM, DM*Adj, Trans*DM, DM*Trans, Scaled*DM, etc.

// DeviceMatrix * DeviceMatrix
template <typename S>
GemmExpr<DeviceMatrix<S>, DeviceMatrix<S>> operator*(const DeviceMatrix<S>& a, const DeviceMatrix<S>& b) {
  return {a, b};
}

// AdjointView * DeviceMatrix
template <typename S>
GemmExpr<DeviceAdjointView<S>, DeviceMatrix<S>> operator*(const DeviceAdjointView<S>& a, const DeviceMatrix<S>& b) {
  return {a, b};
}

// DeviceMatrix * AdjointView
template <typename S>
GemmExpr<DeviceMatrix<S>, DeviceAdjointView<S>> operator*(const DeviceMatrix<S>& a, const DeviceAdjointView<S>& b) {
  return {a, b};
}

// TransposeView * DeviceMatrix
template <typename S>
GemmExpr<DeviceTransposeView<S>, DeviceMatrix<S>> operator*(const DeviceTransposeView<S>& a, const DeviceMatrix<S>& b) {
  return {a, b};
}

// DeviceMatrix * TransposeView
template <typename S>
GemmExpr<DeviceMatrix<S>, DeviceTransposeView<S>> operator*(const DeviceMatrix<S>& a, const DeviceTransposeView<S>& b) {
  return {a, b};
}

// Scaled * DeviceMatrix
template <typename Inner, typename S>
GemmExpr<DeviceScaled<Inner>, DeviceMatrix<S>> operator*(const DeviceScaled<Inner>& a, const DeviceMatrix<S>& b) {
  return {a, b};
}

// DeviceMatrix * Scaled
template <typename S, typename Inner>
GemmExpr<DeviceMatrix<S>, DeviceScaled<Inner>> operator*(const DeviceMatrix<S>& a, const DeviceScaled<Inner>& b) {
  return {a, b};
}

// ---- Scalar * DeviceMatrix / View → DeviceScaled ---------------------------

template <typename S>
DeviceScaled<DeviceMatrix<S>> operator*(S alpha, const DeviceMatrix<S>& m) {
  return {alpha, m};
}

template <typename S>
DeviceScaled<DeviceAdjointView<S>> operator*(S alpha, const DeviceAdjointView<S>& m) {
  return {alpha, m};
}

template <typename S>
DeviceScaled<DeviceTransposeView<S>> operator*(S alpha, const DeviceTransposeView<S>& m) {
  return {alpha, m};
}

}  // namespace Eigen

#endif  // EIGEN_GPU_DEVICE_EXPR_H
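A composition sketch (hypothetical double-precision operands d_A, d_B, d_C): each right-hand side builds a GemmExpr that assignment hands to dispatch_gemm, and all temporaries outlive the statement, matching the lifetime note in GemmExpr:

  d_C = d_A * d_B;                    // NoTrans * NoTrans
  d_C = d_A.adjoint() * d_B;          // ConjTrans * NoTrans
  d_C = 2.0 * d_A.transpose() * d_B;  // scalar folds into GEMM alpha via DeviceScaled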
503
Eigen/src/GPU/DeviceMatrix.h
Normal file
@@ -0,0 +1,503 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

// Typed RAII wrapper for a dense matrix in GPU device memory.
//
// DeviceMatrix<Scalar> holds a column-major matrix on the GPU with tracked
// dimensions. Always dense (leading dimension = rows). It can be passed to
// GPU solvers (GpuLLT, GpuLU, future cuBLAS/cuDSS) without host round-trips.
//
// Cross-stream safety is automatic: an internal CUDA event tracks when the
// last write completed. Consumers on a different stream wait on that event
// before reading.
//
// Usage:
//   auto d_A = DeviceMatrix<double>::fromHost(A);  // upload (sync)
//   GpuLLT<double> llt;
//   llt.compute(d_A);                              // factor on device
//   auto d_X = llt.solve(d_B);                     // async, no sync
//   MatrixXd X = d_X.toHost();                     // download + block
//
// Async variants:
//   auto d_A = DeviceMatrix<double>::fromHostAsync(A.data(), n, n, stream);
//   auto transfer = d_X.toHostAsync(stream);       // enqueue D2H
//   // ... overlap with other work ...
//   MatrixXd X = transfer.get();                   // block + retrieve

#ifndef EIGEN_GPU_DEVICE_MATRIX_H
#define EIGEN_GPU_DEVICE_MATRIX_H

// IWYU pragma: private
#include "./InternalHeaderCheck.h"

#include "./GpuSupport.h"

namespace Eigen {

// Forward declarations.
template <typename, int>
class GpuLLT;
template <typename>
class GpuLU;
template <typename>
class DeviceAdjointView;
template <typename>
class DeviceTransposeView;
template <typename>
class DeviceAssignment;
template <typename, typename>
class GemmExpr;
template <typename, int>
class LltSolveExpr;
template <typename>
class LuSolveExpr;
template <typename, int>
class DeviceLLTView;
template <typename>
class DeviceLUView;
template <typename, int>
class DeviceTriangularView;
template <typename, int>
class DeviceSelfAdjointView;
template <typename, int>
class ConstDeviceSelfAdjointView;
template <typename, int>
class TrsmExpr;
template <typename, int>
class SymmExpr;
template <typename, int>
class SyrkExpr;
class GpuContext;

// --------------------------------------------------------------------------
// HostTransfer — future-like wrapper for an async device-to-host transfer.
// --------------------------------------------------------------------------

/** \ingroup GPU_Module
  * \class HostTransfer
  * \brief Future for an asynchronous device-to-host matrix transfer.
  *
  * Returned by DeviceMatrix::toHostAsync(). The transfer runs asynchronously
  * on the given CUDA stream. Call get() to block until complete and retrieve
  * the host matrix, or ready() to poll without blocking.
  */
template <typename Scalar_>
class HostTransfer {
 public:
  using Scalar = Scalar_;
  using PlainMatrix = Matrix<Scalar, Dynamic, Dynamic, ColMajor>;

  /** Block until the transfer completes and return the host matrix.
    * Idempotent: subsequent calls return the same matrix without re-syncing. */
  PlainMatrix& get() {
    if (!synced_) {
      EIGEN_CUDA_RUNTIME_CHECK(cudaEventSynchronize(event_));
      synced_ = true;
    }
    return host_buf_;
  }

  /** Non-blocking check: has the transfer completed? */
  bool ready() const {
    if (synced_) return true;
    cudaError_t err = cudaEventQuery(event_);
    if (err == cudaSuccess) return true;
    eigen_assert(err == cudaErrorNotReady && "cudaEventQuery failed");
    return false;
  }

  ~HostTransfer() {
    if (event_) (void)cudaEventDestroy(event_);
  }

  HostTransfer(HostTransfer&& o) noexcept : host_buf_(std::move(o.host_buf_)), event_(o.event_), synced_(o.synced_) {
    o.event_ = nullptr;
    o.synced_ = true;
  }

  HostTransfer& operator=(HostTransfer&& o) noexcept {
    if (this != &o) {
      if (event_) (void)cudaEventDestroy(event_);
      host_buf_ = std::move(o.host_buf_);
      event_ = o.event_;
      synced_ = o.synced_;
      o.event_ = nullptr;
      o.synced_ = true;
    }
    return *this;
  }

  HostTransfer(const HostTransfer&) = delete;
  HostTransfer& operator=(const HostTransfer&) = delete;

 private:
  template <typename>
  friend class DeviceMatrix;

  HostTransfer(PlainMatrix&& buf, cudaEvent_t event) : host_buf_(std::move(buf)), event_(event), synced_(false) {}

  PlainMatrix host_buf_;
  cudaEvent_t event_ = nullptr;
  bool synced_ = false;
};
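// Illustrative polling pattern for HostTransfer (hypothetical stream and
// device matrix; not part of the class itself):
//   HostTransfer<double> t = d_X.toHostAsync(stream);
//   while (!t.ready()) { /* overlap independent host work */ }
//   MatrixXd X = t.get();  // returns immediately once ready() was true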
|
||||
// --------------------------------------------------------------------------
|
||||
// DeviceMatrix — typed RAII wrapper for a dense matrix in device memory.
|
||||
// --------------------------------------------------------------------------
|
||||
|
||||
/** \ingroup GPU_Module
|
||||
* \class DeviceMatrix
|
||||
* \brief RAII wrapper for a dense column-major matrix in GPU device memory.
|
||||
*
|
||||
* \tparam Scalar_ Element type: float, double, complex<float>, complex<double>
|
||||
*
|
||||
* Owns a device allocation with tracked dimensions. Always dense
|
||||
* (leading dimension = rows; no stride padding).
|
||||
* An internal CUDA event records when the data was last written, enabling
|
||||
* safe cross-stream consumption without user-visible synchronization.
|
||||
*
|
||||
* Each method has a synchronous and an asynchronous variant:
|
||||
* - fromHost() / fromHostAsync(): upload from host
|
||||
* - toHost() / toHostAsync(): download to host
|
||||
*/
|
||||
template <typename Scalar_>
|
||||
class DeviceMatrix {
|
||||
public:
|
||||
using Scalar = Scalar_;
|
||||
using PlainMatrix = Matrix<Scalar, Dynamic, Dynamic, ColMajor>;
|
||||
|
||||
// ---- Construction / destruction ------------------------------------------
|
||||
|
||||
/** Default: empty (0x0, no allocation). */
|
||||
DeviceMatrix() = default;
|
||||
|
||||
/** Allocate uninitialized device memory for a rows x cols matrix. */
|
||||
DeviceMatrix(Index rows, Index cols) : rows_(rows), cols_(cols) {
|
||||
eigen_assert(rows >= 0 && cols >= 0);
|
||||
size_t bytes = sizeInBytes();
|
||||
if (bytes > 0) {
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaMalloc(reinterpret_cast<void**>(&data_), bytes));
|
||||
}
|
||||
}
|
||||
|
||||
~DeviceMatrix() {
|
||||
if (data_) (void)cudaFree(data_);
|
||||
if (ready_event_) (void)cudaEventDestroy(ready_event_);
|
||||
}
|
||||
|
||||
// ---- Move-only -----------------------------------------------------------
|
||||
|
||||
DeviceMatrix(DeviceMatrix&& o) noexcept
|
||||
: data_(o.data_),
|
||||
rows_(o.rows_),
|
||||
cols_(o.cols_),
|
||||
ready_event_(o.ready_event_),
|
||||
ready_stream_(o.ready_stream_),
|
||||
retained_buffer_(std::move(o.retained_buffer_)) {
|
||||
o.data_ = nullptr;
|
||||
o.rows_ = 0;
|
||||
o.cols_ = 0;
|
||||
o.ready_event_ = nullptr;
|
||||
o.ready_stream_ = nullptr;
|
||||
}
|
||||
|
||||
DeviceMatrix& operator=(DeviceMatrix&& o) noexcept {
|
||||
if (this != &o) {
|
||||
if (data_) (void)cudaFree(data_);
|
||||
if (ready_event_) (void)cudaEventDestroy(ready_event_);
|
||||
data_ = o.data_;
|
||||
rows_ = o.rows_;
|
||||
cols_ = o.cols_;
|
||||
ready_event_ = o.ready_event_;
|
||||
ready_stream_ = o.ready_stream_;
|
||||
retained_buffer_ = std::move(o.retained_buffer_);
|
||||
o.data_ = nullptr;
|
||||
o.rows_ = 0;
|
||||
o.cols_ = 0;
|
||||
o.ready_event_ = nullptr;
|
||||
o.ready_stream_ = nullptr;
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
DeviceMatrix(const DeviceMatrix&) = delete;
|
||||
DeviceMatrix& operator=(const DeviceMatrix&) = delete;
|
||||
|
||||
// ---- Upload from host ----------------------------------------------------
|
||||
|
||||
/** Upload a host Eigen matrix to device memory (synchronous).
|
||||
*
|
||||
* Evaluates the expression into a contiguous ColMajor temporary, copies to
|
||||
* device via cudaMemcpyAsync on \p stream, and synchronizes before returning.
|
||||
*
|
||||
* \param host Any Eigen matrix expression.
|
||||
* \param stream CUDA stream for the transfer (default: stream 0).
|
||||
*/
|
||||
template <typename Derived>
|
||||
static DeviceMatrix fromHost(const MatrixBase<Derived>& host, cudaStream_t stream = nullptr) {
|
||||
const PlainMatrix mat(host.derived());
|
||||
DeviceMatrix dm(mat.rows(), mat.cols());
|
||||
if (dm.sizeInBytes() > 0) {
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(dm.data_, mat.data(), dm.sizeInBytes(), cudaMemcpyHostToDevice, stream));
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream));
|
||||
}
|
||||
return dm;
|
||||
}
|
||||
|
||||
/** Upload from a raw host pointer to device memory (asynchronous).
|
||||
*
|
||||
* Enqueues an async H2D copy on \p stream and records an internal event.
|
||||
* The caller must keep \p host_data alive until the transfer completes
|
||||
* (check via the internal event or synchronize the stream).
|
||||
*
|
||||
* \param host_data Pointer to contiguous column-major host data.
|
||||
* \param rows Number of rows.
|
||||
* \param cols Number of columns.
|
||||
* \param stream CUDA stream for the transfer.
|
||||
*/
|
||||
static DeviceMatrix fromHostAsync(const Scalar* host_data, Index rows, Index cols, cudaStream_t stream) {
|
||||
eigen_assert(rows >= 0 && cols >= 0);
|
||||
eigen_assert(host_data != nullptr || (rows == 0 || cols == 0));
|
||||
DeviceMatrix dm(rows, cols);
|
||||
if (dm.sizeInBytes() > 0) {
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(dm.data_, host_data, dm.sizeInBytes(), cudaMemcpyHostToDevice, stream));
|
||||
dm.recordReady(stream);
|
||||
}
|
||||
return dm;
|
||||
}
|
||||
|
||||
// ---- Download to host ----------------------------------------------------
|
||||
|
||||
/** Download device matrix to host memory (synchronous).
|
||||
*
|
||||
* Waits on the internal ready event, enqueues a D2H copy on \p stream,
|
||||
* synchronizes, and returns the host matrix directly.
|
||||
*
|
||||
* \param stream CUDA stream for the transfer (default: stream 0).
|
||||
*/
|
||||
PlainMatrix toHost(cudaStream_t stream = nullptr) const {
|
||||
PlainMatrix host_buf(rows_, cols_);
|
||||
if (sizeInBytes() > 0) {
|
||||
waitReady(stream);
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(host_buf.data(), data_, sizeInBytes(), cudaMemcpyDeviceToHost, stream));
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream));
|
||||
}
|
||||
return host_buf;
|
||||
}
|
||||
|
||||
/** Enqueue an async device-to-host transfer and return a future.
|
||||
*
|
||||
* Waits on the internal ready event (if any) to ensure the device data is
|
||||
* valid, then enqueues the D2H copy on \p stream. Returns a HostTransfer
|
||||
* future; call .get() to block and retrieve the host matrix.
|
||||
*
|
||||
* \param stream CUDA stream for the transfer (default: stream 0).
|
||||
*/
|
||||
HostTransfer<Scalar> toHostAsync(cudaStream_t stream = nullptr) const {
|
||||
PlainMatrix host_buf(rows_, cols_);
|
||||
if (sizeInBytes() > 0) {
|
||||
waitReady(stream);
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(host_buf.data(), data_, sizeInBytes(), cudaMemcpyDeviceToHost, stream));
|
||||
}
|
||||
// Record a transfer-complete event.
|
||||
cudaEvent_t transfer_event;
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaEventCreateWithFlags(&transfer_event, cudaEventDisableTiming));
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaEventRecord(transfer_event, stream));
|
||||
return HostTransfer<Scalar>(std::move(host_buf), transfer_event);
|
||||
}

// ---- Device-to-device copy -----------------------------------------------

/** Deep copy on device. Fully async — records event on the result, no sync.
*
* \param stream CUDA stream for the D2D copy (default: stream 0).
*/
DeviceMatrix clone(cudaStream_t stream = nullptr) const {
DeviceMatrix result(rows_, cols_);
if (sizeInBytes() > 0) {
waitReady(stream);
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(result.data_, data_, sizeInBytes(), cudaMemcpyDeviceToDevice, stream));
result.recordReady(stream);
}
return result;
}

// ---- Resize (destructive) ------------------------------------------------

/** Discard contents and reallocate to (rows x cols). Clears the ready event. */
void resize(Index rows, Index cols) {
if (rows == rows_ && cols == cols_) return;
if (data_) {
(void)cudaFree(data_);
data_ = nullptr;
}
if (ready_event_) {
(void)cudaEventDestroy(ready_event_);
ready_event_ = nullptr;
}
ready_stream_ = nullptr;
retained_buffer_ = internal::DeviceBuffer();
rows_ = rows;
cols_ = cols;
size_t bytes = sizeInBytes();
if (bytes > 0) {
EIGEN_CUDA_RUNTIME_CHECK(cudaMalloc(reinterpret_cast<void**>(&data_), bytes));
}
}

// ---- Accessors -----------------------------------------------------------

Scalar* data() { return data_; }
const Scalar* data() const { return data_; }
Index rows() const { return rows_; }
Index cols() const { return cols_; }
bool empty() const { return rows_ == 0 || cols_ == 0; }

/** Size of the device allocation in bytes. */
size_t sizeInBytes() const { return static_cast<size_t>(rows_) * static_cast<size_t>(cols_) * sizeof(Scalar); }

// ---- Event synchronization (public for library dispatch interop) ---------

/** Record that device data is ready after work on \p stream. */
void recordReady(cudaStream_t stream) {
ensureEvent();
EIGEN_CUDA_RUNTIME_CHECK(cudaEventRecord(ready_event_, stream));
ready_stream_ = stream;
}

/** Make \p stream wait until the device data is ready.
* No-op if no event recorded, or if the consumer stream is the same as the
* producer stream (CUDA guarantees in-order execution within a stream). */
void waitReady(cudaStream_t stream) const {
if (ready_event_ && stream != ready_stream_) {
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamWaitEvent(stream, ready_event_, 0));
}
}
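
// Typical producer/consumer pattern across two streams (a sketch; s_prod,
// s_cons, and the kernel names are illustrative):
//   producer_kernel<<<grid, block, 0, s_prod>>>(d_A.data(), ...);
//   d_A.recordReady(s_prod);   // mark d_A valid once s_prod reaches this point
//   d_A.waitReady(s_cons);     // s_cons waits on the event, not the host
//   consumer_kernel<<<grid, block, 0, s_cons>>>(d_A.data(), ...);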

// ---- Expression methods (dispatch to cuBLAS/cuSOLVER) --------------------

/** Adjoint view for GEMM dispatch. Maps to cublasXgemm with ConjTrans. */
DeviceAdjointView<Scalar> adjoint() const { return DeviceAdjointView<Scalar>(*this); }

/** Transpose view for GEMM dispatch. Maps to cublasXgemm with Trans. */
DeviceTransposeView<Scalar> transpose() const { return DeviceTransposeView<Scalar>(*this); }

/** Bind this matrix to a GpuContext for expression assignment.
* Returns a DeviceAssignment proxy: `d_C.device(ctx) = d_A * d_B;` */
DeviceAssignment<Scalar> device(GpuContext& ctx) { return DeviceAssignment<Scalar>(*this, ctx); }

/** Assign from a GEMM expression using the thread-local default GpuContext.
* Defined out-of-line after GpuContext is fully declared (see DeviceDispatch.h). */
template <typename Lhs, typename Rhs>
DeviceMatrix& operator=(const GemmExpr<Lhs, Rhs>& expr);

/** Accumulate from a GEMM expression using the thread-local default GpuContext. */
template <typename Lhs, typename Rhs>
DeviceMatrix& operator+=(const GemmExpr<Lhs, Rhs>& expr);

/** Cholesky view: d_A.llt().solve(d_B) → LltSolveExpr. */
DeviceLLTView<Scalar, Lower> llt() const { return DeviceLLTView<Scalar, Lower>(*this); }

/** Cholesky view with explicit triangle: d_A.llt<Upper>().solve(d_B). */
template <int UpLo>
DeviceLLTView<Scalar, UpLo> llt() const {
return DeviceLLTView<Scalar, UpLo>(*this);
}

/** LU view: d_A.lu().solve(d_B) → LuSolveExpr. */
DeviceLUView<Scalar> lu() const { return DeviceLUView<Scalar>(*this); }

/** Assign from an LLT solve expression (thread-local default context). */
template <int UpLo>
DeviceMatrix& operator=(const LltSolveExpr<Scalar, UpLo>& expr);

/** Assign from an LU solve expression (thread-local default context). */
DeviceMatrix& operator=(const LuSolveExpr<Scalar>& expr);

/** Triangular view: d_A.triangularView<Lower>().solve(d_B) → TrsmExpr. */
template <int UpLo>
DeviceTriangularView<Scalar, UpLo> triangularView() const {
return DeviceTriangularView<Scalar, UpLo>(*this);
}

/** Self-adjoint view (mutable): d_C.selfadjointView<Lower>().rankUpdate(d_A). */
template <int UpLo>
DeviceSelfAdjointView<Scalar, UpLo> selfadjointView() {
return DeviceSelfAdjointView<Scalar, UpLo>(*this);
}

/** Self-adjoint view (const): d_A.selfadjointView<Lower>() * d_B → SymmExpr. */
template <int UpLo>
ConstDeviceSelfAdjointView<Scalar, UpLo> selfadjointView() const {
return ConstDeviceSelfAdjointView<Scalar, UpLo>(*this);
}

/** Assign from a TRSM expression (thread-local default context). */
template <int UpLo>
DeviceMatrix& operator=(const TrsmExpr<Scalar, UpLo>& expr);

/** Assign from a SYMM expression (thread-local default context). */
template <int UpLo>
DeviceMatrix& operator=(const SymmExpr<Scalar, UpLo>& expr);
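
// Putting the expression API together (a sketch; every form shown maps to
// one of the views or assignment operators declared above):
//   d_C = d_A * d_B;                               // GEMM, thread-local context
//   d_C += d_A.adjoint() * d_B;                    // GEMM with ConjTrans, accumulate
//   d_X = d_A.llt().solve(d_B);                    // Xpotrf + Xpotrs
//   d_X = d_A.triangularView<Lower>().solve(d_B);  // TRSM
//   d_C = d_A.selfadjointView<Lower>() * d_B;      // SYMM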

private:
// ---- Private: adopt a raw device pointer (used by friend solvers) --------

DeviceMatrix(Scalar* device_ptr, Index rows, Index cols) : data_(device_ptr), rows_(rows), cols_(cols) {}

/** Transfer ownership of the device pointer out. Zeros internal state. */
Scalar* release() {
Scalar* p = data_;
data_ = nullptr;
rows_ = 0;
cols_ = 0;
if (ready_event_) {
(void)cudaEventDestroy(ready_event_);
ready_event_ = nullptr;
}
ready_stream_ = nullptr;
return p;
}

// ---- Private helpers -------------------------------------------------------

void ensureEvent() {
if (!ready_event_) {
EIGEN_CUDA_RUNTIME_CHECK(cudaEventCreateWithFlags(&ready_event_, cudaEventDisableTiming));
}
}

void retainBuffer(internal::DeviceBuffer&& buffer) { retained_buffer_ = std::move(buffer); }

// ---- Friend declarations ------------------------------------------------

template <typename, int>
friend class GpuLLT;
template <typename>
friend class GpuLU;
template <typename>
friend class GpuQR;
template <typename>
friend class GpuSVD;
template <typename>
friend class GpuSelfAdjointEigenSolver;

// ---- Data members --------------------------------------------------------

Scalar* data_ = nullptr;
Index rows_ = 0;
Index cols_ = 0;
cudaEvent_t ready_event_ = nullptr; // internal: tracks last write completion
cudaStream_t ready_stream_ = nullptr; // stream that recorded ready_event_ (for same-stream skip)
internal::DeviceBuffer retained_buffer_; // internal: keeps async aux buffers alive
};

} // namespace Eigen

#endif // EIGEN_GPU_DEVICE_MATRIX_H
115
Eigen/src/GPU/DeviceSolverExpr.h
Normal file
@@ -0,0 +1,115 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

// Solver expression types for DeviceMatrix.
//
// Each expression maps 1:1 to cuSOLVER library calls:
//   LltSolveExpr → cusolverDnXpotrf + cusolverDnXpotrs
//   LuSolveExpr → cusolverDnXgetrf + cusolverDnXgetrs
//
// Usage:
//   d_X = d_A.llt().solve(d_B); // Cholesky solve
//   d_X.device(ctx) = d_A.lu().solve(d_B); // LU solve on explicit stream

#ifndef EIGEN_GPU_DEVICE_SOLVER_EXPR_H
#define EIGEN_GPU_DEVICE_SOLVER_EXPR_H

// IWYU pragma: private
#include "./InternalHeaderCheck.h"

namespace Eigen {

// Forward declarations.
template <typename Scalar_>
class DeviceMatrix;
class GpuContext;

// ---- LLT solve expression ---------------------------------------------------
// d_A.llt().solve(d_B) → LltSolveExpr → cusolverDnXpotrf + cusolverDnXpotrs

template <typename Scalar_, int UpLo_ = Lower>
class LltSolveExpr {
public:
using Scalar = Scalar_;
enum { UpLo = UpLo_ };

LltSolveExpr(const DeviceMatrix<Scalar>& A, const DeviceMatrix<Scalar>& B) : A_(A), B_(B) {}
const DeviceMatrix<Scalar>& matrix() const { return A_; }
const DeviceMatrix<Scalar>& rhs() const { return B_; }

private:
const DeviceMatrix<Scalar>& A_;
const DeviceMatrix<Scalar>& B_;
};

// ---- LU solve expression ----------------------------------------------------
// d_A.lu().solve(d_B) → LuSolveExpr → cusolverDnXgetrf + cusolverDnXgetrs

template <typename Scalar_>
class LuSolveExpr {
public:
using Scalar = Scalar_;

LuSolveExpr(const DeviceMatrix<Scalar>& A, const DeviceMatrix<Scalar>& B) : A_(A), B_(B) {}
const DeviceMatrix<Scalar>& matrix() const { return A_; }
const DeviceMatrix<Scalar>& rhs() const { return B_; }

private:
const DeviceMatrix<Scalar>& A_;
const DeviceMatrix<Scalar>& B_;
};

// ---- DeviceLLTView: d_A.llt() → view with .solve() and .device() -----------

template <typename Scalar_, int UpLo_ = Lower>
class DeviceLLTView {
public:
using Scalar = Scalar_;

explicit DeviceLLTView(const DeviceMatrix<Scalar>& m) : mat_(m) {}

/** Build a solve expression: d_A.llt().solve(d_B).
* The expression is evaluated when assigned to a DeviceMatrix. */
LltSolveExpr<Scalar, UpLo_> solve(const DeviceMatrix<Scalar>& rhs) const { return {mat_, rhs}; }

// For cached factorizations, use the explicit GpuLLT API directly:
//   GpuLLT<double> llt;
//   llt.compute(d_A);
//   auto d_X1 = llt.solve(d_B1);
//   auto d_X2 = llt.solve(d_B2);

private:
const DeviceMatrix<Scalar>& mat_;
};

// ---- DeviceLUView: d_A.lu() → view with .solve() and .device() -------------

template <typename Scalar_>
class DeviceLUView {
public:
using Scalar = Scalar_;

explicit DeviceLUView(const DeviceMatrix<Scalar>& m) : mat_(m) {}

/** Build a solve expression: d_A.lu().solve(d_B). */
LuSolveExpr<Scalar> solve(const DeviceMatrix<Scalar>& rhs) const { return {mat_, rhs}; }

// For cached factorizations, use the explicit GpuLU API directly:
//   GpuLU<double> lu;
//   lu.compute(d_A);
//   auto d_X1 = lu.solve(d_B1);
//   auto d_X2 = lu.solve(d_B2);

private:
const DeviceMatrix<Scalar>& mat_;
};

} // namespace Eigen

#endif // EIGEN_GPU_DEVICE_SOLVER_EXPR_H
83
Eigen/src/GPU/GpuContext.h
Normal file
@@ -0,0 +1,83 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

// Unified GPU execution context.
//
// GpuContext owns a CUDA stream and all NVIDIA library handles (cuBLAS,
// cuSOLVER, future cuDSS/cuSPARSE). It is the entry point for all GPU
// operations on DeviceMatrix.
//
// Usage:
//   GpuContext ctx; // explicit context
//   d_C.device(ctx) = d_A * d_B; // GEMM on ctx's stream
//
//   d_C = d_A * d_B; // thread-local default context
//   GpuContext& ctx = GpuContext::threadLocal();

#ifndef EIGEN_GPU_CONTEXT_H
#define EIGEN_GPU_CONTEXT_H

// IWYU pragma: private
#include "./InternalHeaderCheck.h"

#include "./CuBlasSupport.h"
#include "./CuSolverSupport.h"

namespace Eigen {

/** \ingroup GPU_Module
* \class GpuContext
* \brief Unified GPU execution context owning a CUDA stream and library handles.
*
* Each GpuContext instance creates a dedicated CUDA stream, a cuBLAS handle,
* and a cuSOLVER handle, all bound to that stream. Multiple contexts enable
* concurrent execution on independent streams.
*
* A lazily-created thread-local default is available via threadLocal() for
* simple single-stream usage.
*/
class GpuContext {
public:
GpuContext() {
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_));
EIGEN_CUBLAS_CHECK(cublasCreate(&cublas_));
EIGEN_CUBLAS_CHECK(cublasSetStream(cublas_, stream_));
EIGEN_CUSOLVER_CHECK(cusolverDnCreate(&cusolver_));
EIGEN_CUSOLVER_CHECK(cusolverDnSetStream(cusolver_, stream_));
}

~GpuContext() {
if (cusolver_) (void)cusolverDnDestroy(cusolver_);
if (cublas_) (void)cublasDestroy(cublas_);
if (stream_) (void)cudaStreamDestroy(stream_);
}

// Non-copyable, non-movable (owns library handles).
GpuContext(const GpuContext&) = delete;
GpuContext& operator=(const GpuContext&) = delete;

/** Lazily-created thread-local default context. */
static GpuContext& threadLocal() {
thread_local GpuContext ctx;
return ctx;
}

cudaStream_t stream() const { return stream_; }
cublasHandle_t cublasHandle() const { return cublas_; }
cusolverDnHandle_t cusolverHandle() const { return cusolver_; }

private:
cudaStream_t stream_ = nullptr;
cublasHandle_t cublas_ = nullptr;
cusolverDnHandle_t cusolver_ = nullptr;
};
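
// Sketch: two contexts give two independent streams, so unrelated GEMMs can
// overlap on the device (the d_* names are illustrative):
//   GpuContext ctx1, ctx2;
//   d_C1.device(ctx1) = d_A1 * d_B1;   // enqueued on ctx1's stream
//   d_C2.device(ctx2) = d_A2 * d_B2;   // enqueued on ctx2's stream, may overlap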

} // namespace Eigen

#endif // EIGEN_GPU_CONTEXT_H
232
Eigen/src/GPU/GpuEigenSolver.h
Normal file
@@ -0,0 +1,232 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

// GPU self-adjoint eigenvalue decomposition using cuSOLVER.
//
// Wraps cusolverDnXsyevd (symmetric/Hermitian divide-and-conquer).
// Stores eigenvalues and eigenvectors on device.
//
// Usage:
//   GpuSelfAdjointEigenSolver<double> es(A);
//   VectorXd eigenvals = es.eigenvalues();
//   MatrixXd eigenvecs = es.eigenvectors();

#ifndef EIGEN_GPU_EIGENSOLVER_H
#define EIGEN_GPU_EIGENSOLVER_H

// IWYU pragma: private
#include "./InternalHeaderCheck.h"

#include "./CuSolverSupport.h"
#include <vector>

namespace Eigen {

template <typename Scalar_>
class GpuSelfAdjointEigenSolver {
public:
using Scalar = Scalar_;
using RealScalar = typename NumTraits<Scalar>::Real;
using PlainMatrix = Matrix<Scalar, Dynamic, Dynamic, ColMajor>;
using RealVector = Matrix<RealScalar, Dynamic, 1>;

/** Eigenvalue-only or eigenvalues + eigenvectors. */
enum ComputeMode { EigenvaluesOnly, ComputeEigenvectors };
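
// Sketch: EigenvaluesOnly maps to CUSOLVER_EIG_MODE_NOVECTOR in factorize()
// below and leaves eigenvectors() unavailable:
//   GpuSelfAdjointEigenSolver<double> es(A, GpuSelfAdjointEigenSolver<double>::EigenvaluesOnly);
//   VectorXd w = es.eigenvalues();   // fine
//   // es.eigenvectors() would assert: requires ComputeEigenvectors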

GpuSelfAdjointEigenSolver() { init_context(); }

template <typename InputType>
explicit GpuSelfAdjointEigenSolver(const EigenBase<InputType>& A, ComputeMode mode = ComputeEigenvectors) {
init_context();
compute(A, mode);
}

~GpuSelfAdjointEigenSolver() {
if (handle_) (void)cusolverDnDestroy(handle_);
if (stream_) (void)cudaStreamDestroy(stream_);
}

GpuSelfAdjointEigenSolver(const GpuSelfAdjointEigenSolver&) = delete;
GpuSelfAdjointEigenSolver& operator=(const GpuSelfAdjointEigenSolver&) = delete;

// ---- Factorization -------------------------------------------------------

template <typename InputType>
GpuSelfAdjointEigenSolver& compute(const EigenBase<InputType>& A, ComputeMode mode = ComputeEigenvectors) {
eigen_assert(A.rows() == A.cols() && "GpuSelfAdjointEigenSolver requires a square matrix");
mode_ = mode;
n_ = A.rows();
info_ = InvalidInput;
info_synced_ = false;

if (n_ == 0) {
info_ = Success;
info_synced_ = true;
return *this;
}

const PlainMatrix mat(A.derived());
lda_ = static_cast<int64_t>(n_);
const size_t mat_bytes = static_cast<size_t>(lda_) * static_cast<size_t>(n_) * sizeof(Scalar);

// syevd overwrites A with eigenvectors (if requested).
d_A_ = internal::DeviceBuffer(mat_bytes);
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_A_.ptr, mat.data(), mat_bytes, cudaMemcpyHostToDevice, stream_));
factorize();
return *this;
}

GpuSelfAdjointEigenSolver& compute(const DeviceMatrix<Scalar>& d_A, ComputeMode mode = ComputeEigenvectors) {
eigen_assert(d_A.rows() == d_A.cols() && "GpuSelfAdjointEigenSolver requires a square matrix");
mode_ = mode;
n_ = d_A.rows();
info_ = InvalidInput;
info_synced_ = false;

if (n_ == 0) {
info_ = Success;
info_synced_ = true;
return *this;
}

d_A.waitReady(stream_);
lda_ = static_cast<int64_t>(n_);
const size_t mat_bytes = static_cast<size_t>(lda_) * static_cast<size_t>(n_) * sizeof(Scalar);

d_A_ = internal::DeviceBuffer(mat_bytes);
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_A_.ptr, d_A.data(), mat_bytes, cudaMemcpyDeviceToDevice, stream_));

factorize();
return *this;
}

// ---- Accessors -----------------------------------------------------------

ComputationInfo info() const {
sync_info();
return info_;
}

Index cols() const { return n_; }
Index rows() const { return n_; }

// TODO: Add device-side accessors (deviceEigenvalues(), deviceEigenvectors())
// returning DeviceMatrix views of the internal buffers, so users can chain
// GPU operations without round-tripping through host memory.

/** Eigenvalues in ascending order. Downloads from device. */
RealVector eigenvalues() const {
sync_info();
eigen_assert(info_ == Success);
RealVector W(n_);
if (n_ > 0) {
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpy(W.data(), d_W_.ptr, static_cast<size_t>(n_) * sizeof(RealScalar), cudaMemcpyDeviceToHost));
}
return W;
}

/** Eigenvectors (columns). Downloads from device.
* Requires ComputeEigenvectors mode. */
PlainMatrix eigenvectors() const {
sync_info();
eigen_assert(info_ == Success);
eigen_assert(mode_ == ComputeEigenvectors && "eigenvectors() requires ComputeEigenvectors mode");
PlainMatrix V(n_, n_);
if (n_ > 0) {
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy(V.data(), d_A_.ptr,
static_cast<size_t>(lda_) * static_cast<size_t>(n_) * sizeof(Scalar),
cudaMemcpyDeviceToHost));
}
return V;
}

cudaStream_t stream() const { return stream_; }

private:
cudaStream_t stream_ = nullptr;
cusolverDnHandle_t handle_ = nullptr;
internal::CusolverParams params_;
internal::DeviceBuffer d_A_; // overwritten with eigenvectors by syevd
internal::DeviceBuffer d_W_; // eigenvalues (RealScalar, length n)
internal::DeviceBuffer d_scratch_; // workspace + info
size_t scratch_size_ = 0;
std::vector<char> h_workspace_;
ComputeMode mode_ = ComputeEigenvectors;
Index n_ = 0;
int64_t lda_ = 0;
ComputationInfo info_ = InvalidInput;
int info_word_ = 0;
bool info_synced_ = true;

void init_context() {
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_));
EIGEN_CUSOLVER_CHECK(cusolverDnCreate(&handle_));
EIGEN_CUSOLVER_CHECK(cusolverDnSetStream(handle_, stream_));
ensure_scratch(0);
}

void ensure_scratch(size_t workspace_bytes) {
constexpr size_t kAlign = 16;
workspace_bytes = (workspace_bytes + kAlign - 1) & ~(kAlign - 1);
size_t needed = workspace_bytes + sizeof(int);
if (needed > scratch_size_) {
if (d_scratch_.ptr) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
d_scratch_ = internal::DeviceBuffer(needed);
scratch_size_ = needed;
}
}

void* scratch_workspace() const { return d_scratch_.ptr; }
int* scratch_info() const {
return reinterpret_cast<int*>(static_cast<char*>(d_scratch_.ptr) + scratch_size_ - sizeof(int));
}

void sync_info() const {
if (!info_synced_) {
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
const_cast<GpuSelfAdjointEigenSolver*>(this)->info_ = (info_word_ == 0) ? Success : NumericalIssue;
const_cast<GpuSelfAdjointEigenSolver*>(this)->info_synced_ = true;
}
}

void factorize() {
constexpr cudaDataType_t dtype = internal::cusolver_data_type<Scalar>::value;
constexpr cudaDataType_t rtype = internal::cuda_data_type<RealScalar>::value;

info_synced_ = false;
info_ = InvalidInput;

d_W_ = internal::DeviceBuffer(static_cast<size_t>(n_) * sizeof(RealScalar));

const cusolverEigMode_t jobz =
(mode_ == ComputeEigenvectors) ? CUSOLVER_EIG_MODE_VECTOR : CUSOLVER_EIG_MODE_NOVECTOR;

// Use lower triangle (standard convention).
constexpr cublasFillMode_t uplo = CUBLAS_FILL_MODE_LOWER;

size_t dev_ws = 0, host_ws = 0;
EIGEN_CUSOLVER_CHECK(cusolverDnXsyevd_bufferSize(handle_, params_.p, jobz, uplo, static_cast<int64_t>(n_), dtype,
d_A_.ptr, lda_, rtype, d_W_.ptr, dtype, &dev_ws, &host_ws));

ensure_scratch(dev_ws);
h_workspace_.resize(host_ws);

EIGEN_CUSOLVER_CHECK(cusolverDnXsyevd(handle_, params_.p, jobz, uplo, static_cast<int64_t>(n_), dtype, d_A_.ptr,
lda_, rtype, d_W_.ptr, dtype, scratch_workspace(), dev_ws,
host_ws > 0 ? h_workspace_.data() : nullptr, host_ws, scratch_info()));

EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(&info_word_, scratch_info(), sizeof(int), cudaMemcpyDeviceToHost, stream_));
}
};

} // namespace Eigen

#endif // EIGEN_GPU_EIGENSOLVER_H
308
Eigen/src/GPU/GpuFFT.h
Normal file
@@ -0,0 +1,308 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

// GPU FFT via cuFFT.
//
// Standalone GPU FFT class with plan caching. Supports 1D and 2D transforms:
// C2C (complex-to-complex), R2C (real-to-complex), C2R (complex-to-real).
//
// Inverse transforms are scaled by 1/n (1D) or 1/(n*m) (2D) so that
// inv(fwd(x)) == x, matching Eigen's FFT convention.
//
// cuFFT plans are cached by (size, type) and reused across calls.
//
// Usage:
//   GpuFFT<float> fft;
//   VectorXcf X = fft.fwd(x); // 1D C2C or R2C
//   VectorXcf y = fft.inv(X); // 1D C2C inverse
//   VectorXf r = fft.invReal(X, n); // 1D C2R inverse
//   MatrixXcf B = fft.fwd2d(A); // 2D C2C forward
//   MatrixXcf C = fft.inv2d(B); // 2D C2C inverse

#ifndef EIGEN_GPU_FFT_H
#define EIGEN_GPU_FFT_H

// IWYU pragma: private
#include "./InternalHeaderCheck.h"

#include "./CuFftSupport.h"
#include "./CuBlasSupport.h"
#include <map>

namespace Eigen {

template <typename Scalar_>
class GpuFFT {
public:
using Scalar = Scalar_;
using Complex = std::complex<Scalar>;
using ComplexVector = Matrix<Complex, Dynamic, 1>;
using RealVector = Matrix<Scalar, Dynamic, 1>;
using ComplexMatrix = Matrix<Complex, Dynamic, Dynamic, ColMajor>;

GpuFFT() {
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_));
EIGEN_CUBLAS_CHECK(cublasCreate(&cublas_));
EIGEN_CUBLAS_CHECK(cublasSetStream(cublas_, stream_));
}

~GpuFFT() {
for (auto& kv : plans_) (void)cufftDestroy(kv.second);
if (cublas_) (void)cublasDestroy(cublas_);
if (stream_) (void)cudaStreamDestroy(stream_);
}

GpuFFT(const GpuFFT&) = delete;
GpuFFT& operator=(const GpuFFT&) = delete;

// ---- 1D Complex-to-Complex ------------------------------------------------

/** Forward 1D C2C FFT. */
template <typename Derived>
ComplexVector fwd(const MatrixBase<Derived>& x,
typename std::enable_if<NumTraits<typename Derived::Scalar>::IsComplex>::type* = nullptr) {
const ComplexVector input(x.derived());
const int n = static_cast<int>(input.size());
if (n == 0) return ComplexVector(0);

ensure_buffers(n * sizeof(Complex), n * sizeof(Complex));
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(d_in_.ptr, input.data(), n * sizeof(Complex), cudaMemcpyHostToDevice, stream_));

cufftHandle plan = get_plan_1d(n, internal::cufft_c2c_type<Scalar>::value);
EIGEN_CUFFT_CHECK(internal::cufftExecC2C_dispatch(plan, static_cast<Complex*>(d_in_.ptr),
static_cast<Complex*>(d_out_.ptr), CUFFT_FORWARD));

ComplexVector result(n);
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(result.data(), d_out_.ptr, n * sizeof(Complex), cudaMemcpyDeviceToHost, stream_));
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
return result;
}

/** Inverse 1D C2C FFT. Scaled by 1/n. */
template <typename Derived>
ComplexVector inv(const MatrixBase<Derived>& X) {
static_assert(NumTraits<typename Derived::Scalar>::IsComplex, "inv() requires complex input");
const ComplexVector input(X.derived());
const int n = static_cast<int>(input.size());
if (n == 0) return ComplexVector(0);

ensure_buffers(n * sizeof(Complex), n * sizeof(Complex));
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(d_in_.ptr, input.data(), n * sizeof(Complex), cudaMemcpyHostToDevice, stream_));

cufftHandle plan = get_plan_1d(n, internal::cufft_c2c_type<Scalar>::value);
EIGEN_CUFFT_CHECK(internal::cufftExecC2C_dispatch(plan, static_cast<Complex*>(d_in_.ptr),
static_cast<Complex*>(d_out_.ptr), CUFFT_INVERSE));

// Scale by 1/n.
scale_device(static_cast<Complex*>(d_out_.ptr), n, Scalar(1) / Scalar(n));

ComplexVector result(n);
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(result.data(), d_out_.ptr, n * sizeof(Complex), cudaMemcpyDeviceToHost, stream_));
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
return result;
}

// ---- 1D Real-to-Complex ---------------------------------------------------

/** Forward 1D R2C FFT. Returns n/2+1 complex values (half-spectrum). */
template <typename Derived>
ComplexVector fwd(const MatrixBase<Derived>& x,
typename std::enable_if<!NumTraits<typename Derived::Scalar>::IsComplex>::type* = nullptr) {
const RealVector input(x.derived());
const int n = static_cast<int>(input.size());
if (n == 0) return ComplexVector(0);

const int n_complex = n / 2 + 1;
ensure_buffers(n * sizeof(Scalar), n_complex * sizeof(Complex));
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(d_in_.ptr, input.data(), n * sizeof(Scalar), cudaMemcpyHostToDevice, stream_));

cufftHandle plan = get_plan_1d(n, internal::cufft_r2c_type<Scalar>::value);
EIGEN_CUFFT_CHECK(
internal::cufftExecR2C_dispatch(plan, static_cast<Scalar*>(d_in_.ptr), static_cast<Complex*>(d_out_.ptr)));

ComplexVector result(n_complex);
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(result.data(), d_out_.ptr, n_complex * sizeof(Complex), cudaMemcpyDeviceToHost, stream_));
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
return result;
}

// ---- 1D Complex-to-Real ---------------------------------------------------

/** Inverse 1D C2R FFT. Input is n/2+1 complex values, output is nfft real values.
* Scaled by 1/nfft. Caller must specify nfft (original real signal length). */
template <typename Derived>
RealVector invReal(const MatrixBase<Derived>& X, Index nfft) {
static_assert(NumTraits<typename Derived::Scalar>::IsComplex, "invReal() requires complex input");
const ComplexVector input(X.derived());
const int n = static_cast<int>(nfft);
const int n_complex = n / 2 + 1;
eigen_assert(input.size() == n_complex);
if (n == 0) return RealVector(0);

ensure_buffers(n_complex * sizeof(Complex), n * sizeof(Scalar));
// cuFFT C2R may overwrite the input, so we copy to d_in_.
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(d_in_.ptr, input.data(), n_complex * sizeof(Complex), cudaMemcpyHostToDevice, stream_));

cufftHandle plan = get_plan_1d(n, internal::cufft_c2r_type<Scalar>::value);
EIGEN_CUFFT_CHECK(
internal::cufftExecC2R_dispatch(plan, static_cast<Complex*>(d_in_.ptr), static_cast<Scalar*>(d_out_.ptr)));

// Scale by 1/n.
scale_device_real(static_cast<Scalar*>(d_out_.ptr), n, Scalar(1) / Scalar(n));

RealVector result(n);
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(result.data(), d_out_.ptr, n * sizeof(Scalar), cudaMemcpyDeviceToHost, stream_));
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
return result;
}
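
// Round-trip sketch showing why nfft must be passed explicitly: for real
// input of length n, fwd() returns n/2+1 coefficients, and both n = 8 and
// n = 9 produce 5 coefficients, so the output length cannot be inferred:
//   VectorXf x = VectorXf::Random(9);
//   VectorXcf X = fft.fwd(x);          // 9/2 + 1 == 5 complex values
//   VectorXf y = fft.invReal(X, 9);    // must pass nfft == 9 to recover x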

// ---- 2D Complex-to-Complex ------------------------------------------------

/** Forward 2D C2C FFT. Input and output are rows x cols complex matrices. */
template <typename Derived>
ComplexMatrix fwd2d(const MatrixBase<Derived>& A) {
static_assert(NumTraits<typename Derived::Scalar>::IsComplex, "fwd2d() requires complex input");
const ComplexMatrix input(A.derived());
const int rows = static_cast<int>(input.rows());
const int cols = static_cast<int>(input.cols());
if (rows == 0 || cols == 0) return ComplexMatrix(rows, cols);

const size_t total = static_cast<size_t>(rows) * static_cast<size_t>(cols) * sizeof(Complex);
ensure_buffers(total, total);
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_in_.ptr, input.data(), total, cudaMemcpyHostToDevice, stream_));

cufftHandle plan = get_plan_2d(rows, cols, internal::cufft_c2c_type<Scalar>::value);
EIGEN_CUFFT_CHECK(internal::cufftExecC2C_dispatch(plan, static_cast<Complex*>(d_in_.ptr),
static_cast<Complex*>(d_out_.ptr), CUFFT_FORWARD));

ComplexMatrix result(rows, cols);
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(result.data(), d_out_.ptr, total, cudaMemcpyDeviceToHost, stream_));
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
return result;
}

/** Inverse 2D C2C FFT. Scaled by 1/(rows*cols). */
template <typename Derived>
ComplexMatrix inv2d(const MatrixBase<Derived>& A) {
static_assert(NumTraits<typename Derived::Scalar>::IsComplex, "inv2d() requires complex input");
const ComplexMatrix input(A.derived());
const int rows = static_cast<int>(input.rows());
const int cols = static_cast<int>(input.cols());
if (rows == 0 || cols == 0) return ComplexMatrix(rows, cols);

const size_t total = static_cast<size_t>(rows) * static_cast<size_t>(cols) * sizeof(Complex);
ensure_buffers(total, total);
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_in_.ptr, input.data(), total, cudaMemcpyHostToDevice, stream_));

cufftHandle plan = get_plan_2d(rows, cols, internal::cufft_c2c_type<Scalar>::value);
EIGEN_CUFFT_CHECK(internal::cufftExecC2C_dispatch(plan, static_cast<Complex*>(d_in_.ptr),
static_cast<Complex*>(d_out_.ptr), CUFFT_INVERSE));

// Scale by 1/(rows*cols).
const int total_elems = rows * cols;
scale_device(static_cast<Complex*>(d_out_.ptr), total_elems, Scalar(1) / Scalar(total_elems));

ComplexMatrix result(rows, cols);
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(result.data(), d_out_.ptr, total, cudaMemcpyDeviceToHost, stream_));
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
return result;
}

// ---- Accessors ------------------------------------------------------------

cudaStream_t stream() const { return stream_; }

private:
cudaStream_t stream_ = nullptr;
cublasHandle_t cublas_ = nullptr;
std::map<int64_t, cufftHandle> plans_;
internal::DeviceBuffer d_in_;
internal::DeviceBuffer d_out_;
size_t d_in_size_ = 0;
size_t d_out_size_ = 0;

void ensure_buffers(size_t in_bytes, size_t out_bytes) {
if (in_bytes > d_in_size_) {
if (d_in_.ptr) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
d_in_ = internal::DeviceBuffer(in_bytes);
d_in_size_ = in_bytes;
}
if (out_bytes > d_out_size_) {
if (d_out_.ptr) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
d_out_ = internal::DeviceBuffer(out_bytes);
d_out_size_ = out_bytes;
}
}

// Plan key encoding: rank (1 bit) | type (8 bits) | dims.
// cufftType values (e.g. CUFFT_Z2Z == 0x69) do not fit in 4 bits, so the
// dims start at bit 9; packing them at bit 5 would let type and size collide.
static int64_t plan_key_1d(int n, cufftType type) { return (int64_t(n) << 9) | (int64_t(type) << 1) | 0; }

static int64_t plan_key_2d(int rows, int cols, cufftType type) {
return (int64_t(rows) << 39) | (int64_t(cols) << 9) | (int64_t(type) << 1) | 1;
}
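
// Worked example of the key layout: a 1D single-precision C2C plan of length
// n = 1024 gets the key (1024 << 9) | (CUFFT_C2C << 1) | 0. cufftType values
// fit in 8 bits (the largest, CUFFT_Z2D, is 0x6c), so rank, type, and dims
// occupy disjoint bit ranges and distinct plans receive distinct keys.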

cufftHandle get_plan_1d(int n, cufftType type) {
int64_t key = plan_key_1d(n, type);
auto it = plans_.find(key);
if (it != plans_.end()) return it->second;

cufftHandle plan;
EIGEN_CUFFT_CHECK(cufftPlan1d(&plan, n, type, /*batch=*/1));
EIGEN_CUFFT_CHECK(cufftSetStream(plan, stream_));
plans_[key] = plan;
return plan;
}

cufftHandle get_plan_2d(int rows, int cols, cufftType type) {
int64_t key = plan_key_2d(rows, cols, type);
auto it = plans_.find(key);
if (it != plans_.end()) return it->second;

// cuFFT uses row-major (C order) for 2D: first dim = rows, second = cols.
// Eigen matrices are column-major, so we pass (cols, rows) to cuFFT
// to get the correct 2D transform.
cufftHandle plan;
EIGEN_CUFFT_CHECK(cufftPlan2d(&plan, cols, rows, type));
EIGEN_CUFFT_CHECK(cufftSetStream(plan, stream_));
plans_[key] = plan;
return plan;
}

// Scale complex array on device using cuBLAS scal.
void scale_device(Complex* d_ptr, int n, Scalar alpha) { scale_complex(cublas_, d_ptr, n, alpha); }

// Scale real array on device using cuBLAS scal.
void scale_device_real(Scalar* d_ptr, int n, Scalar alpha) { scale_real(cublas_, d_ptr, n, alpha); }

// Type-dispatched cuBLAS scal wrappers (C++14 compatible).
static void scale_complex(cublasHandle_t h, std::complex<float>* p, int n, float a) {
EIGEN_CUBLAS_CHECK(cublasCsscal(h, n, &a, reinterpret_cast<cuComplex*>(p), 1));
}
static void scale_complex(cublasHandle_t h, std::complex<double>* p, int n, double a) {
EIGEN_CUBLAS_CHECK(cublasZdscal(h, n, &a, reinterpret_cast<cuDoubleComplex*>(p), 1));
}
static void scale_real(cublasHandle_t h, float* p, int n, float a) {
EIGEN_CUBLAS_CHECK(cublasSscal(h, n, &a, p, 1));
}
static void scale_real(cublasHandle_t h, double* p, int n, double a) {
EIGEN_CUBLAS_CHECK(cublasDscal(h, n, &a, p, 1));
}
};

} // namespace Eigen

#endif // EIGEN_GPU_FFT_H
385
Eigen/src/GPU/GpuLLT.h
Normal file
@@ -0,0 +1,385 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Eigen Authors
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

// GPU Cholesky (LLT) decomposition using cuSOLVER.
//
// Unlike Eigen's CPU LLT<MatrixType>, GpuLLT keeps the Cholesky factor in
// device memory for the lifetime of the object. Multiple solves against the
// same factor therefore only transfer the RHS and solution vectors, not the
// factor itself.
//
// Requires CUDA 11.4+ (cusolverDnXpotrf / cusolverDnXpotrs generic API and
// cudaMallocAsync).
//
// Usage:
//   GpuLLT<double> llt(A); // upload A, potrf, L stays on device
//   if (llt.info() != Success) { ... }
//   MatrixXd x1 = llt.solve(b1); // potrs, only b1 transferred
//   MatrixXd x2 = llt.solve(b2); // L already on device

#ifndef EIGEN_GPU_LLT_H
#define EIGEN_GPU_LLT_H

// IWYU pragma: private
#include "./InternalHeaderCheck.h"

#include "./CuSolverSupport.h"
#include <vector>

namespace Eigen {

/** \ingroup GPU_Module
* \class GpuLLT
* \brief GPU Cholesky (LL^T) decomposition via cuSOLVER
*
* \tparam Scalar_ Element type: float, double, complex<float>, complex<double>
* \tparam UpLo_ Triangle used: Lower (default) or Upper
*
* Factorizes a symmetric positive-definite matrix A = LL^H on the GPU and
* caches the factor L in device memory. Each subsequent solve(B) uploads only
* B, calls cusolverDnXpotrs, and downloads the result — the factor is not
* re-transferred.
*
* Each GpuLLT object owns a dedicated CUDA stream and cuSOLVER handle,
* enabling concurrent factorizations from multiple objects on the same host
* thread.
*/
template <typename Scalar_, int UpLo_ = Lower>
class GpuLLT {
public:
using Scalar = Scalar_;
using RealScalar = typename NumTraits<Scalar>::Real;
using PlainMatrix = Matrix<Scalar, Dynamic, Dynamic, ColMajor>;

enum { UpLo = UpLo_ };

// ---- Construction / destruction ------------------------------------------

/** Default constructor. Does not factorize; call compute() before solve(). */
GpuLLT() { init_context(); }

/** Factor A immediately. Equivalent to GpuLLT llt; llt.compute(A). */
template <typename InputType>
explicit GpuLLT(const EigenBase<InputType>& A) {
init_context();
compute(A);
}

~GpuLLT() {
// Ignore errors in destructors — cannot propagate.
if (handle_) (void)cusolverDnDestroy(handle_);
if (stream_) (void)cudaStreamDestroy(stream_);
}

// Non-copyable (owns device memory and library handles).
GpuLLT(const GpuLLT&) = delete;
GpuLLT& operator=(const GpuLLT&) = delete;

// Movable.
GpuLLT(GpuLLT&& o) noexcept
: stream_(o.stream_),
handle_(o.handle_),
params_(std::move(o.params_)),
d_factor_(std::move(o.d_factor_)),
factor_alloc_size_(o.factor_alloc_size_),
d_scratch_(std::move(o.d_scratch_)),
scratch_size_(o.scratch_size_),
h_workspace_(std::move(o.h_workspace_)),
n_(o.n_),
lda_(o.lda_),
info_(o.info_),
info_word_(o.info_word_),
info_synced_(o.info_synced_) {
o.stream_ = nullptr;
o.handle_ = nullptr;
o.factor_alloc_size_ = 0;
o.scratch_size_ = 0;
o.n_ = 0;
o.info_ = InvalidInput;
o.info_word_ = 0;
o.info_synced_ = true;
}

GpuLLT& operator=(GpuLLT&& o) noexcept {
if (this != &o) {
if (handle_) (void)cusolverDnDestroy(handle_);
if (stream_) (void)cudaStreamDestroy(stream_);
stream_ = o.stream_;
handle_ = o.handle_;
params_ = std::move(o.params_);
d_factor_ = std::move(o.d_factor_);
factor_alloc_size_ = o.factor_alloc_size_;
d_scratch_ = std::move(o.d_scratch_);
scratch_size_ = o.scratch_size_;
h_workspace_ = std::move(o.h_workspace_);
n_ = o.n_;
lda_ = o.lda_;
info_ = o.info_;
info_word_ = o.info_word_;
info_synced_ = o.info_synced_;
o.stream_ = nullptr;
o.handle_ = nullptr;
o.factor_alloc_size_ = 0;
o.scratch_size_ = 0;
o.n_ = 0;
o.info_ = InvalidInput;
o.info_word_ = 0;
o.info_synced_ = true;
}
return *this;
}

// ---- Factorization -------------------------------------------------------

/** Compute the Cholesky factorization of A (host matrix).
*
* Uploads A to device memory, calls cusolverDnXpotrf, and retains the
* factored matrix on device. Any previous factorization is overwritten.
*/
template <typename InputType>
GpuLLT& compute(const EigenBase<InputType>& A) {
eigen_assert(A.rows() == A.cols());
if (!begin_compute(A.rows())) return *this;

// Evaluate A into a contiguous ColMajor matrix (handles arbitrary expressions).
const PlainMatrix mat(A.derived());
lda_ = static_cast<int64_t>(mat.rows());
allocate_factor_storage();
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(d_factor_.ptr, mat.data(), factorBytes(), cudaMemcpyHostToDevice, stream_));

factorize();
return *this;
}

/** Compute the Cholesky factorization from a device-resident matrix (D2D copy). */
GpuLLT& compute(const DeviceMatrix<Scalar>& d_A) {
eigen_assert(d_A.rows() == d_A.cols());
if (!begin_compute(d_A.rows())) return *this;

lda_ = static_cast<int64_t>(d_A.rows());
d_A.waitReady(stream_);
allocate_factor_storage();
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(d_factor_.ptr, d_A.data(), factorBytes(), cudaMemcpyDeviceToDevice, stream_));

factorize();
return *this;
}

/** Compute the Cholesky factorization from a device matrix (move, no copy). */
GpuLLT& compute(DeviceMatrix<Scalar>&& d_A) {
eigen_assert(d_A.rows() == d_A.cols());
if (!begin_compute(d_A.rows())) return *this;

lda_ = static_cast<int64_t>(d_A.rows());
d_A.waitReady(stream_);
d_factor_ = internal::DeviceBuffer::adopt(static_cast<void*>(d_A.release()));

factorize();
return *this;
}

// ---- Solve ---------------------------------------------------------------

/** Solve A * X = B using the cached Cholesky factor (host → host).
*
* Uploads B to device memory, calls cusolverDnXpotrs using the factor
* retained from compute(), and returns the solution X on the host.
* The factor is not re-transferred; only B goes up and X comes down.
*
* \pre compute() must have been called and info() == Success.
* \returns X such that A * X ≈ B
*/
template <typename Rhs>
PlainMatrix solve(const MatrixBase<Rhs>& B) const {
const_cast<GpuLLT*>(this)->sync_info();
eigen_assert(info_ == Success && "GpuLLT::solve called on a failed or uninitialized factorization");
eigen_assert(B.rows() == n_);

const PlainMatrix rhs(B);
const int64_t nrhs = static_cast<int64_t>(rhs.cols());
const int64_t ldb = static_cast<int64_t>(rhs.rows());
DeviceMatrix<Scalar> d_X = solve_impl(nrhs, ldb, [&](Scalar* d_x_ptr) {
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(d_x_ptr, rhs.data(), rhsBytes(nrhs, ldb), cudaMemcpyHostToDevice, stream_));
});

PlainMatrix X(n_, B.cols());
int solve_info = 0;
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(X.data(), d_X.data(), rhsBytes(nrhs, ldb), cudaMemcpyDeviceToHost, stream_));
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(&solve_info, scratch_info(), sizeof(int), cudaMemcpyDeviceToHost, stream_));
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));

eigen_assert(solve_info == 0 && "cusolverDnXpotrs reported an error");
return X;
}

/** Solve A * X = B with device-resident RHS. Fully async.
*
* All work is enqueued on this solver's stream. Returns a DeviceMatrix
* with a recorded ready event — no host synchronization occurs.
* The caller should check info() after compute() to verify the
* factorization succeeded; this method does not check.
*/
DeviceMatrix<Scalar> solve(const DeviceMatrix<Scalar>& d_B) const {
eigen_assert(d_B.rows() == n_);
d_B.waitReady(stream_);
const int64_t nrhs = static_cast<int64_t>(d_B.cols());
const int64_t ldb = static_cast<int64_t>(d_B.rows());
return solve_impl(nrhs, ldb, [&](Scalar* d_x_ptr) {
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(d_x_ptr, d_B.data(), rhsBytes(nrhs, ldb), cudaMemcpyDeviceToDevice, stream_));
});
}
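
// Fully asynchronous pipeline sketch (the only host syncs are the lazy
// info() check and the final download; the d_* names are illustrative):
//   GpuLLT<double> llt;
//   llt.compute(std::move(d_A));               // potrf enqueued, A's buffer adopted
//   if (llt.info() != Success) { /* ... */ }   // lazy sync of the info word
//   DeviceMatrix<double> d_X = llt.solve(d_B); // potrs enqueued, event recorded
//   MatrixXd h_X = d_X.toHost(llt.stream());   // D2H on the solver's stream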

// ---- Accessors -----------------------------------------------------------

/** Returns Success if the last compute() succeeded, NumericalIssue otherwise.
* Lazily synchronizes the stream on first call after compute(). */
ComputationInfo info() const {
const_cast<GpuLLT*>(this)->sync_info();
return info_;
}

Index rows() const { return n_; }
Index cols() const { return n_; }

/** Returns the CUDA stream owned by this object.
* Advanced users may submit additional GPU work on this stream
* to overlap with or chain after GpuLLT operations. */
cudaStream_t stream() const { return stream_; }

private:
cudaStream_t stream_ = nullptr;
cusolverDnHandle_t handle_ = nullptr;
internal::CusolverParams params_; // cuSOLVER params (created once, reused)
internal::DeviceBuffer d_factor_; // factored L (or U) on device (grows, never shrinks)
size_t factor_alloc_size_ = 0; // current d_factor_ allocation size
internal::DeviceBuffer d_scratch_; // combined workspace + info word (grows, never shrinks)
size_t scratch_size_ = 0; // current scratch allocation size
std::vector<char> h_workspace_; // host workspace (kept alive until next compute)
Index n_ = 0;
int64_t lda_ = 0;
ComputationInfo info_ = InvalidInput;
int info_word_ = 0; // host-side target for async info download
bool info_synced_ = true; // has the stream been synced for info?

bool begin_compute(Index rows) {
n_ = rows;
info_ = InvalidInput;
if (n_ == 0) {
info_ = Success;
return false;
}
return true;
}

size_t factorBytes() const { return rhsBytes(static_cast<int64_t>(n_), lda_); }

static size_t rhsBytes(int64_t cols, int64_t outer_stride) {
return static_cast<size_t>(outer_stride) * static_cast<size_t>(cols) * sizeof(Scalar);
}

void allocate_factor_storage() {
size_t needed = factorBytes();
if (needed > factor_alloc_size_) {
d_factor_ = internal::DeviceBuffer(needed);
factor_alloc_size_ = needed;
}
}

// Ensure d_scratch_ can hold workspace_bytes plus an aligned info word.
// Layout: [workspace (workspace_bytes) | info_word (sizeof(int))].
// Grows but never shrinks. Syncs the stream before reallocating to
// avoid freeing memory that async kernels may still be using.
void ensure_scratch(size_t workspace_bytes) {
// Round up so the info word is naturally aligned.
// 16-byte alignment for optimal GPU memory access.
constexpr size_t kAlign = 16;
workspace_bytes = (workspace_bytes + kAlign - 1) & ~(kAlign - 1);
size_t needed = workspace_bytes + sizeof(int);
if (needed > scratch_size_) {
if (d_scratch_.ptr) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
d_scratch_ = internal::DeviceBuffer(needed);
scratch_size_ = needed;
}
}
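
// Worked example of the rounding above: workspace_bytes = 40 rounds up to
// 48 (the next multiple of kAlign = 16), so needed = 48 + sizeof(int) = 52
// and scratch_info() lands at offset 48, keeping the int naturally aligned.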

void* scratch_workspace() const { return d_scratch_.ptr; }
int* scratch_info() const {
return reinterpret_cast<int*>(static_cast<char*>(d_scratch_.ptr) + scratch_size_ - sizeof(int));
}

template <typename CopyRhs>
DeviceMatrix<Scalar> solve_impl(int64_t nrhs, int64_t ldb, CopyRhs&& copy_rhs) const {
constexpr cudaDataType_t dtype = internal::cusolver_data_type<Scalar>::value;
constexpr cublasFillMode_t uplo = internal::cusolver_fill_mode<UpLo_, ColMajor>::value;

Scalar* d_x_ptr = nullptr;
EIGEN_CUDA_RUNTIME_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_x_ptr), rhsBytes(nrhs, ldb)));
copy_rhs(d_x_ptr);

EIGEN_CUSOLVER_CHECK(cusolverDnXpotrs(handle_, params_.p, uplo, static_cast<int64_t>(n_), nrhs, dtype,
d_factor_.ptr, lda_, dtype, d_x_ptr, ldb, scratch_info()));

DeviceMatrix<Scalar> result(d_x_ptr, n_, static_cast<Index>(nrhs));
result.recordReady(stream_);
return result;
}

void init_context() {
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_));
EIGEN_CUSOLVER_CHECK(cusolverDnCreate(&handle_));
EIGEN_CUSOLVER_CHECK(cusolverDnSetStream(handle_, stream_));
ensure_scratch(0); // allocate at least the info word
}

// Synchronize stream and interpret the info word. No-op if already synced.
void sync_info() {
if (!info_synced_) {
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
info_ = (info_word_ == 0) ? Success : NumericalIssue;
info_synced_ = true;
}
}

// Run cusolverDnXpotrf on d_factor_ (already on device).
// Enqueues factorization + async info download. Does NOT sync.
// Workspaces are stored as members to ensure they outlive the async kernels.
void factorize() {
constexpr cudaDataType_t dtype = internal::cusolver_data_type<Scalar>::value;
constexpr cublasFillMode_t uplo = internal::cusolver_fill_mode<UpLo_, ColMajor>::value;

info_synced_ = false;
info_ = InvalidInput;

size_t dev_ws_bytes = 0, host_ws_bytes = 0;
EIGEN_CUSOLVER_CHECK(cusolverDnXpotrf_bufferSize(handle_, params_.p, uplo, static_cast<int64_t>(n_), dtype,
d_factor_.ptr, lda_, dtype, &dev_ws_bytes, &host_ws_bytes));

ensure_scratch(dev_ws_bytes);
h_workspace_.resize(host_ws_bytes);

EIGEN_CUSOLVER_CHECK(cusolverDnXpotrf(
handle_, params_.p, uplo, static_cast<int64_t>(n_), dtype, d_factor_.ptr, lda_, dtype, scratch_workspace(),
dev_ws_bytes, host_ws_bytes > 0 ? h_workspace_.data() : nullptr, host_ws_bytes, scratch_info()));

// Enqueue async download of info word — sync deferred to info() or solve().
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(&info_word_, scratch_info(), sizeof(int), cudaMemcpyDeviceToHost, stream_));
}
};

} // namespace Eigen

#endif // EIGEN_GPU_LLT_H
371
Eigen/src/GPU/GpuLU.h
Normal file
@@ -0,0 +1,371 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Eigen Authors
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

// GPU partial-pivoting LU decomposition using cuSOLVER.
//
// Wraps cusolverDnXgetrf (factorization) and cusolverDnXgetrs (solve).
// The factored LU matrix and pivot array are kept in device memory for the
// lifetime of the object, so repeated solves only transfer the RHS/solution.
//
// Requires CUDA 11.0+ (cusolverDnX generic API).
//
// Usage:
//   GpuLU<double> lu(A); // upload A, getrf, LU+ipiv on device
//   if (lu.info() != Success) { ... }
//   MatrixXd x = lu.solve(b); // getrs NoTrans, only b transferred
//   MatrixXd xt = lu.solve(b, GpuLU<double>::Transpose); // A^T x = b

#ifndef EIGEN_GPU_LU_H
#define EIGEN_GPU_LU_H

// IWYU pragma: private
#include "./InternalHeaderCheck.h"

#include "./CuSolverSupport.h"
#include <vector>

namespace Eigen {

/** \ingroup GPU_Module
* \class GpuLU
* \brief GPU LU decomposition with partial pivoting via cuSOLVER
*
* \tparam Scalar_ Element type: float, double, complex<float>, complex<double>
*
* Decomposes a square matrix A = P L U on the GPU and retains the factored
* matrix and pivot array in device memory. Solves A*X=B, A^T*X=B, or
* A^H*X=B by passing the appropriate TransposeMode.
*
* Each GpuLU object owns a dedicated CUDA stream and cuSOLVER handle.
*/
template <typename Scalar_>
class GpuLU {
public:
using Scalar = Scalar_;
using RealScalar = typename NumTraits<Scalar>::Real;
using PlainMatrix = Matrix<Scalar, Dynamic, Dynamic, ColMajor>;

/** Controls which system is solved in solve(). */
enum TransposeMode {
NoTranspose, ///< Solve A * X = B
Transpose, ///< Solve A^T * X = B
ConjugateTranspose ///< Solve A^H * X = B (same as Transpose for real types)
};
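
// Sketch: for complex scalars the three modes solve genuinely different
// systems (for real scalars Transpose and ConjugateTranspose coincide):
//   GpuLU<std::complex<double>> lu(A);
//   MatrixXcd x = lu.solve(b);  // A x = b
//   MatrixXcd y = lu.solve(b, GpuLU<std::complex<double>>::ConjugateTranspose);  // A^H y = b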

// ---- Construction / destruction ------------------------------------------

GpuLU() { init_context(); }

template <typename InputType>
explicit GpuLU(const EigenBase<InputType>& A) {
init_context();
compute(A);
}

~GpuLU() {
if (handle_) (void)cusolverDnDestroy(handle_);
if (stream_) (void)cudaStreamDestroy(stream_);
}

GpuLU(const GpuLU&) = delete;
GpuLU& operator=(const GpuLU&) = delete;

GpuLU(GpuLU&& o) noexcept
: stream_(o.stream_),
handle_(o.handle_),
params_(std::move(o.params_)),
d_lu_(std::move(o.d_lu_)),
lu_alloc_size_(o.lu_alloc_size_),
d_ipiv_(std::move(o.d_ipiv_)),
d_scratch_(std::move(o.d_scratch_)),
scratch_size_(o.scratch_size_),
h_workspace_(std::move(o.h_workspace_)),
n_(o.n_),
lda_(o.lda_),
info_(o.info_),
info_word_(o.info_word_),
info_synced_(o.info_synced_) {
o.stream_ = nullptr;
o.handle_ = nullptr;
o.lu_alloc_size_ = 0;
o.scratch_size_ = 0;
o.n_ = 0;
o.info_ = InvalidInput;
o.info_word_ = 0;
o.info_synced_ = true;
}

GpuLU& operator=(GpuLU&& o) noexcept {
if (this != &o) {
if (handle_) (void)cusolverDnDestroy(handle_);
if (stream_) (void)cudaStreamDestroy(stream_);
stream_ = o.stream_;
handle_ = o.handle_;
params_ = std::move(o.params_);
d_lu_ = std::move(o.d_lu_);
lu_alloc_size_ = o.lu_alloc_size_;
d_ipiv_ = std::move(o.d_ipiv_);
d_scratch_ = std::move(o.d_scratch_);
scratch_size_ = o.scratch_size_;
h_workspace_ = std::move(o.h_workspace_);
n_ = o.n_;
lda_ = o.lda_;
info_ = o.info_;
info_word_ = o.info_word_;
info_synced_ = o.info_synced_;
o.stream_ = nullptr;
o.handle_ = nullptr;
o.lu_alloc_size_ = 0;
o.scratch_size_ = 0;
o.n_ = 0;
o.info_ = InvalidInput;
o.info_word_ = 0;
o.info_synced_ = true;
}
return *this;
}

// ---- Factorization -------------------------------------------------------

/** Compute the LU factorization of A (host matrix, must be square). */
template <typename InputType>
GpuLU& compute(const EigenBase<InputType>& A) {
eigen_assert(A.rows() == A.cols() && "GpuLU requires a square matrix");
if (!begin_compute(A.rows())) return *this;

const PlainMatrix mat(A.derived());
lda_ = static_cast<int64_t>(mat.rows());
allocate_lu_storage();
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_lu_.ptr, mat.data(), matrixBytes(), cudaMemcpyHostToDevice, stream_));

factorize();
return *this;
}

/** Compute the LU factorization from a device-resident matrix (D2D copy). */
GpuLU& compute(const DeviceMatrix<Scalar>& d_A) {
eigen_assert(d_A.rows() == d_A.cols() && "GpuLU requires a square matrix");
if (!begin_compute(d_A.rows())) return *this;

lda_ = static_cast<int64_t>(d_A.rows());
d_A.waitReady(stream_);
allocate_lu_storage();
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_lu_.ptr, d_A.data(), matrixBytes(), cudaMemcpyDeviceToDevice, stream_));

factorize();
return *this;
}

/** Compute the LU factorization from a device matrix (move, no copy). */
GpuLU& compute(DeviceMatrix<Scalar>&& d_A) {
eigen_assert(d_A.rows() == d_A.cols() && "GpuLU requires a square matrix");
if (!begin_compute(d_A.rows())) return *this;

lda_ = static_cast<int64_t>(d_A.rows());
d_A.waitReady(stream_);
d_lu_ = internal::DeviceBuffer::adopt(static_cast<void*>(d_A.release()));

factorize();
return *this;
}

// ---- Solve ---------------------------------------------------------------

/** Solve op(A) * X = B using the cached LU factorization (host → host).
*
* \param B Right-hand side (n x nrhs host matrix).
* \param mode NoTranspose (default), Transpose, or ConjugateTranspose.
*/
template <typename Rhs>
PlainMatrix solve(const MatrixBase<Rhs>& B, TransposeMode mode = NoTranspose) const {
const_cast<GpuLU*>(this)->sync_info();
eigen_assert(info_ == Success && "GpuLU::solve called on a failed or uninitialized factorization");
eigen_assert(B.rows() == n_);
|
||||
|
||||
const PlainMatrix rhs(B);
|
||||
const int64_t nrhs = static_cast<int64_t>(rhs.cols());
|
||||
const int64_t ldb = static_cast<int64_t>(rhs.rows());
|
||||
DeviceMatrix<Scalar> d_X = solve_impl(nrhs, ldb, mode, [&](Scalar* d_x_ptr) {
|
||||
EIGEN_CUDA_RUNTIME_CHECK(
|
||||
cudaMemcpyAsync(d_x_ptr, rhs.data(), matrixBytes(nrhs, ldb), cudaMemcpyHostToDevice, stream_));
|
||||
});
|
||||
|
||||
PlainMatrix X(n_, B.cols());
|
||||
int solve_info = 0;
|
||||
EIGEN_CUDA_RUNTIME_CHECK(
|
||||
cudaMemcpyAsync(X.data(), d_X.data(), matrixBytes(nrhs, ldb), cudaMemcpyDeviceToHost, stream_));
|
||||
EIGEN_CUDA_RUNTIME_CHECK(
|
||||
cudaMemcpyAsync(&solve_info, scratch_info(), sizeof(int), cudaMemcpyDeviceToHost, stream_));
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
|
||||
|
||||
eigen_assert(solve_info == 0 && "cusolverDnXgetrs reported an error");
|
||||
return X;
|
||||
}
|
||||
|
||||
/** Solve op(A) * X = B with device-resident RHS. Fully async. */
|
||||
DeviceMatrix<Scalar> solve(const DeviceMatrix<Scalar>& d_B, TransposeMode mode = NoTranspose) const {
|
||||
eigen_assert(d_B.rows() == n_);
|
||||
d_B.waitReady(stream_);
|
||||
const int64_t nrhs = static_cast<int64_t>(d_B.cols());
|
||||
const int64_t ldb = static_cast<int64_t>(d_B.rows());
|
||||
return solve_impl(nrhs, ldb, mode, [&](Scalar* d_x_ptr) {
|
||||
EIGEN_CUDA_RUNTIME_CHECK(
|
||||
cudaMemcpyAsync(d_x_ptr, d_B.data(), matrixBytes(nrhs, ldb), cudaMemcpyDeviceToDevice, stream_));
|
||||
});
|
||||
}
|
||||
|
||||
// ---- Accessors -----------------------------------------------------------
|
||||
|
||||
/** Lazily synchronizes the stream on first call after compute(). */
|
||||
ComputationInfo info() const {
|
||||
const_cast<GpuLU*>(this)->sync_info();
|
||||
return info_;
|
||||
}
|
||||
Index rows() const { return n_; }
|
||||
Index cols() const { return n_; }
|
||||
cudaStream_t stream() const { return stream_; }
|
||||
|
||||
private:
|
||||
cudaStream_t stream_ = nullptr;
|
||||
cusolverDnHandle_t handle_ = nullptr;
|
||||
internal::CusolverParams params_; // cuSOLVER params (created once, reused)
|
||||
internal::DeviceBuffer d_lu_; // LU factors on device (grows, never shrinks)
|
||||
size_t lu_alloc_size_ = 0; // current d_lu_ allocation size
|
||||
internal::DeviceBuffer d_ipiv_; // pivot indices (int64_t) on device
|
||||
internal::DeviceBuffer d_scratch_; // combined workspace + info word (grows, never shrinks)
|
||||
size_t scratch_size_ = 0; // current scratch allocation size
|
||||
std::vector<char> h_workspace_; // host workspace (kept alive until next compute)
|
||||
Index n_ = 0;
|
||||
int64_t lda_ = 0;
|
||||
ComputationInfo info_ = InvalidInput;
|
||||
int info_word_ = 0; // host-side target for async info download
|
||||
bool info_synced_ = true; // has the stream been synced for info?
|
||||
|
||||
bool begin_compute(Index rows) {
|
||||
n_ = rows;
|
||||
info_ = InvalidInput;
|
||||
if (n_ == 0) {
|
||||
info_ = Success;
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
size_t matrixBytes() const { return matrixBytes(static_cast<int64_t>(n_), lda_); }
|
||||
|
||||
static size_t matrixBytes(int64_t cols, int64_t outer_stride) {
|
||||
return static_cast<size_t>(outer_stride) * static_cast<size_t>(cols) * sizeof(Scalar);
|
||||
}
|
||||
|
||||
void allocate_lu_storage() {
|
||||
size_t needed = matrixBytes();
|
||||
if (needed > lu_alloc_size_) {
|
||||
d_lu_ = internal::DeviceBuffer(needed);
|
||||
lu_alloc_size_ = needed;
|
||||
}
|
||||
}
|
||||
|
||||
  // Ensure d_scratch_ can hold `workspace_bytes` (rounded up to 16-byte
  // alignment) plus a trailing info word.
  // Layout: [workspace (workspace_bytes) | info_word (sizeof(int))].
  // Grows but never shrinks. Syncs the stream before reallocating to
  // avoid freeing memory that async kernels may still be using.
  void ensure_scratch(size_t workspace_bytes) {
    constexpr size_t kAlign = 16;
    workspace_bytes = (workspace_bytes + kAlign - 1) & ~(kAlign - 1);
    size_t needed = workspace_bytes + sizeof(int);
    if (needed > scratch_size_) {
      if (d_scratch_.ptr) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
      d_scratch_ = internal::DeviceBuffer(needed);
      scratch_size_ = needed;
    }
  }

  void* scratch_workspace() const { return d_scratch_.ptr; }
  int* scratch_info() const {
    return reinterpret_cast<int*>(static_cast<char*>(d_scratch_.ptr) + scratch_size_ - sizeof(int));
  }

  template <typename CopyRhs>
  DeviceMatrix<Scalar> solve_impl(int64_t nrhs, int64_t ldb, TransposeMode mode, CopyRhs&& copy_rhs) const {
    constexpr cudaDataType_t dtype = internal::cusolver_data_type<Scalar>::value;
    const cublasOperation_t trans = to_cublas_op(mode);

    Scalar* d_x_ptr = nullptr;
    EIGEN_CUDA_RUNTIME_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_x_ptr), matrixBytes(nrhs, ldb)));
    copy_rhs(d_x_ptr);

    EIGEN_CUSOLVER_CHECK(cusolverDnXgetrs(handle_, params_.p, trans, static_cast<int64_t>(n_), nrhs, dtype, d_lu_.ptr,
                                          lda_, static_cast<const int64_t*>(d_ipiv_.ptr), dtype, d_x_ptr, ldb,
                                          scratch_info()));

    DeviceMatrix<Scalar> result(d_x_ptr, n_, static_cast<Index>(nrhs));
    result.recordReady(stream_);
    return result;
  }

  void init_context() {
    EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_));
    EIGEN_CUSOLVER_CHECK(cusolverDnCreate(&handle_));
    EIGEN_CUSOLVER_CHECK(cusolverDnSetStream(handle_, stream_));
    ensure_scratch(0);  // allocate at least the info word
  }

  void sync_info() {
    if (!info_synced_) {
      EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
      info_ = (info_word_ == 0) ? Success : NumericalIssue;
      info_synced_ = true;
    }
  }

  // Run cusolverDnXgetrf on d_lu_ (already on device). Allocates d_ipiv_.
  // Enqueues factorization + async info download. Does NOT sync.
  // Workspaces are stored as members to ensure they outlive the async kernels.
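  // The asynchrony is visible to the caller; a minimal sketch, using only the
  // public API above:
  //   lu.compute(A);               // returns as soon as the work is enqueued
  //   /* ... overlap unrelated host work ... */
  //   if (lu.info() == Success) {  // first info() call does the one-time sync
  //     x = lu.solve(b);
  //   }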
  void factorize() {
    constexpr cudaDataType_t dtype = internal::cusolver_data_type<Scalar>::value;
    const size_t ipiv_bytes = static_cast<size_t>(n_) * sizeof(int64_t);

    info_synced_ = false;
    info_ = InvalidInput;

    d_ipiv_ = internal::DeviceBuffer(ipiv_bytes);

    size_t dev_ws_bytes = 0, host_ws_bytes = 0;
    EIGEN_CUSOLVER_CHECK(cusolverDnXgetrf_bufferSize(handle_, params_.p, static_cast<int64_t>(n_),
                                                     static_cast<int64_t>(n_), dtype, d_lu_.ptr, lda_, dtype,
                                                     &dev_ws_bytes, &host_ws_bytes));

    ensure_scratch(dev_ws_bytes);
    h_workspace_.resize(host_ws_bytes);

    EIGEN_CUSOLVER_CHECK(
        cusolverDnXgetrf(handle_, params_.p, static_cast<int64_t>(n_), static_cast<int64_t>(n_), dtype, d_lu_.ptr, lda_,
                         static_cast<int64_t*>(d_ipiv_.ptr), dtype, scratch_workspace(), dev_ws_bytes,
                         host_ws_bytes > 0 ? h_workspace_.data() : nullptr, host_ws_bytes, scratch_info()));

    EIGEN_CUDA_RUNTIME_CHECK(
        cudaMemcpyAsync(&info_word_, scratch_info(), sizeof(int), cudaMemcpyDeviceToHost, stream_));
  }

  static cublasOperation_t to_cublas_op(TransposeMode mode) {
    switch (mode) {
      case Transpose:
        return CUBLAS_OP_T;
      case ConjugateTranspose:
        return CUBLAS_OP_C;
      default:
        return CUBLAS_OP_N;
    }
  }
};

}  // namespace Eigen

#endif  // EIGEN_GPU_LU_H
389
Eigen/src/GPU/GpuQR.h
Normal file
@@ -0,0 +1,389 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

// GPU QR decomposition using cuSOLVER.
//
// Wraps cusolverDnXgeqrf (factorization), cusolverDnXormqr (apply Q),
// cusolverDnXorgqr (form Q), and cublasXtrsm (triangular solve on R).
//
// The factored matrix (reflectors + R) and tau stay in device memory.
// Solve uses ormqr + trsm without forming Q explicitly.
//
// Usage:
//   GpuQR<double> qr(A);         // upload A, geqrf
//   if (qr.info() != Success) { ... }
//   MatrixXd X = qr.solve(B);    // Q^H * B via ormqr, then trsm on R
//
// Expression syntax:
//   d_X = d_A.qr().solve(d_B);   // temporary, no caching
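//
// Least-squares sketch (illustrative only; assumes a host MatrixXd A with
// m >= n and a conforming right-hand side B):
//
//   GpuQR<double> qr(A);
//   if (qr.info() == Success) {
//     MatrixXd X = qr.solve(B);  // n×nrhs minimizer of ||A*X - B||_F
//   }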

#ifndef EIGEN_GPU_QR_H
#define EIGEN_GPU_QR_H

// IWYU pragma: private
#include "./InternalHeaderCheck.h"

#include "./CuSolverSupport.h"
#include "./CuBlasSupport.h"
#include <vector>

namespace Eigen {

template <typename Scalar_>
class GpuQR {
 public:
  using Scalar = Scalar_;
  using RealScalar = typename NumTraits<Scalar>::Real;
  using PlainMatrix = Matrix<Scalar, Dynamic, Dynamic, ColMajor>;

  GpuQR() { init_context(); }

  template <typename InputType>
  explicit GpuQR(const EigenBase<InputType>& A) {
    init_context();
    compute(A);
  }

  ~GpuQR() {
    if (handle_) (void)cusolverDnDestroy(handle_);
    if (cublas_) (void)cublasDestroy(cublas_);
    if (stream_) (void)cudaStreamDestroy(stream_);
  }

  GpuQR(const GpuQR&) = delete;
  GpuQR& operator=(const GpuQR&) = delete;

  GpuQR(GpuQR&& o) noexcept
      : stream_(o.stream_),
        handle_(o.handle_),
        cublas_(o.cublas_),
        params_(std::move(o.params_)),
        d_qr_(std::move(o.d_qr_)),
        d_tau_(std::move(o.d_tau_)),
        d_scratch_(std::move(o.d_scratch_)),
        scratch_size_(o.scratch_size_),
        h_workspace_(std::move(o.h_workspace_)),
        m_(o.m_),
        n_(o.n_),
        lda_(o.lda_),
        info_(o.info_),
        info_word_(o.info_word_),
        info_synced_(o.info_synced_) {
    o.stream_ = nullptr;
    o.handle_ = nullptr;
    o.cublas_ = nullptr;
    o.scratch_size_ = 0;
    o.m_ = 0;
    o.n_ = 0;
    o.lda_ = 0;
    o.info_ = InvalidInput;
    o.info_word_ = 0;
    o.info_synced_ = true;
  }

  GpuQR& operator=(GpuQR&& o) noexcept {
    if (this != &o) {
      if (handle_) (void)cusolverDnDestroy(handle_);
      if (cublas_) (void)cublasDestroy(cublas_);
      if (stream_) (void)cudaStreamDestroy(stream_);
      stream_ = o.stream_;
      handle_ = o.handle_;
      cublas_ = o.cublas_;
      params_ = std::move(o.params_);
      d_qr_ = std::move(o.d_qr_);
      d_tau_ = std::move(o.d_tau_);
      d_scratch_ = std::move(o.d_scratch_);
      scratch_size_ = o.scratch_size_;
      h_workspace_ = std::move(o.h_workspace_);
      m_ = o.m_;
      n_ = o.n_;
      lda_ = o.lda_;
      info_ = o.info_;
      info_word_ = o.info_word_;
      info_synced_ = o.info_synced_;
      o.stream_ = nullptr;
      o.handle_ = nullptr;
      o.cublas_ = nullptr;
      o.scratch_size_ = 0;
      o.m_ = 0;
      o.n_ = 0;
      o.lda_ = 0;
      o.info_ = InvalidInput;
      o.info_word_ = 0;
      o.info_synced_ = true;
    }
    return *this;
  }

  // ---- Factorization -------------------------------------------------------

  template <typename InputType>
  GpuQR& compute(const EigenBase<InputType>& A) {
    m_ = A.rows();
    n_ = A.cols();
    info_ = InvalidInput;
    info_synced_ = false;

    if (m_ == 0 || n_ == 0) {
      info_ = Success;
      info_synced_ = true;
      return *this;
    }

    const PlainMatrix mat(A.derived());
    lda_ = static_cast<int64_t>(mat.rows());
    const size_t mat_bytes = static_cast<size_t>(lda_) * static_cast<size_t>(n_) * sizeof(Scalar);
    const size_t tau_bytes = static_cast<size_t>((std::min)(m_, n_)) * sizeof(Scalar);

    d_qr_ = internal::DeviceBuffer(mat_bytes);
    d_tau_ = internal::DeviceBuffer(tau_bytes);

    EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_qr_.ptr, mat.data(), mat_bytes, cudaMemcpyHostToDevice, stream_));

    factorize();
    return *this;
  }

  GpuQR& compute(const DeviceMatrix<Scalar>& d_A) {
    m_ = d_A.rows();
    n_ = d_A.cols();
    info_ = InvalidInput;
    info_synced_ = false;

    if (m_ == 0 || n_ == 0) {
      info_ = Success;
      info_synced_ = true;
      return *this;
    }

    lda_ = static_cast<int64_t>(d_A.rows());
    const size_t mat_bytes = static_cast<size_t>(lda_) * static_cast<size_t>(n_) * sizeof(Scalar);
    const size_t tau_bytes = static_cast<size_t>((std::min)(m_, n_)) * sizeof(Scalar);

    d_A.waitReady(stream_);
    d_qr_ = internal::DeviceBuffer(mat_bytes);
    d_tau_ = internal::DeviceBuffer(tau_bytes);
    EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_qr_.ptr, d_A.data(), mat_bytes, cudaMemcpyDeviceToDevice, stream_));

    factorize();
    return *this;
  }

  // ---- Solve ---------------------------------------------------------------

  /** Solve A * X = B via QR: X = R^{-1} * Q^H * B (least-squares for m >= n).
   * Uses ormqr (apply Q^H) + trsm (solve R), without forming Q explicitly.
   * Requires m >= n (overdetermined or square). Underdetermined not supported.
   *
   * TODO: Add device-side accessor for the R factor (and Q application) as
   * DeviceMatrix, so users can chain GPU operations without host round-trips. */
  template <typename Rhs>
  PlainMatrix solve(const MatrixBase<Rhs>& B) const {
    sync_info();
    eigen_assert(info_ == Success && "GpuQR::solve called on a failed or uninitialized factorization");
    eigen_assert(B.rows() == m_);
    eigen_assert(m_ >= n_ && "GpuQR::solve requires m >= n (use SVD for underdetermined systems)");

    const PlainMatrix rhs(B);
    const int64_t nrhs = static_cast<int64_t>(rhs.cols());
    const int64_t ldb = static_cast<int64_t>(rhs.rows());  // = m_
    const size_t b_bytes = static_cast<size_t>(ldb) * static_cast<size_t>(nrhs) * sizeof(Scalar);

    // Upload B to device (m × nrhs buffer).
    internal::DeviceBuffer d_B(b_bytes);
    EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_B.ptr, rhs.data(), b_bytes, cudaMemcpyHostToDevice, stream_));

    // Apply Q^H to B in-place: afterwards the first n rows of d_B hold the
    // part of Q^H * B needed for the triangular solve.
    apply_QH(d_B.ptr, ldb, nrhs);

    // Solve R * X = (Q^H * B)[0:n,:] via trsm on the first n rows.
    Scalar alpha(1);
    EIGEN_CUBLAS_CHECK(internal::cublasXtrsm(cublas_, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N,
                                             CUBLAS_DIAG_NON_UNIT, static_cast<int>(n_), static_cast<int>(nrhs), &alpha,
                                             static_cast<const Scalar*>(d_qr_.ptr), static_cast<int>(lda_),
                                             static_cast<Scalar*>(d_B.ptr), static_cast<int>(ldb)));

    // Download the first n rows of each column (stride = ldb = m, width = n).
    PlainMatrix X(n_, rhs.cols());
    if (m_ == n_) {
      // Square: dense copy, no stride mismatch.
      EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(X.data(), d_B.ptr,
                                               static_cast<size_t>(n_) * static_cast<size_t>(nrhs) * sizeof(Scalar),
                                               cudaMemcpyDeviceToHost, stream_));
    } else {
      // Overdetermined: 2D copy to extract first n rows from each column.
      EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy2DAsync(
          X.data(), static_cast<size_t>(n_) * sizeof(Scalar), d_B.ptr, static_cast<size_t>(ldb) * sizeof(Scalar),
          static_cast<size_t>(n_) * sizeof(Scalar), static_cast<size_t>(nrhs), cudaMemcpyDeviceToHost, stream_));
    }
    EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
    return X;
  }

  /** Solve with device-resident RHS. Returns n × nrhs DeviceMatrix. */
  DeviceMatrix<Scalar> solve(const DeviceMatrix<Scalar>& d_B) const {
    sync_info();
    eigen_assert(info_ == Success && "GpuQR::solve called on a failed or uninitialized factorization");
    eigen_assert(d_B.rows() == m_);
    eigen_assert(m_ >= n_ && "GpuQR::solve requires m >= n (use SVD for underdetermined systems)");
    d_B.waitReady(stream_);

    const int64_t nrhs = static_cast<int64_t>(d_B.cols());
    const int64_t ldb = static_cast<int64_t>(d_B.rows());  // = m_
    const size_t b_bytes = static_cast<size_t>(ldb) * static_cast<size_t>(nrhs) * sizeof(Scalar);

    // D2D copy B into working buffer (ormqr and trsm are in-place).
    internal::DeviceBuffer d_work(b_bytes);
    EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_work.ptr, d_B.data(), b_bytes, cudaMemcpyDeviceToDevice, stream_));

    apply_QH(d_work.ptr, ldb, nrhs);

    // trsm on the first n rows.
    Scalar alpha(1);
    EIGEN_CUBLAS_CHECK(internal::cublasXtrsm(cublas_, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N,
                                             CUBLAS_DIAG_NON_UNIT, static_cast<int>(n_), static_cast<int>(nrhs), &alpha,
                                             static_cast<const Scalar*>(d_qr_.ptr), static_cast<int>(lda_),
                                             static_cast<Scalar*>(d_work.ptr), static_cast<int>(ldb)));

    if (m_ == n_) {
      // Square: result is the whole buffer, dense.
      DeviceMatrix<Scalar> result(static_cast<Scalar*>(d_work.ptr), n_, static_cast<Index>(nrhs));
      d_work.ptr = nullptr;  // transfer ownership
      result.recordReady(stream_);
      return result;
    } else {
      // Overdetermined: copy first n rows of each column into a dense n × nrhs result.
      DeviceMatrix<Scalar> result(n_, static_cast<Index>(nrhs));
      EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy2DAsync(result.data(), static_cast<size_t>(n_) * sizeof(Scalar), d_work.ptr,
                                                 static_cast<size_t>(ldb) * sizeof(Scalar),
                                                 static_cast<size_t>(n_) * sizeof(Scalar), static_cast<size_t>(nrhs),
                                                 cudaMemcpyDeviceToDevice, stream_));
      result.recordReady(stream_);
      return result;
      // d_work freed here via RAII; safe because the stream is ordered.
    }
  }

  // ---- Accessors -----------------------------------------------------------

  ComputationInfo info() const {
    sync_info();
    return info_;
  }

  Index rows() const { return m_; }
  Index cols() const { return n_; }
  cudaStream_t stream() const { return stream_; }

 private:
  cudaStream_t stream_ = nullptr;
  cusolverDnHandle_t handle_ = nullptr;
  cublasHandle_t cublas_ = nullptr;
  internal::CusolverParams params_;
  internal::DeviceBuffer d_qr_;       // QR factors (reflectors in lower, R in upper)
  internal::DeviceBuffer d_tau_;      // Householder scalars (min(m,n))
  internal::DeviceBuffer d_scratch_;  // workspace + info word
  size_t scratch_size_ = 0;
  std::vector<char> h_workspace_;
  Index m_ = 0;
  Index n_ = 0;
  int64_t lda_ = 0;
  ComputationInfo info_ = InvalidInput;
  int info_word_ = 0;
  bool info_synced_ = true;

  void init_context() {
    EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_));
    EIGEN_CUSOLVER_CHECK(cusolverDnCreate(&handle_));
    EIGEN_CUSOLVER_CHECK(cusolverDnSetStream(handle_, stream_));
    EIGEN_CUBLAS_CHECK(cublasCreate(&cublas_));
    EIGEN_CUBLAS_CHECK(cublasSetStream(cublas_, stream_));
    ensure_scratch(0);
  }

  void ensure_scratch(size_t workspace_bytes) {
    constexpr size_t kAlign = 16;
    workspace_bytes = (workspace_bytes + kAlign - 1) & ~(kAlign - 1);
    size_t needed = workspace_bytes + sizeof(int);
    if (needed > scratch_size_) {
      if (d_scratch_.ptr) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
      d_scratch_ = internal::DeviceBuffer(needed);
      scratch_size_ = needed;
    }
  }

  void* scratch_workspace() const { return d_scratch_.ptr; }
  int* scratch_info() const {
    return reinterpret_cast<int*>(static_cast<char*>(d_scratch_.ptr) + scratch_size_ - sizeof(int));
  }

  void sync_info() const {
    if (!info_synced_) {
      EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
      const_cast<GpuQR*>(this)->info_ = (info_word_ == 0) ? Success : NumericalIssue;
      const_cast<GpuQR*>(this)->info_synced_ = true;
    }
  }

  void factorize() {
    constexpr cudaDataType_t dtype = internal::cusolver_data_type<Scalar>::value;

    info_synced_ = false;
    info_ = InvalidInput;

    size_t dev_ws = 0, host_ws = 0;
    EIGEN_CUSOLVER_CHECK(cusolverDnXgeqrf_bufferSize(handle_, params_.p, static_cast<int64_t>(m_),
                                                     static_cast<int64_t>(n_), dtype, d_qr_.ptr, lda_, dtype,
                                                     d_tau_.ptr, dtype, &dev_ws, &host_ws));

    ensure_scratch(dev_ws);
    h_workspace_.resize(host_ws);

    EIGEN_CUSOLVER_CHECK(cusolverDnXgeqrf(handle_, params_.p, static_cast<int64_t>(m_), static_cast<int64_t>(n_), dtype,
                                          d_qr_.ptr, lda_, dtype, d_tau_.ptr, dtype, scratch_workspace(), dev_ws,
                                          host_ws > 0 ? h_workspace_.data() : nullptr, host_ws, scratch_info()));

    EIGEN_CUDA_RUNTIME_CHECK(
        cudaMemcpyAsync(&info_word_, scratch_info(), sizeof(int), cudaMemcpyDeviceToHost, stream_));
  }

  // Apply Q^H to a device buffer in-place: d_B = Q^H * d_B.
  // Uses type-specific ormqr (real) or unmqr (complex) wrappers from CuSolverSupport.h.
  // For real types: Q^H = Q^T, use CUBLAS_OP_T. For complex: use CUBLAS_OP_C.
  void apply_QH(void* d_B, int64_t ldb, int64_t nrhs) const {
    const int im = static_cast<int>(m_);
    const int in = static_cast<int>(nrhs);
    const int ik = static_cast<int>((std::min)(m_, n_));
    const int ilda = static_cast<int>(lda_);
    const int ildb = static_cast<int>(ldb);
    constexpr cublasOperation_t trans = NumTraits<Scalar>::IsComplex ? CUBLAS_OP_C : CUBLAS_OP_T;

    int lwork = 0;
    EIGEN_CUSOLVER_CHECK(internal::cusolverDnXormqr_bufferSize(
        handle_, CUBLAS_SIDE_LEFT, trans, im, in, ik, static_cast<const Scalar*>(d_qr_.ptr), ilda,
        static_cast<const Scalar*>(d_tau_.ptr), static_cast<const Scalar*>(d_B), ildb, &lwork));

    internal::DeviceBuffer d_work(static_cast<size_t>(lwork) * sizeof(Scalar));

    EIGEN_CUSOLVER_CHECK(internal::cusolverDnXormqr(handle_, CUBLAS_SIDE_LEFT, trans, im, in, ik,
                                                    static_cast<const Scalar*>(d_qr_.ptr), ilda,
                                                    static_cast<const Scalar*>(d_tau_.ptr), static_cast<Scalar*>(d_B),
                                                    ildb, static_cast<Scalar*>(d_work.ptr), lwork, scratch_info()));

    // Sync to ensure workspace can be freed safely, and check ormqr info.
    EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
    int ormqr_info = 0;
    EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy(&ormqr_info, scratch_info(), sizeof(int), cudaMemcpyDeviceToHost));
    eigen_assert(ormqr_info == 0 && "cusolverDnXormqr reported an error");
  }
};

}  // namespace Eigen

#endif  // EIGEN_GPU_QR_H
490
Eigen/src/GPU/GpuSVD.h
Normal file
@@ -0,0 +1,490 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

// GPU SVD using cuSOLVER (divide-and-conquer).
//
// Wraps cusolverDnXgesvd. Stores U, S, VT on device. Solve uses
// cuBLAS GEMM: X = VT^H * diag(D) * U^H * B.
//
// cuSOLVER returns VT (not V). We store and expose VT directly.
//
// Usage:
//   GpuSVD<double> svd(A, ComputeThinU | ComputeThinV);
//   VectorXd S = svd.singularValues();
//   MatrixXd U = svd.matrixU();      // m×k or m×m
//   MatrixXd VT = svd.matrixVT();    // k×n or n×n (this is V^T)
//   MatrixXd X = svd.solve(B);       // pseudoinverse
//   MatrixXd X = svd.solve(B, k);    // truncated (top k triplets)
//   MatrixXd X = svd.solve(B, 0.1);  // Tikhonov regularized
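//
// The Tikhonov overload applies D_ii = S_i / (S_i^2 + lambda^2), which is the
// SVD form of the ridge solution argmin_x ||A*x - b||^2 + lambda^2 * ||x||^2.
// Illustrative sketch (assumes a host MatrixXd A and VectorXd b):
//
//   GpuSVD<double> svd(A, ComputeThinU | ComputeThinV);
//   VectorXd x_ridge = svd.solve(b, 0.5);   // lambda = 0.5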

#ifndef EIGEN_GPU_SVD_H
#define EIGEN_GPU_SVD_H

// IWYU pragma: private
#include "./InternalHeaderCheck.h"

#include "./CuSolverSupport.h"
#include "./CuBlasSupport.h"
#include <vector>

namespace Eigen {

template <typename Scalar_>
class GpuSVD {
 public:
  using Scalar = Scalar_;
  using RealScalar = typename NumTraits<Scalar>::Real;
  using PlainMatrix = Matrix<Scalar, Dynamic, Dynamic, ColMajor>;
  using RealVector = Matrix<RealScalar, Dynamic, 1>;

  GpuSVD() { init_context(); }

  template <typename InputType>
  explicit GpuSVD(const EigenBase<InputType>& A, unsigned int options = ComputeThinU | ComputeThinV) {
    init_context();
    compute(A, options);
  }

  ~GpuSVD() {
    if (handle_) (void)cusolverDnDestroy(handle_);
    if (cublas_) (void)cublasDestroy(cublas_);
    if (stream_) (void)cudaStreamDestroy(stream_);
  }

  GpuSVD(const GpuSVD&) = delete;
  GpuSVD& operator=(const GpuSVD&) = delete;
  // Move construction/assignment omitted for brevity; follow the GpuQR pattern.

  // ---- Factorization -------------------------------------------------------

  template <typename InputType>
  GpuSVD& compute(const EigenBase<InputType>& A, unsigned int options = ComputeThinU | ComputeThinV) {
    options_ = options;
    m_ = A.rows();
    n_ = A.cols();
    info_ = InvalidInput;
    info_synced_ = false;

    if (m_ == 0 || n_ == 0) {
      info_ = Success;
      info_synced_ = true;
      return *this;
    }

    // cuSOLVER gesvd requires m >= n. For wide matrices, transpose internally.
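    // (If A^H = U' S V'^H, then A = V' S U'^H: the singular values are shared
    // and U and V simply swap roles; see swap_uv_options() below.)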
    transposed_ = (m_ < n_);
    const PlainMatrix mat = transposed_ ? PlainMatrix(A.derived().adjoint()) : PlainMatrix(A.derived());
    if (transposed_) std::swap(m_, n_);

    lda_ = static_cast<int64_t>(mat.rows());
    const size_t mat_bytes = static_cast<size_t>(lda_) * static_cast<size_t>(n_) * sizeof(Scalar);

    // Copy (possibly transposed) A to device (gesvd overwrites it).
    d_A_ = internal::DeviceBuffer(mat_bytes);
    EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_A_.ptr, mat.data(), mat_bytes, cudaMemcpyHostToDevice, stream_));

    factorize();
    return *this;
  }

  GpuSVD& compute(const DeviceMatrix<Scalar>& d_A, unsigned int options = ComputeThinU | ComputeThinV) {
    options_ = options;
    m_ = d_A.rows();
    n_ = d_A.cols();
    info_ = InvalidInput;
    info_synced_ = false;

    if (m_ == 0 || n_ == 0) {
      info_ = Success;
      info_synced_ = true;
      return *this;
    }

    transposed_ = (m_ < n_);
    d_A.waitReady(stream_);

    if (transposed_) {
      // Transpose on device via cuBLAS geam: d_A_ = A^H.
      std::swap(m_, n_);
      lda_ = m_;
      const size_t mat_bytes = static_cast<size_t>(lda_) * static_cast<size_t>(n_) * sizeof(Scalar);
      d_A_ = internal::DeviceBuffer(mat_bytes);
      Scalar alpha_one(1), beta_zero(0);
      // geam: C(m×n) = alpha * op(A) + beta * op(B). Use B = nullptr trick: beta=0.
      // A is the original d_A (n_orig × m_orig = n × m after swap), transposed → m × n.
      EIGEN_CUBLAS_CHECK(internal::cublasXgeam(
          cublas_, CUBLAS_OP_C, CUBLAS_OP_N, static_cast<int>(m_), static_cast<int>(n_), &alpha_one, d_A.data(),
          static_cast<int>(d_A.rows()), &beta_zero, static_cast<const Scalar*>(nullptr), static_cast<int>(m_),
          static_cast<Scalar*>(d_A_.ptr), static_cast<int>(m_)));
    } else {
      lda_ = static_cast<int64_t>(d_A.rows());
      const size_t mat_bytes = static_cast<size_t>(lda_) * static_cast<size_t>(n_) * sizeof(Scalar);
      d_A_ = internal::DeviceBuffer(mat_bytes);
      EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_A_.ptr, d_A.data(), mat_bytes, cudaMemcpyDeviceToDevice, stream_));
    }

    factorize();
    return *this;
  }

  // ---- Accessors -----------------------------------------------------------

  ComputationInfo info() const {
    sync_info();
    return info_;
  }

  Index rows() const { return transposed_ ? n_ : m_; }
  Index cols() const { return transposed_ ? m_ : n_; }

  // TODO: Add device-side accessors (deviceU(), deviceVT(), deviceSingularValues())
  // returning DeviceMatrix views of the internal buffers, so users can chain
  // GPU operations without round-tripping through host memory.

  /** Singular values (always available). Downloads from device on each call. */
  RealVector singularValues() const {
    sync_info();
    eigen_assert(info_ == Success);
    const Index k = (std::min)(m_, n_);
    RealVector S(k);
    EIGEN_CUDA_RUNTIME_CHECK(
        cudaMemcpy(S.data(), d_S_.ptr, static_cast<size_t>(k) * sizeof(RealScalar), cudaMemcpyDeviceToHost));
    return S;
  }

  /** Left singular vectors U. Returns m_orig × k or m_orig × m_orig.
   * For transposed case (m_orig < n_orig), U comes from cuSOLVER's VT. */
  PlainMatrix matrixU() const {
    sync_info();
    eigen_assert(info_ == Success);
    eigen_assert((options_ & (ComputeThinU | ComputeFullU)) && "matrixU() requires ComputeThinU or ComputeFullU");
    const Index m_orig = transposed_ ? n_ : m_;
    const Index n_orig = transposed_ ? m_ : n_;
    const Index k = (std::min)(m_orig, n_orig);
    if (!transposed_) {
      const Index ucols = (options_ & ComputeFullU) ? m_ : k;
      PlainMatrix U(m_, ucols);
      EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy(U.data(), d_U_.ptr,
                                          static_cast<size_t>(m_) * static_cast<size_t>(ucols) * sizeof(Scalar),
                                          cudaMemcpyDeviceToHost));
      return U;
    } else {
      // Transposed: U_orig = VT_stored^H. VT_stored is vtrows × n_ (= vtrows × m_orig).
      const Index vtrows = (options_ & ComputeFullU) ? m_orig : k;  // Note: FullU maps to FullV of A^H
      PlainMatrix VT_stored(vtrows, n_);
      EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy(VT_stored.data(), d_VT_.ptr,
                                          static_cast<size_t>(vtrows) * static_cast<size_t>(n_) * sizeof(Scalar),
                                          cudaMemcpyDeviceToHost));
      return VT_stored.adjoint();  // m_orig × vtrows
    }
  }

  /** Right singular vectors transposed V^T. Returns k × n_orig or n_orig × n_orig.
   * For transposed case, VT comes from cuSOLVER's U. */
  PlainMatrix matrixVT() const {
    sync_info();
    eigen_assert(info_ == Success);
    eigen_assert((options_ & (ComputeThinV | ComputeFullV)) && "matrixVT() requires ComputeThinV or ComputeFullV");
    const Index m_orig = transposed_ ? n_ : m_;
    const Index n_orig = transposed_ ? m_ : n_;
    const Index k = (std::min)(m_orig, n_orig);
    if (!transposed_) {
      const Index vtrows = (options_ & ComputeFullV) ? n_ : k;
      PlainMatrix VT(vtrows, n_);
      EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy(VT.data(), d_VT_.ptr,
                                          static_cast<size_t>(vtrows) * static_cast<size_t>(n_) * sizeof(Scalar),
                                          cudaMemcpyDeviceToHost));
      return VT;
    } else {
      // Transposed: VT_orig = U_stored^H. U_stored is m_ × ucols (= n_orig × ucols).
      const Index ucols = (options_ & ComputeFullV) ? n_orig : k;  // FullV maps to FullU of A^H
      PlainMatrix U_stored(m_, ucols);
      EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy(U_stored.data(), d_U_.ptr,
                                          static_cast<size_t>(m_) * static_cast<size_t>(ucols) * sizeof(Scalar),
                                          cudaMemcpyDeviceToHost));
      return U_stored.adjoint();  // ucols × n_orig
    }
  }

  /** Number of singular values above threshold. */
  Index rank(RealScalar threshold = RealScalar(-1)) const {
    RealVector S = singularValues();
    if (S.size() == 0) return 0;
    if (threshold < 0) {
      threshold = (std::max)(m_, n_) * S(0) * NumTraits<RealScalar>::epsilon();
    }
    Index r = 0;
    for (Index i = 0; i < S.size(); ++i) {
      if (S(i) > threshold) ++r;
    }
    return r;
  }

  // ---- Solve ---------------------------------------------------------------

  /** Pseudoinverse solve: X = V * diag(1/S) * U^H * B. */
  template <typename Rhs>
  PlainMatrix solve(const MatrixBase<Rhs>& B) const {
    return solve_impl(B, (std::min)(m_, n_), RealScalar(0));
  }

  /** Truncated solve: use only top trunc singular triplets. */
  template <typename Rhs>
  PlainMatrix solve(const MatrixBase<Rhs>& B, Index trunc) const {
    eigen_assert(trunc > 0 && trunc <= (std::min)(m_, n_));
    return solve_impl(B, trunc, RealScalar(0));
  }

  /** Tikhonov-regularized solve: D_ii = S_i / (S_i^2 + lambda^2). */
  template <typename Rhs>
  PlainMatrix solve(const MatrixBase<Rhs>& B, RealScalar lambda) const {
    eigen_assert(lambda > 0);
    return solve_impl(B, (std::min)(m_, n_), lambda);
  }

  cudaStream_t stream() const { return stream_; }

 private:
  cudaStream_t stream_ = nullptr;
  cusolverDnHandle_t handle_ = nullptr;
  cublasHandle_t cublas_ = nullptr;
  internal::CusolverParams params_;
  internal::DeviceBuffer d_A_;        // working copy of A (overwritten by gesvd)
  internal::DeviceBuffer d_U_;        // left singular vectors
  internal::DeviceBuffer d_S_;        // singular values (RealScalar)
  internal::DeviceBuffer d_VT_;       // right singular vectors transposed
  internal::DeviceBuffer d_scratch_;  // workspace + info
  size_t scratch_size_ = 0;
  std::vector<char> h_workspace_;
  unsigned int options_ = 0;
  Index m_ = 0;
  Index n_ = 0;
  int64_t lda_ = 0;
  bool transposed_ = false;  // true if m < n (we compute the SVD of A^H internally)
  ComputationInfo info_ = InvalidInput;
  int info_word_ = 0;
  bool info_synced_ = true;

  void init_context() {
    EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_));
    EIGEN_CUSOLVER_CHECK(cusolverDnCreate(&handle_));
    EIGEN_CUSOLVER_CHECK(cusolverDnSetStream(handle_, stream_));
    EIGEN_CUBLAS_CHECK(cublasCreate(&cublas_));
    EIGEN_CUBLAS_CHECK(cublasSetStream(cublas_, stream_));
    ensure_scratch(0);
  }

  void ensure_scratch(size_t workspace_bytes) {
    constexpr size_t kAlign = 16;
    workspace_bytes = (workspace_bytes + kAlign - 1) & ~(kAlign - 1);
    size_t needed = workspace_bytes + sizeof(int);
    if (needed > scratch_size_) {
      if (d_scratch_.ptr) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
      d_scratch_ = internal::DeviceBuffer(needed);
      scratch_size_ = needed;
    }
  }

  void* scratch_workspace() const { return d_scratch_.ptr; }
  int* scratch_info() const {
    return reinterpret_cast<int*>(static_cast<char*>(d_scratch_.ptr) + scratch_size_ - sizeof(int));
  }

  void sync_info() const {
    if (!info_synced_) {
      EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
      const_cast<GpuSVD*>(this)->info_ = (info_word_ == 0) ? Success : NumericalIssue;
      const_cast<GpuSVD*>(this)->info_synced_ = true;
    }
  }

  // Swap U↔V flags for the transposed case.
  static unsigned int swap_uv_options(unsigned int opts) {
    unsigned int result = 0;
    if (opts & ComputeThinU) result |= ComputeThinV;
    if (opts & ComputeFullU) result |= ComputeFullV;
    if (opts & ComputeThinV) result |= ComputeThinU;
    if (opts & ComputeFullV) result |= ComputeFullU;
    return result;
  }

  static signed char jobu(unsigned int opts) {
    if (opts & ComputeFullU) return 'A';
    if (opts & ComputeThinU) return 'S';
    return 'N';
  }

  static signed char jobvt(unsigned int opts) {
    if (opts & ComputeFullV) return 'A';
    if (opts & ComputeThinV) return 'S';
    return 'N';
  }

  void factorize() {
    constexpr cudaDataType_t dtype = internal::cusolver_data_type<Scalar>::value;
    constexpr cudaDataType_t rtype = internal::cuda_data_type<RealScalar>::value;
    const Index k = (std::min)(m_, n_);

    info_synced_ = false;
    info_ = InvalidInput;

    // Allocate output buffers. When transposed, swap U/V roles for cuSOLVER.
    d_S_ = internal::DeviceBuffer(static_cast<size_t>(k) * sizeof(RealScalar));

    // Internal options: in the transposed case, what the user wants as U is computed as the VT of A^H.
    const unsigned int int_opts = transposed_ ? swap_uv_options(options_) : options_;

    const Index ucols = (int_opts & ComputeFullU) ? m_ : ((int_opts & ComputeThinU) ? k : 0);
    const Index vtrows = (int_opts & ComputeFullV) ? n_ : ((int_opts & ComputeThinV) ? k : 0);
    const int64_t ldu = m_;
    const int64_t ldvt = vtrows > 0 ? vtrows : 1;

    if (ucols > 0) d_U_ = internal::DeviceBuffer(static_cast<size_t>(m_) * static_cast<size_t>(ucols) * sizeof(Scalar));
    if (vtrows > 0)
      d_VT_ = internal::DeviceBuffer(static_cast<size_t>(vtrows) * static_cast<size_t>(n_) * sizeof(Scalar));

    // computeType must match the matrix data type (dtype), not the singular value type (rtype).
    eigen_assert(m_ >= n_ && "Internal error: m_ < n_ should have been handled by transpose in compute()");
    size_t dev_ws = 0, host_ws = 0;
    EIGEN_CUSOLVER_CHECK(cusolverDnXgesvd_bufferSize(
        handle_, params_.p, jobu(int_opts), jobvt(int_opts), static_cast<int64_t>(m_), static_cast<int64_t>(n_), dtype,
        d_A_.ptr, lda_, rtype, d_S_.ptr, dtype, ucols > 0 ? d_U_.ptr : nullptr, ldu, dtype,
        vtrows > 0 ? d_VT_.ptr : nullptr, ldvt, dtype, &dev_ws, &host_ws));

    ensure_scratch(dev_ws);
    h_workspace_.resize(host_ws);

    // Compute SVD.
    EIGEN_CUSOLVER_CHECK(cusolverDnXgesvd(handle_, params_.p, jobu(int_opts), jobvt(int_opts), static_cast<int64_t>(m_),
                                          static_cast<int64_t>(n_), dtype, d_A_.ptr, lda_, rtype, d_S_.ptr, dtype,
                                          ucols > 0 ? d_U_.ptr : nullptr, ldu, dtype, vtrows > 0 ? d_VT_.ptr : nullptr,
                                          ldvt, dtype, scratch_workspace(), dev_ws,
                                          host_ws > 0 ? h_workspace_.data() : nullptr, host_ws, scratch_info()));

    EIGEN_CUDA_RUNTIME_CHECK(
        cudaMemcpyAsync(&info_word_, scratch_info(), sizeof(int), cudaMemcpyDeviceToHost, stream_));
  }

  // Internal solve: X = V * diag(D) * U^H * B, using top `trunc` triplets.
  // D_ii = 1/S_i (if lambda==0) or S_i/(S_i^2+lambda^2).
  //
  // For non-transposed: stored U, VT. X = VT^H * D * U^H * B.
  // For transposed (SVD of A^H): stored U', VT'. X = U' * D * VT' * B.
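  //
  // Equivalently, with U_k, V_k the leading `trunc` singular vectors and
  // D = diag(d_i):  X = V_k * D * U_k^H * B.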
  template <typename Rhs>
  PlainMatrix solve_impl(const MatrixBase<Rhs>& B, Index trunc, RealScalar lambda) const {
    sync_info();
    eigen_assert(info_ == Success && "GpuSVD::solve called on a failed or uninitialized decomposition");
    eigen_assert((options_ & (ComputeThinU | ComputeFullU)) && "solve requires U");
    eigen_assert((options_ & (ComputeThinV | ComputeFullV)) && "solve requires V");

    const Index m_orig = transposed_ ? n_ : m_;
    const Index n_orig = transposed_ ? m_ : n_;
    eigen_assert(B.rows() == m_orig);

    const Index k = (std::min)(m_, n_);  // = min(m_orig, n_orig)
    const Index kk = (std::min)(trunc, k);
    const Index nrhs = B.cols();

    // Download S to host to build the diagonal scaling.
    RealVector S(k);
    EIGEN_CUDA_RUNTIME_CHECK(
        cudaMemcpy(S.data(), d_S_.ptr, static_cast<size_t>(k) * sizeof(RealScalar), cudaMemcpyDeviceToHost));
    EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));

    // Upload B (m_orig × nrhs).
    const PlainMatrix rhs(B);
    internal::DeviceBuffer d_B(static_cast<size_t>(m_orig) * static_cast<size_t>(nrhs) * sizeof(Scalar));
    EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_B.ptr, rhs.data(),
                                             static_cast<size_t>(m_orig) * static_cast<size_t>(nrhs) * sizeof(Scalar),
                                             cudaMemcpyHostToDevice, stream_));

    // Step 1: tmp = U_orig^H * B (kk × nrhs).
    // Non-transposed: U_stored is m_×ucols, U_orig = U_stored. Use U_stored^H * B.
    // Transposed: U_orig = VT_stored^H, so U_orig^H = VT_stored. Use VT_stored * B (no transpose!).
    internal::DeviceBuffer d_tmp(static_cast<size_t>(kk) * static_cast<size_t>(nrhs) * sizeof(Scalar));
    {
      Scalar alpha_one(1), beta_zero(0);
      constexpr cudaDataType_t dtype = internal::cuda_data_type<Scalar>::value;
      constexpr cublasComputeType_t compute = internal::cuda_compute_type<Scalar>::value;

      if (!transposed_) {
        // U_stored^H * B: (m_×kk)^H × (m_×nrhs) → kk×nrhs.
        EIGEN_CUBLAS_CHECK(cublasGemmEx(cublas_, CUBLAS_OP_C, CUBLAS_OP_N, static_cast<int>(kk), static_cast<int>(nrhs),
                                        static_cast<int>(m_), &alpha_one, d_U_.ptr, dtype, static_cast<int>(m_),
                                        d_B.ptr, dtype, static_cast<int>(m_orig), &beta_zero, d_tmp.ptr, dtype,
                                        static_cast<int>(kk), compute, internal::cuda_gemm_algo()));
      } else {
        // VT_stored * B: VT_stored is vtrows×n_ = kk×m_orig (thin), NoTrans.
        // vtrows×m_orig times m_orig×nrhs → vtrows×nrhs. Use first kk rows.
        const Index vtrows_stored = (swap_uv_options(options_) & ComputeFullV) ? n_ : k;
        EIGEN_CUBLAS_CHECK(cublasGemmEx(
            cublas_, CUBLAS_OP_N, CUBLAS_OP_N, static_cast<int>(kk), static_cast<int>(nrhs), static_cast<int>(m_orig),
            &alpha_one, d_VT_.ptr, dtype, static_cast<int>(vtrows_stored), d_B.ptr, dtype, static_cast<int>(m_orig),
            &beta_zero, d_tmp.ptr, dtype, static_cast<int>(kk), compute, internal::cuda_gemm_algo()));
      }
    }

    // Step 2: Scale row i of tmp by D_ii.
    // Download tmp to host, scale, re-upload. (Simple and correct; a device kernel would be faster.)
    {
      PlainMatrix tmp(kk, nrhs);
      EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(tmp.data(), d_tmp.ptr,
                                               static_cast<size_t>(kk) * static_cast<size_t>(nrhs) * sizeof(Scalar),
                                               cudaMemcpyDeviceToHost, stream_));
      EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));

      for (Index i = 0; i < kk; ++i) {
        RealScalar si = S(i);
        RealScalar di = (lambda == RealScalar(0)) ? (si > 0 ? RealScalar(1) / si : RealScalar(0))
                                                  : si / (si * si + lambda * lambda);
        tmp.row(i) *= Scalar(di);
      }

      EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_tmp.ptr, tmp.data(),
                                               static_cast<size_t>(kk) * static_cast<size_t>(nrhs) * sizeof(Scalar),
                                               cudaMemcpyHostToDevice, stream_));
    }

    // Step 3: X = V_orig * tmp (n_orig × nrhs).
    // Non-transposed: V_orig = VT_stored^H. VT_stored[:kk,:]^H * tmp → n_orig × nrhs.
    // Transposed: V_orig = U_stored[:,:kk]. U_stored * tmp → n_orig × nrhs (NoTrans).
    PlainMatrix X(n_orig, nrhs);
    {
      internal::DeviceBuffer d_X(static_cast<size_t>(n_orig) * static_cast<size_t>(nrhs) * sizeof(Scalar));
      Scalar alpha_one(1), beta_zero(0);
      constexpr cudaDataType_t dtype = internal::cuda_data_type<Scalar>::value;
      constexpr cublasComputeType_t compute = internal::cuda_compute_type<Scalar>::value;

      if (!transposed_) {
        const Index vtrows = (options_ & ComputeFullV) ? n_ : k;
        EIGEN_CUBLAS_CHECK(cublasGemmEx(cublas_, CUBLAS_OP_C, CUBLAS_OP_N, static_cast<int>(n_orig),
                                        static_cast<int>(nrhs), static_cast<int>(kk), &alpha_one, d_VT_.ptr, dtype,
                                        static_cast<int>(vtrows), d_tmp.ptr, dtype, static_cast<int>(kk), &beta_zero,
                                        d_X.ptr, dtype, static_cast<int>(n_orig), compute, internal::cuda_gemm_algo()));
      } else {
        // U_stored is m_×ucols. V_orig = U_stored[:,:kk]. NoTrans × tmp.
        EIGEN_CUBLAS_CHECK(cublasGemmEx(cublas_, CUBLAS_OP_N, CUBLAS_OP_N, static_cast<int>(n_orig),
                                        static_cast<int>(nrhs), static_cast<int>(kk), &alpha_one, d_U_.ptr, dtype,
                                        static_cast<int>(m_), d_tmp.ptr, dtype, static_cast<int>(kk), &beta_zero,
                                        d_X.ptr, dtype, static_cast<int>(n_orig), compute, internal::cuda_gemm_algo()));
      }

      EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(X.data(), d_X.ptr,
                                               static_cast<size_t>(n_orig) * static_cast<size_t>(nrhs) * sizeof(Scalar),
                                               cudaMemcpyDeviceToHost, stream_));
      EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
    }

    return X;
  }
};

}  // namespace Eigen

#endif  // EIGEN_GPU_SVD_H
321
Eigen/src/GPU/GpuSparseContext.h
Normal file
@@ -0,0 +1,321 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

// GPU sparse matrix-vector multiply (SpMV) and sparse matrix-dense matrix
// multiply (SpMM) via cuSPARSE.
//
// GpuSparseContext manages a cuSPARSE handle and device buffers. It accepts
// Eigen SparseMatrix<Scalar, ColMajor> (CSC) and performs SpMV/SpMM on the
// GPU. RowMajor input is implicitly converted to ColMajor.
//
// Usage:
//   GpuSparseContext<double> ctx;
//   VectorXd y = ctx.multiply(A, x);      // y = A * x
//   ctx.multiply(A, x, y, 2.0, 1.0);      // y = 2*A*x + y
//   VectorXd z = ctx.multiplyT(A, x);     // z = A^T * x
//   MatrixXd Y = ctx.multiplyMat(A, X);   // Y = A * X (multiple RHS)
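//
// GAXPY-style sketch (illustrative only; assumes a host SparseMatrix<double> A
// and conforming VectorXd x, y):
//
//   GpuSparseContext<double> ctx;
//   VectorXd y = VectorXd::Zero(A.rows());
//   ctx.multiply(A, x, y, 2.0, 1.0);   // y = 2*A*x + 1*y; device buffers are reused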

#ifndef EIGEN_GPU_SPARSE_CONTEXT_H
#define EIGEN_GPU_SPARSE_CONTEXT_H

// IWYU pragma: private
#include "./InternalHeaderCheck.h"

#include "./CuSparseSupport.h"

namespace Eigen {

template <typename Scalar_>
class GpuSparseContext {
 public:
  using Scalar = Scalar_;
  using RealScalar = typename NumTraits<Scalar>::Real;
  using StorageIndex = int;
  using SpMat = SparseMatrix<Scalar, ColMajor, StorageIndex>;
  using DenseVector = Matrix<Scalar, Dynamic, 1>;
  using DenseMatrix = Matrix<Scalar, Dynamic, Dynamic, ColMajor>;

  GpuSparseContext() {
    EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_));
    EIGEN_CUSPARSE_CHECK(cusparseCreate(&handle_));
    EIGEN_CUSPARSE_CHECK(cusparseSetStream(handle_, stream_));
  }

  ~GpuSparseContext() {
    destroy_descriptors();
    if (handle_) (void)cusparseDestroy(handle_);
    if (stream_) (void)cudaStreamDestroy(stream_);
  }

  GpuSparseContext(const GpuSparseContext&) = delete;
  GpuSparseContext& operator=(const GpuSparseContext&) = delete;

  // ---- SpMV: y = A * x -----------------------------------------------------

  /** Compute y = A * x. Returns y as a new dense vector. */
  template <typename InputType, typename Rhs>
  DenseVector multiply(const SparseMatrixBase<InputType>& A, const MatrixBase<Rhs>& x) {
    const SpMat mat(A.derived());
    DenseVector y(mat.rows());
    y.setZero();
    multiply_impl(mat, x.derived(), y, Scalar(1), Scalar(0), CUSPARSE_OPERATION_NON_TRANSPOSE);
    return y;
  }

  /** Compute y = alpha * op(A) * x + beta * y (in-place). */
  template <typename InputType, typename Rhs, typename Dest>
  void multiply(const SparseMatrixBase<InputType>& A, const MatrixBase<Rhs>& x, MatrixBase<Dest>& y,
                Scalar alpha = Scalar(1), Scalar beta = Scalar(0),
                cusparseOperation_t op = CUSPARSE_OPERATION_NON_TRANSPOSE) {
    const SpMat mat(A.derived());
    multiply_impl(mat, x.derived(), y.derived(), alpha, beta, op);
  }

  // ---- SpMV transpose: y = A^T * x -----------------------------------------

  /** Compute y = A^T * x. Returns y as a new dense vector. */
  template <typename InputType, typename Rhs>
  DenseVector multiplyT(const SparseMatrixBase<InputType>& A, const MatrixBase<Rhs>& x) {
    const SpMat mat(A.derived());
    DenseVector y(mat.cols());
    y.setZero();
    multiply_impl(mat, x.derived(), y, Scalar(1), Scalar(0), CUSPARSE_OPERATION_TRANSPOSE);
    return y;
  }

  // ---- SpMM: Y = A * X (multiple RHS) --------------------------------------

  /** Compute Y = A * X where X is a dense matrix (multiple RHS). Returns Y. */
  template <typename InputType, typename Rhs>
  DenseMatrix multiplyMat(const SparseMatrixBase<InputType>& A, const MatrixBase<Rhs>& X) {
    const SpMat mat(A.derived());
    const DenseMatrix rhs(X.derived());
    eigen_assert(mat.cols() == rhs.rows());

    const Index m = mat.rows();
    const Index n = rhs.cols();
    if (m == 0 || n == 0 || mat.nonZeros() == 0) return DenseMatrix::Zero(m, n);

    DenseMatrix Y = DenseMatrix::Zero(m, n);
    spmm_impl(mat, rhs, Y, Scalar(1), Scalar(0), CUSPARSE_OPERATION_NON_TRANSPOSE);
    return Y;
  }

  // ---- Accessors ------------------------------------------------------------

  cudaStream_t stream() const { return stream_; }

 private:
  cudaStream_t stream_ = nullptr;
  cusparseHandle_t handle_ = nullptr;

  // Cached device buffers (grow-only).
  internal::DeviceBuffer d_outerPtr_;
  internal::DeviceBuffer d_innerIdx_;
  internal::DeviceBuffer d_values_;
  internal::DeviceBuffer d_x_;
  internal::DeviceBuffer d_y_;
  internal::DeviceBuffer d_workspace_;
  size_t d_outerPtr_size_ = 0;
  size_t d_innerIdx_size_ = 0;
  size_t d_values_size_ = 0;
  size_t d_x_size_ = 0;
  size_t d_y_size_ = 0;
  size_t d_workspace_size_ = 0;

  // Cached cuSPARSE descriptors.
  cusparseSpMatDescr_t spmat_desc_ = nullptr;
  Index cached_rows_ = -1;
  Index cached_cols_ = -1;
  Index cached_nnz_ = -1;

  // ---- SpMV implementation --------------------------------------------------

  template <typename RhsDerived, typename DestDerived>
  void multiply_impl(const SpMat& A, const RhsDerived& x, DestDerived& y, Scalar alpha, Scalar beta,
                     cusparseOperation_t op) {
    eigen_assert(A.isCompressed());

    const Index m = A.rows();
    const Index n = A.cols();
    const Index nnz = A.nonZeros();
    const Index x_size = (op == CUSPARSE_OPERATION_NON_TRANSPOSE) ? n : m;
    const Index y_size = (op == CUSPARSE_OPERATION_NON_TRANSPOSE) ? m : n;

    eigen_assert(x.size() == x_size);
    eigen_assert(y.size() == y_size);

    if (m == 0 || n == 0 || nnz == 0) {
      if (beta == Scalar(0))
        y.setZero();
      else
        y *= beta;
      return;
    }

    // Upload sparse matrix to device.
    upload_sparse(A);

    // Upload x to device.
    ensure_buffer(d_x_, d_x_size_, static_cast<size_t>(x_size) * sizeof(Scalar));
    const DenseVector x_tmp(x);
    EIGEN_CUDA_RUNTIME_CHECK(
        cudaMemcpyAsync(d_x_.ptr, x_tmp.data(), x_size * sizeof(Scalar), cudaMemcpyHostToDevice, stream_));

    // Upload y to device (for beta != 0).
    ensure_buffer(d_y_, d_y_size_, static_cast<size_t>(y_size) * sizeof(Scalar));
    if (beta != Scalar(0)) {
      const DenseVector y_tmp(y);
      EIGEN_CUDA_RUNTIME_CHECK(
          cudaMemcpyAsync(d_y_.ptr, y_tmp.data(), y_size * sizeof(Scalar), cudaMemcpyHostToDevice, stream_));
    }

    // Create dense vector descriptors.
    constexpr cudaDataType_t dtype = internal::cuda_data_type<Scalar>::value;
    cusparseDnVecDescr_t x_desc = nullptr, y_desc = nullptr;
    EIGEN_CUSPARSE_CHECK(cusparseCreateDnVec(&x_desc, x_size, d_x_.ptr, dtype));
    EIGEN_CUSPARSE_CHECK(cusparseCreateDnVec(&y_desc, y_size, d_y_.ptr, dtype));

    // Query workspace size.
    size_t ws_size = 0;
    EIGEN_CUSPARSE_CHECK(cusparseSpMV_bufferSize(handle_, op, &alpha, spmat_desc_, x_desc, &beta, y_desc, dtype,
                                                 CUSPARSE_SPMV_ALG_DEFAULT, &ws_size));
    ensure_buffer(d_workspace_, d_workspace_size_, ws_size);

    // Execute SpMV.
    EIGEN_CUSPARSE_CHECK(cusparseSpMV(handle_, op, &alpha, spmat_desc_, x_desc, &beta, y_desc, dtype,
                                      CUSPARSE_SPMV_ALG_DEFAULT, d_workspace_.ptr));

    // Download result.
    EIGEN_CUDA_RUNTIME_CHECK(
        cudaMemcpyAsync(y.data(), d_y_.ptr, y_size * sizeof(Scalar), cudaMemcpyDeviceToHost, stream_));
    EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));

    (void)cusparseDestroyDnVec(x_desc);
    (void)cusparseDestroyDnVec(y_desc);
  }

  // ---- SpMM implementation --------------------------------------------------

  void spmm_impl(const SpMat& A, const DenseMatrix& X, DenseMatrix& Y, Scalar alpha, Scalar beta,
                 cusparseOperation_t op) {
    eigen_assert(A.isCompressed());

    const Index m = A.rows();
    const Index n = X.cols();
    const Index k = A.cols();
    const Index nnz = A.nonZeros();

    if (m == 0 || n == 0 || k == 0 || nnz == 0) {
      if (beta == Scalar(0))
        Y.setZero();
      else
        Y *= beta;
      return;
    }

    upload_sparse(A);

    // Upload X to device.
    const size_t x_bytes = static_cast<size_t>(k) * static_cast<size_t>(n) * sizeof(Scalar);
    const size_t y_bytes = static_cast<size_t>(m) * static_cast<size_t>(n) * sizeof(Scalar);
    ensure_buffer(d_x_, d_x_size_, x_bytes);
    ensure_buffer(d_y_, d_y_size_, y_bytes);
    EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_x_.ptr, X.data(), x_bytes, cudaMemcpyHostToDevice, stream_));
    if (beta != Scalar(0)) {
      EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_y_.ptr, Y.data(), y_bytes, cudaMemcpyHostToDevice, stream_));
    }

    // Create dense matrix descriptors.
    constexpr cudaDataType_t dtype = internal::cuda_data_type<Scalar>::value;
    cusparseDnMatDescr_t x_desc = nullptr, y_desc = nullptr;
    // Eigen is column-major, so ld = rows.
    EIGEN_CUSPARSE_CHECK(cusparseCreateDnMat(&x_desc, k, n, k, d_x_.ptr, dtype, CUSPARSE_ORDER_COL));
    EIGEN_CUSPARSE_CHECK(cusparseCreateDnMat(&y_desc, m, n, m, d_y_.ptr, dtype, CUSPARSE_ORDER_COL));

    // Query workspace.
    size_t ws_size = 0;
    EIGEN_CUSPARSE_CHECK(cusparseSpMM_bufferSize(handle_, op, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, spmat_desc_,
                                                 x_desc, &beta, y_desc, dtype, CUSPARSE_SPMM_ALG_DEFAULT, &ws_size));
    ensure_buffer(d_workspace_, d_workspace_size_, ws_size);

    // Execute SpMM.
    EIGEN_CUSPARSE_CHECK(cusparseSpMM(handle_, op, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, spmat_desc_, x_desc, &beta,
                                      y_desc, dtype, CUSPARSE_SPMM_ALG_DEFAULT, d_workspace_.ptr));

    // Download result.
    EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(Y.data(), d_y_.ptr, y_bytes, cudaMemcpyDeviceToHost, stream_));
    EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));

    (void)cusparseDestroyDnMat(x_desc);
    (void)cusparseDestroyDnMat(y_desc);
  }

  // ---- Helpers --------------------------------------------------------------

  void upload_sparse(const SpMat& A) {
|
||||
const Index m = A.rows();
|
||||
const Index n = A.cols();
|
||||
const Index nnz = A.nonZeros();
|
||||
|
||||
const size_t outer_bytes = static_cast<size_t>(n + 1) * sizeof(StorageIndex);
|
||||
const size_t inner_bytes = static_cast<size_t>(nnz) * sizeof(StorageIndex);
|
||||
const size_t val_bytes = static_cast<size_t>(nnz) * sizeof(Scalar);
|
||||
|
||||
ensure_buffer(d_outerPtr_, d_outerPtr_size_, outer_bytes);
|
||||
ensure_buffer(d_innerIdx_, d_innerIdx_size_, inner_bytes);
|
||||
ensure_buffer(d_values_, d_values_size_, val_bytes);
|
||||
|
||||
EIGEN_CUDA_RUNTIME_CHECK(
|
||||
cudaMemcpyAsync(d_outerPtr_.ptr, A.outerIndexPtr(), outer_bytes, cudaMemcpyHostToDevice, stream_));
|
||||
EIGEN_CUDA_RUNTIME_CHECK(
|
||||
cudaMemcpyAsync(d_innerIdx_.ptr, A.innerIndexPtr(), inner_bytes, cudaMemcpyHostToDevice, stream_));
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_values_.ptr, A.valuePtr(), val_bytes, cudaMemcpyHostToDevice, stream_));
|
||||
|
||||
// Recreate descriptor if shape changed.
|
||||
if (m != cached_rows_ || n != cached_cols_ || nnz != cached_nnz_) {
|
||||
destroy_descriptors();
|
||||
|
||||
constexpr cusparseIndexType_t idx_type = (sizeof(StorageIndex) == 4) ? CUSPARSE_INDEX_32I : CUSPARSE_INDEX_64I;
|
||||
constexpr cudaDataType_t val_type = internal::cuda_data_type<Scalar>::value;
|
||||
|
||||
// ColMajor → CSC. outerIndexPtr = col offsets, innerIndexPtr = row indices.
|
||||
EIGEN_CUSPARSE_CHECK(cusparseCreateCsc(&spmat_desc_, m, n, nnz, d_outerPtr_.ptr, d_innerIdx_.ptr, d_values_.ptr,
|
||||
idx_type, idx_type, CUSPARSE_INDEX_BASE_ZERO, val_type));
|
||||
cached_rows_ = m;
|
||||
cached_cols_ = n;
|
||||
cached_nnz_ = nnz;
|
||||
} else {
|
||||
// Same shape — just update pointers.
|
||||
EIGEN_CUSPARSE_CHECK(cusparseCscSetPointers(spmat_desc_, d_outerPtr_.ptr, d_innerIdx_.ptr, d_values_.ptr));
|
||||
}
|
||||
}
|
||||
|
||||
void destroy_descriptors() {
|
||||
if (spmat_desc_) {
|
||||
(void)cusparseDestroySpMat(spmat_desc_);
|
||||
spmat_desc_ = nullptr;
|
||||
}
|
||||
cached_rows_ = -1;
|
||||
cached_cols_ = -1;
|
||||
cached_nnz_ = -1;
|
||||
}
|
||||
|
||||
void ensure_buffer(internal::DeviceBuffer& buf, size_t& current_size, size_t needed) {
|
||||
if (needed > current_size) {
|
||||
if (buf.ptr) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
|
||||
buf = internal::DeviceBuffer(needed);
|
||||
current_size = needed;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace Eigen
|
||||
|
||||
#endif // EIGEN_GPU_SPARSE_CONTEXT_H
62
Eigen/src/GPU/GpuSparseLDLT.h
Normal file
@@ -0,0 +1,62 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

// GPU sparse LDL^T / LDL^H factorization via cuDSS.
//
// For symmetric indefinite (or Hermitian indefinite) sparse matrices.
// Same three-phase workflow as GpuSparseLLT.
//
// Usage:
//   GpuSparseLDLT<double> ldlt(A);  // analyze + factorize
//   VectorXd x = ldlt.solve(b);     // solve

#ifndef EIGEN_GPU_SPARSE_LDLT_H
#define EIGEN_GPU_SPARSE_LDLT_H

// IWYU pragma: private
#include "./InternalHeaderCheck.h"

#include "./GpuSparseSolverBase.h"

namespace Eigen {

/** GPU sparse LDL^T factorization (symmetric indefinite / Hermitian indefinite).
 *
 * Wraps cuDSS with CUDSS_MTYPE_SYMMETRIC (real) or CUDSS_MTYPE_HERMITIAN (complex).
 * Uses pivoting for numerical stability.
 *
 * \tparam Scalar_  float, double, complex<float>, or complex<double>
 * \tparam UpLo_    Lower (default) or Upper — which triangle of A is stored
 */
template <typename Scalar_, int UpLo_ = Lower>
class GpuSparseLDLT : public internal::GpuSparseSolverBase<Scalar_, GpuSparseLDLT<Scalar_, UpLo_>> {
  using Base = internal::GpuSparseSolverBase<Scalar_, GpuSparseLDLT>;
  friend Base;

 public:
  using Scalar = Scalar_;
  enum { UpLo = UpLo_ };

  GpuSparseLDLT() = default;

  template <typename InputType>
  explicit GpuSparseLDLT(const SparseMatrixBase<InputType>& A) {
    this->compute(A);
  }

  static constexpr bool needs_csr_conversion() { return false; }
  static constexpr cudssMatrixType_t cudss_matrix_type() { return internal::cudss_symmetric_type<Scalar>::value; }
  static constexpr cudssMatrixViewType_t cudss_matrix_view() {
    return internal::cudss_view_type<UpLo, ColMajor>::value;
  }
};

}  // namespace Eigen

#endif  // EIGEN_GPU_SPARSE_LDLT_H
62
Eigen/src/GPU/GpuSparseLLT.h
Normal file
@@ -0,0 +1,62 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

// GPU sparse Cholesky (LL^T / LL^H) via cuDSS.
//
// Usage:
//   GpuSparseLLT<double> llt(A);   // analyze + factorize
//   VectorXd x = llt.solve(b);     // solve
//   llt.analyzePattern(A);         // or separate phases
//   llt.factorize(A_new);          // reuse symbolic analysis

#ifndef EIGEN_GPU_SPARSE_LLT_H
#define EIGEN_GPU_SPARSE_LLT_H

// IWYU pragma: private
#include "./InternalHeaderCheck.h"

#include "./GpuSparseSolverBase.h"

namespace Eigen {

/** GPU sparse Cholesky factorization (LL^T for real, LL^H for complex).
 *
 * Wraps cuDSS with CUDSS_MTYPE_SPD (real) or CUDSS_MTYPE_HPD (complex).
 * Accepts ColMajor SparseMatrix (CSC), reinterpreted as CSR with swapped
 * triangle view for zero-copy upload.
 *
 * \tparam Scalar_  float, double, complex<float>, or complex<double>
 * \tparam UpLo_    Lower (default) or Upper — which triangle of A is stored
 */
template <typename Scalar_, int UpLo_ = Lower>
class GpuSparseLLT : public internal::GpuSparseSolverBase<Scalar_, GpuSparseLLT<Scalar_, UpLo_>> {
  using Base = internal::GpuSparseSolverBase<Scalar_, GpuSparseLLT>;
  friend Base;

 public:
  using Scalar = Scalar_;
  enum { UpLo = UpLo_ };

  GpuSparseLLT() = default;

  template <typename InputType>
  explicit GpuSparseLLT(const SparseMatrixBase<InputType>& A) {
    this->compute(A);
  }

  static constexpr bool needs_csr_conversion() { return false; }
  static constexpr cudssMatrixType_t cudss_matrix_type() { return internal::cudss_spd_type<Scalar>::value; }
  static constexpr cudssMatrixViewType_t cudss_matrix_view() {
    return internal::cudss_view_type<UpLo, ColMajor>::value;
  }
};

}  // namespace Eigen

#endif  // EIGEN_GPU_SPARSE_LLT_H
59
Eigen/src/GPU/GpuSparseLU.h
Normal file
@@ -0,0 +1,59 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

// GPU sparse LU factorization via cuDSS.
//
// For general (non-symmetric) sparse matrices. Uses pivoting.
// Same three-phase workflow as GpuSparseLLT.
//
// Usage:
//   GpuSparseLU<double> lu(A);   // analyze + factorize
//   VectorXd x = lu.solve(b);    // solve

#ifndef EIGEN_GPU_SPARSE_LU_H
#define EIGEN_GPU_SPARSE_LU_H

// IWYU pragma: private
#include "./InternalHeaderCheck.h"

#include "./GpuSparseSolverBase.h"

namespace Eigen {

/** GPU sparse LU factorization (general matrices).
 *
 * Wraps cuDSS with CUDSS_MTYPE_GENERAL and CUDSS_MVIEW_FULL.
 * Accepts ColMajor SparseMatrix (CSC); internally converts to RowMajor
 * CSR since cuDSS requires CSR input.
 *
 * \tparam Scalar_  float, double, complex<float>, or complex<double>
 */
template <typename Scalar_>
class GpuSparseLU : public internal::GpuSparseSolverBase<Scalar_, GpuSparseLU<Scalar_>> {
  using Base = internal::GpuSparseSolverBase<Scalar_, GpuSparseLU>;
  friend Base;

 public:
  using Scalar = Scalar_;

  GpuSparseLU() = default;

  template <typename InputType>
  explicit GpuSparseLU(const SparseMatrixBase<InputType>& A) {
    this->compute(A);
  }

  static constexpr bool needs_csr_conversion() { return true; }
  static constexpr cudssMatrixType_t cudss_matrix_type() { return CUDSS_MTYPE_GENERAL; }
  static constexpr cudssMatrixViewType_t cudss_matrix_view() { return CUDSS_MVIEW_FULL; }
};

}  // namespace Eigen

#endif  // EIGEN_GPU_SPARSE_LU_H
356
Eigen/src/GPU/GpuSparseSolverBase.h
Normal file
@@ -0,0 +1,356 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

// Common base for GPU sparse direct solvers (LLT, LDLT, LU) via cuDSS.
//
// All three solver types share the same three-phase workflow
// (analyzePattern → factorize → solve) and differ only in the
// cudssMatrixType_t and cudssMatrixViewType_t passed to cuDSS.
// This CRTP base implements the entire workflow; derived classes
// provide the matrix type/view via static constexpr members.

#ifndef EIGEN_GPU_SPARSE_SOLVER_BASE_H
#define EIGEN_GPU_SPARSE_SOLVER_BASE_H

// IWYU pragma: private
#include "./InternalHeaderCheck.h"

#include "./CuDssSupport.h"

namespace Eigen {
namespace internal {

/** CRTP base for GPU sparse direct solvers.
 *
 * \tparam Scalar_  Element type (passed explicitly to avoid incomplete-type issues with CRTP).
 * \tparam Derived  The concrete solver class (GpuSparseLLT, GpuSparseLDLT, GpuSparseLU).
 *                  Must provide:
 *                  - `static constexpr cudssMatrixType_t cudss_matrix_type()`
 *                  - `static constexpr cudssMatrixViewType_t cudss_matrix_view()`
 */
template <typename Scalar_, typename Derived>
class GpuSparseSolverBase {
 public:
  using Scalar = Scalar_;
  using RealScalar = typename NumTraits<Scalar>::Real;
  using StorageIndex = int;
  using SpMat = SparseMatrix<Scalar, ColMajor, StorageIndex>;
  using CsrMat = SparseMatrix<Scalar, RowMajor, StorageIndex>;
  using DenseVector = Matrix<Scalar, Dynamic, 1>;
  using DenseMatrix = Matrix<Scalar, Dynamic, Dynamic, ColMajor>;

  GpuSparseSolverBase() { init_context(); }

  ~GpuSparseSolverBase() {
    destroy_cudss_objects();
    if (handle_) (void)cudssDestroy(handle_);
    if (stream_) (void)cudaStreamDestroy(stream_);
  }

  GpuSparseSolverBase(const GpuSparseSolverBase&) = delete;
  GpuSparseSolverBase& operator=(const GpuSparseSolverBase&) = delete;

  // ---- Configuration --------------------------------------------------------

  /** Set the fill-reducing ordering algorithm. Must be called before compute/analyzePattern. */
  void setOrdering(GpuSparseOrdering ordering) { ordering_ = ordering; }

  // ---- Factorization --------------------------------------------------------

  /** Symbolic analysis + numeric factorization. */
  template <typename InputType>
  Derived& compute(const SparseMatrixBase<InputType>& A) {
    analyzePattern(A);
    if (info_ == Success) {
      factorize(A);
    }
    return derived();
  }

  /** Symbolic analysis only. Uploads sparsity structure to device.
   * This phase is synchronous (blocks until complete). */
  template <typename InputType>
  Derived& analyzePattern(const SparseMatrixBase<InputType>& A) {
    const SpMat csc(A.derived());
    eigen_assert(csc.rows() == csc.cols() && "GpuSparseSolver requires a square matrix");
    eigen_assert(csc.isCompressed() && "GpuSparseSolver requires a compressed sparse matrix");

    n_ = csc.rows();
    info_ = InvalidInput;
    analysis_done_ = false;

    if (n_ == 0) {
      nnz_ = 0;
      info_ = Success;
      analysis_done_ = true;
      return derived();
    }

    // For symmetric solvers, ColMajor CSC can be reinterpreted as CSR with
    // swapped triangle view (zero copy). For general solvers, we must convert
    // to actual RowMajor CSR so cuDSS sees the correct matrix, not A^T.
    if (Derived::needs_csr_conversion()) {
      const CsrMat csr(csc);
      nnz_ = csr.nonZeros();
      upload_csr(csr);
    } else {
      nnz_ = csc.nonZeros();
      upload_csr_from_csc(csc);
    }
    create_cudss_matrix();
    apply_ordering_config();

    if (data_) EIGEN_CUDSS_CHECK(cudssDataDestroy(handle_, data_));
    EIGEN_CUDSS_CHECK(cudssDataCreate(handle_, &data_));

    create_placeholder_dense();

    EIGEN_CUDSS_CHECK(cudssExecute(handle_, CUDSS_PHASE_ANALYSIS, config_, data_, d_A_cudss_, d_x_cudss_, d_b_cudss_));

    analysis_done_ = true;
    info_ = Success;
    return derived();
  }

  /** Numeric factorization using the symbolic analysis from analyzePattern.
   *
   * \warning The sparsity pattern (outerIndexPtr, innerIndexPtr) must be
   * identical to the one passed to analyzePattern(). Only the numerical
   * values may change. Passing a different pattern is undefined behavior.
   * This matches the contract of CHOLMOD, UMFPACK, and cuDSS's own API.
   *
   * This phase is asynchronous — info() lazily synchronizes. */
  template <typename InputType>
  Derived& factorize(const SparseMatrixBase<InputType>& A) {
    eigen_assert(analysis_done_ && "factorize() requires analyzePattern() first");

    if (n_ == 0) {
      info_ = Success;
      return derived();
    }

    // Convert to the same format used in analyzePattern.
    // Both temporaries must outlive the async memcpy (pageable H2D is actually
    // synchronous w.r.t. the host, but keep them alive for clarity).
    const SpMat csc(A.derived());
    eigen_assert(csc.rows() == n_ && csc.cols() == n_);

    const Scalar* value_ptr;
    Index value_nnz;
    CsrMat csr_tmp;
    if (Derived::needs_csr_conversion()) {
      csr_tmp = CsrMat(csc);
      value_ptr = csr_tmp.valuePtr();
      value_nnz = csr_tmp.nonZeros();
    } else {
      value_ptr = csc.valuePtr();
      value_nnz = csc.nonZeros();
    }
    eigen_assert(value_nnz == nnz_);

    EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_values_.ptr, value_ptr, static_cast<size_t>(nnz_) * sizeof(Scalar),
                                             cudaMemcpyHostToDevice, stream_));

    EIGEN_CUDSS_CHECK(cudssMatrixSetValues(d_A_cudss_, d_values_.ptr));

    info_ = InvalidInput;
    info_synced_ = false;
    EIGEN_CUDSS_CHECK(
        cudssExecute(handle_, CUDSS_PHASE_FACTORIZATION, config_, data_, d_A_cudss_, d_x_cudss_, d_b_cudss_));

    return derived();
  }

  // ---- Solve ----------------------------------------------------------------

  /** Solve A * X = B. Returns X as a dense matrix.
   * Supports single or multiple right-hand sides. */
  template <typename Rhs>
  DenseMatrix solve(const MatrixBase<Rhs>& B) const {
    sync_info();
    eigen_assert(info_ == Success && "GpuSparseSolver::solve requires a successful factorization");
    eigen_assert(B.rows() == n_);

    const DenseMatrix rhs(B);
    const int64_t nrhs = static_cast<int64_t>(rhs.cols());

    if (n_ == 0) return DenseMatrix(0, rhs.cols());

    const size_t rhs_bytes = static_cast<size_t>(n_) * static_cast<size_t>(nrhs) * sizeof(Scalar);
    DeviceBuffer d_b(rhs_bytes);
    DeviceBuffer d_x(rhs_bytes);
    EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_b.ptr, rhs.data(), rhs_bytes, cudaMemcpyHostToDevice, stream_));

    constexpr cudaDataType_t dtype = cuda_data_type<Scalar>::value;
    cudssMatrix_t b_cudss = nullptr, x_cudss = nullptr;
    EIGEN_CUDSS_CHECK(cudssMatrixCreateDn(&b_cudss, static_cast<int64_t>(n_), nrhs, static_cast<int64_t>(n_), d_b.ptr,
                                          dtype, CUDSS_LAYOUT_COL_MAJOR));
    EIGEN_CUDSS_CHECK(cudssMatrixCreateDn(&x_cudss, static_cast<int64_t>(n_), nrhs, static_cast<int64_t>(n_), d_x.ptr,
                                          dtype, CUDSS_LAYOUT_COL_MAJOR));

    EIGEN_CUDSS_CHECK(cudssExecute(handle_, CUDSS_PHASE_SOLVE, config_, data_, d_A_cudss_, x_cudss, b_cudss));

    DenseMatrix X(n_, rhs.cols());
    EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(X.data(), d_x.ptr, rhs_bytes, cudaMemcpyDeviceToHost, stream_));
    EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));

    (void)cudssMatrixDestroy(b_cudss);
    (void)cudssMatrixDestroy(x_cudss);

    return X;
  }

  // ---- Accessors ------------------------------------------------------------

  ComputationInfo info() const {
    sync_info();
    return info_;
  }
  Index rows() const { return n_; }
  Index cols() const { return n_; }

  cudaStream_t stream() const { return stream_; }

 protected:
  // ---- CUDA / cuDSS handles -------------------------------------------------
  cudaStream_t stream_ = nullptr;
  cudssHandle_t handle_ = nullptr;
  cudssConfig_t config_ = nullptr;
  cudssData_t data_ = nullptr;
  cudssMatrix_t d_A_cudss_ = nullptr;
  cudssMatrix_t d_x_cudss_ = nullptr;
  cudssMatrix_t d_b_cudss_ = nullptr;

  // ---- Device buffers for CSR arrays ----------------------------------------
  DeviceBuffer d_rowPtr_;
  DeviceBuffer d_colIdx_;
  DeviceBuffer d_values_;

  // ---- State ----------------------------------------------------------------
  Index n_ = 0;
  Index nnz_ = 0;
  ComputationInfo info_ = InvalidInput;
  bool info_synced_ = true;
  bool analysis_done_ = false;
  GpuSparseOrdering ordering_ = GpuSparseOrdering::AMD;

 private:
  Derived& derived() { return static_cast<Derived&>(*this); }
  const Derived& derived() const { return static_cast<const Derived&>(*this); }

  void init_context() {
    EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_));
    EIGEN_CUDSS_CHECK(cudssCreate(&handle_));
    EIGEN_CUDSS_CHECK(cudssSetStream(handle_, stream_));
    EIGEN_CUDSS_CHECK(cudssConfigCreate(&config_));
  }

  void sync_info() const {
    if (!info_synced_) {
      EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
      int cudss_info = 0;
      EIGEN_CUDSS_CHECK(cudssDataGet(handle_, data_, CUDSS_DATA_INFO, &cudss_info, sizeof(cudss_info), nullptr));
      auto* self = const_cast<GpuSparseSolverBase*>(this);
      self->info_ = (cudss_info == 0) ? Success : NumericalIssue;
      self->info_synced_ = true;
    }
  }

  void destroy_cudss_objects() {
    if (d_A_cudss_) {
      (void)cudssMatrixDestroy(d_A_cudss_);
      d_A_cudss_ = nullptr;
    }
    if (d_x_cudss_) {
      (void)cudssMatrixDestroy(d_x_cudss_);
      d_x_cudss_ = nullptr;
    }
    if (d_b_cudss_) {
      (void)cudssMatrixDestroy(d_b_cudss_);
      d_b_cudss_ = nullptr;
    }
    if (data_) {
      (void)cudssDataDestroy(handle_, data_);
      data_ = nullptr;
    }
    if (config_) {
      (void)cudssConfigDestroy(config_);
      config_ = nullptr;
    }
  }

  // Upload CSR from a RowMajor sparse matrix (native CSR).
  void upload_csr(const CsrMat& csr) { upload_compressed(csr.outerIndexPtr(), csr.innerIndexPtr(), csr.valuePtr()); }

  // Upload CSC arrays reinterpreted as CSR (for symmetric matrices: CSC(A) = CSR(A^T) = CSR(A)).
  void upload_csr_from_csc(const SpMat& csc) {
    upload_compressed(csc.outerIndexPtr(), csc.innerIndexPtr(), csc.valuePtr());
  }

  void upload_compressed(const StorageIndex* outer, const StorageIndex* inner, const Scalar* values) {
    const size_t rowptr_bytes = static_cast<size_t>(n_ + 1) * sizeof(StorageIndex);
    const size_t colidx_bytes = static_cast<size_t>(nnz_) * sizeof(StorageIndex);
    const size_t values_bytes = static_cast<size_t>(nnz_) * sizeof(Scalar);

    d_rowPtr_ = DeviceBuffer(rowptr_bytes);
    d_colIdx_ = DeviceBuffer(colidx_bytes);
    d_values_ = DeviceBuffer(values_bytes);

    EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_rowPtr_.ptr, outer, rowptr_bytes, cudaMemcpyHostToDevice, stream_));
    EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_colIdx_.ptr, inner, colidx_bytes, cudaMemcpyHostToDevice, stream_));
    EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_values_.ptr, values, values_bytes, cudaMemcpyHostToDevice, stream_));
  }

  void create_cudss_matrix() {
    if (d_A_cudss_) (void)cudssMatrixDestroy(d_A_cudss_);

    constexpr cudaDataType_t idx_type = cudss_index_type<StorageIndex>::value;
    constexpr cudaDataType_t val_type = cuda_data_type<Scalar>::value;
    constexpr cudssMatrixType_t mtype = Derived::cudss_matrix_type();
    constexpr cudssMatrixViewType_t mview = Derived::cudss_matrix_view();

    EIGEN_CUDSS_CHECK(cudssMatrixCreateCsr(
        &d_A_cudss_, static_cast<int64_t>(n_), static_cast<int64_t>(n_), static_cast<int64_t>(nnz_), d_rowPtr_.ptr,
        /*rowEnd=*/nullptr, d_colIdx_.ptr, d_values_.ptr, idx_type, val_type, mtype, mview, CUDSS_BASE_ZERO));
  }

  void apply_ordering_config() {
    cudssAlgType_t alg;
    switch (ordering_) {
      case GpuSparseOrdering::AMD:
        alg = CUDSS_ALG_DEFAULT;
        break;
      case GpuSparseOrdering::METIS:
        alg = CUDSS_ALG_2;
        break;
      case GpuSparseOrdering::RCM:
        alg = CUDSS_ALG_3;
        break;
      default:
        alg = CUDSS_ALG_DEFAULT;
        break;
    }
    EIGEN_CUDSS_CHECK(cudssConfigSet(config_, CUDSS_CONFIG_REORDERING_ALG, &alg, sizeof(alg)));
  }

  void create_placeholder_dense() {
    if (d_x_cudss_) (void)cudssMatrixDestroy(d_x_cudss_);
    if (d_b_cudss_) (void)cudssMatrixDestroy(d_b_cudss_);
    constexpr cudaDataType_t dtype = cuda_data_type<Scalar>::value;
    EIGEN_CUDSS_CHECK(cudssMatrixCreateDn(&d_x_cudss_, static_cast<int64_t>(n_), 1, static_cast<int64_t>(n_), nullptr,
                                          dtype, CUDSS_LAYOUT_COL_MAJOR));
    EIGEN_CUDSS_CHECK(cudssMatrixCreateDn(&d_b_cudss_, static_cast<int64_t>(n_), 1, static_cast<int64_t>(n_), nullptr,
                                          dtype, CUDSS_LAYOUT_COL_MAJOR));
  }
};

}  // namespace internal
}  // namespace Eigen

#endif  // EIGEN_GPU_SPARSE_SOLVER_BASE_H
101
Eigen/src/GPU/GpuSupport.h
Normal file
@@ -0,0 +1,101 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

// Generic CUDA runtime support shared across all GPU library integrations
// (cuSOLVER, cuBLAS, cuDSS, etc.):
//  - Error-checking macros
//  - RAII device buffer
//
// Only depends on <cuda_runtime.h>. No NVIDIA library headers.

#ifndef EIGEN_GPU_SUPPORT_H
#define EIGEN_GPU_SUPPORT_H

// IWYU pragma: private
#include "./InternalHeaderCheck.h"

#include <cuda_runtime.h>

namespace Eigen {
namespace internal {

// ---- Error-checking macros --------------------------------------------------
// These abort (via eigen_assert) on failure. Not for use in destructors.

#define EIGEN_CUDA_RUNTIME_CHECK(expr)                             \
  do {                                                             \
    cudaError_t _e = (expr);                                       \
    eigen_assert(_e == cudaSuccess && "CUDA runtime call failed"); \
  } while (0)

// ---- RAII: device buffer ----------------------------------------------------

struct DeviceBuffer {
  void* ptr = nullptr;

  DeviceBuffer() = default;

  explicit DeviceBuffer(size_t bytes) {
    if (bytes > 0) EIGEN_CUDA_RUNTIME_CHECK(cudaMalloc(&ptr, bytes));
  }

  ~DeviceBuffer() {
    if (ptr) (void)cudaFree(ptr);  // destructor: ignore errors
  }

  // Move-only.
  DeviceBuffer(DeviceBuffer&& o) noexcept : ptr(o.ptr) { o.ptr = nullptr; }
  DeviceBuffer& operator=(DeviceBuffer&& o) noexcept {
    if (this != &o) {
      if (ptr) (void)cudaFree(ptr);
      ptr = o.ptr;
      o.ptr = nullptr;
    }
    return *this;
  }

  DeviceBuffer(const DeviceBuffer&) = delete;
  DeviceBuffer& operator=(const DeviceBuffer&) = delete;

  // Adopt an existing device pointer. Caller relinquishes ownership.
  static DeviceBuffer adopt(void* p) {
    DeviceBuffer b;
    b.ptr = p;
    return b;
  }
};

// ---- Scalar → cudaDataType_t ------------------------------------------------
// Shared by cuBLAS and cuSOLVER. cudaDataType_t is defined in library_types.h
// which is included transitively by cuda_runtime.h.

template <typename Scalar>
struct cuda_data_type;

template <>
struct cuda_data_type<float> {
  static constexpr cudaDataType_t value = CUDA_R_32F;
};
template <>
struct cuda_data_type<double> {
  static constexpr cudaDataType_t value = CUDA_R_64F;
};
template <>
struct cuda_data_type<std::complex<float>> {
  static constexpr cudaDataType_t value = CUDA_C_32F;
};
template <>
struct cuda_data_type<std::complex<double>> {
  static constexpr cudaDataType_t value = CUDA_C_64F;
};

}  // namespace internal
}  // namespace Eigen

#endif  // EIGEN_GPU_SUPPORT_H
3
Eigen/src/GPU/InternalHeaderCheck.h
Normal file
@@ -0,0 +1,3 @@
#ifndef EIGEN_GPU_MODULE_H
#error "Please include Eigen/GPU instead of including headers inside the src/GPU directory directly."
#endif
636
Eigen/src/GPU/README.md
Normal file
@@ -0,0 +1,636 @@
# Eigen GPU Module (`Eigen/GPU`)

GPU-accelerated linear algebra for Eigen users, dispatching to NVIDIA CUDA
libraries (cuBLAS, cuSOLVER, cuFFT, cuSPARSE, cuDSS). Requires CUDA 11.4+;
cuDSS features require CUDA 12.0+ and a separate cuDSS install. Header-only.

## Why this module

Eigen is the linear algebra foundation for a large ecosystem of C++ projects
in robotics (ROS, Drake, MoveIt, Pinocchio), computer vision (OpenCV, COLMAP,
Open3D), scientific computing (Ceres, Stan), and beyond. Many of these
projects run on GPU-equipped hardware but cannot use GPUs for Eigen operations
without dropping down to raw CUDA library APIs.

GPU sparse solvers are a particularly acute gap. Sparse factorization is the
bottleneck in SLAM, bundle adjustment, FEM, and nonlinear optimization --
exactly the workloads where GPU acceleration matters most. Downstream projects
like [Ceres](https://github.com/ceres-solver/ceres-solver/issues/1151) and
[COLMAP](https://github.com/colmap/colmap/issues/4018) have open requests for
GPU-accelerated sparse solvers, and third-party projects like
[cholespy](https://github.com/rgl-epfl/cholespy) exist specifically because
Eigen lacks them. The `Eigen/GPU` module provides GPU sparse Cholesky, LDL^T,
and LU factorization via cuDSS, alongside dense solvers (cuSOLVER), matrix
products (cuBLAS), FFT (cuFFT), and sparse matrix-vector products (cuSPARSE).

Existing Eigen users should be able to move performance-critical dense or
sparse linear algebra to the GPU with minimal code changes and without
learning CUDA library APIs directly.

## Design philosophy

**CPU and GPU coexist.** There is no global compile-time switch that replaces
CPU implementations (unlike `EIGEN_USE_LAPACKE`). Users choose GPU solvers
explicitly -- `GpuLLT<double>` vs `LLT<MatrixXd>`, `GpuSparseLLT<double>` vs
`SimplicialLLT<SparseMatrix<double>>` -- and both coexist in the same binary.
This also lets users keep the factored matrix on device across multiple solves,
something impossible with compile-time replacement.

**Familiar syntax.** GPU operations use the same expression patterns as CPU
Eigen. Here is a side-by-side comparison:

```cpp
// ---- CPU (Eigen) ----                  // ---- GPU (Eigen/GPU) ----
#include <Eigen/Dense>                    #define EIGEN_USE_GPU
                                          #include <Eigen/GPU>

// Dense
MatrixXd A = ...;                         auto d_A = DeviceMatrix<double>::fromHost(A);
MatrixXd B = ...;                         auto d_B = DeviceMatrix<double>::fromHost(B);

MatrixXd C = A * B;                       DeviceMatrix<double> d_C = d_A * d_B;
MatrixXd X = A.llt().solve(B);            DeviceMatrix<double> d_X = d_A.llt().solve(d_B);

                                          MatrixXd X = d_X.toHost();

// Sparse (using SpMat = SparseMatrix<double>)
SimplicialLLT<SpMat> llt(A);              GpuSparseLLT<double> llt(A);
VectorXd x = llt.solve(b);                VectorXd x = llt.solve(b);
```

The GPU version reads like CPU Eigen with explicit upload/download for dense
operations, and an almost identical API for sparse solvers. Unsupported
expressions are compile errors.

**Explicit over implicit.** Host-device transfers, stream management, and
library handle lifetimes are visible in the API. There are no hidden
allocations or synchronizations except where documented (e.g., `toHost()` must
synchronize to deliver data to the host).

## Key concepts

### `DeviceMatrix<Scalar>`

A typed RAII wrapper for a dense column-major matrix in GPU device memory.
This is the GPU counterpart of Eigen's `MatrixX<Scalar>`. A vector is simply
a `DeviceMatrix` with one column.

```cpp
// Upload from host
auto d_A = DeviceMatrix<double>::fromHost(A);

// Allocate uninitialized
DeviceMatrix<double> d_C(m, n);

// Download to host
MatrixXd C = d_C.toHost();

// Async download (returns a future)
auto transfer = d_C.toHostAsync();
// ... do other work ...
MatrixXd C = transfer.get();
```

`DeviceMatrix` supports expression methods that mirror Eigen's API:
`adjoint()`, `transpose()`, `triangularView<UpLo>()`,
`selfadjointView<UpLo>()`, `llt()`, `lu()`. These return lightweight
expression objects that are evaluated when assigned.

### `GpuContext`

Every GPU operation needs a CUDA stream and library handles (cuBLAS,
cuSOLVER). `GpuContext` bundles these together.

For simple usage, you don't need to create one -- a per-thread default context
is created lazily on first use:

```cpp
// These use the thread-local default context automatically
d_C = d_A * d_B;
d_X = d_A.llt().solve(d_B);
```

For concurrent multi-stream execution, create explicit contexts:

```cpp
GpuContext ctx1, ctx2;
d_C1.device(ctx1) = d_A1 * d_B1;  // runs on stream 1
d_C2.device(ctx2) = d_A2 * d_B2;  // runs on stream 2 (concurrently)
```

## Usage

### Matrix operations (cuBLAS)

```cpp
auto d_A = DeviceMatrix<double>::fromHost(A);
auto d_B = DeviceMatrix<double>::fromHost(B);

// GEMM: C = A * B, C = A^H * B, C = A * B^T, ...
DeviceMatrix<double> d_C = d_A * d_B;
d_C = d_A.adjoint() * d_B;
d_C = d_A * d_B.transpose();

// Scaled and accumulated
d_C += 2.0 * d_A * d_B;        // alpha=2, beta=1
d_C.device(ctx) -= d_A * d_B;  // alpha=-1, beta=1 (requires explicit context)

// Triangular solve (TRSM)
d_X = d_A.triangularView<Lower>().solve(d_B);

// Symmetric/Hermitian multiply (SYMM/HEMM)
d_C = d_A.selfadjointView<Lower>() * d_B;

// Rank-k update (SYRK/HERK)
d_C.selfadjointView<Lower>().rankUpdate(d_A);  // C += A * A^H
```

### Dense solvers (cuSOLVER)

**One-shot expression syntax** -- Convenient, re-factorizes each time:

```cpp
// Cholesky solve (potrf + potrs)
d_X = d_A.llt().solve(d_B);

// LU solve (getrf + getrs)
d_Y = d_A.lu().solve(d_B);
```

**Cached factorization** -- Factor once, solve many times:

```cpp
GpuLLT<double> llt;
llt.compute(d_A);                   // factorize (async)
if (llt.info() != Success) { ... }  // lazy sync on first info() call
auto d_X1 = llt.solve(d_B1);        // reuses factor (async)
auto d_X2 = llt.solve(d_B2);        // reuses factor (async)
MatrixXd X2 = d_X2.toHost();

// LU with transpose solve
GpuLU<double> lu;
lu.compute(d_A);
auto d_Y = lu.solve(d_B, GpuLU<double>::Transpose);  // A^T Y = B

// QR solve (overdetermined least squares)
GpuQR<double> qr;
qr.compute(d_A);           // factorize on device (async)
auto d_X = qr.solve(d_B);  // Q^H * B via ormqr, then trsm on R
MatrixXd X = d_X.toHost();

// SVD (results downloaded on access)
GpuSVD<double> svd;
svd.compute(d_A, ComputeThinU | ComputeThinV);
VectorXd S = svd.singularValues();  // downloads to host
MatrixXd U = svd.matrixU();         // downloads to host
MatrixXd VT = svd.matrixVT();       // V^T (matches cuSOLVER)

// Self-adjoint eigenvalue decomposition (results downloaded on access)
GpuSelfAdjointEigenSolver<double> es;
es.compute(d_A);
VectorXd eigenvals = es.eigenvalues();   // downloads to host
MatrixXd eigenvecs = es.eigenvectors();  // downloads to host
```

The cached API keeps the factored matrix on device, avoiding redundant
host-device transfers and re-factorizations. All solvers also accept host
matrices directly as a convenience (e.g., `GpuLLT<double> llt(A)` or
`qr.solve(B)`), which handles upload/download internally.

### Sparse direct solvers (cuDSS)

Requires cuDSS (separate install, CUDA 12.0+). Define `EIGEN_CUDSS` before
including `Eigen/GPU` and link with `-lcudss`.

```cpp
SparseMatrix<double> A = ...;  // symmetric positive definite
VectorXd b = ...;

// Sparse Cholesky -- one-liner
GpuSparseLLT<double> llt(A);
VectorXd x = llt.solve(b);

// Three-phase workflow for repeated solves with the same sparsity pattern
GpuSparseLLT<double> llt;
llt.analyzePattern(A);        // symbolic analysis (once)
llt.factorize(A);             // numeric factorization
VectorXd x = llt.solve(b);
llt.factorize(A_new_values);  // refactorize (reuses symbolic analysis)
VectorXd x2 = llt.solve(b);

// Sparse LDL^T (symmetric indefinite)
GpuSparseLDLT<double> ldlt(A);
VectorXd x = ldlt.solve(b);

// Sparse LU (general non-symmetric)
GpuSparseLU<double> lu(A);
VectorXd x = lu.solve(b);
```

### FFT (cuFFT)

```cpp
GpuFFT<float> fft;

// 1D complex-to-complex
VectorXcf X = fft.fwd(x);  // forward
VectorXcf y = fft.inv(X);  // inverse (scaled by 1/n)

// 1D real-to-complex / complex-to-real
VectorXcf R = fft.fwd(r);        // returns n/2+1 complex (half-spectrum)
VectorXf s = fft.invReal(R, n);  // C2R inverse, caller specifies n

// 2D complex-to-complex
MatrixXcf B = fft.fwd2d(A);  // 2D forward
MatrixXcf C = fft.inv2d(B);  // 2D inverse (scaled by 1/(rows*cols))

// Plans are cached and reused across calls with the same size/type.
```

### Sparse matrix-vector multiply (cuSPARSE)

```cpp
SparseMatrix<double> A = ...;
VectorXd x = ...;

GpuSparseContext<double> ctx;
VectorXd y = ctx.multiply(A, x);   // y = A * x
VectorXd z = ctx.multiplyT(A, x);  // z = A^T * x
ctx.multiply(A, x, y, 2.0, 1.0);   // y = 2*A*x + y

// Multiple RHS (SpMM)
MatrixXd Y = ctx.multiplyMat(A, X);  // Y = A * X
```

### Precision control

GEMM dispatch enables tensor core algorithms by default, allowing cuBLAS to
choose the fastest algorithm for the given precision and architecture. For
double precision on sm_80+ (Ampere), this allows Ozaki emulation -- full FP64
results computed faster via tensor cores.

| Macro | Effect |
|---|---|
| *(default)* | Tensor core algorithms enabled. Float uses full FP32. Double may use Ozaki on sm_80+. |
| `EIGEN_CUDA_TF32` | Opt-in: Float uses TF32 (~2x faster, 10-bit mantissa). Double unaffected. |
| `EIGEN_NO_CUDA_TENSOR_OPS` | Opt-out: Pedantic compute types, no tensor cores. For bit-exact reproducibility. |
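
A minimal sketch of opting in at compile time. The macro names come from the
table above; that they must be visible before `Eigen/GPU` is included is an
assumption here, by analogy with `EIGEN_USE_GPU` and `EIGEN_CUDSS`:

```cpp
// Opt in to TF32 GEMMs for float (illustrative; the macro could equally be
// set with -DEIGEN_CUDA_TF32 on the compiler command line).
#define EIGEN_USE_GPU
#define EIGEN_CUDA_TF32  // assumed: defined before the include, like EIGEN_USE_GPU
#include <Eigen/GPU>

using Eigen::DeviceMatrix;
using Eigen::MatrixXf;

MatrixXf gemm_tf32(const MatrixXf& A, const MatrixXf& B) {
  auto d_A = DeviceMatrix<float>::fromHost(A);
  auto d_B = DeviceMatrix<float>::fromHost(B);
  DeviceMatrix<float> d_C = d_A * d_B;  // float GEMM now dispatched with TF32
  return d_C.toHost();
}
```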

### Stream control and async execution

Operations are asynchronous by default. The compute-solve chain runs without
host synchronization until you need a result on the host:

```
fromHost(A) --sync--> compute() --async--> solve() --async--> toHost()
    H2D               potrf                potrs               D2H
                                                               sync
```

Mandatory sync points:
- `fromHost()` -- Synchronizes to complete the upload before returning
- `toHost()` / `HostTransfer::get()` -- Must deliver data to host
- `info()` -- Must read the factorization status

**Cross-stream safety** is automatic. `DeviceMatrix` tracks write completion
via CUDA events. When a matrix written on stream A is read on stream B, the
module automatically inserts `cudaStreamWaitEvent`. Same-stream operations
skip the wait (CUDA guarantees in-order execution within a stream).
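
A short sketch of what that buys in practice (assuming `n`, `d_A`, and `d_B`
are already defined, with compatible sizes):

```cpp
GpuContext producer, consumer;  // two independent streams

DeviceMatrix<double> d_T(n, n), d_C(n, n);
d_T.device(producer) = d_A * d_B;  // GEMM enqueued on producer's stream
d_C.device(consumer) = d_T * d_A;  // reads d_T on consumer's stream; the module
                                   // makes consumer wait on producer's write
                                   // event before launching this GEMM
```

The dependency is resolved on the device via `cudaStreamWaitEvent`; the host
never blocks.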

## Reference

### Supported scalar types

`float`, `double`, `std::complex<float>`, `std::complex<double>` (unless
noted otherwise).

### Expression -> library call mapping

| DeviceMatrix expression | Library call | Parameters |
|---|---|---|
| `C = A * B` | `cublasGemmEx` | transA=N, transB=N, alpha=1, beta=0 |
| `C = A.adjoint() * B` | `cublasGemmEx` | transA=C, transB=N |
| `C = A.transpose() * B` | `cublasGemmEx` | transA=T, transB=N |
| `C = A * B.adjoint()` | `cublasGemmEx` | transA=N, transB=C |
| `C = A * B.transpose()` | `cublasGemmEx` | transA=N, transB=T |
| `C = alpha * A * B` | `cublasGemmEx` | alpha from LHS |
| `C = A * (alpha * B)` | `cublasGemmEx` | alpha from RHS |
| `C += A * B` | `cublasGemmEx` | alpha=1, beta=1 |
| `C.device(ctx) -= A * B` | `cublasGemmEx` | alpha=-1, beta=1 |
| `X = A.llt().solve(B)` | `cusolverDnXpotrf` + `Xpotrs` | uplo, n, nrhs |
| `X = A.llt<Upper>().solve(B)` | same | uplo=Upper |
| `X = A.lu().solve(B)` | `cusolverDnXgetrf` + `Xgetrs` | n, nrhs |
| `X = A.triangularView<L>().solve(B)` | `cublasXtrsm` | side=L, uplo, diag=NonUnit |
| `C = A.selfadjointView<L>() * B` | `cublasXsymm` / `cublasXhemm` | side=L, uplo |
| `C.selfadjointView<L>().rankUpdate(A)` | `cublasXsyrk` / `cublasXherk` | uplo, trans=N |

### `DeviceMatrix<Scalar>`

Typed RAII wrapper for a dense column-major matrix in GPU device memory.
Always dense (leading dimension = rows). A vector is a `DeviceMatrix` with
one column.

```cpp
// Construction
DeviceMatrix<Scalar>()            // Empty (0x0)
DeviceMatrix<Scalar>(rows, cols)  // Allocate uninitialized

// Upload / download
static DeviceMatrix fromHost(matrix, stream=nullptr)                // -> DeviceMatrix (syncs)
static DeviceMatrix fromHostAsync(ptr, rows, cols, outerStride, s)  // -> DeviceMatrix (no sync, caller manages ptr lifetime)
PlainMatrix toHost(stream=nullptr)        // -> host Matrix (syncs)
HostTransfer toHostAsync(stream=nullptr)  // -> HostTransfer future (no sync)
DeviceMatrix clone(stream=nullptr)        // -> DeviceMatrix (D2D copy, async)

// Dimensions and access
Index rows()
Index cols()
size_t sizeInBytes()
bool empty()
Scalar* data()                       // Raw device pointer
void resize(Index rows, Index cols)  // Discard contents, reallocate

// Expression builders (return lightweight views, evaluated on assignment)
AdjointView adjoint()                     // GEMM with ConjTrans
TransposeView transpose()                 // GEMM with Trans
LltExpr llt() / llt<UpLo>()               // -> .solve(d_B) -> DeviceMatrix
LuExpr lu()                               // -> .solve(d_B) -> DeviceMatrix
TriangularView triangularView<UpLo>()     // -> .solve(d_B) -> DeviceMatrix (TRSM)
SelfAdjointView selfadjointView<UpLo>()   // -> * d_B (SYMM), .rankUpdate(d_A) (SYRK)
DeviceAssignment device(GpuContext& ctx)  // Bind assignment to explicit stream
```

### `GpuContext`

Unified GPU execution context owning a CUDA stream and library handles.

```cpp
GpuContext()                      // Creates dedicated stream + handles
static GpuContext& threadLocal()  // Per-thread default (lazy-created)

cudaStream_t stream()
cublasHandle_t cublasHandle()
cusolverDnHandle_t cusolverHandle()
```

Non-copyable, non-movable (owns library handles).

### `GpuLLT<Scalar, UpLo>` -- Dense Cholesky (cuSOLVER)

Caches the Cholesky factor on device for repeated solves.

```cpp
GpuLLT()                       // Default construct, then call compute()
GpuLLT(const EigenBase<D>& A)  // Convenience: upload + factorize

GpuLLT& compute(const EigenBase<D>& A)    // Upload + factorize
GpuLLT& compute(const DeviceMatrix& d_A)  // D2D copy + factorize
GpuLLT& compute(DeviceMatrix&& d_A)       // Adopt + factorize (no copy)

PlainMatrix solve(const MatrixBase<D>& B)    // -> host Matrix (syncs)
DeviceMatrix solve(const DeviceMatrix& d_B)  // -> DeviceMatrix (async, stays on device)

ComputationInfo info()  // Lazy sync on first call: Success or NumericalIssue
Index rows() / cols()
cudaStream_t stream()
```

### `GpuLU<Scalar>` -- Dense LU (cuSOLVER)

Same pattern as `GpuLLT`. Adds `TransposeMode` parameter on `solve()`.

```cpp
PlainMatrix solve(const MatrixBase<D>& B, TransposeMode m = NoTranspose)    // -> host Matrix
DeviceMatrix solve(const DeviceMatrix& d_B, TransposeMode m = NoTranspose)  // -> DeviceMatrix
```

`TransposeMode`: `NoTranspose`, `Transpose`, `ConjugateTranspose`.

### `GpuQR<Scalar>` -- Dense QR (cuSOLVER)

QR factorization via `cusolverDnXgeqrf`. Solve uses ORMQR (apply Q^H) + TRSM
(back-substitute on R) -- Q is never formed explicitly.

```cpp
GpuQR()                       // Default construct
GpuQR(const EigenBase<D>& A)  // Convenience: upload + factorize

GpuQR& compute(const EigenBase<D>& A)    // Upload + factorize
GpuQR& compute(const DeviceMatrix& d_A)  // D2D copy + factorize

PlainMatrix solve(const MatrixBase<D>& B)    // -> host Matrix (syncs)
DeviceMatrix solve(const DeviceMatrix& d_B)  // -> DeviceMatrix (async)

ComputationInfo info()  // Lazy sync
Index rows() / cols()
cudaStream_t stream()
```

### `GpuSVD<Scalar>` -- Dense SVD (cuSOLVER)

SVD via `cusolverDnXgesvd`. Supports `ComputeThinU | ComputeThinV`,
`ComputeFullU | ComputeFullV`, or `0` (values only). Wide matrices (m < n)
handled by internal transpose.

```cpp
GpuSVD()  // Default construct, then call compute()
GpuSVD(const EigenBase<D>& A, unsigned options = ComputeThinU | ComputeThinV)  // Convenience

GpuSVD& compute(const EigenBase<D>& A, unsigned options = ComputeThinU | ComputeThinV)
GpuSVD& compute(const DeviceMatrix& d_A, unsigned options = ComputeThinU | ComputeThinV)

RealVector singularValues()  // -> host vector (syncs, downloads)
PlainMatrix matrixU()        // -> host Matrix (syncs, downloads)
PlainMatrix matrixVT()       // -> host Matrix (syncs, downloads V^T)

PlainMatrix solve(const MatrixBase<D>& B)                // -> host Matrix (pseudoinverse)
PlainMatrix solve(const MatrixBase<D>& B, Index k)       // Truncated (top k triplets)
PlainMatrix solve(const MatrixBase<D>& B, RealScalar l)  // Tikhonov regularized

Index rank(RealScalar threshold = -1)
ComputationInfo info()  // Lazy sync
Index rows() / cols()
cudaStream_t stream()
```

**Note:** `singularValues()`, `matrixU()`, and `matrixVT()` download to host
on each call. Device-side accessors returning `DeviceMatrix` are planned but
not yet implemented.
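
A short sketch of the three `solve()` overloads (`A` and `B` are host
matrices; the rank and regularization values are illustrative):

```cpp
GpuSVD<double> svd(A);  // thin U/V by default

MatrixXd X_ls  = svd.solve(B);             // minimum-norm least squares (pseudoinverse)
MatrixXd X_k   = svd.solve(B, Index(20));  // truncated: keep the 20 largest triplets
MatrixXd X_tik = svd.solve(B, 1e-3);       // Tikhonov-regularized, parameter l = 1e-3
```

Note the explicit `Index(20)`: a bare integer literal could be ambiguous
between the truncated (`Index`) and Tikhonov (`RealScalar`) overloads.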

### `GpuSelfAdjointEigenSolver<Scalar>` -- Eigendecomposition (cuSOLVER)

Symmetric/Hermitian eigenvalue decomposition via `cusolverDnXsyevd`.
`ComputeMode` enum: `EigenvaluesOnly`, `ComputeEigenvectors`.

```cpp
GpuSelfAdjointEigenSolver()  // Default construct, then call compute()
GpuSelfAdjointEigenSolver(const EigenBase<D>& A, ComputeMode mode = ComputeEigenvectors)  // Convenience

GpuSelfAdjointEigenSolver& compute(const EigenBase<D>& A, ComputeMode mode = ComputeEigenvectors)
GpuSelfAdjointEigenSolver& compute(const DeviceMatrix& d_A, ComputeMode mode = ComputeEigenvectors)

RealVector eigenvalues()    // -> host vector (syncs, downloads, ascending order)
PlainMatrix eigenvectors()  // -> host Matrix (syncs, downloads, columns)

ComputationInfo info()  // Lazy sync
Index rows() / cols()
cudaStream_t stream()
```

**Note:** `eigenvalues()` and `eigenvectors()` download to host on each call.
Device-side accessors returning `DeviceMatrix` are planned but not yet
implemented.

### `HostTransfer<Scalar>`

Future for async device-to-host transfer. Returned by
`DeviceMatrix::toHostAsync()`.

```cpp
PlainMatrix& get()  // Block until complete, return host Matrix ref. Idempotent.
bool ready()        // Non-blocking poll
```
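
For example, a sketch of overlapping host work with a pending download
(`do_other_host_work()` is a hypothetical stand-in for anything that does not
touch the result):

```cpp
auto transfer = d_C.toHostAsync();  // D2H copy enqueued; returns immediately

while (!transfer.ready()) {
  do_other_host_work();  // hypothetical helper; runs while the copy is in flight
}
MatrixXd C = transfer.get();  // copy already complete, so get() does not block
```

Polling is optional: calling `get()` directly simply blocks until the copy
finishes.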

### `GpuSparseLLT<Scalar, UpLo>` -- Sparse Cholesky (cuDSS)

Requires cuDSS (CUDA 12.0+, `#define EIGEN_CUDSS`). Three-phase workflow
with symbolic reuse. Accepts `SparseMatrix<Scalar, ColMajor, int>` (CSC).

```cpp
GpuSparseLLT()                              // Default construct
GpuSparseLLT(const SparseMatrixBase<D>& A)  // Analyze + factorize

GpuSparseLLT& analyzePattern(const SparseMatrixBase<D>& A)  // Symbolic analysis (reusable)
GpuSparseLLT& factorize(const SparseMatrixBase<D>& A)       // Numeric factorization
GpuSparseLLT& compute(const SparseMatrixBase<D>& A)         // analyzePattern + factorize
void setOrdering(GpuSparseOrdering ord)                     // AMD (default), METIS, or RCM

DenseMatrix solve(const MatrixBase<D>& B)  // -> host Matrix (syncs)

ComputationInfo info()  // Lazy sync
Index rows() / cols()
cudaStream_t stream()
```
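
`setOrdering()` is the one call with an ordering constraint: it must precede
`analyzePattern()`/`compute()`. A minimal sketch:

```cpp
GpuSparseLLT<double> llt;
llt.setOrdering(GpuSparseOrdering::METIS);  // must be set before analysis
llt.compute(A);                             // analyze + factorize with METIS ordering
VectorXd x = llt.solve(b);
```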

### `GpuSparseLDLT<Scalar, UpLo>` -- Sparse LDL^T (cuDSS)

Symmetric indefinite. Same API as `GpuSparseLLT`.

### `GpuSparseLU<Scalar>` -- Sparse LU (cuDSS)

General non-symmetric. Same API as `GpuSparseLLT` (without `UpLo`).

### `GpuFFT<Scalar>` -- FFT (cuFFT)

Plans cached by (size, type) and reused. Inverse transforms scaled so
`inv(fwd(x)) == x`. Supported scalars: `float`, `double`.

```cpp
// 1D transforms (host vectors in and out)
ComplexVector fwd(const MatrixBase<D>& x)            // C2C forward (complex input)
ComplexVector fwd(const MatrixBase<D>& x)            // R2C forward (real input, returns n/2+1)
ComplexVector inv(const MatrixBase<D>& X)            // C2C inverse, scaled by 1/n
RealVector invReal(const MatrixBase<D>& X, Index n)  // C2R inverse, scaled by 1/n

// 2D transforms (host matrices in and out)
ComplexMatrix fwd2d(const MatrixBase<D>& A)  // 2D C2C forward
ComplexMatrix inv2d(const MatrixBase<D>& A)  // 2D C2C inverse, scaled by 1/(rows*cols)

cudaStream_t stream()
```

All FFT methods accept host data and return host data. Upload/download is
handled internally. The C2C and R2C overloads of `fwd()` are distinguished by
the input scalar type (complex vs real).

### `GpuSparseContext<Scalar>` -- SpMV/SpMM (cuSPARSE)

Accepts `SparseMatrix<Scalar, ColMajor>`. All methods accept host data and
return host data.

```cpp
GpuSparseContext()  // Creates own stream + cuSPARSE handle

DenseVector multiply(A, x)               // y = A * x
void multiply(A, x, y, alpha=1, beta=0,  // y = alpha*op(A)*x + beta*y
              op=CUSPARSE_OPERATION_NON_TRANSPOSE)
DenseVector multiplyT(A, x)              // y = A^T * x
DenseMatrix multiplyMat(A, X)            // Y = A * X (SpMM)

cudaStream_t stream()
```

### Aliasing

Unlike Eigen's `Matrix`, where omitting `.noalias()` triggers a copy to a
temporary, DeviceMatrix dispatches directly to NVIDIA library calls which have
no built-in aliasing protection. All operations are implicitly noalias.
The caller must ensure operands don't alias the destination for GEMM and TRSM
(debug asserts catch violations).
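
A minimal illustration of the rule (`d_A` and `d_B` are existing
`DeviceMatrix<double>` objects):

```cpp
// Wrong: the destination aliases an operand. On the CPU, `A = A * B` is safe
// because Eigen evaluates through a hidden temporary; here cuBLAS would read
// and write the same device buffer (a debug assert fires in debug builds).
// d_A = d_A * d_B;

// Right: evaluate into a separate destination.
DeviceMatrix<double> d_C = d_A * d_B;
```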
|
||||
|
||||
## File layout

| File | Depends on | Contents |
|------|-----------|----------|
| `GpuSupport.h` | `<cuda_runtime.h>` | Error macro, `DeviceBuffer`, `cuda_data_type<>` |
| `DeviceMatrix.h` | `GpuSupport.h` | `DeviceMatrix<>`, `HostTransfer<>` |
| `DeviceExpr.h` | `DeviceMatrix.h` | GEMM expression wrappers |
| `DeviceBlasExpr.h` | `DeviceMatrix.h` | TRSM, SYMM, SYRK expression wrappers |
| `DeviceSolverExpr.h` | `DeviceMatrix.h` | Solver expression wrappers (LLT, LU) |
| `DeviceDispatch.h` | all above | All dispatch functions + `DeviceAssignment` |
| `GpuContext.h` | `CuBlasSupport.h`, `CuSolverSupport.h` | `GpuContext` |
| `CuBlasSupport.h` | `GpuSupport.h`, `<cublas_v2.h>` | cuBLAS error macro, op/compute type maps |
| `CuSolverSupport.h` | `GpuSupport.h`, `<cusolverDn.h>` | cuSOLVER params, fill-mode mapping |
| `GpuLLT.h` | `CuSolverSupport.h` | Cached dense Cholesky factorization |
| `GpuLU.h` | `CuSolverSupport.h` | Cached dense LU factorization |
| `GpuQR.h` | `CuSolverSupport.h`, `CuBlasSupport.h` | Dense QR decomposition |
| `GpuSVD.h` | `CuSolverSupport.h`, `CuBlasSupport.h` | Dense SVD decomposition |
| `GpuEigenSolver.h` | `CuSolverSupport.h` | Self-adjoint eigenvalue decomposition |
| `CuFftSupport.h` | `GpuSupport.h`, `<cufft.h>` | cuFFT error macro, type-dispatch wrappers |
| `GpuFFT.h` | `CuFftSupport.h`, `CuBlasSupport.h` | 1D/2D FFT with plan caching |
| `CuSparseSupport.h` | `GpuSupport.h`, `<cusparse.h>` | cuSPARSE error macro |
| `GpuSparseContext.h` | `CuSparseSupport.h` | SpMV/SpMM via cuSPARSE |
| `CuDssSupport.h` | `GpuSupport.h`, `<cudss.h>` | cuDSS error macro, type traits (optional) |
| `GpuSparseSolverBase.h` | `CuDssSupport.h` | CRTP base for sparse solvers (optional) |
| `GpuSparseLLT.h` | `GpuSparseSolverBase.h` | Sparse Cholesky via cuDSS (optional) |
| `GpuSparseLDLT.h` | `GpuSparseSolverBase.h` | Sparse LDL^T via cuDSS (optional) |
| `GpuSparseLU.h` | `GpuSparseSolverBase.h` | Sparse LU via cuDSS (optional) |

## Building and testing

```bash
cmake -G Ninja -B build -S . \
    -DEIGEN_TEST_CUDA=ON \
    -DEIGEN_CUDA_COMPUTE_ARCH="70" \
    -DEIGEN_TEST_CUBLAS=ON \
    -DEIGEN_TEST_CUSOLVER=ON

cmake --build build --target gpu_cublas gpu_cusolver_llt gpu_cusolver_lu \
    gpu_cusolver_qr gpu_cusolver_svd gpu_cusolver_eigen \
    gpu_device_matrix gpu_cufft gpu_cusparse_spmv
ctest --test-dir build -R "gpu_" --output-on-failure

# Sparse solvers (cuDSS -- separate install required)
cmake -G Ninja -B build -S . \
    -DEIGEN_TEST_CUDA=ON \
    -DEIGEN_CUDA_COMPUTE_ARCH="70" \
    -DEIGEN_TEST_CUDSS=ON

cmake --build build --target gpu_cudss_llt gpu_cudss_ldlt gpu_cudss_lu
ctest --test-dir build -R gpu_cudss --output-on-failure
```

## Future work

- **Device-side accessors for decomposition results.** `GpuSVD`,
  `GpuSelfAdjointEigenSolver`, and `GpuQR` currently download decomposition
  results to host on access (e.g., `svd.matrixU()` returns a host `MatrixXd`).
  Device-side accessors returning `DeviceMatrix` views of the internal buffers
  would allow chaining GPU operations (e.g., `svd.deviceU() * d_A`) without
  round-tripping through host memory.
- **Device-resident sparse matrix-vector products.** `GpuSparseContext`
  currently operates on host vectors and matrices, uploading and downloading
  on each call. The key missing piece is a `DeviceSparseView` that holds a
  sparse matrix on device and supports operator syntax (`d_y = d_A * d_x`)
  with `DeviceMatrix` operands -- keeping the entire SpMV/SpMM pipeline on
  device, as sketched below. This is essential for iterative solvers and any
  workflow that chains sparse and dense operations without returning to the
  host.
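To make the second item concrete, device-resident SpMV might look like the
following sketch. Nothing here exists yet: `DeviceSparseView`, its `fromHost`,
and the operator overloads are the imagined future interface (with `A`, `x0`,
and `maxIters` assumed in scope), not implemented API.

```cpp
// Hypothetical future interface -- illustrative only, nothing below exists yet.
DeviceSparseView<double> d_A = DeviceSparseView<double>::fromHost(A);  // upload once
DeviceMatrix<double> d_x = DeviceMatrix<double>::fromHost(x0);
for (int it = 0; it < maxIters; ++it) {
  d_x = d_A * d_x;  // SpMV stays on device; no host round-trip per iteration
}
Eigen::VectorXd x = d_x.toHost();  // single download at the end
```
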
17
benchmarks/BLAS/CMakeLists.txt
Normal file
@@ -0,0 +1,17 @@
# Benchmarks for Eigen's built-in BLAS implementation.
# Compiles the Eigen BLAS sources directly into the benchmark executable
# so there is no external BLAS dependency.

set(EIGEN_BLAS_SRCS
  ${EIGEN_SOURCE_DIR}/blas/single.cpp
  ${EIGEN_SOURCE_DIR}/blas/double.cpp
  ${EIGEN_SOURCE_DIR}/blas/complex_single.cpp
  ${EIGEN_SOURCE_DIR}/blas/complex_double.cpp
  ${EIGEN_SOURCE_DIR}/blas/xerbla.cpp
  ${EIGEN_SOURCE_DIR}/blas/lsame.cpp
  ${EIGEN_SOURCE_DIR}/blas/complexdots.cpp
)

eigen_add_benchmark(bench_blas bench_blas.cpp)
target_sources(bench_blas PRIVATE ${EIGEN_BLAS_SRCS})
target_include_directories(bench_blas PRIVATE ${EIGEN_SOURCE_DIR}/blas)
488
benchmarks/BLAS/bench_blas.cpp
Normal file
@@ -0,0 +1,488 @@
// Benchmark for Eigen's BLAS implementation.
//
// Calls the Eigen BLAS C interface directly (the extern "C" functions defined
// in blas/{single,double,complex_single,complex_double}.cpp).
//
// Covers Level 1, 2, and 3 routines — with emphasis on the routines that
// were recently rewritten from f2c to C++: rotm, rotmg, spmv, sbmv, hbmv,
// hpmv, tbmv, lsame, and complex dot products.

#include <benchmark/benchmark.h>

#include <Eigen/Core>
#include <complex>
#include <vector>

#include "blas/blas.h"

using Eigen::Index;

// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------

// Flop-rate counter (units = individual flops per call).
static benchmark::Counter GflopsCounter(double flops) {
  return benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
}

// Fill a vector with random values in [-1, 1].
template <typename T>
static void fillRand(T* data, Index n) {
  Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, 1>>(data, n).setRandom();
}

// Fill a symmetric band matrix A in BLAS band storage (column-major).
// Upper triangle: A[i,j] stored at a[(k+i-j) + j*lda], 0 <= j-i <= k.
template <typename T>
static void fillSymBandUpper(T* a, int n, int k, int lda) {
  std::fill(a, a + lda * n, T(0));
  for (int j = 0; j < n; ++j)
    for (int i = std::max(0, j - k); i <= j; ++i) a[(k + i - j) + j * lda] = T(std::rand()) / T(RAND_MAX) - T(0.5);
}

// Fill a packed symmetric matrix (upper triangle, column-major).
template <typename T>
static void fillSymPacked(T* ap, int n) {
  int sz = n * (n + 1) / 2;
  for (int i = 0; i < sz; ++i) ap[i] = T(std::rand()) / T(RAND_MAX) - T(0.5);
}

// Fill a triangular band matrix in BLAS band storage (upper, column-major).
template <typename T>
static void fillTriBandUpper(T* a, int n, int k, int lda) {
  std::fill(a, a + lda * n, T(0));
  for (int j = 0; j < n; ++j)
    for (int i = std::max(0, j - k); i <= j; ++i) {
      T val = T(std::rand()) / T(RAND_MAX) - T(0.5);
      if (i == j) val += T(n);  // diagonal dominance
      a[(k + i - j) + j * lda] = val;
    }
}

// ---------------------------------------------------------------------------
// Type-dispatched BLAS wrappers
// ---------------------------------------------------------------------------

inline float blas_dot(int* n, float* x, int* incx, float* y, int* incy) { return sdot_(n, x, incx, y, incy); }
inline double blas_dot(int* n, double* x, int* incx, double* y, int* incy) { return ddot_(n, x, incx, y, incy); }

inline void blas_axpy(int* n, float* a, float* x, int* incx, float* y, int* incy) { saxpy_(n, a, x, incx, y, incy); }
inline void blas_axpy(int* n, double* a, double* x, int* incx, double* y, int* incy) { daxpy_(n, a, x, incx, y, incy); }

inline float blas_nrm2(int* n, float* x, int* incx) { return snrm2_(n, x, incx); }
inline double blas_nrm2(int* n, double* x, int* incx) { return dnrm2_(n, x, incx); }

inline void blas_rotm(int* n, float* x, int* incx, float* y, int* incy, float* p) { srotm_(n, x, incx, y, incy, p); }
inline void blas_rotm(int* n, double* x, int* incx, double* y, int* incy, double* p) { drotm_(n, x, incx, y, incy, p); }

inline void blas_rotmg(float* d1, float* d2, float* x1, float* y1, float* p) { srotmg_(d1, d2, x1, y1, p); }
inline void blas_rotmg(double* d1, double* d2, double* x1, double* y1, double* p) { drotmg_(d1, d2, x1, y1, p); }

inline void blas_dotcw(int* n, float* cx, int* incx, float* cy, int* incy, float* res) {
  cdotcw_(n, cx, incx, cy, incy, res);
}
inline void blas_dotcw(int* n, double* cx, int* incx, double* cy, int* incy, double* res) {
  zdotcw_(n, cx, incx, cy, incy, res);
}

inline void blas_gemv(char* t, int* m, int* n, float* a, float* A, int* lda, float* x, int* incx, float* b, float* y,
                      int* incy) {
  sgemv_(t, m, n, a, A, lda, x, incx, b, y, incy);
}
inline void blas_gemv(char* t, int* m, int* n, double* a, double* A, int* lda, double* x, int* incx, double* b,
                      double* y, int* incy) {
  dgemv_(t, m, n, a, A, lda, x, incx, b, y, incy);
}

inline void blas_spmv(char* uplo, int* n, float* alpha, float* ap, float* x, int* incx, float* beta, float* y,
                      int* incy) {
  sspmv_(uplo, n, alpha, ap, x, incx, beta, y, incy);
}
inline void blas_spmv(char* uplo, int* n, double* alpha, double* ap, double* x, int* incx, double* beta, double* y,
                      int* incy) {
  dspmv_(uplo, n, alpha, ap, x, incx, beta, y, incy);
}

inline void blas_sbmv(char* uplo, int* n, int* k, float* alpha, float* a, int* lda, float* x, int* incx, float* beta,
                      float* y, int* incy) {
  ssbmv_(uplo, n, k, alpha, a, lda, x, incx, beta, y, incy);
}
inline void blas_sbmv(char* uplo, int* n, int* k, double* alpha, double* a, int* lda, double* x, int* incx,
                      double* beta, double* y, int* incy) {
  dsbmv_(uplo, n, k, alpha, a, lda, x, incx, beta, y, incy);
}

inline void blas_tbmv(char* uplo, char* trans, char* diag, int* n, int* k, float* a, int* lda, float* x, int* incx) {
  stbmv_(uplo, trans, diag, n, k, a, lda, x, incx);
}
inline void blas_tbmv(char* uplo, char* trans, char* diag, int* n, int* k, double* a, int* lda, double* x, int* incx) {
  dtbmv_(uplo, trans, diag, n, k, a, lda, x, incx);
}

inline void blas_hbmv(char* uplo, int* n, int* k, float* alpha, float* a, int* lda, float* x, int* incx, float* beta,
                      float* y, int* incy) {
  chbmv_(uplo, n, k, alpha, a, lda, x, incx, beta, y, incy);
}
inline void blas_hbmv(char* uplo, int* n, int* k, double* alpha, double* a, int* lda, double* x, int* incx,
                      double* beta, double* y, int* incy) {
  zhbmv_(uplo, n, k, alpha, a, lda, x, incx, beta, y, incy);
}

inline void blas_hpmv(char* uplo, int* n, float* alpha, float* ap, float* x, int* incx, float* beta, float* y,
                      int* incy) {
  chpmv_(uplo, n, alpha, ap, x, incx, beta, y, incy);
}
inline void blas_hpmv(char* uplo, int* n, double* alpha, double* ap, double* x, int* incx, double* beta, double* y,
                      int* incy) {
  zhpmv_(uplo, n, alpha, ap, x, incx, beta, y, incy);
}

inline void blas_gemm(char* ta, char* tb, int* m, int* n, int* k, float* alpha, float* a, int* lda, float* b, int* ldb,
                      float* beta, float* c, int* ldc) {
  sgemm_(ta, tb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
}
inline void blas_gemm(char* ta, char* tb, int* m, int* n, int* k, double* alpha, double* a, int* lda, double* b,
                      int* ldb, double* beta, double* c, int* ldc) {
  dgemm_(ta, tb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
}

// =========================================================================
// Level 1 — Real
// =========================================================================

// ----- SDOT / DDOT -----
template <typename T>
static void BM_dot(benchmark::State& state) {
  int n = static_cast<int>(state.range(0));
  int one = 1;
  std::vector<T> x(n), y(n);
  fillRand(x.data(), n);
  fillRand(y.data(), n);
  for (auto _ : state) {
    T r = blas_dot(&n, x.data(), &one, y.data(), &one);
    benchmark::DoNotOptimize(r);
  }
  state.counters["GFLOPS"] = GflopsCounter(2.0 * n);
}

// ----- SAXPY / DAXPY -----
template <typename T>
static void BM_axpy(benchmark::State& state) {
  int n = static_cast<int>(state.range(0));
  int one = 1;
  T alpha = T(2.5);
  std::vector<T> x(n), y(n);
  fillRand(x.data(), n);
  fillRand(y.data(), n);
  for (auto _ : state) {
    blas_axpy(&n, &alpha, x.data(), &one, y.data(), &one);
    benchmark::DoNotOptimize(y.data());
  }
  state.counters["GFLOPS"] = GflopsCounter(2.0 * n);
}

// ----- SNRM2 / DNRM2 -----
template <typename T>
static void BM_nrm2(benchmark::State& state) {
  int n = static_cast<int>(state.range(0));
  int one = 1;
  std::vector<T> x(n);
  fillRand(x.data(), n);
  for (auto _ : state) {
    T r = blas_nrm2(&n, x.data(), &one);
    benchmark::DoNotOptimize(r);
  }
  // Nominal flops; Eigen's stableNorm() does more work internally.
  state.counters["GFLOPS"] = GflopsCounter(2.0 * n - 1);
}

// ----- SROTM / DROTM -----
template <typename T>
static void BM_rotm(benchmark::State& state) {
  int n = static_cast<int>(state.range(0));
  int one = 1;
  std::vector<T> x(n), y(n);
  T param[5] = {T(-1), T(0.6), T(-0.8), T(0.8), T(0.6)};  // full rotation
  fillRand(x.data(), n);
  fillRand(y.data(), n);
  for (auto _ : state) {
    blas_rotm(&n, x.data(), &one, y.data(), &one, param);
    benchmark::DoNotOptimize(x.data());
    benchmark::DoNotOptimize(y.data());
  }
  // 4 muls + 2 adds per element pair.
  state.counters["GFLOPS"] = GflopsCounter(6.0 * n);
}

// ----- SROTMG / DROTMG -----
template <typename T>
static void BM_rotmg(benchmark::State& state) {
  T d1 = T(2), d2 = T(3), x1 = T(1), y1 = T(0.5);
  T param[5];
  for (auto _ : state) {
    T td1 = d1, td2 = d2, tx1 = x1;
    blas_rotmg(&td1, &td2, &tx1, &y1, param);
    benchmark::DoNotOptimize(param);
  }
}

// =========================================================================
// Level 1 — Complex
// =========================================================================

// Complex conjugate dot product via the worker functions (cdotcw_ / zdotcw_)
// which use an output pointer, avoiding the ABI ambiguity of the struct-returning
// cdotc_ / zdotc_ wrappers.
template <typename T>
static void BM_dotc(benchmark::State& state) {
  int n = static_cast<int>(state.range(0));
  int one = 1;
  std::vector<T> x(2 * n), y(2 * n);  // interleaved real/imag
  fillRand(x.data(), 2 * n);
  fillRand(y.data(), 2 * n);
  T res[2];
  for (auto _ : state) {
    blas_dotcw(&n, x.data(), &one, y.data(), &one, res);
    benchmark::DoNotOptimize(res);
  }
  // Conjugate dot: 6 mul + 2 add per element = 8n flops.
  state.counters["GFLOPS"] = GflopsCounter(8.0 * n);
}

// =========================================================================
// Level 2 — General Matrix-Vector (SGEMV / DGEMV)
// =========================================================================

template <typename T>
static void BM_gemv(benchmark::State& state) {
  int m = static_cast<int>(state.range(0));
  int n = static_cast<int>(state.range(1));
  int one = 1;
  T alpha = T(1), beta = T(0);
  char trans = 'N';
  std::vector<T> a(m * n), x(n), y(m);
  fillRand(a.data(), m * n);
  fillRand(x.data(), n);
  fillRand(y.data(), m);
  for (auto _ : state) {
    blas_gemv(&trans, &m, &n, &alpha, a.data(), &m, x.data(), &one, &beta, y.data(), &one);
    benchmark::DoNotOptimize(y.data());
  }
  state.counters["GFLOPS"] = GflopsCounter(2.0 * m * n);
}

// =========================================================================
// Level 2 — Symmetric Packed (SSPMV / DSPMV)
// =========================================================================

template <typename T>
static void BM_spmv(benchmark::State& state) {
  int n = static_cast<int>(state.range(0));
  int one = 1;
  T alpha = T(1), beta = T(0);
  char uplo = 'U';
  int sz = n * (n + 1) / 2;
  std::vector<T> ap(sz), x(n), y(n);
  fillSymPacked(ap.data(), n);
  fillRand(x.data(), n);
  fillRand(y.data(), n);
  for (auto _ : state) {
    blas_spmv(&uplo, &n, &alpha, ap.data(), x.data(), &one, &beta, y.data(), &one);
    benchmark::DoNotOptimize(y.data());
  }
  // Symmetric: each off-diag element contributes to two y entries.
  state.counters["GFLOPS"] = GflopsCounter(2.0 * n * n);
}

// =========================================================================
// Level 2 — Symmetric Band (SSBMV / DSBMV)
// =========================================================================

template <typename T>
static void BM_sbmv(benchmark::State& state) {
  int n = static_cast<int>(state.range(0));
  int k = static_cast<int>(state.range(1));
  int lda = k + 1;
  int one = 1;
  T alpha = T(1), beta = T(0);
  char uplo = 'U';
  std::vector<T> a(lda * n), x(n), y(n);
  fillSymBandUpper(a.data(), n, k, lda);
  fillRand(x.data(), n);
  fillRand(y.data(), n);
  for (auto _ : state) {
    blas_sbmv(&uplo, &n, &k, &alpha, a.data(), &lda, x.data(), &one, &beta, y.data(), &one);
    benchmark::DoNotOptimize(y.data());
  }
  state.counters["GFLOPS"] = GflopsCounter(2.0 * n * (2 * k + 1));
}

// =========================================================================
// Level 2 — Triangular Band (STBMV / DTBMV)
// =========================================================================

template <typename T>
static void BM_tbmv(benchmark::State& state) {
  int n = static_cast<int>(state.range(0));
  int k = static_cast<int>(state.range(1));
  int lda = k + 1;
  int one = 1;
  char uplo = 'U', trans = 'N', diag = 'N';
  std::vector<T> a(lda * n), x(n), x_orig(n);
  fillTriBandUpper(a.data(), n, k, lda);
  fillRand(x_orig.data(), n);
  for (auto _ : state) {
    state.PauseTiming();
    std::copy(x_orig.begin(), x_orig.end(), x.begin());
    state.ResumeTiming();
    blas_tbmv(&uplo, &trans, &diag, &n, &k, a.data(), &lda, x.data(), &one);
    benchmark::DoNotOptimize(x.data());
  }
  // Roughly (k+1) muls + k adds per row.
  state.counters["GFLOPS"] = GflopsCounter(1.0 * n * (2 * k + 1));
}

// =========================================================================
// Level 2 — Hermitian Band (CHBMV / ZHBMV)
// =========================================================================

template <typename T>
static void BM_hbmv(benchmark::State& state) {
  int n = static_cast<int>(state.range(0));
  int k = static_cast<int>(state.range(1));
  int lda = k + 1;
  int one = 1;
  char uplo = 'U';
  // Complex: each element is 2 reals.
  std::vector<T> a(2 * lda * n), x(2 * n), y(2 * n);
  T alpha[2] = {T(1), T(0)};
  T beta[2] = {T(0), T(0)};
  fillRand(a.data(), 2 * lda * n);
  // Make diagonal real (imag part = 0).
  for (int j = 0; j < n; ++j) a[2 * (k + j * lda) + 1] = T(0);
  fillRand(x.data(), 2 * n);
  fillRand(y.data(), 2 * n);
  for (auto _ : state) {
    blas_hbmv(&uplo, &n, &k, alpha, a.data(), &lda, x.data(), &one, beta, y.data(), &one);
    benchmark::DoNotOptimize(y.data());
  }
  // Complex hermitian band: approximately 8*n*(2k+1) flops.
  state.counters["GFLOPS"] = GflopsCounter(8.0 * n * (2 * k + 1));
}

// =========================================================================
// Level 2 — Hermitian Packed (CHPMV / ZHPMV)
// =========================================================================

template <typename T>
static void BM_hpmv(benchmark::State& state) {
  int n = static_cast<int>(state.range(0));
  int one = 1;
  char uplo = 'U';
  int sz = n * (n + 1) / 2;
  std::vector<T> ap(2 * sz), x(2 * n), y(2 * n);
  T alpha[2] = {T(1), T(0)};
  T beta[2] = {T(0), T(0)};
  fillRand(ap.data(), 2 * sz);
  // Make diagonal real.
  int kk = 0;
  for (int j = 0; j < n; ++j) {
    ap[2 * (kk + j) + 1] = T(0);
    kk += j + 1;
  }
  fillRand(x.data(), 2 * n);
  fillRand(y.data(), 2 * n);
  for (auto _ : state) {
    blas_hpmv(&uplo, &n, alpha, ap.data(), x.data(), &one, beta, y.data(), &one);
    benchmark::DoNotOptimize(y.data());
  }
  state.counters["GFLOPS"] = GflopsCounter(8.0 * n * n);
}

// =========================================================================
// Level 3 — General Matrix Multiply (SGEMM / DGEMM)
// =========================================================================

template <typename T>
static void BM_gemm(benchmark::State& state) {
  int n = static_cast<int>(state.range(0));
  T alpha = T(1), beta = T(0);
  char trans = 'N';
  std::vector<T> a(n * n), b(n * n), c(n * n);
  fillRand(a.data(), n * n);
  fillRand(b.data(), n * n);
  fillRand(c.data(), n * n);
  for (auto _ : state) {
    blas_gemm(&trans, &trans, &n, &n, &n, &alpha, a.data(), &n, b.data(), &n, &beta, c.data(), &n);
    benchmark::DoNotOptimize(c.data());
  }
  state.counters["GFLOPS"] = GflopsCounter(2.0 * n * n * n);
}

// =========================================================================
// Register benchmarks
// =========================================================================

// clang-format off

// --- Vector sizes for Level 1 ---
#define L1_SIZES ->Arg(64)->Arg(256)->Arg(1024)->Arg(4096)->Arg(16384)->Arg(65536)

BENCHMARK(BM_dot<float>)    L1_SIZES ->Name("sdot");
BENCHMARK(BM_dot<double>)   L1_SIZES ->Name("ddot");
BENCHMARK(BM_axpy<float>)   L1_SIZES ->Name("saxpy");
BENCHMARK(BM_axpy<double>)  L1_SIZES ->Name("daxpy");
BENCHMARK(BM_nrm2<float>)   L1_SIZES ->Name("snrm2");
BENCHMARK(BM_nrm2<double>)  L1_SIZES ->Name("dnrm2");
BENCHMARK(BM_rotm<float>)   L1_SIZES ->Name("srotm");
BENCHMARK(BM_rotm<double>)  L1_SIZES ->Name("drotm");
BENCHMARK(BM_rotmg<float>)           ->Name("srotmg");
BENCHMARK(BM_rotmg<double>)          ->Name("drotmg");
BENCHMARK(BM_dotc<float>)   L1_SIZES ->Name("cdotc");
BENCHMARK(BM_dotc<double>)  L1_SIZES ->Name("zdotc");

#undef L1_SIZES

// --- Matrix sizes for Level 2 ---
// GEMV: {m, n}
#define GEMV_SIZES \
  ->Args({64, 64})->Args({256, 256})->Args({1024, 1024})->Args({4096, 4096}) \
  ->Args({4096, 64})->Args({64, 4096})

BENCHMARK(BM_gemv<float>)  GEMV_SIZES ->Name("sgemv");
BENCHMARK(BM_gemv<double>) GEMV_SIZES ->Name("dgemv");
#undef GEMV_SIZES

// Symmetric packed: {n}
#define SPM_SIZES ->Arg(64)->Arg(256)->Arg(1024)->Arg(4096)

BENCHMARK(BM_spmv<float>)  SPM_SIZES ->Name("sspmv");
BENCHMARK(BM_spmv<double>) SPM_SIZES ->Name("dspmv");
BENCHMARK(BM_hpmv<float>)  SPM_SIZES ->Name("chpmv");
BENCHMARK(BM_hpmv<double>) SPM_SIZES ->Name("zhpmv");

#undef SPM_SIZES

// Band: {n, k}
#define BAND_SIZES \
  ->Args({256, 4})->Args({256, 32})->Args({1024, 4})->Args({1024, 32}) \
  ->Args({4096, 4})->Args({4096, 32})->Args({4096, 128})

BENCHMARK(BM_sbmv<float>)  BAND_SIZES ->Name("ssbmv");
BENCHMARK(BM_sbmv<double>) BAND_SIZES ->Name("dsbmv");
BENCHMARK(BM_tbmv<float>)  BAND_SIZES ->Name("stbmv");
BENCHMARK(BM_tbmv<double>) BAND_SIZES ->Name("dtbmv");
BENCHMARK(BM_hbmv<float>)  BAND_SIZES ->Name("chbmv");
BENCHMARK(BM_hbmv<double>) BAND_SIZES ->Name("zhbmv");

#undef BAND_SIZES

// --- Square sizes for Level 3 ---
#define GEMM_SIZES ->Arg(32)->Arg(64)->Arg(128)->Arg(256)->Arg(512)->Arg(1024)

BENCHMARK(BM_gemm<float>)  GEMM_SIZES ->Name("sgemm");
BENCHMARK(BM_gemm<double>) GEMM_SIZES ->Name("dgemm");

#undef GEMM_SIZES

// clang-format on
@@ -20,7 +20,11 @@ function(eigen_add_benchmark name source)
  if(BENCH_LIBRARIES)
    target_link_libraries(${name} PRIVATE ${BENCH_LIBRARIES})
  endif()
  target_compile_options(${name} PRIVATE -O3 -DNDEBUG)
  target_compile_options(${name} PRIVATE
    $<$<CXX_COMPILER_ID:MSVC>:/O2>
    $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-O3>
  )
  target_compile_definitions(${name} PRIVATE NDEBUG)
  if(BENCH_DEFINITIONS)
    target_compile_definitions(${name} PRIVATE ${BENCH_DEFINITIONS})
  endif()
@@ -38,3 +42,11 @@ add_subdirectory(FFT)
add_subdirectory(Householder)
add_subdirectory(Solvers)
add_subdirectory(Tuning)
add_subdirectory(BLAS)

# GPU benchmarks have their own CMake project (needs CUDAToolkit).
# They can also be built standalone: cmake -B build -S benchmarks/GPU
find_package(CUDAToolkit QUIET)
if(CUDAToolkit_FOUND)
  add_subdirectory(GPU)
endif()
57
benchmarks/GPU/CMakeLists.txt
Normal file
@@ -0,0 +1,57 @@
# GPU benchmarks require CUDA runtime + cuSOLVER.
# Build separately from the main benchmark tree since they need the CUDA toolchain.
#
# Usage:
#   cmake -G Ninja -B build-bench-gpu -S benchmarks/GPU \
#       -DCMAKE_CUDA_ARCHITECTURES=89
#   cmake --build build-bench-gpu
#
# Profiling:
#   nsys profile --trace=cuda ./build-bench-gpu/bench_gpu_solvers
#   ncu --set full -o profile ./build-bench-gpu/bench_gpu_solvers --benchmark_filter=BM_GpuLLT_Compute/4096

cmake_minimum_required(VERSION 3.18)
project(EigenGpuBenchmarks CXX)

find_package(benchmark REQUIRED)
find_package(CUDAToolkit REQUIRED)

set(EIGEN_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../..")

function(eigen_add_gpu_benchmark name source)
  cmake_parse_arguments(BENCH "" "" "LIBRARIES;DEFINITIONS" ${ARGN})
  if(NOT IS_ABSOLUTE "${source}")
    set(source "${CMAKE_CURRENT_SOURCE_DIR}/${source}")
  endif()
  add_executable(${name} ${source})
  target_include_directories(${name} PRIVATE
    ${EIGEN_SOURCE_DIR}
    ${CUDAToolkit_INCLUDE_DIRS})
  target_link_libraries(${name} PRIVATE
    benchmark::benchmark benchmark::benchmark_main
    CUDA::cudart CUDA::cusolver CUDA::cublas)
  if(BENCH_LIBRARIES)
    target_link_libraries(${name} PRIVATE ${BENCH_LIBRARIES})
  endif()
  target_compile_options(${name} PRIVATE -O3 -DNDEBUG)
  target_compile_definitions(${name} PRIVATE EIGEN_USE_GPU)
  if(BENCH_DEFINITIONS)
    target_compile_definitions(${name} PRIVATE ${BENCH_DEFINITIONS})
  endif()
endfunction()

# Solver benchmarks: LLT/LU compute + solve, host vs device paths, CPU baselines.
eigen_add_gpu_benchmark(bench_gpu_solvers bench_gpu_solvers.cpp)
eigen_add_gpu_benchmark(bench_gpu_solvers_float bench_gpu_solvers.cpp DEFINITIONS SCALAR=float)

# Chaining benchmarks: async pipeline efficiency, host-roundtrip vs device chain.
eigen_add_gpu_benchmark(bench_gpu_chaining bench_gpu_chaining.cpp)
eigen_add_gpu_benchmark(bench_gpu_chaining_float bench_gpu_chaining.cpp DEFINITIONS SCALAR=float)

# Batching benchmarks: multi-stream concurrency for many small systems.
eigen_add_gpu_benchmark(bench_gpu_batching bench_gpu_batching.cpp)
eigen_add_gpu_benchmark(bench_gpu_batching_float bench_gpu_batching.cpp DEFINITIONS SCALAR=float)

# FFT benchmarks: 1D/2D C2C, R2C, C2R throughput and plan reuse.
eigen_add_gpu_benchmark(bench_gpu_fft bench_gpu_fft.cpp LIBRARIES CUDA::cufft)
eigen_add_gpu_benchmark(bench_gpu_fft_double bench_gpu_fft.cpp LIBRARIES CUDA::cufft DEFINITIONS SCALAR=double)
268
benchmarks/GPU/bench_gpu_batching.cpp
Normal file
@@ -0,0 +1,268 @@
// GPU batching benchmarks: multi-stream concurrency for many small solves.
//
// Each GpuLLT/GpuLU owns its own CUDA stream. This benchmark measures how
// well multiple solver instances overlap on the GPU, which is critical for
// workloads like robotics (many small systems) and SLAM (batched poses).
//
// Compares:
//   1. Sequential: one solver handles all systems one by one
//   2. Batched: N solvers on N streams, all launched before any sync
//   3. CPU baseline: Eigen LLT on host
//
// For Nsight Systems: batched mode should show overlapping kernels on
// different streams in the timeline view.
//
//   nsys profile --trace=cuda ./bench_gpu_batching

#include <benchmark/benchmark.h>

#include <Eigen/Cholesky>
#include <Eigen/GPU>

#include <memory>
#include <vector>

using namespace Eigen;

#ifndef SCALAR
#define SCALAR double
#endif

using Scalar = SCALAR;
using Mat = Matrix<Scalar, Dynamic, Dynamic>;

static Mat make_spd(Index n) {
  Mat M = Mat::Random(n, n);
  return M.adjoint() * M + Mat::Identity(n, n) * static_cast<Scalar>(n);
}

static void cuda_warmup() {
  static bool done = false;
  if (!done) {
    void* p;
    cudaMalloc(&p, 1);
    cudaFree(p);
    done = true;
  }
}

// --------------------------------------------------------------------------
// Sequential: one solver, N systems solved one after another
// --------------------------------------------------------------------------

static void BM_Batch_Sequential(benchmark::State& state) {
  cuda_warmup();
  const Index n = state.range(0);
  const int batch_size = static_cast<int>(state.range(1));

  // Pre-generate all SPD matrices and RHS vectors.
  std::vector<Mat> As(batch_size);
  std::vector<Mat> Bs(batch_size);
  for (int i = 0; i < batch_size; ++i) {
    As[i] = make_spd(n);
    Bs[i] = Mat::Random(n, 1);
  }

  GpuLLT<Scalar> llt;

  for (auto _ : state) {
    std::vector<Mat> results(batch_size);
    for (int i = 0; i < batch_size; ++i) {
      llt.compute(As[i]);
      results[i] = llt.solve(Bs[i]);
    }
    benchmark::DoNotOptimize(results.back().data());
  }

  state.counters["n"] = n;
  state.counters["batch"] = batch_size;
  state.counters["total_solves"] = batch_size;
}

// --------------------------------------------------------------------------
// Sequential with DeviceMatrix (avoid re-upload of A each iteration)
// --------------------------------------------------------------------------

static void BM_Batch_Sequential_Device(benchmark::State& state) {
  cuda_warmup();
  const Index n = state.range(0);
  const int batch_size = static_cast<int>(state.range(1));

  std::vector<Mat> As(batch_size);
  std::vector<Mat> Bs(batch_size);
  std::vector<DeviceMatrix<Scalar>> d_As(batch_size);
  std::vector<DeviceMatrix<Scalar>> d_Bs(batch_size);
  for (int i = 0; i < batch_size; ++i) {
    As[i] = make_spd(n);
    Bs[i] = Mat::Random(n, 1);
    d_As[i] = DeviceMatrix<Scalar>::fromHost(As[i]);
    d_Bs[i] = DeviceMatrix<Scalar>::fromHost(Bs[i]);
  }

  GpuLLT<Scalar> llt;

  for (auto _ : state) {
    std::vector<Mat> results(batch_size);
    for (int i = 0; i < batch_size; ++i) {
      llt.compute(d_As[i]);
      DeviceMatrix<Scalar> d_X = llt.solve(d_Bs[i]);
      results[i] = d_X.toHost();
    }
    benchmark::DoNotOptimize(results.back().data());
  }

  state.counters["n"] = n;
  state.counters["batch"] = batch_size;
  state.counters["total_solves"] = batch_size;
}

// --------------------------------------------------------------------------
// Batched: N solvers on N streams, overlapping execution
// --------------------------------------------------------------------------

static void BM_Batch_MultiStream(benchmark::State& state) {
  cuda_warmup();
  const Index n = state.range(0);
  const int batch_size = static_cast<int>(state.range(1));

  std::vector<Mat> As(batch_size);
  std::vector<Mat> Bs(batch_size);
  std::vector<DeviceMatrix<Scalar>> d_As(batch_size);
  std::vector<DeviceMatrix<Scalar>> d_Bs(batch_size);
  for (int i = 0; i < batch_size; ++i) {
    As[i] = make_spd(n);
    Bs[i] = Mat::Random(n, 1);
    d_As[i] = DeviceMatrix<Scalar>::fromHost(As[i]);
    d_Bs[i] = DeviceMatrix<Scalar>::fromHost(Bs[i]);
  }

  // N solvers = N independent CUDA streams.
  std::vector<std::unique_ptr<GpuLLT<Scalar>>> solvers(batch_size);
  for (int i = 0; i < batch_size; ++i) {
    solvers[i] = std::make_unique<GpuLLT<Scalar>>();
  }

  for (auto _ : state) {
    // Phase 1: launch all factorizations (async, different streams).
    for (int i = 0; i < batch_size; ++i) {
      solvers[i]->compute(d_As[i]);
    }

    // Phase 2: launch all solves (async, different streams).
    std::vector<DeviceMatrix<Scalar>> d_Xs(batch_size);
    for (int i = 0; i < batch_size; ++i) {
      d_Xs[i] = solvers[i]->solve(d_Bs[i]);
    }

    // Phase 3: download all results.
    std::vector<Mat> results(batch_size);
    for (int i = 0; i < batch_size; ++i) {
      results[i] = d_Xs[i].toHost();
    }
    benchmark::DoNotOptimize(results.back().data());
  }

  state.counters["n"] = n;
  state.counters["batch"] = batch_size;
  state.counters["streams"] = batch_size;
  state.counters["total_solves"] = batch_size;
}

// --------------------------------------------------------------------------
// Batched with async download (overlap D2H with computation)
// --------------------------------------------------------------------------

static void BM_Batch_MultiStream_AsyncDownload(benchmark::State& state) {
  cuda_warmup();
  const Index n = state.range(0);
  const int batch_size = static_cast<int>(state.range(1));

  std::vector<Mat> As(batch_size);
  std::vector<Mat> Bs(batch_size);
  std::vector<DeviceMatrix<Scalar>> d_As(batch_size);
  std::vector<DeviceMatrix<Scalar>> d_Bs(batch_size);
  for (int i = 0; i < batch_size; ++i) {
    As[i] = make_spd(n);
    Bs[i] = Mat::Random(n, 1);
    d_As[i] = DeviceMatrix<Scalar>::fromHost(As[i]);
    d_Bs[i] = DeviceMatrix<Scalar>::fromHost(Bs[i]);
  }

  std::vector<std::unique_ptr<GpuLLT<Scalar>>> solvers(batch_size);
  for (int i = 0; i < batch_size; ++i) {
    solvers[i] = std::make_unique<GpuLLT<Scalar>>();
  }

  for (auto _ : state) {
    // Launch all compute + solve.
    std::vector<DeviceMatrix<Scalar>> d_Xs(batch_size);
    for (int i = 0; i < batch_size; ++i) {
      solvers[i]->compute(d_As[i]);
      d_Xs[i] = solvers[i]->solve(d_Bs[i]);
    }

    // Enqueue all async downloads.
    std::vector<HostTransfer<Scalar>> transfers;
    transfers.reserve(batch_size);
    for (int i = 0; i < batch_size; ++i) {
      transfers.push_back(d_Xs[i].toHostAsync());
    }

    // Collect all results.
    for (int i = 0; i < batch_size; ++i) {
      benchmark::DoNotOptimize(transfers[i].get().data());
    }
  }

  state.counters["n"] = n;
  state.counters["batch"] = batch_size;
  state.counters["streams"] = batch_size;
  state.counters["total_solves"] = batch_size;
}

// --------------------------------------------------------------------------
// CPU baseline: Eigen LLT on host, sequential
// --------------------------------------------------------------------------

static void BM_Batch_CPU(benchmark::State& state) {
  const Index n = state.range(0);
  const int batch_size = static_cast<int>(state.range(1));

  std::vector<Mat> As(batch_size);
  std::vector<Mat> Bs(batch_size);
  for (int i = 0; i < batch_size; ++i) {
    As[i] = make_spd(n);
    Bs[i] = Mat::Random(n, 1);
  }

  for (auto _ : state) {
    std::vector<Mat> results(batch_size);
    for (int i = 0; i < batch_size; ++i) {
      LLT<Mat> llt(As[i]);
      results[i] = llt.solve(Bs[i]);
    }
    benchmark::DoNotOptimize(results.back().data());
  }

  state.counters["n"] = n;
  state.counters["batch"] = batch_size;
  state.counters["total_solves"] = batch_size;
}

// --------------------------------------------------------------------------
// Registration
// --------------------------------------------------------------------------

// clang-format off
// Args: {matrix_size, batch_size}
// Small matrices with large batches are the interesting case for multi-stream.
BENCHMARK(BM_Batch_Sequential)->ArgsProduct({{16, 32, 64, 128, 256, 512}, {1, 4, 16, 64}})->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_Batch_Sequential_Device)->ArgsProduct({{16, 32, 64, 128, 256, 512}, {1, 4, 16, 64}})->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_Batch_MultiStream)->ArgsProduct({{16, 32, 64, 128, 256, 512}, {1, 4, 16, 64}})->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_Batch_MultiStream_AsyncDownload)->ArgsProduct({{16, 32, 64, 128, 256, 512}, {1, 4, 16, 64}})->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_Batch_CPU)->ArgsProduct({{16, 32, 64, 128, 256, 512}, {1, 4, 16, 64}})->Unit(benchmark::kMicrosecond);

// Also run larger sizes with moderate batching.
BENCHMARK(BM_Batch_MultiStream)->ArgsProduct({{512, 1024, 2048}, {1, 4, 8}})->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_Batch_MultiStream_AsyncDownload)->ArgsProduct({{512, 1024, 2048}, {1, 4, 8}})->Unit(benchmark::kMicrosecond);
// clang-format on
216
benchmarks/GPU/bench_gpu_chaining.cpp
Normal file
@@ -0,0 +1,216 @@
// GPU chaining benchmarks: measure async pipeline efficiency.
//
// Compares:
//   1. Host round-trip per solve (baseline)
//   2. DeviceMatrix chaining (no host round-trip between solves)
//   3. Varying chain lengths (1, 2, 4, 8 consecutive solves)
//
// For Nsight Systems: look for gaps between kernel launches in the timeline.
// Host round-trip creates visible idle gaps; chaining should show back-to-back kernels.
//
//   nsys profile --trace=cuda,nvtx ./bench_gpu_chaining

#include <benchmark/benchmark.h>

#include <Eigen/Cholesky>
#include <Eigen/GPU>

using namespace Eigen;

#ifndef SCALAR
#define SCALAR double
#endif

using Scalar = SCALAR;
using Mat = Matrix<Scalar, Dynamic, Dynamic>;

static Mat make_spd(Index n) {
  Mat M = Mat::Random(n, n);
  return M.adjoint() * M + Mat::Identity(n, n) * static_cast<Scalar>(n);
}

static void cuda_warmup() {
  static bool done = false;
  if (!done) {
    void* p;
    cudaMalloc(&p, 1);
    cudaFree(p);
    done = true;
  }
}

// --------------------------------------------------------------------------
// Baseline: host round-trip between every solve
// --------------------------------------------------------------------------

static void BM_Chain_HostRoundtrip(benchmark::State& state) {
  cuda_warmup();
  const Index n = state.range(0);
  const int chain_len = static_cast<int>(state.range(1));

  Mat A = make_spd(n);
  Mat B = Mat::Random(n, 1);
  GpuLLT<Scalar> llt(A);

  for (auto _ : state) {
    Mat X = B;
    for (int i = 0; i < chain_len; ++i) {
      X = llt.solve(X);  // host → device → host each time
    }
    benchmark::DoNotOptimize(X.data());
  }

  state.counters["n"] = n;
  state.counters["chain"] = chain_len;
  state.counters["solves/iter"] = chain_len;
}

// --------------------------------------------------------------------------
// DeviceMatrix chaining: no host round-trip between solves
// --------------------------------------------------------------------------

static void BM_Chain_Device(benchmark::State& state) {
  cuda_warmup();
  const Index n = state.range(0);
  const int chain_len = static_cast<int>(state.range(1));

  Mat A = make_spd(n);
  Mat B = Mat::Random(n, 1);
  GpuLLT<Scalar> llt(A);
  auto d_B = DeviceMatrix<Scalar>::fromHost(B);

  for (auto _ : state) {
    DeviceMatrix<Scalar> d_X = llt.solve(d_B);
    for (int i = 1; i < chain_len; ++i) {
      d_X = llt.solve(d_X);  // device → device, fully async
    }
    Mat X = d_X.toHost();  // single sync at end
    benchmark::DoNotOptimize(X.data());
  }

  state.counters["n"] = n;
  state.counters["chain"] = chain_len;
  state.counters["solves/iter"] = chain_len;
}

// --------------------------------------------------------------------------
// DeviceMatrix chaining with async download (overlap D2H with next iteration)
// --------------------------------------------------------------------------

static void BM_Chain_DeviceAsync(benchmark::State& state) {
  cuda_warmup();
  const Index n = state.range(0);
  const int chain_len = static_cast<int>(state.range(1));

  Mat A = make_spd(n);
  Mat B = Mat::Random(n, 1);
  GpuLLT<Scalar> llt(A);
  auto d_B = DeviceMatrix<Scalar>::fromHost(B);

  for (auto _ : state) {
    DeviceMatrix<Scalar> d_X = llt.solve(d_B);
    for (int i = 1; i < chain_len; ++i) {
      d_X = llt.solve(d_X);
    }
    auto transfer = d_X.toHostAsync();
    Mat X = transfer.get();
    benchmark::DoNotOptimize(X.data());
  }

  state.counters["n"] = n;
  state.counters["chain"] = chain_len;
  state.counters["solves/iter"] = chain_len;
}

// --------------------------------------------------------------------------
// Pure GPU chain (no download — measures kernel-only throughput)
// --------------------------------------------------------------------------

static void BM_Chain_DeviceNoDownload(benchmark::State& state) {
  cuda_warmup();
  const Index n = state.range(0);
  const int chain_len = static_cast<int>(state.range(1));

  Mat A = make_spd(n);
  Mat B = Mat::Random(n, 1);
  GpuLLT<Scalar> llt(A);
  auto d_B = DeviceMatrix<Scalar>::fromHost(B);

  for (auto _ : state) {
    DeviceMatrix<Scalar> d_X = llt.solve(d_B);
    for (int i = 1; i < chain_len; ++i) {
      d_X = llt.solve(d_X);
    }
    cudaStreamSynchronize(llt.stream());
    benchmark::DoNotOptimize(d_X.data());
  }

  state.counters["n"] = n;
  state.counters["chain"] = chain_len;
  state.counters["solves/iter"] = chain_len;
}

// --------------------------------------------------------------------------
// Compute + solve chain (full pipeline: factorize, then chain solves)
// --------------------------------------------------------------------------

static void BM_FullPipeline_Host(benchmark::State& state) {
  cuda_warmup();
  const Index n = state.range(0);
  const int chain_len = static_cast<int>(state.range(1));

  Mat A = make_spd(n);
  Mat B = Mat::Random(n, 1);

  for (auto _ : state) {
    GpuLLT<Scalar> llt(A);
    Mat X = B;
    for (int i = 0; i < chain_len; ++i) {
      X = llt.solve(X);
    }
    benchmark::DoNotOptimize(X.data());
  }

  state.counters["n"] = n;
  state.counters["chain"] = chain_len;
}

static void BM_FullPipeline_Device(benchmark::State& state) {
  cuda_warmup();
  const Index n = state.range(0);
  const int chain_len = static_cast<int>(state.range(1));

  Mat A = make_spd(n);
  Mat B = Mat::Random(n, 1);

  for (auto _ : state) {
    auto d_A = DeviceMatrix<Scalar>::fromHost(A);
    auto d_B = DeviceMatrix<Scalar>::fromHost(B);
    GpuLLT<Scalar> llt;
    llt.compute(d_A);
    DeviceMatrix<Scalar> d_X = llt.solve(d_B);
    for (int i = 1; i < chain_len; ++i) {
      d_X = llt.solve(d_X);
    }
    Mat X = d_X.toHost();
    benchmark::DoNotOptimize(X.data());
  }

  state.counters["n"] = n;
  state.counters["chain"] = chain_len;
}

// --------------------------------------------------------------------------
// Registration
// --------------------------------------------------------------------------

// clang-format off
// Args: {matrix_size, chain_length}
BENCHMARK(BM_Chain_HostRoundtrip)->ArgsProduct({{64, 256, 1024, 4096}, {1, 2, 4, 8}})->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_Chain_Device)->ArgsProduct({{64, 256, 1024, 4096}, {1, 2, 4, 8}})->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_Chain_DeviceAsync)->ArgsProduct({{64, 256, 1024, 4096}, {1, 2, 4, 8}})->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_Chain_DeviceNoDownload)->ArgsProduct({{64, 256, 1024, 4096}, {1, 2, 4, 8}})->Unit(benchmark::kMicrosecond);

BENCHMARK(BM_FullPipeline_Host)->ArgsProduct({{256, 1024, 4096}, {1, 4}})->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_FullPipeline_Device)->ArgsProduct({{256, 1024, 4096}, {1, 4}})->Unit(benchmark::kMicrosecond);
// clang-format on
185
benchmarks/GPU/bench_gpu_fft.cpp
Normal file
@@ -0,0 +1,185 @@
// GPU FFT benchmarks: GpuFFT 1D and 2D throughput.
//
// Measures forward and inverse FFT performance across a range of sizes,
// including plan-amortized (reuse) and cold-start (new plan) scenarios.
//
// Usage:
//   cmake --build build-bench-gpu --target bench_gpu_fft
//   ./build-bench-gpu/bench_gpu_fft
//
// Profiling:
//   nsys profile --trace=cuda ./build-bench-gpu/bench_gpu_fft

#include <benchmark/benchmark.h>

#include <Eigen/GPU>

#include <complex>

using namespace Eigen;

#ifndef SCALAR
#define SCALAR float
#endif

using Scalar = SCALAR;
using Complex = std::complex<Scalar>;
using CVec = Matrix<Complex, Dynamic, 1>;
using RVec = Matrix<Scalar, Dynamic, 1>;
using CMat = Matrix<Complex, Dynamic, Dynamic>;

// CUDA warm-up: ensure the GPU is initialized before timing.
static void cuda_warmup() {
  static bool done = false;
  if (!done) {
    void* p;
    cudaMalloc(&p, 1);
    cudaFree(p);
    done = true;
  }
}

// --------------------------------------------------------------------------
// 1D C2C Forward
// --------------------------------------------------------------------------

static void BM_GpuFFT_1D_C2C_Fwd(benchmark::State& state) {
  cuda_warmup();
  const Index n = state.range(0);
  CVec x = CVec::Random(n);
  GpuFFT<Scalar> fft;

  // Warm up plan.
  CVec tmp = fft.fwd(x);

  for (auto _ : state) {
    benchmark::DoNotOptimize(fft.fwd(x));
  }
  state.SetItemsProcessed(state.iterations() * n);
  state.SetBytesProcessed(state.iterations() * n * sizeof(Complex) * 2);  // read + write
}

BENCHMARK(BM_GpuFFT_1D_C2C_Fwd)->RangeMultiplier(4)->Range(1 << 10, 1 << 22);

// --------------------------------------------------------------------------
// 1D C2C Inverse
// --------------------------------------------------------------------------

static void BM_GpuFFT_1D_C2C_Inv(benchmark::State& state) {
  cuda_warmup();
  const Index n = state.range(0);
  CVec x = CVec::Random(n);
  GpuFFT<Scalar> fft;
  CVec X = fft.fwd(x);

  for (auto _ : state) {
    benchmark::DoNotOptimize(fft.inv(X));
  }
  state.SetItemsProcessed(state.iterations() * n);
  state.SetBytesProcessed(state.iterations() * n * sizeof(Complex) * 2);
}

BENCHMARK(BM_GpuFFT_1D_C2C_Inv)->RangeMultiplier(4)->Range(1 << 10, 1 << 22);

// --------------------------------------------------------------------------
// 1D R2C Forward
// --------------------------------------------------------------------------

static void BM_GpuFFT_1D_R2C_Fwd(benchmark::State& state) {
  cuda_warmup();
  const Index n = state.range(0);
  RVec r = RVec::Random(n);
  GpuFFT<Scalar> fft;

  // Warm up plan.
  CVec tmp = fft.fwd(r);

  for (auto _ : state) {
    benchmark::DoNotOptimize(fft.fwd(r));
  }
  state.SetItemsProcessed(state.iterations() * n);
  state.SetBytesProcessed(state.iterations() * (n * sizeof(Scalar) + (n / 2 + 1) * sizeof(Complex)));
}

BENCHMARK(BM_GpuFFT_1D_R2C_Fwd)->RangeMultiplier(4)->Range(1 << 10, 1 << 22);

// --------------------------------------------------------------------------
// 1D C2R Inverse
// --------------------------------------------------------------------------

static void BM_GpuFFT_1D_C2R_Inv(benchmark::State& state) {
  cuda_warmup();
  const Index n = state.range(0);
  RVec r = RVec::Random(n);
  GpuFFT<Scalar> fft;
  CVec R = fft.fwd(r);

  for (auto _ : state) {
    benchmark::DoNotOptimize(fft.invReal(R, n));
  }
  state.SetItemsProcessed(state.iterations() * n);
  state.SetBytesProcessed(state.iterations() * ((n / 2 + 1) * sizeof(Complex) + n * sizeof(Scalar)));
}

BENCHMARK(BM_GpuFFT_1D_C2R_Inv)->RangeMultiplier(4)->Range(1 << 10, 1 << 22);

// --------------------------------------------------------------------------
// 2D C2C Forward
// --------------------------------------------------------------------------

static void BM_GpuFFT_2D_C2C_Fwd(benchmark::State& state) {
  cuda_warmup();
  const Index n = state.range(0);  // square n x n
  CMat A = CMat::Random(n, n);
  GpuFFT<Scalar> fft;

  // Warm up plan.
  CMat tmp = fft.fwd2d(A);

  for (auto _ : state) {
    benchmark::DoNotOptimize(fft.fwd2d(A));
  }
  state.SetItemsProcessed(state.iterations() * n * n);
  state.SetBytesProcessed(state.iterations() * n * n * sizeof(Complex) * 2);
}

BENCHMARK(BM_GpuFFT_2D_C2C_Fwd)->RangeMultiplier(2)->Range(64, 4096);

// --------------------------------------------------------------------------
// 2D C2C Roundtrip (fwd + inv)
// --------------------------------------------------------------------------

static void BM_GpuFFT_2D_C2C_Roundtrip(benchmark::State& state) {
  cuda_warmup();
  const Index n = state.range(0);
  CMat A = CMat::Random(n, n);
  GpuFFT<Scalar> fft;

  // Warm up plans.
  CMat tmp = fft.inv2d(fft.fwd2d(A));

  for (auto _ : state) {
    CMat B = fft.fwd2d(A);
    benchmark::DoNotOptimize(fft.inv2d(B));
  }
  state.SetItemsProcessed(state.iterations() * n * n * 2);  // fwd + inv
  state.SetBytesProcessed(state.iterations() * n * n * sizeof(Complex) * 4);
}

BENCHMARK(BM_GpuFFT_2D_C2C_Roundtrip)->RangeMultiplier(2)->Range(64, 4096);

// --------------------------------------------------------------------------
// 1D Cold start (includes plan creation)
// --------------------------------------------------------------------------

static void BM_GpuFFT_1D_ColdStart(benchmark::State& state) {
  cuda_warmup();
  const Index n = state.range(0);
  CVec x = CVec::Random(n);

  for (auto _ : state) {
    GpuFFT<Scalar> fft;  // new object = new plans
    benchmark::DoNotOptimize(fft.fwd(x));
  }
  state.SetItemsProcessed(state.iterations() * n);
}

BENCHMARK(BM_GpuFFT_1D_ColdStart)->RangeMultiplier(4)->Range(1 << 10, 1 << 20);
296
benchmarks/GPU/bench_gpu_solvers.cpp
Normal file
@@ -0,0 +1,296 @@
// GPU solver benchmarks: GpuLLT and GpuLU compute + solve throughput.
//
// Measures factorization and solve performance for the host-matrix and
// DeviceMatrix code paths across a range of matrix sizes.
//
// For Nsight Systems profiling:
//   nsys profile --trace=cuda,nvtx ./bench_gpu_solvers
//
// For Nsight Compute kernel analysis:
//   ncu --set full -o profile ./bench_gpu_solvers --benchmark_filter=BM_GpuLLT_Compute/4096

#include <benchmark/benchmark.h>

#include <Eigen/Cholesky>
#include <Eigen/GPU>
#include <Eigen/LU>

using namespace Eigen;

#ifndef SCALAR
#define SCALAR double
#endif

using Scalar = SCALAR;
using Mat = Matrix<Scalar, Dynamic, Dynamic>;

// --------------------------------------------------------------------------
// Helpers
// --------------------------------------------------------------------------

static Mat make_spd(Index n) {
  Mat M = Mat::Random(n, n);
  return M.adjoint() * M + Mat::Identity(n, n) * static_cast<Scalar>(n);
}

// CUDA warm-up: ensure the GPU is initialized before timing.
static void cuda_warmup() {
  static bool done = false;
  if (!done) {
    void* p;
    cudaMalloc(&p, 1);
    cudaFree(p);
    done = true;
  }
}

// --------------------------------------------------------------------------
|
||||
// GpuLLT benchmarks
|
||||
// --------------------------------------------------------------------------
|
||||
|
||||
// Factorize from host matrix (includes H2D upload).
|
||||
static void BM_GpuLLT_Compute_Host(benchmark::State& state) {
|
||||
cuda_warmup();
|
||||
const Index n = state.range(0);
|
||||
Mat A = make_spd(n);
|
||||
GpuLLT<Scalar> llt;
|
||||
|
||||
for (auto _ : state) {
|
||||
llt.compute(A);
|
||||
if (llt.info() != Success) state.SkipWithError("factorization failed");
|
||||
}
|
||||
|
||||
double flops = static_cast<double>(n) * static_cast<double>(n) * static_cast<double>(n) / 3.0;
|
||||
state.counters["GFLOPS"] =
|
||||
benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
|
||||
state.counters["n"] = n;
|
||||
}
|
||||
|
||||
// Factorize from DeviceMatrix (D2D copy path).
static void BM_GpuLLT_Compute_Device(benchmark::State& state) {
  cuda_warmup();
  const Index n = state.range(0);
  Mat A = make_spd(n);
  auto d_A = DeviceMatrix<Scalar>::fromHost(A);
  GpuLLT<Scalar> llt;

  for (auto _ : state) {
    llt.compute(d_A);
    if (llt.info() != Success) state.SkipWithError("factorization failed");
  }

  double flops = static_cast<double>(n) * static_cast<double>(n) * static_cast<double>(n) / 3.0;
  state.counters["GFLOPS"] =
      benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
  state.counters["n"] = n;
}

// Factorize from DeviceMatrix (move path, no copy).
static void BM_GpuLLT_Compute_DeviceMove(benchmark::State& state) {
  cuda_warmup();
  const Index n = state.range(0);
  Mat A = make_spd(n);
  GpuLLT<Scalar> llt;

  for (auto _ : state) {
    auto d_A = DeviceMatrix<Scalar>::fromHost(A);
    llt.compute(std::move(d_A));
    if (llt.info() != Success) state.SkipWithError("factorization failed");
  }

  double flops = static_cast<double>(n) * static_cast<double>(n) * static_cast<double>(n) / 3.0;
  state.counters["GFLOPS"] =
      benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
  state.counters["n"] = n;
}

// Solve from host matrix (H2D + potrs + D2H).
static void BM_GpuLLT_Solve_Host(benchmark::State& state) {
  cuda_warmup();
  const Index n = state.range(0);
  const Index nrhs = state.range(1);
  Mat A = make_spd(n);
  Mat B = Mat::Random(n, nrhs);
  GpuLLT<Scalar> llt(A);

  for (auto _ : state) {
    Mat X = llt.solve(B);
    benchmark::DoNotOptimize(X.data());
  }

  state.counters["n"] = n;
  state.counters["nrhs"] = nrhs;
}

// Solve from DeviceMatrix (D2D + potrs, async, toHost at end).
static void BM_GpuLLT_Solve_Device(benchmark::State& state) {
  cuda_warmup();
  const Index n = state.range(0);
  const Index nrhs = state.range(1);
  Mat A = make_spd(n);
  Mat B = Mat::Random(n, nrhs);
  GpuLLT<Scalar> llt(A);
  auto d_B = DeviceMatrix<Scalar>::fromHost(B);

  for (auto _ : state) {
    DeviceMatrix<Scalar> d_X = llt.solve(d_B);
    Mat X = d_X.toHost();
    benchmark::DoNotOptimize(X.data());
  }

  state.counters["n"] = n;
  state.counters["nrhs"] = nrhs;
}

// Solve staying entirely on device (no toHost — measures pure GPU time).
static void BM_GpuLLT_Solve_DeviceOnly(benchmark::State& state) {
  cuda_warmup();
  const Index n = state.range(0);
  const Index nrhs = state.range(1);
  Mat A = make_spd(n);
  Mat B = Mat::Random(n, nrhs);
  GpuLLT<Scalar> llt(A);
  auto d_B = DeviceMatrix<Scalar>::fromHost(B);

  for (auto _ : state) {
    DeviceMatrix<Scalar> d_X = llt.solve(d_B);
    // Force completion without D2H transfer.
    cudaStreamSynchronize(llt.stream());
    benchmark::DoNotOptimize(d_X.data());
  }

  state.counters["n"] = n;
  state.counters["nrhs"] = nrhs;
}
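// Editorial note: the device-path solve is taken to be asynchronous on
// llt.stream() (the "async" in the comments above), so without the
// cudaStreamSynchronize call the loop would time little more than kernel
// launches; the Host and Device variants synchronize implicitly via toHost().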
// --------------------------------------------------------------------------
// GpuLU benchmarks
// --------------------------------------------------------------------------

static void BM_GpuLU_Compute_Host(benchmark::State& state) {
  cuda_warmup();
  const Index n = state.range(0);
  Mat A = Mat::Random(n, n);
  GpuLU<Scalar> lu;

  for (auto _ : state) {
    lu.compute(A);
    if (lu.info() != Success) state.SkipWithError("factorization failed");
  }

  double flops = 2.0 / 3.0 * static_cast<double>(n) * static_cast<double>(n) * static_cast<double>(n);
  state.counters["GFLOPS"] =
      benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
  state.counters["n"] = n;
}

static void BM_GpuLU_Compute_Device(benchmark::State& state) {
  cuda_warmup();
  const Index n = state.range(0);
  Mat A = Mat::Random(n, n);
  auto d_A = DeviceMatrix<Scalar>::fromHost(A);
  GpuLU<Scalar> lu;

  for (auto _ : state) {
    lu.compute(d_A);
    if (lu.info() != Success) state.SkipWithError("factorization failed");
  }

  double flops = 2.0 / 3.0 * static_cast<double>(n) * static_cast<double>(n) * static_cast<double>(n);
  state.counters["GFLOPS"] =
      benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
  state.counters["n"] = n;
}

static void BM_GpuLU_Solve_Host(benchmark::State& state) {
  cuda_warmup();
  const Index n = state.range(0);
  const Index nrhs = state.range(1);
  Mat A = Mat::Random(n, n);
  Mat B = Mat::Random(n, nrhs);
  GpuLU<Scalar> lu(A);

  for (auto _ : state) {
    Mat X = lu.solve(B);
    benchmark::DoNotOptimize(X.data());
  }

  state.counters["n"] = n;
  state.counters["nrhs"] = nrhs;
}

static void BM_GpuLU_Solve_Device(benchmark::State& state) {
  cuda_warmup();
  const Index n = state.range(0);
  const Index nrhs = state.range(1);
  Mat A = Mat::Random(n, n);
  Mat B = Mat::Random(n, nrhs);
  GpuLU<Scalar> lu(A);
  auto d_B = DeviceMatrix<Scalar>::fromHost(B);

  for (auto _ : state) {
    DeviceMatrix<Scalar> d_X = lu.solve(d_B);
    Mat X = d_X.toHost();
    benchmark::DoNotOptimize(X.data());
  }

  state.counters["n"] = n;
  state.counters["nrhs"] = nrhs;
}

// --------------------------------------------------------------------------
// CPU baselines for comparison
// --------------------------------------------------------------------------

static void BM_CpuLLT_Compute(benchmark::State& state) {
  const Index n = state.range(0);
  Mat A = make_spd(n);
  LLT<Mat> llt;

  for (auto _ : state) {
    llt.compute(A);
    benchmark::DoNotOptimize(llt.matrixLLT().data());
  }

  double flops = static_cast<double>(n) * static_cast<double>(n) * static_cast<double>(n) / 3.0;
  state.counters["GFLOPS"] =
      benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
  state.counters["n"] = n;
}

static void BM_CpuLU_Compute(benchmark::State& state) {
  const Index n = state.range(0);
  Mat A = Mat::Random(n, n);
  PartialPivLU<Mat> lu;

  for (auto _ : state) {
    lu.compute(A);
    benchmark::DoNotOptimize(lu.matrixLU().data());
  }

  double flops = 2.0 / 3.0 * static_cast<double>(n) * static_cast<double>(n) * static_cast<double>(n);
  state.counters["GFLOPS"] =
      benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
  state.counters["n"] = n;
}

// --------------------------------------------------------------------------
// Registration
// --------------------------------------------------------------------------

// clang-format off
BENCHMARK(BM_GpuLLT_Compute_Host)->ArgsProduct({{64, 128, 256, 512, 1024, 2048, 4096}})->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_GpuLLT_Compute_Device)->ArgsProduct({{64, 128, 256, 512, 1024, 2048, 4096}})->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_GpuLLT_Compute_DeviceMove)->ArgsProduct({{64, 128, 256, 512, 1024, 2048, 4096}})->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_GpuLLT_Solve_Host)->ArgsProduct({{64, 256, 1024, 4096}, {1, 16}})->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_GpuLLT_Solve_Device)->ArgsProduct({{64, 256, 1024, 4096}, {1, 16}})->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_GpuLLT_Solve_DeviceOnly)->ArgsProduct({{64, 256, 1024, 4096}, {1, 16}})->Unit(benchmark::kMicrosecond);

BENCHMARK(BM_GpuLU_Compute_Host)->ArgsProduct({{64, 128, 256, 512, 1024, 2048, 4096}})->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_GpuLU_Compute_Device)->ArgsProduct({{64, 128, 256, 512, 1024, 2048, 4096}})->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_GpuLU_Solve_Host)->ArgsProduct({{64, 256, 1024, 4096}, {1, 16}})->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_GpuLU_Solve_Device)->ArgsProduct({{64, 256, 1024, 4096}, {1, 16}})->Unit(benchmark::kMicrosecond);

BENCHMARK(BM_CpuLLT_Compute)->ArgsProduct({{64, 128, 256, 512, 1024, 2048, 4096}})->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_CpuLU_Compute)->ArgsProduct({{64, 128, 256, 512, 1024, 2048, 4096}})->Unit(benchmark::kMicrosecond);
// clang-format on
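These registrations assume the usual Google Benchmark driver. If the target does not get its entry point elsewhere (e.g. via BENCHMARK_MAIN()), a minimal sketch of one, using only standard benchmark-library calls and nothing Eigen-specific, would be:

#include <benchmark/benchmark.h>

int main(int argc, char** argv) {
  benchmark::Initialize(&argc, argv);  // consumes --benchmark_* flags
  if (benchmark::ReportUnrecognizedArguments(argc, argv)) return 1;
  benchmark::RunSpecifiedBenchmarks();  // honors --benchmark_filter, as in the ncu example above
  return 0;
}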
@@ -6,11 +6,7 @@ if(EIGEN_BUILD_BLAS)
  add_custom_target(blas)

  set(EigenBlas_SRCS single.cpp double.cpp complex_single.cpp complex_double.cpp xerbla.cpp
      f2c/srotm.c f2c/srotmg.c f2c/drotm.c f2c/drotmg.c
      f2c/lsame.c f2c/dspmv.c f2c/ssbmv.c f2c/chbmv.c
      f2c/sspmv.c f2c/zhbmv.c f2c/chpmv.c f2c/dsbmv.c
      f2c/zhpmv.c f2c/dtbmv.c f2c/stbmv.c f2c/ctbmv.c
      f2c/ztbmv.c f2c/complexdots.c
      lsame.cpp complexdots.cpp
  )

  set(EIGEN_BLAS_TARGETS "")
72
blas/complexdots.cpp
Normal file
@@ -0,0 +1,72 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

// C++ replacements for the f2c complex dot product wrappers.
// These are thin wrappers around the worker functions (cdotcw_, etc.)
// defined in level1_cplx_impl.h.
//
// Note: blas.h declares these as void, but gfortran expects complex functions
// to return by value. We define the correct signatures here and do not include
// blas.h to avoid the conflicting declarations.

#if defined(_WIN32)
#if defined(EIGEN_BLAS_BUILD_DLL)
#define EIGEN_BLAS_CDOT_API __declspec(dllexport)
#else
#define EIGEN_BLAS_CDOT_API
#endif
#elif ((defined(__GNUC__) && __GNUC__ >= 4) || defined(__clang__)) && defined(EIGEN_BLAS_BUILD_DLL)
#define EIGEN_BLAS_CDOT_API __attribute__((visibility("default")))
#else
#define EIGEN_BLAS_CDOT_API
#endif

extern "C" {

// Worker function declarations (defined in level1_cplx_impl.h via complex_single.cpp / complex_double.cpp).
void cdotcw_(int *n, float *cx, int *incx, float *cy, int *incy, float *res);
void cdotuw_(int *n, float *cx, int *incx, float *cy, int *incy, float *res);
void zdotcw_(int *n, double *cx, int *incx, double *cy, int *incy, double *res);
void zdotuw_(int *n, double *cx, int *incx, double *cy, int *incy, double *res);

// POD complex types for C-compatible return values (matches Fortran complex layout).
struct eigen_blas_complex_float {
  float r, i;
};
struct eigen_blas_complex_double {
  double r, i;
};

// CDOTC computes the conjugated dot product of two single-precision complex vectors.
EIGEN_BLAS_CDOT_API eigen_blas_complex_float cdotc_(int *n, float *cx, int *incx, float *cy, int *incy) {
  eigen_blas_complex_float res = {0.0f, 0.0f};
  cdotcw_(n, cx, incx, cy, incy, &res.r);
  return res;
}

// CDOTU computes the unconjugated dot product of two single-precision complex vectors.
EIGEN_BLAS_CDOT_API eigen_blas_complex_float cdotu_(int *n, float *cx, int *incx, float *cy, int *incy) {
  eigen_blas_complex_float res = {0.0f, 0.0f};
  cdotuw_(n, cx, incx, cy, incy, &res.r);
  return res;
}

// ZDOTC computes the conjugated dot product of two double-precision complex vectors.
EIGEN_BLAS_CDOT_API eigen_blas_complex_double zdotc_(int *n, double *cx, int *incx, double *cy, int *incy) {
  eigen_blas_complex_double res = {0.0, 0.0};
  zdotcw_(n, cx, incx, cy, incy, &res.r);
  return res;
}

// ZDOTU computes the unconjugated dot product of two double-precision complex vectors.
EIGEN_BLAS_CDOT_API eigen_blas_complex_double zdotu_(int *n, double *cx, int *incx, double *cy, int *incy) {
  eigen_blas_complex_double res = {0.0, 0.0};
  zdotuw_(n, cx, incx, cy, incy, &res.r);
  return res;
}

}  // extern "C"
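For reference, a hedged caller-side sketch (not part of the commit): std::complex<float> is layout-compatible with the POD struct above, so a C++ caller linked against this BLAS can consume the by-value result directly. The declarations are copied from the file; the main() is illustrative only.

#include <complex>
#include <vector>

extern "C" {
struct eigen_blas_complex_float {
  float r, i;
};
eigen_blas_complex_float cdotc_(int *n, float *cx, int *incx, float *cy, int *incy);
}

int main() {
  std::vector<std::complex<float>> x(8, {1.f, 2.f}), y(8, {3.f, -1.f});
  int n = 8, inc = 1;
  // res = conj(x) . y, returned by value as gfortran expects.
  eigen_blas_complex_float res =
      cdotc_(&n, reinterpret_cast<float *>(x.data()), &inc, reinterpret_cast<float *>(y.data()), &inc);
  return (res.r != 0.f || res.i != 0.f) ? 0 : 1;  // consume the result
}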
456
blas/f2c/chbmv.c
@@ -1,456 +0,0 @@
/* chbmv.f -- translated by f2c (version 20100827).
   You must link the resulting object file with libf2c:
     on Microsoft Windows system, link with libf2c.lib;
     on Linux or Unix systems, link with .../path/to/libf2c.a -lm
     or, if you install libf2c.a in a standard place, with -lf2c -lm
     -- in that order, at the end of the command line, as in
       cc *.o -lf2c -lm
   Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,

     http://www.netlib.org/f2c/libf2c.zip
*/

#include "datatypes.h"

static inline void r_cnjg(complex *r, complex *z) {
  r->r = z->r;
  r->i = -(z->i);
}

/* Subroutine */ void chbmv_(char *uplo, integer *n, integer *k, complex *alpha, complex *a, integer *lda, complex *x,
                             integer *incx, complex *beta, complex *y, integer *incy) {
  /* System generated locals */
  integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5;
  real r__1;
  complex q__1, q__2, q__3, q__4;

  /* Local variables */
  integer i__, j, l, ix, iy, jx, jy, kx, ky, info;
  complex temp1, temp2;
  extern logical lsame_(char *, char *);
  integer kplus1;
  extern /* Subroutine */ void xerbla_(const char *, integer *);

  /* .. Scalar Arguments .. */
  /* .. */
  /* .. Array Arguments .. */
  /* .. */

  /* Purpose */
  /* ======= */

  /* CHBMV performs the matrix-vector operation */

  /*    y := alpha*A*x + beta*y, */

  /* where alpha and beta are scalars, x and y are n element vectors and */
  /* A is an n by n hermitian band matrix, with k super-diagonals. */

  /* Arguments */
  /* ========== */

  /* UPLO - CHARACTER*1. */
  /*        On entry, UPLO specifies whether the upper or lower */
  /*        triangular part of the band matrix A is being supplied as */
  /*        follows: */

  /*        UPLO = 'U' or 'u'   The upper triangular part of A is */
  /*                            being supplied. */

  /*        UPLO = 'L' or 'l'   The lower triangular part of A is */
  /*                            being supplied. */

  /*        Unchanged on exit. */

  /* N - INTEGER. */
  /*     On entry, N specifies the order of the matrix A. */
  /*     N must be at least zero. */
  /*     Unchanged on exit. */

  /* K - INTEGER. */
  /*     On entry, K specifies the number of super-diagonals of the */
  /*     matrix A. K must satisfy 0 .le. K. */
  /*     Unchanged on exit. */

  /* ALPHA - COMPLEX . */
  /*         On entry, ALPHA specifies the scalar alpha. */
  /*         Unchanged on exit. */

  /* A - COMPLEX array of DIMENSION ( LDA, n ). */
  /*     Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */
  /*     by n part of the array A must contain the upper triangular */
  /*     band part of the hermitian matrix, supplied column by */
  /*     column, with the leading diagonal of the matrix in row */
  /*     ( k + 1 ) of the array, the first super-diagonal starting at */
  /*     position 2 in row k, and so on. The top left k by k triangle */
  /*     of the array A is not referenced. */
  /*     The following program segment will transfer the upper */
  /*     triangular part of a hermitian band matrix from conventional */
  /*     full matrix storage to band storage: */

  /*        DO 20, J = 1, N */
  /*           M = K + 1 - J */
  /*           DO 10, I = MAX( 1, J - K ), J */
  /*              A( M + I, J ) = matrix( I, J ) */
  /*  10       CONTINUE */
  /*  20    CONTINUE */

  /*     Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */
  /*     by n part of the array A must contain the lower triangular */
  /*     band part of the hermitian matrix, supplied column by */
  /*     column, with the leading diagonal of the matrix in row 1 of */
  /*     the array, the first sub-diagonal starting at position 1 in */
  /*     row 2, and so on. The bottom right k by k triangle of the */
  /*     array A is not referenced. */
  /*     The following program segment will transfer the lower */
  /*     triangular part of a hermitian band matrix from conventional */
  /*     full matrix storage to band storage: */

  /*        DO 20, J = 1, N */
  /*           M = 1 - J */
  /*           DO 10, I = J, MIN( N, J + K ) */
  /*              A( M + I, J ) = matrix( I, J ) */
  /*  10       CONTINUE */
  /*  20    CONTINUE */

  /*     Note that the imaginary parts of the diagonal elements need */
  /*     not be set and are assumed to be zero. */
  /*     Unchanged on exit. */

  /* LDA - INTEGER. */
  /*       On entry, LDA specifies the first dimension of A as declared */
  /*       in the calling (sub) program. LDA must be at least */
  /*       ( k + 1 ). */
  /*       Unchanged on exit. */

  /* X - COMPLEX array of DIMENSION at least */
  /*     ( 1 + ( n - 1 )*abs( INCX ) ). */
  /*     Before entry, the incremented array X must contain the */
  /*     vector x. */
  /*     Unchanged on exit. */

  /* INCX - INTEGER. */
  /*        On entry, INCX specifies the increment for the elements of */
  /*        X. INCX must not be zero. */
  /*        Unchanged on exit. */

  /* BETA - COMPLEX . */
  /*        On entry, BETA specifies the scalar beta. */
  /*        Unchanged on exit. */

  /* Y - COMPLEX array of DIMENSION at least */
  /*     ( 1 + ( n - 1 )*abs( INCY ) ). */
  /*     Before entry, the incremented array Y must contain the */
  /*     vector y. On exit, Y is overwritten by the updated vector y. */

  /* INCY - INTEGER. */
  /*        On entry, INCY specifies the increment for the elements of */
  /*        Y. INCY must not be zero. */
  /*        Unchanged on exit. */

  /* Further Details */
  /* =============== */

  /* Level 2 Blas routine. */

  /* -- Written on 22-October-1986. */
  /*    Jack Dongarra, Argonne National Lab. */
  /*    Jeremy Du Croz, Nag Central Office. */
  /*    Sven Hammarling, Nag Central Office. */
  /*    Richard Hanson, Sandia National Labs. */

  /* ===================================================================== */

  /* .. Parameters .. */
  /* .. */
  /* .. Local Scalars .. */
  /* .. */
  /* .. External Functions .. */
  /* .. */
  /* .. External Subroutines .. */
  /* .. */
  /* .. Intrinsic Functions .. */
  /* .. */

  /* Test the input parameters. */

  /* Parameter adjustments */
  a_dim1 = *lda;
  a_offset = 1 + a_dim1;
  a -= a_offset;
  --x;
  --y;

  /* Function Body */
  info = 0;
  if (!lsame_(uplo, "U") && !lsame_(uplo, "L")) {
    info = 1;
  } else if (*n < 0) {
    info = 2;
  } else if (*k < 0) {
    info = 3;
  } else if (*lda < *k + 1) {
    info = 6;
  } else if (*incx == 0) {
    info = 8;
  } else if (*incy == 0) {
    info = 11;
  }
  if (info != 0) {
    xerbla_("CHBMV ", &info);
    return;
  }

  /* Quick return if possible. */

  if (*n == 0 || (alpha->r == 0.f && alpha->i == 0.f && (beta->r == 1.f && beta->i == 0.f))) {
    return;
  }

  /* Set up the start points in X and Y. */

  if (*incx > 0) {
    kx = 1;
  } else {
    kx = 1 - (*n - 1) * *incx;
  }
  if (*incy > 0) {
    ky = 1;
  } else {
    ky = 1 - (*n - 1) * *incy;
  }

  /* Start the operations. In this version the elements of the array A */
  /* are accessed sequentially with one pass through A. */

  /* First form y := beta*y. */

  if (beta->r != 1.f || beta->i != 0.f) {
    if (*incy == 1) {
      if (beta->r == 0.f && beta->i == 0.f) {
        i__1 = *n;
        for (i__ = 1; i__ <= i__1; ++i__) {
          i__2 = i__;
          y[i__2].r = 0.f, y[i__2].i = 0.f;
          /* L10: */
        }
      } else {
        i__1 = *n;
        for (i__ = 1; i__ <= i__1; ++i__) {
          i__2 = i__;
          i__3 = i__;
          q__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i, q__1.i = beta->r * y[i__3].i + beta->i * y[i__3].r;
          y[i__2].r = q__1.r, y[i__2].i = q__1.i;
          /* L20: */
        }
      }
    } else {
      iy = ky;
      if (beta->r == 0.f && beta->i == 0.f) {
        i__1 = *n;
        for (i__ = 1; i__ <= i__1; ++i__) {
          i__2 = iy;
          y[i__2].r = 0.f, y[i__2].i = 0.f;
          iy += *incy;
          /* L30: */
        }
      } else {
        i__1 = *n;
        for (i__ = 1; i__ <= i__1; ++i__) {
          i__2 = iy;
          i__3 = iy;
          q__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i, q__1.i = beta->r * y[i__3].i + beta->i * y[i__3].r;
          y[i__2].r = q__1.r, y[i__2].i = q__1.i;
          iy += *incy;
          /* L40: */
        }
      }
    }
  }
  if (alpha->r == 0.f && alpha->i == 0.f) {
    return;
  }
  if (lsame_(uplo, "U")) {
    /* Form y when upper triangle of A is stored. */

    kplus1 = *k + 1;
    if (*incx == 1 && *incy == 1) {
      i__1 = *n;
      for (j = 1; j <= i__1; ++j) {
        i__2 = j;
        q__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, q__1.i = alpha->r * x[i__2].i + alpha->i * x[i__2].r;
        temp1.r = q__1.r, temp1.i = q__1.i;
        temp2.r = 0.f, temp2.i = 0.f;
        l = kplus1 - j;
        /* Computing MAX */
        i__2 = 1, i__3 = j - *k;
        i__4 = j - 1;
        for (i__ = max(i__2, i__3); i__ <= i__4; ++i__) {
          i__2 = i__;
          i__3 = i__;
          i__5 = l + i__ + j * a_dim1;
          q__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i, q__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5].r;
          q__1.r = y[i__3].r + q__2.r, q__1.i = y[i__3].i + q__2.i;
          y[i__2].r = q__1.r, y[i__2].i = q__1.i;
          r_cnjg(&q__3, &a[l + i__ + j * a_dim1]);
          i__2 = i__;
          q__2.r = q__3.r * x[i__2].r - q__3.i * x[i__2].i, q__2.i = q__3.r * x[i__2].i + q__3.i * x[i__2].r;
          q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i;
          temp2.r = q__1.r, temp2.i = q__1.i;
          /* L50: */
        }
        i__4 = j;
        i__2 = j;
        i__3 = kplus1 + j * a_dim1;
        r__1 = a[i__3].r;
        q__3.r = r__1 * temp1.r, q__3.i = r__1 * temp1.i;
        q__2.r = y[i__2].r + q__3.r, q__2.i = y[i__2].i + q__3.i;
        q__4.r = alpha->r * temp2.r - alpha->i * temp2.i, q__4.i = alpha->r * temp2.i + alpha->i * temp2.r;
        q__1.r = q__2.r + q__4.r, q__1.i = q__2.i + q__4.i;
        y[i__4].r = q__1.r, y[i__4].i = q__1.i;
        /* L60: */
      }
    } else {
      jx = kx;
      jy = ky;
      i__1 = *n;
      for (j = 1; j <= i__1; ++j) {
        i__4 = jx;
        q__1.r = alpha->r * x[i__4].r - alpha->i * x[i__4].i, q__1.i = alpha->r * x[i__4].i + alpha->i * x[i__4].r;
        temp1.r = q__1.r, temp1.i = q__1.i;
        temp2.r = 0.f, temp2.i = 0.f;
        ix = kx;
        iy = ky;
        l = kplus1 - j;
        /* Computing MAX */
        i__4 = 1, i__2 = j - *k;
        i__3 = j - 1;
        for (i__ = max(i__4, i__2); i__ <= i__3; ++i__) {
          i__4 = iy;
          i__2 = iy;
          i__5 = l + i__ + j * a_dim1;
          q__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i, q__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5].r;
          q__1.r = y[i__2].r + q__2.r, q__1.i = y[i__2].i + q__2.i;
          y[i__4].r = q__1.r, y[i__4].i = q__1.i;
          r_cnjg(&q__3, &a[l + i__ + j * a_dim1]);
          i__4 = ix;
          q__2.r = q__3.r * x[i__4].r - q__3.i * x[i__4].i, q__2.i = q__3.r * x[i__4].i + q__3.i * x[i__4].r;
          q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i;
          temp2.r = q__1.r, temp2.i = q__1.i;
          ix += *incx;
          iy += *incy;
          /* L70: */
        }
        i__3 = jy;
        i__4 = jy;
        i__2 = kplus1 + j * a_dim1;
        r__1 = a[i__2].r;
        q__3.r = r__1 * temp1.r, q__3.i = r__1 * temp1.i;
        q__2.r = y[i__4].r + q__3.r, q__2.i = y[i__4].i + q__3.i;
        q__4.r = alpha->r * temp2.r - alpha->i * temp2.i, q__4.i = alpha->r * temp2.i + alpha->i * temp2.r;
        q__1.r = q__2.r + q__4.r, q__1.i = q__2.i + q__4.i;
        y[i__3].r = q__1.r, y[i__3].i = q__1.i;
        jx += *incx;
        jy += *incy;
        if (j > *k) {
          kx += *incx;
          ky += *incy;
        }
        /* L80: */
      }
    }
  } else {
    /* Form y when lower triangle of A is stored. */

    if (*incx == 1 && *incy == 1) {
      i__1 = *n;
      for (j = 1; j <= i__1; ++j) {
        i__3 = j;
        q__1.r = alpha->r * x[i__3].r - alpha->i * x[i__3].i, q__1.i = alpha->r * x[i__3].i + alpha->i * x[i__3].r;
        temp1.r = q__1.r, temp1.i = q__1.i;
        temp2.r = 0.f, temp2.i = 0.f;
        i__3 = j;
        i__4 = j;
        i__2 = j * a_dim1 + 1;
        r__1 = a[i__2].r;
        q__2.r = r__1 * temp1.r, q__2.i = r__1 * temp1.i;
        q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i;
        y[i__3].r = q__1.r, y[i__3].i = q__1.i;
        l = 1 - j;
        /* Computing MIN */
        i__4 = *n, i__2 = j + *k;
        i__3 = min(i__4, i__2);
        for (i__ = j + 1; i__ <= i__3; ++i__) {
          i__4 = i__;
          i__2 = i__;
          i__5 = l + i__ + j * a_dim1;
          q__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i, q__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5].r;
          q__1.r = y[i__2].r + q__2.r, q__1.i = y[i__2].i + q__2.i;
          y[i__4].r = q__1.r, y[i__4].i = q__1.i;
          r_cnjg(&q__3, &a[l + i__ + j * a_dim1]);
          i__4 = i__;
          q__2.r = q__3.r * x[i__4].r - q__3.i * x[i__4].i, q__2.i = q__3.r * x[i__4].i + q__3.i * x[i__4].r;
          q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i;
          temp2.r = q__1.r, temp2.i = q__1.i;
          /* L90: */
        }
        i__3 = j;
        i__4 = j;
        q__2.r = alpha->r * temp2.r - alpha->i * temp2.i, q__2.i = alpha->r * temp2.i + alpha->i * temp2.r;
        q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i;
        y[i__3].r = q__1.r, y[i__3].i = q__1.i;
        /* L100: */
      }
    } else {
      jx = kx;
      jy = ky;
      i__1 = *n;
      for (j = 1; j <= i__1; ++j) {
        i__3 = jx;
        q__1.r = alpha->r * x[i__3].r - alpha->i * x[i__3].i, q__1.i = alpha->r * x[i__3].i + alpha->i * x[i__3].r;
        temp1.r = q__1.r, temp1.i = q__1.i;
        temp2.r = 0.f, temp2.i = 0.f;
        i__3 = jy;
        i__4 = jy;
        i__2 = j * a_dim1 + 1;
        r__1 = a[i__2].r;
        q__2.r = r__1 * temp1.r, q__2.i = r__1 * temp1.i;
        q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i;
        y[i__3].r = q__1.r, y[i__3].i = q__1.i;
        l = 1 - j;
        ix = jx;
        iy = jy;
        /* Computing MIN */
        i__4 = *n, i__2 = j + *k;
        i__3 = min(i__4, i__2);
        for (i__ = j + 1; i__ <= i__3; ++i__) {
          ix += *incx;
          iy += *incy;
          i__4 = iy;
          i__2 = iy;
          i__5 = l + i__ + j * a_dim1;
          q__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i, q__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5].r;
          q__1.r = y[i__2].r + q__2.r, q__1.i = y[i__2].i + q__2.i;
          y[i__4].r = q__1.r, y[i__4].i = q__1.i;
          r_cnjg(&q__3, &a[l + i__ + j * a_dim1]);
          i__4 = ix;
          q__2.r = q__3.r * x[i__4].r - q__3.i * x[i__4].i, q__2.i = q__3.r * x[i__4].i + q__3.i * x[i__4].r;
          q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i;
          temp2.r = q__1.r, temp2.i = q__1.i;
          /* L110: */
        }
        i__3 = jy;
        i__4 = jy;
        q__2.r = alpha->r * temp2.r - alpha->i * temp2.i, q__2.i = alpha->r * temp2.i + alpha->i * temp2.r;
        q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i;
        y[i__3].r = q__1.r, y[i__3].i = q__1.i;
        jx += *incx;
        jy += *incy;
        /* L120: */
      }
    }
  }

  /* End of CHBMV . */

} /* chbmv_ */
407
blas/f2c/chpmv.c
@@ -1,407 +0,0 @@
/* chpmv.f -- translated by f2c (version 20100827).
   You must link the resulting object file with libf2c:
     on Microsoft Windows system, link with libf2c.lib;
     on Linux or Unix systems, link with .../path/to/libf2c.a -lm
     or, if you install libf2c.a in a standard place, with -lf2c -lm
     -- in that order, at the end of the command line, as in
       cc *.o -lf2c -lm
   Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,

     http://www.netlib.org/f2c/libf2c.zip
*/

#include "datatypes.h"

static inline void r_cnjg(complex *r, complex *z) {
  r->r = z->r;
  r->i = -(z->i);
}

/* Subroutine */ void chpmv_(char *uplo, integer *n, complex *alpha, complex *ap, complex *x, integer *incx,
                             complex *beta, complex *y, integer *incy) {
  /* System generated locals */
  integer i__1, i__2, i__3, i__4, i__5;
  real r__1;
  complex q__1, q__2, q__3, q__4;

  /* Local variables */
  integer i__, j, k, kk, ix, iy, jx, jy, kx, ky, info;
  complex temp1, temp2;
  extern logical lsame_(char *, char *);
  extern /* Subroutine */ void xerbla_(const char *, integer *);

  /* .. Scalar Arguments .. */
  /* .. */
  /* .. Array Arguments .. */
  /* .. */

  /* Purpose */
  /* ======= */

  /* CHPMV performs the matrix-vector operation */

  /*    y := alpha*A*x + beta*y, */

  /* where alpha and beta are scalars, x and y are n element vectors and */
  /* A is an n by n hermitian matrix, supplied in packed form. */

  /* Arguments */
  /* ========== */

  /* UPLO - CHARACTER*1. */
  /*        On entry, UPLO specifies whether the upper or lower */
  /*        triangular part of the matrix A is supplied in the packed */
  /*        array AP as follows: */

  /*        UPLO = 'U' or 'u'   The upper triangular part of A is */
  /*                            supplied in AP. */

  /*        UPLO = 'L' or 'l'   The lower triangular part of A is */
  /*                            supplied in AP. */

  /*        Unchanged on exit. */

  /* N - INTEGER. */
  /*     On entry, N specifies the order of the matrix A. */
  /*     N must be at least zero. */
  /*     Unchanged on exit. */

  /* ALPHA - COMPLEX . */
  /*         On entry, ALPHA specifies the scalar alpha. */
  /*         Unchanged on exit. */

  /* AP - COMPLEX array of DIMENSION at least */
  /*      ( ( n*( n + 1 ) )/2 ). */
  /*      Before entry with UPLO = 'U' or 'u', the array AP must */
  /*      contain the upper triangular part of the hermitian matrix */
  /*      packed sequentially, column by column, so that AP( 1 ) */
  /*      contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) */
  /*      and a( 2, 2 ) respectively, and so on. */
  /*      Before entry with UPLO = 'L' or 'l', the array AP must */
  /*      contain the lower triangular part of the hermitian matrix */
  /*      packed sequentially, column by column, so that AP( 1 ) */
  /*      contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) */
  /*      and a( 3, 1 ) respectively, and so on. */
  /*      Note that the imaginary parts of the diagonal elements need */
  /*      not be set and are assumed to be zero. */
  /*      Unchanged on exit. */

  /* X - COMPLEX array of dimension at least */
  /*     ( 1 + ( n - 1 )*abs( INCX ) ). */
  /*     Before entry, the incremented array X must contain the n */
  /*     element vector x. */
  /*     Unchanged on exit. */

  /* INCX - INTEGER. */
  /*        On entry, INCX specifies the increment for the elements of */
  /*        X. INCX must not be zero. */
  /*        Unchanged on exit. */

  /* BETA - COMPLEX . */
  /*        On entry, BETA specifies the scalar beta. When BETA is */
  /*        supplied as zero then Y need not be set on input. */
  /*        Unchanged on exit. */

  /* Y - COMPLEX array of dimension at least */
  /*     ( 1 + ( n - 1 )*abs( INCY ) ). */
  /*     Before entry, the incremented array Y must contain the n */
  /*     element vector y. On exit, Y is overwritten by the updated */
  /*     vector y. */

  /* INCY - INTEGER. */
  /*        On entry, INCY specifies the increment for the elements of */
  /*        Y. INCY must not be zero. */
  /*        Unchanged on exit. */

  /* Further Details */
  /* =============== */

  /* Level 2 Blas routine. */

  /* -- Written on 22-October-1986. */
  /*    Jack Dongarra, Argonne National Lab. */
  /*    Jeremy Du Croz, Nag Central Office. */
  /*    Sven Hammarling, Nag Central Office. */
  /*    Richard Hanson, Sandia National Labs. */

  /* ===================================================================== */

  /* .. Parameters .. */
  /* .. */
  /* .. Local Scalars .. */
  /* .. */
  /* .. External Functions .. */
  /* .. */
  /* .. External Subroutines .. */
  /* .. */
  /* .. Intrinsic Functions .. */
  /* .. */

  /* Test the input parameters. */

  /* Parameter adjustments */
  --y;
  --x;
  --ap;

  /* Function Body */
  info = 0;
  if (!lsame_(uplo, "U") && !lsame_(uplo, "L")) {
    info = 1;
  } else if (*n < 0) {
    info = 2;
  } else if (*incx == 0) {
    info = 6;
  } else if (*incy == 0) {
    info = 9;
  }
  if (info != 0) {
    xerbla_("CHPMV ", &info);
    return;
  }

  /* Quick return if possible. */

  if (*n == 0 || (alpha->r == 0.f && alpha->i == 0.f && (beta->r == 1.f && beta->i == 0.f))) {
    return;
  }

  /* Set up the start points in X and Y. */

  if (*incx > 0) {
    kx = 1;
  } else {
    kx = 1 - (*n - 1) * *incx;
  }
  if (*incy > 0) {
    ky = 1;
  } else {
    ky = 1 - (*n - 1) * *incy;
  }

  /* Start the operations. In this version the elements of the array AP */
  /* are accessed sequentially with one pass through AP. */

  /* First form y := beta*y. */

  if (beta->r != 1.f || beta->i != 0.f) {
    if (*incy == 1) {
      if (beta->r == 0.f && beta->i == 0.f) {
        i__1 = *n;
        for (i__ = 1; i__ <= i__1; ++i__) {
          i__2 = i__;
          y[i__2].r = 0.f, y[i__2].i = 0.f;
          /* L10: */
        }
      } else {
        i__1 = *n;
        for (i__ = 1; i__ <= i__1; ++i__) {
          i__2 = i__;
          i__3 = i__;
          q__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i, q__1.i = beta->r * y[i__3].i + beta->i * y[i__3].r;
          y[i__2].r = q__1.r, y[i__2].i = q__1.i;
          /* L20: */
        }
      }
    } else {
      iy = ky;
      if (beta->r == 0.f && beta->i == 0.f) {
        i__1 = *n;
        for (i__ = 1; i__ <= i__1; ++i__) {
          i__2 = iy;
          y[i__2].r = 0.f, y[i__2].i = 0.f;
          iy += *incy;
          /* L30: */
        }
      } else {
        i__1 = *n;
        for (i__ = 1; i__ <= i__1; ++i__) {
          i__2 = iy;
          i__3 = iy;
          q__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i, q__1.i = beta->r * y[i__3].i + beta->i * y[i__3].r;
          y[i__2].r = q__1.r, y[i__2].i = q__1.i;
          iy += *incy;
          /* L40: */
        }
      }
    }
  }
  if (alpha->r == 0.f && alpha->i == 0.f) {
    return;
  }
  kk = 1;
  if (lsame_(uplo, "U")) {
    /* Form y when AP contains the upper triangle. */

    if (*incx == 1 && *incy == 1) {
      i__1 = *n;
      for (j = 1; j <= i__1; ++j) {
        i__2 = j;
        q__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, q__1.i = alpha->r * x[i__2].i + alpha->i * x[i__2].r;
        temp1.r = q__1.r, temp1.i = q__1.i;
        temp2.r = 0.f, temp2.i = 0.f;
        k = kk;
        i__2 = j - 1;
        for (i__ = 1; i__ <= i__2; ++i__) {
          i__3 = i__;
          i__4 = i__;
          i__5 = k;
          q__2.r = temp1.r * ap[i__5].r - temp1.i * ap[i__5].i, q__2.i = temp1.r * ap[i__5].i + temp1.i * ap[i__5].r;
          q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i;
          y[i__3].r = q__1.r, y[i__3].i = q__1.i;
          r_cnjg(&q__3, &ap[k]);
          i__3 = i__;
          q__2.r = q__3.r * x[i__3].r - q__3.i * x[i__3].i, q__2.i = q__3.r * x[i__3].i + q__3.i * x[i__3].r;
          q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i;
          temp2.r = q__1.r, temp2.i = q__1.i;
          ++k;
          /* L50: */
        }
        i__2 = j;
        i__3 = j;
        i__4 = kk + j - 1;
        r__1 = ap[i__4].r;
        q__3.r = r__1 * temp1.r, q__3.i = r__1 * temp1.i;
        q__2.r = y[i__3].r + q__3.r, q__2.i = y[i__3].i + q__3.i;
        q__4.r = alpha->r * temp2.r - alpha->i * temp2.i, q__4.i = alpha->r * temp2.i + alpha->i * temp2.r;
        q__1.r = q__2.r + q__4.r, q__1.i = q__2.i + q__4.i;
        y[i__2].r = q__1.r, y[i__2].i = q__1.i;
        kk += j;
        /* L60: */
      }
    } else {
      jx = kx;
      jy = ky;
      i__1 = *n;
      for (j = 1; j <= i__1; ++j) {
        i__2 = jx;
        q__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, q__1.i = alpha->r * x[i__2].i + alpha->i * x[i__2].r;
        temp1.r = q__1.r, temp1.i = q__1.i;
        temp2.r = 0.f, temp2.i = 0.f;
        ix = kx;
        iy = ky;
        i__2 = kk + j - 2;
        for (k = kk; k <= i__2; ++k) {
          i__3 = iy;
          i__4 = iy;
          i__5 = k;
          q__2.r = temp1.r * ap[i__5].r - temp1.i * ap[i__5].i, q__2.i = temp1.r * ap[i__5].i + temp1.i * ap[i__5].r;
          q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i;
          y[i__3].r = q__1.r, y[i__3].i = q__1.i;
          r_cnjg(&q__3, &ap[k]);
          i__3 = ix;
          q__2.r = q__3.r * x[i__3].r - q__3.i * x[i__3].i, q__2.i = q__3.r * x[i__3].i + q__3.i * x[i__3].r;
          q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i;
          temp2.r = q__1.r, temp2.i = q__1.i;
          ix += *incx;
          iy += *incy;
          /* L70: */
        }
        i__2 = jy;
        i__3 = jy;
        i__4 = kk + j - 1;
        r__1 = ap[i__4].r;
        q__3.r = r__1 * temp1.r, q__3.i = r__1 * temp1.i;
        q__2.r = y[i__3].r + q__3.r, q__2.i = y[i__3].i + q__3.i;
        q__4.r = alpha->r * temp2.r - alpha->i * temp2.i, q__4.i = alpha->r * temp2.i + alpha->i * temp2.r;
        q__1.r = q__2.r + q__4.r, q__1.i = q__2.i + q__4.i;
        y[i__2].r = q__1.r, y[i__2].i = q__1.i;
        jx += *incx;
        jy += *incy;
        kk += j;
        /* L80: */
      }
    }
  } else {
    /* Form y when AP contains the lower triangle. */

    if (*incx == 1 && *incy == 1) {
      i__1 = *n;
      for (j = 1; j <= i__1; ++j) {
        i__2 = j;
        q__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, q__1.i = alpha->r * x[i__2].i + alpha->i * x[i__2].r;
        temp1.r = q__1.r, temp1.i = q__1.i;
        temp2.r = 0.f, temp2.i = 0.f;
        i__2 = j;
        i__3 = j;
        i__4 = kk;
        r__1 = ap[i__4].r;
        q__2.r = r__1 * temp1.r, q__2.i = r__1 * temp1.i;
        q__1.r = y[i__3].r + q__2.r, q__1.i = y[i__3].i + q__2.i;
        y[i__2].r = q__1.r, y[i__2].i = q__1.i;
        k = kk + 1;
        i__2 = *n;
        for (i__ = j + 1; i__ <= i__2; ++i__) {
          i__3 = i__;
          i__4 = i__;
          i__5 = k;
          q__2.r = temp1.r * ap[i__5].r - temp1.i * ap[i__5].i, q__2.i = temp1.r * ap[i__5].i + temp1.i * ap[i__5].r;
          q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i;
          y[i__3].r = q__1.r, y[i__3].i = q__1.i;
          r_cnjg(&q__3, &ap[k]);
          i__3 = i__;
          q__2.r = q__3.r * x[i__3].r - q__3.i * x[i__3].i, q__2.i = q__3.r * x[i__3].i + q__3.i * x[i__3].r;
          q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i;
          temp2.r = q__1.r, temp2.i = q__1.i;
          ++k;
          /* L90: */
        }
        i__2 = j;
        i__3 = j;
        q__2.r = alpha->r * temp2.r - alpha->i * temp2.i, q__2.i = alpha->r * temp2.i + alpha->i * temp2.r;
        q__1.r = y[i__3].r + q__2.r, q__1.i = y[i__3].i + q__2.i;
        y[i__2].r = q__1.r, y[i__2].i = q__1.i;
        kk += *n - j + 1;
        /* L100: */
      }
    } else {
      jx = kx;
      jy = ky;
      i__1 = *n;
      for (j = 1; j <= i__1; ++j) {
        i__2 = jx;
        q__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, q__1.i = alpha->r * x[i__2].i + alpha->i * x[i__2].r;
        temp1.r = q__1.r, temp1.i = q__1.i;
        temp2.r = 0.f, temp2.i = 0.f;
        i__2 = jy;
        i__3 = jy;
        i__4 = kk;
        r__1 = ap[i__4].r;
        q__2.r = r__1 * temp1.r, q__2.i = r__1 * temp1.i;
        q__1.r = y[i__3].r + q__2.r, q__1.i = y[i__3].i + q__2.i;
        y[i__2].r = q__1.r, y[i__2].i = q__1.i;
        ix = jx;
        iy = jy;
        i__2 = kk + *n - j;
        for (k = kk + 1; k <= i__2; ++k) {
          ix += *incx;
          iy += *incy;
          i__3 = iy;
          i__4 = iy;
          i__5 = k;
          q__2.r = temp1.r * ap[i__5].r - temp1.i * ap[i__5].i, q__2.i = temp1.r * ap[i__5].i + temp1.i * ap[i__5].r;
          q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i;
          y[i__3].r = q__1.r, y[i__3].i = q__1.i;
          r_cnjg(&q__3, &ap[k]);
          i__3 = ix;
          q__2.r = q__3.r * x[i__3].r - q__3.i * x[i__3].i, q__2.i = q__3.r * x[i__3].i + q__3.i * x[i__3].r;
          q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i;
          temp2.r = q__1.r, temp2.i = q__1.i;
          /* L110: */
        }
        i__2 = jy;
        i__3 = jy;
        q__2.r = alpha->r * temp2.r - alpha->i * temp2.i, q__2.i = alpha->r * temp2.i + alpha->i * temp2.r;
        q__1.r = y[i__3].r + q__2.r, q__1.i = y[i__3].i + q__2.i;
        y[i__2].r = q__1.r, y[i__2].i = q__1.i;
        jx += *incx;
        jy += *incy;
        kk += *n - j + 1;
        /* L120: */
      }
    }
  }

  /* End of CHPMV . */

} /* chpmv_ */
@@ -1,73 +0,0 @@
/* This file has been modified to use the standard gfortran calling
   convention, rather than the f2c calling convention.

   It does not require -ff2c when compiled with gfortran.
*/

/* complexdots.f -- translated by f2c (version 20100827).
   You must link the resulting object file with libf2c:
     on Microsoft Windows system, link with libf2c.lib;
     on Linux or Unix systems, link with .../path/to/libf2c.a -lm
     or, if you install libf2c.a in a standard place, with -lf2c -lm
     -- in that order, at the end of the command line, as in
       cc *.o -lf2c -lm
   Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,

     http://www.netlib.org/f2c/libf2c.zip
*/

#include "datatypes.h"

complex cdotc_(integer *n, complex *cx, integer *incx, complex *cy, integer *incy) {
  complex res;
  extern /* Subroutine */ void cdotcw_(integer *, complex *, integer *, complex *, integer *, complex *);

  /* Parameter adjustments */
  --cy;
  --cx;

  /* Function Body */
  cdotcw_(n, &cx[1], incx, &cy[1], incy, &res);
  return res;
} /* cdotc_ */

complex cdotu_(integer *n, complex *cx, integer *incx, complex *cy, integer *incy) {
  complex res;
  extern /* Subroutine */ void cdotuw_(integer *, complex *, integer *, complex *, integer *, complex *);

  /* Parameter adjustments */
  --cy;
  --cx;

  /* Function Body */
  cdotuw_(n, &cx[1], incx, &cy[1], incy, &res);
  return res;
} /* cdotu_ */

doublecomplex zdotc_(integer *n, doublecomplex *cx, integer *incx, doublecomplex *cy, integer *incy) {
  doublecomplex res;
  extern /* Subroutine */ void zdotcw_(integer *, doublecomplex *, integer *, doublecomplex *, integer *,
                                       doublecomplex *);

  /* Parameter adjustments */
  --cy;
  --cx;

  /* Function Body */
  zdotcw_(n, &cx[1], incx, &cy[1], incy, &res);
  return res;
} /* zdotc_ */

doublecomplex zdotu_(integer *n, doublecomplex *cx, integer *incx, doublecomplex *cy, integer *incy) {
  doublecomplex res;
  extern /* Subroutine */ void zdotuw_(integer *, doublecomplex *, integer *, doublecomplex *, integer *,
                                       doublecomplex *);

  /* Parameter adjustments */
  --cy;
  --cx;

  /* Function Body */
  zdotuw_(n, &cx[1], incx, &cy[1], incy, &res);
  return res;
} /* zdotu_ */
586
blas/f2c/ctbmv.c
@@ -1,586 +0,0 @@
/* ctbmv.f -- translated by f2c (version 20100827).
   You must link the resulting object file with libf2c:
     on Microsoft Windows system, link with libf2c.lib;
     on Linux or Unix systems, link with .../path/to/libf2c.a -lm
     or, if you install libf2c.a in a standard place, with -lf2c -lm
     -- in that order, at the end of the command line, as in
       cc *.o -lf2c -lm
   Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,

     http://www.netlib.org/f2c/libf2c.zip
*/

#include "datatypes.h"

static inline void r_cnjg(complex *r, complex *z) {
  r->r = z->r;
  r->i = -(z->i);
}

/* Subroutine */ void ctbmv_(char *uplo, char *trans, char *diag, integer *n, integer *k, complex *a, integer *lda,
                             complex *x, integer *incx) {
  /* System generated locals */
  integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5;
  complex q__1, q__2, q__3;

  /* Local variables */
  integer i__, j, l, ix, jx, kx, info;
  complex temp;
  extern logical lsame_(char *, char *);
  integer kplus1;
  extern /* Subroutine */ void xerbla_(const char *, integer *);
  logical noconj, nounit;

  /* .. Scalar Arguments .. */
  /* .. */
  /* .. Array Arguments .. */
  /* .. */

  /* Purpose */
  /* ======= */

  /* CTBMV performs one of the matrix-vector operations */

  /*    x := A*x, or x := A'*x, or x := conjg( A' )*x, */

  /* where x is an n element vector and A is an n by n unit, or non-unit, */
  /* upper or lower triangular band matrix, with ( k + 1 ) diagonals. */

  /* Arguments */
  /* ========== */

  /* UPLO - CHARACTER*1. */
  /*        On entry, UPLO specifies whether the matrix is an upper or */
  /*        lower triangular matrix as follows: */

  /*        UPLO = 'U' or 'u'   A is an upper triangular matrix. */

  /*        UPLO = 'L' or 'l'   A is a lower triangular matrix. */

  /*        Unchanged on exit. */

  /* TRANS - CHARACTER*1. */
  /*         On entry, TRANS specifies the operation to be performed as */
  /*         follows: */

  /*         TRANS = 'N' or 'n'   x := A*x. */

  /*         TRANS = 'T' or 't'   x := A'*x. */

  /*         TRANS = 'C' or 'c'   x := conjg( A' )*x. */

  /*         Unchanged on exit. */

  /* DIAG - CHARACTER*1. */
  /*        On entry, DIAG specifies whether or not A is unit */
  /*        triangular as follows: */

  /*        DIAG = 'U' or 'u'   A is assumed to be unit triangular. */

  /*        DIAG = 'N' or 'n'   A is not assumed to be unit */
  /*                            triangular. */

  /*        Unchanged on exit. */

  /* N - INTEGER. */
  /*     On entry, N specifies the order of the matrix A. */
  /*     N must be at least zero. */
  /*     Unchanged on exit. */

  /* K - INTEGER. */
  /*     On entry with UPLO = 'U' or 'u', K specifies the number of */
  /*     super-diagonals of the matrix A. */
  /*     On entry with UPLO = 'L' or 'l', K specifies the number of */
  /*     sub-diagonals of the matrix A. */
  /*     K must satisfy 0 .le. K. */
  /*     Unchanged on exit. */

  /* A - COMPLEX array of DIMENSION ( LDA, n ). */
  /*     Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */
  /*     by n part of the array A must contain the upper triangular */
  /*     band part of the matrix of coefficients, supplied column by */
  /*     column, with the leading diagonal of the matrix in row */
  /*     ( k + 1 ) of the array, the first super-diagonal starting at */
  /*     position 2 in row k, and so on. The top left k by k triangle */
  /*     of the array A is not referenced. */
  /*     The following program segment will transfer an upper */
  /*     triangular band matrix from conventional full matrix storage */
  /*     to band storage: */

  /*        DO 20, J = 1, N */
  /*           M = K + 1 - J */
  /*           DO 10, I = MAX( 1, J - K ), J */
  /*              A( M + I, J ) = matrix( I, J ) */
  /*  10       CONTINUE */
  /*  20    CONTINUE */

  /*     Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */
  /*     by n part of the array A must contain the lower triangular */
  /*     band part of the matrix of coefficients, supplied column by */
  /*     column, with the leading diagonal of the matrix in row 1 of */
  /*     the array, the first sub-diagonal starting at position 1 in */
  /*     row 2, and so on. The bottom right k by k triangle of the */
  /*     array A is not referenced. */
  /*     The following program segment will transfer a lower */
  /*     triangular band matrix from conventional full matrix storage */
  /*     to band storage: */

  /*        DO 20, J = 1, N */
  /*           M = 1 - J */
  /*           DO 10, I = J, MIN( N, J + K ) */
  /*              A( M + I, J ) = matrix( I, J ) */
  /*  10       CONTINUE */
  /*  20    CONTINUE */

  /*     Note that when DIAG = 'U' or 'u' the elements of the array A */
  /*     corresponding to the diagonal elements of the matrix are not */
  /*     referenced, but are assumed to be unity. */
  /*     Unchanged on exit. */

  /* LDA - INTEGER. */
  /*       On entry, LDA specifies the first dimension of A as declared */
  /*       in the calling (sub) program. LDA must be at least */
  /*       ( k + 1 ). */
  /*       Unchanged on exit. */

  /* X - COMPLEX array of dimension at least */
  /*     ( 1 + ( n - 1 )*abs( INCX ) ). */
  /*     Before entry, the incremented array X must contain the n */
  /*     element vector x. On exit, X is overwritten with the */
  /*     transformed vector x. */

  /* INCX - INTEGER. */
  /*        On entry, INCX specifies the increment for the elements of */
  /*        X. INCX must not be zero. */
  /*        Unchanged on exit. */

  /* Further Details */
  /* =============== */

  /* Level 2 Blas routine. */

  /* -- Written on 22-October-1986. */
  /*    Jack Dongarra, Argonne National Lab. */
  /*    Jeremy Du Croz, Nag Central Office. */
  /*    Sven Hammarling, Nag Central Office. */
  /*    Richard Hanson, Sandia National Labs. */

  /* ===================================================================== */

  /* .. Parameters .. */
  /* .. */
  /* .. Local Scalars .. */
  /* .. */
  /* .. External Functions .. */
  /* .. */
  /* .. External Subroutines .. */
  /* .. */
  /* .. Intrinsic Functions .. */
  /* .. */

  /* Test the input parameters. */

  /* Parameter adjustments */
  a_dim1 = *lda;
  a_offset = 1 + a_dim1;
  a -= a_offset;
  --x;

  /* Function Body */
  info = 0;
  if (!lsame_(uplo, "U") && !lsame_(uplo, "L")) {
    info = 1;
  } else if (!lsame_(trans, "N") && !lsame_(trans, "T") && !lsame_(trans, "C")) {
    info = 2;
  } else if (!lsame_(diag, "U") && !lsame_(diag, "N")) {
    info = 3;
  } else if (*n < 0) {
    info = 4;
  } else if (*k < 0) {
    info = 5;
  } else if (*lda < *k + 1) {
    info = 7;
  } else if (*incx == 0) {
    info = 9;
  }
  if (info != 0) {
    xerbla_("CTBMV ", &info);
    return;
  }

  /* Quick return if possible. */

  if (*n == 0) {
    return;
  }

  noconj = lsame_(trans, "T");
  nounit = lsame_(diag, "N");

  /* Set up the start point in X if the increment is not unity. This */
  /* will be ( N - 1 )*INCX too small for descending loops. */

  if (*incx <= 0) {
    kx = 1 - (*n - 1) * *incx;
  } else if (*incx != 1) {
    kx = 1;
  }

  /* Start the operations. In this version the elements of A are */
  /* accessed sequentially with one pass through A. */

  if (lsame_(trans, "N")) {
    /* Form x := A*x. */

    if (lsame_(uplo, "U")) {
      kplus1 = *k + 1;
      if (*incx == 1) {
        i__1 = *n;
        for (j = 1; j <= i__1; ++j) {
          i__2 = j;
          if (x[i__2].r != 0.f || x[i__2].i != 0.f) {
            i__2 = j;
            temp.r = x[i__2].r, temp.i = x[i__2].i;
            l = kplus1 - j;
            /* Computing MAX */
            i__2 = 1, i__3 = j - *k;
            i__4 = j - 1;
            for (i__ = max(i__2, i__3); i__ <= i__4; ++i__) {
              i__2 = i__;
              i__3 = i__;
              i__5 = l + i__ + j * a_dim1;
              q__2.r = temp.r * a[i__5].r - temp.i * a[i__5].i, q__2.i = temp.r * a[i__5].i + temp.i * a[i__5].r;
              q__1.r = x[i__3].r + q__2.r, q__1.i = x[i__3].i + q__2.i;
              x[i__2].r = q__1.r, x[i__2].i = q__1.i;
              /* L10: */
            }
            if (nounit) {
              i__4 = j;
              i__2 = j;
              i__3 = kplus1 + j * a_dim1;
              q__1.r = x[i__2].r * a[i__3].r - x[i__2].i * a[i__3].i,
              q__1.i = x[i__2].r * a[i__3].i + x[i__2].i * a[i__3].r;
              x[i__4].r = q__1.r, x[i__4].i = q__1.i;
            }
          }
          /* L20: */
        }
      } else {
        jx = kx;
        i__1 = *n;
        for (j = 1; j <= i__1; ++j) {
          i__4 = jx;
          if (x[i__4].r != 0.f || x[i__4].i != 0.f) {
            i__4 = jx;
            temp.r = x[i__4].r, temp.i = x[i__4].i;
            ix = kx;
            l = kplus1 - j;
            /* Computing MAX */
            i__4 = 1, i__2 = j - *k;
            i__3 = j - 1;
            for (i__ = max(i__4, i__2); i__ <= i__3; ++i__) {
              i__4 = ix;
              i__2 = ix;
              i__5 = l + i__ + j * a_dim1;
              q__2.r = temp.r * a[i__5].r - temp.i * a[i__5].i, q__2.i = temp.r * a[i__5].i + temp.i * a[i__5].r;
              q__1.r = x[i__2].r + q__2.r, q__1.i = x[i__2].i + q__2.i;
              x[i__4].r = q__1.r, x[i__4].i = q__1.i;
              ix += *incx;
              /* L30: */
            }
            if (nounit) {
              i__3 = jx;
              i__4 = jx;
              i__2 = kplus1 + j * a_dim1;
              q__1.r = x[i__4].r * a[i__2].r - x[i__4].i * a[i__2].i,
              q__1.i = x[i__4].r * a[i__2].i + x[i__4].i * a[i__2].r;
              x[i__3].r = q__1.r, x[i__3].i = q__1.i;
            }
          }
          jx += *incx;
          if (j > *k) {
            kx += *incx;
          }
          /* L40: */
        }
      }
    } else {
      if (*incx == 1) {
        for (j = *n; j >= 1; --j) {
          i__1 = j;
          if (x[i__1].r != 0.f || x[i__1].i != 0.f) {
            i__1 = j;
            temp.r = x[i__1].r, temp.i = x[i__1].i;
            l = 1 - j;
            /* Computing MIN */
            i__1 = *n, i__3 = j + *k;
            i__4 = j + 1;
            for (i__ = min(i__1, i__3); i__ >= i__4; --i__) {
              i__1 = i__;
              i__3 = i__;
              i__2 = l + i__ + j * a_dim1;
              q__2.r = temp.r * a[i__2].r - temp.i * a[i__2].i, q__2.i = temp.r * a[i__2].i + temp.i * a[i__2].r;
              q__1.r = x[i__3].r + q__2.r, q__1.i = x[i__3].i + q__2.i;
              x[i__1].r = q__1.r, x[i__1].i = q__1.i;
              /* L50: */
            }
            if (nounit) {
              i__4 = j;
              i__1 = j;
              i__3 = j * a_dim1 + 1;
              q__1.r = x[i__1].r * a[i__3].r - x[i__1].i * a[i__3].i,
              q__1.i = x[i__1].r * a[i__3].i + x[i__1].i * a[i__3].r;
              x[i__4].r = q__1.r, x[i__4].i = q__1.i;
            }
          }
          /* L60: */
        }
      } else {
        kx += (*n - 1) * *incx;
        jx = kx;
        for (j = *n; j >= 1; --j) {
          i__4 = jx;
          if (x[i__4].r != 0.f || x[i__4].i != 0.f) {
            i__4 = jx;
            temp.r = x[i__4].r, temp.i = x[i__4].i;
            ix = kx;
            l = 1 - j;
            /* Computing MIN */
            i__4 = *n, i__1 = j + *k;
            i__3 = j + 1;
            for (i__ = min(i__4, i__1); i__ >= i__3; --i__) {
              i__4 = ix;
              i__1 = ix;
              i__2 = l + i__ + j * a_dim1;
              q__2.r = temp.r * a[i__2].r - temp.i * a[i__2].i, q__2.i = temp.r * a[i__2].i + temp.i * a[i__2].r;
              q__1.r = x[i__1].r + q__2.r, q__1.i = x[i__1].i + q__2.i;
              x[i__4].r = q__1.r, x[i__4].i = q__1.i;
              ix -= *incx;
              /* L70: */
            }
            if (nounit) {
              i__3 = jx;
              i__4 = jx;
              i__1 = j * a_dim1 + 1;
              q__1.r = x[i__4].r * a[i__1].r - x[i__4].i * a[i__1].i,
              q__1.i = x[i__4].r * a[i__1].i + x[i__4].i * a[i__1].r;
              x[i__3].r = q__1.r, x[i__3].i = q__1.i;
            }
          }
          jx -= *incx;
          if (*n - j >= *k) {
            kx -= *incx;
          }
          /* L80: */
        }
      }
    }
  } else {
    /* Form x := A'*x or x := conjg( A' )*x. */

    if (lsame_(uplo, "U")) {
      kplus1 = *k + 1;
      if (*incx == 1) {
        for (j = *n; j >= 1; --j) {
          i__3 = j;
          temp.r = x[i__3].r, temp.i = x[i__3].i;
          l = kplus1 - j;
          if (noconj) {
            if (nounit) {
              i__3 = kplus1 + j * a_dim1;
              q__1.r = temp.r * a[i__3].r - temp.i * a[i__3].i, q__1.i = temp.r * a[i__3].i + temp.i * a[i__3].r;
              temp.r = q__1.r, temp.i = q__1.i;
            }
            /* Computing MAX */
            i__4 = 1, i__1 = j - *k;
            i__3 = max(i__4, i__1);
            for (i__ = j - 1; i__ >= i__3; --i__) {
              i__4 = l + i__ + j * a_dim1;
              i__1 = i__;
              q__2.r = a[i__4].r * x[i__1].r - a[i__4].i * x[i__1].i,
              q__2.i = a[i__4].r * x[i__1].i + a[i__4].i * x[i__1].r;
              q__1.r = temp.r + q__2.r, q__1.i = temp.i + q__2.i;
              temp.r = q__1.r, temp.i = q__1.i;
              /* L90: */
            }
          } else {
            if (nounit) {
              r_cnjg(&q__2, &a[kplus1 + j * a_dim1]);
              q__1.r = temp.r * q__2.r - temp.i * q__2.i, q__1.i = temp.r * q__2.i + temp.i * q__2.r;
              temp.r = q__1.r, temp.i = q__1.i;
            }
            /* Computing MAX */
            i__4 = 1, i__1 = j - *k;
            i__3 = max(i__4, i__1);
            for (i__ = j - 1; i__ >= i__3; --i__) {
              r_cnjg(&q__3, &a[l + i__ + j * a_dim1]);
              i__4 = i__;
              q__2.r = q__3.r * x[i__4].r - q__3.i * x[i__4].i, q__2.i = q__3.r * x[i__4].i + q__3.i * x[i__4].r;
              q__1.r = temp.r + q__2.r, q__1.i = temp.i + q__2.i;
              temp.r = q__1.r, temp.i = q__1.i;
              /* L100: */
            }
          }
          i__3 = j;
          x[i__3].r = temp.r, x[i__3].i = temp.i;
          /* L110: */
        }
      } else {
        kx += (*n - 1) * *incx;
        jx = kx;
        for (j = *n; j >= 1; --j) {
          i__3 = jx;
          temp.r = x[i__3].r, temp.i = x[i__3].i;
          kx -= *incx;
          ix = kx;
          l = kplus1 - j;
          if (noconj) {
            if (nounit) {
              i__3 = kplus1 + j * a_dim1;
              q__1.r = temp.r * a[i__3].r - temp.i * a[i__3].i, q__1.i = temp.r * a[i__3].i + temp.i * a[i__3].r;
              temp.r = q__1.r, temp.i = q__1.i;
            }
            /* Computing MAX */
            i__4 = 1, i__1 = j - *k;
            i__3 = max(i__4, i__1);
            for (i__ = j - 1; i__ >= i__3; --i__) {
              i__4 = l + i__ + j * a_dim1;
              i__1 = ix;
              q__2.r = a[i__4].r * x[i__1].r - a[i__4].i * x[i__1].i,
              q__2.i = a[i__4].r * x[i__1].i + a[i__4].i * x[i__1].r;
              q__1.r = temp.r + q__2.r, q__1.i = temp.i + q__2.i;
              temp.r = q__1.r, temp.i = q__1.i;
|
||||
ix -= *incx;
|
||||
/* L120: */
|
||||
}
|
||||
} else {
|
||||
if (nounit) {
|
||||
r_cnjg(&q__2, &a[kplus1 + j * a_dim1]);
|
||||
q__1.r = temp.r * q__2.r - temp.i * q__2.i, q__1.i = temp.r * q__2.i + temp.i * q__2.r;
|
||||
temp.r = q__1.r, temp.i = q__1.i;
|
||||
}
|
||||
/* Computing MAX */
|
||||
i__4 = 1, i__1 = j - *k;
|
||||
i__3 = max(i__4, i__1);
|
||||
for (i__ = j - 1; i__ >= i__3; --i__) {
|
||||
r_cnjg(&q__3, &a[l + i__ + j * a_dim1]);
|
||||
i__4 = ix;
|
||||
q__2.r = q__3.r * x[i__4].r - q__3.i * x[i__4].i, q__2.i = q__3.r * x[i__4].i + q__3.i * x[i__4].r;
|
||||
q__1.r = temp.r + q__2.r, q__1.i = temp.i + q__2.i;
|
||||
temp.r = q__1.r, temp.i = q__1.i;
|
||||
ix -= *incx;
|
||||
/* L130: */
|
||||
}
|
||||
}
|
||||
i__3 = jx;
|
||||
x[i__3].r = temp.r, x[i__3].i = temp.i;
|
||||
jx -= *incx;
|
||||
/* L140: */
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if (*incx == 1) {
|
||||
i__3 = *n;
|
||||
for (j = 1; j <= i__3; ++j) {
|
||||
i__4 = j;
|
||||
temp.r = x[i__4].r, temp.i = x[i__4].i;
|
||||
l = 1 - j;
|
||||
if (noconj) {
|
||||
if (nounit) {
|
||||
i__4 = j * a_dim1 + 1;
|
||||
q__1.r = temp.r * a[i__4].r - temp.i * a[i__4].i, q__1.i = temp.r * a[i__4].i + temp.i * a[i__4].r;
|
||||
temp.r = q__1.r, temp.i = q__1.i;
|
||||
}
|
||||
/* Computing MIN */
|
||||
i__1 = *n, i__2 = j + *k;
|
||||
i__4 = min(i__1, i__2);
|
||||
for (i__ = j + 1; i__ <= i__4; ++i__) {
|
||||
i__1 = l + i__ + j * a_dim1;
|
||||
i__2 = i__;
|
||||
q__2.r = a[i__1].r * x[i__2].r - a[i__1].i * x[i__2].i,
|
||||
q__2.i = a[i__1].r * x[i__2].i + a[i__1].i * x[i__2].r;
|
||||
q__1.r = temp.r + q__2.r, q__1.i = temp.i + q__2.i;
|
||||
temp.r = q__1.r, temp.i = q__1.i;
|
||||
/* L150: */
|
||||
}
|
||||
} else {
|
||||
if (nounit) {
|
||||
r_cnjg(&q__2, &a[j * a_dim1 + 1]);
|
||||
q__1.r = temp.r * q__2.r - temp.i * q__2.i, q__1.i = temp.r * q__2.i + temp.i * q__2.r;
|
||||
temp.r = q__1.r, temp.i = q__1.i;
|
||||
}
|
||||
/* Computing MIN */
|
||||
i__1 = *n, i__2 = j + *k;
|
||||
i__4 = min(i__1, i__2);
|
||||
for (i__ = j + 1; i__ <= i__4; ++i__) {
|
||||
r_cnjg(&q__3, &a[l + i__ + j * a_dim1]);
|
||||
i__1 = i__;
|
||||
q__2.r = q__3.r * x[i__1].r - q__3.i * x[i__1].i, q__2.i = q__3.r * x[i__1].i + q__3.i * x[i__1].r;
|
||||
q__1.r = temp.r + q__2.r, q__1.i = temp.i + q__2.i;
|
||||
temp.r = q__1.r, temp.i = q__1.i;
|
||||
/* L160: */
|
||||
}
|
||||
}
|
||||
i__4 = j;
|
||||
x[i__4].r = temp.r, x[i__4].i = temp.i;
|
||||
/* L170: */
|
||||
}
|
||||
} else {
|
||||
jx = kx;
|
||||
i__3 = *n;
|
||||
for (j = 1; j <= i__3; ++j) {
|
||||
i__4 = jx;
|
||||
temp.r = x[i__4].r, temp.i = x[i__4].i;
|
||||
kx += *incx;
|
||||
ix = kx;
|
||||
l = 1 - j;
|
||||
if (noconj) {
|
||||
if (nounit) {
|
||||
i__4 = j * a_dim1 + 1;
|
||||
q__1.r = temp.r * a[i__4].r - temp.i * a[i__4].i, q__1.i = temp.r * a[i__4].i + temp.i * a[i__4].r;
|
||||
temp.r = q__1.r, temp.i = q__1.i;
|
||||
}
|
||||
/* Computing MIN */
|
||||
i__1 = *n, i__2 = j + *k;
|
||||
i__4 = min(i__1, i__2);
|
||||
for (i__ = j + 1; i__ <= i__4; ++i__) {
|
||||
i__1 = l + i__ + j * a_dim1;
|
||||
i__2 = ix;
|
||||
q__2.r = a[i__1].r * x[i__2].r - a[i__1].i * x[i__2].i,
|
||||
q__2.i = a[i__1].r * x[i__2].i + a[i__1].i * x[i__2].r;
|
||||
q__1.r = temp.r + q__2.r, q__1.i = temp.i + q__2.i;
|
||||
temp.r = q__1.r, temp.i = q__1.i;
|
||||
ix += *incx;
|
||||
/* L180: */
|
||||
}
|
||||
} else {
|
||||
if (nounit) {
|
||||
r_cnjg(&q__2, &a[j * a_dim1 + 1]);
|
||||
q__1.r = temp.r * q__2.r - temp.i * q__2.i, q__1.i = temp.r * q__2.i + temp.i * q__2.r;
|
||||
temp.r = q__1.r, temp.i = q__1.i;
|
||||
}
|
||||
/* Computing MIN */
|
||||
i__1 = *n, i__2 = j + *k;
|
||||
i__4 = min(i__1, i__2);
|
||||
for (i__ = j + 1; i__ <= i__4; ++i__) {
|
||||
r_cnjg(&q__3, &a[l + i__ + j * a_dim1]);
|
||||
i__1 = ix;
|
||||
q__2.r = q__3.r * x[i__1].r - q__3.i * x[i__1].i, q__2.i = q__3.r * x[i__1].i + q__3.i * x[i__1].r;
|
||||
q__1.r = temp.r + q__2.r, q__1.i = temp.i + q__2.i;
|
||||
temp.r = q__1.r, temp.i = q__1.i;
|
||||
ix += *incx;
|
||||
/* L190: */
|
||||
}
|
||||
}
|
||||
i__4 = jx;
|
||||
x[i__4].r = temp.r, x[i__4].i = temp.i;
|
||||
jx += *incx;
|
||||
/* L200: */
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* End of CTBMV . */
|
||||
|
||||
} /* ctbmv_ */
|
||||
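For orientation, here is a sketch of a driver for the routine above. The `main` is illustrative and not part of the diff, and the `ctbmv_` prototype is an assumption modeled on the standard CTBMV interface and the sibling `dtbmv_` signature later in this changeset; it needs the f2c `datatypes.h` shown below and linking against the translated objects.

#include <stdio.h>
#include "datatypes.h"

extern void ctbmv_(char *uplo, char *trans, char *diag, integer *n, integer *k,
                   complex *a, integer *lda, complex *x, integer *incx);

int main(void) {
  /* A = [1 2; 0 3] (zero imaginary parts) stored as an upper triangular
     band matrix with k = 1: the diagonal sits in row k+1 of the band
     array, so column j holds { A(j-1,j), A(j,j) }; a[0] is unreferenced. */
  complex a[4] = {{0.f, 0.f}, {1.f, 0.f}, {2.f, 0.f}, {3.f, 0.f}};
  complex x[2] = {{1.f, 0.f}, {1.f, 0.f}};
  integer n = 2, k = 1, lda = 2, incx = 1;
  char uplo = 'U', trans = 'N', diag = 'N';
  ctbmv_(&uplo, &trans, &diag, &n, &k, a, &lda, x, &incx); /* x := A*x */
  printf("(%g,%g) (%g,%g)\n", x[0].r, x[0].i, x[1].r, x[1].i); /* (3,0) (3,0) */
  return 0;
}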
@@ -1,27 +0,0 @@

/* This contains a limited subset of the typedefs exposed by f2c
   for use by the Eigen BLAS C-only implementation.
*/

#ifndef __EIGEN_DATATYPES_H__
#define __EIGEN_DATATYPES_H__

typedef int integer;
typedef unsigned int uinteger;
typedef float real;
typedef double doublereal;
typedef struct {
  real r, i;
} complex;
typedef struct {
  doublereal r, i;
} doublecomplex;
typedef int logical;

#define abs(x) ((x) >= 0 ? (x) : -(x))
#define dabs(x) (doublereal) abs(x)
#define min(a, b) ((a) <= (b) ? (a) : (b))
#define max(a, b) ((a) >= (b) ? (a) : (b))
#define dmin(a, b) (doublereal) min(a, b)
#define dmax(a, b) (doublereal) max(a, b)

#endif
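Because Fortran COMPLEX maps to the plain structs above, the translated routines spell every complex product out component-wise. A minimal sketch of that convention (the `main` is illustrative, not part of the library):

#include <stdio.h>
#include "datatypes.h"  /* the f2c typedefs shown above */

int main(void) {
  /* (1 + 2i) * (3 + 4i), written component-wise exactly as f2c emits it */
  complex a = {1.f, 2.f}, b = {3.f, 4.f}, q__1;
  q__1.r = a.r * b.r - a.i * b.i, q__1.i = a.r * b.i + a.i * b.r;
  printf("(%g, %g)\n", q__1.r, q__1.i); /* prints (-5, 10) */
  return 0;
}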
213
blas/f2c/drotm.c
@@ -1,213 +0,0 @@

/* drotm.f -- translated by f2c (version 20100827).
   You must link the resulting object file with libf2c:
   on Microsoft Windows system, link with libf2c.lib;
   on Linux or Unix systems, link with .../path/to/libf2c.a -lm
   or, if you install libf2c.a in a standard place, with -lf2c -lm
   -- in that order, at the end of the command line, as in
   cc *.o -lf2c -lm
   Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,

   http://www.netlib.org/f2c/libf2c.zip
*/

#include "datatypes.h"

/* Subroutine */ void drotm_(integer *n, doublereal *dx, integer *incx, doublereal *dy, integer *incy,
                             doublereal *dparam) {
  /* Initialized data */

  static doublereal zero = 0.;
  static doublereal two = 2.;

  /* System generated locals */
  integer i__1, i__2;

  /* Local variables */
  integer i__;
  doublereal w, z__;
  integer kx, ky;
  doublereal dh11, dh12, dh21, dh22, dflag;
  integer nsteps;

  /* .. Scalar Arguments .. */
  /* .. */
  /* .. Array Arguments .. */
  /* .. */

  /* Purpose */
  /* ======= */

  /* APPLY THE MODIFIED GIVENS TRANSFORMATION, H, TO THE 2 BY N MATRIX */

  /* (DX**T) , WHERE **T INDICATES TRANSPOSE. THE ELEMENTS OF DX ARE IN */
  /* (DY**T) */

  /* DX(LX+I*INCX), I = 0 TO N-1, WHERE LX = 1 IF INCX .GE. 0, ELSE */
  /* LX = (-INCX)*N, AND SIMILARLY FOR SY USING LY AND INCY. */
  /* WITH DPARAM(1)=DFLAG, H HAS ONE OF THE FOLLOWING FORMS.. */

  /* DFLAG=-1.D0 DFLAG=0.D0 DFLAG=1.D0 DFLAG=-2.D0 */

  /* (DH11 DH12) (1.D0 DH12) (DH11 1.D0) (1.D0 0.D0) */
  /* H=( ) ( ) ( ) ( ) */
  /* (DH21 DH22), (DH21 1.D0), (-1.D0 DH22), (0.D0 1.D0). */
  /* SEE DROTMG FOR A DESCRIPTION OF DATA STORAGE IN DPARAM. */

  /* Arguments */
  /* ========= */

  /* N (input) INTEGER */
  /* number of elements in input vector(s) */

  /* DX (input/output) DOUBLE PRECISION array, dimension N */
  /* double precision vector with N elements */

  /* INCX (input) INTEGER */
  /* storage spacing between elements of DX */

  /* DY (input/output) DOUBLE PRECISION array, dimension N */
  /* double precision vector with N elements */

  /* INCY (input) INTEGER */
  /* storage spacing between elements of DY */

  /* DPARAM (input/output) DOUBLE PRECISION array, dimension 5 */
  /* DPARAM(1)=DFLAG */
  /* DPARAM(2)=DH11 */
  /* DPARAM(3)=DH21 */
  /* DPARAM(4)=DH12 */
  /* DPARAM(5)=DH22 */

  /* ===================================================================== */

  /* .. Local Scalars .. */
  /* .. */
  /* .. Data statements .. */
  /* Parameter adjustments */
  --dparam;
  --dy;
  --dx;

  /* Function Body */
  /* .. */

  dflag = dparam[1];
  if (*n <= 0 || dflag + two == zero) {
    goto L140;
  }
  if (!(*incx == *incy && *incx > 0)) {
    goto L70;
  }

  nsteps = *n * *incx;
  if (dflag < 0.) {
    goto L50;
  } else if (dflag == 0) {
    goto L10;
  } else {
    goto L30;
  }
L10:
  dh12 = dparam[4];
  dh21 = dparam[3];
  i__1 = nsteps;
  i__2 = *incx;
  for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
    w = dx[i__];
    z__ = dy[i__];
    dx[i__] = w + z__ * dh12;
    dy[i__] = w * dh21 + z__;
    /* L20: */
  }
  goto L140;
L30:
  dh11 = dparam[2];
  dh22 = dparam[5];
  i__2 = nsteps;
  i__1 = *incx;
  for (i__ = 1; i__1 < 0 ? i__ >= i__2 : i__ <= i__2; i__ += i__1) {
    w = dx[i__];
    z__ = dy[i__];
    dx[i__] = w * dh11 + z__;
    dy[i__] = -w + dh22 * z__;
    /* L40: */
  }
  goto L140;
L50:
  dh11 = dparam[2];
  dh12 = dparam[4];
  dh21 = dparam[3];
  dh22 = dparam[5];
  i__1 = nsteps;
  i__2 = *incx;
  for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
    w = dx[i__];
    z__ = dy[i__];
    dx[i__] = w * dh11 + z__ * dh12;
    dy[i__] = w * dh21 + z__ * dh22;
    /* L60: */
  }
  goto L140;
L70:
  kx = 1;
  ky = 1;
  if (*incx < 0) {
    kx = (1 - *n) * *incx + 1;
  }
  if (*incy < 0) {
    ky = (1 - *n) * *incy + 1;
  }

  if (dflag < 0.) {
    goto L120;
  } else if (dflag == 0) {
    goto L80;
  } else {
    goto L100;
  }
L80:
  dh12 = dparam[4];
  dh21 = dparam[3];
  i__2 = *n;
  for (i__ = 1; i__ <= i__2; ++i__) {
    w = dx[kx];
    z__ = dy[ky];
    dx[kx] = w + z__ * dh12;
    dy[ky] = w * dh21 + z__;
    kx += *incx;
    ky += *incy;
    /* L90: */
  }
  goto L140;
L100:
  dh11 = dparam[2];
  dh22 = dparam[5];
  i__2 = *n;
  for (i__ = 1; i__ <= i__2; ++i__) {
    w = dx[kx];
    z__ = dy[ky];
    dx[kx] = w * dh11 + z__;
    dy[ky] = -w + dh22 * z__;
    kx += *incx;
    ky += *incy;
    /* L110: */
  }
  goto L140;
L120:
  dh11 = dparam[2];
  dh12 = dparam[4];
  dh21 = dparam[3];
  dh22 = dparam[5];
  i__2 = *n;
  for (i__ = 1; i__ <= i__2; ++i__) {
    w = dx[kx];
    z__ = dy[ky];
    dx[kx] = w * dh11 + z__ * dh12;
    dy[ky] = w * dh21 + z__ * dh22;
    kx += *incx;
    ky += *incy;
    /* L130: */
  }
L140:
  return;
} /* drotm_ */
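A small illustrative driver (the `main` is hypothetical, not part of the diff) showing the DPARAM layout documented above, using the signature from this file:

#include <stdio.h>
#include "datatypes.h"

extern void drotm_(integer *n, doublereal *dx, integer *incx,
                   doublereal *dy, integer *incy, doublereal *dparam);

int main(void) {
  /* dparam = { dflag, dh11, dh21, dh12, dh22 }; dflag = -1 selects the
     full 2x2 form, here H = [0 1; -1 0], a quarter-turn rotation. */
  doublereal dparam[5] = {-1., 0., -1., 1., 0.};
  doublereal dx[3] = {1., 2., 3.}, dy[3] = {4., 5., 6.};
  integer n = 3, inc = 1;
  drotm_(&n, dx, &inc, dy, &inc, dparam);
  printf("%g %g %g | %g %g %g\n", dx[0], dx[1], dx[2], dy[0], dy[1], dy[2]);
  /* dx = 4 5 6, dy = -1 -2 -3 */
  return 0;
}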
@@ -1,293 +0,0 @@

/* drotmg.f -- translated by f2c (version 20100827).
   You must link the resulting object file with libf2c:
   on Microsoft Windows system, link with libf2c.lib;
   on Linux or Unix systems, link with .../path/to/libf2c.a -lm
   or, if you install libf2c.a in a standard place, with -lf2c -lm
   -- in that order, at the end of the command line, as in
   cc *.o -lf2c -lm
   Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,

   http://www.netlib.org/f2c/libf2c.zip
*/

#include "datatypes.h"

/* Subroutine */ void drotmg_(doublereal *dd1, doublereal *dd2, doublereal *dx1, doublereal *dy1, doublereal *dparam) {
  /* Initialized data */

  static doublereal zero = 0.;
  static doublereal one = 1.;
  static doublereal two = 2.;
  static doublereal gam = 4096.;
  static doublereal gamsq = 16777216.;
  static doublereal rgamsq = 5.9604645e-8;

  /* Format strings */
  static char fmt_120[] = "";
  static char fmt_150[] = "";
  static char fmt_180[] = "";
  static char fmt_210[] = "";

  /* System generated locals */
  doublereal d__1;

  /* Local variables */
  doublereal du, dp1, dp2, dq1, dq2, dh11, dh12, dh21, dh22;
  integer igo;
  doublereal dflag, dtemp;

  /* Assigned format variables */
  static char *igo_fmt;
  (void)igo_fmt;

  /* .. Scalar Arguments .. */
  /* .. */
  /* .. Array Arguments .. */
  /* .. */

  /* Purpose */
  /* ======= */

  /* CONSTRUCT THE MODIFIED GIVENS TRANSFORMATION MATRIX H WHICH ZEROS */
  /* THE SECOND COMPONENT OF THE 2-VECTOR (DSQRT(DD1)*DX1,DSQRT(DD2)* */
  /* DY2)**T. */
  /* WITH DPARAM(1)=DFLAG, H HAS ONE OF THE FOLLOWING FORMS.. */

  /* DFLAG=-1.D0 DFLAG=0.D0 DFLAG=1.D0 DFLAG=-2.D0 */

  /* (DH11 DH12) (1.D0 DH12) (DH11 1.D0) (1.D0 0.D0) */
  /* H=( ) ( ) ( ) ( ) */
  /* (DH21 DH22), (DH21 1.D0), (-1.D0 DH22), (0.D0 1.D0). */
  /* LOCATIONS 2-4 OF DPARAM CONTAIN DH11, DH21, DH12, AND DH22 */
  /* RESPECTIVELY. (VALUES OF 1.D0, -1.D0, OR 0.D0 IMPLIED BY THE */
  /* VALUE OF DPARAM(1) ARE NOT STORED IN DPARAM.) */

  /* THE VALUES OF GAMSQ AND RGAMSQ SET IN THE DATA STATEMENT MAY BE */
  /* INEXACT. THIS IS OK AS THEY ARE ONLY USED FOR TESTING THE SIZE */
  /* OF DD1 AND DD2. ALL ACTUAL SCALING OF DATA IS DONE USING GAM. */

  /* Arguments */
  /* ========= */

  /* DD1 (input/output) DOUBLE PRECISION */

  /* DD2 (input/output) DOUBLE PRECISION */

  /* DX1 (input/output) DOUBLE PRECISION */

  /* DY1 (input) DOUBLE PRECISION */

  /* DPARAM (input/output) DOUBLE PRECISION array, dimension 5 */
  /* DPARAM(1)=DFLAG */
  /* DPARAM(2)=DH11 */
  /* DPARAM(3)=DH21 */
  /* DPARAM(4)=DH12 */
  /* DPARAM(5)=DH22 */

  /* ===================================================================== */

  /* .. Local Scalars .. */
  /* .. */
  /* .. Intrinsic Functions .. */
  /* .. */
  /* .. Data statements .. */

  /* Parameter adjustments */
  --dparam;

  /* Function Body */
  /* .. */
  if (!(*dd1 < zero)) {
    goto L10;
  }
  /* GO ZERO-H-D-AND-DX1.. */
  goto L60;
L10:
  /* CASE-DD1-NONNEGATIVE */
  dp2 = *dd2 * *dy1;
  if (!(dp2 == zero)) {
    goto L20;
  }
  dflag = -two;
  goto L260;
  /* REGULAR-CASE.. */
L20:
  dp1 = *dd1 * *dx1;
  dq2 = dp2 * *dy1;
  dq1 = dp1 * *dx1;

  if (!(abs(dq1) > abs(dq2))) {
    goto L40;
  }
  dh21 = -(*dy1) / *dx1;
  dh12 = dp2 / dp1;

  du = one - dh12 * dh21;

  if (!(du <= zero)) {
    goto L30;
  }
  /* GO ZERO-H-D-AND-DX1.. */
  goto L60;
L30:
  dflag = zero;
  *dd1 /= du;
  *dd2 /= du;
  *dx1 *= du;
  /* GO SCALE-CHECK.. */
  goto L100;
L40:
  if (!(dq2 < zero)) {
    goto L50;
  }
  /* GO ZERO-H-D-AND-DX1.. */
  goto L60;
L50:
  dflag = one;
  dh11 = dp1 / dp2;
  dh22 = *dx1 / *dy1;
  du = one + dh11 * dh22;
  dtemp = *dd2 / du;
  *dd2 = *dd1 / du;
  *dd1 = dtemp;
  *dx1 = *dy1 * du;
  /* GO SCALE-CHECK */
  goto L100;
  /* PROCEDURE..ZERO-H-D-AND-DX1.. */
L60:
  dflag = -one;
  dh11 = zero;
  dh12 = zero;
  dh21 = zero;
  dh22 = zero;

  *dd1 = zero;
  *dd2 = zero;
  *dx1 = zero;
  /* RETURN.. */
  goto L220;
  /* PROCEDURE..FIX-H.. */
L70:
  if (!(dflag >= zero)) {
    goto L90;
  }

  if (!(dflag == zero)) {
    goto L80;
  }
  dh11 = one;
  dh22 = one;
  dflag = -one;
  goto L90;
L80:
  dh21 = -one;
  dh12 = one;
  dflag = -one;
L90:
  switch (igo) {
    case 0:
      goto L120;
    case 1:
      goto L150;
    case 2:
      goto L180;
    case 3:
      goto L210;
  }
  /* PROCEDURE..SCALE-CHECK */
L100:
L110:
  if (!(*dd1 <= rgamsq)) {
    goto L130;
  }
  if (*dd1 == zero) {
    goto L160;
  }
  igo = 0;
  igo_fmt = fmt_120;
  /* FIX-H.. */
  goto L70;
L120:
  /* Computing 2nd power */
  d__1 = gam;
  *dd1 *= d__1 * d__1;
  *dx1 /= gam;
  dh11 /= gam;
  dh12 /= gam;
  goto L110;
L130:
L140:
  if (!(*dd1 >= gamsq)) {
    goto L160;
  }
  igo = 1;
  igo_fmt = fmt_150;
  /* FIX-H.. */
  goto L70;
L150:
  /* Computing 2nd power */
  d__1 = gam;
  *dd1 /= d__1 * d__1;
  *dx1 *= gam;
  dh11 *= gam;
  dh12 *= gam;
  goto L140;
L160:
L170:
  if (!(abs(*dd2) <= rgamsq)) {
    goto L190;
  }
  if (*dd2 == zero) {
    goto L220;
  }
  igo = 2;
  igo_fmt = fmt_180;
  /* FIX-H.. */
  goto L70;
L180:
  /* Computing 2nd power */
  d__1 = gam;
  *dd2 *= d__1 * d__1;
  dh21 /= gam;
  dh22 /= gam;
  goto L170;
L190:
L200:
  if (!(abs(*dd2) >= gamsq)) {
    goto L220;
  }
  igo = 3;
  igo_fmt = fmt_210;
  /* FIX-H.. */
  goto L70;
L210:
  /* Computing 2nd power */
  d__1 = gam;
  *dd2 /= d__1 * d__1;
  dh21 *= gam;
  dh22 *= gam;
  goto L200;
L220:
  if (dflag < 0.) {
    goto L250;
  } else if (dflag == 0) {
    goto L230;
  } else {
    goto L240;
  }
L230:
  dparam[3] = dh21;
  dparam[4] = dh12;
  goto L260;
L240:
  dparam[2] = dh11;
  dparam[5] = dh22;
  goto L260;
L250:
  dparam[2] = dh11;
  dparam[3] = dh21;
  dparam[4] = dh12;
  dparam[5] = dh22;
L260:
  dparam[1] = dflag;
} /* drotmg_ */
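The two routines are meant to be used as a pair: drotmg_ builds the transformation, drotm_ applies it. A sketch (hypothetical driver, using the prototypes shown in this diff):

#include <stdio.h>
#include "datatypes.h"

extern void drotmg_(doublereal *dd1, doublereal *dd2, doublereal *dx1,
                    doublereal *dy1, doublereal *dparam);
extern void drotm_(integer *n, doublereal *dx, integer *incx,
                   doublereal *dy, integer *incy, doublereal *dparam);

int main(void) {
  doublereal dd1 = 1., dd2 = 1., dx1 = 3., dy1 = 4., dparam[5];
  doublereal x[1] = {3.}, y[1] = {4.};
  integer n = 1, inc = 1;
  drotmg_(&dd1, &dd2, &dx1, &dy1, dparam); /* build H that zeros the 2nd component */
  drotm_(&n, x, &inc, y, &inc, dparam);    /* apply it to the pair (x, y) */
  printf("dflag = %g, y[0] = %g\n", dparam[0], y[0]); /* y[0] is now 0 */
  return 0;
}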
356
blas/f2c/dsbmv.c
@@ -1,356 +0,0 @@

/* dsbmv.f -- translated by f2c (version 20100827).
   You must link the resulting object file with libf2c:
   on Microsoft Windows system, link with libf2c.lib;
   on Linux or Unix systems, link with .../path/to/libf2c.a -lm
   or, if you install libf2c.a in a standard place, with -lf2c -lm
   -- in that order, at the end of the command line, as in
   cc *.o -lf2c -lm
   Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,

   http://www.netlib.org/f2c/libf2c.zip
*/

#include "datatypes.h"

/* Subroutine */ void dsbmv_(char *uplo, integer *n, integer *k, doublereal *alpha, doublereal *a, integer *lda,
                             doublereal *x, integer *incx, doublereal *beta, doublereal *y, integer *incy) {
  /* System generated locals */
  integer a_dim1, a_offset, i__1, i__2, i__3, i__4;

  /* Local variables */
  integer i__, j, l, ix, iy, jx, jy, kx, ky, info;
  doublereal temp1, temp2;
  extern logical lsame_(char *, char *);
  integer kplus1;
  extern /* Subroutine */ void xerbla_(const char *, integer *);

  /* .. Scalar Arguments .. */
  /* .. */
  /* .. Array Arguments .. */
  /* .. */

  /* Purpose */
  /* ======= */

  /* DSBMV performs the matrix-vector operation */

  /* y := alpha*A*x + beta*y, */

  /* where alpha and beta are scalars, x and y are n element vectors and */
  /* A is an n by n symmetric band matrix, with k super-diagonals. */

  /* Arguments */
  /* ========== */

  /* UPLO - CHARACTER*1. */
  /* On entry, UPLO specifies whether the upper or lower */
  /* triangular part of the band matrix A is being supplied as */
  /* follows: */

  /* UPLO = 'U' or 'u' The upper triangular part of A is */
  /* being supplied. */

  /* UPLO = 'L' or 'l' The lower triangular part of A is */
  /* being supplied. */

  /* Unchanged on exit. */

  /* N - INTEGER. */
  /* On entry, N specifies the order of the matrix A. */
  /* N must be at least zero. */
  /* Unchanged on exit. */

  /* K - INTEGER. */
  /* On entry, K specifies the number of super-diagonals of the */
  /* matrix A. K must satisfy 0 .le. K. */
  /* Unchanged on exit. */

  /* ALPHA - DOUBLE PRECISION. */
  /* On entry, ALPHA specifies the scalar alpha. */
  /* Unchanged on exit. */

  /* A - DOUBLE PRECISION array of DIMENSION ( LDA, n ). */
  /* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */
  /* by n part of the array A must contain the upper triangular */
  /* band part of the symmetric matrix, supplied column by */
  /* column, with the leading diagonal of the matrix in row */
  /* ( k + 1 ) of the array, the first super-diagonal starting at */
  /* position 2 in row k, and so on. The top left k by k triangle */
  /* of the array A is not referenced. */
  /* The following program segment will transfer the upper */
  /* triangular part of a symmetric band matrix from conventional */
  /* full matrix storage to band storage: */

  /* DO 20, J = 1, N */
  /* M = K + 1 - J */
  /* DO 10, I = MAX( 1, J - K ), J */
  /* A( M + I, J ) = matrix( I, J ) */
  /* 10 CONTINUE */
  /* 20 CONTINUE */

  /* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */
  /* by n part of the array A must contain the lower triangular */
  /* band part of the symmetric matrix, supplied column by */
  /* column, with the leading diagonal of the matrix in row 1 of */
  /* the array, the first sub-diagonal starting at position 1 in */
  /* row 2, and so on. The bottom right k by k triangle of the */
  /* array A is not referenced. */
  /* The following program segment will transfer the lower */
  /* triangular part of a symmetric band matrix from conventional */
  /* full matrix storage to band storage: */

  /* DO 20, J = 1, N */
  /* M = 1 - J */
  /* DO 10, I = J, MIN( N, J + K ) */
  /* A( M + I, J ) = matrix( I, J ) */
  /* 10 CONTINUE */
  /* 20 CONTINUE */

  /* Unchanged on exit. */

  /* LDA - INTEGER. */
  /* On entry, LDA specifies the first dimension of A as declared */
  /* in the calling (sub) program. LDA must be at least */
  /* ( k + 1 ). */
  /* Unchanged on exit. */

  /* X - DOUBLE PRECISION array of DIMENSION at least */
  /* ( 1 + ( n - 1 )*abs( INCX ) ). */
  /* Before entry, the incremented array X must contain the */
  /* vector x. */
  /* Unchanged on exit. */

  /* INCX - INTEGER. */
  /* On entry, INCX specifies the increment for the elements of */
  /* X. INCX must not be zero. */
  /* Unchanged on exit. */

  /* BETA - DOUBLE PRECISION. */
  /* On entry, BETA specifies the scalar beta. */
  /* Unchanged on exit. */

  /* Y - DOUBLE PRECISION array of DIMENSION at least */
  /* ( 1 + ( n - 1 )*abs( INCY ) ). */
  /* Before entry, the incremented array Y must contain the */
  /* vector y. On exit, Y is overwritten by the updated vector y. */

  /* INCY - INTEGER. */
  /* On entry, INCY specifies the increment for the elements of */
  /* Y. INCY must not be zero. */
  /* Unchanged on exit. */

  /* Level 2 Blas routine. */

  /* -- Written on 22-October-1986. */
  /* Jack Dongarra, Argonne National Lab. */
  /* Jeremy Du Croz, Nag Central Office. */
  /* Sven Hammarling, Nag Central Office. */
  /* Richard Hanson, Sandia National Labs. */

  /* ===================================================================== */

  /* .. Parameters .. */
  /* .. */
  /* .. Local Scalars .. */
  /* .. */
  /* .. External Functions .. */
  /* .. */
  /* .. External Subroutines .. */
  /* .. */
  /* .. Intrinsic Functions .. */
  /* .. */

  /* Test the input parameters. */

  /* Parameter adjustments */
  a_dim1 = *lda;
  a_offset = 1 + a_dim1;
  a -= a_offset;
  --x;
  --y;

  /* Function Body */
  info = 0;
  if (!lsame_(uplo, "U") && !lsame_(uplo, "L")) {
    info = 1;
  } else if (*n < 0) {
    info = 2;
  } else if (*k < 0) {
    info = 3;
  } else if (*lda < *k + 1) {
    info = 6;
  } else if (*incx == 0) {
    info = 8;
  } else if (*incy == 0) {
    info = 11;
  }
  if (info != 0) {
    xerbla_("DSBMV ", &info);
    return;
  }

  /* Quick return if possible. */

  if (*n == 0 || (*alpha == 0. && *beta == 1.)) {
    return;
  }

  /* Set up the start points in X and Y. */

  if (*incx > 0) {
    kx = 1;
  } else {
    kx = 1 - (*n - 1) * *incx;
  }
  if (*incy > 0) {
    ky = 1;
  } else {
    ky = 1 - (*n - 1) * *incy;
  }

  /* Start the operations. In this version the elements of the array A */
  /* are accessed sequentially with one pass through A. */

  /* First form y := beta*y. */

  if (*beta != 1.) {
    if (*incy == 1) {
      if (*beta == 0.) {
        i__1 = *n;
        for (i__ = 1; i__ <= i__1; ++i__) {
          y[i__] = 0.;
          /* L10: */
        }
      } else {
        i__1 = *n;
        for (i__ = 1; i__ <= i__1; ++i__) {
          y[i__] = *beta * y[i__];
          /* L20: */
        }
      }
    } else {
      iy = ky;
      if (*beta == 0.) {
        i__1 = *n;
        for (i__ = 1; i__ <= i__1; ++i__) {
          y[iy] = 0.;
          iy += *incy;
          /* L30: */
        }
      } else {
        i__1 = *n;
        for (i__ = 1; i__ <= i__1; ++i__) {
          y[iy] = *beta * y[iy];
          iy += *incy;
          /* L40: */
        }
      }
    }
  }
  if (*alpha == 0.) {
    return;
  }
  if (lsame_(uplo, "U")) {
    /* Form y when upper triangle of A is stored. */

    kplus1 = *k + 1;
    if (*incx == 1 && *incy == 1) {
      i__1 = *n;
      for (j = 1; j <= i__1; ++j) {
        temp1 = *alpha * x[j];
        temp2 = 0.;
        l = kplus1 - j;
        /* Computing MAX */
        i__2 = 1, i__3 = j - *k;
        i__4 = j - 1;
        for (i__ = max(i__2, i__3); i__ <= i__4; ++i__) {
          y[i__] += temp1 * a[l + i__ + j * a_dim1];
          temp2 += a[l + i__ + j * a_dim1] * x[i__];
          /* L50: */
        }
        y[j] = y[j] + temp1 * a[kplus1 + j * a_dim1] + *alpha * temp2;
        /* L60: */
      }
    } else {
      jx = kx;
      jy = ky;
      i__1 = *n;
      for (j = 1; j <= i__1; ++j) {
        temp1 = *alpha * x[jx];
        temp2 = 0.;
        ix = kx;
        iy = ky;
        l = kplus1 - j;
        /* Computing MAX */
        i__4 = 1, i__2 = j - *k;
        i__3 = j - 1;
        for (i__ = max(i__4, i__2); i__ <= i__3; ++i__) {
          y[iy] += temp1 * a[l + i__ + j * a_dim1];
          temp2 += a[l + i__ + j * a_dim1] * x[ix];
          ix += *incx;
          iy += *incy;
          /* L70: */
        }
        y[jy] = y[jy] + temp1 * a[kplus1 + j * a_dim1] + *alpha * temp2;
        jx += *incx;
        jy += *incy;
        if (j > *k) {
          kx += *incx;
          ky += *incy;
        }
        /* L80: */
      }
    }
  } else {
    /* Form y when lower triangle of A is stored. */

    if (*incx == 1 && *incy == 1) {
      i__1 = *n;
      for (j = 1; j <= i__1; ++j) {
        temp1 = *alpha * x[j];
        temp2 = 0.;
        y[j] += temp1 * a[j * a_dim1 + 1];
        l = 1 - j;
        /* Computing MIN */
        i__4 = *n, i__2 = j + *k;
        i__3 = min(i__4, i__2);
        for (i__ = j + 1; i__ <= i__3; ++i__) {
          y[i__] += temp1 * a[l + i__ + j * a_dim1];
          temp2 += a[l + i__ + j * a_dim1] * x[i__];
          /* L90: */
        }
        y[j] += *alpha * temp2;
        /* L100: */
      }
    } else {
      jx = kx;
      jy = ky;
      i__1 = *n;
      for (j = 1; j <= i__1; ++j) {
        temp1 = *alpha * x[jx];
        temp2 = 0.;
        y[jy] += temp1 * a[j * a_dim1 + 1];
        l = 1 - j;
        ix = jx;
        iy = jy;
        /* Computing MIN */
        i__4 = *n, i__2 = j + *k;
        i__3 = min(i__4, i__2);
        for (i__ = j + 1; i__ <= i__3; ++i__) {
          ix += *incx;
          iy += *incy;
          y[iy] += temp1 * a[l + i__ + j * a_dim1];
          temp2 += a[l + i__ + j * a_dim1] * x[ix];
          /* L110: */
        }
        y[jy] += *alpha * temp2;
        jx += *incx;
        jy += *incy;
        /* L120: */
      }
    }
  }

  /* End of DSBMV . */

} /* dsbmv_ */
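A minimal driver sketch (hypothetical `main`, not part of the diff) that packs a tridiagonal matrix exactly as the band-storage comment above prescribes and calls the routine with its signature from this file:

#include <stdio.h>
#include "datatypes.h"

extern void dsbmv_(char *uplo, integer *n, integer *k, doublereal *alpha,
                   doublereal *a, integer *lda, doublereal *x, integer *incx,
                   doublereal *beta, doublereal *y, integer *incy);

int main(void) {
  /* Symmetric tridiagonal A = [2 1 0; 1 2 1; 0 1 2], upper band storage
     with k = 1: a(k+1-j+i, j) = A(i, j), diagonal in row k+1. */
  doublereal a[6] = {0., 2., 1., 2., 1., 2.}; /* a[0] is not referenced */
  doublereal x[3] = {1., 1., 1.}, y[3];
  doublereal alpha = 1., beta = 0.;
  integer n = 3, k = 1, lda = 2, inc = 1;
  char uplo = 'U';
  dsbmv_(&uplo, &n, &k, &alpha, a, &lda, x, &inc, &beta, y, &inc);
  printf("%g %g %g\n", y[0], y[1], y[2]); /* 3 4 3 */
  return 0;
}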
308
blas/f2c/dspmv.c
@@ -1,308 +0,0 @@

/* dspmv.f -- translated by f2c (version 20100827).
   You must link the resulting object file with libf2c:
   on Microsoft Windows system, link with libf2c.lib;
   on Linux or Unix systems, link with .../path/to/libf2c.a -lm
   or, if you install libf2c.a in a standard place, with -lf2c -lm
   -- in that order, at the end of the command line, as in
   cc *.o -lf2c -lm
   Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,

   http://www.netlib.org/f2c/libf2c.zip
*/

#include "datatypes.h"

/* Subroutine */ void dspmv_(char *uplo, integer *n, doublereal *alpha, doublereal *ap, doublereal *x, integer *incx,
                             doublereal *beta, doublereal *y, integer *incy) {
  /* System generated locals */
  integer i__1, i__2;

  /* Local variables */
  integer i__, j, k, kk, ix, iy, jx, jy, kx, ky, info;
  doublereal temp1, temp2;
  extern logical lsame_(char *, char *);
  extern /* Subroutine */ void xerbla_(const char *, integer *);

  /* .. Scalar Arguments .. */
  /* .. */
  /* .. Array Arguments .. */
  /* .. */

  /* Purpose */
  /* ======= */

  /* DSPMV performs the matrix-vector operation */

  /* y := alpha*A*x + beta*y, */

  /* where alpha and beta are scalars, x and y are n element vectors and */
  /* A is an n by n symmetric matrix, supplied in packed form. */

  /* Arguments */
  /* ========== */

  /* UPLO - CHARACTER*1. */
  /* On entry, UPLO specifies whether the upper or lower */
  /* triangular part of the matrix A is supplied in the packed */
  /* array AP as follows: */

  /* UPLO = 'U' or 'u' The upper triangular part of A is */
  /* supplied in AP. */

  /* UPLO = 'L' or 'l' The lower triangular part of A is */
  /* supplied in AP. */

  /* Unchanged on exit. */

  /* N - INTEGER. */
  /* On entry, N specifies the order of the matrix A. */
  /* N must be at least zero. */
  /* Unchanged on exit. */

  /* ALPHA - DOUBLE PRECISION. */
  /* On entry, ALPHA specifies the scalar alpha. */
  /* Unchanged on exit. */

  /* AP - DOUBLE PRECISION array of DIMENSION at least */
  /* ( ( n*( n + 1 ) )/2 ). */
  /* Before entry with UPLO = 'U' or 'u', the array AP must */
  /* contain the upper triangular part of the symmetric matrix */
  /* packed sequentially, column by column, so that AP( 1 ) */
  /* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) */
  /* and a( 2, 2 ) respectively, and so on. */
  /* Before entry with UPLO = 'L' or 'l', the array AP must */
  /* contain the lower triangular part of the symmetric matrix */
  /* packed sequentially, column by column, so that AP( 1 ) */
  /* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) */
  /* and a( 3, 1 ) respectively, and so on. */
  /* Unchanged on exit. */

  /* X - DOUBLE PRECISION array of dimension at least */
  /* ( 1 + ( n - 1 )*abs( INCX ) ). */
  /* Before entry, the incremented array X must contain the n */
  /* element vector x. */
  /* Unchanged on exit. */

  /* INCX - INTEGER. */
  /* On entry, INCX specifies the increment for the elements of */
  /* X. INCX must not be zero. */
  /* Unchanged on exit. */

  /* BETA - DOUBLE PRECISION. */
  /* On entry, BETA specifies the scalar beta. When BETA is */
  /* supplied as zero then Y need not be set on input. */
  /* Unchanged on exit. */

  /* Y - DOUBLE PRECISION array of dimension at least */
  /* ( 1 + ( n - 1 )*abs( INCY ) ). */
  /* Before entry, the incremented array Y must contain the n */
  /* element vector y. On exit, Y is overwritten by the updated */
  /* vector y. */

  /* INCY - INTEGER. */
  /* On entry, INCY specifies the increment for the elements of */
  /* Y. INCY must not be zero. */
  /* Unchanged on exit. */

  /* Further Details */
  /* =============== */

  /* Level 2 Blas routine. */

  /* -- Written on 22-October-1986. */
  /* Jack Dongarra, Argonne National Lab. */
  /* Jeremy Du Croz, Nag Central Office. */
  /* Sven Hammarling, Nag Central Office. */
  /* Richard Hanson, Sandia National Labs. */

  /* ===================================================================== */

  /* .. Parameters .. */
  /* .. */
  /* .. Local Scalars .. */
  /* .. */
  /* .. External Functions .. */
  /* .. */
  /* .. External Subroutines .. */
  /* .. */

  /* Test the input parameters. */

  /* Parameter adjustments */
  --y;
  --x;
  --ap;

  /* Function Body */
  info = 0;
  if (!lsame_(uplo, "U") && !lsame_(uplo, "L")) {
    info = 1;
  } else if (*n < 0) {
    info = 2;
  } else if (*incx == 0) {
    info = 6;
  } else if (*incy == 0) {
    info = 9;
  }
  if (info != 0) {
    xerbla_("DSPMV ", &info);
    return;
  }

  /* Quick return if possible. */

  if (*n == 0 || (*alpha == 0. && *beta == 1.)) {
    return;
  }

  /* Set up the start points in X and Y. */

  if (*incx > 0) {
    kx = 1;
  } else {
    kx = 1 - (*n - 1) * *incx;
  }
  if (*incy > 0) {
    ky = 1;
  } else {
    ky = 1 - (*n - 1) * *incy;
  }

  /* Start the operations. In this version the elements of the array AP */
  /* are accessed sequentially with one pass through AP. */

  /* First form y := beta*y. */

  if (*beta != 1.) {
    if (*incy == 1) {
      if (*beta == 0.) {
        i__1 = *n;
        for (i__ = 1; i__ <= i__1; ++i__) {
          y[i__] = 0.;
          /* L10: */
        }
      } else {
        i__1 = *n;
        for (i__ = 1; i__ <= i__1; ++i__) {
          y[i__] = *beta * y[i__];
          /* L20: */
        }
      }
    } else {
      iy = ky;
      if (*beta == 0.) {
        i__1 = *n;
        for (i__ = 1; i__ <= i__1; ++i__) {
          y[iy] = 0.;
          iy += *incy;
          /* L30: */
        }
      } else {
        i__1 = *n;
        for (i__ = 1; i__ <= i__1; ++i__) {
          y[iy] = *beta * y[iy];
          iy += *incy;
          /* L40: */
        }
      }
    }
  }
  if (*alpha == 0.) {
    return;
  }
  kk = 1;
  if (lsame_(uplo, "U")) {
    /* Form y when AP contains the upper triangle. */

    if (*incx == 1 && *incy == 1) {
      i__1 = *n;
      for (j = 1; j <= i__1; ++j) {
        temp1 = *alpha * x[j];
        temp2 = 0.;
        k = kk;
        i__2 = j - 1;
        for (i__ = 1; i__ <= i__2; ++i__) {
          y[i__] += temp1 * ap[k];
          temp2 += ap[k] * x[i__];
          ++k;
          /* L50: */
        }
        y[j] = y[j] + temp1 * ap[kk + j - 1] + *alpha * temp2;
        kk += j;
        /* L60: */
      }
    } else {
      jx = kx;
      jy = ky;
      i__1 = *n;
      for (j = 1; j <= i__1; ++j) {
        temp1 = *alpha * x[jx];
        temp2 = 0.;
        ix = kx;
        iy = ky;
        i__2 = kk + j - 2;
        for (k = kk; k <= i__2; ++k) {
          y[iy] += temp1 * ap[k];
          temp2 += ap[k] * x[ix];
          ix += *incx;
          iy += *incy;
          /* L70: */
        }
        y[jy] = y[jy] + temp1 * ap[kk + j - 1] + *alpha * temp2;
        jx += *incx;
        jy += *incy;
        kk += j;
        /* L80: */
      }
    }
  } else {
    /* Form y when AP contains the lower triangle. */

    if (*incx == 1 && *incy == 1) {
      i__1 = *n;
      for (j = 1; j <= i__1; ++j) {
        temp1 = *alpha * x[j];
        temp2 = 0.;
        y[j] += temp1 * ap[kk];
        k = kk + 1;
        i__2 = *n;
        for (i__ = j + 1; i__ <= i__2; ++i__) {
          y[i__] += temp1 * ap[k];
          temp2 += ap[k] * x[i__];
          ++k;
          /* L90: */
        }
        y[j] += *alpha * temp2;
        kk += *n - j + 1;
        /* L100: */
      }
    } else {
      jx = kx;
      jy = ky;
      i__1 = *n;
      for (j = 1; j <= i__1; ++j) {
        temp1 = *alpha * x[jx];
        temp2 = 0.;
        y[jy] += temp1 * ap[kk];
        ix = jx;
        iy = jy;
        i__2 = kk + *n - j;
        for (k = kk + 1; k <= i__2; ++k) {
          ix += *incx;
          iy += *incy;
          y[iy] += temp1 * ap[k];
          temp2 += ap[k] * x[ix];
          /* L110: */
        }
        y[jy] += *alpha * temp2;
        jx += *incx;
        jy += *incy;
        kk += *n - j + 1;
        /* L120: */
      }
    }
  }

  /* End of DSPMV . */

} /* dspmv_ */
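A sketch of the packed layout in use (illustrative `main`, assuming the signature above; not part of the diff):

#include <stdio.h>
#include "datatypes.h"

extern void dspmv_(char *uplo, integer *n, doublereal *alpha, doublereal *ap,
                   doublereal *x, integer *incx, doublereal *beta,
                   doublereal *y, integer *incy);

int main(void) {
  /* A = [2 1 0; 1 2 1; 0 1 2]; upper triangle packed column by column:
     AP = { a11, a12, a22, a13, a23, a33 } */
  doublereal ap[6] = {2., 1., 2., 0., 1., 2.};
  doublereal x[3] = {1., 1., 1.}, y[3];
  doublereal alpha = 1., beta = 0.;
  integer n = 3, inc = 1;
  char uplo = 'U';
  dspmv_(&uplo, &n, &alpha, ap, x, &inc, &beta, y, &inc);
  printf("%g %g %g\n", y[0], y[1], y[2]); /* 3 4 3 */
  return 0;
}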
417
blas/f2c/dtbmv.c
@@ -1,417 +0,0 @@

/* dtbmv.f -- translated by f2c (version 20100827).
   You must link the resulting object file with libf2c:
   on Microsoft Windows system, link with libf2c.lib;
   on Linux or Unix systems, link with .../path/to/libf2c.a -lm
   or, if you install libf2c.a in a standard place, with -lf2c -lm
   -- in that order, at the end of the command line, as in
   cc *.o -lf2c -lm
   Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,

   http://www.netlib.org/f2c/libf2c.zip
*/

#include "datatypes.h"

/* Subroutine */ void dtbmv_(char *uplo, char *trans, char *diag, integer *n, integer *k, doublereal *a, integer *lda,
                             doublereal *x, integer *incx) {
  /* System generated locals */
  integer a_dim1, a_offset, i__1, i__2, i__3, i__4;

  /* Local variables */
  integer i__, j, l, ix, jx, kx, info;
  doublereal temp;
  extern logical lsame_(char *, char *);
  integer kplus1;
  extern /* Subroutine */ void xerbla_(const char *, integer *);
  logical nounit;

  /* .. Scalar Arguments .. */
  /* .. */
  /* .. Array Arguments .. */
  /* .. */

  /* Purpose */
  /* ======= */

  /* DTBMV performs one of the matrix-vector operations */

  /* x := A*x, or x := A'*x, */

  /* where x is an n element vector and A is an n by n unit, or non-unit, */
  /* upper or lower triangular band matrix, with ( k + 1 ) diagonals. */

  /* Arguments */
  /* ========== */

  /* UPLO - CHARACTER*1. */
  /* On entry, UPLO specifies whether the matrix is an upper or */
  /* lower triangular matrix as follows: */

  /* UPLO = 'U' or 'u' A is an upper triangular matrix. */

  /* UPLO = 'L' or 'l' A is a lower triangular matrix. */

  /* Unchanged on exit. */

  /* TRANS - CHARACTER*1. */
  /* On entry, TRANS specifies the operation to be performed as */
  /* follows: */

  /* TRANS = 'N' or 'n' x := A*x. */

  /* TRANS = 'T' or 't' x := A'*x. */

  /* TRANS = 'C' or 'c' x := A'*x. */

  /* Unchanged on exit. */

  /* DIAG - CHARACTER*1. */
  /* On entry, DIAG specifies whether or not A is unit */
  /* triangular as follows: */

  /* DIAG = 'U' or 'u' A is assumed to be unit triangular. */

  /* DIAG = 'N' or 'n' A is not assumed to be unit */
  /* triangular. */

  /* Unchanged on exit. */

  /* N - INTEGER. */
  /* On entry, N specifies the order of the matrix A. */
  /* N must be at least zero. */
  /* Unchanged on exit. */

  /* K - INTEGER. */
  /* On entry with UPLO = 'U' or 'u', K specifies the number of */
  /* super-diagonals of the matrix A. */
  /* On entry with UPLO = 'L' or 'l', K specifies the number of */
  /* sub-diagonals of the matrix A. */
  /* K must satisfy 0 .le. K. */
  /* Unchanged on exit. */

  /* A - DOUBLE PRECISION array of DIMENSION ( LDA, n ). */
  /* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */
  /* by n part of the array A must contain the upper triangular */
  /* band part of the matrix of coefficients, supplied column by */
  /* column, with the leading diagonal of the matrix in row */
  /* ( k + 1 ) of the array, the first super-diagonal starting at */
  /* position 2 in row k, and so on. The top left k by k triangle */
  /* of the array A is not referenced. */
  /* The following program segment will transfer an upper */
  /* triangular band matrix from conventional full matrix storage */
  /* to band storage: */

  /* DO 20, J = 1, N */
  /* M = K + 1 - J */
  /* DO 10, I = MAX( 1, J - K ), J */
  /* A( M + I, J ) = matrix( I, J ) */
  /* 10 CONTINUE */
  /* 20 CONTINUE */

  /* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */
  /* by n part of the array A must contain the lower triangular */
  /* band part of the matrix of coefficients, supplied column by */
  /* column, with the leading diagonal of the matrix in row 1 of */
  /* the array, the first sub-diagonal starting at position 1 in */
  /* row 2, and so on. The bottom right k by k triangle of the */
  /* array A is not referenced. */
  /* The following program segment will transfer a lower */
  /* triangular band matrix from conventional full matrix storage */
  /* to band storage: */

  /* DO 20, J = 1, N */
  /* M = 1 - J */
  /* DO 10, I = J, MIN( N, J + K ) */
  /* A( M + I, J ) = matrix( I, J ) */
  /* 10 CONTINUE */
  /* 20 CONTINUE */

  /* Note that when DIAG = 'U' or 'u' the elements of the array A */
  /* corresponding to the diagonal elements of the matrix are not */
  /* referenced, but are assumed to be unity. */
  /* Unchanged on exit. */

  /* LDA - INTEGER. */
  /* On entry, LDA specifies the first dimension of A as declared */
  /* in the calling (sub) program. LDA must be at least */
  /* ( k + 1 ). */
  /* Unchanged on exit. */

  /* X - DOUBLE PRECISION array of dimension at least */
  /* ( 1 + ( n - 1 )*abs( INCX ) ). */
  /* Before entry, the incremented array X must contain the n */
  /* element vector x. On exit, X is overwritten with the */
  /* transformed vector x. */

  /* INCX - INTEGER. */
  /* On entry, INCX specifies the increment for the elements of */
  /* X. INCX must not be zero. */
  /* Unchanged on exit. */

  /* Further Details */
  /* =============== */

  /* Level 2 Blas routine. */

  /* -- Written on 22-October-1986. */
  /* Jack Dongarra, Argonne National Lab. */
  /* Jeremy Du Croz, Nag Central Office. */
  /* Sven Hammarling, Nag Central Office. */
  /* Richard Hanson, Sandia National Labs. */

  /* ===================================================================== */

  /* .. Parameters .. */
  /* .. */
  /* .. Local Scalars .. */
  /* .. */
  /* .. External Functions .. */
  /* .. */
  /* .. External Subroutines .. */
  /* .. */
  /* .. Intrinsic Functions .. */
  /* .. */

  /* Test the input parameters. */

  /* Parameter adjustments */
  a_dim1 = *lda;
  a_offset = 1 + a_dim1;
  a -= a_offset;
  --x;

  /* Function Body */
  info = 0;
  if (!lsame_(uplo, "U") && !lsame_(uplo, "L")) {
    info = 1;
  } else if (!lsame_(trans, "N") && !lsame_(trans, "T") && !lsame_(trans, "C")) {
    info = 2;
  } else if (!lsame_(diag, "U") && !lsame_(diag, "N")) {
    info = 3;
  } else if (*n < 0) {
    info = 4;
  } else if (*k < 0) {
    info = 5;
  } else if (*lda < *k + 1) {
    info = 7;
  } else if (*incx == 0) {
    info = 9;
  }
  if (info != 0) {
    xerbla_("DTBMV ", &info);
    return;
  }

  /* Quick return if possible. */

  if (*n == 0) {
    return;
  }

  nounit = lsame_(diag, "N");

  /* Set up the start point in X if the increment is not unity. This */
  /* will be ( N - 1 )*INCX too small for descending loops. */

  if (*incx <= 0) {
    kx = 1 - (*n - 1) * *incx;
  } else if (*incx != 1) {
    kx = 1;
  }

  /* Start the operations. In this version the elements of A are */
  /* accessed sequentially with one pass through A. */

  if (lsame_(trans, "N")) {
    /* Form x := A*x. */

    if (lsame_(uplo, "U")) {
      kplus1 = *k + 1;
      if (*incx == 1) {
        i__1 = *n;
        for (j = 1; j <= i__1; ++j) {
          if (x[j] != 0.) {
            temp = x[j];
            l = kplus1 - j;
            /* Computing MAX */
            i__2 = 1, i__3 = j - *k;
            i__4 = j - 1;
            for (i__ = max(i__2, i__3); i__ <= i__4; ++i__) {
              x[i__] += temp * a[l + i__ + j * a_dim1];
              /* L10: */
            }
            if (nounit) {
              x[j] *= a[kplus1 + j * a_dim1];
            }
          }
          /* L20: */
        }
      } else {
        jx = kx;
        i__1 = *n;
        for (j = 1; j <= i__1; ++j) {
          if (x[jx] != 0.) {
            temp = x[jx];
            ix = kx;
            l = kplus1 - j;
            /* Computing MAX */
            i__4 = 1, i__2 = j - *k;
            i__3 = j - 1;
            for (i__ = max(i__4, i__2); i__ <= i__3; ++i__) {
              x[ix] += temp * a[l + i__ + j * a_dim1];
              ix += *incx;
              /* L30: */
            }
            if (nounit) {
              x[jx] *= a[kplus1 + j * a_dim1];
            }
          }
          jx += *incx;
          if (j > *k) {
            kx += *incx;
          }
          /* L40: */
        }
      }
    } else {
      if (*incx == 1) {
        for (j = *n; j >= 1; --j) {
          if (x[j] != 0.) {
            temp = x[j];
            l = 1 - j;
            /* Computing MIN */
            i__1 = *n, i__3 = j + *k;
            i__4 = j + 1;
            for (i__ = min(i__1, i__3); i__ >= i__4; --i__) {
              x[i__] += temp * a[l + i__ + j * a_dim1];
              /* L50: */
            }
            if (nounit) {
              x[j] *= a[j * a_dim1 + 1];
            }
          }
          /* L60: */
        }
      } else {
        kx += (*n - 1) * *incx;
        jx = kx;
        for (j = *n; j >= 1; --j) {
          if (x[jx] != 0.) {
            temp = x[jx];
            ix = kx;
            l = 1 - j;
            /* Computing MIN */
            i__4 = *n, i__1 = j + *k;
            i__3 = j + 1;
            for (i__ = min(i__4, i__1); i__ >= i__3; --i__) {
              x[ix] += temp * a[l + i__ + j * a_dim1];
              ix -= *incx;
              /* L70: */
            }
            if (nounit) {
              x[jx] *= a[j * a_dim1 + 1];
            }
          }
          jx -= *incx;
          if (*n - j >= *k) {
            kx -= *incx;
          }
          /* L80: */
        }
      }
    }
  } else {
    /* Form x := A'*x. */

    if (lsame_(uplo, "U")) {
      kplus1 = *k + 1;
      if (*incx == 1) {
        for (j = *n; j >= 1; --j) {
          temp = x[j];
          l = kplus1 - j;
          if (nounit) {
            temp *= a[kplus1 + j * a_dim1];
          }
          /* Computing MAX */
          i__4 = 1, i__1 = j - *k;
          i__3 = max(i__4, i__1);
          for (i__ = j - 1; i__ >= i__3; --i__) {
            temp += a[l + i__ + j * a_dim1] * x[i__];
            /* L90: */
          }
          x[j] = temp;
          /* L100: */
        }
      } else {
        kx += (*n - 1) * *incx;
        jx = kx;
        for (j = *n; j >= 1; --j) {
          temp = x[jx];
          kx -= *incx;
          ix = kx;
          l = kplus1 - j;
          if (nounit) {
            temp *= a[kplus1 + j * a_dim1];
          }
          /* Computing MAX */
          i__4 = 1, i__1 = j - *k;
          i__3 = max(i__4, i__1);
          for (i__ = j - 1; i__ >= i__3; --i__) {
            temp += a[l + i__ + j * a_dim1] * x[ix];
            ix -= *incx;
            /* L110: */
          }
          x[jx] = temp;
          jx -= *incx;
          /* L120: */
        }
      }
    } else {
      if (*incx == 1) {
        i__3 = *n;
        for (j = 1; j <= i__3; ++j) {
          temp = x[j];
          l = 1 - j;
          if (nounit) {
            temp *= a[j * a_dim1 + 1];
          }
          /* Computing MIN */
          i__1 = *n, i__2 = j + *k;
          i__4 = min(i__1, i__2);
          for (i__ = j + 1; i__ <= i__4; ++i__) {
            temp += a[l + i__ + j * a_dim1] * x[i__];
            /* L130: */
          }
          x[j] = temp;
          /* L140: */
        }
      } else {
        jx = kx;
        i__3 = *n;
        for (j = 1; j <= i__3; ++j) {
          temp = x[jx];
          kx += *incx;
          ix = kx;
          l = 1 - j;
          if (nounit) {
            temp *= a[j * a_dim1 + 1];
          }
          /* Computing MIN */
          i__1 = *n, i__2 = j + *k;
          i__4 = min(i__1, i__2);
          for (i__ = j + 1; i__ <= i__4; ++i__) {
            temp += a[l + i__ + j * a_dim1] * x[ix];
            ix += *incx;
            /* L150: */
          }
          x[jx] = temp;
          jx += *incx;
          /* L160: */
        }
      }
    }
  }

  /* End of DTBMV . */

} /* dtbmv_ */
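An illustrative driver for the band layout described above (the `main` is hypothetical; the signature is taken from this file):

#include <stdio.h>
#include "datatypes.h"

extern void dtbmv_(char *uplo, char *trans, char *diag, integer *n, integer *k,
                   doublereal *a, integer *lda, doublereal *x, integer *incx);

int main(void) {
  /* Upper triangular band matrix A = [1 2; 0 3] with k = 1 super-diagonal;
     band storage puts the diagonal in row k+1, so column j holds
     { A(j-1,j), A(j,j) }; a[0] is not referenced. */
  doublereal a[4] = {0., 1., 2., 3.};
  doublereal x[2] = {1., 1.};
  integer n = 2, k = 1, lda = 2, incx = 1;
  char uplo = 'U', trans = 'N', diag = 'N';
  dtbmv_(&uplo, &trans, &diag, &n, &k, a, &lda, x, &incx); /* x := A*x */
  printf("%g %g\n", x[0], x[1]); /* 3 3 */
  return 0;
}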
109
blas/f2c/lsame.c
109
blas/f2c/lsame.c
@@ -1,109 +0,0 @@
|
||||
/* lsame.f -- translated by f2c (version 20100827).
|
||||
You must link the resulting object file with libf2c:
|
||||
on Microsoft Windows system, link with libf2c.lib;
|
||||
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
|
||||
or, if you install libf2c.a in a standard place, with -lf2c -lm
|
||||
-- in that order, at the end of the command line, as in
|
||||
cc *.o -lf2c -lm
|
||||
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
|
||||
|
||||
http://www.netlib.org/f2c/libf2c.zip
|
||||
*/
|
||||
|
||||
#include "datatypes.h"
|
||||
|
||||
logical lsame_(char *ca, char *cb) {
|
||||
/* System generated locals */
|
||||
logical ret_val;
|
||||
|
||||
/* Local variables */
|
||||
integer inta, intb, zcode;
|
||||
|
||||
/* -- LAPACK auxiliary routine (version 3.1) -- */
|
||||
/* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. */
|
||||
/* November 2006 */
|
||||
|
||||
/* .. Scalar Arguments .. */
|
||||
/* .. */
|
||||
|
||||
/* Purpose */
|
||||
/* ======= */
|
||||
|
||||
/* LSAME returns .TRUE. if CA is the same letter as CB regardless of */
|
||||
/* case. */
|
||||
|
||||
/* Arguments */
|
||||
/* ========= */
|
||||
|
||||
/* CA (input) CHARACTER*1 */
|
||||
|
||||
/* CB (input) CHARACTER*1 */
|
||||
/* CA and CB specify the single characters to be compared. */
|
||||
|
||||
/* ===================================================================== */
|
||||
|
||||
/* .. Intrinsic Functions .. */
|
||||
/* .. */
|
||||
/* .. Local Scalars .. */
|
||||
/* .. */
|
||||
|
||||
/* Test if the characters are equal */
|
||||
|
||||
ret_val = *(unsigned char *)ca == *(unsigned char *)cb;
|
||||
if (ret_val) {
|
||||
return ret_val;
|
||||
}
|
||||
|
||||
/* Now test for equivalence if both characters are alphabetic. */
|
||||
|
||||
zcode = 'Z';
|
||||
|
||||
/* Use 'Z' rather than 'A' so that ASCII can be detected on Prime */
|
||||
/* machines, on which ICHAR returns a value with bit 8 set. */
|
||||
/* ICHAR('A') on Prime machines returns 193 which is the same as */
|
||||
/* ICHAR('A') on an EBCDIC machine. */
|
||||
|
||||
inta = *(unsigned char *)ca;
|
||||
intb = *(unsigned char *)cb;
|
||||
|
||||
if (zcode == 90 || zcode == 122) {
|
||||
/* ASCII is assumed - ZCODE is the ASCII code of either lower or */
|
||||
/* upper case 'Z'. */
|
||||
|
||||
if (inta >= 97 && inta <= 122) {
|
||||
inta += -32;
|
||||
}
|
||||
if (intb >= 97 && intb <= 122) {
|
||||
intb += -32;
|
||||
}
|
||||
|
||||
} else if (zcode == 233 || zcode == 169) {
|
||||
/* EBCDIC is assumed - ZCODE is the EBCDIC code of either lower or */
|
||||
/* upper case 'Z'. */
|
||||
|
||||
if ((inta >= 129 && inta <= 137) || (inta >= 145 && inta <= 153) || (inta >= 162 && inta <= 169)) {
|
||||
inta += 64;
|
||||
}
|
||||
if ((intb >= 129 && intb <= 137) || (intb >= 145 && intb <= 153) || (intb >= 162 && intb <= 169)) {
|
||||
intb += 64;
|
||||
}
|
||||
|
||||
} else if (zcode == 218 || zcode == 250) {
|
||||
/* ASCII is assumed, on Prime machines - ZCODE is the ASCII code */
|
||||
/* plus 128 of either lower or upper case 'Z'. */
|
||||
|
||||
if (inta >= 225 && inta <= 250) {
|
||||
inta += -32;
|
||||
}
|
||||
if (intb >= 225 && intb <= 250) {
|
||||
intb += -32;
|
||||
}
|
||||
}
|
||||
ret_val = inta == intb;
|
||||
|
||||
/* RETURN */
|
||||
|
||||
/* End of LSAME */
|
||||
|
||||
return ret_val;
|
||||
} /* lsame_ */
|
||||
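
For reference, a minimal caller sketch for the removed lsame_, assuming the f2c typedefs (integer, logical) from the blas/f2c/datatypes.h in this tree; this hypothetical test program is not part of the library:

#include <stdio.h>
#include "datatypes.h"

extern logical lsame_(char *ca, char *cb);

int main(void) {
  char u = 'U', lu = 'u', l = 'L';
  /* Case-insensitive single-character compare, as used by the
     UPLO/TRANS/DIAG argument checks in the routines below. */
  printf("lsame('U','u') = %ld\n", (long)lsame_(&u, &lu)); /* expect 1 */
  printf("lsame('U','L') = %ld\n", (long)lsame_(&u, &l));  /* expect 0 */
  return 0;
}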
212
blas/f2c/srotm.c
@@ -1,212 +0,0 @@
/* srotm.f -- translated by f2c (version 20100827).
   You must link the resulting object file with libf2c:
   on Microsoft Windows system, link with libf2c.lib;
   on Linux or Unix systems, link with .../path/to/libf2c.a -lm
   or, if you install libf2c.a in a standard place, with -lf2c -lm
   -- in that order, at the end of the command line, as in
   cc *.o -lf2c -lm
   Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,

   http://www.netlib.org/f2c/libf2c.zip
*/

#include "datatypes.h"

/* Subroutine */ void srotm_(integer *n, real *sx, integer *incx, real *sy, integer *incy, real *sparam) {
/* Initialized data */

static real zero = 0.f;
static real two = 2.f;

/* System generated locals */
integer i__1, i__2;

/* Local variables */
integer i__;
real w, z__;
integer kx, ky;
real sh11, sh12, sh21, sh22, sflag;
integer nsteps;

/* .. Scalar Arguments .. */
/* .. */
/* .. Array Arguments .. */
/* .. */

/* Purpose */
/* ======= */

/* APPLY THE MODIFIED GIVENS TRANSFORMATION, H, TO THE 2 BY N MATRIX */

/* (SX**T) , WHERE **T INDICATES TRANSPOSE. THE ELEMENTS OF SX ARE IN */
/* (DX**T) */

/* SX(LX+I*INCX), I = 0 TO N-1, WHERE LX = 1 IF INCX .GE. 0, ELSE */
/* LX = (-INCX)*N, AND SIMILARLY FOR SY USING LY AND INCY. */
/* WITH SPARAM(1)=SFLAG, H HAS ONE OF THE FOLLOWING FORMS.. */

/* SFLAG=-1.E0 SFLAG=0.E0 SFLAG=1.E0 SFLAG=-2.E0 */

/* (SH11 SH12) (1.E0 SH12) (SH11 1.E0) (1.E0 0.E0) */
/* H=( ) ( ) ( ) ( ) */
/* (SH21 SH22), (SH21 1.E0), (-1.E0 SH22), (0.E0 1.E0). */
/* SEE SROTMG FOR A DESCRIPTION OF DATA STORAGE IN SPARAM. */

/* Arguments */
/* ========= */

/* N (input) INTEGER */
/* number of elements in input vector(s) */

/* SX (input/output) REAL array, dimension N */
/* single precision vector with N elements */

/* INCX (input) INTEGER */
/* storage spacing between elements of SX */

/* SY (input/output) REAL array, dimension N */
/* single precision vector with N elements */

/* INCY (input) INTEGER */
/* storage spacing between elements of SY */

/* SPARAM (input/output) REAL array, dimension 5 */
/* SPARAM(1)=SFLAG */
/* SPARAM(2)=SH11 */
/* SPARAM(3)=SH21 */
/* SPARAM(4)=SH12 */
/* SPARAM(5)=SH22 */

/* ===================================================================== */

/* .. Local Scalars .. */
/* .. */
/* .. Data statements .. */
/* Parameter adjustments */
--sparam;
--sy;
--sx;

/* Function Body */
/* .. */

sflag = sparam[1];
if (*n <= 0 || sflag + two == zero) {
goto L140;
}
if (!(*incx == *incy && *incx > 0)) {
goto L70;
}

nsteps = *n * *incx;
if (sflag < 0.f) {
goto L50;
} else if (sflag == 0) {
goto L10;
} else {
goto L30;
}
L10:
sh12 = sparam[4];
sh21 = sparam[3];
i__1 = nsteps;
i__2 = *incx;
for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
w = sx[i__];
z__ = sy[i__];
sx[i__] = w + z__ * sh12;
sy[i__] = w * sh21 + z__;
/* L20: */
}
goto L140;
L30:
sh11 = sparam[2];
sh22 = sparam[5];
i__2 = nsteps;
i__1 = *incx;
for (i__ = 1; i__1 < 0 ? i__ >= i__2 : i__ <= i__2; i__ += i__1) {
w = sx[i__];
z__ = sy[i__];
sx[i__] = w * sh11 + z__;
sy[i__] = -w + sh22 * z__;
/* L40: */
}
goto L140;
L50:
sh11 = sparam[2];
sh12 = sparam[4];
sh21 = sparam[3];
sh22 = sparam[5];
i__1 = nsteps;
i__2 = *incx;
for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
w = sx[i__];
z__ = sy[i__];
sx[i__] = w * sh11 + z__ * sh12;
sy[i__] = w * sh21 + z__ * sh22;
/* L60: */
}
goto L140;
L70:
kx = 1;
ky = 1;
if (*incx < 0) {
kx = (1 - *n) * *incx + 1;
}
if (*incy < 0) {
ky = (1 - *n) * *incy + 1;
}

if (sflag < 0.f) {
goto L120;
} else if (sflag == 0) {
goto L80;
} else {
goto L100;
}
L80:
sh12 = sparam[4];
sh21 = sparam[3];
i__2 = *n;
for (i__ = 1; i__ <= i__2; ++i__) {
w = sx[kx];
z__ = sy[ky];
sx[kx] = w + z__ * sh12;
sy[ky] = w * sh21 + z__;
kx += *incx;
ky += *incy;
/* L90: */
}
goto L140;
L100:
sh11 = sparam[2];
sh22 = sparam[5];
i__2 = *n;
for (i__ = 1; i__ <= i__2; ++i__) {
w = sx[kx];
z__ = sy[ky];
sx[kx] = w * sh11 + z__;
sy[ky] = -w + sh22 * z__;
kx += *incx;
ky += *incy;
/* L110: */
}
goto L140;
L120:
sh11 = sparam[2];
sh12 = sparam[4];
sh21 = sparam[3];
sh22 = sparam[5];
i__2 = *n;
for (i__ = 1; i__ <= i__2; ++i__) {
w = sx[kx];
z__ = sy[ky];
sx[kx] = w * sh11 + z__ * sh12;
sy[ky] = w * sh21 + z__ * sh22;
kx += *incx;
ky += *incy;
/* L130: */
}
L140:
return;
} /* srotm_ */
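
A hedged usage sketch for the removed srotm_: it applies H with SFLAG=-1 (all four entries taken from SPARAM) to two 3-element vectors with unit strides. The f2c "datatypes.h" typedefs are assumed; the values are illustrative only:

#include <stdio.h>
#include "datatypes.h"

extern void srotm_(integer *n, real *sx, integer *incx,
                   real *sy, integer *incy, real *sparam);

int main(void) {
  integer n = 3, inc = 1;
  real sx[3] = {1.f, 2.f, 3.f};
  real sy[3] = {4.f, 5.f, 6.f};
  /* SPARAM = {SFLAG, SH11, SH21, SH12, SH22}; SFLAG=-1 means the
     full 2x2 H is read from SPARAM.  Here H = 2*I, so both vectors
     are simply doubled. */
  real sparam[5] = {-1.f, 2.f, 0.f, 0.f, 2.f};
  srotm_(&n, sx, &inc, sy, &inc, sparam);
  for (int i = 0; i < 3; ++i)
    printf("sx[%d]=%g sy[%d]=%g\n", i, sx[i], i, sy[i]);
  return 0;
}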
293
blas/f2c/srotmg.c
@@ -1,293 +0,0 @@
/* srotmg.f -- translated by f2c (version 20100827).
   You must link the resulting object file with libf2c:
   on Microsoft Windows system, link with libf2c.lib;
   on Linux or Unix systems, link with .../path/to/libf2c.a -lm
   or, if you install libf2c.a in a standard place, with -lf2c -lm
   -- in that order, at the end of the command line, as in
   cc *.o -lf2c -lm
   Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,

   http://www.netlib.org/f2c/libf2c.zip
*/

#include "datatypes.h"

/* Subroutine */ void srotmg_(real *sd1, real *sd2, real *sx1, real *sy1, real *sparam) {
/* Initialized data */

static real zero = 0.f;
static real one = 1.f;
static real two = 2.f;
static real gam = 4096.f;
static real gamsq = 16777200.f;
static real rgamsq = 5.96046e-8f;

/* Format strings */
static char fmt_120[] = "";
static char fmt_150[] = "";
static char fmt_180[] = "";
static char fmt_210[] = "";

/* System generated locals */
real r__1;

/* Local variables */
real su, sp1, sp2, sq1, sq2, sh11, sh12, sh21, sh22;
integer igo;
real sflag, stemp;

/* Assigned format variables */
static char *igo_fmt;
(void)igo_fmt;

/* .. Scalar Arguments .. */
/* .. */
/* .. Array Arguments .. */
/* .. */

/* Purpose */
/* ======= */

/* CONSTRUCT THE MODIFIED GIVENS TRANSFORMATION MATRIX H WHICH ZEROS */
/* THE SECOND COMPONENT OF THE 2-VECTOR (SQRT(SD1)*SX1,SQRT(SD2)* */
/* SY2)**T. */
/* WITH SPARAM(1)=SFLAG, H HAS ONE OF THE FOLLOWING FORMS.. */

/* SFLAG=-1.E0 SFLAG=0.E0 SFLAG=1.E0 SFLAG=-2.E0 */

/* (SH11 SH12) (1.E0 SH12) (SH11 1.E0) (1.E0 0.E0) */
/* H=( ) ( ) ( ) ( ) */
/* (SH21 SH22), (SH21 1.E0), (-1.E0 SH22), (0.E0 1.E0). */
/* LOCATIONS 2-4 OF SPARAM CONTAIN SH11,SH21,SH12, AND SH22 */
/* RESPECTIVELY. (VALUES OF 1.E0, -1.E0, OR 0.E0 IMPLIED BY THE */
/* VALUE OF SPARAM(1) ARE NOT STORED IN SPARAM.) */

/* THE VALUES OF GAMSQ AND RGAMSQ SET IN THE DATA STATEMENT MAY BE */
/* INEXACT. THIS IS OK AS THEY ARE ONLY USED FOR TESTING THE SIZE */
/* OF SD1 AND SD2. ALL ACTUAL SCALING OF DATA IS DONE USING GAM. */

/* Arguments */
/* ========= */

/* SD1 (input/output) REAL */

/* SD2 (input/output) REAL */

/* SX1 (input/output) REAL */

/* SY1 (input) REAL */

/* SPARAM (input/output) REAL array, dimension 5 */
/* SPARAM(1)=SFLAG */
/* SPARAM(2)=SH11 */
/* SPARAM(3)=SH21 */
/* SPARAM(4)=SH12 */
/* SPARAM(5)=SH22 */

/* ===================================================================== */

/* .. Local Scalars .. */
/* .. */
/* .. Intrinsic Functions .. */
/* .. */
/* .. Data statements .. */

/* Parameter adjustments */
--sparam;

/* Function Body */
/* .. */
if (!(*sd1 < zero)) {
goto L10;
}
/* GO ZERO-H-D-AND-SX1.. */
goto L60;
L10:
/* CASE-SD1-NONNEGATIVE */
sp2 = *sd2 * *sy1;
if (!(sp2 == zero)) {
goto L20;
}
sflag = -two;
goto L260;
/* REGULAR-CASE.. */
L20:
sp1 = *sd1 * *sx1;
sq2 = sp2 * *sy1;
sq1 = sp1 * *sx1;

if (!(dabs(sq1) > dabs(sq2))) {
goto L40;
}
sh21 = -(*sy1) / *sx1;
sh12 = sp2 / sp1;

su = one - sh12 * sh21;

if (!(su <= zero)) {
goto L30;
}
/* GO ZERO-H-D-AND-SX1.. */
goto L60;
L30:
sflag = zero;
*sd1 /= su;
*sd2 /= su;
*sx1 *= su;
/* GO SCALE-CHECK.. */
goto L100;
L40:
if (!(sq2 < zero)) {
goto L50;
}
/* GO ZERO-H-D-AND-SX1.. */
goto L60;
L50:
sflag = one;
sh11 = sp1 / sp2;
sh22 = *sx1 / *sy1;
su = one + sh11 * sh22;
stemp = *sd2 / su;
*sd2 = *sd1 / su;
*sd1 = stemp;
*sx1 = *sy1 * su;
/* GO SCALE-CHECK */
goto L100;
/* PROCEDURE..ZERO-H-D-AND-SX1.. */
L60:
sflag = -one;
sh11 = zero;
sh12 = zero;
sh21 = zero;
sh22 = zero;

*sd1 = zero;
*sd2 = zero;
*sx1 = zero;
/* RETURN.. */
goto L220;
/* PROCEDURE..FIX-H.. */
L70:
if (!(sflag >= zero)) {
goto L90;
}

if (!(sflag == zero)) {
goto L80;
}
sh11 = one;
sh22 = one;
sflag = -one;
goto L90;
L80:
sh21 = -one;
sh12 = one;
sflag = -one;
L90:
switch (igo) {
case 0:
goto L120;
case 1:
goto L150;
case 2:
goto L180;
case 3:
goto L210;
}
/* PROCEDURE..SCALE-CHECK */
L100:
L110:
if (!(*sd1 <= rgamsq)) {
goto L130;
}
if (*sd1 == zero) {
goto L160;
}
igo = 0;
igo_fmt = fmt_120;
/* FIX-H.. */
goto L70;
L120:
/* Computing 2nd power */
r__1 = gam;
*sd1 *= r__1 * r__1;
*sx1 /= gam;
sh11 /= gam;
sh12 /= gam;
goto L110;
L130:
L140:
if (!(*sd1 >= gamsq)) {
goto L160;
}
igo = 1;
igo_fmt = fmt_150;
/* FIX-H.. */
goto L70;
L150:
/* Computing 2nd power */
r__1 = gam;
*sd1 /= r__1 * r__1;
*sx1 *= gam;
sh11 *= gam;
sh12 *= gam;
goto L140;
L160:
L170:
if (!(dabs(*sd2) <= rgamsq)) {
goto L190;
}
if (*sd2 == zero) {
goto L220;
}
igo = 2;
igo_fmt = fmt_180;
/* FIX-H.. */
goto L70;
L180:
/* Computing 2nd power */
r__1 = gam;
*sd2 *= r__1 * r__1;
sh21 /= gam;
sh22 /= gam;
goto L170;
L190:
L200:
if (!(dabs(*sd2) >= gamsq)) {
goto L220;
}
igo = 3;
igo_fmt = fmt_210;
/* FIX-H.. */
goto L70;
L210:
/* Computing 2nd power */
r__1 = gam;
*sd2 /= r__1 * r__1;
sh21 *= gam;
sh22 *= gam;
goto L200;
L220:
if (sflag < 0.f) {
goto L250;
} else if (sflag == 0) {
goto L230;
} else {
goto L240;
}
L230:
sparam[3] = sh21;
sparam[4] = sh12;
goto L260;
L240:
sparam[2] = sh11;
sparam[5] = sh22;
goto L260;
L250:
sparam[2] = sh11;
sparam[3] = sh21;
sparam[4] = sh12;
sparam[5] = sh22;
L260:
sparam[1] = sflag;
} /* srotmg_ */
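
A sketch of how the removed srotmg_ would be driven, under the same f2c "datatypes.h" assumption: it builds the H that zeros the second component of (sqrt(sd1)*sx1, sqrt(sd2)*sy1)^T, which srotm_ would then apply; the inputs are illustrative and the printed values depend on the SFLAG case selected:

#include <stdio.h>
#include "datatypes.h"

extern void srotmg_(real *sd1, real *sd2, real *sx1, real *sy1,
                    real *sparam);

int main(void) {
  real sd1 = 1.f, sd2 = 1.f, sx1 = 3.f, sy1 = 1.f;
  real sparam[5];
  srotmg_(&sd1, &sd2, &sx1, &sy1, sparam);
  /* sparam[0] is SFLAG; which of SH11/SH21/SH12/SH22 were stored in
     sparam[1..4] depends on its value (see the comment block above). */
  printf("sflag=%g sd1=%g sd2=%g sx1=%g\n", sparam[0], sd1, sd2, sx1);
  return 0;
}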
359
blas/f2c/ssbmv.c
@@ -1,359 +0,0 @@
/* ssbmv.f -- translated by f2c (version 20100827).
   You must link the resulting object file with libf2c:
   on Microsoft Windows system, link with libf2c.lib;
   on Linux or Unix systems, link with .../path/to/libf2c.a -lm
   or, if you install libf2c.a in a standard place, with -lf2c -lm
   -- in that order, at the end of the command line, as in
   cc *.o -lf2c -lm
   Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,

   http://www.netlib.org/f2c/libf2c.zip
*/

#include "datatypes.h"

/* Subroutine */ void ssbmv_(char *uplo, integer *n, integer *k, real *alpha, real *a, integer *lda, real *x,
integer *incx, real *beta, real *y, integer *incy) {
/* System generated locals */
integer a_dim1, a_offset, i__1, i__2, i__3, i__4;

/* Local variables */
integer i__, j, l, ix, iy, jx, jy, kx, ky, info;
real temp1, temp2;
extern logical lsame_(char *, char *);
integer kplus1;
extern /* Subroutine */ void xerbla_(const char *, integer *);

/* .. Scalar Arguments .. */
/* .. */
/* .. Array Arguments .. */
/* .. */

/* Purpose */
/* ======= */

/* SSBMV performs the matrix-vector operation */

/* y := alpha*A*x + beta*y, */

/* where alpha and beta are scalars, x and y are n element vectors and */
/* A is an n by n symmetric band matrix, with k super-diagonals. */

/* Arguments */
/* ========== */

/* UPLO - CHARACTER*1. */
/* On entry, UPLO specifies whether the upper or lower */
/* triangular part of the band matrix A is being supplied as */
/* follows: */

/* UPLO = 'U' or 'u' The upper triangular part of A is */
/* being supplied. */

/* UPLO = 'L' or 'l' The lower triangular part of A is */
/* being supplied. */

/* Unchanged on exit. */

/* N - INTEGER. */
/* On entry, N specifies the order of the matrix A. */
/* N must be at least zero. */
/* Unchanged on exit. */

/* K - INTEGER. */
/* On entry, K specifies the number of super-diagonals of the */
/* matrix A. K must satisfy 0 .le. K. */
/* Unchanged on exit. */

/* ALPHA - REAL . */
/* On entry, ALPHA specifies the scalar alpha. */
/* Unchanged on exit. */

/* A - REAL array of DIMENSION ( LDA, n ). */
/* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */
/* by n part of the array A must contain the upper triangular */
/* band part of the symmetric matrix, supplied column by */
/* column, with the leading diagonal of the matrix in row */
/* ( k + 1 ) of the array, the first super-diagonal starting at */
/* position 2 in row k, and so on. The top left k by k triangle */
/* of the array A is not referenced. */
/* The following program segment will transfer the upper */
/* triangular part of a symmetric band matrix from conventional */
/* full matrix storage to band storage: */

/* DO 20, J = 1, N */
/* M = K + 1 - J */
/* DO 10, I = MAX( 1, J - K ), J */
/* A( M + I, J ) = matrix( I, J ) */
/* 10 CONTINUE */
/* 20 CONTINUE */

/* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */
/* by n part of the array A must contain the lower triangular */
/* band part of the symmetric matrix, supplied column by */
/* column, with the leading diagonal of the matrix in row 1 of */
/* the array, the first sub-diagonal starting at position 1 in */
/* row 2, and so on. The bottom right k by k triangle of the */
/* array A is not referenced. */
/* The following program segment will transfer the lower */
/* triangular part of a symmetric band matrix from conventional */
/* full matrix storage to band storage: */

/* DO 20, J = 1, N */
/* M = 1 - J */
/* DO 10, I = J, MIN( N, J + K ) */
/* A( M + I, J ) = matrix( I, J ) */
/* 10 CONTINUE */
/* 20 CONTINUE */

/* Unchanged on exit. */

/* LDA - INTEGER. */
/* On entry, LDA specifies the first dimension of A as declared */
/* in the calling (sub) program. LDA must be at least */
/* ( k + 1 ). */
/* Unchanged on exit. */

/* X - REAL array of DIMENSION at least */
/* ( 1 + ( n - 1 )*abs( INCX ) ). */
/* Before entry, the incremented array X must contain the */
/* vector x. */
/* Unchanged on exit. */

/* INCX - INTEGER. */
/* On entry, INCX specifies the increment for the elements of */
/* X. INCX must not be zero. */
/* Unchanged on exit. */

/* BETA - REAL . */
/* On entry, BETA specifies the scalar beta. */
/* Unchanged on exit. */

/* Y - REAL array of DIMENSION at least */
/* ( 1 + ( n - 1 )*abs( INCY ) ). */
/* Before entry, the incremented array Y must contain the */
/* vector y. On exit, Y is overwritten by the updated vector y. */

/* INCY - INTEGER. */
/* On entry, INCY specifies the increment for the elements of */
/* Y. INCY must not be zero. */
/* Unchanged on exit. */

/* Further Details */
/* =============== */

/* Level 2 Blas routine. */

/* -- Written on 22-October-1986. */
/* Jack Dongarra, Argonne National Lab. */
/* Jeremy Du Croz, Nag Central Office. */
/* Sven Hammarling, Nag Central Office. */
/* Richard Hanson, Sandia National Labs. */

/* ===================================================================== */

/* .. Parameters .. */
/* .. */
/* .. Local Scalars .. */
/* .. */
/* .. External Functions .. */
/* .. */
/* .. External Subroutines .. */
/* .. */
/* .. Intrinsic Functions .. */
/* .. */

/* Test the input parameters. */

/* Parameter adjustments */
a_dim1 = *lda;
a_offset = 1 + a_dim1;
a -= a_offset;
--x;
--y;

/* Function Body */
info = 0;
if (!lsame_(uplo, "U") && !lsame_(uplo, "L")) {
info = 1;
} else if (*n < 0) {
info = 2;
} else if (*k < 0) {
info = 3;
} else if (*lda < *k + 1) {
info = 6;
} else if (*incx == 0) {
info = 8;
} else if (*incy == 0) {
info = 11;
}
if (info != 0) {
xerbla_("SSBMV ", &info);
return;
}

/* Quick return if possible. */

if (*n == 0 || (*alpha == 0.f && *beta == 1.f)) {
return;
}

/* Set up the start points in X and Y. */

if (*incx > 0) {
kx = 1;
} else {
kx = 1 - (*n - 1) * *incx;
}
if (*incy > 0) {
ky = 1;
} else {
ky = 1 - (*n - 1) * *incy;
}

/* Start the operations. In this version the elements of the array A */
/* are accessed sequentially with one pass through A. */

/* First form y := beta*y. */

if (*beta != 1.f) {
if (*incy == 1) {
if (*beta == 0.f) {
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
y[i__] = 0.f;
/* L10: */
}
} else {
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
y[i__] = *beta * y[i__];
/* L20: */
}
}
} else {
iy = ky;
if (*beta == 0.f) {
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
y[iy] = 0.f;
iy += *incy;
/* L30: */
}
} else {
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
y[iy] = *beta * y[iy];
iy += *incy;
/* L40: */
}
}
}
}
if (*alpha == 0.f) {
return;
}
if (lsame_(uplo, "U")) {
/* Form y when upper triangle of A is stored. */

kplus1 = *k + 1;
if (*incx == 1 && *incy == 1) {
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
temp1 = *alpha * x[j];
temp2 = 0.f;
l = kplus1 - j;
/* Computing MAX */
i__2 = 1, i__3 = j - *k;
i__4 = j - 1;
for (i__ = max(i__2, i__3); i__ <= i__4; ++i__) {
y[i__] += temp1 * a[l + i__ + j * a_dim1];
temp2 += a[l + i__ + j * a_dim1] * x[i__];
/* L50: */
}
y[j] = y[j] + temp1 * a[kplus1 + j * a_dim1] + *alpha * temp2;
/* L60: */
}
} else {
jx = kx;
jy = ky;
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
temp1 = *alpha * x[jx];
temp2 = 0.f;
ix = kx;
iy = ky;
l = kplus1 - j;
/* Computing MAX */
i__4 = 1, i__2 = j - *k;
i__3 = j - 1;
for (i__ = max(i__4, i__2); i__ <= i__3; ++i__) {
y[iy] += temp1 * a[l + i__ + j * a_dim1];
temp2 += a[l + i__ + j * a_dim1] * x[ix];
ix += *incx;
iy += *incy;
/* L70: */
}
y[jy] = y[jy] + temp1 * a[kplus1 + j * a_dim1] + *alpha * temp2;
jx += *incx;
jy += *incy;
if (j > *k) {
kx += *incx;
ky += *incy;
}
/* L80: */
}
}
} else {
/* Form y when lower triangle of A is stored. */

if (*incx == 1 && *incy == 1) {
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
temp1 = *alpha * x[j];
temp2 = 0.f;
y[j] += temp1 * a[j * a_dim1 + 1];
l = 1 - j;
/* Computing MIN */
i__4 = *n, i__2 = j + *k;
i__3 = min(i__4, i__2);
for (i__ = j + 1; i__ <= i__3; ++i__) {
y[i__] += temp1 * a[l + i__ + j * a_dim1];
temp2 += a[l + i__ + j * a_dim1] * x[i__];
/* L90: */
}
y[j] += *alpha * temp2;
/* L100: */
}
} else {
jx = kx;
jy = ky;
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
temp1 = *alpha * x[jx];
temp2 = 0.f;
y[jy] += temp1 * a[j * a_dim1 + 1];
l = 1 - j;
ix = jx;
iy = jy;
/* Computing MIN */
i__4 = *n, i__2 = j + *k;
i__3 = min(i__4, i__2);
for (i__ = j + 1; i__ <= i__3; ++i__) {
ix += *incx;
iy += *incy;
y[iy] += temp1 * a[l + i__ + j * a_dim1];
temp2 += a[l + i__ + j * a_dim1] * x[ix];
/* L110: */
}
y[jy] += *alpha * temp2;
jx += *incx;
jy += *incy;
/* L120: */
}
}
}

/* End of SSBMV . */

} /* ssbmv_ */
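
A hedged sketch of the band storage ssbmv_ expects (UPLO='U', k=1): the diagonal sits in row k+1 of A and the super-diagonal in row k, column-major with lda = k+1. The f2c "datatypes.h" typedefs are assumed; this hypothetical test is not part of the tree:

#include <stdio.h>
#include "datatypes.h"

extern void ssbmv_(char *uplo, integer *n, integer *k, real *alpha,
                   real *a, integer *lda, real *x, integer *incx,
                   real *beta, real *y, integer *incy);

int main(void) {
  integer n = 3, k = 1, lda = 2, inc = 1;
  real alpha = 1.f, beta = 0.f;
  /* Tridiagonal [[2,1,0],[1,2,1],[0,1,2]] in upper band storage;
     the first slot of column 1 is never referenced. */
  real a[6] = {0.f, 2.f, 1.f, 2.f, 1.f, 2.f};
  real x[3] = {1.f, 1.f, 1.f}, y[3];
  char uplo = 'U';
  ssbmv_(&uplo, &n, &k, &alpha, a, &lda, x, &inc, &beta, y, &inc);
  printf("y = %g %g %g\n", y[0], y[1], y[2]); /* expect 3 4 3 */
  return 0;
}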
308
blas/f2c/sspmv.c
@@ -1,308 +0,0 @@
/* sspmv.f -- translated by f2c (version 20100827).
   You must link the resulting object file with libf2c:
   on Microsoft Windows system, link with libf2c.lib;
   on Linux or Unix systems, link with .../path/to/libf2c.a -lm
   or, if you install libf2c.a in a standard place, with -lf2c -lm
   -- in that order, at the end of the command line, as in
   cc *.o -lf2c -lm
   Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,

   http://www.netlib.org/f2c/libf2c.zip
*/

#include "datatypes.h"

/* Subroutine */ void sspmv_(char *uplo, integer *n, real *alpha, real *ap, real *x, integer *incx, real *beta, real *y,
integer *incy) {
/* System generated locals */
integer i__1, i__2;

/* Local variables */
integer i__, j, k, kk, ix, iy, jx, jy, kx, ky, info;
real temp1, temp2;
extern logical lsame_(char *, char *);
extern /* Subroutine */ void xerbla_(const char *, integer *);

/* .. Scalar Arguments .. */
/* .. */
/* .. Array Arguments .. */
/* .. */

/* Purpose */
/* ======= */

/* SSPMV performs the matrix-vector operation */

/* y := alpha*A*x + beta*y, */

/* where alpha and beta are scalars, x and y are n element vectors and */
/* A is an n by n symmetric matrix, supplied in packed form. */

/* Arguments */
/* ========== */

/* UPLO - CHARACTER*1. */
/* On entry, UPLO specifies whether the upper or lower */
/* triangular part of the matrix A is supplied in the packed */
/* array AP as follows: */

/* UPLO = 'U' or 'u' The upper triangular part of A is */
/* supplied in AP. */

/* UPLO = 'L' or 'l' The lower triangular part of A is */
/* supplied in AP. */

/* Unchanged on exit. */

/* N - INTEGER. */
/* On entry, N specifies the order of the matrix A. */
/* N must be at least zero. */
/* Unchanged on exit. */

/* ALPHA - REAL . */
/* On entry, ALPHA specifies the scalar alpha. */
/* Unchanged on exit. */

/* AP - REAL array of DIMENSION at least */
/* ( ( n*( n + 1 ) )/2 ). */
/* Before entry with UPLO = 'U' or 'u', the array AP must */
/* contain the upper triangular part of the symmetric matrix */
/* packed sequentially, column by column, so that AP( 1 ) */
/* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) */
/* and a( 2, 2 ) respectively, and so on. */
/* Before entry with UPLO = 'L' or 'l', the array AP must */
/* contain the lower triangular part of the symmetric matrix */
/* packed sequentially, column by column, so that AP( 1 ) */
/* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) */
/* and a( 3, 1 ) respectively, and so on. */
/* Unchanged on exit. */

/* X - REAL array of dimension at least */
/* ( 1 + ( n - 1 )*abs( INCX ) ). */
/* Before entry, the incremented array X must contain the n */
/* element vector x. */
/* Unchanged on exit. */

/* INCX - INTEGER. */
/* On entry, INCX specifies the increment for the elements of */
/* X. INCX must not be zero. */
/* Unchanged on exit. */

/* BETA - REAL . */
/* On entry, BETA specifies the scalar beta. When BETA is */
/* supplied as zero then Y need not be set on input. */
/* Unchanged on exit. */

/* Y - REAL array of dimension at least */
/* ( 1 + ( n - 1 )*abs( INCY ) ). */
/* Before entry, the incremented array Y must contain the n */
/* element vector y. On exit, Y is overwritten by the updated */
/* vector y. */

/* INCY - INTEGER. */
/* On entry, INCY specifies the increment for the elements of */
/* Y. INCY must not be zero. */
/* Unchanged on exit. */

/* Further Details */
/* =============== */

/* Level 2 Blas routine. */

/* -- Written on 22-October-1986. */
/* Jack Dongarra, Argonne National Lab. */
/* Jeremy Du Croz, Nag Central Office. */
/* Sven Hammarling, Nag Central Office. */
/* Richard Hanson, Sandia National Labs. */

/* ===================================================================== */

/* .. Parameters .. */
/* .. */
/* .. Local Scalars .. */
/* .. */
/* .. External Functions .. */
/* .. */
/* .. External Subroutines .. */
/* .. */

/* Test the input parameters. */

/* Parameter adjustments */
--y;
--x;
--ap;

/* Function Body */
info = 0;
if (!lsame_(uplo, "U") && !lsame_(uplo, "L")) {
info = 1;
} else if (*n < 0) {
info = 2;
} else if (*incx == 0) {
info = 6;
} else if (*incy == 0) {
info = 9;
}
if (info != 0) {
xerbla_("SSPMV ", &info);
return;
}

/* Quick return if possible. */

if (*n == 0 || (*alpha == 0.f && *beta == 1.f)) {
return;
}

/* Set up the start points in X and Y. */

if (*incx > 0) {
kx = 1;
} else {
kx = 1 - (*n - 1) * *incx;
}
if (*incy > 0) {
ky = 1;
} else {
ky = 1 - (*n - 1) * *incy;
}

/* Start the operations. In this version the elements of the array AP */
/* are accessed sequentially with one pass through AP. */

/* First form y := beta*y. */

if (*beta != 1.f) {
if (*incy == 1) {
if (*beta == 0.f) {
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
y[i__] = 0.f;
/* L10: */
}
} else {
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
y[i__] = *beta * y[i__];
/* L20: */
}
}
} else {
iy = ky;
if (*beta == 0.f) {
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
y[iy] = 0.f;
iy += *incy;
/* L30: */
}
} else {
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
y[iy] = *beta * y[iy];
iy += *incy;
/* L40: */
}
}
}
}
if (*alpha == 0.f) {
return;
}
kk = 1;
if (lsame_(uplo, "U")) {
/* Form y when AP contains the upper triangle. */

if (*incx == 1 && *incy == 1) {
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
temp1 = *alpha * x[j];
temp2 = 0.f;
k = kk;
i__2 = j - 1;
for (i__ = 1; i__ <= i__2; ++i__) {
y[i__] += temp1 * ap[k];
temp2 += ap[k] * x[i__];
++k;
/* L50: */
}
y[j] = y[j] + temp1 * ap[kk + j - 1] + *alpha * temp2;
kk += j;
/* L60: */
}
} else {
jx = kx;
jy = ky;
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
temp1 = *alpha * x[jx];
temp2 = 0.f;
ix = kx;
iy = ky;
i__2 = kk + j - 2;
for (k = kk; k <= i__2; ++k) {
y[iy] += temp1 * ap[k];
temp2 += ap[k] * x[ix];
ix += *incx;
iy += *incy;
/* L70: */
}
y[jy] = y[jy] + temp1 * ap[kk + j - 1] + *alpha * temp2;
jx += *incx;
jy += *incy;
kk += j;
/* L80: */
}
}
} else {
/* Form y when AP contains the lower triangle. */

if (*incx == 1 && *incy == 1) {
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
temp1 = *alpha * x[j];
temp2 = 0.f;
y[j] += temp1 * ap[kk];
k = kk + 1;
i__2 = *n;
for (i__ = j + 1; i__ <= i__2; ++i__) {
y[i__] += temp1 * ap[k];
temp2 += ap[k] * x[i__];
++k;
/* L90: */
}
y[j] += *alpha * temp2;
kk += *n - j + 1;
/* L100: */
}
} else {
jx = kx;
jy = ky;
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
temp1 = *alpha * x[jx];
temp2 = 0.f;
y[jy] += temp1 * ap[kk];
ix = jx;
iy = jy;
i__2 = kk + *n - j;
for (k = kk + 1; k <= i__2; ++k) {
ix += *incx;
iy += *incy;
y[iy] += temp1 * ap[k];
temp2 += ap[k] * x[ix];
/* L110: */
}
y[jy] += *alpha * temp2;
jx += *incx;
jy += *incy;
kk += *n - j + 1;
/* L120: */
}
}
}

/* End of SSPMV . */

} /* sspmv_ */
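
A hedged sketch of the packed layout sspmv_ expects (UPLO='U'): AP holds the upper triangle column by column, so AP = {a11, a12, a22, a13, a23, a33}. Again assuming the f2c "datatypes.h" typedefs, with illustrative values:

#include <stdio.h>
#include "datatypes.h"

extern void sspmv_(char *uplo, integer *n, real *alpha, real *ap,
                   real *x, integer *incx, real *beta, real *y,
                   integer *incy);

int main(void) {
  integer n = 3, inc = 1;
  real alpha = 1.f, beta = 0.f;
  /* Packed upper triangle of [[2,1,0],[1,2,1],[0,1,2]]. */
  real ap[6] = {2.f, 1.f, 2.f, 0.f, 1.f, 2.f};
  real x[3] = {1.f, 1.f, 1.f}, y[3];
  char uplo = 'U';
  sspmv_(&uplo, &n, &alpha, ap, x, &inc, &beta, y, &inc);
  printf("y = %g %g %g\n", y[0], y[1], y[2]); /* expect 3 4 3 */
  return 0;
}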
417
blas/f2c/stbmv.c
@@ -1,417 +0,0 @@
/* stbmv.f -- translated by f2c (version 20100827).
   You must link the resulting object file with libf2c:
   on Microsoft Windows system, link with libf2c.lib;
   on Linux or Unix systems, link with .../path/to/libf2c.a -lm
   or, if you install libf2c.a in a standard place, with -lf2c -lm
   -- in that order, at the end of the command line, as in
   cc *.o -lf2c -lm
   Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,

   http://www.netlib.org/f2c/libf2c.zip
*/

#include "datatypes.h"

/* Subroutine */ void stbmv_(char *uplo, char *trans, char *diag, integer *n, integer *k, real *a, integer *lda,
real *x, integer *incx) {
/* System generated locals */
integer a_dim1, a_offset, i__1, i__2, i__3, i__4;

/* Local variables */
integer i__, j, l, ix, jx, kx, info;
real temp;
extern logical lsame_(char *, char *);
integer kplus1;
extern /* Subroutine */ void xerbla_(const char *, integer *);
logical nounit;

/* .. Scalar Arguments .. */
/* .. */
/* .. Array Arguments .. */
/* .. */

/* Purpose */
/* ======= */

/* STBMV performs one of the matrix-vector operations */

/* x := A*x, or x := A'*x, */

/* where x is an n element vector and A is an n by n unit, or non-unit, */
/* upper or lower triangular band matrix, with ( k + 1 ) diagonals. */

/* Arguments */
/* ========== */

/* UPLO - CHARACTER*1. */
/* On entry, UPLO specifies whether the matrix is an upper or */
/* lower triangular matrix as follows: */

/* UPLO = 'U' or 'u' A is an upper triangular matrix. */

/* UPLO = 'L' or 'l' A is a lower triangular matrix. */

/* Unchanged on exit. */

/* TRANS - CHARACTER*1. */
/* On entry, TRANS specifies the operation to be performed as */
/* follows: */

/* TRANS = 'N' or 'n' x := A*x. */

/* TRANS = 'T' or 't' x := A'*x. */

/* TRANS = 'C' or 'c' x := A'*x. */

/* Unchanged on exit. */

/* DIAG - CHARACTER*1. */
/* On entry, DIAG specifies whether or not A is unit */
/* triangular as follows: */

/* DIAG = 'U' or 'u' A is assumed to be unit triangular. */

/* DIAG = 'N' or 'n' A is not assumed to be unit */
/* triangular. */

/* Unchanged on exit. */

/* N - INTEGER. */
/* On entry, N specifies the order of the matrix A. */
/* N must be at least zero. */
/* Unchanged on exit. */

/* K - INTEGER. */
/* On entry with UPLO = 'U' or 'u', K specifies the number of */
/* super-diagonals of the matrix A. */
/* On entry with UPLO = 'L' or 'l', K specifies the number of */
/* sub-diagonals of the matrix A. */
/* K must satisfy 0 .le. K. */
/* Unchanged on exit. */

/* A - REAL array of DIMENSION ( LDA, n ). */
/* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */
/* by n part of the array A must contain the upper triangular */
/* band part of the matrix of coefficients, supplied column by */
/* column, with the leading diagonal of the matrix in row */
/* ( k + 1 ) of the array, the first super-diagonal starting at */
/* position 2 in row k, and so on. The top left k by k triangle */
/* of the array A is not referenced. */
/* The following program segment will transfer an upper */
/* triangular band matrix from conventional full matrix storage */
/* to band storage: */

/* DO 20, J = 1, N */
/* M = K + 1 - J */
/* DO 10, I = MAX( 1, J - K ), J */
/* A( M + I, J ) = matrix( I, J ) */
/* 10 CONTINUE */
/* 20 CONTINUE */

/* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */
/* by n part of the array A must contain the lower triangular */
/* band part of the matrix of coefficients, supplied column by */
/* column, with the leading diagonal of the matrix in row 1 of */
/* the array, the first sub-diagonal starting at position 1 in */
/* row 2, and so on. The bottom right k by k triangle of the */
/* array A is not referenced. */
/* The following program segment will transfer a lower */
/* triangular band matrix from conventional full matrix storage */
/* to band storage: */

/* DO 20, J = 1, N */
/* M = 1 - J */
/* DO 10, I = J, MIN( N, J + K ) */
/* A( M + I, J ) = matrix( I, J ) */
/* 10 CONTINUE */
/* 20 CONTINUE */

/* Note that when DIAG = 'U' or 'u' the elements of the array A */
/* corresponding to the diagonal elements of the matrix are not */
/* referenced, but are assumed to be unity. */
/* Unchanged on exit. */

/* LDA - INTEGER. */
/* On entry, LDA specifies the first dimension of A as declared */
/* in the calling (sub) program. LDA must be at least */
/* ( k + 1 ). */
/* Unchanged on exit. */

/* X - REAL array of dimension at least */
/* ( 1 + ( n - 1 )*abs( INCX ) ). */
/* Before entry, the incremented array X must contain the n */
/* element vector x. On exit, X is overwritten with the */
/* transformed vector x. */

/* INCX - INTEGER. */
/* On entry, INCX specifies the increment for the elements of */
/* X. INCX must not be zero. */
/* Unchanged on exit. */

/* Further Details */
/* =============== */

/* Level 2 Blas routine. */

/* -- Written on 22-October-1986. */
/* Jack Dongarra, Argonne National Lab. */
/* Jeremy Du Croz, Nag Central Office. */
/* Sven Hammarling, Nag Central Office. */
/* Richard Hanson, Sandia National Labs. */

/* ===================================================================== */

/* .. Parameters .. */
/* .. */
/* .. Local Scalars .. */
/* .. */
/* .. External Functions .. */
/* .. */
/* .. External Subroutines .. */
/* .. */
/* .. Intrinsic Functions .. */
/* .. */

/* Test the input parameters. */

/* Parameter adjustments */
a_dim1 = *lda;
a_offset = 1 + a_dim1;
a -= a_offset;
--x;

/* Function Body */
info = 0;
if (!lsame_(uplo, "U") && !lsame_(uplo, "L")) {
info = 1;
} else if (!lsame_(trans, "N") && !lsame_(trans, "T") && !lsame_(trans, "C")) {
info = 2;
} else if (!lsame_(diag, "U") && !lsame_(diag, "N")) {
info = 3;
} else if (*n < 0) {
info = 4;
} else if (*k < 0) {
info = 5;
} else if (*lda < *k + 1) {
info = 7;
} else if (*incx == 0) {
info = 9;
}
if (info != 0) {
xerbla_("STBMV ", &info);
return;
}

/* Quick return if possible. */

if (*n == 0) {
return;
}

nounit = lsame_(diag, "N");

/* Set up the start point in X if the increment is not unity. This */
/* will be ( N - 1 )*INCX too small for descending loops. */

if (*incx <= 0) {
kx = 1 - (*n - 1) * *incx;
} else if (*incx != 1) {
kx = 1;
}

/* Start the operations. In this version the elements of A are */
/* accessed sequentially with one pass through A. */

if (lsame_(trans, "N")) {
/* Form x := A*x. */

if (lsame_(uplo, "U")) {
kplus1 = *k + 1;
if (*incx == 1) {
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
if (x[j] != 0.f) {
temp = x[j];
l = kplus1 - j;
/* Computing MAX */
i__2 = 1, i__3 = j - *k;
i__4 = j - 1;
for (i__ = max(i__2, i__3); i__ <= i__4; ++i__) {
x[i__] += temp * a[l + i__ + j * a_dim1];
/* L10: */
}
if (nounit) {
x[j] *= a[kplus1 + j * a_dim1];
}
}
/* L20: */
}
} else {
jx = kx;
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
if (x[jx] != 0.f) {
temp = x[jx];
ix = kx;
l = kplus1 - j;
/* Computing MAX */
i__4 = 1, i__2 = j - *k;
i__3 = j - 1;
for (i__ = max(i__4, i__2); i__ <= i__3; ++i__) {
x[ix] += temp * a[l + i__ + j * a_dim1];
ix += *incx;
/* L30: */
}
if (nounit) {
x[jx] *= a[kplus1 + j * a_dim1];
}
}
jx += *incx;
if (j > *k) {
kx += *incx;
}
/* L40: */
}
}
} else {
if (*incx == 1) {
for (j = *n; j >= 1; --j) {
if (x[j] != 0.f) {
temp = x[j];
l = 1 - j;
/* Computing MIN */
i__1 = *n, i__3 = j + *k;
i__4 = j + 1;
for (i__ = min(i__1, i__3); i__ >= i__4; --i__) {
x[i__] += temp * a[l + i__ + j * a_dim1];
/* L50: */
}
if (nounit) {
x[j] *= a[j * a_dim1 + 1];
}
}
/* L60: */
}
} else {
kx += (*n - 1) * *incx;
jx = kx;
for (j = *n; j >= 1; --j) {
if (x[jx] != 0.f) {
temp = x[jx];
ix = kx;
l = 1 - j;
/* Computing MIN */
i__4 = *n, i__1 = j + *k;
i__3 = j + 1;
for (i__ = min(i__4, i__1); i__ >= i__3; --i__) {
x[ix] += temp * a[l + i__ + j * a_dim1];
ix -= *incx;
/* L70: */
}
if (nounit) {
x[jx] *= a[j * a_dim1 + 1];
}
}
jx -= *incx;
if (*n - j >= *k) {
kx -= *incx;
}
/* L80: */
}
}
}
} else {
/* Form x := A'*x. */

if (lsame_(uplo, "U")) {
kplus1 = *k + 1;
if (*incx == 1) {
for (j = *n; j >= 1; --j) {
temp = x[j];
l = kplus1 - j;
if (nounit) {
temp *= a[kplus1 + j * a_dim1];
}
/* Computing MAX */
i__4 = 1, i__1 = j - *k;
i__3 = max(i__4, i__1);
for (i__ = j - 1; i__ >= i__3; --i__) {
temp += a[l + i__ + j * a_dim1] * x[i__];
/* L90: */
}
x[j] = temp;
/* L100: */
}
} else {
kx += (*n - 1) * *incx;
jx = kx;
for (j = *n; j >= 1; --j) {
temp = x[jx];
kx -= *incx;
ix = kx;
l = kplus1 - j;
if (nounit) {
temp *= a[kplus1 + j * a_dim1];
}
/* Computing MAX */
i__4 = 1, i__1 = j - *k;
i__3 = max(i__4, i__1);
for (i__ = j - 1; i__ >= i__3; --i__) {
temp += a[l + i__ + j * a_dim1] * x[ix];
ix -= *incx;
/* L110: */
}
x[jx] = temp;
jx -= *incx;
/* L120: */
}
}
} else {
if (*incx == 1) {
i__3 = *n;
for (j = 1; j <= i__3; ++j) {
temp = x[j];
l = 1 - j;
if (nounit) {
temp *= a[j * a_dim1 + 1];
}
/* Computing MIN */
i__1 = *n, i__2 = j + *k;
i__4 = min(i__1, i__2);
for (i__ = j + 1; i__ <= i__4; ++i__) {
temp += a[l + i__ + j * a_dim1] * x[i__];
/* L130: */
}
x[j] = temp;
/* L140: */
}
} else {
jx = kx;
i__3 = *n;
for (j = 1; j <= i__3; ++j) {
temp = x[jx];
kx += *incx;
ix = kx;
l = 1 - j;
if (nounit) {
temp *= a[j * a_dim1 + 1];
}
/* Computing MIN */
i__1 = *n, i__2 = j + *k;
i__4 = min(i__1, i__2);
for (i__ = j + 1; i__ <= i__4; ++i__) {
temp += a[l + i__ + j * a_dim1] * x[ix];
ix += *incx;
/* L150: */
}
x[jx] = temp;
jx += *incx;
/* L160: */
}
}
}
}

/* End of STBMV . */

} /* stbmv_ */
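
A hedged sketch for stbmv_ (UPLO='U', TRANS='N', DIAG='N', k=1): it overwrites x with A*x for an upper bidiagonal band matrix laid out as in the comment block above. The f2c "datatypes.h" typedefs are assumed:

#include <stdio.h>
#include "datatypes.h"

extern void stbmv_(char *uplo, char *trans, char *diag, integer *n,
                   integer *k, real *a, integer *lda, real *x,
                   integer *incx);

int main(void) {
  integer n = 3, k = 1, lda = 2, inc = 1;
  /* [[2,1,0],[0,2,1],[0,0,2]] in upper band storage; the first
     slot of column 1 is never referenced. */
  real a[6] = {0.f, 2.f, 1.f, 2.f, 1.f, 2.f};
  real x[3] = {1.f, 1.f, 1.f};
  char uplo = 'U', trans = 'N', diag = 'N';
  stbmv_(&uplo, &trans, &diag, &n, &k, a, &lda, x, &inc);
  printf("x = %g %g %g\n", x[0], x[1], x[2]); /* expect 3 3 2 */
  return 0;
}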
456
blas/f2c/zhbmv.c
@@ -1,456 +0,0 @@
/* zhbmv.f -- translated by f2c (version 20100827).
   You must link the resulting object file with libf2c:
   on Microsoft Windows system, link with libf2c.lib;
   on Linux or Unix systems, link with .../path/to/libf2c.a -lm
   or, if you install libf2c.a in a standard place, with -lf2c -lm
   -- in that order, at the end of the command line, as in
   cc *.o -lf2c -lm
   Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,

   http://www.netlib.org/f2c/libf2c.zip
*/

#include "datatypes.h"

static inline void d_cnjg(doublecomplex *r, doublecomplex *z) {
r->r = z->r;
r->i = -(z->i);
}

/* Subroutine */ void zhbmv_(char *uplo, integer *n, integer *k, doublecomplex *alpha, doublecomplex *a, integer *lda,
doublecomplex *x, integer *incx, doublecomplex *beta, doublecomplex *y, integer *incy) {
/* System generated locals */
integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5;
doublereal d__1;
doublecomplex z__1, z__2, z__3, z__4;

/* Local variables */
integer i__, j, l, ix, iy, jx, jy, kx, ky, info;
doublecomplex temp1, temp2;
extern logical lsame_(char *, char *);
integer kplus1;
extern /* Subroutine */ void xerbla_(const char *, integer *);

/* .. Scalar Arguments .. */
/* .. */
/* .. Array Arguments .. */
/* .. */

/* Purpose */
/* ======= */

/* ZHBMV performs the matrix-vector operation */

/* y := alpha*A*x + beta*y, */

/* where alpha and beta are scalars, x and y are n element vectors and */
/* A is an n by n hermitian band matrix, with k super-diagonals. */

/* Arguments */
/* ========== */

/* UPLO - CHARACTER*1. */
/* On entry, UPLO specifies whether the upper or lower */
/* triangular part of the band matrix A is being supplied as */
/* follows: */

/* UPLO = 'U' or 'u' The upper triangular part of A is */
/* being supplied. */

/* UPLO = 'L' or 'l' The lower triangular part of A is */
/* being supplied. */

/* Unchanged on exit. */

/* N - INTEGER. */
/* On entry, N specifies the order of the matrix A. */
/* N must be at least zero. */
/* Unchanged on exit. */

/* K - INTEGER. */
/* On entry, K specifies the number of super-diagonals of the */
/* matrix A. K must satisfy 0 .le. K. */
/* Unchanged on exit. */

/* ALPHA - COMPLEX*16 . */
/* On entry, ALPHA specifies the scalar alpha. */
/* Unchanged on exit. */

/* A - COMPLEX*16 array of DIMENSION ( LDA, n ). */
/* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */
/* by n part of the array A must contain the upper triangular */
/* band part of the hermitian matrix, supplied column by */
/* column, with the leading diagonal of the matrix in row */
/* ( k + 1 ) of the array, the first super-diagonal starting at */
/* position 2 in row k, and so on. The top left k by k triangle */
/* of the array A is not referenced. */
/* The following program segment will transfer the upper */
/* triangular part of a hermitian band matrix from conventional */
/* full matrix storage to band storage: */

/* DO 20, J = 1, N */
/* M = K + 1 - J */
/* DO 10, I = MAX( 1, J - K ), J */
/* A( M + I, J ) = matrix( I, J ) */
/* 10 CONTINUE */
/* 20 CONTINUE */

/* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */
/* by n part of the array A must contain the lower triangular */
/* band part of the hermitian matrix, supplied column by */
/* column, with the leading diagonal of the matrix in row 1 of */
/* the array, the first sub-diagonal starting at position 1 in */
/* row 2, and so on. The bottom right k by k triangle of the */
/* array A is not referenced. */
/* The following program segment will transfer the lower */
/* triangular part of a hermitian band matrix from conventional */
/* full matrix storage to band storage: */

/* DO 20, J = 1, N */
/* M = 1 - J */
/* DO 10, I = J, MIN( N, J + K ) */
/* A( M + I, J ) = matrix( I, J ) */
/* 10 CONTINUE */
/* 20 CONTINUE */

/* Note that the imaginary parts of the diagonal elements need */
/* not be set and are assumed to be zero. */
/* Unchanged on exit. */

/* LDA - INTEGER. */
/* On entry, LDA specifies the first dimension of A as declared */
/* in the calling (sub) program. LDA must be at least */
/* ( k + 1 ). */
/* Unchanged on exit. */

/* X - COMPLEX*16 array of DIMENSION at least */
/* ( 1 + ( n - 1 )*abs( INCX ) ). */
/* Before entry, the incremented array X must contain the */
/* vector x. */
/* Unchanged on exit. */

/* INCX - INTEGER. */
/* On entry, INCX specifies the increment for the elements of */
/* X. INCX must not be zero. */
/* Unchanged on exit. */

/* BETA - COMPLEX*16 . */
/* On entry, BETA specifies the scalar beta. */
/* Unchanged on exit. */

/* Y - COMPLEX*16 array of DIMENSION at least */
/* ( 1 + ( n - 1 )*abs( INCY ) ). */
/* Before entry, the incremented array Y must contain the */
/* vector y. On exit, Y is overwritten by the updated vector y. */

/* INCY - INTEGER. */
/* On entry, INCY specifies the increment for the elements of */
/* Y. INCY must not be zero. */
/* Unchanged on exit. */

/* Further Details */
/* =============== */

/* Level 2 Blas routine. */

/* -- Written on 22-October-1986. */
/* Jack Dongarra, Argonne National Lab. */
/* Jeremy Du Croz, Nag Central Office. */
/* Sven Hammarling, Nag Central Office. */
/* Richard Hanson, Sandia National Labs. */

/* ===================================================================== */

/* .. Parameters .. */
/* .. */
/* .. Local Scalars .. */
/* .. */
/* .. External Functions .. */
/* .. */
/* .. External Subroutines .. */
/* .. */
/* .. Intrinsic Functions .. */
/* .. */

/* Test the input parameters. */

/* Parameter adjustments */
a_dim1 = *lda;
a_offset = 1 + a_dim1;
a -= a_offset;
--x;
--y;

/* Function Body */
info = 0;
if (!lsame_(uplo, "U") && !lsame_(uplo, "L")) {
info = 1;
} else if (*n < 0) {
info = 2;
} else if (*k < 0) {
info = 3;
} else if (*lda < *k + 1) {
info = 6;
} else if (*incx == 0) {
info = 8;
} else if (*incy == 0) {
info = 11;
}
if (info != 0) {
xerbla_("ZHBMV ", &info);
return;
}

/* Quick return if possible. */

if (*n == 0 || (alpha->r == 0. && alpha->i == 0. && (beta->r == 1. && beta->i == 0.))) {
return;
}

/* Set up the start points in X and Y. */

if (*incx > 0) {
kx = 1;
} else {
kx = 1 - (*n - 1) * *incx;
}
if (*incy > 0) {
ky = 1;
} else {
ky = 1 - (*n - 1) * *incy;
}

/* Start the operations. In this version the elements of the array A */
/* are accessed sequentially with one pass through A. */

/* First form y := beta*y. */

if (beta->r != 1. || beta->i != 0.) {
if (*incy == 1) {
if (beta->r == 0. && beta->i == 0.) {
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
i__2 = i__;
y[i__2].r = 0., y[i__2].i = 0.;
/* L10: */
}
} else {
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
i__2 = i__;
i__3 = i__;
z__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i, z__1.i = beta->r * y[i__3].i + beta->i * y[i__3].r;
y[i__2].r = z__1.r, y[i__2].i = z__1.i;
/* L20: */
}
}
} else {
iy = ky;
if (beta->r == 0. && beta->i == 0.) {
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
i__2 = iy;
y[i__2].r = 0., y[i__2].i = 0.;
iy += *incy;
/* L30: */
}
} else {
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
i__2 = iy;
i__3 = iy;
z__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i, z__1.i = beta->r * y[i__3].i + beta->i * y[i__3].r;
y[i__2].r = z__1.r, y[i__2].i = z__1.i;
iy += *incy;
/* L40: */
}
}
}
}
if (alpha->r == 0. && alpha->i == 0.) {
return;
}
if (lsame_(uplo, "U")) {
/* Form y when upper triangle of A is stored. */

kplus1 = *k + 1;
if (*incx == 1 && *incy == 1) {
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
i__2 = j;
z__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, z__1.i = alpha->r * x[i__2].i + alpha->i * x[i__2].r;
temp1.r = z__1.r, temp1.i = z__1.i;
temp2.r = 0., temp2.i = 0.;
l = kplus1 - j;
/* Computing MAX */
i__2 = 1, i__3 = j - *k;
i__4 = j - 1;
for (i__ = max(i__2, i__3); i__ <= i__4; ++i__) {
i__2 = i__;
i__3 = i__;
i__5 = l + i__ + j * a_dim1;
z__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i, z__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5].r;
z__1.r = y[i__3].r + z__2.r, z__1.i = y[i__3].i + z__2.i;
y[i__2].r = z__1.r, y[i__2].i = z__1.i;
d_cnjg(&z__3, &a[l + i__ + j * a_dim1]);
i__2 = i__;
z__2.r = z__3.r * x[i__2].r - z__3.i * x[i__2].i, z__2.i = z__3.r * x[i__2].i + z__3.i * x[i__2].r;
z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i;
temp2.r = z__1.r, temp2.i = z__1.i;
/* L50: */
}
i__4 = j;
i__2 = j;
i__3 = kplus1 + j * a_dim1;
d__1 = a[i__3].r;
z__3.r = d__1 * temp1.r, z__3.i = d__1 * temp1.i;
z__2.r = y[i__2].r + z__3.r, z__2.i = y[i__2].i + z__3.i;
z__4.r = alpha->r * temp2.r - alpha->i * temp2.i, z__4.i = alpha->r * temp2.i + alpha->i * temp2.r;
z__1.r = z__2.r + z__4.r, z__1.i = z__2.i + z__4.i;
y[i__4].r = z__1.r, y[i__4].i = z__1.i;
/* L60: */
}
} else {
jx = kx;
jy = ky;
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
i__4 = jx;
z__1.r = alpha->r * x[i__4].r - alpha->i * x[i__4].i, z__1.i = alpha->r * x[i__4].i + alpha->i * x[i__4].r;
temp1.r = z__1.r, temp1.i = z__1.i;
temp2.r = 0., temp2.i = 0.;
ix = kx;
iy = ky;
l = kplus1 - j;
/* Computing MAX */
i__4 = 1, i__2 = j - *k;
i__3 = j - 1;
for (i__ = max(i__4, i__2); i__ <= i__3; ++i__) {
i__4 = iy;
i__2 = iy;
i__5 = l + i__ + j * a_dim1;
z__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i, z__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5].r;
z__1.r = y[i__2].r + z__2.r, z__1.i = y[i__2].i + z__2.i;
y[i__4].r = z__1.r, y[i__4].i = z__1.i;
d_cnjg(&z__3, &a[l + i__ + j * a_dim1]);
i__4 = ix;
z__2.r = z__3.r * x[i__4].r - z__3.i * x[i__4].i, z__2.i = z__3.r * x[i__4].i + z__3.i * x[i__4].r;
z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i;
temp2.r = z__1.r, temp2.i = z__1.i;
ix += *incx;
iy += *incy;
/* L70: */
}
i__3 = jy;
i__4 = jy;
i__2 = kplus1 + j * a_dim1;
d__1 = a[i__2].r;
z__3.r = d__1 * temp1.r, z__3.i = d__1 * temp1.i;
z__2.r = y[i__4].r + z__3.r, z__2.i = y[i__4].i + z__3.i;
z__4.r = alpha->r * temp2.r - alpha->i * temp2.i, z__4.i = alpha->r * temp2.i + alpha->i * temp2.r;
z__1.r = z__2.r + z__4.r, z__1.i = z__2.i + z__4.i;
y[i__3].r = z__1.r, y[i__3].i = z__1.i;
jx += *incx;
jy += *incy;
if (j > *k) {
kx += *incx;
ky += *incy;
}
/* L80: */
}
}
} else {
/* Form y when lower triangle of A is stored. */

if (*incx == 1 && *incy == 1) {
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
i__3 = j;
z__1.r = alpha->r * x[i__3].r - alpha->i * x[i__3].i, z__1.i = alpha->r * x[i__3].i + alpha->i * x[i__3].r;
temp1.r = z__1.r, temp1.i = z__1.i;
temp2.r = 0., temp2.i = 0.;
i__3 = j;
i__4 = j;
i__2 = j * a_dim1 + 1;
d__1 = a[i__2].r;
z__2.r = d__1 * temp1.r, z__2.i = d__1 * temp1.i;
z__1.r = y[i__4].r + z__2.r, z__1.i = y[i__4].i + z__2.i;
y[i__3].r = z__1.r, y[i__3].i = z__1.i;
l = 1 - j;
/* Computing MIN */
i__4 = *n, i__2 = j + *k;
i__3 = min(i__4, i__2);
for (i__ = j + 1; i__ <= i__3; ++i__) {
i__4 = i__;
i__2 = i__;
i__5 = l + i__ + j * a_dim1;
z__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i, z__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5].r;
z__1.r = y[i__2].r + z__2.r, z__1.i = y[i__2].i + z__2.i;
y[i__4].r = z__1.r, y[i__4].i = z__1.i;
d_cnjg(&z__3, &a[l + i__ + j * a_dim1]);
i__4 = i__;
z__2.r = z__3.r * x[i__4].r - z__3.i * x[i__4].i, z__2.i = z__3.r * x[i__4].i + z__3.i * x[i__4].r;
z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i;
temp2.r = z__1.r, temp2.i = z__1.i;
/* L90: */
}
i__3 = j;
i__4 = j;
z__2.r = alpha->r * temp2.r - alpha->i * temp2.i, z__2.i = alpha->r * temp2.i + alpha->i * temp2.r;
z__1.r = y[i__4].r + z__2.r, z__1.i = y[i__4].i + z__2.i;
y[i__3].r = z__1.r, y[i__3].i = z__1.i;
/* L100: */
}
} else {
jx = kx;
jy = ky;
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
i__3 = jx;
z__1.r = alpha->r * x[i__3].r - alpha->i * x[i__3].i, z__1.i = alpha->r * x[i__3].i + alpha->i * x[i__3].r;
temp1.r = z__1.r, temp1.i = z__1.i;
temp2.r = 0., temp2.i = 0.;
i__3 = jy;
i__4 = jy;
i__2 = j * a_dim1 + 1;
d__1 = a[i__2].r;
z__2.r = d__1 * temp1.r, z__2.i = d__1 * temp1.i;
z__1.r = y[i__4].r + z__2.r, z__1.i = y[i__4].i + z__2.i;
|
||||
y[i__3].r = z__1.r, y[i__3].i = z__1.i;
|
||||
l = 1 - j;
|
||||
ix = jx;
|
||||
iy = jy;
|
||||
/* Computing MIN */
|
||||
i__4 = *n, i__2 = j + *k;
|
||||
i__3 = min(i__4, i__2);
|
||||
for (i__ = j + 1; i__ <= i__3; ++i__) {
|
||||
ix += *incx;
|
||||
iy += *incy;
|
||||
i__4 = iy;
|
||||
i__2 = iy;
|
||||
i__5 = l + i__ + j * a_dim1;
|
||||
z__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i, z__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5].r;
|
||||
z__1.r = y[i__2].r + z__2.r, z__1.i = y[i__2].i + z__2.i;
|
||||
y[i__4].r = z__1.r, y[i__4].i = z__1.i;
|
||||
d_cnjg(&z__3, &a[l + i__ + j * a_dim1]);
|
||||
i__4 = ix;
|
||||
z__2.r = z__3.r * x[i__4].r - z__3.i * x[i__4].i, z__2.i = z__3.r * x[i__4].i + z__3.i * x[i__4].r;
|
||||
z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i;
|
||||
temp2.r = z__1.r, temp2.i = z__1.i;
|
||||
/* L110: */
|
||||
}
|
||||
i__3 = jy;
|
||||
i__4 = jy;
|
||||
z__2.r = alpha->r * temp2.r - alpha->i * temp2.i, z__2.i = alpha->r * temp2.i + alpha->i * temp2.r;
|
||||
z__1.r = y[i__4].r + z__2.r, z__1.i = y[i__4].i + z__2.i;
|
||||
y[i__3].r = z__1.r, y[i__3].i = z__1.i;
|
||||
jx += *incx;
|
||||
jy += *incy;
|
||||
/* L120: */
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* End of ZHBMV . */
|
||||
|
||||
} /* zhbmv_ */
|
||||
blas/f2c/zhpmv.c
@@ -1,407 +0,0 @@
/* zhpmv.f -- translated by f2c (version 20100827).
You must link the resulting object file with libf2c:
on Microsoft Windows system, link with libf2c.lib;
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
or, if you install libf2c.a in a standard place, with -lf2c -lm
-- in that order, at the end of the command line, as in
cc *.o -lf2c -lm
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,

http://www.netlib.org/f2c/libf2c.zip
*/

#include "datatypes.h"

static inline void d_cnjg(doublecomplex *r, doublecomplex *z) {
r->r = z->r;
r->i = -(z->i);
}

/* Subroutine */ void zhpmv_(char *uplo, integer *n, doublecomplex *alpha, doublecomplex *ap, doublecomplex *x,
integer *incx, doublecomplex *beta, doublecomplex *y, integer *incy) {
/* System generated locals */
integer i__1, i__2, i__3, i__4, i__5;
doublereal d__1;
doublecomplex z__1, z__2, z__3, z__4;

/* Local variables */
integer i__, j, k, kk, ix, iy, jx, jy, kx, ky, info;
doublecomplex temp1, temp2;
extern logical lsame_(char *, char *);
extern /* Subroutine */ void xerbla_(const char *, integer *);

/* .. Scalar Arguments .. */
/* .. */
/* .. Array Arguments .. */
/* .. */

/* Purpose */
/* ======= */

/* ZHPMV performs the matrix-vector operation */

/* y := alpha*A*x + beta*y, */

/* where alpha and beta are scalars, x and y are n element vectors and */
/* A is an n by n hermitian matrix, supplied in packed form. */

/* Arguments */
/* ========== */

/* UPLO - CHARACTER*1. */
/* On entry, UPLO specifies whether the upper or lower */
/* triangular part of the matrix A is supplied in the packed */
/* array AP as follows: */

/* UPLO = 'U' or 'u' The upper triangular part of A is */
/* supplied in AP. */

/* UPLO = 'L' or 'l' The lower triangular part of A is */
/* supplied in AP. */

/* Unchanged on exit. */

/* N - INTEGER. */
/* On entry, N specifies the order of the matrix A. */
/* N must be at least zero. */
/* Unchanged on exit. */

/* ALPHA - COMPLEX*16 . */
/* On entry, ALPHA specifies the scalar alpha. */
/* Unchanged on exit. */

/* AP - COMPLEX*16 array of DIMENSION at least */
/* ( ( n*( n + 1 ) )/2 ). */
/* Before entry with UPLO = 'U' or 'u', the array AP must */
/* contain the upper triangular part of the hermitian matrix */
/* packed sequentially, column by column, so that AP( 1 ) */
/* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) */
/* and a( 2, 2 ) respectively, and so on. */
/* Before entry with UPLO = 'L' or 'l', the array AP must */
/* contain the lower triangular part of the hermitian matrix */
/* packed sequentially, column by column, so that AP( 1 ) */
/* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) */
/* and a( 3, 1 ) respectively, and so on. */
/* Note that the imaginary parts of the diagonal elements need */
/* not be set and are assumed to be zero. */
/* Unchanged on exit. */

/* X - COMPLEX*16 array of dimension at least */
/* ( 1 + ( n - 1 )*abs( INCX ) ). */
/* Before entry, the incremented array X must contain the n */
/* element vector x. */
/* Unchanged on exit. */

/* INCX - INTEGER. */
/* On entry, INCX specifies the increment for the elements of */
/* X. INCX must not be zero. */
/* Unchanged on exit. */

/* BETA - COMPLEX*16 . */
/* On entry, BETA specifies the scalar beta. When BETA is */
/* supplied as zero then Y need not be set on input. */
/* Unchanged on exit. */

/* Y - COMPLEX*16 array of dimension at least */
/* ( 1 + ( n - 1 )*abs( INCY ) ). */
/* Before entry, the incremented array Y must contain the n */
/* element vector y. On exit, Y is overwritten by the updated */
/* vector y. */

/* INCY - INTEGER. */
/* On entry, INCY specifies the increment for the elements of */
/* Y. INCY must not be zero. */
/* Unchanged on exit. */

/* Further Details */
/* =============== */

/* Level 2 Blas routine. */

/* -- Written on 22-October-1986. */
/* Jack Dongarra, Argonne National Lab. */
/* Jeremy Du Croz, Nag Central Office. */
/* Sven Hammarling, Nag Central Office. */
/* Richard Hanson, Sandia National Labs. */

/* ===================================================================== */

/* .. Parameters .. */
/* .. */
/* .. Local Scalars .. */
/* .. */
/* .. External Functions .. */
/* .. */
/* .. External Subroutines .. */
/* .. */
/* .. Intrinsic Functions .. */
/* .. */

/* Test the input parameters. */

/* Parameter adjustments */
--y;
--x;
--ap;

/* Function Body */
info = 0;
if (!lsame_(uplo, "U") && !lsame_(uplo, "L")) {
info = 1;
} else if (*n < 0) {
info = 2;
} else if (*incx == 0) {
info = 6;
} else if (*incy == 0) {
info = 9;
}
if (info != 0) {
xerbla_("ZHPMV ", &info);
return;
}

/* Quick return if possible. */

if (*n == 0 || (alpha->r == 0. && alpha->i == 0. && (beta->r == 1. && beta->i == 0.))) {
return;
}

/* Set up the start points in X and Y. */

if (*incx > 0) {
kx = 1;
} else {
kx = 1 - (*n - 1) * *incx;
}
if (*incy > 0) {
ky = 1;
} else {
ky = 1 - (*n - 1) * *incy;
}

/* Start the operations. In this version the elements of the array AP */
/* are accessed sequentially with one pass through AP. */

/* First form y := beta*y. */

if (beta->r != 1. || beta->i != 0.) {
if (*incy == 1) {
if (beta->r == 0. && beta->i == 0.) {
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
i__2 = i__;
y[i__2].r = 0., y[i__2].i = 0.;
/* L10: */
}
} else {
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
i__2 = i__;
i__3 = i__;
z__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i, z__1.i = beta->r * y[i__3].i + beta->i * y[i__3].r;
y[i__2].r = z__1.r, y[i__2].i = z__1.i;
/* L20: */
}
}
} else {
iy = ky;
if (beta->r == 0. && beta->i == 0.) {
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
i__2 = iy;
y[i__2].r = 0., y[i__2].i = 0.;
iy += *incy;
/* L30: */
}
} else {
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
i__2 = iy;
i__3 = iy;
z__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i, z__1.i = beta->r * y[i__3].i + beta->i * y[i__3].r;
y[i__2].r = z__1.r, y[i__2].i = z__1.i;
iy += *incy;
/* L40: */
}
}
}
}
if (alpha->r == 0. && alpha->i == 0.) {
return;
}
kk = 1;
if (lsame_(uplo, "U")) {
/* Form y when AP contains the upper triangle. */

if (*incx == 1 && *incy == 1) {
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
i__2 = j;
z__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, z__1.i = alpha->r * x[i__2].i + alpha->i * x[i__2].r;
temp1.r = z__1.r, temp1.i = z__1.i;
temp2.r = 0., temp2.i = 0.;
k = kk;
i__2 = j - 1;
for (i__ = 1; i__ <= i__2; ++i__) {
i__3 = i__;
i__4 = i__;
i__5 = k;
z__2.r = temp1.r * ap[i__5].r - temp1.i * ap[i__5].i, z__2.i = temp1.r * ap[i__5].i + temp1.i * ap[i__5].r;
z__1.r = y[i__4].r + z__2.r, z__1.i = y[i__4].i + z__2.i;
y[i__3].r = z__1.r, y[i__3].i = z__1.i;
d_cnjg(&z__3, &ap[k]);
i__3 = i__;
z__2.r = z__3.r * x[i__3].r - z__3.i * x[i__3].i, z__2.i = z__3.r * x[i__3].i + z__3.i * x[i__3].r;
z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i;
temp2.r = z__1.r, temp2.i = z__1.i;
++k;
/* L50: */
}
i__2 = j;
i__3 = j;
i__4 = kk + j - 1;
d__1 = ap[i__4].r;
z__3.r = d__1 * temp1.r, z__3.i = d__1 * temp1.i;
z__2.r = y[i__3].r + z__3.r, z__2.i = y[i__3].i + z__3.i;
z__4.r = alpha->r * temp2.r - alpha->i * temp2.i, z__4.i = alpha->r * temp2.i + alpha->i * temp2.r;
z__1.r = z__2.r + z__4.r, z__1.i = z__2.i + z__4.i;
y[i__2].r = z__1.r, y[i__2].i = z__1.i;
kk += j;
/* L60: */
}
} else {
jx = kx;
jy = ky;
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
i__2 = jx;
z__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, z__1.i = alpha->r * x[i__2].i + alpha->i * x[i__2].r;
temp1.r = z__1.r, temp1.i = z__1.i;
temp2.r = 0., temp2.i = 0.;
ix = kx;
iy = ky;
i__2 = kk + j - 2;
for (k = kk; k <= i__2; ++k) {
i__3 = iy;
i__4 = iy;
i__5 = k;
z__2.r = temp1.r * ap[i__5].r - temp1.i * ap[i__5].i, z__2.i = temp1.r * ap[i__5].i + temp1.i * ap[i__5].r;
z__1.r = y[i__4].r + z__2.r, z__1.i = y[i__4].i + z__2.i;
y[i__3].r = z__1.r, y[i__3].i = z__1.i;
d_cnjg(&z__3, &ap[k]);
i__3 = ix;
z__2.r = z__3.r * x[i__3].r - z__3.i * x[i__3].i, z__2.i = z__3.r * x[i__3].i + z__3.i * x[i__3].r;
z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i;
temp2.r = z__1.r, temp2.i = z__1.i;
ix += *incx;
iy += *incy;
/* L70: */
}
i__2 = jy;
i__3 = jy;
i__4 = kk + j - 1;
d__1 = ap[i__4].r;
z__3.r = d__1 * temp1.r, z__3.i = d__1 * temp1.i;
z__2.r = y[i__3].r + z__3.r, z__2.i = y[i__3].i + z__3.i;
z__4.r = alpha->r * temp2.r - alpha->i * temp2.i, z__4.i = alpha->r * temp2.i + alpha->i * temp2.r;
z__1.r = z__2.r + z__4.r, z__1.i = z__2.i + z__4.i;
y[i__2].r = z__1.r, y[i__2].i = z__1.i;
jx += *incx;
jy += *incy;
kk += j;
/* L80: */
}
}
} else {
/* Form y when AP contains the lower triangle. */

if (*incx == 1 && *incy == 1) {
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
i__2 = j;
z__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, z__1.i = alpha->r * x[i__2].i + alpha->i * x[i__2].r;
temp1.r = z__1.r, temp1.i = z__1.i;
temp2.r = 0., temp2.i = 0.;
i__2 = j;
i__3 = j;
i__4 = kk;
d__1 = ap[i__4].r;
z__2.r = d__1 * temp1.r, z__2.i = d__1 * temp1.i;
z__1.r = y[i__3].r + z__2.r, z__1.i = y[i__3].i + z__2.i;
y[i__2].r = z__1.r, y[i__2].i = z__1.i;
k = kk + 1;
i__2 = *n;
for (i__ = j + 1; i__ <= i__2; ++i__) {
i__3 = i__;
i__4 = i__;
i__5 = k;
z__2.r = temp1.r * ap[i__5].r - temp1.i * ap[i__5].i, z__2.i = temp1.r * ap[i__5].i + temp1.i * ap[i__5].r;
z__1.r = y[i__4].r + z__2.r, z__1.i = y[i__4].i + z__2.i;
y[i__3].r = z__1.r, y[i__3].i = z__1.i;
d_cnjg(&z__3, &ap[k]);
i__3 = i__;
z__2.r = z__3.r * x[i__3].r - z__3.i * x[i__3].i, z__2.i = z__3.r * x[i__3].i + z__3.i * x[i__3].r;
z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i;
temp2.r = z__1.r, temp2.i = z__1.i;
++k;
/* L90: */
}
i__2 = j;
i__3 = j;
z__2.r = alpha->r * temp2.r - alpha->i * temp2.i, z__2.i = alpha->r * temp2.i + alpha->i * temp2.r;
z__1.r = y[i__3].r + z__2.r, z__1.i = y[i__3].i + z__2.i;
y[i__2].r = z__1.r, y[i__2].i = z__1.i;
kk += *n - j + 1;
/* L100: */
}
} else {
jx = kx;
jy = ky;
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
i__2 = jx;
z__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, z__1.i = alpha->r * x[i__2].i + alpha->i * x[i__2].r;
temp1.r = z__1.r, temp1.i = z__1.i;
temp2.r = 0., temp2.i = 0.;
i__2 = jy;
i__3 = jy;
i__4 = kk;
d__1 = ap[i__4].r;
z__2.r = d__1 * temp1.r, z__2.i = d__1 * temp1.i;
z__1.r = y[i__3].r + z__2.r, z__1.i = y[i__3].i + z__2.i;
y[i__2].r = z__1.r, y[i__2].i = z__1.i;
ix = jx;
iy = jy;
i__2 = kk + *n - j;
for (k = kk + 1; k <= i__2; ++k) {
ix += *incx;
iy += *incy;
i__3 = iy;
i__4 = iy;
i__5 = k;
z__2.r = temp1.r * ap[i__5].r - temp1.i * ap[i__5].i, z__2.i = temp1.r * ap[i__5].i + temp1.i * ap[i__5].r;
z__1.r = y[i__4].r + z__2.r, z__1.i = y[i__4].i + z__2.i;
y[i__3].r = z__1.r, y[i__3].i = z__1.i;
d_cnjg(&z__3, &ap[k]);
i__3 = ix;
z__2.r = z__3.r * x[i__3].r - z__3.i * x[i__3].i, z__2.i = z__3.r * x[i__3].i + z__3.i * x[i__3].r;
z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i;
temp2.r = z__1.r, temp2.i = z__1.i;
/* L110: */
}
i__2 = jy;
i__3 = jy;
z__2.r = alpha->r * temp2.r - alpha->i * temp2.i, z__2.i = alpha->r * temp2.i + alpha->i * temp2.r;
z__1.r = y[i__3].r + z__2.r, z__1.i = y[i__3].i + z__2.i;
y[i__2].r = z__1.r, y[i__2].i = z__1.i;
jx += *incx;
jy += *incy;
kk += *n - j + 1;
/* L120: */
}
}
}

/* End of ZHPMV . */

} /* zhpmv_ */
blas/f2c/ztbmv.c
@@ -1,586 +0,0 @@
/* ztbmv.f -- translated by f2c (version 20100827).
You must link the resulting object file with libf2c:
on Microsoft Windows system, link with libf2c.lib;
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
or, if you install libf2c.a in a standard place, with -lf2c -lm
-- in that order, at the end of the command line, as in
cc *.o -lf2c -lm
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,

http://www.netlib.org/f2c/libf2c.zip
*/

#include "datatypes.h"

static inline void d_cnjg(doublecomplex *r, doublecomplex *z) {
r->r = z->r;
r->i = -(z->i);
}

/* Subroutine */ void ztbmv_(char *uplo, char *trans, char *diag, integer *n, integer *k, doublecomplex *a,
integer *lda, doublecomplex *x, integer *incx) {
/* System generated locals */
integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5;
doublecomplex z__1, z__2, z__3;

/* Local variables */
integer i__, j, l, ix, jx, kx, info;
doublecomplex temp;
extern logical lsame_(char *, char *);
integer kplus1;
extern /* Subroutine */ void xerbla_(const char *, integer *);
logical noconj, nounit;

/* .. Scalar Arguments .. */
/* .. */
/* .. Array Arguments .. */
/* .. */

/* Purpose */
/* ======= */

/* ZTBMV performs one of the matrix-vector operations */

/* x := A*x, or x := A'*x, or x := conjg( A' )*x, */

/* where x is an n element vector and A is an n by n unit, or non-unit, */
/* upper or lower triangular band matrix, with ( k + 1 ) diagonals. */

/* Arguments */
/* ========== */

/* UPLO - CHARACTER*1. */
/* On entry, UPLO specifies whether the matrix is an upper or */
/* lower triangular matrix as follows: */

/* UPLO = 'U' or 'u' A is an upper triangular matrix. */

/* UPLO = 'L' or 'l' A is a lower triangular matrix. */

/* Unchanged on exit. */

/* TRANS - CHARACTER*1. */
/* On entry, TRANS specifies the operation to be performed as */
/* follows: */

/* TRANS = 'N' or 'n' x := A*x. */

/* TRANS = 'T' or 't' x := A'*x. */

/* TRANS = 'C' or 'c' x := conjg( A' )*x. */

/* Unchanged on exit. */

/* DIAG - CHARACTER*1. */
/* On entry, DIAG specifies whether or not A is unit */
/* triangular as follows: */

/* DIAG = 'U' or 'u' A is assumed to be unit triangular. */

/* DIAG = 'N' or 'n' A is not assumed to be unit */
/* triangular. */

/* Unchanged on exit. */

/* N - INTEGER. */
/* On entry, N specifies the order of the matrix A. */
/* N must be at least zero. */
/* Unchanged on exit. */

/* K - INTEGER. */
/* On entry with UPLO = 'U' or 'u', K specifies the number of */
/* super-diagonals of the matrix A. */
/* On entry with UPLO = 'L' or 'l', K specifies the number of */
/* sub-diagonals of the matrix A. */
/* K must satisfy 0 .le. K. */
/* Unchanged on exit. */

/* A - COMPLEX*16 array of DIMENSION ( LDA, n ). */
/* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */
/* by n part of the array A must contain the upper triangular */
/* band part of the matrix of coefficients, supplied column by */
/* column, with the leading diagonal of the matrix in row */
/* ( k + 1 ) of the array, the first super-diagonal starting at */
/* position 2 in row k, and so on. The top left k by k triangle */
/* of the array A is not referenced. */
/* The following program segment will transfer an upper */
/* triangular band matrix from conventional full matrix storage */
/* to band storage: */

/* DO 20, J = 1, N */
/* M = K + 1 - J */
/* DO 10, I = MAX( 1, J - K ), J */
/* A( M + I, J ) = matrix( I, J ) */
/* 10 CONTINUE */
/* 20 CONTINUE */

/* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */
/* by n part of the array A must contain the lower triangular */
/* band part of the matrix of coefficients, supplied column by */
/* column, with the leading diagonal of the matrix in row 1 of */
/* the array, the first sub-diagonal starting at position 1 in */
/* row 2, and so on. The bottom right k by k triangle of the */
/* array A is not referenced. */
/* The following program segment will transfer a lower */
/* triangular band matrix from conventional full matrix storage */
/* to band storage: */

/* DO 20, J = 1, N */
/* M = 1 - J */
/* DO 10, I = J, MIN( N, J + K ) */
/* A( M + I, J ) = matrix( I, J ) */
/* 10 CONTINUE */
/* 20 CONTINUE */

/* Note that when DIAG = 'U' or 'u' the elements of the array A */
/* corresponding to the diagonal elements of the matrix are not */
/* referenced, but are assumed to be unity. */
/* Unchanged on exit. */

/* LDA - INTEGER. */
/* On entry, LDA specifies the first dimension of A as declared */
/* in the calling (sub) program. LDA must be at least */
/* ( k + 1 ). */
/* Unchanged on exit. */

/* X - COMPLEX*16 array of dimension at least */
/* ( 1 + ( n - 1 )*abs( INCX ) ). */
/* Before entry, the incremented array X must contain the n */
/* element vector x. On exit, X is overwritten with the */
/* transformed vector x. */

/* INCX - INTEGER. */
/* On entry, INCX specifies the increment for the elements of */
/* X. INCX must not be zero. */
/* Unchanged on exit. */

/* Further Details */
/* =============== */

/* Level 2 Blas routine. */

/* -- Written on 22-October-1986. */
/* Jack Dongarra, Argonne National Lab. */
/* Jeremy Du Croz, Nag Central Office. */
/* Sven Hammarling, Nag Central Office. */
/* Richard Hanson, Sandia National Labs. */

/* ===================================================================== */

/* .. Parameters .. */
/* .. */
/* .. Local Scalars .. */
/* .. */
/* .. External Functions .. */
/* .. */
/* .. External Subroutines .. */
/* .. */
/* .. Intrinsic Functions .. */
/* .. */

/* Test the input parameters. */

/* Parameter adjustments */
a_dim1 = *lda;
a_offset = 1 + a_dim1;
a -= a_offset;
--x;

/* Function Body */
info = 0;
if (!lsame_(uplo, "U") && !lsame_(uplo, "L")) {
info = 1;
} else if (!lsame_(trans, "N") && !lsame_(trans, "T") && !lsame_(trans, "C")) {
info = 2;
} else if (!lsame_(diag, "U") && !lsame_(diag, "N")) {
info = 3;
} else if (*n < 0) {
info = 4;
} else if (*k < 0) {
info = 5;
} else if (*lda < *k + 1) {
info = 7;
} else if (*incx == 0) {
info = 9;
}
if (info != 0) {
xerbla_("ZTBMV ", &info);
return;
}

/* Quick return if possible. */

if (*n == 0) {
return;
}

noconj = lsame_(trans, "T");
nounit = lsame_(diag, "N");

/* Set up the start point in X if the increment is not unity. This */
/* will be ( N - 1 )*INCX too small for descending loops. */

if (*incx <= 0) {
kx = 1 - (*n - 1) * *incx;
} else if (*incx != 1) {
kx = 1;
}

/* Start the operations. In this version the elements of A are */
/* accessed sequentially with one pass through A. */

if (lsame_(trans, "N")) {
/* Form x := A*x. */

if (lsame_(uplo, "U")) {
kplus1 = *k + 1;
if (*incx == 1) {
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
i__2 = j;
if (x[i__2].r != 0. || x[i__2].i != 0.) {
i__2 = j;
temp.r = x[i__2].r, temp.i = x[i__2].i;
l = kplus1 - j;
/* Computing MAX */
i__2 = 1, i__3 = j - *k;
i__4 = j - 1;
for (i__ = max(i__2, i__3); i__ <= i__4; ++i__) {
i__2 = i__;
i__3 = i__;
i__5 = l + i__ + j * a_dim1;
z__2.r = temp.r * a[i__5].r - temp.i * a[i__5].i, z__2.i = temp.r * a[i__5].i + temp.i * a[i__5].r;
z__1.r = x[i__3].r + z__2.r, z__1.i = x[i__3].i + z__2.i;
x[i__2].r = z__1.r, x[i__2].i = z__1.i;
/* L10: */
}
if (nounit) {
i__4 = j;
i__2 = j;
i__3 = kplus1 + j * a_dim1;
z__1.r = x[i__2].r * a[i__3].r - x[i__2].i * a[i__3].i,
z__1.i = x[i__2].r * a[i__3].i + x[i__2].i * a[i__3].r;
x[i__4].r = z__1.r, x[i__4].i = z__1.i;
}
}
/* L20: */
}
} else {
jx = kx;
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
i__4 = jx;
if (x[i__4].r != 0. || x[i__4].i != 0.) {
i__4 = jx;
temp.r = x[i__4].r, temp.i = x[i__4].i;
ix = kx;
l = kplus1 - j;
/* Computing MAX */
i__4 = 1, i__2 = j - *k;
i__3 = j - 1;
for (i__ = max(i__4, i__2); i__ <= i__3; ++i__) {
i__4 = ix;
i__2 = ix;
i__5 = l + i__ + j * a_dim1;
z__2.r = temp.r * a[i__5].r - temp.i * a[i__5].i, z__2.i = temp.r * a[i__5].i + temp.i * a[i__5].r;
z__1.r = x[i__2].r + z__2.r, z__1.i = x[i__2].i + z__2.i;
x[i__4].r = z__1.r, x[i__4].i = z__1.i;
ix += *incx;
/* L30: */
}
if (nounit) {
i__3 = jx;
i__4 = jx;
i__2 = kplus1 + j * a_dim1;
z__1.r = x[i__4].r * a[i__2].r - x[i__4].i * a[i__2].i,
z__1.i = x[i__4].r * a[i__2].i + x[i__4].i * a[i__2].r;
x[i__3].r = z__1.r, x[i__3].i = z__1.i;
}
}
jx += *incx;
if (j > *k) {
kx += *incx;
}
/* L40: */
}
}
} else {
if (*incx == 1) {
for (j = *n; j >= 1; --j) {
i__1 = j;
if (x[i__1].r != 0. || x[i__1].i != 0.) {
i__1 = j;
temp.r = x[i__1].r, temp.i = x[i__1].i;
l = 1 - j;
/* Computing MIN */
i__1 = *n, i__3 = j + *k;
i__4 = j + 1;
for (i__ = min(i__1, i__3); i__ >= i__4; --i__) {
i__1 = i__;
i__3 = i__;
i__2 = l + i__ + j * a_dim1;
z__2.r = temp.r * a[i__2].r - temp.i * a[i__2].i, z__2.i = temp.r * a[i__2].i + temp.i * a[i__2].r;
z__1.r = x[i__3].r + z__2.r, z__1.i = x[i__3].i + z__2.i;
x[i__1].r = z__1.r, x[i__1].i = z__1.i;
/* L50: */
}
if (nounit) {
i__4 = j;
i__1 = j;
i__3 = j * a_dim1 + 1;
z__1.r = x[i__1].r * a[i__3].r - x[i__1].i * a[i__3].i,
z__1.i = x[i__1].r * a[i__3].i + x[i__1].i * a[i__3].r;
x[i__4].r = z__1.r, x[i__4].i = z__1.i;
}
}
/* L60: */
}
} else {
kx += (*n - 1) * *incx;
jx = kx;
for (j = *n; j >= 1; --j) {
i__4 = jx;
if (x[i__4].r != 0. || x[i__4].i != 0.) {
i__4 = jx;
temp.r = x[i__4].r, temp.i = x[i__4].i;
ix = kx;
l = 1 - j;
/* Computing MIN */
i__4 = *n, i__1 = j + *k;
i__3 = j + 1;
for (i__ = min(i__4, i__1); i__ >= i__3; --i__) {
i__4 = ix;
i__1 = ix;
i__2 = l + i__ + j * a_dim1;
z__2.r = temp.r * a[i__2].r - temp.i * a[i__2].i, z__2.i = temp.r * a[i__2].i + temp.i * a[i__2].r;
z__1.r = x[i__1].r + z__2.r, z__1.i = x[i__1].i + z__2.i;
x[i__4].r = z__1.r, x[i__4].i = z__1.i;
ix -= *incx;
/* L70: */
}
if (nounit) {
i__3 = jx;
i__4 = jx;
i__1 = j * a_dim1 + 1;
z__1.r = x[i__4].r * a[i__1].r - x[i__4].i * a[i__1].i,
z__1.i = x[i__4].r * a[i__1].i + x[i__4].i * a[i__1].r;
x[i__3].r = z__1.r, x[i__3].i = z__1.i;
}
}
jx -= *incx;
if (*n - j >= *k) {
kx -= *incx;
}
/* L80: */
}
}
}
} else {
/* Form x := A'*x or x := conjg( A' )*x. */

if (lsame_(uplo, "U")) {
kplus1 = *k + 1;
if (*incx == 1) {
for (j = *n; j >= 1; --j) {
i__3 = j;
temp.r = x[i__3].r, temp.i = x[i__3].i;
l = kplus1 - j;
if (noconj) {
if (nounit) {
i__3 = kplus1 + j * a_dim1;
z__1.r = temp.r * a[i__3].r - temp.i * a[i__3].i, z__1.i = temp.r * a[i__3].i + temp.i * a[i__3].r;
temp.r = z__1.r, temp.i = z__1.i;
}
/* Computing MAX */
i__4 = 1, i__1 = j - *k;
i__3 = max(i__4, i__1);
for (i__ = j - 1; i__ >= i__3; --i__) {
i__4 = l + i__ + j * a_dim1;
i__1 = i__;
z__2.r = a[i__4].r * x[i__1].r - a[i__4].i * x[i__1].i,
z__2.i = a[i__4].r * x[i__1].i + a[i__4].i * x[i__1].r;
z__1.r = temp.r + z__2.r, z__1.i = temp.i + z__2.i;
temp.r = z__1.r, temp.i = z__1.i;
/* L90: */
}
} else {
if (nounit) {
d_cnjg(&z__2, &a[kplus1 + j * a_dim1]);
z__1.r = temp.r * z__2.r - temp.i * z__2.i, z__1.i = temp.r * z__2.i + temp.i * z__2.r;
temp.r = z__1.r, temp.i = z__1.i;
}
/* Computing MAX */
i__4 = 1, i__1 = j - *k;
i__3 = max(i__4, i__1);
for (i__ = j - 1; i__ >= i__3; --i__) {
d_cnjg(&z__3, &a[l + i__ + j * a_dim1]);
i__4 = i__;
z__2.r = z__3.r * x[i__4].r - z__3.i * x[i__4].i, z__2.i = z__3.r * x[i__4].i + z__3.i * x[i__4].r;
z__1.r = temp.r + z__2.r, z__1.i = temp.i + z__2.i;
temp.r = z__1.r, temp.i = z__1.i;
/* L100: */
}
}
i__3 = j;
x[i__3].r = temp.r, x[i__3].i = temp.i;
/* L110: */
}
} else {
kx += (*n - 1) * *incx;
jx = kx;
for (j = *n; j >= 1; --j) {
i__3 = jx;
temp.r = x[i__3].r, temp.i = x[i__3].i;
kx -= *incx;
ix = kx;
l = kplus1 - j;
if (noconj) {
if (nounit) {
i__3 = kplus1 + j * a_dim1;
z__1.r = temp.r * a[i__3].r - temp.i * a[i__3].i, z__1.i = temp.r * a[i__3].i + temp.i * a[i__3].r;
temp.r = z__1.r, temp.i = z__1.i;
}
/* Computing MAX */
i__4 = 1, i__1 = j - *k;
i__3 = max(i__4, i__1);
for (i__ = j - 1; i__ >= i__3; --i__) {
i__4 = l + i__ + j * a_dim1;
i__1 = ix;
z__2.r = a[i__4].r * x[i__1].r - a[i__4].i * x[i__1].i,
z__2.i = a[i__4].r * x[i__1].i + a[i__4].i * x[i__1].r;
z__1.r = temp.r + z__2.r, z__1.i = temp.i + z__2.i;
temp.r = z__1.r, temp.i = z__1.i;
ix -= *incx;
/* L120: */
}
} else {
if (nounit) {
d_cnjg(&z__2, &a[kplus1 + j * a_dim1]);
z__1.r = temp.r * z__2.r - temp.i * z__2.i, z__1.i = temp.r * z__2.i + temp.i * z__2.r;
temp.r = z__1.r, temp.i = z__1.i;
}
/* Computing MAX */
i__4 = 1, i__1 = j - *k;
i__3 = max(i__4, i__1);
for (i__ = j - 1; i__ >= i__3; --i__) {
d_cnjg(&z__3, &a[l + i__ + j * a_dim1]);
i__4 = ix;
z__2.r = z__3.r * x[i__4].r - z__3.i * x[i__4].i, z__2.i = z__3.r * x[i__4].i + z__3.i * x[i__4].r;
z__1.r = temp.r + z__2.r, z__1.i = temp.i + z__2.i;
temp.r = z__1.r, temp.i = z__1.i;
ix -= *incx;
/* L130: */
}
}
i__3 = jx;
x[i__3].r = temp.r, x[i__3].i = temp.i;
jx -= *incx;
/* L140: */
}
}
} else {
if (*incx == 1) {
i__3 = *n;
for (j = 1; j <= i__3; ++j) {
i__4 = j;
temp.r = x[i__4].r, temp.i = x[i__4].i;
l = 1 - j;
if (noconj) {
if (nounit) {
i__4 = j * a_dim1 + 1;
z__1.r = temp.r * a[i__4].r - temp.i * a[i__4].i, z__1.i = temp.r * a[i__4].i + temp.i * a[i__4].r;
temp.r = z__1.r, temp.i = z__1.i;
}
/* Computing MIN */
i__1 = *n, i__2 = j + *k;
i__4 = min(i__1, i__2);
for (i__ = j + 1; i__ <= i__4; ++i__) {
i__1 = l + i__ + j * a_dim1;
i__2 = i__;
z__2.r = a[i__1].r * x[i__2].r - a[i__1].i * x[i__2].i,
z__2.i = a[i__1].r * x[i__2].i + a[i__1].i * x[i__2].r;
z__1.r = temp.r + z__2.r, z__1.i = temp.i + z__2.i;
temp.r = z__1.r, temp.i = z__1.i;
/* L150: */
}
} else {
if (nounit) {
d_cnjg(&z__2, &a[j * a_dim1 + 1]);
z__1.r = temp.r * z__2.r - temp.i * z__2.i, z__1.i = temp.r * z__2.i + temp.i * z__2.r;
temp.r = z__1.r, temp.i = z__1.i;
}
/* Computing MIN */
i__1 = *n, i__2 = j + *k;
i__4 = min(i__1, i__2);
for (i__ = j + 1; i__ <= i__4; ++i__) {
d_cnjg(&z__3, &a[l + i__ + j * a_dim1]);
i__1 = i__;
z__2.r = z__3.r * x[i__1].r - z__3.i * x[i__1].i, z__2.i = z__3.r * x[i__1].i + z__3.i * x[i__1].r;
z__1.r = temp.r + z__2.r, z__1.i = temp.i + z__2.i;
temp.r = z__1.r, temp.i = z__1.i;
/* L160: */
}
}
i__4 = j;
x[i__4].r = temp.r, x[i__4].i = temp.i;
/* L170: */
}
} else {
jx = kx;
i__3 = *n;
for (j = 1; j <= i__3; ++j) {
i__4 = jx;
temp.r = x[i__4].r, temp.i = x[i__4].i;
kx += *incx;
ix = kx;
l = 1 - j;
if (noconj) {
if (nounit) {
i__4 = j * a_dim1 + 1;
z__1.r = temp.r * a[i__4].r - temp.i * a[i__4].i, z__1.i = temp.r * a[i__4].i + temp.i * a[i__4].r;
temp.r = z__1.r, temp.i = z__1.i;
}
/* Computing MIN */
i__1 = *n, i__2 = j + *k;
i__4 = min(i__1, i__2);
for (i__ = j + 1; i__ <= i__4; ++i__) {
i__1 = l + i__ + j * a_dim1;
i__2 = ix;
z__2.r = a[i__1].r * x[i__2].r - a[i__1].i * x[i__2].i,
z__2.i = a[i__1].r * x[i__2].i + a[i__1].i * x[i__2].r;
z__1.r = temp.r + z__2.r, z__1.i = temp.i + z__2.i;
temp.r = z__1.r, temp.i = z__1.i;
ix += *incx;
/* L180: */
}
} else {
if (nounit) {
d_cnjg(&z__2, &a[j * a_dim1 + 1]);
z__1.r = temp.r * z__2.r - temp.i * z__2.i, z__1.i = temp.r * z__2.i + temp.i * z__2.r;
temp.r = z__1.r, temp.i = z__1.i;
}
/* Computing MIN */
i__1 = *n, i__2 = j + *k;
i__4 = min(i__1, i__2);
for (i__ = j + 1; i__ <= i__4; ++i__) {
d_cnjg(&z__3, &a[l + i__ + j * a_dim1]);
i__1 = ix;
z__2.r = z__3.r * x[i__1].r - z__3.i * x[i__1].i, z__2.i = z__3.r * x[i__1].i + z__3.i * x[i__1].r;
z__1.r = temp.r + z__2.r, z__1.i = temp.i + z__2.i;
temp.r = z__1.r, temp.i = z__1.i;
ix += *incx;
/* L190: */
}
}
i__4 = jx;
x[i__4].r = temp.r, x[i__4].i = temp.i;
jx += *incx;
/* L200: */
}
}
}
}

/* End of ZTBMV . */

} /* ztbmv_ */
@@ -25,15 +25,19 @@ struct functor_traits<scalar_norm1_op> {
// computes the sum of magnitudes of all vector elements or, for a complex vector x, the sum
// res = |Rex1| + |Imx1| + |Rex2| + |Imx2| + ... + |Rexn| + |Imxn|, where x is a vector of order n
extern "C" RealScalar EIGEN_CAT(REAL_SCALAR_SUFFIX, EIGEN_BLAS_FUNC_NAME(asum))(int *n, RealScalar *px, int *incx) {
// std::cerr << "__asum " << *n << " " << *incx << "\n";
Complex *x = reinterpret_cast<Complex *>(px);

if (*n <= 0) return 0;

// std::complex<T> is layout-compatible with T[2], so we can reinterpret
// a complex vector of length n as a real vector of length 2*n and use
// the fully vectorized cwiseAbs().sum() path.
if (*incx == 1)
return make_vector(x, *n).unaryExpr<scalar_norm1_op>().sum();
else
return make_vector(px, 2 * *n).cwiseAbs().sum();
else {
// For non-unit stride, fall back to the scalar_norm1_op approach since
// the real components are not contiguous across complex elements.
Complex *x = reinterpret_cast<Complex *>(px);
return make_vector(x, *n, std::abs(*incx)).unaryExpr<scalar_norm1_op>().sum();
}
}

extern "C" int EIGEN_CAT(i, EIGEN_BLAS_FUNC_NAME(amax))(int *n, RealScalar *px, int *incx) {

@@ -69,15 +69,21 @@ EIGEN_BLAS_FUNC(copy)(int *n, RealScalar *px, int *incx, RealScalar *py, int *in
// be careful, *incx==0 is allowed !!
if (*incx == 1 && *incy == 1)
make_vector(y, *n) = make_vector(x, *n);
else {
if (*incx < 0) x = x - (*n - 1) * (*incx);
else if (*incx == 0) {
// Broadcast: copy x[0] to all elements of y.
if (*incy < 0) y = y - (*n - 1) * (*incy);
for (int i = 0; i < *n; ++i) {
*y = *x;
x += *incx;
y += *incy;
}
}
} else if (*incx > 0 && *incy > 0)
make_vector(y, *n, *incy) = make_vector(x, *n, *incx);
else if (*incx > 0 && *incy < 0)
make_vector(y, *n, -*incy).reverse() = make_vector(x, *n, *incx);
else if (*incx < 0 && *incy > 0)
make_vector(y, *n, *incy) = make_vector(x, *n, -*incx).reverse();
else if (*incx < 0 && *incy < 0)
make_vector(y, *n, -*incy) = make_vector(x, *n, -*incx);
}
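The *incx == 0 case accepted above turns the copy into a broadcast of x[0]. A hedged usage sketch (the Fortran-style dcopy_ symbol name is an assumption about how this build exports the function):

extern "C" void dcopy_(int* n, double* x, int* incx, double* y, int* incy);

void broadcast_demo() {
  double x0 = 3.14;
  double y[4] = {0.0, 0.0, 0.0, 0.0};
  int n = 4, incx = 0, incy = 1;
  dcopy_(&n, &x0, &incx, y, &incy);  // y becomes {3.14, 3.14, 3.14, 3.14}
}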

EIGEN_BLAS_FUNC(rotg)(RealScalar *pa, RealScalar *pb, RealScalar *pc, RealScalar *ps) {

@@ -58,23 +58,21 @@ extern "C" Scalar EIGEN_BLAS_FUNC_NAME(dot)(int *n, Scalar *px, int *incx, Scala
Scalar *y = reinterpret_cast<Scalar *>(py);

if (*incx == 1 && *incy == 1)
return (make_vector(x, *n).cwiseProduct(make_vector(y, *n))).sum();
return make_vector(x, *n).dot(make_vector(y, *n));
else if (*incx > 0 && *incy > 0)
return (make_vector(x, *n, *incx).cwiseProduct(make_vector(y, *n, *incy))).sum();
return make_vector(x, *n, *incx).dot(make_vector(y, *n, *incy));
else if (*incx < 0 && *incy > 0)
return (make_vector(x, *n, -*incx).reverse().cwiseProduct(make_vector(y, *n, *incy))).sum();
return make_vector(x, *n, -*incx).reverse().dot(make_vector(y, *n, *incy));
else if (*incx > 0 && *incy < 0)
return (make_vector(x, *n, *incx).cwiseProduct(make_vector(y, *n, -*incy).reverse())).sum();
return make_vector(x, *n, *incx).dot(make_vector(y, *n, -*incy).reverse());
else if (*incx < 0 && *incy < 0)
return (make_vector(x, *n, -*incx).reverse().cwiseProduct(make_vector(y, *n, -*incy).reverse())).sum();
return make_vector(x, *n, -*incx).reverse().dot(make_vector(y, *n, -*incy).reverse());
else
return 0;
}

// computes the Euclidean norm of a vector.
// FIXME
extern "C" Scalar EIGEN_BLAS_FUNC_NAME(nrm2)(int *n, Scalar *px, int *incx) {
// std::cerr << "_nrm2 " << *n << " " << *incx << "\n";
if (*n <= 0) return 0;

Scalar *x = reinterpret_cast<Scalar *>(px);
@@ -108,23 +106,171 @@ EIGEN_BLAS_FUNC(rot)(int *n, Scalar *px, int *incx, Scalar *py, int *incy, Scala
Eigen::internal::apply_rotation_in_the_plane(vx, vy, Eigen::JacobiRotation<Scalar>(c, s));
}

/*
// performs rotation of points in the modified plane.
EIGEN_BLAS_FUNC(rotm)(int *n, Scalar *px, int *incx, Scalar *py, int *incy, Scalar *param)
{
Scalar* x = reinterpret_cast<Scalar*>(px);
Scalar* y = reinterpret_cast<Scalar*>(py);
// Applies modified Givens rotation H to vectors x and y.
// param[0] = flag:
// -1: H = [[h11, h12], [h21, h22]] (all 4 elements from param)
// 0: H = [[1, h12], [h21, 1]] (h12, h21 from param)
// 1: H = [[h11, 1], [-1, h22]] (h11, h22 from param)
// -2: H = identity (no-op)
// param[1..4] = h11, h21, h12, h22
EIGEN_BLAS_FUNC(rotm)(int *n, Scalar *px, int *incx, Scalar *py, int *incy, Scalar *param) {
Scalar *x = reinterpret_cast<Scalar *>(px);
Scalar *y = reinterpret_cast<Scalar *>(py);

// TODO
Scalar flag = param[0];
if (*n <= 0 || flag == Scalar(-2)) return;

return 0;
Scalar h11, h12, h21, h22;
if (flag < Scalar(0)) {
h11 = param[1];
h21 = param[2];
h12 = param[3];
h22 = param[4];
} else if (flag == Scalar(0)) {
h11 = Scalar(1);
h21 = param[2];
h12 = param[3];
h22 = Scalar(1);
} else {
h11 = param[1];
h21 = Scalar(-1);
h12 = Scalar(1);
h22 = param[4];
}

int kx = *incx > 0 ? 0 : (1 - *n) * *incx;
int ky = *incy > 0 ? 0 : (1 - *n) * *incy;

for (int i = 0; i < *n; ++i) {
Scalar w = x[kx];
Scalar z = y[ky];
x[kx] = h11 * w + h12 * z;
y[ky] = h21 * w + h22 * z;
kx += *incx;
ky += *incy;
}
}

// computes the modified parameters for a Givens rotation.
EIGEN_BLAS_FUNC(rotmg)(Scalar *d1, Scalar *d2, Scalar *x1, Scalar *x2, Scalar *param)
{
// TODO
// Constructs the modified Givens transformation matrix H which zeros the second
// component of (sqrt(d1)*x1, sqrt(d2)*y1)^T.
EIGEN_BLAS_FUNC(rotmg)(Scalar *d1, Scalar *d2, Scalar *x1, Scalar *y1, Scalar *param) {
using std::abs;

return 0;
const Scalar gam = Scalar(4096);
const Scalar gamsq = gam * gam;
const Scalar rgamsq = Scalar(1) / gamsq;

Scalar flag, h11 = Scalar(0), h12 = Scalar(0), h21 = Scalar(0), h22 = Scalar(0);

if (*d1 < Scalar(0)) {
// Negative d1: zero everything.
flag = Scalar(-1);
*d1 = *d2 = *x1 = Scalar(0);
} else {
Scalar p2 = *d2 * *y1;
if (p2 == Scalar(0)) {
// d2*y1 == 0: identity transform.
param[0] = Scalar(-2);
return;
}

Scalar p1 = *d1 * *x1;
Scalar q2 = p2 * *y1;
Scalar q1 = p1 * *x1;
bool do_scale = true;

if (abs(q1) > abs(q2)) {
h21 = -(*y1) / *x1;
h12 = p2 / p1;
Scalar u = Scalar(1) - h12 * h21;
if (u <= Scalar(0)) {
flag = Scalar(-1);
h11 = h12 = h21 = h22 = Scalar(0);
*d1 = *d2 = *x1 = Scalar(0);
do_scale = false;
} else {
flag = Scalar(0);
*d1 /= u;
*d2 /= u;
*x1 *= u;
}
} else if (q2 < Scalar(0)) {
flag = Scalar(-1);
h11 = h12 = h21 = h22 = Scalar(0);
*d1 = *d2 = *x1 = Scalar(0);
do_scale = false;
} else {
flag = Scalar(1);
h11 = p1 / p2;
h22 = *x1 / *y1;
Scalar u = Scalar(1) + h11 * h22;
Scalar temp = *d2 / u;
*d2 = *d1 / u;
*d1 = temp;
*x1 = *y1 * u;
}

if (do_scale) {
// Converts compact H representation (flag 0 or 1) to full form (flag -1)
// so that scaling factors can be absorbed into all four elements.
auto fix_h = [&]() {
if (flag >= Scalar(0)) {
if (flag == Scalar(0)) {
h11 = Scalar(1);
h22 = Scalar(1);
} else {
h21 = Scalar(-1);
h12 = Scalar(1);
}
flag = Scalar(-1);
}
};

// Scale d1 up if too small.
while (*d1 <= rgamsq && *d1 != Scalar(0)) {
fix_h();
*d1 *= gamsq;
*x1 /= gam;
h11 /= gam;
h12 /= gam;
}
// Scale d1 down if too large.
while (*d1 >= gamsq) {
fix_h();
*d1 /= gamsq;
*x1 *= gam;
h11 *= gam;
h12 *= gam;
}
// Scale |d2| up if too small.
while (abs(*d2) <= rgamsq && *d2 != Scalar(0)) {
fix_h();
*d2 *= gamsq;
h21 /= gam;
h22 /= gam;
}
// Scale |d2| down if too large.
while (abs(*d2) >= gamsq) {
fix_h();
*d2 /= gamsq;
h21 *= gam;
h22 *= gam;
}
}
}

// Store result in param array.
if (flag < Scalar(0)) {
param[1] = h11;
param[2] = h21;
param[3] = h12;
param[4] = h22;
} else if (flag == Scalar(0)) {
param[2] = h21;
param[3] = h12;
} else {
param[1] = h11;
param[4] = h22;
}
param[0] = flag;
}
*/

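A hedged sketch of how the rotmg/rotm pair implemented above is meant to be used together; the drotmg_/drotm_ names follow the usual double-precision Fortran BLAS convention and are an assumption about this build:

extern "C" void drotmg_(double* d1, double* d2, double* x1, double* y1, double* param);
extern "C" void drotm_(int* n, double* x, int* incx, double* y, int* incy, double* param);

void modified_givens_demo() {
  // Build H that zeros the second component of (sqrt(d1)*x1, sqrt(d2)*y1)^T.
  double d1 = 1.0, d2 = 1.0, x1 = 3.0, y1 = 4.0, param[5];
  drotmg_(&d1, &d2, &x1, &y1, param);

  // Apply H to the length-1 vectors (3) and (4); y[0] is driven to zero.
  int n = 1, inc = 1;
  double x[1] = {3.0}, y[1] = {4.0};
  drotm_(&n, x, &inc, y, &inc, param);
}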
@@ -72,31 +72,193 @@ EIGEN_BLAS_FUNC(hemv)
if (actual_y != y) delete[] copy_back(actual_y, y, *n, *incy);
}

/** ZHBMV performs the matrix-vector operation
/** HBMV performs the matrix-vector operation
*
* y := alpha*A*x + beta*y,
*
* where alpha and beta are scalars, x and y are n element vectors and
* A is an n by n hermitian band matrix, with k super-diagonals.
* Diagonal elements are real; off-diagonal contributions use conjugation.
*/
// EIGEN_BLAS_FUNC(hbmv)(char *uplo, int *n, int *k, RealScalar *alpha, RealScalar *a, int *lda,
// RealScalar *x, int *incx, RealScalar *beta, RealScalar *y, int *incy)
// {
// return 1;
// }
EIGEN_BLAS_FUNC(hbmv)
(char *uplo, int *n, int *k, RealScalar *palpha, RealScalar *pa, int *lda, RealScalar *px, int *incx, RealScalar *pbeta,
RealScalar *py, int *incy) {
const Scalar alpha = *reinterpret_cast<const Scalar *>(palpha);
const Scalar beta = *reinterpret_cast<const Scalar *>(pbeta);
const Scalar *a = reinterpret_cast<const Scalar *>(pa);
const Scalar *x = reinterpret_cast<const Scalar *>(px);
Scalar *y = reinterpret_cast<Scalar *>(py);

/** ZHPMV performs the matrix-vector operation
int info = 0;
if (UPLO(*uplo) == INVALID)
info = 1;
else if (*n < 0)
info = 2;
else if (*k < 0)
info = 3;
else if (*lda < *k + 1)
info = 6;
else if (*incx == 0)
info = 8;
else if (*incy == 0)
info = 11;
if (info) return xerbla_(SCALAR_SUFFIX_UP "HBMV ", &info);

if (*n == 0 || (alpha == Scalar(0) && beta == Scalar(1))) return;

const Scalar *actual_x = get_compact_vector(x, *n, *incx);
Scalar *actual_y = get_compact_vector(y, *n, *incy);

// First form y := beta*y.
if (beta != Scalar(1)) {
if (beta == Scalar(0))
make_vector(actual_y, *n).setZero();
else
make_vector(actual_y, *n) *= beta;
}

if (alpha == Scalar(0)) {
if (actual_x != x) delete[] actual_x;
if (actual_y != y) delete[] copy_back(actual_y, y, *n, *incy);
return;
}

if (*k >= 8) {
// Vectorized path: use Eigen Map segments for the inner band operations.
ConstMatrixType band(a, *k + 1, *n, *lda);
if (UPLO(*uplo) == UP) {
for (int j = 0; j < *n; ++j) {
int start = std::max(0, j - *k);
int len = j - start;
int offset = *k - (j - start);
Scalar temp1 = alpha * actual_x[j];
actual_y[j] += Scalar(Eigen::numext::real(band(*k, j))) * temp1;
if (len > 0) {
make_vector(actual_y + start, len) += temp1 * band.col(j).segment(offset, len);
actual_y[j] += alpha * band.col(j).segment(offset, len).dot(make_vector(actual_x + start, len));
}
}
} else {
for (int j = 0; j < *n; ++j) {
int len = std::min(*n - 1, j + *k) - j;
Scalar temp1 = alpha * actual_x[j];
actual_y[j] += Scalar(Eigen::numext::real(band(0, j))) * temp1;
if (len > 0) {
make_vector(actual_y + j + 1, len) += temp1 * band.col(j).segment(1, len);
actual_y[j] += alpha * band.col(j).segment(1, len).dot(make_vector(actual_x + j + 1, len));
}
}
}
} else {
// Scalar path: for narrow bandwidth, avoid Map overhead.
if (UPLO(*uplo) == UP) {
for (int j = 0; j < *n; ++j) {
Scalar temp1 = alpha * actual_x[j];
Scalar temp2 = Scalar(0);
for (int i = std::max(0, j - *k); i < j; ++i) {
Scalar aij = a[(*k + i - j) + j * *lda];
actual_y[i] += temp1 * aij;
temp2 += Eigen::numext::conj(aij) * actual_x[i];
}
actual_y[j] += Scalar(Eigen::numext::real(a[*k + j * *lda])) * temp1 + alpha * temp2;
}
} else {
for (int j = 0; j < *n; ++j) {
Scalar temp1 = alpha * actual_x[j];
Scalar temp2 = Scalar(0);
actual_y[j] += Scalar(Eigen::numext::real(a[j * *lda])) * temp1;
for (int i = j + 1; i <= std::min(*n - 1, j + *k); ++i) {
Scalar aij = a[(i - j) + j * *lda];
actual_y[i] += temp1 * aij;
temp2 += Eigen::numext::conj(aij) * actual_x[i];
}
actual_y[j] += alpha * temp2;
}
}
}

if (actual_x != x) delete[] actual_x;
if (actual_y != y) delete[] copy_back(actual_y, y, *n, *incy);
}
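The scalar path above hard-codes the standard BLAS band layout. A small self-contained sketch of that indexing (helper names are illustrative, not part of the patch):

// Upper band storage: A(i,j) lives at a[(k + i - j) + j*lda] for
// max(0, j - k) <= i <= j; the diagonal occupies row k of the band array.
double upper_band_entry(const double* a, int lda, int k, int i, int j) {
  return a[(k + i - j) + j * lda];
}

// Lower band storage: A(i,j) lives at a[(i - j) + j*lda] for
// j <= i <= min(n - 1, j + k); the diagonal occupies row 0.
double lower_band_entry(const double* a, int lda, int i, int j) {
  return a[(i - j) + j * lda];
}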

/** HPMV performs the matrix-vector operation
*
* y := alpha*A*x + beta*y,
*
* where alpha and beta are scalars, x and y are n element vectors and
* A is an n by n hermitian matrix, supplied in packed form.
* Diagonal elements are real; off-diagonal contributions use conjugation.
*/
// EIGEN_BLAS_FUNC(hpmv)(char *uplo, int *n, RealScalar *alpha, RealScalar *ap, RealScalar *x, int *incx, RealScalar
// *beta, RealScalar *y, int *incy)
// {
// return 1;
// }
EIGEN_BLAS_FUNC(hpmv)
(char *uplo, int *n, RealScalar *palpha, RealScalar *pap, RealScalar *px, int *incx, RealScalar *pbeta, RealScalar *py,
int *incy) {
const Scalar alpha = *reinterpret_cast<const Scalar *>(palpha);
const Scalar beta = *reinterpret_cast<const Scalar *>(pbeta);
const Scalar *ap = reinterpret_cast<const Scalar *>(pap);
const Scalar *x = reinterpret_cast<const Scalar *>(px);
Scalar *y = reinterpret_cast<Scalar *>(py);

int info = 0;
if (UPLO(*uplo) == INVALID)
info = 1;
else if (*n < 0)
info = 2;
else if (*incx == 0)
info = 6;
else if (*incy == 0)
info = 9;
if (info) return xerbla_(SCALAR_SUFFIX_UP "HPMV ", &info);

if (*n == 0 || (alpha == Scalar(0) && beta == Scalar(1))) return;

const Scalar *actual_x = get_compact_vector(x, *n, *incx);
Scalar *actual_y = get_compact_vector(y, *n, *incy);

// First form y := beta*y.
if (beta != Scalar(1)) {
if (beta == Scalar(0))
make_vector(actual_y, *n).setZero();
else
make_vector(actual_y, *n) *= beta;
}

if (alpha == Scalar(0)) {
if (actual_x != x) delete[] actual_x;
if (actual_y != y) delete[] copy_back(actual_y, y, *n, *incy);
return;
}

int kk = 0;
if (UPLO(*uplo) == UP) {
// Upper triangle packed: column j occupies ap[kk..kk+j].
for (int j = 0; j < *n; ++j) {
Scalar temp1 = alpha * actual_x[j];
// Diagonal is real.
actual_y[j] += Scalar(Eigen::numext::real(ap[kk + j])) * temp1;
if (j > 0) {
make_vector(actual_y, j) += temp1 * make_vector(ap + kk, j);
actual_y[j] += alpha * make_vector(ap + kk, j).dot(make_vector(actual_x, j));
}
kk += j + 1;
}
} else {
// Lower triangle packed: column j occupies ap[kk..kk+(n-j-1)].
for (int j = 0; j < *n; ++j) {
int len = *n - j - 1;
Scalar temp1 = alpha * actual_x[j];
// Diagonal is real.
actual_y[j] += Scalar(Eigen::numext::real(ap[kk])) * temp1;
if (len > 0) {
make_vector(actual_y + j + 1, len) += temp1 * make_vector(ap + kk + 1, len);
actual_y[j] += alpha * make_vector(ap + kk + 1, len).dot(make_vector(actual_x + j + 1, len));
}
kk += *n - j;
}
}

if (actual_x != x) delete[] actual_x;
if (actual_y != y) delete[] copy_back(actual_y, y, *n, *incy);
}
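The packed layout walked by kk above can likewise be written as a closed-form index (zero-based, column-major packing as in the comments; helper names are illustrative):

// Upper packed storage: column j occupies ap[j*(j+1)/2 .. j*(j+1)/2 + j],
// so A(i,j) with i <= j lives at ap[j*(j+1)/2 + i].
double upper_packed_entry(const double* ap, int i, int j) {
  return ap[j * (j + 1) / 2 + i];
}

// Lower packed storage: column j starts at j*n - j*(j-1)/2, so A(i,j)
// with i >= j lives at ap[j*n - j*(j-1)/2 + (i - j)].
double lower_packed_entry(const double* ap, int n, int i, int j) {
  return ap[j * n - j * (j - 1) / 2 + (i - j)];
}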

/** ZHPR performs the hermitian rank 1 operation
*

@@ -303,61 +303,158 @@ EIGEN_BLAS_FUNC(gbmv)
if (actual_y != y) delete[] copy_back(actual_y, y, actual_m, *incy);
}

#if 0
/** TBMV performs one of the matrix-vector operations
*
* x := A*x, or x := A'*x,
*
* where x is an n element vector and A is an n by n unit, or non-unit,
* upper or lower triangular band matrix, with ( k + 1 ) diagonals.
*/
EIGEN_BLAS_FUNC(tbmv)(char *uplo, char *opa, char *diag, int *n, int *k, RealScalar *pa, int *lda, RealScalar *px, int *incx)
{
Scalar* a = reinterpret_cast<Scalar*>(pa);
Scalar* x = reinterpret_cast<Scalar*>(px);
int coeff_rows = *k + 1;
*
* x := A*x, or x := A'*x, or x := conjg(A')*x,
*
* where x is an n element vector and A is an n by n unit, or non-unit,
* upper or lower triangular band matrix, with ( k + 1 ) diagonals.
*
* Band storage: upper triangle stores A[i,j] at a[(k+i-j) + j*lda],
* lower triangle stores A[i,j] at a[(i-j) + j*lda].
*/
EIGEN_BLAS_FUNC(tbmv)
(char *uplo, char *opa, char *diag, int *n, int *k, RealScalar *pa, int *lda, RealScalar *px, int *incx) {
Scalar *a = reinterpret_cast<Scalar *>(pa);
Scalar *x = reinterpret_cast<Scalar *>(px);

int info = 0;
if(UPLO(*uplo)==INVALID) info = 1;
else if(OP(*opa)==INVALID) info = 2;
else if(DIAG(*diag)==INVALID) info = 3;
else if(*n<0) info = 4;
else if(*k<0) info = 5;
else if(*lda<coeff_rows) info = 7;
else if(*incx==0) info = 9;
if(info)
return xerbla_(SCALAR_SUFFIX_UP"TBMV ",&info,6);
if (UPLO(*uplo) == INVALID)
info = 1;
else if (OP(*opa) == INVALID)
info = 2;
else if (DIAG(*diag) == INVALID)
info = 3;
else if (*n < 0)
info = 4;
else if (*k < 0)
info = 5;
else if (*lda < *k + 1)
info = 7;
else if (*incx == 0)
info = 9;
if (info) return xerbla_(SCALAR_SUFFIX_UP "TBMV ", &info);

if(*n==0) return;
if (*n == 0) return;

int actual_n = *n;
Scalar *actual_x = get_compact_vector(x, *n, *incx);

Scalar* actual_x = get_compact_vector(x,actual_n,*incx);
bool upper = (UPLO(*uplo) == UP);
int op = OP(*opa);
bool unit = (DIAG(*diag) == UNIT);

MatrixType mat_coeffs(a,coeff_rows,*n,*lda);

int ku = UPLO(*uplo)==UPPER ? *k : 0;
int kl = UPLO(*uplo)==LOWER ? *k : 0;

for(int j=0; j<*n; ++j)
{
int start = std::max(0,j - ku);
int end = std::min((*m)-1,j + kl);
int len = end - start + 1;
int offset = (ku) - j + start;

if(OP(*trans)==NOTR)
make_vector(actual_y+start,len) += (alpha*actual_x[j]) * mat_coeffs.col(j).segment(offset,len);
else if(OP(*trans)==TR)
actual_y[j] += alpha * ( mat_coeffs.col(j).segment(offset,len).transpose() * make_vector(actual_x+start,len) ).value();
else
actual_y[j] += alpha * ( mat_coeffs.col(j).segment(offset,len).adjoint() * make_vector(actual_x+start,len) ).value();
if (*k >= 8) {
// Vectorized path: use Eigen Map segments for the inner band operations.
ConstMatrixType band(a, *k + 1, *n, *lda);
if (op == NOTR) {
if (upper) {
for (int j = 0; j < *n; ++j) {
if (actual_x[j] != Scalar(0)) {
int start = std::max(0, j - *k);
int len = j - start;
int offset = *k - (j - start);
Scalar temp = actual_x[j];
if (len > 0) make_vector(actual_x + start, len) += temp * band.col(j).segment(offset, len);
if (!unit) actual_x[j] = temp * band(*k, j);
}
}
} else {
for (int j = *n - 1; j >= 0; --j) {
if (actual_x[j] != Scalar(0)) {
int len = std::min(*n - 1, j + *k) - j;
Scalar temp = actual_x[j];
if (len > 0) make_vector(actual_x + j + 1, len) += temp * band.col(j).segment(1, len);
if (!unit) actual_x[j] = temp * band(0, j);
}
}
}
} else if (op == TR) {
if (upper) {
for (int j = *n - 1; j >= 0; --j) {
int start = std::max(0, j - *k);
int len = j - start;
int offset = *k - (j - start);
Scalar temp = actual_x[j];
if (!unit) temp *= band(*k, j);
if (len > 0)
temp += (band.col(j).segment(offset, len).cwiseProduct(make_vector(actual_x + start, len))).sum();
actual_x[j] = temp;
}
} else {
for (int j = 0; j < *n; ++j) {
int len = std::min(*n - 1, j + *k) - j;
Scalar temp = actual_x[j];
if (!unit) temp *= band(0, j);
if (len > 0) temp += (band.col(j).segment(1, len).cwiseProduct(make_vector(actual_x + j + 1, len))).sum();
actual_x[j] = temp;
}
}
} else {
// Conjugate transpose: .dot() computes conj(lhs) . rhs.
|
||||
if (upper) {
|
||||
for (int j = *n - 1; j >= 0; --j) {
|
||||
int start = std::max(0, j - *k);
|
||||
int len = j - start;
|
||||
int offset = *k - (j - start);
|
||||
Scalar temp = actual_x[j];
|
||||
if (!unit) temp *= Eigen::numext::conj(band(*k, j));
|
||||
if (len > 0) temp += band.col(j).segment(offset, len).dot(make_vector(actual_x + start, len));
|
||||
actual_x[j] = temp;
|
||||
}
|
||||
} else {
|
||||
for (int j = 0; j < *n; ++j) {
|
||||
int len = std::min(*n - 1, j + *k) - j;
|
||||
Scalar temp = actual_x[j];
|
||||
if (!unit) temp *= Eigen::numext::conj(band(0, j));
|
||||
if (len > 0) temp += band.col(j).segment(1, len).dot(make_vector(actual_x + j + 1, len));
|
||||
actual_x[j] = temp;
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Scalar path: for narrow bandwidth, avoid Map overhead.
|
||||
if (op == NOTR) {
|
||||
if (upper) {
|
||||
for (int j = 0; j < *n; ++j) {
|
||||
if (actual_x[j] != Scalar(0)) {
|
||||
Scalar temp = actual_x[j];
|
||||
for (int i = std::max(0, j - *k); i < j; ++i) actual_x[i] += temp * a[(*k + i - j) + j * *lda];
|
||||
if (!unit) actual_x[j] = temp * a[*k + j * *lda];
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (int j = *n - 1; j >= 0; --j) {
|
||||
if (actual_x[j] != Scalar(0)) {
|
||||
Scalar temp = actual_x[j];
|
||||
for (int i = j + 1; i <= std::min(*n - 1, j + *k); ++i) actual_x[i] += temp * a[(i - j) + j * *lda];
|
||||
if (!unit) actual_x[j] = temp * a[j * *lda];
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Transpose or conjugate transpose.
|
||||
auto maybe_conj = [op](Scalar val) -> Scalar { return op == ADJ ? Eigen::numext::conj(val) : val; };
|
||||
if (upper) {
|
||||
for (int j = *n - 1; j >= 0; --j) {
|
||||
Scalar temp = actual_x[j];
|
||||
if (!unit) temp *= maybe_conj(a[*k + j * *lda]);
|
||||
for (int i = std::max(0, j - *k); i < j; ++i) temp += maybe_conj(a[(*k + i - j) + j * *lda]) * actual_x[i];
|
||||
actual_x[j] = temp;
|
||||
}
|
||||
} else {
|
||||
for (int j = 0; j < *n; ++j) {
|
||||
Scalar temp = actual_x[j];
|
||||
if (!unit) temp *= maybe_conj(a[j * *lda]);
|
||||
for (int i = j + 1; i <= std::min(*n - 1, j + *k); ++i)
|
||||
temp += maybe_conj(a[(i - j) + j * *lda]) * actual_x[i];
|
||||
actual_x[j] = temp;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(actual_x!=x) delete[] actual_x;
|
||||
if(actual_y!=y) delete[] copy_back(actual_y,y,actual_m,*incy);
|
||||
if (actual_x != x) delete[] copy_back(actual_x, x, *n, *incx);
|
||||
}
|
||||
#endif
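tbmv above and sbmv below share the LAPACK-style band layout spelled out in their doc comments.
A minimal standalone sketch, not part of the patch, checking the a[(k+i-j) + j*lda] rule on a
made-up matrix:

#include <algorithm>
#include <cassert>

int main() {
  const int n = 4, k = 1, lda = k + 1;  // one super-diagonal, upper storage
  // Dense symmetric band matrix: A(i,j) == 0 unless |i - j| <= k.
  double A[4][4] = {{10, 1, 0, 0}, {1, 20, 2, 0}, {0, 2, 30, 3}, {0, 0, 3, 40}};
  double a[lda * n];
  // Upper-triangle band storage: A(i,j) -> a[(k + i - j) + j*lda] for j-k <= i <= j.
  for (int j = 0; j < n; ++j)
    for (int i = std::max(0, j - k); i <= j; ++i) a[(k + i - j) + j * lda] = A[i][j];
  // Row k of the storage holds the diagonal, row k-1 the first super-diagonal.
  assert(a[k + 2 * lda] == 30);       // A(2,2)
  assert(a[(k - 1) + 2 * lda] == 2);  // A(1,2)
  return 0;
}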

/** DTBSV solves one of the systems of equations
 *

@@ -158,32 +158,196 @@ EIGEN_BLAS_FUNC(syr2)
  // func[code](*n, a, *inca, b, *incb, c, *ldc, alpha);
}

/** DSBMV performs the matrix-vector operation
/** SBMV performs the matrix-vector operation
 *
 * y := alpha*A*x + beta*y,
 *
 * where alpha and beta are scalars, x and y are n element vectors and
 * A is an n by n symmetric band matrix, with k super-diagonals.
 *
 * Band storage: upper triangle stores A[i,j] at a[(k+i-j) + j*lda],
 * lower triangle stores A[i,j] at a[(i-j) + j*lda].
 */
// EIGEN_BLAS_FUNC(sbmv)( char *uplo, int *n, int *k, RealScalar *alpha, RealScalar *a, int *lda,
//                        RealScalar *x, int *incx, RealScalar *beta, RealScalar *y, int *incy)
// {
//   return 1;
// }
EIGEN_BLAS_FUNC(sbmv)
(char *uplo, int *n, int *k, RealScalar *palpha, RealScalar *pa, int *lda, RealScalar *px, int *incx, RealScalar *pbeta,
 RealScalar *py, int *incy) {
  const Scalar alpha = *reinterpret_cast<const Scalar *>(palpha);
  const Scalar beta = *reinterpret_cast<const Scalar *>(pbeta);
  const Scalar *a = reinterpret_cast<const Scalar *>(pa);
  const Scalar *x = reinterpret_cast<const Scalar *>(px);
  Scalar *y = reinterpret_cast<Scalar *>(py);

/** DSPMV performs the matrix-vector operation
  int info = 0;
  if (UPLO(*uplo) == INVALID)
    info = 1;
  else if (*n < 0)
    info = 2;
  else if (*k < 0)
    info = 3;
  else if (*lda < *k + 1)
    info = 6;
  else if (*incx == 0)
    info = 8;
  else if (*incy == 0)
    info = 11;
  if (info) return xerbla_(SCALAR_SUFFIX_UP "SBMV ", &info);

  if (*n == 0 || (alpha == Scalar(0) && beta == Scalar(1))) return;

  const Scalar *actual_x = get_compact_vector(x, *n, *incx);
  Scalar *actual_y = get_compact_vector(y, *n, *incy);

  // First form y := beta*y.
  if (beta != Scalar(1)) {
    if (beta == Scalar(0))
      make_vector(actual_y, *n).setZero();
    else
      make_vector(actual_y, *n) *= beta;
  }

  if (alpha == Scalar(0)) {
    if (actual_x != x) delete[] actual_x;
    if (actual_y != y) delete[] copy_back(actual_y, y, *n, *incy);
    return;
  }

  if (*k >= 8) {
    // Vectorized path: use Eigen Map segments for the inner band operations.
    ConstMatrixType band(a, *k + 1, *n, *lda);
    if (UPLO(*uplo) == UP) {
      for (int j = 0; j < *n; ++j) {
        int start = std::max(0, j - *k);
        int len = j - start;
        int offset = *k - (j - start);
        Scalar temp1 = alpha * actual_x[j];
        actual_y[j] += temp1 * band(*k, j);
        if (len > 0) {
          make_vector(actual_y + start, len) += temp1 * band.col(j).segment(offset, len);
          actual_y[j] += alpha * band.col(j).segment(offset, len).dot(make_vector(actual_x + start, len));
        }
      }
    } else {
      for (int j = 0; j < *n; ++j) {
        int len = std::min(*n - 1, j + *k) - j;
        Scalar temp1 = alpha * actual_x[j];
        actual_y[j] += temp1 * band(0, j);
        if (len > 0) {
          make_vector(actual_y + j + 1, len) += temp1 * band.col(j).segment(1, len);
          actual_y[j] += alpha * band.col(j).segment(1, len).dot(make_vector(actual_x + j + 1, len));
        }
      }
    }
  } else {
    // Scalar path: for narrow bandwidth, avoid Map overhead.
    if (UPLO(*uplo) == UP) {
      for (int j = 0; j < *n; ++j) {
        Scalar temp1 = alpha * actual_x[j];
        Scalar temp2 = Scalar(0);
        for (int i = std::max(0, j - *k); i < j; ++i) {
          Scalar aij = a[(*k + i - j) + j * *lda];
          actual_y[i] += temp1 * aij;
          temp2 += aij * actual_x[i];
        }
        actual_y[j] += temp1 * a[*k + j * *lda] + alpha * temp2;
      }
    } else {
      for (int j = 0; j < *n; ++j) {
        Scalar temp1 = alpha * actual_x[j];
        Scalar temp2 = Scalar(0);
        actual_y[j] += temp1 * a[j * *lda];
        for (int i = j + 1; i <= std::min(*n - 1, j + *k); ++i) {
          Scalar aij = a[(i - j) + j * *lda];
          actual_y[i] += temp1 * aij;
          temp2 += aij * actual_x[i];
        }
        actual_y[j] += alpha * temp2;
      }
    }
  }

  if (actual_x != x) delete[] actual_x;
  if (actual_y != y) delete[] copy_back(actual_y, y, *n, *incy);
}
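For reference, a hedged sketch of driving the routine above from C++. The symbol name dsbmv_ is
an assumption (the usual Fortran underscore convention for the double-precision instantiation of
EIGEN_BLAS_FUNC); the data is illustrative:

#include <cstdio>

extern "C" void dsbmv_(const char *uplo, const int *n, const int *k, const double *alpha,
                       const double *a, const int *lda, const double *x, const int *incx,
                       const double *beta, double *y, const int *incy);

int main() {
  const int n = 3, k = 1, lda = k + 1, inc = 1;
  const double alpha = 1.0, beta = 0.0;
  // Upper band storage of tridiag(1; 2; 1): row k holds the diagonal,
  // row k-1 the super-diagonal (the first entry of that row is unused).
  double a[6] = {0.0, 2.0, 1.0, 2.0, 1.0, 2.0};
  double x[3] = {1.0, 1.0, 1.0}, y[3] = {0.0, 0.0, 0.0};
  dsbmv_("U", &n, &k, &alpha, a, &lda, x, &inc, &beta, y, &inc);
  std::printf("%g %g %g\n", y[0], y[1], y[2]);  // expected: 3 4 3
  return 0;
}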

/** SPMV performs the matrix-vector operation
 *
 * y := alpha*A*x + beta*y,
 *
 * where alpha and beta are scalars, x and y are n element vectors and
 * A is an n by n symmetric matrix, supplied in packed form.
 *
 * Packed storage: upper triangle stores columns sequentially so that
 * column j occupies positions kk..kk+j (where kk = j*(j+1)/2),
 * lower triangle stores column j at positions kk..kk+(n-j-1).
 */
// EIGEN_BLAS_FUNC(spmv)(char *uplo, int *n, RealScalar *alpha, RealScalar *ap, RealScalar *x, int *incx, RealScalar
// *beta, RealScalar *y, int *incy)
// {
//   return 1;
// }
EIGEN_BLAS_FUNC(spmv)
(char *uplo, int *n, RealScalar *palpha, RealScalar *pap, RealScalar *px, int *incx, RealScalar *pbeta, RealScalar *py,
 int *incy) {
  const Scalar alpha = *reinterpret_cast<const Scalar *>(palpha);
  const Scalar beta = *reinterpret_cast<const Scalar *>(pbeta);
  const Scalar *ap = reinterpret_cast<const Scalar *>(pap);
  const Scalar *x = reinterpret_cast<const Scalar *>(px);
  Scalar *y = reinterpret_cast<Scalar *>(py);

  int info = 0;
  if (UPLO(*uplo) == INVALID)
    info = 1;
  else if (*n < 0)
    info = 2;
  else if (*incx == 0)
    info = 6;
  else if (*incy == 0)
    info = 9;
  if (info) return xerbla_(SCALAR_SUFFIX_UP "SPMV ", &info);

  if (*n == 0 || (alpha == Scalar(0) && beta == Scalar(1))) return;

  const Scalar *actual_x = get_compact_vector(x, *n, *incx);
  Scalar *actual_y = get_compact_vector(y, *n, *incy);

  // First form y := beta*y.
  if (beta != Scalar(1)) {
    if (beta == Scalar(0))
      make_vector(actual_y, *n).setZero();
    else
      make_vector(actual_y, *n) *= beta;
  }

  if (alpha == Scalar(0)) {
    if (actual_x != x) delete[] actual_x;
    if (actual_y != y) delete[] copy_back(actual_y, y, *n, *incy);
    return;
  }

  int kk = 0;
  if (UPLO(*uplo) == UP) {
    // Upper triangle packed: column j occupies ap[kk..kk+j].
    for (int j = 0; j < *n; ++j) {
      Scalar temp1 = alpha * actual_x[j];
      actual_y[j] += temp1 * ap[kk + j];
      if (j > 0) {
        make_vector(actual_y, j) += temp1 * make_vector(ap + kk, j);
        actual_y[j] += alpha * make_vector(ap + kk, j).dot(make_vector(actual_x, j));
      }
      kk += j + 1;
    }
  } else {
    // Lower triangle packed: column j occupies ap[kk..kk+(n-j-1)].
    for (int j = 0; j < *n; ++j) {
      int len = *n - j - 1;
      Scalar temp1 = alpha * actual_x[j];
      actual_y[j] += temp1 * ap[kk];
      if (len > 0) {
        make_vector(actual_y + j + 1, len) += temp1 * make_vector(ap + kk + 1, len);
        actual_y[j] += alpha * make_vector(ap + kk + 1, len).dot(make_vector(actual_x + j + 1, len));
      }
      kk += *n - j;
    }
  }

  if (actual_x != x) delete[] actual_x;
  if (actual_y != y) delete[] copy_back(actual_y, y, *n, *incy);
}
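The loop above implements y := alpha*A*x + beta*y for a symmetric A held in packed form. A small
cross-check sketch, illustrative only, against Eigen's dense selfadjoint view:

#include <Eigen/Dense>
#include <iostream>

int main() {
  using Eigen::MatrixXd;
  using Eigen::VectorXd;
  const int n = 4;
  MatrixXd A0 = MatrixXd::Random(n, n);
  MatrixXd A = A0 + A0.transpose();  // symmetric test matrix
  VectorXd x = VectorXd::Random(n), y = VectorXd::Random(n);
  const double alpha = 2.0, beta = 0.5;

  // Pack the upper triangle column by column, as spmv expects.
  VectorXd ap(n * (n + 1) / 2);
  int kk = 0;
  for (int j = 0; j < n; ++j) {
    ap.segment(kk, j + 1) = A.col(j).head(j + 1);
    kk += j + 1;
  }

  // Reference result via the dense selfadjoint product.
  VectorXd ref = alpha * (A.selfadjointView<Eigen::Upper>() * x) + beta * y;

  // Same recurrence as the packed loop above.
  VectorXd out = beta * y;
  kk = 0;
  for (int j = 0; j < n; ++j) {
    out.head(j) += alpha * x[j] * ap.segment(kk, j);
    out[j] += alpha * (ap.segment(kk, j).dot(x.head(j)) + ap[kk + j] * x[j]);
    kk += j + 1;
  }
  std::cout << "max error: " << (out - ref).cwiseAbs().maxCoeff() << "\n";  // ~1e-16
  return 0;
}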

/** DSPR performs the symmetric rank 1 operation
 *

15
blas/lsame.cpp
Normal file
@@ -0,0 +1,15 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#include <cctype>

#include "blas.h"

// LSAME returns true if ca and cb are the same letter, regardless of case.
extern "C" EIGEN_BLAS_API int lsame_(const char *ca, const char *cb) {
  return std::toupper(static_cast<unsigned char>(*ca)) == std::toupper(static_cast<unsigned char>(*cb));
}
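A hypothetical self-test, not part of the patch, pinning down the case-insensitive semantics:

#include <cassert>

extern "C" int lsame_(const char *ca, const char *cb);  // prototype matches the definition above

int main() {
  assert(lsame_("U", "u"));   // same letter, different case
  assert(lsame_("N", "n"));
  assert(!lsame_("U", "L"));  // different letters
  return 0;
}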

@@ -197,7 +197,7 @@ build:linux:x86-64:nvhpc-26.1:default:unsupported:
    # Additional flags passed to the cuda compiler.
    EIGEN_CI_CUDA_CXX_FLAGS: ""
    # Compute architectures present in the GitLab CI runners.
    EIGEN_CI_CUDA_COMPUTE_ARCH: "50;75"
    EIGEN_CI_CUDA_COMPUTE_ARCH: "70;75"
    EIGEN_CI_BUILD_TARGET: buildtests_gpu
    EIGEN_CI_TEST_CUDA_CLANG: "off"
    EIGEN_CI_TEST_CUDA_NVC: "off"
@@ -211,20 +211,20 @@ build:linux:x86-64:nvhpc-26.1:default:unsupported:
    # Build on regular linux to limit GPU cost.
    - saas-linux-2xlarge-amd64

# GCC-10, CUDA-12.2
build:linux:cuda-12.2:gcc-10:
# GCC-11, CUDA-12.2
build:linux:cuda-12.2:gcc-11:
  extends: .build:linux:cuda
  image: nvidia/cuda:12.2.0-devel-ubuntu20.04
  image: nvidia/cuda:12.2.0-devel-ubuntu22.04
  variables:
    EIGEN_CI_C_COMPILER: gcc-10
    EIGEN_CI_CXX_COMPILER: g++-10
    EIGEN_CI_C_COMPILER: gcc-11
    EIGEN_CI_CXX_COMPILER: g++-11

# Clang-12, CUDA-12.2
build:linux:cuda-12.2:clang-12:
  extends: build:linux:cuda-12.2:gcc-10
# Clang-14, CUDA-12.2
build:linux:cuda-12.2:clang-14:
  extends: build:linux:cuda-12.2:gcc-11
  variables:
    EIGEN_CI_C_COMPILER: clang-12
    EIGEN_CI_CXX_COMPILER: clang++-12
    EIGEN_CI_C_COMPILER: clang-14
    EIGEN_CI_CXX_COMPILER: clang++-14
    EIGEN_CI_TEST_CUDA_CLANG: "on"


@@ -234,7 +234,7 @@ build:linux:cuda-12.2:clang-12:
# ROCm HIP
build:linux:rocm-latest:gcc-10:
  extends: .build:linux:cross
  image: rocm/dev-ubuntu-24.04:latest
  image: rocm/dev-ubuntu-24.04:6.3.1
  variables:
    EIGEN_CI_C_COMPILER: gcc-10
    EIGEN_CI_CXX_COMPILER: g++-10
@@ -386,6 +386,6 @@ build:linux:cross:x86-64:clang-14:sanitizer:smoketest:
  rules:
    - if: $CI_PIPELINE_SOURCE == "merge_request_event"
  tags:
    - saas-linux-medium-amd64
    - saas-linux-large-amd64
  allow_failure: true
  timeout: 30m

@@ -55,7 +55,7 @@ build:windows:x86-64:msvc-14.29:avx512dq:
  extends: .build:windows
  variables:
    # Compute architectures present in the GitLab CI runners.
    EIGEN_CI_CUDA_COMPUTE_ARCH: "50;75"
    EIGEN_CI_CUDA_COMPUTE_ARCH: "70;75"
    EIGEN_CI_BUILD_TARGET: buildtests_gpu
    EIGEN_CI_ADDITIONAL_ARGS:
      -DEIGEN_TEST_CUDA=on
@@ -66,8 +66,8 @@ build:windows:x86-64:msvc-14.29:avx512dq:
    - x86-64
    - cuda

# MSVC 14.29 + CUDA 11.4
build:windows:x86-64:cuda-11.4:msvc-14.29:
# MSVC 14.29 + CUDA 12.2
build:windows:x86-64:cuda-12.2:msvc-14.29:
  extends: .build:windows:cuda
  variables:
    EIGEN_CI_BEFORE_SCRIPT: $$env:CUDA_PATH=$$env:CUDA_PATH_V11_4
    EIGEN_CI_BEFORE_SCRIPT: $$env:CUDA_PATH=$$env:CUDA_PATH_V12_2

@@ -265,23 +265,23 @@ test:linux:x86-64:nvhpc-26.1:default:unsupported:
  tags:
    - saas-linux-medium-amd64-gpu-standard

# GCC-10, CUDA-12.2
test:linux:cuda-12.2:gcc-10:
# GCC-11, CUDA-12.2
test:linux:cuda-12.2:gcc-11:
  extends: .test:linux:cuda
  image: nvidia/cuda:12.2.0-devel-ubuntu20.04
  needs: [ build:linux:cuda-12.2:gcc-10 ]
  image: nvidia/cuda:12.2.0-devel-ubuntu22.04
  needs: [ build:linux:cuda-12.2:gcc-11 ]
  variables:
    EIGEN_CI_CXX_COMPILER: g++-10
    EIGEN_CI_CC_COMPILER: gcc-10
    EIGEN_CI_CXX_COMPILER: g++-11
    EIGEN_CI_CC_COMPILER: gcc-11

# Clang-12, CUDA-12.2
test:linux:cuda-12.2:clang-12:
# Clang-14, CUDA-12.2
test:linux:cuda-12.2:clang-14:
  extends: .test:linux:cuda
  image: nvidia/cuda:12.2.0-devel-ubuntu20.04
  needs: [ build:linux:cuda-12.2:clang-12 ]
  image: nvidia/cuda:12.2.0-devel-ubuntu22.04
  needs: [ build:linux:cuda-12.2:clang-14 ]
  variables:
    EIGEN_CI_CXX_COMPILER: clang++-12
    EIGEN_CI_CC_COMPILER: clang-12
    EIGEN_CI_CXX_COMPILER: clang++-14
    EIGEN_CI_CC_COMPILER: clang-14


##### arm ######################################################################
@@ -488,7 +488,6 @@ test:linux:x86-64:clang-14:sanitizer:smoketest:
  variables:
    EIGEN_CI_INSTALL: clang-14 llvm-14 libclang-rt-14-dev
    EIGEN_CI_CTEST_LABEL: smoketest
    EIGEN_CI_CTEST_PARALLEL: "2"
    EIGEN_CI_CTEST_ARGS: --timeout 120
    ASAN_OPTIONS: "detect_leaks=0:halt_on_error=1:abort_on_error=1:allocator_may_return_null=1:print_stacktrace=1:detect_stack_use_after_return=0"
    ASAN_SYMBOLIZER_PATH: "/usr/lib/llvm-14/bin/llvm-symbolizer"
@@ -496,6 +495,6 @@ test:linux:x86-64:clang-14:sanitizer:smoketest:
  rules:
    - if: $CI_PIPELINE_SOURCE == "merge_request_event"
  tags:
    - saas-linux-medium-amd64
    - saas-linux-large-amd64
  allow_failure: true
  timeout: 30m

@@ -71,7 +71,7 @@ test:windows:x86-64:msvc-14.29:avx512dq:unsupported:
    - x86-64
    - cuda

# MSVC 14.29 + CUDA 11.4
test:windows:x86-64:cuda-11.4:msvc-14.29:
# MSVC 14.29 + CUDA 12.2
test:windows:x86-64:cuda-12.2:msvc-14.29:
  extends: .test:windows:cuda
  needs: [ build:windows:x86-64:cuda-11.4:msvc-14.29 ]
  needs: [ build:windows:x86-64:cuda-12.2:msvc-14.29 ]

@@ -20,7 +20,8 @@ add_dependencies(check buildtests)

# Convenience target for only building GPU tests.
add_custom_target(buildtests_gpu)
add_custom_target(check_gpu COMMAND "ctest" "--output-on-failure"
add_custom_target(check_gpu COMMAND "ctest" ${EIGEN_CTEST_ARGS}
  "--output-on-failure"
  "--no-compress-output"
  "--build-no-clean"
  "-T" "test"
@@ -71,4 +72,3 @@ elseif(MSVC)
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /D_CRT_SECURE_NO_WARNINGS /D_SCL_SECURE_NO_WARNINGS")
endif()

@@ -8,6 +8,12 @@ macro(ei_add_property prop value)
  endif()
endmacro()

if(EIGEN_TEST_HIP AND NOT DEFINED EIGEN_HIP_ARCHITECTURES)
  set(EIGEN_HIP_ARCHITECTURES
      gfx900;gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1150;gfx1151
      CACHE STRING "HIP GPU architectures to build Eigen's HIP tests for.")
endif()

#internal. See documentation of ei_add_test for details.
macro(ei_add_test_internal testname testname_with_suffix)
  set(targetname ${testname_with_suffix})
@@ -30,7 +36,7 @@ macro(ei_add_test_internal testname testname_with_suffix)
    hip_reset_flags()
    hip_add_executable(${targetname} ${filename} HIPCC_OPTIONS -std=c++14)
    target_compile_definitions(${targetname} PRIVATE -DEIGEN_USE_HIP)
    set_property(TARGET ${targetname} PROPERTY HIP_ARCHITECTURES gfx900 gfx906 gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030)
    set_property(TARGET ${targetname} PROPERTY HIP_ARCHITECTURES "${EIGEN_HIP_ARCHITECTURES}")
  elseif(EIGEN_TEST_CUDA_CLANG)
    set_source_files_properties(${filename} PROPERTIES LANGUAGE CXX)

@@ -134,6 +140,7 @@ macro(ei_add_test_internal testname testname_with_suffix)
    if (is_gpu_test)
      # Add gpu tag for testing only GPU tests.
      set_property(TEST ${testname_with_suffix} APPEND PROPERTY LABELS "gpu")
      set_property(TEST ${testname_with_suffix} PROPERTY SKIP_RETURN_CODE 77)
    endif()

    if(EIGEN_SYCL)

@@ -30,10 +30,11 @@ Timings are in \b milliseconds, and factors are relative to the LLT decompositio
<a name="note_ls">\b *: </a> This decomposition does not support direct least-square solving for over-constrained problems, and the reported timings include the cost to form the symmetric covariance matrix \f$ A^T A \f$.

\b Observations:
 + LLT is always the fastest solvers.
 + LLT is always the fastest solver.
 + For largely over-constrained problems, the cost of Cholesky/LU decompositions is dominated by the computation of the symmetric covariance matrix.
 + For large problem sizes, only the decomposition implementing a cache-friendly blocking strategy scale well. Those include LLT, PartialPivLU, HouseholderQR, and BDCSVD. This explain why for a 4k x 4k matrix, HouseholderQR is faster than LDLT. In the future, LDLT and ColPivHouseholderQR will also implement blocking strategies.
 + For large problem sizes, only the decompositions implementing a cache-friendly blocking strategy scale well. Those include LLT, PartialPivLU, HouseholderQR, and BDCSVD. This explains why for a 4k x 4k matrix, HouseholderQR is faster than LDLT.
 + CompleteOrthogonalDecomposition is based on ColPivHouseholderQR and they thus achieve the same level of performance.
 + FullPivLU and FullPivHouseholderQR are dramatically slower for large matrices due to the lack of blocking, and are not shown for the 4k x 4k case.

The above table was originally generated by a benchmark tool. Feel free to write your own benchmark to generate a table matching your hardware, compiler, and favorite problem sizes.

@@ -7,13 +7,33 @@ of equations, say \a Ax = \a b, has no solutions. In this case, it makes sense t
vector \a x which is closest to being a solution, in the sense that the difference \a Ax - \a b is
as small as possible. This \a x is called the least square solution (if the Euclidean norm is used).

The three methods discussed on this page are the SVD decomposition, the QR decomposition and normal
equations. Of these, the SVD decomposition is generally the most accurate but the slowest, normal
equations is the fastest but least accurate, and the QR decomposition is in between.
The methods discussed on this page are the complete orthogonal decomposition (COD), the SVD
decomposition, other QR decompositions, and normal equations. For most problems, we recommend
CompleteOrthogonalDecomposition: it robustly computes the minimum-norm least squares solution
(like the SVD) for both over- and under-determined systems, including rank-deficient ones, but at
QR-like speed. The SVD is the most robust but also the slowest; use it when you also need singular
values or vectors. Normal equations are the fastest but least robust.

\eigenAutoToc


\section LeastSquaresCOD Using the complete orthogonal decomposition (recommended)

CompleteOrthogonalDecomposition is the recommended method for least squares problems. It handles the
widest class of problems — overdetermined, underdetermined, and rank-deficient systems — and computes
the minimum-norm solution when the system is rank-deficient or underdetermined, just like the SVD.
It is based on a rank-revealing QR factorization (ColPivHouseholderQR) followed by a post-processing
step, so it is significantly faster than SVD while providing comparable robustness.

<table class="example">
<tr><th>Example:</th><th>Output:</th></tr>
<tr>
  <td>\include LeastSquaresCOD.cpp </td>
  <td>\verbinclude LeastSquaresCOD.out </td>
</tr>
</table>
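To make the minimum-norm behavior concrete, here is an illustrative sketch (not one of the shipped
doc snippets) solving an underdetermined system and querying the numerical rank:

#include <Eigen/Dense>
#include <iostream>

int main() {
  using Eigen::MatrixXd;
  using Eigen::VectorXd;
  // Underdetermined: 2 equations, 4 unknowns, so infinitely many solutions;
  // the COD returns the one with the smallest Euclidean norm.
  MatrixXd A = MatrixXd::Random(2, 4);
  VectorXd b = VectorXd::Random(2);
  Eigen::CompleteOrthogonalDecomposition<MatrixXd> cod(A);
  VectorXd x = cod.solve(b);
  std::cout << "numerical rank: " << cod.rank() << "\n";    // 2, generically
  std::cout << "residual: " << (A * x - b).norm() << "\n";  // ~0: exactly solvable
  std::cout << "minimum-norm solution:\n" << x << "\n";
  return 0;
}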


\section LeastSquaresSVD Using the SVD decomposition

The \link BDCSVD::solve() solve() \endlink method in the BDCSVD class can be directly used to
@@ -30,16 +50,19 @@ computing least squares solutions:
</table>

This is an example from the page \link TutorialLinearAlgebra Linear algebra and decompositions \endlink.
If you just need to solve the least squares problem, but are not interested in the SVD per se, a
faster alternative method is CompleteOrthogonalDecomposition.
The SVD gives you singular values and vectors in addition to the least squares solution, but if you
only need the solution, CompleteOrthogonalDecomposition (above) is faster.


\section LeastSquaresQR Using the QR decomposition
\section LeastSquaresQR Using other QR decompositions

The solve() method in QR decomposition classes also computes the least squares solution. There are
three QR decomposition classes: HouseholderQR (no pivoting, fast but unstable if your matrix is
not rull rank), ColPivHouseholderQR (column pivoting, thus a bit slower but more stable) and
FullPivHouseholderQR (full pivoting, so slowest and slightly more stable than ColPivHouseholderQR).
The solve() method in QR decomposition classes also computes the least squares solution. Besides
CompleteOrthogonalDecomposition (above), there are three other QR decomposition classes:
HouseholderQR (no pivoting, so fast but unreliable if your matrix is not full rank),
ColPivHouseholderQR (column pivoting, a bit slower but rank-revealing), and FullPivHouseholderQR
(full pivoting, significantly slower and rarely needed in practice).
Note that only CompleteOrthogonalDecomposition and the SVD-based solvers compute minimum-norm
solutions for rank-deficient or underdetermined problems; the other QR variants do not.
Here is an example with column pivoting:

<table class="example">

@@ -42,10 +42,10 @@ To get an overview of the true relative speed of the different decompositions, c
    <tr class="alt">
        <td>FullPivLU</td>
        <td>-</td>
        <td>Slow</td>
        <td>Slow (no blocking)</td>
        <td>Proven</td>
        <td>Yes</td>
        <td>-</td>
        <td>Rank, kernel, image</td>
        <td>Yes</td>
        <td>Excellent</td>
        <td>-</td>
@@ -78,7 +78,7 @@ To get an overview of the true relative speed of the different decompositions, c
    <tr>
        <td>FullPivHouseholderQR</td>
        <td>-</td>
        <td>Slow</td>
        <td>Slow (no blocking)</td>
        <td>Proven</td>
        <td>Yes</td>
        <td>Orthogonalization</td>
@@ -120,7 +120,7 @@ To get an overview of the true relative speed of the different decompositions, c
        <td>-</td>
        <td>Yes</td>
        <td>Excellent</td>
        <td><em>Soon: blocking</em></td>
        <td>-</td>
    </tr>

    <tr><th class="inter" colspan="9">\n Singular values and eigenvalues decompositions</th></tr>
@@ -232,7 +232,7 @@ To get an overview of the true relative speed of the different decompositions, c
        <td>-</td>
        <td>-</td>
        <td>Good</td>
        <td><em>Soon: blocking</em></td>
        <td>-</td>
    </tr>

    <tr>
@@ -244,7 +244,7 @@ To get an overview of the true relative speed of the different decompositions, c
        <td>-</td>
        <td>-</td>
        <td>Good</td>
        <td><em>Soon: blocking</em></td>
        <td>-</td>
    </tr>

</table>
@@ -253,9 +253,32 @@ To get an overview of the true relative speed of the different decompositions, c
<ul>
<li><a name="note1">\b 1: </a>There exist two variants of the LDLT algorithm. Eigen's one produces a pure diagonal D matrix, and therefore it cannot handle indefinite matrices, unlike Lapack's one which produces a block diagonal D matrix.</li>
<li><a name="note2">\b 2: </a>Eigenvalues, SVD and Schur decompositions rely on iterative algorithms. Their convergence speed depends on how well the eigenvalues are separated.</li>
<li><a name="note3">\b 3: </a>Our JacobiSVD is two-sided, making for proven and optimal precision for square matrices. For non-square matrices, we have to use a QR preconditioner first. The default choice, ColPivHouseholderQR, is already very reliable, but if you want it to be proven, use FullPivHouseholderQR instead.
<li><a name="note3">\b 3: </a>Our JacobiSVD is two-sided, making for proven and optimal precision for square matrices. For non-square matrices, we have to use a QR preconditioner first. The default choice, ColPivHouseholderQR, is already very reliable, but if you want it to be proven, use FullPivHouseholderQR instead.</li>
</ul>

\section TopicLinAlgPracticalGuidance Practical guidance

The following recommendations apply to the most common use cases (a short code sketch follows the list):

\li <b>Symmetric positive definite systems:</b> Use \b LLT. It is the fastest solver and has excellent
numerical properties for this class of problems. For semidefinite or nearly singular symmetric systems,
use \b LDLT.
\li <b>General invertible systems:</b> Use \b PartialPivLU. It uses cache-friendly blocking and implicit
multi-threading, making it the fastest general-purpose solver. Partial pivoting is sufficient for
virtually all practical problems.
\li <b>Least squares (over- or under-determined systems):</b> Use \b CompleteOrthogonalDecomposition as
the default. Like the SVD, it robustly computes the minimum-norm solution for rank-deficient and
under-determined problems, but at QR-like speed. Use \b BDCSVD when you also need singular values
or vectors, not just the least squares solution.
\li <b>Full-rank least squares (overdetermined systems):</b> When the matrix is known to be full rank,
\b HouseholderQR is the fastest option. For very tall and skinny well-conditioned matrices,
solving via the normal equations with \b LLT can be faster still.
\li <b>FullPivLU and FullPivHouseholderQR</b> use complete pivoting, which prevents the use of
cache-friendly blocking algorithms and makes them significantly slower than their partial/column
pivoting counterparts. In practice, complete pivoting rarely provides meaningful accuracy benefits.
These decompositions are primarily useful for debugging, pedagogy, or the very rare case
where column pivoting is insufficient.
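A minimal sketch mapping the recommendations above to code; the random matrices are placeholders
standing in for a real problem:

#include <Eigen/Dense>
#include <iostream>

int main() {
  using Eigen::MatrixXd;
  using Eigen::VectorXd;
  const int n = 50;
  VectorXd b = VectorXd::Random(n);

  // Symmetric positive definite: LLT.
  MatrixXd S = MatrixXd::Random(n, n);
  S = S * S.transpose() + n * MatrixXd::Identity(n, n);  // SPD by construction
  VectorXd x1 = S.llt().solve(b);

  // General invertible: PartialPivLU.
  MatrixXd A = MatrixXd::Random(n, n);
  VectorXd x2 = A.partialPivLu().solve(b);

  // Least squares, possibly rank-deficient: CompleteOrthogonalDecomposition.
  MatrixXd B = MatrixXd::Random(n, 10);
  VectorXd x3 = B.completeOrthogonalDecomposition().solve(b);

  std::cout << x1.norm() << " " << x2.norm() << " " << x3.norm() << "\n";
  return 0;
}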

\section TopicLinAlgTerminology Terminology

<dl>

@@ -43,7 +43,23 @@ depending on your matrix, the problem you are trying to solve, and the trade-off
        <th>Requirements<br/>on the matrix</th>
        <th>Speed<br/> (small-to-medium)</th>
        <th>Speed<br/> (large)</th>
        <th>Accuracy</th>
        <th>Robustness<sup><a href="#note_robust">*</a></sup></th>
    </tr>
    <tr>
        <td>LLT</td>
        <td>llt()</td>
        <td>Positive definite</td>
        <td>+++</td>
        <td>+++</td>
        <td>+</td>
    </tr>
    <tr class="alt">
        <td>LDLT</td>
        <td>ldlt()</td>
        <td>Positive or negative<br/> semidefinite</td>
        <td>+++</td>
        <td>+</td>
        <td>++</td>
    </tr>
    <tr>
        <td>PartialPivLU</td>
@@ -54,14 +70,6 @@ depending on your matrix, the problem you are trying to solve, and the trade-off
        <td>+</td>
    </tr>
    <tr class="alt">
        <td>FullPivLU</td>
        <td>fullPivLu()</td>
        <td>None</td>
        <td>-</td>
        <td>- -</td>
        <td>+++</td>
    </tr>
    <tr>
        <td>HouseholderQR</td>
        <td>householderQr()</td>
        <td>None</td>
@@ -69,7 +77,7 @@ depending on your matrix, the problem you are trying to solve, and the trade-off
        <td>++</td>
        <td>+</td>
    </tr>
    <tr class="alt">
    <tr>
        <td>ColPivHouseholderQR</td>
        <td>colPivHouseholderQr()</td>
        <td>None</td>
@@ -77,14 +85,6 @@ depending on your matrix, the problem you are trying to solve, and the trade-off
        <td>-</td>
        <td>+++</td>
    </tr>
    <tr>
        <td>FullPivHouseholderQR</td>
        <td>fullPivHouseholderQr()</td>
        <td>None</td>
        <td>-</td>
        <td>- -</td>
        <td>+++</td>
    </tr>
    <tr class="alt">
        <td>CompleteOrthogonalDecomposition</td>
        <td>completeOrthogonalDecomposition()</td>
@@ -93,23 +93,7 @@ depending on your matrix, the problem you are trying to solve, and the trade-off
        <td>-</td>
        <td>+++</td>
    </tr>
    <tr class="alt">
        <td>LLT</td>
        <td>llt()</td>
        <td>Positive definite</td>
        <td>+++</td>
        <td>+++</td>
        <td>+</td>
    </tr>
    <tr>
        <td>LDLT</td>
        <td>ldlt()</td>
        <td>Positive or negative<br/> semidefinite</td>
        <td>+++</td>
        <td>+</td>
        <td>++</td>
    </tr>
    <tr class="alt">
        <td>BDCSVD</td>
        <td>bdcSvd()</td>
        <td>None</td>
@@ -126,15 +110,36 @@ depending on your matrix, the problem you are trying to solve, and the trade-off
        <td>+++</td>
    </tr>
</table>

<a name="note_robust"><b>*</b></a> The <b>Robustness</b> column indicates how well the decomposition handles
ill-conditioned or rank-deficient matrices. All decompositions give excellent accuracy when their
requirements on the matrix are met and the problem is well-conditioned.

To get an overview of the true relative speed of the different decompositions, check this \link DenseDecompositionBenchmark benchmark \endlink.

All of these decompositions offer a solve() method that works as in the above example.
All of these decompositions offer a solve() method that works as in the above example.

If you know more about the properties of your matrix, you can use the above table to select the best method.
For example, a good choice for solving linear systems with a non-symmetric matrix of full rank is PartialPivLU.
If you know that your matrix is also symmetric and positive definite, the above table says that
a very good choice is the LLT or LDLT decomposition. Here's an example, also demonstrating that using a general
matrix (not a vector) as right hand side is possible:
\b Practical \b recommendations:
\li If your matrix is symmetric positive definite, use \b LLT. It is the fastest and is perfectly accurate
for this class of problems. If your matrix is only positive or negative semidefinite, use \b LDLT.
\li For a general invertible matrix, \b PartialPivLU is the best choice. It is fast (uses cache-friendly
blocking) and reliable for the vast majority of problems.
\li For least squares problems (over- or under-determined systems), \b CompleteOrthogonalDecomposition
is the recommended default. Like the SVD, it robustly computes the minimum-norm solution for
rank-deficient and under-determined problems, but at the cost of a QR decomposition rather than
an SVD. Use \b ColPivHouseholderQR if you only need least squares for full-rank overdetermined
systems and don't need the minimum-norm property.
\li \b SVD decompositions (BDCSVD, JacobiSVD) are the most robust but also the slowest. Use these when
you need singular values/vectors, not just the solution.
\li \b HouseholderQR is the fastest option for full-rank least squares problems, but it does not
reveal rank and cannot compute minimum-norm solutions for rank-deficient problems.
\li FullPivLU and FullPivHouseholderQR use complete pivoting, which is significantly slower due to
lack of blocking. In practice, they rarely provide meaningful benefits over PartialPivLU and
ColPivHouseholderQR, respectively, and are not recommended for general use. They are primarily useful
for debugging or for pedagogical purposes.

Here's an example showing the use of LLT for a symmetric positive definite system, also demonstrating
that using a general matrix (not a vector) as right hand side is possible:

<table class="example">
<tr><th>Example:</th><th>Output:</th></tr>
@@ -151,14 +156,15 @@ supports many other decompositions), see our special page on

\section TutorialLinAlgLeastsquares Least squares solving

The most general and accurate method to solve under- or over-determined linear systems
in the least squares sense, is the SVD decomposition. Eigen provides two implementations.
The recommended one is the BDCSVD class, which scales well for large problems
and automatically falls back to the JacobiSVD class for smaller problems.
For both classes, their solve() method solved the linear system in the least-squares
sense.
The recommended method to solve under- or over-determined linear systems in the least squares sense is
\b CompleteOrthogonalDecomposition. Like the SVD, it robustly computes the minimum-norm least squares
solution, correctly handling rank-deficient and under-determined problems, but it is significantly faster
since it is based on a rank-revealing QR decomposition rather than a full SVD.

Here is an example:
If you also need the singular values or vectors themselves (not just the least squares solution), use
\b BDCSVD, which scales well for large problems and automatically falls back to JacobiSVD for smaller ones.

Here is an example using the SVD:
<table class="example">
<tr><th>Example:</th><th>Output:</th></tr>
<tr>
@@ -167,11 +173,9 @@ Here is an example:
</tr>
</table>

An alternative to the SVD, which is usually faster and about as accurate, is CompleteOrthogonalDecomposition.

Again, if you know more about the problem, the table above contains methods that are potentially faster.
If your matrix is full rank, HouseHolderQR is the method of choice. If your matrix is full rank and well conditioned,
using the Cholesky decomposition (LLT) on the matrix of the normal equations can be faster still.
If you know more about the problem, faster methods are available.
If your matrix is full rank, HouseholderQR is the fastest method. If your matrix is full rank and
well conditioned, using the Cholesky decomposition (LLT) on the normal equations can be faster still.
Our page on \link LeastSquares least squares solving \endlink has more details.
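For the full-rank, well-conditioned case just mentioned, the normal-equations route is short. A
minimal sketch; note that forming A^T A squares the condition number, which is why this is the
least robust option:

#include <Eigen/Dense>
#include <iostream>

int main() {
  using Eigen::MatrixXd;
  using Eigen::VectorXd;
  MatrixXd A = MatrixXd::Random(100, 4);  // tall and skinny, generically full rank
  VectorXd b = VectorXd::Random(100);
  // Solve A^T A x = A^T b with a Cholesky factorization.
  VectorXd x = (A.transpose() * A).llt().solve(A.transpose() * b);
  std::cout << "least squares solution:\n" << x << "\n";
  return 0;
}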

@@ -267,8 +271,9 @@ singular matrix). On \ref TopicLinearAlgebraDecompositions "this table" you can
whether they are rank-revealing or not.

Rank-revealing decompositions offer at least a rank() method. They can also offer convenience methods such as isInvertible(),
and some are also providing methods to compute the kernel (null-space) and image (column-space) of the matrix, as is the
case with FullPivLU:
and some are also providing methods to compute the kernel (null-space) and image (column-space) of the matrix.
ColPivHouseholderQR, CompleteOrthogonalDecomposition, and FullPivLU all provide these methods. Here is an example using
FullPivLU:

<table class="example">
<tr><th>Example:</th><th>Output:</th></tr>

3
doc/snippets/LeastSquaresCOD.cpp
Normal file
@@ -0,0 +1,3 @@
MatrixXf A = MatrixXf::Random(3, 2);
VectorXf b = VectorXf::Random(3);
cout << "The solution using the COD is:\n" << A.completeOrthogonalDecomposition().solve(b) << endl;
@@ -433,7 +433,7 @@ if(EIGEN_TEST_CUDA_NVC AND NOT CMAKE_CXX_COMPILER_ID MATCHES "NVHPC")
  message(WARNING "EIGEN_TEST_CUDA_NVC is set, but CMAKE_CXX_COMPILER does not appear to be nvc++.")
endif()

find_package(CUDA 9.0)
find_package(CUDA 11.4)
if(CUDA_FOUND AND EIGEN_TEST_CUDA)
  # Make sure to compile without the -pedantic, -Wundef, -Wnon-virtual-dtor
  # and -fno-check-new flags since they trigger thousands of compilation warnings
@@ -479,6 +479,153 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA)

  ei_add_test(gpu_example)
  ei_add_test(gpu_basic)
  ei_add_test(gpu_library_example "" "CUDA::cusolver")

  # DeviceMatrix tests: only CUDA runtime, no NVIDIA libraries.
  unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
  add_executable(gpu_device_matrix gpu_device_matrix.cpp)
  target_include_directories(gpu_device_matrix PRIVATE
    "${CUDA_TOOLKIT_ROOT_DIR}/include"
    "${CMAKE_CURRENT_BINARY_DIR}")
  target_link_libraries(gpu_device_matrix Eigen3::Eigen CUDA::cudart)
  target_compile_definitions(gpu_device_matrix PRIVATE
    EIGEN_TEST_MAX_SIZE=${EIGEN_TEST_MAX_SIZE}
    EIGEN_TEST_PART_ALL=1)
  add_test(NAME gpu_device_matrix COMMAND gpu_device_matrix)
  add_dependencies(buildtests gpu_device_matrix)
  add_dependencies(buildtests_gpu gpu_device_matrix)
  set_property(TEST gpu_device_matrix APPEND PROPERTY LABELS "Official;gpu")
  set_property(TEST gpu_device_matrix PROPERTY SKIP_RETURN_CODE 77)
  set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")

  # Library-specific GPU tests (activated by later phases, OFF by default).
  # CUDAToolkit imported targets (CUDA::cublas, etc.) are available from
  # find_package(CUDAToolkit) above.
  option(EIGEN_TEST_CUBLAS "Test cuBLAS integration" OFF)
  if(EIGEN_TEST_CUBLAS AND TARGET CUDA::cublas)
    # cuBLAS tests are plain .cpp files (no device code), like cuSOLVER tests.
    unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
    add_executable(gpu_cublas gpu_cublas.cpp)
    target_include_directories(gpu_cublas PRIVATE
      "${CUDA_TOOLKIT_ROOT_DIR}/include"
      "${CMAKE_CURRENT_BINARY_DIR}")
    target_link_libraries(gpu_cublas
      Eigen3::Eigen CUDA::cudart CUDA::cublas CUDA::cusolver)
    target_compile_definitions(gpu_cublas PRIVATE
      EIGEN_TEST_MAX_SIZE=${EIGEN_TEST_MAX_SIZE}
      EIGEN_TEST_PART_ALL=1)
    add_test(NAME gpu_cublas COMMAND gpu_cublas)
    add_dependencies(buildtests gpu_cublas)
    add_dependencies(buildtests_gpu gpu_cublas)
    set_property(TEST gpu_cublas APPEND PROPERTY LABELS "Official;gpu")
    set_property(TEST gpu_cublas PROPERTY SKIP_RETURN_CODE 77)
    set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
  endif()

  option(EIGEN_TEST_CUSOLVER "Test cuSOLVER integration" OFF)
  if(EIGEN_TEST_CUSOLVER AND TARGET CUDA::cusolver)
    # cuSOLVER tests are plain .cpp files: no device code, compiled by the host
    # compiler and linked against CUDA runtime + cuSOLVER. This avoids NVCC
    # instantiating Eigen's CPU packet operations for CUDA vector types.
    unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
    foreach(_cusolver_test IN ITEMS gpu_cusolver_llt gpu_cusolver_lu gpu_cusolver_qr gpu_cusolver_svd gpu_cusolver_eigen)
      add_executable(${_cusolver_test} ${_cusolver_test}.cpp)
      target_include_directories(${_cusolver_test} PRIVATE
        "${CUDA_TOOLKIT_ROOT_DIR}/include"
        "${CMAKE_CURRENT_BINARY_DIR}")
      target_link_libraries(${_cusolver_test}
        Eigen3::Eigen CUDA::cudart CUDA::cusolver CUDA::cublas)
      target_compile_definitions(${_cusolver_test} PRIVATE
        EIGEN_TEST_MAX_SIZE=${EIGEN_TEST_MAX_SIZE}
        EIGEN_TEST_PART_ALL=1)
      add_test(NAME ${_cusolver_test} COMMAND "${_cusolver_test}")
      add_dependencies(buildtests ${_cusolver_test})
      add_dependencies(buildtests_gpu ${_cusolver_test})
      set_property(TEST ${_cusolver_test} APPEND PROPERTY LABELS "Official;gpu")
      set_property(TEST ${_cusolver_test} PROPERTY SKIP_RETURN_CODE 77)
    endforeach()
    set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
  endif()

  # cuFFT test (cuFFT is part of the CUDA toolkit — no separate option needed).
  if(TARGET CUDA::cufft)
    unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
    add_executable(gpu_cufft gpu_cufft.cpp)
    target_include_directories(gpu_cufft PRIVATE
      "${CUDA_TOOLKIT_ROOT_DIR}/include"
      "${CMAKE_CURRENT_BINARY_DIR}")
    target_link_libraries(gpu_cufft
      Eigen3::Eigen CUDA::cudart CUDA::cufft CUDA::cublas)
    target_compile_definitions(gpu_cufft PRIVATE
      EIGEN_TEST_MAX_SIZE=${EIGEN_TEST_MAX_SIZE}
      EIGEN_TEST_PART_ALL=1)
    add_test(NAME gpu_cufft COMMAND gpu_cufft)
    add_dependencies(buildtests gpu_cufft)
    add_dependencies(buildtests_gpu gpu_cufft)
    set_property(TEST gpu_cufft APPEND PROPERTY LABELS "Official;gpu")
    set_property(TEST gpu_cufft PROPERTY SKIP_RETURN_CODE 77)
    set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
  endif()

  # cuSPARSE SpMV test (cuSPARSE is part of the CUDA toolkit).
  if(TARGET CUDA::cusparse)
    unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
    add_executable(gpu_cusparse_spmv gpu_cusparse_spmv.cpp)
    target_include_directories(gpu_cusparse_spmv PRIVATE
      "${CUDA_TOOLKIT_ROOT_DIR}/include"
      "${CMAKE_CURRENT_BINARY_DIR}")
    target_link_libraries(gpu_cusparse_spmv
      Eigen3::Eigen CUDA::cudart CUDA::cusparse)
    target_compile_definitions(gpu_cusparse_spmv PRIVATE
      EIGEN_TEST_MAX_SIZE=${EIGEN_TEST_MAX_SIZE}
      EIGEN_TEST_PART_ALL=1)
    add_test(NAME gpu_cusparse_spmv COMMAND gpu_cusparse_spmv)
    add_dependencies(buildtests gpu_cusparse_spmv)
    add_dependencies(buildtests_gpu gpu_cusparse_spmv)
    set_property(TEST gpu_cusparse_spmv APPEND PROPERTY LABELS "Official;gpu")
    set_property(TEST gpu_cusparse_spmv PROPERTY SKIP_RETURN_CODE 77)
    set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
  endif()

  option(EIGEN_TEST_CUSPARSE "Test cuSPARSE integration" OFF)
  if(EIGEN_TEST_CUSPARSE AND TARGET CUDA::cusparse)
    ei_add_test(gpu_cusparse "" "CUDA::cusparse")
  endif()

  # cuDSS sparse direct solver tests.
  # cuDSS is distributed separately from the CUDA Toolkit.
  option(EIGEN_TEST_CUDSS "Test cuDSS sparse solver integration" OFF)
  if(EIGEN_TEST_CUDSS)
    find_path(CUDSS_INCLUDE_DIR cudss.h
      HINTS ${CUDSS_DIR}/include ${CUDA_TOOLKIT_ROOT_DIR}/include /usr/include)
    find_library(CUDSS_LIBRARY cudss
      HINTS ${CUDSS_DIR}/lib ${CUDSS_DIR}/lib64 ${CUDA_TOOLKIT_ROOT_DIR}/lib64 /usr/lib/x86_64-linux-gnu)
    if(CUDSS_INCLUDE_DIR AND CUDSS_LIBRARY)
      message(STATUS "cuDSS found: ${CUDSS_LIBRARY}")
      unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
      foreach(_cudss_test IN ITEMS gpu_cudss_llt gpu_cudss_ldlt gpu_cudss_lu)
        add_executable(${_cudss_test} ${_cudss_test}.cpp)
        target_include_directories(${_cudss_test} PRIVATE
          "${CUDA_TOOLKIT_ROOT_DIR}/include"
          "${CUDSS_INCLUDE_DIR}"
          "${CMAKE_CURRENT_BINARY_DIR}")
        target_link_libraries(${_cudss_test}
          Eigen3::Eigen CUDA::cudart CUDA::cusolver CUDA::cublas ${CUDSS_LIBRARY})
        target_compile_definitions(${_cudss_test} PRIVATE
          EIGEN_TEST_MAX_SIZE=${EIGEN_TEST_MAX_SIZE}
          EIGEN_TEST_PART_ALL=1
          EIGEN_CUDSS=1)
        add_test(NAME ${_cudss_test} COMMAND "${_cudss_test}")
        add_dependencies(buildtests ${_cudss_test})
        add_dependencies(buildtests_gpu ${_cudss_test})
        set_property(TEST ${_cudss_test} APPEND PROPERTY LABELS "Official;gpu")
        set_property(TEST ${_cudss_test} PROPERTY SKIP_RETURN_CODE 77)
      endforeach()
      set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
    else()
      message(WARNING "EIGEN_TEST_CUDSS=ON but cuDSS not found. Set CUDSS_DIR.")
    endif()
  endif()

  unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)

@@ -502,6 +649,9 @@ if (EIGEN_TEST_HIP)
  endif()

  find_package(HIP REQUIRED)
  if (HIP_FOUND AND HIP_VERSION VERSION_LESS "5.6")
    message(FATAL_ERROR "Eigen requires ROCm/HIP >= 5.6, found ${HIP_VERSION}")
  endif()
  if (HIP_FOUND)
    execute_process(COMMAND ${HIP_PATH}/bin/hipconfig --platform OUTPUT_VARIABLE HIP_PLATFORM)

155
test/bdcsvd.cpp
@@ -15,6 +15,7 @@
|
||||
#define EIGEN_RUNTIME_NO_MALLOC
|
||||
|
||||
#include "main.h"
|
||||
#include "tridiag_test_matrices.h"
|
||||
#include <Eigen/SVD>
|
||||
|
||||
#define SVD_DEFAULT(M) BDCSVD<M>
|
||||
@@ -146,148 +147,26 @@ void verify_bidiagonal_vs_matrix_svd(const Matrix<RealScalar, Dynamic, 1>& diag,
|
||||
|
||||
template <typename RealScalar>
|
||||
void bdcsvd_bidiagonal_hard_cases() {
|
||||
using std::abs;
|
||||
using std::cos;
|
||||
using std::pow;
|
||||
using std::sin;
|
||||
typedef Matrix<RealScalar, Dynamic, 1> VectorXr;
|
||||
|
||||
Eigen::internal::set_is_malloc_allowed(true);
|
||||
|
||||
const RealScalar eps = NumTraits<RealScalar>::epsilon();
|
||||
// Use the shared tridiagonal test matrix generators.
|
||||
// Each generator fills (diag, offdiag) which we treat as (diagonal, superdiagonal)
|
||||
// of a bidiagonal matrix.
|
||||
test::for_all_tridiag_test_matrices<RealScalar>(
|
||||
[](const auto& diag, const auto& offdiag) { verify_bidiagonal_svd<RealScalar>(diag, offdiag); });
|
||||
|
||||
// Test sizes: cover n=1, very small, below/above algoSwap (16), and larger.
|
||||
const int sizes[] = {1, 2, 3, 5, 10, 16, 20, 50, 100};
|
||||
const int numSizes = sizeof(sizes) / sizeof(sizes[0]);
|
||||
// Additional SVD-specific test: identity with cross-validation against full matrix SVD.
|
||||
test::for_tridiag_sizes<RealScalar>([](auto& diag, auto& offdiag) {
|
||||
test::tridiag_identity(diag, offdiag);
|
||||
verify_bidiagonal_vs_matrix_svd<RealScalar>(diag, offdiag);
|
||||
});
|
||||
|
||||
for (int si = 0; si < numSizes; ++si) {
|
||||
const Index n = sizes[si];
|
||||
VectorXr diag(n), superdiag(n > 1 ? n - 1 : 0);
|
||||
|
||||
// 1. Identity: d=[1,...,1], e=[0,...,0]
|
||||
diag.setOnes();
|
||||
superdiag.setZero();
|
||||
verify_bidiagonal_svd<RealScalar>(diag, superdiag);
|
||||
verify_bidiagonal_vs_matrix_svd<RealScalar>(diag, superdiag);
|
||||
|
||||
// 2. Zero: d=[0,...,0], e=[0,...,0]
|
||||
diag.setZero();
|
||||
superdiag.setZero();
|
||||
verify_bidiagonal_svd<RealScalar>(diag, superdiag);
|
||||
|
||||
// 3. Scalar (only meaningful for n=1, but runs for all)
|
||||
if (n == 1) {
|
||||
diag(0) = RealScalar(3.14);
|
||||
verify_bidiagonal_svd<RealScalar>(diag, superdiag);
|
||||
}
|
||||
|
||||
// 4. Golub-Kahan: d=[1,...,1], e=[1,...,1]
|
||||
diag.setOnes();
|
||||
if (n > 1) superdiag.setOnes();
|
||||
verify_bidiagonal_svd<RealScalar>(diag, superdiag);
|
||||
|
||||
// 5. Kahan matrix: d_i = s^(i-1), e_i = -c*s^(i-1)
|
||||
// Clamp exponents so condition number stays bounded by 1/eps.
|
||||
{
|
||||
const RealScalar theta = RealScalar(0.3);
|
||||
const RealScalar s = sin(theta);
|
||||
const RealScalar c = cos(theta);
|
||||
using std::log;
|
||||
const RealScalar maxPower = -log(eps) / (-log(s));
|
||||
    for (Index i = 0; i < n; ++i) diag(i) = pow(s, numext::mini(RealScalar(i), maxPower));
    for (Index i = 0; i < n - 1; ++i) superdiag(i) = -c * pow(s, numext::mini(RealScalar(i), maxPower));
    verify_bidiagonal_svd<RealScalar>(diag, superdiag);
  }

  // 6. Geometric decay diagonal: d_i = 0.5^i, e=[0,...,0]
  // Clamp so the condition number stays bounded by 1/eps.
  {
    using std::log;
    const RealScalar base = RealScalar(0.5);
    const RealScalar maxPower = -log(eps) / (-log(base));
    for (Index i = 0; i < n; ++i) diag(i) = pow(base, numext::mini(RealScalar(i), maxPower));
    superdiag.setZero();
    verify_bidiagonal_svd<RealScalar>(diag, superdiag);
  }

  // 7. Geometric decay superdiagonal: d=[1,...,1], e_i = 0.5^i
  diag.setOnes();
  for (Index i = 0; i < n - 1; ++i) superdiag(i) = pow(RealScalar(0.5), RealScalar(i));
  verify_bidiagonal_svd<RealScalar>(diag, superdiag);

  // 8. Clustered at 1: d_i = 1 + i*eps, e=[0,...,0]
  for (Index i = 0; i < n; ++i) diag(i) = RealScalar(1) + RealScalar(i) * eps;
  superdiag.setZero();
  verify_bidiagonal_svd<RealScalar>(diag, superdiag);

  // 9. Two clusters: half ≈ 1, half ≈ eps
  for (Index i = 0; i < n; ++i) diag(i) = (i < n / 2) ? RealScalar(1) : eps;
  superdiag.setZero();
  verify_bidiagonal_svd<RealScalar>(diag, superdiag);

  // 10. Single tiny singular value: d=[1,...,1,eps], e=[eps^2,...]
  diag.setOnes();
  diag(n - 1) = eps;
  for (Index i = 0; i < n - 1; ++i) superdiag(i) = eps * eps;
  verify_bidiagonal_svd<RealScalar>(diag, superdiag);

  // 11. Graded: d_i = 10^(-i), e_i = 10^(-i)
  for (Index i = 0; i < n; ++i) diag(i) = pow(RealScalar(10), -RealScalar(i));
  for (Index i = 0; i < n - 1; ++i) superdiag(i) = pow(RealScalar(10), -RealScalar(i));
  verify_bidiagonal_svd<RealScalar>(diag, superdiag);

  // 12. Nearly diagonal: random diag, eps * random superdiag
  diag = VectorXr::Random(n).cwiseAbs() + VectorXr::Constant(n, RealScalar(0.1));
  for (Index i = 0; i < n - 1; ++i) superdiag(i) = eps * (RealScalar(0.5) + abs(internal::random<RealScalar>()));
  verify_bidiagonal_svd<RealScalar>(diag, superdiag);

  // 13. All equal: d=[c,...,c], e=[c,...,c]
  diag.setConstant(RealScalar(2.5));
  if (n > 1) superdiag.setConstant(RealScalar(2.5));
  verify_bidiagonal_svd<RealScalar>(diag, superdiag);

  // 14. Wilkinson: d_i = |n/2 - i|, e=[1,...,1]
  for (Index i = 0; i < n; ++i) diag(i) = abs(RealScalar(n / 2) - RealScalar(i));
  if (n > 1) superdiag.setOnes();
  verify_bidiagonal_svd<RealScalar>(diag, superdiag);

  // 15. Overflow/underflow: alternating big/tiny diagonal, tiny/big superdiagonal
  {
    const RealScalar big = (std::numeric_limits<RealScalar>::max)() / RealScalar(1000);
    const RealScalar tiny = (std::numeric_limits<RealScalar>::min)() * RealScalar(1000);
    for (Index i = 0; i < n; ++i) diag(i) = (i % 2 == 0) ? big : tiny;
    for (Index i = 0; i < n - 1; ++i) superdiag(i) = (i % 2 == 0) ? tiny : big;
    verify_bidiagonal_svd<RealScalar>(diag, superdiag);
  }

  // 16. Prescribed condition number: d_i = kappa^(-i/(n-1)), e_i = eps * random
  if (n > 1) {
    const RealScalar kappa = RealScalar(1) / eps;
    for (Index i = 0; i < n; ++i) diag(i) = pow(kappa, -RealScalar(i) / RealScalar(n - 1));
    for (Index i = 0; i < n - 1; ++i) superdiag(i) = eps * abs(internal::random<RealScalar>());
    verify_bidiagonal_svd<RealScalar>(diag, superdiag);
  }

  // 17. Rank-deficient: d=[1,..,0,..,0,..,1], e=[0,...,0]
  for (Index i = 0; i < n; ++i) diag(i) = (i < n / 3 || i >= 2 * n / 3) ? RealScalar(1) : RealScalar(0);
  superdiag.setZero();
  verify_bidiagonal_svd<RealScalar>(diag, superdiag);

  // 18. Arrowhead stress: d_i = linspace(1, n), e_i = 1/(i+1)
  for (Index i = 0; i < n; ++i) diag(i) = RealScalar(1) + RealScalar(i);
  for (Index i = 0; i < n - 1; ++i) superdiag(i) = RealScalar(1) / RealScalar(i + 1);
  verify_bidiagonal_svd<RealScalar>(diag, superdiag);

  // 19. Repeated singular values: d=[1,2,3,1,2,3,...], e=[0,...,0]
  for (Index i = 0; i < n; ++i) diag(i) = RealScalar((i % 3) + 1);
  superdiag.setZero();
  verify_bidiagonal_svd<RealScalar>(diag, superdiag);

  // 20. Glued identity: d=[1,...,1], e=0 except e[n/2-1]=eps
  diag.setOnes();
  superdiag.setZero();
  if (n > 2) superdiag(n / 2 - 1) = eps;
  verify_bidiagonal_svd<RealScalar>(diag, superdiag);

  // Additional SVD-specific test: scalar for n=1.
  {
    typedef Matrix<RealScalar, Dynamic, 1> VectorXr;
    VectorXr diag(1), offdiag(0);
    diag(0) = RealScalar(3.14);
    verify_bidiagonal_svd<RealScalar>(diag, offdiag);
  }
}
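For orientation, a minimal sketch of the contract these twenty cases exercise. verify_bidiagonal_svd itself is defined elsewhere in the test suite, so the dense cross-check below is an illustrative assumption about its behavior, not the actual helper:

// Hypothetical sketch of the checker used above: build the upper-bidiagonal
// matrix from (diag, superdiag) and compare against a dense reference SVD.
template <typename RealScalar>
void verify_bidiagonal_svd_sketch(const Matrix<RealScalar, Dynamic, 1>& diag,
                                  const Matrix<RealScalar, Dynamic, 1>& superdiag) {
  typedef Matrix<RealScalar, Dynamic, Dynamic> MatrixXr;
  const Index n = diag.size();
  MatrixXr B = MatrixXr::Zero(n, n);
  B.diagonal() = diag;
  if (n > 1) B.template diagonal<1>() = superdiag;
  // Dense Jacobi SVD as the accuracy reference for the bidiagonal path under test.
  JacobiSVD<MatrixXr> ref(B);
  VERIFY(ref.singularValues().allFinite());
}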
@@ -10,6 +10,7 @@

#include "main.h"
#include "svd_fill.h"
#include "tridiag_test_matrices.h"
#include <limits>
#include <Eigen/Eigenvalues>
#include <Eigen/SparseCore>
@@ -31,8 +32,19 @@ void selfadjointeigensolver_essential_check(const MatrixType& m) {
  if (scaling < (std::numeric_limits<RealScalar>::min)()) {
    VERIFY(eiSymm.eigenvalues().cwiseAbs().maxCoeff() <= (std::numeric_limits<RealScalar>::min)());
  } else {
    VERIFY_IS_APPROX((m.template selfadjointView<Lower>() * eiSymm.eigenvectors()) / scaling,
                     (eiSymm.eigenvectors() * eiSymm.eigenvalues().asDiagonal()) / scaling);
    // Columnwise residual check: for each eigenpair (lambda_i, v_i),
    //   ||A*v_i - lambda_i*v_i|| / ||A||_max <= c * n * eps
    // This ensures accuracy for every eigenpair, including those corresponding
    // to small eigenvalues (which a Frobenius norm check would miss).
    // Computed in scaled space (dividing by ||A||_max) to avoid overflow.
    MatrixType scaledA = m.template selfadjointView<Lower>();
    scaledA /= scaling;
    MatrixType residual =
        scaledA * eiSymm.eigenvectors() - eiSymm.eigenvectors() * (eiSymm.eigenvalues() / scaling).asDiagonal();
    RealScalar tol = RealScalar(4) * RealScalar(numext::maxi(Index(1), n)) * NumTraits<RealScalar>::epsilon();
    for (Index i = 0; i < n; ++i) {
      VERIFY(residual.col(i).norm() <= tol);
    }
  }
  VERIFY_IS_APPROX(m.template selfadjointView<Lower>().eigenvalues(), eiSymm.eigenvalues());
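A note on the distinction this check draws, as our gloss rather than text from the patch:

// The global VERIFY_IS_APPROX above compares whole matrices at the loose
// test_precision<RealScalar>() threshold, so one inaccurate eigenpair column
// can hide inside an otherwise accurate product. The per-column loop enforces
// the much tighter O(n * eps) bound on every eigenpair individually.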
@@ -64,6 +76,8 @@ void selfadjointeigensolver_essential_check(const MatrixType& m) {
    VERIFY(eiDirect.eigenvalues().cwiseAbs().maxCoeff() <= (std::numeric_limits<RealScalar>::min)());
  } else {
    VERIFY_IS_APPROX(eiSymm.eigenvalues() / scaling, eiDirect.eigenvalues() / scaling);
    // TODO: the direct 3x3 solver can produce large backward errors (>> n*eps*||A||)
    // on some matrices. Investigate and fix, then tighten this to a Frobenius norm check.
    VERIFY_IS_APPROX((m.template selfadjointView<Lower>() * eiDirect.eigenvectors()) / scaling,
                     (eiDirect.eigenvectors() * eiDirect.eigenvalues().asDiagonal()) / scaling);
    VERIFY_IS_APPROX(m.template selfadjointView<Lower>().eigenvalues() / scaling, eiDirect.eigenvalues() / scaling);
@@ -408,6 +422,137 @@ void selfadjointeigensolver_tridiagonal_scaled(const MatrixType& m) {
  VERIFY_IS_APPROX(eig2.eigenvalues(), eig2v.eigenvalues());
}

// Test computeFromTridiagonal with wide dynamic range across decoupled blocks.
// This exercises the per-block scaling in computeFromTridiagonal_impl: a zero on the
// subdiagonal decouples the matrix into blocks with vastly different scales. Global
// scaling would leave the small block with no relative accuracy; per-block scaling
// handles both correctly.
template <typename RealScalar>
void selfadjointeigensolver_tridiagonal_wide_range() {
  using std::sqrt;
  typedef Matrix<RealScalar, Dynamic, Dynamic> MatrixType;
  typedef Matrix<RealScalar, Dynamic, 1> VectorType;

  // Block 1: entries near the overflow threshold.
  // Block 2: entries near 1.
  // Separated by a zero subdiagonal entry.
  const RealScalar big = sqrt(NumTraits<RealScalar>::highest()) / RealScalar(10);
  const Index n = 6;
  VectorType diag(n), subdiag(n - 1);

  // First block: [0..2], large scale.
  diag(0) = big;
  diag(1) = big * RealScalar(1.1);
  diag(2) = big * RealScalar(0.9);
  subdiag(0) = big * RealScalar(0.01);
  subdiag(1) = big * RealScalar(0.02);
  // Zero subdiagonal decouples the two blocks.
  subdiag(2) = RealScalar(0);
  // Second block: [3..5], O(1) scale.
  diag(3) = RealScalar(1);
  diag(4) = RealScalar(2);
  diag(5) = RealScalar(3);
  subdiag(3) = RealScalar(0.5);
  subdiag(4) = RealScalar(0.3);

  // Build the full tridiagonal matrix for residual checking.
  MatrixType T = MatrixType::Zero(n, n);
  T.diagonal() = diag;
  T.template diagonal<1>() = subdiag;
  T.template diagonal<-1>() = subdiag;

  SelfAdjointEigenSolver<MatrixType> eig;
  eig.computeFromTridiagonal(diag, subdiag, ComputeEigenvectors);
  VERIFY_IS_EQUAL(eig.info(), Success);

  // Eigenvalues must be sorted.
  for (Index i = 1; i < n; ++i) {
    VERIFY(eig.eigenvalues()(i) >= eig.eigenvalues()(i - 1));
  }

  // Eigenvectors must be orthonormal.
  RealScalar unitary_tol = RealScalar(4) * RealScalar(n) * NumTraits<RealScalar>::epsilon();
  VERIFY(eig.eigenvectors().isUnitary(unitary_tol));

  // Full residual check in scaled coordinates.
  RealScalar Tnorm = T.cwiseAbs().maxCoeff();
  MatrixType Tscaled = T / Tnorm;
  MatrixType residual = Tscaled * eig.eigenvectors() - eig.eigenvectors() * (eig.eigenvalues() / Tnorm).asDiagonal();
  RealScalar rel_err = residual.norm() / Tscaled.norm();
  VERIFY(rel_err <= RealScalar(8) * RealScalar(n) * NumTraits<RealScalar>::epsilon());

  // The small eigenvalues (~1,2,3) must be accurate, not drowned out by the large block.
  // With a single global scaling to [-1,1], dividing by 'big' would push them far
  // below eps, so they would be computed with no relative accuracy.
  // Verify the small eigenvalues are within O(eps) of their true values.
  // The small block is exactly [[1, 0.5, 0], [0.5, 2, 0.3], [0, 0.3, 3]].
  MatrixType T_small(3, 3);
  T_small << RealScalar(1), RealScalar(0.5), RealScalar(0), RealScalar(0.5), RealScalar(2), RealScalar(0.3),
      RealScalar(0), RealScalar(0.3), RealScalar(3);
  SelfAdjointEigenSolver<MatrixType> eig_small(T_small);
  VectorType small_evals = eig_small.eigenvalues();

  // Find the 3 smallest eigenvalues from the combined solver (they should be sorted first).
  VectorType combined_small = eig.eigenvalues().head(3);
  VERIFY_IS_APPROX(combined_small, small_evals);

  // Eigenvalues-only mode must agree.
  SelfAdjointEigenSolver<MatrixType> eig_vals;
  eig_vals.computeFromTridiagonal(diag, subdiag, EigenvaluesOnly);
  VERIFY_IS_EQUAL(eig_vals.info(), Success);
  VERIFY_IS_APPROX(eig.eigenvalues() / Tnorm, eig_vals.eigenvalues() / Tnorm);
}
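To make the scale gap concrete, some back-of-envelope numbers (ours, not from the test):

// For double, big = sqrt(1.8e308) / 10 ~ 1.3e153. Under a single global scaling,
// the O(1) block's eigenvalues become ~2e-153: still representable, but about
// 137 orders of magnitude below eps ~ 2.2e-16, i.e. zero relative accuracy.
// Per-block scaling normalizes each decoupled block independently and avoids this.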
// Test computeFromTridiagonal with structured hard-case matrices from the literature.
template <typename RealScalar>
void selfadjointeigensolver_structured_tridiagonal() {
  typedef Matrix<RealScalar, Dynamic, Dynamic> MatrixType;

  test::for_all_symmetric_tridiag_test_matrices<RealScalar>([](const auto& diag, const auto& offdiag) {
    Index n = diag.size();

    // Build the full symmetric tridiagonal matrix for residual checking.
    MatrixType T = MatrixType::Zero(n, n);
    T.diagonal() = diag;
    if (n > 1) {
      T.template diagonal<1>() = offdiag;
      T.template diagonal<-1>() = offdiag;
    }
    RealScalar Tnorm = T.cwiseAbs().maxCoeff();

    // Test with eigenvectors.
    SelfAdjointEigenSolver<MatrixType> eig;
    eig.computeFromTridiagonal(diag, offdiag, ComputeEigenvectors);
    VERIFY_IS_EQUAL(eig.info(), Success);

    // Eigenvalues must be sorted.
    for (Index i = 1; i < n; ++i) {
      VERIFY(eig.eigenvalues()(i) >= eig.eigenvalues()(i - 1));
    }

    // Eigenvectors must be orthonormal.
    RealScalar unitary_tol =
        numext::maxi(RealScalar(4) * RealScalar(n) * NumTraits<RealScalar>::epsilon(), test_precision<RealScalar>());
    VERIFY(eig.eigenvectors().isUnitary(unitary_tol));

    // Residual check: ||T*V - V*D||_F / ||T||_max should be O(n*eps).
    // Scale T to avoid overflow in the matrix product when entries span extreme ranges.
    if (Tnorm > (std::numeric_limits<RealScalar>::min)()) {
      MatrixType Tscaled = T / Tnorm;
      MatrixType residual =
          Tscaled * eig.eigenvectors() - eig.eigenvectors() * (eig.eigenvalues() / Tnorm).asDiagonal();
      RealScalar rel_err = residual.norm() / Tscaled.norm();
      VERIFY(rel_err <= RealScalar(8) * RealScalar(n) * NumTraits<RealScalar>::epsilon());
    }

    // Eigenvalues-only mode must produce the same eigenvalues.
    SelfAdjointEigenSolver<MatrixType> eig_vals;
    eig_vals.computeFromTridiagonal(diag, offdiag, EigenvaluesOnly);
    VERIFY_IS_EQUAL(eig_vals.info(), Success);
    if (Tnorm > (std::numeric_limits<RealScalar>::min)()) {
      VERIFY_IS_APPROX(eig.eigenvalues() / Tnorm, eig_vals.eigenvalues() / Tnorm);
    }
  });
}
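The iteration helper comes from tridiag_test_matrices.h, included at the top of this file; its definition is not part of this diff, so the following is only a sketch of the shape implied by the call site (the matrix list and names are assumptions):

namespace test {
// Hypothetical shape: invoke fn(diag, offdiag) once per structured hard case.
template <typename RealScalar, typename Fn>
void for_all_symmetric_tridiag_test_matrices_sketch(Fn&& fn) {
  typedef Matrix<RealScalar, Dynamic, 1> Vec;
  Vec d = Vec::Ones(5), e = Vec::Zero(4);
  fn(d, e);  // decoupled, identity-like case
  for (Index i = 0; i < 5; ++i) d(i) = numext::abs(RealScalar(2) - RealScalar(i));
  e.setOnes();
  fn(d, e);  // Wilkinson-type case
  // ... graded, glued, clustered cases, etc. ...
}
}  // namespace test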
// Test with diagonal matrices (tridiagonalization is trivial).
template <typename MatrixType>
void selfadjointeigensolver_diagonal(const MatrixType& m) {
@@ -564,11 +709,78 @@ void bug_1225() {
  VERIFY_IS_APPROX(eig1.eigenvalues(), eig2.eigenvalues());
}

// Verify that non-finite inputs are detected for all sizes, including 1x1.
template <int>
void selfadjointeigensolver_nonfinite() {
  const double inf = std::numeric_limits<double>::infinity();
  const double nan = std::numeric_limits<double>::quiet_NaN();

  // 1x1 Inf.
  {
    Matrix<double, 1, 1> m;
    m << inf;
    SelfAdjointEigenSolver<Matrix<double, 1, 1>> eig(m);
    VERIFY_IS_EQUAL(eig.info(), NoConvergence);
  }
  // 1x1 NaN.
  {
    Matrix<double, 1, 1> m;
    m << nan;
    SelfAdjointEigenSolver<Matrix<double, 1, 1>> eig(m);
    VERIFY_IS_EQUAL(eig.info(), NoConvergence);
  }
  // 1x1 -Inf.
  {
    Matrix<double, 1, 1> m;
    m << -inf;
    SelfAdjointEigenSolver<Matrix<double, 1, 1>> eig(m);
    VERIFY_IS_EQUAL(eig.info(), NoConvergence);
  }
  // 3x3 with Inf.
  {
    Matrix3d m = Matrix3d::Identity();
    m(1, 1) = inf;
    SelfAdjointEigenSolver<Matrix3d> eig(m);
    VERIFY_IS_EQUAL(eig.info(), NoConvergence);
  }
  // 3x3 with NaN.
  {
    Matrix3d m = Matrix3d::Identity();
    m(0, 1) = m(1, 0) = nan;
    SelfAdjointEigenSolver<Matrix3d> eig(m);
    VERIFY_IS_EQUAL(eig.info(), NoConvergence);
  }
  // Dynamic size with Inf.
  {
    MatrixXd m = MatrixXd::Identity(5, 5);
    m(3, 3) = inf;
    SelfAdjointEigenSolver<MatrixXd> eig(m);
    VERIFY_IS_EQUAL(eig.info(), NoConvergence);
  }
}
template <int>
void bug_1204() {
  SparseMatrix<double> A(2, 2);
  A.setIdentity();
  SelfAdjointEigenSolver<Eigen::SparseMatrix<double>> eig(A);
}

template <int>
void selfadjointeigensolver_tridiagonal_zerosized() {
  SelfAdjointEigenSolver<MatrixXd> eig;
  VectorXd diag(0), subdiag(0);

  eig.computeFromTridiagonal(diag, subdiag, EigenvaluesOnly);
  VERIFY_IS_EQUAL(eig.info(), Success);
  VERIFY_IS_EQUAL(eig.eigenvalues().size(), 0);
  VERIFY_RAISES_ASSERT(eig.eigenvectors());

  eig.computeFromTridiagonal(diag, subdiag, ComputeEigenvectors);
  VERIFY_IS_EQUAL(eig.info(), Success);
  VERIFY_IS_EQUAL(eig.eigenvalues().size(), 0);
  VERIFY_IS_EQUAL(eig.eigenvectors().rows(), 0);
  VERIFY_IS_EQUAL(eig.eigenvectors().cols(), 0);
}
// Specific 3x3 test cases that stress the direct solver.
@@ -706,6 +918,14 @@ EIGEN_DECLARE_TEST(eigensolver_selfadjoint) {
    CALL_SUBTEST_4(selfadjointeigensolver_tridiagonal_scaled(MatrixXd(s, s)));
    CALL_SUBTEST_3(selfadjointeigensolver_tridiagonal_scaled(MatrixXf(s, s)));

    // structured tridiagonal hard cases from the literature
    CALL_SUBTEST_4(selfadjointeigensolver_structured_tridiagonal<double>());
    CALL_SUBTEST_3(selfadjointeigensolver_structured_tridiagonal<float>());

    // wide dynamic range tridiagonal (per-block scaling regression)
    CALL_SUBTEST_4(selfadjointeigensolver_tridiagonal_wide_range<double>());
    CALL_SUBTEST_3(selfadjointeigensolver_tridiagonal_wide_range<float>());

    // diagonal matrices
    CALL_SUBTEST_17(selfadjointeigensolver_diagonal(Matrix3d()));
    CALL_SUBTEST_4(selfadjointeigensolver_diagonal(MatrixXd(s, s)));
@@ -724,6 +944,8 @@ EIGEN_DECLARE_TEST(eigensolver_selfadjoint) {
  CALL_SUBTEST_17(bug_1014<0>());
  CALL_SUBTEST_17(bug_1204<0>());
  CALL_SUBTEST_17(bug_1225<0>());
  CALL_SUBTEST_17(selfadjointeigensolver_nonfinite<0>());
  CALL_SUBTEST_8(selfadjointeigensolver_tridiagonal_zerosized<0>());

  // Stress tests for direct 3x3 and 2x2 solvers.
  CALL_SUBTEST_17(direct_3x3_stress<0>());
@@ -7,12 +7,6 @@
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

// workaround issue between gcc >= 4.7 and cuda 5.5
#if (defined __GNUC__) && (__GNUC__ > 4 || __GNUC_MINOR__ >= 7)
#undef _GLIBCXX_ATOMIC_BUILTINS
#undef _GLIBCXX_USE_INT128
#endif

#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
72
test/gpu_context.h
Normal file
@@ -0,0 +1,72 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_TEST_GPU_CONTEXT_H
#define EIGEN_TEST_GPU_CONTEXT_H

// RAII context for GPU tests that use NVIDIA library APIs (cuBLAS, cuSOLVER, etc.).
// Owns a non-default CUDA stream. Library handles (cuBLAS, cuSOLVER, etc.) are added
// here by each integration phase as needed; each handle is bound to the owned stream.
//
// Usage:
//   GpuContext ctx;
//   auto buf = gpu_copy_to_device(ctx.stream, A);
//   // ... call NVIDIA library APIs using ctx.stream / ctx.cusolver ...
//   ctx.synchronize();

#include "gpu_test_helper.h"

#ifdef EIGEN_USE_GPU
#include <cusolverDn.h>

// Checks cuSOLVER return codes, aborts on failure.
#define CUSOLVER_CHECK(expr)                                                                 \
  do {                                                                                       \
    cusolverStatus_t _status = (expr);                                                       \
    if (_status != CUSOLVER_STATUS_SUCCESS) {                                                \
      printf("cuSOLVER error %d at %s:%d\n", static_cast<int>(_status), __FILE__, __LINE__); \
      gpu_assert(false);                                                                     \
    }                                                                                        \
  } while (0)

struct GpuContext {
  cudaStream_t stream = nullptr;
  cusolverDnHandle_t cusolver = nullptr;

  GpuContext() {
    GPU_CHECK(gpuGetDevice(&device_));
    GPU_CHECK(gpuGetDeviceProperties(&device_props_, device_));
    GPU_CHECK(cudaStreamCreate(&stream));
    CUSOLVER_CHECK(cusolverDnCreate(&cusolver));
    CUSOLVER_CHECK(cusolverDnSetStream(cusolver, stream));
  }

  ~GpuContext() {
    if (cusolver) CUSOLVER_CHECK(cusolverDnDestroy(cusolver));
    if (stream) GPU_CHECK(cudaStreamDestroy(stream));
  }

  int device() const { return device_; }
  const gpuDeviceProp_t& deviceProperties() const { return device_props_; }

  // Wait for all work submitted on this context's stream to complete.
  void synchronize() { GPU_CHECK(cudaStreamSynchronize(stream)); }

  // Non-copyable, non-movable.
  GpuContext(const GpuContext&) = delete;
  GpuContext& operator=(const GpuContext&) = delete;

 private:
  int device_ = 0;
  gpuDeviceProp_t device_props_;
};

#endif  // EIGEN_USE_GPU

#endif  // EIGEN_TEST_GPU_CONTEXT_H
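As an illustration beyond the usage comment in the header: the workspace query below is a standard cuSOLVER dense-Cholesky call, but pairing it with GpuContext this way is our sketch, not code from this patch (d_A is a hypothetical device pointer):

// GpuContext ctx;
// int n = 64, lda = 64, lwork = 0;
// double* d_A = /* device allocation of n*n doubles, made elsewhere */;
// CUSOLVER_CHECK(cusolverDnDpotrf_bufferSize(ctx.cusolver, CUBLAS_FILL_MODE_LOWER, n, d_A, lda, &lwork));
// ctx.synchronize();  // drain the stream before inspecting results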
756
test/gpu_cublas.cpp
Normal file
@@ -0,0 +1,756 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

// Tests for cuBLAS GEMM dispatch via DeviceMatrix expression syntax.
// Covers: d_C = d_A * d_B, adjoint, transpose, scaled, +=, .device(ctx).

#define EIGEN_USE_GPU
#include "main.h"
#include <Eigen/GPU>

using namespace Eigen;

// Unit roundoff for GPU GEMM compute precision.
// TF32 (opt-in via EIGEN_CUDA_TF32) has eps ~ 2^{-10}.
template <typename Scalar>
typename NumTraits<Scalar>::Real gpu_unit_roundoff() {
#if defined(EIGEN_CUDA_TF32) && !defined(EIGEN_NO_CUDA_TENSOR_OPS)
  using RealScalar = typename NumTraits<Scalar>::Real;
  if (std::is_same<RealScalar, float>::value) return RealScalar(9.8e-4);
#endif
  return NumTraits<Scalar>::epsilon();
}

// Higham-Mary probabilistic error bound for GEMM:
//   ||C - fl(C)||_F <= lambda * sqrt(k) * u * ||A||_F * ||B||_F
// where k is the inner dimension, u is the unit roundoff, and
// lambda = sqrt(2 * ln(2/delta)) with delta = failure probability.
// lambda = 5 corresponds to delta = 2*exp(-25/2), roughly 7.5e-6.
// Reference: Higham & Mary, "Probabilistic Error Analysis for Inner Products",
// SIAM J. Matrix Anal. Appl., 2019.
template <typename Scalar>
typename NumTraits<Scalar>::Real gemm_error_bound(Index k, typename NumTraits<Scalar>::Real normA,
                                                  typename NumTraits<Scalar>::Real normB) {
  using RealScalar = typename NumTraits<Scalar>::Real;
  constexpr RealScalar lambda = 5;
  return lambda * std::sqrt(static_cast<RealScalar>(k)) * gpu_unit_roundoff<Scalar>() * normA * normB;
}
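Solving the comment's definition of lambda for delta makes the chosen constant concrete:

\lambda = \sqrt{2\ln(2/\delta)} \iff \delta = 2\,e^{-\lambda^{2}/2},
\qquad \lambda = 5 \;\Rightarrow\; \delta = 2\,e^{-12.5} \approx 7.5\times 10^{-6}.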
// ---- Basic GEMM: C = A * B -------------------------------------------------

template <typename Scalar>
void test_gemm_basic(Index m, Index n, Index k) {
  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
  using RealScalar = typename NumTraits<Scalar>::Real;

  Mat A = Mat::Random(m, k);
  Mat B = Mat::Random(k, n);

  auto d_A = DeviceMatrix<Scalar>::fromHost(A);
  auto d_B = DeviceMatrix<Scalar>::fromHost(B);

  // Expression: d_C = d_A * d_B
  DeviceMatrix<Scalar> d_C;
  d_C = d_A * d_B;

  Mat C = d_C.toHost();
  Mat C_ref = A * B;

  RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
  VERIFY((C - C_ref).norm() < tol);
}
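The remaining GEMM tests repeat one four-step shape, noted once here:

// Shared pattern: (1) build host reference operands, (2) upload them with
// DeviceMatrix<Scalar>::fromHost, (3) evaluate the device-side expression,
// (4) download with toHost() and compare to the CPU product under the
// Higham-Mary tolerance.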
// ---- GEMM with adjoint: C = A^H * B ----------------------------------------

template <typename Scalar>
void test_gemm_adjoint_lhs(Index m, Index n, Index k) {
  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
  using RealScalar = typename NumTraits<Scalar>::Real;

  Mat A = Mat::Random(k, m);  // A is k×m, A^H is m×k
  Mat B = Mat::Random(k, n);

  auto d_A = DeviceMatrix<Scalar>::fromHost(A);
  auto d_B = DeviceMatrix<Scalar>::fromHost(B);

  DeviceMatrix<Scalar> d_C;
  d_C = d_A.adjoint() * d_B;

  Mat C = d_C.toHost();
  Mat C_ref = A.adjoint() * B;

  RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
  VERIFY((C - C_ref).norm() < tol);
}

// ---- GEMM with transpose: C = A * B^T --------------------------------------

template <typename Scalar>
void test_gemm_transpose_rhs(Index m, Index n, Index k) {
  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
  using RealScalar = typename NumTraits<Scalar>::Real;

  Mat A = Mat::Random(m, k);
  Mat B = Mat::Random(n, k);  // B is n×k, B^T is k×n

  auto d_A = DeviceMatrix<Scalar>::fromHost(A);
  auto d_B = DeviceMatrix<Scalar>::fromHost(B);

  DeviceMatrix<Scalar> d_C;
  d_C = d_A * d_B.transpose();

  Mat C = d_C.toHost();
  Mat C_ref = A * B.transpose();

  RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
  VERIFY((C - C_ref).norm() < tol);
}

// ---- GEMM with scaling: C = alpha * A * B -----------------------------------

template <typename Scalar>
void test_gemm_scaled(Index m, Index n, Index k) {
  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
  using RealScalar = typename NumTraits<Scalar>::Real;

  Mat A = Mat::Random(m, k);
  Mat B = Mat::Random(k, n);
  Scalar alpha = Scalar(2.5);

  auto d_A = DeviceMatrix<Scalar>::fromHost(A);
  auto d_B = DeviceMatrix<Scalar>::fromHost(B);

  DeviceMatrix<Scalar> d_C;
  d_C = alpha * d_A * d_B;

  Mat C = d_C.toHost();
  Mat C_ref = alpha * A * B;

  RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
  VERIFY((C - C_ref).norm() < tol);
}

// ---- GEMM accumulate: C += A * B (beta=1) -----------------------------------

template <typename Scalar>
void test_gemm_accumulate(Index m, Index n, Index k) {
  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
  using RealScalar = typename NumTraits<Scalar>::Real;

  Mat A = Mat::Random(m, k);
  Mat B = Mat::Random(k, n);
  Mat C_init = Mat::Random(m, n);

  auto d_A = DeviceMatrix<Scalar>::fromHost(A);
  auto d_B = DeviceMatrix<Scalar>::fromHost(B);
  auto d_C = DeviceMatrix<Scalar>::fromHost(C_init);

  d_C += d_A * d_B;

  Mat C = d_C.toHost();
  Mat C_ref = C_init + A * B;

  RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
  VERIFY((C - C_ref).norm() < tol);
}

// ---- GEMM accumulate into empty destination ---------------------------------

template <typename Scalar>
void test_gemm_accumulate_empty(Index m, Index n, Index k) {
  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
  using RealScalar = typename NumTraits<Scalar>::Real;

  Mat A = Mat::Random(m, k);
  Mat B = Mat::Random(k, n);

  auto d_A = DeviceMatrix<Scalar>::fromHost(A);
  auto d_B = DeviceMatrix<Scalar>::fromHost(B);
  DeviceMatrix<Scalar> d_C;

  d_C += d_A * d_B;

  Mat C = d_C.toHost();
  Mat C_ref = A * B;

  RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
  VERIFY((C - C_ref).norm() < tol);
}

// ---- GEMM subtract: C -= A * B (beta=1, alpha=-1) --------------------------

template <typename Scalar>
void test_gemm_subtract(Index m, Index n, Index k) {
  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
  using RealScalar = typename NumTraits<Scalar>::Real;

  Mat A = Mat::Random(m, k);
  Mat B = Mat::Random(k, n);
  Mat C_init = Mat::Random(m, n);

  auto d_A = DeviceMatrix<Scalar>::fromHost(A);
  auto d_B = DeviceMatrix<Scalar>::fromHost(B);
  auto d_C = DeviceMatrix<Scalar>::fromHost(C_init);

  GpuContext ctx;
  d_C.device(ctx) -= d_A * d_B;

  Mat C = d_C.toHost();
  Mat C_ref = C_init - A * B;

  RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
  VERIFY((C - C_ref).norm() < tol);
}

// ---- GEMM subtract from empty destination -----------------------------------

template <typename Scalar>
void test_gemm_subtract_empty(Index m, Index n, Index k) {
  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
  using RealScalar = typename NumTraits<Scalar>::Real;

  Mat A = Mat::Random(m, k);
  Mat B = Mat::Random(k, n);

  auto d_A = DeviceMatrix<Scalar>::fromHost(A);
  auto d_B = DeviceMatrix<Scalar>::fromHost(B);

  GpuContext ctx;
  DeviceMatrix<Scalar> d_C;
  d_C.device(ctx) -= d_A * d_B;

  Mat C = d_C.toHost();
  Mat C_ref = -(A * B);

  RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
  VERIFY((C - C_ref).norm() < tol);
}

// ---- GEMM with scaled RHS: C = A * (alpha * B) -----------------------------

template <typename Scalar>
void test_gemm_scaled_rhs(Index m, Index n, Index k) {
  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
  using RealScalar = typename NumTraits<Scalar>::Real;

  Mat A = Mat::Random(m, k);
  Mat B = Mat::Random(k, n);
  Scalar alpha = Scalar(3.0);

  auto d_A = DeviceMatrix<Scalar>::fromHost(A);
  auto d_B = DeviceMatrix<Scalar>::fromHost(B);

  DeviceMatrix<Scalar> d_C;
  d_C = d_A * (alpha * d_B);

  Mat C = d_C.toHost();
  Mat C_ref = A * (alpha * B);

  RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
  VERIFY((C - C_ref).norm() < tol);
}

// ---- GEMM dimension mismatch must assert ------------------------------------

template <typename Scalar>
void test_gemm_dimension_mismatch() {
  using Mat = Matrix<Scalar, Dynamic, Dynamic>;

  Mat A = Mat::Random(8, 5);
  Mat B = Mat::Random(6, 7);  // inner dimension mismatch

  auto d_A = DeviceMatrix<Scalar>::fromHost(A);
  auto d_B = DeviceMatrix<Scalar>::fromHost(B);
  DeviceMatrix<Scalar> d_C;

  VERIFY_RAISES_ASSERT(d_C = d_A * d_B);
}

// ---- GEMM with explicit GpuContext ------------------------------------------

template <typename Scalar>
void test_gemm_explicit_context(Index m, Index n, Index k) {
  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
  using RealScalar = typename NumTraits<Scalar>::Real;

  Mat A = Mat::Random(m, k);
  Mat B = Mat::Random(k, n);

  auto d_A = DeviceMatrix<Scalar>::fromHost(A);
  auto d_B = DeviceMatrix<Scalar>::fromHost(B);

  GpuContext ctx;
  DeviceMatrix<Scalar> d_C;
  d_C.device(ctx) = d_A * d_B;

  Mat C = d_C.toHost();
  Mat C_ref = A * B;

  RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
  VERIFY((C - C_ref).norm() < tol);
}

// ---- GEMM cross-context reuse of the same destination -----------------------

template <typename Scalar>
void test_gemm_cross_context_reuse(Index n) {
  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
  using RealScalar = typename NumTraits<Scalar>::Real;

  Mat A = Mat::Random(n, n);
  Mat B = Mat::Random(n, n);
  Mat D = Mat::Random(n, n);
  Mat E = Mat::Random(n, n);

  auto d_A = DeviceMatrix<Scalar>::fromHost(A);
  auto d_B = DeviceMatrix<Scalar>::fromHost(B);
  auto d_D = DeviceMatrix<Scalar>::fromHost(D);
  auto d_E = DeviceMatrix<Scalar>::fromHost(E);

  GpuContext ctx1;
  GpuContext ctx2;
  DeviceMatrix<Scalar> d_C;
  d_C.device(ctx1) = d_A * d_B;
  d_C.device(ctx2) += d_D * d_E;

  Mat C = d_C.toHost();
  Mat C_ref = A * B + D * E;

  RealScalar tol = gemm_error_bound<Scalar>(n, A.norm(), B.norm()) + gemm_error_bound<Scalar>(n, D.norm(), E.norm());
  VERIFY((C - C_ref).norm() < tol);
}

// ---- GEMM cross-context resize of the destination ---------------------------

template <typename Scalar>
void test_gemm_cross_context_resize() {
  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
  using RealScalar = typename NumTraits<Scalar>::Real;

  Mat A = Mat::Random(64, 64);
  Mat B = Mat::Random(64, 64);
  Mat D = Mat::Random(32, 16);
  Mat E = Mat::Random(16, 8);

  auto d_A = DeviceMatrix<Scalar>::fromHost(A);
  auto d_B = DeviceMatrix<Scalar>::fromHost(B);
  auto d_D = DeviceMatrix<Scalar>::fromHost(D);
  auto d_E = DeviceMatrix<Scalar>::fromHost(E);

  GpuContext ctx1;
  GpuContext ctx2;
  DeviceMatrix<Scalar> d_C;
  d_C.device(ctx1) = d_A * d_B;
  d_C.device(ctx2) = d_D * d_E;

  Mat C = d_C.toHost();
  Mat C_ref = D * E;

  RealScalar tol = gemm_error_bound<Scalar>(16, D.norm(), E.norm());
  VERIFY((C - C_ref).norm() < tol);
}

// ---- GEMM chaining: C = (A * B) then D = C * E -----------------------------

template <typename Scalar>
void test_gemm_chain(Index n) {
  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
  using RealScalar = typename NumTraits<Scalar>::Real;

  Mat A = Mat::Random(n, n);
  Mat B = Mat::Random(n, n);
  Mat E = Mat::Random(n, n);

  auto d_A = DeviceMatrix<Scalar>::fromHost(A);
  auto d_B = DeviceMatrix<Scalar>::fromHost(B);
  auto d_E = DeviceMatrix<Scalar>::fromHost(E);

  DeviceMatrix<Scalar> d_C;
  d_C = d_A * d_B;
  DeviceMatrix<Scalar> d_D;
  d_D = d_C * d_E;

  Mat D = d_D.toHost();
  Mat D_ref = (A * B) * E;

  Mat C_ref = A * B;
  RealScalar tol =
      gemm_error_bound<Scalar>(n, A.norm(), B.norm()) * E.norm() + gemm_error_bound<Scalar>(n, C_ref.norm(), E.norm());
  VERIFY((D - D_ref).norm() < tol);
}

// ---- Square identity check: A * I = A ---------------------------------------

template <typename Scalar>
void test_gemm_identity(Index n) {
  using Mat = Matrix<Scalar, Dynamic, Dynamic>;

  Mat A = Mat::Random(n, n);
  Mat eye = Mat::Identity(n, n);

  auto d_A = DeviceMatrix<Scalar>::fromHost(A);
  auto d_I = DeviceMatrix<Scalar>::fromHost(eye);

  DeviceMatrix<Scalar> d_C;
  d_C = d_A * d_I;

  Mat C = d_C.toHost();
  VERIFY_IS_APPROX(C, A);
}

// ---- LLT solve expression: d_X = d_A.llt().solve(d_B) ----------------------

template <typename MatrixType>
MatrixType make_spd(Index n) {
  using Scalar = typename MatrixType::Scalar;
  MatrixType M = MatrixType::Random(n, n);
  return M.adjoint() * M + MatrixType::Identity(n, n) * static_cast<Scalar>(n);
}
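A one-line justification for this construction, since most solver tests below rely on it:

// M^H * M is positive semi-definite, so adding n * I shifts every eigenvalue up
// to at least n: the result is Hermitian positive definite with a modest
// condition number, which keeps the residual tolerances below meaningful.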
template <typename Scalar>
void test_llt_solve_expr(Index n, Index nrhs) {
  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
  using RealScalar = typename NumTraits<Scalar>::Real;

  Mat A = make_spd<Mat>(n);
  Mat B = Mat::Random(n, nrhs);

  auto d_A = DeviceMatrix<Scalar>::fromHost(A);
  auto d_B = DeviceMatrix<Scalar>::fromHost(B);

  DeviceMatrix<Scalar> d_X;
  d_X = d_A.llt().solve(d_B);

  Mat X = d_X.toHost();
  RealScalar residual = (A * X - B).norm() / B.norm();
  VERIFY(residual < RealScalar(n) * gpu_unit_roundoff<Scalar>());
}
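Why n * u is a safe threshold here, as a rough argument rather than a bound taken from the patch:

// A backward-stable solve gives ||A*x - b|| <~ c * n * u * ||A|| * ||x||. With
// the well-conditioned make_spd matrix, ||A|| * ||x|| stays within a small
// factor of ||b||, so the relative residual sits comfortably below n * u.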
// ---- LLT solve with explicit context ----------------------------------------

template <typename Scalar>
void test_llt_solve_expr_context(Index n, Index nrhs) {
  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
  using RealScalar = typename NumTraits<Scalar>::Real;

  Mat A = make_spd<Mat>(n);
  Mat B = Mat::Random(n, nrhs);

  auto d_A = DeviceMatrix<Scalar>::fromHost(A);
  auto d_B = DeviceMatrix<Scalar>::fromHost(B);

  GpuContext ctx;
  DeviceMatrix<Scalar> d_X;
  d_X.device(ctx) = d_A.llt().solve(d_B);

  Mat X = d_X.toHost();
  RealScalar residual = (A * X - B).norm() / B.norm();
  VERIFY(residual < RealScalar(n) * gpu_unit_roundoff<Scalar>());
}

// ---- LU solve expression: d_X = d_A.lu().solve(d_B) ------------------------

template <typename Scalar>
void test_lu_solve_expr(Index n, Index nrhs) {
  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
  using RealScalar = typename NumTraits<Scalar>::Real;

  Mat A = Mat::Random(n, n);
  Mat B = Mat::Random(n, nrhs);

  auto d_A = DeviceMatrix<Scalar>::fromHost(A);
  auto d_B = DeviceMatrix<Scalar>::fromHost(B);

  DeviceMatrix<Scalar> d_X;
  d_X = d_A.lu().solve(d_B);

  Mat X = d_X.toHost();
  RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm());
  VERIFY(residual < RealScalar(10) * RealScalar(n) * gpu_unit_roundoff<Scalar>());
}

// ---- GEMM + solver chain: C = A * B, X = C.llt().solve(D) ------------------

template <typename Scalar>
void test_gemm_then_solve(Index n) {
  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
  using RealScalar = typename NumTraits<Scalar>::Real;

  Mat A = Mat::Random(n, n);
  Mat D = Mat::Random(n, 1);

  // Make SPD: C = A^H * A + n*I
  auto d_A = DeviceMatrix<Scalar>::fromHost(A);
  DeviceMatrix<Scalar> d_C;
  d_C = d_A.adjoint() * d_A;

  // Add n*I on host (no element-wise ops on DeviceMatrix yet).
  Mat C = d_C.toHost();
  C += Mat::Identity(n, n) * static_cast<Scalar>(n);
  d_C = DeviceMatrix<Scalar>::fromHost(C);

  auto d_D = DeviceMatrix<Scalar>::fromHost(D);

  DeviceMatrix<Scalar> d_X;
  d_X = d_C.llt().solve(d_D);

  Mat X = d_X.toHost();
  RealScalar residual = (C * X - D).norm() / D.norm();
  VERIFY(residual < RealScalar(n) * gpu_unit_roundoff<Scalar>());
}

// ---- LLT solve with Upper triangle -----------------------------------------

template <typename Scalar>
void test_llt_solve_upper(Index n, Index nrhs) {
  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
  using RealScalar = typename NumTraits<Scalar>::Real;

  Mat A = make_spd<Mat>(n);
  Mat B = Mat::Random(n, nrhs);

  auto d_A = DeviceMatrix<Scalar>::fromHost(A);
  auto d_B = DeviceMatrix<Scalar>::fromHost(B);

  DeviceMatrix<Scalar> d_X;
  d_X = d_A.template llt<Upper>().solve(d_B);

  Mat X = d_X.toHost();
  RealScalar residual = (A * X - B).norm() / B.norm();
  VERIFY(residual < RealScalar(n) * gpu_unit_roundoff<Scalar>());
}

// ---- LU solve with explicit context -----------------------------------------

template <typename Scalar>
void test_lu_solve_expr_context(Index n, Index nrhs) {
  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
  using RealScalar = typename NumTraits<Scalar>::Real;

  Mat A = Mat::Random(n, n);
  Mat B = Mat::Random(n, nrhs);

  auto d_A = DeviceMatrix<Scalar>::fromHost(A);
  auto d_B = DeviceMatrix<Scalar>::fromHost(B);

  GpuContext ctx;
  DeviceMatrix<Scalar> d_X;
  d_X.device(ctx) = d_A.lu().solve(d_B);

  Mat X = d_X.toHost();
  RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm());
  VERIFY(residual < RealScalar(10) * RealScalar(n) * gpu_unit_roundoff<Scalar>());
}

// ---- Zero-nrhs solver expressions ------------------------------------------

template <typename Scalar>
void test_llt_solve_zero_nrhs(Index n) {
  using Mat = Matrix<Scalar, Dynamic, Dynamic>;

  Mat A = make_spd<Mat>(n);
  Mat B = Mat::Random(n, 0);

  auto d_A = DeviceMatrix<Scalar>::fromHost(A);
  auto d_B = DeviceMatrix<Scalar>::fromHost(B);

  DeviceMatrix<Scalar> d_X;
  d_X = d_A.llt().solve(d_B);

  VERIFY_IS_EQUAL(d_X.rows(), n);
  VERIFY_IS_EQUAL(d_X.cols(), 0);
}

template <typename Scalar>
void test_lu_solve_zero_nrhs(Index n) {
  using Mat = Matrix<Scalar, Dynamic, Dynamic>;

  Mat A = Mat::Random(n, n);
  Mat B = Mat::Random(n, 0);

  auto d_A = DeviceMatrix<Scalar>::fromHost(A);
  auto d_B = DeviceMatrix<Scalar>::fromHost(B);

  DeviceMatrix<Scalar> d_X;
  d_X = d_A.lu().solve(d_B);

  VERIFY_IS_EQUAL(d_X.rows(), n);
  VERIFY_IS_EQUAL(d_X.cols(), 0);
}

// ---- TRSM: triangularView<UpLo>().solve(B) ----------------------------------

template <typename Scalar, int UpLo>
void test_trsm(Index n, Index nrhs) {
  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
  using RealScalar = typename NumTraits<Scalar>::Real;

  // Build a well-conditioned triangular matrix.
  Mat A = Mat::Random(n, n);
  A.diagonal().array() += static_cast<Scalar>(n);  // ensure non-singular
  if (UpLo == Lower)
    A = A.template triangularView<Lower>();
  else
    A = A.template triangularView<Upper>();

  Mat B = Mat::Random(n, nrhs);

  auto d_A = DeviceMatrix<Scalar>::fromHost(A);
  auto d_B = DeviceMatrix<Scalar>::fromHost(B);

  DeviceMatrix<Scalar> d_X;
  d_X = d_A.template triangularView<UpLo>().solve(d_B);

  Mat X = d_X.toHost();
  RealScalar residual = (A * X - B).norm() / B.norm();
  VERIFY(residual < RealScalar(n) * gpu_unit_roundoff<Scalar>());
}
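A brief note on why the diagonal shift is enough (informal reasoning, not from the patch):

// Mat::Random entries lie in [-1, 1], so after the shift every pivot has
// magnitude at least n - 1 while each off-diagonal entry stays bounded by 1;
// the triangular factor is therefore comfortably nonsingular and the
// back-substitution in TRSM is numerically benign.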
// ---- SYMM/HEMM: selfadjointView<UpLo>() * B --------------------------------

template <typename Scalar, int UpLo>
void test_symm(Index n, Index nrhs) {
  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
  using RealScalar = typename NumTraits<Scalar>::Real;

  Mat A = make_spd<Mat>(n);  // SPD is also self-adjoint
  Mat B = Mat::Random(n, nrhs);

  auto d_A = DeviceMatrix<Scalar>::fromHost(A);
  auto d_B = DeviceMatrix<Scalar>::fromHost(B);

  DeviceMatrix<Scalar> d_C;
  d_C = d_A.template selfadjointView<UpLo>() * d_B;

  Mat C = d_C.toHost();
  Mat C_ref = A * B;  // A is symmetric, so full multiply == symm

  RealScalar tol = gemm_error_bound<Scalar>(n, A.norm(), B.norm());
  VERIFY((C - C_ref).norm() < tol);
}

// ---- SYRK/HERK: rankUpdate(A) → C = A * A^H --------------------------------

template <typename Scalar>
void test_syrk(Index n, Index k) {
  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
  using RealScalar = typename NumTraits<Scalar>::Real;

  Mat A = Mat::Random(n, k);

  auto d_A = DeviceMatrix<Scalar>::fromHost(A);

  DeviceMatrix<Scalar> d_C;
  d_C.template selfadjointView<Lower>().rankUpdate(d_A);

  Mat C = d_C.toHost();
  // Only the lower triangle is meaningful for SYRK. Compare lower triangles.
  Mat C_ref = A * A.adjoint();

  // Extract lower triangles for comparison.
  Mat C_lower = C.template triangularView<Lower>();
  Mat C_ref_lower = C_ref.template triangularView<Lower>();

  RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), A.norm());
  VERIFY((C_lower - C_ref_lower).norm() < tol);
}
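One caveat worth recording about the rank-update result, following the test's own comment:

// BLAS-style SYRK/HERK updates only the requested triangle; the opposite
// triangle of d_C is unspecified. To materialize the full symmetric matrix on
// the host, mirror it explicitly, e.g.:
//   Mat full = C.template selfadjointView<Lower>();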
// ---- Per-scalar driver ------------------------------------------------------

template <typename Scalar>
void test_scalar() {
  CALL_SUBTEST(test_gemm_basic<Scalar>(64, 64, 64));
  CALL_SUBTEST(test_gemm_basic<Scalar>(128, 64, 32));
  CALL_SUBTEST(test_gemm_basic<Scalar>(1, 1, 1));
  CALL_SUBTEST(test_gemm_basic<Scalar>(256, 256, 256));

  CALL_SUBTEST(test_gemm_adjoint_lhs<Scalar>(64, 64, 64));
  CALL_SUBTEST(test_gemm_adjoint_lhs<Scalar>(128, 32, 64));

  CALL_SUBTEST(test_gemm_transpose_rhs<Scalar>(64, 64, 64));
  CALL_SUBTEST(test_gemm_transpose_rhs<Scalar>(128, 32, 64));

  CALL_SUBTEST(test_gemm_scaled<Scalar>(64, 64, 64));
  CALL_SUBTEST(test_gemm_scaled_rhs<Scalar>(64, 64, 64));
  CALL_SUBTEST(test_gemm_accumulate<Scalar>(64, 64, 64));
  CALL_SUBTEST(test_gemm_accumulate_empty<Scalar>(64, 64, 64));
  CALL_SUBTEST(test_gemm_subtract<Scalar>(64, 64, 64));
  CALL_SUBTEST(test_gemm_subtract_empty<Scalar>(64, 64, 64));
  CALL_SUBTEST(test_gemm_dimension_mismatch<Scalar>());
  CALL_SUBTEST(test_gemm_explicit_context<Scalar>(64, 64, 64));
  CALL_SUBTEST(test_gemm_cross_context_reuse<Scalar>(64));
  CALL_SUBTEST(test_gemm_cross_context_resize<Scalar>());
  CALL_SUBTEST(test_gemm_chain<Scalar>(64));
  CALL_SUBTEST(test_gemm_identity<Scalar>(64));

  // Solver expressions. Zero-size edge cases use dedicated tests rather than
  // residual-based checks.
  CALL_SUBTEST(test_llt_solve_expr<Scalar>(64, 1));
  CALL_SUBTEST(test_llt_solve_expr<Scalar>(64, 4));
  CALL_SUBTEST(test_llt_solve_expr<Scalar>(256, 8));
  CALL_SUBTEST(test_llt_solve_expr_context<Scalar>(64, 4));
  CALL_SUBTEST(test_llt_solve_upper<Scalar>(64, 4));
  CALL_SUBTEST(test_lu_solve_expr<Scalar>(64, 1));
  CALL_SUBTEST(test_lu_solve_expr<Scalar>(64, 4));
  CALL_SUBTEST(test_lu_solve_expr<Scalar>(256, 8));
  CALL_SUBTEST(test_lu_solve_expr_context<Scalar>(64, 4));
  CALL_SUBTEST(test_llt_solve_zero_nrhs<Scalar>(64));
  CALL_SUBTEST(test_llt_solve_zero_nrhs<Scalar>(0));
  CALL_SUBTEST(test_lu_solve_zero_nrhs<Scalar>(64));
  CALL_SUBTEST(test_lu_solve_zero_nrhs<Scalar>(0));
  CALL_SUBTEST(test_gemm_then_solve<Scalar>(64));

  // TRSM
  CALL_SUBTEST((test_trsm<Scalar, Lower>(64, 1)));
  CALL_SUBTEST((test_trsm<Scalar, Lower>(64, 4)));
  CALL_SUBTEST((test_trsm<Scalar, Upper>(64, 4)));
  CALL_SUBTEST((test_trsm<Scalar, Lower>(256, 8)));

  // SYMM/HEMM
  CALL_SUBTEST((test_symm<Scalar, Lower>(64, 4)));
  CALL_SUBTEST((test_symm<Scalar, Upper>(64, 4)));
  CALL_SUBTEST((test_symm<Scalar, Lower>(128, 8)));

  // SYRK/HERK
  CALL_SUBTEST(test_syrk<Scalar>(64, 64));
  CALL_SUBTEST(test_syrk<Scalar>(64, 32));
  CALL_SUBTEST(test_syrk<Scalar>(128, 64));
}

// ---- Solver failure mode tests (not templated on Scalar) --------------------

void test_llt_not_spd() {
  // Negative definite matrix: LLT factorization must fail.
  MatrixXd A = -MatrixXd::Identity(8, 8);
  MatrixXd B = MatrixXd::Random(8, 1);
  auto d_A = DeviceMatrix<double>::fromHost(A);
  auto d_B = DeviceMatrix<double>::fromHost(B);
  DeviceMatrix<double> d_X;
  VERIFY_RAISES_ASSERT(d_X = d_A.llt().solve(d_B));
}

void test_lu_singular() {
  // Zero matrix: LU factorization must detect singularity.
  MatrixXd A = MatrixXd::Zero(8, 8);
  MatrixXd B = MatrixXd::Random(8, 1);
  auto d_A = DeviceMatrix<double>::fromHost(A);
  auto d_B = DeviceMatrix<double>::fromHost(B);
  DeviceMatrix<double> d_X;
  VERIFY_RAISES_ASSERT(d_X = d_A.lu().solve(d_B));
}

EIGEN_DECLARE_TEST(gpu_cublas) {
  CALL_SUBTEST(test_scalar<float>());
  CALL_SUBTEST(test_scalar<double>());
  CALL_SUBTEST(test_scalar<std::complex<float>>());
  CALL_SUBTEST(test_scalar<std::complex<double>>());
  CALL_SUBTEST(test_llt_not_spd());
  CALL_SUBTEST(test_lu_singular());
}
154
test/gpu_cudss_ldlt.cpp
Normal file
@@ -0,0 +1,154 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

// Tests for GpuSparseLDLT: GPU sparse LDL^T via cuDSS.

#define EIGEN_USE_GPU
#include "main.h"
#include <Eigen/Sparse>
#include <Eigen/GPU>

using namespace Eigen;

// ---- Helper: build a random sparse symmetric indefinite matrix ---------------

template <typename Scalar>
SparseMatrix<Scalar, ColMajor, int> make_symmetric_indefinite(Index n, double density = 0.1) {
  using SpMat = SparseMatrix<Scalar, ColMajor, int>;

  // Build a random sparse matrix and symmetrize it.
  // The diagonal has mixed signs to ensure indefiniteness.
  SpMat R(n, n);
  R.reserve(VectorXi::Constant(n, static_cast<int>(n * density) + 1));
  for (Index j = 0; j < n; ++j) {
    for (Index i = 0; i < n; ++i) {
      if (i == j || (std::rand() / double(RAND_MAX)) < density) {
        R.insert(i, j) = Scalar(std::rand() / double(RAND_MAX) - 0.5);
      }
    }
  }
  R.makeCompressed();

  // A = R + R^H (symmetric), then add a diagonal with alternating signs for indefiniteness.
  SpMat A = R + SparseMatrix<Scalar, ColMajor, int>(R.adjoint());
  for (Index i = 0; i < n; ++i) {
    Scalar diag_val = Scalar((i % 2 == 0) ? n : -n);
    A.coeffRef(i, i) += diag_val;
  }
  A.makeCompressed();
  return A;
}
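Why this construction is genuinely indefinite, as an informal Gershgorin-style argument:

// Entries of R + R^H are bounded by 1, so for moderate density each Gershgorin
// disc has radius well below n. Centering the discs alternately at +n and -n
// therefore forces eigenvalues of both signs: LLT would fail on such a matrix,
// which is exactly what makes it a proper LDL^T test case.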
// ---- Solve and check residual -----------------------------------------------

template <typename Scalar>
void test_solve(Index n) {
  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
  using Vec = Matrix<Scalar, Dynamic, 1>;
  using RealScalar = typename NumTraits<Scalar>::Real;

  SpMat A = make_symmetric_indefinite<Scalar>(n);
  Vec b = Vec::Random(n);

  GpuSparseLDLT<Scalar> ldlt(A);
  VERIFY_IS_EQUAL(ldlt.info(), Success);

  Vec x = ldlt.solve(b);
  VERIFY_IS_EQUAL(x.rows(), n);

  Vec r = A * x - b;
  RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
  VERIFY(r.norm() / b.norm() < tol);
}

// ---- Multiple RHS -----------------------------------------------------------

template <typename Scalar>
void test_multiple_rhs(Index n, Index nrhs) {
  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
  using RealScalar = typename NumTraits<Scalar>::Real;

  SpMat A = make_symmetric_indefinite<Scalar>(n);
  Mat B = Mat::Random(n, nrhs);

  GpuSparseLDLT<Scalar> ldlt(A);
  VERIFY_IS_EQUAL(ldlt.info(), Success);

  Mat X = ldlt.solve(B);
  VERIFY_IS_EQUAL(X.rows(), n);
  VERIFY_IS_EQUAL(X.cols(), nrhs);

  Mat R = A * X - B;
  RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
  VERIFY(R.norm() / B.norm() < tol);
}

// ---- Refactorize ------------------------------------------------------------

template <typename Scalar>
void test_refactorize(Index n) {
  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
  using Vec = Matrix<Scalar, Dynamic, 1>;
  using RealScalar = typename NumTraits<Scalar>::Real;

  SpMat A = make_symmetric_indefinite<Scalar>(n);
  Vec b = Vec::Random(n);

  GpuSparseLDLT<Scalar> ldlt;
  ldlt.analyzePattern(A);
  VERIFY_IS_EQUAL(ldlt.info(), Success);

  ldlt.factorize(A);
  VERIFY_IS_EQUAL(ldlt.info(), Success);
  Vec x1 = ldlt.solve(b);

  // Modify values, keep pattern.
  SpMat A2 = A;
  for (Index i = 0; i < n; ++i) A2.coeffRef(i, i) *= Scalar(RealScalar(2));

  ldlt.factorize(A2);
  VERIFY_IS_EQUAL(ldlt.info(), Success);
  Vec x2 = ldlt.solve(b);

  RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
  VERIFY((A * x1 - b).norm() / b.norm() < tol);
  VERIFY((A2 * x2 - b).norm() / b.norm() < tol);
  VERIFY((x1 - x2).norm() > NumTraits<Scalar>::epsilon());
}

// ---- Empty ------------------------------------------------------------------

void test_empty() {
  using SpMat = SparseMatrix<double, ColMajor, int>;
  SpMat A(0, 0);
  A.makeCompressed();
  GpuSparseLDLT<double> ldlt(A);
  VERIFY_IS_EQUAL(ldlt.info(), Success);
  VERIFY_IS_EQUAL(ldlt.rows(), 0);
  VERIFY_IS_EQUAL(ldlt.cols(), 0);
}

// ---- Per-scalar driver ------------------------------------------------------

template <typename Scalar>
void test_scalar() {
  CALL_SUBTEST(test_solve<Scalar>(64));
  CALL_SUBTEST(test_solve<Scalar>(256));
  CALL_SUBTEST(test_multiple_rhs<Scalar>(64, 4));
  CALL_SUBTEST(test_refactorize<Scalar>(64));
}

EIGEN_DECLARE_TEST(gpu_cudss_ldlt) {
  CALL_SUBTEST(test_scalar<float>());
  CALL_SUBTEST(test_scalar<double>());
  CALL_SUBTEST(test_scalar<std::complex<float>>());
  CALL_SUBTEST(test_scalar<std::complex<double>>());
  CALL_SUBTEST(test_empty());
}
202
test/gpu_cudss_llt.cpp
Normal file
@@ -0,0 +1,202 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

// Tests for GpuSparseLLT: GPU sparse Cholesky via cuDSS.

#define EIGEN_USE_GPU
#include "main.h"
#include <Eigen/Sparse>
#include <Eigen/GPU>

using namespace Eigen;

// ---- Helper: build a random sparse SPD matrix -------------------------------

template <typename Scalar>
SparseMatrix<Scalar, ColMajor, int> make_spd(Index n, double density = 0.1) {
  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
  using RealScalar = typename NumTraits<Scalar>::Real;

  // Uses the global std::rand state seeded by the test framework (g_seed).
  SpMat R(n, n);
  R.reserve(VectorXi::Constant(n, static_cast<int>(n * density) + 1));
  for (Index j = 0; j < n; ++j) {
    for (Index i = 0; i < n; ++i) {
      if (i == j || (std::rand() / double(RAND_MAX)) < density) {
        R.insert(i, j) = Scalar(std::rand() / double(RAND_MAX) - 0.5);
      }
    }
  }
  R.makeCompressed();

  // A = R^H * R + n * I (guaranteed SPD).
  SpMat A = R.adjoint() * R;
  for (Index i = 0; i < n; ++i) A.coeffRef(i, i) += Scalar(RealScalar(n));
  A.makeCompressed();
  return A;
}

// ---- Solve and check residual -----------------------------------------------

template <typename Scalar>
void test_solve(Index n) {
  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
  using Vec = Matrix<Scalar, Dynamic, 1>;
  using RealScalar = typename NumTraits<Scalar>::Real;

  SpMat A = make_spd<Scalar>(n);
  Vec b = Vec::Random(n);

  GpuSparseLLT<Scalar> llt(A);
  VERIFY_IS_EQUAL(llt.info(), Success);

  Vec x = llt.solve(b);
  VERIFY_IS_EQUAL(x.rows(), n);

  // Check residual: ||Ax - b|| / ||b||.
  Vec r = A * x - b;
  RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
  VERIFY(r.norm() / b.norm() < tol);
}

// ---- Compare with CPU SimplicialLLT -----------------------------------------

template <typename Scalar>
void test_vs_cpu(Index n) {
  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
  using Vec = Matrix<Scalar, Dynamic, 1>;
  using RealScalar = typename NumTraits<Scalar>::Real;

  SpMat A = make_spd<Scalar>(n);
  Vec b = Vec::Random(n);

  GpuSparseLLT<Scalar> gpu_llt(A);
  VERIFY_IS_EQUAL(gpu_llt.info(), Success);
  Vec x_gpu = gpu_llt.solve(b);

  SimplicialLLT<SpMat> cpu_llt(A);
  VERIFY_IS_EQUAL(cpu_llt.info(), Success);
  Vec x_cpu = cpu_llt.solve(b);

  RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
  VERIFY((x_gpu - x_cpu).norm() / x_cpu.norm() < tol);
}

// ---- Multiple RHS -----------------------------------------------------------

template <typename Scalar>
void test_multiple_rhs(Index n, Index nrhs) {
  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
  using RealScalar = typename NumTraits<Scalar>::Real;

  SpMat A = make_spd<Scalar>(n);
  Mat B = Mat::Random(n, nrhs);

  GpuSparseLLT<Scalar> llt(A);
  VERIFY_IS_EQUAL(llt.info(), Success);

  Mat X = llt.solve(B);
  VERIFY_IS_EQUAL(X.rows(), n);
  VERIFY_IS_EQUAL(X.cols(), nrhs);

  Mat R = A * X - B;
  RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
  VERIFY(R.norm() / B.norm() < tol);
}

// ---- Separate analyze + factorize (refactorization) -------------------------

template <typename Scalar>
void test_refactorize(Index n) {
  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
  using Vec = Matrix<Scalar, Dynamic, 1>;
  using RealScalar = typename NumTraits<Scalar>::Real;

  SpMat A = make_spd<Scalar>(n);
  Vec b = Vec::Random(n);

  GpuSparseLLT<Scalar> llt;
  llt.analyzePattern(A);
  VERIFY_IS_EQUAL(llt.info(), Success);

  // First factorize + solve.
  llt.factorize(A);
  VERIFY_IS_EQUAL(llt.info(), Success);
  Vec x1 = llt.solve(b);

  // Modify values (keep same pattern): scale diagonal.
  SpMat A2 = A;
  for (Index i = 0; i < n; ++i) A2.coeffRef(i, i) *= Scalar(RealScalar(2));

  // Refactorize with same pattern.
  llt.factorize(A2);
  VERIFY_IS_EQUAL(llt.info(), Success);
  Vec x2 = llt.solve(b);

  // Both solutions should satisfy their respective systems.
  RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
  VERIFY((A * x1 - b).norm() / b.norm() < tol);
  VERIFY((A2 * x2 - b).norm() / b.norm() < tol);

  // Solutions should differ (A2 != A).
  VERIFY((x1 - x2).norm() > NumTraits<Scalar>::epsilon());
}
// ---- Empty matrix -----------------------------------------------------------
|
||||
|
||||
void test_empty() {
|
||||
using SpMat = SparseMatrix<double, ColMajor, int>;
|
||||
SpMat A(0, 0);
|
||||
A.makeCompressed();
|
||||
GpuSparseLLT<double> llt(A);
|
||||
VERIFY_IS_EQUAL(llt.info(), Success);
|
||||
VERIFY_IS_EQUAL(llt.rows(), 0);
|
||||
VERIFY_IS_EQUAL(llt.cols(), 0);
|
||||
}
|
||||
|
||||
// ---- Upper triangle ---------------------------------------------------------
|
||||
|
||||
template <typename Scalar>
|
||||
void test_upper(Index n) {
|
||||
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
|
||||
using Vec = Matrix<Scalar, Dynamic, 1>;
|
||||
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||
|
||||
SpMat A = make_spd<Scalar>(n);
|
||||
Vec b = Vec::Random(n);
|
||||
|
||||
GpuSparseLLT<Scalar, Upper> llt(A);
|
||||
VERIFY_IS_EQUAL(llt.info(), Success);
|
||||
|
||||
Vec x = llt.solve(b);
|
||||
Vec r = A * x - b;
|
||||
RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
|
||||
VERIFY(r.norm() / b.norm() < tol);
|
||||
}
|
||||
|
||||
// ---- Per-scalar driver ------------------------------------------------------
|
||||
|
||||
template <typename Scalar>
|
||||
void test_scalar() {
|
||||
CALL_SUBTEST(test_solve<Scalar>(64));
|
||||
CALL_SUBTEST(test_solve<Scalar>(256));
|
||||
CALL_SUBTEST(test_vs_cpu<Scalar>(64));
|
||||
CALL_SUBTEST(test_multiple_rhs<Scalar>(64, 4));
|
||||
CALL_SUBTEST(test_refactorize<Scalar>(64));
|
||||
CALL_SUBTEST(test_upper<Scalar>(64));
|
||||
}
|
||||
|
||||
EIGEN_DECLARE_TEST(gpu_cudss_llt) {
|
||||
CALL_SUBTEST(test_scalar<float>());
|
||||
CALL_SUBTEST(test_scalar<double>());
|
||||
CALL_SUBTEST(test_scalar<std::complex<float>>());
|
||||
CALL_SUBTEST(test_scalar<std::complex<double>>());
|
||||
CALL_SUBTEST(test_empty());
|
||||
}
|
||||
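Taken together, the tests above pin down the intended GpuSparseLLT workflow. A minimal sketch, assuming only the API exercised in this test file (constructor, info(), solve()); the function name is illustrative, not part of Eigen:

#define EIGEN_USE_GPU
#include <Eigen/Sparse>
#include <Eigen/GPU>

// Returns true on success and fills x with the solution of A*x = b.
bool solve_spd_on_gpu(const Eigen::SparseMatrix<double, Eigen::ColMajor, int>& A,
                      const Eigen::VectorXd& b, Eigen::VectorXd& x) {
  Eigen::GpuSparseLLT<double> llt(A);              // analyze + factorize on the device
  if (llt.info() != Eigen::Success) return false;  // e.g. A is not SPD
  x = llt.solve(b);                                // device solve; result copied back to host
  return true;
}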
147
test/gpu_cudss_lu.cpp
Normal file
@@ -0,0 +1,147 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

// Tests for GpuSparseLU: GPU sparse LU via cuDSS.

#define EIGEN_USE_GPU
#include "main.h"
#include <Eigen/Sparse>
#include <Eigen/GPU>

using namespace Eigen;

// ---- Helper: build a random sparse non-singular general matrix ---------------

template <typename Scalar>
SparseMatrix<Scalar, ColMajor, int> make_general(Index n, double density = 0.1) {
  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
  using RealScalar = typename NumTraits<Scalar>::Real;

  SpMat R(n, n);
  R.reserve(VectorXi::Constant(n, static_cast<int>(n * density) + 1));
  for (Index j = 0; j < n; ++j) {
    for (Index i = 0; i < n; ++i) {
      if (i == j || (std::rand() / double(RAND_MAX)) < density) {
        R.insert(i, j) = Scalar(std::rand() / double(RAND_MAX) - 0.5);
      }
    }
  }
  // Add strong diagonal for non-singularity.
  for (Index i = 0; i < n; ++i) R.coeffRef(i, i) += Scalar(RealScalar(n));
  R.makeCompressed();
  return R;
}

// ---- Solve and check residual -----------------------------------------------

template <typename Scalar>
void test_solve(Index n) {
  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
  using Vec = Matrix<Scalar, Dynamic, 1>;
  using RealScalar = typename NumTraits<Scalar>::Real;

  SpMat A = make_general<Scalar>(n);
  Vec b = Vec::Random(n);

  GpuSparseLU<Scalar> lu(A);
  VERIFY_IS_EQUAL(lu.info(), Success);

  Vec x = lu.solve(b);
  VERIFY_IS_EQUAL(x.rows(), n);

  Vec r = A * x - b;
  RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
  VERIFY(r.norm() / b.norm() < tol);
}

// ---- Multiple RHS -----------------------------------------------------------

template <typename Scalar>
void test_multiple_rhs(Index n, Index nrhs) {
  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
  using RealScalar = typename NumTraits<Scalar>::Real;

  SpMat A = make_general<Scalar>(n);
  Mat B = Mat::Random(n, nrhs);

  GpuSparseLU<Scalar> lu(A);
  VERIFY_IS_EQUAL(lu.info(), Success);

  Mat X = lu.solve(B);
  VERIFY_IS_EQUAL(X.rows(), n);
  VERIFY_IS_EQUAL(X.cols(), nrhs);

  Mat R = A * X - B;
  RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
  VERIFY(R.norm() / B.norm() < tol);
}

// ---- Refactorize ------------------------------------------------------------

template <typename Scalar>
void test_refactorize(Index n) {
  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
  using Vec = Matrix<Scalar, Dynamic, 1>;
  using RealScalar = typename NumTraits<Scalar>::Real;

  SpMat A = make_general<Scalar>(n);
  Vec b = Vec::Random(n);

  GpuSparseLU<Scalar> lu;
  lu.analyzePattern(A);
  VERIFY_IS_EQUAL(lu.info(), Success);

  lu.factorize(A);
  VERIFY_IS_EQUAL(lu.info(), Success);
  Vec x1 = lu.solve(b);

  // Modify values, keep pattern.
  SpMat A2 = A;
  for (Index i = 0; i < n; ++i) A2.coeffRef(i, i) *= Scalar(RealScalar(2));

  lu.factorize(A2);
  VERIFY_IS_EQUAL(lu.info(), Success);
  Vec x2 = lu.solve(b);

  RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
  VERIFY((A * x1 - b).norm() / b.norm() < tol);
  VERIFY((A2 * x2 - b).norm() / b.norm() < tol);
  VERIFY((x1 - x2).norm() > NumTraits<Scalar>::epsilon());
}

// ---- Empty ------------------------------------------------------------------

void test_empty() {
  using SpMat = SparseMatrix<double, ColMajor, int>;
  SpMat A(0, 0);
  A.makeCompressed();
  GpuSparseLU<double> lu(A);
  VERIFY_IS_EQUAL(lu.info(), Success);
  VERIFY_IS_EQUAL(lu.rows(), 0);
  VERIFY_IS_EQUAL(lu.cols(), 0);
}

// ---- Per-scalar driver ------------------------------------------------------

template <typename Scalar>
void test_scalar() {
  CALL_SUBTEST(test_solve<Scalar>(64));
  CALL_SUBTEST(test_solve<Scalar>(256));
  CALL_SUBTEST(test_multiple_rhs<Scalar>(64, 4));
  CALL_SUBTEST(test_refactorize<Scalar>(64));
}

EIGEN_DECLARE_TEST(gpu_cudss_lu) {
  CALL_SUBTEST(test_scalar<float>());
  CALL_SUBTEST(test_scalar<double>());
  CALL_SUBTEST(test_scalar<std::complex<float>>());
  CALL_SUBTEST(test_scalar<std::complex<double>>());
  CALL_SUBTEST(test_empty());
}
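The refactorize test above encodes the intended pattern for time-stepping or Newton-type loops: analyze the sparsity once, then refactorize as only the values change. A minimal sketch, assuming the analyzePattern/factorize/solve API shown in these tests; updateValues() is a hypothetical application callback, not part of Eigen:

#define EIGEN_USE_GPU
#include <Eigen/Sparse>
#include <Eigen/GPU>

void time_stepping_loop(Eigen::SparseMatrix<double, Eigen::ColMajor, int>& A,
                        const Eigen::VectorXd& b, int nsteps) {
  Eigen::GpuSparseLU<double> lu;
  lu.analyzePattern(A);                   // symbolic phase: pattern only, done once
  for (int step = 0; step < nsteps; ++step) {
    // updateValues(A);                   // hypothetical: new values, same pattern
    lu.factorize(A);                      // numeric refactorization reusing the analysis
    if (lu.info() != Eigen::Success) return;
    Eigen::VectorXd x = lu.solve(b);      // consume x for this step...
  }
}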
186
test/gpu_cufft.cpp
Normal file
@@ -0,0 +1,186 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

// Tests for GpuFFT: GPU FFT via cuFFT.

#define EIGEN_USE_GPU
#include "main.h"
#include <Eigen/GPU>

using namespace Eigen;

// ---- 1D C2C roundtrip: inv(fwd(x)) ≈ x -------------------------------------

template <typename Scalar>
void test_c2c_roundtrip(Index n) {
  using Complex = std::complex<Scalar>;
  using Vec = Matrix<Complex, Dynamic, 1>;
  using RealScalar = Scalar;

  Vec x = Vec::Random(n);

  GpuFFT<Scalar> fft;
  Vec X = fft.fwd(x);
  VERIFY_IS_EQUAL(X.size(), n);

  Vec y = fft.inv(X);
  VERIFY_IS_EQUAL(y.size(), n);

  RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
  VERIFY((y - x).norm() / x.norm() < tol);
}

// ---- 1D C2C known signal: FFT of constant = delta --------------------------

template <typename Scalar>
void test_c2c_constant() {
  using Complex = std::complex<Scalar>;
  using Vec = Matrix<Complex, Dynamic, 1>;
  using RealScalar = Scalar;

  const int n = 64;
  Vec x = Vec::Constant(n, Complex(3.0, 0.0));

  GpuFFT<Scalar> fft;
  Vec X = fft.fwd(x);

  // FFT of constant c: X[0] = c*n, X[k] = 0 for k > 0.
  RealScalar tol = RealScalar(10) * NumTraits<Scalar>::epsilon() * RealScalar(n);
  VERIFY(std::abs(X(0) - Complex(3.0 * n, 0.0)) < tol);
  for (int k = 1; k < n; ++k) {
    VERIFY(std::abs(X(k)) < tol);
  }
}

// ---- 1D R2C/C2R roundtrip: invReal(fwd(r), n) ≈ r --------------------------

template <typename Scalar>
void test_r2c_roundtrip(Index n) {
  using Complex = std::complex<Scalar>;
  using CVec = Matrix<Complex, Dynamic, 1>;
  using RVec = Matrix<Scalar, Dynamic, 1>;
  using RealScalar = Scalar;

  RVec r = RVec::Random(n);

  GpuFFT<Scalar> fft;
  CVec R = fft.fwd(r);

  // R2C returns n/2+1 complex values.
  VERIFY_IS_EQUAL(R.size(), n / 2 + 1);

  RVec s = fft.invReal(R, n);
  VERIFY_IS_EQUAL(s.size(), n);

  RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
  VERIFY((s - r).norm() / r.norm() < tol);
}

// ---- 2D C2C roundtrip: inv2d(fwd2d(A)) ≈ A ---------------------------------

template <typename Scalar>
void test_2d_roundtrip(Index rows, Index cols) {
  using Complex = std::complex<Scalar>;
  using Mat = Matrix<Complex, Dynamic, Dynamic>;
  using RealScalar = Scalar;

  Mat A = Mat::Random(rows, cols);

  GpuFFT<Scalar> fft;
  Mat B = fft.fwd2d(A);
  VERIFY_IS_EQUAL(B.rows(), rows);
  VERIFY_IS_EQUAL(B.cols(), cols);

  Mat C = fft.inv2d(B);
  VERIFY_IS_EQUAL(C.rows(), rows);
  VERIFY_IS_EQUAL(C.cols(), cols);

  RealScalar tol = RealScalar(10) * RealScalar(rows * cols) * NumTraits<Scalar>::epsilon();
  VERIFY((C - A).norm() / A.norm() < tol);
}

// ---- 2D C2C known signal: constant matrix -----------------------------------

template <typename Scalar>
void test_2d_constant() {
  using Complex = std::complex<Scalar>;
  using Mat = Matrix<Complex, Dynamic, Dynamic>;
  using RealScalar = Scalar;

  const int rows = 16, cols = 32;
  Mat A = Mat::Constant(rows, cols, Complex(2.0, 0.0));

  GpuFFT<Scalar> fft;
  Mat B = fft.fwd2d(A);

  // 2D FFT of constant c: B(0,0) = c*rows*cols, all others = 0.
  RealScalar tol = RealScalar(10) * NumTraits<Scalar>::epsilon() * RealScalar(rows * cols);
  VERIFY(std::abs(B(0, 0) - Complex(2.0 * rows * cols, 0.0)) < tol);
  for (int j = 0; j < cols; ++j) {
    for (int i = 0; i < rows; ++i) {
      if (i == 0 && j == 0) continue;
      VERIFY(std::abs(B(i, j)) < tol);
    }
  }
}

// ---- Plan reuse: repeated calls should work ---------------------------------

template <typename Scalar>
void test_plan_reuse() {
  using Complex = std::complex<Scalar>;
  using Vec = Matrix<Complex, Dynamic, 1>;
  using RealScalar = Scalar;

  GpuFFT<Scalar> fft;
  for (int trial = 0; trial < 5; ++trial) {
    Vec x = Vec::Random(128);
    Vec X = fft.fwd(x);
    Vec y = fft.inv(X);
    RealScalar tol = RealScalar(10) * RealScalar(128) * NumTraits<Scalar>::epsilon();
    VERIFY((y - x).norm() / x.norm() < tol);
  }
}

// ---- Empty ------------------------------------------------------------------

template <typename Scalar>
void test_empty() {
  using Complex = std::complex<Scalar>;
  using Vec = Matrix<Complex, Dynamic, 1>;

  GpuFFT<Scalar> fft;
  Vec x(0);
  Vec X = fft.fwd(x);
  VERIFY_IS_EQUAL(X.size(), 0);
  Vec y = fft.inv(X);
  VERIFY_IS_EQUAL(y.size(), 0);
}

// ---- Per-scalar driver ------------------------------------------------------

template <typename Scalar>
void test_scalar() {
  CALL_SUBTEST(test_c2c_roundtrip<Scalar>(64));
  CALL_SUBTEST(test_c2c_roundtrip<Scalar>(256));
  CALL_SUBTEST(test_c2c_roundtrip<Scalar>(1000));  // non-power-of-2
  CALL_SUBTEST(test_c2c_constant<Scalar>());
  CALL_SUBTEST(test_r2c_roundtrip<Scalar>(64));
  CALL_SUBTEST(test_r2c_roundtrip<Scalar>(256));
  CALL_SUBTEST(test_2d_roundtrip<Scalar>(32, 32));
  CALL_SUBTEST(test_2d_roundtrip<Scalar>(16, 64));  // non-square
  CALL_SUBTEST(test_2d_constant<Scalar>());
  CALL_SUBTEST(test_plan_reuse<Scalar>());
  CALL_SUBTEST(test_empty<Scalar>());
}

EIGEN_DECLARE_TEST(gpu_cufft) {
  CALL_SUBTEST(test_scalar<float>());
  CALL_SUBTEST(test_scalar<double>());
}
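A note on normalization: cuFFT's raw inverse transform is unnormalized, so the roundtrip tests above only pass if GpuFFT's inv() folds in the 1/n factor. With the usual DFT convention,

\[
X_k = \sum_{j=0}^{n-1} x_j \, e^{-2\pi i jk/n}, \qquad
x_j = \frac{1}{n} \sum_{k=0}^{n-1} X_k \, e^{2\pi i jk/n},
\]

a forward-then-inverse pass without the 1/n factor would return n*x rather than x. The constant-signal tests (X[0] = c*n, everything else zero) check the same convention from the forward side.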
180
test/gpu_cusolver_eigen.cpp
Normal file
@@ -0,0 +1,180 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

// Tests for GpuSelfAdjointEigenSolver: GPU symmetric/Hermitian eigenvalue
// decomposition via cuSOLVER.

#define EIGEN_USE_GPU
#include "main.h"
#include <Eigen/Eigenvalues>
#include <Eigen/GPU>

using namespace Eigen;

// ---- Reconstruction: V * diag(W) * V^H ≈ A ---------------------------------

template <typename Scalar>
void test_eigen_reconstruction(Index n) {
  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
  using RealScalar = typename NumTraits<Scalar>::Real;

  // Build a symmetric/Hermitian matrix.
  Mat R = Mat::Random(n, n);
  Mat A = R + R.adjoint();

  GpuSelfAdjointEigenSolver<Scalar> es(A);
  VERIFY_IS_EQUAL(es.info(), Success);

  auto W = es.eigenvalues();
  Mat V = es.eigenvectors();

  VERIFY_IS_EQUAL(W.size(), n);
  VERIFY_IS_EQUAL(V.rows(), n);
  VERIFY_IS_EQUAL(V.cols(), n);

  // Reconstruct: A_hat = V * diag(W) * V^H.
  Mat A_hat = V * W.asDiagonal() * V.adjoint();
  RealScalar tol = RealScalar(5) * std::sqrt(static_cast<RealScalar>(n)) * NumTraits<Scalar>::epsilon() * A.norm();
  VERIFY((A_hat - A).norm() < tol);

  // Orthogonality: V^H * V ≈ I.
  Mat VhV = V.adjoint() * V;
  Mat eye = Mat::Identity(n, n);
  VERIFY((VhV - eye).norm() < tol);
}

// ---- Eigenvalues match CPU SelfAdjointEigenSolver ---------------------------

template <typename Scalar>
void test_eigen_values(Index n) {
  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
  using RealScalar = typename NumTraits<Scalar>::Real;

  Mat R = Mat::Random(n, n);
  Mat A = R + R.adjoint();

  GpuSelfAdjointEigenSolver<Scalar> gpu_es(A);
  VERIFY_IS_EQUAL(gpu_es.info(), Success);
  auto W_gpu = gpu_es.eigenvalues();

  SelfAdjointEigenSolver<Mat> cpu_es(A);
  auto W_cpu = cpu_es.eigenvalues();

  RealScalar tol = RealScalar(5) * std::sqrt(static_cast<RealScalar>(n)) * NumTraits<Scalar>::epsilon() *
                   W_cpu.cwiseAbs().maxCoeff();
  VERIFY((W_gpu - W_cpu).norm() < tol);
}

// ---- Eigenvalues-only mode --------------------------------------------------

template <typename Scalar>
void test_eigen_values_only(Index n) {
  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
  using RealScalar = typename NumTraits<Scalar>::Real;

  Mat R = Mat::Random(n, n);
  Mat A = R + R.adjoint();

  GpuSelfAdjointEigenSolver<Scalar> gpu_es(A, GpuSelfAdjointEigenSolver<Scalar>::EigenvaluesOnly);
  VERIFY_IS_EQUAL(gpu_es.info(), Success);
  auto W_gpu = gpu_es.eigenvalues();

  SelfAdjointEigenSolver<Mat> cpu_es(A, EigenvaluesOnly);
  auto W_cpu = cpu_es.eigenvalues();

  RealScalar tol = RealScalar(5) * std::sqrt(static_cast<RealScalar>(n)) * NumTraits<Scalar>::epsilon() *
                   W_cpu.cwiseAbs().maxCoeff();
  VERIFY((W_gpu - W_cpu).norm() < tol);
}

// ---- DeviceMatrix input path ------------------------------------------------

template <typename Scalar>
void test_eigen_device_matrix(Index n) {
  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
  using RealScalar = typename NumTraits<Scalar>::Real;

  Mat R = Mat::Random(n, n);
  Mat A = R + R.adjoint();

  auto d_A = DeviceMatrix<Scalar>::fromHost(A);
  GpuSelfAdjointEigenSolver<Scalar> es;
  es.compute(d_A);
  VERIFY_IS_EQUAL(es.info(), Success);

  auto W_gpu = es.eigenvalues();
  Mat V = es.eigenvectors();

  // Verify reconstruction.
  Mat A_hat = V * W_gpu.asDiagonal() * V.adjoint();
  RealScalar tol = RealScalar(5) * std::sqrt(static_cast<RealScalar>(n)) * NumTraits<Scalar>::epsilon() * A.norm();
  VERIFY((A_hat - A).norm() < tol);
}

// ---- Recompute (reuse solver object) ----------------------------------------

template <typename Scalar>
void test_eigen_recompute(Index n) {
  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
  using RealScalar = typename NumTraits<Scalar>::Real;

  GpuSelfAdjointEigenSolver<Scalar> es;

  for (int trial = 0; trial < 3; ++trial) {
    Mat R = Mat::Random(n, n);
    Mat A = R + R.adjoint();
    es.compute(A);
    VERIFY_IS_EQUAL(es.info(), Success);

    auto W = es.eigenvalues();
    Mat V = es.eigenvectors();
    Mat A_hat = V * W.asDiagonal() * V.adjoint();
    RealScalar tol = RealScalar(5) * std::sqrt(static_cast<RealScalar>(n)) * NumTraits<Scalar>::epsilon() * A.norm();
    VERIFY((A_hat - A).norm() < tol);
  }
}

// ---- Empty matrix -----------------------------------------------------------

void test_eigen_empty() {
  GpuSelfAdjointEigenSolver<double> es(MatrixXd(0, 0));
  VERIFY_IS_EQUAL(es.info(), Success);
  VERIFY_IS_EQUAL(es.rows(), 0);
  VERIFY_IS_EQUAL(es.cols(), 0);
}

// ---- Per-scalar driver ------------------------------------------------------

template <typename Scalar>
void test_scalar() {
  // Reconstruction + orthogonality.
  CALL_SUBTEST(test_eigen_reconstruction<Scalar>(64));
  CALL_SUBTEST(test_eigen_reconstruction<Scalar>(128));

  // Eigenvalues match CPU.
  CALL_SUBTEST(test_eigen_values<Scalar>(64));
  CALL_SUBTEST(test_eigen_values<Scalar>(128));

  // Values-only mode.
  CALL_SUBTEST(test_eigen_values_only<Scalar>(64));

  // DeviceMatrix input.
  CALL_SUBTEST(test_eigen_device_matrix<Scalar>(64));

  // Recompute.
  CALL_SUBTEST(test_eigen_recompute<Scalar>(32));
}

EIGEN_DECLARE_TEST(gpu_cusolver_eigen) {
  CALL_SUBTEST(test_scalar<float>());
  CALL_SUBTEST(test_scalar<double>());
  CALL_SUBTEST(test_scalar<std::complex<float>>());
  CALL_SUBTEST(test_scalar<std::complex<double>>());
  CALL_SUBTEST(test_eigen_empty());
}
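For reference, the property these tests verify is the spectral decomposition A = V diag(W) V^H with V unitary and W real for self-adjoint A. A minimal usage sketch, assuming only the API exercised above; the function name is illustrative:

#define EIGEN_USE_GPU
#include <Eigen/Eigenvalues>
#include <Eigen/GPU>

void spectral_decompose(const Eigen::MatrixXcd& R) {
  Eigen::MatrixXcd A = R + R.adjoint();        // make the input self-adjoint
  Eigen::GpuSelfAdjointEigenSolver<std::complex<double>> es(A);
  if (es.info() != Eigen::Success) return;
  auto W = es.eigenvalues();                   // real eigenvalues
  Eigen::MatrixXcd V = es.eigenvectors();      // unitary: V.adjoint()*V ≈ I
  Eigen::MatrixXcd A_hat = V * W.asDiagonal() * V.adjoint();  // reconstructs A
}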
210
test/gpu_cusolver_llt.cpp
Normal file
@@ -0,0 +1,210 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Eigen Authors
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

// Tests for GpuLLT: GPU Cholesky (LL^T) using cuSOLVER.
// Covers cusolverDnXpotrf (factorization) and cusolverDnXpotrs (solve)
// for float, double, complex<float>, complex<double>, Lower and Upper.

#define EIGEN_USE_GPU
#include "main.h"
#include <Eigen/Cholesky>
#include <Eigen/GPU>

using namespace Eigen;

// Build a random symmetric positive-definite matrix: A = M^H*M + n*I.
template <typename MatrixType>
MatrixType make_spd(Index n) {
  using Scalar = typename MatrixType::Scalar;
  MatrixType M = MatrixType::Random(n, n);
  return M.adjoint() * M + MatrixType::Identity(n, n) * static_cast<Scalar>(n);
}

// Test factorization: L*L^H must reconstruct A to within floating-point tolerance.
template <typename Scalar, int UpLo>
void test_potrf(Index n) {
  using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
  using RealScalar = typename NumTraits<Scalar>::Real;

  MatrixType A = make_spd<MatrixType>(n);

  GpuLLT<Scalar, UpLo> llt(A);
  VERIFY_IS_EQUAL(llt.info(), Success);

  // Reconstruct L*L^H and compare to original A.
  // GpuLLT stores the factor on device; use CPU LLT to get the triangular factor
  // for reconstruction since GpuLLT does not expose the device-resident factor directly.
  LLT<MatrixType, UpLo> ref(A);
  VERIFY_IS_EQUAL(ref.info(), Success);
  MatrixType A_reconstructed = ref.reconstructedMatrix();

  // Both should equal A to within 4*n*eps*||A||.
  RealScalar tol = RealScalar(4) * RealScalar(n) * NumTraits<Scalar>::epsilon() * A.norm();
  VERIFY((A_reconstructed - A).norm() < tol);

  // Smoke-test: llt.solve(b) should return the same result as ref.solve(b).
  MatrixType b = MatrixType::Random(n, 1);
  MatrixType x_gpu = llt.solve(b);
  MatrixType x_cpu = ref.solve(b);
  VERIFY((x_gpu - x_cpu).norm() < tol);
}

// Test solve: residual ||A*X - B|| / ||B|| must be small.
template <typename Scalar, int UpLo>
void test_potrs(Index n, Index nrhs) {
  using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
  using RealScalar = typename NumTraits<Scalar>::Real;

  MatrixType A = make_spd<MatrixType>(n);
  MatrixType B = MatrixType::Random(n, nrhs);

  GpuLLT<Scalar, UpLo> llt(A);
  VERIFY_IS_EQUAL(llt.info(), Success);

  MatrixType X = llt.solve(B);

  RealScalar residual = (A * X - B).norm() / B.norm();
  RealScalar tol = RealScalar(n) * NumTraits<Scalar>::epsilon();
  VERIFY(residual < tol);
}

// Test that multiple solves against the same factor all produce correct results.
// This exercises the key design property: L stays on device across calls.
template <typename Scalar>
void test_multiple_solves(Index n) {
  using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
  using RealScalar = typename NumTraits<Scalar>::Real;

  MatrixType A = make_spd<MatrixType>(n);
  GpuLLT<Scalar, Lower> llt(A);
  VERIFY_IS_EQUAL(llt.info(), Success);

  RealScalar tol = RealScalar(n) * NumTraits<Scalar>::epsilon();
  for (int k = 0; k < 5; ++k) {
    MatrixType B = MatrixType::Random(n, 3);
    MatrixType X = llt.solve(B);
    RealScalar residual = (A * X - B).norm() / B.norm();
    VERIFY(residual < tol);
  }
}

// Test that GpuLLT correctly detects a non-SPD matrix.
void test_not_spd() {
  MatrixXd A = -MatrixXd::Identity(8, 8);  // negative definite
  GpuLLT<double> llt(A);
  VERIFY_IS_EQUAL(llt.info(), NumericalIssue);
}

// ---- DeviceMatrix integration tests -----------------------------------------

// compute(DeviceMatrix) + solve(DeviceMatrix) → toHost
template <typename Scalar, int UpLo>
void test_device_matrix_solve(Index n, Index nrhs) {
  using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
  using RealScalar = typename NumTraits<Scalar>::Real;

  MatrixType A = make_spd<MatrixType>(n);
  MatrixType B = MatrixType::Random(n, nrhs);

  auto d_A = DeviceMatrix<Scalar>::fromHost(A);
  auto d_B = DeviceMatrix<Scalar>::fromHost(B);

  GpuLLT<Scalar, UpLo> llt;
  llt.compute(d_A);
  VERIFY_IS_EQUAL(llt.info(), Success);

  DeviceMatrix<Scalar> d_X = llt.solve(d_B);
  MatrixType X = d_X.toHost();

  RealScalar residual = (A * X - B).norm() / B.norm();
  VERIFY(residual < RealScalar(n) * NumTraits<Scalar>::epsilon());
}

// compute(DeviceMatrix&&) — move path
template <typename Scalar>
void test_device_matrix_move_compute(Index n) {
  using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
  using RealScalar = typename NumTraits<Scalar>::Real;

  MatrixType A = make_spd<MatrixType>(n);
  MatrixType B = MatrixType::Random(n, 1);

  auto d_A = DeviceMatrix<Scalar>::fromHost(A);
  GpuLLT<Scalar, Lower> llt;
  llt.compute(std::move(d_A));
  VERIFY_IS_EQUAL(llt.info(), Success);

  // d_A should be empty after move.
  VERIFY(d_A.empty());

  MatrixType X = llt.solve(B);
  RealScalar residual = (A * X - B).norm() / B.norm();
  VERIFY(residual < RealScalar(n) * NumTraits<Scalar>::epsilon());
}

// Full async chain: compute → solve → solve again with result as RHS → toHost
template <typename Scalar>
void test_chaining(Index n) {
  using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
  using RealScalar = typename NumTraits<Scalar>::Real;

  MatrixType A = make_spd<MatrixType>(n);
  MatrixType B = MatrixType::Random(n, 3);

  auto d_A = DeviceMatrix<Scalar>::fromHost(A);
  auto d_B = DeviceMatrix<Scalar>::fromHost(B);

  GpuLLT<Scalar, Lower> llt;
  llt.compute(d_A);
  VERIFY_IS_EQUAL(llt.info(), Success);

  // Chain: solve → use result as RHS for another solve.
  DeviceMatrix<Scalar> d_X = llt.solve(d_B);
  DeviceMatrix<Scalar> d_Y = llt.solve(d_X);

  // Only sync at the very end.
  MatrixType Y = d_Y.toHost();

  // Verify: Y = A^{-2} * B.
  MatrixType X_ref = LLT<MatrixType, Lower>(A).solve(B);
  MatrixType Y_ref = LLT<MatrixType, Lower>(A).solve(X_ref);

  RealScalar tol = RealScalar(4) * RealScalar(n) * NumTraits<Scalar>::epsilon() * Y_ref.norm();
  VERIFY((Y - Y_ref).norm() < tol);
}

template <typename Scalar>
void test_scalar() {
  CALL_SUBTEST((test_potrf<Scalar, Lower>(1)));
  CALL_SUBTEST((test_potrf<Scalar, Lower>(64)));
  CALL_SUBTEST((test_potrf<Scalar, Lower>(256)));
  CALL_SUBTEST((test_potrf<Scalar, Upper>(64)));
  CALL_SUBTEST((test_potrf<Scalar, Upper>(256)));

  CALL_SUBTEST((test_potrs<Scalar, Lower>(64, 1)));
  CALL_SUBTEST((test_potrs<Scalar, Lower>(64, 4)));
  CALL_SUBTEST((test_potrs<Scalar, Lower>(256, 8)));
  CALL_SUBTEST((test_potrs<Scalar, Upper>(64, 1)));
  CALL_SUBTEST((test_potrs<Scalar, Upper>(256, 4)));

  CALL_SUBTEST(test_multiple_solves<Scalar>(128));

  CALL_SUBTEST((test_device_matrix_solve<Scalar, Lower>(64, 4)));
  CALL_SUBTEST((test_device_matrix_solve<Scalar, Upper>(128, 1)));
  CALL_SUBTEST(test_device_matrix_move_compute<Scalar>(64));
  CALL_SUBTEST(test_chaining<Scalar>(64));
}

EIGEN_DECLARE_TEST(gpu_cusolver_llt) {
  CALL_SUBTEST(test_scalar<float>());
  CALL_SUBTEST(test_scalar<double>());
  CALL_SUBTEST(test_scalar<std::complex<float>>());
  CALL_SUBTEST(test_scalar<std::complex<double>>());
  CALL_SUBTEST(test_not_spd());
}
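The DeviceMatrix tests above establish the async usage pattern worth calling out: keep every operand on the device and synchronize once at the end. A condensed sketch of the chain exercised in test_chaining, assuming only the compute/solve/toHost API shown there; the function name is illustrative:

#define EIGEN_USE_GPU
#include <Eigen/Cholesky>
#include <Eigen/GPU>

// Computes Y = A^{-2} * B for SPD A with one upload and one download.
Eigen::MatrixXd solve_twice_on_device(const Eigen::MatrixXd& A, const Eigen::MatrixXd& B) {
  auto d_A = Eigen::DeviceMatrix<double>::fromHost(A);  // host-to-device copy
  auto d_B = Eigen::DeviceMatrix<double>::fromHost(B);

  Eigen::GpuLLT<double, Eigen::Lower> llt;
  llt.compute(d_A);                                     // factor stays device-resident
  Eigen::DeviceMatrix<double> d_X = llt.solve(d_B);     // no host round-trip
  Eigen::DeviceMatrix<double> d_Y = llt.solve(d_X);     // chain device result as next RHS
  return d_Y.toHost();                                  // single synchronizing copy back
}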
206
test/gpu_cusolver_lu.cpp
Normal file
@@ -0,0 +1,206 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Eigen Authors
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

// Tests for GpuLU: GPU partial-pivoting LU decomposition via cuSOLVER.
// Covers cusolverDnXgetrf (factorization) and cusolverDnXgetrs (solve)
// for float, double, complex<float>, complex<double>.

#define EIGEN_USE_GPU
#include "main.h"
#include <Eigen/LU>
#include <Eigen/GPU>

using namespace Eigen;

// ---- Test factorization + NoTrans solve: residual ||A*X - B|| / (||A||*||X||)

template <typename Scalar>
void test_getrf(Index n) {
  using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
  using RealScalar = typename NumTraits<Scalar>::Real;

  MatrixType A = MatrixType::Random(n, n);
  MatrixType B = MatrixType::Random(n, 4);

  GpuLU<Scalar> lu(A);
  VERIFY_IS_EQUAL(lu.info(), Success);

  MatrixType X = lu.solve(B);
  // Backward error bound for LU: ||A*X - B|| <= O(n*u) * ||A|| * ||X||.
  // Normalize by ||A||*||X|| rather than ||B|| to be condition-number agnostic.
  RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm());
  VERIFY(residual < RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon());
}

// ---- Test solve: A^T*X = B and A^H*X = B ------------------------------------

template <typename Scalar>
void test_getrs_trans(Index n) {
  using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
  using RealScalar = typename NumTraits<Scalar>::Real;

  MatrixType A = MatrixType::Random(n, n);
  MatrixType B = MatrixType::Random(n, 3);
  RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();

  GpuLU<Scalar> lu(A);
  VERIFY_IS_EQUAL(lu.info(), Success);

  MatrixType Xt = lu.solve(B, GpuLU<Scalar>::Transpose);
  VERIFY((A.transpose() * Xt - B).norm() / (A.norm() * Xt.norm()) < tol);

  MatrixType Xc = lu.solve(B, GpuLU<Scalar>::ConjugateTranspose);
  VERIFY((A.adjoint() * Xc - B).norm() / (A.norm() * Xc.norm()) < tol);
}

// ---- Test multiple solves reuse the device-resident LU ----------------------

template <typename Scalar>
void test_multiple_solves(Index n) {
  using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
  using RealScalar = typename NumTraits<Scalar>::Real;

  MatrixType A = MatrixType::Random(n, n);
  GpuLU<Scalar> lu(A);
  VERIFY_IS_EQUAL(lu.info(), Success);

  RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
  for (int k = 0; k < 5; ++k) {
    MatrixType B = MatrixType::Random(n, 3);
    MatrixType X = lu.solve(B);
    VERIFY((A * X - B).norm() / (A.norm() * X.norm()) < tol);
  }
}

// ---- Agreement with CPU PartialPivLU ----------------------------------------

template <typename Scalar>
void test_vs_cpu(Index n) {
  using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
  using RealScalar = typename NumTraits<Scalar>::Real;

  MatrixType A = MatrixType::Random(n, n);
  MatrixType B = MatrixType::Random(n, 5);

  GpuLU<Scalar> gpu_lu(A);
  VERIFY_IS_EQUAL(gpu_lu.info(), Success);

  MatrixType X_gpu = gpu_lu.solve(B);
  MatrixType X_cpu = PartialPivLU<MatrixType>(A).solve(B);

  RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
  VERIFY((X_gpu - X_cpu).norm() / X_cpu.norm() < tol);
}

// ---- Singular matrix detection ----------------------------------------------

void test_singular() {
  MatrixXd A = MatrixXd::Zero(8, 8);
  GpuLU<double> lu(A);
  VERIFY_IS_EQUAL(lu.info(), NumericalIssue);
}

// ---- DeviceMatrix integration tests -----------------------------------------

template <typename Scalar>
void test_device_matrix_solve(Index n) {
  using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
  using RealScalar = typename NumTraits<Scalar>::Real;

  MatrixType A = MatrixType::Random(n, n);
  MatrixType B = MatrixType::Random(n, 4);

  auto d_A = DeviceMatrix<Scalar>::fromHost(A);
  auto d_B = DeviceMatrix<Scalar>::fromHost(B);

  GpuLU<Scalar> lu;
  lu.compute(d_A);
  VERIFY_IS_EQUAL(lu.info(), Success);

  DeviceMatrix<Scalar> d_X = lu.solve(d_B);
  MatrixType X = d_X.toHost();

  RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm());
  VERIFY(residual < RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon());
}

template <typename Scalar>
void test_device_matrix_move_compute(Index n) {
  using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
  using RealScalar = typename NumTraits<Scalar>::Real;

  MatrixType A = MatrixType::Random(n, n);
  MatrixType B = MatrixType::Random(n, 1);

  auto d_A = DeviceMatrix<Scalar>::fromHost(A);
  GpuLU<Scalar> lu;
  lu.compute(std::move(d_A));
  VERIFY_IS_EQUAL(lu.info(), Success);
  VERIFY(d_A.empty());

  MatrixType X = lu.solve(B);
  RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm());
  VERIFY(residual < RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon());
}

template <typename Scalar>
void test_chaining(Index n) {
  using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
  using RealScalar = typename NumTraits<Scalar>::Real;

  MatrixType A = MatrixType::Random(n, n);
  MatrixType B = MatrixType::Random(n, 3);

  auto d_A = DeviceMatrix<Scalar>::fromHost(A);
  auto d_B = DeviceMatrix<Scalar>::fromHost(B);

  GpuLU<Scalar> lu;
  lu.compute(d_A);
  VERIFY_IS_EQUAL(lu.info(), Success);

  // Chain: solve → use result as RHS.
  DeviceMatrix<Scalar> d_X = lu.solve(d_B);
  DeviceMatrix<Scalar> d_Y = lu.solve(d_X);
  MatrixType Y = d_Y.toHost();

  MatrixType X_ref = PartialPivLU<MatrixType>(A).solve(B);
  MatrixType Y_ref = PartialPivLU<MatrixType>(A).solve(X_ref);

  RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon() * Y_ref.norm();
  VERIFY((Y - Y_ref).norm() < tol);
}

// ---- Per-scalar driver --------------------------------------------------------

template <typename Scalar>
void test_scalar() {
  CALL_SUBTEST(test_getrf<Scalar>(1));
  CALL_SUBTEST(test_getrf<Scalar>(64));
  CALL_SUBTEST(test_getrf<Scalar>(256));

  CALL_SUBTEST(test_getrs_trans<Scalar>(64));
  CALL_SUBTEST(test_getrs_trans<Scalar>(128));

  CALL_SUBTEST(test_multiple_solves<Scalar>(128));

  CALL_SUBTEST(test_vs_cpu<Scalar>(64));
  CALL_SUBTEST(test_vs_cpu<Scalar>(256));

  CALL_SUBTEST(test_device_matrix_solve<Scalar>(64));
  CALL_SUBTEST(test_device_matrix_move_compute<Scalar>(64));
  CALL_SUBTEST(test_chaining<Scalar>(64));
}

EIGEN_DECLARE_TEST(gpu_cusolver_lu) {
  CALL_SUBTEST(test_scalar<float>());
  CALL_SUBTEST(test_scalar<double>());
  CALL_SUBTEST(test_scalar<std::complex<float>>());
  CALL_SUBTEST(test_scalar<std::complex<double>>());
  CALL_SUBTEST(test_singular());
}
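The normalization used throughout these LU tests comes from the standard backward-error bound for partially pivoted LU: the computed solution x̂ satisfies

\[
\frac{\lVert A\hat{x} - b \rVert}{\lVert A \rVert \, \lVert \hat{x} \rVert} \le c \, n \, u,
\]

with u the unit roundoff and c a modest constant, independent of cond(A). Dividing the residual by ||b|| instead would make the test sensitive to the condition number of the random A, which is exactly what the comment in test_getrf is guarding against.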
185
test/gpu_cusolver_qr.cpp
Normal file
@@ -0,0 +1,185 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

// Tests for GpuQR: GPU QR decomposition via cuSOLVER.

#define EIGEN_USE_GPU
#include "main.h"
#include <Eigen/QR>
#include <Eigen/GPU>

using namespace Eigen;

// ---- Solve square system: A * X = B -----------------------------------------

template <typename Scalar>
void test_qr_solve_square(Index n, Index nrhs) {
  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
  using RealScalar = typename NumTraits<Scalar>::Real;

  Mat A = Mat::Random(n, n);
  Mat B = Mat::Random(n, nrhs);

  GpuQR<Scalar> qr(A);
  VERIFY_IS_EQUAL(qr.info(), Success);

  Mat X = qr.solve(B);
  RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm());
  VERIFY(residual < RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon());
}

// ---- Solve overdetermined system: m > n (least-squares) ---------------------

template <typename Scalar>
void test_qr_solve_overdetermined(Index m, Index n, Index nrhs) {
  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
  using RealScalar = typename NumTraits<Scalar>::Real;

  eigen_assert(m >= n);
  Mat A = Mat::Random(m, n);
  Mat B = Mat::Random(m, nrhs);

  GpuQR<Scalar> qr(A);
  VERIFY_IS_EQUAL(qr.info(), Success);

  Mat X = qr.solve(B);
  VERIFY_IS_EQUAL(X.rows(), n);
  VERIFY_IS_EQUAL(X.cols(), nrhs);

  // Compare with CPU QR.
  Mat X_cpu = HouseholderQR<Mat>(A).solve(B);
  RealScalar tol = RealScalar(100) * RealScalar(m) * NumTraits<Scalar>::epsilon();
  VERIFY((X - X_cpu).norm() / X_cpu.norm() < tol);
}

// ---- Solve with DeviceMatrix input ------------------------------------------

template <typename Scalar>
void test_qr_solve_device(Index n, Index nrhs) {
  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
  using RealScalar = typename NumTraits<Scalar>::Real;

  Mat A = Mat::Random(n, n);
  Mat B = Mat::Random(n, nrhs);

  auto d_A = DeviceMatrix<Scalar>::fromHost(A);
  auto d_B = DeviceMatrix<Scalar>::fromHost(B);

  GpuQR<Scalar> qr;
  qr.compute(d_A);
  VERIFY_IS_EQUAL(qr.info(), Success);

  DeviceMatrix<Scalar> d_X = qr.solve(d_B);
  Mat X = d_X.toHost();

  RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm());
  VERIFY(residual < RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon());
}

// ---- Solve overdetermined via device path -----------------------------------

template <typename Scalar>
void test_qr_solve_overdetermined_device(Index m, Index n, Index nrhs) {
  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
  using RealScalar = typename NumTraits<Scalar>::Real;

  eigen_assert(m >= n);
  Mat A = Mat::Random(m, n);
  Mat B = Mat::Random(m, nrhs);

  auto d_A = DeviceMatrix<Scalar>::fromHost(A);
  auto d_B = DeviceMatrix<Scalar>::fromHost(B);

  GpuQR<Scalar> qr;
  qr.compute(d_A);
  VERIFY_IS_EQUAL(qr.info(), Success);

  DeviceMatrix<Scalar> d_X = qr.solve(d_B);
  VERIFY_IS_EQUAL(d_X.rows(), n);
  VERIFY_IS_EQUAL(d_X.cols(), nrhs);

  Mat X = d_X.toHost();
  Mat X_cpu = HouseholderQR<Mat>(A).solve(B);
  RealScalar tol = RealScalar(100) * RealScalar(m) * NumTraits<Scalar>::epsilon();
  VERIFY((X - X_cpu).norm() / X_cpu.norm() < tol);
}

// ---- Multiple solves reuse the factorization --------------------------------

template <typename Scalar>
void test_qr_multiple_solves(Index n) {
  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
  using RealScalar = typename NumTraits<Scalar>::Real;

  Mat A = Mat::Random(n, n);
  GpuQR<Scalar> qr(A);
  VERIFY_IS_EQUAL(qr.info(), Success);

  RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
  for (int k = 0; k < 5; ++k) {
    Mat B = Mat::Random(n, 3);
    Mat X = qr.solve(B);
    RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm());
    VERIFY(residual < tol);
  }
}

// ---- Agreement with CPU HouseholderQR ---------------------------------------

template <typename Scalar>
void test_qr_vs_cpu(Index n, Index nrhs) {
  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
  using RealScalar = typename NumTraits<Scalar>::Real;

  Mat A = Mat::Random(n, n);
  Mat B = Mat::Random(n, nrhs);

  GpuQR<Scalar> gpu_qr(A);
  VERIFY_IS_EQUAL(gpu_qr.info(), Success);

  Mat X_gpu = gpu_qr.solve(B);
  Mat X_cpu = HouseholderQR<Mat>(A).solve(B);

  RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
  VERIFY((X_gpu - X_cpu).norm() / X_cpu.norm() < tol);
}

// ---- Per-scalar driver ------------------------------------------------------

template <typename Scalar>
void test_scalar() {
  CALL_SUBTEST(test_qr_solve_square<Scalar>(1, 1));
  CALL_SUBTEST(test_qr_solve_square<Scalar>(64, 1));
  CALL_SUBTEST(test_qr_solve_square<Scalar>(64, 4));
  CALL_SUBTEST(test_qr_solve_square<Scalar>(256, 8));

  CALL_SUBTEST(test_qr_solve_overdetermined<Scalar>(128, 64, 4));
  CALL_SUBTEST(test_qr_solve_overdetermined<Scalar>(256, 128, 1));

  CALL_SUBTEST(test_qr_solve_device<Scalar>(64, 4));
  CALL_SUBTEST(test_qr_solve_overdetermined_device<Scalar>(128, 64, 4));
  CALL_SUBTEST(test_qr_multiple_solves<Scalar>(64));
  CALL_SUBTEST(test_qr_vs_cpu<Scalar>(64, 4));
  CALL_SUBTEST(test_qr_vs_cpu<Scalar>(256, 8));
}

void test_qr_empty() {
  GpuQR<double> qr(MatrixXd(0, 0));
  VERIFY_IS_EQUAL(qr.info(), Success);
  VERIFY_IS_EQUAL(qr.rows(), 0);
  VERIFY_IS_EQUAL(qr.cols(), 0);
}

EIGEN_DECLARE_TEST(gpu_cusolver_qr) {
  CALL_SUBTEST(test_scalar<float>());
  CALL_SUBTEST(test_scalar<double>());
  CALL_SUBTEST(test_scalar<std::complex<float>>());
  CALL_SUBTEST(test_scalar<std::complex<double>>());
  CALL_SUBTEST(test_qr_empty());
}
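In the overdetermined tests, the n-row result returned for an m-row right-hand side is the least-squares solution, which is why HouseholderQR is the natural CPU reference: with the thin QR factorization A = Q_1 R_1 (m ≥ n, full column rank),

\[
x = \arg\min_x \lVert Ax - b \rVert_2 = R_1^{-1} Q_1^H b,
\]

so both solvers should agree up to roundoff, which the relative-difference check against X_cpu verifies.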
194
test/gpu_cusolver_svd.cpp
Normal file
@@ -0,0 +1,194 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

// Tests for GpuSVD: GPU SVD via cuSOLVER.

#define EIGEN_USE_GPU
#include "main.h"
#include <Eigen/SVD>
#include <Eigen/GPU>

using namespace Eigen;

// ---- SVD reconstruction: U * diag(S) * VT ≈ A ------------------------------

template <typename Scalar, unsigned int Options>
void test_svd_reconstruction(Index m, Index n) {
  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
  using RealScalar = typename NumTraits<Scalar>::Real;

  Mat A = Mat::Random(m, n);
  GpuSVD<Scalar> svd(A, Options);
  VERIFY_IS_EQUAL(svd.info(), Success);

  auto S = svd.singularValues();
  Mat U = svd.matrixU();
  Mat VT = svd.matrixVT();

  const Index k = (std::min)(m, n);

  // Reconstruct: A_hat = U[:,:k] * diag(S) * VT[:k,:].
  Mat A_hat = U.leftCols(k) * S.asDiagonal() * VT.topRows(k);
  RealScalar tol = RealScalar(5) * std::sqrt(static_cast<RealScalar>(k)) * NumTraits<Scalar>::epsilon() * A.norm();
  VERIFY((A_hat - A).norm() < tol);

  // Orthogonality: U^H * U ≈ I.
  Mat UtU = U.adjoint() * U;
  Mat I_u = Mat::Identity(U.cols(), U.cols());
  VERIFY((UtU - I_u).norm() < tol);

  // Orthogonality: VT * VT^H ≈ I.
  Mat VtVh = VT * VT.adjoint();
  Mat I_v = Mat::Identity(VT.rows(), VT.rows());
  VERIFY((VtVh - I_v).norm() < tol);
}

// ---- Singular values match CPU BDCSVD ---------------------------------------

template <typename Scalar>
void test_svd_singular_values(Index m, Index n) {
  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
  using RealScalar = typename NumTraits<Scalar>::Real;

  Mat A = Mat::Random(m, n);
  GpuSVD<Scalar> svd(A, 0);  // values only
  VERIFY_IS_EQUAL(svd.info(), Success);

  auto S_gpu = svd.singularValues();
  auto S_cpu = BDCSVD<Mat>(A, 0).singularValues();

  RealScalar tol =
      RealScalar(5) * std::sqrt(static_cast<RealScalar>((std::min)(m, n))) * NumTraits<Scalar>::epsilon() * S_cpu(0);
  VERIFY((S_gpu - S_cpu).norm() < tol);
}

// ---- Solve: pseudoinverse ---------------------------------------------------

template <typename Scalar>
void test_svd_solve(Index m, Index n, Index nrhs) {
  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
  using RealScalar = typename NumTraits<Scalar>::Real;

  Mat A = Mat::Random(m, n);
  Mat B = Mat::Random(m, nrhs);

  GpuSVD<Scalar> svd(A, ComputeThinU | ComputeThinV);
  VERIFY_IS_EQUAL(svd.info(), Success);

  Mat X = svd.solve(B);
  VERIFY_IS_EQUAL(X.rows(), n);
  VERIFY_IS_EQUAL(X.cols(), nrhs);

  // Compare with CPU BDCSVD solve.
  Mat X_cpu = BDCSVD<Mat>(A, ComputeThinU | ComputeThinV).solve(B);
  RealScalar tol = RealScalar(100) * RealScalar((std::max)(m, n)) * NumTraits<Scalar>::epsilon();
  VERIFY((X - X_cpu).norm() / X_cpu.norm() < tol);
}

// ---- Solve: truncated -------------------------------------------------------

template <typename Scalar>
void test_svd_solve_truncated(Index m, Index n) {
  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
  using RealScalar = typename NumTraits<Scalar>::Real;

  Mat A = Mat::Random(m, n);
  Mat B = Mat::Random(m, 1);
  const Index k = (std::min)(m, n);
  const Index trunc = k / 2;
  eigen_assert(trunc > 0);

  GpuSVD<Scalar> svd(A, ComputeThinU | ComputeThinV);
  Mat X_trunc = svd.solve(B, trunc);

  // Build CPU reference: truncated pseudoinverse.
  auto cpu_svd = BDCSVD<Mat>(A, ComputeThinU | ComputeThinV);
  auto S = cpu_svd.singularValues();
  Mat U = cpu_svd.matrixU();
  Mat V = cpu_svd.matrixV();

  // D_ii = 1/S_i for i < trunc, 0 otherwise.
  Matrix<RealScalar, Dynamic, 1> D = Matrix<RealScalar, Dynamic, 1>::Zero(k);
  for (Index i = 0; i < trunc; ++i) D(i) = RealScalar(1) / S(i);
  Mat X_ref = V * D.asDiagonal() * U.adjoint() * B;

  RealScalar tol = RealScalar(100) * RealScalar(k) * NumTraits<Scalar>::epsilon();
  VERIFY((X_trunc - X_ref).norm() / X_ref.norm() < tol);
}

// ---- Solve: Tikhonov regularized --------------------------------------------

template <typename Scalar>
void test_svd_solve_regularized(Index m, Index n) {
  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
  using RealScalar = typename NumTraits<Scalar>::Real;

  Mat A = Mat::Random(m, n);
  Mat B = Mat::Random(m, 1);
  RealScalar lambda = RealScalar(0.1);
  const Index k = (std::min)(m, n);

  GpuSVD<Scalar> svd(A, ComputeThinU | ComputeThinV);
  Mat X_reg = svd.solve(B, lambda);

  // CPU reference: D_ii = S_i / (S_i^2 + lambda^2).
  auto cpu_svd = BDCSVD<Mat>(A, ComputeThinU | ComputeThinV);
  auto S = cpu_svd.singularValues();
  Mat U = cpu_svd.matrixU();
  Mat V = cpu_svd.matrixV();

  Matrix<RealScalar, Dynamic, 1> D(k);
  for (Index i = 0; i < k; ++i) D(i) = S(i) / (S(i) * S(i) + lambda * lambda);
  Mat X_ref = V * D.asDiagonal() * U.adjoint() * B;

  RealScalar tol = RealScalar(100) * RealScalar(k) * NumTraits<Scalar>::epsilon();
  VERIFY((X_reg - X_ref).norm() / X_ref.norm() < tol);
}

// ---- Empty matrix -----------------------------------------------------------

void test_svd_empty() {
  GpuSVD<double> svd(MatrixXd(0, 0), 0);
  VERIFY_IS_EQUAL(svd.info(), Success);
  VERIFY_IS_EQUAL(svd.rows(), 0);
  VERIFY_IS_EQUAL(svd.cols(), 0);
}

// ---- Per-scalar driver ------------------------------------------------------

template <typename Scalar>
void test_scalar() {
  // Reconstruction + orthogonality (thin and full, identical test logic).
  CALL_SUBTEST((test_svd_reconstruction<Scalar, ComputeThinU | ComputeThinV>(64, 64)));
  CALL_SUBTEST((test_svd_reconstruction<Scalar, ComputeThinU | ComputeThinV>(128, 64)));
  CALL_SUBTEST((test_svd_reconstruction<Scalar, ComputeThinU | ComputeThinV>(64, 128)));  // wide (m < n)
  CALL_SUBTEST((test_svd_reconstruction<Scalar, ComputeFullU | ComputeFullV>(64, 64)));
  CALL_SUBTEST((test_svd_reconstruction<Scalar, ComputeFullU | ComputeFullV>(128, 64)));

  // Singular values.
  CALL_SUBTEST(test_svd_singular_values<Scalar>(64, 64));
  CALL_SUBTEST(test_svd_singular_values<Scalar>(128, 64));

  // Solve.
  CALL_SUBTEST(test_svd_solve<Scalar>(64, 64, 4));
  CALL_SUBTEST(test_svd_solve<Scalar>(128, 64, 4));
  CALL_SUBTEST(test_svd_solve<Scalar>(64, 128, 4));  // wide (m < n)

  // Truncated and regularized solve.
  CALL_SUBTEST(test_svd_solve_truncated<Scalar>(64, 64));
  CALL_SUBTEST(test_svd_solve_regularized<Scalar>(64, 64));
}

EIGEN_DECLARE_TEST(gpu_cusolver_svd) {
  CALL_SUBTEST(test_scalar<float>());
  CALL_SUBTEST(test_scalar<double>());
  CALL_SUBTEST(test_scalar<std::complex<float>>());
  CALL_SUBTEST(test_scalar<std::complex<double>>());
  CALL_SUBTEST(test_svd_empty());
}
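The truncated and Tikhonov-regularized solves tested above are two spectral filters applied to the same SVD. Written out, with σ_i, u_i, v_i the singular triplets of A and t the truncation rank, the CPU references built in the tests compute

\[
x_{\mathrm{trunc}} = \sum_{i < t} \frac{u_i^H b}{\sigma_i}\, v_i, \qquad
x_{\lambda} = \sum_{i} \frac{\sigma_i}{\sigma_i^2 + \lambda^2}\,(u_i^H b)\, v_i.
\]

Truncation zeroes the contribution of small singular values outright, while the Tikhonov filter damps them smoothly; as λ → 0 the regularized solution recovers the plain pseudoinverse solve.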
Some files were not shown because too many files have changed in this diff