mirror of https://gitlab.com/libeigen/eigen.git
synced 2026-04-10 11:34:33 +08:00

Compare commits
gpu-modern ... revert-b1d (1 commit)

| Author | SHA1 | Date |
|---|---|---|
| | 111c4d23a9 | |
@@ -672,7 +672,7 @@ if (EIGEN_BUILD_TESTING)
endif()

set(EIGEN_CUDA_CXX_FLAGS "" CACHE STRING "Additional flags to pass to the cuda compiler.")
set(EIGEN_CUDA_COMPUTE_ARCH 70 CACHE STRING "The CUDA compute architecture(s) to target when compiling CUDA code")
set(EIGEN_CUDA_COMPUTE_ARCH 30 CACHE STRING "The CUDA compute architecture(s) to target when compiling CUDA code")

option(EIGEN_TEST_SYCL "Add Sycl support." OFF)
if(EIGEN_TEST_SYCL)
@@ -817,3 +817,4 @@ endif()
message(STATUS "")
message(STATUS "Configured Eigen ${EIGEN_VERSION_STRING}")
message(STATUS "")

@@ -50,9 +50,9 @@
#include "src/Core/util/AOCL_Support.h"

// EIGEN_HAS_GPU_FP16 is now always true when compiling with CUDA or HIP.
// Use EIGEN_GPUCC (compile-time) or EIGEN_GPU_COMPILE_PHASE (device phase) instead.
// TODO: Remove EIGEN_HAS_GPU_BF16 similarly once HIP bf16 guards are cleaned up.
#if defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)
#define EIGEN_HAS_GPU_FP16
#endif

#if defined(EIGEN_HAS_CUDA_BF16) || defined(EIGEN_HAS_HIP_BF16)
#define EIGEN_HAS_GPU_BF16

@@ -858,8 +858,16 @@ struct hash<Eigen::bfloat16> {
} // namespace std
#endif

// Warp shuffle overloads for Eigen::bfloat16.
// HIP uses non-sync __shfl variants; CUDA has native __nv_bfloat16 support in __shfl_sync.
// Add the missing shfl* intrinsics.
// The __shfl* functions are only valid on HIP or __CUDA_ARCH__ >= 300.
// CUDA defines them for (__CUDA_ARCH__ >= 300 || !defined(__CUDA_ARCH__))
//
// HIP and CUDA prior to SDK 9.0 define
// __shfl, __shfl_up, __shfl_down, __shfl_xor for int and float
// CUDA since 9.0 deprecates those and instead defines
// __shfl_sync, __shfl_up_sync, __shfl_down_sync, __shfl_xor_sync,
// with native support for __half and __nv_bfloat16
//
// Note that the following are __device__-only functions.
#if defined(EIGEN_HIPCC)

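As an aside, the bit-reinterpretation idiom these overloads depend on can be shown in isolation. The sketch below is hypothetical (the helper name and the choice of __shfl_xor_sync are assumptions, valid for CUDA SDK >= 9.0); it mirrors the pattern the Eigen::half overloads further down in this diff use: move the 16-bit payload into an int, shuffle, move it back.

```cpp
// Hypothetical sketch, not part of the patch: shuffle a 16-bit type through the
// int-only warp intrinsics by round-tripping its raw bits.
__device__ inline Eigen::bfloat16 shfl_xor_bf16(unsigned mask, Eigen::bfloat16 var,
                                                int laneMask, int width = warpSize) {
  const int ivar = static_cast<int>(Eigen::numext::bit_cast<Eigen::numext::uint16_t>(var));
  const int shuffled = __shfl_xor_sync(mask, ivar, laneMask, width);
  return Eigen::numext::bit_cast<Eigen::bfloat16>(static_cast<Eigen::numext::uint16_t>(shuffled));
}
```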
@@ -141,158 +141,69 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog2_float(const Pac
  return plog_impl_float<Packet, /* base2 */ true>(_x);
}

// -----------------------------------------------------------------------
// Double logarithm: shared polynomial + two range-reduction backends
// -----------------------------------------------------------------------

// Cephes rational-polynomial approximation of log(1+f) for
// f in [sqrt(0.5)-1, sqrt(2)-1].
// Evaluates x - 0.5*x^2 + x^3 * P(x)/Q(x) where P and Q are degree-5.
// See: http://www.netlib.org/cephes/
template <typename Packet>
EIGEN_STRONG_INLINE Packet plog_mantissa_double(const Packet x) {
  const Packet cst_cephes_log_p0 = pset1<Packet>(1.01875663804580931796E-4);
  const Packet cst_cephes_log_p1 = pset1<Packet>(4.97494994976747001425E-1);
  const Packet cst_cephes_log_p2 = pset1<Packet>(4.70579119878881725854E0);
  const Packet cst_cephes_log_p3 = pset1<Packet>(1.44989225341610930846E1);
  const Packet cst_cephes_log_p4 = pset1<Packet>(1.79368678507819816313E1);
  const Packet cst_cephes_log_p5 = pset1<Packet>(7.70838733755885391666E0);
  // Q0 = 1.0; pmadd(1, x, q1) simplifies to padd(x, q1).
  const Packet cst_cephes_log_q1 = pset1<Packet>(1.12873587189167450590E1);
  const Packet cst_cephes_log_q2 = pset1<Packet>(4.52279145837532221105E1);
  const Packet cst_cephes_log_q3 = pset1<Packet>(8.29875266912776603211E1);
  const Packet cst_cephes_log_q4 = pset1<Packet>(7.11544750618563894466E1);
  const Packet cst_cephes_log_q5 = pset1<Packet>(2.31251620126765340583E1);

  Packet x2 = pmul(x, x);
  Packet x3 = pmul(x2, x);

  // Evaluate P and Q simultaneously for better ILP.
  Packet y, y1, y_;
  y = pmadd(cst_cephes_log_p0, x, cst_cephes_log_p1);
  y1 = pmadd(cst_cephes_log_p3, x, cst_cephes_log_p4);
  y = pmadd(y, x, cst_cephes_log_p2);
  y1 = pmadd(y1, x, cst_cephes_log_p5);
  y_ = pmadd(y, x3, y1);

  y = padd(x, cst_cephes_log_q1);
  y1 = pmadd(cst_cephes_log_q3, x, cst_cephes_log_q4);
  y = pmadd(y, x, cst_cephes_log_q2);
  y1 = pmadd(y1, x, cst_cephes_log_q5);
  y = pmadd(y, x3, y1);

  y_ = pmul(y_, x3);
  y = pdiv(y_, y);
  y = pnmadd(pset1<Packet>(0.5), x2, y);
  return padd(x, y);
}

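For reference, here is the same [5/5] rational approximation written as plain scalar code; this is a sketch (hypothetical helper name), using straightforward Horner evaluation instead of the factored ILP-friendly scheme above.

```cpp
// log(1+f) for f in [sqrt(0.5)-1, sqrt(2)-1], Cephes coefficients as above.
double log1f_cephes_scalar(double f) {
  const double P[6] = {1.01875663804580931796E-4, 4.97494994976747001425E-1,
                       4.70579119878881725854E0,  1.44989225341610930846E1,
                       1.79368678507819816313E1,  7.70838733755885391666E0};
  const double Q[6] = {1.0,                       1.12873587189167450590E1,
                       4.52279145837532221105E1,  8.29875266912776603211E1,
                       7.11544750618563894466E1,  2.31251620126765340583E1};
  double p = P[0], q = Q[0];
  for (int i = 1; i < 6; ++i) {
    p = p * f + P[i];  // Horner for P(f)
    q = q * f + Q[i];  // Horner for Q(f)
  }
  return f - 0.5 * f * f + f * f * f * (p / q);
}
```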
// Detect whether unpacket_traits<Packet>::integer_packet is defined.
template <typename Packet, typename = void>
struct packet_has_integer_packet : std::false_type {};
template <typename Packet>
struct packet_has_integer_packet<Packet, void_t<typename unpacket_traits<Packet>::integer_packet>> : std::true_type {};

// Dispatch struct for double-precision range reduction.
// Primary template: pfrexp-based fallback (used when integer_packet is absent).
template <typename Packet, bool UseIntegerPacket>
struct plog_range_reduce_double {
  EIGEN_STRONG_INLINE static void run(const Packet v, Packet& f, Packet& e) {
    const Packet one = pset1<Packet>(1.0);
    const Packet cst_cephes_SQRTHF = pset1<Packet>(0.70710678118654752440E0);
    // pfrexp: f in [0.5, 1), e = unbiased exponent as double.
    f = pfrexp(v, e);
    // Shift [0.5,1) -> [sqrt(0.5)-1, sqrt(2)-1] with exponent correction:
    //   if f < sqrt(0.5): f = f + f - 1, e -= 1 (giving f in [0, sqrt(2)-1))
    //   else: f = f - 1 (giving f in [sqrt(0.5)-1, 0))
    Packet mask = pcmp_lt(f, cst_cephes_SQRTHF);
    Packet tmp = pand(f, mask);
    f = psub(f, one);
    e = psub(e, pand(one, mask));
    f = padd(f, tmp);
  }
};

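A scalar illustration of this fallback path (hypothetical helper; assumes std::frexp produces the same [0.5, 1) mantissa that pfrexp does):

```cpp
#include <cmath>

// Decompose v = 2^e * (1+f) with f in [sqrt(0.5)-1, sqrt(2)-1].
void range_reduce_frexp_scalar(double v, double& f, double& e) {
  int ei;
  f = std::frexp(v, &ei);            // f in [0.5, 1)
  e = static_cast<double>(ei);
  if (f < 0.70710678118654752440) {  // f < sqrt(0.5)
    f = f + f - 1.0;                 // double the mantissa...
    e -= 1.0;                        // ...and pay for it in the exponent
  } else {
    f = f - 1.0;
  }
}
```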
// Specialisation: fast integer-bit-manipulation path (musl-inspired).
// Requires unpacket_traits<Packet>::integer_packet to be a 64-bit integer packet.
template <typename Packet>
struct plog_range_reduce_double<Packet, true> {
  EIGEN_STRONG_INLINE static void run(const Packet v, Packet& f, Packet& e) {
    typedef typename unpacket_traits<Packet>::integer_packet PacketI;
    // 2^-1022: smallest positive normal double.
    const PacketI cst_min_normal = pset1<PacketI>(static_cast<int64_t>(0x0010000000000000LL));
    // Lower 52-bit mask (IEEE mantissa field).
    const PacketI cst_mant_mask = pset1<PacketI>(static_cast<int64_t>(0x000FFFFFFFFFFFFFLL));
    // Offset = 1.0_bits - sqrt(0.5)_bits. Adding this to the integer
    // representation shifts the exponent field so that the [sqrt(0.5), sqrt(2))
    // half-octave boundary falls on an exact biased-exponent boundary, letting
    // us extract e with a single right shift. The constant is:
    //   0x3FF0000000000000 - 0x3FE6A09E667F3BCD = 0x00095F619980C433
    const PacketI cst_sqrt_half_offset =
        pset1<PacketI>(static_cast<int64_t>(0x3FF0000000000000LL - 0x3FE6A09E667F3BCDLL));
    // IEEE double exponent bias (1023).
    const PacketI cst_exp_bias = pset1<PacketI>(static_cast<int64_t>(1023));
    // sqrt(0.5) IEEE bits — used to reconstruct f from biased mantissa.
    const PacketI cst_half_mant = pset1<PacketI>(static_cast<int64_t>(0x3FE6A09E667F3BCDLL));

    // Reinterpret v as a 64-bit integer vector.
    PacketI vi = preinterpret<PacketI>(v);

    // Normalise denormals: multiply by 2^52 and correct the exponent by -52.
    PacketI is_denormal = pcmp_lt(vi, cst_min_normal);
    // 2^52 via bit pattern: biased exponent = 52 + 1023 = 0x433, mantissa = 0.
    Packet v_norm = pmul(v, pset1frombits<Packet>(static_cast<uint64_t>(int64_t(52 + 0x3ff) << 52)));
    vi = pselect(is_denormal, preinterpret<PacketI>(v_norm), vi);
    PacketI denorm_adj = pand(is_denormal, pset1<PacketI>(static_cast<int64_t>(52)));

    // Bias the integer representation so the exponent field directly encodes
    // the half-octave index.
    PacketI vi_biased = padd(vi, cst_sqrt_half_offset);
    // Extract unbiased exponent: shift out mantissa bits, subtract IEEE bias
    // and denormal adjustment.
    PacketI e_int = psub(psub(plogical_shift_right<52>(vi_biased), cst_exp_bias), denorm_adj);
    // Convert integer exponent to floating-point.
    e = pcast<PacketI, Packet>(e_int);

    // Reconstruct mantissa in [sqrt(0.5), sqrt(2)) via integer arithmetic.
    // The integer addition of the masked mantissa bits and the sqrt(0.5) bit
    // pattern carries into the exponent field, yielding a value in that range.
    // Then subtract 1 to centre on 0: f in [sqrt(0.5)-1, sqrt(2)-1].
    f = psub(preinterpret<Packet>(padd(pand(vi_biased, cst_mant_mask), cst_half_mant)), pset1<Packet>(1.0));
  }
};

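The same bit trick written out for a single scalar double may be easier to follow (hypothetical helper; normal positive inputs only, with the denormal pre-scaling omitted): one 64-bit add biases the exponent field, a shift extracts e, and re-adding the sqrt(0.5) bit pattern to the masked mantissa bits reconstructs a value in [sqrt(0.5), sqrt(2)).

```cpp
#include <cstdint>
#include <cstring>

void range_reduce_bits_scalar(double v, double& f, double& e) {
  std::uint64_t vi;
  std::memcpy(&vi, &v, sizeof(vi));
  const std::uint64_t kSqrtHalfBits = 0x3FE6A09E667F3BCDULL;  // bits of sqrt(0.5)
  const std::uint64_t biased = vi + (0x3FF0000000000000ULL - kSqrtHalfBits);
  // Exponent: the add above carries iff the mantissa of v is >= sqrt(2).
  e = static_cast<double>(static_cast<std::int64_t>(biased >> 52) - 1023);
  // Mantissa: masked bits plus the sqrt(0.5) pattern land in [sqrt(0.5), sqrt(2)).
  const std::uint64_t mant_bits = (biased & 0x000FFFFFFFFFFFFFULL) + kSqrtHalfBits;
  double m;
  std::memcpy(&m, &mant_bits, sizeof(m));
  f = m - 1.0;  // centre on 0
}
```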
// Core range reduction and polynomial for double logarithm.
// Input: v > 0 (zero / negative / inf / nan are handled by the caller).
// Output: log_mantissa ≈ log(mantissa of v in [sqrt(0.5), sqrt(2))),
//         e = unbiased exponent of v as a double.
// Selects the fast integer path when integer_packet is available, otherwise
// falls back to pfrexp.
// Core range reduction and polynomial evaluation for double logarithm.
//
// Same structure as plog_core_float but for double precision.
// Given a positive double v (may be denormal), decomposes it as
// v = 2^e * (1+f) with f in [sqrt(0.5)-1, sqrt(2)-1], then evaluates
// log(1+f) ≈ f - 0.5*f^2 + f^3 * P(f)/Q(f) using the Cephes [5/5]
// rational approximation.
template <typename Packet>
EIGEN_STRONG_INLINE void plog_core_double(const Packet v, Packet& log_mantissa, Packet& e) {
  Packet f;
  plog_range_reduce_double<Packet, packet_has_integer_packet<Packet>::value>::run(v, f, e);
  log_mantissa = plog_mantissa_double(f);
  typedef typename unpacket_traits<Packet>::integer_packet PacketL;

  const PacketL cst_min_normal = pset1<PacketL>(int64_t(0x0010000000000000LL));
  const PacketL cst_mant_mask = pset1<PacketL>(int64_t(0x000fffffffffffffLL));
  const PacketL cst_sqrt_half_offset = pset1<PacketL>(int64_t(0x00095f619980c433LL));
  const PacketL cst_exp_bias = pset1<PacketL>(int64_t(0x3ff));  // 1023
  const PacketL cst_half_mant = pset1<PacketL>(int64_t(0x3fe6a09e667f3bcdLL));  // sqrt(0.5)

  // Normalize denormals by multiplying by 2^52.
  PacketL vi = preinterpret<PacketL>(v);
  PacketL is_denormal = pcmp_lt(vi, cst_min_normal);
  Packet v_normalized = pmul(v, pset1<Packet>(4503599627370496.0));  // 2^52
  vi = pselect(is_denormal, preinterpret<PacketL>(v_normalized), vi);
  PacketL denorm_adj = pand(is_denormal, pset1<PacketL>(int64_t(52)));

  // Combined range reduction via integer bias (same trick as float version).
  PacketL vi_biased = padd(vi, cst_sqrt_half_offset);
  PacketL e_int = psub(psub(plogical_shift_right<52>(vi_biased), cst_exp_bias), denorm_adj);
  e = pcast<PacketL, Packet>(e_int);
  Packet f = psub(preinterpret<Packet>(padd(pand(vi_biased, cst_mant_mask), cst_half_mant)), pset1<Packet>(1.0));

  // Rational approximation log(1+f) = f - 0.5*f^2 + f^3 * P(f)/Q(f)
  // from Cephes, [5/5] rational on [sqrt(0.5)-1, sqrt(2)-1].
  Packet f2 = pmul(f, f);
  Packet f3 = pmul(f2, f);

  // Evaluate P and Q in factored form for instruction-level parallelism.
  Packet y, y1, y_;
  y = pmadd(pset1<Packet>(1.01875663804580931796E-4), f, pset1<Packet>(4.97494994976747001425E-1));
  y1 = pmadd(pset1<Packet>(1.44989225341610930846E1), f, pset1<Packet>(1.79368678507819816313E1));
  y = pmadd(y, f, pset1<Packet>(4.70579119878881725854E0));
  y1 = pmadd(y1, f, pset1<Packet>(7.70838733755885391666E0));
  y_ = pmadd(y, f3, y1);

  y = pmadd(pset1<Packet>(1.0), f, pset1<Packet>(1.12873587189167450590E1));
  y1 = pmadd(pset1<Packet>(8.29875266912776603211E1), f, pset1<Packet>(7.11544750618563894466E1));
  y = pmadd(y, f, pset1<Packet>(4.52279145837532221105E1));
  y1 = pmadd(y1, f, pset1<Packet>(2.31251620126765340583E1));
  y = pmadd(y, f3, y1);

  y_ = pmul(y_, f3);
  y = pdiv(y_, y);

  y = pmadd(pset1<Packet>(-0.5), f2, y);
  log_mantissa = padd(f, y);
}

/* Returns the base e (2.718...) or base 2 logarithm of x.
 * The argument is separated into its exponent and fractional parts.
 * The logarithm of the fraction in the interval [sqrt(1/2), sqrt(2)],
 * is approximated by
 *
 *     log(1+x) = x - 0.5 x**2 + x**3 P(x)/Q(x).
 *
 * for more detail see: http://www.netlib.org/cephes/
 */
// Natural or base-2 logarithm for double packets.
template <typename Packet, bool base2>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog_impl_double(const Packet _x) {
  const Packet cst_minus_inf = pset1frombits<Packet>(static_cast<uint64_t>(0xfff0000000000000ull));
  const Packet cst_pos_inf = pset1frombits<Packet>(static_cast<uint64_t>(0x7ff0000000000000ull));

  Packet log_mantissa, e;
  plog_core_double(_x, log_mantissa, e);

  // Combine: log(x) = e * ln2 + log(mantissa), or log2(x) = log(mantissa)*log2e + e.
  // Add the logarithm of the exponent back to the result.
  Packet x;
  if (base2) {
    const Packet cst_log2e = pset1<Packet>(static_cast<double>(EIGEN_LOG2E));
@@ -302,13 +213,11 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog_impl_double(cons
    x = pmadd(e, cst_ln2, log_mantissa);
  }

  const Packet cst_minus_inf = pset1frombits<Packet>(static_cast<uint64_t>(0xfff0000000000000ull));
  const Packet cst_pos_inf = pset1frombits<Packet>(static_cast<uint64_t>(0x7ff0000000000000ull));
  Packet invalid_mask = pcmp_lt_or_nan(_x, pzero(_x));
  Packet iszero_mask = pcmp_eq(_x, pzero(_x));
  Packet pos_inf_mask = pcmp_eq(_x, cst_pos_inf);
  // Filter out invalid inputs:
  //   - negative arg → NAN
  //   - 0 → -INF
  //   - +INF → +INF
  return pselect(iszero_mask, cst_minus_inf, por(pselect(pos_inf_mask, cst_pos_inf, x), invalid_mask));
}

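The pselect/por combination above encodes the usual log special cases; stated as scalar control flow (a sketch of the intended semantics, not of the packet code itself):

```cpp
#include <cmath>
#include <limits>

double log_specials_scalar(double x, double core_result) {
  if (x == 0.0) return -std::numeric_limits<double>::infinity();      // 0 -> -inf
  if (std::isnan(x) || x < 0.0)
    return std::numeric_limits<double>::quiet_NaN();                  // invalid -> NaN
  if (std::isinf(x)) return std::numeric_limits<double>::infinity();  // +inf -> +inf
  return core_result;
}
```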
@@ -362,11 +271,11 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_log1p_float(c
  return result;
}

/** \internal \returns log(1 + x) for double precision.
    Computes log(1+x) using plog_core_double for the core range reduction and
    polynomial evaluation. The rounding error from forming u = fl(1+x) is
    recovered as dx = x - (u - 1) and folded in as a first-order correction
    dx/u after the polynomial evaluation.
/** \internal \returns log(1 + x) for double precision float.
    Computes log(1+x) using plog_core_double for the core range reduction
    and polynomial evaluation. The rounding error from forming u = fl(1+x)
    is recovered as dx = x - (u - 1), and folded in as a first-order
    correction dx/u after the polynomial evaluation.
*/
template <typename Packet>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_log1p_double(const Packet& x) {
@@ -374,7 +283,7 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_log1p_double(
  const Packet cst_minus_inf = pset1frombits<Packet>(static_cast<uint64_t>(0xfff0000000000000ull));
  const Packet cst_pos_inf = pset1frombits<Packet>(static_cast<uint64_t>(0x7ff0000000000000ull));

  // u = 1 + x, with rounding. Recover the lost low bits: dx = x - (u - 1).
  // u = 1 + x, with rounding. Recover the lost low bits: dx = x - (u - 1).
  Packet u = padd(one, x);
  Packet dx = psub(x, psub(u, one));

@@ -398,7 +307,7 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_log1p_double(
  result = pselect(small_mask, x, result);
  result = pselect(inf_mask, cst_pos_inf, result);
  result = pselect(zero_mask, cst_minus_inf, result);
  result = por(neg_mask, result);  // NaN for x < -1
  result = por(neg_mask, result);
  return result;
}

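Since u - 1 is computed exactly for the inputs where the correction matters, dx = x - (u - 1) recovers exactly the bits the addition rounded away. A scalar model of the whole scheme (hypothetical helper, with std::log standing in for the packet core):

```cpp
#include <cmath>

double log1p_corrected_scalar(double x) {
  const double u = 1.0 + x;          // rounds away the low bits of x
  const double dx = x - (u - 1.0);   // the rounding error, recovered exactly
  return std::log(u) + dx / u;       // first-order correction: log(u) + dx/u
}
```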
@@ -45,7 +45,7 @@
// Eigen with GPU support.
// Any functions that require `numext::bit_cast` may also not be constexpr,
// including any native types when setting via raw bit values.
#if defined(EIGEN_GPUCC) || defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) || defined(EIGEN_HAS_BUILTIN_FLOAT16)
#if defined(EIGEN_HAS_GPU_FP16) || defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) || defined(EIGEN_HAS_BUILTIN_FLOAT16)
#define _EIGEN_MAYBE_CONSTEXPR
#else
#define _EIGEN_MAYBE_CONSTEXPR constexpr
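A compressed illustration of what the macro buys (hypothetical type, not part of the patch): the same declaration is constexpr in pure-software builds and silently drops the qualifier when a native half representation is in play, mirroring the half_base() and half() constructors later in this diff.

```cpp
// Expansions chosen above:
//   native fp16 (GPU / ARM / builtin): #define _EIGEN_MAYBE_CONSTEXPR
//   otherwise:                         #define _EIGEN_MAYBE_CONSTEXPR constexpr
struct maybe_constexpr_demo {
  EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR maybe_constexpr_demo() : bits(0) {}
  unsigned short bits;
};
```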
@@ -121,12 +121,12 @@ namespace half_impl {
//
// Making the host side compile phase of hipcc use the same Eigen::half impl, as the gcc compile, resolves
// this error, and hence the following convoluted #if condition
#if !defined(EIGEN_GPUCC) || !defined(EIGEN_GPU_COMPILE_PHASE)
#if !defined(EIGEN_HAS_GPU_FP16) || !defined(EIGEN_GPU_COMPILE_PHASE)

// Make our own __half_raw definition that is similar to CUDA's.
struct __half_raw {
  struct construct_from_rep_tag {};
#if (defined(EIGEN_GPUCC) && !defined(EIGEN_GPU_COMPILE_PHASE))
#if (defined(EIGEN_HAS_GPU_FP16) && !defined(EIGEN_GPU_COMPILE_PHASE))
  // Eigen::half can be used as the datatype for shared memory declarations (in Eigen and TF)
  // The element type for shared memory cannot have non-trivial constructors
  // and hence the following special casing (which skips the zero-initialization).
@@ -152,12 +152,16 @@ struct __half_raw {
#endif
};

#elif defined(EIGEN_HIPCC)
#elif defined(EIGEN_HAS_HIP_FP16)
// HIP GPU compile phase: nothing to do here.
// HIP fp16 header file has a definition for __half_raw
#elif defined(EIGEN_CUDACC)
#elif defined(EIGEN_HAS_CUDA_FP16)

// CUDA GPU compile phase.
#if EIGEN_CUDA_SDK_VER < 90000
// In CUDA < 9.0, __half is the equivalent of CUDA 9's __half_raw
typedef __half __half_raw;
#endif  // defined(EIGEN_HAS_CUDA_FP16)

#elif defined(SYCL_DEVICE_ONLY)
typedef cl::sycl::half __half_raw;
@@ -171,13 +175,15 @@ struct half_base : public __half_raw {
  EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half_base() {}
  EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half_base(const __half_raw& h) : __half_raw(h) {}

#if defined(EIGEN_GPUCC)
#if defined(EIGEN_HIPCC)
#if defined(EIGEN_HAS_GPU_FP16)
#if defined(EIGEN_HAS_HIP_FP16)
  EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half_base(const __half& h) { x = __half_as_ushort(h); }
#elif defined(EIGEN_CUDACC)
#elif defined(EIGEN_HAS_CUDA_FP16)
#if EIGEN_CUDA_SDK_VER >= 90000
  EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half_base(const __half& h) : __half_raw(*(__half_raw*)&h) {}
#endif
#endif
#endif
};

} // namespace half_impl
@@ -186,29 +192,36 @@ struct half_base : public __half_raw {
struct half : public half_impl::half_base {
  // Writing this out as separate #if-else blocks to make the code easier to follow
  // The same applies to most #if-else blocks in this file
#if !defined(EIGEN_GPUCC) || !defined(EIGEN_GPU_COMPILE_PHASE)
#if !defined(EIGEN_HAS_GPU_FP16) || !defined(EIGEN_GPU_COMPILE_PHASE)
  // Use the same base class for the following two scenarios
  // * when compiling without GPU support enabled
  // * during host compile phase when compiling with GPU support enabled
  typedef half_impl::__half_raw __half_raw;
#elif defined(EIGEN_HIPCC)
#elif defined(EIGEN_HAS_HIP_FP16)
  // Nothing to do here
  // HIP fp16 header file has a definition for __half_raw
#elif defined(EIGEN_CUDACC)
  // Nothing to do here.
#elif defined(EIGEN_HAS_CUDA_FP16)
  // Note that EIGEN_CUDA_SDK_VER is set to 0 even when compiling with HIP, so
  // (EIGEN_CUDA_SDK_VER < 90000) is true even for HIP! So keeping this within
  // #if defined(EIGEN_HAS_CUDA_FP16) is needed
#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
  typedef half_impl::__half_raw __half_raw;
#endif
#endif

  EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half() {}

  EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half(const __half_raw& h) : half_impl::half_base(h) {}

#if defined(EIGEN_GPUCC)
#if defined(EIGEN_HIPCC)
#if defined(EIGEN_HAS_GPU_FP16)
#if defined(EIGEN_HAS_HIP_FP16)
  EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half(const __half& h) : half_impl::half_base(h) {}
#elif defined(EIGEN_CUDACC)
#elif defined(EIGEN_HAS_CUDA_FP16)
#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000
  EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half(const __half& h) : half_impl::half_base(h) {}
#endif
#endif
#endif

#if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
  explicit EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half(__fp16 b)
@@ -235,7 +248,7 @@ struct half : public half_impl::half_base {
    return half_impl::half_to_float(*this);
  }

#if defined(EIGEN_GPUCC) && !defined(EIGEN_GPU_COMPILE_PHASE)
#if defined(EIGEN_HAS_GPU_FP16) && !defined(EIGEN_GPU_COMPILE_PHASE)
  EIGEN_DEVICE_FUNC operator __half() const {
    ::__half_raw hr;
    hr.x = x;
@@ -367,7 +380,8 @@ namespace Eigen {

namespace half_impl {

#if defined(EIGEN_GPU_COMPILE_PHASE)
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
    (defined(EIGEN_HAS_HIP_FP16) && defined(HIP_DEVICE_COMPILE))
// Note: We deliberately do *not* define this to 1 even if we have Arm's native
// fp16 type since GPU half types are rather different from native CPU half types.
#define EIGEN_HAS_NATIVE_GPU_FP16
@@ -379,10 +393,24 @@ namespace half_impl {
// conversion steps back and forth.

#if defined(EIGEN_HAS_NATIVE_GPU_FP16)
EIGEN_STRONG_INLINE __device__ half operator+(const half& a, const half& b) { return __hadd(::__half(a), ::__half(b)); }
EIGEN_STRONG_INLINE __device__ half operator+(const half& a, const half& b) {
#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000
  return __hadd(::__half(a), ::__half(b));
#else
  return __hadd(a, b);
#endif
}
EIGEN_STRONG_INLINE __device__ half operator*(const half& a, const half& b) { return __hmul(a, b); }
EIGEN_STRONG_INLINE __device__ half operator-(const half& a, const half& b) { return __hsub(a, b); }
EIGEN_STRONG_INLINE __device__ half operator/(const half& a, const half& b) { return __hdiv(a, b); }
EIGEN_STRONG_INLINE __device__ half operator/(const half& a, const half& b) {
#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000
  return __hdiv(a, b);
#else
  float num = __half2float(a);
  float denom = __half2float(b);
  return __float2half(num / denom);
#endif
}
EIGEN_STRONG_INLINE __device__ half operator-(const half& a) { return __hneg(a); }
EIGEN_STRONG_INLINE __device__ half& operator+=(half& a, const half& b) {
  a = a + b;
@@ -477,7 +505,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator>=(const half& a, const half&
// We need to provide emulated *host-side* FP16 operators for clang.
#pragma push_macro("EIGEN_DEVICE_FUNC")
#undef EIGEN_DEVICE_FUNC
#if defined(EIGEN_CUDACC) && defined(EIGEN_HAS_NATIVE_GPU_FP16)
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_HAS_NATIVE_GPU_FP16)
#define EIGEN_DEVICE_FUNC __host__
#else  // both host and device need emulated ops.
#define EIGEN_DEVICE_FUNC __host__ __device__
@@ -608,7 +636,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR __half_raw raw_uint
// because this is constexpr function.
// Fortunately, since we need to disable EIGEN_CONSTEXPR for GPU anyway, we can get out
// of this catch22 by having separate bodies for GPU / non GPU
#if defined(EIGEN_GPUCC)
#if defined(EIGEN_HAS_GPU_FP16)
__half_raw h;
h.x = x;
return h;
@@ -633,7 +661,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC numext::uint16_t raw_half_as_uint16(const
}

EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) {
#if defined(EIGEN_GPU_COMPILE_PHASE)
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
    (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
  __half tmp_ff = __float2half(ff);
  return *(__half_raw*)&tmp_ff;

@@ -706,7 +735,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) {
}

EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h) {
#if defined(EIGEN_GPU_COMPILE_PHASE)
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
    (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
  return __half2float(h);
#elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) || defined(EIGEN_HAS_BUILTIN_FLOAT16)
  return static_cast<float>(h.x);
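For comparison, the generic direction of half_to_float in bit-twiddling form (the software path lives outside this hunk, so the routine below is a standard textbook conversion, not Eigen's exact code):

```cpp
#include <cstring>

float half_bits_to_float(unsigned short h) {
  const unsigned sign = static_cast<unsigned>(h & 0x8000u) << 16;
  unsigned exp = (h >> 10) & 0x1Fu;
  unsigned mant = h & 0x3FFu;
  unsigned bits;
  if (exp == 0) {          // zero or subnormal
    if (mant == 0) {
      bits = sign;
    } else {               // renormalize the subnormal
      exp = 127 - 15 + 1;
      while ((mant & 0x400u) == 0) { mant <<= 1; --exp; }
      bits = sign | (exp << 23) | ((mant & 0x3FFu) << 13);
    }
  } else if (exp == 31) {  // inf or NaN
    bits = sign | 0x7F800000u | (mant << 13);
  } else {                 // normal: rebase the exponent, widen the mantissa
    bits = sign | ((exp - 15 + 127) << 23) | (mant << 13);
  }
  float f;
  std::memcpy(&f, &bits, sizeof(f));
  return f;
}
```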
@@ -748,7 +778,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool(isinf)(const half& a) {
#endif
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool(isnan)(const half& a) {
#if defined(EIGEN_GPU_COMPILE_PHASE)
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
    (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
  return __hisnan(a);
#elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) || defined(EIGEN_HAS_BUILTIN_FLOAT16)
  return (numext::bit_cast<numext::uint16_t>(a.x) & 0x7fff) > 0x7c00;
@@ -779,14 +810,16 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half abs(const half& a) {
#endif
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp(const half& a) {
#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE)
#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
    defined(EIGEN_HIP_DEVICE_COMPILE)
  return half(hexp(a));
#else
  return half(::expf(float(a)));
#endif
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp2(const half& a) {
#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE)
#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
    defined(EIGEN_HIP_DEVICE_COMPILE)
  return half(hexp2(a));
#else
  return half(::exp2f(float(a)));
@@ -794,7 +827,9 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp2(const half& a) {
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half expm1(const half& a) { return half(numext::expm1(float(a))); }
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log(const half& a) {
#if defined(EIGEN_GPU_COMPILE_PHASE)
#if (defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDA_SDK_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && \
     EIGEN_CUDA_ARCH >= 530) || \
    (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
  return half(hlog(a));
#else
  return half(::logf(float(a)));
@@ -807,7 +842,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log2(const half& a) {
}

EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sqrt(const half& a) {
#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE)
#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
    defined(EIGEN_HIP_DEVICE_COMPILE)
  return half(hsqrt(a));
#else
  return half(::sqrtf(float(a)));
@@ -828,14 +864,16 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half acos(const half& a) { return half(::a
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half atan(const half& a) { return half(::atanf(float(a))); }
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half atanh(const half& a) { return half(::atanhf(float(a))); }
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) {
#if (defined(EIGEN_CUDA_ARCH)) || defined(EIGEN_HIP_DEVICE_COMPILE)
#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \
    defined(EIGEN_HIP_DEVICE_COMPILE)
  return half(hfloor(a));
#else
  return half(::floorf(float(a)));
#endif
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) {
#if (defined(EIGEN_CUDA_ARCH)) || defined(EIGEN_HIP_DEVICE_COMPILE)
#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \
    defined(EIGEN_HIP_DEVICE_COMPILE)
  return half(hceil(a));
#else
  return half(::ceilf(float(a)));
@@ -969,12 +1007,20 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half madd<Eigen::half>(const Eigen:
} // namespace numext
} // namespace Eigen

// Warp shuffle overloads for Eigen::half.
// CUDA uses __shfl_*_sync (with mask); HIP uses __shfl_* (no mask).
// Add the missing shfl* intrinsics.
// The __shfl* functions are only valid on HIP or __CUDA_ARCH__ >= 300.
// CUDA defines them for (__CUDA_ARCH__ >= 300 || !defined(__CUDA_ARCH__))
//
// HIP and CUDA prior to SDK 9.0 define
// __shfl, __shfl_up, __shfl_down, __shfl_xor for int and float
// CUDA since 9.0 deprecates those and instead defines
// __shfl_sync, __shfl_up_sync, __shfl_down_sync, __shfl_xor_sync,
// with native support for __half and __nv_bfloat16
//
// Note that the following are __device__-only functions.
#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
#if (defined(EIGEN_CUDACC) && (!defined(EIGEN_CUDA_ARCH) || EIGEN_CUDA_ARCH >= 300)) || defined(EIGEN_HIPCC)

#if defined(EIGEN_CUDACC)
#if defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDA_SDK_VER >= 90000

__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_sync(unsigned mask, Eigen::half var, int srcLane,
                                                       int width = warpSize) {
@@ -1000,7 +1046,7 @@ __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor_sync(unsigned mask, Eigen:
  return static_cast<Eigen::half>(__shfl_xor_sync(mask, h, laneMask, width));
}

#else  // HIP
#else  // HIP or CUDA SDK < 9.0

__device__ EIGEN_STRONG_INLINE Eigen::half __shfl(Eigen::half var, int srcLane, int width = warpSize) {
  const int ivar = static_cast<int>(Eigen::numext::bit_cast<Eigen::numext::uint16_t>(var));
@@ -1026,7 +1072,7 @@ __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneM
#endif  // __shfl*

// ldg() has an overload for __half_raw, but we also need one for Eigen::half.
#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
#if (defined(EIGEN_CUDACC) && (!defined(EIGEN_CUDA_ARCH) || EIGEN_CUDA_ARCH >= 350)) || defined(EIGEN_HIPCC)
EIGEN_STRONG_INLINE __device__ Eigen::half __ldg(const Eigen::half* ptr) {
  return Eigen::half_impl::raw_uint16_to_half(__ldg(reinterpret_cast<const Eigen::numext::uint16_t*>(ptr)));
}
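A typical use of these overloads (hypothetical example, assuming the CUDA >= 9.0 path and a full, converged warp): a warp-wide sum of Eigen::half built on __shfl_down_sync.

```cpp
__device__ inline Eigen::half warp_sum_half(Eigen::half x) {
  for (int offset = warpSize / 2; offset > 0; offset /= 2)
    x = x + __shfl_down_sync(0xFFFFFFFFu, x, offset);  // fold in the upper lanes
  return x;  // lane 0 ends up holding the warp total
}
```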
@@ -1049,7 +1095,8 @@ namespace internal {
template <>
struct cast_impl<float, half> {
  EIGEN_DEVICE_FUNC static inline half run(const float& a) {
#if defined(EIGEN_GPU_COMPILE_PHASE)
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
    (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
    return __float2half(a);
#else
    return half(a);
@@ -1060,7 +1107,8 @@ struct cast_impl<float, half> {
template <>
struct cast_impl<int, half> {
  EIGEN_DEVICE_FUNC static inline half run(const int& a) {
#if defined(EIGEN_GPU_COMPILE_PHASE)
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
    (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
    return __float2half(static_cast<float>(a));
#else
    return half(static_cast<float>(a));
@@ -1071,7 +1119,8 @@ struct cast_impl<int, half> {
template <>
struct cast_impl<half, float> {
  EIGEN_DEVICE_FUNC static inline float run(const half& a) {
#if defined(EIGEN_GPU_COMPILE_PHASE)
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
    (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
    return __half2float(a);
#else
    return static_cast<float>(a);

@@ -17,8 +17,19 @@ namespace Eigen {

namespace internal {

// Read-only data cached load (__ldg) and native FP16 arithmetic are available
// on all supported GPU architectures (sm_70+ for CUDA, GFX906+ for HIP).
// Read-only data cached load available.
#if defined(EIGEN_HIP_DEVICE_COMPILE) || (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350)
#define EIGEN_GPU_HAS_LDG 1
#endif

// FP16 math available.
#if (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530)
#define EIGEN_CUDA_HAS_FP16_ARITHMETIC 1
#endif

#if defined(EIGEN_HIP_DEVICE_COMPILE) || defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
#define EIGEN_GPU_HAS_FP16_ARITHMETIC 1
#endif

// We need to distinguish ‘clang as the CUDA compiler’ from ‘clang as the host compiler,
// invoked by NVCC’ (e.g. on MacOS). The former needs to see both host and device implementation
@@ -45,84 +56,92 @@ struct is_arithmetic<double2> {

template <>
struct packet_traits<float> : default_packet_traits {
  using type = float4;
  using half = float4;
  static constexpr int Vectorizable = 1;
  static constexpr int AlignedOnScalar = 1;
  static constexpr int size = 4;
  typedef float4 type;
  typedef float4 half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 4,

  static constexpr int HasDiv = 1;
  static constexpr int HasSin = 0;
  static constexpr int HasCos = 0;
  static constexpr int HasLog = 1;
  static constexpr int HasExp = 1;
  static constexpr int HasSqrt = 1;
  static constexpr int HasRsqrt = 1;
  static constexpr int HasLGamma = 1;
  static constexpr int HasDiGamma = 1;
  static constexpr int HasZeta = 1;
  static constexpr int HasPolygamma = 1;
  static constexpr int HasErf = 1;
  static constexpr int HasErfc = 1;
  static constexpr int HasNdtri = 1;
  static constexpr int HasBessel = 1;
  static constexpr int HasIGamma = 1;
  static constexpr int HasIGammaDerA = 1;
  static constexpr int HasGammaSampleDerAlpha = 1;
  static constexpr int HasIGammac = 1;
  static constexpr int HasBetaInc = 1;
    HasDiv = 1,
    HasSin = 0,
    HasCos = 0,
    HasLog = 1,
    HasExp = 1,
    HasSqrt = 1,
    HasRsqrt = 1,
    HasLGamma = 1,
    HasDiGamma = 1,
    HasZeta = 1,
    HasPolygamma = 1,
    HasErf = 1,
    HasErfc = 1,
    HasNdtri = 1,
    HasBessel = 1,
    HasIGamma = 1,
    HasIGammaDerA = 1,
    HasGammaSampleDerAlpha = 1,
    HasIGammac = 1,
    HasBetaInc = 1,

  static constexpr int HasFloor = 1;
  static constexpr int HasCmp = EIGEN_HAS_GPU_DEVICE_FUNCTIONS;
    HasFloor = 1,
    HasCmp = EIGEN_HAS_GPU_DEVICE_FUNCTIONS
  };
};

template <>
struct packet_traits<double> : default_packet_traits {
  using type = double2;
  using half = double2;
  static constexpr int Vectorizable = 1;
  static constexpr int AlignedOnScalar = 1;
  static constexpr int size = 2;
  typedef double2 type;
  typedef double2 half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 2,

  static constexpr int HasDiv = 1;
  static constexpr int HasLog = 1;
  static constexpr int HasExp = 1;
  static constexpr int HasSqrt = 1;
  static constexpr int HasRsqrt = 1;
  static constexpr int HasLGamma = 1;
  static constexpr int HasDiGamma = 1;
  static constexpr int HasZeta = 1;
  static constexpr int HasPolygamma = 1;
  static constexpr int HasErf = 1;
  static constexpr int HasErfc = 1;
  static constexpr int HasNdtri = 1;
  static constexpr int HasBessel = 1;
  static constexpr int HasIGamma = 1;
  static constexpr int HasIGammaDerA = 1;
  static constexpr int HasGammaSampleDerAlpha = 1;
  static constexpr int HasIGammac = 1;
  static constexpr int HasBetaInc = 1;
    HasDiv = 1,
    HasLog = 1,
    HasExp = 1,
    HasSqrt = 1,
    HasRsqrt = 1,
    HasLGamma = 1,
    HasDiGamma = 1,
    HasZeta = 1,
    HasPolygamma = 1,
    HasErf = 1,
    HasErfc = 1,
    HasNdtri = 1,
    HasBessel = 1,
    HasIGamma = 1,
    HasIGammaDerA = 1,
    HasGammaSampleDerAlpha = 1,
    HasIGammac = 1,
    HasBetaInc = 1,
  };
};

template <>
struct unpacket_traits<float4> {
  using type = float;
  static constexpr int size = 4;
  static constexpr int alignment = Aligned16;
  static constexpr bool vectorizable = true;
  static constexpr bool masked_load_available = false;
  static constexpr bool masked_store_available = false;
  using half = float4;
  typedef float type;
  enum {
    size = 4,
    alignment = Aligned16,
    vectorizable = true,
    masked_load_available = false,
    masked_store_available = false
  };
  typedef float4 half;
};
template <>
struct unpacket_traits<double2> {
  using type = double;
  static constexpr int size = 2;
  static constexpr int alignment = Aligned16;
  static constexpr bool vectorizable = true;
  static constexpr bool masked_load_available = false;
  static constexpr bool masked_store_available = false;
  using half = double2;
  typedef double type;
  enum {
    size = 2,
    alignment = Aligned16,
    vectorizable = true,
    masked_load_available = false,
    masked_store_available = false
  };
  typedef double2 half;
};

template <>
@@ -384,7 +403,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const dou

template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const float* from) {
#if defined(EIGEN_GPU_COMPILE_PHASE)
#if defined(EIGEN_GPU_HAS_LDG)
  return __ldg(reinterpret_cast<const float4*>(from));
#else
  return make_float4(from[0], from[1], from[2], from[3]);
@@ -392,7 +411,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const fl
}
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const double* from) {
#if defined(EIGEN_GPU_COMPILE_PHASE)
#if defined(EIGEN_GPU_HAS_LDG)
  return __ldg(reinterpret_cast<const double2*>(from));
#else
  return make_double2(from[0], from[1]);
@@ -401,7 +420,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const

template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Unaligned>(const float* from) {
#if defined(EIGEN_GPU_COMPILE_PHASE)
#if defined(EIGEN_GPU_HAS_LDG)
  return make_float4(__ldg(from + 0), __ldg(from + 1), __ldg(from + 2), __ldg(from + 3));
#else
  return make_float4(from[0], from[1], from[2], from[3]);
@@ -409,7 +428,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Unaligned>(const
}
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Unaligned>(const double* from) {
#if defined(EIGEN_GPU_COMPILE_PHASE)
#if defined(EIGEN_GPU_HAS_LDG)
  return make_double2(__ldg(from + 0), __ldg(from + 1));
#else
  return make_double2(from[0], from[1]);
@@ -572,20 +591,23 @@ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<double2, 2>& kernel) {

#endif  // defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)

// Half-packet functions are only available in GPU device compilation — they use
// intrinsics (__half2, etc.) that have no host-side benefit.
#if defined(EIGEN_GPU_COMPILE_PHASE)
// Half-packet functions are not available on the host for CUDA 9.0-9.2, only
// on device. There is no benefit to using them on the host anyways, since they are
// emulated.
#if (defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)) && defined(EIGEN_GPU_COMPILE_PHASE)

using Packet4h2 = ulonglong2;
typedef ulonglong2 Packet4h2;
template <>
struct unpacket_traits<Packet4h2> {
  using type = Eigen::half;
  static constexpr int size = 8;
  static constexpr int alignment = Aligned16;
  static constexpr bool vectorizable = true;
  static constexpr bool masked_load_available = false;
  static constexpr bool masked_store_available = false;
  using half = Packet4h2;
  typedef Eigen::half type;
  enum {
    size = 8,
    alignment = Aligned16,
    vectorizable = true,
    masked_load_available = false,
    masked_store_available = false
  };
  typedef Packet4h2 half;
};
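The layout this declares is worth spelling out: one ulonglong2 (16 bytes) carries four half2 lanes, i.e. eight Eigen::half values, and the code below consistently accesses it through a half2 alias. A small hypothetical illustration:

```cpp
// Broadcast one half2 value into all four lanes of a Packet4h2 (sketch only).
__device__ inline Packet4h2 broadcast4h2(half2 v) {
  Packet4h2 r;
  half2* r_alias = reinterpret_cast<half2*>(&r);  // 4 x half2 == 8 halves
  for (int i = 0; i < 4; ++i) r_alias[i] = v;
  return r;
}
```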
template <>
struct is_arithmetic<Packet4h2> {
@@ -594,13 +616,15 @@ struct is_arithmetic<Packet4h2> {

template <>
struct unpacket_traits<half2> {
  using type = Eigen::half;
  static constexpr int size = 2;
  static constexpr int alignment = Aligned16;
  static constexpr bool vectorizable = true;
  static constexpr bool masked_load_available = false;
  static constexpr bool masked_store_available = false;
  using half = half2;
  typedef Eigen::half type;
  enum {
    size = 2,
    alignment = Aligned16,
    vectorizable = true,
    masked_load_available = false,
    masked_store_available = false
  };
  typedef half2 half;
};
template <>
struct is_arithmetic<half2> {
@@ -609,21 +633,23 @@ struct is_arithmetic<half2> {

template <>
struct packet_traits<Eigen::half> : default_packet_traits {
  using type = Packet4h2;
  using half = Packet4h2;
  static constexpr int Vectorizable = 1;
  static constexpr int AlignedOnScalar = 1;
  static constexpr int size = 8;
  static constexpr int HasAdd = 1;
  static constexpr int HasSub = 1;
  static constexpr int HasMul = 1;
  static constexpr int HasDiv = 1;
  static constexpr int HasSqrt = 1;
  static constexpr int HasRsqrt = 1;
  static constexpr int HasExp = 1;
  static constexpr int HasExpm1 = 1;
  static constexpr int HasLog = 1;
  static constexpr int HasLog1p = 1;
  typedef Packet4h2 type;
  typedef Packet4h2 half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 8,
    HasAdd = 1,
    HasSub = 1,
    HasMul = 1,
    HasDiv = 1,
    HasSqrt = 1,
    HasRsqrt = 1,
    HasExp = 1,
    HasExpm1 = 1,
    HasLog = 1,
    HasLog1p = 1
  };
};

template <>
@@ -664,7 +690,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const half2&
}

EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_aligned(const Eigen::half* from) {
#if defined(EIGEN_GPU_COMPILE_PHASE)
#if defined(EIGEN_GPU_HAS_LDG)
  // Input is guaranteed to be properly aligned.
  return __ldg(reinterpret_cast<const half2*>(from));
#else
@@ -673,7 +699,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_aligned(const Eigen::half*
}

EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_unaligned(const Eigen::half* from) {
#if defined(EIGEN_GPU_COMPILE_PHASE)
#if defined(EIGEN_GPU_HAS_LDG)
  return __halves2half2(__ldg(from + 0), __ldg(from + 1));
#else
  return __halves2half2(*(from + 0), *(from + 1));
@@ -719,7 +745,12 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<half2, 2>& ker
}

EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset(const Eigen::half& a) {
#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
  return __halves2half2(a, __hadd(a, __float2half(1.0f)));
#else
  float f = __half2float(a) + 1.0f;
  return __halves2half2(a, __float2half(f));
#endif
}

EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pselect(const half2& mask, const half2& a, const half2& b) {
@@ -806,21 +837,89 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pandnot(const half2& a, const half2&
  return __halves2half2(result1, result2);
}

EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd(const half2& a, const half2& b) { return __hadd2(a, b); }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd(const half2& a, const half2& b) {
#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
  return __hadd2(a, b);
#else
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  float b1 = __low2float(b);
  float b2 = __high2float(b);
  float r1 = a1 + b1;
  float r2 = a2 + b2;
  return __floats2half2_rn(r1, r2);
#endif
}

EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub(const half2& a, const half2& b) { return __hsub2(a, b); }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub(const half2& a, const half2& b) {
#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
  return __hsub2(a, b);
#else
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  float b1 = __low2float(b);
  float b2 = __high2float(b);
  float r1 = a1 - b1;
  float r2 = a2 - b2;
  return __floats2half2_rn(r1, r2);
#endif
}

EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(const half2& a) { return __hneg2(a); }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
  return __hneg2(a);
#else
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  return __floats2half2_rn(-a1, -a2);
#endif
}

EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }

EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul(const half2& a, const half2& b) { return __hmul2(a, b); }

EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd(const half2& a, const half2& b, const half2& c) {
  return __hfma2(a, b, c);
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul(const half2& a, const half2& b) {
#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
  return __hmul2(a, b);
#else
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  float b1 = __low2float(b);
  float b2 = __high2float(b);
  float r1 = a1 * b1;
  float r2 = a2 * b2;
  return __floats2half2_rn(r1, r2);
#endif
}

EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv(const half2& a, const half2& b) { return __h2div(a, b); }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd(const half2& a, const half2& b, const half2& c) {
#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
  return __hfma2(a, b, c);
#else
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  float b1 = __low2float(b);
  float b2 = __high2float(b);
  float c1 = __low2float(c);
  float c2 = __high2float(c);
  float r1 = a1 * b1 + c1;
  float r2 = a2 * b2 + c2;
  return __floats2half2_rn(r1, r2);
#endif
}

EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv(const half2& a, const half2& b) {
#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
  return __h2div(a, b);
#else
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  float b1 = __low2float(b);
  float b2 = __high2float(b);
  float r1 = a1 / b1;
  float r2 = a2 / b2;
  return __floats2half2_rn(r1, r2);
#endif
}

EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin(const half2& a, const half2& b) {
  float a1 = __low2float(a);
@@ -843,23 +942,47 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax(const half2& a, const half2& b)
}

EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux(const half2& a) {
#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
  return __hadd(__low2half(a), __high2half(a));
#else
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  return Eigen::half(__float2half(a1 + a2));
#endif
}

EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max(const half2& a) {
#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
  __half first = __low2half(a);
  __half second = __high2half(a);
  return __hgt(first, second) ? first : second;
#else
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  return a1 > a2 ? __low2half(a) : __high2half(a);
#endif
}

EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min(const half2& a) {
#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
  __half first = __low2half(a);
  __half second = __high2half(a);
  return __hlt(first, second) ? first : second;
#else
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  return a1 < a2 ? __low2half(a) : __high2half(a);
#endif
}

EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_mul(const half2& a) {
#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
  return __hmul(__low2half(a), __high2half(a));
#else
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  return Eigen::half(__float2half(a1 * a2));
#endif
}

EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog1p(const half2& a) {
@@ -878,6 +1001,8 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexpm1(const half2& a) {
  return __floats2half2_rn(r1, r2);
}

#if (EIGEN_CUDA_SDK_VER >= 80000 && defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)) || defined(EIGEN_HIP_DEVICE_COMPILE)

EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog(const half2& a) { return h2log(a); }

EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexp(const half2& a) { return h2exp(a); }
@@ -885,6 +1010,41 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexp(const half2& a) { return h2exp(
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psqrt(const half2& a) { return h2sqrt(a); }

EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 prsqrt(const half2& a) { return h2rsqrt(a); }

#else

EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog(const half2& a) {
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  float r1 = logf(a1);
  float r2 = logf(a2);
  return __floats2half2_rn(r1, r2);
}

EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexp(const half2& a) {
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  float r1 = expf(a1);
  float r2 = expf(a2);
  return __floats2half2_rn(r1, r2);
}

EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psqrt(const half2& a) {
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  float r1 = sqrtf(a1);
  float r2 = sqrtf(a2);
  return __floats2half2_rn(r1, r2);
}

EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 prsqrt(const half2& a) {
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  float r1 = rsqrtf(a1);
  float r2 = rsqrtf(a2);
  return __floats2half2_rn(r1, r2);
}
#endif
} // namespace

template <>
@@ -931,17 +1091,19 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to,

template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4h2 ploadt_ro<Packet4h2, Aligned>(const Eigen::half* from) {
#if defined(EIGEN_GPU_HAS_LDG)
  Packet4h2 r;
#if defined(EIGEN_GPU_COMPILE_PHASE)
  r = __ldg(reinterpret_cast<const Packet4h2*>(from));
  return r;
#else
  Packet4h2 r;
  half2* r_alias = reinterpret_cast<half2*>(&r);
  r_alias[0] = ploadt_ro_aligned(from + 0);
  r_alias[1] = ploadt_ro_aligned(from + 2);
  r_alias[2] = ploadt_ro_aligned(from + 4);
  r_alias[3] = ploadt_ro_aligned(from + 6);
#endif
  return r;
#endif
}

template <>
@@ -1110,7 +1272,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 plset<Packet4h2>(const Eigen::ha
  p_alias[2] = __halves2half2(__hadd(a, __float2half(4.0f)), __hadd(a, __float2half(5.0f)));
  p_alias[3] = __halves2half2(__hadd(a, __float2half(6.0f)), __hadd(a, __float2half(7.0f)));
  return r;
#elif defined(EIGEN_CUDA_ARCH)
#elif defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
  Packet4h2 r;
  half2* r_alias = reinterpret_cast<half2*>(&r);

@@ -1128,6 +1290,16 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 plset<Packet4h2>(const Eigen::ha
  r_alias[3] = plset(__high2half(c));

  return r;

#else
  float f = __half2float(a);
  Packet4h2 r;
  half2* p_alias = reinterpret_cast<half2*>(&r);
  p_alias[0] = __halves2half2(a, __float2half(f + 1.0f));
  p_alias[1] = __halves2half2(__float2half(f + 2.0f), __float2half(f + 3.0f));
  p_alias[2] = __halves2half2(__float2half(f + 4.0f), __float2half(f + 5.0f));
  p_alias[3] = __halves2half2(__float2half(f + 6.0f), __float2half(f + 7.0f));
  return r;
#endif
}

@@ -1361,7 +1533,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max<Packet4h2>(const Pa
  half2 m1 = __halves2half2(predux_max(a_alias[2]), predux_max(a_alias[3]));
  __half first = predux_max(m0);
  __half second = predux_max(m1);
#if defined(EIGEN_CUDA_ARCH)
#if defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
  return (__hgt(first, second) ? first : second);
#else
  float ffirst = __half2float(first);
@@ -1377,7 +1549,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min<Packet4h2>(const Pa
  half2 m1 = __halves2half2(predux_min(a_alias[2]), predux_min(a_alias[3]));
  __half first = predux_min(m0);
  __half second = predux_min(m1);
#if defined(EIGEN_CUDA_ARCH)
#if defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
  return (__hlt(first, second) ? first : second);
#else
  float ffirst = __half2float(first);
@@ -1469,17 +1641,47 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 prsqrt<Packet4h2>(const Packet4h
// the implementation of GPU half reduction.
template <>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
return __hadd2(a, b);
#else
float a1 = __low2float(a);
float a2 = __high2float(a);
float b1 = __low2float(b);
float b2 = __high2float(b);
float r1 = a1 + b1;
float r2 = a2 + b2;
return __floats2half2_rn(r1, r2);
#endif
}

template <>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) {
#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
return __hmul2(a, b);
#else
float a1 = __low2float(a);
float a2 = __high2float(a);
float b1 = __low2float(b);
float b2 = __high2float(b);
float r1 = a1 * b1;
float r2 = a2 * b2;
return __floats2half2_rn(r1, r2);
#endif
}

template <>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) {
#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
return __h2div(a, b);
#else
float a1 = __low2float(a);
float a2 = __high2float(a);
float b1 = __low2float(b);
float b2 = __high2float(b);
float r1 = a1 / b1;
float r2 = a2 / b2;
return __floats2half2_rn(r1, r2);
#endif
}
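The padd/pmul/pdiv specializations above all follow the same fallback pattern: use the native packed fp16 instruction when the target exposes it, otherwise unpack both lanes to float, operate, and repack with round-to-nearest. A minimal standalone sketch of that pattern, applied to a hypothetical packed subtract (__hsub2 and the conversion intrinsics are standard cuda_fp16.h calls; the function name and guard usage are illustrative only):

// Sketch only: a hypothetical psub<half2> following the fallback pattern above.
__device__ inline half2 psub_sketch(const half2& a, const half2& b) {
#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
  return __hsub2(a, b);  // native packed fp16 subtract
#else
  // Promote both lanes to float, subtract, and repack with round-to-nearest.
  float r1 = __low2float(a) - __low2float(b);
  float r2 = __high2float(a) - __high2float(b);
  return __floats2half2_rn(r1, r2);
#endif
}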

template <>
@@ -1504,7 +1706,11 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, const ha
return __halves2half2(r1, r2);
}

#endif // defined(EIGEN_GPU_COMPILE_PHASE)
#endif // (defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)) && defined(EIGEN_GPU_COMPILE_PHASE)

#undef EIGEN_GPU_HAS_LDG
#undef EIGEN_CUDA_HAS_FP16_ARITHMETIC
#undef EIGEN_GPU_HAS_FP16_ARITHMETIC

} // end namespace internal


@@ -17,7 +17,8 @@ namespace Eigen {

namespace internal {

#if defined(EIGEN_GPU_COMPILE_PHASE)
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
    (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))

template <>
struct type_casting_traits<Eigen::half, float> {

@@ -541,6 +541,12 @@ extern "C" {
#if defined EIGEN_CUDACC
#define EIGEN_VECTORIZE_GPU
#include <vector_types.h>
#if EIGEN_CUDA_SDK_VER >= 70500
#define EIGEN_HAS_CUDA_FP16
#endif
#endif

#if defined(EIGEN_HAS_CUDA_FP16)
#include <cuda_runtime_api.h>
#include <cuda_fp16.h>
#endif
@@ -548,6 +554,7 @@ extern "C" {
#if defined(EIGEN_HIPCC)
#define EIGEN_VECTORIZE_GPU
#include <hip/hip_vector_types.h>
#define EIGEN_HAS_HIP_FP16
#include <hip/hip_fp16.h>
#define EIGEN_HAS_HIP_BF16
#include <hip/hip_bfloat16.h>

@@ -84,7 +84,8 @@
#endif

#if defined __NVCC__ && defined __CUDACC__
// MSVC does not support the _Pragma keyword, so we use Microsoft's __pragma extension.
// MSVC 14.16 (required by CUDA 9.*) does not support the _Pragma keyword, so
// we instead use Microsoft's __pragma extension.
#if defined _MSC_VER
#define EIGEN_MAKE_PRAGMA(X) __pragma(#X)
#else

@@ -148,8 +148,13 @@
#endif

#if defined(__NVCC__)
// CUDA 11.4+ always defines __CUDACC_VER_MAJOR__.
#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
#define EIGEN_COMP_NVCC ((__CUDACC_VER_MAJOR__ * 10000) + (__CUDACC_VER_MINOR__ * 100))
#elif defined(__CUDACC_VER__)
#define EIGEN_COMP_NVCC __CUDACC_VER__
#else
#error "NVCC did not define compiler version."
#endif
#else
#define EIGEN_COMP_NVCC 0
#endif
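The EIGEN_COMP_NVCC definition above packs the NVCC major and minor versions into a single integer so version checks become plain comparisons. A worked example of the encoding (illustrative only):

// CUDA 11.4 -> __CUDACC_VER_MAJOR__ == 11, __CUDACC_VER_MINOR__ == 4
//           -> EIGEN_COMP_NVCC == 11 * 10000 + 4 * 100 == 110400,
// directly comparable against thresholds such as 110400.
#if EIGEN_COMP_NVCC && EIGEN_COMP_NVCC < 110400
// a version-floor #error would fire here for NVCC older than CUDA 11.4
#endif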
@@ -570,10 +575,6 @@
#define EIGEN_CUDA_SDK_VER 0
#endif

#if defined(EIGEN_CUDACC) && EIGEN_CUDA_SDK_VER > 0 && EIGEN_CUDA_SDK_VER < 110400
#error "Eigen requires CUDA 11.4 or later."
#endif

#if defined(__HIPCC__) && !defined(EIGEN_NO_HIP) && !defined(__SYCL_DEVICE_ONLY__)
// Means the compiler is HIPCC (analogous to EIGEN_CUDACC, but for HIP)
#define EIGEN_HIPCC __HIPCC__
@@ -583,20 +584,22 @@
// ++ host_defines.h which contains the defines for the __host__ and __device__ macros
#include <hip/hip_runtime.h>

// Eigen requires ROCm/HIP >= 5.6 (GFX906 minimum architecture).
// This floor exists to allow simplifying shared CUDA/HIP preprocessor guards:
// __HIP_ARCH_HAS_WARP_SHUFFLE__, __HIP_ARCH_HAS_FP16__, etc. are always true on GFX906+.
#if defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR < 5 || (HIP_VERSION_MAJOR == 5 && HIP_VERSION_MINOR < 6))
#error "Eigen requires ROCm/HIP >= 5.6."
#endif

#if defined(__HIP_DEVICE_COMPILE__) && !defined(__SYCL_DEVICE_ONLY__)
// analogous to EIGEN_CUDA_ARCH, but for HIP
#define EIGEN_HIP_DEVICE_COMPILE __HIP_DEVICE_COMPILE__
#endif

// HIP compilers default to launch_bounds(256), which causes failures when kernels
// are called with more than 256 threads per block. Explicitly set to 1024 for HIP.
// For HIP (ROCm 3.5 and higher), we need to explicitly set the launch_bounds attribute
// value to 1024. The compiler assigns a default value of 256 when the attribute is not
// specified. This results in failures on the HIP platform, for cases when a GPU kernel
// without an explicit launch_bounds attribute is called with a threads_per_block value
// greater than 256.
//
// This is a regression in functionality and is expected to be fixed within the next
// couple of ROCm releases (the compiler will go back to using 1024 as the default).
//
// In the meantime, we will use an "only enabled for HIP" macro to set the launch_bounds
// attribute.

#define EIGEN_HIP_LAUNCH_BOUNDS_1024 __launch_bounds__(1024)

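A minimal sketch of how the macro defined above is meant to be applied to a kernel. The kernel itself is hypothetical; the macro expands to __launch_bounds__(1024) under HIP to lift the compiler's 256-thread default, and is left empty on other builds:

// Illustrative use of EIGEN_HIP_LAUNCH_BOUNDS_1024; the kernel is hypothetical.
template <typename T>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void fill_kernel(T* out, T value, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) out[i] = value;
}
// Launching with 1024 threads per block is then safe on HIP:
//   fill_kernel<<<num_blocks, 1024>>>(ptr, T(0), n);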
@@ -197,7 +197,7 @@ build:linux:x86-64:nvhpc-26.1:default:unsupported:
# Additional flags passed to the cuda compiler.
EIGEN_CI_CUDA_CXX_FLAGS: ""
# Compute architectures present in the GitLab CI runners.
EIGEN_CI_CUDA_COMPUTE_ARCH: "70;75"
EIGEN_CI_CUDA_COMPUTE_ARCH: "50;75"
EIGEN_CI_BUILD_TARGET: buildtests_gpu
EIGEN_CI_TEST_CUDA_CLANG: "off"
EIGEN_CI_TEST_CUDA_NVC: "off"
@@ -234,7 +234,7 @@ build:linux:cuda-12.2:clang-12:
# ROCm HIP
build:linux:rocm-latest:gcc-10:
extends: .build:linux:cross
image: rocm/dev-ubuntu-24.04:6.3.1
image: rocm/dev-ubuntu-24.04:latest
variables:
EIGEN_CI_C_COMPILER: gcc-10
EIGEN_CI_CXX_COMPILER: g++-10

@@ -55,7 +55,7 @@ build:windows:x86-64:msvc-14.29:avx512dq:
extends: .build:windows
variables:
# Compute architectures present in the GitLab CI runners.
EIGEN_CI_CUDA_COMPUTE_ARCH: "70;75"
EIGEN_CI_CUDA_COMPUTE_ARCH: "50;75"
EIGEN_CI_BUILD_TARGET: buildtests_gpu
EIGEN_CI_ADDITIONAL_ARGS:
-DEIGEN_TEST_CUDA=on
@@ -66,8 +66,8 @@ build:windows:x86-64:msvc-14.29:avx512dq:
- x86-64
- cuda

# MSVC 14.29 + CUDA 12.2
build:windows:x86-64:cuda-12.2:msvc-14.29:
# MSVC 14.29 + CUDA 11.4
build:windows:x86-64:cuda-11.4:msvc-14.29:
extends: .build:windows:cuda
variables:
EIGEN_CI_BEFORE_SCRIPT: $$env:CUDA_PATH=$$env:CUDA_PATH_V12_2
EIGEN_CI_BEFORE_SCRIPT: $$env:CUDA_PATH=$$env:CUDA_PATH_V11_4
@@ -71,7 +71,7 @@ test:windows:x86-64:msvc-14.29:avx512dq:unsupported:
- x86-64
- cuda

# MSVC 14.29 + CUDA 12.2
test:windows:x86-64:cuda-12.2:msvc-14.29:
# MSVC 14.29 + CUDA 11.4
test:windows:x86-64:cuda-11.4:msvc-14.29:
extends: .test:windows:cuda
needs: [ build:windows:x86-64:cuda-12.2:msvc-14.29 ]
needs: [ build:windows:x86-64:cuda-11.4:msvc-14.29 ]
@@ -20,8 +20,7 @@ add_dependencies(check buildtests)

# Convenience target for only building GPU tests.
add_custom_target(buildtests_gpu)
add_custom_target(check_gpu COMMAND "ctest" ${EIGEN_CTEST_ARGS}
"--output-on-failure"
add_custom_target(check_gpu COMMAND "ctest" "--output-on-failure"
"--no-compress-output"
"--build-no-clean"
"-T" "test"
@@ -72,3 +71,4 @@ elseif(MSVC)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /D_CRT_SECURE_NO_WARNINGS /D_SCL_SECURE_NO_WARNINGS")
endif()



@@ -8,12 +8,6 @@ macro(ei_add_property prop value)
endif()
endmacro()

if(EIGEN_TEST_HIP AND NOT DEFINED EIGEN_HIP_ARCHITECTURES)
set(EIGEN_HIP_ARCHITECTURES
gfx900;gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1150;gfx1151
CACHE STRING "HIP GPU architectures to build Eigen's HIP tests for.")
endif()

#internal. See documentation of ei_add_test for details.
macro(ei_add_test_internal testname testname_with_suffix)
set(targetname ${testname_with_suffix})
@@ -36,7 +30,7 @@ macro(ei_add_test_internal testname testname_with_suffix)
hip_reset_flags()
hip_add_executable(${targetname} ${filename} HIPCC_OPTIONS -std=c++14)
target_compile_definitions(${targetname} PRIVATE -DEIGEN_USE_HIP)
set_property(TARGET ${targetname} PROPERTY HIP_ARCHITECTURES "${EIGEN_HIP_ARCHITECTURES}")
set_property(TARGET ${targetname} PROPERTY HIP_ARCHITECTURES gfx900 gfx906 gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030)
elseif(EIGEN_TEST_CUDA_CLANG)
set_source_files_properties(${filename} PROPERTIES LANGUAGE CXX)

@@ -140,7 +134,6 @@ macro(ei_add_test_internal testname testname_with_suffix)
if (is_gpu_test)
# Add gpu tag for testing only GPU tests.
set_property(TEST ${testname_with_suffix} APPEND PROPERTY LABELS "gpu")
set_property(TEST ${testname_with_suffix} PROPERTY SKIP_RETURN_CODE 77)
endif()

if(EIGEN_SYCL)

@@ -433,7 +433,7 @@ if(EIGEN_TEST_CUDA_NVC AND NOT CMAKE_CXX_COMPILER_ID MATCHES "NVHPC")
message(WARNING "EIGEN_TEST_CUDA_NVC is set, but CMAKE_CXX_COMPILER does not appear to be nvc++.")
endif()

find_package(CUDA 11.4)
find_package(CUDA 9.0)
if(CUDA_FOUND AND EIGEN_TEST_CUDA)
# Make sure to compile without the -pedantic, -Wundef, -Wnon-virtual-dtor
# and -fno-check-new flags since they trigger thousands of compilation warnings
@@ -502,9 +502,6 @@ if (EIGEN_TEST_HIP)
endif()

find_package(HIP REQUIRED)
if (HIP_FOUND AND HIP_VERSION VERSION_LESS "5.6")
message(FATAL_ERROR "Eigen requires ROCm/HIP >= 5.6, found ${HIP_VERSION}")
endif()
if (HIP_FOUND)
execute_process(COMMAND ${HIP_PATH}/bin/hipconfig --platform OUTPUT_VARIABLE HIP_PLATFORM)

@@ -7,6 +7,12 @@
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

// workaround issue between gcc >= 4.7 and cuda 5.5
#if (defined __GNUC__) && (__GNUC__ > 4 || __GNUC_MINOR__ >= 7)
#undef _GLIBCXX_ATOMIC_BUILTINS
#undef _GLIBCXX_USE_INT128
#endif

#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int


@@ -6,8 +6,10 @@
// Allow gpu** macros for generic tests.
#include <Eigen/src/Core/util/GpuHipCudaDefines.inc>

// std::tuple cannot be used on device, so use our custom implementation there.
#if defined(EIGEN_GPU_COMPILE_PHASE)
// std::tuple cannot be used on device, and there is a bug in cuda < 9.2 that
// doesn't allow std::tuple to compile for host code either. In these cases,
// use our custom implementation.
#if defined(EIGEN_GPU_COMPILE_PHASE) || (defined(EIGEN_CUDACC) && EIGEN_CUDA_SDK_VER < 92000)
#define EIGEN_USE_CUSTOM_TUPLE 1
#else
#define EIGEN_USE_CUSTOM_TUPLE 0
@@ -40,12 +42,6 @@ using tuple_impl::tuple;
#undef EIGEN_USE_CUSTOM_TUPLE
} // namespace test_detail

template <typename T>
using decay_t = typename std::decay<T>::type;

template <typename Func, typename... Args>
using kernel_result_t = decltype(std::declval<Func>()(std::declval<Args>()...));

template <size_t N, size_t Idx, typename OutputIndexSequence, typename... Ts>
struct extract_output_indices_helper;

@@ -94,15 +90,14 @@ struct void_helper {
// Non-void return value.
template <typename Func, typename... Args>
static EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC auto call(Func&& func, Args&&... args)
-> std::enable_if_t<!std::is_same<kernel_result_t<Func&&, Args&&...>, void>::value,
kernel_result_t<Func&&, Args&&...>> {
-> std::enable_if_t<!std::is_same<decltype(func(args...)), void>::value, decltype(func(args...))> {
return func(std::forward<Args>(args)...);
}

// Void return value.
template <typename Func, typename... Args>
static EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC auto call(Func&& func, Args&&... args)
-> std::enable_if_t<std::is_same<kernel_result_t<Func&&, Args&&...>, void>::value, Void> {
-> std::enable_if_t<std::is_same<decltype(func(args...)), void>::value, Void> {
func(std::forward<Args>(args)...);
return Void{};
}
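The two void_helper::call overloads above dispatch on whether the kernel returns void, mapping a void result to the empty Void tag so downstream serialization always has a complete object to copy. A small usage sketch (the functor names are hypothetical):

// Sketch of how the overloads resolve.
struct AddOne {
  EIGEN_DEVICE_FUNC int operator()(int x) const { return x + 1; }
};
struct Store {
  EIGEN_DEVICE_FUNC void operator()(int* p, int x) const { *p = x; }
};
// int r = void_helper::call(AddOne{}, 41);       // non-void overload, r == 42
// Void v = void_helper::call(Store{}, ptr, 42);  // void overload, returns Void{}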
@@ -140,18 +135,18 @@ EIGEN_DEVICE_FUNC void run_serialized(std::index_sequence<Indices...>, std::inde
const uint8_t* read_end = buffer + capacity;
read_ptr = Eigen::deserialize(read_ptr, read_end, input_size);
// Create value-type instances to populate.
auto args = make_tuple(decay_t<Args>{}...);
auto args = make_tuple(typename std::decay<Args>::type{}...);
EIGEN_UNUSED_VARIABLE(args); // Avoid NVCC compile warning.
// NVCC 9.1 requires us to spell out the template parameters explicitly.
read_ptr = Eigen::deserialize(read_ptr, read_end, get<Indices, decay_t<Args>...>(args)...);
read_ptr = Eigen::deserialize(read_ptr, read_end, get<Indices, typename std::decay<Args>::type...>(args)...);

// Call function, with void->Void conversion so we are guaranteed a complete
// output type.
auto result = void_helper::call(kernel, get<Indices, decay_t<Args>...>(args)...);
auto result = void_helper::call(kernel, get<Indices, typename std::decay<Args>::type...>(args)...);

// Determine required output size.
size_t output_size = Eigen::serialize_size(capacity);
output_size += Eigen::serialize_size(get<OutputIndices, decay_t<Args>...>(args)...);
output_size += Eigen::serialize_size(get<OutputIndices, typename std::decay<Args>::type...>(args)...);
output_size += Eigen::serialize_size(result);

// Always serialize required buffer size.
@@ -162,7 +157,7 @@ EIGEN_DEVICE_FUNC void run_serialized(std::index_sequence<Indices...>, std::inde
// Serialize outputs if they fit in the buffer.
if (output_size <= capacity) {
// Collect outputs and result.
write_ptr = Eigen::serialize(write_ptr, write_end, get<OutputIndices, decay_t<Args>...>(args)...);
write_ptr = Eigen::serialize(write_ptr, write_end, get<OutputIndices, typename std::decay<Args>::type...>(args)...);
write_ptr = Eigen::serialize(write_ptr, write_end, result);
}
}
@@ -287,7 +282,7 @@ auto run_serialized_on_gpu(size_t buffer_capacity_hint, std::index_sequence<Indi
* \return kernel(args...).
*/
template <typename Kernel, typename... Args>
auto run_on_cpu(Kernel kernel, Args&&... args) -> internal::kernel_result_t<Kernel, Args&&...> {
auto run_on_cpu(Kernel kernel, Args&&... args) -> decltype(kernel(args...)) {
return kernel(std::forward<Args>(args)...);
}

@@ -306,7 +301,7 @@ auto run_on_cpu(Kernel kernel, Args&&... args) -> internal::kernel_result_t<Kern
* \return kernel(args...).
*/
template <typename Kernel, typename... Args>
auto run_on_gpu(Kernel kernel, Args&&... args) -> internal::kernel_result_t<Kernel, Args&&...> {
auto run_on_gpu(Kernel kernel, Args&&... args) -> decltype(kernel(args...)) {
return internal::run_serialized_on_gpu<Kernel, Args...>(
/*buffer_capacity_hint=*/0, std::make_index_sequence<sizeof...(Args)>{},
internal::extract_output_indices<Args...>{}, kernel, std::forward<Args>(args)...);
@@ -327,8 +322,7 @@ auto run_on_gpu(Kernel kernel, Args&&... args) -> internal::kernel_result_t<Kern
* \sa run_on_gpu
*/
template <typename Kernel, typename... Args>
auto run_on_gpu_with_hint(size_t buffer_capacity_hint, Kernel kernel, Args&&... args)
-> internal::kernel_result_t<Kernel, Args&&...> {
auto run_on_gpu_with_hint(size_t buffer_capacity_hint, Kernel kernel, Args&&... args) -> decltype(kernel(args...)) {
return internal::run_serialized_on_gpu<Kernel, Args...>(
buffer_capacity_hint, std::make_index_sequence<sizeof...(Args)>{}, internal::extract_output_indices<Args...>{},
kernel, std::forward<Args>(args)...);
@@ -415,7 +409,7 @@ void print_gpu_device_info() {
* \return kernel(args...).
*/
template <typename Kernel, typename... Args>
auto run(Kernel kernel, Args&&... args) -> internal::kernel_result_t<Kernel, Args&&...> {
auto run(Kernel kernel, Args&&... args) -> decltype(kernel(args...)) {
#ifdef EIGEN_GPUCC
return run_on_gpu(kernel, std::forward<Args>(args)...);
#else
@@ -438,8 +432,7 @@ auto run(Kernel kernel, Args&&... args) -> internal::kernel_result_t<Kernel, Arg
* \sa run
*/
template <typename Kernel, typename... Args>
auto run_with_hint(size_t buffer_capacity_hint, Kernel kernel, Args&&... args)
-> internal::kernel_result_t<Kernel, Args&&...> {
auto run_with_hint(size_t buffer_capacity_hint, Kernel kernel, Args&&... args) -> decltype(kernel(args...)) {
#ifdef EIGEN_GPUCC
return run_on_gpu_with_hint(buffer_capacity_hint, kernel, std::forward<Args>(args)...);
#else

test/main.h
@@ -76,8 +76,10 @@
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#if CUDA_VERSION >= 7050
#include <cuda_fp16.h>
#endif
#endif

#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
#define EIGEN_TEST_NO_LONGDOUBLE
@@ -947,37 +949,6 @@ inline void set_seed_from_time() {
g_seed = static_cast<decltype(g_seed)>(ns);
}

#if defined(EIGEN_USE_GPU)
inline int maybe_skip_gpu_tests() {
#if defined(EIGEN_USE_HIP)
int device_count = 0;
hipError_t status = hipGetDeviceCount(&device_count);
if (status != hipSuccess) {
std::cout << "SKIP: HIP GPU tests require a visible ROCm device. hipGetDeviceCount failed with: "
<< hipGetErrorString(status) << std::endl;
return 77;
}
if (device_count <= 0) {
std::cout << "SKIP: HIP GPU tests require a visible ROCm device." << std::endl;
return 77;
}
#elif defined(EIGEN_CUDACC)
int device_count = 0;
cudaError_t status = cudaGetDeviceCount(&device_count);
if (status != cudaSuccess) {
std::cout << "SKIP: CUDA GPU tests require a visible CUDA device. cudaGetDeviceCount failed with: "
<< cudaGetErrorString(status) << std::endl;
return 77;
}
if (device_count <= 0) {
std::cout << "SKIP: CUDA GPU tests require a visible CUDA device." << std::endl;
return 77;
}
#endif
return 0;
}
#endif

int main(int argc, char* argv[]) {
g_has_set_repeat = false;
g_has_set_seed = false;
@@ -1026,13 +997,6 @@ int main(int argc, char* argv[]) {
srand(g_seed);
std::cout << "Repeating each test " << g_repeat << " times" << std::endl;

#if defined(EIGEN_USE_GPU)
{
const int skip_code = maybe_skip_gpu_tests();
if (skip_code != 0) return skip_code;
}
#endif

VERIFY(EigenTest::all().size() > 0);

for (std::size_t i = 0; i < EigenTest::all().size(); ++i) {

@@ -233,15 +233,17 @@ static std::vector<FuncEntry<Scalar>> build_func_table() {
// Range iteration helpers
// ============================================================================

// Advances x toward +inf by at least 1 ULP. When step_eps > 0, additionally
// jumps by a relative factor of (1 + step_eps) to sample the range sparsely.
// Advances a non-negative value toward +inf by at least 1 ULP. When step_eps > 0,
// additionally jumps by max(|x|, min_normal) * step_eps. For normals this is
// equivalent to x * (1 + eps). For denormals where x * eps < smallest_denormal,
// the min_normal floor ensures we still skip through the denormal region at a
// rate matching the smallest normals rather than stalling at 1 ULP per step.
template <typename Scalar>
static inline Scalar advance_by_step(Scalar x, double step_eps) {
static inline Scalar advance_positive(Scalar x, double step_eps) {
Scalar next = std::nextafter(x, std::numeric_limits<Scalar>::infinity());
if (step_eps > 0.0 && std::isfinite(next)) {
// Try to jump further by a relative amount.
Scalar jumped = next > 0 ? next * static_cast<Scalar>(1.0 + step_eps) : next / static_cast<Scalar>(1.0 + step_eps);
// Use the jump only if it actually advances further (handles denormal stalling).
Scalar base = std::max(next, std::numeric_limits<Scalar>::min());
Scalar jumped = next + base * static_cast<Scalar>(step_eps);
if (jumped > next) next = jumped;
}
return next;
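A self-contained illustration of the stepping logic above, specialized to float (the constants are exact properties of IEEE-754 binary32; the function name is hypothetical):

#include <cfloat>
#include <cmath>
#include <limits>

float advance_positive_sketch(float x, double step_eps) {
  float next = std::nextafterf(x, std::numeric_limits<float>::infinity());
  if (step_eps > 0.0 && std::isfinite(next)) {
    // For denormals, next * step_eps would underflow below 1 ULP and the walk
    // would stall; flooring the step at FLT_MIN * step_eps keeps it moving.
    float base = std::fmax(next, FLT_MIN);  // FLT_MIN == smallest normal, ~1.18e-38
    float jumped = next + base * static_cast<float>(step_eps);
    if (jumped > next) next = jumped;
  }
  return next;
}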
@@ -281,26 +283,60 @@ static double linear_to_scalar(int64_t lin, double /*tag*/) {
// Dynamic work queue: threads atomically claim chunks for load balancing
// ============================================================================

// Work queue that distributes chunks in positive absolute-value linear space.
// Iteration goes outward from 0: the worker tests both +|x| and -|x| for
// each sampled magnitude, so the multiplicative step (1 + eps) always works
// cleanly — no special handling for negative values needed.
template <typename Scalar>
struct WorkQueue {
int64_t range_hi_lin;
int64_t chunk_size;
double step_eps;
std::atomic<int64_t> next_lin;
Scalar orig_lo; // original range for sign filtering
Scalar orig_hi;
bool test_pos; // whether any positive values are in [lo, hi]
bool test_neg; // whether any negative values are in [lo, hi]

void init(Scalar lo, Scalar hi, int64_t csz, double step) {
range_hi_lin = scalar_to_linear(hi);
chunk_size = csz;
void init(Scalar lo, Scalar hi, int num_threads, double step) {
orig_lo = lo;
orig_hi = hi;
test_pos = (hi >= Scalar(0));
test_neg = (lo < Scalar(0));

// Compute absolute-value iteration range.
Scalar abs_lo, abs_hi;
if (lo <= Scalar(0) && hi >= Scalar(0)) {
abs_lo = Scalar(0);
abs_hi = std::max(std::abs(lo), hi);
} else {
abs_lo = std::min(std::abs(lo), std::abs(hi));
abs_hi = std::max(std::abs(lo), std::abs(hi));
}

range_hi_lin = scalar_to_linear(abs_hi);
step_eps = step;
next_lin.store(scalar_to_linear(lo), std::memory_order_relaxed);
next_lin.store(scalar_to_linear(abs_lo), std::memory_order_relaxed);

uint64_t total_abs = count_scalars_in_range(abs_lo, abs_hi);
chunk_size = std::max(int64_t(1), static_cast<int64_t>(total_abs / (num_threads * 16)));
if (step > 0.0) {
// Ensure chunks are large enough that advance_positive's min_normal floor
// can actually skip the denormal region. The denormal region contains
// count_scalars_in_range(0, min_normal) ULPs; any chunk must span at
// least that many so the min_normal-based jump lands past chunk_hi.
int64_t denorm_span = static_cast<int64_t>(count_scalars_in_range(Scalar(0), std::numeric_limits<Scalar>::min()));
chunk_size = std::max(chunk_size, denorm_span);
}
}

// Claim the next chunk. Returns false when no work remains.
// Claim the next chunk of absolute values. Returns false when no work remains.
bool claim(Scalar& chunk_lo, Scalar& chunk_hi) {
int64_t lo_lin = next_lin.fetch_add(chunk_size, std::memory_order_relaxed);
if (lo_lin > range_hi_lin) return false;
int64_t hi_lin = lo_lin + chunk_size - 1;
if (hi_lin > range_hi_lin) hi_lin = range_hi_lin;
if (lo_lin > range_hi_lin || lo_lin < 0) return false;
// Compute hi_lin carefully to avoid int64_t overflow.
int64_t remaining = range_hi_lin - lo_lin;
int64_t hi_lin = (remaining < chunk_size - 1) ? range_hi_lin : lo_lin + chunk_size - 1;
chunk_lo = linear_to_scalar(lo_lin, Scalar(0));
chunk_hi = linear_to_scalar(hi_lin, Scalar(0));
return true;
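The lock-free claim above is just an atomic fetch_add over the linear index space: each worker reserves a disjoint range without any locking. A minimal standalone model of the scheme, with the same overflow-safe clamp (all names hypothetical):

#include <atomic>
#include <cstdint>

struct ChunkQueue {
  std::atomic<int64_t> next{0};
  int64_t hi = 0;     // inclusive upper bound of the linear range
  int64_t chunk = 1;  // chunk size, >= 1

  bool claim(int64_t& lo_out, int64_t& hi_out) {
    // fetch_add guarantees each worker gets a disjoint [lo, lo + chunk) slice.
    int64_t lo = next.fetch_add(chunk, std::memory_order_relaxed);
    if (lo > hi || lo < 0) return false;  // < 0 guards against fetch_add wraparound
    // Clamp via subtraction rather than lo + chunk - 1, which could overflow.
    int64_t remaining = hi - lo;
    hi_out = (remaining < chunk - 1) ? hi : lo + chunk - 1;
    lo_out = lo;
    return true;
  }
};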
@@ -322,8 +358,12 @@ static void worker(const FuncEntry<Scalar>& func, WorkQueue<Scalar>& queue, int
#ifdef EIGEN_HAS_MPFR
mpfr_t mp_in, mp_out;
if (use_mpfr) {
mpfr_init2(mp_in, 128);
mpfr_init2(mp_out, 128);
// Use 2x the mantissa bits of Scalar for the reference: 48 for float (24-bit
// mantissa), 106 for double (53-bit mantissa). This is sufficient for correctly-
// rounded results while keeping MPFR evaluation fast.
constexpr int kMpfrBits = std::is_same<Scalar, float>::value ? 48 : 106;
mpfr_init2(mp_in, kMpfrBits);
mpfr_init2(mp_out, kMpfrBits);
}
#else
(void)use_mpfr;
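A sketch of an MPFR reference evaluation at the precision chosen above (48 bits for float), using only documented MPFR calls; the function name and the choice of log as the example are illustrative:

#include <mpfr.h>

float reference_logf(float x) {
  mpfr_t in, out;
  mpfr_init2(in, 48);
  mpfr_init2(out, 48);
  mpfr_set_flt(in, x, MPFR_RNDN);          // exact: 48 bits > 24-bit float mantissa
  mpfr_log(out, in, MPFR_RNDN);            // rounded to 48 bits
  float r = mpfr_get_flt(out, MPFR_RNDN);  // final rounding down to float
  mpfr_clear(in);
  mpfr_clear(out);
  return r;
}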
@@ -348,32 +388,42 @@ static void worker(const FuncEntry<Scalar>& func, WorkQueue<Scalar>& queue, int
}
};

auto flush_batch = [&](int& idx) {
if (idx == 0) return;
for (int i = idx; i < batch_size; i++) input[i] = input[idx - 1];
func.eigen_eval(eigen_out, input);
process_batch(idx, input, eigen_out);
idx = 0;
};

auto push_value = [&](Scalar v, int& idx) {
input[idx++] = v;
if (idx == batch_size) flush_batch(idx);
};

Scalar chunk_lo, chunk_hi;
while (queue.claim(chunk_lo, chunk_hi)) {
int idx = 0;
Scalar x = chunk_lo;
Scalar abs_x = chunk_lo;
for (;;) {
input[idx] = x;
idx++;

if (idx == batch_size) {
func.eigen_eval(eigen_out, input);
process_batch(batch_size, input, eigen_out);
idx = 0;
// Test +|x| if positive values are in range.
if (queue.test_pos && abs_x >= queue.orig_lo && abs_x <= queue.orig_hi) {
push_value(abs_x, idx);
}
// Test -|x| if negative values are in range (skip -0 to avoid testing 0 twice).
if (queue.test_neg && abs_x != Scalar(0)) {
Scalar neg_x = -abs_x;
if (neg_x >= queue.orig_lo && neg_x <= queue.orig_hi) {
push_value(neg_x, idx);
}
}

if (x >= chunk_hi) break;
Scalar next = advance_by_step(x, queue.step_eps);
x = (next > chunk_hi) ? chunk_hi : next;
if (abs_x >= chunk_hi) break;
Scalar next = advance_positive(abs_x, queue.step_eps);
abs_x = (next > chunk_hi) ? chunk_hi : next;
}

// Process remaining partial batch. Pad unused slots with the last valid
// input so the full-size vectorized eval doesn't read uninitialized memory.
if (idx > 0) {
for (int i = idx; i < batch_size; i++) input[i] = input[idx - 1];
func.eigen_eval(eigen_out, input);
process_batch(idx, input, eigen_out);
}
flush_batch(idx);
}

#ifdef EIGEN_HAS_MPFR
@@ -439,11 +489,12 @@ static int run_test(const Options& opts) {
std::printf("Function: %s (%s)\n", opts.func_name.c_str(), kTypeName);
std::printf("Range: [%.*g, %.*g]\n", kDigits, double(lo), kDigits, double(hi));
if (opts.step_eps > 0.0) {
std::printf("Sampling step: (1 + %g) * nextafter(x)\n", opts.step_eps);
std::printf("Sampling step: |x| * (1 + %g)\n", opts.step_eps);
} else {
std::printf("Representable values in range: %lu\n", static_cast<unsigned long>(total_scalars));
}
std::printf("Reference: %s\n", opts.use_mpfr ? "MPFR (128-bit)" : "std C++ math");
std::printf("Reference: %s\n",
opts.use_mpfr ? (opts.use_double ? "MPFR (106-bit)" : "MPFR (48-bit)") : "std C++ math");
std::printf("Threads: %d\n", num_threads);
std::printf("Batch size: %d\n", opts.batch_size);
std::printf("\n");
@@ -459,13 +510,8 @@ static int run_test(const Options& opts) {
results.back()->init(opts.hist_width);
}

// Use dynamic work distribution: threads claim small chunks from a shared
// queue. This ensures even load balancing regardless of how per-value
// work varies across the range (e.g. log on negatives is trivial).
// Choose chunk_size so we get ~16 chunks per thread for good balancing.
int64_t chunk_size = std::max(int64_t(1), static_cast<int64_t>(total_scalars / (num_threads * 16)));
WorkQueue<Scalar> queue;
queue.init(lo, hi, chunk_size, opts.step_eps);
queue.init(lo, hi, num_threads, opts.step_eps);

std::vector<std::thread> threads;
auto start_time = std::chrono::steady_clock::now();

@@ -393,8 +393,7 @@ __device__ EIGEN_STRONG_INLINE void EigenContractionKernelInternal(const LhsMapp
// the sum across all big k blocks of the product of little k block of index (x, y)
// with block of index (y, z). To compute the final output, we need to reduce
// the 8 threads over y by summation.
// HIP uses non-sync warp shuffles; CUDA requires the _sync variants.
#if defined(EIGEN_HIPCC)
#if defined(EIGEN_HIPCC) || (defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000)
#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor(res(i, j), mask)
#else
#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor_sync(0xFFFFFFFF, res(i, j), mask)
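The shuffleInc macro above builds an XOR-butterfly reduction: repeated exchanges with lane masks 1, 2, 4 sum 8 lanes so that every participating lane ends up with the total. A standalone sketch of the same pattern using the CUDA 9+ _sync variant (the function name is hypothetical; the full mask assumes all 32 lanes are active):

__device__ float butterfly_sum8(float v) {
  v += __shfl_xor_sync(0xFFFFFFFF, v, 1);  // pair lanes 0<->1, 2<->3, ...
  v += __shfl_xor_sync(0xFFFFFFFF, v, 2);  // pair groups of 2
  v += __shfl_xor_sync(0xFFFFFFFF, v, 4);  // pair groups of 4
  return v;  // sum of 8 consecutive lanes, replicated across all of them
}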
@@ -623,7 +622,7 @@ __device__ __forceinline__ void EigenFloatContractionKernelInternal16x16(const L
x1 = rhs_pf0.x;
x2 = rhs_pf0.z;
}
#if defined(EIGEN_HIPCC)
#if defined(EIGEN_HIPCC) || (defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000)
x1 = __shfl_xor(x1, 4);
x2 = __shfl_xor(x2, 4);
#else
@@ -1378,6 +1377,13 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
this->m_right_contracting_strides, this->m_k_strides);

OutputMapper output(buffer, m);

#if defined(EIGEN_USE_HIP)
setGpuSharedMemConfig(hipSharedMemBankSizeEightByte);
#else
setGpuSharedMemConfig(cudaSharedMemBankSizeEightByte);
#endif

LaunchKernels<LhsScalar, RhsScalar, Index, LhsMapper, RhsMapper, OutputMapper>::Run(lhs, rhs, output, m, n, k,
this->m_device);
}

@@ -89,7 +89,7 @@ class IndexMapper {
}
} else {
for (int i = NumDims - 1; i >= 0; --i) {
if (i + 1 < static_cast<int>(offset)) {
if (static_cast<size_t>(i + 1) < offset) {
m_gpuInputStrides[i] = m_gpuInputStrides[i + 1] * gpuInputDimensions[i + 1];
m_gpuOutputStrides[i] = m_gpuOutputStrides[i + 1] * gpuOutputDimensions[i + 1];
} else {

@@ -342,6 +342,19 @@ struct GpuDevice {

#endif

// FIXME: Should be device and kernel specific.
#ifdef EIGEN_GPUCC
static EIGEN_DEVICE_FUNC inline void setGpuSharedMemConfig(gpuSharedMemConfig config) {
#ifndef EIGEN_GPU_COMPILE_PHASE
gpuError_t status = gpuDeviceSetSharedMemConfig(config);
EIGEN_UNUSED_VARIABLE(status);
gpu_assert(status == gpuSuccess);
#else
EIGEN_UNUSED_VARIABLE(config);
#endif
}
#endif

} // end namespace Eigen

// undefine all the gpu* macros we defined at the beginning of the file

@@ -175,7 +175,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T loadConstant(const T* address) {
return *address;
}
// Use the texture cache on CUDA devices whenever possible
#if defined(EIGEN_CUDA_ARCH)
#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float loadConstant(const float* address) {
return __ldg(address);

@@ -49,7 +49,7 @@ struct PacketType : internal::packet_traits<Scalar> {
};

// For CUDA packet types when using a GpuDevice
#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPU_COMPILE_PHASE)
#if defined(EIGEN_USE_GPU) && defined(EIGEN_HAS_GPU_FP16) && defined(EIGEN_GPU_COMPILE_PHASE)

typedef ulonglong2 Packet4h2;
template <>

@@ -453,7 +453,7 @@ template <int B, int N, typename S, typename R, typename I_>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*,
unsigned int*);

#if defined(EIGEN_GPUCC)
#if defined(EIGEN_HAS_GPU_FP16)
template <typename S, typename R, typename I_>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat(
R, const S, I_, internal::packet_traits<half>::type*);
@@ -883,7 +883,7 @@ struct TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, M
#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
template <int B, int N, typename S, typename R, typename I_>
KERNEL_FRIEND void internal::FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*, unsigned int*);
#if defined(EIGEN_GPUCC)
#if defined(EIGEN_HAS_GPU_FP16)
template <typename S, typename R, typename I_>
KERNEL_FRIEND void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I_,
internal::packet_traits<Eigen::half>::type*);

@@ -25,6 +25,7 @@ namespace internal {
// updated the content of the output address it will try again.
template <typename T, typename R>
__device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) {
#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
if (sizeof(T) == 4) {
unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
unsigned int newval = oldval;
@@ -60,6 +61,12 @@ __device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer)
} else {
gpu_assert(0 && "Wordsize not supported");
}
#else // EIGEN_CUDA_ARCH >= 300
EIGEN_UNUSED_VARIABLE(output);
EIGEN_UNUSED_VARIABLE(accum);
EIGEN_UNUSED_VARIABLE(reducer);
gpu_assert(0 && "Shouldn't be called on unsupported device");
#endif // EIGEN_CUDA_ARCH >= 300
}

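The generic atomicReduce above works by a compare-and-swap retry loop: read the current value, fold in the accumulator, and attempt to publish; if another thread raced in first, retry with the refreshed value. A standalone sketch of that loop specialized to a float max (names hypothetical; atomicCAS operates on 32-bit words, hence the bit reinterpretation):

__device__ void atomic_max_float(float* output, float value) {
  int* addr = reinterpret_cast<int*>(output);
  int old_bits = *addr;
  while (true) {
    float old_val = __int_as_float(old_bits);
    if (old_val >= value) return;  // nothing to publish
    int assumed = old_bits;
    old_bits = atomicCAS(addr, assumed, __float_as_int(value));
    if (old_bits == assumed) return;  // our write landed
    // Otherwise another thread updated the slot first: retry with its value.
  }
}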
// We extend atomicExch to support extra data types
@@ -68,42 +75,13 @@ __device__ inline Type atomicExchCustom(Type* address, Type val) {
return atomicExch(address, val);
}

template <typename T>
EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR auto reduction_shuffle_mask() {
#if defined(EIGEN_HIP_DEVICE_COMPILE)
return 0xFFFFFFFFFFFFFFFFull;
#else
return 0xFFFFFFFFu;
#endif
}

template <typename T>
__device__ EIGEN_ALWAYS_INLINE T reduction_shuffle_down(T value, int offset) {
return __shfl_down_sync(reduction_shuffle_mask<T>(), value, offset, warpSize);
}

template <>
__device__ EIGEN_ALWAYS_INLINE int reduction_shuffle_down<int>(int value, int offset) {
return __shfl_down_sync(reduction_shuffle_mask<int>(), value, offset, warpSize);
}

template <>
__device__ EIGEN_ALWAYS_INLINE float reduction_shuffle_down<float>(float value, int offset) {
return __shfl_down_sync(reduction_shuffle_mask<float>(), value, offset, warpSize);
}

template <>
__device__ EIGEN_ALWAYS_INLINE double reduction_shuffle_down<double>(double value, int offset) {
return __shfl_down_sync(reduction_shuffle_mask<double>(), value, offset, warpSize);
}

template <>
__device__ inline double atomicExchCustom(double* address, double val) {
unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(address);
return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val)));
}

// Half-float reduction specializations.
#ifdef EIGEN_HAS_GPU_FP16
template <typename R>
__device__ inline void atomicReduce(half2* output, half2 accum, R& reducer) {
unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
@@ -133,10 +111,17 @@ __device__ inline void atomicReduce(Packet4h2* output, Packet4h2 accum, R& reduc
}
}
#endif // EIGEN_GPU_COMPILE_PHASE
#endif // EIGEN_HAS_GPU_FP16

template <>
__device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) {
#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
atomicAdd(output, accum);
#else // EIGEN_CUDA_ARCH >= 300
EIGEN_UNUSED_VARIABLE(output);
EIGEN_UNUSED_VARIABLE(accum);
gpu_assert(0 && "Shouldn't be called on unsupported device");
#endif // EIGEN_CUDA_ARCH >= 300
}

template <typename CoeffType, typename Index>
@@ -153,6 +138,7 @@ template <int BlockSize, int NumPerThread, typename Self, typename Reducer, type
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
typename Self::CoeffReturnType* output,
unsigned int* semaphore) {
#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
// Initialize the output value
const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x;
if (gridDim.x == 1) {
@@ -193,7 +179,20 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(Reducer reducer

#pragma unroll
for (int offset = warpSize / 2; offset > 0; offset /= 2) {
reducer.reduce(reduction_shuffle_down(accum, offset), &accum);
#if defined(EIGEN_HIPCC)
// use std::is_floating_point to determine the type of reduced_val
// This is needed because when Type == double, hipcc will give a "call to __shfl_down is ambiguous" error
// and list the float and int versions of __shfl_down as the candidate functions.
if (std::is_floating_point<typename Self::CoeffReturnType>::value) {
reducer.reduce(__shfl_down(static_cast<float>(accum), offset, warpSize), &accum);
} else {
reducer.reduce(__shfl_down(static_cast<int>(accum), offset, warpSize), &accum);
}
#elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
reducer.reduce(__shfl_down(accum, offset, warpSize), &accum);
#else
reducer.reduce(__shfl_down_sync(0xFFFFFFFF, accum, offset, warpSize), &accum);
#endif
}

if ((threadIdx.x & (warpSize - 1)) == 0) {
@@ -207,9 +206,17 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(Reducer reducer
__threadfence_system();
#endif
}
#else // EIGEN_CUDA_ARCH >= 300
EIGEN_UNUSED_VARIABLE(reducer);
EIGEN_UNUSED_VARIABLE(input);
EIGEN_UNUSED_VARIABLE(num_coeffs);
EIGEN_UNUSED_VARIABLE(output);
EIGEN_UNUSED_VARIABLE(semaphore);
gpu_assert(0 && "Shouldn't be called on unsupported device");
#endif // EIGEN_CUDA_ARCH >= 300
}

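The offset-halving shuffle loop in the kernel above is a standard warp-level tree reduction: each step folds the upper half of the active lanes onto the lower half, so after log2(warp width) steps lane 0 holds the warp-wide result. A standalone sketch for a sum, assuming a 32-lane warp (the function name is hypothetical):

__device__ float warp_reduce_sum(float accum) {
#pragma unroll
  for (int offset = 16; offset > 0; offset /= 2) {
    // Fold the value held by the lane `offset` positions above into this lane.
    accum += __shfl_down_sync(0xFFFFFFFF, accum, offset, 32);
  }
  return accum;  // valid in lane 0 of each warp
}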
// Half-float reduction specializations.
#ifdef EIGEN_HAS_GPU_FP16
template <typename Self, typename Reducer, typename Index>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Self input,
Index num_coeffs, half* scratch) {
@@ -312,6 +319,14 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(Reduce
hr[i] = wka_out.h;
}
reducer.reducePacket(r1, &accum);
#elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
PacketType r1;
half2* hr = reinterpret_cast<half2*>(&r1);
half2* hacc = reinterpret_cast<half2*>(&accum);
for (int i = 0; i < packet_width / 2; i++) {
hr[i] = __shfl_down(hacc[i], offset, warpSize);
}
reducer.reducePacket(r1, &accum);
#else
PacketType r1;
half2* hr = reinterpret_cast<half2*>(&r1);
@@ -362,6 +377,8 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionCleanupKernelHalfFloat(Op
}
}

#endif // EIGEN_HAS_GPU_FP16

template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
struct FullReductionLauncher {
static void run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index) {
@@ -392,7 +409,7 @@ struct FullReductionLauncher<
}
};

// Half-float reduction specializations.
#ifdef EIGEN_HAS_GPU_FP16
template <typename Self, typename Op>
struct FullReductionLauncher<Self, Op, Eigen::half, false> {
static void run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index) {
@@ -426,18 +443,24 @@ struct FullReductionLauncher<Self, Op, Eigen::half, true> {
}
}
};
#endif // EIGEN_HAS_GPU_FP16

template <typename Self, typename Op, bool Vectorizable>
struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
// Unfortunately nvidia doesn't support exotic types such as complex well,
// so we reduce the scope of the optimized version of the code to the simple cases
// of doubles, floats and half floats.
// Half-float reduction specializations.
#ifdef EIGEN_HAS_GPU_FP16
static constexpr bool HasOptimizedImplementation =
!Self::ReducerTraits::IsStateful && (internal::is_same<typename Self::CoeffReturnType, float>::value ||
internal::is_same<typename Self::CoeffReturnType, double>::value ||
(internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value &&
reducer_traits<Op, GpuDevice>::PacketAccess));
#else // EIGEN_HAS_GPU_FP16
static constexpr bool HasOptimizedImplementation =
!Self::ReducerTraits::IsStateful && (internal::is_same<typename Self::CoeffReturnType, float>::value ||
internal::is_same<typename Self::CoeffReturnType, double>::value);
#endif // EIGEN_HAS_GPU_FP16

template <typename OutputType>
static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
@@ -458,6 +481,7 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(Reducer reduce
Index num_coeffs_to_reduce,
Index num_preserved_coeffs,
typename Self::CoeffReturnType* output) {
#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
typedef typename Self::CoeffReturnType Type;
eigen_assert(blockDim.y == 1);
eigen_assert(blockDim.z == 1);
@@ -510,7 +534,20 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(Reducer reduce

#pragma unroll
for (int offset = warpSize / 2; offset > 0; offset /= 2) {
reducer.reduce(reduction_shuffle_down(reduced_val, offset), &reduced_val);
#if defined(EIGEN_HIPCC)
// use std::is_floating_point to determine the type of reduced_val
// This is needed because when Type == double, hipcc will give a "call to __shfl_down is ambiguous" error
// and list the float and int versions of __shfl_down as the candidate functions.
if (std::is_floating_point<Type>::value) {
reducer.reduce(__shfl_down(static_cast<float>(reduced_val), offset), &reduced_val);
} else {
reducer.reduce(__shfl_down(static_cast<int>(reduced_val), offset), &reduced_val);
}
#elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
reducer.reduce(__shfl_down(reduced_val, offset), &reduced_val);
#else
reducer.reduce(__shfl_down_sync(0xFFFFFFFF, reduced_val, offset), &reduced_val);
#endif
}

if ((threadIdx.x & (warpSize - 1)) == 0) {
@@ -518,9 +555,17 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(Reducer reduce
}
}
}
#else // EIGEN_CUDA_ARCH >= 300
EIGEN_UNUSED_VARIABLE(reducer);
EIGEN_UNUSED_VARIABLE(input);
EIGEN_UNUSED_VARIABLE(num_coeffs_to_reduce);
EIGEN_UNUSED_VARIABLE(num_preserved_coeffs);
EIGEN_UNUSED_VARIABLE(output);
gpu_assert(0 && "Shouldn't be called on unsupported device");
#endif // EIGEN_CUDA_ARCH >= 300
}

// Half-float reduction specializations.
#ifdef EIGEN_HAS_GPU_FP16

template <int NumPerThread, typename Self, typename Reducer, typename Index>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(Reducer reducer, const Self input,
@@ -643,6 +688,19 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(Reduc
}
reducer.reducePacket(r1, &reduced_val1);
reducer.reducePacket(r2, &reduced_val2);
#elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
PacketType r1;
PacketType r2;
half2* hr1 = reinterpret_cast<half2*>(&r1);
half2* hr2 = reinterpret_cast<half2*>(&r2);
half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
for (int i = 0; i < packet_width / 2; i++) {
hr1[i] = __shfl_down(rv1[i], offset, warpSize);
hr2[i] = __shfl_down(rv2[i], offset, warpSize);
}
reducer.reducePacket(r1, &reduced_val1);
reducer.reducePacket(r2, &reduced_val2);
#else
PacketType r1;
PacketType r2;
@@ -683,6 +741,8 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(Reduc
}
}

#endif // EIGEN_HAS_GPU_FP16

template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
struct InnerReductionLauncher {
static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index,
@@ -726,7 +786,7 @@ struct InnerReductionLauncher<
}
};

// Half-float reduction specializations.
#ifdef EIGEN_HAS_GPU_FP16
template <typename Self, typename Op>
struct InnerReductionLauncher<Self, Op, Eigen::half, false> {
static bool run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index, typename Self::Index) {
@@ -766,18 +826,24 @@ struct InnerReductionLauncher<Self, Op, Eigen::half, true> {
return false;
}
};
#endif // EIGEN_HAS_GPU_FP16

template <typename Self, typename Op>
struct InnerReducer<Self, Op, GpuDevice> {
// Unfortunately nvidia doesn't support exotic types such as complex well,
// so we reduce the scope of the optimized version of the code to the simple case
// of floats and half floats.
// Half-float reduction specializations.
#ifdef EIGEN_HAS_GPU_FP16
static constexpr bool HasOptimizedImplementation =
!Self::ReducerTraits::IsStateful && (internal::is_same<typename Self::CoeffReturnType, float>::value ||
internal::is_same<typename Self::CoeffReturnType, double>::value ||
(internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value &&
reducer_traits<Op, GpuDevice>::PacketAccess));
#else // EIGEN_HAS_GPU_FP16
static constexpr bool HasOptimizedImplementation =
!Self::ReducerTraits::IsStateful && (internal::is_same<typename Self::CoeffReturnType, float>::value ||
internal::is_same<typename Self::CoeffReturnType, double>::value);
#endif // EIGEN_HAS_GPU_FP16

template <typename OutputType>
static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output,

@@ -237,7 +237,7 @@ if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8" AND NOT CMAKE_CXX_COMPILER_ID STREQUAL "MS
ei_add_test(cxx11_tensor_uint128)
endif()

find_package(CUDA 11.4)
find_package(CUDA 9.0)
if(CUDA_FOUND AND EIGEN_TEST_CUDA)
# Make sure to compile without the -pedantic, -Wundef, -Wnon-virtual-dtor
# and -fno-check-new flags since they trigger thousands of compilation warnings
@@ -281,11 +281,26 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA)
ei_add_test(cxx11_tensor_argmax_gpu)
ei_add_test(cxx11_tensor_cast_float16_gpu)
ei_add_test(cxx11_tensor_scan_gpu)
ei_add_test(cxx11_tensor_device)
ei_add_test(cxx11_tensor_gpu)
ei_add_test(cxx11_tensor_contract_gpu)
ei_add_test(cxx11_tensor_of_float16_gpu)
ei_add_test(cxx11_tensor_random_gpu)

set(EIGEN_CUDA_OLDEST_COMPUTE_ARCH 9999)
foreach(ARCH IN LISTS EIGEN_CUDA_COMPUTE_ARCH)
if(${ARCH} LESS ${EIGEN_CUDA_OLDEST_COMPUTE_ARCH})
set(EIGEN_CUDA_OLDEST_COMPUTE_ARCH ${ARCH})
endif()
endforeach()

# Contractions require arch 3.0 or higher
if (${EIGEN_CUDA_OLDEST_COMPUTE_ARCH} GREATER 29)
ei_add_test(cxx11_tensor_device)
ei_add_test(cxx11_tensor_gpu)
ei_add_test(cxx11_tensor_contract_gpu)
ei_add_test(cxx11_tensor_of_float16_gpu)
endif()

# The random number generation code requires arch 3.5 or greater.
if (${EIGEN_CUDA_OLDEST_COMPUTE_ARCH} GREATER 34)
ei_add_test(cxx11_tensor_random_gpu)
endif()

unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
endif()
@@ -326,6 +341,7 @@ if (EIGEN_TEST_HIP)
ei_add_test(cxx11_tensor_cast_float16_gpu)
ei_add_test(cxx11_tensor_scan_gpu)
ei_add_test(cxx11_tensor_device)

ei_add_test(cxx11_tensor_gpu)
ei_add_test(cxx11_tensor_contract_gpu)
ei_add_test(cxx11_tensor_of_float16_gpu)

@@ -850,7 +850,6 @@ void test_gpu_igamma() {
Tensor<Scalar, 2> a(6, 6);
Tensor<Scalar, 2> x(6, 6);
Tensor<Scalar, 2> out(6, 6);
Tensor<Scalar, 2> expected_out(6, 6);
out.setZero();

Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
@@ -863,11 +862,14 @@ void test_gpu_igamma() {
}
}

for (int i = 0; i < 6; ++i) {
for (int j = 0; j < 6; ++j) {
expected_out(i, j) = numext::igamma(a(i, j), x(i, j));
}
}
Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
Scalar igamma_s[][6] = {
{0.0, nan, nan, nan, nan, nan},
{0.0, 0.6321205588285578, 0.7768698398515702, 0.9816843611112658, 9.999500016666262e-05, 1.0},
{0.0, 0.4275932955291202, 0.608374823728911, 0.9539882943107686, 7.522076445089201e-07, 1.0},
{0.0, 0.01898815687615381, 0.06564245437845008, 0.5665298796332909, 4.166333347221828e-18, 1.0},
{0.0, 0.9999780593618628, 0.9999899967080838, 0.9999996219837988, 0.9991370418689945, 1.0},
{0.0, 0.0, 0.0, 0.0, 0.0, 0.5042041932513908}};

std::size_t bytes = a.size() * sizeof(Scalar);

@@ -895,10 +897,10 @@ void test_gpu_igamma() {

for (int i = 0; i < 6; ++i) {
for (int j = 0; j < 6; ++j) {
if ((std::isnan)(expected_out(i, j))) {
if ((std::isnan)(igamma_s[i][j])) {
VERIFY((std::isnan)(out(i, j)));
} else {
VERIFY_IS_APPROX(out(i, j), expected_out(i, j));
VERIFY_IS_APPROX(out(i, j), igamma_s[i][j]);
}
}
}
@@ -913,7 +915,6 @@ void test_gpu_igammac() {
Tensor<Scalar, 2> a(6, 6);
Tensor<Scalar, 2> x(6, 6);
Tensor<Scalar, 2> out(6, 6);
Tensor<Scalar, 2> expected_out(6, 6);
out.setZero();

Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
@@ -926,11 +927,14 @@ void test_gpu_igammac() {
}
}

for (int i = 0; i < 6; ++i) {
for (int j = 0; j < 6; ++j) {
expected_out(i, j) = numext::igammac(a(i, j), x(i, j));
}
}
Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
Scalar igammac_s[][6] = {
{nan, nan, nan, nan, nan, nan},
{1.0, 0.36787944117144233, 0.22313016014842982, 0.018315638888734182, 0.9999000049998333, 0.0},
{1.0, 0.5724067044708798, 0.3916251762710878, 0.04601170568923136, 0.9999992477923555, 0.0},
{1.0, 0.9810118431238462, 0.9343575456215499, 0.4334701203667089, 1.0, 0.0},
{1.0, 2.1940638138146658e-05, 1.0003291916285e-05, 3.7801620118431334e-07, 0.0008629581310054535, 0.0},
{1.0, 1.0, 1.0, 1.0, 1.0, 0.49579580674813944}};

std::size_t bytes = a.size() * sizeof(Scalar);

@@ -958,10 +962,10 @@ void test_gpu_igammac() {

for (int i = 0; i < 6; ++i) {
for (int j = 0; j < 6; ++j) {
if ((std::isnan)(expected_out(i, j))) {
if ((std::isnan)(igammac_s[i][j])) {
VERIFY((std::isnan)(out(i, j)));
} else {
VERIFY_IS_APPROX(out(i, j), expected_out(i, j));
VERIFY_IS_APPROX(out(i, j), igammac_s[i][j]);
}
}
}
@@ -1064,9 +1068,15 @@ void test_gpu_ndtri() {
in_x(7) = Scalar(0.99);
in_x(8) = Scalar(0.01);

for (int i = 0; i < 9; ++i) {
expected_out(i) = numext::ndtri(in_x(i));
}
expected_out(0) = std::numeric_limits<Scalar>::infinity();
expected_out(1) = -std::numeric_limits<Scalar>::infinity();
expected_out(2) = Scalar(0.0);
expected_out(3) = Scalar(-0.8416212335729142);
expected_out(4) = Scalar(0.8416212335729142);
expected_out(5) = Scalar(1.2815515655446004);
expected_out(6) = Scalar(-1.2815515655446004);
expected_out(7) = Scalar(2.3263478740408408);
expected_out(8) = Scalar(-2.3263478740408408);

std::size_t bytes = in_x.size() * sizeof(Scalar);

@@ -1080,15 +1090,15 @@ void test_gpu_ndtri() {
Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);

Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 9);
Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 9);
Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 6);
Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 6);

gpu_out.device(gpu_device) = gpu_in_x.ndtri();

assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);

for (int i = 0; i < 9; ++i) {
for (int i = 0; i < 6; ++i) {
VERIFY_IS_CWISE_APPROX(out(i), expected_out(i));
}

@@ -1105,9 +1115,12 @@ void test_gpu_betainc() {
Tensor<Scalar, 1> expected_out(125);
out.setZero();

Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();

Array<Scalar, 1, Dynamic> x(125);
Array<Scalar, 1, Dynamic> a(125);
Array<Scalar, 1, Dynamic> b(125);
Array<Scalar, 1, Dynamic> v(125);

a << 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
@@ -1147,11 +1160,25 @@ void test_gpu_betainc() {
0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8,
1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1;

v << nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
nan, nan, nan, nan, nan, nan, nan, nan, nan, 0.47972119876364683, 0.5, 0.5202788012363533, nan, nan,
0.9518683957740043, 0.9789663010413743, 0.9931729188073435, nan, nan, 0.999995949033062, 0.9999999999993698,
0.9999999999999999, nan, nan, 0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan, nan, nan, nan, nan,
nan, nan, 0.006827081192655869, 0.0210336989586256, 0.04813160422599567, nan, nan, 0.20014344256217678,
0.5000000000000001, 0.7998565574378232, nan, nan, 0.9991401428435834, 0.999999999698403, 0.9999999999999999, nan,
nan, 0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan, nan, nan, nan, nan, nan, nan,
1.0646600232370887e-25, 6.301722877826246e-13, 4.050966937974938e-06, nan, nan, 7.864342668429763e-23,
3.015969667594166e-10, 0.0008598571564165444, nan, nan, 6.031987710123844e-08, 0.5000000000000007,
0.9999999396801229, nan, nan, 0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan, nan, nan, nan, nan,
nan, nan, 0.0, 7.029920380986636e-306, 2.2450728208591345e-101, nan, nan, 0.0, 9.275871147869727e-302,
1.2232913026152827e-97, nan, nan, 0.0, 3.0891393081932924e-252, 2.9303043666183996e-60, nan, nan,
2.248913486879199e-196, 0.5000000000004947, 0.9999999999999999, nan;

for (int i = 0; i < 125; ++i) {
in_x(i) = x(i);
in_a(i) = a(i);
in_b(i) = b(i);
expected_out(i) = numext::betainc(a(i), b(i), x(i));
expected_out(i) = v(i);
}

std::size_t bytes = in_x.size() * sizeof(Scalar);

@@ -53,6 +53,8 @@ void test_gpu_numext() {
gpu_device.deallocate(d_res_float);
}

#ifdef EIGEN_HAS_GPU_FP16

template <typename>
void test_gpu_conversion() {
Eigen::GpuStreamDevice stream;
@@ -440,10 +442,12 @@ void test_gpu_forced_evals() {
gpu_device.deallocate(d_res_half2);
gpu_device.deallocate(d_res_float);
}
#endif

EIGEN_DECLARE_TEST(cxx11_tensor_of_float16_gpu) {
CALL_SUBTEST_1(test_gpu_numext<void>());

#ifdef EIGEN_HAS_GPU_FP16
CALL_SUBTEST_1(test_gpu_conversion<void>());
CALL_SUBTEST_1(test_gpu_unary<void>());
CALL_SUBTEST_1(test_gpu_elementwise<void>());
@@ -452,4 +456,7 @@ EIGEN_DECLARE_TEST(cxx11_tensor_of_float16_gpu) {
CALL_SUBTEST_3(test_gpu_reductions<void>());
CALL_SUBTEST_4(test_gpu_full_reductions<void>());
CALL_SUBTEST_5(test_gpu_forced_evals<void>());
#else
std::cout << "Half floats are not supported by this version of gpu: skipping the test" << std::endl;
#endif
}