diff --git a/CMakeLists.txt b/CMakeLists.txt index acd0e7116..57edb5d15 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -672,7 +672,7 @@ if (EIGEN_BUILD_TESTING) endif() set(EIGEN_CUDA_CXX_FLAGS "" CACHE STRING "Additional flags to pass to the cuda compiler.") - set(EIGEN_CUDA_COMPUTE_ARCH 30 CACHE STRING "The CUDA compute architecture(s) to target when compiling CUDA code") + set(EIGEN_CUDA_COMPUTE_ARCH 70 CACHE STRING "The CUDA compute architecture(s) to target when compiling CUDA code") option(EIGEN_TEST_SYCL "Add Sycl support." OFF) if(EIGEN_TEST_SYCL) @@ -817,4 +817,3 @@ endif() message(STATUS "") message(STATUS "Configured Eigen ${EIGEN_VERSION_STRING}") message(STATUS "") - diff --git a/Eigen/Core b/Eigen/Core index fa33b9672..653c266bf 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -50,9 +50,9 @@ #include "src/Core/util/AOCL_Support.h" -#if defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16) -#define EIGEN_HAS_GPU_FP16 -#endif +// EIGEN_HAS_GPU_FP16 has been removed: GPU FP16 support is now unconditional when compiling with CUDA or HIP. +// Use EIGEN_GPUCC (any pass of a GPU compiler) or EIGEN_GPU_COMPILE_PHASE (device pass only) instead. +// TODO: Remove EIGEN_HAS_GPU_BF16 similarly once HIP bf16 guards are cleaned up. #if defined(EIGEN_HAS_CUDA_BF16) || defined(EIGEN_HAS_HIP_BF16) #define EIGEN_HAS_GPU_BF16 diff --git a/Eigen/src/Core/arch/Default/BFloat16.h b/Eigen/src/Core/arch/Default/BFloat16.h index 27dcc8038..83098a5f7 100644 --- a/Eigen/src/Core/arch/Default/BFloat16.h +++ b/Eigen/src/Core/arch/Default/BFloat16.h @@ -858,16 +858,8 @@ struct hash { } // namespace std #endif -// Add the missing shfl* intrinsics. -// The __shfl* functions are only valid on HIP or _CUDA_ARCH_ >= 300. -// CUDA defines them for (__CUDA_ARCH__ >= 300 || !defined(__CUDA_ARCH__)) -// -// HIP and CUDA prior to SDK 9.0 define -// __shfl, __shfl_up, __shfl_down, __shfl_xor for int and float -// CUDA since 9.0 deprecates those and instead defines -// __shfl_sync, __shfl_up_sync, __shfl_down_sync, __shfl_xor_sync, -// with native support for __half and __nv_bfloat16 -// +// Warp shuffle overloads for Eigen::bfloat16. +// HIP uses non-sync __shfl variants; CUDA has native __nv_bfloat16 support in __shfl_sync. // Note that the following are __device__ - only functions. #if defined(EIGEN_HIPCC) diff --git a/Eigen/src/Core/arch/Default/Half.h b/Eigen/src/Core/arch/Default/Half.h index 70ee8f900..aa9dc831d 100644 --- a/Eigen/src/Core/arch/Default/Half.h +++ b/Eigen/src/Core/arch/Default/Half.h @@ -45,7 +45,7 @@ // Eigen with GPU support. // Any functions that require `numext::bit_cast` may also not be constexpr, // including any native types when setting via raw bit values. -#if defined(EIGEN_HAS_GPU_FP16) || defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) || defined(EIGEN_HAS_BUILTIN_FLOAT16) +#if defined(EIGEN_GPUCC) || defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) || defined(EIGEN_HAS_BUILTIN_FLOAT16) #define _EIGEN_MAYBE_CONSTEXPR #else #define _EIGEN_MAYBE_CONSTEXPR constexpr @@ -121,12 +121,12 @@ namespace half_impl { // // Making the host side compile phase of hipcc use the same Eigen::half impl as the gcc compile resolves // this error, and hence the following convoluted #if condition -#if !defined(EIGEN_HAS_GPU_FP16) || !defined(EIGEN_GPU_COMPILE_PHASE) +#if !defined(EIGEN_GPUCC) || !defined(EIGEN_GPU_COMPILE_PHASE) // Make our own __half_raw definition that is similar to CUDA's.
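For context (not part of the patch): a minimal sketch of how the two guards named in the new Eigen/Core comment behave. EIGEN_GPUCC and EIGEN_GPU_COMPILE_PHASE are Eigen's existing macros.

// EIGEN_GPUCC: defined whenever a GPU compiler (nvcc, hipcc, or clang in CUDA
// mode) is driving the build, in BOTH the host and device passes.
// EIGEN_GPU_COMPILE_PHASE: defined only while the device pass is running.
#if defined(EIGEN_GPUCC)
#if defined(EIGEN_GPU_COMPILE_PHASE)
// Device pass: safe to call __device__ intrinsics such as __hadd or __ldg.
#else
// Host pass of a GPU build: only emulated fp16 operations are available.
#endif
#endif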
struct __half_raw { struct construct_from_rep_tag {}; -#if (defined(EIGEN_HAS_GPU_FP16) && !defined(EIGEN_GPU_COMPILE_PHASE)) +#if (defined(EIGEN_GPUCC) && !defined(EIGEN_GPU_COMPILE_PHASE)) // Eigen::half can be used as the datatype for shared memory declarations (in Eigen and TF) // The element type for shared memory cannot have non-trivial constructors // and hence the following special casing (which skips the zero-initialization). @@ -152,16 +152,12 @@ struct __half_raw { #endif }; -#elif defined(EIGEN_HAS_HIP_FP16) +#elif defined(EIGEN_HIPCC) // HIP GPU compile phase: nothing to do here. // HIP fp16 header file has a definition for __half_raw -#elif defined(EIGEN_HAS_CUDA_FP16) +#elif defined(EIGEN_CUDACC) // CUDA GPU compile phase. -#if EIGEN_CUDA_SDK_VER < 90000 -// In CUDA < 9.0, __half is the equivalent of CUDA 9's __half_raw -typedef __half __half_raw; -#endif // defined(EIGEN_HAS_CUDA_FP16) #elif defined(SYCL_DEVICE_ONLY) typedef cl::sycl::half __half_raw; @@ -175,15 +171,13 @@ struct half_base : public __half_raw { EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half_base() {} EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half_base(const __half_raw& h) : __half_raw(h) {} -#if defined(EIGEN_HAS_GPU_FP16) -#if defined(EIGEN_HAS_HIP_FP16) +#if defined(EIGEN_GPUCC) +#if defined(EIGEN_HIPCC) EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half_base(const __half& h) { x = __half_as_ushort(h); } -#elif defined(EIGEN_HAS_CUDA_FP16) -#if EIGEN_CUDA_SDK_VER >= 90000 +#elif defined(EIGEN_CUDACC) EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half_base(const __half& h) : __half_raw(*(__half_raw*)&h) {} #endif #endif -#endif }; } // namespace half_impl @@ -192,36 +186,29 @@ struct half_base : public __half_raw { struct half : public half_impl::half_base { // Writing this out as separate #if-else blocks to make the code easier to follow // The same applies to most #if-else blocks in this file -#if !defined(EIGEN_HAS_GPU_FP16) || !defined(EIGEN_GPU_COMPILE_PHASE) +#if !defined(EIGEN_GPUCC) || !defined(EIGEN_GPU_COMPILE_PHASE) // Use the same base class for the following two scenarios // * when compiling without GPU support enabled // * during host compile phase when compiling with GPU support enabled typedef half_impl::__half_raw __half_raw; -#elif defined(EIGEN_HAS_HIP_FP16) +#elif defined(EIGEN_HIPCC) // Nothing to do here // HIP fp16 header file has a definition for __half_raw -#elif defined(EIGEN_HAS_CUDA_FP16) -// Note that EIGEN_CUDA_SDK_VER is set to 0 even when compiling with HIP, so -// (EIGEN_CUDA_SDK_VER < 90000) is true even for HIP! So keeping this within -// #if defined(EIGEN_HAS_CUDA_FP16) is needed -#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000 - typedef half_impl::__half_raw __half_raw; -#endif +#elif defined(EIGEN_CUDACC) + // Nothing to do here.
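The shared-memory remark above is what forces the trivial default constructor; a minimal sketch of the constraint, assuming a hypothetical CUDA/HIP kernel launched with 64 threads:

#include <Eigen/Core>

// __shared__ arrays require an element type with a trivial default
// constructor, and Eigen::half is used exactly this way in Eigen and TF.
__global__ void uses_half_shared_memory(Eigen::half* out) {
  __shared__ Eigen::half tile[64];  // ill-formed if the constructor zero-initialized
  tile[threadIdx.x] = Eigen::half(1.0f);
  __syncthreads();
  out[threadIdx.x] = tile[63 - threadIdx.x];
}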
#endif EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half() {} EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half(const __half_raw& h) : half_impl::half_base(h) {} -#if defined(EIGEN_HAS_GPU_FP16) -#if defined(EIGEN_HAS_HIP_FP16) +#if defined(EIGEN_GPUCC) +#if defined(EIGEN_HIPCC) EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half(const __half& h) : half_impl::half_base(h) {} -#elif defined(EIGEN_HAS_CUDA_FP16) -#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000 +#elif defined(EIGEN_CUDACC) EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half(const __half& h) : half_impl::half_base(h) {} #endif #endif -#endif #if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) explicit EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half(__fp16 b) @@ -248,7 +235,7 @@ struct half : public half_impl::half_base { return half_impl::half_to_float(*this); } -#if defined(EIGEN_HAS_GPU_FP16) && !defined(EIGEN_GPU_COMPILE_PHASE) +#if defined(EIGEN_GPUCC) && !defined(EIGEN_GPU_COMPILE_PHASE) EIGEN_DEVICE_FUNC operator __half() const { ::__half_raw hr; hr.x = x; @@ -380,8 +367,7 @@ namespace Eigen { namespace half_impl { -#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \ - (defined(EIGEN_HAS_HIP_FP16) && defined(HIP_DEVICE_COMPILE)) +#if defined(EIGEN_GPU_COMPILE_PHASE) // Note: We deliberately do *not* define this to 1 even if we have Arm's native // fp16 type since GPU half types are rather different from native CPU half types. #define EIGEN_HAS_NATIVE_GPU_FP16 @@ -393,24 +379,10 @@ namespace half_impl { // conversion steps back and forth. #if defined(EIGEN_HAS_NATIVE_GPU_FP16) -EIGEN_STRONG_INLINE __device__ half operator+(const half& a, const half& b) { -#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000 - return __hadd(::__half(a), ::__half(b)); -#else - return __hadd(a, b); -#endif -} +EIGEN_STRONG_INLINE __device__ half operator+(const half& a, const half& b) { return __hadd(::__half(a), ::__half(b)); } EIGEN_STRONG_INLINE __device__ half operator*(const half& a, const half& b) { return __hmul(a, b); } EIGEN_STRONG_INLINE __device__ half operator-(const half& a, const half& b) { return __hsub(a, b); } -EIGEN_STRONG_INLINE __device__ half operator/(const half& a, const half& b) { -#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000 - return __hdiv(a, b); -#else - float num = __half2float(a); - float denom = __half2float(b); - return __float2half(num / denom); -#endif -} +EIGEN_STRONG_INLINE __device__ half operator/(const half& a, const half& b) { return __hdiv(a, b); } EIGEN_STRONG_INLINE __device__ half operator-(const half& a) { return __hneg(a); } EIGEN_STRONG_INLINE __device__ half& operator+=(half& a, const half& b) { a = a + b; @@ -505,7 +477,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator>=(const half& a, const half& // We need to provide emulated *host-side* FP16 operators for clang. #pragma push_macro("EIGEN_DEVICE_FUNC") #undef EIGEN_DEVICE_FUNC -#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_HAS_NATIVE_GPU_FP16) +#if defined(EIGEN_CUDACC) && defined(EIGEN_HAS_NATIVE_GPU_FP16) #define EIGEN_DEVICE_FUNC __host__ #else // both host and device need emulated ops. #define EIGEN_DEVICE_FUNC __host__ __device__ @@ -636,7 +608,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR __half_raw raw_uint // because this is constexpr function. 
// Fortunately, since we need to disable EIGEN_CONSTEXPR for GPU anyway, we can get out // of this catch22 by having separate bodies for GPU / non GPU -#if defined(EIGEN_HAS_GPU_FP16) +#if defined(EIGEN_GPUCC) __half_raw h; h.x = x; return h; @@ -661,8 +633,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC numext::uint16_t raw_half_as_uint16(const } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) { -#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ - (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) +#if defined(EIGEN_GPU_COMPILE_PHASE) __half tmp_ff = __float2half(ff); return *(__half_raw*)&tmp_ff; @@ -735,8 +706,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) { } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h) { -#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ - (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) +#if defined(EIGEN_GPU_COMPILE_PHASE) return __half2float(h); #elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) || defined(EIGEN_HAS_BUILTIN_FLOAT16) return static_cast(h.x); @@ -778,8 +748,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool(isinf)(const half& a) { #endif } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool(isnan)(const half& a) { -#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \ - (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) +#if defined(EIGEN_GPU_COMPILE_PHASE) return __hisnan(a); #elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) || defined(EIGEN_HAS_BUILTIN_FLOAT16) return (numext::bit_cast(a.x) & 0x7fff) > 0x7c00; @@ -810,16 +779,14 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half abs(const half& a) { #endif } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp(const half& a) { -#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \ - defined(EIGEN_HIP_DEVICE_COMPILE) +#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE) return half(hexp(a)); #else return half(::expf(float(a))); #endif } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp2(const half& a) { -#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \ - defined(EIGEN_HIP_DEVICE_COMPILE) +#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE) return half(hexp2(a)); #else return half(::exp2f(float(a))); @@ -827,9 +794,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp2(const half& a) { } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half expm1(const half& a) { return half(numext::expm1(float(a))); } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log(const half& a) { -#if (defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDA_SDK_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && \ - EIGEN_CUDA_ARCH >= 530) || \ - (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) +#if defined(EIGEN_GPU_COMPILE_PHASE) return half(hlog(a)); #else return half(::logf(float(a))); @@ -842,8 +807,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log2(const half& a) { } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sqrt(const half& a) { -#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \ - defined(EIGEN_HIP_DEVICE_COMPILE) +#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE) return half(hsqrt(a)); #else return half(::sqrtf(float(a))); @@ -864,16 +828,14 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half acos(const half& a) { return half(::a 
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half atan(const half& a) { return half(::atanf(float(a))); } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half atanh(const half& a) { return half(::atanhf(float(a))); } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) { -#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \ - defined(EIGEN_HIP_DEVICE_COMPILE) +#if (defined(EIGEN_CUDA_ARCH)) || defined(EIGEN_HIP_DEVICE_COMPILE) return half(hfloor(a)); #else return half(::floorf(float(a))); #endif } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) { -#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \ - defined(EIGEN_HIP_DEVICE_COMPILE) +#if (defined(EIGEN_CUDA_ARCH)) || defined(EIGEN_HIP_DEVICE_COMPILE) return half(hceil(a)); #else return half(::ceilf(float(a))); @@ -1007,20 +969,12 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half madd(const Eigen: } // namespace numext } // namespace Eigen -// Add the missing shfl* intrinsics. -// The __shfl* functions are only valid on HIP or _CUDA_ARCH_ >= 300. -// CUDA defines them for (__CUDA_ARCH__ >= 300 || !defined(__CUDA_ARCH__)) -// -// HIP and CUDA prior to SDK 9.0 define -// __shfl, __shfl_up, __shfl_down, __shfl_xor for int and float -// CUDA since 9.0 deprecates those and instead defines -// __shfl_sync, __shfl_up_sync, __shfl_down_sync, __shfl_xor_sync, -// with native support for __half and __nv_bfloat16 -// +// Warp shuffle overloads for Eigen::half. +// CUDA uses __shfl_*_sync (with mask); HIP uses __shfl_* (no mask). // Note that the following are __device__ - only functions. -#if (defined(EIGEN_CUDACC) && (!defined(EIGEN_CUDA_ARCH) || EIGEN_CUDA_ARCH >= 300)) || defined(EIGEN_HIPCC) +#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) -#if defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDA_SDK_VER >= 90000 +#if defined(EIGEN_CUDACC) __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_sync(unsigned mask, Eigen::half var, int srcLane, int width = warpSize) { @@ -1046,7 +1000,7 @@ __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor_sync(unsigned mask, Eigen: return static_cast(__shfl_xor_sync(mask, h, laneMask, width)); } -#else // HIP or CUDA SDK < 9.0 +#else // HIP __device__ EIGEN_STRONG_INLINE Eigen::half __shfl(Eigen::half var, int srcLane, int width = warpSize) { const int ivar = static_cast(Eigen::numext::bit_cast(var)); @@ -1072,7 +1026,7 @@ __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneM #endif // __shfl* // ldg() has an overload for __half_raw, but we also need one for Eigen::half. 
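The forwarding pattern behind that overload, as a standalone sketch (assumes a device pass; my_half is a hypothetical 16-bit wrapper, not Eigen's type):

struct my_half { unsigned short x; };

// Forward __ldg through the raw 16-bit payload: load through the read-only
// (texture) cache, then rebuild the wrapper from the loaded bits.
__device__ inline my_half ldg_my_half(const my_half* ptr) {
  my_half r;
  r.x = __ldg(reinterpret_cast<const unsigned short*>(ptr));
  return r;
}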
-#if (defined(EIGEN_CUDACC) && (!defined(EIGEN_CUDA_ARCH) || EIGEN_CUDA_ARCH >= 350)) || defined(EIGEN_HIPCC) +#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) EIGEN_STRONG_INLINE __device__ Eigen::half __ldg(const Eigen::half* ptr) { return Eigen::half_impl::raw_uint16_to_half(__ldg(reinterpret_cast(ptr))); } @@ -1095,8 +1049,7 @@ namespace internal { template <> struct cast_impl { EIGEN_DEVICE_FUNC static inline half run(const float& a) { -#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ - (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) +#if defined(EIGEN_GPU_COMPILE_PHASE) return __float2half(a); #else return half(a); @@ -1107,8 +1060,7 @@ struct cast_impl { template <> struct cast_impl { EIGEN_DEVICE_FUNC static inline half run(const int& a) { -#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ - (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) +#if defined(EIGEN_GPU_COMPILE_PHASE) return __float2half(static_cast(a)); #else return half(static_cast(a)); @@ -1119,8 +1071,7 @@ struct cast_impl { template <> struct cast_impl { EIGEN_DEVICE_FUNC static inline float run(const half& a) { -#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ - (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) +#if defined(EIGEN_GPU_COMPILE_PHASE) return __half2float(a); #else return static_cast(a); diff --git a/Eigen/src/Core/arch/GPU/PacketMath.h b/Eigen/src/Core/arch/GPU/PacketMath.h index edafb6610..9649843c5 100644 --- a/Eigen/src/Core/arch/GPU/PacketMath.h +++ b/Eigen/src/Core/arch/GPU/PacketMath.h @@ -17,19 +17,8 @@ namespace Eigen { namespace internal { -// Read-only data cached load available. -#if defined(EIGEN_HIP_DEVICE_COMPILE) || (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350) -#define EIGEN_GPU_HAS_LDG 1 -#endif - -// FP16 math available. -#if (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) -#define EIGEN_CUDA_HAS_FP16_ARITHMETIC 1 -#endif - -#if defined(EIGEN_HIP_DEVICE_COMPILE) || defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC) -#define EIGEN_GPU_HAS_FP16_ARITHMETIC 1 -#endif +// Read-only data cached load (__ldg) and native FP16 arithmetic are available +// on all supported GPU architectures (sm_70+ for CUDA, GFX906+ for HIP). // We need to distinguish ‘clang as the CUDA compiler’ from ‘clang as the host compiler, // invoked by NVCC’ (e.g. on MacOS). 
The former needs to see both host and device implementation @@ -56,92 +45,84 @@ struct is_arithmetic { template <> struct packet_traits : default_packet_traits { - typedef float4 type; - typedef float4 half; - enum { - Vectorizable = 1, - AlignedOnScalar = 1, - size = 4, + using type = float4; + using half = float4; + static constexpr int Vectorizable = 1; + static constexpr int AlignedOnScalar = 1; + static constexpr int size = 4; - HasDiv = 1, - HasSin = 0, - HasCos = 0, - HasLog = 1, - HasExp = 1, - HasSqrt = 1, - HasRsqrt = 1, - HasLGamma = 1, - HasDiGamma = 1, - HasZeta = 1, - HasPolygamma = 1, - HasErf = 1, - HasErfc = 1, - HasNdtri = 1, - HasBessel = 1, - HasIGamma = 1, - HasIGammaDerA = 1, - HasGammaSampleDerAlpha = 1, - HasIGammac = 1, - HasBetaInc = 1, + static constexpr int HasDiv = 1; + static constexpr int HasSin = 0; + static constexpr int HasCos = 0; + static constexpr int HasLog = 1; + static constexpr int HasExp = 1; + static constexpr int HasSqrt = 1; + static constexpr int HasRsqrt = 1; + static constexpr int HasLGamma = 1; + static constexpr int HasDiGamma = 1; + static constexpr int HasZeta = 1; + static constexpr int HasPolygamma = 1; + static constexpr int HasErf = 1; + static constexpr int HasErfc = 1; + static constexpr int HasNdtri = 1; + static constexpr int HasBessel = 1; + static constexpr int HasIGamma = 1; + static constexpr int HasIGammaDerA = 1; + static constexpr int HasGammaSampleDerAlpha = 1; + static constexpr int HasIGammac = 1; + static constexpr int HasBetaInc = 1; - HasFloor = 1, - HasCmp = EIGEN_HAS_GPU_DEVICE_FUNCTIONS - }; + static constexpr int HasFloor = 1; + static constexpr int HasCmp = EIGEN_HAS_GPU_DEVICE_FUNCTIONS; }; template <> struct packet_traits : default_packet_traits { - typedef double2 type; - typedef double2 half; - enum { - Vectorizable = 1, - AlignedOnScalar = 1, - size = 2, + using type = double2; + using half = double2; + static constexpr int Vectorizable = 1; + static constexpr int AlignedOnScalar = 1; + static constexpr int size = 2; - HasDiv = 1, - HasLog = 1, - HasExp = 1, - HasSqrt = 1, - HasRsqrt = 1, - HasLGamma = 1, - HasDiGamma = 1, - HasZeta = 1, - HasPolygamma = 1, - HasErf = 1, - HasErfc = 1, - HasNdtri = 1, - HasBessel = 1, - HasIGamma = 1, - HasIGammaDerA = 1, - HasGammaSampleDerAlpha = 1, - HasIGammac = 1, - HasBetaInc = 1, - }; + static constexpr int HasDiv = 1; + static constexpr int HasLog = 1; + static constexpr int HasExp = 1; + static constexpr int HasSqrt = 1; + static constexpr int HasRsqrt = 1; + static constexpr int HasLGamma = 1; + static constexpr int HasDiGamma = 1; + static constexpr int HasZeta = 1; + static constexpr int HasPolygamma = 1; + static constexpr int HasErf = 1; + static constexpr int HasErfc = 1; + static constexpr int HasNdtri = 1; + static constexpr int HasBessel = 1; + static constexpr int HasIGamma = 1; + static constexpr int HasIGammaDerA = 1; + static constexpr int HasGammaSampleDerAlpha = 1; + static constexpr int HasIGammac = 1; + static constexpr int HasBetaInc = 1; }; template <> struct unpacket_traits { - typedef float type; - enum { - size = 4, - alignment = Aligned16, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; - typedef float4 half; + using type = float; + static constexpr int size = 4; + static constexpr int alignment = Aligned16; + static constexpr bool vectorizable = true; + static constexpr bool masked_load_available = false; + static constexpr bool masked_store_available = false; + using half = float4; }; template <> 
struct unpacket_traits { - typedef double type; - enum { - size = 2, - alignment = Aligned16, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; - typedef double2 half; + using type = double; + static constexpr int size = 2; + static constexpr int alignment = Aligned16; + static constexpr bool vectorizable = true; + static constexpr bool masked_load_available = false; + static constexpr bool masked_store_available = false; + using half = double2; }; template <> @@ -403,7 +384,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(double* to, const dou template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro(const float* from) { -#if defined(EIGEN_GPU_HAS_LDG) +#if defined(EIGEN_GPU_COMPILE_PHASE) return __ldg(reinterpret_cast(from)); #else return make_float4(from[0], from[1], from[2], from[3]); @@ -411,7 +392,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro(const fl } template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro(const double* from) { -#if defined(EIGEN_GPU_HAS_LDG) +#if defined(EIGEN_GPU_COMPILE_PHASE) return __ldg(reinterpret_cast(from)); #else return make_double2(from[0], from[1]); @@ -420,7 +401,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro(const template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro(const float* from) { -#if defined(EIGEN_GPU_HAS_LDG) +#if defined(EIGEN_GPU_COMPILE_PHASE) return make_float4(__ldg(from + 0), __ldg(from + 1), __ldg(from + 2), __ldg(from + 3)); #else return make_float4(from[0], from[1], from[2], from[3]); @@ -428,7 +409,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro(const } template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro(const double* from) { -#if defined(EIGEN_GPU_HAS_LDG) +#if defined(EIGEN_GPU_COMPILE_PHASE) return make_double2(__ldg(from + 0), __ldg(from + 1)); #else return make_double2(from[0], from[1]); @@ -591,23 +572,20 @@ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { #endif // defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU) -// Half-packet functions are not available on the host for CUDA 9.0-9.2, only -// on device. There is no benefit to using them on the host anyways, since they are -// emulated. -#if (defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)) && defined(EIGEN_GPU_COMPILE_PHASE) +// Half-packet functions are only available in GPU device compilation — they use +// intrinsics (__half2, etc.) that have no host-side benefit. 
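A side-by-side reduction of the enum-to-static-constexpr conversion applied to the traits above (illustrative struct names; both forms are usable in constant expressions, but the new one keeps natural types):

struct traits_via_enum {       // old style: enumerators are untyped ints
  enum { size = 4, vectorizable = true };
};
struct traits_via_constexpr {  // new style: each value keeps its natural type
  static constexpr int size = 4;
  static constexpr bool vectorizable = true;
};
static_assert(traits_via_enum::size == traits_via_constexpr::size, "same value");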
+#if defined(EIGEN_GPU_COMPILE_PHASE) -typedef ulonglong2 Packet4h2; +using Packet4h2 = ulonglong2; template <> struct unpacket_traits { - typedef Eigen::half type; - enum { - size = 8, - alignment = Aligned16, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; - typedef Packet4h2 half; + using type = Eigen::half; + static constexpr int size = 8; + static constexpr int alignment = Aligned16; + static constexpr bool vectorizable = true; + static constexpr bool masked_load_available = false; + static constexpr bool masked_store_available = false; + using half = Packet4h2; }; template <> struct is_arithmetic { @@ -616,15 +594,13 @@ struct is_arithmetic { template <> struct unpacket_traits { - typedef Eigen::half type; - enum { - size = 2, - alignment = Aligned16, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; - typedef half2 half; + using type = Eigen::half; + static constexpr int size = 2; + static constexpr int alignment = Aligned16; + static constexpr bool vectorizable = true; + static constexpr bool masked_load_available = false; + static constexpr bool masked_store_available = false; + using half = half2; }; template <> struct is_arithmetic { @@ -633,23 +609,21 @@ struct is_arithmetic { template <> struct packet_traits : default_packet_traits { - typedef Packet4h2 type; - typedef Packet4h2 half; - enum { - Vectorizable = 1, - AlignedOnScalar = 1, - size = 8, - HasAdd = 1, - HasSub = 1, - HasMul = 1, - HasDiv = 1, - HasSqrt = 1, - HasRsqrt = 1, - HasExp = 1, - HasExpm1 = 1, - HasLog = 1, - HasLog1p = 1 - }; + using type = Packet4h2; + using half = Packet4h2; + static constexpr int Vectorizable = 1; + static constexpr int AlignedOnScalar = 1; + static constexpr int size = 8; + static constexpr int HasAdd = 1; + static constexpr int HasSub = 1; + static constexpr int HasMul = 1; + static constexpr int HasDiv = 1; + static constexpr int HasSqrt = 1; + static constexpr int HasRsqrt = 1; + static constexpr int HasExp = 1; + static constexpr int HasExpm1 = 1; + static constexpr int HasLog = 1; + static constexpr int HasLog1p = 1; }; template <> @@ -690,7 +664,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const half2& } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_aligned(const Eigen::half* from) { -#if defined(EIGEN_GPU_HAS_LDG) +#if defined(EIGEN_GPU_COMPILE_PHASE) // Input is guaranteed to be properly aligned. 
return __ldg(reinterpret_cast(from)); #else @@ -699,7 +673,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_aligned(const Eigen::half* } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_unaligned(const Eigen::half* from) { -#if defined(EIGEN_GPU_HAS_LDG) +#if defined(EIGEN_GPU_COMPILE_PHASE) return __halves2half2(__ldg(from + 0), __ldg(from + 1)); #else return __halves2half2(*(from + 0), *(from + 1)); @@ -745,12 +719,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& ker } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset(const Eigen::half& a) { -#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) return __halves2half2(a, __hadd(a, __float2half(1.0f))); -#else - float f = __half2float(a) + 1.0f; - return __halves2half2(a, __float2half(f)); -#endif } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pselect(const half2& mask, const half2& a, const half2& b) { @@ -837,89 +806,21 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pandnot(const half2& a, const half2& return __halves2half2(result1, result2); } -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd(const half2& a, const half2& b) { -#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) - return __hadd2(a, b); -#else - float a1 = __low2float(a); - float a2 = __high2float(a); - float b1 = __low2float(b); - float b2 = __high2float(b); - float r1 = a1 + b1; - float r2 = a2 + b2; - return __floats2half2_rn(r1, r2); -#endif -} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd(const half2& a, const half2& b) { return __hadd2(a, b); } -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub(const half2& a, const half2& b) { -#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) - return __hsub2(a, b); -#else - float a1 = __low2float(a); - float a2 = __high2float(a); - float b1 = __low2float(b); - float b2 = __high2float(b); - float r1 = a1 - b1; - float r2 = a2 - b2; - return __floats2half2_rn(r1, r2); -#endif -} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub(const half2& a, const half2& b) { return __hsub2(a, b); } -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(const half2& a) { -#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) - return __hneg2(a); -#else - float a1 = __low2float(a); - float a2 = __high2float(a); - return __floats2half2_rn(-a1, -a2); -#endif -} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(const half2& a) { return __hneg2(a); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; } -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul(const half2& a, const half2& b) { -#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) - return __hmul2(a, b); -#else - float a1 = __low2float(a); - float a2 = __high2float(a); - float b1 = __low2float(b); - float b2 = __high2float(b); - float r1 = a1 * b1; - float r2 = a2 * b2; - return __floats2half2_rn(r1, r2); -#endif -} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul(const half2& a, const half2& b) { return __hmul2(a, b); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd(const half2& a, const half2& b, const half2& c) { -#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) return __hfma2(a, b, c); -#else - float a1 = __low2float(a); - float a2 = __high2float(a); - float b1 = __low2float(b); - float b2 = __high2float(b); - float c1 = __low2float(c); - float c2 = __high2float(c); - float r1 = a1 * b1 + c1; - float r2 = a2 * b2 + c2; - return __floats2half2_rn(r1, r2); -#endif } -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv(const half2& a, const half2& b) { -#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) - return __h2div(a, b); -#else - float a1 = __low2float(a); - float a2 = 
__high2float(a); - float b1 = __low2float(b); - float b2 = __high2float(b); - float r1 = a1 / b1; - float r2 = a2 / b2; - return __floats2half2_rn(r1, r2); -#endif -} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv(const half2& a, const half2& b) { return __h2div(a, b); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin(const half2& a, const half2& b) { float a1 = __low2float(a); @@ -942,47 +843,23 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax(const half2& a, const half2& b) } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux(const half2& a) { -#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) return __hadd(__low2half(a), __high2half(a)); -#else - float a1 = __low2float(a); - float a2 = __high2float(a); - return Eigen::half(__float2half(a1 + a2)); -#endif } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max(const half2& a) { -#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) __half first = __low2half(a); __half second = __high2half(a); return __hgt(first, second) ? first : second; -#else - float a1 = __low2float(a); - float a2 = __high2float(a); - return a1 > a2 ? __low2half(a) : __high2half(a); -#endif } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min(const half2& a) { -#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) __half first = __low2half(a); __half second = __high2half(a); return __hlt(first, second) ? first : second; -#else - float a1 = __low2float(a); - float a2 = __high2float(a); - return a1 < a2 ? __low2half(a) : __high2half(a); -#endif } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_mul(const half2& a) { -#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) return __hmul(__low2half(a), __high2half(a)); -#else - float a1 = __low2float(a); - float a2 = __high2float(a); - return Eigen::half(__float2half(a1 * a2)); -#endif } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog1p(const half2& a) { @@ -1001,8 +878,6 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexpm1(const half2& a) { return __floats2half2_rn(r1, r2); } -#if (EIGEN_CUDA_SDK_VER >= 80000 && defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)) || defined(EIGEN_HIP_DEVICE_COMPILE) - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog(const half2& a) { return h2log(a); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexp(const half2& a) { return h2exp(a); } @@ -1010,41 +885,6 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexp(const half2& a) { return h2exp( EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psqrt(const half2& a) { return h2sqrt(a); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 prsqrt(const half2& a) { return h2rsqrt(a); } - -#else - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog(const half2& a) { - float a1 = __low2float(a); - float a2 = __high2float(a); - float r1 = logf(a1); - float r2 = logf(a2); - return __floats2half2_rn(r1, r2); -} - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexp(const half2& a) { - float a1 = __low2float(a); - float a2 = __high2float(a); - float r1 = expf(a1); - float r2 = expf(a2); - return __floats2half2_rn(r1, r2); -} - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psqrt(const half2& a) { - float a1 = __low2float(a); - float a2 = __high2float(a); - float r1 = sqrtf(a1); - float r2 = sqrtf(a2); - return __floats2half2_rn(r1, r2); -} - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 prsqrt(const half2& a) { - float a1 = __low2float(a); - float a2 = __high2float(a); - float r1 = rsqrtf(a1); - float r2 = rsqrtf(a2); - return __floats2half2_rn(r1, r2); -} -#endif } // namespace template <> @@ -1091,19 +931,17 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, 
template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4h2 ploadt_ro(const Eigen::half* from) { -#if defined(EIGEN_GPU_HAS_LDG) Packet4h2 r; +#if defined(EIGEN_GPU_COMPILE_PHASE) r = __ldg(reinterpret_cast(from)); - return r; #else - Packet4h2 r; half2* r_alias = reinterpret_cast(&r); r_alias[0] = ploadt_ro_aligned(from + 0); r_alias[1] = ploadt_ro_aligned(from + 2); r_alias[2] = ploadt_ro_aligned(from + 4); r_alias[3] = ploadt_ro_aligned(from + 6); - return r; #endif + return r; } template <> @@ -1272,7 +1110,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 plset(const Eigen::ha p_alias[2] = __halves2half2(__hadd(a, __float2half(4.0f)), __hadd(a, __float2half(5.0f))); p_alias[3] = __halves2half2(__hadd(a, __float2half(6.0f)), __hadd(a, __float2half(7.0f))); return r; -#elif defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC) +#elif defined(EIGEN_CUDA_ARCH) Packet4h2 r; half2* r_alias = reinterpret_cast(&r); @@ -1290,16 +1128,6 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 plset(const Eigen::ha r_alias[3] = plset(__high2half(c)); return r; - -#else - float f = __half2float(a); - Packet4h2 r; - half2* p_alias = reinterpret_cast(&r); - p_alias[0] = __halves2half2(a, __float2half(f + 1.0f)); - p_alias[1] = __halves2half2(__float2half(f + 2.0f), __float2half(f + 3.0f)); - p_alias[2] = __halves2half2(__float2half(f + 4.0f), __float2half(f + 5.0f)); - p_alias[3] = __halves2half2(__float2half(f + 6.0f), __float2half(f + 7.0f)); - return r; #endif } @@ -1533,7 +1361,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max(const Pa half2 m1 = __halves2half2(predux_max(a_alias[2]), predux_max(a_alias[3])); __half first = predux_max(m0); __half second = predux_max(m1); -#if defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC) +#if defined(EIGEN_CUDA_ARCH) return (__hgt(first, second) ? first : second); #else float ffirst = __half2float(first); @@ -1549,7 +1377,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min(const Pa half2 m1 = __halves2half2(predux_min(a_alias[2]), predux_min(a_alias[3])); __half first = predux_min(m0); __half second = predux_min(m1); -#if defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC) +#if defined(EIGEN_CUDA_ARCH) return (__hlt(first, second) ? first : second); #else float ffirst = __half2float(first); @@ -1641,47 +1469,17 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 prsqrt(const Packet4h // the implementation of GPU half reduction. 
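For reference, the aliasing idiom these Packet4h2 routines share, in isolation (a sketch assuming Eigen's internal namespace, where plog1p(half2) is defined above):

// Packet4h2 is a ulonglong2 viewed as four half2 lanes; routines loop over
// the lanes through a reinterpret_cast alias.
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 plog1p_lanes(const Packet4h2& a) {
  Packet4h2 r;
  half2* r_alias = reinterpret_cast<half2*>(&r);
  const half2* a_alias = reinterpret_cast<const half2*>(&a);
  for (int i = 0; i < 4; ++i) r_alias[i] = plog1p(a_alias[i]);
  return r;
}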
template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd(const half2& a, const half2& b) { -#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) return __hadd2(a, b); -#else - float a1 = __low2float(a); - float a2 = __high2float(a); - float b1 = __low2float(b); - float b2 = __high2float(b); - float r1 = a1 + b1; - float r2 = a2 + b2; - return __floats2half2_rn(r1, r2); -#endif } template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul(const half2& a, const half2& b) { -#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) return __hmul2(a, b); -#else - float a1 = __low2float(a); - float a2 = __high2float(a); - float b1 = __low2float(b); - float b2 = __high2float(b); - float r1 = a1 * b1; - float r2 = a2 * b2; - return __floats2half2_rn(r1, r2); -#endif } template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv(const half2& a, const half2& b) { -#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) return __h2div(a, b); -#else - float a1 = __low2float(a); - float a2 = __high2float(a); - float b1 = __low2float(b); - float b2 = __high2float(b); - float r1 = a1 / b1; - float r2 = a2 / b2; - return __floats2half2_rn(r1, r2); -#endif } template <> @@ -1706,11 +1504,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax(const half2& a, const ha return __halves2half2(r1, r2); } -#endif // (defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)) && defined(EIGEN_GPU_COMPILE_PHASE) - -#undef EIGEN_GPU_HAS_LDG -#undef EIGEN_CUDA_HAS_FP16_ARITHMETIC -#undef EIGEN_GPU_HAS_FP16_ARITHMETIC +#endif // defined(EIGEN_GPU_COMPILE_PHASE) } // end namespace internal diff --git a/Eigen/src/Core/arch/GPU/TypeCasting.h b/Eigen/src/Core/arch/GPU/TypeCasting.h index ae43f8eaf..27cbbbe0e 100644 --- a/Eigen/src/Core/arch/GPU/TypeCasting.h +++ b/Eigen/src/Core/arch/GPU/TypeCasting.h @@ -17,8 +17,7 @@ namespace Eigen { namespace internal { -#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ - (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) +#if defined(EIGEN_GPU_COMPILE_PHASE) template <> struct type_casting_traits { diff --git a/Eigen/src/Core/util/ConfigureVectorization.h b/Eigen/src/Core/util/ConfigureVectorization.h index 1cffb1d56..4f31ae544 100644 --- a/Eigen/src/Core/util/ConfigureVectorization.h +++ b/Eigen/src/Core/util/ConfigureVectorization.h @@ -541,12 +541,6 @@ extern "C" { #if defined EIGEN_CUDACC #define EIGEN_VECTORIZE_GPU #include -#if EIGEN_CUDA_SDK_VER >= 70500 -#define EIGEN_HAS_CUDA_FP16 -#endif -#endif - -#if defined(EIGEN_HAS_CUDA_FP16) #include #include #endif @@ -554,7 +548,6 @@ extern "C" { #if defined(EIGEN_HIPCC) #define EIGEN_VECTORIZE_GPU #include -#define EIGEN_HAS_HIP_FP16 #include #define EIGEN_HAS_HIP_BF16 #include diff --git a/Eigen/src/Core/util/DisableStupidWarnings.h b/Eigen/src/Core/util/DisableStupidWarnings.h index ab0c542d0..b2bc18a3d 100644 --- a/Eigen/src/Core/util/DisableStupidWarnings.h +++ b/Eigen/src/Core/util/DisableStupidWarnings.h @@ -84,8 +84,7 @@ #endif #if defined __NVCC__ && defined __CUDACC__ -// MSVC 14.16 (required by CUDA 9.*) does not support the _Pragma keyword, so -// we instead use Microsoft's __pragma extension. +// MSVC does not support the _Pragma keyword, so we use Microsoft's __pragma extension. 
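For reference, the portable pragma-from-macro pattern this comment describes, as a self-contained sketch (hypothetical macro name; the usage line is illustrative):

#if defined(_MSC_VER)
#define MY_MAKE_PRAGMA(X) __pragma(#X)  // Microsoft extension
#else
#define MY_MAKE_PRAGMA(X) _Pragma(#X)   // standard since C99/C++11
#endif

// e.g. suppressing an nvcc diagnostic from inside another macro:
// MY_MAKE_PRAGMA(nv_diag_suppress code_is_unreachable)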
#if defined _MSC_VER #define EIGEN_MAKE_PRAGMA(X) __pragma(#X) #else diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index 34f68729b..109a3b3cf 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -148,13 +148,8 @@ #endif #if defined(__NVCC__) -#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9) +// CUDA 11.4+ always defines __CUDACC_VER_MAJOR__. #define EIGEN_COMP_NVCC ((__CUDACC_VER_MAJOR__ * 10000) + (__CUDACC_VER_MINOR__ * 100)) -#elif defined(__CUDACC_VER__) -#define EIGEN_COMP_NVCC __CUDACC_VER__ -#else -#error "NVCC did not define compiler version." -#endif #else #define EIGEN_COMP_NVCC 0 #endif @@ -575,6 +570,10 @@ #define EIGEN_CUDA_SDK_VER 0 #endif +#if defined(EIGEN_CUDACC) && EIGEN_CUDA_SDK_VER > 0 && EIGEN_CUDA_SDK_VER < 110400 +#error "Eigen requires CUDA 11.4 or later." +#endif + #if defined(__HIPCC__) && !defined(EIGEN_NO_HIP) && !defined(__SYCL_DEVICE_ONLY__) // Means the compiler is HIPCC (analogous to EIGEN_CUDACC, but for HIP) #define EIGEN_HIPCC __HIPCC__ @@ -584,22 +583,20 @@ // ++ host_defines.h which contains the defines for the __host__ and __device__ macros #include +// Eigen requires ROCm/HIP >= 5.6 (GFX906 minimum architecture). +// This floor exists to allow simplifying shared CUDA/HIP preprocessor guards — +// all __HIP_ARCH_HAS_WARP_SHUFFLE__, __HIP_ARCH_HAS_FP16__, etc. are always true on GFX906+. +#if defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR < 5 || (HIP_VERSION_MAJOR == 5 && HIP_VERSION_MINOR < 6)) +#error "Eigen requires ROCm/HIP >= 5.6." +#endif + #if defined(__HIP_DEVICE_COMPILE__) && !defined(__SYCL_DEVICE_ONLY__) // analogous to EIGEN_CUDA_ARCH, but for HIP #define EIGEN_HIP_DEVICE_COMPILE __HIP_DEVICE_COMPILE__ #endif -// For HIP (ROCm 3.5 and higher), we need to explicitly set the launch_bounds attribute -// value to 1024. The compiler assigns a default value of 256 when the attribute is not -// specified. This results in failures on the HIP platform, for cases when a GPU kernel -// without an explicit launch_bounds attribute is called with a threads_per_block value -// greater than 256. -// -// This is a regression in functionality and is expected to be fixed within the next -// couple of ROCm releases (compiler will go back to using 1024 value as the default) -// -// In the meantime, we will use a "only enabled for HIP" macro to set the launch_bounds -// attribute. +// HIP compilers default to launch_bounds(256), which causes failures when kernels +// are called with more than 256 threads per block. Explicitly set to 1024 for HIP. #define EIGEN_HIP_LAUNCH_BOUNDS_1024 __launch_bounds__(1024) diff --git a/ci/build.linux.gitlab-ci.yml b/ci/build.linux.gitlab-ci.yml index 4e6928c07..9a67c476d 100644 --- a/ci/build.linux.gitlab-ci.yml +++ b/ci/build.linux.gitlab-ci.yml @@ -197,7 +197,7 @@ build:linux:x86-64:nvhpc-26.1:default:unsupported: # Additional flags passed to the cuda compiler. EIGEN_CI_CUDA_CXX_FLAGS: "" # Compute architectures present in the GitLab CI runners. 
- EIGEN_CI_CUDA_COMPUTE_ARCH: "50;75" + EIGEN_CI_CUDA_COMPUTE_ARCH: "70;75" EIGEN_CI_BUILD_TARGET: buildtests_gpu EIGEN_CI_TEST_CUDA_CLANG: "off" EIGEN_CI_TEST_CUDA_NVC: "off" @@ -234,7 +234,7 @@ build:linux:cuda-12.2:clang-12: # ROCm HIP build:linux:rocm-latest:gcc-10: extends: .build:linux:cross - image: rocm/dev-ubuntu-24.04:latest + image: rocm/dev-ubuntu-24.04:6.3.1 variables: EIGEN_CI_C_COMPILER: gcc-10 EIGEN_CI_CXX_COMPILER: g++-10 diff --git a/ci/build.windows.gitlab-ci.yml b/ci/build.windows.gitlab-ci.yml index ff9c4036e..69ced1e56 100644 --- a/ci/build.windows.gitlab-ci.yml +++ b/ci/build.windows.gitlab-ci.yml @@ -55,7 +55,7 @@ build:windows:x86-64:msvc-14.29:avx512dq: extends: .build:windows variables: # Compute architectures present in the GitLab CI runners. - EIGEN_CI_CUDA_COMPUTE_ARCH: "50;75" + EIGEN_CI_CUDA_COMPUTE_ARCH: "70;75" EIGEN_CI_BUILD_TARGET: buildtests_gpu EIGEN_CI_ADDITIONAL_ARGS: -DEIGEN_TEST_CUDA=on @@ -66,8 +66,8 @@ build:windows:x86-64:msvc-14.29:avx512dq: - x86-64 - cuda -# MSVC 14.29 + CUDA 11.4 -build:windows:x86-64:cuda-11.4:msvc-14.29: +# MSVC 14.29 + CUDA 12.2 +build:windows:x86-64:cuda-12.2:msvc-14.29: extends: .build:windows:cuda variables: - EIGEN_CI_BEFORE_SCRIPT: $$env:CUDA_PATH=$$env:CUDA_PATH_V11_4 \ No newline at end of file + EIGEN_CI_BEFORE_SCRIPT: $$env:CUDA_PATH=$$env:CUDA_PATH_V12_2 diff --git a/ci/test.windows.gitlab-ci.yml b/ci/test.windows.gitlab-ci.yml index 34b1d7424..e3c666ce5 100644 --- a/ci/test.windows.gitlab-ci.yml +++ b/ci/test.windows.gitlab-ci.yml @@ -71,7 +71,7 @@ test:windows:x86-64:msvc-14.29:avx512dq:unsupported: - x86-64 - cuda -# MSVC 14.29 + CUDA 11.4 -test:windows:x86-64:cuda-11.4:msvc-14.29: +# MSVC 14.29 + CUDA 12.2 +test:windows:x86-64:cuda-12.2:msvc-14.29: extends: .test:windows:cuda - needs: [ build:windows:x86-64:cuda-11.4:msvc-14.29 ] \ No newline at end of file + needs: [ build:windows:x86-64:cuda-12.2:msvc-14.29 ] diff --git a/cmake/EigenConfigureTesting.cmake b/cmake/EigenConfigureTesting.cmake index d72d88a02..1103ba217 100644 --- a/cmake/EigenConfigureTesting.cmake +++ b/cmake/EigenConfigureTesting.cmake @@ -20,7 +20,8 @@ add_dependencies(check buildtests) # Convenience target for only building GPU tests. add_custom_target(buildtests_gpu) -add_custom_target(check_gpu COMMAND "ctest" "--output-on-failure" +add_custom_target(check_gpu COMMAND "ctest" ${EIGEN_CTEST_ARGS} + "--output-on-failure" "--no-compress-output" "--build-no-clean" "-T" "test" @@ -71,4 +72,3 @@ elseif(MSVC) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /D_CRT_SECURE_NO_WARNINGS /D_SCL_SECURE_NO_WARNINGS") endif() - diff --git a/cmake/EigenTesting.cmake b/cmake/EigenTesting.cmake index e5aefa3c5..910214873 100644 --- a/cmake/EigenTesting.cmake +++ b/cmake/EigenTesting.cmake @@ -8,6 +8,12 @@ macro(ei_add_property prop value) endif() endmacro() +if(EIGEN_TEST_HIP AND NOT DEFINED EIGEN_HIP_ARCHITECTURES) + set(EIGEN_HIP_ARCHITECTURES + gfx900;gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1150;gfx1151 + CACHE STRING "HIP GPU architectures to build Eigen's HIP tests for.") +endif() + #internal. See documentation of ei_add_test for details. 
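The SKIP_RETURN_CODE property added below pairs with the runtime device check added in test/main.h; a minimal C++ illustration of the protocol (hypothetical standalone test; 77 is the conventional skip exit code that CTest is told to treat as skipped):

#include <cstdio>

int main() {
  int device_count = 0;  // in the real tests this comes from cuda/hipGetDeviceCount
  if (device_count <= 0) {
    std::printf("SKIP: no visible GPU device\n");
    return 77;  // matched by SKIP_RETURN_CODE, so CTest reports "skipped", not "failed"
  }
  return 0;  // run the actual GPU tests here
}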
macro(ei_add_test_internal testname testname_with_suffix) set(targetname ${testname_with_suffix}) @@ -30,7 +36,7 @@ macro(ei_add_test_internal testname testname_with_suffix) hip_reset_flags() hip_add_executable(${targetname} ${filename} HIPCC_OPTIONS -std=c++14) target_compile_definitions(${targetname} PRIVATE -DEIGEN_USE_HIP) - set_property(TARGET ${targetname} PROPERTY HIP_ARCHITECTURES gfx900 gfx906 gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030) + set_property(TARGET ${targetname} PROPERTY HIP_ARCHITECTURES "${EIGEN_HIP_ARCHITECTURES}") elseif(EIGEN_TEST_CUDA_CLANG) set_source_files_properties(${filename} PROPERTIES LANGUAGE CXX) @@ -134,6 +140,7 @@ macro(ei_add_test_internal testname testname_with_suffix) if (is_gpu_test) # Add gpu tag for testing only GPU tests. set_property(TEST ${testname_with_suffix} APPEND PROPERTY LABELS "gpu") + set_property(TEST ${testname_with_suffix} PROPERTY SKIP_RETURN_CODE 77) endif() if(EIGEN_SYCL) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 87648b9f7..e50f24f44 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -433,7 +433,7 @@ if(EIGEN_TEST_CUDA_NVC AND NOT CMAKE_CXX_COMPILER_ID MATCHES "NVHPC") message(WARNING "EIGEN_TEST_CUDA_NVC is set, but CMAKE_CXX_COMPILER does not appear to be nvc++.") endif() -find_package(CUDA 9.0) +find_package(CUDA 11.4) if(CUDA_FOUND AND EIGEN_TEST_CUDA) # Make sure to compile without the -pedantic, -Wundef, -Wnon-virtual-dtor # and -fno-check-new flags since they trigger thousands of compilation warnings @@ -502,6 +502,9 @@ if (EIGEN_TEST_HIP) endif() find_package(HIP REQUIRED) + if (HIP_FOUND AND HIP_VERSION VERSION_LESS "5.6") + message(FATAL_ERROR "Eigen requires ROCm/HIP >= 5.6, found ${HIP_VERSION}") + endif() if (HIP_FOUND) execute_process(COMMAND ${HIP_PATH}/bin/hipconfig --platform OUTPUT_VARIABLE HIP_PLATFORM) diff --git a/test/gpu_basic.cu b/test/gpu_basic.cu index aa6c1bb4b..c3a188e94 100644 --- a/test/gpu_basic.cu +++ b/test/gpu_basic.cu @@ -7,12 +7,6 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. -// workaround issue between gcc >= 4.7 and cuda 5.5 -#if (defined __GNUC__) && (__GNUC__ > 4 || __GNUC_MINOR__ >= 7) -#undef _GLIBCXX_ATOMIC_BUILTINS -#undef _GLIBCXX_USE_INT128 -#endif - #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int diff --git a/test/gpu_test_helper.h b/test/gpu_test_helper.h index 3b2ec9c0d..80960eb31 100644 --- a/test/gpu_test_helper.h +++ b/test/gpu_test_helper.h @@ -6,10 +6,8 @@ // Allow gpu** macros for generic tests. #include -// std::tuple cannot be used on device, and there is a bug in cuda < 9.2 that -// doesn't allow std::tuple to compile for host code either. In these cases, -// use our custom implementation. -#if defined(EIGEN_GPU_COMPILE_PHASE) || (defined(EIGEN_CUDACC) && EIGEN_CUDA_SDK_VER < 92000) +// std::tuple cannot be used on device, so use our custom implementation there. +#if defined(EIGEN_GPU_COMPILE_PHASE) #define EIGEN_USE_CUSTOM_TUPLE 1 #else #define EIGEN_USE_CUSTOM_TUPLE 0 @@ -42,6 +40,12 @@ using tuple_impl::tuple; #undef EIGEN_USE_CUSTOM_TUPLE } // namespace test_detail +template +using decay_t = typename std::decay::type; + +template +using kernel_result_t = decltype(std::declval()(std::declval()...)); + template struct extract_output_indices_helper; @@ -90,14 +94,15 @@ struct void_helper { // Non-void return value. template static EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC auto call(Func&& func, Args&&... 
args) - -> std::enable_if_t::value, decltype(func(args...))> { + -> std::enable_if_t, void>::value, + kernel_result_t> { return func(std::forward(args)...); } // Void return value. template static EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC auto call(Func&& func, Args&&... args) - -> std::enable_if_t::value, Void> { + -> std::enable_if_t, void>::value, Void> { func(std::forward(args)...); return Void{}; } @@ -135,18 +140,18 @@ EIGEN_DEVICE_FUNC void run_serialized(std::index_sequence, std::inde const uint8_t* read_end = buffer + capacity; read_ptr = Eigen::deserialize(read_ptr, read_end, input_size); // Create value-type instances to populate. - auto args = make_tuple(typename std::decay::type{}...); + auto args = make_tuple(decay_t{}...); EIGEN_UNUSED_VARIABLE(args); // Avoid NVCC compile warning. // NVCC 9.1 requires us to spell out the template parameters explicitly. - read_ptr = Eigen::deserialize(read_ptr, read_end, get::type...>(args)...); + read_ptr = Eigen::deserialize(read_ptr, read_end, get...>(args)...); // Call function, with void->Void conversion so we are guaranteed a complete // output type. - auto result = void_helper::call(kernel, get::type...>(args)...); + auto result = void_helper::call(kernel, get...>(args)...); // Determine required output size. size_t output_size = Eigen::serialize_size(capacity); - output_size += Eigen::serialize_size(get::type...>(args)...); + output_size += Eigen::serialize_size(get...>(args)...); output_size += Eigen::serialize_size(result); // Always serialize required buffer size. @@ -157,7 +162,7 @@ EIGEN_DEVICE_FUNC void run_serialized(std::index_sequence, std::inde // Serialize outputs if they fit in the buffer. if (output_size <= capacity) { // Collect outputs and result. - write_ptr = Eigen::serialize(write_ptr, write_end, get::type...>(args)...); + write_ptr = Eigen::serialize(write_ptr, write_end, get...>(args)...); write_ptr = Eigen::serialize(write_ptr, write_end, result); } } @@ -282,7 +287,7 @@ auto run_serialized_on_gpu(size_t buffer_capacity_hint, std::index_sequence -auto run_on_cpu(Kernel kernel, Args&&... args) -> decltype(kernel(args...)) { +auto run_on_cpu(Kernel kernel, Args&&... args) -> internal::kernel_result_t { return kernel(std::forward(args)...); } @@ -301,7 +306,7 @@ auto run_on_cpu(Kernel kernel, Args&&... args) -> decltype(kernel(args...)) { * \return kernel(args...). */ template -auto run_on_gpu(Kernel kernel, Args&&... args) -> decltype(kernel(args...)) { +auto run_on_gpu(Kernel kernel, Args&&... args) -> internal::kernel_result_t { return internal::run_serialized_on_gpu( /*buffer_capacity_hint=*/0, std::make_index_sequence{}, internal::extract_output_indices{}, kernel, std::forward(args)...); @@ -322,7 +327,8 @@ auto run_on_gpu(Kernel kernel, Args&&... args) -> decltype(kernel(args...)) { * \sa run_on_gpu */ template -auto run_on_gpu_with_hint(size_t buffer_capacity_hint, Kernel kernel, Args&&... args) -> decltype(kernel(args...)) { +auto run_on_gpu_with_hint(size_t buffer_capacity_hint, Kernel kernel, Args&&... args) + -> internal::kernel_result_t { return internal::run_serialized_on_gpu( buffer_capacity_hint, std::make_index_sequence{}, internal::extract_output_indices{}, kernel, std::forward(args)...); @@ -409,7 +415,7 @@ void print_gpu_device_info() { * \return kernel(args...). */ template -auto run(Kernel kernel, Args&&... args) -> decltype(kernel(args...)) { +auto run(Kernel kernel, Args&&... 
args) -> internal::kernel_result_t { #ifdef EIGEN_GPUCC return run_on_gpu(kernel, std::forward(args)...); #else @@ -432,7 +438,8 @@ auto run(Kernel kernel, Args&&... args) -> decltype(kernel(args...)) { * \sa run */ template -auto run_with_hint(size_t buffer_capacity_hint, Kernel kernel, Args&&... args) -> decltype(kernel(args...)) { +auto run_with_hint(size_t buffer_capacity_hint, Kernel kernel, Args&&... args) + -> internal::kernel_result_t { #ifdef EIGEN_GPUCC return run_on_gpu_with_hint(buffer_capacity_hint, kernel, std::forward(args)...); #else diff --git a/test/main.h b/test/main.h index 6cedca562..b429572b7 100644 --- a/test/main.h +++ b/test/main.h @@ -76,10 +76,8 @@ #include #include #include -#if CUDA_VERSION >= 7050 #include #endif -#endif #if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) #define EIGEN_TEST_NO_LONGDOUBLE @@ -949,6 +947,37 @@ inline void set_seed_from_time() { g_seed = static_cast(ns); } +#if defined(EIGEN_USE_GPU) +inline int maybe_skip_gpu_tests() { +#if defined(EIGEN_USE_HIP) + int device_count = 0; + hipError_t status = hipGetDeviceCount(&device_count); + if (status != hipSuccess) { + std::cout << "SKIP: HIP GPU tests require a visible ROCm device. hipGetDeviceCount failed with: " + << hipGetErrorString(status) << std::endl; + return 77; + } + if (device_count <= 0) { + std::cout << "SKIP: HIP GPU tests require a visible ROCm device." << std::endl; + return 77; + } +#elif defined(EIGEN_CUDACC) + int device_count = 0; + cudaError_t status = cudaGetDeviceCount(&device_count); + if (status != cudaSuccess) { + std::cout << "SKIP: CUDA GPU tests require a visible CUDA device. cudaGetDeviceCount failed with: " + << cudaGetErrorString(status) << std::endl; + return 77; + } + if (device_count <= 0) { + std::cout << "SKIP: CUDA GPU tests require a visible CUDA device." << std::endl; + return 77; + } +#endif + return 0; +} +#endif + int main(int argc, char* argv[]) { g_has_set_repeat = false; g_has_set_seed = false; @@ -997,6 +1026,13 @@ int main(int argc, char* argv[]) { srand(g_seed); std::cout << "Repeating each test " << g_repeat << " times" << std::endl; +#if defined(EIGEN_USE_GPU) + { + const int skip_code = maybe_skip_gpu_tests(); + if (skip_code != 0) return skip_code; + } +#endif + VERIFY(EigenTest::all().size() > 0); for (std::size_t i = 0; i < EigenTest::all().size(); ++i) { diff --git a/unsupported/Eigen/src/Tensor/TensorContractionGpu.h b/unsupported/Eigen/src/Tensor/TensorContractionGpu.h index 87bf008d0..79ad6c29c 100644 --- a/unsupported/Eigen/src/Tensor/TensorContractionGpu.h +++ b/unsupported/Eigen/src/Tensor/TensorContractionGpu.h @@ -393,7 +393,8 @@ __device__ EIGEN_STRONG_INLINE void EigenContractionKernelInternal(const LhsMapp // the sum across all big k blocks of the product of little k block of index (x, y) // with block of index (y, z). To compute the final output, we need to reduce // the 8 threads over y by summation. -#if defined(EIGEN_HIPCC) || (defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000) + // HIP uses non-sync warp shuffles; CUDA requires the _sync variants. 
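A standalone sketch of the butterfly reduction that the shuffleInc macro below expands into (assumes a float accumulator, Eigen's EIGEN_HIPCC macro, and a full-warp mask on CUDA):

__device__ float warp_sum_xor(float v) {
#if defined(EIGEN_HIPCC)
  // HIP: non-sync shuffle; warpSize is 64 on most AMD GPUs.
  for (int mask = 1; mask < warpSize; mask <<= 1) v += __shfl_xor(v, mask);
#else
  // CUDA: all lanes participate, so pass the full 32-lane mask.
  for (int mask = 1; mask < warpSize; mask <<= 1) v += __shfl_xor_sync(0xFFFFFFFF, v, mask);
#endif
  return v;  // every lane now holds the warp-wide sum
}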
+#if defined(EIGEN_HIPCC) #define shuffleInc(i, j, mask) res(i, j) += __shfl_xor(res(i, j), mask) #else #define shuffleInc(i, j, mask) res(i, j) += __shfl_xor_sync(0xFFFFFFFF, res(i, j), mask) @@ -622,7 +623,7 @@ __device__ __forceinline__ void EigenFloatContractionKernelInternal16x16(const L x1 = rhs_pf0.x; x2 = rhs_pf0.z; } -#if defined(EIGEN_HIPCC) || (defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000) +#if defined(EIGEN_HIPCC) x1 = __shfl_xor(x1, 4); x2 = __shfl_xor(x2, 4); #else @@ -1377,13 +1378,6 @@ struct TensorEvaluatorm_right_contracting_strides, this->m_k_strides); OutputMapper output(buffer, m); - -#if defined(EIGEN_USE_HIP) - setGpuSharedMemConfig(hipSharedMemBankSizeEightByte); -#else - setGpuSharedMemConfig(cudaSharedMemBankSizeEightByte); -#endif - LaunchKernels::Run(lhs, rhs, output, m, n, k, this->m_device); } diff --git a/unsupported/Eigen/src/Tensor/TensorConvolution.h b/unsupported/Eigen/src/Tensor/TensorConvolution.h index 021f7cd47..0d0ec2564 100644 --- a/unsupported/Eigen/src/Tensor/TensorConvolution.h +++ b/unsupported/Eigen/src/Tensor/TensorConvolution.h @@ -89,7 +89,7 @@ class IndexMapper { } } else { for (int i = NumDims - 1; i >= 0; --i) { - if (static_cast(i + 1) < offset) { + if (i + 1 < static_cast(offset)) { m_gpuInputStrides[i] = m_gpuInputStrides[i + 1] * gpuInputDimensions[i + 1]; m_gpuOutputStrides[i] = m_gpuOutputStrides[i + 1] * gpuOutputDimensions[i + 1]; } else { diff --git a/unsupported/Eigen/src/Tensor/TensorDeviceGpu.h b/unsupported/Eigen/src/Tensor/TensorDeviceGpu.h index d2e0d08bb..e0f74a6c5 100644 --- a/unsupported/Eigen/src/Tensor/TensorDeviceGpu.h +++ b/unsupported/Eigen/src/Tensor/TensorDeviceGpu.h @@ -342,19 +342,6 @@ struct GpuDevice { #endif -// FIXME: Should be device and kernel specific. 
diff --git a/unsupported/Eigen/src/Tensor/TensorConvolution.h b/unsupported/Eigen/src/Tensor/TensorConvolution.h
index 021f7cd47..0d0ec2564 100644
--- a/unsupported/Eigen/src/Tensor/TensorConvolution.h
+++ b/unsupported/Eigen/src/Tensor/TensorConvolution.h
@@ -89,7 +89,7 @@ class IndexMapper {
       }
     } else {
       for (int i = NumDims - 1; i >= 0; --i) {
-        if (static_cast<size_t>(i + 1) < offset) {
+        if (i + 1 < static_cast<int>(offset)) {
           m_gpuInputStrides[i] = m_gpuInputStrides[i + 1] * gpuInputDimensions[i + 1];
           m_gpuOutputStrides[i] = m_gpuOutputStrides[i + 1] * gpuOutputDimensions[i + 1];
         } else {
diff --git a/unsupported/Eigen/src/Tensor/TensorDeviceGpu.h b/unsupported/Eigen/src/Tensor/TensorDeviceGpu.h
index d2e0d08bb..e0f74a6c5 100644
--- a/unsupported/Eigen/src/Tensor/TensorDeviceGpu.h
+++ b/unsupported/Eigen/src/Tensor/TensorDeviceGpu.h
@@ -342,19 +342,6 @@ struct GpuDevice {
 #endif
 
-// FIXME: Should be device and kernel specific.
-#ifdef EIGEN_GPUCC
-static EIGEN_DEVICE_FUNC inline void setGpuSharedMemConfig(gpuSharedMemConfig config) {
-#ifndef EIGEN_GPU_COMPILE_PHASE
-  gpuError_t status = gpuDeviceSetSharedMemConfig(config);
-  EIGEN_UNUSED_VARIABLE(status);
-  gpu_assert(status == gpuSuccess);
-#else
-  EIGEN_UNUSED_VARIABLE(config);
-#endif
-}
-#endif
-
 }  // end namespace Eigen
 
 // undefine all the gpu* macros we defined at the beginning of the file
diff --git a/unsupported/Eigen/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/src/Tensor/TensorEvaluator.h
index a09ce7abe..03c957c92 100644
--- a/unsupported/Eigen/src/Tensor/TensorEvaluator.h
+++ b/unsupported/Eigen/src/Tensor/TensorEvaluator.h
@@ -175,7 +175,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T loadConstant(const T* address) {
   return *address;
 }
 // Use the texture cache on CUDA devices whenever possible
-#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
+#if defined(EIGEN_CUDA_ARCH)
 template <>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float loadConstant(const float* address) {
   return __ldg(address);
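
Dropping the EIGEN_CUDA_ARCH >= 350 condition on loadConstant is safe because __ldg first appeared on sm_35, the oldest architecture a CUDA 11.4 toolchain still targets. A minimal sketch of the same read-only-cache load with a fallback for non-CUDA device phases:

    // Sketch: route loads through the read-only data cache where available.
    __device__ __forceinline__ float load_readonly(const float* p) {
    #if defined(__CUDA_ARCH__)
      return __ldg(p);  // sm_35+: read-only (texture) cache path
    #else
      return *p;        // non-CUDA device compile (e.g. HIP): plain load
    #endif
    }
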
diff --git a/unsupported/Eigen/src/Tensor/TensorMeta.h b/unsupported/Eigen/src/Tensor/TensorMeta.h
index 5762b8547..5dd0145ca 100644
--- a/unsupported/Eigen/src/Tensor/TensorMeta.h
+++ b/unsupported/Eigen/src/Tensor/TensorMeta.h
@@ -49,7 +49,7 @@ struct PacketType : internal::packet_traits<T> {
 };
 
 // For CUDA packet types when using a GpuDevice
-#if defined(EIGEN_USE_GPU) && defined(EIGEN_HAS_GPU_FP16) && defined(EIGEN_GPU_COMPILE_PHASE)
+#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPU_COMPILE_PHASE)
 typedef ulonglong2 Packet4h2;
 template <>
diff --git a/unsupported/Eigen/src/Tensor/TensorReduction.h b/unsupported/Eigen/src/Tensor/TensorReduction.h
index 36865354c..751516e1c 100644
--- a/unsupported/Eigen/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/src/Tensor/TensorReduction.h
@@ -453,7 +453,7 @@ template <int B, int N, typename S, typename R, typename I_>
 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*,
                                                                  unsigned int*);
-#if defined(EIGEN_HAS_GPU_FP16)
+#if defined(EIGEN_GPUCC)
 template <typename S, typename R, typename I_>
 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat(
     R, const S, I_, internal::packet_traits<half>::type*);
@@ -883,7 +883,7 @@ struct TensorReductionEvaluatorBase
 template <int B, int N, typename S, typename R, typename I_>
 KERNEL_FRIEND void internal::FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*, unsigned int*);
-#if defined(EIGEN_HAS_GPU_FP16)
+#if defined(EIGEN_GPUCC)
 template <typename S, typename R, typename I_>
 KERNEL_FRIEND void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I_,
                                                                    internal::packet_traits<half>::type*);
diff --git a/unsupported/Eigen/src/Tensor/TensorReductionGpu.h b/unsupported/Eigen/src/Tensor/TensorReductionGpu.h
index f3e5db6b6..645286e02 100644
--- a/unsupported/Eigen/src/Tensor/TensorReductionGpu.h
+++ b/unsupported/Eigen/src/Tensor/TensorReductionGpu.h
@@ -25,7 +25,6 @@ namespace internal {
 // updated the content of the output address it will try again.
 template <typename T, typename R>
 __device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) {
-#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
   if (sizeof(T) == 4) {
     unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
     unsigned int newval = oldval;
@@ -61,12 +60,6 @@ __device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer)
   } else {
     gpu_assert(0 && "Wordsize not supported");
   }
-#else   // EIGEN_CUDA_ARCH >= 300
-  EIGEN_UNUSED_VARIABLE(output);
-  EIGEN_UNUSED_VARIABLE(accum);
-  EIGEN_UNUSED_VARIABLE(reducer);
-  gpu_assert(0 && "Shouldn't be called on unsupported device");
-#endif  // EIGEN_CUDA_ARCH >= 300
 }
 
 // We extend atomicExch to support extra data types
@@ -75,13 +68,42 @@ __device__ inline Type atomicExchCustom(Type* address, Type val) {
   return atomicExch(address, val);
 }
 
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR auto reduction_shuffle_mask() {
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+  return 0xFFFFFFFFFFFFFFFFull;
+#else
+  return 0xFFFFFFFFu;
+#endif
+}
+
+template <typename T>
+__device__ EIGEN_ALWAYS_INLINE T reduction_shuffle_down(T value, int offset) {
+  return __shfl_down_sync(reduction_shuffle_mask<T>(), value, offset, warpSize);
+}
+
+template <>
+__device__ EIGEN_ALWAYS_INLINE int reduction_shuffle_down(int value, int offset) {
+  return __shfl_down_sync(reduction_shuffle_mask<int>(), value, offset, warpSize);
+}
+
+template <>
+__device__ EIGEN_ALWAYS_INLINE float reduction_shuffle_down(float value, int offset) {
+  return __shfl_down_sync(reduction_shuffle_mask<float>(), value, offset, warpSize);
+}
+
+template <>
+__device__ EIGEN_ALWAYS_INLINE double reduction_shuffle_down(double value, int offset) {
+  return __shfl_down_sync(reduction_shuffle_mask<double>(), value, offset, warpSize);
+}
+
 template <>
 __device__ inline double atomicExchCustom(double* address, double val) {
   unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(address);
   return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val)));
 }
 
-#ifdef EIGEN_HAS_GPU_FP16
+// Half-float reduction specializations.
 template <typename R>
 __device__ inline void atomicReduce(half2* output, half2 accum, R& reducer) {
   unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
@@ -111,17 +133,10 @@ __device__ inline void atomicReduce(Packet4h2* output, Packet4h2 accum, R& reducer) {
   }
 }
 #endif  // EIGEN_GPU_COMPILE_PHASE
-#endif  // EIGEN_HAS_GPU_FP16
 
 template <>
 __device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) {
-#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
   atomicAdd(output, accum);
-#else   // EIGEN_CUDA_ARCH >= 300
-  EIGEN_UNUSED_VARIABLE(output);
-  EIGEN_UNUSED_VARIABLE(accum);
-  gpu_assert(0 && "Shouldn't be called on unsupported device");
-#endif  // EIGEN_CUDA_ARCH >= 300
 }
 
 template <typename CoeffType, typename Index>
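
atomicReduce above publishes a partial result with a compare-and-swap retry loop whenever no hardware atomic matches the reducer. Isolated, for a 32-bit float payload and a generic binary functor (op is a placeholder, not an Eigen reducer):

    // Sketch of the CAS protocol behind the sizeof(T) == 4 branch of atomicReduce.
    template <typename Op>
    __device__ void atomic_reduce_f32(float* output, float accum, Op op) {
      unsigned int* addr = reinterpret_cast<unsigned int*>(output);
      unsigned int oldval = *addr;
      unsigned int assumed;
      do {
        assumed = oldval;
        float merged = op(__uint_as_float(assumed), accum);
        // atomicCAS stores 'merged' only if *output is still 'assumed';
        // otherwise another thread won the race and we retry with its value.
        oldval = atomicCAS(addr, assumed, __float_as_uint(merged));
      } while (assumed != oldval);
    }
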
@@ -138,7 +153,6 @@ template <int BlockSize, int NumPerThread, typename Self, typename Reducer, typename Index>
 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
                                                                  typename Self::CoeffReturnType* output, unsigned int* semaphore) {
-#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
   // Initialize the output value
   const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x;
   if (gridDim.x == 1) {
@@ -179,20 +193,7 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(Reducer reducer,
 #pragma unroll
     for (int offset = warpSize / 2; offset > 0; offset /= 2) {
-#if defined(EIGEN_HIPCC)
-      // use std::is_floating_point to determine the type of reduced_val
-      // This is needed because when Type == double, hipcc will give a "call to __shfl_down is ambiguous" error
-      // and list the float and int versions of __shfl_down as the candidate functions.
-      if (std::is_floating_point<Type>::value) {
-        reducer.reduce(__shfl_down(static_cast<float>(accum), offset, warpSize), &accum);
-      } else {
-        reducer.reduce(__shfl_down(static_cast<int>(accum), offset, warpSize), &accum);
-      }
-#elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
-      reducer.reduce(__shfl_down(accum, offset, warpSize), &accum);
-#else
-      reducer.reduce(__shfl_down_sync(0xFFFFFFFF, accum, offset, warpSize), &accum);
-#endif
+      reducer.reduce(reduction_shuffle_down(accum, offset), &accum);
     }
 
     if ((threadIdx.x & (warpSize - 1)) == 0) {
@@ -206,17 +207,9 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(Reducer reducer,
       __threadfence_system();
 #endif
   }
-#else   // EIGEN_CUDA_ARCH >= 300
-  EIGEN_UNUSED_VARIABLE(reducer);
-  EIGEN_UNUSED_VARIABLE(input);
-  EIGEN_UNUSED_VARIABLE(num_coeffs);
-  EIGEN_UNUSED_VARIABLE(output);
-  EIGEN_UNUSED_VARIABLE(semaphore);
-  gpu_assert(0 && "Shouldn't be called on unsupported device");
-#endif  // EIGEN_CUDA_ARCH >= 300
 }
 
-#ifdef EIGEN_HAS_GPU_FP16
+// Half-float reduction specializations.
 template <typename Self, typename Reducer, typename Index>
 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Self input,
                                                                                    Index num_coeffs, half* scratch) {
@@ -319,14 +312,6 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(Reducer reducer,
       hr[i] = wka_out.h;
     }
     reducer.reducePacket(r1, &accum);
-#elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
-    PacketType r1;
-    half2* hr = reinterpret_cast<half2*>(&r1);
-    half2* hacc = reinterpret_cast<half2*>(&accum);
-    for (int i = 0; i < packet_width / 2; i++) {
-      hr[i] = __shfl_down(hacc[i], offset, warpSize);
-    }
-    reducer.reducePacket(r1, &accum);
 #else
     PacketType r1;
     half2* hr = reinterpret_cast<half2*>(&r1);
@@ -377,8 +362,6 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionCleanupKernelHalfFloat(Op reducer,
   }
 }
 
-#endif  // EIGEN_HAS_GPU_FP16
-
 template <typename Self, typename Op, typename OutputType, bool PacketAccess>
 struct FullReductionLauncher {
   static void run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index) {
@@ -409,7 +392,7 @@ struct FullReductionLauncher<
   }
 };
 
-#ifdef EIGEN_HAS_GPU_FP16
+// Half-float reduction specializations.
 template <typename Self, typename Op>
 struct FullReductionLauncher<Self, Op, Eigen::half, false> {
   static void run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index) {
@@ -443,24 +426,18 @@ struct FullReductionLauncher<Self, Op, Eigen::half, false> {
     }
   }
 };
-#endif  // EIGEN_HAS_GPU_FP16
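
The #else branch retained in FullReductionKernelHalfFloat moves an fp16 packet down the warp one half2 at a time; CUDA 9+ ships a native __half2 overload of __shfl_down_sync, which is what made the deleted pre-9.0 branch redundant. The surviving pattern, extracted (Packet stands for the kernel's PacketType, assumed to be addressable as an array of half2):

    template <typename Packet>
    __device__ inline Packet shuffle_packet_down(Packet accum, int offset, int packet_width) {
      Packet r;
      half2* dst = reinterpret_cast<half2*>(&r);
      half2* src = reinterpret_cast<half2*>(&accum);
      for (int i = 0; i < packet_width / 2; ++i) {
        dst[i] = __shfl_down_sync(0xFFFFFFFFu, src[i], offset, warpSize);
      }
      return r;
    }
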
 
 template <typename Self, typename Op, bool Vectorizable>
 struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
   // Unfortunately nvidia doesn't support well exotic types such as complex,
   // so reduce the scope of the optimized version of the code to the simple cases
   // of doubles, floats and half floats
-#ifdef EIGEN_HAS_GPU_FP16
+  // Half is included now that GPU fp16 support is unconditional.
   static constexpr bool HasOptimizedImplementation =
       !Self::ReducerTraits::IsStateful &&
       (internal::is_same<typename Self::CoeffReturnType, float>::value ||
        internal::is_same<typename Self::CoeffReturnType, double>::value ||
        (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value &&
         reducer_traits<Op, GpuDevice>::PacketAccess));
-#else   // EIGEN_HAS_GPU_FP16
-  static constexpr bool HasOptimizedImplementation =
-      !Self::ReducerTraits::IsStateful && (internal::is_same<typename Self::CoeffReturnType, float>::value ||
-                                           internal::is_same<typename Self::CoeffReturnType, double>::value);
-#endif  // EIGEN_HAS_GPU_FP16
 
   template <typename OutputType>
   static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
@@ -481,7 +458,6 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(Reducer reducer, const Self input,
                                                                   Index num_coeffs_to_reduce, Index num_preserved_coeffs,
                                                                   typename Self::CoeffReturnType* output) {
-#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
   typedef typename Self::CoeffReturnType Type;
   eigen_assert(blockDim.y == 1);
   eigen_assert(blockDim.z == 1);
@@ -534,20 +510,7 @@
 #pragma unroll
       for (int offset = warpSize / 2; offset > 0; offset /= 2) {
-#if defined(EIGEN_HIPCC)
-        // use std::is_floating_point to determine the type of reduced_val
-        // This is needed because when Type == double, hipcc will give a "call to __shfl_down is ambiguous" error
-        // and list the float and int versions of __shfl_down as the candidate functions.
-        if (std::is_floating_point<Type>::value) {
-          reducer.reduce(__shfl_down(static_cast<float>(reduced_val), offset), &reduced_val);
-        } else {
-          reducer.reduce(__shfl_down(static_cast<int>(reduced_val), offset), &reduced_val);
-        }
-#elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
-        reducer.reduce(__shfl_down(reduced_val, offset), &reduced_val);
-#else
-        reducer.reduce(__shfl_down_sync(0xFFFFFFFF, reduced_val, offset), &reduced_val);
-#endif
+        reducer.reduce(reduction_shuffle_down(reduced_val, offset), &reduced_val);
       }
 
       if ((threadIdx.x & (warpSize - 1)) == 0) {
@@ -555,17 +518,9 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(Reducer reducer, const Self input,
       }
     }
   }
-#else   // EIGEN_CUDA_ARCH >= 300
-  EIGEN_UNUSED_VARIABLE(reducer);
-  EIGEN_UNUSED_VARIABLE(input);
-  EIGEN_UNUSED_VARIABLE(num_coeffs_to_reduce);
-  EIGEN_UNUSED_VARIABLE(num_preserved_coeffs);
-  EIGEN_UNUSED_VARIABLE(output);
-  gpu_assert(0 && "Shouldn't be called on unsupported device");
-#endif  // EIGEN_CUDA_ARCH >= 300
 }
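
FullReductionKernel and InnerReductionKernel now share the same inner loop through the reduction_shuffle_down helper. In isolation it is the classic warp tree reduction; after log2(warpSize) steps lane 0 holds the warp total:

    // Usage sketch of reduction_shuffle_down (defined earlier in this patch).
    __device__ inline float warp_sum(float v) {
      for (int offset = warpSize / 2; offset > 0; offset /= 2) {
        v += reduction_shuffle_down(v, offset);
      }
      return v;  // valid in lane 0
    }
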
 
-#ifdef EIGEN_HAS_GPU_FP16
+// Half-float reduction specializations.
 template <int NumPerThread, typename Self, typename Reducer, typename Index>
 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(Reducer reducer, const Self input,
@@ -688,19 +643,6 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(Reducer reducer,
       }
       reducer.reducePacket(r1, &reduced_val1);
       reducer.reducePacket(r2, &reduced_val2);
-#elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
-      PacketType r1;
-      PacketType r2;
-      half2* hr1 = reinterpret_cast<half2*>(&r1);
-      half2* hr2 = reinterpret_cast<half2*>(&r2);
-      half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
-      half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
-      for (int i = 0; i < packet_width / 2; i++) {
-        hr1[i] = __shfl_down(rv1[i], offset, warpSize);
-        hr2[i] = __shfl_down(rv2[i], offset, warpSize);
-      }
-      reducer.reducePacket(r1, &reduced_val1);
-      reducer.reducePacket(r2, &reduced_val2);
 #else
       PacketType r1;
       PacketType r2;
@@ -741,8 +683,6 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(Reducer reducer,
   }
 }
 
-#endif  // EIGEN_HAS_GPU_FP16
-
 template <typename Self, typename Op, typename OutputType, bool PacketAccess>
 struct InnerReductionLauncher {
   static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index,
@@ -786,7 +726,7 @@ struct InnerReductionLauncher<
   }
 };
 
-#ifdef EIGEN_HAS_GPU_FP16
+// Half-float reduction specializations.
 template <typename Self, typename Op>
 struct InnerReductionLauncher<Self, Op, Eigen::half, false> {
   static bool run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index, typename Self::Index) {
@@ -826,24 +766,18 @@ struct InnerReductionLauncher<Self, Op, Eigen::half, false> {
     return false;
   }
 };
-#endif  // EIGEN_HAS_GPU_FP16
 
 template <typename Self, typename Op>
 struct InnerReducer<Self, Op, GpuDevice> {
   // Unfortunately nvidia doesn't support well exotic types such as complex,
   // so reduce the scope of the optimized version of the code to the simple case
   // of floats and half floats.
-#ifdef EIGEN_HAS_GPU_FP16
+  // Half is included now that GPU fp16 support is unconditional.
   static constexpr bool HasOptimizedImplementation =
       !Self::ReducerTraits::IsStateful &&
       (internal::is_same<typename Self::CoeffReturnType, float>::value ||
        internal::is_same<typename Self::CoeffReturnType, double>::value ||
        (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value &&
         reducer_traits<Op, GpuDevice>::PacketAccess));
-#else   // EIGEN_HAS_GPU_FP16
-  static constexpr bool HasOptimizedImplementation =
-      !Self::ReducerTraits::IsStateful && (internal::is_same<typename Self::CoeffReturnType, float>::value ||
-                                           internal::is_same<typename Self::CoeffReturnType, double>::value);
-#endif  // EIGEN_HAS_GPU_FP16
 
   template <typename OutputType>
   static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output,
diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index 4160fad49..e90ca8cf0 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -237,7 +237,7 @@ if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8" AND NOT CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
   ei_add_test(cxx11_tensor_uint128)
 endif()
 
-find_package(CUDA 9.0)
+find_package(CUDA 11.4)
 if(CUDA_FOUND AND EIGEN_TEST_CUDA)
   # Make sure to compile without the -pedantic, -Wundef, -Wnon-virtual-dtor
   # and -fno-check-new flags since they trigger thousands of compilation warnings
@@ -281,26 +281,11 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA)
   ei_add_test(cxx11_tensor_argmax_gpu)
   ei_add_test(cxx11_tensor_cast_float16_gpu)
   ei_add_test(cxx11_tensor_scan_gpu)
-
-  set(EIGEN_CUDA_OLDEST_COMPUTE_ARCH 9999)
-  foreach(ARCH IN LISTS EIGEN_CUDA_COMPUTE_ARCH)
-    if(${ARCH} LESS ${EIGEN_CUDA_OLDEST_COMPUTE_ARCH})
-      set(EIGEN_CUDA_OLDEST_COMPUTE_ARCH ${ARCH})
-    endif()
-  endforeach()
-
-  # Contractions require arch 3.0 or higher
-  if (${EIGEN_CUDA_OLDEST_COMPUTE_ARCH} GREATER 29)
-    ei_add_test(cxx11_tensor_device)
-    ei_add_test(cxx11_tensor_gpu)
-    ei_add_test(cxx11_tensor_contract_gpu)
-    ei_add_test(cxx11_tensor_of_float16_gpu)
-  endif()
-
-  # The random number generation code requires arch 3.5 or greater.
-  if (${EIGEN_CUDA_OLDEST_COMPUTE_ARCH} GREATER 34)
-    ei_add_test(cxx11_tensor_random_gpu)
-  endif()
+  ei_add_test(cxx11_tensor_device)
+  ei_add_test(cxx11_tensor_gpu)
+  ei_add_test(cxx11_tensor_contract_gpu)
+  ei_add_test(cxx11_tensor_of_float16_gpu)
+  ei_add_test(cxx11_tensor_random_gpu)
 
   unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
 endif()
@@ -341,7 +326,6 @@ if (EIGEN_TEST_HIP)
   ei_add_test(cxx11_tensor_cast_float16_gpu)
   ei_add_test(cxx11_tensor_scan_gpu)
   ei_add_test(cxx11_tensor_device)
-  ei_add_test(cxx11_tensor_gpu)
   ei_add_test(cxx11_tensor_contract_gpu)
   ei_add_test(cxx11_tensor_of_float16_gpu)
diff --git a/unsupported/test/cxx11_tensor_gpu.cu b/unsupported/test/cxx11_tensor_gpu.cu
index d96e1b234..28d27abf0 100644
--- a/unsupported/test/cxx11_tensor_gpu.cu
+++ b/unsupported/test/cxx11_tensor_gpu.cu
@@ -850,6 +850,7 @@ void test_gpu_igamma() {
   Tensor<Scalar, 2> a(6, 6);
   Tensor<Scalar, 2> x(6, 6);
   Tensor<Scalar, 2> out(6, 6);
+  Tensor<Scalar, 2> expected_out(6, 6);
   out.setZero();
 
   Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
   Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
 
   for (int i = 0; i < 6; ++i) {
     for (int j = 0; j < 6; ++j) {
       a(i, j) = a_s[i];
       x(i, j) = x_s[j];
     }
   }
 
-  Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
-  Scalar igamma_s[][6] = {
-      {0.0, nan, nan, nan, nan, nan},
-      {0.0, 0.6321205588285578, 0.7768698398515702, 0.9816843611112658, 9.999500016666262e-05, 1.0},
-      {0.0, 0.4275932955291202, 0.608374823728911, 0.9539882943107686, 7.522076445089201e-07, 1.0},
-      {0.0, 0.01898815687615381, 0.06564245437845008, 0.5665298796332909, 4.166333347221828e-18, 1.0},
-      {0.0, 0.9999780593618628, 0.9999899967080838, 0.9999996219837988, 0.9991370418689945, 1.0},
-      {0.0, 0.0, 0.0, 0.0, 0.0, 0.5042041932513908}};
+  for (int i = 0; i < 6; ++i) {
+    for (int j = 0; j < 6; ++j) {
+      expected_out(i, j) = numext::igamma(a(i, j), x(i, j));
+    }
+  }
 
   std::size_t bytes = a.size() * sizeof(Scalar);
@@ -897,10 +895,10 @@ void test_gpu_igamma() {
   for (int i = 0; i < 6; ++i) {
     for (int j = 0; j < 6; ++j) {
-      if ((std::isnan)(igamma_s[i][j])) {
+      if ((std::isnan)(expected_out(i, j))) {
         VERIFY((std::isnan)(out(i, j)));
       } else {
-        VERIFY_IS_APPROX(out(i, j), igamma_s[i][j]);
+        VERIFY_IS_APPROX(out(i, j), expected_out(i, j));
       }
     }
   }
@@ -915,6 +913,7 @@ void test_gpu_igammac() {
   Tensor<Scalar, 2> a(6, 6);
   Tensor<Scalar, 2> x(6, 6);
   Tensor<Scalar, 2> out(6, 6);
+  Tensor<Scalar, 2> expected_out(6, 6);
   out.setZero();
 
   Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
   Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
 
   for (int i = 0; i < 6; ++i) {
     for (int j = 0; j < 6; ++j) {
       a(i, j) = a_s[i];
       x(i, j) = x_s[j];
     }
   }
 
-  Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
-  Scalar igammac_s[][6] = {
-      {nan, nan, nan, nan, nan, nan},
-      {1.0, 0.36787944117144233, 0.22313016014842982, 0.018315638888734182, 0.9999000049998333, 0.0},
-      {1.0, 0.5724067044708798, 0.3916251762710878, 0.04601170568923136, 0.9999992477923555, 0.0},
-      {1.0, 0.9810118431238462, 0.9343575456215499, 0.4334701203667089, 1.0, 0.0},
-      {1.0, 2.1940638138146658e-05, 1.0003291916285e-05, 3.7801620118431334e-07, 0.0008629581310054535, 0.0},
-      {1.0, 1.0, 1.0, 1.0, 1.0, 0.49579580674813944}};
+  for (int i = 0; i < 6; ++i) {
+    for (int j = 0; j < 6; ++j) {
+      expected_out(i, j) = numext::igammac(a(i, j), x(i, j));
+    }
+  }
 
   std::size_t bytes = a.size() * sizeof(Scalar);
@@ -962,10 +958,10 @@ void test_gpu_igammac() {
   for (int i = 0; i < 6; ++i) {
     for (int j = 0; j < 6; ++j) {
-      if ((std::isnan)(igammac_s[i][j])) {
+      if ((std::isnan)(expected_out(i, j))) {
         VERIFY((std::isnan)(out(i, j)));
       } else {
-        VERIFY_IS_APPROX(out(i, j), igammac_s[i][j]);
+        VERIFY_IS_APPROX(out(i, j), expected_out(i, j));
       }
     }
   }
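
Computing the expected values on the host through the same numext entry points removes the hard-coded tables and keeps NaN propagation and precision consistent with the scalar path on both sides of the comparison. The pattern, isolated (assuming the special-function headers the test already includes):

    #include <unsupported/Eigen/CXX11/Tensor>
    #include <unsupported/Eigen/SpecialFunctions>

    template <typename Scalar>
    void fill_igamma_reference(const Eigen::Tensor<Scalar, 2>& a, const Eigen::Tensor<Scalar, 2>& x,
                               Eigen::Tensor<Scalar, 2>& expected) {
      for (Eigen::Index i = 0; i < a.dimension(0); ++i)
        for (Eigen::Index j = 0; j < a.dimension(1); ++j)
          // Same implementation the device kernel exercises, evaluated on the host.
          expected(i, j) = Eigen::numext::igamma(a(i, j), x(i, j));
    }
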
@@ -1068,15 +1064,9 @@ void test_gpu_ndtri() {
   in_x(7) = Scalar(0.99);
   in_x(8) = Scalar(0.01);
 
-  expected_out(0) = std::numeric_limits<Scalar>::infinity();
-  expected_out(1) = -std::numeric_limits<Scalar>::infinity();
-  expected_out(2) = Scalar(0.0);
-  expected_out(3) = Scalar(-0.8416212335729142);
-  expected_out(4) = Scalar(0.8416212335729142);
-  expected_out(5) = Scalar(1.2815515655446004);
-  expected_out(6) = Scalar(-1.2815515655446004);
-  expected_out(7) = Scalar(2.3263478740408408);
-  expected_out(8) = Scalar(-2.3263478740408408);
+  for (int i = 0; i < 9; ++i) {
+    expected_out(i) = numext::ndtri(in_x(i));
+  }
 
   std::size_t bytes = in_x.size() * sizeof(Scalar);
@@ -1090,15 +1080,15 @@ void test_gpu_ndtri() {
   Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
 
-  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 6);
-  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 6);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 9);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 9);
 
   gpu_out.device(gpu_device) = gpu_in_x.ndtri();
 
   assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
   assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
 
-  for (int i = 0; i < 6; ++i) {
+  for (int i = 0; i < 9; ++i) {
     VERIFY_IS_CWISE_APPROX(out(i), expected_out(i));
   }
@@ -1115,12 +1105,9 @@ void test_gpu_betainc() {
   Tensor<Scalar, 1> expected_out(125);
   out.setZero();
 
-  Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
-
   Array<Scalar, 1, Dynamic> x(125);
   Array<Scalar, 1, Dynamic> a(125);
   Array<Scalar, 1, Dynamic> b(125);
-  Array<Scalar, 1, Dynamic> v(125);
 
   a << 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
       0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2,
       0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1;
 
-  v << nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
-      nan, nan, nan, nan, nan, nan, nan, nan, nan, 0.47972119876364683, 0.5, 0.5202788012363533, nan, nan,
-      0.9518683957740043, 0.9789663010413743, 0.9931729188073435, nan, nan, 0.999995949033062, 0.9999999999993698,
-      0.9999999999999999, nan, nan, 0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan, nan, nan, nan, nan,
-      nan, nan, 0.006827081192655869, 0.0210336989586256, 0.04813160422599567, nan, nan, 0.20014344256217678,
-      0.5000000000000001, 0.7998565574378232, nan, nan, 0.9991401428435834, 0.999999999698403, 0.9999999999999999, nan,
-      nan, 0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan, nan, nan, nan, nan, nan, nan,
-      1.0646600232370887e-25, 6.301722877826246e-13, 4.050966937974938e-06, nan, nan, 7.864342668429763e-23,
-      3.015969667594166e-10, 0.0008598571564165444, nan, nan, 6.031987710123844e-08, 0.5000000000000007,
-      0.9999999396801229, nan, nan, 0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan, nan, nan, nan, nan,
-      nan, nan, 0.0, 7.029920380986636e-306, 2.2450728208591345e-101, nan, nan, 0.0, 9.275871147869727e-302,
-      1.2232913026152827e-97, nan, nan, 0.0, 3.0891393081932924e-252, 2.9303043666183996e-60, nan, nan,
-      2.248913486879199e-196, 0.5000000000004947, 0.9999999999999999, nan;
-
   for (int i = 0; i < 125; ++i) {
     in_x(i) = x(i);
     in_a(i) = a(i);
     in_b(i) = b(i);
-    expected_out(i) = v(i);
+    expected_out(i) = numext::betainc(a(i), b(i), x(i));
   }
 
   std::size_t bytes = in_x.size() * sizeof(Scalar);
diff --git a/unsupported/test/cxx11_tensor_of_float16_gpu.cu b/unsupported/test/cxx11_tensor_of_float16_gpu.cu
index d29df4fed..d57a25e71 100644
--- a/unsupported/test/cxx11_tensor_of_float16_gpu.cu
+++ b/unsupported/test/cxx11_tensor_of_float16_gpu.cu
@@ -53,8 +53,6 @@ void test_gpu_numext() {
   gpu_device.deallocate(d_res_float);
 }
 
-#ifdef EIGEN_HAS_GPU_FP16
-
 template <typename>
 void test_gpu_conversion() {
   Eigen::GpuStreamDevice stream;
@@ -442,12 +440,10 @@ void test_gpu_forced_evals() {
   gpu_device.deallocate(d_res_half2);
   gpu_device.deallocate(d_res_float);
 }
-#endif
 
 EIGEN_DECLARE_TEST(cxx11_tensor_of_float16_gpu) {
   CALL_SUBTEST_1(test_gpu_numext<void>());
 
-#ifdef EIGEN_HAS_GPU_FP16
   CALL_SUBTEST_1(test_gpu_conversion<void>());
   CALL_SUBTEST_1(test_gpu_unary<void>());
   CALL_SUBTEST_1(test_gpu_elementwise<void>());
@@ -456,7 +452,4 @@ EIGEN_DECLARE_TEST(cxx11_tensor_of_float16_gpu) {
   CALL_SUBTEST_3(test_gpu_reductions<void>());
   CALL_SUBTEST_4(test_gpu_full_reductions<void>());
   CALL_SUBTEST_5(test_gpu_forced_evals<void>());
-#else
-  std::cout << "Half floats are not supported by this version of gpu: skipping the test" << std::endl;
-#endif
 }
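
With the EIGEN_HAS_GPU_FP16 guards gone, every float16 subtest compiles unconditionally under both nvcc and hipcc, and the skip message disappears because there is no longer a configuration in which Eigen::half is unavailable on the GPU path. The shape these subtests share is a half computation checked against its float reference, roughly (VERIFY_IS_APPROX comes from test/main.h):

    #include <unsupported/Eigen/CXX11/Tensor>

    void check_half_roundtrip(const Eigen::Tensor<float, 1>& in) {
      Eigen::Tensor<Eigen::half, 1> h = in.cast<Eigen::half>();
      Eigen::Tensor<float, 1> back = h.cast<float>();
      for (Eigen::Index i = 0; i < in.dimension(0); ++i) {
        // fp16 keeps ~11 significand bits, so compare at half precision.
        VERIFY_IS_APPROX(Eigen::half(back(i)), Eigen::half(in(i)));
      }
    }
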