From 414c42bfcf2bec054bb19c64be8cb3337f121bad Mon Sep 17 00:00:00 2001
From: Antonio Sanchez
Date: Tue, 11 Mar 2025 19:20:03 -0700
Subject: [PATCH] Fix cuda clang builds

---
 Eigen/Core                                    |  11 +-
 Eigen/src/Core/GenericPacketMath.h            |   2 +-
 Eigen/src/Core/MathFunctions.h                |   2 +-
 Eigen/src/Core/arch/CUDA/Half.h               |  52 +--
 Eigen/src/Core/arch/CUDA/PacketMathHalf.h     |  76 ++---
 Eigen/src/Core/util/Meta.h                    |   2 +-
 test/CMakeLists.txt                           |   6 +-
 .../Eigen/CXX11/src/Tensor/TensorChipping.h   |   1 +
 .../CXX11/src/Tensor/TensorContractionCuda.h  |  11 +-
 .../CXX11/src/Tensor/TensorConvolution.h      |  30 +-
 .../Eigen/CXX11/src/Tensor/TensorDeviceCuda.h |   5 +-
 .../Eigen/CXX11/src/Tensor/TensorExecutor.h   |   2 +-
 .../Eigen/CXX11/src/Tensor/TensorIndexList.h  |   3 +
 .../Eigen/CXX11/src/Tensor/TensorReduction.h  |   2 +-
 .../CXX11/src/Tensor/TensorReductionCuda.h    | 306 +++++++++++++-----
 .../Eigen/CXX11/src/Tensor/TensorUInt128.h    |   1 +
 unsupported/test/CMakeLists.txt               |   8 +-
 .../test/cxx11_tensor_of_float16_cuda.cu      |   2 +-
 18 files changed, 334 insertions(+), 188 deletions(-)

diff --git a/Eigen/Core b/Eigen/Core
index a032dfab9..56f19acb6 100644
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -62,9 +62,16 @@
 #else
   #define EIGEN_DEVICE_FUNC
-
 #endif
 
+#if defined(EIGEN_CUDACC)
+  #include <cuda.h>
+  #define EIGEN_CUDA_SDK_VER (CUDA_VERSION * 10)
+#else
+  #define EIGEN_CUDA_SDK_VER 0
+#endif
+
+
 // When compiling CUDA device code with NVCC, pull in math functions from the
 // global namespace. In host mode, and when device code is compiled with clang,
 // use the std versions.
@@ -254,7 +261,7 @@
 #if defined EIGEN_CUDACC
   #define EIGEN_VECTORIZE_CUDA
   #include <vector_types.h>
-  #if EIGEN_CUDACC_VER >= 70500
+  #if EIGEN_CUDA_SDK_VER >= 70500
     #define EIGEN_HAS_CUDA_FP16
   #endif
 #endif
diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h
index a6878bb5a..d2e401777 100644
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@@ -299,7 +299,7 @@ template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstoreu
 template<typename Scalar> EIGEN_DEVICE_FUNC inline void prefetch(const Scalar* addr) {
 #ifdef __CUDA_ARCH__
-#if defined(__LP64__)
+#if defined(__LP64__) || EIGEN_OS_WIN64
 // 64-bit pointer operand constraint for inlined asm
 asm(" prefetch.L1 [ %1 ];" : "=l"(addr) : "l"(addr));
 #else
diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h
index cceffb9a3..2dc0015bf 100644
--- a/Eigen/src/Core/MathFunctions.h
+++ b/Eigen/src/Core/MathFunctions.h
@@ -539,7 +539,7 @@ namespace std_fallback {
 template<typename Scalar>
 struct log1p_impl {
-  static inline Scalar run(const Scalar& x)
+  static EIGEN_DEVICE_FUNC inline Scalar run(const Scalar& x)
   {
     EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
 #if EIGEN_HAS_CXX11_MATH
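For reference, a minimal sketch of the value scale the new macro preserves. It assumes only the CUDA_VERSION encoding from cuda.h (1000 * major + 10 * minor); the program below is illustrative, not part of the patch:

    // EIGEN_CUDA_SDK_VER keeps the numeric scale of the old EIGEN_CUDACC_VER,
    // but derives it from cuda.h, which clang provides as well as nvcc:
    //   CUDA 7.5: CUDA_VERSION == 7050 -> EIGEN_CUDA_SDK_VER == 70500
    //   CUDA 8.0: CUDA_VERSION == 8000 -> EIGEN_CUDA_SDK_VER == 80000
    //   CUDA 9.0: CUDA_VERSION == 9000 -> EIGEN_CUDA_SDK_VER == 90000
    #include <cstdio>
    #include <cuda.h>

    int main() {
      std::printf("EIGEN_CUDA_SDK_VER would be %d\n", CUDA_VERSION * 10);
      return 0;
    }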
diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h
index aaa97a13b..1d7bf36bc 100644
--- a/Eigen/src/Core/arch/CUDA/Half.h
+++ b/Eigen/src/Core/arch/CUDA/Half.h
@@ -57,7 +57,7 @@ struct __half_raw {
   explicit EIGEN_DEVICE_FUNC __half_raw(unsigned short raw) : x(raw) {}
   unsigned short x;
 };
-#elif defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
+#elif EIGEN_CUDA_SDK_VER < 90000
 // In CUDA < 9.0, __half is the equivalent of CUDA 9's __half_raw
 typedef __half __half_raw;
 #endif
@@ -70,7 +70,7 @@ struct half_base : public __half_raw {
   EIGEN_DEVICE_FUNC half_base() {}
   EIGEN_DEVICE_FUNC half_base(const half_base& h) : __half_raw(h) {}
   EIGEN_DEVICE_FUNC half_base(const __half_raw& h) : __half_raw(h) {}
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000
+#if defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDA_SDK_VER >= 90000
   EIGEN_DEVICE_FUNC half_base(const __half& h) : __half_raw(*(__half_raw*)&h) {}
 #endif
 };
@@ -79,7 +79,7 @@ struct half_base : public __half_raw {
 
 // Class definition.
 struct half : public half_impl::half_base {
-  #if !defined(EIGEN_HAS_CUDA_FP16) || (defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000)
+  #if !defined(EIGEN_HAS_CUDA_FP16) || (EIGEN_CUDA_SDK_VER < 90000)
   typedef half_impl::__half_raw __half_raw;
   #endif
 
@@ -87,7 +87,7 @@ struct half : public half_impl::half_base {
   EIGEN_DEVICE_FUNC half(const __half_raw& h) : half_impl::half_base(h) {}
   EIGEN_DEVICE_FUNC half(const half& h) : half_impl::half_base(h) {}
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000
+#if defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDA_SDK_VER >= 90000
   EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {}
 #endif
 
@@ -209,55 +209,55 @@ namespace half_impl {
 // versions to get the ALU speed increased), but you do save the
 // conversion steps back and forth.
 
-EIGEN_STRONG_INLINE __device__ half operator + (const half& a, const half& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator + (const half& a, const half& b) {
   return __hadd(static_cast<__half>(a), static_cast<__half>(b));
 }
-EIGEN_STRONG_INLINE __device__ half operator * (const half& a, const half& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator * (const half& a, const half& b) {
   return __hmul(static_cast<__half>(a), static_cast<__half>(b));
 }
-EIGEN_STRONG_INLINE __device__ half operator - (const half& a, const half& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a, const half& b) {
   return __hsub(static_cast<__half>(a), static_cast<__half>(b));
 }
-EIGEN_STRONG_INLINE __device__ half operator / (const half& a, const half& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, const half& b) {
   float num = __half2float(a);
   float denom = __half2float(b);
   return __float2half(num / denom);
 }
-EIGEN_STRONG_INLINE __device__ half operator - (const half& a) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a) {
   return __hneg(static_cast<__half>(a));
 }
-EIGEN_STRONG_INLINE __device__ half& operator += (half& a, const half& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator += (half& a, const half& b) {
   a = a + b;
   return a;
 }
-EIGEN_STRONG_INLINE __device__ half& operator *= (half& a, const half& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator *= (half& a, const half& b) {
   a = a * b;
   return a;
 }
-EIGEN_STRONG_INLINE __device__ half& operator -= (half& a, const half& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator -= (half& a, const half& b) {
   a = a - b;
   return a;
 }
-EIGEN_STRONG_INLINE __device__ half& operator /= (half& a, const half& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator /= (half& a, const half& b) {
   a = a / b;
   return a;
 }
-EIGEN_STRONG_INLINE __device__ bool operator == (const half& a, const half& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator == (const half& a, const half& b) {
   return __heq(static_cast<__half>(a), static_cast<__half>(b));
 }
-EIGEN_STRONG_INLINE __device__ bool operator != (const half& a, const half& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator != (const half& a, const half& b) {
   return __hne(static_cast<__half>(a), static_cast<__half>(b));
 }
-EIGEN_STRONG_INLINE __device__ bool operator < (const half& a, const half& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator < (const half& a, const half& b) {
   return __hlt(static_cast<__half>(a), static_cast<__half>(b));
 }
-EIGEN_STRONG_INLINE __device__ bool operator <= (const half& a, const half& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator <= (const half& a, const half& b) {
   return __hle(static_cast<__half>(a), static_cast<__half>(b));
 }
-EIGEN_STRONG_INLINE __device__ bool operator > (const half& a, const half& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator > (const half& a, const half& b) {
   return __hgt(static_cast<__half>(a), static_cast<__half>(b));
 }
-EIGEN_STRONG_INLINE __device__ bool operator >= (const half& a, const half& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator >= (const half& a, const half& b) {
   return __hge(static_cast<__half>(a), static_cast<__half>(b));
 }
@@ -449,14 +449,14 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half abs(const half& a) {
   return result;
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp(const half& a) {
-#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530
+#if EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530
   return half(hexp(a));
 #else
   return half(::expf(float(a)));
 #endif
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log(const half& a) {
-#if defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
+#if defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDA_SDK_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
   return half(::hlog(a));
 #else
   return half(::logf(float(a)));
@@ -469,7 +469,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log10(const half& a) {
   return half(::log10f(float(a)));
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sqrt(const half& a) {
-#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530
+#if EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530
   return half(hsqrt(a));
 #else
   return half(::sqrtf(float(a)));
@@ -491,14 +491,14 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tanh(const half& a) {
   return half(::tanhf(float(a)));
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) {
-#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300
+#if EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300
   return half(hfloor(a));
 #else
   return half(::floorf(float(a)));
 #endif
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) {
-#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300
+#if EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300
   return half(hceil(a));
 #else
   return half(::ceilf(float(a)));
@@ -593,7 +593,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half exph(const Eigen::half& a) {
   return Eigen::half(::expf(float(a)));
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half logh(const Eigen::half& a) {
-#if EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
+#if EIGEN_CUDA_SDK_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
   return Eigen::half(::hlog(a));
 #else
   return Eigen::half(::logf(float(a)));
@@ -638,7 +638,7 @@ struct hash<Eigen::half> {
 //
 // Note that the following are __device__-only functions.
 
 #if defined(EIGEN_CUDACC) && (!defined(EIGEN_CUDA_ARCH) || EIGEN_CUDA_ARCH >= 300)
-#if defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDACC_VER >= 90000
+#if defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDA_SDK_VER >= 90000
 __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_sync(unsigned mask, Eigen::half var, int srcLane, int width = warpSize) {
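The switch from bare __device__ to EIGEN_DEVICE_FUNC in these operators matters because clang compiles host and device code in a single pass, so the overloads must also be visible to host-side name lookup. A minimal sketch of the pattern, assuming DEVICE_FUNC expands to __host__ __device__ under a CUDA compiler (f16 is a stand-in type, not Eigen's half):

    #if defined(__CUDACC__)
    #define DEVICE_FUNC __host__ __device__
    #else
    #define DEVICE_FUNC
    #endif

    struct f16 { float v; };  // stand-in for Eigen::half in this sketch

    // One overload serves both host and device; the real code dispatches to
    // __hadd on sm_53+ devices and to float arithmetic everywhere else.
    DEVICE_FUNC inline f16 operator+(const f16& a, const f16& b) {
      return f16{a.v + b.v};
    }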
diff --git a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
index aa6c11f57..8da026e19 100644
--- a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
+++ b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
@@ -41,42 +41,42 @@ template<> struct packet_traits<Eigen::half> : default_packet_traits
 template<> struct unpacket_traits<half2> { typedef Eigen::half type; enum {size=2, alignment=Aligned16}; typedef half2 half; };
 
-template<> __device__ EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) {
   return __half2half2(from);
 }
 
-template<> __device__ EIGEN_STRONG_INLINE half2 pload<half2>(const Eigen::half* from) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload<half2>(const Eigen::half* from) {
   return *reinterpret_cast<const half2*>(from);
 }
 
-template<> __device__ EIGEN_STRONG_INLINE half2 ploadu<half2>(const Eigen::half* from) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploadu<half2>(const Eigen::half* from) {
   return __halves2half2(from[0], from[1]);
 }
 
-template<> EIGEN_STRONG_INLINE half2 ploaddup<half2>(const Eigen::half* from) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploaddup<half2>(const Eigen::half* from) {
   return __halves2half2(from[0], from[0]);
 }
 
-template<> __device__ EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const half2& from) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const half2& from) {
   *reinterpret_cast<half2*>(to) = from;
 }
 
-template<> __device__ EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const half2& from) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const half2& from) {
   to[0] = __low2half(from);
   to[1] = __high2half(from);
 }
 
 template<>
- __device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const Eigen::half* from) {
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const Eigen::half* from) {
 #if __CUDA_ARCH__ >= 350
-   return __ldg((const half2*)from);
+   return __ldg(reinterpret_cast<const half2*>(from));
 #else
   return __halves2half2(*(from+0), *(from+1));
 #endif
 }
 
 template<>
-__device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const Eigen::half* from) {
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const Eigen::half* from) {
 #if __CUDA_ARCH__ >= 350
   return __halves2half2(__ldg(from+0), __ldg(from+1));
 #else
@@ -84,20 +84,20 @@ __device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const Eigen::ha
 #endif
 }
 
-template<> __device__ EIGEN_STRONG_INLINE half2 pgather<Eigen::half, half2>(const Eigen::half* from, Index stride) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pgather<Eigen::half, half2>(const Eigen::half* from, Index stride) {
   return __halves2half2(from[0*stride], from[1*stride]);
 }
 
-template<> __device__ EIGEN_STRONG_INLINE void pscatter<Eigen::half, half2>(Eigen::half* to, const half2& from, Index stride) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<Eigen::half, half2>(Eigen::half* to, const half2& from, Index stride) {
   to[stride*0] = __low2half(from);
   to[stride*1] = __high2half(from);
 }
 
-template<> __device__ EIGEN_STRONG_INLINE Eigen::half pfirst<half2>(const half2& a) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst<half2>(const half2& a) {
   return __low2half(a);
 }
 
-template<> __device__ EIGEN_STRONG_INLINE half2 pabs<half2>(const half2& a) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pabs<half2>(const half2& a) {
   half2 result;
   unsigned temp = *(reinterpret_cast<const unsigned*>(&(a)));
   *(reinterpret_cast<unsigned*>(&(result))) = temp & 0x7FFF7FFF;
@@ -105,7 +105,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pabs<half2>(const half2& a) {
 }
 
-__device__ EIGEN_STRONG_INLINE void
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
 ptranspose(PacketBlock<half2,2>& kernel) {
   __half a1 = __low2half(kernel.packet[0]);
   __half a2 = __high2half(kernel.packet[0]);
@@ -115,7 +115,7 @@ ptranspose(PacketBlock<half2,2>& kernel) {
   kernel.packet[1] = __halves2half2(a2, b2);
 }
 
-template<> __device__ EIGEN_STRONG_INLINE half2 plset<half2>(const Eigen::half& a) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset<half2>(const Eigen::half& a) {
 #if __CUDA_ARCH__ >= 530
   return __halves2half2(a, __hadd(a, __float2half(1.0f)));
 #else
@@ -124,7 +124,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 plset<half2>(const Eigen::half&
 #endif
 }
 
-template<> __device__ EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
 #if __CUDA_ARCH__ >= 530
   return __hadd2(a, b);
 #else
@@ -138,7 +138,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, cons
 #endif
 }
 
-template<> __device__ EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, const half2& b) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, const half2& b) {
 #if __CUDA_ARCH__ >= 530
   return __hsub2(a, b);
 #else
@@ -152,7 +152,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, cons
 #endif
 }
 
-template<> __device__ EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
 #if __CUDA_ARCH__ >= 530
   return __hneg2(a);
 #else
@@ -162,9 +162,9 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
 #endif
 }
 
-template<> __device__ EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }
 
-template<> __device__ EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) {
 #if __CUDA_ARCH__ >= 530
   return __hmul2(a, b);
 #else
@@ -178,7 +178,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, cons
 #endif
 }
 
-template<> __device__ EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, const half2& b, const half2& c) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, const half2& b, const half2& c) {
 #if __CUDA_ARCH__ >= 530
   return __hfma2(a, b, c);
 #else
@@ -194,7 +194,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, con
 #endif
 }
 
-template<> __device__ EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) {
   float a1 = __low2float(a);
   float a2 = __high2float(a);
   float b1 = __low2float(b);
@@ -204,7 +204,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, cons
   return __floats2half2_rn(r1, r2);
 }
 
-template<> __device__ EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, const half2& b) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, const half2& b) {
   float a1 = __low2float(a);
   float a2 = __high2float(a);
   float b1 = __low2float(b);
@@ -214,7 +214,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, cons
   return __halves2half2(r1, r2);
 }
 
-template<> __device__ EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, const half2& b) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, const half2& b) {
   float a1 = __low2float(a);
   float a2 = __high2float(a);
   float b1 = __low2float(b);
@@ -224,7 +224,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, cons
   return __halves2half2(r1, r2);
 }
 
-template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux<half2>(const half2& a) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux<half2>(const half2& a) {
 #if __CUDA_ARCH__ >= 530
   return __hadd(__low2half(a), __high2half(a));
 #else
@@ -234,7 +234,7 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux<half2>(const half2&
 #endif
 }
 
-template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_max<half2>(const half2& a) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max<half2>(const half2& a) {
 #if __CUDA_ARCH__ >= 530
   __half first = __low2half(a);
   __half second = __high2half(a);
@@ -246,7 +246,7 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_max<half2>(const ha
 #endif
 }
 
-template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_min<half2>(const half2& a) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min<half2>(const half2& a) {
 #if __CUDA_ARCH__ >= 530
   __half first = __low2half(a);
   __half second = __high2half(a);
@@ -258,7 +258,7 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_min<half2>(const ha
 #endif
 }
 
-template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_mul<half2>(const half2& a) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_mul<half2>(const half2& a) {
 #if __CUDA_ARCH__ >= 530
   return __hmul(__low2half(a), __high2half(a));
 #else
@@ -268,7 +268,7 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_mul<half2>(const ha
 #endif
 }
 
-template<> __device__ EIGEN_STRONG_INLINE half2 plog1p<half2>(const half2& a) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog1p<half2>(const half2& a) {
   float a1 = __low2float(a);
   float a2 = __high2float(a);
   float r1 = log1pf(a1);
@@ -276,31 +276,31 @@ template<> __device__ EIGEN_STRONG_INLINE half2 plog1p<half2>(const half2& a) {
   return __floats2half2_rn(r1, r2);
 }
 
-#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530
+#if EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530
 
-template<> __device__ EIGEN_STRONG_INLINE
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 half2 plog<half2>(const half2& a) {
   return h2log(a);
 }
 
-template<> __device__ EIGEN_STRONG_INLINE
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 half2 pexp<half2>(const half2& a) {
   return h2exp(a);
 }
 
-template<> __device__ EIGEN_STRONG_INLINE
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 half2 psqrt<half2>(const half2& a) {
   return h2sqrt(a);
 }
 
-template<> __device__ EIGEN_STRONG_INLINE
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 half2 prsqrt<half2>(const half2& a) {
   return h2rsqrt(a);
 }
 
 #else
 
-template<> __device__ EIGEN_STRONG_INLINE half2 plog<half2>(const half2& a) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog<half2>(const half2& a) {
   float a1 = __low2float(a);
   float a2 = __high2float(a);
   float r1 = logf(a1);
@@ -308,7 +308,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 plog<half2>(const half2& a) {
   return __floats2half2_rn(r1, r2);
 }
 
-template<> __device__ EIGEN_STRONG_INLINE half2 pexp<half2>(const half2& a) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexp<half2>(const half2& a) {
   float a1 = __low2float(a);
   float a2 = __high2float(a);
   float r1 = expf(a1);
@@ -316,7 +316,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pexp<half2>(const half2& a) {
   return __floats2half2_rn(r1, r2);
 }
 
-template<> __device__ EIGEN_STRONG_INLINE half2 psqrt<half2>(const half2& a) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psqrt<half2>(const half2& a) {
   float a1 = __low2float(a);
   float a2 = __high2float(a);
   float r1 = sqrtf(a1);
@@ -324,7 +324,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 psqrt<half2>(const half2& a) {
   return __floats2half2_rn(r1, r2);
 }
 
-template<> __device__ EIGEN_STRONG_INLINE half2 prsqrt<half2>(const half2& a) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 prsqrt<half2>(const half2& a) {
   float a1 = __low2float(a);
   float a2 = __high2float(a);
   float r1 = rsqrtf(a1);
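The pre-sm_53 branches above all follow the same widen/operate/narrow shape, because native half2 arithmetic only exists on compute capability 5.3+. A sketch of that fallback, using only cuda_fp16.h intrinsics:

    #include <cuda_fp16.h>

    __device__ half2 add_half2(half2 a, half2 b) {
    #if __CUDA_ARCH__ >= 530
      return __hadd2(a, b);  // native half2 add
    #else
      // No half2 ALU: widen each lane to float, add, and repack.
      float a1 = __low2float(a), a2 = __high2float(a);
      float b1 = __low2float(b), b2 = __high2float(b);
      return __floats2half2_rn(a1 + b1, a2 + b2);
    #endif
    }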
diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h
index 782c4b885..f54eb2876 100755
--- a/Eigen/src/Core/util/Meta.h
+++ b/Eigen/src/Core/util/Meta.h
@@ -541,7 +541,7 @@ using std::numeric_limits;
 // Integer division with rounding up.
 // T is assumed to be an integer type with a>=0, and b>0
 template<typename T>
-T div_ceil(const T &a, const T &b)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T div_ceil(const T &a, const T &b)
 {
   return (a+b-1) / b;
 }
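div_ceil is the usual integer ceiling-division identity; marking it EIGEN_DEVICE_FUNC lets the reduction kernels further down call it from device code. A small host-side check of the contract (a >= 0, b > 0), illustrative only:

    #include <cassert>

    template <typename T>
    T div_ceil(const T& a, const T& b) {
      return (a + b - 1) / b;  // == ceil(a / b) for non-negative integer a, positive b
    }

    int main() {
      assert(div_ceil(10, 4) == 3);
      assert(div_ceil(8, 4) == 2);
      assert(div_ceil(0, 4) == 0);
      return 0;
    }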
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index e3ce600c3..5d1fa9cf1 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -365,7 +365,11 @@ find_package(CUDA 5.0)
 if(CUDA_FOUND)
 
   if( (NOT EIGEN_TEST_CXX11) OR (CMAKE_VERSION VERSION_LESS 3.3))
-    string(APPEND EIGEN_CUDA_CXX11_FLAGS " -std=c++11")
+    if (MSVC)
+      string(APPEND EIGEN_CUDA_CXX11_FLAGS " /std:c++11")
+    else()
+      string(APPEND EIGEN_CUDA_CXX11_FLAGS " -std=c++11")
+    endif(MSVC)
   endif()
 
   if(EIGEN_TEST_CUDA_CLANG)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
index 996b8ea86..80cfb9eb4 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
@@ -42,6 +42,7 @@ template <DenseIndex DimId>
 struct DimensionId {
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DimensionId(DenseIndex dim) {
+    EIGEN_ONLY_USED_FOR_DEBUG(dim);
     eigen_assert(dim == DimId);
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex actualDim() const {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
index 7563838ae..b78467b2a 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
@@ -388,7 +388,7 @@ EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
   // the sum across all big k blocks of the product of little k block of index (x, y)
   // with block of index (y, z). To compute the final output, we need to reduce
   // the 8 threads over y by summation.
-#if EIGEN_CUDACC_VER < 90000
+#if EIGEN_CUDA_SDK_VER < 90000
 #define shuffleInc(i, j, mask) res(i, j) += __shfl_xor(res(i, j), mask)
 #else
 #define shuffleInc(i, j, mask) res(i, j) += __shfl_xor_sync(0xFFFFFFFF, res(i, j), mask)
@@ -533,8 +533,6 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh
                        float2 rhs_shmem2[][8], const Index m_size,
                        const Index n_size, const Index k_size,
                        const Index base_m, const Index base_n) {
-  typedef float Scalar;
-
   // prefetch registers
   float4 lhs_pf0, rhs_pf0;
 
@@ -619,7 +617,7 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh
       x1 = rhs_pf0.x;
       x2 = rhs_pf0.z;
     }
-#if EIGEN_CUDACC_VER < 90000
+#if EIGEN_CUDA_SDK_VER < 90000
     x1 = __shfl_xor(x1, 4);
     x2 = __shfl_xor(x2, 4);
 #else
@@ -775,8 +773,6 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
                        float2 rhs_shmem2[][8], const Index m_size,
                        const Index n_size, const Index k_size,
                        const Index base_m, const Index base_n) {
-  typedef float Scalar;
-
   // prefetch registers
   float4 lhs_pf0, lhs_pf1, lhs_pf2, lhs_pf3;
   float4 rhs_pf0, rhs_pf1;
@@ -1146,9 +1142,6 @@ EigenFloatContractionKernel(const LhsMapper lhs, const RhsMapper rhs,
   typedef float2 LHS_MEM[64][32];
   typedef float2 RHS_MEM[128][8];
 
-  typedef float2 LHS_MEM16x16[32][16];
-  typedef float2 RHS_MEM16x16[64][8];
-
   const Index m_block_idx = blockIdx.x;
   const Index n_block_idx = blockIdx.y;
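The __shfl_xor to __shfl_xor_sync rewrites above are the standard CUDA 9 migration: the legacy shuffles implicitly acted on the whole warp, while the _sync forms take an explicit member mask. A sketch of the dual-path pattern, guarded here with CUDART_VERSION purely for illustration:

    // Butterfly reduction across a 32-lane warp; after the loop every lane
    // holds the warp-wide sum.
    __device__ float warp_sum(float v) {
      for (int mask = 16; mask > 0; mask >>= 1) {
    #if defined(CUDART_VERSION) && CUDART_VERSION < 9000
        v += __shfl_xor(v, mask);                   // pre-9.0: implicit full warp
    #else
        v += __shfl_xor_sync(0xFFFFFFFF, v, mask);  // 9.0+: explicit mask
    #endif
      }
      return v;
    }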
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
index 81cbd4f01..8b0e9bf3b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
@@ -570,8 +570,8 @@ __global__ void EigenConvolutionKernel1D(
     const int maxX, const int kernelSize, float* buffer) {
   extern __shared__ float s[];
 
-  const int first_x = blockIdx.x * maxX;
-  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
+  const int first_x = static_cast<int>(blockIdx.x * maxX);
+  const int last_x = static_cast<int>((first_x + maxX < numX ? first_x + maxX : numX) - 1);
   const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSize>()(kernelSize);
   const int num_x_output = last_x - first_x + 1;
@@ -619,13 +619,13 @@ __global__ void EigenConvolutionKernel2D(
     const int kernelSizeY, float* buffer) {
   extern __shared__ float s[];
 
-  const int first_x = blockIdx.x * maxX;
-  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
+  const int first_x = static_cast<int>(blockIdx.x * maxX);
+  const int last_x = static_cast<int>((first_x + maxX < numX ? first_x + maxX : numX) - 1);
   const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSizeX>()(kernelSizeX);
   const int num_x_output = last_x - first_x + 1;
 
-  const int first_y = blockIdx.y * maxY;
-  const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
+  const int first_y = static_cast<int>(blockIdx.y * maxY);
+  const int last_y = static_cast<int>((first_y + maxY < numY ? first_y + maxY : numY) - 1);
   const int num_y_input = last_y - first_y + GetKernelSize<StaticKernelSizeY>()(kernelSizeY);
   const int num_y_output = last_y - first_y + 1;
 
@@ -688,17 +688,17 @@ __global__ void EigenConvolutionKernel3D(
   extern __shared__ float s[];
 
   // Load inputs to shared memory
-  const int first_x = blockIdx.x * maxX;
-  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
-  const int num_x_input = last_x - first_x + kernelSizeX;
+  const int first_x = static_cast<int>(blockIdx.x * maxX);
+  const int last_x = static_cast<int>((first_x + maxX < numX ? first_x + maxX : numX) - 1);
+  const int num_x_input = static_cast<int>(last_x - first_x + kernelSizeX);
 
-  const int first_y = blockIdx.y * maxY;
-  const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
-  const int num_y_input = last_y - first_y + kernelSizeY;
+  const int first_y = static_cast<int>(blockIdx.y * maxY);
+  const int last_y = static_cast<int>((first_y + maxY < numY ? first_y + maxY : numY) - 1);
+  const int num_y_input = static_cast<int>(last_y - first_y + kernelSizeY);
 
-  const int first_z = blockIdx.z * maxZ;
-  const int last_z = (first_z + maxZ < numZ ? first_z + maxZ : numZ) - 1;
-  const int num_z_input = last_z - first_z + kernelSizeZ;
+  const int first_z = static_cast<int>(blockIdx.z * maxZ);
+  const int last_z = static_cast<int>((first_z + maxZ < numZ ? first_z + maxZ : numZ) - 1);
+  const int num_z_input = static_cast<int>(last_z - first_z + kernelSizeZ);
 
   for (size_t p = 0; p < numPlanes; ++p) {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
index 51cd7fcde..f7eda1ea9 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
@@ -248,6 +248,9 @@ struct GpuDevice {
     EIGEN_UNUSED_VARIABLE(err)
     assert(err == cudaSuccess);
 #else
+    EIGEN_UNUSED_VARIABLE(buffer);
+    EIGEN_UNUSED_VARIABLE(c);
+    EIGEN_UNUSED_VARIABLE(n);
     eigen_assert(false && "The default device should be used instead to generate kernel code");
 #endif
   }
@@ -292,7 +295,7 @@ struct GpuDevice {
     return stream_->deviceProperties().maxThreadsPerMultiProcessor;
   }
   EIGEN_STRONG_INLINE int sharedMemPerBlock() const {
-    return stream_->deviceProperties().sharedMemPerBlock;
+    return static_cast<int>(stream_->deviceProperties().sharedMemPerBlock);
   }
   EIGEN_STRONG_INLINE int majorDeviceVersion() const {
     return stream_->deviceProperties().major;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index 55667b161..620c7c535 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -253,7 +253,7 @@ inline void TensorExecutor<Expression, GpuDevice, Vectorizable>::run(
     const int block_size = device.maxCudaThreadsPerBlock();
     const int max_blocks = device.getNumCudaMultiProcessors() *
                            device.maxCudaThreadsPerMultiProcessor() / block_size;
-    const Index size = array_prod(evaluator.dimensions());
+    const Index size = static_cast<Index>(array_prod(evaluator.dimensions()));
     // Create at least one block to ensure we won't crash when tensorflow calls with tensors of size 0.
     const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, divup<int>(size, block_size)), 1);
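The clamped launch configuration used above can be read as a tiny helper; the function name below is illustrative only:

    #include <algorithm>

    // Never request more blocks than the device can keep resident, but always
    // launch at least one so a size-0 tensor doesn't yield a <<<0, N>>> launch.
    int clamped_num_blocks(long long size, int block_size, int max_blocks) {
      int wanted = static_cast<int>((size + block_size - 1) / block_size);
      return std::max(std::min(max_blocks, wanted), 1);
    }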
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h
index 62452be52..d8f33a400 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h
@@ -22,6 +22,7 @@ struct type2index {
   static const DenseIndex value = n;
   EIGEN_DEVICE_FUNC constexpr operator DenseIndex() const { return n; }
   EIGEN_DEVICE_FUNC void set(DenseIndex val) {
+    EIGEN_ONLY_USED_FOR_DEBUG(val);
     eigen_assert(val == n);
   }
 };
@@ -38,6 +39,7 @@ struct type2indexpair {
   }
 
   EIGEN_DEVICE_FUNC void set(const IndexPair<DenseIndex>& val) {
+    EIGEN_ONLY_USED_FOR_DEBUG(val);
     eigen_assert(val.first == f);
     eigen_assert(val.second == s);
   }
@@ -254,6 +256,7 @@ struct tuple_coeff<0, ValueT> {
   }
   template <typename... T>
   EIGEN_DEVICE_FUNC static void set(const DenseIndex i, IndexTuple<T...>& t, const ValueT value) {
+    EIGEN_ONLY_USED_FOR_DEBUG(i);
     eigen_assert (i == 0);
     update_value(array_get<0>(t), value);
   }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index a96e59f4c..93d955872 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -497,7 +497,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
     }
     else if(RunningOnSycl){
       const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
-      const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
+      const Index num_coeffs_to_preserve = static_cast<Index>(internal::array_prod(m_dimensions));
       if (!data) {
         data = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve));
         m_result = data;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
index 62c4a766d..45731b82b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
@@ -63,6 +63,9 @@ __device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer)
     assert(0 && "Wordsize not supported");
   }
 #else
+  EIGEN_UNUSED_VARIABLE(output);
+  EIGEN_UNUSED_VARIABLE(accum);
+  EIGEN_UNUSED_VARIABLE(reducer);
   assert(0 && "Shouldn't be called on unsupported device");
 #endif
 }
@@ -105,6 +108,8 @@ __device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) {
 #if __CUDA_ARCH__ >= 300
   atomicAdd(output, accum);
 #else
+  EIGEN_UNUSED_VARIABLE(output);
+  EIGEN_UNUSED_VARIABLE(accum);
   assert(0 && "Shouldn't be called on unsupported device");
 #endif
 }
@@ -185,6 +190,11 @@ __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num
     atomicInc(semaphore, gridDim.x + 1);
   }
 #else
+  EIGEN_UNUSED_VARIABLE(reducer);
+  EIGEN_UNUSED_VARIABLE(input);
+  EIGEN_UNUSED_VARIABLE(num_coeffs);
+  EIGEN_UNUSED_VARIABLE(output);
+  EIGEN_UNUSED_VARIABLE(semaphore);
   assert(0 && "Shouldn't be called on unsupported device");
 #endif
 }
@@ -194,14 +204,31 @@ __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num
 
 template <typename Self, typename Reducer, typename Index>
 __global__ void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half2* scratch) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
   eigen_assert(blockDim.x == 1);
   eigen_assert(gridDim.x == 1);
-  if (num_coeffs % 2 != 0) {
-    half last = input.m_impl.coeff(num_coeffs-1);
-    *scratch = __halves2half2(last, reducer.initialize());
+  typedef packet_traits<half>::type packet_type;
+  Index packet_remainder = num_coeffs % Index(unpacket_traits<packet_type>::size);
+  if (packet_remainder != 0) {
+    half2* h2scratch = reinterpret_cast<half2*>(scratch);
+    for (Index i = num_coeffs - packet_remainder; i + 2 <= num_coeffs; i += 2) {
+      *h2scratch = __halves2half2(input.coeff(i), input.coeff(i + 1));
+      h2scratch++;
+    }
+    if ((num_coeffs & 1) != 0) {
+      half lastCoeff = input.coeff(num_coeffs - 1);
+      *h2scratch = __halves2half2(lastCoeff, reducer.initialize());
+    }
   } else {
-    *scratch = reducer.template initializePacket<half2>();
+    packet_type reduce = reducer.template initializePacket<packet_type>();
+    internal::pstoreu(scratch, reduce);
   }
+#else
+  EIGEN_UNUSED_VARIABLE(input);
+  EIGEN_UNUSED_VARIABLE(reducer);
+  EIGEN_UNUSED_VARIABLE(num_coeffs);
+  EIGEN_UNUSED_VARIABLE(scratch);
+#endif
 }
 
 template <typename Self, typename Reducer, typename Index>
 __global__ void ReductionInitKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half* output) {
   const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
   const Index num_threads = blockDim.x * gridDim.x;
-  const Index num_packets = num_coeffs / 2;
-  for (Index i = thread_id; i < num_packets; i += num_threads) {
-    ((half2*)output)[i] = reducer.template initializePacket<half2>();
-  }
+  typedef typename packet_traits<half>::type PacketType;
+  EIGEN_UNUSED_VARIABLE(input);
 
-  if (thread_id == 0 && num_coeffs % 2 != 0) {
-    output[num_coeffs-1] = reducer.initialize();
+  const Index num_packets = num_coeffs / Index(unpacket_traits<PacketType>::size);
+  PacketType* p_output = reinterpret_cast<PacketType*>(output);
+  for (Index i = thread_id; i < num_packets; i += num_threads) {
+    p_output[i] = reducer.template initializePacket<PacketType>();
+  }
+  Index packet_remainder = num_coeffs % Index(unpacket_traits<PacketType>::size);
+  if (thread_id < packet_remainder) {
+    output[num_coeffs - packet_remainder + thread_id] = reducer.initialize();
   }
 }
@@ -223,50 +254,94 @@ template <int BlockSize, int NumPerThread, typename Self, typename Reducer, typename Index>
 __global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs,
                                              half* output, half2* scratch) {
-  eigen_assert(NumPerThread % 2 == 0);
-
-  const Index first_index = blockIdx.x * BlockSize * NumPerThread + 2*threadIdx.x;
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+  EIGEN_UNUSED_VARIABLE(num_coeffs);
+  typedef typename packet_traits<half>::type PacketType;
+  const int packet_width = unpacket_traits<PacketType>::size;
+  eigen_assert(NumPerThread % packet_width == 0);
+  const Index first_index = blockIdx.x * BlockSize * NumPerThread + packet_width * threadIdx.x;
 
   // Initialize the output value if it wasn't initialized by the ReductionInitKernel
-  if (gridDim.x == 1 && first_index == 0) {
-    if (num_coeffs % 2 != 0) {
-      half last = input.m_impl.coeff(num_coeffs-1);
-      *scratch = __halves2half2(last, reducer.initialize());
-    } else {
-      *scratch = reducer.template initializePacket<half2>();
+  if (gridDim.x == 1) {
+    if (first_index == 0) {
+      int rem = num_coeffs % packet_width;
+      if (rem != 0) {
+        half2* p_scratch = reinterpret_cast<half2*>(scratch);
+        pstoreu(scratch, reducer.template initializePacket<PacketType>());
+        for (int i = 0; i < rem / 2; i++) {
+          *p_scratch = __halves2half2(input.coeff(num_coeffs - packet_width + 2 * i),
+                                      input.coeff(num_coeffs - packet_width + 2 * i + 1));
+          p_scratch++;
+        }
+        if ((num_coeffs & 1) != 0) {
+          half last = input.coeff(num_coeffs - 1);
+          *p_scratch = __halves2half2(last, reducer.initialize());
+        }
+      } else {
+        PacketType reduce = reducer.template initializePacket<PacketType>();
+        pstoreu(scratch, reduce);
+      }
     }
     __syncthreads();
   }
 
-  half2 accum = reducer.template initializePacket<half2>();
-  const Index max_iter = numext::mini<Index>((num_coeffs - first_index) / 2, NumPerThread*BlockSize / 2);
+  PacketType accum = reducer.template initializePacket<PacketType>();
+  const Index max_iter =
+      numext::mini<Index>((num_coeffs - first_index) / packet_width, NumPerThread * BlockSize / packet_width);
   for (Index i = 0; i < max_iter; i += BlockSize) {
-    const Index index = first_index + 2*i;
-    eigen_assert(index + 1 < num_coeffs);
-    half2 val = input.m_impl.template packet<Unaligned>(index);
+    const Index index = first_index + packet_width * i;
+    eigen_assert(index + packet_width < num_coeffs);
+    PacketType val = input.template packet<Unaligned>(index);
     reducer.reducePacket(val, &accum);
   }
 
-#pragma unroll
-  for (int offset = warpSize/2; offset > 0; offset /= 2) {
+  #pragma unroll
+  for (int offset = warpSize / 2; offset > 0; offset /= 2) {
 #if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
-    reducer.reducePacket(__shfl_down(accum, offset, warpSize), &accum);
+    PacketType r1;
+    half2* hr = reinterpret_cast<half2*>(&r1);
+    half2* hacc = reinterpret_cast<half2*>(&accum);
+    for (int i = 0; i < packet_width / 2; i++) {
+      hr[i] = __shfl_down(hacc[i], offset, warpSize);
+    }
+    reducer.reducePacket(r1, &accum);
 #else
-    reducer.reducePacket(__shfl_down_sync(0xFFFFFFFF, accum, offset, warpSize), &accum);
+    PacketType r1;
+    half2* hr = reinterpret_cast<half2*>(&r1);
+    half2* hacc = reinterpret_cast<half2*>(&accum);
+    for (int i = 0; i < packet_width / 2; i++) {
+      hr[i] = __shfl_down_sync(0xFFFFFFFF, hacc[i], (unsigned)offset, warpSize);
+    }
+    reducer.reducePacket(r1, &accum);
+
 #endif
   }
 
   if ((threadIdx.x & (warpSize - 1)) == 0) {
-    atomicReduce(scratch, accum, reducer);
+    atomicReduce(reinterpret_cast<PacketType*>(scratch), accum, reducer);
   }
 
   __syncthreads();
-
-  if (gridDim.x == 1 && first_index == 0) {
-    half tmp = __low2half(*scratch);
-    reducer.reduce(__high2half(*scratch), &tmp);
-    *output = tmp;
+  half2* rv1 = reinterpret_cast<half2*>(scratch);
+  if (packet_width > 2) {
+    reducer.reducePacket(rv1[2], rv1);
+    reducer.reducePacket(rv1[3], rv1 + 1);
+    reducer.reducePacket(rv1[1], rv1);
   }
+  if (gridDim.x == 1) {
+    if (first_index == 0) {
+      half tmp = __low2half(*rv1);
+      reducer.reduce(__high2half(*rv1), &tmp);
+      *output = tmp;
+    }
+  }
+#else
+  EIGEN_UNUSED_VARIABLE(reducer);
+  EIGEN_UNUSED_VARIABLE(input);
+  EIGEN_UNUSED_VARIABLE(num_coeffs);
+  EIGEN_UNUSED_VARIABLE(output);
+  EIGEN_UNUSED_VARIABLE(scratch);
+#endif
 }
 
 template <typename Self, typename Op, typename OutputType, bool PacketAccess>
@@ -296,7 +371,6 @@ struct FullReductionLauncher<
     void>::type> {
   static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs) {
     typedef typename Self::Index Index;
-    typedef typename Self::CoeffReturnType Scalar;
     const int block_size = 256;
     const int num_per_thread = 128;
     const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
@@ -448,6 +522,11 @@ __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index nu
     }
   }
 #else
+  EIGEN_UNUSED_VARIABLE(reducer);
+  EIGEN_UNUSED_VARIABLE(input);
+  EIGEN_UNUSED_VARIABLE(num_coeffs_to_reduce);
+  EIGEN_UNUSED_VARIABLE(num_preserved_coeffs);
+  EIGEN_UNUSED_VARIABLE(output);
   assert(0 && "Shouldn't be called on unsupported device");
 #endif
 }
@@ -458,27 +537,30 @@ __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index nu
 template <int NumPerThread, typename Self, typename Reducer, typename Index>
 __global__ void InnerReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs_to_reduce,
                                               Index num_preserved_coeffs, half* output) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
   eigen_assert(blockDim.y == 1);
   eigen_assert(blockDim.z == 1);
   eigen_assert(gridDim.y == 1);
   eigen_assert(gridDim.z == 1);
 
-  const int unroll_times = 16;
+  typedef typename packet_traits<half>::type PacketType;
+  const int packet_width = unpacket_traits<PacketType>::size;
+  const int unroll_times = 16 / packet_width;
   eigen_assert(NumPerThread % unroll_times == 0);
   eigen_assert(unroll_times % 2 == 0);
 
-  const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread * 2);
-  const Index num_input_blocks = divup<Index>(input_col_blocks * num_preserved_coeffs, 2);
+  const Index input_col_blocks = numext::div_ceil<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread * 2);
+  const Index num_input_blocks = numext::div_ceil<Index>(input_col_blocks * num_preserved_coeffs, 2);
 
   const Index num_threads = blockDim.x * gridDim.x;
   const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
 
   // Initialize the output values if they weren't initialized by the ReductionInitKernel
   if (gridDim.x == 1) {
-    Index i = 2*thread_id;
-    for (; i + 1 < num_preserved_coeffs; i += 2*num_threads) {
-      half* loc = output + i;
-      *((half2*)loc) = reducer.template initializePacket<half2>();
+    Index i = packet_width * thread_id;
+    for (; i + packet_width <= num_preserved_coeffs; i += packet_width * num_threads) {
+      PacketType* poutput = reinterpret_cast<PacketType*>(output + i);
+      *poutput = reducer.template initializePacket<PacketType>();
     }
     if (i < num_preserved_coeffs) {
       output[i] = reducer.initialize();
@@ -487,71 +569,123 @@ __global__ void InnerReductionKernelHalfFloat(Reducer reducer, const Self input,
   }
 
   for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
-    const Index row = 2 * (i / input_col_blocks);
+    const Index row = 2 * (i / input_col_blocks);  // everybody takes 2 rows
 
     if (row + 1 < num_preserved_coeffs) {
       const Index col_block = i % input_col_blocks;
-      const Index col_begin = 2 * (col_block * blockDim.x * NumPerThread + threadIdx.x);
+      const Index col_begin = packet_width * (col_block * blockDim.x * NumPerThread + threadIdx.x);
 
-      half2 reduced_val1 = reducer.template initializePacket<half2>();
-      half2 reduced_val2 = reducer.template initializePacket<half2>();
+      PacketType reduced_val1 = reducer.template initializePacket<PacketType>();
+      PacketType reduced_val2 = reducer.template initializePacket<PacketType>();
 
       for (Index j = 0; j < NumPerThread; j += unroll_times) {
-        const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1) * 2;
+        const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1) * packet_width;
         if (last_col >= num_coeffs_to_reduce) {
           Index col = col_begin + blockDim.x * j;
-          for (; col + 1 < num_coeffs_to_reduce; col += blockDim.x) {
-            const half2 val1 = input.m_impl.template packet<Unaligned>(row * num_coeffs_to_reduce + col);
+          for (; col + packet_width <= num_coeffs_to_reduce; col += blockDim.x) {
+            const PacketType val1 = input.m_impl.template packet<Unaligned>(row * num_coeffs_to_reduce + col);
             reducer.reducePacket(val1, &reduced_val1);
-            const half2 val2 = input.m_impl.template packet<Unaligned>((row+1) * num_coeffs_to_reduce + col);
+            const PacketType val2 = input.m_impl.template packet<Unaligned>((row + 1) * num_coeffs_to_reduce + col);
             reducer.reducePacket(val2, &reduced_val2);
           }
           if (col < num_coeffs_to_reduce) {
-            // Peel;
-            const half last1 = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
-            const half2 val1 = __halves2half2(last1, reducer.initialize());
-            reducer.reducePacket(val1, &reduced_val1);
-            const half last2 = input.m_impl.coeff((row+1) * num_coeffs_to_reduce + col);
-            const half2 val2 = __halves2half2(last2, reducer.initialize());
-            reducer.reducePacket(val2, &reduced_val2);
+            PacketType r1 = reducer.template initializePacket<PacketType>();
+            PacketType r2 = reducer.template initializePacket<PacketType>();
+            half2* hr1 = reinterpret_cast<half2*>(&r1);
+            half2* hr2 = reinterpret_cast<half2*>(&r2);
+            while (col + 1 < num_coeffs_to_reduce) {
+              *hr1 = __halves2half2(input.m_impl.coeff(row * num_coeffs_to_reduce + col),
+                                    input.m_impl.coeff(row * num_coeffs_to_reduce + col + 1));
+              *hr2 = __halves2half2(input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col),
+                                    input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col + 1));
+              hr1++;
+              hr2++;
+              col += 2;
+            }
+            if (col < num_coeffs_to_reduce) {
+              // Peel;
+              const half last1 = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
+              *hr1 = __halves2half2(last1, reducer.initialize());
+              const half last2 = input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col);
+              *hr2 = __halves2half2(last2, reducer.initialize());
+            }
+            reducer.reducePacket(r1, &reduced_val1);
+            reducer.reducePacket(r2, &reduced_val2);
           }
           break;
         } else {
           // Faster version of the loop with no branches after unrolling.
-#pragma unroll
+          #pragma unroll
           for (int k = 0; k < unroll_times; ++k) {
-            const Index col = col_begin + blockDim.x * (j + k) * 2;
-            reducer.reducePacket(input.m_impl.template packet<Unaligned>(row * num_coeffs_to_reduce + col), &reduced_val1);
-            reducer.reducePacket(input.m_impl.template packet<Unaligned>((row + 1)* num_coeffs_to_reduce + col), &reduced_val2);
+            const Index col = col_begin + blockDim.x * (j + k) * packet_width;
+            reducer.reducePacket(input.m_impl.template packet<Unaligned>(row * num_coeffs_to_reduce + col),
+                                 &reduced_val1);
+            reducer.reducePacket(input.m_impl.template packet<Unaligned>((row + 1) * num_coeffs_to_reduce + col),
+                                 &reduced_val2);
          }
        }
      }
 
-#pragma unroll
-      for (int offset = warpSize/2; offset > 0; offset /= 2) {
+      #pragma unroll
+      for (int offset = warpSize / 2; offset > 0; offset /= 2) {
 #if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
-
-        reducer.reducePacket(__shfl_down(reduced_val1, offset, warpSize), &reduced_val1);
-        reducer.reducePacket(__shfl_down(reduced_val2, offset, warpSize), &reduced_val2);
+        PacketType r1;
+        PacketType r2;
+        half2* hr1 = reinterpret_cast<half2*>(&r1);
+        half2* hr2 = reinterpret_cast<half2*>(&r2);
+        half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
+        half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
+        for (int i = 0; i < packet_width / 2; i++) {
+          hr1[i] = __shfl_down(rv1[i], offset, warpSize);
+          hr2[i] = __shfl_down(rv2[i], offset, warpSize);
+        }
+        reducer.reducePacket(r1, &reduced_val1);
+        reducer.reducePacket(r2, &reduced_val2);
 #else
-        reducer.reducePacket(__shfl_down_sync(0xFFFFFFFF, reduced_val1, offset, warpSize), &reduced_val1);
-        reducer.reducePacket(__shfl_down_sync(0xFFFFFFFF, reduced_val2, offset, warpSize), &reduced_val2);
+        PacketType r1;
+        PacketType r2;
+        half2* hr1 = reinterpret_cast<half2*>(&r1);
+        half2* hr2 = reinterpret_cast<half2*>(&r2);
+        half2* rr1 = reinterpret_cast<half2*>(&reduced_val1);
+        half2* rr2 = reinterpret_cast<half2*>(&reduced_val2);
+        for (int j = 0; j < packet_width / 2; j++) {
+          hr1[j] = __shfl_down_sync(0xFFFFFFFF, rr1[j], (unsigned)offset, warpSize);
+          hr2[j] = __shfl_down_sync(0xFFFFFFFF, rr2[j], (unsigned)offset, warpSize);
+        }
+        reducer.reducePacket(r1, &reduced_val1);
+        reducer.reducePacket(r2, &reduced_val2);
 #endif
       }
 
-      half val1 = __low2half(reduced_val1);
-      reducer.reduce(__high2half(reduced_val1), &val1);
-      half val2 = __low2half(reduced_val2);
-      reducer.reduce(__high2half(reduced_val2), &val2);
-      half2 val = __halves2half2(val1, val2);
-
+      half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
+      half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
+      half2 val;
+      if (packet_width > 2) {
+        reducer.reducePacket(rv1[2], rv1);
+        reducer.reducePacket(rv1[3], rv1 + 1);
+        reducer.reducePacket(rv1[1], rv1);
+        reducer.reducePacket(rv2[2], rv2);
+        reducer.reducePacket(rv2[3], rv2 + 1);
+        reducer.reducePacket(rv2[1], rv2);
+      }
+      half val1 = __low2half(*rv1);
+      reducer.reduce(__high2half(*rv1), &val1);
+      half val2 = __low2half(*rv2);
+      reducer.reduce(__high2half(*rv2), &val2);
+      val = __halves2half2(val1, val2);
       if ((threadIdx.x & (warpSize - 1)) == 0) {
         half* loc = output + row;
-        atomicReduce((half2*)loc, val, reducer);
+        atomicReduce(reinterpret_cast<half2*>(loc), val, reducer);
       }
     }
   }
+#else
+  EIGEN_UNUSED_VARIABLE(reducer);
+  EIGEN_UNUSED_VARIABLE(input);
+  EIGEN_UNUSED_VARIABLE(num_coeffs_to_reduce);
+  EIGEN_UNUSED_VARIABLE(num_preserved_coeffs);
+  EIGEN_UNUSED_VARIABLE(output);
+#endif
 }
 
 #endif
@@ -586,12 +720,12 @@ struct InnerReductionLauncher<
     if (num_blocks > 1) {
       // We initialize the outputs outside the reduction kernel when we can't be sure that there
      // won't be race conditions between multiple thread blocks.
-      const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
-      const int max_blocks = device.getNumCudaMultiProcessors() *
+      const int dyn_blocks2 = divup<int>(num_preserved_vals, 1024);
+      const int max_blocks2 = device.getNumCudaMultiProcessors() *
                            device.maxCudaThreadsPerMultiProcessor() / 1024;
-      const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
+      const int num_blocks2 = numext::mini<int>(max_blocks2, dyn_blocks2);
       LAUNCH_CUDA_KERNEL((ReductionInitKernel<OutputType, Index>),
-                         num_blocks, 1024, 0, device, reducer.initialize(),
+                         num_blocks2, 1024, 0, device, reducer.initialize(),
                          num_preserved_vals, output);
     }
@@ -632,10 +766,6 @@ struct InnerReductionLauncher<Self, Op, Eigen::half, false> {
     if (num_blocks > 1) {
       // We initialize the outputs outside the reduction kernel when we can't be sure that there
       // won't be race conditions between multiple thread blocks.
-      const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
-      const int max_blocks = device.getNumCudaMultiProcessors() *
-                           device.maxCudaThreadsPerMultiProcessor() / 1024;
-      const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
       LAUNCH_CUDA_KERNEL((ReductionInitKernelHalfFloat<Self, Op, Index>),
                          1, 1, 0, device, reducer, self, num_preserved_vals, output);
     }
@@ -745,12 +875,12 @@ struct OuterReducer<Self, Op, GpuDevice> {
     if (num_blocks > 1) {
      // We initialize the outputs in the reduction kernel itself when we don't have to worry
       // about race conditions between multiple thread blocks.
-      const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
-      const int max_blocks = device.getNumCudaMultiProcessors() *
+      const int dyn_blocks2 = divup<int>(num_preserved_vals, 1024);
+      const int max_blocks2 = device.getNumCudaMultiProcessors() *
                              device.maxCudaThreadsPerMultiProcessor() / 1024;
-      const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
+      const int num_blocks2 = numext::mini<int>(max_blocks2, dyn_blocks2);
       LAUNCH_CUDA_KERNEL((ReductionInitKernel<float, Index>),
-                         num_blocks, 1024, 0, device, reducer.initialize(),
+                         num_blocks2, 1024, 0, device, reducer.initialize(),
                          num_preserved_vals, output);
     }
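The recurring shuffle pattern in the rewritten kernels generalizes the old two-half code to any even packet width: warp shuffles move 32-bit quantities, so a packet of N halves is shuffled as N/2 half2 lanes. A condensed sketch of just that step; HalfPacket and shfl_down_packet are stand-ins for the PacketType plumbing above, not Eigen names:

    #include <cuda_fp16.h>

    template <int Width>  // Width halves per packet, Width % 2 == 0
    struct HalfPacket { half2 lanes[Width / 2]; };

    template <int Width>
    __device__ HalfPacket<Width> shfl_down_packet(const HalfPacket<Width>& p,
                                                  unsigned offset) {
      HalfPacket<Width> r;
      for (int i = 0; i < Width / 2; ++i)  // one 32-bit shuffle per half2 lane
        r.lanes[i] = __shfl_down_sync(0xFFFFFFFF, p.lanes[i], offset, warpSize);
      return r;
    }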
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
index 3523e7c94..e5dbcf935 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
@@ -23,6 +23,7 @@ struct static_val {
   template <typename T>
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static_val(const T& v) {
+    EIGEN_ONLY_USED_FOR_DEBUG(v);
     eigen_assert(v == n);
   }
 };
diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index c3c68f3ab..9248f0feb 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -72,7 +72,7 @@ if(MPFR_FOUND AND EIGEN_COMPILER_SUPPORT_CPP11)
   include_directories(${MPFR_INCLUDES} ./mpreal)
   ei_add_property(EIGEN_TESTED_BACKENDS "MPFR C++, ")
   set(EIGEN_MPFR_TEST_LIBRARIES ${MPFR_LIBRARIES} ${GMP_LIBRARIES})
-  ei_add_test(mpreal_support "-std=c++11" "${EIGEN_MPFR_TEST_LIBRARIES}" )
+  ei_add_test(mpreal_support "-std=c++11" "${EIGEN_MPFR_TEST_LIBRARIES}" )
 else()
   ei_add_property(EIGEN_MISSING_BACKENDS "MPFR C++, ")
 endif()
@@ -218,7 +218,11 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA)
   message(STATUS "Flags used to compile cuda code: " ${CMAKE_CXX_FLAGS})
 
   if( (NOT EIGEN_TEST_CXX11) OR (CMAKE_VERSION VERSION_LESS 3.3))
-    string(APPEND EIGEN_CUDA_CXX11_FLAGS " -std=c++11")
+    if (MSVC)
+      string(APPEND EIGEN_CUDA_CXX11_FLAGS " /std:c++11")
+    else()
+      string(APPEND EIGEN_CUDA_CXX11_FLAGS " -std=c++11")
+    endif(MSVC)
   endif()
 
   if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
diff --git a/unsupported/test/cxx11_tensor_of_float16_cuda.cu b/unsupported/test/cxx11_tensor_of_float16_cuda.cu
index e296bf991..cf36a4958 100644
--- a/unsupported/test/cxx11_tensor_of_float16_cuda.cu
+++ b/unsupported/test/cxx11_tensor_of_float16_cuda.cu
@@ -339,7 +339,7 @@ void test_cuda_reductions(int size1, int size2, int redux) {
   gpu_float1.device(gpu_device) = gpu_float1.random() * 2.0f;
   gpu_float2.device(gpu_device) = gpu_float2.random() * 2.0f;
 
-  Eigen::array<int, 1> redux_dim = {{redux}};
+  Eigen::array<int, 1> redux_dim = {redux};
   gpu_res_float.device(gpu_device) = gpu_float1.sum(redux_dim).cast<Eigen::half>();
   gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().sum(redux_dim);
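One plausible reading of the final hunk: with C++11, Eigen::array is backed by std::array, so a single set of braces is enough to initialize the one-element aggregate and avoids brace-elision warnings under clang. A host-only sketch with std::array standing in for Eigen::array:

    #include <array>

    int main() {
      int redux = 0;
      std::array<int, 1> redux_dim = {redux};  // single braces, as in the patch
      return redux_dim[0];
    }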