diff --git a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h index dc09c74d1..61d532e4d 100644 --- a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +++ b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h @@ -17,7 +17,8 @@ // we'll use on the host side (SSE, AVX, ...) #if defined(__CUDACC__) && defined(EIGEN_USE_GPU) -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 +// Most of the following operations require arch >= 5.3 +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 namespace Eigen { namespace internal { @@ -67,20 +68,12 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(half* to, co template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro(const half* from) { -#if __CUDA_ARCH__ >= 320 return __ldg((const half2*)from); -#else - return __halves2half2(*(from+0), *(from+1)); -#endif } template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro(const half* from) { -#if __CUDA_ARCH__ >= 320 return __halves2half2(__ldg(from+0), __ldg(from+1)); -#else - return __halves2half2(*(from+0), *(from+1)); -#endif } template<> EIGEN_DEVICE_FUNC inline half2 pgather(const half* from, Index stride) { @@ -113,8 +106,6 @@ ptranspose(PacketBlock& kernel) { kernel.packet[1] = __halves2half2(a2, b2); } -// The following operations require arch >= 5.3 -#if __CUDA_ARCH__ >= 530 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset(const half& a) { return __halves2half2(a, __hadd(a, __float2half(1.0f))); } @@ -190,7 +181,6 @@ template<> EIGEN_DEVICE_FUNC inline half predux_min(const half2& a) { template<> EIGEN_DEVICE_FUNC inline half predux_mul(const half2& a) { return __hmul(__low2half(a), __high2half(a)); } -#endif } // end namespace internal diff --git a/Eigen/src/Core/arch/CUDA/TypeCasting.h b/Eigen/src/Core/arch/CUDA/TypeCasting.h index b2a9724de..396b38eaf 100644 --- a/Eigen/src/Core/arch/CUDA/TypeCasting.h +++ b/Eigen/src/Core/arch/CUDA/TypeCasting.h @@ -71,6 +71,7 @@ struct functor_traits > +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 template <> struct type_casting_traits { @@ -82,22 +83,9 @@ struct type_casting_traits { }; template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcast(const half2& a, const half2& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 float2 r1 = __half22float2(a); float2 r2 = __half22float2(b); return make_float4(r1.x, r1.y, r2.x, r2.y); -#else - half r1; - r1.x = a.x & 0xFFFF; - half r2; - r2.x = (a.x & 0xFFFF0000) >> 16; - half r3; - r3.x = b.x & 0xFFFF; - half r4; - r4.x = (b.x & 0xFFFF0000) >> 16; - return make_float4(static_cast(r1), static_cast(r2), - static_cast(r3), static_cast(r4)); -#endif } template <> @@ -111,19 +99,10 @@ struct type_casting_traits { template<> EIGEN_STRONG_INLINE half2 pcast(const float4& a) { // Simply discard the second half of the input -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 return __float22half2_rn(make_float2(a.x, a.y)); -#else - half r1 = static_cast(a.x); - half r2 = static_cast(a.y); - half2 r; - r.x = 0; - r.x |= r1.x; - r.x |= (static_cast(r2.x) << 16); - return r; -#endif } +#endif #endif } // end namespace internal diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index b1931d80a..c088df1c1 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -210,10 +210,7 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA) ei_add_test(cxx11_tensor_random_cuda) endif() - # Operations other that casting of half floats are only supported starting with arch 5.3 - if (${EIGEN_CUDA_COMPUTE_ARCH} GREATER 52) - ei_add_test(cxx11_tensor_of_float16_cuda) - endif() + ei_add_test(cxx11_tensor_of_float16_cuda) unset(EIGEN_ADD_TEST_FILENAME_EXTENSION) endif()