From 483aaad10a925b5b22ea87bcabe01712db4fe870 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 30 Mar 2016 17:08:13 -0700 Subject: [PATCH 01/14] Fixed compilation warning --- unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h index 543a444fb..f68ac1794 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h @@ -53,7 +53,9 @@ struct TensorUInt128 template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE explicit TensorUInt128(const T& x) : high(0), low(x) { - eigen_assert(x < NumTraits::highest()); + typedef typename conditional::type UnsignedT; + typedef typename conditional::type UnsignedLow; + eigen_assert(static_cast(x) < static_cast(NumTraits::highest())); eigen_assert(x >= 0); } From bc68fc2fe73adba1cf4d0b40d99d201c3f12bb64 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 30 Mar 2016 17:58:32 -0700 Subject: [PATCH 02/14] Enable constant expressions when compiling cuda code with clang. --- Eigen/src/Core/util/Macros.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index dbfc9bd37..97627d14c 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -370,8 +370,8 @@ // Does the compiler support const expressions? #ifdef __CUDACC__ -// Const expressions are supported provided that c++11 is enabled and we're using nvcc 7.5 or above -#if defined(__CUDACC_VER__) && __CUDACC_VER__ >= 70500 && __cplusplus > 199711L +// Const expressions are supported provided that c++11 is enabled and we're using either clang or nvcc 7.5 or above +#if __cplusplus > 199711L && defined(__CUDACC_VER__) && (defined(__clang__) || __CUDACC_VER__ >= 70500) #define EIGEN_HAS_CONSTEXPR 1 #endif #elif (defined(__cplusplus) && __cplusplus >= 201402L) || \ From 4f1a7e51c17586487c986a456e39af40b41bf4b4 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 30 Mar 2016 17:59:49 -0700 Subject: [PATCH 03/14] Pull math functions from the global namespace only when compiling cuda code with nvcc. When compiling with clang, we want to use the std namespace. --- Eigen/Core | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Eigen/Core b/Eigen/Core index 8428c51e4..24799f32b 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -42,7 +42,10 @@ #endif -#if defined(__CUDA_ARCH__) +// When compiling CUDA device code with NVCC, pull in math functions from the +// global namespace. In host mode, and when device doee with clang, use the +// std versions. +#if defined(__CUDA_ARCH__) && defined(__NVCC__) #define EIGEN_USING_STD_MATH(FUNC) using ::FUNC; #else #define EIGEN_USING_STD_MATH(FUNC) using std::FUNC; From 791e5cfb6990220b2cfdb7b6f793298a5153561b Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 30 Mar 2016 18:36:36 -0700 Subject: [PATCH 04/14] Added NumTraits for type2index. --- .../Eigen/CXX11/src/Tensor/TensorIndexList.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h index 01c31c13e..985594bc8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h @@ -45,6 +45,23 @@ struct type2index { } }; +template struct NumTraits > +{ + typedef DenseIndex Real; + enum { + IsComplex = 0, + RequireInitialization = false, + ReadCost = 1, + AddCost = 1, + MulCost = 1 + }; + + EIGEN_DEVICE_FUNC static inline Real epsilon() { return 0; } + EIGEN_DEVICE_FUNC static inline Real dummy_precision() { return 0; } + EIGEN_DEVICE_FUNC static inline Real highest() { return n; } + EIGEN_DEVICE_FUNC static inline Real lowest() { return n; } +}; + namespace internal { template EIGEN_DEVICE_FUNC void update_value(T& val, DenseIndex new_val) { From af4ef540bfeb381daaae86f91d492eed39f84e68 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 30 Mar 2016 18:37:19 -0700 Subject: [PATCH 05/14] Fixed a off-by-one bug in a debug assertion --- unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h index f68ac1794..3e56589c3 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h @@ -55,7 +55,7 @@ struct TensorUInt128 explicit TensorUInt128(const T& x) : high(0), low(x) { typedef typename conditional::type UnsignedT; typedef typename conditional::type UnsignedLow; - eigen_assert(static_cast(x) < static_cast(NumTraits::highest())); + eigen_assert(static_cast(x) <= static_cast(NumTraits::highest())); eigen_assert(x >= 0); } From 8c8a79cec1b7d03be30df0e70cea236b8f52ef64 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 31 Mar 2016 10:33:32 -0700 Subject: [PATCH 06/14] Fixed a typo --- Eigen/src/Core/arch/CUDA/Half.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h index 6c412159c..ace250c6f 100644 --- a/Eigen/src/Core/arch/CUDA/Half.h +++ b/Eigen/src/Core/arch/CUDA/Half.h @@ -348,7 +348,7 @@ namespace numext { static inline EIGEN_DEVICE_FUNC bool (isinf)(const Eigen::half& a) { return (a.x & 0x7fff) == 0x7c00; } -static inline EIGEN_HALF_CUDA_H bool (isnan)(const Eigen::half& a) { +static inline EIGEN_DEVICE_FUNC bool (isnan)(const Eigen::half& a) { #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 return __hisnan(a); #else From b575fb1d02f7a98c94a576284fbcd4ff85970120 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 31 Mar 2016 10:43:59 -0700 Subject: [PATCH 07/14] Added NumTraits for half floats --- Eigen/src/Core/arch/CUDA/Half.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h index ace250c6f..dc7119c06 100644 --- a/Eigen/src/Core/arch/CUDA/Half.h +++ b/Eigen/src/Core/arch/CUDA/Half.h @@ -341,6 +341,18 @@ template<> struct is_arithmetic { enum { value = true }; }; } // end namespace internal +template<> struct NumTraits + : GenericNumTraits +{ + EIGEN_DEVICE_FUNC static inline float dummy_precision() { return 1e-3f; } + EIGEN_DEVICE_FUNC static inline Eigen::half highest() { + return internal::raw_uint16_to_half(0x7bff); + } + EIGEN_DEVICE_FUNC static inline Eigen::half lowest() { + return internal::raw_uint16_to_half(0xfbff); + } +}; + // Infinity/NaN checks. namespace numext { From c36ab1990247a5b60bcad564759e8903f30fbab5 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 31 Mar 2016 10:55:03 -0700 Subject: [PATCH 08/14] Added __ldg primitive for fp16. --- Eigen/src/Core/arch/CUDA/Half.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h index dc7119c06..a2a2bac37 100644 --- a/Eigen/src/Core/arch/CUDA/Half.h +++ b/Eigen/src/Core/arch/CUDA/Half.h @@ -423,7 +423,14 @@ using ::ceil; __device__ inline Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) { return static_cast(__shfl_xor(static_cast(var), laneMask, width)); } +#endif +// ldg() has an overload for __half, but we also need one for Eigen::half. +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 320 +static inline EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr) { + return Eigen::internal::raw_uint16_to_half( + __ldg(reinterpret_cast(ptr))); +} #endif From 4c859181daa3807f54ee7ae8add6bac66e896ace Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 31 Mar 2016 12:48:38 -0700 Subject: [PATCH 09/14] Made it possible to use the NumTraits for complex and Array in a cuda kernel. --- Eigen/src/Core/NumTraits.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/NumTraits.h b/Eigen/src/Core/NumTraits.h index b7b5e7d22..e065fa714 100644 --- a/Eigen/src/Core/NumTraits.h +++ b/Eigen/src/Core/NumTraits.h @@ -153,7 +153,9 @@ template struct NumTraits > MulCost = 4 * NumTraits::MulCost + 2 * NumTraits::AddCost }; + EIGEN_DEVICE_FUNC static inline Real epsilon() { return NumTraits::epsilon(); } + EIGEN_DEVICE_FUNC static inline Real dummy_precision() { return NumTraits::dummy_precision(); } }; @@ -166,7 +168,7 @@ struct NumTraits > typedef typename NumTraits::NonInteger NonIntegerScalar; typedef Array NonInteger; typedef ArrayType & Nested; - + enum { IsComplex = NumTraits::IsComplex, IsInteger = NumTraits::IsInteger, @@ -176,8 +178,10 @@ struct NumTraits > AddCost = ArrayType::SizeAtCompileTime==Dynamic ? HugeCost : ArrayType::SizeAtCompileTime * NumTraits::AddCost, MulCost = ArrayType::SizeAtCompileTime==Dynamic ? HugeCost : ArrayType::SizeAtCompileTime * NumTraits::MulCost }; - + + EIGEN_DEVICE_FUNC static inline RealScalar epsilon() { return NumTraits::epsilon(); } + EIGEN_DEVICE_FUNC static inline RealScalar dummy_precision() { return NumTraits::dummy_precision(); } }; From 0f5cc504fe2e024c723943c55cf87eedfe12dd8f Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 31 Mar 2016 12:59:39 -0700 Subject: [PATCH 10/14] Properly gate the fft code --- unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h index 7086a426d..1918392d1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h @@ -10,8 +10,9 @@ #ifndef EIGEN_CXX11_TENSOR_TENSOR_FFT_H #define EIGEN_CXX11_TENSOR_TENSOR_FFT_H -// NVCC fails to compile this code -#if !defined(__CUDACC__) +// This code requires the ability to initialize arrays of constant +// values directly inside a class. +#ifdef EIGEN_HAS_CONSTEXPR namespace Eigen { @@ -638,7 +639,7 @@ struct TensorEvaluator, D } // end namespace Eigen -#endif // __CUDACC__ +#endif // EIGEN_HAS_CONSTEXPR #endif // EIGEN_CXX11_TENSOR_TENSOR_FFT_H From f197813f370c7977bdd6023c13e08dfaf1f9498d Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 31 Mar 2016 13:09:23 -0700 Subject: [PATCH 11/14] Added the ability to hash a fp16 --- Eigen/src/Core/arch/CUDA/Half.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h index a2a2bac37..44645522a 100644 --- a/Eigen/src/Core/arch/CUDA/Half.h +++ b/Eigen/src/Core/arch/CUDA/Half.h @@ -414,6 +414,13 @@ using ::log; using ::sqrt; using ::floor; using ::ceil; +template <> + +struct hash { + size_t operator()(const Eigen::half& a) const { + return std::hash()(a.x); + } +}; } // end namespace std From 92b7f7b6503f2fa66e1f346b88fb6bff434d4d1d Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 31 Mar 2016 13:09:58 -0700 Subject: [PATCH 12/14] Improved code formating --- Eigen/src/Core/arch/CUDA/Half.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h index 44645522a..70050358c 100644 --- a/Eigen/src/Core/arch/CUDA/Half.h +++ b/Eigen/src/Core/arch/CUDA/Half.h @@ -414,8 +414,8 @@ using ::log; using ::sqrt; using ::floor; using ::ceil; -template <> +template <> struct hash { size_t operator()(const Eigen::half& a) const { return std::hash()(a.x); From 0ea7ab4f623864c82163d106cc93c8a97e4baac6 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 31 Mar 2016 14:44:55 -0700 Subject: [PATCH 13/14] Hashing was only officially introduced in c++11. Therefore only define an implementation of the hash function for float16 if c++11 is enabled. --- Eigen/src/Core/arch/CUDA/Half.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h index 70050358c..212aa0d5d 100644 --- a/Eigen/src/Core/arch/CUDA/Half.h +++ b/Eigen/src/Core/arch/CUDA/Half.h @@ -415,12 +415,14 @@ using ::sqrt; using ::floor; using ::ceil; +#if __cplusplus > 199711L template <> struct hash { size_t operator()(const Eigen::half& a) const { return std::hash()(a.x); } }; +#endif } // end namespace std From 3da495e6b9a9e8def7914b53a8698a09b1998037 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 31 Mar 2016 18:11:51 -0700 Subject: [PATCH 14/14] Relaxed the condition used to gate the fft code. --- unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h index 1918392d1..d6db45ade 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h @@ -12,7 +12,7 @@ // This code requires the ability to initialize arrays of constant // values directly inside a class. -#ifdef EIGEN_HAS_CONSTEXPR +#if __cplusplus >= 201103L || EIGEN_COMP_MSVC >= 1900 namespace Eigen { @@ -565,7 +565,7 @@ struct TensorEvaluator, D // This will support a maximum FFT size of 2^32 for each dimension // m_sin_PI_div_n_LUT[i] = (-2) * std::sin(M_PI / std::pow(2,i)) ^ 2; - RealScalar m_sin_PI_div_n_LUT[32] = { + const RealScalar m_sin_PI_div_n_LUT[32] = { RealScalar(0.0), RealScalar(-2), RealScalar(-0.999999999999999), @@ -601,7 +601,7 @@ struct TensorEvaluator, D }; // m_minus_sin_2_PI_div_n_LUT[i] = -std::sin(2 * M_PI / std::pow(2,i)); - RealScalar m_minus_sin_2_PI_div_n_LUT[32] = { + const RealScalar m_minus_sin_2_PI_div_n_LUT[32] = { RealScalar(0.0), RealScalar(0.0), RealScalar(-1.00000000000000e+00),