From 483aaad10a925b5b22ea87bcabe01712db4fe870 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Wed, 30 Mar 2016 17:08:13 -0700
Subject: [PATCH 01/14] Fixed compilation warning

---
 unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
index 543a444fb..f68ac1794 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
@@ -53,7 +53,9 @@ struct TensorUInt128
   template<typename T>
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
   explicit TensorUInt128(const T& x) : high(0), low(x) {
-    eigen_assert(x < NumTraits<LOW>::highest());
+    typedef typename conditional<sizeof(T) == 8, uint64_t, uint32_t>::type UnsignedT;
+    typedef typename conditional<sizeof(LOW) == 8, uint64_t, uint32_t>::type UnsignedLow;
+    eigen_assert(static_cast<UnsignedT>(x) < static_cast<UnsignedLow>(NumTraits<LOW>::highest()));
     eigen_assert(x >= 0);
   }
 

From bc68fc2fe73adba1cf4d0b40d99d201c3f12bb64 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Wed, 30 Mar 2016 17:58:32 -0700
Subject: [PATCH 02/14] Enable constant expressions when compiling cuda code
 with clang.

---
 Eigen/src/Core/util/Macros.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h
index dbfc9bd37..97627d14c 100644
--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h
@@ -370,8 +370,8 @@
 
 // Does the compiler support const expressions?
 #ifdef __CUDACC__
-// Const expressions are supported provided that c++11 is enabled and we're using nvcc 7.5 or above
-#if defined(__CUDACC_VER__) &&  __CUDACC_VER__ >= 70500 && __cplusplus > 199711L
+// Const expressions are supported provided that c++11 is enabled and we're using either clang or nvcc 7.5 or above
+#if __cplusplus > 199711L && defined(__CUDACC_VER__) && (defined(__clang__) || __CUDACC_VER__ >= 70500)
   #define EIGEN_HAS_CONSTEXPR 1
 #endif
 #elif (defined(__cplusplus) && __cplusplus >= 201402L) || \

From 4f1a7e51c17586487c986a456e39af40b41bf4b4 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Wed, 30 Mar 2016 17:59:49 -0700
Subject: [PATCH 03/14] Pull math functions from the global namespace only when
 compiling cuda code with nvcc. When compiling with clang, we want to use the
 std namespace.

---
 Eigen/Core | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/Eigen/Core b/Eigen/Core
index 8428c51e4..24799f32b 100644
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -42,7 +42,10 @@
   
 #endif
 
-#if defined(__CUDA_ARCH__)
+// When compiling CUDA device code with NVCC, pull in math functions from the
+// global namespace.  In host mode, and when device doee with clang, use the
+// std versions.
+#if defined(__CUDA_ARCH__) && defined(__NVCC__)
   #define EIGEN_USING_STD_MATH(FUNC) using ::FUNC;
 #else
   #define EIGEN_USING_STD_MATH(FUNC) using std::FUNC;

From 791e5cfb6990220b2cfdb7b6f793298a5153561b Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Wed, 30 Mar 2016 18:36:36 -0700
Subject: [PATCH 04/14] Added NumTraits for type2index.

---
 .../Eigen/CXX11/src/Tensor/TensorIndexList.h    | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h
index 01c31c13e..985594bc8 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h
@@ -45,6 +45,23 @@ struct type2index {
   }
 };
 
+template<DenseIndex n> struct NumTraits<type2index<n> >
+{
+  typedef DenseIndex Real;
+  enum {
+    IsComplex = 0,
+    RequireInitialization = false,
+    ReadCost = 1,
+    AddCost = 1,
+    MulCost = 1
+  };
+
+  EIGEN_DEVICE_FUNC static inline Real epsilon() { return 0; }
+  EIGEN_DEVICE_FUNC static inline Real dummy_precision() { return 0; }
+  EIGEN_DEVICE_FUNC static inline Real highest() { return n; }
+  EIGEN_DEVICE_FUNC static inline Real lowest() { return n; }
+};
+
 namespace internal {
 template <typename T>
 EIGEN_DEVICE_FUNC void update_value(T& val, DenseIndex new_val) {

From af4ef540bfeb381daaae86f91d492eed39f84e68 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Wed, 30 Mar 2016 18:37:19 -0700
Subject: [PATCH 05/14] Fixed a off-by-one bug in a debug assertion

---
 unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
index f68ac1794..3e56589c3 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
@@ -55,7 +55,7 @@ struct TensorUInt128
   explicit TensorUInt128(const T& x) : high(0), low(x) {
     typedef typename conditional<sizeof(T) == 8, uint64_t, uint32_t>::type UnsignedT;
     typedef typename conditional<sizeof(LOW) == 8, uint64_t, uint32_t>::type UnsignedLow;
-    eigen_assert(static_cast<UnsignedT>(x) < static_cast<UnsignedLow>(NumTraits<LOW>::highest()));
+    eigen_assert(static_cast<UnsignedT>(x) <= static_cast<UnsignedLow>(NumTraits<LOW>::highest()));
     eigen_assert(x >= 0);
   }
 

From 8c8a79cec1b7d03be30df0e70cea236b8f52ef64 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Thu, 31 Mar 2016 10:33:32 -0700
Subject: [PATCH 06/14] Fixed a typo

---
 Eigen/src/Core/arch/CUDA/Half.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h
index 6c412159c..ace250c6f 100644
--- a/Eigen/src/Core/arch/CUDA/Half.h
+++ b/Eigen/src/Core/arch/CUDA/Half.h
@@ -348,7 +348,7 @@ namespace numext {
 static inline EIGEN_DEVICE_FUNC bool (isinf)(const Eigen::half& a) {
   return (a.x & 0x7fff) == 0x7c00;
 }
-static inline EIGEN_HALF_CUDA_H bool (isnan)(const Eigen::half& a) {
+static inline EIGEN_DEVICE_FUNC bool (isnan)(const Eigen::half& a) {
 #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hisnan(a);
 #else

From b575fb1d02f7a98c94a576284fbcd4ff85970120 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Thu, 31 Mar 2016 10:43:59 -0700
Subject: [PATCH 07/14] Added NumTraits for half floats

---
 Eigen/src/Core/arch/CUDA/Half.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h
index ace250c6f..dc7119c06 100644
--- a/Eigen/src/Core/arch/CUDA/Half.h
+++ b/Eigen/src/Core/arch/CUDA/Half.h
@@ -341,6 +341,18 @@ template<> struct is_arithmetic<half> { enum { value = true }; };
 
 } // end namespace internal
 
+template<> struct NumTraits<Eigen::half>
+    : GenericNumTraits<Eigen::half>
+{
+  EIGEN_DEVICE_FUNC static inline float dummy_precision() { return 1e-3f; }
+  EIGEN_DEVICE_FUNC static inline Eigen::half highest() {
+    return internal::raw_uint16_to_half(0x7bff);
+  }
+  EIGEN_DEVICE_FUNC static inline Eigen::half lowest() {
+    return internal::raw_uint16_to_half(0xfbff);
+  }
+};
+
 // Infinity/NaN checks.
 
 namespace numext {

From c36ab1990247a5b60bcad564759e8903f30fbab5 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Thu, 31 Mar 2016 10:55:03 -0700
Subject: [PATCH 08/14] Added __ldg primitive for fp16.

---
 Eigen/src/Core/arch/CUDA/Half.h | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h
index dc7119c06..a2a2bac37 100644
--- a/Eigen/src/Core/arch/CUDA/Half.h
+++ b/Eigen/src/Core/arch/CUDA/Half.h
@@ -423,7 +423,14 @@ using ::ceil;
 __device__ inline Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) {
   return static_cast<Eigen::half>(__shfl_xor(static_cast<float>(var), laneMask, width));
 }
+#endif
 
+// ldg() has an overload for __half, but we also need one for Eigen::half.
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 320
+static inline EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr) {
+  return Eigen::internal::raw_uint16_to_half(
+      __ldg(reinterpret_cast<const unsigned short*>(ptr)));
+}
 #endif
 
 

From 4c859181daa3807f54ee7ae8add6bac66e896ace Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Thu, 31 Mar 2016 12:48:38 -0700
Subject: [PATCH 09/14] Made it possible to use the NumTraits for complex and
 Array in a cuda kernel.

---
 Eigen/src/Core/NumTraits.h | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/Eigen/src/Core/NumTraits.h b/Eigen/src/Core/NumTraits.h
index b7b5e7d22..e065fa714 100644
--- a/Eigen/src/Core/NumTraits.h
+++ b/Eigen/src/Core/NumTraits.h
@@ -153,7 +153,9 @@ template<typename _Real> struct NumTraits<std::complex<_Real> >
     MulCost = 4 * NumTraits<Real>::MulCost + 2 * NumTraits<Real>::AddCost
   };
 
+  EIGEN_DEVICE_FUNC
   static inline Real epsilon() { return NumTraits<Real>::epsilon(); }
+  EIGEN_DEVICE_FUNC
   static inline Real dummy_precision() { return NumTraits<Real>::dummy_precision(); }
 };
 
@@ -166,7 +168,7 @@ struct NumTraits<Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols> >
   typedef typename NumTraits<Scalar>::NonInteger NonIntegerScalar;
   typedef Array<NonIntegerScalar, Rows, Cols, Options, MaxRows, MaxCols> NonInteger;
   typedef ArrayType & Nested;
-  
+
   enum {
     IsComplex = NumTraits<Scalar>::IsComplex,
     IsInteger = NumTraits<Scalar>::IsInteger,
@@ -176,8 +178,10 @@ struct NumTraits<Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols> >
     AddCost  = ArrayType::SizeAtCompileTime==Dynamic ? HugeCost : ArrayType::SizeAtCompileTime * NumTraits<Scalar>::AddCost,
     MulCost  = ArrayType::SizeAtCompileTime==Dynamic ? HugeCost : ArrayType::SizeAtCompileTime * NumTraits<Scalar>::MulCost
   };
-  
+
+  EIGEN_DEVICE_FUNC
   static inline RealScalar epsilon() { return NumTraits<RealScalar>::epsilon(); }
+  EIGEN_DEVICE_FUNC
   static inline RealScalar dummy_precision() { return NumTraits<RealScalar>::dummy_precision(); }
 };
 

From 0f5cc504fe2e024c723943c55cf87eedfe12dd8f Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Thu, 31 Mar 2016 12:59:39 -0700
Subject: [PATCH 10/14] Properly gate the fft code

---
 unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
index 7086a426d..1918392d1 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
@@ -10,8 +10,9 @@
 #ifndef EIGEN_CXX11_TENSOR_TENSOR_FFT_H
 #define EIGEN_CXX11_TENSOR_TENSOR_FFT_H
 
-// NVCC fails to compile this code
-#if !defined(__CUDACC__)
+// This code requires the ability to initialize arrays of constant
+// values directly inside a class.
+#ifdef EIGEN_HAS_CONSTEXPR
 
 namespace Eigen {
 
@@ -638,7 +639,7 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
 
 }  // end namespace Eigen
 
-#endif  // __CUDACC__
+#endif  // EIGEN_HAS_CONSTEXPR
 
 
 #endif  // EIGEN_CXX11_TENSOR_TENSOR_FFT_H

From f197813f370c7977bdd6023c13e08dfaf1f9498d Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Thu, 31 Mar 2016 13:09:23 -0700
Subject: [PATCH 11/14] Added the ability to hash a fp16

---
 Eigen/src/Core/arch/CUDA/Half.h | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h
index a2a2bac37..44645522a 100644
--- a/Eigen/src/Core/arch/CUDA/Half.h
+++ b/Eigen/src/Core/arch/CUDA/Half.h
@@ -414,6 +414,13 @@ using ::log;
 using ::sqrt;
 using ::floor;
 using ::ceil;
+template <>
+
+struct hash<Eigen::half> {
+  size_t operator()(const Eigen::half& a) const {
+    return std::hash<unsigned short>()(a.x);
+  }
+};
 
 } // end namespace std
 

From 92b7f7b6503f2fa66e1f346b88fb6bff434d4d1d Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Thu, 31 Mar 2016 13:09:58 -0700
Subject: [PATCH 12/14] Improved code formating

---
 Eigen/src/Core/arch/CUDA/Half.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h
index 44645522a..70050358c 100644
--- a/Eigen/src/Core/arch/CUDA/Half.h
+++ b/Eigen/src/Core/arch/CUDA/Half.h
@@ -414,8 +414,8 @@ using ::log;
 using ::sqrt;
 using ::floor;
 using ::ceil;
-template <>
 
+template <>
 struct hash<Eigen::half> {
   size_t operator()(const Eigen::half& a) const {
     return std::hash<unsigned short>()(a.x);

From 0ea7ab4f623864c82163d106cc93c8a97e4baac6 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Thu, 31 Mar 2016 14:44:55 -0700
Subject: [PATCH 13/14] Hashing was only officially introduced in c++11.
 Therefore only define an implementation of the hash function for float16 if
 c++11 is enabled.

---
 Eigen/src/Core/arch/CUDA/Half.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h
index 70050358c..212aa0d5d 100644
--- a/Eigen/src/Core/arch/CUDA/Half.h
+++ b/Eigen/src/Core/arch/CUDA/Half.h
@@ -415,12 +415,14 @@ using ::sqrt;
 using ::floor;
 using ::ceil;
 
+#if __cplusplus > 199711L
 template <>
 struct hash<Eigen::half> {
   size_t operator()(const Eigen::half& a) const {
     return std::hash<unsigned short>()(a.x);
   }
 };
+#endif
 
 } // end namespace std
 

From 3da495e6b9a9e8def7914b53a8698a09b1998037 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Thu, 31 Mar 2016 18:11:51 -0700
Subject: [PATCH 14/14] Relaxed the condition used to gate the fft code.

---
 unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
index 1918392d1..d6db45ade 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
@@ -12,7 +12,7 @@
 
 // This code requires the ability to initialize arrays of constant
 // values directly inside a class.
-#ifdef EIGEN_HAS_CONSTEXPR
+#if __cplusplus >= 201103L || EIGEN_COMP_MSVC >= 1900
 
 namespace Eigen {
 
@@ -565,7 +565,7 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
 
   // This will support a maximum FFT size of 2^32 for each dimension
   // m_sin_PI_div_n_LUT[i] = (-2) * std::sin(M_PI / std::pow(2,i)) ^ 2;
-  RealScalar m_sin_PI_div_n_LUT[32] = {
+  const RealScalar m_sin_PI_div_n_LUT[32] = {
     RealScalar(0.0),
     RealScalar(-2),
     RealScalar(-0.999999999999999),
@@ -601,7 +601,7 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
   };
 
   // m_minus_sin_2_PI_div_n_LUT[i] = -std::sin(2 * M_PI / std::pow(2,i));
-  RealScalar m_minus_sin_2_PI_div_n_LUT[32] = {
+  const RealScalar m_minus_sin_2_PI_div_n_LUT[32] = {
     RealScalar(0.0),
     RealScalar(0.0),
     RealScalar(-1.00000000000000e+00),