Merged eigen/eigen into default

2026-04-10 11:34:33 +08:00 · 2019-01-09 15:04:17 -08:00
parent 055f0b73db d812f411c3
commit cb3c059fa4
14 changed files with 409 additions and 123 deletions
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@@ -406,18 +406,39 @@ typename conditional<(unpacket_traits<Packet>::size%8)==0,typename unpacket_trai
 predux_half_dowto4(const Packet& a)
 { return a; }

-/** \internal \returns the product of the elements of \a a*/
+/** \internal \returns the product of the elements of \a a */
 template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_mul(const Packet& a)
 { return a; }

-/** \internal \returns the min of the elements of \a a*/
+/** \internal \returns the min of the elements of \a a */
 template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_min(const Packet& a)
 { return a; }

-/** \internal \returns the max of the elements of \a a*/
+/** \internal \returns the max of the elements of \a a */
 template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_max(const Packet& a)
 { return a; }

+/** \internal \returns true if all coeffs of \a a means "true"
+  * It is supposed to be called on values returned by pcmp_*.
+  */
+// not needed yet
+// template<typename Packet> EIGEN_DEVICE_FUNC inline bool predux_all(const Packet& a)
+// { return bool(a); }
+
+/** \internal \returns true if any coeffs of \a a means "true"
+  * It is supposed to be called on values returned by pcmp_*.
+  */
+template<typename Packet> EIGEN_DEVICE_FUNC inline bool predux_any(const Packet& a)
+{
+  // Dirty but generic implementation where "true" is assumed to be non 0 and all the sames.
+  // It is expected that "true" is either:
+  //  - Scalar(1)
+  //  - bits full of ones (NaN for floats),
+  //  - or first bit equals to 1 (1 for ints, smallest denormal for floats).
+  // For all these cases, taking the sum is just fine, and this boils down to a no-op for scalars.
+  return bool(predux(a));
+}
+
 /** \internal \returns the reversed elements of \a a*/
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet preverse(const Packet& a)
 { return a; }
--- a/Eigen/src/Core/arch/AVX/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX/PacketMath.h
@@ -576,6 +576,16 @@ template<> EIGEN_STRONG_INLINE double predux_max<Packet4d>(const Packet4d& a)
  return pfirst(_mm256_max_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1)));
 }

+// not needed yet
+// template<> EIGEN_STRONG_INLINE bool predux_all(const Packet8f& x)
+// {
+//   return _mm256_movemask_ps(x)==0xFF;
+// }
+
+template<> EIGEN_STRONG_INLINE bool predux_any(const Packet8f& x)
+{
+  return _mm256_movemask_ps(x)!=0;
+}

 template<int Offset>
 struct palign_impl<Offset,Packet8f>
--- a/Eigen/src/Core/arch/AVX512/MathFunctions.h
+++ b/Eigen/src/Core/arch/AVX512/MathFunctions.h
@@ -47,6 +47,7 @@ plog<Packet16f>(const Packet16f& _x) {
  // The smallest non denormalized float number.
  _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(min_norm_pos, 0x00800000);
  _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(minus_inf, 0xff800000);
+  _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(pos_inf, 0x7f800000);
  _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(nan, 0x7fc00000);

  // Polynomial coefficients.
@@ -116,10 +117,16 @@ plog<Packet16f>(const Packet16f& _x) {
  x = padd(x, y);
  x = padd(x, y2);

-  // Filter out invalid inputs, i.e. negative arg will be NAN, 0 will be -INF.
+  __mmask16 pos_inf_mask = _mm512_cmp_ps_mask(_x,p16f_pos_inf,_CMP_EQ_OQ);
+  // Filter out invalid inputs, i.e.:
+  //  - negative arg will be NAN,
+  //  - 0 will be -INF.
+  //  - +INF will be +INF
  return _mm512_mask_blend_ps(iszero_mask,
-                              _mm512_mask_blend_ps(invalid_mask, x, p16f_nan),
-                              p16f_minus_inf);
+            _mm512_mask_blend_ps(invalid_mask,
+              _mm512_mask_blend_ps(pos_inf_mask,x,p16f_pos_inf),
+              p16f_nan),
+            p16f_minus_inf);
 }
 #endif

--- a/Eigen/src/Core/arch/AVX512/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX512/PacketMath.h
@@ -283,9 +283,27 @@ EIGEN_STRONG_INLINE Packet16f cat256(Packet8f a, Packet8f b) {
 }
 #endif

+template<> EIGEN_STRONG_INLINE Packet16f pcmp_le(const Packet16f& a, const Packet16f& b) {
+  __m256 lo = pcmp_le(extract256<0>(a), extract256<0>(b));
+  __m256 hi = pcmp_le(extract256<1>(a), extract256<1>(b));
+  return cat256(lo, hi);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16f pcmp_lt(const Packet16f& a, const Packet16f& b) {
+  __m256 lo = pcmp_lt(extract256<0>(a), extract256<0>(b));
+  __m256 hi = pcmp_lt(extract256<1>(a), extract256<1>(b));
+  return cat256(lo, hi);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16f pcmp_eq(const Packet16f& a, const Packet16f& b) {
+  __m256 lo = pcmp_eq(extract256<0>(a), extract256<0>(b));
+  __m256 hi = pcmp_eq(extract256<1>(a), extract256<1>(b));
+  return cat256(lo, hi);
+}
+
 template<> EIGEN_STRONG_INLINE Packet16f pcmp_lt_or_nan(const Packet16f& a, const Packet16f& b) {
-  __m256 lo = _mm256_cmp_ps(extract256<0>(a), extract256<0>(b), _CMP_NGE_UQ);
-  __m256 hi = _mm256_cmp_ps(extract256<1>(a), extract256<1>(b), _CMP_NGE_UQ);
+  __m256 lo = pcmp_lt_or_nan(extract256<0>(a), extract256<0>(b));
+  __m256 hi = pcmp_lt_or_nan(extract256<1>(a), extract256<1>(b));
  return cat256(lo, hi);
 }

@@ -964,6 +982,13 @@ EIGEN_STRONG_INLINE double predux_max<Packet8d>(const Packet8d& a) {
  return pfirst(_mm256_max_pd(res, _mm256_shuffle_pd(res, res, 1)));
 }

+template<> EIGEN_STRONG_INLINE bool predux_any(const Packet16f& x)
+{
+  Packet16i xi = _mm512_castps_si512(x);
+  __mmask16 tmp = _mm512_test_epi32_mask(xi,xi);
+  return !_mm512_kortestz(tmp,tmp);
+}
+
 template <int Offset>
 struct palign_impl<Offset, Packet16f> {
  static EIGEN_STRONG_INLINE void run(Packet16f& first,
--- a/Eigen/src/Core/arch/AltiVec/PacketMath.h
+++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h
@@ -720,6 +720,11 @@ template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
  return pfirst(res);
 }

+template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x)
+{
+  return vec_any_ne(x, pzero(x));
+}
+
 template<int Offset>
 struct palign_impl<Offset,Packet4f>
 {
--- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
+++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
@@ -269,88 +269,121 @@ EIGEN_UNUSED
 Packet psincos_float(const Packet& _x)
 {
  typedef typename unpacket_traits<Packet>::integer_packet PacketI;
-  const Packet cst_1          = pset1<Packet>(1.0f);
-  const Packet cst_half       = pset1<Packet>(0.5f);

-  const PacketI csti_1        = pset1<PacketI>(1);
-  const PacketI csti_not1     = pset1<PacketI>(~1);
-  const PacketI csti_2        = pset1<PacketI>(2);
-  const PacketI csti_3        = pset1<PacketI>(3);
-
-  const Packet cst_sign_mask  = pset1frombits<Packet>(0x80000000u);
-
-  const Packet cst_minus_cephes_DP1 = pset1<Packet>(-0.78515625f);
-  const Packet cst_minus_cephes_DP2 = pset1<Packet>(-2.4187564849853515625e-4f);
-  const Packet cst_minus_cephes_DP3 = pset1<Packet>(-3.77489497744594108e-8f);
-  const Packet cst_sincof_p0        = pset1<Packet>(-1.9515295891E-4f);
-  const Packet cst_sincof_p1        = pset1<Packet>( 8.3321608736E-3f);
-  const Packet cst_sincof_p2        = pset1<Packet>(-1.6666654611E-1f);
-  const Packet cst_coscof_p0        = pset1<Packet>( 2.443315711809948E-005f);
-  const Packet cst_coscof_p1        = pset1<Packet>(-1.388731625493765E-003f);
-  const Packet cst_coscof_p2        = pset1<Packet>( 4.166664568298827E-002f);
-  const Packet cst_cephes_FOPI      = pset1<Packet>( 1.27323954473516f); // 4 / M_PI
+  const Packet  cst_2oPI            = pset1<Packet>(0.636619746685028076171875f); // 2/PI
+  const Packet  cst_rounding_magic  = pset1<Packet>(12582912); // 2^23 for rounding
+  const PacketI csti_1              = pset1<PacketI>(1);
+  const Packet  cst_sign_mask       = pset1frombits<Packet>(0x80000000u);

  Packet x = pabs(_x);

-  // Scale x by 4/Pi to find x's octant.
-  Packet y = pmul(x, cst_cephes_FOPI);
+  // Scale x by 2/Pi to find x's octant.
+  Packet y = pmul(x, cst_2oPI);

-  // Get the octant. We'll reduce x by this number of octants or by one more than it.
-  PacketI y_int = pcast<Packet,PacketI>(y);
-  // x's from even-numbered octants will translate to octant 0: [0, +Pi/4].
-  // x's from odd-numbered octants will translate to octant -1: [-Pi/4, 0].
-  // Adjustment for odd-numbered octants: octant = (octant + 1) & (~1).
-  PacketI y_int1 = pand(padd(y_int, csti_1), csti_not1); // could be pbitclear<0>(...)
-  y = pcast<PacketI,Packet>(y_int1);
+  // Rounding trick:
+  Packet y_round = padd(y, cst_rounding_magic);
+  PacketI y_int = preinterpret<PacketI>(y_round); // last 23 digits represent integer (if abs(x)<2^24)
+  y = psub(y_round, cst_rounding_magic); // nearest integer to x*4/pi

  // Compute the sign to apply to the polynomial.
-  // sign = third_bit(y_int1) xor signbit(_x)
-  Packet sign_bit = ComputeSine ? pxor(_x, preinterpret<Packet>(pshiftleft<29>(y_int1)))
-                                : preinterpret<Packet>(pshiftleft<29>(padd(y_int1,csti_3)));
+  // sin: sign = second_bit(y_int) xor signbit(_x)
+  // cos: sign = second_bit(y_int+1)
+  Packet sign_bit = ComputeSine ? pxor(_x, preinterpret<Packet>(pshiftleft<30>(y_int)))
+                                : preinterpret<Packet>(pshiftleft<30>(padd(y_int,csti_1)));
  sign_bit = pand(sign_bit, cst_sign_mask); // clear all but left most bit

-  // Get the polynomial selection mask from the second bit of y_int1
+  // Get the polynomial selection mask from the second bit of y_int
  // We'll calculate both (sin and cos) polynomials and then select from the two.
-  Packet poly_mask = preinterpret<Packet>(pcmp_eq(pand(y_int1, csti_2), pzero(y_int1)));
+  Packet poly_mask = preinterpret<Packet>(pcmp_eq(pand(y_int, csti_1), pzero(y_int)));

-  // Reduce x by y octants to get: -Pi/4 <= x <= +Pi/4.
-  // The magic pass: "Extended precision modular arithmetic"
-  // x = ((x - y * DP1) - y * DP2) - y * DP3
-  x = pmadd(y, cst_minus_cephes_DP1, x);
-  x = pmadd(y, cst_minus_cephes_DP2, x);
-  x = pmadd(y, cst_minus_cephes_DP3, x);
+  // Reduce x by y octants to get: -Pi/4 <= x <= +Pi/4
+  // using "Extended precision modular arithmetic"
+  #if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD)
+  // This version requires true FMA for high accuracy
+  // It provides a max error of 1ULP up to (with absolute_error < 5.9605e-08):
+  const float huge_th = ComputeSine ? 117435.992f : 71476.0625f;
+  x = pmadd(y, pset1<Packet>(-1.57079601287841796875f), x);
+  x = pmadd(y, pset1<Packet>(-3.1391647326017846353352069854736328125e-07f), x);
+  x = pmadd(y, pset1<Packet>(-5.390302529957764765544681040410068817436695098876953125e-15f), x);
+  #else
+  // Without true FMA, the previous set of coefficients maintain 1ULP accuracy
+  // up to x<15.7 (for sin), but accuracy is immediately lost for x>15.7.
+  // We thus use one more iteration to maintain 2ULPs up to reasonably large inputs.
+
+  // The following set of coefficients maintain 1ULP up to 9.43 and 14.16 for sin and cos respectively.
+  // and 2 ULP up to:
+  const float huge_th = ComputeSine ? 25966.f : 18838.f;
+  x = pmadd(y, pset1<Packet>(-1.5703125), x); // = 0xbfc90000
+  x = pmadd(y, pset1<Packet>(-0.000483989715576171875), x); // = 0xb9fdc000
+  x = pmadd(y, pset1<Packet>(1.62865035235881805419921875e-07), x); // = 0x342ee000
+  x = pmadd(y, pset1<Packet>(5.5644315544167710640977020375430583953857421875e-11), x); // = 0x2e74b9ee
+
+  // For the record, the following set of coefficients maintain 2ULP up
+  // to a slightly larger range:
+  // const float huge_th = ComputeSine ? 51981.f : 39086.125f;
+  // but it slightly fails to maintain 1ULP for two values of sin below pi.
+  // x = pmadd(y, pset1<Packet>(-3.140625/2.), x);
+  // x = pmadd(y, pset1<Packet>(-0.00048351287841796875), x);
+  // x = pmadd(y, pset1<Packet>(-3.13855707645416259765625e-07), x);
+  // x = pmadd(y, pset1<Packet>(-6.0771006282767103812147979624569416046142578125e-11), x);
+
+  // For the record, with only 3 iterations it is possible to maintain
+  // 1 ULP up to 3PI (maybe more) and 2ULP up to 255.
+  // The coefficients are: 0xbfc90f80, 0xb7354480, 0x2e74b9ee
+  #endif
+
+  // We use huge_vals as a temporary for abs(_x) to ensure huge_vals
+  // is fully initialized for the last pselect(). (prevent compiler warning)
+  Packet huge_vals = pabs(_x);
+  Packet huge_mask = pcmp_le(pset1<Packet>(huge_th),huge_vals);
+  
+  if(predux_any(huge_mask))
+  {
+    const int PacketSize = unpacket_traits<Packet>::size;
+    #if EIGEN_HAS_CXX11
+    alignas(Packet) float vals[PacketSize];
+    #else
+    EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) float vals[PacketSize];
+    #endif
+    pstoreu(vals, _x);
+    for(int k=0; k<PacketSize;++k) {
+      float val = vals[k];
+      if(numext::abs(val)>=huge_th)  {
+        vals[k] = ComputeSine ? std::sin(val) : std::cos(val);
+      }
+    }
+    huge_vals = ploadu<Packet>(vals);
+  }

  Packet x2 = pmul(x,x);

-  // Evaluate the cos(x) polynomial. (0 <= x <= Pi/4)
-  Packet y1 = cst_coscof_p0;
-  y1 = pmadd(y1, x2, cst_coscof_p1);
-  y1 = pmadd(y1, x2, cst_coscof_p2);
-  y1 = pmul(y1, x2);
-  y1 = pmul(y1, x2);
-  y1 = psub(y1, pmul(x2, cst_half));
-  y1 = padd(y1, cst_1);
+  // Evaluate the cos(x) polynomial. (-Pi/4 <= x <= Pi/4)
+  Packet y1 =        pset1<Packet>(2.4372266125283204019069671630859375e-05f);
+  y1 = pmadd(y1, x2, pset1<Packet>(-0.00138865201734006404876708984375f     ));
+  y1 = pmadd(y1, x2, pset1<Packet>(0.041666619479656219482421875f           ));
+  y1 = pmadd(y1, x2, pset1<Packet>(-0.5f));
+  y1 = pmadd(y1, x2, pset1<Packet>(1.f));

-  // Evaluate the sin(x) polynomial. (Pi/4 <= x <= 0) 
-  Packet y2 = cst_sincof_p0;
-  y2 = pmadd(y2, x2, cst_sincof_p1);
-  y2 = pmadd(y2, x2, cst_sincof_p2);
+  // Evaluate the sin(x) polynomial. (Pi/4 <= x <= Pi/4)
+  // octave/matlab code to compute those coefficients:
+  //    x = (0:0.0001:pi/4)';
+  //    A = [x.^3 x.^5 x.^7];
+  //    w = ((1.-(x/(pi/4)).^2).^5)*2000+1;         # weights trading relative accuracy
+  //    c = (A'*diag(w)*A)\(A'*diag(w)*(sin(x)-x)); # weighted LS, linear coeff forced to 1
+  //    printf('%.64f\n %.64f\n%.64f\n', c(3), c(2), c(1))
+  //
+  Packet y2 =        pset1<Packet>(-0.0001959234114083702898469196984621021329076029360294342041015625f);
+  y2 = pmadd(y2, x2, pset1<Packet>( 0.0083326873655616851693794799871284340042620897293090820312500000f));
+  y2 = pmadd(y2, x2, pset1<Packet>(-0.1666666203982298255503735617821803316473960876464843750000000000f));
  y2 = pmul(y2, x2);
  y2 = pmadd(y2, x, x);

-  // Select the correct result from the two polynoms.
+  // Select the correct result from the two polynomials.
  y = ComputeSine ? pselect(poly_mask,y2,y1)
                  : pselect(poly_mask,y1,y2);

-  // For very large arguments the the reduction to the [-Pi/4,+Pi/4] range
-  // does not work thus leading to sine/cosine out of the [-1:1] range.
-  // Since computing the sine/cosine for very large entry entries makes little
-  // sense in term of accuracy, we simply clamp to [-1,1]:
-  y = pmin(y,pset1<Packet>( 1.f));
-  y = pmax(y,pset1<Packet>(-1.f));
-
-  // Update the sign
-  return pxor(y, sign_bit);
+  // Update the sign and filter huge inputs
+  return pselect(huge_mask, huge_vals, pxor(y, sign_bit));
 }

 template<typename Packet>
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -551,6 +551,13 @@ template<> EIGEN_STRONG_INLINE int32_t predux_max<Packet4i>(const Packet4i& a)
  return vget_lane_s32(max, 0);
 }

+template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x)
+{
+  uint32x2_t tmp = vorr_u32(vget_low_u32( vreinterpretq_u32_f32(x)),
+                            vget_high_u32(vreinterpretq_u32_f32(x)));
+  return vget_lane_u32(vpmax_u32(tmp,tmp),0);
+}
+
 // this PALIGN_NEON business is to work around a bug in LLVM Clang 3.0 causing incorrect compilation errors,
 // see bug 347 and this LLVM bug: http://llvm.org/bugs/show_bug.cgi?id=11074
 #define PALIGN_NEON(Offset,Type,Command) \
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -813,6 +813,17 @@ template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
 #endif // EIGEN_VECTORIZE_SSE4_1
 }

+// not needed yet
+// template<> EIGEN_STRONG_INLINE bool predux_all(const Packet4f& x)
+// {
+//   return _mm_movemask_ps(x) == 0xF;
+// }
+
+template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x)
+{
+  return _mm_movemask_ps(x) != 0x0;
+}
+
 #if EIGEN_COMP_GNUC
 // template <> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f&  a, const Packet4f&  b, const Packet4f&  c)
 // {
--- a/Eigen/src/Core/util/StaticAssert.h
+++ b/Eigen/src/Core/util/StaticAssert.h
@@ -104,7 +104,8 @@
        STORAGE_INDEX_MUST_MATCH=1,
        CHOLMOD_SUPPORTS_DOUBLE_PRECISION_ONLY=1,
        SELFADJOINTVIEW_ACCEPTS_UPPER_AND_LOWER_MODE_ONLY=1,
-        INVALID_TEMPLATE_PARAMETER=1
+        INVALID_TEMPLATE_PARAMETER=1,
+        GPU_TENSOR_CONTRACTION_DOES_NOT_SUPPORT_OUTPUT_KERNELS=1
      };
    };