From db90c4939cdb652e67a609d26a48393ad540afce Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen <4643818-rmlarsen1@users.noreply.gitlab.com> Date: Wed, 26 Nov 2025 00:17:12 +0000 Subject: [PATCH] Add a ptanh_float implementation that is accurate to 1 ULP libeigen/eigen!2082 Co-authored-by: Rasmus Munk Larsen --- .../arch/Default/GenericPacketMathFunctions.h | 71 +++++++++++++++++-- .../Default/GenericPacketMathFunctionsFwd.h | 2 +- 2 files changed, 66 insertions(+), 7 deletions(-) diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h index 96d278386..487127b79 100644 --- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h @@ -4,6 +4,7 @@ // Copyright (C) 2007 Julien Pommier // Copyright (C) 2014 Pedro Gonnet (pedro.gonnet@gmail.com) // Copyright (C) 2009-2019 Gael Guennebaud +// Copyright (C) 2018-2025 Rasmus Munk Larsen // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed @@ -641,7 +642,7 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_expm1(const P // "m = floor(x/log(2)+1/2)" and "r" is the remainder. The result is then // "exp(x) = 2^m*exp(r)" where exp(r) is in the range [-1,1). // exp(r) is computed using a 6th order minimax polynomial approximation. -template +template EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexp_float(const Packet _x) { const Packet cst_zero = pset1(0.0f); const Packet cst_one = pset1(1.0f); @@ -656,10 +657,15 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexp_float(const Pack const Packet cst_p4 = pset1(4.166965186595916748046875e-2f); const Packet cst_p5 = pset1(8.36894474923610687255859375e-3f); const Packet cst_p6 = pset1(1.37449637986719608306884765625e-3f); - - // Clamp x. - Packet zero_mask = pcmp_lt(_x, cst_exp_lo); - Packet x = pmin(_x, cst_exp_hi); + Packet zero_mask; + Packet x; + if (!IsFinite) { + // Clamp x. + zero_mask = pcmp_lt(_x, cst_exp_lo); + x = pmin(_x, cst_exp_hi); + } else { + x = _x; + } // Express exp(x) as exp(m*ln(2) + r), start by extracting // m = floor(x/ln(2) + 0.5). @@ -682,7 +688,9 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexp_float(const Pack const Packet p_low = padd(r, cst_one); Packet y = pmadd(r, p_odd, p_even); y = pmadd(r2, y, p_low); - + if (IsFinite) { + return pldexp_fast(y, m); + } // Return 2^m * exp(r). const Packet fast_pldexp_unsafe = pcmp_lt(cst_pldexp_threshold, pabs(x)); if (!predux_any(fast_pldexp_unsafe)) { @@ -1292,6 +1300,8 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_atan(const Pa return pxor(result, x_signmask); } +#ifdef EIGEN_FAST_MATH + /** \internal \returns the hyperbolic tan of \a a (coeff-wise) Doesn't do anything fancy, just a 9/8-degree rational interpolant which is accurate up to a couple of ulps in the (approximate) range [-8, 8], @@ -1343,6 +1353,55 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS T ptanh_float(const T& a_x) return pdiv(p, q); } +#else + +/** \internal \returns the hyperbolic tan of \a a (coeff-wise). + On the domain [-1.25:1.25] we use an approximation of the form + tanh(x) ~= x^3 * (P(x) / Q(x)) + x, where P and Q are polynomials in x^2. + For |x| > 1.25, tanh is implememented as tanh(x) = 1 - (2 / (1 + exp(2*x))). + + This implementation has a maximum error of 1 ULP (measured with AVX2+FMA). + + This implementation works on both scalars and packets. +*/ +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS T ptanh_float(const T& x) { + // The polynomial coefficients were computed using Rminimax: + // % ./ratapprox --function="tanh(x)-x" --dom='[-1.25,1.25]' --num="[x^3,x^5]" --den="even" + // --type="[3,4]" --numF="[SG]" --denF="[SG]" --log --dispCoeff="dec" --output=tanhf.solly + constexpr float alpha[] = {-1.46725140511989593505859375e-02f, -3.333333432674407958984375e-01f}; + constexpr float beta[] = {1.570280082523822784423828125e-02, 4.4401752948760986328125e-01, 1.0f}; + const T x2 = pmul(x, x); + const T x3 = pmul(x2, x); + const T p = ppolevl::run(x2, alpha); + const T q = ppolevl::run(x2, beta); + const T small_tanh = pmadd(x3, pdiv(p, q), x); + + const T sign_mask = pset1(-0.0f); + const T abs_x = pandnot(x, sign_mask); + constexpr float kSmallThreshold = 1.25f; + const T large_mask = pcmp_lt(pset1(kSmallThreshold), abs_x); + // Fast exit if all elements are small. + if (!predux_any(large_mask)) { + return small_tanh; + } + + // Compute as 1 - (2 / (1 + exp(2*x))) + const T one = pset1(1.0f); + const T two = pset1(2.0f); + const T s = pexp_float(pmul(two, abs_x)); + const T abs_tanh = psub(one, pdiv(two, padd(s, one))); + + // Handle infinite inputs and set sign bit. + constexpr float kHugeThreshold = 16.0f; + const T huge_mask = pcmp_lt(pset1(kHugeThreshold), abs_x); + const T x_sign = pand(sign_mask, x); + const T large_tanh = por(x_sign, pselect(huge_mask, one, abs_tanh)); + return pselect(large_mask, large_tanh, small_tanh); +} + +#endif // EIGEN_FAST_MATH + /** \internal \returns the hyperbolic tan of \a a (coeff-wise) This uses a 19/18-degree rational interpolant which is accurate up to a couple of ulps in the (approximate) range [-18.7, 18.7], diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h index 44ccb745b..69a551757 100644 --- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h +++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h @@ -95,7 +95,7 @@ template EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_exp2(const Packet& x); /** \internal \returns exp(x) for single precision float */ -template +template EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexp_float(const Packet _x); /** \internal \returns exp(x) for double precision real numbers */