diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathDoubleWord.h b/Eigen/src/Core/arch/Default/GenericPacketMathDoubleWord.h new file mode 100644 index 000000000..5cb677fea --- /dev/null +++ b/Eigen/src/Core/arch/Default/GenericPacketMathDoubleWord.h @@ -0,0 +1,208 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2018-2025 Rasmus Munk Larsen +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_ARCH_GENERIC_PACKET_MATH_DOUBLE_WORD_H +#define EIGEN_ARCH_GENERIC_PACKET_MATH_DOUBLE_WORD_H + +// IWYU pragma: private +#include "../../InternalHeaderCheck.h" + +namespace Eigen { +namespace internal { + +// This function splits x into the nearest integer n and fractional part r, +// such that x = n + r holds exactly. +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void absolute_split(const Packet& x, Packet& n, Packet& r) { + n = pround(x); + r = psub(x, n); +} + +// This function computes the sum {s, r}, such that x + y = s_hi + s_lo +// holds exactly, and s_hi = fl(x+y), if |x| >= |y|. +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void fast_twosum(const Packet& x, const Packet& y, Packet& s_hi, Packet& s_lo) { + s_hi = padd(x, y); + const Packet t = psub(s_hi, x); + s_lo = psub(y, t); +} + +#ifdef EIGEN_VECTORIZE_FMA +// This function implements the extended precision product of +// a pair of floating point numbers. Given {x, y}, it computes the pair +// {p_hi, p_lo} such that x * y = p_hi + p_lo holds exactly and +// p_hi = fl(x * y). +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void twoprod(const Packet& x, const Packet& y, Packet& p_hi, Packet& p_lo) { + p_hi = pmul(x, y); + p_lo = pmsub(x, y, p_hi); +} + +// A version of twoprod that takes x, y, and fl(x*y) as input and returns the p_lo such that +// x * y = xy + p_lo holds exactly. +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet twoprod_low(const Packet& x, const Packet& y, const Packet& xy) { + return pmsub(x, y, xy); +} + +#else + +// This function implements the Veltkamp splitting. Given a floating point +// number x it returns the pair {x_hi, x_lo} such that x_hi + x_lo = x holds +// exactly and that half of the significant of x fits in x_hi. +// This is Algorithm 3 from Jean-Michel Muller, "Elementary Functions", +// 3rd edition, Birkh\"auser, 2016. +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void veltkamp_splitting(const Packet& x, Packet& x_hi, Packet& x_lo) { + typedef typename unpacket_traits::type Scalar; + constexpr int shift = (NumTraits::digits() + 1) / 2; + const Scalar shift_scale = Scalar(uint64_t(1) << shift); // Scalar constructor not necessarily constexpr. + const Packet gamma = pmul(pset1(shift_scale + Scalar(1)), x); + Packet rho = psub(x, gamma); + x_hi = padd(rho, gamma); + x_lo = psub(x, x_hi); +} + +// This function implements Dekker's algorithm for products x * y. +// Given floating point numbers {x, y} computes the pair +// {p_hi, p_lo} such that x * y = p_hi + p_lo holds exactly and +// p_hi = fl(x * y). +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void twoprod(const Packet& x, const Packet& y, Packet& p_hi, Packet& p_lo) { + Packet x_hi, x_lo, y_hi, y_lo; + veltkamp_splitting(x, x_hi, x_lo); + veltkamp_splitting(y, y_hi, y_lo); + + p_hi = pmul(x, y); + p_lo = pmadd(x_hi, y_hi, pnegate(p_hi)); + p_lo = pmadd(x_hi, y_lo, p_lo); + p_lo = pmadd(x_lo, y_hi, p_lo); + p_lo = pmadd(x_lo, y_lo, p_lo); +} + +// A version of twoprod that takes x, y, and fl(x*y) as input and returns the p_lo such that +// x * y = xy + p_lo holds exactly. +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet twoprod_low(const Packet& x, const Packet& y, const Packet& xy) { + Packet x_hi, x_lo, y_hi, y_lo; + veltkamp_splitting(x, x_hi, x_lo); + veltkamp_splitting(y, y_hi, y_lo); + + Packet p_lo = pmadd(x_hi, y_hi, pnegate(xy)); + p_lo = pmadd(x_hi, y_lo, p_lo); + p_lo = pmadd(x_lo, y_hi, p_lo); + p_lo = pmadd(x_lo, y_lo, p_lo); + return p_lo; +} + +#endif // EIGEN_VECTORIZE_FMA + +// This function implements Dekker's algorithm for the addition +// of two double word numbers represented by {x_hi, x_lo} and {y_hi, y_lo}. +// It returns the result as a pair {s_hi, s_lo} such that +// x_hi + x_lo + y_hi + y_lo = s_hi + s_lo holds exactly. +// This is Algorithm 5 from Jean-Michel Muller, "Elementary Functions", +// 3rd edition, Birkh\"auser, 2016. +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void twosum(const Packet& x_hi, const Packet& x_lo, const Packet& y_hi, + const Packet& y_lo, Packet& s_hi, Packet& s_lo) { + const Packet x_greater_mask = pcmp_lt(pabs(y_hi), pabs(x_hi)); + Packet r_hi_1, r_lo_1; + fast_twosum(x_hi, y_hi, r_hi_1, r_lo_1); + Packet r_hi_2, r_lo_2; + fast_twosum(y_hi, x_hi, r_hi_2, r_lo_2); + const Packet r_hi = pselect(x_greater_mask, r_hi_1, r_hi_2); + + const Packet s1 = padd(padd(y_lo, r_lo_1), x_lo); + const Packet s2 = padd(padd(x_lo, r_lo_2), y_lo); + const Packet s = pselect(x_greater_mask, s1, s2); + + fast_twosum(r_hi, s, s_hi, s_lo); +} + +// This is a version of twosum for double word numbers, +// which assumes that |x_hi| >= |y_hi|. +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void fast_twosum(const Packet& x_hi, const Packet& x_lo, const Packet& y_hi, + const Packet& y_lo, Packet& s_hi, Packet& s_lo) { + Packet r_hi, r_lo; + fast_twosum(x_hi, y_hi, r_hi, r_lo); + const Packet s = padd(padd(y_lo, r_lo), x_lo); + fast_twosum(r_hi, s, s_hi, s_lo); +} + +// This is a version of twosum for adding a floating point number x to +// double word number {y_hi, y_lo} number, with the assumption +// that |x| >= |y_hi|. +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void fast_twosum(const Packet& x, const Packet& y_hi, const Packet& y_lo, + Packet& s_hi, Packet& s_lo) { + Packet r_hi, r_lo; + fast_twosum(x, y_hi, r_hi, r_lo); + const Packet s = padd(y_lo, r_lo); + fast_twosum(r_hi, s, s_hi, s_lo); +} + +// This function implements the multiplication of a double word +// number represented by {x_hi, x_lo} by a floating point number y. +// It returns the result as a pair {p_hi, p_lo} such that +// (x_hi + x_lo) * y = p_hi + p_lo hold with a relative error +// of less than 2*2^{-2p}, where p is the number of significand bit +// in the floating point type. +// This is Algorithm 7 from Jean-Michel Muller, "Elementary Functions", +// 3rd edition, Birkh\"auser, 2016. +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void twoprod(const Packet& x_hi, const Packet& x_lo, const Packet& y, + Packet& p_hi, Packet& p_lo) { + Packet c_hi, c_lo1; + twoprod(x_hi, y, c_hi, c_lo1); + const Packet c_lo2 = pmul(x_lo, y); + Packet t_hi, t_lo1; + fast_twosum(c_hi, c_lo2, t_hi, t_lo1); + const Packet t_lo2 = padd(t_lo1, c_lo1); + fast_twosum(t_hi, t_lo2, p_hi, p_lo); +} + +// This function implements the multiplication of two double word +// numbers represented by {x_hi, x_lo} and {y_hi, y_lo}. +// It returns the result as a pair {p_hi, p_lo} such that +// (x_hi + x_lo) * (y_hi + y_lo) = p_hi + p_lo holds with a relative error +// of less than 2*2^{-2p}, where p is the number of significand bit +// in the floating point type. +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void twoprod(const Packet& x_hi, const Packet& x_lo, const Packet& y_hi, + const Packet& y_lo, Packet& p_hi, Packet& p_lo) { + Packet p_hi_hi, p_hi_lo; + twoprod(x_hi, x_lo, y_hi, p_hi_hi, p_hi_lo); + Packet p_lo_hi, p_lo_lo; + twoprod(x_hi, x_lo, y_lo, p_lo_hi, p_lo_lo); + fast_twosum(p_hi_hi, p_hi_lo, p_lo_hi, p_lo_lo, p_hi, p_lo); +} + +// This function implements the division of double word {x_hi, x_lo} +// by float y. This is Algorithm 15 from "Tight and rigorous error bounds +// for basic building blocks of double-word arithmetic", Joldes, Muller, & Popescu, +// 2017. https://hal.archives-ouvertes.fr/hal-01351529 +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void doubleword_div_fp(const Packet& x_hi, const Packet& x_lo, const Packet& y, + Packet& z_hi, Packet& z_lo) { + const Packet t_hi = pdiv(x_hi, y); + Packet pi_hi, pi_lo; + twoprod(t_hi, y, pi_hi, pi_lo); + const Packet delta_hi = psub(x_hi, pi_hi); + const Packet delta_t = psub(delta_hi, pi_lo); + const Packet delta = padd(delta_t, x_lo); + const Packet t_lo = pdiv(delta, y); + fast_twosum(t_hi, t_lo, z_hi, z_lo); +} + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_ARCH_GENERIC_PACKET_MATH_DOUBLE_WORD_H diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFrexpLdexp.h b/Eigen/src/Core/arch/Default/GenericPacketMathFrexpLdexp.h new file mode 100644 index 000000000..978818ecd --- /dev/null +++ b/Eigen/src/Core/arch/Default/GenericPacketMathFrexpLdexp.h @@ -0,0 +1,162 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2009-2019 Gael Guennebaud +// Copyright (C) 2018-2025 Rasmus Munk Larsen +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_ARCH_GENERIC_PACKET_MATH_FREXP_LDEXP_H +#define EIGEN_ARCH_GENERIC_PACKET_MATH_FREXP_LDEXP_H + +// IWYU pragma: private +#include "../../InternalHeaderCheck.h" + +namespace Eigen { +namespace internal { + +// Creates a Scalar integer type with same bit-width. +template +struct make_integer; +template <> +struct make_integer { + typedef numext::int32_t type; +}; +template <> +struct make_integer { + typedef numext::int64_t type; +}; +template <> +struct make_integer { + typedef numext::int16_t type; +}; +template <> +struct make_integer { + typedef numext::int16_t type; +}; + +template +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pfrexp_generic_get_biased_exponent(const Packet& a) { + typedef typename unpacket_traits::type Scalar; + typedef typename unpacket_traits::integer_packet PacketI; + static constexpr int mantissa_bits = numext::numeric_limits::digits - 1; + return pcast(plogical_shift_right(preinterpret(pabs(a)))); +} + +// Safely applies frexp, correctly handles denormals. +// Assumes IEEE floating point format. +template +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pfrexp_generic(const Packet& a, Packet& exponent) { + typedef typename unpacket_traits::type Scalar; + typedef typename make_unsigned::type>::type ScalarUI; + static constexpr int TotalBits = sizeof(Scalar) * CHAR_BIT, MantissaBits = numext::numeric_limits::digits - 1, + ExponentBits = TotalBits - MantissaBits - 1; + + constexpr ScalarUI scalar_sign_mantissa_mask = + ~(((ScalarUI(1) << ExponentBits) - ScalarUI(1)) << MantissaBits); // ~0x7f800000 + const Packet sign_mantissa_mask = pset1frombits(static_cast(scalar_sign_mantissa_mask)); + const Packet half = pset1(Scalar(0.5)); + const Packet zero = pzero(a); + const Packet normal_min = pset1((numext::numeric_limits::min)()); // Minimum normal value, 2^-126 + + // To handle denormals, normalize by multiplying by 2^(int(MantissaBits)+1). + const Packet is_denormal = pcmp_lt(pabs(a), normal_min); + constexpr ScalarUI scalar_normalization_offset = ScalarUI(MantissaBits + 1); // 24 + // The following cannot be constexpr because bfloat16(uint16_t) is not constexpr. + const Scalar scalar_normalization_factor = Scalar(ScalarUI(1) << int(scalar_normalization_offset)); // 2^24 + const Packet normalization_factor = pset1(scalar_normalization_factor); + const Packet normalized_a = pselect(is_denormal, pmul(a, normalization_factor), a); + + // Determine exponent offset: -126 if normal, -126-24 if denormal + const Scalar scalar_exponent_offset = -Scalar((ScalarUI(1) << (ExponentBits - 1)) - ScalarUI(2)); // -126 + Packet exponent_offset = pset1(scalar_exponent_offset); + const Packet normalization_offset = pset1(-Scalar(scalar_normalization_offset)); // -24 + exponent_offset = pselect(is_denormal, padd(exponent_offset, normalization_offset), exponent_offset); + + // Determine exponent and mantissa from normalized_a. + exponent = pfrexp_generic_get_biased_exponent(normalized_a); + // Zero, Inf and NaN return 'a' unmodified, exponent is zero + // (technically the exponent is unspecified for inf/NaN, but GCC/Clang set it to zero) + const Scalar scalar_non_finite_exponent = Scalar((ScalarUI(1) << ExponentBits) - ScalarUI(1)); // 255 + const Packet non_finite_exponent = pset1(scalar_non_finite_exponent); + const Packet is_zero_or_not_finite = por(pcmp_eq(a, zero), pcmp_eq(exponent, non_finite_exponent)); + const Packet m = pselect(is_zero_or_not_finite, a, por(pand(normalized_a, sign_mantissa_mask), half)); + exponent = pselect(is_zero_or_not_finite, zero, padd(exponent, exponent_offset)); + return m; +} + +// Safely applies ldexp, correctly handles overflows, underflows and denormals. +// Assumes IEEE floating point format. +template +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pldexp_generic(const Packet& a, const Packet& exponent) { + // We want to return a * 2^exponent, allowing for all possible integer + // exponents without overflowing or underflowing in intermediate + // computations. + // + // Since 'a' and the output can be denormal, the maximum range of 'exponent' + // to consider for a float is: + // -255-23 -> 255+23 + // Below -278 any finite float 'a' will become zero, and above +278 any + // finite float will become inf, including when 'a' is the smallest possible + // denormal. + // + // Unfortunately, 2^(278) cannot be represented using either one or two + // finite normal floats, so we must split the scale factor into at least + // three parts. It turns out to be faster to split 'exponent' into four + // factors, since [exponent>>2] is much faster to compute that [exponent/3]. + // + // Set e = min(max(exponent, -278), 278); + // b = floor(e/4); + // out = ((((a * 2^(b)) * 2^(b)) * 2^(b)) * 2^(e-3*b)) + // + // This will avoid any intermediate overflows and correctly handle 0, inf, + // NaN cases. + typedef typename unpacket_traits::integer_packet PacketI; + typedef typename unpacket_traits::type Scalar; + typedef typename unpacket_traits::type ScalarI; + static constexpr int TotalBits = sizeof(Scalar) * CHAR_BIT, MantissaBits = numext::numeric_limits::digits - 1, + ExponentBits = TotalBits - MantissaBits - 1; + + const Packet max_exponent = pset1(Scalar((ScalarI(1) << ExponentBits) + ScalarI(MantissaBits - 1))); // 278 + const PacketI bias = pset1((ScalarI(1) << (ExponentBits - 1)) - ScalarI(1)); // 127 + const PacketI e = pcast(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent)); + PacketI b = parithmetic_shift_right<2>(e); // floor(e/4); + Packet c = preinterpret(plogical_shift_left(padd(b, bias))); // 2^b + Packet out = pmul(pmul(pmul(a, c), c), c); // a * 2^(3b) + b = pnmadd(pset1(3), b, e); // e - 3b + c = preinterpret(plogical_shift_left(padd(b, bias))); // 2^(e-3*b) + out = pmul(out, c); + return out; +} + +// Explicitly multiplies +// a * (2^e) +// clamping e to the range +// [NumTraits::min_exponent()-2, NumTraits::max_exponent()] +// +// This is approx 7x faster than pldexp_impl, but will prematurely over/underflow +// if 2^e doesn't fit into a normal floating-point Scalar. +// +// Assumes IEEE floating point format +template +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pldexp_fast(const Packet& a, const Packet& exponent) { + typedef typename unpacket_traits::integer_packet PacketI; + typedef typename unpacket_traits::type Scalar; + typedef typename unpacket_traits::type ScalarI; + static constexpr int TotalBits = sizeof(Scalar) * CHAR_BIT, MantissaBits = numext::numeric_limits::digits - 1, + ExponentBits = TotalBits - MantissaBits - 1; + + const Packet bias = pset1(Scalar((ScalarI(1) << (ExponentBits - 1)) - ScalarI(1))); // 127 + const Packet limit = pset1(Scalar((ScalarI(1) << ExponentBits) - ScalarI(1))); // 255 + // restrict biased exponent between 0 and 255 for float. + const PacketI e = pcast(pmin(pmax(padd(exponent, bias), pzero(limit)), limit)); // exponent + 127 + // return a * (2^e) + return pmul(a, preinterpret(plogical_shift_left(e))); +} + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_ARCH_GENERIC_PACKET_MATH_FREXP_LDEXP_H diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h index 915241f73..297df8367 100644 --- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h @@ -19,276 +19,16 @@ // IWYU pragma: private #include "../../InternalHeaderCheck.h" +#include "GenericPacketMathPolynomials.h" +#include "GenericPacketMathFrexpLdexp.h" +#include "GenericPacketMathDoubleWord.h" namespace Eigen { namespace internal { -// Creates a Scalar integer type with same bit-width. -template -struct make_integer; -template <> -struct make_integer { - typedef numext::int32_t type; -}; -template <> -struct make_integer { - typedef numext::int64_t type; -}; -template <> -struct make_integer { - typedef numext::int16_t type; -}; -template <> -struct make_integer { - typedef numext::int16_t type; -}; - -/* polevl (modified for Eigen) - * - * Evaluate polynomial - * - * - * - * SYNOPSIS: - * - * int N; - * Scalar x, y, coef[N+1]; - * - * y = polevl( x, coef); - * - * - * - * DESCRIPTION: - * - * Evaluates polynomial of degree N: - * - * 2 N - * y = C + C x + C x +...+ C x - * 0 1 2 N - * - * Coefficients are stored in reverse order: - * - * coef[0] = C , ..., coef[N] = C . - * N 0 - * - * The function p1evl() assumes that coef[N] = 1.0 and is - * omitted from the array. Its calling arguments are - * otherwise the same as polevl(). - * - * - * The Eigen implementation is templatized. For best speed, store - * coef as a const array (constexpr), e.g. - * - * const double coef[] = {1.0, 2.0, 3.0, ...}; - * - */ -template -struct ppolevl { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x, - const typename unpacket_traits::type coeff[]) { - EIGEN_STATIC_ASSERT((N > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); - return pmadd(ppolevl::run(x, coeff), x, pset1(coeff[N])); - } -}; - -template -struct ppolevl { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x, - const typename unpacket_traits::type coeff[]) { - EIGEN_UNUSED_VARIABLE(x); - return pset1(coeff[0]); - } -}; - -/* chbevl (modified for Eigen) - * - * Evaluate Chebyshev series - * - * - * - * SYNOPSIS: - * - * int N; - * Scalar x, y, coef[N], chebevl(); - * - * y = chbevl( x, coef, N ); - * - * - * - * DESCRIPTION: - * - * Evaluates the series - * - * N-1 - * - ' - * y = > coef[i] T (x/2) - * - i - * i=0 - * - * of Chebyshev polynomials Ti at argument x/2. - * - * Coefficients are stored in reverse order, i.e. the zero - * order term is last in the array. Note N is the number of - * coefficients, not the order. - * - * If coefficients are for the interval a to b, x must - * have been transformed to x -> 2(2x - b - a)/(b-a) before - * entering the routine. This maps x from (a, b) to (-1, 1), - * over which the Chebyshev polynomials are defined. - * - * If the coefficients are for the inverted interval, in - * which (a, b) is mapped to (1/b, 1/a), the transformation - * required is x -> 2(2ab/x - b - a)/(b-a). If b is infinity, - * this becomes x -> 4a/x - 1. - * - * - * - * SPEED: - * - * Taking advantage of the recurrence properties of the - * Chebyshev polynomials, the routine requires one more - * addition per loop than evaluating a nested polynomial of - * the same degree. - * - */ - -template -struct pchebevl { - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Packet run(Packet x, - const typename unpacket_traits::type coef[]) { - typedef typename unpacket_traits::type Scalar; - Packet b0 = pset1(coef[0]); - Packet b1 = pset1(static_cast(0.f)); - Packet b2; - - for (int i = 1; i < N; i++) { - b2 = b1; - b1 = b0; - b0 = psub(pmadd(x, b1, pset1(coef[i])), b2); - } - - return pmul(pset1(static_cast(0.5f)), psub(b0, b2)); - } -}; - -template -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pfrexp_generic_get_biased_exponent(const Packet& a) { - typedef typename unpacket_traits::type Scalar; - typedef typename unpacket_traits::integer_packet PacketI; - static constexpr int mantissa_bits = numext::numeric_limits::digits - 1; - return pcast(plogical_shift_right(preinterpret(pabs(a)))); -} - -// Safely applies frexp, correctly handles denormals. -// Assumes IEEE floating point format. -template -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pfrexp_generic(const Packet& a, Packet& exponent) { - typedef typename unpacket_traits::type Scalar; - typedef typename make_unsigned::type>::type ScalarUI; - static constexpr int TotalBits = sizeof(Scalar) * CHAR_BIT, MantissaBits = numext::numeric_limits::digits - 1, - ExponentBits = TotalBits - MantissaBits - 1; - - constexpr ScalarUI scalar_sign_mantissa_mask = - ~(((ScalarUI(1) << ExponentBits) - ScalarUI(1)) << MantissaBits); // ~0x7f800000 - const Packet sign_mantissa_mask = pset1frombits(static_cast(scalar_sign_mantissa_mask)); - const Packet half = pset1(Scalar(0.5)); - const Packet zero = pzero(a); - const Packet normal_min = pset1((numext::numeric_limits::min)()); // Minimum normal value, 2^-126 - - // To handle denormals, normalize by multiplying by 2^(int(MantissaBits)+1). - const Packet is_denormal = pcmp_lt(pabs(a), normal_min); - constexpr ScalarUI scalar_normalization_offset = ScalarUI(MantissaBits + 1); // 24 - // The following cannot be constexpr because bfloat16(uint16_t) is not constexpr. - const Scalar scalar_normalization_factor = Scalar(ScalarUI(1) << int(scalar_normalization_offset)); // 2^24 - const Packet normalization_factor = pset1(scalar_normalization_factor); - const Packet normalized_a = pselect(is_denormal, pmul(a, normalization_factor), a); - - // Determine exponent offset: -126 if normal, -126-24 if denormal - const Scalar scalar_exponent_offset = -Scalar((ScalarUI(1) << (ExponentBits - 1)) - ScalarUI(2)); // -126 - Packet exponent_offset = pset1(scalar_exponent_offset); - const Packet normalization_offset = pset1(-Scalar(scalar_normalization_offset)); // -24 - exponent_offset = pselect(is_denormal, padd(exponent_offset, normalization_offset), exponent_offset); - - // Determine exponent and mantissa from normalized_a. - exponent = pfrexp_generic_get_biased_exponent(normalized_a); - // Zero, Inf and NaN return 'a' unmodified, exponent is zero - // (technically the exponent is unspecified for inf/NaN, but GCC/Clang set it to zero) - const Scalar scalar_non_finite_exponent = Scalar((ScalarUI(1) << ExponentBits) - ScalarUI(1)); // 255 - const Packet non_finite_exponent = pset1(scalar_non_finite_exponent); - const Packet is_zero_or_not_finite = por(pcmp_eq(a, zero), pcmp_eq(exponent, non_finite_exponent)); - const Packet m = pselect(is_zero_or_not_finite, a, por(pand(normalized_a, sign_mantissa_mask), half)); - exponent = pselect(is_zero_or_not_finite, zero, padd(exponent, exponent_offset)); - return m; -} - -// Safely applies ldexp, correctly handles overflows, underflows and denormals. -// Assumes IEEE floating point format. -template -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pldexp_generic(const Packet& a, const Packet& exponent) { - // We want to return a * 2^exponent, allowing for all possible integer - // exponents without overflowing or underflowing in intermediate - // computations. - // - // Since 'a' and the output can be denormal, the maximum range of 'exponent' - // to consider for a float is: - // -255-23 -> 255+23 - // Below -278 any finite float 'a' will become zero, and above +278 any - // finite float will become inf, including when 'a' is the smallest possible - // denormal. - // - // Unfortunately, 2^(278) cannot be represented using either one or two - // finite normal floats, so we must split the scale factor into at least - // three parts. It turns out to be faster to split 'exponent' into four - // factors, since [exponent>>2] is much faster to compute that [exponent/3]. - // - // Set e = min(max(exponent, -278), 278); - // b = floor(e/4); - // out = ((((a * 2^(b)) * 2^(b)) * 2^(b)) * 2^(e-3*b)) - // - // This will avoid any intermediate overflows and correctly handle 0, inf, - // NaN cases. - typedef typename unpacket_traits::integer_packet PacketI; - typedef typename unpacket_traits::type Scalar; - typedef typename unpacket_traits::type ScalarI; - static constexpr int TotalBits = sizeof(Scalar) * CHAR_BIT, MantissaBits = numext::numeric_limits::digits - 1, - ExponentBits = TotalBits - MantissaBits - 1; - - const Packet max_exponent = pset1(Scalar((ScalarI(1) << ExponentBits) + ScalarI(MantissaBits - 1))); // 278 - const PacketI bias = pset1((ScalarI(1) << (ExponentBits - 1)) - ScalarI(1)); // 127 - const PacketI e = pcast(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent)); - PacketI b = parithmetic_shift_right<2>(e); // floor(e/4); - Packet c = preinterpret(plogical_shift_left(padd(b, bias))); // 2^b - Packet out = pmul(pmul(pmul(a, c), c), c); // a * 2^(3b) - b = pnmadd(pset1(3), b, e); // e - 3b - c = preinterpret(plogical_shift_left(padd(b, bias))); // 2^(e-3*b) - out = pmul(out, c); - return out; -} - -// Explicitly multiplies -// a * (2^e) -// clamping e to the range -// [NumTraits::min_exponent()-2, NumTraits::max_exponent()] -// -// This is approx 7x faster than pldexp_impl, but will prematurely over/underflow -// if 2^e doesn't fit into a normal floating-point Scalar. -// -// Assumes IEEE floating point format -template -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pldexp_fast(const Packet& a, const Packet& exponent) { - typedef typename unpacket_traits::integer_packet PacketI; - typedef typename unpacket_traits::type Scalar; - typedef typename unpacket_traits::type ScalarI; - static constexpr int TotalBits = sizeof(Scalar) * CHAR_BIT, MantissaBits = numext::numeric_limits::digits - 1, - ExponentBits = TotalBits - MantissaBits - 1; - - const Packet bias = pset1(Scalar((ScalarI(1) << (ExponentBits - 1)) - ScalarI(1))); // 127 - const Packet limit = pset1(Scalar((ScalarI(1) << ExponentBits) - ScalarI(1))); // 255 - // restrict biased exponent between 0 and 255 for float. - const PacketI e = pcast(pmin(pmax(padd(exponent, bias), pzero(limit)), limit)); // exponent + 127 - // return a * (2^e) - return pmul(a, preinterpret(plogical_shift_left(e))); -} +//---------------------------------------------------------------------- +// Cubic Root Functions +//---------------------------------------------------------------------- // This function implements a single step of Halley's iteration for // computing x = y^(1/3): @@ -426,6 +166,10 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcbrt_double(const Pa return cbrt_special_cases_and_sign(x, r); } +//---------------------------------------------------------------------- +// Exponential and Logarithmic Functions +//---------------------------------------------------------------------- + // Natural or base 2 logarithm. // Computes log(x) as log(2^e * m) = C*e + log(m), where the constant C =log(2) // and m is in the range [sqrt(1/2),sqrt(2)). In this range, the logarithm can @@ -773,6 +517,36 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexp_double(const Pac return pselect(zero_mask, cst_zero, pmax(pldexp(x, fx), _x)); } +// This function computes exp2(x) = exp(ln(2) * x). +// To improve accuracy, the product ln(2)*x is computed using the twoprod +// algorithm, such that ln(2) * x = p_hi + p_lo holds exactly. Then exp2(x) is +// computed as exp2(x) = exp(p_hi) * exp(p_lo) ~= exp(p_hi) * (1 + p_lo). This +// correction step this reduces the maximum absolute error as follows: +// +// type | max error (simple product) | max error (twoprod) | +// ----------------------------------------------------------- +// float | 35 ulps | 4 ulps | +// double | 363 ulps | 110 ulps | +// +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_exp2(const Packet& _x) { + typedef typename unpacket_traits::type Scalar; + constexpr int max_exponent = std::numeric_limits::max_exponent; + constexpr int digits = std::numeric_limits::digits; + constexpr Scalar max_cap = Scalar(max_exponent + 1); + constexpr Scalar min_cap = -Scalar(max_exponent + digits - 1); + Packet x = pmax(pmin(_x, pset1(max_cap)), pset1(min_cap)); + Packet p_hi, p_lo; + twoprod(pset1(Scalar(EIGEN_LN2)), x, p_hi, p_lo); + Packet exp2_hi = pexp(p_hi); + Packet exp2_lo = padd(pset1(Scalar(1)), p_lo); + return pmul(exp2_hi, exp2_lo); +} + +//---------------------------------------------------------------------- +// Trigonometric Functions +//---------------------------------------------------------------------- + // Enum for selecting which function to compute. SinCos is intended to compute // pairs of Sin and Cos of the even entries in the packet, e.g. // SinCos([a, *, b, *]) = [sin(a), cos(a), sin(b), cos(b)]. @@ -1200,6 +974,10 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS return psincos_double(x); } +//---------------------------------------------------------------------- +// Inverse Trigonometric Functions +//---------------------------------------------------------------------- + // Generic implementation of acos(x). template EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pacos_float(const Packet& x_in) { @@ -1348,6 +1126,10 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_atan(const Pa return pxor(result, x_signmask); } +//---------------------------------------------------------------------- +// Hyperbolic Functions +//---------------------------------------------------------------------- + #ifdef EIGEN_FAST_MATH /** \internal \returns the hyperbolic tan of \a a (coeff-wise) @@ -1570,6 +1352,10 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet patanh_double(const P return por(x_gt_one, pselect(x_eq_one, por(x_sign, inf), pselect(x_gt_half, y_large, y_small))); } +//---------------------------------------------------------------------- +// Complex Arithmetic and Functions +//---------------------------------------------------------------------- + template EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pdiv_complex(const Packet& x, const Packet& y) { typedef typename unpacket_traits::as_real RealPacket; @@ -1577,7 +1363,7 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pdiv_complex(const Pa // In the following we annotate the code for the case where the inputs // are a pair length-2 SIMD vectors representing a single pair of complex // numbers x = a + i*b, y = c + i*d. - static const RealPacket one = pset1(RealScalar(1)); + const RealPacket one = pset1(RealScalar(1)); const RealPacket y_flip = pcplxflip(y).v; // We need to avoid dividing by Inf/Inf, so use a mask to carefully // apply the scale. @@ -1825,6 +1611,10 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet phypot_complex(const return Packet(h); // |sqrt(a^2+b^2), sqrt(a^2+b^2)| } +//---------------------------------------------------------------------- +// Sign Function +//---------------------------------------------------------------------- + template struct psign_impl::value && !NumTraits::type>::IsComplex && @@ -1912,196 +1702,9 @@ struct psign_impl::value && } }; -// TODO(rmlarsen): The following set of utilities for double word arithmetic -// should perhaps be refactored as a separate file, since it would be generally -// useful for special function implementation etc. Writing the algorithms in -// terms if a double word type would also make the code more readable. - -// This function splits x into the nearest integer n and fractional part r, -// such that x = n + r holds exactly. -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void absolute_split(const Packet& x, Packet& n, Packet& r) { - n = pround(x); - r = psub(x, n); -} - -// This function computes the sum {s, r}, such that x + y = s_hi + s_lo -// holds exactly, and s_hi = fl(x+y), if |x| >= |y|. -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void fast_twosum(const Packet& x, const Packet& y, Packet& s_hi, Packet& s_lo) { - s_hi = padd(x, y); - const Packet t = psub(s_hi, x); - s_lo = psub(y, t); -} - -#ifdef EIGEN_VECTORIZE_FMA -// This function implements the extended precision product of -// a pair of floating point numbers. Given {x, y}, it computes the pair -// {p_hi, p_lo} such that x * y = p_hi + p_lo holds exactly and -// p_hi = fl(x * y). -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void twoprod(const Packet& x, const Packet& y, Packet& p_hi, Packet& p_lo) { - p_hi = pmul(x, y); - p_lo = pmsub(x, y, p_hi); -} - -// A version of twoprod that takes x, y, and fl(x*y) as input and returns the p_lo such that -// x * y = xy + p_lo holds exactly. -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet twoprod_low(const Packet& x, const Packet& y, const Packet& xy) { - return pmsub(x, y, xy); -} - -#else - -// This function implements the Veltkamp splitting. Given a floating point -// number x it returns the pair {x_hi, x_lo} such that x_hi + x_lo = x holds -// exactly and that half of the significant of x fits in x_hi. -// This is Algorithm 3 from Jean-Michel Muller, "Elementary Functions", -// 3rd edition, Birkh\"auser, 2016. -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void veltkamp_splitting(const Packet& x, Packet& x_hi, Packet& x_lo) { - typedef typename unpacket_traits::type Scalar; - constexpr int shift = (NumTraits::digits() + 1) / 2; - const Scalar shift_scale = Scalar(uint64_t(1) << shift); // Scalar constructor not necessarily constexpr. - const Packet gamma = pmul(pset1(shift_scale + Scalar(1)), x); - Packet rho = psub(x, gamma); - x_hi = padd(rho, gamma); - x_lo = psub(x, x_hi); -} - -// This function implements Dekker's algorithm for products x * y. -// Given floating point numbers {x, y} computes the pair -// {p_hi, p_lo} such that x * y = p_hi + p_lo holds exactly and -// p_hi = fl(x * y). -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void twoprod(const Packet& x, const Packet& y, Packet& p_hi, Packet& p_lo) { - Packet x_hi, x_lo, y_hi, y_lo; - veltkamp_splitting(x, x_hi, x_lo); - veltkamp_splitting(y, y_hi, y_lo); - - p_hi = pmul(x, y); - p_lo = pmadd(x_hi, y_hi, pnegate(p_hi)); - p_lo = pmadd(x_hi, y_lo, p_lo); - p_lo = pmadd(x_lo, y_hi, p_lo); - p_lo = pmadd(x_lo, y_lo, p_lo); -} - -// A version of twoprod that takes x, y, and fl(x*y) as input and returns the p_lo such that -// x * y = xy + p_lo holds exactly. -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet twoprod_low(const Packet& x, const Packet& y, const Packet& xy) { - Packet x_hi, x_lo, y_hi, y_lo; - veltkamp_splitting(x, x_hi, x_lo); - veltkamp_splitting(y, y_hi, y_lo); - - Packet p_lo = pmadd(x_hi, y_hi, pnegate(xy)); - p_lo = pmadd(x_hi, y_lo, p_lo); - p_lo = pmadd(x_lo, y_hi, p_lo); - p_lo = pmadd(x_lo, y_lo, p_lo); - return p_lo; -} - -#endif // EIGEN_VECTORIZE_FMA - -// This function implements Dekker's algorithm for the addition -// of two double word numbers represented by {x_hi, x_lo} and {y_hi, y_lo}. -// It returns the result as a pair {s_hi, s_lo} such that -// x_hi + x_lo + y_hi + y_lo = s_hi + s_lo holds exactly. -// This is Algorithm 5 from Jean-Michel Muller, "Elementary Functions", -// 3rd edition, Birkh\"auser, 2016. -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void twosum(const Packet& x_hi, const Packet& x_lo, const Packet& y_hi, - const Packet& y_lo, Packet& s_hi, Packet& s_lo) { - const Packet x_greater_mask = pcmp_lt(pabs(y_hi), pabs(x_hi)); - Packet r_hi_1, r_lo_1; - fast_twosum(x_hi, y_hi, r_hi_1, r_lo_1); - Packet r_hi_2, r_lo_2; - fast_twosum(y_hi, x_hi, r_hi_2, r_lo_2); - const Packet r_hi = pselect(x_greater_mask, r_hi_1, r_hi_2); - - const Packet s1 = padd(padd(y_lo, r_lo_1), x_lo); - const Packet s2 = padd(padd(x_lo, r_lo_2), y_lo); - const Packet s = pselect(x_greater_mask, s1, s2); - - fast_twosum(r_hi, s, s_hi, s_lo); -} - -// This is a version of twosum for double word numbers, -// which assumes that |x_hi| >= |y_hi|. -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void fast_twosum(const Packet& x_hi, const Packet& x_lo, const Packet& y_hi, - const Packet& y_lo, Packet& s_hi, Packet& s_lo) { - Packet r_hi, r_lo; - fast_twosum(x_hi, y_hi, r_hi, r_lo); - const Packet s = padd(padd(y_lo, r_lo), x_lo); - fast_twosum(r_hi, s, s_hi, s_lo); -} - -// This is a version of twosum for adding a floating point number x to -// double word number {y_hi, y_lo} number, with the assumption -// that |x| >= |y_hi|. -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void fast_twosum(const Packet& x, const Packet& y_hi, const Packet& y_lo, - Packet& s_hi, Packet& s_lo) { - Packet r_hi, r_lo; - fast_twosum(x, y_hi, r_hi, r_lo); - const Packet s = padd(y_lo, r_lo); - fast_twosum(r_hi, s, s_hi, s_lo); -} - -// This function implements the multiplication of a double word -// number represented by {x_hi, x_lo} by a floating point number y. -// It returns the result as a pair {p_hi, p_lo} such that -// (x_hi + x_lo) * y = p_hi + p_lo hold with a relative error -// of less than 2*2^{-2p}, where p is the number of significand bit -// in the floating point type. -// This is Algorithm 7 from Jean-Michel Muller, "Elementary Functions", -// 3rd edition, Birkh\"auser, 2016. -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void twoprod(const Packet& x_hi, const Packet& x_lo, const Packet& y, - Packet& p_hi, Packet& p_lo) { - Packet c_hi, c_lo1; - twoprod(x_hi, y, c_hi, c_lo1); - const Packet c_lo2 = pmul(x_lo, y); - Packet t_hi, t_lo1; - fast_twosum(c_hi, c_lo2, t_hi, t_lo1); - const Packet t_lo2 = padd(t_lo1, c_lo1); - fast_twosum(t_hi, t_lo2, p_hi, p_lo); -} - -// This function implements the multiplication of two double word -// numbers represented by {x_hi, x_lo} and {y_hi, y_lo}. -// It returns the result as a pair {p_hi, p_lo} such that -// (x_hi + x_lo) * (y_hi + y_lo) = p_hi + p_lo holds with a relative error -// of less than 2*2^{-2p}, where p is the number of significand bit -// in the floating point type. -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void twoprod(const Packet& x_hi, const Packet& x_lo, const Packet& y_hi, - const Packet& y_lo, Packet& p_hi, Packet& p_lo) { - Packet p_hi_hi, p_hi_lo; - twoprod(x_hi, x_lo, y_hi, p_hi_hi, p_hi_lo); - Packet p_lo_hi, p_lo_lo; - twoprod(x_hi, x_lo, y_lo, p_lo_hi, p_lo_lo); - fast_twosum(p_hi_hi, p_hi_lo, p_lo_hi, p_lo_lo, p_hi, p_lo); -} - -// This function implements the division of double word {x_hi, x_lo} -// by float y. This is Algorithm 15 from "Tight and rigorous error bounds -// for basic building blocks of double-word arithmetic", Joldes, Muller, & Popescu, -// 2017. https://hal.archives-ouvertes.fr/hal-01351529 -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void doubleword_div_fp(const Packet& x_hi, const Packet& x_lo, const Packet& y, - Packet& z_hi, Packet& z_lo) { - const Packet t_hi = pdiv(x_hi, y); - Packet pi_hi, pi_lo; - twoprod(t_hi, y, pi_hi, pi_lo); - const Packet delta_hi = psub(x_hi, pi_hi); - const Packet delta_t = psub(delta_hi, pi_lo); - const Packet delta = padd(delta_t, x_lo); - const Packet t_lo = pdiv(delta, y); - fast_twosum(t_hi, t_lo, z_hi, z_lo); -} +//---------------------------------------------------------------------- +// Power Functions (accurate_log2, generic_pow, unary_pow) +//---------------------------------------------------------------------- // This function computes log2(x) and returns the result as a double word. template @@ -2647,31 +2250,9 @@ struct unary_pow_impl { } }; -// This function computes exp2(x) = exp(ln(2) * x). -// To improve accuracy, the product ln(2)*x is computed using the twoprod -// algorithm, such that ln(2) * x = p_hi + p_lo holds exactly. Then exp2(x) is -// computed as exp2(x) = exp(p_hi) * exp(p_lo) ~= exp(p_hi) * (1 + p_lo). This -// correction step this reduces the maximum absolute error as follows: -// -// type | max error (simple product) | max error (twoprod) | -// ----------------------------------------------------------- -// float | 35 ulps | 4 ulps | -// double | 363 ulps | 110 ulps | -// -template -EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_exp2(const Packet& _x) { - typedef typename unpacket_traits::type Scalar; - constexpr int max_exponent = std::numeric_limits::max_exponent; - constexpr int digits = std::numeric_limits::digits; - constexpr Scalar max_cap = Scalar(max_exponent + 1); - constexpr Scalar min_cap = -Scalar(max_exponent + digits - 1); - Packet x = pmax(pmin(_x, pset1(max_cap)), pset1(min_cap)); - Packet p_hi, p_lo; - twoprod(pset1(Scalar(EIGEN_LN2)), x, p_hi, p_lo); - Packet exp2_hi = pexp(p_hi); - Packet exp2_lo = padd(pset1(Scalar(1)), p_lo); - return pmul(exp2_hi, exp2_lo); -} +//---------------------------------------------------------------------- +// Rounding Functions +//---------------------------------------------------------------------- template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet generic_rint(const Packet& a) { diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h index 942ae129c..fb9d71017 100644 --- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h +++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h @@ -70,11 +70,11 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog_float(const Pack template EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog2_float(const Packet _x); -/** \internal \returns log(x) for single precision float */ +/** \internal \returns log(x) for double precision float */ template EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog_double(const Packet _x); -/** \internal \returns log2(x) for single precision float */ +/** \internal \returns log2(x) for double precision float */ template EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog2_double(const Packet _x); diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathPolynomials.h b/Eigen/src/Core/arch/Default/GenericPacketMathPolynomials.h new file mode 100644 index 000000000..a8ec6aebb --- /dev/null +++ b/Eigen/src/Core/arch/Default/GenericPacketMathPolynomials.h @@ -0,0 +1,151 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2007 Julien Pommier +// Copyright (C) 2009-2019 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_ARCH_GENERIC_PACKET_MATH_POLYNOMIALS_H +#define EIGEN_ARCH_GENERIC_PACKET_MATH_POLYNOMIALS_H + +// IWYU pragma: private +#include "../../InternalHeaderCheck.h" + +namespace Eigen { +namespace internal { + +/* polevl (modified for Eigen) + * + * Evaluate polynomial + * + * + * + * SYNOPSIS: + * + * int N; + * Scalar x, y, coef[N+1]; + * + * y = polevl( x, coef); + * + * + * + * DESCRIPTION: + * + * Evaluates polynomial of degree N: + * + * 2 N + * y = C + C x + C x +...+ C x + * 0 1 2 N + * + * Coefficients are stored in reverse order: + * + * coef[0] = C , ..., coef[N] = C . + * N 0 + * + * The function p1evl() assumes that coef[N] = 1.0 and is + * omitted from the array. Its calling arguments are + * otherwise the same as polevl(). + * + * + * The Eigen implementation is templatized. For best speed, store + * coef as a const array (constexpr), e.g. + * + * const double coef[] = {1.0, 2.0, 3.0, ...}; + * + */ +template +struct ppolevl { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x, + const typename unpacket_traits::type coeff[]) { + EIGEN_STATIC_ASSERT((N > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); + return pmadd(ppolevl::run(x, coeff), x, pset1(coeff[N])); + } +}; + +template +struct ppolevl { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x, + const typename unpacket_traits::type coeff[]) { + EIGEN_UNUSED_VARIABLE(x); + return pset1(coeff[0]); + } +}; + +/* chbevl (modified for Eigen) + * + * Evaluate Chebyshev series + * + * + * + * SYNOPSIS: + * + * int N; + * Scalar x, y, coef[N], chebevl(); + * + * y = chbevl( x, coef, N ); + * + * + * + * DESCRIPTION: + * + * Evaluates the series + * + * N-1 + * - ' + * y = > coef[i] T (x/2) + * - i + * i=0 + * + * of Chebyshev polynomials Ti at argument x/2. + * + * Coefficients are stored in reverse order, i.e. the zero + * order term is last in the array. Note N is the number of + * coefficients, not the order. + * + * If coefficients are for the interval a to b, x must + * have been transformed to x -> 2(2x - b - a)/(b-a) before + * entering the routine. This maps x from (a, b) to (-1, 1), + * over which the Chebyshev polynomials are defined. + * + * If the coefficients are for the inverted interval, in + * which (a, b) is mapped to (1/b, 1/a), the transformation + * required is x -> 2(2ab/x - b - a)/(b-a). If b is infinity, + * this becomes x -> 4a/x - 1. + * + * + * + * SPEED: + * + * Taking advantage of the recurrence properties of the + * Chebyshev polynomials, the routine requires one more + * addition per loop than evaluating a nested polynomial of + * the same degree. + * + */ + +template +struct pchebevl { + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Packet run(Packet x, + const typename unpacket_traits::type coef[]) { + typedef typename unpacket_traits::type Scalar; + Packet b0 = pset1(coef[0]); + Packet b1 = pset1(static_cast(0.f)); + Packet b2; + + for (int i = 1; i < N; i++) { + b2 = b1; + b1 = b0; + b0 = psub(pmadd(x, b1, pset1(coef[i])), b2); + } + + return pmul(pset1(static_cast(0.5f)), psub(b0, b2)); + } +}; + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_ARCH_GENERIC_PACKET_MATH_POLYNOMIALS_H