Vectorize tan(x)

libeigen/eigen!2086

Co-authored-by: Rasmus Munk Larsen <rmlarsen@google.com>
This commit is contained in:
Rasmus Munk Larsen
2025-12-02 21:53:10 +00:00
parent 01a919d13f
commit 75bcd155c4
12 changed files with 89 additions and 32 deletions

View File

@@ -32,6 +32,7 @@ EIGEN_DOUBLE_PACKET_FUNCTION(cbrt, Packet4d)
#ifdef EIGEN_VECTORIZE_AVX2
EIGEN_DOUBLE_PACKET_FUNCTION(sin, Packet4d)
EIGEN_DOUBLE_PACKET_FUNCTION(cos, Packet4d)
EIGEN_DOUBLE_PACKET_FUNCTION(tan, Packet4d)
#endif
EIGEN_GENERIC_PACKET_FUNCTION(atan, Packet4d)
EIGEN_GENERIC_PACKET_FUNCTION(exp2, Packet4d)

View File

@@ -110,6 +110,7 @@ struct packet_traits<float> : default_packet_traits {
HasReciprocal = EIGEN_FAST_MATH,
HasSin = EIGEN_FAST_MATH,
HasCos = EIGEN_FAST_MATH,
HasTan = EIGEN_FAST_MATH,
HasACos = 1,
HasASin = 1,
HasATan = 1,
@@ -143,6 +144,7 @@ struct packet_traits<double> : default_packet_traits {
#ifdef EIGEN_VECTORIZE_AVX2
HasSin = EIGEN_FAST_MATH,
HasCos = EIGEN_FAST_MATH,
HasTan = EIGEN_FAST_MATH,
#endif
HasTanh = EIGEN_FAST_MATH,
HasErf = 1,

View File

@@ -119,6 +119,7 @@ struct packet_traits<float> : default_packet_traits {
HasConj = 1,
HasSin = EIGEN_FAST_MATH,
HasCos = EIGEN_FAST_MATH,
HasTan = EIGEN_FAST_MATH,
HasACos = 1,
HasASin = 1,
HasATan = 1,
@@ -154,6 +155,7 @@ struct packet_traits<double> : default_packet_traits {
HasCbrt = 1,
HasSin = EIGEN_FAST_MATH,
HasCos = EIGEN_FAST_MATH,
HasTan = EIGEN_FAST_MATH,
HasLog = 1,
HasExp = 1,
HasLog1p = 1,

View File

@@ -178,6 +178,7 @@ struct packet_traits<float> : default_packet_traits {
HasAbs = 1,
HasSin = EIGEN_FAST_MATH,
HasCos = EIGEN_FAST_MATH,
HasTan = EIGEN_FAST_MATH,
HasACos = 1,
HasASin = 1,
HasATan = 1,
@@ -3098,6 +3099,7 @@ struct packet_traits<double> : default_packet_traits {
HasAbs = 1,
HasSin = EIGEN_FAST_MATH,
HasCos = EIGEN_FAST_MATH,
HasTan = EIGEN_FAST_MATH,
HasTanh = EIGEN_FAST_MATH,
HasErf = EIGEN_FAST_MATH,
HasErfc = EIGEN_FAST_MATH,

View File

@@ -773,6 +773,11 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexp_double(const Pac
return pselect(zero_mask, cst_zero, pmax(pldexp(x, fx), _x));
}
// Enum for selecting which function to compute. SinCos is intended to compute
// pairs of Sin and Cos of the even entries in the packet, e.g.
// SinCos([a, *, b, *]) = [sin(a), cos(a), sin(b), cos(b)].
enum class TrigFunction : uint8_t { Sin, Cos, Tan, SinCos };
// The following code is inspired by the following stack-overflow answer:
// https://stackoverflow.com/questions/30463616/payne-hanek-algorithm-implementation-in-c/30465751#30465751
// It has been largely optimized:
@@ -829,7 +834,7 @@ inline float trig_reduce_huge(float xf, Eigen::numext::int32_t* quadrant) {
return float(double(int64_t(p)) * pio2_62);
}
template <bool ComputeSine, typename Packet, bool ComputeBoth = false>
template <TrigFunction Func, typename Packet>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
#if EIGEN_COMP_GNUC_STRICT
__attribute__((optimize("-fno-unsafe-math-optimizations")))
@@ -859,7 +864,7 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
#if defined(EIGEN_VECTORIZE_FMA)
// This version requires true FMA for high accuracy.
// It provides a max error of 1ULP up to (with absolute_error < 5.9605e-08):
const float huge_th = ComputeSine ? 117435.992f : 71476.0625f;
constexpr float huge_th = (Func == TrigFunction::Sin) ? 117435.992f : 71476.0625f;
x = pmadd(y, pset1<Packet>(-1.57079601287841796875f), x);
x = pmadd(y, pset1<Packet>(-3.1391647326017846353352069854736328125e-07f), x);
x = pmadd(y, pset1<Packet>(-5.390302529957764765544681040410068817436695098876953125e-15f), x);
@@ -870,7 +875,7 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
// The following set of coefficients maintain 1ULP up to 9.43 and 14.16 for sin and cos respectively.
// and 2 ULP up to:
const float huge_th = ComputeSine ? 25966.f : 18838.f;
constexpr float huge_th = (Func == TrigFunction::Sin) ? 25966.f : 18838.f;
x = pmadd(y, pset1<Packet>(-1.5703125), x); // = 0xbfc90000
EIGEN_OPTIMIZATION_BARRIER(x)
x = pmadd(y, pset1<Packet>(-0.000483989715576171875), x); // = 0xb9fdc000
@@ -908,13 +913,6 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
y_int = ploadu<PacketI>(y_int2);
}
// Compute the sign to apply to the polynomial.
// sin: sign = second_bit(y_int) xor signbit(_x)
// cos: sign = second_bit(y_int+1)
Packet sign_bit = ComputeSine ? pxor(_x, preinterpret<Packet>(plogical_shift_left<30>(y_int)))
: preinterpret<Packet>(plogical_shift_left<30>(padd(y_int, csti_1)));
sign_bit = pand(sign_bit, cst_sign_mask); // clear all but left most bit
// Get the polynomial selection mask from the second bit of y_int
// We'll calculate both (sin and cos) polynomials and then select from the two.
Packet poly_mask = preinterpret<Packet>(pcmp_eq(pand(y_int, csti_1), pzero(y_int)));
@@ -943,7 +941,15 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
y2 = pmadd(y2, x, x);
// Select the correct result from the two polynomials.
if (ComputeBoth) {
// Compute the sign to apply to the polynomial.
// sin: sign = second_bit(y_int) xor signbit(_x)
// cos: sign = second_bit(y_int+1)
Packet sign_bit = (Func == TrigFunction::Sin) ? pxor(_x, preinterpret<Packet>(plogical_shift_left<30>(y_int)))
: preinterpret<Packet>(plogical_shift_left<30>(padd(y_int, csti_1)));
sign_bit = pand(sign_bit, cst_sign_mask); // clear all but left most bit
if ((Func == TrigFunction::SinCos) || (Func == TrigFunction::Tan)) {
// TODO(rmlarsen): Add single polynomial for tan(x) instead of paying for sin+cos+div.
Packet peven = peven_mask(x);
Packet ysin = pselect(poly_mask, y2, y1);
Packet ycos = pselect(poly_mask, y1, y2);
@@ -951,23 +957,28 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
Packet sign_bit_cos = preinterpret<Packet>(plogical_shift_left<30>(padd(y_int, csti_1)));
sign_bit_sin = pand(sign_bit_sin, cst_sign_mask); // clear all but left most bit
sign_bit_cos = pand(sign_bit_cos, cst_sign_mask); // clear all but left most bit
y = pselect(peven, pxor(ysin, sign_bit_sin), pxor(ycos, sign_bit_cos));
y = (Func == TrigFunction::SinCos) ? pselect(peven, pxor(ysin, sign_bit_sin), pxor(ycos, sign_bit_cos))
: pdiv(pxor(ysin, sign_bit_sin), pxor(ycos, sign_bit_cos));
} else {
y = ComputeSine ? pselect(poly_mask, y2, y1) : pselect(poly_mask, y1, y2);
y = (Func == TrigFunction::Sin) ? pselect(poly_mask, y2, y1) : pselect(poly_mask, y1, y2);
y = pxor(y, sign_bit);
}
// Update the sign and filter huge inputs
return y;
}
template <typename Packet>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet psin_float(const Packet& x) {
return psincos_float<true>(x);
return psincos_float<TrigFunction::Sin>(x);
}
template <typename Packet>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcos_float(const Packet& x) {
return psincos_float<false>(x);
return psincos_float<TrigFunction::Cos>(x);
}
template <typename Packet>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet ptan_float(const Packet& x) {
return psincos_float<TrigFunction::Tan>(x);
}
// Trigonometric argument reduction for double for inputs smaller than 15.
@@ -1007,7 +1018,7 @@ Packet trig_reduce_medium_double(const Packet& x, const Packet& q_high, const Pa
return t;
}
template <bool ComputeSine, typename Packet, bool ComputeBoth = false>
template <TrigFunction Func, typename Packet>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
#if EIGEN_COMP_GNUC_STRICT
__attribute__((optimize("-fno-unsafe-math-optimizations")))
@@ -1094,19 +1105,26 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
Packet sign_sin = pxor(x, preinterpret<Packet>(plogical_shift_left<62>(q_int)));
Packet sign_cos = preinterpret<Packet>(plogical_shift_left<62>(padd(q_int, cst_one)));
Packet sign_bit, sFinalRes;
if (ComputeBoth) {
if (Func == TrigFunction::Sin) {
sign_bit = sign_sin;
sFinalRes = pselect(poly_mask, ssin, scos);
} else if (Func == TrigFunction::Cos) {
sign_bit = sign_cos;
sFinalRes = pselect(poly_mask, scos, ssin);
} else if (Func == TrigFunction::Tan) {
// TODO(rmlarsen): Add single polynomial for tan(x) instead of paying for sin+cos+div.
sign_bit = pxor(sign_sin, sign_cos);
sFinalRes = pdiv(pselect(poly_mask, ssin, scos), pselect(poly_mask, scos, ssin));
} else if (Func == TrigFunction::SinCos) {
Packet peven = peven_mask(x);
sign_bit = pselect((s), sign_sin, sign_cos);
sFinalRes = pselect(pxor(peven, poly_mask), ssin, scos);
} else {
sign_bit = ComputeSine ? sign_sin : sign_cos;
sFinalRes = ComputeSine ? pselect(poly_mask, ssin, scos) : pselect(poly_mask, scos, ssin);
}
sign_bit = pand(sign_bit, cst_sign_mask); // clear all but left most bit
sFinalRes = pxor(sFinalRes, sign_bit);
// If the inputs values are higher than that a value that the argument reduction can currently address, compute them
// using std::sin and std::cos
// using the C++ standard library.
// TODO Remove it when huge angle argument reduction is implemented
if (EIGEN_PREDICT_FALSE(predux_any(pcmp_le(pset1<Packet>(huge_th), x_abs)))) {
const int PacketSize = unpacket_traits<Packet>::size;
@@ -1117,10 +1135,15 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
for (int k = 0; k < PacketSize; ++k) {
double val = x_cpy[k];
if (std::abs(val) > huge_th && (numext::isfinite)(val)) {
if (ComputeBoth)
if (Func == TrigFunction::Sin) {
sincos_vals[k] = std::sin(val);
} else if (Func == TrigFunction::Cos) {
sincos_vals[k] = std::cos(val);
} else if (Func == TrigFunction::Tan) {
sincos_vals[k] = std::tan(val);
} else if (Func == TrigFunction::SinCos) {
sincos_vals[k] = k % 2 == 0 ? std::sin(val) : std::cos(val);
else
sincos_vals[k] = ComputeSine ? std::sin(val) : std::cos(val);
}
}
}
sFinalRes = ploadu<Packet>(sincos_vals);
@@ -1130,26 +1153,31 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
template <typename Packet>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet psin_double(const Packet& x) {
return psincos_double<true>(x);
return psincos_double<TrigFunction::Sin>(x);
}
template <typename Packet>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcos_double(const Packet& x) {
return psincos_double<false>(x);
return psincos_double<TrigFunction::Cos>(x);
}
template <bool ComputeSin, typename Packet>
template <typename Packet>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet ptan_double(const Packet& x) {
return psincos_double<TrigFunction::Tan>(x);
}
template <typename Packet>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
std::enable_if_t<std::is_same<typename unpacket_traits<Packet>::type, float>::value, Packet>
psincos_selector(const Packet& x) {
return psincos_float<ComputeSin, Packet, true>(x);
return psincos_float<TrigFunction::SinCos, Packet>(x);
}
template <bool ComputeSin, typename Packet>
template <typename Packet>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
std::enable_if_t<std::is_same<typename unpacket_traits<Packet>::type, double>::value, Packet>
psincos_selector(const Packet& x) {
return psincos_double<ComputeSin, Packet, true>(x);
return psincos_double<TrigFunction::SinCos, Packet>(x);
}
// Generic implementation of acos(x).
@@ -1599,7 +1627,7 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexp_complex(const Pa
// cis(y):
RealPacket y = pand(odd_mask, a.v);
y = por(y, pcplxflip(Packet(y)).v);
RealPacket cisy = psincos_selector<false, RealPacket>(y);
RealPacket cisy = psincos_selector<RealPacket>(y);
cisy = pcplxflip(Packet(cisy)).v; // cos(y) + i * sin(y)
const RealPacket cst_pos_inf = pset1<RealPacket>(NumTraits<RealScalar>::infinity());

View File

@@ -110,6 +110,10 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet psin_float(const Pack
template <typename Packet>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcos_float(const Packet& x);
/** \internal \returns tan(x) for single precision float */
template <typename Packet>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet ptan_float(const Packet& x);
/** \internal \returns sin(x) for double precision float */
template <typename Packet>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet psin_double(const Packet& x);
@@ -118,6 +122,10 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet psin_double(const Pac
template <typename Packet>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcos_double(const Packet& x);
/** \internal \returns tan(x) for double precision float */
template <typename Packet>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet ptan_double(const Packet& x);
/** \internal \returns asin(x) for single precision float */
template <typename Packet>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pasin_float(const Packet& x);
@@ -200,6 +208,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet generic_round(const Packet& a);
#define EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_FLOAT(PACKET) \
EIGEN_FLOAT_PACKET_FUNCTION(sin, PACKET) \
EIGEN_FLOAT_PACKET_FUNCTION(cos, PACKET) \
EIGEN_FLOAT_PACKET_FUNCTION(tan, PACKET) \
EIGEN_FLOAT_PACKET_FUNCTION(asin, PACKET) \
EIGEN_FLOAT_PACKET_FUNCTION(acos, PACKET) \
EIGEN_FLOAT_PACKET_FUNCTION(tanh, PACKET) \
@@ -216,6 +225,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet generic_round(const Packet& a);
#define EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_DOUBLE(PACKET) \
EIGEN_DOUBLE_PACKET_FUNCTION(sin, PACKET) \
EIGEN_DOUBLE_PACKET_FUNCTION(cos, PACKET) \
EIGEN_DOUBLE_PACKET_FUNCTION(tan, PACKET) \
EIGEN_DOUBLE_PACKET_FUNCTION(log, PACKET) \
EIGEN_DOUBLE_PACKET_FUNCTION(log2, PACKET) \
EIGEN_DOUBLE_PACKET_FUNCTION(exp, PACKET) \

View File

@@ -197,6 +197,7 @@ struct packet_traits<float> : default_packet_traits {
HasDiv = 1,
HasSin = EIGEN_FAST_MATH,
HasCos = EIGEN_FAST_MATH,
HasTan = EIGEN_FAST_MATH,
HasACos = 1,
HasASin = 1,
HasATan = 1,
@@ -5017,6 +5018,7 @@ struct packet_traits<double> : default_packet_traits {
#endif
HasSin = EIGEN_FAST_MATH,
HasCos = EIGEN_FAST_MATH,
HasTan = EIGEN_FAST_MATH,
HasSqrt = 1,
HasRsqrt = 1,
HasCbrt = 1,

View File

@@ -507,6 +507,7 @@ struct packet_traits<float> : default_packet_traits {
HasSin = EIGEN_FAST_MATH,
HasCos = EIGEN_FAST_MATH,
HasTan = EIGEN_FAST_MATH,
HasLog = 1,
HasExp = 1,
HasSqrt = 1,

View File

@@ -183,6 +183,7 @@ struct packet_traits<float> : default_packet_traits {
HasReciprocal = EIGEN_FAST_MATH,
HasSin = EIGEN_FAST_MATH,
HasCos = EIGEN_FAST_MATH,
HasTan = EIGEN_FAST_MATH,
HasACos = 1,
HasASin = 1,
HasATan = 1,
@@ -216,6 +217,7 @@ struct packet_traits<double> : default_packet_traits {
HasDiv = 1,
HasSin = EIGEN_FAST_MATH,
HasCos = EIGEN_FAST_MATH,
HasTan = EIGEN_FAST_MATH,
HasTanh = EIGEN_FAST_MATH,
HasErf = EIGEN_FAST_MATH,
HasErfc = EIGEN_FAST_MATH,

View File

@@ -36,6 +36,11 @@ EIGEN_STRONG_INLINE PacketXf pcos<PacketXf>(const PacketXf& x) {
return pcos_float(x);
}
template <>
EIGEN_STRONG_INLINE PacketXf ptan<PacketXf>(const PacketXf& x) {
return ptan_float(x);
}
// Hyperbolic Tangent function.
template <>
EIGEN_STRONG_INLINE PacketXf ptanh<PacketXf>(const PacketXf& x) {

View File

@@ -353,6 +353,7 @@ struct packet_traits<float> : default_packet_traits {
HasCmp = 1,
HasSin = EIGEN_FAST_MATH,
HasCos = EIGEN_FAST_MATH,
HasTan = EIGEN_FAST_MATH,
HasLog = 1,
HasExp = 1,
HasPow = 1,

View File

@@ -56,6 +56,7 @@ struct generic_float_packet_traits : default_packet_traits {
HasReciprocal = 1,
HasSin = 1,
HasCos = 1,
HasTan = 1,
HasACos = 1,
HasASin = 1,
HasATan = 1,