bug #1652: implements a much more accurate version of vectorized sin/cos. This new version achieve same speed for SSE/AVX, and is slightly faster with FMA. Guarantees are as follows:

- no FMA: 1ULP up to 3pi, 2ULP up to sin(25966) and cos(18838), fallback to std::sin/cos for larger inputs
  - FMA: 1ULP up to sin(117435.992) and cos(71476.0625), fallback to std::sin/cos for larger inputs
This commit is contained in:
Gael Guennebaud
2019-01-09 15:25:17 +01:00
parent e70ffef967
commit e6b217b8dd
5 changed files with 181 additions and 75 deletions

View File

@@ -568,17 +568,22 @@ template<typename Scalar,typename Packet> void packetmath_real()
if(PacketTraits::HasCos)
{
packet_helper<PacketTraits::HasCos,Packet> h;
for(Scalar k = 1; k<Scalar(1000)/std::numeric_limits<Scalar>::epsilon(); k*=2) {
data1[0] = k*Scalar(EIGEN_PI) * internal::random<Scalar>(0.8,1.2);
data1[1] = (k+1)*Scalar(EIGEN_PI) * internal::random<Scalar>(0.8,1.2);
h.store(data2, internal::pcos(h.load(data1)));
VERIFY(data2[0]<=Scalar(1.) && data2[0]>=Scalar(-1.));
VERIFY(data2[1]<=Scalar(1.) && data2[1]>=Scalar(-1.));
data1[0] = (2*k+1)*Scalar(EIGEN_PI)/2 * internal::random<Scalar>(0.8,1.2);
data1[1] = (2*k+3)*Scalar(EIGEN_PI)/2 * internal::random<Scalar>(0.8,1.2);
h.store(data2, internal::psin(h.load(data1)));
VERIFY(data2[0]<=Scalar(1.) && data2[0]>=Scalar(-1.));
VERIFY(data2[1]<=Scalar(1.) && data2[1]>=Scalar(-1.));
for(Scalar k = 1; k<Scalar(10000)/std::numeric_limits<Scalar>::epsilon(); k*=2)
{
for(int k1=0;k1<=1; ++k1)
{
data1[0] = (2*k+k1 )*Scalar(EIGEN_PI)/2 * internal::random<Scalar>(0.8,1.2);
data1[1] = (2*k+2+k1)*Scalar(EIGEN_PI)/2 * internal::random<Scalar>(0.8,1.2);
h.store(data2, internal::pcos(h.load(data1)));
h.store(data2+PacketSize, internal::psin(h.load(data1)));
VERIFY(data2[0]<=Scalar(1.) && data2[0]>=Scalar(-1.));
VERIFY(data2[1]<=Scalar(1.) && data2[1]>=Scalar(-1.));
VERIFY(data2[PacketSize+0]<=Scalar(1.) && data2[PacketSize+0]>=Scalar(-1.));
VERIFY(data2[PacketSize+1]<=Scalar(1.) && data2[PacketSize+1]>=Scalar(-1.));
VERIFY_IS_APPROX(numext::abs2(data2[0])+numext::abs2(data2[PacketSize+0]), Scalar(1));
VERIFY_IS_APPROX(numext::abs2(data2[1])+numext::abs2(data2[PacketSize+1]), Scalar(1));
}
}
data1[0] = std::numeric_limits<Scalar>::infinity();
@@ -596,6 +601,12 @@ template<typename Scalar,typename Packet> void packetmath_real()
VERIFY((numext::isnan)(data2[0]));
h.store(data2, internal::pcos(h.load(data1)));
VERIFY((numext::isnan)(data2[0]));
data1[0] = -Scalar(0.);
h.store(data2, internal::psin(h.load(data1)));
VERIFY( internal::biteq(data2[0], data1[0]) );
h.store(data2, internal::pcos(h.load(data1)));
VERIFY_IS_EQUAL(data2[0], Scalar(1));
}
}
}
@@ -633,6 +644,29 @@ template<typename Scalar,typename Packet> void packetmath_notcomplex()
ref[i] = data1[0]+Scalar(i);
internal::pstore(data2, internal::plset<Packet>(data1[0]));
VERIFY(areApprox(ref, data2, PacketSize) && "internal::plset");
{
unsigned char* data1_bits = reinterpret_cast<unsigned char*>(data1);
// predux_all - not needed yet
// for (unsigned int i=0; i<PacketSize*sizeof(Scalar); ++i) data1_bits[i] = 0xff;
// VERIFY(internal::predux_all(internal::pload<Packet>(data1)) && "internal::predux_all(1111)");
// for(int k=0; k<PacketSize; ++k)
// {
// for (unsigned int i=0; i<sizeof(Scalar); ++i) data1_bits[k*sizeof(Scalar)+i] = 0x0;
// VERIFY( (!internal::predux_all(internal::pload<Packet>(data1))) && "internal::predux_all(0101)");
// for (unsigned int i=0; i<sizeof(Scalar); ++i) data1_bits[k*sizeof(Scalar)+i] = 0xff;
// }
// predux_any
for (unsigned int i=0; i<PacketSize*sizeof(Scalar); ++i) data1_bits[i] = 0x0;
VERIFY( (!internal::predux_any(internal::pload<Packet>(data1))) && "internal::predux_any(0000)");
for(int k=0; k<PacketSize; ++k)
{
for (unsigned int i=0; i<sizeof(Scalar); ++i) data1_bits[k*sizeof(Scalar)+i] = 0xff;
VERIFY( internal::predux_any(internal::pload<Packet>(data1)) && "internal::predux_any(0101)");
for (unsigned int i=0; i<sizeof(Scalar); ++i) data1_bits[k*sizeof(Scalar)+i] = 0x00;
}
}
}
template<typename Scalar,typename Packet,bool ConjLhs,bool ConjRhs> void test_conj_helper(Scalar* data1, Scalar* data2, Scalar* ref, Scalar* pval)