Implement vectorized complex square root.

Closes #1905

Measured speedup for sqrt of `complex<float>` on Skylake:

SSE:
```
name                      old time/op             new time/op  delta
BM_eigen_sqrt_ctype/1     49.4ns ± 0%             54.3ns ± 0%  +10.01%
BM_eigen_sqrt_ctype/8      332ns ± 0%               50ns ± 1%  -84.97%
BM_eigen_sqrt_ctype/64    2.81µs ± 1%             0.38µs ± 0%  -86.49%
BM_eigen_sqrt_ctype/512   23.8µs ± 0%              3.0µs ± 0%  -87.32%
BM_eigen_sqrt_ctype/4k     202µs ± 0%               24µs ± 2%  -88.03%
BM_eigen_sqrt_ctype/32k   1.63ms ± 0%             0.19ms ± 0%  -88.18%
BM_eigen_sqrt_ctype/256k  13.0ms ± 0%              1.5ms ± 1%  -88.20%
BM_eigen_sqrt_ctype/1M    52.1ms ± 0%              6.2ms ± 0%  -88.18%
```

AVX2:
```
name                      old cpu/op  new cpu/op  delta
BM_eigen_sqrt_ctype/1     53.6ns ± 0%  55.6ns ± 0%   +3.71%
BM_eigen_sqrt_ctype/8      334ns ± 0%    27ns ± 0%  -91.86%
BM_eigen_sqrt_ctype/64    2.79µs ± 0%  0.22µs ± 2%  -92.28%
BM_eigen_sqrt_ctype/512   23.8µs ± 1%   1.7µs ± 1%  -92.81%
BM_eigen_sqrt_ctype/4k     201µs ± 0%    14µs ± 1%  -93.24%
BM_eigen_sqrt_ctype/32k   1.62ms ± 0%  0.11ms ± 1%  -93.29%
BM_eigen_sqrt_ctype/256k  13.0ms ± 0%   0.9ms ± 1%  -93.31%
BM_eigen_sqrt_ctype/1M    52.0ms ± 0%   3.5ms ± 1%  -93.31%
```

AVX512:
```
name                      old cpu/op  new cpu/op  delta
BM_eigen_sqrt_ctype/1     53.7ns ± 0%  56.2ns ± 1%   +4.75%
BM_eigen_sqrt_ctype/8      334ns ± 0%    18ns ± 2%  -94.63%
BM_eigen_sqrt_ctype/64    2.79µs ± 0%  0.12µs ± 1%  -95.54%
BM_eigen_sqrt_ctype/512   23.9µs ± 1%   1.0µs ± 1%  -95.89%
BM_eigen_sqrt_ctype/4k     202µs ± 0%     8µs ± 1%  -96.13%
BM_eigen_sqrt_ctype/32k   1.63ms ± 0%  0.06ms ± 1%  -96.15%
BM_eigen_sqrt_ctype/256k  13.0ms ± 0%   0.5ms ± 4%  -96.11%
BM_eigen_sqrt_ctype/1M    52.1ms ± 0%   2.0ms ± 1%  -96.13%
```
This commit is contained in:
Rasmus Munk Larsen
2020-12-08 18:13:35 -08:00
parent 8cfe0db108
commit 125cc9a5df
10 changed files with 290 additions and 11 deletions

View File

@@ -473,8 +473,6 @@ void packetmath() {
CHECK_CWISE3_IF(true, internal::pselect, internal::pselect);
}
CHECK_CWISE1_IF(PacketTraits::HasSqrt, numext::sqrt, internal::psqrt);
for (int i = 0; i < size; ++i) {
data1[i] = internal::random<Scalar>();
}
@@ -486,6 +484,11 @@ void packetmath() {
packetmath_boolean_mask_ops<Scalar, Packet>();
packetmath_pcast_ops_runner<Scalar, Packet>::run();
packetmath_minus_zero_add<Scalar, Packet>();
for (int i = 0; i < size; ++i) {
data1[i] = numext::abs(internal::random<Scalar>());
}
CHECK_CWISE1_IF(PacketTraits::HasSqrt, numext::sqrt, internal::psqrt);
}
// Notice that this definition works for complex types as well.
@@ -899,6 +902,8 @@ void test_conj_helper(Scalar* data1, Scalar* data2, Scalar* ref, Scalar* pval) {
template <typename Scalar, typename Packet>
void packetmath_complex() {
typedef internal::packet_traits<Scalar> PacketTraits;
typedef typename Scalar::value_type RealScalar;
const int PacketSize = internal::unpacket_traits<Packet>::size;
const int size = PacketSize * 4;
@@ -917,11 +922,55 @@ void packetmath_complex() {
test_conj_helper<Scalar, Packet, true, false>(data1, data2, ref, pval);
test_conj_helper<Scalar, Packet, true, true>(data1, data2, ref, pval);
// Test pcplxflip.
{
for (int i = 0; i < PacketSize; ++i) ref[i] = Scalar(std::imag(data1[i]), std::real(data1[i]));
internal::pstore(pval, internal::pcplxflip(internal::pload<Packet>(data1)));
VERIFY(test::areApprox(ref, pval, PacketSize) && "pcplxflip");
}
if (PacketTraits::HasSqrt) {
for (int i = 0; i < size; ++i) {
data1[i] = Scalar(internal::random<RealScalar>(), internal::random<RealScalar>());
}
CHECK_CWISE1(numext::sqrt, internal::psqrt);
// Test misc. corner cases.
const RealScalar zero = RealScalar(0);
const RealScalar one = RealScalar(1);
const RealScalar inf = std::numeric_limits<RealScalar>::infinity();
const RealScalar nan = std::numeric_limits<RealScalar>::quiet_NaN();
data1[0] = Scalar(zero, zero);
data1[1] = Scalar(-zero, zero);
data1[2] = Scalar(one, zero);
data1[3] = Scalar(zero, one);
CHECK_CWISE1(numext::sqrt, internal::psqrt);
data1[0] = Scalar(-one, zero);
data1[1] = Scalar(zero, -one);
data1[2] = Scalar(one, one);
data1[3] = Scalar(-one, -one);
CHECK_CWISE1(numext::sqrt, internal::psqrt);
data1[0] = Scalar(inf, zero);
data1[1] = Scalar(zero, inf);
data1[2] = Scalar(-inf, zero);
data1[3] = Scalar(zero, -inf);
CHECK_CWISE1(numext::sqrt, internal::psqrt);
data1[0] = Scalar(inf, inf);
data1[1] = Scalar(-inf, inf);
data1[2] = Scalar(inf, -inf);
data1[3] = Scalar(-inf, -inf);
CHECK_CWISE1(numext::sqrt, internal::psqrt);
data1[0] = Scalar(nan, zero);
data1[1] = Scalar(zero, nan);
data1[2] = Scalar(nan, one);
data1[3] = Scalar(one, nan);
CHECK_CWISE1(numext::sqrt, internal::psqrt);
data1[0] = Scalar(nan, nan);
data1[1] = Scalar(inf, nan);
data1[2] = Scalar(nan, inf);
data1[3] = Scalar(-inf, nan);
CHECK_CWISE1(numext::sqrt, internal::psqrt);
}
}
template <typename Scalar, typename Packet>