Add missing packet ops for bool, and make it pass the same packet op unit tests as other arithmetic types.

This change also contains a few minor cleanups:
  1. Remove packet op pnot, which is not needed for anything other than pcmp_le_or_nan,
     which can be done in other ways.
  2. Remove the "HasInsert" enum, which is no longer needed since we removed the
     corresponding packet ops.
  3. Add faster pselect op for Packet4i when SSE4.1 is supported.

Among other things, this makes the fast transposeInPlace() method available for Matrix<bool>.

Run on ************** (72 X 2994 MHz CPUs); 2020-05-09T10:51:02.372347913-07:00
CPU: Intel Skylake Xeon with HyperThreading (36 cores) dL1:32KB dL2:1024KB dL3:24MB
Benchmark                        Time(ns)        CPU(ns)     Iterations
-----------------------------------------------------------------------
BM_TransposeInPlace<float>/4            9.77           9.77    71670320
BM_TransposeInPlace<float>/8           21.9           21.9     31929525
BM_TransposeInPlace<float>/16          66.6           66.6     10000000
BM_TransposeInPlace<float>/32         243            243        2879561
BM_TransposeInPlace<float>/59         844            844         829767
BM_TransposeInPlace<float>/64         933            933         750567
BM_TransposeInPlace<float>/128       3944           3945         177405
BM_TransposeInPlace<float>/256      16853          16853          41457
BM_TransposeInPlace<float>/512     204952         204968           3448
BM_TransposeInPlace<float>/1k     1053889        1053861            664
BM_TransposeInPlace<bool>/4            14.4           14.4     48637301
BM_TransposeInPlace<bool>/8            36.0           36.0     19370222
BM_TransposeInPlace<bool>/16           31.5           31.5     22178902
BM_TransposeInPlace<bool>/32          111            111        6272048
BM_TransposeInPlace<bool>/59          626            626        1000000
BM_TransposeInPlace<bool>/64          428            428        1632689
BM_TransposeInPlace<bool>/128        1677           1677         417377
BM_TransposeInPlace<bool>/256        7126           7126          96264
BM_TransposeInPlace<bool>/512       29021          29024          24165
BM_TransposeInPlace<bool>/1k       116321         116330           6068
This commit is contained in:
Rasmus Munk Larsen
2020-05-11 13:23:31 -07:00
parent d640276d31
commit 9b411757ab
9 changed files with 247 additions and 156 deletions

View File

@@ -10,11 +10,25 @@
#include "packetmath_test_shared.h"
#define REF_ADD(a,b) ((a)+(b))
#define REF_SUB(a,b) ((a)-(b))
#define REF_MUL(a,b) ((a)*(b))
#define REF_DIV(a,b) ((a)/(b))
#define REF_ABS_DIFF(a,b) ((a)>(b)?(a)-(b):(b)-(a))
template <typename T>
inline T REF_ADD(const T& a, const T& b) { return a + b;}
template <typename T>
inline T REF_SUB(const T& a, const T& b) { return a - b;}
template <typename T>
inline T REF_MUL(const T& a, const T& b) { return a * b;}
template <typename T>
inline T REF_DIV(const T& a, const T& b) { return a / b;}
template <typename T>
inline T REF_ABS_DIFF(const T& a, const T& b) { return a>b ? a - b : b-a;}
// Specializations for bool
template <>
inline bool REF_ADD(const bool& a, const bool& b) { return a || b;}
template <>
inline bool REF_SUB(const bool& a, const bool& b) { return a ^ b;}
template <>
inline bool REF_MUL(const bool& a, const bool& b) { return a && b;}
template<typename FromScalar, typename FromPacket, typename ToScalar, typename ToPacket, bool CanCast = false>
struct test_cast_helper;
@@ -70,7 +84,8 @@ void test_cast() {
test_cast_helper<FromScalar, FromPacket, ToScalar, ToPacket, CanCast>::run();
}
template<typename Scalar,typename Packet> void packetmath_boolean()
template<typename Scalar,typename Packet>
void packetmath_boolean_mask_ops()
{
const int PacketSize = internal::unpacket_traits<Packet>::size;
const int size = 2*PacketSize;
@@ -82,9 +97,18 @@ template<typename Scalar,typename Packet> void packetmath_boolean()
{
data1[i] = internal::random<Scalar>();
}
CHECK_CWISE2_IF(true, internal::por, internal::por);
CHECK_CWISE2_IF(true, internal::pxor, internal::pxor);
CHECK_CWISE2_IF(true, internal::pand, internal::pand);
CHECK_CWISE1(internal::ptrue, internal::ptrue);
CHECK_CWISE2_IF(true, internal::pandnot, internal::pandnot);
for (int i = 0; i < PacketSize; ++i) {
data1[i] = Scalar(i);
data1[i + PacketSize] = internal::random<bool>() ? data1[i] : Scalar(0);
}
CHECK_CWISE2_IF(true, internal::pcmp_eq, internal::pcmp_eq);
}
template<>
void packetmath_boolean_mask_ops<bool, internal::Packet16b>()
{
}
template<typename Scalar,typename Packet> void packetmath()
@@ -171,9 +195,6 @@ template<typename Scalar,typename Packet> void packetmath()
CHECK_CWISE2_IF(PacketTraits::HasMul, REF_MUL, internal::pmul);
CHECK_CWISE2_IF(PacketTraits::HasDiv, REF_DIV, internal::pdiv);
CHECK_CWISE1(internal::pnot, internal::pnot);
CHECK_CWISE1(internal::pzero, internal::pzero);
CHECK_CWISE1(internal::ptrue, internal::ptrue);
if (PacketTraits::HasNegate)
CHECK_CWISE1(internal::negate, internal::pnegate);
CHECK_CWISE1(numext::conj, internal::pconj);
@@ -252,7 +273,7 @@ template<typename Scalar,typename Packet> void packetmath()
ref[0] = Scalar(1);
for (int i=0; i<PacketSize; ++i)
ref[0] *= data1[i];
ref[0] = REF_MUL(ref[0], data1[i]);
VERIFY(internal::isApprox(ref[0], internal::predux_mul(internal::pload<Packet>(data1))) && "internal::predux_mul");
for (int i=0; i<PacketSize; ++i)
@@ -272,6 +293,7 @@ template<typename Scalar,typename Packet> void packetmath()
}
}
if (PacketTraits::HasBlend) {
Packet thenPacket = internal::pload<Packet>(data1);
Packet elsePacket = internal::pload<Packet>(data2);
@@ -304,26 +326,22 @@ template<typename Scalar,typename Packet> void packetmath()
CHECK_CWISE3_IF(true, internal::pselect, internal::pselect);
}
{
for (int i = 0; i < PacketSize; ++i) {
data1[i] = Scalar(i);
data1[i + PacketSize] = internal::random<bool>() ? data1[i] : Scalar(0);
}
CHECK_CWISE2_IF(true, internal::pcmp_eq, internal::pcmp_eq);
}
CHECK_CWISE1_IF(PacketTraits::HasSqrt, numext::sqrt, internal::psqrt);
for (int i=0; i<size; ++i)
{
data1[i] = internal::random<Scalar>();
}
CHECK_CWISE2_IF(true, internal::pandnot, internal::pandnot);
CHECK_CWISE1(internal::pzero, internal::pzero);
CHECK_CWISE2_IF(true, internal::por, internal::por);
CHECK_CWISE2_IF(true, internal::pxor, internal::pxor);
CHECK_CWISE2_IF(true, internal::pand, internal::pand);
packetmath_boolean<Scalar, Packet>();
packetmath_boolean_mask_ops<Scalar, Packet>();
}
template<typename Scalar,typename Packet> void packetmath_real()
{
typedef internal::packet_traits<Scalar> PacketTraits;
@@ -753,7 +771,7 @@ EIGEN_DECLARE_TEST(packetmath)
CALL_SUBTEST_12( test::runner<std::complex<double> >::run() );
CALL_SUBTEST_13(( packetmath<half,internal::packet_traits<half>::type>() ));
#ifdef EIGEN_PACKET_MATH_SSE_H
CALL_SUBTEST_14(( packetmath_boolean<bool,internal::packet_traits<bool>::type>() ));
CALL_SUBTEST_14(( packetmath<bool,internal::packet_traits<bool>::type>() ));
#endif
g_first_pass = false;
}