Add partial vectorization for matrices and tensors of bool. This speeds up boolean operations on Tensors by up to 25x.

Benchmark numbers for the logical AND of two NxN tensors:

name                                               old time/op             new time/op             delta
BM_booleanAnd_1T/3   [using 1 threads]             14.6ns ± 0%             14.4ns ± 0%   -0.96%
BM_booleanAnd_1T/4   [using 1 threads]             20.5ns ±12%              9.0ns ± 0%  -56.07%
BM_booleanAnd_1T/7   [using 1 threads]             41.7ns ± 0%             10.5ns ± 0%  -74.87%
BM_booleanAnd_1T/8   [using 1 threads]             52.1ns ± 0%             10.1ns ± 0%  -80.59%
BM_booleanAnd_1T/10  [using 1 threads]             76.3ns ± 0%             13.8ns ± 0%  -81.87%
BM_booleanAnd_1T/15  [using 1 threads]              167ns ± 0%               16ns ± 0%  -90.45%
BM_booleanAnd_1T/16  [using 1 threads]              188ns ± 0%               16ns ± 0%  -91.57%
BM_booleanAnd_1T/31  [using 1 threads]              667ns ± 0%               34ns ± 0%  -94.83%
BM_booleanAnd_1T/32  [using 1 threads]              710ns ± 0%               35ns ± 0%  -95.01%
BM_booleanAnd_1T/64  [using 1 threads]             2.80µs ± 0%             0.11µs ± 0%  -95.93%
BM_booleanAnd_1T/128 [using 1 threads]             11.2µs ± 0%              0.4µs ± 0%  -96.11%
BM_booleanAnd_1T/256 [using 1 threads]             44.6µs ± 0%              2.5µs ± 0%  -94.31%
BM_booleanAnd_1T/512 [using 1 threads]              178µs ± 0%               10µs ± 0%  -94.35%
BM_booleanAnd_1T/1k  [using 1 threads]              717µs ± 0%               78µs ± 1%  -89.07%
BM_booleanAnd_1T/2k  [using 1 threads]             2.87ms ± 0%             0.31ms ± 1%  -89.08%
BM_booleanAnd_1T/4k  [using 1 threads]             11.7ms ± 0%              1.9ms ± 4%  -83.55%
BM_booleanAnd_1T/10k [using 1 threads]             70.3ms ± 0%             17.2ms ± 4%  -75.48%
This commit is contained in:
Rasmus Munk Larsen
2020-04-20 20:16:28 +00:00
parent 00f6340153
commit 2f6ddaa25c
5 changed files with 147 additions and 63 deletions

View File

@@ -70,6 +70,23 @@ void test_cast() {
test_cast_helper<FromScalar, FromPacket, ToScalar, ToPacket, CanCast>::run();
}
// Runs the CHECK_CWISE2_IF checks for internal::por, internal::pxor and
// internal::pand on the given Scalar/Packet pair, using randomly generated
// input values.
//
// NOTE(review): data2, ref and PacketSize are never referenced directly in
// this body; presumably CHECK_CWISE2_IF picks them up by name -- confirm
// against the macro definition before renaming or removing any of them.
template<typename Scalar,typename Packet> void packetmath_boolean()
{
// Number of scalars held by one Packet.
const int PacketSize = internal::unpacket_traits<Packet>::size;
// The buffers hold two packets' worth of scalars.
const int size = 2*PacketSize;
EIGEN_ALIGN_MAX Scalar data1[size];
EIGEN_ALIGN_MAX Scalar data2[size];
EIGEN_ALIGN_MAX Scalar ref[size];
// Fill every entry of data1 with a random Scalar value.
for (int i=0; i<size; ++i)
{
data1[i] = internal::random<Scalar>();
}
// The first macro argument is the literal true for all three checks
// (presumably an "enabled" condition -- TODO confirm in the macro).
CHECK_CWISE2_IF(true, internal::por, internal::por);
CHECK_CWISE2_IF(true, internal::pxor, internal::pxor);
CHECK_CWISE2_IF(true, internal::pand, internal::pand);
}
template<typename Scalar,typename Packet> void packetmath()
{
typedef internal::packet_traits<Scalar> PacketTraits;
@@ -337,21 +354,6 @@ template<typename Scalar,typename Packet> void packetmath()
VERIFY(test::areApprox(ref, data2, PacketSize) && "internal::pinsertlast");
}
{
for (int i=0; i<PacketSize; ++i)
{
data1[i] = internal::random<Scalar>();
unsigned char v = internal::random<bool>() ? 0xff : 0;
char* bytes = (char*)(data1+PacketSize+i);
for(int k=0; k<int(sizeof(Scalar)); ++k) {
bytes[k] = v;
}
}
CHECK_CWISE2_IF(true, internal::por, internal::por);
CHECK_CWISE2_IF(true, internal::pxor, internal::pxor);
CHECK_CWISE2_IF(true, internal::pand, internal::pand);
CHECK_CWISE2_IF(true, internal::pandnot, internal::pandnot);
}
{
for (int i = 0; i < PacketSize; ++i) {
// "if" mask
@@ -377,8 +379,17 @@ template<typename Scalar,typename Packet> void packetmath()
}
CHECK_CWISE1_IF(PacketTraits::HasSqrt, numext::sqrt, internal::psqrt);
for (int i=0; i<size; ++i)
{
data1[i] = internal::random<Scalar>();
}
CHECK_CWISE2_IF(true, internal::pandnot, internal::pandnot);
packetmath_boolean<Scalar, Packet>();
}
template<typename Scalar,typename Packet> void packetmath_real()
{
typedef internal::packet_traits<Scalar> PacketTraits;
@@ -807,6 +818,9 @@ EIGEN_DECLARE_TEST(packetmath)
CALL_SUBTEST_11( test::runner<std::complex<float> >::run() );
CALL_SUBTEST_12( test::runner<std::complex<double> >::run() );
CALL_SUBTEST_13(( packetmath<half,internal::packet_traits<half>::type>() ));
#ifdef EIGEN_PACKET_MATH_SSE_H
CALL_SUBTEST_14(( packetmath_boolean<bool,internal::packet_traits<bool>::type>() ));
#endif
g_first_pass = false;
}
}