remove denormal flushing in fp32tobf16 for avx & avx512

(cherry picked from commit e6a5a594a7)
This commit is contained in:
Gauri Deshpande
2021-08-09 22:15:21 +00:00
committed by Rasmus Munk Larsen
parent 4e0357c6dd
commit 93bff85a42
4 changed files with 8 additions and 117 deletions

View File

@@ -32,18 +32,6 @@ float BinaryToFloat(uint32_t sign, uint32_t exponent, uint32_t high_mantissa,
return dest;
}
void test_truncate(float input, float expected_truncation, float expected_rounding){
bfloat16 truncated = Eigen::bfloat16_impl::truncate_to_bfloat16(input);
bfloat16 rounded = Eigen::bfloat16_impl::float_to_bfloat16_rtne<false>(input);
if ((numext::isnan)(input)){
VERIFY((numext::isnan)(static_cast<float>(truncated)) || (numext::isinf)(static_cast<float>(truncated)));
VERIFY((numext::isnan)(static_cast<float>(rounded)) || (numext::isinf)(static_cast<float>(rounded)));
return;
}
VERIFY_IS_EQUAL(expected_truncation, static_cast<float>(truncated));
VERIFY_IS_EQUAL(expected_rounding, static_cast<float>(rounded));
}
template<typename T>
void test_roundtrip() {
// Representable T round trip via bfloat16
@@ -122,31 +110,6 @@ void test_conversion()
VERIFY_BFLOAT16_BITS_EQUAL(bfloat16(0.0f), 0x0000);
VERIFY_BFLOAT16_BITS_EQUAL(bfloat16(-0.0f), 0x8000);
// Flush denormals to zero
for (float denorm = -std::numeric_limits<float>::denorm_min();
denorm < std::numeric_limits<float>::denorm_min();
denorm = nextafterf(denorm, 1.0f)) {
bfloat16 bf_trunc = Eigen::bfloat16_impl::truncate_to_bfloat16(denorm);
VERIFY_IS_EQUAL(static_cast<float>(bf_trunc), 0.0f);
// Implicit conversion of denormals to bool is correct
VERIFY_IS_EQUAL(static_cast<bool>(bfloat16(denorm)), false);
VERIFY_IS_EQUAL(bfloat16(denorm), false);
if (std::signbit(denorm)) {
VERIFY_BFLOAT16_BITS_EQUAL(bf_trunc, 0x8000);
} else {
VERIFY_BFLOAT16_BITS_EQUAL(bf_trunc, 0x0000);
}
bfloat16 bf_round = Eigen::bfloat16_impl::float_to_bfloat16_rtne<false>(denorm);
VERIFY_IS_EQUAL(static_cast<float>(bf_round), 0.0f);
if (std::signbit(denorm)) {
VERIFY_BFLOAT16_BITS_EQUAL(bf_round, 0x8000);
} else {
VERIFY_BFLOAT16_BITS_EQUAL(bf_round, 0x0000);
}
}
// Default is zero
VERIFY_IS_EQUAL(static_cast<float>(bfloat16()), 0.0f);
@@ -156,52 +119,6 @@ void test_conversion()
test_roundtrip<std::complex<float> >();
test_roundtrip<std::complex<double> >();
// Truncate test
test_truncate(
BinaryToFloat(0, 0x80, 0x48, 0xf5c3),
BinaryToFloat(0, 0x80, 0x48, 0x0000),
BinaryToFloat(0, 0x80, 0x49, 0x0000));
test_truncate(
BinaryToFloat(1, 0x80, 0x48, 0xf5c3),
BinaryToFloat(1, 0x80, 0x48, 0x0000),
BinaryToFloat(1, 0x80, 0x49, 0x0000));
test_truncate(
BinaryToFloat(0, 0x80, 0x48, 0x8000),
BinaryToFloat(0, 0x80, 0x48, 0x0000),
BinaryToFloat(0, 0x80, 0x48, 0x0000));
test_truncate(
BinaryToFloat(0, 0xff, 0x00, 0x0001),
BinaryToFloat(0, 0xff, 0x40, 0x0000),
BinaryToFloat(0, 0xff, 0x40, 0x0000));
test_truncate(
BinaryToFloat(0, 0xff, 0x7f, 0xffff),
BinaryToFloat(0, 0xff, 0x40, 0x0000),
BinaryToFloat(0, 0xff, 0x40, 0x0000));
test_truncate(
BinaryToFloat(1, 0x80, 0x48, 0xc000),
BinaryToFloat(1, 0x80, 0x48, 0x0000),
BinaryToFloat(1, 0x80, 0x49, 0x0000));
test_truncate(
BinaryToFloat(0, 0x80, 0x48, 0x0000),
BinaryToFloat(0, 0x80, 0x48, 0x0000),
BinaryToFloat(0, 0x80, 0x48, 0x0000));
test_truncate(
BinaryToFloat(0, 0x80, 0x48, 0x4000),
BinaryToFloat(0, 0x80, 0x48, 0x0000),
BinaryToFloat(0, 0x80, 0x48, 0x0000));
test_truncate(
BinaryToFloat(0, 0x80, 0x48, 0x8000),
BinaryToFloat(0, 0x80, 0x48, 0x0000),
BinaryToFloat(0, 0x80, 0x48, 0x0000));
test_truncate(
BinaryToFloat(0, 0x00, 0x48, 0x8000),
BinaryToFloat(0, 0x00, 0x00, 0x0000),
BinaryToFloat(0, 0x00, 0x00, 0x0000));
test_truncate(
BinaryToFloat(0, 0x00, 0x7f, 0xc000),
BinaryToFloat(0, 0x00, 0x00, 0x0000),
BinaryToFloat(0, 0x00, 0x00, 0x0000));
// Conversion
Array<float,1,100> a;
for (int i = 0; i < 100; i++) a(i) = i + 1.25;
@@ -250,12 +167,6 @@ void test_conversion()
VERIFY_BFLOAT16_BITS_EQUAL(bfloat16(BinaryToFloat(0x0, 0xff, 0x40, 0x0)), 0x7fc0);
VERIFY_BFLOAT16_BITS_EQUAL(bfloat16(BinaryToFloat(0x1, 0xff, 0x40, 0x0)), 0xffc0);
VERIFY_BFLOAT16_BITS_EQUAL(Eigen::bfloat16_impl::truncate_to_bfloat16(
BinaryToFloat(0x0, 0xff, 0x40, 0x0)),
0x7fc0);
VERIFY_BFLOAT16_BITS_EQUAL(Eigen::bfloat16_impl::truncate_to_bfloat16(
BinaryToFloat(0x1, 0xff, 0x40, 0x0)),
0xffc0);
}
void test_numtraits()