mirror of
https://gitlab.com/libeigen/eigen.git
synced 2026-04-10 11:34:33 +08:00
remove denormal flushing in fp32tobf16 for avx & avx512
(cherry picked from commit e6a5a594a7)
This commit is contained in:
committed by
Rasmus Munk Larsen
parent
4e0357c6dd
commit
93bff85a42
@@ -32,18 +32,6 @@ float BinaryToFloat(uint32_t sign, uint32_t exponent, uint32_t high_mantissa,
|
||||
return dest;
|
||||
}
|
||||
|
||||
// Checks both float -> bfloat16 conversion paths for a single input value.
//
// `input` is converted once with truncate_to_bfloat16 and once with
// float_to_bfloat16_rtne<false>. Each result is cast back to float and must
// equal `expected_truncation` and `expected_rounding` respectively.
//
// For a NaN input the expected values are not compared: each converted
// result only has to be NaN or infinity.
void test_truncate(float input, float expected_truncation, float expected_rounding){
|
||||
bfloat16 truncated = Eigen::bfloat16_impl::truncate_to_bfloat16(input);
|
||||
bfloat16 rounded = Eigen::bfloat16_impl::float_to_bfloat16_rtne<false>(input);
|
||||
// NaN never compares equal, so it is handled separately from the equality checks below.
if ((numext::isnan)(input)){
|
||||
VERIFY((numext::isnan)(static_cast<float>(truncated)) || (numext::isinf)(static_cast<float>(truncated)));
|
||||
VERIFY((numext::isnan)(static_cast<float>(rounded)) || (numext::isinf)(static_cast<float>(rounded)));
|
||||
return;
|
||||
}
|
||||
// Non-NaN input: both conversions must reproduce the expected values exactly.
VERIFY_IS_EQUAL(expected_truncation, static_cast<float>(truncated));
|
||||
VERIFY_IS_EQUAL(expected_rounding, static_cast<float>(rounded));
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
void test_roundtrip() {
|
||||
// Representable T round trip via bfloat16
|
||||
@@ -122,31 +110,6 @@ void test_conversion()
|
||||
VERIFY_BFLOAT16_BITS_EQUAL(bfloat16(0.0f), 0x0000);
|
||||
VERIFY_BFLOAT16_BITS_EQUAL(bfloat16(-0.0f), 0x8000);
|
||||
|
||||
// Flush denormals to zero
|
||||
for (float denorm = -std::numeric_limits<float>::denorm_min();
|
||||
denorm < std::numeric_limits<float>::denorm_min();
|
||||
denorm = nextafterf(denorm, 1.0f)) {
|
||||
bfloat16 bf_trunc = Eigen::bfloat16_impl::truncate_to_bfloat16(denorm);
|
||||
VERIFY_IS_EQUAL(static_cast<float>(bf_trunc), 0.0f);
|
||||
|
||||
// Implicit conversion of denormals to bool is correct
|
||||
VERIFY_IS_EQUAL(static_cast<bool>(bfloat16(denorm)), false);
|
||||
VERIFY_IS_EQUAL(bfloat16(denorm), false);
|
||||
|
||||
if (std::signbit(denorm)) {
|
||||
VERIFY_BFLOAT16_BITS_EQUAL(bf_trunc, 0x8000);
|
||||
} else {
|
||||
VERIFY_BFLOAT16_BITS_EQUAL(bf_trunc, 0x0000);
|
||||
}
|
||||
bfloat16 bf_round = Eigen::bfloat16_impl::float_to_bfloat16_rtne<false>(denorm);
|
||||
VERIFY_IS_EQUAL(static_cast<float>(bf_round), 0.0f);
|
||||
if (std::signbit(denorm)) {
|
||||
VERIFY_BFLOAT16_BITS_EQUAL(bf_round, 0x8000);
|
||||
} else {
|
||||
VERIFY_BFLOAT16_BITS_EQUAL(bf_round, 0x0000);
|
||||
}
|
||||
}
|
||||
|
||||
// Default is zero
|
||||
VERIFY_IS_EQUAL(static_cast<float>(bfloat16()), 0.0f);
|
||||
|
||||
@@ -156,52 +119,6 @@ void test_conversion()
|
||||
test_roundtrip<std::complex<float> >();
|
||||
test_roundtrip<std::complex<double> >();
|
||||
|
||||
// Truncate test
|
||||
test_truncate(
|
||||
BinaryToFloat(0, 0x80, 0x48, 0xf5c3),
|
||||
BinaryToFloat(0, 0x80, 0x48, 0x0000),
|
||||
BinaryToFloat(0, 0x80, 0x49, 0x0000));
|
||||
test_truncate(
|
||||
BinaryToFloat(1, 0x80, 0x48, 0xf5c3),
|
||||
BinaryToFloat(1, 0x80, 0x48, 0x0000),
|
||||
BinaryToFloat(1, 0x80, 0x49, 0x0000));
|
||||
test_truncate(
|
||||
BinaryToFloat(0, 0x80, 0x48, 0x8000),
|
||||
BinaryToFloat(0, 0x80, 0x48, 0x0000),
|
||||
BinaryToFloat(0, 0x80, 0x48, 0x0000));
|
||||
test_truncate(
|
||||
BinaryToFloat(0, 0xff, 0x00, 0x0001),
|
||||
BinaryToFloat(0, 0xff, 0x40, 0x0000),
|
||||
BinaryToFloat(0, 0xff, 0x40, 0x0000));
|
||||
test_truncate(
|
||||
BinaryToFloat(0, 0xff, 0x7f, 0xffff),
|
||||
BinaryToFloat(0, 0xff, 0x40, 0x0000),
|
||||
BinaryToFloat(0, 0xff, 0x40, 0x0000));
|
||||
test_truncate(
|
||||
BinaryToFloat(1, 0x80, 0x48, 0xc000),
|
||||
BinaryToFloat(1, 0x80, 0x48, 0x0000),
|
||||
BinaryToFloat(1, 0x80, 0x49, 0x0000));
|
||||
test_truncate(
|
||||
BinaryToFloat(0, 0x80, 0x48, 0x0000),
|
||||
BinaryToFloat(0, 0x80, 0x48, 0x0000),
|
||||
BinaryToFloat(0, 0x80, 0x48, 0x0000));
|
||||
test_truncate(
|
||||
BinaryToFloat(0, 0x80, 0x48, 0x4000),
|
||||
BinaryToFloat(0, 0x80, 0x48, 0x0000),
|
||||
BinaryToFloat(0, 0x80, 0x48, 0x0000));
|
||||
test_truncate(
|
||||
BinaryToFloat(0, 0x80, 0x48, 0x8000),
|
||||
BinaryToFloat(0, 0x80, 0x48, 0x0000),
|
||||
BinaryToFloat(0, 0x80, 0x48, 0x0000));
|
||||
test_truncate(
|
||||
BinaryToFloat(0, 0x00, 0x48, 0x8000),
|
||||
BinaryToFloat(0, 0x00, 0x00, 0x0000),
|
||||
BinaryToFloat(0, 0x00, 0x00, 0x0000));
|
||||
test_truncate(
|
||||
BinaryToFloat(0, 0x00, 0x7f, 0xc000),
|
||||
BinaryToFloat(0, 0x00, 0x00, 0x0000),
|
||||
BinaryToFloat(0, 0x00, 0x00, 0x0000));
|
||||
|
||||
// Conversion
|
||||
Array<float,1,100> a;
|
||||
for (int i = 0; i < 100; i++) a(i) = i + 1.25;
|
||||
@@ -250,12 +167,6 @@ void test_conversion()
|
||||
|
||||
VERIFY_BFLOAT16_BITS_EQUAL(bfloat16(BinaryToFloat(0x0, 0xff, 0x40, 0x0)), 0x7fc0);
|
||||
VERIFY_BFLOAT16_BITS_EQUAL(bfloat16(BinaryToFloat(0x1, 0xff, 0x40, 0x0)), 0xffc0);
|
||||
VERIFY_BFLOAT16_BITS_EQUAL(Eigen::bfloat16_impl::truncate_to_bfloat16(
|
||||
BinaryToFloat(0x0, 0xff, 0x40, 0x0)),
|
||||
0x7fc0);
|
||||
VERIFY_BFLOAT16_BITS_EQUAL(Eigen::bfloat16_impl::truncate_to_bfloat16(
|
||||
BinaryToFloat(0x1, 0xff, 0x40, 0x0)),
|
||||
0xffc0);
|
||||
}
|
||||
|
||||
void test_numtraits()
|
||||
|
||||
Reference in New Issue
Block a user