Remove denormal flushing in fp32tobf16 for AVX & AVX512

(cherry picked from commit e6a5a594a7)
2026-04-10 11:34:33 +08:00 · 2021-08-09 22:15:21 +00:00
parent 4e0357c6dd
commit 93bff85a42
4 changed files with 8 additions and 117 deletions
--- a/test/bfloat16_float.cpp
+++ b/test/bfloat16_float.cpp
@@ -32,18 +32,6 @@ float BinaryToFloat(uint32_t sign, uint32_t exponent, uint32_t high_mantissa,
  return dest;
 }

-void test_truncate(float input, float expected_truncation, float expected_rounding){
-  bfloat16 truncated = Eigen::bfloat16_impl::truncate_to_bfloat16(input);
-  bfloat16 rounded = Eigen::bfloat16_impl::float_to_bfloat16_rtne<false>(input);
-  if ((numext::isnan)(input)){
-    VERIFY((numext::isnan)(static_cast<float>(truncated)) || (numext::isinf)(static_cast<float>(truncated)));
-    VERIFY((numext::isnan)(static_cast<float>(rounded)) || (numext::isinf)(static_cast<float>(rounded)));
-    return;
-  }
-  VERIFY_IS_EQUAL(expected_truncation, static_cast<float>(truncated));
-  VERIFY_IS_EQUAL(expected_rounding, static_cast<float>(rounded));
-}
-
 template<typename T>
 void test_roundtrip() {
  // Representable T round trip via bfloat16
@@ -122,31 +110,6 @@ void test_conversion()
  VERIFY_BFLOAT16_BITS_EQUAL(bfloat16(0.0f), 0x0000);
  VERIFY_BFLOAT16_BITS_EQUAL(bfloat16(-0.0f), 0x8000);

-  // Flush denormals to zero
-  for (float denorm = -std::numeric_limits<float>::denorm_min();
-       denorm < std::numeric_limits<float>::denorm_min();
-       denorm = nextafterf(denorm, 1.0f)) {
-    bfloat16 bf_trunc = Eigen::bfloat16_impl::truncate_to_bfloat16(denorm);
-    VERIFY_IS_EQUAL(static_cast<float>(bf_trunc), 0.0f);
-
-    // Implicit conversion of denormls to bool is correct
-    VERIFY_IS_EQUAL(static_cast<bool>(bfloat16(denorm)), false);
-    VERIFY_IS_EQUAL(bfloat16(denorm), false);
-
-    if (std::signbit(denorm)) {
-      VERIFY_BFLOAT16_BITS_EQUAL(bf_trunc, 0x8000);
-    } else {
-      VERIFY_BFLOAT16_BITS_EQUAL(bf_trunc, 0x0000);
-    }
-    bfloat16 bf_round = Eigen::bfloat16_impl::float_to_bfloat16_rtne<false>(denorm);
-    VERIFY_IS_EQUAL(static_cast<float>(bf_round), 0.0f);
-    if (std::signbit(denorm)) {
-      VERIFY_BFLOAT16_BITS_EQUAL(bf_round, 0x8000);
-    } else {
-      VERIFY_BFLOAT16_BITS_EQUAL(bf_round, 0x0000);
-    }
-  }
-
  // Default is zero
  VERIFY_IS_EQUAL(static_cast<float>(bfloat16()), 0.0f);

@@ -156,52 +119,6 @@ void test_conversion()
  test_roundtrip<std::complex<float> >();
  test_roundtrip<std::complex<double> >();

-  // Truncate test
-  test_truncate(
-      BinaryToFloat(0, 0x80, 0x48, 0xf5c3),
-      BinaryToFloat(0, 0x80, 0x48, 0x0000),
-      BinaryToFloat(0, 0x80, 0x49, 0x0000));
-  test_truncate(
-      BinaryToFloat(1, 0x80, 0x48, 0xf5c3),
-      BinaryToFloat(1, 0x80, 0x48, 0x0000),
-      BinaryToFloat(1, 0x80, 0x49, 0x0000));
-  test_truncate(
-      BinaryToFloat(0, 0x80, 0x48, 0x8000),
-      BinaryToFloat(0, 0x80, 0x48, 0x0000),
-      BinaryToFloat(0, 0x80, 0x48, 0x0000));
-  test_truncate(
-      BinaryToFloat(0, 0xff, 0x00, 0x0001),
-      BinaryToFloat(0, 0xff, 0x40, 0x0000),
-      BinaryToFloat(0, 0xff, 0x40, 0x0000));
-  test_truncate(
-      BinaryToFloat(0, 0xff, 0x7f, 0xffff),
-      BinaryToFloat(0, 0xff, 0x40, 0x0000),
-      BinaryToFloat(0, 0xff, 0x40, 0x0000));
-  test_truncate(
-      BinaryToFloat(1, 0x80, 0x48, 0xc000),
-      BinaryToFloat(1, 0x80, 0x48, 0x0000),
-      BinaryToFloat(1, 0x80, 0x49, 0x0000));
-  test_truncate(
-      BinaryToFloat(0, 0x80, 0x48, 0x0000),
-      BinaryToFloat(0, 0x80, 0x48, 0x0000),
-      BinaryToFloat(0, 0x80, 0x48, 0x0000));
-  test_truncate(
-      BinaryToFloat(0, 0x80, 0x48, 0x4000),
-      BinaryToFloat(0, 0x80, 0x48, 0x0000),
-      BinaryToFloat(0, 0x80, 0x48, 0x0000));
-  test_truncate(
-      BinaryToFloat(0, 0x80, 0x48, 0x8000),
-      BinaryToFloat(0, 0x80, 0x48, 0x0000),
-      BinaryToFloat(0, 0x80, 0x48, 0x0000));
-  test_truncate(
-      BinaryToFloat(0, 0x00, 0x48, 0x8000),
-      BinaryToFloat(0, 0x00, 0x00, 0x0000),
-      BinaryToFloat(0, 0x00, 0x00, 0x0000));
-  test_truncate(
-      BinaryToFloat(0, 0x00, 0x7f, 0xc000),
-      BinaryToFloat(0, 0x00, 0x00, 0x0000),
-      BinaryToFloat(0, 0x00, 0x00, 0x0000));
-
  // Conversion
  Array<float,1,100> a;
  for (int i = 0; i < 100; i++) a(i) = i + 1.25;
@@ -250,12 +167,6 @@ void test_conversion()

  VERIFY_BFLOAT16_BITS_EQUAL(bfloat16(BinaryToFloat(0x0, 0xff, 0x40, 0x0)), 0x7fc0);
  VERIFY_BFLOAT16_BITS_EQUAL(bfloat16(BinaryToFloat(0x1, 0xff, 0x40, 0x0)), 0xffc0);
-  VERIFY_BFLOAT16_BITS_EQUAL(Eigen::bfloat16_impl::truncate_to_bfloat16(
-                               BinaryToFloat(0x0, 0xff, 0x40, 0x0)),
-                             0x7fc0);
-  VERIFY_BFLOAT16_BITS_EQUAL(Eigen::bfloat16_impl::truncate_to_bfloat16(
-                               BinaryToFloat(0x1, 0xff, 0x40, 0x0)),
-                             0xffc0);
 }

 void test_numtraits()