Add ULP accuracy measurement tool and documentation for vectorized math functions

libeigen/eigen!2153 Co-authored-by: Rasmus Munk Larsen <rmlarsen@gmail.com>
2026-04-10 11:34:33 +08:00 · 2026-03-01 13:22:16 -08:00
parent c20b6f5c41
commit c66fc52868
5 changed files with 973 additions and 0 deletions
--- a/doc/CoeffwiseMathFunctionsTable.dox
+++ b/doc/CoeffwiseMathFunctionsTable.dox
@@ -608,6 +608,133 @@ This also means that, unless specified, if the function \c std::foo is available
 \n
 \section CoeffwiseMathFunctionsAccuracy Accuracy of vectorized math functions
 The following tables summarize the accuracy of %Eigen's vectorized implementations measured
 in units of ULP (Unit in the Last Place) on an x86-64 system (Intel Xeon, GCC) with SSE2 SIMD
 target. The reference values were computed using
 <a href="https://www.mpfr.org/">MPFR</a> at 128-bit precision.
 Float results are exhaustive over all ~4.28 billion finite representable values.
 Double results sample ~2.88 billion values using a geometric stepping factor of 10<sup>-6</sup>.
 These numbers may differ for other SIMD targets (AVX, AVX512, NEON, SVE, etc.)
 since each has its own packet math implementations. Functions marked "delegates to std"
 do not have a custom vectorized implementation for the tested SIMD target &mdash; they call
 the standard library function element-by-element.
 The full histograms for each function can be generated with the \c ulp_accuracy tool
 in <tt>test/ulp_accuracy/</tt>.
 \subsection CoeffwiseMathFunctionsAccuracy_float Float precision
 <table class="manual-hl">
 <tr><th>Function</th><th>Max |ULP|</th><th>Mean |ULP|</th><th>% Exact</th><th>Notes</th></tr>
 <tr><th colspan="5">Trigonometric</th></tr>
 <tr><td>sin</td>  <td>2</td>  <td>0.087</td> <td>91.3%</td> <td>\ref CoeffwiseMathFunctionsAccuracy_note1 "1"</td></tr>
 <tr><td>cos</td>  <td>2</td>  <td>0.088</td> <td>91.2%</td> <td>\ref CoeffwiseMathFunctionsAccuracy_note1 "1"</td></tr>
 <tr><td>tan</td>  <td>5</td>  <td>0.238</td> <td>77.3%</td> <td>\ref CoeffwiseMathFunctionsAccuracy_note1 "1"</td></tr>
 <tr><td>asin</td> <td>4</td>  <td>0.726</td> <td>51.3%</td> <td></td></tr>
 <tr><td>acos</td> <td>4</td>  <td>0.057</td> <td>95.0%</td> <td></td></tr>
 <tr><td>atan</td> <td>4</td>  <td>0.061</td> <td>94.0%</td> <td></td></tr>
 <tr><th colspan="5">Hyperbolic</th></tr>
 <tr><td>sinh</td>  <td>2</td>  <td>0.017</td> <td>98.3%</td> <td></td></tr>
 <tr><td>cosh</td>  <td>2</td>  <td>0.004</td> <td>99.6%</td> <td></td></tr>
 <tr><td>tanh</td>  <td>6</td>  <td>0.030</td> <td>97.2%</td> <td></td></tr>
 <tr><td>asinh</td> <td>2</td>  <td>0.145</td> <td>85.5%</td> <td></td></tr>
 <tr><td>acosh</td> <td>2</td>  <td>0.057</td> <td>94.3%</td> <td></td></tr>
 <tr><td>atanh</td> <td>2</td>  <td>0.004</td> <td>99.6%</td> <td></td></tr>
 <tr><th colspan="5">Exponential / Logarithmic</th></tr>
 <tr><td>exp</td>   <td>1</td>  <td>0.018</td> <td>98.2%</td> <td></td></tr>
 <tr><td>exp2</td>  <td>6</td>  <td>0.034</td> <td>97.3%</td> <td></td></tr>
 <tr><td>expm1</td> <td>5</td>  <td>0.060</td> <td>94.6%</td> <td></td></tr>
 <tr><td>log</td>   <td>3</td>  <td>0.120</td> <td>88.0%</td> <td></td></tr>
 <tr><td>log1p</td> <td>5</td>  <td>0.134</td> <td>87.5%</td> <td></td></tr>
 <tr><td>log10</td> <td>2</td>  <td>0.007</td> <td>99.3%</td> <td></td></tr>
 <tr><td>log2</td>  <td>5</td>  <td>0.005</td> <td>99.5%</td> <td></td></tr>
 <tr><th colspan="5">Error / Special</th></tr>
 <tr><td>erf</td>      <td>7</td>     <td>0.332</td> <td>67.5%</td> <td></td></tr>
 <tr><td>erfc</td>     <td>8</td>     <td>0.010</td> <td>99.2%</td> <td></td></tr>
 <tr><td>lgamma</td>   <td colspan="3"><em>delegates to std</em></td> <td>\ref CoeffwiseMathFunctionsAccuracy_note3 "3"</td></tr>
 <tr><th colspan="5">Other</th></tr>
 <tr><td>logistic</td> <td>7</td>     <td>0.040</td> <td>97.0%</td> <td></td></tr>
 <tr><td>sqrt</td>     <td>0</td>     <td>0.000</td> <td>100%</td>  <td>Uses hardware sqrt</td></tr>
 <tr><td>cbrt</td>     <td>2</td>     <td>0.552</td> <td>49.1%</td> <td></td></tr>
 <tr><td>rsqrt</td>    <td>&infin;</td> <td>0.114</td> <td>88.2%</td> <td>\ref CoeffwiseMathFunctionsAccuracy_note4 "4"</td></tr>
 </table>
 \subsection CoeffwiseMathFunctionsAccuracy_double Double precision
 <table class="manual-hl">
 <tr><th>Function</th><th>Max |ULP|</th><th>Mean |ULP|</th><th>% Exact</th><th>Notes</th></tr>
 <tr><th colspan="5">Trigonometric</th></tr>
 <tr><td>sin</td>  <td>13,879,755</td> <td>0.093</td> <td>93.2%</td> <td>\ref CoeffwiseMathFunctionsAccuracy_note1 "1"</td></tr>
 <tr><td>cos</td>  <td>2,024,130</td>  <td>0.043</td> <td>98.2%</td> <td>\ref CoeffwiseMathFunctionsAccuracy_note1 "1"</td></tr>
 <tr><td>tan</td>  <td>13,879,755</td> <td>0.128</td> <td>92.7%</td> <td>\ref CoeffwiseMathFunctionsAccuracy_note1 "1"</td></tr>
 <tr><td>asin</td> <td>1</td>          <td>&lt;0.001</td> <td>&gt;99.9%</td> <td></td></tr>
 <tr><td>acos</td> <td>1</td>          <td>&lt;0.001</td> <td>100%</td> <td></td></tr>
 <tr><td>atan</td> <td>5</td>          <td>0.013</td> <td>98.8%</td> <td></td></tr>
 <tr><th colspan="5">Hyperbolic</th></tr>
 <tr><td>sinh</td>  <td>2</td>  <td>0.004</td> <td>99.6%</td> <td></td></tr>
 <tr><td>cosh</td>  <td>2</td>  <td>0.001</td> <td>99.9%</td> <td></td></tr>
 <tr><td>tanh</td>  <td>8</td>  <td>0.008</td> <td>99.3%</td> <td></td></tr>
 <tr><td>asinh</td> <td>2</td>  <td>0.098</td> <td>90.2%</td> <td></td></tr>
 <tr><td>acosh</td> <td>2</td>  <td>0.047</td> <td>95.3%</td> <td></td></tr>
 <tr><td>atanh</td> <td>2</td>  <td>&lt;0.001</td> <td>&gt;99.9%</td> <td></td></tr>
 <tr><th colspan="5">Exponential / Logarithmic</th></tr>
 <tr><td>exp</td>   <td>2</td>   <td>0.001</td> <td>99.9%</td> <td></td></tr>
 <tr><td>exp2</td>  <td>214</td> <td>0.107</td> <td>99.6%</td> <td>\ref CoeffwiseMathFunctionsAccuracy_note2 "2"</td></tr>
 <tr><td>expm1</td> <td>3</td>   <td>0.010</td> <td>99.1%</td> <td></td></tr>
 <tr><td>log</td>   <td>2</td>   <td>0.147</td> <td>85.3%</td> <td></td></tr>
 <tr><td>log1p</td> <td>3</td>   <td>0.097</td> <td>90.6%</td> <td></td></tr>
 <tr><td>log10</td> <td>2</td>   <td>0.001</td> <td>99.9%</td> <td></td></tr>
 <tr><td>log2</td>  <td>2</td>   <td>&lt;0.001</td> <td>99.9%</td> <td></td></tr>
 <tr><th colspan="5">Error / Special</th></tr>
 <tr><td>erf</td>      <td>&infin;</td> <td>0.050</td> <td>70.5%</td> <td>\ref CoeffwiseMathFunctionsAccuracy_note5 "5"</td></tr>
 <tr><td>erfc</td>     <td>11</td>      <td>0.002</td> <td>99.9%</td> <td></td></tr>
 <tr><td>lgamma</td>   <td colspan="3"><em>delegates to std</em></td> <td>\ref CoeffwiseMathFunctionsAccuracy_note3 "3"</td></tr>
 <tr><th colspan="5">Other</th></tr>
 <tr><td>logistic</td> <td>3</td>       <td>0.008</td> <td>99.2%</td> <td></td></tr>
 <tr><td>sqrt</td>     <td>0</td>       <td>0.000</td> <td>100%</td>  <td>Uses hardware sqrt</td></tr>
 <tr><td>cbrt</td>     <td>2</td>       <td>0.119</td> <td>88.1%</td> <td></td></tr>
 <tr><td>rsqrt</td>    <td>&infin;</td> <td>0.135</td> <td>86.5%</td> <td>\ref CoeffwiseMathFunctionsAccuracy_note4 "4"</td></tr>
 </table>
 \subsection CoeffwiseMathFunctionsAccuracy_notes Notes
 \anchor CoeffwiseMathFunctionsAccuracy_note1
 <b>1. sin/cos/tan argument reduction:</b>
 %Eigen's vectorized sin, cos, and tan use a Cody-Waite argument reduction scheme that
 subtracts multiples of &pi;/2 from the input. For very large arguments (|x| &gt; ~10<sup>4</sup>
 in float, |x| &gt; ~10 in double), this reduction loses precision, producing
 occasional large ULP errors. The mean error remains low because most representable
 values are small. Applications that need high accuracy for large arguments should
 perform argument reduction in user code before calling these functions.
 \anchor CoeffwiseMathFunctionsAccuracy_note2
 <b>2. exp2 double precision:</b>
 The exp2 implementation for double shows a max error of 214 ULP near the overflow
 boundary (x &asymp; 1022). The mean error is still low (0.107 ULP, 99.6% exact), so
 the large max error affects only inputs very close to overflow.
 \anchor CoeffwiseMathFunctionsAccuracy_note3
 <b>3. lgamma:</b>
 The vectorized lgamma delegates to the standard library function \c std::lgamma for
 this SIMD target (SSE2) and therefore has the same accuracy as the platform's C math library.
 \anchor CoeffwiseMathFunctionsAccuracy_note4
 <b>4. rsqrt max=&infin;:</b>
 The infinite max ULP is due to a sign disagreement at a single subnormal input:
 rsqrt(-0) returns -&infin; in %Eigen but the MPFR reference produces NaN (rsqrt of a negative
 value). Ignoring this edge case, the implementation is accurate (&lt; 2 ULP) everywhere else.
 For float, 16.8 million subnormal negative inputs (0.4%) also produce &plusmn;&infin; vs NaN.
 The mean error excluding these outliers is well below 1 ULP.
 \anchor CoeffwiseMathFunctionsAccuracy_note5
 <b>5. erf double &infin;:</b>
 The vectorized erf for double returns NaN for &plusmn;&infin; instead of the correct &plusmn;1.
 This produces an infinite max ULP error at a single input value. Excluding &plusmn;&infin;,
 the max error is 3 ULP and the mean is 0.050 ULP.
 */
 }
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -532,6 +532,18 @@ if(EIGEN_TEST_BUILD_DOCUMENTATION)
  add_dependencies(buildtests doc)
 endif()
 # ULP accuracy measurement tool (see test/ulp_accuracy/README.md)
 find_package(MPFR)
 find_package(GMP)
 add_executable(ulp_accuracy ulp_accuracy/ulp_accuracy.cpp)
 target_compile_options(ulp_accuracy PRIVATE -pthread)
 target_link_libraries(ulp_accuracy Eigen3::Eigen ${CMAKE_THREAD_LIBS_INIT} pthread)
 if(MPFR_FOUND AND GMP_FOUND)
  target_include_directories(ulp_accuracy PRIVATE ${MPFR_INCLUDES} ${GMP_INCLUDES})
  target_link_libraries(ulp_accuracy ${MPFR_LIBRARIES} ${GMP_LIBRARIES})
  target_compile_definitions(ulp_accuracy PRIVATE EIGEN_HAS_MPFR)
 endif()
 # Register all smoke tests
 include("EigenSmokeTestList")
 ei_add_smoke_tests("${ei_smoke_test_list}")
--- a/test/ulp_accuracy/README.md
+++ b/test/ulp_accuracy/README.md
@@ -0,0 +1,145 @@
 # ULP Accuracy Measurement Tool
 Standalone tool for measuring the accuracy of Eigen's vectorized math functions
 in units of ULP (Unit in the Last Place). Compares Eigen's SIMD implementations
 against either MPFR (128-bit high-precision reference) or the standard C++ math
 library.
 ## Building
 From the Eigen build directory:
 ```bash
 cd build
 cmake ..
 cmake --build . --target ulp_accuracy
 ```
 If MPFR and GMP are installed, the build automatically enables MPFR support
 (`EIGEN_HAS_MPFR`). Without them, only `--ref=std` is available.
 ### Installing MPFR (Debian/Ubuntu)
 ```bash
 sudo apt install libmpfr-dev libgmp-dev
 ```
 ## Usage
 ```
 ./test/ulp_accuracy [options]
 Options:
  --func=NAME    Function to test (required unless --list)
  --lo=VAL       Start of range (default: -inf)
  --hi=VAL       End of range (default: +inf)
  --double       Test double precision (default: float)
  --step=EPS     Sampling step: advance by (1+EPS)*nextafter(x)
                 (default: 0 = exhaustive; useful for double, e.g. 1e-6)
  --threads=N    Number of threads (default: all cores)
  --batch=N      Batch size for Eigen eval (default: 4096)
  --ref=MODE     Reference: 'std' (default) or 'mpfr'
  --hist_width=N Histogram half-width in ULPs (default: 10)
  --list         List available functions
 ```
 ## Examples
 List all supported functions:
 ```bash
 ./test/ulp_accuracy --list
 ```
 Exhaustive float test of sin against std (tests all ~4.28 billion finite floats):
 ```bash
 ./test/ulp_accuracy --func=sin
 ```
 Float test against MPFR (more accurate reference, but slower):
 ```bash
 ./test/ulp_accuracy --func=sin --ref=mpfr
 ```
 Double precision test with geometric sampling (exhaustive is impractical for double):
 ```bash
 ./test/ulp_accuracy --func=exp --double --step=1e-6
 ```
 Test a specific range:
 ```bash
 ./test/ulp_accuracy --func=sin --lo=0 --hi=6.2832
 ```
 ## Output
 The tool prints:
 - **Test configuration**: function, range, reference mode, thread count
 - **Max |ULP error|**: worst-case absolute ULP error with the offending input value
 - **Mean |ULP error|**: average absolute ULP error across all tested values
 - **Signed ULP histogram**: distribution of signed errors showing bias direction
 Example output:
 ```
 Function: sin (float)
 Range: [-inf, inf]
 Representable values in range: 4278190082
 Reference: MPFR (128-bit)
 Threads: 32
 Batch size: 4096
 Results:
  Values tested: 4278190081
  Time: 529.04 seconds (8.1 Mvalues/s)
  Max |ULP error|: 2
    at x = -1.5413464e+38 (Eigen=-0.482218683, ref=-0.482218742)
  Mean |ULP error|: 0.0874
 Signed ULP error histogram [-10, +10]:
  -2   :        51988 (  0.001%)
  -1   :    186805349 (  4.366%)
  0    :   3904475407 ( 91.265%)
  1    :    186805349 (  4.366%)
  2    :        51988 (  0.001%)
 ```
 ## How it works
 1. **Range splitting**: The input range is divided evenly across threads by
   splitting the linear ULP space.
 2. **Batched evaluation**: Each thread fills batches of input values, evaluates
   them through Eigen's vectorized path (using `Eigen::Array` operations), and
   computes reference values one at a time.
 3. **ULP computation**: IEEE 754 bit patterns are mapped to a linear integer
   scale where adjacent representable values are adjacent integers. The signed
   ULP error is the difference between Eigen's result and the reference on this
   scale. Special cases (NaN, infinity mismatches) report infinite error.
 4. **Result reduction**: Per-thread statistics (max error, mean error, histogram)
   are merged after all threads complete.
 ## Supported functions
 | Category | Functions |
 |----------|-----------|
 | Trigonometric | sin, cos, tan, asin, acos, atan |
 | Hyperbolic | sinh, cosh, tanh, asinh, acosh, atanh |
 | Exponential/Log | exp, exp2, expm1, log, log1p, log10, log2 |
 | Error/Gamma | erf, erfc, lgamma |
 | Other | logistic, sqrt, cbrt, rsqrt |
 ## File organization
 - `ulp_accuracy.cpp` — Main tool: ULP computation, worker threads, CLI, result printing
 - `mpfr_reference.h` — MPFR reference function wrappers and scalar conversion helpers
 ## Performance tips
 - Float exhaustive sweeps test ~4.28 billion values. With `--ref=std` this takes
  ~50 seconds per function; with `--ref=mpfr` it takes ~500 seconds (10x slower).
 - For double precision, exhaustive testing is impractical. Use `--step=1e-6` to
  sample ~2.88 billion values geometrically.
 - Thread count defaults to all available cores. MPFR is the bottleneck (single
  MPFR call per value per thread), so more cores help significantly.
--- a/test/ulp_accuracy/mpfr_reference.h
+++ b/test/ulp_accuracy/mpfr_reference.h
@@ -0,0 +1,78 @@
 // MPFR high-precision reference implementations for ULP accuracy testing.
 //
 // This header provides MPFR-based reference functions for all math operations
 // tested by the ulp_accuracy tool. It also includes scalar conversion helpers
 // between float/double and mpfr_t.
 //
 // Only compiled when EIGEN_HAS_MPFR is defined (i.e., MPFR and GMP are found).
 #ifndef EIGEN_ULP_ACCURACY_MPFR_REFERENCE_H
 #define EIGEN_ULP_ACCURACY_MPFR_REFERENCE_H
 #ifdef EIGEN_HAS_MPFR
 #include <mpfr.h>
 // ---------------------------------------------------------------------------
 // Scalar <-> mpfr_t conversion
 // ---------------------------------------------------------------------------
 template <typename Scalar>
 static void mpfr_set_scalar(mpfr_t rop, Scalar x, mpfr_rnd_t rnd);
 template <>
 void mpfr_set_scalar<float>(mpfr_t rop, float x, mpfr_rnd_t rnd) {
  mpfr_set_flt(rop, x, rnd);
 }
 template <>
 void mpfr_set_scalar<double>(mpfr_t rop, double x, mpfr_rnd_t rnd) {
  mpfr_set_d(rop, x, rnd);
 }
 template <typename Scalar>
 static Scalar mpfr_get_scalar(mpfr_t op, mpfr_rnd_t rnd);
 template <>
 float mpfr_get_scalar<float>(mpfr_t op, mpfr_rnd_t rnd) {
  return mpfr_get_flt(op, rnd);
 }
 template <>
 double mpfr_get_scalar<double>(mpfr_t op, mpfr_rnd_t rnd) {
  return mpfr_get_d(op, rnd);
 }
 // ---------------------------------------------------------------------------
 // MPFR wrappers for functions not directly provided by libmpfr
 // ---------------------------------------------------------------------------
 // logistic(x) = 1 / (1 + exp(-x))
 static int mpfr_logistic(mpfr_t rop, const mpfr_t op, mpfr_rnd_t rnd) {
  mpfr_t tmp, one;
  mpfr_init2(tmp, mpfr_get_prec(rop));
  mpfr_init2(one, mpfr_get_prec(rop));
  mpfr_set_ui(one, 1, rnd);
  mpfr_neg(tmp, op, rnd);
  mpfr_exp(tmp, tmp, rnd);
  mpfr_add(tmp, tmp, one, rnd);
  int ret = mpfr_div(rop, one, tmp, rnd);
  mpfr_clear(tmp);
  mpfr_clear(one);
  return ret;
 }
 // rsqrt(x) = 1 / sqrt(x)
 static int mpfr_rsqrt(mpfr_t rop, const mpfr_t op, mpfr_rnd_t rnd) { return mpfr_rec_sqrt(rop, op, rnd); }
 // exp2(x) = 2^x
 static int mpfr_exp2_wrap(mpfr_t rop, const mpfr_t op, mpfr_rnd_t rnd) {
  mpfr_t two;
  mpfr_init2(two, mpfr_get_prec(rop));
  mpfr_set_ui(two, 2, rnd);
  int ret = mpfr_pow(rop, two, op, rnd);
  mpfr_clear(two);
  return ret;
 }
 // log2(x) — thin wrapper to match the function signature convention.
 static int mpfr_log2_wrap(mpfr_t rop, const mpfr_t op, mpfr_rnd_t rnd) { return mpfr_log2(rop, op, rnd); }
 #endif  // EIGEN_HAS_MPFR
 #endif  // EIGEN_ULP_ACCURACY_MPFR_REFERENCE_H
--- a/test/ulp_accuracy/ulp_accuracy.cpp
+++ b/test/ulp_accuracy/ulp_accuracy.cpp
@@ -0,0 +1,611 @@
 // Standalone tool to measure ULP accuracy of Eigen's vectorized math functions
 // against either MPFR (high-precision reference) or std C++ math functions.
 //
 // See README.md in this directory for full documentation.
 //
 // Usage:
 //   ./ulp_accuracy --func=sin --lo=0 --hi=6.2832 --threads=16
 //   ./ulp_accuracy --func=exp --threads=16
 //   ./ulp_accuracy --func=sin --ref=mpfr
 //   ./ulp_accuracy --func=sin --double --step=1e-6
 //   ./ulp_accuracy --list
 //
 // Build:
 //   cd build && cmake .. && make ulp_accuracy
 #include <Eigen/Core>
 #include <unsupported/Eigen/SpecialFunctions>
 #ifdef EIGEN_HAS_MPFR
 #include <mpfr.h>
 #endif
 #include <cfloat>
 #include <chrono>
 #include <cmath>
 #include <cstdint>
 #include <cstdio>
 #include <cstring>
 #include <functional>
 #include <limits>
 #include <memory>
 #include <string>
 #include <thread>
 #include <type_traits>
 #include <vector>
 #include "mpfr_reference.h"
 // ============================================================================
 // ULP distance computation
 // ============================================================================
 // Maps IEEE 754 bits to a linear integer scale where adjacent representable
 // values are adjacent integers. The mapping is strictly monotonic:
 //   -inf -> most negative, -0.0 -> -1, +0.0 -> 0, +inf -> most positive.
 static inline int64_t scalar_to_linear(float x) {
  int32_t bits;
  std::memcpy(&bits, &x, sizeof(bits));
  if (bits < 0) {
    bits = static_cast<int32_t>(INT32_MIN) - bits - 1;
  }
  return static_cast<int64_t>(bits);
 }
 static inline int64_t scalar_to_linear(double x) {
  int64_t bits;
  std::memcpy(&bits, &x, sizeof(bits));
  if (bits < 0) {
    bits = static_cast<int64_t>(INT64_MIN) - bits - 1;
  }
  return bits;
 }
 // Returns (eigen_val - ref_val) in ULP space.
 // Positive means Eigen overestimates, negative means it underestimates.
 // Returns INT64_MAX for incomparable values (NaN vs number, inf mismatch).
 template <typename Scalar>
 static inline int64_t signed_ulp_error(Scalar eigen_val, Scalar ref_val) {
  if (eigen_val == ref_val) return 0;  // also handles -0.0 == +0.0
  bool e_nan = std::isnan(eigen_val), r_nan = std::isnan(ref_val);
  if (e_nan && r_nan) return 0;
  if (e_nan || r_nan) return INT64_MAX;
  if (std::isinf(eigen_val) || std::isinf(ref_val)) return INT64_MAX;
  int64_t a = scalar_to_linear(eigen_val);
  int64_t b = scalar_to_linear(ref_val);
  // Overflow check for a - b.
  if (b > 0 && a < INT64_MIN + b) return INT64_MAX;
  if (b < 0 && a > INT64_MAX + b) return INT64_MAX;
  return a - b;
 }
 // ============================================================================
 // Per-thread accumulator with signed ULP histogram
 // ============================================================================
 template <typename Scalar>
 struct alignas(128) ThreadResult {
  int64_t max_abs_ulp = 0;
  Scalar max_ulp_at = Scalar(0);
  Scalar max_ulp_eigen = Scalar(0);
  Scalar max_ulp_ref = Scalar(0);
  double abs_ulp_sum = 0.0;
  uint64_t count = 0;
  // Signed histogram: bins for errors in [-hist_width, +hist_width],
  // plus two overflow bins for < -hist_width and > +hist_width.
  // Layout: [<-W] [-W] [-W+1] ... [0] ... [W-1] [W] [>W]
  // Total bins = 2*hist_width + 3
  int hist_width = 0;
  std::vector<uint64_t> hist;
  void init(int w) {
    hist_width = w;
    hist.assign(2 * w + 3, 0);
  }
  void record(int64_t signed_err, Scalar x, Scalar eigen_val, Scalar ref_val) {
    int64_t abs_err = signed_err < 0 ? -signed_err : signed_err;
    if (signed_err == INT64_MAX) abs_err = INT64_MAX;
    if (abs_err > max_abs_ulp) {
      max_abs_ulp = abs_err;
      max_ulp_at = x;
      max_ulp_eigen = eigen_val;
      max_ulp_ref = ref_val;
    }
    if (abs_err != INT64_MAX) {
      abs_ulp_sum += static_cast<double>(abs_err);
    }
    count++;
    // Histogram bin.
    int bin;
    if (signed_err == INT64_MAX || signed_err > hist_width) {
      bin = 2 * hist_width + 2;  // overflow high
    } else if (signed_err < -hist_width) {
      bin = 0;  // overflow low
    } else {
      bin = static_cast<int>(signed_err) + hist_width + 1;
    }
    hist[bin]++;
  }
 };
 // ============================================================================
 // Function registry
 // ============================================================================
 template <typename Scalar>
 struct FuncEntry {
  using ArrayType = Eigen::Array<Scalar, Eigen::Dynamic, 1>;
  using EigenEval = std::function<void(Eigen::Ref<ArrayType>, const Eigen::Ref<const ArrayType>&)>;
  using StdEval = std::function<Scalar(Scalar)>;
 #ifdef EIGEN_HAS_MPFR
  using MpfrEval = std::function<int(mpfr_t, const mpfr_t, mpfr_rnd_t)>;
 #endif
  std::string name;
  EigenEval eigen_eval;
  StdEval std_eval;
 #ifdef EIGEN_HAS_MPFR
  MpfrEval mpfr_eval;
 #endif
  Scalar default_lo;
  Scalar default_hi;
 };
 // std::logistic is not part of the C++ standard library.
 template <typename Scalar>
 static Scalar std_logistic(Scalar x) {
  if (x >= 0) {
    Scalar e = std::exp(-x);
    return Scalar(1) / (Scalar(1) + e);
  } else {
    Scalar e = std::exp(x);
    return e / (Scalar(1) + e);
  }
 }
 template <typename Scalar>
 static std::vector<FuncEntry<Scalar>> build_func_table() {
  using ArrayType = Eigen::Array<Scalar, Eigen::Dynamic, 1>;
  std::vector<FuncEntry<Scalar>> table;
 #ifdef EIGEN_HAS_MPFR
 #define ADD_FUNC(fname, eigen_expr, std_expr, mpfr_fn, lo, hi)                                                        \
  table.push_back({#fname, [](Eigen::Ref<ArrayType> out, const Eigen::Ref<const ArrayType>& a) { out = eigen_expr; }, \
                   [](Scalar x) -> Scalar { return std_expr; },                                                       \
                   [](mpfr_t r, const mpfr_t o, mpfr_rnd_t d) { return mpfr_fn(r, o, d); }, lo, hi})
 #else
 #define ADD_FUNC(fname, eigen_expr, std_expr, mpfr_fn, lo, hi)                                                        \
  table.push_back({#fname, [](Eigen::Ref<ArrayType> out, const Eigen::Ref<const ArrayType>& a) { out = eigen_expr; }, \
                   [](Scalar x) -> Scalar { return std_expr; }, lo, hi})
 #endif
  constexpr Scalar kInf = std::numeric_limits<Scalar>::infinity();
  // Trigonometric
  // clang-format off
  ADD_FUNC(sin,   a.sin(),   std::sin(x),   mpfr_sin,   -kInf, kInf);
  ADD_FUNC(cos,   a.cos(),   std::cos(x),   mpfr_cos,   -kInf, kInf);
  ADD_FUNC(tan,   a.tan(),   std::tan(x),   mpfr_tan,   -kInf, kInf);
  ADD_FUNC(asin,  a.asin(),  std::asin(x),  mpfr_asin,  -kInf, kInf);
  ADD_FUNC(acos,  a.acos(),  std::acos(x),  mpfr_acos,  -kInf, kInf);
  ADD_FUNC(atan,  a.atan(),  std::atan(x),  mpfr_atan,  -kInf, kInf);
  // Hyperbolic
  ADD_FUNC(sinh,  a.sinh(),  std::sinh(x),  mpfr_sinh,  -kInf, kInf);
  ADD_FUNC(cosh,  a.cosh(),  std::cosh(x),  mpfr_cosh,  -kInf, kInf);
  ADD_FUNC(tanh,  a.tanh(),  std::tanh(x),  mpfr_tanh,  -kInf, kInf);
  ADD_FUNC(asinh, a.asinh(), std::asinh(x), mpfr_asinh, -kInf, kInf);
  ADD_FUNC(acosh, a.acosh(), std::acosh(x), mpfr_acosh, -kInf, kInf);
  ADD_FUNC(atanh, a.atanh(), std::atanh(x), mpfr_atanh, -kInf, kInf);
  // Exponential / Logarithmic
  ADD_FUNC(exp,   a.exp(),     std::exp(x),    mpfr_exp,       -kInf, kInf);
  ADD_FUNC(exp2,  a.exp2(),    std::exp2(x),   mpfr_exp2_wrap, -kInf, kInf);
  ADD_FUNC(expm1, a.expm1(),   std::expm1(x),  mpfr_expm1,     -kInf, kInf);
  ADD_FUNC(log,   a.log(),     std::log(x),    mpfr_log,       -kInf, kInf);
  ADD_FUNC(log1p, a.log1p(),   std::log1p(x),  mpfr_log1p,     -kInf, kInf);
  ADD_FUNC(log10, a.log10(),   std::log10(x),  mpfr_log10,     -kInf, kInf);
  ADD_FUNC(log2,  a.log2(),    std::log2(x),   mpfr_log2_wrap, -kInf, kInf);
  // Error / Gamma
  ADD_FUNC(erf,    a.erf(),    std::erf(x),    mpfr_erf,     -kInf, kInf);
  ADD_FUNC(erfc,   a.erfc(),   std::erfc(x),   mpfr_erfc,    -kInf, kInf);
  ADD_FUNC(lgamma, a.lgamma(), std::lgamma(x), mpfr_lngamma, -kInf, kInf);
  // Other
  ADD_FUNC(logistic, a.logistic(), std_logistic(x), mpfr_logistic, -kInf, kInf);
  ADD_FUNC(sqrt,  a.sqrt(),  std::sqrt(x),            mpfr_sqrt,  -kInf, kInf);
  ADD_FUNC(cbrt,  a.cbrt(),  std::cbrt(x),            mpfr_cbrt,  -kInf, kInf);
  ADD_FUNC(rsqrt, a.rsqrt(), Scalar(1)/std::sqrt(x),  mpfr_rsqrt, -kInf, kInf);
  // clang-format on
 #undef ADD_FUNC
  return table;
 }
 // ============================================================================
 // Range iteration helpers
 // ============================================================================
 // Advances x toward +inf by at least 1 ULP. When step_eps > 0, additionally
 // jumps by a relative factor of (1 + step_eps) to sample the range sparsely.
 template <typename Scalar>
 static inline Scalar advance_by_step(Scalar x, double step_eps) {
  Scalar next = std::nextafter(x, std::numeric_limits<Scalar>::infinity());
  if (step_eps > 0.0 && std::isfinite(next)) {
    // Try to jump further by a relative amount.
    Scalar jumped = next > 0 ? next * static_cast<Scalar>(1.0 + step_eps) : next / static_cast<Scalar>(1.0 + step_eps);
    // Use the jump only if it actually advances further (handles denormal stalling).
    if (jumped > next) next = jumped;
  }
  return next;
 }
 // Counts the number of representable scalars in [lo, hi].
 template <typename Scalar>
 static uint64_t count_scalars_in_range(Scalar lo, Scalar hi) {
  if (lo > hi) return 0;
  uint64_t lo_u = static_cast<uint64_t>(scalar_to_linear(lo));
  uint64_t hi_u = static_cast<uint64_t>(scalar_to_linear(hi));
  uint64_t diff = hi_u - lo_u;
  return diff == UINT64_MAX ? UINT64_MAX : diff + 1;
 }
 // Advances a scalar by n ULPs in the linear representation.
 static float advance_scalar(float x, uint64_t n) {
  int64_t lin = scalar_to_linear(x);
  lin += static_cast<int64_t>(n);
  int32_t ibits;
  if (lin < 0) {
    ibits = static_cast<int32_t>(INT32_MIN) - static_cast<int32_t>(lin) - 1;
  } else {
    ibits = static_cast<int32_t>(lin);
  }
  float result;
  std::memcpy(&result, &ibits, sizeof(result));
  return result;
 }
 static double advance_scalar(double x, uint64_t n) {
  int64_t lin = scalar_to_linear(x);
  lin += static_cast<int64_t>(n);
  int64_t ibits;
  if (lin < 0) {
    ibits = static_cast<int64_t>(INT64_MIN) - lin - 1;
  } else {
    ibits = lin;
  }
  double result;
  std::memcpy(&result, &ibits, sizeof(result));
  return result;
 }
 // ============================================================================
 // Worker thread: evaluates Eigen and reference over a subrange
 // ============================================================================
 template <typename Scalar>
 static void worker(const FuncEntry<Scalar>& func, Scalar lo, Scalar hi, int batch_size, bool use_mpfr, double step_eps,
                   ThreadResult<Scalar>& result) {
  using ArrayType = Eigen::Array<Scalar, Eigen::Dynamic, 1>;
  ArrayType input(batch_size);
  ArrayType eigen_out(batch_size);
  std::vector<Scalar> ref_out(batch_size);
 #ifdef EIGEN_HAS_MPFR
  mpfr_t mp_in, mp_out;
  if (use_mpfr) {
    mpfr_init2(mp_in, 128);
    mpfr_init2(mp_out, 128);
  }
 #else
  (void)use_mpfr;
 #endif
  auto process_batch = [&](int n, const ArrayType& in, const ArrayType& eig) {
    for (int i = 0; i < n; i++) {
 #ifdef EIGEN_HAS_MPFR
      if (use_mpfr) {
        mpfr_set_scalar<Scalar>(mp_in, in[i], MPFR_RNDN);
        func.mpfr_eval(mp_out, mp_in, MPFR_RNDN);
        ref_out[i] = mpfr_get_scalar<Scalar>(mp_out, MPFR_RNDN);
      } else
 #endif
      {
        ref_out[i] = func.std_eval(in[i]);
      }
    }
    for (int i = 0; i < n; i++) {
      int64_t err = signed_ulp_error(eig[i], ref_out[i]);
      result.record(err, in[i], eig[i], ref_out[i]);
    }
  };
  int idx = 0;
  Scalar x = lo;
  for (;;) {
    input[idx] = x;
    idx++;
    if (idx == batch_size) {
      func.eigen_eval(eigen_out, input);
      process_batch(batch_size, input, eigen_out);
      idx = 0;
    }
    if (x >= hi) break;
    Scalar next = advance_by_step(x, step_eps);
    x = (next > hi) ? hi : next;
  }
  // Process remaining partial batch.
  if (idx > 0) {
    auto partial_in = input.head(idx);
    auto partial_eigen = eigen_out.head(idx);
    func.eigen_eval(partial_eigen, partial_in);
    process_batch(idx, input, eigen_out);
  }
 #ifdef EIGEN_HAS_MPFR
  if (use_mpfr) {
    mpfr_clear(mp_in);
    mpfr_clear(mp_out);
  }
 #endif
 }
 // ============================================================================
 // Test driver: splits range across threads and prints results
 // ============================================================================
 struct Options {
  std::string func_name;
  double lo = std::numeric_limits<double>::quiet_NaN();
  double hi = std::numeric_limits<double>::quiet_NaN();
  int num_threads;
  int batch_size = 4096;
  int hist_width = 10;
  bool use_mpfr = false;
  bool use_double = false;
  double step_eps = 0.0;
  bool list_funcs = false;
 };
 template <typename Scalar>
 static int run_test(const Options& opts) {
  const int kDigits = std::is_same<Scalar, float>::value ? 9 : 17;
  const char* kTypeName = std::is_same<Scalar, float>::value ? "float" : "double";
  auto table = build_func_table<Scalar>();
  if (opts.list_funcs) {
    std::printf("Available functions:\n");
    for (const auto& f : table) {
      std::printf("  %s\n", f.name.c_str());
    }
    return 0;
  }
  // Look up the requested function.
  const FuncEntry<Scalar>* entry = nullptr;
  for (const auto& f : table) {
    if (f.name == opts.func_name) {
      entry = &f;
      break;
    }
  }
  if (!entry) {
    std::fprintf(stderr, "Error: unknown function '%s' (use --list to see available functions)\n",
                 opts.func_name.c_str());
    return 1;
  }
  Scalar lo = std::isnan(opts.lo) ? entry->default_lo : static_cast<Scalar>(opts.lo);
  Scalar hi = std::isnan(opts.hi) ? entry->default_hi : static_cast<Scalar>(opts.hi);
  uint64_t total_scalars = count_scalars_in_range(lo, hi);
  int num_threads = opts.num_threads;
  // Print test configuration.
  std::printf("Function: %s (%s)\n", opts.func_name.c_str(), kTypeName);
  std::printf("Range: [%.*g, %.*g]\n", kDigits, double(lo), kDigits, double(hi));
  if (opts.step_eps > 0.0) {
    std::printf("Sampling step: (1 + %g) * nextafter(x)\n", opts.step_eps);
  } else {
    std::printf("Representable values in range: %lu\n", static_cast<unsigned long>(total_scalars));
  }
  std::printf("Reference: %s\n", opts.use_mpfr ? "MPFR (128-bit)" : "std C++ math");
  std::printf("Threads: %d\n", num_threads);
  std::printf("Batch size: %d\n", opts.batch_size);
  std::printf("\n");
  std::fflush(stdout);
  // Split range across threads.
  if (total_scalars > 0 && static_cast<uint64_t>(num_threads) > total_scalars) {
    num_threads = static_cast<int>(total_scalars);
  }
  if (num_threads < 1) num_threads = 1;
  // Heap-allocate each ThreadResult separately to avoid false sharing.
  std::vector<std::unique_ptr<ThreadResult<Scalar>>> results;
  results.reserve(num_threads);
  for (int t = 0; t < num_threads; t++) {
    results.push_back(std::make_unique<ThreadResult<Scalar>>());
    results.back()->init(opts.hist_width);
  }
  std::vector<std::thread> threads;
  uint64_t scalars_per_thread = total_scalars / num_threads;
  Scalar chunk_lo = lo;
  auto start_time = std::chrono::steady_clock::now();
  for (int t = 0; t < num_threads; t++) {
    Scalar chunk_hi;
    if (t == num_threads - 1) {
      chunk_hi = hi;
    } else {
      chunk_hi = advance_scalar(chunk_lo, scalars_per_thread - 1);
    }
    threads.emplace_back(worker<Scalar>, std::cref(*entry), chunk_lo, chunk_hi, opts.batch_size, opts.use_mpfr,
                         opts.step_eps, std::ref(*results[t]));
    chunk_lo = std::nextafter(chunk_hi, std::numeric_limits<Scalar>::infinity());
  }
  for (auto& t : threads) t.join();
  auto end_time = std::chrono::steady_clock::now();
  double elapsed = std::chrono::duration<double>(end_time - start_time).count();
  // Reduce per-thread results.
  ThreadResult<Scalar> global;
  global.init(opts.hist_width);
  for (int t = 0; t < num_threads; t++) {
    const auto& r = *results[t];
    if (r.max_abs_ulp > global.max_abs_ulp) {
      global.max_abs_ulp = r.max_abs_ulp;
      global.max_ulp_at = r.max_ulp_at;
      global.max_ulp_eigen = r.max_ulp_eigen;
      global.max_ulp_ref = r.max_ulp_ref;
    }
    global.abs_ulp_sum += r.abs_ulp_sum;
    global.count += r.count;
    for (size_t b = 0; b < global.hist.size(); b++) {
      global.hist[b] += r.hist[b];
    }
  }
  double mean_ulp = global.count > 0 ? global.abs_ulp_sum / global.count : 0.0;
  // Print results.
  std::printf("Results:\n");
  std::printf("  Values tested: %lu\n", static_cast<unsigned long>(global.count));
  std::printf("  Time: %.2f seconds (%.1f Mvalues/s)\n", elapsed, global.count / elapsed / 1e6);
  if (global.max_abs_ulp == INT64_MAX) {
    std::printf("  Max |ULP error|: inf\n");
  } else {
    std::printf("  Max |ULP error|: %ld\n", static_cast<long>(global.max_abs_ulp));
  }
  std::printf("    at x = %.*g (Eigen=%.*g, ref=%.*g)\n", kDigits, double(global.max_ulp_at), kDigits,
              double(global.max_ulp_eigen), kDigits, double(global.max_ulp_ref));
  std::printf("  Mean |ULP error|: %.4f\n", mean_ulp);
  std::printf("\n");
  // Print signed error histogram.
  std::printf("Signed ULP error histogram [-%d, +%d]:\n", opts.hist_width, opts.hist_width);
  int nbins = 2 * opts.hist_width + 3;
  for (int b = 0; b < nbins; b++) {
    if (global.hist[b] == 0) continue;
    double pct = 100.0 * global.hist[b] / global.count;
    if (b == 0) {
      std::printf("  <%-4d: %12lu (%7.3f%%)\n", -opts.hist_width, static_cast<unsigned long>(global.hist[b]), pct);
    } else if (b == nbins - 1) {
      std::printf("  >%-4d: %12lu (%7.3f%%)\n", opts.hist_width, static_cast<unsigned long>(global.hist[b]), pct);
    } else {
      int err = b - opts.hist_width - 1;
      std::printf("  %-5d: %12lu (%7.3f%%)\n", err, static_cast<unsigned long>(global.hist[b]), pct);
    }
  }
  return 0;
 }
 // ============================================================================
 // Command-line parsing
 // ============================================================================
 static void print_usage() {
  std::printf(
      "Usage: ulp_accuracy [options]\n"
      "  --func=NAME    Function to test (required unless --list)\n"
      "  --lo=VAL       Start of range (default: -inf)\n"
      "  --hi=VAL       End of range (default: +inf)\n"
      "  --double       Test double precision (default: float)\n"
      "  --step=EPS     Sampling step: advance by (1+EPS)*nextafter(x)\n"
      "                 (default: 0 = exhaustive; useful for double, e.g. 1e-6)\n"
      "  --threads=N    Number of threads (default: all cores)\n"
      "  --batch=N      Batch size for Eigen eval (default: 4096)\n"
      "  --ref=MODE     Reference: 'std' (default) or 'mpfr'\n"
      "  --hist_width=N Histogram half-width in ULPs (default: 10)\n"
      "  --list         List available functions\n");
 }
 int main(int argc, char* argv[]) {
  Options opts;
  opts.num_threads = static_cast<int>(std::thread::hardware_concurrency());
  if (opts.num_threads == 0) opts.num_threads = 4;
  std::string ref_mode;
  for (int i = 1; i < argc; i++) {
    std::string arg = argv[i];
    if (arg.substr(0, 7) == "--func=") {
      opts.func_name = arg.substr(7);
    } else if (arg.substr(0, 5) == "--lo=") {
      std::string val = arg.substr(5);
      if (val == "inf")
        opts.lo = std::numeric_limits<double>::infinity();
      else if (val == "-inf")
        opts.lo = -std::numeric_limits<double>::infinity();
      else
        opts.lo = std::stod(val);
    } else if (arg.substr(0, 5) == "--hi=") {
      std::string val = arg.substr(5);
      if (val == "inf")
        opts.hi = std::numeric_limits<double>::infinity();
      else if (val == "-inf")
        opts.hi = -std::numeric_limits<double>::infinity();
      else
        opts.hi = std::stod(val);
    } else if (arg.substr(0, 10) == "--threads=") {
      opts.num_threads = std::stoi(arg.substr(10));
    } else if (arg.substr(0, 8) == "--batch=") {
      opts.batch_size = std::stoi(arg.substr(8));
    } else if (arg.substr(0, 6) == "--ref=") {
      ref_mode = arg.substr(6);
    } else if (arg.substr(0, 13) == "--hist_width=") {
      opts.hist_width = std::stoi(arg.substr(13));
    } else if (arg.substr(0, 7) == "--step=") {
      opts.step_eps = std::stod(arg.substr(7));
    } else if (arg == "--double") {
      opts.use_double = true;
    } else if (arg == "--list") {
      opts.list_funcs = true;
    } else if (arg == "--help" || arg == "-h") {
      print_usage();
      return 0;
    } else {
      std::fprintf(stderr, "Unknown option: %s\n", arg.c_str());
      print_usage();
      return 1;
    }
  }
  // Determine reference mode (default: std).
  if (ref_mode.empty() || ref_mode == "std") {
    opts.use_mpfr = false;
  } else if (ref_mode == "mpfr") {
 #ifdef EIGEN_HAS_MPFR
    opts.use_mpfr = true;
 #else
    std::fprintf(stderr, "Error: MPFR support not compiled in. Use --ref=std or rebuild with MPFR.\n");
    return 1;
 #endif
  } else {
    std::fprintf(stderr, "Error: --ref must be 'std' or 'mpfr'\n");
    return 1;
  }
  if (!opts.list_funcs && opts.func_name.empty()) {
    std::fprintf(stderr, "Error: --func=NAME is required (use --list to see available functions)\n");
    return 1;
  }
  if (opts.use_double) {
    return run_test<double>(opts);
  } else {
    return run_test<float>(opts);
  }
 }