From c66fc52868a1ad2a144d8fbcae8edfaf6c40ccd2 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen <4643818-rmlarsen1@users.noreply.gitlab.com> Date: Sun, 1 Mar 2026 13:22:16 -0800 Subject: [PATCH] Add ULP accuracy measurement tool and documentation for vectorized math functions libeigen/eigen!2153 Co-authored-by: Rasmus Munk Larsen --- doc/CoeffwiseMathFunctionsTable.dox | 127 ++++++ test/CMakeLists.txt | 12 + test/ulp_accuracy/README.md | 145 +++++++ test/ulp_accuracy/mpfr_reference.h | 78 ++++ test/ulp_accuracy/ulp_accuracy.cpp | 611 ++++++++++++++++++++++++++++ 5 files changed, 973 insertions(+) create mode 100644 test/ulp_accuracy/README.md create mode 100644 test/ulp_accuracy/mpfr_reference.h create mode 100644 test/ulp_accuracy/ulp_accuracy.cpp diff --git a/doc/CoeffwiseMathFunctionsTable.dox b/doc/CoeffwiseMathFunctionsTable.dox index eb0f36c1b..cda19d985 100644 --- a/doc/CoeffwiseMathFunctionsTable.dox +++ b/doc/CoeffwiseMathFunctionsTable.dox @@ -608,6 +608,133 @@ This also means that, unless specified, if the function \c std::foo is available \n +\section CoeffwiseMathFunctionsAccuracy Accuracy of vectorized math functions + +The following tables summarize the accuracy of %Eigen's vectorized implementations measured +in units of ULP (Unit in the Last Place) on an x86-64 system (Intel Xeon, GCC) with SSE2 SIMD +target. The reference values were computed using +MPFR at 128-bit precision. +Float results are exhaustive over all ~4.28 billion finite representable values. +Double results sample ~2.88 billion values using a geometric stepping factor of 10-6. + +These numbers may differ for other SIMD targets (AVX, AVX512, NEON, SVE, etc.) +since each has its own packet math implementations. Functions marked "delegates to std" +do not have a custom vectorized implementation for the tested SIMD target — they call +the standard library function element-by-element. + +The full histograms for each function can be generated with the \c ulp_accuracy tool +in test/ulp_accuracy/. + +\subsection CoeffwiseMathFunctionsAccuracy_float Float precision + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
FunctionMax |ULP|Mean |ULP|% ExactNotes
Trigonometric
sin 2 0.087 91.3% \ref CoeffwiseMathFunctionsAccuracy_note1 "1"
cos 2 0.088 91.2% \ref CoeffwiseMathFunctionsAccuracy_note1 "1"
tan 5 0.238 77.3% \ref CoeffwiseMathFunctionsAccuracy_note1 "1"
asin 4 0.726 51.3%
acos 4 0.057 95.0%
atan 4 0.061 94.0%
Hyperbolic
sinh 2 0.017 98.3%
cosh 2 0.004 99.6%
tanh 6 0.030 97.2%
asinh 2 0.145 85.5%
acosh 2 0.057 94.3%
atanh 2 0.004 99.6%
Exponential / Logarithmic
exp 1 0.018 98.2%
exp2 6 0.034 97.3%
expm1 5 0.060 94.6%
log 3 0.120 88.0%
log1p 5 0.134 87.5%
log10 2 0.007 99.3%
log2 5 0.005 99.5%
Error / Special
erf 7 0.332 67.5%
erfc 8 0.010 99.2%
lgamma delegates to std \ref CoeffwiseMathFunctionsAccuracy_note3 "3"
Other
logistic 7 0.040 97.0%
sqrt 0 0.000 100% Uses hardware sqrt
cbrt 2 0.552 49.1%
rsqrt 0.114 88.2% \ref CoeffwiseMathFunctionsAccuracy_note4 "4"
+ +\subsection CoeffwiseMathFunctionsAccuracy_double Double precision + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
FunctionMax |ULP|Mean |ULP|% ExactNotes
Trigonometric
sin 13,879,755 0.093 93.2% \ref CoeffwiseMathFunctionsAccuracy_note1 "1"
cos 2,024,130 0.043 98.2% \ref CoeffwiseMathFunctionsAccuracy_note1 "1"
tan 13,879,755 0.128 92.7% \ref CoeffwiseMathFunctionsAccuracy_note1 "1"
asin 1 <0.001 >99.9%
acos 1 <0.001 100%
atan 5 0.013 98.8%
Hyperbolic
sinh 2 0.004 99.6%
cosh 2 0.001 99.9%
tanh 8 0.008 99.3%
asinh 2 0.098 90.2%
acosh 2 0.047 95.3%
atanh 2 <0.001 >99.9%
Exponential / Logarithmic
exp 2 0.001 99.9%
exp2 214 0.107 99.6% \ref CoeffwiseMathFunctionsAccuracy_note2 "2"
expm1 3 0.010 99.1%
log 2 0.147 85.3%
log1p 3 0.097 90.6%
log10 2 0.001 99.9%
log2 2 <0.001 99.9%
Error / Special
erf 0.050 70.5% \ref CoeffwiseMathFunctionsAccuracy_note5 "5"
erfc 11 0.002 99.9%
lgamma delegates to std \ref CoeffwiseMathFunctionsAccuracy_note3 "3"
Other
logistic 3 0.008 99.2%
sqrt 0 0.000 100% Uses hardware sqrt
cbrt 2 0.119 88.1%
rsqrt 0.135 86.5% \ref CoeffwiseMathFunctionsAccuracy_note4 "4"
+ +\subsection CoeffwiseMathFunctionsAccuracy_notes Notes + +\anchor CoeffwiseMathFunctionsAccuracy_note1 +1. sin/cos/tan argument reduction: +%Eigen's vectorized sin, cos, and tan use a Cody-Waite argument reduction scheme that +subtracts multiples of π/2 from the input. For very large arguments (|x| > ~104 +in float, |x| > ~10 in double), this reduction loses precision, producing +occasional large ULP errors. The mean error remains low because most representable +values are small. Applications that need high accuracy for large arguments should +perform argument reduction in user code before calling these functions. + +\anchor CoeffwiseMathFunctionsAccuracy_note2 +2. exp2 double precision: +The exp2 implementation for double shows a max error of 214 ULP near the overflow +boundary (x ≈ 1022). The mean error is still low (0.107 ULP, 99.6% exact), so +the large max error affects only inputs very close to overflow. + +\anchor CoeffwiseMathFunctionsAccuracy_note3 +3. lgamma: +The vectorized lgamma delegates to the standard library function \c std::lgamma for +this SIMD target (SSE2) and therefore has the same accuracy as the platform's C math library. + +\anchor CoeffwiseMathFunctionsAccuracy_note4 +4. rsqrt max=∞: +The infinite max ULP is due to a sign disagreement at a single subnormal input: +rsqrt(-0) returns -∞ in %Eigen but the MPFR reference produces NaN (rsqrt of a negative +value). Ignoring this edge case, the implementation is accurate (< 2 ULP) everywhere else. +For float, 16.8 million subnormal negative inputs (0.4%) also produce ±∞ vs NaN. +The mean error excluding these outliers is well below 1 ULP. + +\anchor CoeffwiseMathFunctionsAccuracy_note5 +5. erf double ∞: +The vectorized erf for double returns NaN for ±∞ instead of the correct ±1. +This produces an infinite max ULP error at a single input value. Excluding ±∞, +the max error is 3 ULP and the mean is 0.050 ULP. + */ } diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 285977443..0555bfe63 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -532,6 +532,18 @@ if(EIGEN_TEST_BUILD_DOCUMENTATION) add_dependencies(buildtests doc) endif() +# ULP accuracy measurement tool (see test/ulp_accuracy/README.md) +find_package(MPFR) +find_package(GMP) +add_executable(ulp_accuracy ulp_accuracy/ulp_accuracy.cpp) +target_compile_options(ulp_accuracy PRIVATE -pthread) +target_link_libraries(ulp_accuracy Eigen3::Eigen ${CMAKE_THREAD_LIBS_INIT} pthread) +if(MPFR_FOUND AND GMP_FOUND) + target_include_directories(ulp_accuracy PRIVATE ${MPFR_INCLUDES} ${GMP_INCLUDES}) + target_link_libraries(ulp_accuracy ${MPFR_LIBRARIES} ${GMP_LIBRARIES}) + target_compile_definitions(ulp_accuracy PRIVATE EIGEN_HAS_MPFR) +endif() + # Register all smoke tests include("EigenSmokeTestList") ei_add_smoke_tests("${ei_smoke_test_list}") diff --git a/test/ulp_accuracy/README.md b/test/ulp_accuracy/README.md new file mode 100644 index 000000000..440d66844 --- /dev/null +++ b/test/ulp_accuracy/README.md @@ -0,0 +1,145 @@ +# ULP Accuracy Measurement Tool + +Standalone tool for measuring the accuracy of Eigen's vectorized math functions +in units of ULP (Unit in the Last Place). Compares Eigen's SIMD implementations +against either MPFR (128-bit high-precision reference) or the standard C++ math +library. + +## Building + +From the Eigen build directory: + +```bash +cd build +cmake .. +cmake --build . --target ulp_accuracy +``` + +If MPFR and GMP are installed, the build automatically enables MPFR support +(`EIGEN_HAS_MPFR`). Without them, only `--ref=std` is available. + +### Installing MPFR (Debian/Ubuntu) + +```bash +sudo apt install libmpfr-dev libgmp-dev +``` + +## Usage + +``` +./test/ulp_accuracy [options] + +Options: + --func=NAME Function to test (required unless --list) + --lo=VAL Start of range (default: -inf) + --hi=VAL End of range (default: +inf) + --double Test double precision (default: float) + --step=EPS Sampling step: advance by (1+EPS)*nextafter(x) + (default: 0 = exhaustive; useful for double, e.g. 1e-6) + --threads=N Number of threads (default: all cores) + --batch=N Batch size for Eigen eval (default: 4096) + --ref=MODE Reference: 'std' (default) or 'mpfr' + --hist_width=N Histogram half-width in ULPs (default: 10) + --list List available functions +``` + +## Examples + +List all supported functions: +```bash +./test/ulp_accuracy --list +``` + +Exhaustive float test of sin against std (tests all ~4.28 billion finite floats): +```bash +./test/ulp_accuracy --func=sin +``` + +Float test against MPFR (more accurate reference, but slower): +```bash +./test/ulp_accuracy --func=sin --ref=mpfr +``` + +Double precision test with geometric sampling (exhaustive is impractical for double): +```bash +./test/ulp_accuracy --func=exp --double --step=1e-6 +``` + +Test a specific range: +```bash +./test/ulp_accuracy --func=sin --lo=0 --hi=6.2832 +``` + +## Output + +The tool prints: + +- **Test configuration**: function, range, reference mode, thread count +- **Max |ULP error|**: worst-case absolute ULP error with the offending input value +- **Mean |ULP error|**: average absolute ULP error across all tested values +- **Signed ULP histogram**: distribution of signed errors showing bias direction + +Example output: +``` +Function: sin (float) +Range: [-inf, inf] +Representable values in range: 4278190082 +Reference: MPFR (128-bit) +Threads: 32 +Batch size: 4096 + +Results: + Values tested: 4278190081 + Time: 529.04 seconds (8.1 Mvalues/s) + Max |ULP error|: 2 + at x = -1.5413464e+38 (Eigen=-0.482218683, ref=-0.482218742) + Mean |ULP error|: 0.0874 + +Signed ULP error histogram [-10, +10]: + -2 : 51988 ( 0.001%) + -1 : 186805349 ( 4.366%) + 0 : 3904475407 ( 91.265%) + 1 : 186805349 ( 4.366%) + 2 : 51988 ( 0.001%) +``` + +## How it works + +1. **Range splitting**: The input range is divided evenly across threads by + splitting the linear ULP space. + +2. **Batched evaluation**: Each thread fills batches of input values, evaluates + them through Eigen's vectorized path (using `Eigen::Array` operations), and + computes reference values one at a time. + +3. **ULP computation**: IEEE 754 bit patterns are mapped to a linear integer + scale where adjacent representable values are adjacent integers. The signed + ULP error is the difference between Eigen's result and the reference on this + scale. Special cases (NaN, infinity mismatches) report infinite error. + +4. **Result reduction**: Per-thread statistics (max error, mean error, histogram) + are merged after all threads complete. + +## Supported functions + +| Category | Functions | +|----------|-----------| +| Trigonometric | sin, cos, tan, asin, acos, atan | +| Hyperbolic | sinh, cosh, tanh, asinh, acosh, atanh | +| Exponential/Log | exp, exp2, expm1, log, log1p, log10, log2 | +| Error/Gamma | erf, erfc, lgamma | +| Other | logistic, sqrt, cbrt, rsqrt | + +## File organization + +- `ulp_accuracy.cpp` — Main tool: ULP computation, worker threads, CLI, result printing +- `mpfr_reference.h` — MPFR reference function wrappers and scalar conversion helpers + +## Performance tips + +- Float exhaustive sweeps test ~4.28 billion values. With `--ref=std` this takes + ~50 seconds per function; with `--ref=mpfr` it takes ~500 seconds (10x slower). +- For double precision, exhaustive testing is impractical. Use `--step=1e-6` to + sample ~2.88 billion values geometrically. +- Thread count defaults to all available cores. MPFR is the bottleneck (single + MPFR call per value per thread), so more cores help significantly. diff --git a/test/ulp_accuracy/mpfr_reference.h b/test/ulp_accuracy/mpfr_reference.h new file mode 100644 index 000000000..37c691155 --- /dev/null +++ b/test/ulp_accuracy/mpfr_reference.h @@ -0,0 +1,78 @@ +// MPFR high-precision reference implementations for ULP accuracy testing. +// +// This header provides MPFR-based reference functions for all math operations +// tested by the ulp_accuracy tool. It also includes scalar conversion helpers +// between float/double and mpfr_t. +// +// Only compiled when EIGEN_HAS_MPFR is defined (i.e., MPFR and GMP are found). + +#ifndef EIGEN_ULP_ACCURACY_MPFR_REFERENCE_H +#define EIGEN_ULP_ACCURACY_MPFR_REFERENCE_H + +#ifdef EIGEN_HAS_MPFR +#include + +// --------------------------------------------------------------------------- +// Scalar <-> mpfr_t conversion +// --------------------------------------------------------------------------- + +template +static void mpfr_set_scalar(mpfr_t rop, Scalar x, mpfr_rnd_t rnd); +template <> +void mpfr_set_scalar(mpfr_t rop, float x, mpfr_rnd_t rnd) { + mpfr_set_flt(rop, x, rnd); +} +template <> +void mpfr_set_scalar(mpfr_t rop, double x, mpfr_rnd_t rnd) { + mpfr_set_d(rop, x, rnd); +} + +template +static Scalar mpfr_get_scalar(mpfr_t op, mpfr_rnd_t rnd); +template <> +float mpfr_get_scalar(mpfr_t op, mpfr_rnd_t rnd) { + return mpfr_get_flt(op, rnd); +} +template <> +double mpfr_get_scalar(mpfr_t op, mpfr_rnd_t rnd) { + return mpfr_get_d(op, rnd); +} + +// --------------------------------------------------------------------------- +// MPFR wrappers for functions not directly provided by libmpfr +// --------------------------------------------------------------------------- + +// logistic(x) = 1 / (1 + exp(-x)) +static int mpfr_logistic(mpfr_t rop, const mpfr_t op, mpfr_rnd_t rnd) { + mpfr_t tmp, one; + mpfr_init2(tmp, mpfr_get_prec(rop)); + mpfr_init2(one, mpfr_get_prec(rop)); + mpfr_set_ui(one, 1, rnd); + mpfr_neg(tmp, op, rnd); + mpfr_exp(tmp, tmp, rnd); + mpfr_add(tmp, tmp, one, rnd); + int ret = mpfr_div(rop, one, tmp, rnd); + mpfr_clear(tmp); + mpfr_clear(one); + return ret; +} + +// rsqrt(x) = 1 / sqrt(x) +static int mpfr_rsqrt(mpfr_t rop, const mpfr_t op, mpfr_rnd_t rnd) { return mpfr_rec_sqrt(rop, op, rnd); } + +// exp2(x) = 2^x +static int mpfr_exp2_wrap(mpfr_t rop, const mpfr_t op, mpfr_rnd_t rnd) { + mpfr_t two; + mpfr_init2(two, mpfr_get_prec(rop)); + mpfr_set_ui(two, 2, rnd); + int ret = mpfr_pow(rop, two, op, rnd); + mpfr_clear(two); + return ret; +} + +// log2(x) — thin wrapper to match the function signature convention. +static int mpfr_log2_wrap(mpfr_t rop, const mpfr_t op, mpfr_rnd_t rnd) { return mpfr_log2(rop, op, rnd); } + +#endif // EIGEN_HAS_MPFR + +#endif // EIGEN_ULP_ACCURACY_MPFR_REFERENCE_H diff --git a/test/ulp_accuracy/ulp_accuracy.cpp b/test/ulp_accuracy/ulp_accuracy.cpp new file mode 100644 index 000000000..658d2343d --- /dev/null +++ b/test/ulp_accuracy/ulp_accuracy.cpp @@ -0,0 +1,611 @@ +// Standalone tool to measure ULP accuracy of Eigen's vectorized math functions +// against either MPFR (high-precision reference) or std C++ math functions. +// +// See README.md in this directory for full documentation. +// +// Usage: +// ./ulp_accuracy --func=sin --lo=0 --hi=6.2832 --threads=16 +// ./ulp_accuracy --func=exp --threads=16 +// ./ulp_accuracy --func=sin --ref=mpfr +// ./ulp_accuracy --func=sin --double --step=1e-6 +// ./ulp_accuracy --list +// +// Build: +// cd build && cmake .. && make ulp_accuracy + +#include +#include + +#ifdef EIGEN_HAS_MPFR +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "mpfr_reference.h" + +// ============================================================================ +// ULP distance computation +// ============================================================================ + +// Maps IEEE 754 bits to a linear integer scale where adjacent representable +// values are adjacent integers. The mapping is strictly monotonic: +// -inf -> most negative, -0.0 -> -1, +0.0 -> 0, +inf -> most positive. +static inline int64_t scalar_to_linear(float x) { + int32_t bits; + std::memcpy(&bits, &x, sizeof(bits)); + if (bits < 0) { + bits = static_cast(INT32_MIN) - bits - 1; + } + return static_cast(bits); +} + +static inline int64_t scalar_to_linear(double x) { + int64_t bits; + std::memcpy(&bits, &x, sizeof(bits)); + if (bits < 0) { + bits = static_cast(INT64_MIN) - bits - 1; + } + return bits; +} + +// Returns (eigen_val - ref_val) in ULP space. +// Positive means Eigen overestimates, negative means it underestimates. +// Returns INT64_MAX for incomparable values (NaN vs number, inf mismatch). +template +static inline int64_t signed_ulp_error(Scalar eigen_val, Scalar ref_val) { + if (eigen_val == ref_val) return 0; // also handles -0.0 == +0.0 + bool e_nan = std::isnan(eigen_val), r_nan = std::isnan(ref_val); + if (e_nan && r_nan) return 0; + if (e_nan || r_nan) return INT64_MAX; + if (std::isinf(eigen_val) || std::isinf(ref_val)) return INT64_MAX; + int64_t a = scalar_to_linear(eigen_val); + int64_t b = scalar_to_linear(ref_val); + // Overflow check for a - b. + if (b > 0 && a < INT64_MIN + b) return INT64_MAX; + if (b < 0 && a > INT64_MAX + b) return INT64_MAX; + return a - b; +} + +// ============================================================================ +// Per-thread accumulator with signed ULP histogram +// ============================================================================ + +template +struct alignas(128) ThreadResult { + int64_t max_abs_ulp = 0; + Scalar max_ulp_at = Scalar(0); + Scalar max_ulp_eigen = Scalar(0); + Scalar max_ulp_ref = Scalar(0); + double abs_ulp_sum = 0.0; + uint64_t count = 0; + + // Signed histogram: bins for errors in [-hist_width, +hist_width], + // plus two overflow bins for < -hist_width and > +hist_width. + // Layout: [<-W] [-W] [-W+1] ... [0] ... [W-1] [W] [>W] + // Total bins = 2*hist_width + 3 + int hist_width = 0; + std::vector hist; + + void init(int w) { + hist_width = w; + hist.assign(2 * w + 3, 0); + } + + void record(int64_t signed_err, Scalar x, Scalar eigen_val, Scalar ref_val) { + int64_t abs_err = signed_err < 0 ? -signed_err : signed_err; + if (signed_err == INT64_MAX) abs_err = INT64_MAX; + + if (abs_err > max_abs_ulp) { + max_abs_ulp = abs_err; + max_ulp_at = x; + max_ulp_eigen = eigen_val; + max_ulp_ref = ref_val; + } + if (abs_err != INT64_MAX) { + abs_ulp_sum += static_cast(abs_err); + } + count++; + + // Histogram bin. + int bin; + if (signed_err == INT64_MAX || signed_err > hist_width) { + bin = 2 * hist_width + 2; // overflow high + } else if (signed_err < -hist_width) { + bin = 0; // overflow low + } else { + bin = static_cast(signed_err) + hist_width + 1; + } + hist[bin]++; + } +}; + +// ============================================================================ +// Function registry +// ============================================================================ + +template +struct FuncEntry { + using ArrayType = Eigen::Array; + using EigenEval = std::function, const Eigen::Ref&)>; + using StdEval = std::function; + +#ifdef EIGEN_HAS_MPFR + using MpfrEval = std::function; +#endif + + std::string name; + EigenEval eigen_eval; + StdEval std_eval; +#ifdef EIGEN_HAS_MPFR + MpfrEval mpfr_eval; +#endif + Scalar default_lo; + Scalar default_hi; +}; + +// std::logistic is not part of the C++ standard library. +template +static Scalar std_logistic(Scalar x) { + if (x >= 0) { + Scalar e = std::exp(-x); + return Scalar(1) / (Scalar(1) + e); + } else { + Scalar e = std::exp(x); + return e / (Scalar(1) + e); + } +} + +template +static std::vector> build_func_table() { + using ArrayType = Eigen::Array; + std::vector> table; + +#ifdef EIGEN_HAS_MPFR +#define ADD_FUNC(fname, eigen_expr, std_expr, mpfr_fn, lo, hi) \ + table.push_back({#fname, [](Eigen::Ref out, const Eigen::Ref& a) { out = eigen_expr; }, \ + [](Scalar x) -> Scalar { return std_expr; }, \ + [](mpfr_t r, const mpfr_t o, mpfr_rnd_t d) { return mpfr_fn(r, o, d); }, lo, hi}) +#else +#define ADD_FUNC(fname, eigen_expr, std_expr, mpfr_fn, lo, hi) \ + table.push_back({#fname, [](Eigen::Ref out, const Eigen::Ref& a) { out = eigen_expr; }, \ + [](Scalar x) -> Scalar { return std_expr; }, lo, hi}) +#endif + + constexpr Scalar kInf = std::numeric_limits::infinity(); + + // Trigonometric + // clang-format off + ADD_FUNC(sin, a.sin(), std::sin(x), mpfr_sin, -kInf, kInf); + ADD_FUNC(cos, a.cos(), std::cos(x), mpfr_cos, -kInf, kInf); + ADD_FUNC(tan, a.tan(), std::tan(x), mpfr_tan, -kInf, kInf); + ADD_FUNC(asin, a.asin(), std::asin(x), mpfr_asin, -kInf, kInf); + ADD_FUNC(acos, a.acos(), std::acos(x), mpfr_acos, -kInf, kInf); + ADD_FUNC(atan, a.atan(), std::atan(x), mpfr_atan, -kInf, kInf); + + // Hyperbolic + ADD_FUNC(sinh, a.sinh(), std::sinh(x), mpfr_sinh, -kInf, kInf); + ADD_FUNC(cosh, a.cosh(), std::cosh(x), mpfr_cosh, -kInf, kInf); + ADD_FUNC(tanh, a.tanh(), std::tanh(x), mpfr_tanh, -kInf, kInf); + ADD_FUNC(asinh, a.asinh(), std::asinh(x), mpfr_asinh, -kInf, kInf); + ADD_FUNC(acosh, a.acosh(), std::acosh(x), mpfr_acosh, -kInf, kInf); + ADD_FUNC(atanh, a.atanh(), std::atanh(x), mpfr_atanh, -kInf, kInf); + + // Exponential / Logarithmic + ADD_FUNC(exp, a.exp(), std::exp(x), mpfr_exp, -kInf, kInf); + ADD_FUNC(exp2, a.exp2(), std::exp2(x), mpfr_exp2_wrap, -kInf, kInf); + ADD_FUNC(expm1, a.expm1(), std::expm1(x), mpfr_expm1, -kInf, kInf); + ADD_FUNC(log, a.log(), std::log(x), mpfr_log, -kInf, kInf); + ADD_FUNC(log1p, a.log1p(), std::log1p(x), mpfr_log1p, -kInf, kInf); + ADD_FUNC(log10, a.log10(), std::log10(x), mpfr_log10, -kInf, kInf); + ADD_FUNC(log2, a.log2(), std::log2(x), mpfr_log2_wrap, -kInf, kInf); + + // Error / Gamma + ADD_FUNC(erf, a.erf(), std::erf(x), mpfr_erf, -kInf, kInf); + ADD_FUNC(erfc, a.erfc(), std::erfc(x), mpfr_erfc, -kInf, kInf); + ADD_FUNC(lgamma, a.lgamma(), std::lgamma(x), mpfr_lngamma, -kInf, kInf); + + // Other + ADD_FUNC(logistic, a.logistic(), std_logistic(x), mpfr_logistic, -kInf, kInf); + ADD_FUNC(sqrt, a.sqrt(), std::sqrt(x), mpfr_sqrt, -kInf, kInf); + ADD_FUNC(cbrt, a.cbrt(), std::cbrt(x), mpfr_cbrt, -kInf, kInf); + ADD_FUNC(rsqrt, a.rsqrt(), Scalar(1)/std::sqrt(x), mpfr_rsqrt, -kInf, kInf); + // clang-format on + +#undef ADD_FUNC + return table; +} + +// ============================================================================ +// Range iteration helpers +// ============================================================================ + +// Advances x toward +inf by at least 1 ULP. When step_eps > 0, additionally +// jumps by a relative factor of (1 + step_eps) to sample the range sparsely. +template +static inline Scalar advance_by_step(Scalar x, double step_eps) { + Scalar next = std::nextafter(x, std::numeric_limits::infinity()); + if (step_eps > 0.0 && std::isfinite(next)) { + // Try to jump further by a relative amount. + Scalar jumped = next > 0 ? next * static_cast(1.0 + step_eps) : next / static_cast(1.0 + step_eps); + // Use the jump only if it actually advances further (handles denormal stalling). + if (jumped > next) next = jumped; + } + return next; +} + +// Counts the number of representable scalars in [lo, hi]. +template +static uint64_t count_scalars_in_range(Scalar lo, Scalar hi) { + if (lo > hi) return 0; + uint64_t lo_u = static_cast(scalar_to_linear(lo)); + uint64_t hi_u = static_cast(scalar_to_linear(hi)); + uint64_t diff = hi_u - lo_u; + return diff == UINT64_MAX ? UINT64_MAX : diff + 1; +} + +// Advances a scalar by n ULPs in the linear representation. +static float advance_scalar(float x, uint64_t n) { + int64_t lin = scalar_to_linear(x); + lin += static_cast(n); + int32_t ibits; + if (lin < 0) { + ibits = static_cast(INT32_MIN) - static_cast(lin) - 1; + } else { + ibits = static_cast(lin); + } + float result; + std::memcpy(&result, &ibits, sizeof(result)); + return result; +} + +static double advance_scalar(double x, uint64_t n) { + int64_t lin = scalar_to_linear(x); + lin += static_cast(n); + int64_t ibits; + if (lin < 0) { + ibits = static_cast(INT64_MIN) - lin - 1; + } else { + ibits = lin; + } + double result; + std::memcpy(&result, &ibits, sizeof(result)); + return result; +} + +// ============================================================================ +// Worker thread: evaluates Eigen and reference over a subrange +// ============================================================================ + +template +static void worker(const FuncEntry& func, Scalar lo, Scalar hi, int batch_size, bool use_mpfr, double step_eps, + ThreadResult& result) { + using ArrayType = Eigen::Array; + ArrayType input(batch_size); + ArrayType eigen_out(batch_size); + std::vector ref_out(batch_size); + +#ifdef EIGEN_HAS_MPFR + mpfr_t mp_in, mp_out; + if (use_mpfr) { + mpfr_init2(mp_in, 128); + mpfr_init2(mp_out, 128); + } +#else + (void)use_mpfr; +#endif + + auto process_batch = [&](int n, const ArrayType& in, const ArrayType& eig) { + for (int i = 0; i < n; i++) { +#ifdef EIGEN_HAS_MPFR + if (use_mpfr) { + mpfr_set_scalar(mp_in, in[i], MPFR_RNDN); + func.mpfr_eval(mp_out, mp_in, MPFR_RNDN); + ref_out[i] = mpfr_get_scalar(mp_out, MPFR_RNDN); + } else +#endif + { + ref_out[i] = func.std_eval(in[i]); + } + } + for (int i = 0; i < n; i++) { + int64_t err = signed_ulp_error(eig[i], ref_out[i]); + result.record(err, in[i], eig[i], ref_out[i]); + } + }; + + int idx = 0; + Scalar x = lo; + for (;;) { + input[idx] = x; + idx++; + + if (idx == batch_size) { + func.eigen_eval(eigen_out, input); + process_batch(batch_size, input, eigen_out); + idx = 0; + } + + if (x >= hi) break; + Scalar next = advance_by_step(x, step_eps); + x = (next > hi) ? hi : next; + } + + // Process remaining partial batch. + if (idx > 0) { + auto partial_in = input.head(idx); + auto partial_eigen = eigen_out.head(idx); + func.eigen_eval(partial_eigen, partial_in); + process_batch(idx, input, eigen_out); + } + +#ifdef EIGEN_HAS_MPFR + if (use_mpfr) { + mpfr_clear(mp_in); + mpfr_clear(mp_out); + } +#endif +} + +// ============================================================================ +// Test driver: splits range across threads and prints results +// ============================================================================ + +struct Options { + std::string func_name; + double lo = std::numeric_limits::quiet_NaN(); + double hi = std::numeric_limits::quiet_NaN(); + int num_threads; + int batch_size = 4096; + int hist_width = 10; + bool use_mpfr = false; + bool use_double = false; + double step_eps = 0.0; + bool list_funcs = false; +}; + +template +static int run_test(const Options& opts) { + const int kDigits = std::is_same::value ? 9 : 17; + const char* kTypeName = std::is_same::value ? "float" : "double"; + + auto table = build_func_table(); + + if (opts.list_funcs) { + std::printf("Available functions:\n"); + for (const auto& f : table) { + std::printf(" %s\n", f.name.c_str()); + } + return 0; + } + + // Look up the requested function. + const FuncEntry* entry = nullptr; + for (const auto& f : table) { + if (f.name == opts.func_name) { + entry = &f; + break; + } + } + if (!entry) { + std::fprintf(stderr, "Error: unknown function '%s' (use --list to see available functions)\n", + opts.func_name.c_str()); + return 1; + } + + Scalar lo = std::isnan(opts.lo) ? entry->default_lo : static_cast(opts.lo); + Scalar hi = std::isnan(opts.hi) ? entry->default_hi : static_cast(opts.hi); + uint64_t total_scalars = count_scalars_in_range(lo, hi); + int num_threads = opts.num_threads; + + // Print test configuration. + std::printf("Function: %s (%s)\n", opts.func_name.c_str(), kTypeName); + std::printf("Range: [%.*g, %.*g]\n", kDigits, double(lo), kDigits, double(hi)); + if (opts.step_eps > 0.0) { + std::printf("Sampling step: (1 + %g) * nextafter(x)\n", opts.step_eps); + } else { + std::printf("Representable values in range: %lu\n", static_cast(total_scalars)); + } + std::printf("Reference: %s\n", opts.use_mpfr ? "MPFR (128-bit)" : "std C++ math"); + std::printf("Threads: %d\n", num_threads); + std::printf("Batch size: %d\n", opts.batch_size); + std::printf("\n"); + std::fflush(stdout); + + // Split range across threads. + if (total_scalars > 0 && static_cast(num_threads) > total_scalars) { + num_threads = static_cast(total_scalars); + } + if (num_threads < 1) num_threads = 1; + + // Heap-allocate each ThreadResult separately to avoid false sharing. + std::vector>> results; + results.reserve(num_threads); + for (int t = 0; t < num_threads; t++) { + results.push_back(std::make_unique>()); + results.back()->init(opts.hist_width); + } + + std::vector threads; + uint64_t scalars_per_thread = total_scalars / num_threads; + Scalar chunk_lo = lo; + + auto start_time = std::chrono::steady_clock::now(); + + for (int t = 0; t < num_threads; t++) { + Scalar chunk_hi; + if (t == num_threads - 1) { + chunk_hi = hi; + } else { + chunk_hi = advance_scalar(chunk_lo, scalars_per_thread - 1); + } + threads.emplace_back(worker, std::cref(*entry), chunk_lo, chunk_hi, opts.batch_size, opts.use_mpfr, + opts.step_eps, std::ref(*results[t])); + chunk_lo = std::nextafter(chunk_hi, std::numeric_limits::infinity()); + } + + for (auto& t : threads) t.join(); + auto end_time = std::chrono::steady_clock::now(); + double elapsed = std::chrono::duration(end_time - start_time).count(); + + // Reduce per-thread results. + ThreadResult global; + global.init(opts.hist_width); + for (int t = 0; t < num_threads; t++) { + const auto& r = *results[t]; + if (r.max_abs_ulp > global.max_abs_ulp) { + global.max_abs_ulp = r.max_abs_ulp; + global.max_ulp_at = r.max_ulp_at; + global.max_ulp_eigen = r.max_ulp_eigen; + global.max_ulp_ref = r.max_ulp_ref; + } + global.abs_ulp_sum += r.abs_ulp_sum; + global.count += r.count; + for (size_t b = 0; b < global.hist.size(); b++) { + global.hist[b] += r.hist[b]; + } + } + + double mean_ulp = global.count > 0 ? global.abs_ulp_sum / global.count : 0.0; + + // Print results. + std::printf("Results:\n"); + std::printf(" Values tested: %lu\n", static_cast(global.count)); + std::printf(" Time: %.2f seconds (%.1f Mvalues/s)\n", elapsed, global.count / elapsed / 1e6); + if (global.max_abs_ulp == INT64_MAX) { + std::printf(" Max |ULP error|: inf\n"); + } else { + std::printf(" Max |ULP error|: %ld\n", static_cast(global.max_abs_ulp)); + } + std::printf(" at x = %.*g (Eigen=%.*g, ref=%.*g)\n", kDigits, double(global.max_ulp_at), kDigits, + double(global.max_ulp_eigen), kDigits, double(global.max_ulp_ref)); + std::printf(" Mean |ULP error|: %.4f\n", mean_ulp); + std::printf("\n"); + + // Print signed error histogram. + std::printf("Signed ULP error histogram [-%d, +%d]:\n", opts.hist_width, opts.hist_width); + int nbins = 2 * opts.hist_width + 3; + for (int b = 0; b < nbins; b++) { + if (global.hist[b] == 0) continue; + double pct = 100.0 * global.hist[b] / global.count; + if (b == 0) { + std::printf(" <%-4d: %12lu (%7.3f%%)\n", -opts.hist_width, static_cast(global.hist[b]), pct); + } else if (b == nbins - 1) { + std::printf(" >%-4d: %12lu (%7.3f%%)\n", opts.hist_width, static_cast(global.hist[b]), pct); + } else { + int err = b - opts.hist_width - 1; + std::printf(" %-5d: %12lu (%7.3f%%)\n", err, static_cast(global.hist[b]), pct); + } + } + + return 0; +} + +// ============================================================================ +// Command-line parsing +// ============================================================================ + +static void print_usage() { + std::printf( + "Usage: ulp_accuracy [options]\n" + " --func=NAME Function to test (required unless --list)\n" + " --lo=VAL Start of range (default: -inf)\n" + " --hi=VAL End of range (default: +inf)\n" + " --double Test double precision (default: float)\n" + " --step=EPS Sampling step: advance by (1+EPS)*nextafter(x)\n" + " (default: 0 = exhaustive; useful for double, e.g. 1e-6)\n" + " --threads=N Number of threads (default: all cores)\n" + " --batch=N Batch size for Eigen eval (default: 4096)\n" + " --ref=MODE Reference: 'std' (default) or 'mpfr'\n" + " --hist_width=N Histogram half-width in ULPs (default: 10)\n" + " --list List available functions\n"); +} + +int main(int argc, char* argv[]) { + Options opts; + opts.num_threads = static_cast(std::thread::hardware_concurrency()); + if (opts.num_threads == 0) opts.num_threads = 4; + std::string ref_mode; + + for (int i = 1; i < argc; i++) { + std::string arg = argv[i]; + if (arg.substr(0, 7) == "--func=") { + opts.func_name = arg.substr(7); + } else if (arg.substr(0, 5) == "--lo=") { + std::string val = arg.substr(5); + if (val == "inf") + opts.lo = std::numeric_limits::infinity(); + else if (val == "-inf") + opts.lo = -std::numeric_limits::infinity(); + else + opts.lo = std::stod(val); + } else if (arg.substr(0, 5) == "--hi=") { + std::string val = arg.substr(5); + if (val == "inf") + opts.hi = std::numeric_limits::infinity(); + else if (val == "-inf") + opts.hi = -std::numeric_limits::infinity(); + else + opts.hi = std::stod(val); + } else if (arg.substr(0, 10) == "--threads=") { + opts.num_threads = std::stoi(arg.substr(10)); + } else if (arg.substr(0, 8) == "--batch=") { + opts.batch_size = std::stoi(arg.substr(8)); + } else if (arg.substr(0, 6) == "--ref=") { + ref_mode = arg.substr(6); + } else if (arg.substr(0, 13) == "--hist_width=") { + opts.hist_width = std::stoi(arg.substr(13)); + } else if (arg.substr(0, 7) == "--step=") { + opts.step_eps = std::stod(arg.substr(7)); + } else if (arg == "--double") { + opts.use_double = true; + } else if (arg == "--list") { + opts.list_funcs = true; + } else if (arg == "--help" || arg == "-h") { + print_usage(); + return 0; + } else { + std::fprintf(stderr, "Unknown option: %s\n", arg.c_str()); + print_usage(); + return 1; + } + } + + // Determine reference mode (default: std). + if (ref_mode.empty() || ref_mode == "std") { + opts.use_mpfr = false; + } else if (ref_mode == "mpfr") { +#ifdef EIGEN_HAS_MPFR + opts.use_mpfr = true; +#else + std::fprintf(stderr, "Error: MPFR support not compiled in. Use --ref=std or rebuild with MPFR.\n"); + return 1; +#endif + } else { + std::fprintf(stderr, "Error: --ref must be 'std' or 'mpfr'\n"); + return 1; + } + + if (!opts.list_funcs && opts.func_name.empty()) { + std::fprintf(stderr, "Error: --func=NAME is required (use --list to see available functions)\n"); + return 1; + } + + if (opts.use_double) { + return run_test(opts); + } else { + return run_test(opts); + } +}