// Standalone tool to measure ULP accuracy of Eigen's vectorized math functions // against either MPFR (high-precision reference) or std C++ math functions. // // See README.md in this directory for full documentation. // // Usage: // ./ulp_accuracy --func=sin --lo=0 --hi=6.2832 --threads=16 // ./ulp_accuracy --func=exp --threads=16 // ./ulp_accuracy --func=sin --ref=mpfr // ./ulp_accuracy --func=sin --double --step=1e-6 // ./ulp_accuracy --list // // Build: // cd build && cmake .. && make ulp_accuracy #include #include #ifdef EIGEN_HAS_MPFR #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include "mpfr_reference.h" // ============================================================================ // ULP distance computation // ============================================================================ // Maps IEEE 754 bits to a linear integer scale where adjacent representable // values are adjacent integers. The mapping is strictly monotonic: // -inf -> most negative, -0.0 -> -1, +0.0 -> 0, +inf -> most positive. static inline int64_t scalar_to_linear(float x) { int32_t bits; std::memcpy(&bits, &x, sizeof(bits)); if (bits < 0) { bits = static_cast(INT32_MIN) - bits - 1; } return static_cast(bits); } static inline int64_t scalar_to_linear(double x) { int64_t bits; std::memcpy(&bits, &x, sizeof(bits)); if (bits < 0) { bits = static_cast(INT64_MIN) - bits - 1; } return bits; } // Returns (eigen_val - ref_val) in ULP space. // Positive means Eigen overestimates, negative means it underestimates. // Returns INT64_MAX for incomparable values (NaN vs number, inf mismatch). template static inline int64_t signed_ulp_error(Scalar eigen_val, Scalar ref_val) { if (eigen_val == ref_val) return 0; // also handles -0.0 == +0.0 bool e_nan = std::isnan(eigen_val), r_nan = std::isnan(ref_val); if (e_nan && r_nan) return 0; if (e_nan || r_nan) return INT64_MAX; if (std::isinf(eigen_val) || std::isinf(ref_val)) return INT64_MAX; int64_t a = scalar_to_linear(eigen_val); int64_t b = scalar_to_linear(ref_val); // Overflow check for a - b. if (b > 0 && a < INT64_MIN + b) return INT64_MAX; if (b < 0 && a > INT64_MAX + b) return INT64_MAX; return a - b; } // ============================================================================ // Per-thread accumulator with signed ULP histogram // ============================================================================ template struct alignas(128) ThreadResult { int64_t max_abs_ulp = 0; Scalar max_ulp_at = Scalar(0); Scalar max_ulp_eigen = Scalar(0); Scalar max_ulp_ref = Scalar(0); double abs_ulp_sum = 0.0; uint64_t count = 0; // Signed histogram: bins for errors in [-hist_width, +hist_width], // plus two overflow bins for < -hist_width and > +hist_width. // Layout: [<-W] [-W] [-W+1] ... [0] ... [W-1] [W] [>W] // Total bins = 2*hist_width + 3 int hist_width = 0; std::vector hist; void init(int w) { hist_width = w; hist.assign(2 * w + 3, 0); } void record(int64_t signed_err, Scalar x, Scalar eigen_val, Scalar ref_val) { int64_t abs_err = signed_err < 0 ? -signed_err : signed_err; if (signed_err == INT64_MAX) abs_err = INT64_MAX; if (abs_err > max_abs_ulp) { max_abs_ulp = abs_err; max_ulp_at = x; max_ulp_eigen = eigen_val; max_ulp_ref = ref_val; } if (abs_err != INT64_MAX) { abs_ulp_sum += static_cast(abs_err); } count++; // Histogram bin. int bin; if (signed_err == INT64_MAX || signed_err > hist_width) { bin = 2 * hist_width + 2; // overflow high } else if (signed_err < -hist_width) { bin = 0; // overflow low } else { bin = static_cast(signed_err) + hist_width + 1; } hist[bin]++; } }; // ============================================================================ // Function registry // ============================================================================ template struct FuncEntry { using ArrayType = Eigen::Array; using EigenEval = std::function, const Eigen::Ref&)>; using StdEval = std::function; #ifdef EIGEN_HAS_MPFR using MpfrEval = std::function; #endif std::string name; EigenEval eigen_eval; StdEval std_eval; #ifdef EIGEN_HAS_MPFR MpfrEval mpfr_eval; #endif Scalar default_lo; Scalar default_hi; }; // std::logistic is not part of the C++ standard library. template static Scalar std_logistic(Scalar x) { if (x >= 0) { Scalar e = std::exp(-x); return Scalar(1) / (Scalar(1) + e); } else { Scalar e = std::exp(x); return e / (Scalar(1) + e); } } template static std::vector> build_func_table() { using ArrayType = Eigen::Array; std::vector> table; #ifdef EIGEN_HAS_MPFR #define ADD_FUNC(fname, eigen_expr, std_expr, mpfr_fn, lo, hi) \ table.push_back({#fname, [](Eigen::Ref out, const Eigen::Ref& a) { out = eigen_expr; }, \ [](Scalar x) -> Scalar { return std_expr; }, \ [](mpfr_t r, const mpfr_t o, mpfr_rnd_t d) { return mpfr_fn(r, o, d); }, lo, hi}) #else #define ADD_FUNC(fname, eigen_expr, std_expr, mpfr_fn, lo, hi) \ table.push_back({#fname, [](Eigen::Ref out, const Eigen::Ref& a) { out = eigen_expr; }, \ [](Scalar x) -> Scalar { return std_expr; }, lo, hi}) #endif constexpr Scalar kInf = std::numeric_limits::infinity(); // Trigonometric // clang-format off ADD_FUNC(sin, a.sin(), std::sin(x), mpfr_sin, -kInf, kInf); ADD_FUNC(cos, a.cos(), std::cos(x), mpfr_cos, -kInf, kInf); ADD_FUNC(tan, a.tan(), std::tan(x), mpfr_tan, -kInf, kInf); ADD_FUNC(asin, a.asin(), std::asin(x), mpfr_asin, -kInf, kInf); ADD_FUNC(acos, a.acos(), std::acos(x), mpfr_acos, -kInf, kInf); ADD_FUNC(atan, a.atan(), std::atan(x), mpfr_atan, -kInf, kInf); // Hyperbolic ADD_FUNC(sinh, a.sinh(), std::sinh(x), mpfr_sinh, -kInf, kInf); ADD_FUNC(cosh, a.cosh(), std::cosh(x), mpfr_cosh, -kInf, kInf); ADD_FUNC(tanh, a.tanh(), std::tanh(x), mpfr_tanh, -kInf, kInf); ADD_FUNC(asinh, a.asinh(), std::asinh(x), mpfr_asinh, -kInf, kInf); ADD_FUNC(acosh, a.acosh(), std::acosh(x), mpfr_acosh, -kInf, kInf); ADD_FUNC(atanh, a.atanh(), std::atanh(x), mpfr_atanh, -kInf, kInf); // Exponential / Logarithmic ADD_FUNC(exp, a.exp(), std::exp(x), mpfr_exp, -kInf, kInf); ADD_FUNC(exp2, a.exp2(), std::exp2(x), mpfr_exp2_wrap, -kInf, kInf); ADD_FUNC(expm1, a.expm1(), std::expm1(x), mpfr_expm1, -kInf, kInf); ADD_FUNC(log, a.log(), std::log(x), mpfr_log, -kInf, kInf); ADD_FUNC(log1p, a.log1p(), std::log1p(x), mpfr_log1p, -kInf, kInf); ADD_FUNC(log10, a.log10(), std::log10(x), mpfr_log10, -kInf, kInf); ADD_FUNC(log2, a.log2(), std::log2(x), mpfr_log2_wrap, -kInf, kInf); // Error / Gamma ADD_FUNC(erf, a.erf(), std::erf(x), mpfr_erf, -kInf, kInf); ADD_FUNC(erfc, a.erfc(), std::erfc(x), mpfr_erfc, -kInf, kInf); ADD_FUNC(lgamma, a.lgamma(), std::lgamma(x), mpfr_lngamma, -kInf, kInf); // Other ADD_FUNC(logistic, a.logistic(), std_logistic(x), mpfr_logistic, -kInf, kInf); ADD_FUNC(sqrt, a.sqrt(), std::sqrt(x), mpfr_sqrt, -kInf, kInf); ADD_FUNC(cbrt, a.cbrt(), std::cbrt(x), mpfr_cbrt, -kInf, kInf); ADD_FUNC(rsqrt, a.rsqrt(), Scalar(1)/std::sqrt(x), mpfr_rsqrt, -kInf, kInf); // clang-format on #undef ADD_FUNC return table; } // ============================================================================ // Range iteration helpers // ============================================================================ // Advances x toward +inf by at least 1 ULP. When step_eps > 0, additionally // jumps by a relative factor of (1 + step_eps) to sample the range sparsely. template static inline Scalar advance_by_step(Scalar x, double step_eps) { Scalar next = std::nextafter(x, std::numeric_limits::infinity()); if (step_eps > 0.0 && std::isfinite(next)) { // Try to jump further by a relative amount. Scalar jumped = next > 0 ? next * static_cast(1.0 + step_eps) : next / static_cast(1.0 + step_eps); // Use the jump only if it actually advances further (handles denormal stalling). if (jumped > next) next = jumped; } return next; } // Counts the number of representable scalars in [lo, hi]. template static uint64_t count_scalars_in_range(Scalar lo, Scalar hi) { if (lo > hi) return 0; uint64_t lo_u = static_cast(scalar_to_linear(lo)); uint64_t hi_u = static_cast(scalar_to_linear(hi)); uint64_t diff = hi_u - lo_u; return diff == UINT64_MAX ? UINT64_MAX : diff + 1; } // Advances a scalar by n ULPs in the linear representation. static float advance_scalar(float x, uint64_t n) { int64_t lin = scalar_to_linear(x); lin += static_cast(n); int32_t ibits; if (lin < 0) { ibits = static_cast(INT32_MIN) - static_cast(lin) - 1; } else { ibits = static_cast(lin); } float result; std::memcpy(&result, &ibits, sizeof(result)); return result; } static double advance_scalar(double x, uint64_t n) { int64_t lin = scalar_to_linear(x); lin += static_cast(n); int64_t ibits; if (lin < 0) { ibits = static_cast(INT64_MIN) - lin - 1; } else { ibits = lin; } double result; std::memcpy(&result, &ibits, sizeof(result)); return result; } // ============================================================================ // Worker thread: evaluates Eigen and reference over a subrange // ============================================================================ template static void worker(const FuncEntry& func, Scalar lo, Scalar hi, int batch_size, bool use_mpfr, double step_eps, ThreadResult& result) { using ArrayType = Eigen::Array; ArrayType input(batch_size); ArrayType eigen_out(batch_size); std::vector ref_out(batch_size); #ifdef EIGEN_HAS_MPFR mpfr_t mp_in, mp_out; if (use_mpfr) { mpfr_init2(mp_in, 128); mpfr_init2(mp_out, 128); } #else (void)use_mpfr; #endif auto process_batch = [&](int n, const ArrayType& in, const ArrayType& eig) { for (int i = 0; i < n; i++) { #ifdef EIGEN_HAS_MPFR if (use_mpfr) { mpfr_set_scalar(mp_in, in[i], MPFR_RNDN); func.mpfr_eval(mp_out, mp_in, MPFR_RNDN); ref_out[i] = mpfr_get_scalar(mp_out, MPFR_RNDN); } else #endif { ref_out[i] = func.std_eval(in[i]); } } for (int i = 0; i < n; i++) { int64_t err = signed_ulp_error(eig[i], ref_out[i]); result.record(err, in[i], eig[i], ref_out[i]); } }; int idx = 0; Scalar x = lo; for (;;) { input[idx] = x; idx++; if (idx == batch_size) { func.eigen_eval(eigen_out, input); process_batch(batch_size, input, eigen_out); idx = 0; } if (x >= hi) break; Scalar next = advance_by_step(x, step_eps); x = (next > hi) ? hi : next; } // Process remaining partial batch. if (idx > 0) { auto partial_in = input.head(idx); auto partial_eigen = eigen_out.head(idx); func.eigen_eval(partial_eigen, partial_in); process_batch(idx, input, eigen_out); } #ifdef EIGEN_HAS_MPFR if (use_mpfr) { mpfr_clear(mp_in); mpfr_clear(mp_out); } #endif } // ============================================================================ // Test driver: splits range across threads and prints results // ============================================================================ struct Options { std::string func_name; double lo = std::numeric_limits::quiet_NaN(); double hi = std::numeric_limits::quiet_NaN(); int num_threads; int batch_size = 4096; int hist_width = 10; bool use_mpfr = false; bool use_double = false; double step_eps = 0.0; bool list_funcs = false; }; template static int run_test(const Options& opts) { const int kDigits = std::is_same::value ? 9 : 17; const char* kTypeName = std::is_same::value ? "float" : "double"; auto table = build_func_table(); if (opts.list_funcs) { std::printf("Available functions:\n"); for (const auto& f : table) { std::printf(" %s\n", f.name.c_str()); } return 0; } // Look up the requested function. const FuncEntry* entry = nullptr; for (const auto& f : table) { if (f.name == opts.func_name) { entry = &f; break; } } if (!entry) { std::fprintf(stderr, "Error: unknown function '%s' (use --list to see available functions)\n", opts.func_name.c_str()); return 1; } Scalar lo = std::isnan(opts.lo) ? entry->default_lo : static_cast(opts.lo); Scalar hi = std::isnan(opts.hi) ? entry->default_hi : static_cast(opts.hi); uint64_t total_scalars = count_scalars_in_range(lo, hi); int num_threads = opts.num_threads; // Print test configuration. std::printf("Function: %s (%s)\n", opts.func_name.c_str(), kTypeName); std::printf("Range: [%.*g, %.*g]\n", kDigits, double(lo), kDigits, double(hi)); if (opts.step_eps > 0.0) { std::printf("Sampling step: (1 + %g) * nextafter(x)\n", opts.step_eps); } else { std::printf("Representable values in range: %lu\n", static_cast(total_scalars)); } std::printf("Reference: %s\n", opts.use_mpfr ? "MPFR (128-bit)" : "std C++ math"); std::printf("Threads: %d\n", num_threads); std::printf("Batch size: %d\n", opts.batch_size); std::printf("\n"); std::fflush(stdout); // Split range across threads. if (total_scalars > 0 && static_cast(num_threads) > total_scalars) { num_threads = static_cast(total_scalars); } if (num_threads < 1) num_threads = 1; // Heap-allocate each ThreadResult separately to avoid false sharing. std::vector>> results; results.reserve(num_threads); for (int t = 0; t < num_threads; t++) { results.push_back(std::make_unique>()); results.back()->init(opts.hist_width); } std::vector threads; uint64_t scalars_per_thread = total_scalars / num_threads; Scalar chunk_lo = lo; auto start_time = std::chrono::steady_clock::now(); for (int t = 0; t < num_threads; t++) { Scalar chunk_hi; if (t == num_threads - 1) { chunk_hi = hi; } else { chunk_hi = advance_scalar(chunk_lo, scalars_per_thread - 1); } threads.emplace_back(worker, std::cref(*entry), chunk_lo, chunk_hi, opts.batch_size, opts.use_mpfr, opts.step_eps, std::ref(*results[t])); chunk_lo = std::nextafter(chunk_hi, std::numeric_limits::infinity()); } for (auto& t : threads) t.join(); auto end_time = std::chrono::steady_clock::now(); double elapsed = std::chrono::duration(end_time - start_time).count(); // Reduce per-thread results. ThreadResult global; global.init(opts.hist_width); for (int t = 0; t < num_threads; t++) { const auto& r = *results[t]; if (r.max_abs_ulp > global.max_abs_ulp) { global.max_abs_ulp = r.max_abs_ulp; global.max_ulp_at = r.max_ulp_at; global.max_ulp_eigen = r.max_ulp_eigen; global.max_ulp_ref = r.max_ulp_ref; } global.abs_ulp_sum += r.abs_ulp_sum; global.count += r.count; for (size_t b = 0; b < global.hist.size(); b++) { global.hist[b] += r.hist[b]; } } double mean_ulp = global.count > 0 ? global.abs_ulp_sum / global.count : 0.0; // Print results. std::printf("Results:\n"); std::printf(" Values tested: %lu\n", static_cast(global.count)); std::printf(" Time: %.2f seconds (%.1f Mvalues/s)\n", elapsed, global.count / elapsed / 1e6); if (global.max_abs_ulp == INT64_MAX) { std::printf(" Max |ULP error|: inf\n"); } else { std::printf(" Max |ULP error|: %ld\n", static_cast(global.max_abs_ulp)); } std::printf(" at x = %.*g (Eigen=%.*g, ref=%.*g)\n", kDigits, double(global.max_ulp_at), kDigits, double(global.max_ulp_eigen), kDigits, double(global.max_ulp_ref)); std::printf(" Mean |ULP error|: %.4f\n", mean_ulp); std::printf("\n"); // Print signed error histogram. std::printf("Signed ULP error histogram [-%d, +%d]:\n", opts.hist_width, opts.hist_width); int nbins = 2 * opts.hist_width + 3; for (int b = 0; b < nbins; b++) { if (global.hist[b] == 0) continue; double pct = 100.0 * global.hist[b] / global.count; if (b == 0) { std::printf(" <%-4d: %12lu (%7.3f%%)\n", -opts.hist_width, static_cast(global.hist[b]), pct); } else if (b == nbins - 1) { std::printf(" >%-4d: %12lu (%7.3f%%)\n", opts.hist_width, static_cast(global.hist[b]), pct); } else { int err = b - opts.hist_width - 1; std::printf(" %-5d: %12lu (%7.3f%%)\n", err, static_cast(global.hist[b]), pct); } } return 0; } // ============================================================================ // Command-line parsing // ============================================================================ static void print_usage() { std::printf( "Usage: ulp_accuracy [options]\n" " --func=NAME Function to test (required unless --list)\n" " --lo=VAL Start of range (default: -inf)\n" " --hi=VAL End of range (default: +inf)\n" " --double Test double precision (default: float)\n" " --step=EPS Sampling step: advance by (1+EPS)*nextafter(x)\n" " (default: 0 = exhaustive; useful for double, e.g. 1e-6)\n" " --threads=N Number of threads (default: all cores)\n" " --batch=N Batch size for Eigen eval (default: 4096)\n" " --ref=MODE Reference: 'std' (default) or 'mpfr'\n" " --hist_width=N Histogram half-width in ULPs (default: 10)\n" " --list List available functions\n"); } int main(int argc, char* argv[]) { Options opts; opts.num_threads = static_cast(std::thread::hardware_concurrency()); if (opts.num_threads == 0) opts.num_threads = 4; std::string ref_mode; for (int i = 1; i < argc; i++) { std::string arg = argv[i]; if (arg.substr(0, 7) == "--func=") { opts.func_name = arg.substr(7); } else if (arg.substr(0, 5) == "--lo=") { std::string val = arg.substr(5); if (val == "inf") opts.lo = std::numeric_limits::infinity(); else if (val == "-inf") opts.lo = -std::numeric_limits::infinity(); else opts.lo = std::stod(val); } else if (arg.substr(0, 5) == "--hi=") { std::string val = arg.substr(5); if (val == "inf") opts.hi = std::numeric_limits::infinity(); else if (val == "-inf") opts.hi = -std::numeric_limits::infinity(); else opts.hi = std::stod(val); } else if (arg.substr(0, 10) == "--threads=") { opts.num_threads = std::stoi(arg.substr(10)); } else if (arg.substr(0, 8) == "--batch=") { opts.batch_size = std::stoi(arg.substr(8)); } else if (arg.substr(0, 6) == "--ref=") { ref_mode = arg.substr(6); } else if (arg.substr(0, 13) == "--hist_width=") { opts.hist_width = std::stoi(arg.substr(13)); } else if (arg.substr(0, 7) == "--step=") { opts.step_eps = std::stod(arg.substr(7)); } else if (arg == "--double") { opts.use_double = true; } else if (arg == "--list") { opts.list_funcs = true; } else if (arg == "--help" || arg == "-h") { print_usage(); return 0; } else { std::fprintf(stderr, "Unknown option: %s\n", arg.c_str()); print_usage(); return 1; } } // Determine reference mode (default: std). if (ref_mode.empty() || ref_mode == "std") { opts.use_mpfr = false; } else if (ref_mode == "mpfr") { #ifdef EIGEN_HAS_MPFR opts.use_mpfr = true; #else std::fprintf(stderr, "Error: MPFR support not compiled in. Use --ref=std or rebuild with MPFR.\n"); return 1; #endif } else { std::fprintf(stderr, "Error: --ref must be 'std' or 'mpfr'\n"); return 1; } if (!opts.list_funcs && opts.func_name.empty()) { std::fprintf(stderr, "Error: --func=NAME is required (use --list to see available functions)\n"); return 1; } if (opts.use_double) { return run_test(opts); } else { return run_test(opts); } }