mirror of
https://gitlab.com/libeigen/eigen.git
synced 2026-04-10 11:34:33 +08:00
Add ULP accuracy measurement tool and documentation for vectorized math functions
libeigen/eigen!2153 Co-authored-by: Rasmus Munk Larsen <rmlarsen@gmail.com>
This commit is contained in:
@@ -608,6 +608,133 @@ This also means that, unless specified, if the function \c std::foo is available
|
|||||||
|
|
||||||
\n
|
\n
|
||||||
|
|
||||||
|
\section CoeffwiseMathFunctionsAccuracy Accuracy of vectorized math functions
|
||||||
|
|
||||||
|
The following tables summarize the accuracy of %Eigen's vectorized implementations measured
|
||||||
|
in units of ULP (Unit in the Last Place) on an x86-64 system (Intel Xeon, GCC) with SSE2 SIMD
|
||||||
|
target. The reference values were computed using
|
||||||
|
<a href="https://www.mpfr.org/">MPFR</a> at 128-bit precision.
|
||||||
|
Float results are exhaustive over all ~4.28 billion finite representable values.
|
||||||
|
Double results sample ~2.88 billion values using a geometric stepping factor of 10<sup>-6</sup>.
|
||||||
|
|
||||||
|
These numbers may differ for other SIMD targets (AVX, AVX512, NEON, SVE, etc.)
|
||||||
|
since each has its own packet math implementations. Functions marked "delegates to std"
|
||||||
|
do not have a custom vectorized implementation for the tested SIMD target — they call
|
||||||
|
the standard library function element-by-element.
|
||||||
|
|
||||||
|
The full histograms for each function can be generated with the \c ulp_accuracy tool
|
||||||
|
in <tt>test/ulp_accuracy/</tt>.
|
||||||
|
|
||||||
|
\subsection CoeffwiseMathFunctionsAccuracy_float Float precision
|
||||||
|
|
||||||
|
<table class="manual-hl">
|
||||||
|
<tr><th>Function</th><th>Max |ULP|</th><th>Mean |ULP|</th><th>% Exact</th><th>Notes</th></tr>
|
||||||
|
<tr><th colspan="5">Trigonometric</th></tr>
|
||||||
|
<tr><td>sin</td> <td>2</td> <td>0.087</td> <td>91.3%</td> <td>\ref CoeffwiseMathFunctionsAccuracy_note1 "1"</td></tr>
|
||||||
|
<tr><td>cos</td> <td>2</td> <td>0.088</td> <td>91.2%</td> <td>\ref CoeffwiseMathFunctionsAccuracy_note1 "1"</td></tr>
|
||||||
|
<tr><td>tan</td> <td>5</td> <td>0.238</td> <td>77.3%</td> <td>\ref CoeffwiseMathFunctionsAccuracy_note1 "1"</td></tr>
|
||||||
|
<tr><td>asin</td> <td>4</td> <td>0.726</td> <td>51.3%</td> <td></td></tr>
|
||||||
|
<tr><td>acos</td> <td>4</td> <td>0.057</td> <td>95.0%</td> <td></td></tr>
|
||||||
|
<tr><td>atan</td> <td>4</td> <td>0.061</td> <td>94.0%</td> <td></td></tr>
|
||||||
|
<tr><th colspan="5">Hyperbolic</th></tr>
|
||||||
|
<tr><td>sinh</td> <td>2</td> <td>0.017</td> <td>98.3%</td> <td></td></tr>
|
||||||
|
<tr><td>cosh</td> <td>2</td> <td>0.004</td> <td>99.6%</td> <td></td></tr>
|
||||||
|
<tr><td>tanh</td> <td>6</td> <td>0.030</td> <td>97.2%</td> <td></td></tr>
|
||||||
|
<tr><td>asinh</td> <td>2</td> <td>0.145</td> <td>85.5%</td> <td></td></tr>
|
||||||
|
<tr><td>acosh</td> <td>2</td> <td>0.057</td> <td>94.3%</td> <td></td></tr>
|
||||||
|
<tr><td>atanh</td> <td>2</td> <td>0.004</td> <td>99.6%</td> <td></td></tr>
|
||||||
|
<tr><th colspan="5">Exponential / Logarithmic</th></tr>
|
||||||
|
<tr><td>exp</td> <td>1</td> <td>0.018</td> <td>98.2%</td> <td></td></tr>
|
||||||
|
<tr><td>exp2</td> <td>6</td> <td>0.034</td> <td>97.3%</td> <td></td></tr>
|
||||||
|
<tr><td>expm1</td> <td>5</td> <td>0.060</td> <td>94.6%</td> <td></td></tr>
|
||||||
|
<tr><td>log</td> <td>3</td> <td>0.120</td> <td>88.0%</td> <td></td></tr>
|
||||||
|
<tr><td>log1p</td> <td>5</td> <td>0.134</td> <td>87.5%</td> <td></td></tr>
|
||||||
|
<tr><td>log10</td> <td>2</td> <td>0.007</td> <td>99.3%</td> <td></td></tr>
|
||||||
|
<tr><td>log2</td> <td>5</td> <td>0.005</td> <td>99.5%</td> <td></td></tr>
|
||||||
|
<tr><th colspan="5">Error / Special</th></tr>
|
||||||
|
<tr><td>erf</td> <td>7</td> <td>0.332</td> <td>67.5%</td> <td></td></tr>
|
||||||
|
<tr><td>erfc</td> <td>8</td> <td>0.010</td> <td>99.2%</td> <td></td></tr>
|
||||||
|
<tr><td>lgamma</td> <td colspan="3"><em>delegates to std</em></td> <td>\ref CoeffwiseMathFunctionsAccuracy_note3 "3"</td></tr>
|
||||||
|
<tr><th colspan="5">Other</th></tr>
|
||||||
|
<tr><td>logistic</td> <td>7</td> <td>0.040</td> <td>97.0%</td> <td></td></tr>
|
||||||
|
<tr><td>sqrt</td> <td>0</td> <td>0.000</td> <td>100%</td> <td>Uses hardware sqrt</td></tr>
|
||||||
|
<tr><td>cbrt</td> <td>2</td> <td>0.552</td> <td>49.1%</td> <td></td></tr>
|
||||||
|
<tr><td>rsqrt</td> <td>∞</td> <td>0.114</td> <td>88.2%</td> <td>\ref CoeffwiseMathFunctionsAccuracy_note4 "4"</td></tr>
|
||||||
|
</table>
|
||||||
|
|
||||||
|
\subsection CoeffwiseMathFunctionsAccuracy_double Double precision
|
||||||
|
|
||||||
|
<table class="manual-hl">
|
||||||
|
<tr><th>Function</th><th>Max |ULP|</th><th>Mean |ULP|</th><th>% Exact</th><th>Notes</th></tr>
|
||||||
|
<tr><th colspan="5">Trigonometric</th></tr>
|
||||||
|
<tr><td>sin</td> <td>13,879,755</td> <td>0.093</td> <td>93.2%</td> <td>\ref CoeffwiseMathFunctionsAccuracy_note1 "1"</td></tr>
|
||||||
|
<tr><td>cos</td> <td>2,024,130</td> <td>0.043</td> <td>98.2%</td> <td>\ref CoeffwiseMathFunctionsAccuracy_note1 "1"</td></tr>
|
||||||
|
<tr><td>tan</td> <td>13,879,755</td> <td>0.128</td> <td>92.7%</td> <td>\ref CoeffwiseMathFunctionsAccuracy_note1 "1"</td></tr>
|
||||||
|
<tr><td>asin</td> <td>1</td> <td><0.001</td> <td>>99.9%</td> <td></td></tr>
|
||||||
|
<tr><td>acos</td> <td>1</td> <td><0.001</td> <td>100%</td> <td></td></tr>
|
||||||
|
<tr><td>atan</td> <td>5</td> <td>0.013</td> <td>98.8%</td> <td></td></tr>
|
||||||
|
<tr><th colspan="5">Hyperbolic</th></tr>
|
||||||
|
<tr><td>sinh</td> <td>2</td> <td>0.004</td> <td>99.6%</td> <td></td></tr>
|
||||||
|
<tr><td>cosh</td> <td>2</td> <td>0.001</td> <td>99.9%</td> <td></td></tr>
|
||||||
|
<tr><td>tanh</td> <td>8</td> <td>0.008</td> <td>99.3%</td> <td></td></tr>
|
||||||
|
<tr><td>asinh</td> <td>2</td> <td>0.098</td> <td>90.2%</td> <td></td></tr>
|
||||||
|
<tr><td>acosh</td> <td>2</td> <td>0.047</td> <td>95.3%</td> <td></td></tr>
|
||||||
|
<tr><td>atanh</td> <td>2</td> <td><0.001</td> <td>>99.9%</td> <td></td></tr>
|
||||||
|
<tr><th colspan="5">Exponential / Logarithmic</th></tr>
|
||||||
|
<tr><td>exp</td> <td>2</td> <td>0.001</td> <td>99.9%</td> <td></td></tr>
|
||||||
|
<tr><td>exp2</td> <td>214</td> <td>0.107</td> <td>99.6%</td> <td>\ref CoeffwiseMathFunctionsAccuracy_note2 "2"</td></tr>
|
||||||
|
<tr><td>expm1</td> <td>3</td> <td>0.010</td> <td>99.1%</td> <td></td></tr>
|
||||||
|
<tr><td>log</td> <td>2</td> <td>0.147</td> <td>85.3%</td> <td></td></tr>
|
||||||
|
<tr><td>log1p</td> <td>3</td> <td>0.097</td> <td>90.6%</td> <td></td></tr>
|
||||||
|
<tr><td>log10</td> <td>2</td> <td>0.001</td> <td>99.9%</td> <td></td></tr>
|
||||||
|
<tr><td>log2</td> <td>2</td> <td><0.001</td> <td>99.9%</td> <td></td></tr>
|
||||||
|
<tr><th colspan="5">Error / Special</th></tr>
|
||||||
|
<tr><td>erf</td> <td>∞</td> <td>0.050</td> <td>70.5%</td> <td>\ref CoeffwiseMathFunctionsAccuracy_note5 "5"</td></tr>
|
||||||
|
<tr><td>erfc</td> <td>11</td> <td>0.002</td> <td>99.9%</td> <td></td></tr>
|
||||||
|
<tr><td>lgamma</td> <td colspan="3"><em>delegates to std</em></td> <td>\ref CoeffwiseMathFunctionsAccuracy_note3 "3"</td></tr>
|
||||||
|
<tr><th colspan="5">Other</th></tr>
|
||||||
|
<tr><td>logistic</td> <td>3</td> <td>0.008</td> <td>99.2%</td> <td></td></tr>
|
||||||
|
<tr><td>sqrt</td> <td>0</td> <td>0.000</td> <td>100%</td> <td>Uses hardware sqrt</td></tr>
|
||||||
|
<tr><td>cbrt</td> <td>2</td> <td>0.119</td> <td>88.1%</td> <td></td></tr>
|
||||||
|
<tr><td>rsqrt</td> <td>∞</td> <td>0.135</td> <td>86.5%</td> <td>\ref CoeffwiseMathFunctionsAccuracy_note4 "4"</td></tr>
|
||||||
|
</table>
|
||||||
|
|
||||||
|
\subsection CoeffwiseMathFunctionsAccuracy_notes Notes
|
||||||
|
|
||||||
|
\anchor CoeffwiseMathFunctionsAccuracy_note1
|
||||||
|
<b>1. sin/cos/tan argument reduction:</b>
|
||||||
|
%Eigen's vectorized sin, cos, and tan use a Cody-Waite argument reduction scheme that
|
||||||
|
subtracts multiples of π/2 from the input. For very large arguments (|x| > ~10<sup>4</sup>
|
||||||
|
in float, |x| > ~10 in double), this reduction loses precision, producing
|
||||||
|
occasional large ULP errors. The mean error remains low because most representable
|
||||||
|
values are small. Applications that need high accuracy for large arguments should
|
||||||
|
perform argument reduction in user code before calling these functions.
|
||||||
|
|
||||||
|
\anchor CoeffwiseMathFunctionsAccuracy_note2
|
||||||
|
<b>2. exp2 double precision:</b>
|
||||||
|
The exp2 implementation for double shows a max error of 214 ULP near the overflow
|
||||||
|
boundary (x ≈ 1022). The mean error is still low (0.107 ULP, 99.6% exact), so
|
||||||
|
the large max error affects only inputs very close to overflow.
|
||||||
|
|
||||||
|
\anchor CoeffwiseMathFunctionsAccuracy_note3
|
||||||
|
<b>3. lgamma:</b>
|
||||||
|
The vectorized lgamma delegates to the standard library function \c std::lgamma for
|
||||||
|
this SIMD target (SSE2) and therefore has the same accuracy as the platform's C math library.
|
||||||
|
|
||||||
|
\anchor CoeffwiseMathFunctionsAccuracy_note4
|
||||||
|
<b>4. rsqrt max=∞:</b>
|
||||||
|
The infinite max ULP is due to a sign disagreement at a single subnormal input:
|
||||||
|
rsqrt(-0) returns -∞ in %Eigen but the MPFR reference produces NaN (rsqrt of a negative
|
||||||
|
value). Ignoring this edge case, the implementation is accurate (< 2 ULP) everywhere else.
|
||||||
|
For float, 16.8 million subnormal negative inputs (0.4%) also produce ±∞ vs NaN.
|
||||||
|
The mean error excluding these outliers is well below 1 ULP.
|
||||||
|
|
||||||
|
\anchor CoeffwiseMathFunctionsAccuracy_note5
|
||||||
|
<b>5. erf double ∞:</b>
|
||||||
|
The vectorized erf for double returns NaN for ±∞ instead of the correct ±1.
|
||||||
|
This produces an infinite max ULP error at a single input value. Excluding ±∞,
|
||||||
|
the max error is 3 ULP and the mean is 0.050 ULP.
|
||||||
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -532,6 +532,18 @@ if(EIGEN_TEST_BUILD_DOCUMENTATION)
|
|||||||
add_dependencies(buildtests doc)
|
add_dependencies(buildtests doc)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
# ULP accuracy measurement tool (see test/ulp_accuracy/README.md)
|
||||||
|
find_package(MPFR)
|
||||||
|
find_package(GMP)
|
||||||
|
add_executable(ulp_accuracy ulp_accuracy/ulp_accuracy.cpp)
|
||||||
|
target_compile_options(ulp_accuracy PRIVATE -pthread)
|
||||||
|
target_link_libraries(ulp_accuracy Eigen3::Eigen ${CMAKE_THREAD_LIBS_INIT} pthread)
|
||||||
|
if(MPFR_FOUND AND GMP_FOUND)
|
||||||
|
target_include_directories(ulp_accuracy PRIVATE ${MPFR_INCLUDES} ${GMP_INCLUDES})
|
||||||
|
target_link_libraries(ulp_accuracy ${MPFR_LIBRARIES} ${GMP_LIBRARIES})
|
||||||
|
target_compile_definitions(ulp_accuracy PRIVATE EIGEN_HAS_MPFR)
|
||||||
|
endif()
|
||||||
|
|
||||||
# Register all smoke tests
|
# Register all smoke tests
|
||||||
include("EigenSmokeTestList")
|
include("EigenSmokeTestList")
|
||||||
ei_add_smoke_tests("${ei_smoke_test_list}")
|
ei_add_smoke_tests("${ei_smoke_test_list}")
|
||||||
|
|||||||
145
test/ulp_accuracy/README.md
Normal file
145
test/ulp_accuracy/README.md
Normal file
@@ -0,0 +1,145 @@
|
|||||||
|
# ULP Accuracy Measurement Tool
|
||||||
|
|
||||||
|
Standalone tool for measuring the accuracy of Eigen's vectorized math functions
|
||||||
|
in units of ULP (Unit in the Last Place). Compares Eigen's SIMD implementations
|
||||||
|
against either MPFR (128-bit high-precision reference) or the standard C++ math
|
||||||
|
library.
|
||||||
|
|
||||||
|
## Building
|
||||||
|
|
||||||
|
From the Eigen build directory:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd build
|
||||||
|
cmake ..
|
||||||
|
cmake --build . --target ulp_accuracy
|
||||||
|
```
|
||||||
|
|
||||||
|
If MPFR and GMP are installed, the build automatically enables MPFR support
|
||||||
|
(`EIGEN_HAS_MPFR`). Without them, only `--ref=std` is available.
|
||||||
|
|
||||||
|
### Installing MPFR (Debian/Ubuntu)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
sudo apt install libmpfr-dev libgmp-dev
|
||||||
|
```
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
```
|
||||||
|
./test/ulp_accuracy [options]
|
||||||
|
|
||||||
|
Options:
|
||||||
|
--func=NAME Function to test (required unless --list)
|
||||||
|
--lo=VAL Start of range (default: -inf)
|
||||||
|
--hi=VAL End of range (default: +inf)
|
||||||
|
--double Test double precision (default: float)
|
||||||
|
--step=EPS Sampling step: advance by (1+EPS)*nextafter(x)
|
||||||
|
(default: 0 = exhaustive; useful for double, e.g. 1e-6)
|
||||||
|
--threads=N Number of threads (default: all cores)
|
||||||
|
--batch=N Batch size for Eigen eval (default: 4096)
|
||||||
|
--ref=MODE Reference: 'std' (default) or 'mpfr'
|
||||||
|
--hist_width=N Histogram half-width in ULPs (default: 10)
|
||||||
|
--list List available functions
|
||||||
|
```
|
||||||
|
|
||||||
|
## Examples
|
||||||
|
|
||||||
|
List all supported functions:
|
||||||
|
```bash
|
||||||
|
./test/ulp_accuracy --list
|
||||||
|
```
|
||||||
|
|
||||||
|
Exhaustive float test of sin against std (tests all ~4.28 billion finite floats):
|
||||||
|
```bash
|
||||||
|
./test/ulp_accuracy --func=sin
|
||||||
|
```
|
||||||
|
|
||||||
|
Float test against MPFR (more accurate reference, but slower):
|
||||||
|
```bash
|
||||||
|
./test/ulp_accuracy --func=sin --ref=mpfr
|
||||||
|
```
|
||||||
|
|
||||||
|
Double precision test with geometric sampling (exhaustive is impractical for double):
|
||||||
|
```bash
|
||||||
|
./test/ulp_accuracy --func=exp --double --step=1e-6
|
||||||
|
```
|
||||||
|
|
||||||
|
Test a specific range:
|
||||||
|
```bash
|
||||||
|
./test/ulp_accuracy --func=sin --lo=0 --hi=6.2832
|
||||||
|
```
|
||||||
|
|
||||||
|
## Output
|
||||||
|
|
||||||
|
The tool prints:
|
||||||
|
|
||||||
|
- **Test configuration**: function, range, reference mode, thread count
|
||||||
|
- **Max |ULP error|**: worst-case absolute ULP error with the offending input value
|
||||||
|
- **Mean |ULP error|**: average absolute ULP error across all tested values
|
||||||
|
- **Signed ULP histogram**: distribution of signed errors showing bias direction
|
||||||
|
|
||||||
|
Example output:
|
||||||
|
```
|
||||||
|
Function: sin (float)
|
||||||
|
Range: [-inf, inf]
|
||||||
|
Representable values in range: 4278190082
|
||||||
|
Reference: MPFR (128-bit)
|
||||||
|
Threads: 32
|
||||||
|
Batch size: 4096
|
||||||
|
|
||||||
|
Results:
|
||||||
|
Values tested: 4278190081
|
||||||
|
Time: 529.04 seconds (8.1 Mvalues/s)
|
||||||
|
Max |ULP error|: 2
|
||||||
|
at x = -1.5413464e+38 (Eigen=-0.482218683, ref=-0.482218742)
|
||||||
|
Mean |ULP error|: 0.0874
|
||||||
|
|
||||||
|
Signed ULP error histogram [-10, +10]:
|
||||||
|
-2 : 51988 ( 0.001%)
|
||||||
|
-1 : 186805349 ( 4.366%)
|
||||||
|
0 : 3904475407 ( 91.265%)
|
||||||
|
1 : 186805349 ( 4.366%)
|
||||||
|
2 : 51988 ( 0.001%)
|
||||||
|
```
|
||||||
|
|
||||||
|
## How it works
|
||||||
|
|
||||||
|
1. **Range splitting**: The input range is divided evenly across threads by
|
||||||
|
splitting the linear ULP space.
|
||||||
|
|
||||||
|
2. **Batched evaluation**: Each thread fills batches of input values, evaluates
|
||||||
|
them through Eigen's vectorized path (using `Eigen::Array` operations), and
|
||||||
|
computes reference values one at a time.
|
||||||
|
|
||||||
|
3. **ULP computation**: IEEE 754 bit patterns are mapped to a linear integer
|
||||||
|
scale where adjacent representable values are adjacent integers. The signed
|
||||||
|
ULP error is the difference between Eigen's result and the reference on this
|
||||||
|
scale. Special cases (NaN, infinity mismatches) report infinite error.
|
||||||
|
|
||||||
|
4. **Result reduction**: Per-thread statistics (max error, mean error, histogram)
|
||||||
|
are merged after all threads complete.
|
||||||
|
|
||||||
|
## Supported functions
|
||||||
|
|
||||||
|
| Category | Functions |
|
||||||
|
|----------|-----------|
|
||||||
|
| Trigonometric | sin, cos, tan, asin, acos, atan |
|
||||||
|
| Hyperbolic | sinh, cosh, tanh, asinh, acosh, atanh |
|
||||||
|
| Exponential/Log | exp, exp2, expm1, log, log1p, log10, log2 |
|
||||||
|
| Error/Gamma | erf, erfc, lgamma |
|
||||||
|
| Other | logistic, sqrt, cbrt, rsqrt |
|
||||||
|
|
||||||
|
## File organization
|
||||||
|
|
||||||
|
- `ulp_accuracy.cpp` — Main tool: ULP computation, worker threads, CLI, result printing
|
||||||
|
- `mpfr_reference.h` — MPFR reference function wrappers and scalar conversion helpers
|
||||||
|
|
||||||
|
## Performance tips
|
||||||
|
|
||||||
|
- Float exhaustive sweeps test ~4.28 billion values. With `--ref=std` this takes
|
||||||
|
~50 seconds per function; with `--ref=mpfr` it takes ~500 seconds (10x slower).
|
||||||
|
- For double precision, exhaustive testing is impractical. Use `--step=1e-6` to
|
||||||
|
sample ~2.88 billion values geometrically.
|
||||||
|
- Thread count defaults to all available cores. MPFR is the bottleneck (single
|
||||||
|
MPFR call per value per thread), so more cores help significantly.
|
||||||
78
test/ulp_accuracy/mpfr_reference.h
Normal file
78
test/ulp_accuracy/mpfr_reference.h
Normal file
@@ -0,0 +1,78 @@
|
|||||||
|
// MPFR high-precision reference implementations for ULP accuracy testing.
|
||||||
|
//
|
||||||
|
// This header provides MPFR-based reference functions for all math operations
|
||||||
|
// tested by the ulp_accuracy tool. It also includes scalar conversion helpers
|
||||||
|
// between float/double and mpfr_t.
|
||||||
|
//
|
||||||
|
// Only compiled when EIGEN_HAS_MPFR is defined (i.e., MPFR and GMP are found).
|
||||||
|
|
||||||
|
#ifndef EIGEN_ULP_ACCURACY_MPFR_REFERENCE_H
|
||||||
|
#define EIGEN_ULP_ACCURACY_MPFR_REFERENCE_H
|
||||||
|
|
||||||
|
#ifdef EIGEN_HAS_MPFR
|
||||||
|
#include <mpfr.h>
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Scalar <-> mpfr_t conversion
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
static void mpfr_set_scalar(mpfr_t rop, Scalar x, mpfr_rnd_t rnd);
|
||||||
|
template <>
|
||||||
|
void mpfr_set_scalar<float>(mpfr_t rop, float x, mpfr_rnd_t rnd) {
|
||||||
|
mpfr_set_flt(rop, x, rnd);
|
||||||
|
}
|
||||||
|
template <>
|
||||||
|
void mpfr_set_scalar<double>(mpfr_t rop, double x, mpfr_rnd_t rnd) {
|
||||||
|
mpfr_set_d(rop, x, rnd);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
static Scalar mpfr_get_scalar(mpfr_t op, mpfr_rnd_t rnd);
|
||||||
|
template <>
|
||||||
|
float mpfr_get_scalar<float>(mpfr_t op, mpfr_rnd_t rnd) {
|
||||||
|
return mpfr_get_flt(op, rnd);
|
||||||
|
}
|
||||||
|
template <>
|
||||||
|
double mpfr_get_scalar<double>(mpfr_t op, mpfr_rnd_t rnd) {
|
||||||
|
return mpfr_get_d(op, rnd);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// MPFR wrappers for functions not directly provided by libmpfr
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
// logistic(x) = 1 / (1 + exp(-x))
|
||||||
|
static int mpfr_logistic(mpfr_t rop, const mpfr_t op, mpfr_rnd_t rnd) {
|
||||||
|
mpfr_t tmp, one;
|
||||||
|
mpfr_init2(tmp, mpfr_get_prec(rop));
|
||||||
|
mpfr_init2(one, mpfr_get_prec(rop));
|
||||||
|
mpfr_set_ui(one, 1, rnd);
|
||||||
|
mpfr_neg(tmp, op, rnd);
|
||||||
|
mpfr_exp(tmp, tmp, rnd);
|
||||||
|
mpfr_add(tmp, tmp, one, rnd);
|
||||||
|
int ret = mpfr_div(rop, one, tmp, rnd);
|
||||||
|
mpfr_clear(tmp);
|
||||||
|
mpfr_clear(one);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
// rsqrt(x) = 1 / sqrt(x)
|
||||||
|
static int mpfr_rsqrt(mpfr_t rop, const mpfr_t op, mpfr_rnd_t rnd) { return mpfr_rec_sqrt(rop, op, rnd); }
|
||||||
|
|
||||||
|
// exp2(x) = 2^x
|
||||||
|
static int mpfr_exp2_wrap(mpfr_t rop, const mpfr_t op, mpfr_rnd_t rnd) {
|
||||||
|
mpfr_t two;
|
||||||
|
mpfr_init2(two, mpfr_get_prec(rop));
|
||||||
|
mpfr_set_ui(two, 2, rnd);
|
||||||
|
int ret = mpfr_pow(rop, two, op, rnd);
|
||||||
|
mpfr_clear(two);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
// log2(x) — thin wrapper to match the function signature convention.
|
||||||
|
static int mpfr_log2_wrap(mpfr_t rop, const mpfr_t op, mpfr_rnd_t rnd) { return mpfr_log2(rop, op, rnd); }
|
||||||
|
|
||||||
|
#endif // EIGEN_HAS_MPFR
|
||||||
|
|
||||||
|
#endif // EIGEN_ULP_ACCURACY_MPFR_REFERENCE_H
|
||||||
611
test/ulp_accuracy/ulp_accuracy.cpp
Normal file
611
test/ulp_accuracy/ulp_accuracy.cpp
Normal file
@@ -0,0 +1,611 @@
|
|||||||
|
// Standalone tool to measure ULP accuracy of Eigen's vectorized math functions
|
||||||
|
// against either MPFR (high-precision reference) or std C++ math functions.
|
||||||
|
//
|
||||||
|
// See README.md in this directory for full documentation.
|
||||||
|
//
|
||||||
|
// Usage:
|
||||||
|
// ./ulp_accuracy --func=sin --lo=0 --hi=6.2832 --threads=16
|
||||||
|
// ./ulp_accuracy --func=exp --threads=16
|
||||||
|
// ./ulp_accuracy --func=sin --ref=mpfr
|
||||||
|
// ./ulp_accuracy --func=sin --double --step=1e-6
|
||||||
|
// ./ulp_accuracy --list
|
||||||
|
//
|
||||||
|
// Build:
|
||||||
|
// cd build && cmake .. && make ulp_accuracy
|
||||||
|
|
||||||
|
#include <Eigen/Core>
|
||||||
|
#include <unsupported/Eigen/SpecialFunctions>
|
||||||
|
|
||||||
|
#ifdef EIGEN_HAS_MPFR
|
||||||
|
#include <mpfr.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include <cfloat>
|
||||||
|
#include <chrono>
|
||||||
|
#include <cmath>
|
||||||
|
#include <cstdint>
|
||||||
|
#include <cstdio>
|
||||||
|
#include <cstring>
|
||||||
|
#include <functional>
|
||||||
|
#include <limits>
|
||||||
|
#include <memory>
|
||||||
|
#include <string>
|
||||||
|
#include <thread>
|
||||||
|
#include <type_traits>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#include "mpfr_reference.h"
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
// ULP distance computation
|
||||||
|
// ============================================================================
|
||||||
|
|
||||||
|
// Maps IEEE 754 bits to a linear integer scale where adjacent representable
|
||||||
|
// values are adjacent integers. The mapping is strictly monotonic:
|
||||||
|
// -inf -> most negative, -0.0 -> -1, +0.0 -> 0, +inf -> most positive.
|
||||||
|
static inline int64_t scalar_to_linear(float x) {
|
||||||
|
int32_t bits;
|
||||||
|
std::memcpy(&bits, &x, sizeof(bits));
|
||||||
|
if (bits < 0) {
|
||||||
|
bits = static_cast<int32_t>(INT32_MIN) - bits - 1;
|
||||||
|
}
|
||||||
|
return static_cast<int64_t>(bits);
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline int64_t scalar_to_linear(double x) {
|
||||||
|
int64_t bits;
|
||||||
|
std::memcpy(&bits, &x, sizeof(bits));
|
||||||
|
if (bits < 0) {
|
||||||
|
bits = static_cast<int64_t>(INT64_MIN) - bits - 1;
|
||||||
|
}
|
||||||
|
return bits;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Returns (eigen_val - ref_val) in ULP space.
|
||||||
|
// Positive means Eigen overestimates, negative means it underestimates.
|
||||||
|
// Returns INT64_MAX for incomparable values (NaN vs number, inf mismatch).
|
||||||
|
template <typename Scalar>
|
||||||
|
static inline int64_t signed_ulp_error(Scalar eigen_val, Scalar ref_val) {
|
||||||
|
if (eigen_val == ref_val) return 0; // also handles -0.0 == +0.0
|
||||||
|
bool e_nan = std::isnan(eigen_val), r_nan = std::isnan(ref_val);
|
||||||
|
if (e_nan && r_nan) return 0;
|
||||||
|
if (e_nan || r_nan) return INT64_MAX;
|
||||||
|
if (std::isinf(eigen_val) || std::isinf(ref_val)) return INT64_MAX;
|
||||||
|
int64_t a = scalar_to_linear(eigen_val);
|
||||||
|
int64_t b = scalar_to_linear(ref_val);
|
||||||
|
// Overflow check for a - b.
|
||||||
|
if (b > 0 && a < INT64_MIN + b) return INT64_MAX;
|
||||||
|
if (b < 0 && a > INT64_MAX + b) return INT64_MAX;
|
||||||
|
return a - b;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
// Per-thread accumulator with signed ULP histogram
|
||||||
|
// ============================================================================
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
struct alignas(128) ThreadResult {
|
||||||
|
int64_t max_abs_ulp = 0;
|
||||||
|
Scalar max_ulp_at = Scalar(0);
|
||||||
|
Scalar max_ulp_eigen = Scalar(0);
|
||||||
|
Scalar max_ulp_ref = Scalar(0);
|
||||||
|
double abs_ulp_sum = 0.0;
|
||||||
|
uint64_t count = 0;
|
||||||
|
|
||||||
|
// Signed histogram: bins for errors in [-hist_width, +hist_width],
|
||||||
|
// plus two overflow bins for < -hist_width and > +hist_width.
|
||||||
|
// Layout: [<-W] [-W] [-W+1] ... [0] ... [W-1] [W] [>W]
|
||||||
|
// Total bins = 2*hist_width + 3
|
||||||
|
int hist_width = 0;
|
||||||
|
std::vector<uint64_t> hist;
|
||||||
|
|
||||||
|
void init(int w) {
|
||||||
|
hist_width = w;
|
||||||
|
hist.assign(2 * w + 3, 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
void record(int64_t signed_err, Scalar x, Scalar eigen_val, Scalar ref_val) {
|
||||||
|
int64_t abs_err = signed_err < 0 ? -signed_err : signed_err;
|
||||||
|
if (signed_err == INT64_MAX) abs_err = INT64_MAX;
|
||||||
|
|
||||||
|
if (abs_err > max_abs_ulp) {
|
||||||
|
max_abs_ulp = abs_err;
|
||||||
|
max_ulp_at = x;
|
||||||
|
max_ulp_eigen = eigen_val;
|
||||||
|
max_ulp_ref = ref_val;
|
||||||
|
}
|
||||||
|
if (abs_err != INT64_MAX) {
|
||||||
|
abs_ulp_sum += static_cast<double>(abs_err);
|
||||||
|
}
|
||||||
|
count++;
|
||||||
|
|
||||||
|
// Histogram bin.
|
||||||
|
int bin;
|
||||||
|
if (signed_err == INT64_MAX || signed_err > hist_width) {
|
||||||
|
bin = 2 * hist_width + 2; // overflow high
|
||||||
|
} else if (signed_err < -hist_width) {
|
||||||
|
bin = 0; // overflow low
|
||||||
|
} else {
|
||||||
|
bin = static_cast<int>(signed_err) + hist_width + 1;
|
||||||
|
}
|
||||||
|
hist[bin]++;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
// Function registry
|
||||||
|
// ============================================================================
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
struct FuncEntry {
|
||||||
|
using ArrayType = Eigen::Array<Scalar, Eigen::Dynamic, 1>;
|
||||||
|
using EigenEval = std::function<void(Eigen::Ref<ArrayType>, const Eigen::Ref<const ArrayType>&)>;
|
||||||
|
using StdEval = std::function<Scalar(Scalar)>;
|
||||||
|
|
||||||
|
#ifdef EIGEN_HAS_MPFR
|
||||||
|
using MpfrEval = std::function<int(mpfr_t, const mpfr_t, mpfr_rnd_t)>;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
std::string name;
|
||||||
|
EigenEval eigen_eval;
|
||||||
|
StdEval std_eval;
|
||||||
|
#ifdef EIGEN_HAS_MPFR
|
||||||
|
MpfrEval mpfr_eval;
|
||||||
|
#endif
|
||||||
|
Scalar default_lo;
|
||||||
|
Scalar default_hi;
|
||||||
|
};
|
||||||
|
|
||||||
|
// std::logistic is not part of the C++ standard library.
|
||||||
|
template <typename Scalar>
|
||||||
|
static Scalar std_logistic(Scalar x) {
|
||||||
|
if (x >= 0) {
|
||||||
|
Scalar e = std::exp(-x);
|
||||||
|
return Scalar(1) / (Scalar(1) + e);
|
||||||
|
} else {
|
||||||
|
Scalar e = std::exp(x);
|
||||||
|
return e / (Scalar(1) + e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
static std::vector<FuncEntry<Scalar>> build_func_table() {
|
||||||
|
using ArrayType = Eigen::Array<Scalar, Eigen::Dynamic, 1>;
|
||||||
|
std::vector<FuncEntry<Scalar>> table;
|
||||||
|
|
||||||
|
#ifdef EIGEN_HAS_MPFR
|
||||||
|
#define ADD_FUNC(fname, eigen_expr, std_expr, mpfr_fn, lo, hi) \
|
||||||
|
table.push_back({#fname, [](Eigen::Ref<ArrayType> out, const Eigen::Ref<const ArrayType>& a) { out = eigen_expr; }, \
|
||||||
|
[](Scalar x) -> Scalar { return std_expr; }, \
|
||||||
|
[](mpfr_t r, const mpfr_t o, mpfr_rnd_t d) { return mpfr_fn(r, o, d); }, lo, hi})
|
||||||
|
#else
|
||||||
|
#define ADD_FUNC(fname, eigen_expr, std_expr, mpfr_fn, lo, hi) \
|
||||||
|
table.push_back({#fname, [](Eigen::Ref<ArrayType> out, const Eigen::Ref<const ArrayType>& a) { out = eigen_expr; }, \
|
||||||
|
[](Scalar x) -> Scalar { return std_expr; }, lo, hi})
|
||||||
|
#endif
|
||||||
|
|
||||||
|
constexpr Scalar kInf = std::numeric_limits<Scalar>::infinity();
|
||||||
|
|
||||||
|
// Trigonometric
|
||||||
|
// clang-format off
|
||||||
|
ADD_FUNC(sin, a.sin(), std::sin(x), mpfr_sin, -kInf, kInf);
|
||||||
|
ADD_FUNC(cos, a.cos(), std::cos(x), mpfr_cos, -kInf, kInf);
|
||||||
|
ADD_FUNC(tan, a.tan(), std::tan(x), mpfr_tan, -kInf, kInf);
|
||||||
|
ADD_FUNC(asin, a.asin(), std::asin(x), mpfr_asin, -kInf, kInf);
|
||||||
|
ADD_FUNC(acos, a.acos(), std::acos(x), mpfr_acos, -kInf, kInf);
|
||||||
|
ADD_FUNC(atan, a.atan(), std::atan(x), mpfr_atan, -kInf, kInf);
|
||||||
|
|
||||||
|
// Hyperbolic
|
||||||
|
ADD_FUNC(sinh, a.sinh(), std::sinh(x), mpfr_sinh, -kInf, kInf);
|
||||||
|
ADD_FUNC(cosh, a.cosh(), std::cosh(x), mpfr_cosh, -kInf, kInf);
|
||||||
|
ADD_FUNC(tanh, a.tanh(), std::tanh(x), mpfr_tanh, -kInf, kInf);
|
||||||
|
ADD_FUNC(asinh, a.asinh(), std::asinh(x), mpfr_asinh, -kInf, kInf);
|
||||||
|
ADD_FUNC(acosh, a.acosh(), std::acosh(x), mpfr_acosh, -kInf, kInf);
|
||||||
|
ADD_FUNC(atanh, a.atanh(), std::atanh(x), mpfr_atanh, -kInf, kInf);
|
||||||
|
|
||||||
|
// Exponential / Logarithmic
|
||||||
|
ADD_FUNC(exp, a.exp(), std::exp(x), mpfr_exp, -kInf, kInf);
|
||||||
|
ADD_FUNC(exp2, a.exp2(), std::exp2(x), mpfr_exp2_wrap, -kInf, kInf);
|
||||||
|
ADD_FUNC(expm1, a.expm1(), std::expm1(x), mpfr_expm1, -kInf, kInf);
|
||||||
|
ADD_FUNC(log, a.log(), std::log(x), mpfr_log, -kInf, kInf);
|
||||||
|
ADD_FUNC(log1p, a.log1p(), std::log1p(x), mpfr_log1p, -kInf, kInf);
|
||||||
|
ADD_FUNC(log10, a.log10(), std::log10(x), mpfr_log10, -kInf, kInf);
|
||||||
|
ADD_FUNC(log2, a.log2(), std::log2(x), mpfr_log2_wrap, -kInf, kInf);
|
||||||
|
|
||||||
|
// Error / Gamma
|
||||||
|
ADD_FUNC(erf, a.erf(), std::erf(x), mpfr_erf, -kInf, kInf);
|
||||||
|
ADD_FUNC(erfc, a.erfc(), std::erfc(x), mpfr_erfc, -kInf, kInf);
|
||||||
|
ADD_FUNC(lgamma, a.lgamma(), std::lgamma(x), mpfr_lngamma, -kInf, kInf);
|
||||||
|
|
||||||
|
// Other
|
||||||
|
ADD_FUNC(logistic, a.logistic(), std_logistic(x), mpfr_logistic, -kInf, kInf);
|
||||||
|
ADD_FUNC(sqrt, a.sqrt(), std::sqrt(x), mpfr_sqrt, -kInf, kInf);
|
||||||
|
ADD_FUNC(cbrt, a.cbrt(), std::cbrt(x), mpfr_cbrt, -kInf, kInf);
|
||||||
|
ADD_FUNC(rsqrt, a.rsqrt(), Scalar(1)/std::sqrt(x), mpfr_rsqrt, -kInf, kInf);
|
||||||
|
// clang-format on
|
||||||
|
|
||||||
|
#undef ADD_FUNC
|
||||||
|
return table;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
// Range iteration helpers
|
||||||
|
// ============================================================================
|
||||||
|
|
||||||
|
// Advances x toward +inf by at least 1 ULP. When step_eps > 0, additionally
|
||||||
|
// jumps by a relative factor of (1 + step_eps) to sample the range sparsely.
|
||||||
|
template <typename Scalar>
|
||||||
|
static inline Scalar advance_by_step(Scalar x, double step_eps) {
|
||||||
|
Scalar next = std::nextafter(x, std::numeric_limits<Scalar>::infinity());
|
||||||
|
if (step_eps > 0.0 && std::isfinite(next)) {
|
||||||
|
// Try to jump further by a relative amount.
|
||||||
|
Scalar jumped = next > 0 ? next * static_cast<Scalar>(1.0 + step_eps) : next / static_cast<Scalar>(1.0 + step_eps);
|
||||||
|
// Use the jump only if it actually advances further (handles denormal stalling).
|
||||||
|
if (jumped > next) next = jumped;
|
||||||
|
}
|
||||||
|
return next;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Counts the number of representable scalars in [lo, hi].
|
||||||
|
template <typename Scalar>
|
||||||
|
static uint64_t count_scalars_in_range(Scalar lo, Scalar hi) {
|
||||||
|
if (lo > hi) return 0;
|
||||||
|
uint64_t lo_u = static_cast<uint64_t>(scalar_to_linear(lo));
|
||||||
|
uint64_t hi_u = static_cast<uint64_t>(scalar_to_linear(hi));
|
||||||
|
uint64_t diff = hi_u - lo_u;
|
||||||
|
return diff == UINT64_MAX ? UINT64_MAX : diff + 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Advances a scalar by n ULPs in the linear representation.
|
||||||
|
static float advance_scalar(float x, uint64_t n) {
|
||||||
|
int64_t lin = scalar_to_linear(x);
|
||||||
|
lin += static_cast<int64_t>(n);
|
||||||
|
int32_t ibits;
|
||||||
|
if (lin < 0) {
|
||||||
|
ibits = static_cast<int32_t>(INT32_MIN) - static_cast<int32_t>(lin) - 1;
|
||||||
|
} else {
|
||||||
|
ibits = static_cast<int32_t>(lin);
|
||||||
|
}
|
||||||
|
float result;
|
||||||
|
std::memcpy(&result, &ibits, sizeof(result));
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
static double advance_scalar(double x, uint64_t n) {
|
||||||
|
int64_t lin = scalar_to_linear(x);
|
||||||
|
lin += static_cast<int64_t>(n);
|
||||||
|
int64_t ibits;
|
||||||
|
if (lin < 0) {
|
||||||
|
ibits = static_cast<int64_t>(INT64_MIN) - lin - 1;
|
||||||
|
} else {
|
||||||
|
ibits = lin;
|
||||||
|
}
|
||||||
|
double result;
|
||||||
|
std::memcpy(&result, &ibits, sizeof(result));
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
// Worker thread: evaluates Eigen and reference over a subrange
|
||||||
|
// ============================================================================
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
static void worker(const FuncEntry<Scalar>& func, Scalar lo, Scalar hi, int batch_size, bool use_mpfr, double step_eps,
|
||||||
|
ThreadResult<Scalar>& result) {
|
||||||
|
using ArrayType = Eigen::Array<Scalar, Eigen::Dynamic, 1>;
|
||||||
|
ArrayType input(batch_size);
|
||||||
|
ArrayType eigen_out(batch_size);
|
||||||
|
std::vector<Scalar> ref_out(batch_size);
|
||||||
|
|
||||||
|
#ifdef EIGEN_HAS_MPFR
|
||||||
|
mpfr_t mp_in, mp_out;
|
||||||
|
if (use_mpfr) {
|
||||||
|
mpfr_init2(mp_in, 128);
|
||||||
|
mpfr_init2(mp_out, 128);
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
(void)use_mpfr;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
auto process_batch = [&](int n, const ArrayType& in, const ArrayType& eig) {
|
||||||
|
for (int i = 0; i < n; i++) {
|
||||||
|
#ifdef EIGEN_HAS_MPFR
|
||||||
|
if (use_mpfr) {
|
||||||
|
mpfr_set_scalar<Scalar>(mp_in, in[i], MPFR_RNDN);
|
||||||
|
func.mpfr_eval(mp_out, mp_in, MPFR_RNDN);
|
||||||
|
ref_out[i] = mpfr_get_scalar<Scalar>(mp_out, MPFR_RNDN);
|
||||||
|
} else
|
||||||
|
#endif
|
||||||
|
{
|
||||||
|
ref_out[i] = func.std_eval(in[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (int i = 0; i < n; i++) {
|
||||||
|
int64_t err = signed_ulp_error(eig[i], ref_out[i]);
|
||||||
|
result.record(err, in[i], eig[i], ref_out[i]);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
int idx = 0;
|
||||||
|
Scalar x = lo;
|
||||||
|
for (;;) {
|
||||||
|
input[idx] = x;
|
||||||
|
idx++;
|
||||||
|
|
||||||
|
if (idx == batch_size) {
|
||||||
|
func.eigen_eval(eigen_out, input);
|
||||||
|
process_batch(batch_size, input, eigen_out);
|
||||||
|
idx = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (x >= hi) break;
|
||||||
|
Scalar next = advance_by_step(x, step_eps);
|
||||||
|
x = (next > hi) ? hi : next;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Process remaining partial batch.
|
||||||
|
if (idx > 0) {
|
||||||
|
auto partial_in = input.head(idx);
|
||||||
|
auto partial_eigen = eigen_out.head(idx);
|
||||||
|
func.eigen_eval(partial_eigen, partial_in);
|
||||||
|
process_batch(idx, input, eigen_out);
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef EIGEN_HAS_MPFR
|
||||||
|
if (use_mpfr) {
|
||||||
|
mpfr_clear(mp_in);
|
||||||
|
mpfr_clear(mp_out);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
// Test driver: splits range across threads and prints results
|
||||||
|
// ============================================================================
|
||||||
|
|
||||||
|
struct Options {
|
||||||
|
std::string func_name;
|
||||||
|
double lo = std::numeric_limits<double>::quiet_NaN();
|
||||||
|
double hi = std::numeric_limits<double>::quiet_NaN();
|
||||||
|
int num_threads;
|
||||||
|
int batch_size = 4096;
|
||||||
|
int hist_width = 10;
|
||||||
|
bool use_mpfr = false;
|
||||||
|
bool use_double = false;
|
||||||
|
double step_eps = 0.0;
|
||||||
|
bool list_funcs = false;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
static int run_test(const Options& opts) {
|
||||||
|
const int kDigits = std::is_same<Scalar, float>::value ? 9 : 17;
|
||||||
|
const char* kTypeName = std::is_same<Scalar, float>::value ? "float" : "double";
|
||||||
|
|
||||||
|
auto table = build_func_table<Scalar>();
|
||||||
|
|
||||||
|
if (opts.list_funcs) {
|
||||||
|
std::printf("Available functions:\n");
|
||||||
|
for (const auto& f : table) {
|
||||||
|
std::printf(" %s\n", f.name.c_str());
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Look up the requested function.
|
||||||
|
const FuncEntry<Scalar>* entry = nullptr;
|
||||||
|
for (const auto& f : table) {
|
||||||
|
if (f.name == opts.func_name) {
|
||||||
|
entry = &f;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!entry) {
|
||||||
|
std::fprintf(stderr, "Error: unknown function '%s' (use --list to see available functions)\n",
|
||||||
|
opts.func_name.c_str());
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
Scalar lo = std::isnan(opts.lo) ? entry->default_lo : static_cast<Scalar>(opts.lo);
|
||||||
|
Scalar hi = std::isnan(opts.hi) ? entry->default_hi : static_cast<Scalar>(opts.hi);
|
||||||
|
uint64_t total_scalars = count_scalars_in_range(lo, hi);
|
||||||
|
int num_threads = opts.num_threads;
|
||||||
|
|
||||||
|
// Print test configuration.
|
||||||
|
std::printf("Function: %s (%s)\n", opts.func_name.c_str(), kTypeName);
|
||||||
|
std::printf("Range: [%.*g, %.*g]\n", kDigits, double(lo), kDigits, double(hi));
|
||||||
|
if (opts.step_eps > 0.0) {
|
||||||
|
std::printf("Sampling step: (1 + %g) * nextafter(x)\n", opts.step_eps);
|
||||||
|
} else {
|
||||||
|
std::printf("Representable values in range: %lu\n", static_cast<unsigned long>(total_scalars));
|
||||||
|
}
|
||||||
|
std::printf("Reference: %s\n", opts.use_mpfr ? "MPFR (128-bit)" : "std C++ math");
|
||||||
|
std::printf("Threads: %d\n", num_threads);
|
||||||
|
std::printf("Batch size: %d\n", opts.batch_size);
|
||||||
|
std::printf("\n");
|
||||||
|
std::fflush(stdout);
|
||||||
|
|
||||||
|
// Split range across threads.
|
||||||
|
if (total_scalars > 0 && static_cast<uint64_t>(num_threads) > total_scalars) {
|
||||||
|
num_threads = static_cast<int>(total_scalars);
|
||||||
|
}
|
||||||
|
if (num_threads < 1) num_threads = 1;
|
||||||
|
|
||||||
|
// Heap-allocate each ThreadResult separately to avoid false sharing.
|
||||||
|
std::vector<std::unique_ptr<ThreadResult<Scalar>>> results;
|
||||||
|
results.reserve(num_threads);
|
||||||
|
for (int t = 0; t < num_threads; t++) {
|
||||||
|
results.push_back(std::make_unique<ThreadResult<Scalar>>());
|
||||||
|
results.back()->init(opts.hist_width);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<std::thread> threads;
|
||||||
|
uint64_t scalars_per_thread = total_scalars / num_threads;
|
||||||
|
Scalar chunk_lo = lo;
|
||||||
|
|
||||||
|
auto start_time = std::chrono::steady_clock::now();
|
||||||
|
|
||||||
|
for (int t = 0; t < num_threads; t++) {
|
||||||
|
Scalar chunk_hi;
|
||||||
|
if (t == num_threads - 1) {
|
||||||
|
chunk_hi = hi;
|
||||||
|
} else {
|
||||||
|
chunk_hi = advance_scalar(chunk_lo, scalars_per_thread - 1);
|
||||||
|
}
|
||||||
|
threads.emplace_back(worker<Scalar>, std::cref(*entry), chunk_lo, chunk_hi, opts.batch_size, opts.use_mpfr,
|
||||||
|
opts.step_eps, std::ref(*results[t]));
|
||||||
|
chunk_lo = std::nextafter(chunk_hi, std::numeric_limits<Scalar>::infinity());
|
||||||
|
}
|
||||||
|
|
||||||
|
for (auto& t : threads) t.join();
|
||||||
|
auto end_time = std::chrono::steady_clock::now();
|
||||||
|
double elapsed = std::chrono::duration<double>(end_time - start_time).count();
|
||||||
|
|
||||||
|
// Reduce per-thread results.
|
||||||
|
ThreadResult<Scalar> global;
|
||||||
|
global.init(opts.hist_width);
|
||||||
|
for (int t = 0; t < num_threads; t++) {
|
||||||
|
const auto& r = *results[t];
|
||||||
|
if (r.max_abs_ulp > global.max_abs_ulp) {
|
||||||
|
global.max_abs_ulp = r.max_abs_ulp;
|
||||||
|
global.max_ulp_at = r.max_ulp_at;
|
||||||
|
global.max_ulp_eigen = r.max_ulp_eigen;
|
||||||
|
global.max_ulp_ref = r.max_ulp_ref;
|
||||||
|
}
|
||||||
|
global.abs_ulp_sum += r.abs_ulp_sum;
|
||||||
|
global.count += r.count;
|
||||||
|
for (size_t b = 0; b < global.hist.size(); b++) {
|
||||||
|
global.hist[b] += r.hist[b];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
double mean_ulp = global.count > 0 ? global.abs_ulp_sum / global.count : 0.0;
|
||||||
|
|
||||||
|
// Print results.
|
||||||
|
std::printf("Results:\n");
|
||||||
|
std::printf(" Values tested: %lu\n", static_cast<unsigned long>(global.count));
|
||||||
|
std::printf(" Time: %.2f seconds (%.1f Mvalues/s)\n", elapsed, global.count / elapsed / 1e6);
|
||||||
|
if (global.max_abs_ulp == INT64_MAX) {
|
||||||
|
std::printf(" Max |ULP error|: inf\n");
|
||||||
|
} else {
|
||||||
|
std::printf(" Max |ULP error|: %ld\n", static_cast<long>(global.max_abs_ulp));
|
||||||
|
}
|
||||||
|
std::printf(" at x = %.*g (Eigen=%.*g, ref=%.*g)\n", kDigits, double(global.max_ulp_at), kDigits,
|
||||||
|
double(global.max_ulp_eigen), kDigits, double(global.max_ulp_ref));
|
||||||
|
std::printf(" Mean |ULP error|: %.4f\n", mean_ulp);
|
||||||
|
std::printf("\n");
|
||||||
|
|
||||||
|
// Print signed error histogram.
|
||||||
|
std::printf("Signed ULP error histogram [-%d, +%d]:\n", opts.hist_width, opts.hist_width);
|
||||||
|
int nbins = 2 * opts.hist_width + 3;
|
||||||
|
for (int b = 0; b < nbins; b++) {
|
||||||
|
if (global.hist[b] == 0) continue;
|
||||||
|
double pct = 100.0 * global.hist[b] / global.count;
|
||||||
|
if (b == 0) {
|
||||||
|
std::printf(" <%-4d: %12lu (%7.3f%%)\n", -opts.hist_width, static_cast<unsigned long>(global.hist[b]), pct);
|
||||||
|
} else if (b == nbins - 1) {
|
||||||
|
std::printf(" >%-4d: %12lu (%7.3f%%)\n", opts.hist_width, static_cast<unsigned long>(global.hist[b]), pct);
|
||||||
|
} else {
|
||||||
|
int err = b - opts.hist_width - 1;
|
||||||
|
std::printf(" %-5d: %12lu (%7.3f%%)\n", err, static_cast<unsigned long>(global.hist[b]), pct);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
// Command-line parsing
|
||||||
|
// ============================================================================
|
||||||
|
|
||||||
|
static void print_usage() {
|
||||||
|
std::printf(
|
||||||
|
"Usage: ulp_accuracy [options]\n"
|
||||||
|
" --func=NAME Function to test (required unless --list)\n"
|
||||||
|
" --lo=VAL Start of range (default: -inf)\n"
|
||||||
|
" --hi=VAL End of range (default: +inf)\n"
|
||||||
|
" --double Test double precision (default: float)\n"
|
||||||
|
" --step=EPS Sampling step: advance by (1+EPS)*nextafter(x)\n"
|
||||||
|
" (default: 0 = exhaustive; useful for double, e.g. 1e-6)\n"
|
||||||
|
" --threads=N Number of threads (default: all cores)\n"
|
||||||
|
" --batch=N Batch size for Eigen eval (default: 4096)\n"
|
||||||
|
" --ref=MODE Reference: 'std' (default) or 'mpfr'\n"
|
||||||
|
" --hist_width=N Histogram half-width in ULPs (default: 10)\n"
|
||||||
|
" --list List available functions\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
int main(int argc, char* argv[]) {
|
||||||
|
Options opts;
|
||||||
|
opts.num_threads = static_cast<int>(std::thread::hardware_concurrency());
|
||||||
|
if (opts.num_threads == 0) opts.num_threads = 4;
|
||||||
|
std::string ref_mode;
|
||||||
|
|
||||||
|
for (int i = 1; i < argc; i++) {
|
||||||
|
std::string arg = argv[i];
|
||||||
|
if (arg.substr(0, 7) == "--func=") {
|
||||||
|
opts.func_name = arg.substr(7);
|
||||||
|
} else if (arg.substr(0, 5) == "--lo=") {
|
||||||
|
std::string val = arg.substr(5);
|
||||||
|
if (val == "inf")
|
||||||
|
opts.lo = std::numeric_limits<double>::infinity();
|
||||||
|
else if (val == "-inf")
|
||||||
|
opts.lo = -std::numeric_limits<double>::infinity();
|
||||||
|
else
|
||||||
|
opts.lo = std::stod(val);
|
||||||
|
} else if (arg.substr(0, 5) == "--hi=") {
|
||||||
|
std::string val = arg.substr(5);
|
||||||
|
if (val == "inf")
|
||||||
|
opts.hi = std::numeric_limits<double>::infinity();
|
||||||
|
else if (val == "-inf")
|
||||||
|
opts.hi = -std::numeric_limits<double>::infinity();
|
||||||
|
else
|
||||||
|
opts.hi = std::stod(val);
|
||||||
|
} else if (arg.substr(0, 10) == "--threads=") {
|
||||||
|
opts.num_threads = std::stoi(arg.substr(10));
|
||||||
|
} else if (arg.substr(0, 8) == "--batch=") {
|
||||||
|
opts.batch_size = std::stoi(arg.substr(8));
|
||||||
|
} else if (arg.substr(0, 6) == "--ref=") {
|
||||||
|
ref_mode = arg.substr(6);
|
||||||
|
} else if (arg.substr(0, 13) == "--hist_width=") {
|
||||||
|
opts.hist_width = std::stoi(arg.substr(13));
|
||||||
|
} else if (arg.substr(0, 7) == "--step=") {
|
||||||
|
opts.step_eps = std::stod(arg.substr(7));
|
||||||
|
} else if (arg == "--double") {
|
||||||
|
opts.use_double = true;
|
||||||
|
} else if (arg == "--list") {
|
||||||
|
opts.list_funcs = true;
|
||||||
|
} else if (arg == "--help" || arg == "-h") {
|
||||||
|
print_usage();
|
||||||
|
return 0;
|
||||||
|
} else {
|
||||||
|
std::fprintf(stderr, "Unknown option: %s\n", arg.c_str());
|
||||||
|
print_usage();
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Determine reference mode (default: std).
|
||||||
|
if (ref_mode.empty() || ref_mode == "std") {
|
||||||
|
opts.use_mpfr = false;
|
||||||
|
} else if (ref_mode == "mpfr") {
|
||||||
|
#ifdef EIGEN_HAS_MPFR
|
||||||
|
opts.use_mpfr = true;
|
||||||
|
#else
|
||||||
|
std::fprintf(stderr, "Error: MPFR support not compiled in. Use --ref=std or rebuild with MPFR.\n");
|
||||||
|
return 1;
|
||||||
|
#endif
|
||||||
|
} else {
|
||||||
|
std::fprintf(stderr, "Error: --ref must be 'std' or 'mpfr'\n");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!opts.list_funcs && opts.func_name.empty()) {
|
||||||
|
std::fprintf(stderr, "Error: --func=NAME is required (use --list to see available functions)\n");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (opts.use_double) {
|
||||||
|
return run_test<double>(opts);
|
||||||
|
} else {
|
||||||
|
return run_test<float>(opts);
|
||||||
|
}
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user