Add nightly benchmark regression detection pipeline

libeigen/eigen!2349

Co-authored-by: Rasmus Munk Larsen <rmlarsen@gmail.com>
This commit is contained in:
Rasmus Munk Larsen
2026-03-29 16:03:56 -07:00
parent 753a6ac5b3
commit 409296d91d
7 changed files with 943 additions and 0 deletions

View File

@@ -15,6 +15,7 @@ stages:
- checkformat
- build
- test
- benchmark
- deploy
variables:
@@ -35,4 +36,5 @@ include:
- "/ci/build.windows.gitlab-ci.yml"
- "/ci/test.linux.gitlab-ci.yml"
- "/ci/test.windows.gitlab-ci.yml"
- "/ci/benchmark.gitlab-ci.yml"
- "/ci/deploy.gitlab-ci.yml"

239
ci/benchmark.gitlab-ci.yml Normal file
View File

@@ -0,0 +1,239 @@
# Benchmark pipeline for performance regression detection.
#
# Runs nightly (core subset) or weekly (all benchmarks) on scheduled
# pipelines, with separate jobs per ISA target. Results are analyzed
# using Welch's t-test against the last 30 runs stored on the perf-data
# branch.
# ============================================================================
# Variables
# ============================================================================
variables:
  # Directory (relative to the checkout) that holds benchmark binaries and
  # the results/ subdirectory exchanged between jobs as artifacts.
  EIGEN_BENCH_BUILDDIR: ".bench-build"
  # Number of repetitions requested from each benchmark executable.
  EIGEN_BENCH_REPETITIONS: "5"
  # Benchmark scope: "nightly" runs the core subset, "weekly" runs every
  # benchmark. The run script promotes "nightly" to "weekly" on Sundays, so
  # the full suite runs once a week without a second schedule. To force the
  # full suite, set EIGEN_BENCH_SCOPE=weekly from the web UI or from a
  # dedicated GitLab schedule.
  EIGEN_BENCH_SCOPE: "nightly"
# ============================================================================
# Abstract bases
# ============================================================================
# Shared settings for every Linux benchmark job: image, default variables,
# the common before_script, and the rules that restrict the jobs to scheduled
# or manually started ("web") pipelines in the libeigen namespace.
.bench:linux:base:
  image: "ubuntu:22.04"
  variables:
    # The build and run scripts read EIGEN_CI_BUILDDIR.
    EIGEN_CI_BUILDDIR: "${EIGEN_BENCH_BUILDDIR}"
    # Empty defaults; concrete jobs override the ones they need.
    EIGEN_CI_TARGET_ARCH: ""
    EIGEN_BENCH_ISA_FLAGS: ""
    EIGEN_BENCH_TARGET: ""
  before_script:
    - ". ci/scripts/common.linux.before_script.sh"
  rules:
    - if: '$CI_PIPELINE_SOURCE == "schedule" && $CI_PROJECT_NAMESPACE == "libeigen"'
    - if: '$CI_PIPELINE_SOURCE == "web" && $CI_PROJECT_NAMESPACE == "libeigen"'
# Template for build jobs: compiles the benchmarks and publishes the build
# directory (minus object files) for the matching run job.
.bench:linux:build:
  extends: ".bench:linux:base"
  stage: benchmark
  # No dependencies: start immediately instead of waiting for earlier stages.
  needs: []
  tags: [saas-linux-2xlarge-amd64]
  script:
    - ". ci/scripts/build.benchmark.sh"
  artifacts:
    name: "$CI_JOB_NAME_SLUG-$CI_COMMIT_REF_SLUG"
    # Upload even when the build fails.
    when: always
    expire_in: "2 days"
    paths:
      - "${EIGEN_BENCH_BUILDDIR}/"
    exclude:
      - "${EIGEN_BENCH_BUILDDIR}/**/*.o"
# Template for run jobs: executes the benchmarks and publishes only the
# results/ directory. Concrete jobs supply `needs`, `tags` and the target.
.bench:linux:run:
  extends: ".bench:linux:base"
  stage: benchmark
  script:
    - ". ci/scripts/run.benchmark.sh"
  artifacts:
    name: "$CI_JOB_NAME_SLUG-$CI_COMMIT_REF_SLUG"
    # Upload even when the run fails.
    when: always
    expire_in: "30 days"
    paths:
      - "${EIGEN_BENCH_BUILDDIR}/results/"
# ============================================================================
# Build jobs (one per ISA target, all run in parallel)
# ============================================================================
# x86-64 baseline build. EIGEN_BENCH_ISA_FLAGS is not overridden, so the
# empty default from .bench:linux:base applies.
bench:build:x86-64:sse:
  extends: ".bench:linux:build"
  variables:
    EIGEN_BENCH_TARGET: "x86-64-sse"
    EIGEN_CI_TARGET_ARCH: "x86_64"
    # Toolchain: GCC 10 for both C and C++.
    EIGEN_CI_C_COMPILER: "gcc-10"
    EIGEN_CI_CXX_COMPILER: "g++-10"
    # NOTE(review): presumably consumed by the common before_script to
    # install the compiler package -- confirm.
    EIGEN_CI_INSTALL: "g++-10"
# x86-64 build with AVX2 and FMA enabled.
bench:build:x86-64:avx2:
  extends: ".bench:linux:build"
  variables:
    EIGEN_BENCH_TARGET: "x86-64-avx2"
    # Passed to CMake as CMAKE_CXX_FLAGS by the build script.
    EIGEN_BENCH_ISA_FLAGS: "-mavx2 -mfma"
    EIGEN_CI_TARGET_ARCH: "x86_64"
    # Toolchain: GCC 10 for both C and C++.
    EIGEN_CI_C_COMPILER: "gcc-10"
    EIGEN_CI_CXX_COMPILER: "g++-10"
    EIGEN_CI_INSTALL: "g++-10"
# x86-64 build with AVX-512 DQ and FMA enabled.
bench:build:x86-64:avx512dq:
  extends: ".bench:linux:build"
  variables:
    EIGEN_BENCH_TARGET: "x86-64-avx512dq"
    # Passed to CMake as CMAKE_CXX_FLAGS by the build script.
    EIGEN_BENCH_ISA_FLAGS: "-mavx512dq -mfma"
    EIGEN_CI_TARGET_ARCH: "x86_64"
    # Toolchain: GCC 10 for both C and C++.
    EIGEN_CI_C_COMPILER: "gcc-10"
    EIGEN_CI_CXX_COMPILER: "g++-10"
    EIGEN_CI_INSTALL: "g++-10"
# AArch64 build. Overrides the template's amd64 runner tag with an arm64
# runner.
bench:build:aarch64:neon:
  extends: ".bench:linux:build"
  tags: [saas-linux-large-arm64]
  variables:
    EIGEN_BENCH_TARGET: "aarch64-neon"
    # Passed to CMake as CMAKE_CXX_FLAGS by the build script.
    EIGEN_BENCH_ISA_FLAGS: "-march=armv8.2-a+fp16"
    EIGEN_CI_TARGET_ARCH: "aarch64"
    # Toolchain: GCC 10 for both C and C++.
    EIGEN_CI_C_COMPILER: "gcc-10"
    EIGEN_CI_CXX_COMPILER: "g++-10"
    EIGEN_CI_INSTALL: "g++-10"
# ============================================================================
# Run jobs (one per ISA target, each depends on its build)
# ============================================================================
# Runs the x86-64 baseline benchmarks built by bench:build:x86-64:sse.
bench:run:x86-64:sse:
  extends: ".bench:linux:run"
  needs:
    - "bench:build:x86-64:sse"
  tags: [saas-linux-2xlarge-amd64]
  variables:
    # Recorded in the result metadata and used in the result file name.
    EIGEN_BENCH_TARGET: "x86-64-sse"
# Runs the AVX2 benchmarks built by bench:build:x86-64:avx2. The run script
# exits early (successfully) when the runner's CPU does not report avx2.
bench:run:x86-64:avx2:
  extends: ".bench:linux:run"
  needs:
    - "bench:build:x86-64:avx2"
  tags: [saas-linux-2xlarge-amd64]
  variables:
    # Recorded in the result metadata and used in the result file name.
    EIGEN_BENCH_TARGET: "x86-64-avx2"
# Runs the AVX-512 DQ benchmarks built by bench:build:x86-64:avx512dq. The
# run script exits early (successfully) when the runner's CPU does not report
# avx512dq; any other failure is tolerated via allow_failure.
bench:run:x86-64:avx512dq:
  extends: ".bench:linux:run"
  allow_failure: true
  needs:
    - "bench:build:x86-64:avx512dq"
  tags: [saas-linux-2xlarge-amd64]
  variables:
    # Recorded in the result metadata and used in the result file name.
    EIGEN_BENCH_TARGET: "x86-64-avx512dq"
# Runs the AArch64 benchmarks built by bench:build:aarch64:neon on an arm64
# runner.
bench:run:aarch64:neon:
  extends: ".bench:linux:run"
  needs:
    - "bench:build:aarch64:neon"
  tags: [saas-linux-large-arm64]
  variables:
    # Recorded in the result metadata and used in the result file name.
    EIGEN_BENCH_TARGET: "aarch64-neon"
# ============================================================================
# Analysis: compare against historical data using Welch's t-test
# ============================================================================
# Collects the results of all run jobs and compares them against the history
# stored on the perf-data branch. The detector's exit status (0 = clean,
# 1 = regressions) is written to regression_exit_code.txt for the gate job;
# any other status fails this job immediately.
bench:analyze:
  stage: benchmark
  image: "python:3.11-slim"
  tags: [saas-linux-small-amd64]
  rules:
    - if: '$CI_PIPELINE_SOURCE == "schedule" && $CI_PROJECT_NAMESPACE == "libeigen"'
    - if: '$CI_PIPELINE_SOURCE == "web" && $CI_PROJECT_NAMESPACE == "libeigen"'
  needs:
    - job: "bench:run:x86-64:sse"
      artifacts: true
    - job: "bench:run:x86-64:avx2"
      artifacts: true
    - job: "bench:run:x86-64:avx512dq"
      artifacts: true
      optional: true
    - job: "bench:run:aarch64:neon"
      artifacts: true
  variables:
    EIGEN_CI_BUILDDIR: "${EIGEN_BENCH_BUILDDIR}"
  before_script:
    - export DEBIAN_FRONTEND=noninteractive
    # git is needed by the detector to clone the perf-data branch.
    - apt-get update -qq && apt-get install -y --no-install-recommends git
    - pip install --quiet scipy
  script:
    - |
      status=0
      python3 ci/scripts/detect_regressions.py \
        --results-dir "${EIGEN_BENCH_BUILDDIR}/results/" \
        --perf-branch perf-data \
        --history-count 30 \
        --significance 0.01 \
        --min-change-pct 5.0 \
        --output-report "${EIGEN_BENCH_BUILDDIR}/results/regression_report.txt" || status=$?
      case "${status}" in
        0|1) ;;
        *) exit "${status}" ;;
      esac
      printf '%s\n' "${status}" > "${EIGEN_BENCH_BUILDDIR}/results/regression_exit_code.txt"
  artifacts:
    when: always
    expire_in: "90 days"
    paths:
      - "${EIGEN_BENCH_BUILDDIR}/results/"
    reports:
      # The detector writes the JUnit file next to the text report.
      junit: "${EIGEN_BENCH_BUILDDIR}/results/regression_report.xml"
# ============================================================================
# Storage and gating
# ============================================================================
# Pushes the combined result files to the perf-data branch. Only runs for
# scheduled or web pipelines on the default branch of the libeigen namespace.
bench:store-results:
  stage: deploy
  # NOTE(review): alpine:edge is a rolling tag -- consider pinning a release.
  image: "alpine:edge"
  tags: [saas-linux-small-amd64]
  rules:
    - if: '$CI_PIPELINE_SOURCE == "schedule" && $CI_PROJECT_NAMESPACE == "libeigen" && $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH'
    - if: '$CI_PIPELINE_SOURCE == "web" && $CI_PROJECT_NAMESPACE == "libeigen" && $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH'
  needs:
    - job: "bench:analyze"
      artifacts: true
  variables:
    EIGEN_CI_BUILDDIR: "${EIGEN_BENCH_BUILDDIR}"
  before_script:
    - apk add --no-cache git python3
  script:
    # The script requires EIGEN_CI_GIT_PUSH_URL to be set.
    - ". ci/scripts/push_perf_data.sh"
# Fails when bench:analyze recorded exit code 1 (regressions detected).
# bench:store-results is listed as an optional need, so when that job exists
# in the pipeline the gate waits for it.
bench:regression-gate:
  stage: deploy
  # NOTE(review): alpine:edge is a rolling tag -- consider pinning a release.
  image: "alpine:edge"
  tags: [saas-linux-small-amd64]
  rules:
    - if: '$CI_PIPELINE_SOURCE == "schedule" && $CI_PROJECT_NAMESPACE == "libeigen"'
    - if: '$CI_PIPELINE_SOURCE == "web" && $CI_PROJECT_NAMESPACE == "libeigen"'
  needs:
    - job: "bench:analyze"
      artifacts: true
    - job: "bench:store-results"
      optional: true
  variables:
    EIGEN_CI_BUILDDIR: "${EIGEN_BENCH_BUILDDIR}"
  script:
    - code=$(cat "${EIGEN_BENCH_BUILDDIR}/results/regression_exit_code.txt")
    - test "${code}" != "1"

View File

@@ -0,0 +1,19 @@
# Nightly core benchmark subset.
# One executable name per line. Lines starting with # are ignored.
# These cover the most performance-critical dense linear algebra kernels.
# BLAS-like operations
bench_gemm
bench_gemm_double
bench_gemv
bench_dot
bench_vecadd
bench_trsm
bench_reductions
# Decompositions
bench_cholesky
bench_cholesky_double
bench_qr
bench_svd
bench_householder

45
ci/scripts/build.benchmark.sh Executable file
View File

@@ -0,0 +1,45 @@
#!/bin/bash
# Configure and compile the Eigen benchmark suite for one ISA target.
#
# Inputs (environment):
#   EIGEN_CI_BUILDDIR      build directory (default: .bench-build)
#   EIGEN_CI_CXX_COMPILER  C++ compiler handed to CMake
#   EIGEN_CI_C_COMPILER    C compiler handed to CMake
#   EIGEN_BENCH_ISA_FLAGS  ISA-specific compiler flags (e.g. "-mavx2 -mfma")

set -ex

rootdir=$(pwd)
builddir=${EIGEN_CI_BUILDDIR:-.bench-build}

# Build Google Benchmark v1.9.1 from source and install it to /usr/local.
# cmake and ninja come from the common before_script; only git and
# ca-certificates have to be added for the clone.
install_google_benchmark() {
  apt-get update -qq
  apt-get install -y --no-install-recommends git ca-certificates
  git clone --depth 1 --branch v1.9.1 https://github.com/google/benchmark.git /tmp/gbench
  cmake -G Ninja -S /tmp/gbench -B /tmp/gbench-build \
    -DCMAKE_BUILD_TYPE=Release \
    -DBENCHMARK_ENABLE_TESTING=OFF \
    -DCMAKE_INSTALL_PREFIX=/usr/local
  cmake --build /tmp/gbench-build --target install
  rm -rf /tmp/gbench /tmp/gbench-build
}

mkdir -p "${builddir}"
cd "${builddir}"

# Skip the source build when pkg-config already knows about the library.
pkg-config --exists benchmark 2>/dev/null || install_google_benchmark

# Configure the benchmarks. The ISA flags go into CMAKE_CXX_FLAGS so that
# they apply to every benchmark target.
cmake -G Ninja \
  -DCMAKE_CXX_COMPILER="${EIGEN_CI_CXX_COMPILER}" \
  -DCMAKE_C_COMPILER="${EIGEN_CI_C_COMPILER}" \
  -DCMAKE_CXX_FLAGS="${EIGEN_BENCH_ISA_FLAGS}" \
  -DCMAKE_BUILD_TYPE=Release \
  "${rootdir}/benchmarks"

# Build every benchmark target (nightly/weekly filtering happens at run
# time). -k0 keeps going past failures; if the parallel build fails, retry
# once single-threaded.
if ! cmake --build . -- -k0; then
  cmake --build . -- -k0 -j1
fi

cd "${rootdir}"

394
ci/scripts/detect_regressions.py Executable file
View File

@@ -0,0 +1,394 @@
#!/usr/bin/env python3
"""Benchmark regression detection using Welch's t-test.
Compares the current benchmark run against historical data stored on
the perf-data git branch. A regression is flagged when:
1. Welch's t-test p-value < significance threshold (default 0.01)
2. The relative change exceeds a minimum percentage (default 5%)
3. The direction is a slowdown (higher real_time)
Exit codes:
0 no regressions
1 regressions detected
2 error
"""
import argparse
import glob
import json
import os
import subprocess
import sys
import xml.etree.ElementTree as ET
from collections import defaultdict, namedtuple
# scipy is the only external dependency (pip-installed in the CI job).
from scipy.stats import ttest_ind
# Immutable record for one flagged benchmark change. ``target`` is left empty
# by find_regressions() and filled in by the caller via ``_replace``.
Regression = namedtuple(
    "Regression",
    "target key current_mean historical_mean change_pct p_value",
)
def parse_args():
    """Define the command-line interface and parse ``sys.argv``.

    Returns:
        argparse.Namespace with the attributes ``results_dir`` (required),
        ``perf_branch``, ``history_count``, ``significance``,
        ``min_change_pct`` and ``output_report``.
    """
    # (flag, add_argument keyword arguments), registered in this order.
    option_table = (
        (
            "--results-dir",
            {"required": True, "help": "Directory containing current run JSON files."},
        ),
        (
            "--perf-branch",
            {
                "default": "perf-data",
                "help": "Git branch storing historical benchmark data.",
            },
        ),
        (
            "--history-count",
            {
                "type": int,
                "default": 14,
                "help": "Number of past runs to compare against.",
            },
        ),
        (
            "--significance",
            {
                "type": float,
                "default": 0.01,
                "help": "P-value threshold for Welch's t-test.",
            },
        ),
        (
            "--min-change-pct",
            {
                "type": float,
                "default": 5.0,
                "help": "Minimum percentage change to flag.",
            },
        ),
        (
            "--output-report",
            {"default": "regression_report.txt", "help": "Path for text report."},
        ),
    )

    parser = argparse.ArgumentParser(description=__doc__)
    for flag, spec in option_table:
        parser.add_argument(flag, **spec)
    return parser.parse_args()
def clone_perf_branch(branch, clone_dir):
    """Shallow-clone *branch* of the project repository into *clone_dir*.

    The repository URL is taken from ``CI_REPOSITORY_URL``; when that is
    unset or empty, the URL of the ``origin`` remote of the current checkout
    is used instead. git's output is discarded.

    Returns:
        True when the clone succeeded; False when no URL could be determined
        or ``git clone`` exited with a non-zero status.
    """
    repo_url = os.environ.get("CI_REPOSITORY_URL", "")
    if not repo_url:
        # Outside CI: ask git for the origin URL of the working copy.
        try:
            repo_url = subprocess.check_output(
                ["git", "remote", "get-url", "origin"], text=True
            ).strip()
        except Exception:
            return False

    clone_cmd = ["git", "clone", "--depth=1", "--single-branch"]
    clone_cmd += ["--branch", branch, repo_url, clone_dir]
    try:
        subprocess.check_call(
            clone_cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL
        )
    except subprocess.CalledProcessError:
        return False
    return True
def _history_sort_key(fpath):
"""Sort key for historical result files.
Prefer the recorded UTC timestamp in the JSON metadata. Fall back to the
filename so older date-only files still participate in the history window.
"""
try:
with open(fpath) as f:
metadata = json.load(f).get("metadata", {})
except Exception:
metadata = {}
return metadata.get("timestamp") or metadata.get("date") or os.path.basename(fpath)
def load_historical_data(perf_dir, target, history_count):
    """Load per-repetition real_time values from the newest history files.

    The ``*.json`` files in ``<perf_dir>/<target>`` are ordered newest first
    using ``_history_sort_key`` and the first *history_count* are read.
    Aggregate rows are ignored so that both sides of the t-test contain the
    same kind of measurement (individual repetitions) as
    ``load_current_results`` produces.

    A history file that cannot be read or parsed is skipped with a warning
    on stderr instead of aborting the analysis: ``_history_sort_key`` already
    tolerates such files, and an uncaught exception here would terminate the
    script with status 1, which callers read as "regressions detected".

    Args:
        perf_dir: Root directory of the perf-data checkout.
        target: ISA target name; selects the subdirectory to read.
        history_count: Maximum number of history files to read.

    Returns:
        Mapping ``"<executable>/<benchmark name>"`` -> list of real_time
        values (several per run). An empty dict when the target directory
        does not exist.
    """
    target_dir = os.path.join(perf_dir, target)
    if not os.path.isdir(target_dir):
        return {}

    newest_first = sorted(
        glob.glob(os.path.join(target_dir, "*.json")),
        key=_history_sort_key,
        reverse=True,
    )

    history = defaultdict(list)
    for fpath in newest_first[:history_count]:
        try:
            with open(fpath) as f:
                data = json.load(f)
        except (OSError, ValueError) as exc:
            # json.JSONDecodeError and UnicodeDecodeError are ValueErrors.
            print(
                f"WARNING: skipping unreadable history file {fpath}: {exc}",
                file=sys.stderr,
            )
            continue
        if not isinstance(data, dict):
            print(
                f"WARNING: skipping malformed history file {fpath}",
                file=sys.stderr,
            )
            continue

        for exe_name, exe_data in data.get("files", {}).items():
            for bm in exe_data.get("benchmarks", []):
                if bm.get("run_type", "") == "aggregate":
                    continue
                rt = bm.get("real_time")
                if rt is not None:
                    history[f"{exe_name}/{bm.get('name', '')}"].append(rt)
    return history
def load_current_results(results_dir):
    """Read the combined result files of the current run.

    Every file in *results_dir* matching ``*_*_*.json`` is parsed. The target
    comes from the file's ``metadata.target`` field (``"unknown"`` when
    absent). Only individual repetition rows are kept; aggregate rows are
    dropped so that each benchmark contributes per-repetition samples.

    Returns:
        Mapping target -> mapping ``"<executable>/<benchmark name>"`` ->
        list of per-repetition real_time values.
    """
    per_target = defaultdict(lambda: defaultdict(list))
    pattern = os.path.join(results_dir, "*_*_*.json")

    for json_path in sorted(glob.glob(pattern)):
        with open(json_path) as handle:
            payload = json.load(handle)
        target = payload.get("metadata", {}).get("target", "unknown")

        for exe_name, exe_data in payload.get("files", {}).items():
            repetition_rows = (
                row
                for row in exe_data.get("benchmarks", [])
                if row.get("run_type", "") != "aggregate"
            )
            for row in repetition_rows:
                real_time = row.get("real_time")
                if real_time is None:
                    continue
                bench_key = f"{exe_name}/{row.get('name', '')}"
                per_target[target][bench_key].append(real_time)
    return per_target
def find_regressions(current, historical, significance, min_change_pct):
    """Classify benchmarks as regressed or improved using Welch's t-test.

    A benchmark is skipped when it has fewer than 5 historical samples,
    fewer than 3 current samples, or a historical mean of exactly zero.
    Otherwise it is flagged when the p-value is below *significance* and the
    absolute relative change of the means exceeds *min_change_pct*.

    Args:
        current: Mapping benchmark key -> current real_time samples.
        historical: Mapping benchmark key -> historical real_time samples.
        significance: P-value threshold.
        min_change_pct: Minimum absolute change of the mean, in percent.

    Returns:
        Tuple ``(regressions, improvements, skipped_count)``. Both lists hold
        ``Regression`` records with an empty ``target`` (set by the caller),
        in ascending key order.
    """
    slower, faster = [], []
    n_skipped = 0

    for key in sorted(current):
        samples = current[key]
        baseline = historical.get(key)

        # Not enough data on either side for a meaningful comparison.
        if not baseline or len(baseline) < 5 or len(samples) < 3:
            n_skipped += 1
            continue

        cur_mean = sum(samples) / len(samples)
        hist_mean = sum(baseline) / len(baseline)
        if hist_mean == 0:
            # The relative change is undefined.
            n_skipped += 1
            continue

        change_pct = (cur_mean - hist_mean) / hist_mean * 100.0
        # equal_var=False selects Welch's variant of the t-test.
        _, p_value = ttest_ind(samples, baseline, equal_var=False)

        if not (p_value < significance and abs(change_pct) > min_change_pct):
            continue

        record = Regression(
            target="",  # filled in by caller
            key=key,
            current_mean=cur_mean,
            historical_mean=hist_mean,
            change_pct=change_pct,
            p_value=p_value,
        )
        # Higher real_time = slower = regression.
        (slower if change_pct > 0 else faster).append(record)

    return slower, faster, n_skipped
def _qualified_key(r):
"""Target-qualified display key, e.g. '[x86-64-avx2] bench_gemm/BM_Gemm/256'."""
return f"[{r.target}] {r.key}"
def write_text_report(regressions, improvements, skipped, total, path):
    """Write the human-readable report to *path*.

    The report has an optional "Regressions" table (largest slowdown first),
    an optional "Improvements" table (largest speed-up first) and a summary.

    Args:
        regressions: Regression records flagged as slowdowns.
        improvements: Regression records flagged as speed-ups.
        skipped: Number of benchmarks skipped for insufficient data.
        total: Number of benchmarks analyzed.
        path: Output file; overwritten if it exists.
    """
    column_header = (
        f"{'Benchmark':<70s} {'Historical':>12s} {'Current':>12s} "
        f"{'Change':>8s} {'p-value':>8s}\n"
    )
    rule = "-" * 114 + "\n"

    def emit_table(out, title, rows):
        # One titled table; *rows* must already be in display order.
        out.write(f"## {title} ({len(rows)})\n\n")
        out.write(column_header)
        out.write(rule)
        for row in rows:
            out.write(
                f"{_qualified_key(row):<70s} {row.historical_mean:>12.1f} {row.current_mean:>12.1f} "
                f"{row.change_pct:>+7.1f}% {row.p_value:>8.4f}\n"
            )
        out.write("\n")

    with open(path, "w") as out:
        out.write("# Benchmark Regression Report\n\n")
        if regressions:
            emit_table(
                out, "Regressions", sorted(regressions, key=lambda x: -x.change_pct)
            )
        if improvements:
            emit_table(
                out, "Improvements", sorted(improvements, key=lambda x: x.change_pct)
            )
        out.write("## Summary\n\n")
        out.write(f"- Benchmarks analyzed: {total}\n")
        out.write(f"- Regressions: {len(regressions)}\n")
        out.write(f"- Improvements: {len(improvements)}\n")
        out.write(f"- Skipped (insufficient data): {skipped}\n")
def write_junit_report(regressions, analyzed_keys, path):
    """Write a JUnit XML report so GitLab can show results as test cases.

    Each entry of *analyzed_keys* becomes one ``<testcase>``; those that
    match a regression get a ``<failure>`` child. Keys and regressions are
    target-qualified (e.g. "[x86-64-avx2] bench_gemm/BM_Gemm/256"), so the
    same benchmark on different ISA targets yields separate test cases.

    Args:
        regressions: Regression records with ``target`` filled in.
        analyzed_keys: Iterable of target-qualified benchmark keys.
        path: Output file; overwritten if it exists.
    """
    failing = {}
    for reg in regressions:
        failing[_qualified_key(reg)] = reg

    suite = ET.Element(
        "testsuite",
        {
            "name": "benchmark-regressions",
            "tests": str(len(analyzed_keys)),
            "failures": str(len(regressions)),
        },
    )

    for qualified in sorted(analyzed_keys):
        case = ET.SubElement(
            suite, "testcase", name=qualified, classname="benchmark"
        )
        reg = failing.get(qualified)
        if reg is None:
            continue
        failure = ET.SubElement(
            case,
            "failure",
            message=f"{reg.change_pct:+.1f}% regression (p={reg.p_value:.4f})",
        )
        failure.text = (
            f"historical_mean={reg.historical_mean:.1f} "
            f"current_mean={reg.current_mean:.1f} "
            f"change={reg.change_pct:+.1f}% p={reg.p_value:.6f}"
        )

    document = ET.ElementTree(suite)
    ET.indent(document)
    document.write(path, xml_declaration=True, encoding="utf-8")
def _junit_path_for(report_path):
    """Derive the JUnit XML path that accompanies the text report.

    A trailing ``.txt`` extension is swapped for ``.xml``; any other name
    gets ``.xml`` appended, so the XML file never overwrites the text report
    and ".txt" elsewhere in the path is left alone.
    """
    stem, ext = os.path.splitext(report_path)
    if ext == ".txt":
        return stem + ".xml"
    return report_path + ".xml"


def _run_analysis(args):
    """Run the regression analysis and return the process exit code.

    Returns:
        0 when no regressions were found (or no history exists yet),
        1 when regressions were detected,
        2 when no current results could be loaded.
    """
    import shutil

    # Load current results (keyed by target).
    current_by_target = load_current_results(args.results_dir)
    if not current_by_target:
        print("No current benchmark results found.")
        return 2

    total_benchmarks = sum(len(v) for v in current_by_target.values())
    print(f"Loaded {total_benchmarks} benchmarks from current run.")
    print(f"Targets: {', '.join(sorted(current_by_target.keys()))}")

    # Clone historical data. A directory left behind by an earlier run would
    # make `git clone` fail and be misreported as "no history", so clear it.
    perf_dir = "/tmp/perf-data-history"
    shutil.rmtree(perf_dir, ignore_errors=True)
    if not clone_perf_branch(args.perf_branch, perf_dir):
        print("No historical data found (perf-data branch missing).")
        print("This is expected on the first run. Storing baseline only.")
        return 0

    # Run analysis per target.
    all_regressions = []
    all_improvements = []
    total_analyzed = 0
    total_skipped = 0
    all_keys = set()
    for target in sorted(current_by_target.keys()):
        target_current = current_by_target[target]
        historical = load_historical_data(perf_dir, target, args.history_count)
        if not historical:
            print(f"  {target}: no historical data, skipping analysis.")
            continue

        regs, imps, skipped = find_regressions(
            target_current, historical, args.significance, args.min_change_pct
        )
        # find_regressions leaves `target` empty; tag the records here.
        regs = [r._replace(target=target) for r in regs]
        imps = [r._replace(target=target) for r in imps]
        all_regressions.extend(regs)
        all_improvements.extend(imps)
        total_analyzed += len(target_current) - skipped
        total_skipped += skipped
        # Use target-qualified keys so the same benchmark on different ISAs
        # shows up as separate entries in reports.
        all_keys.update(f"[{target}] {k}" for k in target_current)
        print(
            f"  {target}: {len(regs)} regressions, "
            f"{len(imps)} improvements, {skipped} skipped"
        )

    # Write reports.
    report_path = args.output_report
    write_text_report(
        all_regressions, all_improvements, total_skipped, total_analyzed, report_path
    )
    print(f"\nText report: {report_path}")
    junit_path = _junit_path_for(report_path)
    write_junit_report(all_regressions, all_keys, junit_path)
    print(f"JUnit report: {junit_path}")

    # Print summary.
    if all_regressions:
        print(f"\nREGRESSIONS DETECTED: {len(all_regressions)} benchmark(s)")
        for r in all_regressions:
            print(f"  [{r.target}] {r.key}: {r.change_pct:+.1f}% (p={r.p_value:.4f})")
        return 1

    n_imp = len(all_improvements)
    print(f"\nNo regressions detected. {n_imp} improvement(s) found.")
    return 0


def main():
    """Command-line entry point; always terminates via ``sys.exit``.

    Exit codes: 0 = no regressions, 1 = regressions detected, 2 = error.
    """
    try:
        exit_code = _run_analysis(parse_args())
    except Exception:
        # An uncaught exception would end the interpreter with status 1,
        # which callers interpret as "regressions detected". Report the
        # traceback and use the documented error code instead.
        import traceback

        traceback.print_exc()
        exit_code = 2
    sys.exit(exit_code)


if __name__ == "__main__":
    main()

100
ci/scripts/push_perf_data.sh Executable file
View File

@@ -0,0 +1,100 @@
#!/bin/sh
# Push benchmark results to the perf-data orphan branch.
# POSIX sh compatible (runs under Alpine's busybox ash).
#
# Expected environment variables:
#   EIGEN_CI_BUILDDIR     - build directory containing results/
#   EIGEN_CI_GIT_PUSH_URL - authenticated git push URL
#   CI_COMMIT_SHORT_SHA   - short commit hash
#
# NOTE(review): with xtrace (-x) enabled the push URL is echoed to the job
# log. Confirm that EIGEN_CI_GIT_PUSH_URL is a masked CI variable, or drop -x.
set -ex

results_dir="$(pwd)/${EIGEN_CI_BUILDDIR:-.bench-build}/results"
perf_branch="perf-data"
clone_dir="/tmp/perf-data-push"
push_url="${EIGEN_CI_GIT_PUSH_URL:?EIGEN_CI_GIT_PUSH_URL must be set}"

rm -rf "${clone_dir}"

# Clone perf-data branch, or create orphan if it doesn't exist.
if git clone --depth=1 --single-branch --branch "${perf_branch}" \
  "${push_url}" "${clone_dir}" 2>/dev/null; then
  echo "Cloned existing ${perf_branch} branch."
else
  echo "${perf_branch} branch does not exist, creating orphan..."
  mkdir -p "${clone_dir}"
  cd "${clone_dir}"
  git init
  git checkout --orphan "${perf_branch}"
  cat > README.md <<'EOF'
# Benchmark Performance Data
This branch stores nightly/weekly benchmark results as JSON files.
It is maintained automatically by the CI benchmark pipeline.
## Structure
<target>/
<date>_<commit>_<target>.json
## Analysis
See `ci/scripts/detect_regressions.py` on the main branch for the
regression detection script that consumes this data.
EOF
  git add README.md
  git -c user.name="CI Bot" -c user.email="ci@eigen.tuxfamily.org" \
    commit -m "Initialize perf-data branch"
  git remote add origin "${push_url}"
  cd -
fi

cd "${clone_dir}"

# Copy combined result files into target subdirectories.
# Only match canonical combined formats:
#   YYYY-MM-DDTHH-MM-SSZ_<hex>_<target>.json
#   YYYY-MM-DD_<hex>_<target>.json
# This avoids picking up raw per-benchmark files like bench_gemm_double.json.
copied=0
for combined_json in "${results_dir}"/*.json; do
  # An unmatched glob stays literal; skip it.
  [ -f "${combined_json}" ] || continue
  filename=$(basename "${combined_json}")
  # Must start with a UTC timestamp or date, followed by a hex commit hash.
  case "${filename}" in
    [0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]T[0-9][0-9]-[0-9][0-9]-[0-9][0-9]Z_[0-9a-f]*_*.json) ;;
    [0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]_[0-9a-f]*_*.json) ;;
    *) continue ;;
  esac
  # Extract target: strip timestamp/date + commit prefix and .json suffix.
  target=$(echo "${filename}" | sed 's/^[^_]*_[a-f0-9]*_//' | sed 's/\.json$//')
  mkdir -p "${target}"
  cp "${combined_json}" "${target}/${filename}"
  copied=$((copied + 1))
done

if [ "${copied}" -eq 0 ]; then
  echo "No result files to store."
  exit 0
fi

# Prune data older than 90 days to keep the branch manageable.
# We parse the date from the filename since clone mtime is always "now".
cutoff=$(date -u -d "@$(($(date +%s) - 90*86400))" +%Y-%m-%d)
find . -name '*.json' -path './*/*.json' | while IFS= read -r f; do
  # `|| true`: grep exits non-zero when the name has no date prefix. Under
  # `set -e` that would abort the loop, and with it the whole script, before
  # anything is committed. Such files are simply left in place.
  file_date=$(basename "$f" | grep -oE '^[0-9]{4}-[0-9]{2}-[0-9]{2}' || true)
  if [ -n "$file_date" ] && [ "$file_date" \< "$cutoff" ]; then
    rm -f "$f"
  fi
done

# Commit and push.
git add -A
git -c user.name="CI Bot" -c user.email="ci@eigen.tuxfamily.org" \
  commit -m "Add benchmark results for $(date -u +%Y-%m-%d) (${CI_COMMIT_SHORT_SHA:-unknown})" || {
  echo "No changes to commit."
  exit 0
}
git push origin "${perf_branch}"
echo "Pushed ${copied} result file(s) to ${perf_branch} branch."

144
ci/scripts/run.benchmark.sh Executable file
View File

@@ -0,0 +1,144 @@
#!/bin/bash
# Run Eigen benchmarks and collect JSON results with metadata.
#
# Expected environment variables:
#   EIGEN_CI_BUILDDIR       - build directory containing benchmark executables
#   EIGEN_BENCH_TARGET      - ISA target name (e.g. "x86-64-avx2")
#   EIGEN_BENCH_SCOPE       - "nightly" (core subset) or "weekly" (all)
#   EIGEN_BENCH_REPETITIONS - number of repetitions per benchmark (default: 5)
set -ex

rootdir=$(pwd)
builddir=${EIGEN_CI_BUILDDIR:-.bench-build}
results_dir="$(pwd)/${builddir}/results"
mkdir -p "${results_dir}"

target=${EIGEN_BENCH_TARGET:?EIGEN_BENCH_TARGET must be set}
scope=${EIGEN_BENCH_SCOPE:-nightly}
reps=${EIGEN_BENCH_REPETITIONS:-5}

# Auto-promote to weekly on Sundays so the full suite runs once a week
# without requiring a separate GitLab schedule. `date +%u` is the ISO weekday
# (1 = Monday ... 7 = Sunday), evaluated in UTC.
if [ "${scope}" = "nightly" ] && [ "$(date -u +%u)" = "7" ]; then
  echo "Sunday detected, promoting scope from nightly to weekly."
  scope="weekly"
fi

# Runtime ISA check: skip if the runner lacks the required instruction set.
if [[ "${target}" == *"avx512"* ]]; then
  if ! grep -q 'avx512dq' /proc/cpuinfo 2>/dev/null; then
    echo "WARNING: Runner does not support AVX-512 DQ. Skipping benchmarks."
    exit 0
  fi
fi
if [[ "${target}" == *"avx2"* ]]; then
  if ! grep -q 'avx2' /proc/cpuinfo 2>/dev/null; then
    echo "WARNING: Runner does not support AVX2. Skipping benchmarks."
    exit 0
  fi
fi

cd "${builddir}"

# Determine which benchmarks to run.
bench_list=()
if [[ "${scope}" == "weekly" ]]; then
  # Every bench_* executable in the build directory, in sorted order.
  while IFS= read -r -d '' exe; do
    bench_list+=("${exe}")
  done < <(find . -maxdepth 1 -type f -executable -name 'bench_*' -print0 | sort -z)
else
  # Core subset: one executable name per line; blank lines and lines
  # starting with '#' are ignored.
  while IFS= read -r name; do
    [[ -z "$name" || "$name" == \#* ]] && continue
    name=$(echo "$name" | xargs)  # trim whitespace
    [[ -z "$name" ]] && continue
    if [[ -x "./${name}" ]]; then
      bench_list+=("./${name}")
    else
      echo "WARNING: ${name} not found, skipping."
    fi
  done < "${rootdir}/ci/scripts/benchmark_targets.txt"
fi

if [[ ${#bench_list[@]} -eq 0 ]]; then
  echo "ERROR: No benchmark executables found."
  exit 1
fi

# Collect system info.
# The pipeline's status is that of `xargs`, which succeeds even on empty
# input, so a `|| echo unknown` fallback would never fire. Apply the default
# explicitly when no "model name" line exists in /proc/cpuinfo.
cpu_model=$(grep -m1 'model name' /proc/cpuinfo 2>/dev/null | cut -d: -f2 | xargs || true)
cpu_model=${cpu_model:-unknown}
timestamp=$(date -u +%Y-%m-%dT%H-%M-%SZ)
commit=${CI_COMMIT_SHORT_SHA:-$(git rev-parse --short HEAD 2>/dev/null || echo "unknown")}
successful_runs=0

# Run each benchmark executable.
for bench_exe in "${bench_list[@]}"; do
  bench_name=$(basename "${bench_exe}")
  outfile="${results_dir}/${bench_name}.json"
  echo "=== Running ${bench_name} (${reps} repetitions) ==="
  if ! "${bench_exe}" \
    --benchmark_format=json \
    --benchmark_out="${outfile}" \
    --benchmark_repetitions="${reps}" \
    --benchmark_report_aggregates_only=false \
    2>&1; then
    echo "WARNING: ${bench_name} failed (possibly SIGILL), skipping."
    # Drop any partial output so it is not merged into the combined file.
    rm -f "${outfile}"
    continue
  fi
  successful_runs=$((successful_runs + 1))
done

cd "${rootdir}"

if [[ ${successful_runs} -eq 0 ]]; then
  echo "ERROR: No benchmark executables completed successfully."
  exit 1
fi

# Wrap each result file with metadata and produce a combined output.
python3 - "${results_dir}" "${timestamp}" "${commit}" "${target}" "${cpu_model}" "${scope}" <<'PYEOF'
import json
import glob
import os
import sys

results_dir = sys.argv[1]
timestamp = sys.argv[2]
commit = sys.argv[3]
target = sys.argv[4]
cpu_model = sys.argv[5]
scope = sys.argv[6]

metadata = {
    "timestamp": timestamp,
    "date": timestamp[:10],
    "commit": commit,
    "target": target,
    "cpu_model": cpu_model,
    "scope": scope,
    "ci_job_id": os.environ.get("CI_JOB_ID", ""),
    "ci_pipeline_id": os.environ.get("CI_PIPELINE_ID", ""),
    "runner_description": os.environ.get("CI_RUNNER_DESCRIPTION", ""),
}

combined = {"metadata": metadata, "files": {}}
for jf in sorted(glob.glob(os.path.join(results_dir, "bench_*.json"))):
    name = os.path.splitext(os.path.basename(jf))[0]
    with open(jf) as f:
        data = json.load(f)
    entry = {
        "context": data.get("context", {}),
        "benchmarks": data.get("benchmarks", []),
    }
    combined["files"][name] = entry

outpath = os.path.join(results_dir, f"{timestamp}_{commit}_{target}.json")
with open(outpath, "w") as f:
    json.dump(combined, f, indent=2)

print(f"Combined results written to {outpath}")
print(f"  {len(combined['files'])} benchmark files, target={target}")
PYEOF