mirror of
https://gitlab.com/libeigen/eigen.git
synced 2026-04-10 11:34:33 +08:00
Add nightly benchmark regression detection pipeline
libeigen/eigen!2349 Co-authored-by: Rasmus Munk Larsen <rmlarsen@gmail.com>
This commit is contained in:
@@ -15,6 +15,7 @@ stages:
|
||||
- checkformat
|
||||
- build
|
||||
- test
|
||||
- benchmark
|
||||
- deploy
|
||||
|
||||
variables:
|
||||
@@ -35,4 +36,5 @@ include:
|
||||
- "/ci/build.windows.gitlab-ci.yml"
|
||||
- "/ci/test.linux.gitlab-ci.yml"
|
||||
- "/ci/test.windows.gitlab-ci.yml"
|
||||
- "/ci/benchmark.gitlab-ci.yml"
|
||||
- "/ci/deploy.gitlab-ci.yml"
|
||||
|
||||
239
ci/benchmark.gitlab-ci.yml
Normal file
239
ci/benchmark.gitlab-ci.yml
Normal file
@@ -0,0 +1,239 @@
|
||||
# Benchmark pipeline for performance regression detection.
|
||||
#
|
||||
# Runs nightly (core subset) or weekly (all benchmarks) on scheduled
|
||||
# pipelines, with separate jobs per ISA target. Results are analyzed
|
||||
# using Welch's t-test against the last 30 runs stored on the perf-data
|
||||
# branch.
|
||||
|
||||
# ============================================================================
|
||||
# Variables
|
||||
# ============================================================================
|
||||
|
||||
variables:
|
||||
EIGEN_BENCH_BUILDDIR: .bench-build
|
||||
EIGEN_BENCH_REPETITIONS: "5"
|
||||
# Scope: "nightly" runs core subset, "weekly" runs all benchmarks.
|
||||
# The run script auto-promotes to "weekly" on Sundays so the full suite
|
||||
# runs once a week without a separate schedule. Override via web UI or
|
||||
# a dedicated GitLab schedule with EIGEN_BENCH_SCOPE=weekly.
|
||||
EIGEN_BENCH_SCOPE: "nightly"
|
||||
|
||||
# ============================================================================
|
||||
# Abstract bases
|
||||
# ============================================================================
|
||||
|
||||
.bench:linux:base:
|
||||
image: ubuntu:22.04
|
||||
variables:
|
||||
EIGEN_CI_BUILDDIR: ${EIGEN_BENCH_BUILDDIR}
|
||||
EIGEN_CI_TARGET_ARCH: ""
|
||||
EIGEN_BENCH_ISA_FLAGS: ""
|
||||
EIGEN_BENCH_TARGET: ""
|
||||
before_script:
|
||||
- . ci/scripts/common.linux.before_script.sh
|
||||
rules:
|
||||
- if: $CI_PIPELINE_SOURCE == "schedule" && $CI_PROJECT_NAMESPACE == "libeigen"
|
||||
- if: $CI_PIPELINE_SOURCE == "web" && $CI_PROJECT_NAMESPACE == "libeigen"
|
||||
|
||||
.bench:linux:build:
|
||||
extends: .bench:linux:base
|
||||
stage: benchmark
|
||||
needs: []
|
||||
script:
|
||||
- . ci/scripts/build.benchmark.sh
|
||||
artifacts:
|
||||
when: always
|
||||
name: "$CI_JOB_NAME_SLUG-$CI_COMMIT_REF_SLUG"
|
||||
paths:
|
||||
- ${EIGEN_BENCH_BUILDDIR}/
|
||||
exclude:
|
||||
- ${EIGEN_BENCH_BUILDDIR}/**/*.o
|
||||
expire_in: 2 days
|
||||
tags:
|
||||
- saas-linux-2xlarge-amd64
|
||||
|
||||
.bench:linux:run:
|
||||
extends: .bench:linux:base
|
||||
stage: benchmark
|
||||
script:
|
||||
- . ci/scripts/run.benchmark.sh
|
||||
artifacts:
|
||||
when: always
|
||||
name: "$CI_JOB_NAME_SLUG-$CI_COMMIT_REF_SLUG"
|
||||
paths:
|
||||
- ${EIGEN_BENCH_BUILDDIR}/results/
|
||||
expire_in: 30 days
|
||||
|
||||
# ============================================================================
|
||||
# Build jobs (one per ISA target, all run in parallel)
|
||||
# ============================================================================
|
||||
|
||||
bench:build:x86-64:sse:
|
||||
extends: .bench:linux:build
|
||||
variables:
|
||||
EIGEN_CI_C_COMPILER: gcc-10
|
||||
EIGEN_CI_CXX_COMPILER: g++-10
|
||||
EIGEN_CI_INSTALL: g++-10
|
||||
EIGEN_CI_TARGET_ARCH: x86_64
|
||||
EIGEN_BENCH_TARGET: x86-64-sse
|
||||
|
||||
bench:build:x86-64:avx2:
|
||||
extends: .bench:linux:build
|
||||
variables:
|
||||
EIGEN_CI_C_COMPILER: gcc-10
|
||||
EIGEN_CI_CXX_COMPILER: g++-10
|
||||
EIGEN_CI_INSTALL: g++-10
|
||||
EIGEN_CI_TARGET_ARCH: x86_64
|
||||
EIGEN_BENCH_TARGET: x86-64-avx2
|
||||
EIGEN_BENCH_ISA_FLAGS: "-mavx2 -mfma"
|
||||
|
||||
bench:build:x86-64:avx512dq:
|
||||
extends: .bench:linux:build
|
||||
variables:
|
||||
EIGEN_CI_C_COMPILER: gcc-10
|
||||
EIGEN_CI_CXX_COMPILER: g++-10
|
||||
EIGEN_CI_INSTALL: g++-10
|
||||
EIGEN_CI_TARGET_ARCH: x86_64
|
||||
EIGEN_BENCH_TARGET: x86-64-avx512dq
|
||||
EIGEN_BENCH_ISA_FLAGS: "-mavx512dq -mfma"
|
||||
|
||||
bench:build:aarch64:neon:
|
||||
extends: .bench:linux:build
|
||||
variables:
|
||||
EIGEN_CI_C_COMPILER: gcc-10
|
||||
EIGEN_CI_CXX_COMPILER: g++-10
|
||||
EIGEN_CI_INSTALL: g++-10
|
||||
EIGEN_CI_TARGET_ARCH: aarch64
|
||||
EIGEN_BENCH_TARGET: aarch64-neon
|
||||
EIGEN_BENCH_ISA_FLAGS: "-march=armv8.2-a+fp16"
|
||||
tags:
|
||||
- saas-linux-large-arm64
|
||||
|
||||
# ============================================================================
|
||||
# Run jobs (one per ISA target, each depends on its build)
|
||||
# ============================================================================
|
||||
|
||||
bench:run:x86-64:sse:
|
||||
extends: .bench:linux:run
|
||||
needs: [bench:build:x86-64:sse]
|
||||
variables:
|
||||
EIGEN_BENCH_TARGET: x86-64-sse
|
||||
tags:
|
||||
- saas-linux-2xlarge-amd64
|
||||
|
||||
bench:run:x86-64:avx2:
|
||||
extends: .bench:linux:run
|
||||
needs: [bench:build:x86-64:avx2]
|
||||
variables:
|
||||
EIGEN_BENCH_TARGET: x86-64-avx2
|
||||
tags:
|
||||
- saas-linux-2xlarge-amd64
|
||||
|
||||
bench:run:x86-64:avx512dq:
|
||||
extends: .bench:linux:run
|
||||
needs: [bench:build:x86-64:avx512dq]
|
||||
variables:
|
||||
EIGEN_BENCH_TARGET: x86-64-avx512dq
|
||||
tags:
|
||||
- saas-linux-2xlarge-amd64
|
||||
allow_failure: true
|
||||
|
||||
bench:run:aarch64:neon:
|
||||
extends: .bench:linux:run
|
||||
needs: [bench:build:aarch64:neon]
|
||||
variables:
|
||||
EIGEN_BENCH_TARGET: aarch64-neon
|
||||
tags:
|
||||
- saas-linux-large-arm64
|
||||
|
||||
# ============================================================================
|
||||
# Analysis: compare against historical data using Welch's t-test
|
||||
# ============================================================================
|
||||
|
||||
bench:analyze:
|
||||
stage: benchmark
|
||||
image: python:3.11-slim
|
||||
needs:
|
||||
- job: bench:run:x86-64:sse
|
||||
artifacts: true
|
||||
- job: bench:run:x86-64:avx2
|
||||
artifacts: true
|
||||
- job: bench:run:x86-64:avx512dq
|
||||
artifacts: true
|
||||
optional: true
|
||||
- job: bench:run:aarch64:neon
|
||||
artifacts: true
|
||||
variables:
|
||||
EIGEN_CI_BUILDDIR: ${EIGEN_BENCH_BUILDDIR}
|
||||
before_script:
|
||||
- export DEBIAN_FRONTEND=noninteractive
|
||||
- apt-get update -qq && apt-get install -y --no-install-recommends git
|
||||
- pip install --quiet scipy
|
||||
script:
|
||||
- |
|
||||
status=0
|
||||
python3 ci/scripts/detect_regressions.py \
|
||||
--results-dir "${EIGEN_BENCH_BUILDDIR}/results/" \
|
||||
--perf-branch perf-data \
|
||||
--history-count 30 \
|
||||
--significance 0.01 \
|
||||
--min-change-pct 5.0 \
|
||||
--output-report "${EIGEN_BENCH_BUILDDIR}/results/regression_report.txt" || status=$?
|
||||
case "${status}" in
|
||||
0|1) ;;
|
||||
*) exit "${status}" ;;
|
||||
esac
|
||||
printf '%s\n' "${status}" > "${EIGEN_BENCH_BUILDDIR}/results/regression_exit_code.txt"
|
||||
artifacts:
|
||||
when: always
|
||||
paths:
|
||||
- ${EIGEN_BENCH_BUILDDIR}/results/
|
||||
reports:
|
||||
junit: ${EIGEN_BENCH_BUILDDIR}/results/regression_report.xml
|
||||
expire_in: 90 days
|
||||
tags:
|
||||
- saas-linux-small-amd64
|
||||
rules:
|
||||
- if: $CI_PIPELINE_SOURCE == "schedule" && $CI_PROJECT_NAMESPACE == "libeigen"
|
||||
- if: $CI_PIPELINE_SOURCE == "web" && $CI_PROJECT_NAMESPACE == "libeigen"
|
||||
|
||||
# ============================================================================
|
||||
# Storage and gating
|
||||
# ============================================================================
|
||||
|
||||
bench:store-results:
|
||||
stage: deploy
|
||||
image: alpine:edge
|
||||
needs:
|
||||
- job: bench:analyze
|
||||
artifacts: true
|
||||
before_script:
|
||||
- apk add --no-cache git python3
|
||||
script:
|
||||
- . ci/scripts/push_perf_data.sh
|
||||
variables:
|
||||
EIGEN_CI_BUILDDIR: ${EIGEN_BENCH_BUILDDIR}
|
||||
tags:
|
||||
- saas-linux-small-amd64
|
||||
rules:
|
||||
- if: $CI_PIPELINE_SOURCE == "schedule" && $CI_PROJECT_NAMESPACE == "libeigen" && $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH
|
||||
- if: $CI_PIPELINE_SOURCE == "web" && $CI_PROJECT_NAMESPACE == "libeigen" && $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH
|
||||
|
||||
bench:regression-gate:
|
||||
stage: deploy
|
||||
image: alpine:edge
|
||||
needs:
|
||||
- job: bench:analyze
|
||||
artifacts: true
|
||||
- job: bench:store-results
|
||||
optional: true
|
||||
script:
|
||||
- code=$(cat "${EIGEN_BENCH_BUILDDIR}/results/regression_exit_code.txt")
|
||||
- test "${code}" != "1"
|
||||
variables:
|
||||
EIGEN_CI_BUILDDIR: ${EIGEN_BENCH_BUILDDIR}
|
||||
tags:
|
||||
- saas-linux-small-amd64
|
||||
rules:
|
||||
- if: $CI_PIPELINE_SOURCE == "schedule" && $CI_PROJECT_NAMESPACE == "libeigen"
|
||||
- if: $CI_PIPELINE_SOURCE == "web" && $CI_PROJECT_NAMESPACE == "libeigen"
|
||||
19
ci/scripts/benchmark_targets.txt
Normal file
19
ci/scripts/benchmark_targets.txt
Normal file
@@ -0,0 +1,19 @@
|
||||
# Nightly core benchmark subset.
|
||||
# One executable name per line. Lines starting with # are ignored.
|
||||
# These cover the most performance-critical dense linear algebra kernels.
|
||||
|
||||
# BLAS-like operations
|
||||
bench_gemm
|
||||
bench_gemm_double
|
||||
bench_gemv
|
||||
bench_dot
|
||||
bench_vecadd
|
||||
bench_trsm
|
||||
bench_reductions
|
||||
|
||||
# Decompositions
|
||||
bench_cholesky
|
||||
bench_cholesky_double
|
||||
bench_qr
|
||||
bench_svd
|
||||
bench_householder
|
||||
45
ci/scripts/build.benchmark.sh
Executable file
45
ci/scripts/build.benchmark.sh
Executable file
@@ -0,0 +1,45 @@
|
||||
#!/bin/bash
# Build Eigen benchmarks for a given ISA target.
#
# This script is sourced by the CI job, so `set` options and the working
# directory persist in the job shell; the original directory is restored
# at the end.
#
# Expected environment variables:
#   EIGEN_CI_BUILDDIR      - build directory (default: .bench-build)
#   EIGEN_CI_CXX_COMPILER  - C++ compiler
#   EIGEN_CI_C_COMPILER    - C compiler
#   EIGEN_BENCH_ISA_FLAGS  - ISA-specific compiler flags (e.g. "-mavx2 -mfma")

set -ex

rootdir=$(pwd)
builddir=${EIGEN_CI_BUILDDIR:-.bench-build}
mkdir -p "${builddir}"
cd "${builddir}"

# Install Google Benchmark from source if not already present.
# The common before_script already installs cmake/ninja; we only need
# git and ca-certificates for the clone.
if ! pkg-config --exists benchmark 2>/dev/null; then
  apt-get update -qq
  apt-get install -y --no-install-recommends git ca-certificates
  # Pinned to a release tag so every nightly run uses the same harness.
  git clone --depth 1 --branch v1.9.1 https://github.com/google/benchmark.git /tmp/gbench
  cmake -G Ninja -S /tmp/gbench -B /tmp/gbench-build \
    -DCMAKE_BUILD_TYPE=Release \
    -DBENCHMARK_ENABLE_TESTING=OFF \
    -DCMAKE_INSTALL_PREFIX=/usr/local
  cmake --build /tmp/gbench-build --target install
  rm -rf /tmp/gbench /tmp/gbench-build
fi

# Configure benchmarks. ISA flags are passed via CMAKE_CXX_FLAGS so they
# apply globally to all benchmark targets.
cmake -G Ninja \
  -DCMAKE_CXX_COMPILER="${EIGEN_CI_CXX_COMPILER}" \
  -DCMAKE_C_COMPILER="${EIGEN_CI_C_COMPILER}" \
  -DCMAKE_CXX_FLAGS="${EIGEN_BENCH_ISA_FLAGS}" \
  -DCMAKE_BUILD_TYPE=Release \
  "${rootdir}/benchmarks"

# Build all benchmark targets. The nightly/weekly scope filtering happens
# at run time, not build time.
# -k0 keeps going past failing targets; if the parallel build fails, it is
# retried once with a single job (-j1).
cmake --build . -- -k0 || cmake --build . -- -k0 -j1

cd "${rootdir}"
|
||||
394
ci/scripts/detect_regressions.py
Executable file
394
ci/scripts/detect_regressions.py
Executable file
@@ -0,0 +1,394 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Benchmark regression detection using Welch's t-test.
|
||||
|
||||
Compares the current benchmark run against historical data stored on
|
||||
the perf-data git branch. A regression is flagged when:
|
||||
|
||||
1. Welch's t-test p-value < significance threshold (default 0.01)
|
||||
2. The relative change exceeds a minimum percentage (default 5%)
|
||||
3. The direction is a slowdown (higher real_time)
|
||||
|
||||
Exit codes:
|
||||
0 no regressions
|
||||
1 regressions detected
|
||||
2 error
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import glob
|
||||
import json
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
import xml.etree.ElementTree as ET
|
||||
from collections import defaultdict, namedtuple
|
||||
|
||||
# scipy is the only external dependency (pip-installed in the CI job).
|
||||
from scipy.stats import ttest_ind
|
||||
|
||||
# One benchmark whose timing changed significantly between the current run
# and the historical window.
#   target          - ISA target name; find_regressions() leaves it empty and
#                     the caller fills it in via _replace()
#   key             - "<executable>/<benchmark name>"
#   current_mean    - mean real_time of the current run's repetitions
#   historical_mean - mean real_time over the historical samples
#   change_pct      - (current - historical) / historical * 100
#   p_value         - p-value returned by Welch's t-test
Regression = namedtuple(
    "Regression",
    ["target", "key", "current_mean", "historical_mean", "change_pct", "p_value"],
)
|
||||
|
||||
|
||||
def parse_args(argv=None):
    """Parse the command-line options of the regression detector.

    Args:
        argv: Optional list of argument strings. When None (the default),
            argparse reads ``sys.argv[1:]``, which is what the script entry
            point relies on.

    Returns:
        The populated ``argparse.Namespace``.
    """
    p = argparse.ArgumentParser(
        description=__doc__,
        # Keep the module docstring's line breaks (exit-code table) in --help.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    p.add_argument(
        "--results-dir",
        required=True,
        help="Directory containing current run JSON files.",
    )
    p.add_argument(
        "--perf-branch",
        default="perf-data",
        help="Git branch storing historical benchmark data.",
    )
    p.add_argument(
        "--history-count",
        type=int,
        default=14,
        help="Number of past runs to compare against.",
    )
    p.add_argument(
        "--significance",
        type=float,
        default=0.01,
        help="P-value threshold for Welch's t-test.",
    )
    p.add_argument(
        "--min-change-pct",
        type=float,
        default=5.0,
        help="Minimum percentage change to flag.",
    )
    p.add_argument(
        "--output-report",
        default="regression_report.txt",
        help="Path for text report.",
    )
    return p.parse_args(argv)
|
||||
|
||||
|
||||
def clone_perf_branch(branch, clone_dir):
    """Shallow-clone *branch* of the repository into *clone_dir*.

    The clone URL is taken from the ``CI_REPOSITORY_URL`` environment
    variable, falling back to the ``origin`` remote of the current checkout.

    Args:
        branch: Name of the branch holding historical benchmark data.
        clone_dir: Destination directory for the clone.

    Returns:
        True if the clone succeeded; False if no URL could be determined,
        git could not be executed, or the clone command failed. git's output
        is suppressed, so the cause of a failure is not reported.
    """
    url = os.environ.get("CI_REPOSITORY_URL", "")
    if not url:
        try:
            url = subprocess.check_output(
                ["git", "remote", "get-url", "origin"], text=True
            ).strip()
        except (OSError, subprocess.CalledProcessError):
            # OSError covers a missing git binary; CalledProcessError covers
            # "not a repository" / "no such remote".
            return False
    if not url:
        return False

    clone_cmd = [
        "git",
        "clone",
        "--depth=1",
        "--single-branch",
        "--branch",
        branch,
        url,
        clone_dir,
    ]
    try:
        subprocess.check_call(
            clone_cmd,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    except (OSError, subprocess.CalledProcessError):
        # A missing git executable raises OSError (FileNotFoundError), not
        # CalledProcessError; both mean "no history available".
        return False
    return True
|
||||
|
||||
|
||||
def _read_json_object(fpath):
    """Return the JSON object stored in *fpath*, or None if it is unusable.

    A file is unusable when it cannot be opened or decoded, is not valid
    JSON, or its top-level value is not an object.
    """
    try:
        with open(fpath, encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, ValueError):
        # json.JSONDecodeError and UnicodeDecodeError are ValueError subclasses.
        return None
    return data if isinstance(data, dict) else None


def _run_sort_key(data, fpath):
    """Chronological sort key for an already-parsed result file."""
    metadata = data.get("metadata") if isinstance(data, dict) else None
    if not isinstance(metadata, dict):
        metadata = {}
    # str() keeps keys mutually comparable even if a timestamp was stored
    # as a non-string value.
    return str(
        metadata.get("timestamp") or metadata.get("date") or os.path.basename(fpath)
    )


def _history_sort_key(fpath):
    """Sort key for historical result files.

    Prefer the recorded UTC timestamp in the JSON metadata. Fall back to the
    date field, then to the filename, so older date-only files still
    participate in the history window. Unreadable files sort by filename.
    """
    return _run_sort_key(_read_json_object(fpath), fpath)


def _iter_history_runs(target_dir):
    """Yield ``(sort_key, parsed_json)`` for every readable file in *target_dir*."""
    for fpath in glob.glob(os.path.join(target_dir, "*.json")):
        data = _read_json_object(fpath)
        if data is None:
            print(
                f"WARNING: skipping unreadable history file {fpath}",
                file=sys.stderr,
            )
            continue
        yield _run_sort_key(data, fpath), data


def load_historical_data(perf_dir, target, history_count):
    """Load per-repetition real_time values from the last *history_count* runs.

    We load the same non-aggregate rows that load_current_results uses so both
    sides of the t-test contain the same kind of measurement (individual
    repetitions), avoiding a unit mismatch between per-rep and per-run means.

    Each history file is parsed once. Files that cannot be read or parsed are
    skipped with a warning instead of aborting the analysis.

    Args:
        perf_dir: Checkout of the perf-data branch.
        target: ISA target name; history lives in ``<perf_dir>/<target>/``.
        history_count: Maximum number of most-recent runs to use.

    Returns:
        dict: benchmark_key -> list of raw real_time values (multiple per
        run), newest run first. Empty if the target has no history directory.
    """
    import heapq

    target_dir = os.path.join(perf_dir, target)
    if not os.path.isdir(target_dir):
        return {}

    # nlargest keeps at most history_count parsed files alive at a time and
    # returns them ordered newest first.
    newest_runs = heapq.nlargest(
        history_count, _iter_history_runs(target_dir), key=lambda item: item[0]
    )

    history = defaultdict(list)
    for _, data in newest_runs:
        for exe_name, exe_data in data.get("files", {}).items():
            for bm in exe_data.get("benchmarks", []):
                if bm.get("run_type", "") == "aggregate":
                    continue
                rt = bm.get("real_time")
                if rt is not None:
                    history[f"{exe_name}/{bm.get('name', '')}"].append(rt)
    return history
|
||||
|
||||
|
||||
def load_current_results(results_dir):
    """Load current run results, keyed by target.

    Only combined result files (a JSON object with a "files" mapping) are
    used. The ``*_*_*.json`` pattern also matches raw per-benchmark outputs
    such as ``bench_gemm_double.json``; those are skipped explicitly.

    Args:
        results_dir: Directory containing the current run's JSON files.

    Returns:
        dict: target -> dict(benchmark_key -> list of per-repetition
        real_time). The target comes from each file's metadata and defaults
        to "unknown".

    Raises:
        OSError, ValueError: if a matching file cannot be read or is not
            valid JSON.
    """
    data = defaultdict(lambda: defaultdict(list))

    for jf in sorted(glob.glob(os.path.join(results_dir, "*_*_*.json"))):
        with open(jf, encoding="utf-8") as f:
            run = json.load(f)
        files = run.get("files") if isinstance(run, dict) else None
        if not isinstance(files, dict):
            # Not a combined result file.
            continue

        meta = run.get("metadata", {})
        target = meta.get("target", "unknown")

        for exe_name, exe_data in files.items():
            for bm in exe_data.get("benchmarks", []):
                # Use individual repetition rows (not aggregates) so the
                # current run contributes per-repetition samples.
                if bm.get("run_type", "") == "aggregate":
                    continue
                rt = bm.get("real_time")
                if rt is not None:
                    data[target][f"{exe_name}/{bm.get('name', '')}"].append(rt)

    return data
|
||||
|
||||
|
||||
def find_regressions(
    current,
    historical,
    significance,
    min_change_pct,
    min_history_samples=5,
    min_current_samples=3,
):
    """Compare current vs historical using Welch's t-test.

    A benchmark is flagged when its mean changed by more than
    *min_change_pct* percent AND the two-sided Welch's t-test p-value is
    below *significance*. A positive change (higher real_time) is a
    regression, a negative one an improvement.

    Args:
        current: dict benchmark_key -> list of current real_time samples.
        historical: dict benchmark_key -> list of historical samples.
        significance: p-value threshold.
        min_change_pct: Minimum absolute percentage change to flag.
        min_history_samples: Benchmarks with fewer historical samples are
            skipped.
        min_current_samples: Benchmarks with fewer current samples are
            skipped.

    Returns:
        (regressions, improvements, skipped_count). The lists hold
        ``Regression`` entries with an empty ``target`` for the caller to
        fill in. skipped_count counts benchmarks with too few samples or a
        zero historical mean.
    """
    regressions = []
    improvements = []
    skipped = 0

    for key, current_values in sorted(current.items()):
        hist_values = historical.get(key)
        if (
            not hist_values
            or len(hist_values) < min_history_samples
            or len(current_values) < min_current_samples
        ):
            skipped += 1
            continue

        cur_mean = sum(current_values) / len(current_values)
        hist_mean = sum(hist_values) / len(hist_values)
        if hist_mean == 0:
            # A relative change is undefined against a zero baseline.
            skipped += 1
            continue

        change_pct = (cur_mean - hist_mean) / hist_mean * 100.0
        if abs(change_pct) <= min_change_pct:
            # Too small to flag whatever the p-value is; skip the t-test.
            continue

        _, p_value = ttest_ind(current_values, hist_values, equal_var=False)
        # A NaN p-value compares False here, so such a benchmark is never
        # flagged.
        if not p_value < significance:
            continue

        entry = Regression(
            target="",  # filled in by caller
            key=key,
            current_mean=cur_mean,
            historical_mean=hist_mean,
            change_pct=change_pct,
            p_value=p_value,
        )
        if change_pct > 0:
            # Higher real_time = slower = regression.
            regressions.append(entry)
        else:
            improvements.append(entry)

    return regressions, improvements, skipped
|
||||
|
||||
|
||||
def _qualified_key(r):
    """Target-qualified display key, e.g. '[x86-64-avx2] bench_gemm/BM_Gemm/256'."""
    return f"[{r.target}] {r.key}"


def _write_change_table(f, heading, entries):
    """Write one report section: heading, column header, one row per entry.

    Rows are ordered by magnitude of change, largest first.
    """
    f.write(f"## {heading} ({len(entries)})\n\n")
    f.write(
        f"{'Benchmark':<70s} {'Historical':>12s} {'Current':>12s} "
        f"{'Change':>8s} {'p-value':>8s}\n"
    )
    f.write("-" * 114 + "\n")
    for r in sorted(entries, key=lambda x: -abs(x.change_pct)):
        f.write(
            f"{_qualified_key(r):<70s} {r.historical_mean:>12.1f} {r.current_mean:>12.1f} "
            f"{r.change_pct:>+7.1f}% {r.p_value:>8.4f}\n"
        )
    f.write("\n")


def write_text_report(regressions, improvements, skipped, total, path):
    """Write a human-readable summary to *path*.

    Args:
        regressions: Regression entries flagged as slowdowns.
        improvements: Regression entries flagged as speedups.
        skipped: Number of benchmarks skipped for insufficient data.
        total: Number of benchmarks analyzed.
        path: Output file path; overwritten if it exists.
    """
    with open(path, "w", encoding="utf-8") as f:
        f.write("# Benchmark Regression Report\n\n")

        if regressions:
            _write_change_table(f, "Regressions", regressions)
        if improvements:
            _write_change_table(f, "Improvements", improvements)

        f.write("## Summary\n\n")
        f.write(f"- Benchmarks analyzed: {total}\n")
        f.write(f"- Regressions: {len(regressions)}\n")
        f.write(f"- Improvements: {len(improvements)}\n")
        f.write(f"- Skipped (insufficient data): {skipped}\n")


def write_junit_report(regressions, analyzed_keys, path):
    """Write JUnit XML so GitLab displays results in the test report tab.

    Keys in *analyzed_keys* and regression entries are target-qualified
    (e.g. "[x86-64-avx2] bench_gemm/BM_Gemm/256") so the same benchmark
    on different ISA targets appears as separate test cases.

    Args:
        regressions: Regression entries; each becomes a failed test case.
        analyzed_keys: Iterable of target-qualified keys, one test case each.
        path: Output XML file path; overwritten if it exists.
    """
    suite = ET.Element(
        "testsuite",
        name="benchmark-regressions",
        tests=str(len(analyzed_keys)),
        failures=str(len(regressions)),
    )

    regression_by_qkey = {_qualified_key(r): r for r in regressions}
    for key in sorted(analyzed_keys):
        tc = ET.SubElement(suite, "testcase", name=key, classname="benchmark")
        r = regression_by_qkey.get(key)
        if r is None:
            continue
        failure = ET.SubElement(
            tc,
            "failure",
            message=f"{r.change_pct:+.1f}% regression (p={r.p_value:.4f})",
        )
        failure.text = (
            f"historical_mean={r.historical_mean:.1f} "
            f"current_mean={r.current_mean:.1f} "
            f"change={r.change_pct:+.1f}% p={r.p_value:.6f}"
        )

    tree = ET.ElementTree(suite)
    ET.indent(tree)
    tree.write(path, xml_declaration=True, encoding="utf-8")
|
||||
|
||||
|
||||
def main():
    """Entry point: load results, compare against history, write reports.

    Exits with status 0 when no regressions are found (or no history could
    be cloned), 1 when regressions are detected, and 2 when no current
    results are found.
    """
    args = parse_args()
    results_dir = args.results_dir

    # Load current results (keyed by target).
    current_by_target = load_current_results(results_dir)
    if not current_by_target:
        print("No current benchmark results found.")
        sys.exit(2)

    total_benchmarks = sum(len(v) for v in current_by_target.values())
    print(f"Loaded {total_benchmarks} benchmarks from current run.")
    print(f"Targets: {', '.join(sorted(current_by_target.keys()))}")

    # Clone historical data.
    perf_dir = "/tmp/perf-data-history"
    has_history = clone_perf_branch(args.perf_branch, perf_dir)

    if not has_history:
        # NOTE(review): any clone failure lands here, not only a missing
        # branch; no report files are written on this path.
        print("No historical data found (perf-data branch missing).")
        print("This is expected on the first run. Storing baseline only.")
        sys.exit(0)

    # Run analysis per target.
    all_regressions = []
    all_improvements = []
    total_analyzed = 0
    total_skipped = 0
    all_keys = set()

    for target in sorted(current_by_target.keys()):
        target_current = current_by_target[target]
        historical = load_historical_data(perf_dir, target, args.history_count)
        if not historical:
            print(f"  {target}: no historical data, skipping analysis.")
            continue

        regs, imps, skipped = find_regressions(
            target_current, historical, args.significance, args.min_change_pct
        )

        # Tag entries with the target they were measured on.
        regs = [r._replace(target=target) for r in regs]
        imps = [r._replace(target=target) for r in imps]

        all_regressions.extend(regs)
        all_improvements.extend(imps)
        total_analyzed += len(target_current) - skipped
        total_skipped += skipped
        # Use target-qualified keys so the same benchmark on different ISAs
        # shows up as separate entries in reports. The format must match
        # _qualified_key() so JUnit failures line up with their test cases.
        all_keys.update(f"[{target}] {k}" for k in target_current)

        print(
            f"  {target}: {len(regs)} regressions, "
            f"{len(imps)} improvements, {skipped} skipped"
        )

    # Write reports.
    report_path = args.output_report
    write_text_report(
        all_regressions, all_improvements, total_skipped, total_analyzed, report_path
    )
    print(f"\nText report: {report_path}")

    # Swap only the file extension. str.replace(".txt", ".xml") would touch
    # ".txt" anywhere in the path and, for a path without ".txt", return the
    # text report's own path, overwriting it.
    junit_path = os.path.splitext(report_path)[0] + ".xml"
    write_junit_report(all_regressions, all_keys, junit_path)
    print(f"JUnit report: {junit_path}")

    # Print summary and exit.
    if all_regressions:
        print(f"\nREGRESSIONS DETECTED: {len(all_regressions)} benchmark(s)")
        for r in all_regressions:
            print(f"  [{r.target}] {r.key}: {r.change_pct:+.1f}% (p={r.p_value:.4f})")
        sys.exit(1)
    else:
        n_imp = len(all_improvements)
        print(f"\nNo regressions detected. {n_imp} improvement(s) found.")
        sys.exit(0)


if __name__ == "__main__":
    main()
|
||||
100
ci/scripts/push_perf_data.sh
Executable file
100
ci/scripts/push_perf_data.sh
Executable file
@@ -0,0 +1,100 @@
|
||||
#!/bin/sh
# Push benchmark results to the perf-data orphan branch.
# Intended to run under Alpine's busybox ash.
#
# Expected environment variables:
#   EIGEN_CI_BUILDDIR     - build directory containing results/
#   EIGEN_CI_GIT_PUSH_URL - authenticated git push URL
#   CI_COMMIT_SHORT_SHA   - short commit hash

# Command tracing (-x) is deliberately off: the clone and remote-add commands
# take the authenticated push URL as an argument, and tracing would print it
# in the job log.
set -e

results_dir="$(pwd)/${EIGEN_CI_BUILDDIR:-.bench-build}/results"
perf_branch="perf-data"
clone_dir="/tmp/perf-data-push"
push_url="${EIGEN_CI_GIT_PUSH_URL:?EIGEN_CI_GIT_PUSH_URL must be set}"

rm -rf "${clone_dir}"

# Clone perf-data branch, or create orphan if it doesn't exist.
if git clone --depth=1 --single-branch --branch "${perf_branch}" \
    "${push_url}" "${clone_dir}" 2>/dev/null; then
  echo "Cloned existing ${perf_branch} branch."
else
  echo "${perf_branch} branch does not exist, creating orphan..."
  mkdir -p "${clone_dir}"
  cd "${clone_dir}"
  git init
  git checkout --orphan "${perf_branch}"
  cat > README.md <<'EOF'
# Benchmark Performance Data

This branch stores nightly/weekly benchmark results as JSON files.
It is maintained automatically by the CI benchmark pipeline.

## Structure

    <target>/
      <date>_<commit>_<target>.json

## Analysis

See `ci/scripts/detect_regressions.py` on the main branch for the
regression detection script that consumes this data.
EOF
  git add README.md
  git -c user.name="CI Bot" -c user.email="ci@eigen.tuxfamily.org" \
    commit -m "Initialize perf-data branch"
  git remote add origin "${push_url}"
  cd -
fi

cd "${clone_dir}"

# Copy combined result files into target subdirectories.
# Only match canonical combined formats:
#   YYYY-MM-DDTHH-MM-SSZ_<hex>_<target>.json
#   YYYY-MM-DD_<hex>_<target>.json
# This avoids picking up raw per-benchmark files like bench_gemm_double.json.
copied=0
for combined_json in "${results_dir}"/*.json; do
  [ -f "${combined_json}" ] || continue
  filename=$(basename "${combined_json}")
  # Must start with a UTC timestamp or date, followed by a hex commit hash.
  case "${filename}" in
    [0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]T[0-9][0-9]-[0-9][0-9]-[0-9][0-9]Z_[0-9a-f]*_*.json) ;;
    [0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]_[0-9a-f]*_*.json) ;;
    *) continue ;;
  esac
  # Extract target: strip timestamp/date + commit prefix and .json suffix.
  target=$(echo "${filename}" | sed 's/^[^_]*_[a-f0-9]*_//' | sed 's/\.json$//')
  mkdir -p "${target}"
  cp "${combined_json}" "${target}/${filename}"
  copied=$((copied + 1))
done

if [ "${copied}" -eq 0 ]; then
  echo "No result files to store."
  exit 0
fi

# Prune data older than 90 days to keep the branch manageable.
# We parse the date from the filename since clone mtime is always "now".
cutoff=$(date -u -d "@$(($(date +%s) - 90*86400))" +%Y-%m-%d)
find . -name '*.json' -path './*/*.json' | while IFS= read -r f; do
  file_date=$(basename "$f" | grep -oE '^[0-9]{4}-[0-9]{2}-[0-9]{2}')
  # ISO dates (YYYY-MM-DD) order chronologically under string comparison.
  if [ -n "$file_date" ] && [ "$file_date" \< "$cutoff" ]; then
    rm -f "$f"
  fi
done

# Commit and push.
git add -A
git -c user.name="CI Bot" -c user.email="ci@eigen.tuxfamily.org" \
  commit -m "Add benchmark results for $(date -u +%Y-%m-%d) (${CI_COMMIT_SHORT_SHA:-unknown})" || {
  echo "No changes to commit."
  exit 0
}
git push origin "${perf_branch}"

echo "Pushed ${copied} result file(s) to ${perf_branch} branch."
|
||||
144
ci/scripts/run.benchmark.sh
Executable file
144
ci/scripts/run.benchmark.sh
Executable file
@@ -0,0 +1,144 @@
|
||||
#!/bin/bash
# Run Eigen benchmarks and collect JSON results with metadata.
#
# Expected environment variables:
#   EIGEN_CI_BUILDDIR       - build directory containing benchmark executables
#   EIGEN_BENCH_TARGET      - ISA target name (e.g. "x86-64-avx2")
#   EIGEN_BENCH_SCOPE       - "nightly" (core subset) or "weekly" (all)
#   EIGEN_BENCH_REPETITIONS - number of repetitions per benchmark (default: 5)

set -ex

rootdir=$(pwd)
builddir=${EIGEN_CI_BUILDDIR:-.bench-build}
results_dir="${rootdir}/${builddir}/results"
mkdir -p "${results_dir}"

target=${EIGEN_BENCH_TARGET:?EIGEN_BENCH_TARGET must be set}
scope=${EIGEN_BENCH_SCOPE:-nightly}
reps=${EIGEN_BENCH_REPETITIONS:-5}

# Auto-promote to weekly on Sundays so the full suite runs once a week
# without requiring a separate GitLab schedule. `date +%u` numbers weekdays
# 1 (Monday) through 7 (Sunday); the check uses UTC.
if [ "${scope}" = "nightly" ] && [ "$(date -u +%u)" = "7" ]; then
  echo "Sunday detected, promoting scope from nightly to weekly."
  scope="weekly"
fi

# Runtime ISA check: skip if the runner lacks the required instruction set.
if [[ "${target}" == *"avx512"* ]]; then
  if ! grep -q 'avx512dq' /proc/cpuinfo 2>/dev/null; then
    echo "WARNING: Runner does not support AVX-512 DQ. Skipping benchmarks."
    exit 0
  fi
fi
if [[ "${target}" == *"avx2"* ]]; then
  if ! grep -q 'avx2' /proc/cpuinfo 2>/dev/null; then
    echo "WARNING: Runner does not support AVX2. Skipping benchmarks."
    exit 0
  fi
fi

cd "${builddir}"

# Determine which benchmarks to run.
bench_list=()
if [[ "${scope}" == "weekly" ]]; then
  while IFS= read -r -d '' exe; do
    bench_list+=("${exe}")
  done < <(find . -maxdepth 1 -type f -executable -name 'bench_*' -print0 | sort -z)
else
  while IFS= read -r name; do
    # Trim surrounding whitespace before looking at the line, so indented
    # comments and whitespace-only lines are ignored as well. Parameter
    # expansion is used because it is safe for any line content.
    name="${name#"${name%%[![:space:]]*}"}"
    name="${name%"${name##*[![:space:]]}"}"
    [[ -z "$name" || "$name" == \#* ]] && continue
    if [[ -x "./${name}" ]]; then
      bench_list+=("./${name}")
    else
      echo "WARNING: ${name} not found, skipping."
    fi
  done < "${rootdir}/ci/scripts/benchmark_targets.txt"
fi

if [[ ${#bench_list[@]} -eq 0 ]]; then
  echo "ERROR: No benchmark executables found."
  exit 1
fi

# Collect system info.
# The pipeline's exit status is that of its last command, so an `|| echo`
# fallback would not fire when grep finds no "model name" line; default the
# empty result explicitly instead.
cpu_model=$(grep -m1 'model name' /proc/cpuinfo 2>/dev/null | cut -d: -f2 | xargs || true)
cpu_model=${cpu_model:-unknown}
timestamp=$(date -u +%Y-%m-%dT%H-%M-%SZ)
commit=${CI_COMMIT_SHORT_SHA:-$(git rev-parse --short HEAD 2>/dev/null || echo "unknown")}
successful_runs=0

# Run each benchmark executable.
for bench_exe in "${bench_list[@]}"; do
  bench_name=$(basename "${bench_exe}")
  outfile="${results_dir}/${bench_name}.json"

  echo "=== Running ${bench_name} (${reps} repetitions) ==="
  if ! "${bench_exe}" \
      --benchmark_format=json \
      --benchmark_out="${outfile}" \
      --benchmark_repetitions="${reps}" \
      --benchmark_report_aggregates_only=false \
      2>&1; then
    echo "WARNING: ${bench_name} failed (possibly SIGILL), skipping."
    # Drop any partial output so it is not merged into the combined file.
    rm -f "${outfile}"
    continue
  fi
  successful_runs=$((successful_runs + 1))
done

cd "${rootdir}"

if [[ ${successful_runs} -eq 0 ]]; then
  echo "ERROR: No benchmark executables completed successfully."
  exit 1
fi

# Wrap each result file with metadata and produce a combined output.
python3 - "${results_dir}" "${timestamp}" "${commit}" "${target}" "${cpu_model}" "${scope}" <<'PYEOF'
import json
import glob
import os
import sys

results_dir = sys.argv[1]
timestamp = sys.argv[2]
commit = sys.argv[3]
target = sys.argv[4]
cpu_model = sys.argv[5]
scope = sys.argv[6]

metadata = {
    "timestamp": timestamp,
    "date": timestamp[:10],
    "commit": commit,
    "target": target,
    "cpu_model": cpu_model,
    "scope": scope,
    "ci_job_id": os.environ.get("CI_JOB_ID", ""),
    "ci_pipeline_id": os.environ.get("CI_PIPELINE_ID", ""),
    "runner_description": os.environ.get("CI_RUNNER_DESCRIPTION", ""),
}

combined = {"metadata": metadata, "files": {}}

for jf in sorted(glob.glob(os.path.join(results_dir, "bench_*.json"))):
    name = os.path.splitext(os.path.basename(jf))[0]
    with open(jf) as f:
        data = json.load(f)
    entry = {
        "context": data.get("context", {}),
        "benchmarks": data.get("benchmarks", []),
    }
    combined["files"][name] = entry

outpath = os.path.join(results_dir, f"{timestamp}_{commit}_{target}.json")
with open(outpath, "w") as f:
    json.dump(combined, f, indent=2)

print(f"Combined results written to {outpath}")
print(f"  {len(combined['files'])} benchmark files, target={target}")
PYEOF
|
||||
Reference in New Issue
Block a user