#!/usr/bin/env python3
"""Benchmark regression detection using Welch's t-test.

Compares the current benchmark run against historical data stored on
the perf-data git branch. A regression is flagged when:

1. Welch's t-test p-value < significance threshold (default 0.01)
2. The relative change exceeds a minimum percentage (default 5%)
3. The direction is a slowdown (higher real_time)

Exit codes:
  0  no regressions
  1  regressions detected
  2  error
"""

import argparse
import glob
import json
import os
import subprocess
import sys
import xml.etree.ElementTree as ET
from collections import defaultdict, namedtuple

# scipy is the only external dependency (pip-installed in the CI job).
from scipy.stats import ttest_ind

Regression = namedtuple(
    "Regression",
    ["target", "key", "current_mean", "historical_mean", "change_pct", "p_value"],
)


def parse_args():
    p = argparse.ArgumentParser(description=__doc__)
    p.add_argument(
        "--results-dir",
        required=True,
        help="Directory containing current run JSON files.",
    )
    p.add_argument(
        "--perf-branch",
        default="perf-data",
        help="Git branch storing historical benchmark data.",
    )
    p.add_argument(
        "--history-count",
        type=int,
        default=14,
        help="Number of past runs to compare against.",
    )
    p.add_argument(
        "--significance",
        type=float,
        default=0.01,
        help="P-value threshold for Welch's t-test.",
    )
    p.add_argument(
        "--min-change-pct",
        type=float,
        default=5.0,
        help="Minimum percentage change to flag.",
    )
    p.add_argument(
        "--output-report",
        default="regression_report.txt",
        help="Path for text report.",
    )
    return p.parse_args()
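
# Example invocation (script and directory names are illustrative; only
# --results-dir is required, the rest show the defaults):
#   python3 check_benchmark_regressions.py --results-dir bench_results \
#       --perf-branch perf-data --history-count 14 \
#       --significance 0.01 --min-change-pct 5.0 \
#       --output-report regression_report.txt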

def clone_perf_branch(branch, clone_dir):
    """Shallow-clone the perf-data branch. Returns True on success."""
    # Construct clone URL from CI environment or fall back to current remote.
    url = os.environ.get("CI_REPOSITORY_URL", "")
    if not url:
        try:
            url = subprocess.check_output(
                ["git", "remote", "get-url", "origin"], text=True
            ).strip()
        except Exception:
            return False

    try:
        subprocess.check_call(
            [
                "git",
                "clone",
                "--depth=1",
                "--single-branch",
                "--branch",
                branch,
                url,
                clone_dir,
            ],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
        return True
    except subprocess.CalledProcessError:
        return False
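
# The call above is equivalent to running (URL is illustrative):
#   git clone --depth=1 --single-branch --branch perf-data <repo-url> <clone-dir>
# A shallow single-branch clone keeps CI bandwidth and checkout time small
# even as the perf-data branch accumulates history.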

def _history_sort_key(fpath):
    """Sort key for historical result files.

    Prefer the recorded UTC timestamp in the JSON metadata. Fall back to the
    filename so older date-only files still participate in the history window.
    """
    try:
        with open(fpath) as f:
            metadata = json.load(f).get("metadata", {})
    except Exception:
        metadata = {}
    return metadata.get("timestamp") or metadata.get("date") or os.path.basename(fpath)
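
# Note: this relies on ISO-8601 UTC timestamps (e.g. "2026-04-01T12:00:00Z",
# an invented value) sorting lexicographically in chronological order, which
# holds as long as all entries share the same format and timezone.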

def load_historical_data(perf_dir, target, history_count):
    """Load per-repetition real_time values from the last *history_count* runs.

    Returns dict: benchmark_key -> list of raw real_time values (multiple per run).

    We load the same non-aggregate rows that load_current_results uses so both
    sides of the t-test contain the same kind of measurement (individual
    repetitions), avoiding a unit mismatch between per-rep and per-run means.
    """
    target_dir = os.path.join(perf_dir, target)
    if not os.path.isdir(target_dir):
        return {}

    files = sorted(
        glob.glob(os.path.join(target_dir, "*.json")),
        key=_history_sort_key,
        reverse=True,
    )
    files = files[:history_count]

    history = defaultdict(list)
    for fpath in files:
        with open(fpath) as f:
            data = json.load(f)
        for exe_name, exe_data in data.get("files", {}).items():
            for bm in exe_data.get("benchmarks", []):
                run_type = bm.get("run_type", "")
                if run_type == "aggregate":
                    continue
                name = bm.get("name", "")
                key = f"{exe_name}/{name}"
                rt = bm.get("real_time")
                if rt is not None:
                    history[key].append(rt)
    return history

def load_current_results(results_dir):
    """Load current run results, keyed by target.

    Returns dict: target -> dict(benchmark_key -> list of per-repetition real_time).
    """
    data = defaultdict(lambda: defaultdict(list))

    for jf in sorted(glob.glob(os.path.join(results_dir, "*_*_*.json"))):
        with open(jf) as f:
            run = json.load(f)
        meta = run.get("metadata", {})
        target = meta.get("target", "unknown")

        for exe_name, exe_data in run.get("files", {}).items():
            for bm in exe_data.get("benchmarks", []):
                name = bm.get("name", "")
                run_type = bm.get("run_type", "")
                # Use individual iteration rows (not aggregates) for the
                # current run so we have per-repetition samples.
                if run_type == "aggregate":
                    continue
                key = f"{exe_name}/{name}"
                rt = bm.get("real_time")
                if rt is not None:
                    data[target][key].append(rt)

    return data
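
# For reference, a minimal sketch of the result-file layout the two loaders
# above assume (field names mirror the code; the values are invented):
# {
#   "metadata": {"target": "x86-64-avx2", "timestamp": "2026-04-01T12:00:00Z"},
#   "files": {
#     "bench_gemm": {
#       "benchmarks": [
#         {"name": "BM_Gemm/256", "run_type": "iteration", "real_time": 1234.5},
#         {"name": "BM_Gemm/256_mean", "run_type": "aggregate", "real_time": 1230.0}
#       ]
#     }
#   }
# }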

def find_regressions(current, historical, significance, min_change_pct):
    """Compare current vs historical using Welch's t-test.

    Returns (regressions, improvements, skipped_count).
    """
    regressions = []
    improvements = []
    skipped = 0

    for key, current_values in sorted(current.items()):
        hist_values = historical.get(key)
        if not hist_values or len(hist_values) < 5:
            skipped += 1
            continue
        if len(current_values) < 3:
            skipped += 1
            continue

        cur_mean = sum(current_values) / len(current_values)
        hist_mean = sum(hist_values) / len(hist_values)

        if hist_mean == 0:
            skipped += 1
            continue

        change_pct = (cur_mean - hist_mean) / hist_mean * 100.0

        _, p_value = ttest_ind(current_values, hist_values, equal_var=False)

        entry = Regression(
            target="",  # filled in by caller
            key=key,
            current_mean=cur_mean,
            historical_mean=hist_mean,
            change_pct=change_pct,
            p_value=p_value,
        )

        if p_value < significance and abs(change_pct) > min_change_pct:
            if change_pct > 0:
                # Higher real_time = slower = regression.
                regressions.append(entry)
            else:
                improvements.append(entry)

    return regressions, improvements, skipped
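
# Worked example (invented numbers): hist_mean=100.0 and cur_mean=107.0 give
# change_pct = (107.0 - 100.0) / 100.0 * 100.0 = +7.0%. If the t-test on the
# raw samples then yields p = 0.002, both gates pass (0.002 < 0.01 and
# 7.0 > 5.0), and the positive sign (slower) files the entry as a regression.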

def _qualified_key(r):
    """Target-qualified display key, e.g. '[x86-64-avx2] bench_gemm/BM_Gemm/256'."""
    return f"[{r.target}] {r.key}"

def write_text_report(regressions, improvements, skipped, total, path):
    """Write a human-readable summary."""
    with open(path, "w") as f:
        f.write("# Benchmark Regression Report\n\n")

        if regressions:
            f.write(f"## Regressions ({len(regressions)})\n\n")
            f.write(
                f"{'Benchmark':<70s} {'Historical':>12s} {'Current':>12s} "
                f"{'Change':>8s} {'p-value':>8s}\n"
            )
            f.write("-" * 114 + "\n")
            for r in sorted(regressions, key=lambda x: -x.change_pct):
                f.write(
                    f"{_qualified_key(r):<70s} {r.historical_mean:>12.1f} {r.current_mean:>12.1f} "
                    f"{r.change_pct:>+7.1f}% {r.p_value:>8.4f}\n"
                )
            f.write("\n")

        if improvements:
            f.write(f"## Improvements ({len(improvements)})\n\n")
            f.write(
                f"{'Benchmark':<70s} {'Historical':>12s} {'Current':>12s} "
                f"{'Change':>8s} {'p-value':>8s}\n"
            )
            f.write("-" * 114 + "\n")
            for r in sorted(improvements, key=lambda x: x.change_pct):
                f.write(
                    f"{_qualified_key(r):<70s} {r.historical_mean:>12.1f} {r.current_mean:>12.1f} "
                    f"{r.change_pct:>+7.1f}% {r.p_value:>8.4f}\n"
                )
            f.write("\n")

        f.write("## Summary\n\n")
        f.write(f"- Benchmarks analyzed: {total}\n")
        f.write(f"- Regressions: {len(regressions)}\n")
        f.write(f"- Improvements: {len(improvements)}\n")
        f.write(f"- Skipped (insufficient data): {skipped}\n")
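
# A regression row renders roughly as (values invented, spacing compressed):
#   [x86-64-avx2] bench_gemm/BM_Gemm/256   100.0   107.0   +7.0%   0.0020
# The 114-character rule matches the summed column widths
# (70 + 12 + 12 + 8 + 8 plus four separating spaces).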

def write_junit_report(regressions, analyzed_keys, path):
    """Write JUnit XML so GitLab displays results in the test report tab.

    Keys in *analyzed_keys* and regression entries are target-qualified
    (e.g. "[x86-64-avx2] bench_gemm/BM_Gemm/256") so the same benchmark
    on different ISA targets appears as separate test cases.
    """
    suite = ET.Element(
        "testsuite",
        name="benchmark-regressions",
        tests=str(len(analyzed_keys)),
        failures=str(len(regressions)),
    )

    regression_by_qkey = {_qualified_key(r): r for r in regressions}
    for key in sorted(analyzed_keys):
        tc = ET.SubElement(suite, "testcase", name=key, classname="benchmark")
        r = regression_by_qkey.get(key)
        if r is not None:
            ET.SubElement(
                tc,
                "failure",
                message=f"{r.change_pct:+.1f}% regression (p={r.p_value:.4f})",
            ).text = (
                f"historical_mean={r.historical_mean:.1f} "
                f"current_mean={r.current_mean:.1f} "
                f"change={r.change_pct:+.1f}% p={r.p_value:.6f}"
            )

    tree = ET.ElementTree(suite)
    ET.indent(tree)
    tree.write(path, xml_declaration=True, encoding="utf-8")
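
# The emitted XML looks roughly like (names and numbers invented):
#   <testsuite name="benchmark-regressions" tests="2" failures="1">
#     <testcase name="[x86-64-avx2] bench_gemm/BM_Gemm/256" classname="benchmark">
#       <failure message="+7.0% regression (p=0.0020)">historical_mean=100.0 ...</failure>
#     </testcase>
#     <testcase name="[x86-64-avx2] bench_gemm/BM_Gemm/64" classname="benchmark" />
#   </testsuite>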

def main():
    args = parse_args()
    results_dir = args.results_dir

    # Load current results (keyed by target).
    current_by_target = load_current_results(results_dir)
    if not current_by_target:
        print("No current benchmark results found.")
        sys.exit(2)

    total_benchmarks = sum(len(v) for v in current_by_target.values())
    print(f"Loaded {total_benchmarks} benchmarks from current run.")
    print(f"Targets: {', '.join(sorted(current_by_target.keys()))}")

    # Clone historical data.
    perf_dir = "/tmp/perf-data-history"
    has_history = clone_perf_branch(args.perf_branch, perf_dir)

    if not has_history:
        print("No historical data found (perf-data branch missing).")
        print("This is expected on the first run. Storing baseline only.")
        sys.exit(0)

    # Run analysis per target.
    all_regressions = []
    all_improvements = []
    total_analyzed = 0
    total_skipped = 0
    all_keys = set()

    for target in sorted(current_by_target.keys()):
        target_current = current_by_target[target]
        historical = load_historical_data(perf_dir, target, args.history_count)
        if not historical:
            print(f" {target}: no historical data, skipping analysis.")
            continue

        regs, imps, skipped = find_regressions(
            target_current, historical, args.significance, args.min_change_pct
        )

        # Tag regressions with the target.
        regs = [r._replace(target=target) for r in regs]
        imps = [r._replace(target=target) for r in imps]

        all_regressions.extend(regs)
        all_improvements.extend(imps)
        total_analyzed += len(target_current) - skipped
        total_skipped += skipped
        # Use target-qualified keys so the same benchmark on different ISAs
        # shows up as separate entries in reports.
        all_keys.update(f"[{target}] {k}" for k in target_current)

        print(
            f" {target}: {len(regs)} regressions, "
            f"{len(imps)} improvements, {skipped} skipped"
        )

    # Write reports.
    report_path = args.output_report
    write_text_report(
        all_regressions, all_improvements, total_skipped, total_analyzed, report_path
    )
    print(f"\nText report: {report_path}")

    # Derive the JUnit path from the report path. splitext stays correct even
    # when the report name does not end in ".txt" (str.replace would silently
    # reuse the same path and clobber the text report).
    junit_path = os.path.splitext(report_path)[0] + ".xml"
    write_junit_report(all_regressions, all_keys, junit_path)
    print(f"JUnit report: {junit_path}")

    # Print summary and exit.
    if all_regressions:
        print(f"\nREGRESSIONS DETECTED: {len(all_regressions)} benchmark(s)")
        for r in all_regressions:
            print(f" [{r.target}] {r.key}: {r.change_pct:+.1f}% (p={r.p_value:.4f})")
        sys.exit(1)
    else:
        n_imp = len(all_improvements)
        print(f"\nNo regressions detected. {n_imp} improvement(s) found.")
        sys.exit(0)


if __name__ == "__main__":
    main()