diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 8872ad0b6..5e2ec27d9 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -15,6 +15,7 @@ stages: - checkformat - build - test + - benchmark - deploy variables: @@ -35,4 +36,5 @@ include: - "/ci/build.windows.gitlab-ci.yml" - "/ci/test.linux.gitlab-ci.yml" - "/ci/test.windows.gitlab-ci.yml" + - "/ci/benchmark.gitlab-ci.yml" - "/ci/deploy.gitlab-ci.yml" diff --git a/ci/benchmark.gitlab-ci.yml b/ci/benchmark.gitlab-ci.yml new file mode 100644 index 000000000..ddbee782d --- /dev/null +++ b/ci/benchmark.gitlab-ci.yml @@ -0,0 +1,239 @@ +# Benchmark pipeline for performance regression detection. +# +# Runs nightly (core subset) or weekly (all benchmarks) on scheduled +# pipelines, with separate jobs per ISA target. Results are analyzed +# using Welch's t-test against the last 30 runs stored on the perf-data +# branch. + +# ============================================================================ +# Variables +# ============================================================================ + +variables: + EIGEN_BENCH_BUILDDIR: .bench-build + EIGEN_BENCH_REPETITIONS: "5" + # Scope: "nightly" runs core subset, "weekly" runs all benchmarks. + # The run script auto-promotes to "weekly" on Sundays so the full suite + # runs once a week without a separate schedule. Override via web UI or + # a dedicated GitLab schedule with EIGEN_BENCH_SCOPE=weekly. + EIGEN_BENCH_SCOPE: "nightly" + +# ============================================================================ +# Abstract bases +# ============================================================================ + +.bench:linux:base: + image: ubuntu:22.04 + variables: + EIGEN_CI_BUILDDIR: ${EIGEN_BENCH_BUILDDIR} + EIGEN_CI_TARGET_ARCH: "" + EIGEN_BENCH_ISA_FLAGS: "" + EIGEN_BENCH_TARGET: "" + before_script: + - . 
ci/scripts/common.linux.before_script.sh + rules: + - if: $CI_PIPELINE_SOURCE == "schedule" && $CI_PROJECT_NAMESPACE == "libeigen" + - if: $CI_PIPELINE_SOURCE == "web" && $CI_PROJECT_NAMESPACE == "libeigen" + +.bench:linux:build: + extends: .bench:linux:base + stage: benchmark + needs: [] + script: + - . ci/scripts/build.benchmark.sh + artifacts: + when: always + name: "$CI_JOB_NAME_SLUG-$CI_COMMIT_REF_SLUG" + paths: + - ${EIGEN_BENCH_BUILDDIR}/ + exclude: + - ${EIGEN_BENCH_BUILDDIR}/**/*.o + expire_in: 2 days + tags: + - saas-linux-2xlarge-amd64 + +.bench:linux:run: + extends: .bench:linux:base + stage: benchmark + script: + - . ci/scripts/run.benchmark.sh + artifacts: + when: always + name: "$CI_JOB_NAME_SLUG-$CI_COMMIT_REF_SLUG" + paths: + - ${EIGEN_BENCH_BUILDDIR}/results/ + expire_in: 30 days + +# ============================================================================ +# Build jobs (one per ISA target, all run in parallel) +# ============================================================================ + +bench:build:x86-64:sse: + extends: .bench:linux:build + variables: + EIGEN_CI_C_COMPILER: gcc-10 + EIGEN_CI_CXX_COMPILER: g++-10 + EIGEN_CI_INSTALL: g++-10 + EIGEN_CI_TARGET_ARCH: x86_64 + EIGEN_BENCH_TARGET: x86-64-sse + +bench:build:x86-64:avx2: + extends: .bench:linux:build + variables: + EIGEN_CI_C_COMPILER: gcc-10 + EIGEN_CI_CXX_COMPILER: g++-10 + EIGEN_CI_INSTALL: g++-10 + EIGEN_CI_TARGET_ARCH: x86_64 + EIGEN_BENCH_TARGET: x86-64-avx2 + EIGEN_BENCH_ISA_FLAGS: "-mavx2 -mfma" + +bench:build:x86-64:avx512dq: + extends: .bench:linux:build + variables: + EIGEN_CI_C_COMPILER: gcc-10 + EIGEN_CI_CXX_COMPILER: g++-10 + EIGEN_CI_INSTALL: g++-10 + EIGEN_CI_TARGET_ARCH: x86_64 + EIGEN_BENCH_TARGET: x86-64-avx512dq + EIGEN_BENCH_ISA_FLAGS: "-mavx512dq -mfma" + +bench:build:aarch64:neon: + extends: .bench:linux:build + variables: + EIGEN_CI_C_COMPILER: gcc-10 + EIGEN_CI_CXX_COMPILER: g++-10 + EIGEN_CI_INSTALL: g++-10 + EIGEN_CI_TARGET_ARCH: aarch64 + 
EIGEN_BENCH_TARGET: aarch64-neon + EIGEN_BENCH_ISA_FLAGS: "-march=armv8.2-a+fp16" + tags: + - saas-linux-large-arm64 + +# ============================================================================ +# Run jobs (one per ISA target, each depends on its build) +# ============================================================================ + +bench:run:x86-64:sse: + extends: .bench:linux:run + needs: [bench:build:x86-64:sse] + variables: + EIGEN_BENCH_TARGET: x86-64-sse + tags: + - saas-linux-2xlarge-amd64 + +bench:run:x86-64:avx2: + extends: .bench:linux:run + needs: [bench:build:x86-64:avx2] + variables: + EIGEN_BENCH_TARGET: x86-64-avx2 + tags: + - saas-linux-2xlarge-amd64 + +bench:run:x86-64:avx512dq: + extends: .bench:linux:run + needs: [bench:build:x86-64:avx512dq] + variables: + EIGEN_BENCH_TARGET: x86-64-avx512dq + tags: + - saas-linux-2xlarge-amd64 + allow_failure: true + +bench:run:aarch64:neon: + extends: .bench:linux:run + needs: [bench:build:aarch64:neon] + variables: + EIGEN_BENCH_TARGET: aarch64-neon + tags: + - saas-linux-large-arm64 + +# ============================================================================ +# Analysis: compare against historical data using Welch's t-test +# ============================================================================ + +bench:analyze: + stage: benchmark + image: python:3.11-slim + needs: + - job: bench:run:x86-64:sse + artifacts: true + - job: bench:run:x86-64:avx2 + artifacts: true + - job: bench:run:x86-64:avx512dq + artifacts: true + optional: true + - job: bench:run:aarch64:neon + artifacts: true + variables: + EIGEN_CI_BUILDDIR: ${EIGEN_BENCH_BUILDDIR} + before_script: + - export DEBIAN_FRONTEND=noninteractive + - apt-get update -qq && apt-get install -y --no-install-recommends git + - pip install --quiet scipy + script: + - | + status=0 + python3 ci/scripts/detect_regressions.py \ + --results-dir "${EIGEN_BENCH_BUILDDIR}/results/" \ + --perf-branch perf-data \ + --history-count 30 \ + --significance 0.01 \ 
+ --min-change-pct 5.0 \ + --output-report "${EIGEN_BENCH_BUILDDIR}/results/regression_report.txt" || status=$? + case "${status}" in + 0|1) ;; + *) exit "${status}" ;; + esac + printf '%s\n' "${status}" > "${EIGEN_BENCH_BUILDDIR}/results/regression_exit_code.txt" + artifacts: + when: always + paths: + - ${EIGEN_BENCH_BUILDDIR}/results/ + reports: + junit: ${EIGEN_BENCH_BUILDDIR}/results/regression_report.xml + expire_in: 90 days + tags: + - saas-linux-small-amd64 + rules: + - if: $CI_PIPELINE_SOURCE == "schedule" && $CI_PROJECT_NAMESPACE == "libeigen" + - if: $CI_PIPELINE_SOURCE == "web" && $CI_PROJECT_NAMESPACE == "libeigen" + +# ============================================================================ +# Storage and gating +# ============================================================================ + +bench:store-results: + stage: deploy + image: alpine:edge + needs: + - job: bench:analyze + artifacts: true + before_script: + - apk add --no-cache git python3 + script: + - . ci/scripts/push_perf_data.sh + variables: + EIGEN_CI_BUILDDIR: ${EIGEN_BENCH_BUILDDIR} + tags: + - saas-linux-small-amd64 + rules: + - if: $CI_PIPELINE_SOURCE == "schedule" && $CI_PROJECT_NAMESPACE == "libeigen" && $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH + - if: $CI_PIPELINE_SOURCE == "web" && $CI_PROJECT_NAMESPACE == "libeigen" && $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH + +bench:regression-gate: + stage: deploy + image: alpine:edge + needs: + - job: bench:analyze + artifacts: true + - job: bench:store-results + optional: true + script: + - code=$(cat "${EIGEN_BENCH_BUILDDIR}/results/regression_exit_code.txt") + - test "${code}" != "1" + variables: + EIGEN_CI_BUILDDIR: ${EIGEN_BENCH_BUILDDIR} + tags: + - saas-linux-small-amd64 + rules: + - if: $CI_PIPELINE_SOURCE == "schedule" && $CI_PROJECT_NAMESPACE == "libeigen" + - if: $CI_PIPELINE_SOURCE == "web" && $CI_PROJECT_NAMESPACE == "libeigen" diff --git a/ci/scripts/benchmark_targets.txt b/ci/scripts/benchmark_targets.txt new file mode 
100644 index 000000000..5209c6d85 --- /dev/null +++ b/ci/scripts/benchmark_targets.txt @@ -0,0 +1,19 @@ +# Nightly core benchmark subset. +# One executable name per line. Lines starting with # are ignored. +# These cover the most performance-critical dense linear algebra kernels. + +# BLAS-like operations +bench_gemm +bench_gemm_double +bench_gemv +bench_dot +bench_vecadd +bench_trsm +bench_reductions + +# Decompositions +bench_cholesky +bench_cholesky_double +bench_qr +bench_svd +bench_householder diff --git a/ci/scripts/build.benchmark.sh b/ci/scripts/build.benchmark.sh new file mode 100755 index 000000000..6fc496dc5 --- /dev/null +++ b/ci/scripts/build.benchmark.sh @@ -0,0 +1,45 @@ +#!/bin/bash +# Build Eigen benchmarks for a given ISA target. +# +# Expected environment variables: +# EIGEN_CI_BUILDDIR - build directory (default: .bench-build) +# EIGEN_CI_CXX_COMPILER - C++ compiler +# EIGEN_CI_C_COMPILER - C compiler +# EIGEN_BENCH_ISA_FLAGS - ISA-specific compiler flags (e.g. "-mavx2 -mfma") + +set -ex + +rootdir=$(pwd) +builddir=${EIGEN_CI_BUILDDIR:-.bench-build} +mkdir -p "${builddir}" +cd "${builddir}" + +# Install Google Benchmark from source if not already present. +# The common before_script already installs cmake/ninja; we only need +# git and ca-certificates for the clone. +if ! pkg-config --exists benchmark 2>/dev/null; then + apt-get update -qq + apt-get install -y --no-install-recommends git ca-certificates + git clone --depth 1 --branch v1.9.1 https://github.com/google/benchmark.git /tmp/gbench + cmake -G Ninja -S /tmp/gbench -B /tmp/gbench-build \ + -DCMAKE_BUILD_TYPE=Release \ + -DBENCHMARK_ENABLE_TESTING=OFF \ + -DCMAKE_INSTALL_PREFIX=/usr/local + cmake --build /tmp/gbench-build --target install + rm -rf /tmp/gbench /tmp/gbench-build +fi + +# Configure benchmarks. ISA flags are passed via CMAKE_CXX_FLAGS so they +# apply globally to all benchmark targets. 
+cmake -G Ninja \ + -DCMAKE_CXX_COMPILER="${EIGEN_CI_CXX_COMPILER}" \ + -DCMAKE_C_COMPILER="${EIGEN_CI_C_COMPILER}" \ + -DCMAKE_CXX_FLAGS="${EIGEN_BENCH_ISA_FLAGS}" \ + -DCMAKE_BUILD_TYPE=Release \ + "${rootdir}/benchmarks" + +# Build all benchmark targets. The nightly/weekly scope filtering happens +# at run time, not build time. +cmake --build . -- -k0 || cmake --build . -- -k0 -j1 + +cd "${rootdir}" diff --git a/ci/scripts/detect_regressions.py b/ci/scripts/detect_regressions.py new file mode 100755 index 000000000..6ce4c1c8c --- /dev/null +++ b/ci/scripts/detect_regressions.py @@ -0,0 +1,394 @@ +#!/usr/bin/env python3 +"""Benchmark regression detection using Welch's t-test. + +Compares the current benchmark run against historical data stored on +the perf-data git branch. A regression is flagged when: + + 1. Welch's t-test p-value < significance threshold (default 0.01) + 2. The relative change exceeds a minimum percentage (default 5%) + 3. The direction is a slowdown (higher real_time) + +Exit codes: + 0 no regressions + 1 regressions detected + 2 error +""" + +import argparse +import glob +import json +import os +import subprocess +import sys +import xml.etree.ElementTree as ET +from collections import defaultdict, namedtuple + +# scipy is the only external dependency (pip-installed in the CI job). 
+from scipy.stats import ttest_ind + +Regression = namedtuple( + "Regression", + ["target", "key", "current_mean", "historical_mean", "change_pct", "p_value"], +) + + +def parse_args(): + p = argparse.ArgumentParser(description=__doc__) + p.add_argument( + "--results-dir", + required=True, + help="Directory containing current run JSON files.", + ) + p.add_argument( + "--perf-branch", + default="perf-data", + help="Git branch storing historical benchmark data.", + ) + p.add_argument( + "--history-count", + type=int, + default=14, + help="Number of past runs to compare against.", + ) + p.add_argument( + "--significance", + type=float, + default=0.01, + help="P-value threshold for Welch's t-test.", + ) + p.add_argument( + "--min-change-pct", + type=float, + default=5.0, + help="Minimum percentage change to flag.", + ) + p.add_argument( + "--output-report", + default="regression_report.txt", + help="Path for text report.", + ) + return p.parse_args() + + +def clone_perf_branch(branch, clone_dir): + """Shallow-clone the perf-data branch. Returns True on success.""" + # Construct clone URL from CI environment or fall back to current remote. + url = os.environ.get("CI_REPOSITORY_URL", "") + if not url: + try: + url = subprocess.check_output( + ["git", "remote", "get-url", "origin"], text=True + ).strip() + except Exception: + return False + + try: + subprocess.check_call( + [ + "git", + "clone", + "--depth=1", + "--single-branch", + "--branch", + branch, + url, + clone_dir, + ], + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + ) + return True + except subprocess.CalledProcessError: + return False + + +def _history_sort_key(fpath): + """Sort key for historical result files. + + Prefer the recorded UTC timestamp in the JSON metadata. Fall back to the + filename so older date-only files still participate in the history window. 
+ """ + try: + with open(fpath) as f: + metadata = json.load(f).get("metadata", {}) + except Exception: + metadata = {} + return metadata.get("timestamp") or metadata.get("date") or os.path.basename(fpath) + + +def load_historical_data(perf_dir, target, history_count): + """Load per-repetition real_time values from the last *history_count* runs. + + Returns dict: benchmark_key -> list of raw real_time values (multiple per run). + + We load the same non-aggregate rows that load_current_results uses so both + sides of the t-test contain the same kind of measurement (individual + repetitions), avoiding a unit mismatch between per-rep and per-run means. + """ + target_dir = os.path.join(perf_dir, target) + if not os.path.isdir(target_dir): + return {} + + files = sorted( + glob.glob(os.path.join(target_dir, "*.json")), + key=_history_sort_key, + reverse=True, + ) + files = files[:history_count] + + history = defaultdict(list) + for fpath in files: + with open(fpath) as f: + data = json.load(f) + for exe_name, exe_data in data.get("files", {}).items(): + for bm in exe_data.get("benchmarks", []): + run_type = bm.get("run_type", "") + if run_type == "aggregate": + continue + name = bm.get("name", "") + key = f"{exe_name}/{name}" + rt = bm.get("real_time") + if rt is not None: + history[key].append(rt) + return history + + +def load_current_results(results_dir): + """Load current run results, keyed by target. + + Returns dict: target -> dict(benchmark_key -> list of per-repetition real_time). 
+ """ + data = defaultdict(lambda: defaultdict(list)) + + for jf in sorted(glob.glob(os.path.join(results_dir, "*_*_*.json"))): + with open(jf) as f: + run = json.load(f) + meta = run.get("metadata", {}) + target = meta.get("target", "unknown") + + for exe_name, exe_data in run.get("files", {}).items(): + for bm in exe_data.get("benchmarks", []): + name = bm.get("name", "") + run_type = bm.get("run_type", "") + # Use individual iteration rows (not aggregates) for the + # current run so we have per-repetition samples. + if run_type == "aggregate": + continue + key = f"{exe_name}/{name}" + rt = bm.get("real_time") + if rt is not None: + data[target][key].append(rt) + + return data + + +def find_regressions(current, historical, significance, min_change_pct): + """Compare current vs historical using Welch's t-test. + + Returns (regressions, improvements, skipped_count). + """ + regressions = [] + improvements = [] + skipped = 0 + + for key, current_values in sorted(current.items()): + hist_values = historical.get(key) + if not hist_values or len(hist_values) < 5: + skipped += 1 + continue + if len(current_values) < 3: + skipped += 1 + continue + + cur_mean = sum(current_values) / len(current_values) + hist_mean = sum(hist_values) / len(hist_values) + + if hist_mean == 0: + skipped += 1 + continue + + change_pct = (cur_mean - hist_mean) / hist_mean * 100.0 + + _, p_value = ttest_ind(current_values, hist_values, equal_var=False) + + entry = Regression( + target="", # filled in by caller + key=key, + current_mean=cur_mean, + historical_mean=hist_mean, + change_pct=change_pct, + p_value=p_value, + ) + + if p_value < significance and abs(change_pct) > min_change_pct: + if change_pct > 0: + # Higher real_time = slower = regression. + regressions.append(entry) + else: + improvements.append(entry) + + return regressions, improvements, skipped + + +def _qualified_key(r): + """Target-qualified display key, e.g. 
'[x86-64-avx2] bench_gemm/BM_Gemm/256'.""" + return f"[{r.target}] {r.key}" + + +def write_text_report(regressions, improvements, skipped, total, path): + """Write a human-readable summary.""" + with open(path, "w") as f: + f.write("# Benchmark Regression Report\n\n") + + if regressions: + f.write(f"## Regressions ({len(regressions)})\n\n") + f.write( + f"{'Benchmark':<70s} {'Historical':>12s} {'Current':>12s} " + f"{'Change':>8s} {'p-value':>8s}\n" + ) + f.write("-" * 114 + "\n") + for r in sorted(regressions, key=lambda x: -x.change_pct): + f.write( + f"{_qualified_key(r):<70s} {r.historical_mean:>12.1f} {r.current_mean:>12.1f} " + f"{r.change_pct:>+7.1f}% {r.p_value:>8.4f}\n" + ) + f.write("\n") + + if improvements: + f.write(f"## Improvements ({len(improvements)})\n\n") + f.write( + f"{'Benchmark':<70s} {'Historical':>12s} {'Current':>12s} " + f"{'Change':>8s} {'p-value':>8s}\n" + ) + f.write("-" * 114 + "\n") + for r in sorted(improvements, key=lambda x: x.change_pct): + f.write( + f"{_qualified_key(r):<70s} {r.historical_mean:>12.1f} {r.current_mean:>12.1f} " + f"{r.change_pct:>+7.1f}% {r.p_value:>8.4f}\n" + ) + f.write("\n") + + f.write(f"## Summary\n\n") + f.write(f"- Benchmarks analyzed: {total}\n") + f.write(f"- Regressions: {len(regressions)}\n") + f.write(f"- Improvements: {len(improvements)}\n") + f.write(f"- Skipped (insufficient data): {skipped}\n") + + +def write_junit_report(regressions, analyzed_keys, path): + """Write JUnit XML so GitLab displays results in the test report tab. + + Keys in *analyzed_keys* and regression entries are target-qualified + (e.g. "[x86-64-avx2] bench_gemm/BM_Gemm/256") so the same benchmark + on different ISA targets appears as separate test cases. 
+ """ + suite = ET.Element( + "testsuite", + name="benchmark-regressions", + tests=str(len(analyzed_keys)), + failures=str(len(regressions)), + ) + + regression_by_qkey = {_qualified_key(r): r for r in regressions} + for key in sorted(analyzed_keys): + tc = ET.SubElement(suite, "testcase", name=key, classname="benchmark") + r = regression_by_qkey.get(key) + if r is not None: + ET.SubElement( + tc, + "failure", + message=f"{r.change_pct:+.1f}% regression (p={r.p_value:.4f})", + ).text = ( + f"historical_mean={r.historical_mean:.1f} " + f"current_mean={r.current_mean:.1f} " + f"change={r.change_pct:+.1f}% p={r.p_value:.6f}" + ) + + tree = ET.ElementTree(suite) + ET.indent(tree) + tree.write(path, xml_declaration=True, encoding="utf-8") + + +def main(): + args = parse_args() + results_dir = args.results_dir + + # Load current results (keyed by target). + current_by_target = load_current_results(results_dir) + if not current_by_target: + print("No current benchmark results found.") + sys.exit(2) + + total_benchmarks = sum(len(v) for v in current_by_target.values()) + print(f"Loaded {total_benchmarks} benchmarks from current run.") + print(f"Targets: {', '.join(sorted(current_by_target.keys()))}") + + # Clone historical data. + perf_dir = "/tmp/perf-data-history" + has_history = clone_perf_branch(args.perf_branch, perf_dir) + + if not has_history: + print("No historical data found (perf-data branch missing).") + print("This is expected on the first run. Storing baseline only.") + sys.exit(0) + + # Run analysis per target. 
+ all_regressions = [] + all_improvements = [] + total_analyzed = 0 + total_skipped = 0 + all_keys = set() + + for target in sorted(current_by_target.keys()): + target_current = current_by_target[target] + historical = load_historical_data(perf_dir, target, args.history_count) + if not historical: + print(f" {target}: no historical data, skipping analysis.") + continue + + regs, imps, skipped = find_regressions( + target_current, historical, args.significance, args.min_change_pct + ) + + # Tag regressions with the target. + regs = [r._replace(target=target) for r in regs] + imps = [r._replace(target=target) for r in imps] + + all_regressions.extend(regs) + all_improvements.extend(imps) + total_analyzed += len(target_current) - skipped + total_skipped += skipped + # Use target-qualified keys so the same benchmark on different ISAs + # shows up as separate entries in reports. + all_keys.update(f"[{target}] {k}" for k in target_current) + + print( + f" {target}: {len(regs)} regressions, " + f"{len(imps)} improvements, {skipped} skipped" + ) + + # Write reports. + report_path = args.output_report + write_text_report( + all_regressions, all_improvements, total_skipped, total_analyzed, report_path + ) + print(f"\nText report: {report_path}") + + junit_path = report_path.replace(".txt", ".xml") + write_junit_report(all_regressions, all_keys, junit_path) + print(f"JUnit report: {junit_path}") + + # Print summary and exit. + if all_regressions: + print(f"\nREGRESSIONS DETECTED: {len(all_regressions)} benchmark(s)") + for r in all_regressions: + print(f" [{r.target}] {r.key}: {r.change_pct:+.1f}% (p={r.p_value:.4f})") + sys.exit(1) + else: + n_imp = len(all_improvements) + print(f"\nNo regressions detected. 
{n_imp} improvement(s) found.") + sys.exit(0) + + +if __name__ == "__main__": + main() diff --git a/ci/scripts/push_perf_data.sh b/ci/scripts/push_perf_data.sh new file mode 100755 index 000000000..485912f8b --- /dev/null +++ b/ci/scripts/push_perf_data.sh @@ -0,0 +1,100 @@ +#!/bin/sh +# Push benchmark results to the perf-data orphan branch. +# POSIX sh compatible (runs under Alpine's busybox ash). +# +# Expected environment variables: +# EIGEN_CI_BUILDDIR - build directory containing results/ +# EIGEN_CI_GIT_PUSH_URL - authenticated git push URL +# CI_COMMIT_SHORT_SHA - short commit hash + +set -ex + +results_dir="$(pwd)/${EIGEN_CI_BUILDDIR:-.bench-build}/results" +perf_branch="perf-data" +clone_dir="/tmp/perf-data-push" +push_url="${EIGEN_CI_GIT_PUSH_URL:?EIGEN_CI_GIT_PUSH_URL must be set}" + +rm -rf "${clone_dir}" + +# Clone perf-data branch, or create orphan if it doesn't exist. +if git clone --depth=1 --single-branch --branch "${perf_branch}" \ + "${push_url}" "${clone_dir}" 2>/dev/null; then + echo "Cloned existing ${perf_branch} branch." +else + echo "${perf_branch} branch does not exist, creating orphan..." + mkdir -p "${clone_dir}" + cd "${clone_dir}" + git init + git checkout --orphan "${perf_branch}" + cat > README.md <<'EOF' +# Benchmark Performance Data + +This branch stores nightly/weekly benchmark results as JSON files. +It is maintained automatically by the CI benchmark pipeline. + +## Structure + + <target>/ + <timestamp>_<commit>_<target>.json + +## Analysis + +See `ci/scripts/detect_regressions.py` on the main branch for the +regression detection script that consumes this data. +EOF + git add README.md + git -c user.name="CI Bot" -c user.email="ci@eigen.tuxfamily.org" \ + commit -m "Initialize perf-data branch" + git remote add origin "${push_url}" + cd - +fi + +cd "${clone_dir}" + +# Copy combined result files into target subdirectories. 
+# Only match canonical combined formats: +# YYYY-MM-DDTHH-MM-SSZ_<commit>_<target>.json +# YYYY-MM-DD_<commit>_<target>.json +# This avoids picking up raw per-benchmark files like bench_gemm_double.json. +copied=0 +for combined_json in "${results_dir}"/*.json; do + [ -f "${combined_json}" ] || continue + filename=$(basename "${combined_json}") + # Must start with a UTC timestamp or date, followed by a hex commit hash. + case "${filename}" in + [0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]T[0-9][0-9]-[0-9][0-9]-[0-9][0-9]Z_[0-9a-f]*_*.json) ;; + [0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]_[0-9a-f]*_*.json) ;; + *) continue ;; + esac + # Extract target: strip timestamp/date + commit prefix and .json suffix. + target=$(echo "${filename}" | sed 's/^[^_]*_[a-f0-9]*_//' | sed 's/\.json$//') + mkdir -p "${target}" + cp "${combined_json}" "${target}/${filename}" + copied=$((copied + 1)) +done + +if [ "${copied}" -eq 0 ]; then + echo "No result files to store." + exit 0 +fi + +# Prune data older than 90 days to keep the branch manageable. +# We parse the date from the filename since clone mtime is always "now". +cutoff=$(date -u -d "@$(($(date +%s) - 90*86400))" +%Y-%m-%d) +find . -name '*.json' -path './*/*.json' | while IFS= read -r f; do + file_date=$(basename "$f" | grep -oE '^[0-9]{4}-[0-9]{2}-[0-9]{2}') + if [ -n "$file_date" ] && [ "$file_date" \< "$cutoff" ]; then + rm -f "$f" + fi +done + +# Commit and push. +git add -A +git -c user.name="CI Bot" -c user.email="ci@eigen.tuxfamily.org" \ + commit -m "Add benchmark results for $(date -u +%Y-%m-%d) (${CI_COMMIT_SHORT_SHA:-unknown})" || { + echo "No changes to commit." + exit 0 +} +git push origin "${perf_branch}" + +echo "Pushed ${copied} result file(s) to ${perf_branch} branch." diff --git a/ci/scripts/run.benchmark.sh b/ci/scripts/run.benchmark.sh new file mode 100755 index 000000000..891d8dc03 --- /dev/null +++ b/ci/scripts/run.benchmark.sh @@ -0,0 +1,144 @@ +#!/bin/bash +# Run Eigen benchmarks and collect JSON results with metadata. 
+# +# Expected environment variables: +# EIGEN_CI_BUILDDIR - build directory containing benchmark executables +# EIGEN_BENCH_TARGET - ISA target name (e.g. "x86-64-avx2") +# EIGEN_BENCH_SCOPE - "nightly" (core subset) or "weekly" (all) +# EIGEN_BENCH_REPETITIONS - number of repetitions per benchmark (default: 5) + +set -ex + +rootdir=$(pwd) +builddir=${EIGEN_CI_BUILDDIR:-.bench-build} +results_dir="$(pwd)/${builddir}/results" +mkdir -p "${results_dir}" + +target=${EIGEN_BENCH_TARGET:?EIGEN_BENCH_TARGET must be set} +scope=${EIGEN_BENCH_SCOPE:-nightly} +reps=${EIGEN_BENCH_REPETITIONS:-5} + +# Auto-promote to weekly on Sundays in UTC (`date +%u` reports Sunday as 7) so the full suite runs once a +# week without requiring a separate GitLab schedule. +if [ "${scope}" = "nightly" ] && [ "$(date -u +%u)" = "7" ]; then + echo "Sunday detected, promoting scope from nightly to weekly." + scope="weekly" +fi + +# Runtime ISA check: skip if the runner lacks the required instruction set. +if [[ "${target}" == *"avx512"* ]]; then + if ! grep -q 'avx512dq' /proc/cpuinfo 2>/dev/null; then + echo "WARNING: Runner does not support AVX-512 DQ. Skipping benchmarks." + exit 0 + fi +fi +if [[ "${target}" == *"avx2"* ]]; then + if ! grep -q 'avx2' /proc/cpuinfo 2>/dev/null; then + echo "WARNING: Runner does not support AVX2. Skipping benchmarks." + exit 0 + fi +fi + +cd "${builddir}" + +# Determine which benchmarks to run. +bench_list=() +if [[ "${scope}" == "weekly" ]]; then + while IFS= read -r -d '' exe; do + bench_list+=("${exe}") + done < <(find . -maxdepth 1 -type f -executable -name 'bench_*' -print0 | sort -z) +else + while IFS= read -r name; do + [[ -z "$name" || "$name" == \#* ]] && continue + name=$(echo "$name" | xargs) # trim whitespace + [[ -z "$name" ]] && continue + if [[ -x "./${name}" ]]; then + bench_list+=("./${name}") + else + echo "WARNING: ${name} not found, skipping." 
+ fi + done < "${rootdir}/ci/scripts/benchmark_targets.txt" +fi + +if [[ ${#bench_list[@]} -eq 0 ]]; then + echo "ERROR: No benchmark executables found." + exit 1 +fi + +# Collect system info. +cpu_model=$(grep -m1 'model name' /proc/cpuinfo 2>/dev/null | cut -d: -f2 | xargs || echo "unknown") +timestamp=$(date -u +%Y-%m-%dT%H-%M-%SZ) +commit=${CI_COMMIT_SHORT_SHA:-$(git rev-parse --short HEAD 2>/dev/null || echo "unknown")} +successful_runs=0 + +# Run each benchmark executable. +for bench_exe in "${bench_list[@]}"; do + bench_name=$(basename "${bench_exe}") + outfile="${results_dir}/${bench_name}.json" + + echo "=== Running ${bench_name} (${reps} repetitions) ===" + if ! "${bench_exe}" \ + --benchmark_format=json \ + --benchmark_out="${outfile}" \ + --benchmark_repetitions="${reps}" \ + --benchmark_report_aggregates_only=false \ + 2>&1; then + echo "WARNING: ${bench_name} failed (possibly SIGILL), skipping." + rm -f "${outfile}" + continue + fi + successful_runs=$((successful_runs + 1)) +done + +cd "${rootdir}" + +if [[ ${successful_runs} -eq 0 ]]; then + echo "ERROR: No benchmark executables completed successfully." + exit 1 +fi + +# Wrap each result file with metadata and produce a combined output. 
+python3 - "${results_dir}" "${timestamp}" "${commit}" "${target}" "${cpu_model}" "${scope}" <<'PYEOF' +import json +import glob +import os +import sys + +results_dir = sys.argv[1] +timestamp = sys.argv[2] +commit = sys.argv[3] +target = sys.argv[4] +cpu_model = sys.argv[5] +scope = sys.argv[6] + +metadata = { + "timestamp": timestamp, + "date": timestamp[:10], + "commit": commit, + "target": target, + "cpu_model": cpu_model, + "scope": scope, + "ci_job_id": os.environ.get("CI_JOB_ID", ""), + "ci_pipeline_id": os.environ.get("CI_PIPELINE_ID", ""), + "runner_description": os.environ.get("CI_RUNNER_DESCRIPTION", ""), +} + +combined = {"metadata": metadata, "files": {}} + +for jf in sorted(glob.glob(os.path.join(results_dir, "bench_*.json"))): + name = os.path.splitext(os.path.basename(jf))[0] + with open(jf) as f: + data = json.load(f) + entry = { + "context": data.get("context", {}), + "benchmarks": data.get("benchmarks", []), + } + combined["files"][name] = entry + +outpath = os.path.join(results_dir, f"{timestamp}_{commit}_{target}.json") +with open(outpath, "w") as f: + json.dump(combined, f, indent=2) + +print(f"Combined results written to {outpath}") +print(f" {len(combined['files'])} benchmark files, target={target}") +PYEOF