Add nightly benchmark regression detection pipeline

libeigen/eigen!2349 Co-authored-by: Rasmus Munk Larsen <rmlarsen@gmail.com>
2026-04-10 11:34:33 +08:00 · 2026-03-29 16:03:56 -07:00
parent 753a6ac5b3
commit 409296d91d
7 changed files with 943 additions and 0 deletions
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -15,6 +15,7 @@ stages:
  - checkformat
  - build
  - test
+  - benchmark
  - deploy

 variables:
@@ -35,4 +36,5 @@ include:
  - "/ci/build.windows.gitlab-ci.yml"
  - "/ci/test.linux.gitlab-ci.yml"
  - "/ci/test.windows.gitlab-ci.yml"
+  - "/ci/benchmark.gitlab-ci.yml"
  - "/ci/deploy.gitlab-ci.yml"
--- a/ci/benchmark.gitlab-ci.yml
+++ b/ci/benchmark.gitlab-ci.yml
@@ -0,0 +1,239 @@
+# Benchmark pipeline for performance regression detection.
+#
+# Runs nightly (core subset) or weekly (all benchmarks) on scheduled
+# pipelines, with separate jobs per ISA target.  Results are analyzed
+# using Welch's t-test against the last 30 runs stored on the perf-data
+# branch.
+
+# ============================================================================
+# Variables
+# ============================================================================
+
+variables:
+  EIGEN_BENCH_BUILDDIR: .bench-build
+  EIGEN_BENCH_REPETITIONS: "5"
+  # Scope: "nightly" runs core subset, "weekly" runs all benchmarks.
+  # The run script auto-promotes to "weekly" on Sundays so the full suite
+  # runs once a week without a separate schedule.  Override via web UI or
+  # a dedicated GitLab schedule with EIGEN_BENCH_SCOPE=weekly.
+  EIGEN_BENCH_SCOPE: "nightly"
+
+# ============================================================================
+# Abstract bases
+# ============================================================================
+
+.bench:linux:base:
+  image: ubuntu:22.04
+  variables:
+    EIGEN_CI_BUILDDIR: ${EIGEN_BENCH_BUILDDIR}
+    EIGEN_CI_TARGET_ARCH: ""
+    EIGEN_BENCH_ISA_FLAGS: ""
+    EIGEN_BENCH_TARGET: ""
+  before_script:
+    - . ci/scripts/common.linux.before_script.sh
+  rules:
+    - if: $CI_PIPELINE_SOURCE == "schedule" && $CI_PROJECT_NAMESPACE == "libeigen"
+    - if: $CI_PIPELINE_SOURCE == "web" && $CI_PROJECT_NAMESPACE == "libeigen"
+
+.bench:linux:build:
+  extends: .bench:linux:base
+  stage: benchmark
+  needs: []
+  script:
+    - . ci/scripts/build.benchmark.sh
+  artifacts:
+    when: always
+    name: "$CI_JOB_NAME_SLUG-$CI_COMMIT_REF_SLUG"
+    paths:
+      - ${EIGEN_BENCH_BUILDDIR}/
+    exclude:
+      - ${EIGEN_BENCH_BUILDDIR}/**/*.o
+    expire_in: 2 days
+  tags:
+    - saas-linux-2xlarge-amd64
+
+.bench:linux:run:
+  extends: .bench:linux:base
+  stage: benchmark
+  script:
+    - . ci/scripts/run.benchmark.sh
+  artifacts:
+    when: always
+    name: "$CI_JOB_NAME_SLUG-$CI_COMMIT_REF_SLUG"
+    paths:
+      - ${EIGEN_BENCH_BUILDDIR}/results/
+    expire_in: 30 days
+
+# ============================================================================
+# Build jobs (one per ISA target, all run in parallel)
+# ============================================================================
+
+bench:build:x86-64:sse:
+  extends: .bench:linux:build
+  variables:
+    EIGEN_CI_C_COMPILER: gcc-10
+    EIGEN_CI_CXX_COMPILER: g++-10
+    EIGEN_CI_INSTALL: g++-10
+    EIGEN_CI_TARGET_ARCH: x86_64
+    EIGEN_BENCH_TARGET: x86-64-sse
+
+bench:build:x86-64:avx2:
+  extends: .bench:linux:build
+  variables:
+    EIGEN_CI_C_COMPILER: gcc-10
+    EIGEN_CI_CXX_COMPILER: g++-10
+    EIGEN_CI_INSTALL: g++-10
+    EIGEN_CI_TARGET_ARCH: x86_64
+    EIGEN_BENCH_TARGET: x86-64-avx2
+    EIGEN_BENCH_ISA_FLAGS: "-mavx2 -mfma"
+
+bench:build:x86-64:avx512dq:
+  extends: .bench:linux:build
+  variables:
+    EIGEN_CI_C_COMPILER: gcc-10
+    EIGEN_CI_CXX_COMPILER: g++-10
+    EIGEN_CI_INSTALL: g++-10
+    EIGEN_CI_TARGET_ARCH: x86_64
+    EIGEN_BENCH_TARGET: x86-64-avx512dq
+    EIGEN_BENCH_ISA_FLAGS: "-mavx512dq -mfma"
+
+bench:build:aarch64:neon:
+  extends: .bench:linux:build
+  variables:
+    EIGEN_CI_C_COMPILER: gcc-10
+    EIGEN_CI_CXX_COMPILER: g++-10
+    EIGEN_CI_INSTALL: g++-10
+    EIGEN_CI_TARGET_ARCH: aarch64
+    EIGEN_BENCH_TARGET: aarch64-neon
+    EIGEN_BENCH_ISA_FLAGS: "-march=armv8.2-a+fp16"
+  tags:
+    - saas-linux-large-arm64
+
+# ============================================================================
+# Run jobs (one per ISA target, each depends on its build)
+# ============================================================================
+
+bench:run:x86-64:sse:
+  extends: .bench:linux:run
+  needs: [bench:build:x86-64:sse]
+  variables:
+    EIGEN_BENCH_TARGET: x86-64-sse
+  tags:
+    - saas-linux-2xlarge-amd64
+
+bench:run:x86-64:avx2:
+  extends: .bench:linux:run
+  needs: [bench:build:x86-64:avx2]
+  variables:
+    EIGEN_BENCH_TARGET: x86-64-avx2
+  tags:
+    - saas-linux-2xlarge-amd64
+
+bench:run:x86-64:avx512dq:
+  extends: .bench:linux:run
+  needs: [bench:build:x86-64:avx512dq]
+  variables:
+    EIGEN_BENCH_TARGET: x86-64-avx512dq
+  tags:
+    - saas-linux-2xlarge-amd64
+  allow_failure: true
+
+bench:run:aarch64:neon:
+  extends: .bench:linux:run
+  needs: [bench:build:aarch64:neon]
+  variables:
+    EIGEN_BENCH_TARGET: aarch64-neon
+  tags:
+    - saas-linux-large-arm64
+
+# ============================================================================
+# Analysis: compare against historical data using Welch's t-test
+# ============================================================================
+
+bench:analyze:
+  stage: benchmark
+  image: python:3.11-slim
+  needs:
+    - job: bench:run:x86-64:sse
+      artifacts: true
+    - job: bench:run:x86-64:avx2
+      artifacts: true
+    - job: bench:run:x86-64:avx512dq
+      artifacts: true
+      optional: true
+    - job: bench:run:aarch64:neon
+      artifacts: true
+  variables:
+    EIGEN_CI_BUILDDIR: ${EIGEN_BENCH_BUILDDIR}
+  before_script:
+    - export DEBIAN_FRONTEND=noninteractive
+    - apt-get update -qq && apt-get install -y --no-install-recommends git
+    - pip install --quiet scipy
+  script:
+    - |
+      status=0
+      python3 ci/scripts/detect_regressions.py \
+        --results-dir "${EIGEN_BENCH_BUILDDIR}/results/" \
+        --perf-branch perf-data \
+        --history-count 30 \
+        --significance 0.01 \
+        --min-change-pct 5.0 \
+        --output-report "${EIGEN_BENCH_BUILDDIR}/results/regression_report.txt" || status=$?
+      case "${status}" in
+        0|1) ;;
+        *) exit "${status}" ;;
+      esac
+      printf '%s\n' "${status}" > "${EIGEN_BENCH_BUILDDIR}/results/regression_exit_code.txt"
+  artifacts:
+    when: always
+    paths:
+      - ${EIGEN_BENCH_BUILDDIR}/results/
+    reports:
+      junit: ${EIGEN_BENCH_BUILDDIR}/results/regression_report.xml
+    expire_in: 90 days
+  tags:
+    - saas-linux-small-amd64
+  rules:
+    - if: $CI_PIPELINE_SOURCE == "schedule" && $CI_PROJECT_NAMESPACE == "libeigen"
+    - if: $CI_PIPELINE_SOURCE == "web" && $CI_PROJECT_NAMESPACE == "libeigen"
+
+# ============================================================================
+# Storage and gating
+# ============================================================================
+
+bench:store-results:
+  stage: deploy
+  image: alpine:edge
+  needs:
+    - job: bench:analyze
+      artifacts: true
+  before_script:
+    - apk add --no-cache git python3
+  script:
+    - . ci/scripts/push_perf_data.sh
+  variables:
+    EIGEN_CI_BUILDDIR: ${EIGEN_BENCH_BUILDDIR}
+  tags:
+    - saas-linux-small-amd64
+  rules:
+    - if: $CI_PIPELINE_SOURCE == "schedule" && $CI_PROJECT_NAMESPACE == "libeigen" && $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH
+    - if: $CI_PIPELINE_SOURCE == "web" && $CI_PROJECT_NAMESPACE == "libeigen" && $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH
+
+bench:regression-gate:
+  stage: deploy
+  image: alpine:edge
+  needs:
+    - job: bench:analyze
+      artifacts: true
+    - job: bench:store-results
+      optional: true
+  script:
+    - code=$(cat "${EIGEN_BENCH_BUILDDIR}/results/regression_exit_code.txt")
+    - test "${code}" != "1"
+  variables:
+    EIGEN_CI_BUILDDIR: ${EIGEN_BENCH_BUILDDIR}
+  tags:
+    - saas-linux-small-amd64
+  rules:
+    - if: $CI_PIPELINE_SOURCE == "schedule" && $CI_PROJECT_NAMESPACE == "libeigen"
+    - if: $CI_PIPELINE_SOURCE == "web" && $CI_PROJECT_NAMESPACE == "libeigen"
--- a/ci/scripts/benchmark_targets.txt
+++ b/ci/scripts/benchmark_targets.txt
@@ -0,0 +1,19 @@
+# Nightly core benchmark subset.
+# One executable name per line. Lines starting with # are ignored.
+# These cover the most performance-critical dense linear algebra kernels.
+
+# BLAS-like operations
+bench_gemm
+bench_gemm_double
+bench_gemv
+bench_dot
+bench_vecadd
+bench_trsm
+bench_reductions
+
+# Decompositions
+bench_cholesky
+bench_cholesky_double
+bench_qr
+bench_svd
+bench_householder
--- a/ci/scripts/build.benchmark.sh
+++ b/ci/scripts/build.benchmark.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+# Build Eigen benchmarks for a given ISA target.
+#
+# Expected environment variables:
+#   EIGEN_CI_BUILDDIR         - build directory (default: .bench-build)
+#   EIGEN_CI_CXX_COMPILER     - C++ compiler
+#   EIGEN_CI_C_COMPILER       - C compiler
+#   EIGEN_BENCH_ISA_FLAGS     - ISA-specific compiler flags (e.g. "-mavx2 -mfma")
+
+set -ex
+
+rootdir=$(pwd)
+builddir=${EIGEN_CI_BUILDDIR:-.bench-build}
+mkdir -p "${builddir}"
+cd "${builddir}"
+
+# Install Google Benchmark from source if not already present.
+# The common before_script already installs cmake/ninja; we only need
+# git and ca-certificates for the clone.
+if ! pkg-config --exists benchmark 2>/dev/null; then
+  apt-get update -qq
+  apt-get install -y --no-install-recommends git ca-certificates
+  git clone --depth 1 --branch v1.9.1 https://github.com/google/benchmark.git /tmp/gbench
+  cmake -G Ninja -S /tmp/gbench -B /tmp/gbench-build \
+    -DCMAKE_BUILD_TYPE=Release \
+    -DBENCHMARK_ENABLE_TESTING=OFF \
+    -DCMAKE_INSTALL_PREFIX=/usr/local
+  cmake --build /tmp/gbench-build --target install
+  rm -rf /tmp/gbench /tmp/gbench-build
+fi
+
+# Configure benchmarks.  ISA flags are passed via CMAKE_CXX_FLAGS so they
+# apply globally to all benchmark targets.
+cmake -G Ninja \
+  -DCMAKE_CXX_COMPILER="${EIGEN_CI_CXX_COMPILER}" \
+  -DCMAKE_C_COMPILER="${EIGEN_CI_C_COMPILER}" \
+  -DCMAKE_CXX_FLAGS="${EIGEN_BENCH_ISA_FLAGS}" \
+  -DCMAKE_BUILD_TYPE=Release \
+  "${rootdir}/benchmarks"
+
+# Build all benchmark targets.  The nightly/weekly scope filtering happens
+# at run time, not build time.
+cmake --build . -- -k0 || cmake --build . -- -k0 -j1
+
+cd "${rootdir}"
--- a/ci/scripts/detect_regressions.py
+++ b/ci/scripts/detect_regressions.py
@@ -0,0 +1,394 @@
+#!/usr/bin/env python3
+"""Benchmark regression detection using Welch's t-test.
+
+Compares the current benchmark run against historical data stored on
+the perf-data git branch.  A regression is flagged when:
+
+  1. Welch's t-test p-value < significance threshold (default 0.01)
+  2. The relative change exceeds a minimum percentage (default 5%)
+  3. The direction is a slowdown (higher real_time)
+
+Exit codes:
+  0  no regressions
+  1  regressions detected
+  2  error
+"""
+
+import argparse
+import glob
+import json
+import os
+import subprocess
+import sys
+import xml.etree.ElementTree as ET
+from collections import defaultdict, namedtuple
+
+# scipy is the only external dependency (pip-installed in the CI job).
+from scipy.stats import ttest_ind
+
+Regression = namedtuple(
+    "Regression",
+    ["target", "key", "current_mean", "historical_mean", "change_pct", "p_value"],
+)
+
+
+def parse_args():
+    p = argparse.ArgumentParser(description=__doc__)
+    p.add_argument(
+        "--results-dir",
+        required=True,
+        help="Directory containing current run JSON files.",
+    )
+    p.add_argument(
+        "--perf-branch",
+        default="perf-data",
+        help="Git branch storing historical benchmark data.",
+    )
+    p.add_argument(
+        "--history-count",
+        type=int,
+        default=14,
+        help="Number of past runs to compare against.",
+    )
+    p.add_argument(
+        "--significance",
+        type=float,
+        default=0.01,
+        help="P-value threshold for Welch's t-test.",
+    )
+    p.add_argument(
+        "--min-change-pct",
+        type=float,
+        default=5.0,
+        help="Minimum percentage change to flag.",
+    )
+    p.add_argument(
+        "--output-report",
+        default="regression_report.txt",
+        help="Path for text report.",
+    )
+    return p.parse_args()
+
+
+def clone_perf_branch(branch, clone_dir):
+    """Shallow-clone the perf-data branch.  Returns True on success."""
+    # Construct clone URL from CI environment or fall back to current remote.
+    url = os.environ.get("CI_REPOSITORY_URL", "")
+    if not url:
+        try:
+            url = subprocess.check_output(
+                ["git", "remote", "get-url", "origin"], text=True
+            ).strip()
+        except Exception:
+            return False
+
+    try:
+        subprocess.check_call(
+            [
+                "git",
+                "clone",
+                "--depth=1",
+                "--single-branch",
+                "--branch",
+                branch,
+                url,
+                clone_dir,
+            ],
+            stdout=subprocess.DEVNULL,
+            stderr=subprocess.DEVNULL,
+        )
+        return True
+    except subprocess.CalledProcessError:
+        return False
+
+
+def _history_sort_key(fpath):
+    """Sort key for historical result files.
+
+    Prefer the recorded UTC timestamp in the JSON metadata. Fall back to the
+    filename so older date-only files still participate in the history window.
+    """
+    try:
+        with open(fpath) as f:
+            metadata = json.load(f).get("metadata", {})
+    except Exception:
+        metadata = {}
+    return metadata.get("timestamp") or metadata.get("date") or os.path.basename(fpath)
+
+
+def load_historical_data(perf_dir, target, history_count):
+    """Load per-repetition real_time values from the last *history_count* runs.
+
+    Returns dict: benchmark_key -> list of raw real_time values (multiple per run).
+
+    We load the same non-aggregate rows that load_current_results uses so both
+    sides of the t-test contain the same kind of measurement (individual
+    repetitions), avoiding a unit mismatch between per-rep and per-run means.
+    """
+    target_dir = os.path.join(perf_dir, target)
+    if not os.path.isdir(target_dir):
+        return {}
+
+    files = sorted(
+        glob.glob(os.path.join(target_dir, "*.json")),
+        key=_history_sort_key,
+        reverse=True,
+    )
+    files = files[:history_count]
+
+    history = defaultdict(list)
+    for fpath in files:
+        with open(fpath) as f:
+            data = json.load(f)
+        for exe_name, exe_data in data.get("files", {}).items():
+            for bm in exe_data.get("benchmarks", []):
+                run_type = bm.get("run_type", "")
+                if run_type == "aggregate":
+                    continue
+                name = bm.get("name", "")
+                key = f"{exe_name}/{name}"
+                rt = bm.get("real_time")
+                if rt is not None:
+                    history[key].append(rt)
+    return history
+
+
+def load_current_results(results_dir):
+    """Load current run results, keyed by target.
+
+    Returns dict: target -> dict(benchmark_key -> list of per-repetition real_time).
+    """
+    data = defaultdict(lambda: defaultdict(list))
+
+    for jf in sorted(glob.glob(os.path.join(results_dir, "*_*_*.json"))):
+        with open(jf) as f:
+            run = json.load(f)
+        meta = run.get("metadata", {})
+        target = meta.get("target", "unknown")
+
+        for exe_name, exe_data in run.get("files", {}).items():
+            for bm in exe_data.get("benchmarks", []):
+                name = bm.get("name", "")
+                run_type = bm.get("run_type", "")
+                # Use individual iteration rows (not aggregates) for the
+                # current run so we have per-repetition samples.
+                if run_type == "aggregate":
+                    continue
+                key = f"{exe_name}/{name}"
+                rt = bm.get("real_time")
+                if rt is not None:
+                    data[target][key].append(rt)
+
+    return data
+
+
+def find_regressions(current, historical, significance, min_change_pct):
+    """Compare current vs historical using Welch's t-test.
+
+    Returns (regressions, improvements, skipped_count).
+    """
+    regressions = []
+    improvements = []
+    skipped = 0
+
+    for key, current_values in sorted(current.items()):
+        hist_values = historical.get(key)
+        if not hist_values or len(hist_values) < 5:
+            skipped += 1
+            continue
+        if len(current_values) < 3:
+            skipped += 1
+            continue
+
+        cur_mean = sum(current_values) / len(current_values)
+        hist_mean = sum(hist_values) / len(hist_values)
+
+        if hist_mean == 0:
+            skipped += 1
+            continue
+
+        change_pct = (cur_mean - hist_mean) / hist_mean * 100.0
+
+        _, p_value = ttest_ind(current_values, hist_values, equal_var=False)
+
+        entry = Regression(
+            target="",  # filled in by caller
+            key=key,
+            current_mean=cur_mean,
+            historical_mean=hist_mean,
+            change_pct=change_pct,
+            p_value=p_value,
+        )
+
+        if p_value < significance and abs(change_pct) > min_change_pct:
+            if change_pct > 0:
+                # Higher real_time = slower = regression.
+                regressions.append(entry)
+            else:
+                improvements.append(entry)
+
+    return regressions, improvements, skipped
+
+
+def _qualified_key(r):
+    """Target-qualified display key, e.g. '[x86-64-avx2] bench_gemm/BM_Gemm/256'."""
+    return f"[{r.target}] {r.key}"
+
+
+def write_text_report(regressions, improvements, skipped, total, path):
+    """Write a human-readable summary."""
+    with open(path, "w") as f:
+        f.write("# Benchmark Regression Report\n\n")
+
+        if regressions:
+            f.write(f"## Regressions ({len(regressions)})\n\n")
+            f.write(
+                f"{'Benchmark':<70s} {'Historical':>12s} {'Current':>12s} "
+                f"{'Change':>8s} {'p-value':>8s}\n"
+            )
+            f.write("-" * 114 + "\n")
+            for r in sorted(regressions, key=lambda x: -x.change_pct):
+                f.write(
+                    f"{_qualified_key(r):<70s} {r.historical_mean:>12.1f} {r.current_mean:>12.1f} "
+                    f"{r.change_pct:>+7.1f}% {r.p_value:>8.4f}\n"
+                )
+            f.write("\n")
+
+        if improvements:
+            f.write(f"## Improvements ({len(improvements)})\n\n")
+            f.write(
+                f"{'Benchmark':<70s} {'Historical':>12s} {'Current':>12s} "
+                f"{'Change':>8s} {'p-value':>8s}\n"
+            )
+            f.write("-" * 114 + "\n")
+            for r in sorted(improvements, key=lambda x: x.change_pct):
+                f.write(
+                    f"{_qualified_key(r):<70s} {r.historical_mean:>12.1f} {r.current_mean:>12.1f} "
+                    f"{r.change_pct:>+7.1f}% {r.p_value:>8.4f}\n"
+                )
+            f.write("\n")
+
+        f.write(f"## Summary\n\n")
+        f.write(f"- Benchmarks analyzed: {total}\n")
+        f.write(f"- Regressions: {len(regressions)}\n")
+        f.write(f"- Improvements: {len(improvements)}\n")
+        f.write(f"- Skipped (insufficient data): {skipped}\n")
+
+
+def write_junit_report(regressions, analyzed_keys, path):
+    """Write JUnit XML so GitLab displays results in the test report tab.
+
+    Keys in *analyzed_keys* and regression entries are target-qualified
+    (e.g. "[x86-64-avx2] bench_gemm/BM_Gemm/256") so the same benchmark
+    on different ISA targets appears as separate test cases.
+    """
+    suite = ET.Element(
+        "testsuite",
+        name="benchmark-regressions",
+        tests=str(len(analyzed_keys)),
+        failures=str(len(regressions)),
+    )
+
+    regression_by_qkey = {_qualified_key(r): r for r in regressions}
+    for key in sorted(analyzed_keys):
+        tc = ET.SubElement(suite, "testcase", name=key, classname="benchmark")
+        r = regression_by_qkey.get(key)
+        if r is not None:
+            ET.SubElement(
+                tc,
+                "failure",
+                message=f"{r.change_pct:+.1f}% regression (p={r.p_value:.4f})",
+            ).text = (
+                f"historical_mean={r.historical_mean:.1f} "
+                f"current_mean={r.current_mean:.1f} "
+                f"change={r.change_pct:+.1f}% p={r.p_value:.6f}"
+            )
+
+    tree = ET.ElementTree(suite)
+    ET.indent(tree)
+    tree.write(path, xml_declaration=True, encoding="utf-8")
+
+
+def main():
+    args = parse_args()
+    results_dir = args.results_dir
+
+    # Load current results (keyed by target).
+    current_by_target = load_current_results(results_dir)
+    if not current_by_target:
+        print("No current benchmark results found.")
+        sys.exit(2)
+
+    total_benchmarks = sum(len(v) for v in current_by_target.values())
+    print(f"Loaded {total_benchmarks} benchmarks from current run.")
+    print(f"Targets: {', '.join(sorted(current_by_target.keys()))}")
+
+    # Clone historical data.
+    perf_dir = "/tmp/perf-data-history"
+    has_history = clone_perf_branch(args.perf_branch, perf_dir)
+
+    if not has_history:
+        print("No historical data found (perf-data branch missing).")
+        print("This is expected on the first run. Storing baseline only.")
+        sys.exit(0)
+
+    # Run analysis per target.
+    all_regressions = []
+    all_improvements = []
+    total_analyzed = 0
+    total_skipped = 0
+    all_keys = set()
+
+    for target in sorted(current_by_target.keys()):
+        target_current = current_by_target[target]
+        historical = load_historical_data(perf_dir, target, args.history_count)
+        if not historical:
+            print(f"  {target}: no historical data, skipping analysis.")
+            continue
+
+        regs, imps, skipped = find_regressions(
+            target_current, historical, args.significance, args.min_change_pct
+        )
+
+        # Tag regressions with the target.
+        regs = [r._replace(target=target) for r in regs]
+        imps = [r._replace(target=target) for r in imps]
+
+        all_regressions.extend(regs)
+        all_improvements.extend(imps)
+        total_analyzed += len(target_current) - skipped
+        total_skipped += skipped
+        # Use target-qualified keys so the same benchmark on different ISAs
+        # shows up as separate entries in reports.
+        all_keys.update(f"[{target}] {k}" for k in target_current)
+
+        print(
+            f"  {target}: {len(regs)} regressions, "
+            f"{len(imps)} improvements, {skipped} skipped"
+        )
+
+    # Write reports.
+    report_path = args.output_report
+    write_text_report(
+        all_regressions, all_improvements, total_skipped, total_analyzed, report_path
+    )
+    print(f"\nText report: {report_path}")
+
+    junit_path = report_path.replace(".txt", ".xml")
+    write_junit_report(all_regressions, all_keys, junit_path)
+    print(f"JUnit report: {junit_path}")
+
+    # Print summary and exit.
+    if all_regressions:
+        print(f"\nREGRESSIONS DETECTED: {len(all_regressions)} benchmark(s)")
+        for r in all_regressions:
+            print(f"  [{r.target}] {r.key}: {r.change_pct:+.1f}% (p={r.p_value:.4f})")
+        sys.exit(1)
+    else:
+        n_imp = len(all_improvements)
+        print(f"\nNo regressions detected. {n_imp} improvement(s) found.")
+        sys.exit(0)
+
+
+if __name__ == "__main__":
+    main()
--- a/ci/scripts/push_perf_data.sh
+++ b/ci/scripts/push_perf_data.sh
@@ -0,0 +1,100 @@
+#!/bin/sh
+# Push benchmark results to the perf-data orphan branch.
+# Written for Alpine's busybox ash; relies on busybox extensions (date -d @N, grep -o).
+#
+# Expected environment variables:
+#   EIGEN_CI_BUILDDIR       - build directory containing results/
+#   EIGEN_CI_GIT_PUSH_URL   - authenticated git push URL
+#   CI_COMMIT_SHORT_SHA     - short commit hash
+
+set -ex
+
+results_dir="$(pwd)/${EIGEN_CI_BUILDDIR:-.bench-build}/results"
+perf_branch="perf-data"
+clone_dir="/tmp/perf-data-push"
+push_url="${EIGEN_CI_GIT_PUSH_URL:?EIGEN_CI_GIT_PUSH_URL must be set}"
+
+rm -rf "${clone_dir}"
+
+# Clone perf-data branch, or create orphan if it doesn't exist.
+if git clone --depth=1 --single-branch --branch "${perf_branch}" \
+     "${push_url}" "${clone_dir}" 2>/dev/null; then
+  echo "Cloned existing ${perf_branch} branch."
+else
+  echo "${perf_branch} branch does not exist, creating orphan..."
+  mkdir -p "${clone_dir}"
+  cd "${clone_dir}"
+  git init
+  git checkout --orphan "${perf_branch}"
+  cat > README.md <<'EOF'
+# Benchmark Performance Data
+
+This branch stores nightly/weekly benchmark results as JSON files.
+It is maintained automatically by the CI benchmark pipeline.
+
+## Structure
+
+    <target>/
+      <date>_<commit>_<target>.json
+
+## Analysis
+
+See `ci/scripts/detect_regressions.py` on the main branch for the
+regression detection script that consumes this data.
+EOF
+  git add README.md
+  git -c user.name="CI Bot" -c user.email="ci@eigen.tuxfamily.org" \
+    commit -m "Initialize perf-data branch"
+  git remote add origin "${push_url}"
+  cd -
+fi
+
+cd "${clone_dir}"
+
+# Copy combined result files into target subdirectories.
+# Only match canonical combined formats:
+#   YYYY-MM-DDTHH-MM-SSZ_<hex>_<target>.json
+#   YYYY-MM-DD_<hex>_<target>.json
+# This avoids picking up raw per-benchmark files like bench_gemm_double.json.
+copied=0
+for combined_json in "${results_dir}"/*.json; do
+  [ -f "${combined_json}" ] || continue
+  filename=$(basename "${combined_json}")
+  # Must start with a UTC timestamp or date, followed by a hex commit hash.
+  case "${filename}" in
+    [0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]T[0-9][0-9]-[0-9][0-9]-[0-9][0-9]Z_[0-9a-f]*_*.json) ;;
+    [0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]_[0-9a-f]*_*.json) ;;
+    *) continue ;;
+  esac
+  # Extract target: strip timestamp/date + commit prefix and .json suffix.
+  target=$(echo "${filename}" | sed 's/^[^_]*_[a-f0-9]*_//' | sed 's/\.json$//')
+  mkdir -p "${target}"
+  cp "${combined_json}" "${target}/${filename}"
+  copied=$((copied + 1))
+done
+
+if [ "${copied}" -eq 0 ]; then
+  echo "No result files to store."
+  exit 0
+fi
+
+# Prune data older than 90 days to keep the branch manageable.
+# We parse the date from the filename since clone mtime is always "now".
+cutoff=$(date -u -d "@$(($(date +%s) - 90*86400))" +%Y-%m-%d)
+find . -name '*.json' -path './*/*.json' | while IFS= read -r f; do
+  file_date=$(basename "$f" | grep -oE '^[0-9]{4}-[0-9]{2}-[0-9]{2}')
+  if [ -n "$file_date" ] && [ "$file_date" \< "$cutoff" ]; then
+    rm -f "$f"
+  fi
+done
+
+# Commit and push.
+git add -A
+git -c user.name="CI Bot" -c user.email="ci@eigen.tuxfamily.org" \
+  commit -m "Add benchmark results for $(date -u +%Y-%m-%d) (${CI_COMMIT_SHORT_SHA:-unknown})" || {
+  echo "No changes to commit."
+  exit 0
+}
+git push origin "${perf_branch}"
+
+echo "Pushed ${copied} result file(s) to ${perf_branch} branch."
--- a/ci/scripts/run.benchmark.sh
+++ b/ci/scripts/run.benchmark.sh
@@ -0,0 +1,144 @@
+#!/bin/bash
+# Run Eigen benchmarks and collect JSON results with metadata.
+#
+# Expected environment variables:
+#   EIGEN_CI_BUILDDIR          - build directory containing benchmark executables
+#   EIGEN_BENCH_TARGET         - ISA target name (e.g. "x86-64-avx2")
+#   EIGEN_BENCH_SCOPE          - "nightly" (core subset) or "weekly" (all)
+#   EIGEN_BENCH_REPETITIONS    - number of repetitions per benchmark (default: 5)
+
+set -ex
+
+rootdir=$(pwd)
+builddir=${EIGEN_CI_BUILDDIR:-.bench-build}
+results_dir="$(pwd)/${builddir}/results"
+mkdir -p "${results_dir}"
+
+target=${EIGEN_BENCH_TARGET:?EIGEN_BENCH_TARGET must be set}
+scope=${EIGEN_BENCH_SCOPE:-nightly}
+reps=${EIGEN_BENCH_REPETITIONS:-5}
+
+# Auto-promote to weekly on Sundays (ISO weekday 7 per `date +%u`, in UTC) so
+# the full suite runs once a week without a separate GitLab schedule.
+if [ "${scope}" = "nightly" ] && [ "$(date -u +%u)" = "7" ]; then
+  echo "Sunday detected, promoting scope from nightly to weekly."
+  scope="weekly"
+fi
+
+# Runtime ISA check: skip if the runner lacks the required instruction set.
+if [[ "${target}" == *"avx512"* ]]; then
+  if ! grep -q 'avx512dq' /proc/cpuinfo 2>/dev/null; then
+    echo "WARNING: Runner does not support AVX-512 DQ. Skipping benchmarks."
+    exit 0
+  fi
+fi
+if [[ "${target}" == *"avx2"* ]]; then
+  if ! grep -q 'avx2' /proc/cpuinfo 2>/dev/null; then
+    echo "WARNING: Runner does not support AVX2. Skipping benchmarks."
+    exit 0
+  fi
+fi
+
+cd "${builddir}"
+
+# Determine which benchmarks to run.
+bench_list=()
+if [[ "${scope}" == "weekly" ]]; then
+  while IFS= read -r -d '' exe; do
+    bench_list+=("${exe}")
+  done < <(find . -maxdepth 1 -type f -executable -name 'bench_*' -print0 | sort -z)
+else
+  while IFS= read -r name; do
+    [[ -z "$name" || "$name" == \#* ]] && continue
+    name=$(echo "$name" | xargs)  # trim whitespace
+    [[ -z "$name" ]] && continue
+    if [[ -x "./${name}" ]]; then
+      bench_list+=("./${name}")
+    else
+      echo "WARNING: ${name} not found, skipping."
+    fi
+  done < "${rootdir}/ci/scripts/benchmark_targets.txt"
+fi
+
+if [[ ${#bench_list[@]} -eq 0 ]]; then
+  echo "ERROR: No benchmark executables found."
+  exit 1
+fi
+
+# Collect system info.  "grep ." fails on empty output so the fallback fires.
+cpu_model=$(grep -m1 'model name' /proc/cpuinfo 2>/dev/null | cut -d: -f2 | xargs | grep . || echo "unknown")
+timestamp=$(date -u +%Y-%m-%dT%H-%M-%SZ)
+commit=${CI_COMMIT_SHORT_SHA:-$(git rev-parse --short HEAD 2>/dev/null || echo "unknown")}
+successful_runs=0
+
+# Run each benchmark executable.
+for bench_exe in "${bench_list[@]}"; do
+  bench_name=$(basename "${bench_exe}")
+  outfile="${results_dir}/${bench_name}.json"
+
+  echo "=== Running ${bench_name} (${reps} repetitions) ==="
+  if ! "${bench_exe}" \
+    --benchmark_format=json \
+    --benchmark_out="${outfile}" \
+    --benchmark_repetitions="${reps}" \
+    --benchmark_report_aggregates_only=false \
+    2>&1; then
+    echo "WARNING: ${bench_name} failed (possibly SIGILL), skipping."
+    rm -f "${outfile}"
+    continue
+  fi
+  successful_runs=$((successful_runs + 1))
+done
+
+cd "${rootdir}"
+
+if [[ ${successful_runs} -eq 0 ]]; then
+  echo "ERROR: No benchmark executables completed successfully."
+  exit 1
+fi
+
+# Wrap each result file with metadata and produce a combined output.
+python3 - "${results_dir}" "${timestamp}" "${commit}" "${target}" "${cpu_model}" "${scope}" <<'PYEOF'
+import json
+import glob
+import os
+import sys
+
+results_dir = sys.argv[1]
+timestamp   = sys.argv[2]
+commit      = sys.argv[3]
+target      = sys.argv[4]
+cpu_model   = sys.argv[5]
+scope       = sys.argv[6]
+
+metadata = {
+    "timestamp": timestamp,
+    "date": timestamp[:10],
+    "commit": commit,
+    "target": target,
+    "cpu_model": cpu_model,
+    "scope": scope,
+    "ci_job_id": os.environ.get("CI_JOB_ID", ""),
+    "ci_pipeline_id": os.environ.get("CI_PIPELINE_ID", ""),
+    "runner_description": os.environ.get("CI_RUNNER_DESCRIPTION", ""),
+}
+
+combined = {"metadata": metadata, "files": {}}
+
+for jf in sorted(glob.glob(os.path.join(results_dir, "bench_*.json"))):
+    name = os.path.splitext(os.path.basename(jf))[0]
+    with open(jf) as f:
+        data = json.load(f)
+    entry = {
+        "context": data.get("context", {}),
+        "benchmarks": data.get("benchmarks", []),
+    }
+    combined["files"][name] = entry
+
+outpath = os.path.join(results_dir, f"{timestamp}_{commit}_{target}.json")
+with open(outpath, "w") as f:
+    json.dump(combined, f, indent=2)
+
+print(f"Combined results written to {outpath}")
+print(f"  {len(combined['files'])} benchmark files, target={target}")
+PYEOF