CI: split NVHPC build and make fallback parallelism configurable

libeigen/eigen!2372

Co-authored-by: Rasmus Munk Larsen <rmlarsen@gmail.com>
This commit is contained in:
Rasmus Munk Larsen
2026-04-01 16:43:33 -07:00
parent 4be66f2830
commit 6a07970d7d
5 changed files with 34 additions and 8 deletions

View File

@@ -147,7 +147,9 @@ build:linux:cross:x86-64:clang-19:default:asan-ubsan:unsupported:
# NVHPC (nvc++) uses NVIDIA's HPC SDK container image with the compilers
# pre-installed. We override EIGEN_CI_INSTALL to avoid trying to apt-get
# install the compiler.
build:linux:x86-64:nvhpc-26.1:default:
# Split into official/unsupported because nvc++ is so slow that the full
# build exceeds the 3-hour GitLab SaaS shared-runner timeout.
.build:linux:x86-64:nvhpc-26.1:
extends: .build:linux:cross:x86-64
image: nvcr.io/nvidia/nvhpc:26.1-devel-cuda13.1-ubuntu24.04
variables:
@@ -160,6 +162,17 @@ build:linux:x86-64:nvhpc-26.1:default:
# The shuffled-batch build strategy spreads memory-hungry targets across
# batches, preventing simultaneous OOM-prone compilations.
EIGEN_CI_BUILD_JOBS: "16"
EIGEN_CI_FALLBACK_JOBS: "4"
# NVHPC build job for the official half of the split: reuses the hidden
# .build:linux:x86-64:nvhpc-26.1 template and only overrides the build target.
build:linux:x86-64:nvhpc-26.1:default:official:
extends: .build:linux:x86-64:nvhpc-26.1
variables:
EIGEN_CI_BUILD_TARGET: BuildOfficial
# NVHPC build job for the unsupported half of the split: reuses the hidden
# .build:linux:x86-64:nvhpc-26.1 template and only overrides the build target.
build:linux:x86-64:nvhpc-26.1:default:unsupported:
extends: .build:linux:x86-64:nvhpc-26.1
variables:
EIGEN_CI_BUILD_TARGET: BuildUnsupported
######## CUDA ##################################################################

View File

@@ -21,8 +21,7 @@ fi
# Builds (particularly gcc) sometimes get killed, potentially when running
# out of resources. In that case, keep trying to build the remaining
# targets (k0), then try to build again with a single thread (j1) to minimize
# resource use.
# targets (k0), then retry with reduced parallelism to minimize resource use.
# EIGEN_CI_BUILD_JOBS can be set to limit parallelism for memory-hungry
# compilers (e.g. NVHPC).
jobs=""
@@ -30,6 +29,9 @@ if [[ -n "${EIGEN_CI_BUILD_JOBS}" ]]; then
jobs="-j${EIGEN_CI_BUILD_JOBS}"
fi
# Fallback parallelism for retry builds after a failure (default: 2).
fallback_jobs="-j${EIGEN_CI_FALLBACK_JOBS:-2}"
# For phony meta-targets (e.g. buildtests), shuffle the dependency list and
# build in batches so that memory-hungry compilations (like bdcsvd with
# nvc++) are spread out instead of all running at once. Ninja ignores the
@@ -45,6 +47,16 @@ if [[ -n "${EIGEN_CI_BUILD_TARGET}" ]] && command -v ninja >/dev/null 2>&1; then
{ set +x; } 2>/dev/null
deps=$(ninja -t query "${EIGEN_CI_BUILD_TARGET}" 2>/dev/null \
| awk '/^ input:/{found=1; next} /^ outputs:/{found=0} found && /^ /{print $1}')
# Some CMake custom targets (BuildOfficial, for instance) depend only on an
# intermediate phony such as test/BuildOfficial, and that phony is what
# carries the real dependency list. When the first query produced exactly
# one dep, query that dep as well and prefer its inputs if it has any.
dep_count=$(echo "$deps" | wc -l)
if [[ -n "$deps" && "$dep_count" -eq 1 ]]; then
  nested=$(ninja -t query "$deps" 2>/dev/null \
    | awk '/^ input:/{found=1; next} /^ outputs:/{found=0} found && /^ /{print $1}')
  # An empty nested result means there is nothing to expand: keep the single dep.
  [[ -z "$nested" ]] || deps="$nested"
fi
# Deterministic shuffle: hash each target name and sort by hash.
# Stable across runs (helps ninja's .ninja_log and build caches),
# portable (no shuf dependency), and spreads same-family targets apart.
@@ -72,7 +84,7 @@ if [[ -n "${EIGEN_CI_BUILD_TARGET}" ]] && command -v ninja >/dev/null 2>&1; then
while IFS= read -r batch; do
batch_num=$((batch_num + 1))
echo "=== Batch ${batch_num} ==="
ninja -k0 ${jobs} ${batch} || ninja -k0 -j1 ${batch} || build_failed=true
ninja -k0 ${jobs} ${batch} || ninja -k0 ${fallback_jobs} ${batch} || build_failed=true
done < <(echo "$shuffled_deps" | xargs -n "${batch_size}")
if [[ "$build_failed" == "true" ]]; then
echo "Some batches failed."
@@ -83,7 +95,7 @@ if [[ -n "${EIGEN_CI_BUILD_TARGET}" ]] && command -v ninja >/dev/null 2>&1; then
fi
if [[ "$shuffled" != "true" ]]; then
cmake --build . ${target} -- -k0 ${jobs} || cmake --build . ${target} -- -k0 -j1
cmake --build . ${target} -- -k0 ${jobs} || cmake --build . ${target} -- -k0 ${fallback_jobs}
fi
# Return to root directory.

View File

@@ -22,7 +22,7 @@ set +x
EIGEN_CI_CTEST_PARALLEL=${EIGEN_CI_CTEST_PARALLEL:-${NPROC}}
EIGEN_CI_CTEST_REPEAT=${EIGEN_CI_CTEST_REPEAT:-3}
ctest_cmd="ctest ${EIGEN_CI_CTEST_ARGS} --parallel ${EIGEN_CI_CTEST_PARALLEL} --output-on-failure --no-compress-output --no-tests=error --build-noclean ${target} ${exclude}"
ctest_cmd="ctest ${EIGEN_CI_CTEST_ARGS} --parallel ${EIGEN_CI_CTEST_PARALLEL} --output-on-failure --no-compress-output --build-noclean ${target} ${exclude}"
echo "Running initial tests..."
if ${ctest_cmd} -T test; then

View File

@@ -13,7 +13,7 @@ if (${EIGEN_CI_CTEST_REGEX}) {
$target = "-L","${EIGEN_CI_CTEST_LABEL}"
}
$ctest_cmd = { ctest ${EIGEN_CI_CTEST_ARGS} --parallel ${NPROC} --output-on-failure --no-compress-output --no-tests=error --build-noclean ${target} }
$ctest_cmd = { ctest ${EIGEN_CI_CTEST_ARGS} --parallel ${NPROC} --output-on-failure --no-compress-output --build-noclean ${target} }
Write-Host "Running initial tests..."

View File

@@ -215,17 +215,18 @@ test:linux:x86-64:clang-19:default:asan-ubsan:unsupported:
.test:linux:x86-64:nvhpc-26.1:default:
extends: .test:linux:x86-64
image: nvcr.io/nvidia/nvhpc:26.1-devel-cuda13.1-ubuntu24.04
needs: [ build:linux:x86-64:nvhpc-26.1:default ]
variables:
EIGEN_CI_INSTALL: ""
# NVHPC test job paired with the official build job named in `needs`.
# EIGEN_CI_CTEST_LABEL is set to Official; presumably the test script uses it
# to select tests by ctest label — confirm against the test script.
test:linux:x86-64:nvhpc-26.1:default:official:
extends: .test:linux:x86-64:nvhpc-26.1:default
needs: [ build:linux:x86-64:nvhpc-26.1:default:official ]
variables:
EIGEN_CI_CTEST_LABEL: Official
# NVHPC test job paired with the unsupported build job named in `needs`.
# EIGEN_CI_CTEST_LABEL is set to Unsupported; presumably the test script uses
# it to select tests by ctest label — confirm against the test script.
test:linux:x86-64:nvhpc-26.1:default:unsupported:
extends: .test:linux:x86-64:nvhpc-26.1:default
needs: [ build:linux:x86-64:nvhpc-26.1:default:unsupported ]
variables:
EIGEN_CI_CTEST_LABEL: Unsupported