From 8368a12f0fb49bc91cdbbffd3394486e542e85c3 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen <4643818-rmlarsen1@users.noreply.gitlab.com> Date: Wed, 11 Mar 2026 22:36:33 -0700 Subject: [PATCH] Add runtime cache size detection for ARM and improve GEMM blocking libeigen/eigen!2282 Co-authored-by: Rasmus Munk Larsen --- .gitignore | 1 + .../Core/products/GeneralBlockPanelKernel.h | 9 +++-- Eigen/src/Core/util/Memory.h | 35 +++++++++++++++++++ benchmarks/Core/bench_gemm.cpp | 2 +- 4 files changed, 44 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index 369ae25b6..7012c2acf 100644 --- a/.gitignore +++ b/.gitignore @@ -39,3 +39,4 @@ Makefile !scripts/buildtests.in !Eigen/Core !Eigen/src/Core +CLAUDE.md diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 9645958f0..cc22dc75a 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -57,6 +57,10 @@ const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(8 * 10 const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(512 * 1024); const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(4 * 1024 * 1024); #endif +#elif EIGEN_ARCH_ARM_OR_ARM64 +const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(64 * 1024); +const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(1024 * 1024); +const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(4 * 1024 * 1024); #else const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(16 * 1024); const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(512 * 1024); @@ -260,8 +264,9 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n // L1 blocking max_nc = remaining_l1 / (k * sizeof(RhsScalar)); } else { - // L2 blocking - max_nc = (3 * actual_l2) / (2 * 2 * max_kc * 
sizeof(RhsScalar)); + // L2 blocking: use actual kc (k) rather than max_kc so that nc is not + // unnecessarily squeezed when k < max_kc (e.g. on CPUs with large L1). + max_nc = (3 * actual_l2) / (2 * 2 * k * sizeof(RhsScalar)); } // WARNING Below, we assume that Traits::nr is a power of two. Index nc = numext::mini(actual_l2 / (2 * k * sizeof(RhsScalar)), max_nc) & (~(Traits::nr - 1)); diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index 95712ff41..3d279014a 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -977,6 +977,10 @@ class aligned_allocator { //---------- Cache sizes ---------- +#if EIGEN_OS_MAC +#include <sys/sysctl.h> +#endif + #if !defined(EIGEN_NO_CPUID) #if EIGEN_COMP_GNUC && EIGEN_ARCH_i386_OR_x86_64 #if defined(__PIC__) && EIGEN_ARCH_i386 @@ -1305,6 +1309,37 @@ inline void queryCacheSizes(int& l1, int& l2, int& l3) { // ||cpuid_is_vendor(abcd,"SiS SiS SiS ") // ||cpuid_is_vendor(abcd,"UMC UMC UMC ") // ||cpuid_is_vendor(abcd,"NexGenDriven") +#elif EIGEN_OS_MAC + // On macOS (including Apple Silicon), use sysctlbyname to query cache sizes. + // The sysctl values are 64-bit, so read into int64_t and convert. + // For L1, prefer P-core (perflevel0) size since compute-heavy work like GEMM + // is typically scheduled on performance cores. L1 is per-core so always safe. + // For L2, use the generic hw.l2cachesize which is more conservative (reports + // the smaller E-core cluster L2 on heterogeneous chips). The P-core L2 is + // shared among all P-cores and would overestimate per-core capacity. 
+ { + int64_t val = 0; + std::size_t val_size = sizeof(val); + l1 = -1; + val_size = sizeof(val); + if (sysctlbyname("hw.perflevel0.l1dcachesize", &val, &val_size, NULL, 0) == 0 && val > 0) + l1 = static_cast<int>(val); + else { + val_size = sizeof(val); + if (sysctlbyname("hw.l1dcachesize", &val, &val_size, NULL, 0) == 0) l1 = static_cast<int>(val); + } + l2 = -1; + val_size = sizeof(val); + if (sysctlbyname("hw.l2cachesize", &val, &val_size, NULL, 0) == 0) l2 = static_cast<int>(val); + l3 = -1; + val_size = sizeof(val); + if (sysctlbyname("hw.l3cachesize", &val, &val_size, NULL, 0) == 0 && val > 0) l3 = static_cast<int>(val); + } +#elif defined(_SC_LEVEL1_DCACHE_SIZE) + // On Linux and other POSIX systems, use sysconf to query cache sizes. + l1 = sysconf(_SC_LEVEL1_DCACHE_SIZE); + l2 = sysconf(_SC_LEVEL2_CACHE_SIZE); + l3 = sysconf(_SC_LEVEL3_CACHE_SIZE); #else l1 = l2 = l3 = -1; #endif diff --git a/benchmarks/Core/bench_gemm.cpp b/benchmarks/Core/bench_gemm.cpp index db061b7fe..5d480ef1d 100644 --- a/benchmarks/Core/bench_gemm.cpp +++ b/benchmarks/Core/bench_gemm.cpp @@ -35,7 +35,7 @@ static void BM_EigenGemm(benchmark::State& state) { } static void GemmSizes(::benchmark::Benchmark* b) { - for (int size : {8, 16, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 384, 448, 512, 768, 1024, 1536, 2048}) { + for (int size : {8, 16, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 384, 448, 512, 768, 1024, 1536, 2048, 4096}) { b->Args({size, size, size}); } // Non-square sizes