Add runtime cache size detection for ARM and improve GEMM blocking

libeigen/eigen!2282

Co-authored-by: Rasmus Munk Larsen <rmlarsen@gmail.com>
This commit is contained in:
Rasmus Munk Larsen
2026-03-11 22:36:33 -07:00
parent 42c1dbd2c3
commit 8368a12f0f
4 changed files with 44 additions and 3 deletions

1
.gitignore vendored
View File

@@ -39,3 +39,4 @@ Makefile
!scripts/buildtests.in
!Eigen/Core
!Eigen/src/Core
CLAUDE.md

View File

@@ -57,6 +57,10 @@ const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(8 * 10
const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(512 * 1024);
const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(4 * 1024 * 1024);
#endif
#elif EIGEN_ARCH_ARM_OR_ARM64
// Default cache sizes selected when EIGEN_ARCH_ARM_OR_ARM64 is set:
// 64 KiB for L1, 1 MiB for L2 and 4 MiB for L3 (values in bytes).
// NOTE(review): the EIGEN_SET_DEFAULT_L*_CACHE_SIZE wrappers are defined
// outside this hunk; presumably they allow these values to be overridden at
// build time -- confirm against their definition.
const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(64 * 1024);
const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(1024 * 1024);
const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(4 * 1024 * 1024);
#else
const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(16 * 1024);
const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(512 * 1024);
@@ -260,8 +264,9 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
// L1 blocking
max_nc = remaining_l1 / (k * sizeof(RhsScalar));
} else {
// L2 blocking
max_nc = (3 * actual_l2) / (2 * 2 * max_kc * sizeof(RhsScalar));
// L2 blocking: use actual kc (k) rather than max_kc so that nc is not
// unnecessarily squeezed when k < max_kc (e.g. on CPUs with large L1).
max_nc = (3 * actual_l2) / (2 * 2 * k * sizeof(RhsScalar));
}
// WARNING Below, we assume that Traits::nr is a power of two.
Index nc = numext::mini<Index>(actual_l2 / (2 * k * sizeof(RhsScalar)), max_nc) & (~(Traits::nr - 1));

View File

@@ -977,6 +977,10 @@ class aligned_allocator {
//---------- Cache sizes ----------
#if EIGEN_OS_MAC
#include <sys/sysctl.h>
#endif
#if !defined(EIGEN_NO_CPUID)
#if EIGEN_COMP_GNUC && EIGEN_ARCH_i386_OR_x86_64
#if defined(__PIC__) && EIGEN_ARCH_i386
@@ -1305,6 +1309,37 @@ inline void queryCacheSizes(int& l1, int& l2, int& l3) {
// ||cpuid_is_vendor(abcd,"SiS SiS SiS ")
// ||cpuid_is_vendor(abcd,"UMC UMC UMC ")
// ||cpuid_is_vendor(abcd,"NexGenDriven")
#elif EIGEN_OS_MAC
// On macOS (including Apple Silicon), use sysctlbyname to query cache sizes.
// The sysctl values are 64-bit, so read into int64_t and convert.
// For L1, prefer P-core (perflevel0) size since compute-heavy work like GEMM
// is typically scheduled on performance cores. L1 is per-core so always safe.
// For L2, use the generic hw.l2cachesize which is more conservative (reports
// the smaller E-core cluster L2 on heterogeneous chips). The P-core L2 is
// shared among all P-cores and would overestimate per-core capacity.
{
  // Scratch buffer shared by every query below. `raw_len` is reset to the
  // buffer size before each call after the first, since it is passed to
  // sysctlbyname by address.
  int64_t raw = 0;
  std::size_t raw_len = sizeof(raw);

  // L1 data cache: try the perflevel0 key first and accept it only when it
  // reports a positive size; otherwise fall back to the generic key.
  // -1 means the size could not be determined.
  int l1_bytes = -1;
  if (sysctlbyname("hw.perflevel0.l1dcachesize", &raw, &raw_len, NULL, 0) == 0 && raw > 0) {
    l1_bytes = static_cast<int>(raw);
  } else {
    raw_len = sizeof(raw);
    if (sysctlbyname("hw.l1dcachesize", &raw, &raw_len, NULL, 0) == 0) {
      l1_bytes = static_cast<int>(raw);
    }
  }
  l1 = l1_bytes;

  // L2 cache: generic key only; any successfully read value is taken as-is.
  int l2_bytes = -1;
  raw_len = sizeof(raw);
  if (sysctlbyname("hw.l2cachesize", &raw, &raw_len, NULL, 0) == 0) {
    l2_bytes = static_cast<int>(raw);
  }
  l2 = l2_bytes;

  // L3 cache: only a positive reported size is accepted.
  int l3_bytes = -1;
  raw_len = sizeof(raw);
  if (sysctlbyname("hw.l3cachesize", &raw, &raw_len, NULL, 0) == 0 && raw > 0) {
    l3_bytes = static_cast<int>(raw);
  }
  l3 = l3_bytes;
}
#elif defined(_SC_LEVEL1_DCACHE_SIZE)
// On Linux and other systems whose <unistd.h> defines the _SC_LEVEL*_CACHE_SIZE
// names (a non-POSIX extension), use sysconf to query cache sizes.
l1 = sysconf(_SC_LEVEL1_DCACHE_SIZE);
l2 = sysconf(_SC_LEVEL2_CACHE_SIZE);
l3 = sysconf(_SC_LEVEL3_CACHE_SIZE);
#else
l1 = l2 = l3 = -1;
#endif

View File

@@ -35,7 +35,7 @@ static void BM_EigenGemm(benchmark::State& state) {
}
static void GemmSizes(::benchmark::Benchmark* b) {
for (int size : {8, 16, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 384, 448, 512, 768, 1024, 1536, 2048}) {
for (int size : {8, 16, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 384, 448, 512, 768, 1024, 1536, 2048, 4096}) {
b->Args({size, size, size});
}
// Non-square sizes