From 8368a12f0fb49bc91cdbbffd3394486e542e85c3 Mon Sep 17 00:00:00 2001
From: Rasmus Munk Larsen <4643818-rmlarsen1@users.noreply.gitlab.com>
Date: Wed, 11 Mar 2026 22:36:33 -0700
Subject: [PATCH] Add runtime cache size detection for ARM and improve GEMM
 blocking

libeigen/eigen!2282

Co-authored-by: Rasmus Munk Larsen <rmlarsen@gmail.com>
---
 .gitignore                                    |  1 +
 .../Core/products/GeneralBlockPanelKernel.h   |  9 +++--
 Eigen/src/Core/util/Memory.h                  | 35 +++++++++++++++++++
 benchmarks/Core/bench_gemm.cpp                |  2 +-
 4 files changed, 44 insertions(+), 3 deletions(-)
diff --git a/.gitignore b/.gitignore
index 369ae25b6..7012c2acf 100644
--- a/.gitignore
+++ b/.gitignore
@@ -39,3 +39,4 @@ Makefile
 !scripts/buildtests.in
 !Eigen/Core
 !Eigen/src/Core
+CLAUDE.md
diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index 9645958f0..cc22dc75a 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -57,6 +57,10 @@ const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(8 * 10
 const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(512 * 1024);
 const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(4 * 1024 * 1024);
 #endif
+#elif EIGEN_ARCH_ARM_OR_ARM64
+const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(64 * 1024);
+const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(1024 * 1024);
+const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(4 * 1024 * 1024);
 #else
 const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(16 * 1024);
 const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(512 * 1024);
@@ -260,8 +264,9 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
       // L1 blocking
       max_nc = remaining_l1 / (k * sizeof(RhsScalar));
     } else {
-      // L2 blocking
-      max_nc = (3 * actual_l2) / (2 * 2 * max_kc * sizeof(RhsScalar));
+      // L2 blocking: use actual kc (k) rather than max_kc so that nc is not
+      // unnecessarily squeezed when k < max_kc (e.g. on CPUs with large L1).
+      max_nc = (3 * actual_l2) / (2 * 2 * k * sizeof(RhsScalar));
     }
     // WARNING Below, we assume that Traits::nr is a power of two.
     Index nc = numext::mini<Index>(actual_l2 / (2 * k * sizeof(RhsScalar)), max_nc) & (~(Traits::nr - 1));
diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h
index 95712ff41..3d279014a 100644
--- a/Eigen/src/Core/util/Memory.h
+++ b/Eigen/src/Core/util/Memory.h
@@ -977,6 +977,10 @@ class aligned_allocator {
 
 //---------- Cache sizes ----------
 
+#if EIGEN_OS_MAC
+#include <sys/sysctl.h>
+#endif
+
 #if !defined(EIGEN_NO_CPUID)
 #if EIGEN_COMP_GNUC && EIGEN_ARCH_i386_OR_x86_64
 #if defined(__PIC__) && EIGEN_ARCH_i386
@@ -1305,6 +1309,37 @@ inline void queryCacheSizes(int& l1, int& l2, int& l3) {
     //   ||cpuid_is_vendor(abcd,"SiS SiS SiS ")
     //   ||cpuid_is_vendor(abcd,"UMC UMC UMC ")
     //   ||cpuid_is_vendor(abcd,"NexGenDriven")
+#elif EIGEN_OS_MAC
+  // On macOS (including Apple Silicon), use sysctlbyname to query cache sizes.
+  // The sysctl values are 64-bit, so read into int64_t and convert.
+  // For L1, prefer P-core (perflevel0) size since compute-heavy work like GEMM
+  // is typically scheduled on performance cores. L1 is per-core so always safe.
+  // For L2, use the generic hw.l2cachesize which is more conservative (reports
+  // the smaller E-core cluster L2 on heterogeneous chips). The P-core L2 is
+  // shared among all P-cores and would overestimate per-core capacity.
+  {
+    int64_t val = 0;
+    std::size_t val_size = sizeof(val);
+    l1 = -1;
+    val_size = sizeof(val);
+    if (sysctlbyname("hw.perflevel0.l1dcachesize", &val, &val_size, NULL, 0) == 0 && val > 0)
+      l1 = static_cast<int>(val);
+    else {
+      val_size = sizeof(val);
+      if (sysctlbyname("hw.l1dcachesize", &val, &val_size, NULL, 0) == 0) l1 = static_cast<int>(val);
+    }
+    l2 = -1;
+    val_size = sizeof(val);
+    if (sysctlbyname("hw.l2cachesize", &val, &val_size, NULL, 0) == 0) l2 = static_cast<int>(val);
+    l3 = -1;
+    val_size = sizeof(val);
+    if (sysctlbyname("hw.l3cachesize", &val, &val_size, NULL, 0) == 0 && val > 0) l3 = static_cast<int>(val);
+  }
+#elif defined(_SC_LEVEL1_DCACHE_SIZE)
+  // On Linux and other POSIX systems, use sysconf to query cache sizes.
+  l1 = sysconf(_SC_LEVEL1_DCACHE_SIZE);
+  l2 = sysconf(_SC_LEVEL2_CACHE_SIZE);
+  l3 = sysconf(_SC_LEVEL3_CACHE_SIZE);
 #else
   l1 = l2 = l3 = -1;
 #endif
diff --git a/benchmarks/Core/bench_gemm.cpp b/benchmarks/Core/bench_gemm.cpp
index db061b7fe..5d480ef1d 100644
--- a/benchmarks/Core/bench_gemm.cpp
+++ b/benchmarks/Core/bench_gemm.cpp
@@ -35,7 +35,7 @@ static void BM_EigenGemm(benchmark::State& state) {
 }
 
 static void GemmSizes(::benchmark::Benchmark* b) {
-  for (int size : {8, 16, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 384, 448, 512, 768, 1024, 1536, 2048}) {
+  for (int size : {8, 16, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 384, 448, 512, 768, 1024, 1536, 2048, 4096}) {
     b->Args({size, size, size});
   }
   // Non-square sizes