mirror of
https://gitlab.com/libeigen/eigen.git
synced 2026-04-10 11:34:33 +08:00
Add runtime cache size detection for ARM and improve GEMM blocking
libeigen/eigen!2282
Co-authored-by: Rasmus Munk Larsen <rmlarsen@gmail.com>
This commit is contained in:
1
.gitignore
vendored
1
.gitignore
vendored
@@ -39,3 +39,4 @@ Makefile
|
||||
!scripts/buildtests.in
|
||||
!Eigen/Core
|
||||
!Eigen/src/Core
|
||||
CLAUDE.md
|
||||
|
||||
@@ -57,6 +57,10 @@ const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(8 * 10
|
||||
const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(512 * 1024);
|
||||
const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(4 * 1024 * 1024);
|
||||
#endif
|
||||
#elif EIGEN_ARCH_ARM_OR_ARM64
|
||||
const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(64 * 1024);
|
||||
const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(1024 * 1024);
|
||||
const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(4 * 1024 * 1024);
|
||||
#else
|
||||
const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(16 * 1024);
|
||||
const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(512 * 1024);
|
||||
@@ -260,8 +264,9 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
|
||||
// L1 blocking
|
||||
max_nc = remaining_l1 / (k * sizeof(RhsScalar));
|
||||
} else {
|
||||
// L2 blocking
|
||||
max_nc = (3 * actual_l2) / (2 * 2 * max_kc * sizeof(RhsScalar));
|
||||
// L2 blocking: use actual kc (k) rather than max_kc so that nc is not
|
||||
// unnecessarily squeezed when k < max_kc (e.g. on CPUs with large L1).
|
||||
max_nc = (3 * actual_l2) / (2 * 2 * k * sizeof(RhsScalar));
|
||||
}
|
||||
// WARNING Below, we assume that Traits::nr is a power of two.
|
||||
Index nc = numext::mini<Index>(actual_l2 / (2 * k * sizeof(RhsScalar)), max_nc) & (~(Traits::nr - 1));
|
||||
|
||||
@@ -977,6 +977,10 @@ class aligned_allocator {
|
||||
|
||||
//---------- Cache sizes ----------
|
||||
|
||||
#if EIGEN_OS_MAC
|
||||
#include <sys/sysctl.h>
|
||||
#endif
|
||||
|
||||
#if !defined(EIGEN_NO_CPUID)
|
||||
#if EIGEN_COMP_GNUC && EIGEN_ARCH_i386_OR_x86_64
|
||||
#if defined(__PIC__) && EIGEN_ARCH_i386
|
||||
@@ -1305,6 +1309,37 @@ inline void queryCacheSizes(int& l1, int& l2, int& l3) {
|
||||
// ||cpuid_is_vendor(abcd,"SiS SiS SiS ")
|
||||
// ||cpuid_is_vendor(abcd,"UMC UMC UMC ")
|
||||
// ||cpuid_is_vendor(abcd,"NexGenDriven")
|
||||
#elif EIGEN_OS_MAC
|
||||
// On macOS (including Apple Silicon), use sysctlbyname to query cache sizes.
|
||||
// The sysctl values are 64-bit, so read into int64_t and convert.
|
||||
// For L1, prefer P-core (perflevel0) size since compute-heavy work like GEMM
|
||||
// is typically scheduled on performance cores. L1 is per-core so always safe.
|
||||
// For L2, use the generic hw.l2cachesize which is more conservative (reports
|
||||
// the smaller E-core cluster L2 on heterogeneous chips). The P-core L2 is
|
||||
// shared among all P-cores and would overestimate per-core capacity.
|
||||
{
|
||||
int64_t val = 0;
|
||||
std::size_t val_size = sizeof(val);
|
||||
l1 = -1;
|
||||
val_size = sizeof(val);
|
||||
if (sysctlbyname("hw.perflevel0.l1dcachesize", &val, &val_size, NULL, 0) == 0 && val > 0)
|
||||
l1 = static_cast<int>(val);
|
||||
else {
|
||||
val_size = sizeof(val);
|
||||
if (sysctlbyname("hw.l1dcachesize", &val, &val_size, NULL, 0) == 0) l1 = static_cast<int>(val);
|
||||
}
|
||||
l2 = -1;
|
||||
val_size = sizeof(val);
|
||||
if (sysctlbyname("hw.l2cachesize", &val, &val_size, NULL, 0) == 0) l2 = static_cast<int>(val);
|
||||
l3 = -1;
|
||||
val_size = sizeof(val);
|
||||
if (sysctlbyname("hw.l3cachesize", &val, &val_size, NULL, 0) == 0 && val > 0) l3 = static_cast<int>(val);
|
||||
}
|
||||
#elif defined(_SC_LEVEL1_DCACHE_SIZE)
|
||||
// On Linux and other POSIX systems, use sysconf to query cache sizes.
|
||||
l1 = sysconf(_SC_LEVEL1_DCACHE_SIZE);
|
||||
l2 = sysconf(_SC_LEVEL2_CACHE_SIZE);
|
||||
l3 = sysconf(_SC_LEVEL3_CACHE_SIZE);
|
||||
#else
|
||||
l1 = l2 = l3 = -1;
|
||||
#endif
|
||||
|
||||
@@ -35,7 +35,7 @@ static void BM_EigenGemm(benchmark::State& state) {
|
||||
}
|
||||
|
||||
static void GemmSizes(::benchmark::Benchmark* b) {
|
||||
for (int size : {8, 16, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 384, 448, 512, 768, 1024, 1536, 2048}) {
|
||||
for (int size : {8, 16, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 384, 448, 512, 768, 1024, 1536, 2048, 4096}) {
|
||||
b->Args({size, size, size});
|
||||
}
|
||||
// Non-square sizes
|
||||
|
||||
Reference in New Issue
Block a user