Fine-tune gebp_kernel for aarch64

libeigen/eigen!2278
This commit is contained in:
Yu You
2026-03-20 14:29:03 -07:00
committed by Rasmus Munk Larsen
parent a0b16a7e1b
commit 9d161e0c87

View File

@@ -1411,9 +1411,16 @@ EIGEN_DONT_INLINE void gebp_kernel<LhsScalar, RhsScalar, Index, DataMapper, mr,
 EIGEN_IF_CONSTEXPR(mr >= 3 * Traits::LhsProgress) {
 std::ptrdiff_t l1, l2, l3;
 manage_caching_sizes(GetAction, &l1, &l2, &l3);
+const Index rhs_block = sizeof(ResScalar) * mr * nr + depth * nr * sizeof(RhsScalar);
+#if EIGEN_ARCH_ARM64
 const Index actual_panel_rows =
-(3 * LhsProgress) * std::max<Index>(1, ((l1 - sizeof(ResScalar) * mr * nr - depth * nr * sizeof(RhsScalar)) /
-(depth * sizeof(LhsScalar) * 3 * LhsProgress)));
+(rhs_block <= l1) ? peeled_mc3
+: (3 * LhsProgress) *
+std::max<Index>(1, ((l1 - rhs_block) / (depth * sizeof(LhsScalar) * 3 * LhsProgress)));
+#else
+const Index actual_panel_rows =
+(3 * LhsProgress) * std::max<Index>(1, ((l1 - rhs_block) / (depth * sizeof(LhsScalar) * 3 * LhsProgress)));
+#endif
 for (Index i1 = 0; i1 < peeled_mc3; i1 += actual_panel_rows) {
 const Index actual_panel_end = (std::min)(i1 + actual_panel_rows, peeled_mc3);
 #if EIGEN_ARCH_ARM64 || EIGEN_ARCH_LOONGARCH64
@@ -1442,9 +1449,17 @@ EIGEN_DONT_INLINE void gebp_kernel<LhsScalar, RhsScalar, Index, DataMapper, mr,
 EIGEN_IF_CONSTEXPR(mr >= 2 * Traits::LhsProgress) {
 std::ptrdiff_t l1, l2, l3;
 manage_caching_sizes(GetAction, &l1, &l2, &l3);
+const Index rhs_block2 = sizeof(ResScalar) * mr * nr + depth * nr * sizeof(RhsScalar);
+#if EIGEN_ARCH_ARM64
 Index actual_panel_rows =
-(2 * LhsProgress) * std::max<Index>(1, ((l1 - sizeof(ResScalar) * mr * nr - depth * nr * sizeof(RhsScalar)) /
-(depth * sizeof(LhsScalar) * 2 * LhsProgress)));
+(rhs_block2 <= l1)
+? peeled_mc2 - peeled_mc3
+: (2 * LhsProgress) *
+std::max<Index>(1, ((l1 - rhs_block2) / (depth * sizeof(LhsScalar) * 2 * LhsProgress)));
+#else
+Index actual_panel_rows =
+(2 * LhsProgress) * std::max<Index>(1, ((l1 - rhs_block2) / (depth * sizeof(LhsScalar) * 2 * LhsProgress)));
+#endif
 for (Index i1 = peeled_mc3; i1 < peeled_mc2; i1 += actual_panel_rows) {
 Index actual_panel_end = (std::min)(i1 + actual_panel_rows, peeled_mc2);
 #if EIGEN_ARCH_ARM64 || EIGEN_ARCH_LOONGARCH64