Fix mixed-type GEMM packing for backends without half/quarter packets

libeigen/eigen!2297

Co-authored-by: Rasmus Munk Larsen <rmlarsen@gmail.com>
This commit is contained in:
Rasmus Munk Larsen
2026-03-21 09:46:54 -07:00
parent 1d21d62fbc
commit 54b04fc6b1
4 changed files with 39 additions and 16 deletions

View File

@@ -1292,17 +1292,17 @@ EIGEN_ALWAYS_INLINE void gebp_micro_panel_impl(GEBPTraits& traits, const DataMap
#else #else
constexpr int CSize = 3 * NrCols > MrPackets * NrCols ? 3 * NrCols : MrPackets * NrCols; constexpr int CSize = 3 * NrCols > MrPackets * NrCols ? 3 * NrCols : MrPackets * NrCols;
#endif #endif
AccPacketLocal C[CSize]; alignas(AccPacketLocal) AccPacketLocal C[CSize];
for (int n = 0; n < MrPackets * NrCols; ++n) traits.initAcc(C[n]); for (int n = 0; n < MrPackets * NrCols; ++n) traits.initAcc(C[n]);
// Double-accumulation trick for 1pX4 path to break FMA dependency chains // Double-accumulation trick for 1pX4 path to break FMA dependency chains
constexpr bool use_double_accum = (MrPackets == 1 && NrCols == 4); constexpr bool use_double_accum = (MrPackets == 1 && NrCols == 4);
#ifdef EIGEN_HAS_CXX17_IFCONSTEXPR #ifdef EIGEN_HAS_CXX17_IFCONSTEXPR
AccPacketLocal D[use_double_accum ? NrCols : 1]; alignas(AccPacketLocal) AccPacketLocal D[use_double_accum ? NrCols : 1];
#else #else
// Without if constexpr, we must allocate a larger array to satisfy the // Without if constexpr, we must allocate a larger array to satisfy the
// compiler that D[n] is always in bounds for the use_double_accum path. // compiler that D[n] is always in bounds for the use_double_accum path.
AccPacketLocal D[CSize]; alignas(AccPacketLocal) AccPacketLocal D[CSize];
#endif #endif
EIGEN_IF_CONSTEXPR(use_double_accum) { EIGEN_IF_CONSTEXPR(use_double_accum) {
for (int n = 0; n < NrCols; ++n) traits.initAcc(D[n]); for (int n = 0; n < NrCols; ++n) traits.initAcc(D[n]);
@@ -1317,15 +1317,15 @@ EIGEN_ALWAYS_INLINE void gebp_micro_panel_impl(GEBPTraits& traits, const DataMap
// LHS packet staging area. With if constexpr (C++17) we use exact sizes. // LHS packet staging area. With if constexpr (C++17) we use exact sizes.
#ifdef EIGEN_HAS_CXX17_IFCONSTEXPR #ifdef EIGEN_HAS_CXX17_IFCONSTEXPR
LhsPacketLocal A[MrPackets]; alignas(LhsPacketLocal) LhsPacketLocal A[MrPackets];
#else #else
LhsPacketLocal A[3]; alignas(LhsPacketLocal) LhsPacketLocal A[3];
#endif #endif
// ---- Peeled k-loop (pk=8 unrolled) ---- // ---- Peeled k-loop (pk=8 unrolled) ----
for (Index_ k = 0; k < peeled_kc; k += pk) { for (Index_ k = 0; k < peeled_kc; k += pk) {
RhsPanelType rhs_panel; alignas(RhsPanelType) RhsPanelType rhs_panel;
RhsPacketLocal T0; alignas(RhsPacketLocal) RhsPacketLocal T0;
gebp_peeled_loop<MrPackets, NrCols>::template run<GEBPTraits, LhsScalar_, RhsScalar_, decltype(A), RhsPanelType, gebp_peeled_loop<MrPackets, NrCols>::template run<GEBPTraits, LhsScalar_, RhsScalar_, decltype(A), RhsPanelType,
RhsPacketLocal, decltype(C), decltype(D), FullLhsPacket>( RhsPacketLocal, decltype(C), decltype(D), FullLhsPacket>(
@@ -1342,8 +1342,8 @@ EIGEN_ALWAYS_INLINE void gebp_micro_panel_impl(GEBPTraits& traits, const DataMap
// ---- Remainder k-loop ---- // ---- Remainder k-loop ----
for (Index_ k = peeled_kc; k < depth; k++) { for (Index_ k = peeled_kc; k < depth; k++) {
RhsPanelType rhs_panel; alignas(RhsPanelType) RhsPanelType rhs_panel;
RhsPacketLocal T0; alignas(RhsPacketLocal) RhsPacketLocal T0;
gebp_micro_step<0, MrPackets, NrCols>::run(traits, blA, blB, A, rhs_panel, T0, C); gebp_micro_step<0, MrPackets, NrCols>::run(traits, blA, blB, A, rhs_panel, T0, C);
@@ -1352,11 +1352,11 @@ EIGEN_ALWAYS_INLINE void gebp_micro_panel_impl(GEBPTraits& traits, const DataMap
} }
// ---- Store results: C[j + p * NrCols] -> res(i + p*ResPacketSz, j2 + j) ---- // ---- Store results: C[j + p * NrCols] -> res(i + p*ResPacketSz, j2 + j) ----
ResPacketLocal alphav = pset1<ResPacketLocal>(alpha); alignas(ResPacketLocal) ResPacketLocal alphav = pset1<ResPacketLocal>(alpha);
for (int j = 0; j < NrCols; ++j) { for (int j = 0; j < NrCols; ++j) {
LinearMapper_ r = res.getLinearMapper(i, j2 + j); LinearMapper_ r = res.getLinearMapper(i, j2 + j);
for (int p = 0; p < MrPackets; ++p) { for (int p = 0; p < MrPackets; ++p) {
ResPacketLocal R = r.template loadPacket<ResPacketLocal>(p * ResPacketSz); alignas(ResPacketLocal) ResPacketLocal R = r.template loadPacket<ResPacketLocal>(p * ResPacketSz);
traits.acc(C[j + p * NrCols], alphav, R); traits.acc(C[j + p * NrCols], alphav, R);
r.storePacket(p * ResPacketSz, R); r.storePacket(p * ResPacketSz, R);
} }
@@ -1918,14 +1918,23 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Pa
// address both real & imaginary parts on the rhs. This portion will // address both real & imaginary parts on the rhs. This portion will
// pack those half ones until they match the number expected on the // pack those half ones until they match the number expected on the
// last peeling loop at this point (for the rhs). // last peeling loop at this point (for the rhs).
//
// When there are no half/quarter packet types (HasHalf and HasQuarter
// are both false), last_lhs_progress can exceed Pack2, producing
// interleaved groups that the GEBP micro-kernel cannot consume. In
// that case we use exactly Pack2 rows per group so the kernel's main
// loop (which reads Pack2 = LhsProgress values via ploaddup) can
// handle them; remaining rows fall through to the scalar loop below.
if (Pack2 < PacketSize && Pack2 > 1) { if (Pack2 < PacketSize && Pack2 > 1) {
for (; i < peeled_mc0; i += last_lhs_progress) { const Index pack2_progress = (HasHalf || HasQuarter) ? last_lhs_progress : Pack2;
if (PanelMode) count += last_lhs_progress * offset; const Index peeled = (HasHalf || HasQuarter) ? peeled_mc0 : (rows / Pack2) * Pack2;
for (; i < peeled; i += pack2_progress) {
if (PanelMode) count += pack2_progress * offset;
for (Index k = 0; k < depth; k++) for (Index k = 0; k < depth; k++)
for (Index w = 0; w < last_lhs_progress; w++) blockA[count++] = cj(lhs(i + w, k)); for (Index w = 0; w < pack2_progress; w++) blockA[count++] = cj(lhs(i + w, k));
if (PanelMode) count += last_lhs_progress * (stride - offset - depth); if (PanelMode) count += pack2_progress * (stride - offset - depth);
} }
} }
// Pack scalars // Pack scalars
@@ -2040,7 +2049,11 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Pa
// address both real & imaginary parts on the rhs. This portion will // address both real & imaginary parts on the rhs. This portion will
// pack those half ones until they match the number expected on the // pack those half ones until they match the number expected on the
// last peeling loop at this point (for the rhs). // last peeling loop at this point (for the rhs).
if (Pack2 < PacketSize && !gone_last) { //
// When there are no half/quarter packet types, the interleaved
// groups cannot be consumed by the GEBP micro-kernel's half/quarter
// loops. Fall through to the scalar loop instead.
if (Pack2 < PacketSize && !gone_last && (HasHalf || HasQuarter)) {
gone_last = true; gone_last = true;
psize = pack = left & ~1; psize = pack = left & ~1;
} }

View File

@@ -196,6 +196,8 @@ if(COMPILER_SUPPORTS_VECTOR_EXTENSIONS)
ei_add_test(packetmath_generic_16 "-DEIGEN_FAST_MATH=1") ei_add_test(packetmath_generic_16 "-DEIGEN_FAST_MATH=1")
ei_add_test(packetmath_generic_32 "-DEIGEN_FAST_MATH=1") ei_add_test(packetmath_generic_32 "-DEIGEN_FAST_MATH=1")
ei_add_test(packetmath_generic_64 "-DEIGEN_FAST_MATH=1") ei_add_test(packetmath_generic_64 "-DEIGEN_FAST_MATH=1")
ei_add_test(mixingtypes_generic_32)
ei_add_test(mixingtypes_generic_64)
endif() endif()
ei_add_test(packet_segment) ei_add_test(packet_segment)
ei_add_test(vectorization_logic) ei_add_test(vectorization_logic)

View File

@@ -0,0 +1,4 @@
// Force the generic clang vector backend with 32-byte vectors.
// This re-runs the mixingtypes test suite on a backend that (per the
// commit description) has no half/quarter packet types, exercising the
// mixed-type GEMM packing fix. NOTE(review): macro semantics inferred
// from their names — confirm against Eigen's vectorization config headers.
#define EIGEN_VECTORIZE_GENERIC 1
#define EIGEN_GENERIC_VECTOR_SIZE_BYTES 32
#include "mixingtypes.cpp"

View File

@@ -0,0 +1,4 @@
// Force the generic clang vector backend with 64-byte vectors.
#define EIGEN_VECTORIZE_GENERIC 1
#define EIGEN_GENERIC_VECTOR_SIZE_BYTES 64
#include "mixingtypes.cpp"