|
|
|
|
@@ -88,15 +88,15 @@ void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads
|
|
|
|
|
#ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES
|
|
|
|
|
EIGEN_UNUSED_VARIABLE(num_threads);
|
|
|
|
|
enum {
|
|
|
|
|
kr = 16,
|
|
|
|
|
kr = 8,
|
|
|
|
|
mr = Traits::mr,
|
|
|
|
|
nr = Traits::nr
|
|
|
|
|
};
|
|
|
|
|
k = std::min<Index>(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K);
|
|
|
|
|
if (k > kr) k -= k % kr;
|
|
|
|
|
m = std::min<Index>(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M);
|
|
|
|
|
m = std::min<Index>(m, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M);
|
|
|
|
|
if (m > mr) m -= m % mr;
|
|
|
|
|
n = std::min<Index>(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N);
|
|
|
|
|
n = std::min<Index>(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N);
|
|
|
|
|
if (n > nr) n -= n % nr;
|
|
|
|
|
return;
|
|
|
|
|
#endif
|
|
|
|
|
@@ -153,16 +153,104 @@ void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads
|
|
|
|
|
}
|
|
|
|
|
else {
|
|
|
|
|
// In unit tests we do not want to use extra large matrices,
|
|
|
|
|
// so we reduce the block size to check the blocking strategy is not flawed
|
|
|
|
|
#ifndef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
|
|
|
|
|
k = std::min<Index>(k,sizeof(LhsScalar)<=4 ? 360 : 240);
|
|
|
|
|
n = std::min<Index>(n,3840/sizeof(RhsScalar));
|
|
|
|
|
m = std::min<Index>(m,3840/sizeof(RhsScalar));
|
|
|
|
|
#else
|
|
|
|
|
k = std::min<Index>(k,24);
|
|
|
|
|
n = std::min<Index>(n,384/sizeof(RhsScalar));
|
|
|
|
|
m = std::min<Index>(m,384/sizeof(RhsScalar));
|
|
|
|
|
// so we reduce the cache size to check the blocking strategy is not flawed
|
|
|
|
|
#ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
|
|
|
|
|
l1 = 4*1024;
|
|
|
|
|
l2 = 32*1024;
|
|
|
|
|
l3 = 512*1024;
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
// Early return for small problems because the computation below are time consuming for small problems.
|
|
|
|
|
// Perhaps it would make more sense to consider k*n*m??
|
|
|
|
|
// Note that for very tiny problem, this function should be bypassed anyway
|
|
|
|
|
// because we use the coefficient-based implementation for them.
|
|
|
|
|
if(std::max(k,std::max(m,n))<48)
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
typedef typename Traits::ResScalar ResScalar;
|
|
|
|
|
enum {
|
|
|
|
|
k_peeling = 8,
|
|
|
|
|
k_div = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
|
|
|
|
|
k_sub = Traits::mr * Traits::nr * sizeof(ResScalar)
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
// ---- 1st level of blocking on L1, yields kc ----
|
|
|
|
|
|
|
|
|
|
// Blocking on the third dimension (i.e., k) is chosen so that an horizontal panel
|
|
|
|
|
// of size mr x kc of the lhs plus a vertical panel of kc x nr of the rhs both fits within L1 cache.
|
|
|
|
|
// We also include a register-level block of the result (mx x nr).
|
|
|
|
|
// (In an ideal world only the lhs panel would stay in L1)
|
|
|
|
|
// Moreover, kc has to be a multiple of 8 to be compatible with loop peeling, leading to a maximum blocking size of:
|
|
|
|
|
const Index max_kc = ((l1-k_sub)/k_div) & (~(k_peeling-1));
|
|
|
|
|
const Index old_k = k;
|
|
|
|
|
if(k>max_kc)
|
|
|
|
|
{
|
|
|
|
|
// We are really blocking on the third dimension:
|
|
|
|
|
// -> reduce blocking size to make sure the last block is as large as possible
|
|
|
|
|
// while keeping the same number of sweeps over the result.
|
|
|
|
|
k = (k%max_kc)==0 ? max_kc
|
|
|
|
|
: max_kc - k_peeling * ((max_kc-1-(k%max_kc))/(k_peeling*(k/max_kc+1)));
|
|
|
|
|
|
|
|
|
|
eigen_internal_assert(((old_k/k) == (old_k/max_kc)) && "the number of sweeps has to remain the same");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// ---- 2nd level of blocking on max(L2,L3), yields nc ----
|
|
|
|
|
|
|
|
|
|
// TODO find a reliable way to get the actual amount of cache per core to use for 2nd level blocking, that is:
|
|
|
|
|
// actual_l2 = max(l2, l3/nb_core_sharing_l3)
|
|
|
|
|
// The number below is quite conservative: it is better to underestimate the cache size rather than overestimating it)
|
|
|
|
|
// For instance, it corresponds to 6MB of L3 shared among 4 cores.
|
|
|
|
|
#ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
|
|
|
|
|
const Index actual_l2 = l3;
|
|
|
|
|
#else
|
|
|
|
|
const Index actual_l2 = 1572864; // == 1.5 MB
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Here, nc is chosen such that a block of kc x nc of the rhs fit within half of L2.
|
|
|
|
|
// The second half is implicitly reserved to access the result and lhs coefficients.
|
|
|
|
|
// When k<max_kc, then nc can arbitrarily growth. In practice, it seems to be fruitful
|
|
|
|
|
// to limit this growth: we bound nc to growth by a factor x1.5, leading to:
|
|
|
|
|
const Index max_nc = (3*actual_l2)/(2*2*max_kc*sizeof(RhsScalar));
|
|
|
|
|
// WARNING Below, we assume that Traits::nr is a power of two.
|
|
|
|
|
Index nc = std::min<Index>(actual_l2/(2*k*sizeof(RhsScalar)), max_nc) & (~(Traits::nr-1));
|
|
|
|
|
if(n>nc)
|
|
|
|
|
{
|
|
|
|
|
// We are really blocking over the columns:
|
|
|
|
|
// -> reduce blocking size to make sure the last block is as large as possible
|
|
|
|
|
// while keeping the same number of sweeps over the packed lhs.
|
|
|
|
|
// Here we allow one more sweep if this gives us a perfect match, thus the commented "-1"
|
|
|
|
|
n = (n%nc)==0 ? nc
|
|
|
|
|
: (nc - Traits::nr * ((nc/*-1*/-(n%nc))/(Traits::nr*(n/nc+1))));
|
|
|
|
|
}
|
|
|
|
|
else if(old_k==k)
|
|
|
|
|
{
|
|
|
|
|
// So far, no blocking at all, i.e., kc==k, and nc==n.
|
|
|
|
|
// In this case, let's perform a blocking over the rows such that the packed lhs data is kept in cache L1/L2
|
|
|
|
|
Index problem_size = k*n*sizeof(LhsScalar);
|
|
|
|
|
Index actual_lm = actual_l2;
|
|
|
|
|
Index max_mc = m;
|
|
|
|
|
if(problem_size<=1024)
|
|
|
|
|
{
|
|
|
|
|
// problem is small enough to keep in L1
|
|
|
|
|
// Let's choose m such that lhs's block fit in 1/3 of L1
|
|
|
|
|
actual_lm = l1;
|
|
|
|
|
}
|
|
|
|
|
else if(l3!=0 && problem_size<=32768)
|
|
|
|
|
{
|
|
|
|
|
// we have both L2 and L3, and problem is small enough to be kept in L2
|
|
|
|
|
// Let's choose m such that lhs's block fit in 1/3 of L2
|
|
|
|
|
actual_lm = l2;
|
|
|
|
|
max_mc = 576;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
Index mc = (std::min<Index>)(actual_lm/(3*k*sizeof(LhsScalar)), max_mc);
|
|
|
|
|
if (mc > Traits::mr) mc -= mc % Traits::mr;
|
|
|
|
|
|
|
|
|
|
m = (m%mc)==0 ? mc
|
|
|
|
|
: (mc - Traits::mr * ((mc/*-1*/-(m%mc))/(Traits::mr*(m/mc+1))));
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@@ -712,6 +800,80 @@ protected:
|
|
|
|
|
conj_helper<ResPacket,ResPacket,false,ConjRhs> cj;
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
// helper for the rotating kernel below
|
|
|
|
|
template <typename GebpKernel, bool UseRotatingKernel = GebpKernel::UseRotatingKernel>
|
|
|
|
|
struct PossiblyRotatingKernelHelper
|
|
|
|
|
{
|
|
|
|
|
// default implementation, not rotating
|
|
|
|
|
|
|
|
|
|
typedef typename GebpKernel::Traits Traits;
|
|
|
|
|
typedef typename Traits::RhsScalar RhsScalar;
|
|
|
|
|
typedef typename Traits::RhsPacket RhsPacket;
|
|
|
|
|
typedef typename Traits::AccPacket AccPacket;
|
|
|
|
|
|
|
|
|
|
const Traits& traits;
|
|
|
|
|
PossiblyRotatingKernelHelper(const Traits& t) : traits(t) {}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
template <size_t K, size_t Index>
|
|
|
|
|
void loadOrRotateRhs(RhsPacket& to, const RhsScalar* from) const
|
|
|
|
|
{
|
|
|
|
|
traits.loadRhs(from + (Index+4*K)*Traits::RhsProgress, to);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void unrotateResult(AccPacket&,
|
|
|
|
|
AccPacket&,
|
|
|
|
|
AccPacket&,
|
|
|
|
|
AccPacket&)
|
|
|
|
|
{
|
|
|
|
|
}
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
// rotating implementation
|
|
|
|
|
template <typename GebpKernel>
|
|
|
|
|
struct PossiblyRotatingKernelHelper<GebpKernel, true>
|
|
|
|
|
{
|
|
|
|
|
typedef typename GebpKernel::Traits Traits;
|
|
|
|
|
typedef typename Traits::RhsScalar RhsScalar;
|
|
|
|
|
typedef typename Traits::RhsPacket RhsPacket;
|
|
|
|
|
typedef typename Traits::AccPacket AccPacket;
|
|
|
|
|
|
|
|
|
|
const Traits& traits;
|
|
|
|
|
PossiblyRotatingKernelHelper(const Traits& t) : traits(t) {}
|
|
|
|
|
|
|
|
|
|
template <size_t K, size_t Index>
|
|
|
|
|
void loadOrRotateRhs(RhsPacket& to, const RhsScalar* from) const
|
|
|
|
|
{
|
|
|
|
|
if (Index == 0) {
|
|
|
|
|
to = pload<RhsPacket>(from + 4*K*Traits::RhsProgress);
|
|
|
|
|
} else {
|
|
|
|
|
EIGEN_ASM_COMMENT("Do not reorder code, we're very tight on registers");
|
|
|
|
|
to = protate<1>(to);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void unrotateResult(AccPacket& res0,
|
|
|
|
|
AccPacket& res1,
|
|
|
|
|
AccPacket& res2,
|
|
|
|
|
AccPacket& res3)
|
|
|
|
|
{
|
|
|
|
|
PacketBlock<AccPacket> resblock;
|
|
|
|
|
resblock.packet[0] = res0;
|
|
|
|
|
resblock.packet[1] = res1;
|
|
|
|
|
resblock.packet[2] = res2;
|
|
|
|
|
resblock.packet[3] = res3;
|
|
|
|
|
ptranspose(resblock);
|
|
|
|
|
resblock.packet[3] = protate<1>(resblock.packet[3]);
|
|
|
|
|
resblock.packet[2] = protate<2>(resblock.packet[2]);
|
|
|
|
|
resblock.packet[1] = protate<3>(resblock.packet[1]);
|
|
|
|
|
ptranspose(resblock);
|
|
|
|
|
res0 = resblock.packet[0];
|
|
|
|
|
res1 = resblock.packet[1];
|
|
|
|
|
res2 = resblock.packet[2];
|
|
|
|
|
res3 = resblock.packet[3];
|
|
|
|
|
}
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
/* optimized GEneral packed Block * packed Panel product kernel
|
|
|
|
|
*
|
|
|
|
|
* Mixing type logic: C += A * B
|
|
|
|
|
@@ -745,6 +907,16 @@ struct gebp_kernel
|
|
|
|
|
ResPacketSize = Traits::ResPacketSize
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
static const bool UseRotatingKernel =
|
|
|
|
|
EIGEN_ARCH_ARM &&
|
|
|
|
|
internal::is_same<LhsScalar, float>::value &&
|
|
|
|
|
internal::is_same<RhsScalar, float>::value &&
|
|
|
|
|
internal::is_same<ResScalar, float>::value &&
|
|
|
|
|
Traits::LhsPacketSize == 4 &&
|
|
|
|
|
Traits::RhsPacketSize == 4 &&
|
|
|
|
|
Traits::ResPacketSize == 4;
|
|
|
|
|
|
|
|
|
|
EIGEN_DONT_INLINE
|
|
|
|
|
void operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
|
|
|
|
|
Index rows, Index depth, Index cols, ResScalar alpha,
|
|
|
|
|
@@ -778,6 +950,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
|
|
|
|
|
// Usually, make sense only with FMA
|
|
|
|
|
if(mr>=3*Traits::LhsProgress)
|
|
|
|
|
{
|
|
|
|
|
PossiblyRotatingKernelHelper<gebp_kernel> possiblyRotatingKernelHelper(traits);
|
|
|
|
|
|
|
|
|
|
// loops on each largest micro horizontal panel of lhs (3*Traits::LhsProgress x depth)
|
|
|
|
|
for(Index i=0; i<peeled_mc3; i+=3*Traits::LhsProgress)
|
|
|
|
|
{
|
|
|
|
|
@@ -813,43 +987,12 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
|
|
|
|
|
prefetch(&blB[0]);
|
|
|
|
|
LhsPacket A0, A1;
|
|
|
|
|
|
|
|
|
|
#define EIGEN_ARCH_PREFERS_ROTATING_KERNEL EIGEN_ARCH_ARM
|
|
|
|
|
|
|
|
|
|
#if EIGEN_ARCH_PREFERS_ROTATING_KERNEL
|
|
|
|
|
static const bool UseRotatingKernel =
|
|
|
|
|
Traits::LhsPacketSize == 4 &&
|
|
|
|
|
Traits::RhsPacketSize == 4 &&
|
|
|
|
|
Traits::ResPacketSize == 4;
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
for(Index k=0; k<peeled_kc; k+=pk)
|
|
|
|
|
{
|
|
|
|
|
EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX4");
|
|
|
|
|
RhsPacket B_0, T0;
|
|
|
|
|
LhsPacket A2;
|
|
|
|
|
|
|
|
|
|
#define EIGEN_GEBP_ONESTEP_LOADRHS_NONROTATING(K,N) \
|
|
|
|
|
traits.loadRhs(&blB[(N+4*K)*RhsProgress], B_0);
|
|
|
|
|
|
|
|
|
|
#if EIGEN_ARCH_PREFERS_ROTATING_KERNEL
|
|
|
|
|
#define EIGEN_GEBP_ONESTEP_LOADRHS(K,N) \
|
|
|
|
|
do { \
|
|
|
|
|
if (UseRotatingKernel) { \
|
|
|
|
|
if (N == 0) { \
|
|
|
|
|
B_0 = pload<RhsPacket>(&blB[(0+4*K)*RhsProgress]); \
|
|
|
|
|
} else { \
|
|
|
|
|
EIGEN_ASM_COMMENT("Do not reorder code, we're very tight on registers"); \
|
|
|
|
|
B_0 = protate<1>(B_0); \
|
|
|
|
|
} \
|
|
|
|
|
} else { \
|
|
|
|
|
EIGEN_GEBP_ONESTEP_LOADRHS_NONROTATING(K,N); \
|
|
|
|
|
} \
|
|
|
|
|
} while (false)
|
|
|
|
|
#else
|
|
|
|
|
#define EIGEN_GEBP_ONESTEP_LOADRHS(K,N) \
|
|
|
|
|
EIGEN_GEBP_ONESTEP_LOADRHS_NONROTATING(K,N)
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
#define EIGEN_GEBP_ONESTEP(K) \
|
|
|
|
|
do { \
|
|
|
|
|
EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \
|
|
|
|
|
@@ -859,19 +1002,19 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
|
|
|
|
|
traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \
|
|
|
|
|
traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \
|
|
|
|
|
traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \
|
|
|
|
|
EIGEN_GEBP_ONESTEP_LOADRHS(K, 0); \
|
|
|
|
|
possiblyRotatingKernelHelper.template loadOrRotateRhs<K, 0>(B_0, blB); \
|
|
|
|
|
traits.madd(A0, B_0, C0, T0); \
|
|
|
|
|
traits.madd(A1, B_0, C4, T0); \
|
|
|
|
|
traits.madd(A2, B_0, C8, B_0); \
|
|
|
|
|
EIGEN_GEBP_ONESTEP_LOADRHS(K, 1); \
|
|
|
|
|
possiblyRotatingKernelHelper.template loadOrRotateRhs<K, 1>(B_0, blB); \
|
|
|
|
|
traits.madd(A0, B_0, C1, T0); \
|
|
|
|
|
traits.madd(A1, B_0, C5, T0); \
|
|
|
|
|
traits.madd(A2, B_0, C9, B_0); \
|
|
|
|
|
EIGEN_GEBP_ONESTEP_LOADRHS(K, 2); \
|
|
|
|
|
possiblyRotatingKernelHelper.template loadOrRotateRhs<K, 2>(B_0, blB); \
|
|
|
|
|
traits.madd(A0, B_0, C2, T0); \
|
|
|
|
|
traits.madd(A1, B_0, C6, T0); \
|
|
|
|
|
traits.madd(A2, B_0, C10, B_0); \
|
|
|
|
|
EIGEN_GEBP_ONESTEP_LOADRHS(K, 3); \
|
|
|
|
|
possiblyRotatingKernelHelper.template loadOrRotateRhs<K, 3>(B_0, blB); \
|
|
|
|
|
traits.madd(A0, B_0, C3 , T0); \
|
|
|
|
|
traits.madd(A1, B_0, C7, T0); \
|
|
|
|
|
traits.madd(A2, B_0, C11, B_0); \
|
|
|
|
|
@@ -904,34 +1047,10 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#undef EIGEN_GEBP_ONESTEP
|
|
|
|
|
#undef EIGEN_GEBP_ONESTEP_LOADRHS
|
|
|
|
|
#undef EIGEN_GEBP_ONESTEP_LOADRHS_NONROTATING
|
|
|
|
|
|
|
|
|
|
#if EIGEN_ARCH_PREFERS_ROTATING_KERNEL
|
|
|
|
|
if (UseRotatingKernel) {
|
|
|
|
|
#define EIGEN_GEBP_UNROTATE_RESULT(res0, res1, res2, res3) \
|
|
|
|
|
do { \
|
|
|
|
|
PacketBlock<ResPacket> resblock; \
|
|
|
|
|
resblock.packet[0] = res0; \
|
|
|
|
|
resblock.packet[1] = res1; \
|
|
|
|
|
resblock.packet[2] = res2; \
|
|
|
|
|
resblock.packet[3] = res3; \
|
|
|
|
|
ptranspose(resblock); \
|
|
|
|
|
resblock.packet[3] = protate<1>(resblock.packet[3]); \
|
|
|
|
|
resblock.packet[2] = protate<2>(resblock.packet[2]); \
|
|
|
|
|
resblock.packet[1] = protate<3>(resblock.packet[1]); \
|
|
|
|
|
ptranspose(resblock); \
|
|
|
|
|
res0 = resblock.packet[0]; \
|
|
|
|
|
res1 = resblock.packet[1]; \
|
|
|
|
|
res2 = resblock.packet[2]; \
|
|
|
|
|
res3 = resblock.packet[3]; \
|
|
|
|
|
} while (false)
|
|
|
|
|
|
|
|
|
|
EIGEN_GEBP_UNROTATE_RESULT(C0, C1, C2, C3);
|
|
|
|
|
EIGEN_GEBP_UNROTATE_RESULT(C4, C5, C6, C7);
|
|
|
|
|
EIGEN_GEBP_UNROTATE_RESULT(C8, C9, C10, C11);
|
|
|
|
|
}
|
|
|
|
|
#endif
|
|
|
|
|
possiblyRotatingKernelHelper.unrotateResult(C0, C1, C2, C3);
|
|
|
|
|
possiblyRotatingKernelHelper.unrotateResult(C4, C5, C6, C7);
|
|
|
|
|
possiblyRotatingKernelHelper.unrotateResult(C8, C9, C10, C11);
|
|
|
|
|
|
|
|
|
|
ResPacket R0, R1, R2;
|
|
|
|
|
ResPacket alphav = pset1<ResPacket>(alpha);
|
|
|
|
|
|