mirror of
https://gitlab.com/libeigen/eigen.git
synced 2026-04-10 11:34:33 +08:00
Compare commits
7 Commits
starting_n
...
starting_n
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
09f3e95447 | ||
|
|
6533187280 | ||
|
|
029f78abf0 | ||
|
|
5d47f6697d | ||
|
|
ad67705447 | ||
|
|
9fc17867e5 | ||
|
|
3999ab2dc7 |
@@ -16,31 +16,29 @@ namespace internal {
|
||||
|
||||
#ifdef __ENABLE_VECTOR_KERNELS__
|
||||
|
||||
// MICRO_12x1x4() -- zero-argument form.
// One depth step of the 12x4 kernel using "load, then advance" addressing:
//   * loads one RHS packet from rhsPackMap.pCur, advances rhsPackMap by 1*4,
//     and broadcasts pRhs[0..3] into pRhs0..pRhs3 with pset1;
//   * loads an LHS packet from lhsPackMap.pCur three times (pLhs, pLhs2,
//     pLhs3), advancing lhsPackMap by 4*1 after each load;
//   * adds the products into acc._acc1 / acc._acc2 / acc._acc3, packets 0..3.
// Expands to bare statements: the caller must have lhsPackMap, rhsPackMap,
// acc, pLhs, pLhs2, pLhs3, pRhs and pRhs0..pRhs3 in scope.
// NOTE(review): this shares its name with the MICRO_12x1x4(K) definition that
// follows; in this diff rendering it appears to be the pre-change version --
// confirm which definition is live before relying on it.
#define MICRO_12x1x4() \
|
||||
pRhs = pload<RhsPacket>(rhsPackMap.pCur); \
|
||||
rhsPackMap.advance(1*4); \
|
||||
pRhs0 = pset1<RhsPacket>(pRhs[0]); \
|
||||
pRhs1 = pset1<RhsPacket>(pRhs[1]); \
|
||||
pRhs2 = pset1<RhsPacket>(pRhs[2]); \
|
||||
pRhs3 = pset1<RhsPacket>(pRhs[3]); \
|
||||
pLhs = pload<LhsPacket>(lhsPackMap.pCur); \
|
||||
lhsPackMap.advance(4*1); \
|
||||
acc._acc1.packet[0] += pLhs*pRhs0; \
|
||||
acc._acc1.packet[1] += pLhs*pRhs1; \
|
||||
acc._acc1.packet[2] += pLhs*pRhs2; \
|
||||
acc._acc1.packet[3] += pLhs*pRhs3; \
|
||||
pLhs2 = pload<LhsPacket>(lhsPackMap.pCur); \
|
||||
lhsPackMap.advance(4*1); \
|
||||
acc._acc2.packet[0] += pLhs2*pRhs0; \
|
||||
acc._acc2.packet[1] += pLhs2*pRhs1; \
|
||||
acc._acc2.packet[2] += pLhs2*pRhs2; \
|
||||
acc._acc2.packet[3] += pLhs2*pRhs3; \
|
||||
pLhs3 = pload<LhsPacket>(lhsPackMap.pCur); \
|
||||
acc._acc3.packet[0] += pLhs3*pRhs0; \
|
||||
acc._acc3.packet[1] += pLhs3*pRhs1; \
|
||||
acc._acc3.packet[2] += pLhs3*pRhs2; \
|
||||
acc._acc3.packet[3] += pLhs3*pRhs3; \
|
||||
lhsPackMap.advance(4*1);
|
||||
// MICRO_12x1x4(K) -- offset-addressed form; K selects the depth step.
//   * issues lhsPackMap.prefetch((3*K + 16)*4) and
//     rhsPackMap.prefetch((4*K + 16)*1);
//   * loads three LHS packets from lhsPackMap.pCur + (i + 3*K)*4, i = 0..2,
//     and one RHS packet from rhsPackMap.pCur + 4*K;
//   * for each lane j = 0..3 of the RHS packet: broadcasts pRhs[j] with pset1
//     and adds pLhs*, pLhs2*, pLhs3* that broadcast into packet[j] of
//     acc._acc1, acc._acc2 and acc._acc3 respectively.
// The macro never calls advance() on either pack map; moving the cursors is
// left to the code that expands it.
// Expands to bare statements: the caller must have lhsPackMap, rhsPackMap,
// acc, pLhs, pLhs2, pLhs3, pRhs and pRhs0..pRhs3 in scope.
#define MICRO_12x1x4(K) \
|
||||
lhsPackMap.prefetch((3*K + 16)*4); \
|
||||
rhsPackMap.prefetch((4*K + 16)*1); \
|
||||
pLhs = pload<LhsPacket>(lhsPackMap.pCur + (0 + 3*K)*4); \
|
||||
pLhs2 = pload<LhsPacket>(lhsPackMap.pCur + (1 + 3*K)*4); \
|
||||
pLhs3 = pload<LhsPacket>(lhsPackMap.pCur + (2 + 3*K)*4); \
|
||||
pRhs = pload<RhsPacket>(rhsPackMap.pCur + (0 + 4*K)*1);\
|
||||
pRhs0 = pset1<RhsPacket>(pRhs[0]); \
|
||||
acc._acc1.packet[0] += pLhs*pRhs0; \
|
||||
acc._acc2.packet[0] += pLhs2*pRhs0; \
|
||||
acc._acc3.packet[0] += pLhs3*pRhs0; \
|
||||
pRhs1 = pset1<RhsPacket>(pRhs[1]); \
|
||||
acc._acc1.packet[1] += pLhs*pRhs1; \
|
||||
acc._acc2.packet[1] += pLhs2*pRhs1; \
|
||||
acc._acc3.packet[1] += pLhs3*pRhs1; \
|
||||
pRhs2 = pset1<RhsPacket>(pRhs[2]); \
|
||||
acc._acc1.packet[2] += pLhs*pRhs2; \
|
||||
acc._acc2.packet[2] += pLhs2*pRhs2; \
|
||||
acc._acc3.packet[2] += pLhs3*pRhs2; \
|
||||
pRhs3 = pset1<RhsPacket>(pRhs[3]); \
|
||||
acc._acc1.packet[3] += pLhs*pRhs3; \
|
||||
acc._acc2.packet[3] += pLhs2*pRhs3; \
|
||||
acc._acc3.packet[3] += pLhs3*pRhs3;
|
||||
|
||||
#define MICRO_8x1x4() \
|
||||
pLhs = pload<LhsPacket>(lhsPackMap.pCur); \
|
||||
@@ -76,18 +74,29 @@ namespace internal {
|
||||
lhsPackMap.advance(4*1); \
|
||||
rhsPackMap.advance(1*4);
|
||||
|
||||
#define MICRO_12x1x1() \
|
||||
#define MICRO_2x1x4() \
|
||||
pLhs = pload<LhsPacket>(lhsPackMap.pCur); \
|
||||
pRhs = pset1<RhsPacket>(*rhsPackMap.pCur); \
|
||||
acc._acc.packet[0] = pmadd(pRhs, pLhs, acc._acc.packet[0]); \
|
||||
lhsPackMap.advance(4*1); \
|
||||
pLhs = pload<LhsPacket>(lhsPackMap.pCur); \
|
||||
acc._acc.packet[1] = pmadd(pRhs, pLhs, acc._acc.packet[1]); \
|
||||
lhsPackMap.advance(4*1); \
|
||||
pLhs = pload<LhsPacket>(lhsPackMap.pCur); \
|
||||
acc._acc.packet[2] = pmadd(pRhs, pLhs, acc._acc.packet[2]); \
|
||||
lhsPackMap.advance(4*1); \
|
||||
rhsPackMap.advance(1);
|
||||
pRhs = pload<RhsPacket>(rhsPackMap.pCur); \
|
||||
pRhs0 = pset1<RhsPacket>(pRhs[0]); \
|
||||
pRhs1 = pset1<RhsPacket>(pRhs[1]); \
|
||||
pRhs = pload<RhsPacket>(rhsPackMap.pCur + 2); \
|
||||
pRhs2 = pset1<RhsPacket>(pRhs[0]); \
|
||||
pRhs3 = pset1<RhsPacket>(pRhs[1]); \
|
||||
acc._acc.packet[0] += pLhs*pRhs0; \
|
||||
acc._acc.packet[1] += pLhs*pRhs1; \
|
||||
acc._acc.packet[2] += pLhs*pRhs2; \
|
||||
acc._acc.packet[3] += pLhs*pRhs3; \
|
||||
lhsPackMap.advance(2*1); \
|
||||
rhsPackMap.advance(1*4);
|
||||
|
||||
// MICRO_12x1x1(K) -- offset-addressed form; K selects the depth step.
//   * loads three LHS packets from lhsPackMap.pCur + (i + 3*K)*4, i = 0..2;
//   * broadcasts the single RHS scalar at rhsPackMap.pCur + K with pset1;
//   * adds each LHS packet times that broadcast into acc._acc.packet[0..2].
// The macro never calls advance() on either pack map; moving the cursors is
// left to the code that expands it.
// Expands to bare statements: the caller must have lhsPackMap, rhsPackMap,
// acc, pLhs, pLhs2, pLhs3 and pRhs in scope.
#define MICRO_12x1x1(K) \
|
||||
pLhs = pload<LhsPacket>(lhsPackMap.pCur + (0 + 3*K)*4); \
|
||||
pLhs2 = pload<LhsPacket>(lhsPackMap.pCur + (1 + 3*K)*4); \
|
||||
pLhs3 = pload<LhsPacket>(lhsPackMap.pCur + (2 + 3*K)*4); \
|
||||
pRhs = pset1<RhsPacket>(*(rhsPackMap.pCur + K));\
|
||||
acc._acc.packet[0] += pLhs*pRhs; \
|
||||
acc._acc.packet[1] += pLhs2*pRhs; \
|
||||
acc._acc.packet[2] += pLhs3*pRhs;
|
||||
|
||||
#define MICRO_8x1x1() \
|
||||
pLhs = pload<LhsPacket>(lhsPackMap.pCur); \
|
||||
@@ -103,7 +112,14 @@ namespace internal {
|
||||
pLhs = pload<LhsPacket>(lhsPackMap.pCur); \
|
||||
pRhs = pset1<RhsPacket>(*rhsPackMap.pCur); \
|
||||
acc._acc += pRhs*pLhs; \
|
||||
lhsPackMap.advance(4*1); \
|
||||
lhsPackMap.advance(4); \
|
||||
rhsPackMap.advance(1);
|
||||
|
||||
// MICRO_2x1x1() -- one depth step for the 2-wide LHS case.
//   * loads one LHS packet from lhsPackMap.pCur;
//   * broadcasts the scalar at rhsPackMap.pCur with pset1;
//   * adds their product into acc._acc;
//   * advances lhsPackMap by 2 and rhsPackMap by 1.
// Expands to bare statements: the caller must have lhsPackMap, rhsPackMap,
// acc, pLhs and pRhs in scope, with LhsPacket / RhsPacket naming their types.
#define MICRO_2x1x1() \
|
||||
pLhs = pload<LhsPacket>(lhsPackMap.pCur); \
|
||||
pRhs = pset1<RhsPacket>(*rhsPackMap.pCur); \
|
||||
acc._acc += pRhs*pLhs; \
|
||||
lhsPackMap.advance(2); \
|
||||
rhsPackMap.advance(1);
|
||||
|
||||
template<int CPU, typename Scalar, typename ResScalar, typename DataMapper>
|
||||
@@ -122,6 +138,7 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 12, 1>
|
||||
_acc.packet[2] = pset1<AccPacket>(0);
|
||||
}
|
||||
|
||||
template<int LhsProgress, int DepthProgress, int RhsProgress>
|
||||
EIGEN_STRONG_INLINE void prefetch(const DataMapper&, Index, Index) {}
|
||||
|
||||
template<typename ResPacket_>
|
||||
@@ -132,15 +149,18 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 12, 1>
|
||||
_acc.packet[2] *= pAlpha;
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col)
|
||||
template<typename ResPacket_>
|
||||
EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col, ResScalar alpha, const ResPacket_& pAlpha)
|
||||
{
|
||||
asm __volatile__("#BEGIN_STORE_12x1\n\t");
|
||||
PacketBlock<ResPacket, 1> block;
|
||||
block.packet[0] = dest.template loadPacket<ResPacket>(row + 0, col) + _acc.packet[0];
|
||||
block.packet[0] = dest.template loadPacket<ResPacket>(row + 0, col) + pAlpha*_acc.packet[0];
|
||||
dest.template storePacketBlock<AccPacket, 1>(row + 0, col, block);
|
||||
block.packet[0] = dest.template loadPacket<ResPacket>(row + 4, col) + _acc.packet[1];
|
||||
block.packet[0] = dest.template loadPacket<ResPacket>(row + 4, col) + pAlpha*_acc.packet[1];
|
||||
dest.template storePacketBlock<AccPacket, 1>(row + 4, col, block);
|
||||
block.packet[0] = dest.template loadPacket<ResPacket>(row + 8, col) + _acc.packet[2];
|
||||
block.packet[0] = dest.template loadPacket<ResPacket>(row + 8, col) + pAlpha*_acc.packet[2];
|
||||
dest.template storePacketBlock<AccPacket, 1>(row + 8, col, block);
|
||||
asm __volatile__("#END_STORE_12x1\n\t");
|
||||
}
|
||||
};
|
||||
|
||||
@@ -159,6 +179,7 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 8, 1>
|
||||
_acc.packet[1] = pset1<AccPacket>(0);
|
||||
}
|
||||
|
||||
template<int LhsProgress, int DepthProgress, int RhsProgress>
|
||||
EIGEN_STRONG_INLINE void prefetch(const DataMapper&, Index, Index) {}
|
||||
|
||||
template<typename ResPacket_>
|
||||
@@ -168,12 +189,13 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 8, 1>
|
||||
_acc.packet[1] *= pAlpha;
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col)
|
||||
template<typename ResPacket_>
|
||||
EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col, ResScalar alpha, const ResPacket_& pAlpha)
|
||||
{
|
||||
PacketBlock<ResPacket, 1> block;
|
||||
block.packet[0] = dest.template loadPacket<ResPacket>(row, col) + _acc.packet[0];
|
||||
block.packet[0] = dest.template loadPacket<ResPacket>(row, col) + pAlpha*_acc.packet[0];
|
||||
dest.template storePacketBlock<AccPacket, 1>(row, col, block);
|
||||
block.packet[0] = dest.template loadPacket<ResPacket>(row + 4, col) + _acc.packet[1];
|
||||
block.packet[0] = dest.template loadPacket<ResPacket>(row + 4, col) + pAlpha*_acc.packet[1];
|
||||
dest.template storePacketBlock<AccPacket, 1>(row + 4, col, block);
|
||||
}
|
||||
};
|
||||
@@ -191,7 +213,8 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 4, 1>
|
||||
{
|
||||
_acc = pset1<AccPacket>(0);
|
||||
}
|
||||
|
||||
|
||||
template<int LhsProgress, int DepthProgress, int RhsProgress>
|
||||
EIGEN_STRONG_INLINE void prefetch(const DataMapper&, Index, Index) {}
|
||||
|
||||
template<typename ResPacket_>
|
||||
@@ -200,10 +223,43 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 4, 1>
|
||||
_acc *= pAlpha;
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col)
|
||||
template<typename ResPacket_>
|
||||
EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col, ResScalar alpha, const ResPacket_& pAlpha)
|
||||
{
|
||||
PacketBlock<ResPacket, 1> block;
|
||||
block.packet[0] = dest.template loadPacket<ResPacket>(row, col) + _acc;
|
||||
block.packet[0] = dest.template loadPacket<ResPacket>(row, col) + pAlpha*_acc;
|
||||
dest.template storePacketBlock<AccPacket, 1>(row, col, block);
|
||||
}
|
||||
};
|
||||
|
||||
template<int CPU, typename Scalar, typename ResScalar, typename DataMapper>
|
||||
struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 2, 1>
|
||||
{
|
||||
using LinearMapper = typename DataMapper::LinearMapper;
|
||||
using AccPacket = typename packet_traits<Scalar>::half;
|
||||
using ResPacket = typename packet_traits<ResScalar>::half;
|
||||
|
||||
AccPacket _acc;
|
||||
|
||||
EIGEN_STRONG_INLINE void zero()
|
||||
{
|
||||
_acc = pset1<AccPacket>(0);
|
||||
}
|
||||
|
||||
template<int LhsProgress, int DepthProgress, int RhsProgress>
|
||||
EIGEN_STRONG_INLINE void prefetch(const DataMapper&, Index, Index) {}
|
||||
|
||||
template<typename ResPacket_>
|
||||
EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha)
|
||||
{
|
||||
_acc *= pAlpha;
|
||||
}
|
||||
|
||||
template<typename ResPacket_>
|
||||
EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col, ResScalar alpha, const ResPacket_& pAlpha)
|
||||
{
|
||||
PacketBlock<ResPacket, 1> block;
|
||||
block.packet[0] = dest.template loadPacket<ResPacket>(row, col) + pAlpha*_acc;
|
||||
dest.template storePacketBlock<AccPacket, 1>(row, col, block);
|
||||
}
|
||||
};
|
||||
@@ -222,6 +278,7 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 1, 4>
|
||||
_acc = pset1<AccPacket>(0);
|
||||
}
|
||||
|
||||
template<int LhsProgress, int DepthProgress, int RhsProgress>
|
||||
EIGEN_STRONG_INLINE void prefetch(const DataMapper&, Index, Index) {}
|
||||
|
||||
template<typename ResPacket_>
|
||||
@@ -230,20 +287,27 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 1, 4>
|
||||
_acc *= pAlpha;
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col)
|
||||
template<typename ResPacket_>
|
||||
EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col, ResScalar alpha, const ResPacket_& pAlpha)
|
||||
{
|
||||
ResPacket r = dest.template gatherPacket<ResPacket>(row, col) + _acc;
|
||||
ResPacket r = dest.template gatherPacket<ResPacket>(row, col) + pAlpha*_acc;
|
||||
dest.template scatterPacket<ResPacket>(row, col, r);
|
||||
}
|
||||
};
|
||||
|
||||
//[TODO] Implement this properly
|
||||
template<int CPU, typename Scalar, typename ResScalar, typename DataMapper>
|
||||
struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 4, 4>
|
||||
struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 2, 4>
|
||||
{
|
||||
using LinearMapper = typename DataMapper::LinearMapper;
|
||||
using AccPacket = typename packet_traits<Scalar>::type;
|
||||
using AccPacket = typename packet_traits<Scalar>::half;
|
||||
using ResPacket = typename packet_traits<ResScalar>::type;
|
||||
|
||||
LinearMapper r0{nullptr};
|
||||
LinearMapper r1{nullptr};
|
||||
LinearMapper r2{nullptr};
|
||||
LinearMapper r3{nullptr};
|
||||
|
||||
PacketBlock<AccPacket, 4> _acc;
|
||||
|
||||
EIGEN_STRONG_INLINE void zero()
|
||||
@@ -254,12 +318,95 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 4, 4>
|
||||
_acc.packet[3] = pset1<AccPacket>(0);
|
||||
}
|
||||
|
||||
template<int LhsProgress, int DepthProgress, int RhsProgress>
|
||||
EIGEN_STRONG_INLINE void prefetch(const DataMapper& dest, Index row, Index col)
|
||||
{
|
||||
dest.getLinearMapper(row, col + 0).prefetch(0);
|
||||
dest.getLinearMapper(row, col + 1).prefetch(0);
|
||||
dest.getLinearMapper(row, col + 2).prefetch(0);
|
||||
dest.getLinearMapper(row, col + 3).prefetch(0);
|
||||
asm __volatile__("#BEGIN_PREFETCH_2x4\n\t");
|
||||
r0 = dest.getLinearMapper(row + 0, col + 0);
|
||||
r1 = dest.getLinearMapper(row + 0, col + 1);
|
||||
r2 = dest.getLinearMapper(row + 0, col + 2);
|
||||
r3 = dest.getLinearMapper(row + 0, col + 3);
|
||||
|
||||
#ifdef __ENABLE_PREFETCH__
|
||||
r0.prefetch(0);
|
||||
r1.prefetch(0);
|
||||
r2.prefetch(0);
|
||||
r3.prefetch(0);
|
||||
#endif
|
||||
asm __volatile__("#END_PREFETCH_2x4\n\t");
|
||||
}
|
||||
|
||||
template<typename ResPacket_>
|
||||
EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha)
|
||||
{
|
||||
// _acc.packet[0] *= pAlpha;
|
||||
// _acc.packet[1] *= pAlpha;
|
||||
// _acc.packet[2] *= pAlpha;
|
||||
// _acc.packet[3] *= pAlpha;
|
||||
}
|
||||
|
||||
template<typename ResPacket_>
|
||||
EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col, ResScalar alpha, const ResPacket_& pAlpha)
|
||||
{
|
||||
asm __volatile__("#BEGIN_STORE_2x4\n\t");
|
||||
constexpr auto PacketSize = unpacket_traits<AccPacket>::size;
|
||||
AccPacket ppAlpha = pset1<AccPacket>(alpha);
|
||||
AccPacket R00 = r0.template loadPacket<AccPacket>(0*PacketSize);
|
||||
AccPacket R01 = r1.template loadPacket<AccPacket>(0*PacketSize);
|
||||
AccPacket R02 = r2.template loadPacket<AccPacket>(0*PacketSize);
|
||||
AccPacket R03 = r3.template loadPacket<AccPacket>(0*PacketSize);
|
||||
|
||||
R00 += ppAlpha*_acc.packet[0];
|
||||
R01 += ppAlpha*_acc.packet[1];
|
||||
R02 += ppAlpha*_acc.packet[2];
|
||||
R03 += ppAlpha*_acc.packet[3];
|
||||
|
||||
r0.storePacket(0*PacketSize, R00);
|
||||
r1.storePacket(0*PacketSize, R01);
|
||||
r2.storePacket(0*PacketSize, R02);
|
||||
r3.storePacket(0*PacketSize, R03);
|
||||
asm __volatile__("#END_STORE_2x4\n\t");
|
||||
}
|
||||
};
|
||||
|
||||
template<int CPU, typename Scalar, typename ResScalar, typename DataMapper>
|
||||
struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 4, 4>
|
||||
{
|
||||
using LinearMapper = typename DataMapper::LinearMapper;
|
||||
using AccPacket = typename packet_traits<Scalar>::type;
|
||||
using ResPacket = typename packet_traits<ResScalar>::type;
|
||||
|
||||
LinearMapper r0{nullptr};
|
||||
LinearMapper r1{nullptr};
|
||||
LinearMapper r2{nullptr};
|
||||
LinearMapper r3{nullptr};
|
||||
|
||||
PacketBlock<AccPacket, 4> _acc;
|
||||
|
||||
EIGEN_STRONG_INLINE void zero()
|
||||
{
|
||||
_acc.packet[0] = pset1<AccPacket>(0);
|
||||
_acc.packet[1] = pset1<AccPacket>(0);
|
||||
_acc.packet[2] = pset1<AccPacket>(0);
|
||||
_acc.packet[3] = pset1<AccPacket>(0);
|
||||
}
|
||||
|
||||
template<int LhsProgress, int DepthProgress, int RhsProgress>
|
||||
EIGEN_STRONG_INLINE void prefetch(const DataMapper& dest, Index row, Index col)
|
||||
{
|
||||
asm __volatile__("#BEGIN_PREFETCH_4x4\n\t");
|
||||
r0 = dest.getLinearMapper(row + 0, col + 0);
|
||||
r1 = dest.getLinearMapper(row + 0, col + 1);
|
||||
r2 = dest.getLinearMapper(row + 0, col + 2);
|
||||
r3 = dest.getLinearMapper(row + 0, col + 3);
|
||||
|
||||
#ifdef __ENABLE_PREFETCH__
|
||||
r0.prefetch(0);
|
||||
r1.prefetch(0);
|
||||
r2.prefetch(0);
|
||||
r3.prefetch(0);
|
||||
#endif
|
||||
asm __volatile__("#END_PREFETCH_4x4\n\t");
|
||||
}
|
||||
|
||||
template<typename ResPacket_>
|
||||
@@ -271,19 +418,26 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 4, 4>
|
||||
_acc.packet[3] *= pAlpha;
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col)
|
||||
template<typename ResPacket_>
|
||||
EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col, ResScalar alpha, const ResPacket_& pAlpha)
|
||||
{
|
||||
asm __volatile__("#BEGIN_STORE_4x4\n\t");
|
||||
constexpr auto PacketSize = unpacket_traits<ResPacket>::size;
|
||||
ResPacket R00 = r0.template loadPacket<ResPacket>(0*PacketSize);
|
||||
ResPacket R01 = r1.template loadPacket<ResPacket>(0*PacketSize);
|
||||
ResPacket R02 = r2.template loadPacket<ResPacket>(0*PacketSize);
|
||||
ResPacket R03 = r3.template loadPacket<ResPacket>(0*PacketSize);
|
||||
|
||||
LinearMapper r0 = dest.getLinearMapper(row, col + 0);
|
||||
LinearMapper r1 = dest.getLinearMapper(row, col + 1);
|
||||
LinearMapper r2 = dest.getLinearMapper(row, col + 2);
|
||||
LinearMapper r3 = dest.getLinearMapper(row, col + 3);
|
||||
R00 += pAlpha*_acc.packet[0];
|
||||
R01 += pAlpha*_acc.packet[1];
|
||||
R02 += pAlpha*_acc.packet[2];
|
||||
R03 += pAlpha*_acc.packet[3];
|
||||
|
||||
r0.storePacket(0*PacketSize, r0.template loadPacket<ResPacket>(0*PacketSize) + _acc.packet[0]);
|
||||
r1.storePacket(0*PacketSize, r1.template loadPacket<ResPacket>(0*PacketSize) + _acc.packet[1]);
|
||||
r2.storePacket(0*PacketSize, r2.template loadPacket<ResPacket>(0*PacketSize) + _acc.packet[2]);
|
||||
r3.storePacket(0*PacketSize, r3.template loadPacket<ResPacket>(0*PacketSize) + _acc.packet[3]);
|
||||
r0.storePacket(0*PacketSize, R00);
|
||||
r1.storePacket(0*PacketSize, R01);
|
||||
r2.storePacket(0*PacketSize, R02);
|
||||
r3.storePacket(0*PacketSize, R03);
|
||||
asm __volatile__("#END_STORE_4x4\n\t");
|
||||
}
|
||||
};
|
||||
|
||||
@@ -294,6 +448,11 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 8, 4>
|
||||
using AccPacket = typename packet_traits<Scalar>::type;
|
||||
using ResPacket = typename packet_traits<ResScalar>::type;
|
||||
|
||||
LinearMapper r0{nullptr};
|
||||
LinearMapper r1{nullptr};
|
||||
LinearMapper r2{nullptr};
|
||||
LinearMapper r3{nullptr};
|
||||
|
||||
PacketBlock<AccPacket, 4> _acc1;
|
||||
PacketBlock<AccPacket, 4> _acc2;
|
||||
|
||||
@@ -310,12 +469,21 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 8, 4>
|
||||
_acc2.packet[3] = pset1<AccPacket>(0);
|
||||
}
|
||||
|
||||
template<int LhsProgress, int DepthProgress, int RhsProgress>
|
||||
EIGEN_STRONG_INLINE void prefetch(const DataMapper& dest, Index row, Index col)
|
||||
{
|
||||
dest.getLinearMapper(row + 0, col + 0).prefetch(0);
|
||||
dest.getLinearMapper(row + 0, col + 1).prefetch(0);
|
||||
dest.getLinearMapper(row + 0, col + 2).prefetch(0);
|
||||
dest.getLinearMapper(row + 0, col + 3).prefetch(0);
|
||||
constexpr Index offset = 32 / sizeof(ResScalar);
|
||||
r0 = dest.getLinearMapper(row + 0, col + 0);
|
||||
r1 = dest.getLinearMapper(row + 0, col + 1);
|
||||
r2 = dest.getLinearMapper(row + 0, col + 2);
|
||||
r3 = dest.getLinearMapper(row + 0, col + 3);
|
||||
|
||||
#ifdef __ENABLE_PREFETCH__
|
||||
r0.prefetch(offset);
|
||||
r1.prefetch(offset);
|
||||
r2.prefetch(offset);
|
||||
r3.prefetch(offset);
|
||||
#endif
|
||||
}
|
||||
|
||||
template<typename ResPacket_>
|
||||
@@ -332,24 +500,40 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 8, 4>
|
||||
_acc2.packet[3] *= pAlpha;
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col)
|
||||
template<typename ResPacket_>
|
||||
EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col, ResScalar alpha, const ResPacket_& pAlpha)
|
||||
{
|
||||
constexpr auto PacketSize = unpacket_traits<ResPacket>::size;
|
||||
|
||||
LinearMapper r0 = dest.getLinearMapper(row, col + 0);
|
||||
LinearMapper r1 = dest.getLinearMapper(row, col + 1);
|
||||
LinearMapper r2 = dest.getLinearMapper(row, col + 2);
|
||||
LinearMapper r3 = dest.getLinearMapper(row, col + 3);
|
||||
ResPacket R00 = r0.template loadPacket<ResPacket>(0*PacketSize);
|
||||
ResPacket R01 = r1.template loadPacket<ResPacket>(0*PacketSize);
|
||||
ResPacket R02 = r2.template loadPacket<ResPacket>(0*PacketSize);
|
||||
ResPacket R03 = r3.template loadPacket<ResPacket>(0*PacketSize);
|
||||
|
||||
r0.storePacket(0*PacketSize, r0.template loadPacket<ResPacket>(0*PacketSize) + _acc1.packet[0]);
|
||||
r1.storePacket(0*PacketSize, r1.template loadPacket<ResPacket>(0*PacketSize) + _acc1.packet[1]);
|
||||
r2.storePacket(0*PacketSize, r2.template loadPacket<ResPacket>(0*PacketSize) + _acc1.packet[2]);
|
||||
r3.storePacket(0*PacketSize, r3.template loadPacket<ResPacket>(0*PacketSize) + _acc1.packet[3]);
|
||||
ResPacket R10 = r0.template loadPacket<ResPacket>(1*PacketSize);
|
||||
ResPacket R11 = r1.template loadPacket<ResPacket>(1*PacketSize);
|
||||
ResPacket R12 = r2.template loadPacket<ResPacket>(1*PacketSize);
|
||||
ResPacket R13 = r3.template loadPacket<ResPacket>(1*PacketSize);
|
||||
|
||||
r0.storePacket(1*PacketSize, r0.template loadPacket<ResPacket>(1*PacketSize) + _acc2.packet[0]);
|
||||
r1.storePacket(1*PacketSize, r1.template loadPacket<ResPacket>(1*PacketSize) + _acc2.packet[1]);
|
||||
r2.storePacket(1*PacketSize, r2.template loadPacket<ResPacket>(1*PacketSize) + _acc2.packet[2]);
|
||||
r3.storePacket(1*PacketSize, r3.template loadPacket<ResPacket>(1*PacketSize) + _acc2.packet[3]);
|
||||
R00 += pAlpha*_acc1.packet[0];
|
||||
R01 += pAlpha*_acc1.packet[1];
|
||||
R02 += pAlpha*_acc1.packet[2];
|
||||
R03 += pAlpha*_acc1.packet[3];
|
||||
|
||||
R10 += pAlpha*_acc2.packet[0];
|
||||
R11 += pAlpha*_acc2.packet[1];
|
||||
R12 += pAlpha*_acc2.packet[2];
|
||||
R13 += pAlpha*_acc2.packet[3];
|
||||
|
||||
r0.storePacket(0*PacketSize, R00);
|
||||
r1.storePacket(0*PacketSize, R01);
|
||||
r2.storePacket(0*PacketSize, R02);
|
||||
r3.storePacket(0*PacketSize, R03);
|
||||
|
||||
r0.storePacket(1*PacketSize, R10);
|
||||
r1.storePacket(1*PacketSize, R11);
|
||||
r2.storePacket(1*PacketSize, R12);
|
||||
r3.storePacket(1*PacketSize, R13);
|
||||
}
|
||||
};
|
||||
|
||||
@@ -360,6 +544,11 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 12, 4>
|
||||
using AccPacket = typename packet_traits<Scalar>::type;
|
||||
using ResPacket = typename packet_traits<ResScalar>::type;
|
||||
|
||||
LinearMapper r0{nullptr};
|
||||
LinearMapper r1{nullptr};
|
||||
LinearMapper r2{nullptr};
|
||||
LinearMapper r3{nullptr};
|
||||
|
||||
PacketBlock<AccPacket, 4> _acc1;
|
||||
PacketBlock<AccPacket, 4> _acc2;
|
||||
PacketBlock<AccPacket, 4> _acc3;
|
||||
@@ -382,12 +571,22 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 12, 4>
|
||||
_acc3.packet[3] = pset1<AccPacket>(0);
|
||||
}
|
||||
|
||||
template<int LhsProgress, int DepthProgress, int RhsProgress>
|
||||
EIGEN_STRONG_INLINE void prefetch(const DataMapper& dest, Index row, Index col)
|
||||
{
|
||||
dest.getLinearMapper(row + 0, col + 0).prefetch(0);
|
||||
dest.getLinearMapper(row + 0, col + 1).prefetch(0);
|
||||
dest.getLinearMapper(row + 0, col + 2).prefetch(0);
|
||||
dest.getLinearMapper(row + 0, col + 3).prefetch(0);
|
||||
asm __volatile__("#BEGIN_PREFETCH_12x4\n\t");
|
||||
r0 = dest.getLinearMapper(row + 0, col + 0);
|
||||
r1 = dest.getLinearMapper(row + 0, col + 1);
|
||||
r2 = dest.getLinearMapper(row + 0, col + 2);
|
||||
r3 = dest.getLinearMapper(row + 0, col + 3);
|
||||
|
||||
#ifdef __ENABLE_PREFETCH__
|
||||
r0.prefetch(0);
|
||||
r1.prefetch(0);
|
||||
r2.prefetch(0);
|
||||
r3.prefetch(0);
|
||||
#endif
|
||||
asm __volatile__("#END_PREFETCH_12x4\n\t");
|
||||
}
|
||||
|
||||
template<typename ResPacket_>
|
||||
@@ -409,29 +608,59 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 12, 4>
|
||||
_acc3.packet[3] *= pAlpha;
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col)
|
||||
template<typename ResPacket_>
|
||||
EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col, ResScalar alpha, const ResPacket_& pAlpha)
|
||||
{
|
||||
constexpr auto PacketSize = unpacket_traits<ResPacket>::size;
|
||||
|
||||
LinearMapper r0 = dest.getLinearMapper(row, col + 0);
|
||||
LinearMapper r1 = dest.getLinearMapper(row, col + 1);
|
||||
LinearMapper r2 = dest.getLinearMapper(row, col + 2);
|
||||
LinearMapper r3 = dest.getLinearMapper(row, col + 3);
|
||||
asm __volatile__("#BEGIN_STORE_12x4\n\t");
|
||||
|
||||
r0.storePacket(0*PacketSize, r0.template loadPacket<ResPacket>(0*PacketSize) + _acc1.packet[0]);
|
||||
r1.storePacket(0*PacketSize, r1.template loadPacket<ResPacket>(0*PacketSize) + _acc1.packet[1]);
|
||||
r2.storePacket(0*PacketSize, r2.template loadPacket<ResPacket>(0*PacketSize) + _acc1.packet[2]);
|
||||
r3.storePacket(0*PacketSize, r3.template loadPacket<ResPacket>(0*PacketSize) + _acc1.packet[3]);
|
||||
ResPacket R00 = r0.template loadPacket<ResPacket>(0*PacketSize);
|
||||
ResPacket R01 = r1.template loadPacket<ResPacket>(0*PacketSize);
|
||||
ResPacket R02 = r2.template loadPacket<ResPacket>(0*PacketSize);
|
||||
ResPacket R03 = r3.template loadPacket<ResPacket>(0*PacketSize);
|
||||
|
||||
r0.storePacket(1*PacketSize, r0.template loadPacket<ResPacket>(1*PacketSize) + _acc2.packet[0]);
|
||||
r1.storePacket(1*PacketSize, r1.template loadPacket<ResPacket>(1*PacketSize) + _acc2.packet[1]);
|
||||
r2.storePacket(1*PacketSize, r2.template loadPacket<ResPacket>(1*PacketSize) + _acc2.packet[2]);
|
||||
r3.storePacket(1*PacketSize, r3.template loadPacket<ResPacket>(1*PacketSize) + _acc2.packet[3]);
|
||||
ResPacket R10 = r0.template loadPacket<ResPacket>(1*PacketSize);
|
||||
ResPacket R11 = r1.template loadPacket<ResPacket>(1*PacketSize);
|
||||
ResPacket R12 = r2.template loadPacket<ResPacket>(1*PacketSize);
|
||||
ResPacket R13 = r3.template loadPacket<ResPacket>(1*PacketSize);
|
||||
|
||||
ResPacket R20 = r0.template loadPacket<ResPacket>(2*PacketSize);
|
||||
ResPacket R21 = r1.template loadPacket<ResPacket>(2*PacketSize);
|
||||
ResPacket R22 = r2.template loadPacket<ResPacket>(2*PacketSize);
|
||||
ResPacket R23 = r3.template loadPacket<ResPacket>(2*PacketSize);
|
||||
|
||||
r0.storePacket(2*PacketSize, r0.template loadPacket<ResPacket>(2*PacketSize) + _acc3.packet[0]);
|
||||
r1.storePacket(2*PacketSize, r1.template loadPacket<ResPacket>(2*PacketSize) + _acc3.packet[1]);
|
||||
r2.storePacket(2*PacketSize, r2.template loadPacket<ResPacket>(2*PacketSize) + _acc3.packet[2]);
|
||||
r3.storePacket(2*PacketSize, r3.template loadPacket<ResPacket>(2*PacketSize) + _acc3.packet[3]);
|
||||
R00 += pAlpha*_acc1.packet[0];
|
||||
R01 += pAlpha*_acc1.packet[1];
|
||||
R02 += pAlpha*_acc1.packet[2];
|
||||
R03 += pAlpha*_acc1.packet[3];
|
||||
|
||||
R10 += pAlpha*_acc2.packet[0];
|
||||
R11 += pAlpha*_acc2.packet[1];
|
||||
R12 += pAlpha*_acc2.packet[2];
|
||||
R13 += pAlpha*_acc2.packet[3];
|
||||
|
||||
R20 += pAlpha*_acc3.packet[0];
|
||||
R21 += pAlpha*_acc3.packet[1];
|
||||
R22 += pAlpha*_acc3.packet[2];
|
||||
R23 += pAlpha*_acc3.packet[3];
|
||||
|
||||
r0.storePacket(0*PacketSize, R00);
|
||||
r1.storePacket(0*PacketSize, R01);
|
||||
r2.storePacket(0*PacketSize, R02);
|
||||
r3.storePacket(0*PacketSize, R03);
|
||||
|
||||
r0.storePacket(1*PacketSize, R10);
|
||||
r1.storePacket(1*PacketSize, R11);
|
||||
r2.storePacket(1*PacketSize, R12);
|
||||
r3.storePacket(1*PacketSize, R13);
|
||||
|
||||
r0.storePacket(2*PacketSize, R20);
|
||||
r1.storePacket(2*PacketSize, R21);
|
||||
r2.storePacket(2*PacketSize, R22);
|
||||
r3.storePacket(2*PacketSize, R23);
|
||||
|
||||
asm __volatile__("#END_STORE_12x4\n\t");
|
||||
}
|
||||
};
|
||||
|
||||
@@ -484,14 +713,14 @@ struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap,
|
||||
|
||||
#if __UNROLL__ == 8
|
||||
#ifdef __ENABLE_PREFETCH__
|
||||
prefetch(rhsPackMap.pCur + (48+0));
|
||||
rhsPackMap.prefetch(48+0);
|
||||
#endif
|
||||
MICRO_8x1x4();
|
||||
MICRO_8x1x4();
|
||||
MICRO_8x1x4();
|
||||
MICRO_8x1x4();
|
||||
#ifdef __ENABLE_PREFETCH__
|
||||
prefetch(rhsPackMap.pCur + (48+16));
|
||||
rhsPackMap.prefetch(48+16);
|
||||
#endif
|
||||
MICRO_8x1x4();
|
||||
MICRO_8x1x4();
|
||||
@@ -525,21 +754,23 @@ struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap,
|
||||
|
||||
#if __UNROLL__ == 8
|
||||
#ifdef __ENABLE_PREFETCH__
|
||||
prefetch(rhsPackMap.pCur);
|
||||
rhsPackMap.prefetch(0);
|
||||
#endif
|
||||
MICRO_12x1x4();
|
||||
MICRO_12x1x4();
|
||||
MICRO_12x1x4();
|
||||
MICRO_12x1x4();
|
||||
MICRO_12x1x4();
|
||||
MICRO_12x1x4();
|
||||
MICRO_12x1x4();
|
||||
MICRO_12x1x4();
|
||||
MICRO_12x1x4(0);
|
||||
MICRO_12x1x4(1);
|
||||
MICRO_12x1x4(2);
|
||||
MICRO_12x1x4(3);
|
||||
MICRO_12x1x4(4);
|
||||
MICRO_12x1x4(5);
|
||||
MICRO_12x1x4(6);
|
||||
MICRO_12x1x4(7);
|
||||
lhsPackMap.advance(12*__UNROLL__);
|
||||
rhsPackMap.advance(4*__UNROLL__);
|
||||
#else
|
||||
MICRO_12x1x4();
|
||||
MICRO_12x1x4();
|
||||
MICRO_12x1x4();
|
||||
MICRO_12x1x4();
|
||||
MICRO_12x1x4(0);
|
||||
MICRO_12x1x4(1);
|
||||
MICRO_12x1x4(2);
|
||||
MICRO_12x1x4(3);
|
||||
#endif
|
||||
asm __volatile__("#END_NEON_MICROKERNEL_12x8x4\n\t");
|
||||
};
|
||||
@@ -561,7 +792,10 @@ struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap,
|
||||
LhsPacket pLhs, pLhs2, pLhs3;
|
||||
RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3;
|
||||
|
||||
MICRO_12x1x4();
|
||||
MICRO_12x1x4(0);
|
||||
|
||||
lhsPackMap.advance(12);
|
||||
rhsPackMap.advance(4);
|
||||
|
||||
asm __volatile__("#END_NEON_MICROKERNEL_12x1x4\n\t");
|
||||
};
|
||||
@@ -611,6 +845,28 @@ struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap,
|
||||
};
|
||||
};
|
||||
|
||||
// Micro-kernel specialisation for the trailing template arguments <2, 1, 4>.
// operator() declares the half-width packet temporaries and then expands
// MICRO_2x1x4(), which works directly on lhsPackMap, rhsPackMap, acc and
// those temporaries by name.
// rowIdx, colIdx and depthIdx are not referenced in the statements written
// here (NOTE(review): confirm the macro body does not use them either).
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
|
||||
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 2, 1, 4>
|
||||
{
|
||||
EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
|
||||
RhsPackMap& rhsPackMap,
|
||||
Index rowIdx, Index colIdx, Index depthIdx,
|
||||
Accumulator& acc)
|
||||
{
|
||||
// Both operands use the half-width packet type.
using LhsPacket = typename packet_traits<LhsScalar>::half;
|
||||
using RhsPacket = typename packet_traits<RhsScalar>::half;
|
||||
|
||||
// Assembler comment marking where this kernel starts in the generated listing.
asm __volatile__("#BEGIN_NEON_MICROKERNEL_2x1x4\n\t");
|
||||
|
||||
// Temporaries the macro assigns to; the names must match the macro body.
LhsPacket pLhs;
|
||||
RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3;
|
||||
|
||||
MICRO_2x1x4();
|
||||
|
||||
// Assembler comment marking where this kernel ends.
asm __volatile__("#END_NEON_MICROKERNEL_2x1x4\n\t");
|
||||
};
|
||||
};
|
||||
|
||||
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
|
||||
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 12, __UNROLL__, 1>
|
||||
{
|
||||
@@ -622,21 +878,23 @@ struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap,
|
||||
using LhsPacket = typename packet_traits<LhsScalar>::type;
|
||||
using RhsPacket = typename packet_traits<RhsScalar>::type;
|
||||
|
||||
LhsPacket pLhs;
|
||||
LhsPacket pLhs, pLhs2, pLhs3;
|
||||
RhsPacket pRhs;
|
||||
|
||||
asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x1x1\n\t");
|
||||
asm __volatile__("#BEGIN_NEON_MICROKERNEL_12x8x1\n\t");
|
||||
|
||||
MICRO_12x1x1();
|
||||
MICRO_12x1x1();
|
||||
MICRO_12x1x1();
|
||||
MICRO_12x1x1();
|
||||
MICRO_12x1x1();
|
||||
MICRO_12x1x1();
|
||||
MICRO_12x1x1();
|
||||
MICRO_12x1x1();
|
||||
MICRO_12x1x1(0);
|
||||
MICRO_12x1x1(1);
|
||||
MICRO_12x1x1(2);
|
||||
MICRO_12x1x1(3);
|
||||
MICRO_12x1x1(4);
|
||||
MICRO_12x1x1(5);
|
||||
MICRO_12x1x1(6);
|
||||
MICRO_12x1x1(7);
|
||||
lhsPackMap.advance(12*__UNROLL__);
|
||||
rhsPackMap.advance(1*__UNROLL__);
|
||||
|
||||
asm __volatile__("#END_NEON_MICROKERNEL_4x1x1\n\t");
|
||||
asm __volatile__("#END_NEON_MICROKERNEL_12x8x1\n\t");
|
||||
};
|
||||
};
|
||||
|
||||
@@ -651,14 +909,16 @@ struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap,
|
||||
using LhsPacket = typename packet_traits<LhsScalar>::type;
|
||||
using RhsPacket = typename packet_traits<RhsScalar>::type;
|
||||
|
||||
LhsPacket pLhs;
|
||||
LhsPacket pLhs, pLhs2, pLhs3;
|
||||
RhsPacket pRhs;
|
||||
|
||||
asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x1x1\n\t");
|
||||
asm __volatile__("#BEGIN_NEON_MICROKERNEL_12x1x1\n\t");
|
||||
|
||||
MICRO_12x1x1();
|
||||
MICRO_12x1x1(0);
|
||||
lhsPackMap.advance(12);
|
||||
rhsPackMap.advance(1);
|
||||
|
||||
asm __volatile__("#END_NEON_MICROKERNEL_4x1x1\n\t");
|
||||
asm __volatile__("#END_NEON_MICROKERNEL_12x1x1\n\t");
|
||||
};
|
||||
};
|
||||
|
||||
@@ -764,6 +1024,28 @@ struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap,
|
||||
};
|
||||
};
|
||||
|
||||
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
|
||||
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 2, 1, 1>
|
||||
{
|
||||
EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
|
||||
RhsPackMap& rhsPackMap,
|
||||
Index rowIdx, Index colIdx, Index depthIdx,
|
||||
Accumulator& acc)
|
||||
{
|
||||
using LhsPacket = typename packet_traits<LhsScalar>::half;
|
||||
using RhsPacket = typename packet_traits<RhsScalar>::half;
|
||||
|
||||
asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x1x1\n\t");
|
||||
|
||||
LhsPacket pLhs;
|
||||
RhsPacket pRhs;
|
||||
|
||||
MICRO_2x1x1();
|
||||
|
||||
asm __volatile__("#END_NEON_MICROKERNEL_4x1x1\n\t");
|
||||
};
|
||||
};
|
||||
|
||||
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
|
||||
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 1, 1, 4>
|
||||
{
|
||||
|
||||
@@ -23,7 +23,7 @@ namespace internal {
|
||||
#endif
|
||||
|
||||
template<int Architecture, int CPU, typename LhsScalar, typename RhsScalar>
|
||||
constexpr int SHAPES_COUNT = 14;
|
||||
constexpr int SHAPES_COUNT = 16;
|
||||
|
||||
constexpr int SHAPES_DIMENSION = 6;
|
||||
constexpr int SHAPES_LHS_DIMENSION = 0;
|
||||
@@ -44,23 +44,35 @@ constexpr int PACK_SHAPES_DIMENSION = 3;
|
||||
constexpr int PACK_SHAPES_POINTER = 2;
|
||||
constexpr int PACK_SHAPES_END = -1;
|
||||
|
||||
// Compile-time packet sizes for Scalar, expressed as enum constants so they can
// be used in constant expressions such as the SHAPES table.
template<typename Scalar>
|
||||
struct PacketMultiples
|
||||
{
|
||||
enum
|
||||
{
|
||||
// Number of scalars held by the `half` packet type of Scalar.
half = unpacket_traits<typename packet_traits<Scalar>::half>::size,
|
||||
// NOTE(review): this is the same expression as `half`, so `quarter` currently
// equals `half`. Confirm whether a genuinely smaller packet was intended.
quarter = unpacket_traits<typename packet_traits<Scalar>::half>::size // Is this used?
|
||||
};
|
||||
};
|
||||
|
||||
// lhs_progress x depth_progress x rhs_progress (depth_progress > 1 matrix ops) x pointer to next rhs_progress on the shapes map
|
||||
template<int Architecture, int CPU, typename LhsScalar, typename RhsScalar>
|
||||
constexpr int SHAPES[SHAPES_COUNT<Architecture, CPU, LhsScalar,RhsScalar>][SHAPES_DIMENSION] =
|
||||
{ /* 00 */{ 1, 1,1,SHAPES_POINTER_END, SHAPES_POINTER_END, SHAPES_POINTER_END},
|
||||
/* 01 */{1*packet_traits<RhsScalar>::size, 1,1, 0, 0, SHAPES_POINTER_END},
|
||||
/* 02 */{1*packet_traits<RhsScalar>::size,__UNROLL__,1, 0, 0, 1},
|
||||
/* 03 */{2*packet_traits<RhsScalar>::size, 1,1, 0, 2, SHAPES_POINTER_END},
|
||||
/* 04 */{2*packet_traits<RhsScalar>::size,__UNROLL__,1, 0, 2, 3},
|
||||
/* 05 */{3*packet_traits<RhsScalar>::size, 1,1, 0, 4, SHAPES_POINTER_END},
|
||||
/* 06 */{3*packet_traits<RhsScalar>::size,__UNROLL__,1, 0, 4, 5},
|
||||
/* 07 */{ 1, 1,4, 6, SHAPES_POINTER_END, SHAPES_POINTER_END},
|
||||
/* 08 */{1*packet_traits<RhsScalar>::size, 1,4, 6, 7, SHAPES_POINTER_END},
|
||||
/* 09 */{1*packet_traits<RhsScalar>::size,__UNROLL__,4, 6, 7, 8},
|
||||
/* 10 */{2*packet_traits<RhsScalar>::size, 1,4, 6, 9, SHAPES_POINTER_END},
|
||||
/* 11 */{2*packet_traits<RhsScalar>::size,__UNROLL__,4, 6, 9, 10},
|
||||
/* 12 */{3*packet_traits<RhsScalar>::size, 1,4, 6, 11, SHAPES_POINTER_END},
|
||||
/* 13 */{3*packet_traits<RhsScalar>::size,__UNROLL__,4, 6, 11, 12}};
|
||||
/* 01 */{PacketMultiples<RhsScalar>::half, 1,1, 0, 0, SHAPES_POINTER_END},
|
||||
/* 02 */{1*packet_traits<RhsScalar>::size, 1,1, 0, 1, SHAPES_POINTER_END},
|
||||
/* 03 */{1*packet_traits<RhsScalar>::size,__UNROLL__,1, 0, 1, 2},
|
||||
/* 04 */{2*packet_traits<RhsScalar>::size, 1,1, 0, 3, SHAPES_POINTER_END},
|
||||
/* 05 */{2*packet_traits<RhsScalar>::size,__UNROLL__,1, 0, 3, 4},
|
||||
/* 06 */{3*packet_traits<RhsScalar>::size, 1,1, 0, 5, SHAPES_POINTER_END},
|
||||
/* 07 */{3*packet_traits<RhsScalar>::size,__UNROLL__,1, 0, 5, 6},
|
||||
/* 08 */{ 1, 1,4, 7, SHAPES_POINTER_END, SHAPES_POINTER_END},
|
||||
/* 09 */{PacketMultiples<RhsScalar>::half, 1,4, 7, 8, SHAPES_POINTER_END},
|
||||
/* 10 */{1*packet_traits<RhsScalar>::size, 1,4, 7, 9, SHAPES_POINTER_END},
|
||||
/* 11 */{1*packet_traits<RhsScalar>::size,__UNROLL__,4, 7, 9, 10},
|
||||
/* 12 */{2*packet_traits<RhsScalar>::size, 1,4, 7, 11, SHAPES_POINTER_END},
|
||||
/* 13 */{2*packet_traits<RhsScalar>::size,__UNROLL__,4, 7, 11, 12},
|
||||
/* 14 */{3*packet_traits<RhsScalar>::size, 1,4, 7, 13, SHAPES_POINTER_END},
|
||||
/* 15 */{3*packet_traits<RhsScalar>::size,__UNROLL__,4, 7, 13, 14}};
|
||||
|
||||
// d1progress x d2progress
|
||||
template<int Architecture, int CPU, typename Scalar, bool isLhs>
|
||||
@@ -215,8 +227,17 @@ struct PackMap
|
||||
// Starts both the base pointer and the cursor at `base` and stores d2Size,
// stride and offset unchanged.
PackMap(const Scalar *base, Index d2Size, Index stride, Index offset) : pBase(base), pCur(base), d2Size(d2Size), stride(stride), offset(offset) {}
|
||||
|
||||
// Rewinds the cursor to the current base pointer.
EIGEN_STRONG_INLINE void resetCur() { pCur = pBase; }
|
||||
// Makes the current cursor position the new base, so a later resetCur()
// returns here.
EIGEN_STRONG_INLINE void updateBase() { pBase = pCur; }
|
||||
// Positions the cursor at pBase plus the offset returned by
// pmc.getPosition(p1, d2Size). `pmc` is declared outside this view;
// presumably it maps an index along the first dimension to an element offset
// in the packed buffer - confirm.
EIGEN_STRONG_INLINE void moveTo(Index p1) { pCur = pBase + pmc.getPosition(p1, d2Size); }
|
||||
EIGEN_STRONG_INLINE void advance(int progress) { pCur += progress; }
|
||||
EIGEN_STRONG_INLINE void advance(Index progress) { pCur += progress; }
|
||||
|
||||
// Issues a prefetch for the address `amnt` elements past the cursor.
// Compiles to an empty function unless __ENABLE_PREFETCH__ is defined.
// D1Progress / D2Progress are not used in this body.
template<int D1Progress=-1, int D2Progress=-1>
|
||||
EIGEN_STRONG_INLINE void prefetch(Index amnt)
|
||||
{
|
||||
#ifdef __ENABLE_PREFETCH__
|
||||
internal::prefetch(pCur + amnt);
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
template<int Architecture, int CPU, typename Scalar, typename ResScalar, typename DataMapper, int M, int N>
|
||||
@@ -235,6 +256,7 @@ struct Accumulator
|
||||
}
|
||||
}
|
||||
|
||||
// Prefetch hook for the destination tile. This version has an empty body and
// ignores all of its arguments and template parameters.
template<int LhsProgress=-1, int DepthProgress=-1, int RhsProgress=-1>
|
||||
EIGEN_STRONG_INLINE void prefetch(const DataMapper&, Index, Index) {}
|
||||
|
||||
template<typename ResPacket>
|
||||
@@ -249,13 +271,14 @@ struct Accumulator
|
||||
}
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col)
|
||||
template<typename ResPacket>
|
||||
EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col, ResScalar alpha, const ResPacket& pAlpha)
|
||||
{
|
||||
for(auto i = 0; i < M; i++)
|
||||
{
|
||||
for(auto j = 0; j < N; j++)
|
||||
{
|
||||
dest(row + i, col + j) += dt[i][j];
|
||||
dest(row + i, col + j) += alpha*dt[i][j];
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -269,26 +292,6 @@ struct MicroKernel
|
||||
Index rowIdx, Index colIdx, Index depthIdx,
|
||||
Accumulator& acc)
|
||||
{
|
||||
#ifdef __DEBUG__
|
||||
std::cout << "Kernel " << M << " x " << K << " x " << N << " @ " << rowIdx << ", " << depthIdx << ", " << colIdx << std::endl;
|
||||
std::cout << "LHS ";
|
||||
for(auto i = 0; i < M; i++)
|
||||
{
|
||||
for(auto j = 0; j < K; j++)
|
||||
{
|
||||
std::cout << lhsPackMap.pCur[i*K + j] << " ";
|
||||
}
|
||||
}
|
||||
std::cout << std::endl << "RHS ";
|
||||
for(auto i = 0; i < K; i++)
|
||||
{
|
||||
for(auto j = 0; j < N; j++)
|
||||
{
|
||||
std::cout << rhsPackMap.pCur[i*N + j] << " ";
|
||||
}
|
||||
}
|
||||
std::cout << std::endl;
|
||||
#endif
|
||||
const RhsScalar *pRhs = rhsPackMap.pCur;
|
||||
for(auto i = 0; i < N; i++)
|
||||
{
|
||||
@@ -303,50 +306,69 @@ struct MicroKernel
|
||||
};
|
||||
};
|
||||
|
||||
template<int Architecture, int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename ResPacket, typename DataMapper, int RHS_SHAPE_IDX, int LHS_SHAPE_IDX, int IDX>
|
||||
template<int Architecture, int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename ResPacket, typename DataMapper, typename AccumulatorType, int RHS_SHAPE_IDX, int LHS_SHAPE_IDX, int IDX>
|
||||
struct DepthLoopStruct
|
||||
{
|
||||
static constexpr auto PREVIOUS = SHAPES<Architecture, CPU, LhsScalar, RhsScalar>[IDX][SHAPES_DEP_POINTER];
|
||||
|
||||
DepthLoopStruct<Architecture, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, ResPacket, DataMapper, RHS_SHAPE_IDX, LHS_SHAPE_IDX, PREVIOUS> depthLS;
|
||||
DepthLoopStruct<Architecture, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, ResPacket, DataMapper, AccumulatorType, RHS_SHAPE_IDX, LHS_SHAPE_IDX, PREVIOUS> depthLS;
|
||||
|
||||
EIGEN_STRONG_INLINE void operator()(Index rowIdx, Index colIdx, Index depthIdx, const DataMapper& res,
|
||||
EIGEN_STRONG_INLINE void operator()(Index rowIdx, Index colIdx, Index depthIdx, const DataMapper& res, AccumulatorType& acc,
|
||||
Index rows, Index depth, Index cols, ResScalar alpha, const ResPacket& pAlpha, LhsPackMap& lhsPackMap, RhsPackMap& rhsPackMap)
|
||||
{
|
||||
constexpr auto rhsProgress = SHAPES<Architecture, CPU, LhsScalar, RhsScalar>[RHS_SHAPE_IDX][SHAPES_RHS_DIMENSION];
|
||||
constexpr auto lhsProgress = SHAPES<Architecture, CPU, LhsScalar, RhsScalar>[LHS_SHAPE_IDX][SHAPES_LHS_DIMENSION];
|
||||
constexpr auto depthProgress = SHAPES<Architecture, CPU, LhsScalar, RhsScalar>[IDX][SHAPES_DEP_DIMENSION];
|
||||
|
||||
#ifdef __ENABLE_PREFETCH__
|
||||
prefetch(lhsPackMap.pCur);
|
||||
prefetch(rhsPackMap.pCur);
|
||||
#endif
|
||||
|
||||
typedef Accumulator<Architecture, CPU, AccScalar, ResScalar, DataMapper, lhsProgress, rhsProgress> AccumulatorType;
|
||||
//typedef Accumulator<Architecture, CPU, AccScalar, ResScalar, DataMapper, lhsProgress, rhsProgress> AccumulatorType;
|
||||
|
||||
MicroKernel<Architecture, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, AccumulatorType, lhsProgress, depthProgress, rhsProgress> mkt;
|
||||
AccumulatorType acc;
|
||||
acc.zero();
|
||||
//AccumulatorType acc;
|
||||
|
||||
#ifdef __ENABLE_PREFETCH__
|
||||
acc.prefetch(res, rowIdx, colIdx);
|
||||
#endif
|
||||
//acc.zero();
|
||||
|
||||
acc.template prefetch<lhsProgress, depthProgress, rhsProgress>(res, rowIdx, colIdx);
|
||||
|
||||
lhsPackMap.template prefetch<lhsProgress, depthProgress>(0);
|
||||
rhsPackMap.template prefetch<rhsProgress, depthProgress>(0);
|
||||
|
||||
for(; depthIdx + depthProgress <= depth; depthIdx+=depthProgress)
|
||||
{
|
||||
#ifdef __DEBUG__
|
||||
auto M = lhsProgress;
|
||||
auto K = depthProgress;
|
||||
auto N = rhsProgress;
|
||||
std::cout << "Kernel " << M << " x " << K << " x " << N << " @ " << rowIdx << ", " << depthIdx << ", " << colIdx << std::endl;
|
||||
std::cout << "LHS ";
|
||||
for(auto i = 0; i < M; i++)
|
||||
{
|
||||
for(auto j = 0; j < K; j++)
|
||||
{
|
||||
std::cout << lhsPackMap.pCur[i*K + j] << " ";
|
||||
}
|
||||
}
|
||||
std::cout << std::endl << "RHS ";
|
||||
for(auto i = 0; i < K; i++)
|
||||
{
|
||||
for(auto j = 0; j < N; j++)
|
||||
{
|
||||
std::cout << rhsPackMap.pCur[i*N + j] << " ";
|
||||
}
|
||||
}
|
||||
std::cout << std::endl;
|
||||
#endif
|
||||
mkt(lhsPackMap, rhsPackMap, rowIdx, colIdx, depthIdx, acc);
|
||||
}
|
||||
acc.scale(alpha, pAlpha);
|
||||
acc.store(res, rowIdx, colIdx);
|
||||
//acc.store(res, rowIdx, colIdx, alpha, pAlpha);
|
||||
|
||||
depthLS(rowIdx, colIdx, depthIdx, res, rows, depth, cols, alpha, pAlpha, lhsPackMap, rhsPackMap);
|
||||
depthLS(rowIdx, colIdx, depthIdx, res, acc, rows, depth, cols, alpha, pAlpha, lhsPackMap, rhsPackMap);
|
||||
}
|
||||
};
|
||||
|
||||
template<int Architecture, int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename ResPacket, typename DataMapper, int RHS_SHAPE_IDX, int LHS_SHAPE_IDX>
|
||||
struct DepthLoopStruct<Architecture, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, ResPacket, DataMapper, RHS_SHAPE_IDX, LHS_SHAPE_IDX, -1>
|
||||
template<int Architecture, int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename ResPacket, typename DataMapper, typename AccumulatorType, int RHS_SHAPE_IDX, int LHS_SHAPE_IDX>
|
||||
struct DepthLoopStruct<Architecture, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, ResPacket, DataMapper, AccumulatorType, RHS_SHAPE_IDX, LHS_SHAPE_IDX, -1>
|
||||
{
|
||||
EIGEN_STRONG_INLINE void operator()(Index, Index, Index, const DataMapper&,
|
||||
EIGEN_STRONG_INLINE void operator()(Index, Index, Index, const DataMapper&, AccumulatorType&,
|
||||
Index, Index, Index, ResScalar, const ResPacket&, LhsPackMap&, RhsPackMap&) {}
|
||||
};
|
||||
|
||||
@@ -361,14 +383,22 @@ struct LhsLoopStruct
|
||||
{
|
||||
constexpr auto lhsProgress = SHAPES<Architecture, CPU, LhsScalar, RhsScalar>[IDX][SHAPES_LHS_DIMENSION];
|
||||
constexpr auto rhsProgress = SHAPES<Architecture, CPU, LhsScalar, RhsScalar>[IDX][SHAPES_RHS_DIMENSION];
|
||||
DepthLoopStruct<Architecture, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, ResPacket, DataMapper, RHS_SHAPE_IDX, IDX, IDX> depthLS;
|
||||
|
||||
typedef Accumulator<Architecture, CPU, AccScalar, ResScalar, DataMapper, lhsProgress, rhsProgress> AccumulatorType;
|
||||
|
||||
DepthLoopStruct<Architecture, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, ResPacket, DataMapper, AccumulatorType, RHS_SHAPE_IDX, IDX, IDX> depthLS;
|
||||
|
||||
//rhsPackMap.resetCur();
|
||||
for(;rowIdx + lhsProgress <= rows; rowIdx+=lhsProgress)
|
||||
{
|
||||
lhsPackMap.moveTo(rowIdx);
|
||||
rhsPackMap.moveTo(colIdx);
|
||||
//prefetch(lhsPackMap.pCur + 2*lhsProgress);
|
||||
//prefetch(rhsPackMap.pCur + 2*rhsProgress);
|
||||
depthLS(rowIdx, colIdx, 0, res, rows, depth, cols, alpha, pAlpha, lhsPackMap, rhsPackMap);
|
||||
rhsPackMap.resetCur();
|
||||
AccumulatorType acc;
|
||||
acc.zero();
|
||||
//lhsPackMap.moveTo(rowIdx);
|
||||
//rhsPackMap.moveTo(colIdx);
|
||||
|
||||
depthLS(rowIdx, colIdx, 0, res, acc, rows, depth, cols, alpha, pAlpha, lhsPackMap, rhsPackMap);
|
||||
acc.store(res, rowIdx, colIdx, alpha, pAlpha);
|
||||
}
|
||||
lhsLS(rowIdx, colIdx, res, rows, depth, cols, alpha, pAlpha, lhsPackMap, rhsPackMap);
|
||||
}
|
||||
@@ -395,7 +425,9 @@ struct RhsLoopStruct
|
||||
for(;colIdx + rhsProgress <= cols; colIdx+=rhsProgress)
|
||||
{
|
||||
LhsLoopStruct<Architecture, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, ResPacket, DataMapper, IDX, IDX> lhsLS;
|
||||
lhsPackMap.resetCur();
|
||||
lhsLS(0, colIdx, res, rows, depth, cols, alpha, pAlpha, lhsPackMap, rhsPackMap);
|
||||
rhsPackMap.updateBase();
|
||||
}
|
||||
rhsLS(colIdx, res, rows, depth, cols, alpha, pAlpha, lhsPackMap, rhsPackMap);
|
||||
}
|
||||
|
||||
@@ -15,7 +15,8 @@ void set(MatrixXf& A, int m, int n, int id, int digits)
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
#ifdef __DEBUG__
|
||||
int m = 32, k = 32, n = 32, max = std::max(std::max(m,k),n);
|
||||
int m = std::atoi(argv[1]), k = std::atoi(argv[1]), n = std::atoi(argv[1]);
|
||||
int max = std::max(std::max(m,k),n);
|
||||
MatrixXf A = MatrixXf::Zero(m, k);
|
||||
MatrixXf B = MatrixXf::Zero(k, n);
|
||||
MatrixXf C = MatrixXf::Zero(m, n);
|
||||
@@ -24,10 +25,17 @@ int main(int argc, char* argv[])
|
||||
set(A, m, k, 1, static_cast<int>(std::log10(max)) + 1);
|
||||
set(B, k, n, 2, static_cast<int>(std::log10(max)) + 1);
|
||||
|
||||
C = A*B;
|
||||
for(auto i = 0; i < 2; i++)
|
||||
C = A*B;
|
||||
|
||||
#ifdef __DEBUG_SHOW_INPUTS__
|
||||
std::cout << A << std::endl;
|
||||
std::cout << B << std::endl;
|
||||
#endif
|
||||
|
||||
#ifdef __DEBUG_SHOW_RESULT__
|
||||
std::cout << C << std::endl;
|
||||
#endif
|
||||
|
||||
std::cout << std::endl;
|
||||
|
||||
@@ -47,24 +55,23 @@ int main(int argc, char* argv[])
|
||||
}
|
||||
}
|
||||
}
|
||||
#ifdef __DEBUG_SHOW_RESULT__
|
||||
std::cout << D << std::endl;
|
||||
#endif
|
||||
#else
|
||||
if(argc < 3)
|
||||
if(argc < 5)
|
||||
{
|
||||
std::cout << "Wrong number of arguments." << std::endl;
|
||||
return -1;
|
||||
}
|
||||
|
||||
int sz = std::atoi(argv[1]);
|
||||
int m = sz, k = sz, n = sz;
|
||||
int RUNS = std::atoi(argv[2]);
|
||||
int m = std::atoi(argv[1]), k = std::atoi(argv[2]), n = std::atoi(argv[3]);
|
||||
int RUNS = std::atoi(argv[4]);
|
||||
double time = 0;
|
||||
|
||||
MatrixXf A = MatrixXf::Random(m,k);
|
||||
MatrixXf B = MatrixXf::Random(k,n);
|
||||
for(auto i = 0; i < RUNS; i++)
|
||||
{
|
||||
MatrixXf A = MatrixXf::Random(m,k);
|
||||
MatrixXf B = MatrixXf::Random(k,n);
|
||||
//set(A,m, k, 1);
|
||||
//set(B,k, n, 2);
|
||||
MatrixXf C = MatrixXf::Zero(m, n);
|
||||
|
||||
std::clock_t start,end;
|
||||
|
||||
20
run.sh
20
run.sh
@@ -9,13 +9,13 @@ function run() {
|
||||
for ((i = 0; i < $EXECS; i++)) do
|
||||
SEL=$(A=$(shuf -i 0-10 -n 1); echo $(($A % 2)))
|
||||
if [ $SEL -eq 0 ]; then
|
||||
T_OLD=$(./gto $SIZE $RUNS)
|
||||
T_NEW=$(./gt $SIZE $RUNS)
|
||||
T_NEWP=$(./gtp $SIZE $RUNS)
|
||||
T_OLD=$(./gto $SIZE $SIZE $SIZE $RUNS)
|
||||
T_NEW=$(./gt $SIZE $SIZE $SIZE $RUNS)
|
||||
T_NEWP=$(./gtp $SIZE $SIZE $SIZE $RUNS)
|
||||
else
|
||||
T_NEW=$(./gt $SIZE $RUNS)
|
||||
T_NEWP=$(./gtp $SIZE $RUNS)
|
||||
T_OLD=$(./gto $SIZE $RUNS)
|
||||
T_NEW=$(./gt $SIZE $SIZE $SIZE $RUNS)
|
||||
T_NEWP=$(./gtp $SIZE $SIZE $SIZE $RUNS)
|
||||
T_OLD=$(./gto $SIZE $SIZE $SIZE $RUNS)
|
||||
fi
|
||||
NEW=$NEW+$T_NEW
|
||||
OLD=$OLD+$T_OLD
|
||||
@@ -27,8 +27,14 @@ function run() {
|
||||
}
|
||||
|
||||
run $1 16 500
|
||||
run $1 21 500
|
||||
run $1 32 500
|
||||
run $1 53 500
|
||||
run $1 64 100
|
||||
run $1 97 100
|
||||
run $1 128 50
|
||||
run $1 203 50
|
||||
run $1 256 10
|
||||
run $1 1024 10
|
||||
run $1 673 10
|
||||
run $1 1024 5
|
||||
run $1 2048 2
|
||||
|
||||
Reference in New Issue
Block a user