Compare commits

...

16 Commits

Author SHA1 Message Date
Everton Constantino
58db05afbc WIP 2 2021-05-13 15:30:08 +00:00
Everton Constantino
bfadb56107 WIP 2 2021-05-13 14:48:40 +00:00
Everton Constantino
9b8cdceea8 WIP 2 2021-05-13 14:42:22 +00:00
Everton Constantino
a8ec6d6a36 WIP with tests 2021-05-12 17:09:33 +00:00
Everton Constantino
54f80f442d WIP - Vector 2021-05-10 20:06:34 +00:00
Everton Constantino
70c0363c28 WIP2 2021-05-10 19:59:47 +00:00
Everton Constantino
b2cd094863 WIP 2021-05-10 16:53:17 +00:00
Everton Constantino
d216764f46 WIP 2021-04-23 17:28:17 +00:00
Everton Constantino
646d92c7f1 WIP 2021-04-23 15:39:04 +00:00
Everton Constantino
c62ed9b214 WIP 2021-04-22 20:42:44 +00:00
Everton Constantino
82a7715b01 WIP 2021-04-22 18:11:53 +00:00
Everton Constantino
43ce8e9d2d WIP 2021-04-22 17:43:22 +00:00
Everton Constantino
ca0d3f92d7 WIP 2021-04-22 14:48:44 +00:00
Everton Constantino
5bffe09624 WIP 2021-04-22 13:14:00 +00:00
Everton Constantino
421891e1db WIP 2021-04-21 17:58:55 +00:00
Everton Constantino
f826663a3a WIP 2021-04-20 20:10:21 +00:00
7 changed files with 1653 additions and 0 deletions

View File

@@ -350,6 +350,9 @@ using std::ptrdiff_t;
#include "src/Core/arch/AltiVec/MatrixProduct.h"
#elif defined EIGEN_VECTORIZE_NEON
#include "src/Core/arch/NEON/GeneralBlockPanelKernel.h"
#include "src/Core/arch/NEON/MatrixProduct.h"
#include "src/Core/arch/NEON/PackingOps.h"
#include "src/Core/arch/NEON/Kernels.h"
#endif
#include "src/Core/BooleanRedux.h"

View File

@@ -0,0 +1,797 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2021 Everton Constantino (everton.constantino@hotmail.com)
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_KERNELS_NEON_H
#define EIGEN_KERNELS_NEON_H
namespace Eigen {
namespace internal {
#ifdef __ENABLE_VECTOR_KERNELS__
// Micro-kernel building blocks. Naming is MICRO_<rows>x<depth>x<cols>: each
// macro consumes one depth step (k) of packed LHS/RHS data, accumulates into
// `acc`, and advances the pack-map cursors by exactly the data it consumed.
// They expect pLhs*/pRhs* packet variables (and lhsPackMap/rhsPackMap/acc) to
// be declared in the enclosing scope. Row counts assume a 4-lane packet
// (e.g. float32x4 on NEON) — TODO confirm for other scalar types.

// 12 rows x 1 depth x 4 cols: broadcasts 4 rhs values, multiplies by three
// consecutive 4-lane lhs packets into acc._acc1/_acc2/_acc3.
#define MICRO_12x1x4() \
pRhs = pload<RhsPacket>(rhsPackMap.pCur); \
rhsPackMap.advance(1*4); \
pRhs0 = pset1<RhsPacket>(pRhs[0]); \
pRhs1 = pset1<RhsPacket>(pRhs[1]); \
pRhs2 = pset1<RhsPacket>(pRhs[2]); \
pRhs3 = pset1<RhsPacket>(pRhs[3]); \
pLhs = pload<LhsPacket>(lhsPackMap.pCur); \
lhsPackMap.advance(4*1); \
acc._acc1.packet[0] += pLhs*pRhs0; \
acc._acc1.packet[1] += pLhs*pRhs1; \
acc._acc1.packet[2] += pLhs*pRhs2; \
acc._acc1.packet[3] += pLhs*pRhs3; \
pLhs2 = pload<LhsPacket>(lhsPackMap.pCur); \
lhsPackMap.advance(4*1); \
acc._acc2.packet[0] += pLhs2*pRhs0; \
acc._acc2.packet[1] += pLhs2*pRhs1; \
acc._acc2.packet[2] += pLhs2*pRhs2; \
acc._acc2.packet[3] += pLhs2*pRhs3; \
pLhs3 = pload<LhsPacket>(lhsPackMap.pCur); \
acc._acc3.packet[0] += pLhs3*pRhs0; \
acc._acc3.packet[1] += pLhs3*pRhs1; \
acc._acc3.packet[2] += pLhs3*pRhs2; \
acc._acc3.packet[3] += pLhs3*pRhs3; \
lhsPackMap.advance(4*1);
// 8 rows x 1 depth x 4 cols: two consecutive lhs packets against 4 broadcast
// rhs values, accumulating into acc._acc1/_acc2.
#define MICRO_8x1x4() \
pLhs = pload<LhsPacket>(lhsPackMap.pCur); \
lhsPackMap.advance(4*1); \
pLhs2 = pload<LhsPacket>(lhsPackMap.pCur); \
pRhs = pload<RhsPacket>(rhsPackMap.pCur); \
pRhs0 = pset1<RhsPacket>(pRhs[0]); \
pRhs1 = pset1<RhsPacket>(pRhs[1]); \
pRhs2 = pset1<RhsPacket>(pRhs[2]); \
pRhs3 = pset1<RhsPacket>(pRhs[3]); \
acc._acc1.packet[0] += pLhs*pRhs0; \
acc._acc1.packet[1] += pLhs*pRhs1; \
acc._acc1.packet[2] += pLhs*pRhs2; \
acc._acc1.packet[3] += pLhs*pRhs3; \
acc._acc2.packet[0] += pLhs2*pRhs0; \
acc._acc2.packet[1] += pLhs2*pRhs1; \
acc._acc2.packet[2] += pLhs2*pRhs2; \
acc._acc2.packet[3] += pLhs2*pRhs3; \
lhsPackMap.advance(4*1); \
rhsPackMap.advance(1*4);
// 4 rows x 1 depth x 4 cols: a single lhs packet against 4 broadcast rhs
// values, accumulating into the single-tile acc._acc.
#define MICRO_4x1x4() \
pLhs = pload<LhsPacket>(lhsPackMap.pCur); \
pRhs = pload<RhsPacket>(rhsPackMap.pCur); \
pRhs0 = pset1<RhsPacket>(pRhs[0]); \
pRhs1 = pset1<RhsPacket>(pRhs[1]); \
pRhs2 = pset1<RhsPacket>(pRhs[2]); \
pRhs3 = pset1<RhsPacket>(pRhs[3]); \
acc._acc.packet[0] += pLhs*pRhs0; \
acc._acc.packet[1] += pLhs*pRhs1; \
acc._acc.packet[2] += pLhs*pRhs2; \
acc._acc.packet[3] += pLhs*pRhs3; \
lhsPackMap.advance(4*1); \
rhsPackMap.advance(1*4);
// 12 rows x 1 depth x 1 col: one broadcast rhs scalar against three lhs
// packets, using pmadd (fused multiply-add where available).
#define MICRO_12x1x1() \
pLhs = pload<LhsPacket>(lhsPackMap.pCur); \
pRhs = pset1<RhsPacket>(*rhsPackMap.pCur); \
acc._acc.packet[0] = pmadd(pRhs, pLhs, acc._acc.packet[0]); \
lhsPackMap.advance(4*1); \
pLhs = pload<LhsPacket>(lhsPackMap.pCur); \
acc._acc.packet[1] = pmadd(pRhs, pLhs, acc._acc.packet[1]); \
lhsPackMap.advance(4*1); \
pLhs = pload<LhsPacket>(lhsPackMap.pCur); \
acc._acc.packet[2] = pmadd(pRhs, pLhs, acc._acc.packet[2]); \
lhsPackMap.advance(4*1); \
rhsPackMap.advance(1);
// 8 rows x 1 depth x 1 col: one broadcast rhs scalar against two lhs packets.
#define MICRO_8x1x1() \
pLhs = pload<LhsPacket>(lhsPackMap.pCur); \
pRhs = pset1<RhsPacket>(*rhsPackMap.pCur); \
acc._acc.packet[0] = pmadd(pRhs, pLhs, acc._acc.packet[0]); \
lhsPackMap.advance(4*1); \
pLhs = pload<LhsPacket>(lhsPackMap.pCur); \
acc._acc.packet[1] = pmadd(pRhs, pLhs, acc._acc.packet[1]); \
lhsPackMap.advance(4*1); \
rhsPackMap.advance(1);
// 4 rows x 1 depth x 1 col: one broadcast rhs scalar against one lhs packet.
#define MICRO_4x1x1() \
pLhs = pload<LhsPacket>(lhsPackMap.pCur); \
pRhs = pset1<RhsPacket>(*rhsPackMap.pCur); \
acc._acc += pRhs*pLhs; \
lhsPackMap.advance(4*1); \
rhsPackMap.advance(1);
// Accumulator for a 12x1 tile: three 4-lane packets stacked vertically.
// Row offsets assume a 4-lane packet — TODO confirm for non-float scalars.
template<int CPU, typename Scalar, typename ResScalar, typename DataMapper>
struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 12, 1>
{
  using LinearMapper = typename DataMapper::LinearMapper;
  using AccPacket = typename packet_traits<Scalar>::type;
  using ResPacket = typename packet_traits<ResScalar>::type;

  // _acc.packet[i] holds rows [4*i, 4*i+4) of the column.
  PacketBlock<AccPacket,3> _acc;

  // Reset all accumulation registers.
  EIGEN_STRONG_INLINE void zero()
  {
    for (int i = 0; i < 3; ++i)
      _acc.packet[i] = pset1<AccPacket>(0);
  }

  // No destination prefetch for this shape.
  EIGEN_STRONG_INLINE void prefetch(const DataMapper&, Index, Index) {}

  // Scale the accumulated tile by alpha (vector form pAlpha; scalar unused).
  template<typename ResPacket_>
  EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha)
  {
    for (int i = 0; i < 3; ++i)
      _acc.packet[i] *= pAlpha;
  }

  // Add the tile into the destination at (row, col), 4 rows per packet.
  EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col)
  {
    PacketBlock<ResPacket, 1> block;
    for (int i = 0; i < 3; ++i)
    {
      block.packet[0] = dest.template loadPacket<ResPacket>(row + 4*i, col) + _acc.packet[i];
      dest.template storePacketBlock<AccPacket, 1>(row + 4*i, col, block);
    }
  }
};
// Accumulator for an 8x1 tile: two 4-lane packets stacked vertically.
template<int CPU, typename Scalar, typename ResScalar, typename DataMapper>
struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 8, 1>
{
  using LinearMapper = typename DataMapper::LinearMapper;
  using AccPacket = typename packet_traits<Scalar>::type;
  using ResPacket = typename packet_traits<ResScalar>::type;

  // _acc.packet[i] holds rows [4*i, 4*i+4) of the column.
  PacketBlock<AccPacket,2> _acc;

  // Reset all accumulation registers.
  EIGEN_STRONG_INLINE void zero()
  {
    for (int i = 0; i < 2; ++i)
      _acc.packet[i] = pset1<AccPacket>(0);
  }

  // No destination prefetch for this shape.
  EIGEN_STRONG_INLINE void prefetch(const DataMapper&, Index, Index) {}

  // Scale the accumulated tile by alpha (vector form pAlpha; scalar unused).
  template<typename ResPacket_>
  EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha)
  {
    for (int i = 0; i < 2; ++i)
      _acc.packet[i] *= pAlpha;
  }

  // Add the tile into the destination at (row, col), 4 rows per packet.
  EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col)
  {
    PacketBlock<ResPacket, 1> block;
    for (int i = 0; i < 2; ++i)
    {
      block.packet[0] = dest.template loadPacket<ResPacket>(row + 4*i, col) + _acc.packet[i];
      dest.template storePacketBlock<AccPacket, 1>(row + 4*i, col, block);
    }
  }
};
// Accumulator for a 4x1 tile: a single packet-wide column segment.
template<int CPU, typename Scalar, typename ResScalar, typename DataMapper>
struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 4, 1>
{
  using LinearMapper = typename DataMapper::LinearMapper;
  using AccPacket = typename packet_traits<Scalar>::type;
  using ResPacket = typename packet_traits<ResScalar>::type;

  // Single register holding the whole tile.
  AccPacket _acc;

  // Reset the accumulation register.
  EIGEN_STRONG_INLINE void zero()
  {
    _acc = pset1<AccPacket>(0);
  }

  // No destination prefetch for this shape.
  EIGEN_STRONG_INLINE void prefetch(const DataMapper&, Index, Index) {}

  // Scale the accumulated tile by alpha (vector form pAlpha; scalar unused).
  template<typename ResPacket_>
  EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha)
  {
    _acc *= pAlpha;
  }

  // Add the tile into the destination at (row, col).
  EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col)
  {
    PacketBlock<ResPacket, 1> block;
    block.packet[0] = dest.template loadPacket<ResPacket>(row, col) + _acc;
    dest.template storePacketBlock<AccPacket, 1>(row, col, block);
  }
};
// Accumulator for a 1x4 tile (one row, four columns). Because the four
// values live in different columns, the destination is accessed with
// gather/scatter rather than contiguous packet loads.
template<int CPU, typename Scalar, typename ResScalar, typename DataMapper>
struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 1, 4>
{
  using LinearMapper = typename DataMapper::LinearMapper;
  using AccPacket = typename packet_traits<Scalar>::type;
  using ResPacket = typename packet_traits<ResScalar>::type;

  // Single register holding the four column values.
  AccPacket _acc;

  // Reset the accumulation register.
  EIGEN_STRONG_INLINE void zero()
  {
    _acc = pset1<AccPacket>(0);
  }

  // No destination prefetch for this shape.
  EIGEN_STRONG_INLINE void prefetch(const DataMapper&, Index, Index) {}

  // Scale the accumulated tile by alpha (vector form pAlpha; scalar unused).
  template<typename ResPacket_>
  EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha)
  {
    _acc *= pAlpha;
  }

  // Read-modify-write the strided destination row at (row, col).
  EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col)
  {
    ResPacket updated = dest.template gatherPacket<ResPacket>(row, col);
    updated += _acc;
    dest.template scatterPacket<ResPacket>(row, col, updated);
  }
};
// Accumulator for a 4x4 tile: one packet per column.
template<int CPU, typename Scalar, typename ResScalar, typename DataMapper>
struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 4, 4>
{
  using LinearMapper = typename DataMapper::LinearMapper;
  using AccPacket = typename packet_traits<Scalar>::type;
  using ResPacket = typename packet_traits<ResScalar>::type;

  // _acc.packet[c] holds column (col + c) of the tile.
  PacketBlock<AccPacket, 4> _acc;

  // Reset all accumulation registers.
  EIGEN_STRONG_INLINE void zero()
  {
    for (int c = 0; c < 4; ++c)
      _acc.packet[c] = pset1<AccPacket>(0);
  }

  // Touch the four destination columns ahead of the final store.
  EIGEN_STRONG_INLINE void prefetch(const DataMapper& dest, Index row, Index col)
  {
    for (int c = 0; c < 4; ++c)
      dest.getLinearMapper(row, col + c).prefetch(0);
  }

  // Scale the accumulated tile by alpha (vector form pAlpha; scalar unused).
  template<typename ResPacket_>
  EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha)
  {
    for (int c = 0; c < 4; ++c)
      _acc.packet[c] *= pAlpha;
  }

  // Add the tile into the destination, one packet store per column.
  EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col)
  {
    constexpr auto PacketSize = unpacket_traits<ResPacket>::size;
    for (int c = 0; c < 4; ++c)
    {
      LinearMapper r = dest.getLinearMapper(row, col + c);
      r.storePacket(0*PacketSize, r.template loadPacket<ResPacket>(0*PacketSize) + _acc.packet[c]);
    }
  }
};
// Accumulator for an 8x4 tile: two 4x4 register tiles stacked vertically.
// _acc1 holds rows [0, PacketSize), _acc2 rows [PacketSize, 2*PacketSize).
template<int CPU, typename Scalar, typename ResScalar, typename DataMapper>
struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 8, 4>
{
  using LinearMapper = typename DataMapper::LinearMapper;
  using AccPacket = typename packet_traits<Scalar>::type;
  using ResPacket = typename packet_traits<ResScalar>::type;

  PacketBlock<AccPacket, 4> _acc1;
  PacketBlock<AccPacket, 4> _acc2;

  // Reset all accumulation registers.
  EIGEN_STRONG_INLINE void zero()
  {
    for (int c = 0; c < 4; ++c)
    {
      _acc1.packet[c] = pset1<AccPacket>(0);
      _acc2.packet[c] = pset1<AccPacket>(0);
    }
  }

  // Touch the four destination columns ahead of the final store.
  EIGEN_STRONG_INLINE void prefetch(const DataMapper& dest, Index row, Index col)
  {
    for (int c = 0; c < 4; ++c)
      dest.getLinearMapper(row + 0, col + c).prefetch(0);
  }

  // Scale the accumulated tile by alpha (vector form pAlpha; scalar unused).
  template<typename ResPacket_>
  EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha)
  {
    for (int c = 0; c < 4; ++c)
    {
      _acc1.packet[c] *= pAlpha;
      _acc2.packet[c] *= pAlpha;
    }
  }

  // Add both register tiles into the destination: offset 0 gets _acc1,
  // offset PacketSize gets _acc2, one packet store per column per half.
  EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col)
  {
    constexpr auto PacketSize = unpacket_traits<ResPacket>::size;
    for (int c = 0; c < 4; ++c)
    {
      LinearMapper r = dest.getLinearMapper(row, col + c);
      r.storePacket(0*PacketSize, r.template loadPacket<ResPacket>(0*PacketSize) + _acc1.packet[c]);
      r.storePacket(1*PacketSize, r.template loadPacket<ResPacket>(1*PacketSize) + _acc2.packet[c]);
    }
  }
};
// Accumulator for a 12x4 tile: three 4x4 register tiles stacked vertically.
// _acc1/_acc2/_acc3 hold row bands at offsets 0, PacketSize, 2*PacketSize.
template<int CPU, typename Scalar, typename ResScalar, typename DataMapper>
struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 12, 4>
{
  using LinearMapper = typename DataMapper::LinearMapper;
  using AccPacket = typename packet_traits<Scalar>::type;
  using ResPacket = typename packet_traits<ResScalar>::type;

  PacketBlock<AccPacket, 4> _acc1;
  PacketBlock<AccPacket, 4> _acc2;
  PacketBlock<AccPacket, 4> _acc3;

  // Reset all accumulation registers.
  EIGEN_STRONG_INLINE void zero()
  {
    for (int c = 0; c < 4; ++c)
    {
      _acc1.packet[c] = pset1<AccPacket>(0);
      _acc2.packet[c] = pset1<AccPacket>(0);
      _acc3.packet[c] = pset1<AccPacket>(0);
    }
  }

  // Touch the four destination columns ahead of the final store.
  EIGEN_STRONG_INLINE void prefetch(const DataMapper& dest, Index row, Index col)
  {
    for (int c = 0; c < 4; ++c)
      dest.getLinearMapper(row + 0, col + c).prefetch(0);
  }

  // Scale the accumulated tile by alpha (vector form pAlpha; scalar unused).
  template<typename ResPacket_>
  EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha)
  {
    for (int c = 0; c < 4; ++c)
    {
      _acc1.packet[c] *= pAlpha;
      _acc2.packet[c] *= pAlpha;
      _acc3.packet[c] *= pAlpha;
    }
  }

  // Add all three register tiles into the destination, one packet store per
  // column per row band.
  EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col)
  {
    constexpr auto PacketSize = unpacket_traits<ResPacket>::size;
    for (int c = 0; c < 4; ++c)
    {
      LinearMapper r = dest.getLinearMapper(row, col + c);
      r.storePacket(0*PacketSize, r.template loadPacket<ResPacket>(0*PacketSize) + _acc1.packet[c]);
      r.storePacket(1*PacketSize, r.template loadPacket<ResPacket>(1*PacketSize) + _acc2.packet[c]);
      r.storePacket(2*PacketSize, r.template loadPacket<ResPacket>(2*PacketSize) + _acc3.packet[c]);
    }
  }
};
// 4 (rows) x __UNROLL__ (depth) x 4 (cols) micro-kernel: repeats the
// single-depth-step macro __UNROLL__ times. The asm statements only emit
// assembly comments to mark the kernel boundaries in compiler output.
// NOTE(review): the marker says 4x4x4 but 8 steps run when __UNROLL__ > 4
// — label looks stale; confirm intended naming.
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 4, __UNROLL__ , 4>
{
EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
RhsPackMap& rhsPackMap,
Index rowIdx, Index colIdx, Index depthIdx,
Accumulator& acc)
{
using LhsPacket = typename packet_traits<LhsScalar>::type;
using RhsPacket = typename packet_traits<RhsScalar>::type;
asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x4x4\n\t");
// Scratch packet registers consumed by the MICRO_* macros.
LhsPacket pLhs;
RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3;
MICRO_4x1x4();
MICRO_4x1x4();
MICRO_4x1x4();
MICRO_4x1x4();
#if __UNROLL__ > 4
MICRO_4x1x4();
MICRO_4x1x4();
MICRO_4x1x4();
MICRO_4x1x4();
#endif
asm __volatile__("#END_NEON_MICROKERNEL_4x4x4\n\t");
};
};
// 8 (rows) x __UNROLL__ (depth) x 4 (cols) micro-kernel. When __UNROLL__ == 8
// it also issues software prefetches of upcoming rhs pack data (offsets are
// hand-tuned element counts — confirm against rhs pack layout).
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 8, __UNROLL__, 4>
{
EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
RhsPackMap& rhsPackMap,
Index rowIdx, Index colIdx, Index depthIdx,
Accumulator& acc)
{
using LhsPacket = typename packet_traits<LhsScalar>::type;
using RhsPacket = typename packet_traits<RhsScalar>::type;
asm __volatile__("#BEGIN_NEON_MICROKERNEL_8x8x4\n\t");
// Scratch packet registers consumed by the MICRO_* macros.
LhsPacket pLhs, pLhs2;
RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3;
#if __UNROLL__ == 8
#ifdef __ENABLE_PREFETCH__
prefetch(rhsPackMap.pCur + (48+0));
#endif
MICRO_8x1x4();
MICRO_8x1x4();
MICRO_8x1x4();
MICRO_8x1x4();
#ifdef __ENABLE_PREFETCH__
prefetch(rhsPackMap.pCur + (48+16));
#endif
MICRO_8x1x4();
MICRO_8x1x4();
MICRO_8x1x4();
MICRO_8x1x4();
#else
// Fallback: assumes __UNROLL__ == 4 — confirm no other values are used.
MICRO_8x1x4();
MICRO_8x1x4();
MICRO_8x1x4();
MICRO_8x1x4();
#endif
asm __volatile__("#END_NEON_MICROKERNEL_8x8x4\n\t");
};
};
// 12 (rows) x __UNROLL__ (depth) x 4 (cols) micro-kernel: the widest shape,
// using three lhs packets per depth step via MICRO_12x1x4.
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 12, __UNROLL__, 4>
{
EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
RhsPackMap& rhsPackMap,
Index rowIdx, Index colIdx, Index depthIdx,
Accumulator& acc)
{
using LhsPacket = typename packet_traits<LhsScalar>::type;
using RhsPacket = typename packet_traits<RhsScalar>::type;
asm __volatile__("#BEGIN_NEON_MICROKERNEL_12x8x4\n\t");
// Scratch packet registers consumed by the MICRO_* macros.
LhsPacket pLhs, pLhs2, pLhs3;
RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3;
#if __UNROLL__ == 8
#ifdef __ENABLE_PREFETCH__
prefetch(rhsPackMap.pCur);
#endif
MICRO_12x1x4();
MICRO_12x1x4();
MICRO_12x1x4();
MICRO_12x1x4();
MICRO_12x1x4();
MICRO_12x1x4();
MICRO_12x1x4();
MICRO_12x1x4();
#else
// Fallback: assumes __UNROLL__ == 4 — confirm no other values are used.
MICRO_12x1x4();
MICRO_12x1x4();
MICRO_12x1x4();
MICRO_12x1x4();
#endif
asm __volatile__("#END_NEON_MICROKERNEL_12x8x4\n\t");
};
};
// 12 x 1 x 4 micro-kernel: single depth step, used for the depth remainder.
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 12, 1, 4>
{
EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
RhsPackMap& rhsPackMap,
Index rowIdx, Index colIdx, Index depthIdx,
Accumulator& acc)
{
using LhsPacket = typename packet_traits<LhsScalar>::type;
using RhsPacket = typename packet_traits<RhsScalar>::type;
asm __volatile__("#BEGIN_NEON_MICROKERNEL_12x1x4\n\t");
// Scratch packet registers consumed by the MICRO_* macros.
LhsPacket pLhs, pLhs2, pLhs3;
RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3;
MICRO_12x1x4();
asm __volatile__("#END_NEON_MICROKERNEL_12x1x4\n\t");
};
};
// 8 x 1 x 4 micro-kernel: single depth step, used for the depth remainder.
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 8, 1, 4>
{
EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
RhsPackMap& rhsPackMap,
Index rowIdx, Index colIdx, Index depthIdx,
Accumulator& acc)
{
using LhsPacket = typename packet_traits<LhsScalar>::type;
using RhsPacket = typename packet_traits<RhsScalar>::type;
asm __volatile__("#BEGIN_NEON_MICROKERNEL_8x1x4\n\t");
// Scratch packet registers consumed by the MICRO_* macros.
LhsPacket pLhs, pLhs2;
RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3;
MICRO_8x1x4();
asm __volatile__("#END_NEON_MICROKERNEL_8x1x4\n\t");
};
};
// 4 x 1 x 4 micro-kernel: single depth step, used for the depth remainder.
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 4, 1, 4>
{
EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
RhsPackMap& rhsPackMap,
Index rowIdx, Index colIdx, Index depthIdx,
Accumulator& acc)
{
using LhsPacket = typename packet_traits<LhsScalar>::type;
using RhsPacket = typename packet_traits<RhsScalar>::type;
asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x1x4\n\t");
// Scratch packet registers consumed by the MICRO_* macros.
LhsPacket pLhs;
RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3;
MICRO_4x1x4();
asm __volatile__("#END_NEON_MICROKERNEL_4x1x4\n\t");
};
};
// 12 (rows) x __UNROLL__ (depth) x 1 (col) micro-kernel: eight depth steps of
// MICRO_12x1x1 (assumes __UNROLL__ == 8 — no #if guard here, unlike the x4
// kernels; confirm intended).
// Fix: the asm boundary markers wrongly said 4x1x1 (copy-paste from the 4x1x1
// kernel), which made every x1 kernel indistinguishable in assembly dumps.
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 12, __UNROLL__, 1>
{
  EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
                                      RhsPackMap& rhsPackMap,
                                      Index rowIdx, Index colIdx, Index depthIdx,
                                      Accumulator& acc)
  {
    using LhsPacket = typename packet_traits<LhsScalar>::type;
    using RhsPacket = typename packet_traits<RhsScalar>::type;
    // Scratch packet registers consumed by the MICRO_* macros.
    LhsPacket pLhs;
    RhsPacket pRhs;
    asm __volatile__("#BEGIN_NEON_MICROKERNEL_12x8x1\n\t");
    MICRO_12x1x1();
    MICRO_12x1x1();
    MICRO_12x1x1();
    MICRO_12x1x1();
    MICRO_12x1x1();
    MICRO_12x1x1();
    MICRO_12x1x1();
    MICRO_12x1x1();
    asm __volatile__("#END_NEON_MICROKERNEL_12x8x1\n\t");
  };
};
// 12 x 1 x 1 micro-kernel: single depth step, used for the depth remainder.
// Fix: the asm boundary markers wrongly said 4x1x1; corrected to 12x1x1 so
// assembly dumps identify the right kernel.
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 12, 1, 1>
{
  EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
                                      RhsPackMap& rhsPackMap,
                                      Index rowIdx, Index colIdx, Index depthIdx,
                                      Accumulator& acc)
  {
    using LhsPacket = typename packet_traits<LhsScalar>::type;
    using RhsPacket = typename packet_traits<RhsScalar>::type;
    // Scratch packet registers consumed by the MICRO_* macros.
    LhsPacket pLhs;
    RhsPacket pRhs;
    asm __volatile__("#BEGIN_NEON_MICROKERNEL_12x1x1\n\t");
    MICRO_12x1x1();
    asm __volatile__("#END_NEON_MICROKERNEL_12x1x1\n\t");
  };
};
// 8 (rows) x __UNROLL__ (depth) x 1 (col) micro-kernel: eight depth steps of
// MICRO_8x1x1 (assumes __UNROLL__ == 8 — confirm intended).
// Fix: the asm boundary markers wrongly said 4x1x1; corrected to 8x8x1 so
// assembly dumps identify the right kernel.
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 8, __UNROLL__, 1>
{
  EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
                                      RhsPackMap& rhsPackMap,
                                      Index rowIdx, Index colIdx, Index depthIdx,
                                      Accumulator& acc)
  {
    using LhsPacket = typename packet_traits<LhsScalar>::type;
    using RhsPacket = typename packet_traits<RhsScalar>::type;
    asm __volatile__("#BEGIN_NEON_MICROKERNEL_8x8x1\n\t");
    // Scratch packet registers consumed by the MICRO_* macros.
    LhsPacket pLhs;
    RhsPacket pRhs;
    MICRO_8x1x1();
    MICRO_8x1x1();
    MICRO_8x1x1();
    MICRO_8x1x1();
    MICRO_8x1x1();
    MICRO_8x1x1();
    MICRO_8x1x1();
    MICRO_8x1x1();
    asm __volatile__("#END_NEON_MICROKERNEL_8x8x1\n\t");
  };
};
// 8 x 1 x 1 micro-kernel: single depth step, used for the depth remainder.
// Fix: the asm boundary markers wrongly said 4x1x1; corrected to 8x1x1 so
// assembly dumps identify the right kernel.
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 8, 1, 1>
{
  EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
                                      RhsPackMap& rhsPackMap,
                                      Index rowIdx, Index colIdx, Index depthIdx,
                                      Accumulator& acc)
  {
    using LhsPacket = typename packet_traits<LhsScalar>::type;
    using RhsPacket = typename packet_traits<RhsScalar>::type;
    asm __volatile__("#BEGIN_NEON_MICROKERNEL_8x1x1\n\t");
    // Scratch packet registers consumed by the MICRO_* macros.
    LhsPacket pLhs;
    RhsPacket pRhs;
    MICRO_8x1x1();
    asm __volatile__("#END_NEON_MICROKERNEL_8x1x1\n\t");
  };
};
// 4 (rows) x __UNROLL__ (depth) x 1 (col) micro-kernel: eight depth steps of
// MICRO_4x1x1 (assumes __UNROLL__ == 8 — confirm intended).
// Fix: the asm boundary markers said 4x1x1, which collides with the genuine
// single-step 4x1x1 kernel below; corrected to 4x8x1.
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 4, __UNROLL__, 1>
{
  EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
                                      RhsPackMap& rhsPackMap,
                                      Index rowIdx, Index colIdx, Index depthIdx,
                                      Accumulator& acc)
  {
    using LhsPacket = typename packet_traits<LhsScalar>::type;
    using RhsPacket = typename packet_traits<RhsScalar>::type;
    asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x8x1\n\t");
    // Scratch packet registers consumed by the MICRO_* macros.
    LhsPacket pLhs;
    RhsPacket pRhs;
    MICRO_4x1x1();
    MICRO_4x1x1();
    MICRO_4x1x1();
    MICRO_4x1x1();
    MICRO_4x1x1();
    MICRO_4x1x1();
    MICRO_4x1x1();
    MICRO_4x1x1();
    asm __volatile__("#END_NEON_MICROKERNEL_4x8x1\n\t");
  };
};
// 4 x 1 x 1 micro-kernel: single depth step, the smallest vectorized shape.
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 4, 1, 1>
{
EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
RhsPackMap& rhsPackMap,
Index rowIdx, Index colIdx, Index depthIdx,
Accumulator& acc)
{
using LhsPacket = typename packet_traits<LhsScalar>::type;
using RhsPacket = typename packet_traits<RhsScalar>::type;
asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x1x1\n\t");
// Scratch packet registers consumed by the MICRO_* macros.
LhsPacket pLhs;
RhsPacket pRhs;
MICRO_4x1x1();
asm __volatile__("#END_NEON_MICROKERNEL_4x1x1\n\t");
};
};
// 1 x 1 x 4 micro-kernel: broadcasts one lhs scalar against a packet of four
// rhs values (the transpose of the x1 kernels' broadcast direction).
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 1, 1, 4>
{
EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
RhsPackMap& rhsPackMap,
Index rowIdx, Index colIdx, Index depthIdx,
Accumulator& acc)
{
using RhsPacket = typename packet_traits<RhsScalar>::type;
using LhsPacket = typename packet_traits<LhsScalar>::type;
asm __volatile__("#BEGIN_NEON_MICROKERNEL_1x1x4\n\t");
LhsPacket pLhs = pset1<LhsPacket>(*lhsPackMap.pCur);
RhsPacket pRhs = pload<RhsPacket>(rhsPackMap.pCur);
acc._acc += pLhs*pRhs;
// Consumed one lhs scalar and one rhs packet (4 values).
lhsPackMap.advance(1);
rhsPackMap.advance(4*1);
asm __volatile__("#END_NEON_MICROKERNEL_1x1x4\n\t");
};
};
#endif // __ENABLE_VECTOR_KERNELS__
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_KERNELS_NEON_H

View File

@@ -0,0 +1,523 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2021 Everton Constantino (everton.constantino@hotmail.com)
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_MATRIX_PRODUCT_NEON_H
#define EIGEN_MATRIX_PRODUCT_NEON_H
#ifdef __DEBUG__
#include <iostream>
#endif
namespace Eigen {
namespace internal {
// Depth-unroll factor for the micro-kernels; overridable at build time.
#ifndef __UNROLL__
#define __UNROLL__ 8
#endif
// Number of (lhs x depth x rhs) kernel shapes in the SHAPES table below.
template<int Architecture, int CPU, typename LhsScalar, typename RhsScalar>
constexpr int SHAPES_COUNT = 14;
// Column indices into a SHAPES row.
constexpr int SHAPES_DIMENSION = 6;
constexpr int SHAPES_LHS_DIMENSION = 0;
constexpr int SHAPES_DEP_DIMENSION = 1;
constexpr int SHAPES_RHS_DIMENSION = 2;
// *_POINTER columns hold the index of the next-smaller shape to fall back to
// along that dimension, or SHAPES_POINTER_END when there is none.
constexpr int SHAPES_RHS_POINTER = 3;
constexpr int SHAPES_LHS_POINTER = 4;
constexpr int SHAPES_DEP_POINTER = 5;
constexpr int SHAPES_POINTER_END = -1;
// Number of packing shapes; lhs packing has more granularities than rhs.
template<int Architecture, int CPU, typename Scalar, bool isLhs>
constexpr int PACK_SHAPES_COUNT = 2;
template<int Architecture, int CPU, typename Scalar>
constexpr int PACK_SHAPES_COUNT<Architecture, CPU, Scalar, true> = 4;
// Column indices into a PACK_SHAPES row.
constexpr int PACK_SHAPES_DIMENSION = 3;
constexpr int PACK_SHAPES_POINTER = 2;
constexpr int PACK_SHAPES_END = -1;
// lhs_progress x depth_progress x rhs_progress (depth_progress > 1 matrix ops) x pointer to next rhs_progress on the shapes map
template<int Architecture, int CPU, typename LhsScalar, typename RhsScalar>
constexpr int SHAPES[SHAPES_COUNT<Architecture, CPU, LhsScalar,RhsScalar>][SHAPES_DIMENSION] =
{ /* 00 */{ 1, 1,1,SHAPES_POINTER_END, SHAPES_POINTER_END, SHAPES_POINTER_END},
/* 01 */{1*packet_traits<RhsScalar>::size, 1,1, 0, 0, SHAPES_POINTER_END},
/* 02 */{1*packet_traits<RhsScalar>::size,__UNROLL__,1, 0, 0, 1},
/* 03 */{2*packet_traits<RhsScalar>::size, 1,1, 0, 2, SHAPES_POINTER_END},
/* 04 */{2*packet_traits<RhsScalar>::size,__UNROLL__,1, 0, 2, 3},
/* 05 */{3*packet_traits<RhsScalar>::size, 1,1, 0, 4, SHAPES_POINTER_END},
/* 06 */{3*packet_traits<RhsScalar>::size,__UNROLL__,1, 0, 4, 5},
/* 07 */{ 1, 1,4, 6, SHAPES_POINTER_END, SHAPES_POINTER_END},
/* 08 */{1*packet_traits<RhsScalar>::size, 1,4, 6, 7, SHAPES_POINTER_END},
/* 09 */{1*packet_traits<RhsScalar>::size,__UNROLL__,4, 6, 7, 8},
/* 10 */{2*packet_traits<RhsScalar>::size, 1,4, 6, 9, SHAPES_POINTER_END},
/* 11 */{2*packet_traits<RhsScalar>::size,__UNROLL__,4, 6, 9, 10},
/* 12 */{3*packet_traits<RhsScalar>::size, 1,4, 6, 11, SHAPES_POINTER_END},
/* 13 */{3*packet_traits<RhsScalar>::size,__UNROLL__,4, 6, 11, 12}};
// d1progress x d2progress
// Generic (rhs) packing shapes: scalar and 4-wide rows.
template<int Architecture, int CPU, typename Scalar, bool isLhs>
constexpr int PACK_SHAPES[PACK_SHAPES_COUNT<Architecture, CPU, Scalar, isLhs>][PACK_SHAPES_DIMENSION] =
{{ 1, 1, PACK_SHAPES_END},
{ 4, 1, 0}};
// lhs packing shapes: 1, 2 and 3 packet-widths plus the scalar fallback; the
// third column links each shape to the next-smaller one.
template<int Architecture, int CPU, typename Scalar>
constexpr int PACK_SHAPES<Architecture, CPU, Scalar, true>[PACK_SHAPES_COUNT<Architecture, CPU, Scalar, true>][PACK_SHAPES_DIMENSION] =
{{ 1, 1, PACK_SHAPES_END},
{1*packet_traits<Scalar>::size, 1, 0},
{2*packet_traits<Scalar>::size, 1, 1},
{3*packet_traits<Scalar>::size, 1, 2}};
// Copies an M x N tile from `data` into the packed buffer at `block`,
// row-major over (i, j). For the rhs side (isLhs == false) the source is
// read transposed. Returns the advanced write cursor.
template<int Architecture, int CPU, typename Index, typename Scalar, bool isLhs, typename DataMapper, bool Conjugate, bool PanelMode, int StorageOrder, int M, int N>
struct PackingOperator
{
  EIGEN_STRONG_INLINE Scalar* operator()(Index d1Idx, Index d2Idx, Scalar *block, const DataMapper& data)
  {
#ifdef __DEBUG__
    std::cout << M << "x" << N << " ( " << d1Idx << ", " << d2Idx <<") -> ( " << d1Idx + M << ", " << d2Idx + N << ") ";
#endif
    Scalar *out = block;
    for(auto i = 0; i < M; i++)
    {
      for(auto j = 0; j < N; j++)
      {
        // lhs tiles read (d1, d2) directly; rhs tiles read the transpose.
        *out = isLhs ? data(d1Idx + i, d2Idx + j) : data(d2Idx + j, d1Idx + i);
#ifdef __DEBUG__
        std::cout << *out << " ";
#endif
        out++;
      }
    }
#ifdef __DEBUG__
    std::cout << std::endl;
#endif
    return out;
  }
};
// Walks the d2 dimension with the d2-progress of pack shape IDX, packing full
// tiles, then recurses to shape IDX-1 to handle the remainder — but only while
// the smaller shape keeps the same d1-progress (D1PROGRESS), so one d1 strip
// is fully packed before moving on. Recursion bottoms out at the IDX == 0
// specialization below. `stride`/`offset` are threaded through unused here —
// presumably for PanelMode packing; confirm against callers.
template<int Architecture, int CPU, typename Index, typename Scalar, bool isLhs, typename DataMapper, bool Conjugate, bool PanelMode, int StorageOrder, int D1PROGRESS, int IDX>
struct PackingInnerStruct
{
EIGEN_STRONG_INLINE Scalar* operator()(Index d1Idx, Index d2Idx, Scalar *block, const DataMapper& data, Index d1Size, Index d2Size, Index stride, Index offset)
{
// d2 step size for this pack shape.
constexpr auto d2Progress = PACK_SHAPES<Architecture, CPU, Scalar, isLhs>[IDX][1];
PackingOperator<Architecture, CPU, Index, Scalar, isLhs, DataMapper, Conjugate, PanelMode, StorageOrder, D1PROGRESS, d2Progress> po;
for(;d2Idx + d2Progress <= d2Size; d2Idx+=d2Progress)
{
block = po(d1Idx, d2Idx, block, data);
}
// Recurse to the next-smaller d2 shape only if it packs the same d1 width.
if(PACK_SHAPES<Architecture, CPU, Scalar, isLhs>[IDX-1][0] == D1PROGRESS)
{
PackingInnerStruct<Architecture, CPU, Index, Scalar, isLhs, DataMapper, Conjugate, PanelMode, StorageOrder, D1PROGRESS, IDX-1> pis;
block = pis(d1Idx, d2Idx, block, data, d1Size, d2Size, stride, offset);
}
return block;
}
};
template<int Architecture, int CPU, typename Index, typename Scalar, bool isLhs, typename DataMapper, bool Conjugate, bool PanelMode, int StorageOrder, int D1PROGRESS>
struct PackingInnerStruct<Architecture, CPU, Index, Scalar, isLhs, DataMapper, Conjugate, PanelMode, StorageOrder, D1PROGRESS, 0>
{
// Base case of the inner packing recursion (shape 0): finish the d2
// traversal with the smallest d2 step and stop. Returns the advanced
// write cursor.
EIGEN_STRONG_INLINE Scalar* operator()(Index d1Idx, Index d2Idx, Scalar *block, const DataMapper& data, Index d1Size, Index d2Size, Index stride, Index offset)
{
constexpr auto d2Progress = PACK_SHAPES<Architecture, CPU, Scalar, isLhs>[0][1];
// The packing operator is stateless; construct it once outside the loop,
// matching the structure of the primary template.
PackingOperator<Architecture, CPU, Index, Scalar, isLhs, DataMapper, Conjugate, PanelMode, StorageOrder, D1PROGRESS, d2Progress> po;
for(;d2Idx + d2Progress <= d2Size; d2Idx+=d2Progress)
{
block = po(d1Idx, d2Idx, block, data);
}
return block;
}
};
template<int Architecture, int CPU, typename Index, typename Scalar, bool isLhs, typename DataMapper, bool Conjugate, bool PanelMode, int StorageOrder, int PACK_SHAPE_IDX>
struct PackingStruct
{
// Outer (d1) packing loop for pack shape PACK_SHAPE_IDX.
// `ps` continues the recursion with the shape named by this shape's pointer
// column (PACK_SHAPES_POINTER); the chain terminates at index -1.
PackingStruct<Architecture, CPU, Index, Scalar, isLhs, DataMapper, Conjugate, PanelMode, StorageOrder, PACK_SHAPES<Architecture, CPU, Scalar, isLhs>[PACK_SHAPE_IDX][PACK_SHAPES_POINTER]> ps;
// Packs as many d1Progress-wide panels as fit, delegating each panel's d2
// traversal to PackingInnerStruct, then hands the d1 remainder to `ps`.
// NOTE(review): `stride` and `offset` are threaded through but never used
// by the visible packing code — presumably reserved for panel mode; confirm.
EIGEN_STRONG_INLINE Scalar* operator()(Index d1Idx, Scalar *block, const DataMapper& data, Index d1Size, Index d2Size, Index stride, Index offset)
{
constexpr auto d1Progress = PACK_SHAPES<Architecture, CPU, Scalar, isLhs>[PACK_SHAPE_IDX][0];
for(; d1Idx + d1Progress <= d1Size; d1Idx += d1Progress)
{
PackingInnerStruct<Architecture, CPU, Index, Scalar, isLhs, DataMapper, Conjugate, PanelMode, StorageOrder, d1Progress, PACK_SHAPE_IDX> pis;
block = pis(d1Idx, 0, block, data, d1Size, d2Size, stride, offset);
}
return ps(d1Idx, block, data, d1Size, d2Size, stride, offset);
}
};
template<int Architecture, int CPU, typename Index, typename Scalar, bool isLhs, typename DataMapper, bool Conjugate, bool PanelMode, int StorageOrder>
struct PackingStruct<Architecture, CPU, Index, Scalar, isLhs, DataMapper, Conjugate, PanelMode, StorageOrder, -1>
{
// Recursion terminator: no pack shape left, return the cursor untouched.
EIGEN_STRONG_INLINE Scalar* operator()(Index, Scalar *block, const DataMapper&, Index, Index, Index, Index)
{
return block;
}
};
template<int Architecture, int CPU, typename Index, typename Scalar, typename DataMapper, bool Conjugate, bool PanelMode, int StorageOrder>
struct lhs_pack
{
// Packs the LHS block: rows is the d1 (progress) dimension, depth is d2.
// Starts the shape recursion at the widest LHS pack shape.
EIGEN_STRONG_INLINE void operator()(Scalar *blockA, const DataMapper &lhs, Index depth, Index rows, Index stride, Index offset)
{
constexpr int topShape = PACK_SHAPES_COUNT<Architecture, CPU, Scalar, true>-1;
PackingStruct<Architecture, CPU, Index, Scalar, true, DataMapper, Conjugate, PanelMode, StorageOrder, topShape> packer;
packer(0, blockA, lhs, rows, depth, stride, offset);
}
};
template<int Architecture, int CPU, typename Index, typename Scalar, typename DataMapper, bool Conjugate, bool PanelMode, int StorageOrder>
struct rhs_pack
{
// Packs the RHS block: cols is the d1 (progress) dimension, depth is d2.
// Starts the shape recursion at the widest RHS pack shape.
EIGEN_STRONG_INLINE void operator()(Scalar *blockB, const DataMapper &rhs, Index depth, Index cols, Index stride, Index offset)
{
constexpr int topShape = PACK_SHAPES_COUNT<Architecture, CPU, Scalar, false>-1;
PackingStruct<Architecture, CPU, Index, Scalar, false, DataMapper, Conjugate, PanelMode, StorageOrder, topShape> packer;
packer(0, blockB, rhs, cols, depth, stride, offset);
}
};
template<int Architecture, int CPU, typename Index, typename Scalar, typename DataMapper, bool isLhs, int IDX>
struct PackMapCalculator
{
// Maps a logical d1 position to its element offset inside the packed block
// by replaying the pack-shape chain; `pmc` resolves the remainder with the
// shape named by this shape's pointer column.
PackMapCalculator<Architecture, CPU, Index, Scalar, DataMapper, isLhs, PACK_SHAPES<Architecture, CPU, Scalar, isLhs>[IDX][PACK_SHAPES_POINTER]> pmc;
EIGEN_STRONG_INLINE Index getPosition(Index pos, Index d2Size)
{
constexpr auto d1Progress = PACK_SHAPES<Architecture, CPU, Scalar, isLhs>[IDX][0];
// v = the part of `pos` covered by whole d1Progress-wide panels; each such
// panel occupies d1Progress * d2Size elements in the packed layout.
Index v = (pos / d1Progress) * d1Progress;
return v*d2Size + pmc.getPosition(pos - v, d2Size);
}
};
template<int Architecture, int CPU, typename Index, typename Scalar, typename DataMapper, bool isLhs>
struct PackMapCalculator<Architecture, CPU, Index, Scalar, DataMapper, isLhs, -1>
{
// Terminator: past the smallest shape every position maps to offset zero.
EIGEN_STRONG_INLINE Index getPosition(Index, Index)
{
return Index(0);
}
};
template<int Architecture, int CPU, typename Index, typename Scalar, typename DataMapper, bool isLhs>
struct PackMap
{
// Read cursor over a packed LHS/RHS block with random access by logical
// d1 position (row for LHS, column for RHS) via PackMapCalculator.
const Scalar *pBase; // start of the packed block
const Scalar *pCur;  // current read position
Index stride;
Index offset;
Index d2Size;        // extent of the packed d2 (depth) dimension
PackMapCalculator<Architecture, CPU, Index, Scalar, DataMapper, isLhs, PACK_SHAPES_COUNT<Architecture, CPU, Scalar, isLhs>-1> pmc;
// Fix: mem-initializers are now listed in declaration order (pBase, pCur,
// stride, offset, d2Size); members are always initialized in declaration
// order, so the old list was misleading (-Wreorder).
PackMap(const Scalar *base, Index d2Size, Index stride, Index offset) : pBase(base), pCur(base), stride(stride), offset(offset), d2Size(d2Size) {}
EIGEN_STRONG_INLINE void resetCur() { pCur = pBase; }
// Jump to the packed location of logical d1 position p1.
EIGEN_STRONG_INLINE void moveTo(Index p1) { pCur = pBase + pmc.getPosition(p1, d2Size); }
// Advance the cursor by `progress` packed elements.
EIGEN_STRONG_INLINE void advance(int progress) { pCur += progress; }
};
template<int Architecture, int CPU, typename Scalar, typename ResScalar, typename DataMapper, int M, int N>
struct Accumulator
{
// Scalar fallback accumulator: an M x N tile of running partial products.
// The micro kernel writes `dt` directly, so it stays a public member.
Scalar dt[M][N];
// Reset the tile before a new depth loop.
EIGEN_STRONG_INLINE void zero()
{
for(auto r = 0; r < M; r++)
for(auto s = 0; s < N; s++)
dt[r][s] = Scalar(0);
}
// No-op in the scalar path; vector accumulators may prefetch the result.
EIGEN_STRONG_INLINE void prefetch(const DataMapper&, Index, Index) {}
// Multiply the whole tile by alpha (pAlpha is unused in the scalar path).
template<typename ResPacket>
EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket& pAlpha)
{
for(auto r = 0; r < M; r++)
for(auto s = 0; s < N; s++)
dt[r][s] *= alpha;
}
// Accumulate the tile into the result mapper at (row, col).
EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col)
{
for(auto r = 0; r < M; r++)
for(auto s = 0; s < N; s++)
dest(row + r, col + s) += dt[r][s];
}
};
template<int Architecture, int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator, int M, int K, int N>
struct MicroKernel
{
// Generic scalar micro kernel: one M x N outer-product update read from the
// packed LHS (M*K elements) and RHS (K*N elements) cursors, then both
// cursors are consumed.
// NOTE(review): the multiply below indexes the packs as if K == 1 (pLhs[j],
// pRhs[i]); shapes with a larger depth progress would need a K loop here —
// confirm only K == 1 shapes reach this generic kernel.
EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
RhsPackMap& rhsPackMap,
Index rowIdx, Index colIdx, Index depthIdx,
Accumulator& acc)
{
#ifdef __DEBUG__
std::cout << "Kernel " << M << " x " << K << " x " << N << " @ " << rowIdx << ", " << depthIdx << ", " << colIdx << std::endl;
std::cout << "LHS ";
for(auto i = 0; i < M; i++)
{
for(auto j = 0; j < K; j++)
{
std::cout << lhsPackMap.pCur[i*K + j] << " ";
}
}
std::cout << std::endl << "RHS ";
for(auto i = 0; i < K; i++)
{
for(auto j = 0; j < N; j++)
{
std::cout << rhsPackMap.pCur[i*N + j] << " ";
}
}
std::cout << std::endl;
#endif
const RhsScalar *pRhs = rhsPackMap.pCur;
for(auto i = 0; i < N; i++)
{
const LhsScalar *pLhs = lhsPackMap.pCur;
for(auto j = 0; j < M; j++)
{
acc.dt[j][i] += pRhs[i]*pLhs[j];
}
}
lhsPackMap.advance(M*K);
rhsPackMap.advance(K*N);
} // fix: removed the stray ';' that followed this member definition
};
template<int Architecture, int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename ResPacket, typename DataMapper, int RHS_SHAPE_IDX, int LHS_SHAPE_IDX, int IDX>
struct DepthLoopStruct
{
// Innermost (depth/K) loop of the GEBP nest for one result tile.
// IDX selects the current SHAPES row; PREVIOUS chains to the shape named by
// the depth-pointer column, terminating at -1.
static constexpr auto PREVIOUS = SHAPES<Architecture, CPU, LhsScalar, RhsScalar>[IDX][SHAPES_DEP_POINTER];
DepthLoopStruct<Architecture, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, ResPacket, DataMapper, RHS_SHAPE_IDX, LHS_SHAPE_IDX, PREVIOUS> depthLS;
// Accumulates a lhsProgress x rhsProgress tile over the depth dimension in
// depthProgress-sized steps, scales it by alpha, adds it into `res`, then
// lets the chained instantiation finish any depth remainder.
EIGEN_STRONG_INLINE void operator()(Index rowIdx, Index colIdx, Index depthIdx, const DataMapper& res,
Index rows, Index depth, Index cols, ResScalar alpha, const ResPacket& pAlpha, LhsPackMap& lhsPackMap, RhsPackMap& rhsPackMap)
{
constexpr auto rhsProgress = SHAPES<Architecture, CPU, LhsScalar, RhsScalar>[RHS_SHAPE_IDX][SHAPES_RHS_DIMENSION];
constexpr auto lhsProgress = SHAPES<Architecture, CPU, LhsScalar, RhsScalar>[LHS_SHAPE_IDX][SHAPES_LHS_DIMENSION];
constexpr auto depthProgress = SHAPES<Architecture, CPU, LhsScalar, RhsScalar>[IDX][SHAPES_DEP_DIMENSION];
#ifdef __ENABLE_PREFETCH__
prefetch(lhsPackMap.pCur);
prefetch(rhsPackMap.pCur);
#endif
typedef Accumulator<Architecture, CPU, AccScalar, ResScalar, DataMapper, lhsProgress, rhsProgress> AccumulatorType;
MicroKernel<Architecture, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, AccumulatorType, lhsProgress, depthProgress, rhsProgress> mkt;
AccumulatorType acc;
acc.zero();
#ifdef __ENABLE_PREFETCH__
acc.prefetch(res, rowIdx, colIdx);
#endif
for(; depthIdx + depthProgress <= depth; depthIdx+=depthProgress)
{
mkt(lhsPackMap, rhsPackMap, rowIdx, colIdx, depthIdx, acc);
}
// NOTE(review): each chained depth shape scales and stores its own partial
// tile, so res accumulates alpha * (partial sums) piecewise — confirm this
// matches the intended alpha semantics when depth splits across shapes.
acc.scale(alpha, pAlpha);
acc.store(res, rowIdx, colIdx);
depthLS(rowIdx, colIdx, depthIdx, res, rows, depth, cols, alpha, pAlpha, lhsPackMap, rhsPackMap);
}
};
template<int Architecture, int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename ResPacket, typename DataMapper, int RHS_SHAPE_IDX, int LHS_SHAPE_IDX>
struct DepthLoopStruct<Architecture, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, ResPacket, DataMapper, RHS_SHAPE_IDX, LHS_SHAPE_IDX, -1>
{
// Depth-chain terminator: nothing left to accumulate.
EIGEN_STRONG_INLINE void operator()(Index, Index, Index, const DataMapper&,
Index, Index, Index, ResScalar, const ResPacket&, LhsPackMap&, RhsPackMap&)
{
}
};
template<int Architecture, int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename ResPacket, typename DataMapper, int RHS_SHAPE_IDX, int IDX>
struct LhsLoopStruct
{
// Middle (row/M) loop of the GEBP nest for one column panel.
// The chained instantiation (via the LHS pointer column, terminating at -1)
// handles the row remainder with a narrower shape.
static constexpr auto PREVIOUS = SHAPES<Architecture, CPU, LhsScalar, RhsScalar>[IDX][SHAPES_LHS_POINTER];
LhsLoopStruct<Architecture, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, ResPacket, DataMapper, RHS_SHAPE_IDX, PREVIOUS> lhsLS;
// Walks rows in lhsProgress-sized steps, repositioning both pack maps for
// each tile before running the depth loop.
// Fix: colIdx was declared `int`, silently narrowing the Index passed by
// RhsLoopStruct; it is now Index like every other index parameter.
EIGEN_STRONG_INLINE void operator()(Index rowIdx, Index colIdx, const DataMapper& res,
Index rows, Index depth, Index cols, ResScalar alpha, const ResPacket& pAlpha, LhsPackMap& lhsPackMap, RhsPackMap& rhsPackMap)
{
constexpr auto lhsProgress = SHAPES<Architecture, CPU, LhsScalar, RhsScalar>[IDX][SHAPES_LHS_DIMENSION];
constexpr auto rhsProgress = SHAPES<Architecture, CPU, LhsScalar, RhsScalar>[IDX][SHAPES_RHS_DIMENSION];
DepthLoopStruct<Architecture, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, ResPacket, DataMapper, RHS_SHAPE_IDX, IDX, IDX> depthLS;
for(;rowIdx + lhsProgress <= rows; rowIdx+=lhsProgress)
{
lhsPackMap.moveTo(rowIdx);
rhsPackMap.moveTo(colIdx);
//prefetch(lhsPackMap.pCur + 2*lhsProgress);
//prefetch(rhsPackMap.pCur + 2*rhsProgress);
depthLS(rowIdx, colIdx, 0, res, rows, depth, cols, alpha, pAlpha, lhsPackMap, rhsPackMap);
}
lhsLS(rowIdx, colIdx, res, rows, depth, cols, alpha, pAlpha, lhsPackMap, rhsPackMap);
}
};
template<int Architecture, int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename ResPacket, typename DataMapper, int RHS_SHAPE_IDX>
struct LhsLoopStruct<Architecture, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, ResPacket, DataMapper, RHS_SHAPE_IDX, -1>
{
// Row-chain terminator: no LHS shape left for the remaining rows.
EIGEN_STRONG_INLINE void operator()(Index, Index, const DataMapper&,
Index, Index, Index, ResScalar, const ResPacket&, LhsPackMap&, RhsPackMap&)
{
}
};
template<int Architecture, int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename ResPacket, typename DataMapper, int IDX>
struct RhsLoopStruct
{
// Outermost (column/N) loop of the GEBP nest.
// Walks cols in rhsProgress-sized steps for shape IDX, delegating each
// column panel to the LHS loop; the chained instantiation (via the RHS
// pointer column, terminating at -1) handles the column remainder.
static constexpr auto PREVIOUS = SHAPES<Architecture, CPU, LhsScalar, RhsScalar>[IDX][SHAPES_RHS_POINTER];
RhsLoopStruct<Architecture, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, ResPacket, DataMapper, PREVIOUS> rhsLS;
EIGEN_STRONG_INLINE void operator()(Index colIdx, const DataMapper& res,
Index rows, Index depth, Index cols, ResScalar alpha, const ResPacket& pAlpha, LhsPackMap& lhsPackMap, RhsPackMap& rhsPackMap)
{
constexpr auto rhsProgress = SHAPES<Architecture, CPU, LhsScalar, RhsScalar>[IDX][SHAPES_RHS_DIMENSION];
for(;colIdx + rhsProgress <= cols; colIdx+=rhsProgress)
{
// The LHS loop starts from its own shape row IDX (same as the RHS shape).
LhsLoopStruct<Architecture, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, ResPacket, DataMapper, IDX, IDX> lhsLS;
lhsLS(0, colIdx, res, rows, depth, cols, alpha, pAlpha, lhsPackMap, rhsPackMap);
}
rhsLS(colIdx, res, rows, depth, cols, alpha, pAlpha, lhsPackMap, rhsPackMap);
}
};
template<int Architecture, int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename ResPacket, typename DataMapper>
struct RhsLoopStruct<Architecture, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, ResPacket, DataMapper, -1>
{
// Column-chain terminator: all column shapes exhausted.
EIGEN_STRONG_INLINE void operator()(Index, const DataMapper&,
Index, Index, Index, ResScalar, const ResPacket&, LhsPackMap&, RhsPackMap&)
{
}
};
// Drives the GEBP loop nest (columns -> rows -> depth) over the packed
// blocks blockA/blockB, accumulating alpha * A * B into `res`.
// NOTE(review): ResPacket is derived with unpacket_traits, which for a
// scalar ResScalar yields the scalar itself; if a vector packet type is
// intended here it should be packet_traits<ResScalar>::type — confirm
// against the vectorized Accumulator/MicroKernel specializations.
template<int Architecture, int CPU, typename ResScalar, typename AccScalar, typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper>
EIGEN_STRONG_INLINE void gemm(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
Index rows, Index depth, Index cols, ResScalar alpha, Index strideA, Index strideB, Index offsetA, Index offsetB)
{
using ResPacket = typename unpacket_traits<ResScalar>::type;
typedef PackMap<Architecture, CPU, Index, LhsScalar, DataMapper, true> LhsPackMap;
typedef PackMap<Architecture, CPU, Index, RhsScalar, DataMapper, false> RhsPackMap;
#ifdef __DEBUG__
std::cout << "blockA" << std::endl;
for(auto i = 0; i < rows*depth; i++)
{
if(i % 4 == 0 && i > 0)
std::cout << std::endl;
std::cout << blockA[i] << " ";
}
std::cout << std::endl;
std::cout << "blockB" << std::endl;
for(auto i = 0; i < depth*cols; i++)
{
if(i % 4 == 0 && i > 0)
std::cout << std::endl;
std::cout << blockB[i] << " ";
}
std::cout << std::endl;
#endif
// Marker emitted into the generated assembly only (typo "BEGING" fixed).
asm __volatile__("#BEGIN_GEBP\n\t");
// Fix: the shape count was instantiated with hard-coded <0, 0, ...>,
// silently ignoring the Architecture/CPU template arguments.
RhsLoopStruct<Architecture, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, ResPacket, DataMapper, SHAPES_COUNT<Architecture, CPU, LhsScalar, RhsScalar>-1> rhsLS;
LhsPackMap lhsPackMap(blockA, depth, strideA, offsetA);
RhsPackMap rhsPackMap(blockB, depth, strideB, offsetB);
ResPacket pAlpha = pset1<ResPacket>(alpha);
rhsLS(0, res, rows, depth, cols, alpha, pAlpha, lhsPackMap, rhsPackMap);
asm __volatile__("#END_GEBP\n\t");
}
/*
template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
struct gemm_pack_rhs<float, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
{
void operator()(float* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
};
template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
void gemm_pack_rhs<float, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
::operator()(float* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
{
rhs_pack<0, 0, Index, float, DataMapper, Conjugate, PanelMode, ColMajor> pack;
pack(blockB, rhs, depth, cols, stride, offset);
}
template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
struct gemm_pack_rhs<float, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
{
void operator()(float* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
};
template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
void gemm_pack_rhs<float, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
::operator()(float* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
{
rhs_pack<0, 0, Index, float, DataMapper, Conjugate, PanelMode, RowMajor> pack;
pack(blockB, rhs, depth, cols, stride, offset);
}
template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
struct gemm_pack_lhs<float, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
{
void operator()(float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
};
template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
void gemm_pack_lhs<float, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
::operator()(float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
{
lhs_pack<0, 0, Index, float, DataMapper, Conjugate, PanelMode, RowMajor> pack;
pack(blockA, lhs, depth, rows, stride, offset);
}
template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
struct gemm_pack_lhs<float, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
{
void operator()(float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
};
template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
void gemm_pack_lhs<float, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
::operator()(float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
{
lhs_pack<0, 0, Index, float, DataMapper, Conjugate, PanelMode, ColMajor> pack;
pack(blockA, lhs, depth, rows, stride, offset);
}
*/
// Eigen GEBP kernel specialization for float, routed to the generic gemm
// driver above.
template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
struct gebp_kernel<float, float, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
{
// res += alpha * blockA * blockB for packed blocks of rows x depth and
// depth x cols elements; stride/offset defaults follow Eigen's convention.
void operator()(const DataMapper& res, const float* blockA, const float* blockB,
Index rows, Index depth, Index cols, float alpha,
Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
};
// Out-of-line definition: forwards to the gemm driver with Architecture=0,
// CPU=0 and float for all scalar kinds.
// NOTE(review): ConjugateLhs/ConjugateRhs are not forwarded — harmless for
// real floats, but confirm against the generic kernel's contract.
template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
void gebp_kernel<float, float, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
::operator()(const DataMapper& res, const float* blockA, const float* blockB,
Index rows, Index depth, Index cols, float alpha,
Index strideA, Index strideB, Index offsetA, Index offsetB)
{
gemm<0, 0, float, float, float, float, Index, DataMapper>(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
}
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_KERNELS_NEON_H

View File

@@ -0,0 +1,192 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2021 Everton Constantino (everton.constantino@hotmail.com)
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_PACKING_OPS_NEON_H
#define EIGEN_PACKING_OPS_NEON_H
namespace Eigen {
namespace internal {
#ifdef __ENABLE_CUSTOM_PACKING__
// NEON (Architecture 0) pack-shape tables.
// Generic/RHS: three shapes — scalar 1x1, vectorized 4x1 and 4x4.
template<int CPU, typename Scalar, bool isLhs>
constexpr int PACK_SHAPES_COUNT<0, CPU, Scalar, isLhs> = 3;
// LHS additionally gets an 8x1 (two-packet) shape.
template<int CPU, typename Scalar>
constexpr int PACK_SHAPES_COUNT<0, CPU, Scalar, true> = 4;
// Rows are {d1 progress, d2 progress, pointer to the next smaller shape}.
template<int CPU, typename Scalar, bool isLhs>
constexpr int PACK_SHAPES<0, CPU, Scalar, isLhs>[PACK_SHAPES_COUNT<0, CPU, Scalar, isLhs>][PACK_SHAPES_DIMENSION] = {{1,1,PACK_SHAPES_END},{4,1,0},{4,4,0}};
template<int CPU, typename Scalar>
constexpr int PACK_SHAPES<0, CPU, Scalar, true>[PACK_SHAPES_COUNT<0, CPU, Scalar, true>][PACK_SHAPES_DIMENSION] = {{1,1,PACK_SHAPES_END},{4,1,0},{4,4,0},{8,1,2}};
template<int CPU, typename Index, typename Scalar, bool isLhs, typename DataMapper, bool Conjugate, bool PanelMode, int StorageOrder>
struct PackingOperator<0, CPU, Index, Scalar, isLhs, DataMapper, Conjugate, PanelMode, StorageOrder, 4, 4>
{
// NEON 4x4 tile packer: loads four packets and lays them out linearly,
// transposing first when the packing direction disagrees with the storage
// order. Returns the advanced write cursor.
EIGEN_STRONG_INLINE Scalar* operator()(Index d1Idx, Index d2Idx, Scalar *block, const DataMapper& data)
{
using Packet = typename packet_traits<Scalar>::type;
constexpr int vectorSize = packet_traits<Scalar>::size;
Scalar *c = block;
if(!isLhs)
{
// For the RHS d1 walks columns; swap so the mapper is indexed (row, col).
// Fix: the temporary was `int`, truncating a wider Index.
Index tD = d1Idx;
d1Idx = d2Idx;
d2Idx = tD;
}
// Both branches of the original code performed the same four loads and
// four stores and differed only in the transpose; load once, then
// conditionally transpose. Parentheses added to make the &&/|| grouping
// explicit (unchanged semantics; && binds tighter than ||).
PacketBlock<Packet, 4> pblock;
pblock.packet[0] = data.template loadPacket<Packet>(d1Idx, d2Idx + 0);
pblock.packet[1] = data.template loadPacket<Packet>(d1Idx, d2Idx + 1);
pblock.packet[2] = data.template loadPacket<Packet>(d1Idx, d2Idx + 2);
pblock.packet[3] = data.template loadPacket<Packet>(d1Idx, d2Idx + 3);
if(!((isLhs && StorageOrder == ColMajor) || (!isLhs && StorageOrder == RowMajor)))
ptranspose(pblock);
pstore<Scalar>(c + 0*vectorSize, pblock.packet[0]);
pstore<Scalar>(c + 1*vectorSize, pblock.packet[1]);
pstore<Scalar>(c + 2*vectorSize, pblock.packet[2]);
pstore<Scalar>(c + 3*vectorSize, pblock.packet[3]);
c+=4*vectorSize;
return c;
}
};
template<int CPU, typename Index, typename Scalar, bool isLhs, typename DataMapper, bool Conjugate, bool PanelMode, int StorageOrder>
struct PackingOperator<0, CPU, Index, Scalar, isLhs, DataMapper, Conjugate, PanelMode, StorageOrder, 8, 1>
{
// NEON 8x1 packer: copies eight consecutive d1 elements of one d2 line.
// NOTE(review): the vector paths assume packet_traits<Scalar>::size == 4
// (float on NEON) — confirm before enabling other scalar types.
EIGEN_STRONG_INLINE Scalar* operator()(Index d1Idx, Index d2Idx, Scalar *block, const DataMapper& data)
{
using Packet = typename packet_traits<Scalar>::type;
Scalar *c = block;
if(isLhs && StorageOrder == ColMajor)
{
// Contiguous: two packet loads cover the eight rows of column d2Idx.
Packet p = data.template loadPacket<Packet>(d1Idx + 0, d2Idx);
pstore<Scalar>(c, p);
c+=4;
p = data.template loadPacket<Packet>(d1Idx + 4, d2Idx);
pstore<Scalar>(c, p);
c+=4;
} else if(!isLhs && StorageOrder == RowMajor) {
// Contiguous: two packet loads cover the eight columns of row d2Idx.
Packet p = data.template loadPacket<Packet>(d2Idx, d1Idx + 0);
pstore<Scalar>(c, p);
c+=4;
p = data.template loadPacket<Packet>(d2Idx, d1Idx + 4);
pstore<Scalar>(c, p);
c+=4;
} else {
// Strided scalar fallback.
// Fix: this path previously read elements 0..3 twice — once at d2Idx and
// once at d2Idx + 4 — instead of the eight consecutive d1 elements the
// vector paths (and the generic PackingOperator) produce.
for(auto i = 0; i < 8; i++)
{
if(isLhs)
*c = data(d1Idx + i, d2Idx);
else
*c = data(d2Idx, d1Idx + i);
c++;
}
}
return c;
}
};
template<int CPU, typename Index, typename Scalar, bool isLhs, typename DataMapper, bool Conjugate, bool PanelMode, int StorageOrder>
struct PackingOperator<0, CPU, Index, Scalar, isLhs, DataMapper, Conjugate, PanelMode, StorageOrder, 4, 1>
{
// NEON 4x1 packer: copies four consecutive d1 elements of one d2 line,
// using a single packet load/store when the storage order makes them
// contiguous, otherwise scalar reads.
// NOTE(review): the cursor advances by a hard-coded 4, i.e. this shape
// assumes packet_traits<Scalar>::size == 4 (float on NEON) — confirm before
// enabling other scalar types.
EIGEN_STRONG_INLINE Scalar* operator()(Index d1Idx, Index d2Idx, Scalar *block, const DataMapper& data)
{
using Packet = typename packet_traits<Scalar>::type;
Scalar *c = block;
if(isLhs && StorageOrder == ColMajor)
{
Packet p = data.template loadPacket<Packet>(d1Idx, d2Idx);
pstore<Scalar>(c, p);
c+=4;
} else if(!isLhs && StorageOrder == RowMajor) {
Packet p = data.template loadPacket<Packet>(d2Idx, d1Idx);
pstore<Scalar>(c, p);
c+=4;
} else {
// Strided case: gather the four elements one by one.
if(isLhs)
{
*c = data(d1Idx + 0, d2Idx);
c++;
*c = data(d1Idx + 1, d2Idx);
c++;
*c = data(d1Idx + 2, d2Idx);
c++;
*c = data(d1Idx + 3, d2Idx);
c++;
} else {
*c = data(d2Idx, d1Idx + 0);
c++;
*c = data(d2Idx, d1Idx + 1);
c++;
*c = data(d2Idx, d1Idx + 2);
c++;
*c = data(d2Idx, d1Idx + 3);
c++;
}
}
return c;
}
};
#endif // __ENABLE_CUSTOM_PACKING__
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_PACKING_OPS_NEON_H

6
compile.sh Executable file
View File

@@ -0,0 +1,6 @@
#!/bin/bash
# Builds the GEMM benchmark binaries. 'gto' (baseline built against Eigen
# master) is currently disabled; 'gtp' enables the new vector kernels plus
# prefetch, 'gt' the vector kernels only.
#echo 'Compiling with master'
#g++ -O3 -I../eigen-master -std=c++11 new_gemm_test.cpp -o gto
echo 'Compiling current'
g++ -O3 -I. -std=c++14 new_gemm_test.cpp -D__ENABLE_VECTOR_KERNELS__ -D__ENABLE_PREFETCH__ -o gtp
g++ -O3 -I. -std=c++14 new_gemm_test.cpp -D__ENABLE_VECTOR_KERNELS__ -o gt

98
new_gemm_test.cpp Normal file
View File

@@ -0,0 +1,98 @@
#include <Eigen/Dense>
#include <iostream>
#include <ctime>
#include <cmath>
using namespace Eigen;
// Fill A with values that encode (id, row, col) as decimal groups of
// `digits` digits each, so every entry is identifiable when printed.
void set(MatrixXf& A, int m, int n, int id, int digits)
{
const double idPart = id*std::pow(10,(2*digits));
const double rowScale = std::pow(10,digits);
for(auto r = 0; r < m; r++)
{
const double rowPart = r*rowScale;
for(auto col = 0; col < n; col++)
A(r,col) = idPart + rowPart + col;
}
}
int main(int argc, char* argv[])
{
#ifdef __DEBUG__
int m = 32, k = 32, n = 32, max = std::max(std::max(m,k),n);
MatrixXf A = MatrixXf::Zero(m, k);
MatrixXf B = MatrixXf::Zero(k, n);
MatrixXf C = MatrixXf::Zero(m, n);
MatrixXf D = MatrixXf::Zero(m, n);
set(A, m, k, 1, static_cast<int>(std::log10(max)) + 1);
set(B, k, n, 2, static_cast<int>(std::log10(max)) + 1);
C = A*B;
std::cout << A << std::endl;
std::cout << B << std::endl;
std::cout << std::endl;
for(auto i = 0; i < m; i++)
{
for(auto j = 0; j < n; j++)
{
float acc=0;
for(auto kk = 0; kk < k; kk++)
{
acc += A(i,kk)*B(kk,j);
}
D(i,j) = acc;
if(std::sqrt(std::pow(D(i,j)-C(i,j),2)) > 1.0e-5)
{
std::cout << "Difference too big at " << i << " ," << j << " is " << C(i,j) << " should be " << D(i,j) << std::endl;
}
}
}
#else
if(argc < 3)
{
std::cout << "Wrong number of arguments." << std::endl;
return -1;
}
int sz = std::atoi(argv[1]);
int m = sz, k = sz, n = sz;
int RUNS = std::atoi(argv[2]);
double time = 0;
for(auto i = 0; i < RUNS; i++)
{
MatrixXf A = MatrixXf::Random(m,k);
MatrixXf B = MatrixXf::Random(k,n);
//set(A,m, k, 1);
//set(B,k, n, 2);
MatrixXf C = MatrixXf::Zero(m, n);
std::clock_t start,end;
start = std::clock();
C = A*B;
end = std::clock();
time += 1000.0*(end-start) / CLOCKS_PER_SEC;
}
std::cout << time << std::endl;
#ifdef TEST_SCALAR
start = std::clock();
for(auto i = 0; i < m; i++)
{
for(auto j = 0; j < n; j++)
{
float acc=0;
for(auto kk = 0; kk < k; kk++)
{
acc += A(i,kk)*B(kk,j);
}
C(i,j) = acc;
}
}
end = std::clock();
std::cout << 1000.0*(end-start) / CLOCKS_PER_SEC << std::endl;
#endif
#endif
return 0;
}

34
run.sh Executable file
View File

@@ -0,0 +1,34 @@
#!/bin/bash
# Benchmark harness: compares the baseline build (gto) against the new
# kernels without (gt) and with (gtp) prefetch.
# run <execs> <size> <runs>: performs <execs> invocations, each timing
# <runs> multiplications of <size>x<size> matrices, and prints the speedup
# of gt and gtp relative to gto.
function run() {
    OLD=0
    NEW=0
    NEWP=0
    EXECS=$1
    SIZE=$2
    RUNS=$3
    # Fix: added the ';' before 'do' (portability) and made the coin flip
    # fair — the old `shuf -i 0-10 | %2` trick was biased 6/11 towards 0.
    for ((i = 0; i < EXECS; i++)); do
        # Randomize execution order to average out thermal/cache effects.
        SEL=$(shuf -i 0-1 -n 1)
        if [ "$SEL" -eq 0 ]; then
            T_OLD=$(./gto "$SIZE" "$RUNS")
            T_NEW=$(./gt "$SIZE" "$RUNS")
            T_NEWP=$(./gtp "$SIZE" "$RUNS")
        else
            T_NEW=$(./gt "$SIZE" "$RUNS")
            T_NEWP=$(./gtp "$SIZE" "$RUNS")
            T_OLD=$(./gto "$SIZE" "$RUNS")
        fi
        # Totals are accumulated as "+"-joined expressions and evaluated
        # later by bc, which handles the floating-point timings.
        NEW=$NEW+$T_NEW
        OLD=$OLD+$T_OLD
        NEWP=$NEWP+$T_NEWP
    done
    SPEED=$(echo "($OLD) / ($NEW)" | bc -l)
    SPEEDP=$(echo "($OLD) / ($NEWP)" | bc -l)
    echo "$SIZE -> $SPEED $SPEEDP"
}
run "$1" 16 500
run "$1" 32 500
run "$1" 64 100
run "$1" 128 50
run "$1" 256 10
run "$1" 1024 10