mirror of
https://gitlab.com/libeigen/eigen.git
synced 2026-04-10 11:34:33 +08:00
Compare commits
16 Commits
3.4-rc1
...
starting_n
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
58db05afbc | ||
|
|
bfadb56107 | ||
|
|
9b8cdceea8 | ||
|
|
a8ec6d6a36 | ||
|
|
54f80f442d | ||
|
|
70c0363c28 | ||
|
|
b2cd094863 | ||
|
|
d216764f46 | ||
|
|
646d92c7f1 | ||
|
|
c62ed9b214 | ||
|
|
82a7715b01 | ||
|
|
43ce8e9d2d | ||
|
|
ca0d3f92d7 | ||
|
|
5bffe09624 | ||
|
|
421891e1db | ||
|
|
f826663a3a |
@@ -350,6 +350,9 @@ using std::ptrdiff_t;
|
||||
#include "src/Core/arch/AltiVec/MatrixProduct.h"
|
||||
#elif defined EIGEN_VECTORIZE_NEON
|
||||
#include "src/Core/arch/NEON/GeneralBlockPanelKernel.h"
|
||||
#include "src/Core/arch/NEON/MatrixProduct.h"
|
||||
#include "src/Core/arch/NEON/PackingOps.h"
|
||||
#include "src/Core/arch/NEON/Kernels.h"
|
||||
#endif
|
||||
|
||||
#include "src/Core/BooleanRedux.h"
|
||||
|
||||
797
Eigen/src/Core/arch/NEON/Kernels.h
Normal file
797
Eigen/src/Core/arch/NEON/Kernels.h
Normal file
@@ -0,0 +1,797 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2021 Everton Constantino (everton.constantino@hotmail.com)
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#ifndef EIGEN_KERNELS_NEON_H
|
||||
#define EIGEN_KERNELS_NEON_H
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
namespace internal {
|
||||
|
||||
#ifdef __ENABLE_VECTOR_KERNELS__
|
||||
|
||||
// One depth step of the 12-row x 4-column tile update.
//
// Expansion-site requirements: the type aliases LhsPacket / RhsPacket, the locals
// pLhs, pLhs2, pLhs3, pRhs, pRhs0..pRhs3, the pack maps lhsPackMap / rhsPackMap and an
// accumulator `acc` exposing _acc1, _acc2 and _acc3.
//
// Loads one RhsPacket (advancing the rhs map by 4) and broadcasts each of its four lanes,
// then loads three consecutive LhsPackets (advancing the lhs map by 4 after each) and
// adds lhs * rhs-lane into packets 0..3 of _acc1, _acc2 and _acc3 respectively.
//
// The body is wrapped in do { } while (0) so the expansion is a single statement and
// stays correct after an unbraced if/else/for; invoke it as `MICRO_12x1x4();`.
// NOTE(review): relies on the packet types supporting operator[], * and += directly
// (compiler vector extensions) - confirm for every supported toolchain.
#define MICRO_12x1x4() \
  do { \
    pRhs = pload<RhsPacket>(rhsPackMap.pCur); \
    rhsPackMap.advance(1*4); \
    pRhs0 = pset1<RhsPacket>(pRhs[0]); \
    pRhs1 = pset1<RhsPacket>(pRhs[1]); \
    pRhs2 = pset1<RhsPacket>(pRhs[2]); \
    pRhs3 = pset1<RhsPacket>(pRhs[3]); \
    pLhs = pload<LhsPacket>(lhsPackMap.pCur); \
    lhsPackMap.advance(4*1); \
    acc._acc1.packet[0] += pLhs*pRhs0; \
    acc._acc1.packet[1] += pLhs*pRhs1; \
    acc._acc1.packet[2] += pLhs*pRhs2; \
    acc._acc1.packet[3] += pLhs*pRhs3; \
    pLhs2 = pload<LhsPacket>(lhsPackMap.pCur); \
    lhsPackMap.advance(4*1); \
    acc._acc2.packet[0] += pLhs2*pRhs0; \
    acc._acc2.packet[1] += pLhs2*pRhs1; \
    acc._acc2.packet[2] += pLhs2*pRhs2; \
    acc._acc2.packet[3] += pLhs2*pRhs3; \
    pLhs3 = pload<LhsPacket>(lhsPackMap.pCur); \
    acc._acc3.packet[0] += pLhs3*pRhs0; \
    acc._acc3.packet[1] += pLhs3*pRhs1; \
    acc._acc3.packet[2] += pLhs3*pRhs2; \
    acc._acc3.packet[3] += pLhs3*pRhs3; \
    lhsPackMap.advance(4*1); \
  } while (0)
|
||||
|
||||
// One depth step of the 8-row x 4-column tile update.
//
// Expansion-site requirements: the type aliases LhsPacket / RhsPacket, the locals
// pLhs, pLhs2, pRhs, pRhs0..pRhs3, the pack maps lhsPackMap / rhsPackMap and an
// accumulator `acc` exposing _acc1 and _acc2.
//
// Loads two consecutive LhsPackets and one RhsPacket, broadcasts each of the four rhs
// lanes and adds lhs * rhs-lane into packets 0..3 of _acc1 (first lhs packet) and _acc2
// (second lhs packet). In total the lhs map advances by 8 and the rhs map by 4.
//
// The body is wrapped in do { } while (0) so the expansion is a single statement and
// stays correct after an unbraced if/else/for; invoke it as `MICRO_8x1x4();`.
// NOTE(review): relies on the packet types supporting operator[], * and += directly
// (compiler vector extensions) - confirm for every supported toolchain.
#define MICRO_8x1x4() \
  do { \
    pLhs = pload<LhsPacket>(lhsPackMap.pCur); \
    lhsPackMap.advance(4*1); \
    pLhs2 = pload<LhsPacket>(lhsPackMap.pCur); \
    pRhs = pload<RhsPacket>(rhsPackMap.pCur); \
    pRhs0 = pset1<RhsPacket>(pRhs[0]); \
    pRhs1 = pset1<RhsPacket>(pRhs[1]); \
    pRhs2 = pset1<RhsPacket>(pRhs[2]); \
    pRhs3 = pset1<RhsPacket>(pRhs[3]); \
    acc._acc1.packet[0] += pLhs*pRhs0; \
    acc._acc1.packet[1] += pLhs*pRhs1; \
    acc._acc1.packet[2] += pLhs*pRhs2; \
    acc._acc1.packet[3] += pLhs*pRhs3; \
    acc._acc2.packet[0] += pLhs2*pRhs0; \
    acc._acc2.packet[1] += pLhs2*pRhs1; \
    acc._acc2.packet[2] += pLhs2*pRhs2; \
    acc._acc2.packet[3] += pLhs2*pRhs3; \
    lhsPackMap.advance(4*1); \
    rhsPackMap.advance(1*4); \
  } while (0)
|
||||
|
||||
// One depth step of the 4-row x 4-column tile update.
//
// Expansion-site requirements: the type aliases LhsPacket / RhsPacket, the locals
// pLhs, pRhs, pRhs0..pRhs3, the pack maps lhsPackMap / rhsPackMap and an accumulator
// `acc` exposing a four-packet block `_acc`.
//
// Loads one LhsPacket and one RhsPacket, broadcasts each of the four rhs lanes and adds
// lhs * rhs-lane into acc._acc.packet[0..3]; both pack maps then advance by 4.
//
// The body is wrapped in do { } while (0) so the expansion is a single statement and
// stays correct after an unbraced if/else/for; invoke it as `MICRO_4x1x4();`.
// NOTE(review): relies on the packet types supporting operator[], * and += directly
// (compiler vector extensions) - confirm for every supported toolchain.
#define MICRO_4x1x4() \
  do { \
    pLhs = pload<LhsPacket>(lhsPackMap.pCur); \
    pRhs = pload<RhsPacket>(rhsPackMap.pCur); \
    pRhs0 = pset1<RhsPacket>(pRhs[0]); \
    pRhs1 = pset1<RhsPacket>(pRhs[1]); \
    pRhs2 = pset1<RhsPacket>(pRhs[2]); \
    pRhs3 = pset1<RhsPacket>(pRhs[3]); \
    acc._acc.packet[0] += pLhs*pRhs0; \
    acc._acc.packet[1] += pLhs*pRhs1; \
    acc._acc.packet[2] += pLhs*pRhs2; \
    acc._acc.packet[3] += pLhs*pRhs3; \
    lhsPackMap.advance(4*1); \
    rhsPackMap.advance(1*4); \
  } while (0)
|
||||
|
||||
// One depth step of the 12-row x 1-column tile update.
//
// Expansion-site requirements: the type aliases LhsPacket / RhsPacket, the locals pLhs
// and pRhs, the pack maps lhsPackMap / rhsPackMap and an accumulator `acc` exposing a
// three-packet block `_acc`.
//
// Broadcasts the scalar at rhsPackMap.pCur, then for each of three consecutive
// LhsPackets performs acc._acc.packet[k] = pmadd(rhs, lhs, acc._acc.packet[k]).
// The lhs map advances by 4 after each load (12 in total) and the rhs map by 1.
//
// The body is wrapped in do { } while (0) so the expansion is a single statement and
// stays correct after an unbraced if/else/for; invoke it as `MICRO_12x1x1();`.
#define MICRO_12x1x1() \
  do { \
    pLhs = pload<LhsPacket>(lhsPackMap.pCur); \
    pRhs = pset1<RhsPacket>(*rhsPackMap.pCur); \
    acc._acc.packet[0] = pmadd(pRhs, pLhs, acc._acc.packet[0]); \
    lhsPackMap.advance(4*1); \
    pLhs = pload<LhsPacket>(lhsPackMap.pCur); \
    acc._acc.packet[1] = pmadd(pRhs, pLhs, acc._acc.packet[1]); \
    lhsPackMap.advance(4*1); \
    pLhs = pload<LhsPacket>(lhsPackMap.pCur); \
    acc._acc.packet[2] = pmadd(pRhs, pLhs, acc._acc.packet[2]); \
    lhsPackMap.advance(4*1); \
    rhsPackMap.advance(1); \
  } while (0)
|
||||
|
||||
// One depth step of the 8-row x 1-column tile update.
//
// Expansion-site requirements: the type aliases LhsPacket / RhsPacket, the locals pLhs
// and pRhs, the pack maps lhsPackMap / rhsPackMap and an accumulator `acc` exposing a
// two-packet block `_acc`.
//
// Broadcasts the scalar at rhsPackMap.pCur, then for each of two consecutive LhsPackets
// performs acc._acc.packet[k] = pmadd(rhs, lhs, acc._acc.packet[k]).
// The lhs map advances by 4 after each load (8 in total) and the rhs map by 1.
//
// The body is wrapped in do { } while (0) so the expansion is a single statement and
// stays correct after an unbraced if/else/for; invoke it as `MICRO_8x1x1();`.
#define MICRO_8x1x1() \
  do { \
    pLhs = pload<LhsPacket>(lhsPackMap.pCur); \
    pRhs = pset1<RhsPacket>(*rhsPackMap.pCur); \
    acc._acc.packet[0] = pmadd(pRhs, pLhs, acc._acc.packet[0]); \
    lhsPackMap.advance(4*1); \
    pLhs = pload<LhsPacket>(lhsPackMap.pCur); \
    acc._acc.packet[1] = pmadd(pRhs, pLhs, acc._acc.packet[1]); \
    lhsPackMap.advance(4*1); \
    rhsPackMap.advance(1); \
  } while (0)
|
||||
|
||||
// One depth step of the 4-row x 1-column tile update.
//
// Expansion-site requirements: the type aliases LhsPacket / RhsPacket, the locals pLhs
// and pRhs, the pack maps lhsPackMap / rhsPackMap and an accumulator `acc` whose `_acc`
// member is a single packet.
//
// Loads one LhsPacket, broadcasts the scalar at rhsPackMap.pCur and adds their product
// to acc._acc; the lhs map then advances by 4 and the rhs map by 1.
//
// The body is wrapped in do { } while (0) so the expansion is a single statement and
// stays correct after an unbraced if/else/for; invoke it as `MICRO_4x1x1();`.
// NOTE(review): relies on the packet types supporting * and += directly (compiler vector
// extensions) - confirm for every supported toolchain.
#define MICRO_4x1x1() \
  do { \
    pLhs = pload<LhsPacket>(lhsPackMap.pCur); \
    pRhs = pset1<RhsPacket>(*rhsPackMap.pCur); \
    acc._acc += pRhs*pLhs; \
    lhsPackMap.advance(4*1); \
    rhsPackMap.advance(1); \
  } while (0)
|
||||
|
||||
template<int CPU, typename Scalar, typename ResScalar, typename DataMapper>
|
||||
struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 12, 1>
|
||||
{
|
||||
using LinearMapper = typename DataMapper::LinearMapper;
|
||||
using AccPacket = typename packet_traits<Scalar>::type;
|
||||
using ResPacket = typename packet_traits<ResScalar>::type;
|
||||
|
||||
PacketBlock<AccPacket,3> _acc;
|
||||
|
||||
EIGEN_STRONG_INLINE void zero()
|
||||
{
|
||||
_acc.packet[0] = pset1<AccPacket>(0);
|
||||
_acc.packet[1] = pset1<AccPacket>(0);
|
||||
_acc.packet[2] = pset1<AccPacket>(0);
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE void prefetch(const DataMapper&, Index, Index) {}
|
||||
|
||||
template<typename ResPacket_>
|
||||
EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha)
|
||||
{
|
||||
_acc.packet[0] *= pAlpha;
|
||||
_acc.packet[1] *= pAlpha;
|
||||
_acc.packet[2] *= pAlpha;
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col)
|
||||
{
|
||||
PacketBlock<ResPacket, 1> block;
|
||||
block.packet[0] = dest.template loadPacket<ResPacket>(row + 0, col) + _acc.packet[0];
|
||||
dest.template storePacketBlock<AccPacket, 1>(row + 0, col, block);
|
||||
block.packet[0] = dest.template loadPacket<ResPacket>(row + 4, col) + _acc.packet[1];
|
||||
dest.template storePacketBlock<AccPacket, 1>(row + 4, col, block);
|
||||
block.packet[0] = dest.template loadPacket<ResPacket>(row + 8, col) + _acc.packet[2];
|
||||
dest.template storePacketBlock<AccPacket, 1>(row + 8, col, block);
|
||||
}
|
||||
};
|
||||
|
||||
template<int CPU, typename Scalar, typename ResScalar, typename DataMapper>
|
||||
struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 8, 1>
|
||||
{
|
||||
using LinearMapper = typename DataMapper::LinearMapper;
|
||||
using AccPacket = typename packet_traits<Scalar>::type;
|
||||
using ResPacket = typename packet_traits<ResScalar>::type;
|
||||
|
||||
PacketBlock<AccPacket,2> _acc;
|
||||
|
||||
EIGEN_STRONG_INLINE void zero()
|
||||
{
|
||||
_acc.packet[0] = pset1<AccPacket>(0);
|
||||
_acc.packet[1] = pset1<AccPacket>(0);
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE void prefetch(const DataMapper&, Index, Index) {}
|
||||
|
||||
template<typename ResPacket_>
|
||||
EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha)
|
||||
{
|
||||
_acc.packet[0] *= pAlpha;
|
||||
_acc.packet[1] *= pAlpha;
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col)
|
||||
{
|
||||
PacketBlock<ResPacket, 1> block;
|
||||
block.packet[0] = dest.template loadPacket<ResPacket>(row, col) + _acc.packet[0];
|
||||
dest.template storePacketBlock<AccPacket, 1>(row, col, block);
|
||||
block.packet[0] = dest.template loadPacket<ResPacket>(row + 4, col) + _acc.packet[1];
|
||||
dest.template storePacketBlock<AccPacket, 1>(row + 4, col, block);
|
||||
}
|
||||
};
|
||||
|
||||
template<int CPU, typename Scalar, typename ResScalar, typename DataMapper>
|
||||
struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 4, 1>
|
||||
{
|
||||
using LinearMapper = typename DataMapper::LinearMapper;
|
||||
using AccPacket = typename packet_traits<Scalar>::type;
|
||||
using ResPacket = typename packet_traits<ResScalar>::type;
|
||||
|
||||
AccPacket _acc;
|
||||
|
||||
EIGEN_STRONG_INLINE void zero()
|
||||
{
|
||||
_acc = pset1<AccPacket>(0);
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE void prefetch(const DataMapper&, Index, Index) {}
|
||||
|
||||
template<typename ResPacket_>
|
||||
EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha)
|
||||
{
|
||||
_acc *= pAlpha;
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col)
|
||||
{
|
||||
PacketBlock<ResPacket, 1> block;
|
||||
block.packet[0] = dest.template loadPacket<ResPacket>(row, col) + _acc;
|
||||
dest.template storePacketBlock<AccPacket, 1>(row, col, block);
|
||||
}
|
||||
};
|
||||
|
||||
template<int CPU, typename Scalar, typename ResScalar, typename DataMapper>
|
||||
struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 1, 4>
|
||||
{
|
||||
using LinearMapper = typename DataMapper::LinearMapper;
|
||||
using AccPacket = typename packet_traits<Scalar>::type;
|
||||
using ResPacket = typename packet_traits<ResScalar>::type;
|
||||
|
||||
AccPacket _acc;
|
||||
|
||||
EIGEN_STRONG_INLINE void zero()
|
||||
{
|
||||
_acc = pset1<AccPacket>(0);
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE void prefetch(const DataMapper&, Index, Index) {}
|
||||
|
||||
template<typename ResPacket_>
|
||||
EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha)
|
||||
{
|
||||
_acc *= pAlpha;
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col)
|
||||
{
|
||||
ResPacket r = dest.template gatherPacket<ResPacket>(row, col) + _acc;
|
||||
dest.template scatterPacket<ResPacket>(row, col, r);
|
||||
}
|
||||
};
|
||||
|
||||
template<int CPU, typename Scalar, typename ResScalar, typename DataMapper>
|
||||
struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 4, 4>
|
||||
{
|
||||
using LinearMapper = typename DataMapper::LinearMapper;
|
||||
using AccPacket = typename packet_traits<Scalar>::type;
|
||||
using ResPacket = typename packet_traits<ResScalar>::type;
|
||||
|
||||
PacketBlock<AccPacket, 4> _acc;
|
||||
|
||||
EIGEN_STRONG_INLINE void zero()
|
||||
{
|
||||
_acc.packet[0] = pset1<AccPacket>(0);
|
||||
_acc.packet[1] = pset1<AccPacket>(0);
|
||||
_acc.packet[2] = pset1<AccPacket>(0);
|
||||
_acc.packet[3] = pset1<AccPacket>(0);
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE void prefetch(const DataMapper& dest, Index row, Index col)
|
||||
{
|
||||
dest.getLinearMapper(row, col + 0).prefetch(0);
|
||||
dest.getLinearMapper(row, col + 1).prefetch(0);
|
||||
dest.getLinearMapper(row, col + 2).prefetch(0);
|
||||
dest.getLinearMapper(row, col + 3).prefetch(0);
|
||||
}
|
||||
|
||||
template<typename ResPacket_>
|
||||
EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha)
|
||||
{
|
||||
_acc.packet[0] *= pAlpha;
|
||||
_acc.packet[1] *= pAlpha;
|
||||
_acc.packet[2] *= pAlpha;
|
||||
_acc.packet[3] *= pAlpha;
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col)
|
||||
{
|
||||
constexpr auto PacketSize = unpacket_traits<ResPacket>::size;
|
||||
|
||||
LinearMapper r0 = dest.getLinearMapper(row, col + 0);
|
||||
LinearMapper r1 = dest.getLinearMapper(row, col + 1);
|
||||
LinearMapper r2 = dest.getLinearMapper(row, col + 2);
|
||||
LinearMapper r3 = dest.getLinearMapper(row, col + 3);
|
||||
|
||||
r0.storePacket(0*PacketSize, r0.template loadPacket<ResPacket>(0*PacketSize) + _acc.packet[0]);
|
||||
r1.storePacket(0*PacketSize, r1.template loadPacket<ResPacket>(0*PacketSize) + _acc.packet[1]);
|
||||
r2.storePacket(0*PacketSize, r2.template loadPacket<ResPacket>(0*PacketSize) + _acc.packet[2]);
|
||||
r3.storePacket(0*PacketSize, r3.template loadPacket<ResPacket>(0*PacketSize) + _acc.packet[3]);
|
||||
}
|
||||
};
|
||||
|
||||
template<int CPU, typename Scalar, typename ResScalar, typename DataMapper>
|
||||
struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 8, 4>
|
||||
{
|
||||
using LinearMapper = typename DataMapper::LinearMapper;
|
||||
using AccPacket = typename packet_traits<Scalar>::type;
|
||||
using ResPacket = typename packet_traits<ResScalar>::type;
|
||||
|
||||
PacketBlock<AccPacket, 4> _acc1;
|
||||
PacketBlock<AccPacket, 4> _acc2;
|
||||
|
||||
EIGEN_STRONG_INLINE void zero()
|
||||
{
|
||||
_acc1.packet[0] = pset1<AccPacket>(0);
|
||||
_acc1.packet[1] = pset1<AccPacket>(0);
|
||||
_acc1.packet[2] = pset1<AccPacket>(0);
|
||||
_acc1.packet[3] = pset1<AccPacket>(0);
|
||||
|
||||
_acc2.packet[0] = pset1<AccPacket>(0);
|
||||
_acc2.packet[1] = pset1<AccPacket>(0);
|
||||
_acc2.packet[2] = pset1<AccPacket>(0);
|
||||
_acc2.packet[3] = pset1<AccPacket>(0);
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE void prefetch(const DataMapper& dest, Index row, Index col)
|
||||
{
|
||||
dest.getLinearMapper(row + 0, col + 0).prefetch(0);
|
||||
dest.getLinearMapper(row + 0, col + 1).prefetch(0);
|
||||
dest.getLinearMapper(row + 0, col + 2).prefetch(0);
|
||||
dest.getLinearMapper(row + 0, col + 3).prefetch(0);
|
||||
}
|
||||
|
||||
template<typename ResPacket_>
|
||||
EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha)
|
||||
{
|
||||
_acc1.packet[0] *= pAlpha;
|
||||
_acc1.packet[1] *= pAlpha;
|
||||
_acc1.packet[2] *= pAlpha;
|
||||
_acc1.packet[3] *= pAlpha;
|
||||
|
||||
_acc2.packet[0] *= pAlpha;
|
||||
_acc2.packet[1] *= pAlpha;
|
||||
_acc2.packet[2] *= pAlpha;
|
||||
_acc2.packet[3] *= pAlpha;
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col)
|
||||
{
|
||||
constexpr auto PacketSize = unpacket_traits<ResPacket>::size;
|
||||
|
||||
LinearMapper r0 = dest.getLinearMapper(row, col + 0);
|
||||
LinearMapper r1 = dest.getLinearMapper(row, col + 1);
|
||||
LinearMapper r2 = dest.getLinearMapper(row, col + 2);
|
||||
LinearMapper r3 = dest.getLinearMapper(row, col + 3);
|
||||
|
||||
r0.storePacket(0*PacketSize, r0.template loadPacket<ResPacket>(0*PacketSize) + _acc1.packet[0]);
|
||||
r1.storePacket(0*PacketSize, r1.template loadPacket<ResPacket>(0*PacketSize) + _acc1.packet[1]);
|
||||
r2.storePacket(0*PacketSize, r2.template loadPacket<ResPacket>(0*PacketSize) + _acc1.packet[2]);
|
||||
r3.storePacket(0*PacketSize, r3.template loadPacket<ResPacket>(0*PacketSize) + _acc1.packet[3]);
|
||||
|
||||
r0.storePacket(1*PacketSize, r0.template loadPacket<ResPacket>(1*PacketSize) + _acc2.packet[0]);
|
||||
r1.storePacket(1*PacketSize, r1.template loadPacket<ResPacket>(1*PacketSize) + _acc2.packet[1]);
|
||||
r2.storePacket(1*PacketSize, r2.template loadPacket<ResPacket>(1*PacketSize) + _acc2.packet[2]);
|
||||
r3.storePacket(1*PacketSize, r3.template loadPacket<ResPacket>(1*PacketSize) + _acc2.packet[3]);
|
||||
}
|
||||
};
|
||||
|
||||
template<int CPU, typename Scalar, typename ResScalar, typename DataMapper>
|
||||
struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 12, 4>
|
||||
{
|
||||
using LinearMapper = typename DataMapper::LinearMapper;
|
||||
using AccPacket = typename packet_traits<Scalar>::type;
|
||||
using ResPacket = typename packet_traits<ResScalar>::type;
|
||||
|
||||
PacketBlock<AccPacket, 4> _acc1;
|
||||
PacketBlock<AccPacket, 4> _acc2;
|
||||
PacketBlock<AccPacket, 4> _acc3;
|
||||
|
||||
EIGEN_STRONG_INLINE void zero()
|
||||
{
|
||||
_acc1.packet[0] = pset1<AccPacket>(0);
|
||||
_acc1.packet[1] = pset1<AccPacket>(0);
|
||||
_acc1.packet[2] = pset1<AccPacket>(0);
|
||||
_acc1.packet[3] = pset1<AccPacket>(0);
|
||||
|
||||
_acc2.packet[0] = pset1<AccPacket>(0);
|
||||
_acc2.packet[1] = pset1<AccPacket>(0);
|
||||
_acc2.packet[2] = pset1<AccPacket>(0);
|
||||
_acc2.packet[3] = pset1<AccPacket>(0);
|
||||
|
||||
_acc3.packet[0] = pset1<AccPacket>(0);
|
||||
_acc3.packet[1] = pset1<AccPacket>(0);
|
||||
_acc3.packet[2] = pset1<AccPacket>(0);
|
||||
_acc3.packet[3] = pset1<AccPacket>(0);
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE void prefetch(const DataMapper& dest, Index row, Index col)
|
||||
{
|
||||
dest.getLinearMapper(row + 0, col + 0).prefetch(0);
|
||||
dest.getLinearMapper(row + 0, col + 1).prefetch(0);
|
||||
dest.getLinearMapper(row + 0, col + 2).prefetch(0);
|
||||
dest.getLinearMapper(row + 0, col + 3).prefetch(0);
|
||||
}
|
||||
|
||||
template<typename ResPacket_>
|
||||
EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha)
|
||||
{
|
||||
_acc1.packet[0] *= pAlpha;
|
||||
_acc1.packet[1] *= pAlpha;
|
||||
_acc1.packet[2] *= pAlpha;
|
||||
_acc1.packet[3] *= pAlpha;
|
||||
|
||||
_acc2.packet[0] *= pAlpha;
|
||||
_acc2.packet[1] *= pAlpha;
|
||||
_acc2.packet[2] *= pAlpha;
|
||||
_acc2.packet[3] *= pAlpha;
|
||||
|
||||
_acc3.packet[0] *= pAlpha;
|
||||
_acc3.packet[1] *= pAlpha;
|
||||
_acc3.packet[2] *= pAlpha;
|
||||
_acc3.packet[3] *= pAlpha;
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col)
|
||||
{
|
||||
constexpr auto PacketSize = unpacket_traits<ResPacket>::size;
|
||||
|
||||
LinearMapper r0 = dest.getLinearMapper(row, col + 0);
|
||||
LinearMapper r1 = dest.getLinearMapper(row, col + 1);
|
||||
LinearMapper r2 = dest.getLinearMapper(row, col + 2);
|
||||
LinearMapper r3 = dest.getLinearMapper(row, col + 3);
|
||||
|
||||
r0.storePacket(0*PacketSize, r0.template loadPacket<ResPacket>(0*PacketSize) + _acc1.packet[0]);
|
||||
r1.storePacket(0*PacketSize, r1.template loadPacket<ResPacket>(0*PacketSize) + _acc1.packet[1]);
|
||||
r2.storePacket(0*PacketSize, r2.template loadPacket<ResPacket>(0*PacketSize) + _acc1.packet[2]);
|
||||
r3.storePacket(0*PacketSize, r3.template loadPacket<ResPacket>(0*PacketSize) + _acc1.packet[3]);
|
||||
|
||||
r0.storePacket(1*PacketSize, r0.template loadPacket<ResPacket>(1*PacketSize) + _acc2.packet[0]);
|
||||
r1.storePacket(1*PacketSize, r1.template loadPacket<ResPacket>(1*PacketSize) + _acc2.packet[1]);
|
||||
r2.storePacket(1*PacketSize, r2.template loadPacket<ResPacket>(1*PacketSize) + _acc2.packet[2]);
|
||||
r3.storePacket(1*PacketSize, r3.template loadPacket<ResPacket>(1*PacketSize) + _acc2.packet[3]);
|
||||
|
||||
r0.storePacket(2*PacketSize, r0.template loadPacket<ResPacket>(2*PacketSize) + _acc3.packet[0]);
|
||||
r1.storePacket(2*PacketSize, r1.template loadPacket<ResPacket>(2*PacketSize) + _acc3.packet[1]);
|
||||
r2.storePacket(2*PacketSize, r2.template loadPacket<ResPacket>(2*PacketSize) + _acc3.packet[2]);
|
||||
r3.storePacket(2*PacketSize, r3.template loadPacket<ResPacket>(2*PacketSize) + _acc3.packet[3]);
|
||||
}
|
||||
};
|
||||
|
||||
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
|
||||
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 4, __UNROLL__ , 4>
|
||||
{
|
||||
EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
|
||||
RhsPackMap& rhsPackMap,
|
||||
Index rowIdx, Index colIdx, Index depthIdx,
|
||||
Accumulator& acc)
|
||||
{
|
||||
using LhsPacket = typename packet_traits<LhsScalar>::type;
|
||||
using RhsPacket = typename packet_traits<RhsScalar>::type;
|
||||
|
||||
asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x4x4\n\t");
|
||||
|
||||
LhsPacket pLhs;
|
||||
RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3;
|
||||
|
||||
MICRO_4x1x4();
|
||||
MICRO_4x1x4();
|
||||
MICRO_4x1x4();
|
||||
MICRO_4x1x4();
|
||||
#if __UNROLL__ > 4
|
||||
MICRO_4x1x4();
|
||||
MICRO_4x1x4();
|
||||
MICRO_4x1x4();
|
||||
MICRO_4x1x4();
|
||||
#endif
|
||||
|
||||
asm __volatile__("#END_NEON_MICROKERNEL_4x4x4\n\t");
|
||||
};
|
||||
};
|
||||
|
||||
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
|
||||
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 8, __UNROLL__, 4>
|
||||
{
|
||||
EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
|
||||
RhsPackMap& rhsPackMap,
|
||||
Index rowIdx, Index colIdx, Index depthIdx,
|
||||
Accumulator& acc)
|
||||
{
|
||||
using LhsPacket = typename packet_traits<LhsScalar>::type;
|
||||
using RhsPacket = typename packet_traits<RhsScalar>::type;
|
||||
|
||||
asm __volatile__("#BEGIN_NEON_MICROKERNEL_8x8x4\n\t");
|
||||
|
||||
LhsPacket pLhs, pLhs2;
|
||||
RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3;
|
||||
|
||||
#if __UNROLL__ == 8
|
||||
#ifdef __ENABLE_PREFETCH__
|
||||
prefetch(rhsPackMap.pCur + (48+0));
|
||||
#endif
|
||||
MICRO_8x1x4();
|
||||
MICRO_8x1x4();
|
||||
MICRO_8x1x4();
|
||||
MICRO_8x1x4();
|
||||
#ifdef __ENABLE_PREFETCH__
|
||||
prefetch(rhsPackMap.pCur + (48+16));
|
||||
#endif
|
||||
MICRO_8x1x4();
|
||||
MICRO_8x1x4();
|
||||
MICRO_8x1x4();
|
||||
MICRO_8x1x4();
|
||||
#else
|
||||
MICRO_8x1x4();
|
||||
MICRO_8x1x4();
|
||||
MICRO_8x1x4();
|
||||
MICRO_8x1x4();
|
||||
#endif
|
||||
asm __volatile__("#END_NEON_MICROKERNEL_8x8x4\n\t");
|
||||
};
|
||||
};
|
||||
|
||||
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
|
||||
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 12, __UNROLL__, 4>
|
||||
{
|
||||
EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
|
||||
RhsPackMap& rhsPackMap,
|
||||
Index rowIdx, Index colIdx, Index depthIdx,
|
||||
Accumulator& acc)
|
||||
{
|
||||
using LhsPacket = typename packet_traits<LhsScalar>::type;
|
||||
using RhsPacket = typename packet_traits<RhsScalar>::type;
|
||||
|
||||
asm __volatile__("#BEGIN_NEON_MICROKERNEL_12x8x4\n\t");
|
||||
|
||||
LhsPacket pLhs, pLhs2, pLhs3;
|
||||
RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3;
|
||||
|
||||
#if __UNROLL__ == 8
|
||||
#ifdef __ENABLE_PREFETCH__
|
||||
prefetch(rhsPackMap.pCur);
|
||||
#endif
|
||||
MICRO_12x1x4();
|
||||
MICRO_12x1x4();
|
||||
MICRO_12x1x4();
|
||||
MICRO_12x1x4();
|
||||
MICRO_12x1x4();
|
||||
MICRO_12x1x4();
|
||||
MICRO_12x1x4();
|
||||
MICRO_12x1x4();
|
||||
#else
|
||||
MICRO_12x1x4();
|
||||
MICRO_12x1x4();
|
||||
MICRO_12x1x4();
|
||||
MICRO_12x1x4();
|
||||
#endif
|
||||
asm __volatile__("#END_NEON_MICROKERNEL_12x8x4\n\t");
|
||||
};
|
||||
};
|
||||
|
||||
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
|
||||
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 12, 1, 4>
|
||||
{
|
||||
EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
|
||||
RhsPackMap& rhsPackMap,
|
||||
Index rowIdx, Index colIdx, Index depthIdx,
|
||||
Accumulator& acc)
|
||||
{
|
||||
using LhsPacket = typename packet_traits<LhsScalar>::type;
|
||||
using RhsPacket = typename packet_traits<RhsScalar>::type;
|
||||
|
||||
asm __volatile__("#BEGIN_NEON_MICROKERNEL_12x1x4\n\t");
|
||||
|
||||
LhsPacket pLhs, pLhs2, pLhs3;
|
||||
RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3;
|
||||
|
||||
MICRO_12x1x4();
|
||||
|
||||
asm __volatile__("#END_NEON_MICROKERNEL_12x1x4\n\t");
|
||||
};
|
||||
};
|
||||
|
||||
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
|
||||
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 8, 1, 4>
|
||||
{
|
||||
EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
|
||||
RhsPackMap& rhsPackMap,
|
||||
Index rowIdx, Index colIdx, Index depthIdx,
|
||||
Accumulator& acc)
|
||||
{
|
||||
using LhsPacket = typename packet_traits<LhsScalar>::type;
|
||||
using RhsPacket = typename packet_traits<RhsScalar>::type;
|
||||
|
||||
asm __volatile__("#BEGIN_NEON_MICROKERNEL_8x1x4\n\t");
|
||||
|
||||
LhsPacket pLhs, pLhs2;
|
||||
RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3;
|
||||
|
||||
MICRO_8x1x4();
|
||||
|
||||
asm __volatile__("#END_NEON_MICROKERNEL_8x1x4\n\t");
|
||||
};
|
||||
};
|
||||
|
||||
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
|
||||
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 4, 1, 4>
|
||||
{
|
||||
EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
|
||||
RhsPackMap& rhsPackMap,
|
||||
Index rowIdx, Index colIdx, Index depthIdx,
|
||||
Accumulator& acc)
|
||||
{
|
||||
using LhsPacket = typename packet_traits<LhsScalar>::type;
|
||||
using RhsPacket = typename packet_traits<RhsScalar>::type;
|
||||
|
||||
asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x1x4\n\t");
|
||||
|
||||
LhsPacket pLhs;
|
||||
RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3;
|
||||
|
||||
MICRO_4x1x4();
|
||||
|
||||
asm __volatile__("#END_NEON_MICROKERNEL_4x1x4\n\t");
|
||||
};
|
||||
};
|
||||
|
||||
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
|
||||
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 12, __UNROLL__, 1>
|
||||
{
|
||||
EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
|
||||
RhsPackMap& rhsPackMap,
|
||||
Index rowIdx, Index colIdx, Index depthIdx,
|
||||
Accumulator& acc)
|
||||
{
|
||||
using LhsPacket = typename packet_traits<LhsScalar>::type;
|
||||
using RhsPacket = typename packet_traits<RhsScalar>::type;
|
||||
|
||||
LhsPacket pLhs;
|
||||
RhsPacket pRhs;
|
||||
|
||||
asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x1x1\n\t");
|
||||
|
||||
MICRO_12x1x1();
|
||||
MICRO_12x1x1();
|
||||
MICRO_12x1x1();
|
||||
MICRO_12x1x1();
|
||||
MICRO_12x1x1();
|
||||
MICRO_12x1x1();
|
||||
MICRO_12x1x1();
|
||||
MICRO_12x1x1();
|
||||
|
||||
asm __volatile__("#END_NEON_MICROKERNEL_4x1x1\n\t");
|
||||
};
|
||||
};
|
||||
|
||||
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
|
||||
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 12, 1, 1>
|
||||
{
|
||||
EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
|
||||
RhsPackMap& rhsPackMap,
|
||||
Index rowIdx, Index colIdx, Index depthIdx,
|
||||
Accumulator& acc)
|
||||
{
|
||||
using LhsPacket = typename packet_traits<LhsScalar>::type;
|
||||
using RhsPacket = typename packet_traits<RhsScalar>::type;
|
||||
|
||||
LhsPacket pLhs;
|
||||
RhsPacket pRhs;
|
||||
|
||||
asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x1x1\n\t");
|
||||
|
||||
MICRO_12x1x1();
|
||||
|
||||
asm __volatile__("#END_NEON_MICROKERNEL_4x1x1\n\t");
|
||||
};
|
||||
};
|
||||
|
||||
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
|
||||
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 8, __UNROLL__, 1>
|
||||
{
|
||||
EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
|
||||
RhsPackMap& rhsPackMap,
|
||||
Index rowIdx, Index colIdx, Index depthIdx,
|
||||
Accumulator& acc)
|
||||
{
|
||||
using LhsPacket = typename packet_traits<LhsScalar>::type;
|
||||
using RhsPacket = typename packet_traits<RhsScalar>::type;
|
||||
|
||||
asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x1x1\n\t");
|
||||
|
||||
LhsPacket pLhs;
|
||||
RhsPacket pRhs;
|
||||
|
||||
MICRO_8x1x1();
|
||||
MICRO_8x1x1();
|
||||
MICRO_8x1x1();
|
||||
MICRO_8x1x1();
|
||||
MICRO_8x1x1();
|
||||
MICRO_8x1x1();
|
||||
MICRO_8x1x1();
|
||||
MICRO_8x1x1();
|
||||
|
||||
asm __volatile__("#END_NEON_MICROKERNEL_4x1x1\n\t");
|
||||
};
|
||||
};
|
||||
|
||||
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
|
||||
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 8, 1, 1>
|
||||
{
|
||||
EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
|
||||
RhsPackMap& rhsPackMap,
|
||||
Index rowIdx, Index colIdx, Index depthIdx,
|
||||
Accumulator& acc)
|
||||
{
|
||||
using LhsPacket = typename packet_traits<LhsScalar>::type;
|
||||
using RhsPacket = typename packet_traits<RhsScalar>::type;
|
||||
|
||||
asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x1x1\n\t");
|
||||
|
||||
LhsPacket pLhs;
|
||||
RhsPacket pRhs;
|
||||
|
||||
MICRO_8x1x1();
|
||||
|
||||
asm __volatile__("#END_NEON_MICROKERNEL_4x1x1\n\t");
|
||||
};
|
||||
};
|
||||
|
||||
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
|
||||
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 4, __UNROLL__, 1>
|
||||
{
|
||||
EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
|
||||
RhsPackMap& rhsPackMap,
|
||||
Index rowIdx, Index colIdx, Index depthIdx,
|
||||
Accumulator& acc)
|
||||
{
|
||||
using LhsPacket = typename packet_traits<LhsScalar>::type;
|
||||
using RhsPacket = typename packet_traits<RhsScalar>::type;
|
||||
|
||||
asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x1x1\n\t");
|
||||
|
||||
LhsPacket pLhs;
|
||||
RhsPacket pRhs;
|
||||
|
||||
MICRO_4x1x1();
|
||||
MICRO_4x1x1();
|
||||
MICRO_4x1x1();
|
||||
MICRO_4x1x1();
|
||||
MICRO_4x1x1();
|
||||
MICRO_4x1x1();
|
||||
MICRO_4x1x1();
|
||||
MICRO_4x1x1();
|
||||
|
||||
asm __volatile__("#END_NEON_MICROKERNEL_4x1x1\n\t");
|
||||
};
|
||||
};
|
||||
|
||||
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
|
||||
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 4, 1, 1>
|
||||
{
|
||||
EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
|
||||
RhsPackMap& rhsPackMap,
|
||||
Index rowIdx, Index colIdx, Index depthIdx,
|
||||
Accumulator& acc)
|
||||
{
|
||||
using LhsPacket = typename packet_traits<LhsScalar>::type;
|
||||
using RhsPacket = typename packet_traits<RhsScalar>::type;
|
||||
|
||||
asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x1x1\n\t");
|
||||
|
||||
LhsPacket pLhs;
|
||||
RhsPacket pRhs;
|
||||
|
||||
MICRO_4x1x1();
|
||||
|
||||
asm __volatile__("#END_NEON_MICROKERNEL_4x1x1\n\t");
|
||||
};
|
||||
};
|
||||
|
||||
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
|
||||
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 1, 1, 4>
|
||||
{
|
||||
EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
|
||||
RhsPackMap& rhsPackMap,
|
||||
Index rowIdx, Index colIdx, Index depthIdx,
|
||||
Accumulator& acc)
|
||||
{
|
||||
using RhsPacket = typename packet_traits<RhsScalar>::type;
|
||||
using LhsPacket = typename packet_traits<LhsScalar>::type;
|
||||
|
||||
asm __volatile__("#BEGIN_NEON_MICROKERNEL_1x1x4\n\t");
|
||||
|
||||
LhsPacket pLhs = pset1<LhsPacket>(*lhsPackMap.pCur);
|
||||
RhsPacket pRhs = pload<RhsPacket>(rhsPackMap.pCur);
|
||||
|
||||
acc._acc += pLhs*pRhs;
|
||||
|
||||
lhsPackMap.advance(1);
|
||||
rhsPackMap.advance(4*1);
|
||||
asm __volatile__("#END_NEON_MICROKERNEL_1x1x4\n\t");
|
||||
};
|
||||
};
|
||||
|
||||
#endif // __ENABLE_VECTOR_KERNELS__
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_KERNELS_NEON_H
|
||||
523
Eigen/src/Core/arch/NEON/MatrixProduct.h
Normal file
523
Eigen/src/Core/arch/NEON/MatrixProduct.h
Normal file
@@ -0,0 +1,523 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2021 Everton Constantino (everton.constantino@hotmail.com)
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#ifndef EIGEN_MATRIX_PRODUCT_NEON_H
|
||||
#define EIGEN_MATRIX_PRODUCT_NEON_H
|
||||
|
||||
#ifdef __DEBUG__
|
||||
#include <iostream>
|
||||
#endif
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
namespace internal {
|
||||
|
||||
#ifndef __UNROLL__
|
||||
#define __UNROLL__ 8
|
||||
#endif
|
||||
|
||||
// Number of rows in the SHAPES kernel table for an architecture / CPU / scalar combination.
template<int Architecture, int CPU, typename LhsScalar, typename RhsScalar>
constexpr int SHAPES_COUNT = 14;

// Column layout of one SHAPES row: three "progress" columns followed by three
// "pointer" columns. SHAPES_POINTER_END marks a pointer that leads nowhere.
constexpr int SHAPES_DIMENSION{6};
constexpr int SHAPES_LHS_DIMENSION{0};
constexpr int SHAPES_DEP_DIMENSION{1};
constexpr int SHAPES_RHS_DIMENSION{2};
constexpr int SHAPES_RHS_POINTER{3};
constexpr int SHAPES_LHS_POINTER{4};
constexpr int SHAPES_DEP_POINTER{5};
constexpr int SHAPES_POINTER_END{-1};

// Number of rows in the PACK_SHAPES table: 2 by default, 4 when packing the lhs.
template<int Architecture, int CPU, typename Scalar, bool isLhs>
constexpr int PACK_SHAPES_COUNT = 2;

template<int Architecture, int CPU, typename Scalar>
constexpr int PACK_SHAPES_COUNT<Architecture, CPU, Scalar, true> = 4;

// Column layout of one PACK_SHAPES row; PACK_SHAPES_POINTER is the column that
// holds the index of the next row, PACK_SHAPES_END terminates the chain.
constexpr int PACK_SHAPES_DIMENSION{3};
constexpr int PACK_SHAPES_POINTER{2};
constexpr int PACK_SHAPES_END{-1};
|
||||
|
||||
// lhs_progress x depth_progress x rhs_progress (depth_progress > 1 matrix ops) x pointer to next rhs_progress on the shapes map
|
||||
template<int Architecture, int CPU, typename LhsScalar, typename RhsScalar>
|
||||
constexpr int SHAPES[SHAPES_COUNT<Architecture, CPU, LhsScalar,RhsScalar>][SHAPES_DIMENSION] =
|
||||
{ /* 00 */{ 1, 1,1,SHAPES_POINTER_END, SHAPES_POINTER_END, SHAPES_POINTER_END},
|
||||
/* 01 */{1*packet_traits<RhsScalar>::size, 1,1, 0, 0, SHAPES_POINTER_END},
|
||||
/* 02 */{1*packet_traits<RhsScalar>::size,__UNROLL__,1, 0, 0, 1},
|
||||
/* 03 */{2*packet_traits<RhsScalar>::size, 1,1, 0, 2, SHAPES_POINTER_END},
|
||||
/* 04 */{2*packet_traits<RhsScalar>::size,__UNROLL__,1, 0, 2, 3},
|
||||
/* 05 */{3*packet_traits<RhsScalar>::size, 1,1, 0, 4, SHAPES_POINTER_END},
|
||||
/* 06 */{3*packet_traits<RhsScalar>::size,__UNROLL__,1, 0, 4, 5},
|
||||
/* 07 */{ 1, 1,4, 6, SHAPES_POINTER_END, SHAPES_POINTER_END},
|
||||
/* 08 */{1*packet_traits<RhsScalar>::size, 1,4, 6, 7, SHAPES_POINTER_END},
|
||||
/* 09 */{1*packet_traits<RhsScalar>::size,__UNROLL__,4, 6, 7, 8},
|
||||
/* 10 */{2*packet_traits<RhsScalar>::size, 1,4, 6, 9, SHAPES_POINTER_END},
|
||||
/* 11 */{2*packet_traits<RhsScalar>::size,__UNROLL__,4, 6, 9, 10},
|
||||
/* 12 */{3*packet_traits<RhsScalar>::size, 1,4, 6, 11, SHAPES_POINTER_END},
|
||||
/* 13 */{3*packet_traits<RhsScalar>::size,__UNROLL__,4, 6, 11, 12}};
|
||||
|
||||
// d1progress x d2progress
|
||||
template<int Architecture, int CPU, typename Scalar, bool isLhs>
|
||||
constexpr int PACK_SHAPES[PACK_SHAPES_COUNT<Architecture, CPU, Scalar, isLhs>][PACK_SHAPES_DIMENSION] =
|
||||
{{ 1, 1, PACK_SHAPES_END},
|
||||
{ 4, 1, 0}};
|
||||
|
||||
template<int Architecture, int CPU, typename Scalar>
|
||||
constexpr int PACK_SHAPES<Architecture, CPU, Scalar, true>[PACK_SHAPES_COUNT<Architecture, CPU, Scalar, true>][PACK_SHAPES_DIMENSION] =
|
||||
{{ 1, 1, PACK_SHAPES_END},
|
||||
{1*packet_traits<Scalar>::size, 1, 0},
|
||||
{2*packet_traits<Scalar>::size, 1, 1},
|
||||
{3*packet_traits<Scalar>::size, 1, 2}};
|
||||
|
||||
template<int Architecture, int CPU, typename Index, typename Scalar, bool isLhs, typename DataMapper, bool Conjugate, bool PanelMode, int StorageOrder, int M, int N>
|
||||
struct PackingOperator
|
||||
{
|
||||
EIGEN_STRONG_INLINE Scalar* operator()(Index d1Idx, Index d2Idx, Scalar *block, const DataMapper& data)
|
||||
{
|
||||
#ifdef __DEBUG__
|
||||
std::cout << M << "x" << N << " ( " << d1Idx << ", " << d2Idx <<") -> ( " << d1Idx + M << ", " << d2Idx + N << ") ";
|
||||
#endif
|
||||
Scalar *c = block;
|
||||
for(auto i = 0; i < M; i++)
|
||||
for(auto j = 0; j < N; j++)
|
||||
{
|
||||
if(isLhs)
|
||||
*c = data(d1Idx + i, d2Idx + j);
|
||||
else
|
||||
*c = data(d2Idx + j, d1Idx + i);
|
||||
#ifdef __DEBUG__
|
||||
std::cout << *c << " ";
|
||||
#endif
|
||||
c++;
|
||||
}
|
||||
#ifdef __DEBUG__
|
||||
std::cout << std::endl;
|
||||
#endif
|
||||
return c;
|
||||
}
|
||||
};
|
||||
|
||||
template<int Architecture, int CPU, typename Index, typename Scalar, bool isLhs, typename DataMapper, bool Conjugate, bool PanelMode, int StorageOrder, int D1PROGRESS, int IDX>
|
||||
struct PackingInnerStruct
|
||||
{
|
||||
EIGEN_STRONG_INLINE Scalar* operator()(Index d1Idx, Index d2Idx, Scalar *block, const DataMapper& data, Index d1Size, Index d2Size, Index stride, Index offset)
|
||||
{
|
||||
constexpr auto d2Progress = PACK_SHAPES<Architecture, CPU, Scalar, isLhs>[IDX][1];
|
||||
PackingOperator<Architecture, CPU, Index, Scalar, isLhs, DataMapper, Conjugate, PanelMode, StorageOrder, D1PROGRESS, d2Progress> po;
|
||||
|
||||
for(;d2Idx + d2Progress <= d2Size; d2Idx+=d2Progress)
|
||||
{
|
||||
block = po(d1Idx, d2Idx, block, data);
|
||||
}
|
||||
|
||||
if(PACK_SHAPES<Architecture, CPU, Scalar, isLhs>[IDX-1][0] == D1PROGRESS)
|
||||
{
|
||||
PackingInnerStruct<Architecture, CPU, Index, Scalar, isLhs, DataMapper, Conjugate, PanelMode, StorageOrder, D1PROGRESS, IDX-1> pis;
|
||||
block = pis(d1Idx, d2Idx, block, data, d1Size, d2Size, stride, offset);
|
||||
}
|
||||
return block;
|
||||
}
|
||||
};
|
||||
|
||||
template<int Architecture, int CPU, typename Index, typename Scalar, bool isLhs, typename DataMapper, bool Conjugate, bool PanelMode, int StorageOrder, int D1PROGRESS>
|
||||
struct PackingInnerStruct<Architecture, CPU, Index, Scalar, isLhs, DataMapper, Conjugate, PanelMode, StorageOrder, D1PROGRESS, 0>
|
||||
{
|
||||
EIGEN_STRONG_INLINE Scalar* operator()(Index d1Idx, Index d2Idx, Scalar *block, const DataMapper& data, Index d1Size, Index d2Size, Index stride, Index offset)
|
||||
{
|
||||
constexpr auto d2Progress = PACK_SHAPES<Architecture, CPU, Scalar, isLhs>[0][1];
|
||||
for(;d2Idx + d2Progress <= d2Size; d2Idx+=d2Progress)
|
||||
{
|
||||
PackingOperator<Architecture, CPU, Index, Scalar, isLhs, DataMapper, Conjugate, PanelMode, StorageOrder, D1PROGRESS, d2Progress> po;
|
||||
block = po(d1Idx, d2Idx, block, data);
|
||||
}
|
||||
return block;
|
||||
}
|
||||
};
|
||||
|
||||
template<int Architecture, int CPU, typename Index, typename Scalar, bool isLhs, typename DataMapper, bool Conjugate, bool PanelMode, int StorageOrder, int PACK_SHAPE_IDX>
|
||||
struct PackingStruct
|
||||
{
|
||||
PackingStruct<Architecture, CPU, Index, Scalar, isLhs, DataMapper, Conjugate, PanelMode, StorageOrder, PACK_SHAPES<Architecture, CPU, Scalar, isLhs>[PACK_SHAPE_IDX][PACK_SHAPES_POINTER]> ps;
|
||||
|
||||
EIGEN_STRONG_INLINE Scalar* operator()(Index d1Idx, Scalar *block, const DataMapper& data, Index d1Size, Index d2Size, Index stride, Index offset)
|
||||
{
|
||||
constexpr auto d1Progress = PACK_SHAPES<Architecture, CPU, Scalar, isLhs>[PACK_SHAPE_IDX][0];
|
||||
|
||||
for(; d1Idx + d1Progress <= d1Size; d1Idx += d1Progress)
|
||||
{
|
||||
PackingInnerStruct<Architecture, CPU, Index, Scalar, isLhs, DataMapper, Conjugate, PanelMode, StorageOrder, d1Progress, PACK_SHAPE_IDX> pis;
|
||||
block = pis(d1Idx, 0, block, data, d1Size, d2Size, stride, offset);
|
||||
}
|
||||
return ps(d1Idx, block, data, d1Size, d2Size, stride, offset);
|
||||
}
|
||||
};
|
||||
|
||||
template<int Architecture, int CPU, typename Index, typename Scalar, bool isLhs, typename DataMapper, bool Conjugate, bool PanelMode, int StorageOrder>
|
||||
struct PackingStruct<Architecture, CPU, Index, Scalar, isLhs, DataMapper, Conjugate, PanelMode, StorageOrder, -1>
|
||||
{
|
||||
EIGEN_STRONG_INLINE Scalar* operator()(Index, Scalar *block, const DataMapper&, Index, Index, Index, Index) { return block; }
|
||||
};
|
||||
|
||||
template<int Architecture, int CPU, typename Index, typename Scalar, typename DataMapper, bool Conjugate, bool PanelMode, int StorageOrder>
|
||||
struct lhs_pack
|
||||
{
|
||||
EIGEN_STRONG_INLINE void operator()(Scalar *blockA, const DataMapper &lhs, Index depth, Index rows, Index stride, Index offset)
|
||||
{
|
||||
PackingStruct<Architecture, CPU, Index, Scalar, true, DataMapper, Conjugate, PanelMode, StorageOrder, PACK_SHAPES_COUNT<Architecture, CPU, Scalar, true>-1> ps;
|
||||
ps(0, blockA, lhs, rows, depth, stride, offset);
|
||||
}
|
||||
};
|
||||
|
||||
template<int Architecture, int CPU, typename Index, typename Scalar, typename DataMapper, bool Conjugate, bool PanelMode, int StorageOrder>
|
||||
struct rhs_pack
|
||||
{
|
||||
EIGEN_STRONG_INLINE void operator()(Scalar *blockB, const DataMapper &rhs, Index depth, Index cols, Index stride, Index offset)
|
||||
{
|
||||
PackingStruct<Architecture, CPU, Index, Scalar, false, DataMapper, Conjugate, PanelMode, StorageOrder, PACK_SHAPES_COUNT<Architecture, CPU, Scalar, false>-1> ps;
|
||||
ps(0, blockB, rhs, cols, depth, stride, offset);
|
||||
}
|
||||
};
|
||||
|
||||
template<int Architecture, int CPU, typename Index, typename Scalar, typename DataMapper, bool isLhs, int IDX>
|
||||
struct PackMapCalculator
|
||||
{
|
||||
PackMapCalculator<Architecture, CPU, Index, Scalar, DataMapper, isLhs, PACK_SHAPES<Architecture, CPU, Scalar, isLhs>[IDX][PACK_SHAPES_POINTER]> pmc;
|
||||
EIGEN_STRONG_INLINE Index getPosition(Index pos, Index d2Size)
|
||||
{
|
||||
constexpr auto d1Progress = PACK_SHAPES<Architecture, CPU, Scalar, isLhs>[IDX][0];
|
||||
Index v = (pos / d1Progress) * d1Progress;
|
||||
return v*d2Size + pmc.getPosition(pos - v, d2Size);
|
||||
}
|
||||
};
|
||||
|
||||
template<int Architecture, int CPU, typename Index, typename Scalar, typename DataMapper, bool isLhs>
|
||||
struct PackMapCalculator<Architecture, CPU, Index, Scalar, DataMapper, isLhs, -1>
|
||||
{
|
||||
EIGEN_STRONG_INLINE Index getPosition(Index, Index) { return Index(0); }
|
||||
};
|
||||
|
||||
template<int Architecture, int CPU, typename Index, typename Scalar, typename DataMapper, bool isLhs>
|
||||
struct PackMap
|
||||
{
|
||||
const Scalar *pBase;
|
||||
const Scalar *pCur;
|
||||
Index stride;
|
||||
Index offset;
|
||||
Index d2Size;
|
||||
PackMapCalculator<Architecture, CPU, Index, Scalar, DataMapper, isLhs, PACK_SHAPES_COUNT<Architecture, CPU, Scalar, isLhs>-1> pmc;
|
||||
|
||||
PackMap(const Scalar *base, Index d2Size, Index stride, Index offset) : pBase(base), pCur(base), d2Size(d2Size), stride(stride), offset(offset) {}
|
||||
|
||||
EIGEN_STRONG_INLINE void resetCur() { pCur = pBase; }
|
||||
EIGEN_STRONG_INLINE void moveTo(Index p1) { pCur = pBase + pmc.getPosition(p1, d2Size); }
|
||||
EIGEN_STRONG_INLINE void advance(int progress) { pCur += progress; }
|
||||
};
|
||||
|
||||
template<int Architecture, int CPU, typename Scalar, typename ResScalar, typename DataMapper, int M, int N>
|
||||
struct Accumulator
|
||||
{
|
||||
Scalar dt[M][N];
|
||||
|
||||
EIGEN_STRONG_INLINE void zero()
|
||||
{
|
||||
for(auto i = 0; i < M; i++)
|
||||
{
|
||||
for(auto j = 0; j < N; j++)
|
||||
{
|
||||
dt[i][j] = Scalar(0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE void prefetch(const DataMapper&, Index, Index) {}
|
||||
|
||||
template<typename ResPacket>
|
||||
EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket& pAlpha)
|
||||
{
|
||||
for(auto i = 0; i < M; i++)
|
||||
{
|
||||
for(auto j = 0; j < N; j++)
|
||||
{
|
||||
dt[i][j] *= alpha;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col)
|
||||
{
|
||||
for(auto i = 0; i < M; i++)
|
||||
{
|
||||
for(auto j = 0; j < N; j++)
|
||||
{
|
||||
dest(row + i, col + j) += dt[i][j];
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template<int Architecture, int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator, int M, int K, int N>
|
||||
struct MicroKernel
|
||||
{
|
||||
EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
|
||||
RhsPackMap& rhsPackMap,
|
||||
Index rowIdx, Index colIdx, Index depthIdx,
|
||||
Accumulator& acc)
|
||||
{
|
||||
#ifdef __DEBUG__
|
||||
std::cout << "Kernel " << M << " x " << K << " x " << N << " @ " << rowIdx << ", " << depthIdx << ", " << colIdx << std::endl;
|
||||
std::cout << "LHS ";
|
||||
for(auto i = 0; i < M; i++)
|
||||
{
|
||||
for(auto j = 0; j < K; j++)
|
||||
{
|
||||
std::cout << lhsPackMap.pCur[i*K + j] << " ";
|
||||
}
|
||||
}
|
||||
std::cout << std::endl << "RHS ";
|
||||
for(auto i = 0; i < K; i++)
|
||||
{
|
||||
for(auto j = 0; j < N; j++)
|
||||
{
|
||||
std::cout << rhsPackMap.pCur[i*N + j] << " ";
|
||||
}
|
||||
}
|
||||
std::cout << std::endl;
|
||||
#endif
|
||||
const RhsScalar *pRhs = rhsPackMap.pCur;
|
||||
for(auto i = 0; i < N; i++)
|
||||
{
|
||||
const LhsScalar *pLhs = lhsPackMap.pCur;
|
||||
for(auto j = 0; j < M; j++)
|
||||
{
|
||||
acc.dt[j][i] += pRhs[i]*pLhs[j];
|
||||
}
|
||||
}
|
||||
lhsPackMap.advance(M*K);
|
||||
rhsPackMap.advance(K*N);
|
||||
};
|
||||
};
|
||||
|
||||
template<int Architecture, int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename ResPacket, typename DataMapper, int RHS_SHAPE_IDX, int LHS_SHAPE_IDX, int IDX>
|
||||
struct DepthLoopStruct
|
||||
{
|
||||
static constexpr auto PREVIOUS = SHAPES<Architecture, CPU, LhsScalar, RhsScalar>[IDX][SHAPES_DEP_POINTER];
|
||||
|
||||
DepthLoopStruct<Architecture, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, ResPacket, DataMapper, RHS_SHAPE_IDX, LHS_SHAPE_IDX, PREVIOUS> depthLS;
|
||||
|
||||
EIGEN_STRONG_INLINE void operator()(Index rowIdx, Index colIdx, Index depthIdx, const DataMapper& res,
|
||||
Index rows, Index depth, Index cols, ResScalar alpha, const ResPacket& pAlpha, LhsPackMap& lhsPackMap, RhsPackMap& rhsPackMap)
|
||||
{
|
||||
constexpr auto rhsProgress = SHAPES<Architecture, CPU, LhsScalar, RhsScalar>[RHS_SHAPE_IDX][SHAPES_RHS_DIMENSION];
|
||||
constexpr auto lhsProgress = SHAPES<Architecture, CPU, LhsScalar, RhsScalar>[LHS_SHAPE_IDX][SHAPES_LHS_DIMENSION];
|
||||
constexpr auto depthProgress = SHAPES<Architecture, CPU, LhsScalar, RhsScalar>[IDX][SHAPES_DEP_DIMENSION];
|
||||
|
||||
#ifdef __ENABLE_PREFETCH__
|
||||
prefetch(lhsPackMap.pCur);
|
||||
prefetch(rhsPackMap.pCur);
|
||||
#endif
|
||||
|
||||
typedef Accumulator<Architecture, CPU, AccScalar, ResScalar, DataMapper, lhsProgress, rhsProgress> AccumulatorType;
|
||||
|
||||
MicroKernel<Architecture, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, AccumulatorType, lhsProgress, depthProgress, rhsProgress> mkt;
|
||||
AccumulatorType acc;
|
||||
acc.zero();
|
||||
|
||||
#ifdef __ENABLE_PREFETCH__
|
||||
acc.prefetch(res, rowIdx, colIdx);
|
||||
#endif
|
||||
|
||||
for(; depthIdx + depthProgress <= depth; depthIdx+=depthProgress)
|
||||
{
|
||||
mkt(lhsPackMap, rhsPackMap, rowIdx, colIdx, depthIdx, acc);
|
||||
}
|
||||
acc.scale(alpha, pAlpha);
|
||||
acc.store(res, rowIdx, colIdx);
|
||||
|
||||
depthLS(rowIdx, colIdx, depthIdx, res, rows, depth, cols, alpha, pAlpha, lhsPackMap, rhsPackMap);
|
||||
}
|
||||
};
|
||||
|
||||
template<int Architecture, int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename ResPacket, typename DataMapper, int RHS_SHAPE_IDX, int LHS_SHAPE_IDX>
|
||||
struct DepthLoopStruct<Architecture, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, ResPacket, DataMapper, RHS_SHAPE_IDX, LHS_SHAPE_IDX, -1>
|
||||
{
|
||||
EIGEN_STRONG_INLINE void operator()(Index, Index, Index, const DataMapper&,
|
||||
Index, Index, Index, ResScalar, const ResPacket&, LhsPackMap&, RhsPackMap&) {}
|
||||
};
|
||||
|
||||
template<int Architecture, int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename ResPacket, typename DataMapper, int RHS_SHAPE_IDX, int IDX>
|
||||
struct LhsLoopStruct
|
||||
{
|
||||
static constexpr auto PREVIOUS = SHAPES<Architecture, CPU, LhsScalar, RhsScalar>[IDX][SHAPES_LHS_POINTER];
|
||||
LhsLoopStruct<Architecture, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, ResPacket, DataMapper, RHS_SHAPE_IDX, PREVIOUS> lhsLS;
|
||||
|
||||
EIGEN_STRONG_INLINE void operator()(Index rowIdx, int colIdx, const DataMapper& res,
|
||||
Index rows, Index depth, Index cols, ResScalar alpha, const ResPacket& pAlpha, LhsPackMap& lhsPackMap, RhsPackMap& rhsPackMap)
|
||||
{
|
||||
constexpr auto lhsProgress = SHAPES<Architecture, CPU, LhsScalar, RhsScalar>[IDX][SHAPES_LHS_DIMENSION];
|
||||
constexpr auto rhsProgress = SHAPES<Architecture, CPU, LhsScalar, RhsScalar>[IDX][SHAPES_RHS_DIMENSION];
|
||||
DepthLoopStruct<Architecture, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, ResPacket, DataMapper, RHS_SHAPE_IDX, IDX, IDX> depthLS;
|
||||
for(;rowIdx + lhsProgress <= rows; rowIdx+=lhsProgress)
|
||||
{
|
||||
lhsPackMap.moveTo(rowIdx);
|
||||
rhsPackMap.moveTo(colIdx);
|
||||
//prefetch(lhsPackMap.pCur + 2*lhsProgress);
|
||||
//prefetch(rhsPackMap.pCur + 2*rhsProgress);
|
||||
depthLS(rowIdx, colIdx, 0, res, rows, depth, cols, alpha, pAlpha, lhsPackMap, rhsPackMap);
|
||||
}
|
||||
lhsLS(rowIdx, colIdx, res, rows, depth, cols, alpha, pAlpha, lhsPackMap, rhsPackMap);
|
||||
}
|
||||
};
|
||||
|
||||
template<int Architecture, int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename ResPacket, typename DataMapper, int RHS_SHAPE_IDX>
|
||||
struct LhsLoopStruct<Architecture, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, ResPacket, DataMapper, RHS_SHAPE_IDX, -1>
|
||||
{
|
||||
EIGEN_STRONG_INLINE void operator()(Index, Index, const DataMapper&,
|
||||
Index, Index, Index, ResScalar, const ResPacket&, LhsPackMap&, RhsPackMap&) {}
|
||||
};
|
||||
|
||||
template<int Architecture, int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename ResPacket, typename DataMapper, int IDX>
|
||||
struct RhsLoopStruct
|
||||
{
|
||||
static constexpr auto PREVIOUS = SHAPES<Architecture, CPU, LhsScalar, RhsScalar>[IDX][SHAPES_RHS_POINTER];
|
||||
RhsLoopStruct<Architecture, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, ResPacket, DataMapper, PREVIOUS> rhsLS;
|
||||
|
||||
EIGEN_STRONG_INLINE void operator()(Index colIdx, const DataMapper& res,
|
||||
Index rows, Index depth, Index cols, ResScalar alpha, const ResPacket& pAlpha, LhsPackMap& lhsPackMap, RhsPackMap& rhsPackMap)
|
||||
{
|
||||
constexpr auto rhsProgress = SHAPES<Architecture, CPU, LhsScalar, RhsScalar>[IDX][SHAPES_RHS_DIMENSION];
|
||||
|
||||
for(;colIdx + rhsProgress <= cols; colIdx+=rhsProgress)
|
||||
{
|
||||
LhsLoopStruct<Architecture, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, ResPacket, DataMapper, IDX, IDX> lhsLS;
|
||||
lhsLS(0, colIdx, res, rows, depth, cols, alpha, pAlpha, lhsPackMap, rhsPackMap);
|
||||
}
|
||||
rhsLS(colIdx, res, rows, depth, cols, alpha, pAlpha, lhsPackMap, rhsPackMap);
|
||||
}
|
||||
};
|
||||
|
||||
template<int Architecture, int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename ResPacket, typename DataMapper>
|
||||
struct RhsLoopStruct<Architecture, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, ResPacket, DataMapper, -1>
|
||||
{
|
||||
EIGEN_STRONG_INLINE void operator()(Index colIdx, const DataMapper&,
|
||||
Index, Index, Index, ResScalar, const ResPacket&, LhsPackMap&, RhsPackMap&) {}
|
||||
};
|
||||
|
||||
template<int Architecture, int CPU, typename ResScalar, typename AccScalar, typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper>
|
||||
EIGEN_STRONG_INLINE void gemm(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
|
||||
Index rows, Index depth, Index cols, ResScalar alpha, Index strideA, Index strideB, Index offsetA, Index offsetB)
|
||||
{
|
||||
using ResPacket = typename unpacket_traits<ResScalar>::type;
|
||||
typedef PackMap<Architecture, CPU, Index, LhsScalar, DataMapper, true> LhsPackMap;
|
||||
typedef PackMap<Architecture, CPU, Index, RhsScalar, DataMapper, false> RhsPackMap;
|
||||
|
||||
#ifdef __DEBUG__
|
||||
std::cout << "blockA" << std::endl;
|
||||
for(auto i = 0; i < rows*depth; i++)
|
||||
{
|
||||
if(i % 4 == 0 && i > 0)
|
||||
std::cout << std::endl;
|
||||
std::cout << blockA[i] << " ";
|
||||
}
|
||||
std::cout << std::endl;
|
||||
std::cout << "blockB" << std::endl;
|
||||
for(auto i = 0; i < depth*cols; i++)
|
||||
{
|
||||
if(i % 4 == 0 && i > 0)
|
||||
std::cout << std::endl;
|
||||
std::cout << blockB[i] << " ";
|
||||
}
|
||||
std::cout << std::endl;
|
||||
#endif
|
||||
asm __volatile__("#BEGING_GEBP\n\t");
|
||||
RhsLoopStruct<Architecture, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, ResPacket, DataMapper, SHAPES_COUNT<0, 0, LhsScalar, RhsScalar>-1> rhsLS;
|
||||
LhsPackMap lhsPackMap(blockA, depth, strideA, offsetA);
|
||||
RhsPackMap rhsPackMap(blockB, depth, strideB, offsetB);
|
||||
|
||||
ResPacket pAlpha = pset1<ResPacket>(alpha);
|
||||
|
||||
rhsLS(0, res, rows, depth, cols, alpha, pAlpha, lhsPackMap, rhsPackMap);
|
||||
asm __volatile__("#END_GEBP\n\t");
|
||||
}
|
||||
/*
|
||||
template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
|
||||
struct gemm_pack_rhs<float, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
|
||||
{
|
||||
void operator()(float* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
|
||||
};
|
||||
|
||||
template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
|
||||
void gemm_pack_rhs<float, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
|
||||
::operator()(float* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
|
||||
{
|
||||
rhs_pack<0, 0, Index, float, DataMapper, Conjugate, PanelMode, ColMajor> pack;
|
||||
pack(blockB, rhs, depth, cols, stride, offset);
|
||||
}
|
||||
|
||||
template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
|
||||
struct gemm_pack_rhs<float, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
|
||||
{
|
||||
void operator()(float* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
|
||||
};
|
||||
|
||||
template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
|
||||
void gemm_pack_rhs<float, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
|
||||
::operator()(float* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
|
||||
{
|
||||
rhs_pack<0, 0, Index, float, DataMapper, Conjugate, PanelMode, RowMajor> pack;
|
||||
pack(blockB, rhs, depth, cols, stride, offset);
|
||||
}
|
||||
|
||||
template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
|
||||
struct gemm_pack_lhs<float, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
|
||||
{
|
||||
void operator()(float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
|
||||
};
|
||||
|
||||
template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
|
||||
void gemm_pack_lhs<float, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
|
||||
::operator()(float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
|
||||
{
|
||||
lhs_pack<0, 0, Index, float, DataMapper, Conjugate, PanelMode, RowMajor> pack;
|
||||
pack(blockA, lhs, depth, rows, stride, offset);
|
||||
}
|
||||
|
||||
template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
|
||||
struct gemm_pack_lhs<float, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
|
||||
{
|
||||
void operator()(float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
|
||||
};
|
||||
|
||||
template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
|
||||
void gemm_pack_lhs<float, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
|
||||
::operator()(float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
|
||||
{
|
||||
lhs_pack<0, 0, Index, float, DataMapper, Conjugate, PanelMode, ColMajor> pack;
|
||||
pack(blockA, lhs, depth, rows, stride, offset);
|
||||
}
|
||||
*/
|
||||
template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
|
||||
struct gebp_kernel<float, float, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
|
||||
{
|
||||
void operator()(const DataMapper& res, const float* blockA, const float* blockB,
|
||||
Index rows, Index depth, Index cols, float alpha,
|
||||
Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
|
||||
};
|
||||
|
||||
template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
|
||||
void gebp_kernel<float, float, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
|
||||
::operator()(const DataMapper& res, const float* blockA, const float* blockB,
|
||||
Index rows, Index depth, Index cols, float alpha,
|
||||
Index strideA, Index strideB, Index offsetA, Index offsetB)
|
||||
{
|
||||
gemm<0, 0, float, float, float, float, Index, DataMapper>(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
|
||||
}
|
||||
} // end namespace internal
|
||||
|
||||
} // end namespace Eigen
|
||||
#endif // EIGEN_MATRIX_PRODUCT_NEON_H
|
||||
192
Eigen/src/Core/arch/NEON/PackingOps.h
Normal file
192
Eigen/src/Core/arch/NEON/PackingOps.h
Normal file
@@ -0,0 +1,192 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2021 Everton Constantino (everton.constantino@hotmail.com)
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#ifndef EIGEN_PACKING_OPS_NEON_H
|
||||
#define EIGEN_PACKING_OPS_NEON_H
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
namespace internal {
|
||||
|
||||
#ifdef __ENABLE_CUSTOM_PACKING__
|
||||
|
||||
template<int CPU, typename Scalar, bool isLhs>
|
||||
constexpr int PACK_SHAPES_COUNT<0, CPU, Scalar, isLhs> = 3;
|
||||
|
||||
template<int CPU, typename Scalar>
|
||||
constexpr int PACK_SHAPES_COUNT<0, CPU, Scalar, true> = 4;
|
||||
|
||||
template<int CPU, typename Scalar, bool isLhs>
|
||||
constexpr int PACK_SHAPES<0, CPU, Scalar, isLhs>[PACK_SHAPES_COUNT<0, CPU, Scalar, isLhs>][PACK_SHAPES_DIMENSION] = {{1,1,PACK_SHAPES_END},{4,1,0},{4,4,0}};
|
||||
|
||||
template<int CPU, typename Scalar>
|
||||
constexpr int PACK_SHAPES<0, CPU, Scalar, true>[PACK_SHAPES_COUNT<0, CPU, Scalar, true>][PACK_SHAPES_DIMENSION] = {{1,1,PACK_SHAPES_END},{4,1,0},{4,4,0},{8,1,2}};
|
||||
|
||||
template<int CPU, typename Index, typename Scalar, bool isLhs, typename DataMapper, bool Conjugate, bool PanelMode, int StorageOrder>
|
||||
struct PackingOperator<0, CPU, Index, Scalar, isLhs, DataMapper, Conjugate, PanelMode, StorageOrder, 4, 4>
|
||||
{
|
||||
EIGEN_STRONG_INLINE Scalar* operator()(Index d1Idx, Index d2Idx, Scalar *block, const DataMapper& data)
|
||||
{
|
||||
using Packet = typename packet_traits<Scalar>::type;
|
||||
constexpr int vectorSize = packet_traits<Scalar>::size;
|
||||
|
||||
Scalar *c = block;
|
||||
|
||||
if(!isLhs)
|
||||
{
|
||||
int tD = d1Idx;
|
||||
d1Idx = d2Idx;
|
||||
d2Idx = tD;
|
||||
}
|
||||
|
||||
if(isLhs && StorageOrder == ColMajor || !isLhs && StorageOrder == RowMajor)
|
||||
{
|
||||
Packet p0 = data.template loadPacket<Packet>(d1Idx, d2Idx + 0);
|
||||
Packet p1 = data.template loadPacket<Packet>(d1Idx, d2Idx + 1);
|
||||
Packet p2 = data.template loadPacket<Packet>(d1Idx, d2Idx + 2);
|
||||
Packet p3 = data.template loadPacket<Packet>(d1Idx, d2Idx + 3);
|
||||
|
||||
pstore<Scalar>(c + 0*vectorSize, p0);
|
||||
pstore<Scalar>(c + 1*vectorSize, p1);
|
||||
pstore<Scalar>(c + 2*vectorSize, p2);
|
||||
pstore<Scalar>(c + 3*vectorSize, p3);
|
||||
c+=4*vectorSize;
|
||||
} else {
|
||||
PacketBlock<Packet, 4> pblock;
|
||||
|
||||
pblock.packet[0] = data.template loadPacket<Packet>(d1Idx, d2Idx + 0);
|
||||
pblock.packet[1] = data.template loadPacket<Packet>(d1Idx, d2Idx + 1);
|
||||
pblock.packet[2] = data.template loadPacket<Packet>(d1Idx, d2Idx + 2);
|
||||
pblock.packet[3] = data.template loadPacket<Packet>(d1Idx, d2Idx + 3);
|
||||
|
||||
ptranspose(pblock);
|
||||
|
||||
pstore<Scalar>(c + 0*vectorSize, pblock.packet[0]);
|
||||
pstore<Scalar>(c + 1*vectorSize, pblock.packet[1]);
|
||||
pstore<Scalar>(c + 2*vectorSize, pblock.packet[2]);
|
||||
pstore<Scalar>(c + 3*vectorSize, pblock.packet[3]);
|
||||
c+=4*vectorSize;
|
||||
}
|
||||
return c;
|
||||
}
|
||||
};
|
||||
|
||||
template<int CPU, typename Index, typename Scalar, bool isLhs, typename DataMapper, bool Conjugate, bool PanelMode, int StorageOrder>
|
||||
struct PackingOperator<0, CPU, Index, Scalar, isLhs, DataMapper, Conjugate, PanelMode, StorageOrder, 8, 1>
|
||||
{
|
||||
EIGEN_STRONG_INLINE Scalar* operator()(Index d1Idx, Index d2Idx, Scalar *block, const DataMapper& data)
|
||||
{
|
||||
using Packet = typename packet_traits<Scalar>::type;
|
||||
Scalar *c = block;
|
||||
if(isLhs && StorageOrder == ColMajor)
|
||||
{
|
||||
Packet p = data.template loadPacket<Packet>(d1Idx + 0, d2Idx);
|
||||
pstore<Scalar>(c, p);
|
||||
c+=4;
|
||||
p = data.template loadPacket<Packet>(d1Idx + 4, d2Idx);
|
||||
pstore<Scalar>(c, p);
|
||||
c+=4;
|
||||
} else if(!isLhs && StorageOrder == RowMajor) {
|
||||
Packet p = data.template loadPacket<Packet>(d2Idx, d1Idx + 0);
|
||||
pstore<Scalar>(c, p);
|
||||
c+=4;
|
||||
p = data.template loadPacket<Packet>(d2Idx, d1Idx + 4);
|
||||
pstore<Scalar>(c, p);
|
||||
c+=4;
|
||||
} else {
|
||||
if(isLhs)
|
||||
{
|
||||
*c = data(d1Idx + 0, d2Idx + 0);
|
||||
c++;
|
||||
*c = data(d1Idx + 1, d2Idx + 0);
|
||||
c++;
|
||||
*c = data(d1Idx + 2, d2Idx + 0);
|
||||
c++;
|
||||
*c = data(d1Idx + 3, d2Idx + 0);
|
||||
c++;
|
||||
*c = data(d1Idx + 0, d2Idx + 4);
|
||||
c++;
|
||||
*c = data(d1Idx + 1, d2Idx + 4);
|
||||
c++;
|
||||
*c = data(d1Idx + 2, d2Idx + 4);
|
||||
c++;
|
||||
*c = data(d1Idx + 3, d2Idx + 4);
|
||||
c++;
|
||||
} else {
|
||||
*c = data(d2Idx, d1Idx + 0);
|
||||
c++;
|
||||
*c = data(d2Idx, d1Idx + 1);
|
||||
c++;
|
||||
*c = data(d2Idx, d1Idx + 2);
|
||||
c++;
|
||||
*c = data(d2Idx, d1Idx + 3);
|
||||
c++;
|
||||
*c = data(d2Idx + 4, d1Idx + 0);
|
||||
c++;
|
||||
*c = data(d2Idx + 4, d1Idx + 1);
|
||||
c++;
|
||||
*c = data(d2Idx + 4, d1Idx + 2);
|
||||
c++;
|
||||
*c = data(d2Idx + 4, d1Idx + 3);
|
||||
c++;
|
||||
}
|
||||
}
|
||||
return c;
|
||||
}
|
||||
};
|
||||
|
||||
template<int CPU, typename Index, typename Scalar, bool isLhs, typename DataMapper, bool Conjugate, bool PanelMode, int StorageOrder>
|
||||
struct PackingOperator<0, CPU, Index, Scalar, isLhs, DataMapper, Conjugate, PanelMode, StorageOrder, 4, 1>
|
||||
{
|
||||
EIGEN_STRONG_INLINE Scalar* operator()(Index d1Idx, Index d2Idx, Scalar *block, const DataMapper& data)
|
||||
{
|
||||
using Packet = typename packet_traits<Scalar>::type;
|
||||
Scalar *c = block;
|
||||
if(isLhs && StorageOrder == ColMajor)
|
||||
{
|
||||
Packet p = data.template loadPacket<Packet>(d1Idx, d2Idx);
|
||||
pstore<Scalar>(c, p);
|
||||
c+=4;
|
||||
} else if(!isLhs && StorageOrder == RowMajor) {
|
||||
Packet p = data.template loadPacket<Packet>(d2Idx, d1Idx);
|
||||
pstore<Scalar>(c, p);
|
||||
c+=4;
|
||||
} else {
|
||||
if(isLhs)
|
||||
{
|
||||
*c = data(d1Idx + 0, d2Idx);
|
||||
c++;
|
||||
*c = data(d1Idx + 1, d2Idx);
|
||||
c++;
|
||||
*c = data(d1Idx + 2, d2Idx);
|
||||
c++;
|
||||
*c = data(d1Idx + 3, d2Idx);
|
||||
c++;
|
||||
} else {
|
||||
*c = data(d2Idx, d1Idx + 0);
|
||||
c++;
|
||||
*c = data(d2Idx, d1Idx + 1);
|
||||
c++;
|
||||
*c = data(d2Idx, d1Idx + 2);
|
||||
c++;
|
||||
*c = data(d2Idx, d1Idx + 3);
|
||||
c++;
|
||||
}
|
||||
}
|
||||
return c;
|
||||
}
|
||||
};
|
||||
|
||||
#endif // __ENABLE_CUSTOM_PACKING__
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_PACKING_OPS_NEON_H
|
||||
6
compile.sh
Executable file
6
compile.sh
Executable file
@@ -0,0 +1,6 @@
|
||||
#!/bin/bash
# Builds new_gemm_test.cpp against the headers in the current directory:
#   gtp - compiled with __ENABLE_VECTOR_KERNELS__ and __ENABLE_PREFETCH__
#   gt  - compiled with __ENABLE_VECTOR_KERNELS__ only
# The reference build against a master checkout (gto) is kept below, disabled.
#echo 'Compiling with master'
#g++ -O3 -I../eigen-master -std=c++11 new_gemm_test.cpp -o gto

SRC=new_gemm_test.cpp
FLAGS="-O3 -I. -std=c++14"
KERNELS=-D__ENABLE_VECTOR_KERNELS__

echo 'Compiling current'
g++ $FLAGS $SRC $KERNELS -D__ENABLE_PREFETCH__ -o gtp
g++ $FLAGS $SRC $KERNELS -o gt
|
||||
98
new_gemm_test.cpp
Normal file
98
new_gemm_test.cpp
Normal file
@@ -0,0 +1,98 @@
|
||||
#include <Eigen/Dense>
|
||||
#include <iostream>
|
||||
#include <ctime>
|
||||
#include <cmath>
|
||||
|
||||
using namespace Eigen;
|
||||
|
||||
// Fills the top-left m x n part of A with values that encode their own
// position: id*10^(2*digits) + row*10^digits + col.
void set(MatrixXf& A, int m, int n, int id, int digits)
{
  // Both powers are independent of the loop indices, so compute them once.
  const auto rowScale = std::pow(10,digits);
  const auto idTerm = id*std::pow(10,(2*digits));

  for(int row = 0; row < m; ++row)
  {
    const auto rowTerm = row*rowScale;
    for(int col = 0; col < n; ++col)
    {
      A(row,col) = idTerm + rowTerm + col;
    }
  }
}
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
#ifdef __DEBUG__
|
||||
int m = 32, k = 32, n = 32, max = std::max(std::max(m,k),n);
|
||||
MatrixXf A = MatrixXf::Zero(m, k);
|
||||
MatrixXf B = MatrixXf::Zero(k, n);
|
||||
MatrixXf C = MatrixXf::Zero(m, n);
|
||||
MatrixXf D = MatrixXf::Zero(m, n);
|
||||
|
||||
set(A, m, k, 1, static_cast<int>(std::log10(max)) + 1);
|
||||
set(B, k, n, 2, static_cast<int>(std::log10(max)) + 1);
|
||||
|
||||
C = A*B;
|
||||
|
||||
std::cout << A << std::endl;
|
||||
std::cout << B << std::endl;
|
||||
|
||||
std::cout << std::endl;
|
||||
|
||||
for(auto i = 0; i < m; i++)
|
||||
{
|
||||
for(auto j = 0; j < n; j++)
|
||||
{
|
||||
float acc=0;
|
||||
for(auto kk = 0; kk < k; kk++)
|
||||
{
|
||||
acc += A(i,kk)*B(kk,j);
|
||||
}
|
||||
D(i,j) = acc;
|
||||
if(std::sqrt(std::pow(D(i,j)-C(i,j),2)) > 1.0e-5)
|
||||
{
|
||||
std::cout << "Difference too big at " << i << " ," << j << " is " << C(i,j) << " should be " << D(i,j) << std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
#else
|
||||
if(argc < 3)
|
||||
{
|
||||
std::cout << "Wrong number of arguments." << std::endl;
|
||||
return -1;
|
||||
}
|
||||
|
||||
int sz = std::atoi(argv[1]);
|
||||
int m = sz, k = sz, n = sz;
|
||||
int RUNS = std::atoi(argv[2]);
|
||||
double time = 0;
|
||||
|
||||
for(auto i = 0; i < RUNS; i++)
|
||||
{
|
||||
MatrixXf A = MatrixXf::Random(m,k);
|
||||
MatrixXf B = MatrixXf::Random(k,n);
|
||||
//set(A,m, k, 1);
|
||||
//set(B,k, n, 2);
|
||||
MatrixXf C = MatrixXf::Zero(m, n);
|
||||
|
||||
std::clock_t start,end;
|
||||
start = std::clock();
|
||||
C = A*B;
|
||||
end = std::clock();
|
||||
|
||||
time += 1000.0*(end-start) / CLOCKS_PER_SEC;
|
||||
}
|
||||
std::cout << time << std::endl;
|
||||
#ifdef TEST_SCALAR
|
||||
start = std::clock();
|
||||
for(auto i = 0; i < m; i++)
|
||||
{
|
||||
for(auto j = 0; j < n; j++)
|
||||
{
|
||||
float acc=0;
|
||||
for(auto kk = 0; kk < k; kk++)
|
||||
{
|
||||
acc += A(i,kk)*B(kk,j);
|
||||
}
|
||||
C(i,j) = acc;
|
||||
}
|
||||
}
|
||||
end = std::clock();
|
||||
|
||||
std::cout << 1000.0*(end-start) / CLOCKS_PER_SEC << std::endl;
|
||||
#endif
|
||||
#endif
|
||||
return 0;
|
||||
}
|
||||
34
run.sh
Executable file
34
run.sh
Executable file
@@ -0,0 +1,34 @@
|
||||
#!/bin/bash
|
||||
# run EXECS SIZE RUNS
# Executes ./gto, ./gt and ./gtp EXECS times each with arguments "SIZE RUNS",
# sums the times they print and echoes "SIZE -> old/new old/new-prefetch".
# The order in which the binaries are launched is chosen at random per
# iteration.
function run() {
  local OLD=0 NEW=0 NEWP=0
  local EXECS="$1" SIZE="$2" RUNS="$3"
  local i SEL T_OLD T_NEW T_NEWP
  for ((i = 0; i < $EXECS; i++)) do
    # Fair coin flip: RANDOM is uniform over 0..32767, so both orders are
    # equally likely (a draw from 0-10 modulo 2 would favour 0 by 6:5).
    SEL=$((RANDOM % 2))
    if [ $SEL -eq 0 ]; then
      T_OLD=$(./gto $SIZE $RUNS)
      T_NEW=$(./gt $SIZE $RUNS)
      T_NEWP=$(./gtp $SIZE $RUNS)
    else
      T_NEW=$(./gt $SIZE $RUNS)
      T_NEWP=$(./gtp $SIZE $RUNS)
      T_OLD=$(./gto $SIZE $RUNS)
    fi
    # The totals are kept as "a+b+c" expression strings and evaluated by bc below.
    NEW=$NEW+$T_NEW
    OLD=$OLD+$T_OLD
    NEWP=$NEWP+$T_NEWP
  done
  SPEED=$(echo "($OLD) / ($NEW)" | bc -l)
  SPEEDP=$(echo "($OLD) / ($NEWP)" | bc -l)
  echo "$SIZE -> $SPEED $SPEEDP"
}
|
||||
|
||||
# The first script argument is the number of executions per matrix size.
# Without it the loop bound is empty and the speed-up degenerates to 0 / 0.
if [ $# -lt 1 ]; then
  echo "Usage: $0 <executions-per-size>" >&2
  exit 1
fi

# Arguments: executions, matrix size, products per execution.
run "$1" 16 500
run "$1" 32 500
run "$1" 64 100
run "$1" 128 50
run "$1" 256 10
run "$1" 1024 10
|
||||
Reference in New Issue
Block a user