Compare commits

...

23 Commits

Author SHA1 Message Date
Everton Constantino
09f3e95447 WIP 2 2021-05-19 17:29:42 +00:00
Everton Constantino
6533187280 WIP 2 - need to implement 2x1x1 2021-05-18 20:42:08 +00:00
Everton Constantino
029f78abf0 WIP 2 2021-05-14 20:21:52 +00:00
Everton Constantino
5d47f6697d WIP 2 2021-05-14 16:26:33 +00:00
Everton Constantino
ad67705447 WIP2 2021-05-14 12:29:37 +00:00
Everton Constantino
9fc17867e5 WIP 2 2021-05-13 19:21:48 +00:00
Everton Constantino
3999ab2dc7 WIP 2 2021-05-13 18:12:52 +00:00
Everton Constantino
58db05afbc WIP 2 2021-05-13 15:30:08 +00:00
Everton Constantino
bfadb56107 WIP 2 2021-05-13 14:48:40 +00:00
Everton Constantino
9b8cdceea8 WIP 2 2021-05-13 14:42:22 +00:00
Everton Constantino
a8ec6d6a36 WIP with tests 2021-05-12 17:09:33 +00:00
Everton Constantino
54f80f442d WIP - Vector 2021-05-10 20:06:34 +00:00
Everton Constantino
70c0363c28 WIP2 2021-05-10 19:59:47 +00:00
Everton Constantino
b2cd094863 WIP 2021-05-10 16:53:17 +00:00
Everton Constantino
d216764f46 WIP 2021-04-23 17:28:17 +00:00
Everton Constantino
646d92c7f1 WIP 2021-04-23 15:39:04 +00:00
Everton Constantino
c62ed9b214 WIP 2021-04-22 20:42:44 +00:00
Everton Constantino
82a7715b01 WIP 2021-04-22 18:11:53 +00:00
Everton Constantino
43ce8e9d2d WIP 2021-04-22 17:43:22 +00:00
Everton Constantino
ca0d3f92d7 WIP 2021-04-22 14:48:44 +00:00
Everton Constantino
5bffe09624 WIP 2021-04-22 13:14:00 +00:00
Everton Constantino
421891e1db WIP 2021-04-21 17:58:55 +00:00
Everton Constantino
f826663a3a WIP 2021-04-20 20:10:21 +00:00
7 changed files with 1980 additions and 0 deletions

View File

@@ -350,6 +350,9 @@ using std::ptrdiff_t;
#include "src/Core/arch/AltiVec/MatrixProduct.h"
#elif defined EIGEN_VECTORIZE_NEON
#include "src/Core/arch/NEON/GeneralBlockPanelKernel.h"
#include "src/Core/arch/NEON/MatrixProduct.h"
#include "src/Core/arch/NEON/PackingOps.h"
#include "src/Core/arch/NEON/Kernels.h"
#endif
#include "src/Core/BooleanRedux.h"

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,555 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2021 Everton Constantino (everton.constantino@hotmail.com)
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_MATRIX_PRODUCT_NEON_H
#define EIGEN_MATRIX_PRODUCT_NEON_H
#ifdef __DEBUG__
#include <iostream>
#endif
namespace Eigen {
namespace internal {
// Depth progress used by the "unrolled" kernel shapes in the SHAPES table.
// Can be overridden from the command line before this header is included.
// NOTE(review): identifiers containing a double underscore are reserved for
// the implementation; consider an EIGEN_-prefixed name.
#ifndef __UNROLL__
#define __UNROLL__ 8
#endif
// Number of rows in the SHAPES table for a given architecture / CPU / scalar
// combination.
template<int Architecture, int CPU, typename LhsScalar, typename RhsScalar>
constexpr int SHAPES_COUNT = 16;
// Number of columns in each SHAPES row.
constexpr int SHAPES_DIMENSION = 6;
// Column indices into a SHAPES row: the three progress values ...
constexpr int SHAPES_LHS_DIMENSION = 0;
constexpr int SHAPES_DEP_DIMENSION = 1;
constexpr int SHAPES_RHS_DIMENSION = 2;
// ... and the three "next shape" links followed by the RHS, LHS and depth
// loop structs once the current shape no longer fits.
constexpr int SHAPES_RHS_POINTER = 3;
constexpr int SHAPES_LHS_POINTER = 4;
constexpr int SHAPES_DEP_POINTER = 5;
// Link value that terminates a chain; it selects the "-1" specializations of
// the loop structs.
constexpr int SHAPES_POINTER_END = -1;
// Number of rows in the PACK_SHAPES table: 2 by default (used for the RHS) ...
template<int Architecture, int CPU, typename Scalar, bool isLhs>
constexpr int PACK_SHAPES_COUNT = 2;
// ... and 4 when packing the LHS.
template<int Architecture, int CPU, typename Scalar>
constexpr int PACK_SHAPES_COUNT<Architecture, CPU, Scalar, true> = 4;
// Number of columns in each PACK_SHAPES row.
constexpr int PACK_SHAPES_DIMENSION = 3;
// Column index of the "next shape" link inside a PACK_SHAPES row.
constexpr int PACK_SHAPES_POINTER = 2;
// Link value that terminates the packing chain.
constexpr int PACK_SHAPES_END = -1;
// Compile-time sizes of the reduced-width packets associated with Scalar.
// `half` is the number of coefficients in packet_traits<Scalar>::half.
template<typename Scalar>
struct PacketMultiples
{
enum
{
half = unpacket_traits<typename packet_traits<Scalar>::half>::size,
// NOTE(review): `quarter` is computed from exactly the same type as `half`, so
// both enumerators have the same value, and nothing in this header reads
// `quarter`. Confirm whether it is needed and, if so, what type it should use.
quarter = unpacket_traits<typename packet_traits<Scalar>::half>::size // Is this used?
};
};
// Table of micro-kernel shapes. Each row holds, in this order:
//   [SHAPES_LHS_DIMENSION] lhs progress,
//   [SHAPES_DEP_DIMENSION] depth progress (1 or __UNROLL__),
//   [SHAPES_RHS_DIMENSION] rhs progress,
//   [SHAPES_RHS_POINTER]   row to continue with in the RHS loop,
//   [SHAPES_LHS_POINTER]   row to continue with in the LHS loop,
//   [SHAPES_DEP_POINTER]   row to continue with in the depth loop.
// A link equal to SHAPES_POINTER_END stops the corresponding chain.
// Rows 00-07 have an rhs progress of 1, rows 08-15 an rhs progress of 4.
// NOTE(review): the lhs progress is derived from packet_traits<RhsScalar>;
// LhsScalar is not used by any entry - confirm that this is intended.
template<int Architecture, int CPU, typename LhsScalar, typename RhsScalar>
constexpr int SHAPES[SHAPES_COUNT<Architecture, CPU, LhsScalar,RhsScalar>][SHAPES_DIMENSION] =
{ /* 00 */{ 1, 1,1,SHAPES_POINTER_END, SHAPES_POINTER_END, SHAPES_POINTER_END},
/* 01 */{PacketMultiples<RhsScalar>::half, 1,1, 0, 0, SHAPES_POINTER_END},
/* 02 */{1*packet_traits<RhsScalar>::size, 1,1, 0, 1, SHAPES_POINTER_END},
/* 03 */{1*packet_traits<RhsScalar>::size,__UNROLL__,1, 0, 1, 2},
/* 04 */{2*packet_traits<RhsScalar>::size, 1,1, 0, 3, SHAPES_POINTER_END},
/* 05 */{2*packet_traits<RhsScalar>::size,__UNROLL__,1, 0, 3, 4},
/* 06 */{3*packet_traits<RhsScalar>::size, 1,1, 0, 5, SHAPES_POINTER_END},
/* 07 */{3*packet_traits<RhsScalar>::size,__UNROLL__,1, 0, 5, 6},
/* 08 */{ 1, 1,4, 7, SHAPES_POINTER_END, SHAPES_POINTER_END},
/* 09 */{PacketMultiples<RhsScalar>::half, 1,4, 7, 8, SHAPES_POINTER_END},
/* 10 */{1*packet_traits<RhsScalar>::size, 1,4, 7, 9, SHAPES_POINTER_END},
/* 11 */{1*packet_traits<RhsScalar>::size,__UNROLL__,4, 7, 9, 10},
/* 12 */{2*packet_traits<RhsScalar>::size, 1,4, 7, 11, SHAPES_POINTER_END},
/* 13 */{2*packet_traits<RhsScalar>::size,__UNROLL__,4, 7, 11, 12},
/* 14 */{3*packet_traits<RhsScalar>::size, 1,4, 7, 13, SHAPES_POINTER_END},
/* 15 */{3*packet_traits<RhsScalar>::size,__UNROLL__,4, 7, 13, 14}};
// Table of packing tile shapes. Each row holds:
//   [0] d1 progress, [1] d2 progress,
//   [PACK_SHAPES_POINTER] row to continue with once this d1 progress no
//   longer fits (PACK_SHAPES_END stops the chain).
// Default table (selected for isLhs == false): tiles of 4x1, then 1x1.
template<int Architecture, int CPU, typename Scalar, bool isLhs>
constexpr int PACK_SHAPES[PACK_SHAPES_COUNT<Architecture, CPU, Scalar, isLhs>][PACK_SHAPES_DIMENSION] =
{{ 1, 1, PACK_SHAPES_END},
{ 4, 1, 0}};
// LHS table: tiles of three, two and one packet widths along d1, then 1x1.
template<int Architecture, int CPU, typename Scalar>
constexpr int PACK_SHAPES<Architecture, CPU, Scalar, true>[PACK_SHAPES_COUNT<Architecture, CPU, Scalar, true>][PACK_SHAPES_DIMENSION] =
{{ 1, 1, PACK_SHAPES_END},
{1*packet_traits<Scalar>::size, 1, 0},
{2*packet_traits<Scalar>::size, 1, 1},
{3*packet_traits<Scalar>::size, 1, 2}};
// Generic, coefficient-by-coefficient packing primitive.
//
// Copies one M x N tile of `data` into `block` and returns the pointer just
// past the last coefficient written (block + M*N). The tile starts at
// (d1Idx, d2Idx); for the LHS the mapper is read as data(d1, d2), for the RHS
// with the two indices swapped, data(d2, d1).
// Conjugate, PanelMode and StorageOrder are accepted but not used by this
// generic version.
template<int Architecture, int CPU, typename Index, typename Scalar, bool isLhs, typename DataMapper, bool Conjugate, bool PanelMode, int StorageOrder, int M, int N>
struct PackingOperator
{
  EIGEN_STRONG_INLINE Scalar* operator()(Index d1Idx, Index d2Idx, Scalar *block, const DataMapper& data)
  {
#ifdef __DEBUG__
    std::cout << M << "x" << N << " ( " << d1Idx << ", " << d2Idx <<") -> ( " << d1Idx + M << ", " << d2Idx + N << ") ";
#endif
    Scalar *out = block;
    for(int outer = 0; outer < M; ++outer)
    {
      for(int inner = 0; inner < N; ++inner)
      {
        // The d2 offset is the fastest-varying one in the packed output.
        *out = isLhs ? data(d1Idx + outer, d2Idx + inner)
                     : data(d2Idx + inner, d1Idx + outer);
#ifdef __DEBUG__
        std::cout << *out << " ";
#endif
        ++out;
      }
    }
#ifdef __DEBUG__
    std::cout << std::endl;
#endif
    return out;
  }
};
// Packs one d1 slice of width D1PROGRESS along the d2 direction.
//
// Starting at d2Idx, tiles of D1PROGRESS x (d2 progress of shape IDX) are
// emitted while they fit below d2Size. Whatever is left over is handed to the
// shape IDX-1, but only when that shape has the same d1 progress. Returns the
// advanced output pointer. d1Size, stride and offset are only forwarded.
template<int Architecture, int CPU, typename Index, typename Scalar, bool isLhs, typename DataMapper, bool Conjugate, bool PanelMode, int StorageOrder, int D1PROGRESS, int IDX>
struct PackingInnerStruct
{
  EIGEN_STRONG_INLINE Scalar* operator()(Index d1Idx, Index d2Idx, Scalar *block, const DataMapper& data, Index d1Size, Index d2Size, Index stride, Index offset)
  {
    constexpr auto step = PACK_SHAPES<Architecture, CPU, Scalar, isLhs>[IDX][1];
    using TilePacker = PackingOperator<Architecture, CPU, Index, Scalar, isLhs, DataMapper, Conjugate, PanelMode, StorageOrder, D1PROGRESS, step>;
    using Remainder = PackingInnerStruct<Architecture, CPU, Index, Scalar, isLhs, DataMapper, Conjugate, PanelMode, StorageOrder, D1PROGRESS, IDX-1>;

    TilePacker packTile;
    while(d2Idx + step <= d2Size)
    {
      block = packTile(d1Idx, d2Idx, block, data);
      d2Idx += step;
    }

    // Only shapes sharing this d1 progress may finish the slice.
    if(PACK_SHAPES<Architecture, CPU, Scalar, isLhs>[IDX-1][0] != D1PROGRESS)
      return block;

    Remainder finishSlice;
    return finishSlice(d1Idx, d2Idx, block, data, d1Size, d2Size, stride, offset);
  }
};

// End of the recursion: shape 0 has no predecessor, so it just emits its own
// tiles while they fit.
template<int Architecture, int CPU, typename Index, typename Scalar, bool isLhs, typename DataMapper, bool Conjugate, bool PanelMode, int StorageOrder, int D1PROGRESS>
struct PackingInnerStruct<Architecture, CPU, Index, Scalar, isLhs, DataMapper, Conjugate, PanelMode, StorageOrder, D1PROGRESS, 0>
{
  EIGEN_STRONG_INLINE Scalar* operator()(Index d1Idx, Index d2Idx, Scalar *block, const DataMapper& data, Index d1Size, Index d2Size, Index stride, Index offset)
  {
    constexpr auto step = PACK_SHAPES<Architecture, CPU, Scalar, isLhs>[0][1];
    using TilePacker = PackingOperator<Architecture, CPU, Index, Scalar, isLhs, DataMapper, Conjugate, PanelMode, StorageOrder, D1PROGRESS, step>;

    while(d2Idx + step <= d2Size)
    {
      TilePacker packTile;
      block = packTile(d1Idx, d2Idx, block, data);
      d2Idx += step;
    }
    return block;
  }
};
// Outer packing loop for the shape PACK_SHAPE_IDX.
//
// Walks d1 in steps of this shape's d1 progress, packing each full slice with
// PackingInnerStruct (always starting at d2 == 0), then lets the next shape in
// the chain deal with the d1 range that is left. Returns the advanced output
// pointer.
template<int Architecture, int CPU, typename Index, typename Scalar, bool isLhs, typename DataMapper, bool Conjugate, bool PanelMode, int StorageOrder, int PACK_SHAPE_IDX>
struct PackingStruct
{
  // Packer for the shape this one links to; a link of -1 selects the
  // terminating specialization.
  PackingStruct<Architecture, CPU, Index, Scalar, isLhs, DataMapper, Conjugate, PanelMode, StorageOrder, PACK_SHAPES<Architecture, CPU, Scalar, isLhs>[PACK_SHAPE_IDX][PACK_SHAPES_POINTER]> ps;

  EIGEN_STRONG_INLINE Scalar* operator()(Index d1Idx, Scalar *block, const DataMapper& data, Index d1Size, Index d2Size, Index stride, Index offset)
  {
    constexpr auto sliceWidth = PACK_SHAPES<Architecture, CPU, Scalar, isLhs>[PACK_SHAPE_IDX][0];
    using SlicePacker = PackingInnerStruct<Architecture, CPU, Index, Scalar, isLhs, DataMapper, Conjugate, PanelMode, StorageOrder, sliceWidth, PACK_SHAPE_IDX>;

    while(d1Idx + sliceWidth <= d1Size)
    {
      SlicePacker packSlice;
      block = packSlice(d1Idx, 0, block, data, d1Size, d2Size, stride, offset);
      d1Idx += sliceWidth;
    }
    return ps(d1Idx, block, data, d1Size, d2Size, stride, offset);
  }
};

// End of the chain: nothing left to pack, the output pointer is returned as is.
template<int Architecture, int CPU, typename Index, typename Scalar, bool isLhs, typename DataMapper, bool Conjugate, bool PanelMode, int StorageOrder>
struct PackingStruct<Architecture, CPU, Index, Scalar, isLhs, DataMapper, Conjugate, PanelMode, StorageOrder, -1>
{
  EIGEN_STRONG_INLINE Scalar* operator()(Index, Scalar *block, const DataMapper&, Index, Index, Index, Index)
  {
    return block;
  }
};
// Entry point for packing the LHS into blockA.
// Starts the packing chain at the last LHS pack shape, with rows as the d1
// extent and depth as the d2 extent. The returned end pointer is discarded.
template<int Architecture, int CPU, typename Index, typename Scalar, typename DataMapper, bool Conjugate, bool PanelMode, int StorageOrder>
struct lhs_pack
{
  EIGEN_STRONG_INLINE void operator()(Scalar *blockA, const DataMapper &lhs, Index depth, Index rows, Index stride, Index offset)
  {
    constexpr int firstShape = PACK_SHAPES_COUNT<Architecture, CPU, Scalar, true>-1;
    using Packer = PackingStruct<Architecture, CPU, Index, Scalar, true, DataMapper, Conjugate, PanelMode, StorageOrder, firstShape>;

    Packer packer;
    packer(0, blockA, lhs, rows, depth, stride, offset);
  }
};
// Entry point for packing the RHS into blockB.
// Starts the packing chain at the last RHS pack shape, with cols as the d1
// extent and depth as the d2 extent. The returned end pointer is discarded.
template<int Architecture, int CPU, typename Index, typename Scalar, typename DataMapper, bool Conjugate, bool PanelMode, int StorageOrder>
struct rhs_pack
{
  EIGEN_STRONG_INLINE void operator()(Scalar *blockB, const DataMapper &rhs, Index depth, Index cols, Index stride, Index offset)
  {
    constexpr int firstShape = PACK_SHAPES_COUNT<Architecture, CPU, Scalar, false>-1;
    using Packer = PackingStruct<Architecture, CPU, Index, Scalar, false, DataMapper, Conjugate, PanelMode, StorageOrder, firstShape>;

    Packer packer;
    packer(0, blockB, rhs, cols, depth, stride, offset);
  }
};
// Translates a d1 position into an offset inside a packed block by walking
// the pack-shape chain starting at IDX.
//
// The part of `pos` covered by whole slices of this shape's d1 progress
// contributes (covered * d2Size); the rest is resolved by the next shape in
// the chain.
template<int Architecture, int CPU, typename Index, typename Scalar, typename DataMapper, bool isLhs, int IDX>
struct PackMapCalculator
{
  // Calculator for the shape this one links to (-1 ends the chain).
  PackMapCalculator<Architecture, CPU, Index, Scalar, DataMapper, isLhs, PACK_SHAPES<Architecture, CPU, Scalar, isLhs>[IDX][PACK_SHAPES_POINTER]> pmc;

  EIGEN_STRONG_INLINE Index getPosition(Index pos, Index d2Size)
  {
    constexpr auto sliceWidth = PACK_SHAPES<Architecture, CPU, Scalar, isLhs>[IDX][0];
    // Split pos into a multiple of sliceWidth plus a leftover.
    const Index leftover = pos % sliceWidth;
    const Index covered = pos - leftover;
    const Index tail = pmc.getPosition(leftover, d2Size);
    return covered*d2Size + tail;
  }
};

// End of the chain: contributes no offset.
template<int Architecture, int CPU, typename Index, typename Scalar, typename DataMapper, bool isLhs>
struct PackMapCalculator<Architecture, CPU, Index, Scalar, DataMapper, isLhs, -1>
{
  EIGEN_STRONG_INLINE Index getPosition(Index, Index)
  {
    return Index(0);
  }
};
// Cursor over a packed LHS or RHS block.
//
// pBase marks the start of the panel currently being processed and pCur the
// position the next micro-kernel reads from. The kernels advance pCur; the
// loop structs rewind it (resetCur) or commit it as the new base (updateBase).
template<int Architecture, int CPU, typename Index, typename Scalar, typename DataMapper, bool isLhs>
struct PackMap
{
  const Scalar *pBase;  // start of the current panel
  const Scalar *pCur;   // current read position
  Index stride;         // stored but not read by this struct
  Index offset;         // stored but not read by this struct
  Index d2Size;         // extent of the packed block along d2, used by moveTo
  PackMapCalculator<Architecture, CPU, Index, Scalar, DataMapper, isLhs, PACK_SHAPES_COUNT<Architecture, CPU, Scalar, isLhs>-1> pmc;

  // Members are initialized in declaration order; the initializer list is
  // written in that same order so it does not trigger -Wreorder.
  PackMap(const Scalar *base, Index d2Size, Index stride, Index offset)
    : pBase(base), pCur(base), stride(stride), offset(offset), d2Size(d2Size) {}

  // Rewinds the cursor to the start of the current panel.
  EIGEN_STRONG_INLINE void resetCur() { pCur = pBase; }
  // Makes the current cursor position the start of the next panel.
  EIGEN_STRONG_INLINE void updateBase() { pBase = pCur; }
  // Positions the cursor at the packed offset of d1 index p1, relative to pBase.
  EIGEN_STRONG_INLINE void moveTo(Index p1) { pCur = pBase + pmc.getPosition(p1, d2Size); }
  // Moves the cursor forward by `progress` coefficients.
  EIGEN_STRONG_INLINE void advance(Index progress) { pCur += progress; }

  // Prefetches the address `amnt` coefficients past the cursor. Compiles to
  // nothing unless __ENABLE_PREFETCH__ is defined. The template arguments are
  // not used by this generic version.
  template<int D1Progress=-1, int D2Progress=-1>
  EIGEN_STRONG_INLINE void prefetch(Index amnt)
  {
#ifdef __ENABLE_PREFETCH__
    internal::prefetch(pCur + amnt);
#endif
  }
};
// Generic scalar accumulator holding an M x N tile of partial results.
template<int Architecture, int CPU, typename Scalar, typename ResScalar, typename DataMapper, int M, int N>
struct Accumulator
{
  Scalar dt[M][N];

  // Clears every entry of the tile.
  EIGEN_STRONG_INLINE void zero()
  {
    for(auto& tileRow : dt)
      for(auto& entry : tileRow)
        entry = Scalar(0);
  }

  // Hook for prefetching the destination; the generic version does nothing.
  template<int LhsProgress=-1, int DepthProgress=-1, int RhsProgress=-1>
  EIGEN_STRONG_INLINE void prefetch(const DataMapper&, Index, Index) {}

  // Multiplies every entry by alpha. pAlpha is not used by this version.
  template<typename ResPacket>
  EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket& pAlpha)
  {
    for(auto& tileRow : dt)
      for(auto& entry : tileRow)
        entry *= alpha;
  }

  // Adds alpha * tile to dest, with the tile's top-left entry at (row, col).
  // pAlpha is not used by this version.
  template<typename ResPacket>
  EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col, ResScalar alpha, const ResPacket& pAlpha)
  {
    for(int r = 0; r < M; ++r)
    {
      const Scalar (&tileRow)[N] = dt[r];
      for(int c = 0; c < N; ++c)
        dest(row + r, col + c) += alpha*tileRow[c];
    }
  }
};
// Generic scalar micro-kernel: accumulates an M x K by K x N product into
// acc.dt and advances both pack maps past the data it consumed.
//
// The previous version multiplied only the first depth step while still
// advancing the cursors by K steps, so for K > 1 the remaining K-1 products
// were silently dropped. All K depth steps are accumulated now; for K == 1
// the result is unchanged.
//
// Assumes the packed panels hold, for each depth step, M consecutive LHS
// coefficients and N consecutive RHS coefficients - the layout produced by the
// generic PackingOperator with a d2 progress of 1.
// NOTE(review): confirm this layout for every pack shape that can feed this
// kernel.
// rowIdx, colIdx and depthIdx are part of the kernel interface but not used by
// this version.
template<int Architecture, int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator, int M, int K, int N>
struct MicroKernel
{
  EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
                                      RhsPackMap& rhsPackMap,
                                      Index rowIdx, Index colIdx, Index depthIdx,
                                      Accumulator& acc)
  {
    const LhsScalar *pLhs = lhsPackMap.pCur;
    const RhsScalar *pRhs = rhsPackMap.pCur;
    for(int k = 0; k < K; ++k)
    {
      // Rank-1 update for depth step k.
      for(int i = 0; i < N; ++i)
      {
        for(int j = 0; j < M; ++j)
        {
          acc.dt[j][i] += pRhs[i]*pLhs[j];
        }
      }
      pLhs += M;
      pRhs += N;
    }
    lhsPackMap.advance(M*K);
    rhsPackMap.advance(K*N);
  }
};
// Depth loop for one (RHS_SHAPE_IDX, LHS_SHAPE_IDX) tile.
//
// Runs the micro-kernel with the depth progress of shape IDX while it fits
// below `depth`, then hands the remaining depth range to the shape linked
// through SHAPES_DEP_POINTER. The accumulator is owned by the caller; this
// struct neither clears nor stores it.
template<int Architecture, int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename ResPacket, typename DataMapper, typename AccumulatorType, int RHS_SHAPE_IDX, int LHS_SHAPE_IDX, int IDX>
struct DepthLoopStruct
{
// Shape that continues the depth loop; -1 selects the empty specialization.
static constexpr auto PREVIOUS = SHAPES<Architecture, CPU, LhsScalar, RhsScalar>[IDX][SHAPES_DEP_POINTER];
DepthLoopStruct<Architecture, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, ResPacket, DataMapper, AccumulatorType, RHS_SHAPE_IDX, LHS_SHAPE_IDX, PREVIOUS> depthLS;
EIGEN_STRONG_INLINE void operator()(Index rowIdx, Index colIdx, Index depthIdx, const DataMapper& res, AccumulatorType& acc,
Index rows, Index depth, Index cols, ResScalar alpha, const ResPacket& pAlpha, LhsPackMap& lhsPackMap, RhsPackMap& rhsPackMap)
{
// The tile extents come from the fixed RHS/LHS shapes; only the depth
// progress changes as the recursion follows the depth chain.
constexpr auto rhsProgress = SHAPES<Architecture, CPU, LhsScalar, RhsScalar>[RHS_SHAPE_IDX][SHAPES_RHS_DIMENSION];
constexpr auto lhsProgress = SHAPES<Architecture, CPU, LhsScalar, RhsScalar>[LHS_SHAPE_IDX][SHAPES_LHS_DIMENSION];
constexpr auto depthProgress = SHAPES<Architecture, CPU, LhsScalar, RhsScalar>[IDX][SHAPES_DEP_DIMENSION];
//typedef Accumulator<Architecture, CPU, AccScalar, ResScalar, DataMapper, lhsProgress, rhsProgress> AccumulatorType;
MicroKernel<Architecture, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, AccumulatorType, lhsProgress, depthProgress, rhsProgress> mkt;
//AccumulatorType acc;
//acc.zero();
// Prefetch hooks; with the generic Accumulator and without
// __ENABLE_PREFETCH__ these calls do nothing.
acc.template prefetch<lhsProgress, depthProgress, rhsProgress>(res, rowIdx, colIdx);
lhsPackMap.template prefetch<lhsProgress, depthProgress>(0);
rhsPackMap.template prefetch<rhsProgress, depthProgress>(0);
for(; depthIdx + depthProgress <= depth; depthIdx+=depthProgress)
{
#ifdef __DEBUG__
// Debug trace of the kernel shape, its position and the packed operands
// at the current cursors.
auto M = lhsProgress;
auto K = depthProgress;
auto N = rhsProgress;
std::cout << "Kernel " << M << " x " << K << " x " << N << " @ " << rowIdx << ", " << depthIdx << ", " << colIdx << std::endl;
std::cout << "LHS ";
for(auto i = 0; i < M; i++)
{
for(auto j = 0; j < K; j++)
{
std::cout << lhsPackMap.pCur[i*K + j] << " ";
}
}
std::cout << std::endl << "RHS ";
for(auto i = 0; i < K; i++)
{
for(auto j = 0; j < N; j++)
{
std::cout << rhsPackMap.pCur[i*N + j] << " ";
}
}
std::cout << std::endl;
#endif
// The kernel advances both pack map cursors itself.
mkt(lhsPackMap, rhsPackMap, rowIdx, colIdx, depthIdx, acc);
}
//acc.store(res, rowIdx, colIdx, alpha, pAlpha);
// Continue with the next depth shape from where this one stopped.
depthLS(rowIdx, colIdx, depthIdx, res, acc, rows, depth, cols, alpha, pAlpha, lhsPackMap, rhsPackMap);
}
};
// End of the depth chain: nothing left to do.
template<int Architecture, int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename ResPacket, typename DataMapper, typename AccumulatorType, int RHS_SHAPE_IDX, int LHS_SHAPE_IDX>
struct DepthLoopStruct<Architecture, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, ResPacket, DataMapper, AccumulatorType, RHS_SHAPE_IDX, LHS_SHAPE_IDX, -1>
{
EIGEN_STRONG_INLINE void operator()(Index, Index, Index, const DataMapper&, AccumulatorType&,
Index, Index, Index, ResScalar, const ResPacket&, LhsPackMap&, RhsPackMap&) {}
};
// Row loop for one RHS panel.
//
// For every full group of `lhsProgress` rows (taken from shape IDX) a fresh
// accumulator is cleared, filled by the depth loop and stored into `res`.
// Rows that do not fill a whole group are handed to the shape linked through
// SHAPES_LHS_POINTER.
//
// colIdx is taken as Index, matching the terminating specialization and the
// callers. It used to be declared `int`, which narrowed the column index.
template<int Architecture, int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename ResPacket, typename DataMapper, int RHS_SHAPE_IDX, int IDX>
struct LhsLoopStruct
{
  // Shape that continues the row loop; -1 selects the empty specialization.
  static constexpr auto PREVIOUS = SHAPES<Architecture, CPU, LhsScalar, RhsScalar>[IDX][SHAPES_LHS_POINTER];
  LhsLoopStruct<Architecture, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, ResPacket, DataMapper, RHS_SHAPE_IDX, PREVIOUS> lhsLS;

  EIGEN_STRONG_INLINE void operator()(Index rowIdx, Index colIdx, const DataMapper& res,
                                      Index rows, Index depth, Index cols, ResScalar alpha, const ResPacket& pAlpha, LhsPackMap& lhsPackMap, RhsPackMap& rhsPackMap)
  {
    constexpr auto lhsProgress = SHAPES<Architecture, CPU, LhsScalar, RhsScalar>[IDX][SHAPES_LHS_DIMENSION];
    constexpr auto rhsProgress = SHAPES<Architecture, CPU, LhsScalar, RhsScalar>[IDX][SHAPES_RHS_DIMENSION];
    typedef Accumulator<Architecture, CPU, AccScalar, ResScalar, DataMapper, lhsProgress, rhsProgress> AccumulatorType;
    DepthLoopStruct<Architecture, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, ResPacket, DataMapper, AccumulatorType, RHS_SHAPE_IDX, IDX, IDX> depthLS;

    for(; rowIdx + lhsProgress <= rows; rowIdx += lhsProgress)
    {
      // Every row group reads the same RHS panel from its start; the LHS
      // cursor is left where the previous group stopped.
      rhsPackMap.resetCur();
      AccumulatorType acc;
      acc.zero();
      depthLS(rowIdx, colIdx, 0, res, acc, rows, depth, cols, alpha, pAlpha, lhsPackMap, rhsPackMap);
      acc.store(res, rowIdx, colIdx, alpha, pAlpha);
    }
    lhsLS(rowIdx, colIdx, res, rows, depth, cols, alpha, pAlpha, lhsPackMap, rhsPackMap);
  }
};

// End of the row chain: nothing left to do.
template<int Architecture, int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename ResPacket, typename DataMapper, int RHS_SHAPE_IDX>
struct LhsLoopStruct<Architecture, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, ResPacket, DataMapper, RHS_SHAPE_IDX, -1>
{
  EIGEN_STRONG_INLINE void operator()(Index, Index, const DataMapper&,
                                      Index, Index, Index, ResScalar, const ResPacket&, LhsPackMap&, RhsPackMap&) {}
};
// Column loop, the outermost loop of the kernel.
//
// For every full group of `rhsProgress` columns (taken from shape IDX) the
// whole row range is processed by LhsLoopStruct. Columns that do not fill a
// whole group are handed to the shape linked through SHAPES_RHS_POINTER.
template<int Architecture, int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename ResPacket, typename DataMapper, int IDX>
struct RhsLoopStruct
{
// Shape that continues the column loop; -1 selects the empty specialization.
static constexpr auto PREVIOUS = SHAPES<Architecture, CPU, LhsScalar, RhsScalar>[IDX][SHAPES_RHS_POINTER];
RhsLoopStruct<Architecture, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, ResPacket, DataMapper, PREVIOUS> rhsLS;
EIGEN_STRONG_INLINE void operator()(Index colIdx, const DataMapper& res,
Index rows, Index depth, Index cols, ResScalar alpha, const ResPacket& pAlpha, LhsPackMap& lhsPackMap, RhsPackMap& rhsPackMap)
{
constexpr auto rhsProgress = SHAPES<Architecture, CPU, LhsScalar, RhsScalar>[IDX][SHAPES_RHS_DIMENSION];
for(;colIdx + rhsProgress <= cols; colIdx+=rhsProgress)
{
LhsLoopStruct<Architecture, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, ResPacket, DataMapper, IDX, IDX> lhsLS;
// Each column group walks the packed LHS from its beginning.
lhsPackMap.resetCur();
lhsLS(0, colIdx, res, rows, depth, cols, alpha, pAlpha, lhsPackMap, rhsPackMap);
// Wherever the RHS cursor stopped becomes the base of the next column group.
rhsPackMap.updateBase();
}
rhsLS(colIdx, res, rows, depth, cols, alpha, pAlpha, lhsPackMap, rhsPackMap);
}
};
// End of the column chain: nothing left to do.
template<int Architecture, int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename ResPacket, typename DataMapper>
struct RhsLoopStruct<Architecture, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, ResPacket, DataMapper, -1>
{
EIGEN_STRONG_INLINE void operator()(Index colIdx, const DataMapper&,
Index, Index, Index, ResScalar, const ResPacket&, LhsPackMap&, RhsPackMap&) {}
};
// Block-panel product driver: accumulates alpha * (packed A) * (packed B)
// into `res`.
//
// blockA and blockB are the packed operands; rows, depth and cols are the
// extents of the product. strideA/strideB and offsetA/offsetB are only
// forwarded to the pack maps. The loop nest starts at the last entry of the
// SHAPES table and follows the links stored in it.
//
// The starting index is taken from SHAPES_COUNT<Architecture, CPU, ...> so
// that it always refers to the same table the loop structs index. It used to
// be hard-coded to architecture 0 / CPU 0.
template<int Architecture, int CPU, typename ResScalar, typename AccScalar, typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper>
EIGEN_STRONG_INLINE void gemm(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
                              Index rows, Index depth, Index cols, ResScalar alpha, Index strideA, Index strideB, Index offsetA, Index offsetB)
{
  // NOTE(review): this uses unpacket_traits on a scalar type; confirm that
  // packet_traits<ResScalar>::type was not intended.
  using ResPacket = typename unpacket_traits<ResScalar>::type;
  typedef PackMap<Architecture, CPU, Index, LhsScalar, DataMapper, true> LhsPackMap;
  typedef PackMap<Architecture, CPU, Index, RhsScalar, DataMapper, false> RhsPackMap;
#ifdef __DEBUG__
  // Dump both packed blocks, four coefficients per line.
  std::cout << "blockA" << std::endl;
  for(Index i = 0; i < rows*depth; i++)
  {
    if(i % 4 == 0 && i > 0)
      std::cout << std::endl;
    std::cout << blockA[i] << " ";
  }
  std::cout << std::endl;
  std::cout << "blockB" << std::endl;
  for(Index i = 0; i < depth*cols; i++)
  {
    if(i % 4 == 0 && i > 0)
      std::cout << std::endl;
    std::cout << blockB[i] << " ";
  }
  std::cout << std::endl;
#endif
  // Assembly comment markers that delimit the kernel in the generated listing.
  asm __volatile__("#BEGING_GEBP\n\t");
  RhsLoopStruct<Architecture, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, ResPacket, DataMapper, SHAPES_COUNT<Architecture, CPU, LhsScalar, RhsScalar>-1> rhsLS;
  LhsPackMap lhsPackMap(blockA, depth, strideA, offsetA);
  RhsPackMap rhsPackMap(blockB, depth, strideB, offsetB);
  ResPacket pAlpha = pset1<ResPacket>(alpha);
  rhsLS(0, res, rows, depth, cols, alpha, pAlpha, lhsPackMap, rhsPackMap);
  asm __volatile__("#END_GEBP\n\t");
}
/*
template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
struct gemm_pack_rhs<float, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
{
void operator()(float* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
};
template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
void gemm_pack_rhs<float, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
::operator()(float* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
{
rhs_pack<0, 0, Index, float, DataMapper, Conjugate, PanelMode, ColMajor> pack;
pack(blockB, rhs, depth, cols, stride, offset);
}
template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
struct gemm_pack_rhs<float, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
{
void operator()(float* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
};
template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
void gemm_pack_rhs<float, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
::operator()(float* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
{
rhs_pack<0, 0, Index, float, DataMapper, Conjugate, PanelMode, RowMajor> pack;
pack(blockB, rhs, depth, cols, stride, offset);
}
template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
struct gemm_pack_lhs<float, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
{
void operator()(float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
};
template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
void gemm_pack_lhs<float, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
::operator()(float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
{
lhs_pack<0, 0, Index, float, DataMapper, Conjugate, PanelMode, RowMajor> pack;
pack(blockA, lhs, depth, rows, stride, offset);
}
template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
struct gemm_pack_lhs<float, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
{
void operator()(float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
};
template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
void gemm_pack_lhs<float, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
::operator()(float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
{
lhs_pack<0, 0, Index, float, DataMapper, Conjugate, PanelMode, ColMajor> pack;
pack(blockA, lhs, depth, rows, stride, offset);
}
*/
template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
struct gebp_kernel<float, float, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
{
void operator()(const DataMapper& res, const float* blockA, const float* blockB,
Index rows, Index depth, Index cols, float alpha,
Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
};
template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
void gebp_kernel<float, float, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
::operator()(const DataMapper& res, const float* blockA, const float* blockB,
Index rows, Index depth, Index cols, float alpha,
Index strideA, Index strideB, Index offsetA, Index offsetB)
{
gemm<0, 0, float, float, float, float, Index, DataMapper>(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
}
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_MATRIX_PRODUCT_NEON_H

View File

@@ -0,0 +1,192 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2021 Everton Constantino (everton.constantino@hotmail.com)
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_PACKING_OPS_NEON_H
#define EIGEN_PACKING_OPS_NEON_H
namespace Eigen {
namespace internal {
#ifdef __ENABLE_CUSTOM_PACKING__
// Pack shape tables for architecture 0, compiled only when
// __ENABLE_CUSTOM_PACKING__ is defined. They replace the generic tables and
// add a 4x4 tile, plus an 8x1 tile for the LHS.
// Number of rows: 3 by default (used for the RHS) ...
template<int CPU, typename Scalar, bool isLhs>
constexpr int PACK_SHAPES_COUNT<0, CPU, Scalar, isLhs> = 3;
// ... and 4 for the LHS.
template<int CPU, typename Scalar>
constexpr int PACK_SHAPES_COUNT<0, CPU, Scalar, true> = 4;
// Rows are {d1 progress, d2 progress, next shape}. The 4x4 row links straight
// to row 0 (1x1), skipping the 4x1 row.
template<int CPU, typename Scalar, bool isLhs>
constexpr int PACK_SHAPES<0, CPU, Scalar, isLhs>[PACK_SHAPES_COUNT<0, CPU, Scalar, isLhs>][PACK_SHAPES_DIMENSION] = {{1,1,PACK_SHAPES_END},{4,1,0},{4,4,0}};
// LHS table: the 8x1 row comes first in the chain and links to the 4x4 row.
template<int CPU, typename Scalar>
constexpr int PACK_SHAPES<0, CPU, Scalar, true>[PACK_SHAPES_COUNT<0, CPU, Scalar, true>][PACK_SHAPES_DIMENSION] = {{1,1,PACK_SHAPES_END},{4,1,0},{4,4,0},{8,1,2}};
// Packet-based packing of a 4x4 tile for architecture 0.
//
// Four packets are loaded from the mapper, transposed when the storage order
// requires it, and stored back to back into `block`. Returns
// block + 4 * packet size.
//
// The index swap for the RHS used to go through an `int` temporary, which
// narrowed the Index value. The mapper indices are now selected directly as
// Index values.
// NOTE(review): every load steps the second mapper index (column); confirm
// that this is the intended traversal for row-major operands.
template<int CPU, typename Index, typename Scalar, bool isLhs, typename DataMapper, bool Conjugate, bool PanelMode, int StorageOrder>
struct PackingOperator<0, CPU, Index, Scalar, isLhs, DataMapper, Conjugate, PanelMode, StorageOrder, 4, 4>
{
  EIGEN_STRONG_INLINE Scalar* operator()(Index d1Idx, Index d2Idx, Scalar *block, const DataMapper& data)
  {
    using Packet = typename packet_traits<Scalar>::type;
    constexpr int vectorSize = packet_traits<Scalar>::size;

    // The RHS mapper is addressed with d1 and d2 swapped.
    const Index rowIdx = isLhs ? d1Idx : d2Idx;
    const Index colIdx = isLhs ? d2Idx : d1Idx;

    PacketBlock<Packet, 4> tile;
    for(int k = 0; k < 4; ++k)
      tile.packet[k] = data.template loadPacket<Packet>(rowIdx, colIdx + k);

    // In these two cases the loaded packets are stored as they are; every
    // other combination needs a transpose first.
    const bool storeAsLoaded = (isLhs && StorageOrder == ColMajor) || (!isLhs && StorageOrder == RowMajor);
    if(!storeAsLoaded)
      ptranspose(tile);

    for(int k = 0; k < 4; ++k)
      pstore<Scalar>(block + k*vectorSize, tile.packet[k]);

    return block + 4*vectorSize;
  }
};
// Packing of an 8x1 tile (eight d1 entries at a single d2 position) for
// architecture 0. Returns block + 8.
//
// When the eight coefficients can be loaded as packets (column-major LHS or
// row-major RHS) two packet loads are used, at d1 offsets 0 and 4. Otherwise
// the coefficients are copied one by one.
//
// The coefficient-wise fallback used to copy d1 offsets 0..3 at d2Idx and then
// d1 offsets 0..3 again at d2Idx + 4. That disagrees with the packet path and
// reads outside the 8x1 tile. It now copies d1 offsets 0..7 at d2Idx.
//
// NOTE(review): the literal 4 assumes a packet holds four coefficients.
template<int CPU, typename Index, typename Scalar, bool isLhs, typename DataMapper, bool Conjugate, bool PanelMode, int StorageOrder>
struct PackingOperator<0, CPU, Index, Scalar, isLhs, DataMapper, Conjugate, PanelMode, StorageOrder, 8, 1>
{
  EIGEN_STRONG_INLINE Scalar* operator()(Index d1Idx, Index d2Idx, Scalar *block, const DataMapper& data)
  {
    using Packet = typename packet_traits<Scalar>::type;
    Scalar *c = block;
    if(isLhs && StorageOrder == ColMajor)
    {
      Packet p = data.template loadPacket<Packet>(d1Idx + 0, d2Idx);
      pstore<Scalar>(c, p);
      c+=4;
      p = data.template loadPacket<Packet>(d1Idx + 4, d2Idx);
      pstore<Scalar>(c, p);
      c+=4;
    } else if(!isLhs && StorageOrder == RowMajor) {
      // The RHS mapper is addressed with d1 and d2 swapped.
      Packet p = data.template loadPacket<Packet>(d2Idx, d1Idx + 0);
      pstore<Scalar>(c, p);
      c+=4;
      p = data.template loadPacket<Packet>(d2Idx, d1Idx + 4);
      pstore<Scalar>(c, p);
      c+=4;
    } else {
      // Coefficient-wise copy of the eight d1 entries at d2Idx.
      for(int i = 0; i < 8; ++i)
      {
        if(isLhs)
          *c = data(d1Idx + i, d2Idx);
        else
          *c = data(d2Idx, d1Idx + i);
        c++;
      }
    }
    return c;
  }
};
// Packing of a 4x1 tile (four d1 entries at a single d2 position) for
// architecture 0. Returns block + 4.
//
// A column-major LHS or a row-major RHS is loaded as a single packet; every
// other combination is copied coefficient by coefficient. The RHS mapper is
// always addressed with d1 and d2 swapped.
template<int CPU, typename Index, typename Scalar, bool isLhs, typename DataMapper, bool Conjugate, bool PanelMode, int StorageOrder>
struct PackingOperator<0, CPU, Index, Scalar, isLhs, DataMapper, Conjugate, PanelMode, StorageOrder, 4, 1>
{
  EIGEN_STRONG_INLINE Scalar* operator()(Index d1Idx, Index d2Idx, Scalar *block, const DataMapper& data)
  {
    using Packet = typename packet_traits<Scalar>::type;

    const bool lhsPacketLoad = isLhs && StorageOrder == ColMajor;
    const bool rhsPacketLoad = !isLhs && StorageOrder == RowMajor;

    if(lhsPacketLoad)
    {
      Packet chunk = data.template loadPacket<Packet>(d1Idx, d2Idx);
      pstore<Scalar>(block, chunk);
      return block + 4;
    }
    if(rhsPacketLoad)
    {
      Packet chunk = data.template loadPacket<Packet>(d2Idx, d1Idx);
      pstore<Scalar>(block, chunk);
      return block + 4;
    }

    Scalar *out = block;
    for(int step = 0; step < 4; ++step)
    {
      if(isLhs)
        *out = data(d1Idx + step, d2Idx);
      else
        *out = data(d2Idx, d1Idx + step);
      ++out;
    }
    return out;
  }
};
#endif // __ENABLE_CUSTOM_PACKING__
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_PACKING_OPS_NEON_H

6
compile.sh Executable file
View File

@@ -0,0 +1,6 @@
#!/bin/bash
# Builds the benchmark binaries from new_gemm_test.cpp:
#   gtp - vector kernels with prefetching enabled
#   gt  - vector kernels without prefetching
# The baseline binary built against the master checkout (gto) is disabled
# below. run.sh invokes ./gto, so that binary has to be built separately.

# Stop at the first failing command so a broken build is not mistaken for a
# successful one.
set -euo pipefail

#echo 'Compiling with master'
#g++ -O3 -I../eigen-master -std=c++11 new_gemm_test.cpp -o gto
echo 'Compiling current'
g++ -O3 -I. -std=c++14 new_gemm_test.cpp -D__ENABLE_VECTOR_KERNELS__ -D__ENABLE_PREFETCH__ -o gtp
g++ -O3 -I. -std=c++14 new_gemm_test.cpp -D__ENABLE_VECTOR_KERNELS__ -o gt

105
new_gemm_test.cpp Normal file
View File

@@ -0,0 +1,105 @@
#include <Eigen/Dense>
#include <iostream>
#include <ctime>
#include <cmath>
using namespace Eigen;
// Fills the top-left m x n part of A with values that encode their own
// position: A(i,j) = id * 10^(2*digits) + i * 10^digits + j.
// With `digits` large enough to hold every index, the matrix id, row and
// column can be read directly from the decimal digits of a coefficient.
void set(MatrixXf& A, int m, int n, int id, int digits)
{
  // Both powers are loop invariant; they used to be recomputed per element.
  const double rowScale = std::pow(10, digits);
  const double idOffset = id*std::pow(10, (2*digits));
  for(auto i = 0; i < m; i++)
    for(auto j = 0; j < n; j++)
      A(i,j) = idOffset + i*rowScale + j;
}
// Test / benchmark driver.
//
// With __DEBUG__ defined: takes one size argument, multiplies two square
// matrices filled by set() and compares every coefficient of the product with
// a plain triple loop, reporting differences above 1.0e-5.
//
// Otherwise: takes m, k, n and the number of runs, times `runs` products of
// random matrices with std::clock and prints the accumulated time in
// milliseconds. With TEST_SCALAR defined, one plain triple-loop product is
// timed and printed as well.
//
// Returns -1 when too few arguments are given, 0 otherwise.
int main(int argc, char* argv[])
{
#ifdef __DEBUG__
  // The size is read from argv[1]; make sure it is there before using it.
  if(argc < 2)
  {
    std::cout << "Wrong number of arguments." << std::endl;
    return -1;
  }
  // NOTE(review): all three extents are read from argv[1], so only square
  // problems are checked in this mode - confirm that this is intended.
  int m = std::atoi(argv[1]), k = std::atoi(argv[1]), n = std::atoi(argv[1]);
  int max = std::max(std::max(m,k),n);
  MatrixXf A = MatrixXf::Zero(m, k);
  MatrixXf B = MatrixXf::Zero(k, n);
  MatrixXf C = MatrixXf::Zero(m, n);
  MatrixXf D = MatrixXf::Zero(m, n);
  // Enough decimal digits per index to hold the largest extent.
  set(A, m, k, 1, static_cast<int>(std::log10(max)) + 1);
  set(B, k, n, 2, static_cast<int>(std::log10(max)) + 1);
  for(auto i = 0; i < 2; i++)
    C = A*B;
#ifdef __DEBUG_SHOW_INPUTS__
  std::cout << A << std::endl;
  std::cout << B << std::endl;
#endif
#ifdef __DEBUG_SHOW_RESULT__
  std::cout << C << std::endl;
#endif
  std::cout << std::endl;
  // Reference product, compared coefficient by coefficient.
  for(auto i = 0; i < m; i++)
  {
    for(auto j = 0; j < n; j++)
    {
      float acc=0;
      for(auto kk = 0; kk < k; kk++)
      {
        acc += A(i,kk)*B(kk,j);
      }
      D(i,j) = acc;
      if(std::sqrt(std::pow(D(i,j)-C(i,j),2)) > 1.0e-5)
      {
        std::cout << "Difference too big at " << i << " ," << j << " is " << C(i,j) << " should be " << D(i,j) << std::endl;
      }
    }
  }
#ifdef __DEBUG_SHOW_RESULT__
  std::cout << D << std::endl;
#endif
#else
  if(argc < 5)
  {
    std::cout << "Wrong number of arguments." << std::endl;
    return -1;
  }
  int m = std::atoi(argv[1]), k = std::atoi(argv[2]), n = std::atoi(argv[3]);
  int RUNS = std::atoi(argv[4]);
  double time = 0;
  MatrixXf A = MatrixXf::Random(m,k);
  MatrixXf B = MatrixXf::Random(k,n);
  for(auto i = 0; i < RUNS; i++)
  {
    MatrixXf C = MatrixXf::Zero(m, n);
    std::clock_t start,end;
    start = std::clock();
    C = A*B;
    end = std::clock();
    time += 1000.0*(end-start) / CLOCKS_PER_SEC;
  }
  std::cout << time << std::endl;
#ifdef TEST_SCALAR
  // The result matrix and the timers of the loop above are out of scope here,
  // so this section declares its own.
  MatrixXf C = MatrixXf::Zero(m, n);
  const std::clock_t start = std::clock();
  for(auto i = 0; i < m; i++)
  {
    for(auto j = 0; j < n; j++)
    {
      float acc=0;
      for(auto kk = 0; kk < k; kk++)
      {
        acc += A(i,kk)*B(kk,j);
      }
      C(i,j) = acc;
    }
  }
  const std::clock_t end = std::clock();
  std::cout << 1000.0*(end-start) / CLOCKS_PER_SEC << std::endl;
#endif
#endif
  return 0;
}

40
run.sh Executable file
View File

@@ -0,0 +1,40 @@
#!/bin/bash
# run EXECS SIZE RUNS
#
# Executes the three benchmark binaries EXECS times on a SIZE x SIZE x SIZE
# product (each invocation performing RUNS products) and prints
#   "<SIZE> -> <old/new> <old/new-with-prefetch>"
# where each ratio is the summed time of ./gto divided by the summed time of
# ./gt and ./gtp respectively.
#
# On every iteration a coin flip decides whether the baseline runs before or
# after the new binaries. The flip uses RANDOM % 2, which is even/odd with
# equal probability; the previous draw from 0-10 favoured the even case.
function run() {
    local execs=$1
    local size=$2
    local runs=$3
    # The totals are kept as "a+b+c" expressions and evaluated by bc at the end.
    local old=0 new=0 newp=0
    local t_old t_new t_newp
    local speed speedp
    local i

    for ((i = 0; i < execs; i++)); do
        if (( RANDOM % 2 == 0 )); then
            t_old=$(./gto "$size" "$size" "$size" "$runs")
            t_new=$(./gt "$size" "$size" "$size" "$runs")
            t_newp=$(./gtp "$size" "$size" "$size" "$runs")
        else
            t_new=$(./gt "$size" "$size" "$size" "$runs")
            t_newp=$(./gtp "$size" "$size" "$size" "$runs")
            t_old=$(./gto "$size" "$size" "$size" "$runs")
        fi
        new="$new+$t_new"
        old="$old+$t_old"
        newp="$newp+$t_newp"
    done

    speed=$(echo "($old) / ($new)" | bc -l)
    speedp=$(echo "($old) / ($newp)" | bc -l)
    echo "$size -> $speed $speedp"
}
# Benchmark sweep. $1 is the number of executions per size; the second column
# is the matrix size and the third the number of products per execution, which
# shrinks as the size grows.
run $1 16 500
run $1 21 500
run $1 32 500
run $1 53 500
run $1 64 100
run $1 97 100
run $1 128 50
run $1 203 50
run $1 256 10
run $1 673 10
run $1 1024 5
run $1 2048 2