Compare commits

..

16 Commits

Author SHA1 Message Date
Everton Constantino
58db05afbc WIP 2 2021-05-13 15:30:08 +00:00
Everton Constantino
bfadb56107 WIP 2 2021-05-13 14:48:40 +00:00
Everton Constantino
9b8cdceea8 WIP 2 2021-05-13 14:42:22 +00:00
Everton Constantino
a8ec6d6a36 WIP with tests 2021-05-12 17:09:33 +00:00
Everton Constantino
54f80f442d WIP - Vector 2021-05-10 20:06:34 +00:00
Everton Constantino
70c0363c28 WIP2 2021-05-10 19:59:47 +00:00
Everton Constantino
b2cd094863 WIP 2021-05-10 16:53:17 +00:00
Everton Constantino
d216764f46 WIP 2021-04-23 17:28:17 +00:00
Everton Constantino
646d92c7f1 WIP 2021-04-23 15:39:04 +00:00
Everton Constantino
c62ed9b214 WIP 2021-04-22 20:42:44 +00:00
Everton Constantino
82a7715b01 WIP 2021-04-22 18:11:53 +00:00
Everton Constantino
43ce8e9d2d WIP 2021-04-22 17:43:22 +00:00
Everton Constantino
ca0d3f92d7 WIP 2021-04-22 14:48:44 +00:00
Everton Constantino
5bffe09624 WIP 2021-04-22 13:14:00 +00:00
Everton Constantino
421891e1db WIP 2021-04-21 17:58:55 +00:00
Everton Constantino
f826663a3a WIP 2021-04-20 20:10:21 +00:00
38 changed files with 1802 additions and 566 deletions

View File

@@ -8,8 +8,6 @@
# with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
stages:
- buildsmoketests
- smoketests
- build
- test
@@ -18,6 +16,5 @@ variables:
EIGEN_CI_CMAKE_GENEATOR: "Ninja"
include:
- "/ci/smoketests.gitlab-ci.yml"
- "/ci/build.gitlab-ci.yml"
- "/ci/test.gitlab-ci.yml"

View File

@@ -350,6 +350,9 @@ using std::ptrdiff_t;
#include "src/Core/arch/AltiVec/MatrixProduct.h"
#elif defined EIGEN_VECTORIZE_NEON
#include "src/Core/arch/NEON/GeneralBlockPanelKernel.h"
#include "src/Core/arch/NEON/MatrixProduct.h"
#include "src/Core/arch/NEON/PackingOps.h"
#include "src/Core/arch/NEON/Kernels.h"
#endif
#include "src/Core/BooleanRedux.h"

View File

@@ -556,7 +556,7 @@ template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
peven_mask(const Packet& /*a*/) {
typedef typename unpacket_traits<Packet>::type Scalar;
const size_t n = unpacket_traits<Packet>::size;
EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) Scalar elements[n];
Scalar elements[n];
for(size_t i = 0; i < n; ++i) {
memset(elements+i, ((i & 1) == 0 ? 0xff : 0), sizeof(Scalar));
}
@@ -731,7 +731,7 @@ EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type
predux_helper(const Packet& a, Op op) {
typedef typename unpacket_traits<Packet>::type Scalar;
const size_t n = unpacket_traits<Packet>::size;
EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) Scalar elements[n];
Scalar elements[n];
pstoreu<Scalar>(elements, a);
for(size_t k = n / 2; k > 0; k /= 2) {
for(size_t i = 0; i < k; ++i) {

View File

@@ -7,9 +7,6 @@
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_STLITERATORS_H
#define EIGEN_STLITERATORS_H
namespace Eigen {
namespace internal {
@@ -33,10 +30,10 @@ public:
typedef Index difference_type;
typedef std::random_access_iterator_tag iterator_category;
indexed_based_stl_iterator_base() EIGEN_NO_THROW : mp_xpr(0), m_index(0) {}
indexed_based_stl_iterator_base(XprType& xpr, Index index) EIGEN_NO_THROW : mp_xpr(&xpr), m_index(index) {}
indexed_based_stl_iterator_base() : mp_xpr(0), m_index(0) {}
indexed_based_stl_iterator_base(XprType& xpr, Index index) : mp_xpr(&xpr), m_index(index) {}
indexed_based_stl_iterator_base(const non_const_iterator& other) EIGEN_NO_THROW
indexed_based_stl_iterator_base(const non_const_iterator& other)
: mp_xpr(other.mp_xpr), m_index(other.m_index)
{}
@@ -193,17 +190,17 @@ public:
typedef typename internal::conditional<bool(is_lvalue), value_type&, const value_type&>::type reference;
pointer_based_stl_iterator() EIGEN_NO_THROW : m_ptr(0) {}
pointer_based_stl_iterator(XprType& xpr, Index index) EIGEN_NO_THROW : m_incr(xpr.innerStride())
pointer_based_stl_iterator() : m_ptr(0) {}
pointer_based_stl_iterator(XprType& xpr, Index index) : m_incr(xpr.innerStride())
{
m_ptr = xpr.data() + index * m_incr.value();
}
pointer_based_stl_iterator(const non_const_iterator& other) EIGEN_NO_THROW
pointer_based_stl_iterator(const non_const_iterator& other)
: m_ptr(other.m_ptr), m_incr(other.m_incr)
{}
pointer_based_stl_iterator& operator=(const non_const_iterator& other) EIGEN_NO_THROW
pointer_based_stl_iterator& operator=(const non_const_iterator& other)
{
m_ptr = other.m_ptr;
m_incr.setValue(other.m_incr);
@@ -459,5 +456,3 @@ inline typename DenseBase<Derived>::const_iterator DenseBase<Derived>::cend() co
}
} // namespace Eigen
#endif // EIGEN_STLITERATORS_H

View File

@@ -139,8 +139,8 @@ EIGEN_STRONG_INLINE Packet2cf pload2(const std::complex<float>* from0, const std
__asm__ ("xxpermdi %x0, %x2, %x1, 0" : "=wa" (res0) : "wa" (res0), "wa" (res1));
#endif
#else
*reinterpret_cast<std::complex<float> *>(&res0) = *from0;
*reinterpret_cast<std::complex<float> *>(&res1) = *from1;
*((std::complex<float> *)&res0[0]) = *from0;
*((std::complex<float> *)&res1[0]) = *from1;
res0 = vec_perm(res0, res1, p16uc_TRANSPOSE64_HI);
#endif
return Packet2cf(res0);

View File

@@ -486,28 +486,19 @@ struct dhs_cpack {
if(((StorageOrder == ColMajor) && UseLhs) || (((StorageOrder == RowMajor) && !UseLhs)))
{
if (UseLhs) {
cblock.packet[0] = lhs.template loadPacket<PacketC>(j + 0, i);
cblock.packet[1] = lhs.template loadPacket<PacketC>(j + 2, i);
cblock.packet[0] = pload<PacketC>(&lhs(j + 0, i));
cblock.packet[1] = pload<PacketC>(&lhs(j + 2, i));
} else {
cblock.packet[0] = lhs.template loadPacket<PacketC>(i, j + 0);
cblock.packet[1] = lhs.template loadPacket<PacketC>(i, j + 2);
cblock.packet[0] = pload<PacketC>(&lhs(i, j + 0));
cblock.packet[1] = pload<PacketC>(&lhs(i, j + 2));
}
} else {
const std::complex<Scalar> *lhs0, *lhs1;
if (UseLhs) {
lhs0 = &lhs(j + 0, i);
lhs1 = &lhs(j + 1, i);
cblock.packet[0] = pload2(lhs0, lhs1);
lhs0 = &lhs(j + 2, i);
lhs1 = &lhs(j + 3, i);
cblock.packet[1] = pload2(lhs0, lhs1);
cblock.packet[0] = pload2(&lhs(j + 0, i), &lhs(j + 1, i));
cblock.packet[1] = pload2(&lhs(j + 2, i), &lhs(j + 3, i));
} else {
lhs0 = &lhs(i, j + 0);
lhs1 = &lhs(i, j + 1);
cblock.packet[0] = pload2(lhs0, lhs1);
lhs0 = &lhs(i, j + 2);
lhs1 = &lhs(i, j + 3);
cblock.packet[1] = pload2(lhs0, lhs1);
cblock.packet[0] = pload2(&lhs(i, j + 0), &lhs(i, j + 1));
cblock.packet[1] = pload2(&lhs(i, j + 2), &lhs(i, j + 3));
}
}
@@ -868,8 +859,8 @@ struct dhs_cpack<double, Index, DataMapper, Packet, PacketC, StorageOrder, Conju
PacketBlock<Packet,1> blockr, blocki;
PacketBlock<PacketC,2> cblock;
cblock.packet[0] = lhs.template loadPacket<PacketC>(j + 0, i);
cblock.packet[1] = lhs.template loadPacket<PacketC>(j + 1, i);
cblock.packet[0] = pload<PacketC>(&lhs(j + 0, i));
cblock.packet[1] = pload<PacketC>(&lhs(j + 1, i));
blockr.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETREAL64);
blocki.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETIMAG64);
@@ -1109,7 +1100,7 @@ EIGEN_STRONG_INLINE void pgerc(PacketBlock<Packet,N>* accReal, PacketBlock<Packe
template<typename Scalar, typename Packet>
EIGEN_STRONG_INLINE Packet ploadLhs(const Scalar* lhs)
{
return *reinterpret_cast<Packet *>(const_cast<Scalar *>(lhs));
return *((Packet *)lhs);
}
// Zero the accumulator on PacketBlock.
@@ -1808,6 +1799,24 @@ EIGEN_STRONG_INLINE void MICRO_COMPLEX_EXTRA_COL(
else EIGEN_UNUSED_VARIABLE(rhs_ptr_imag);
}
// Accumulate a single std::complex<Scalar> (i.e. half of a 2-complex packet)
// into memory: *to += <selected half of from>.  Used for the odd remaining row
// of a complex GEMM tile, where storing the whole packet would write past the
// end of the output.
// NOTE(review): on VSX/little-endian, "xxswapd %x0, %x0" modifies from.v but
// declares it only as an *input* operand ("wa"); it should be "+wa" as an
// output so the compiler knows the register is clobbered -- confirm.
// NOTE(review): the fallback path uses accColsC, which is not a parameter of
// this template (accCols is) -- presumably a file-scope constant; verify it
// is in scope wherever this overload is instantiated without __VSX__.
template<typename Scalar, typename Packetc, typename Index, const Index accCols>
EIGEN_STRONG_INLINE void pstore_add_half(std::complex<Scalar>* to, Packetc &from)
{
#ifdef __VSX__
Packetc from2;
#ifndef _BIG_ENDIAN
// Little-endian: swap the two doublewords so the half we want to keep sits
// in the doubleword addressed by lxsdx/stxsdx below.
__asm__ ("xxswapd %x0, %x0" : : "wa" (from.v));
#endif
// Load one complex (one doubleword) from *to, add, and store it back.
__asm__ ("lxsdx %x0,%y1" : "=wa" (from2.v) : "Z" (*to));
from2 += from;
__asm__ ("stxsdx %x0,%y1" : : "wa" (from2.v), "Z" (*to));
#else
// Portable fallback: spill the packet to a scratch buffer and accumulate the
// first element only.
std::complex<Scalar> mem[accColsC];
pstoreu<std::complex<Scalar> >(mem, from);
*to += *mem;
#endif
}
template<typename Scalar, typename Packet, typename Packetc, typename DataMapper, typename Index, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
EIGEN_STRONG_INLINE void gemm_complex_extra_col(
const DataMapper& res,
@@ -1877,12 +1886,12 @@ EIGEN_STRONG_INLINE void gemm_complex_extra_col(
if ((sizeof(Scalar) == sizeof(float)) && (remaining_rows == 1))
{
res(row + 0, col + 0) += pfirst<Packetc>(acc0.packet[0]);
pstore_add_half<Scalar, Packetc, Index, accCols>(&res(row + 0, col + 0), acc0.packet[0]);
} else {
acc0.packet[0] += res.template loadPacket<Packetc>(row + 0, col + 0);
res.template storePacketBlock<Packetc,1>(row + 0, col + 0, acc0);
if(remaining_rows > accColsC) {
res(row + accColsC, col + 0) += pfirst<Packetc>(acc1.packet[0]);
pstore_add_half<Scalar, Packetc, Index, accCols>(&res(row + accColsC, col + 0), acc1.packet[0]);
}
}
}
@@ -1988,7 +1997,7 @@ asm("#gemm_complex begin");
if ((sizeof(Scalar) == sizeof(float)) && (remaining_rows == 1))
{
for(Index j = 0; j < 4; j++) {
res(row + 0, col + j) += pfirst<Packetc>(acc0.packet[j]);
pstore_add_half<Scalar, Packetc, Index, accCols>(&res(row + 0, col + j), acc0.packet[j]);
}
} else {
for(Index j = 0; j < 4; j++) {
@@ -1996,7 +2005,7 @@ asm("#gemm_complex begin");
acc2.packet[0] = res.template loadPacket<Packetc>(row + 0, col + j) + acc0.packet[j];
res.template storePacketBlock<Packetc,1>(row + 0, col + j, acc2);
if(remaining_rows > accColsC) {
res(row + accColsC, col + j) += pfirst<Packetc>(acc1.packet[j]);
pstore_add_half<Scalar, Packetc, Index, accCols>(&res(row + accColsC, col + j), acc1.packet[j]);
}
}
}

View File

@@ -214,7 +214,7 @@ EIGEN_STRONG_INLINE void bcouple_common<Packet2d, Packet1cd>(PacketBlock<Packet2
template<typename Scalar, typename Packet>
EIGEN_STRONG_INLINE Packet ploadRhs(const Scalar* rhs)
{
return *reinterpret_cast<Packet *>(const_cast<Scalar *>(rhs));
return *((Packet *)rhs);
}
} // end namespace internal

View File

@@ -0,0 +1,797 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2021 Everton Constantino (everton.constantino@hotmail.com)
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_KERNELS_NEON_H
#define EIGEN_KERNELS_NEON_H
namespace Eigen {
namespace internal {
#ifdef __ENABLE_VECTOR_KERNELS__
// ---------------------------------------------------------------------------
// NEON GEBP micro-kernel bodies, written as macros so the same code can be
// stitched (and depth-unrolled) into the MicroKernel specializations below.
// Naming is MICRO_<rows>x<depth>x<cols>; each expansion performs one depth
// step.  All of them assume:
//   - lhsPackMap.pCur / rhsPackMap.pCur point into packed panels, and
//     advance(n) moves the cursor n scalars forward;
//   - a packet holds 4 scalars (128-bit NEON float/int32), hence advance(4*1)
//     per LHS packet and advance(1*4) per RHS column group;
//   - the caller has declared pLhs/pLhs2/pLhs3, pRhs, pRhs0..pRhs3 and an
//     accumulator `acc` of the matching Accumulator specialization.  These
//     names are part of the macro contract.
// pRhs[k] below indexes a lane of the NEON vector (GCC/clang vector
// extension); pset1 then broadcasts that RHS scalar across a packet.
// ---------------------------------------------------------------------------
// 12 rows x 4 cols: three LHS packets against four broadcast RHS scalars,
// accumulated into acc._acc1/_acc2/_acc3 (rows 0-3 / 4-7 / 8-11).
#define MICRO_12x1x4() \
pRhs = pload<RhsPacket>(rhsPackMap.pCur); \
rhsPackMap.advance(1*4); \
pRhs0 = pset1<RhsPacket>(pRhs[0]); \
pRhs1 = pset1<RhsPacket>(pRhs[1]); \
pRhs2 = pset1<RhsPacket>(pRhs[2]); \
pRhs3 = pset1<RhsPacket>(pRhs[3]); \
pLhs = pload<LhsPacket>(lhsPackMap.pCur); \
lhsPackMap.advance(4*1); \
acc._acc1.packet[0] += pLhs*pRhs0; \
acc._acc1.packet[1] += pLhs*pRhs1; \
acc._acc1.packet[2] += pLhs*pRhs2; \
acc._acc1.packet[3] += pLhs*pRhs3; \
pLhs2 = pload<LhsPacket>(lhsPackMap.pCur); \
lhsPackMap.advance(4*1); \
acc._acc2.packet[0] += pLhs2*pRhs0; \
acc._acc2.packet[1] += pLhs2*pRhs1; \
acc._acc2.packet[2] += pLhs2*pRhs2; \
acc._acc2.packet[3] += pLhs2*pRhs3; \
pLhs3 = pload<LhsPacket>(lhsPackMap.pCur); \
acc._acc3.packet[0] += pLhs3*pRhs0; \
acc._acc3.packet[1] += pLhs3*pRhs1; \
acc._acc3.packet[2] += pLhs3*pRhs2; \
acc._acc3.packet[3] += pLhs3*pRhs3; \
lhsPackMap.advance(4*1);
// 8 rows x 4 cols: two LHS packets against four broadcast RHS scalars,
// accumulated into acc._acc1/_acc2.
#define MICRO_8x1x4() \
pLhs = pload<LhsPacket>(lhsPackMap.pCur); \
lhsPackMap.advance(4*1); \
pLhs2 = pload<LhsPacket>(lhsPackMap.pCur); \
pRhs = pload<RhsPacket>(rhsPackMap.pCur); \
pRhs0 = pset1<RhsPacket>(pRhs[0]); \
pRhs1 = pset1<RhsPacket>(pRhs[1]); \
pRhs2 = pset1<RhsPacket>(pRhs[2]); \
pRhs3 = pset1<RhsPacket>(pRhs[3]); \
acc._acc1.packet[0] += pLhs*pRhs0; \
acc._acc1.packet[1] += pLhs*pRhs1; \
acc._acc1.packet[2] += pLhs*pRhs2; \
acc._acc1.packet[3] += pLhs*pRhs3; \
acc._acc2.packet[0] += pLhs2*pRhs0; \
acc._acc2.packet[1] += pLhs2*pRhs1; \
acc._acc2.packet[2] += pLhs2*pRhs2; \
acc._acc2.packet[3] += pLhs2*pRhs3; \
lhsPackMap.advance(4*1); \
rhsPackMap.advance(1*4);
// 4 rows x 4 cols: one LHS packet against four broadcast RHS scalars,
// accumulated into acc._acc.
#define MICRO_4x1x4() \
pLhs = pload<LhsPacket>(lhsPackMap.pCur); \
pRhs = pload<RhsPacket>(rhsPackMap.pCur); \
pRhs0 = pset1<RhsPacket>(pRhs[0]); \
pRhs1 = pset1<RhsPacket>(pRhs[1]); \
pRhs2 = pset1<RhsPacket>(pRhs[2]); \
pRhs3 = pset1<RhsPacket>(pRhs[3]); \
acc._acc.packet[0] += pLhs*pRhs0; \
acc._acc.packet[1] += pLhs*pRhs1; \
acc._acc.packet[2] += pLhs*pRhs2; \
acc._acc.packet[3] += pLhs*pRhs3; \
lhsPackMap.advance(4*1); \
rhsPackMap.advance(1*4);
// 12 rows x 1 col: one broadcast RHS scalar times three LHS packets, fused
// multiply-add into acc._acc.packet[0..2].
#define MICRO_12x1x1() \
pLhs = pload<LhsPacket>(lhsPackMap.pCur); \
pRhs = pset1<RhsPacket>(*rhsPackMap.pCur); \
acc._acc.packet[0] = pmadd(pRhs, pLhs, acc._acc.packet[0]); \
lhsPackMap.advance(4*1); \
pLhs = pload<LhsPacket>(lhsPackMap.pCur); \
acc._acc.packet[1] = pmadd(pRhs, pLhs, acc._acc.packet[1]); \
lhsPackMap.advance(4*1); \
pLhs = pload<LhsPacket>(lhsPackMap.pCur); \
acc._acc.packet[2] = pmadd(pRhs, pLhs, acc._acc.packet[2]); \
lhsPackMap.advance(4*1); \
rhsPackMap.advance(1);
// 8 rows x 1 col: one broadcast RHS scalar times two LHS packets.
#define MICRO_8x1x1() \
pLhs = pload<LhsPacket>(lhsPackMap.pCur); \
pRhs = pset1<RhsPacket>(*rhsPackMap.pCur); \
acc._acc.packet[0] = pmadd(pRhs, pLhs, acc._acc.packet[0]); \
lhsPackMap.advance(4*1); \
pLhs = pload<LhsPacket>(lhsPackMap.pCur); \
acc._acc.packet[1] = pmadd(pRhs, pLhs, acc._acc.packet[1]); \
lhsPackMap.advance(4*1); \
rhsPackMap.advance(1);
// 4 rows x 1 col: single packet multiply-accumulate.
#define MICRO_4x1x1() \
pLhs = pload<LhsPacket>(lhsPackMap.pCur); \
pRhs = pset1<RhsPacket>(*rhsPackMap.pCur); \
acc._acc += pRhs*pLhs; \
lhsPackMap.advance(4*1); \
rhsPackMap.advance(1);
template<int CPU, typename Scalar, typename ResScalar, typename DataMapper>
struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 12, 1>
{
  using LinearMapper = typename DataMapper::LinearMapper;
  using AccPacket = typename packet_traits<Scalar>::type;
  using ResPacket = typename packet_traits<ResScalar>::type;

  // 12x1 tile: packet[0..2] hold rows 0-3, 4-7 and 8-11 of one column.
  PacketBlock<AccPacket,3> _acc;

  // Reset every accumulator lane to zero.
  EIGEN_STRONG_INLINE void zero()
  {
    for (int i = 0; i < 3; ++i)
      _acc.packet[i] = pset1<AccPacket>(0);
  }

  // Nothing to prefetch for a single output column.
  EIGEN_STRONG_INLINE void prefetch(const DataMapper&, Index, Index) {}

  // Multiply the whole tile by the packetized alpha.
  template<typename ResPacket_>
  EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha)
  {
    for (int i = 0; i < 3; ++i)
      _acc.packet[i] *= pAlpha;
  }

  // Read-modify-write the 12 destination rows, four at a time.
  EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col)
  {
    for (int i = 0; i < 3; ++i)
    {
      PacketBlock<ResPacket, 1> tile;
      tile.packet[0] = dest.template loadPacket<ResPacket>(row + 4*i, col) + _acc.packet[i];
      dest.template storePacketBlock<AccPacket, 1>(row + 4*i, col, tile);
    }
  }
};
template<int CPU, typename Scalar, typename ResScalar, typename DataMapper>
struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 8, 1>
{
  using LinearMapper = typename DataMapper::LinearMapper;
  using AccPacket = typename packet_traits<Scalar>::type;
  using ResPacket = typename packet_traits<ResScalar>::type;

  // 8x1 tile: packet[0] holds rows 0-3, packet[1] rows 4-7 of one column.
  PacketBlock<AccPacket,2> _acc;

  // Reset every accumulator lane to zero.
  EIGEN_STRONG_INLINE void zero()
  {
    for (int i = 0; i < 2; ++i)
      _acc.packet[i] = pset1<AccPacket>(0);
  }

  // Nothing to prefetch for a single output column.
  EIGEN_STRONG_INLINE void prefetch(const DataMapper&, Index, Index) {}

  // Multiply the whole tile by the packetized alpha.
  template<typename ResPacket_>
  EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha)
  {
    for (int i = 0; i < 2; ++i)
      _acc.packet[i] *= pAlpha;
  }

  // Read-modify-write the 8 destination rows, four at a time.
  EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col)
  {
    for (int i = 0; i < 2; ++i)
    {
      PacketBlock<ResPacket, 1> tile;
      tile.packet[0] = dest.template loadPacket<ResPacket>(row + 4*i, col) + _acc.packet[i];
      dest.template storePacketBlock<AccPacket, 1>(row + 4*i, col, tile);
    }
  }
};
template<int CPU, typename Scalar, typename ResScalar, typename DataMapper>
struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 4, 1>
{
  using LinearMapper = typename DataMapper::LinearMapper;
  using AccPacket = typename packet_traits<Scalar>::type;
  using ResPacket = typename packet_traits<ResScalar>::type;

  // 4x1 tile: a single packet of four contiguous rows.
  AccPacket _acc;

  // Reset the accumulator to zero.
  EIGEN_STRONG_INLINE void zero()
  {
    _acc = pset1<AccPacket>(0);
  }

  // Nothing to prefetch for a single output column.
  EIGEN_STRONG_INLINE void prefetch(const DataMapper&, Index, Index) {}

  // Multiply the accumulator by the packetized alpha.
  template<typename ResPacket_>
  EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha)
  {
    _acc *= pAlpha;
  }

  // Read-modify-write the four destination rows.
  EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col)
  {
    PacketBlock<ResPacket, 1> out;
    out.packet[0] = dest.template loadPacket<ResPacket>(row, col) + _acc;
    dest.template storePacketBlock<AccPacket, 1>(row, col, out);
  }
};
template<int CPU, typename Scalar, typename ResScalar, typename DataMapper>
struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 1, 4>
{
  using LinearMapper = typename DataMapper::LinearMapper;
  using AccPacket = typename packet_traits<Scalar>::type;
  using ResPacket = typename packet_traits<ResScalar>::type;

  // 1x4 tile: one packet spanning four columns of a single row, so the
  // destination is accessed with gather/scatter rather than contiguous loads.
  AccPacket _acc;

  // Reset the accumulator to zero.
  EIGEN_STRONG_INLINE void zero()
  {
    _acc = pset1<AccPacket>(0);
  }

  // Nothing to prefetch for a single row.
  EIGEN_STRONG_INLINE void prefetch(const DataMapper&, Index, Index) {}

  // Multiply the accumulator by the packetized alpha.
  template<typename ResPacket_>
  EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha)
  {
    _acc *= pAlpha;
  }

  // Gather the four destination elements, accumulate, and scatter back.
  EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col)
  {
    const ResPacket current = dest.template gatherPacket<ResPacket>(row, col);
    dest.template scatterPacket<ResPacket>(row, col, current + _acc);
  }
};
template<int CPU, typename Scalar, typename ResScalar, typename DataMapper>
struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 4, 4>
{
  using LinearMapper = typename DataMapper::LinearMapper;
  using AccPacket = typename packet_traits<Scalar>::type;
  using ResPacket = typename packet_traits<ResScalar>::type;

  // 4x4 tile: packet[c] holds the four rows of column c.
  PacketBlock<AccPacket, 4> _acc;

  // Reset every accumulator lane to zero.
  EIGEN_STRONG_INLINE void zero()
  {
    for (int c = 0; c < 4; ++c)
      _acc.packet[c] = pset1<AccPacket>(0);
  }

  // Touch the start of each destination column ahead of the store.
  EIGEN_STRONG_INLINE void prefetch(const DataMapper& dest, Index row, Index col)
  {
    for (Index c = 0; c < 4; ++c)
      dest.getLinearMapper(row, col + c).prefetch(0);
  }

  // Multiply the whole tile by the packetized alpha.
  template<typename ResPacket_>
  EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha)
  {
    for (int c = 0; c < 4; ++c)
      _acc.packet[c] *= pAlpha;
  }

  // Read-modify-write one packet per destination column.
  EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col)
  {
    constexpr auto PacketSize = unpacket_traits<ResPacket>::size;
    for (Index c = 0; c < 4; ++c)
    {
      LinearMapper r = dest.getLinearMapper(row, col + c);
      r.storePacket(0*PacketSize, r.template loadPacket<ResPacket>(0*PacketSize) + _acc.packet[c]);
    }
  }
};
template<int CPU, typename Scalar, typename ResScalar, typename DataMapper>
struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 8, 4>
{
  using LinearMapper = typename DataMapper::LinearMapper;
  using AccPacket = typename packet_traits<Scalar>::type;
  using ResPacket = typename packet_traits<ResScalar>::type;

  // 8x4 tile: _acc1 holds rows 0-3 and _acc2 rows 4-7, one packet per column.
  PacketBlock<AccPacket, 4> _acc1;
  PacketBlock<AccPacket, 4> _acc2;

  // Reset every accumulator lane to zero.
  EIGEN_STRONG_INLINE void zero()
  {
    PacketBlock<AccPacket, 4>* halves[2] = { &_acc1, &_acc2 };
    for (int k = 0; k < 2; ++k)
      for (int c = 0; c < 4; ++c)
        halves[k]->packet[c] = pset1<AccPacket>(0);
  }

  // Touch the start of each destination column ahead of the store.
  EIGEN_STRONG_INLINE void prefetch(const DataMapper& dest, Index row, Index col)
  {
    for (Index c = 0; c < 4; ++c)
      dest.getLinearMapper(row + 0, col + c).prefetch(0);
  }

  // Multiply the whole tile by the packetized alpha.
  template<typename ResPacket_>
  EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha)
  {
    PacketBlock<AccPacket, 4>* halves[2] = { &_acc1, &_acc2 };
    for (int k = 0; k < 2; ++k)
      for (int c = 0; c < 4; ++c)
        halves[k]->packet[c] *= pAlpha;
  }

  // Read-modify-write two packets per destination column (rows 0-3 and 4-7).
  EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col)
  {
    constexpr auto PacketSize = unpacket_traits<ResPacket>::size;
    PacketBlock<AccPacket, 4>* halves[2] = { &_acc1, &_acc2 };
    for (Index c = 0; c < 4; ++c)
    {
      LinearMapper r = dest.getLinearMapper(row, col + c);
      for (Index k = 0; k < 2; ++k)
        r.storePacket(k*PacketSize, r.template loadPacket<ResPacket>(k*PacketSize) + halves[k]->packet[c]);
    }
  }
};
template<int CPU, typename Scalar, typename ResScalar, typename DataMapper>
struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 12, 4>
{
  using LinearMapper = typename DataMapper::LinearMapper;
  using AccPacket = typename packet_traits<Scalar>::type;
  using ResPacket = typename packet_traits<ResScalar>::type;

  // 12x4 tile: _acc1/_acc2/_acc3 hold rows 0-3 / 4-7 / 8-11, one packet per
  // column in each.
  PacketBlock<AccPacket, 4> _acc1;
  PacketBlock<AccPacket, 4> _acc2;
  PacketBlock<AccPacket, 4> _acc3;

  // Reset every accumulator lane to zero.
  EIGEN_STRONG_INLINE void zero()
  {
    PacketBlock<AccPacket, 4>* thirds[3] = { &_acc1, &_acc2, &_acc3 };
    for (int k = 0; k < 3; ++k)
      for (int c = 0; c < 4; ++c)
        thirds[k]->packet[c] = pset1<AccPacket>(0);
  }

  // Touch the start of each destination column ahead of the store.
  EIGEN_STRONG_INLINE void prefetch(const DataMapper& dest, Index row, Index col)
  {
    for (Index c = 0; c < 4; ++c)
      dest.getLinearMapper(row + 0, col + c).prefetch(0);
  }

  // Multiply the whole tile by the packetized alpha.
  template<typename ResPacket_>
  EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha)
  {
    PacketBlock<AccPacket, 4>* thirds[3] = { &_acc1, &_acc2, &_acc3 };
    for (int k = 0; k < 3; ++k)
      for (int c = 0; c < 4; ++c)
        thirds[k]->packet[c] *= pAlpha;
  }

  // Read-modify-write three packets per destination column
  // (rows 0-3, 4-7 and 8-11).
  EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col)
  {
    constexpr auto PacketSize = unpacket_traits<ResPacket>::size;
    PacketBlock<AccPacket, 4>* thirds[3] = { &_acc1, &_acc2, &_acc3 };
    for (Index c = 0; c < 4; ++c)
    {
      LinearMapper r = dest.getLinearMapper(row, col + c);
      for (Index k = 0; k < 3; ++k)
        r.storePacket(k*PacketSize, r.template loadPacket<ResPacket>(k*PacketSize) + thirds[k]->packet[c]);
    }
  }
};
// 4-row x 4-column micro-kernel, depth-unrolled (4 steps, plus 4 more when
// __UNROLL__ > 4).  The pLhs/pRhs* locals are required by name by the
// MICRO_4x1x4 macro, which also advances both pack-map cursors per step.
// NOTE(review): the asm marker says 4x4x4 even when 8 depth steps run --
// label predates the unroll parameter.
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 4, __UNROLL__ , 4>
{
EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
RhsPackMap& rhsPackMap,
Index rowIdx, Index colIdx, Index depthIdx,
Accumulator& acc)
{
using LhsPacket = typename packet_traits<LhsScalar>::type;
using RhsPacket = typename packet_traits<RhsScalar>::type;
// Markers delimit the kernel in generated assembly for inspection.
asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x4x4\n\t");
LhsPacket pLhs;
RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3;
MICRO_4x1x4();
MICRO_4x1x4();
MICRO_4x1x4();
MICRO_4x1x4();
#if __UNROLL__ > 4
MICRO_4x1x4();
MICRO_4x1x4();
MICRO_4x1x4();
MICRO_4x1x4();
#endif
asm __volatile__("#END_NEON_MICROKERNEL_4x4x4\n\t");
};
};
// 8-row x 4-column micro-kernel, depth-unrolled (8 steps when __UNROLL__ == 8,
// otherwise 4).  Software-prefetches the RHS panel ahead of the loads when
// __ENABLE_PREFETCH__ is defined.  Local names are fixed by MICRO_8x1x4.
// NOTE(review): the asm marker says 8x8x4 even on the 4-step branch.
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 8, __UNROLL__, 4>
{
EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
RhsPackMap& rhsPackMap,
Index rowIdx, Index colIdx, Index depthIdx,
Accumulator& acc)
{
using LhsPacket = typename packet_traits<LhsScalar>::type;
using RhsPacket = typename packet_traits<RhsScalar>::type;
asm __volatile__("#BEGIN_NEON_MICROKERNEL_8x8x4\n\t");
LhsPacket pLhs, pLhs2;
RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3;
#if __UNROLL__ == 8
#ifdef __ENABLE_PREFETCH__
// Prefetch the RHS data that the second half of the unroll will read.
prefetch(rhsPackMap.pCur + (48+0));
#endif
MICRO_8x1x4();
MICRO_8x1x4();
MICRO_8x1x4();
MICRO_8x1x4();
#ifdef __ENABLE_PREFETCH__
prefetch(rhsPackMap.pCur + (48+16));
#endif
MICRO_8x1x4();
MICRO_8x1x4();
MICRO_8x1x4();
MICRO_8x1x4();
#else
MICRO_8x1x4();
MICRO_8x1x4();
MICRO_8x1x4();
MICRO_8x1x4();
#endif
asm __volatile__("#END_NEON_MICROKERNEL_8x8x4\n\t");
};
};
// 12-row x 4-column micro-kernel, depth-unrolled (8 steps when __UNROLL__ == 8,
// otherwise 4).  Local names are fixed by MICRO_12x1x4.
// NOTE(review): the asm marker says 12x8x4 even on the 4-step branch.
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 12, __UNROLL__, 4>
{
EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
RhsPackMap& rhsPackMap,
Index rowIdx, Index colIdx, Index depthIdx,
Accumulator& acc)
{
using LhsPacket = typename packet_traits<LhsScalar>::type;
using RhsPacket = typename packet_traits<RhsScalar>::type;
asm __volatile__("#BEGIN_NEON_MICROKERNEL_12x8x4\n\t");
LhsPacket pLhs, pLhs2, pLhs3;
RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3;
#if __UNROLL__ == 8
#ifdef __ENABLE_PREFETCH__
// Prefetch the RHS cursor position before the unrolled sequence starts.
prefetch(rhsPackMap.pCur);
#endif
MICRO_12x1x4();
MICRO_12x1x4();
MICRO_12x1x4();
MICRO_12x1x4();
MICRO_12x1x4();
MICRO_12x1x4();
MICRO_12x1x4();
MICRO_12x1x4();
#else
MICRO_12x1x4();
MICRO_12x1x4();
MICRO_12x1x4();
MICRO_12x1x4();
#endif
asm __volatile__("#END_NEON_MICROKERNEL_12x8x4\n\t");
};
};
// Depth-remainder kernel: a single 12-row x 4-column depth step, used when
// fewer than __UNROLL__ depth iterations remain.  Locals are named for the
// MICRO_12x1x4 macro.
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 12, 1, 4>
{
EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
RhsPackMap& rhsPackMap,
Index rowIdx, Index colIdx, Index depthIdx,
Accumulator& acc)
{
using LhsPacket = typename packet_traits<LhsScalar>::type;
using RhsPacket = typename packet_traits<RhsScalar>::type;
asm __volatile__("#BEGIN_NEON_MICROKERNEL_12x1x4\n\t");
LhsPacket pLhs, pLhs2, pLhs3;
RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3;
MICRO_12x1x4();
asm __volatile__("#END_NEON_MICROKERNEL_12x1x4\n\t");
};
};
// Depth-remainder kernel: a single 8-row x 4-column depth step.  Locals are
// named for the MICRO_8x1x4 macro.
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 8, 1, 4>
{
EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
RhsPackMap& rhsPackMap,
Index rowIdx, Index colIdx, Index depthIdx,
Accumulator& acc)
{
using LhsPacket = typename packet_traits<LhsScalar>::type;
using RhsPacket = typename packet_traits<RhsScalar>::type;
asm __volatile__("#BEGIN_NEON_MICROKERNEL_8x1x4\n\t");
LhsPacket pLhs, pLhs2;
RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3;
MICRO_8x1x4();
asm __volatile__("#END_NEON_MICROKERNEL_8x1x4\n\t");
};
};
// Depth-remainder kernel: a single 4-row x 4-column depth step.  Locals are
// named for the MICRO_4x1x4 macro.
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 4, 1, 4>
{
EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
RhsPackMap& rhsPackMap,
Index rowIdx, Index colIdx, Index depthIdx,
Accumulator& acc)
{
using LhsPacket = typename packet_traits<LhsScalar>::type;
using RhsPacket = typename packet_traits<RhsScalar>::type;
asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x1x4\n\t");
LhsPacket pLhs;
RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3;
MICRO_4x1x4();
asm __volatile__("#END_NEON_MICROKERNEL_4x1x4\n\t");
};
};
// 12-row x 1-column micro-kernel, 8 depth steps per call via MICRO_12x1x1.
// Locals pLhs/pRhs are named for the macro.
// Fix: the asm comment markers said "4x1x1" (copy-paste from the 4x1x1
// kernel), which made profiling and disassembly navigation misleading; they
// now reflect the actual 12-row, 8-depth, 1-column shape.  Marker text is an
// assembly comment only -- no generated instructions change.
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 12, __UNROLL__, 1>
{
  EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
                                      RhsPackMap& rhsPackMap,
                                      Index rowIdx, Index colIdx, Index depthIdx,
                                      Accumulator& acc)
  {
    using LhsPacket = typename packet_traits<LhsScalar>::type;
    using RhsPacket = typename packet_traits<RhsScalar>::type;
    LhsPacket pLhs;
    RhsPacket pRhs;
    asm __volatile__("#BEGIN_NEON_MICROKERNEL_12x8x1\n\t");
    MICRO_12x1x1();
    MICRO_12x1x1();
    MICRO_12x1x1();
    MICRO_12x1x1();
    MICRO_12x1x1();
    MICRO_12x1x1();
    MICRO_12x1x1();
    MICRO_12x1x1();
    asm __volatile__("#END_NEON_MICROKERNEL_12x8x1\n\t");
  };
};
// Depth-remainder kernel: a single 12-row x 1-column depth step.
// Fix: the asm comment markers said "4x1x1" (copy-paste); they now say
// 12x1x1 so disassembly/profiling labels match the kernel.  Marker text is
// an assembly comment only -- no generated instructions change.
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 12, 1, 1>
{
  EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
                                      RhsPackMap& rhsPackMap,
                                      Index rowIdx, Index colIdx, Index depthIdx,
                                      Accumulator& acc)
  {
    using LhsPacket = typename packet_traits<LhsScalar>::type;
    using RhsPacket = typename packet_traits<RhsScalar>::type;
    LhsPacket pLhs;
    RhsPacket pRhs;
    asm __volatile__("#BEGIN_NEON_MICROKERNEL_12x1x1\n\t");
    MICRO_12x1x1();
    asm __volatile__("#END_NEON_MICROKERNEL_12x1x1\n\t");
  };
};
// 8-row x 1-column micro-kernel, 8 depth steps per call via MICRO_8x1x1.
// Fix: the asm comment markers said "4x1x1" (copy-paste); they now say
// 8x8x1 so disassembly/profiling labels match the kernel.  Marker text is
// an assembly comment only -- no generated instructions change.
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 8, __UNROLL__, 1>
{
  EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
                                      RhsPackMap& rhsPackMap,
                                      Index rowIdx, Index colIdx, Index depthIdx,
                                      Accumulator& acc)
  {
    using LhsPacket = typename packet_traits<LhsScalar>::type;
    using RhsPacket = typename packet_traits<RhsScalar>::type;
    asm __volatile__("#BEGIN_NEON_MICROKERNEL_8x8x1\n\t");
    LhsPacket pLhs;
    RhsPacket pRhs;
    MICRO_8x1x1();
    MICRO_8x1x1();
    MICRO_8x1x1();
    MICRO_8x1x1();
    MICRO_8x1x1();
    MICRO_8x1x1();
    MICRO_8x1x1();
    MICRO_8x1x1();
    asm __volatile__("#END_NEON_MICROKERNEL_8x8x1\n\t");
  };
};
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 8, 1, 1>
{
  // 8-row x 1-column, single depth step. pLhs/pRhs are names required by
  // the MICRO_8x1x1() macro.
  EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
                                      RhsPackMap& rhsPackMap,
                                      Index rowIdx, Index colIdx, Index depthIdx,
                                      Accumulator& acc)
  {
    using LhsPacket = typename packet_traits<LhsScalar>::type;
    using RhsPacket = typename packet_traits<RhsScalar>::type;
    // Fixed: banners previously said 4x1x1; this is the 8x1x1 kernel.
    asm __volatile__("#BEGIN_NEON_MICROKERNEL_8x1x1\n\t");
    LhsPacket pLhs;
    RhsPacket pRhs;
    MICRO_8x1x1();
    asm __volatile__("#END_NEON_MICROKERNEL_8x1x1\n\t");
  }
};
// 4-row x 1-column micro-kernel, unrolled __UNROLL__ (8) times along the depth
// dimension. The pLhs/pRhs names are part of the MICRO_4x1x1() macro contract.
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 4, __UNROLL__, 1>
{
EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
RhsPackMap& rhsPackMap,
Index rowIdx, Index colIdx, Index depthIdx,
Accumulator& acc)
{
using LhsPacket = typename packet_traits<LhsScalar>::type;
using RhsPacket = typename packet_traits<RhsScalar>::type;
// The asm comments only delimit the kernel in assembly dumps.
asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x1x1\n\t");
LhsPacket pLhs;
RhsPacket pRhs;
MICRO_4x1x1();
MICRO_4x1x1();
MICRO_4x1x1();
MICRO_4x1x1();
MICRO_4x1x1();
MICRO_4x1x1();
MICRO_4x1x1();
MICRO_4x1x1();
asm __volatile__("#END_NEON_MICROKERNEL_4x1x1\n\t");
};
};
// 4-row x 1-column micro-kernel, single depth step. pLhs/pRhs are names the
// MICRO_4x1x1() macro expects in scope.
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 4, 1, 1>
{
EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
RhsPackMap& rhsPackMap,
Index rowIdx, Index colIdx, Index depthIdx,
Accumulator& acc)
{
using LhsPacket = typename packet_traits<LhsScalar>::type;
using RhsPacket = typename packet_traits<RhsScalar>::type;
asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x1x1\n\t");
LhsPacket pLhs;
RhsPacket pRhs;
MICRO_4x1x1();
asm __volatile__("#END_NEON_MICROKERNEL_4x1x1\n\t");
};
};
// 1-row x 4-column micro-kernel: broadcasts the single packed lhs value and
// multiplies it against a packet of 4 packed rhs values, accumulating into
// the accumulator's single packet (_acc).
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 1, 1, 4>
{
EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
RhsPackMap& rhsPackMap,
Index rowIdx, Index colIdx, Index depthIdx,
Accumulator& acc)
{
using RhsPacket = typename packet_traits<RhsScalar>::type;
using LhsPacket = typename packet_traits<LhsScalar>::type;
asm __volatile__("#BEGIN_NEON_MICROKERNEL_1x1x4\n\t");
// Broadcast the one lhs scalar; load 4 contiguous packed rhs values.
LhsPacket pLhs = pset1<LhsPacket>(*lhsPackMap.pCur);
RhsPacket pRhs = pload<RhsPacket>(rhsPackMap.pCur);
acc._acc += pLhs*pRhs;
// Consumed 1 lhs element and 4 rhs elements from the packed buffers.
lhsPackMap.advance(1);
rhsPackMap.advance(4*1);
asm __volatile__("#END_NEON_MICROKERNEL_1x1x4\n\t");
};
};
#endif // __ENABLE_VECTOR_KERNELS__
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_KERNELS_NEON_H

View File

@@ -0,0 +1,523 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2021 Everton Constantino (everton.constantino@hotmail.com)
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_MATRIX_PRODUCT_NEON_H
#define EIGEN_MATRIX_PRODUCT_NEON_H
#ifdef __DEBUG__
#include <iostream>
#endif
namespace Eigen {
namespace internal {
#ifndef __UNROLL__
#define __UNROLL__ 8
#endif
// Number of kernel shapes available for a given architecture/CPU/scalar
// combination; architectures may specialize these variable templates.
template<int Architecture, int CPU, typename LhsScalar, typename RhsScalar>
constexpr int SHAPES_COUNT = 14;
// Column indices into the SHAPES table below.
constexpr int SHAPES_DIMENSION = 6;
constexpr int SHAPES_LHS_DIMENSION = 0;
constexpr int SHAPES_DEP_DIMENSION = 1;
constexpr int SHAPES_RHS_DIMENSION = 2;
constexpr int SHAPES_RHS_POINTER = 3;
constexpr int SHAPES_LHS_POINTER = 4;
constexpr int SHAPES_DEP_POINTER = 5;
// Sentinel meaning "no smaller shape to fall back to".
constexpr int SHAPES_POINTER_END = -1;
// Number of packing shapes; the lhs gets two extra vector-width multiples.
template<int Architecture, int CPU, typename Scalar, bool isLhs>
constexpr int PACK_SHAPES_COUNT = 2;
template<int Architecture, int CPU, typename Scalar>
constexpr int PACK_SHAPES_COUNT<Architecture, CPU, Scalar, true> = 4;
constexpr int PACK_SHAPES_DIMENSION = 3;
constexpr int PACK_SHAPES_POINTER = 2;
constexpr int PACK_SHAPES_END = -1;
// Each row: lhs_progress, depth_progress, rhs_progress, then fallback row
// indices into this same table for the rhs, lhs and depth loops
// (SHAPES_POINTER_END terminates the fallback chain).
template<int Architecture, int CPU, typename LhsScalar, typename RhsScalar>
constexpr int SHAPES[SHAPES_COUNT<Architecture, CPU, LhsScalar,RhsScalar>][SHAPES_DIMENSION] =
{ /* 00 */{ 1, 1,1,SHAPES_POINTER_END, SHAPES_POINTER_END, SHAPES_POINTER_END},
/* 01 */{1*packet_traits<RhsScalar>::size, 1,1, 0, 0, SHAPES_POINTER_END},
/* 02 */{1*packet_traits<RhsScalar>::size,__UNROLL__,1, 0, 0, 1},
/* 03 */{2*packet_traits<RhsScalar>::size, 1,1, 0, 2, SHAPES_POINTER_END},
/* 04 */{2*packet_traits<RhsScalar>::size,__UNROLL__,1, 0, 2, 3},
/* 05 */{3*packet_traits<RhsScalar>::size, 1,1, 0, 4, SHAPES_POINTER_END},
/* 06 */{3*packet_traits<RhsScalar>::size,__UNROLL__,1, 0, 4, 5},
/* 07 */{ 1, 1,4, 6, SHAPES_POINTER_END, SHAPES_POINTER_END},
/* 08 */{1*packet_traits<RhsScalar>::size, 1,4, 6, 7, SHAPES_POINTER_END},
/* 09 */{1*packet_traits<RhsScalar>::size,__UNROLL__,4, 6, 7, 8},
/* 10 */{2*packet_traits<RhsScalar>::size, 1,4, 6, 9, SHAPES_POINTER_END},
/* 11 */{2*packet_traits<RhsScalar>::size,__UNROLL__,4, 6, 9, 10},
/* 12 */{3*packet_traits<RhsScalar>::size, 1,4, 6, 11, SHAPES_POINTER_END},
/* 13 */{3*packet_traits<RhsScalar>::size,__UNROLL__,4, 6, 11, 12}};
// Packing shapes: d1progress, d2progress, then the fallback row index in this
// table (PACK_SHAPES_END terminates the chain).
template<int Architecture, int CPU, typename Scalar, bool isLhs>
constexpr int PACK_SHAPES[PACK_SHAPES_COUNT<Architecture, CPU, Scalar, isLhs>][PACK_SHAPES_DIMENSION] =
{{ 1, 1, PACK_SHAPES_END},
{ 4, 1, 0}};
// The lhs additionally packs in 1x/2x/3x vector-width slivers.
template<int Architecture, int CPU, typename Scalar>
constexpr int PACK_SHAPES<Architecture, CPU, Scalar, true>[PACK_SHAPES_COUNT<Architecture, CPU, Scalar, true>][PACK_SHAPES_DIMENSION] =
{{ 1, 1, PACK_SHAPES_END},
{1*packet_traits<Scalar>::size, 1, 0},
{2*packet_traits<Scalar>::size, 1, 1},
{3*packet_traits<Scalar>::size, 1, 2}};
// Generic scalar fallback tile packer: copies an M x N tile from the data
// mapper into the packed buffer in row-major order and returns the advanced
// write pointer. For the rhs the mapper indices are transposed, so the packed
// layout is identical for both sides.
template<int Architecture, int CPU, typename Index, typename Scalar, bool isLhs, typename DataMapper, bool Conjugate, bool PanelMode, int StorageOrder, int M, int N>
struct PackingOperator
{
  EIGEN_STRONG_INLINE Scalar* operator()(Index d1Idx, Index d2Idx, Scalar *block, const DataMapper& data)
  {
#ifdef __DEBUG__
    std::cout << M << "x" << N << " ( " << d1Idx << ", " << d2Idx <<") -> ( " << d1Idx + M << ", " << d2Idx + N << ") ";
#endif
    Scalar *dst = block;
    for(auto r = 0; r < M; r++)
    {
      for(auto s = 0; s < N; s++)
      {
        // isLhs is a compile-time constant, so this branch folds away.
        *dst = isLhs ? data(d1Idx + r, d2Idx + s) : data(d2Idx + s, d1Idx + r);
#ifdef __DEBUG__
        std::cout << *dst << " ";
#endif
        ++dst;
      }
    }
#ifdef __DEBUG__
    std::cout << std::endl;
#endif
    return dst;
  }
};
// Inner (d2) packing loop for a fixed d1 sliver width D1PROGRESS. Sweeps d2
// with the d2Progress of pack shape IDX, then recurses to the next-smaller
// shape (IDX-1) to mop up the remainder — but only if that smaller shape has
// the same d1 width, so the sliver stays rectangular.
template<int Architecture, int CPU, typename Index, typename Scalar, bool isLhs, typename DataMapper, bool Conjugate, bool PanelMode, int StorageOrder, int D1PROGRESS, int IDX>
struct PackingInnerStruct
{
EIGEN_STRONG_INLINE Scalar* operator()(Index d1Idx, Index d2Idx, Scalar *block, const DataMapper& data, Index d1Size, Index d2Size, Index stride, Index offset)
{
constexpr auto d2Progress = PACK_SHAPES<Architecture, CPU, Scalar, isLhs>[IDX][1];
PackingOperator<Architecture, CPU, Index, Scalar, isLhs, DataMapper, Conjugate, PanelMode, StorageOrder, D1PROGRESS, d2Progress> po;
for(;d2Idx + d2Progress <= d2Size; d2Idx+=d2Progress)
{
block = po(d1Idx, d2Idx, block, data);
}
// Only instantiated for IDX >= 1 (IDX == 0 has its own specialization),
// so [IDX-1] is always in range.
if(PACK_SHAPES<Architecture, CPU, Scalar, isLhs>[IDX-1][0] == D1PROGRESS)
{
PackingInnerStruct<Architecture, CPU, Index, Scalar, isLhs, DataMapper, Conjugate, PanelMode, StorageOrder, D1PROGRESS, IDX-1> pis;
block = pis(d1Idx, d2Idx, block, data, d1Size, d2Size, stride, offset);
}
return block;
}
};
// Base case of the inner (d2) packing recursion: sweep the remaining d2 range
// with the narrowest pack shape (row 0 of PACK_SHAPES).
template<int Architecture, int CPU, typename Index, typename Scalar, bool isLhs, typename DataMapper, bool Conjugate, bool PanelMode, int StorageOrder, int D1PROGRESS>
struct PackingInnerStruct<Architecture, CPU, Index, Scalar, isLhs, DataMapper, Conjugate, PanelMode, StorageOrder, D1PROGRESS, 0>
{
  EIGEN_STRONG_INLINE Scalar* operator()(Index d1Idx, Index d2Idx, Scalar *block, const DataMapper& data, Index d1Size, Index d2Size, Index stride, Index offset)
  {
    constexpr auto d2Progress = PACK_SHAPES<Architecture, CPU, Scalar, isLhs>[0][1];
    // The tile packer is stateless; one instance serves every iteration.
    PackingOperator<Architecture, CPU, Index, Scalar, isLhs, DataMapper, Conjugate, PanelMode, StorageOrder, D1PROGRESS, d2Progress> packTile;
    while(d2Idx + d2Progress <= d2Size)
    {
      block = packTile(d1Idx, d2Idx, block, data);
      d2Idx += d2Progress;
    }
    return block;
  }
};
// Outer (d1) packing loop: packs as many full d1Progress-wide slivers as fit
// for shape PACK_SHAPE_IDX, then hands the remainder to the next shape in the
// fallback chain (via the PACK_SHAPES_POINTER column).
template<int Architecture, int CPU, typename Index, typename Scalar, bool isLhs, typename DataMapper, bool Conjugate, bool PanelMode, int StorageOrder, int PACK_SHAPE_IDX>
struct PackingStruct
{
PackingStruct<Architecture, CPU, Index, Scalar, isLhs, DataMapper, Conjugate, PanelMode, StorageOrder, PACK_SHAPES<Architecture, CPU, Scalar, isLhs>[PACK_SHAPE_IDX][PACK_SHAPES_POINTER]> ps;
EIGEN_STRONG_INLINE Scalar* operator()(Index d1Idx, Scalar *block, const DataMapper& data, Index d1Size, Index d2Size, Index stride, Index offset)
{
constexpr auto d1Progress = PACK_SHAPES<Architecture, CPU, Scalar, isLhs>[PACK_SHAPE_IDX][0];
for(; d1Idx + d1Progress <= d1Size; d1Idx += d1Progress)
{
PackingInnerStruct<Architecture, CPU, Index, Scalar, isLhs, DataMapper, Conjugate, PanelMode, StorageOrder, d1Progress, PACK_SHAPE_IDX> pis;
block = pis(d1Idx, 0, block, data, d1Size, d2Size, stride, offset);
}
// Remainder rows/cols are handled by the next-smaller shape.
return ps(d1Idx, block, data, d1Size, d2Size, stride, offset);
}
};
// Recursion terminator: no smaller pack shape remains, nothing left to pack.
template<int Architecture, int CPU, typename Index, typename Scalar, bool isLhs, typename DataMapper, bool Conjugate, bool PanelMode, int StorageOrder>
struct PackingStruct<Architecture, CPU, Index, Scalar, isLhs, DataMapper, Conjugate, PanelMode, StorageOrder, -1>
{
EIGEN_STRONG_INLINE Scalar* operator()(Index, Scalar *block, const DataMapper&, Index, Index, Index, Index) { return block; }
};
// Packs the LHS operand: walks the pack-shape list starting from the widest
// shape (highest index) over the rows x depth region.
template<int Architecture, int CPU, typename Index, typename Scalar, typename DataMapper, bool Conjugate, bool PanelMode, int StorageOrder>
struct lhs_pack
{
  EIGEN_STRONG_INLINE void operator()(Scalar *blockA, const DataMapper &lhs, Index depth, Index rows, Index stride, Index offset)
  {
    using Packer = PackingStruct<Architecture, CPU, Index, Scalar, true, DataMapper, Conjugate, PanelMode, StorageOrder, PACK_SHAPES_COUNT<Architecture, CPU, Scalar, true>-1>;
    Packer packer;
    packer(0, blockA, lhs, rows, depth, stride, offset);
  }
};
// Packs the RHS operand: same scheme as lhs_pack but with the dimensions
// swapped (d1 = cols, d2 = depth) and isLhs = false.
template<int Architecture, int CPU, typename Index, typename Scalar, typename DataMapper, bool Conjugate, bool PanelMode, int StorageOrder>
struct rhs_pack
{
  EIGEN_STRONG_INLINE void operator()(Scalar *blockB, const DataMapper &rhs, Index depth, Index cols, Index stride, Index offset)
  {
    using Packer = PackingStruct<Architecture, CPU, Index, Scalar, false, DataMapper, Conjugate, PanelMode, StorageOrder, PACK_SHAPES_COUNT<Architecture, CPU, Scalar, false>-1>;
    Packer packer;
    packer(0, blockB, rhs, cols, depth, stride, offset);
  }
};
// Translates a d1 coordinate into the corresponding offset inside the packed
// buffer by mirroring the packing recursion: full slivers of this shape
// occupy d1Progress*d2Size elements each; the leftover coordinate is resolved
// by the next-smaller shape in the fallback chain.
template<int Architecture, int CPU, typename Index, typename Scalar, typename DataMapper, bool isLhs, int IDX>
struct PackMapCalculator
{
  PackMapCalculator<Architecture, CPU, Index, Scalar, DataMapper, isLhs, PACK_SHAPES<Architecture, CPU, Scalar, isLhs>[IDX][PACK_SHAPES_POINTER]> pmc;
  EIGEN_STRONG_INLINE Index getPosition(Index pos, Index d2Size)
  {
    constexpr auto d1Progress = PACK_SHAPES<Architecture, CPU, Scalar, isLhs>[IDX][0];
    // Round pos down to a multiple of d1Progress (pos is non-negative).
    const Index aligned = pos - (pos % d1Progress);
    return aligned * d2Size + pmc.getPosition(pos - aligned, d2Size);
  }
};
// Recursion terminator: a coordinate fully consumed by larger shapes maps to
// offset zero within its final sliver.
template<int Architecture, int CPU, typename Index, typename Scalar, typename DataMapper, bool isLhs>
struct PackMapCalculator<Architecture, CPU, Index, Scalar, DataMapper, isLhs, -1>
{
EIGEN_STRONG_INLINE Index getPosition(Index, Index) { return Index(0); }
};
// Cursor over a packed operand buffer. moveTo() converts a logical row/col
// coordinate into the corresponding position in the packed layout via
// PackMapCalculator; advance() steps the cursor by raw packed elements.
template<int Architecture, int CPU, typename Index, typename Scalar, typename DataMapper, bool isLhs>
struct PackMap
{
  const Scalar *pBase;  // start of the packed buffer
  const Scalar *pCur;   // current read position
  Index stride;
  Index offset;
  Index d2Size;         // extent of the second packed dimension (depth)
  PackMapCalculator<Architecture, CPU, Index, Scalar, DataMapper, isLhs, PACK_SHAPES_COUNT<Architecture, CPU, Scalar, isLhs>-1> pmc;
  // Fixed: the mem-initializer list now matches the member declaration order
  // (members are always initialized in declaration order, so the old list
  // triggered -Wreorder without changing behavior).
  PackMap(const Scalar *base, Index d2Size, Index stride, Index offset) : pBase(base), pCur(base), stride(stride), offset(offset), d2Size(d2Size) {}
  EIGEN_STRONG_INLINE void resetCur() { pCur = pBase; }
  EIGEN_STRONG_INLINE void moveTo(Index p1) { pCur = pBase + pmc.getPosition(p1, d2Size); }
  EIGEN_STRONG_INLINE void advance(int progress) { pCur += progress; }
};
// Generic (scalar) M x N accumulator tile. Architecture kernels specialize
// this with vector registers; this fallback keeps the tile in a plain array.
template<int Architecture, int CPU, typename Scalar, typename ResScalar, typename DataMapper, int M, int N>
struct Accumulator
{
  Scalar dt[M][N];
  // Clear the tile before the depth loop.
  EIGEN_STRONG_INLINE void zero()
  {
    for(auto i = 0; i < M; i++)
    {
      for(auto j = 0; j < N; j++)
      {
        dt[i][j] = Scalar(0);
      }
    }
  }
  // No-op in the scalar fallback; vector specializations may prefetch the
  // destination.
  EIGEN_STRONG_INLINE void prefetch(const DataMapper&, Index, Index) {}
  // Scale the whole tile by alpha. The packetized alpha is not used by the
  // scalar fallback (parameter left unnamed to avoid unused-parameter
  // warnings).
  template<typename ResPacket>
  EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket&)
  {
    for(auto i = 0; i < M; i++)
    {
      for(auto j = 0; j < N; j++)
      {
        dt[i][j] *= alpha;
      }
    }
  }
  // Accumulate the tile into the result mapper at (row, col).
  EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col)
  {
    for(auto i = 0; i < M; i++)
    {
      for(auto j = 0; j < N; j++)
      {
        dest(row + i, col + j) += dt[i][j];
      }
    }
  }
};
// Generic scalar micro-kernel: computes an M x N rank-K update from the
// packed buffers into the accumulator tile. Architecture-specific
// specializations (e.g. the NEON ones in Kernels.h) override the hot shapes.
template<int Architecture, int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator, int M, int K, int N>
struct MicroKernel
{
EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
RhsPackMap& rhsPackMap,
Index rowIdx, Index colIdx, Index depthIdx,
Accumulator& acc)
{
#ifdef __DEBUG__
std::cout << "Kernel " << M << " x " << K << " x " << N << " @ " << rowIdx << ", " << depthIdx << ", " << colIdx << std::endl;
std::cout << "LHS ";
for(auto i = 0; i < M; i++)
{
for(auto j = 0; j < K; j++)
{
std::cout << lhsPackMap.pCur[i*K + j] << " ";
}
}
std::cout << std::endl << "RHS ";
for(auto i = 0; i < K; i++)
{
for(auto j = 0; j < N; j++)
{
std::cout << rhsPackMap.pCur[i*N + j] << " ";
}
}
std::cout << std::endl;
#endif
// NOTE(review): only the first K-slice (pRhs[i], pLhs[j]) is read here,
// i.e. this generic path effectively performs one depth step per call —
// confirm against the specialized kernels before relying on K > 1.
const RhsScalar *pRhs = rhsPackMap.pCur;
for(auto i = 0; i < N; i++)
{
const LhsScalar *pLhs = lhsPackMap.pCur;
for(auto j = 0; j < M; j++)
{
acc.dt[j][i] += pRhs[i]*pLhs[j];
}
}
// Consume M*K lhs and K*N rhs packed elements.
lhsPackMap.advance(M*K);
rhsPackMap.advance(K*N);
};
};
// Depth loop for a fixed (lhsProgress, rhsProgress) pair: runs the micro
// kernel for shape IDX's depthProgress as long as full steps fit, then falls
// back to the shape referenced by SHAPES_DEP_POINTER for the remainder.
// The accumulator covers the whole lhsProgress x rhsProgress tile and is
// flushed (scale + store) before the fallback runs.
template<int Architecture, int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename ResPacket, typename DataMapper, int RHS_SHAPE_IDX, int LHS_SHAPE_IDX, int IDX>
struct DepthLoopStruct
{
static constexpr auto PREVIOUS = SHAPES<Architecture, CPU, LhsScalar, RhsScalar>[IDX][SHAPES_DEP_POINTER];
DepthLoopStruct<Architecture, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, ResPacket, DataMapper, RHS_SHAPE_IDX, LHS_SHAPE_IDX, PREVIOUS> depthLS;
EIGEN_STRONG_INLINE void operator()(Index rowIdx, Index colIdx, Index depthIdx, const DataMapper& res,
Index rows, Index depth, Index cols, ResScalar alpha, const ResPacket& pAlpha, LhsPackMap& lhsPackMap, RhsPackMap& rhsPackMap)
{
constexpr auto rhsProgress = SHAPES<Architecture, CPU, LhsScalar, RhsScalar>[RHS_SHAPE_IDX][SHAPES_RHS_DIMENSION];
constexpr auto lhsProgress = SHAPES<Architecture, CPU, LhsScalar, RhsScalar>[LHS_SHAPE_IDX][SHAPES_LHS_DIMENSION];
constexpr auto depthProgress = SHAPES<Architecture, CPU, LhsScalar, RhsScalar>[IDX][SHAPES_DEP_DIMENSION];
#ifdef __ENABLE_PREFETCH__
prefetch(lhsPackMap.pCur);
prefetch(rhsPackMap.pCur);
#endif
typedef Accumulator<Architecture, CPU, AccScalar, ResScalar, DataMapper, lhsProgress, rhsProgress> AccumulatorType;
MicroKernel<Architecture, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, AccumulatorType, lhsProgress, depthProgress, rhsProgress> mkt;
AccumulatorType acc;
acc.zero();
#ifdef __ENABLE_PREFETCH__
acc.prefetch(res, rowIdx, colIdx);
#endif
for(; depthIdx + depthProgress <= depth; depthIdx+=depthProgress)
{
mkt(lhsPackMap, rhsPackMap, rowIdx, colIdx, depthIdx, acc);
}
// Flush the tile, then let the smaller depth shape finish the tail.
acc.scale(alpha, pAlpha);
acc.store(res, rowIdx, colIdx);
depthLS(rowIdx, colIdx, depthIdx, res, rows, depth, cols, alpha, pAlpha, lhsPackMap, rhsPackMap);
}
};
// Recursion terminator for the depth loop: no smaller depth shape remains.
template<int Architecture, int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename ResPacket, typename DataMapper, int RHS_SHAPE_IDX, int LHS_SHAPE_IDX>
struct DepthLoopStruct<Architecture, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, ResPacket, DataMapper, RHS_SHAPE_IDX, LHS_SHAPE_IDX, -1>
{
EIGEN_STRONG_INLINE void operator()(Index, Index, Index, const DataMapper&,
Index, Index, Index, ResScalar, const ResPacket&, LhsPackMap&, RhsPackMap&) {}
};
// Row loop for a fixed rhs shape: consumes as many lhsProgress-wide row
// panels as fit with shape IDX, delegating each panel to the depth loop,
// then falls back to the shape referenced by SHAPES_LHS_POINTER for the
// remaining rows.
template<int Architecture, int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename ResPacket, typename DataMapper, int RHS_SHAPE_IDX, int IDX>
struct LhsLoopStruct
{
  static constexpr auto PREVIOUS = SHAPES<Architecture, CPU, LhsScalar, RhsScalar>[IDX][SHAPES_LHS_POINTER];
  LhsLoopStruct<Architecture, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, ResPacket, DataMapper, RHS_SHAPE_IDX, PREVIOUS> lhsLS;
  // Fixed: colIdx was declared as `int`, silently narrowing the 64-bit Index
  // every sibling loop uses; it is now Index like everywhere else.
  EIGEN_STRONG_INLINE void operator()(Index rowIdx, Index colIdx, const DataMapper& res,
  Index rows, Index depth, Index cols, ResScalar alpha, const ResPacket& pAlpha, LhsPackMap& lhsPackMap, RhsPackMap& rhsPackMap)
  {
    constexpr auto lhsProgress = SHAPES<Architecture, CPU, LhsScalar, RhsScalar>[IDX][SHAPES_LHS_DIMENSION];
    constexpr auto rhsProgress = SHAPES<Architecture, CPU, LhsScalar, RhsScalar>[IDX][SHAPES_RHS_DIMENSION];
    DepthLoopStruct<Architecture, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, ResPacket, DataMapper, RHS_SHAPE_IDX, IDX, IDX> depthLS;
    for(;rowIdx + lhsProgress <= rows; rowIdx+=lhsProgress)
    {
      // Re-seat both cursors at the start of this panel in the packed buffers.
      lhsPackMap.moveTo(rowIdx);
      rhsPackMap.moveTo(colIdx);
      //prefetch(lhsPackMap.pCur + 2*lhsProgress);
      //prefetch(rhsPackMap.pCur + 2*rhsProgress);
      depthLS(rowIdx, colIdx, 0, res, rows, depth, cols, alpha, pAlpha, lhsPackMap, rhsPackMap);
    }
    lhsLS(rowIdx, colIdx, res, rows, depth, cols, alpha, pAlpha, lhsPackMap, rhsPackMap);
  }
};
// Recursion terminator for the row loop: no smaller lhs shape remains.
template<int Architecture, int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename ResPacket, typename DataMapper, int RHS_SHAPE_IDX>
struct LhsLoopStruct<Architecture, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, ResPacket, DataMapper, RHS_SHAPE_IDX, -1>
{
EIGEN_STRONG_INLINE void operator()(Index, Index, const DataMapper&,
Index, Index, Index, ResScalar, const ResPacket&, LhsPackMap&, RhsPackMap&) {}
};
// Column loop: consumes as many rhsProgress-wide column panels as fit with
// shape IDX, running the full row loop for each, then falls back to the shape
// referenced by SHAPES_RHS_POINTER for the remaining columns.
template<int Architecture, int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename ResPacket, typename DataMapper, int IDX>
struct RhsLoopStruct
{
  static constexpr auto PREVIOUS = SHAPES<Architecture, CPU, LhsScalar, RhsScalar>[IDX][SHAPES_RHS_POINTER];
  RhsLoopStruct<Architecture, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, ResPacket, DataMapper, PREVIOUS> rhsLS;
  EIGEN_STRONG_INLINE void operator()(Index colIdx, const DataMapper& res,
  Index rows, Index depth, Index cols, ResScalar alpha, const ResPacket& pAlpha, LhsPackMap& lhsPackMap, RhsPackMap& rhsPackMap)
  {
    constexpr auto rhsProgress = SHAPES<Architecture, CPU, LhsScalar, RhsScalar>[IDX][SHAPES_RHS_DIMENSION];
    // The row-loop functor is stateless, so construct it once outside the loop.
    LhsLoopStruct<Architecture, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, ResPacket, DataMapper, IDX, IDX> lhsLS;
    while(colIdx + rhsProgress <= cols)
    {
      lhsLS(0, colIdx, res, rows, depth, cols, alpha, pAlpha, lhsPackMap, rhsPackMap);
      colIdx += rhsProgress;
    }
    rhsLS(colIdx, res, rows, depth, cols, alpha, pAlpha, lhsPackMap, rhsPackMap);
  }
};
// Recursion terminator for the column loop: no smaller rhs shape remains.
template<int Architecture, int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename ResPacket, typename DataMapper>
struct RhsLoopStruct<Architecture, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, ResPacket, DataMapper, -1>
{
EIGEN_STRONG_INLINE void operator()(Index colIdx, const DataMapper&,
Index, Index, Index, ResScalar, const ResPacket&, LhsPackMap&, RhsPackMap&) {}
};
// GEBP entry point: drives the rhs/lhs/depth loop nest over the packed
// operand buffers and accumulates alpha * A * B into res.
template<int Architecture, int CPU, typename ResScalar, typename AccScalar, typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper>
EIGEN_STRONG_INLINE void gemm(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
Index rows, Index depth, Index cols, ResScalar alpha, Index strideA, Index strideB, Index offsetA, Index offsetB)
{
  // Fixed: ResPacket must be the packet type *for* ResScalar.
  // unpacket_traits maps packet -> scalar, so the previous
  // unpacket_traits<ResScalar>::type degenerated to the scalar itself and
  // pAlpha was never a real packet.
  using ResPacket = typename packet_traits<ResScalar>::type;
  typedef PackMap<Architecture, CPU, Index, LhsScalar, DataMapper, true> LhsPackMap;
  typedef PackMap<Architecture, CPU, Index, RhsScalar, DataMapper, false> RhsPackMap;
#ifdef __DEBUG__
  std::cout << "blockA" << std::endl;
  for(auto i = 0; i < rows*depth; i++)
  {
    if(i % 4 == 0 && i > 0)
      std::cout << std::endl;
    std::cout << blockA[i] << " ";
  }
  std::cout << std::endl;
  std::cout << "blockB" << std::endl;
  for(auto i = 0; i < depth*cols; i++)
  {
    if(i % 4 == 0 && i > 0)
      std::cout << std::endl;
    std::cout << blockB[i] << " ";
  }
  std::cout << std::endl;
#endif
  asm __volatile__("#BEGIN_GEBP\n\t");  // fixed "BEGING" typo (asm comment only)
  // Fixed: use the Architecture/CPU template parameters instead of the
  // hard-coded <0, 0> so non-default instantiations pick up their own shapes.
  RhsLoopStruct<Architecture, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, ResPacket, DataMapper, SHAPES_COUNT<Architecture, CPU, LhsScalar, RhsScalar>-1> rhsLS;
  LhsPackMap lhsPackMap(blockA, depth, strideA, offsetA);
  RhsPackMap rhsPackMap(blockB, depth, strideB, offsetB);
  ResPacket pAlpha = pset1<ResPacket>(alpha);
  rhsLS(0, res, rows, depth, cols, alpha, pAlpha, lhsPackMap, rhsPackMap);
  asm __volatile__("#END_GEBP\n\t");
}
/*
template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
struct gemm_pack_rhs<float, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
{
void operator()(float* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
};
template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
void gemm_pack_rhs<float, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
::operator()(float* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
{
rhs_pack<0, 0, Index, float, DataMapper, Conjugate, PanelMode, ColMajor> pack;
pack(blockB, rhs, depth, cols, stride, offset);
}
template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
struct gemm_pack_rhs<float, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
{
void operator()(float* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
};
template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
void gemm_pack_rhs<float, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
::operator()(float* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
{
rhs_pack<0, 0, Index, float, DataMapper, Conjugate, PanelMode, RowMajor> pack;
pack(blockB, rhs, depth, cols, stride, offset);
}
template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
struct gemm_pack_lhs<float, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
{
void operator()(float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
};
template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
void gemm_pack_lhs<float, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
::operator()(float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
{
lhs_pack<0, 0, Index, float, DataMapper, Conjugate, PanelMode, RowMajor> pack;
pack(blockA, lhs, depth, rows, stride, offset);
}
template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
struct gemm_pack_lhs<float, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
{
void operator()(float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
};
template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
void gemm_pack_lhs<float, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
::operator()(float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
{
lhs_pack<0, 0, Index, float, DataMapper, Conjugate, PanelMode, ColMajor> pack;
pack(blockA, lhs, depth, rows, stride, offset);
}
*/
// Specialization of Eigen's GEBP kernel for float that routes the block-panel
// product through the shape-driven gemm() above (Architecture = 0, CPU = 0).
template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
struct gebp_kernel<float, float, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
{
void operator()(const DataMapper& res, const float* blockA, const float* blockB,
Index rows, Index depth, Index cols, float alpha,
Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
};
template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
void gebp_kernel<float, float, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
::operator()(const DataMapper& res, const float* blockA, const float* blockB,
Index rows, Index depth, Index cols, float alpha,
Index strideA, Index strideB, Index offsetA, Index offsetB)
{
gemm<0, 0, float, float, float, float, Index, DataMapper>(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
}
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_MATRIX_PRODUCT_NEON_H

View File

@@ -0,0 +1,192 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2021 Everton Constantino (everton.constantino@hotmail.com)
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_PACKING_OPS_NEON_H
#define EIGEN_PACKING_OPS_NEON_H
namespace Eigen {
namespace internal {
#ifdef __ENABLE_CUSTOM_PACKING__
// NEON overrides of the generic pack-shape tables (Architecture = 0): adds a
// 4x4 transposing tile for both sides and an 8x1 sliver for the lhs.
template<int CPU, typename Scalar, bool isLhs>
constexpr int PACK_SHAPES_COUNT<0, CPU, Scalar, isLhs> = 3;
template<int CPU, typename Scalar>
constexpr int PACK_SHAPES_COUNT<0, CPU, Scalar, true> = 4;
// Rows are {d1progress, d2progress, fallback-row-index}.
template<int CPU, typename Scalar, bool isLhs>
constexpr int PACK_SHAPES<0, CPU, Scalar, isLhs>[PACK_SHAPES_COUNT<0, CPU, Scalar, isLhs>][PACK_SHAPES_DIMENSION] = {{1,1,PACK_SHAPES_END},{4,1,0},{4,4,0}};
template<int CPU, typename Scalar>
constexpr int PACK_SHAPES<0, CPU, Scalar, true>[PACK_SHAPES_COUNT<0, CPU, Scalar, true>][PACK_SHAPES_DIMENSION] = {{1,1,PACK_SHAPES_END},{4,1,0},{4,4,0},{8,1,2}};
// NEON 4x4 tile packer: vector loads when the mapper walks the first index
// contiguously, otherwise a packet transpose. Assumes vectorSize == 4
// (Packet4f) — TODO confirm if other scalar types are ever routed here.
template<int CPU, typename Index, typename Scalar, bool isLhs, typename DataMapper, bool Conjugate, bool PanelMode, int StorageOrder>
struct PackingOperator<0, CPU, Index, Scalar, isLhs, DataMapper, Conjugate, PanelMode, StorageOrder, 4, 4>
{
  EIGEN_STRONG_INLINE Scalar* operator()(Index d1Idx, Index d2Idx, Scalar *block, const DataMapper& data)
  {
    using Packet = typename packet_traits<Scalar>::type;
    constexpr int vectorSize = packet_traits<Scalar>::size;
    Scalar *c = block;
    if(!isLhs)
    {
      // Fixed: the swap temporary was an `int`, truncating 64-bit Index
      // values; use the mapper's Index type.
      Index tD = d1Idx;
      d1Idx = d2Idx;
      d2Idx = tD;
    }
    // Parenthesized for clarity (&& binds tighter than ||; behavior unchanged).
    if((isLhs && StorageOrder == ColMajor) || (!isLhs && StorageOrder == RowMajor))
    {
      Packet p0 = data.template loadPacket<Packet>(d1Idx, d2Idx + 0);
      Packet p1 = data.template loadPacket<Packet>(d1Idx, d2Idx + 1);
      Packet p2 = data.template loadPacket<Packet>(d1Idx, d2Idx + 2);
      Packet p3 = data.template loadPacket<Packet>(d1Idx, d2Idx + 3);
      pstore<Scalar>(c + 0*vectorSize, p0);
      pstore<Scalar>(c + 1*vectorSize, p1);
      pstore<Scalar>(c + 2*vectorSize, p2);
      pstore<Scalar>(c + 3*vectorSize, p3);
      c+=4*vectorSize;
    } else {
      // Loads run along the wrong axis for the packed layout: transpose the
      // 4x4 block in registers before storing.
      PacketBlock<Packet, 4> pblock;
      pblock.packet[0] = data.template loadPacket<Packet>(d1Idx, d2Idx + 0);
      pblock.packet[1] = data.template loadPacket<Packet>(d1Idx, d2Idx + 1);
      pblock.packet[2] = data.template loadPacket<Packet>(d1Idx, d2Idx + 2);
      pblock.packet[3] = data.template loadPacket<Packet>(d1Idx, d2Idx + 3);
      ptranspose(pblock);
      pstore<Scalar>(c + 0*vectorSize, pblock.packet[0]);
      pstore<Scalar>(c + 1*vectorSize, pblock.packet[1]);
      pstore<Scalar>(c + 2*vectorSize, pblock.packet[2]);
      pstore<Scalar>(c + 3*vectorSize, pblock.packet[3]);
      c+=4*vectorSize;
    }
    return c;
  }
};
// NEON 8x1 sliver packer: copies 8 consecutive d1 elements at a single d2
// coordinate into the packed buffer (two vector loads when contiguous).
template<int CPU, typename Index, typename Scalar, bool isLhs, typename DataMapper, bool Conjugate, bool PanelMode, int StorageOrder>
struct PackingOperator<0, CPU, Index, Scalar, isLhs, DataMapper, Conjugate, PanelMode, StorageOrder, 8, 1>
{
  EIGEN_STRONG_INLINE Scalar* operator()(Index d1Idx, Index d2Idx, Scalar *block, const DataMapper& data)
  {
    using Packet = typename packet_traits<Scalar>::type;
    Scalar *c = block;
    if(isLhs && StorageOrder == ColMajor)
    {
      Packet p = data.template loadPacket<Packet>(d1Idx + 0, d2Idx);
      pstore<Scalar>(c, p);
      c+=4;
      p = data.template loadPacket<Packet>(d1Idx + 4, d2Idx);
      pstore<Scalar>(c, p);
      c+=4;
    } else if(!isLhs && StorageOrder == RowMajor) {
      Packet p = data.template loadPacket<Packet>(d2Idx, d1Idx + 0);
      pstore<Scalar>(c, p);
      c+=4;
      p = data.template loadPacket<Packet>(d2Idx, d1Idx + 4);
      pstore<Scalar>(c, p);
      c+=4;
    } else {
      // Scalar fallback. Fixed: this previously read the wrong elements —
      // for the lhs it fetched rows d1Idx+0..3 at columns d2Idx and d2Idx+4
      // (and symmetrically for the rhs) instead of the 8 consecutive d1
      // elements at d2Idx that the vectorized branches above pack.
      if(isLhs)
      {
        for(int k = 0; k < 8; ++k)
        {
          *c = data(d1Idx + k, d2Idx);
          c++;
        }
      } else {
        for(int k = 0; k < 8; ++k)
        {
          *c = data(d2Idx, d1Idx + k);
          c++;
        }
      }
    }
    return c;
  }
};
// NEON 4x1 sliver packer: one vector load when the mapper walks the first
// index contiguously, otherwise four scalar reads.
template<int CPU, typename Index, typename Scalar, bool isLhs, typename DataMapper, bool Conjugate, bool PanelMode, int StorageOrder>
struct PackingOperator<0, CPU, Index, Scalar, isLhs, DataMapper, Conjugate, PanelMode, StorageOrder, 4, 1>
{
  EIGEN_STRONG_INLINE Scalar* operator()(Index d1Idx, Index d2Idx, Scalar *block, const DataMapper& data)
  {
    using Packet = typename packet_traits<Scalar>::type;
    Scalar *dst = block;
    if(isLhs && StorageOrder == ColMajor)
    {
      pstore<Scalar>(dst, data.template loadPacket<Packet>(d1Idx, d2Idx));
      dst += 4;
    }
    else if(!isLhs && StorageOrder == RowMajor)
    {
      pstore<Scalar>(dst, data.template loadPacket<Packet>(d2Idx, d1Idx));
      dst += 4;
    }
    else
    {
      // Non-contiguous layout: gather the four elements one by one.
      for(int k = 0; k < 4; ++k)
      {
        *dst = isLhs ? data(d1Idx + k, d2Idx) : data(d2Idx, d1Idx + k);
        ++dst;
      }
    }
    return dst;
  }
};
#endif // __ENABLE_CUSTOM_PACKING__
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_PACKING_OPS_NEON_H

View File

@@ -113,13 +113,19 @@ template<> EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<fl
template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from)
{
Packet2cf res;
#ifdef EIGEN_VECTORIZE_SSE3
res.v = _mm_castpd_ps(_mm_loaddup_pd(reinterpret_cast<double const*>(&from)));
#if EIGEN_GNUC_AT_MOST(4,2)
// Workaround annoying "may be used uninitialized in this function" warning with gcc 4.2
res.v = _mm_loadl_pi(_mm_set1_ps(0.0f), reinterpret_cast<const __m64*>(&from));
#elif EIGEN_GNUC_AT_LEAST(4,6)
// Suppress annoying "may be used uninitialized in this function" warning with gcc >= 4.6
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wuninitialized"
res.v = _mm_loadl_pi(res.v, (const __m64*)&from);
#pragma GCC diagnostic pop
#else
res.v = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<double const*>(&from)));
res.v = _mm_movelh_ps(res.v, res.v);
res.v = _mm_loadl_pi(res.v, (const __m64*)&from);
#endif
return res;
return Packet2cf(_mm_movelh_ps(res.v,res.v));
}
template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) { return pset1<Packet2cf>(*from); }

View File

@@ -16,8 +16,8 @@
//------------------------------------------------------------------------------------------
#define EIGEN_WORLD_VERSION 3
#define EIGEN_MAJOR_VERSION 3
#define EIGEN_MINOR_VERSION 91
#define EIGEN_MAJOR_VERSION 4
#define EIGEN_MINOR_VERSION 99
#define EIGEN_VERSION_AT_LEAST(x,y,z) (EIGEN_WORLD_VERSION>x || (EIGEN_WORLD_VERSION>=x && \
(EIGEN_MAJOR_VERSION>y || (EIGEN_MAJOR_VERSION>=y && \
@@ -684,7 +684,8 @@
// Does the compiler support result_of?
// result_of was deprecated in c++17 and removed in c++ 20
#ifndef EIGEN_HAS_STD_RESULT_OF
#if EIGEN_HAS_CXX11 && EIGEN_COMP_CXXVER < 17
#if EIGEN_MAX_CPP_VER >= 11 && \
(defined(__cplusplus) && __cplusplus >= 201103L && __cplusplus < 201703L)
#define EIGEN_HAS_STD_RESULT_OF 1
#else
#define EIGEN_HAS_STD_RESULT_OF 0
@@ -703,7 +704,8 @@
#endif // EIGEN_HAS_STD_HASH
#ifndef EIGEN_HAS_STD_INVOKE_RESULT
#if EIGEN_MAX_CPP_VER >= 17 && EIGEN_COMP_CXXVER >= 17
#if EIGEN_MAX_CPP_VER >= 17 && \
(defined(__cplusplus) && __cplusplus >= 201703L)
#define EIGEN_HAS_STD_INVOKE_RESULT 1
#else
#define EIGEN_HAS_STD_INVOKE_RESULT 0

View File

@@ -136,14 +136,15 @@ template<typename T, int Value> class variable_if_dynamic
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
operator T() const { return T(Value); }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void setValue(T v) const { EIGEN_ONLY_USED_FOR_DEBUG(v); eigen_assert(v == T(Value)); }
void setValue(T) const {}
};
template<typename T> class variable_if_dynamic<T, Dynamic>
{
T m_value;
EIGEN_DEVICE_FUNC variable_if_dynamic() { eigen_assert(false); }
public:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit variable_if_dynamic(T value = 0) EIGEN_NO_THROW : m_value(value) {}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit variable_if_dynamic(T value) : m_value(value) {}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T value() const { return m_value; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE operator T() const { return m_value; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void setValue(T value) { m_value = value; }

View File

@@ -498,6 +498,8 @@ template<typename MatrixType, typename DiagType, typename SubDiagType>
EIGEN_DEVICE_FUNC
ComputationInfo computeFromTridiagonal_impl(DiagType& diag, SubDiagType& subdiag, const Index maxIterations, bool computeEigenvectors, MatrixType& eivec)
{
EIGEN_USING_STD(abs);
ComputationInfo info;
typedef typename MatrixType::Scalar Scalar;
@@ -508,23 +510,15 @@ ComputationInfo computeFromTridiagonal_impl(DiagType& diag, SubDiagType& subdiag
typedef typename DiagType::RealScalar RealScalar;
const RealScalar considerAsZero = (std::numeric_limits<RealScalar>::min)();
const RealScalar precision_inv = RealScalar(1)/NumTraits<RealScalar>::epsilon();
const RealScalar precision = RealScalar(2)*NumTraits<RealScalar>::epsilon();
while (end>0)
{
for (Index i = start; i<end; ++i) {
if (numext::abs(subdiag[i]) < considerAsZero) {
subdiag[i] = RealScalar(0);
} else {
// abs(subdiag[i]) <= epsilon * sqrt(abs(diag[i]) + abs(diag[i+1]))
// Scaled to prevent underflows.
const RealScalar scaled_subdiag = precision_inv * subdiag[i];
if (scaled_subdiag * scaled_subdiag <= (numext::abs(diag[i])+numext::abs(diag[i+1]))) {
subdiag[i] = RealScalar(0);
}
}
}
for (Index i = start; i<end; ++i)
if (internal::isMuchSmallerThan(abs(subdiag[i]),(abs(diag[i])+abs(diag[i+1])),precision) || abs(subdiag[i]) <= considerAsZero)
subdiag[i] = 0;
// find the largest unreduced block at the end of the matrix.
// find the largest unreduced block
while (end>0 && subdiag[end-1]==RealScalar(0))
{
end--;
@@ -827,38 +821,32 @@ SelfAdjointEigenSolver<MatrixType>& SelfAdjointEigenSolver<MatrixType>
}
namespace internal {
// Francis implicit QR step.
template<int StorageOrder,typename RealScalar, typename Scalar, typename Index>
EIGEN_DEVICE_FUNC
static void tridiagonal_qr_step(RealScalar* diag, RealScalar* subdiag, Index start, Index end, Scalar* matrixQ, Index n)
{
// Wilkinson Shift.
EIGEN_USING_STD(abs);
RealScalar td = (diag[end-1] - diag[end])*RealScalar(0.5);
RealScalar e = subdiag[end-1];
// Note that thanks to scaling, e^2 or td^2 cannot overflow, however they can still
// underflow thus leading to inf/NaN values when using the following commented code:
// RealScalar e2 = numext::abs2(subdiag[end-1]);
// RealScalar mu = diag[end] - e2 / (td + (td>0 ? 1 : -1) * sqrt(td*td + e2));
// RealScalar e2 = numext::abs2(subdiag[end-1]);
// RealScalar mu = diag[end] - e2 / (td + (td>0 ? 1 : -1) * sqrt(td*td + e2));
// This explain the following, somewhat more complicated, version:
RealScalar mu = diag[end];
if(td==RealScalar(0)) {
mu -= numext::abs(e);
} else if (e != RealScalar(0)) {
const RealScalar e2 = numext::abs2(e);
const RealScalar h = numext::hypot(td,e);
if(e2 == RealScalar(0)) {
mu -= e / ((td + (td>RealScalar(0) ? h : -h)) / e);
} else {
mu -= e2 / (td + (td>RealScalar(0) ? h : -h));
}
if(td==RealScalar(0))
mu -= abs(e);
else
{
RealScalar e2 = numext::abs2(subdiag[end-1]);
RealScalar h = numext::hypot(td,e);
if(e2==RealScalar(0)) mu -= (e / (td + (td>RealScalar(0) ? RealScalar(1) : RealScalar(-1)))) * (e / h);
else mu -= e2 / (td + (td>RealScalar(0) ? h : -h));
}
RealScalar x = diag[start] - mu;
RealScalar z = subdiag[start];
// If z ever becomes zero, the Givens rotation will be the identity and
// z will stay zero for all future iterations.
for (Index k = start; k < end && z != RealScalar(0); ++k)
for (Index k = start; k < end; ++k)
{
JacobiRotation<RealScalar> rot;
rot.makeGivens(x, z);
@@ -871,11 +859,12 @@ static void tridiagonal_qr_step(RealScalar* diag, RealScalar* subdiag, Index sta
diag[k+1] = rot.s() * sdk + rot.c() * dkp1;
subdiag[k] = rot.c() * sdk - rot.s() * dkp1;
if (k > start)
subdiag[k - 1] = rot.c() * subdiag[k-1] - rot.s() * z;
// "Chasing the bulge" to return to triangular form.
x = subdiag[k];
if (k < end - 1)
{
z = -rot.s() * subdiag[k+1];

View File

@@ -141,8 +141,8 @@ struct compute_inverse_size4<Architecture::Target, float, MatrixType, ResultType
iC = psub(iC, pmul(vec4f_swizzle2(A, A, 1, 0, 3, 2), vec4f_swizzle2(DC, DC, 2, 1, 2, 1)));
iC = psub(pmul(B, vec4f_duplane(dC, 0)), iC);
const float sign_mask[4] = {0.0f, -0.0f, -0.0f, 0.0f};
const Packet4f p4f_sign_PNNP = pset<Packet4f>(sign_mask);
const int bits[4] = {0, -2147483648, -2147483648, 0};
const Packet4f p4f_sign_PNNP = preinterpret<Packet4f, Packet4i>(pgather<int, Packet4i>(bits, static_cast<Eigen::Index>(1)));
rd = pxor(rd, p4f_sign_PNNP);
iA = pmul(iA, rd);
iB = pmul(iB, rd);
@@ -323,12 +323,12 @@ struct compute_inverse_size4<Architecture::Target, double, MatrixType, ResultTyp
iC1 = psub(pmul(B1, dC), iC1);
iC2 = psub(pmul(B2, dC), iC2);
const double sign_mask1[2] = {0.0, -0.0};
const double sign_mask2[2] = {-0.0, 0.0};
const Packet2d sign_PN = pset<Packet2d>(sign_mask1);
const Packet2d sign_NP = pset<Packet2d>(sign_mask2);
d1 = pxor(rd, sign_PN);
d2 = pxor(rd, sign_NP);
const int bits1[4] = {0, -2147483648, 0, 0};
const int bits2[4] = {0, 0, 0, -2147483648};
const Packet2d _Sign_NP = preinterpret<Packet2d, Packet4i>(pgather<int, Packet4i>(bits1, static_cast<Eigen::Index>(1)));
const Packet2d _Sign_PN = preinterpret<Packet2d, Packet4i>(pgather<int, Packet4i>(bits2, static_cast<Eigen::Index>(1)));
d1 = pxor(rd, _Sign_PN);
d2 = pxor(rd, _Sign_NP);
Index res_stride = result.outerStride();
double *res = result.data();

View File

@@ -208,7 +208,6 @@ protected:
using Base::m_computeThinV;
using Base::m_matrixU;
using Base::m_matrixV;
using Base::m_info;
using Base::m_isInitialized;
using Base::m_nonzeroSingularValues;
@@ -257,25 +256,16 @@ BDCSVD<MatrixType>& BDCSVD<MatrixType>::compute(const MatrixType& matrix, unsign
{
// FIXME this line involves temporaries
JacobiSVD<MatrixType> jsvd(matrix,computationOptions);
if(computeU()) m_matrixU = jsvd.matrixU();
if(computeV()) m_matrixV = jsvd.matrixV();
m_singularValues = jsvd.singularValues();
m_nonzeroSingularValues = jsvd.nonzeroSingularValues();
m_isInitialized = true;
m_info = jsvd.info();
if (m_info == Success || m_info == NoConvergence) {
if(computeU()) m_matrixU = jsvd.matrixU();
if(computeV()) m_matrixV = jsvd.matrixV();
m_singularValues = jsvd.singularValues();
m_nonzeroSingularValues = jsvd.nonzeroSingularValues();
}
return *this;
}
//**** step 0 - Copy the input matrix and apply scaling to reduce over/under-flows
RealScalar scale = matrix.cwiseAbs().template maxCoeff<PropagateNaN>();
if (!(numext::isfinite)(scale)) {
m_isInitialized = true;
m_info = InvalidInput;
return *this;
}
RealScalar scale = matrix.cwiseAbs().maxCoeff();
if(scale==Literal(0)) scale = Literal(1);
MatrixX copy;
if (m_isTranspose) copy = matrix.adjoint()/scale;
@@ -292,11 +282,7 @@ BDCSVD<MatrixType>& BDCSVD<MatrixType>::compute(const MatrixType& matrix, unsign
m_computed.topRows(m_diagSize) = bid.bidiagonal().toDenseMatrix().transpose();
m_computed.template bottomRows<1>().setZero();
divide(0, m_diagSize - 1, 0, 0, 0);
if (m_info != Success && m_info != NoConvergence) {
m_isInitialized = true;
return *this;
}
//**** step 3 - Copy singular values and vectors
for (int i=0; i<m_diagSize; i++)
{
@@ -408,7 +394,7 @@ void BDCSVD<MatrixType>::structured_update(Block<MatrixXr,Dynamic,Dynamic> A, co
//@param shift : Each time one takes the left submatrix, one must add 1 to the shift. Why? Because! We actually want the last column of the U submatrix
// to become the first column (*coeff) and to shift all the other columns to the right. There are more details on the reference paper.
template<typename MatrixType>
void BDCSVD<MatrixType>::divide(Eigen::Index firstCol, Eigen::Index lastCol, Eigen::Index firstRowW, Eigen::Index firstColW, Eigen::Index shift)
void BDCSVD<MatrixType>::divide (Eigen::Index firstCol, Eigen::Index lastCol, Eigen::Index firstRowW, Eigen::Index firstColW, Eigen::Index shift)
{
// requires rows = cols + 1;
using std::pow;
@@ -428,8 +414,6 @@ void BDCSVD<MatrixType>::divide(Eigen::Index firstCol, Eigen::Index lastCol, Eig
{
// FIXME this line involves temporaries
JacobiSVD<MatrixXr> b(m_computed.block(firstCol, firstCol, n + 1, n), ComputeFullU | (m_compV ? ComputeFullV : 0));
m_info = b.info();
if (m_info != Success && m_info != NoConvergence) return;
if (m_compU)
m_naiveU.block(firstCol, firstCol, n + 1, n + 1).real() = b.matrixU();
else
@@ -449,9 +433,7 @@ void BDCSVD<MatrixType>::divide(Eigen::Index firstCol, Eigen::Index lastCol, Eig
// and the divide of the right submatrice reads one column of the left submatrice. That's why we need to treat the
// right submatrix before the left one.
divide(k + 1 + firstCol, lastCol, k + 1 + firstRowW, k + 1 + firstColW, shift);
if (m_info != Success && m_info != NoConvergence) return;
divide(firstCol, k - 1 + firstCol, firstRowW, firstColW + 1, shift + 1);
if (m_info != Success && m_info != NoConvergence) return;
if (m_compU)
{

View File

@@ -585,7 +585,6 @@ template<typename _MatrixType, int QRPreconditioner> class JacobiSVD
using Base::m_matrixU;
using Base::m_matrixV;
using Base::m_singularValues;
using Base::m_info;
using Base::m_isInitialized;
using Base::m_isAllocated;
using Base::m_usePrescribedThreshold;
@@ -626,7 +625,6 @@ void JacobiSVD<MatrixType, QRPreconditioner>::allocate(Eigen::Index rows, Eigen:
m_rows = rows;
m_cols = cols;
m_info = Success;
m_isInitialized = false;
m_isAllocated = true;
m_computationOptions = computationOptions;
@@ -676,12 +674,7 @@ JacobiSVD<MatrixType, QRPreconditioner>::compute(const MatrixType& matrix, unsig
const RealScalar considerAsZero = (std::numeric_limits<RealScalar>::min)();
// Scaling factor to reduce over/under-flows
RealScalar scale = matrix.cwiseAbs().template maxCoeff<PropagateNaN>();
if (!(numext::isfinite)(scale)) {
m_isInitialized = true;
m_info = InvalidInput;
return *this;
}
RealScalar scale = matrix.cwiseAbs().maxCoeff();
if(scale==RealScalar(0)) scale = RealScalar(1);
/*** step 1. The R-SVD step: we use a QR decomposition to reduce to the case of a square matrix */

View File

@@ -51,11 +51,8 @@ template<typename Derived> struct traits<SVDBase<Derived> >
* smaller value among \a n and \a p, there are only \a m singular vectors; the remaining columns of \a U and \a V do not correspond to actual
* singular vectors. Asking for \em thin \a U or \a V means asking for only their \a m first columns to be formed. So \a U is then a n-by-m matrix,
* and \a V is then a p-by-m matrix. Notice that thin \a U and \a V are all you need for (least squares) solving.
*
* The status of the computation can be retrieved using the \a info() method. Unless \a info() returns \a Success, the results should be not
* considered well defined.
*
* If the input matrix has inf or nan coefficients, the result of the computation is undefined, and \a info() will return \a InvalidInput, but the computation is guaranteed to
* If the input matrix has inf or nan coefficients, the result of the computation is undefined, but the computation is guaranteed to
* terminate in finite (and reasonable) time.
* \sa class BDCSVD, class JacobiSVD
*/
@@ -100,7 +97,7 @@ public:
*/
const MatrixUType& matrixU() const
{
_check_compute_assertions();
eigen_assert(m_isInitialized && "SVD is not initialized.");
eigen_assert(computeU() && "This SVD decomposition didn't compute U. Did you ask for it?");
return m_matrixU;
}
@@ -116,7 +113,7 @@ public:
*/
const MatrixVType& matrixV() const
{
_check_compute_assertions();
eigen_assert(m_isInitialized && "SVD is not initialized.");
eigen_assert(computeV() && "This SVD decomposition didn't compute V. Did you ask for it?");
return m_matrixV;
}
@@ -128,14 +125,14 @@ public:
*/
const SingularValuesType& singularValues() const
{
_check_compute_assertions();
eigen_assert(m_isInitialized && "SVD is not initialized.");
return m_singularValues;
}
/** \returns the number of singular values that are not exactly 0 */
Index nonzeroSingularValues() const
{
_check_compute_assertions();
eigen_assert(m_isInitialized && "SVD is not initialized.");
return m_nonzeroSingularValues;
}
@@ -148,7 +145,7 @@ public:
inline Index rank() const
{
using std::abs;
_check_compute_assertions();
eigen_assert(m_isInitialized && "JacobiSVD is not initialized.");
if(m_singularValues.size()==0) return 0;
RealScalar premultiplied_threshold = numext::maxi<RealScalar>(m_singularValues.coeff(0) * threshold(), (std::numeric_limits<RealScalar>::min)());
Index i = m_nonzeroSingularValues-1;
@@ -227,18 +224,6 @@ public:
solve(const MatrixBase<Rhs>& b) const;
#endif
/** \brief Reports whether previous computation was successful.
*
* \returns \c Success if computation was successful.
*/
EIGEN_DEVICE_FUNC
ComputationInfo info() const
{
eigen_assert(m_isInitialized && "SVD is not initialized.");
return m_info;
}
#ifndef EIGEN_PARSED_BY_DOXYGEN
template<typename RhsType, typename DstType>
void _solve_impl(const RhsType &rhs, DstType &dst) const;
@@ -248,31 +233,26 @@ public:
#endif
protected:
static void check_template_parameters()
{
EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
}
void _check_compute_assertions() const {
eigen_assert(m_isInitialized && "SVD is not initialized.");
}
template<bool Transpose_, typename Rhs>
void _check_solve_assertion(const Rhs& b) const {
EIGEN_ONLY_USED_FOR_DEBUG(b);
_check_compute_assertions();
eigen_assert(m_isInitialized && "SVD is not initialized.");
eigen_assert(computeU() && computeV() && "SVDBase::solve(): Both unitaries U and V are required to be computed (thin unitaries suffice).");
eigen_assert((Transpose_?cols():rows())==b.rows() && "SVDBase::solve(): invalid number of rows of the right hand side matrix b");
}
// return true if already allocated
bool allocate(Index rows, Index cols, unsigned int computationOptions) ;
MatrixUType m_matrixU;
MatrixVType m_matrixV;
SingularValuesType m_singularValues;
ComputationInfo m_info;
bool m_isInitialized, m_isAllocated, m_usePrescribedThreshold;
bool m_computeFullU, m_computeThinU;
bool m_computeFullV, m_computeThinV;
@@ -285,8 +265,7 @@ protected:
* Default constructor of SVDBase
*/
SVDBase()
: m_info(Success),
m_isInitialized(false),
: m_isInitialized(false),
m_isAllocated(false),
m_usePrescribedThreshold(false),
m_computeFullU(false),
@@ -348,7 +327,6 @@ bool SVDBase<MatrixType>::allocate(Index rows, Index cols, unsigned int computat
m_rows = rows;
m_cols = cols;
m_info = Success;
m_isInitialized = false;
m_isAllocated = true;
m_computationOptions = computationOptions;

View File

@@ -1,7 +1,6 @@
#define NOGMM
#define NOMTL
#define EIGEN_GOOGLEHASH_SUPPORT 1
#include <map>
#include <ext/hash_map>

View File

@@ -1,107 +0,0 @@
.buildsmoketests:linux:base:
stage: buildsmoketests
image: ubuntu:18.04
before_script:
- apt-get update -y
- apt-get install -y --no-install-recommends software-properties-common
- add-apt-repository -y ppa:ubuntu-toolchain-r/test
- apt-get update
- apt-get install --no-install-recommends -y ${EIGEN_CI_CXX_COMPILER}
${EIGEN_CI_CC_COMPILER} cmake ninja-build
script:
- mkdir -p ${BUILDDIR} && cd ${BUILDDIR}
- CXX=${EIGEN_CI_CXX_COMPILER} CC=${EIGEN_CI_CC_COMPILER} cmake -G
${EIGEN_CI_CMAKE_GENEATOR} -DEIGEN_TEST_CXX11=${EIGEN_TEST_CXX11}
${EIGEN_CI_ADDITIONAL_ARGS} ..
- cmake --build . --target buildsmoketests
artifacts:
name: "$CI_JOB_NAME-$CI_COMMIT_REF_NAME"
paths:
- ${BUILDDIR}/
expire_in: 5 days
only:
- merge_requests
buildsmoketests:x86-64:linux:gcc-10:cxx11-off:
extends: .buildsmoketests:linux:base
variables:
EIGEN_CI_CXX_COMPILER: "g++-10"
EIGEN_CI_CC_COMPILER: "gcc-10"
EIGEN_TEST_CXX11: "off"
buildsmoketests:x86-64:linux:gcc-10:cxx11-on:
extends: .buildsmoketests:linux:base
variables:
EIGEN_CI_CXX_COMPILER: "g++-10"
EIGEN_CI_CC_COMPILER: "gcc-10"
EIGEN_TEST_CXX11: "on"
buildsmoketests:x86-64:linux:clang-10:cxx11-off:
extends: .buildsmoketests:linux:base
variables:
EIGEN_CI_CXX_COMPILER: "clang++-10"
EIGEN_CI_CC_COMPILER: "clang-10"
EIGEN_TEST_CXX11: "off"
buildsmoketests:x86-64:linux:clang-10:cxx11-on:
extends: .buildsmoketests:linux:base
variables:
EIGEN_CI_CXX_COMPILER: "clang++-10"
EIGEN_CI_CC_COMPILER: "clang-10"
EIGEN_TEST_CXX11: "on"
.smoketests:linux:base:
stage: smoketests
image: ubuntu:18.04
before_script:
- apt-get update -y
- apt-get install -y --no-install-recommends software-properties-common
- add-apt-repository -y ppa:ubuntu-toolchain-r/test
- apt-get update
- apt-get install --no-install-recommends -y ${EIGEN_CI_CXX_COMPILER}
${EIGEN_CI_CC_COMPILER} cmake ninja-build xsltproc
script:
- export CXX=${EIGEN_CI_CXX_COMPILER}
- export CC=${EIGEN_CI_CC_COMPILER}
- cd ${BUILDDIR} && ctest --output-on-failure --no-compress-output
--build-no-clean -T test -L smoketest
after_script:
- apt-get update -y
- apt-get install --no-install-recommends -y xsltproc
- cd ${BUILDDIR}
- xsltproc ../ci/CTest2JUnit.xsl Testing/`head -n 1 < Testing/TAG`/Test.xml > "JUnitTestResults_$CI_JOB_ID.xml"
artifacts:
reports:
junit:
- ${BUILDDIR}/JUnitTestResults_$CI_JOB_ID.xml
expire_in: 5 days
only:
- merge_requests
smoketests:x86-64:linux:gcc-10:cxx11-off:
extends: .smoketests:linux:base
variables:
EIGEN_CI_CXX_COMPILER: g++-10
EIGEN_CI_CC_COMPILER: gcc-10
needs: [ "buildsmoketests:x86-64:linux:gcc-10:cxx11-off" ]
smoketests:x86-64:linux:gcc-10:cxx11-on:
extends: .smoketests:linux:base
variables:
EIGEN_CI_CXX_COMPILER: g++-10
EIGEN_CI_CC_COMPILER: gcc-10
needs: [ "buildsmoketests:x86-64:linux:gcc-10:cxx11-on" ]
smoketests:x86-64:linux:clang-10:cxx11-off:
extends: .smoketests:linux:base
variables:
EIGEN_CI_CXX_COMPILER: clang++-10
EIGEN_CI_CC_COMPILER: clang-10
needs: [ "buildsmoketests:x86-64:linux:clang-10:cxx11-off" ]
smoketests:x86-64:linux:clang-10:cxx11-on:
extends: .smoketests:linux:base
variables:
EIGEN_CI_CXX_COMPILER: clang++-10
EIGEN_CI_CC_COMPILER: clang-10
needs: [ "buildsmoketests:x86-64:linux:clang-10:cxx11-on" ]

View File

@@ -1,131 +0,0 @@
# List of tests that will be built and run during Eigen's smoke testing. If one
# of these tests doesn't exist or cannot be built with the current configuration
# it will just be skipped.
set(ei_smoke_test_list
adjoint_1
alignedvector3
array_cwise_7
array_cwise_8
array_for_matrix_1
array_of_string
array_replicate_1
array_reverse_1
autodiff_1
autodiff_scalar_1
bandmatrix
bdcsvd_9
bessel_functions_1
bfloat16_float
blasutil_1
block_5
BVH
cholesky_1
cholmod_support_23
cholmod_support_24
conservative_resize_1
constructor_1
corners_1
ctorleakmiscmatrices_4
dense_storage
determinant_1
diagonal_1
diagonal_2
diagonalmatrices_1
dynalloc
eigensolver_complex_1
eigensolver_selfadjoint_8
EulerAngles_1
exceptions
fastmath
first_aligned
geo_alignedbox_2
geo_eulerangles_1
geo_homogeneous_1
geo_hyperplane_1
geo_orthomethods_1
geo_parametrizedline_1
geo_transformations_7
half_float
hessenberg_1
hessenberg_6qr_10
householder_8
indexed_view_1
inplace_decomposition_1
integer_types_1
inverse_1
is_same_dense
jacobi_1
jacobisvd_1
kronecker_product
linearstructure_1
mapped_matrix_1
mapstaticmethods_1
mapstride_1
matrix_square_root_1
meta
minres_2
miscmatrices_1
mixingtypes_7
nestbyvalue
nesting_ops_1
nomalloc_1
nullary_1
num_dimensions
NumericalDiff
numext
packetmath
permutationmatrices_1
polynomialsolver_1
prec_inverse_4x4_1
product_extra_5
product_selfadjoint_1
product_small_7
product_symm_1
product_syrk_1
product_trmm_1
product_trmv_1
product_trsolve_5
qr_1
qr_colpivoting_7
qr_fullpivoting_4
rand
real_qz_1
redux_1
ref_1
resize
rvalue_types_1
schur_complex_1
schur_real_1
selfadjoint_1
sizeof
sizeoverflow
smallvectors
sparse_basic_3
sparse_block_1
sparse_extra_4
sparse_permutations_2
sparse_product_4
sparse_ref_1
sparse_solvers_1
sparse_vector_1
special_functions_1
special_numbers_1
special_packetmath_1
spqr_support_2
stable_norm_1
stddeque_1
stddeque_overload_1
stdlist_1
stdlist_overload_1
stdvector_1
stdvector_overload_1
stl_iterators_1
swap_1
symbolic_index_1
triangular_1
type_aliaslu_9
umeyama_3
unalignedassert
unalignedcount
vectorwiseop_1
visitor_1)

View File

@@ -18,11 +18,6 @@ macro(ei_add_test_internal testname testname_with_suffix)
set(filename ${testname}.cpp)
endif()
# Add the current target to the list of subtest targets
get_property(EIGEN_SUBTESTS_LIST GLOBAL PROPERTY EIGEN_SUBTESTS_LIST)
set(EIGEN_SUBTESTS_LIST "${EIGEN_SUBTESTS_LIST}${targetname}\n")
set_property(GLOBAL PROPERTY EIGEN_SUBTESTS_LIST "${EIGEN_SUBTESTS_LIST}")
if(EIGEN_ADD_TEST_FILENAME_EXTENSION STREQUAL cu)
if(EIGEN_TEST_HIP)
hip_reset_flags()
@@ -418,13 +413,11 @@ macro(ei_init_testing)
define_property(GLOBAL PROPERTY EIGEN_MISSING_BACKENDS BRIEF_DOCS " " FULL_DOCS " ")
define_property(GLOBAL PROPERTY EIGEN_TESTING_SUMMARY BRIEF_DOCS " " FULL_DOCS " ")
define_property(GLOBAL PROPERTY EIGEN_TESTS_LIST BRIEF_DOCS " " FULL_DOCS " ")
define_property(GLOBAL PROPERTY EIGEN_SUBTESTS_LIST BRIEF_DOCS " " FULL_DOCS " ")
set_property(GLOBAL PROPERTY EIGEN_TESTED_BACKENDS "")
set_property(GLOBAL PROPERTY EIGEN_MISSING_BACKENDS "")
set_property(GLOBAL PROPERTY EIGEN_TESTING_SUMMARY "")
set_property(GLOBAL PROPERTY EIGEN_TESTS_LIST "")
set_property(GLOBAL PROPERTY EIGEN_SUBTESTS_LIST "")
define_property(GLOBAL PROPERTY EIGEN_FAILTEST_FAILURE_COUNT BRIEF_DOCS " " FULL_DOCS " ")
define_property(GLOBAL PROPERTY EIGEN_FAILTEST_COUNT BRIEF_DOCS " " FULL_DOCS " ")
@@ -715,56 +708,3 @@ macro(ei_split_testsuite num_splits)
add_dependencies("${current_target}" "${curr_test}")
endforeach()
endmacro(ei_split_testsuite num_splits)
# Defines the custom command buildsmoketests to build a number of tests
# specified in smoke_test_list.
#
# Test in smoke_test_list can be either test targets (e.g. packetmath) or
# subtests targets (e.g. packetmath_2). If any of the test are not available
# in the current configuration they are just skipped.
#
# All tests added via this macro are labeled with the smoketest label. This
# allows running smoketests only using ctest.
#
# Smoke tests are intended to be run before the whole test suite is invoked,
# e.g., to smoke test patches.
macro(ei_add_smoke_tests smoke_test_list)
# Set the build target to build smoketests
set(buildtarget "buildsmoketests")
add_custom_target("${buildtarget}")
# Get list of all tests and translate it into a CMake list
get_property(EIGEN_TESTS_LIST GLOBAL PROPERTY EIGEN_TESTS_LIST)
string(REGEX REPLACE "\n" " " EIGEN_TESTS_LIST "${EIGEN_TESTS_LIST}")
set(EIGEN_TESTS_LIST "${EIGEN_TESTS_LIST}")
separate_arguments(EIGEN_TESTS_LIST)
# Check if the test in smoke_test_list is a currently valid test target
foreach(test IN ITEMS ${smoke_test_list})
# Add tests in smoke_test_list to our smoke test target but only if the test
# is currently available, i.e., is in EIGEN_SUBTESTS_LIST
if ("${test}" IN_LIST EIGEN_TESTS_LIST)
add_dependencies("${buildtarget}" "${test}")
# In the case of a test we match all subtests
set(ctest_regex "${ctest_regex}^${test}_[0-9]+$$|")
endif()
endforeach()
# Get list of all subtests and translate it into a CMake list
get_property(EIGEN_SUBTESTS_LIST GLOBAL PROPERTY EIGEN_SUBTESTS_LIST)
string(REGEX REPLACE "\n" " " EIGEN_SUBTESTS_LIST "${EIGEN_SUBTESTS_LIST}")
set(EIGEN_SUBTESTS_LIST "${EIGEN_SUBTESTS_LIST}")
separate_arguments(EIGEN_SUBTESTS_LIST)
# Check if the test in smoke_test_list is a currently valid subtest target
foreach(test IN ITEMS ${smoke_test_list})
# Add tests in smoke_test_list to our smoke test target but only if the test
# is currently available, i.e., is in EIGEN_SUBTESTS_LIST
if ("${test}" IN_LIST EIGEN_SUBTESTS_LIST)
add_dependencies("${buildtarget}" "${test}")
# Add label smoketest to be able to run smoketests using ctest
get_property(test_labels TEST ${test} PROPERTY LABELS)
set_property(TEST ${test} PROPERTY LABELS "${test_labels};smoketest")
endif()
endforeach()
endmacro(ei_add_smoke_tests)

6
compile.sh Executable file
View File

@@ -0,0 +1,6 @@
#!/bin/bash
# Builds the GEMM benchmark in two flavours:
#   gtp - vector kernels + software prefetch
#   gt  - vector kernels only
# Abort on the first failed compile so a stale binary is never benchmarked.
set -e
#echo 'Compiling with master'
#g++ -O3 -I../eigen-master -std=c++11 new_gemm_test.cpp -o gto
echo 'Compiling current'
g++ -O3 -I. -std=c++14 new_gemm_test.cpp -D__ENABLE_VECTOR_KERNELS__ -D__ENABLE_PREFETCH__ -o gtp
g++ -O3 -I. -std=c++14 new_gemm_test.cpp -D__ENABLE_VECTOR_KERNELS__ -o gt

98
new_gemm_test.cpp Normal file
View File

@@ -0,0 +1,98 @@
#include <Eigen/Dense>

#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <ctime>
#include <iostream>
using namespace Eigen;
// Fills A(i,j) with a value that encodes (id, i, j) in separate decimal
// fields, `digits` wide each: e.g. id=1, digits=2 -> A(3,4) == 10304.
// Handy for eyeballing packing / traversal order in the GEMM kernels.
void set(MatrixXf& A, int m, int n, int id, int digits)
{
    // Hoist the loop-invariant powers of ten out of the element loop
    // (the original recomputed std::pow for every matrix entry).
    const double idField  = id * std::pow(10.0, 2*digits);
    const double rowScale = std::pow(10.0, digits);
    for(auto i = 0; i < m; i++)
        for(auto j = 0; j < n; j++)
            A(i,j) = idField + i*rowScale + j;
}
int main(int argc, char* argv[])
{
#ifdef __DEBUG__
int m = 32, k = 32, n = 32, max = std::max(std::max(m,k),n);
MatrixXf A = MatrixXf::Zero(m, k);
MatrixXf B = MatrixXf::Zero(k, n);
MatrixXf C = MatrixXf::Zero(m, n);
MatrixXf D = MatrixXf::Zero(m, n);
set(A, m, k, 1, static_cast<int>(std::log10(max)) + 1);
set(B, k, n, 2, static_cast<int>(std::log10(max)) + 1);
C = A*B;
std::cout << A << std::endl;
std::cout << B << std::endl;
std::cout << std::endl;
for(auto i = 0; i < m; i++)
{
for(auto j = 0; j < n; j++)
{
float acc=0;
for(auto kk = 0; kk < k; kk++)
{
acc += A(i,kk)*B(kk,j);
}
D(i,j) = acc;
if(std::sqrt(std::pow(D(i,j)-C(i,j),2)) > 1.0e-5)
{
std::cout << "Difference too big at " << i << " ," << j << " is " << C(i,j) << " should be " << D(i,j) << std::endl;
}
}
}
#else
if(argc < 3)
{
std::cout << "Wrong number of arguments." << std::endl;
return -1;
}
int sz = std::atoi(argv[1]);
int m = sz, k = sz, n = sz;
int RUNS = std::atoi(argv[2]);
double time = 0;
for(auto i = 0; i < RUNS; i++)
{
MatrixXf A = MatrixXf::Random(m,k);
MatrixXf B = MatrixXf::Random(k,n);
//set(A,m, k, 1);
//set(B,k, n, 2);
MatrixXf C = MatrixXf::Zero(m, n);
std::clock_t start,end;
start = std::clock();
C = A*B;
end = std::clock();
time += 1000.0*(end-start) / CLOCKS_PER_SEC;
}
std::cout << time << std::endl;
#ifdef TEST_SCALAR
start = std::clock();
for(auto i = 0; i < m; i++)
{
for(auto j = 0; j < n; j++)
{
float acc=0;
for(auto kk = 0; kk < k; kk++)
{
acc += A(i,kk)*B(kk,j);
}
C(i,j) = acc;
}
}
end = std::clock();
std::cout << 1000.0*(end-start) / CLOCKS_PER_SEC << std::endl;
#endif
#endif
return 0;
}

34
run.sh Executable file
View File

@@ -0,0 +1,34 @@
#!/bin/bash
# Timing comparison driver. For each problem size it runs the three binaries
#   gto - baseline,  gt - new kernels,  gtp - new kernels + prefetch
# $1 times each, randomising the execution order per iteration, then prints
# the speedup of gt and gtp relative to gto.
function run() {
    OLD=0
    NEW=0
    NEWP=0
    EXECS=$1
    SIZE=$2
    RUNS=$3
    for ((i = 0; i < $EXECS; i++)) do
        # Fair coin to alternate execution order (cancels cache/thermal bias).
        # The previous draw used `shuf -i 0-10` mod 2, which is biased:
        # 6 of the 11 values are even.
        SEL=$(shuf -i 0-1 -n 1)
        if [ $SEL -eq 0 ]; then
            T_OLD=$(./gto $SIZE $RUNS)
            T_NEW=$(./gt $SIZE $RUNS)
            T_NEWP=$(./gtp $SIZE $RUNS)
        else
            T_NEW=$(./gt $SIZE $RUNS)
            T_NEWP=$(./gtp $SIZE $RUNS)
            T_OLD=$(./gto $SIZE $RUNS)
        fi
        # Totals accumulate as "a+b+c" strings; bc evaluates the sum below.
        NEW=$NEW+$T_NEW
        OLD=$OLD+$T_OLD
        NEWP=$NEWP+$T_NEWP
    done
    SPEED=$(echo "($OLD) / ($NEW)" | bc -l)
    SPEEDP=$(echo "($OLD) / ($NEWP)" | bc -l)
    echo "$SIZE -> $SPEED $SPEEDP"
}
run $1 16 500
run $1 32 500
run $1 64 100
run $1 128 50
run $1 256 10
run $1 1024 10

View File

@@ -76,20 +76,20 @@ class AnnoyingScalar
AnnoyingScalar operator/(const AnnoyingScalar& other) const
{ return AnnoyingScalar((*v)/(*other.v)); }
AnnoyingScalar& operator+=(const AnnoyingScalar& other) { *v += *other.v; return *this; }
AnnoyingScalar& operator-=(const AnnoyingScalar& other) { *v -= *other.v; return *this; }
AnnoyingScalar& operator*=(const AnnoyingScalar& other) { *v *= *other.v; return *this; }
AnnoyingScalar& operator/=(const AnnoyingScalar& other) { *v /= *other.v; return *this; }
AnnoyingScalar& operator= (const AnnoyingScalar& other) { *v = *other.v; return *this; }
bool operator==(const AnnoyingScalar& other) const { return *v == *other.v; }
bool operator!=(const AnnoyingScalar& other) const { return *v != *other.v; }
bool operator<=(const AnnoyingScalar& other) const { return *v <= *other.v; }
bool operator< (const AnnoyingScalar& other) const { return *v < *other.v; }
bool operator>=(const AnnoyingScalar& other) const { return *v >= *other.v; }
bool operator> (const AnnoyingScalar& other) const { return *v > *other.v; }
float* v;
float data;
static int instances;
@@ -136,23 +136,12 @@ struct NumTraits<AnnoyingScalar> : NumTraits<float>
template<> inline AnnoyingScalar test_precision<AnnoyingScalar>() { return test_precision<float>(); }
namespace numext {
template<>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
bool (isfinite)(const AnnoyingScalar& x) {
return (numext::isfinite)(*x.v);
}
namespace internal {
template<> double cast(const AnnoyingScalar& x) { return double(*x.v); }
template<> float cast(const AnnoyingScalar& x) { return *x.v; }
}
namespace internal {
template<> EIGEN_STRONG_INLINE AnnoyingScalar pcmp_eq(const AnnoyingScalar& a, const AnnoyingScalar& b)
{ return AnnoyingScalar(pcmp_eq(*a.v, *b.v)); }
template<> EIGEN_STRONG_INLINE AnnoyingScalar pselect(const AnnoyingScalar& mask, const AnnoyingScalar& a, const AnnoyingScalar& b)
{ return numext::equal_strict(*mask.v, 0.f) ? b : a; }
template<> EIGEN_STRONG_INLINE double cast(const AnnoyingScalar& x) { return double(*x.v); }
template<> EIGEN_STRONG_INLINE float cast(const AnnoyingScalar& x) { return *x.v; }
}
} // namespace Eigen
AnnoyingScalar get_test_precision(const AnnoyingScalar&)
{ return Eigen::test_precision<AnnoyingScalar>(); }

View File

@@ -460,7 +460,3 @@ cmake_dependent_option(EIGEN_TEST_BUILD_DOCUMENTATION "Test building the doxygen
if(EIGEN_TEST_BUILD_DOCUMENTATION)
add_dependencies(buildtests doc)
endif()
# Register all smoke tests
include("EigenSmokeTestList")
ei_add_smoke_tests("${ei_smoke_test_list}")

View File

@@ -332,9 +332,7 @@ EIGEN_DECLARE_TEST(geo_quaternion)
CALL_SUBTEST_2(( quaternionAlignment<double>() ));
CALL_SUBTEST_2( mapQuaternion<double>() );
#ifndef EIGEN_TEST_ANNOYING_SCALAR_DONT_THROW
AnnoyingScalar::dont_throw = true;
#endif
CALL_SUBTEST_3(( quaternion<AnnoyingScalar,AutoAlign>() ));
}
}

View File

@@ -29,6 +29,10 @@
#endif
#ifdef EIGEN_GOOGLEHASH_SUPPORT
#include <google/sparse_hash_map>
#endif
#include <Eigen/Cholesky>
#include <Eigen/LU>
#include <Eigen/Sparse>

View File

@@ -7,9 +7,9 @@
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#include "main.h"
#include <iterator>
#include <numeric>
#include "main.h"
template< class Iterator >
std::reverse_iterator<Iterator>
@@ -47,18 +47,6 @@ bool is_pointer_based_stl_iterator(const internal::pointer_based_stl_iterator<Xp
template<typename XprType>
bool is_generic_randaccess_stl_iterator(const internal::generic_randaccess_stl_iterator<XprType> &) { return true; }
template<typename Iter>
bool is_default_constructible_and_assignable(const Iter& it)
{
#if EIGEN_HAS_CXX11
VERIFY(std::is_default_constructible<Iter>::value);
VERIFY(std::is_nothrow_default_constructible<Iter>::value);
#endif
Iter it2;
it2 = it;
return (it==it2);
}
template<typename Xpr>
void check_begin_end_for_loop(Xpr xpr)
{
@@ -136,22 +124,6 @@ void test_stl_iterators(int rows=Rows, int cols=Cols)
Index i, j;
// Verify that iterators are default constructible (See bug #1900)
{
VERIFY( is_default_constructible_and_assignable(v.begin()));
VERIFY( is_default_constructible_and_assignable(v.end()));
VERIFY( is_default_constructible_and_assignable(cv.begin()));
VERIFY( is_default_constructible_and_assignable(cv.end()));
VERIFY( is_default_constructible_and_assignable(A.row(0).begin()));
VERIFY( is_default_constructible_and_assignable(A.row(0).end()));
VERIFY( is_default_constructible_and_assignable(cA.row(0).begin()));
VERIFY( is_default_constructible_and_assignable(cA.row(0).end()));
VERIFY( is_default_constructible_and_assignable(B.row(0).begin()));
VERIFY( is_default_constructible_and_assignable(B.row(0).end()));
}
// Check we got a fast pointer-based iterator when expected
{
VERIFY( is_pointer_based_stl_iterator(v.begin()) );

View File

@@ -298,8 +298,7 @@ EIGEN_DONT_INLINE Scalar zero() { return Scalar(0); }
// workaround aggressive optimization in ICC
template<typename T> EIGEN_DONT_INLINE T sub(T a, T b) { return a - b; }
// This function verifies we don't iterate infinitely on nan/inf values,
// and that info() returns InvalidInput.
// all this function does is verify we don't iterate infinitely on nan/inf values
template<typename SvdType, typename MatrixType>
void svd_inf_nan()
{
@@ -308,22 +307,18 @@ void svd_inf_nan()
Scalar some_inf = Scalar(1) / zero<Scalar>();
VERIFY(sub(some_inf, some_inf) != sub(some_inf, some_inf));
svd.compute(MatrixType::Constant(10,10,some_inf), ComputeFullU | ComputeFullV);
VERIFY(svd.info() == InvalidInput);
Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
VERIFY(nan != nan);
svd.compute(MatrixType::Constant(10,10,nan), ComputeFullU | ComputeFullV);
VERIFY(svd.info() == InvalidInput);
MatrixType m = MatrixType::Zero(10,10);
m(internal::random<int>(0,9), internal::random<int>(0,9)) = some_inf;
svd.compute(m, ComputeFullU | ComputeFullV);
VERIFY(svd.info() == InvalidInput);
m = MatrixType::Zero(10,10);
m(internal::random<int>(0,9), internal::random<int>(0,9)) = nan;
svd.compute(m, ComputeFullU | ComputeFullV);
VERIFY(svd.info() == InvalidInput);
// regression test for bug 791
m.resize(3,3);
@@ -331,7 +326,6 @@ void svd_inf_nan()
0, -0.5, 0,
nan, 0, 0;
svd.compute(m, ComputeFullU | ComputeFullV);
VERIFY(svd.info() == InvalidInput);
m.resize(4,4);
m << 1, 0, 0, 0,
@@ -339,7 +333,6 @@ void svd_inf_nan()
1, 0, 1, nan,
0, nan, nan, 0;
svd.compute(m, ComputeFullU | ComputeFullV);
VERIFY(svd.info() == InvalidInput);
}
// Regression test for bug 286: JacobiSVD loops indefinitely with some

View File

@@ -466,7 +466,7 @@ struct sizes_match_below_dim {
template <typename Dims1, typename Dims2, ptrdiff_t n>
struct sizes_match_below_dim<Dims1, Dims2, n, n> {
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Dims1& dims1, Dims2& dims2) {
return (array_get<n-1>(dims1) == array_get<n-1>(dims2)) &&
return (array_get<n-1>(dims1) == array_get<n-1>(dims2)) &
sizes_match_below_dim<Dims1, Dims2, n-1, n-1>::run(dims1, dims2);
}
};

View File

@@ -357,8 +357,8 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ScanKernel(Self self, Index total_s
}
template <typename Self, typename Reducer, bool Vectorize>
struct ScanLauncher<Self, Reducer, GpuDevice, Vectorize> {
template <typename Self, typename Reducer>
struct ScanLauncher<Self, Reducer, GpuDevice, false> {
void operator()(const Self& self, typename Self::CoeffReturnType* data) {
Index total_size = internal::array_prod(self.dimensions());
Index num_blocks = (total_size / self.size() + 63) / 64;

View File

@@ -14,7 +14,6 @@
#include "../../Eigen/Jacobi"
#include "../../Eigen/Householder"
/**
* \defgroup IterativeLinearSolvers_Module Iterative solvers module
* This module aims to provide various iterative linear and non linear solver algorithms.
@@ -24,12 +23,11 @@
* - an IDR(s) implementation
* - a DGMRES implementation
* - a MINRES implementation
*
* \code
* #include <unsupported/Eigen/IterativeSolvers>
* \endcode
*/
//@{
#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
@@ -47,5 +45,6 @@
#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
//@}
#endif // EIGEN_ITERATIVE_SOLVERS_MODULE_H

View File

@@ -24,7 +24,6 @@
#ifdef EIGEN_GOOGLEHASH_SUPPORT
#include <google/dense_hash_map>
#include <google/sparse_hash_map>
#endif
/**

View File

@@ -10,13 +10,7 @@
#ifndef EIGEN_RANDOMSETTER_H
#define EIGEN_RANDOMSETTER_H
#if defined(EIGEN_GOOGLEHASH_SUPPORT)
// Ensure the ::google namespace exists, required for checking existence of
// ::google::dense_hash_map and ::google::sparse_hash_map.
namespace google {}
#endif
namespace Eigen {
namespace Eigen {
/** Represents a std::map
*
@@ -62,26 +56,7 @@ template<typename Scalar> struct StdUnorderedMapTraits
};
#endif // EIGEN_UNORDERED_MAP_SUPPORT
#if defined(EIGEN_GOOGLEHASH_SUPPORT)
namespace google {
// Namespace work-around, since sometimes dense_hash_map and sparse_hash_map
// are in the global namespace, and other times they are under ::google.
using namespace ::google;
template<typename KeyType, typename Scalar>
struct DenseHashMap {
typedef dense_hash_map<KeyType, Scalar> type;
};
template<typename KeyType, typename Scalar>
struct SparseHashMap {
typedef sparse_hash_map<KeyType, Scalar> type;
};
} // namespace google
#ifdef _DENSE_HASH_MAP_H_
/** Represents a google::dense_hash_map
*
* \see RandomSetter
@@ -89,7 +64,7 @@ struct SparseHashMap {
template<typename Scalar> struct GoogleDenseHashMapTraits
{
typedef int KeyType;
typedef typename google::DenseHashMap<KeyType,Scalar>::type Type;
typedef google::dense_hash_map<KeyType,Scalar> Type;
enum {
IsSorted = 0
};
@@ -97,7 +72,9 @@ template<typename Scalar> struct GoogleDenseHashMapTraits
static void setInvalidKey(Type& map, const KeyType& k)
{ map.set_empty_key(k); }
};
#endif
#ifdef _SPARSE_HASH_MAP_H_
/** Represents a google::sparse_hash_map
*
* \see RandomSetter
@@ -105,7 +82,7 @@ template<typename Scalar> struct GoogleDenseHashMapTraits
template<typename Scalar> struct GoogleSparseHashMapTraits
{
typedef int KeyType;
typedef typename google::SparseHashMap<KeyType,Scalar>::type Type;
typedef google::sparse_hash_map<KeyType,Scalar> Type;
enum {
IsSorted = 0
};
@@ -157,17 +134,18 @@ template<typename Scalar> struct GoogleSparseHashMapTraits
* GoogleSparseHashMapTraits, GnuHashMapTraits, and finally StdMapTraits.
*
* For performance and memory consumption reasons it is highly recommended to use one of
* Google's hash_map implementations. To enable the support for them, you must define
* EIGEN_GOOGLEHASH_SUPPORT. This will include both <google/dense_hash_map> and
* <google/sparse_hash_map> for you.
* the Google's hash_map implementation. To enable the support for them, you have two options:
* - \#include <google/dense_hash_map> yourself \b before Eigen/Sparse header
* - define EIGEN_GOOGLEHASH_SUPPORT
* In the later case the inclusion of <google/dense_hash_map> is made for you.
*
* \see https://github.com/sparsehash/sparsehash
* \see http://code.google.com/p/google-sparsehash/
*/
template<typename SparseMatrixType,
template <typename T> class MapTraits =
#if defined(EIGEN_GOOGLEHASH_SUPPORT)
#if defined _DENSE_HASH_MAP_H_
GoogleDenseHashMapTraits
#elif defined(_HASH_MAP)
#elif defined _HASH_MAP
GnuHashMapTraits
#else
StdMapTraits

View File

@@ -444,7 +444,7 @@ void test_gpu_forced_evals() {
d_float, num_elem);
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half1(
d_res_half1, num_elem);
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Unaligned> gpu_res_half2(
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Unaligned> gpu_res_half2(
d_res_half2, num_elem);
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float(
d_res_float, num_elem);
@@ -461,7 +461,7 @@ void test_gpu_forced_evals() {
Tensor<float, 1> half_prec2(num_elem);
Tensor<float, 1> full_prec(num_elem);
gpu_device.memcpyDeviceToHost(half_prec1.data(), d_res_half1, num_elem*sizeof(float));
gpu_device.memcpyDeviceToHost(half_prec2.data(), d_res_half2, num_elem*sizeof(float));
gpu_device.memcpyDeviceToHost(half_prec2.data(), d_res_half1, num_elem*sizeof(float));
gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float));
gpu_device.synchronize();

View File

@@ -123,8 +123,10 @@ template<typename SparseMatrixType> void sparse_extra(const SparseMatrixType& re
#ifdef EIGEN_UNORDERED_MAP_SUPPORT
VERIFY(( test_random_setter<RandomSetter<SparseMatrixType, StdUnorderedMapTraits> >(m,refMat,nonzeroCoords) ));
#endif
#ifdef EIGEN_GOOGLEHASH_SUPPORT
#ifdef _DENSE_HASH_MAP_H_
VERIFY(( test_random_setter<RandomSetter<SparseMatrixType, GoogleDenseHashMapTraits> >(m,refMat,nonzeroCoords) ));
#endif
#ifdef _SPARSE_HASH_MAP_H_
VERIFY(( test_random_setter<RandomSetter<SparseMatrixType, GoogleSparseHashMapTraits> >(m,refMat,nonzeroCoords) ));
#endif