mirror of
https://gitlab.com/libeigen/eigen.git
synced 2026-04-10 11:34:33 +08:00
Fix incorrect NEON native fp16 multiplication.
This commit is contained in:
committed by
Rasmus Munk Larsen
parent
dd85d26946
commit
bb6675caf7
@@ -2,7 +2,7 @@
|
||||
|
||||
namespace Eigen {
|
||||
namespace internal {
|
||||
|
||||
|
||||
#if EIGEN_ARCH_ARM && EIGEN_COMP_CLANG
|
||||
|
||||
// Clang seems to excessively spill registers in the GEBP kernel on 32-bit arm.
|
||||
@@ -218,7 +218,9 @@ struct gebp_traits <half,half,false,false,Architecture::NEON>
|
||||
|
||||
// Quad-load entry point for the RHS operand. Per the hunk header above, this
// method appears to belong to gebp_traits<half,half,false,false,Architecture::NEON>.
// As listed here, the body forwards to loadRhs(b, dest) and then evaluates an
// eigen_assert that always fails with the message
// "Cannot loadRhsQuad for a scalar RHS." (whenever eigen_assert is active).
EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
|
||||
{
|
||||
// NOTE(review): this listing is a rendered diff without +/- markers. The hunk
// header (-218,7 +218,9) is consistent with the loadRhs call below being the
// line the commit removed, and the comment + assert after it being the
// replacement. Confirm against the real file before assuming both coexist.
loadRhs(b,dest);
|
||||
// If LHS is a Packet8h, we cannot correctly mimic a ploadquad of the RHS
|
||||
// using a single scalar value.
|
||||
eigen_assert(false && "Cannot loadRhsQuad for a scalar RHS.");
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
|
||||
|
||||
@@ -751,6 +751,9 @@ void loadQuadToDoublePacket(const Scalar* b, DoublePacket<RealPacket>& dest,
|
||||
|
||||
// Partial specialization of unpacket_traits for DoublePacket<Packet>.
// Both members are derived from the traits of the wrapped Packet type.
template <typename Packet>
struct unpacket_traits<DoublePacket<Packet> > {
  // Reported element count: twice the size of the underlying Packet.
  enum { size = 2 * unpacket_traits<Packet>::size };

  // "Half" type: a DoublePacket wrapping the underlying Packet's own half type.
  typedef DoublePacket<typename unpacket_traits<Packet>::half> half;
};
|
||||
// template<typename Packet>
|
||||
// DoublePacket<Packet> pmadd(const DoublePacket<Packet> &a, const DoublePacket<Packet> &b)
|
||||
@@ -2490,7 +2493,13 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
|
||||
// nr (which is currently 4) for the return type.
|
||||
const int SResPacketHalfSize = unpacket_traits<typename unpacket_traits<SResPacket>::half>::size;
|
||||
const int SResPacketQuarterSize = unpacket_traits<typename unpacket_traits<typename unpacket_traits<SResPacket>::half>::half>::size;
|
||||
if ((SwappedTraits::LhsProgress % 4) == 0 &&
|
||||
// The following code assumes we can load SRhsPacket in such a way that
|
||||
// it multiplies blocks of 4 elements in SLhsPacket. This is not the
|
||||
// case for some customized kernels (i.e. NEON fp16). If the assumption
|
||||
// fails, drop down to the scalar path.
|
||||
constexpr bool kCanLoadSRhsQuad = (unpacket_traits<SLhsPacket>::size < 4) || (unpacket_traits<SRhsPacket>::size % (unpacket_traits<SLhsPacket>::size / 4)) == 0;
|
||||
if (kCanLoadSRhsQuad &&
|
||||
(SwappedTraits::LhsProgress % 4) == 0 &&
|
||||
(SwappedTraits::LhsProgress<=16) &&
|
||||
(SwappedTraits::LhsProgress!=8 || SResPacketHalfSize==nr) &&
|
||||
(SwappedTraits::LhsProgress!=16 || SResPacketQuarterSize==nr))
|
||||
|
||||
Reference in New Issue
Block a user