diff --git a/Eigen/Core b/Eigen/Core index 86608f3c8..616b184b1 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -437,6 +437,8 @@ using std::ptrdiff_t; #include "src/Core/arch/NEON/GeneralBlockPanelKernel.h" #elif defined EIGEN_VECTORIZE_LSX #include "src/Core/arch/LSX/GeneralBlockPanelKernel.h" +#elif defined EIGEN_VECTORIZE_RVV10 +#include "src/Core/arch/RVV10/GeneralBlockPanelKernel.h" #endif #if defined(EIGEN_VECTORIZE_AVX512) diff --git a/Eigen/src/Core/arch/RVV10/GeneralBlockPanelKernel.h b/Eigen/src/Core/arch/RVV10/GeneralBlockPanelKernel.h new file mode 100644 index 000000000..026e1dbc1 --- /dev/null +++ b/Eigen/src/Core/arch/RVV10/GeneralBlockPanelKernel.h @@ -0,0 +1,236 @@ +// This file is part of Eigen, a lightweight C template library +// for linear algebra. +// +// Copyright (C) 2024 Kseniya Zaytseva +// Copyright (C) 2025 Chip Kerchner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_RVV10_GENERAL_BLOCK_KERNEL_H +#define EIGEN_RVV10_GENERAL_BLOCK_KERNEL_H + +// IWYU pragma: private +#include "../../InternalHeaderCheck.h" + +namespace Eigen { +namespace internal { + +/********************************* real ************************************/ + +template <> +struct gebp_traits + : gebp_traits { + typedef float RhsPacket; + typedef QuadPacket RhsPacketx4; + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const { dest = pset1(*b); } + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const { + pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3); + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const { loadRhs(b, dest); } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {} + + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { dest = ploadquad(b); } + + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, + const FixedInt<0>&) const { +#if EIGEN_RISCV64_DEFAULT_LMUL == 1 + c = __riscv_vfmadd_vf_f32m1(a, b, c, unpacket_traits::size); +#elif EIGEN_RISCV64_DEFAULT_LMUL == 2 + c = __riscv_vfmadd_vf_f32m2(a, b, c, unpacket_traits::size); +#elif EIGEN_RISCV64_DEFAULT_LMUL == 4 + c = __riscv_vfmadd_vf_f32m4(a, b, c, unpacket_traits::size); +#endif + } + +#if EIGEN_RISCV64_DEFAULT_LMUL >= 2 + EIGEN_STRONG_INLINE void madd(const Packet1Xf& a, const RhsPacket& b, Packet1Xf& c, RhsPacket& /*tmp*/, + const FixedInt<0>&) const { + c = __riscv_vfmadd_vf_f32m1(a, b, c, unpacket_traits::size); + } +#endif +#if EIGEN_RISCV64_DEFAULT_LMUL == 4 + EIGEN_STRONG_INLINE void madd(const Packet2Xf& a, const RhsPacket& b, Packet2Xf& c, RhsPacket& /*tmp*/, + const FixedInt<0>&) const { + c = __riscv_vfmadd_vf_f32m2(a, b, c, unpacket_traits::size); + } +#endif + + template + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, + const LaneIdType& lane) const { +#if EIGEN_RISCV64_DEFAULT_LMUL == 1 + c = __riscv_vfmadd_vf_f32m1(a, b.get(lane), c, unpacket_traits::size); +#elif EIGEN_RISCV64_DEFAULT_LMUL == 2 + c = __riscv_vfmadd_vf_f32m2(a, b.get(lane), c, unpacket_traits::size); +#elif EIGEN_RISCV64_DEFAULT_LMUL == 4 + c = __riscv_vfmadd_vf_f32m4(a, b.get(lane), c, unpacket_traits::size); +#endif + } +}; + +template <> +struct gebp_traits + : gebp_traits { + typedef double RhsPacket; + typedef QuadPacket RhsPacketx4; + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const { dest = pset1(*b); } + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const { + pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3); + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const { loadRhs(b, dest); } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {} + + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { dest = ploadquad(b); } + + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, + const FixedInt<0>&) const { +#if EIGEN_RISCV64_DEFAULT_LMUL == 1 + c = __riscv_vfmadd_vf_f64m1(a, b, c, unpacket_traits::size); +#elif EIGEN_RISCV64_DEFAULT_LMUL == 2 + c = __riscv_vfmadd_vf_f64m2(a, b, c, unpacket_traits::size); +#elif EIGEN_RISCV64_DEFAULT_LMUL == 4 + c = __riscv_vfmadd_vf_f64m4(a, b, c, unpacket_traits::size); +#endif + } + +#if EIGEN_RISCV64_DEFAULT_LMUL >= 2 + EIGEN_STRONG_INLINE void madd(const Packet1Xd& a, const RhsPacket& b, Packet1Xd& c, RhsPacket& /*tmp*/, + const FixedInt<0>&) const { + c = __riscv_vfmadd_vf_f64m1(a, b, c, unpacket_traits::size); + } +#endif +#if EIGEN_RISCV64_DEFAULT_LMUL == 4 + EIGEN_STRONG_INLINE void madd(const Packet2Xd& a, const RhsPacket& b, Packet2Xd& c, RhsPacket& /*tmp*/, + const FixedInt<0>&) const { + c = __riscv_vfmadd_vf_f64m2(a, b, c, unpacket_traits::size); + } +#endif + + template + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, + const LaneIdType& lane) const { +#if EIGEN_RISCV64_DEFAULT_LMUL == 1 + c = __riscv_vfmadd_vf_f64m1(a, b.get(lane), c, unpacket_traits::size); +#elif EIGEN_RISCV64_DEFAULT_LMUL == 2 + c = __riscv_vfmadd_vf_f64m2(a, b.get(lane), c, unpacket_traits::size); +#elif EIGEN_RISCV64_DEFAULT_LMUL == 4 + c = __riscv_vfmadd_vf_f64m4(a, b.get(lane), c, unpacket_traits::size); +#endif + } +}; + +#if defined(EIGEN_VECTORIZE_RVV10FP16) + +template <> +struct gebp_traits + : gebp_traits { + typedef half RhsPacket; + typedef PacketXh LhsPacket; + typedef PacketXh AccPacket; + typedef QuadPacket RhsPacketx4; + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const { dest = pset1(*b); } + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const { + pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3); + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const { loadRhs(b, dest); } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {} + + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { dest = pload(b); } + + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, + const FixedInt<0>&) const { +#if EIGEN_RISCV64_DEFAULT_LMUL == 1 + c = __riscv_vfmadd_vf_f16m1(a, numext::bit_cast<_Float16>(b), c, unpacket_traits::size); +#else + c = __riscv_vfmadd_vf_f16m2(a, numext::bit_cast<_Float16>(b), c, unpacket_traits::size); +#endif + } + +#if EIGEN_RISCV64_DEFAULT_LMUL >= 2 + EIGEN_STRONG_INLINE void madd(const Packet1Xh& a, const RhsPacket& b, Packet1Xh& c, RhsPacket& /*tmp*/, + const FixedInt<0>&) const { + c = __riscv_vfmadd_vf_f16m1(a, numext::bit_cast<_Float16>(b), c, unpacket_traits::size); + } +#endif + + template + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, + const LaneIdType& lane) const { +#if EIGEN_RISCV64_DEFAULT_LMUL == 1 + c = __riscv_vfmadd_vf_f16m1(a, numext::bit_cast<_Float16>(b.get(lane)), c, unpacket_traits::size); +#else + c = __riscv_vfmadd_vf_f16m2(a, numext::bit_cast<_Float16>(b.get(lane)), c, unpacket_traits::size); +#endif + } +}; + +#endif + +#if defined(EIGEN_VECTORIZE_RVV10BF16) + +template <> +struct gebp_traits + : gebp_traits { + typedef bfloat16 RhsPacket; + typedef PacketXbf LhsPacket; + typedef PacketXbf AccPacket; + typedef QuadPacket RhsPacketx4; + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const { dest = pset1(*b); } + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const { + pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3); + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const { loadRhs(b, dest); } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {} + + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { dest = pload(b); } + + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, + const FixedInt<0>&) const { +#if EIGEN_RISCV64_DEFAULT_LMUL == 1 + c = F32ToBf16( + __riscv_vfwmaccbf16_vf_f32m2(Bf16ToF32(c), numext::bit_cast<__bf16>(b), a, unpacket_traits::size)); +#else + c = F32ToBf16( + __riscv_vfwmaccbf16_vf_f32m4(Bf16ToF32(c), numext::bit_cast<__bf16>(b), a, unpacket_traits::size)); +#endif + } + +#if EIGEN_RISCV64_DEFAULT_LMUL >= 2 + EIGEN_STRONG_INLINE void madd(const Packet1Xbf& a, const RhsPacket& b, Packet1Xbf& c, RhsPacket& /*tmp*/, + const FixedInt<0>&) const { + c = F32ToBf16( + __riscv_vfwmaccbf16_vf_f32m2(Bf16ToF32(c), numext::bit_cast<__bf16>(b), a, unpacket_traits::size)); + } +#endif + + template + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, + const LaneIdType& lane) const { +#if EIGEN_RISCV64_DEFAULT_LMUL == 1 + c = F32ToBf16(__riscv_vfwmaccbf16_vf_f32m2(Bf16ToF32(c), numext::bit_cast<__bf16>(b.get(lane)), a, + unpacket_traits::size)); +#else + c = F32ToBf16(__riscv_vfwmaccbf16_vf_f32m4(Bf16ToF32(c), numext::bit_cast<__bf16>(b.get(lane)), a, + unpacket_traits::size)); +#endif + } +}; + +#endif + +} // namespace internal +} // namespace Eigen + +#endif // EIGEN_RVV10_GENERAL_BLOCK_KERNEL_H diff --git a/Eigen/src/Core/arch/RVV10/PacketMath.h b/Eigen/src/Core/arch/RVV10/PacketMath.h index e0e0be4a8..909fd88ce 100644 --- a/Eigen/src/Core/arch/RVV10/PacketMath.h +++ b/Eigen/src/Core/arch/RVV10/PacketMath.h @@ -2,6 +2,7 @@ // for linear algebra. // // Copyright (C) 2024 Kseniya Zaytseva +// Copyright (C) 2025 Chip Kerchner // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed @@ -51,15 +52,11 @@ typedef vbool4_t PacketMask4; typedef eigen_packet_wrapper Packet1Xi; typedef eigen_packet_wrapper Packet1Xu; -typedef eigen_packet_wrapper - Packet2Xi; -typedef eigen_packet_wrapper - Packet2Xu; +typedef eigen_packet_wrapper Packet2Xi; +typedef eigen_packet_wrapper Packet2Xu; -typedef eigen_packet_wrapper - Packet4Xi; -typedef eigen_packet_wrapper - Packet4Xu; +typedef eigen_packet_wrapper Packet4Xi; +typedef eigen_packet_wrapper Packet4Xu; #if EIGEN_RISCV64_DEFAULT_LMUL == 1 typedef Packet1Xi PacketXi; @@ -351,18 +348,17 @@ EIGEN_STRONG_INLINE Packet1Xi ploadu(const numext::int32_t* from) { template <> EIGEN_STRONG_INLINE Packet1Xi ploaddup(const numext::int32_t* from) { - Packet1Xu idx = __riscv_vid_v_u32m1(unpacket_traits::size); - idx = __riscv_vsll_vx_u32m1(__riscv_vand_vx_u32m1(idx, 0xfffffffeu, unpacket_traits::size), 1, - unpacket_traits::size); - // idx = 0 0 sizeof(int32_t) sizeof(int32_t) 2*sizeof(int32_t) 2*sizeof(int32_t) ... - return __riscv_vloxei32_v_i32m1(from, idx, unpacket_traits::size); + Packet1Xu data = __riscv_vreinterpret_v_i32m1_u32m1(pload(from)); + return __riscv_vreinterpret_v_i64m1_i32m1(__riscv_vreinterpret_v_u64m1_i64m1(__riscv_vlmul_trunc_v_u64m2_u64m1( + __riscv_vwmaccu_vx_u64m2(__riscv_vwaddu_vv_u64m2(data, data, unpacket_traits::size), 0xffffffffu, data, + unpacket_traits::size)))); } template <> EIGEN_STRONG_INLINE Packet1Xi ploadquad(const numext::int32_t* from) { - Packet1Xu idx = __riscv_vid_v_u32m1(unpacket_traits::size); - idx = __riscv_vand_vx_u32m1(idx, 0xfffffffcu, unpacket_traits::size); - return __riscv_vloxei32_v_i32m1(from, idx, unpacket_traits::size); + Packet1Xu idx = + __riscv_vsrl_vx_u32m1(__riscv_vid_v_u32m1(unpacket_traits::size), 2, unpacket_traits::size); + return __riscv_vrgather_vv_i32m1(pload(from), idx, unpacket_traits::size); } template <> @@ -382,7 +378,7 @@ EIGEN_DEVICE_FUNC inline Packet1Xi pgather(const num template <> EIGEN_DEVICE_FUNC inline void pscatter(numext::int32_t* to, const Packet1Xi& from, - Index stride) { + Index stride) { __riscv_vsse32(to, stride * sizeof(numext::int32_t), from, unpacket_traits::size); } @@ -394,7 +390,7 @@ EIGEN_STRONG_INLINE numext::int32_t pfirst(const Packet1Xi& a) { template <> EIGEN_STRONG_INLINE Packet1Xi preverse(const Packet1Xi& a) { Packet1Xu idx = __riscv_vrsub_vx_u32m1(__riscv_vid_v_u32m1(unpacket_traits::size), - unpacket_traits::size - 1, unpacket_traits::size); + unpacket_traits::size - 1, unpacket_traits::size); return __riscv_vrgather_vv_i32m1(a, idx, unpacket_traits::size); } @@ -669,6 +665,12 @@ EIGEN_STRONG_INLINE Packet1Xf pabs(const Packet1Xf& a) { return __riscv_vfabs_v_f32m1(a, unpacket_traits::size); } +template <> +EIGEN_STRONG_INLINE Packet1Xf pabsdiff(const Packet1Xf& a, const Packet1Xf& b) { + return __riscv_vfabs_v_f32m1(__riscv_vfsub_vv_f32m1(a, b, unpacket_traits::size), + unpacket_traits::size); +} + template <> EIGEN_STRONG_INLINE Packet1Xf pset1(const float& from) { return __riscv_vfmv_v_f_f32m1(from, unpacket_traits::size); @@ -687,6 +689,16 @@ EIGEN_STRONG_INLINE Packet1Xf plset(const float& a) { return __riscv_vfadd_vf_f32m1(idx, a, unpacket_traits::size); } +template <> +EIGEN_STRONG_INLINE void pbroadcast4(const float* a, Packet1Xf& a0, Packet1Xf& a1, Packet1Xf& a2, + Packet1Xf& a3) { + vfloat32m1_t aa = __riscv_vle32_v_f32m1(a, 4); + a0 = __riscv_vrgather_vx_f32m1(aa, 0, unpacket_traits::size); + a1 = __riscv_vrgather_vx_f32m1(aa, 1, unpacket_traits::size); + a2 = __riscv_vrgather_vx_f32m1(aa, 2, unpacket_traits::size); + a3 = __riscv_vrgather_vx_f32m1(aa, 3, unpacket_traits::size); +} + template <> EIGEN_STRONG_INLINE Packet1Xf padd(const Packet1Xf& a, const Packet1Xf& b) { return __riscv_vfadd_vv_f32m1(a, b, unpacket_traits::size); @@ -702,6 +714,12 @@ EIGEN_STRONG_INLINE Packet1Xf pnegate(const Packet1Xf& a) { return __riscv_vfneg_v_f32m1(a, unpacket_traits::size); } +template <> +EIGEN_STRONG_INLINE Packet1Xf psignbit(const Packet1Xf& a) { + return __riscv_vreinterpret_v_i32m1_f32m1( + __riscv_vsra_vx_i32m1(__riscv_vreinterpret_v_f32m1_i32m1(a), 31, unpacket_traits::size)); +} + template <> EIGEN_STRONG_INLINE Packet1Xf pconj(const Packet1Xf& a) { return a; @@ -840,17 +858,18 @@ EIGEN_STRONG_INLINE Packet1Xf ploadu(const float* from) { template <> EIGEN_STRONG_INLINE Packet1Xf ploaddup(const float* from) { - Packet1Xu idx = __riscv_vid_v_u32m1(unpacket_traits::size); - idx = __riscv_vsll_vx_u32m1(__riscv_vand_vx_u32m1(idx, 0xfffffffeu, unpacket_traits::size), 1, - unpacket_traits::size); - return __riscv_vloxei32_v_f32m1(from, idx, unpacket_traits::size); + Packet1Xu data = __riscv_vreinterpret_v_f32m1_u32m1(pload(from)); + return __riscv_vreinterpret_v_i32m1_f32m1( + __riscv_vreinterpret_v_i64m1_i32m1(__riscv_vreinterpret_v_u64m1_i64m1(__riscv_vlmul_trunc_v_u64m2_u64m1( + __riscv_vwmaccu_vx_u64m2(__riscv_vwaddu_vv_u64m2(data, data, unpacket_traits::size), 0xffffffffu, + data, unpacket_traits::size))))); } template <> EIGEN_STRONG_INLINE Packet1Xf ploadquad(const float* from) { - Packet1Xu idx = __riscv_vid_v_u32m1(unpacket_traits::size); - idx = __riscv_vand_vx_u32m1(idx, 0xfffffffcu, unpacket_traits::size); - return __riscv_vloxei32_v_f32m1(from, idx, unpacket_traits::size); + Packet1Xu idx = + __riscv_vsrl_vx_u32m1(__riscv_vid_v_u32m1(unpacket_traits::size), 2, unpacket_traits::size); + return __riscv_vrgather_vv_f32m1(pload(from), idx, unpacket_traits::size); } template <> @@ -891,7 +910,7 @@ EIGEN_STRONG_INLINE Packet1Xf print(const Packet1Xf& a) { PacketMask32 mask = __riscv_vmfne_vv_f32m1_b32(a, a, unpacket_traits::size); const Packet1Xf x = __riscv_vfadd_vv_f32m1_tumu(mask, a, a, a, unpacket_traits::size); const Packet1Xf new_x = __riscv_vfcvt_f_x_v_f32m1(__riscv_vfcvt_x_f_v_i32m1(a, unpacket_traits::size), - unpacket_traits::size); + unpacket_traits::size); mask = __riscv_vmflt_vv_f32m1_b32(abs_a, limit, unpacket_traits::size); Packet1Xf signed_x = __riscv_vfsgnj_vv_f32m1(new_x, x, unpacket_traits::size); @@ -909,7 +928,7 @@ EIGEN_STRONG_INLINE Packet1Xf pfloor(const Packet1Xf& a) { template <> EIGEN_STRONG_INLINE Packet1Xf preverse(const Packet1Xf& a) { Packet1Xu idx = __riscv_vrsub_vx_u32m1(__riscv_vid_v_u32m1(unpacket_traits::size), - unpacket_traits::size - 1, unpacket_traits::size); + unpacket_traits::size - 1, unpacket_traits::size); return __riscv_vrgather_vv_f32m1(a, idx, unpacket_traits::size); } @@ -952,22 +971,20 @@ EIGEN_STRONG_INLINE float predux_mul(const Packet1Xf& a) { template <> EIGEN_STRONG_INLINE float predux_min(const Packet1Xf& a) { - return ( - std::min)(__riscv_vfmv_f(__riscv_vfredmin_vs_f32m1_f32m1( - a, - __riscv_vfmv_v_f_f32m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size), - unpacket_traits::size)), - (std::numeric_limits::max)()); + return (std::min)( + __riscv_vfmv_f(__riscv_vfredmin_vs_f32m1_f32m1( + a, __riscv_vfmv_v_f_f32m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size), + unpacket_traits::size)), + (std::numeric_limits::max)()); } template <> EIGEN_STRONG_INLINE float predux_max(const Packet1Xf& a) { - return ( - std::max)(__riscv_vfmv_f(__riscv_vfredmax_vs_f32m1_f32m1( - a, - __riscv_vfmv_v_f_f32m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size), - unpacket_traits::size)), - -(std::numeric_limits::max)()); + return (std::max)( + __riscv_vfmv_f(__riscv_vfredmax_vs_f32m1_f32m1( + a, __riscv_vfmv_v_f_f32m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size), + unpacket_traits::size)), + -(std::numeric_limits::max)()); } template @@ -1012,18 +1029,22 @@ EIGEN_STRONG_INLINE Packet1Xf pselect(const PacketMask32& mask, const Packet1Xf& return __riscv_vmerge_vvm_f32m1(b, a, mask, unpacket_traits::size); } +EIGEN_STRONG_INLINE Packet1Xf pselect(const Packet1Xf& mask, const Packet1Xf& a, const Packet1Xf& b) { + PacketMask32 mask2 = + __riscv_vmsne_vx_i32m1_b32(__riscv_vreinterpret_v_f32m1_i32m1(mask), 0, unpacket_traits::size); + return __riscv_vmerge_vvm_f32m1(b, a, mask2, unpacket_traits::size); +} + /********************************* int64 **************************************/ typedef eigen_packet_wrapper Packet1Xl; typedef eigen_packet_wrapper Packet1Xul; -typedef eigen_packet_wrapper - Packet2Xl; +typedef eigen_packet_wrapper Packet2Xl; typedef eigen_packet_wrapper Packet2Xul; -typedef eigen_packet_wrapper - Packet4Xl; +typedef eigen_packet_wrapper Packet4Xl; typedef eigen_packet_wrapper Packet4Xul; @@ -1317,20 +1338,16 @@ EIGEN_STRONG_INLINE Packet1Xl ploadu(const numext::int64_t* from) { template <> EIGEN_STRONG_INLINE Packet1Xl ploaddup(const numext::int64_t* from) { - Packet1Xul idx = __riscv_vid_v_u64m1(unpacket_traits::size); - idx = __riscv_vsll_vx_u64m1(__riscv_vand_vx_u64m1(idx, 0xfffffffffffffffeu, unpacket_traits::size), 2, - unpacket_traits::size); - // idx = 0 0 sizeof(int64_t) sizeof(int64_t) 2*sizeof(int64_t) 2*sizeof(int64_t) ... - return __riscv_vloxei64_v_i64m1(from, idx, unpacket_traits::size); + Packet1Xul idx = + __riscv_vsrl_vx_u64m1(__riscv_vid_v_u64m1(unpacket_traits::size), 1, unpacket_traits::size); + return __riscv_vrgather_vv_i64m1(pload(from), idx, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE Packet1Xl ploadquad(const numext::int64_t* from) { - Packet1Xul idx = __riscv_vid_v_u64m1(unpacket_traits::size); - idx = __riscv_vsll_vx_u64m1(__riscv_vand_vx_u64m1(idx, 0xfffffffffffffffcu, unpacket_traits::size), 1, - unpacket_traits::size); - ; - return __riscv_vloxei64_v_i64m1(from, idx, unpacket_traits::size); + Packet1Xul idx = + __riscv_vsrl_vx_u64m1(__riscv_vid_v_u64m1(unpacket_traits::size), 2, unpacket_traits::size); + return __riscv_vrgather_vv_i64m1(pload(from), idx, unpacket_traits::size); } template <> @@ -1350,7 +1367,7 @@ EIGEN_DEVICE_FUNC inline Packet1Xl pgather(const num template <> EIGEN_DEVICE_FUNC inline void pscatter(numext::int64_t* to, const Packet1Xl& from, - Index stride) { + Index stride) { __riscv_vsse64(to, stride * sizeof(numext::int64_t), from, unpacket_traits::size); } @@ -1362,7 +1379,7 @@ EIGEN_STRONG_INLINE numext::int64_t pfirst(const Packet1Xl& a) { template <> EIGEN_STRONG_INLINE Packet1Xl preverse(const Packet1Xl& a) { Packet1Xul idx = __riscv_vrsub_vx_u64m1(__riscv_vid_v_u64m1(unpacket_traits::size), - unpacket_traits::size - 1, unpacket_traits::size); + unpacket_traits::size - 1, unpacket_traits::size); return __riscv_vrgather_vv_i64m1(a, idx, unpacket_traits::size); } @@ -1621,6 +1638,12 @@ EIGEN_STRONG_INLINE Packet1Xd pabs(const Packet1Xd& a) { return __riscv_vfabs_v_f64m1(a, unpacket_traits::size); } +template <> +EIGEN_STRONG_INLINE Packet1Xd pabsdiff(const Packet1Xd& a, const Packet1Xd& b) { + return __riscv_vfabs_v_f64m1(__riscv_vfsub_vv_f64m1(a, b, unpacket_traits::size), + unpacket_traits::size); +} + template <> EIGEN_STRONG_INLINE Packet1Xd pset1(const double& from) { return __riscv_vfmv_v_f_f64m1(from, unpacket_traits::size); @@ -1639,6 +1662,16 @@ EIGEN_STRONG_INLINE Packet1Xd plset(const double& a) { return __riscv_vfadd_vf_f64m1(idx, a, unpacket_traits::size); } +template <> +EIGEN_STRONG_INLINE void pbroadcast4(const double* a, Packet1Xd& a0, Packet1Xd& a1, Packet1Xd& a2, + Packet1Xd& a3) { + vfloat64m1_t aa = __riscv_vle64_v_f64m1(a, 4); + a0 = __riscv_vrgather_vx_f64m1(aa, 0, unpacket_traits::size); + a1 = __riscv_vrgather_vx_f64m1(aa, 1, unpacket_traits::size); + a2 = __riscv_vrgather_vx_f64m1(aa, 2, unpacket_traits::size); + a3 = __riscv_vrgather_vx_f64m1(aa, 3, unpacket_traits::size); +} + template <> EIGEN_STRONG_INLINE Packet1Xd padd(const Packet1Xd& a, const Packet1Xd& b) { return __riscv_vfadd_vv_f64m1(a, b, unpacket_traits::size); @@ -1654,6 +1687,12 @@ EIGEN_STRONG_INLINE Packet1Xd pnegate(const Packet1Xd& a) { return __riscv_vfneg_v_f64m1(a, unpacket_traits::size); } +template <> +EIGEN_STRONG_INLINE Packet1Xd psignbit(const Packet1Xd& a) { + return __riscv_vreinterpret_v_i64m1_f64m1( + __riscv_vsra_vx_i64m1(__riscv_vreinterpret_v_f64m1_i64m1(a), 63, unpacket_traits::size)); +} + template <> EIGEN_STRONG_INLINE Packet1Xd pconj(const Packet1Xd& a) { return a; @@ -1792,19 +1831,16 @@ EIGEN_STRONG_INLINE Packet1Xd ploadu(const double* from) { template <> EIGEN_STRONG_INLINE Packet1Xd ploaddup(const double* from) { - Packet1Xul idx = __riscv_vid_v_u64m1(unpacket_traits::size); - idx = __riscv_vsll_vx_u64m1(__riscv_vand_vx_u64m1(idx, 0xfffffffffffffffeu, unpacket_traits::size), 2, - unpacket_traits::size); - return __riscv_vloxei64_v_f64m1(from, idx, unpacket_traits::size); + Packet1Xul idx = + __riscv_vsrl_vx_u64m1(__riscv_vid_v_u64m1(unpacket_traits::size), 1, unpacket_traits::size); + return __riscv_vrgather_vv_f64m1(pload(from), idx, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE Packet1Xd ploadquad(const double* from) { - Packet1Xul idx = __riscv_vid_v_u64m1(unpacket_traits::size); - idx = __riscv_vsll_vx_u64m1(__riscv_vand_vx_u64m1(idx, 0xfffffffffffffffcu, unpacket_traits::size), 1, - unpacket_traits::size); - ; - return __riscv_vloxei64_v_f64m1(from, idx, unpacket_traits::size); + Packet1Xul idx = + __riscv_vsrl_vx_u64m1(__riscv_vid_v_u64m1(unpacket_traits::size), 2, unpacket_traits::size); + return __riscv_vrgather_vv_f64m1(pload(from), idx, unpacket_traits::size); } template <> @@ -1845,7 +1881,7 @@ EIGEN_STRONG_INLINE Packet1Xd print(const Packet1Xd& a) { PacketMask64 mask = __riscv_vmfne_vv_f64m1_b64(a, a, unpacket_traits::size); const Packet1Xd x = __riscv_vfadd_vv_f64m1_tumu(mask, a, a, a, unpacket_traits::size); const Packet1Xd new_x = __riscv_vfcvt_f_x_v_f64m1(__riscv_vfcvt_x_f_v_i64m1(a, unpacket_traits::size), - unpacket_traits::size); + unpacket_traits::size); mask = __riscv_vmflt_vv_f64m1_b64(abs_a, limit, unpacket_traits::size); Packet1Xd signed_x = __riscv_vfsgnj_vv_f64m1(new_x, x, unpacket_traits::size); @@ -1863,7 +1899,7 @@ EIGEN_STRONG_INLINE Packet1Xd pfloor(const Packet1Xd& a) { template <> EIGEN_STRONG_INLINE Packet1Xd preverse(const Packet1Xd& a) { Packet1Xul idx = __riscv_vrsub_vx_u64m1(__riscv_vid_v_u64m1(unpacket_traits::size), - unpacket_traits::size - 1, unpacket_traits::size); + unpacket_traits::size - 1, unpacket_traits::size); return __riscv_vrgather_vv_f64m1(a, idx, unpacket_traits::size); } @@ -1903,22 +1939,20 @@ EIGEN_STRONG_INLINE double predux_mul(const Packet1Xd& a) { template <> EIGEN_STRONG_INLINE double predux_min(const Packet1Xd& a) { - return ( - std::min)(__riscv_vfmv_f(__riscv_vfredmin_vs_f64m1_f64m1( - a, - __riscv_vfmv_v_f_f64m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size), - unpacket_traits::size)), - (std::numeric_limits::max)()); + return (std::min)( + __riscv_vfmv_f(__riscv_vfredmin_vs_f64m1_f64m1( + a, __riscv_vfmv_v_f_f64m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size), + unpacket_traits::size)), + (std::numeric_limits::max)()); } template <> EIGEN_STRONG_INLINE double predux_max(const Packet1Xd& a) { - return ( - std::max)(__riscv_vfmv_f(__riscv_vfredmax_vs_f64m1_f64m1( - a, - __riscv_vfmv_v_f_f64m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size), - unpacket_traits::size)), - -(std::numeric_limits::max)()); + return (std::max)( + __riscv_vfmv_f(__riscv_vfredmax_vs_f64m1_f64m1( + a, __riscv_vfmv_v_f_f64m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size), + unpacket_traits::size)), + -(std::numeric_limits::max)()); } template @@ -1968,18 +2002,22 @@ EIGEN_STRONG_INLINE Packet1Xd pselect(const PacketMask64& mask, const Packet1Xd& return __riscv_vmerge_vvm_f64m1(b, a, mask, unpacket_traits::size); } +EIGEN_STRONG_INLINE Packet1Xd pselect(const Packet1Xd& mask, const Packet1Xd& a, const Packet1Xd& b) { + PacketMask64 mask2 = + __riscv_vmsne_vx_i64m1_b64(__riscv_vreinterpret_v_f64m1_i64m1(mask), 0, unpacket_traits::size); + return __riscv_vmerge_vvm_f64m1(b, a, mask2, unpacket_traits::size); +} + /********************************* short **************************************/ typedef eigen_packet_wrapper Packet1Xs; typedef eigen_packet_wrapper Packet1Xsu; -typedef eigen_packet_wrapper - Packet2Xs; +typedef eigen_packet_wrapper Packet2Xs; typedef eigen_packet_wrapper Packet2Xsu; -typedef eigen_packet_wrapper - Packet4Xs; +typedef eigen_packet_wrapper Packet4Xs; typedef eigen_packet_wrapper Packet4Xsu; @@ -2273,18 +2311,17 @@ EIGEN_STRONG_INLINE Packet1Xs ploadu(const numext::int16_t* from) { template <> EIGEN_STRONG_INLINE Packet1Xs ploaddup(const numext::int16_t* from) { - Packet1Xsu idx = __riscv_vid_v_u16m1(unpacket_traits::size); - idx = __riscv_vand_vx_u16m1(idx, 0xfffeu, unpacket_traits::size); - // idx = 0 0 sizeof(int16_t) sizeof(int16_t) 2*sizeof(int16_t) 2*sizeof(int16_t) ... - return __riscv_vloxei16_v_i16m1(from, idx, unpacket_traits::size); + Packet1Xsu data = __riscv_vreinterpret_v_i16m1_u16m1(pload(from)); + return __riscv_vreinterpret_v_i32m1_i16m1(__riscv_vreinterpret_v_u32m1_i32m1(__riscv_vlmul_trunc_v_u32m2_u32m1( + __riscv_vwmaccu_vx_u32m2(__riscv_vwaddu_vv_u32m2(data, data, unpacket_traits::size), 0xffffu, data, + unpacket_traits::size)))); } template <> EIGEN_STRONG_INLINE Packet1Xs ploadquad(const numext::int16_t* from) { - Packet1Xsu idx = __riscv_vid_v_u16m1(unpacket_traits::size); - idx = __riscv_vsrl_vx_u16m1(__riscv_vand_vx_u16m1(idx, 0xfffcu, unpacket_traits::size), 1, - unpacket_traits::size); - return __riscv_vloxei16_v_i16m1(from, idx, unpacket_traits::size); + Packet1Xsu idx = + __riscv_vsrl_vx_u16m1(__riscv_vid_v_u16m1(unpacket_traits::size), 2, unpacket_traits::size); + return __riscv_vrgather_vv_i16m1(pload(from), idx, unpacket_traits::size); } template <> @@ -2304,7 +2341,7 @@ EIGEN_DEVICE_FUNC inline Packet1Xs pgather(const num template <> EIGEN_DEVICE_FUNC inline void pscatter(numext::int16_t* to, const Packet1Xs& from, - Index stride) { + Index stride) { __riscv_vsse16(to, stride * sizeof(numext::int16_t), from, unpacket_traits::size); } @@ -2316,7 +2353,7 @@ EIGEN_STRONG_INLINE numext::int16_t pfirst(const Packet1Xs& a) { template <> EIGEN_STRONG_INLINE Packet1Xs preverse(const Packet1Xs& a) { Packet1Xsu idx = __riscv_vrsub_vx_u16m1(__riscv_vid_v_u16m1(unpacket_traits::size), - unpacket_traits::size - 1, unpacket_traits::size); + unpacket_traits::size - 1, unpacket_traits::size); return __riscv_vrgather_vv_i16m1(a, idx, unpacket_traits::size); } diff --git a/Eigen/src/Core/arch/RVV10/PacketMath2.h b/Eigen/src/Core/arch/RVV10/PacketMath2.h index e230ba16b..d99a154cc 100644 --- a/Eigen/src/Core/arch/RVV10/PacketMath2.h +++ b/Eigen/src/Core/arch/RVV10/PacketMath2.h @@ -2,6 +2,7 @@ // for linear algebra. // // Copyright (C) 2024 Kseniya Zaytseva +// Copyright (C) 2025 Chip Kerchner // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed @@ -166,18 +167,17 @@ EIGEN_STRONG_INLINE Packet2Xi ploadu(const numext::int32_t* from) { template <> EIGEN_STRONG_INLINE Packet2Xi ploaddup(const numext::int32_t* from) { - Packet2Xu idx = __riscv_vid_v_u32m2(unpacket_traits::size); - idx = __riscv_vsll_vx_u32m2(__riscv_vand_vx_u32m2(idx, 0xfffffffeu, unpacket_traits::size), 1, - unpacket_traits::size); - // idx = 0 0 sizeof(int32_t) sizeof(int32_t) 2*sizeof(int32_t) 2*sizeof(int32_t) ... - return __riscv_vloxei32_v_i32m2(from, idx, unpacket_traits::size); + Packet2Xu data = __riscv_vreinterpret_v_i32m2_u32m2(pload(from)); + return __riscv_vreinterpret_v_i64m2_i32m2(__riscv_vreinterpret_v_u64m2_i64m2(__riscv_vlmul_trunc_v_u64m4_u64m2( + __riscv_vwmaccu_vx_u64m4(__riscv_vwaddu_vv_u64m4(data, data, unpacket_traits::size), 0xffffffffu, data, + unpacket_traits::size)))); } template <> EIGEN_STRONG_INLINE Packet2Xi ploadquad(const numext::int32_t* from) { - Packet2Xu idx = __riscv_vid_v_u32m2(unpacket_traits::size); - idx = __riscv_vand_vx_u32m2(idx, 0xfffffffcu, unpacket_traits::size); - return __riscv_vloxei32_v_i32m2(from, idx, unpacket_traits::size); + Packet2Xu idx = + __riscv_vsrl_vx_u32m2(__riscv_vid_v_u32m2(unpacket_traits::size), 2, unpacket_traits::size); + return __riscv_vrgather_vv_i32m2(pload(from), idx, unpacket_traits::size); } template <> @@ -191,14 +191,13 @@ EIGEN_STRONG_INLINE void pstoreu(numext::int32_t* to, const Pac } template <> -EIGEN_DEVICE_FUNC inline Packet2Xi pgather(const numext::int32_t* from, - Index stride) { +EIGEN_DEVICE_FUNC inline Packet2Xi pgather(const numext::int32_t* from, Index stride) { return __riscv_vlse32_v_i32m2(from, stride * sizeof(numext::int32_t), unpacket_traits::size); } template <> EIGEN_DEVICE_FUNC inline void pscatter(numext::int32_t* to, const Packet2Xi& from, - Index stride) { + Index stride) { __riscv_vsse32(to, stride * sizeof(numext::int32_t), from, unpacket_traits::size); } @@ -209,9 +208,8 @@ EIGEN_STRONG_INLINE numext::int32_t pfirst(const Packet2Xi& a) { template <> EIGEN_STRONG_INLINE Packet2Xi preverse(const Packet2Xi& a) { - Packet2Xu idx = - __riscv_vrsub_vx_u32m2(__riscv_vid_v_u32m2(unpacket_traits::size), - unpacket_traits::size - 1, unpacket_traits::size); + Packet2Xu idx = __riscv_vrsub_vx_u32m2(__riscv_vid_v_u32m2(unpacket_traits::size), + unpacket_traits::size - 1, unpacket_traits::size); return __riscv_vrgather_vv_i32m2(a, idx, unpacket_traits::size); } @@ -224,14 +222,14 @@ EIGEN_STRONG_INLINE Packet2Xi pabs(const Packet2Xi& a) { template <> EIGEN_STRONG_INLINE numext::int32_t predux(const Packet2Xi& a) { - return __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1( - a, __riscv_vmv_v_x_i32m1(0, unpacket_traits::size / 2), unpacket_traits::size)); + return __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(a, __riscv_vmv_v_x_i32m1(0, unpacket_traits::size / 2), + unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE numext::int32_t predux_mul(const Packet2Xi& a) { return predux_mul(__riscv_vmul_vv_i32m1(__riscv_vget_v_i32m2_i32m1(a, 0), __riscv_vget_v_i32m2_i32m1(a, 1), - unpacket_traits::size)); + unpacket_traits::size)); } template <> @@ -264,18 +262,18 @@ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { template EIGEN_STRONG_INLINE -typename std::enable_if::value && (unpacket_traits::size % 8) == 0, - Packet2Xi>::type -predux_half(const Packet4Xi& a) { + typename std::enable_if::value && (unpacket_traits::size % 8) == 0, + Packet2Xi>::type + predux_half(const Packet4Xi& a) { return __riscv_vadd_vv_i32m2(__riscv_vget_v_i32m4_i32m2(a, 0), __riscv_vget_v_i32m4_i32m2(a, 1), unpacket_traits::size); } template EIGEN_STRONG_INLINE -typename std::enable_if::value && (unpacket_traits::size % 8) == 0, - Packet1Xi>::type -predux_half(const Packet2Xi& a) { + typename std::enable_if::value && (unpacket_traits::size % 8) == 0, + Packet1Xi>::type + predux_half(const Packet2Xi& a) { return __riscv_vadd_vv_i32m1(__riscv_vget_v_i32m2_i32m1(a, 0), __riscv_vget_v_i32m2_i32m1(a, 1), unpacket_traits::size); } @@ -297,6 +295,12 @@ EIGEN_STRONG_INLINE Packet2Xf pabs(const Packet2Xf& a) { return __riscv_vfabs_v_f32m2(a, unpacket_traits::size); } +template <> +EIGEN_STRONG_INLINE Packet2Xf pabsdiff(const Packet2Xf& a, const Packet2Xf& b) { + return __riscv_vfabs_v_f32m2(__riscv_vfsub_vv_f32m2(a, b, unpacket_traits::size), + unpacket_traits::size); +} + template <> EIGEN_STRONG_INLINE Packet2Xf pset1(const float& from) { return __riscv_vfmv_v_f_f32m2(from, unpacket_traits::size); @@ -315,6 +319,16 @@ EIGEN_STRONG_INLINE Packet2Xf plset(const float& a) { return __riscv_vfadd_vf_f32m2(idx, a, unpacket_traits::size); } +template <> +EIGEN_STRONG_INLINE void pbroadcast4(const float* a, Packet2Xf& a0, Packet2Xf& a1, Packet2Xf& a2, + Packet2Xf& a3) { + vfloat32m2_t aa = __riscv_vle32_v_f32m2(a, 4); + a0 = __riscv_vrgather_vx_f32m2(aa, 0, unpacket_traits::size); + a1 = __riscv_vrgather_vx_f32m2(aa, 1, unpacket_traits::size); + a2 = __riscv_vrgather_vx_f32m2(aa, 2, unpacket_traits::size); + a3 = __riscv_vrgather_vx_f32m2(aa, 3, unpacket_traits::size); +} + template <> EIGEN_STRONG_INLINE Packet2Xf padd(const Packet2Xf& a, const Packet2Xf& b) { return __riscv_vfadd_vv_f32m2(a, b, unpacket_traits::size); @@ -330,6 +344,12 @@ EIGEN_STRONG_INLINE Packet2Xf pnegate(const Packet2Xf& a) { return __riscv_vfneg_v_f32m2(a, unpacket_traits::size); } +template <> +EIGEN_STRONG_INLINE Packet2Xf psignbit(const Packet2Xf& a) { + return __riscv_vreinterpret_v_i32m2_f32m2( + __riscv_vsra_vx_i32m2(__riscv_vreinterpret_v_f32m2_i32m2(a), 31, unpacket_traits::size)); +} + template <> EIGEN_STRONG_INLINE Packet2Xf pconj(const Packet2Xf& a) { return a; @@ -367,8 +387,7 @@ EIGEN_STRONG_INLINE Packet2Xf pnmsub(const Packet2Xf& a, const Packet2Xf& b, con template <> EIGEN_STRONG_INLINE Packet2Xf pmin(const Packet2Xf& a, const Packet2Xf& b) { - Packet2Xf nans = - __riscv_vfmv_v_f_f32m2((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); + Packet2Xf nans = __riscv_vfmv_v_f_f32m2((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); PacketMask16 mask = __riscv_vmfeq_vv_f32m2_b16(a, a, unpacket_traits::size); PacketMask16 mask2 = __riscv_vmfeq_vv_f32m2_b16(b, b, unpacket_traits::size); mask = __riscv_vmand_mm_b16(mask, mask2, unpacket_traits::size); @@ -388,8 +407,7 @@ EIGEN_STRONG_INLINE Packet2Xf pmin(const Packet2Xf& template <> EIGEN_STRONG_INLINE Packet2Xf pmax(const Packet2Xf& a, const Packet2Xf& b) { - Packet2Xf nans = - __riscv_vfmv_v_f_f32m2((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); + Packet2Xf nans = __riscv_vfmv_v_f_f32m2((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); PacketMask16 mask = __riscv_vmfeq_vv_f32m2_b16(a, a, unpacket_traits::size); PacketMask16 mask2 = __riscv_vmfeq_vv_f32m2_b16(b, b, unpacket_traits::size); mask = __riscv_vmand_mm_b16(mask, mask2, unpacket_traits::size); @@ -410,22 +428,19 @@ EIGEN_STRONG_INLINE Packet2Xf pmax(const Packet2Xf& template <> EIGEN_STRONG_INLINE Packet2Xf pcmp_le(const Packet2Xf& a, const Packet2Xf& b) { PacketMask16 mask = __riscv_vmfle_vv_f32m2_b16(a, b, unpacket_traits::size); - return __riscv_vmerge_vvm_f32m2(pzero(a), ptrue(a), mask, - unpacket_traits::size); + return __riscv_vmerge_vvm_f32m2(pzero(a), ptrue(a), mask, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE Packet2Xf pcmp_lt(const Packet2Xf& a, const Packet2Xf& b) { PacketMask16 mask = __riscv_vmflt_vv_f32m2_b16(a, b, unpacket_traits::size); - return __riscv_vmerge_vvm_f32m2(pzero(a), ptrue(a), mask, - unpacket_traits::size); + return __riscv_vmerge_vvm_f32m2(pzero(a), ptrue(a), mask, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE Packet2Xf pcmp_eq(const Packet2Xf& a, const Packet2Xf& b) { PacketMask16 mask = __riscv_vmfeq_vv_f32m2_b16(a, b, unpacket_traits::size); - return __riscv_vmerge_vvm_f32m2(pzero(a), ptrue(a), mask, - unpacket_traits::size); + return __riscv_vmerge_vvm_f32m2(pzero(a), ptrue(a), mask, unpacket_traits::size); } template <> @@ -434,26 +449,33 @@ EIGEN_STRONG_INLINE Packet2Xf pcmp_lt_or_nan(const Packet2Xf& a, cons return __riscv_vfmerge_vfm_f32m2(ptrue(a), 0.0f, mask, unpacket_traits::size); } +EIGEN_STRONG_INLINE Packet2Xf pselect(const PacketMask16& mask, const Packet2Xf& a, const Packet2Xf& b) { + return __riscv_vmerge_vvm_f32m2(b, a, mask, unpacket_traits::size); +} + +EIGEN_STRONG_INLINE Packet2Xf pselect(const Packet2Xf& mask, const Packet2Xf& a, const Packet2Xf& b) { + PacketMask16 mask2 = + __riscv_vmsne_vx_i32m2_b16(__riscv_vreinterpret_v_f32m2_i32m2(mask), 0, unpacket_traits::size); + return __riscv_vmerge_vvm_f32m2(b, a, mask2, unpacket_traits::size); +} + // Logical Operations are not supported for float, so reinterpret casts template <> EIGEN_STRONG_INLINE Packet2Xf pand(const Packet2Xf& a, const Packet2Xf& b) { - return __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vand_vv_u32m2(__riscv_vreinterpret_v_f32m2_u32m2(a), - __riscv_vreinterpret_v_f32m2_u32m2(b), - unpacket_traits::size)); + return __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vand_vv_u32m2( + __riscv_vreinterpret_v_f32m2_u32m2(a), __riscv_vreinterpret_v_f32m2_u32m2(b), unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE Packet2Xf por(const Packet2Xf& a, const Packet2Xf& b) { - return __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vor_vv_u32m2(__riscv_vreinterpret_v_f32m2_u32m2(a), - __riscv_vreinterpret_v_f32m2_u32m2(b), - unpacket_traits::size)); + return __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vor_vv_u32m2( + __riscv_vreinterpret_v_f32m2_u32m2(a), __riscv_vreinterpret_v_f32m2_u32m2(b), unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE Packet2Xf pxor(const Packet2Xf& a, const Packet2Xf& b) { - return __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vxor_vv_u32m2(__riscv_vreinterpret_v_f32m2_u32m2(a), - __riscv_vreinterpret_v_f32m2_u32m2(b), - unpacket_traits::size)); + return __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vxor_vv_u32m2( + __riscv_vreinterpret_v_f32m2_u32m2(a), __riscv_vreinterpret_v_f32m2_u32m2(b), unpacket_traits::size)); } template <> @@ -476,17 +498,18 @@ EIGEN_STRONG_INLINE Packet2Xf ploadu(const float* from) { template <> EIGEN_STRONG_INLINE Packet2Xf ploaddup(const float* from) { - Packet2Xu idx = __riscv_vid_v_u32m2(unpacket_traits::size); - idx = __riscv_vsll_vx_u32m2(__riscv_vand_vx_u32m2(idx, 0xfffffffeu, unpacket_traits::size), 1, - unpacket_traits::size); - return __riscv_vloxei32_v_f32m2(from, idx, unpacket_traits::size); + Packet2Xu data = __riscv_vreinterpret_v_f32m2_u32m2(pload(from)); + return __riscv_vreinterpret_v_i32m2_f32m2( + __riscv_vreinterpret_v_i64m2_i32m2(__riscv_vreinterpret_v_u64m2_i64m2(__riscv_vlmul_trunc_v_u64m4_u64m2( + __riscv_vwmaccu_vx_u64m4(__riscv_vwaddu_vv_u64m4(data, data, unpacket_traits::size), 0xffffffffu, + data, unpacket_traits::size))))); } template <> EIGEN_STRONG_INLINE Packet2Xf ploadquad(const float* from) { - Packet2Xu idx = __riscv_vid_v_u32m2(unpacket_traits::size); - idx = __riscv_vand_vx_u32m2(idx, 0xfffffffcu, unpacket_traits::size); - return __riscv_vloxei32_v_f32m2(from, idx, unpacket_traits::size); + Packet2Xu idx = + __riscv_vsrl_vx_u32m2(__riscv_vid_v_u32m2(unpacket_traits::size), 2, unpacket_traits::size); + return __riscv_vrgather_vv_f32m2(pload(from), idx, unpacket_traits::size); } template <> @@ -526,8 +549,8 @@ EIGEN_STRONG_INLINE Packet2Xf print(const Packet2Xf& a) { PacketMask16 mask = __riscv_vmfne_vv_f32m2_b16(a, a, unpacket_traits::size); const Packet2Xf x = __riscv_vfadd_vv_f32m2_tumu(mask, a, a, a, unpacket_traits::size); - const Packet2Xf new_x = __riscv_vfcvt_f_x_v_f32m2( - __riscv_vfcvt_x_f_v_i32m2(a, unpacket_traits::size), unpacket_traits::size); + const Packet2Xf new_x = __riscv_vfcvt_f_x_v_f32m2(__riscv_vfcvt_x_f_v_i32m2(a, unpacket_traits::size), + unpacket_traits::size); mask = __riscv_vmflt_vv_f32m2_b16(abs_a, limit, unpacket_traits::size); Packet2Xf signed_x = __riscv_vfsgnj_vv_f32m2(new_x, x, unpacket_traits::size); @@ -544,9 +567,8 @@ EIGEN_STRONG_INLINE Packet2Xf pfloor(const Packet2Xf& a) { template <> EIGEN_STRONG_INLINE Packet2Xf preverse(const Packet2Xf& a) { - Packet2Xu idx = - __riscv_vrsub_vx_u32m2(__riscv_vid_v_u32m2(unpacket_traits::size), - unpacket_traits::size - 1, unpacket_traits::size); + Packet2Xu idx = __riscv_vrsub_vx_u32m2(__riscv_vid_v_u32m2(unpacket_traits::size), + unpacket_traits::size - 1, unpacket_traits::size); return __riscv_vrgather_vv_f32m2(a, idx, unpacket_traits::size); } @@ -563,28 +585,26 @@ EIGEN_STRONG_INLINE float predux(const Packet2Xf& a) { template <> EIGEN_STRONG_INLINE float predux_mul(const Packet2Xf& a) { - return predux_mul(__riscv_vfmul_vv_f32m1(__riscv_vget_v_f32m2_f32m1(a, 0), __riscv_vget_v_f32m2_f32m1(a, 1), - unpacket_traits::size)); + return predux_mul(__riscv_vfmul_vv_f32m1( + __riscv_vget_v_f32m2_f32m1(a, 0), __riscv_vget_v_f32m2_f32m1(a, 1), unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE float predux_min(const Packet2Xf& a) { - return (std::min)(__riscv_vfmv_f(__riscv_vfredmin_vs_f32m2_f32m1( - a, - __riscv_vfmv_v_f_f32m1((std::numeric_limits::quiet_NaN)(), - unpacket_traits::size / 2), - unpacket_traits::size)), - (std::numeric_limits::max)()); + return (std::min)( + __riscv_vfmv_f(__riscv_vfredmin_vs_f32m2_f32m1( + a, __riscv_vfmv_v_f_f32m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size / 2), + unpacket_traits::size)), + (std::numeric_limits::max)()); } template <> EIGEN_STRONG_INLINE float predux_max(const Packet2Xf& a) { - return (std::max)(__riscv_vfmv_f(__riscv_vfredmax_vs_f32m2_f32m1( - a, - __riscv_vfmv_v_f_f32m1((std::numeric_limits::quiet_NaN)(), - unpacket_traits::size / 2), - unpacket_traits::size)), - -(std::numeric_limits::max)()); + return (std::max)( + __riscv_vfmv_f(__riscv_vfredmax_vs_f32m2_f32m1( + a, __riscv_vfmv_v_f_f32m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size / 2), + unpacket_traits::size)), + -(std::numeric_limits::max)()); } template @@ -609,18 +629,18 @@ EIGEN_STRONG_INLINE Packet2Xf pldexp(const Packet2Xf& a, const Packet template EIGEN_STRONG_INLINE -typename std::enable_if::value && (unpacket_traits::size % 8) == 0, - Packet2Xf>::type -predux_half(const Packet4Xf& a) { + typename std::enable_if::value && (unpacket_traits::size % 8) == 0, + Packet2Xf>::type + predux_half(const Packet4Xf& a) { return __riscv_vfadd_vv_f32m2(__riscv_vget_v_f32m4_f32m2(a, 0), __riscv_vget_v_f32m4_f32m2(a, 1), unpacket_traits::size); } template EIGEN_STRONG_INLINE -typename std::enable_if::value && (unpacket_traits::size % 8) == 0, - Packet1Xf>::type -predux_half(const Packet2Xf& a) { + typename std::enable_if::value && (unpacket_traits::size % 8) == 0, + Packet1Xf>::type + predux_half(const Packet2Xf& a) { return __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(a, 0), __riscv_vget_v_f32m2_f32m1(a, 1), unpacket_traits::size); } @@ -775,19 +795,16 @@ EIGEN_STRONG_INLINE Packet2Xl ploadu(const numext::int64_t* from) { template <> EIGEN_STRONG_INLINE Packet2Xl ploaddup(const numext::int64_t* from) { - Packet2Xul idx = __riscv_vid_v_u64m2(unpacket_traits::size); - idx = __riscv_vsll_vx_u64m2(__riscv_vand_vx_u64m2(idx, 0xfffffffffffffffeu, unpacket_traits::size), 2, - unpacket_traits::size); - // idx = 0 0 sizeof(int64_t) sizeof(int64_t) 2*sizeof(int64_t) 2*sizeof(int64_t) ... - return __riscv_vloxei64_v_i64m2(from, idx, unpacket_traits::size); + Packet2Xul idx = + __riscv_vsrl_vx_u64m2(__riscv_vid_v_u64m2(unpacket_traits::size), 1, unpacket_traits::size); + return __riscv_vrgather_vv_i64m2(pload(from), idx, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE Packet2Xl ploadquad(const numext::int64_t* from) { - Packet2Xul idx = __riscv_vid_v_u64m2(unpacket_traits::size); - idx = __riscv_vsll_vx_u64m2(__riscv_vand_vx_u64m2(idx, 0xfffffffffffffffcu, unpacket_traits::size), 1, - unpacket_traits::size); - return __riscv_vloxei64_v_i64m2(from, idx, unpacket_traits::size); + Packet2Xul idx = + __riscv_vsrl_vx_u64m2(__riscv_vid_v_u64m2(unpacket_traits::size), 2, unpacket_traits::size); + return __riscv_vrgather_vv_i64m2(pload(from), idx, unpacket_traits::size); } template <> @@ -801,14 +818,13 @@ EIGEN_STRONG_INLINE void pstoreu(numext::int64_t* to, const Pac } template <> -EIGEN_DEVICE_FUNC inline Packet2Xl pgather(const numext::int64_t* from, - Index stride) { +EIGEN_DEVICE_FUNC inline Packet2Xl pgather(const numext::int64_t* from, Index stride) { return __riscv_vlse64_v_i64m2(from, stride * sizeof(numext::int64_t), unpacket_traits::size); } template <> EIGEN_DEVICE_FUNC inline void pscatter(numext::int64_t* to, const Packet2Xl& from, - Index stride) { + Index stride) { __riscv_vsse64(to, stride * sizeof(numext::int64_t), from, unpacket_traits::size); } @@ -819,9 +835,8 @@ EIGEN_STRONG_INLINE numext::int64_t pfirst(const Packet2Xl& a) { template <> EIGEN_STRONG_INLINE Packet2Xl preverse(const Packet2Xl& a) { - Packet2Xul idx = - __riscv_vrsub_vx_u64m2(__riscv_vid_v_u64m2(unpacket_traits::size), - unpacket_traits::size - 1, unpacket_traits::size); + Packet2Xul idx = __riscv_vrsub_vx_u64m2(__riscv_vid_v_u64m2(unpacket_traits::size), + unpacket_traits::size - 1, unpacket_traits::size); return __riscv_vrgather_vv_i64m2(a, idx, unpacket_traits::size); } @@ -834,14 +849,14 @@ EIGEN_STRONG_INLINE Packet2Xl pabs(const Packet2Xl& a) { template <> EIGEN_STRONG_INLINE numext::int64_t predux(const Packet2Xl& a) { - return __riscv_vmv_x(__riscv_vredsum_vs_i64m2_i64m1( - a, __riscv_vmv_v_x_i64m1(0, unpacket_traits::size / 2), unpacket_traits::size)); + return __riscv_vmv_x(__riscv_vredsum_vs_i64m2_i64m1(a, __riscv_vmv_v_x_i64m1(0, unpacket_traits::size / 2), + unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE numext::int64_t predux_mul(const Packet2Xl& a) { return predux_mul(__riscv_vmul_vv_i64m1(__riscv_vget_v_i64m2_i64m1(a, 0), __riscv_vget_v_i64m2_i64m1(a, 1), - unpacket_traits::size)); + unpacket_traits::size)); } template <> @@ -874,18 +889,18 @@ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { template EIGEN_STRONG_INLINE -typename std::enable_if::value && (unpacket_traits::size % 8) == 0, - Packet2Xl>::type -predux_half(const Packet4Xl& a) { + typename std::enable_if::value && (unpacket_traits::size % 8) == 0, + Packet2Xl>::type + predux_half(const Packet4Xl& a) { return __riscv_vadd_vv_i64m2(__riscv_vget_v_i64m4_i64m2(a, 0), __riscv_vget_v_i64m4_i64m2(a, 1), unpacket_traits::size); } template EIGEN_STRONG_INLINE -typename std::enable_if::value && (unpacket_traits::size % 8) == 0, - Packet1Xl>::type -predux_half(const Packet2Xl& a) { + typename std::enable_if::value && (unpacket_traits::size % 8) == 0, + Packet1Xl>::type + predux_half(const Packet2Xl& a) { return __riscv_vadd_vv_i64m1(__riscv_vget_v_i64m2_i64m1(a, 0), __riscv_vget_v_i64m2_i64m1(a, 1), unpacket_traits::size); } @@ -907,6 +922,12 @@ EIGEN_STRONG_INLINE Packet2Xd pabs(const Packet2Xd& a) { return __riscv_vfabs_v_f64m2(a, unpacket_traits::size); } +template <> +EIGEN_STRONG_INLINE Packet2Xd pabsdiff(const Packet2Xd& a, const Packet2Xd& b) { + return __riscv_vfabs_v_f64m2(__riscv_vfsub_vv_f64m2(a, b, unpacket_traits::size), + unpacket_traits::size); +} + template <> EIGEN_STRONG_INLINE Packet2Xd pset1(const double& from) { return __riscv_vfmv_v_f_f64m2(from, unpacket_traits::size); @@ -925,6 +946,16 @@ EIGEN_STRONG_INLINE Packet2Xd plset(const double& a) { return __riscv_vfadd_vf_f64m2(idx, a, unpacket_traits::size); } +template <> +EIGEN_STRONG_INLINE void pbroadcast4(const double* a, Packet2Xd& a0, Packet2Xd& a1, Packet2Xd& a2, + Packet2Xd& a3) { + vfloat64m2_t aa = __riscv_vle64_v_f64m2(a, 4); + a0 = __riscv_vrgather_vx_f64m2(aa, 0, unpacket_traits::size); + a1 = __riscv_vrgather_vx_f64m2(aa, 1, unpacket_traits::size); + a2 = __riscv_vrgather_vx_f64m2(aa, 2, unpacket_traits::size); + a3 = __riscv_vrgather_vx_f64m2(aa, 3, unpacket_traits::size); +} + template <> EIGEN_STRONG_INLINE Packet2Xd padd(const Packet2Xd& a, const Packet2Xd& b) { return __riscv_vfadd_vv_f64m2(a, b, unpacket_traits::size); @@ -940,6 +971,12 @@ EIGEN_STRONG_INLINE Packet2Xd pnegate(const Packet2Xd& a) { return __riscv_vfneg_v_f64m2(a, unpacket_traits::size); } +template <> +EIGEN_STRONG_INLINE Packet2Xd psignbit(const Packet2Xd& a) { + return __riscv_vreinterpret_v_i64m2_f64m2( + __riscv_vsra_vx_i64m2(__riscv_vreinterpret_v_f64m2_i64m2(a), 63, unpacket_traits::size)); +} + template <> EIGEN_STRONG_INLINE Packet2Xd pconj(const Packet2Xd& a) { return a; @@ -977,8 +1014,7 @@ EIGEN_STRONG_INLINE Packet2Xd pnmsub(const Packet2Xd& a, const Packet2Xd& b, con template <> EIGEN_STRONG_INLINE Packet2Xd pmin(const Packet2Xd& a, const Packet2Xd& b) { - Packet2Xd nans = - __riscv_vfmv_v_f_f64m2((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); + Packet2Xd nans = __riscv_vfmv_v_f_f64m2((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); PacketMask32 mask = __riscv_vmfeq_vv_f64m2_b32(a, a, unpacket_traits::size); PacketMask32 mask2 = __riscv_vmfeq_vv_f64m2_b32(b, b, unpacket_traits::size); mask = __riscv_vmand_mm_b32(mask, mask2, unpacket_traits::size); @@ -998,8 +1034,7 @@ EIGEN_STRONG_INLINE Packet2Xd pmin(const Packet2Xd& template <> EIGEN_STRONG_INLINE Packet2Xd pmax(const Packet2Xd& a, const Packet2Xd& b) { - Packet2Xd nans = - __riscv_vfmv_v_f_f64m2((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); + Packet2Xd nans = __riscv_vfmv_v_f_f64m2((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); PacketMask32 mask = __riscv_vmfeq_vv_f64m2_b32(a, a, unpacket_traits::size); PacketMask32 mask2 = __riscv_vmfeq_vv_f64m2_b32(b, b, unpacket_traits::size); mask = __riscv_vmand_mm_b32(mask, mask2, unpacket_traits::size); @@ -1020,22 +1055,19 @@ EIGEN_STRONG_INLINE Packet2Xd pmax(const Packet2Xd& template <> EIGEN_STRONG_INLINE Packet2Xd pcmp_le(const Packet2Xd& a, const Packet2Xd& b) { PacketMask32 mask = __riscv_vmfle_vv_f64m2_b32(a, b, unpacket_traits::size); - return __riscv_vmerge_vvm_f64m2(pzero(a), ptrue(a), mask, - unpacket_traits::size); + return __riscv_vmerge_vvm_f64m2(pzero(a), ptrue(a), mask, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE Packet2Xd pcmp_lt(const Packet2Xd& a, const Packet2Xd& b) { PacketMask32 mask = __riscv_vmflt_vv_f64m2_b32(a, b, unpacket_traits::size); - return __riscv_vmerge_vvm_f64m2(pzero(a), ptrue(a), mask, - unpacket_traits::size); + return __riscv_vmerge_vvm_f64m2(pzero(a), ptrue(a), mask, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE Packet2Xd pcmp_eq(const Packet2Xd& a, const Packet2Xd& b) { PacketMask32 mask = __riscv_vmfeq_vv_f64m2_b32(a, b, unpacket_traits::size); - return __riscv_vmerge_vvm_f64m2(pzero(a), ptrue(a), mask, - unpacket_traits::size); + return __riscv_vmerge_vvm_f64m2(pzero(a), ptrue(a), mask, unpacket_traits::size); } template <> @@ -1044,26 +1076,33 @@ EIGEN_STRONG_INLINE Packet2Xd pcmp_lt_or_nan(const Packet2Xd& a, cons return __riscv_vfmerge_vfm_f64m2(ptrue(a), 0.0, mask, unpacket_traits::size); } +EIGEN_STRONG_INLINE Packet2Xd pselect(const PacketMask32& mask, const Packet2Xd& a, const Packet2Xd& b) { + return __riscv_vmerge_vvm_f64m2(b, a, mask, unpacket_traits::size); +} + +EIGEN_STRONG_INLINE Packet2Xd pselect(const Packet2Xd& mask, const Packet2Xd& a, const Packet2Xd& b) { + PacketMask32 mask2 = + __riscv_vmsne_vx_i64m2_b32(__riscv_vreinterpret_v_f64m2_i64m2(mask), 0, unpacket_traits::size); + return __riscv_vmerge_vvm_f64m2(b, a, mask2, unpacket_traits::size); +} + // Logical Operations are not supported for double, so reinterpret casts template <> EIGEN_STRONG_INLINE Packet2Xd pand(const Packet2Xd& a, const Packet2Xd& b) { - return __riscv_vreinterpret_v_u64m2_f64m2(__riscv_vand_vv_u64m2(__riscv_vreinterpret_v_f64m2_u64m2(a), - __riscv_vreinterpret_v_f64m2_u64m2(b), - unpacket_traits::size)); + return __riscv_vreinterpret_v_u64m2_f64m2(__riscv_vand_vv_u64m2( + __riscv_vreinterpret_v_f64m2_u64m2(a), __riscv_vreinterpret_v_f64m2_u64m2(b), unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE Packet2Xd por(const Packet2Xd& a, const Packet2Xd& b) { - return __riscv_vreinterpret_v_u64m2_f64m2(__riscv_vor_vv_u64m2(__riscv_vreinterpret_v_f64m2_u64m2(a), - __riscv_vreinterpret_v_f64m2_u64m2(b), - unpacket_traits::size)); + return __riscv_vreinterpret_v_u64m2_f64m2(__riscv_vor_vv_u64m2( + __riscv_vreinterpret_v_f64m2_u64m2(a), __riscv_vreinterpret_v_f64m2_u64m2(b), unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE Packet2Xd pxor(const Packet2Xd& a, const Packet2Xd& b) { - return __riscv_vreinterpret_v_u64m2_f64m2(__riscv_vxor_vv_u64m2(__riscv_vreinterpret_v_f64m2_u64m2(a), - __riscv_vreinterpret_v_f64m2_u64m2(b), - unpacket_traits::size)); + return __riscv_vreinterpret_v_u64m2_f64m2(__riscv_vxor_vv_u64m2( + __riscv_vreinterpret_v_f64m2_u64m2(a), __riscv_vreinterpret_v_f64m2_u64m2(b), unpacket_traits::size)); } template <> @@ -1086,18 +1125,16 @@ EIGEN_STRONG_INLINE Packet2Xd ploadu(const double* from) { template <> EIGEN_STRONG_INLINE Packet2Xd ploaddup(const double* from) { - Packet2Xul idx = __riscv_vid_v_u64m2(unpacket_traits::size); - idx = __riscv_vsll_vx_u64m2(__riscv_vand_vx_u64m2(idx, 0xfffffffffffffffeu, unpacket_traits::size), 2, - unpacket_traits::size); - return __riscv_vloxei64_v_f64m2(from, idx, unpacket_traits::size); + Packet2Xul idx = + __riscv_vsrl_vx_u64m2(__riscv_vid_v_u64m2(unpacket_traits::size), 1, unpacket_traits::size); + return __riscv_vrgather_vv_f64m2(pload(from), idx, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE Packet2Xd ploadquad(const double* from) { - Packet2Xul idx = __riscv_vid_v_u64m2(unpacket_traits::size); - idx = __riscv_vsll_vx_u64m2(__riscv_vand_vx_u64m2(idx, 0xfffffffffffffffcu, unpacket_traits::size), 1, - unpacket_traits::size); - return __riscv_vloxei64_v_f64m2(from, idx, unpacket_traits::size); + Packet2Xul idx = + __riscv_vsrl_vx_u64m2(__riscv_vid_v_u64m2(unpacket_traits::size), 2, unpacket_traits::size); + return __riscv_vrgather_vv_f64m2(pload(from), idx, unpacket_traits::size); } template <> @@ -1137,8 +1174,8 @@ EIGEN_STRONG_INLINE Packet2Xd print(const Packet2Xd& a) { PacketMask32 mask = __riscv_vmfne_vv_f64m2_b32(a, a, unpacket_traits::size); const Packet2Xd x = __riscv_vfadd_vv_f64m2_tumu(mask, a, a, a, unpacket_traits::size); - const Packet2Xd new_x = __riscv_vfcvt_f_x_v_f64m2( - __riscv_vfcvt_x_f_v_i64m2(a, unpacket_traits::size), unpacket_traits::size); + const Packet2Xd new_x = __riscv_vfcvt_f_x_v_f64m2(__riscv_vfcvt_x_f_v_i64m2(a, unpacket_traits::size), + unpacket_traits::size); mask = __riscv_vmflt_vv_f64m2_b32(abs_a, limit, unpacket_traits::size); Packet2Xd signed_x = __riscv_vfsgnj_vv_f64m2(new_x, x, unpacket_traits::size); @@ -1155,9 +1192,8 @@ EIGEN_STRONG_INLINE Packet2Xd pfloor(const Packet2Xd& a) { template <> EIGEN_STRONG_INLINE Packet2Xd preverse(const Packet2Xd& a) { - Packet2Xul idx = - __riscv_vrsub_vx_u64m2(__riscv_vid_v_u64m2(unpacket_traits::size), - unpacket_traits::size - 1, unpacket_traits::size); + Packet2Xul idx = __riscv_vrsub_vx_u64m2(__riscv_vid_v_u64m2(unpacket_traits::size), + unpacket_traits::size - 1, unpacket_traits::size); return __riscv_vrgather_vv_f64m2(a, idx, unpacket_traits::size); } @@ -1174,28 +1210,26 @@ EIGEN_STRONG_INLINE double predux(const Packet2Xd& a) { template <> EIGEN_STRONG_INLINE double predux_mul(const Packet2Xd& a) { - return predux_mul(__riscv_vfmul_vv_f64m1(__riscv_vget_v_f64m2_f64m1(a, 0), __riscv_vget_v_f64m2_f64m1(a, 1), - unpacket_traits::size)); + return predux_mul(__riscv_vfmul_vv_f64m1( + __riscv_vget_v_f64m2_f64m1(a, 0), __riscv_vget_v_f64m2_f64m1(a, 1), unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE double predux_min(const Packet2Xd& a) { - return (std::min)(__riscv_vfmv_f(__riscv_vfredmin_vs_f64m2_f64m1( - a, - __riscv_vfmv_v_f_f64m1((std::numeric_limits::quiet_NaN)(), - unpacket_traits::size / 2), - unpacket_traits::size)), - (std::numeric_limits::max)()); + return (std::min)( + __riscv_vfmv_f(__riscv_vfredmin_vs_f64m2_f64m1( + a, __riscv_vfmv_v_f_f64m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size / 2), + unpacket_traits::size)), + (std::numeric_limits::max)()); } template <> EIGEN_STRONG_INLINE double predux_max(const Packet2Xd& a) { - return (std::max)(__riscv_vfmv_f(__riscv_vfredmax_vs_f64m2_f64m1( - a, - __riscv_vfmv_v_f_f64m1((std::numeric_limits::quiet_NaN)(), - unpacket_traits::size / 2), - unpacket_traits::size)), - -(std::numeric_limits::max)()); + return (std::max)( + __riscv_vfmv_f(__riscv_vfredmax_vs_f64m2_f64m1( + a, __riscv_vfmv_v_f_f64m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size / 2), + unpacket_traits::size)), + -(std::numeric_limits::max)()); } template @@ -1220,18 +1254,18 @@ EIGEN_STRONG_INLINE Packet2Xd pldexp(const Packet2Xd& a, const Packet template EIGEN_STRONG_INLINE -typename std::enable_if::value && (unpacket_traits::size % 8) == 0, - Packet2Xd>::type -predux_half(const Packet4Xd& a) { + typename std::enable_if::value && (unpacket_traits::size % 8) == 0, + Packet2Xd>::type + predux_half(const Packet4Xd& a) { return __riscv_vfadd_vv_f64m2(__riscv_vget_v_f64m4_f64m2(a, 0), __riscv_vget_v_f64m4_f64m2(a, 1), unpacket_traits::size); } template EIGEN_STRONG_INLINE -typename std::enable_if::value && (unpacket_traits::size % 8) == 0, - Packet1Xd>::type -predux_half(const Packet2Xd& a) { + typename std::enable_if::value && (unpacket_traits::size % 8) == 0, + Packet1Xd>::type + predux_half(const Packet2Xd& a) { return __riscv_vfadd_vv_f64m1(__riscv_vget_v_f64m2_f64m1(a, 0), __riscv_vget_v_f64m2_f64m1(a, 1), unpacket_traits::size); } @@ -1386,18 +1420,17 @@ EIGEN_STRONG_INLINE Packet2Xs ploadu(const numext::int16_t* from) { template <> EIGEN_STRONG_INLINE Packet2Xs ploaddup(const numext::int16_t* from) { - Packet2Xsu idx = __riscv_vid_v_u16m2(unpacket_traits::size); - idx = __riscv_vand_vx_u16m2(idx, 0xfffeu, unpacket_traits::size); - // idx = 0 0 sizeof(int16_t) sizeof(int16_t) 2*sizeof(int16_t) 2*sizeof(int16_t) ... - return __riscv_vloxei16_v_i16m2(from, idx, unpacket_traits::size); + Packet2Xsu data = __riscv_vreinterpret_v_i16m2_u16m2(pload(from)); + return __riscv_vreinterpret_v_i32m2_i16m2(__riscv_vreinterpret_v_u32m2_i32m2(__riscv_vlmul_trunc_v_u32m4_u32m2( + __riscv_vwmaccu_vx_u32m4(__riscv_vwaddu_vv_u32m4(data, data, unpacket_traits::size), 0xffffu, data, + unpacket_traits::size)))); } template <> EIGEN_STRONG_INLINE Packet2Xs ploadquad(const numext::int16_t* from) { - Packet2Xsu idx = __riscv_vid_v_u16m2(unpacket_traits::size); - idx = __riscv_vsrl_vx_u16m2(__riscv_vand_vx_u16m2(idx, 0xfffcu, unpacket_traits::size), 1, - unpacket_traits::size); - return __riscv_vloxei16_v_i16m2(from, idx, unpacket_traits::size); + Packet2Xsu idx = + __riscv_vsrl_vx_u16m2(__riscv_vid_v_u16m2(unpacket_traits::size), 2, unpacket_traits::size); + return __riscv_vrgather_vv_i16m2(pload(from), idx, unpacket_traits::size); } template <> @@ -1411,14 +1444,13 @@ EIGEN_STRONG_INLINE void pstoreu(numext::int16_t* to, const Pac } template <> -EIGEN_DEVICE_FUNC inline Packet2Xs pgather(const numext::int16_t* from, - Index stride) { +EIGEN_DEVICE_FUNC inline Packet2Xs pgather(const numext::int16_t* from, Index stride) { return __riscv_vlse16_v_i16m2(from, stride * sizeof(numext::int16_t), unpacket_traits::size); } template <> EIGEN_DEVICE_FUNC inline void pscatter(numext::int16_t* to, const Packet2Xs& from, - Index stride) { + Index stride) { __riscv_vsse16(to, stride * sizeof(numext::int16_t), from, unpacket_traits::size); } @@ -1429,9 +1461,8 @@ EIGEN_STRONG_INLINE numext::int16_t pfirst(const Packet2Xs& a) { template <> EIGEN_STRONG_INLINE Packet2Xs preverse(const Packet2Xs& a) { - Packet2Xsu idx = - __riscv_vrsub_vx_u16m2(__riscv_vid_v_u16m2(unpacket_traits::size), - unpacket_traits::size - 1, unpacket_traits::size); + Packet2Xsu idx = __riscv_vrsub_vx_u16m2(__riscv_vid_v_u16m2(unpacket_traits::size), + unpacket_traits::size - 1, unpacket_traits::size); return __riscv_vrgather_vv_i16m2(a, idx, unpacket_traits::size); } @@ -1444,14 +1475,14 @@ EIGEN_STRONG_INLINE Packet2Xs pabs(const Packet2Xs& a) { template <> EIGEN_STRONG_INLINE numext::int16_t predux(const Packet2Xs& a) { - return __riscv_vmv_x(__riscv_vredsum_vs_i16m2_i16m1( - a, __riscv_vmv_v_x_i16m1(0, unpacket_traits::size / 2), unpacket_traits::size)); + return __riscv_vmv_x(__riscv_vredsum_vs_i16m2_i16m1(a, __riscv_vmv_v_x_i16m1(0, unpacket_traits::size / 2), + unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE numext::int16_t predux_mul(const Packet2Xs& a) { return predux_mul(__riscv_vmul_vv_i16m1(__riscv_vget_v_i16m2_i16m1(a, 0), __riscv_vget_v_i16m2_i16m1(a, 1), - unpacket_traits::size)); + unpacket_traits::size)); } template <> @@ -1484,18 +1515,18 @@ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { template EIGEN_STRONG_INLINE -typename std::enable_if::value && (unpacket_traits::size % 8) == 0, - Packet2Xs>::type -predux_half(const Packet4Xs& a) { + typename std::enable_if::value && (unpacket_traits::size % 8) == 0, + Packet2Xs>::type + predux_half(const Packet4Xs& a) { return __riscv_vadd_vv_i16m2(__riscv_vget_v_i16m4_i16m2(a, 0), __riscv_vget_v_i16m4_i16m2(a, 1), unpacket_traits::size); } template EIGEN_STRONG_INLINE -typename std::enable_if::value && (unpacket_traits::size % 8) == 0, - Packet1Xs>::type -predux_half(const Packet2Xs& a) { + typename std::enable_if::value && (unpacket_traits::size % 8) == 0, + Packet1Xs>::type + predux_half(const Packet2Xs& a) { return __riscv_vadd_vv_i16m1(__riscv_vget_v_i16m2_i16m1(a, 0), __riscv_vget_v_i16m2_i16m1(a, 1), unpacket_traits::size); } diff --git a/Eigen/src/Core/arch/RVV10/PacketMath4.h b/Eigen/src/Core/arch/RVV10/PacketMath4.h index 30f5ca33d..249dadfee 100644 --- a/Eigen/src/Core/arch/RVV10/PacketMath4.h +++ b/Eigen/src/Core/arch/RVV10/PacketMath4.h @@ -2,6 +2,7 @@ // for linear algebra. // // Copyright (C) 2024 Kseniya Zaytseva +// Copyright (C) 2025 Chip Kerchner // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed @@ -166,18 +167,17 @@ EIGEN_STRONG_INLINE Packet4Xi ploadu(const numext::int32_t* from) { template <> EIGEN_STRONG_INLINE Packet4Xi ploaddup(const numext::int32_t* from) { - Packet4Xu idx = __riscv_vid_v_u32m4(unpacket_traits::size); - idx = __riscv_vsll_vx_u32m4(__riscv_vand_vx_u32m4(idx, 0xfffffffeu, unpacket_traits::size), 1, - unpacket_traits::size); - // idx = 0 0 sizeof(int32_t) sizeof(int32_t) 2*sizeof(int32_t) 2*sizeof(int32_t) ... - return __riscv_vloxei32_v_i32m4(from, idx, unpacket_traits::size); + Packet4Xu data = __riscv_vreinterpret_v_i32m4_u32m4(pload(from)); + return __riscv_vreinterpret_v_i64m4_i32m4(__riscv_vreinterpret_v_u64m4_i64m4(__riscv_vlmul_trunc_v_u64m8_u64m4( + __riscv_vwmaccu_vx_u64m8(__riscv_vwaddu_vv_u64m8(data, data, unpacket_traits::size), 0xffffffffu, data, + unpacket_traits::size)))); } template <> EIGEN_STRONG_INLINE Packet4Xi ploadquad(const numext::int32_t* from) { - Packet4Xu idx = __riscv_vid_v_u32m4(unpacket_traits::size); - idx = __riscv_vand_vx_u32m4(idx, 0xfffffffcu, unpacket_traits::size); - return __riscv_vloxei32_v_i32m4(from, idx, unpacket_traits::size); + Packet4Xu idx = + __riscv_vsrl_vx_u32m4(__riscv_vid_v_u32m4(unpacket_traits::size), 2, unpacket_traits::size); + return __riscv_vrgather_vv_i32m4(pload(from), idx, unpacket_traits::size); } template <> @@ -191,14 +191,13 @@ EIGEN_STRONG_INLINE void pstoreu(numext::int32_t* to, const Pac } template <> -EIGEN_DEVICE_FUNC inline Packet4Xi pgather(const numext::int32_t* from, - Index stride) { +EIGEN_DEVICE_FUNC inline Packet4Xi pgather(const numext::int32_t* from, Index stride) { return __riscv_vlse32_v_i32m4(from, stride * sizeof(numext::int32_t), unpacket_traits::size); } template <> EIGEN_DEVICE_FUNC inline void pscatter(numext::int32_t* to, const Packet4Xi& from, - Index stride) { + Index stride) { __riscv_vsse32(to, stride * sizeof(numext::int32_t), from, unpacket_traits::size); } @@ -209,9 +208,8 @@ EIGEN_STRONG_INLINE numext::int32_t pfirst(const Packet4Xi& a) { template <> EIGEN_STRONG_INLINE Packet4Xi preverse(const Packet4Xi& a) { - Packet4Xu idx = - __riscv_vrsub_vx_u32m4(__riscv_vid_v_u32m4(unpacket_traits::size), - unpacket_traits::size - 1, unpacket_traits::size); + Packet4Xu idx = __riscv_vrsub_vx_u32m4(__riscv_vid_v_u32m4(unpacket_traits::size), + unpacket_traits::size - 1, unpacket_traits::size); return __riscv_vrgather_vv_i32m4(a, idx, unpacket_traits::size); } @@ -224,16 +222,16 @@ EIGEN_STRONG_INLINE Packet4Xi pabs(const Packet4Xi& a) { template <> EIGEN_STRONG_INLINE numext::int32_t predux(const Packet4Xi& a) { - return __riscv_vmv_x(__riscv_vredsum_vs_i32m4_i32m1( - a, __riscv_vmv_v_x_i32m1(0, unpacket_traits::size / 4), unpacket_traits::size)); + return __riscv_vmv_x(__riscv_vredsum_vs_i32m4_i32m1(a, __riscv_vmv_v_x_i32m1(0, unpacket_traits::size / 4), + unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE numext::int32_t predux_mul(const Packet4Xi& a) { Packet1Xi half1 = __riscv_vmul_vv_i32m1(__riscv_vget_v_i32m4_i32m1(a, 0), __riscv_vget_v_i32m4_i32m1(a, 1), - unpacket_traits::size); + unpacket_traits::size); Packet1Xi half2 = __riscv_vmul_vv_i32m1(__riscv_vget_v_i32m4_i32m1(a, 2), __riscv_vget_v_i32m4_i32m1(a, 3), - unpacket_traits::size); + unpacket_traits::size); return predux_mul(__riscv_vmul_vv_i32m1(half1, half2, unpacket_traits::size)); } @@ -282,6 +280,12 @@ EIGEN_STRONG_INLINE Packet4Xf pabs(const Packet4Xf& a) { return __riscv_vfabs_v_f32m4(a, unpacket_traits::size); } +template <> +EIGEN_STRONG_INLINE Packet4Xf pabsdiff(const Packet4Xf& a, const Packet4Xf& b) { + return __riscv_vfabs_v_f32m4(__riscv_vfsub_vv_f32m4(a, b, unpacket_traits::size), + unpacket_traits::size); +} + template <> EIGEN_STRONG_INLINE Packet4Xf pset1(const float& from) { return __riscv_vfmv_v_f_f32m4(from, unpacket_traits::size); @@ -300,6 +304,16 @@ EIGEN_STRONG_INLINE Packet4Xf plset(const float& a) { return __riscv_vfadd_vf_f32m4(idx, a, unpacket_traits::size); } +template <> +EIGEN_STRONG_INLINE void pbroadcast4(const float* a, Packet4Xf& a0, Packet4Xf& a1, Packet4Xf& a2, + Packet4Xf& a3) { + vfloat32m4_t aa = __riscv_vle32_v_f32m4(a, 4); + a0 = __riscv_vrgather_vx_f32m4(aa, 0, unpacket_traits::size); + a1 = __riscv_vrgather_vx_f32m4(aa, 1, unpacket_traits::size); + a2 = __riscv_vrgather_vx_f32m4(aa, 2, unpacket_traits::size); + a3 = __riscv_vrgather_vx_f32m4(aa, 3, unpacket_traits::size); +} + template <> EIGEN_STRONG_INLINE Packet4Xf padd(const Packet4Xf& a, const Packet4Xf& b) { return __riscv_vfadd_vv_f32m4(a, b, unpacket_traits::size); @@ -315,6 +329,12 @@ EIGEN_STRONG_INLINE Packet4Xf pnegate(const Packet4Xf& a) { return __riscv_vfneg_v_f32m4(a, unpacket_traits::size); } +template <> +EIGEN_STRONG_INLINE Packet4Xf psignbit(const Packet4Xf& a) { + return __riscv_vreinterpret_v_i32m4_f32m4( + __riscv_vsra_vx_i32m4(__riscv_vreinterpret_v_f32m4_i32m4(a), 31, unpacket_traits::size)); +} + template <> EIGEN_STRONG_INLINE Packet4Xf pconj(const Packet4Xf& a) { return a; @@ -352,8 +372,7 @@ EIGEN_STRONG_INLINE Packet4Xf pnmsub(const Packet4Xf& a, const Packet4Xf& b, con template <> EIGEN_STRONG_INLINE Packet4Xf pmin(const Packet4Xf& a, const Packet4Xf& b) { - Packet4Xf nans = - __riscv_vfmv_v_f_f32m4((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); + Packet4Xf nans = __riscv_vfmv_v_f_f32m4((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); PacketMask8 mask = __riscv_vmfeq_vv_f32m4_b8(a, a, unpacket_traits::size); PacketMask8 mask2 = __riscv_vmfeq_vv_f32m4_b8(b, b, unpacket_traits::size); mask = __riscv_vmand_mm_b8(mask, mask2, unpacket_traits::size); @@ -373,8 +392,7 @@ EIGEN_STRONG_INLINE Packet4Xf pmin(const Packet4Xf& template <> EIGEN_STRONG_INLINE Packet4Xf pmax(const Packet4Xf& a, const Packet4Xf& b) { - Packet4Xf nans = - __riscv_vfmv_v_f_f32m4((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); + Packet4Xf nans = __riscv_vfmv_v_f_f32m4((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); PacketMask8 mask = __riscv_vmfeq_vv_f32m4_b8(a, a, unpacket_traits::size); PacketMask8 mask2 = __riscv_vmfeq_vv_f32m4_b8(b, b, unpacket_traits::size); mask = __riscv_vmand_mm_b8(mask, mask2, unpacket_traits::size); @@ -395,22 +413,19 @@ EIGEN_STRONG_INLINE Packet4Xf pmax(const Packet4Xf& template <> EIGEN_STRONG_INLINE Packet4Xf pcmp_le(const Packet4Xf& a, const Packet4Xf& b) { PacketMask8 mask = __riscv_vmfle_vv_f32m4_b8(a, b, unpacket_traits::size); - return __riscv_vmerge_vvm_f32m4(pzero(a), ptrue(a), mask, - unpacket_traits::size); + return __riscv_vmerge_vvm_f32m4(pzero(a), ptrue(a), mask, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE Packet4Xf pcmp_lt(const Packet4Xf& a, const Packet4Xf& b) { PacketMask8 mask = __riscv_vmflt_vv_f32m4_b8(a, b, unpacket_traits::size); - return __riscv_vmerge_vvm_f32m4(pzero(a), ptrue(a), mask, - unpacket_traits::size); + return __riscv_vmerge_vvm_f32m4(pzero(a), ptrue(a), mask, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE Packet4Xf pcmp_eq(const Packet4Xf& a, const Packet4Xf& b) { PacketMask8 mask = __riscv_vmfeq_vv_f32m4_b8(a, b, unpacket_traits::size); - return __riscv_vmerge_vvm_f32m4(pzero(a), ptrue(a), mask, - unpacket_traits::size); + return __riscv_vmerge_vvm_f32m4(pzero(a), ptrue(a), mask, unpacket_traits::size); } template <> @@ -419,26 +434,33 @@ EIGEN_STRONG_INLINE Packet4Xf pcmp_lt_or_nan(const Packet4Xf& a, cons return __riscv_vfmerge_vfm_f32m4(ptrue(a), 0.0f, mask, unpacket_traits::size); } +EIGEN_STRONG_INLINE Packet4Xf pselect(const PacketMask8& mask, const Packet4Xf& a, const Packet4Xf& b) { + return __riscv_vmerge_vvm_f32m4(b, a, mask, unpacket_traits::size); +} + +EIGEN_STRONG_INLINE Packet4Xf pselect(const Packet4Xf& mask, const Packet4Xf& a, const Packet4Xf& b) { + PacketMask8 mask2 = + __riscv_vmsne_vx_i32m4_b8(__riscv_vreinterpret_v_f32m4_i32m4(mask), 0, unpacket_traits::size); + return __riscv_vmerge_vvm_f32m4(b, a, mask2, unpacket_traits::size); +} + // Logical Operations are not supported for float, so reinterpret casts template <> EIGEN_STRONG_INLINE Packet4Xf pand(const Packet4Xf& a, const Packet4Xf& b) { - return __riscv_vreinterpret_v_u32m4_f32m4(__riscv_vand_vv_u32m4(__riscv_vreinterpret_v_f32m4_u32m4(a), - __riscv_vreinterpret_v_f32m4_u32m4(b), - unpacket_traits::size)); + return __riscv_vreinterpret_v_u32m4_f32m4(__riscv_vand_vv_u32m4( + __riscv_vreinterpret_v_f32m4_u32m4(a), __riscv_vreinterpret_v_f32m4_u32m4(b), unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE Packet4Xf por(const Packet4Xf& a, const Packet4Xf& b) { - return __riscv_vreinterpret_v_u32m4_f32m4(__riscv_vor_vv_u32m4(__riscv_vreinterpret_v_f32m4_u32m4(a), - __riscv_vreinterpret_v_f32m4_u32m4(b), - unpacket_traits::size)); + return __riscv_vreinterpret_v_u32m4_f32m4(__riscv_vor_vv_u32m4( + __riscv_vreinterpret_v_f32m4_u32m4(a), __riscv_vreinterpret_v_f32m4_u32m4(b), unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE Packet4Xf pxor(const Packet4Xf& a, const Packet4Xf& b) { - return __riscv_vreinterpret_v_u32m4_f32m4(__riscv_vxor_vv_u32m4(__riscv_vreinterpret_v_f32m4_u32m4(a), - __riscv_vreinterpret_v_f32m4_u32m4(b), - unpacket_traits::size)); + return __riscv_vreinterpret_v_u32m4_f32m4(__riscv_vxor_vv_u32m4( + __riscv_vreinterpret_v_f32m4_u32m4(a), __riscv_vreinterpret_v_f32m4_u32m4(b), unpacket_traits::size)); } template <> @@ -461,17 +483,18 @@ EIGEN_STRONG_INLINE Packet4Xf ploadu(const float* from) { template <> EIGEN_STRONG_INLINE Packet4Xf ploaddup(const float* from) { - Packet4Xu idx = __riscv_vid_v_u32m4(unpacket_traits::size); - idx = __riscv_vsll_vx_u32m4(__riscv_vand_vx_u32m4(idx, 0xfffffffeu, unpacket_traits::size), 1, - unpacket_traits::size); - return __riscv_vloxei32_v_f32m4(from, idx, unpacket_traits::size); + Packet4Xu data = __riscv_vreinterpret_v_f32m4_u32m4(pload(from)); + return __riscv_vreinterpret_v_i32m4_f32m4( + __riscv_vreinterpret_v_i64m4_i32m4(__riscv_vreinterpret_v_u64m4_i64m4(__riscv_vlmul_trunc_v_u64m8_u64m4( + __riscv_vwmaccu_vx_u64m8(__riscv_vwaddu_vv_u64m8(data, data, unpacket_traits::size), 0xffffffffu, + data, unpacket_traits::size))))); } template <> EIGEN_STRONG_INLINE Packet4Xf ploadquad(const float* from) { - Packet4Xu idx = __riscv_vid_v_u32m4(unpacket_traits::size); - idx = __riscv_vand_vx_u32m4(idx, 0xfffffffcu, unpacket_traits::size); - return __riscv_vloxei32_v_f32m4(from, idx, unpacket_traits::size); + Packet4Xu idx = + __riscv_vsrl_vx_u32m4(__riscv_vid_v_u32m4(unpacket_traits::size), 2, unpacket_traits::size); + return __riscv_vrgather_vv_f32m4(pload(from), idx, unpacket_traits::size); } template <> @@ -511,8 +534,8 @@ EIGEN_STRONG_INLINE Packet4Xf print(const Packet4Xf& a) { PacketMask8 mask = __riscv_vmfne_vv_f32m4_b8(a, a, unpacket_traits::size); const Packet4Xf x = __riscv_vfadd_vv_f32m4_tumu(mask, a, a, a, unpacket_traits::size); - const Packet4Xf new_x = __riscv_vfcvt_f_x_v_f32m4( - __riscv_vfcvt_x_f_v_i32m4(a, unpacket_traits::size), unpacket_traits::size); + const Packet4Xf new_x = __riscv_vfcvt_f_x_v_f32m4(__riscv_vfcvt_x_f_v_i32m4(a, unpacket_traits::size), + unpacket_traits::size); mask = __riscv_vmflt_vv_f32m4_b8(abs_a, limit, unpacket_traits::size); Packet4Xf signed_x = __riscv_vfsgnj_vv_f32m4(new_x, x, unpacket_traits::size); @@ -529,9 +552,8 @@ EIGEN_STRONG_INLINE Packet4Xf pfloor(const Packet4Xf& a) { template <> EIGEN_STRONG_INLINE Packet4Xf preverse(const Packet4Xf& a) { - Packet4Xu idx = - __riscv_vrsub_vx_u32m4(__riscv_vid_v_u32m4(unpacket_traits::size), - unpacket_traits::size - 1, unpacket_traits::size); + Packet4Xu idx = __riscv_vrsub_vx_u32m4(__riscv_vid_v_u32m4(unpacket_traits::size), + unpacket_traits::size - 1, unpacket_traits::size); return __riscv_vrgather_vv_f32m4(a, idx, unpacket_traits::size); } @@ -549,30 +571,28 @@ EIGEN_STRONG_INLINE float predux(const Packet4Xf& a) { template <> EIGEN_STRONG_INLINE float predux_mul(const Packet4Xf& a) { Packet1Xf half1 = __riscv_vfmul_vv_f32m1(__riscv_vget_v_f32m4_f32m1(a, 0), __riscv_vget_v_f32m4_f32m1(a, 1), - unpacket_traits::size); + unpacket_traits::size); Packet1Xf half2 = __riscv_vfmul_vv_f32m1(__riscv_vget_v_f32m4_f32m1(a, 2), __riscv_vget_v_f32m4_f32m1(a, 3), - unpacket_traits::size); + unpacket_traits::size); return predux_mul(__riscv_vfmul_vv_f32m1(half1, half2, unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE float predux_min(const Packet4Xf& a) { - return (std::min)(__riscv_vfmv_f(__riscv_vfredmin_vs_f32m4_f32m1( - a, - __riscv_vfmv_v_f_f32m1((std::numeric_limits::quiet_NaN)(), - unpacket_traits::size / 4), - unpacket_traits::size)), - (std::numeric_limits::max)()); + return (std::min)( + __riscv_vfmv_f(__riscv_vfredmin_vs_f32m4_f32m1( + a, __riscv_vfmv_v_f_f32m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size / 4), + unpacket_traits::size)), + (std::numeric_limits::max)()); } template <> EIGEN_STRONG_INLINE float predux_max(const Packet4Xf& a) { - return (std::max)(__riscv_vfmv_f(__riscv_vfredmax_vs_f32m4_f32m1( - a, - __riscv_vfmv_v_f_f32m1((std::numeric_limits::quiet_NaN)(), - unpacket_traits::size / 4), - unpacket_traits::size)), - -(std::numeric_limits::max)()); + return (std::max)( + __riscv_vfmv_f(__riscv_vfredmax_vs_f32m4_f32m1( + a, __riscv_vfmv_v_f_f32m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size / 4), + unpacket_traits::size)), + -(std::numeric_limits::max)()); } template @@ -745,19 +765,16 @@ EIGEN_STRONG_INLINE Packet4Xl ploadu(const numext::int64_t* from) { template <> EIGEN_STRONG_INLINE Packet4Xl ploaddup(const numext::int64_t* from) { - Packet4Xul idx = __riscv_vid_v_u64m4(unpacket_traits::size); - idx = __riscv_vsll_vx_u64m4(__riscv_vand_vx_u64m4(idx, 0xfffffffffffffffeu, unpacket_traits::size), 2, - unpacket_traits::size); - // idx = 0 0 sizeof(int64_t) sizeof(int64_t) 2*sizeof(int64_t) 2*sizeof(int64_t) ... - return __riscv_vloxei64_v_i64m4(from, idx, unpacket_traits::size); + Packet4Xul idx = + __riscv_vsrl_vx_u64m4(__riscv_vid_v_u64m4(unpacket_traits::size), 1, unpacket_traits::size); + return __riscv_vrgather_vv_i64m4(pload(from), idx, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE Packet4Xl ploadquad(const numext::int64_t* from) { - Packet4Xul idx = __riscv_vid_v_u64m4(unpacket_traits::size); - idx = __riscv_vsll_vx_u64m4(__riscv_vand_vx_u64m4(idx, 0xfffffffffffffffcu, unpacket_traits::size), 1, - unpacket_traits::size); - return __riscv_vloxei64_v_i64m4(from, idx, unpacket_traits::size); + Packet4Xul idx = + __riscv_vsrl_vx_u64m4(__riscv_vid_v_u64m4(unpacket_traits::size), 2, unpacket_traits::size); + return __riscv_vrgather_vv_i64m4(pload(from), idx, unpacket_traits::size); } template <> @@ -771,14 +788,13 @@ EIGEN_STRONG_INLINE void pstoreu(numext::int64_t* to, const Pac } template <> -EIGEN_DEVICE_FUNC inline Packet4Xl pgather(const numext::int64_t* from, - Index stride) { +EIGEN_DEVICE_FUNC inline Packet4Xl pgather(const numext::int64_t* from, Index stride) { return __riscv_vlse64_v_i64m4(from, stride * sizeof(numext::int64_t), unpacket_traits::size); } template <> EIGEN_DEVICE_FUNC inline void pscatter(numext::int64_t* to, const Packet4Xl& from, - Index stride) { + Index stride) { __riscv_vsse64(to, stride * sizeof(numext::int64_t), from, unpacket_traits::size); } @@ -789,9 +805,8 @@ EIGEN_STRONG_INLINE numext::int64_t pfirst(const Packet4Xl& a) { template <> EIGEN_STRONG_INLINE Packet4Xl preverse(const Packet4Xl& a) { - Packet4Xul idx = - __riscv_vrsub_vx_u64m4(__riscv_vid_v_u64m4(unpacket_traits::size), - unpacket_traits::size - 1, unpacket_traits::size); + Packet4Xul idx = __riscv_vrsub_vx_u64m4(__riscv_vid_v_u64m4(unpacket_traits::size), + unpacket_traits::size - 1, unpacket_traits::size); return __riscv_vrgather_vv_i64m4(a, idx, unpacket_traits::size); } @@ -804,16 +819,16 @@ EIGEN_STRONG_INLINE Packet4Xl pabs(const Packet4Xl& a) { template <> EIGEN_STRONG_INLINE numext::int64_t predux(const Packet4Xl& a) { - return __riscv_vmv_x(__riscv_vredsum_vs_i64m4_i64m1( - a, __riscv_vmv_v_x_i64m1(0, unpacket_traits::size / 4), unpacket_traits::size)); + return __riscv_vmv_x(__riscv_vredsum_vs_i64m4_i64m1(a, __riscv_vmv_v_x_i64m1(0, unpacket_traits::size / 4), + unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE numext::int64_t predux_mul(const Packet4Xl& a) { Packet1Xl half1 = __riscv_vmul_vv_i64m1(__riscv_vget_v_i64m4_i64m1(a, 0), __riscv_vget_v_i64m4_i64m1(a, 1), - unpacket_traits::size); + unpacket_traits::size); Packet1Xl half2 = __riscv_vmul_vv_i64m1(__riscv_vget_v_i64m4_i64m1(a, 2), __riscv_vget_v_i64m4_i64m1(a, 3), - unpacket_traits::size); + unpacket_traits::size); return predux_mul(__riscv_vmul_vv_i64m1(half1, half2, unpacket_traits::size)); } @@ -862,6 +877,12 @@ EIGEN_STRONG_INLINE Packet4Xd pabs(const Packet4Xd& a) { return __riscv_vfabs_v_f64m4(a, unpacket_traits::size); } +template <> +EIGEN_STRONG_INLINE Packet4Xd pabsdiff(const Packet4Xd& a, const Packet4Xd& b) { + return __riscv_vfabs_v_f64m4(__riscv_vfsub_vv_f64m4(a, b, unpacket_traits::size), + unpacket_traits::size); +} + template <> EIGEN_STRONG_INLINE Packet4Xd pset1(const double& from) { return __riscv_vfmv_v_f_f64m4(from, unpacket_traits::size); @@ -880,6 +901,16 @@ EIGEN_STRONG_INLINE Packet4Xd plset(const double& a) { return __riscv_vfadd_vf_f64m4(idx, a, unpacket_traits::size); } +template <> +EIGEN_STRONG_INLINE void pbroadcast4(const double* a, Packet4Xd& a0, Packet4Xd& a1, Packet4Xd& a2, + Packet4Xd& a3) { + vfloat64m4_t aa = __riscv_vle64_v_f64m4(a, 4); + a0 = __riscv_vrgather_vx_f64m4(aa, 0, unpacket_traits::size); + a1 = __riscv_vrgather_vx_f64m4(aa, 1, unpacket_traits::size); + a2 = __riscv_vrgather_vx_f64m4(aa, 2, unpacket_traits::size); + a3 = __riscv_vrgather_vx_f64m4(aa, 3, unpacket_traits::size); +} + template <> EIGEN_STRONG_INLINE Packet4Xd padd(const Packet4Xd& a, const Packet4Xd& b) { return __riscv_vfadd_vv_f64m4(a, b, unpacket_traits::size); @@ -895,6 +926,12 @@ EIGEN_STRONG_INLINE Packet4Xd pnegate(const Packet4Xd& a) { return __riscv_vfneg_v_f64m4(a, unpacket_traits::size); } +template <> +EIGEN_STRONG_INLINE Packet4Xd psignbit(const Packet4Xd& a) { + return __riscv_vreinterpret_v_i64m4_f64m4( + __riscv_vsra_vx_i64m4(__riscv_vreinterpret_v_f64m4_i64m4(a), 63, unpacket_traits::size)); +} + template <> EIGEN_STRONG_INLINE Packet4Xd pconj(const Packet4Xd& a) { return a; @@ -932,8 +969,7 @@ EIGEN_STRONG_INLINE Packet4Xd pnmsub(const Packet4Xd& a, const Packet4Xd& b, con template <> EIGEN_STRONG_INLINE Packet4Xd pmin(const Packet4Xd& a, const Packet4Xd& b) { - Packet4Xd nans = - __riscv_vfmv_v_f_f64m4((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); + Packet4Xd nans = __riscv_vfmv_v_f_f64m4((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); PacketMask16 mask = __riscv_vmfeq_vv_f64m4_b16(a, a, unpacket_traits::size); PacketMask16 mask2 = __riscv_vmfeq_vv_f64m4_b16(b, b, unpacket_traits::size); mask = __riscv_vmand_mm_b16(mask, mask2, unpacket_traits::size); @@ -953,8 +989,7 @@ EIGEN_STRONG_INLINE Packet4Xd pmin(const Packet4Xd& template <> EIGEN_STRONG_INLINE Packet4Xd pmax(const Packet4Xd& a, const Packet4Xd& b) { - Packet4Xd nans = - __riscv_vfmv_v_f_f64m4((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); + Packet4Xd nans = __riscv_vfmv_v_f_f64m4((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); PacketMask16 mask = __riscv_vmfeq_vv_f64m4_b16(a, a, unpacket_traits::size); PacketMask16 mask2 = __riscv_vmfeq_vv_f64m4_b16(b, b, unpacket_traits::size); mask = __riscv_vmand_mm_b16(mask, mask2, unpacket_traits::size); @@ -975,22 +1010,19 @@ EIGEN_STRONG_INLINE Packet4Xd pmax(const Packet4Xd& template <> EIGEN_STRONG_INLINE Packet4Xd pcmp_le(const Packet4Xd& a, const Packet4Xd& b) { PacketMask16 mask = __riscv_vmfle_vv_f64m4_b16(a, b, unpacket_traits::size); - return __riscv_vmerge_vvm_f64m4(pzero(a), ptrue(a), mask, - unpacket_traits::size); + return __riscv_vmerge_vvm_f64m4(pzero(a), ptrue(a), mask, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE Packet4Xd pcmp_lt(const Packet4Xd& a, const Packet4Xd& b) { PacketMask16 mask = __riscv_vmflt_vv_f64m4_b16(a, b, unpacket_traits::size); - return __riscv_vmerge_vvm_f64m4(pzero(a), ptrue(a), mask, - unpacket_traits::size); + return __riscv_vmerge_vvm_f64m4(pzero(a), ptrue(a), mask, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE Packet4Xd pcmp_eq(const Packet4Xd& a, const Packet4Xd& b) { PacketMask16 mask = __riscv_vmfeq_vv_f64m4_b16(a, b, unpacket_traits::size); - return __riscv_vmerge_vvm_f64m4(pzero(a), ptrue(a), mask, - unpacket_traits::size); + return __riscv_vmerge_vvm_f64m4(pzero(a), ptrue(a), mask, unpacket_traits::size); } template <> @@ -999,26 +1031,33 @@ EIGEN_STRONG_INLINE Packet4Xd pcmp_lt_or_nan(const Packet4Xd& a, cons return __riscv_vfmerge_vfm_f64m4(ptrue(a), 0.0, mask, unpacket_traits::size); } +EIGEN_STRONG_INLINE Packet4Xd pselect(const PacketMask16& mask, const Packet4Xd& a, const Packet4Xd& b) { + return __riscv_vmerge_vvm_f64m4(b, a, mask, unpacket_traits::size); +} + +EIGEN_STRONG_INLINE Packet4Xd pselect(const Packet4Xd& mask, const Packet4Xd& a, const Packet4Xd& b) { + PacketMask16 mask2 = + __riscv_vmsne_vx_i64m4_b16(__riscv_vreinterpret_v_f64m4_i64m4(mask), 0, unpacket_traits::size); + return __riscv_vmerge_vvm_f64m4(b, a, mask2, unpacket_traits::size); +} + // Logical Operations are not supported for double, so reinterpret casts template <> EIGEN_STRONG_INLINE Packet4Xd pand(const Packet4Xd& a, const Packet4Xd& b) { - return __riscv_vreinterpret_v_u64m4_f64m4(__riscv_vand_vv_u64m4(__riscv_vreinterpret_v_f64m4_u64m4(a), - __riscv_vreinterpret_v_f64m4_u64m4(b), - unpacket_traits::size)); + return __riscv_vreinterpret_v_u64m4_f64m4(__riscv_vand_vv_u64m4( + __riscv_vreinterpret_v_f64m4_u64m4(a), __riscv_vreinterpret_v_f64m4_u64m4(b), unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE Packet4Xd por(const Packet4Xd& a, const Packet4Xd& b) { - return __riscv_vreinterpret_v_u64m4_f64m4(__riscv_vor_vv_u64m4(__riscv_vreinterpret_v_f64m4_u64m4(a), - __riscv_vreinterpret_v_f64m4_u64m4(b), - unpacket_traits::size)); + return __riscv_vreinterpret_v_u64m4_f64m4(__riscv_vor_vv_u64m4( + __riscv_vreinterpret_v_f64m4_u64m4(a), __riscv_vreinterpret_v_f64m4_u64m4(b), unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE Packet4Xd pxor(const Packet4Xd& a, const Packet4Xd& b) { - return __riscv_vreinterpret_v_u64m4_f64m4(__riscv_vxor_vv_u64m4(__riscv_vreinterpret_v_f64m4_u64m4(a), - __riscv_vreinterpret_v_f64m4_u64m4(b), - unpacket_traits::size)); + return __riscv_vreinterpret_v_u64m4_f64m4(__riscv_vxor_vv_u64m4( + __riscv_vreinterpret_v_f64m4_u64m4(a), __riscv_vreinterpret_v_f64m4_u64m4(b), unpacket_traits::size)); } template <> @@ -1041,18 +1080,16 @@ EIGEN_STRONG_INLINE Packet4Xd ploadu(const double* from) { template <> EIGEN_STRONG_INLINE Packet4Xd ploaddup(const double* from) { - Packet4Xul idx = __riscv_vid_v_u64m4(unpacket_traits::size); - idx = __riscv_vsll_vx_u64m4(__riscv_vand_vx_u64m4(idx, 0xfffffffffffffffeu, unpacket_traits::size), 2, - unpacket_traits::size); - return __riscv_vloxei64_v_f64m4(from, idx, unpacket_traits::size); + Packet4Xul idx = + __riscv_vsrl_vx_u64m4(__riscv_vid_v_u64m4(unpacket_traits::size), 1, unpacket_traits::size); + return __riscv_vrgather_vv_f64m4(pload(from), idx, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE Packet4Xd ploadquad(const double* from) { - Packet4Xul idx = __riscv_vid_v_u64m4(unpacket_traits::size); - idx = __riscv_vsll_vx_u64m4(__riscv_vand_vx_u64m4(idx, 0xfffffffffffffffcu, unpacket_traits::size), 1, - unpacket_traits::size); - return __riscv_vloxei64_v_f64m4(from, idx, unpacket_traits::size); + Packet4Xul idx = + __riscv_vsrl_vx_u64m4(__riscv_vid_v_u64m4(unpacket_traits::size), 2, unpacket_traits::size); + return __riscv_vrgather_vv_f64m4(pload(from), idx, unpacket_traits::size); } template <> @@ -1092,8 +1129,8 @@ EIGEN_STRONG_INLINE Packet4Xd print(const Packet4Xd& a) { PacketMask16 mask = __riscv_vmfne_vv_f64m4_b16(a, a, unpacket_traits::size); const Packet4Xd x = __riscv_vfadd_vv_f64m4_tumu(mask, a, a, a, unpacket_traits::size); - const Packet4Xd new_x = __riscv_vfcvt_f_x_v_f64m4( - __riscv_vfcvt_x_f_v_i64m4(a, unpacket_traits::size), unpacket_traits::size); + const Packet4Xd new_x = __riscv_vfcvt_f_x_v_f64m4(__riscv_vfcvt_x_f_v_i64m4(a, unpacket_traits::size), + unpacket_traits::size); mask = __riscv_vmflt_vv_f64m4_b16(abs_a, limit, unpacket_traits::size); Packet4Xd signed_x = __riscv_vfsgnj_vv_f64m4(new_x, x, unpacket_traits::size); @@ -1110,9 +1147,8 @@ EIGEN_STRONG_INLINE Packet4Xd pfloor(const Packet4Xd& a) { template <> EIGEN_STRONG_INLINE Packet4Xd preverse(const Packet4Xd& a) { - Packet4Xul idx = - __riscv_vrsub_vx_u64m4(__riscv_vid_v_u64m4(unpacket_traits::size), - unpacket_traits::size - 1, unpacket_traits::size); + Packet4Xul idx = __riscv_vrsub_vx_u64m4(__riscv_vid_v_u64m4(unpacket_traits::size), + unpacket_traits::size - 1, unpacket_traits::size); return __riscv_vrgather_vv_f64m4(a, idx, unpacket_traits::size); } @@ -1130,30 +1166,28 @@ EIGEN_STRONG_INLINE double predux(const Packet4Xd& a) { template <> EIGEN_STRONG_INLINE double predux_mul(const Packet4Xd& a) { Packet1Xd half1 = __riscv_vfmul_vv_f64m1(__riscv_vget_v_f64m4_f64m1(a, 0), __riscv_vget_v_f64m4_f64m1(a, 1), - unpacket_traits::size); + unpacket_traits::size); Packet1Xd half2 = __riscv_vfmul_vv_f64m1(__riscv_vget_v_f64m4_f64m1(a, 2), __riscv_vget_v_f64m4_f64m1(a, 3), - unpacket_traits::size); + unpacket_traits::size); return predux_mul(__riscv_vfmul_vv_f64m1(half1, half2, unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE double predux_min(const Packet4Xd& a) { - return (std::min)(__riscv_vfmv_f(__riscv_vfredmin_vs_f64m4_f64m1( - a, - __riscv_vfmv_v_f_f64m1((std::numeric_limits::quiet_NaN)(), - unpacket_traits::size / 4), - unpacket_traits::size)), - (std::numeric_limits::max)()); + return (std::min)( + __riscv_vfmv_f(__riscv_vfredmin_vs_f64m4_f64m1( + a, __riscv_vfmv_v_f_f64m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size / 4), + unpacket_traits::size)), + (std::numeric_limits::max)()); } template <> EIGEN_STRONG_INLINE double predux_max(const Packet4Xd& a) { - return (std::max)(__riscv_vfmv_f(__riscv_vfredmax_vs_f64m4_f64m1( - a, - __riscv_vfmv_v_f_f64m1((std::numeric_limits::quiet_NaN)(), - unpacket_traits::size / 4), - unpacket_traits::size)), - -(std::numeric_limits::max)()); + return (std::max)( + __riscv_vfmv_f(__riscv_vfredmax_vs_f64m4_f64m1( + a, __riscv_vfmv_v_f_f64m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size / 4), + unpacket_traits::size)), + -(std::numeric_limits::max)()); } template @@ -1326,18 +1360,17 @@ EIGEN_STRONG_INLINE Packet4Xs ploadu(const numext::int16_t* from) { template <> EIGEN_STRONG_INLINE Packet4Xs ploaddup(const numext::int16_t* from) { - Packet4Xsu idx = __riscv_vid_v_u16m4(unpacket_traits::size); - idx = __riscv_vand_vx_u16m4(idx, 0xfffeu, unpacket_traits::size); - // idx = 0 0 sizeof(int16_t) sizeof(int16_t) 2*sizeof(int16_t) 2*sizeof(int16_t) ... - return __riscv_vloxei16_v_i16m4(from, idx, unpacket_traits::size); + Packet4Xsu data = __riscv_vreinterpret_v_i16m4_u16m4(pload(from)); + return __riscv_vreinterpret_v_i32m4_i16m4(__riscv_vreinterpret_v_u32m4_i32m4(__riscv_vlmul_trunc_v_u32m8_u32m4( + __riscv_vwmaccu_vx_u32m8(__riscv_vwaddu_vv_u32m8(data, data, unpacket_traits::size), 0xffffu, data, + unpacket_traits::size)))); } template <> EIGEN_STRONG_INLINE Packet4Xs ploadquad(const numext::int16_t* from) { - Packet4Xsu idx = __riscv_vid_v_u16m4(unpacket_traits::size); - idx = __riscv_vsrl_vx_u16m4(__riscv_vand_vx_u16m4(idx, 0xfffcu, unpacket_traits::size), 1, - unpacket_traits::size); - return __riscv_vloxei16_v_i16m4(from, idx, unpacket_traits::size); + Packet4Xsu idx = + __riscv_vsrl_vx_u16m4(__riscv_vid_v_u16m4(unpacket_traits::size), 2, unpacket_traits::size); + return __riscv_vrgather_vv_i16m4(pload(from), idx, unpacket_traits::size); } template <> @@ -1351,14 +1384,13 @@ EIGEN_STRONG_INLINE void pstoreu(numext::int16_t* to, const Pac } template <> -EIGEN_DEVICE_FUNC inline Packet4Xs pgather(const numext::int16_t* from, - Index stride) { +EIGEN_DEVICE_FUNC inline Packet4Xs pgather(const numext::int16_t* from, Index stride) { return __riscv_vlse16_v_i16m4(from, stride * sizeof(numext::int16_t), unpacket_traits::size); } template <> EIGEN_DEVICE_FUNC inline void pscatter(numext::int16_t* to, const Packet4Xs& from, - Index stride) { + Index stride) { __riscv_vsse16(to, stride * sizeof(numext::int16_t), from, unpacket_traits::size); } @@ -1369,9 +1401,8 @@ EIGEN_STRONG_INLINE numext::int16_t pfirst(const Packet4Xs& a) { template <> EIGEN_STRONG_INLINE Packet4Xs preverse(const Packet4Xs& a) { - Packet4Xsu idx = - __riscv_vrsub_vx_u16m4(__riscv_vid_v_u16m4(unpacket_traits::size), - unpacket_traits::size - 1, unpacket_traits::size); + Packet4Xsu idx = __riscv_vrsub_vx_u16m4(__riscv_vid_v_u16m4(unpacket_traits::size), + unpacket_traits::size - 1, unpacket_traits::size); return __riscv_vrgather_vv_i16m4(a, idx, unpacket_traits::size); } @@ -1384,16 +1415,16 @@ EIGEN_STRONG_INLINE Packet4Xs pabs(const Packet4Xs& a) { template <> EIGEN_STRONG_INLINE numext::int16_t predux(const Packet4Xs& a) { - return __riscv_vmv_x(__riscv_vredsum_vs_i16m4_i16m1( - a, __riscv_vmv_v_x_i16m1(0, unpacket_traits::size / 4), unpacket_traits::size)); + return __riscv_vmv_x(__riscv_vredsum_vs_i16m4_i16m1(a, __riscv_vmv_v_x_i16m1(0, unpacket_traits::size / 4), + unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE numext::int16_t predux_mul(const Packet4Xs& a) { Packet1Xs half1 = __riscv_vmul_vv_i16m1(__riscv_vget_v_i16m4_i16m1(a, 0), __riscv_vget_v_i16m4_i16m1(a, 1), - unpacket_traits::size); + unpacket_traits::size); Packet1Xs half2 = __riscv_vmul_vv_i16m1(__riscv_vget_v_i16m4_i16m1(a, 2), __riscv_vget_v_i16m4_i16m1(a, 3), - unpacket_traits::size); + unpacket_traits::size); return predux_mul(__riscv_vmul_vv_i16m1(half1, half2, unpacket_traits::size)); } diff --git a/Eigen/src/Core/arch/RVV10/PacketMathBF16.h b/Eigen/src/Core/arch/RVV10/PacketMathBF16.h index 80502593c..ec0e42b2a 100644 --- a/Eigen/src/Core/arch/RVV10/PacketMathBF16.h +++ b/Eigen/src/Core/arch/RVV10/PacketMathBF16.h @@ -16,8 +16,7 @@ namespace Eigen { namespace internal { -typedef eigen_packet_wrapper - Packet1Xbf; +typedef eigen_packet_wrapper Packet1Xbf; typedef eigen_packet_wrapper Packet2Xbf; @@ -148,7 +147,8 @@ EIGEN_STRONG_INLINE Packet1Xbf F32ToBf16(const Packet2Xf& a) { template <> EIGEN_STRONG_INLINE Packet1Xbf ptrue(const Packet1Xbf& /*a*/) { - return __riscv_vreinterpret_bf16m1(__riscv_vmv_v_x_u16m1(static_cast(0xffffu), unpacket_traits::size)); + return __riscv_vreinterpret_bf16m1( + __riscv_vmv_v_x_u16m1(static_cast(0xffffu), unpacket_traits::size)); } template <> @@ -159,8 +159,14 @@ EIGEN_STRONG_INLINE Packet1Xbf pzero(const Packet1Xbf& /*a*/) { template <> EIGEN_STRONG_INLINE Packet1Xbf pabs(const Packet1Xbf& a) { - return __riscv_vreinterpret_v_u16m1_bf16m1(__riscv_vand_vx_u16m1( - __riscv_vreinterpret_v_bf16m1_u16m1(a), static_cast(0x7fffu), unpacket_traits::size)); + return __riscv_vreinterpret_v_u16m1_bf16m1(__riscv_vand_vx_u16m1(__riscv_vreinterpret_v_bf16m1_u16m1(a), + static_cast(0x7fffu), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE Packet1Xbf pabsdiff(const Packet1Xbf& a, const Packet1Xbf& b) { + return F32ToBf16(pabsdiff(Bf16ToF32(a), Bf16ToF32(b))); } template <> @@ -179,6 +185,16 @@ EIGEN_STRONG_INLINE Packet1Xbf plset(const bfloat16& a) { return F32ToBf16(plset(static_cast(a))); } +template <> +EIGEN_STRONG_INLINE void pbroadcast4(const bfloat16* a, Packet1Xbf& a0, Packet1Xbf& a1, Packet1Xbf& a2, + Packet1Xbf& a3) { + vint16m1_t aa = __riscv_vle16_v_i16m1(reinterpret_cast(a), 4); + a0 = __riscv_vreinterpret_bf16m1(__riscv_vrgather_vx_i16m1(aa, 0, unpacket_traits::size)); + a1 = __riscv_vreinterpret_bf16m1(__riscv_vrgather_vx_i16m1(aa, 1, unpacket_traits::size)); + a2 = __riscv_vreinterpret_bf16m1(__riscv_vrgather_vx_i16m1(aa, 2, unpacket_traits::size)); + a3 = __riscv_vreinterpret_bf16m1(__riscv_vrgather_vx_i16m1(aa, 3, unpacket_traits::size)); +} + template <> EIGEN_STRONG_INLINE Packet1Xbf padd(const Packet1Xbf& a, const Packet1Xbf& b) { return F32ToBf16(padd(Bf16ToF32(a), Bf16ToF32(b))); @@ -191,14 +207,15 @@ EIGEN_STRONG_INLINE Packet1Xbf psub(const Packet1Xbf& a, const Packe template <> EIGEN_STRONG_INLINE Packet1Xbf pnegate(const Packet1Xbf& a) { - return __riscv_vreinterpret_v_u16m1_bf16m1(__riscv_vxor_vx_u16m1( - __riscv_vreinterpret_v_bf16m1_u16m1(a), static_cast(0x8000u), unpacket_traits::size)); + return __riscv_vreinterpret_v_u16m1_bf16m1(__riscv_vxor_vx_u16m1(__riscv_vreinterpret_v_bf16m1_u16m1(a), + static_cast(0x8000u), + unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE Packet1Xbf psignbit(const Packet1Xbf& a) { - return __riscv_vreinterpret_v_i16m1_bf16m1(__riscv_vsra_vx_i16m1( - __riscv_vreinterpret_v_bf16m1_i16m1(a), 15, unpacket_traits::size)); + return __riscv_vreinterpret_v_i16m1_bf16m1( + __riscv_vsra_vx_i16m1(__riscv_vreinterpret_v_bf16m1_i16m1(a), 15, unpacket_traits::size)); } template <> @@ -224,17 +241,20 @@ EIGEN_STRONG_INLINE Packet1Xbf pmadd(const Packet1Xbf& a, const Packet1Xbf& b, c template <> EIGEN_STRONG_INLINE Packet1Xbf pmsub(const Packet1Xbf& a, const Packet1Xbf& b, const Packet1Xbf& c) { - return F32ToBf16(__riscv_vfwmaccbf16_vv_f32m2(Bf16ToF32(pnegate(c)), a, b, unpacket_traits::size)); + return F32ToBf16( + __riscv_vfwmaccbf16_vv_f32m2(Bf16ToF32(pnegate(c)), a, b, unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE Packet1Xbf pnmadd(const Packet1Xbf& a, const Packet1Xbf& b, const Packet1Xbf& c) { - return F32ToBf16(__riscv_vfwmaccbf16_vv_f32m2(Bf16ToF32(c), pnegate(a), b, unpacket_traits::size)); + return F32ToBf16( + __riscv_vfwmaccbf16_vv_f32m2(Bf16ToF32(c), pnegate(a), b, unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE Packet1Xbf pnmsub(const Packet1Xbf& a, const Packet1Xbf& b, const Packet1Xbf& c) { - return pnegate(F32ToBf16(__riscv_vfwmaccbf16_vv_f32m2(Bf16ToF32(c), a, b, unpacket_traits::size))); + return pnegate( + F32ToBf16(__riscv_vfwmaccbf16_vv_f32m2(Bf16ToF32(c), a, b, unpacket_traits::size))); } template <> @@ -287,23 +307,40 @@ EIGEN_STRONG_INLINE Packet1Xbf pcmp_lt_or_nan(const Packet1Xbf& a, c return F32ToBf16(pcmp_lt_or_nan(Bf16ToF32(a), Bf16ToF32(b))); } +EIGEN_STRONG_INLINE Packet1Xbf pselect(const PacketMask16& mask, const Packet1Xbf& a, const Packet1Xbf& b) { + return __riscv_vreinterpret_v_i16m1_bf16m1(__riscv_vmerge_vvm_i16m1(__riscv_vreinterpret_v_bf16m1_i16m1(b), + __riscv_vreinterpret_v_bf16m1_i16m1(a), mask, + unpacket_traits::size)); +} + +EIGEN_STRONG_INLINE Packet1Xbf pselect(const Packet1Xbf& mask, const Packet1Xbf& a, const Packet1Xbf& b) { + PacketMask16 mask2 = + __riscv_vmsne_vx_i16m1_b16(__riscv_vreinterpret_v_bf16m1_i16m1(mask), 0, unpacket_traits::size); + return __riscv_vreinterpret_v_i16m1_bf16m1(__riscv_vmerge_vvm_i16m1(__riscv_vreinterpret_v_bf16m1_i16m1(b), + __riscv_vreinterpret_v_bf16m1_i16m1(a), mask2, + unpacket_traits::size)); +} + // Logical Operations are not supported for bfloat16, so reinterpret casts template <> EIGEN_STRONG_INLINE Packet1Xbf pand(const Packet1Xbf& a, const Packet1Xbf& b) { - return __riscv_vreinterpret_v_u16m1_bf16m1(__riscv_vand_vv_u16m1( - __riscv_vreinterpret_v_bf16m1_u16m1(a), __riscv_vreinterpret_v_bf16m1_u16m1(b), unpacket_traits::size)); + return __riscv_vreinterpret_v_u16m1_bf16m1(__riscv_vand_vv_u16m1(__riscv_vreinterpret_v_bf16m1_u16m1(a), + __riscv_vreinterpret_v_bf16m1_u16m1(b), + unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE Packet1Xbf por(const Packet1Xbf& a, const Packet1Xbf& b) { - return __riscv_vreinterpret_v_u16m1_bf16m1(__riscv_vor_vv_u16m1( - __riscv_vreinterpret_v_bf16m1_u16m1(a), __riscv_vreinterpret_v_bf16m1_u16m1(b), unpacket_traits::size)); + return __riscv_vreinterpret_v_u16m1_bf16m1(__riscv_vor_vv_u16m1(__riscv_vreinterpret_v_bf16m1_u16m1(a), + __riscv_vreinterpret_v_bf16m1_u16m1(b), + unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE Packet1Xbf pxor(const Packet1Xbf& a, const Packet1Xbf& b) { - return __riscv_vreinterpret_v_u16m1_bf16m1(__riscv_vxor_vv_u16m1( - __riscv_vreinterpret_v_bf16m1_u16m1(a), __riscv_vreinterpret_v_bf16m1_u16m1(b), unpacket_traits::size)); + return __riscv_vreinterpret_v_u16m1_bf16m1(__riscv_vxor_vv_u16m1(__riscv_vreinterpret_v_bf16m1_u16m1(a), + __riscv_vreinterpret_v_bf16m1_u16m1(b), + unpacket_traits::size)); } template <> @@ -317,46 +354,48 @@ EIGEN_STRONG_INLINE Packet1Xbf pandnot(const Packet1Xbf& a, const Pa template <> EIGEN_STRONG_INLINE Packet1Xbf pload(const bfloat16* from) { EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle16_v_bf16m1(reinterpret_cast(from), - unpacket_traits::size); + unpacket_traits::size); } template <> EIGEN_STRONG_INLINE Packet1Xbf ploadu(const bfloat16* from) { EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle16_v_bf16m1(reinterpret_cast(from), - unpacket_traits::size); + unpacket_traits::size); } template <> EIGEN_STRONG_INLINE Packet1Xbf ploaddup(const bfloat16* from) { - Packet1Xsu idx = __riscv_vid_v_u16m1(unpacket_traits::size); - idx = __riscv_vand_vx_u16m1(idx, static_cast(0xfffeu), unpacket_traits::size); - return __riscv_vloxei16_v_bf16m1(reinterpret_cast(from), idx, unpacket_traits::size); + Packet1Xsu data = __riscv_vreinterpret_v_bf16m1_u16m1(pload(from)); + return __riscv_vreinterpret_v_i16m1_bf16m1( + __riscv_vreinterpret_v_i32m1_i16m1(__riscv_vreinterpret_v_u32m1_i32m1(__riscv_vlmul_trunc_v_u32m2_u32m1( + __riscv_vwmaccu_vx_u32m2(__riscv_vwaddu_vv_u32m2(data, data, unpacket_traits::size), 0xffffu, data, + unpacket_traits::size))))); } template <> EIGEN_STRONG_INLINE Packet1Xbf ploadquad(const bfloat16* from) { - Packet1Xsu idx = __riscv_vid_v_u16m1(unpacket_traits::size); - idx = __riscv_vsrl_vx_u16m1(__riscv_vand_vx_u16m1(idx, static_cast(0xfffcu), unpacket_traits::size), 1, - unpacket_traits::size); - return __riscv_vloxei16_v_bf16m1(reinterpret_cast(from), idx, unpacket_traits::size); + Packet1Xsu idx = __riscv_vsrl_vx_u16m1(__riscv_vid_v_u16m1(unpacket_traits::size), 2, + unpacket_traits::size); + return __riscv_vreinterpret_v_i16m1_bf16m1(__riscv_vrgather_vv_i16m1( + pload(reinterpret_cast(from)), idx, unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE void pstore(bfloat16* to, const Packet1Xbf& from) { EIGEN_DEBUG_ALIGNED_STORE __riscv_vse16_v_bf16m1(reinterpret_cast<__bf16*>(to), from, - unpacket_traits::size); + unpacket_traits::size); } template <> EIGEN_STRONG_INLINE void pstoreu(bfloat16* to, const Packet1Xbf& from) { EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse16_v_bf16m1(reinterpret_cast<__bf16*>(to), from, - unpacket_traits::size); + unpacket_traits::size); } template <> EIGEN_DEVICE_FUNC inline Packet1Xbf pgather(const bfloat16* from, Index stride) { return __riscv_vlse16_v_bf16m1(reinterpret_cast(from), stride * sizeof(bfloat16), - unpacket_traits::size); + unpacket_traits::size); } template <> @@ -421,7 +460,7 @@ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { for (i = 0; i < N; i++) { kernel.packet[i] = __riscv_vle16_v_bf16m1(reinterpret_cast<__bf16*>(&buffer[i * unpacket_traits::size]), - unpacket_traits::size); + unpacket_traits::size); } } @@ -437,7 +476,8 @@ EIGEN_STRONG_INLINE Packet2Xbf F32ToBf16(const Packet4Xf& a) { template <> EIGEN_STRONG_INLINE Packet2Xbf ptrue(const Packet2Xbf& /*a*/) { - return __riscv_vreinterpret_bf16m2(__riscv_vmv_v_x_u16m2(static_cast(0xffffu), unpacket_traits::size)); + return __riscv_vreinterpret_bf16m2( + __riscv_vmv_v_x_u16m2(static_cast(0xffffu), unpacket_traits::size)); } template <> @@ -448,8 +488,14 @@ EIGEN_STRONG_INLINE Packet2Xbf pzero(const Packet2Xbf& /*a*/) { template <> EIGEN_STRONG_INLINE Packet2Xbf pabs(const Packet2Xbf& a) { - return __riscv_vreinterpret_v_u16m2_bf16m2(__riscv_vand_vx_u16m2( - __riscv_vreinterpret_v_bf16m2_u16m2(a), static_cast(0x7fffu), unpacket_traits::size)); + return __riscv_vreinterpret_v_u16m2_bf16m2(__riscv_vand_vx_u16m2(__riscv_vreinterpret_v_bf16m2_u16m2(a), + static_cast(0x7fffu), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xbf pabsdiff(const Packet2Xbf& a, const Packet2Xbf& b) { + return F32ToBf16(pabsdiff(Bf16ToF32(a), Bf16ToF32(b))); } template <> @@ -468,6 +514,16 @@ EIGEN_STRONG_INLINE Packet2Xbf plset(const bfloat16& a) { return F32ToBf16(plset(static_cast(a))); } +template <> +EIGEN_STRONG_INLINE void pbroadcast4(const bfloat16* a, Packet2Xbf& a0, Packet2Xbf& a1, Packet2Xbf& a2, + Packet2Xbf& a3) { + vint16m2_t aa = __riscv_vle16_v_i16m2(reinterpret_cast(a), 4); + a0 = __riscv_vreinterpret_bf16m2(__riscv_vrgather_vx_i16m2(aa, 0, unpacket_traits::size)); + a1 = __riscv_vreinterpret_bf16m2(__riscv_vrgather_vx_i16m2(aa, 1, unpacket_traits::size)); + a2 = __riscv_vreinterpret_bf16m2(__riscv_vrgather_vx_i16m2(aa, 2, unpacket_traits::size)); + a3 = __riscv_vreinterpret_bf16m2(__riscv_vrgather_vx_i16m2(aa, 3, unpacket_traits::size)); +} + template <> EIGEN_STRONG_INLINE Packet2Xbf padd(const Packet2Xbf& a, const Packet2Xbf& b) { return F32ToBf16(padd(Bf16ToF32(a), Bf16ToF32(b))); @@ -480,14 +536,15 @@ EIGEN_STRONG_INLINE Packet2Xbf psub(const Packet2Xbf& a, const Packe template <> EIGEN_STRONG_INLINE Packet2Xbf pnegate(const Packet2Xbf& a) { - return __riscv_vreinterpret_v_u16m2_bf16m2(__riscv_vxor_vx_u16m2( - __riscv_vreinterpret_v_bf16m2_u16m2(a), static_cast(0x8000u), unpacket_traits::size)); + return __riscv_vreinterpret_v_u16m2_bf16m2(__riscv_vxor_vx_u16m2(__riscv_vreinterpret_v_bf16m2_u16m2(a), + static_cast(0x8000u), + unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE Packet2Xbf psignbit(const Packet2Xbf& a) { - return __riscv_vreinterpret_v_i16m2_bf16m2(__riscv_vsra_vx_i16m2( - __riscv_vreinterpret_v_bf16m2_i16m2(a), 15, unpacket_traits::size)); + return __riscv_vreinterpret_v_i16m2_bf16m2( + __riscv_vsra_vx_i16m2(__riscv_vreinterpret_v_bf16m2_i16m2(a), 15, unpacket_traits::size)); } template <> @@ -513,17 +570,20 @@ EIGEN_STRONG_INLINE Packet2Xbf pmadd(const Packet2Xbf& a, const Packet2Xbf& b, c template <> EIGEN_STRONG_INLINE Packet2Xbf pmsub(const Packet2Xbf& a, const Packet2Xbf& b, const Packet2Xbf& c) { - return F32ToBf16(__riscv_vfwmaccbf16_vv_f32m4(Bf16ToF32(pnegate(c)), a, b, unpacket_traits::size)); + return F32ToBf16( + __riscv_vfwmaccbf16_vv_f32m4(Bf16ToF32(pnegate(c)), a, b, unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE Packet2Xbf pnmadd(const Packet2Xbf& a, const Packet2Xbf& b, const Packet2Xbf& c) { - return F32ToBf16(__riscv_vfwmaccbf16_vv_f32m4(Bf16ToF32(c), pnegate(a), b, unpacket_traits::size)); + return F32ToBf16( + __riscv_vfwmaccbf16_vv_f32m4(Bf16ToF32(c), pnegate(a), b, unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE Packet2Xbf pnmsub(const Packet2Xbf& a, const Packet2Xbf& b, const Packet2Xbf& c) { - return pnegate(F32ToBf16(__riscv_vfwmaccbf16_vv_f32m4(Bf16ToF32(c), a, b, unpacket_traits::size))); + return pnegate( + F32ToBf16(__riscv_vfwmaccbf16_vv_f32m4(Bf16ToF32(c), a, b, unpacket_traits::size))); } template <> @@ -576,26 +636,40 @@ EIGEN_STRONG_INLINE Packet2Xbf pcmp_lt_or_nan(const Packet2Xbf& a, c return F32ToBf16(pcmp_lt_or_nan(Bf16ToF32(a), Bf16ToF32(b))); } +EIGEN_STRONG_INLINE Packet2Xbf pselect(const PacketMask8& mask, const Packet2Xbf& a, const Packet2Xbf& b) { + return __riscv_vreinterpret_v_i16m2_bf16m2(__riscv_vmerge_vvm_i16m2(__riscv_vreinterpret_v_bf16m2_i16m2(b), + __riscv_vreinterpret_v_bf16m2_i16m2(a), mask, + unpacket_traits::size)); +} + +EIGEN_STRONG_INLINE Packet2Xbf pselect(const Packet2Xbf& mask, const Packet2Xbf& a, const Packet2Xbf& b) { + PacketMask8 mask2 = + __riscv_vmsne_vx_i16m2_b8(__riscv_vreinterpret_v_bf16m2_i16m2(mask), 0, unpacket_traits::size); + return __riscv_vreinterpret_v_i16m2_bf16m2(__riscv_vmerge_vvm_i16m2(__riscv_vreinterpret_v_bf16m2_i16m2(b), + __riscv_vreinterpret_v_bf16m2_i16m2(a), mask2, + unpacket_traits::size)); +} + // Logical Operations are not supported for bflaot16, so reinterpret casts template <> EIGEN_STRONG_INLINE Packet2Xbf pand(const Packet2Xbf& a, const Packet2Xbf& b) { return __riscv_vreinterpret_v_u16m2_bf16m2(__riscv_vand_vv_u16m2(__riscv_vreinterpret_v_bf16m2_u16m2(a), - __riscv_vreinterpret_v_bf16m2_u16m2(b), - unpacket_traits::size)); + __riscv_vreinterpret_v_bf16m2_u16m2(b), + unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE Packet2Xbf por(const Packet2Xbf& a, const Packet2Xbf& b) { return __riscv_vreinterpret_v_u16m2_bf16m2(__riscv_vor_vv_u16m2(__riscv_vreinterpret_v_bf16m2_u16m2(a), - __riscv_vreinterpret_v_bf16m2_u16m2(b), - unpacket_traits::size)); + __riscv_vreinterpret_v_bf16m2_u16m2(b), + unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE Packet2Xbf pxor(const Packet2Xbf& a, const Packet2Xbf& b) { return __riscv_vreinterpret_v_u16m2_bf16m2(__riscv_vxor_vv_u16m2(__riscv_vreinterpret_v_bf16m2_u16m2(a), - __riscv_vreinterpret_v_bf16m2_u16m2(b), - unpacket_traits::size)); + __riscv_vreinterpret_v_bf16m2_u16m2(b), + unpacket_traits::size)); } template <> @@ -609,58 +683,58 @@ EIGEN_STRONG_INLINE Packet2Xbf pandnot(const Packet2Xbf& a, const Pa template <> EIGEN_STRONG_INLINE Packet2Xbf pload(const bfloat16* from) { EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle16_v_bf16m2(reinterpret_cast(from), - unpacket_traits::size); + unpacket_traits::size); } template <> EIGEN_STRONG_INLINE Packet2Xbf ploadu(const bfloat16* from) { EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle16_v_bf16m2(reinterpret_cast(from), - unpacket_traits::size); + unpacket_traits::size); } template <> EIGEN_STRONG_INLINE Packet2Xbf ploaddup(const bfloat16* from) { - Packet2Xsu idx = __riscv_vid_v_u16m2(unpacket_traits::size); - idx = __riscv_vand_vx_u16m2(idx, static_cast(0xfffeu), unpacket_traits::size); - return __riscv_vloxei16_v_bf16m2(reinterpret_cast(from), idx, unpacket_traits::size); + Packet2Xsu data = __riscv_vreinterpret_v_bf16m2_u16m2(pload(from)); + return __riscv_vreinterpret_v_i16m2_bf16m2( + __riscv_vreinterpret_v_i32m2_i16m2(__riscv_vreinterpret_v_u32m2_i32m2(__riscv_vlmul_trunc_v_u32m4_u32m2( + __riscv_vwmaccu_vx_u32m4(__riscv_vwaddu_vv_u32m4(data, data, unpacket_traits::size), 0xffffu, data, + unpacket_traits::size))))); } template <> EIGEN_STRONG_INLINE Packet2Xbf ploadquad(const bfloat16* from) { - Packet2Xsu idx = __riscv_vid_v_u16m2(unpacket_traits::size); - idx = __riscv_vsrl_vx_u16m2(__riscv_vand_vx_u16m2(idx, static_cast(0xfffcu), unpacket_traits::size), 1, - unpacket_traits::size); - return __riscv_vloxei16_v_bf16m2(reinterpret_cast(from), idx, unpacket_traits::size); + Packet2Xsu idx = __riscv_vsrl_vx_u16m2(__riscv_vid_v_u16m2(unpacket_traits::size), 2, + unpacket_traits::size); + return __riscv_vreinterpret_v_i16m2_bf16m2(__riscv_vrgather_vv_i16m2( + pload(reinterpret_cast(from)), idx, unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE void pstore(bfloat16* to, const Packet2Xbf& from) { EIGEN_DEBUG_ALIGNED_STORE __riscv_vse16_v_bf16m2(reinterpret_cast<__bf16*>(to), from, - unpacket_traits::size); + unpacket_traits::size); } template <> EIGEN_STRONG_INLINE void pstoreu(bfloat16* to, const Packet2Xbf& from) { EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse16_v_bf16m2(reinterpret_cast<__bf16*>(to), from, - unpacket_traits::size); + unpacket_traits::size); } template <> EIGEN_DEVICE_FUNC inline Packet2Xbf pgather(const bfloat16* from, Index stride) { return __riscv_vlse16_v_bf16m2(reinterpret_cast(from), stride * sizeof(bfloat16), - unpacket_traits::size); + unpacket_traits::size); } template <> -EIGEN_DEVICE_FUNC inline void pscatter(bfloat16* to, const Packet2Xbf& from, - Index stride) { - __riscv_vsse16(reinterpret_cast<__bf16*>(to), stride * sizeof(bfloat16), from, - unpacket_traits::size); +EIGEN_DEVICE_FUNC inline void pscatter(bfloat16* to, const Packet2Xbf& from, Index stride) { + __riscv_vsse16(reinterpret_cast<__bf16*>(to), stride * sizeof(bfloat16), from, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE bfloat16 pfirst(const Packet2Xbf& a) { - return static_cast(__riscv_vmv_x_s_i16m2_i16(__riscv_vreinterpret_v_bf16m2_i16m2(a))); + return numext::bit_cast(__riscv_vmv_x_s_i16m2_i16(__riscv_vreinterpret_v_bf16m2_i16m2(a))); } template <> @@ -714,17 +788,16 @@ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { } for (i = 0; i < N; i++) { - kernel.packet[i] = - __riscv_vle16_v_bf16m2(reinterpret_cast<__bf16*>(&buffer[i * unpacket_traits::size]), - unpacket_traits::size); + kernel.packet[i] = __riscv_vle16_v_bf16m2(reinterpret_cast<__bf16*>(&buffer[i * unpacket_traits::size]), + unpacket_traits::size); } } template EIGEN_STRONG_INLINE -typename std::enable_if::value && (unpacket_traits::size % 8) == 0, - Packet1Xbf>::type -predux_half(const Packet2Xbf& a) { + typename std::enable_if::value && (unpacket_traits::size % 8) == 0, + Packet1Xbf>::type + predux_half(const Packet2Xbf& a) { return padd(__riscv_vget_v_bf16m2_bf16m1(a, 0), __riscv_vget_v_bf16m2_bf16m1(a, 1)); } diff --git a/Eigen/src/Core/arch/RVV10/PacketMathFP16.h b/Eigen/src/Core/arch/RVV10/PacketMathFP16.h index f3e5924c1..517b703e3 100644 --- a/Eigen/src/Core/arch/RVV10/PacketMathFP16.h +++ b/Eigen/src/Core/arch/RVV10/PacketMathFP16.h @@ -16,8 +16,7 @@ namespace Eigen { namespace internal { -typedef eigen_packet_wrapper - Packet1Xh; +typedef eigen_packet_wrapper Packet1Xh; typedef eigen_packet_wrapper Packet2Xh; @@ -155,6 +154,12 @@ EIGEN_STRONG_INLINE Packet1Xh pabs(const Packet1Xh& a) { return __riscv_vfabs_v_f16m1(a, unpacket_traits::size); } +template <> +EIGEN_STRONG_INLINE Packet1Xh pabsdiff(const Packet1Xh& a, const Packet1Xh& b) { + return __riscv_vfabs_v_f16m1(__riscv_vfsub_vv_f16m1(a, b, unpacket_traits::size), + unpacket_traits::size); +} + template <> EIGEN_STRONG_INLINE Packet1Xh pset1(const Eigen::half& from) { return __riscv_vfmv_v_f_f16m1(numext::bit_cast<_Float16>(from), unpacket_traits::size); @@ -167,12 +172,22 @@ EIGEN_STRONG_INLINE Packet1Xh pset1frombits(numext::uint16_t from) { template <> EIGEN_STRONG_INLINE Packet1Xh plset(const Eigen::half& a) { - Packet1Xh idx = - __riscv_vfcvt_f_x_v_f16m1(__riscv_vreinterpret_v_u16m1_i16m1(__riscv_vid_v_u16m1(unpacket_traits::size)), + Packet1Xh idx = __riscv_vfcvt_f_x_v_f16m1( + __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vid_v_u16m1(unpacket_traits::size)), unpacket_traits::size); return __riscv_vfadd_vf_f16m1(idx, numext::bit_cast<_Float16>(a), unpacket_traits::size); } +template <> +EIGEN_STRONG_INLINE void pbroadcast4(const Eigen::half* a, Packet1Xh& a0, Packet1Xh& a1, Packet1Xh& a2, + Packet1Xh& a3) { + vfloat16m1_t aa = __riscv_vle16_v_f16m1(reinterpret_cast(a), 4); + a0 = __riscv_vrgather_vx_f16m1(aa, 0, unpacket_traits::size); + a1 = __riscv_vrgather_vx_f16m1(aa, 1, unpacket_traits::size); + a2 = __riscv_vrgather_vx_f16m1(aa, 2, unpacket_traits::size); + a3 = __riscv_vrgather_vx_f16m1(aa, 3, unpacket_traits::size); +} + template <> EIGEN_STRONG_INLINE Packet1Xh padd(const Packet1Xh& a, const Packet1Xh& b) { return __riscv_vfadd_vv_f16m1(a, b, unpacket_traits::size); @@ -188,6 +203,12 @@ EIGEN_STRONG_INLINE Packet1Xh pnegate(const Packet1Xh& a) { return __riscv_vfneg_v_f16m1(a, unpacket_traits::size); } +template <> +EIGEN_STRONG_INLINE Packet1Xh psignbit(const Packet1Xh& a) { + return __riscv_vreinterpret_v_i16m1_f16m1( + __riscv_vsra_vx_i16m1(__riscv_vreinterpret_v_f16m1_i16m1(a), 15, unpacket_traits::size)); +} + template <> EIGEN_STRONG_INLINE Packet1Xh pconj(const Packet1Xh& a) { return a; @@ -226,8 +247,7 @@ EIGEN_STRONG_INLINE Packet1Xh pnmsub(const Packet1Xh& a, const Packet1Xh& b, con template <> EIGEN_STRONG_INLINE Packet1Xh pmin(const Packet1Xh& a, const Packet1Xh& b) { const Eigen::half nan = (std::numeric_limits::quiet_NaN)(); - Packet1Xh nans = - __riscv_vfmv_v_f_f16m1(numext::bit_cast<_Float16>(nan), unpacket_traits::size); + Packet1Xh nans = __riscv_vfmv_v_f_f16m1(numext::bit_cast<_Float16>(nan), unpacket_traits::size); PacketMask16 mask = __riscv_vmfeq_vv_f16m1_b16(a, a, unpacket_traits::size); PacketMask16 mask2 = __riscv_vmfeq_vv_f16m1_b16(b, b, unpacket_traits::size); mask = __riscv_vmand_mm_b16(mask, mask2, unpacket_traits::size); @@ -248,8 +268,7 @@ EIGEN_STRONG_INLINE Packet1Xh pmin(const Packet1Xh& template <> EIGEN_STRONG_INLINE Packet1Xh pmax(const Packet1Xh& a, const Packet1Xh& b) { const Eigen::half nan = (std::numeric_limits::quiet_NaN)(); - Packet1Xh nans = - __riscv_vfmv_v_f_f16m1(numext::bit_cast<_Float16>(nan), unpacket_traits::size); + Packet1Xh nans = __riscv_vfmv_v_f_f16m1(numext::bit_cast<_Float16>(nan), unpacket_traits::size); PacketMask16 mask = __riscv_vmfeq_vv_f16m1_b16(a, a, unpacket_traits::size); PacketMask16 mask2 = __riscv_vmfeq_vv_f16m1_b16(b, b, unpacket_traits::size); mask = __riscv_vmand_mm_b16(mask, mask2, unpacket_traits::size); @@ -292,6 +311,18 @@ EIGEN_STRONG_INLINE Packet1Xh pcmp_lt_or_nan(const Packet1Xh& a, cons unpacket_traits::size); } +EIGEN_STRONG_INLINE Packet1Xh pselect(const PacketMask16& mask, const Packet1Xh& a, const Packet1Xh& b) { + return __riscv_vmerge_vvm_f16m1(b, a, mask, unpacket_traits::size); +} + +EIGEN_STRONG_INLINE Packet1Xh pselect(const Packet1Xh& mask, const Packet1Xh& a, const Packet1Xh& b) { + PacketMask16 mask2 = + __riscv_vmsne_vx_i16m1_b16(__riscv_vreinterpret_v_f16m1_i16m1(mask), 0, unpacket_traits::size); + return __riscv_vreinterpret_v_i16m1_f16m1(__riscv_vmerge_vvm_i16m1(__riscv_vreinterpret_v_f16m1_i16m1(b), + __riscv_vreinterpret_v_f16m1_i16m1(a), mask2, + unpacket_traits::size)); +} + // Logical Operations are not supported for half, so reinterpret casts template <> EIGEN_STRONG_INLINE Packet1Xh pand(const Packet1Xh& a, const Packet1Xh& b) { @@ -333,17 +364,18 @@ EIGEN_STRONG_INLINE Packet1Xh ploadu(const Eigen::half* from) { template <> EIGEN_STRONG_INLINE Packet1Xh ploaddup(const Eigen::half* from) { - Packet1Xsu idx = __riscv_vid_v_u16m1(unpacket_traits::size); - idx = __riscv_vand_vx_u16m1(idx, 0xfffeu, unpacket_traits::size); - return __riscv_vloxei16_v_f16m1(reinterpret_cast(from), idx, unpacket_traits::size); + Packet1Xsu data = __riscv_vreinterpret_v_f16m1_u16m1(pload(from)); + return __riscv_vreinterpret_v_i16m1_f16m1( + __riscv_vreinterpret_v_i32m1_i16m1(__riscv_vreinterpret_v_u32m1_i32m1(__riscv_vlmul_trunc_v_u32m2_u32m1( + __riscv_vwmaccu_vx_u32m2(__riscv_vwaddu_vv_u32m2(data, data, unpacket_traits::size), 0xffffu, data, + unpacket_traits::size))))); } template <> EIGEN_STRONG_INLINE Packet1Xh ploadquad(const Eigen::half* from) { - Packet1Xsu idx = __riscv_vid_v_u16m1(unpacket_traits::size); - idx = __riscv_vsrl_vx_u16m1(__riscv_vand_vx_u16m1(idx, 0xfffcu, unpacket_traits::size), 1, - unpacket_traits::size); - return __riscv_vloxei16_v_f16m1(reinterpret_cast(from), idx, unpacket_traits::size); + Packet1Xsu idx = + __riscv_vsrl_vx_u16m1(__riscv_vid_v_u16m1(unpacket_traits::size), 2, unpacket_traits::size); + return __riscv_vrgather_vv_f16m1(pload(from), idx, unpacket_traits::size); } template <> @@ -387,7 +419,7 @@ EIGEN_STRONG_INLINE Packet1Xh print(const Packet1Xh& a) { PacketMask16 mask = __riscv_vmfne_vv_f16m1_b16(a, a, unpacket_traits::size); const Packet1Xh x = __riscv_vfadd_vv_f16m1_tumu(mask, a, a, a, unpacket_traits::size); const Packet1Xh new_x = __riscv_vfcvt_f_x_v_f16m1(__riscv_vfcvt_x_f_v_i16m1(a, unpacket_traits::size), - unpacket_traits::size); + unpacket_traits::size); mask = __riscv_vmflt_vv_f16m1_b16(abs_a, limit, unpacket_traits::size); Packet1Xh signed_x = __riscv_vfsgnj_vv_f16m1(new_x, x, unpacket_traits::size); @@ -405,7 +437,7 @@ EIGEN_STRONG_INLINE Packet1Xh pfloor(const Packet1Xh& a) { template <> EIGEN_STRONG_INLINE Packet1Xh preverse(const Packet1Xh& a) { Packet1Xsu idx = __riscv_vrsub_vx_u16m1(__riscv_vid_v_u16m1(unpacket_traits::size), - unpacket_traits::size - 1, unpacket_traits::size); + unpacket_traits::size - 1, unpacket_traits::size); return __riscv_vrgather_vv_f16m1(a, idx, unpacket_traits::size); } @@ -502,6 +534,12 @@ EIGEN_STRONG_INLINE Packet2Xh pabs(const Packet2Xh& a) { return __riscv_vfabs_v_f16m2(a, unpacket_traits::size); } +template <> +EIGEN_STRONG_INLINE Packet2Xh pabsdiff(const Packet2Xh& a, const Packet2Xh& b) { + return __riscv_vfabs_v_f16m2(__riscv_vfsub_vv_f16m2(a, b, unpacket_traits::size), + unpacket_traits::size); +} + template <> EIGEN_STRONG_INLINE Packet2Xh pset1(const Eigen::half& from) { return __riscv_vfmv_v_f_f16m2(numext::bit_cast<_Float16>(from), unpacket_traits::size); @@ -520,6 +558,16 @@ EIGEN_STRONG_INLINE Packet2Xh plset(const Eigen::half& a) { return __riscv_vfadd_vf_f16m2(idx, numext::bit_cast<_Float16>(a), unpacket_traits::size); } +template <> +EIGEN_STRONG_INLINE void pbroadcast4(const Eigen::half* a, Packet2Xh& a0, Packet2Xh& a1, Packet2Xh& a2, + Packet2Xh& a3) { + vfloat16m2_t aa = __riscv_vle16_v_f16m2(reinterpret_cast(a), 4); + a0 = __riscv_vrgather_vx_f16m2(aa, 0, unpacket_traits::size); + a1 = __riscv_vrgather_vx_f16m2(aa, 1, unpacket_traits::size); + a2 = __riscv_vrgather_vx_f16m2(aa, 2, unpacket_traits::size); + a3 = __riscv_vrgather_vx_f16m2(aa, 3, unpacket_traits::size); +} + template <> EIGEN_STRONG_INLINE Packet2Xh padd(const Packet2Xh& a, const Packet2Xh& b) { return __riscv_vfadd_vv_f16m2(a, b, unpacket_traits::size); @@ -535,6 +583,12 @@ EIGEN_STRONG_INLINE Packet2Xh pnegate(const Packet2Xh& a) { return __riscv_vfneg_v_f16m2(a, unpacket_traits::size); } +template <> +EIGEN_STRONG_INLINE Packet2Xh psignbit(const Packet2Xh& a) { + return __riscv_vreinterpret_v_i16m2_f16m2( + __riscv_vsra_vx_i16m2(__riscv_vreinterpret_v_f16m2_i16m2(a), 15, unpacket_traits::size)); +} + template <> EIGEN_STRONG_INLINE Packet2Xh pconj(const Packet2Xh& a) { return a; @@ -573,8 +627,7 @@ EIGEN_STRONG_INLINE Packet2Xh pnmsub(const Packet2Xh& a, const Packet2Xh& b, con template <> EIGEN_STRONG_INLINE Packet2Xh pmin(const Packet2Xh& a, const Packet2Xh& b) { const Eigen::half nan = (std::numeric_limits::quiet_NaN)(); - Packet2Xh nans = - __riscv_vfmv_v_f_f16m2(numext::bit_cast<_Float16>(nan), unpacket_traits::size); + Packet2Xh nans = __riscv_vfmv_v_f_f16m2(numext::bit_cast<_Float16>(nan), unpacket_traits::size); PacketMask8 mask = __riscv_vmfeq_vv_f16m2_b8(a, a, unpacket_traits::size); PacketMask8 mask2 = __riscv_vmfeq_vv_f16m2_b8(b, b, unpacket_traits::size); mask = __riscv_vmand_mm_b8(mask, mask2, unpacket_traits::size); @@ -595,8 +648,7 @@ EIGEN_STRONG_INLINE Packet2Xh pmin(const Packet2Xh& template <> EIGEN_STRONG_INLINE Packet2Xh pmax(const Packet2Xh& a, const Packet2Xh& b) { const Eigen::half nan = (std::numeric_limits::quiet_NaN)(); - Packet2Xh nans = - __riscv_vfmv_v_f_f16m2(numext::bit_cast<_Float16>(nan), unpacket_traits::size); + Packet2Xh nans = __riscv_vfmv_v_f_f16m2(numext::bit_cast<_Float16>(nan), unpacket_traits::size); PacketMask8 mask = __riscv_vmfeq_vv_f16m2_b8(a, a, unpacket_traits::size); PacketMask8 mask2 = __riscv_vmfeq_vv_f16m2_b8(b, b, unpacket_traits::size); mask = __riscv_vmand_mm_b8(mask, mask2, unpacket_traits::size); @@ -617,22 +669,19 @@ EIGEN_STRONG_INLINE Packet2Xh pmax(const Packet2Xh& template <> EIGEN_STRONG_INLINE Packet2Xh pcmp_le(const Packet2Xh& a, const Packet2Xh& b) { PacketMask8 mask = __riscv_vmfle_vv_f16m2_b8(a, b, unpacket_traits::size); - return __riscv_vmerge_vvm_f16m2(pzero(a), ptrue(a), mask, - unpacket_traits::size); + return __riscv_vmerge_vvm_f16m2(pzero(a), ptrue(a), mask, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE Packet2Xh pcmp_lt(const Packet2Xh& a, const Packet2Xh& b) { PacketMask8 mask = __riscv_vmflt_vv_f16m2_b8(a, b, unpacket_traits::size); - return __riscv_vmerge_vvm_f16m2(pzero(a), ptrue(a), mask, - unpacket_traits::size); + return __riscv_vmerge_vvm_f16m2(pzero(a), ptrue(a), mask, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE Packet2Xh pcmp_eq(const Packet2Xh& a, const Packet2Xh& b) { PacketMask8 mask = __riscv_vmfeq_vv_f16m2_b8(a, b, unpacket_traits::size); - return __riscv_vmerge_vvm_f16m2(pzero(a), ptrue(a), mask, - unpacket_traits::size); + return __riscv_vmerge_vvm_f16m2(pzero(a), ptrue(a), mask, unpacket_traits::size); } template <> @@ -642,26 +691,35 @@ EIGEN_STRONG_INLINE Packet2Xh pcmp_lt_or_nan(const Packet2Xh& a, cons unpacket_traits::size); } +EIGEN_STRONG_INLINE Packet2Xh pselect(const PacketMask8& mask, const Packet2Xh& a, const Packet2Xh& b) { + return __riscv_vmerge_vvm_f16m2(b, a, mask, unpacket_traits::size); +} + +EIGEN_STRONG_INLINE Packet2Xh pselect(const Packet2Xh& mask, const Packet2Xh& a, const Packet2Xh& b) { + PacketMask8 mask2 = + __riscv_vmsne_vx_i16m2_b8(__riscv_vreinterpret_v_f16m2_i16m2(mask), 0, unpacket_traits::size); + return __riscv_vreinterpret_v_i16m2_f16m2(__riscv_vmerge_vvm_i16m2(__riscv_vreinterpret_v_f16m2_i16m2(b), + __riscv_vreinterpret_v_f16m2_i16m2(a), mask2, + unpacket_traits::size)); +} + // Logical Operations are not supported for half, so reinterpret casts template <> EIGEN_STRONG_INLINE Packet2Xh pand(const Packet2Xh& a, const Packet2Xh& b) { - return __riscv_vreinterpret_v_u16m2_f16m2(__riscv_vand_vv_u16m2(__riscv_vreinterpret_v_f16m2_u16m2(a), - __riscv_vreinterpret_v_f16m2_u16m2(b), - unpacket_traits::size)); + return __riscv_vreinterpret_v_u16m2_f16m2(__riscv_vand_vv_u16m2( + __riscv_vreinterpret_v_f16m2_u16m2(a), __riscv_vreinterpret_v_f16m2_u16m2(b), unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE Packet2Xh por(const Packet2Xh& a, const Packet2Xh& b) { - return __riscv_vreinterpret_v_u16m2_f16m2(__riscv_vor_vv_u16m2(__riscv_vreinterpret_v_f16m2_u16m2(a), - __riscv_vreinterpret_v_f16m2_u16m2(b), - unpacket_traits::size)); + return __riscv_vreinterpret_v_u16m2_f16m2(__riscv_vor_vv_u16m2( + __riscv_vreinterpret_v_f16m2_u16m2(a), __riscv_vreinterpret_v_f16m2_u16m2(b), unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE Packet2Xh pxor(const Packet2Xh& a, const Packet2Xh& b) { - return __riscv_vreinterpret_v_u16m2_f16m2(__riscv_vxor_vv_u16m2(__riscv_vreinterpret_v_f16m2_u16m2(a), - __riscv_vreinterpret_v_f16m2_u16m2(b), - unpacket_traits::size)); + return __riscv_vreinterpret_v_u16m2_f16m2(__riscv_vxor_vv_u16m2( + __riscv_vreinterpret_v_f16m2_u16m2(a), __riscv_vreinterpret_v_f16m2_u16m2(b), unpacket_traits::size)); } template <> @@ -686,17 +744,18 @@ EIGEN_STRONG_INLINE Packet2Xh ploadu(const Eigen::half* from) { template <> EIGEN_STRONG_INLINE Packet2Xh ploaddup(const Eigen::half* from) { - Packet2Xsu idx = __riscv_vid_v_u16m2(unpacket_traits::size); - idx = __riscv_vand_vx_u16m2(idx, 0xfffeu, unpacket_traits::size); - return __riscv_vloxei16_v_f16m2(reinterpret_cast(from), idx, unpacket_traits::size); + Packet2Xsu data = __riscv_vreinterpret_v_f16m2_u16m2(pload(from)); + return __riscv_vreinterpret_v_i16m2_f16m2( + __riscv_vreinterpret_v_i32m2_i16m2(__riscv_vreinterpret_v_u32m2_i32m2(__riscv_vlmul_trunc_v_u32m4_u32m2( + __riscv_vwmaccu_vx_u32m4(__riscv_vwaddu_vv_u32m4(data, data, unpacket_traits::size), 0xffffu, data, + unpacket_traits::size))))); } template <> EIGEN_STRONG_INLINE Packet2Xh ploadquad(const Eigen::half* from) { - Packet2Xsu idx = __riscv_vid_v_u16m2(unpacket_traits::size); - idx = __riscv_vsrl_vx_u16m2(__riscv_vand_vx_u16m2(idx, 0xfffcu, unpacket_traits::size), 1, - unpacket_traits::size); - return __riscv_vloxei16_v_f16m2(reinterpret_cast(from), idx, unpacket_traits::size); + Packet2Xsu idx = + __riscv_vsrl_vx_u16m2(__riscv_vid_v_u16m2(unpacket_traits::size), 2, unpacket_traits::size); + return __riscv_vrgather_vv_f16m2(pload(from), idx, unpacket_traits::size); } template <> @@ -718,15 +777,13 @@ EIGEN_DEVICE_FUNC inline Packet2Xh pgather(const Eigen:: } template <> -EIGEN_DEVICE_FUNC inline void pscatter(Eigen::half* to, const Packet2Xh& from, - Index stride) { - __riscv_vsse16(reinterpret_cast<_Float16*>(to), stride * sizeof(Eigen::half), from, - unpacket_traits::size); +EIGEN_DEVICE_FUNC inline void pscatter(Eigen::half* to, const Packet2Xh& from, Index stride) { + __riscv_vsse16(reinterpret_cast<_Float16*>(to), stride * sizeof(Eigen::half), from, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE Eigen::half pfirst(const Packet2Xh& a) { - return static_cast(__riscv_vfmv_f_s_f16m2_f16(a)); + return numext::bit_cast(__riscv_vfmv_f_s_f16m2_f16(a)); } template <> @@ -741,8 +798,8 @@ EIGEN_STRONG_INLINE Packet2Xh print(const Packet2Xh& a) { PacketMask8 mask = __riscv_vmfne_vv_f16m2_b8(a, a, unpacket_traits::size); const Packet2Xh x = __riscv_vfadd_vv_f16m2_tumu(mask, a, a, a, unpacket_traits::size); - const Packet2Xh new_x = __riscv_vfcvt_f_x_v_f16m2( - __riscv_vfcvt_x_f_v_i16m2(a, unpacket_traits::size), unpacket_traits::size); + const Packet2Xh new_x = __riscv_vfcvt_f_x_v_f16m2(__riscv_vfcvt_x_f_v_i16m2(a, unpacket_traits::size), + unpacket_traits::size); mask = __riscv_vmflt_vv_f16m2_b8(abs_a, limit, unpacket_traits::size); Packet2Xh signed_x = __riscv_vfsgnj_vv_f16m2(new_x, x, unpacket_traits::size); @@ -759,9 +816,8 @@ EIGEN_STRONG_INLINE Packet2Xh pfloor(const Packet2Xh& a) { template <> EIGEN_STRONG_INLINE Packet2Xh preverse(const Packet2Xh& a) { - Packet2Xsu idx = - __riscv_vrsub_vx_u16m2(__riscv_vid_v_u16m2(unpacket_traits::size), - unpacket_traits::size - 1, unpacket_traits::size); + Packet2Xsu idx = __riscv_vrsub_vx_u16m2(__riscv_vid_v_u16m2(unpacket_traits::size), + unpacket_traits::size - 1, unpacket_traits::size); return __riscv_vrgather_vv_f16m2(a, idx, unpacket_traits::size); } @@ -774,8 +830,8 @@ EIGEN_STRONG_INLINE Eigen::half predux(const Packet2Xh& a) { template <> EIGEN_STRONG_INLINE Eigen::half predux_mul(const Packet2Xh& a) { - return predux_mul(__riscv_vfmul_vv_f16m1(__riscv_vget_v_f16m2_f16m1(a, 0), __riscv_vget_v_f16m2_f16m1(a, 1), - unpacket_traits::size)); + return predux_mul(__riscv_vfmul_vv_f16m1( + __riscv_vget_v_f16m2_f16m1(a, 0), __riscv_vget_v_f16m2_f16m1(a, 1), unpacket_traits::size)); } template <> @@ -805,9 +861,8 @@ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { } for (i = 0; i < N; i++) { - kernel.packet[i] = - __riscv_vle16_v_f16m2(reinterpret_cast<_Float16*>(&buffer[i * unpacket_traits::size]), - unpacket_traits::size); + kernel.packet[i] = __riscv_vle16_v_f16m2(reinterpret_cast<_Float16*>(&buffer[i * unpacket_traits::size]), + unpacket_traits::size); } } @@ -821,9 +876,9 @@ EIGEN_STRONG_INLINE Packet2Xh float2half(const Packet4Xf& a) { template EIGEN_STRONG_INLINE -typename std::enable_if::value && (unpacket_traits::size % 8) == 0, - Packet1Xh>::type -predux_half(const Packet2Xh& a) { + typename std::enable_if::value && (unpacket_traits::size % 8) == 0, + Packet1Xh>::type + predux_half(const Packet2Xh& a) { return __riscv_vfadd_vv_f16m1(__riscv_vget_v_f16m2_f16m1(a, 0), __riscv_vget_v_f16m2_f16m1(a, 1), unpacket_traits::size); } @@ -904,7 +959,7 @@ EIGEN_STRONG_INLINE Packet2Xs preinterpret(const Packet2Xh template <> EIGEN_STRONG_INLINE Packet4Xs pcast(const Packet1Xh& a, const Packet1Xh& b, const Packet1Xh& c, - const Packet1Xh& d) { + const Packet1Xh& d) { return __riscv_vcreate_v_i16m1_i16m4(__riscv_vfcvt_rtz_x_f_v_i16m1(a, unpacket_traits::size), __riscv_vfcvt_rtz_x_f_v_i16m1(b, unpacket_traits::size), __riscv_vfcvt_rtz_x_f_v_i16m1(c, unpacket_traits::size),