// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016 Benoit Steiner
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_PACKET_MATH_HALF_CUDA_H
#define EIGEN_PACKET_MATH_HALF_CUDA_H

#if defined(EIGEN_HAS_CUDA_FP16)

// Make sure this is only available when targeting a GPU: we don't want to
// introduce conflicts between these packet_traits definitions and the ones
// we'll use on the host side (SSE, AVX, ...)
#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)

// Most of the following operations require arch >= 3.0
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300

namespace Eigen {
namespace internal {

// NOTE(review): the explicit specialization arguments below (<half2>, <half>,
// <half, half2>, <half2, Aligned>, ...) are meant to match the primary
// templates of Eigen's generic packet API, which are declared outside this
// file -- confirm against GenericPacketMath.h.

// Packets of two half-precision values (half2) are treated as arithmetic.
template<> struct is_arithmetic<half2> { enum { value = true }; };

// Packet description for scalar `half`: the packet type is half2 (2 lanes).
template<> struct packet_traits<half> : default_packet_traits
{
  typedef half2 type;
  typedef half2 half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 2,
    HasHalfPacket = 0,
    HasDiv = 1,
    HasSqrt = 1,
    HasRsqrt = 1,
    HasExp = 1,
    HasLog = 1
  };
};

// Reverse mapping: a half2 packet holds 2 scalars of type `half`.
template<> struct unpacket_traits<half2>
{
  typedef half type;
  enum { size = 2, alignment = Aligned16 };
  typedef half2 half;
};

// Returns a packet whose two lanes both hold `from`.
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1<half2>(const half& from) {
  return __half2half2(from);
}

// Loads two consecutive halves with a single half2 access; `from` is
// dereferenced as a half2, so it must be suitably aligned for that type.
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload<half2>(const half* from) {
  return *reinterpret_cast<const half2*>(from);
}

// Loads two consecutive halves one element at a time (no half2 access).
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploadu<half2>(const half* from) {
  return __halves2half2(from[0], from[1]);
}

// Returns a packet with from[0] duplicated in both lanes.
// EIGEN_DEVICE_FUNC added: every other function in this device-only file
// carries it, and this one was the sole exception.
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploaddup<half2>(const half* from) {
  return __halves2half2(from[0], from[0]);
}

// Stores both lanes with a single half2 access; `to` is written through a
// half2 pointer, so it must be suitably aligned for that type.
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<half>(half* to, const half2& from) {
  *reinterpret_cast<half2*>(to) = from;
}

// Stores the low lane to to[0] and the high lane to to[1], element by element.
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<half>(half* to, const half2& from) {
  to[0] = __low2half(from);
  to[1] = __high2half(from);
}

// Read-only load of two consecutive halves. For __CUDA_ARCH__ >= 320 the pair
// is fetched as one half2 through __ldg; otherwise it is read element-wise.
template<>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const half* from) {
#if __CUDA_ARCH__ >= 320
  return __ldg(reinterpret_cast<const half2*>(from));
#else
  return __halves2half2(*(from+0), *(from+1));
#endif
}

// Read-only load that never performs a half2-wide access: each half is
// fetched separately (through __ldg when __CUDA_ARCH__ >= 320).
template<>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const half* from) {
#if __CUDA_ARCH__ >= 320
  return __halves2half2(__ldg(from+0), __ldg(from+1));
#else
  return __halves2half2(*(from+0), *(from+1));
#endif
}

// Gathers from[0] and from[stride] into the low and high lanes respectively.
template<> EIGEN_DEVICE_FUNC inline half2 pgather<half, half2>(const half* from, Index stride) {
  return __halves2half2(from[0*stride], from[1*stride]);
}

// Scatters the low lane to to[0] and the high lane to to[stride].
template<> EIGEN_DEVICE_FUNC inline void pscatter<half, half2>(half* to, const half2& from, Index stride) {
  to[stride*0] = __low2half(from);
  to[stride*1] = __high2half(from);
}

// Returns the low lane of the packet.
template<> EIGEN_DEVICE_FUNC inline half pfirst<half2>(const half2& a) {
  return __low2half(a);
}

// Lane-wise absolute value: masks off bit 15 of each 16-bit lane (the sign
// bits). Relies on half2 exposing its raw 32 bits through the member `x`.
template<> EIGEN_DEVICE_FUNC inline half2 pabs<half2>(const half2& a) {
  half2 result;
  result.x = a.x & 0x7FFF7FFF;
  return result;
}

// Transposes the 2x2 block formed by two packets in place:
// (a1 a2 / b1 b2) becomes (a1 b1 / a2 b2).
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<half2, 2>& kernel) {
  half a1 = __low2half(kernel.packet[0]);
  half a2 = __high2half(kernel.packet[0]);
  half b1 = __low2half(kernel.packet[1]);
  half b2 = __high2half(kernel.packet[1]);
  kernel.packet[0] = __halves2half2(a1, b1);
  kernel.packet[1] = __halves2half2(a2, b2);
}

// Returns the packet (a, a + 1). The increment uses the half intrinsic when
// __CUDA_ARCH__ >= 530 and a float round trip otherwise.
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset<half2>(const half& a) {
#if __CUDA_ARCH__ >= 530
  return __halves2half2(a, __hadd(a, __float2half(1.0f)));
#else
  float f = __half2float(a) + 1.0f;
  return __halves2half2(a, __float2half(f));
#endif
}

// Lane-wise a + b (half2 intrinsic for arch >= 530, float fallback otherwise).
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
#if __CUDA_ARCH__ >= 530
  return __hadd2(a, b);
#else
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  float b1 = __low2float(b);
  float b2 = __high2float(b);
  float r1 = a1 + b1;
  float r2 = a2 + b2;
  return __floats2half2_rn(r1, r2);
#endif
}

// Lane-wise a - b (half2 intrinsic for arch >= 530, float fallback otherwise).
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, const half2& b) {
#if __CUDA_ARCH__ >= 530
  return __hsub2(a, b);
#else
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  float b1 = __low2float(b);
  float b2 = __high2float(b);
  float r1 = a1 - b1;
  float r2 = a2 - b2;
  return __floats2half2_rn(r1, r2);
#endif
}

// Lane-wise negation (half2 intrinsic for arch >= 530, float fallback otherwise).
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate<half2>(const half2& a) {
#if __CUDA_ARCH__ >= 530
  return __hneg2(a);
#else
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  return __floats2half2_rn(-a1, -a2);
#endif
}

// Conjugation returns its argument unchanged.
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pconj<half2>(const half2& a) { return a; }

// Lane-wise a * b (half2 intrinsic for arch >= 530, float fallback otherwise).
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) {
#if __CUDA_ARCH__ >= 530
  return __hmul2(a, b);
#else
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  float b1 = __low2float(b);
  float b2 = __high2float(b);
  float r1 = a1 * b1;
  float r2 = a2 * b2;
  return __floats2half2_rn(r1, r2);
#endif
}

// Lane-wise a * b + c (__hfma2 for arch >= 530; otherwise the product and sum
// are evaluated in float and rounded back to half).
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, const half2& b, const half2& c) {
#if __CUDA_ARCH__ >= 530
  return __hfma2(a, b, c);
#else
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  float b1 = __low2float(b);
  float b2 = __high2float(b);
  float c1 = __low2float(c);
  float c2 = __high2float(c);
  float r1 = a1 * b1 + c1;
  float r2 = a2 * b2 + c2;
  return __floats2half2_rn(r1, r2);
#endif
}

// Lane-wise a / b, always evaluated in float and rounded back to half.
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) {
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  float b1 = __low2float(b);
  float b2 = __high2float(b);
  float r1 = a1 / b1;
  float r2 = a2 / b2;
  return __floats2half2_rn(r1, r2);
}

// Lane-wise minimum. The comparison is done in float; the selected lane is
// copied from the original half operand, so no rounding takes place. When
// `a < b` is false (including equality) the lane of `b` is returned.
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, const half2& b) {
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  float b1 = __low2float(b);
  float b2 = __high2float(b);
  half r1 = a1 < b1 ? __low2half(a) : __low2half(b);
  half r2 = a2 < b2 ? __high2half(a) : __high2half(b);
  return __halves2half2(r1, r2);
}

// Lane-wise maximum. The comparison is done in float; the selected lane is
// copied from the original half operand, so no rounding takes place. When
// `a > b` is false (including equality) the lane of `b` is returned.
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, const half2& b) {
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  float b1 = __low2float(b);
  float b2 = __high2float(b);
  half r1 = a1 > b1 ? __low2half(a) : __low2half(b);
  half r2 = a2 > b2 ? __high2half(a) : __high2half(b);
  return __halves2half2(r1, r2);
}

// Horizontal sum of the two lanes.
template<> EIGEN_DEVICE_FUNC inline half predux<half2>(const half2& a) {
#if __CUDA_ARCH__ >= 530
  return __hadd(__low2half(a), __high2half(a));
#else
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  return half(__float2half_rn(a1 + a2));
#endif
}

// Larger of the two lanes.
template<> EIGEN_DEVICE_FUNC inline half predux_max<half2>(const half2& a) {
#if __CUDA_ARCH__ >= 530
  half first = __low2half(a);
  half second = __high2half(a);
  return __hgt(first, second) ? first : second;
#else
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  return half(__float2half_rn(numext::maxi(a1, a2)));
#endif
}

// Smaller of the two lanes.
template<> EIGEN_DEVICE_FUNC inline half predux_min<half2>(const half2& a) {
#if __CUDA_ARCH__ >= 530
  half first = __low2half(a);
  half second = __high2half(a);
  return __hlt(first, second) ? first : second;
#else
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  return half(__float2half_rn(numext::mini(a1, a2)));
#endif
}

// Product of the two lanes.
template<> EIGEN_DEVICE_FUNC inline half predux_mul<half2>(const half2& a) {
#if __CUDA_ARCH__ >= 530
  return __hmul(__low2half(a), __high2half(a));
#else
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  return half(__float2half_rn(a1 * a2));
#endif
}

// The math functions below convert each lane to float, apply the float
// routine, and round the two results back into a half2.

// Lane-wise natural logarithm (logf).
template<> EIGEN_DEVICE_FUNC inline half2 plog<half2>(const half2& a) {
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  float r1 = logf(a1);
  float r2 = logf(a2);
  return __floats2half2_rn(r1, r2);
}

// Lane-wise exponential (expf).
template<> EIGEN_DEVICE_FUNC inline half2 pexp<half2>(const half2& a) {
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  float r1 = expf(a1);
  float r2 = expf(a2);
  return __floats2half2_rn(r1, r2);
}

// Lane-wise square root (sqrtf).
template<> EIGEN_DEVICE_FUNC inline half2 psqrt<half2>(const half2& a) {
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  float r1 = sqrtf(a1);
  float r2 = sqrtf(a2);
  return __floats2half2_rn(r1, r2);
}

// Lane-wise reciprocal square root (rsqrtf).
template<> EIGEN_DEVICE_FUNC inline half2 prsqrt<half2>(const half2& a) {
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  float r1 = rsqrtf(a1);
  float r2 = rsqrtf(a2);
  return __floats2half2_rn(r1, r2);
}

} // end namespace internal

} // end namespace Eigen

#endif // defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
#endif // defined(__CUDACC__) && defined(EIGEN_USE_GPU)
#endif // defined(EIGEN_HAS_CUDA_FP16)

#endif // EIGEN_PACKET_MATH_HALF_CUDA_H