mirror of
https://gitlab.com/libeigen/eigen.git
synced 2026-04-10 11:34:33 +08:00
Merge remote-tracking branch 'origin2/master'
This commit is contained in:
@@ -266,7 +266,7 @@ template <typename Packet = Packet4Xi>
|
||||
EIGEN_STRONG_INLINE
|
||||
typename std::enable_if<std::is_same<Packet, Packet4Xi>::value && (unpacket_traits<Packet4Xi>::size % 8) == 0,
|
||||
Packet2Xi>::type
|
||||
predux_half_dowto4(const Packet4Xi& a) {
|
||||
predux_half(const Packet4Xi& a) {
|
||||
return __riscv_vadd_vv_i32m2(__riscv_vget_v_i32m4_i32m2(a, 0), __riscv_vget_v_i32m4_i32m2(a, 1),
|
||||
unpacket_traits<Packet2Xi>::size);
|
||||
}
|
||||
@@ -275,7 +275,7 @@ template <typename Packet = Packet2Xi>
|
||||
EIGEN_STRONG_INLINE
|
||||
typename std::enable_if<std::is_same<Packet, Packet2Xi>::value && (unpacket_traits<Packet2Xi>::size % 8) == 0,
|
||||
Packet1Xi>::type
|
||||
predux_half_dowto4(const Packet2Xi& a) {
|
||||
predux_half(const Packet2Xi& a) {
|
||||
return __riscv_vadd_vv_i32m1(__riscv_vget_v_i32m2_i32m1(a, 0), __riscv_vget_v_i32m2_i32m1(a, 1),
|
||||
unpacket_traits<Packet1Xi>::size);
|
||||
}
|
||||
@@ -611,7 +611,7 @@ template <typename Packet = Packet4Xf>
|
||||
EIGEN_STRONG_INLINE
|
||||
typename std::enable_if<std::is_same<Packet, Packet4Xf>::value && (unpacket_traits<Packet4Xf>::size % 8) == 0,
|
||||
Packet2Xf>::type
|
||||
predux_half_dowto4(const Packet4Xf& a) {
|
||||
predux_half(const Packet4Xf& a) {
|
||||
return __riscv_vfadd_vv_f32m2(__riscv_vget_v_f32m4_f32m2(a, 0), __riscv_vget_v_f32m4_f32m2(a, 1),
|
||||
unpacket_traits<Packet2Xf>::size);
|
||||
}
|
||||
@@ -620,7 +620,7 @@ template <typename Packet = Packet2Xf>
|
||||
EIGEN_STRONG_INLINE
|
||||
typename std::enable_if<std::is_same<Packet, Packet2Xf>::value && (unpacket_traits<Packet2Xf>::size % 8) == 0,
|
||||
Packet1Xf>::type
|
||||
predux_half_dowto4(const Packet2Xf& a) {
|
||||
predux_half(const Packet2Xf& a) {
|
||||
return __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(a, 0), __riscv_vget_v_f32m2_f32m1(a, 1),
|
||||
unpacket_traits<Packet1Xf>::size);
|
||||
}
|
||||
@@ -876,7 +876,7 @@ template <typename Packet = Packet4Xl>
|
||||
EIGEN_STRONG_INLINE
|
||||
typename std::enable_if<std::is_same<Packet, Packet4Xl>::value && (unpacket_traits<Packet4Xl>::size % 8) == 0,
|
||||
Packet2Xl>::type
|
||||
predux_half_dowto4(const Packet4Xl& a) {
|
||||
predux_half(const Packet4Xl& a) {
|
||||
return __riscv_vadd_vv_i64m2(__riscv_vget_v_i64m4_i64m2(a, 0), __riscv_vget_v_i64m4_i64m2(a, 1),
|
||||
unpacket_traits<Packet2Xl>::size);
|
||||
}
|
||||
@@ -885,7 +885,7 @@ template <typename Packet = Packet2Xl>
|
||||
EIGEN_STRONG_INLINE
|
||||
typename std::enable_if<std::is_same<Packet, Packet2Xl>::value && (unpacket_traits<Packet2Xl>::size % 8) == 0,
|
||||
Packet1Xl>::type
|
||||
predux_half_dowto4(const Packet2Xl& a) {
|
||||
predux_half(const Packet2Xl& a) {
|
||||
return __riscv_vadd_vv_i64m1(__riscv_vget_v_i64m2_i64m1(a, 0), __riscv_vget_v_i64m2_i64m1(a, 1),
|
||||
unpacket_traits<Packet1Xl>::size);
|
||||
}
|
||||
@@ -1222,7 +1222,7 @@ template <typename Packet = Packet4Xd>
|
||||
EIGEN_STRONG_INLINE
|
||||
typename std::enable_if<std::is_same<Packet, Packet4Xd>::value && (unpacket_traits<Packet4Xd>::size % 8) == 0,
|
||||
Packet2Xd>::type
|
||||
predux_half_dowto4(const Packet4Xd& a) {
|
||||
predux_half(const Packet4Xd& a) {
|
||||
return __riscv_vfadd_vv_f64m2(__riscv_vget_v_f64m4_f64m2(a, 0), __riscv_vget_v_f64m4_f64m2(a, 1),
|
||||
unpacket_traits<Packet2Xd>::size);
|
||||
}
|
||||
@@ -1231,7 +1231,7 @@ template <typename Packet = Packet2Xd>
|
||||
EIGEN_STRONG_INLINE
|
||||
typename std::enable_if<std::is_same<Packet, Packet2Xd>::value && (unpacket_traits<Packet2Xd>::size % 8) == 0,
|
||||
Packet1Xd>::type
|
||||
predux_half_dowto4(const Packet2Xd& a) {
|
||||
predux_half(const Packet2Xd& a) {
|
||||
return __riscv_vfadd_vv_f64m1(__riscv_vget_v_f64m2_f64m1(a, 0), __riscv_vget_v_f64m2_f64m1(a, 1),
|
||||
unpacket_traits<Packet1Xd>::size);
|
||||
}
|
||||
@@ -1486,7 +1486,7 @@ template <typename Packet = Packet4Xs>
|
||||
EIGEN_STRONG_INLINE
|
||||
typename std::enable_if<std::is_same<Packet, Packet4Xs>::value && (unpacket_traits<Packet4Xs>::size % 8) == 0,
|
||||
Packet2Xs>::type
|
||||
predux_half_dowto4(const Packet4Xs& a) {
|
||||
predux_half(const Packet4Xs& a) {
|
||||
return __riscv_vadd_vv_i16m2(__riscv_vget_v_i16m4_i16m2(a, 0), __riscv_vget_v_i16m4_i16m2(a, 1),
|
||||
unpacket_traits<Packet2Xs>::size);
|
||||
}
|
||||
@@ -1495,7 +1495,7 @@ template <typename Packet = Packet2Xs>
|
||||
EIGEN_STRONG_INLINE
|
||||
typename std::enable_if<std::is_same<Packet, Packet2Xs>::value && (unpacket_traits<Packet2Xs>::size % 8) == 0,
|
||||
Packet1Xs>::type
|
||||
predux_half_dowto4(const Packet2Xs& a) {
|
||||
predux_half(const Packet2Xs& a) {
|
||||
return __riscv_vadd_vv_i16m1(__riscv_vget_v_i16m2_i16m1(a, 0), __riscv_vget_v_i16m2_i16m1(a, 1),
|
||||
unpacket_traits<Packet1Xs>::size);
|
||||
}
|
||||
|
||||
@@ -811,7 +811,7 @@ template <typename Packet = Packet2Xh>
|
||||
EIGEN_STRONG_INLINE
|
||||
typename std::enable_if<std::is_same<Packet, Packet2Xh>::value && (unpacket_traits<Packet2Xh>::size % 8) == 0,
|
||||
PacketXh>::type
|
||||
predux_half_dowto4(const Packet2Xh& a) {
|
||||
predux_half(const Packet2Xh& a) {
|
||||
return __riscv_vfadd_vv_f16m1(__riscv_vget_v_f16m2_f16m1(a, 0), __riscv_vget_v_f16m2_f16m1(a, 1),
|
||||
unpacket_traits<PacketXh>::size);
|
||||
}
|
||||
|
||||
@@ -44,68 +44,40 @@ class StreamInterface {
|
||||
|
||||
class GpuDeviceProperties {
|
||||
public:
|
||||
GpuDeviceProperties() : initialized_(false), first_(true), device_properties_(nullptr) {}
|
||||
static const GpuDeviceProperties& instance() {
|
||||
static const GpuDeviceProperties& kInstance = *new GpuDeviceProperties();
|
||||
|
||||
~GpuDeviceProperties() {
|
||||
if (device_properties_) {
|
||||
delete[] device_properties_;
|
||||
}
|
||||
return kInstance;
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE const gpuDeviceProp_t& get(int device) const { return device_properties_[device]; }
|
||||
|
||||
EIGEN_STRONG_INLINE bool isInitialized() const { return initialized_; }
|
||||
private:
|
||||
GpuDeviceProperties() = default;
|
||||
|
||||
void initialize() {
|
||||
if (!initialized_) {
|
||||
// Attempts to ensure proper behavior in the case of multiple threads
|
||||
// calling this function simultaneously. This would be trivial to
|
||||
// implement if we could use std::mutex, but unfortunately mutex don't
|
||||
// compile with nvcc, so we resort to atomics and thread fences instead.
|
||||
// Note that if the caller uses a compiler that doesn't support c++11 we
|
||||
// can't ensure that the initialization is thread safe.
|
||||
if (first_.exchange(false)) {
|
||||
// We're the first thread to reach this point.
|
||||
int num_devices;
|
||||
gpuError_t status = gpuGetDeviceCount(&num_devices);
|
||||
if (status != gpuSuccess) {
|
||||
std::cerr << "Failed to get the number of GPU devices: " << gpuGetErrorString(status) << std::endl;
|
||||
gpu_assert(status == gpuSuccess);
|
||||
}
|
||||
device_properties_ = new gpuDeviceProp_t[num_devices];
|
||||
for (int i = 0; i < num_devices; ++i) {
|
||||
status = gpuGetDeviceProperties(&device_properties_[i], i);
|
||||
if (status != gpuSuccess) {
|
||||
std::cerr << "Failed to initialize GPU device #" << i << ": " << gpuGetErrorString(status) << std::endl;
|
||||
gpu_assert(status == gpuSuccess);
|
||||
}
|
||||
}
|
||||
|
||||
std::atomic_thread_fence(std::memory_order_release);
|
||||
initialized_ = true;
|
||||
} else {
|
||||
// Wait for the other thread to inititialize the properties.
|
||||
while (!initialized_) {
|
||||
std::atomic_thread_fence(std::memory_order_acquire);
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(1000));
|
||||
}
|
||||
static std::vector<gpuDeviceProp_t> GetDeviceProperties() {
|
||||
int num_devices = 0;
|
||||
gpuError_t status = gpuGetDeviceCount(&num_devices);
|
||||
if (status != gpuSuccess) {
|
||||
std::cerr << "Failed to get the number of GPU devices: " << gpuGetErrorString(status) << std::endl;
|
||||
gpu_assert(status == gpuSuccess);
|
||||
}
|
||||
std::vector<gpuDeviceProp_t> device_properties(num_devices);
|
||||
for (int i = 0; i < num_devices; ++i) {
|
||||
status = gpuGetDeviceProperties(&device_properties_[i], i);
|
||||
if (status != gpuSuccess) {
|
||||
std::cerr << "Failed to initialize GPU device #" << i << ": " << gpuGetErrorString(status) << std::endl;
|
||||
gpu_assert(status == gpuSuccess);
|
||||
}
|
||||
}
|
||||
|
||||
return device_properties;
|
||||
}
|
||||
|
||||
private:
|
||||
volatile bool initialized_;
|
||||
std::atomic<bool> first_;
|
||||
gpuDeviceProp_t* device_properties_;
|
||||
std::vector<gpuDeviceProp_t> device_properties_ = GetDeviceProperties();
|
||||
};
|
||||
|
||||
EIGEN_ALWAYS_INLINE const GpuDeviceProperties& GetGpuDeviceProperties() {
|
||||
static GpuDeviceProperties* deviceProperties = new GpuDeviceProperties();
|
||||
if (!deviceProperties->isInitialized()) {
|
||||
deviceProperties->initialize();
|
||||
}
|
||||
return *deviceProperties;
|
||||
}
|
||||
EIGEN_ALWAYS_INLINE const GpuDeviceProperties& GetGpuDeviceProperties() { return GpuDeviceProperties::instance(); }
|
||||
|
||||
EIGEN_ALWAYS_INLINE const gpuDeviceProp_t& GetGpuDeviceProperties(int device) {
|
||||
return GetGpuDeviceProperties().get(device);
|
||||
|
||||
Reference in New Issue
Block a user