Add CUDA complex sqrt.

This is to support scalar `sqrt` of complex numbers `std::complex<T>` on
device, requested by Tensorflow folks.

Technically `std::complex` is not supported by NVCC on device
(though it is by clang), so the default `sqrt(std::complex<T>)` function only
works on the host. Here we create an overload to add back the
functionality.

Also modified the CMake file to add `--relaxed-constexpr` (or
equivalent) flag for NVCC to allow calling constexpr functions from
device functions, and added support for specifying compute architecture for
NVCC (was already available for clang).
This commit is contained in:
Antonio Sanchez
2020-12-22 22:49:06 -08:00
parent fdf2ee62c5
commit 070d303d56
7 changed files with 217 additions and 28 deletions

View File

@@ -395,6 +395,12 @@ find_package(CUDA 5.0)
if(CUDA_FOUND)
set(CUDA_PROPAGATE_HOST_FLAGS OFF)
set(EIGEN_CUDA_RELAXED_CONSTEXPR "--expt-relaxed-constexpr")
if (${CUDA_VERSION} STREQUAL "7.0")
set(EIGEN_CUDA_RELAXED_CONSTEXPR "--relaxed-constexpr")
endif()
if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
set(CUDA_NVCC_FLAGS "-ccbin ${CMAKE_C_COMPILER}" CACHE STRING "nvcc flags" FORCE)
endif()
@@ -404,7 +410,12 @@ if(CUDA_FOUND)
foreach(GPU IN LISTS EIGEN_CUDA_COMPUTE_ARCH)
string(APPEND CMAKE_CXX_FLAGS " --cuda-gpu-arch=sm_${GPU}")
endforeach()
else()
foreach(GPU IN LISTS EIGEN_CUDA_COMPUTE_ARCH)
string(APPEND CUDA_NVCC_FLAGS " -gencode arch=compute_${GPU},code=sm_${GPU}")
endforeach()
endif()
string(APPEND CUDA_NVCC_FLAGS " ${EIGEN_CUDA_RELAXED_CONSTEXPR}")
set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
ei_add_test(gpu_basic)

View File

@@ -14,7 +14,6 @@
#endif
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#include "main.h"
@@ -54,6 +53,59 @@ struct coeff_wise {
}
};
template<typename T>
struct complex_sqrt {
EIGEN_DEVICE_FUNC
void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
{
using namespace Eigen;
typedef typename T::Scalar ComplexType;
typedef typename T::Scalar::value_type ValueType;
const int num_special_inputs = 18;
if (i == 0) {
const ValueType nan = std::numeric_limits<ValueType>::quiet_NaN();
typedef Eigen::Vector<ComplexType, num_special_inputs> SpecialInputs;
SpecialInputs special_in;
special_in.setZero();
int idx = 0;
special_in[idx++] = ComplexType(0, 0);
special_in[idx++] = ComplexType(-0, 0);
special_in[idx++] = ComplexType(0, -0);
special_in[idx++] = ComplexType(-0, -0);
// GCC's fallback sqrt implementation fails for inf inputs.
// It is called when _GLIBCXX_USE_C99_COMPLEX is false or if
// clang includes the GCC header (which temporarily disables
// _GLIBCXX_USE_C99_COMPLEX)
#if !defined(_GLIBCXX_COMPLEX) || \
(_GLIBCXX_USE_C99_COMPLEX && !defined(__CLANG_CUDA_WRAPPERS_COMPLEX))
const ValueType inf = std::numeric_limits<ValueType>::infinity();
special_in[idx++] = ComplexType(1.0, inf);
special_in[idx++] = ComplexType(nan, inf);
special_in[idx++] = ComplexType(1.0, -inf);
special_in[idx++] = ComplexType(nan, -inf);
special_in[idx++] = ComplexType(-inf, 1.0);
special_in[idx++] = ComplexType(inf, 1.0);
special_in[idx++] = ComplexType(-inf, -1.0);
special_in[idx++] = ComplexType(inf, -1.0);
special_in[idx++] = ComplexType(-inf, nan);
special_in[idx++] = ComplexType(inf, nan);
#endif
special_in[idx++] = ComplexType(1.0, nan);
special_in[idx++] = ComplexType(nan, 1.0);
special_in[idx++] = ComplexType(nan, -1.0);
special_in[idx++] = ComplexType(nan, nan);
Map<SpecialInputs> special_out(out);
special_out = special_in.cwiseSqrt();
}
T x1(in + i);
Map<T> res(out + num_special_inputs + i*T::MaxSizeAtCompileTime);
res = x1.cwiseSqrt();
}
};
template<typename T>
struct replicate {
EIGEN_DEVICE_FUNC
@@ -161,17 +213,58 @@ struct matrix_inverse {
}
};
template<typename Type1, typename Type2>
bool verifyIsApproxWithInfsNans(const Type1& a, const Type2& b, typename Type1::Scalar* = 0) // Enabled for Eigen's type only
{
if (a.rows() != b.rows()) {
return false;
}
if (a.cols() != b.cols()) {
return false;
}
for (Index r = 0; r < a.rows(); ++r) {
for (Index c = 0; c < a.cols(); ++c) {
if (a(r, c) != b(r, c)
&& !((numext::isnan)(a(r, c)) && (numext::isnan)(b(r, c)))
&& !test_isApprox(a(r, c), b(r, c))) {
return false;
}
}
}
return true;
}
template<typename Kernel, typename Input, typename Output>
void test_with_infs_nans(const Kernel& ker, int n, const Input& in, Output& out)
{
Output out_ref, out_gpu;
#if !defined(EIGEN_GPU_COMPILE_PHASE)
out_ref = out_gpu = out;
#else
EIGEN_UNUSED_VARIABLE(in);
EIGEN_UNUSED_VARIABLE(out);
#endif
run_on_cpu (ker, n, in, out_ref);
run_on_gpu(ker, n, in, out_gpu);
#if !defined(EIGEN_GPU_COMPILE_PHASE)
verifyIsApproxWithInfsNans(out_ref, out_gpu);
#endif
}
EIGEN_DECLARE_TEST(gpu_basic)
{
ei_test_init_gpu();
int nthreads = 100;
Eigen::VectorXf in, out;
Eigen::VectorXcf cfin, cfout;
#if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__)
#if !defined(EIGEN_GPU_COMPILE_PHASE)
int data_size = nthreads * 512;
in.setRandom(data_size);
out.setRandom(data_size);
out.setConstant(data_size, -1);
cfin.setRandom(data_size);
cfout.setConstant(data_size, -1);
#endif
CALL_SUBTEST( run_and_compare_to_gpu(coeff_wise<Vector3f>(), nthreads, in, out) );
@@ -204,6 +297,8 @@ EIGEN_DECLARE_TEST(gpu_basic)
CALL_SUBTEST( run_and_compare_to_gpu(eigenvalues_direct<Matrix3f>(), nthreads, in, out) );
CALL_SUBTEST( run_and_compare_to_gpu(eigenvalues_direct<Matrix2f>(), nthreads, in, out) );
CALL_SUBTEST( test_with_infs_nans(complex_sqrt<Vector3cf>(), nthreads, cfin, cfout) );
#if defined(__NVCC__)
// FIXME
// These subtests compiles only with nvcc and fail with HIPCC and clang-cuda

View File

@@ -68,8 +68,20 @@ void run_on_gpu(const Kernel& ker, int n, const Input& in, Output& out)
#else
run_on_gpu_meta_kernel<<<Grids,Blocks>>>(ker, n, d_in, d_out);
#endif
// Pre-launch errors.
gpuError_t err = gpuGetLastError();
if (err != gpuSuccess) {
printf("%s: %s\n", gpuGetErrorName(err), gpuGetErrorString(err));
gpu_assert(false);
}
// Kernel execution errors.
err = gpuDeviceSynchronize();
if (err != gpuSuccess) {
printf("%s: %s\n", gpuGetErrorName(err), gpuGetErrorString(err));
gpu_assert(false);
}
gpuDeviceSynchronize();
// check inputs have not been modified
gpuMemcpy(const_cast<typename Input::Scalar*>(in.data()), d_in, in_bytes, gpuMemcpyDeviceToHost);
@@ -85,7 +97,7 @@ void run_and_compare_to_gpu(const Kernel& ker, int n, const Input& in, Output& o
{
Input in_ref, in_gpu;
Output out_ref, out_gpu;
#if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__)
#if !defined(EIGEN_GPU_COMPILE_PHASE)
in_ref = in_gpu = in;
out_ref = out_gpu = out;
#else
@@ -94,7 +106,7 @@ void run_and_compare_to_gpu(const Kernel& ker, int n, const Input& in, Output& o
#endif
run_on_cpu (ker, n, in_ref, out_ref);
run_on_gpu(ker, n, in_gpu, out_gpu);
#if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__)
#if !defined(EIGEN_GPU_COMPILE_PHASE)
VERIFY_IS_APPROX(in_ref, in_gpu);
VERIFY_IS_APPROX(out_ref, out_gpu);
#endif
@@ -102,14 +114,16 @@ void run_and_compare_to_gpu(const Kernel& ker, int n, const Input& in, Output& o
struct compile_time_device_info {
EIGEN_DEVICE_FUNC
void operator()(int /*i*/, const int* /*in*/, int* info) const
void operator()(int i, const int* /*in*/, int* info) const
{
#if defined(__CUDA_ARCH__)
info[0] = int(__CUDA_ARCH__ +0);
#endif
#if defined(EIGEN_HIP_DEVICE_COMPILE)
info[1] = int(EIGEN_HIP_DEVICE_COMPILE +0);
#endif
if (i == 0) {
#if defined(__CUDA_ARCH__)
info[0] = int(__CUDA_ARCH__ +0);
#endif
#if defined(EIGEN_HIP_DEVICE_COMPILE)
info[1] = int(EIGEN_HIP_DEVICE_COMPILE +0);
#endif
}
}
};