mirror of
https://gitlab.com/libeigen/eigen.git
synced 2026-04-10 11:34:33 +08:00
removing the *Hip files from the unsupported/Eigen/CXX11/src/Tensor and unsupported/test directories
This commit is contained in:
@@ -1,251 +0,0 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
|
||||
#define EIGEN_TEST_NO_LONGDOUBLE
|
||||
#define EIGEN_TEST_FUNC cxx11_tensor_hip
|
||||
#define EIGEN_USE_GPU
|
||||
|
||||
#include "main.h"
|
||||
#include <unsupported/Eigen/CXX11/Tensor>
|
||||
|
||||
using Eigen::Tensor;
|
||||
|
||||
template <int Layout>
|
||||
void test_hip_simple_argmax()
|
||||
{
|
||||
Tensor<double, 3, Layout> in(Eigen::array<DenseIndex, 3>(72,53,97));
|
||||
Tensor<DenseIndex, 1, Layout> out_max(Eigen::array<DenseIndex, 1>(1));
|
||||
Tensor<DenseIndex, 1, Layout> out_min(Eigen::array<DenseIndex, 1>(1));
|
||||
in.setRandom();
|
||||
in *= in.constant(100.0);
|
||||
in(0, 0, 0) = -1000.0;
|
||||
in(71, 52, 96) = 1000.0;
|
||||
|
||||
std::size_t in_bytes = in.size() * sizeof(double);
|
||||
std::size_t out_bytes = out_max.size() * sizeof(DenseIndex);
|
||||
|
||||
double* d_in;
|
||||
DenseIndex* d_out_max;
|
||||
DenseIndex* d_out_min;
|
||||
hipMalloc((void**)(&d_in), in_bytes);
|
||||
hipMalloc((void**)(&d_out_max), out_bytes);
|
||||
hipMalloc((void**)(&d_out_min), out_bytes);
|
||||
|
||||
hipMemcpy(d_in, in.data(), in_bytes, hipMemcpyHostToDevice);
|
||||
|
||||
Eigen::HipStreamDevice stream;
|
||||
Eigen::GpuDevice gpu_device(&stream);
|
||||
|
||||
Eigen::TensorMap<Eigen::Tensor<double, 3, Layout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 3>(72,53,97));
|
||||
Eigen::TensorMap<Eigen::Tensor<DenseIndex, 1, Layout>, Aligned > gpu_out_max(d_out_max, Eigen::array<DenseIndex, 1>(1));
|
||||
Eigen::TensorMap<Eigen::Tensor<DenseIndex, 1, Layout>, Aligned > gpu_out_min(d_out_min, Eigen::array<DenseIndex, 1>(1));
|
||||
|
||||
gpu_out_max.device(gpu_device) = gpu_in.argmax();
|
||||
gpu_out_min.device(gpu_device) = gpu_in.argmin();
|
||||
|
||||
assert(hipMemcpyAsync(out_max.data(), d_out_max, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
|
||||
assert(hipMemcpyAsync(out_min.data(), d_out_min, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
|
||||
assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
|
||||
|
||||
VERIFY_IS_EQUAL(out_max(Eigen::array<DenseIndex, 1>(0)), 72*53*97 - 1);
|
||||
VERIFY_IS_EQUAL(out_min(Eigen::array<DenseIndex, 1>(0)), 0);
|
||||
|
||||
hipFree(d_in);
|
||||
hipFree(d_out_max);
|
||||
hipFree(d_out_min);
|
||||
}
|
||||
|
||||
template <int DataLayout>
|
||||
void test_hip_argmax_dim()
|
||||
{
|
||||
Tensor<float, 4, DataLayout> tensor(2,3,5,7);
|
||||
std::vector<int> dims;
|
||||
dims.push_back(2); dims.push_back(3); dims.push_back(5); dims.push_back(7);
|
||||
|
||||
for (int dim = 0; dim < 4; ++dim) {
|
||||
tensor.setRandom();
|
||||
tensor = (tensor + tensor.constant(0.5)).log();
|
||||
|
||||
array<DenseIndex, 3> out_shape;
|
||||
for (int d = 0; d < 3; ++d) out_shape[d] = (d < dim) ? dims[d] : dims[d+1];
|
||||
|
||||
Tensor<DenseIndex, 3, DataLayout> tensor_arg(out_shape);
|
||||
|
||||
array<DenseIndex, 4> ix;
|
||||
for (int i = 0; i < 2; ++i) {
|
||||
for (int j = 0; j < 3; ++j) {
|
||||
for (int k = 0; k < 5; ++k) {
|
||||
for (int l = 0; l < 7; ++l) {
|
||||
ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l;
|
||||
if (ix[dim] != 0) continue;
|
||||
// suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) = 10.0
|
||||
tensor(ix) = 10.0;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
std::size_t in_bytes = tensor.size() * sizeof(float);
|
||||
std::size_t out_bytes = tensor_arg.size() * sizeof(DenseIndex);
|
||||
|
||||
float* d_in;
|
||||
DenseIndex* d_out;
|
||||
hipMalloc((void**)(&d_in), in_bytes);
|
||||
hipMalloc((void**)(&d_out), out_bytes);
|
||||
|
||||
hipMemcpy(d_in, tensor.data(), in_bytes, hipMemcpyHostToDevice);
|
||||
|
||||
Eigen::HipStreamDevice stream;
|
||||
Eigen::GpuDevice gpu_device(&stream);
|
||||
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 4>(2, 3, 5, 7));
|
||||
Eigen::TensorMap<Eigen::Tensor<DenseIndex, 3, DataLayout>, Aligned > gpu_out(d_out, out_shape);
|
||||
|
||||
gpu_out.device(gpu_device) = gpu_in.argmax(dim);
|
||||
|
||||
assert(hipMemcpyAsync(tensor_arg.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
|
||||
assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
|
||||
|
||||
VERIFY_IS_EQUAL(tensor_arg.size(),
|
||||
size_t(2*3*5*7 / tensor.dimension(dim)));
|
||||
|
||||
for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
|
||||
// Expect max to be in the first index of the reduced dimension
|
||||
VERIFY_IS_EQUAL(tensor_arg.data()[n], 0);
|
||||
}
|
||||
|
||||
for (int i = 0; i < 2; ++i) {
|
||||
for (int j = 0; j < 3; ++j) {
|
||||
for (int k = 0; k < 5; ++k) {
|
||||
for (int l = 0; l < 7; ++l) {
|
||||
ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l;
|
||||
if (ix[dim] != tensor.dimension(dim) - 1) continue;
|
||||
// suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = 20.0
|
||||
tensor(ix) = 20.0;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
hipMemcpy(d_in, tensor.data(), in_bytes, hipMemcpyHostToDevice);
|
||||
|
||||
gpu_out.device(gpu_device) = gpu_in.argmax(dim);
|
||||
|
||||
assert(hipMemcpyAsync(tensor_arg.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
|
||||
assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
|
||||
|
||||
for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
|
||||
// Expect max to be in the last index of the reduced dimension
|
||||
VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1);
|
||||
}
|
||||
|
||||
hipFree(d_in);
|
||||
hipFree(d_out);
|
||||
}
|
||||
}
|
||||
|
||||
template <int DataLayout>
|
||||
void test_hip_argmin_dim()
|
||||
{
|
||||
Tensor<float, 4, DataLayout> tensor(2,3,5,7);
|
||||
std::vector<int> dims;
|
||||
dims.push_back(2); dims.push_back(3); dims.push_back(5); dims.push_back(7);
|
||||
|
||||
for (int dim = 0; dim < 4; ++dim) {
|
||||
tensor.setRandom();
|
||||
tensor = (tensor + tensor.constant(0.5)).log();
|
||||
|
||||
array<DenseIndex, 3> out_shape;
|
||||
for (int d = 0; d < 3; ++d) out_shape[d] = (d < dim) ? dims[d] : dims[d+1];
|
||||
|
||||
Tensor<DenseIndex, 3, DataLayout> tensor_arg(out_shape);
|
||||
|
||||
array<DenseIndex, 4> ix;
|
||||
for (int i = 0; i < 2; ++i) {
|
||||
for (int j = 0; j < 3; ++j) {
|
||||
for (int k = 0; k < 5; ++k) {
|
||||
for (int l = 0; l < 7; ++l) {
|
||||
ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l;
|
||||
if (ix[dim] != 0) continue;
|
||||
// suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) = 10.0
|
||||
tensor(ix) = -10.0;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
std::size_t in_bytes = tensor.size() * sizeof(float);
|
||||
std::size_t out_bytes = tensor_arg.size() * sizeof(DenseIndex);
|
||||
|
||||
float* d_in;
|
||||
DenseIndex* d_out;
|
||||
hipMalloc((void**)(&d_in), in_bytes);
|
||||
hipMalloc((void**)(&d_out), out_bytes);
|
||||
|
||||
hipMemcpy(d_in, tensor.data(), in_bytes, hipMemcpyHostToDevice);
|
||||
|
||||
Eigen::HipStreamDevice stream;
|
||||
Eigen::GpuDevice gpu_device(&stream);
|
||||
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 4>(2, 3, 5, 7));
|
||||
Eigen::TensorMap<Eigen::Tensor<DenseIndex, 3, DataLayout>, Aligned > gpu_out(d_out, out_shape);
|
||||
|
||||
gpu_out.device(gpu_device) = gpu_in.argmin(dim);
|
||||
|
||||
assert(hipMemcpyAsync(tensor_arg.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
|
||||
assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
|
||||
|
||||
VERIFY_IS_EQUAL(tensor_arg.size(),
|
||||
2*3*5*7 / tensor.dimension(dim));
|
||||
|
||||
for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
|
||||
// Expect min to be in the first index of the reduced dimension
|
||||
VERIFY_IS_EQUAL(tensor_arg.data()[n], 0);
|
||||
}
|
||||
|
||||
for (int i = 0; i < 2; ++i) {
|
||||
for (int j = 0; j < 3; ++j) {
|
||||
for (int k = 0; k < 5; ++k) {
|
||||
for (int l = 0; l < 7; ++l) {
|
||||
ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l;
|
||||
if (ix[dim] != tensor.dimension(dim) - 1) continue;
|
||||
// suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = 20.0
|
||||
tensor(ix) = -20.0;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
hipMemcpy(d_in, tensor.data(), in_bytes, hipMemcpyHostToDevice);
|
||||
|
||||
gpu_out.device(gpu_device) = gpu_in.argmin(dim);
|
||||
|
||||
assert(hipMemcpyAsync(tensor_arg.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
|
||||
assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
|
||||
|
||||
for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
|
||||
// Expect max to be in the last index of the reduced dimension
|
||||
VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1);
|
||||
}
|
||||
|
||||
hipFree(d_in);
|
||||
hipFree(d_out);
|
||||
}
|
||||
}
|
||||
|
||||
void test_cxx11_tensor_hip()
|
||||
{
|
||||
CALL_SUBTEST(test_hip_simple_argmax<RowMajor>());
|
||||
CALL_SUBTEST(test_hip_simple_argmax<ColMajor>());
|
||||
CALL_SUBTEST(test_hip_argmax_dim<RowMajor>());
|
||||
CALL_SUBTEST(test_hip_argmax_dim<ColMajor>());
|
||||
CALL_SUBTEST(test_hip_argmin_dim<RowMajor>());
|
||||
CALL_SUBTEST(test_hip_argmin_dim<ColMajor>());
|
||||
}
|
||||
@@ -1,79 +0,0 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#define EIGEN_TEST_NO_LONGDOUBLE
|
||||
#define EIGEN_TEST_NO_COMPLEX
|
||||
#define EIGEN_TEST_FUNC cxx11_tensor_cast_float16_hip
|
||||
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
|
||||
#define EIGEN_USE_GPU
|
||||
|
||||
#include "main.h"
|
||||
#include <unsupported/Eigen/CXX11/Tensor>
|
||||
|
||||
using Eigen::Tensor;
|
||||
|
||||
void test_hip_conversion() {
|
||||
Eigen::HipStreamDevice stream;
|
||||
Eigen::GpuDevice gpu_device(&stream);
|
||||
int num_elem = 101;
|
||||
|
||||
Tensor<float, 1> floats(num_elem);
|
||||
floats.setRandom();
|
||||
|
||||
float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
|
||||
Eigen::half* d_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
|
||||
float* d_conv = (float*)gpu_device.allocate(num_elem * sizeof(float));
|
||||
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
|
||||
d_float, num_elem);
|
||||
Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_half(
|
||||
d_half, num_elem);
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_conv(
|
||||
d_conv, num_elem);
|
||||
|
||||
gpu_device.memcpyHostToDevice(d_float, floats.data(), num_elem*sizeof(float));
|
||||
|
||||
gpu_half.device(gpu_device) = gpu_float.cast<Eigen::half>();
|
||||
gpu_conv.device(gpu_device) = gpu_half.cast<float>();
|
||||
|
||||
Tensor<float, 1> initial(num_elem);
|
||||
Tensor<float, 1> final(num_elem);
|
||||
gpu_device.memcpyDeviceToHost(initial.data(), d_float, num_elem*sizeof(float));
|
||||
gpu_device.memcpyDeviceToHost(final.data(), d_conv, num_elem*sizeof(float));
|
||||
gpu_device.synchronize();
|
||||
|
||||
for (int i = 0; i < num_elem; ++i) {
|
||||
VERIFY_IS_APPROX(initial(i), final(i));
|
||||
}
|
||||
|
||||
gpu_device.deallocate(d_float);
|
||||
gpu_device.deallocate(d_half);
|
||||
gpu_device.deallocate(d_conv);
|
||||
}
|
||||
|
||||
|
||||
void test_fallback_conversion() {
|
||||
int num_elem = 101;
|
||||
Tensor<float, 1> floats(num_elem);
|
||||
floats.setRandom();
|
||||
|
||||
Eigen::Tensor<Eigen::half, 1> halfs = floats.cast<Eigen::half>();
|
||||
Eigen::Tensor<float, 1> conv = halfs.cast<float>();
|
||||
|
||||
for (int i = 0; i < num_elem; ++i) {
|
||||
VERIFY_IS_APPROX(floats(i), conv(i));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void test_cxx11_tensor_cast_float16_hip()
|
||||
{
|
||||
CALL_SUBTEST(test_hip_conversion());
|
||||
CALL_SUBTEST(test_fallback_conversion());
|
||||
}
|
||||
@@ -1,215 +0,0 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
// Copyright (C) 2014 Navdeep Jaitly <ndjaitly@google.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#define EIGEN_TEST_NO_LONGDOUBLE
|
||||
#define EIGEN_TEST_NO_COMPLEX
|
||||
#define EIGEN_TEST_FUNC cxx11_tensor_hip
|
||||
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
|
||||
#define EIGEN_USE_GPU
|
||||
|
||||
#include "main.h"
|
||||
#include <unsupported/Eigen/CXX11/Tensor>
|
||||
|
||||
|
||||
using Eigen::Tensor;
|
||||
typedef Tensor<float, 1>::DimensionPair DimPair;
|
||||
|
||||
template<int DataLayout>
|
||||
void test_hip_contraction(int m_size, int k_size, int n_size)
|
||||
{
|
||||
std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl;
|
||||
// with these dimensions, the output has 300 * 140 elements, which is
|
||||
// more than 30 * 1024, which is the number of threads in blocks on
|
||||
// a 15 SM GK110 GPU
|
||||
Tensor<float, 2, DataLayout> t_left(m_size, k_size);
|
||||
Tensor<float, 2, DataLayout> t_right(k_size, n_size);
|
||||
Tensor<float, 2, DataLayout> t_result(m_size, n_size);
|
||||
Tensor<float, 2, DataLayout> t_result_gpu(m_size, n_size);
|
||||
Eigen::array<DimPair, 1> dims(DimPair(1, 0));
|
||||
|
||||
t_left.setRandom();
|
||||
t_right.setRandom();
|
||||
|
||||
std::size_t t_left_bytes = t_left.size() * sizeof(float);
|
||||
std::size_t t_right_bytes = t_right.size() * sizeof(float);
|
||||
std::size_t t_result_bytes = t_result.size() * sizeof(float);
|
||||
|
||||
float* d_t_left;
|
||||
float* d_t_right;
|
||||
float* d_t_result;
|
||||
|
||||
hipMalloc((void**)(&d_t_left), t_left_bytes);
|
||||
hipMalloc((void**)(&d_t_right), t_right_bytes);
|
||||
hipMalloc((void**)(&d_t_result), t_result_bytes);
|
||||
|
||||
hipMemcpy(d_t_left, t_left.data(), t_left_bytes, hipMemcpyHostToDevice);
|
||||
hipMemcpy(d_t_right, t_right.data(), t_right_bytes, hipMemcpyHostToDevice);
|
||||
|
||||
Eigen::HipStreamDevice stream;
|
||||
Eigen::GpuDevice gpu_device(&stream);
|
||||
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
|
||||
gpu_t_left(d_t_left, Eigen::array<int, 2>(m_size, k_size));
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
|
||||
gpu_t_right(d_t_right, Eigen::array<int, 2>(k_size, n_size));
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
|
||||
gpu_t_result(d_t_result, Eigen::array<int, 2>(m_size, n_size));
|
||||
|
||||
|
||||
gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims);
|
||||
t_result = t_left.contract(t_right, dims);
|
||||
|
||||
hipMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, hipMemcpyDeviceToHost);
|
||||
for (DenseIndex i = 0; i < t_result.size(); i++) {
|
||||
if (fabs(t_result(i) - t_result_gpu(i)) < 1e-4f) {
|
||||
continue;
|
||||
}
|
||||
if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), 1e-4f)) {
|
||||
continue;
|
||||
}
|
||||
std::cout << "mismatch detected at index " << i << ": " << t_result(i)
|
||||
<< " vs " << t_result_gpu(i) << std::endl;
|
||||
assert(false);
|
||||
}
|
||||
|
||||
hipFree((void*)d_t_left);
|
||||
hipFree((void*)d_t_right);
|
||||
hipFree((void*)d_t_result);
|
||||
}
|
||||
|
||||
|
||||
template<int DataLayout>
|
||||
void test_scalar(int m_size, int k_size, int n_size)
|
||||
{
|
||||
std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl;
|
||||
// with these dimensions, the output has 300 * 140 elements, which is
|
||||
// more than 30 * 1024, which is the number of threads in blocks on
|
||||
// a 15 SM GK110 GPU
|
||||
Tensor<float, 2, DataLayout> t_left(m_size, k_size);
|
||||
Tensor<float, 2, DataLayout> t_right(k_size, n_size);
|
||||
Tensor<float, 0, DataLayout> t_result;
|
||||
Tensor<float, 0, DataLayout> t_result_gpu;
|
||||
Eigen::array<DimPair, 2> dims(DimPair(0, 0), DimPair(1, 1));
|
||||
|
||||
t_left.setRandom();
|
||||
t_right.setRandom();
|
||||
|
||||
std::size_t t_left_bytes = t_left.size() * sizeof(float);
|
||||
std::size_t t_right_bytes = t_right.size() * sizeof(float);
|
||||
std::size_t t_result_bytes = sizeof(float);
|
||||
|
||||
float* d_t_left;
|
||||
float* d_t_right;
|
||||
float* d_t_result;
|
||||
|
||||
hipMalloc((void**)(&d_t_left), t_left_bytes);
|
||||
hipMalloc((void**)(&d_t_right), t_right_bytes);
|
||||
hipMalloc((void**)(&d_t_result), t_result_bytes);
|
||||
|
||||
hipMemcpy(d_t_left, t_left.data(), t_left_bytes, hipMemcpyHostToDevice);
|
||||
hipMemcpy(d_t_right, t_right.data(), t_right_bytes, hipMemcpyHostToDevice);
|
||||
|
||||
Eigen::HipStreamDevice stream;
|
||||
Eigen::GpuDevice gpu_device(&stream);
|
||||
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
|
||||
gpu_t_left(d_t_left, m_size, k_size);
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
|
||||
gpu_t_right(d_t_right, k_size, n_size);
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 0, DataLayout> >
|
||||
gpu_t_result(d_t_result);
|
||||
|
||||
gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims);
|
||||
t_result = t_left.contract(t_right, dims);
|
||||
|
||||
hipMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, hipMemcpyDeviceToHost);
|
||||
if (fabs(t_result() - t_result_gpu()) > 1e-4f &&
|
||||
!Eigen::internal::isApprox(t_result(), t_result_gpu(), 1e-4f)) {
|
||||
std::cout << "mismatch detected: " << t_result()
|
||||
<< " vs " << t_result_gpu() << std::endl;
|
||||
assert(false);
|
||||
}
|
||||
|
||||
hipFree((void*)d_t_left);
|
||||
hipFree((void*)d_t_right);
|
||||
hipFree((void*)d_t_result);
|
||||
}
|
||||
|
||||
|
||||
template<int DataLayout>
|
||||
void test_hip_contraction_m() {
|
||||
for (int k = 32; k < 256; k++) {
|
||||
test_hip_contraction<ColMajor>(k, 128, 128);
|
||||
test_hip_contraction<RowMajor>(k, 128, 128);
|
||||
}
|
||||
}
|
||||
|
||||
template<int DataLayout>
|
||||
void test_hip_contraction_k() {
|
||||
for (int k = 32; k < 256; k++) {
|
||||
test_hip_contraction<ColMajor>(128, k, 128);
|
||||
test_hip_contraction<RowMajor>(128, k, 128);
|
||||
}
|
||||
}
|
||||
|
||||
template<int DataLayout>
|
||||
void test_hip_contraction_n() {
|
||||
for (int k = 32; k < 256; k++) {
|
||||
test_hip_contraction<ColMajor>(128, 128, k);
|
||||
test_hip_contraction<RowMajor>(128, 128, k);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template<int DataLayout>
|
||||
void test_hip_contraction_sizes() {
|
||||
int m_sizes[] = { 31, 39, 63, 64, 65,
|
||||
127, 129, 255, 257 , 511,
|
||||
512, 513, 1023, 1024, 1025};
|
||||
|
||||
int n_sizes[] = { 31, 39, 63, 64, 65,
|
||||
127, 129, 255, 257, 511,
|
||||
512, 513, 1023, 1024, 1025};
|
||||
|
||||
int k_sizes[] = { 31, 39, 63, 64, 65,
|
||||
95, 96, 127, 129, 255,
|
||||
257, 511, 512, 513, 1023,
|
||||
1024, 1025};
|
||||
|
||||
for (int i = 0; i < 15; i++) {
|
||||
for (int j = 0; j < 15; j++) {
|
||||
for (int k = 0; k < 17; k++) {
|
||||
test_hip_contraction<DataLayout>(m_sizes[i], n_sizes[j], k_sizes[k]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void test_cxx11_tensor_hip()
|
||||
{
|
||||
CALL_SUBTEST(test_hip_contraction<ColMajor>(128, 128, 128));
|
||||
CALL_SUBTEST(test_hip_contraction<RowMajor>(128, 128, 128));
|
||||
|
||||
CALL_SUBTEST(test_scalar<ColMajor>(128, 128, 128));
|
||||
CALL_SUBTEST(test_scalar<RowMajor>(128, 128, 128));
|
||||
|
||||
CALL_SUBTEST(test_hip_contraction_m<ColMajor>());
|
||||
CALL_SUBTEST(test_hip_contraction_m<RowMajor>());
|
||||
|
||||
CALL_SUBTEST(test_hip_contraction_k<ColMajor>());
|
||||
CALL_SUBTEST(test_hip_contraction_k<RowMajor>());
|
||||
|
||||
CALL_SUBTEST(test_hip_contraction_n<ColMajor>());
|
||||
CALL_SUBTEST(test_hip_contraction_n<RowMajor>());
|
||||
|
||||
// Commenting out these tests due to long runtimes
|
||||
// CALL_SUBTEST(test_hip_contraction_sizes<ColMajor>());
|
||||
// CALL_SUBTEST(test_hip_contraction_sizes<RowMajor>());
|
||||
}
|
||||
@@ -1,389 +0,0 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#define EIGEN_TEST_NO_LONGDOUBLE
|
||||
#define EIGEN_TEST_NO_COMPLEX
|
||||
#define EIGEN_TEST_FUNC cxx11_tensor_device
|
||||
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
|
||||
#define EIGEN_USE_GPU
|
||||
|
||||
#include "main.h"
|
||||
#include <unsupported/Eigen/CXX11/Tensor>
|
||||
|
||||
using Eigen::Tensor;
|
||||
using Eigen::RowMajor;
|
||||
|
||||
// Context for evaluation on cpu
|
||||
struct CPUContext {
|
||||
CPUContext(const Eigen::Tensor<float, 3>& in1, Eigen::Tensor<float, 3>& in2, Eigen::Tensor<float, 3>& out) : in1_(in1), in2_(in2), out_(out), kernel_1d_(2), kernel_2d_(2,2), kernel_3d_(2,2,2) {
|
||||
kernel_1d_(0) = 3.14f;
|
||||
kernel_1d_(1) = 2.7f;
|
||||
|
||||
kernel_2d_(0,0) = 3.14f;
|
||||
kernel_2d_(1,0) = 2.7f;
|
||||
kernel_2d_(0,1) = 0.2f;
|
||||
kernel_2d_(1,1) = 7.0f;
|
||||
|
||||
kernel_3d_(0,0,0) = 3.14f;
|
||||
kernel_3d_(0,1,0) = 2.7f;
|
||||
kernel_3d_(0,0,1) = 0.2f;
|
||||
kernel_3d_(0,1,1) = 7.0f;
|
||||
kernel_3d_(1,0,0) = -1.0f;
|
||||
kernel_3d_(1,1,0) = -0.3f;
|
||||
kernel_3d_(1,0,1) = -0.7f;
|
||||
kernel_3d_(1,1,1) = -0.5f;
|
||||
}
|
||||
|
||||
const Eigen::DefaultDevice& device() const { return cpu_device_; }
|
||||
|
||||
const Eigen::Tensor<float, 3>& in1() const { return in1_; }
|
||||
const Eigen::Tensor<float, 3>& in2() const { return in2_; }
|
||||
Eigen::Tensor<float, 3>& out() { return out_; }
|
||||
const Eigen::Tensor<float, 1>& kernel1d() const { return kernel_1d_; }
|
||||
const Eigen::Tensor<float, 2>& kernel2d() const { return kernel_2d_; }
|
||||
const Eigen::Tensor<float, 3>& kernel3d() const { return kernel_3d_; }
|
||||
|
||||
private:
|
||||
const Eigen::Tensor<float, 3>& in1_;
|
||||
const Eigen::Tensor<float, 3>& in2_;
|
||||
Eigen::Tensor<float, 3>& out_;
|
||||
|
||||
Eigen::Tensor<float, 1> kernel_1d_;
|
||||
Eigen::Tensor<float, 2> kernel_2d_;
|
||||
Eigen::Tensor<float, 3> kernel_3d_;
|
||||
|
||||
Eigen::DefaultDevice cpu_device_;
|
||||
};
|
||||
|
||||
|
||||
// Context for evaluation on GPU
|
||||
struct GPUContext {
|
||||
GPUContext(const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1, Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2, Eigen::TensorMap<Eigen::Tensor<float, 3> >& out) : in1_(in1), in2_(in2), out_(out), gpu_device_(&stream_) {
|
||||
assert(hipMalloc((void**)(&kernel_1d_), 2*sizeof(float)) == hipSuccess);
|
||||
float kernel_1d_val[] = {3.14f, 2.7f};
|
||||
assert(hipMemcpy(kernel_1d_, kernel_1d_val, 2*sizeof(float), hipMemcpyHostToDevice) == hipSuccess);
|
||||
|
||||
assert(hipMalloc((void**)(&kernel_2d_), 4*sizeof(float)) == hipSuccess);
|
||||
float kernel_2d_val[] = {3.14f, 2.7f, 0.2f, 7.0f};
|
||||
assert(hipMemcpy(kernel_2d_, kernel_2d_val, 4*sizeof(float), hipMemcpyHostToDevice) == hipSuccess);
|
||||
|
||||
assert(hipMalloc((void**)(&kernel_3d_), 8*sizeof(float)) == hipSuccess);
|
||||
float kernel_3d_val[] = {3.14f, -1.0f, 2.7f, -0.3f, 0.2f, -0.7f, 7.0f, -0.5f};
|
||||
assert(hipMemcpy(kernel_3d_, kernel_3d_val, 8*sizeof(float), hipMemcpyHostToDevice) == hipSuccess);
|
||||
}
|
||||
~GPUContext() {
|
||||
assert(hipFree(kernel_1d_) == hipSuccess);
|
||||
assert(hipFree(kernel_2d_) == hipSuccess);
|
||||
assert(hipFree(kernel_3d_) == hipSuccess);
|
||||
}
|
||||
|
||||
const Eigen::GpuDevice& device() const { return gpu_device_; }
|
||||
|
||||
const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1() const { return in1_; }
|
||||
const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2() const { return in2_; }
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 3> >& out() { return out_; }
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 1> > kernel1d() const { return Eigen::TensorMap<Eigen::Tensor<float, 1> >(kernel_1d_, 2); }
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 2> > kernel2d() const { return Eigen::TensorMap<Eigen::Tensor<float, 2> >(kernel_2d_, 2, 2); }
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 3> > kernel3d() const { return Eigen::TensorMap<Eigen::Tensor<float, 3> >(kernel_3d_, 2, 2, 2); }
|
||||
|
||||
private:
|
||||
const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1_;
|
||||
const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2_;
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 3> >& out_;
|
||||
|
||||
float* kernel_1d_;
|
||||
float* kernel_2d_;
|
||||
float* kernel_3d_;
|
||||
|
||||
Eigen::HipStreamDevice stream_;
|
||||
Eigen::GpuDevice gpu_device_;
|
||||
};
|
||||
|
||||
|
||||
// Evaluates out = in1 + in2 * 3.14 + 2.718 on the context's device.
template <typename Context>
void test_contextual_eval(Context* context)
{
  Context& ctx = *context;
  ctx.out().device(ctx.device()) =
      ctx.in1() + ctx.in2() * 3.14f + ctx.in1().constant(2.718f);
}
|
||||
|
||||
// Evaluates out = (in1 + in2) * 3.14 + 2.718 on the context's device, with
// eval() called on the inner sum.
template <typename Context>
void test_forced_contextual_eval(Context* context)
{
  Context& ctx = *context;
  ctx.out().device(ctx.device()) =
      (ctx.in1() + ctx.in2()).eval() * 3.14f + ctx.in1().constant(2.718f);
}
|
||||
|
||||
// Builds out = in1 + in2 * 3.14 + 2.718 in two steps on the context's device:
// a plain assignment of the constant, followed by a += of the remaining terms.
template <typename Context>
void test_compound_assignment(Context* context)
{
  Context& ctx = *context;

  // Step 1: fill the output with the constant term.
  ctx.out().device(ctx.device()) = ctx.in1().constant(2.718f);

  // Step 2: accumulate the tensor terms on top of it.
  ctx.out().device(ctx.device()) += ctx.in1() + ctx.in2() * 3.14f;
}
|
||||
|
||||
|
||||
template <typename Context>
|
||||
void test_contraction(Context* context)
|
||||
{
|
||||
Eigen::array<std::pair<int, int>, 2> dims;
|
||||
dims[0] = std::make_pair(1, 1);
|
||||
dims[1] = std::make_pair(2, 2);
|
||||
|
||||
Eigen::array<int, 2> shape(40, 50*70);
|
||||
|
||||
Eigen::DSizes<int, 2> indices(0,0);
|
||||
Eigen::DSizes<int, 2> sizes(40,40);
|
||||
|
||||
context->out().reshape(shape).slice(indices, sizes).device(context->device()) = context->in1().contract(context->in2(), dims);
|
||||
}
|
||||
|
||||
|
||||
template <typename Context>
|
||||
void test_1d_convolution(Context* context)
|
||||
{
|
||||
Eigen::DSizes<int, 3> indices(0,0,0);
|
||||
Eigen::DSizes<int, 3> sizes(40,49,70);
|
||||
|
||||
Eigen::array<int, 1> dims(1);
|
||||
context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel1d(), dims);
|
||||
}
|
||||
|
||||
template <typename Context>
|
||||
void test_2d_convolution(Context* context)
|
||||
{
|
||||
Eigen::DSizes<int, 3> indices(0,0,0);
|
||||
Eigen::DSizes<int, 3> sizes(40,49,69);
|
||||
|
||||
Eigen::array<int, 2> dims(1,2);
|
||||
context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel2d(), dims);
|
||||
}
|
||||
|
||||
template <typename Context>
|
||||
void test_3d_convolution(Context* context)
|
||||
{
|
||||
Eigen::DSizes<int, 3> indices(0,0,0);
|
||||
Eigen::DSizes<int, 3> sizes(39,49,69);
|
||||
|
||||
Eigen::array<int, 3> dims(0,1,2);
|
||||
context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel3d(), dims);
|
||||
}
|
||||
|
||||
|
||||
void test_cpu() {
|
||||
Eigen::Tensor<float, 3> in1(40,50,70);
|
||||
Eigen::Tensor<float, 3> in2(40,50,70);
|
||||
Eigen::Tensor<float, 3> out(40,50,70);
|
||||
|
||||
in1 = in1.random() + in1.constant(10.0f);
|
||||
in2 = in2.random() + in2.constant(10.0f);
|
||||
|
||||
CPUContext context(in1, in2, out);
|
||||
test_contextual_eval(&context);
|
||||
for (int i = 0; i < 40; ++i) {
|
||||
for (int j = 0; j < 50; ++j) {
|
||||
for (int k = 0; k < 70; ++k) {
|
||||
VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
test_forced_contextual_eval(&context);
|
||||
for (int i = 0; i < 40; ++i) {
|
||||
for (int j = 0; j < 50; ++j) {
|
||||
for (int k = 0; k < 70; ++k) {
|
||||
VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) + in2(i,j,k)) * 3.14f + 2.718f);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
test_compound_assignment(&context);
|
||||
for (int i = 0; i < 40; ++i) {
|
||||
for (int j = 0; j < 50; ++j) {
|
||||
for (int k = 0; k < 70; ++k) {
|
||||
VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
test_contraction(&context);
|
||||
for (int i = 0; i < 40; ++i) {
|
||||
for (int j = 0; j < 40; ++j) {
|
||||
const float result = out(i,j,0);
|
||||
float expected = 0;
|
||||
for (int k = 0; k < 50; ++k) {
|
||||
for (int l = 0; l < 70; ++l) {
|
||||
expected += in1(i, k, l) * in2(j, k, l);
|
||||
}
|
||||
}
|
||||
VERIFY_IS_APPROX(expected, result);
|
||||
}
|
||||
}
|
||||
|
||||
test_1d_convolution(&context);
|
||||
for (int i = 0; i < 40; ++i) {
|
||||
for (int j = 0; j < 49; ++j) {
|
||||
for (int k = 0; k < 70; ++k) {
|
||||
VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
test_2d_convolution(&context);
|
||||
for (int i = 0; i < 40; ++i) {
|
||||
for (int j = 0; j < 49; ++j) {
|
||||
for (int k = 0; k < 69; ++k) {
|
||||
const float result = out(i,j,k);
|
||||
const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f) +
|
||||
(in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f);
|
||||
if (fabs(expected) < 1e-4f && fabs(result) < 1e-4f) {
|
||||
continue;
|
||||
}
|
||||
VERIFY_IS_APPROX(expected, result);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
test_3d_convolution(&context);
|
||||
for (int i = 0; i < 39; ++i) {
|
||||
for (int j = 0; j < 49; ++j) {
|
||||
for (int k = 0; k < 69; ++k) {
|
||||
const float result = out(i,j,k);
|
||||
const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f +
|
||||
in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f) +
|
||||
(in1(i+1,j,k) * -1.0f + in1(i+1,j+1,k) * -0.3f +
|
||||
in1(i+1,j,k+1) * -0.7f + in1(i+1,j+1,k+1) * -0.5f);
|
||||
if (fabs(expected) < 1e-4f && fabs(result) < 1e-4f) {
|
||||
continue;
|
||||
}
|
||||
VERIFY_IS_APPROX(expected, result);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void test_gpu() {
|
||||
Eigen::Tensor<float, 3> in1(40,50,70);
|
||||
Eigen::Tensor<float, 3> in2(40,50,70);
|
||||
Eigen::Tensor<float, 3> out(40,50,70);
|
||||
in1 = in1.random() + in1.constant(10.0f);
|
||||
in2 = in2.random() + in2.constant(10.0f);
|
||||
|
||||
std::size_t in1_bytes = in1.size() * sizeof(float);
|
||||
std::size_t in2_bytes = in2.size() * sizeof(float);
|
||||
std::size_t out_bytes = out.size() * sizeof(float);
|
||||
|
||||
float* d_in1;
|
||||
float* d_in2;
|
||||
float* d_out;
|
||||
hipMalloc((void**)(&d_in1), in1_bytes);
|
||||
hipMalloc((void**)(&d_in2), in2_bytes);
|
||||
hipMalloc((void**)(&d_out), out_bytes);
|
||||
|
||||
hipMemcpy(d_in1, in1.data(), in1_bytes, hipMemcpyHostToDevice);
|
||||
hipMemcpy(d_in2, in2.data(), in2_bytes, hipMemcpyHostToDevice);
|
||||
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, 40,50,70);
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in2(d_in2, 40,50,70);
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_out(d_out, 40,50,70);
|
||||
|
||||
GPUContext context(gpu_in1, gpu_in2, gpu_out);
|
||||
test_contextual_eval(&context);
|
||||
assert(hipMemcpy(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost) == hipSuccess);
|
||||
for (int i = 0; i < 40; ++i) {
|
||||
for (int j = 0; j < 50; ++j) {
|
||||
for (int k = 0; k < 70; ++k) {
|
||||
VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
test_forced_contextual_eval(&context);
|
||||
assert(hipMemcpy(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost) == hipSuccess);
|
||||
for (int i = 0; i < 40; ++i) {
|
||||
for (int j = 0; j < 50; ++j) {
|
||||
for (int k = 0; k < 70; ++k) {
|
||||
VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) + in2(i,j,k)) * 3.14f + 2.718f);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
test_compound_assignment(&context);
|
||||
assert(hipMemcpy(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost) == hipSuccess);
|
||||
for (int i = 0; i < 40; ++i) {
|
||||
for (int j = 0; j < 50; ++j) {
|
||||
for (int k = 0; k < 70; ++k) {
|
||||
VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
test_contraction(&context);
|
||||
assert(hipMemcpy(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost) == hipSuccess);
|
||||
for (int i = 0; i < 40; ++i) {
|
||||
for (int j = 0; j < 40; ++j) {
|
||||
const float result = out(i,j,0);
|
||||
float expected = 0;
|
||||
for (int k = 0; k < 50; ++k) {
|
||||
for (int l = 0; l < 70; ++l) {
|
||||
expected += in1(i, k, l) * in2(j, k, l);
|
||||
}
|
||||
}
|
||||
VERIFY_IS_APPROX(expected, result);
|
||||
}
|
||||
}
|
||||
|
||||
test_1d_convolution(&context);
|
||||
assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, context.device().stream()) == hipSuccess);
|
||||
assert(hipStreamSynchronize(context.device().stream()) == hipSuccess);
|
||||
for (int i = 0; i < 40; ++i) {
|
||||
for (int j = 0; j < 49; ++j) {
|
||||
for (int k = 0; k < 70; ++k) {
|
||||
VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
test_2d_convolution(&context);
|
||||
assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, context.device().stream()) == hipSuccess);
|
||||
assert(hipStreamSynchronize(context.device().stream()) == hipSuccess);
|
||||
for (int i = 0; i < 40; ++i) {
|
||||
for (int j = 0; j < 49; ++j) {
|
||||
for (int k = 0; k < 69; ++k) {
|
||||
const float result = out(i,j,k);
|
||||
const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f +
|
||||
in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f);
|
||||
VERIFY_IS_APPROX(expected, result);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
test_3d_convolution(&context);
|
||||
assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, context.device().stream()) == hipSuccess);
|
||||
assert(hipStreamSynchronize(context.device().stream()) == hipSuccess);
|
||||
for (int i = 0; i < 39; ++i) {
|
||||
for (int j = 0; j < 49; ++j) {
|
||||
for (int k = 0; k < 69; ++k) {
|
||||
const float result = out(i,j,k);
|
||||
const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f +
|
||||
in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f +
|
||||
in1(i+1,j,k) * -1.0f + in1(i+1,j+1,k) * -0.3f +
|
||||
in1(i+1,j,k+1) * -0.7f + in1(i+1,j+1,k+1) * -0.5f);
|
||||
VERIFY_IS_APPROX(expected, result);
|
||||
}
|
||||
}
|
||||
}
|
||||
*/
|
||||
}
|
||||
|
||||
|
||||
void test_cxx11_tensor_device()
|
||||
{
|
||||
CALL_SUBTEST(test_cpu());
|
||||
CALL_SUBTEST(test_gpu());
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,498 +0,0 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#define EIGEN_TEST_NO_LONGDOUBLE
|
||||
#define EIGEN_TEST_NO_COMPLEX
|
||||
#define EIGEN_TEST_FUNC cxx11_tensor_of_float16_hip
|
||||
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
|
||||
#define EIGEN_USE_GPU
|
||||
|
||||
#include "main.h"
|
||||
#include <unsupported/Eigen/CXX11/Tensor>
|
||||
|
||||
|
||||
using Eigen::Tensor;
|
||||
|
||||
template<typename>
|
||||
void test_hip_numext() {
|
||||
Eigen::HipStreamDevice stream;
|
||||
Eigen::GpuDevice gpu_device(&stream);
|
||||
int num_elem = 101;
|
||||
|
||||
float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
|
||||
bool* d_res_half = (bool*)gpu_device.allocate(num_elem * sizeof(bool));
|
||||
bool* d_res_float = (bool*)gpu_device.allocate(num_elem * sizeof(bool));
|
||||
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
|
||||
d_float, num_elem);
|
||||
Eigen::TensorMap<Eigen::Tensor<bool, 1>, Eigen::Aligned> gpu_res_half(
|
||||
d_res_half, num_elem);
|
||||
Eigen::TensorMap<Eigen::Tensor<bool, 1>, Eigen::Aligned> gpu_res_float(
|
||||
d_res_float, num_elem);
|
||||
|
||||
gpu_float.device(gpu_device) = gpu_float.random() - gpu_float.constant(0.5f);
|
||||
gpu_res_float.device(gpu_device) = gpu_float.unaryExpr(Eigen::internal::scalar_isnan_op<float>());
|
||||
gpu_res_half.device(gpu_device) = gpu_float.cast<Eigen::half>().unaryExpr(Eigen::internal::scalar_isnan_op<Eigen::half>());
|
||||
|
||||
Tensor<bool, 1> half_prec(num_elem);
|
||||
Tensor<bool, 1> full_prec(num_elem);
|
||||
gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(bool));
|
||||
gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(bool));
|
||||
gpu_device.synchronize();
|
||||
|
||||
for (int i = 0; i < num_elem; ++i) {
|
||||
std::cout << "Checking numext " << i << std::endl;
|
||||
VERIFY_IS_EQUAL(full_prec(i), half_prec(i));
|
||||
}
|
||||
|
||||
gpu_device.deallocate(d_float);
|
||||
gpu_device.deallocate(d_res_half);
|
||||
gpu_device.deallocate(d_res_float);
|
||||
}
|
||||
|
||||
|
||||
#ifdef EIGEN_HAS_HIP_FP16
|
||||
|
||||
template<typename>
|
||||
void test_hip_conversion() {
|
||||
Eigen::HipStreamDevice stream;
|
||||
Eigen::GpuDevice gpu_device(&stream);
|
||||
int num_elem = 101;
|
||||
|
||||
float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
|
||||
Eigen::half* d_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
|
||||
float* d_conv = (float*)gpu_device.allocate(num_elem * sizeof(float));
|
||||
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
|
||||
d_float, num_elem);
|
||||
Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_half(
|
||||
d_half, num_elem);
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_conv(
|
||||
d_conv, num_elem);
|
||||
|
||||
gpu_float.device(gpu_device) = gpu_float.random();
|
||||
gpu_half.device(gpu_device) = gpu_float.cast<Eigen::half>();
|
||||
gpu_conv.device(gpu_device) = gpu_half.cast<float>();
|
||||
|
||||
Tensor<float, 1> initial(num_elem);
|
||||
Tensor<float, 1> final(num_elem);
|
||||
gpu_device.memcpyDeviceToHost(initial.data(), d_float, num_elem*sizeof(float));
|
||||
gpu_device.memcpyDeviceToHost(final.data(), d_conv, num_elem*sizeof(float));
|
||||
|
||||
for (int i = 0; i < num_elem; ++i) {
|
||||
VERIFY_IS_APPROX(initial(i), final(i));
|
||||
}
|
||||
|
||||
gpu_device.deallocate(d_float);
|
||||
gpu_device.deallocate(d_half);
|
||||
gpu_device.deallocate(d_conv);
|
||||
}
|
||||
|
||||
template<typename>
|
||||
void test_hip_unary() {
|
||||
Eigen::HipStreamDevice stream;
|
||||
Eigen::GpuDevice gpu_device(&stream);
|
||||
int num_elem = 101;
|
||||
|
||||
float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
|
||||
float* d_res_half = (float*)gpu_device.allocate(num_elem * sizeof(float));
|
||||
float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
|
||||
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
|
||||
d_float, num_elem);
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half(
|
||||
d_res_half, num_elem);
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float(
|
||||
d_res_float, num_elem);
|
||||
|
||||
gpu_float.device(gpu_device) = gpu_float.random() - gpu_float.constant(0.5f);
|
||||
gpu_res_float.device(gpu_device) = gpu_float.abs();
|
||||
gpu_res_half.device(gpu_device) = gpu_float.cast<Eigen::half>().abs().cast<float>();
|
||||
|
||||
Tensor<float, 1> half_prec(num_elem);
|
||||
Tensor<float, 1> full_prec(num_elem);
|
||||
gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(float));
|
||||
gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float));
|
||||
gpu_device.synchronize();
|
||||
|
||||
for (int i = 0; i < num_elem; ++i) {
|
||||
std::cout << "Checking unary " << i << std::endl;
|
||||
VERIFY_IS_APPROX(full_prec(i), half_prec(i));
|
||||
}
|
||||
|
||||
gpu_device.deallocate(d_float);
|
||||
gpu_device.deallocate(d_res_half);
|
||||
gpu_device.deallocate(d_res_float);
|
||||
}
|
||||
|
||||
template<typename>
|
||||
void test_hip_elementwise() {
|
||||
Eigen::HipStreamDevice stream;
|
||||
Eigen::GpuDevice gpu_device(&stream);
|
||||
int num_elem = 101;
|
||||
|
||||
float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
|
||||
float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
|
||||
float* d_res_half = (float*)gpu_device.allocate(num_elem * sizeof(float));
|
||||
float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
|
||||
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float1(
|
||||
d_float1, num_elem);
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float2(
|
||||
d_float2, num_elem);
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half(
|
||||
d_res_half, num_elem);
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float(
|
||||
d_res_float, num_elem);
|
||||
|
||||
gpu_float1.device(gpu_device) = gpu_float1.random();
|
||||
gpu_float2.device(gpu_device) = gpu_float2.random();
|
||||
gpu_res_float.device(gpu_device) = (gpu_float1 + gpu_float2) * gpu_float1;
|
||||
gpu_res_half.device(gpu_device) = ((gpu_float1.cast<Eigen::half>() + gpu_float2.cast<Eigen::half>()) * gpu_float1.cast<Eigen::half>()).cast<float>();
|
||||
|
||||
Tensor<float, 1> half_prec(num_elem);
|
||||
Tensor<float, 1> full_prec(num_elem);
|
||||
gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(float));
|
||||
gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float));
|
||||
gpu_device.synchronize();
|
||||
|
||||
for (int i = 0; i < num_elem; ++i) {
|
||||
std::cout << "Checking elemwise " << i << ": full prec = " << full_prec(i) << " vs half prec = " << half_prec(i) << std::endl;
|
||||
VERIFY_IS_APPROX(static_cast<Eigen::half>(full_prec(i)), static_cast<Eigen::half>(half_prec(i)));
|
||||
}
|
||||
|
||||
gpu_device.deallocate(d_float1);
|
||||
gpu_device.deallocate(d_float2);
|
||||
gpu_device.deallocate(d_res_half);
|
||||
gpu_device.deallocate(d_res_float);
|
||||
}
|
||||
|
||||
template<typename>
|
||||
void test_hip_trancendental() {
|
||||
Eigen::HipStreamDevice stream;
|
||||
Eigen::GpuDevice gpu_device(&stream);
|
||||
int num_elem = 101;
|
||||
|
||||
float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
|
||||
float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
|
||||
float* d_float3 = (float*)gpu_device.allocate(num_elem * sizeof(float));
|
||||
Eigen::half* d_res1_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
|
||||
Eigen::half* d_res1_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
|
||||
Eigen::half* d_res2_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
|
||||
Eigen::half* d_res2_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
|
||||
Eigen::half* d_res3_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
|
||||
Eigen::half* d_res3_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
|
||||
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float1(d_float1, num_elem);
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float2(d_float2, num_elem);
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float3(d_float3, num_elem);
|
||||
Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res1_half(d_res1_half, num_elem);
|
||||
Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res1_float(d_res1_float, num_elem);
|
||||
Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res2_half(d_res2_half, num_elem);
|
||||
Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res2_float(d_res2_float, num_elem);
|
||||
Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res3_half(d_res3_half, num_elem);
|
||||
Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res3_float(d_res3_float, num_elem);
|
||||
Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res4_half(d_res3_half, num_elem);
|
||||
Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res4_float(d_res3_float, num_elem);
|
||||
|
||||
gpu_float1.device(gpu_device) = gpu_float1.random() - gpu_float1.constant(0.5f);
|
||||
gpu_float2.device(gpu_device) = gpu_float2.random() + gpu_float1.constant(0.5f);
|
||||
gpu_float3.device(gpu_device) = gpu_float3.random();
|
||||
gpu_res1_float.device(gpu_device) = gpu_float1.exp().cast<Eigen::half>();
|
||||
gpu_res2_float.device(gpu_device) = gpu_float2.log().cast<Eigen::half>();
|
||||
gpu_res3_float.device(gpu_device) = gpu_float3.log1p().cast<Eigen::half>();
|
||||
gpu_res4_float.device(gpu_device) = gpu_float3.expm1().cast<Eigen::half>();
|
||||
|
||||
gpu_res1_half.device(gpu_device) = gpu_float1.cast<Eigen::half>();
|
||||
gpu_res1_half.device(gpu_device) = gpu_res1_half.exp();
|
||||
|
||||
gpu_res2_half.device(gpu_device) = gpu_float2.cast<Eigen::half>();
|
||||
gpu_res2_half.device(gpu_device) = gpu_res2_half.log();
|
||||
|
||||
gpu_res3_half.device(gpu_device) = gpu_float3.cast<Eigen::half>();
|
||||
gpu_res3_half.device(gpu_device) = gpu_res3_half.log1p();
|
||||
|
||||
gpu_res3_half.device(gpu_device) = gpu_float3.cast<Eigen::half>();
|
||||
gpu_res3_half.device(gpu_device) = gpu_res3_half.expm1();
|
||||
|
||||
Tensor<float, 1> input1(num_elem);
|
||||
Tensor<Eigen::half, 1> half_prec1(num_elem);
|
||||
Tensor<Eigen::half, 1> full_prec1(num_elem);
|
||||
Tensor<float, 1> input2(num_elem);
|
||||
Tensor<Eigen::half, 1> half_prec2(num_elem);
|
||||
Tensor<Eigen::half, 1> full_prec2(num_elem);
|
||||
Tensor<float, 1> input3(num_elem);
|
||||
Tensor<Eigen::half, 1> half_prec3(num_elem);
|
||||
Tensor<Eigen::half, 1> full_prec3(num_elem);
|
||||
gpu_device.memcpyDeviceToHost(input1.data(), d_float1, num_elem*sizeof(float));
|
||||
gpu_device.memcpyDeviceToHost(input2.data(), d_float2, num_elem*sizeof(float));
|
||||
gpu_device.memcpyDeviceToHost(input3.data(), d_float3, num_elem*sizeof(float));
|
||||
gpu_device.memcpyDeviceToHost(half_prec1.data(), d_res1_half, num_elem*sizeof(Eigen::half));
|
||||
gpu_device.memcpyDeviceToHost(full_prec1.data(), d_res1_float, num_elem*sizeof(Eigen::half));
|
||||
gpu_device.memcpyDeviceToHost(half_prec2.data(), d_res2_half, num_elem*sizeof(Eigen::half));
|
||||
gpu_device.memcpyDeviceToHost(full_prec2.data(), d_res2_float, num_elem*sizeof(Eigen::half));
|
||||
gpu_device.memcpyDeviceToHost(half_prec3.data(), d_res3_half, num_elem*sizeof(Eigen::half));
|
||||
gpu_device.memcpyDeviceToHost(full_prec3.data(), d_res3_float, num_elem*sizeof(Eigen::half));
|
||||
gpu_device.synchronize();
|
||||
|
||||
for (int i = 0; i < num_elem; ++i) {
|
||||
std::cout << "Checking elemwise exp " << i << " input = " << input1(i) << " full = " << full_prec1(i) << " half = " << half_prec1(i) << std::endl;
|
||||
VERIFY_IS_APPROX(full_prec1(i), half_prec1(i));
|
||||
}
|
||||
for (int i = 0; i < num_elem; ++i) {
|
||||
std::cout << "Checking elemwise log " << i << " input = " << input2(i) << " full = " << full_prec2(i) << " half = " << half_prec2(i) << std::endl;
|
||||
if(std::abs(input2(i)-1.f)<0.05f) // log lacks accurary nearby 1
|
||||
VERIFY_IS_APPROX(full_prec2(i)+Eigen::half(0.1f), half_prec2(i)+Eigen::half(0.1f));
|
||||
else
|
||||
VERIFY_IS_APPROX(full_prec2(i), half_prec2(i));
|
||||
}
|
||||
for (int i = 0; i < num_elem; ++i) {
|
||||
std::cout << "Checking elemwise plog1 " << i << " input = " << input3(i) << " full = " << full_prec3(i) << " half = " << half_prec3(i) << std::endl;
|
||||
VERIFY_IS_APPROX(full_prec3(i), half_prec3(i));
|
||||
}
|
||||
gpu_device.deallocate(d_float1);
|
||||
gpu_device.deallocate(d_float2);
|
||||
gpu_device.deallocate(d_float3);
|
||||
gpu_device.deallocate(d_res1_half);
|
||||
gpu_device.deallocate(d_res1_float);
|
||||
gpu_device.deallocate(d_res2_half);
|
||||
gpu_device.deallocate(d_res2_float);
|
||||
gpu_device.deallocate(d_res3_float);
|
||||
gpu_device.deallocate(d_res3_half);
|
||||
}
|
||||
|
||||
template<typename>
|
||||
void test_hip_contractions() {
|
||||
Eigen::HipStreamDevice stream;
|
||||
Eigen::GpuDevice gpu_device(&stream);
|
||||
int rows = 23;
|
||||
int cols = 23;
|
||||
int num_elem = rows*cols;
|
||||
|
||||
float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
|
||||
float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
|
||||
Eigen::half* d_res_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
|
||||
Eigen::half* d_res_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
|
||||
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float1(
|
||||
d_float1, rows, cols);
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float2(
|
||||
d_float2, rows, cols);
|
||||
Eigen::TensorMap<Eigen::Tensor<Eigen::half, 2>, Eigen::Aligned> gpu_res_half(
|
||||
d_res_half, rows, cols);
|
||||
Eigen::TensorMap<Eigen::Tensor<Eigen::half, 2>, Eigen::Aligned> gpu_res_float(
|
||||
d_res_float, rows, cols);
|
||||
|
||||
gpu_float1.device(gpu_device) = gpu_float1.random() - gpu_float1.constant(0.5f);
|
||||
gpu_float2.device(gpu_device) = gpu_float2.random() - gpu_float2.constant(0.5f);
|
||||
|
||||
typedef Tensor<float, 2>::DimensionPair DimPair;
|
||||
Eigen::array<DimPair, 1> dims(DimPair(1, 0));
|
||||
gpu_res_float.device(gpu_device) = gpu_float1.contract(gpu_float2, dims).cast<Eigen::half>();
|
||||
gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().contract(gpu_float2.cast<Eigen::half>(), dims);
|
||||
|
||||
Tensor<Eigen::half, 2> half_prec(rows, cols);
|
||||
Tensor<Eigen::half, 2> full_prec(rows, cols);
|
||||
gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(Eigen::half));
|
||||
gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(Eigen::half));
|
||||
gpu_device.synchronize();
|
||||
|
||||
for (int i = 0; i < rows; ++i) {
|
||||
for (int j = 0; j < cols; ++j) {
|
||||
std::cout << "Checking contract " << i << " " << j << full_prec(i, j) << " " << half_prec(i, j) << std::endl;
|
||||
if (numext::abs(full_prec(i, j) - half_prec(i, j)) > Eigen::half(1e-2f)) {
|
||||
VERIFY_IS_APPROX(full_prec(i, j), half_prec(i, j));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
gpu_device.deallocate(d_float1);
|
||||
gpu_device.deallocate(d_float2);
|
||||
gpu_device.deallocate(d_res_half);
|
||||
gpu_device.deallocate(d_res_float);
|
||||
}
|
||||
|
||||
template<typename>
|
||||
void test_hip_reductions(int size1, int size2, int redux) {
|
||||
|
||||
std::cout << "Reducing " << size1 << " by " << size2
|
||||
<< " tensor along dim " << redux << std::endl;
|
||||
|
||||
Eigen::HipStreamDevice stream;
|
||||
Eigen::GpuDevice gpu_device(&stream);
|
||||
int num_elem = size1*size2;
|
||||
int result_size = (redux == 1 ? size1 : size2);
|
||||
|
||||
float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
|
||||
float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
|
||||
Eigen::half* d_res_half = (Eigen::half*)gpu_device.allocate(result_size * sizeof(Eigen::half));
|
||||
Eigen::half* d_res_float = (Eigen::half*)gpu_device.allocate(result_size * sizeof(Eigen::half));
|
||||
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float1(
|
||||
d_float1, size1, size2);
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float2(
|
||||
d_float2, size1, size2);
|
||||
Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res_half(
|
||||
d_res_half, result_size);
|
||||
Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res_float(
|
||||
d_res_float, result_size);
|
||||
|
||||
gpu_float1.device(gpu_device) = gpu_float1.random() * 2.0f;
|
||||
gpu_float2.device(gpu_device) = gpu_float2.random() * 2.0f;
|
||||
|
||||
Eigen::array<int, 1> redux_dim(redux);
|
||||
gpu_res_float.device(gpu_device) = gpu_float1.sum(redux_dim).cast<Eigen::half>();
|
||||
gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().sum(redux_dim);
|
||||
|
||||
Tensor<Eigen::half, 1> half_prec(result_size);
|
||||
Tensor<Eigen::half, 1> full_prec(result_size);
|
||||
gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, result_size*sizeof(Eigen::half));
|
||||
gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, result_size*sizeof(Eigen::half));
|
||||
gpu_device.synchronize();
|
||||
|
||||
for (int i = 0; i < result_size; ++i) {
|
||||
std::cout << "EXPECTED " << full_prec(i) << " GOT " << half_prec(i) << std::endl;
|
||||
VERIFY_IS_APPROX(full_prec(i), half_prec(i));
|
||||
}
|
||||
|
||||
gpu_device.deallocate(d_float1);
|
||||
gpu_device.deallocate(d_float2);
|
||||
gpu_device.deallocate(d_res_half);
|
||||
gpu_device.deallocate(d_res_float);
|
||||
}
|
||||
|
||||
template<typename>
|
||||
void test_hip_reductions() {
|
||||
test_hip_reductions<void>(13, 13, 0);
|
||||
test_hip_reductions<void>(13, 13, 1);
|
||||
|
||||
test_hip_reductions<void>(35, 36, 0);
|
||||
test_hip_reductions<void>(35, 36, 1);
|
||||
|
||||
test_hip_reductions<void>(36, 35, 0);
|
||||
test_hip_reductions<void>(36, 35, 1);
|
||||
}
|
||||
|
||||
template<typename>
|
||||
void test_hip_full_reductions() {
|
||||
Eigen::HipStreamDevice stream;
|
||||
Eigen::GpuDevice gpu_device(&stream);
|
||||
int size = 13;
|
||||
int num_elem = size*size;
|
||||
|
||||
float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
|
||||
float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
|
||||
Eigen::half* d_res_half = (Eigen::half*)gpu_device.allocate(1 * sizeof(Eigen::half));
|
||||
Eigen::half* d_res_float = (Eigen::half*)gpu_device.allocate(1 * sizeof(Eigen::half));
|
||||
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float1(
|
||||
d_float1, size, size);
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float2(
|
||||
d_float2, size, size);
|
||||
Eigen::TensorMap<Eigen::Tensor<Eigen::half, 0>, Eigen::Aligned> gpu_res_half(
|
||||
d_res_half);
|
||||
Eigen::TensorMap<Eigen::Tensor<Eigen::half, 0>, Eigen::Aligned> gpu_res_float(
|
||||
d_res_float);
|
||||
|
||||
gpu_float1.device(gpu_device) = gpu_float1.random();
|
||||
gpu_float2.device(gpu_device) = gpu_float2.random();
|
||||
|
||||
gpu_res_float.device(gpu_device) = gpu_float1.sum().cast<Eigen::half>();
|
||||
gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().sum();
|
||||
|
||||
Tensor<Eigen::half, 0> half_prec;
|
||||
Tensor<Eigen::half, 0> full_prec;
|
||||
gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, sizeof(Eigen::half));
|
||||
gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, sizeof(Eigen::half));
|
||||
gpu_device.synchronize();
|
||||
|
||||
VERIFY_IS_APPROX(full_prec(), half_prec());
|
||||
|
||||
gpu_res_float.device(gpu_device) = gpu_float1.maximum().cast<Eigen::half>();
|
||||
gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().maximum();
|
||||
gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, sizeof(Eigen::half));
|
||||
gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, sizeof(Eigen::half));
|
||||
gpu_device.synchronize();
|
||||
|
||||
VERIFY_IS_APPROX(full_prec(), half_prec());
|
||||
|
||||
gpu_device.deallocate(d_float1);
|
||||
gpu_device.deallocate(d_float2);
|
||||
gpu_device.deallocate(d_res_half);
|
||||
gpu_device.deallocate(d_res_float);
|
||||
}
|
||||
|
||||
template<typename>
|
||||
void test_hip_forced_evals() {
|
||||
|
||||
Eigen::HipStreamDevice stream;
|
||||
Eigen::GpuDevice gpu_device(&stream);
|
||||
int num_elem = 101;
|
||||
|
||||
float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
|
||||
float* d_res_half1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
|
||||
float* d_res_half2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
|
||||
float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
|
||||
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
|
||||
d_float, num_elem);
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half1(
|
||||
d_res_half1, num_elem);
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Unaligned> gpu_res_half2(
|
||||
d_res_half2, num_elem);
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float(
|
||||
d_res_float, num_elem);
|
||||
|
||||
Eigen::array<int, 1> no_bcast;
|
||||
no_bcast[0] = 1;
|
||||
|
||||
gpu_float.device(gpu_device) = gpu_float.random() - gpu_float.constant(0.5f);
|
||||
gpu_res_float.device(gpu_device) = gpu_float.abs();
|
||||
gpu_res_half1.device(gpu_device) = gpu_float.cast<Eigen::half>().abs().eval().cast<float>();
|
||||
gpu_res_half2.device(gpu_device) = gpu_float.cast<Eigen::half>().abs().broadcast(no_bcast).eval().cast<float>();
|
||||
|
||||
Tensor<float, 1> half_prec1(num_elem);
|
||||
Tensor<float, 1> half_prec2(num_elem);
|
||||
Tensor<float, 1> full_prec(num_elem);
|
||||
gpu_device.memcpyDeviceToHost(half_prec1.data(), d_res_half1, num_elem*sizeof(float));
|
||||
gpu_device.memcpyDeviceToHost(half_prec2.data(), d_res_half1, num_elem*sizeof(float));
|
||||
gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float));
|
||||
gpu_device.synchronize();
|
||||
|
||||
for (int i = 0; i < num_elem; ++i) {
|
||||
std::cout << "Checking forced eval " << i << full_prec(i) << " vs " << half_prec1(i) << " vs " << half_prec2(i) << std::endl;
|
||||
VERIFY_IS_APPROX(full_prec(i), half_prec1(i));
|
||||
VERIFY_IS_APPROX(full_prec(i), half_prec2(i));
|
||||
}
|
||||
|
||||
gpu_device.deallocate(d_float);
|
||||
gpu_device.deallocate(d_res_half1);
|
||||
gpu_device.deallocate(d_res_half2);
|
||||
gpu_device.deallocate(d_res_float);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
void test_cxx11_tensor_of_float16_hip()
|
||||
{
|
||||
CALL_SUBTEST(test_hip_numext<void>());
|
||||
|
||||
#ifdef EIGEN_HAS_HIP_FP16
|
||||
CALL_SUBTEST(test_hip_conversion<void>());
|
||||
CALL_SUBTEST(test_hip_unary<void>());
|
||||
CALL_SUBTEST(test_hip_elementwise<void>());
|
||||
CALL_SUBTEST(test_hip_trancendental<void>());
|
||||
CALL_SUBTEST(test_hip_contractions<void>());
|
||||
CALL_SUBTEST(test_hip_reductions<void>());
|
||||
CALL_SUBTEST(test_hip_full_reductions<void>());
|
||||
CALL_SUBTEST(test_hip_forced_evals<void>());
|
||||
#else
|
||||
std::cout << "Half floats are not supported by this version of hip: skipping the test" << std::endl;
|
||||
#endif
|
||||
}
|
||||
@@ -1,85 +0,0 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#define EIGEN_TEST_NO_LONGDOUBLE
|
||||
#define EIGEN_TEST_NO_COMPLEX
|
||||
#define EIGEN_TEST_FUNC cxx11_tensor_random_hip
|
||||
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
|
||||
#define EIGEN_USE_GPU
|
||||
|
||||
#include "main.h"
|
||||
#include <Eigen/CXX11/Tensor>
|
||||
|
||||
|
||||
void test_hip_random_uniform()
|
||||
{
|
||||
Tensor<float, 2> out(72,97);
|
||||
out.setZero();
|
||||
|
||||
std::size_t out_bytes = out.size() * sizeof(float);
|
||||
|
||||
float* d_out;
|
||||
hipMalloc((void**)(&d_out), out_bytes);
|
||||
|
||||
Eigen::HipStreamDevice stream;
|
||||
Eigen::GpuDevice gpu_device(&stream);
|
||||
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97);
|
||||
|
||||
gpu_out.device(gpu_device) = gpu_out.random();
|
||||
|
||||
assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
|
||||
assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
|
||||
|
||||
// For now we just check thes code doesn't crash.
|
||||
// TODO: come up with a valid test of randomness
|
||||
}
|
||||
|
||||
|
||||
void test_hip_random_normal()
|
||||
{
|
||||
Tensor<float, 2> out(72,97);
|
||||
out.setZero();
|
||||
|
||||
std::size_t out_bytes = out.size() * sizeof(float);
|
||||
|
||||
float* d_out;
|
||||
hipMalloc((void**)(&d_out), out_bytes);
|
||||
|
||||
Eigen::HipStreamDevice stream;
|
||||
Eigen::GpuDevice gpu_device(&stream);
|
||||
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97);
|
||||
|
||||
Eigen::internal::NormalRandomGenerator<float> gen(true);
|
||||
gpu_out.device(gpu_device) = gpu_out.random(gen);
|
||||
|
||||
assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
|
||||
assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
|
||||
}
|
||||
|
||||
static void test_complex()
|
||||
{
|
||||
Tensor<std::complex<float>, 1> vec(6);
|
||||
vec.setRandom();
|
||||
|
||||
// Fixme: we should check that the generated numbers follow a uniform
|
||||
// distribution instead.
|
||||
for (int i = 1; i < 6; ++i) {
|
||||
VERIFY_IS_NOT_EQUAL(vec(i), vec(i-1));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void test_cxx11_tensor_random_hip()
|
||||
{
|
||||
CALL_SUBTEST(test_hip_random_uniform());
|
||||
CALL_SUBTEST(test_hip_random_normal());
|
||||
CALL_SUBTEST(test_complex());
|
||||
}
|
||||
@@ -1,154 +0,0 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#define EIGEN_TEST_NO_LONGDOUBLE
|
||||
#define EIGEN_TEST_NO_COMPLEX
|
||||
#define EIGEN_TEST_FUNC cxx11_tensor_reduction_hip
|
||||
#define EIGEN_USE_GPU
|
||||
|
||||
#include "main.h"
|
||||
#include <unsupported/Eigen/CXX11/Tensor>
|
||||
|
||||
|
||||
template<typename Type, int DataLayout>
|
||||
static void test_full_reductions() {
|
||||
|
||||
Eigen::HipStreamDevice stream;
|
||||
Eigen::GpuDevice gpu_device(&stream);
|
||||
|
||||
const int num_rows = internal::random<int>(1024, 5*1024);
|
||||
const int num_cols = internal::random<int>(1024, 5*1024);
|
||||
|
||||
Tensor<Type, 2, DataLayout> in(num_rows, num_cols);
|
||||
in.setRandom();
|
||||
|
||||
Tensor<Type, 0, DataLayout> full_redux;
|
||||
full_redux = in.sum();
|
||||
|
||||
std::size_t in_bytes = in.size() * sizeof(Type);
|
||||
std::size_t out_bytes = full_redux.size() * sizeof(Type);
|
||||
Type* gpu_in_ptr = static_cast<Type*>(gpu_device.allocate(in_bytes));
|
||||
Type* gpu_out_ptr = static_cast<Type*>(gpu_device.allocate(out_bytes));
|
||||
gpu_device.memcpyHostToDevice(gpu_in_ptr, in.data(), in_bytes);
|
||||
|
||||
TensorMap<Tensor<Type, 2, DataLayout> > in_gpu(gpu_in_ptr, num_rows, num_cols);
|
||||
TensorMap<Tensor<Type, 0, DataLayout> > out_gpu(gpu_out_ptr);
|
||||
|
||||
out_gpu.device(gpu_device) = in_gpu.sum();
|
||||
|
||||
Tensor<Type, 0, DataLayout> full_redux_gpu;
|
||||
gpu_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_ptr, out_bytes);
|
||||
gpu_device.synchronize();
|
||||
|
||||
// Check that the CPU and GPU reductions return the same result.
|
||||
VERIFY_IS_APPROX(full_redux(), full_redux_gpu());
|
||||
|
||||
gpu_device.deallocate(gpu_in_ptr);
|
||||
gpu_device.deallocate(gpu_out_ptr);
|
||||
}
|
||||
|
||||
template<typename Type, int DataLayout>
|
||||
static void test_first_dim_reductions() {
|
||||
int dim_x = 33;
|
||||
int dim_y = 1;
|
||||
int dim_z = 128;
|
||||
|
||||
Tensor<Type, 3, DataLayout> in(dim_x, dim_y, dim_z);
|
||||
in.setRandom();
|
||||
|
||||
Eigen::array<int, 1> red_axis;
|
||||
red_axis[0] = 0;
|
||||
Tensor<Type, 2, DataLayout> redux = in.sum(red_axis);
|
||||
|
||||
// Create device
|
||||
Eigen::HipStreamDevice stream;
|
||||
Eigen::GpuDevice dev(&stream);
|
||||
|
||||
// Create data(T)
|
||||
Type* in_data = (Type*)dev.allocate(dim_x*dim_y*dim_z*sizeof(Type));
|
||||
Type* out_data = (Type*)dev.allocate(dim_z*dim_y*sizeof(Type));
|
||||
Eigen::TensorMap<Eigen::Tensor<Type, 3, DataLayout> > gpu_in(in_data, dim_x, dim_y, dim_z);
|
||||
Eigen::TensorMap<Eigen::Tensor<Type, 2, DataLayout> > gpu_out(out_data, dim_y, dim_z);
|
||||
|
||||
// Perform operation
|
||||
dev.memcpyHostToDevice(in_data, in.data(), in.size()*sizeof(Type));
|
||||
gpu_out.device(dev) = gpu_in.sum(red_axis);
|
||||
gpu_out.device(dev) += gpu_in.sum(red_axis);
|
||||
Tensor<Type, 2, DataLayout> redux_gpu(dim_y, dim_z);
|
||||
dev.memcpyDeviceToHost(redux_gpu.data(), out_data, gpu_out.size()*sizeof(Type));
|
||||
dev.synchronize();
|
||||
|
||||
// Check that the CPU and GPU reductions return the same result.
|
||||
for (int i = 0; i < gpu_out.size(); ++i) {
|
||||
VERIFY_IS_APPROX(2*redux(i), redux_gpu(i));
|
||||
}
|
||||
|
||||
dev.deallocate(in_data);
|
||||
dev.deallocate(out_data);
|
||||
}
|
||||
|
||||
template<typename Type, int DataLayout>
|
||||
static void test_last_dim_reductions() {
|
||||
int dim_x = 128;
|
||||
int dim_y = 1;
|
||||
int dim_z = 33;
|
||||
|
||||
Tensor<Type, 3, DataLayout> in(dim_x, dim_y, dim_z);
|
||||
in.setRandom();
|
||||
|
||||
Eigen::array<int, 1> red_axis;
|
||||
red_axis[0] = 2;
|
||||
Tensor<Type, 2, DataLayout> redux = in.sum(red_axis);
|
||||
|
||||
// Create device
|
||||
Eigen::HipStreamDevice stream;
|
||||
Eigen::GpuDevice dev(&stream);
|
||||
|
||||
// Create data
|
||||
Type* in_data = (Type*)dev.allocate(dim_x*dim_y*dim_z*sizeof(Type));
|
||||
Type* out_data = (Type*)dev.allocate(dim_x*dim_y*sizeof(Type));
|
||||
Eigen::TensorMap<Eigen::Tensor<Type, 3, DataLayout> > gpu_in(in_data, dim_x, dim_y, dim_z);
|
||||
Eigen::TensorMap<Eigen::Tensor<Type, 2, DataLayout> > gpu_out(out_data, dim_x, dim_y);
|
||||
|
||||
// Perform operation
|
||||
dev.memcpyHostToDevice(in_data, in.data(), in.size()*sizeof(Type));
|
||||
gpu_out.device(dev) = gpu_in.sum(red_axis);
|
||||
gpu_out.device(dev) += gpu_in.sum(red_axis);
|
||||
Tensor<Type, 2, DataLayout> redux_gpu(dim_x, dim_y);
|
||||
dev.memcpyDeviceToHost(redux_gpu.data(), out_data, gpu_out.size()*sizeof(Type));
|
||||
dev.synchronize();
|
||||
|
||||
// Check that the CPU and GPU reductions return the same result.
|
||||
for (int i = 0; i < gpu_out.size(); ++i) {
|
||||
VERIFY_IS_APPROX(2*redux(i), redux_gpu(i));
|
||||
}
|
||||
|
||||
dev.deallocate(in_data);
|
||||
dev.deallocate(out_data);
|
||||
}
|
||||
|
||||
|
||||
// Test entry point: runs the full, first-dimension and last-dimension
// reduction tests for float and double in both storage orders, except for the
// two double combinations that are disabled below.
void test_cxx11_tensor_reduction_hip()
{
  // Full reductions.
  CALL_SUBTEST((test_full_reductions<float, ColMajor>()));
  CALL_SUBTEST((test_full_reductions<double, ColMajor>()));
  CALL_SUBTEST((test_full_reductions<float, RowMajor>()));
  CALL_SUBTEST((test_full_reductions<double, RowMajor>()));

  // Reductions along the first dimension.
  CALL_SUBTEST((test_first_dim_reductions<float, ColMajor>()));
  CALL_SUBTEST((test_first_dim_reductions<double, ColMajor>()));
  CALL_SUBTEST((test_first_dim_reductions<float, RowMajor>()));
  // Outer reductions of doubles aren't supported just yet.
  // CALL_SUBTEST((test_first_dim_reductions<double, RowMajor>()));

  // Reductions along the last dimension.
  CALL_SUBTEST((test_last_dim_reductions<float, ColMajor>()));
  // Outer reductions of doubles aren't supported just yet.
  // CALL_SUBTEST((test_last_dim_reductions<double, ColMajor>()));
  CALL_SUBTEST((test_last_dim_reductions<float, RowMajor>()));
  CALL_SUBTEST((test_last_dim_reductions<double, RowMajor>()));
}
|
||||
@@ -1,76 +0,0 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#define EIGEN_TEST_NO_LONGDOUBLE
|
||||
#define EIGEN_TEST_NO_COMPLEX
|
||||
#define EIGEN_TEST_FUNC cxx11_tensor_scan_hip
|
||||
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
|
||||
#define EIGEN_USE_GPU
|
||||
|
||||
#include "main.h"
|
||||
#include <unsupported/Eigen/CXX11/Tensor>
|
||||
|
||||
using Eigen::Tensor;
|
||||
typedef Tensor<float, 1>::DimensionPair DimPair;
|
||||
|
||||
template<int DataLayout>
|
||||
void test_hip_cumsum(int m_size, int k_size, int n_size)
|
||||
{
|
||||
std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl;
|
||||
Tensor<float, 3, DataLayout> t_input(m_size, k_size, n_size);
|
||||
Tensor<float, 3, DataLayout> t_result(m_size, k_size, n_size);
|
||||
Tensor<float, 3, DataLayout> t_result_gpu(m_size, k_size, n_size);
|
||||
|
||||
t_input.setRandom();
|
||||
|
||||
std::size_t t_input_bytes = t_input.size() * sizeof(float);
|
||||
std::size_t t_result_bytes = t_result.size() * sizeof(float);
|
||||
|
||||
float* d_t_input;
|
||||
float* d_t_result;
|
||||
|
||||
hipMalloc((void**)(&d_t_input), t_input_bytes);
|
||||
hipMalloc((void**)(&d_t_result), t_result_bytes);
|
||||
|
||||
hipMemcpy(d_t_input, t_input.data(), t_input_bytes, hipMemcpyHostToDevice);
|
||||
|
||||
Eigen::HipStreamDevice stream;
|
||||
Eigen::GpuDevice gpu_device(&stream);
|
||||
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 3, DataLayout> >
|
||||
gpu_t_input(d_t_input, Eigen::array<int, 3>(m_size, k_size, n_size));
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 3, DataLayout> >
|
||||
gpu_t_result(d_t_result, Eigen::array<int, 3>(m_size, k_size, n_size));
|
||||
|
||||
gpu_t_result.device(gpu_device) = gpu_t_input.cumsum(1);
|
||||
t_result = t_input.cumsum(1);
|
||||
|
||||
hipMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, hipMemcpyDeviceToHost);
|
||||
for (DenseIndex i = 0; i < t_result.size(); i++) {
|
||||
if (fabs(t_result(i) - t_result_gpu(i)) < 1e-4f) {
|
||||
continue;
|
||||
}
|
||||
if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), 1e-4f)) {
|
||||
continue;
|
||||
}
|
||||
std::cout << "mismatch detected at index " << i << ": " << t_result(i)
|
||||
<< " vs " << t_result_gpu(i) << std::endl;
|
||||
assert(false);
|
||||
}
|
||||
|
||||
hipFree((void*)d_t_input);
|
||||
hipFree((void*)d_t_result);
|
||||
}
|
||||
|
||||
|
||||
void test_cxx11_tensor_scan_hip()
|
||||
{
|
||||
CALL_SUBTEST(test_hip_cumsum<ColMajor>(128, 128, 128));
|
||||
CALL_SUBTEST(test_hip_cumsum<RowMajor>(128, 128, 128));
|
||||
}
|
||||
Reference in New Issue
Block a user