removing the *Hip files from the unsupported/Eigen/CXX11/src/Tensor and unsupported/test directories

2026-04-10 11:34:33 +08:00 · 2018-06-20 12:57:02 -04:00
parent 7e41c8f1a9
commit cfdabbcc8f
13 changed files with 0 additions and 6849 deletions
--- a/unsupported/test/cxx11_tensor_argmax_hip.cu
+++ b/unsupported/test/cxx11_tensor_argmax_hip.cu
@@ -1,251 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-
-#define EIGEN_TEST_NO_LONGDOUBLE
-#define EIGEN_TEST_FUNC cxx11_tensor_hip
-#define EIGEN_USE_GPU
-
-#include "main.h"
-#include <unsupported/Eigen/CXX11/Tensor>
-
-using Eigen::Tensor;
-
-template <int Layout>
-void test_hip_simple_argmax()
-{
-  Tensor<double, 3, Layout> in(Eigen::array<DenseIndex, 3>(72,53,97));
-  Tensor<DenseIndex, 1, Layout> out_max(Eigen::array<DenseIndex, 1>(1));
-  Tensor<DenseIndex, 1, Layout> out_min(Eigen::array<DenseIndex, 1>(1));
-  in.setRandom();
-  in *= in.constant(100.0);
-  in(0, 0, 0) = -1000.0;
-  in(71, 52, 96) = 1000.0;
-
-  std::size_t in_bytes = in.size() * sizeof(double);
-  std::size_t out_bytes = out_max.size() * sizeof(DenseIndex);
-
-  double* d_in;
-  DenseIndex* d_out_max;
-  DenseIndex* d_out_min;
-  hipMalloc((void**)(&d_in), in_bytes);
-  hipMalloc((void**)(&d_out_max), out_bytes);
-  hipMalloc((void**)(&d_out_min), out_bytes);
-
-  hipMemcpy(d_in, in.data(), in_bytes, hipMemcpyHostToDevice);
-
-  Eigen::HipStreamDevice stream;
-  Eigen::GpuDevice gpu_device(&stream);
-
-  Eigen::TensorMap<Eigen::Tensor<double, 3, Layout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 3>(72,53,97));
-  Eigen::TensorMap<Eigen::Tensor<DenseIndex, 1, Layout>, Aligned > gpu_out_max(d_out_max, Eigen::array<DenseIndex, 1>(1));
-  Eigen::TensorMap<Eigen::Tensor<DenseIndex, 1, Layout>, Aligned > gpu_out_min(d_out_min, Eigen::array<DenseIndex, 1>(1));
-
-  gpu_out_max.device(gpu_device) = gpu_in.argmax();
-  gpu_out_min.device(gpu_device) = gpu_in.argmin();
-
-  assert(hipMemcpyAsync(out_max.data(), d_out_max, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
-  assert(hipMemcpyAsync(out_min.data(), d_out_min, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
-  assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
-
-  VERIFY_IS_EQUAL(out_max(Eigen::array<DenseIndex, 1>(0)), 72*53*97 - 1);
-  VERIFY_IS_EQUAL(out_min(Eigen::array<DenseIndex, 1>(0)), 0);
-
-  hipFree(d_in);
-  hipFree(d_out_max);
-  hipFree(d_out_min);
-}
-
-template <int DataLayout>
-void test_hip_argmax_dim()
-{
-  Tensor<float, 4, DataLayout> tensor(2,3,5,7);
-  std::vector<int> dims;
-  dims.push_back(2); dims.push_back(3); dims.push_back(5); dims.push_back(7);
-
-  for (int dim = 0; dim < 4; ++dim) {
-    tensor.setRandom();
-    tensor = (tensor + tensor.constant(0.5)).log();
-
-    array<DenseIndex, 3> out_shape;
-    for (int d = 0; d < 3; ++d) out_shape[d] = (d < dim) ? dims[d] : dims[d+1];
-
-    Tensor<DenseIndex, 3, DataLayout> tensor_arg(out_shape);
-
-    array<DenseIndex, 4> ix;
-    for (int i = 0; i < 2; ++i) {
-      for (int j = 0; j < 3; ++j) {
-        for (int k = 0; k < 5; ++k) {
-          for (int l = 0; l < 7; ++l) {
-            ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l;
-            if (ix[dim] != 0) continue;
-            // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) = 10.0
-            tensor(ix) = 10.0;
-          }
-        }
-      }
-    }
-
-    std::size_t in_bytes = tensor.size() * sizeof(float);
-    std::size_t out_bytes = tensor_arg.size() * sizeof(DenseIndex);
-
-    float* d_in;
-    DenseIndex* d_out;
-    hipMalloc((void**)(&d_in), in_bytes);
-    hipMalloc((void**)(&d_out), out_bytes);
-
-    hipMemcpy(d_in, tensor.data(), in_bytes, hipMemcpyHostToDevice);
-
-    Eigen::HipStreamDevice stream;
-    Eigen::GpuDevice gpu_device(&stream);
-
-    Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 4>(2, 3, 5, 7));
-    Eigen::TensorMap<Eigen::Tensor<DenseIndex, 3, DataLayout>, Aligned > gpu_out(d_out, out_shape);
-
-    gpu_out.device(gpu_device) = gpu_in.argmax(dim);
-
-    assert(hipMemcpyAsync(tensor_arg.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
-    assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
-
-    VERIFY_IS_EQUAL(tensor_arg.size(),
-                    size_t(2*3*5*7 / tensor.dimension(dim)));
-
-    for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
-      // Expect max to be in the first index of the reduced dimension
-      VERIFY_IS_EQUAL(tensor_arg.data()[n], 0);
-    }
-
-    for (int i = 0; i < 2; ++i) {
-      for (int j = 0; j < 3; ++j) {
-        for (int k = 0; k < 5; ++k) {
-          for (int l = 0; l < 7; ++l) {
-            ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l;
-            if (ix[dim] != tensor.dimension(dim) - 1) continue;
-            // suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = 20.0
-            tensor(ix) = 20.0;
-          }
-        }
-      }
-    }
-
-    hipMemcpy(d_in, tensor.data(), in_bytes, hipMemcpyHostToDevice);
-
-    gpu_out.device(gpu_device) = gpu_in.argmax(dim);
-
-    assert(hipMemcpyAsync(tensor_arg.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
-    assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
-
-    for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
-      // Expect max to be in the last index of the reduced dimension
-      VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1);
-    }
-
-    hipFree(d_in);
-    hipFree(d_out);
-  }
-}
-
-template <int DataLayout>
-void test_hip_argmin_dim()
-{
-  Tensor<float, 4, DataLayout> tensor(2,3,5,7);
-  std::vector<int> dims;
-  dims.push_back(2); dims.push_back(3); dims.push_back(5); dims.push_back(7);
-
-  for (int dim = 0; dim < 4; ++dim) {
-    tensor.setRandom();
-    tensor = (tensor + tensor.constant(0.5)).log();
-
-    array<DenseIndex, 3> out_shape;
-    for (int d = 0; d < 3; ++d) out_shape[d] = (d < dim) ? dims[d] : dims[d+1];
-
-    Tensor<DenseIndex, 3, DataLayout> tensor_arg(out_shape);
-
-    array<DenseIndex, 4> ix;
-    for (int i = 0; i < 2; ++i) {
-      for (int j = 0; j < 3; ++j) {
-        for (int k = 0; k < 5; ++k) {
-          for (int l = 0; l < 7; ++l) {
-            ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l;
-            if (ix[dim] != 0) continue;
-            // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) = -10.0
-            tensor(ix) = -10.0;
-          }
-        }
-      }
-    }
-
-    std::size_t in_bytes = tensor.size() * sizeof(float);
-    std::size_t out_bytes = tensor_arg.size() * sizeof(DenseIndex);
-
-    float* d_in;
-    DenseIndex* d_out;
-    hipMalloc((void**)(&d_in), in_bytes);
-    hipMalloc((void**)(&d_out), out_bytes);
-
-    hipMemcpy(d_in, tensor.data(), in_bytes, hipMemcpyHostToDevice);
-
-    Eigen::HipStreamDevice stream;
-    Eigen::GpuDevice gpu_device(&stream);
-
-    Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 4>(2, 3, 5, 7));
-    Eigen::TensorMap<Eigen::Tensor<DenseIndex, 3, DataLayout>, Aligned > gpu_out(d_out, out_shape);
-
-    gpu_out.device(gpu_device) = gpu_in.argmin(dim);
-
-    assert(hipMemcpyAsync(tensor_arg.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
-    assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
-
-    VERIFY_IS_EQUAL(tensor_arg.size(),
-                    2*3*5*7 / tensor.dimension(dim));
-
-    for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
-      // Expect min to be in the first index of the reduced dimension
-      VERIFY_IS_EQUAL(tensor_arg.data()[n], 0);
-    }
-
-    for (int i = 0; i < 2; ++i) {
-      for (int j = 0; j < 3; ++j) {
-        for (int k = 0; k < 5; ++k) {
-          for (int l = 0; l < 7; ++l) {
-            ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l;
-            if (ix[dim] != tensor.dimension(dim) - 1) continue;
-            // suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = -20.0
-            tensor(ix) = -20.0;
-          }
-        }
-      }
-    }
-
-    hipMemcpy(d_in, tensor.data(), in_bytes, hipMemcpyHostToDevice);
-
-    gpu_out.device(gpu_device) = gpu_in.argmin(dim);
-
-    assert(hipMemcpyAsync(tensor_arg.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
-    assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
-
-    for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
-      // Expect min to be in the last index of the reduced dimension
-      VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1);
-    }
-
-    hipFree(d_in);
-    hipFree(d_out);
-  }
-}
-
-void test_cxx11_tensor_hip()
-{
-  CALL_SUBTEST(test_hip_simple_argmax<RowMajor>());
-  CALL_SUBTEST(test_hip_simple_argmax<ColMajor>());
-  CALL_SUBTEST(test_hip_argmax_dim<RowMajor>());
-  CALL_SUBTEST(test_hip_argmax_dim<ColMajor>());
-  CALL_SUBTEST(test_hip_argmin_dim<RowMajor>());
-  CALL_SUBTEST(test_hip_argmin_dim<ColMajor>());
-}
--- a/unsupported/test/cxx11_tensor_cast_float16_hip.cu
+++ b/unsupported/test/cxx11_tensor_cast_float16_hip.cu
@@ -1,79 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#define EIGEN_TEST_NO_LONGDOUBLE
-#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_cast_float16_hip
-#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
-#define EIGEN_USE_GPU
-
-#include "main.h"
-#include <unsupported/Eigen/CXX11/Tensor>
-
-using Eigen::Tensor;
-
-void test_hip_conversion() {
-  Eigen::HipStreamDevice stream;
-  Eigen::GpuDevice gpu_device(&stream);
-  int num_elem = 101;
-
-  Tensor<float, 1> floats(num_elem);
-  floats.setRandom();
-
-  float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
-  Eigen::half* d_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
-  float* d_conv = (float*)gpu_device.allocate(num_elem * sizeof(float));
-
-  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
-      d_float, num_elem);
-  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_half(
-      d_half, num_elem);
-  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_conv(
-      d_conv, num_elem);
-
-  gpu_device.memcpyHostToDevice(d_float, floats.data(), num_elem*sizeof(float));
-
-  gpu_half.device(gpu_device) = gpu_float.cast<Eigen::half>();
-  gpu_conv.device(gpu_device) = gpu_half.cast<float>();
-
-  Tensor<float, 1> initial(num_elem);
-  Tensor<float, 1> final(num_elem);
-  gpu_device.memcpyDeviceToHost(initial.data(), d_float, num_elem*sizeof(float));
-  gpu_device.memcpyDeviceToHost(final.data(), d_conv, num_elem*sizeof(float));
-  gpu_device.synchronize();
-
-  for (int i = 0; i < num_elem; ++i) {
-    VERIFY_IS_APPROX(initial(i), final(i));
-  }
-
-  gpu_device.deallocate(d_float);
-  gpu_device.deallocate(d_half);
-  gpu_device.deallocate(d_conv);
-}
-
-
-void test_fallback_conversion() {
-  int num_elem = 101;
-  Tensor<float, 1> floats(num_elem);
-  floats.setRandom();
-
-  Eigen::Tensor<Eigen::half, 1> halfs = floats.cast<Eigen::half>();
-  Eigen::Tensor<float, 1> conv = halfs.cast<float>();
-
-  for (int i = 0; i < num_elem; ++i) {
-    VERIFY_IS_APPROX(floats(i), conv(i));
-  }
-}
-
-
-void test_cxx11_tensor_cast_float16_hip()
-{
-  CALL_SUBTEST(test_hip_conversion());
-  CALL_SUBTEST(test_fallback_conversion());
-}
--- a/unsupported/test/cxx11_tensor_contract_hip.cu
+++ b/unsupported/test/cxx11_tensor_contract_hip.cu
@@ -1,215 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-// Copyright (C) 2014 Navdeep Jaitly <ndjaitly@google.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#define EIGEN_TEST_NO_LONGDOUBLE
-#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_hip
-#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
-#define EIGEN_USE_GPU
-
-#include "main.h"
-#include <unsupported/Eigen/CXX11/Tensor>
-
-
-using Eigen::Tensor;
-typedef Tensor<float, 1>::DimensionPair DimPair;
-
-template<int DataLayout>
-void test_hip_contraction(int m_size, int k_size, int n_size)
-{
-  std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl;
-  // with these dimensions, the output has 300 * 140 elements, which is
-  // more than 30 * 1024, which is the number of threads in blocks on
-  // a 15 SM GK110 GPU
-  Tensor<float, 2, DataLayout> t_left(m_size, k_size);
-  Tensor<float, 2, DataLayout> t_right(k_size, n_size);
-  Tensor<float, 2, DataLayout> t_result(m_size, n_size);
-  Tensor<float, 2, DataLayout> t_result_gpu(m_size, n_size);
-  Eigen::array<DimPair, 1> dims(DimPair(1, 0));
-
-  t_left.setRandom();
-  t_right.setRandom();
-
-  std::size_t t_left_bytes = t_left.size()  * sizeof(float);
-  std::size_t t_right_bytes = t_right.size() * sizeof(float);
-  std::size_t t_result_bytes = t_result.size() * sizeof(float);
-
-  float* d_t_left;
-  float* d_t_right;
-  float* d_t_result;
-
-  hipMalloc((void**)(&d_t_left), t_left_bytes);
-  hipMalloc((void**)(&d_t_right), t_right_bytes);
-  hipMalloc((void**)(&d_t_result), t_result_bytes);
-
-  hipMemcpy(d_t_left, t_left.data(), t_left_bytes, hipMemcpyHostToDevice);
-  hipMemcpy(d_t_right, t_right.data(), t_right_bytes, hipMemcpyHostToDevice);
-
-  Eigen::HipStreamDevice stream;
-  Eigen::GpuDevice gpu_device(&stream);
-
-  Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
-      gpu_t_left(d_t_left, Eigen::array<int, 2>(m_size, k_size));
-  Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
-      gpu_t_right(d_t_right, Eigen::array<int, 2>(k_size, n_size));
-  Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
-      gpu_t_result(d_t_result, Eigen::array<int, 2>(m_size, n_size));
-
-
-  gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims);
-  t_result = t_left.contract(t_right, dims);
-
-  hipMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, hipMemcpyDeviceToHost);
-  for (DenseIndex i = 0; i < t_result.size(); i++) {
-    if (fabs(t_result(i) - t_result_gpu(i)) < 1e-4f) {
-      continue;
-    }
-    if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), 1e-4f)) {
-      continue;
-    }
-    std::cout << "mismatch detected at index " << i << ": " << t_result(i)
-              << " vs " <<  t_result_gpu(i) << std::endl;
-    assert(false);
-  }
-
-  hipFree((void*)d_t_left);
-  hipFree((void*)d_t_right);
-  hipFree((void*)d_t_result);
-}
-
-
-template<int DataLayout>
-void test_scalar(int m_size, int k_size, int n_size)
-{
-  std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl;
-  // both dimensions of each operand are contracted (see dims below), so the
-  // result is a rank-0 tensor holding a single float; the index pairing
-  // requires m_size == k_size == n_size
-  Tensor<float, 2, DataLayout> t_left(m_size, k_size);
-  Tensor<float, 2, DataLayout> t_right(k_size, n_size);
-  Tensor<float, 0, DataLayout> t_result;
-  Tensor<float, 0, DataLayout> t_result_gpu;
-  Eigen::array<DimPair, 2> dims(DimPair(0, 0), DimPair(1, 1));
-
-  t_left.setRandom();
-  t_right.setRandom();
-
-  std::size_t t_left_bytes = t_left.size()  * sizeof(float);
-  std::size_t t_right_bytes = t_right.size() * sizeof(float);
-  std::size_t t_result_bytes = sizeof(float);
-
-  float* d_t_left;
-  float* d_t_right;
-  float* d_t_result;
-
-  hipMalloc((void**)(&d_t_left), t_left_bytes);
-  hipMalloc((void**)(&d_t_right), t_right_bytes);
-  hipMalloc((void**)(&d_t_result), t_result_bytes);
-
-  hipMemcpy(d_t_left, t_left.data(), t_left_bytes, hipMemcpyHostToDevice);
-  hipMemcpy(d_t_right, t_right.data(), t_right_bytes, hipMemcpyHostToDevice);
-
-  Eigen::HipStreamDevice stream;
-  Eigen::GpuDevice gpu_device(&stream);
-
-  Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
-      gpu_t_left(d_t_left, m_size, k_size);
-  Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
-      gpu_t_right(d_t_right, k_size, n_size);
-  Eigen::TensorMap<Eigen::Tensor<float, 0, DataLayout> >
-      gpu_t_result(d_t_result);
-
-  gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims);
-  t_result = t_left.contract(t_right, dims);
-
-  hipMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, hipMemcpyDeviceToHost);
-  if (fabs(t_result() - t_result_gpu()) > 1e-4f &&
-      !Eigen::internal::isApprox(t_result(), t_result_gpu(), 1e-4f)) {
-    std::cout << "mismatch detected: " << t_result()
-              << " vs " <<  t_result_gpu() << std::endl;
-    assert(false);
-  }
-
-  hipFree((void*)d_t_left);
-  hipFree((void*)d_t_right);
-  hipFree((void*)d_t_result);
-}
-
-
-template<int DataLayout>
-void test_hip_contraction_m() {
-  for (int k = 32; k < 256; k++) {
-    test_hip_contraction<ColMajor>(k, 128, 128);
-    test_hip_contraction<RowMajor>(k, 128, 128);
-  }
-}
-
-template<int DataLayout>
-void test_hip_contraction_k() {
-  for (int k = 32; k < 256; k++) {
-    test_hip_contraction<ColMajor>(128, k, 128);
-    test_hip_contraction<RowMajor>(128, k, 128);
-  }
-}
-
-template<int DataLayout>
-void test_hip_contraction_n() {
-  for (int k = 32; k < 256; k++) {
-    test_hip_contraction<ColMajor>(128, 128, k);
-    test_hip_contraction<RowMajor>(128, 128, k);
-  }
-}
-
-
-template<int DataLayout>
-void test_hip_contraction_sizes() {
-  int m_sizes[] = { 31,  39,   63,   64,   65,
-                   127, 129,  255,  257 , 511,
-                   512, 513, 1023, 1024, 1025};
-
-  int n_sizes[] = { 31,  39,   63,   64,   65,
-                   127, 129,  255,  257,  511,
-                   512, 513, 1023, 1024, 1025};
-
-  int k_sizes[] = {  31,   39,  63,  64,   65,
-                     95,   96, 127, 129,  255,
-                    257,  511, 512, 513, 1023,
-                   1024, 1025};
-
-  for (int i = 0; i < 15; i++) {
-    for (int j = 0; j < 15; j++) {
-      for (int k = 0; k < 17; k++) {
-        test_hip_contraction<DataLayout>(m_sizes[i], n_sizes[j], k_sizes[k]);
-      }
-    }
-  }
-}
-
-void test_cxx11_tensor_hip()
-{
-  CALL_SUBTEST(test_hip_contraction<ColMajor>(128, 128, 128));
-  CALL_SUBTEST(test_hip_contraction<RowMajor>(128, 128, 128));
-
-  CALL_SUBTEST(test_scalar<ColMajor>(128, 128, 128));
-  CALL_SUBTEST(test_scalar<RowMajor>(128, 128, 128));
-
-  CALL_SUBTEST(test_hip_contraction_m<ColMajor>());
-  CALL_SUBTEST(test_hip_contraction_m<RowMajor>());
-
-  CALL_SUBTEST(test_hip_contraction_k<ColMajor>());
-  CALL_SUBTEST(test_hip_contraction_k<RowMajor>());
-
-  CALL_SUBTEST(test_hip_contraction_n<ColMajor>());
-  CALL_SUBTEST(test_hip_contraction_n<RowMajor>());
-
-  // Commenting out these tests due to long runtimes
-  // CALL_SUBTEST(test_hip_contraction_sizes<ColMajor>());
-  // CALL_SUBTEST(test_hip_contraction_sizes<RowMajor>());
-}
--- a/unsupported/test/cxx11_tensor_device_hip.cu
+++ b/unsupported/test/cxx11_tensor_device_hip.cu
@@ -1,389 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#define EIGEN_TEST_NO_LONGDOUBLE
-#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_device
-#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
-#define EIGEN_USE_GPU
-
-#include "main.h"
-#include <unsupported/Eigen/CXX11/Tensor>
-
-using Eigen::Tensor;
-using Eigen::RowMajor;
-
-// Context for evaluation on cpu
-struct CPUContext {
-  CPUContext(const Eigen::Tensor<float, 3>& in1, Eigen::Tensor<float, 3>& in2, Eigen::Tensor<float, 3>& out) : in1_(in1), in2_(in2), out_(out), kernel_1d_(2), kernel_2d_(2,2), kernel_3d_(2,2,2) {
-    kernel_1d_(0) = 3.14f;
-    kernel_1d_(1) = 2.7f;
-
-    kernel_2d_(0,0) = 3.14f;
-    kernel_2d_(1,0) = 2.7f;
-    kernel_2d_(0,1) = 0.2f;
-    kernel_2d_(1,1) = 7.0f;
-
-    kernel_3d_(0,0,0) = 3.14f;
-    kernel_3d_(0,1,0) = 2.7f;
-    kernel_3d_(0,0,1) = 0.2f;
-    kernel_3d_(0,1,1) = 7.0f;
-    kernel_3d_(1,0,0) = -1.0f;
-    kernel_3d_(1,1,0) = -0.3f;
-    kernel_3d_(1,0,1) = -0.7f;
-    kernel_3d_(1,1,1) = -0.5f;
-  }
-
-  const Eigen::DefaultDevice& device() const { return cpu_device_; }
-
-  const Eigen::Tensor<float, 3>& in1() const { return in1_; }
-  const Eigen::Tensor<float, 3>& in2() const { return in2_; }
-  Eigen::Tensor<float, 3>& out() { return out_; }
-  const Eigen::Tensor<float, 1>& kernel1d() const { return kernel_1d_; }
-  const Eigen::Tensor<float, 2>& kernel2d() const { return kernel_2d_; }
-  const Eigen::Tensor<float, 3>& kernel3d() const { return kernel_3d_; }
-
- private:
-  const Eigen::Tensor<float, 3>& in1_;
-  const Eigen::Tensor<float, 3>& in2_;
-  Eigen::Tensor<float, 3>& out_;
-
-  Eigen::Tensor<float, 1> kernel_1d_;
-  Eigen::Tensor<float, 2> kernel_2d_;
-  Eigen::Tensor<float, 3> kernel_3d_;
-
-  Eigen::DefaultDevice cpu_device_;
-};
-
-
-// Context for evaluation on GPU
-struct GPUContext {
-  GPUContext(const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1, Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2, Eigen::TensorMap<Eigen::Tensor<float, 3> >& out) : in1_(in1), in2_(in2), out_(out), gpu_device_(&stream_) {
-    assert(hipMalloc((void**)(&kernel_1d_), 2*sizeof(float)) == hipSuccess);
-    float kernel_1d_val[] = {3.14f, 2.7f};
-    assert(hipMemcpy(kernel_1d_, kernel_1d_val, 2*sizeof(float), hipMemcpyHostToDevice) == hipSuccess);
-
-    assert(hipMalloc((void**)(&kernel_2d_), 4*sizeof(float)) == hipSuccess);
-    float kernel_2d_val[] = {3.14f, 2.7f, 0.2f, 7.0f};
-    assert(hipMemcpy(kernel_2d_, kernel_2d_val, 4*sizeof(float), hipMemcpyHostToDevice) == hipSuccess);
-
-    assert(hipMalloc((void**)(&kernel_3d_), 8*sizeof(float)) == hipSuccess);
-    float kernel_3d_val[] = {3.14f, -1.0f, 2.7f, -0.3f, 0.2f, -0.7f, 7.0f, -0.5f};
-    assert(hipMemcpy(kernel_3d_, kernel_3d_val, 8*sizeof(float), hipMemcpyHostToDevice) == hipSuccess);
-  }
-  ~GPUContext() {
-    assert(hipFree(kernel_1d_) == hipSuccess);
-    assert(hipFree(kernel_2d_) == hipSuccess);
-    assert(hipFree(kernel_3d_) == hipSuccess);
-  }
-
-  const Eigen::GpuDevice& device() const { return gpu_device_; }
-
-  const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1() const { return in1_; }
-  const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2() const { return in2_; }
-  Eigen::TensorMap<Eigen::Tensor<float, 3> >& out() { return out_; }
-  Eigen::TensorMap<Eigen::Tensor<float, 1> > kernel1d() const { return Eigen::TensorMap<Eigen::Tensor<float, 1> >(kernel_1d_, 2); }
-  Eigen::TensorMap<Eigen::Tensor<float, 2> > kernel2d() const { return Eigen::TensorMap<Eigen::Tensor<float, 2> >(kernel_2d_, 2, 2); }
-  Eigen::TensorMap<Eigen::Tensor<float, 3> > kernel3d() const { return Eigen::TensorMap<Eigen::Tensor<float, 3> >(kernel_3d_, 2, 2, 2); }
-
- private:
-  const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1_;
-  const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2_;
-  Eigen::TensorMap<Eigen::Tensor<float, 3> >& out_;
-
-  float* kernel_1d_;
-  float* kernel_2d_;
-  float* kernel_3d_;
-
-  Eigen::HipStreamDevice stream_;
-  Eigen::GpuDevice gpu_device_;
-};
-
-
-// The actual expression to evaluate
-template <typename Context>
-void test_contextual_eval(Context* context)
-{
-  context->out().device(context->device()) = context->in1() + context->in2() * 3.14f + context->in1().constant(2.718f);
-}
-
-template <typename Context>
-void test_forced_contextual_eval(Context* context)
-{
-  context->out().device(context->device()) = (context->in1() + context->in2()).eval() * 3.14f + context->in1().constant(2.718f);
-}
-
-template <typename Context>
-void test_compound_assignment(Context* context)
-{
-  context->out().device(context->device()) = context->in1().constant(2.718f);
-  context->out().device(context->device()) += context->in1() + context->in2() * 3.14f;
-}
-
-
-template <typename Context>
-void test_contraction(Context* context)
-{
-  Eigen::array<std::pair<int, int>, 2> dims;
-  dims[0] = std::make_pair(1, 1);
-  dims[1] = std::make_pair(2, 2);
-
-  Eigen::array<int, 2> shape(40, 50*70);
-
-  Eigen::DSizes<int, 2> indices(0,0);
-  Eigen::DSizes<int, 2> sizes(40,40);
-
-  context->out().reshape(shape).slice(indices, sizes).device(context->device()) = context->in1().contract(context->in2(), dims);
-}
-
-
-template <typename Context>
-void test_1d_convolution(Context* context)
-{
-  Eigen::DSizes<int, 3> indices(0,0,0);
-  Eigen::DSizes<int, 3> sizes(40,49,70);
-
-  Eigen::array<int, 1> dims(1);
-  context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel1d(), dims);
-}
-
-template <typename Context>
-void test_2d_convolution(Context* context)
-{
-  Eigen::DSizes<int, 3> indices(0,0,0);
-  Eigen::DSizes<int, 3> sizes(40,49,69);
-
-  Eigen::array<int, 2> dims(1,2);
-  context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel2d(), dims);
-}
-
-template <typename Context>
-void test_3d_convolution(Context* context)
-{
-  Eigen::DSizes<int, 3> indices(0,0,0);
-  Eigen::DSizes<int, 3> sizes(39,49,69);
-
-  Eigen::array<int, 3> dims(0,1,2);
-  context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel3d(), dims);
-}
-
-
-void test_cpu() {
-  Eigen::Tensor<float, 3> in1(40,50,70);
-  Eigen::Tensor<float, 3> in2(40,50,70);
-  Eigen::Tensor<float, 3> out(40,50,70);
-
-  in1 = in1.random() + in1.constant(10.0f);
-  in2 = in2.random() + in2.constant(10.0f);
-
-  CPUContext context(in1, in2, out);
-  test_contextual_eval(&context);
-  for (int i = 0; i < 40; ++i) {
-    for (int j = 0; j < 50; ++j) {
-      for (int k = 0; k < 70; ++k) {
-        VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
-      }
-    }
-  }
-
-  test_forced_contextual_eval(&context);
-  for (int i = 0; i < 40; ++i) {
-    for (int j = 0; j < 50; ++j) {
-      for (int k = 0; k < 70; ++k) {
-        VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) + in2(i,j,k)) * 3.14f + 2.718f);
-      }
-    }
-  }
-
-  test_compound_assignment(&context);
-  for (int i = 0; i < 40; ++i) {
-    for (int j = 0; j < 50; ++j) {
-      for (int k = 0; k < 70; ++k) {
-        VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
-      }
-    }
-  }
-
-  test_contraction(&context);
-  for (int i = 0; i < 40; ++i) {
-    for (int j = 0; j < 40; ++j) {
-      const float result = out(i,j,0);
-      float expected = 0;
-      for (int k = 0; k < 50; ++k) {
-        for (int l = 0; l < 70; ++l) {
-          expected += in1(i, k, l) * in2(j, k, l);
-        }
-      }
-      VERIFY_IS_APPROX(expected, result);
-    }
-  }
-
-  test_1d_convolution(&context);
-  for (int i = 0; i < 40; ++i) {
-    for (int j = 0; j < 49; ++j) {
-      for (int k = 0; k < 70; ++k) {
-        VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f));
-      }
-    }
-  }
-
-  test_2d_convolution(&context);
-  for (int i = 0; i < 40; ++i) {
-    for (int j = 0; j < 49; ++j) {
-      for (int k = 0; k < 69; ++k) {
-        const float result = out(i,j,k);
-        const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f) +
-                               (in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f);
-        if (fabs(expected) < 1e-4f && fabs(result) < 1e-4f) {
-          continue;
-        }
-        VERIFY_IS_APPROX(expected, result);
-      }
-    }
-  }
-
-  test_3d_convolution(&context);
-  for (int i = 0; i < 39; ++i) {
-    for (int j = 0; j < 49; ++j) {
-      for (int k = 0; k < 69; ++k) {
-        const float result = out(i,j,k);
-        const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f +
-                                in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f) +
-                               (in1(i+1,j,k) * -1.0f + in1(i+1,j+1,k) * -0.3f +
-                                in1(i+1,j,k+1) * -0.7f + in1(i+1,j+1,k+1) * -0.5f);
-        if (fabs(expected) < 1e-4f && fabs(result) < 1e-4f) {
-          continue;
-        }
-        VERIFY_IS_APPROX(expected, result);
-      }
-    }
-  }
-}
-
-void test_gpu() {
-  Eigen::Tensor<float, 3> in1(40,50,70);
-  Eigen::Tensor<float, 3> in2(40,50,70);
-  Eigen::Tensor<float, 3> out(40,50,70);
-  in1 = in1.random() + in1.constant(10.0f);
-  in2 = in2.random() + in2.constant(10.0f);
-
-  std::size_t in1_bytes = in1.size() * sizeof(float);
-  std::size_t in2_bytes = in2.size() * sizeof(float);
-  std::size_t out_bytes = out.size() * sizeof(float);
-
-  float* d_in1;
-  float* d_in2;
-  float* d_out;
-  hipMalloc((void**)(&d_in1), in1_bytes);
-  hipMalloc((void**)(&d_in2), in2_bytes);
-  hipMalloc((void**)(&d_out), out_bytes);
-
-  hipMemcpy(d_in1, in1.data(), in1_bytes, hipMemcpyHostToDevice);
-  hipMemcpy(d_in2, in2.data(), in2_bytes, hipMemcpyHostToDevice);
-
-  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, 40,50,70);
-  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in2(d_in2, 40,50,70);
-  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_out(d_out, 40,50,70);
-
-  GPUContext context(gpu_in1, gpu_in2, gpu_out);
-  test_contextual_eval(&context);
-  assert(hipMemcpy(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost) == hipSuccess);
-  for (int i = 0; i < 40; ++i) {
-    for (int j = 0; j < 50; ++j) {
-      for (int k = 0; k < 70; ++k) {
-        VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
-      }
-    }
-  }
-
-  test_forced_contextual_eval(&context);
-  assert(hipMemcpy(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost) == hipSuccess);
-  for (int i = 0; i < 40; ++i) {
-    for (int j = 0; j < 50; ++j) {
-      for (int k = 0; k < 70; ++k) {
-        VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) + in2(i,j,k)) * 3.14f + 2.718f);
-      }
-    }
-  }
-
-  test_compound_assignment(&context);
-  assert(hipMemcpy(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost) == hipSuccess);
-  for (int i = 0; i < 40; ++i) {
-    for (int j = 0; j < 50; ++j) {
-      for (int k = 0; k < 70; ++k) {
-        VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
-      }
-    }
-  }
-
-  test_contraction(&context);
-  assert(hipMemcpy(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost) == hipSuccess);
-  for (int i = 0; i < 40; ++i) {
-    for (int j = 0; j < 40; ++j) {
-      const float result = out(i,j,0);
-      float expected = 0;
-      for (int k = 0; k < 50; ++k) {
-        for (int l = 0; l < 70; ++l) {
-          expected += in1(i, k, l) * in2(j, k, l);
-        }
-      }
-      VERIFY_IS_APPROX(expected, result);
-    }
-  }
-
-  test_1d_convolution(&context);
-  assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, context.device().stream()) == hipSuccess);
-  assert(hipStreamSynchronize(context.device().stream()) == hipSuccess);
-  for (int i = 0; i < 40; ++i) {
-    for (int j = 0; j < 49; ++j) {
-      for (int k = 0; k < 70; ++k) {
-        VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f));
-      }
-    }
-  }
-
-  test_2d_convolution(&context);
-  assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, context.device().stream()) == hipSuccess);
-  assert(hipStreamSynchronize(context.device().stream()) == hipSuccess);
-  for (int i = 0; i < 40; ++i) {
-    for (int j = 0; j < 49; ++j) {
-      for (int k = 0; k < 69; ++k) {
-        const float result = out(i,j,k);
-        const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f +
-                                in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f);
-        VERIFY_IS_APPROX(expected, result);
-      }
-    }
-  }
-
-  /*
-  test_3d_convolution(&context);
-  assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, context.device().stream()) == hipSuccess);
-  assert(hipStreamSynchronize(context.device().stream()) == hipSuccess);
-  for (int i = 0; i < 39; ++i) {
-    for (int j = 0; j < 49; ++j) {
-      for (int k = 0; k < 69; ++k) {
-       const float result = out(i,j,k);
-        const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f +
-                                in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f +
-                                in1(i+1,j,k) * -1.0f + in1(i+1,j+1,k) * -0.3f +
-                                in1(i+1,j,k+1) * -0.7f + in1(i+1,j+1,k+1) * -0.5f);
-        VERIFY_IS_APPROX(expected, result);
-      }
-    }
-  }
-  */
-}
-
-
-void test_cxx11_tensor_device()
-{
-  CALL_SUBTEST(test_cpu());
-  CALL_SUBTEST(test_gpu());
-}
--- a/unsupported/test/cxx11_tensor_hip.cu
+++ b/unsupported/test/cxx11_tensor_hip.cu
--- a/unsupported/test/cxx11_tensor_of_float16_hip.cu
+++ b/unsupported/test/cxx11_tensor_of_float16_hip.cu
@@ -1,498 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#define EIGEN_TEST_NO_LONGDOUBLE
-#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_of_float16_hip
-#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
-#define EIGEN_USE_GPU
-
-#include "main.h"
-#include <unsupported/Eigen/CXX11/Tensor>
-
-
-using Eigen::Tensor;
-
-// Applies scalar_isnan_op to the same random device data twice -- directly on
-// the floats and on the values cast to Eigen::half -- and verifies that the
-// two boolean results agree element by element.
-template<typename>
-void test_hip_numext() {
-  typedef Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> FloatMap;
-  typedef Eigen::TensorMap<Eigen::Tensor<bool, 1>, Eigen::Aligned> BoolMap;
-
-  Eigen::HipStreamDevice hip_stream;
-  Eigen::GpuDevice gpu_device(&hip_stream);
-  const int num_elem = 101;
-  const std::size_t float_bytes = num_elem * sizeof(float);
-  const std::size_t bool_bytes = num_elem * sizeof(bool);
-
-  // Device buffers: the input plus one result buffer per precision.
-  float* dev_input = (float*)gpu_device.allocate(float_bytes);
-  bool* dev_isnan_half = (bool*)gpu_device.allocate(bool_bytes);
-  bool* dev_isnan_float = (bool*)gpu_device.allocate(bool_bytes);
-
-  FloatMap input(dev_input, num_elem);
-  BoolMap isnan_half(dev_isnan_half, num_elem);
-  BoolMap isnan_float(dev_isnan_float, num_elem);
-
-  // The input is random() shifted by -0.5.
-  input.device(gpu_device) = input.random() - input.constant(0.5f);
-  isnan_float.device(gpu_device) = input.unaryExpr(Eigen::internal::scalar_isnan_op<float>());
-  isnan_half.device(gpu_device) = input.cast<Eigen::half>().unaryExpr(Eigen::internal::scalar_isnan_op<Eigen::half>());
-
-  // Bring both results back to the host before comparing.
-  Tensor<bool, 1> half_prec(num_elem);
-  Tensor<bool, 1> full_prec(num_elem);
-  gpu_device.memcpyDeviceToHost(half_prec.data(), dev_isnan_half, bool_bytes);
-  gpu_device.memcpyDeviceToHost(full_prec.data(), dev_isnan_float, bool_bytes);
-  gpu_device.synchronize();
-
-  for (int i = 0; i < num_elem; ++i) {
-    std::cout << "Checking numext " << i << std::endl;
-    VERIFY_IS_EQUAL(full_prec(i), half_prec(i));
-  }
-
-  gpu_device.deallocate(dev_input);
-  gpu_device.deallocate(dev_isnan_half);
-  gpu_device.deallocate(dev_isnan_float);
-}
-
-
-#ifdef EIGEN_HAS_HIP_FP16
-
-// Round-trips random floats through Eigen::half on the device
-// (float -> half -> float) and verifies on the host that the result is
-// approximately equal to the original values.
-template<typename>
-void test_hip_conversion() {
-  Eigen::HipStreamDevice stream;
-  Eigen::GpuDevice gpu_device(&stream);
-  int num_elem = 101;
-
-  float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
-  Eigen::half* d_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
-  float* d_conv = (float*)gpu_device.allocate(num_elem * sizeof(float));
-
-  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
-      d_float, num_elem);
-  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_half(
-      d_half, num_elem);
-  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_conv(
-      d_conv, num_elem);
-
-  gpu_float.device(gpu_device) = gpu_float.random();
-  gpu_half.device(gpu_device) = gpu_float.cast<Eigen::half>();
-  gpu_conv.device(gpu_device) = gpu_half.cast<float>();
-
-  Tensor<float, 1> initial(num_elem);
-  Tensor<float, 1> final(num_elem);
-  gpu_device.memcpyDeviceToHost(initial.data(), d_float, num_elem*sizeof(float));
-  gpu_device.memcpyDeviceToHost(final.data(), d_conv, num_elem*sizeof(float));
-  // Every other half-precision test in this file synchronizes the device
-  // before reading the host copies; do the same here so the comparison never
-  // depends on memcpyDeviceToHost being blocking.
-  gpu_device.synchronize();
-
-  for (int i = 0; i < num_elem; ++i) {
-    VERIFY_IS_APPROX(initial(i), final(i));
-  }
-
-  gpu_device.deallocate(d_float);
-  gpu_device.deallocate(d_half);
-  gpu_device.deallocate(d_conv);
-}
-
-// Computes abs() of random device data in float and in Eigen::half (cast to
-// half, abs, cast back to float) and verifies the two results are
-// approximately equal.
-template<typename>
-void test_hip_unary() {
-  typedef Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> FloatMap;
-
-  Eigen::HipStreamDevice hip_stream;
-  Eigen::GpuDevice gpu_device(&hip_stream);
-  const int num_elem = 101;
-  const std::size_t num_bytes = num_elem * sizeof(float);
-
-  // One input buffer and one float result buffer per precision.
-  float* dev_input = (float*)gpu_device.allocate(num_bytes);
-  float* dev_abs_half = (float*)gpu_device.allocate(num_bytes);
-  float* dev_abs_float = (float*)gpu_device.allocate(num_bytes);
-
-  FloatMap input(dev_input, num_elem);
-  FloatMap abs_half(dev_abs_half, num_elem);
-  FloatMap abs_float(dev_abs_float, num_elem);
-
-  // The input is random() shifted by -0.5.
-  input.device(gpu_device) = input.random() - input.constant(0.5f);
-  abs_float.device(gpu_device) = input.abs();
-  abs_half.device(gpu_device) = input.cast<Eigen::half>().abs().cast<float>();
-
-  Tensor<float, 1> half_prec(num_elem);
-  Tensor<float, 1> full_prec(num_elem);
-  gpu_device.memcpyDeviceToHost(half_prec.data(), dev_abs_half, num_bytes);
-  gpu_device.memcpyDeviceToHost(full_prec.data(), dev_abs_float, num_bytes);
-  gpu_device.synchronize();
-
-  for (int i = 0; i < num_elem; ++i) {
-    std::cout << "Checking unary " << i << std::endl;
-    VERIFY_IS_APPROX(full_prec(i), half_prec(i));
-  }
-
-  gpu_device.deallocate(dev_input);
-  gpu_device.deallocate(dev_abs_half);
-  gpu_device.deallocate(dev_abs_float);
-}
-
-// Evaluates (a + b) * a on random device data in float and in Eigen::half
-// (inputs cast to half, result cast back to float), then compares the two
-// results after converting both to Eigen::half on the host.
-template<typename>
-void test_hip_elementwise() {
-  typedef Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> FloatMap;
-
-  Eigen::HipStreamDevice hip_stream;
-  Eigen::GpuDevice gpu_device(&hip_stream);
-  const int num_elem = 101;
-  const std::size_t num_bytes = num_elem * sizeof(float);
-
-  // Two operand buffers and one result buffer per precision.
-  float* dev_lhs = (float*)gpu_device.allocate(num_bytes);
-  float* dev_rhs = (float*)gpu_device.allocate(num_bytes);
-  float* dev_out_half = (float*)gpu_device.allocate(num_bytes);
-  float* dev_out_float = (float*)gpu_device.allocate(num_bytes);
-
-  FloatMap lhs(dev_lhs, num_elem);
-  FloatMap rhs(dev_rhs, num_elem);
-  FloatMap out_half(dev_out_half, num_elem);
-  FloatMap out_float(dev_out_float, num_elem);
-
-  lhs.device(gpu_device) = lhs.random();
-  rhs.device(gpu_device) = rhs.random();
-  out_float.device(gpu_device) = (lhs + rhs) * lhs;
-  out_half.device(gpu_device) = ((lhs.cast<Eigen::half>() + rhs.cast<Eigen::half>()) * lhs.cast<Eigen::half>()).cast<float>();
-
-  Tensor<float, 1> half_prec(num_elem);
-  Tensor<float, 1> full_prec(num_elem);
-  gpu_device.memcpyDeviceToHost(half_prec.data(), dev_out_half, num_bytes);
-  gpu_device.memcpyDeviceToHost(full_prec.data(), dev_out_float, num_bytes);
-  gpu_device.synchronize();
-
-  for (int i = 0; i < num_elem; ++i) {
-    std::cout << "Checking elemwise " << i << ": full prec = " << full_prec(i) << " vs half prec = " << half_prec(i) << std::endl;
-    // Both sides are converted to half so the comparison is made in half precision.
-    VERIFY_IS_APPROX(static_cast<Eigen::half>(full_prec(i)), static_cast<Eigen::half>(half_prec(i)));
-  }
-
-  gpu_device.deallocate(dev_lhs);
-  gpu_device.deallocate(dev_rhs);
-  gpu_device.deallocate(dev_out_half);
-  gpu_device.deallocate(dev_out_float);
-}
-
-// Compares exp, log, log1p and expm1 evaluated in float (result cast to
-// Eigen::half) against the same functions evaluated directly on Eigen::half
-// inputs.
-//
-// Every function has its own pair of result buffers. Previously the expm1
-// maps aliased the log1p buffers (and the half expm1 result was written
-// through the log1p map), so the log1p results were overwritten before being
-// copied back and were never actually verified.
-template<typename>
-void test_hip_trancendental() {
-  Eigen::HipStreamDevice stream;
-  Eigen::GpuDevice gpu_device(&stream);
-  int num_elem = 101;
-
-  float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
-  float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
-  float* d_float3 = (float*)gpu_device.allocate(num_elem * sizeof(float));
-  Eigen::half* d_res1_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
-  Eigen::half* d_res1_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
-  Eigen::half* d_res2_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
-  Eigen::half* d_res2_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
-  Eigen::half* d_res3_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
-  Eigen::half* d_res3_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
-  // Dedicated buffers for the expm1 results.
-  Eigen::half* d_res4_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
-  Eigen::half* d_res4_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
-
-  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float1(d_float1, num_elem);
-  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float2(d_float2, num_elem);
-  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float3(d_float3, num_elem);
-  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res1_half(d_res1_half, num_elem);
-  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res1_float(d_res1_float, num_elem);
-  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res2_half(d_res2_half, num_elem);
-  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res2_float(d_res2_float, num_elem);
-  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res3_half(d_res3_half, num_elem);
-  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res3_float(d_res3_float, num_elem);
-  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res4_half(d_res4_half, num_elem);
-  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res4_float(d_res4_float, num_elem);
-
-  // Inputs: float1 is random() - 0.5 (fed to exp), float2 is random() + 0.5
-  // (fed to log), float3 is random() (fed to log1p and expm1).
-  gpu_float1.device(gpu_device) = gpu_float1.random() - gpu_float1.constant(0.5f);
-  gpu_float2.device(gpu_device) = gpu_float2.random() + gpu_float1.constant(0.5f);
-  gpu_float3.device(gpu_device) = gpu_float3.random();
-
-  // Reference results: evaluate in float, then cast to half.
-  gpu_res1_float.device(gpu_device) = gpu_float1.exp().cast<Eigen::half>();
-  gpu_res2_float.device(gpu_device) = gpu_float2.log().cast<Eigen::half>();
-  gpu_res3_float.device(gpu_device) = gpu_float3.log1p().cast<Eigen::half>();
-  gpu_res4_float.device(gpu_device) = gpu_float3.expm1().cast<Eigen::half>();
-
-  // Results under test: cast the input to half first, then evaluate in half.
-  gpu_res1_half.device(gpu_device) = gpu_float1.cast<Eigen::half>();
-  gpu_res1_half.device(gpu_device) = gpu_res1_half.exp();
-
-  gpu_res2_half.device(gpu_device) = gpu_float2.cast<Eigen::half>();
-  gpu_res2_half.device(gpu_device) = gpu_res2_half.log();
-
-  gpu_res3_half.device(gpu_device) = gpu_float3.cast<Eigen::half>();
-  gpu_res3_half.device(gpu_device) = gpu_res3_half.log1p();
-
-  gpu_res4_half.device(gpu_device) = gpu_float3.cast<Eigen::half>();
-  gpu_res4_half.device(gpu_device) = gpu_res4_half.expm1();
-
-  Tensor<float, 1> input1(num_elem);
-  Tensor<Eigen::half, 1> half_prec1(num_elem);
-  Tensor<Eigen::half, 1> full_prec1(num_elem);
-  Tensor<float, 1> input2(num_elem);
-  Tensor<Eigen::half, 1> half_prec2(num_elem);
-  Tensor<Eigen::half, 1> full_prec2(num_elem);
-  Tensor<float, 1> input3(num_elem);
-  Tensor<Eigen::half, 1> half_prec3(num_elem);
-  Tensor<Eigen::half, 1> full_prec3(num_elem);
-  Tensor<Eigen::half, 1> half_prec4(num_elem);
-  Tensor<Eigen::half, 1> full_prec4(num_elem);
-  gpu_device.memcpyDeviceToHost(input1.data(), d_float1, num_elem*sizeof(float));
-  gpu_device.memcpyDeviceToHost(input2.data(), d_float2, num_elem*sizeof(float));
-  gpu_device.memcpyDeviceToHost(input3.data(), d_float3, num_elem*sizeof(float));
-  gpu_device.memcpyDeviceToHost(half_prec1.data(), d_res1_half, num_elem*sizeof(Eigen::half));
-  gpu_device.memcpyDeviceToHost(full_prec1.data(), d_res1_float, num_elem*sizeof(Eigen::half));
-  gpu_device.memcpyDeviceToHost(half_prec2.data(), d_res2_half, num_elem*sizeof(Eigen::half));
-  gpu_device.memcpyDeviceToHost(full_prec2.data(), d_res2_float, num_elem*sizeof(Eigen::half));
-  gpu_device.memcpyDeviceToHost(half_prec3.data(), d_res3_half, num_elem*sizeof(Eigen::half));
-  gpu_device.memcpyDeviceToHost(full_prec3.data(), d_res3_float, num_elem*sizeof(Eigen::half));
-  gpu_device.memcpyDeviceToHost(half_prec4.data(), d_res4_half, num_elem*sizeof(Eigen::half));
-  gpu_device.memcpyDeviceToHost(full_prec4.data(), d_res4_float, num_elem*sizeof(Eigen::half));
-  gpu_device.synchronize();
-
-  for (int i = 0; i < num_elem; ++i) {
-    std::cout << "Checking elemwise exp " << i << " input = " << input1(i) << " full = " << full_prec1(i) << " half = " << half_prec1(i) << std::endl;
-    VERIFY_IS_APPROX(full_prec1(i), half_prec1(i));
-  }
-  for (int i = 0; i < num_elem; ++i) {
-    std::cout << "Checking elemwise log " << i << " input = " << input2(i) << " full = " << full_prec2(i) << " half = " << half_prec2(i) << std::endl;
-    if(std::abs(input2(i)-1.f)<0.05f) // log lacks accuracy near 1; offset both sides to loosen the relative check
-      VERIFY_IS_APPROX(full_prec2(i)+Eigen::half(0.1f), half_prec2(i)+Eigen::half(0.1f));
-    else
-      VERIFY_IS_APPROX(full_prec2(i), half_prec2(i));
-  }
-  for (int i = 0; i < num_elem; ++i) {
-    std::cout << "Checking elemwise log1p " << i << " input = " << input3(i) << " full = " << full_prec3(i) << " half = " << half_prec3(i) << std::endl;
-    VERIFY_IS_APPROX(full_prec3(i), half_prec3(i));
-  }
-  for (int i = 0; i < num_elem; ++i) {
-    std::cout << "Checking elemwise expm1 " << i << " input = " << input3(i) << " full = " << full_prec4(i) << " half = " << half_prec4(i) << std::endl;
-    VERIFY_IS_APPROX(full_prec4(i), half_prec4(i));
-  }
-  gpu_device.deallocate(d_float1);
-  gpu_device.deallocate(d_float2);
-  gpu_device.deallocate(d_float3);
-  gpu_device.deallocate(d_res1_half);
-  gpu_device.deallocate(d_res1_float);
-  gpu_device.deallocate(d_res2_half);
-  gpu_device.deallocate(d_res2_float);
-  gpu_device.deallocate(d_res3_float);
-  gpu_device.deallocate(d_res3_half);
-  gpu_device.deallocate(d_res4_float);
-  gpu_device.deallocate(d_res4_half);
-}
-
-// Contracts two random 23x23 device matrices over DimPair(1, 0), once in
-// float (result cast to Eigen::half) and once with both operands cast to
-// Eigen::half, and compares the two results on the host.
-template<typename>
-void test_hip_contractions() {
-  typedef Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> FloatMatrixMap;
-  typedef Eigen::TensorMap<Eigen::Tensor<Eigen::half, 2>, Eigen::Aligned> HalfMatrixMap;
-  typedef Tensor<float, 2>::DimensionPair DimPair;
-
-  Eigen::HipStreamDevice hip_stream;
-  Eigen::GpuDevice gpu_device(&hip_stream);
-  const int rows = 23;
-  const int cols = 23;
-  const int num_elem = rows*cols;
-  const std::size_t float_bytes = num_elem * sizeof(float);
-  const std::size_t half_bytes = num_elem * sizeof(Eigen::half);
-
-  float* dev_lhs = (float*)gpu_device.allocate(float_bytes);
-  float* dev_rhs = (float*)gpu_device.allocate(float_bytes);
-  Eigen::half* dev_out_half = (Eigen::half*)gpu_device.allocate(half_bytes);
-  Eigen::half* dev_out_float = (Eigen::half*)gpu_device.allocate(half_bytes);
-
-  FloatMatrixMap lhs(dev_lhs, rows, cols);
-  FloatMatrixMap rhs(dev_rhs, rows, cols);
-  HalfMatrixMap out_half(dev_out_half, rows, cols);
-  HalfMatrixMap out_float(dev_out_float, rows, cols);
-
-  // Both operands are random() shifted by -0.5.
-  lhs.device(gpu_device) = lhs.random() - lhs.constant(0.5f);
-  rhs.device(gpu_device) = rhs.random() - rhs.constant(0.5f);
-
-  Eigen::array<DimPair, 1> dims(DimPair(1, 0));
-  out_float.device(gpu_device) = lhs.contract(rhs, dims).cast<Eigen::half>();
-  out_half.device(gpu_device) = lhs.cast<Eigen::half>().contract(rhs.cast<Eigen::half>(), dims);
-
-  Tensor<Eigen::half, 2> half_prec(rows, cols);
-  Tensor<Eigen::half, 2> full_prec(rows, cols);
-  gpu_device.memcpyDeviceToHost(half_prec.data(), dev_out_half, half_bytes);
-  gpu_device.memcpyDeviceToHost(full_prec.data(), dev_out_float, half_bytes);
-  gpu_device.synchronize();
-
-  for (int i = 0; i < rows; ++i) {
-    for (int j = 0; j < cols; ++j) {
-      std::cout << "Checking contract " << i << " " << j << full_prec(i, j) << " " << half_prec(i, j) << std::endl;
-      // Entries within an absolute difference of 1e-2 are accepted outright;
-      // only larger differences go through the approximate-equality check.
-      const bool within_abs_tolerance =
-          !(numext::abs(full_prec(i, j) - half_prec(i, j)) > Eigen::half(1e-2f));
-      if (within_abs_tolerance) continue;
-      VERIFY_IS_APPROX(full_prec(i, j), half_prec(i, j));
-    }
-  }
-
-  gpu_device.deallocate(dev_lhs);
-  gpu_device.deallocate(dev_rhs);
-  gpu_device.deallocate(dev_out_half);
-  gpu_device.deallocate(dev_out_float);
-}
-
-// Sums a random size1 x size2 device tensor along dimension `redux`, once in
-// float (result cast to Eigen::half) and once on the input cast to
-// Eigen::half, and verifies the two results are approximately equal.
-//
-// size1, size2: extents of the 2-D input.
-// redux: dimension to reduce; the result has size1 entries when redux == 1
-//        and size2 entries otherwise.
-template<typename>
-void test_hip_reductions(int size1, int size2, int redux) {
-  typedef Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> FloatMatrixMap;
-  typedef Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> HalfVectorMap;
-
-  std::cout << "Reducing " << size1 << " by " << size2
-            << " tensor along dim " << redux << std::endl;
-
-  Eigen::HipStreamDevice hip_stream;
-  Eigen::GpuDevice gpu_device(&hip_stream);
-  const int num_elem = size1*size2;
-  int result_size;
-  if (redux == 1) {
-    result_size = size1;
-  } else {
-    result_size = size2;
-  }
-  const std::size_t in_bytes = num_elem * sizeof(float);
-  const std::size_t out_bytes = result_size * sizeof(Eigen::half);
-
-  float* dev_in1 = (float*)gpu_device.allocate(in_bytes);
-  float* dev_in2 = (float*)gpu_device.allocate(in_bytes);
-  Eigen::half* dev_out_half = (Eigen::half*)gpu_device.allocate(out_bytes);
-  Eigen::half* dev_out_float = (Eigen::half*)gpu_device.allocate(out_bytes);
-
-  FloatMatrixMap in1(dev_in1, size1, size2);
-  FloatMatrixMap in2(dev_in2, size1, size2);
-  HalfVectorMap out_half(dev_out_half, result_size);
-  HalfVectorMap out_float(dev_out_float, result_size);
-
-  // Only in1 feeds the reductions below; in2 is filled but not read.
-  in1.device(gpu_device) = in1.random() * 2.0f;
-  in2.device(gpu_device) = in2.random() * 2.0f;
-
-  Eigen::array<int, 1> redux_dim(redux);
-  out_float.device(gpu_device) = in1.sum(redux_dim).cast<Eigen::half>();
-  out_half.device(gpu_device) = in1.cast<Eigen::half>().sum(redux_dim);
-
-  Tensor<Eigen::half, 1> half_prec(result_size);
-  Tensor<Eigen::half, 1> full_prec(result_size);
-  gpu_device.memcpyDeviceToHost(half_prec.data(), dev_out_half, out_bytes);
-  gpu_device.memcpyDeviceToHost(full_prec.data(), dev_out_float, out_bytes);
-  gpu_device.synchronize();
-
-  for (int i = 0; i < result_size; ++i) {
-    std::cout << "EXPECTED " << full_prec(i) << " GOT " << half_prec(i) << std::endl;
-    VERIFY_IS_APPROX(full_prec(i), half_prec(i));
-  }
-
-  gpu_device.deallocate(dev_in1);
-  gpu_device.deallocate(dev_in2);
-  gpu_device.deallocate(dev_out_half);
-  gpu_device.deallocate(dev_out_float);
-}
-
-// Runs the parameterized reduction test on one square shape and two
-// non-square shapes, reducing each along dimension 0 and then dimension 1.
-template<typename>
-void test_hip_reductions() {
-  const int shapes[3][2] = { {13, 13}, {35, 36}, {36, 35} };
-  for (int s = 0; s < 3; ++s) {
-    for (int redux = 0; redux < 2; ++redux) {
-      test_hip_reductions<void>(shapes[s][0], shapes[s][1], redux);
-    }
-  }
-}
-
-// Reduces a random 13x13 device tensor to a scalar with sum() and then with
-// maximum(), each computed in float (result cast to Eigen::half) and on the
-// input cast to Eigen::half, and verifies the paired results are
-// approximately equal.
-template<typename>
-void test_hip_full_reductions() {
-  typedef Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> FloatMatrixMap;
-  typedef Eigen::TensorMap<Eigen::Tensor<Eigen::half, 0>, Eigen::Aligned> HalfScalarMap;
-
-  Eigen::HipStreamDevice hip_stream;
-  Eigen::GpuDevice gpu_device(&hip_stream);
-  const int size = 13;
-  const int num_elem = size*size;
-  const std::size_t in_bytes = num_elem * sizeof(float);
-
-  float* dev_in1 = (float*)gpu_device.allocate(in_bytes);
-  float* dev_in2 = (float*)gpu_device.allocate(in_bytes);
-  Eigen::half* dev_out_half = (Eigen::half*)gpu_device.allocate(1 * sizeof(Eigen::half));
-  Eigen::half* dev_out_float = (Eigen::half*)gpu_device.allocate(1 * sizeof(Eigen::half));
-
-  FloatMatrixMap in1(dev_in1, size, size);
-  FloatMatrixMap in2(dev_in2, size, size);
-  HalfScalarMap out_half(dev_out_half);
-  HalfScalarMap out_float(dev_out_float);
-
-  // Only in1 feeds the reductions below; in2 is filled but not read.
-  in1.device(gpu_device) = in1.random();
-  in2.device(gpu_device) = in2.random();
-
-  Tensor<Eigen::half, 0> half_prec;
-  Tensor<Eigen::half, 0> full_prec;
-
-  // Full sum.
-  out_float.device(gpu_device) = in1.sum().cast<Eigen::half>();
-  out_half.device(gpu_device) = in1.cast<Eigen::half>().sum();
-  gpu_device.memcpyDeviceToHost(half_prec.data(), dev_out_half, sizeof(Eigen::half));
-  gpu_device.memcpyDeviceToHost(full_prec.data(), dev_out_float, sizeof(Eigen::half));
-  gpu_device.synchronize();
-
-  VERIFY_IS_APPROX(full_prec(), half_prec());
-
-  // Full maximum, reusing the same output buffers.
-  out_float.device(gpu_device) = in1.maximum().cast<Eigen::half>();
-  out_half.device(gpu_device) = in1.cast<Eigen::half>().maximum();
-  gpu_device.memcpyDeviceToHost(half_prec.data(), dev_out_half, sizeof(Eigen::half));
-  gpu_device.memcpyDeviceToHost(full_prec.data(), dev_out_float, sizeof(Eigen::half));
-  gpu_device.synchronize();
-
-  VERIFY_IS_APPROX(full_prec(), half_prec());
-
-  gpu_device.deallocate(dev_in1);
-  gpu_device.deallocate(dev_in2);
-  gpu_device.deallocate(dev_out_half);
-  gpu_device.deallocate(dev_out_float);
-}
-
-// Computes abs() of random device data in float and, in Eigen::half, through
-// two expressions that force an intermediate evaluation with eval(): one
-// plain and one that first applies a broadcast with factor 1. Both half
-// results are compared against the float result.
-template<typename>
-void test_hip_forced_evals() {
-
-  Eigen::HipStreamDevice stream;
-  Eigen::GpuDevice gpu_device(&stream);
-  int num_elem = 101;
-
-  float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
-  float* d_res_half1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
-  float* d_res_half2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
-  float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
-
-  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
-      d_float, num_elem);
-  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half1(
-      d_res_half1, num_elem);
-  // Note: this map is declared Unaligned, unlike the others.
-  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Unaligned> gpu_res_half2(
-      d_res_half2, num_elem);
-  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float(
-      d_res_float, num_elem);
-
-  // Broadcast factor of 1 along the only dimension.
-  Eigen::array<int, 1> no_bcast;
-  no_bcast[0] = 1;
-
-  gpu_float.device(gpu_device) = gpu_float.random() - gpu_float.constant(0.5f);
-  gpu_res_float.device(gpu_device) = gpu_float.abs();
-  gpu_res_half1.device(gpu_device) = gpu_float.cast<Eigen::half>().abs().eval().cast<float>();
-  gpu_res_half2.device(gpu_device) = gpu_float.cast<Eigen::half>().abs().broadcast(no_bcast).eval().cast<float>();
-
-  Tensor<float, 1> half_prec1(num_elem);
-  Tensor<float, 1> half_prec2(num_elem);
-  Tensor<float, 1> full_prec(num_elem);
-  gpu_device.memcpyDeviceToHost(half_prec1.data(), d_res_half1, num_elem*sizeof(float));
-  // Copy from d_res_half2 (previously d_res_half1 was copied twice, so the
-  // broadcast + eval result was never actually checked).
-  gpu_device.memcpyDeviceToHost(half_prec2.data(), d_res_half2, num_elem*sizeof(float));
-  gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float));
-  gpu_device.synchronize();
-
-  for (int i = 0; i < num_elem; ++i) {
-    std::cout << "Checking forced eval " << i << full_prec(i) << " vs " << half_prec1(i) << " vs " << half_prec2(i) << std::endl;
-    VERIFY_IS_APPROX(full_prec(i), half_prec1(i));
-    VERIFY_IS_APPROX(full_prec(i), half_prec2(i));
-  }
-
-  gpu_device.deallocate(d_float);
-  gpu_device.deallocate(d_res_half1);
-  gpu_device.deallocate(d_res_half2);
-  gpu_device.deallocate(d_res_float);
-}
-#endif
-
-
-// Runs the half-precision tensor subtests. test_hip_numext always runs; the
-// remaining subtests are compiled and run only when EIGEN_HAS_HIP_FP16 is
-// defined, otherwise a message saying the test is skipped is printed.
-void test_cxx11_tensor_of_float16_hip()
-{
-  CALL_SUBTEST(test_hip_numext<void>());
-
-#ifdef EIGEN_HAS_HIP_FP16
-  CALL_SUBTEST(test_hip_conversion<void>());
-  CALL_SUBTEST(test_hip_unary<void>());
-  CALL_SUBTEST(test_hip_elementwise<void>());
-  CALL_SUBTEST(test_hip_trancendental<void>());
-  CALL_SUBTEST(test_hip_contractions<void>());
-  CALL_SUBTEST(test_hip_reductions<void>());
-  CALL_SUBTEST(test_hip_full_reductions<void>());
-  CALL_SUBTEST(test_hip_forced_evals<void>());
-#else
-  std::cout << "Half floats are not supported by this version of hip: skipping the test" << std::endl;
-#endif
-}
--- a/unsupported/test/cxx11_tensor_random_hip.cu
+++ b/unsupported/test/cxx11_tensor_random_hip.cu
@@ -1,85 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#define EIGEN_TEST_NO_LONGDOUBLE
-#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_random_hip
-#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
-#define EIGEN_USE_GPU
-
-#include "main.h"
-#include <Eigen/CXX11/Tensor>
-
-
-// Fills a 72x97 device tensor with random() (default generator) and copies
-// it back to the host. No property of the generated values is checked; the
-// test only verifies that the HIP calls succeed.
-void test_hip_random_uniform()
-{
-  Tensor<float, 2> out(72,97);
-  out.setZero();
-
-  std::size_t out_bytes = out.size() * sizeof(float);
-
-  // The HIP calls are made outside of assert() so they still execute when
-  // assertions are compiled out (NDEBUG); their status is checked separately.
-  float* d_out;
-  hipError_t status = hipMalloc((void**)(&d_out), out_bytes);
-  VERIFY_IS_EQUAL(status, hipSuccess);
-
-  Eigen::HipStreamDevice stream;
-  Eigen::GpuDevice gpu_device(&stream);
-
-  Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97);
-
-  gpu_out.device(gpu_device) = gpu_out.random();
-
-  status = hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream());
-  VERIFY_IS_EQUAL(status, hipSuccess);
-  status = hipStreamSynchronize(gpu_device.stream());
-  VERIFY_IS_EQUAL(status, hipSuccess);
-
-  // For now we just check that the code doesn't crash.
-  // TODO: come up with a valid test of randomness
-
-  // Release the device buffer (it was previously leaked).
-  status = hipFree(d_out);
-  VERIFY_IS_EQUAL(status, hipSuccess);
-}
-
-
-// Fills a 72x97 device tensor using NormalRandomGenerator<float> and copies
-// it back to the host. No property of the generated values is checked; the
-// test only verifies that the HIP calls succeed.
-void test_hip_random_normal()
-{
-  Tensor<float, 2> out(72,97);
-  out.setZero();
-
-  std::size_t out_bytes = out.size() * sizeof(float);
-
-  // The HIP calls are made outside of assert() so they still execute when
-  // assertions are compiled out (NDEBUG); their status is checked separately.
-  float* d_out;
-  hipError_t status = hipMalloc((void**)(&d_out), out_bytes);
-  VERIFY_IS_EQUAL(status, hipSuccess);
-
-  Eigen::HipStreamDevice stream;
-  Eigen::GpuDevice gpu_device(&stream);
-
-  Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97);
-
-  Eigen::internal::NormalRandomGenerator<float> gen(true);
-  gpu_out.device(gpu_device) = gpu_out.random(gen);
-
-  status = hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream());
-  VERIFY_IS_EQUAL(status, hipSuccess);
-  status = hipStreamSynchronize(gpu_device.stream());
-  VERIFY_IS_EQUAL(status, hipSuccess);
-
-  // Release the device buffer (it was previously leaked).
-  status = hipFree(d_out);
-  VERIFY_IS_EQUAL(status, hipSuccess);
-}
-
-// Fills a 6-element complex<float> tensor with setRandom() on the host and
-// verifies that no two neighbouring entries are equal.
-static void test_complex()
-{
-  const int count = 6;
-  Tensor<std::complex<float>, 1> vec(count);
-  vec.setRandom();
-
-  // Fixme: we should check that the generated numbers follow a uniform
-  // distribution instead.
-  int i = 1;
-  while (i < count) {
-    VERIFY_IS_NOT_EQUAL(vec(i), vec(i-1));
-    ++i;
-  }
-}
-
-
-// Runs the two device random-generation subtests followed by the host-side
-// complex subtest, each wrapped in CALL_SUBTEST.
-void test_cxx11_tensor_random_hip()
-{
-  CALL_SUBTEST(test_hip_random_uniform());
-  CALL_SUBTEST(test_hip_random_normal());
-  CALL_SUBTEST(test_complex());
-}
--- a/unsupported/test/cxx11_tensor_reduction_hip.cu
+++ b/unsupported/test/cxx11_tensor_reduction_hip.cu
@@ -1,154 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#define EIGEN_TEST_NO_LONGDOUBLE
-#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_reduction_hip
-#define EIGEN_USE_GPU
-
-#include "main.h"
-#include <unsupported/Eigen/CXX11/Tensor>
-
-
-// Sums every element of a randomly sized 2-D tensor (each extent drawn with
-// internal::random<int>(1024, 5*1024)) on the host and on the device, and
-// verifies the two scalar results are approximately equal.
-template<typename Type, int DataLayout>
-static void test_full_reductions() {
-  typedef Tensor<Type, 2, DataLayout> Matrix;
-  typedef Tensor<Type, 0, DataLayout> Scalar0D;
-
-  Eigen::HipStreamDevice hip_stream;
-  Eigen::GpuDevice gpu_device(&hip_stream);
-
-  const int num_rows = internal::random<int>(1024, 5*1024);
-  const int num_cols = internal::random<int>(1024, 5*1024);
-
-  Matrix in(num_rows, num_cols);
-  in.setRandom();
-
-  // Reference result computed on the host.
-  Scalar0D full_redux;
-  full_redux = in.sum();
-
-  const std::size_t in_bytes = in.size() * sizeof(Type);
-  const std::size_t out_bytes = full_redux.size() * sizeof(Type);
-  Type* dev_in = static_cast<Type*>(gpu_device.allocate(in_bytes));
-  Type* dev_out = static_cast<Type*>(gpu_device.allocate(out_bytes));
-  gpu_device.memcpyHostToDevice(dev_in, in.data(), in_bytes);
-
-  TensorMap<Matrix> in_gpu(dev_in, num_rows, num_cols);
-  TensorMap<Scalar0D> out_gpu(dev_out);
-
-  out_gpu.device(gpu_device) = in_gpu.sum();
-
-  Scalar0D full_redux_gpu;
-  gpu_device.memcpyDeviceToHost(full_redux_gpu.data(), dev_out, out_bytes);
-  gpu_device.synchronize();
-
-  // The host and device reductions must agree.
-  VERIFY_IS_APPROX(full_redux(), full_redux_gpu());
-
-  gpu_device.deallocate(dev_in);
-  gpu_device.deallocate(dev_out);
-}
-
-// Sums a random 33x1x128 tensor along dimension 0 on the host and on the
-// device. The device result is accumulated twice (= followed by +=), so it is
-// compared against twice the host result.
-template<typename Type, int DataLayout>
-static void test_first_dim_reductions() {
-  typedef Tensor<Type, 3, DataLayout> Tensor3;
-  typedef Tensor<Type, 2, DataLayout> Tensor2;
-
-  const int dim_x = 33;
-  const int dim_y = 1;
-  const int dim_z = 128;
-
-  Tensor3 in(dim_x, dim_y, dim_z);
-  in.setRandom();
-
-  // Host reference: reduce along the first dimension.
-  Eigen::array<int, 1> red_axis;
-  red_axis[0] = 0;
-  Tensor2 redux = in.sum(red_axis);
-
-  // Device and stream.
-  Eigen::HipStreamDevice hip_stream;
-  Eigen::GpuDevice dev(&hip_stream);
-
-  // Device buffers and the maps that view them.
-  Type* dev_in = (Type*)dev.allocate(dim_x*dim_y*dim_z*sizeof(Type));
-  Type* dev_out = (Type*)dev.allocate(dim_z*dim_y*sizeof(Type));
-  Eigen::TensorMap<Tensor3> gpu_in(dev_in, dim_x, dim_y, dim_z);
-  Eigen::TensorMap<Tensor2> gpu_out(dev_out, dim_y, dim_z);
-
-  // Upload the input, then assign the reduction and accumulate it once more.
-  dev.memcpyHostToDevice(dev_in, in.data(), in.size()*sizeof(Type));
-  gpu_out.device(dev) = gpu_in.sum(red_axis);
-  gpu_out.device(dev) += gpu_in.sum(red_axis);
-
-  Tensor2 redux_gpu(dim_y, dim_z);
-  dev.memcpyDeviceToHost(redux_gpu.data(), dev_out, gpu_out.size()*sizeof(Type));
-  dev.synchronize();
-
-  // The device holds the reduction twice, hence the factor of 2.
-  for (int i = 0; i < gpu_out.size(); ++i) {
-    VERIFY_IS_APPROX(2*redux(i), redux_gpu(i));
-  }
-
-  dev.deallocate(dev_in);
-  dev.deallocate(dev_out);
-}
-
-// Sums a random 128x1x33 tensor along dimension 2 on the host and on the
-// device. The device result is accumulated twice (= followed by +=), so it is
-// compared against twice the host result.
-template<typename Type, int DataLayout>
-static void test_last_dim_reductions() {
-  typedef Tensor<Type, 3, DataLayout> Tensor3;
-  typedef Tensor<Type, 2, DataLayout> Tensor2;
-
-  const int dim_x = 128;
-  const int dim_y = 1;
-  const int dim_z = 33;
-
-  Tensor3 in(dim_x, dim_y, dim_z);
-  in.setRandom();
-
-  // Host reference: reduce along the last dimension.
-  Eigen::array<int, 1> red_axis;
-  red_axis[0] = 2;
-  Tensor2 redux = in.sum(red_axis);
-
-  // Device and stream.
-  Eigen::HipStreamDevice hip_stream;
-  Eigen::GpuDevice dev(&hip_stream);
-
-  // Device buffers and the maps that view them.
-  Type* dev_in = (Type*)dev.allocate(dim_x*dim_y*dim_z*sizeof(Type));
-  Type* dev_out = (Type*)dev.allocate(dim_x*dim_y*sizeof(Type));
-  Eigen::TensorMap<Tensor3> gpu_in(dev_in, dim_x, dim_y, dim_z);
-  Eigen::TensorMap<Tensor2> gpu_out(dev_out, dim_x, dim_y);
-
-  // Upload the input, then assign the reduction and accumulate it once more.
-  dev.memcpyHostToDevice(dev_in, in.data(), in.size()*sizeof(Type));
-  gpu_out.device(dev) = gpu_in.sum(red_axis);
-  gpu_out.device(dev) += gpu_in.sum(red_axis);
-
-  Tensor2 redux_gpu(dim_x, dim_y);
-  dev.memcpyDeviceToHost(redux_gpu.data(), dev_out, gpu_out.size()*sizeof(Type));
-  dev.synchronize();
-
-  // The device holds the reduction twice, hence the factor of 2.
-  for (int i = 0; i < gpu_out.size(); ++i) {
-    VERIFY_IS_APPROX(2*redux(i), redux_gpu(i));
-  }
-
-  dev.deallocate(dev_in);
-  dev.deallocate(dev_out);
-}
-
-
-// Runs the full, first-dimension and last-dimension reduction subtests for
-// float and double in both layouts, except for the two double cases that are
-// commented out below.
-void test_cxx11_tensor_reduction_hip() {
-  CALL_SUBTEST((test_full_reductions<float, ColMajor>()));
-  CALL_SUBTEST((test_full_reductions<double, ColMajor>()));
-  CALL_SUBTEST((test_full_reductions<float, RowMajor>()));
-  CALL_SUBTEST((test_full_reductions<double, RowMajor>()));
-  
-  CALL_SUBTEST((test_first_dim_reductions<float, ColMajor>()));
-  CALL_SUBTEST((test_first_dim_reductions<double, ColMajor>()));
-  CALL_SUBTEST((test_first_dim_reductions<float, RowMajor>()));
-// Outer reductions of doubles aren't supported just yet.
-//  CALL_SUBTEST((test_first_dim_reductions<double, RowMajor>()));
-
-  CALL_SUBTEST((test_last_dim_reductions<float, ColMajor>()));
-// Outer reductions of doubles aren't supported just yet.
-//  CALL_SUBTEST((test_last_dim_reductions<double, ColMajor>()));
-  CALL_SUBTEST((test_last_dim_reductions<float, RowMajor>()));
-  CALL_SUBTEST((test_last_dim_reductions<double, RowMajor>()));
-}
--- a/unsupported/test/cxx11_tensor_scan_hip.cu
+++ b/unsupported/test/cxx11_tensor_scan_hip.cu
@@ -1,76 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#define EIGEN_TEST_NO_LONGDOUBLE
-#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_scan_hip
-#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
-#define EIGEN_USE_GPU
-
-#include "main.h"
-#include <unsupported/Eigen/CXX11/Tensor>
-
-using Eigen::Tensor;
-typedef Tensor<float, 1>::DimensionPair DimPair;
-
-template<int DataLayout>
-void test_hip_cumsum(int m_size, int k_size, int n_size)
-{
-  std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl;
-  Tensor<float, 3, DataLayout> t_input(m_size, k_size, n_size);
-  Tensor<float, 3, DataLayout> t_result(m_size, k_size, n_size);
-  Tensor<float, 3, DataLayout> t_result_gpu(m_size, k_size, n_size);
-
-  t_input.setRandom();
-
-  std::size_t t_input_bytes = t_input.size()  * sizeof(float);
-  std::size_t t_result_bytes = t_result.size() * sizeof(float);
-
-  float* d_t_input;
-  float* d_t_result;
-
-  hipMalloc((void**)(&d_t_input), t_input_bytes);
-  hipMalloc((void**)(&d_t_result), t_result_bytes);
-
-  hipMemcpy(d_t_input, t_input.data(), t_input_bytes, hipMemcpyHostToDevice);
-
-  Eigen::HipStreamDevice stream;
-  Eigen::GpuDevice gpu_device(&stream);
-
-  Eigen::TensorMap<Eigen::Tensor<float, 3, DataLayout> >
-      gpu_t_input(d_t_input, Eigen::array<int, 3>(m_size, k_size, n_size));
-  Eigen::TensorMap<Eigen::Tensor<float, 3, DataLayout> >
-      gpu_t_result(d_t_result, Eigen::array<int, 3>(m_size, k_size, n_size));
-
-  gpu_t_result.device(gpu_device) = gpu_t_input.cumsum(1);
-  t_result = t_input.cumsum(1);
-
-  hipMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, hipMemcpyDeviceToHost);
-  for (DenseIndex i = 0; i < t_result.size(); i++) {
-    if (fabs(t_result(i) - t_result_gpu(i)) < 1e-4f) {
-      continue;
-    }
-    if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), 1e-4f)) {
-      continue;
-    }
-    std::cout << "mismatch detected at index " << i << ": " << t_result(i)
-              << " vs " <<  t_result_gpu(i) << std::endl;
-    assert(false);
-  }
-
-  hipFree((void*)d_t_input);
-  hipFree((void*)d_t_result);
-}
-
-
-void test_cxx11_tensor_scan_hip()
-{
-  CALL_SUBTEST(test_hip_cumsum<ColMajor>(128, 128, 128));
-  CALL_SUBTEST(test_hip_cumsum<RowMajor>(128, 128, 128));
-}