From bd21aba1817f76f4e72ddf3c55ef23d4a62ed6f7 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Sat, 30 Jan 2016 11:47:09 -0800
Subject: [PATCH] Sharded the cxx11_tensor_cuda test and fixed device memory leaks

---
 unsupported/test/cxx11_tensor_cuda.cu | 131 +++++++++++++++++---------
 1 file changed, 88 insertions(+), 43 deletions(-)
diff --git a/unsupported/test/cxx11_tensor_cuda.cu b/unsupported/test/cxx11_tensor_cuda.cu
index 79f1c5315..60f9314a5 100644
--- a/unsupported/test/cxx11_tensor_cuda.cu
+++ b/unsupported/test/cxx11_tensor_cuda.cu
@@ -63,6 +63,10 @@ void test_cuda_elementwise_small() {
         out(Eigen::array<int, 1>(i)),
         in1(Eigen::array<int, 1>(i)) + in2(Eigen::array<int, 1>(i)));
   }
+
+  cudaFree(d_in1);
+  cudaFree(d_in2);
+  cudaFree(d_out);
 }
 
 void test_cuda_elementwise()
@@ -113,6 +117,11 @@ void test_cuda_elementwise()
       }
     }
   }
+
+  cudaFree(d_in1);
+  cudaFree(d_in2);
+  cudaFree(d_in3);
+  cudaFree(d_out);
 }
 
 void test_cuda_reduction()
@@ -158,10 +167,13 @@ void test_cuda_reduction()
       VERIFY_IS_APPROX(out(i,j), expected);
     }
   }
+
+  cudaFree(d_in1);
+  cudaFree(d_out);
 }
 
 template<int DataLayout>
-static void test_cuda_contraction()
+void test_cuda_contraction()
 {
   // with these dimensions, the output has 300 * 140 elements, which is
   // more than 30 * 1024, which is the number of threads in blocks on
@@ -216,10 +228,14 @@ static void test_cuda_contraction()
       assert(false);
     }
   }
+
+  cudaFree(d_t_left);
+  cudaFree(d_t_right);
+  cudaFree(d_t_result);
 }
 
 template<int DataLayout>
-static void test_cuda_convolution_1d()
+void test_cuda_convolution_1d()
 {
   Tensor<float, 4, DataLayout> input(74,37,11,137);
   Tensor<float, 1, DataLayout> kernel(4);
@@ -266,9 +282,13 @@ static void test_cuda_convolution_1d()
       }
     }
   }
+
+  cudaFree(d_input);
+  cudaFree(d_kernel);
+  cudaFree(d_out);
 }
 
-static void test_cuda_convolution_inner_dim_col_major_1d()
+void test_cuda_convolution_inner_dim_col_major_1d()
 {
   Tensor<float, 4, ColMajor> input(74,9,11,7);
   Tensor<float, 1, ColMajor> kernel(4);
@@ -315,9 +335,13 @@ static void test_cuda_convolution_inner_dim_col_major_1d()
       }
     }
   }
+
+  cudaFree(d_input);
+  cudaFree(d_kernel);
+  cudaFree(d_out);
 }
 
-static void test_cuda_convolution_inner_dim_row_major_1d()
+void test_cuda_convolution_inner_dim_row_major_1d()
 {
   Tensor<float, 4, RowMajor> input(7,9,11,74);
   Tensor<float, 1, RowMajor> kernel(4);
@@ -364,10 +388,14 @@ static void test_cuda_convolution_inner_dim_row_major_1d()
       }
     }
   }
+
+  cudaFree(d_input);
+  cudaFree(d_kernel);
+  cudaFree(d_out);
 }
 
 template<int DataLayout>
-static void test_cuda_convolution_2d()
+void test_cuda_convolution_2d()
 {
   Tensor<float, 4, DataLayout> input(74,37,11,137);
   Tensor<float, 2, DataLayout> kernel(3,4);
@@ -424,10 +452,14 @@ static void test_cuda_convolution_2d()
       }
     }
   }
+
+  cudaFree(d_input);
+  cudaFree(d_kernel);
+  cudaFree(d_out);
 }
 
 template<int DataLayout>
-static void test_cuda_convolution_3d()
+void test_cuda_convolution_3d()
 {
   Tensor<float, 5, DataLayout> input(Eigen::array<int, 5>(74,37,11,137,17));
   Tensor<float, 3, DataLayout> kernel(3,4,2);
@@ -498,6 +530,10 @@ static void test_cuda_convolution_3d()
       }
     }
   }
+
+  cudaFree(d_input);
+  cudaFree(d_kernel);
+  cudaFree(d_out);
 }
 
 
@@ -535,6 +571,9 @@ void test_cuda_lgamma(const Scalar stddev)
       VERIFY_IS_APPROX(out(i,j), (std::lgamma)(in(i,j)));
     }
   }
+
+  cudaFree(d_in);
+  cudaFree(d_out);
 }
 
 template <typename Scalar>
@@ -571,6 +610,9 @@ void test_cuda_erf(const Scalar stddev)
       VERIFY_IS_APPROX(out(i,j), (std::erf)(in(i,j)));
     }
   }
+
+  cudaFree(d_in);
+  cudaFree(d_out);
 }
 
 template <typename Scalar>
@@ -607,47 +649,50 @@ void test_cuda_erfc(const Scalar stddev)
       VERIFY_IS_APPROX(out(i,j), (std::erfc)(in(i,j)));
     }
   }
+
+  cudaFree(d_in);
+  cudaFree(d_out);
 }
 
 void test_cxx11_tensor_cuda()
 {
-  CALL_SUBTEST(test_cuda_elementwise_small());
-  CALL_SUBTEST(test_cuda_elementwise());
-  CALL_SUBTEST(test_cuda_reduction());
-  CALL_SUBTEST(test_cuda_contraction<ColMajor>());
-  CALL_SUBTEST(test_cuda_contraction<RowMajor>());
-  CALL_SUBTEST(test_cuda_convolution_1d<ColMajor>());
-  CALL_SUBTEST(test_cuda_convolution_1d<RowMajor>());
-  CALL_SUBTEST(test_cuda_convolution_inner_dim_col_major_1d());
-  CALL_SUBTEST(test_cuda_convolution_inner_dim_row_major_1d());
-  CALL_SUBTEST(test_cuda_convolution_2d<ColMajor>());
-  CALL_SUBTEST(test_cuda_convolution_2d<RowMajor>());
-  CALL_SUBTEST(test_cuda_convolution_3d<ColMajor>());
-  CALL_SUBTEST(test_cuda_convolution_3d<RowMajor>());
-  CALL_SUBTEST(test_cuda_lgamma<float>(1.0f));
-  CALL_SUBTEST(test_cuda_lgamma<float>(100.0f));
-  CALL_SUBTEST(test_cuda_lgamma<float>(0.01f));
-  CALL_SUBTEST(test_cuda_lgamma<float>(0.001f));
-  CALL_SUBTEST(test_cuda_erf<float>(1.0f));
-  CALL_SUBTEST(test_cuda_erf<float>(100.0f));
-  CALL_SUBTEST(test_cuda_erf<float>(0.01f));
-  CALL_SUBTEST(test_cuda_erf<float>(0.001f));
-  CALL_SUBTEST(test_cuda_erfc<float>(1.0f));
+  CALL_SUBTEST_1(test_cuda_elementwise_small());
+  CALL_SUBTEST_1(test_cuda_elementwise());
+  CALL_SUBTEST_1(test_cuda_reduction());
+  CALL_SUBTEST_2(test_cuda_contraction<ColMajor>());
+  CALL_SUBTEST_2(test_cuda_contraction<RowMajor>());
+  CALL_SUBTEST_3(test_cuda_convolution_1d<ColMajor>());
+  CALL_SUBTEST_3(test_cuda_convolution_1d<RowMajor>());
+  CALL_SUBTEST_3(test_cuda_convolution_inner_dim_col_major_1d());
+  CALL_SUBTEST_3(test_cuda_convolution_inner_dim_row_major_1d());
+  CALL_SUBTEST_3(test_cuda_convolution_2d<ColMajor>());
+  CALL_SUBTEST_3(test_cuda_convolution_2d<RowMajor>());
+  CALL_SUBTEST_3(test_cuda_convolution_3d<ColMajor>());
+  CALL_SUBTEST_3(test_cuda_convolution_3d<RowMajor>());
+  CALL_SUBTEST_4(test_cuda_lgamma<float>(1.0f));
+  CALL_SUBTEST_4(test_cuda_lgamma<float>(100.0f));
+  CALL_SUBTEST_4(test_cuda_lgamma<float>(0.01f));
+  CALL_SUBTEST_4(test_cuda_lgamma<float>(0.001f));
+  CALL_SUBTEST_4(test_cuda_erf<float>(1.0f));
+  CALL_SUBTEST_4(test_cuda_erf<float>(100.0f));
+  CALL_SUBTEST_4(test_cuda_erf<float>(0.01f));
+  CALL_SUBTEST_4(test_cuda_erf<float>(0.001f));
+  CALL_SUBTEST_4(test_cuda_erfc<float>(1.0f));
   // CALL_SUBTEST(test_cuda_erfc<float>(100.0f));
-  CALL_SUBTEST(test_cuda_erfc<float>(5.0f)); // CUDA erfc lacks precision for large inputs
-  CALL_SUBTEST(test_cuda_erfc<float>(0.01f));
-  CALL_SUBTEST(test_cuda_erfc<float>(0.001f));
-  CALL_SUBTEST(test_cuda_lgamma<double>(1.0));
-  CALL_SUBTEST(test_cuda_lgamma<double>(100.0));
-  CALL_SUBTEST(test_cuda_lgamma<double>(0.01));
-  CALL_SUBTEST(test_cuda_lgamma<double>(0.001));
-  CALL_SUBTEST(test_cuda_erf<double>(1.0));
-  CALL_SUBTEST(test_cuda_erf<double>(100.0));
-  CALL_SUBTEST(test_cuda_erf<double>(0.01));
-  CALL_SUBTEST(test_cuda_erf<double>(0.001));
-  CALL_SUBTEST(test_cuda_erfc<double>(1.0));
+  CALL_SUBTEST_4(test_cuda_erfc<float>(5.0f)); // CUDA erfc lacks precision for large inputs
+  CALL_SUBTEST_4(test_cuda_erfc<float>(0.01f));
+  CALL_SUBTEST_4(test_cuda_erfc<float>(0.001f));
+  CALL_SUBTEST_4(test_cuda_lgamma<double>(1.0));
+  CALL_SUBTEST_4(test_cuda_lgamma<double>(100.0));
+  CALL_SUBTEST_4(test_cuda_lgamma<double>(0.01));
+  CALL_SUBTEST_4(test_cuda_lgamma<double>(0.001));
+  CALL_SUBTEST_4(test_cuda_erf<double>(1.0));
+  CALL_SUBTEST_4(test_cuda_erf<double>(100.0));
+  CALL_SUBTEST_4(test_cuda_erf<double>(0.01));
+  CALL_SUBTEST_4(test_cuda_erf<double>(0.001));
+  CALL_SUBTEST_4(test_cuda_erfc<double>(1.0));
   // CALL_SUBTEST(test_cuda_erfc<double>(100.0));
-  CALL_SUBTEST(test_cuda_erfc<double>(5.0)); // CUDA erfc lacks precision for large inputs
-  CALL_SUBTEST(test_cuda_erfc<double>(0.01));
-  CALL_SUBTEST(test_cuda_erfc<double>(0.001));
+  CALL_SUBTEST_4(test_cuda_erfc<double>(5.0)); // CUDA erfc lacks precision for large inputs
+  CALL_SUBTEST_4(test_cuda_erfc<double>(0.01));
+  CALL_SUBTEST_4(test_cuda_erfc<double>(0.001));
 }