Speed up trivial tensor broadcasting on GPU by enforcing unaligned loads. See PR 437.

2026-04-10 11:34:33 +08:00 · 2018-07-31 10:10:14 +02:00
parent 723856dec1
commit 679eece876
1 changed files with 11 additions and 0 deletions
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
@@ -284,7 +284,13 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>

    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      if (isCopy) {
+        #ifdef EIGEN_GPU_COMPILE_PHASE
+        // See PR 437: on NVIDIA P100 and K20m we observed a 3-4x speedup by enforcing
+        // unaligned loads here. The reason is unclear, though.
+        return m_impl.template packet<Unaligned>(index);
+        #else
        return m_impl.template packet<LoadMode>(index);
+        #endif
      } else if (oneByN && !nByOne) {
        return packetNByOne<LoadMode>(index);
      } else if (!oneByN && nByOne) {
@@ -296,7 +302,12 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
      }
    } else {
      if (isCopy) {
+        #ifdef EIGEN_GPU_COMPILE_PHASE
+        // See above.
+        return m_impl.template packet<Unaligned>(index);
+        #else
        return m_impl.template packet<LoadMode>(index);
+        #endif
      } else if (oneByN && !nByOne) {
        return packetOneByN<LoadMode>(index);
      } else if (!oneByN && nByOne) {