Optimize sparse-dense product by bypassing InnerIterator for compressed storage

libeigen/eigen!2134 Closes #3026 Co-authored-by: Rasmus Munk Larsen <rmlarsen@gmail.com>
2026-04-10 11:34:33 +08:00 · 2026-02-17 19:19:18 -08:00
parent b6b2f31ba8
commit 50d6d92a70
1 changed files with 188 additions and 20 deletions
--- a/Eigen/src/SparseCore/SparseDenseProduct.h
+++ b/Eigen/src/SparseCore/SparseDenseProduct.h
@@ -26,11 +26,18 @@ struct product_promote_storage_type<Dense, Sparse, OuterProduct> {
  typedef Sparse ret;
 };

+// Type trait to detect if a sparse type supports direct compressed storage access
+// (i.e., has valuePtr(), innerIndexPtr(), outerIndexPtr(), isCompressed()).
+// All types deriving from SparseCompressedBase provide these methods.
+template <typename T>
+struct has_compressed_storage : std::is_base_of<SparseCompressedBase<T>, T> {};
+
 template <typename SparseLhsType, typename DenseRhsType, typename DenseResType, typename AlphaType,
          int LhsStorageOrder = ((SparseLhsType::Flags & RowMajorBit) == RowMajorBit) ? RowMajor : ColMajor,
          bool ColPerCol = ((DenseRhsType::Flags & RowMajorBit) == 0) || DenseRhsType::ColsAtCompileTime == 1>
 struct sparse_time_dense_product_impl;

+// RowMajor, single column (ColPerCol=true): CSR SpMV
 template <typename SparseLhsType, typename DenseRhsType, typename DenseResType>
 struct sparse_time_dense_product_impl<SparseLhsType, DenseRhsType, DenseResType, typename DenseResType::Scalar,
                                      RowMajor, true> {
@@ -39,36 +46,101 @@ struct sparse_time_dense_product_impl<SparseLhsType, DenseRhsType, DenseResType,
  typedef internal::remove_all_t<DenseResType> Res;
  typedef typename evaluator<Lhs>::InnerIterator LhsInnerIterator;
  typedef evaluator<Lhs> LhsEval;
+  typedef typename Res::Scalar ResScalar;
+
  static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res,
                  const typename Res::Scalar& alpha) {
    LhsEval lhsEval(lhs);
-
    Index n = lhs.outerSize();
-#ifdef EIGEN_HAS_OPENMP
-    Index threads = Eigen::nbThreads();
-#endif

    for (Index c = 0; c < rhs.cols(); ++c) {
+      runCol(lhsEval, lhs, rhs, res, alpha, n, c, std::integral_constant<bool, has_compressed_storage<Lhs>::value>());
+    }
+  }
+
+  // Direct pointer path: works for both compressed and non-compressed storage.
+  static void runCol(const LhsEval& /*lhsEval*/, const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res,
+                     const ResScalar& alpha, Index n, Index c, std::true_type /* has_compressed_storage */) {
+    const Lhs& mat = lhs;
+    const auto* vals = mat.valuePtr();
+    const auto* inds = mat.innerIndexPtr();
+    const auto* outer = mat.outerIndexPtr();
+    const auto* innerNnz = mat.innerNonZeroPtr();
+    // The fast rhs pointer path requires unit inner stride (common case: VectorXd, contiguous matrix column).
+    if (rhs.innerStride() == 1) {
+      const auto* x = rhs.data() + c * rhs.outerStride();
 #ifdef EIGEN_HAS_OPENMP
-      // This 20000 threshold has been found experimentally on 2D and 3D Poisson problems.
-      // It basically represents the minimal amount of work to be done to be worth it.
-      if (threads > 1 && lhsEval.nonZerosEstimate() > 20000) {
+      Index threads = Eigen::nbThreads();
+      if (threads > 1 && mat.nonZeros() > 20000) {
 #pragma omp parallel for schedule(dynamic, (n + threads * 4 - 1) / (threads * 4)) num_threads(threads)
-        for (Index i = 0; i < n; ++i) processRow(lhsEval, rhs, res, alpha, i, c);
+        for (Index i = 0; i < n; ++i) {
+          Index k = outer[i];
+          const Index end = innerNnz ? outer[i] + innerNnz[i] : outer[i + 1];
+          ResScalar sum0(0), sum1(0);
+          for (; k < end; ++k) {
+            sum0 += vals[k] * x[inds[k]];
+            ++k;
+            if (k < end) {
+              sum1 += vals[k] * x[inds[k]];
+            }
+          }
+          res.coeffRef(i, c) += alpha * (sum0 + sum1);
+        }
      } else
 #endif
      {
-        for (Index i = 0; i < n; ++i) processRow(lhsEval, rhs, res, alpha, i, c);
+        for (Index i = 0; i < n; ++i) {
+          Index k = outer[i];
+          const Index end = innerNnz ? outer[i] + innerNnz[i] : outer[i + 1];
+          // Two independent accumulators to break the dependency chain
+          ResScalar sum0(0), sum1(0);
+          for (; k < end; ++k) {
+            sum0 += vals[k] * x[inds[k]];
+            ++k;
+            if (k < end) {
+              sum1 += vals[k] * x[inds[k]];
+            }
+          }
+          res.coeffRef(i, c) += alpha * (sum0 + sum1);
+        }
+      }
+    } else {
+      // Non-unit rhs stride: use direct pointers for sparse side, coeff() for rhs
+      for (Index i = 0; i < n; ++i) {
+        Index k = outer[i];
+        const Index end = innerNnz ? outer[i] + innerNnz[i] : outer[i + 1];
+        ResScalar sum0(0), sum1(0);
+        for (; k < end; ++k) {
+          sum0 += vals[k] * rhs.coeff(inds[k], c);
+          ++k;
+          if (k < end) {
+            sum1 += vals[k] * rhs.coeff(inds[k], c);
+          }
+        }
+        res.coeffRef(i, c) += alpha * (sum0 + sum1);
      }
    }
  }

-  static void processRow(const LhsEval& lhsEval, const DenseRhsType& rhs, DenseResType& res,
-                         const typename Res::Scalar& alpha, Index i, Index col) {
-    // Two accumulators, which breaks the dependency chain on the accumulator
-    // and allows more instruction-level parallelism in the following loop
-    typename Res::Scalar tmp_a(0);
-    typename Res::Scalar tmp_b(0);
+  // Iterator fallback path
+  static void runCol(const LhsEval& lhsEval, const SparseLhsType& /*lhs*/, const DenseRhsType& rhs, DenseResType& res,
+                     const ResScalar& alpha, Index n, Index c, std::false_type /* has_compressed_storage */) {
+#ifdef EIGEN_HAS_OPENMP
+    Index threads = Eigen::nbThreads();
+    if (threads > 1 && lhsEval.nonZerosEstimate() > 20000) {
+#pragma omp parallel for schedule(dynamic, (n + threads * 4 - 1) / (threads * 4)) num_threads(threads)
+      for (Index i = 0; i < n; ++i) processRow(lhsEval, rhs, res, alpha, i, c);
+    } else
+#endif
+    {
+      for (Index i = 0; i < n; ++i) processRow(lhsEval, rhs, res, alpha, i, c);
+    }
+  }
+
+  static void processRow(const LhsEval& lhsEval, const DenseRhsType& rhs, DenseResType& res, const ResScalar& alpha,
+                         Index i, Index col) {
+    ResScalar tmp_a(0);
+    ResScalar tmp_b(0);
    for (LhsInnerIterator it(lhsEval, i); it; ++it) {
      tmp_a += it.value() * rhs.coeff(it.index(), col);
      ++it;
@@ -91,6 +163,7 @@ struct sparse_time_dense_product_impl<SparseLhsType, DenseRhsType, DenseResType,
 //   typedef typename CwiseUnaryOp<scalar_multiple2_op<T1, typename T2::Scalar>, T2>::PlainObject ReturnType;
 // };

+// ColMajor, single column (ColPerCol=true): CSC SpMV
 template <typename SparseLhsType, typename DenseRhsType, typename DenseResType, typename AlphaType>
 struct sparse_time_dense_product_impl<SparseLhsType, DenseRhsType, DenseResType, AlphaType, ColMajor, true> {
  typedef internal::remove_all_t<SparseLhsType> Lhs;
@@ -98,11 +171,60 @@ struct sparse_time_dense_product_impl<SparseLhsType, DenseRhsType, DenseResType,
  typedef internal::remove_all_t<DenseResType> Res;
  typedef evaluator<Lhs> LhsEval;
  typedef typename LhsEval::InnerIterator LhsInnerIterator;
+
  static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const AlphaType& alpha) {
+    runImpl(lhs, rhs, res, alpha, std::integral_constant<bool, has_compressed_storage<Lhs>::value>());
+  }
+
+  // Direct pointer path: works for both compressed and non-compressed storage.
+  static void runImpl(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const AlphaType& alpha,
+                      std::true_type /* has_compressed_storage */) {
+    typedef typename Lhs::Scalar LhsScalar;
+    typedef typename Lhs::StorageIndex StorageIndex;
+    const Lhs& mat = lhs;
+    const LhsScalar* vals = mat.valuePtr();
+    const StorageIndex* inds = mat.innerIndexPtr();
+    const auto* outer = mat.outerIndexPtr();
+    const auto* innerNnz = mat.innerNonZeroPtr();
+    // The fast result pointer path requires contiguous ColMajor result layout.
+    // Transpose<ColMajor> reports innerStride()==1 but is actually RowMajor, so check both.
+    if (!(Res::Flags & RowMajorBit) && res.innerStride() == 1) {
+      for (Index c = 0; c < rhs.cols(); ++c) {
+        typename Res::Scalar* y = res.data() + c * res.outerStride();
+        for (Index j = 0; j < lhs.outerSize(); ++j) {
+          typename ScalarBinaryOpTraits<AlphaType, typename Rhs::Scalar>::ReturnType rhs_j(alpha * rhs.coeff(j, c));
+          const Index start = outer[j];
+          const Index end = innerNnz ? outer[j] + innerNnz[j] : outer[j + 1];
+          Index k = start;
+          // 4-way unrolled scatter-add (no SIMD: writes are scattered)
+          for (; k + 3 < end; k += 4) {
+            y[inds[k]] += vals[k] * rhs_j;
+            y[inds[k + 1]] += vals[k + 1] * rhs_j;
+            y[inds[k + 2]] += vals[k + 2] * rhs_j;
+            y[inds[k + 3]] += vals[k + 3] * rhs_j;
+          }
+          for (; k < end; ++k) y[inds[k]] += vals[k] * rhs_j;
+        }
+      }
+    } else {
+      // Non-unit result stride: use coeffRef() for result access
+      for (Index c = 0; c < rhs.cols(); ++c) {
+        for (Index j = 0; j < lhs.outerSize(); ++j) {
+          typename ScalarBinaryOpTraits<AlphaType, typename Rhs::Scalar>::ReturnType rhs_j(alpha * rhs.coeff(j, c));
+          const Index start = outer[j];
+          const Index end = innerNnz ? outer[j] + innerNnz[j] : outer[j + 1];
+          for (Index k = start; k < end; ++k) res.coeffRef(inds[k], c) += vals[k] * rhs_j;
+        }
+      }
+    }
+  }
+
+  // Iterator-based fallback
+  static void runImpl(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const AlphaType& alpha,
+                      std::false_type /* has_compressed_storage */) {
    LhsEval lhsEval(lhs);
    for (Index c = 0; c < rhs.cols(); ++c) {
      for (Index j = 0; j < lhs.outerSize(); ++j) {
-        //        typename Res::Scalar rhs_j = alpha * rhs.coeff(j,c);
        typename ScalarBinaryOpTraits<AlphaType, typename Rhs::Scalar>::ReturnType rhs_j(alpha * rhs.coeff(j, c));
        for (LhsInnerIterator it(lhsEval, j); it; ++it) res.coeffRef(it.index(), c) += it.value() * rhs_j;
      }
@@ -110,6 +232,7 @@ struct sparse_time_dense_product_impl<SparseLhsType, DenseRhsType, DenseResType,
  }
 };

+// RowMajor, multiple columns (ColPerCol=false): sparse * dense_matrix
 template <typename SparseLhsType, typename DenseRhsType, typename DenseResType>
 struct sparse_time_dense_product_impl<SparseLhsType, DenseRhsType, DenseResType, typename DenseResType::Scalar,
                                      RowMajor, false> {
@@ -118,6 +241,9 @@ struct sparse_time_dense_product_impl<SparseLhsType, DenseRhsType, DenseResType,
  typedef internal::remove_all_t<DenseResType> Res;
  typedef evaluator<Lhs> LhsEval;
  typedef typename LhsEval::InnerIterator LhsInnerIterator;
+
+  static constexpr bool IsCompressedLhs = has_compressed_storage<Lhs>::value;
+
  static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res,
                  const typename Res::Scalar& alpha) {
    Index n = lhs.rows();
@@ -129,21 +255,39 @@ struct sparse_time_dense_product_impl<SparseLhsType, DenseRhsType, DenseResType,
    // It basically represents the minimal amount of work to be done to be worth it.
    if (threads > 1 && lhsEval.nonZerosEstimate() * rhs.cols() > 20000) {
 #pragma omp parallel for schedule(dynamic, (n + threads * 4 - 1) / (threads * 4)) num_threads(threads)
-      for (Index i = 0; i < n; ++i) processRow(lhsEval, rhs, res, alpha, i);
+      for (Index i = 0; i < n; ++i)
+        processRow(lhsEval, lhs, rhs, res, alpha, i, std::integral_constant<bool, IsCompressedLhs>());
    } else
 #endif
    {
-      for (Index i = 0; i < n; ++i) processRow(lhsEval, rhs, res, alpha, i);
+      for (Index i = 0; i < n; ++i)
+        processRow(lhsEval, lhs, rhs, res, alpha, i, std::integral_constant<bool, IsCompressedLhs>());
    }
  }

-  static void processRow(const LhsEval& lhsEval, const DenseRhsType& rhs, Res& res, const typename Res::Scalar& alpha,
-                         Index i) {
+  // Direct pointer path: works for both compressed and non-compressed storage.
+  static void processRow(const LhsEval& /*lhsEval*/, const SparseLhsType& lhs, const DenseRhsType& rhs, Res& res,
+                         const typename Res::Scalar& alpha, Index i, std::true_type /* has_compressed_storage */) {
+    typedef typename Lhs::Scalar LhsScalar;
+    typedef typename Lhs::StorageIndex StorageIndex;
+    const Lhs& mat = lhs;
+    const LhsScalar* vals = mat.valuePtr();
+    const StorageIndex* inds = mat.innerIndexPtr();
+    const Index start = mat.outerIndexPtr()[i];
+    const auto* innerNnz = mat.innerNonZeroPtr();
+    const Index end = innerNnz ? start + innerNnz[i] : mat.outerIndexPtr()[i + 1];
+    typename Res::RowXpr res_i(res.row(i));
+    for (Index k = start; k < end; ++k) res_i += (alpha * vals[k]) * rhs.row(inds[k]);
+  }
+
+  static void processRow(const LhsEval& lhsEval, const SparseLhsType& /*lhs*/, const DenseRhsType& rhs, Res& res,
+                         const typename Res::Scalar& alpha, Index i, std::false_type /* has_compressed_storage */) {
    typename Res::RowXpr res_i(res.row(i));
    for (LhsInnerIterator it(lhsEval, i); it; ++it) res_i += (alpha * it.value()) * rhs.row(it.index());
  }
 };

+// ColMajor, multiple columns (ColPerCol=false): sparse * dense_matrix
 template <typename SparseLhsType, typename DenseRhsType, typename DenseResType>
 struct sparse_time_dense_product_impl<SparseLhsType, DenseRhsType, DenseResType, typename DenseResType::Scalar,
                                      ColMajor, false> {
@@ -151,8 +295,32 @@ struct sparse_time_dense_product_impl<SparseLhsType, DenseRhsType, DenseResType,
  typedef internal::remove_all_t<DenseRhsType> Rhs;
  typedef internal::remove_all_t<DenseResType> Res;
  typedef typename evaluator<Lhs>::InnerIterator LhsInnerIterator;
+
  static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res,
                  const typename Res::Scalar& alpha) {
+    runImpl(lhs, rhs, res, alpha, std::integral_constant<bool, has_compressed_storage<Lhs>::value>());
+  }
+
+  // Direct pointer path: works for both compressed and non-compressed storage.
+  static void runImpl(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res,
+                      const typename Res::Scalar& alpha, std::true_type /* has_compressed_storage */) {
+    typedef typename Lhs::Scalar LhsScalar;
+    typedef typename Lhs::StorageIndex StorageIndex;
+    const Lhs& mat = lhs;
+    const LhsScalar* vals = mat.valuePtr();
+    const StorageIndex* inds = mat.innerIndexPtr();
+    const auto* outer = mat.outerIndexPtr();
+    const auto* innerNnz = mat.innerNonZeroPtr();
+    for (Index j = 0; j < lhs.outerSize(); ++j) {
+      typename Rhs::ConstRowXpr rhs_j(rhs.row(j));
+      const Index start = outer[j];
+      const Index end = innerNnz ? outer[j] + innerNnz[j] : outer[j + 1];
+      for (Index k = start; k < end; ++k) res.row(inds[k]) += (alpha * vals[k]) * rhs_j;
+    }
+  }
+
+  static void runImpl(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res,
+                      const typename Res::Scalar& alpha, std::false_type /* has_compressed_storage */) {
    evaluator<Lhs> lhsEval(lhs);
    for (Index j = 0; j < lhs.outerSize(); ++j) {
      typename Rhs::ConstRowXpr rhs_j(rhs.row(j));