From 50d6d92a70a5eba68cc047804b119c7cda1338c1 Mon Sep 17 00:00:00 2001
From: Rasmus Munk Larsen <4643818-rmlarsen1@users.noreply.gitlab.com>
Date: Tue, 17 Feb 2026 19:19:18 -0800
Subject: [PATCH] Optimize sparse-dense product by bypassing InnerIterator for
 compressed storage

libeigen/eigen!2134
Closes #3026

Co-authored-by: Rasmus Munk Larsen
---
 Eigen/src/SparseCore/SparseDenseProduct.h | 208 +++++++++++++++++++---
 1 file changed, 188 insertions(+), 20 deletions(-)

diff --git a/Eigen/src/SparseCore/SparseDenseProduct.h b/Eigen/src/SparseCore/SparseDenseProduct.h
index 17ce596a5..46bdacce6 100644
--- a/Eigen/src/SparseCore/SparseDenseProduct.h
+++ b/Eigen/src/SparseCore/SparseDenseProduct.h
@@ -26,11 +26,18 @@ struct product_promote_storage_type {
   typedef Sparse ret;
 };
 
+// Type trait to detect if a sparse type supports direct compressed storage access
+// (i.e., has valuePtr(), innerIndexPtr(), outerIndexPtr(), isCompressed()).
+// All types deriving from SparseCompressedBase provide these methods.
+template <typename T>
+struct has_compressed_storage : std::is_base_of<SparseCompressedBase<T>, T> {};
+
 template <typename SparseLhsType, typename DenseRhsType, typename DenseResType,
           typename AlphaType = typename DenseResType::Scalar,
           int LhsStorageOrder =
               ((SparseLhsType::Flags & RowMajorBit) == RowMajorBit) ? RowMajor : ColMajor,
           bool ColPerCol = ((DenseRhsType::Flags & RowMajorBit) == 0) || DenseRhsType::ColsAtCompileTime == 1>
 struct sparse_time_dense_product_impl;
 
+// RowMajor, single column (ColPerCol=true): CSR SpMV
 template <typename SparseLhsType, typename DenseRhsType, typename DenseResType>
 struct sparse_time_dense_product_impl<SparseLhsType, DenseRhsType, DenseResType, typename DenseResType::Scalar,
                                       RowMajor, true> {
@@ -39,36 +46,101 @@ struct sparse_time_dense_product_impl<SparseLhsType, DenseRhsType, DenseResType
   typedef internal::remove_all_t<DenseResType> Res;
   typedef typename evaluator<Lhs>::InnerIterator LhsInnerIterator;
   typedef evaluator<Lhs> LhsEval;
+  typedef typename Res::Scalar ResScalar;
+
   static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res,
                   const typename Res::Scalar& alpha) {
     LhsEval lhsEval(lhs);
     Index n = lhs.outerSize();
-#ifdef EIGEN_HAS_OPENMP
-    Index threads = Eigen::nbThreads();
-#endif
     for (Index c = 0; c < rhs.cols(); ++c) {
+      runCol(lhsEval, lhs, rhs, res, alpha, n, c,
+             std::integral_constant<bool, has_compressed_storage<Lhs>::value>());
+    }
+  }
+
+  // Direct pointer path: works for both compressed and non-compressed storage.
+  static void runCol(const LhsEval& /*lhsEval*/, const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res,
+                     const ResScalar& alpha, Index n, Index c, std::true_type /* has_compressed_storage */) {
+    const Lhs& mat = lhs;
+    const auto* vals = mat.valuePtr();
+    const auto* inds = mat.innerIndexPtr();
+    const auto* outer = mat.outerIndexPtr();
+    const auto* innerNnz = mat.innerNonZeroPtr();
+    // The fast rhs pointer path requires unit inner stride (common case: VectorXd, contiguous matrix column).
+    if (rhs.innerStride() == 1) {
+      const auto* x = rhs.data() + c * rhs.outerStride();
 #ifdef EIGEN_HAS_OPENMP
-      // This 20000 threshold has been found experimentally on 2D and 3D Poisson problems.
-      // It basically represents the minimal amount of work to be done to be worth it.
-      if (threads > 1 && lhsEval.nonZerosEstimate() > 20000) {
+      Index threads = Eigen::nbThreads();
+      if (threads > 1 && mat.nonZeros() > 20000) {
 #pragma omp parallel for schedule(dynamic, (n + threads * 4 - 1) / (threads * 4)) num_threads(threads)
-        for (Index i = 0; i < n; ++i) processRow(lhsEval, rhs, res, alpha, i, c);
+        for (Index i = 0; i < n; ++i) {
+          Index k = outer[i];
+          const Index end = innerNnz ? outer[i] + innerNnz[i] : outer[i + 1];
+          ResScalar sum0(0), sum1(0);
+          for (; k < end; ++k) {
+            sum0 += vals[k] * x[inds[k]];
+            ++k;
+            if (k < end) {
+              sum1 += vals[k] * x[inds[k]];
+            }
+          }
+          res.coeffRef(i, c) += alpha * (sum0 + sum1);
+        }
       } else
 #endif
       {
-        for (Index i = 0; i < n; ++i) processRow(lhsEval, rhs, res, alpha, i, c);
+        for (Index i = 0; i < n; ++i) {
+          Index k = outer[i];
+          const Index end = innerNnz ? outer[i] + innerNnz[i] : outer[i + 1];
+          // Two independent accumulators to break the dependency chain
+          ResScalar sum0(0), sum1(0);
+          for (; k < end; ++k) {
+            sum0 += vals[k] * x[inds[k]];
+            ++k;
+            if (k < end) {
+              sum1 += vals[k] * x[inds[k]];
+            }
+          }
+          res.coeffRef(i, c) += alpha * (sum0 + sum1);
+        }
       }
+    } else {
+      // Non-unit rhs stride: use direct pointers for the sparse side, coeff() for the rhs.
+      for (Index i = 0; i < n; ++i) {
+        Index k = outer[i];
+        const Index end = innerNnz ? outer[i] + innerNnz[i] : outer[i + 1];
+        ResScalar sum0(0), sum1(0);
+        for (; k < end; ++k) {
+          sum0 += vals[k] * rhs.coeff(inds[k], c);
+          ++k;
+          if (k < end) {
+            sum1 += vals[k] * rhs.coeff(inds[k], c);
+          }
+        }
+        res.coeffRef(i, c) += alpha * (sum0 + sum1);
+      }
     }
   }
 
-  static void processRow(const LhsEval& lhsEval, const DenseRhsType& rhs, DenseResType& res,
-                         const typename Res::Scalar& alpha, Index i, Index col) {
-    // Two accumulators, which breaks the dependency chain on the accumulator
-    // and allows more instruction-level parallelism in the following loop
-    typename Res::Scalar tmp_a(0);
-    typename Res::Scalar tmp_b(0);
+  // Iterator fallback path
+  static void runCol(const LhsEval& lhsEval, const SparseLhsType& /*lhs*/, const DenseRhsType& rhs, DenseResType& res,
+                     const ResScalar& alpha, Index n, Index c, std::false_type /* has_compressed_storage */) {
+#ifdef EIGEN_HAS_OPENMP
+    Index threads = Eigen::nbThreads();
+    if (threads > 1 && lhsEval.nonZerosEstimate() > 20000) {
+#pragma omp parallel for schedule(dynamic, (n + threads * 4 - 1) / (threads * 4)) num_threads(threads)
+      for (Index i = 0; i < n; ++i) processRow(lhsEval, rhs, res, alpha, i, c);
+    } else
+#endif
+    {
+      for (Index i = 0; i < n; ++i) processRow(lhsEval, rhs, res, alpha, i, c);
+    }
+  }
+
+  static void processRow(const LhsEval& lhsEval, const DenseRhsType& rhs, DenseResType& res, const ResScalar& alpha,
+                         Index i, Index col) {
+    ResScalar tmp_a(0);
+    ResScalar tmp_b(0);
     for (LhsInnerIterator it(lhsEval, i); it; ++it) {
       tmp_a += it.value() * rhs.coeff(it.index(), col);
       ++it;
@@ -91,6 +163,7 @@ struct sparse_time_dense_product_impl<SparseLhsType, DenseRhsType, DenseResType
 //   typedef typename CwiseUnaryOp<scalar_multiple2_op<T1, typename T2::Scalar>, T2>::PlainObject ReturnType;
 // };
 
+// ColMajor, single column (ColPerCol=true): CSC SpMV
 template <typename SparseLhsType, typename DenseRhsType, typename DenseResType, typename AlphaType>
 struct sparse_time_dense_product_impl<SparseLhsType, DenseRhsType, DenseResType, AlphaType, ColMajor, true> {
   typedef internal::remove_all_t<SparseLhsType> Lhs;
@@ -98,11 +171,60 @@ struct sparse_time_dense_product_impl<SparseLhsType, DenseRhsType, DenseResType
   typedef internal::remove_all_t<DenseResType> Res;
   typedef evaluator<Lhs> LhsEval;
   typedef typename LhsEval::InnerIterator LhsInnerIterator;
+
   static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const AlphaType& alpha) {
+    runImpl(lhs, rhs, res, alpha, std::integral_constant<bool, has_compressed_storage<Lhs>::value>());
+  }
+
+  // Direct pointer path: works for both compressed and non-compressed storage.
+  static void runImpl(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const AlphaType& alpha,
+                      std::true_type /* has_compressed_storage */) {
+    typedef typename Lhs::Scalar LhsScalar;
+    typedef typename Lhs::StorageIndex StorageIndex;
+    const Lhs& mat = lhs;
+    const LhsScalar* vals = mat.valuePtr();
+    const StorageIndex* inds = mat.innerIndexPtr();
+    const auto* outer = mat.outerIndexPtr();
+    const auto* innerNnz = mat.innerNonZeroPtr();
+    // The fast result pointer path requires a contiguous ColMajor result layout.
+    // Transpose reports innerStride()==1 but is actually RowMajor, so check both.
+    if (!(Res::Flags & RowMajorBit) && res.innerStride() == 1) {
+      for (Index c = 0; c < rhs.cols(); ++c) {
+        typename Res::Scalar* y = res.data() + c * res.outerStride();
+        for (Index j = 0; j < lhs.outerSize(); ++j) {
+          typename ScalarBinaryOpTraits<AlphaType, typename Rhs::Scalar>::ReturnType rhs_j(alpha * rhs.coeff(j, c));
+          const Index start = outer[j];
+          const Index end = innerNnz ? outer[j] + innerNnz[j] : outer[j + 1];
+          Index k = start;
+          // 4-way unrolled scatter-add (no SIMD: writes are scattered)
+          for (; k + 3 < end; k += 4) {
+            y[inds[k]] += vals[k] * rhs_j;
+            y[inds[k + 1]] += vals[k + 1] * rhs_j;
+            y[inds[k + 2]] += vals[k + 2] * rhs_j;
+            y[inds[k + 3]] += vals[k + 3] * rhs_j;
+          }
+          for (; k < end; ++k) y[inds[k]] += vals[k] * rhs_j;
+        }
+      }
+    } else {
+      // Non-unit result stride: use coeffRef() for result access
+      for (Index c = 0; c < rhs.cols(); ++c) {
+        for (Index j = 0; j < lhs.outerSize(); ++j) {
+          typename ScalarBinaryOpTraits<AlphaType, typename Rhs::Scalar>::ReturnType rhs_j(alpha * rhs.coeff(j, c));
+          const Index start = outer[j];
+          const Index end = innerNnz ? outer[j] + innerNnz[j] : outer[j + 1];
+          for (Index k = start; k < end; ++k) res.coeffRef(inds[k], c) += vals[k] * rhs_j;
+        }
+      }
+    }
+  }
+
+  // Iterator-based fallback
+  static void runImpl(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const AlphaType& alpha,
+                      std::false_type /* has_compressed_storage */) {
     LhsEval lhsEval(lhs);
     for (Index c = 0; c < rhs.cols(); ++c) {
       for (Index j = 0; j < lhs.outerSize(); ++j) {
-        // typename Res::Scalar rhs_j = alpha * rhs.coeff(j,c);
         typename ScalarBinaryOpTraits<AlphaType, typename Rhs::Scalar>::ReturnType rhs_j(alpha * rhs.coeff(j, c));
         for (LhsInnerIterator it(lhsEval, j); it; ++it) res.coeffRef(it.index(), c) += it.value() * rhs_j;
       }
@@ -110,6 +232,7 @@ struct sparse_time_dense_product_impl<SparseLhsType, DenseRhsType, DenseResType
   }
 };
 
+// RowMajor, multiple columns (ColPerCol=false)
 template <typename SparseLhsType, typename DenseRhsType, typename DenseResType>
 struct sparse_time_dense_product_impl<SparseLhsType, DenseRhsType, DenseResType, typename DenseResType::Scalar,
                                       RowMajor, false> {
@@ -118,6 +241,9 @@ struct sparse_time_dense_product_impl<SparseLhsType, DenseRhsType, DenseResType
   typedef internal::remove_all_t<DenseResType> Res;
   typedef evaluator<Lhs> LhsEval;
   typedef typename LhsEval::InnerIterator LhsInnerIterator;
+
+  static constexpr bool IsCompressedLhs = has_compressed_storage<Lhs>::value;
+
   static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res,
                   const typename Res::Scalar& alpha) {
     Index n = lhs.rows();
@@ -129,21 +255,39 @@ struct sparse_time_dense_product_impl<SparseLhsType, DenseRhsType, DenseResType
     if (threads > 1 && lhsEval.nonZerosEstimate() * rhs.cols() > 20000) {
 #pragma omp parallel for schedule(dynamic, (n + threads * 4 - 1) / (threads * 4)) num_threads(threads)
-      for (Index i = 0; i < n; ++i) processRow(lhsEval, rhs, res, alpha, i);
+      for (Index i = 0; i < n; ++i)
+        processRow(lhsEval, lhs, rhs, res, alpha, i, std::integral_constant<bool, IsCompressedLhs>());
     } else
 #endif
     {
-      for (Index i = 0; i < n; ++i) processRow(lhsEval, rhs, res, alpha, i);
+      for (Index i = 0; i < n; ++i)
+        processRow(lhsEval, lhs, rhs, res, alpha, i, std::integral_constant<bool, IsCompressedLhs>());
     }
   }
 
-  static void processRow(const LhsEval& lhsEval, const DenseRhsType& rhs, Res& res, const typename Res::Scalar& alpha,
-                         Index i) {
+  // Direct pointer path: works for both compressed and non-compressed storage.
+  static void processRow(const LhsEval& /*lhsEval*/, const SparseLhsType& lhs, const DenseRhsType& rhs, Res& res,
+                         const typename Res::Scalar& alpha, Index i, std::true_type /* has_compressed_storage */) {
+    typedef typename Lhs::Scalar LhsScalar;
+    typedef typename Lhs::StorageIndex StorageIndex;
+    const Lhs& mat = lhs;
+    const LhsScalar* vals = mat.valuePtr();
+    const StorageIndex* inds = mat.innerIndexPtr();
+    const Index start = mat.outerIndexPtr()[i];
+    const auto* innerNnz = mat.innerNonZeroPtr();
+    const Index end = innerNnz ? start + innerNnz[i] : mat.outerIndexPtr()[i + 1];
+    typename Res::RowXpr res_i(res.row(i));
+    for (Index k = start; k < end; ++k) res_i += (alpha * vals[k]) * rhs.row(inds[k]);
+  }
+
+  static void processRow(const LhsEval& lhsEval, const SparseLhsType& /*lhs*/, const DenseRhsType& rhs, Res& res,
+                         const typename Res::Scalar& alpha, Index i, std::false_type /* has_compressed_storage */) {
     typename Res::RowXpr res_i(res.row(i));
     for (LhsInnerIterator it(lhsEval, i); it; ++it) res_i += (alpha * it.value()) * rhs.row(it.index());
   }
 };
 
+// ColMajor, multiple columns (ColPerCol=false): sparse * dense_matrix
 template <typename SparseLhsType, typename DenseRhsType, typename DenseResType>
 struct sparse_time_dense_product_impl<SparseLhsType, DenseRhsType, DenseResType, typename DenseResType::Scalar,
                                       ColMajor, false> {
@@ -151,8 +295,32 @@ struct sparse_time_dense_product_impl<SparseLhsType, DenseRhsType, DenseResType
   typedef internal::remove_all_t<DenseRhsType> Rhs;
   typedef internal::remove_all_t<DenseResType> Res;
   typedef typename evaluator<Lhs>::InnerIterator LhsInnerIterator;
+
   static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res,
                   const typename Res::Scalar& alpha) {
+    runImpl(lhs, rhs, res, alpha, std::integral_constant<bool, has_compressed_storage<Lhs>::value>());
+  }
+
+  // Direct pointer path: works for both compressed and non-compressed storage.
+  static void runImpl(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res,
+                      const typename Res::Scalar& alpha, std::true_type /* has_compressed_storage */) {
+    typedef typename Lhs::Scalar LhsScalar;
+    typedef typename Lhs::StorageIndex StorageIndex;
+    const Lhs& mat = lhs;
+    const LhsScalar* vals = mat.valuePtr();
+    const StorageIndex* inds = mat.innerIndexPtr();
+    const auto* outer = mat.outerIndexPtr();
+    const auto* innerNnz = mat.innerNonZeroPtr();
+    for (Index j = 0; j < lhs.outerSize(); ++j) {
+      typename Rhs::ConstRowXpr rhs_j(rhs.row(j));
+      const Index start = outer[j];
+      const Index end = innerNnz ? outer[j] + innerNnz[j] : outer[j + 1];
+      for (Index k = start; k < end; ++k) res.row(inds[k]) += (alpha * vals[k]) * rhs_j;
+    }
+  }
+
+  static void runImpl(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res,
+                      const typename Res::Scalar& alpha, std::false_type /* has_compressed_storage */) {
     evaluator<Lhs> lhsEval(lhs);
     for (Index j = 0; j < lhs.outerSize(); ++j) {
       typename Rhs::ConstRowXpr rhs_j(rhs.row(j));
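
Two ideas recur throughout this patch and may be easier to see in isolation.
The first is the compile-time dispatch itself: has_compressed_storage computes
a bool, and an std::integral_constant argument selects between two overloads,
so the iterator fallback is never even instantiated when the fast path is
taken. A minimal self-contained sketch of that pattern, using hypothetical
names (has_fast_path, kernel, run) that are not from the patch:

    #include <iostream>
    #include <type_traits>

    // Hypothetical trait: true only for types that expose raw storage pointers.
    template <typename T>
    struct has_fast_path : std::false_type {};

    struct CompressedMat {};  // pretend this one has valuePtr()/innerIndexPtr()
    template <>
    struct has_fast_path<CompressedMat> : std::true_type {};

    struct ExprMat {};  // pretend this one only offers an InnerIterator

    template <typename Mat>
    void kernel(const Mat&, std::true_type) { std::cout << "direct pointers\n"; }

    template <typename Mat>
    void kernel(const Mat&, std::false_type) { std::cout << "iterator fallback\n"; }

    template <typename Mat>
    void run(const Mat& m) {
      // Overload resolution picks the kernel at compile time.
      kernel(m, std::integral_constant<bool, has_fast_path<Mat>::value>());
    }

    int main() {
      run(CompressedMat{});  // prints "direct pointers"
      run(ExprMat{});        // prints "iterator fallback"
    }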
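
The second recurring idea is the two-accumulator reduction in the CSR kernels.
A standalone sketch under assumed concrete types (double values, int indices,
unit-stride x; csr_row_dot is a hypothetical helper, not part of the patch).
Splitting the sum across sum0 and sum1 breaks the loop-carried dependency on a
single accumulator, letting the CPU overlap independent multiply-adds; the
result can differ from a single-accumulator loop in the last bits because of
floating-point reassociation:

    #include <cstddef>

    // Dot product of one CSR row with a dense vector x, using two accumulators.
    double csr_row_dot(const double* vals, const int* inds, std::ptrdiff_t begin,
                       std::ptrdiff_t end, const double* x) {
      double sum0 = 0.0, sum1 = 0.0;
      std::ptrdiff_t k = begin;
      for (; k + 1 < end; k += 2) {  // two independent chains per iteration
        sum0 += vals[k] * x[inds[k]];
        sum1 += vals[k + 1] * x[inds[k + 1]];
      }
      if (k < end) sum0 += vals[k] * x[inds[k]];  // odd-length remainder
      return sum0 + sum1;
    }

The parallel variant splits the rows with schedule(dynamic, chunk) where
chunk = (n + threads * 4 - 1) / (threads * 4), i.e. roughly four chunks per
thread; for example, n = 100000 rows on 8 threads gives chunks of 3125 rows.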
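
The ColMajor fast path is the transposed operation: instead of a dot product
per output row, it scatter-adds one scaled sparse column into the result
(y += A.col(j) * rhs_j for each j). A sketch with the same assumed types and a
hypothetical csc_axpy helper; the 4-way unroll only amortizes loop overhead,
since the scattered destinations y[inds[k]] rule out SIMD stores:

    #include <cstddef>

    // Scatter-add of one scaled CSC column into a contiguous result vector y.
    void csc_axpy(const double* vals, const int* inds, std::ptrdiff_t begin,
                  std::ptrdiff_t end, double rhs_j, double* y) {
      std::ptrdiff_t k = begin;
      for (; k + 3 < end; k += 4) {  // 4-way unrolled main loop
        y[inds[k]] += vals[k] * rhs_j;
        y[inds[k + 1]] += vals[k + 1] * rhs_j;
        y[inds[k + 2]] += vals[k + 2] * rhs_j;
        y[inds[k + 3]] += vals[k + 3] * rhs_j;
      }
      for (; k < end; ++k) y[inds[k]] += vals[k] * rhs_j;  // remainder
    }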
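
Nothing changes on the caller's side: SparseMatrix derives from
SparseCompressedBase, so ordinary products should take the direct-pointer path
whether or not the matrix has been compressed, with the innerNonZeroPtr()
check above selecting the right per-row extent in either mode. For example
(sizes and values arbitrary):

    #include <Eigen/Dense>
    #include <Eigen/Sparse>

    int main() {
      Eigen::SparseMatrix<double, Eigen::RowMajor> A(1000, 1000);
      A.insert(0, 1) = 2.0;  // insert() leaves A in uncompressed mode
      Eigen::VectorXd x = Eigen::VectorXd::Random(1000);

      Eigen::VectorXd y1 = A * x;  // uncompressed: extents from innerNonZeroPtr()
      A.makeCompressed();          // now innerNonZeroPtr() == nullptr
      Eigen::VectorXd y2 = A * x;  // compressed: extents from outerIndexPtr()[i + 1]
      return 0;
    }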