From 50d6d92a70a5eba68cc047804b119c7cda1338c1 Mon Sep 17 00:00:00 2001
From: Rasmus Munk Larsen <4643818-rmlarsen1@users.noreply.gitlab.com>
Date: Tue, 17 Feb 2026 19:19:18 -0800
Subject: [PATCH] Optimize sparse-dense product by bypassing InnerIterator for
 compressed storage

libeigen/eigen!2134
Closes #3026

Co-authored-by: Rasmus Munk Larsen
---
 Eigen/src/SparseCore/SparseDenseProduct.h | 208 +++++++++++++++++++---
 1 file changed, 188 insertions(+), 20 deletions(-)

diff --git a/Eigen/src/SparseCore/SparseDenseProduct.h b/Eigen/src/SparseCore/SparseDenseProduct.h
index 17ce596a5..46bdacce6 100644
--- a/Eigen/src/SparseCore/SparseDenseProduct.h
+++ b/Eigen/src/SparseCore/SparseDenseProduct.h
@@ -26,11 +26,18 @@ struct product_promote_storage_type {
   typedef Sparse ret;
 };
 
+// Type trait to detect if a sparse type supports direct compressed storage access
+// (i.e., has valuePtr(), innerIndexPtr(), outerIndexPtr(), isCompressed()).
+// All types deriving from SparseCompressedBase provide these methods.
+template <typename T>
+struct has_compressed_storage : std::is_base_of<SparseCompressedBase<T>, T> {};
+
 template <typename SparseLhsType, typename DenseRhsType, typename DenseResType,
           typename AlphaType = typename DenseResType::Scalar,
           int LhsStorageOrder =
               ((SparseLhsType::Flags & RowMajorBit) == RowMajorBit) ? RowMajor : ColMajor,
           bool ColPerCol = ((DenseRhsType::Flags & RowMajorBit) == 0) || DenseRhsType::ColsAtCompileTime == 1>
 struct sparse_time_dense_product_impl;
 
+// RowMajor, single column (ColPerCol=true): CSR SpMV
 template <typename SparseLhsType, typename DenseRhsType, typename DenseResType>
 struct sparse_time_dense_product_impl<SparseLhsType, DenseRhsType, DenseResType, typename DenseResType::Scalar,
                                       RowMajor, true> {
@@ -39,36 +46,101 @@ struct sparse_time_dense_product_impl<SparseLhsType, DenseRhsType, DenseResType
   typedef internal::remove_all_t<DenseResType> Res;
   typedef typename evaluator<Lhs>::InnerIterator LhsInnerIterator;
   typedef evaluator<Lhs> LhsEval;
+  typedef typename Res::Scalar ResScalar;
+
   static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res,
                   const typename Res::Scalar& alpha) {
     LhsEval lhsEval(lhs);
     Index n = lhs.outerSize();
-#ifdef EIGEN_HAS_OPENMP
-    Index threads = Eigen::nbThreads();
-#endif
     for (Index c = 0; c < rhs.cols(); ++c) {
+      runCol(lhsEval, lhs, rhs, res, alpha, n, c,
+             std::integral_constant<bool, has_compressed_storage<Lhs>::value>());
+    }
+  }
+
+  // Direct pointer path: works for both compressed and non-compressed storage.
+  static void runCol(const LhsEval& /*lhsEval*/, const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res,
+                     const ResScalar& alpha, Index n, Index c, std::true_type /* has_compressed_storage */) {
+    const Lhs& mat = lhs;
+    const auto* vals = mat.valuePtr();
+    const auto* inds = mat.innerIndexPtr();
+    const auto* outer = mat.outerIndexPtr();
+    const auto* innerNnz = mat.innerNonZeroPtr();
+    // The fast rhs pointer path requires unit inner stride (common case: VectorXd, contiguous matrix column).
+    if (rhs.innerStride() == 1) {
+      const auto* x = rhs.data() + c * rhs.outerStride();
 #ifdef EIGEN_HAS_OPENMP
-      // This 20000 threshold has been found experimentally on 2D and 3D Poisson problems.
-      // It basically represents the minimal amount of work to be done to be worth it.
-      if (threads > 1 && lhsEval.nonZerosEstimate() > 20000) {
+      Index threads = Eigen::nbThreads();
+      if (threads > 1 && mat.nonZeros() > 20000) {
 #pragma omp parallel for schedule(dynamic, (n + threads * 4 - 1) / (threads * 4)) num_threads(threads)
-        for (Index i = 0; i < n; ++i) processRow(lhsEval, rhs, res, alpha, i, c);
+        for (Index i = 0; i < n; ++i) {
+          Index k = outer[i];
+          const Index end = innerNnz ? outer[i] + innerNnz[i] : outer[i + 1];
+          ResScalar sum0(0), sum1(0);
+          for (; k < end; ++k) {
+            sum0 += vals[k] * x[inds[k]];
+            ++k;
+            if (k < end) {
+              sum1 += vals[k] * x[inds[k]];
+            }
+          }
+          res.coeffRef(i, c) += alpha * (sum0 + sum1);
+        }
       } else
 #endif
       {
-        for (Index i = 0; i < n; ++i) processRow(lhsEval, rhs, res, alpha, i, c);
+        for (Index i = 0; i < n; ++i) {
+          Index k = outer[i];
+          const Index end = innerNnz ? outer[i] + innerNnz[i] : outer[i + 1];
+          // Two independent accumulators to break the dependency chain
+          ResScalar sum0(0), sum1(0);
+          for (; k < end; ++k) {
+            sum0 += vals[k] * x[inds[k]];
+            ++k;
+            if (k < end) {
+              sum1 += vals[k] * x[inds[k]];
+            }
+          }
+          res.coeffRef(i, c) += alpha * (sum0 + sum1);
+        }
       }
+    } else {
+      // Non-unit rhs stride: use direct pointers for the sparse side, coeff() for the rhs.
+      for (Index i = 0; i < n; ++i) {
+        Index k = outer[i];
+        const Index end = innerNnz ? outer[i] + innerNnz[i] : outer[i + 1];
+        ResScalar sum0(0), sum1(0);
+        for (; k < end; ++k) {
+          sum0 += vals[k] * rhs.coeff(inds[k], c);
+          ++k;
+          if (k < end) {
+            sum1 += vals[k] * rhs.coeff(inds[k], c);
+          }
+        }
+        res.coeffRef(i, c) += alpha * (sum0 + sum1);
+      }
     }
   }
 
-  static void processRow(const LhsEval& lhsEval, const DenseRhsType& rhs, DenseResType& res,
-                         const typename Res::Scalar& alpha, Index i, Index col) {
-    // Two accumulators, which breaks the dependency chain on the accumulator
-    // and allows more instruction-level parallelism in the following loop
-    typename Res::Scalar tmp_a(0);
-    typename Res::Scalar tmp_b(0);
+  // Iterator fallback path
+  static void runCol(const LhsEval& lhsEval, const SparseLhsType& /*lhs*/, const DenseRhsType& rhs, DenseResType& res,
+                     const ResScalar& alpha, Index n, Index c, std::false_type /* has_compressed_storage */) {
+#ifdef EIGEN_HAS_OPENMP
+    Index threads = Eigen::nbThreads();
+    if (threads > 1 && lhsEval.nonZerosEstimate() > 20000) {
+#pragma omp parallel for schedule(dynamic, (n + threads * 4 - 1) / (threads * 4)) num_threads(threads)
+      for (Index i = 0; i < n; ++i) processRow(lhsEval, rhs, res, alpha, i, c);
+    } else
+#endif
+    {
+      for (Index i = 0; i < n; ++i) processRow(lhsEval, rhs, res, alpha, i, c);
+    }
+  }
+
+  static void processRow(const LhsEval& lhsEval, const DenseRhsType& rhs, DenseResType& res, const ResScalar& alpha,
+                         Index i, Index col) {
+    ResScalar tmp_a(0);
+    ResScalar tmp_b(0);
     for (LhsInnerIterator it(lhsEval, i); it; ++it) {
       tmp_a += it.value() * rhs.coeff(it.index(), col);
       ++it;
@@ -91,6 +163,7 @@ struct sparse_time_dense_product_impl<SparseLhsType, DenseRhsType, DenseResType
 //   typedef typename CwiseUnaryOp<scalar_multiple2_op<T1, typename T2::Scalar>, T2>::PlainObject ReturnType;
 // };
 
+// ColMajor, single column (ColPerCol=true): CSC SpMV
 template <typename SparseLhsType, typename DenseRhsType, typename DenseResType, typename AlphaType>
 struct sparse_time_dense_product_impl<SparseLhsType, DenseRhsType, DenseResType, AlphaType, ColMajor, true> {
   typedef internal::remove_all_t<SparseLhsType> Lhs;
@@ -98,11 +171,60 @@ struct sparse_time_dense_product_impl<SparseLhsType, DenseRhsType, DenseResType
   typedef internal::remove_all_t<DenseResType> Res;
   typedef evaluator<Lhs> LhsEval;
   typedef typename LhsEval::InnerIterator LhsInnerIterator;
+
   static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const AlphaType& alpha) {
+    runImpl(lhs, rhs, res, alpha, std::integral_constant<bool, has_compressed_storage<Lhs>::value>());
+  }
+
+  // Direct pointer path: works for both compressed and non-compressed storage.
+  static void runImpl(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const AlphaType& alpha,
+                      std::true_type /* has_compressed_storage */) {
+    typedef typename Lhs::Scalar LhsScalar;
+    typedef typename Lhs::StorageIndex StorageIndex;
+    const Lhs& mat = lhs;
+    const LhsScalar* vals = mat.valuePtr();
+    const StorageIndex* inds = mat.innerIndexPtr();
+    const auto* outer = mat.outerIndexPtr();
+    const auto* innerNnz = mat.innerNonZeroPtr();
+    // The fast result pointer path requires a contiguous ColMajor result layout.
+    // Transpose reports innerStride()==1 but is actually RowMajor, so check both.
+    if (!(Res::Flags & RowMajorBit) && res.innerStride() == 1) {
+      for (Index c = 0; c < rhs.cols(); ++c) {
+        typename Res::Scalar* y = res.data() + c * res.outerStride();
+        for (Index j = 0; j < lhs.outerSize(); ++j) {
+          typename ScalarBinaryOpTraits<AlphaType, typename Rhs::Scalar>::ReturnType rhs_j(alpha * rhs.coeff(j, c));
+          const Index start = outer[j];
+          const Index end = innerNnz ? outer[j] + innerNnz[j] : outer[j + 1];
+          Index k = start;
+          // 4-way unrolled scatter-add (no SIMD: writes are scattered)
+          for (; k + 3 < end; k += 4) {
+            y[inds[k]] += vals[k] * rhs_j;
+            y[inds[k + 1]] += vals[k + 1] * rhs_j;
+            y[inds[k + 2]] += vals[k + 2] * rhs_j;
+            y[inds[k + 3]] += vals[k + 3] * rhs_j;
+          }
+          for (; k < end; ++k) y[inds[k]] += vals[k] * rhs_j;
+        }
+      }
+    } else {
+      // Non-unit result stride: use coeffRef() for result access
+      for (Index c = 0; c < rhs.cols(); ++c) {
+        for (Index j = 0; j < lhs.outerSize(); ++j) {
+          typename ScalarBinaryOpTraits<AlphaType, typename Rhs::Scalar>::ReturnType rhs_j(alpha * rhs.coeff(j, c));
+          const Index start = outer[j];
+          const Index end = innerNnz ? outer[j] + innerNnz[j] : outer[j + 1];
+          for (Index k = start; k < end; ++k) res.coeffRef(inds[k], c) += vals[k] * rhs_j;
+        }
+      }
+    }
+  }
+
+  // Iterator-based fallback
+  static void runImpl(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const AlphaType& alpha,
+                      std::false_type /* has_compressed_storage */) {
     LhsEval lhsEval(lhs);
     for (Index c = 0; c < rhs.cols(); ++c) {
       for (Index j = 0; j < lhs.outerSize(); ++j) {
-        // typename Res::Scalar rhs_j = alpha * rhs.coeff(j,c);
         typename ScalarBinaryOpTraits<AlphaType, typename Rhs::Scalar>::ReturnType rhs_j(alpha * rhs.coeff(j, c));
         for (LhsInnerIterator it(lhsEval, j); it; ++it) res.coeffRef(it.index(), c) += it.value() * rhs_j;
       }
@@ -110,6 +232,7 @@ struct sparse_time_dense_product_impl<SparseLhsType, DenseRhsType, DenseResType
   }
 };
 
+// RowMajor, multiple columns (ColPerCol=false)
 template <typename SparseLhsType, typename DenseRhsType, typename DenseResType>
 struct sparse_time_dense_product_impl<SparseLhsType, DenseRhsType, DenseResType, typename DenseResType::Scalar,
                                       RowMajor, false> {
@@ -118,6 +241,9 @@ struct sparse_time_dense_product_impl<SparseLhsType, DenseRhsType, DenseResType
   typedef internal::remove_all_t<DenseResType> Res;
   typedef evaluator<Lhs> LhsEval;
   typedef typename LhsEval::InnerIterator LhsInnerIterator;
+
+  static constexpr bool IsCompressedLhs = has_compressed_storage<Lhs>::value;
+
   static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res,
                   const typename Res::Scalar& alpha) {
     Index n = lhs.rows();
@@ -129,21 +255,39 @@ struct sparse_time_dense_product_impl<SparseLhsType, DenseRhsType, DenseResType
     if (threads > 1 && lhsEval.nonZerosEstimate() * rhs.cols() > 20000) {
 #pragma omp parallel for schedule(dynamic, (n + threads * 4 - 1) / (threads * 4)) num_threads(threads)
-      for (Index i = 0; i < n; ++i) processRow(lhsEval, rhs, res, alpha, i);
+      for (Index i = 0; i < n; ++i)
+        processRow(lhsEval, lhs, rhs, res, alpha, i, std::integral_constant<bool, IsCompressedLhs>());
     } else
 #endif
     {
-      for (Index i = 0; i < n; ++i) processRow(lhsEval, rhs, res, alpha, i);
+      for (Index i = 0; i < n; ++i)
+        processRow(lhsEval, lhs, rhs, res, alpha, i, std::integral_constant<bool, IsCompressedLhs>());
     }
   }
 
-  static void processRow(const LhsEval& lhsEval, const DenseRhsType& rhs, Res& res, const typename Res::Scalar& alpha,
-                         Index i) {
+  // Direct pointer path: works for both compressed and non-compressed storage.
+  static void processRow(const LhsEval& /*lhsEval*/, const SparseLhsType& lhs, const DenseRhsType& rhs, Res& res,
+                         const typename Res::Scalar& alpha, Index i, std::true_type /* has_compressed_storage */) {
+    typedef typename Lhs::Scalar LhsScalar;
+    typedef typename Lhs::StorageIndex StorageIndex;
+    const Lhs& mat = lhs;
+    const LhsScalar* vals = mat.valuePtr();
+    const StorageIndex* inds = mat.innerIndexPtr();
+    const Index start = mat.outerIndexPtr()[i];
+    const auto* innerNnz = mat.innerNonZeroPtr();
+    const Index end = innerNnz ? start + innerNnz[i] : mat.outerIndexPtr()[i + 1];
+    typename Res::RowXpr res_i(res.row(i));
+    for (Index k = start; k < end; ++k) res_i += (alpha * vals[k]) * rhs.row(inds[k]);
+  }
+
+  static void processRow(const LhsEval& lhsEval, const SparseLhsType& /*lhs*/, const DenseRhsType& rhs, Res& res,
+                         const typename Res::Scalar& alpha, Index i, std::false_type /* has_compressed_storage */) {
     typename Res::RowXpr res_i(res.row(i));
     for (LhsInnerIterator it(lhsEval, i); it; ++it) res_i += (alpha * it.value()) * rhs.row(it.index());
   }
 };
 
+// ColMajor, multiple columns (ColPerCol=false): sparse * dense_matrix
 template <typename SparseLhsType, typename DenseRhsType, typename DenseResType>
 struct sparse_time_dense_product_impl<SparseLhsType, DenseRhsType, DenseResType, typename DenseResType::Scalar,
                                       ColMajor, false> {
@@ -151,8 +295,32 @@ struct sparse_time_dense_product_impl<SparseLhsType, DenseRhsType, DenseResType
   typedef internal::remove_all_t<DenseRhsType> Rhs;
   typedef internal::remove_all_t<DenseResType> Res;
   typedef typename evaluator<Lhs>::InnerIterator LhsInnerIterator;
+
   static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res,
                   const typename Res::Scalar& alpha) {
+    runImpl(lhs, rhs, res, alpha, std::integral_constant<bool, has_compressed_storage<Lhs>::value>());
+  }
+
+  // Direct pointer path: works for both compressed and non-compressed storage.
+  static void runImpl(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res,
+                      const typename Res::Scalar& alpha, std::true_type /* has_compressed_storage */) {
+    typedef typename Lhs::Scalar LhsScalar;
+    typedef typename Lhs::StorageIndex StorageIndex;
+    const Lhs& mat = lhs;
+    const LhsScalar* vals = mat.valuePtr();
+    const StorageIndex* inds = mat.innerIndexPtr();
+    const auto* outer = mat.outerIndexPtr();
+    const auto* innerNnz = mat.innerNonZeroPtr();
+    for (Index j = 0; j < lhs.outerSize(); ++j) {
+      typename Rhs::ConstRowXpr rhs_j(rhs.row(j));
+      const Index start = outer[j];
+      const Index end = innerNnz ? outer[j] + innerNnz[j] : outer[j + 1];
+      for (Index k = start; k < end; ++k) res.row(inds[k]) += (alpha * vals[k]) * rhs_j;
+    }
+  }
+
+  static void runImpl(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res,
+                      const typename Res::Scalar& alpha, std::false_type /* has_compressed_storage */) {
     evaluator<Lhs> lhsEval(lhs);
     for (Index j = 0; j < lhs.outerSize(); ++j) {
       typename Rhs::ConstRowXpr rhs_j(rhs.row(j));
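
Two ideas recur throughout this patch and may be easier to see in isolation.
The first is the compile-time dispatch itself: has_compressed_storage computes
a bool, and an std::integral_constant argument selects between two overloads,
so the iterator fallback is never even instantiated when the fast path is
taken. A minimal self-contained sketch of that pattern, using hypothetical
names (has_fast_path, kernel, run) that are not from the patch:

    #include <iostream>
    #include <type_traits>

    // Hypothetical trait: true only for types that expose raw storage pointers.
    template <typename T>
    struct has_fast_path : std::false_type {};

    struct CompressedMat {};  // pretend this one has valuePtr()/innerIndexPtr()
    template <>
    struct has_fast_path<CompressedMat> : std::true_type {};

    struct ExprMat {};  // pretend this one only offers an InnerIterator

    template <typename Mat>
    void kernel(const Mat&, std::true_type) { std::cout << "direct pointers\n"; }

    template <typename Mat>
    void kernel(const Mat&, std::false_type) { std::cout << "iterator fallback\n"; }

    template <typename Mat>
    void run(const Mat& m) {
      // Overload resolution picks the kernel at compile time.
      kernel(m, std::integral_constant<bool, has_fast_path<Mat>::value>());
    }

    int main() {
      run(CompressedMat{});  // prints "direct pointers"
      run(ExprMat{});        // prints "iterator fallback"
    }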
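
The second recurring idea is the two-accumulator reduction in the CSR kernels.
A standalone sketch under assumed concrete types (double values, int indices,
unit-stride x; csr_row_dot is a hypothetical helper, not part of the patch).
Splitting the sum across sum0 and sum1 breaks the loop-carried dependency on a
single accumulator, letting the CPU overlap independent multiply-adds; the
result can differ from a single-accumulator loop in the last bits because of
floating-point reassociation:

    #include <cstddef>

    // Dot product of one CSR row with a dense vector x, using two accumulators.
    double csr_row_dot(const double* vals, const int* inds, std::ptrdiff_t begin,
                       std::ptrdiff_t end, const double* x) {
      double sum0 = 0.0, sum1 = 0.0;
      std::ptrdiff_t k = begin;
      for (; k + 1 < end; k += 2) {  // two independent chains per iteration
        sum0 += vals[k] * x[inds[k]];
        sum1 += vals[k + 1] * x[inds[k + 1]];
      }
      if (k < end) sum0 += vals[k] * x[inds[k]];  // odd-length remainder
      return sum0 + sum1;
    }

The parallel variant splits the rows with schedule(dynamic, chunk) where
chunk = (n + threads * 4 - 1) / (threads * 4), i.e. roughly four chunks per
thread; for example, n = 100000 rows on 8 threads gives chunks of 3125 rows.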
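
The ColMajor fast path is the transposed operation: instead of a dot product
per output row, it scatter-adds one scaled sparse column into the result
(y += A.col(j) * rhs_j for each j). A sketch with the same assumed types and a
hypothetical csc_axpy helper; the 4-way unroll only amortizes loop overhead,
since the scattered destinations y[inds[k]] rule out SIMD stores:

    #include <cstddef>

    // Scatter-add of one scaled CSC column into a contiguous result vector y.
    void csc_axpy(const double* vals, const int* inds, std::ptrdiff_t begin,
                  std::ptrdiff_t end, double rhs_j, double* y) {
      std::ptrdiff_t k = begin;
      for (; k + 3 < end; k += 4) {  // 4-way unrolled main loop
        y[inds[k]] += vals[k] * rhs_j;
        y[inds[k + 1]] += vals[k + 1] * rhs_j;
        y[inds[k + 2]] += vals[k + 2] * rhs_j;
        y[inds[k + 3]] += vals[k + 3] * rhs_j;
      }
      for (; k < end; ++k) y[inds[k]] += vals[k] * rhs_j;  // remainder
    }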
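
Nothing changes on the caller's side: SparseMatrix derives from
SparseCompressedBase, so ordinary products should take the direct-pointer path
whether or not the matrix has been compressed, with the innerNonZeroPtr()
check above selecting the right per-row extent in either mode. For example
(sizes and values arbitrary):

    #include <Eigen/Dense>
    #include <Eigen/Sparse>

    int main() {
      Eigen::SparseMatrix<double, Eigen::RowMajor> A(1000, 1000);
      A.insert(0, 1) = 2.0;  // insert() leaves A in uncompressed mode
      Eigen::VectorXd x = Eigen::VectorXd::Random(1000);

      Eigen::VectorXd y1 = A * x;  // uncompressed: extents from innerNonZeroPtr()
      A.makeCompressed();          // now innerNonZeroPtr() == nullptr
      Eigen::VectorXd y2 = A * x;  // compressed: extents from outerIndexPtr()[i + 1]
      return 0;
    }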