Optimize sparse-dense product by bypassing InnerIterator for compressed storage

libeigen/eigen!2134

Closes #3026

Co-authored-by: Rasmus Munk Larsen <rmlarsen@gmail.com>
This commit is contained in:
Rasmus Munk Larsen
2026-02-17 19:19:18 -08:00
parent b6b2f31ba8
commit 50d6d92a70

View File

@@ -26,11 +26,18 @@ struct product_promote_storage_type<Dense, Sparse, OuterProduct> {
typedef Sparse ret;
};
// Type trait to detect if a sparse type supports direct compressed storage access
// (i.e., has valuePtr(), innerIndexPtr(), outerIndexPtr(), isCompressed()).
// All types deriving from SparseCompressedBase provide these methods.
template <typename T>
struct has_compressed_storage : std::is_base_of<SparseCompressedBase<T>, T> {};
template <typename SparseLhsType, typename DenseRhsType, typename DenseResType, typename AlphaType,
int LhsStorageOrder = ((SparseLhsType::Flags & RowMajorBit) == RowMajorBit) ? RowMajor : ColMajor,
bool ColPerCol = ((DenseRhsType::Flags & RowMajorBit) == 0) || DenseRhsType::ColsAtCompileTime == 1>
struct sparse_time_dense_product_impl;
// RowMajor, single column (ColPerCol=true): CSR SpMV
template <typename SparseLhsType, typename DenseRhsType, typename DenseResType>
struct sparse_time_dense_product_impl<SparseLhsType, DenseRhsType, DenseResType, typename DenseResType::Scalar,
RowMajor, true> {
@@ -39,36 +46,101 @@ struct sparse_time_dense_product_impl<SparseLhsType, DenseRhsType, DenseResType,
typedef internal::remove_all_t<DenseResType> Res;
typedef typename evaluator<Lhs>::InnerIterator LhsInnerIterator;
typedef evaluator<Lhs> LhsEval;
typedef typename Res::Scalar ResScalar;
static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res,
const typename Res::Scalar& alpha) {
LhsEval lhsEval(lhs);
Index n = lhs.outerSize();
#ifdef EIGEN_HAS_OPENMP
Index threads = Eigen::nbThreads();
#endif
for (Index c = 0; c < rhs.cols(); ++c) {
runCol(lhsEval, lhs, rhs, res, alpha, n, c, std::integral_constant<bool, has_compressed_storage<Lhs>::value>());
}
}
// Direct pointer path: works for both compressed and non-compressed storage.
static void runCol(const LhsEval& /*lhsEval*/, const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res,
const ResScalar& alpha, Index n, Index c, std::true_type /* has_compressed_storage */) {
const Lhs& mat = lhs;
const auto* vals = mat.valuePtr();
const auto* inds = mat.innerIndexPtr();
const auto* outer = mat.outerIndexPtr();
const auto* innerNnz = mat.innerNonZeroPtr();
// The fast rhs pointer path requires unit inner stride (common case: VectorXd, contiguous matrix column).
if (rhs.innerStride() == 1) {
const auto* x = rhs.data() + c * rhs.outerStride();
#ifdef EIGEN_HAS_OPENMP
// This 20000 threshold has been found experimentally on 2D and 3D Poisson problems.
// It basically represents the minimal amount of work to be done to be worth it.
if (threads > 1 && lhsEval.nonZerosEstimate() > 20000) {
Index threads = Eigen::nbThreads();
if (threads > 1 && mat.nonZeros() > 20000) {
#pragma omp parallel for schedule(dynamic, (n + threads * 4 - 1) / (threads * 4)) num_threads(threads)
for (Index i = 0; i < n; ++i) processRow(lhsEval, rhs, res, alpha, i, c);
for (Index i = 0; i < n; ++i) {
Index k = outer[i];
const Index end = innerNnz ? outer[i] + innerNnz[i] : outer[i + 1];
ResScalar sum0(0), sum1(0);
for (; k < end; ++k) {
sum0 += vals[k] * x[inds[k]];
++k;
if (k < end) {
sum1 += vals[k] * x[inds[k]];
}
}
res.coeffRef(i, c) += alpha * (sum0 + sum1);
}
} else
#endif
{
for (Index i = 0; i < n; ++i) processRow(lhsEval, rhs, res, alpha, i, c);
for (Index i = 0; i < n; ++i) {
Index k = outer[i];
const Index end = innerNnz ? outer[i] + innerNnz[i] : outer[i + 1];
// Two independent accumulators to break the dependency chain
ResScalar sum0(0), sum1(0);
for (; k < end; ++k) {
sum0 += vals[k] * x[inds[k]];
++k;
if (k < end) {
sum1 += vals[k] * x[inds[k]];
}
}
res.coeffRef(i, c) += alpha * (sum0 + sum1);
}
}
} else {
// Non-unit rhs stride: use direct pointers for sparse side, coeff() for rhs
for (Index i = 0; i < n; ++i) {
Index k = outer[i];
const Index end = innerNnz ? outer[i] + innerNnz[i] : outer[i + 1];
ResScalar sum0(0), sum1(0);
for (; k < end; ++k) {
sum0 += vals[k] * rhs.coeff(inds[k], c);
++k;
if (k < end) {
sum1 += vals[k] * rhs.coeff(inds[k], c);
}
}
res.coeffRef(i, c) += alpha * (sum0 + sum1);
}
}
}
static void processRow(const LhsEval& lhsEval, const DenseRhsType& rhs, DenseResType& res,
const typename Res::Scalar& alpha, Index i, Index col) {
// Two accumulators, which breaks the dependency chain on the accumulator
// and allows more instruction-level parallelism in the following loop
typename Res::Scalar tmp_a(0);
typename Res::Scalar tmp_b(0);
// Iterator fallback path
static void runCol(const LhsEval& lhsEval, const SparseLhsType& /*lhs*/, const DenseRhsType& rhs, DenseResType& res,
const ResScalar& alpha, Index n, Index c, std::false_type /* has_compressed_storage */) {
#ifdef EIGEN_HAS_OPENMP
Index threads = Eigen::nbThreads();
if (threads > 1 && lhsEval.nonZerosEstimate() > 20000) {
#pragma omp parallel for schedule(dynamic, (n + threads * 4 - 1) / (threads * 4)) num_threads(threads)
for (Index i = 0; i < n; ++i) processRow(lhsEval, rhs, res, alpha, i, c);
} else
#endif
{
for (Index i = 0; i < n; ++i) processRow(lhsEval, rhs, res, alpha, i, c);
}
}
static void processRow(const LhsEval& lhsEval, const DenseRhsType& rhs, DenseResType& res, const ResScalar& alpha,
Index i, Index col) {
ResScalar tmp_a(0);
ResScalar tmp_b(0);
for (LhsInnerIterator it(lhsEval, i); it; ++it) {
tmp_a += it.value() * rhs.coeff(it.index(), col);
++it;
@@ -91,6 +163,7 @@ struct sparse_time_dense_product_impl<SparseLhsType, DenseRhsType, DenseResType,
// typedef typename CwiseUnaryOp<scalar_multiple2_op<T1, typename T2::Scalar>, T2>::PlainObject ReturnType;
// };
// ColMajor, single column (ColPerCol=true): CSC SpMV
template <typename SparseLhsType, typename DenseRhsType, typename DenseResType, typename AlphaType>
struct sparse_time_dense_product_impl<SparseLhsType, DenseRhsType, DenseResType, AlphaType, ColMajor, true> {
typedef internal::remove_all_t<SparseLhsType> Lhs;
@@ -98,11 +171,60 @@ struct sparse_time_dense_product_impl<SparseLhsType, DenseRhsType, DenseResType,
typedef internal::remove_all_t<DenseResType> Res;
typedef evaluator<Lhs> LhsEval;
typedef typename LhsEval::InnerIterator LhsInnerIterator;
static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const AlphaType& alpha) {
runImpl(lhs, rhs, res, alpha, std::integral_constant<bool, has_compressed_storage<Lhs>::value>());
}
// Direct pointer path: works for both compressed and non-compressed storage.
static void runImpl(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const AlphaType& alpha,
std::true_type /* has_compressed_storage */) {
typedef typename Lhs::Scalar LhsScalar;
typedef typename Lhs::StorageIndex StorageIndex;
const Lhs& mat = lhs;
const LhsScalar* vals = mat.valuePtr();
const StorageIndex* inds = mat.innerIndexPtr();
const auto* outer = mat.outerIndexPtr();
const auto* innerNnz = mat.innerNonZeroPtr();
// The fast result pointer path requires contiguous ColMajor result layout.
// Transpose<ColMajor> reports innerStride()==1 but is actually RowMajor, so check both.
if (!(Res::Flags & RowMajorBit) && res.innerStride() == 1) {
for (Index c = 0; c < rhs.cols(); ++c) {
typename Res::Scalar* y = res.data() + c * res.outerStride();
for (Index j = 0; j < lhs.outerSize(); ++j) {
typename ScalarBinaryOpTraits<AlphaType, typename Rhs::Scalar>::ReturnType rhs_j(alpha * rhs.coeff(j, c));
const Index start = outer[j];
const Index end = innerNnz ? outer[j] + innerNnz[j] : outer[j + 1];
Index k = start;
// 4-way unrolled scatter-add (no SIMD: writes are scattered)
for (; k + 3 < end; k += 4) {
y[inds[k]] += vals[k] * rhs_j;
y[inds[k + 1]] += vals[k + 1] * rhs_j;
y[inds[k + 2]] += vals[k + 2] * rhs_j;
y[inds[k + 3]] += vals[k + 3] * rhs_j;
}
for (; k < end; ++k) y[inds[k]] += vals[k] * rhs_j;
}
}
} else {
// Non-unit result stride: use coeffRef() for result access
for (Index c = 0; c < rhs.cols(); ++c) {
for (Index j = 0; j < lhs.outerSize(); ++j) {
typename ScalarBinaryOpTraits<AlphaType, typename Rhs::Scalar>::ReturnType rhs_j(alpha * rhs.coeff(j, c));
const Index start = outer[j];
const Index end = innerNnz ? outer[j] + innerNnz[j] : outer[j + 1];
for (Index k = start; k < end; ++k) res.coeffRef(inds[k], c) += vals[k] * rhs_j;
}
}
}
}
// Iterator-based fallback
static void runImpl(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const AlphaType& alpha,
std::false_type /* has_compressed_storage */) {
LhsEval lhsEval(lhs);
for (Index c = 0; c < rhs.cols(); ++c) {
for (Index j = 0; j < lhs.outerSize(); ++j) {
// typename Res::Scalar rhs_j = alpha * rhs.coeff(j,c);
typename ScalarBinaryOpTraits<AlphaType, typename Rhs::Scalar>::ReturnType rhs_j(alpha * rhs.coeff(j, c));
for (LhsInnerIterator it(lhsEval, j); it; ++it) res.coeffRef(it.index(), c) += it.value() * rhs_j;
}
@@ -110,6 +232,7 @@ struct sparse_time_dense_product_impl<SparseLhsType, DenseRhsType, DenseResType,
}
};
// RowMajor, multiple columns (ColPerCol=false): sparse * dense_matrix
template <typename SparseLhsType, typename DenseRhsType, typename DenseResType>
struct sparse_time_dense_product_impl<SparseLhsType, DenseRhsType, DenseResType, typename DenseResType::Scalar,
RowMajor, false> {
@@ -118,6 +241,9 @@ struct sparse_time_dense_product_impl<SparseLhsType, DenseRhsType, DenseResType,
typedef internal::remove_all_t<DenseResType> Res;
typedef evaluator<Lhs> LhsEval;
typedef typename LhsEval::InnerIterator LhsInnerIterator;
static constexpr bool IsCompressedLhs = has_compressed_storage<Lhs>::value;
static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res,
const typename Res::Scalar& alpha) {
Index n = lhs.rows();
@@ -129,21 +255,39 @@ struct sparse_time_dense_product_impl<SparseLhsType, DenseRhsType, DenseResType,
// It basically represents the minimal amount of work to be done to be worth it.
if (threads > 1 && lhsEval.nonZerosEstimate() * rhs.cols() > 20000) {
#pragma omp parallel for schedule(dynamic, (n + threads * 4 - 1) / (threads * 4)) num_threads(threads)
for (Index i = 0; i < n; ++i) processRow(lhsEval, rhs, res, alpha, i);
for (Index i = 0; i < n; ++i)
processRow(lhsEval, lhs, rhs, res, alpha, i, std::integral_constant<bool, IsCompressedLhs>());
} else
#endif
{
for (Index i = 0; i < n; ++i) processRow(lhsEval, rhs, res, alpha, i);
for (Index i = 0; i < n; ++i)
processRow(lhsEval, lhs, rhs, res, alpha, i, std::integral_constant<bool, IsCompressedLhs>());
}
}
static void processRow(const LhsEval& lhsEval, const DenseRhsType& rhs, Res& res, const typename Res::Scalar& alpha,
Index i) {
// Direct pointer path: works for both compressed and non-compressed storage.
static void processRow(const LhsEval& /*lhsEval*/, const SparseLhsType& lhs, const DenseRhsType& rhs, Res& res,
const typename Res::Scalar& alpha, Index i, std::true_type /* has_compressed_storage */) {
typedef typename Lhs::Scalar LhsScalar;
typedef typename Lhs::StorageIndex StorageIndex;
const Lhs& mat = lhs;
const LhsScalar* vals = mat.valuePtr();
const StorageIndex* inds = mat.innerIndexPtr();
const Index start = mat.outerIndexPtr()[i];
const auto* innerNnz = mat.innerNonZeroPtr();
const Index end = innerNnz ? start + innerNnz[i] : mat.outerIndexPtr()[i + 1];
typename Res::RowXpr res_i(res.row(i));
for (Index k = start; k < end; ++k) res_i += (alpha * vals[k]) * rhs.row(inds[k]);
}
static void processRow(const LhsEval& lhsEval, const SparseLhsType& /*lhs*/, const DenseRhsType& rhs, Res& res,
const typename Res::Scalar& alpha, Index i, std::false_type /* has_compressed_storage */) {
typename Res::RowXpr res_i(res.row(i));
for (LhsInnerIterator it(lhsEval, i); it; ++it) res_i += (alpha * it.value()) * rhs.row(it.index());
}
};
// ColMajor, multiple columns (ColPerCol=false): sparse * dense_matrix
template <typename SparseLhsType, typename DenseRhsType, typename DenseResType>
struct sparse_time_dense_product_impl<SparseLhsType, DenseRhsType, DenseResType, typename DenseResType::Scalar,
ColMajor, false> {
@@ -151,8 +295,32 @@ struct sparse_time_dense_product_impl<SparseLhsType, DenseRhsType, DenseResType,
typedef internal::remove_all_t<DenseRhsType> Rhs;
typedef internal::remove_all_t<DenseResType> Res;
typedef typename evaluator<Lhs>::InnerIterator LhsInnerIterator;
static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res,
const typename Res::Scalar& alpha) {
runImpl(lhs, rhs, res, alpha, std::integral_constant<bool, has_compressed_storage<Lhs>::value>());
}
// Direct pointer path: works for both compressed and non-compressed storage.
static void runImpl(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res,
const typename Res::Scalar& alpha, std::true_type /* has_compressed_storage */) {
typedef typename Lhs::Scalar LhsScalar;
typedef typename Lhs::StorageIndex StorageIndex;
const Lhs& mat = lhs;
const LhsScalar* vals = mat.valuePtr();
const StorageIndex* inds = mat.innerIndexPtr();
const auto* outer = mat.outerIndexPtr();
const auto* innerNnz = mat.innerNonZeroPtr();
for (Index j = 0; j < lhs.outerSize(); ++j) {
typename Rhs::ConstRowXpr rhs_j(rhs.row(j));
const Index start = outer[j];
const Index end = innerNnz ? outer[j] + innerNnz[j] : outer[j + 1];
for (Index k = start; k < end; ++k) res.row(inds[k]) += (alpha * vals[k]) * rhs_j;
}
}
static void runImpl(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res,
const typename Res::Scalar& alpha, std::false_type /* has_compressed_storage */) {
evaluator<Lhs> lhsEval(lhs);
for (Index j = 0; j < lhs.outerSize(); ++j) {
typename Rhs::ConstRowXpr rhs_j(rhs.row(j));