Optimize sparse-dense product by bypassing InnerIterator for compressed storage
Merge request libeigen/eigen!2134, closes #3026
Co-authored-by: Rasmus Munk Larsen <rmlarsen@gmail.com>
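
For context, a minimal usage sketch (not part of the patch): the expression below is the kind of sparse-dense product that now takes the direct-pointer path when the sparse operand is stored compressed.

#include <Eigen/Dense>
#include <Eigen/Sparse>

int main() {
  // Row-major sparse matrix in compressed (CSR) form: valuePtr()/innerIndexPtr()/outerIndexPtr().
  Eigen::SparseMatrix<double, Eigen::RowMajor> A(1000, 1000);
  A.insert(0, 1) = 2.0;  // ... fill A ...
  A.makeCompressed();

  Eigen::VectorXd x = Eigen::VectorXd::Ones(1000);
  Eigen::VectorXd y = Eigen::VectorXd::Zero(1000);
  y.noalias() += 2.0 * (A * x);  // sparse * dense vector with alpha = 2.0
  return 0;
}
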
@@ -26,11 +26,18 @@ struct product_promote_storage_type<Dense, Sparse, OuterProduct> {
  typedef Sparse ret;
};

// Type trait to detect if a sparse type supports direct compressed storage access
// (i.e., has valuePtr(), innerIndexPtr(), outerIndexPtr(), isCompressed()).
// All types deriving from SparseCompressedBase provide these methods.
template <typename T>
struct has_compressed_storage : std::is_base_of<SparseCompressedBase<T>, T> {};
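
As an illustration of what the trait accepts (assuming it is visible as Eigen::internal::has_compressed_storage, as in this patch):

// Illustrative checks, not part of the patch.
// Plain containers and maps derive from SparseCompressedBase and expose raw arrays;
// lazy sparse expressions (e.g. A + B, A.cwiseAbs()) do not, and keep the InnerIterator path.
static_assert(Eigen::internal::has_compressed_storage<Eigen::SparseMatrix<double>>::value, "");
static_assert(Eigen::internal::has_compressed_storage<Eigen::Map<Eigen::SparseMatrix<double>>>::value, "");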

template <typename SparseLhsType, typename DenseRhsType, typename DenseResType, typename AlphaType,
          int LhsStorageOrder = ((SparseLhsType::Flags & RowMajorBit) == RowMajorBit) ? RowMajor : ColMajor,
          bool ColPerCol = ((DenseRhsType::Flags & RowMajorBit) == 0) || DenseRhsType::ColsAtCompileTime == 1>
struct sparse_time_dense_product_impl;
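
The two defaulted parameters encode the dispatch: LhsStorageOrder comes from the sparse operand's RowMajorBit, and ColPerCol is true when the dense rhs is column-major or a single column. A couple of illustrative checks (not part of the patch):

// A row-major sparse lhs with a dense vector rhs selects the <RowMajor, true> (CSR SpMV) specialization.
static_assert((Eigen::SparseMatrix<double, Eigen::RowMajor>::Flags & Eigen::RowMajorBit) != 0, "");
static_assert((Eigen::SparseMatrix<double>::Flags & Eigen::RowMajorBit) == 0, "");  // default is column-major
static_assert(Eigen::VectorXd::ColsAtCompileTime == 1, "");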

// RowMajor, single column (ColPerCol=true): CSR SpMV
template <typename SparseLhsType, typename DenseRhsType, typename DenseResType>
struct sparse_time_dense_product_impl<SparseLhsType, DenseRhsType, DenseResType, typename DenseResType::Scalar,
                                      RowMajor, true> {
@@ -39,36 +46,101 @@ struct sparse_time_dense_product_impl<SparseLhsType, DenseRhsType, DenseResType,
  typedef internal::remove_all_t<DenseResType> Res;
  typedef typename evaluator<Lhs>::InnerIterator LhsInnerIterator;
  typedef evaluator<Lhs> LhsEval;
  typedef typename Res::Scalar ResScalar;

  static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res,
                  const typename Res::Scalar& alpha) {
    LhsEval lhsEval(lhs);

    Index n = lhs.outerSize();
#ifdef EIGEN_HAS_OPENMP
    Index threads = Eigen::nbThreads();
#endif

    for (Index c = 0; c < rhs.cols(); ++c) {
      runCol(lhsEval, lhs, rhs, res, alpha, n, c, std::integral_constant<bool, has_compressed_storage<Lhs>::value>());
    }
  }
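
runCol is overloaded on std::true_type / std::false_type, so the integral_constant argument selects the direct-pointer or the iterator overload at compile time. The idiom in isolation (sketch, hypothetical names):

#include <type_traits>

void work(std::true_type) { /* direct-pointer path */ }
void work(std::false_type) { /* iterator fallback */ }

template <bool Flag>
void dispatch() {
  // The tag has a distinct type for true and false, so overload resolution
  // picks the body at compile time; there is no runtime branch.
  work(std::integral_constant<bool, Flag>());
}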

  // Direct pointer path: works for both compressed and non-compressed storage.
  static void runCol(const LhsEval& /*lhsEval*/, const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res,
                     const ResScalar& alpha, Index n, Index c, std::true_type /* has_compressed_storage */) {
    const Lhs& mat = lhs;
    const auto* vals = mat.valuePtr();
    const auto* inds = mat.innerIndexPtr();
    const auto* outer = mat.outerIndexPtr();
    const auto* innerNnz = mat.innerNonZeroPtr();
    // The fast rhs pointer path requires unit inner stride (common case: VectorXd, contiguous matrix column).
    if (rhs.innerStride() == 1) {
      const auto* x = rhs.data() + c * rhs.outerStride();
#ifdef EIGEN_HAS_OPENMP
      // This 20000 threshold has been found experimentally on 2D and 3D Poisson problems.
      // It basically represents the minimal amount of work to be done to be worth it.
      if (threads > 1 && lhsEval.nonZerosEstimate() > 20000) {
      Index threads = Eigen::nbThreads();
      if (threads > 1 && mat.nonZeros() > 20000) {
#pragma omp parallel for schedule(dynamic, (n + threads * 4 - 1) / (threads * 4)) num_threads(threads)
        for (Index i = 0; i < n; ++i) processRow(lhsEval, rhs, res, alpha, i, c);
        for (Index i = 0; i < n; ++i) {
          Index k = outer[i];
          const Index end = innerNnz ? outer[i] + innerNnz[i] : outer[i + 1];
          ResScalar sum0(0), sum1(0);
          for (; k < end; ++k) {
            sum0 += vals[k] * x[inds[k]];
            ++k;
            if (k < end) {
              sum1 += vals[k] * x[inds[k]];
            }
          }
          res.coeffRef(i, c) += alpha * (sum0 + sum1);
        }
      } else
#endif
      {
        for (Index i = 0; i < n; ++i) processRow(lhsEval, rhs, res, alpha, i, c);
        for (Index i = 0; i < n; ++i) {
          Index k = outer[i];
          const Index end = innerNnz ? outer[i] + innerNnz[i] : outer[i + 1];
          // Two independent accumulators to break the dependency chain
          ResScalar sum0(0), sum1(0);
          for (; k < end; ++k) {
            sum0 += vals[k] * x[inds[k]];
            ++k;
            if (k < end) {
              sum1 += vals[k] * x[inds[k]];
            }
          }
          res.coeffRef(i, c) += alpha * (sum0 + sum1);
        }
      }
    } else {
      // Non-unit rhs stride: use direct pointers for sparse side, coeff() for rhs
      for (Index i = 0; i < n; ++i) {
        Index k = outer[i];
        const Index end = innerNnz ? outer[i] + innerNnz[i] : outer[i + 1];
        ResScalar sum0(0), sum1(0);
        for (; k < end; ++k) {
          sum0 += vals[k] * rhs.coeff(inds[k], c);
          ++k;
          if (k < end) {
            sum1 += vals[k] * rhs.coeff(inds[k], c);
          }
        }
        res.coeffRef(i, c) += alpha * (sum0 + sum1);
      }
    }
  }
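
The sum0/sum1 pair above is a standard two-accumulator reduction. Stripped of Eigen types, the per-row kernel is essentially the following sketch (hypothetical standalone function, raw CSR arrays):

// Two independent sums shorten the loop-carried dependency chain, so the additions
// for even and odd entries can overlap in the floating-point pipeline.
double csr_row_dot(const double* vals, const int* inds, long begin, long end, const double* x) {
  double sum0 = 0.0, sum1 = 0.0;
  long k = begin;
  for (; k + 1 < end; k += 2) {  // handle entries in pairs
    sum0 += vals[k] * x[inds[k]];
    sum1 += vals[k + 1] * x[inds[k + 1]];
  }
  if (k < end) sum0 += vals[k] * x[inds[k]];  // odd remainder
  return sum0 + sum1;
}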

  static void processRow(const LhsEval& lhsEval, const DenseRhsType& rhs, DenseResType& res,
                         const typename Res::Scalar& alpha, Index i, Index col) {
    // Two accumulators, which breaks the dependency chain on the accumulator
    // and allows more instruction-level parallelism in the following loop
    typename Res::Scalar tmp_a(0);
    typename Res::Scalar tmp_b(0);
  // Iterator fallback path
  static void runCol(const LhsEval& lhsEval, const SparseLhsType& /*lhs*/, const DenseRhsType& rhs, DenseResType& res,
                     const ResScalar& alpha, Index n, Index c, std::false_type /* has_compressed_storage */) {
#ifdef EIGEN_HAS_OPENMP
    Index threads = Eigen::nbThreads();
    if (threads > 1 && lhsEval.nonZerosEstimate() > 20000) {
#pragma omp parallel for schedule(dynamic, (n + threads * 4 - 1) / (threads * 4)) num_threads(threads)
      for (Index i = 0; i < n; ++i) processRow(lhsEval, rhs, res, alpha, i, c);
    } else
#endif
    {
      for (Index i = 0; i < n; ++i) processRow(lhsEval, rhs, res, alpha, i, c);
    }
  }
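
Both runCol overloads share the same parallel strategy: a dynamic schedule with roughly four chunks per thread, guarded by the 20000-nonzero threshold. In isolation the pattern looks like this (sketch, assuming OpenMP is enabled; process_row is a hypothetical per-row kernel):

void parallel_rows(long n, int threads) {
  // Chunk size of about n / (4 * threads): dynamic scheduling rebalances work when
  // rows have very uneven nonzero counts, at a modest scheduling cost.
#pragma omp parallel for schedule(dynamic, (n + threads * 4 - 1) / (threads * 4)) num_threads(threads)
  for (long i = 0; i < n; ++i) {
    // process_row(i);
  }
}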

  static void processRow(const LhsEval& lhsEval, const DenseRhsType& rhs, DenseResType& res, const ResScalar& alpha,
                         Index i, Index col) {
    ResScalar tmp_a(0);
    ResScalar tmp_b(0);
    for (LhsInnerIterator it(lhsEval, i); it; ++it) {
      tmp_a += it.value() * rhs.coeff(it.index(), col);
      ++it;
@@ -91,6 +163,7 @@ struct sparse_time_dense_product_impl<SparseLhsType, DenseRhsType, DenseResType,
//   typedef typename CwiseUnaryOp<scalar_multiple2_op<T1, typename T2::Scalar>, T2>::PlainObject ReturnType;
// };

// ColMajor, single column (ColPerCol=true): CSC SpMV
template <typename SparseLhsType, typename DenseRhsType, typename DenseResType, typename AlphaType>
struct sparse_time_dense_product_impl<SparseLhsType, DenseRhsType, DenseResType, AlphaType, ColMajor, true> {
  typedef internal::remove_all_t<SparseLhsType> Lhs;
@@ -98,11 +171,60 @@ struct sparse_time_dense_product_impl<SparseLhsType, DenseRhsType, DenseResType,
  typedef internal::remove_all_t<DenseResType> Res;
  typedef evaluator<Lhs> LhsEval;
  typedef typename LhsEval::InnerIterator LhsInnerIterator;

  static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const AlphaType& alpha) {
    runImpl(lhs, rhs, res, alpha, std::integral_constant<bool, has_compressed_storage<Lhs>::value>());
  }

  // Direct pointer path: works for both compressed and non-compressed storage.
  static void runImpl(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const AlphaType& alpha,
                      std::true_type /* has_compressed_storage */) {
    typedef typename Lhs::Scalar LhsScalar;
    typedef typename Lhs::StorageIndex StorageIndex;
    const Lhs& mat = lhs;
    const LhsScalar* vals = mat.valuePtr();
    const StorageIndex* inds = mat.innerIndexPtr();
    const auto* outer = mat.outerIndexPtr();
    const auto* innerNnz = mat.innerNonZeroPtr();
    // The fast result pointer path requires contiguous ColMajor result layout.
    // Transpose<ColMajor> reports innerStride()==1 but is actually RowMajor, so check both.
    if (!(Res::Flags & RowMajorBit) && res.innerStride() == 1) {
      for (Index c = 0; c < rhs.cols(); ++c) {
        typename Res::Scalar* y = res.data() + c * res.outerStride();
        for (Index j = 0; j < lhs.outerSize(); ++j) {
          typename ScalarBinaryOpTraits<AlphaType, typename Rhs::Scalar>::ReturnType rhs_j(alpha * rhs.coeff(j, c));
          const Index start = outer[j];
          const Index end = innerNnz ? outer[j] + innerNnz[j] : outer[j + 1];
          Index k = start;
          // 4-way unrolled scatter-add (no SIMD: writes are scattered)
          for (; k + 3 < end; k += 4) {
            y[inds[k]] += vals[k] * rhs_j;
            y[inds[k + 1]] += vals[k + 1] * rhs_j;
            y[inds[k + 2]] += vals[k + 2] * rhs_j;
            y[inds[k + 3]] += vals[k + 3] * rhs_j;
          }
          for (; k < end; ++k) y[inds[k]] += vals[k] * rhs_j;
        }
      }
    } else {
      // Non-unit result stride: use coeffRef() for result access
      for (Index c = 0; c < rhs.cols(); ++c) {
        for (Index j = 0; j < lhs.outerSize(); ++j) {
          typename ScalarBinaryOpTraits<AlphaType, typename Rhs::Scalar>::ReturnType rhs_j(alpha * rhs.coeff(j, c));
          const Index start = outer[j];
          const Index end = innerNnz ? outer[j] + innerNnz[j] : outer[j + 1];
          for (Index k = start; k < end; ++k) res.coeffRef(inds[k], c) += vals[k] * rhs_j;
        }
      }
    }
  }
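
Stripped to raw arrays, the fast branch above is a classic CSC column scatter-add. A minimal sketch (hypothetical standalone function, not the patch itself):

// y += alpha * A * x for a compressed column-major (CSC) matrix on raw arrays.
// Each column j scales x[j] once and scatters it into y. The 4-way unroll mirrors the
// loop above; the stores stay scalar because the destinations y[inds[k]] are scattered.
void csc_spmv(long cols, const long* outer, const int* inds, const double* vals,
              const double* x, double* y, double alpha) {
  for (long j = 0; j < cols; ++j) {
    const double xj = alpha * x[j];
    long k = outer[j];
    const long end = outer[j + 1];
    for (; k + 3 < end; k += 4) {
      y[inds[k]] += vals[k] * xj;
      y[inds[k + 1]] += vals[k + 1] * xj;
      y[inds[k + 2]] += vals[k + 2] * xj;
      y[inds[k + 3]] += vals[k + 3] * xj;
    }
    for (; k < end; ++k) y[inds[k]] += vals[k] * xj;
  }
}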

  // Iterator-based fallback
  static void runImpl(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const AlphaType& alpha,
                      std::false_type /* has_compressed_storage */) {
    LhsEval lhsEval(lhs);
    for (Index c = 0; c < rhs.cols(); ++c) {
      for (Index j = 0; j < lhs.outerSize(); ++j) {
        // typename Res::Scalar rhs_j = alpha * rhs.coeff(j,c);
        typename ScalarBinaryOpTraits<AlphaType, typename Rhs::Scalar>::ReturnType rhs_j(alpha * rhs.coeff(j, c));
        for (LhsInnerIterator it(lhsEval, j); it; ++it) res.coeffRef(it.index(), c) += it.value() * rhs_j;
      }
@@ -110,6 +232,7 @@ struct sparse_time_dense_product_impl<SparseLhsType, DenseRhsType, DenseResType,
  }
};

// RowMajor, multiple columns (ColPerCol=false): sparse * dense_matrix
template <typename SparseLhsType, typename DenseRhsType, typename DenseResType>
struct sparse_time_dense_product_impl<SparseLhsType, DenseRhsType, DenseResType, typename DenseResType::Scalar,
                                      RowMajor, false> {
@@ -118,6 +241,9 @@ struct sparse_time_dense_product_impl<SparseLhsType, DenseRhsType, DenseResType,
  typedef internal::remove_all_t<DenseResType> Res;
  typedef evaluator<Lhs> LhsEval;
  typedef typename LhsEval::InnerIterator LhsInnerIterator;

  static constexpr bool IsCompressedLhs = has_compressed_storage<Lhs>::value;

  static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res,
                  const typename Res::Scalar& alpha) {
    Index n = lhs.rows();
@@ -129,21 +255,39 @@ struct sparse_time_dense_product_impl<SparseLhsType, DenseRhsType, DenseResType,
    // It basically represents the minimal amount of work to be done to be worth it.
    if (threads > 1 && lhsEval.nonZerosEstimate() * rhs.cols() > 20000) {
#pragma omp parallel for schedule(dynamic, (n + threads * 4 - 1) / (threads * 4)) num_threads(threads)
      for (Index i = 0; i < n; ++i) processRow(lhsEval, rhs, res, alpha, i);
      for (Index i = 0; i < n; ++i)
        processRow(lhsEval, lhs, rhs, res, alpha, i, std::integral_constant<bool, IsCompressedLhs>());
    } else
#endif
    {
      for (Index i = 0; i < n; ++i) processRow(lhsEval, rhs, res, alpha, i);
      for (Index i = 0; i < n; ++i)
        processRow(lhsEval, lhs, rhs, res, alpha, i, std::integral_constant<bool, IsCompressedLhs>());
    }
  }

  static void processRow(const LhsEval& lhsEval, const DenseRhsType& rhs, Res& res, const typename Res::Scalar& alpha,
                         Index i) {
  // Direct pointer path: works for both compressed and non-compressed storage.
  static void processRow(const LhsEval& /*lhsEval*/, const SparseLhsType& lhs, const DenseRhsType& rhs, Res& res,
                         const typename Res::Scalar& alpha, Index i, std::true_type /* has_compressed_storage */) {
    typedef typename Lhs::Scalar LhsScalar;
    typedef typename Lhs::StorageIndex StorageIndex;
    const Lhs& mat = lhs;
    const LhsScalar* vals = mat.valuePtr();
    const StorageIndex* inds = mat.innerIndexPtr();
    const Index start = mat.outerIndexPtr()[i];
    const auto* innerNnz = mat.innerNonZeroPtr();
    const Index end = innerNnz ? start + innerNnz[i] : mat.outerIndexPtr()[i + 1];
    typename Res::RowXpr res_i(res.row(i));
    for (Index k = start; k < end; ++k) res_i += (alpha * vals[k]) * rhs.row(inds[k]);
  }
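
For a multi-column dense rhs, each stored entry (i, j) of the row-major lhs contributes a scaled dense row to row i of the result, so the inner update is a row-wise AXPY that Eigen can vectorize. A roughly equivalent sketch (illustrative, generic types assumed):

template <typename Res, typename Rhs>
void accumulate_row(Res& res, const Rhs& rhs, const double* vals, const int* inds,
                    long start, long end, double alpha, long i) {
  // res.row(i) += alpha * A(i, inds[k]) * rhs.row(inds[k]) for every stored entry of row i;
  // only the rhs row lookups are scattered, the row additions themselves are contiguous.
  auto res_i = res.row(i);
  for (long k = start; k < end; ++k) res_i += (alpha * vals[k]) * rhs.row(inds[k]);
}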

  static void processRow(const LhsEval& lhsEval, const SparseLhsType& /*lhs*/, const DenseRhsType& rhs, Res& res,
                         const typename Res::Scalar& alpha, Index i, std::false_type /* has_compressed_storage */) {
    typename Res::RowXpr res_i(res.row(i));
    for (LhsInnerIterator it(lhsEval, i); it; ++it) res_i += (alpha * it.value()) * rhs.row(it.index());
  }
};

// ColMajor, multiple columns (ColPerCol=false): sparse * dense_matrix
template <typename SparseLhsType, typename DenseRhsType, typename DenseResType>
struct sparse_time_dense_product_impl<SparseLhsType, DenseRhsType, DenseResType, typename DenseResType::Scalar,
                                      ColMajor, false> {
@@ -151,8 +295,32 @@ struct sparse_time_dense_product_impl<SparseLhsType, DenseRhsType, DenseResType,
  typedef internal::remove_all_t<DenseRhsType> Rhs;
  typedef internal::remove_all_t<DenseResType> Res;
  typedef typename evaluator<Lhs>::InnerIterator LhsInnerIterator;

  static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res,
                  const typename Res::Scalar& alpha) {
    runImpl(lhs, rhs, res, alpha, std::integral_constant<bool, has_compressed_storage<Lhs>::value>());
  }

  // Direct pointer path: works for both compressed and non-compressed storage.
  static void runImpl(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res,
                      const typename Res::Scalar& alpha, std::true_type /* has_compressed_storage */) {
    typedef typename Lhs::Scalar LhsScalar;
    typedef typename Lhs::StorageIndex StorageIndex;
    const Lhs& mat = lhs;
    const LhsScalar* vals = mat.valuePtr();
    const StorageIndex* inds = mat.innerIndexPtr();
    const auto* outer = mat.outerIndexPtr();
    const auto* innerNnz = mat.innerNonZeroPtr();
    for (Index j = 0; j < lhs.outerSize(); ++j) {
      typename Rhs::ConstRowXpr rhs_j(rhs.row(j));
      const Index start = outer[j];
      const Index end = innerNnz ? outer[j] + innerNnz[j] : outer[j + 1];
      for (Index k = start; k < end; ++k) res.row(inds[k]) += (alpha * vals[k]) * rhs_j;
    }
  }

  static void runImpl(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res,
                      const typename Res::Scalar& alpha, std::false_type /* has_compressed_storage */) {
    evaluator<Lhs> lhsEval(lhs);
    for (Index j = 0; j < lhs.outerSize(); ++j) {
      typename Rhs::ConstRowXpr rhs_j(rhs.row(j));