From 8eabfb5342feac323968944fde789f716bec83ec Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen <4643818-rmlarsen1@users.noreply.gitlab.com> Date: Sun, 5 Apr 2026 18:53:11 -0700 Subject: [PATCH] Vectorize BLAS level 1/2 routines with Eigen expressions libeigen/eigen!2404 Co-authored-by: Rasmus Munk Larsen --- blas/level1_cplx_impl.h | 14 ++-- blas/level1_impl.h | 14 +++- blas/level1_real_impl.h | 12 ++- blas/level2_cplx_impl.h | 169 +++++++++++++++++++++------------------- blas/level2_impl.h | 124 ++++++++++++++++++++++------- blas/level2_real_impl.h | 165 ++++++++++++++++++++------------------- 6 files changed, 294 insertions(+), 204 deletions(-) diff --git a/blas/level1_cplx_impl.h b/blas/level1_cplx_impl.h index 3181a5038..68698dc1f 100644 --- a/blas/level1_cplx_impl.h +++ b/blas/level1_cplx_impl.h @@ -25,15 +25,19 @@ struct functor_traits { // computes the sum of magnitudes of all vector elements or, for a complex vector x, the sum // res = |Rex1| + |Imx1| + |Rex2| + |Imx2| + ... + |Rexn| + |Imxn|, where x is a vector of order n extern "C" RealScalar EIGEN_CAT(REAL_SCALAR_SUFFIX, EIGEN_BLAS_FUNC_NAME(asum))(int *n, RealScalar *px, int *incx) { - // std::cerr << "__asum " << *n << " " << *incx << "\n"; - Complex *x = reinterpret_cast(px); - if (*n <= 0) return 0; + // std::complex is layout-compatible with T[2], so we can reinterpret + // a complex vector of length n as a real vector of length 2*n and use + // the fully vectorized cwiseAbs().sum() path. if (*incx == 1) - return make_vector(x, *n).unaryExpr().sum(); - else + return make_vector(px, 2 * *n).cwiseAbs().sum(); + else { + // For non-unit stride, fall back to the scalar_norm1_op approach since + // the real components are not contiguous across complex elements. + Complex *x = reinterpret_cast(px); return make_vector(x, *n, std::abs(*incx)).unaryExpr().sum(); + } } extern "C" int EIGEN_CAT(i, EIGEN_BLAS_FUNC_NAME(amax))(int *n, RealScalar *px, int *incx) { diff --git a/blas/level1_impl.h b/blas/level1_impl.h index 085b35650..13cac93c2 100644 --- a/blas/level1_impl.h +++ b/blas/level1_impl.h @@ -69,15 +69,21 @@ EIGEN_BLAS_FUNC(copy)(int *n, RealScalar *px, int *incx, RealScalar *py, int *in // be careful, *incx==0 is allowed !! if (*incx == 1 && *incy == 1) make_vector(y, *n) = make_vector(x, *n); - else { - if (*incx < 0) x = x - (*n - 1) * (*incx); + else if (*incx == 0) { + // Broadcast: copy x[0] to all elements of y. if (*incy < 0) y = y - (*n - 1) * (*incy); for (int i = 0; i < *n; ++i) { *y = *x; - x += *incx; y += *incy; } - } + } else if (*incx > 0 && *incy > 0) + make_vector(y, *n, *incy) = make_vector(x, *n, *incx); + else if (*incx > 0 && *incy < 0) + make_vector(y, *n, -*incy).reverse() = make_vector(x, *n, *incx); + else if (*incx < 0 && *incy > 0) + make_vector(y, *n, *incy) = make_vector(x, *n, -*incx).reverse(); + else if (*incx < 0 && *incy < 0) + make_vector(y, *n, -*incy) = make_vector(x, *n, -*incx); } EIGEN_BLAS_FUNC(rotg)(RealScalar *pa, RealScalar *pb, RealScalar *pc, RealScalar *ps) { diff --git a/blas/level1_real_impl.h b/blas/level1_real_impl.h index 074883d3e..cc003a518 100644 --- a/blas/level1_real_impl.h +++ b/blas/level1_real_impl.h @@ -58,23 +58,21 @@ extern "C" Scalar EIGEN_BLAS_FUNC_NAME(dot)(int *n, Scalar *px, int *incx, Scala Scalar *y = reinterpret_cast(py); if (*incx == 1 && *incy == 1) - return (make_vector(x, *n).cwiseProduct(make_vector(y, *n))).sum(); + return make_vector(x, *n).dot(make_vector(y, *n)); else if (*incx > 0 && *incy > 0) - return (make_vector(x, *n, *incx).cwiseProduct(make_vector(y, *n, *incy))).sum(); + return make_vector(x, *n, *incx).dot(make_vector(y, *n, *incy)); else if (*incx < 0 && *incy > 0) - return (make_vector(x, *n, -*incx).reverse().cwiseProduct(make_vector(y, *n, *incy))).sum(); + return make_vector(x, *n, -*incx).reverse().dot(make_vector(y, *n, *incy)); else if (*incx > 0 && *incy < 0) - return (make_vector(x, *n, *incx).cwiseProduct(make_vector(y, *n, -*incy).reverse())).sum(); + return make_vector(x, *n, *incx).dot(make_vector(y, *n, -*incy).reverse()); else if (*incx < 0 && *incy < 0) - return (make_vector(x, *n, -*incx).reverse().cwiseProduct(make_vector(y, *n, -*incy).reverse())).sum(); + return make_vector(x, *n, -*incx).reverse().dot(make_vector(y, *n, -*incy).reverse()); else return 0; } // computes the Euclidean norm of a vector. -// FIXME extern "C" Scalar EIGEN_BLAS_FUNC_NAME(nrm2)(int *n, Scalar *px, int *incx) { - // std::cerr << "_nrm2 " << *n << " " << *incx << "\n"; if (*n <= 0) return 0; Scalar *x = reinterpret_cast(px); diff --git a/blas/level2_cplx_impl.h b/blas/level2_cplx_impl.h index f6172400e..f330975b4 100644 --- a/blas/level2_cplx_impl.h +++ b/blas/level2_cplx_impl.h @@ -106,64 +106,79 @@ EIGEN_BLAS_FUNC(hbmv) if (*n == 0 || (alpha == Scalar(0) && beta == Scalar(1))) return; - int kx = *incx > 0 ? 0 : (1 - *n) * *incx; - int ky = *incy > 0 ? 0 : (1 - *n) * *incy; + const Scalar *actual_x = get_compact_vector(x, *n, *incx); + Scalar *actual_y = get_compact_vector(y, *n, *incy); // First form y := beta*y. if (beta != Scalar(1)) { - int iy = ky; - for (int i = 0; i < *n; ++i) { - y[iy] = (beta == Scalar(0)) ? Scalar(0) : beta * y[iy]; - iy += *incy; - } + if (beta == Scalar(0)) + make_vector(actual_y, *n).setZero(); + else + make_vector(actual_y, *n) *= beta; } - if (alpha == Scalar(0)) return; + if (alpha == Scalar(0)) { + if (actual_x != x) delete[] actual_x; + if (actual_y != y) delete[] copy_back(actual_y, y, *n, *incy); + return; + } - if (UPLO(*uplo) == UP) { - // Upper triangle: A[i,j] at a[(k+i-j) + j*lda], diagonal at row k. - int jx = kx, jy = ky; - for (int j = 0; j < *n; ++j) { - Scalar temp1 = alpha * x[jx]; - Scalar temp2 = Scalar(0); - int ix = kx, iy = ky; - for (int i = std::max(0, j - *k); i < j; ++i) { - Scalar aij = a[(*k + i - j) + j * *lda]; - y[iy] += temp1 * aij; - temp2 += Eigen::numext::conj(aij) * x[ix]; - ix += *incx; - iy += *incy; + if (*k >= 8) { + // Vectorized path: use Eigen Map segments for the inner band operations. + ConstMatrixType band(a, *k + 1, *n, *lda); + if (UPLO(*uplo) == UP) { + for (int j = 0; j < *n; ++j) { + int start = std::max(0, j - *k); + int len = j - start; + int offset = *k - (j - start); + Scalar temp1 = alpha * actual_x[j]; + actual_y[j] += Scalar(Eigen::numext::real(band(*k, j))) * temp1; + if (len > 0) { + make_vector(actual_y + start, len) += temp1 * band.col(j).segment(offset, len); + actual_y[j] += alpha * band.col(j).segment(offset, len).dot(make_vector(actual_x + start, len)); + } } - // Diagonal is real. - y[jy] += Scalar(Eigen::numext::real(a[*k + j * *lda])) * temp1 + alpha * temp2; - jx += *incx; - jy += *incy; - if (j >= *k) { - kx += *incx; - ky += *incy; + } else { + for (int j = 0; j < *n; ++j) { + int len = std::min(*n - 1, j + *k) - j; + Scalar temp1 = alpha * actual_x[j]; + actual_y[j] += Scalar(Eigen::numext::real(band(0, j))) * temp1; + if (len > 0) { + make_vector(actual_y + j + 1, len) += temp1 * band.col(j).segment(1, len); + actual_y[j] += alpha * band.col(j).segment(1, len).dot(make_vector(actual_x + j + 1, len)); + } } } } else { - // Lower triangle: A[i,j] at a[(i-j) + j*lda], diagonal at row 0. - int jx = kx, jy = ky; - for (int j = 0; j < *n; ++j) { - Scalar temp1 = alpha * x[jx]; - Scalar temp2 = Scalar(0); - // Diagonal is real. - y[jy] += Scalar(Eigen::numext::real(a[j * *lda])) * temp1; - int ix = jx, iy = jy; - for (int i = j + 1; i <= std::min(*n - 1, j + *k); ++i) { - ix += *incx; - iy += *incy; - Scalar aij = a[(i - j) + j * *lda]; - y[iy] += temp1 * aij; - temp2 += Eigen::numext::conj(aij) * x[ix]; + // Scalar path: for narrow bandwidth, avoid Map overhead. + if (UPLO(*uplo) == UP) { + for (int j = 0; j < *n; ++j) { + Scalar temp1 = alpha * actual_x[j]; + Scalar temp2 = Scalar(0); + for (int i = std::max(0, j - *k); i < j; ++i) { + Scalar aij = a[(*k + i - j) + j * *lda]; + actual_y[i] += temp1 * aij; + temp2 += Eigen::numext::conj(aij) * actual_x[i]; + } + actual_y[j] += Scalar(Eigen::numext::real(a[*k + j * *lda])) * temp1 + alpha * temp2; + } + } else { + for (int j = 0; j < *n; ++j) { + Scalar temp1 = alpha * actual_x[j]; + Scalar temp2 = Scalar(0); + actual_y[j] += Scalar(Eigen::numext::real(a[j * *lda])) * temp1; + for (int i = j + 1; i <= std::min(*n - 1, j + *k); ++i) { + Scalar aij = a[(i - j) + j * *lda]; + actual_y[i] += temp1 * aij; + temp2 += Eigen::numext::conj(aij) * actual_x[i]; + } + actual_y[j] += alpha * temp2; } - y[jy] += alpha * temp2; - jx += *incx; - jy += *incy; } } + + if (actual_x != x) delete[] actual_x; + if (actual_y != y) delete[] copy_back(actual_y, y, *n, *incy); } /** HPMV performs the matrix-vector operation @@ -196,61 +211,53 @@ EIGEN_BLAS_FUNC(hpmv) if (*n == 0 || (alpha == Scalar(0) && beta == Scalar(1))) return; - int kx = *incx > 0 ? 0 : (1 - *n) * *incx; - int ky = *incy > 0 ? 0 : (1 - *n) * *incy; + const Scalar *actual_x = get_compact_vector(x, *n, *incx); + Scalar *actual_y = get_compact_vector(y, *n, *incy); // First form y := beta*y. if (beta != Scalar(1)) { - int iy = ky; - for (int i = 0; i < *n; ++i) { - y[iy] = (beta == Scalar(0)) ? Scalar(0) : beta * y[iy]; - iy += *incy; - } + if (beta == Scalar(0)) + make_vector(actual_y, *n).setZero(); + else + make_vector(actual_y, *n) *= beta; } - if (alpha == Scalar(0)) return; + if (alpha == Scalar(0)) { + if (actual_x != x) delete[] actual_x; + if (actual_y != y) delete[] copy_back(actual_y, y, *n, *incy); + return; + } int kk = 0; if (UPLO(*uplo) == UP) { - // Upper triangle packed. - int jx = kx, jy = ky; + // Upper triangle packed: column j occupies ap[kk..kk+j]. for (int j = 0; j < *n; ++j) { - Scalar temp1 = alpha * x[jx]; - Scalar temp2 = Scalar(0); - int ix = kx, iy = ky; - for (int i = 0; i < j; ++i) { - y[iy] += temp1 * ap[kk + i]; - temp2 += Eigen::numext::conj(ap[kk + i]) * x[ix]; - ix += *incx; - iy += *incy; - } + Scalar temp1 = alpha * actual_x[j]; // Diagonal is real. - y[jy] += Scalar(Eigen::numext::real(ap[kk + j])) * temp1 + alpha * temp2; - jx += *incx; - jy += *incy; + actual_y[j] += Scalar(Eigen::numext::real(ap[kk + j])) * temp1; + if (j > 0) { + make_vector(actual_y, j) += temp1 * make_vector(ap + kk, j); + actual_y[j] += alpha * make_vector(ap + kk, j).dot(make_vector(actual_x, j)); + } kk += j + 1; } } else { - // Lower triangle packed. - int jx = kx, jy = ky; + // Lower triangle packed: column j occupies ap[kk..kk+(n-j-1)]. for (int j = 0; j < *n; ++j) { - Scalar temp1 = alpha * x[jx]; - Scalar temp2 = Scalar(0); + int len = *n - j - 1; + Scalar temp1 = alpha * actual_x[j]; // Diagonal is real. - y[jy] += Scalar(Eigen::numext::real(ap[kk])) * temp1; - int ix = jx, iy = jy; - for (int i = 1; i < *n - j; ++i) { - ix += *incx; - iy += *incy; - y[iy] += temp1 * ap[kk + i]; - temp2 += Eigen::numext::conj(ap[kk + i]) * x[ix]; + actual_y[j] += Scalar(Eigen::numext::real(ap[kk])) * temp1; + if (len > 0) { + make_vector(actual_y + j + 1, len) += temp1 * make_vector(ap + kk + 1, len); + actual_y[j] += alpha * make_vector(ap + kk + 1, len).dot(make_vector(actual_x + j + 1, len)); } - y[jy] += alpha * temp2; - jx += *incx; - jy += *incy; kk += *n - j; } } + + if (actual_x != x) delete[] actual_x; + if (actual_y != y) delete[] copy_back(actual_y, y, *n, *incy); } /** ZHPR performs the hermitian rank 1 operation diff --git a/blas/level2_impl.h b/blas/level2_impl.h index f8394eedd..9165d7d70 100644 --- a/blas/level2_impl.h +++ b/blas/level2_impl.h @@ -343,46 +343,112 @@ EIGEN_BLAS_FUNC(tbmv) int op = OP(*opa); bool unit = (DIAG(*diag) == UNIT); - if (op == NOTR) { - if (upper) { - // x := A*x, upper band. Process columns left to right. - for (int j = 0; j < *n; ++j) { - if (actual_x[j] != Scalar(0)) { + if (*k >= 8) { + // Vectorized path: use Eigen Map segments for the inner band operations. + ConstMatrixType band(a, *k + 1, *n, *lda); + if (op == NOTR) { + if (upper) { + for (int j = 0; j < *n; ++j) { + if (actual_x[j] != Scalar(0)) { + int start = std::max(0, j - *k); + int len = j - start; + int offset = *k - (j - start); + Scalar temp = actual_x[j]; + if (len > 0) make_vector(actual_x + start, len) += temp * band.col(j).segment(offset, len); + if (!unit) actual_x[j] = temp * band(*k, j); + } + } + } else { + for (int j = *n - 1; j >= 0; --j) { + if (actual_x[j] != Scalar(0)) { + int len = std::min(*n - 1, j + *k) - j; + Scalar temp = actual_x[j]; + if (len > 0) make_vector(actual_x + j + 1, len) += temp * band.col(j).segment(1, len); + if (!unit) actual_x[j] = temp * band(0, j); + } + } + } + } else if (op == TR) { + if (upper) { + for (int j = *n - 1; j >= 0; --j) { + int start = std::max(0, j - *k); + int len = j - start; + int offset = *k - (j - start); Scalar temp = actual_x[j]; - for (int i = std::max(0, j - *k); i < j; ++i) actual_x[i] += temp * a[(*k + i - j) + j * *lda]; - if (!unit) actual_x[j] = temp * a[*k + j * *lda]; + if (!unit) temp *= band(*k, j); + if (len > 0) + temp += (band.col(j).segment(offset, len).cwiseProduct(make_vector(actual_x + start, len))).sum(); + actual_x[j] = temp; + } + } else { + for (int j = 0; j < *n; ++j) { + int len = std::min(*n - 1, j + *k) - j; + Scalar temp = actual_x[j]; + if (!unit) temp *= band(0, j); + if (len > 0) temp += (band.col(j).segment(1, len).cwiseProduct(make_vector(actual_x + j + 1, len))).sum(); + actual_x[j] = temp; } } } else { - // x := A*x, lower band. Process columns right to left. - for (int j = *n - 1; j >= 0; --j) { - if (actual_x[j] != Scalar(0)) { + // Conjugate transpose: .dot() computes conj(lhs) . rhs. + if (upper) { + for (int j = *n - 1; j >= 0; --j) { + int start = std::max(0, j - *k); + int len = j - start; + int offset = *k - (j - start); Scalar temp = actual_x[j]; - for (int i = j + 1; i <= std::min(*n - 1, j + *k); ++i) actual_x[i] += temp * a[(i - j) + j * *lda]; - if (!unit) actual_x[j] = temp * a[j * *lda]; + if (!unit) temp *= Eigen::numext::conj(band(*k, j)); + if (len > 0) temp += band.col(j).segment(offset, len).dot(make_vector(actual_x + start, len)); + actual_x[j] = temp; + } + } else { + for (int j = 0; j < *n; ++j) { + int len = std::min(*n - 1, j + *k) - j; + Scalar temp = actual_x[j]; + if (!unit) temp *= Eigen::numext::conj(band(0, j)); + if (len > 0) temp += band.col(j).segment(1, len).dot(make_vector(actual_x + j + 1, len)); + actual_x[j] = temp; } } } } else { - // Transpose or conjugate transpose. - bool do_conj = (op == ADJ); - auto maybe_conj = [do_conj](Scalar val) -> Scalar { return do_conj ? Eigen::numext::conj(val) : val; }; - - if (upper) { - // x := op(A)*x, upper band. Process columns right to left. - for (int j = *n - 1; j >= 0; --j) { - Scalar temp = actual_x[j]; - if (!unit) temp *= maybe_conj(a[*k + j * *lda]); - for (int i = std::max(0, j - *k); i < j; ++i) temp += maybe_conj(a[(*k + i - j) + j * *lda]) * actual_x[i]; - actual_x[j] = temp; + // Scalar path: for narrow bandwidth, avoid Map overhead. + if (op == NOTR) { + if (upper) { + for (int j = 0; j < *n; ++j) { + if (actual_x[j] != Scalar(0)) { + Scalar temp = actual_x[j]; + for (int i = std::max(0, j - *k); i < j; ++i) actual_x[i] += temp * a[(*k + i - j) + j * *lda]; + if (!unit) actual_x[j] = temp * a[*k + j * *lda]; + } + } + } else { + for (int j = *n - 1; j >= 0; --j) { + if (actual_x[j] != Scalar(0)) { + Scalar temp = actual_x[j]; + for (int i = j + 1; i <= std::min(*n - 1, j + *k); ++i) actual_x[i] += temp * a[(i - j) + j * *lda]; + if (!unit) actual_x[j] = temp * a[j * *lda]; + } + } } } else { - // x := op(A)*x, lower band. Process columns left to right. - for (int j = 0; j < *n; ++j) { - Scalar temp = actual_x[j]; - if (!unit) temp *= maybe_conj(a[j * *lda]); - for (int i = j + 1; i <= std::min(*n - 1, j + *k); ++i) temp += maybe_conj(a[(i - j) + j * *lda]) * actual_x[i]; - actual_x[j] = temp; + // Transpose or conjugate transpose. + auto maybe_conj = [op](Scalar val) -> Scalar { return op == ADJ ? Eigen::numext::conj(val) : val; }; + if (upper) { + for (int j = *n - 1; j >= 0; --j) { + Scalar temp = actual_x[j]; + if (!unit) temp *= maybe_conj(a[*k + j * *lda]); + for (int i = std::max(0, j - *k); i < j; ++i) temp += maybe_conj(a[(*k + i - j) + j * *lda]) * actual_x[i]; + actual_x[j] = temp; + } + } else { + for (int j = 0; j < *n; ++j) { + Scalar temp = actual_x[j]; + if (!unit) temp *= maybe_conj(a[j * *lda]); + for (int i = j + 1; i <= std::min(*n - 1, j + *k); ++i) + temp += maybe_conj(a[(i - j) + j * *lda]) * actual_x[i]; + actual_x[j] = temp; + } } } } diff --git a/blas/level2_real_impl.h b/blas/level2_real_impl.h index 6072007ff..3cf31b246 100644 --- a/blas/level2_real_impl.h +++ b/blas/level2_real_impl.h @@ -194,62 +194,79 @@ EIGEN_BLAS_FUNC(sbmv) if (*n == 0 || (alpha == Scalar(0) && beta == Scalar(1))) return; - int kx = *incx > 0 ? 0 : (1 - *n) * *incx; - int ky = *incy > 0 ? 0 : (1 - *n) * *incy; + const Scalar *actual_x = get_compact_vector(x, *n, *incx); + Scalar *actual_y = get_compact_vector(y, *n, *incy); // First form y := beta*y. if (beta != Scalar(1)) { - int iy = ky; - for (int i = 0; i < *n; ++i) { - y[iy] = (beta == Scalar(0)) ? Scalar(0) : beta * y[iy]; - iy += *incy; - } + if (beta == Scalar(0)) + make_vector(actual_y, *n).setZero(); + else + make_vector(actual_y, *n) *= beta; } - if (alpha == Scalar(0)) return; + if (alpha == Scalar(0)) { + if (actual_x != x) delete[] actual_x; + if (actual_y != y) delete[] copy_back(actual_y, y, *n, *incy); + return; + } - if (UPLO(*uplo) == UP) { - // Upper triangle: A[i,j] at a[(k+i-j) + j*lda], diagonal at row k. - int jx = kx, jy = ky; - for (int j = 0; j < *n; ++j) { - Scalar temp1 = alpha * x[jx]; - Scalar temp2 = Scalar(0); - int ix = kx, iy = ky; - for (int i = std::max(0, j - *k); i < j; ++i) { - Scalar aij = a[(*k + i - j) + j * *lda]; - y[iy] += temp1 * aij; - temp2 += aij * x[ix]; - ix += *incx; - iy += *incy; + if (*k >= 8) { + // Vectorized path: use Eigen Map segments for the inner band operations. + ConstMatrixType band(a, *k + 1, *n, *lda); + if (UPLO(*uplo) == UP) { + for (int j = 0; j < *n; ++j) { + int start = std::max(0, j - *k); + int len = j - start; + int offset = *k - (j - start); + Scalar temp1 = alpha * actual_x[j]; + actual_y[j] += temp1 * band(*k, j); + if (len > 0) { + make_vector(actual_y + start, len) += temp1 * band.col(j).segment(offset, len); + actual_y[j] += alpha * band.col(j).segment(offset, len).dot(make_vector(actual_x + start, len)); + } } - y[jy] += temp1 * a[*k + j * *lda] + alpha * temp2; - jx += *incx; - jy += *incy; - if (j >= *k) { - kx += *incx; - ky += *incy; + } else { + for (int j = 0; j < *n; ++j) { + int len = std::min(*n - 1, j + *k) - j; + Scalar temp1 = alpha * actual_x[j]; + actual_y[j] += temp1 * band(0, j); + if (len > 0) { + make_vector(actual_y + j + 1, len) += temp1 * band.col(j).segment(1, len); + actual_y[j] += alpha * band.col(j).segment(1, len).dot(make_vector(actual_x + j + 1, len)); + } } } } else { - // Lower triangle: A[i,j] at a[(i-j) + j*lda], diagonal at row 0. - int jx = kx, jy = ky; - for (int j = 0; j < *n; ++j) { - Scalar temp1 = alpha * x[jx]; - Scalar temp2 = Scalar(0); - y[jy] += temp1 * a[j * *lda]; - int ix = jx, iy = jy; - for (int i = j + 1; i <= std::min(*n - 1, j + *k); ++i) { - ix += *incx; - iy += *incy; - Scalar aij = a[(i - j) + j * *lda]; - y[iy] += temp1 * aij; - temp2 += aij * x[ix]; + // Scalar path: for narrow bandwidth, avoid Map overhead. + if (UPLO(*uplo) == UP) { + for (int j = 0; j < *n; ++j) { + Scalar temp1 = alpha * actual_x[j]; + Scalar temp2 = Scalar(0); + for (int i = std::max(0, j - *k); i < j; ++i) { + Scalar aij = a[(*k + i - j) + j * *lda]; + actual_y[i] += temp1 * aij; + temp2 += aij * actual_x[i]; + } + actual_y[j] += temp1 * a[*k + j * *lda] + alpha * temp2; + } + } else { + for (int j = 0; j < *n; ++j) { + Scalar temp1 = alpha * actual_x[j]; + Scalar temp2 = Scalar(0); + actual_y[j] += temp1 * a[j * *lda]; + for (int i = j + 1; i <= std::min(*n - 1, j + *k); ++i) { + Scalar aij = a[(i - j) + j * *lda]; + actual_y[i] += temp1 * aij; + temp2 += aij * actual_x[i]; + } + actual_y[j] += alpha * temp2; } - y[jy] += alpha * temp2; - jx += *incx; - jy += *incy; } } + + if (actual_x != x) delete[] actual_x; + if (actual_y != y) delete[] copy_back(actual_y, y, *n, *incy); } /** SPMV performs the matrix-vector operation @@ -285,59 +302,51 @@ EIGEN_BLAS_FUNC(spmv) if (*n == 0 || (alpha == Scalar(0) && beta == Scalar(1))) return; - int kx = *incx > 0 ? 0 : (1 - *n) * *incx; - int ky = *incy > 0 ? 0 : (1 - *n) * *incy; + const Scalar *actual_x = get_compact_vector(x, *n, *incx); + Scalar *actual_y = get_compact_vector(y, *n, *incy); // First form y := beta*y. if (beta != Scalar(1)) { - int iy = ky; - for (int i = 0; i < *n; ++i) { - y[iy] = (beta == Scalar(0)) ? Scalar(0) : beta * y[iy]; - iy += *incy; - } + if (beta == Scalar(0)) + make_vector(actual_y, *n).setZero(); + else + make_vector(actual_y, *n) *= beta; } - if (alpha == Scalar(0)) return; + if (alpha == Scalar(0)) { + if (actual_x != x) delete[] actual_x; + if (actual_y != y) delete[] copy_back(actual_y, y, *n, *incy); + return; + } int kk = 0; if (UPLO(*uplo) == UP) { - // Upper triangle packed. - int jx = kx, jy = ky; + // Upper triangle packed: column j occupies ap[kk..kk+j]. for (int j = 0; j < *n; ++j) { - Scalar temp1 = alpha * x[jx]; - Scalar temp2 = Scalar(0); - int ix = kx, iy = ky; - for (int i = 0; i < j; ++i) { - y[iy] += temp1 * ap[kk + i]; - temp2 += ap[kk + i] * x[ix]; - ix += *incx; - iy += *incy; + Scalar temp1 = alpha * actual_x[j]; + actual_y[j] += temp1 * ap[kk + j]; + if (j > 0) { + make_vector(actual_y, j) += temp1 * make_vector(ap + kk, j); + actual_y[j] += alpha * make_vector(ap + kk, j).dot(make_vector(actual_x, j)); } - y[jy] += temp1 * ap[kk + j] + alpha * temp2; - jx += *incx; - jy += *incy; kk += j + 1; } } else { - // Lower triangle packed. - int jx = kx, jy = ky; + // Lower triangle packed: column j occupies ap[kk..kk+(n-j-1)]. for (int j = 0; j < *n; ++j) { - Scalar temp1 = alpha * x[jx]; - Scalar temp2 = Scalar(0); - y[jy] += temp1 * ap[kk]; - int ix = jx, iy = jy; - for (int i = 1; i < *n - j; ++i) { - ix += *incx; - iy += *incy; - y[iy] += temp1 * ap[kk + i]; - temp2 += ap[kk + i] * x[ix]; + int len = *n - j - 1; + Scalar temp1 = alpha * actual_x[j]; + actual_y[j] += temp1 * ap[kk]; + if (len > 0) { + make_vector(actual_y + j + 1, len) += temp1 * make_vector(ap + kk + 1, len); + actual_y[j] += alpha * make_vector(ap + kk + 1, len).dot(make_vector(actual_x + j + 1, len)); } - y[jy] += alpha * temp2; - jx += *incx; - jy += *incy; kk += *n - j; } } + + if (actual_x != x) delete[] actual_x; + if (actual_y != y) delete[] copy_back(actual_y, y, *n, *incy); } /** DSPR performs the symmetric rank 1 operation