Vectorize BLAS level 1/2 routines with Eigen expressions

libeigen/eigen!2404

Co-authored-by: Rasmus Munk Larsen <rmlarsen@gmail.com>
This commit is contained in:
Rasmus Munk Larsen
2026-04-05 18:53:11 -07:00
parent 4ad90a60f1
commit 8eabfb5342
6 changed files with 294 additions and 204 deletions

View File

@@ -25,15 +25,19 @@ struct functor_traits<scalar_norm1_op> {
// computes the sum of magnitudes of all vector elements or, for a complex vector x, the sum
// res = |Rex1| + |Imx1| + |Rex2| + |Imx2| + ... + |Rexn| + |Imxn|, where x is a vector of order n
extern "C" RealScalar EIGEN_CAT(REAL_SCALAR_SUFFIX, EIGEN_BLAS_FUNC_NAME(asum))(int *n, RealScalar *px, int *incx) {
// std::cerr << "__asum " << *n << " " << *incx << "\n";
Complex *x = reinterpret_cast<Complex *>(px);
if (*n <= 0) return 0;
// std::complex<T> is layout-compatible with T[2], so we can reinterpret
// a complex vector of length n as a real vector of length 2*n and use
// the fully vectorized cwiseAbs().sum() path.
if (*incx == 1)
return make_vector(x, *n).unaryExpr<scalar_norm1_op>().sum();
else
return make_vector(px, 2 * *n).cwiseAbs().sum();
else {
// For non-unit stride, fall back to the scalar_norm1_op approach since
// the real components are not contiguous across complex elements.
Complex *x = reinterpret_cast<Complex *>(px);
return make_vector(x, *n, std::abs(*incx)).unaryExpr<scalar_norm1_op>().sum();
}
}
extern "C" int EIGEN_CAT(i, EIGEN_BLAS_FUNC_NAME(amax))(int *n, RealScalar *px, int *incx) {

View File

@@ -69,15 +69,21 @@ EIGEN_BLAS_FUNC(copy)(int *n, RealScalar *px, int *incx, RealScalar *py, int *in
// be careful, *incx==0 is allowed !!
if (*incx == 1 && *incy == 1)
make_vector(y, *n) = make_vector(x, *n);
else {
if (*incx < 0) x = x - (*n - 1) * (*incx);
else if (*incx == 0) {
// Broadcast: copy x[0] to all elements of y.
if (*incy < 0) y = y - (*n - 1) * (*incy);
for (int i = 0; i < *n; ++i) {
*y = *x;
x += *incx;
y += *incy;
}
}
} else if (*incx > 0 && *incy > 0)
make_vector(y, *n, *incy) = make_vector(x, *n, *incx);
else if (*incx > 0 && *incy < 0)
make_vector(y, *n, -*incy).reverse() = make_vector(x, *n, *incx);
else if (*incx < 0 && *incy > 0)
make_vector(y, *n, *incy) = make_vector(x, *n, -*incx).reverse();
else if (*incx < 0 && *incy < 0)
make_vector(y, *n, -*incy) = make_vector(x, *n, -*incx);
}
EIGEN_BLAS_FUNC(rotg)(RealScalar *pa, RealScalar *pb, RealScalar *pc, RealScalar *ps) {

View File

@@ -58,23 +58,21 @@ extern "C" Scalar EIGEN_BLAS_FUNC_NAME(dot)(int *n, Scalar *px, int *incx, Scala
Scalar *y = reinterpret_cast<Scalar *>(py);
if (*incx == 1 && *incy == 1)
return (make_vector(x, *n).cwiseProduct(make_vector(y, *n))).sum();
return make_vector(x, *n).dot(make_vector(y, *n));
else if (*incx > 0 && *incy > 0)
return (make_vector(x, *n, *incx).cwiseProduct(make_vector(y, *n, *incy))).sum();
return make_vector(x, *n, *incx).dot(make_vector(y, *n, *incy));
else if (*incx < 0 && *incy > 0)
return (make_vector(x, *n, -*incx).reverse().cwiseProduct(make_vector(y, *n, *incy))).sum();
return make_vector(x, *n, -*incx).reverse().dot(make_vector(y, *n, *incy));
else if (*incx > 0 && *incy < 0)
return (make_vector(x, *n, *incx).cwiseProduct(make_vector(y, *n, -*incy).reverse())).sum();
return make_vector(x, *n, *incx).dot(make_vector(y, *n, -*incy).reverse());
else if (*incx < 0 && *incy < 0)
return (make_vector(x, *n, -*incx).reverse().cwiseProduct(make_vector(y, *n, -*incy).reverse())).sum();
return make_vector(x, *n, -*incx).reverse().dot(make_vector(y, *n, -*incy).reverse());
else
return 0;
}
// computes the Euclidean norm of a vector.
// FIXME
extern "C" Scalar EIGEN_BLAS_FUNC_NAME(nrm2)(int *n, Scalar *px, int *incx) {
// std::cerr << "_nrm2 " << *n << " " << *incx << "\n";
if (*n <= 0) return 0;
Scalar *x = reinterpret_cast<Scalar *>(px);

View File

@@ -106,64 +106,79 @@ EIGEN_BLAS_FUNC(hbmv)
if (*n == 0 || (alpha == Scalar(0) && beta == Scalar(1))) return;
int kx = *incx > 0 ? 0 : (1 - *n) * *incx;
int ky = *incy > 0 ? 0 : (1 - *n) * *incy;
const Scalar *actual_x = get_compact_vector(x, *n, *incx);
Scalar *actual_y = get_compact_vector(y, *n, *incy);
// First form y := beta*y.
if (beta != Scalar(1)) {
int iy = ky;
for (int i = 0; i < *n; ++i) {
y[iy] = (beta == Scalar(0)) ? Scalar(0) : beta * y[iy];
iy += *incy;
}
if (beta == Scalar(0))
make_vector(actual_y, *n).setZero();
else
make_vector(actual_y, *n) *= beta;
}
if (alpha == Scalar(0)) return;
if (alpha == Scalar(0)) {
if (actual_x != x) delete[] actual_x;
if (actual_y != y) delete[] copy_back(actual_y, y, *n, *incy);
return;
}
if (UPLO(*uplo) == UP) {
// Upper triangle: A[i,j] at a[(k+i-j) + j*lda], diagonal at row k.
int jx = kx, jy = ky;
for (int j = 0; j < *n; ++j) {
Scalar temp1 = alpha * x[jx];
Scalar temp2 = Scalar(0);
int ix = kx, iy = ky;
for (int i = std::max(0, j - *k); i < j; ++i) {
Scalar aij = a[(*k + i - j) + j * *lda];
y[iy] += temp1 * aij;
temp2 += Eigen::numext::conj(aij) * x[ix];
ix += *incx;
iy += *incy;
if (*k >= 8) {
// Vectorized path: use Eigen Map segments for the inner band operations.
ConstMatrixType band(a, *k + 1, *n, *lda);
if (UPLO(*uplo) == UP) {
for (int j = 0; j < *n; ++j) {
int start = std::max(0, j - *k);
int len = j - start;
int offset = *k - (j - start);
Scalar temp1 = alpha * actual_x[j];
actual_y[j] += Scalar(Eigen::numext::real(band(*k, j))) * temp1;
if (len > 0) {
make_vector(actual_y + start, len) += temp1 * band.col(j).segment(offset, len);
actual_y[j] += alpha * band.col(j).segment(offset, len).dot(make_vector(actual_x + start, len));
}
}
// Diagonal is real.
y[jy] += Scalar(Eigen::numext::real(a[*k + j * *lda])) * temp1 + alpha * temp2;
jx += *incx;
jy += *incy;
if (j >= *k) {
kx += *incx;
ky += *incy;
} else {
for (int j = 0; j < *n; ++j) {
int len = std::min(*n - 1, j + *k) - j;
Scalar temp1 = alpha * actual_x[j];
actual_y[j] += Scalar(Eigen::numext::real(band(0, j))) * temp1;
if (len > 0) {
make_vector(actual_y + j + 1, len) += temp1 * band.col(j).segment(1, len);
actual_y[j] += alpha * band.col(j).segment(1, len).dot(make_vector(actual_x + j + 1, len));
}
}
}
} else {
// Lower triangle: A[i,j] at a[(i-j) + j*lda], diagonal at row 0.
int jx = kx, jy = ky;
for (int j = 0; j < *n; ++j) {
Scalar temp1 = alpha * x[jx];
Scalar temp2 = Scalar(0);
// Diagonal is real.
y[jy] += Scalar(Eigen::numext::real(a[j * *lda])) * temp1;
int ix = jx, iy = jy;
for (int i = j + 1; i <= std::min(*n - 1, j + *k); ++i) {
ix += *incx;
iy += *incy;
Scalar aij = a[(i - j) + j * *lda];
y[iy] += temp1 * aij;
temp2 += Eigen::numext::conj(aij) * x[ix];
// Scalar path: for narrow bandwidth, avoid Map overhead.
if (UPLO(*uplo) == UP) {
for (int j = 0; j < *n; ++j) {
Scalar temp1 = alpha * actual_x[j];
Scalar temp2 = Scalar(0);
for (int i = std::max(0, j - *k); i < j; ++i) {
Scalar aij = a[(*k + i - j) + j * *lda];
actual_y[i] += temp1 * aij;
temp2 += Eigen::numext::conj(aij) * actual_x[i];
}
actual_y[j] += Scalar(Eigen::numext::real(a[*k + j * *lda])) * temp1 + alpha * temp2;
}
} else {
for (int j = 0; j < *n; ++j) {
Scalar temp1 = alpha * actual_x[j];
Scalar temp2 = Scalar(0);
actual_y[j] += Scalar(Eigen::numext::real(a[j * *lda])) * temp1;
for (int i = j + 1; i <= std::min(*n - 1, j + *k); ++i) {
Scalar aij = a[(i - j) + j * *lda];
actual_y[i] += temp1 * aij;
temp2 += Eigen::numext::conj(aij) * actual_x[i];
}
actual_y[j] += alpha * temp2;
}
y[jy] += alpha * temp2;
jx += *incx;
jy += *incy;
}
}
if (actual_x != x) delete[] actual_x;
if (actual_y != y) delete[] copy_back(actual_y, y, *n, *incy);
}
/** HPMV performs the matrix-vector operation
@@ -196,61 +211,53 @@ EIGEN_BLAS_FUNC(hpmv)
if (*n == 0 || (alpha == Scalar(0) && beta == Scalar(1))) return;
int kx = *incx > 0 ? 0 : (1 - *n) * *incx;
int ky = *incy > 0 ? 0 : (1 - *n) * *incy;
const Scalar *actual_x = get_compact_vector(x, *n, *incx);
Scalar *actual_y = get_compact_vector(y, *n, *incy);
// First form y := beta*y.
if (beta != Scalar(1)) {
int iy = ky;
for (int i = 0; i < *n; ++i) {
y[iy] = (beta == Scalar(0)) ? Scalar(0) : beta * y[iy];
iy += *incy;
}
if (beta == Scalar(0))
make_vector(actual_y, *n).setZero();
else
make_vector(actual_y, *n) *= beta;
}
if (alpha == Scalar(0)) return;
if (alpha == Scalar(0)) {
if (actual_x != x) delete[] actual_x;
if (actual_y != y) delete[] copy_back(actual_y, y, *n, *incy);
return;
}
int kk = 0;
if (UPLO(*uplo) == UP) {
// Upper triangle packed.
int jx = kx, jy = ky;
// Upper triangle packed: column j occupies ap[kk..kk+j].
for (int j = 0; j < *n; ++j) {
Scalar temp1 = alpha * x[jx];
Scalar temp2 = Scalar(0);
int ix = kx, iy = ky;
for (int i = 0; i < j; ++i) {
y[iy] += temp1 * ap[kk + i];
temp2 += Eigen::numext::conj(ap[kk + i]) * x[ix];
ix += *incx;
iy += *incy;
}
Scalar temp1 = alpha * actual_x[j];
// Diagonal is real.
y[jy] += Scalar(Eigen::numext::real(ap[kk + j])) * temp1 + alpha * temp2;
jx += *incx;
jy += *incy;
actual_y[j] += Scalar(Eigen::numext::real(ap[kk + j])) * temp1;
if (j > 0) {
make_vector(actual_y, j) += temp1 * make_vector(ap + kk, j);
actual_y[j] += alpha * make_vector(ap + kk, j).dot(make_vector(actual_x, j));
}
kk += j + 1;
}
} else {
// Lower triangle packed.
int jx = kx, jy = ky;
// Lower triangle packed: column j occupies ap[kk..kk+(n-j-1)].
for (int j = 0; j < *n; ++j) {
Scalar temp1 = alpha * x[jx];
Scalar temp2 = Scalar(0);
int len = *n - j - 1;
Scalar temp1 = alpha * actual_x[j];
// Diagonal is real.
y[jy] += Scalar(Eigen::numext::real(ap[kk])) * temp1;
int ix = jx, iy = jy;
for (int i = 1; i < *n - j; ++i) {
ix += *incx;
iy += *incy;
y[iy] += temp1 * ap[kk + i];
temp2 += Eigen::numext::conj(ap[kk + i]) * x[ix];
actual_y[j] += Scalar(Eigen::numext::real(ap[kk])) * temp1;
if (len > 0) {
make_vector(actual_y + j + 1, len) += temp1 * make_vector(ap + kk + 1, len);
actual_y[j] += alpha * make_vector(ap + kk + 1, len).dot(make_vector(actual_x + j + 1, len));
}
y[jy] += alpha * temp2;
jx += *incx;
jy += *incy;
kk += *n - j;
}
}
if (actual_x != x) delete[] actual_x;
if (actual_y != y) delete[] copy_back(actual_y, y, *n, *incy);
}
/** ZHPR performs the hermitian rank 1 operation

View File

@@ -343,46 +343,112 @@ EIGEN_BLAS_FUNC(tbmv)
int op = OP(*opa);
bool unit = (DIAG(*diag) == UNIT);
if (op == NOTR) {
if (upper) {
// x := A*x, upper band. Process columns left to right.
for (int j = 0; j < *n; ++j) {
if (actual_x[j] != Scalar(0)) {
if (*k >= 8) {
// Vectorized path: use Eigen Map segments for the inner band operations.
ConstMatrixType band(a, *k + 1, *n, *lda);
if (op == NOTR) {
if (upper) {
for (int j = 0; j < *n; ++j) {
if (actual_x[j] != Scalar(0)) {
int start = std::max(0, j - *k);
int len = j - start;
int offset = *k - (j - start);
Scalar temp = actual_x[j];
if (len > 0) make_vector(actual_x + start, len) += temp * band.col(j).segment(offset, len);
if (!unit) actual_x[j] = temp * band(*k, j);
}
}
} else {
for (int j = *n - 1; j >= 0; --j) {
if (actual_x[j] != Scalar(0)) {
int len = std::min(*n - 1, j + *k) - j;
Scalar temp = actual_x[j];
if (len > 0) make_vector(actual_x + j + 1, len) += temp * band.col(j).segment(1, len);
if (!unit) actual_x[j] = temp * band(0, j);
}
}
}
} else if (op == TR) {
if (upper) {
for (int j = *n - 1; j >= 0; --j) {
int start = std::max(0, j - *k);
int len = j - start;
int offset = *k - (j - start);
Scalar temp = actual_x[j];
for (int i = std::max(0, j - *k); i < j; ++i) actual_x[i] += temp * a[(*k + i - j) + j * *lda];
if (!unit) actual_x[j] = temp * a[*k + j * *lda];
if (!unit) temp *= band(*k, j);
if (len > 0)
temp += (band.col(j).segment(offset, len).cwiseProduct(make_vector(actual_x + start, len))).sum();
actual_x[j] = temp;
}
} else {
for (int j = 0; j < *n; ++j) {
int len = std::min(*n - 1, j + *k) - j;
Scalar temp = actual_x[j];
if (!unit) temp *= band(0, j);
if (len > 0) temp += (band.col(j).segment(1, len).cwiseProduct(make_vector(actual_x + j + 1, len))).sum();
actual_x[j] = temp;
}
}
} else {
// x := A*x, lower band. Process columns right to left.
for (int j = *n - 1; j >= 0; --j) {
if (actual_x[j] != Scalar(0)) {
// Conjugate transpose: .dot() computes conj(lhs) . rhs.
if (upper) {
for (int j = *n - 1; j >= 0; --j) {
int start = std::max(0, j - *k);
int len = j - start;
int offset = *k - (j - start);
Scalar temp = actual_x[j];
for (int i = j + 1; i <= std::min(*n - 1, j + *k); ++i) actual_x[i] += temp * a[(i - j) + j * *lda];
if (!unit) actual_x[j] = temp * a[j * *lda];
if (!unit) temp *= Eigen::numext::conj(band(*k, j));
if (len > 0) temp += band.col(j).segment(offset, len).dot(make_vector(actual_x + start, len));
actual_x[j] = temp;
}
} else {
for (int j = 0; j < *n; ++j) {
int len = std::min(*n - 1, j + *k) - j;
Scalar temp = actual_x[j];
if (!unit) temp *= Eigen::numext::conj(band(0, j));
if (len > 0) temp += band.col(j).segment(1, len).dot(make_vector(actual_x + j + 1, len));
actual_x[j] = temp;
}
}
}
} else {
// Transpose or conjugate transpose.
bool do_conj = (op == ADJ);
auto maybe_conj = [do_conj](Scalar val) -> Scalar { return do_conj ? Eigen::numext::conj(val) : val; };
if (upper) {
// x := op(A)*x, upper band. Process columns right to left.
for (int j = *n - 1; j >= 0; --j) {
Scalar temp = actual_x[j];
if (!unit) temp *= maybe_conj(a[*k + j * *lda]);
for (int i = std::max(0, j - *k); i < j; ++i) temp += maybe_conj(a[(*k + i - j) + j * *lda]) * actual_x[i];
actual_x[j] = temp;
// Scalar path: for narrow bandwidth, avoid Map overhead.
if (op == NOTR) {
if (upper) {
for (int j = 0; j < *n; ++j) {
if (actual_x[j] != Scalar(0)) {
Scalar temp = actual_x[j];
for (int i = std::max(0, j - *k); i < j; ++i) actual_x[i] += temp * a[(*k + i - j) + j * *lda];
if (!unit) actual_x[j] = temp * a[*k + j * *lda];
}
}
} else {
for (int j = *n - 1; j >= 0; --j) {
if (actual_x[j] != Scalar(0)) {
Scalar temp = actual_x[j];
for (int i = j + 1; i <= std::min(*n - 1, j + *k); ++i) actual_x[i] += temp * a[(i - j) + j * *lda];
if (!unit) actual_x[j] = temp * a[j * *lda];
}
}
}
} else {
// x := op(A)*x, lower band. Process columns left to right.
for (int j = 0; j < *n; ++j) {
Scalar temp = actual_x[j];
if (!unit) temp *= maybe_conj(a[j * *lda]);
for (int i = j + 1; i <= std::min(*n - 1, j + *k); ++i) temp += maybe_conj(a[(i - j) + j * *lda]) * actual_x[i];
actual_x[j] = temp;
// Transpose or conjugate transpose.
auto maybe_conj = [op](Scalar val) -> Scalar { return op == ADJ ? Eigen::numext::conj(val) : val; };
if (upper) {
for (int j = *n - 1; j >= 0; --j) {
Scalar temp = actual_x[j];
if (!unit) temp *= maybe_conj(a[*k + j * *lda]);
for (int i = std::max(0, j - *k); i < j; ++i) temp += maybe_conj(a[(*k + i - j) + j * *lda]) * actual_x[i];
actual_x[j] = temp;
}
} else {
for (int j = 0; j < *n; ++j) {
Scalar temp = actual_x[j];
if (!unit) temp *= maybe_conj(a[j * *lda]);
for (int i = j + 1; i <= std::min(*n - 1, j + *k); ++i)
temp += maybe_conj(a[(i - j) + j * *lda]) * actual_x[i];
actual_x[j] = temp;
}
}
}
}

View File

@@ -194,62 +194,79 @@ EIGEN_BLAS_FUNC(sbmv)
if (*n == 0 || (alpha == Scalar(0) && beta == Scalar(1))) return;
int kx = *incx > 0 ? 0 : (1 - *n) * *incx;
int ky = *incy > 0 ? 0 : (1 - *n) * *incy;
const Scalar *actual_x = get_compact_vector(x, *n, *incx);
Scalar *actual_y = get_compact_vector(y, *n, *incy);
// First form y := beta*y.
if (beta != Scalar(1)) {
int iy = ky;
for (int i = 0; i < *n; ++i) {
y[iy] = (beta == Scalar(0)) ? Scalar(0) : beta * y[iy];
iy += *incy;
}
if (beta == Scalar(0))
make_vector(actual_y, *n).setZero();
else
make_vector(actual_y, *n) *= beta;
}
if (alpha == Scalar(0)) return;
if (alpha == Scalar(0)) {
if (actual_x != x) delete[] actual_x;
if (actual_y != y) delete[] copy_back(actual_y, y, *n, *incy);
return;
}
if (UPLO(*uplo) == UP) {
// Upper triangle: A[i,j] at a[(k+i-j) + j*lda], diagonal at row k.
int jx = kx, jy = ky;
for (int j = 0; j < *n; ++j) {
Scalar temp1 = alpha * x[jx];
Scalar temp2 = Scalar(0);
int ix = kx, iy = ky;
for (int i = std::max(0, j - *k); i < j; ++i) {
Scalar aij = a[(*k + i - j) + j * *lda];
y[iy] += temp1 * aij;
temp2 += aij * x[ix];
ix += *incx;
iy += *incy;
if (*k >= 8) {
// Vectorized path: use Eigen Map segments for the inner band operations.
ConstMatrixType band(a, *k + 1, *n, *lda);
if (UPLO(*uplo) == UP) {
for (int j = 0; j < *n; ++j) {
int start = std::max(0, j - *k);
int len = j - start;
int offset = *k - (j - start);
Scalar temp1 = alpha * actual_x[j];
actual_y[j] += temp1 * band(*k, j);
if (len > 0) {
make_vector(actual_y + start, len) += temp1 * band.col(j).segment(offset, len);
actual_y[j] += alpha * band.col(j).segment(offset, len).dot(make_vector(actual_x + start, len));
}
}
y[jy] += temp1 * a[*k + j * *lda] + alpha * temp2;
jx += *incx;
jy += *incy;
if (j >= *k) {
kx += *incx;
ky += *incy;
} else {
for (int j = 0; j < *n; ++j) {
int len = std::min(*n - 1, j + *k) - j;
Scalar temp1 = alpha * actual_x[j];
actual_y[j] += temp1 * band(0, j);
if (len > 0) {
make_vector(actual_y + j + 1, len) += temp1 * band.col(j).segment(1, len);
actual_y[j] += alpha * band.col(j).segment(1, len).dot(make_vector(actual_x + j + 1, len));
}
}
}
} else {
// Lower triangle: A[i,j] at a[(i-j) + j*lda], diagonal at row 0.
int jx = kx, jy = ky;
for (int j = 0; j < *n; ++j) {
Scalar temp1 = alpha * x[jx];
Scalar temp2 = Scalar(0);
y[jy] += temp1 * a[j * *lda];
int ix = jx, iy = jy;
for (int i = j + 1; i <= std::min(*n - 1, j + *k); ++i) {
ix += *incx;
iy += *incy;
Scalar aij = a[(i - j) + j * *lda];
y[iy] += temp1 * aij;
temp2 += aij * x[ix];
// Scalar path: for narrow bandwidth, avoid Map overhead.
if (UPLO(*uplo) == UP) {
for (int j = 0; j < *n; ++j) {
Scalar temp1 = alpha * actual_x[j];
Scalar temp2 = Scalar(0);
for (int i = std::max(0, j - *k); i < j; ++i) {
Scalar aij = a[(*k + i - j) + j * *lda];
actual_y[i] += temp1 * aij;
temp2 += aij * actual_x[i];
}
actual_y[j] += temp1 * a[*k + j * *lda] + alpha * temp2;
}
} else {
for (int j = 0; j < *n; ++j) {
Scalar temp1 = alpha * actual_x[j];
Scalar temp2 = Scalar(0);
actual_y[j] += temp1 * a[j * *lda];
for (int i = j + 1; i <= std::min(*n - 1, j + *k); ++i) {
Scalar aij = a[(i - j) + j * *lda];
actual_y[i] += temp1 * aij;
temp2 += aij * actual_x[i];
}
actual_y[j] += alpha * temp2;
}
y[jy] += alpha * temp2;
jx += *incx;
jy += *incy;
}
}
if (actual_x != x) delete[] actual_x;
if (actual_y != y) delete[] copy_back(actual_y, y, *n, *incy);
}
/** SPMV performs the matrix-vector operation
@@ -285,59 +302,51 @@ EIGEN_BLAS_FUNC(spmv)
if (*n == 0 || (alpha == Scalar(0) && beta == Scalar(1))) return;
int kx = *incx > 0 ? 0 : (1 - *n) * *incx;
int ky = *incy > 0 ? 0 : (1 - *n) * *incy;
const Scalar *actual_x = get_compact_vector(x, *n, *incx);
Scalar *actual_y = get_compact_vector(y, *n, *incy);
// First form y := beta*y.
if (beta != Scalar(1)) {
int iy = ky;
for (int i = 0; i < *n; ++i) {
y[iy] = (beta == Scalar(0)) ? Scalar(0) : beta * y[iy];
iy += *incy;
}
if (beta == Scalar(0))
make_vector(actual_y, *n).setZero();
else
make_vector(actual_y, *n) *= beta;
}
if (alpha == Scalar(0)) return;
if (alpha == Scalar(0)) {
if (actual_x != x) delete[] actual_x;
if (actual_y != y) delete[] copy_back(actual_y, y, *n, *incy);
return;
}
int kk = 0;
if (UPLO(*uplo) == UP) {
// Upper triangle packed.
int jx = kx, jy = ky;
// Upper triangle packed: column j occupies ap[kk..kk+j].
for (int j = 0; j < *n; ++j) {
Scalar temp1 = alpha * x[jx];
Scalar temp2 = Scalar(0);
int ix = kx, iy = ky;
for (int i = 0; i < j; ++i) {
y[iy] += temp1 * ap[kk + i];
temp2 += ap[kk + i] * x[ix];
ix += *incx;
iy += *incy;
Scalar temp1 = alpha * actual_x[j];
actual_y[j] += temp1 * ap[kk + j];
if (j > 0) {
make_vector(actual_y, j) += temp1 * make_vector(ap + kk, j);
actual_y[j] += alpha * make_vector(ap + kk, j).dot(make_vector(actual_x, j));
}
y[jy] += temp1 * ap[kk + j] + alpha * temp2;
jx += *incx;
jy += *incy;
kk += j + 1;
}
} else {
// Lower triangle packed.
int jx = kx, jy = ky;
// Lower triangle packed: column j occupies ap[kk..kk+(n-j-1)].
for (int j = 0; j < *n; ++j) {
Scalar temp1 = alpha * x[jx];
Scalar temp2 = Scalar(0);
y[jy] += temp1 * ap[kk];
int ix = jx, iy = jy;
for (int i = 1; i < *n - j; ++i) {
ix += *incx;
iy += *incy;
y[iy] += temp1 * ap[kk + i];
temp2 += ap[kk + i] * x[ix];
int len = *n - j - 1;
Scalar temp1 = alpha * actual_x[j];
actual_y[j] += temp1 * ap[kk];
if (len > 0) {
make_vector(actual_y + j + 1, len) += temp1 * make_vector(ap + kk + 1, len);
actual_y[j] += alpha * make_vector(ap + kk + 1, len).dot(make_vector(actual_x + j + 1, len));
}
y[jy] += alpha * temp2;
jx += *incx;
jy += *incy;
kk += *n - j;
}
}
if (actual_x != x) delete[] actual_x;
if (actual_y != y) delete[] copy_back(actual_y, y, *n, *incy);
}
/** DSPR performs the symmetric rank 1 operation