mirror of
https://gitlab.com/libeigen/eigen.git
synced 2026-04-10 11:34:33 +08:00
Vectorize BLAS level 1/2 routines with Eigen expressions
libeigen/eigen!2404 Co-authored-by: Rasmus Munk Larsen <rmlarsen@gmail.com>
This commit is contained in:
@@ -25,15 +25,19 @@ struct functor_traits<scalar_norm1_op> {
|
||||
// computes the sum of magnitudes of all vector elements or, for a complex vector x, the sum
|
||||
// res = |Rex1| + |Imx1| + |Rex2| + |Imx2| + ... + |Rexn| + |Imxn|, where x is a vector of order n
|
||||
extern "C" RealScalar EIGEN_CAT(REAL_SCALAR_SUFFIX, EIGEN_BLAS_FUNC_NAME(asum))(int *n, RealScalar *px, int *incx) {
|
||||
// std::cerr << "__asum " << *n << " " << *incx << "\n";
|
||||
Complex *x = reinterpret_cast<Complex *>(px);
|
||||
|
||||
if (*n <= 0) return 0;
|
||||
|
||||
// std::complex<T> is layout-compatible with T[2], so we can reinterpret
|
||||
// a complex vector of length n as a real vector of length 2*n and use
|
||||
// the fully vectorized cwiseAbs().sum() path.
|
||||
if (*incx == 1)
|
||||
return make_vector(x, *n).unaryExpr<scalar_norm1_op>().sum();
|
||||
else
|
||||
return make_vector(px, 2 * *n).cwiseAbs().sum();
|
||||
else {
|
||||
// For non-unit stride, fall back to the scalar_norm1_op approach since
|
||||
// the real components are not contiguous across complex elements.
|
||||
Complex *x = reinterpret_cast<Complex *>(px);
|
||||
return make_vector(x, *n, std::abs(*incx)).unaryExpr<scalar_norm1_op>().sum();
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" int EIGEN_CAT(i, EIGEN_BLAS_FUNC_NAME(amax))(int *n, RealScalar *px, int *incx) {
|
||||
|
||||
@@ -69,15 +69,21 @@ EIGEN_BLAS_FUNC(copy)(int *n, RealScalar *px, int *incx, RealScalar *py, int *in
|
||||
// be careful, *incx==0 is allowed !!
|
||||
if (*incx == 1 && *incy == 1)
|
||||
make_vector(y, *n) = make_vector(x, *n);
|
||||
else {
|
||||
if (*incx < 0) x = x - (*n - 1) * (*incx);
|
||||
else if (*incx == 0) {
|
||||
// Broadcast: copy x[0] to all elements of y.
|
||||
if (*incy < 0) y = y - (*n - 1) * (*incy);
|
||||
for (int i = 0; i < *n; ++i) {
|
||||
*y = *x;
|
||||
x += *incx;
|
||||
y += *incy;
|
||||
}
|
||||
}
|
||||
} else if (*incx > 0 && *incy > 0)
|
||||
make_vector(y, *n, *incy) = make_vector(x, *n, *incx);
|
||||
else if (*incx > 0 && *incy < 0)
|
||||
make_vector(y, *n, -*incy).reverse() = make_vector(x, *n, *incx);
|
||||
else if (*incx < 0 && *incy > 0)
|
||||
make_vector(y, *n, *incy) = make_vector(x, *n, -*incx).reverse();
|
||||
else if (*incx < 0 && *incy < 0)
|
||||
make_vector(y, *n, -*incy) = make_vector(x, *n, -*incx);
|
||||
}
|
||||
|
||||
EIGEN_BLAS_FUNC(rotg)(RealScalar *pa, RealScalar *pb, RealScalar *pc, RealScalar *ps) {
|
||||
|
||||
@@ -58,23 +58,21 @@ extern "C" Scalar EIGEN_BLAS_FUNC_NAME(dot)(int *n, Scalar *px, int *incx, Scala
|
||||
Scalar *y = reinterpret_cast<Scalar *>(py);
|
||||
|
||||
if (*incx == 1 && *incy == 1)
|
||||
return (make_vector(x, *n).cwiseProduct(make_vector(y, *n))).sum();
|
||||
return make_vector(x, *n).dot(make_vector(y, *n));
|
||||
else if (*incx > 0 && *incy > 0)
|
||||
return (make_vector(x, *n, *incx).cwiseProduct(make_vector(y, *n, *incy))).sum();
|
||||
return make_vector(x, *n, *incx).dot(make_vector(y, *n, *incy));
|
||||
else if (*incx < 0 && *incy > 0)
|
||||
return (make_vector(x, *n, -*incx).reverse().cwiseProduct(make_vector(y, *n, *incy))).sum();
|
||||
return make_vector(x, *n, -*incx).reverse().dot(make_vector(y, *n, *incy));
|
||||
else if (*incx > 0 && *incy < 0)
|
||||
return (make_vector(x, *n, *incx).cwiseProduct(make_vector(y, *n, -*incy).reverse())).sum();
|
||||
return make_vector(x, *n, *incx).dot(make_vector(y, *n, -*incy).reverse());
|
||||
else if (*incx < 0 && *incy < 0)
|
||||
return (make_vector(x, *n, -*incx).reverse().cwiseProduct(make_vector(y, *n, -*incy).reverse())).sum();
|
||||
return make_vector(x, *n, -*incx).reverse().dot(make_vector(y, *n, -*incy).reverse());
|
||||
else
|
||||
return 0;
|
||||
}
|
||||
|
||||
// computes the Euclidean norm of a vector.
|
||||
// FIXME
|
||||
extern "C" Scalar EIGEN_BLAS_FUNC_NAME(nrm2)(int *n, Scalar *px, int *incx) {
|
||||
// std::cerr << "_nrm2 " << *n << " " << *incx << "\n";
|
||||
if (*n <= 0) return 0;
|
||||
|
||||
Scalar *x = reinterpret_cast<Scalar *>(px);
|
||||
|
||||
@@ -106,64 +106,79 @@ EIGEN_BLAS_FUNC(hbmv)
|
||||
|
||||
if (*n == 0 || (alpha == Scalar(0) && beta == Scalar(1))) return;
|
||||
|
||||
int kx = *incx > 0 ? 0 : (1 - *n) * *incx;
|
||||
int ky = *incy > 0 ? 0 : (1 - *n) * *incy;
|
||||
const Scalar *actual_x = get_compact_vector(x, *n, *incx);
|
||||
Scalar *actual_y = get_compact_vector(y, *n, *incy);
|
||||
|
||||
// First form y := beta*y.
|
||||
if (beta != Scalar(1)) {
|
||||
int iy = ky;
|
||||
for (int i = 0; i < *n; ++i) {
|
||||
y[iy] = (beta == Scalar(0)) ? Scalar(0) : beta * y[iy];
|
||||
iy += *incy;
|
||||
}
|
||||
if (beta == Scalar(0))
|
||||
make_vector(actual_y, *n).setZero();
|
||||
else
|
||||
make_vector(actual_y, *n) *= beta;
|
||||
}
|
||||
|
||||
if (alpha == Scalar(0)) return;
|
||||
if (alpha == Scalar(0)) {
|
||||
if (actual_x != x) delete[] actual_x;
|
||||
if (actual_y != y) delete[] copy_back(actual_y, y, *n, *incy);
|
||||
return;
|
||||
}
|
||||
|
||||
if (UPLO(*uplo) == UP) {
|
||||
// Upper triangle: A[i,j] at a[(k+i-j) + j*lda], diagonal at row k.
|
||||
int jx = kx, jy = ky;
|
||||
for (int j = 0; j < *n; ++j) {
|
||||
Scalar temp1 = alpha * x[jx];
|
||||
Scalar temp2 = Scalar(0);
|
||||
int ix = kx, iy = ky;
|
||||
for (int i = std::max(0, j - *k); i < j; ++i) {
|
||||
Scalar aij = a[(*k + i - j) + j * *lda];
|
||||
y[iy] += temp1 * aij;
|
||||
temp2 += Eigen::numext::conj(aij) * x[ix];
|
||||
ix += *incx;
|
||||
iy += *incy;
|
||||
if (*k >= 8) {
|
||||
// Vectorized path: use Eigen Map segments for the inner band operations.
|
||||
ConstMatrixType band(a, *k + 1, *n, *lda);
|
||||
if (UPLO(*uplo) == UP) {
|
||||
for (int j = 0; j < *n; ++j) {
|
||||
int start = std::max(0, j - *k);
|
||||
int len = j - start;
|
||||
int offset = *k - (j - start);
|
||||
Scalar temp1 = alpha * actual_x[j];
|
||||
actual_y[j] += Scalar(Eigen::numext::real(band(*k, j))) * temp1;
|
||||
if (len > 0) {
|
||||
make_vector(actual_y + start, len) += temp1 * band.col(j).segment(offset, len);
|
||||
actual_y[j] += alpha * band.col(j).segment(offset, len).dot(make_vector(actual_x + start, len));
|
||||
}
|
||||
}
|
||||
// Diagonal is real.
|
||||
y[jy] += Scalar(Eigen::numext::real(a[*k + j * *lda])) * temp1 + alpha * temp2;
|
||||
jx += *incx;
|
||||
jy += *incy;
|
||||
if (j >= *k) {
|
||||
kx += *incx;
|
||||
ky += *incy;
|
||||
} else {
|
||||
for (int j = 0; j < *n; ++j) {
|
||||
int len = std::min(*n - 1, j + *k) - j;
|
||||
Scalar temp1 = alpha * actual_x[j];
|
||||
actual_y[j] += Scalar(Eigen::numext::real(band(0, j))) * temp1;
|
||||
if (len > 0) {
|
||||
make_vector(actual_y + j + 1, len) += temp1 * band.col(j).segment(1, len);
|
||||
actual_y[j] += alpha * band.col(j).segment(1, len).dot(make_vector(actual_x + j + 1, len));
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Lower triangle: A[i,j] at a[(i-j) + j*lda], diagonal at row 0.
|
||||
int jx = kx, jy = ky;
|
||||
for (int j = 0; j < *n; ++j) {
|
||||
Scalar temp1 = alpha * x[jx];
|
||||
Scalar temp2 = Scalar(0);
|
||||
// Diagonal is real.
|
||||
y[jy] += Scalar(Eigen::numext::real(a[j * *lda])) * temp1;
|
||||
int ix = jx, iy = jy;
|
||||
for (int i = j + 1; i <= std::min(*n - 1, j + *k); ++i) {
|
||||
ix += *incx;
|
||||
iy += *incy;
|
||||
Scalar aij = a[(i - j) + j * *lda];
|
||||
y[iy] += temp1 * aij;
|
||||
temp2 += Eigen::numext::conj(aij) * x[ix];
|
||||
// Scalar path: for narrow bandwidth, avoid Map overhead.
|
||||
if (UPLO(*uplo) == UP) {
|
||||
for (int j = 0; j < *n; ++j) {
|
||||
Scalar temp1 = alpha * actual_x[j];
|
||||
Scalar temp2 = Scalar(0);
|
||||
for (int i = std::max(0, j - *k); i < j; ++i) {
|
||||
Scalar aij = a[(*k + i - j) + j * *lda];
|
||||
actual_y[i] += temp1 * aij;
|
||||
temp2 += Eigen::numext::conj(aij) * actual_x[i];
|
||||
}
|
||||
actual_y[j] += Scalar(Eigen::numext::real(a[*k + j * *lda])) * temp1 + alpha * temp2;
|
||||
}
|
||||
} else {
|
||||
for (int j = 0; j < *n; ++j) {
|
||||
Scalar temp1 = alpha * actual_x[j];
|
||||
Scalar temp2 = Scalar(0);
|
||||
actual_y[j] += Scalar(Eigen::numext::real(a[j * *lda])) * temp1;
|
||||
for (int i = j + 1; i <= std::min(*n - 1, j + *k); ++i) {
|
||||
Scalar aij = a[(i - j) + j * *lda];
|
||||
actual_y[i] += temp1 * aij;
|
||||
temp2 += Eigen::numext::conj(aij) * actual_x[i];
|
||||
}
|
||||
actual_y[j] += alpha * temp2;
|
||||
}
|
||||
y[jy] += alpha * temp2;
|
||||
jx += *incx;
|
||||
jy += *incy;
|
||||
}
|
||||
}
|
||||
|
||||
if (actual_x != x) delete[] actual_x;
|
||||
if (actual_y != y) delete[] copy_back(actual_y, y, *n, *incy);
|
||||
}
|
||||
|
||||
/** HPMV performs the matrix-vector operation
|
||||
@@ -196,61 +211,53 @@ EIGEN_BLAS_FUNC(hpmv)
|
||||
|
||||
if (*n == 0 || (alpha == Scalar(0) && beta == Scalar(1))) return;
|
||||
|
||||
int kx = *incx > 0 ? 0 : (1 - *n) * *incx;
|
||||
int ky = *incy > 0 ? 0 : (1 - *n) * *incy;
|
||||
const Scalar *actual_x = get_compact_vector(x, *n, *incx);
|
||||
Scalar *actual_y = get_compact_vector(y, *n, *incy);
|
||||
|
||||
// First form y := beta*y.
|
||||
if (beta != Scalar(1)) {
|
||||
int iy = ky;
|
||||
for (int i = 0; i < *n; ++i) {
|
||||
y[iy] = (beta == Scalar(0)) ? Scalar(0) : beta * y[iy];
|
||||
iy += *incy;
|
||||
}
|
||||
if (beta == Scalar(0))
|
||||
make_vector(actual_y, *n).setZero();
|
||||
else
|
||||
make_vector(actual_y, *n) *= beta;
|
||||
}
|
||||
|
||||
if (alpha == Scalar(0)) return;
|
||||
if (alpha == Scalar(0)) {
|
||||
if (actual_x != x) delete[] actual_x;
|
||||
if (actual_y != y) delete[] copy_back(actual_y, y, *n, *incy);
|
||||
return;
|
||||
}
|
||||
|
||||
int kk = 0;
|
||||
if (UPLO(*uplo) == UP) {
|
||||
// Upper triangle packed.
|
||||
int jx = kx, jy = ky;
|
||||
// Upper triangle packed: column j occupies ap[kk..kk+j].
|
||||
for (int j = 0; j < *n; ++j) {
|
||||
Scalar temp1 = alpha * x[jx];
|
||||
Scalar temp2 = Scalar(0);
|
||||
int ix = kx, iy = ky;
|
||||
for (int i = 0; i < j; ++i) {
|
||||
y[iy] += temp1 * ap[kk + i];
|
||||
temp2 += Eigen::numext::conj(ap[kk + i]) * x[ix];
|
||||
ix += *incx;
|
||||
iy += *incy;
|
||||
}
|
||||
Scalar temp1 = alpha * actual_x[j];
|
||||
// Diagonal is real.
|
||||
y[jy] += Scalar(Eigen::numext::real(ap[kk + j])) * temp1 + alpha * temp2;
|
||||
jx += *incx;
|
||||
jy += *incy;
|
||||
actual_y[j] += Scalar(Eigen::numext::real(ap[kk + j])) * temp1;
|
||||
if (j > 0) {
|
||||
make_vector(actual_y, j) += temp1 * make_vector(ap + kk, j);
|
||||
actual_y[j] += alpha * make_vector(ap + kk, j).dot(make_vector(actual_x, j));
|
||||
}
|
||||
kk += j + 1;
|
||||
}
|
||||
} else {
|
||||
// Lower triangle packed.
|
||||
int jx = kx, jy = ky;
|
||||
// Lower triangle packed: column j occupies ap[kk..kk+(n-j-1)].
|
||||
for (int j = 0; j < *n; ++j) {
|
||||
Scalar temp1 = alpha * x[jx];
|
||||
Scalar temp2 = Scalar(0);
|
||||
int len = *n - j - 1;
|
||||
Scalar temp1 = alpha * actual_x[j];
|
||||
// Diagonal is real.
|
||||
y[jy] += Scalar(Eigen::numext::real(ap[kk])) * temp1;
|
||||
int ix = jx, iy = jy;
|
||||
for (int i = 1; i < *n - j; ++i) {
|
||||
ix += *incx;
|
||||
iy += *incy;
|
||||
y[iy] += temp1 * ap[kk + i];
|
||||
temp2 += Eigen::numext::conj(ap[kk + i]) * x[ix];
|
||||
actual_y[j] += Scalar(Eigen::numext::real(ap[kk])) * temp1;
|
||||
if (len > 0) {
|
||||
make_vector(actual_y + j + 1, len) += temp1 * make_vector(ap + kk + 1, len);
|
||||
actual_y[j] += alpha * make_vector(ap + kk + 1, len).dot(make_vector(actual_x + j + 1, len));
|
||||
}
|
||||
y[jy] += alpha * temp2;
|
||||
jx += *incx;
|
||||
jy += *incy;
|
||||
kk += *n - j;
|
||||
}
|
||||
}
|
||||
|
||||
if (actual_x != x) delete[] actual_x;
|
||||
if (actual_y != y) delete[] copy_back(actual_y, y, *n, *incy);
|
||||
}
|
||||
|
||||
/** ZHPR performs the hermitian rank 1 operation
|
||||
|
||||
@@ -343,46 +343,112 @@ EIGEN_BLAS_FUNC(tbmv)
|
||||
int op = OP(*opa);
|
||||
bool unit = (DIAG(*diag) == UNIT);
|
||||
|
||||
if (op == NOTR) {
|
||||
if (upper) {
|
||||
// x := A*x, upper band. Process columns left to right.
|
||||
for (int j = 0; j < *n; ++j) {
|
||||
if (actual_x[j] != Scalar(0)) {
|
||||
if (*k >= 8) {
|
||||
// Vectorized path: use Eigen Map segments for the inner band operations.
|
||||
ConstMatrixType band(a, *k + 1, *n, *lda);
|
||||
if (op == NOTR) {
|
||||
if (upper) {
|
||||
for (int j = 0; j < *n; ++j) {
|
||||
if (actual_x[j] != Scalar(0)) {
|
||||
int start = std::max(0, j - *k);
|
||||
int len = j - start;
|
||||
int offset = *k - (j - start);
|
||||
Scalar temp = actual_x[j];
|
||||
if (len > 0) make_vector(actual_x + start, len) += temp * band.col(j).segment(offset, len);
|
||||
if (!unit) actual_x[j] = temp * band(*k, j);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (int j = *n - 1; j >= 0; --j) {
|
||||
if (actual_x[j] != Scalar(0)) {
|
||||
int len = std::min(*n - 1, j + *k) - j;
|
||||
Scalar temp = actual_x[j];
|
||||
if (len > 0) make_vector(actual_x + j + 1, len) += temp * band.col(j).segment(1, len);
|
||||
if (!unit) actual_x[j] = temp * band(0, j);
|
||||
}
|
||||
}
|
||||
}
|
||||
} else if (op == TR) {
|
||||
if (upper) {
|
||||
for (int j = *n - 1; j >= 0; --j) {
|
||||
int start = std::max(0, j - *k);
|
||||
int len = j - start;
|
||||
int offset = *k - (j - start);
|
||||
Scalar temp = actual_x[j];
|
||||
for (int i = std::max(0, j - *k); i < j; ++i) actual_x[i] += temp * a[(*k + i - j) + j * *lda];
|
||||
if (!unit) actual_x[j] = temp * a[*k + j * *lda];
|
||||
if (!unit) temp *= band(*k, j);
|
||||
if (len > 0)
|
||||
temp += (band.col(j).segment(offset, len).cwiseProduct(make_vector(actual_x + start, len))).sum();
|
||||
actual_x[j] = temp;
|
||||
}
|
||||
} else {
|
||||
for (int j = 0; j < *n; ++j) {
|
||||
int len = std::min(*n - 1, j + *k) - j;
|
||||
Scalar temp = actual_x[j];
|
||||
if (!unit) temp *= band(0, j);
|
||||
if (len > 0) temp += (band.col(j).segment(1, len).cwiseProduct(make_vector(actual_x + j + 1, len))).sum();
|
||||
actual_x[j] = temp;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// x := A*x, lower band. Process columns right to left.
|
||||
for (int j = *n - 1; j >= 0; --j) {
|
||||
if (actual_x[j] != Scalar(0)) {
|
||||
// Conjugate transpose: .dot() computes conj(lhs) . rhs.
|
||||
if (upper) {
|
||||
for (int j = *n - 1; j >= 0; --j) {
|
||||
int start = std::max(0, j - *k);
|
||||
int len = j - start;
|
||||
int offset = *k - (j - start);
|
||||
Scalar temp = actual_x[j];
|
||||
for (int i = j + 1; i <= std::min(*n - 1, j + *k); ++i) actual_x[i] += temp * a[(i - j) + j * *lda];
|
||||
if (!unit) actual_x[j] = temp * a[j * *lda];
|
||||
if (!unit) temp *= Eigen::numext::conj(band(*k, j));
|
||||
if (len > 0) temp += band.col(j).segment(offset, len).dot(make_vector(actual_x + start, len));
|
||||
actual_x[j] = temp;
|
||||
}
|
||||
} else {
|
||||
for (int j = 0; j < *n; ++j) {
|
||||
int len = std::min(*n - 1, j + *k) - j;
|
||||
Scalar temp = actual_x[j];
|
||||
if (!unit) temp *= Eigen::numext::conj(band(0, j));
|
||||
if (len > 0) temp += band.col(j).segment(1, len).dot(make_vector(actual_x + j + 1, len));
|
||||
actual_x[j] = temp;
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Transpose or conjugate transpose.
|
||||
bool do_conj = (op == ADJ);
|
||||
auto maybe_conj = [do_conj](Scalar val) -> Scalar { return do_conj ? Eigen::numext::conj(val) : val; };
|
||||
|
||||
if (upper) {
|
||||
// x := op(A)*x, upper band. Process columns right to left.
|
||||
for (int j = *n - 1; j >= 0; --j) {
|
||||
Scalar temp = actual_x[j];
|
||||
if (!unit) temp *= maybe_conj(a[*k + j * *lda]);
|
||||
for (int i = std::max(0, j - *k); i < j; ++i) temp += maybe_conj(a[(*k + i - j) + j * *lda]) * actual_x[i];
|
||||
actual_x[j] = temp;
|
||||
// Scalar path: for narrow bandwidth, avoid Map overhead.
|
||||
if (op == NOTR) {
|
||||
if (upper) {
|
||||
for (int j = 0; j < *n; ++j) {
|
||||
if (actual_x[j] != Scalar(0)) {
|
||||
Scalar temp = actual_x[j];
|
||||
for (int i = std::max(0, j - *k); i < j; ++i) actual_x[i] += temp * a[(*k + i - j) + j * *lda];
|
||||
if (!unit) actual_x[j] = temp * a[*k + j * *lda];
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (int j = *n - 1; j >= 0; --j) {
|
||||
if (actual_x[j] != Scalar(0)) {
|
||||
Scalar temp = actual_x[j];
|
||||
for (int i = j + 1; i <= std::min(*n - 1, j + *k); ++i) actual_x[i] += temp * a[(i - j) + j * *lda];
|
||||
if (!unit) actual_x[j] = temp * a[j * *lda];
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// x := op(A)*x, lower band. Process columns left to right.
|
||||
for (int j = 0; j < *n; ++j) {
|
||||
Scalar temp = actual_x[j];
|
||||
if (!unit) temp *= maybe_conj(a[j * *lda]);
|
||||
for (int i = j + 1; i <= std::min(*n - 1, j + *k); ++i) temp += maybe_conj(a[(i - j) + j * *lda]) * actual_x[i];
|
||||
actual_x[j] = temp;
|
||||
// Transpose or conjugate transpose.
|
||||
auto maybe_conj = [op](Scalar val) -> Scalar { return op == ADJ ? Eigen::numext::conj(val) : val; };
|
||||
if (upper) {
|
||||
for (int j = *n - 1; j >= 0; --j) {
|
||||
Scalar temp = actual_x[j];
|
||||
if (!unit) temp *= maybe_conj(a[*k + j * *lda]);
|
||||
for (int i = std::max(0, j - *k); i < j; ++i) temp += maybe_conj(a[(*k + i - j) + j * *lda]) * actual_x[i];
|
||||
actual_x[j] = temp;
|
||||
}
|
||||
} else {
|
||||
for (int j = 0; j < *n; ++j) {
|
||||
Scalar temp = actual_x[j];
|
||||
if (!unit) temp *= maybe_conj(a[j * *lda]);
|
||||
for (int i = j + 1; i <= std::min(*n - 1, j + *k); ++i)
|
||||
temp += maybe_conj(a[(i - j) + j * *lda]) * actual_x[i];
|
||||
actual_x[j] = temp;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -194,62 +194,79 @@ EIGEN_BLAS_FUNC(sbmv)
|
||||
|
||||
if (*n == 0 || (alpha == Scalar(0) && beta == Scalar(1))) return;
|
||||
|
||||
int kx = *incx > 0 ? 0 : (1 - *n) * *incx;
|
||||
int ky = *incy > 0 ? 0 : (1 - *n) * *incy;
|
||||
const Scalar *actual_x = get_compact_vector(x, *n, *incx);
|
||||
Scalar *actual_y = get_compact_vector(y, *n, *incy);
|
||||
|
||||
// First form y := beta*y.
|
||||
if (beta != Scalar(1)) {
|
||||
int iy = ky;
|
||||
for (int i = 0; i < *n; ++i) {
|
||||
y[iy] = (beta == Scalar(0)) ? Scalar(0) : beta * y[iy];
|
||||
iy += *incy;
|
||||
}
|
||||
if (beta == Scalar(0))
|
||||
make_vector(actual_y, *n).setZero();
|
||||
else
|
||||
make_vector(actual_y, *n) *= beta;
|
||||
}
|
||||
|
||||
if (alpha == Scalar(0)) return;
|
||||
if (alpha == Scalar(0)) {
|
||||
if (actual_x != x) delete[] actual_x;
|
||||
if (actual_y != y) delete[] copy_back(actual_y, y, *n, *incy);
|
||||
return;
|
||||
}
|
||||
|
||||
if (UPLO(*uplo) == UP) {
|
||||
// Upper triangle: A[i,j] at a[(k+i-j) + j*lda], diagonal at row k.
|
||||
int jx = kx, jy = ky;
|
||||
for (int j = 0; j < *n; ++j) {
|
||||
Scalar temp1 = alpha * x[jx];
|
||||
Scalar temp2 = Scalar(0);
|
||||
int ix = kx, iy = ky;
|
||||
for (int i = std::max(0, j - *k); i < j; ++i) {
|
||||
Scalar aij = a[(*k + i - j) + j * *lda];
|
||||
y[iy] += temp1 * aij;
|
||||
temp2 += aij * x[ix];
|
||||
ix += *incx;
|
||||
iy += *incy;
|
||||
if (*k >= 8) {
|
||||
// Vectorized path: use Eigen Map segments for the inner band operations.
|
||||
ConstMatrixType band(a, *k + 1, *n, *lda);
|
||||
if (UPLO(*uplo) == UP) {
|
||||
for (int j = 0; j < *n; ++j) {
|
||||
int start = std::max(0, j - *k);
|
||||
int len = j - start;
|
||||
int offset = *k - (j - start);
|
||||
Scalar temp1 = alpha * actual_x[j];
|
||||
actual_y[j] += temp1 * band(*k, j);
|
||||
if (len > 0) {
|
||||
make_vector(actual_y + start, len) += temp1 * band.col(j).segment(offset, len);
|
||||
actual_y[j] += alpha * band.col(j).segment(offset, len).dot(make_vector(actual_x + start, len));
|
||||
}
|
||||
}
|
||||
y[jy] += temp1 * a[*k + j * *lda] + alpha * temp2;
|
||||
jx += *incx;
|
||||
jy += *incy;
|
||||
if (j >= *k) {
|
||||
kx += *incx;
|
||||
ky += *incy;
|
||||
} else {
|
||||
for (int j = 0; j < *n; ++j) {
|
||||
int len = std::min(*n - 1, j + *k) - j;
|
||||
Scalar temp1 = alpha * actual_x[j];
|
||||
actual_y[j] += temp1 * band(0, j);
|
||||
if (len > 0) {
|
||||
make_vector(actual_y + j + 1, len) += temp1 * band.col(j).segment(1, len);
|
||||
actual_y[j] += alpha * band.col(j).segment(1, len).dot(make_vector(actual_x + j + 1, len));
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Lower triangle: A[i,j] at a[(i-j) + j*lda], diagonal at row 0.
|
||||
int jx = kx, jy = ky;
|
||||
for (int j = 0; j < *n; ++j) {
|
||||
Scalar temp1 = alpha * x[jx];
|
||||
Scalar temp2 = Scalar(0);
|
||||
y[jy] += temp1 * a[j * *lda];
|
||||
int ix = jx, iy = jy;
|
||||
for (int i = j + 1; i <= std::min(*n - 1, j + *k); ++i) {
|
||||
ix += *incx;
|
||||
iy += *incy;
|
||||
Scalar aij = a[(i - j) + j * *lda];
|
||||
y[iy] += temp1 * aij;
|
||||
temp2 += aij * x[ix];
|
||||
// Scalar path: for narrow bandwidth, avoid Map overhead.
|
||||
if (UPLO(*uplo) == UP) {
|
||||
for (int j = 0; j < *n; ++j) {
|
||||
Scalar temp1 = alpha * actual_x[j];
|
||||
Scalar temp2 = Scalar(0);
|
||||
for (int i = std::max(0, j - *k); i < j; ++i) {
|
||||
Scalar aij = a[(*k + i - j) + j * *lda];
|
||||
actual_y[i] += temp1 * aij;
|
||||
temp2 += aij * actual_x[i];
|
||||
}
|
||||
actual_y[j] += temp1 * a[*k + j * *lda] + alpha * temp2;
|
||||
}
|
||||
} else {
|
||||
for (int j = 0; j < *n; ++j) {
|
||||
Scalar temp1 = alpha * actual_x[j];
|
||||
Scalar temp2 = Scalar(0);
|
||||
actual_y[j] += temp1 * a[j * *lda];
|
||||
for (int i = j + 1; i <= std::min(*n - 1, j + *k); ++i) {
|
||||
Scalar aij = a[(i - j) + j * *lda];
|
||||
actual_y[i] += temp1 * aij;
|
||||
temp2 += aij * actual_x[i];
|
||||
}
|
||||
actual_y[j] += alpha * temp2;
|
||||
}
|
||||
y[jy] += alpha * temp2;
|
||||
jx += *incx;
|
||||
jy += *incy;
|
||||
}
|
||||
}
|
||||
|
||||
if (actual_x != x) delete[] actual_x;
|
||||
if (actual_y != y) delete[] copy_back(actual_y, y, *n, *incy);
|
||||
}
|
||||
|
||||
/** SPMV performs the matrix-vector operation
|
||||
@@ -285,59 +302,51 @@ EIGEN_BLAS_FUNC(spmv)
|
||||
|
||||
if (*n == 0 || (alpha == Scalar(0) && beta == Scalar(1))) return;
|
||||
|
||||
int kx = *incx > 0 ? 0 : (1 - *n) * *incx;
|
||||
int ky = *incy > 0 ? 0 : (1 - *n) * *incy;
|
||||
const Scalar *actual_x = get_compact_vector(x, *n, *incx);
|
||||
Scalar *actual_y = get_compact_vector(y, *n, *incy);
|
||||
|
||||
// First form y := beta*y.
|
||||
if (beta != Scalar(1)) {
|
||||
int iy = ky;
|
||||
for (int i = 0; i < *n; ++i) {
|
||||
y[iy] = (beta == Scalar(0)) ? Scalar(0) : beta * y[iy];
|
||||
iy += *incy;
|
||||
}
|
||||
if (beta == Scalar(0))
|
||||
make_vector(actual_y, *n).setZero();
|
||||
else
|
||||
make_vector(actual_y, *n) *= beta;
|
||||
}
|
||||
|
||||
if (alpha == Scalar(0)) return;
|
||||
if (alpha == Scalar(0)) {
|
||||
if (actual_x != x) delete[] actual_x;
|
||||
if (actual_y != y) delete[] copy_back(actual_y, y, *n, *incy);
|
||||
return;
|
||||
}
|
||||
|
||||
int kk = 0;
|
||||
if (UPLO(*uplo) == UP) {
|
||||
// Upper triangle packed.
|
||||
int jx = kx, jy = ky;
|
||||
// Upper triangle packed: column j occupies ap[kk..kk+j].
|
||||
for (int j = 0; j < *n; ++j) {
|
||||
Scalar temp1 = alpha * x[jx];
|
||||
Scalar temp2 = Scalar(0);
|
||||
int ix = kx, iy = ky;
|
||||
for (int i = 0; i < j; ++i) {
|
||||
y[iy] += temp1 * ap[kk + i];
|
||||
temp2 += ap[kk + i] * x[ix];
|
||||
ix += *incx;
|
||||
iy += *incy;
|
||||
Scalar temp1 = alpha * actual_x[j];
|
||||
actual_y[j] += temp1 * ap[kk + j];
|
||||
if (j > 0) {
|
||||
make_vector(actual_y, j) += temp1 * make_vector(ap + kk, j);
|
||||
actual_y[j] += alpha * make_vector(ap + kk, j).dot(make_vector(actual_x, j));
|
||||
}
|
||||
y[jy] += temp1 * ap[kk + j] + alpha * temp2;
|
||||
jx += *incx;
|
||||
jy += *incy;
|
||||
kk += j + 1;
|
||||
}
|
||||
} else {
|
||||
// Lower triangle packed.
|
||||
int jx = kx, jy = ky;
|
||||
// Lower triangle packed: column j occupies ap[kk..kk+(n-j-1)].
|
||||
for (int j = 0; j < *n; ++j) {
|
||||
Scalar temp1 = alpha * x[jx];
|
||||
Scalar temp2 = Scalar(0);
|
||||
y[jy] += temp1 * ap[kk];
|
||||
int ix = jx, iy = jy;
|
||||
for (int i = 1; i < *n - j; ++i) {
|
||||
ix += *incx;
|
||||
iy += *incy;
|
||||
y[iy] += temp1 * ap[kk + i];
|
||||
temp2 += ap[kk + i] * x[ix];
|
||||
int len = *n - j - 1;
|
||||
Scalar temp1 = alpha * actual_x[j];
|
||||
actual_y[j] += temp1 * ap[kk];
|
||||
if (len > 0) {
|
||||
make_vector(actual_y + j + 1, len) += temp1 * make_vector(ap + kk + 1, len);
|
||||
actual_y[j] += alpha * make_vector(ap + kk + 1, len).dot(make_vector(actual_x + j + 1, len));
|
||||
}
|
||||
y[jy] += alpha * temp2;
|
||||
jx += *incx;
|
||||
jy += *incy;
|
||||
kk += *n - j;
|
||||
}
|
||||
}
|
||||
|
||||
if (actual_x != x) delete[] actual_x;
|
||||
if (actual_y != y) delete[] copy_back(actual_y, y, *n, *incy);
|
||||
}
|
||||
|
||||
/** DSPR performs the symmetric rank 1 operation
|
||||
|
||||
Reference in New Issue
Block a user