Vectorize BLAS level 1/2 routines with Eigen expressions

libeigen/eigen!2404 Co-authored-by: Rasmus Munk Larsen <rmlarsen@gmail.com>
2026-04-10 11:34:33 +08:00 · 2026-04-05 18:53:11 -07:00
parent 4ad90a60f1
commit 8eabfb5342
6 changed files with 294 additions and 204 deletions
--- a/blas/level1_cplx_impl.h
+++ b/blas/level1_cplx_impl.h
@@ -25,15 +25,19 @@ struct functor_traits<scalar_norm1_op> {
 // computes the sum of magnitudes of all vector elements or, for a complex vector x, the sum
 // res = |Rex1| + |Imx1| + |Rex2| + |Imx2| + ... + |Rexn| + |Imxn|, where x is a vector of order n
 extern "C" RealScalar EIGEN_CAT(REAL_SCALAR_SUFFIX, EIGEN_BLAS_FUNC_NAME(asum))(int *n, RealScalar *px, int *incx) {
-  //   std::cerr << "__asum " << *n << " " << *incx << "\n";
-  Complex *x = reinterpret_cast<Complex *>(px);
-
  if (*n <= 0) return 0;

+  // std::complex<T> is layout-compatible with T[2], so we can reinterpret
+  // a complex vector of length n as a real vector of length 2*n and use
+  // the fully vectorized cwiseAbs().sum() path.
  if (*incx == 1)
-    return make_vector(x, *n).unaryExpr<scalar_norm1_op>().sum();
-  else
+    return make_vector(px, 2 * *n).cwiseAbs().sum();
+  else {
+    // For non-unit stride, fall back to the scalar_norm1_op approach since
+    // the real components are not contiguous across complex elements.
+    Complex *x = reinterpret_cast<Complex *>(px);
    return make_vector(x, *n, std::abs(*incx)).unaryExpr<scalar_norm1_op>().sum();
+  }
 }

 extern "C" int EIGEN_CAT(i, EIGEN_BLAS_FUNC_NAME(amax))(int *n, RealScalar *px, int *incx) {
--- a/blas/level1_impl.h
+++ b/blas/level1_impl.h
@@ -69,15 +69,21 @@ EIGEN_BLAS_FUNC(copy)(int *n, RealScalar *px, int *incx, RealScalar *py, int *in
  // be careful, *incx==0 is allowed !!
  if (*incx == 1 && *incy == 1)
    make_vector(y, *n) = make_vector(x, *n);
-  else {
-    if (*incx < 0) x = x - (*n - 1) * (*incx);
+  else if (*incx == 0) {
+    // Broadcast: copy x[0] to all elements of y.
    if (*incy < 0) y = y - (*n - 1) * (*incy);
    for (int i = 0; i < *n; ++i) {
      *y = *x;
-      x += *incx;
      y += *incy;
    }
-  }
+  } else if (*incx > 0 && *incy > 0)
+    make_vector(y, *n, *incy) = make_vector(x, *n, *incx);
+  else if (*incx > 0 && *incy < 0)
+    make_vector(y, *n, -*incy).reverse() = make_vector(x, *n, *incx);
+  else if (*incx < 0 && *incy > 0)
+    make_vector(y, *n, *incy) = make_vector(x, *n, -*incx).reverse();
+  else if (*incx < 0 && *incy < 0)
+    make_vector(y, *n, -*incy) = make_vector(x, *n, -*incx);
 }

 EIGEN_BLAS_FUNC(rotg)(RealScalar *pa, RealScalar *pb, RealScalar *pc, RealScalar *ps) {
--- a/blas/level1_real_impl.h
+++ b/blas/level1_real_impl.h
@@ -58,23 +58,21 @@ extern "C" Scalar EIGEN_BLAS_FUNC_NAME(dot)(int *n, Scalar *px, int *incx, Scala
  Scalar *y = reinterpret_cast<Scalar *>(py);

  if (*incx == 1 && *incy == 1)
-    return (make_vector(x, *n).cwiseProduct(make_vector(y, *n))).sum();
+    return make_vector(x, *n).dot(make_vector(y, *n));
  else if (*incx > 0 && *incy > 0)
-    return (make_vector(x, *n, *incx).cwiseProduct(make_vector(y, *n, *incy))).sum();
+    return make_vector(x, *n, *incx).dot(make_vector(y, *n, *incy));
  else if (*incx < 0 && *incy > 0)
-    return (make_vector(x, *n, -*incx).reverse().cwiseProduct(make_vector(y, *n, *incy))).sum();
+    return make_vector(x, *n, -*incx).reverse().dot(make_vector(y, *n, *incy));
  else if (*incx > 0 && *incy < 0)
-    return (make_vector(x, *n, *incx).cwiseProduct(make_vector(y, *n, -*incy).reverse())).sum();
+    return make_vector(x, *n, *incx).dot(make_vector(y, *n, -*incy).reverse());
  else if (*incx < 0 && *incy < 0)
-    return (make_vector(x, *n, -*incx).reverse().cwiseProduct(make_vector(y, *n, -*incy).reverse())).sum();
+    return make_vector(x, *n, -*incx).reverse().dot(make_vector(y, *n, -*incy).reverse());
  else
    return 0;
 }

 // computes the Euclidean norm of a vector.
-// FIXME
 extern "C" Scalar EIGEN_BLAS_FUNC_NAME(nrm2)(int *n, Scalar *px, int *incx) {
-  //   std::cerr << "_nrm2 " << *n << " " << *incx << "\n";
  if (*n <= 0) return 0;

  Scalar *x = reinterpret_cast<Scalar *>(px);
--- a/blas/level2_cplx_impl.h
+++ b/blas/level2_cplx_impl.h
@@ -106,64 +106,79 @@ EIGEN_BLAS_FUNC(hbmv)

  if (*n == 0 || (alpha == Scalar(0) && beta == Scalar(1))) return;

-  int kx = *incx > 0 ? 0 : (1 - *n) * *incx;
-  int ky = *incy > 0 ? 0 : (1 - *n) * *incy;
+  const Scalar *actual_x = get_compact_vector(x, *n, *incx);
+  Scalar *actual_y = get_compact_vector(y, *n, *incy);

  // First form y := beta*y.
  if (beta != Scalar(1)) {
-    int iy = ky;
-    for (int i = 0; i < *n; ++i) {
-      y[iy] = (beta == Scalar(0)) ? Scalar(0) : beta * y[iy];
-      iy += *incy;
-    }
+    if (beta == Scalar(0))
+      make_vector(actual_y, *n).setZero();
+    else
+      make_vector(actual_y, *n) *= beta;
  }

-  if (alpha == Scalar(0)) return;
+  if (alpha == Scalar(0)) {
+    if (actual_x != x) delete[] actual_x;
+    if (actual_y != y) delete[] copy_back(actual_y, y, *n, *incy);
+    return;
+  }

-  if (UPLO(*uplo) == UP) {
-    // Upper triangle: A[i,j] at a[(k+i-j) + j*lda], diagonal at row k.
-    int jx = kx, jy = ky;
-    for (int j = 0; j < *n; ++j) {
-      Scalar temp1 = alpha * x[jx];
-      Scalar temp2 = Scalar(0);
-      int ix = kx, iy = ky;
-      for (int i = std::max(0, j - *k); i < j; ++i) {
-        Scalar aij = a[(*k + i - j) + j * *lda];
-        y[iy] += temp1 * aij;
-        temp2 += Eigen::numext::conj(aij) * x[ix];
-        ix += *incx;
-        iy += *incy;
+  if (*k >= 8) {
+    // Vectorized path: use Eigen Map segments for the inner band operations.
+    ConstMatrixType band(a, *k + 1, *n, *lda);
+    if (UPLO(*uplo) == UP) {
+      for (int j = 0; j < *n; ++j) {
+        int start = std::max(0, j - *k);
+        int len = j - start;
+        int offset = *k - (j - start);
+        Scalar temp1 = alpha * actual_x[j];
+        actual_y[j] += Scalar(Eigen::numext::real(band(*k, j))) * temp1;
+        if (len > 0) {
+          make_vector(actual_y + start, len) += temp1 * band.col(j).segment(offset, len);
+          actual_y[j] += alpha * band.col(j).segment(offset, len).dot(make_vector(actual_x + start, len));
+        }
      }
-      // Diagonal is real.
-      y[jy] += Scalar(Eigen::numext::real(a[*k + j * *lda])) * temp1 + alpha * temp2;
-      jx += *incx;
-      jy += *incy;
-      if (j >= *k) {
-        kx += *incx;
-        ky += *incy;
+    } else {
+      for (int j = 0; j < *n; ++j) {
+        int len = std::min(*n - 1, j + *k) - j;
+        Scalar temp1 = alpha * actual_x[j];
+        actual_y[j] += Scalar(Eigen::numext::real(band(0, j))) * temp1;
+        if (len > 0) {
+          make_vector(actual_y + j + 1, len) += temp1 * band.col(j).segment(1, len);
+          actual_y[j] += alpha * band.col(j).segment(1, len).dot(make_vector(actual_x + j + 1, len));
+        }
      }
    }
  } else {
-    // Lower triangle: A[i,j] at a[(i-j) + j*lda], diagonal at row 0.
-    int jx = kx, jy = ky;
-    for (int j = 0; j < *n; ++j) {
-      Scalar temp1 = alpha * x[jx];
-      Scalar temp2 = Scalar(0);
-      // Diagonal is real.
-      y[jy] += Scalar(Eigen::numext::real(a[j * *lda])) * temp1;
-      int ix = jx, iy = jy;
-      for (int i = j + 1; i <= std::min(*n - 1, j + *k); ++i) {
-        ix += *incx;
-        iy += *incy;
-        Scalar aij = a[(i - j) + j * *lda];
-        y[iy] += temp1 * aij;
-        temp2 += Eigen::numext::conj(aij) * x[ix];
+    // Scalar path: for narrow bandwidth, avoid Map overhead.
+    if (UPLO(*uplo) == UP) {
+      for (int j = 0; j < *n; ++j) {
+        Scalar temp1 = alpha * actual_x[j];
+        Scalar temp2 = Scalar(0);
+        for (int i = std::max(0, j - *k); i < j; ++i) {
+          Scalar aij = a[(*k + i - j) + j * *lda];
+          actual_y[i] += temp1 * aij;
+          temp2 += Eigen::numext::conj(aij) * actual_x[i];
+        }
+        actual_y[j] += Scalar(Eigen::numext::real(a[*k + j * *lda])) * temp1 + alpha * temp2;
+      }
+    } else {
+      for (int j = 0; j < *n; ++j) {
+        Scalar temp1 = alpha * actual_x[j];
+        Scalar temp2 = Scalar(0);
+        actual_y[j] += Scalar(Eigen::numext::real(a[j * *lda])) * temp1;
+        for (int i = j + 1; i <= std::min(*n - 1, j + *k); ++i) {
+          Scalar aij = a[(i - j) + j * *lda];
+          actual_y[i] += temp1 * aij;
+          temp2 += Eigen::numext::conj(aij) * actual_x[i];
+        }
+        actual_y[j] += alpha * temp2;
      }
-      y[jy] += alpha * temp2;
-      jx += *incx;
-      jy += *incy;
    }
  }
+
+  if (actual_x != x) delete[] actual_x;
+  if (actual_y != y) delete[] copy_back(actual_y, y, *n, *incy);
 }

 /**  HPMV  performs the matrix-vector operation
@@ -196,61 +211,53 @@ EIGEN_BLAS_FUNC(hpmv)

  if (*n == 0 || (alpha == Scalar(0) && beta == Scalar(1))) return;

-  int kx = *incx > 0 ? 0 : (1 - *n) * *incx;
-  int ky = *incy > 0 ? 0 : (1 - *n) * *incy;
+  const Scalar *actual_x = get_compact_vector(x, *n, *incx);
+  Scalar *actual_y = get_compact_vector(y, *n, *incy);

  // First form y := beta*y.
  if (beta != Scalar(1)) {
-    int iy = ky;
-    for (int i = 0; i < *n; ++i) {
-      y[iy] = (beta == Scalar(0)) ? Scalar(0) : beta * y[iy];
-      iy += *incy;
-    }
+    if (beta == Scalar(0))
+      make_vector(actual_y, *n).setZero();
+    else
+      make_vector(actual_y, *n) *= beta;
  }

-  if (alpha == Scalar(0)) return;
+  if (alpha == Scalar(0)) {
+    if (actual_x != x) delete[] actual_x;
+    if (actual_y != y) delete[] copy_back(actual_y, y, *n, *incy);
+    return;
+  }

  int kk = 0;
  if (UPLO(*uplo) == UP) {
-    // Upper triangle packed.
-    int jx = kx, jy = ky;
+    // Upper triangle packed: column j occupies ap[kk..kk+j].
    for (int j = 0; j < *n; ++j) {
-      Scalar temp1 = alpha * x[jx];
-      Scalar temp2 = Scalar(0);
-      int ix = kx, iy = ky;
-      for (int i = 0; i < j; ++i) {
-        y[iy] += temp1 * ap[kk + i];
-        temp2 += Eigen::numext::conj(ap[kk + i]) * x[ix];
-        ix += *incx;
-        iy += *incy;
-      }
+      Scalar temp1 = alpha * actual_x[j];
      // Diagonal is real.
-      y[jy] += Scalar(Eigen::numext::real(ap[kk + j])) * temp1 + alpha * temp2;
-      jx += *incx;
-      jy += *incy;
+      actual_y[j] += Scalar(Eigen::numext::real(ap[kk + j])) * temp1;
+      if (j > 0) {
+        make_vector(actual_y, j) += temp1 * make_vector(ap + kk, j);
+        actual_y[j] += alpha * make_vector(ap + kk, j).dot(make_vector(actual_x, j));
+      }
      kk += j + 1;
    }
  } else {
-    // Lower triangle packed.
-    int jx = kx, jy = ky;
+    // Lower triangle packed: column j occupies ap[kk..kk+(n-j-1)].
    for (int j = 0; j < *n; ++j) {
-      Scalar temp1 = alpha * x[jx];
-      Scalar temp2 = Scalar(0);
+      int len = *n - j - 1;
+      Scalar temp1 = alpha * actual_x[j];
      // Diagonal is real.
-      y[jy] += Scalar(Eigen::numext::real(ap[kk])) * temp1;
-      int ix = jx, iy = jy;
-      for (int i = 1; i < *n - j; ++i) {
-        ix += *incx;
-        iy += *incy;
-        y[iy] += temp1 * ap[kk + i];
-        temp2 += Eigen::numext::conj(ap[kk + i]) * x[ix];
+      actual_y[j] += Scalar(Eigen::numext::real(ap[kk])) * temp1;
+      if (len > 0) {
+        make_vector(actual_y + j + 1, len) += temp1 * make_vector(ap + kk + 1, len);
+        actual_y[j] += alpha * make_vector(ap + kk + 1, len).dot(make_vector(actual_x + j + 1, len));
      }
-      y[jy] += alpha * temp2;
-      jx += *incx;
-      jy += *incy;
      kk += *n - j;
    }
  }
+
+  if (actual_x != x) delete[] actual_x;
+  if (actual_y != y) delete[] copy_back(actual_y, y, *n, *incy);
 }

 /**  ZHPR    performs the hermitian rank 1 operation
--- a/blas/level2_impl.h
+++ b/blas/level2_impl.h
@@ -343,46 +343,112 @@ EIGEN_BLAS_FUNC(tbmv)
  int op = OP(*opa);
  bool unit = (DIAG(*diag) == UNIT);

-  if (op == NOTR) {
-    if (upper) {
-      // x := A*x, upper band. Process columns left to right.
-      for (int j = 0; j < *n; ++j) {
-        if (actual_x[j] != Scalar(0)) {
+  if (*k >= 8) {
+    // Vectorized path: use Eigen Map segments for the inner band operations.
+    ConstMatrixType band(a, *k + 1, *n, *lda);
+    if (op == NOTR) {
+      if (upper) {
+        for (int j = 0; j < *n; ++j) {
+          if (actual_x[j] != Scalar(0)) {
+            int start = std::max(0, j - *k);
+            int len = j - start;
+            int offset = *k - (j - start);
+            Scalar temp = actual_x[j];
+            if (len > 0) make_vector(actual_x + start, len) += temp * band.col(j).segment(offset, len);
+            if (!unit) actual_x[j] = temp * band(*k, j);
+          }
+        }
+      } else {
+        for (int j = *n - 1; j >= 0; --j) {
+          if (actual_x[j] != Scalar(0)) {
+            int len = std::min(*n - 1, j + *k) - j;
+            Scalar temp = actual_x[j];
+            if (len > 0) make_vector(actual_x + j + 1, len) += temp * band.col(j).segment(1, len);
+            if (!unit) actual_x[j] = temp * band(0, j);
+          }
+        }
+      }
+    } else if (op == TR) {
+      if (upper) {
+        for (int j = *n - 1; j >= 0; --j) {
+          int start = std::max(0, j - *k);
+          int len = j - start;
+          int offset = *k - (j - start);
          Scalar temp = actual_x[j];
-          for (int i = std::max(0, j - *k); i < j; ++i) actual_x[i] += temp * a[(*k + i - j) + j * *lda];
-          if (!unit) actual_x[j] = temp * a[*k + j * *lda];
+          if (!unit) temp *= band(*k, j);
+          if (len > 0)
+            temp += (band.col(j).segment(offset, len).cwiseProduct(make_vector(actual_x + start, len))).sum();
+          actual_x[j] = temp;
+        }
+      } else {
+        for (int j = 0; j < *n; ++j) {
+          int len = std::min(*n - 1, j + *k) - j;
+          Scalar temp = actual_x[j];
+          if (!unit) temp *= band(0, j);
+          if (len > 0) temp += (band.col(j).segment(1, len).cwiseProduct(make_vector(actual_x + j + 1, len))).sum();
+          actual_x[j] = temp;
        }
      }
    } else {
-      // x := A*x, lower band. Process columns right to left.
-      for (int j = *n - 1; j >= 0; --j) {
-        if (actual_x[j] != Scalar(0)) {
+      // Conjugate transpose: .dot() computes conj(lhs) . rhs.
+      if (upper) {
+        for (int j = *n - 1; j >= 0; --j) {
+          int start = std::max(0, j - *k);
+          int len = j - start;
+          int offset = *k - (j - start);
          Scalar temp = actual_x[j];
-          for (int i = j + 1; i <= std::min(*n - 1, j + *k); ++i) actual_x[i] += temp * a[(i - j) + j * *lda];
-          if (!unit) actual_x[j] = temp * a[j * *lda];
+          if (!unit) temp *= Eigen::numext::conj(band(*k, j));
+          if (len > 0) temp += band.col(j).segment(offset, len).dot(make_vector(actual_x + start, len));
+          actual_x[j] = temp;
+        }
+      } else {
+        for (int j = 0; j < *n; ++j) {
+          int len = std::min(*n - 1, j + *k) - j;
+          Scalar temp = actual_x[j];
+          if (!unit) temp *= Eigen::numext::conj(band(0, j));
+          if (len > 0) temp += band.col(j).segment(1, len).dot(make_vector(actual_x + j + 1, len));
+          actual_x[j] = temp;
        }
      }
    }
  } else {
-    // Transpose or conjugate transpose.
-    bool do_conj = (op == ADJ);
-    auto maybe_conj = [do_conj](Scalar val) -> Scalar { return do_conj ? Eigen::numext::conj(val) : val; };
-
-    if (upper) {
-      // x := op(A)*x, upper band. Process columns right to left.
-      for (int j = *n - 1; j >= 0; --j) {
-        Scalar temp = actual_x[j];
-        if (!unit) temp *= maybe_conj(a[*k + j * *lda]);
-        for (int i = std::max(0, j - *k); i < j; ++i) temp += maybe_conj(a[(*k + i - j) + j * *lda]) * actual_x[i];
-        actual_x[j] = temp;
+    // Scalar path: for narrow bandwidth, avoid Map overhead.
+    if (op == NOTR) {
+      if (upper) {
+        for (int j = 0; j < *n; ++j) {
+          if (actual_x[j] != Scalar(0)) {
+            Scalar temp = actual_x[j];
+            for (int i = std::max(0, j - *k); i < j; ++i) actual_x[i] += temp * a[(*k + i - j) + j * *lda];
+            if (!unit) actual_x[j] = temp * a[*k + j * *lda];
+          }
+        }
+      } else {
+        for (int j = *n - 1; j >= 0; --j) {
+          if (actual_x[j] != Scalar(0)) {
+            Scalar temp = actual_x[j];
+            for (int i = j + 1; i <= std::min(*n - 1, j + *k); ++i) actual_x[i] += temp * a[(i - j) + j * *lda];
+            if (!unit) actual_x[j] = temp * a[j * *lda];
+          }
+        }
      }
    } else {
-      // x := op(A)*x, lower band. Process columns left to right.
-      for (int j = 0; j < *n; ++j) {
-        Scalar temp = actual_x[j];
-        if (!unit) temp *= maybe_conj(a[j * *lda]);
-        for (int i = j + 1; i <= std::min(*n - 1, j + *k); ++i) temp += maybe_conj(a[(i - j) + j * *lda]) * actual_x[i];
-        actual_x[j] = temp;
+      // Transpose or conjugate transpose.
+      auto maybe_conj = [op](Scalar val) -> Scalar { return op == ADJ ? Eigen::numext::conj(val) : val; };
+      if (upper) {
+        for (int j = *n - 1; j >= 0; --j) {
+          Scalar temp = actual_x[j];
+          if (!unit) temp *= maybe_conj(a[*k + j * *lda]);
+          for (int i = std::max(0, j - *k); i < j; ++i) temp += maybe_conj(a[(*k + i - j) + j * *lda]) * actual_x[i];
+          actual_x[j] = temp;
+        }
+      } else {
+        for (int j = 0; j < *n; ++j) {
+          Scalar temp = actual_x[j];
+          if (!unit) temp *= maybe_conj(a[j * *lda]);
+          for (int i = j + 1; i <= std::min(*n - 1, j + *k); ++i)
+            temp += maybe_conj(a[(i - j) + j * *lda]) * actual_x[i];
+          actual_x[j] = temp;
+        }
      }
    }
  }
--- a/blas/level2_real_impl.h
+++ b/blas/level2_real_impl.h
@@ -194,62 +194,79 @@ EIGEN_BLAS_FUNC(sbmv)

  if (*n == 0 || (alpha == Scalar(0) && beta == Scalar(1))) return;

-  int kx = *incx > 0 ? 0 : (1 - *n) * *incx;
-  int ky = *incy > 0 ? 0 : (1 - *n) * *incy;
+  const Scalar *actual_x = get_compact_vector(x, *n, *incx);
+  Scalar *actual_y = get_compact_vector(y, *n, *incy);

  // First form y := beta*y.
  if (beta != Scalar(1)) {
-    int iy = ky;
-    for (int i = 0; i < *n; ++i) {
-      y[iy] = (beta == Scalar(0)) ? Scalar(0) : beta * y[iy];
-      iy += *incy;
-    }
+    if (beta == Scalar(0))
+      make_vector(actual_y, *n).setZero();
+    else
+      make_vector(actual_y, *n) *= beta;
  }

-  if (alpha == Scalar(0)) return;
+  if (alpha == Scalar(0)) {
+    if (actual_x != x) delete[] actual_x;
+    if (actual_y != y) delete[] copy_back(actual_y, y, *n, *incy);
+    return;
+  }

-  if (UPLO(*uplo) == UP) {
-    // Upper triangle: A[i,j] at a[(k+i-j) + j*lda], diagonal at row k.
-    int jx = kx, jy = ky;
-    for (int j = 0; j < *n; ++j) {
-      Scalar temp1 = alpha * x[jx];
-      Scalar temp2 = Scalar(0);
-      int ix = kx, iy = ky;
-      for (int i = std::max(0, j - *k); i < j; ++i) {
-        Scalar aij = a[(*k + i - j) + j * *lda];
-        y[iy] += temp1 * aij;
-        temp2 += aij * x[ix];
-        ix += *incx;
-        iy += *incy;
+  if (*k >= 8) {
+    // Vectorized path: use Eigen Map segments for the inner band operations.
+    ConstMatrixType band(a, *k + 1, *n, *lda);
+    if (UPLO(*uplo) == UP) {
+      for (int j = 0; j < *n; ++j) {
+        int start = std::max(0, j - *k);
+        int len = j - start;
+        int offset = *k - (j - start);
+        Scalar temp1 = alpha * actual_x[j];
+        actual_y[j] += temp1 * band(*k, j);
+        if (len > 0) {
+          make_vector(actual_y + start, len) += temp1 * band.col(j).segment(offset, len);
+          actual_y[j] += alpha * band.col(j).segment(offset, len).dot(make_vector(actual_x + start, len));
+        }
      }
-      y[jy] += temp1 * a[*k + j * *lda] + alpha * temp2;
-      jx += *incx;
-      jy += *incy;
-      if (j >= *k) {
-        kx += *incx;
-        ky += *incy;
+    } else {
+      for (int j = 0; j < *n; ++j) {
+        int len = std::min(*n - 1, j + *k) - j;
+        Scalar temp1 = alpha * actual_x[j];
+        actual_y[j] += temp1 * band(0, j);
+        if (len > 0) {
+          make_vector(actual_y + j + 1, len) += temp1 * band.col(j).segment(1, len);
+          actual_y[j] += alpha * band.col(j).segment(1, len).dot(make_vector(actual_x + j + 1, len));
+        }
      }
    }
  } else {
-    // Lower triangle: A[i,j] at a[(i-j) + j*lda], diagonal at row 0.
-    int jx = kx, jy = ky;
-    for (int j = 0; j < *n; ++j) {
-      Scalar temp1 = alpha * x[jx];
-      Scalar temp2 = Scalar(0);
-      y[jy] += temp1 * a[j * *lda];
-      int ix = jx, iy = jy;
-      for (int i = j + 1; i <= std::min(*n - 1, j + *k); ++i) {
-        ix += *incx;
-        iy += *incy;
-        Scalar aij = a[(i - j) + j * *lda];
-        y[iy] += temp1 * aij;
-        temp2 += aij * x[ix];
+    // Scalar path: for narrow bandwidth, avoid Map overhead.
+    if (UPLO(*uplo) == UP) {
+      for (int j = 0; j < *n; ++j) {
+        Scalar temp1 = alpha * actual_x[j];
+        Scalar temp2 = Scalar(0);
+        for (int i = std::max(0, j - *k); i < j; ++i) {
+          Scalar aij = a[(*k + i - j) + j * *lda];
+          actual_y[i] += temp1 * aij;
+          temp2 += aij * actual_x[i];
+        }
+        actual_y[j] += temp1 * a[*k + j * *lda] + alpha * temp2;
+      }
+    } else {
+      for (int j = 0; j < *n; ++j) {
+        Scalar temp1 = alpha * actual_x[j];
+        Scalar temp2 = Scalar(0);
+        actual_y[j] += temp1 * a[j * *lda];
+        for (int i = j + 1; i <= std::min(*n - 1, j + *k); ++i) {
+          Scalar aij = a[(i - j) + j * *lda];
+          actual_y[i] += temp1 * aij;
+          temp2 += aij * actual_x[i];
+        }
+        actual_y[j] += alpha * temp2;
      }
-      y[jy] += alpha * temp2;
-      jx += *incx;
-      jy += *incy;
    }
  }
+
+  if (actual_x != x) delete[] actual_x;
+  if (actual_y != y) delete[] copy_back(actual_y, y, *n, *incy);
 }

 /**  SPMV  performs the matrix-vector operation
@@ -285,59 +302,51 @@ EIGEN_BLAS_FUNC(spmv)

  if (*n == 0 || (alpha == Scalar(0) && beta == Scalar(1))) return;

-  int kx = *incx > 0 ? 0 : (1 - *n) * *incx;
-  int ky = *incy > 0 ? 0 : (1 - *n) * *incy;
+  const Scalar *actual_x = get_compact_vector(x, *n, *incx);
+  Scalar *actual_y = get_compact_vector(y, *n, *incy);

  // First form y := beta*y.
  if (beta != Scalar(1)) {
-    int iy = ky;
-    for (int i = 0; i < *n; ++i) {
-      y[iy] = (beta == Scalar(0)) ? Scalar(0) : beta * y[iy];
-      iy += *incy;
-    }
+    if (beta == Scalar(0))
+      make_vector(actual_y, *n).setZero();
+    else
+      make_vector(actual_y, *n) *= beta;
  }

-  if (alpha == Scalar(0)) return;
+  if (alpha == Scalar(0)) {
+    if (actual_x != x) delete[] actual_x;
+    if (actual_y != y) delete[] copy_back(actual_y, y, *n, *incy);
+    return;
+  }

  int kk = 0;
  if (UPLO(*uplo) == UP) {
-    // Upper triangle packed.
-    int jx = kx, jy = ky;
+    // Upper triangle packed: column j occupies ap[kk..kk+j].
    for (int j = 0; j < *n; ++j) {
-      Scalar temp1 = alpha * x[jx];
-      Scalar temp2 = Scalar(0);
-      int ix = kx, iy = ky;
-      for (int i = 0; i < j; ++i) {
-        y[iy] += temp1 * ap[kk + i];
-        temp2 += ap[kk + i] * x[ix];
-        ix += *incx;
-        iy += *incy;
+      Scalar temp1 = alpha * actual_x[j];
+      actual_y[j] += temp1 * ap[kk + j];
+      if (j > 0) {
+        make_vector(actual_y, j) += temp1 * make_vector(ap + kk, j);
+        actual_y[j] += alpha * make_vector(ap + kk, j).dot(make_vector(actual_x, j));
      }
-      y[jy] += temp1 * ap[kk + j] + alpha * temp2;
-      jx += *incx;
-      jy += *incy;
      kk += j + 1;
    }
  } else {
-    // Lower triangle packed.
-    int jx = kx, jy = ky;
+    // Lower triangle packed: column j occupies ap[kk..kk+(n-j-1)].
    for (int j = 0; j < *n; ++j) {
-      Scalar temp1 = alpha * x[jx];
-      Scalar temp2 = Scalar(0);
-      y[jy] += temp1 * ap[kk];
-      int ix = jx, iy = jy;
-      for (int i = 1; i < *n - j; ++i) {
-        ix += *incx;
-        iy += *incy;
-        y[iy] += temp1 * ap[kk + i];
-        temp2 += ap[kk + i] * x[ix];
+      int len = *n - j - 1;
+      Scalar temp1 = alpha * actual_x[j];
+      actual_y[j] += temp1 * ap[kk];
+      if (len > 0) {
+        make_vector(actual_y + j + 1, len) += temp1 * make_vector(ap + kk + 1, len);
+        actual_y[j] += alpha * make_vector(ap + kk + 1, len).dot(make_vector(actual_x + j + 1, len));
      }
-      y[jy] += alpha * temp2;
-      jx += *incx;
-      jy += *incy;
      kk += *n - j;
    }
  }
+
+  if (actual_x != x) delete[] actual_x;
+  if (actual_y != y) delete[] copy_back(actual_y, y, *n, *incy);
 }

 /**  DSPR    performs the symmetric rank 1 operation