Optimize SYMV, SYR, SYR2, and TRMV product kernels

libeigen/eigen!2228 Co-authored-by: Rasmus Munk Larsen <rmlarsen@gmail.com>
2026-04-10 11:34:33 +08:00 · 2026-03-01 19:40:11 -08:00
parent c66fc52868
commit 662d5c21ff
9 changed files with 866 additions and 151 deletions
--- a/Eigen/src/Core/products/SelfadjointMatrixVector.h
+++ b/Eigen/src/Core/products/SelfadjointMatrixVector.h
@@ -18,9 +18,11 @@ namespace Eigen {
 namespace internal {

 /* Optimized selfadjoint matrix * vector product:
- * This algorithm processes 2 columns at once that allows to both reduce
- * the number of load/stores of the result by a factor 2 and to reduce
- * the instruction dependency.
+ * This algorithm processes 4 columns at once to reduce the number of
+ * load/stores of the result vector by a factor of 4 compared to the
+ * naive approach, and to increase instruction-level parallelism.
+ * A 2-column cleanup handles the remaining even columns, and a
+ * 1-column loop handles any final odd column.
 */

 template <typename Scalar, typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs,
@@ -61,82 +63,246 @@ selfadjoint_matrix_vector_product<Scalar, Index, StorageOrder, UpLo, ConjugateLh

  Scalar cjAlpha = ConjugateRhs ? numext::conj(alpha) : alpha;

-  Index bound = numext::maxi(Index(0), size - 8) & 0xfffffffe;
-  if (FirstTriangular) bound = size - bound;
+  // Compute column counts for 4-col, 2-col, and 1-col processing phases.
+  // We leave up to ~8 columns near the diagonal for cleanup (short off-diagonal ranges).
+  Index n4 = (numext::maxi(Index(0), size - 8) / 4) * 4;
+  Index n2 = ((size - n4) / 2) * 2;
+  // Remaining (size - n4 - n2) is 0 or 1 columns.

-  for (Index j = FirstTriangular ? bound : 0; j < (FirstTriangular ? size : bound); j += 2) {
-    const Scalar* EIGEN_RESTRICT A0 = lhs + j * lhsStride;
-    const Scalar* EIGEN_RESTRICT A1 = lhs + (j + 1) * lhsStride;
+  // For !FirstTriangular: 4-col [0, n4), 2-col [n4, n4+n2), 1-col [n4+n2, size)
+  // For FirstTriangular:  1-col [0, size-n4-n2), 2-col [size-n4-n2, size-n4), 4-col [size-n4, size)

-    Scalar t0 = cjAlpha * rhs[j];
-    Packet ptmp0 = pset1<Packet>(t0);
-    Scalar t1 = cjAlpha * rhs[j + 1];
-    Packet ptmp1 = pset1<Packet>(t1);
+  // === Phase 1: 4 columns at a time ===
+  {
+    Index jStart = FirstTriangular ? (size - n4) : 0;
+    Index jEnd = FirstTriangular ? size : n4;

-    Scalar t2(0);
-    Packet ptmp2 = pset1<Packet>(t2);
-    Scalar t3(0);
-    Packet ptmp3 = pset1<Packet>(t3);
+    for (Index j = jStart; j < jEnd; j += 4) {
+      const Scalar* EIGEN_RESTRICT A0 = lhs + j * lhsStride;
+      const Scalar* EIGEN_RESTRICT A1 = lhs + (j + 1) * lhsStride;
+      const Scalar* EIGEN_RESTRICT A2 = lhs + (j + 2) * lhsStride;
+      const Scalar* EIGEN_RESTRICT A3 = lhs + (j + 3) * lhsStride;

-    Index starti = FirstTriangular ? 0 : j + 2;
-    Index endi = FirstTriangular ? j : size;
-    Index alignedStart = (starti) + internal::first_default_aligned(&res[starti], endi - starti);
-    Index alignedEnd = alignedStart + ((endi - alignedStart) / (PacketSize)) * (PacketSize);
+      Scalar t0 = cjAlpha * rhs[j];
+      Scalar t1 = cjAlpha * rhs[j + 1];
+      Scalar t2 = cjAlpha * rhs[j + 2];
+      Scalar t3 = cjAlpha * rhs[j + 3];
+      Packet ptmp0 = pset1<Packet>(t0);
+      Packet ptmp1 = pset1<Packet>(t1);
+      Packet ptmp2 = pset1<Packet>(t2);
+      Packet ptmp3 = pset1<Packet>(t3);

-    res[j] += cjd.pmul(numext::real(A0[j]), t0);
-    res[j + 1] += cjd.pmul(numext::real(A1[j + 1]), t1);
-    if (FirstTriangular) {
-      res[j] += cj0.pmul(A1[j], t1);
-      t3 += cj1.pmul(A1[j], rhs[j]);
-    } else {
-      res[j + 1] += cj0.pmul(A0[j + 1], t0);
-      t2 += cj1.pmul(A0[j + 1], rhs[j + 1]);
+      Scalar t4(0), t5(0), t6(0), t7(0);
+      Packet ptmp4 = pzero(Packet{});
+      Packet ptmp5 = pzero(Packet{});
+      Packet ptmp6 = pzero(Packet{});
+      Packet ptmp7 = pzero(Packet{});
+
+      Index starti = FirstTriangular ? 0 : j + 4;
+      Index endi = FirstTriangular ? j : size;
+      Index alignedStart = starti + internal::first_default_aligned(&res[starti], endi - starti);
+      Index alignedEnd = alignedStart + ((endi - alignedStart) / PacketSize) * PacketSize;
+
+      // Handle the 4x4 diagonal block: diagonal elements
+      res[j] += cjd.pmul(numext::real(A0[j]), t0);
+      res[j + 1] += cjd.pmul(numext::real(A1[j + 1]), t1);
+      res[j + 2] += cjd.pmul(numext::real(A2[j + 2]), t2);
+      res[j + 3] += cjd.pmul(numext::real(A3[j + 3]), t3);
+
+      // Handle the 4x4 diagonal block: off-diagonal cross terms
+      if (FirstTriangular) {
+        // Upper triangle stored (A_k[l] for l <= k)
+        res[j] += cj0.pmul(A1[j], t1) + cj0.pmul(A2[j], t2) + cj0.pmul(A3[j], t3);
+        res[j + 1] += cj0.pmul(A2[j + 1], t2) + cj0.pmul(A3[j + 1], t3);
+        res[j + 2] += cj0.pmul(A3[j + 2], t3);
+
+        t5 += cj1.pmul(A1[j], rhs[j]);
+        t6 += cj1.pmul(A2[j], rhs[j]) + cj1.pmul(A2[j + 1], rhs[j + 1]);
+        t7 += cj1.pmul(A3[j], rhs[j]) + cj1.pmul(A3[j + 1], rhs[j + 1]) + cj1.pmul(A3[j + 2], rhs[j + 2]);
+      } else {
+        // Lower triangle stored (A_k[l] for l >= k)
+        res[j + 1] += cj0.pmul(A0[j + 1], t0);
+        res[j + 2] += cj0.pmul(A0[j + 2], t0) + cj0.pmul(A1[j + 2], t1);
+        res[j + 3] += cj0.pmul(A0[j + 3], t0) + cj0.pmul(A1[j + 3], t1) + cj0.pmul(A2[j + 3], t2);
+
+        t4 += cj1.pmul(A0[j + 1], rhs[j + 1]) + cj1.pmul(A0[j + 2], rhs[j + 2]) + cj1.pmul(A0[j + 3], rhs[j + 3]);
+        t5 += cj1.pmul(A1[j + 2], rhs[j + 2]) + cj1.pmul(A1[j + 3], rhs[j + 3]);
+        t6 += cj1.pmul(A2[j + 3], rhs[j + 3]);
+      }
+
+      // Pre-alignment scalar loop
+      for (Index i = starti; i < alignedStart; ++i) {
+        res[i] += cj0.pmul(A0[i], t0) + cj0.pmul(A1[i], t1) + cj0.pmul(A2[i], t2) + cj0.pmul(A3[i], t3);
+        t4 += cj1.pmul(A0[i], rhs[i]);
+        t5 += cj1.pmul(A1[i], rhs[i]);
+        t6 += cj1.pmul(A2[i], rhs[i]);
+        t7 += cj1.pmul(A3[i], rhs[i]);
+      }
+
+      // Main vectorized loop: 4 matrix column loads, 1 rhs load, 1 result load/store
+      const Scalar* EIGEN_RESTRICT a0It = A0 + alignedStart;
+      const Scalar* EIGEN_RESTRICT a1It = A1 + alignedStart;
+      const Scalar* EIGEN_RESTRICT a2It = A2 + alignedStart;
+      const Scalar* EIGEN_RESTRICT a3It = A3 + alignedStart;
+      const Scalar* EIGEN_RESTRICT rhsIt = rhs + alignedStart;
+      Scalar* EIGEN_RESTRICT resIt = res + alignedStart;
+      for (Index i = alignedStart; i < alignedEnd; i += PacketSize) {
+        Packet A0i = ploadu<Packet>(a0It);
+        a0It += PacketSize;
+        Packet A1i = ploadu<Packet>(a1It);
+        a1It += PacketSize;
+        Packet A2i = ploadu<Packet>(a2It);
+        a2It += PacketSize;
+        Packet A3i = ploadu<Packet>(a3It);
+        a3It += PacketSize;
+        Packet Bi = ploadu<Packet>(rhsIt);
+        rhsIt += PacketSize;
+        Packet Xi = pload<Packet>(resIt);
+
+        Xi = pcj0.pmadd(A0i, ptmp0, Xi);
+        Xi = pcj0.pmadd(A1i, ptmp1, Xi);
+        Xi = pcj0.pmadd(A2i, ptmp2, Xi);
+        Xi = pcj0.pmadd(A3i, ptmp3, Xi);
+        pstore(resIt, Xi);
+        resIt += PacketSize;
+
+        ptmp4 = pcj1.pmadd(A0i, Bi, ptmp4);
+        ptmp5 = pcj1.pmadd(A1i, Bi, ptmp5);
+        ptmp6 = pcj1.pmadd(A2i, Bi, ptmp6);
+        ptmp7 = pcj1.pmadd(A3i, Bi, ptmp7);
+      }
+
+      // Post-alignment scalar loop
+      for (Index i = alignedEnd; i < endi; ++i) {
+        res[i] += cj0.pmul(A0[i], t0) + cj0.pmul(A1[i], t1) + cj0.pmul(A2[i], t2) + cj0.pmul(A3[i], t3);
+        t4 += cj1.pmul(A0[i], rhs[i]);
+        t5 += cj1.pmul(A1[i], rhs[i]);
+        t6 += cj1.pmul(A2[i], rhs[i]);
+        t7 += cj1.pmul(A3[i], rhs[i]);
+      }
+
+      res[j] += alpha * (t4 + predux(ptmp4));
+      res[j + 1] += alpha * (t5 + predux(ptmp5));
+      res[j + 2] += alpha * (t6 + predux(ptmp6));
+      res[j + 3] += alpha * (t7 + predux(ptmp7));
    }
-
-    for (Index i = starti; i < alignedStart; ++i) {
-      res[i] += cj0.pmul(A0[i], t0) + cj0.pmul(A1[i], t1);
-      t2 += cj1.pmul(A0[i], rhs[i]);
-      t3 += cj1.pmul(A1[i], rhs[i]);
-    }
-    const Scalar* EIGEN_RESTRICT a0It = A0 + alignedStart;
-    const Scalar* EIGEN_RESTRICT a1It = A1 + alignedStart;
-    const Scalar* EIGEN_RESTRICT rhsIt = rhs + alignedStart;
-    Scalar* EIGEN_RESTRICT resIt = res + alignedStart;
-    for (Index i = alignedStart; i < alignedEnd; i += PacketSize) {
-      Packet A0i = ploadu<Packet>(a0It);
-      a0It += PacketSize;
-      Packet A1i = ploadu<Packet>(a1It);
-      a1It += PacketSize;
-      Packet Bi = ploadu<Packet>(rhsIt);
-      rhsIt += PacketSize;  // FIXME: should be aligned in most cases.
-      Packet Xi = pload<Packet>(resIt);
-
-      Xi = pcj0.pmadd(A0i, ptmp0, pcj0.pmadd(A1i, ptmp1, Xi));
-      ptmp2 = pcj1.pmadd(A0i, Bi, ptmp2);
-      ptmp3 = pcj1.pmadd(A1i, Bi, ptmp3);
-      pstore(resIt, Xi);
-      resIt += PacketSize;
-    }
-    for (Index i = alignedEnd; i < endi; i++) {
-      res[i] += cj0.pmul(A0[i], t0) + cj0.pmul(A1[i], t1);
-      t2 += cj1.pmul(A0[i], rhs[i]);
-      t3 += cj1.pmul(A1[i], rhs[i]);
-    }
-
-    res[j] += alpha * (t2 + predux(ptmp2));
-    res[j + 1] += alpha * (t3 + predux(ptmp3));
  }
-  for (Index j = FirstTriangular ? 0 : bound; j < (FirstTriangular ? bound : size); j++) {
-    const Scalar* EIGEN_RESTRICT A0 = lhs + j * lhsStride;

-    Scalar t1 = cjAlpha * rhs[j];
-    Scalar t2(0);
-    res[j] += cjd.pmul(numext::real(A0[j]), t1);
-    for (Index i = FirstTriangular ? 0 : j + 1; i < (FirstTriangular ? j : size); i++) {
-      res[i] += cj0.pmul(A0[i], t1);
-      t2 += cj1.pmul(A0[i], rhs[i]);
+  // === Phase 2: 2 columns at a time ===
+  {
+    Index jStart = FirstTriangular ? (size - n4 - n2) : n4;
+    Index jEnd = FirstTriangular ? (size - n4) : (n4 + n2);
+
+    for (Index j = jStart; j < jEnd; j += 2) {
+      const Scalar* EIGEN_RESTRICT A0 = lhs + j * lhsStride;
+      const Scalar* EIGEN_RESTRICT A1 = lhs + (j + 1) * lhsStride;
+
+      Scalar t0 = cjAlpha * rhs[j];
+      Packet ptmp0 = pset1<Packet>(t0);
+      Scalar t1 = cjAlpha * rhs[j + 1];
+      Packet ptmp1 = pset1<Packet>(t1);
+
+      Scalar t2(0);
+      Packet ptmp2 = pzero(Packet{});
+      Scalar t3(0);
+      Packet ptmp3 = pzero(Packet{});
+
+      Index starti = FirstTriangular ? 0 : j + 2;
+      Index endi = FirstTriangular ? j : size;
+      Index alignedStart = starti + internal::first_default_aligned(&res[starti], endi - starti);
+      Index alignedEnd = alignedStart + ((endi - alignedStart) / PacketSize) * PacketSize;
+
+      res[j] += cjd.pmul(numext::real(A0[j]), t0);
+      res[j + 1] += cjd.pmul(numext::real(A1[j + 1]), t1);
+      if (FirstTriangular) {
+        res[j] += cj0.pmul(A1[j], t1);
+        t3 += cj1.pmul(A1[j], rhs[j]);
+      } else {
+        res[j + 1] += cj0.pmul(A0[j + 1], t0);
+        t2 += cj1.pmul(A0[j + 1], rhs[j + 1]);
+      }
+
+      for (Index i = starti; i < alignedStart; ++i) {
+        res[i] += cj0.pmul(A0[i], t0) + cj0.pmul(A1[i], t1);
+        t2 += cj1.pmul(A0[i], rhs[i]);
+        t3 += cj1.pmul(A1[i], rhs[i]);
+      }
+      const Scalar* EIGEN_RESTRICT a0It = A0 + alignedStart;
+      const Scalar* EIGEN_RESTRICT a1It = A1 + alignedStart;
+      const Scalar* EIGEN_RESTRICT rhsIt = rhs + alignedStart;
+      Scalar* EIGEN_RESTRICT resIt = res + alignedStart;
+      for (Index i = alignedStart; i < alignedEnd; i += PacketSize) {
+        Packet A0i = ploadu<Packet>(a0It);
+        a0It += PacketSize;
+        Packet A1i = ploadu<Packet>(a1It);
+        a1It += PacketSize;
+        Packet Bi = ploadu<Packet>(rhsIt);
+        rhsIt += PacketSize;
+        Packet Xi = pload<Packet>(resIt);
+
+        Xi = pcj0.pmadd(A0i, ptmp0, pcj0.pmadd(A1i, ptmp1, Xi));
+        ptmp2 = pcj1.pmadd(A0i, Bi, ptmp2);
+        ptmp3 = pcj1.pmadd(A1i, Bi, ptmp3);
+        pstore(resIt, Xi);
+        resIt += PacketSize;
+      }
+      for (Index i = alignedEnd; i < endi; i++) {
+        res[i] += cj0.pmul(A0[i], t0) + cj0.pmul(A1[i], t1);
+        t2 += cj1.pmul(A0[i], rhs[i]);
+        t3 += cj1.pmul(A1[i], rhs[i]);
+      }
+
+      res[j] += alpha * (t2 + predux(ptmp2));
+      res[j + 1] += alpha * (t3 + predux(ptmp3));
+    }
+  }
+
+  // === Phase 3: 1 column at a time ===
+  {
+    Index jStart = FirstTriangular ? 0 : (n4 + n2);
+    Index jEnd = FirstTriangular ? (size - n4 - n2) : size;
+
+    for (Index j = jStart; j < jEnd; j++) {
+      const Scalar* EIGEN_RESTRICT A0 = lhs + j * lhsStride;
+
+      Scalar t1 = cjAlpha * rhs[j];
+      Scalar t2(0);
+      Packet ptmp1 = pset1<Packet>(t1);
+      Packet ptmp2 = pzero(Packet{});
+
+      res[j] += cjd.pmul(numext::real(A0[j]), t1);
+
+      Index starti = FirstTriangular ? 0 : j + 1;
+      Index endi = FirstTriangular ? j : size;
+      Index alignedStart = starti + internal::first_default_aligned(&res[starti], endi - starti);
+      Index alignedEnd = alignedStart + ((endi - alignedStart) / PacketSize) * PacketSize;
+
+      for (Index i = starti; i < alignedStart; ++i) {
+        res[i] += cj0.pmul(A0[i], t1);
+        t2 += cj1.pmul(A0[i], rhs[i]);
+      }
+      const Scalar* EIGEN_RESTRICT a0It = A0 + alignedStart;
+      const Scalar* EIGEN_RESTRICT rhsIt = rhs + alignedStart;
+      Scalar* EIGEN_RESTRICT resIt = res + alignedStart;
+      for (Index i = alignedStart; i < alignedEnd; i += PacketSize) {
+        Packet A0i = ploadu<Packet>(a0It);
+        a0It += PacketSize;
+        Packet Bi = ploadu<Packet>(rhsIt);
+        rhsIt += PacketSize;
+        Packet Xi = pload<Packet>(resIt);
+
+        Xi = pcj0.pmadd(A0i, ptmp1, Xi);
+        pstore(resIt, Xi);
+        resIt += PacketSize;
+
+        ptmp2 = pcj1.pmadd(A0i, Bi, ptmp2);
+      }
+      for (Index i = alignedEnd; i < endi; i++) {
+        res[i] += cj0.pmul(A0[i], t1);
+        t2 += cj1.pmul(A0[i], rhs[i]);
+      }
+      res[j] += alpha * (t2 + predux(ptmp2));
    }
-    res[j] += alpha * t2;
  }
 }

--- a/Eigen/src/Core/products/SelfadjointProduct.h
+++ b/Eigen/src/Core/products/SelfadjointProduct.h
@@ -24,14 +24,104 @@ namespace Eigen {
 template <typename Scalar, typename Index, int UpLo, bool ConjLhs, bool ConjRhs>
 struct selfadjoint_rank1_update<Scalar, Index, ColMajor, UpLo, ConjLhs, ConjRhs> {
  static void run(Index size, Scalar* mat, Index stride, const Scalar* vecX, const Scalar* vecY, const Scalar& alpha) {
-    internal::conj_if<ConjRhs> cj;
-    typedef Map<const Matrix<Scalar, Dynamic, 1> > OtherMap;
-    typedef std::conditional_t<ConjLhs, typename OtherMap::ConjugateReturnType, const OtherMap&> ConjLhsType;
-    for (Index i = 0; i < size; ++i) {
-      Map<Matrix<Scalar, Dynamic, 1> >(mat + stride * i + (UpLo == Lower ? i : 0),
-                                       (UpLo == Lower ? size - i : (i + 1))) +=
-          (alpha * cj(vecY[i])) *
-          ConjLhsType(OtherMap(vecX + (UpLo == Lower ? i : 0), UpLo == Lower ? size - i : (i + 1)));
+    typedef typename internal::packet_traits<Scalar>::type Packet;
+    const Index PacketSize = internal::unpacket_traits<Packet>::size;
+
+    internal::conj_if<ConjRhs> cjy;
+    internal::conj_if<ConjLhs> cjx;
+    internal::conj_helper<Packet, Packet, ConjLhs, false> pcj;
+
+    // Process 2 columns at a time to share vecX loads and reduce loop overhead.
+    Index j = 0;
+    for (; j + 1 < size; j += 2) {
+      Scalar s0 = alpha * cjy(vecY[j]);
+      Scalar s1 = alpha * cjy(vecY[j + 1]);
+      Packet ps0 = internal::pset1<Packet>(s0);
+      Packet ps1 = internal::pset1<Packet>(s1);
+
+      if (UpLo == Lower) {
+        Scalar* EIGEN_RESTRICT col0 = mat + stride * j + j;
+        Scalar* EIGEN_RESTRICT col1 = mat + stride * (j + 1) + (j + 1);
+
+        // Diagonal and cross-diagonal scalar elements
+        col0[0] += s0 * cjx(vecX[j]);
+        col0[1] += s0 * cjx(vecX[j + 1]);
+        col1[0] += s1 * cjx(vecX[j + 1]);
+
+        // Shared vectorized loop for rows j+2..size-1
+        Index len = size - j - 2;
+        const Scalar* EIGEN_RESTRICT xp = vecX + j + 2;
+        Scalar* EIGEN_RESTRICT d0 = col0 + 2;
+        Scalar* EIGEN_RESTRICT d1 = col1 + 1;
+
+        Index k = 0;
+        Index vectorizedEnd = (len / PacketSize) * PacketSize;
+        for (; k < vectorizedEnd; k += PacketSize) {
+          Packet xi = internal::ploadu<Packet>(xp + k);
+          Packet m0 = internal::ploadu<Packet>(d0 + k);
+          m0 = pcj.pmadd(xi, ps0, m0);
+          internal::pstoreu(d0 + k, m0);
+          Packet m1 = internal::ploadu<Packet>(d1 + k);
+          m1 = pcj.pmadd(xi, ps1, m1);
+          internal::pstoreu(d1 + k, m1);
+        }
+        for (; k < len; ++k) {
+          Scalar cx = cjx(xp[k]);
+          d0[k] += s0 * cx;
+          d1[k] += s1 * cx;
+        }
+      } else {
+        // UpLo == Upper
+        Scalar* EIGEN_RESTRICT col0 = mat + stride * j;
+        Scalar* EIGEN_RESTRICT col1 = mat + stride * (j + 1);
+
+        // Shared vectorized loop for rows 0..j-1
+        const Scalar* EIGEN_RESTRICT xp = vecX;
+        Index len = j;
+        Index k = 0;
+        Index vectorizedEnd = (len / PacketSize) * PacketSize;
+        for (; k < vectorizedEnd; k += PacketSize) {
+          Packet xi = internal::ploadu<Packet>(xp + k);
+          Packet m0 = internal::ploadu<Packet>(col0 + k);
+          Packet m1 = internal::ploadu<Packet>(col1 + k);
+          m0 = pcj.pmadd(xi, ps0, m0);
+          m1 = pcj.pmadd(xi, ps1, m1);
+          internal::pstoreu(col0 + k, m0);
+          internal::pstoreu(col1 + k, m1);
+        }
+        for (; k < len; ++k) {
+          Scalar cx = cjx(xp[k]);
+          col0[k] += s0 * cx;
+          col1[k] += s1 * cx;
+        }
+
+        // Diagonal and cross-diagonal scalar elements
+        col0[j] += s0 * cjx(vecX[j]);
+        col1[j] += s1 * cjx(vecX[j]);
+        col1[j + 1] += s1 * cjx(vecX[j + 1]);
+      }
+    }
+
+    // Handle last column if size is odd
+    if (j < size) {
+      Scalar s = alpha * cjy(vecY[j]);
+      Packet ps = internal::pset1<Packet>(s);
+      Index start = UpLo == Lower ? j : 0;
+      Index len = UpLo == Lower ? size - j : j + 1;
+      Scalar* EIGEN_RESTRICT dst = mat + stride * j + start;
+      const Scalar* EIGEN_RESTRICT xp = vecX + start;
+
+      Index k = 0;
+      Index vectorizedEnd = (len / PacketSize) * PacketSize;
+      for (; k < vectorizedEnd; k += PacketSize) {
+        Packet xi = internal::ploadu<Packet>(xp + k);
+        Packet di = internal::ploadu<Packet>(dst + k);
+        di = pcj.pmadd(xi, ps, di);
+        internal::pstoreu(dst + k, di);
+      }
+      for (; k < len; ++k) {
+        dst[k] += s * cjx(xp[k]);
+      }
    }
  }
 };
--- a/Eigen/src/Core/products/SelfadjointRank2Update.h
+++ b/Eigen/src/Core/products/SelfadjointRank2Update.h
@@ -21,29 +21,168 @@ namespace internal {
 * It corresponds to the Level2 syr2 BLAS routine
 */

-template <typename Scalar, typename Index, typename UType, typename VType, int UpLo>
+template <typename Scalar, typename Index, int UpLo>
 struct selfadjoint_rank2_update_selector;

-template <typename Scalar, typename Index, typename UType, typename VType>
-struct selfadjoint_rank2_update_selector<Scalar, Index, UType, VType, Lower> {
-  static EIGEN_DEVICE_FUNC void run(Scalar* mat, Index stride, const UType& u, const VType& v, const Scalar& alpha) {
-    const Index size = u.size();
-    for (Index i = 0; i < size; ++i) {
-      Map<Matrix<Scalar, Dynamic, 1>>(mat + stride * i + i, size - i) +=
-          (numext::conj(alpha) * numext::conj(u.coeff(i))) * v.tail(size - i) +
-          (alpha * numext::conj(v.coeff(i))) * u.tail(size - i);
+template <typename Scalar, typename Index>
+struct selfadjoint_rank2_update_selector<Scalar, Index, Lower> {
+  static void run(Index size, Scalar* mat, Index stride, const Scalar* u, const Scalar* v, const Scalar& alpha) {
+    typedef typename packet_traits<Scalar>::type Packet;
+    const Index PacketSize = unpacket_traits<Packet>::size;
+    const Scalar cAlpha = numext::conj(alpha);
+
+    // Process 2 columns at a time to share u/v loads and reduce loop overhead.
+    Index j = 0;
+    for (; j + 1 < size; j += 2) {
+      // Scale factors: col[j:] += s0u * v[j:] + s0v * u[j:]
+      Scalar s0u = cAlpha * numext::conj(u[j]);
+      Scalar s0v = alpha * numext::conj(v[j]);
+      Scalar s1u = cAlpha * numext::conj(u[j + 1]);
+      Scalar s1v = alpha * numext::conj(v[j + 1]);
+
+      Packet ps0u = pset1<Packet>(s0u);
+      Packet ps0v = pset1<Packet>(s0v);
+      Packet ps1u = pset1<Packet>(s1u);
+      Packet ps1v = pset1<Packet>(s1v);
+
+      Scalar* EIGEN_RESTRICT col0 = mat + stride * j + j;
+      Scalar* EIGEN_RESTRICT col1 = mat + stride * (j + 1) + (j + 1);
+
+      // Diagonal and cross-diagonal scalar elements
+      col0[0] += s0u * v[j] + s0v * u[j];
+      col0[1] += s0u * v[j + 1] + s0v * u[j + 1];
+      col1[0] += s1u * v[j + 1] + s1v * u[j + 1];
+
+      // Shared vectorized loop for rows j+2..size-1
+      Index len = size - j - 2;
+      const Scalar* EIGEN_RESTRICT up = u + j + 2;
+      const Scalar* EIGEN_RESTRICT vp = v + j + 2;
+      Scalar* EIGEN_RESTRICT d0 = col0 + 2;
+      Scalar* EIGEN_RESTRICT d1 = col1 + 1;
+
+      Index k = 0;
+      Index vectorizedEnd = (len / PacketSize) * PacketSize;
+      for (; k < vectorizedEnd; k += PacketSize) {
+        Packet ui = ploadu<Packet>(up + k);
+        Packet vi = ploadu<Packet>(vp + k);
+        Packet m0 = ploadu<Packet>(d0 + k);
+        m0 = pmadd(vi, ps0u, m0);
+        m0 = pmadd(ui, ps0v, m0);
+        pstoreu(d0 + k, m0);
+        Packet m1 = ploadu<Packet>(d1 + k);
+        m1 = pmadd(vi, ps1u, m1);
+        m1 = pmadd(ui, ps1v, m1);
+        pstoreu(d1 + k, m1);
+      }
+      for (; k < len; ++k) {
+        d0[k] += s0u * vp[k] + s0v * up[k];
+        d1[k] += s1u * vp[k] + s1v * up[k];
+      }
+    }
+
+    // Handle last column if size is odd
+    if (j < size) {
+      Scalar su = cAlpha * numext::conj(u[j]);
+      Scalar sv = alpha * numext::conj(v[j]);
+      Packet psu = pset1<Packet>(su);
+      Packet psv = pset1<Packet>(sv);
+
+      Scalar* EIGEN_RESTRICT dst = mat + stride * j + j;
+      const Scalar* EIGEN_RESTRICT up = u + j;
+      const Scalar* EIGEN_RESTRICT vp = v + j;
+      Index len = size - j;
+
+      Index k = 0;
+      Index vectorizedEnd = (len / PacketSize) * PacketSize;
+      for (; k < vectorizedEnd; k += PacketSize) {
+        Packet ui = ploadu<Packet>(up + k);
+        Packet vi = ploadu<Packet>(vp + k);
+        Packet di = ploadu<Packet>(dst + k);
+        di = pmadd(vi, psu, di);
+        di = pmadd(ui, psv, di);
+        pstoreu(dst + k, di);
+      }
+      for (; k < len; ++k) {
+        dst[k] += su * vp[k] + sv * up[k];
+      }
    }
  }
 };

-template <typename Scalar, typename Index, typename UType, typename VType>
-struct selfadjoint_rank2_update_selector<Scalar, Index, UType, VType, Upper> {
-  static void run(Scalar* mat, Index stride, const UType& u, const VType& v, const Scalar& alpha) {
-    const Index size = u.size();
-    for (Index i = 0; i < size; ++i)
-      Map<Matrix<Scalar, Dynamic, 1>>(mat + stride * i, i + 1) +=
-          (numext::conj(alpha) * numext::conj(u.coeff(i))) * v.head(i + 1) +
-          (alpha * numext::conj(v.coeff(i))) * u.head(i + 1);
+template <typename Scalar, typename Index>
+struct selfadjoint_rank2_update_selector<Scalar, Index, Upper> {
+  static void run(Index size, Scalar* mat, Index stride, const Scalar* u, const Scalar* v, const Scalar& alpha) {
+    typedef typename packet_traits<Scalar>::type Packet;
+    const Index PacketSize = unpacket_traits<Packet>::size;
+    const Scalar cAlpha = numext::conj(alpha);
+
+    // Process 2 columns at a time to share u/v loads and reduce loop overhead.
+    Index j = 0;
+    for (; j + 1 < size; j += 2) {
+      Scalar s0u = cAlpha * numext::conj(u[j]);
+      Scalar s0v = alpha * numext::conj(v[j]);
+      Scalar s1u = cAlpha * numext::conj(u[j + 1]);
+      Scalar s1v = alpha * numext::conj(v[j + 1]);
+
+      Packet ps0u = pset1<Packet>(s0u);
+      Packet ps0v = pset1<Packet>(s0v);
+      Packet ps1u = pset1<Packet>(s1u);
+      Packet ps1v = pset1<Packet>(s1v);
+
+      Scalar* EIGEN_RESTRICT col0 = mat + stride * j;
+      Scalar* EIGEN_RESTRICT col1 = mat + stride * (j + 1);
+
+      // Shared vectorized loop for rows 0..j-1
+      Index len = j;
+      Index k = 0;
+      Index vectorizedEnd = (len / PacketSize) * PacketSize;
+      for (; k < vectorizedEnd; k += PacketSize) {
+        Packet ui = ploadu<Packet>(u + k);
+        Packet vi = ploadu<Packet>(v + k);
+        Packet m0 = ploadu<Packet>(col0 + k);
+        m0 = pmadd(vi, ps0u, m0);
+        m0 = pmadd(ui, ps0v, m0);
+        pstoreu(col0 + k, m0);
+        Packet m1 = ploadu<Packet>(col1 + k);
+        m1 = pmadd(vi, ps1u, m1);
+        m1 = pmadd(ui, ps1v, m1);
+        pstoreu(col1 + k, m1);
+      }
+      for (; k < len; ++k) {
+        col0[k] += s0u * v[k] + s0v * u[k];
+        col1[k] += s1u * v[k] + s1v * u[k];
+      }
+
+      // Diagonal and cross-diagonal scalar elements
+      col0[j] += s0u * v[j] + s0v * u[j];
+      col1[j] += s1u * v[j] + s1v * u[j];
+      col1[j + 1] += s1u * v[j + 1] + s1v * u[j + 1];
+    }
+
+    // Handle last column if size is odd
+    if (j < size) {
+      Scalar su = cAlpha * numext::conj(u[j]);
+      Scalar sv = alpha * numext::conj(v[j]);
+      Packet psu = pset1<Packet>(su);
+      Packet psv = pset1<Packet>(sv);
+
+      Scalar* EIGEN_RESTRICT dst = mat + stride * j;
+      Index len = j + 1;
+
+      Index k = 0;
+      Index vectorizedEnd = (len / PacketSize) * PacketSize;
+      for (; k < vectorizedEnd; k += PacketSize) {
+        Packet ui = ploadu<Packet>(u + k);
+        Packet vi = ploadu<Packet>(v + k);
+        Packet di = ploadu<Packet>(dst + k);
+        di = pmadd(vi, psu, di);
+        di = pmadd(ui, psv, di);
+        pstoreu(dst + k, di);
+      }
+      for (; k < len; ++k) {
+        dst[k] += su * v[k] + sv * u[k];
+      }
+    }
  }
 };

@@ -69,23 +208,47 @@ EIGEN_DEVICE_FUNC SelfAdjointView<MatrixType, UpLo>& SelfAdjointView<MatrixType,

  // If MatrixType is row major, then we use the routine for lower triangular in the upper triangular case and
  // vice versa, and take the complex conjugate of all coefficients and vector entries.
+  enum {
+    IsRowMajor = (internal::traits<MatrixType>::Flags & RowMajorBit) ? 1 : 0,
+    // Only need to conjugate if complex and the condition triggers
+    NeedConjU = (int(IsRowMajor) ^ int(UBlasTraits::NeedToConjugate)) && NumTraits<Scalar>::IsComplex,
+    NeedConjV = (int(IsRowMajor) ^ int(VBlasTraits::NeedToConjugate)) && NumTraits<Scalar>::IsComplex,
+    UseUDirectly = ActualUType_::InnerStrideAtCompileTime == 1 && !NeedConjU,
+    UseVDirectly = ActualVType_::InnerStrideAtCompileTime == 1 && !NeedConjV
+  };

-  enum { IsRowMajor = (internal::traits<MatrixType>::Flags & RowMajorBit) ? 1 : 0 };
  Scalar actualAlpha = alpha * UBlasTraits::extractScalarFactor(u.derived()) *
                       numext::conj(VBlasTraits::extractScalarFactor(v.derived()));
  if (IsRowMajor) actualAlpha = numext::conj(actualAlpha);

-  typedef internal::remove_all_t<
-      typename internal::conj_expr_if<int(IsRowMajor) ^ int(UBlasTraits::NeedToConjugate), ActualUType_>::type>
-      UType;
-  typedef internal::remove_all_t<
-      typename internal::conj_expr_if<int(IsRowMajor) ^ int(VBlasTraits::NeedToConjugate), ActualVType_>::type>
-      VType;
-  internal::selfadjoint_rank2_update_selector<Scalar, Index, UType, VType,
-                                              (IsRowMajor ? int(UpLo == Upper ? Lower : Upper)
-                                                          : UpLo)>::run(_expression().const_cast_derived().data(),
-                                                                        _expression().outerStride(), UType(actualU),
-                                                                        VType(actualV), actualAlpha);
+  const Index size = u.size();
+
+  // Copy u to contiguous buffer, applying conjugation if needed
+  internal::gemv_static_vector_if<Scalar, DerivedU::SizeAtCompileTime, DerivedU::MaxSizeAtCompileTime, !UseUDirectly>
+      static_u;
+  ei_declare_aligned_stack_constructed_variable(Scalar, uPtr, size,
+                                                (UseUDirectly ? const_cast<Scalar*>(actualU.data()) : static_u.data()));
+  if (!UseUDirectly) {
+    if (NeedConjU)
+      Map<typename ActualUType_::PlainObject>(uPtr, size) = actualU.conjugate();
+    else
+      Map<typename ActualUType_::PlainObject>(uPtr, size) = actualU;
+  }
+
+  // Copy v to contiguous buffer, applying conjugation if needed
+  internal::gemv_static_vector_if<Scalar, DerivedV::SizeAtCompileTime, DerivedV::MaxSizeAtCompileTime, !UseVDirectly>
+      static_v;
+  ei_declare_aligned_stack_constructed_variable(Scalar, vPtr, size,
+                                                (UseVDirectly ? const_cast<Scalar*>(actualV.data()) : static_v.data()));
+  if (!UseVDirectly) {
+    if (NeedConjV)
+      Map<typename ActualVType_::PlainObject>(vPtr, size) = actualV.conjugate();
+    else
+      Map<typename ActualVType_::PlainObject>(vPtr, size) = actualV;
+  }
+
+  internal::selfadjoint_rank2_update_selector<Scalar, Index, (IsRowMajor ? int(UpLo == Upper ? Lower : Upper) : UpLo)>::
+      run(size, _expression().const_cast_derived().data(), _expression().outerStride(), uPtr, vPtr, actualAlpha);

  return *this;
 }
--- a/Eigen/src/Core/products/TriangularMatrixVector.h
+++ b/Eigen/src/Core/products/TriangularMatrixVector.h
@@ -43,43 +43,102 @@ EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index, Mode, LhsScalar,
  Index rows = IsLower ? _rows : (std::min)(_rows, _cols);
  Index cols = IsLower ? (std::min)(_rows, _cols) : _cols;

-  typedef Map<const Matrix<LhsScalar, Dynamic, Dynamic, ColMajor>, 0, OuterStride<> > LhsMap;
-  const LhsMap lhs(lhs_, rows, cols, OuterStride<>(lhsStride));
-  typename conj_expr_if<ConjLhs, LhsMap>::type cjLhs(lhs);
-
-  typedef Map<const Matrix<RhsScalar, Dynamic, 1>, 0, InnerStride<> > RhsMap;
-  const RhsMap rhs(rhs_, cols, InnerStride<>(rhsIncr));
-  typename conj_expr_if<ConjRhs, RhsMap>::type cjRhs(rhs);
-
-  typedef Map<Matrix<ResScalar, Dynamic, 1> > ResMap;
-  ResMap res(res_, rows);
-
  typedef const_blas_data_mapper<LhsScalar, Index, ColMajor> LhsMapper;
  typedef const_blas_data_mapper<RhsScalar, Index, RowMajor> RhsMapper;

+  conj_if<ConjLhs> cjl;
+  conj_if<ConjRhs> cjr;
+
  for (Index pi = 0; pi < size; pi += PanelWidth) {
    Index actualPanelWidth = (std::min)(PanelWidth, size - pi);
-    for (Index k = 0; k < actualPanelWidth; ++k) {
-      Index i = pi + k;
-      Index s = IsLower ? ((HasUnitDiag || HasZeroDiag) ? i + 1 : i) : pi;
-      Index r = IsLower ? actualPanelWidth - k : k + 1;
-      if ((!(HasUnitDiag || HasZeroDiag)) || (--r) > 0)
-        res.segment(s, r) += (alpha * cjRhs.coeff(i)) * cjLhs.col(i).segment(s, r);
-      if (HasUnitDiag) res.coeffRef(i) += alpha * cjRhs.coeff(i);
+
+    // Process the triangular panel using raw pointer operations with 2-column batching
+    // to eliminate expression template overhead and share result loads/stores.
+    if (IsLower) {
+      Index k = 0;
+      for (; k + 1 < actualPanelWidth; k += 2) {
+        Index i0 = pi + k;
+        Index i1 = i0 + 1;
+        ResScalar s0 = alpha * cjr(rhs_[i0 * rhsIncr]);
+        ResScalar s1 = alpha * cjr(rhs_[i1 * rhsIncr]);
+        const LhsScalar* EIGEN_RESTRICT c0 = lhs_ + i0 * lhsStride;
+        const LhsScalar* EIGEN_RESTRICT c1 = lhs_ + i1 * lhsStride;
+
+        // Diagonal of column 0
+        if (!(HasUnitDiag || HasZeroDiag)) res_[i0] += s0 * cjl(c0[i0]);
+        // Row i1: contribution from column 0 + diagonal of column 1
+        {
+          ResScalar r1 = s0 * cjl(c0[i1]);
+          if (!(HasUnitDiag || HasZeroDiag)) r1 += s1 * cjl(c1[i1]);
+          res_[i1] += r1;
+        }
+        // Shared rows where both columns contribute
+        Index panelEnd = pi + actualPanelWidth;
+        for (Index j = i1 + 1; j < panelEnd; ++j) res_[j] += s0 * cjl(c0[j]) + s1 * cjl(c1[j]);
+
+        if (HasUnitDiag) {
+          res_[i0] += s0;
+          res_[i1] += s1;
+        }
+      }
+      if (k < actualPanelWidth) {
+        Index i = pi + k;
+        ResScalar s = alpha * cjr(rhs_[i * rhsIncr]);
+        const LhsScalar* EIGEN_RESTRICT c = lhs_ + i * lhsStride;
+        if (!(HasUnitDiag || HasZeroDiag)) res_[i] += s * cjl(c[i]);
+        if (HasUnitDiag) res_[i] += s;
+      }
+    } else {
+      // Upper triangular: process 2 columns at a time
+      Index k = 0;
+      for (; k + 1 < actualPanelWidth; k += 2) {
+        Index i0 = pi + k;
+        Index i1 = i0 + 1;
+        ResScalar s0 = alpha * cjr(rhs_[i0 * rhsIncr]);
+        ResScalar s1 = alpha * cjr(rhs_[i1 * rhsIncr]);
+        const LhsScalar* EIGEN_RESTRICT c0 = lhs_ + i0 * lhsStride;
+        const LhsScalar* EIGEN_RESTRICT c1 = lhs_ + i1 * lhsStride;
+
+        // Shared rows before the diagonal block
+        for (Index j = pi; j < i0; ++j) res_[j] += s0 * cjl(c0[j]) + s1 * cjl(c1[j]);
+
+        // Row i0: diagonal of col0 + contribution from col1
+        {
+          ResScalar r0 = s1 * cjl(c1[i0]);
+          if (!(HasUnitDiag || HasZeroDiag)) r0 += s0 * cjl(c0[i0]);
+          res_[i0] += r0;
+        }
+        // Diagonal of column 1
+        if (!(HasUnitDiag || HasZeroDiag)) res_[i1] += s1 * cjl(c1[i1]);
+
+        if (HasUnitDiag) {
+          res_[i0] += s0;
+          res_[i1] += s1;
+        }
+      }
+      if (k < actualPanelWidth) {
+        Index i = pi + k;
+        ResScalar s = alpha * cjr(rhs_[i * rhsIncr]);
+        const LhsScalar* EIGEN_RESTRICT c = lhs_ + i * lhsStride;
+        for (Index j = pi; j < i; ++j) res_[j] += s * cjl(c[j]);
+        if (!(HasUnitDiag || HasZeroDiag)) res_[i] += s * cjl(c[i]);
+        if (HasUnitDiag) res_[i] += s;
+      }
    }
+
+    // Rectangular part: delegate to optimized GEMV
    Index r = IsLower ? rows - pi - actualPanelWidth : pi;
    if (r > 0) {
      Index s = IsLower ? pi + actualPanelWidth : 0;
      general_matrix_vector_product<Index, LhsScalar, LhsMapper, ColMajor, ConjLhs, RhsScalar, RhsMapper, ConjRhs,
-                                    BuiltIn>::run(r, actualPanelWidth, LhsMapper(&lhs.coeffRef(s, pi), lhsStride),
-                                                  RhsMapper(&rhs.coeffRef(pi), rhsIncr), &res.coeffRef(s), resIncr,
-                                                  alpha);
+                                    BuiltIn>::run(r, actualPanelWidth, LhsMapper(&lhs_[pi * lhsStride + s], lhsStride),
+                                                  RhsMapper(&rhs_[pi * rhsIncr], rhsIncr), &res_[s], resIncr, alpha);
    }
  }
  if ((!IsLower) && cols > size) {
    general_matrix_vector_product<Index, LhsScalar, LhsMapper, ColMajor, ConjLhs, RhsScalar, RhsMapper, ConjRhs>::run(
-        rows, cols - size, LhsMapper(&lhs.coeffRef(0, size), lhsStride), RhsMapper(&rhs.coeffRef(size), rhsIncr), res_,
-        resIncr, alpha);
+        rows, cols - size, LhsMapper(&lhs_[size * lhsStride], lhsStride), RhsMapper(&rhs_[size * rhsIncr], rhsIncr),
+        res_, resIncr, alpha);
  }
 }

@@ -105,43 +164,48 @@ EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index, Mode, LhsScalar,
  Index rows = IsLower ? _rows : diagSize;
  Index cols = IsLower ? diagSize : _cols;

-  typedef Map<const Matrix<LhsScalar, Dynamic, Dynamic, RowMajor>, 0, OuterStride<> > LhsMap;
-  const LhsMap lhs(lhs_, rows, cols, OuterStride<>(lhsStride));
-  typename conj_expr_if<ConjLhs, LhsMap>::type cjLhs(lhs);
-
-  typedef Map<const Matrix<RhsScalar, Dynamic, 1> > RhsMap;
-  const RhsMap rhs(rhs_, cols);
-  typename conj_expr_if<ConjRhs, RhsMap>::type cjRhs(rhs);
-
-  typedef Map<Matrix<ResScalar, Dynamic, 1>, 0, InnerStride<> > ResMap;
-  ResMap res(res_, rows, InnerStride<>(resIncr));
-
  typedef const_blas_data_mapper<LhsScalar, Index, RowMajor> LhsMapper;
  typedef const_blas_data_mapper<RhsScalar, Index, RowMajor> RhsMapper;

+  conj_if<ConjLhs> cjl;
+  conj_if<ConjRhs> cjr;
+
  for (Index pi = 0; pi < diagSize; pi += PanelWidth) {
    Index actualPanelWidth = (std::min)(PanelWidth, diagSize - pi);
+
+    // Process the triangular panel using raw dot products to eliminate
+    // the cwiseProduct().sum() expression template overhead.
    for (Index k = 0; k < actualPanelWidth; ++k) {
      Index i = pi + k;
-      Index s = IsLower ? pi : ((HasUnitDiag || HasZeroDiag) ? i + 1 : i);
-      Index r = IsLower ? k + 1 : actualPanelWidth - k;
-      if ((!(HasUnitDiag || HasZeroDiag)) || (--r) > 0)
-        res.coeffRef(i) += alpha * (cjLhs.row(i).segment(s, r).cwiseProduct(cjRhs.segment(s, r).transpose())).sum();
-      if (HasUnitDiag) res.coeffRef(i) += alpha * cjRhs.coeff(i);
+      const LhsScalar* EIGEN_RESTRICT row_i = lhs_ + i * lhsStride;
+      ResScalar dot = ResScalar(0);
+
+      if (IsLower) {
+        Index s = pi;
+        Index len = (HasUnitDiag || HasZeroDiag) ? k : k + 1;
+        for (Index j = 0; j < len; ++j) dot += cjl(row_i[s + j]) * cjr(rhs_[s + j]);
+      } else {
+        Index s = (HasUnitDiag || HasZeroDiag) ? i + 1 : i;
+        Index len = pi + actualPanelWidth - s;
+        for (Index j = 0; j < len; ++j) dot += cjl(row_i[s + j]) * cjr(rhs_[s + j]);
+      }
+      res_[i * resIncr] += alpha * dot;
+      if (HasUnitDiag) res_[i * resIncr] += alpha * cjr(rhs_[i]);
    }
+
+    // Rectangular part: delegate to optimized GEMV
    Index r = IsLower ? pi : cols - pi - actualPanelWidth;
    if (r > 0) {
      Index s = IsLower ? 0 : pi + actualPanelWidth;
      general_matrix_vector_product<Index, LhsScalar, LhsMapper, RowMajor, ConjLhs, RhsScalar, RhsMapper, ConjRhs,
-                                    BuiltIn>::run(actualPanelWidth, r, LhsMapper(&lhs.coeffRef(pi, s), lhsStride),
-                                                  RhsMapper(&rhs.coeffRef(s), rhsIncr), &res.coeffRef(pi), resIncr,
-                                                  alpha);
+                                    BuiltIn>::run(actualPanelWidth, r, LhsMapper(&lhs_[pi * lhsStride + s], lhsStride),
+                                                  RhsMapper(&rhs_[s], rhsIncr), &res_[pi * resIncr], resIncr, alpha);
    }
  }
  if (IsLower && rows > diagSize) {
    general_matrix_vector_product<Index, LhsScalar, LhsMapper, RowMajor, ConjLhs, RhsScalar, RhsMapper, ConjRhs>::run(
-        rows - diagSize, cols, LhsMapper(&lhs.coeffRef(diagSize, 0), lhsStride), RhsMapper(&rhs.coeffRef(0), rhsIncr),
-        &res.coeffRef(diagSize), resIncr, alpha);
+        rows - diagSize, cols, LhsMapper(&lhs_[diagSize * lhsStride], lhsStride), RhsMapper(rhs_, rhsIncr),
+        &res_[diagSize * resIncr], resIncr, alpha);
  }
 }

--- a/benchmarks/Core/CMakeLists.txt
+++ b/benchmarks/Core/CMakeLists.txt
@@ -14,6 +14,10 @@ eigen_add_benchmark(bench_map bench_map.cpp)
 eigen_add_benchmark(bench_diagonal bench_diagonal.cpp)
 eigen_add_benchmark(bench_triangular_product bench_triangular_product.cpp)
 eigen_add_benchmark(bench_selfadjoint_product bench_selfadjoint_product.cpp)
+eigen_add_benchmark(bench_symv bench_symv.cpp)
+eigen_add_benchmark(bench_trmv bench_trmv.cpp)
+eigen_add_benchmark(bench_syr bench_syr.cpp)
+eigen_add_benchmark(bench_syr2 bench_syr2.cpp)
 eigen_add_benchmark(bench_construction bench_construction.cpp)
 eigen_add_benchmark(bench_fixed_size bench_fixed_size.cpp)
 eigen_add_benchmark(bench_fixed_size_double bench_fixed_size.cpp DEFINITIONS SCALAR=double)
--- a/benchmarks/Core/bench_symv.cpp
+++ b/benchmarks/Core/bench_symv.cpp
@@ -0,0 +1,62 @@
+// Benchmarks for selfadjoint matrix-vector product (SYMV/HEMV).
+//
+// Tests y += selfadjointView(A) * x for various sizes and scalar types.
+// Exercises SelfadjointMatrixVector.h kernel.
+
+#include <benchmark/benchmark.h>
+#include <Eigen/Core>
+
+using namespace Eigen;
+
+template <typename Scalar>
+double symvFlops(Index n) {
+  // SYMV uses n^2 multiply-adds (exploiting symmetry)
+  return (NumTraits<Scalar>::IsComplex ? 8.0 : 2.0) * n * n;
+}
+
+// y += selfadjointView<Lower>(A) * x
+template <typename Scalar>
+static void BM_SYMV_Lower(benchmark::State& state) {
+  const Index n = state.range(0);
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+  using Vec = Matrix<Scalar, Dynamic, 1>;
+  Mat A = Mat::Random(n, n);
+  A = (A + A.transpose().eval()) / Scalar(2);
+  Vec x = Vec::Random(n);
+  Vec y = Vec::Random(n);
+  for (auto _ : state) {
+    y.noalias() += A.template selfadjointView<Lower>() * x;
+    benchmark::DoNotOptimize(y.data());
+    benchmark::ClobberMemory();
+  }
+  state.counters["GFLOPS"] = benchmark::Counter(symvFlops<Scalar>(n), benchmark::Counter::kIsIterationInvariantRate,
+                                                benchmark::Counter::kIs1000);
+}
+
+// y += selfadjointView<Upper>(A) * x
+template <typename Scalar>
+static void BM_SYMV_Upper(benchmark::State& state) {
+  const Index n = state.range(0);
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+  using Vec = Matrix<Scalar, Dynamic, 1>;
+  Mat A = Mat::Random(n, n);
+  A = (A + A.transpose().eval()) / Scalar(2);
+  Vec x = Vec::Random(n);
+  Vec y = Vec::Random(n);
+  for (auto _ : state) {
+    y.noalias() += A.template selfadjointView<Upper>() * x;
+    benchmark::DoNotOptimize(y.data());
+    benchmark::ClobberMemory();
+  }
+  state.counters["GFLOPS"] = benchmark::Counter(symvFlops<Scalar>(n), benchmark::Counter::kIsIterationInvariantRate,
+                                                benchmark::Counter::kIs1000);
+}
+
+static void SymvSizes(::benchmark::Benchmark* b) {
+  for (int n : {8, 16, 32, 64, 128, 256, 512, 1024, 2048}) b->Arg(n);
+}
+
+BENCHMARK(BM_SYMV_Lower<float>)->Apply(SymvSizes)->Name("SYMV_Lower_float");
+BENCHMARK(BM_SYMV_Lower<double>)->Apply(SymvSizes)->Name("SYMV_Lower_double");
+BENCHMARK(BM_SYMV_Upper<float>)->Apply(SymvSizes)->Name("SYMV_Upper_float");
+BENCHMARK(BM_SYMV_Upper<double>)->Apply(SymvSizes)->Name("SYMV_Upper_double");
--- a/benchmarks/Core/bench_syr.cpp
+++ b/benchmarks/Core/bench_syr.cpp
@@ -0,0 +1,59 @@
+// Benchmarks for symmetric rank-1 update (SYR).
+//
+// Tests C.selfadjointView<Lower>().rankUpdate(v, alpha) which computes
+// C += alpha * v * v^T, updating only the lower (or upper) triangle.
+// Exercises SelfadjointProduct.h / selfadjoint_rank1_update.
+
+#include <benchmark/benchmark.h>
+#include <Eigen/Core>
+
+using namespace Eigen;
+
+template <typename Scalar>
+double syrFlops(Index n) {
+  // SYR: n*(n+1)/2 multiply-adds ~ n^2
+  return (NumTraits<Scalar>::IsComplex ? 8.0 : 2.0) * n * (n + 1) / 2;
+}
+
+template <typename Scalar>
+static void BM_SYR_Lower(benchmark::State& state) {
+  const Index n = state.range(0);
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+  using Vec = Matrix<Scalar, Dynamic, 1>;
+  Vec v = Vec::Random(n);
+  Mat C = Mat::Zero(n, n);
+  Scalar alpha(1);
+  for (auto _ : state) {
+    C.template selfadjointView<Lower>().rankUpdate(v, alpha);
+    benchmark::DoNotOptimize(C.data());
+    benchmark::ClobberMemory();
+  }
+  state.counters["GFLOPS"] = benchmark::Counter(syrFlops<Scalar>(n), benchmark::Counter::kIsIterationInvariantRate,
+                                                benchmark::Counter::kIs1000);
+}
+
+template <typename Scalar>
+static void BM_SYR_Upper(benchmark::State& state) {
+  const Index n = state.range(0);
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+  using Vec = Matrix<Scalar, Dynamic, 1>;
+  Vec v = Vec::Random(n);
+  Mat C = Mat::Zero(n, n);
+  Scalar alpha(1);
+  for (auto _ : state) {
+    C.template selfadjointView<Upper>().rankUpdate(v, alpha);
+    benchmark::DoNotOptimize(C.data());
+    benchmark::ClobberMemory();
+  }
+  state.counters["GFLOPS"] = benchmark::Counter(syrFlops<Scalar>(n), benchmark::Counter::kIsIterationInvariantRate,
+                                                benchmark::Counter::kIs1000);
+}
+
+static void SyrSizes(::benchmark::Benchmark* b) {
+  for (int n : {8, 16, 32, 64, 128, 256, 512, 1024, 2048}) b->Arg(n);
+}
+
+BENCHMARK(BM_SYR_Lower<float>)->Apply(SyrSizes)->Name("SYR_Lower_float");
+BENCHMARK(BM_SYR_Lower<double>)->Apply(SyrSizes)->Name("SYR_Lower_double");
+BENCHMARK(BM_SYR_Upper<float>)->Apply(SyrSizes)->Name("SYR_Upper_float");
+BENCHMARK(BM_SYR_Upper<double>)->Apply(SyrSizes)->Name("SYR_Upper_double");
--- a/benchmarks/Core/bench_syr2.cpp
+++ b/benchmarks/Core/bench_syr2.cpp
@@ -0,0 +1,61 @@
+// Benchmarks for symmetric rank-2 update (SYR2).
+//
+// Tests C.selfadjointView<Lower>().rankUpdate(u, v, alpha) which computes
+// C += alpha * u * v^T + conj(alpha) * v * u^T.
+// Exercises SelfadjointRank2Update.h.
+
+#include <benchmark/benchmark.h>
+#include <Eigen/Core>
+
+using namespace Eigen;
+
+template <typename Scalar>
+double syr2Flops(Index n) {
+  // SYR2: 2 * n*(n+1)/2 multiply-adds ~ 2*n^2
+  return (NumTraits<Scalar>::IsComplex ? 8.0 : 2.0) * 2 * n * (n + 1) / 2;
+}
+
+template <typename Scalar>
+static void BM_SYR2_Lower(benchmark::State& state) {
+  const Index n = state.range(0);
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+  using Vec = Matrix<Scalar, Dynamic, 1>;
+  Vec u = Vec::Random(n);
+  Vec v = Vec::Random(n);
+  Mat C = Mat::Zero(n, n);
+  Scalar alpha(1);
+  for (auto _ : state) {
+    C.template selfadjointView<Lower>().rankUpdate(u, v, alpha);
+    benchmark::DoNotOptimize(C.data());
+    benchmark::ClobberMemory();
+  }
+  state.counters["GFLOPS"] = benchmark::Counter(syr2Flops<Scalar>(n), benchmark::Counter::kIsIterationInvariantRate,
+                                                benchmark::Counter::kIs1000);
+}
+
+template <typename Scalar>
+static void BM_SYR2_Upper(benchmark::State& state) {
+  const Index n = state.range(0);
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+  using Vec = Matrix<Scalar, Dynamic, 1>;
+  Vec u = Vec::Random(n);
+  Vec v = Vec::Random(n);
+  Mat C = Mat::Zero(n, n);
+  Scalar alpha(1);
+  for (auto _ : state) {
+    C.template selfadjointView<Upper>().rankUpdate(u, v, alpha);
+    benchmark::DoNotOptimize(C.data());
+    benchmark::ClobberMemory();
+  }
+  state.counters["GFLOPS"] = benchmark::Counter(syr2Flops<Scalar>(n), benchmark::Counter::kIsIterationInvariantRate,
+                                                benchmark::Counter::kIs1000);
+}
+
+static void Syr2Sizes(::benchmark::Benchmark* b) {
+  for (int n : {8, 16, 32, 64, 128, 256, 512, 1024, 2048}) b->Arg(n);
+}
+
+BENCHMARK(BM_SYR2_Lower<float>)->Apply(Syr2Sizes)->Name("SYR2_Lower_float");
+BENCHMARK(BM_SYR2_Lower<double>)->Apply(Syr2Sizes)->Name("SYR2_Lower_double");
+BENCHMARK(BM_SYR2_Upper<float>)->Apply(Syr2Sizes)->Name("SYR2_Upper_float");
+BENCHMARK(BM_SYR2_Upper<double>)->Apply(Syr2Sizes)->Name("SYR2_Upper_double");
--- a/benchmarks/Core/bench_trmv.cpp
+++ b/benchmarks/Core/bench_trmv.cpp
@@ -0,0 +1,46 @@
+// Benchmarks for triangular matrix-vector product (TRMV).
+//
+// Tests y += triangularView(A) * x for various modes and sizes.
+// Exercises TriangularMatrixVector.h kernel.
+
+#include <benchmark/benchmark.h>
+#include <Eigen/Core>
+
+using namespace Eigen;
+
+template <typename Scalar>
+double trmvFlops(Index n) {
+  // TRMV: ~n^2 multiply-adds
+  return (NumTraits<Scalar>::IsComplex ? 8.0 : 2.0) * n * n;
+}
+
+// y = triangularView<Mode>(A) * x
+template <typename Scalar, unsigned int Mode>
+static void BM_TRMV(benchmark::State& state) {
+  const Index n = state.range(0);
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+  using Vec = Matrix<Scalar, Dynamic, 1>;
+  Mat A = Mat::Random(n, n);
+  Vec x = Vec::Random(n);
+  Vec y(n);
+  for (auto _ : state) {
+    y.noalias() = A.template triangularView<Mode>() * x;
+    benchmark::DoNotOptimize(y.data());
+    benchmark::ClobberMemory();
+  }
+  state.counters["GFLOPS"] = benchmark::Counter(trmvFlops<Scalar>(n), benchmark::Counter::kIsIterationInvariantRate,
+                                                benchmark::Counter::kIs1000);
+}
+
+static void TrmvSizes(::benchmark::Benchmark* b) {
+  for (int n : {8, 16, 32, 64, 128, 256, 512, 1024, 2048}) b->Arg(n);
+}
+
+BENCHMARK(BM_TRMV<float, Lower>)->Apply(TrmvSizes)->Name("TRMV_float_Lower");
+BENCHMARK(BM_TRMV<float, Upper>)->Apply(TrmvSizes)->Name("TRMV_float_Upper");
+BENCHMARK(BM_TRMV<float, UnitLower>)->Apply(TrmvSizes)->Name("TRMV_float_UnitLower");
+BENCHMARK(BM_TRMV<float, UnitUpper>)->Apply(TrmvSizes)->Name("TRMV_float_UnitUpper");
+BENCHMARK(BM_TRMV<double, Lower>)->Apply(TrmvSizes)->Name("TRMV_double_Lower");
+BENCHMARK(BM_TRMV<double, Upper>)->Apply(TrmvSizes)->Name("TRMV_double_Upper");
+BENCHMARK(BM_TRMV<double, UnitLower>)->Apply(TrmvSizes)->Name("TRMV_double_UnitLower");
+BENCHMARK(BM_TRMV<double, UnitUpper>)->Apply(TrmvSizes)->Name("TRMV_double_UnitUpper");