From 662d5c21ff84359e433799b72738f96063cc60f2 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen <4643818-rmlarsen1@users.noreply.gitlab.com> Date: Sun, 1 Mar 2026 19:40:11 -0800 Subject: [PATCH] Optimize SYMV, SYR, SYR2, and TRMV product kernels libeigen/eigen!2228 Co-authored-by: Rasmus Munk Larsen --- .../Core/products/SelfadjointMatrixVector.h | 306 ++++++++++++++---- Eigen/src/Core/products/SelfadjointProduct.h | 106 +++++- .../Core/products/SelfadjointRank2Update.h | 221 +++++++++++-- .../Core/products/TriangularMatrixVector.h | 152 ++++++--- benchmarks/Core/CMakeLists.txt | 4 + benchmarks/Core/bench_symv.cpp | 62 ++++ benchmarks/Core/bench_syr.cpp | 59 ++++ benchmarks/Core/bench_syr2.cpp | 61 ++++ benchmarks/Core/bench_trmv.cpp | 46 +++ 9 files changed, 866 insertions(+), 151 deletions(-) create mode 100644 benchmarks/Core/bench_symv.cpp create mode 100644 benchmarks/Core/bench_syr.cpp create mode 100644 benchmarks/Core/bench_syr2.cpp create mode 100644 benchmarks/Core/bench_trmv.cpp diff --git a/Eigen/src/Core/products/SelfadjointMatrixVector.h b/Eigen/src/Core/products/SelfadjointMatrixVector.h index 8a84aa45d..f87509bb3 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixVector.h +++ b/Eigen/src/Core/products/SelfadjointMatrixVector.h @@ -18,9 +18,11 @@ namespace Eigen { namespace internal { /* Optimized selfadjoint matrix * vector product: - * This algorithm processes 2 columns at once that allows to both reduce - * the number of load/stores of the result by a factor 2 and to reduce - * the instruction dependency. + * This algorithm processes 4 columns at once to reduce the number of + * load/stores of the result vector by a factor of 4 compared to the + * naive approach, and to increase instruction-level parallelism. + * A 2-column cleanup handles the remaining even columns, and a + * 1-column loop handles any final odd column. */ template (t0); - Scalar t1 = cjAlpha * rhs[j + 1]; - Packet ptmp1 = pset1(t1); + // === Phase 1: 4 columns at a time === + { + Index jStart = FirstTriangular ? (size - n4) : 0; + Index jEnd = FirstTriangular ? size : n4; - Scalar t2(0); - Packet ptmp2 = pset1(t2); - Scalar t3(0); - Packet ptmp3 = pset1(t3); + for (Index j = jStart; j < jEnd; j += 4) { + const Scalar* EIGEN_RESTRICT A0 = lhs + j * lhsStride; + const Scalar* EIGEN_RESTRICT A1 = lhs + (j + 1) * lhsStride; + const Scalar* EIGEN_RESTRICT A2 = lhs + (j + 2) * lhsStride; + const Scalar* EIGEN_RESTRICT A3 = lhs + (j + 3) * lhsStride; - Index starti = FirstTriangular ? 0 : j + 2; - Index endi = FirstTriangular ? j : size; - Index alignedStart = (starti) + internal::first_default_aligned(&res[starti], endi - starti); - Index alignedEnd = alignedStart + ((endi - alignedStart) / (PacketSize)) * (PacketSize); + Scalar t0 = cjAlpha * rhs[j]; + Scalar t1 = cjAlpha * rhs[j + 1]; + Scalar t2 = cjAlpha * rhs[j + 2]; + Scalar t3 = cjAlpha * rhs[j + 3]; + Packet ptmp0 = pset1(t0); + Packet ptmp1 = pset1(t1); + Packet ptmp2 = pset1(t2); + Packet ptmp3 = pset1(t3); - res[j] += cjd.pmul(numext::real(A0[j]), t0); - res[j + 1] += cjd.pmul(numext::real(A1[j + 1]), t1); - if (FirstTriangular) { - res[j] += cj0.pmul(A1[j], t1); - t3 += cj1.pmul(A1[j], rhs[j]); - } else { - res[j + 1] += cj0.pmul(A0[j + 1], t0); - t2 += cj1.pmul(A0[j + 1], rhs[j + 1]); + Scalar t4(0), t5(0), t6(0), t7(0); + Packet ptmp4 = pzero(Packet{}); + Packet ptmp5 = pzero(Packet{}); + Packet ptmp6 = pzero(Packet{}); + Packet ptmp7 = pzero(Packet{}); + + Index starti = FirstTriangular ? 0 : j + 4; + Index endi = FirstTriangular ? j : size; + Index alignedStart = starti + internal::first_default_aligned(&res[starti], endi - starti); + Index alignedEnd = alignedStart + ((endi - alignedStart) / PacketSize) * PacketSize; + + // Handle the 4x4 diagonal block: diagonal elements + res[j] += cjd.pmul(numext::real(A0[j]), t0); + res[j + 1] += cjd.pmul(numext::real(A1[j + 1]), t1); + res[j + 2] += cjd.pmul(numext::real(A2[j + 2]), t2); + res[j + 3] += cjd.pmul(numext::real(A3[j + 3]), t3); + + // Handle the 4x4 diagonal block: off-diagonal cross terms + if (FirstTriangular) { + // Upper triangle stored (A_k[l] for l <= k) + res[j] += cj0.pmul(A1[j], t1) + cj0.pmul(A2[j], t2) + cj0.pmul(A3[j], t3); + res[j + 1] += cj0.pmul(A2[j + 1], t2) + cj0.pmul(A3[j + 1], t3); + res[j + 2] += cj0.pmul(A3[j + 2], t3); + + t5 += cj1.pmul(A1[j], rhs[j]); + t6 += cj1.pmul(A2[j], rhs[j]) + cj1.pmul(A2[j + 1], rhs[j + 1]); + t7 += cj1.pmul(A3[j], rhs[j]) + cj1.pmul(A3[j + 1], rhs[j + 1]) + cj1.pmul(A3[j + 2], rhs[j + 2]); + } else { + // Lower triangle stored (A_k[l] for l >= k) + res[j + 1] += cj0.pmul(A0[j + 1], t0); + res[j + 2] += cj0.pmul(A0[j + 2], t0) + cj0.pmul(A1[j + 2], t1); + res[j + 3] += cj0.pmul(A0[j + 3], t0) + cj0.pmul(A1[j + 3], t1) + cj0.pmul(A2[j + 3], t2); + + t4 += cj1.pmul(A0[j + 1], rhs[j + 1]) + cj1.pmul(A0[j + 2], rhs[j + 2]) + cj1.pmul(A0[j + 3], rhs[j + 3]); + t5 += cj1.pmul(A1[j + 2], rhs[j + 2]) + cj1.pmul(A1[j + 3], rhs[j + 3]); + t6 += cj1.pmul(A2[j + 3], rhs[j + 3]); + } + + // Pre-alignment scalar loop + for (Index i = starti; i < alignedStart; ++i) { + res[i] += cj0.pmul(A0[i], t0) + cj0.pmul(A1[i], t1) + cj0.pmul(A2[i], t2) + cj0.pmul(A3[i], t3); + t4 += cj1.pmul(A0[i], rhs[i]); + t5 += cj1.pmul(A1[i], rhs[i]); + t6 += cj1.pmul(A2[i], rhs[i]); + t7 += cj1.pmul(A3[i], rhs[i]); + } + + // Main vectorized loop: 4 matrix column loads, 1 rhs load, 1 result load/store + const Scalar* EIGEN_RESTRICT a0It = A0 + alignedStart; + const Scalar* EIGEN_RESTRICT a1It = A1 + alignedStart; + const Scalar* EIGEN_RESTRICT a2It = A2 + alignedStart; + const Scalar* EIGEN_RESTRICT a3It = A3 + alignedStart; + const Scalar* EIGEN_RESTRICT rhsIt = rhs + alignedStart; + Scalar* EIGEN_RESTRICT resIt = res + alignedStart; + for (Index i = alignedStart; i < alignedEnd; i += PacketSize) { + Packet A0i = ploadu(a0It); + a0It += PacketSize; + Packet A1i = ploadu(a1It); + a1It += PacketSize; + Packet A2i = ploadu(a2It); + a2It += PacketSize; + Packet A3i = ploadu(a3It); + a3It += PacketSize; + Packet Bi = ploadu(rhsIt); + rhsIt += PacketSize; + Packet Xi = pload(resIt); + + Xi = pcj0.pmadd(A0i, ptmp0, Xi); + Xi = pcj0.pmadd(A1i, ptmp1, Xi); + Xi = pcj0.pmadd(A2i, ptmp2, Xi); + Xi = pcj0.pmadd(A3i, ptmp3, Xi); + pstore(resIt, Xi); + resIt += PacketSize; + + ptmp4 = pcj1.pmadd(A0i, Bi, ptmp4); + ptmp5 = pcj1.pmadd(A1i, Bi, ptmp5); + ptmp6 = pcj1.pmadd(A2i, Bi, ptmp6); + ptmp7 = pcj1.pmadd(A3i, Bi, ptmp7); + } + + // Post-alignment scalar loop + for (Index i = alignedEnd; i < endi; ++i) { + res[i] += cj0.pmul(A0[i], t0) + cj0.pmul(A1[i], t1) + cj0.pmul(A2[i], t2) + cj0.pmul(A3[i], t3); + t4 += cj1.pmul(A0[i], rhs[i]); + t5 += cj1.pmul(A1[i], rhs[i]); + t6 += cj1.pmul(A2[i], rhs[i]); + t7 += cj1.pmul(A3[i], rhs[i]); + } + + res[j] += alpha * (t4 + predux(ptmp4)); + res[j + 1] += alpha * (t5 + predux(ptmp5)); + res[j + 2] += alpha * (t6 + predux(ptmp6)); + res[j + 3] += alpha * (t7 + predux(ptmp7)); } - - for (Index i = starti; i < alignedStart; ++i) { - res[i] += cj0.pmul(A0[i], t0) + cj0.pmul(A1[i], t1); - t2 += cj1.pmul(A0[i], rhs[i]); - t3 += cj1.pmul(A1[i], rhs[i]); - } - const Scalar* EIGEN_RESTRICT a0It = A0 + alignedStart; - const Scalar* EIGEN_RESTRICT a1It = A1 + alignedStart; - const Scalar* EIGEN_RESTRICT rhsIt = rhs + alignedStart; - Scalar* EIGEN_RESTRICT resIt = res + alignedStart; - for (Index i = alignedStart; i < alignedEnd; i += PacketSize) { - Packet A0i = ploadu(a0It); - a0It += PacketSize; - Packet A1i = ploadu(a1It); - a1It += PacketSize; - Packet Bi = ploadu(rhsIt); - rhsIt += PacketSize; // FIXME: should be aligned in most cases. - Packet Xi = pload(resIt); - - Xi = pcj0.pmadd(A0i, ptmp0, pcj0.pmadd(A1i, ptmp1, Xi)); - ptmp2 = pcj1.pmadd(A0i, Bi, ptmp2); - ptmp3 = pcj1.pmadd(A1i, Bi, ptmp3); - pstore(resIt, Xi); - resIt += PacketSize; - } - for (Index i = alignedEnd; i < endi; i++) { - res[i] += cj0.pmul(A0[i], t0) + cj0.pmul(A1[i], t1); - t2 += cj1.pmul(A0[i], rhs[i]); - t3 += cj1.pmul(A1[i], rhs[i]); - } - - res[j] += alpha * (t2 + predux(ptmp2)); - res[j + 1] += alpha * (t3 + predux(ptmp3)); } - for (Index j = FirstTriangular ? 0 : bound; j < (FirstTriangular ? bound : size); j++) { - const Scalar* EIGEN_RESTRICT A0 = lhs + j * lhsStride; - Scalar t1 = cjAlpha * rhs[j]; - Scalar t2(0); - res[j] += cjd.pmul(numext::real(A0[j]), t1); - for (Index i = FirstTriangular ? 0 : j + 1; i < (FirstTriangular ? j : size); i++) { - res[i] += cj0.pmul(A0[i], t1); - t2 += cj1.pmul(A0[i], rhs[i]); + // === Phase 2: 2 columns at a time === + { + Index jStart = FirstTriangular ? (size - n4 - n2) : n4; + Index jEnd = FirstTriangular ? (size - n4) : (n4 + n2); + + for (Index j = jStart; j < jEnd; j += 2) { + const Scalar* EIGEN_RESTRICT A0 = lhs + j * lhsStride; + const Scalar* EIGEN_RESTRICT A1 = lhs + (j + 1) * lhsStride; + + Scalar t0 = cjAlpha * rhs[j]; + Packet ptmp0 = pset1(t0); + Scalar t1 = cjAlpha * rhs[j + 1]; + Packet ptmp1 = pset1(t1); + + Scalar t2(0); + Packet ptmp2 = pzero(Packet{}); + Scalar t3(0); + Packet ptmp3 = pzero(Packet{}); + + Index starti = FirstTriangular ? 0 : j + 2; + Index endi = FirstTriangular ? j : size; + Index alignedStart = starti + internal::first_default_aligned(&res[starti], endi - starti); + Index alignedEnd = alignedStart + ((endi - alignedStart) / PacketSize) * PacketSize; + + res[j] += cjd.pmul(numext::real(A0[j]), t0); + res[j + 1] += cjd.pmul(numext::real(A1[j + 1]), t1); + if (FirstTriangular) { + res[j] += cj0.pmul(A1[j], t1); + t3 += cj1.pmul(A1[j], rhs[j]); + } else { + res[j + 1] += cj0.pmul(A0[j + 1], t0); + t2 += cj1.pmul(A0[j + 1], rhs[j + 1]); + } + + for (Index i = starti; i < alignedStart; ++i) { + res[i] += cj0.pmul(A0[i], t0) + cj0.pmul(A1[i], t1); + t2 += cj1.pmul(A0[i], rhs[i]); + t3 += cj1.pmul(A1[i], rhs[i]); + } + const Scalar* EIGEN_RESTRICT a0It = A0 + alignedStart; + const Scalar* EIGEN_RESTRICT a1It = A1 + alignedStart; + const Scalar* EIGEN_RESTRICT rhsIt = rhs + alignedStart; + Scalar* EIGEN_RESTRICT resIt = res + alignedStart; + for (Index i = alignedStart; i < alignedEnd; i += PacketSize) { + Packet A0i = ploadu(a0It); + a0It += PacketSize; + Packet A1i = ploadu(a1It); + a1It += PacketSize; + Packet Bi = ploadu(rhsIt); + rhsIt += PacketSize; + Packet Xi = pload(resIt); + + Xi = pcj0.pmadd(A0i, ptmp0, pcj0.pmadd(A1i, ptmp1, Xi)); + ptmp2 = pcj1.pmadd(A0i, Bi, ptmp2); + ptmp3 = pcj1.pmadd(A1i, Bi, ptmp3); + pstore(resIt, Xi); + resIt += PacketSize; + } + for (Index i = alignedEnd; i < endi; i++) { + res[i] += cj0.pmul(A0[i], t0) + cj0.pmul(A1[i], t1); + t2 += cj1.pmul(A0[i], rhs[i]); + t3 += cj1.pmul(A1[i], rhs[i]); + } + + res[j] += alpha * (t2 + predux(ptmp2)); + res[j + 1] += alpha * (t3 + predux(ptmp3)); + } + } + + // === Phase 3: 1 column at a time === + { + Index jStart = FirstTriangular ? 0 : (n4 + n2); + Index jEnd = FirstTriangular ? (size - n4 - n2) : size; + + for (Index j = jStart; j < jEnd; j++) { + const Scalar* EIGEN_RESTRICT A0 = lhs + j * lhsStride; + + Scalar t1 = cjAlpha * rhs[j]; + Scalar t2(0); + Packet ptmp1 = pset1(t1); + Packet ptmp2 = pzero(Packet{}); + + res[j] += cjd.pmul(numext::real(A0[j]), t1); + + Index starti = FirstTriangular ? 0 : j + 1; + Index endi = FirstTriangular ? j : size; + Index alignedStart = starti + internal::first_default_aligned(&res[starti], endi - starti); + Index alignedEnd = alignedStart + ((endi - alignedStart) / PacketSize) * PacketSize; + + for (Index i = starti; i < alignedStart; ++i) { + res[i] += cj0.pmul(A0[i], t1); + t2 += cj1.pmul(A0[i], rhs[i]); + } + const Scalar* EIGEN_RESTRICT a0It = A0 + alignedStart; + const Scalar* EIGEN_RESTRICT rhsIt = rhs + alignedStart; + Scalar* EIGEN_RESTRICT resIt = res + alignedStart; + for (Index i = alignedStart; i < alignedEnd; i += PacketSize) { + Packet A0i = ploadu(a0It); + a0It += PacketSize; + Packet Bi = ploadu(rhsIt); + rhsIt += PacketSize; + Packet Xi = pload(resIt); + + Xi = pcj0.pmadd(A0i, ptmp1, Xi); + pstore(resIt, Xi); + resIt += PacketSize; + + ptmp2 = pcj1.pmadd(A0i, Bi, ptmp2); + } + for (Index i = alignedEnd; i < endi; i++) { + res[i] += cj0.pmul(A0[i], t1); + t2 += cj1.pmul(A0[i], rhs[i]); + } + res[j] += alpha * (t2 + predux(ptmp2)); } - res[j] += alpha * t2; } } diff --git a/Eigen/src/Core/products/SelfadjointProduct.h b/Eigen/src/Core/products/SelfadjointProduct.h index f1034655e..4a453b18d 100644 --- a/Eigen/src/Core/products/SelfadjointProduct.h +++ b/Eigen/src/Core/products/SelfadjointProduct.h @@ -24,14 +24,104 @@ namespace Eigen { template struct selfadjoint_rank1_update { static void run(Index size, Scalar* mat, Index stride, const Scalar* vecX, const Scalar* vecY, const Scalar& alpha) { - internal::conj_if cj; - typedef Map > OtherMap; - typedef std::conditional_t ConjLhsType; - for (Index i = 0; i < size; ++i) { - Map >(mat + stride * i + (UpLo == Lower ? i : 0), - (UpLo == Lower ? size - i : (i + 1))) += - (alpha * cj(vecY[i])) * - ConjLhsType(OtherMap(vecX + (UpLo == Lower ? i : 0), UpLo == Lower ? size - i : (i + 1))); + typedef typename internal::packet_traits::type Packet; + const Index PacketSize = internal::unpacket_traits::size; + + internal::conj_if cjy; + internal::conj_if cjx; + internal::conj_helper pcj; + + // Process 2 columns at a time to share vecX loads and reduce loop overhead. + Index j = 0; + for (; j + 1 < size; j += 2) { + Scalar s0 = alpha * cjy(vecY[j]); + Scalar s1 = alpha * cjy(vecY[j + 1]); + Packet ps0 = internal::pset1(s0); + Packet ps1 = internal::pset1(s1); + + if (UpLo == Lower) { + Scalar* EIGEN_RESTRICT col0 = mat + stride * j + j; + Scalar* EIGEN_RESTRICT col1 = mat + stride * (j + 1) + (j + 1); + + // Diagonal and cross-diagonal scalar elements + col0[0] += s0 * cjx(vecX[j]); + col0[1] += s0 * cjx(vecX[j + 1]); + col1[0] += s1 * cjx(vecX[j + 1]); + + // Shared vectorized loop for rows j+2..size-1 + Index len = size - j - 2; + const Scalar* EIGEN_RESTRICT xp = vecX + j + 2; + Scalar* EIGEN_RESTRICT d0 = col0 + 2; + Scalar* EIGEN_RESTRICT d1 = col1 + 1; + + Index k = 0; + Index vectorizedEnd = (len / PacketSize) * PacketSize; + for (; k < vectorizedEnd; k += PacketSize) { + Packet xi = internal::ploadu(xp + k); + Packet m0 = internal::ploadu(d0 + k); + m0 = pcj.pmadd(xi, ps0, m0); + internal::pstoreu(d0 + k, m0); + Packet m1 = internal::ploadu(d1 + k); + m1 = pcj.pmadd(xi, ps1, m1); + internal::pstoreu(d1 + k, m1); + } + for (; k < len; ++k) { + Scalar cx = cjx(xp[k]); + d0[k] += s0 * cx; + d1[k] += s1 * cx; + } + } else { + // UpLo == Upper + Scalar* EIGEN_RESTRICT col0 = mat + stride * j; + Scalar* EIGEN_RESTRICT col1 = mat + stride * (j + 1); + + // Shared vectorized loop for rows 0..j-1 + const Scalar* EIGEN_RESTRICT xp = vecX; + Index len = j; + Index k = 0; + Index vectorizedEnd = (len / PacketSize) * PacketSize; + for (; k < vectorizedEnd; k += PacketSize) { + Packet xi = internal::ploadu(xp + k); + Packet m0 = internal::ploadu(col0 + k); + Packet m1 = internal::ploadu(col1 + k); + m0 = pcj.pmadd(xi, ps0, m0); + m1 = pcj.pmadd(xi, ps1, m1); + internal::pstoreu(col0 + k, m0); + internal::pstoreu(col1 + k, m1); + } + for (; k < len; ++k) { + Scalar cx = cjx(xp[k]); + col0[k] += s0 * cx; + col1[k] += s1 * cx; + } + + // Diagonal and cross-diagonal scalar elements + col0[j] += s0 * cjx(vecX[j]); + col1[j] += s1 * cjx(vecX[j]); + col1[j + 1] += s1 * cjx(vecX[j + 1]); + } + } + + // Handle last column if size is odd + if (j < size) { + Scalar s = alpha * cjy(vecY[j]); + Packet ps = internal::pset1(s); + Index start = UpLo == Lower ? j : 0; + Index len = UpLo == Lower ? size - j : j + 1; + Scalar* EIGEN_RESTRICT dst = mat + stride * j + start; + const Scalar* EIGEN_RESTRICT xp = vecX + start; + + Index k = 0; + Index vectorizedEnd = (len / PacketSize) * PacketSize; + for (; k < vectorizedEnd; k += PacketSize) { + Packet xi = internal::ploadu(xp + k); + Packet di = internal::ploadu(dst + k); + di = pcj.pmadd(xi, ps, di); + internal::pstoreu(dst + k, di); + } + for (; k < len; ++k) { + dst[k] += s * cjx(xp[k]); + } } } }; diff --git a/Eigen/src/Core/products/SelfadjointRank2Update.h b/Eigen/src/Core/products/SelfadjointRank2Update.h index 9c234ec2a..75e4ceb00 100644 --- a/Eigen/src/Core/products/SelfadjointRank2Update.h +++ b/Eigen/src/Core/products/SelfadjointRank2Update.h @@ -21,29 +21,168 @@ namespace internal { * It corresponds to the Level2 syr2 BLAS routine */ -template +template struct selfadjoint_rank2_update_selector; -template -struct selfadjoint_rank2_update_selector { - static EIGEN_DEVICE_FUNC void run(Scalar* mat, Index stride, const UType& u, const VType& v, const Scalar& alpha) { - const Index size = u.size(); - for (Index i = 0; i < size; ++i) { - Map>(mat + stride * i + i, size - i) += - (numext::conj(alpha) * numext::conj(u.coeff(i))) * v.tail(size - i) + - (alpha * numext::conj(v.coeff(i))) * u.tail(size - i); +template +struct selfadjoint_rank2_update_selector { + static void run(Index size, Scalar* mat, Index stride, const Scalar* u, const Scalar* v, const Scalar& alpha) { + typedef typename packet_traits::type Packet; + const Index PacketSize = unpacket_traits::size; + const Scalar cAlpha = numext::conj(alpha); + + // Process 2 columns at a time to share u/v loads and reduce loop overhead. + Index j = 0; + for (; j + 1 < size; j += 2) { + // Scale factors: col[j:] += s0u * v[j:] + s0v * u[j:] + Scalar s0u = cAlpha * numext::conj(u[j]); + Scalar s0v = alpha * numext::conj(v[j]); + Scalar s1u = cAlpha * numext::conj(u[j + 1]); + Scalar s1v = alpha * numext::conj(v[j + 1]); + + Packet ps0u = pset1(s0u); + Packet ps0v = pset1(s0v); + Packet ps1u = pset1(s1u); + Packet ps1v = pset1(s1v); + + Scalar* EIGEN_RESTRICT col0 = mat + stride * j + j; + Scalar* EIGEN_RESTRICT col1 = mat + stride * (j + 1) + (j + 1); + + // Diagonal and cross-diagonal scalar elements + col0[0] += s0u * v[j] + s0v * u[j]; + col0[1] += s0u * v[j + 1] + s0v * u[j + 1]; + col1[0] += s1u * v[j + 1] + s1v * u[j + 1]; + + // Shared vectorized loop for rows j+2..size-1 + Index len = size - j - 2; + const Scalar* EIGEN_RESTRICT up = u + j + 2; + const Scalar* EIGEN_RESTRICT vp = v + j + 2; + Scalar* EIGEN_RESTRICT d0 = col0 + 2; + Scalar* EIGEN_RESTRICT d1 = col1 + 1; + + Index k = 0; + Index vectorizedEnd = (len / PacketSize) * PacketSize; + for (; k < vectorizedEnd; k += PacketSize) { + Packet ui = ploadu(up + k); + Packet vi = ploadu(vp + k); + Packet m0 = ploadu(d0 + k); + m0 = pmadd(vi, ps0u, m0); + m0 = pmadd(ui, ps0v, m0); + pstoreu(d0 + k, m0); + Packet m1 = ploadu(d1 + k); + m1 = pmadd(vi, ps1u, m1); + m1 = pmadd(ui, ps1v, m1); + pstoreu(d1 + k, m1); + } + for (; k < len; ++k) { + d0[k] += s0u * vp[k] + s0v * up[k]; + d1[k] += s1u * vp[k] + s1v * up[k]; + } + } + + // Handle last column if size is odd + if (j < size) { + Scalar su = cAlpha * numext::conj(u[j]); + Scalar sv = alpha * numext::conj(v[j]); + Packet psu = pset1(su); + Packet psv = pset1(sv); + + Scalar* EIGEN_RESTRICT dst = mat + stride * j + j; + const Scalar* EIGEN_RESTRICT up = u + j; + const Scalar* EIGEN_RESTRICT vp = v + j; + Index len = size - j; + + Index k = 0; + Index vectorizedEnd = (len / PacketSize) * PacketSize; + for (; k < vectorizedEnd; k += PacketSize) { + Packet ui = ploadu(up + k); + Packet vi = ploadu(vp + k); + Packet di = ploadu(dst + k); + di = pmadd(vi, psu, di); + di = pmadd(ui, psv, di); + pstoreu(dst + k, di); + } + for (; k < len; ++k) { + dst[k] += su * vp[k] + sv * up[k]; + } } } }; -template -struct selfadjoint_rank2_update_selector { - static void run(Scalar* mat, Index stride, const UType& u, const VType& v, const Scalar& alpha) { - const Index size = u.size(); - for (Index i = 0; i < size; ++i) - Map>(mat + stride * i, i + 1) += - (numext::conj(alpha) * numext::conj(u.coeff(i))) * v.head(i + 1) + - (alpha * numext::conj(v.coeff(i))) * u.head(i + 1); +template +struct selfadjoint_rank2_update_selector { + static void run(Index size, Scalar* mat, Index stride, const Scalar* u, const Scalar* v, const Scalar& alpha) { + typedef typename packet_traits::type Packet; + const Index PacketSize = unpacket_traits::size; + const Scalar cAlpha = numext::conj(alpha); + + // Process 2 columns at a time to share u/v loads and reduce loop overhead. + Index j = 0; + for (; j + 1 < size; j += 2) { + Scalar s0u = cAlpha * numext::conj(u[j]); + Scalar s0v = alpha * numext::conj(v[j]); + Scalar s1u = cAlpha * numext::conj(u[j + 1]); + Scalar s1v = alpha * numext::conj(v[j + 1]); + + Packet ps0u = pset1(s0u); + Packet ps0v = pset1(s0v); + Packet ps1u = pset1(s1u); + Packet ps1v = pset1(s1v); + + Scalar* EIGEN_RESTRICT col0 = mat + stride * j; + Scalar* EIGEN_RESTRICT col1 = mat + stride * (j + 1); + + // Shared vectorized loop for rows 0..j-1 + Index len = j; + Index k = 0; + Index vectorizedEnd = (len / PacketSize) * PacketSize; + for (; k < vectorizedEnd; k += PacketSize) { + Packet ui = ploadu(u + k); + Packet vi = ploadu(v + k); + Packet m0 = ploadu(col0 + k); + m0 = pmadd(vi, ps0u, m0); + m0 = pmadd(ui, ps0v, m0); + pstoreu(col0 + k, m0); + Packet m1 = ploadu(col1 + k); + m1 = pmadd(vi, ps1u, m1); + m1 = pmadd(ui, ps1v, m1); + pstoreu(col1 + k, m1); + } + for (; k < len; ++k) { + col0[k] += s0u * v[k] + s0v * u[k]; + col1[k] += s1u * v[k] + s1v * u[k]; + } + + // Diagonal and cross-diagonal scalar elements + col0[j] += s0u * v[j] + s0v * u[j]; + col1[j] += s1u * v[j] + s1v * u[j]; + col1[j + 1] += s1u * v[j + 1] + s1v * u[j + 1]; + } + + // Handle last column if size is odd + if (j < size) { + Scalar su = cAlpha * numext::conj(u[j]); + Scalar sv = alpha * numext::conj(v[j]); + Packet psu = pset1(su); + Packet psv = pset1(sv); + + Scalar* EIGEN_RESTRICT dst = mat + stride * j; + Index len = j + 1; + + Index k = 0; + Index vectorizedEnd = (len / PacketSize) * PacketSize; + for (; k < vectorizedEnd; k += PacketSize) { + Packet ui = ploadu(u + k); + Packet vi = ploadu(v + k); + Packet di = ploadu(dst + k); + di = pmadd(vi, psu, di); + di = pmadd(ui, psv, di); + pstoreu(dst + k, di); + } + for (; k < len; ++k) { + dst[k] += su * v[k] + sv * u[k]; + } + } } }; @@ -69,23 +208,47 @@ EIGEN_DEVICE_FUNC SelfAdjointView& SelfAdjointView::Flags & RowMajorBit) ? 1 : 0, + // Only need to conjugate if complex and the condition triggers + NeedConjU = (int(IsRowMajor) ^ int(UBlasTraits::NeedToConjugate)) && NumTraits::IsComplex, + NeedConjV = (int(IsRowMajor) ^ int(VBlasTraits::NeedToConjugate)) && NumTraits::IsComplex, + UseUDirectly = ActualUType_::InnerStrideAtCompileTime == 1 && !NeedConjU, + UseVDirectly = ActualVType_::InnerStrideAtCompileTime == 1 && !NeedConjV + }; - enum { IsRowMajor = (internal::traits::Flags & RowMajorBit) ? 1 : 0 }; Scalar actualAlpha = alpha * UBlasTraits::extractScalarFactor(u.derived()) * numext::conj(VBlasTraits::extractScalarFactor(v.derived())); if (IsRowMajor) actualAlpha = numext::conj(actualAlpha); - typedef internal::remove_all_t< - typename internal::conj_expr_if::type> - UType; - typedef internal::remove_all_t< - typename internal::conj_expr_if::type> - VType; - internal::selfadjoint_rank2_update_selector::run(_expression().const_cast_derived().data(), - _expression().outerStride(), UType(actualU), - VType(actualV), actualAlpha); + const Index size = u.size(); + + // Copy u to contiguous buffer, applying conjugation if needed + internal::gemv_static_vector_if + static_u; + ei_declare_aligned_stack_constructed_variable(Scalar, uPtr, size, + (UseUDirectly ? const_cast(actualU.data()) : static_u.data())); + if (!UseUDirectly) { + if (NeedConjU) + Map(uPtr, size) = actualU.conjugate(); + else + Map(uPtr, size) = actualU; + } + + // Copy v to contiguous buffer, applying conjugation if needed + internal::gemv_static_vector_if + static_v; + ei_declare_aligned_stack_constructed_variable(Scalar, vPtr, size, + (UseVDirectly ? const_cast(actualV.data()) : static_v.data())); + if (!UseVDirectly) { + if (NeedConjV) + Map(vPtr, size) = actualV.conjugate(); + else + Map(vPtr, size) = actualV; + } + + internal::selfadjoint_rank2_update_selector:: + run(size, _expression().const_cast_derived().data(), _expression().outerStride(), uPtr, vPtr, actualAlpha); return *this; } diff --git a/Eigen/src/Core/products/TriangularMatrixVector.h b/Eigen/src/Core/products/TriangularMatrixVector.h index 535f48136..41395f951 100644 --- a/Eigen/src/Core/products/TriangularMatrixVector.h +++ b/Eigen/src/Core/products/TriangularMatrixVector.h @@ -43,43 +43,102 @@ EIGEN_DONT_INLINE void triangular_matrix_vector_product, 0, OuterStride<> > LhsMap; - const LhsMap lhs(lhs_, rows, cols, OuterStride<>(lhsStride)); - typename conj_expr_if::type cjLhs(lhs); - - typedef Map, 0, InnerStride<> > RhsMap; - const RhsMap rhs(rhs_, cols, InnerStride<>(rhsIncr)); - typename conj_expr_if::type cjRhs(rhs); - - typedef Map > ResMap; - ResMap res(res_, rows); - typedef const_blas_data_mapper LhsMapper; typedef const_blas_data_mapper RhsMapper; + conj_if cjl; + conj_if cjr; + for (Index pi = 0; pi < size; pi += PanelWidth) { Index actualPanelWidth = (std::min)(PanelWidth, size - pi); - for (Index k = 0; k < actualPanelWidth; ++k) { - Index i = pi + k; - Index s = IsLower ? ((HasUnitDiag || HasZeroDiag) ? i + 1 : i) : pi; - Index r = IsLower ? actualPanelWidth - k : k + 1; - if ((!(HasUnitDiag || HasZeroDiag)) || (--r) > 0) - res.segment(s, r) += (alpha * cjRhs.coeff(i)) * cjLhs.col(i).segment(s, r); - if (HasUnitDiag) res.coeffRef(i) += alpha * cjRhs.coeff(i); + + // Process the triangular panel using raw pointer operations with 2-column batching + // to eliminate expression template overhead and share result loads/stores. + if (IsLower) { + Index k = 0; + for (; k + 1 < actualPanelWidth; k += 2) { + Index i0 = pi + k; + Index i1 = i0 + 1; + ResScalar s0 = alpha * cjr(rhs_[i0 * rhsIncr]); + ResScalar s1 = alpha * cjr(rhs_[i1 * rhsIncr]); + const LhsScalar* EIGEN_RESTRICT c0 = lhs_ + i0 * lhsStride; + const LhsScalar* EIGEN_RESTRICT c1 = lhs_ + i1 * lhsStride; + + // Diagonal of column 0 + if (!(HasUnitDiag || HasZeroDiag)) res_[i0] += s0 * cjl(c0[i0]); + // Row i1: contribution from column 0 + diagonal of column 1 + { + ResScalar r1 = s0 * cjl(c0[i1]); + if (!(HasUnitDiag || HasZeroDiag)) r1 += s1 * cjl(c1[i1]); + res_[i1] += r1; + } + // Shared rows where both columns contribute + Index panelEnd = pi + actualPanelWidth; + for (Index j = i1 + 1; j < panelEnd; ++j) res_[j] += s0 * cjl(c0[j]) + s1 * cjl(c1[j]); + + if (HasUnitDiag) { + res_[i0] += s0; + res_[i1] += s1; + } + } + if (k < actualPanelWidth) { + Index i = pi + k; + ResScalar s = alpha * cjr(rhs_[i * rhsIncr]); + const LhsScalar* EIGEN_RESTRICT c = lhs_ + i * lhsStride; + if (!(HasUnitDiag || HasZeroDiag)) res_[i] += s * cjl(c[i]); + if (HasUnitDiag) res_[i] += s; + } + } else { + // Upper triangular: process 2 columns at a time + Index k = 0; + for (; k + 1 < actualPanelWidth; k += 2) { + Index i0 = pi + k; + Index i1 = i0 + 1; + ResScalar s0 = alpha * cjr(rhs_[i0 * rhsIncr]); + ResScalar s1 = alpha * cjr(rhs_[i1 * rhsIncr]); + const LhsScalar* EIGEN_RESTRICT c0 = lhs_ + i0 * lhsStride; + const LhsScalar* EIGEN_RESTRICT c1 = lhs_ + i1 * lhsStride; + + // Shared rows before the diagonal block + for (Index j = pi; j < i0; ++j) res_[j] += s0 * cjl(c0[j]) + s1 * cjl(c1[j]); + + // Row i0: diagonal of col0 + contribution from col1 + { + ResScalar r0 = s1 * cjl(c1[i0]); + if (!(HasUnitDiag || HasZeroDiag)) r0 += s0 * cjl(c0[i0]); + res_[i0] += r0; + } + // Diagonal of column 1 + if (!(HasUnitDiag || HasZeroDiag)) res_[i1] += s1 * cjl(c1[i1]); + + if (HasUnitDiag) { + res_[i0] += s0; + res_[i1] += s1; + } + } + if (k < actualPanelWidth) { + Index i = pi + k; + ResScalar s = alpha * cjr(rhs_[i * rhsIncr]); + const LhsScalar* EIGEN_RESTRICT c = lhs_ + i * lhsStride; + for (Index j = pi; j < i; ++j) res_[j] += s * cjl(c[j]); + if (!(HasUnitDiag || HasZeroDiag)) res_[i] += s * cjl(c[i]); + if (HasUnitDiag) res_[i] += s; + } } + + // Rectangular part: delegate to optimized GEMV Index r = IsLower ? rows - pi - actualPanelWidth : pi; if (r > 0) { Index s = IsLower ? pi + actualPanelWidth : 0; general_matrix_vector_product::run(r, actualPanelWidth, LhsMapper(&lhs.coeffRef(s, pi), lhsStride), - RhsMapper(&rhs.coeffRef(pi), rhsIncr), &res.coeffRef(s), resIncr, - alpha); + BuiltIn>::run(r, actualPanelWidth, LhsMapper(&lhs_[pi * lhsStride + s], lhsStride), + RhsMapper(&rhs_[pi * rhsIncr], rhsIncr), &res_[s], resIncr, alpha); } } if ((!IsLower) && cols > size) { general_matrix_vector_product::run( - rows, cols - size, LhsMapper(&lhs.coeffRef(0, size), lhsStride), RhsMapper(&rhs.coeffRef(size), rhsIncr), res_, - resIncr, alpha); + rows, cols - size, LhsMapper(&lhs_[size * lhsStride], lhsStride), RhsMapper(&rhs_[size * rhsIncr], rhsIncr), + res_, resIncr, alpha); } } @@ -105,43 +164,48 @@ EIGEN_DONT_INLINE void triangular_matrix_vector_product, 0, OuterStride<> > LhsMap; - const LhsMap lhs(lhs_, rows, cols, OuterStride<>(lhsStride)); - typename conj_expr_if::type cjLhs(lhs); - - typedef Map > RhsMap; - const RhsMap rhs(rhs_, cols); - typename conj_expr_if::type cjRhs(rhs); - - typedef Map, 0, InnerStride<> > ResMap; - ResMap res(res_, rows, InnerStride<>(resIncr)); - typedef const_blas_data_mapper LhsMapper; typedef const_blas_data_mapper RhsMapper; + conj_if cjl; + conj_if cjr; + for (Index pi = 0; pi < diagSize; pi += PanelWidth) { Index actualPanelWidth = (std::min)(PanelWidth, diagSize - pi); + + // Process the triangular panel using raw dot products to eliminate + // the cwiseProduct().sum() expression template overhead. for (Index k = 0; k < actualPanelWidth; ++k) { Index i = pi + k; - Index s = IsLower ? pi : ((HasUnitDiag || HasZeroDiag) ? i + 1 : i); - Index r = IsLower ? k + 1 : actualPanelWidth - k; - if ((!(HasUnitDiag || HasZeroDiag)) || (--r) > 0) - res.coeffRef(i) += alpha * (cjLhs.row(i).segment(s, r).cwiseProduct(cjRhs.segment(s, r).transpose())).sum(); - if (HasUnitDiag) res.coeffRef(i) += alpha * cjRhs.coeff(i); + const LhsScalar* EIGEN_RESTRICT row_i = lhs_ + i * lhsStride; + ResScalar dot = ResScalar(0); + + if (IsLower) { + Index s = pi; + Index len = (HasUnitDiag || HasZeroDiag) ? k : k + 1; + for (Index j = 0; j < len; ++j) dot += cjl(row_i[s + j]) * cjr(rhs_[s + j]); + } else { + Index s = (HasUnitDiag || HasZeroDiag) ? i + 1 : i; + Index len = pi + actualPanelWidth - s; + for (Index j = 0; j < len; ++j) dot += cjl(row_i[s + j]) * cjr(rhs_[s + j]); + } + res_[i * resIncr] += alpha * dot; + if (HasUnitDiag) res_[i * resIncr] += alpha * cjr(rhs_[i]); } + + // Rectangular part: delegate to optimized GEMV Index r = IsLower ? pi : cols - pi - actualPanelWidth; if (r > 0) { Index s = IsLower ? 0 : pi + actualPanelWidth; general_matrix_vector_product::run(actualPanelWidth, r, LhsMapper(&lhs.coeffRef(pi, s), lhsStride), - RhsMapper(&rhs.coeffRef(s), rhsIncr), &res.coeffRef(pi), resIncr, - alpha); + BuiltIn>::run(actualPanelWidth, r, LhsMapper(&lhs_[pi * lhsStride + s], lhsStride), + RhsMapper(&rhs_[s], rhsIncr), &res_[pi * resIncr], resIncr, alpha); } } if (IsLower && rows > diagSize) { general_matrix_vector_product::run( - rows - diagSize, cols, LhsMapper(&lhs.coeffRef(diagSize, 0), lhsStride), RhsMapper(&rhs.coeffRef(0), rhsIncr), - &res.coeffRef(diagSize), resIncr, alpha); + rows - diagSize, cols, LhsMapper(&lhs_[diagSize * lhsStride], lhsStride), RhsMapper(rhs_, rhsIncr), + &res_[diagSize * resIncr], resIncr, alpha); } } diff --git a/benchmarks/Core/CMakeLists.txt b/benchmarks/Core/CMakeLists.txt index a95e9bb2a..ff4709e9c 100644 --- a/benchmarks/Core/CMakeLists.txt +++ b/benchmarks/Core/CMakeLists.txt @@ -14,6 +14,10 @@ eigen_add_benchmark(bench_map bench_map.cpp) eigen_add_benchmark(bench_diagonal bench_diagonal.cpp) eigen_add_benchmark(bench_triangular_product bench_triangular_product.cpp) eigen_add_benchmark(bench_selfadjoint_product bench_selfadjoint_product.cpp) +eigen_add_benchmark(bench_symv bench_symv.cpp) +eigen_add_benchmark(bench_trmv bench_trmv.cpp) +eigen_add_benchmark(bench_syr bench_syr.cpp) +eigen_add_benchmark(bench_syr2 bench_syr2.cpp) eigen_add_benchmark(bench_construction bench_construction.cpp) eigen_add_benchmark(bench_fixed_size bench_fixed_size.cpp) eigen_add_benchmark(bench_fixed_size_double bench_fixed_size.cpp DEFINITIONS SCALAR=double) diff --git a/benchmarks/Core/bench_symv.cpp b/benchmarks/Core/bench_symv.cpp new file mode 100644 index 000000000..dea82cdd5 --- /dev/null +++ b/benchmarks/Core/bench_symv.cpp @@ -0,0 +1,62 @@ +// Benchmarks for selfadjoint matrix-vector product (SYMV/HEMV). +// +// Tests y += selfadjointView(A) * x for various sizes and scalar types. +// Exercises SelfadjointMatrixVector.h kernel. + +#include +#include + +using namespace Eigen; + +template +double symvFlops(Index n) { + // SYMV uses n^2 multiply-adds (exploiting symmetry) + return (NumTraits::IsComplex ? 8.0 : 2.0) * n * n; +} + +// y += selfadjointView(A) * x +template +static void BM_SYMV_Lower(benchmark::State& state) { + const Index n = state.range(0); + using Mat = Matrix; + using Vec = Matrix; + Mat A = Mat::Random(n, n); + A = (A + A.transpose().eval()) / Scalar(2); + Vec x = Vec::Random(n); + Vec y = Vec::Random(n); + for (auto _ : state) { + y.noalias() += A.template selfadjointView() * x; + benchmark::DoNotOptimize(y.data()); + benchmark::ClobberMemory(); + } + state.counters["GFLOPS"] = benchmark::Counter(symvFlops(n), benchmark::Counter::kIsIterationInvariantRate, + benchmark::Counter::kIs1000); +} + +// y += selfadjointView(A) * x +template +static void BM_SYMV_Upper(benchmark::State& state) { + const Index n = state.range(0); + using Mat = Matrix; + using Vec = Matrix; + Mat A = Mat::Random(n, n); + A = (A + A.transpose().eval()) / Scalar(2); + Vec x = Vec::Random(n); + Vec y = Vec::Random(n); + for (auto _ : state) { + y.noalias() += A.template selfadjointView() * x; + benchmark::DoNotOptimize(y.data()); + benchmark::ClobberMemory(); + } + state.counters["GFLOPS"] = benchmark::Counter(symvFlops(n), benchmark::Counter::kIsIterationInvariantRate, + benchmark::Counter::kIs1000); +} + +static void SymvSizes(::benchmark::Benchmark* b) { + for (int n : {8, 16, 32, 64, 128, 256, 512, 1024, 2048}) b->Arg(n); +} + +BENCHMARK(BM_SYMV_Lower)->Apply(SymvSizes)->Name("SYMV_Lower_float"); +BENCHMARK(BM_SYMV_Lower)->Apply(SymvSizes)->Name("SYMV_Lower_double"); +BENCHMARK(BM_SYMV_Upper)->Apply(SymvSizes)->Name("SYMV_Upper_float"); +BENCHMARK(BM_SYMV_Upper)->Apply(SymvSizes)->Name("SYMV_Upper_double"); diff --git a/benchmarks/Core/bench_syr.cpp b/benchmarks/Core/bench_syr.cpp new file mode 100644 index 000000000..ba6c2fce0 --- /dev/null +++ b/benchmarks/Core/bench_syr.cpp @@ -0,0 +1,59 @@ +// Benchmarks for symmetric rank-1 update (SYR). +// +// Tests C.selfadjointView().rankUpdate(v, alpha) which computes +// C += alpha * v * v^T, updating only the lower (or upper) triangle. +// Exercises SelfadjointProduct.h / selfadjoint_rank1_update. + +#include +#include + +using namespace Eigen; + +template +double syrFlops(Index n) { + // SYR: n*(n+1)/2 multiply-adds ~ n^2 + return (NumTraits::IsComplex ? 8.0 : 2.0) * n * (n + 1) / 2; +} + +template +static void BM_SYR_Lower(benchmark::State& state) { + const Index n = state.range(0); + using Mat = Matrix; + using Vec = Matrix; + Vec v = Vec::Random(n); + Mat C = Mat::Zero(n, n); + Scalar alpha(1); + for (auto _ : state) { + C.template selfadjointView().rankUpdate(v, alpha); + benchmark::DoNotOptimize(C.data()); + benchmark::ClobberMemory(); + } + state.counters["GFLOPS"] = benchmark::Counter(syrFlops(n), benchmark::Counter::kIsIterationInvariantRate, + benchmark::Counter::kIs1000); +} + +template +static void BM_SYR_Upper(benchmark::State& state) { + const Index n = state.range(0); + using Mat = Matrix; + using Vec = Matrix; + Vec v = Vec::Random(n); + Mat C = Mat::Zero(n, n); + Scalar alpha(1); + for (auto _ : state) { + C.template selfadjointView().rankUpdate(v, alpha); + benchmark::DoNotOptimize(C.data()); + benchmark::ClobberMemory(); + } + state.counters["GFLOPS"] = benchmark::Counter(syrFlops(n), benchmark::Counter::kIsIterationInvariantRate, + benchmark::Counter::kIs1000); +} + +static void SyrSizes(::benchmark::Benchmark* b) { + for (int n : {8, 16, 32, 64, 128, 256, 512, 1024, 2048}) b->Arg(n); +} + +BENCHMARK(BM_SYR_Lower)->Apply(SyrSizes)->Name("SYR_Lower_float"); +BENCHMARK(BM_SYR_Lower)->Apply(SyrSizes)->Name("SYR_Lower_double"); +BENCHMARK(BM_SYR_Upper)->Apply(SyrSizes)->Name("SYR_Upper_float"); +BENCHMARK(BM_SYR_Upper)->Apply(SyrSizes)->Name("SYR_Upper_double"); diff --git a/benchmarks/Core/bench_syr2.cpp b/benchmarks/Core/bench_syr2.cpp new file mode 100644 index 000000000..e7d28c0f5 --- /dev/null +++ b/benchmarks/Core/bench_syr2.cpp @@ -0,0 +1,61 @@ +// Benchmarks for symmetric rank-2 update (SYR2). +// +// Tests C.selfadjointView().rankUpdate(u, v, alpha) which computes +// C += alpha * u * v^T + conj(alpha) * v * u^T. +// Exercises SelfadjointRank2Update.h. + +#include +#include + +using namespace Eigen; + +template +double syr2Flops(Index n) { + // SYR2: 2 * n*(n+1)/2 multiply-adds ~ 2*n^2 + return (NumTraits::IsComplex ? 8.0 : 2.0) * 2 * n * (n + 1) / 2; +} + +template +static void BM_SYR2_Lower(benchmark::State& state) { + const Index n = state.range(0); + using Mat = Matrix; + using Vec = Matrix; + Vec u = Vec::Random(n); + Vec v = Vec::Random(n); + Mat C = Mat::Zero(n, n); + Scalar alpha(1); + for (auto _ : state) { + C.template selfadjointView().rankUpdate(u, v, alpha); + benchmark::DoNotOptimize(C.data()); + benchmark::ClobberMemory(); + } + state.counters["GFLOPS"] = benchmark::Counter(syr2Flops(n), benchmark::Counter::kIsIterationInvariantRate, + benchmark::Counter::kIs1000); +} + +template +static void BM_SYR2_Upper(benchmark::State& state) { + const Index n = state.range(0); + using Mat = Matrix; + using Vec = Matrix; + Vec u = Vec::Random(n); + Vec v = Vec::Random(n); + Mat C = Mat::Zero(n, n); + Scalar alpha(1); + for (auto _ : state) { + C.template selfadjointView().rankUpdate(u, v, alpha); + benchmark::DoNotOptimize(C.data()); + benchmark::ClobberMemory(); + } + state.counters["GFLOPS"] = benchmark::Counter(syr2Flops(n), benchmark::Counter::kIsIterationInvariantRate, + benchmark::Counter::kIs1000); +} + +static void Syr2Sizes(::benchmark::Benchmark* b) { + for (int n : {8, 16, 32, 64, 128, 256, 512, 1024, 2048}) b->Arg(n); +} + +BENCHMARK(BM_SYR2_Lower)->Apply(Syr2Sizes)->Name("SYR2_Lower_float"); +BENCHMARK(BM_SYR2_Lower)->Apply(Syr2Sizes)->Name("SYR2_Lower_double"); +BENCHMARK(BM_SYR2_Upper)->Apply(Syr2Sizes)->Name("SYR2_Upper_float"); +BENCHMARK(BM_SYR2_Upper)->Apply(Syr2Sizes)->Name("SYR2_Upper_double"); diff --git a/benchmarks/Core/bench_trmv.cpp b/benchmarks/Core/bench_trmv.cpp new file mode 100644 index 000000000..56e75591a --- /dev/null +++ b/benchmarks/Core/bench_trmv.cpp @@ -0,0 +1,46 @@ +// Benchmarks for triangular matrix-vector product (TRMV). +// +// Tests y += triangularView(A) * x for various modes and sizes. +// Exercises TriangularMatrixVector.h kernel. + +#include +#include + +using namespace Eigen; + +template +double trmvFlops(Index n) { + // TRMV: ~n^2 multiply-adds + return (NumTraits::IsComplex ? 8.0 : 2.0) * n * n; +} + +// y = triangularView(A) * x +template +static void BM_TRMV(benchmark::State& state) { + const Index n = state.range(0); + using Mat = Matrix; + using Vec = Matrix; + Mat A = Mat::Random(n, n); + Vec x = Vec::Random(n); + Vec y(n); + for (auto _ : state) { + y.noalias() = A.template triangularView() * x; + benchmark::DoNotOptimize(y.data()); + benchmark::ClobberMemory(); + } + state.counters["GFLOPS"] = benchmark::Counter(trmvFlops(n), benchmark::Counter::kIsIterationInvariantRate, + benchmark::Counter::kIs1000); +} + +static void TrmvSizes(::benchmark::Benchmark* b) { + for (int n : {8, 16, 32, 64, 128, 256, 512, 1024, 2048}) b->Arg(n); +} + +BENCHMARK(BM_TRMV)->Apply(TrmvSizes)->Name("TRMV_float_Lower"); +BENCHMARK(BM_TRMV)->Apply(TrmvSizes)->Name("TRMV_float_Upper"); +BENCHMARK(BM_TRMV)->Apply(TrmvSizes)->Name("TRMV_float_UnitLower"); +BENCHMARK(BM_TRMV)->Apply(TrmvSizes)->Name("TRMV_float_UnitUpper"); +BENCHMARK(BM_TRMV)->Apply(TrmvSizes)->Name("TRMV_double_Lower"); +BENCHMARK(BM_TRMV)->Apply(TrmvSizes)->Name("TRMV_double_Upper"); +BENCHMARK(BM_TRMV)->Apply(TrmvSizes)->Name("TRMV_double_UnitLower"); +BENCHMARK(BM_TRMV)->Apply(TrmvSizes)->Name("TRMV_double_UnitUpper");