Optimize SYMV, SYR, SYR2, and TRMV product kernels

libeigen/eigen!2228

Co-authored-by: Rasmus Munk Larsen <rmlarsen@gmail.com>
This commit is contained in:
Rasmus Munk Larsen
2026-03-01 19:40:11 -08:00
parent c66fc52868
commit 662d5c21ff
9 changed files with 866 additions and 151 deletions

View File

@@ -18,9 +18,11 @@ namespace Eigen {
namespace internal {
/* Optimized selfadjoint matrix * vector product:
* This algorithm processes 2 columns at once that allows to both reduce
* the number of load/stores of the result by a factor 2 and to reduce
* the instruction dependency.
* This algorithm processes 4 columns at once to reduce the number of
* load/stores of the result vector by a factor of 4 compared to the
* naive approach, and to increase instruction-level parallelism.
* A 2-column cleanup handles the remaining even columns, and a
* 1-column loop handles any final odd column.
*/
template <typename Scalar, typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs,
@@ -61,82 +63,246 @@ selfadjoint_matrix_vector_product<Scalar, Index, StorageOrder, UpLo, ConjugateLh
Scalar cjAlpha = ConjugateRhs ? numext::conj(alpha) : alpha;
Index bound = numext::maxi(Index(0), size - 8) & 0xfffffffe;
if (FirstTriangular) bound = size - bound;
// Compute column counts for 4-col, 2-col, and 1-col processing phases.
// We leave up to ~8 columns near the diagonal for cleanup (short off-diagonal ranges).
Index n4 = (numext::maxi(Index(0), size - 8) / 4) * 4;
Index n2 = ((size - n4) / 2) * 2;
// Remaining (size - n4 - n2) is 0 or 1 columns.
for (Index j = FirstTriangular ? bound : 0; j < (FirstTriangular ? size : bound); j += 2) {
const Scalar* EIGEN_RESTRICT A0 = lhs + j * lhsStride;
const Scalar* EIGEN_RESTRICT A1 = lhs + (j + 1) * lhsStride;
// For !FirstTriangular: 4-col [0, n4), 2-col [n4, n4+n2), 1-col [n4+n2, size)
// For FirstTriangular: 1-col [0, size-n4-n2), 2-col [size-n4-n2, size-n4), 4-col [size-n4, size)
Scalar t0 = cjAlpha * rhs[j];
Packet ptmp0 = pset1<Packet>(t0);
Scalar t1 = cjAlpha * rhs[j + 1];
Packet ptmp1 = pset1<Packet>(t1);
// === Phase 1: 4 columns at a time ===
{
Index jStart = FirstTriangular ? (size - n4) : 0;
Index jEnd = FirstTriangular ? size : n4;
Scalar t2(0);
Packet ptmp2 = pset1<Packet>(t2);
Scalar t3(0);
Packet ptmp3 = pset1<Packet>(t3);
for (Index j = jStart; j < jEnd; j += 4) {
const Scalar* EIGEN_RESTRICT A0 = lhs + j * lhsStride;
const Scalar* EIGEN_RESTRICT A1 = lhs + (j + 1) * lhsStride;
const Scalar* EIGEN_RESTRICT A2 = lhs + (j + 2) * lhsStride;
const Scalar* EIGEN_RESTRICT A3 = lhs + (j + 3) * lhsStride;
Index starti = FirstTriangular ? 0 : j + 2;
Index endi = FirstTriangular ? j : size;
Index alignedStart = (starti) + internal::first_default_aligned(&res[starti], endi - starti);
Index alignedEnd = alignedStart + ((endi - alignedStart) / (PacketSize)) * (PacketSize);
Scalar t0 = cjAlpha * rhs[j];
Scalar t1 = cjAlpha * rhs[j + 1];
Scalar t2 = cjAlpha * rhs[j + 2];
Scalar t3 = cjAlpha * rhs[j + 3];
Packet ptmp0 = pset1<Packet>(t0);
Packet ptmp1 = pset1<Packet>(t1);
Packet ptmp2 = pset1<Packet>(t2);
Packet ptmp3 = pset1<Packet>(t3);
res[j] += cjd.pmul(numext::real(A0[j]), t0);
res[j + 1] += cjd.pmul(numext::real(A1[j + 1]), t1);
if (FirstTriangular) {
res[j] += cj0.pmul(A1[j], t1);
t3 += cj1.pmul(A1[j], rhs[j]);
} else {
res[j + 1] += cj0.pmul(A0[j + 1], t0);
t2 += cj1.pmul(A0[j + 1], rhs[j + 1]);
Scalar t4(0), t5(0), t6(0), t7(0);
Packet ptmp4 = pzero(Packet{});
Packet ptmp5 = pzero(Packet{});
Packet ptmp6 = pzero(Packet{});
Packet ptmp7 = pzero(Packet{});
Index starti = FirstTriangular ? 0 : j + 4;
Index endi = FirstTriangular ? j : size;
Index alignedStart = starti + internal::first_default_aligned(&res[starti], endi - starti);
Index alignedEnd = alignedStart + ((endi - alignedStart) / PacketSize) * PacketSize;
// Handle the 4x4 diagonal block: diagonal elements
res[j] += cjd.pmul(numext::real(A0[j]), t0);
res[j + 1] += cjd.pmul(numext::real(A1[j + 1]), t1);
res[j + 2] += cjd.pmul(numext::real(A2[j + 2]), t2);
res[j + 3] += cjd.pmul(numext::real(A3[j + 3]), t3);
// Handle the 4x4 diagonal block: off-diagonal cross terms
if (FirstTriangular) {
// Upper triangle stored (A_k[l] for l <= k)
res[j] += cj0.pmul(A1[j], t1) + cj0.pmul(A2[j], t2) + cj0.pmul(A3[j], t3);
res[j + 1] += cj0.pmul(A2[j + 1], t2) + cj0.pmul(A3[j + 1], t3);
res[j + 2] += cj0.pmul(A3[j + 2], t3);
t5 += cj1.pmul(A1[j], rhs[j]);
t6 += cj1.pmul(A2[j], rhs[j]) + cj1.pmul(A2[j + 1], rhs[j + 1]);
t7 += cj1.pmul(A3[j], rhs[j]) + cj1.pmul(A3[j + 1], rhs[j + 1]) + cj1.pmul(A3[j + 2], rhs[j + 2]);
} else {
// Lower triangle stored (A_k[l] for l >= k)
res[j + 1] += cj0.pmul(A0[j + 1], t0);
res[j + 2] += cj0.pmul(A0[j + 2], t0) + cj0.pmul(A1[j + 2], t1);
res[j + 3] += cj0.pmul(A0[j + 3], t0) + cj0.pmul(A1[j + 3], t1) + cj0.pmul(A2[j + 3], t2);
t4 += cj1.pmul(A0[j + 1], rhs[j + 1]) + cj1.pmul(A0[j + 2], rhs[j + 2]) + cj1.pmul(A0[j + 3], rhs[j + 3]);
t5 += cj1.pmul(A1[j + 2], rhs[j + 2]) + cj1.pmul(A1[j + 3], rhs[j + 3]);
t6 += cj1.pmul(A2[j + 3], rhs[j + 3]);
}
// Pre-alignment scalar loop
for (Index i = starti; i < alignedStart; ++i) {
res[i] += cj0.pmul(A0[i], t0) + cj0.pmul(A1[i], t1) + cj0.pmul(A2[i], t2) + cj0.pmul(A3[i], t3);
t4 += cj1.pmul(A0[i], rhs[i]);
t5 += cj1.pmul(A1[i], rhs[i]);
t6 += cj1.pmul(A2[i], rhs[i]);
t7 += cj1.pmul(A3[i], rhs[i]);
}
// Main vectorized loop: 4 matrix column loads, 1 rhs load, 1 result load/store
const Scalar* EIGEN_RESTRICT a0It = A0 + alignedStart;
const Scalar* EIGEN_RESTRICT a1It = A1 + alignedStart;
const Scalar* EIGEN_RESTRICT a2It = A2 + alignedStart;
const Scalar* EIGEN_RESTRICT a3It = A3 + alignedStart;
const Scalar* EIGEN_RESTRICT rhsIt = rhs + alignedStart;
Scalar* EIGEN_RESTRICT resIt = res + alignedStart;
for (Index i = alignedStart; i < alignedEnd; i += PacketSize) {
Packet A0i = ploadu<Packet>(a0It);
a0It += PacketSize;
Packet A1i = ploadu<Packet>(a1It);
a1It += PacketSize;
Packet A2i = ploadu<Packet>(a2It);
a2It += PacketSize;
Packet A3i = ploadu<Packet>(a3It);
a3It += PacketSize;
Packet Bi = ploadu<Packet>(rhsIt);
rhsIt += PacketSize;
Packet Xi = pload<Packet>(resIt);
Xi = pcj0.pmadd(A0i, ptmp0, Xi);
Xi = pcj0.pmadd(A1i, ptmp1, Xi);
Xi = pcj0.pmadd(A2i, ptmp2, Xi);
Xi = pcj0.pmadd(A3i, ptmp3, Xi);
pstore(resIt, Xi);
resIt += PacketSize;
ptmp4 = pcj1.pmadd(A0i, Bi, ptmp4);
ptmp5 = pcj1.pmadd(A1i, Bi, ptmp5);
ptmp6 = pcj1.pmadd(A2i, Bi, ptmp6);
ptmp7 = pcj1.pmadd(A3i, Bi, ptmp7);
}
// Post-alignment scalar loop
for (Index i = alignedEnd; i < endi; ++i) {
res[i] += cj0.pmul(A0[i], t0) + cj0.pmul(A1[i], t1) + cj0.pmul(A2[i], t2) + cj0.pmul(A3[i], t3);
t4 += cj1.pmul(A0[i], rhs[i]);
t5 += cj1.pmul(A1[i], rhs[i]);
t6 += cj1.pmul(A2[i], rhs[i]);
t7 += cj1.pmul(A3[i], rhs[i]);
}
res[j] += alpha * (t4 + predux(ptmp4));
res[j + 1] += alpha * (t5 + predux(ptmp5));
res[j + 2] += alpha * (t6 + predux(ptmp6));
res[j + 3] += alpha * (t7 + predux(ptmp7));
}
for (Index i = starti; i < alignedStart; ++i) {
res[i] += cj0.pmul(A0[i], t0) + cj0.pmul(A1[i], t1);
t2 += cj1.pmul(A0[i], rhs[i]);
t3 += cj1.pmul(A1[i], rhs[i]);
}
const Scalar* EIGEN_RESTRICT a0It = A0 + alignedStart;
const Scalar* EIGEN_RESTRICT a1It = A1 + alignedStart;
const Scalar* EIGEN_RESTRICT rhsIt = rhs + alignedStart;
Scalar* EIGEN_RESTRICT resIt = res + alignedStart;
for (Index i = alignedStart; i < alignedEnd; i += PacketSize) {
Packet A0i = ploadu<Packet>(a0It);
a0It += PacketSize;
Packet A1i = ploadu<Packet>(a1It);
a1It += PacketSize;
Packet Bi = ploadu<Packet>(rhsIt);
rhsIt += PacketSize; // FIXME: should be aligned in most cases.
Packet Xi = pload<Packet>(resIt);
Xi = pcj0.pmadd(A0i, ptmp0, pcj0.pmadd(A1i, ptmp1, Xi));
ptmp2 = pcj1.pmadd(A0i, Bi, ptmp2);
ptmp3 = pcj1.pmadd(A1i, Bi, ptmp3);
pstore(resIt, Xi);
resIt += PacketSize;
}
for (Index i = alignedEnd; i < endi; i++) {
res[i] += cj0.pmul(A0[i], t0) + cj0.pmul(A1[i], t1);
t2 += cj1.pmul(A0[i], rhs[i]);
t3 += cj1.pmul(A1[i], rhs[i]);
}
res[j] += alpha * (t2 + predux(ptmp2));
res[j + 1] += alpha * (t3 + predux(ptmp3));
}
for (Index j = FirstTriangular ? 0 : bound; j < (FirstTriangular ? bound : size); j++) {
const Scalar* EIGEN_RESTRICT A0 = lhs + j * lhsStride;
Scalar t1 = cjAlpha * rhs[j];
Scalar t2(0);
res[j] += cjd.pmul(numext::real(A0[j]), t1);
for (Index i = FirstTriangular ? 0 : j + 1; i < (FirstTriangular ? j : size); i++) {
res[i] += cj0.pmul(A0[i], t1);
t2 += cj1.pmul(A0[i], rhs[i]);
// === Phase 2: 2 columns at a time ===
{
Index jStart = FirstTriangular ? (size - n4 - n2) : n4;
Index jEnd = FirstTriangular ? (size - n4) : (n4 + n2);
for (Index j = jStart; j < jEnd; j += 2) {
const Scalar* EIGEN_RESTRICT A0 = lhs + j * lhsStride;
const Scalar* EIGEN_RESTRICT A1 = lhs + (j + 1) * lhsStride;
Scalar t0 = cjAlpha * rhs[j];
Packet ptmp0 = pset1<Packet>(t0);
Scalar t1 = cjAlpha * rhs[j + 1];
Packet ptmp1 = pset1<Packet>(t1);
Scalar t2(0);
Packet ptmp2 = pzero(Packet{});
Scalar t3(0);
Packet ptmp3 = pzero(Packet{});
Index starti = FirstTriangular ? 0 : j + 2;
Index endi = FirstTriangular ? j : size;
Index alignedStart = starti + internal::first_default_aligned(&res[starti], endi - starti);
Index alignedEnd = alignedStart + ((endi - alignedStart) / PacketSize) * PacketSize;
res[j] += cjd.pmul(numext::real(A0[j]), t0);
res[j + 1] += cjd.pmul(numext::real(A1[j + 1]), t1);
if (FirstTriangular) {
res[j] += cj0.pmul(A1[j], t1);
t3 += cj1.pmul(A1[j], rhs[j]);
} else {
res[j + 1] += cj0.pmul(A0[j + 1], t0);
t2 += cj1.pmul(A0[j + 1], rhs[j + 1]);
}
for (Index i = starti; i < alignedStart; ++i) {
res[i] += cj0.pmul(A0[i], t0) + cj0.pmul(A1[i], t1);
t2 += cj1.pmul(A0[i], rhs[i]);
t3 += cj1.pmul(A1[i], rhs[i]);
}
const Scalar* EIGEN_RESTRICT a0It = A0 + alignedStart;
const Scalar* EIGEN_RESTRICT a1It = A1 + alignedStart;
const Scalar* EIGEN_RESTRICT rhsIt = rhs + alignedStart;
Scalar* EIGEN_RESTRICT resIt = res + alignedStart;
for (Index i = alignedStart; i < alignedEnd; i += PacketSize) {
Packet A0i = ploadu<Packet>(a0It);
a0It += PacketSize;
Packet A1i = ploadu<Packet>(a1It);
a1It += PacketSize;
Packet Bi = ploadu<Packet>(rhsIt);
rhsIt += PacketSize;
Packet Xi = pload<Packet>(resIt);
Xi = pcj0.pmadd(A0i, ptmp0, pcj0.pmadd(A1i, ptmp1, Xi));
ptmp2 = pcj1.pmadd(A0i, Bi, ptmp2);
ptmp3 = pcj1.pmadd(A1i, Bi, ptmp3);
pstore(resIt, Xi);
resIt += PacketSize;
}
for (Index i = alignedEnd; i < endi; i++) {
res[i] += cj0.pmul(A0[i], t0) + cj0.pmul(A1[i], t1);
t2 += cj1.pmul(A0[i], rhs[i]);
t3 += cj1.pmul(A1[i], rhs[i]);
}
res[j] += alpha * (t2 + predux(ptmp2));
res[j + 1] += alpha * (t3 + predux(ptmp3));
}
}
// === Phase 3: 1 column at a time ===
{
Index jStart = FirstTriangular ? 0 : (n4 + n2);
Index jEnd = FirstTriangular ? (size - n4 - n2) : size;
for (Index j = jStart; j < jEnd; j++) {
const Scalar* EIGEN_RESTRICT A0 = lhs + j * lhsStride;
Scalar t1 = cjAlpha * rhs[j];
Scalar t2(0);
Packet ptmp1 = pset1<Packet>(t1);
Packet ptmp2 = pzero(Packet{});
res[j] += cjd.pmul(numext::real(A0[j]), t1);
Index starti = FirstTriangular ? 0 : j + 1;
Index endi = FirstTriangular ? j : size;
Index alignedStart = starti + internal::first_default_aligned(&res[starti], endi - starti);
Index alignedEnd = alignedStart + ((endi - alignedStart) / PacketSize) * PacketSize;
for (Index i = starti; i < alignedStart; ++i) {
res[i] += cj0.pmul(A0[i], t1);
t2 += cj1.pmul(A0[i], rhs[i]);
}
const Scalar* EIGEN_RESTRICT a0It = A0 + alignedStart;
const Scalar* EIGEN_RESTRICT rhsIt = rhs + alignedStart;
Scalar* EIGEN_RESTRICT resIt = res + alignedStart;
for (Index i = alignedStart; i < alignedEnd; i += PacketSize) {
Packet A0i = ploadu<Packet>(a0It);
a0It += PacketSize;
Packet Bi = ploadu<Packet>(rhsIt);
rhsIt += PacketSize;
Packet Xi = pload<Packet>(resIt);
Xi = pcj0.pmadd(A0i, ptmp1, Xi);
pstore(resIt, Xi);
resIt += PacketSize;
ptmp2 = pcj1.pmadd(A0i, Bi, ptmp2);
}
for (Index i = alignedEnd; i < endi; i++) {
res[i] += cj0.pmul(A0[i], t1);
t2 += cj1.pmul(A0[i], rhs[i]);
}
res[j] += alpha * (t2 + predux(ptmp2));
}
res[j] += alpha * t2;
}
}

View File

@@ -24,14 +24,104 @@ namespace Eigen {
template <typename Scalar, typename Index, int UpLo, bool ConjLhs, bool ConjRhs>
struct selfadjoint_rank1_update<Scalar, Index, ColMajor, UpLo, ConjLhs, ConjRhs> {
static void run(Index size, Scalar* mat, Index stride, const Scalar* vecX, const Scalar* vecY, const Scalar& alpha) {
internal::conj_if<ConjRhs> cj;
typedef Map<const Matrix<Scalar, Dynamic, 1> > OtherMap;
typedef std::conditional_t<ConjLhs, typename OtherMap::ConjugateReturnType, const OtherMap&> ConjLhsType;
for (Index i = 0; i < size; ++i) {
Map<Matrix<Scalar, Dynamic, 1> >(mat + stride * i + (UpLo == Lower ? i : 0),
(UpLo == Lower ? size - i : (i + 1))) +=
(alpha * cj(vecY[i])) *
ConjLhsType(OtherMap(vecX + (UpLo == Lower ? i : 0), UpLo == Lower ? size - i : (i + 1)));
typedef typename internal::packet_traits<Scalar>::type Packet;
const Index PacketSize = internal::unpacket_traits<Packet>::size;
internal::conj_if<ConjRhs> cjy;
internal::conj_if<ConjLhs> cjx;
internal::conj_helper<Packet, Packet, ConjLhs, false> pcj;
// Process 2 columns at a time to share vecX loads and reduce loop overhead.
Index j = 0;
for (; j + 1 < size; j += 2) {
Scalar s0 = alpha * cjy(vecY[j]);
Scalar s1 = alpha * cjy(vecY[j + 1]);
Packet ps0 = internal::pset1<Packet>(s0);
Packet ps1 = internal::pset1<Packet>(s1);
if (UpLo == Lower) {
Scalar* EIGEN_RESTRICT col0 = mat + stride * j + j;
Scalar* EIGEN_RESTRICT col1 = mat + stride * (j + 1) + (j + 1);
// Diagonal and cross-diagonal scalar elements
col0[0] += s0 * cjx(vecX[j]);
col0[1] += s0 * cjx(vecX[j + 1]);
col1[0] += s1 * cjx(vecX[j + 1]);
// Shared vectorized loop for rows j+2..size-1
Index len = size - j - 2;
const Scalar* EIGEN_RESTRICT xp = vecX + j + 2;
Scalar* EIGEN_RESTRICT d0 = col0 + 2;
Scalar* EIGEN_RESTRICT d1 = col1 + 1;
Index k = 0;
Index vectorizedEnd = (len / PacketSize) * PacketSize;
for (; k < vectorizedEnd; k += PacketSize) {
Packet xi = internal::ploadu<Packet>(xp + k);
Packet m0 = internal::ploadu<Packet>(d0 + k);
m0 = pcj.pmadd(xi, ps0, m0);
internal::pstoreu(d0 + k, m0);
Packet m1 = internal::ploadu<Packet>(d1 + k);
m1 = pcj.pmadd(xi, ps1, m1);
internal::pstoreu(d1 + k, m1);
}
for (; k < len; ++k) {
Scalar cx = cjx(xp[k]);
d0[k] += s0 * cx;
d1[k] += s1 * cx;
}
} else {
// UpLo == Upper
Scalar* EIGEN_RESTRICT col0 = mat + stride * j;
Scalar* EIGEN_RESTRICT col1 = mat + stride * (j + 1);
// Shared vectorized loop for rows 0..j-1
const Scalar* EIGEN_RESTRICT xp = vecX;
Index len = j;
Index k = 0;
Index vectorizedEnd = (len / PacketSize) * PacketSize;
for (; k < vectorizedEnd; k += PacketSize) {
Packet xi = internal::ploadu<Packet>(xp + k);
Packet m0 = internal::ploadu<Packet>(col0 + k);
Packet m1 = internal::ploadu<Packet>(col1 + k);
m0 = pcj.pmadd(xi, ps0, m0);
m1 = pcj.pmadd(xi, ps1, m1);
internal::pstoreu(col0 + k, m0);
internal::pstoreu(col1 + k, m1);
}
for (; k < len; ++k) {
Scalar cx = cjx(xp[k]);
col0[k] += s0 * cx;
col1[k] += s1 * cx;
}
// Diagonal and cross-diagonal scalar elements
col0[j] += s0 * cjx(vecX[j]);
col1[j] += s1 * cjx(vecX[j]);
col1[j + 1] += s1 * cjx(vecX[j + 1]);
}
}
// Handle last column if size is odd
if (j < size) {
Scalar s = alpha * cjy(vecY[j]);
Packet ps = internal::pset1<Packet>(s);
Index start = UpLo == Lower ? j : 0;
Index len = UpLo == Lower ? size - j : j + 1;
Scalar* EIGEN_RESTRICT dst = mat + stride * j + start;
const Scalar* EIGEN_RESTRICT xp = vecX + start;
Index k = 0;
Index vectorizedEnd = (len / PacketSize) * PacketSize;
for (; k < vectorizedEnd; k += PacketSize) {
Packet xi = internal::ploadu<Packet>(xp + k);
Packet di = internal::ploadu<Packet>(dst + k);
di = pcj.pmadd(xi, ps, di);
internal::pstoreu(dst + k, di);
}
for (; k < len; ++k) {
dst[k] += s * cjx(xp[k]);
}
}
}
};

View File

@@ -21,29 +21,168 @@ namespace internal {
* It corresponds to the Level2 syr2 BLAS routine
*/
template <typename Scalar, typename Index, typename UType, typename VType, int UpLo>
template <typename Scalar, typename Index, int UpLo>
struct selfadjoint_rank2_update_selector;
template <typename Scalar, typename Index, typename UType, typename VType>
struct selfadjoint_rank2_update_selector<Scalar, Index, UType, VType, Lower> {
static EIGEN_DEVICE_FUNC void run(Scalar* mat, Index stride, const UType& u, const VType& v, const Scalar& alpha) {
const Index size = u.size();
for (Index i = 0; i < size; ++i) {
Map<Matrix<Scalar, Dynamic, 1>>(mat + stride * i + i, size - i) +=
(numext::conj(alpha) * numext::conj(u.coeff(i))) * v.tail(size - i) +
(alpha * numext::conj(v.coeff(i))) * u.tail(size - i);
template <typename Scalar, typename Index>
struct selfadjoint_rank2_update_selector<Scalar, Index, Lower> {
static void run(Index size, Scalar* mat, Index stride, const Scalar* u, const Scalar* v, const Scalar& alpha) {
typedef typename packet_traits<Scalar>::type Packet;
const Index PacketSize = unpacket_traits<Packet>::size;
const Scalar cAlpha = numext::conj(alpha);
// Process 2 columns at a time to share u/v loads and reduce loop overhead.
Index j = 0;
for (; j + 1 < size; j += 2) {
// Scale factors: col[j:] += s0u * v[j:] + s0v * u[j:]
Scalar s0u = cAlpha * numext::conj(u[j]);
Scalar s0v = alpha * numext::conj(v[j]);
Scalar s1u = cAlpha * numext::conj(u[j + 1]);
Scalar s1v = alpha * numext::conj(v[j + 1]);
Packet ps0u = pset1<Packet>(s0u);
Packet ps0v = pset1<Packet>(s0v);
Packet ps1u = pset1<Packet>(s1u);
Packet ps1v = pset1<Packet>(s1v);
Scalar* EIGEN_RESTRICT col0 = mat + stride * j + j;
Scalar* EIGEN_RESTRICT col1 = mat + stride * (j + 1) + (j + 1);
// Diagonal and cross-diagonal scalar elements
col0[0] += s0u * v[j] + s0v * u[j];
col0[1] += s0u * v[j + 1] + s0v * u[j + 1];
col1[0] += s1u * v[j + 1] + s1v * u[j + 1];
// Shared vectorized loop for rows j+2..size-1
Index len = size - j - 2;
const Scalar* EIGEN_RESTRICT up = u + j + 2;
const Scalar* EIGEN_RESTRICT vp = v + j + 2;
Scalar* EIGEN_RESTRICT d0 = col0 + 2;
Scalar* EIGEN_RESTRICT d1 = col1 + 1;
Index k = 0;
Index vectorizedEnd = (len / PacketSize) * PacketSize;
for (; k < vectorizedEnd; k += PacketSize) {
Packet ui = ploadu<Packet>(up + k);
Packet vi = ploadu<Packet>(vp + k);
Packet m0 = ploadu<Packet>(d0 + k);
m0 = pmadd(vi, ps0u, m0);
m0 = pmadd(ui, ps0v, m0);
pstoreu(d0 + k, m0);
Packet m1 = ploadu<Packet>(d1 + k);
m1 = pmadd(vi, ps1u, m1);
m1 = pmadd(ui, ps1v, m1);
pstoreu(d1 + k, m1);
}
for (; k < len; ++k) {
d0[k] += s0u * vp[k] + s0v * up[k];
d1[k] += s1u * vp[k] + s1v * up[k];
}
}
// Handle last column if size is odd
if (j < size) {
Scalar su = cAlpha * numext::conj(u[j]);
Scalar sv = alpha * numext::conj(v[j]);
Packet psu = pset1<Packet>(su);
Packet psv = pset1<Packet>(sv);
Scalar* EIGEN_RESTRICT dst = mat + stride * j + j;
const Scalar* EIGEN_RESTRICT up = u + j;
const Scalar* EIGEN_RESTRICT vp = v + j;
Index len = size - j;
Index k = 0;
Index vectorizedEnd = (len / PacketSize) * PacketSize;
for (; k < vectorizedEnd; k += PacketSize) {
Packet ui = ploadu<Packet>(up + k);
Packet vi = ploadu<Packet>(vp + k);
Packet di = ploadu<Packet>(dst + k);
di = pmadd(vi, psu, di);
di = pmadd(ui, psv, di);
pstoreu(dst + k, di);
}
for (; k < len; ++k) {
dst[k] += su * vp[k] + sv * up[k];
}
}
}
};
template <typename Scalar, typename Index, typename UType, typename VType>
struct selfadjoint_rank2_update_selector<Scalar, Index, UType, VType, Upper> {
static void run(Scalar* mat, Index stride, const UType& u, const VType& v, const Scalar& alpha) {
const Index size = u.size();
for (Index i = 0; i < size; ++i)
Map<Matrix<Scalar, Dynamic, 1>>(mat + stride * i, i + 1) +=
(numext::conj(alpha) * numext::conj(u.coeff(i))) * v.head(i + 1) +
(alpha * numext::conj(v.coeff(i))) * u.head(i + 1);
template <typename Scalar, typename Index>
struct selfadjoint_rank2_update_selector<Scalar, Index, Upper> {
static void run(Index size, Scalar* mat, Index stride, const Scalar* u, const Scalar* v, const Scalar& alpha) {
typedef typename packet_traits<Scalar>::type Packet;
const Index PacketSize = unpacket_traits<Packet>::size;
const Scalar cAlpha = numext::conj(alpha);
// Process 2 columns at a time to share u/v loads and reduce loop overhead.
Index j = 0;
for (; j + 1 < size; j += 2) {
Scalar s0u = cAlpha * numext::conj(u[j]);
Scalar s0v = alpha * numext::conj(v[j]);
Scalar s1u = cAlpha * numext::conj(u[j + 1]);
Scalar s1v = alpha * numext::conj(v[j + 1]);
Packet ps0u = pset1<Packet>(s0u);
Packet ps0v = pset1<Packet>(s0v);
Packet ps1u = pset1<Packet>(s1u);
Packet ps1v = pset1<Packet>(s1v);
Scalar* EIGEN_RESTRICT col0 = mat + stride * j;
Scalar* EIGEN_RESTRICT col1 = mat + stride * (j + 1);
// Shared vectorized loop for rows 0..j-1
Index len = j;
Index k = 0;
Index vectorizedEnd = (len / PacketSize) * PacketSize;
for (; k < vectorizedEnd; k += PacketSize) {
Packet ui = ploadu<Packet>(u + k);
Packet vi = ploadu<Packet>(v + k);
Packet m0 = ploadu<Packet>(col0 + k);
m0 = pmadd(vi, ps0u, m0);
m0 = pmadd(ui, ps0v, m0);
pstoreu(col0 + k, m0);
Packet m1 = ploadu<Packet>(col1 + k);
m1 = pmadd(vi, ps1u, m1);
m1 = pmadd(ui, ps1v, m1);
pstoreu(col1 + k, m1);
}
for (; k < len; ++k) {
col0[k] += s0u * v[k] + s0v * u[k];
col1[k] += s1u * v[k] + s1v * u[k];
}
// Diagonal and cross-diagonal scalar elements
col0[j] += s0u * v[j] + s0v * u[j];
col1[j] += s1u * v[j] + s1v * u[j];
col1[j + 1] += s1u * v[j + 1] + s1v * u[j + 1];
}
// Handle last column if size is odd
if (j < size) {
Scalar su = cAlpha * numext::conj(u[j]);
Scalar sv = alpha * numext::conj(v[j]);
Packet psu = pset1<Packet>(su);
Packet psv = pset1<Packet>(sv);
Scalar* EIGEN_RESTRICT dst = mat + stride * j;
Index len = j + 1;
Index k = 0;
Index vectorizedEnd = (len / PacketSize) * PacketSize;
for (; k < vectorizedEnd; k += PacketSize) {
Packet ui = ploadu<Packet>(u + k);
Packet vi = ploadu<Packet>(v + k);
Packet di = ploadu<Packet>(dst + k);
di = pmadd(vi, psu, di);
di = pmadd(ui, psv, di);
pstoreu(dst + k, di);
}
for (; k < len; ++k) {
dst[k] += su * v[k] + sv * u[k];
}
}
}
};
@@ -69,23 +208,47 @@ EIGEN_DEVICE_FUNC SelfAdjointView<MatrixType, UpLo>& SelfAdjointView<MatrixType,
// If MatrixType is row major, then we use the routine for lower triangular in the upper triangular case and
// vice versa, and take the complex conjugate of all coefficients and vector entries.
enum {
IsRowMajor = (internal::traits<MatrixType>::Flags & RowMajorBit) ? 1 : 0,
// Only need to conjugate if complex and the condition triggers
NeedConjU = (int(IsRowMajor) ^ int(UBlasTraits::NeedToConjugate)) && NumTraits<Scalar>::IsComplex,
NeedConjV = (int(IsRowMajor) ^ int(VBlasTraits::NeedToConjugate)) && NumTraits<Scalar>::IsComplex,
UseUDirectly = ActualUType_::InnerStrideAtCompileTime == 1 && !NeedConjU,
UseVDirectly = ActualVType_::InnerStrideAtCompileTime == 1 && !NeedConjV
};
enum { IsRowMajor = (internal::traits<MatrixType>::Flags & RowMajorBit) ? 1 : 0 };
Scalar actualAlpha = alpha * UBlasTraits::extractScalarFactor(u.derived()) *
numext::conj(VBlasTraits::extractScalarFactor(v.derived()));
if (IsRowMajor) actualAlpha = numext::conj(actualAlpha);
typedef internal::remove_all_t<
typename internal::conj_expr_if<int(IsRowMajor) ^ int(UBlasTraits::NeedToConjugate), ActualUType_>::type>
UType;
typedef internal::remove_all_t<
typename internal::conj_expr_if<int(IsRowMajor) ^ int(VBlasTraits::NeedToConjugate), ActualVType_>::type>
VType;
internal::selfadjoint_rank2_update_selector<Scalar, Index, UType, VType,
(IsRowMajor ? int(UpLo == Upper ? Lower : Upper)
: UpLo)>::run(_expression().const_cast_derived().data(),
_expression().outerStride(), UType(actualU),
VType(actualV), actualAlpha);
const Index size = u.size();
// Copy u to contiguous buffer, applying conjugation if needed
internal::gemv_static_vector_if<Scalar, DerivedU::SizeAtCompileTime, DerivedU::MaxSizeAtCompileTime, !UseUDirectly>
static_u;
ei_declare_aligned_stack_constructed_variable(Scalar, uPtr, size,
(UseUDirectly ? const_cast<Scalar*>(actualU.data()) : static_u.data()));
if (!UseUDirectly) {
if (NeedConjU)
Map<typename ActualUType_::PlainObject>(uPtr, size) = actualU.conjugate();
else
Map<typename ActualUType_::PlainObject>(uPtr, size) = actualU;
}
// Copy v to contiguous buffer, applying conjugation if needed
internal::gemv_static_vector_if<Scalar, DerivedV::SizeAtCompileTime, DerivedV::MaxSizeAtCompileTime, !UseVDirectly>
static_v;
ei_declare_aligned_stack_constructed_variable(Scalar, vPtr, size,
(UseVDirectly ? const_cast<Scalar*>(actualV.data()) : static_v.data()));
if (!UseVDirectly) {
if (NeedConjV)
Map<typename ActualVType_::PlainObject>(vPtr, size) = actualV.conjugate();
else
Map<typename ActualVType_::PlainObject>(vPtr, size) = actualV;
}
internal::selfadjoint_rank2_update_selector<Scalar, Index, (IsRowMajor ? int(UpLo == Upper ? Lower : Upper) : UpLo)>::
run(size, _expression().const_cast_derived().data(), _expression().outerStride(), uPtr, vPtr, actualAlpha);
return *this;
}

View File

@@ -43,43 +43,102 @@ EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index, Mode, LhsScalar,
Index rows = IsLower ? _rows : (std::min)(_rows, _cols);
Index cols = IsLower ? (std::min)(_rows, _cols) : _cols;
typedef Map<const Matrix<LhsScalar, Dynamic, Dynamic, ColMajor>, 0, OuterStride<> > LhsMap;
const LhsMap lhs(lhs_, rows, cols, OuterStride<>(lhsStride));
typename conj_expr_if<ConjLhs, LhsMap>::type cjLhs(lhs);
typedef Map<const Matrix<RhsScalar, Dynamic, 1>, 0, InnerStride<> > RhsMap;
const RhsMap rhs(rhs_, cols, InnerStride<>(rhsIncr));
typename conj_expr_if<ConjRhs, RhsMap>::type cjRhs(rhs);
typedef Map<Matrix<ResScalar, Dynamic, 1> > ResMap;
ResMap res(res_, rows);
typedef const_blas_data_mapper<LhsScalar, Index, ColMajor> LhsMapper;
typedef const_blas_data_mapper<RhsScalar, Index, RowMajor> RhsMapper;
conj_if<ConjLhs> cjl;
conj_if<ConjRhs> cjr;
for (Index pi = 0; pi < size; pi += PanelWidth) {
Index actualPanelWidth = (std::min)(PanelWidth, size - pi);
for (Index k = 0; k < actualPanelWidth; ++k) {
Index i = pi + k;
Index s = IsLower ? ((HasUnitDiag || HasZeroDiag) ? i + 1 : i) : pi;
Index r = IsLower ? actualPanelWidth - k : k + 1;
if ((!(HasUnitDiag || HasZeroDiag)) || (--r) > 0)
res.segment(s, r) += (alpha * cjRhs.coeff(i)) * cjLhs.col(i).segment(s, r);
if (HasUnitDiag) res.coeffRef(i) += alpha * cjRhs.coeff(i);
// Process the triangular panel using raw pointer operations with 2-column batching
// to eliminate expression template overhead and share result loads/stores.
if (IsLower) {
Index k = 0;
for (; k + 1 < actualPanelWidth; k += 2) {
Index i0 = pi + k;
Index i1 = i0 + 1;
ResScalar s0 = alpha * cjr(rhs_[i0 * rhsIncr]);
ResScalar s1 = alpha * cjr(rhs_[i1 * rhsIncr]);
const LhsScalar* EIGEN_RESTRICT c0 = lhs_ + i0 * lhsStride;
const LhsScalar* EIGEN_RESTRICT c1 = lhs_ + i1 * lhsStride;
// Diagonal of column 0
if (!(HasUnitDiag || HasZeroDiag)) res_[i0] += s0 * cjl(c0[i0]);
// Row i1: contribution from column 0 + diagonal of column 1
{
ResScalar r1 = s0 * cjl(c0[i1]);
if (!(HasUnitDiag || HasZeroDiag)) r1 += s1 * cjl(c1[i1]);
res_[i1] += r1;
}
// Shared rows where both columns contribute
Index panelEnd = pi + actualPanelWidth;
for (Index j = i1 + 1; j < panelEnd; ++j) res_[j] += s0 * cjl(c0[j]) + s1 * cjl(c1[j]);
if (HasUnitDiag) {
res_[i0] += s0;
res_[i1] += s1;
}
}
if (k < actualPanelWidth) {
Index i = pi + k;
ResScalar s = alpha * cjr(rhs_[i * rhsIncr]);
const LhsScalar* EIGEN_RESTRICT c = lhs_ + i * lhsStride;
if (!(HasUnitDiag || HasZeroDiag)) res_[i] += s * cjl(c[i]);
if (HasUnitDiag) res_[i] += s;
}
} else {
// Upper triangular: process 2 columns at a time
Index k = 0;
for (; k + 1 < actualPanelWidth; k += 2) {
Index i0 = pi + k;
Index i1 = i0 + 1;
ResScalar s0 = alpha * cjr(rhs_[i0 * rhsIncr]);
ResScalar s1 = alpha * cjr(rhs_[i1 * rhsIncr]);
const LhsScalar* EIGEN_RESTRICT c0 = lhs_ + i0 * lhsStride;
const LhsScalar* EIGEN_RESTRICT c1 = lhs_ + i1 * lhsStride;
// Shared rows before the diagonal block
for (Index j = pi; j < i0; ++j) res_[j] += s0 * cjl(c0[j]) + s1 * cjl(c1[j]);
// Row i0: diagonal of col0 + contribution from col1
{
ResScalar r0 = s1 * cjl(c1[i0]);
if (!(HasUnitDiag || HasZeroDiag)) r0 += s0 * cjl(c0[i0]);
res_[i0] += r0;
}
// Diagonal of column 1
if (!(HasUnitDiag || HasZeroDiag)) res_[i1] += s1 * cjl(c1[i1]);
if (HasUnitDiag) {
res_[i0] += s0;
res_[i1] += s1;
}
}
if (k < actualPanelWidth) {
Index i = pi + k;
ResScalar s = alpha * cjr(rhs_[i * rhsIncr]);
const LhsScalar* EIGEN_RESTRICT c = lhs_ + i * lhsStride;
for (Index j = pi; j < i; ++j) res_[j] += s * cjl(c[j]);
if (!(HasUnitDiag || HasZeroDiag)) res_[i] += s * cjl(c[i]);
if (HasUnitDiag) res_[i] += s;
}
}
// Rectangular part: delegate to optimized GEMV
Index r = IsLower ? rows - pi - actualPanelWidth : pi;
if (r > 0) {
Index s = IsLower ? pi + actualPanelWidth : 0;
general_matrix_vector_product<Index, LhsScalar, LhsMapper, ColMajor, ConjLhs, RhsScalar, RhsMapper, ConjRhs,
BuiltIn>::run(r, actualPanelWidth, LhsMapper(&lhs.coeffRef(s, pi), lhsStride),
RhsMapper(&rhs.coeffRef(pi), rhsIncr), &res.coeffRef(s), resIncr,
alpha);
BuiltIn>::run(r, actualPanelWidth, LhsMapper(&lhs_[pi * lhsStride + s], lhsStride),
RhsMapper(&rhs_[pi * rhsIncr], rhsIncr), &res_[s], resIncr, alpha);
}
}
if ((!IsLower) && cols > size) {
general_matrix_vector_product<Index, LhsScalar, LhsMapper, ColMajor, ConjLhs, RhsScalar, RhsMapper, ConjRhs>::run(
rows, cols - size, LhsMapper(&lhs.coeffRef(0, size), lhsStride), RhsMapper(&rhs.coeffRef(size), rhsIncr), res_,
resIncr, alpha);
rows, cols - size, LhsMapper(&lhs_[size * lhsStride], lhsStride), RhsMapper(&rhs_[size * rhsIncr], rhsIncr),
res_, resIncr, alpha);
}
}
@@ -105,43 +164,48 @@ EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index, Mode, LhsScalar,
Index rows = IsLower ? _rows : diagSize;
Index cols = IsLower ? diagSize : _cols;
typedef Map<const Matrix<LhsScalar, Dynamic, Dynamic, RowMajor>, 0, OuterStride<> > LhsMap;
const LhsMap lhs(lhs_, rows, cols, OuterStride<>(lhsStride));
typename conj_expr_if<ConjLhs, LhsMap>::type cjLhs(lhs);
typedef Map<const Matrix<RhsScalar, Dynamic, 1> > RhsMap;
const RhsMap rhs(rhs_, cols);
typename conj_expr_if<ConjRhs, RhsMap>::type cjRhs(rhs);
typedef Map<Matrix<ResScalar, Dynamic, 1>, 0, InnerStride<> > ResMap;
ResMap res(res_, rows, InnerStride<>(resIncr));
typedef const_blas_data_mapper<LhsScalar, Index, RowMajor> LhsMapper;
typedef const_blas_data_mapper<RhsScalar, Index, RowMajor> RhsMapper;
conj_if<ConjLhs> cjl;
conj_if<ConjRhs> cjr;
for (Index pi = 0; pi < diagSize; pi += PanelWidth) {
Index actualPanelWidth = (std::min)(PanelWidth, diagSize - pi);
// Process the triangular panel using raw dot products to eliminate
// the cwiseProduct().sum() expression template overhead.
for (Index k = 0; k < actualPanelWidth; ++k) {
Index i = pi + k;
Index s = IsLower ? pi : ((HasUnitDiag || HasZeroDiag) ? i + 1 : i);
Index r = IsLower ? k + 1 : actualPanelWidth - k;
if ((!(HasUnitDiag || HasZeroDiag)) || (--r) > 0)
res.coeffRef(i) += alpha * (cjLhs.row(i).segment(s, r).cwiseProduct(cjRhs.segment(s, r).transpose())).sum();
if (HasUnitDiag) res.coeffRef(i) += alpha * cjRhs.coeff(i);
const LhsScalar* EIGEN_RESTRICT row_i = lhs_ + i * lhsStride;
ResScalar dot = ResScalar(0);
if (IsLower) {
Index s = pi;
Index len = (HasUnitDiag || HasZeroDiag) ? k : k + 1;
for (Index j = 0; j < len; ++j) dot += cjl(row_i[s + j]) * cjr(rhs_[s + j]);
} else {
Index s = (HasUnitDiag || HasZeroDiag) ? i + 1 : i;
Index len = pi + actualPanelWidth - s;
for (Index j = 0; j < len; ++j) dot += cjl(row_i[s + j]) * cjr(rhs_[s + j]);
}
res_[i * resIncr] += alpha * dot;
if (HasUnitDiag) res_[i * resIncr] += alpha * cjr(rhs_[i]);
}
// Rectangular part: delegate to optimized GEMV
Index r = IsLower ? pi : cols - pi - actualPanelWidth;
if (r > 0) {
Index s = IsLower ? 0 : pi + actualPanelWidth;
general_matrix_vector_product<Index, LhsScalar, LhsMapper, RowMajor, ConjLhs, RhsScalar, RhsMapper, ConjRhs,
BuiltIn>::run(actualPanelWidth, r, LhsMapper(&lhs.coeffRef(pi, s), lhsStride),
RhsMapper(&rhs.coeffRef(s), rhsIncr), &res.coeffRef(pi), resIncr,
alpha);
BuiltIn>::run(actualPanelWidth, r, LhsMapper(&lhs_[pi * lhsStride + s], lhsStride),
RhsMapper(&rhs_[s], rhsIncr), &res_[pi * resIncr], resIncr, alpha);
}
}
if (IsLower && rows > diagSize) {
general_matrix_vector_product<Index, LhsScalar, LhsMapper, RowMajor, ConjLhs, RhsScalar, RhsMapper, ConjRhs>::run(
rows - diagSize, cols, LhsMapper(&lhs.coeffRef(diagSize, 0), lhsStride), RhsMapper(&rhs.coeffRef(0), rhsIncr),
&res.coeffRef(diagSize), resIncr, alpha);
rows - diagSize, cols, LhsMapper(&lhs_[diagSize * lhsStride], lhsStride), RhsMapper(rhs_, rhsIncr),
&res_[diagSize * resIncr], resIncr, alpha);
}
}

View File

@@ -14,6 +14,10 @@ eigen_add_benchmark(bench_map bench_map.cpp)
eigen_add_benchmark(bench_diagonal bench_diagonal.cpp)
eigen_add_benchmark(bench_triangular_product bench_triangular_product.cpp)
eigen_add_benchmark(bench_selfadjoint_product bench_selfadjoint_product.cpp)
eigen_add_benchmark(bench_symv bench_symv.cpp)
eigen_add_benchmark(bench_trmv bench_trmv.cpp)
eigen_add_benchmark(bench_syr bench_syr.cpp)
eigen_add_benchmark(bench_syr2 bench_syr2.cpp)
eigen_add_benchmark(bench_construction bench_construction.cpp)
eigen_add_benchmark(bench_fixed_size bench_fixed_size.cpp)
eigen_add_benchmark(bench_fixed_size_double bench_fixed_size.cpp DEFINITIONS SCALAR=double)

View File

@@ -0,0 +1,62 @@
// Benchmarks for selfadjoint matrix-vector product (SYMV/HEMV).
//
// Tests y += selfadjointView(A) * x for various sizes and scalar types.
// Exercises SelfadjointMatrixVector.h kernel.
#include <benchmark/benchmark.h>
#include <Eigen/Core>
using namespace Eigen;
template <typename Scalar>
double symvFlops(Index n) {
// SYMV uses n^2 multiply-adds (exploiting symmetry)
return (NumTraits<Scalar>::IsComplex ? 8.0 : 2.0) * n * n;
}
// y += selfadjointView<Lower>(A) * x
template <typename Scalar>
static void BM_SYMV_Lower(benchmark::State& state) {
const Index n = state.range(0);
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using Vec = Matrix<Scalar, Dynamic, 1>;
Mat A = Mat::Random(n, n);
A = (A + A.transpose().eval()) / Scalar(2);
Vec x = Vec::Random(n);
Vec y = Vec::Random(n);
for (auto _ : state) {
y.noalias() += A.template selfadjointView<Lower>() * x;
benchmark::DoNotOptimize(y.data());
benchmark::ClobberMemory();
}
state.counters["GFLOPS"] = benchmark::Counter(symvFlops<Scalar>(n), benchmark::Counter::kIsIterationInvariantRate,
benchmark::Counter::kIs1000);
}
// y += selfadjointView<Upper>(A) * x
template <typename Scalar>
static void BM_SYMV_Upper(benchmark::State& state) {
const Index n = state.range(0);
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using Vec = Matrix<Scalar, Dynamic, 1>;
Mat A = Mat::Random(n, n);
A = (A + A.transpose().eval()) / Scalar(2);
Vec x = Vec::Random(n);
Vec y = Vec::Random(n);
for (auto _ : state) {
y.noalias() += A.template selfadjointView<Upper>() * x;
benchmark::DoNotOptimize(y.data());
benchmark::ClobberMemory();
}
state.counters["GFLOPS"] = benchmark::Counter(symvFlops<Scalar>(n), benchmark::Counter::kIsIterationInvariantRate,
benchmark::Counter::kIs1000);
}
static void SymvSizes(::benchmark::Benchmark* b) {
for (int n : {8, 16, 32, 64, 128, 256, 512, 1024, 2048}) b->Arg(n);
}
BENCHMARK(BM_SYMV_Lower<float>)->Apply(SymvSizes)->Name("SYMV_Lower_float");
BENCHMARK(BM_SYMV_Lower<double>)->Apply(SymvSizes)->Name("SYMV_Lower_double");
BENCHMARK(BM_SYMV_Upper<float>)->Apply(SymvSizes)->Name("SYMV_Upper_float");
BENCHMARK(BM_SYMV_Upper<double>)->Apply(SymvSizes)->Name("SYMV_Upper_double");

View File

@@ -0,0 +1,59 @@
// Benchmarks for symmetric rank-1 update (SYR).
//
// Tests C.selfadjointView<Lower>().rankUpdate(v, alpha) which computes
// C += alpha * v * v^T, updating only the lower (or upper) triangle.
// Exercises SelfadjointProduct.h / selfadjoint_rank1_update.
#include <benchmark/benchmark.h>
#include <Eigen/Core>
using namespace Eigen;
template <typename Scalar>
double syrFlops(Index n) {
// SYR: n*(n+1)/2 multiply-adds ~ n^2
return (NumTraits<Scalar>::IsComplex ? 8.0 : 2.0) * n * (n + 1) / 2;
}
template <typename Scalar>
static void BM_SYR_Lower(benchmark::State& state) {
const Index n = state.range(0);
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using Vec = Matrix<Scalar, Dynamic, 1>;
Vec v = Vec::Random(n);
Mat C = Mat::Zero(n, n);
Scalar alpha(1);
for (auto _ : state) {
C.template selfadjointView<Lower>().rankUpdate(v, alpha);
benchmark::DoNotOptimize(C.data());
benchmark::ClobberMemory();
}
state.counters["GFLOPS"] = benchmark::Counter(syrFlops<Scalar>(n), benchmark::Counter::kIsIterationInvariantRate,
benchmark::Counter::kIs1000);
}
template <typename Scalar>
static void BM_SYR_Upper(benchmark::State& state) {
const Index n = state.range(0);
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using Vec = Matrix<Scalar, Dynamic, 1>;
Vec v = Vec::Random(n);
Mat C = Mat::Zero(n, n);
Scalar alpha(1);
for (auto _ : state) {
C.template selfadjointView<Upper>().rankUpdate(v, alpha);
benchmark::DoNotOptimize(C.data());
benchmark::ClobberMemory();
}
state.counters["GFLOPS"] = benchmark::Counter(syrFlops<Scalar>(n), benchmark::Counter::kIsIterationInvariantRate,
benchmark::Counter::kIs1000);
}
static void SyrSizes(::benchmark::Benchmark* b) {
for (int n : {8, 16, 32, 64, 128, 256, 512, 1024, 2048}) b->Arg(n);
}
BENCHMARK(BM_SYR_Lower<float>)->Apply(SyrSizes)->Name("SYR_Lower_float");
BENCHMARK(BM_SYR_Lower<double>)->Apply(SyrSizes)->Name("SYR_Lower_double");
BENCHMARK(BM_SYR_Upper<float>)->Apply(SyrSizes)->Name("SYR_Upper_float");
BENCHMARK(BM_SYR_Upper<double>)->Apply(SyrSizes)->Name("SYR_Upper_double");

View File

@@ -0,0 +1,61 @@
// Benchmarks for symmetric rank-2 update (SYR2).
//
// Tests C.selfadjointView<Lower>().rankUpdate(u, v, alpha) which computes
// C += alpha * u * v^T + conj(alpha) * v * u^T.
// Exercises SelfadjointRank2Update.h.
#include <benchmark/benchmark.h>
#include <Eigen/Core>
using namespace Eigen;
template <typename Scalar>
double syr2Flops(Index n) {
// SYR2: 2 * n*(n+1)/2 multiply-adds ~ 2*n^2
return (NumTraits<Scalar>::IsComplex ? 8.0 : 2.0) * 2 * n * (n + 1) / 2;
}
template <typename Scalar>
static void BM_SYR2_Lower(benchmark::State& state) {
const Index n = state.range(0);
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using Vec = Matrix<Scalar, Dynamic, 1>;
Vec u = Vec::Random(n);
Vec v = Vec::Random(n);
Mat C = Mat::Zero(n, n);
Scalar alpha(1);
for (auto _ : state) {
C.template selfadjointView<Lower>().rankUpdate(u, v, alpha);
benchmark::DoNotOptimize(C.data());
benchmark::ClobberMemory();
}
state.counters["GFLOPS"] = benchmark::Counter(syr2Flops<Scalar>(n), benchmark::Counter::kIsIterationInvariantRate,
benchmark::Counter::kIs1000);
}
template <typename Scalar>
static void BM_SYR2_Upper(benchmark::State& state) {
const Index n = state.range(0);
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using Vec = Matrix<Scalar, Dynamic, 1>;
Vec u = Vec::Random(n);
Vec v = Vec::Random(n);
Mat C = Mat::Zero(n, n);
Scalar alpha(1);
for (auto _ : state) {
C.template selfadjointView<Upper>().rankUpdate(u, v, alpha);
benchmark::DoNotOptimize(C.data());
benchmark::ClobberMemory();
}
state.counters["GFLOPS"] = benchmark::Counter(syr2Flops<Scalar>(n), benchmark::Counter::kIsIterationInvariantRate,
benchmark::Counter::kIs1000);
}
static void Syr2Sizes(::benchmark::Benchmark* b) {
for (int n : {8, 16, 32, 64, 128, 256, 512, 1024, 2048}) b->Arg(n);
}
BENCHMARK(BM_SYR2_Lower<float>)->Apply(Syr2Sizes)->Name("SYR2_Lower_float");
BENCHMARK(BM_SYR2_Lower<double>)->Apply(Syr2Sizes)->Name("SYR2_Lower_double");
BENCHMARK(BM_SYR2_Upper<float>)->Apply(Syr2Sizes)->Name("SYR2_Upper_float");
BENCHMARK(BM_SYR2_Upper<double>)->Apply(Syr2Sizes)->Name("SYR2_Upper_double");

View File

@@ -0,0 +1,46 @@
// Benchmarks for triangular matrix-vector product (TRMV).
//
// Tests y += triangularView(A) * x for various modes and sizes.
// Exercises TriangularMatrixVector.h kernel.
#include <benchmark/benchmark.h>
#include <Eigen/Core>
using namespace Eigen;
template <typename Scalar>
double trmvFlops(Index n) {
// TRMV: ~n^2 multiply-adds
return (NumTraits<Scalar>::IsComplex ? 8.0 : 2.0) * n * n;
}
// y = triangularView<Mode>(A) * x
template <typename Scalar, unsigned int Mode>
static void BM_TRMV(benchmark::State& state) {
const Index n = state.range(0);
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using Vec = Matrix<Scalar, Dynamic, 1>;
Mat A = Mat::Random(n, n);
Vec x = Vec::Random(n);
Vec y(n);
for (auto _ : state) {
y.noalias() = A.template triangularView<Mode>() * x;
benchmark::DoNotOptimize(y.data());
benchmark::ClobberMemory();
}
state.counters["GFLOPS"] = benchmark::Counter(trmvFlops<Scalar>(n), benchmark::Counter::kIsIterationInvariantRate,
benchmark::Counter::kIs1000);
}
static void TrmvSizes(::benchmark::Benchmark* b) {
for (int n : {8, 16, 32, 64, 128, 256, 512, 1024, 2048}) b->Arg(n);
}
BENCHMARK(BM_TRMV<float, Lower>)->Apply(TrmvSizes)->Name("TRMV_float_Lower");
BENCHMARK(BM_TRMV<float, Upper>)->Apply(TrmvSizes)->Name("TRMV_float_Upper");
BENCHMARK(BM_TRMV<float, UnitLower>)->Apply(TrmvSizes)->Name("TRMV_float_UnitLower");
BENCHMARK(BM_TRMV<float, UnitUpper>)->Apply(TrmvSizes)->Name("TRMV_float_UnitUpper");
BENCHMARK(BM_TRMV<double, Lower>)->Apply(TrmvSizes)->Name("TRMV_double_Lower");
BENCHMARK(BM_TRMV<double, Upper>)->Apply(TrmvSizes)->Name("TRMV_double_Upper");
BENCHMARK(BM_TRMV<double, UnitLower>)->Apply(TrmvSizes)->Name("TRMV_double_UnitLower");
BENCHMARK(BM_TRMV<double, UnitUpper>)->Apply(TrmvSizes)->Name("TRMV_double_UnitUpper");