mirror of
https://gitlab.com/libeigen/eigen.git
synced 2026-04-10 11:34:33 +08:00
Optimize SYMV, SYR, SYR2, and TRMV product kernels
libeigen/eigen!2228 Co-authored-by: Rasmus Munk Larsen <rmlarsen@gmail.com>
This commit is contained in:
@@ -18,9 +18,11 @@ namespace Eigen {
|
||||
namespace internal {
|
||||
|
||||
/* Optimized selfadjoint matrix * vector product:
|
||||
* This algorithm processes 2 columns at once that allows to both reduce
|
||||
* the number of load/stores of the result by a factor 2 and to reduce
|
||||
* the instruction dependency.
|
||||
* This algorithm processes 4 columns at once to reduce the number of
|
||||
* load/stores of the result vector by a factor of 4 compared to the
|
||||
* naive approach, and to increase instruction-level parallelism.
|
||||
* A 2-column cleanup handles the remaining even columns, and a
|
||||
* 1-column loop handles any final odd column.
|
||||
*/
|
||||
|
||||
template <typename Scalar, typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs,
|
||||
@@ -61,82 +63,246 @@ selfadjoint_matrix_vector_product<Scalar, Index, StorageOrder, UpLo, ConjugateLh
|
||||
|
||||
Scalar cjAlpha = ConjugateRhs ? numext::conj(alpha) : alpha;
|
||||
|
||||
Index bound = numext::maxi(Index(0), size - 8) & 0xfffffffe;
|
||||
if (FirstTriangular) bound = size - bound;
|
||||
// Compute column counts for 4-col, 2-col, and 1-col processing phases.
|
||||
// We leave up to ~8 columns near the diagonal for cleanup (short off-diagonal ranges).
|
||||
Index n4 = (numext::maxi(Index(0), size - 8) / 4) * 4;
|
||||
Index n2 = ((size - n4) / 2) * 2;
|
||||
// Remaining (size - n4 - n2) is 0 or 1 columns.
|
||||
|
||||
for (Index j = FirstTriangular ? bound : 0; j < (FirstTriangular ? size : bound); j += 2) {
|
||||
const Scalar* EIGEN_RESTRICT A0 = lhs + j * lhsStride;
|
||||
const Scalar* EIGEN_RESTRICT A1 = lhs + (j + 1) * lhsStride;
|
||||
// For !FirstTriangular: 4-col [0, n4), 2-col [n4, n4+n2), 1-col [n4+n2, size)
|
||||
// For FirstTriangular: 1-col [0, size-n4-n2), 2-col [size-n4-n2, size-n4), 4-col [size-n4, size)
|
||||
|
||||
Scalar t0 = cjAlpha * rhs[j];
|
||||
Packet ptmp0 = pset1<Packet>(t0);
|
||||
Scalar t1 = cjAlpha * rhs[j + 1];
|
||||
Packet ptmp1 = pset1<Packet>(t1);
|
||||
// === Phase 1: 4 columns at a time ===
|
||||
{
|
||||
Index jStart = FirstTriangular ? (size - n4) : 0;
|
||||
Index jEnd = FirstTriangular ? size : n4;
|
||||
|
||||
Scalar t2(0);
|
||||
Packet ptmp2 = pset1<Packet>(t2);
|
||||
Scalar t3(0);
|
||||
Packet ptmp3 = pset1<Packet>(t3);
|
||||
for (Index j = jStart; j < jEnd; j += 4) {
|
||||
const Scalar* EIGEN_RESTRICT A0 = lhs + j * lhsStride;
|
||||
const Scalar* EIGEN_RESTRICT A1 = lhs + (j + 1) * lhsStride;
|
||||
const Scalar* EIGEN_RESTRICT A2 = lhs + (j + 2) * lhsStride;
|
||||
const Scalar* EIGEN_RESTRICT A3 = lhs + (j + 3) * lhsStride;
|
||||
|
||||
Index starti = FirstTriangular ? 0 : j + 2;
|
||||
Index endi = FirstTriangular ? j : size;
|
||||
Index alignedStart = (starti) + internal::first_default_aligned(&res[starti], endi - starti);
|
||||
Index alignedEnd = alignedStart + ((endi - alignedStart) / (PacketSize)) * (PacketSize);
|
||||
Scalar t0 = cjAlpha * rhs[j];
|
||||
Scalar t1 = cjAlpha * rhs[j + 1];
|
||||
Scalar t2 = cjAlpha * rhs[j + 2];
|
||||
Scalar t3 = cjAlpha * rhs[j + 3];
|
||||
Packet ptmp0 = pset1<Packet>(t0);
|
||||
Packet ptmp1 = pset1<Packet>(t1);
|
||||
Packet ptmp2 = pset1<Packet>(t2);
|
||||
Packet ptmp3 = pset1<Packet>(t3);
|
||||
|
||||
res[j] += cjd.pmul(numext::real(A0[j]), t0);
|
||||
res[j + 1] += cjd.pmul(numext::real(A1[j + 1]), t1);
|
||||
if (FirstTriangular) {
|
||||
res[j] += cj0.pmul(A1[j], t1);
|
||||
t3 += cj1.pmul(A1[j], rhs[j]);
|
||||
} else {
|
||||
res[j + 1] += cj0.pmul(A0[j + 1], t0);
|
||||
t2 += cj1.pmul(A0[j + 1], rhs[j + 1]);
|
||||
Scalar t4(0), t5(0), t6(0), t7(0);
|
||||
Packet ptmp4 = pzero(Packet{});
|
||||
Packet ptmp5 = pzero(Packet{});
|
||||
Packet ptmp6 = pzero(Packet{});
|
||||
Packet ptmp7 = pzero(Packet{});
|
||||
|
||||
Index starti = FirstTriangular ? 0 : j + 4;
|
||||
Index endi = FirstTriangular ? j : size;
|
||||
Index alignedStart = starti + internal::first_default_aligned(&res[starti], endi - starti);
|
||||
Index alignedEnd = alignedStart + ((endi - alignedStart) / PacketSize) * PacketSize;
|
||||
|
||||
// Handle the 4x4 diagonal block: diagonal elements
|
||||
res[j] += cjd.pmul(numext::real(A0[j]), t0);
|
||||
res[j + 1] += cjd.pmul(numext::real(A1[j + 1]), t1);
|
||||
res[j + 2] += cjd.pmul(numext::real(A2[j + 2]), t2);
|
||||
res[j + 3] += cjd.pmul(numext::real(A3[j + 3]), t3);
|
||||
|
||||
// Handle the 4x4 diagonal block: off-diagonal cross terms
|
||||
if (FirstTriangular) {
|
||||
// Upper triangle stored (A_k[l] for l <= k)
|
||||
res[j] += cj0.pmul(A1[j], t1) + cj0.pmul(A2[j], t2) + cj0.pmul(A3[j], t3);
|
||||
res[j + 1] += cj0.pmul(A2[j + 1], t2) + cj0.pmul(A3[j + 1], t3);
|
||||
res[j + 2] += cj0.pmul(A3[j + 2], t3);
|
||||
|
||||
t5 += cj1.pmul(A1[j], rhs[j]);
|
||||
t6 += cj1.pmul(A2[j], rhs[j]) + cj1.pmul(A2[j + 1], rhs[j + 1]);
|
||||
t7 += cj1.pmul(A3[j], rhs[j]) + cj1.pmul(A3[j + 1], rhs[j + 1]) + cj1.pmul(A3[j + 2], rhs[j + 2]);
|
||||
} else {
|
||||
// Lower triangle stored (A_k[l] for l >= k)
|
||||
res[j + 1] += cj0.pmul(A0[j + 1], t0);
|
||||
res[j + 2] += cj0.pmul(A0[j + 2], t0) + cj0.pmul(A1[j + 2], t1);
|
||||
res[j + 3] += cj0.pmul(A0[j + 3], t0) + cj0.pmul(A1[j + 3], t1) + cj0.pmul(A2[j + 3], t2);
|
||||
|
||||
t4 += cj1.pmul(A0[j + 1], rhs[j + 1]) + cj1.pmul(A0[j + 2], rhs[j + 2]) + cj1.pmul(A0[j + 3], rhs[j + 3]);
|
||||
t5 += cj1.pmul(A1[j + 2], rhs[j + 2]) + cj1.pmul(A1[j + 3], rhs[j + 3]);
|
||||
t6 += cj1.pmul(A2[j + 3], rhs[j + 3]);
|
||||
}
|
||||
|
||||
// Pre-alignment scalar loop
|
||||
for (Index i = starti; i < alignedStart; ++i) {
|
||||
res[i] += cj0.pmul(A0[i], t0) + cj0.pmul(A1[i], t1) + cj0.pmul(A2[i], t2) + cj0.pmul(A3[i], t3);
|
||||
t4 += cj1.pmul(A0[i], rhs[i]);
|
||||
t5 += cj1.pmul(A1[i], rhs[i]);
|
||||
t6 += cj1.pmul(A2[i], rhs[i]);
|
||||
t7 += cj1.pmul(A3[i], rhs[i]);
|
||||
}
|
||||
|
||||
// Main vectorized loop: 4 matrix column loads, 1 rhs load, 1 result load/store
|
||||
const Scalar* EIGEN_RESTRICT a0It = A0 + alignedStart;
|
||||
const Scalar* EIGEN_RESTRICT a1It = A1 + alignedStart;
|
||||
const Scalar* EIGEN_RESTRICT a2It = A2 + alignedStart;
|
||||
const Scalar* EIGEN_RESTRICT a3It = A3 + alignedStart;
|
||||
const Scalar* EIGEN_RESTRICT rhsIt = rhs + alignedStart;
|
||||
Scalar* EIGEN_RESTRICT resIt = res + alignedStart;
|
||||
for (Index i = alignedStart; i < alignedEnd; i += PacketSize) {
|
||||
Packet A0i = ploadu<Packet>(a0It);
|
||||
a0It += PacketSize;
|
||||
Packet A1i = ploadu<Packet>(a1It);
|
||||
a1It += PacketSize;
|
||||
Packet A2i = ploadu<Packet>(a2It);
|
||||
a2It += PacketSize;
|
||||
Packet A3i = ploadu<Packet>(a3It);
|
||||
a3It += PacketSize;
|
||||
Packet Bi = ploadu<Packet>(rhsIt);
|
||||
rhsIt += PacketSize;
|
||||
Packet Xi = pload<Packet>(resIt);
|
||||
|
||||
Xi = pcj0.pmadd(A0i, ptmp0, Xi);
|
||||
Xi = pcj0.pmadd(A1i, ptmp1, Xi);
|
||||
Xi = pcj0.pmadd(A2i, ptmp2, Xi);
|
||||
Xi = pcj0.pmadd(A3i, ptmp3, Xi);
|
||||
pstore(resIt, Xi);
|
||||
resIt += PacketSize;
|
||||
|
||||
ptmp4 = pcj1.pmadd(A0i, Bi, ptmp4);
|
||||
ptmp5 = pcj1.pmadd(A1i, Bi, ptmp5);
|
||||
ptmp6 = pcj1.pmadd(A2i, Bi, ptmp6);
|
||||
ptmp7 = pcj1.pmadd(A3i, Bi, ptmp7);
|
||||
}
|
||||
|
||||
// Post-alignment scalar loop
|
||||
for (Index i = alignedEnd; i < endi; ++i) {
|
||||
res[i] += cj0.pmul(A0[i], t0) + cj0.pmul(A1[i], t1) + cj0.pmul(A2[i], t2) + cj0.pmul(A3[i], t3);
|
||||
t4 += cj1.pmul(A0[i], rhs[i]);
|
||||
t5 += cj1.pmul(A1[i], rhs[i]);
|
||||
t6 += cj1.pmul(A2[i], rhs[i]);
|
||||
t7 += cj1.pmul(A3[i], rhs[i]);
|
||||
}
|
||||
|
||||
res[j] += alpha * (t4 + predux(ptmp4));
|
||||
res[j + 1] += alpha * (t5 + predux(ptmp5));
|
||||
res[j + 2] += alpha * (t6 + predux(ptmp6));
|
||||
res[j + 3] += alpha * (t7 + predux(ptmp7));
|
||||
}
|
||||
|
||||
for (Index i = starti; i < alignedStart; ++i) {
|
||||
res[i] += cj0.pmul(A0[i], t0) + cj0.pmul(A1[i], t1);
|
||||
t2 += cj1.pmul(A0[i], rhs[i]);
|
||||
t3 += cj1.pmul(A1[i], rhs[i]);
|
||||
}
|
||||
const Scalar* EIGEN_RESTRICT a0It = A0 + alignedStart;
|
||||
const Scalar* EIGEN_RESTRICT a1It = A1 + alignedStart;
|
||||
const Scalar* EIGEN_RESTRICT rhsIt = rhs + alignedStart;
|
||||
Scalar* EIGEN_RESTRICT resIt = res + alignedStart;
|
||||
for (Index i = alignedStart; i < alignedEnd; i += PacketSize) {
|
||||
Packet A0i = ploadu<Packet>(a0It);
|
||||
a0It += PacketSize;
|
||||
Packet A1i = ploadu<Packet>(a1It);
|
||||
a1It += PacketSize;
|
||||
Packet Bi = ploadu<Packet>(rhsIt);
|
||||
rhsIt += PacketSize; // FIXME: should be aligned in most cases.
|
||||
Packet Xi = pload<Packet>(resIt);
|
||||
|
||||
Xi = pcj0.pmadd(A0i, ptmp0, pcj0.pmadd(A1i, ptmp1, Xi));
|
||||
ptmp2 = pcj1.pmadd(A0i, Bi, ptmp2);
|
||||
ptmp3 = pcj1.pmadd(A1i, Bi, ptmp3);
|
||||
pstore(resIt, Xi);
|
||||
resIt += PacketSize;
|
||||
}
|
||||
for (Index i = alignedEnd; i < endi; i++) {
|
||||
res[i] += cj0.pmul(A0[i], t0) + cj0.pmul(A1[i], t1);
|
||||
t2 += cj1.pmul(A0[i], rhs[i]);
|
||||
t3 += cj1.pmul(A1[i], rhs[i]);
|
||||
}
|
||||
|
||||
res[j] += alpha * (t2 + predux(ptmp2));
|
||||
res[j + 1] += alpha * (t3 + predux(ptmp3));
|
||||
}
|
||||
for (Index j = FirstTriangular ? 0 : bound; j < (FirstTriangular ? bound : size); j++) {
|
||||
const Scalar* EIGEN_RESTRICT A0 = lhs + j * lhsStride;
|
||||
|
||||
Scalar t1 = cjAlpha * rhs[j];
|
||||
Scalar t2(0);
|
||||
res[j] += cjd.pmul(numext::real(A0[j]), t1);
|
||||
for (Index i = FirstTriangular ? 0 : j + 1; i < (FirstTriangular ? j : size); i++) {
|
||||
res[i] += cj0.pmul(A0[i], t1);
|
||||
t2 += cj1.pmul(A0[i], rhs[i]);
|
||||
// === Phase 2: 2 columns at a time ===
|
||||
{
|
||||
Index jStart = FirstTriangular ? (size - n4 - n2) : n4;
|
||||
Index jEnd = FirstTriangular ? (size - n4) : (n4 + n2);
|
||||
|
||||
for (Index j = jStart; j < jEnd; j += 2) {
|
||||
const Scalar* EIGEN_RESTRICT A0 = lhs + j * lhsStride;
|
||||
const Scalar* EIGEN_RESTRICT A1 = lhs + (j + 1) * lhsStride;
|
||||
|
||||
Scalar t0 = cjAlpha * rhs[j];
|
||||
Packet ptmp0 = pset1<Packet>(t0);
|
||||
Scalar t1 = cjAlpha * rhs[j + 1];
|
||||
Packet ptmp1 = pset1<Packet>(t1);
|
||||
|
||||
Scalar t2(0);
|
||||
Packet ptmp2 = pzero(Packet{});
|
||||
Scalar t3(0);
|
||||
Packet ptmp3 = pzero(Packet{});
|
||||
|
||||
Index starti = FirstTriangular ? 0 : j + 2;
|
||||
Index endi = FirstTriangular ? j : size;
|
||||
Index alignedStart = starti + internal::first_default_aligned(&res[starti], endi - starti);
|
||||
Index alignedEnd = alignedStart + ((endi - alignedStart) / PacketSize) * PacketSize;
|
||||
|
||||
res[j] += cjd.pmul(numext::real(A0[j]), t0);
|
||||
res[j + 1] += cjd.pmul(numext::real(A1[j + 1]), t1);
|
||||
if (FirstTriangular) {
|
||||
res[j] += cj0.pmul(A1[j], t1);
|
||||
t3 += cj1.pmul(A1[j], rhs[j]);
|
||||
} else {
|
||||
res[j + 1] += cj0.pmul(A0[j + 1], t0);
|
||||
t2 += cj1.pmul(A0[j + 1], rhs[j + 1]);
|
||||
}
|
||||
|
||||
for (Index i = starti; i < alignedStart; ++i) {
|
||||
res[i] += cj0.pmul(A0[i], t0) + cj0.pmul(A1[i], t1);
|
||||
t2 += cj1.pmul(A0[i], rhs[i]);
|
||||
t3 += cj1.pmul(A1[i], rhs[i]);
|
||||
}
|
||||
const Scalar* EIGEN_RESTRICT a0It = A0 + alignedStart;
|
||||
const Scalar* EIGEN_RESTRICT a1It = A1 + alignedStart;
|
||||
const Scalar* EIGEN_RESTRICT rhsIt = rhs + alignedStart;
|
||||
Scalar* EIGEN_RESTRICT resIt = res + alignedStart;
|
||||
for (Index i = alignedStart; i < alignedEnd; i += PacketSize) {
|
||||
Packet A0i = ploadu<Packet>(a0It);
|
||||
a0It += PacketSize;
|
||||
Packet A1i = ploadu<Packet>(a1It);
|
||||
a1It += PacketSize;
|
||||
Packet Bi = ploadu<Packet>(rhsIt);
|
||||
rhsIt += PacketSize;
|
||||
Packet Xi = pload<Packet>(resIt);
|
||||
|
||||
Xi = pcj0.pmadd(A0i, ptmp0, pcj0.pmadd(A1i, ptmp1, Xi));
|
||||
ptmp2 = pcj1.pmadd(A0i, Bi, ptmp2);
|
||||
ptmp3 = pcj1.pmadd(A1i, Bi, ptmp3);
|
||||
pstore(resIt, Xi);
|
||||
resIt += PacketSize;
|
||||
}
|
||||
for (Index i = alignedEnd; i < endi; i++) {
|
||||
res[i] += cj0.pmul(A0[i], t0) + cj0.pmul(A1[i], t1);
|
||||
t2 += cj1.pmul(A0[i], rhs[i]);
|
||||
t3 += cj1.pmul(A1[i], rhs[i]);
|
||||
}
|
||||
|
||||
res[j] += alpha * (t2 + predux(ptmp2));
|
||||
res[j + 1] += alpha * (t3 + predux(ptmp3));
|
||||
}
|
||||
}
|
||||
|
||||
// === Phase 3: 1 column at a time ===
|
||||
{
|
||||
Index jStart = FirstTriangular ? 0 : (n4 + n2);
|
||||
Index jEnd = FirstTriangular ? (size - n4 - n2) : size;
|
||||
|
||||
for (Index j = jStart; j < jEnd; j++) {
|
||||
const Scalar* EIGEN_RESTRICT A0 = lhs + j * lhsStride;
|
||||
|
||||
Scalar t1 = cjAlpha * rhs[j];
|
||||
Scalar t2(0);
|
||||
Packet ptmp1 = pset1<Packet>(t1);
|
||||
Packet ptmp2 = pzero(Packet{});
|
||||
|
||||
res[j] += cjd.pmul(numext::real(A0[j]), t1);
|
||||
|
||||
Index starti = FirstTriangular ? 0 : j + 1;
|
||||
Index endi = FirstTriangular ? j : size;
|
||||
Index alignedStart = starti + internal::first_default_aligned(&res[starti], endi - starti);
|
||||
Index alignedEnd = alignedStart + ((endi - alignedStart) / PacketSize) * PacketSize;
|
||||
|
||||
for (Index i = starti; i < alignedStart; ++i) {
|
||||
res[i] += cj0.pmul(A0[i], t1);
|
||||
t2 += cj1.pmul(A0[i], rhs[i]);
|
||||
}
|
||||
const Scalar* EIGEN_RESTRICT a0It = A0 + alignedStart;
|
||||
const Scalar* EIGEN_RESTRICT rhsIt = rhs + alignedStart;
|
||||
Scalar* EIGEN_RESTRICT resIt = res + alignedStart;
|
||||
for (Index i = alignedStart; i < alignedEnd; i += PacketSize) {
|
||||
Packet A0i = ploadu<Packet>(a0It);
|
||||
a0It += PacketSize;
|
||||
Packet Bi = ploadu<Packet>(rhsIt);
|
||||
rhsIt += PacketSize;
|
||||
Packet Xi = pload<Packet>(resIt);
|
||||
|
||||
Xi = pcj0.pmadd(A0i, ptmp1, Xi);
|
||||
pstore(resIt, Xi);
|
||||
resIt += PacketSize;
|
||||
|
||||
ptmp2 = pcj1.pmadd(A0i, Bi, ptmp2);
|
||||
}
|
||||
for (Index i = alignedEnd; i < endi; i++) {
|
||||
res[i] += cj0.pmul(A0[i], t1);
|
||||
t2 += cj1.pmul(A0[i], rhs[i]);
|
||||
}
|
||||
res[j] += alpha * (t2 + predux(ptmp2));
|
||||
}
|
||||
res[j] += alpha * t2;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -24,14 +24,104 @@ namespace Eigen {
|
||||
template <typename Scalar, typename Index, int UpLo, bool ConjLhs, bool ConjRhs>
|
||||
struct selfadjoint_rank1_update<Scalar, Index, ColMajor, UpLo, ConjLhs, ConjRhs> {
|
||||
static void run(Index size, Scalar* mat, Index stride, const Scalar* vecX, const Scalar* vecY, const Scalar& alpha) {
|
||||
internal::conj_if<ConjRhs> cj;
|
||||
typedef Map<const Matrix<Scalar, Dynamic, 1> > OtherMap;
|
||||
typedef std::conditional_t<ConjLhs, typename OtherMap::ConjugateReturnType, const OtherMap&> ConjLhsType;
|
||||
for (Index i = 0; i < size; ++i) {
|
||||
Map<Matrix<Scalar, Dynamic, 1> >(mat + stride * i + (UpLo == Lower ? i : 0),
|
||||
(UpLo == Lower ? size - i : (i + 1))) +=
|
||||
(alpha * cj(vecY[i])) *
|
||||
ConjLhsType(OtherMap(vecX + (UpLo == Lower ? i : 0), UpLo == Lower ? size - i : (i + 1)));
|
||||
typedef typename internal::packet_traits<Scalar>::type Packet;
|
||||
const Index PacketSize = internal::unpacket_traits<Packet>::size;
|
||||
|
||||
internal::conj_if<ConjRhs> cjy;
|
||||
internal::conj_if<ConjLhs> cjx;
|
||||
internal::conj_helper<Packet, Packet, ConjLhs, false> pcj;
|
||||
|
||||
// Process 2 columns at a time to share vecX loads and reduce loop overhead.
|
||||
Index j = 0;
|
||||
for (; j + 1 < size; j += 2) {
|
||||
Scalar s0 = alpha * cjy(vecY[j]);
|
||||
Scalar s1 = alpha * cjy(vecY[j + 1]);
|
||||
Packet ps0 = internal::pset1<Packet>(s0);
|
||||
Packet ps1 = internal::pset1<Packet>(s1);
|
||||
|
||||
if (UpLo == Lower) {
|
||||
Scalar* EIGEN_RESTRICT col0 = mat + stride * j + j;
|
||||
Scalar* EIGEN_RESTRICT col1 = mat + stride * (j + 1) + (j + 1);
|
||||
|
||||
// Diagonal and cross-diagonal scalar elements
|
||||
col0[0] += s0 * cjx(vecX[j]);
|
||||
col0[1] += s0 * cjx(vecX[j + 1]);
|
||||
col1[0] += s1 * cjx(vecX[j + 1]);
|
||||
|
||||
// Shared vectorized loop for rows j+2..size-1
|
||||
Index len = size - j - 2;
|
||||
const Scalar* EIGEN_RESTRICT xp = vecX + j + 2;
|
||||
Scalar* EIGEN_RESTRICT d0 = col0 + 2;
|
||||
Scalar* EIGEN_RESTRICT d1 = col1 + 1;
|
||||
|
||||
Index k = 0;
|
||||
Index vectorizedEnd = (len / PacketSize) * PacketSize;
|
||||
for (; k < vectorizedEnd; k += PacketSize) {
|
||||
Packet xi = internal::ploadu<Packet>(xp + k);
|
||||
Packet m0 = internal::ploadu<Packet>(d0 + k);
|
||||
m0 = pcj.pmadd(xi, ps0, m0);
|
||||
internal::pstoreu(d0 + k, m0);
|
||||
Packet m1 = internal::ploadu<Packet>(d1 + k);
|
||||
m1 = pcj.pmadd(xi, ps1, m1);
|
||||
internal::pstoreu(d1 + k, m1);
|
||||
}
|
||||
for (; k < len; ++k) {
|
||||
Scalar cx = cjx(xp[k]);
|
||||
d0[k] += s0 * cx;
|
||||
d1[k] += s1 * cx;
|
||||
}
|
||||
} else {
|
||||
// UpLo == Upper
|
||||
Scalar* EIGEN_RESTRICT col0 = mat + stride * j;
|
||||
Scalar* EIGEN_RESTRICT col1 = mat + stride * (j + 1);
|
||||
|
||||
// Shared vectorized loop for rows 0..j-1
|
||||
const Scalar* EIGEN_RESTRICT xp = vecX;
|
||||
Index len = j;
|
||||
Index k = 0;
|
||||
Index vectorizedEnd = (len / PacketSize) * PacketSize;
|
||||
for (; k < vectorizedEnd; k += PacketSize) {
|
||||
Packet xi = internal::ploadu<Packet>(xp + k);
|
||||
Packet m0 = internal::ploadu<Packet>(col0 + k);
|
||||
Packet m1 = internal::ploadu<Packet>(col1 + k);
|
||||
m0 = pcj.pmadd(xi, ps0, m0);
|
||||
m1 = pcj.pmadd(xi, ps1, m1);
|
||||
internal::pstoreu(col0 + k, m0);
|
||||
internal::pstoreu(col1 + k, m1);
|
||||
}
|
||||
for (; k < len; ++k) {
|
||||
Scalar cx = cjx(xp[k]);
|
||||
col0[k] += s0 * cx;
|
||||
col1[k] += s1 * cx;
|
||||
}
|
||||
|
||||
// Diagonal and cross-diagonal scalar elements
|
||||
col0[j] += s0 * cjx(vecX[j]);
|
||||
col1[j] += s1 * cjx(vecX[j]);
|
||||
col1[j + 1] += s1 * cjx(vecX[j + 1]);
|
||||
}
|
||||
}
|
||||
|
||||
// Handle last column if size is odd
|
||||
if (j < size) {
|
||||
Scalar s = alpha * cjy(vecY[j]);
|
||||
Packet ps = internal::pset1<Packet>(s);
|
||||
Index start = UpLo == Lower ? j : 0;
|
||||
Index len = UpLo == Lower ? size - j : j + 1;
|
||||
Scalar* EIGEN_RESTRICT dst = mat + stride * j + start;
|
||||
const Scalar* EIGEN_RESTRICT xp = vecX + start;
|
||||
|
||||
Index k = 0;
|
||||
Index vectorizedEnd = (len / PacketSize) * PacketSize;
|
||||
for (; k < vectorizedEnd; k += PacketSize) {
|
||||
Packet xi = internal::ploadu<Packet>(xp + k);
|
||||
Packet di = internal::ploadu<Packet>(dst + k);
|
||||
di = pcj.pmadd(xi, ps, di);
|
||||
internal::pstoreu(dst + k, di);
|
||||
}
|
||||
for (; k < len; ++k) {
|
||||
dst[k] += s * cjx(xp[k]);
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
@@ -21,29 +21,168 @@ namespace internal {
|
||||
* It corresponds to the Level2 syr2 BLAS routine
|
||||
*/
|
||||
|
||||
template <typename Scalar, typename Index, typename UType, typename VType, int UpLo>
|
||||
template <typename Scalar, typename Index, int UpLo>
|
||||
struct selfadjoint_rank2_update_selector;
|
||||
|
||||
template <typename Scalar, typename Index, typename UType, typename VType>
|
||||
struct selfadjoint_rank2_update_selector<Scalar, Index, UType, VType, Lower> {
|
||||
static EIGEN_DEVICE_FUNC void run(Scalar* mat, Index stride, const UType& u, const VType& v, const Scalar& alpha) {
|
||||
const Index size = u.size();
|
||||
for (Index i = 0; i < size; ++i) {
|
||||
Map<Matrix<Scalar, Dynamic, 1>>(mat + stride * i + i, size - i) +=
|
||||
(numext::conj(alpha) * numext::conj(u.coeff(i))) * v.tail(size - i) +
|
||||
(alpha * numext::conj(v.coeff(i))) * u.tail(size - i);
|
||||
template <typename Scalar, typename Index>
|
||||
struct selfadjoint_rank2_update_selector<Scalar, Index, Lower> {
|
||||
static void run(Index size, Scalar* mat, Index stride, const Scalar* u, const Scalar* v, const Scalar& alpha) {
|
||||
typedef typename packet_traits<Scalar>::type Packet;
|
||||
const Index PacketSize = unpacket_traits<Packet>::size;
|
||||
const Scalar cAlpha = numext::conj(alpha);
|
||||
|
||||
// Process 2 columns at a time to share u/v loads and reduce loop overhead.
|
||||
Index j = 0;
|
||||
for (; j + 1 < size; j += 2) {
|
||||
// Scale factors: col[j:] += s0u * v[j:] + s0v * u[j:]
|
||||
Scalar s0u = cAlpha * numext::conj(u[j]);
|
||||
Scalar s0v = alpha * numext::conj(v[j]);
|
||||
Scalar s1u = cAlpha * numext::conj(u[j + 1]);
|
||||
Scalar s1v = alpha * numext::conj(v[j + 1]);
|
||||
|
||||
Packet ps0u = pset1<Packet>(s0u);
|
||||
Packet ps0v = pset1<Packet>(s0v);
|
||||
Packet ps1u = pset1<Packet>(s1u);
|
||||
Packet ps1v = pset1<Packet>(s1v);
|
||||
|
||||
Scalar* EIGEN_RESTRICT col0 = mat + stride * j + j;
|
||||
Scalar* EIGEN_RESTRICT col1 = mat + stride * (j + 1) + (j + 1);
|
||||
|
||||
// Diagonal and cross-diagonal scalar elements
|
||||
col0[0] += s0u * v[j] + s0v * u[j];
|
||||
col0[1] += s0u * v[j + 1] + s0v * u[j + 1];
|
||||
col1[0] += s1u * v[j + 1] + s1v * u[j + 1];
|
||||
|
||||
// Shared vectorized loop for rows j+2..size-1
|
||||
Index len = size - j - 2;
|
||||
const Scalar* EIGEN_RESTRICT up = u + j + 2;
|
||||
const Scalar* EIGEN_RESTRICT vp = v + j + 2;
|
||||
Scalar* EIGEN_RESTRICT d0 = col0 + 2;
|
||||
Scalar* EIGEN_RESTRICT d1 = col1 + 1;
|
||||
|
||||
Index k = 0;
|
||||
Index vectorizedEnd = (len / PacketSize) * PacketSize;
|
||||
for (; k < vectorizedEnd; k += PacketSize) {
|
||||
Packet ui = ploadu<Packet>(up + k);
|
||||
Packet vi = ploadu<Packet>(vp + k);
|
||||
Packet m0 = ploadu<Packet>(d0 + k);
|
||||
m0 = pmadd(vi, ps0u, m0);
|
||||
m0 = pmadd(ui, ps0v, m0);
|
||||
pstoreu(d0 + k, m0);
|
||||
Packet m1 = ploadu<Packet>(d1 + k);
|
||||
m1 = pmadd(vi, ps1u, m1);
|
||||
m1 = pmadd(ui, ps1v, m1);
|
||||
pstoreu(d1 + k, m1);
|
||||
}
|
||||
for (; k < len; ++k) {
|
||||
d0[k] += s0u * vp[k] + s0v * up[k];
|
||||
d1[k] += s1u * vp[k] + s1v * up[k];
|
||||
}
|
||||
}
|
||||
|
||||
// Handle last column if size is odd
|
||||
if (j < size) {
|
||||
Scalar su = cAlpha * numext::conj(u[j]);
|
||||
Scalar sv = alpha * numext::conj(v[j]);
|
||||
Packet psu = pset1<Packet>(su);
|
||||
Packet psv = pset1<Packet>(sv);
|
||||
|
||||
Scalar* EIGEN_RESTRICT dst = mat + stride * j + j;
|
||||
const Scalar* EIGEN_RESTRICT up = u + j;
|
||||
const Scalar* EIGEN_RESTRICT vp = v + j;
|
||||
Index len = size - j;
|
||||
|
||||
Index k = 0;
|
||||
Index vectorizedEnd = (len / PacketSize) * PacketSize;
|
||||
for (; k < vectorizedEnd; k += PacketSize) {
|
||||
Packet ui = ploadu<Packet>(up + k);
|
||||
Packet vi = ploadu<Packet>(vp + k);
|
||||
Packet di = ploadu<Packet>(dst + k);
|
||||
di = pmadd(vi, psu, di);
|
||||
di = pmadd(ui, psv, di);
|
||||
pstoreu(dst + k, di);
|
||||
}
|
||||
for (; k < len; ++k) {
|
||||
dst[k] += su * vp[k] + sv * up[k];
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Scalar, typename Index, typename UType, typename VType>
|
||||
struct selfadjoint_rank2_update_selector<Scalar, Index, UType, VType, Upper> {
|
||||
static void run(Scalar* mat, Index stride, const UType& u, const VType& v, const Scalar& alpha) {
|
||||
const Index size = u.size();
|
||||
for (Index i = 0; i < size; ++i)
|
||||
Map<Matrix<Scalar, Dynamic, 1>>(mat + stride * i, i + 1) +=
|
||||
(numext::conj(alpha) * numext::conj(u.coeff(i))) * v.head(i + 1) +
|
||||
(alpha * numext::conj(v.coeff(i))) * u.head(i + 1);
|
||||
template <typename Scalar, typename Index>
|
||||
struct selfadjoint_rank2_update_selector<Scalar, Index, Upper> {
|
||||
static void run(Index size, Scalar* mat, Index stride, const Scalar* u, const Scalar* v, const Scalar& alpha) {
|
||||
typedef typename packet_traits<Scalar>::type Packet;
|
||||
const Index PacketSize = unpacket_traits<Packet>::size;
|
||||
const Scalar cAlpha = numext::conj(alpha);
|
||||
|
||||
// Process 2 columns at a time to share u/v loads and reduce loop overhead.
|
||||
Index j = 0;
|
||||
for (; j + 1 < size; j += 2) {
|
||||
Scalar s0u = cAlpha * numext::conj(u[j]);
|
||||
Scalar s0v = alpha * numext::conj(v[j]);
|
||||
Scalar s1u = cAlpha * numext::conj(u[j + 1]);
|
||||
Scalar s1v = alpha * numext::conj(v[j + 1]);
|
||||
|
||||
Packet ps0u = pset1<Packet>(s0u);
|
||||
Packet ps0v = pset1<Packet>(s0v);
|
||||
Packet ps1u = pset1<Packet>(s1u);
|
||||
Packet ps1v = pset1<Packet>(s1v);
|
||||
|
||||
Scalar* EIGEN_RESTRICT col0 = mat + stride * j;
|
||||
Scalar* EIGEN_RESTRICT col1 = mat + stride * (j + 1);
|
||||
|
||||
// Shared vectorized loop for rows 0..j-1
|
||||
Index len = j;
|
||||
Index k = 0;
|
||||
Index vectorizedEnd = (len / PacketSize) * PacketSize;
|
||||
for (; k < vectorizedEnd; k += PacketSize) {
|
||||
Packet ui = ploadu<Packet>(u + k);
|
||||
Packet vi = ploadu<Packet>(v + k);
|
||||
Packet m0 = ploadu<Packet>(col0 + k);
|
||||
m0 = pmadd(vi, ps0u, m0);
|
||||
m0 = pmadd(ui, ps0v, m0);
|
||||
pstoreu(col0 + k, m0);
|
||||
Packet m1 = ploadu<Packet>(col1 + k);
|
||||
m1 = pmadd(vi, ps1u, m1);
|
||||
m1 = pmadd(ui, ps1v, m1);
|
||||
pstoreu(col1 + k, m1);
|
||||
}
|
||||
for (; k < len; ++k) {
|
||||
col0[k] += s0u * v[k] + s0v * u[k];
|
||||
col1[k] += s1u * v[k] + s1v * u[k];
|
||||
}
|
||||
|
||||
// Diagonal and cross-diagonal scalar elements
|
||||
col0[j] += s0u * v[j] + s0v * u[j];
|
||||
col1[j] += s1u * v[j] + s1v * u[j];
|
||||
col1[j + 1] += s1u * v[j + 1] + s1v * u[j + 1];
|
||||
}
|
||||
|
||||
// Handle last column if size is odd
|
||||
if (j < size) {
|
||||
Scalar su = cAlpha * numext::conj(u[j]);
|
||||
Scalar sv = alpha * numext::conj(v[j]);
|
||||
Packet psu = pset1<Packet>(su);
|
||||
Packet psv = pset1<Packet>(sv);
|
||||
|
||||
Scalar* EIGEN_RESTRICT dst = mat + stride * j;
|
||||
Index len = j + 1;
|
||||
|
||||
Index k = 0;
|
||||
Index vectorizedEnd = (len / PacketSize) * PacketSize;
|
||||
for (; k < vectorizedEnd; k += PacketSize) {
|
||||
Packet ui = ploadu<Packet>(u + k);
|
||||
Packet vi = ploadu<Packet>(v + k);
|
||||
Packet di = ploadu<Packet>(dst + k);
|
||||
di = pmadd(vi, psu, di);
|
||||
di = pmadd(ui, psv, di);
|
||||
pstoreu(dst + k, di);
|
||||
}
|
||||
for (; k < len; ++k) {
|
||||
dst[k] += su * v[k] + sv * u[k];
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
@@ -69,23 +208,47 @@ EIGEN_DEVICE_FUNC SelfAdjointView<MatrixType, UpLo>& SelfAdjointView<MatrixType,
|
||||
|
||||
// If MatrixType is row major, then we use the routine for lower triangular in the upper triangular case and
|
||||
// vice versa, and take the complex conjugate of all coefficients and vector entries.
|
||||
enum {
|
||||
IsRowMajor = (internal::traits<MatrixType>::Flags & RowMajorBit) ? 1 : 0,
|
||||
// Only need to conjugate if complex and the condition triggers
|
||||
NeedConjU = (int(IsRowMajor) ^ int(UBlasTraits::NeedToConjugate)) && NumTraits<Scalar>::IsComplex,
|
||||
NeedConjV = (int(IsRowMajor) ^ int(VBlasTraits::NeedToConjugate)) && NumTraits<Scalar>::IsComplex,
|
||||
UseUDirectly = ActualUType_::InnerStrideAtCompileTime == 1 && !NeedConjU,
|
||||
UseVDirectly = ActualVType_::InnerStrideAtCompileTime == 1 && !NeedConjV
|
||||
};
|
||||
|
||||
enum { IsRowMajor = (internal::traits<MatrixType>::Flags & RowMajorBit) ? 1 : 0 };
|
||||
Scalar actualAlpha = alpha * UBlasTraits::extractScalarFactor(u.derived()) *
|
||||
numext::conj(VBlasTraits::extractScalarFactor(v.derived()));
|
||||
if (IsRowMajor) actualAlpha = numext::conj(actualAlpha);
|
||||
|
||||
typedef internal::remove_all_t<
|
||||
typename internal::conj_expr_if<int(IsRowMajor) ^ int(UBlasTraits::NeedToConjugate), ActualUType_>::type>
|
||||
UType;
|
||||
typedef internal::remove_all_t<
|
||||
typename internal::conj_expr_if<int(IsRowMajor) ^ int(VBlasTraits::NeedToConjugate), ActualVType_>::type>
|
||||
VType;
|
||||
internal::selfadjoint_rank2_update_selector<Scalar, Index, UType, VType,
|
||||
(IsRowMajor ? int(UpLo == Upper ? Lower : Upper)
|
||||
: UpLo)>::run(_expression().const_cast_derived().data(),
|
||||
_expression().outerStride(), UType(actualU),
|
||||
VType(actualV), actualAlpha);
|
||||
const Index size = u.size();
|
||||
|
||||
// Copy u to contiguous buffer, applying conjugation if needed
|
||||
internal::gemv_static_vector_if<Scalar, DerivedU::SizeAtCompileTime, DerivedU::MaxSizeAtCompileTime, !UseUDirectly>
|
||||
static_u;
|
||||
ei_declare_aligned_stack_constructed_variable(Scalar, uPtr, size,
|
||||
(UseUDirectly ? const_cast<Scalar*>(actualU.data()) : static_u.data()));
|
||||
if (!UseUDirectly) {
|
||||
if (NeedConjU)
|
||||
Map<typename ActualUType_::PlainObject>(uPtr, size) = actualU.conjugate();
|
||||
else
|
||||
Map<typename ActualUType_::PlainObject>(uPtr, size) = actualU;
|
||||
}
|
||||
|
||||
// Copy v to contiguous buffer, applying conjugation if needed
|
||||
internal::gemv_static_vector_if<Scalar, DerivedV::SizeAtCompileTime, DerivedV::MaxSizeAtCompileTime, !UseVDirectly>
|
||||
static_v;
|
||||
ei_declare_aligned_stack_constructed_variable(Scalar, vPtr, size,
|
||||
(UseVDirectly ? const_cast<Scalar*>(actualV.data()) : static_v.data()));
|
||||
if (!UseVDirectly) {
|
||||
if (NeedConjV)
|
||||
Map<typename ActualVType_::PlainObject>(vPtr, size) = actualV.conjugate();
|
||||
else
|
||||
Map<typename ActualVType_::PlainObject>(vPtr, size) = actualV;
|
||||
}
|
||||
|
||||
internal::selfadjoint_rank2_update_selector<Scalar, Index, (IsRowMajor ? int(UpLo == Upper ? Lower : Upper) : UpLo)>::
|
||||
run(size, _expression().const_cast_derived().data(), _expression().outerStride(), uPtr, vPtr, actualAlpha);
|
||||
|
||||
return *this;
|
||||
}
|
||||
|
||||
@@ -43,43 +43,102 @@ EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index, Mode, LhsScalar,
|
||||
Index rows = IsLower ? _rows : (std::min)(_rows, _cols);
|
||||
Index cols = IsLower ? (std::min)(_rows, _cols) : _cols;
|
||||
|
||||
typedef Map<const Matrix<LhsScalar, Dynamic, Dynamic, ColMajor>, 0, OuterStride<> > LhsMap;
|
||||
const LhsMap lhs(lhs_, rows, cols, OuterStride<>(lhsStride));
|
||||
typename conj_expr_if<ConjLhs, LhsMap>::type cjLhs(lhs);
|
||||
|
||||
typedef Map<const Matrix<RhsScalar, Dynamic, 1>, 0, InnerStride<> > RhsMap;
|
||||
const RhsMap rhs(rhs_, cols, InnerStride<>(rhsIncr));
|
||||
typename conj_expr_if<ConjRhs, RhsMap>::type cjRhs(rhs);
|
||||
|
||||
typedef Map<Matrix<ResScalar, Dynamic, 1> > ResMap;
|
||||
ResMap res(res_, rows);
|
||||
|
||||
typedef const_blas_data_mapper<LhsScalar, Index, ColMajor> LhsMapper;
|
||||
typedef const_blas_data_mapper<RhsScalar, Index, RowMajor> RhsMapper;
|
||||
|
||||
conj_if<ConjLhs> cjl;
|
||||
conj_if<ConjRhs> cjr;
|
||||
|
||||
for (Index pi = 0; pi < size; pi += PanelWidth) {
|
||||
Index actualPanelWidth = (std::min)(PanelWidth, size - pi);
|
||||
for (Index k = 0; k < actualPanelWidth; ++k) {
|
||||
Index i = pi + k;
|
||||
Index s = IsLower ? ((HasUnitDiag || HasZeroDiag) ? i + 1 : i) : pi;
|
||||
Index r = IsLower ? actualPanelWidth - k : k + 1;
|
||||
if ((!(HasUnitDiag || HasZeroDiag)) || (--r) > 0)
|
||||
res.segment(s, r) += (alpha * cjRhs.coeff(i)) * cjLhs.col(i).segment(s, r);
|
||||
if (HasUnitDiag) res.coeffRef(i) += alpha * cjRhs.coeff(i);
|
||||
|
||||
// Process the triangular panel using raw pointer operations with 2-column batching
|
||||
// to eliminate expression template overhead and share result loads/stores.
|
||||
if (IsLower) {
|
||||
Index k = 0;
|
||||
for (; k + 1 < actualPanelWidth; k += 2) {
|
||||
Index i0 = pi + k;
|
||||
Index i1 = i0 + 1;
|
||||
ResScalar s0 = alpha * cjr(rhs_[i0 * rhsIncr]);
|
||||
ResScalar s1 = alpha * cjr(rhs_[i1 * rhsIncr]);
|
||||
const LhsScalar* EIGEN_RESTRICT c0 = lhs_ + i0 * lhsStride;
|
||||
const LhsScalar* EIGEN_RESTRICT c1 = lhs_ + i1 * lhsStride;
|
||||
|
||||
// Diagonal of column 0
|
||||
if (!(HasUnitDiag || HasZeroDiag)) res_[i0] += s0 * cjl(c0[i0]);
|
||||
// Row i1: contribution from column 0 + diagonal of column 1
|
||||
{
|
||||
ResScalar r1 = s0 * cjl(c0[i1]);
|
||||
if (!(HasUnitDiag || HasZeroDiag)) r1 += s1 * cjl(c1[i1]);
|
||||
res_[i1] += r1;
|
||||
}
|
||||
// Shared rows where both columns contribute
|
||||
Index panelEnd = pi + actualPanelWidth;
|
||||
for (Index j = i1 + 1; j < panelEnd; ++j) res_[j] += s0 * cjl(c0[j]) + s1 * cjl(c1[j]);
|
||||
|
||||
if (HasUnitDiag) {
|
||||
res_[i0] += s0;
|
||||
res_[i1] += s1;
|
||||
}
|
||||
}
|
||||
if (k < actualPanelWidth) {
|
||||
Index i = pi + k;
|
||||
ResScalar s = alpha * cjr(rhs_[i * rhsIncr]);
|
||||
const LhsScalar* EIGEN_RESTRICT c = lhs_ + i * lhsStride;
|
||||
if (!(HasUnitDiag || HasZeroDiag)) res_[i] += s * cjl(c[i]);
|
||||
if (HasUnitDiag) res_[i] += s;
|
||||
}
|
||||
} else {
|
||||
// Upper triangular: process 2 columns at a time
|
||||
Index k = 0;
|
||||
for (; k + 1 < actualPanelWidth; k += 2) {
|
||||
Index i0 = pi + k;
|
||||
Index i1 = i0 + 1;
|
||||
ResScalar s0 = alpha * cjr(rhs_[i0 * rhsIncr]);
|
||||
ResScalar s1 = alpha * cjr(rhs_[i1 * rhsIncr]);
|
||||
const LhsScalar* EIGEN_RESTRICT c0 = lhs_ + i0 * lhsStride;
|
||||
const LhsScalar* EIGEN_RESTRICT c1 = lhs_ + i1 * lhsStride;
|
||||
|
||||
// Shared rows before the diagonal block
|
||||
for (Index j = pi; j < i0; ++j) res_[j] += s0 * cjl(c0[j]) + s1 * cjl(c1[j]);
|
||||
|
||||
// Row i0: diagonal of col0 + contribution from col1
|
||||
{
|
||||
ResScalar r0 = s1 * cjl(c1[i0]);
|
||||
if (!(HasUnitDiag || HasZeroDiag)) r0 += s0 * cjl(c0[i0]);
|
||||
res_[i0] += r0;
|
||||
}
|
||||
// Diagonal of column 1
|
||||
if (!(HasUnitDiag || HasZeroDiag)) res_[i1] += s1 * cjl(c1[i1]);
|
||||
|
||||
if (HasUnitDiag) {
|
||||
res_[i0] += s0;
|
||||
res_[i1] += s1;
|
||||
}
|
||||
}
|
||||
if (k < actualPanelWidth) {
|
||||
Index i = pi + k;
|
||||
ResScalar s = alpha * cjr(rhs_[i * rhsIncr]);
|
||||
const LhsScalar* EIGEN_RESTRICT c = lhs_ + i * lhsStride;
|
||||
for (Index j = pi; j < i; ++j) res_[j] += s * cjl(c[j]);
|
||||
if (!(HasUnitDiag || HasZeroDiag)) res_[i] += s * cjl(c[i]);
|
||||
if (HasUnitDiag) res_[i] += s;
|
||||
}
|
||||
}
|
||||
|
||||
// Rectangular part: delegate to optimized GEMV
|
||||
Index r = IsLower ? rows - pi - actualPanelWidth : pi;
|
||||
if (r > 0) {
|
||||
Index s = IsLower ? pi + actualPanelWidth : 0;
|
||||
general_matrix_vector_product<Index, LhsScalar, LhsMapper, ColMajor, ConjLhs, RhsScalar, RhsMapper, ConjRhs,
|
||||
BuiltIn>::run(r, actualPanelWidth, LhsMapper(&lhs.coeffRef(s, pi), lhsStride),
|
||||
RhsMapper(&rhs.coeffRef(pi), rhsIncr), &res.coeffRef(s), resIncr,
|
||||
alpha);
|
||||
BuiltIn>::run(r, actualPanelWidth, LhsMapper(&lhs_[pi * lhsStride + s], lhsStride),
|
||||
RhsMapper(&rhs_[pi * rhsIncr], rhsIncr), &res_[s], resIncr, alpha);
|
||||
}
|
||||
}
|
||||
if ((!IsLower) && cols > size) {
|
||||
general_matrix_vector_product<Index, LhsScalar, LhsMapper, ColMajor, ConjLhs, RhsScalar, RhsMapper, ConjRhs>::run(
|
||||
rows, cols - size, LhsMapper(&lhs.coeffRef(0, size), lhsStride), RhsMapper(&rhs.coeffRef(size), rhsIncr), res_,
|
||||
resIncr, alpha);
|
||||
rows, cols - size, LhsMapper(&lhs_[size * lhsStride], lhsStride), RhsMapper(&rhs_[size * rhsIncr], rhsIncr),
|
||||
res_, resIncr, alpha);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -105,43 +164,48 @@ EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index, Mode, LhsScalar,
|
||||
Index rows = IsLower ? _rows : diagSize;
|
||||
Index cols = IsLower ? diagSize : _cols;
|
||||
|
||||
typedef Map<const Matrix<LhsScalar, Dynamic, Dynamic, RowMajor>, 0, OuterStride<> > LhsMap;
|
||||
const LhsMap lhs(lhs_, rows, cols, OuterStride<>(lhsStride));
|
||||
typename conj_expr_if<ConjLhs, LhsMap>::type cjLhs(lhs);
|
||||
|
||||
typedef Map<const Matrix<RhsScalar, Dynamic, 1> > RhsMap;
|
||||
const RhsMap rhs(rhs_, cols);
|
||||
typename conj_expr_if<ConjRhs, RhsMap>::type cjRhs(rhs);
|
||||
|
||||
typedef Map<Matrix<ResScalar, Dynamic, 1>, 0, InnerStride<> > ResMap;
|
||||
ResMap res(res_, rows, InnerStride<>(resIncr));
|
||||
|
||||
typedef const_blas_data_mapper<LhsScalar, Index, RowMajor> LhsMapper;
|
||||
typedef const_blas_data_mapper<RhsScalar, Index, RowMajor> RhsMapper;
|
||||
|
||||
conj_if<ConjLhs> cjl;
|
||||
conj_if<ConjRhs> cjr;
|
||||
|
||||
for (Index pi = 0; pi < diagSize; pi += PanelWidth) {
|
||||
Index actualPanelWidth = (std::min)(PanelWidth, diagSize - pi);
|
||||
|
||||
// Process the triangular panel using raw dot products to eliminate
|
||||
// the cwiseProduct().sum() expression template overhead.
|
||||
for (Index k = 0; k < actualPanelWidth; ++k) {
|
||||
Index i = pi + k;
|
||||
Index s = IsLower ? pi : ((HasUnitDiag || HasZeroDiag) ? i + 1 : i);
|
||||
Index r = IsLower ? k + 1 : actualPanelWidth - k;
|
||||
if ((!(HasUnitDiag || HasZeroDiag)) || (--r) > 0)
|
||||
res.coeffRef(i) += alpha * (cjLhs.row(i).segment(s, r).cwiseProduct(cjRhs.segment(s, r).transpose())).sum();
|
||||
if (HasUnitDiag) res.coeffRef(i) += alpha * cjRhs.coeff(i);
|
||||
const LhsScalar* EIGEN_RESTRICT row_i = lhs_ + i * lhsStride;
|
||||
ResScalar dot = ResScalar(0);
|
||||
|
||||
if (IsLower) {
|
||||
Index s = pi;
|
||||
Index len = (HasUnitDiag || HasZeroDiag) ? k : k + 1;
|
||||
for (Index j = 0; j < len; ++j) dot += cjl(row_i[s + j]) * cjr(rhs_[s + j]);
|
||||
} else {
|
||||
Index s = (HasUnitDiag || HasZeroDiag) ? i + 1 : i;
|
||||
Index len = pi + actualPanelWidth - s;
|
||||
for (Index j = 0; j < len; ++j) dot += cjl(row_i[s + j]) * cjr(rhs_[s + j]);
|
||||
}
|
||||
res_[i * resIncr] += alpha * dot;
|
||||
if (HasUnitDiag) res_[i * resIncr] += alpha * cjr(rhs_[i]);
|
||||
}
|
||||
|
||||
// Rectangular part: delegate to optimized GEMV
|
||||
Index r = IsLower ? pi : cols - pi - actualPanelWidth;
|
||||
if (r > 0) {
|
||||
Index s = IsLower ? 0 : pi + actualPanelWidth;
|
||||
general_matrix_vector_product<Index, LhsScalar, LhsMapper, RowMajor, ConjLhs, RhsScalar, RhsMapper, ConjRhs,
|
||||
BuiltIn>::run(actualPanelWidth, r, LhsMapper(&lhs.coeffRef(pi, s), lhsStride),
|
||||
RhsMapper(&rhs.coeffRef(s), rhsIncr), &res.coeffRef(pi), resIncr,
|
||||
alpha);
|
||||
BuiltIn>::run(actualPanelWidth, r, LhsMapper(&lhs_[pi * lhsStride + s], lhsStride),
|
||||
RhsMapper(&rhs_[s], rhsIncr), &res_[pi * resIncr], resIncr, alpha);
|
||||
}
|
||||
}
|
||||
if (IsLower && rows > diagSize) {
|
||||
general_matrix_vector_product<Index, LhsScalar, LhsMapper, RowMajor, ConjLhs, RhsScalar, RhsMapper, ConjRhs>::run(
|
||||
rows - diagSize, cols, LhsMapper(&lhs.coeffRef(diagSize, 0), lhsStride), RhsMapper(&rhs.coeffRef(0), rhsIncr),
|
||||
&res.coeffRef(diagSize), resIncr, alpha);
|
||||
rows - diagSize, cols, LhsMapper(&lhs_[diagSize * lhsStride], lhsStride), RhsMapper(rhs_, rhsIncr),
|
||||
&res_[diagSize * resIncr], resIncr, alpha);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -14,6 +14,10 @@ eigen_add_benchmark(bench_map bench_map.cpp)
|
||||
eigen_add_benchmark(bench_diagonal bench_diagonal.cpp)
|
||||
eigen_add_benchmark(bench_triangular_product bench_triangular_product.cpp)
|
||||
eigen_add_benchmark(bench_selfadjoint_product bench_selfadjoint_product.cpp)
|
||||
eigen_add_benchmark(bench_symv bench_symv.cpp)
|
||||
eigen_add_benchmark(bench_trmv bench_trmv.cpp)
|
||||
eigen_add_benchmark(bench_syr bench_syr.cpp)
|
||||
eigen_add_benchmark(bench_syr2 bench_syr2.cpp)
|
||||
eigen_add_benchmark(bench_construction bench_construction.cpp)
|
||||
eigen_add_benchmark(bench_fixed_size bench_fixed_size.cpp)
|
||||
eigen_add_benchmark(bench_fixed_size_double bench_fixed_size.cpp DEFINITIONS SCALAR=double)
|
||||
|
||||
62
benchmarks/Core/bench_symv.cpp
Normal file
62
benchmarks/Core/bench_symv.cpp
Normal file
@@ -0,0 +1,62 @@
|
||||
// Benchmarks for selfadjoint matrix-vector product (SYMV/HEMV).
|
||||
//
|
||||
// Tests y += selfadjointView(A) * x for various sizes and scalar types.
|
||||
// Exercises SelfadjointMatrixVector.h kernel.
|
||||
|
||||
#include <benchmark/benchmark.h>
|
||||
#include <Eigen/Core>
|
||||
|
||||
using namespace Eigen;
|
||||
|
||||
template <typename Scalar>
|
||||
double symvFlops(Index n) {
|
||||
// SYMV uses n^2 multiply-adds (exploiting symmetry)
|
||||
return (NumTraits<Scalar>::IsComplex ? 8.0 : 2.0) * n * n;
|
||||
}
|
||||
|
||||
// y += selfadjointView<Lower>(A) * x
|
||||
template <typename Scalar>
|
||||
static void BM_SYMV_Lower(benchmark::State& state) {
|
||||
const Index n = state.range(0);
|
||||
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||
using Vec = Matrix<Scalar, Dynamic, 1>;
|
||||
Mat A = Mat::Random(n, n);
|
||||
A = (A + A.transpose().eval()) / Scalar(2);
|
||||
Vec x = Vec::Random(n);
|
||||
Vec y = Vec::Random(n);
|
||||
for (auto _ : state) {
|
||||
y.noalias() += A.template selfadjointView<Lower>() * x;
|
||||
benchmark::DoNotOptimize(y.data());
|
||||
benchmark::ClobberMemory();
|
||||
}
|
||||
state.counters["GFLOPS"] = benchmark::Counter(symvFlops<Scalar>(n), benchmark::Counter::kIsIterationInvariantRate,
|
||||
benchmark::Counter::kIs1000);
|
||||
}
|
||||
|
||||
// y += selfadjointView<Upper>(A) * x
|
||||
template <typename Scalar>
|
||||
static void BM_SYMV_Upper(benchmark::State& state) {
|
||||
const Index n = state.range(0);
|
||||
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||
using Vec = Matrix<Scalar, Dynamic, 1>;
|
||||
Mat A = Mat::Random(n, n);
|
||||
A = (A + A.transpose().eval()) / Scalar(2);
|
||||
Vec x = Vec::Random(n);
|
||||
Vec y = Vec::Random(n);
|
||||
for (auto _ : state) {
|
||||
y.noalias() += A.template selfadjointView<Upper>() * x;
|
||||
benchmark::DoNotOptimize(y.data());
|
||||
benchmark::ClobberMemory();
|
||||
}
|
||||
state.counters["GFLOPS"] = benchmark::Counter(symvFlops<Scalar>(n), benchmark::Counter::kIsIterationInvariantRate,
|
||||
benchmark::Counter::kIs1000);
|
||||
}
|
||||
|
||||
static void SymvSizes(::benchmark::Benchmark* b) {
|
||||
for (int n : {8, 16, 32, 64, 128, 256, 512, 1024, 2048}) b->Arg(n);
|
||||
}
|
||||
|
||||
BENCHMARK(BM_SYMV_Lower<float>)->Apply(SymvSizes)->Name("SYMV_Lower_float");
|
||||
BENCHMARK(BM_SYMV_Lower<double>)->Apply(SymvSizes)->Name("SYMV_Lower_double");
|
||||
BENCHMARK(BM_SYMV_Upper<float>)->Apply(SymvSizes)->Name("SYMV_Upper_float");
|
||||
BENCHMARK(BM_SYMV_Upper<double>)->Apply(SymvSizes)->Name("SYMV_Upper_double");
|
||||
59
benchmarks/Core/bench_syr.cpp
Normal file
59
benchmarks/Core/bench_syr.cpp
Normal file
@@ -0,0 +1,59 @@
|
||||
// Benchmarks for symmetric rank-1 update (SYR).
|
||||
//
|
||||
// Tests C.selfadjointView<Lower>().rankUpdate(v, alpha) which computes
|
||||
// C += alpha * v * v^T, updating only the lower (or upper) triangle.
|
||||
// Exercises SelfadjointProduct.h / selfadjoint_rank1_update.
|
||||
|
||||
#include <benchmark/benchmark.h>
|
||||
#include <Eigen/Core>
|
||||
|
||||
using namespace Eigen;
|
||||
|
||||
template <typename Scalar>
|
||||
double syrFlops(Index n) {
|
||||
// SYR: n*(n+1)/2 multiply-adds ~ n^2
|
||||
return (NumTraits<Scalar>::IsComplex ? 8.0 : 2.0) * n * (n + 1) / 2;
|
||||
}
|
||||
|
||||
template <typename Scalar>
|
||||
static void BM_SYR_Lower(benchmark::State& state) {
|
||||
const Index n = state.range(0);
|
||||
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||
using Vec = Matrix<Scalar, Dynamic, 1>;
|
||||
Vec v = Vec::Random(n);
|
||||
Mat C = Mat::Zero(n, n);
|
||||
Scalar alpha(1);
|
||||
for (auto _ : state) {
|
||||
C.template selfadjointView<Lower>().rankUpdate(v, alpha);
|
||||
benchmark::DoNotOptimize(C.data());
|
||||
benchmark::ClobberMemory();
|
||||
}
|
||||
state.counters["GFLOPS"] = benchmark::Counter(syrFlops<Scalar>(n), benchmark::Counter::kIsIterationInvariantRate,
|
||||
benchmark::Counter::kIs1000);
|
||||
}
|
||||
|
||||
template <typename Scalar>
|
||||
static void BM_SYR_Upper(benchmark::State& state) {
|
||||
const Index n = state.range(0);
|
||||
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||
using Vec = Matrix<Scalar, Dynamic, 1>;
|
||||
Vec v = Vec::Random(n);
|
||||
Mat C = Mat::Zero(n, n);
|
||||
Scalar alpha(1);
|
||||
for (auto _ : state) {
|
||||
C.template selfadjointView<Upper>().rankUpdate(v, alpha);
|
||||
benchmark::DoNotOptimize(C.data());
|
||||
benchmark::ClobberMemory();
|
||||
}
|
||||
state.counters["GFLOPS"] = benchmark::Counter(syrFlops<Scalar>(n), benchmark::Counter::kIsIterationInvariantRate,
|
||||
benchmark::Counter::kIs1000);
|
||||
}
|
||||
|
||||
static void SyrSizes(::benchmark::Benchmark* b) {
|
||||
for (int n : {8, 16, 32, 64, 128, 256, 512, 1024, 2048}) b->Arg(n);
|
||||
}
|
||||
|
||||
BENCHMARK(BM_SYR_Lower<float>)->Apply(SyrSizes)->Name("SYR_Lower_float");
|
||||
BENCHMARK(BM_SYR_Lower<double>)->Apply(SyrSizes)->Name("SYR_Lower_double");
|
||||
BENCHMARK(BM_SYR_Upper<float>)->Apply(SyrSizes)->Name("SYR_Upper_float");
|
||||
BENCHMARK(BM_SYR_Upper<double>)->Apply(SyrSizes)->Name("SYR_Upper_double");
|
||||
61
benchmarks/Core/bench_syr2.cpp
Normal file
61
benchmarks/Core/bench_syr2.cpp
Normal file
@@ -0,0 +1,61 @@
|
||||
// Benchmarks for symmetric rank-2 update (SYR2).
|
||||
//
|
||||
// Tests C.selfadjointView<Lower>().rankUpdate(u, v, alpha) which computes
|
||||
// C += alpha * u * v^T + conj(alpha) * v * u^T.
|
||||
// Exercises SelfadjointRank2Update.h.
|
||||
|
||||
#include <benchmark/benchmark.h>
|
||||
#include <Eigen/Core>
|
||||
|
||||
using namespace Eigen;
|
||||
|
||||
template <typename Scalar>
|
||||
double syr2Flops(Index n) {
|
||||
// SYR2: 2 * n*(n+1)/2 multiply-adds ~ 2*n^2
|
||||
return (NumTraits<Scalar>::IsComplex ? 8.0 : 2.0) * 2 * n * (n + 1) / 2;
|
||||
}
|
||||
|
||||
template <typename Scalar>
|
||||
static void BM_SYR2_Lower(benchmark::State& state) {
|
||||
const Index n = state.range(0);
|
||||
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||
using Vec = Matrix<Scalar, Dynamic, 1>;
|
||||
Vec u = Vec::Random(n);
|
||||
Vec v = Vec::Random(n);
|
||||
Mat C = Mat::Zero(n, n);
|
||||
Scalar alpha(1);
|
||||
for (auto _ : state) {
|
||||
C.template selfadjointView<Lower>().rankUpdate(u, v, alpha);
|
||||
benchmark::DoNotOptimize(C.data());
|
||||
benchmark::ClobberMemory();
|
||||
}
|
||||
state.counters["GFLOPS"] = benchmark::Counter(syr2Flops<Scalar>(n), benchmark::Counter::kIsIterationInvariantRate,
|
||||
benchmark::Counter::kIs1000);
|
||||
}
|
||||
|
||||
template <typename Scalar>
|
||||
static void BM_SYR2_Upper(benchmark::State& state) {
|
||||
const Index n = state.range(0);
|
||||
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||
using Vec = Matrix<Scalar, Dynamic, 1>;
|
||||
Vec u = Vec::Random(n);
|
||||
Vec v = Vec::Random(n);
|
||||
Mat C = Mat::Zero(n, n);
|
||||
Scalar alpha(1);
|
||||
for (auto _ : state) {
|
||||
C.template selfadjointView<Upper>().rankUpdate(u, v, alpha);
|
||||
benchmark::DoNotOptimize(C.data());
|
||||
benchmark::ClobberMemory();
|
||||
}
|
||||
state.counters["GFLOPS"] = benchmark::Counter(syr2Flops<Scalar>(n), benchmark::Counter::kIsIterationInvariantRate,
|
||||
benchmark::Counter::kIs1000);
|
||||
}
|
||||
|
||||
static void Syr2Sizes(::benchmark::Benchmark* b) {
|
||||
for (int n : {8, 16, 32, 64, 128, 256, 512, 1024, 2048}) b->Arg(n);
|
||||
}
|
||||
|
||||
BENCHMARK(BM_SYR2_Lower<float>)->Apply(Syr2Sizes)->Name("SYR2_Lower_float");
|
||||
BENCHMARK(BM_SYR2_Lower<double>)->Apply(Syr2Sizes)->Name("SYR2_Lower_double");
|
||||
BENCHMARK(BM_SYR2_Upper<float>)->Apply(Syr2Sizes)->Name("SYR2_Upper_float");
|
||||
BENCHMARK(BM_SYR2_Upper<double>)->Apply(Syr2Sizes)->Name("SYR2_Upper_double");
|
||||
46
benchmarks/Core/bench_trmv.cpp
Normal file
46
benchmarks/Core/bench_trmv.cpp
Normal file
@@ -0,0 +1,46 @@
|
||||
// Benchmarks for triangular matrix-vector product (TRMV).
|
||||
//
|
||||
// Tests y += triangularView(A) * x for various modes and sizes.
|
||||
// Exercises TriangularMatrixVector.h kernel.
|
||||
|
||||
#include <benchmark/benchmark.h>
|
||||
#include <Eigen/Core>
|
||||
|
||||
using namespace Eigen;
|
||||
|
||||
template <typename Scalar>
|
||||
double trmvFlops(Index n) {
|
||||
// TRMV: ~n^2 multiply-adds
|
||||
return (NumTraits<Scalar>::IsComplex ? 8.0 : 2.0) * n * n;
|
||||
}
|
||||
|
||||
// y = triangularView<Mode>(A) * x
|
||||
template <typename Scalar, unsigned int Mode>
|
||||
static void BM_TRMV(benchmark::State& state) {
|
||||
const Index n = state.range(0);
|
||||
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||
using Vec = Matrix<Scalar, Dynamic, 1>;
|
||||
Mat A = Mat::Random(n, n);
|
||||
Vec x = Vec::Random(n);
|
||||
Vec y(n);
|
||||
for (auto _ : state) {
|
||||
y.noalias() = A.template triangularView<Mode>() * x;
|
||||
benchmark::DoNotOptimize(y.data());
|
||||
benchmark::ClobberMemory();
|
||||
}
|
||||
state.counters["GFLOPS"] = benchmark::Counter(trmvFlops<Scalar>(n), benchmark::Counter::kIsIterationInvariantRate,
|
||||
benchmark::Counter::kIs1000);
|
||||
}
|
||||
|
||||
static void TrmvSizes(::benchmark::Benchmark* b) {
|
||||
for (int n : {8, 16, 32, 64, 128, 256, 512, 1024, 2048}) b->Arg(n);
|
||||
}
|
||||
|
||||
BENCHMARK(BM_TRMV<float, Lower>)->Apply(TrmvSizes)->Name("TRMV_float_Lower");
|
||||
BENCHMARK(BM_TRMV<float, Upper>)->Apply(TrmvSizes)->Name("TRMV_float_Upper");
|
||||
BENCHMARK(BM_TRMV<float, UnitLower>)->Apply(TrmvSizes)->Name("TRMV_float_UnitLower");
|
||||
BENCHMARK(BM_TRMV<float, UnitUpper>)->Apply(TrmvSizes)->Name("TRMV_float_UnitUpper");
|
||||
BENCHMARK(BM_TRMV<double, Lower>)->Apply(TrmvSizes)->Name("TRMV_double_Lower");
|
||||
BENCHMARK(BM_TRMV<double, Upper>)->Apply(TrmvSizes)->Name("TRMV_double_Upper");
|
||||
BENCHMARK(BM_TRMV<double, UnitLower>)->Apply(TrmvSizes)->Name("TRMV_double_UnitLower");
|
||||
BENCHMARK(BM_TRMV<double, UnitUpper>)->Apply(TrmvSizes)->Name("TRMV_double_UnitUpper");
|
||||
Reference in New Issue
Block a user