mirror of
https://gitlab.com/libeigen/eigen.git
synced 2026-04-10 11:34:33 +08:00
Compare commits
4 Commits
master
...
selfadjoin
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
8346cc3410 | ||
|
|
93e9970964 | ||
|
|
3eed3b0ab9 | ||
|
|
8ddbe44799 |
@@ -141,140 +141,6 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog2_float(const Pac
|
||||
return plog_impl_float<Packet, /* base2 */ true>(_x);
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// Double logarithm: shared polynomial + two range-reduction backends
|
||||
// -----------------------------------------------------------------------
|
||||
|
||||
// Cephes rational-polynomial approximation of log(1+f) for
|
||||
// f in [sqrt(0.5)-1, sqrt(2)-1].
|
||||
// Evaluates x - 0.5*x^2 + x^3 * P(x)/Q(x) where P and Q are degree-5.
|
||||
// See: http://www.netlib.org/cephes/
|
||||
template <typename Packet>
|
||||
EIGEN_STRONG_INLINE Packet plog_mantissa_double(const Packet x) {
|
||||
const Packet cst_cephes_log_p0 = pset1<Packet>(1.01875663804580931796E-4);
|
||||
const Packet cst_cephes_log_p1 = pset1<Packet>(4.97494994976747001425E-1);
|
||||
const Packet cst_cephes_log_p2 = pset1<Packet>(4.70579119878881725854E0);
|
||||
const Packet cst_cephes_log_p3 = pset1<Packet>(1.44989225341610930846E1);
|
||||
const Packet cst_cephes_log_p4 = pset1<Packet>(1.79368678507819816313E1);
|
||||
const Packet cst_cephes_log_p5 = pset1<Packet>(7.70838733755885391666E0);
|
||||
// Q0 = 1.0; pmadd(1, x, q1) simplifies to padd(x, q1).
|
||||
const Packet cst_cephes_log_q1 = pset1<Packet>(1.12873587189167450590E1);
|
||||
const Packet cst_cephes_log_q2 = pset1<Packet>(4.52279145837532221105E1);
|
||||
const Packet cst_cephes_log_q3 = pset1<Packet>(8.29875266912776603211E1);
|
||||
const Packet cst_cephes_log_q4 = pset1<Packet>(7.11544750618563894466E1);
|
||||
const Packet cst_cephes_log_q5 = pset1<Packet>(2.31251620126765340583E1);
|
||||
|
||||
Packet x2 = pmul(x, x);
|
||||
Packet x3 = pmul(x2, x);
|
||||
|
||||
// Evaluate P and Q simultaneously for better ILP.
|
||||
Packet y, y1, y_;
|
||||
y = pmadd(cst_cephes_log_p0, x, cst_cephes_log_p1);
|
||||
y1 = pmadd(cst_cephes_log_p3, x, cst_cephes_log_p4);
|
||||
y = pmadd(y, x, cst_cephes_log_p2);
|
||||
y1 = pmadd(y1, x, cst_cephes_log_p5);
|
||||
y_ = pmadd(y, x3, y1);
|
||||
|
||||
y = padd(x, cst_cephes_log_q1);
|
||||
y1 = pmadd(cst_cephes_log_q3, x, cst_cephes_log_q4);
|
||||
y = pmadd(y, x, cst_cephes_log_q2);
|
||||
y1 = pmadd(y1, x, cst_cephes_log_q5);
|
||||
y = pmadd(y, x3, y1);
|
||||
|
||||
y_ = pmul(y_, x3);
|
||||
y = pdiv(y_, y);
|
||||
y = pnmadd(pset1<Packet>(0.5), x2, y);
|
||||
return padd(x, y);
|
||||
}
|
||||
|
||||
// Detect whether unpacket_traits<Packet>::integer_packet is defined.
|
||||
template <typename Packet, typename = void>
|
||||
struct packet_has_integer_packet : std::false_type {};
|
||||
template <typename Packet>
|
||||
struct packet_has_integer_packet<Packet, void_t<typename unpacket_traits<Packet>::integer_packet>> : std::true_type {};
|
||||
|
||||
// Dispatch struct for double-precision range reduction.
|
||||
// Primary template: pfrexp-based fallback (used when integer_packet is absent).
|
||||
template <typename Packet, bool UseIntegerPacket>
|
||||
struct plog_range_reduce_double {
|
||||
EIGEN_STRONG_INLINE static void run(const Packet v, Packet& f, Packet& e) {
|
||||
const Packet one = pset1<Packet>(1.0);
|
||||
const Packet cst_cephes_SQRTHF = pset1<Packet>(0.70710678118654752440E0);
|
||||
// pfrexp: f in [0.5, 1), e = unbiased exponent as double.
|
||||
f = pfrexp(v, e);
|
||||
// Shift [0.5,1) -> [sqrt(0.5)-1, sqrt(2)-1] with exponent correction:
|
||||
// if f < sqrt(0.5): f = f + f - 1, e -= 1 (giving f in [0, sqrt(2)-1))
|
||||
// else: f = f - 1 (giving f in [sqrt(0.5)-1, 0))
|
||||
Packet mask = pcmp_lt(f, cst_cephes_SQRTHF);
|
||||
Packet tmp = pand(f, mask);
|
||||
f = psub(f, one);
|
||||
e = psub(e, pand(one, mask));
|
||||
f = padd(f, tmp);
|
||||
}
|
||||
};
|
||||
|
||||
// Specialisation: fast integer-bit-manipulation path (musl-inspired).
|
||||
// Requires unpacket_traits<Packet>::integer_packet to be a 64-bit integer packet.
|
||||
template <typename Packet>
|
||||
struct plog_range_reduce_double<Packet, true> {
|
||||
EIGEN_STRONG_INLINE static void run(const Packet v, Packet& f, Packet& e) {
|
||||
typedef typename unpacket_traits<Packet>::integer_packet PacketI;
|
||||
// 2^-1022: smallest positive normal double.
|
||||
const PacketI cst_min_normal = pset1<PacketI>(static_cast<int64_t>(0x0010000000000000LL));
|
||||
// Lower 52-bit mask (IEEE mantissa field).
|
||||
const PacketI cst_mant_mask = pset1<PacketI>(static_cast<int64_t>(0x000FFFFFFFFFFFFFLL));
|
||||
// Offset = 1.0_bits - sqrt(0.5)_bits. Adding this to the integer
|
||||
// representation shifts the exponent field so that the [sqrt(0.5), sqrt(2))
|
||||
// half-octave boundary falls on an exact biased-exponent boundary, letting
|
||||
// us extract e with a single right shift. The constant is:
|
||||
// 0x3FF0000000000000 - 0x3FE6A09E667F3BCD = 0x00095F619980C433
|
||||
const PacketI cst_sqrt_half_offset =
|
||||
pset1<PacketI>(static_cast<int64_t>(0x3FF0000000000000LL - 0x3FE6A09E667F3BCDLL));
|
||||
// IEEE double exponent bias (1023).
|
||||
const PacketI cst_exp_bias = pset1<PacketI>(static_cast<int64_t>(1023));
|
||||
// sqrt(0.5) IEEE bits — used to reconstruct f from biased mantissa.
|
||||
const PacketI cst_half_mant = pset1<PacketI>(static_cast<int64_t>(0x3FE6A09E667F3BCDLL));
|
||||
|
||||
// Reinterpret v as a 64-bit integer vector.
|
||||
PacketI vi = preinterpret<PacketI>(v);
|
||||
|
||||
// Normalise denormals: multiply by 2^52 and correct the exponent by -52.
|
||||
PacketI is_denormal = pcmp_lt(vi, cst_min_normal);
|
||||
// 2^52 via bit pattern: biased exponent = 52 + 1023 = 0x433, mantissa = 0.
|
||||
Packet v_norm = pmul(v, pset1frombits<Packet>(static_cast<uint64_t>(int64_t(52 + 0x3ff) << 52)));
|
||||
vi = pselect(is_denormal, preinterpret<PacketI>(v_norm), vi);
|
||||
PacketI denorm_adj = pand(is_denormal, pset1<PacketI>(static_cast<int64_t>(52)));
|
||||
|
||||
// Bias the integer representation so the exponent field directly encodes
|
||||
// the half-octave index.
|
||||
PacketI vi_biased = padd(vi, cst_sqrt_half_offset);
|
||||
// Extract unbiased exponent: shift out mantissa bits, subtract IEEE bias
|
||||
// and denormal adjustment.
|
||||
PacketI e_int = psub(psub(plogical_shift_right<52>(vi_biased), cst_exp_bias), denorm_adj);
|
||||
// Convert integer exponent to floating-point.
|
||||
e = pcast<PacketI, Packet>(e_int);
|
||||
|
||||
// Reconstruct mantissa in [sqrt(0.5), sqrt(2)) via integer arithmetic.
|
||||
// The integer addition of the masked mantissa bits and the sqrt(0.5) bit
|
||||
// pattern carries into the exponent field, yielding a value in that range.
|
||||
// Then subtract 1 to centre on 0: f in [sqrt(0.5)-1, sqrt(2)-1].
|
||||
f = psub(preinterpret<Packet>(padd(pand(vi_biased, cst_mant_mask), cst_half_mant)), pset1<Packet>(1.0));
|
||||
}
|
||||
};
|
||||
|
||||
// Core range reduction and polynomial for double logarithm.
|
||||
// Input: v > 0 (zero / negative / inf / nan are handled by the caller).
|
||||
// Output: log_mantissa ≈ log(mantissa of v in [sqrt(0.5), sqrt(2))),
|
||||
// e = unbiased exponent of v as a double.
|
||||
// Selects the fast integer path when integer_packet is available, otherwise
|
||||
// falls back to pfrexp.
|
||||
template <typename Packet>
|
||||
EIGEN_STRONG_INLINE void plog_core_double(const Packet v, Packet& log_mantissa, Packet& e) {
|
||||
Packet f;
|
||||
plog_range_reduce_double<Packet, packet_has_integer_packet<Packet>::value>::run(v, f, e);
|
||||
log_mantissa = plog_mantissa_double(f);
|
||||
}
|
||||
|
||||
/* Returns the base e (2.718...) or base 2 logarithm of x.
|
||||
* The argument is separated into its exponent and fractional parts.
|
||||
* The logarithm of the fraction in the interval [sqrt(1/2), sqrt(2)],
|
||||
@@ -286,29 +152,87 @@ EIGEN_STRONG_INLINE void plog_core_double(const Packet v, Packet& log_mantissa,
|
||||
*/
|
||||
template <typename Packet, bool base2>
|
||||
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog_impl_double(const Packet _x) {
|
||||
Packet x = _x;
|
||||
|
||||
const Packet cst_1 = pset1<Packet>(1.0);
|
||||
const Packet cst_neg_half = pset1<Packet>(-0.5);
|
||||
const Packet cst_minus_inf = pset1frombits<Packet>(static_cast<uint64_t>(0xfff0000000000000ull));
|
||||
const Packet cst_pos_inf = pset1frombits<Packet>(static_cast<uint64_t>(0x7ff0000000000000ull));
|
||||
|
||||
Packet log_mantissa, e;
|
||||
plog_core_double(_x, log_mantissa, e);
|
||||
// Polynomial Coefficients for log(1+x) = x - x**2/2 + x**3 P(x)/Q(x)
|
||||
// 1/sqrt(2) <= x < sqrt(2)
|
||||
const Packet cst_cephes_SQRTHF = pset1<Packet>(0.70710678118654752440E0);
|
||||
const Packet cst_cephes_log_p0 = pset1<Packet>(1.01875663804580931796E-4);
|
||||
const Packet cst_cephes_log_p1 = pset1<Packet>(4.97494994976747001425E-1);
|
||||
const Packet cst_cephes_log_p2 = pset1<Packet>(4.70579119878881725854E0);
|
||||
const Packet cst_cephes_log_p3 = pset1<Packet>(1.44989225341610930846E1);
|
||||
const Packet cst_cephes_log_p4 = pset1<Packet>(1.79368678507819816313E1);
|
||||
const Packet cst_cephes_log_p5 = pset1<Packet>(7.70838733755885391666E0);
|
||||
|
||||
// Combine: log(x) = e * ln2 + log(mantissa), or log2(x) = log(mantissa)*log2e + e.
|
||||
Packet x;
|
||||
const Packet cst_cephes_log_q0 = pset1<Packet>(1.0);
|
||||
const Packet cst_cephes_log_q1 = pset1<Packet>(1.12873587189167450590E1);
|
||||
const Packet cst_cephes_log_q2 = pset1<Packet>(4.52279145837532221105E1);
|
||||
const Packet cst_cephes_log_q3 = pset1<Packet>(8.29875266912776603211E1);
|
||||
const Packet cst_cephes_log_q4 = pset1<Packet>(7.11544750618563894466E1);
|
||||
const Packet cst_cephes_log_q5 = pset1<Packet>(2.31251620126765340583E1);
|
||||
|
||||
Packet e;
|
||||
// extract significant in the range [0.5,1) and exponent
|
||||
x = pfrexp(x, e);
|
||||
|
||||
// Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2))
|
||||
// and shift by -1. The values are then centered around 0, which improves
|
||||
// the stability of the polynomial evaluation.
|
||||
// if( x < SQRTHF ) {
|
||||
// e -= 1;
|
||||
// x = x + x - 1.0;
|
||||
// } else { x = x - 1.0; }
|
||||
Packet mask = pcmp_lt(x, cst_cephes_SQRTHF);
|
||||
Packet tmp = pand(x, mask);
|
||||
x = psub(x, cst_1);
|
||||
e = psub(e, pand(cst_1, mask));
|
||||
x = padd(x, tmp);
|
||||
|
||||
Packet x2 = pmul(x, x);
|
||||
Packet x3 = pmul(x2, x);
|
||||
|
||||
// Evaluate the polynomial in factored form for better instruction-level parallelism.
|
||||
// y = x - 0.5*x^2 + x^3 * polevl( x, P, 5 ) / p1evl( x, Q, 5 ) );
|
||||
Packet y, y1, y_;
|
||||
y = pmadd(cst_cephes_log_p0, x, cst_cephes_log_p1);
|
||||
y1 = pmadd(cst_cephes_log_p3, x, cst_cephes_log_p4);
|
||||
y = pmadd(y, x, cst_cephes_log_p2);
|
||||
y1 = pmadd(y1, x, cst_cephes_log_p5);
|
||||
y_ = pmadd(y, x3, y1);
|
||||
|
||||
y = pmadd(cst_cephes_log_q0, x, cst_cephes_log_q1);
|
||||
y1 = pmadd(cst_cephes_log_q3, x, cst_cephes_log_q4);
|
||||
y = pmadd(y, x, cst_cephes_log_q2);
|
||||
y1 = pmadd(y1, x, cst_cephes_log_q5);
|
||||
y = pmadd(y, x3, y1);
|
||||
|
||||
y_ = pmul(y_, x3);
|
||||
y = pdiv(y_, y);
|
||||
|
||||
y = pmadd(cst_neg_half, x2, y);
|
||||
x = padd(x, y);
|
||||
|
||||
// Add the logarithm of the exponent back to the result of the interpolation.
|
||||
if (base2) {
|
||||
const Packet cst_log2e = pset1<Packet>(static_cast<double>(EIGEN_LOG2E));
|
||||
x = pmadd(log_mantissa, cst_log2e, e);
|
||||
x = pmadd(x, cst_log2e, e);
|
||||
} else {
|
||||
const Packet cst_ln2 = pset1<Packet>(static_cast<double>(EIGEN_LN2));
|
||||
x = pmadd(e, cst_ln2, log_mantissa);
|
||||
x = pmadd(e, cst_ln2, x);
|
||||
}
|
||||
|
||||
Packet invalid_mask = pcmp_lt_or_nan(_x, pzero(_x));
|
||||
Packet iszero_mask = pcmp_eq(_x, pzero(_x));
|
||||
Packet pos_inf_mask = pcmp_eq(_x, cst_pos_inf);
|
||||
// Filter out invalid inputs:
|
||||
// - negative arg → NAN
|
||||
// - 0 → -INF
|
||||
// - +INF → +INF
|
||||
// Filter out invalid inputs, i.e.:
|
||||
// - negative arg will be NAN
|
||||
// - 0 will be -INF
|
||||
// - +INF will be +INF
|
||||
return pselect(iszero_mask, cst_minus_inf, por(pselect(pos_inf_mask, cst_pos_inf, x), invalid_mask));
|
||||
}
|
||||
|
||||
@@ -362,11 +286,8 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_log1p_float(c
|
||||
return result;
|
||||
}
|
||||
|
||||
/** \internal \returns log(1 + x) for double precision.
|
||||
Computes log(1+x) using plog_core_double for the core range reduction and
|
||||
polynomial evaluation. The rounding error from forming u = fl(1+x) is
|
||||
recovered as dx = x - (u - 1) and folded in as a first-order correction
|
||||
dx/u after the polynomial evaluation.
|
||||
/** \internal \returns log(1 + x) for double precision float.
|
||||
Same direct approach as the float version.
|
||||
*/
|
||||
template <typename Packet>
|
||||
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_log1p_double(const Packet& x) {
|
||||
@@ -374,31 +295,67 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_log1p_double(
|
||||
const Packet cst_minus_inf = pset1frombits<Packet>(static_cast<uint64_t>(0xfff0000000000000ull));
|
||||
const Packet cst_pos_inf = pset1frombits<Packet>(static_cast<uint64_t>(0x7ff0000000000000ull));
|
||||
|
||||
// u = 1 + x, with rounding. Recover the lost low bits: dx = x - (u - 1).
|
||||
Packet u = padd(one, x);
|
||||
Packet dx = psub(x, psub(u, one));
|
||||
|
||||
// For |x| tiny enough that u rounds to 1, return x directly.
|
||||
Packet small_mask = pcmp_eq(u, one);
|
||||
// For u = +inf (x very large), return +inf.
|
||||
Packet inf_mask = pcmp_eq(u, cst_pos_inf);
|
||||
|
||||
// Core range reduction and polynomial on u.
|
||||
Packet log_u, e;
|
||||
plog_core_double(u, log_u, e);
|
||||
const Packet cst_cephes_SQRTHF = pset1<Packet>(0.70710678118654752440E0);
|
||||
Packet e;
|
||||
Packet m = pfrexp(u, e);
|
||||
Packet mask = pcmp_lt(m, cst_cephes_SQRTHF);
|
||||
Packet tmp = pand(m, mask);
|
||||
m = psub(m, one);
|
||||
e = psub(e, pand(one, mask));
|
||||
m = padd(m, tmp);
|
||||
|
||||
// result = e * ln2 + log(u) + dx/u.
|
||||
// The dx/u term corrects for the rounding error in u = fl(1+x).
|
||||
// Same polynomial as plog_double.
|
||||
const Packet cst_neg_half = pset1<Packet>(-0.5);
|
||||
const Packet cst_cephes_log_p0 = pset1<Packet>(1.01875663804580931796E-4);
|
||||
const Packet cst_cephes_log_p1 = pset1<Packet>(4.97494994976747001425E-1);
|
||||
const Packet cst_cephes_log_p2 = pset1<Packet>(4.70579119878881725854E0);
|
||||
const Packet cst_cephes_log_p3 = pset1<Packet>(1.44989225341610930846E1);
|
||||
const Packet cst_cephes_log_p4 = pset1<Packet>(1.79368678507819816313E1);
|
||||
const Packet cst_cephes_log_p5 = pset1<Packet>(7.70838733755885391666E0);
|
||||
const Packet cst_cephes_log_q0 = pset1<Packet>(1.0);
|
||||
const Packet cst_cephes_log_q1 = pset1<Packet>(1.12873587189167450590E1);
|
||||
const Packet cst_cephes_log_q2 = pset1<Packet>(4.52279145837532221105E1);
|
||||
const Packet cst_cephes_log_q3 = pset1<Packet>(8.29875266912776603211E1);
|
||||
const Packet cst_cephes_log_q4 = pset1<Packet>(7.11544750618563894466E1);
|
||||
const Packet cst_cephes_log_q5 = pset1<Packet>(2.31251620126765340583E1);
|
||||
|
||||
Packet m2 = pmul(m, m);
|
||||
Packet m3 = pmul(m2, m);
|
||||
|
||||
Packet y, y1, y_;
|
||||
y = pmadd(cst_cephes_log_p0, m, cst_cephes_log_p1);
|
||||
y1 = pmadd(cst_cephes_log_p3, m, cst_cephes_log_p4);
|
||||
y = pmadd(y, m, cst_cephes_log_p2);
|
||||
y1 = pmadd(y1, m, cst_cephes_log_p5);
|
||||
y_ = pmadd(y, m3, y1);
|
||||
|
||||
y = pmadd(cst_cephes_log_q0, m, cst_cephes_log_q1);
|
||||
y1 = pmadd(cst_cephes_log_q3, m, cst_cephes_log_q4);
|
||||
y = pmadd(y, m, cst_cephes_log_q2);
|
||||
y1 = pmadd(y1, m, cst_cephes_log_q5);
|
||||
y = pmadd(y, m3, y1);
|
||||
|
||||
y_ = pmul(y_, m3);
|
||||
Packet log_m = pdiv(y_, y);
|
||||
log_m = pmadd(cst_neg_half, m2, log_m);
|
||||
log_m = padd(m, log_m);
|
||||
|
||||
// result = e * ln2 + log(m) + dx/u.
|
||||
const Packet cst_ln2 = pset1<Packet>(static_cast<double>(EIGEN_LN2));
|
||||
Packet result = pmadd(e, cst_ln2, padd(log_u, pdiv(dx, u)));
|
||||
Packet result = pmadd(e, cst_ln2, padd(log_m, pdiv(dx, u)));
|
||||
|
||||
// Handle special cases.
|
||||
Packet neg_mask = pcmp_lt(u, pzero(u));
|
||||
Packet zero_mask = pcmp_eq(x, pset1<Packet>(-1.0));
|
||||
result = pselect(small_mask, x, result);
|
||||
result = pselect(inf_mask, cst_pos_inf, result);
|
||||
result = pselect(zero_mask, cst_minus_inf, result);
|
||||
result = por(neg_mask, result); // NaN for x < -1
|
||||
result = por(neg_mask, result);
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
@@ -230,31 +230,40 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet ptan_float(const Pack
|
||||
return psincos_float<TrigFunction::Tan>(x);
|
||||
}
|
||||
|
||||
// Pi/2 split into 3 double-precision parts (triple-double).
|
||||
// c1 + c2 + c3 = pi/2 to ~159 bits. Computed by Sollya.
|
||||
// c1 = RD(pi/2), c2 = RD(pi/2 - c1), c3 = RD(pi/2 - c1 - c2).
|
||||
template <typename Packet>
|
||||
Packet cst_pio2_1() {
|
||||
return pset1<Packet>(-1.5707963267948965579989817342720925807952880859375); // -0x1.921fb54442d18p0
|
||||
}
|
||||
template <typename Packet>
|
||||
Packet cst_pio2_2() {
|
||||
return pset1<Packet>(-6.12323399573676603586882014729198302312846062338790e-17); // -0x1.1a62633145c07p-54
|
||||
}
|
||||
template <typename Packet>
|
||||
Packet cst_pio2_3() {
|
||||
return pset1<Packet>(1.4973849048591698329435081771059920083527504761695190e-33); // 0x1.f1976b7ed8fbcp-110
|
||||
}
|
||||
|
||||
// Trigonometric argument reduction for double, small inputs (|x| < small_th).
|
||||
// Reduces x to t such that x = q * pi/2 + t, where |t| <= pi/4.
|
||||
// Uses a triple-double split of pi/2 with FMA for high accuracy.
|
||||
// Trigonometric argument reduction for double for inputs smaller than 15.
|
||||
// Reduces trigonometric arguments for double inputs where x < 15. Given an argument x and its corresponding quadrant
|
||||
// count n, the function computes and returns the reduced argument t such that x = n * pi/2 + t.
|
||||
template <typename Packet>
|
||||
Packet trig_reduce_small_double(const Packet& x, const Packet& q) {
|
||||
// Pi/2 split into 2 values
|
||||
const Packet cst_pio2_a = pset1<Packet>(-1.570796325802803);
|
||||
const Packet cst_pio2_b = pset1<Packet>(-9.920935184482005e-10);
|
||||
|
||||
Packet t;
|
||||
t = pmadd(cst_pio2_1<Packet>(), q, x);
|
||||
t = pmadd(cst_pio2_2<Packet>(), q, t);
|
||||
t = pmadd(cst_pio2_3<Packet>(), q, t);
|
||||
t = pmadd(cst_pio2_a, q, x);
|
||||
t = pmadd(cst_pio2_b, q, t);
|
||||
return t;
|
||||
}
|
||||
|
||||
// Trigonometric argument reduction for double for inputs smaller than 1e14.
|
||||
// Reduces trigonometric arguments for double inputs where x < 1e14. Given an argument x and its corresponding quadrant
|
||||
// count n, the function computes and returns the reduced argument t such that x = n * pi/2 + t.
|
||||
template <typename Packet>
|
||||
Packet trig_reduce_medium_double(const Packet& x, const Packet& q_high, const Packet& q_low) {
|
||||
// Pi/2 split into 4 values
|
||||
const Packet cst_pio2_a = pset1<Packet>(-1.570796325802803);
|
||||
const Packet cst_pio2_b = pset1<Packet>(-9.920935184482005e-10);
|
||||
const Packet cst_pio2_c = pset1<Packet>(-6.123234014771656e-17);
|
||||
const Packet cst_pio2_d = pset1<Packet>(1.903488962019325e-25);
|
||||
|
||||
Packet t;
|
||||
t = pmadd(cst_pio2_a, q_high, x);
|
||||
t = pmadd(cst_pio2_a, q_low, t);
|
||||
t = pmadd(cst_pio2_b, q_high, t);
|
||||
t = pmadd(cst_pio2_b, q_low, t);
|
||||
t = pmadd(cst_pio2_c, q_high, t);
|
||||
t = pmadd(cst_pio2_c, q_low, t);
|
||||
t = pmadd(cst_pio2_d, padd(q_low, q_high), t);
|
||||
return t;
|
||||
}
|
||||
|
||||
@@ -275,13 +284,11 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
||||
// If the argument is bigger than this value, use the non-vectorized std version
|
||||
const double huge_th = 1e14;
|
||||
|
||||
// 2/PI as a double-word: hi + lo = 2/pi to ~107 bits. Computed by Sollya.
|
||||
const Packet cst_2oPI_hi =
|
||||
pset1<Packet>(0.63661977236758138243288840385503135621547698974609375); // 0x1.45f306dc9c883p-1
|
||||
const Packet cst_2oPI_lo =
|
||||
pset1<Packet>(-3.9357353350364971763790381828183628368294820823718866e-17); // -0x1.6b01ec5417056p-55
|
||||
const Packet cst_2oPI = pset1<Packet>(0.63661977236758134307553505349006); // 2/PI
|
||||
// Integer Packet constants
|
||||
const PacketI cst_one = pset1<PacketI>(ScalarI(1));
|
||||
// Constant for splitting
|
||||
const Packet cst_split = pset1<Packet>(1 << 24);
|
||||
|
||||
Packet x_abs = pabs(x);
|
||||
|
||||
@@ -291,56 +298,76 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
||||
|
||||
// TODO Implement huge angle argument reduction
|
||||
if (EIGEN_PREDICT_FALSE(predux_any(pcmp_le(pset1<Packet>(small_th), x_abs)))) {
|
||||
// Medium path: use double-word product x * (2/pi) for precise quadrant computation.
|
||||
Packet prod_hi, prod_lo;
|
||||
twoprod(x_abs, cst_2oPI_hi, prod_hi, prod_lo);
|
||||
// Correction for 2/pi truncation: add x * lo(2/pi)
|
||||
prod_lo = pmadd(x_abs, cst_2oPI_lo, prod_lo);
|
||||
|
||||
// Round the double-word (prod_hi, prod_lo) to the nearest integer.
|
||||
Packet q = pround(prod_hi);
|
||||
// Compute exact fractional part to check if rounding was correct.
|
||||
Packet frac = padd(psub(prod_hi, q), prod_lo);
|
||||
// Correct if fractional part crossed +-0.5 boundary.
|
||||
q = padd(q, pand(pcmp_lt(pset1<Packet>(0.5), frac), pset1<Packet>(1.0)));
|
||||
q = padd(q, pand(pcmp_lt(frac, pset1<Packet>(-0.5)), pset1<Packet>(-1.0)));
|
||||
|
||||
q_int = pcast<Packet, PacketI>(q);
|
||||
s = trig_reduce_small_double(x_abs, q);
|
||||
Packet q_high = pmul(pfloor(pmul(x_abs, pdiv(cst_2oPI, cst_split))), cst_split);
|
||||
Packet q_low_noround = psub(pmul(x_abs, cst_2oPI), q_high);
|
||||
q_int = pcast<Packet, PacketI>(padd(q_low_noround, pset1<Packet>(0.5)));
|
||||
Packet q_low = pcast<PacketI, Packet>(q_int);
|
||||
s = trig_reduce_medium_double(x_abs, q_high, q_low);
|
||||
} else {
|
||||
// Small path: simple reduction with triple-double pi/2 split.
|
||||
Packet qval_noround = pmul(x_abs, cst_2oPI_hi);
|
||||
Packet qval_noround = pmul(x_abs, cst_2oPI);
|
||||
q_int = pcast<Packet, PacketI>(padd(qval_noround, pset1<Packet>(0.5)));
|
||||
Packet q = pcast<PacketI, Packet>(q_int);
|
||||
s = trig_reduce_small_double(x_abs, q);
|
||||
}
|
||||
|
||||
// All the upcoming approximating polynomials have even exponents
|
||||
Packet ss = pmul(s, s);
|
||||
|
||||
// Minimax polynomial approximation of cos(x) on [-pi/4, pi/4].
|
||||
// cos(x) = 1 + u * P(u), where u = x^2 and P is degree 6 (7 FMAs total).
|
||||
// Coefficients computed by Sollya fpminimax. Max polynomial error ~1.3e-19.
|
||||
Packet scos = pset1<Packet>(-1.1368926065317776472832699312119132152576472805094454088248312473297119140625e-11);
|
||||
scos = pmadd(scos, ss, pset1<Packet>(2.0875905481768720039634091158002593413556269297259859740734100341796875e-09));
|
||||
scos = pmadd(scos, ss, pset1<Packet>(-2.7557315712466412785356544880299711763882442028261721134185791015625e-07));
|
||||
scos = pmadd(scos, ss, pset1<Packet>(2.480158729424286522739599714082459058772656135261058807373046875e-05));
|
||||
scos = pmadd(scos, ss, pset1<Packet>(-1.388888888888178789471350427220386336557567119598388671875e-03));
|
||||
scos = pmadd(scos, ss, pset1<Packet>(4.166666666666664353702032030923874117434024810791015625e-02));
|
||||
scos = pmadd(scos, ss, pset1<Packet>(-0.5));
|
||||
scos = pmadd(scos, ss, pset1<Packet>(1.0));
|
||||
// Padé approximant of cos(x)
|
||||
// Assuring < 1 ULP error on the interval [-pi/4, pi/4]
|
||||
// cos(x) ~= (80737373*x^8 - 13853547000*x^6 + 727718024880*x^4 - 11275015752000*x^2 + 23594700729600)/(147173*x^8 +
|
||||
// 39328920*x^6 + 5772800880*x^4 + 522334612800*x^2 + 23594700729600)
|
||||
// MATLAB code to compute those coefficients:
|
||||
// syms x;
|
||||
// cosf = @(x) cos(x);
|
||||
// pade_cosf = pade(cosf(x), x, 0, 'Order', 8)
|
||||
const Packet cn4 = pset1<Packet>(80737373);
|
||||
const Packet cn3 = pset1<Packet>(-13853547000);
|
||||
const Packet cn2 = pset1<Packet>(727718024880);
|
||||
const Packet cn1 = pset1<Packet>(-11275015752000);
|
||||
const Packet cn0 = pset1<Packet>(23594700729600); // shared with cd0
|
||||
const Packet cd3 = pset1<Packet>(147173);
|
||||
const Packet cd2 = pset1<Packet>(39328920);
|
||||
const Packet cd1 = pset1<Packet>(5772800880);
|
||||
const Packet cd0 = pset1<Packet>(522334612800);
|
||||
Packet sc1_num = pmadd(ss, cn4, cn3);
|
||||
Packet sc2_num = pmadd(sc1_num, ss, cn2);
|
||||
Packet sc3_num = pmadd(sc2_num, ss, cn1);
|
||||
Packet sc4_num = pmadd(sc3_num, ss, cn0);
|
||||
Packet sc1_denum = pmadd(ss, cd3, cd2);
|
||||
Packet sc2_denum = pmadd(sc1_denum, ss, cd1);
|
||||
Packet sc3_denum = pmadd(sc2_denum, ss, cd0);
|
||||
Packet sc4_denum = pmadd(sc3_denum, ss, cn0);
|
||||
Packet scos = pdiv(sc4_num, sc4_denum);
|
||||
|
||||
// Minimax polynomial approximation of sin(x) on [-pi/4, pi/4].
|
||||
// sin(x) = x * (1 + u * R(u)), where u = x^2 and R is degree 5.
|
||||
// Computed as: x + x * u * R(u) (6 FMAs + 1 mul).
|
||||
// Coefficients computed by Sollya fpminimax. Max polynomial error ~1.0e-17.
|
||||
Packet ssin = pset1<Packet>(1.59193066075142890698150587293845624470289834562208852730691432952880859375e-10);
|
||||
ssin = pmadd(ssin, ss, pset1<Packet>(-2.50511517945670206974594627392927126408039839589037001132965087890625e-08));
|
||||
ssin = pmadd(ssin, ss, pset1<Packet>(2.755731622544328228235042954619160582296899519860744476318359375e-06));
|
||||
ssin = pmadd(ssin, ss, pset1<Packet>(-1.9841269837089632013978068858506276228581555187702178955078125e-04));
|
||||
ssin = pmadd(ssin, ss, pset1<Packet>(8.333333333331312264835588621281203813850879669189453125e-03));
|
||||
ssin = pmadd(ssin, ss, pset1<Packet>(-0.1666666666666666574148081281236954964697360992431640625));
|
||||
ssin = pmul(ssin, ss);
|
||||
ssin = pmadd(ssin, s, s);
|
||||
// Padé approximant of sin(x)
|
||||
// Assuring < 1 ULP error on the interval [-pi/4, pi/4]
|
||||
// sin(x) ~= (x*(4585922449*x^8 - 1066023933480*x^6 + 83284044283440*x^4 - 2303682236856000*x^2 +
|
||||
// 15605159573203200))/(45*(1029037*x^8 + 345207016*x^6 + 61570292784*x^4 + 6603948711360*x^2 + 346781323848960))
|
||||
// MATLAB code to compute those coefficients:
|
||||
// syms x;
|
||||
// sinf = @(x) sin(x);
|
||||
// pade_sinf = pade(sinf(x), x, 0, 'Order', 8, 'OrderMode', 'relative')
|
||||
const Packet sn4 = pset1<Packet>(4585922449);
|
||||
const Packet sn3 = pset1<Packet>(-1066023933480);
|
||||
const Packet sn2 = pset1<Packet>(83284044283440);
|
||||
const Packet sn1 = pset1<Packet>(-2303682236856000);
|
||||
const Packet sn0 = pset1<Packet>(15605159573203200);
|
||||
const Packet sd3 = pset1<Packet>(1029037);
|
||||
const Packet sd2 = pset1<Packet>(345207016);
|
||||
const Packet sd1 = pset1<Packet>(61570292784);
|
||||
const Packet sd0_inner = pset1<Packet>(6603948711360);
|
||||
const Packet sd0 = pset1<Packet>(346781323848960);
|
||||
const Packet cst_45 = pset1<Packet>(45);
|
||||
Packet ss1_num = pmadd(ss, sn4, sn3);
|
||||
Packet ss2_num = pmadd(ss1_num, ss, sn2);
|
||||
Packet ss3_num = pmadd(ss2_num, ss, sn1);
|
||||
Packet ss4_num = pmadd(ss3_num, ss, sn0);
|
||||
Packet ss1_denum = pmadd(ss, sd3, sd2);
|
||||
Packet ss2_denum = pmadd(ss1_denum, ss, sd1);
|
||||
Packet ss3_denum = pmadd(ss2_denum, ss, sd0_inner);
|
||||
Packet ss4_denum = pmadd(ss3_denum, ss, sd0);
|
||||
Packet ssin = pdiv(pmul(s, ss4_num), pmul(cst_45, ss4_denum));
|
||||
|
||||
Packet poly_mask = preinterpret<Packet>(pcmp_eq(pand(q_int, cst_one), pzero(q_int)));
|
||||
|
||||
|
||||
@@ -25,7 +25,7 @@ namespace internal {
|
||||
template <typename SolverType, int Size, bool IsComplex>
|
||||
struct direct_selfadjoint_eigenvalues;
|
||||
|
||||
template <bool PerBlockScaling, typename MatrixType, typename DiagType, typename SubDiagType>
|
||||
template <typename MatrixType, typename DiagType, typename SubDiagType>
|
||||
EIGEN_DEVICE_FUNC ComputationInfo computeFromTridiagonal_impl(DiagType& diag, SubDiagType& subdiag,
|
||||
const Index maxIterations, bool computeEigenvectors,
|
||||
MatrixType& eivec);
|
||||
@@ -438,7 +438,7 @@ EIGEN_DEVICE_FUNC SelfAdjointEigenSolver<MatrixType>& SelfAdjointEigenSolver<Mat
|
||||
m_eivec = matrix;
|
||||
m_eivalues.coeffRef(0, 0) = numext::real(m_eivec.coeff(0, 0));
|
||||
if (computeEigenvectors) m_eivec.setOnes(n, n);
|
||||
m_info = (numext::isfinite)(m_eivalues.coeffRef(0, 0)) ? Success : NoConvergence;
|
||||
m_info = Success;
|
||||
m_isInitialized = true;
|
||||
m_eigenvectorsOk = computeEigenvectors;
|
||||
return *this;
|
||||
@@ -448,11 +448,7 @@ EIGEN_DEVICE_FUNC SelfAdjointEigenSolver<MatrixType>& SelfAdjointEigenSolver<Mat
|
||||
RealVectorType& diag = m_eivalues;
|
||||
EigenvectorsType& mat = m_eivec;
|
||||
|
||||
// Scale the matrix to [-1:1] to avoid overflow/underflow during tridiagonalization
|
||||
// and subsequent QR iteration. This uniform scaling ensures the tridiagonal output is
|
||||
// well-conditioned. Note: for block-diagonal matrices with widely separated scales, this
|
||||
// can underflow small blocks. Users with such matrices should tridiagonalize separately
|
||||
// and call computeFromTridiagonal(), which uses per-block scaling.
|
||||
// map the matrix coefficients to [-1:1] to avoid over- and underflow.
|
||||
mat = matrix.template triangularView<Lower>();
|
||||
RealScalar scale = mat.cwiseAbs().maxCoeff();
|
||||
if (!(numext::isfinite)(scale)) {
|
||||
@@ -468,9 +464,9 @@ EIGEN_DEVICE_FUNC SelfAdjointEigenSolver<MatrixType>& SelfAdjointEigenSolver<Mat
|
||||
m_hcoeffs.resize(n - 1);
|
||||
internal::tridiagonalization_inplace(mat, diag, m_subdiag, m_hcoeffs, m_workspace, computeEigenvectors);
|
||||
|
||||
m_info = internal::computeFromTridiagonal_impl<false>(diag, m_subdiag, m_maxIterations, computeEigenvectors, m_eivec);
|
||||
m_info = internal::computeFromTridiagonal_impl(diag, m_subdiag, m_maxIterations, computeEigenvectors, m_eivec);
|
||||
|
||||
// Scale back the eigenvalues.
|
||||
// scale back the eigen values
|
||||
m_eivalues *= scale;
|
||||
|
||||
m_isInitialized = true;
|
||||
@@ -486,26 +482,30 @@ SelfAdjointEigenSolver<MatrixType>& SelfAdjointEigenSolver<MatrixType>::computeF
|
||||
m_eivalues = diag;
|
||||
m_subdiag = subdiag;
|
||||
|
||||
// Check for Inf/NaN in the input.
|
||||
{
|
||||
RealScalar scale = RealScalar(0);
|
||||
if (m_eivalues.size() > 0) scale = m_eivalues.cwiseAbs().maxCoeff();
|
||||
if (m_subdiag.size() > 0) scale = numext::maxi(scale, m_subdiag.cwiseAbs().maxCoeff());
|
||||
if (!(numext::isfinite)(scale)) {
|
||||
m_info = NoConvergence;
|
||||
m_isInitialized = true;
|
||||
m_eigenvectorsOk = false;
|
||||
return *this;
|
||||
}
|
||||
// Scale the tridiagonal matrix to [-1:1] to avoid over- and underflow,
|
||||
// just like compute() does for the full matrix.
|
||||
RealScalar scale = m_eivalues.cwiseAbs().maxCoeff();
|
||||
if (m_subdiag.size() > 0) scale = numext::maxi(scale, m_subdiag.cwiseAbs().maxCoeff());
|
||||
if (!(numext::isfinite)(scale)) {
|
||||
m_info = NoConvergence;
|
||||
m_isInitialized = true;
|
||||
m_eigenvectorsOk = false;
|
||||
return *this;
|
||||
}
|
||||
if (numext::is_exactly_zero(scale)) scale = RealScalar(1);
|
||||
const bool needsScaling = scale != RealScalar(1);
|
||||
if (needsScaling) {
|
||||
m_eivalues /= scale;
|
||||
m_subdiag /= scale;
|
||||
}
|
||||
|
||||
if (computeEigenvectors) {
|
||||
m_eivec.setIdentity(diag.size(), diag.size());
|
||||
}
|
||||
// Use per-deflation-block scaling (like LAPACK's DSTERF) to avoid losing
|
||||
// precision when the tridiagonal entries span a wide range of magnitudes.
|
||||
m_info =
|
||||
internal::computeFromTridiagonal_impl<true>(m_eivalues, m_subdiag, m_maxIterations, computeEigenvectors, m_eivec);
|
||||
m_info = internal::computeFromTridiagonal_impl(m_eivalues, m_subdiag, m_maxIterations, computeEigenvectors, m_eivec);
|
||||
|
||||
// Scale back the eigenvalues.
|
||||
if (needsScaling) m_eivalues *= scale;
|
||||
|
||||
m_isInitialized = true;
|
||||
m_eigenvectorsOk = computeEigenvectors;
|
||||
@@ -517,10 +517,6 @@ namespace internal {
|
||||
* \internal
|
||||
* \brief Compute the eigendecomposition from a tridiagonal matrix
|
||||
*
|
||||
* \tparam PerBlockScaling If true, each deflation block is independently scaled to [-1,1] before
|
||||
* QR iteration, following LAPACK's DSTERF approach. This prevents precision loss when entries
|
||||
* span a wide range of magnitudes. When false, the caller is responsible for ensuring the
|
||||
* entries are in a safe range (e.g. by pre-scaling the dense matrix before tridiagonalization).
|
||||
* \param[in,out] diag : On input, the diagonal of the matrix, on output the eigenvalues
|
||||
* \param[in,out] subdiag : The subdiagonal part of the matrix (entries are modified during the decomposition)
|
||||
* \param[in] maxIterations : the maximum number of iterations
|
||||
@@ -528,7 +524,7 @@ namespace internal {
|
||||
* \param[out] eivec : The matrix to store the eigenvectors if computeEigenvectors==true. Must be allocated on input.
|
||||
* \returns \c Success or \c NoConvergence
|
||||
*/
|
||||
template <bool PerBlockScaling, typename MatrixType, typename DiagType, typename SubDiagType>
|
||||
template <typename MatrixType, typename DiagType, typename SubDiagType>
|
||||
EIGEN_DEVICE_FUNC ComputationInfo computeFromTridiagonal_impl(DiagType& diag, SubDiagType& subdiag,
|
||||
const Index maxIterations, bool computeEigenvectors,
|
||||
MatrixType& eivec) {
|
||||
@@ -543,32 +539,21 @@ EIGEN_DEVICE_FUNC ComputationInfo computeFromTridiagonal_impl(DiagType& diag, Su
|
||||
typedef typename DiagType::RealScalar RealScalar;
|
||||
const RealScalar considerAsZero = (std::numeric_limits<RealScalar>::min)();
|
||||
const RealScalar precision_inv = RealScalar(1) / NumTraits<RealScalar>::epsilon();
|
||||
|
||||
// Helper lambda for the deflation test.
|
||||
auto deflate = [&](Index lo, Index hi) {
|
||||
for (Index i = lo; i < hi; ++i) {
|
||||
while (end > 0) {
|
||||
for (Index i = start; i < end; ++i) {
|
||||
if (numext::abs(subdiag[i]) < considerAsZero) {
|
||||
subdiag[i] = RealScalar(0);
|
||||
} else {
|
||||
// abs(subdiag[i]) <= epsilon * sqrt(abs(diag[i]) + abs(diag[i+1]))
|
||||
// Scaled to prevent underflows.
|
||||
const RealScalar scaled_subdiag = precision_inv * subdiag[i];
|
||||
if (scaled_subdiag * scaled_subdiag <= (numext::abs(diag[i]) + numext::abs(diag[i + 1]))) {
|
||||
subdiag[i] = RealScalar(0);
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// For per-block scaling, track the currently scaled block and its scale factor.
|
||||
// When the outer loop identifies a block outside the scaled region, unscale the old
|
||||
// block and scale the new one. This keeps the same outer loop structure (one QR step
|
||||
// per iteration) while ensuring each block is processed in scaled coordinates.
|
||||
Index scaled_start = -1, scaled_end = -1;
|
||||
RealScalar block_scale = RealScalar(1);
|
||||
|
||||
while (end > 0) {
|
||||
deflate(start, end);
|
||||
|
||||
// Find the largest unreduced block at the end of the matrix.
|
||||
// find the largest unreduced block at the end of the matrix.
|
||||
while (end > 0 && numext::is_exactly_zero(subdiag[end - 1])) {
|
||||
end--;
|
||||
}
|
||||
@@ -581,42 +566,9 @@ EIGEN_DEVICE_FUNC ComputationInfo computeFromTridiagonal_impl(DiagType& diag, Su
|
||||
start = end - 1;
|
||||
while (start > 0 && !numext::is_exactly_zero(subdiag[start - 1])) start--;
|
||||
|
||||
if (PerBlockScaling) {
|
||||
// Check if we've moved to a different block than the one currently scaled.
|
||||
if (start != scaled_start || end != scaled_end) {
|
||||
// Unscale the previous block if it was scaled.
|
||||
if (block_scale != RealScalar(1)) {
|
||||
for (Index i = scaled_start; i <= scaled_end; ++i) diag[i] /= block_scale;
|
||||
for (Index i = scaled_start; i < scaled_end; ++i) {
|
||||
if (!numext::is_exactly_zero(subdiag[i])) subdiag[i] /= block_scale;
|
||||
}
|
||||
block_scale = RealScalar(1);
|
||||
}
|
||||
// Compute the norm and scale the new block to [-1:1].
|
||||
RealScalar block_norm = RealScalar(0);
|
||||
for (Index i = start; i <= end; ++i) block_norm = numext::maxi(block_norm, numext::abs(diag[i]));
|
||||
for (Index i = start; i < end; ++i) block_norm = numext::maxi(block_norm, numext::abs(subdiag[i]));
|
||||
if (block_norm > RealScalar(0) && block_norm != RealScalar(1)) {
|
||||
block_scale = RealScalar(1) / block_norm;
|
||||
for (Index i = start; i <= end; ++i) diag[i] *= block_scale;
|
||||
for (Index i = start; i < end; ++i) subdiag[i] *= block_scale;
|
||||
}
|
||||
scaled_start = start;
|
||||
scaled_end = end;
|
||||
}
|
||||
}
|
||||
|
||||
internal::tridiagonal_qr_step<MatrixType::Flags & RowMajorBit ? RowMajor : ColMajor>(
|
||||
diag.data(), subdiag.data(), start, end, computeEigenvectors ? eivec.data() : (Scalar*)0, n);
|
||||
}
|
||||
|
||||
// Unscale any remaining scaled block.
|
||||
if (PerBlockScaling && block_scale != RealScalar(1)) {
|
||||
for (Index i = scaled_start; i <= scaled_end; ++i) diag[i] /= block_scale;
|
||||
for (Index i = scaled_start; i < scaled_end; ++i) {
|
||||
if (!numext::is_exactly_zero(subdiag[i])) subdiag[i] /= block_scale;
|
||||
}
|
||||
}
|
||||
if (iter <= maxIterations * n)
|
||||
info = Success;
|
||||
else
|
||||
|
||||
@@ -1,17 +0,0 @@
|
||||
# Benchmarks for Eigen's built-in BLAS implementation.
|
||||
# Compiles the Eigen BLAS sources directly into the benchmark executable
|
||||
# so there is no external BLAS dependency.
|
||||
|
||||
set(EIGEN_BLAS_SRCS
|
||||
${EIGEN_SOURCE_DIR}/blas/single.cpp
|
||||
${EIGEN_SOURCE_DIR}/blas/double.cpp
|
||||
${EIGEN_SOURCE_DIR}/blas/complex_single.cpp
|
||||
${EIGEN_SOURCE_DIR}/blas/complex_double.cpp
|
||||
${EIGEN_SOURCE_DIR}/blas/xerbla.cpp
|
||||
${EIGEN_SOURCE_DIR}/blas/lsame.cpp
|
||||
${EIGEN_SOURCE_DIR}/blas/complexdots.cpp
|
||||
)
|
||||
|
||||
eigen_add_benchmark(bench_blas bench_blas.cpp)
|
||||
target_sources(bench_blas PRIVATE ${EIGEN_BLAS_SRCS})
|
||||
target_include_directories(bench_blas PRIVATE ${EIGEN_SOURCE_DIR}/blas)
|
||||
@@ -1,488 +0,0 @@
|
||||
// Benchmark for Eigen's BLAS implementation.
|
||||
//
|
||||
// Calls the Eigen BLAS C interface directly (the extern "C" functions defined
|
||||
// in blas/{single,double,complex_single,complex_double}.cpp).
|
||||
//
|
||||
// Covers Level 1, 2, and 3 routines — with emphasis on the routines that
|
||||
// were recently rewritten from f2c to C++: rotm, rotmg, spmv, sbmv, hbmv,
|
||||
// hpmv, tbmv, lsame, and complex dot products.
|
||||
|
||||
#include <benchmark/benchmark.h>
|
||||
|
||||
#include <Eigen/Core>
|
||||
#include <complex>
|
||||
#include <vector>
|
||||
|
||||
#include "blas/blas.h"
|
||||
|
||||
using Eigen::Index;
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Helpers
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// Flop-rate counter (units = individual flops per call).
|
||||
static benchmark::Counter GflopsCounter(double flops) {
|
||||
return benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
|
||||
}
|
||||
|
||||
// Fill a vector with random values in [-1, 1].
|
||||
template <typename T>
|
||||
static void fillRand(T* data, Index n) {
|
||||
Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, 1>>(data, n).setRandom();
|
||||
}
|
||||
|
||||
// Fill a symmetric band matrix A in BLAS band storage (column-major).
|
||||
// Upper triangle: A[i,j] stored at a[(k+i-j) + j*lda], 0 <= j-i <= k.
|
||||
template <typename T>
|
||||
static void fillSymBandUpper(T* a, int n, int k, int lda) {
|
||||
std::fill(a, a + lda * n, T(0));
|
||||
for (int j = 0; j < n; ++j)
|
||||
for (int i = std::max(0, j - k); i <= j; ++i) a[(k + i - j) + j * lda] = T(std::rand()) / T(RAND_MAX) - T(0.5);
|
||||
}
|
||||
|
||||
// Fill a packed symmetric matrix (upper triangle, column-major).
|
||||
template <typename T>
|
||||
static void fillSymPacked(T* ap, int n) {
|
||||
int sz = n * (n + 1) / 2;
|
||||
for (int i = 0; i < sz; ++i) ap[i] = T(std::rand()) / T(RAND_MAX) - T(0.5);
|
||||
}
|
||||
|
||||
// Fill a triangular band matrix in BLAS band storage (upper, column-major).
|
||||
template <typename T>
|
||||
static void fillTriBandUpper(T* a, int n, int k, int lda) {
|
||||
std::fill(a, a + lda * n, T(0));
|
||||
for (int j = 0; j < n; ++j)
|
||||
for (int i = std::max(0, j - k); i <= j; ++i) {
|
||||
T val = T(std::rand()) / T(RAND_MAX) - T(0.5);
|
||||
if (i == j) val += T(n); // diagonal dominance
|
||||
a[(k + i - j) + j * lda] = val;
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Type-dispatched BLAS wrappers
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
inline float blas_dot(int* n, float* x, int* incx, float* y, int* incy) { return sdot_(n, x, incx, y, incy); }
|
||||
inline double blas_dot(int* n, double* x, int* incx, double* y, int* incy) { return ddot_(n, x, incx, y, incy); }
|
||||
|
||||
inline void blas_axpy(int* n, float* a, float* x, int* incx, float* y, int* incy) { saxpy_(n, a, x, incx, y, incy); }
|
||||
inline void blas_axpy(int* n, double* a, double* x, int* incx, double* y, int* incy) { daxpy_(n, a, x, incx, y, incy); }
|
||||
|
||||
inline float blas_nrm2(int* n, float* x, int* incx) { return snrm2_(n, x, incx); }
|
||||
inline double blas_nrm2(int* n, double* x, int* incx) { return dnrm2_(n, x, incx); }
|
||||
|
||||
inline void blas_rotm(int* n, float* x, int* incx, float* y, int* incy, float* p) { srotm_(n, x, incx, y, incy, p); }
|
||||
inline void blas_rotm(int* n, double* x, int* incx, double* y, int* incy, double* p) { drotm_(n, x, incx, y, incy, p); }
|
||||
|
||||
inline void blas_rotmg(float* d1, float* d2, float* x1, float* y1, float* p) { srotmg_(d1, d2, x1, y1, p); }
|
||||
inline void blas_rotmg(double* d1, double* d2, double* x1, double* y1, double* p) { drotmg_(d1, d2, x1, y1, p); }
|
||||
|
||||
inline void blas_dotcw(int* n, float* cx, int* incx, float* cy, int* incy, float* res) {
|
||||
cdotcw_(n, cx, incx, cy, incy, res);
|
||||
}
|
||||
inline void blas_dotcw(int* n, double* cx, int* incx, double* cy, int* incy, double* res) {
|
||||
zdotcw_(n, cx, incx, cy, incy, res);
|
||||
}
|
||||
|
||||
inline void blas_gemv(char* t, int* m, int* n, float* a, float* A, int* lda, float* x, int* incx, float* b, float* y,
|
||||
int* incy) {
|
||||
sgemv_(t, m, n, a, A, lda, x, incx, b, y, incy);
|
||||
}
|
||||
inline void blas_gemv(char* t, int* m, int* n, double* a, double* A, int* lda, double* x, int* incx, double* b,
|
||||
double* y, int* incy) {
|
||||
dgemv_(t, m, n, a, A, lda, x, incx, b, y, incy);
|
||||
}
|
||||
|
||||
inline void blas_spmv(char* uplo, int* n, float* alpha, float* ap, float* x, int* incx, float* beta, float* y,
|
||||
int* incy) {
|
||||
sspmv_(uplo, n, alpha, ap, x, incx, beta, y, incy);
|
||||
}
|
||||
inline void blas_spmv(char* uplo, int* n, double* alpha, double* ap, double* x, int* incx, double* beta, double* y,
|
||||
int* incy) {
|
||||
dspmv_(uplo, n, alpha, ap, x, incx, beta, y, incy);
|
||||
}
|
||||
|
||||
inline void blas_sbmv(char* uplo, int* n, int* k, float* alpha, float* a, int* lda, float* x, int* incx, float* beta,
|
||||
float* y, int* incy) {
|
||||
ssbmv_(uplo, n, k, alpha, a, lda, x, incx, beta, y, incy);
|
||||
}
|
||||
inline void blas_sbmv(char* uplo, int* n, int* k, double* alpha, double* a, int* lda, double* x, int* incx,
|
||||
double* beta, double* y, int* incy) {
|
||||
dsbmv_(uplo, n, k, alpha, a, lda, x, incx, beta, y, incy);
|
||||
}
|
||||
|
||||
inline void blas_tbmv(char* uplo, char* trans, char* diag, int* n, int* k, float* a, int* lda, float* x, int* incx) {
|
||||
stbmv_(uplo, trans, diag, n, k, a, lda, x, incx);
|
||||
}
|
||||
inline void blas_tbmv(char* uplo, char* trans, char* diag, int* n, int* k, double* a, int* lda, double* x, int* incx) {
|
||||
dtbmv_(uplo, trans, diag, n, k, a, lda, x, incx);
|
||||
}
|
||||
|
||||
inline void blas_hbmv(char* uplo, int* n, int* k, float* alpha, float* a, int* lda, float* x, int* incx, float* beta,
|
||||
float* y, int* incy) {
|
||||
chbmv_(uplo, n, k, alpha, a, lda, x, incx, beta, y, incy);
|
||||
}
|
||||
inline void blas_hbmv(char* uplo, int* n, int* k, double* alpha, double* a, int* lda, double* x, int* incx,
|
||||
double* beta, double* y, int* incy) {
|
||||
zhbmv_(uplo, n, k, alpha, a, lda, x, incx, beta, y, incy);
|
||||
}
|
||||
|
||||
inline void blas_hpmv(char* uplo, int* n, float* alpha, float* ap, float* x, int* incx, float* beta, float* y,
|
||||
int* incy) {
|
||||
chpmv_(uplo, n, alpha, ap, x, incx, beta, y, incy);
|
||||
}
|
||||
inline void blas_hpmv(char* uplo, int* n, double* alpha, double* ap, double* x, int* incx, double* beta, double* y,
|
||||
int* incy) {
|
||||
zhpmv_(uplo, n, alpha, ap, x, incx, beta, y, incy);
|
||||
}
|
||||
|
||||
inline void blas_gemm(char* ta, char* tb, int* m, int* n, int* k, float* alpha, float* a, int* lda, float* b, int* ldb,
|
||||
float* beta, float* c, int* ldc) {
|
||||
sgemm_(ta, tb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
|
||||
}
|
||||
inline void blas_gemm(char* ta, char* tb, int* m, int* n, int* k, double* alpha, double* a, int* lda, double* b,
|
||||
int* ldb, double* beta, double* c, int* ldc) {
|
||||
dgemm_(ta, tb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
|
||||
}
|
||||
|
||||
// =========================================================================
|
||||
// Level 1 — Real
|
||||
// =========================================================================
|
||||
|
||||
// ----- SDOT / DDOT -----
|
||||
template <typename T>
|
||||
static void BM_dot(benchmark::State& state) {
|
||||
int n = static_cast<int>(state.range(0));
|
||||
int one = 1;
|
||||
std::vector<T> x(n), y(n);
|
||||
fillRand(x.data(), n);
|
||||
fillRand(y.data(), n);
|
||||
for (auto _ : state) {
|
||||
T r = blas_dot(&n, x.data(), &one, y.data(), &one);
|
||||
benchmark::DoNotOptimize(r);
|
||||
}
|
||||
state.counters["GFLOPS"] = GflopsCounter(2.0 * n);
|
||||
}
|
||||
|
||||
// ----- SAXPY / DAXPY -----
|
||||
template <typename T>
|
||||
static void BM_axpy(benchmark::State& state) {
|
||||
int n = static_cast<int>(state.range(0));
|
||||
int one = 1;
|
||||
T alpha = T(2.5);
|
||||
std::vector<T> x(n), y(n);
|
||||
fillRand(x.data(), n);
|
||||
fillRand(y.data(), n);
|
||||
for (auto _ : state) {
|
||||
blas_axpy(&n, &alpha, x.data(), &one, y.data(), &one);
|
||||
benchmark::DoNotOptimize(y.data());
|
||||
}
|
||||
state.counters["GFLOPS"] = GflopsCounter(2.0 * n);
|
||||
}
|
||||
|
||||
// ----- SNRM2 / DNRM2 -----
|
||||
template <typename T>
|
||||
static void BM_nrm2(benchmark::State& state) {
|
||||
int n = static_cast<int>(state.range(0));
|
||||
int one = 1;
|
||||
std::vector<T> x(n);
|
||||
fillRand(x.data(), n);
|
||||
for (auto _ : state) {
|
||||
T r = blas_nrm2(&n, x.data(), &one);
|
||||
benchmark::DoNotOptimize(r);
|
||||
}
|
||||
// Nominal flops; Eigen's stableNorm() does more work internally.
|
||||
state.counters["GFLOPS"] = GflopsCounter(2.0 * n - 1);
|
||||
}
|
||||
|
||||
// ----- SROTM / DROTM -----
|
||||
template <typename T>
|
||||
static void BM_rotm(benchmark::State& state) {
|
||||
int n = static_cast<int>(state.range(0));
|
||||
int one = 1;
|
||||
std::vector<T> x(n), y(n);
|
||||
T param[5] = {T(-1), T(0.6), T(-0.8), T(0.8), T(0.6)}; // full rotation
|
||||
fillRand(x.data(), n);
|
||||
fillRand(y.data(), n);
|
||||
for (auto _ : state) {
|
||||
blas_rotm(&n, x.data(), &one, y.data(), &one, param);
|
||||
benchmark::DoNotOptimize(x.data());
|
||||
benchmark::DoNotOptimize(y.data());
|
||||
}
|
||||
// 4 muls + 2 adds per element pair.
|
||||
state.counters["GFLOPS"] = GflopsCounter(6.0 * n);
|
||||
}
|
||||
|
||||
// ----- SROTMG / DROTMG -----
|
||||
template <typename T>
|
||||
static void BM_rotmg(benchmark::State& state) {
|
||||
T d1 = T(2), d2 = T(3), x1 = T(1), y1 = T(0.5);
|
||||
T param[5];
|
||||
for (auto _ : state) {
|
||||
T td1 = d1, td2 = d2, tx1 = x1;
|
||||
blas_rotmg(&td1, &td2, &tx1, &y1, param);
|
||||
benchmark::DoNotOptimize(param);
|
||||
}
|
||||
}
|
||||
|
||||
// =========================================================================
|
||||
// Level 1 — Complex
|
||||
// =========================================================================
|
||||
|
||||
// Complex conjugate dot product via the worker functions (cdotcw_ / zdotcw_)
|
||||
// which use an output pointer, avoiding the ABI ambiguity of the struct-returning
|
||||
// cdotc_ / zdotc_ wrappers.
|
||||
template <typename T>
|
||||
static void BM_dotc(benchmark::State& state) {
|
||||
int n = static_cast<int>(state.range(0));
|
||||
int one = 1;
|
||||
std::vector<T> x(2 * n), y(2 * n); // interleaved real/imag
|
||||
fillRand(x.data(), 2 * n);
|
||||
fillRand(y.data(), 2 * n);
|
||||
T res[2];
|
||||
for (auto _ : state) {
|
||||
blas_dotcw(&n, x.data(), &one, y.data(), &one, res);
|
||||
benchmark::DoNotOptimize(res);
|
||||
}
|
||||
// Conjugate dot: 6 mul + 2 add per element = 8n flops.
|
||||
state.counters["GFLOPS"] = GflopsCounter(8.0 * n);
|
||||
}
|
||||
|
||||
// =========================================================================
|
||||
// Level 2 — General Matrix-Vector (SGEMV / DGEMV)
|
||||
// =========================================================================
|
||||
|
||||
template <typename T>
|
||||
static void BM_gemv(benchmark::State& state) {
|
||||
int m = static_cast<int>(state.range(0));
|
||||
int n = static_cast<int>(state.range(1));
|
||||
int one = 1;
|
||||
T alpha = T(1), beta = T(0);
|
||||
char trans = 'N';
|
||||
std::vector<T> a(m * n), x(n), y(m);
|
||||
fillRand(a.data(), m * n);
|
||||
fillRand(x.data(), n);
|
||||
fillRand(y.data(), m);
|
||||
for (auto _ : state) {
|
||||
blas_gemv(&trans, &m, &n, &alpha, a.data(), &m, x.data(), &one, &beta, y.data(), &one);
|
||||
benchmark::DoNotOptimize(y.data());
|
||||
}
|
||||
state.counters["GFLOPS"] = GflopsCounter(2.0 * m * n);
|
||||
}
|
||||
|
||||
// =========================================================================
|
||||
// Level 2 — Symmetric Packed (SSPMV / DSPMV)
|
||||
// =========================================================================
|
||||
|
||||
template <typename T>
|
||||
static void BM_spmv(benchmark::State& state) {
|
||||
int n = static_cast<int>(state.range(0));
|
||||
int one = 1;
|
||||
T alpha = T(1), beta = T(0);
|
||||
char uplo = 'U';
|
||||
int sz = n * (n + 1) / 2;
|
||||
std::vector<T> ap(sz), x(n), y(n);
|
||||
fillSymPacked(ap.data(), n);
|
||||
fillRand(x.data(), n);
|
||||
fillRand(y.data(), n);
|
||||
for (auto _ : state) {
|
||||
blas_spmv(&uplo, &n, &alpha, ap.data(), x.data(), &one, &beta, y.data(), &one);
|
||||
benchmark::DoNotOptimize(y.data());
|
||||
}
|
||||
// Symmetric: each off-diag element contributes to two y entries.
|
||||
state.counters["GFLOPS"] = GflopsCounter(2.0 * n * n);
|
||||
}
|
||||
|
||||
// =========================================================================
|
||||
// Level 2 — Symmetric Band (SSBMV / DSBMV)
|
||||
// =========================================================================
|
||||
|
||||
template <typename T>
|
||||
static void BM_sbmv(benchmark::State& state) {
|
||||
int n = static_cast<int>(state.range(0));
|
||||
int k = static_cast<int>(state.range(1));
|
||||
int lda = k + 1;
|
||||
int one = 1;
|
||||
T alpha = T(1), beta = T(0);
|
||||
char uplo = 'U';
|
||||
std::vector<T> a(lda * n), x(n), y(n);
|
||||
fillSymBandUpper(a.data(), n, k, lda);
|
||||
fillRand(x.data(), n);
|
||||
fillRand(y.data(), n);
|
||||
for (auto _ : state) {
|
||||
blas_sbmv(&uplo, &n, &k, &alpha, a.data(), &lda, x.data(), &one, &beta, y.data(), &one);
|
||||
benchmark::DoNotOptimize(y.data());
|
||||
}
|
||||
state.counters["GFLOPS"] = GflopsCounter(2.0 * n * (2 * k + 1));
|
||||
}
|
||||
|
||||
// =========================================================================
|
||||
// Level 2 — Triangular Band (STBMV / DTBMV)
|
||||
// =========================================================================
|
||||
|
||||
template <typename T>
|
||||
static void BM_tbmv(benchmark::State& state) {
|
||||
int n = static_cast<int>(state.range(0));
|
||||
int k = static_cast<int>(state.range(1));
|
||||
int lda = k + 1;
|
||||
int one = 1;
|
||||
char uplo = 'U', trans = 'N', diag = 'N';
|
||||
std::vector<T> a(lda * n), x(n), x_orig(n);
|
||||
fillTriBandUpper(a.data(), n, k, lda);
|
||||
fillRand(x_orig.data(), n);
|
||||
for (auto _ : state) {
|
||||
state.PauseTiming();
|
||||
std::copy(x_orig.begin(), x_orig.end(), x.begin());
|
||||
state.ResumeTiming();
|
||||
blas_tbmv(&uplo, &trans, &diag, &n, &k, a.data(), &lda, x.data(), &one);
|
||||
benchmark::DoNotOptimize(x.data());
|
||||
}
|
||||
state.counters["GFLOPS"] = GflopsCounter(1.0 * n * (k + 1));
|
||||
}
|
||||
|
||||
// =========================================================================
|
||||
// Level 2 — Hermitian Band (CHBMV / ZHBMV)
|
||||
// =========================================================================
|
||||
|
||||
template <typename T>
|
||||
static void BM_hbmv(benchmark::State& state) {
|
||||
int n = static_cast<int>(state.range(0));
|
||||
int k = static_cast<int>(state.range(1));
|
||||
int lda = k + 1;
|
||||
int one = 1;
|
||||
char uplo = 'U';
|
||||
// Complex: each element is 2 reals.
|
||||
std::vector<T> a(2 * lda * n), x(2 * n), y(2 * n);
|
||||
T alpha[2] = {T(1), T(0)};
|
||||
T beta[2] = {T(0), T(0)};
|
||||
fillRand(a.data(), 2 * lda * n);
|
||||
// Make diagonal real (imag part = 0).
|
||||
for (int j = 0; j < n; ++j) a[2 * (k + j * lda) + 1] = T(0);
|
||||
fillRand(x.data(), 2 * n);
|
||||
fillRand(y.data(), 2 * n);
|
||||
for (auto _ : state) {
|
||||
blas_hbmv(&uplo, &n, &k, alpha, a.data(), &lda, x.data(), &one, beta, y.data(), &one);
|
||||
benchmark::DoNotOptimize(y.data());
|
||||
}
|
||||
// Complex hermitian band: 8*n*(2k+1) flops approximately.
|
||||
state.counters["GFLOPS"] = GflopsCounter(8.0 * n * (2 * k + 1));
|
||||
}
|
||||
|
||||
// =========================================================================
|
||||
// Level 2 — Hermitian Packed (CHPMV / ZHPMV)
|
||||
// =========================================================================
|
||||
|
||||
template <typename T>
|
||||
static void BM_hpmv(benchmark::State& state) {
|
||||
int n = static_cast<int>(state.range(0));
|
||||
int one = 1;
|
||||
char uplo = 'U';
|
||||
int sz = n * (n + 1) / 2;
|
||||
std::vector<T> ap(2 * sz), x(2 * n), y(2 * n);
|
||||
T alpha[2] = {T(1), T(0)};
|
||||
T beta[2] = {T(0), T(0)};
|
||||
fillRand(ap.data(), 2 * sz);
|
||||
// Make diagonal real.
|
||||
int kk = 0;
|
||||
for (int j = 0; j < n; ++j) {
|
||||
ap[2 * (kk + j) + 1] = T(0);
|
||||
kk += j + 1;
|
||||
}
|
||||
fillRand(x.data(), 2 * n);
|
||||
fillRand(y.data(), 2 * n);
|
||||
for (auto _ : state) {
|
||||
blas_hpmv(&uplo, &n, alpha, ap.data(), x.data(), &one, beta, y.data(), &one);
|
||||
benchmark::DoNotOptimize(y.data());
|
||||
}
|
||||
state.counters["GFLOPS"] = GflopsCounter(8.0 * n * n);
|
||||
}
|
||||
|
||||
// =========================================================================
|
||||
// Level 3 — General Matrix Multiply (SGEMM / DGEMM)
|
||||
// =========================================================================
|
||||
|
||||
template <typename T>
|
||||
static void BM_gemm(benchmark::State& state) {
|
||||
int n = static_cast<int>(state.range(0));
|
||||
T alpha = T(1), beta = T(0);
|
||||
char trans = 'N';
|
||||
std::vector<T> a(n * n), b(n * n), c(n * n);
|
||||
fillRand(a.data(), n * n);
|
||||
fillRand(b.data(), n * n);
|
||||
fillRand(c.data(), n * n);
|
||||
for (auto _ : state) {
|
||||
blas_gemm(&trans, &trans, &n, &n, &n, &alpha, a.data(), &n, b.data(), &n, &beta, c.data(), &n);
|
||||
benchmark::DoNotOptimize(c.data());
|
||||
}
|
||||
state.counters["GFLOPS"] = GflopsCounter(2.0 * n * n * n);
|
||||
}
|
||||
|
||||
// =========================================================================
|
||||
// Register benchmarks
|
||||
// =========================================================================
|
||||
|
||||
// clang-format off
|
||||
|
||||
// --- Vector sizes for Level 1 ---
|
||||
#define L1_SIZES ->Arg(64)->Arg(256)->Arg(1024)->Arg(4096)->Arg(16384)->Arg(65536)
|
||||
|
||||
BENCHMARK(BM_dot<float>) L1_SIZES ->Name("sdot");
|
||||
BENCHMARK(BM_dot<double>) L1_SIZES ->Name("ddot");
|
||||
BENCHMARK(BM_axpy<float>) L1_SIZES ->Name("saxpy");
|
||||
BENCHMARK(BM_axpy<double>) L1_SIZES ->Name("daxpy");
|
||||
BENCHMARK(BM_nrm2<float>) L1_SIZES ->Name("snrm2");
|
||||
BENCHMARK(BM_nrm2<double>) L1_SIZES ->Name("dnrm2");
|
||||
BENCHMARK(BM_rotm<float>) L1_SIZES ->Name("srotm");
|
||||
BENCHMARK(BM_rotm<double>) L1_SIZES ->Name("drotm");
|
||||
BENCHMARK(BM_rotmg<float>) ->Name("srotmg");
|
||||
BENCHMARK(BM_rotmg<double>) ->Name("drotmg");
|
||||
BENCHMARK(BM_dotc<float>) L1_SIZES ->Name("cdotc");
|
||||
BENCHMARK(BM_dotc<double>) L1_SIZES ->Name("zdotc");
|
||||
|
||||
#undef L1_SIZES
|
||||
|
||||
// --- Matrix sizes for Level 2 ---
|
||||
// GEMV: {m, n}
|
||||
#define GEMV_SIZES \
|
||||
->Args({64, 64})->Args({256, 256})->Args({1024, 1024})->Args({4096, 4096}) \
|
||||
->Args({4096, 64})->Args({64, 4096})
|
||||
|
||||
BENCHMARK(BM_gemv<float>) GEMV_SIZES ->Name("sgemv");
|
||||
BENCHMARK(BM_gemv<double>) GEMV_SIZES ->Name("dgemv");
|
||||
#undef GEMV_SIZES
|
||||
|
||||
// Symmetric packed: {n}
|
||||
#define SPM_SIZES ->Arg(64)->Arg(256)->Arg(1024)->Arg(4096)
|
||||
|
||||
BENCHMARK(BM_spmv<float>) SPM_SIZES ->Name("sspmv");
|
||||
BENCHMARK(BM_spmv<double>) SPM_SIZES ->Name("dspmv");
|
||||
BENCHMARK(BM_hpmv<float>) SPM_SIZES ->Name("chpmv");
|
||||
BENCHMARK(BM_hpmv<double>) SPM_SIZES ->Name("zhpmv");
|
||||
|
||||
#undef SPM_SIZES
|
||||
|
||||
// Band: {n, k}
|
||||
#define BAND_SIZES \
|
||||
->Args({256, 4})->Args({256, 32})->Args({1024, 4})->Args({1024, 32}) \
|
||||
->Args({4096, 4})->Args({4096, 32})->Args({4096, 128})
|
||||
|
||||
BENCHMARK(BM_sbmv<float>) BAND_SIZES ->Name("ssbmv");
|
||||
BENCHMARK(BM_sbmv<double>) BAND_SIZES ->Name("dsbmv");
|
||||
BENCHMARK(BM_tbmv<float>) BAND_SIZES ->Name("stbmv");
|
||||
BENCHMARK(BM_tbmv<double>) BAND_SIZES ->Name("dtbmv");
|
||||
BENCHMARK(BM_hbmv<float>) BAND_SIZES ->Name("chbmv");
|
||||
BENCHMARK(BM_hbmv<double>) BAND_SIZES ->Name("zhbmv");
|
||||
|
||||
#undef BAND_SIZES
|
||||
|
||||
// --- Square sizes for Level 3 ---
|
||||
#define GEMM_SIZES ->Arg(32)->Arg(64)->Arg(128)->Arg(256)->Arg(512)->Arg(1024)
|
||||
|
||||
BENCHMARK(BM_gemm<float>) GEMM_SIZES ->Name("sgemm");
|
||||
BENCHMARK(BM_gemm<double>) GEMM_SIZES ->Name("dgemm");
|
||||
|
||||
#undef GEMM_SIZES
|
||||
|
||||
// clang-format on
|
||||
@@ -20,11 +20,7 @@ function(eigen_add_benchmark name source)
|
||||
if(BENCH_LIBRARIES)
|
||||
target_link_libraries(${name} PRIVATE ${BENCH_LIBRARIES})
|
||||
endif()
|
||||
target_compile_options(${name} PRIVATE
|
||||
$<$<CXX_COMPILER_ID:MSVC>:/O2>
|
||||
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-O3>
|
||||
)
|
||||
target_compile_definitions(${name} PRIVATE NDEBUG)
|
||||
target_compile_options(${name} PRIVATE -O3 -DNDEBUG)
|
||||
if(BENCH_DEFINITIONS)
|
||||
target_compile_definitions(${name} PRIVATE ${BENCH_DEFINITIONS})
|
||||
endif()
|
||||
@@ -42,4 +38,3 @@ add_subdirectory(FFT)
|
||||
add_subdirectory(Householder)
|
||||
add_subdirectory(Solvers)
|
||||
add_subdirectory(Tuning)
|
||||
add_subdirectory(BLAS)
|
||||
|
||||
@@ -6,7 +6,11 @@ if(EIGEN_BUILD_BLAS)
|
||||
add_custom_target(blas)
|
||||
|
||||
set(EigenBlas_SRCS single.cpp double.cpp complex_single.cpp complex_double.cpp xerbla.cpp
|
||||
lsame.cpp complexdots.cpp
|
||||
f2c/srotm.c f2c/srotmg.c f2c/drotm.c f2c/drotmg.c
|
||||
f2c/lsame.c f2c/dspmv.c f2c/ssbmv.c f2c/chbmv.c
|
||||
f2c/sspmv.c f2c/zhbmv.c f2c/chpmv.c f2c/dsbmv.c
|
||||
f2c/zhpmv.c f2c/dtbmv.c f2c/stbmv.c f2c/ctbmv.c
|
||||
f2c/ztbmv.c f2c/complexdots.c
|
||||
)
|
||||
|
||||
set(EIGEN_BLAS_TARGETS "")
|
||||
|
||||
@@ -1,72 +0,0 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
// C++ replacements for the f2c complex dot product wrappers.
|
||||
// These are thin wrappers around the worker functions (cdotcw_, etc.)
|
||||
// defined in level1_cplx_impl.h.
|
||||
//
|
||||
// Note: blas.h declares these as void, but gfortran expects complex functions
|
||||
// to return by value. We define the correct signatures here and do not include
|
||||
// blas.h to avoid the conflicting declarations.
|
||||
|
||||
#if defined(_WIN32)
|
||||
#if defined(EIGEN_BLAS_BUILD_DLL)
|
||||
#define EIGEN_BLAS_CDOT_API __declspec(dllexport)
|
||||
#else
|
||||
#define EIGEN_BLAS_CDOT_API
|
||||
#endif
|
||||
#elif ((defined(__GNUC__) && __GNUC__ >= 4) || defined(__clang__)) && defined(EIGEN_BLAS_BUILD_DLL)
|
||||
#define EIGEN_BLAS_CDOT_API __attribute__((visibility("default")))
|
||||
#else
|
||||
#define EIGEN_BLAS_CDOT_API
|
||||
#endif
|
||||
|
||||
extern "C" {
|
||||
|
||||
// Worker function declarations (defined in level1_cplx_impl.h via complex_single.cpp / complex_double.cpp).
|
||||
void cdotcw_(int *n, float *cx, int *incx, float *cy, int *incy, float *res);
|
||||
void cdotuw_(int *n, float *cx, int *incx, float *cy, int *incy, float *res);
|
||||
void zdotcw_(int *n, double *cx, int *incx, double *cy, int *incy, double *res);
|
||||
void zdotuw_(int *n, double *cx, int *incx, double *cy, int *incy, double *res);
|
||||
|
||||
// POD complex types for C-compatible return values (matches Fortran complex layout).
|
||||
struct eigen_blas_complex_float {
|
||||
float r, i;
|
||||
};
|
||||
struct eigen_blas_complex_double {
|
||||
double r, i;
|
||||
};
|
||||
|
||||
// CDOTC computes the conjugated dot product of two single-precision complex vectors.
|
||||
EIGEN_BLAS_CDOT_API eigen_blas_complex_float cdotc_(int *n, float *cx, int *incx, float *cy, int *incy) {
|
||||
eigen_blas_complex_float res = {0.0f, 0.0f};
|
||||
cdotcw_(n, cx, incx, cy, incy, &res.r);
|
||||
return res;
|
||||
}
|
||||
|
||||
// CDOTU computes the unconjugated dot product of two single-precision complex vectors.
|
||||
EIGEN_BLAS_CDOT_API eigen_blas_complex_float cdotu_(int *n, float *cx, int *incx, float *cy, int *incy) {
|
||||
eigen_blas_complex_float res = {0.0f, 0.0f};
|
||||
cdotuw_(n, cx, incx, cy, incy, &res.r);
|
||||
return res;
|
||||
}
|
||||
|
||||
// ZDOTC computes the conjugated dot product of two double-precision complex vectors.
|
||||
EIGEN_BLAS_CDOT_API eigen_blas_complex_double zdotc_(int *n, double *cx, int *incx, double *cy, int *incy) {
|
||||
eigen_blas_complex_double res = {0.0, 0.0};
|
||||
zdotcw_(n, cx, incx, cy, incy, &res.r);
|
||||
return res;
|
||||
}
|
||||
|
||||
// ZDOTU computes the unconjugated dot product of two double-precision complex vectors.
|
||||
EIGEN_BLAS_CDOT_API eigen_blas_complex_double zdotu_(int *n, double *cx, int *incx, double *cy, int *incy) {
|
||||
eigen_blas_complex_double res = {0.0, 0.0};
|
||||
zdotuw_(n, cx, incx, cy, incy, &res.r);
|
||||
return res;
|
||||
}
|
||||
|
||||
} // extern "C"
|
||||
456
blas/f2c/chbmv.c
Normal file
456
blas/f2c/chbmv.c
Normal file
@@ -0,0 +1,456 @@
|
||||
/* chbmv.f -- translated by f2c (version 20100827).
|
||||
You must link the resulting object file with libf2c:
|
||||
on Microsoft Windows system, link with libf2c.lib;
|
||||
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
|
||||
or, if you install libf2c.a in a standard place, with -lf2c -lm
|
||||
-- in that order, at the end of the command line, as in
|
||||
cc *.o -lf2c -lm
|
||||
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
|
||||
|
||||
http://www.netlib.org/f2c/libf2c.zip
|
||||
*/
|
||||
|
||||
#include "datatypes.h"
|
||||
|
||||
static inline void r_cnjg(complex *r, complex *z) {
|
||||
r->r = z->r;
|
||||
r->i = -(z->i);
|
||||
}
|
||||
|
||||
/* Subroutine */ void chbmv_(char *uplo, integer *n, integer *k, complex *alpha, complex *a, integer *lda, complex *x,
|
||||
integer *incx, complex *beta, complex *y, integer *incy) {
|
||||
/* System generated locals */
|
||||
integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5;
|
||||
real r__1;
|
||||
complex q__1, q__2, q__3, q__4;
|
||||
|
||||
/* Local variables */
|
||||
integer i__, j, l, ix, iy, jx, jy, kx, ky, info;
|
||||
complex temp1, temp2;
|
||||
extern logical lsame_(char *, char *);
|
||||
integer kplus1;
|
||||
extern /* Subroutine */ void xerbla_(const char *, integer *);
|
||||
|
||||
/* .. Scalar Arguments .. */
|
||||
/* .. */
|
||||
/* .. Array Arguments .. */
|
||||
/* .. */
|
||||
|
||||
/* Purpose */
|
||||
/* ======= */
|
||||
|
||||
/* CHBMV performs the matrix-vector operation */
|
||||
|
||||
/* y := alpha*A*x + beta*y, */
|
||||
|
||||
/* where alpha and beta are scalars, x and y are n element vectors and */
|
||||
/* A is an n by n hermitian band matrix, with k super-diagonals. */
|
||||
|
||||
/* Arguments */
|
||||
/* ========== */
|
||||
|
||||
/* UPLO - CHARACTER*1. */
|
||||
/* On entry, UPLO specifies whether the upper or lower */
|
||||
/* triangular part of the band matrix A is being supplied as */
|
||||
/* follows: */
|
||||
|
||||
/* UPLO = 'U' or 'u' The upper triangular part of A is */
|
||||
/* being supplied. */
|
||||
|
||||
/* UPLO = 'L' or 'l' The lower triangular part of A is */
|
||||
/* being supplied. */
|
||||
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* N - INTEGER. */
|
||||
/* On entry, N specifies the order of the matrix A. */
|
||||
/* N must be at least zero. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* K - INTEGER. */
|
||||
/* On entry, K specifies the number of super-diagonals of the */
|
||||
/* matrix A. K must satisfy 0 .le. K. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* ALPHA - COMPLEX . */
|
||||
/* On entry, ALPHA specifies the scalar alpha. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* A - COMPLEX array of DIMENSION ( LDA, n ). */
|
||||
/* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */
|
||||
/* by n part of the array A must contain the upper triangular */
|
||||
/* band part of the hermitian matrix, supplied column by */
|
||||
/* column, with the leading diagonal of the matrix in row */
|
||||
/* ( k + 1 ) of the array, the first super-diagonal starting at */
|
||||
/* position 2 in row k, and so on. The top left k by k triangle */
|
||||
/* of the array A is not referenced. */
|
||||
/* The following program segment will transfer the upper */
|
||||
/* triangular part of a hermitian band matrix from conventional */
|
||||
/* full matrix storage to band storage: */
|
||||
|
||||
/* DO 20, J = 1, N */
|
||||
/* M = K + 1 - J */
|
||||
/* DO 10, I = MAX( 1, J - K ), J */
|
||||
/* A( M + I, J ) = matrix( I, J ) */
|
||||
/* 10 CONTINUE */
|
||||
/* 20 CONTINUE */
|
||||
|
||||
/* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */
|
||||
/* by n part of the array A must contain the lower triangular */
|
||||
/* band part of the hermitian matrix, supplied column by */
|
||||
/* column, with the leading diagonal of the matrix in row 1 of */
|
||||
/* the array, the first sub-diagonal starting at position 1 in */
|
||||
/* row 2, and so on. The bottom right k by k triangle of the */
|
||||
/* array A is not referenced. */
|
||||
/* The following program segment will transfer the lower */
|
||||
/* triangular part of a hermitian band matrix from conventional */
|
||||
/* full matrix storage to band storage: */
|
||||
|
||||
/* DO 20, J = 1, N */
|
||||
/* M = 1 - J */
|
||||
/* DO 10, I = J, MIN( N, J + K ) */
|
||||
/* A( M + I, J ) = matrix( I, J ) */
|
||||
/* 10 CONTINUE */
|
||||
/* 20 CONTINUE */
|
||||
|
||||
/* Note that the imaginary parts of the diagonal elements need */
|
||||
/* not be set and are assumed to be zero. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* LDA - INTEGER. */
|
||||
/* On entry, LDA specifies the first dimension of A as declared */
|
||||
/* in the calling (sub) program. LDA must be at least */
|
||||
/* ( k + 1 ). */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* X - COMPLEX array of DIMENSION at least */
|
||||
/* ( 1 + ( n - 1 )*abs( INCX ) ). */
|
||||
/* Before entry, the incremented array X must contain the */
|
||||
/* vector x. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* INCX - INTEGER. */
|
||||
/* On entry, INCX specifies the increment for the elements of */
|
||||
/* X. INCX must not be zero. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* BETA - COMPLEX . */
|
||||
/* On entry, BETA specifies the scalar beta. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* Y - COMPLEX array of DIMENSION at least */
|
||||
/* ( 1 + ( n - 1 )*abs( INCY ) ). */
|
||||
/* Before entry, the incremented array Y must contain the */
|
||||
/* vector y. On exit, Y is overwritten by the updated vector y. */
|
||||
|
||||
/* INCY - INTEGER. */
|
||||
/* On entry, INCY specifies the increment for the elements of */
|
||||
/* Y. INCY must not be zero. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* Further Details */
|
||||
/* =============== */
|
||||
|
||||
/* Level 2 Blas routine. */
|
||||
|
||||
/* -- Written on 22-October-1986. */
|
||||
/* Jack Dongarra, Argonne National Lab. */
|
||||
/* Jeremy Du Croz, Nag Central Office. */
|
||||
/* Sven Hammarling, Nag Central Office. */
|
||||
/* Richard Hanson, Sandia National Labs. */
|
||||
|
||||
/* ===================================================================== */
|
||||
|
||||
/* .. Parameters .. */
|
||||
/* .. */
|
||||
/* .. Local Scalars .. */
|
||||
/* .. */
|
||||
/* .. External Functions .. */
|
||||
/* .. */
|
||||
/* .. External Subroutines .. */
|
||||
/* .. */
|
||||
/* .. Intrinsic Functions .. */
|
||||
/* .. */
|
||||
|
||||
/* Test the input parameters. */
|
||||
|
||||
/* Parameter adjustments */
|
||||
a_dim1 = *lda;
|
||||
a_offset = 1 + a_dim1;
|
||||
a -= a_offset;
|
||||
--x;
|
||||
--y;
|
||||
|
||||
/* Function Body */
|
||||
info = 0;
|
||||
if (!lsame_(uplo, "U") && !lsame_(uplo, "L")) {
|
||||
info = 1;
|
||||
} else if (*n < 0) {
|
||||
info = 2;
|
||||
} else if (*k < 0) {
|
||||
info = 3;
|
||||
} else if (*lda < *k + 1) {
|
||||
info = 6;
|
||||
} else if (*incx == 0) {
|
||||
info = 8;
|
||||
} else if (*incy == 0) {
|
||||
info = 11;
|
||||
}
|
||||
if (info != 0) {
|
||||
xerbla_("CHBMV ", &info);
|
||||
return;
|
||||
}
|
||||
|
||||
/* Quick return if possible. */
|
||||
|
||||
if (*n == 0 || (alpha->r == 0.f && alpha->i == 0.f && (beta->r == 1.f && beta->i == 0.f))) {
|
||||
return;
|
||||
}
|
||||
|
||||
/* Set up the start points in X and Y. */
|
||||
|
||||
if (*incx > 0) {
|
||||
kx = 1;
|
||||
} else {
|
||||
kx = 1 - (*n - 1) * *incx;
|
||||
}
|
||||
if (*incy > 0) {
|
||||
ky = 1;
|
||||
} else {
|
||||
ky = 1 - (*n - 1) * *incy;
|
||||
}
|
||||
|
||||
/* Start the operations. In this version the elements of the array A */
|
||||
/* are accessed sequentially with one pass through A. */
|
||||
|
||||
/* First form y := beta*y. */
|
||||
|
||||
if (beta->r != 1.f || beta->i != 0.f) {
|
||||
if (*incy == 1) {
|
||||
if (beta->r == 0.f && beta->i == 0.f) {
|
||||
i__1 = *n;
|
||||
for (i__ = 1; i__ <= i__1; ++i__) {
|
||||
i__2 = i__;
|
||||
y[i__2].r = 0.f, y[i__2].i = 0.f;
|
||||
/* L10: */
|
||||
}
|
||||
} else {
|
||||
i__1 = *n;
|
||||
for (i__ = 1; i__ <= i__1; ++i__) {
|
||||
i__2 = i__;
|
||||
i__3 = i__;
|
||||
q__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i, q__1.i = beta->r * y[i__3].i + beta->i * y[i__3].r;
|
||||
y[i__2].r = q__1.r, y[i__2].i = q__1.i;
|
||||
/* L20: */
|
||||
}
|
||||
}
|
||||
} else {
|
||||
iy = ky;
|
||||
if (beta->r == 0.f && beta->i == 0.f) {
|
||||
i__1 = *n;
|
||||
for (i__ = 1; i__ <= i__1; ++i__) {
|
||||
i__2 = iy;
|
||||
y[i__2].r = 0.f, y[i__2].i = 0.f;
|
||||
iy += *incy;
|
||||
/* L30: */
|
||||
}
|
||||
} else {
|
||||
i__1 = *n;
|
||||
for (i__ = 1; i__ <= i__1; ++i__) {
|
||||
i__2 = iy;
|
||||
i__3 = iy;
|
||||
q__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i, q__1.i = beta->r * y[i__3].i + beta->i * y[i__3].r;
|
||||
y[i__2].r = q__1.r, y[i__2].i = q__1.i;
|
||||
iy += *incy;
|
||||
/* L40: */
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (alpha->r == 0.f && alpha->i == 0.f) {
|
||||
return;
|
||||
}
|
||||
if (lsame_(uplo, "U")) {
|
||||
/* Form y when upper triangle of A is stored. */
|
||||
|
||||
kplus1 = *k + 1;
|
||||
if (*incx == 1 && *incy == 1) {
|
||||
i__1 = *n;
|
||||
for (j = 1; j <= i__1; ++j) {
|
||||
i__2 = j;
|
||||
q__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, q__1.i = alpha->r * x[i__2].i + alpha->i * x[i__2].r;
|
||||
temp1.r = q__1.r, temp1.i = q__1.i;
|
||||
temp2.r = 0.f, temp2.i = 0.f;
|
||||
l = kplus1 - j;
|
||||
/* Computing MAX */
|
||||
i__2 = 1, i__3 = j - *k;
|
||||
i__4 = j - 1;
|
||||
for (i__ = max(i__2, i__3); i__ <= i__4; ++i__) {
|
||||
i__2 = i__;
|
||||
i__3 = i__;
|
||||
i__5 = l + i__ + j * a_dim1;
|
||||
q__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i, q__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5].r;
|
||||
q__1.r = y[i__3].r + q__2.r, q__1.i = y[i__3].i + q__2.i;
|
||||
y[i__2].r = q__1.r, y[i__2].i = q__1.i;
|
||||
r_cnjg(&q__3, &a[l + i__ + j * a_dim1]);
|
||||
i__2 = i__;
|
||||
q__2.r = q__3.r * x[i__2].r - q__3.i * x[i__2].i, q__2.i = q__3.r * x[i__2].i + q__3.i * x[i__2].r;
|
||||
q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i;
|
||||
temp2.r = q__1.r, temp2.i = q__1.i;
|
||||
/* L50: */
|
||||
}
|
||||
i__4 = j;
|
||||
i__2 = j;
|
||||
i__3 = kplus1 + j * a_dim1;
|
||||
r__1 = a[i__3].r;
|
||||
q__3.r = r__1 * temp1.r, q__3.i = r__1 * temp1.i;
|
||||
q__2.r = y[i__2].r + q__3.r, q__2.i = y[i__2].i + q__3.i;
|
||||
q__4.r = alpha->r * temp2.r - alpha->i * temp2.i, q__4.i = alpha->r * temp2.i + alpha->i * temp2.r;
|
||||
q__1.r = q__2.r + q__4.r, q__1.i = q__2.i + q__4.i;
|
||||
y[i__4].r = q__1.r, y[i__4].i = q__1.i;
|
||||
/* L60: */
|
||||
}
|
||||
} else {
|
||||
jx = kx;
|
||||
jy = ky;
|
||||
i__1 = *n;
|
||||
for (j = 1; j <= i__1; ++j) {
|
||||
i__4 = jx;
|
||||
q__1.r = alpha->r * x[i__4].r - alpha->i * x[i__4].i, q__1.i = alpha->r * x[i__4].i + alpha->i * x[i__4].r;
|
||||
temp1.r = q__1.r, temp1.i = q__1.i;
|
||||
temp2.r = 0.f, temp2.i = 0.f;
|
||||
ix = kx;
|
||||
iy = ky;
|
||||
l = kplus1 - j;
|
||||
/* Computing MAX */
|
||||
i__4 = 1, i__2 = j - *k;
|
||||
i__3 = j - 1;
|
||||
for (i__ = max(i__4, i__2); i__ <= i__3; ++i__) {
|
||||
i__4 = iy;
|
||||
i__2 = iy;
|
||||
i__5 = l + i__ + j * a_dim1;
|
||||
q__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i, q__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5].r;
|
||||
q__1.r = y[i__2].r + q__2.r, q__1.i = y[i__2].i + q__2.i;
|
||||
y[i__4].r = q__1.r, y[i__4].i = q__1.i;
|
||||
r_cnjg(&q__3, &a[l + i__ + j * a_dim1]);
|
||||
i__4 = ix;
|
||||
q__2.r = q__3.r * x[i__4].r - q__3.i * x[i__4].i, q__2.i = q__3.r * x[i__4].i + q__3.i * x[i__4].r;
|
||||
q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i;
|
||||
temp2.r = q__1.r, temp2.i = q__1.i;
|
||||
ix += *incx;
|
||||
iy += *incy;
|
||||
/* L70: */
|
||||
}
|
||||
i__3 = jy;
|
||||
i__4 = jy;
|
||||
i__2 = kplus1 + j * a_dim1;
|
||||
r__1 = a[i__2].r;
|
||||
q__3.r = r__1 * temp1.r, q__3.i = r__1 * temp1.i;
|
||||
q__2.r = y[i__4].r + q__3.r, q__2.i = y[i__4].i + q__3.i;
|
||||
q__4.r = alpha->r * temp2.r - alpha->i * temp2.i, q__4.i = alpha->r * temp2.i + alpha->i * temp2.r;
|
||||
q__1.r = q__2.r + q__4.r, q__1.i = q__2.i + q__4.i;
|
||||
y[i__3].r = q__1.r, y[i__3].i = q__1.i;
|
||||
jx += *incx;
|
||||
jy += *incy;
|
||||
if (j > *k) {
|
||||
kx += *incx;
|
||||
ky += *incy;
|
||||
}
|
||||
/* L80: */
|
||||
}
|
||||
}
|
||||
} else {
|
||||
/* Form y when lower triangle of A is stored. */
|
||||
|
||||
if (*incx == 1 && *incy == 1) {
|
||||
i__1 = *n;
|
||||
for (j = 1; j <= i__1; ++j) {
|
||||
i__3 = j;
|
||||
q__1.r = alpha->r * x[i__3].r - alpha->i * x[i__3].i, q__1.i = alpha->r * x[i__3].i + alpha->i * x[i__3].r;
|
||||
temp1.r = q__1.r, temp1.i = q__1.i;
|
||||
temp2.r = 0.f, temp2.i = 0.f;
|
||||
i__3 = j;
|
||||
i__4 = j;
|
||||
i__2 = j * a_dim1 + 1;
|
||||
r__1 = a[i__2].r;
|
||||
q__2.r = r__1 * temp1.r, q__2.i = r__1 * temp1.i;
|
||||
q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i;
|
||||
y[i__3].r = q__1.r, y[i__3].i = q__1.i;
|
||||
l = 1 - j;
|
||||
/* Computing MIN */
|
||||
i__4 = *n, i__2 = j + *k;
|
||||
i__3 = min(i__4, i__2);
|
||||
for (i__ = j + 1; i__ <= i__3; ++i__) {
|
||||
i__4 = i__;
|
||||
i__2 = i__;
|
||||
i__5 = l + i__ + j * a_dim1;
|
||||
q__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i, q__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5].r;
|
||||
q__1.r = y[i__2].r + q__2.r, q__1.i = y[i__2].i + q__2.i;
|
||||
y[i__4].r = q__1.r, y[i__4].i = q__1.i;
|
||||
r_cnjg(&q__3, &a[l + i__ + j * a_dim1]);
|
||||
i__4 = i__;
|
||||
q__2.r = q__3.r * x[i__4].r - q__3.i * x[i__4].i, q__2.i = q__3.r * x[i__4].i + q__3.i * x[i__4].r;
|
||||
q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i;
|
||||
temp2.r = q__1.r, temp2.i = q__1.i;
|
||||
/* L90: */
|
||||
}
|
||||
i__3 = j;
|
||||
i__4 = j;
|
||||
q__2.r = alpha->r * temp2.r - alpha->i * temp2.i, q__2.i = alpha->r * temp2.i + alpha->i * temp2.r;
|
||||
q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i;
|
||||
y[i__3].r = q__1.r, y[i__3].i = q__1.i;
|
||||
/* L100: */
|
||||
}
|
||||
} else {
|
||||
jx = kx;
|
||||
jy = ky;
|
||||
i__1 = *n;
|
||||
for (j = 1; j <= i__1; ++j) {
|
||||
i__3 = jx;
|
||||
q__1.r = alpha->r * x[i__3].r - alpha->i * x[i__3].i, q__1.i = alpha->r * x[i__3].i + alpha->i * x[i__3].r;
|
||||
temp1.r = q__1.r, temp1.i = q__1.i;
|
||||
temp2.r = 0.f, temp2.i = 0.f;
|
||||
i__3 = jy;
|
||||
i__4 = jy;
|
||||
i__2 = j * a_dim1 + 1;
|
||||
r__1 = a[i__2].r;
|
||||
q__2.r = r__1 * temp1.r, q__2.i = r__1 * temp1.i;
|
||||
q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i;
|
||||
y[i__3].r = q__1.r, y[i__3].i = q__1.i;
|
||||
l = 1 - j;
|
||||
ix = jx;
|
||||
iy = jy;
|
||||
/* Computing MIN */
|
||||
i__4 = *n, i__2 = j + *k;
|
||||
i__3 = min(i__4, i__2);
|
||||
for (i__ = j + 1; i__ <= i__3; ++i__) {
|
||||
ix += *incx;
|
||||
iy += *incy;
|
||||
i__4 = iy;
|
||||
i__2 = iy;
|
||||
i__5 = l + i__ + j * a_dim1;
|
||||
q__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i, q__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5].r;
|
||||
q__1.r = y[i__2].r + q__2.r, q__1.i = y[i__2].i + q__2.i;
|
||||
y[i__4].r = q__1.r, y[i__4].i = q__1.i;
|
||||
r_cnjg(&q__3, &a[l + i__ + j * a_dim1]);
|
||||
i__4 = ix;
|
||||
q__2.r = q__3.r * x[i__4].r - q__3.i * x[i__4].i, q__2.i = q__3.r * x[i__4].i + q__3.i * x[i__4].r;
|
||||
q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i;
|
||||
temp2.r = q__1.r, temp2.i = q__1.i;
|
||||
/* L110: */
|
||||
}
|
||||
i__3 = jy;
|
||||
i__4 = jy;
|
||||
q__2.r = alpha->r * temp2.r - alpha->i * temp2.i, q__2.i = alpha->r * temp2.i + alpha->i * temp2.r;
|
||||
q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i;
|
||||
y[i__3].r = q__1.r, y[i__3].i = q__1.i;
|
||||
jx += *incx;
|
||||
jy += *incy;
|
||||
/* L120: */
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* End of CHBMV . */
|
||||
|
||||
} /* chbmv_ */
|
||||
407
blas/f2c/chpmv.c
Normal file
407
blas/f2c/chpmv.c
Normal file
@@ -0,0 +1,407 @@
|
||||
/* chpmv.f -- translated by f2c (version 20100827).
|
||||
You must link the resulting object file with libf2c:
|
||||
on Microsoft Windows system, link with libf2c.lib;
|
||||
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
|
||||
or, if you install libf2c.a in a standard place, with -lf2c -lm
|
||||
-- in that order, at the end of the command line, as in
|
||||
cc *.o -lf2c -lm
|
||||
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
|
||||
|
||||
http://www.netlib.org/f2c/libf2c.zip
|
||||
*/
|
||||
|
||||
#include "datatypes.h"
|
||||
|
||||
static inline void r_cnjg(complex *r, complex *z) {
|
||||
r->r = z->r;
|
||||
r->i = -(z->i);
|
||||
}
|
||||
|
||||
/* Subroutine */ void chpmv_(char *uplo, integer *n, complex *alpha, complex *ap, complex *x, integer *incx,
|
||||
complex *beta, complex *y, integer *incy) {
|
||||
/* System generated locals */
|
||||
integer i__1, i__2, i__3, i__4, i__5;
|
||||
real r__1;
|
||||
complex q__1, q__2, q__3, q__4;
|
||||
|
||||
/* Local variables */
|
||||
integer i__, j, k, kk, ix, iy, jx, jy, kx, ky, info;
|
||||
complex temp1, temp2;
|
||||
extern logical lsame_(char *, char *);
|
||||
extern /* Subroutine */ void xerbla_(const char *, integer *);
|
||||
|
||||
/* .. Scalar Arguments .. */
|
||||
/* .. */
|
||||
/* .. Array Arguments .. */
|
||||
/* .. */
|
||||
|
||||
/* Purpose */
|
||||
/* ======= */
|
||||
|
||||
/* CHPMV performs the matrix-vector operation */
|
||||
|
||||
/* y := alpha*A*x + beta*y, */
|
||||
|
||||
/* where alpha and beta are scalars, x and y are n element vectors and */
|
||||
/* A is an n by n hermitian matrix, supplied in packed form. */
|
||||
|
||||
/* Arguments */
|
||||
/* ========== */
|
||||
|
||||
/* UPLO - CHARACTER*1. */
|
||||
/* On entry, UPLO specifies whether the upper or lower */
|
||||
/* triangular part of the matrix A is supplied in the packed */
|
||||
/* array AP as follows: */
|
||||
|
||||
/* UPLO = 'U' or 'u' The upper triangular part of A is */
|
||||
/* supplied in AP. */
|
||||
|
||||
/* UPLO = 'L' or 'l' The lower triangular part of A is */
|
||||
/* supplied in AP. */
|
||||
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* N - INTEGER. */
|
||||
/* On entry, N specifies the order of the matrix A. */
|
||||
/* N must be at least zero. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* ALPHA - COMPLEX . */
|
||||
/* On entry, ALPHA specifies the scalar alpha. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* AP - COMPLEX array of DIMENSION at least */
|
||||
/* ( ( n*( n + 1 ) )/2 ). */
|
||||
/* Before entry with UPLO = 'U' or 'u', the array AP must */
|
||||
/* contain the upper triangular part of the hermitian matrix */
|
||||
/* packed sequentially, column by column, so that AP( 1 ) */
|
||||
/* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) */
|
||||
/* and a( 2, 2 ) respectively, and so on. */
|
||||
/* Before entry with UPLO = 'L' or 'l', the array AP must */
|
||||
/* contain the lower triangular part of the hermitian matrix */
|
||||
/* packed sequentially, column by column, so that AP( 1 ) */
|
||||
/* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) */
|
||||
/* and a( 3, 1 ) respectively, and so on. */
|
||||
/* Note that the imaginary parts of the diagonal elements need */
|
||||
/* not be set and are assumed to be zero. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* X - COMPLEX array of dimension at least */
|
||||
/* ( 1 + ( n - 1 )*abs( INCX ) ). */
|
||||
/* Before entry, the incremented array X must contain the n */
|
||||
/* element vector x. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* INCX - INTEGER. */
|
||||
/* On entry, INCX specifies the increment for the elements of */
|
||||
/* X. INCX must not be zero. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* BETA - COMPLEX . */
|
||||
/* On entry, BETA specifies the scalar beta. When BETA is */
|
||||
/* supplied as zero then Y need not be set on input. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* Y - COMPLEX array of dimension at least */
|
||||
/* ( 1 + ( n - 1 )*abs( INCY ) ). */
|
||||
/* Before entry, the incremented array Y must contain the n */
|
||||
/* element vector y. On exit, Y is overwritten by the updated */
|
||||
/* vector y. */
|
||||
|
||||
/* INCY - INTEGER. */
|
||||
/* On entry, INCY specifies the increment for the elements of */
|
||||
/* Y. INCY must not be zero. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* Further Details */
|
||||
/* =============== */
|
||||
|
||||
/* Level 2 Blas routine. */
|
||||
|
||||
/* -- Written on 22-October-1986. */
|
||||
/* Jack Dongarra, Argonne National Lab. */
|
||||
/* Jeremy Du Croz, Nag Central Office. */
|
||||
/* Sven Hammarling, Nag Central Office. */
|
||||
/* Richard Hanson, Sandia National Labs. */
|
||||
|
||||
/* ===================================================================== */
|
||||
|
||||
/* .. Parameters .. */
|
||||
/* .. */
|
||||
/* .. Local Scalars .. */
|
||||
/* .. */
|
||||
/* .. External Functions .. */
|
||||
/* .. */
|
||||
/* .. External Subroutines .. */
|
||||
/* .. */
|
||||
/* .. Intrinsic Functions .. */
|
||||
/* .. */
|
||||
|
||||
/* Test the input parameters. */
|
||||
|
||||
/* Parameter adjustments */
|
||||
--y;
|
||||
--x;
|
||||
--ap;
|
||||
|
||||
/* Function Body */
|
||||
info = 0;
|
||||
if (!lsame_(uplo, "U") && !lsame_(uplo, "L")) {
|
||||
info = 1;
|
||||
} else if (*n < 0) {
|
||||
info = 2;
|
||||
} else if (*incx == 0) {
|
||||
info = 6;
|
||||
} else if (*incy == 0) {
|
||||
info = 9;
|
||||
}
|
||||
if (info != 0) {
|
||||
xerbla_("CHPMV ", &info);
|
||||
return;
|
||||
}
|
||||
|
||||
/* Quick return if possible. */
|
||||
|
||||
if (*n == 0 || (alpha->r == 0.f && alpha->i == 0.f && (beta->r == 1.f && beta->i == 0.f))) {
|
||||
return;
|
||||
}
|
||||
|
||||
/* Set up the start points in X and Y. */
|
||||
|
||||
if (*incx > 0) {
|
||||
kx = 1;
|
||||
} else {
|
||||
kx = 1 - (*n - 1) * *incx;
|
||||
}
|
||||
if (*incy > 0) {
|
||||
ky = 1;
|
||||
} else {
|
||||
ky = 1 - (*n - 1) * *incy;
|
||||
}
|
||||
|
||||
/* Start the operations. In this version the elements of the array AP */
|
||||
/* are accessed sequentially with one pass through AP. */
|
||||
|
||||
/* First form y := beta*y. */
|
||||
|
||||
if (beta->r != 1.f || beta->i != 0.f) {
|
||||
if (*incy == 1) {
|
||||
if (beta->r == 0.f && beta->i == 0.f) {
|
||||
i__1 = *n;
|
||||
for (i__ = 1; i__ <= i__1; ++i__) {
|
||||
i__2 = i__;
|
||||
y[i__2].r = 0.f, y[i__2].i = 0.f;
|
||||
/* L10: */
|
||||
}
|
||||
} else {
|
||||
i__1 = *n;
|
||||
for (i__ = 1; i__ <= i__1; ++i__) {
|
||||
i__2 = i__;
|
||||
i__3 = i__;
|
||||
q__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i, q__1.i = beta->r * y[i__3].i + beta->i * y[i__3].r;
|
||||
y[i__2].r = q__1.r, y[i__2].i = q__1.i;
|
||||
/* L20: */
|
||||
}
|
||||
}
|
||||
} else {
|
||||
iy = ky;
|
||||
if (beta->r == 0.f && beta->i == 0.f) {
|
||||
i__1 = *n;
|
||||
for (i__ = 1; i__ <= i__1; ++i__) {
|
||||
i__2 = iy;
|
||||
y[i__2].r = 0.f, y[i__2].i = 0.f;
|
||||
iy += *incy;
|
||||
/* L30: */
|
||||
}
|
||||
} else {
|
||||
i__1 = *n;
|
||||
for (i__ = 1; i__ <= i__1; ++i__) {
|
||||
i__2 = iy;
|
||||
i__3 = iy;
|
||||
q__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i, q__1.i = beta->r * y[i__3].i + beta->i * y[i__3].r;
|
||||
y[i__2].r = q__1.r, y[i__2].i = q__1.i;
|
||||
iy += *incy;
|
||||
/* L40: */
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (alpha->r == 0.f && alpha->i == 0.f) {
|
||||
return;
|
||||
}
|
||||
kk = 1;
|
||||
if (lsame_(uplo, "U")) {
|
||||
/* Form y when AP contains the upper triangle. */
|
||||
|
||||
if (*incx == 1 && *incy == 1) {
|
||||
i__1 = *n;
|
||||
for (j = 1; j <= i__1; ++j) {
|
||||
i__2 = j;
|
||||
q__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, q__1.i = alpha->r * x[i__2].i + alpha->i * x[i__2].r;
|
||||
temp1.r = q__1.r, temp1.i = q__1.i;
|
||||
temp2.r = 0.f, temp2.i = 0.f;
|
||||
k = kk;
|
||||
i__2 = j - 1;
|
||||
for (i__ = 1; i__ <= i__2; ++i__) {
|
||||
i__3 = i__;
|
||||
i__4 = i__;
|
||||
i__5 = k;
|
||||
q__2.r = temp1.r * ap[i__5].r - temp1.i * ap[i__5].i, q__2.i = temp1.r * ap[i__5].i + temp1.i * ap[i__5].r;
|
||||
q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i;
|
||||
y[i__3].r = q__1.r, y[i__3].i = q__1.i;
|
||||
r_cnjg(&q__3, &ap[k]);
|
||||
i__3 = i__;
|
||||
q__2.r = q__3.r * x[i__3].r - q__3.i * x[i__3].i, q__2.i = q__3.r * x[i__3].i + q__3.i * x[i__3].r;
|
||||
q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i;
|
||||
temp2.r = q__1.r, temp2.i = q__1.i;
|
||||
++k;
|
||||
/* L50: */
|
||||
}
|
||||
i__2 = j;
|
||||
i__3 = j;
|
||||
i__4 = kk + j - 1;
|
||||
r__1 = ap[i__4].r;
|
||||
q__3.r = r__1 * temp1.r, q__3.i = r__1 * temp1.i;
|
||||
q__2.r = y[i__3].r + q__3.r, q__2.i = y[i__3].i + q__3.i;
|
||||
q__4.r = alpha->r * temp2.r - alpha->i * temp2.i, q__4.i = alpha->r * temp2.i + alpha->i * temp2.r;
|
||||
q__1.r = q__2.r + q__4.r, q__1.i = q__2.i + q__4.i;
|
||||
y[i__2].r = q__1.r, y[i__2].i = q__1.i;
|
||||
kk += j;
|
||||
/* L60: */
|
||||
}
|
||||
} else {
|
||||
jx = kx;
|
||||
jy = ky;
|
||||
i__1 = *n;
|
||||
for (j = 1; j <= i__1; ++j) {
|
||||
i__2 = jx;
|
||||
q__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, q__1.i = alpha->r * x[i__2].i + alpha->i * x[i__2].r;
|
||||
temp1.r = q__1.r, temp1.i = q__1.i;
|
||||
temp2.r = 0.f, temp2.i = 0.f;
|
||||
ix = kx;
|
||||
iy = ky;
|
||||
i__2 = kk + j - 2;
|
||||
for (k = kk; k <= i__2; ++k) {
|
||||
i__3 = iy;
|
||||
i__4 = iy;
|
||||
i__5 = k;
|
||||
q__2.r = temp1.r * ap[i__5].r - temp1.i * ap[i__5].i, q__2.i = temp1.r * ap[i__5].i + temp1.i * ap[i__5].r;
|
||||
q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i;
|
||||
y[i__3].r = q__1.r, y[i__3].i = q__1.i;
|
||||
r_cnjg(&q__3, &ap[k]);
|
||||
i__3 = ix;
|
||||
q__2.r = q__3.r * x[i__3].r - q__3.i * x[i__3].i, q__2.i = q__3.r * x[i__3].i + q__3.i * x[i__3].r;
|
||||
q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i;
|
||||
temp2.r = q__1.r, temp2.i = q__1.i;
|
||||
ix += *incx;
|
||||
iy += *incy;
|
||||
/* L70: */
|
||||
}
|
||||
i__2 = jy;
|
||||
i__3 = jy;
|
||||
i__4 = kk + j - 1;
|
||||
r__1 = ap[i__4].r;
|
||||
q__3.r = r__1 * temp1.r, q__3.i = r__1 * temp1.i;
|
||||
q__2.r = y[i__3].r + q__3.r, q__2.i = y[i__3].i + q__3.i;
|
||||
q__4.r = alpha->r * temp2.r - alpha->i * temp2.i, q__4.i = alpha->r * temp2.i + alpha->i * temp2.r;
|
||||
q__1.r = q__2.r + q__4.r, q__1.i = q__2.i + q__4.i;
|
||||
y[i__2].r = q__1.r, y[i__2].i = q__1.i;
|
||||
jx += *incx;
|
||||
jy += *incy;
|
||||
kk += j;
|
||||
/* L80: */
|
||||
}
|
||||
}
|
||||
} else {
|
||||
/* Form y when AP contains the lower triangle. */
|
||||
|
||||
if (*incx == 1 && *incy == 1) {
|
||||
i__1 = *n;
|
||||
for (j = 1; j <= i__1; ++j) {
|
||||
i__2 = j;
|
||||
q__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, q__1.i = alpha->r * x[i__2].i + alpha->i * x[i__2].r;
|
||||
temp1.r = q__1.r, temp1.i = q__1.i;
|
||||
temp2.r = 0.f, temp2.i = 0.f;
|
||||
i__2 = j;
|
||||
i__3 = j;
|
||||
i__4 = kk;
|
||||
r__1 = ap[i__4].r;
|
||||
q__2.r = r__1 * temp1.r, q__2.i = r__1 * temp1.i;
|
||||
q__1.r = y[i__3].r + q__2.r, q__1.i = y[i__3].i + q__2.i;
|
||||
y[i__2].r = q__1.r, y[i__2].i = q__1.i;
|
||||
k = kk + 1;
|
||||
i__2 = *n;
|
||||
for (i__ = j + 1; i__ <= i__2; ++i__) {
|
||||
i__3 = i__;
|
||||
i__4 = i__;
|
||||
i__5 = k;
|
||||
q__2.r = temp1.r * ap[i__5].r - temp1.i * ap[i__5].i, q__2.i = temp1.r * ap[i__5].i + temp1.i * ap[i__5].r;
|
||||
q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i;
|
||||
y[i__3].r = q__1.r, y[i__3].i = q__1.i;
|
||||
r_cnjg(&q__3, &ap[k]);
|
||||
i__3 = i__;
|
||||
q__2.r = q__3.r * x[i__3].r - q__3.i * x[i__3].i, q__2.i = q__3.r * x[i__3].i + q__3.i * x[i__3].r;
|
||||
q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i;
|
||||
temp2.r = q__1.r, temp2.i = q__1.i;
|
||||
++k;
|
||||
/* L90: */
|
||||
}
|
||||
i__2 = j;
|
||||
i__3 = j;
|
||||
q__2.r = alpha->r * temp2.r - alpha->i * temp2.i, q__2.i = alpha->r * temp2.i + alpha->i * temp2.r;
|
||||
q__1.r = y[i__3].r + q__2.r, q__1.i = y[i__3].i + q__2.i;
|
||||
y[i__2].r = q__1.r, y[i__2].i = q__1.i;
|
||||
kk += *n - j + 1;
|
||||
/* L100: */
|
||||
}
|
||||
} else {
|
||||
jx = kx;
|
||||
jy = ky;
|
||||
i__1 = *n;
|
||||
for (j = 1; j <= i__1; ++j) {
|
||||
i__2 = jx;
|
||||
q__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, q__1.i = alpha->r * x[i__2].i + alpha->i * x[i__2].r;
|
||||
temp1.r = q__1.r, temp1.i = q__1.i;
|
||||
temp2.r = 0.f, temp2.i = 0.f;
|
||||
i__2 = jy;
|
||||
i__3 = jy;
|
||||
i__4 = kk;
|
||||
r__1 = ap[i__4].r;
|
||||
q__2.r = r__1 * temp1.r, q__2.i = r__1 * temp1.i;
|
||||
q__1.r = y[i__3].r + q__2.r, q__1.i = y[i__3].i + q__2.i;
|
||||
y[i__2].r = q__1.r, y[i__2].i = q__1.i;
|
||||
ix = jx;
|
||||
iy = jy;
|
||||
i__2 = kk + *n - j;
|
||||
for (k = kk + 1; k <= i__2; ++k) {
|
||||
ix += *incx;
|
||||
iy += *incy;
|
||||
i__3 = iy;
|
||||
i__4 = iy;
|
||||
i__5 = k;
|
||||
q__2.r = temp1.r * ap[i__5].r - temp1.i * ap[i__5].i, q__2.i = temp1.r * ap[i__5].i + temp1.i * ap[i__5].r;
|
||||
q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i;
|
||||
y[i__3].r = q__1.r, y[i__3].i = q__1.i;
|
||||
r_cnjg(&q__3, &ap[k]);
|
||||
i__3 = ix;
|
||||
q__2.r = q__3.r * x[i__3].r - q__3.i * x[i__3].i, q__2.i = q__3.r * x[i__3].i + q__3.i * x[i__3].r;
|
||||
q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i;
|
||||
temp2.r = q__1.r, temp2.i = q__1.i;
|
||||
/* L110: */
|
||||
}
|
||||
i__2 = jy;
|
||||
i__3 = jy;
|
||||
q__2.r = alpha->r * temp2.r - alpha->i * temp2.i, q__2.i = alpha->r * temp2.i + alpha->i * temp2.r;
|
||||
q__1.r = y[i__3].r + q__2.r, q__1.i = y[i__3].i + q__2.i;
|
||||
y[i__2].r = q__1.r, y[i__2].i = q__1.i;
|
||||
jx += *incx;
|
||||
jy += *incy;
|
||||
kk += *n - j + 1;
|
||||
/* L120: */
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* End of CHPMV . */
|
||||
|
||||
} /* chpmv_ */
|
||||
73
blas/f2c/complexdots.c
Normal file
73
blas/f2c/complexdots.c
Normal file
@@ -0,0 +1,73 @@
|
||||
/* This file has been modified to use the standard gfortran calling
|
||||
convention, rather than the f2c calling convention.
|
||||
|
||||
It does not require -ff2c when compiled with gfortran.
|
||||
*/
|
||||
|
||||
/* complexdots.f -- translated by f2c (version 20100827).
|
||||
You must link the resulting object file with libf2c:
|
||||
on Microsoft Windows system, link with libf2c.lib;
|
||||
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
|
||||
or, if you install libf2c.a in a standard place, with -lf2c -lm
|
||||
-- in that order, at the end of the command line, as in
|
||||
cc *.o -lf2c -lm
|
||||
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
|
||||
|
||||
http://www.netlib.org/f2c/libf2c.zip
|
||||
*/
|
||||
|
||||
#include "datatypes.h"
|
||||
|
||||
complex cdotc_(integer *n, complex *cx, integer *incx, complex *cy, integer *incy) {
|
||||
complex res;
|
||||
extern /* Subroutine */ void cdotcw_(integer *, complex *, integer *, complex *, integer *, complex *);
|
||||
|
||||
/* Parameter adjustments */
|
||||
--cy;
|
||||
--cx;
|
||||
|
||||
/* Function Body */
|
||||
cdotcw_(n, &cx[1], incx, &cy[1], incy, &res);
|
||||
return res;
|
||||
} /* cdotc_ */
|
||||
|
||||
complex cdotu_(integer *n, complex *cx, integer *incx, complex *cy, integer *incy) {
|
||||
complex res;
|
||||
extern /* Subroutine */ void cdotuw_(integer *, complex *, integer *, complex *, integer *, complex *);
|
||||
|
||||
/* Parameter adjustments */
|
||||
--cy;
|
||||
--cx;
|
||||
|
||||
/* Function Body */
|
||||
cdotuw_(n, &cx[1], incx, &cy[1], incy, &res);
|
||||
return res;
|
||||
} /* cdotu_ */
|
||||
|
||||
doublecomplex zdotc_(integer *n, doublecomplex *cx, integer *incx, doublecomplex *cy, integer *incy) {
|
||||
doublecomplex res;
|
||||
extern /* Subroutine */ void zdotcw_(integer *, doublecomplex *, integer *, doublecomplex *, integer *,
|
||||
doublecomplex *);
|
||||
|
||||
/* Parameter adjustments */
|
||||
--cy;
|
||||
--cx;
|
||||
|
||||
/* Function Body */
|
||||
zdotcw_(n, &cx[1], incx, &cy[1], incy, &res);
|
||||
return res;
|
||||
} /* zdotc_ */
|
||||
|
||||
doublecomplex zdotu_(integer *n, doublecomplex *cx, integer *incx, doublecomplex *cy, integer *incy) {
|
||||
doublecomplex res;
|
||||
extern /* Subroutine */ void zdotuw_(integer *, doublecomplex *, integer *, doublecomplex *, integer *,
|
||||
doublecomplex *);
|
||||
|
||||
/* Parameter adjustments */
|
||||
--cy;
|
||||
--cx;
|
||||
|
||||
/* Function Body */
|
||||
zdotuw_(n, &cx[1], incx, &cy[1], incy, &res);
|
||||
return res;
|
||||
} /* zdotu_ */
|
||||
586
blas/f2c/ctbmv.c
Normal file
586
blas/f2c/ctbmv.c
Normal file
@@ -0,0 +1,586 @@
|
||||
/* ctbmv.f -- translated by f2c (version 20100827).
|
||||
You must link the resulting object file with libf2c:
|
||||
on Microsoft Windows system, link with libf2c.lib;
|
||||
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
|
||||
or, if you install libf2c.a in a standard place, with -lf2c -lm
|
||||
-- in that order, at the end of the command line, as in
|
||||
cc *.o -lf2c -lm
|
||||
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
|
||||
|
||||
http://www.netlib.org/f2c/libf2c.zip
|
||||
*/
|
||||
|
||||
#include "datatypes.h"
|
||||
|
||||
static inline void r_cnjg(complex *r, complex *z) {
|
||||
r->r = z->r;
|
||||
r->i = -(z->i);
|
||||
}
|
||||
|
||||
/* Subroutine */ void ctbmv_(char *uplo, char *trans, char *diag, integer *n, integer *k, complex *a, integer *lda,
|
||||
complex *x, integer *incx) {
|
||||
/* System generated locals */
|
||||
integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5;
|
||||
complex q__1, q__2, q__3;
|
||||
|
||||
/* Local variables */
|
||||
integer i__, j, l, ix, jx, kx, info;
|
||||
complex temp;
|
||||
extern logical lsame_(char *, char *);
|
||||
integer kplus1;
|
||||
extern /* Subroutine */ void xerbla_(const char *, integer *);
|
||||
logical noconj, nounit;
|
||||
|
||||
/* .. Scalar Arguments .. */
|
||||
/* .. */
|
||||
/* .. Array Arguments .. */
|
||||
/* .. */
|
||||
|
||||
/* Purpose */
|
||||
/* ======= */
|
||||
|
||||
/* CTBMV performs one of the matrix-vector operations */
|
||||
|
||||
/* x := A*x, or x := A'*x, or x := conjg( A' )*x, */
|
||||
|
||||
/* where x is an n element vector and A is an n by n unit, or non-unit, */
|
||||
/* upper or lower triangular band matrix, with ( k + 1 ) diagonals. */
|
||||
|
||||
/* Arguments */
|
||||
/* ========== */
|
||||
|
||||
/* UPLO - CHARACTER*1. */
|
||||
/* On entry, UPLO specifies whether the matrix is an upper or */
|
||||
/* lower triangular matrix as follows: */
|
||||
|
||||
/* UPLO = 'U' or 'u' A is an upper triangular matrix. */
|
||||
|
||||
/* UPLO = 'L' or 'l' A is a lower triangular matrix. */
|
||||
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* TRANS - CHARACTER*1. */
|
||||
/* On entry, TRANS specifies the operation to be performed as */
|
||||
/* follows: */
|
||||
|
||||
/* TRANS = 'N' or 'n' x := A*x. */
|
||||
|
||||
/* TRANS = 'T' or 't' x := A'*x. */
|
||||
|
||||
/* TRANS = 'C' or 'c' x := conjg( A' )*x. */
|
||||
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* DIAG - CHARACTER*1. */
|
||||
/* On entry, DIAG specifies whether or not A is unit */
|
||||
/* triangular as follows: */
|
||||
|
||||
/* DIAG = 'U' or 'u' A is assumed to be unit triangular. */
|
||||
|
||||
/* DIAG = 'N' or 'n' A is not assumed to be unit */
|
||||
/* triangular. */
|
||||
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* N - INTEGER. */
|
||||
/* On entry, N specifies the order of the matrix A. */
|
||||
/* N must be at least zero. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* K - INTEGER. */
|
||||
/* On entry with UPLO = 'U' or 'u', K specifies the number of */
|
||||
/* super-diagonals of the matrix A. */
|
||||
/* On entry with UPLO = 'L' or 'l', K specifies the number of */
|
||||
/* sub-diagonals of the matrix A. */
|
||||
/* K must satisfy 0 .le. K. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* A - COMPLEX array of DIMENSION ( LDA, n ). */
|
||||
/* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */
|
||||
/* by n part of the array A must contain the upper triangular */
|
||||
/* band part of the matrix of coefficients, supplied column by */
|
||||
/* column, with the leading diagonal of the matrix in row */
|
||||
/* ( k + 1 ) of the array, the first super-diagonal starting at */
|
||||
/* position 2 in row k, and so on. The top left k by k triangle */
|
||||
/* of the array A is not referenced. */
|
||||
/* The following program segment will transfer an upper */
|
||||
/* triangular band matrix from conventional full matrix storage */
|
||||
/* to band storage: */
|
||||
|
||||
/* DO 20, J = 1, N */
|
||||
/* M = K + 1 - J */
|
||||
/* DO 10, I = MAX( 1, J - K ), J */
|
||||
/* A( M + I, J ) = matrix( I, J ) */
|
||||
/* 10 CONTINUE */
|
||||
/* 20 CONTINUE */
|
||||
|
||||
/* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */
|
||||
/* by n part of the array A must contain the lower triangular */
|
||||
/* band part of the matrix of coefficients, supplied column by */
|
||||
/* column, with the leading diagonal of the matrix in row 1 of */
|
||||
/* the array, the first sub-diagonal starting at position 1 in */
|
||||
/* row 2, and so on. The bottom right k by k triangle of the */
|
||||
/* array A is not referenced. */
|
||||
/* The following program segment will transfer a lower */
|
||||
/* triangular band matrix from conventional full matrix storage */
|
||||
/* to band storage: */
|
||||
|
||||
/* DO 20, J = 1, N */
|
||||
/* M = 1 - J */
|
||||
/* DO 10, I = J, MIN( N, J + K ) */
|
||||
/* A( M + I, J ) = matrix( I, J ) */
|
||||
/* 10 CONTINUE */
|
||||
/* 20 CONTINUE */
|
||||
|
||||
/* Note that when DIAG = 'U' or 'u' the elements of the array A */
|
||||
/* corresponding to the diagonal elements of the matrix are not */
|
||||
/* referenced, but are assumed to be unity. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* LDA - INTEGER. */
|
||||
/* On entry, LDA specifies the first dimension of A as declared */
|
||||
/* in the calling (sub) program. LDA must be at least */
|
||||
/* ( k + 1 ). */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* X - COMPLEX array of dimension at least */
|
||||
/* ( 1 + ( n - 1 )*abs( INCX ) ). */
|
||||
/* Before entry, the incremented array X must contain the n */
|
||||
/* element vector x. On exit, X is overwritten with the */
|
||||
/* transformed vector x. */
|
||||
|
||||
/* INCX - INTEGER. */
|
||||
/* On entry, INCX specifies the increment for the elements of */
|
||||
/* X. INCX must not be zero. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* Further Details */
|
||||
/* =============== */
|
||||
|
||||
/* Level 2 Blas routine. */
|
||||
|
||||
/* -- Written on 22-October-1986. */
|
||||
/* Jack Dongarra, Argonne National Lab. */
|
||||
/* Jeremy Du Croz, Nag Central Office. */
|
||||
/* Sven Hammarling, Nag Central Office. */
|
||||
/* Richard Hanson, Sandia National Labs. */
|
||||
|
||||
/* ===================================================================== */
|
||||
|
||||
/* .. Parameters .. */
|
||||
/* .. */
|
||||
/* .. Local Scalars .. */
|
||||
/* .. */
|
||||
/* .. External Functions .. */
|
||||
/* .. */
|
||||
/* .. External Subroutines .. */
|
||||
/* .. */
|
||||
/* .. Intrinsic Functions .. */
|
||||
/* .. */
|
||||
|
||||
/* Test the input parameters. */
|
||||
|
||||
/* Parameter adjustments */
|
||||
a_dim1 = *lda;
|
||||
a_offset = 1 + a_dim1;
|
||||
a -= a_offset;
|
||||
--x;
|
||||
|
||||
/* Function Body */
|
||||
info = 0;
|
||||
if (!lsame_(uplo, "U") && !lsame_(uplo, "L")) {
|
||||
info = 1;
|
||||
} else if (!lsame_(trans, "N") && !lsame_(trans, "T") && !lsame_(trans, "C")) {
|
||||
info = 2;
|
||||
} else if (!lsame_(diag, "U") && !lsame_(diag, "N")) {
|
||||
info = 3;
|
||||
} else if (*n < 0) {
|
||||
info = 4;
|
||||
} else if (*k < 0) {
|
||||
info = 5;
|
||||
} else if (*lda < *k + 1) {
|
||||
info = 7;
|
||||
} else if (*incx == 0) {
|
||||
info = 9;
|
||||
}
|
||||
if (info != 0) {
|
||||
xerbla_("CTBMV ", &info);
|
||||
return;
|
||||
}
|
||||
|
||||
/* Quick return if possible. */
|
||||
|
||||
if (*n == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
noconj = lsame_(trans, "T");
|
||||
nounit = lsame_(diag, "N");
|
||||
|
||||
/* Set up the start point in X if the increment is not unity. This */
|
||||
/* will be ( N - 1 )*INCX too small for descending loops. */
|
||||
|
||||
if (*incx <= 0) {
|
||||
kx = 1 - (*n - 1) * *incx;
|
||||
} else if (*incx != 1) {
|
||||
kx = 1;
|
||||
}
|
||||
|
||||
/* Start the operations. In this version the elements of A are */
|
||||
/* accessed sequentially with one pass through A. */
|
||||
|
||||
if (lsame_(trans, "N")) {
|
||||
/* Form x := A*x. */
|
||||
|
||||
if (lsame_(uplo, "U")) {
|
||||
kplus1 = *k + 1;
|
||||
if (*incx == 1) {
|
||||
i__1 = *n;
|
||||
for (j = 1; j <= i__1; ++j) {
|
||||
i__2 = j;
|
||||
if (x[i__2].r != 0.f || x[i__2].i != 0.f) {
|
||||
i__2 = j;
|
||||
temp.r = x[i__2].r, temp.i = x[i__2].i;
|
||||
l = kplus1 - j;
|
||||
/* Computing MAX */
|
||||
i__2 = 1, i__3 = j - *k;
|
||||
i__4 = j - 1;
|
||||
for (i__ = max(i__2, i__3); i__ <= i__4; ++i__) {
|
||||
i__2 = i__;
|
||||
i__3 = i__;
|
||||
i__5 = l + i__ + j * a_dim1;
|
||||
q__2.r = temp.r * a[i__5].r - temp.i * a[i__5].i, q__2.i = temp.r * a[i__5].i + temp.i * a[i__5].r;
|
||||
q__1.r = x[i__3].r + q__2.r, q__1.i = x[i__3].i + q__2.i;
|
||||
x[i__2].r = q__1.r, x[i__2].i = q__1.i;
|
||||
/* L10: */
|
||||
}
|
||||
if (nounit) {
|
||||
i__4 = j;
|
||||
i__2 = j;
|
||||
i__3 = kplus1 + j * a_dim1;
|
||||
q__1.r = x[i__2].r * a[i__3].r - x[i__2].i * a[i__3].i,
|
||||
q__1.i = x[i__2].r * a[i__3].i + x[i__2].i * a[i__3].r;
|
||||
x[i__4].r = q__1.r, x[i__4].i = q__1.i;
|
||||
}
|
||||
}
|
||||
/* L20: */
|
||||
}
|
||||
} else {
|
||||
jx = kx;
|
||||
i__1 = *n;
|
||||
for (j = 1; j <= i__1; ++j) {
|
||||
i__4 = jx;
|
||||
if (x[i__4].r != 0.f || x[i__4].i != 0.f) {
|
||||
i__4 = jx;
|
||||
temp.r = x[i__4].r, temp.i = x[i__4].i;
|
||||
ix = kx;
|
||||
l = kplus1 - j;
|
||||
/* Computing MAX */
|
||||
i__4 = 1, i__2 = j - *k;
|
||||
i__3 = j - 1;
|
||||
for (i__ = max(i__4, i__2); i__ <= i__3; ++i__) {
|
||||
i__4 = ix;
|
||||
i__2 = ix;
|
||||
i__5 = l + i__ + j * a_dim1;
|
||||
q__2.r = temp.r * a[i__5].r - temp.i * a[i__5].i, q__2.i = temp.r * a[i__5].i + temp.i * a[i__5].r;
|
||||
q__1.r = x[i__2].r + q__2.r, q__1.i = x[i__2].i + q__2.i;
|
||||
x[i__4].r = q__1.r, x[i__4].i = q__1.i;
|
||||
ix += *incx;
|
||||
/* L30: */
|
||||
}
|
||||
if (nounit) {
|
||||
i__3 = jx;
|
||||
i__4 = jx;
|
||||
i__2 = kplus1 + j * a_dim1;
|
||||
q__1.r = x[i__4].r * a[i__2].r - x[i__4].i * a[i__2].i,
|
||||
q__1.i = x[i__4].r * a[i__2].i + x[i__4].i * a[i__2].r;
|
||||
x[i__3].r = q__1.r, x[i__3].i = q__1.i;
|
||||
}
|
||||
}
|
||||
jx += *incx;
|
||||
if (j > *k) {
|
||||
kx += *incx;
|
||||
}
|
||||
/* L40: */
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if (*incx == 1) {
|
||||
for (j = *n; j >= 1; --j) {
|
||||
i__1 = j;
|
||||
if (x[i__1].r != 0.f || x[i__1].i != 0.f) {
|
||||
i__1 = j;
|
||||
temp.r = x[i__1].r, temp.i = x[i__1].i;
|
||||
l = 1 - j;
|
||||
/* Computing MIN */
|
||||
i__1 = *n, i__3 = j + *k;
|
||||
i__4 = j + 1;
|
||||
for (i__ = min(i__1, i__3); i__ >= i__4; --i__) {
|
||||
i__1 = i__;
|
||||
i__3 = i__;
|
||||
i__2 = l + i__ + j * a_dim1;
|
||||
q__2.r = temp.r * a[i__2].r - temp.i * a[i__2].i, q__2.i = temp.r * a[i__2].i + temp.i * a[i__2].r;
|
||||
q__1.r = x[i__3].r + q__2.r, q__1.i = x[i__3].i + q__2.i;
|
||||
x[i__1].r = q__1.r, x[i__1].i = q__1.i;
|
||||
/* L50: */
|
||||
}
|
||||
if (nounit) {
|
||||
i__4 = j;
|
||||
i__1 = j;
|
||||
i__3 = j * a_dim1 + 1;
|
||||
q__1.r = x[i__1].r * a[i__3].r - x[i__1].i * a[i__3].i,
|
||||
q__1.i = x[i__1].r * a[i__3].i + x[i__1].i * a[i__3].r;
|
||||
x[i__4].r = q__1.r, x[i__4].i = q__1.i;
|
||||
}
|
||||
}
|
||||
/* L60: */
|
||||
}
|
||||
} else {
|
||||
kx += (*n - 1) * *incx;
|
||||
jx = kx;
|
||||
for (j = *n; j >= 1; --j) {
|
||||
i__4 = jx;
|
||||
if (x[i__4].r != 0.f || x[i__4].i != 0.f) {
|
||||
i__4 = jx;
|
||||
temp.r = x[i__4].r, temp.i = x[i__4].i;
|
||||
ix = kx;
|
||||
l = 1 - j;
|
||||
/* Computing MIN */
|
||||
i__4 = *n, i__1 = j + *k;
|
||||
i__3 = j + 1;
|
||||
for (i__ = min(i__4, i__1); i__ >= i__3; --i__) {
|
||||
i__4 = ix;
|
||||
i__1 = ix;
|
||||
i__2 = l + i__ + j * a_dim1;
|
||||
q__2.r = temp.r * a[i__2].r - temp.i * a[i__2].i, q__2.i = temp.r * a[i__2].i + temp.i * a[i__2].r;
|
||||
q__1.r = x[i__1].r + q__2.r, q__1.i = x[i__1].i + q__2.i;
|
||||
x[i__4].r = q__1.r, x[i__4].i = q__1.i;
|
||||
ix -= *incx;
|
||||
/* L70: */
|
||||
}
|
||||
if (nounit) {
|
||||
i__3 = jx;
|
||||
i__4 = jx;
|
||||
i__1 = j * a_dim1 + 1;
|
||||
q__1.r = x[i__4].r * a[i__1].r - x[i__4].i * a[i__1].i,
|
||||
q__1.i = x[i__4].r * a[i__1].i + x[i__4].i * a[i__1].r;
|
||||
x[i__3].r = q__1.r, x[i__3].i = q__1.i;
|
||||
}
|
||||
}
|
||||
jx -= *incx;
|
||||
if (*n - j >= *k) {
|
||||
kx -= *incx;
|
||||
}
|
||||
/* L80: */
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
/* Form x := A'*x or x := conjg( A' )*x. */
|
||||
|
||||
if (lsame_(uplo, "U")) {
|
||||
kplus1 = *k + 1;
|
||||
if (*incx == 1) {
|
||||
for (j = *n; j >= 1; --j) {
|
||||
i__3 = j;
|
||||
temp.r = x[i__3].r, temp.i = x[i__3].i;
|
||||
l = kplus1 - j;
|
||||
if (noconj) {
|
||||
if (nounit) {
|
||||
i__3 = kplus1 + j * a_dim1;
|
||||
q__1.r = temp.r * a[i__3].r - temp.i * a[i__3].i, q__1.i = temp.r * a[i__3].i + temp.i * a[i__3].r;
|
||||
temp.r = q__1.r, temp.i = q__1.i;
|
||||
}
|
||||
/* Computing MAX */
|
||||
i__4 = 1, i__1 = j - *k;
|
||||
i__3 = max(i__4, i__1);
|
||||
for (i__ = j - 1; i__ >= i__3; --i__) {
|
||||
i__4 = l + i__ + j * a_dim1;
|
||||
i__1 = i__;
|
||||
q__2.r = a[i__4].r * x[i__1].r - a[i__4].i * x[i__1].i,
|
||||
q__2.i = a[i__4].r * x[i__1].i + a[i__4].i * x[i__1].r;
|
||||
q__1.r = temp.r + q__2.r, q__1.i = temp.i + q__2.i;
|
||||
temp.r = q__1.r, temp.i = q__1.i;
|
||||
/* L90: */
|
||||
}
|
||||
} else {
|
||||
if (nounit) {
|
||||
r_cnjg(&q__2, &a[kplus1 + j * a_dim1]);
|
||||
q__1.r = temp.r * q__2.r - temp.i * q__2.i, q__1.i = temp.r * q__2.i + temp.i * q__2.r;
|
||||
temp.r = q__1.r, temp.i = q__1.i;
|
||||
}
|
||||
/* Computing MAX */
|
||||
i__4 = 1, i__1 = j - *k;
|
||||
i__3 = max(i__4, i__1);
|
||||
for (i__ = j - 1; i__ >= i__3; --i__) {
|
||||
r_cnjg(&q__3, &a[l + i__ + j * a_dim1]);
|
||||
i__4 = i__;
|
||||
q__2.r = q__3.r * x[i__4].r - q__3.i * x[i__4].i, q__2.i = q__3.r * x[i__4].i + q__3.i * x[i__4].r;
|
||||
q__1.r = temp.r + q__2.r, q__1.i = temp.i + q__2.i;
|
||||
temp.r = q__1.r, temp.i = q__1.i;
|
||||
/* L100: */
|
||||
}
|
||||
}
|
||||
i__3 = j;
|
||||
x[i__3].r = temp.r, x[i__3].i = temp.i;
|
||||
/* L110: */
|
||||
}
|
||||
} else {
|
||||
kx += (*n - 1) * *incx;
|
||||
jx = kx;
|
||||
for (j = *n; j >= 1; --j) {
|
||||
i__3 = jx;
|
||||
temp.r = x[i__3].r, temp.i = x[i__3].i;
|
||||
kx -= *incx;
|
||||
ix = kx;
|
||||
l = kplus1 - j;
|
||||
if (noconj) {
|
||||
if (nounit) {
|
||||
i__3 = kplus1 + j * a_dim1;
|
||||
q__1.r = temp.r * a[i__3].r - temp.i * a[i__3].i, q__1.i = temp.r * a[i__3].i + temp.i * a[i__3].r;
|
||||
temp.r = q__1.r, temp.i = q__1.i;
|
||||
}
|
||||
/* Computing MAX */
|
||||
i__4 = 1, i__1 = j - *k;
|
||||
i__3 = max(i__4, i__1);
|
||||
for (i__ = j - 1; i__ >= i__3; --i__) {
|
||||
i__4 = l + i__ + j * a_dim1;
|
||||
i__1 = ix;
|
||||
q__2.r = a[i__4].r * x[i__1].r - a[i__4].i * x[i__1].i,
|
||||
q__2.i = a[i__4].r * x[i__1].i + a[i__4].i * x[i__1].r;
|
||||
q__1.r = temp.r + q__2.r, q__1.i = temp.i + q__2.i;
|
||||
temp.r = q__1.r, temp.i = q__1.i;
|
||||
ix -= *incx;
|
||||
/* L120: */
|
||||
}
|
||||
} else {
|
||||
if (nounit) {
|
||||
r_cnjg(&q__2, &a[kplus1 + j * a_dim1]);
|
||||
q__1.r = temp.r * q__2.r - temp.i * q__2.i, q__1.i = temp.r * q__2.i + temp.i * q__2.r;
|
||||
temp.r = q__1.r, temp.i = q__1.i;
|
||||
}
|
||||
/* Computing MAX */
|
||||
i__4 = 1, i__1 = j - *k;
|
||||
i__3 = max(i__4, i__1);
|
||||
for (i__ = j - 1; i__ >= i__3; --i__) {
|
||||
r_cnjg(&q__3, &a[l + i__ + j * a_dim1]);
|
||||
i__4 = ix;
|
||||
q__2.r = q__3.r * x[i__4].r - q__3.i * x[i__4].i, q__2.i = q__3.r * x[i__4].i + q__3.i * x[i__4].r;
|
||||
q__1.r = temp.r + q__2.r, q__1.i = temp.i + q__2.i;
|
||||
temp.r = q__1.r, temp.i = q__1.i;
|
||||
ix -= *incx;
|
||||
/* L130: */
|
||||
}
|
||||
}
|
||||
i__3 = jx;
|
||||
x[i__3].r = temp.r, x[i__3].i = temp.i;
|
||||
jx -= *incx;
|
||||
/* L140: */
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if (*incx == 1) {
|
||||
i__3 = *n;
|
||||
for (j = 1; j <= i__3; ++j) {
|
||||
i__4 = j;
|
||||
temp.r = x[i__4].r, temp.i = x[i__4].i;
|
||||
l = 1 - j;
|
||||
if (noconj) {
|
||||
if (nounit) {
|
||||
i__4 = j * a_dim1 + 1;
|
||||
q__1.r = temp.r * a[i__4].r - temp.i * a[i__4].i, q__1.i = temp.r * a[i__4].i + temp.i * a[i__4].r;
|
||||
temp.r = q__1.r, temp.i = q__1.i;
|
||||
}
|
||||
/* Computing MIN */
|
||||
i__1 = *n, i__2 = j + *k;
|
||||
i__4 = min(i__1, i__2);
|
||||
for (i__ = j + 1; i__ <= i__4; ++i__) {
|
||||
i__1 = l + i__ + j * a_dim1;
|
||||
i__2 = i__;
|
||||
q__2.r = a[i__1].r * x[i__2].r - a[i__1].i * x[i__2].i,
|
||||
q__2.i = a[i__1].r * x[i__2].i + a[i__1].i * x[i__2].r;
|
||||
q__1.r = temp.r + q__2.r, q__1.i = temp.i + q__2.i;
|
||||
temp.r = q__1.r, temp.i = q__1.i;
|
||||
/* L150: */
|
||||
}
|
||||
} else {
|
||||
if (nounit) {
|
||||
r_cnjg(&q__2, &a[j * a_dim1 + 1]);
|
||||
q__1.r = temp.r * q__2.r - temp.i * q__2.i, q__1.i = temp.r * q__2.i + temp.i * q__2.r;
|
||||
temp.r = q__1.r, temp.i = q__1.i;
|
||||
}
|
||||
/* Computing MIN */
|
||||
i__1 = *n, i__2 = j + *k;
|
||||
i__4 = min(i__1, i__2);
|
||||
for (i__ = j + 1; i__ <= i__4; ++i__) {
|
||||
r_cnjg(&q__3, &a[l + i__ + j * a_dim1]);
|
||||
i__1 = i__;
|
||||
q__2.r = q__3.r * x[i__1].r - q__3.i * x[i__1].i, q__2.i = q__3.r * x[i__1].i + q__3.i * x[i__1].r;
|
||||
q__1.r = temp.r + q__2.r, q__1.i = temp.i + q__2.i;
|
||||
temp.r = q__1.r, temp.i = q__1.i;
|
||||
/* L160: */
|
||||
}
|
||||
}
|
||||
i__4 = j;
|
||||
x[i__4].r = temp.r, x[i__4].i = temp.i;
|
||||
/* L170: */
|
||||
}
|
||||
} else {
|
||||
jx = kx;
|
||||
i__3 = *n;
|
||||
for (j = 1; j <= i__3; ++j) {
|
||||
i__4 = jx;
|
||||
temp.r = x[i__4].r, temp.i = x[i__4].i;
|
||||
kx += *incx;
|
||||
ix = kx;
|
||||
l = 1 - j;
|
||||
if (noconj) {
|
||||
if (nounit) {
|
||||
i__4 = j * a_dim1 + 1;
|
||||
q__1.r = temp.r * a[i__4].r - temp.i * a[i__4].i, q__1.i = temp.r * a[i__4].i + temp.i * a[i__4].r;
|
||||
temp.r = q__1.r, temp.i = q__1.i;
|
||||
}
|
||||
/* Computing MIN */
|
||||
i__1 = *n, i__2 = j + *k;
|
||||
i__4 = min(i__1, i__2);
|
||||
for (i__ = j + 1; i__ <= i__4; ++i__) {
|
||||
i__1 = l + i__ + j * a_dim1;
|
||||
i__2 = ix;
|
||||
q__2.r = a[i__1].r * x[i__2].r - a[i__1].i * x[i__2].i,
|
||||
q__2.i = a[i__1].r * x[i__2].i + a[i__1].i * x[i__2].r;
|
||||
q__1.r = temp.r + q__2.r, q__1.i = temp.i + q__2.i;
|
||||
temp.r = q__1.r, temp.i = q__1.i;
|
||||
ix += *incx;
|
||||
/* L180: */
|
||||
}
|
||||
} else {
|
||||
if (nounit) {
|
||||
r_cnjg(&q__2, &a[j * a_dim1 + 1]);
|
||||
q__1.r = temp.r * q__2.r - temp.i * q__2.i, q__1.i = temp.r * q__2.i + temp.i * q__2.r;
|
||||
temp.r = q__1.r, temp.i = q__1.i;
|
||||
}
|
||||
/* Computing MIN */
|
||||
i__1 = *n, i__2 = j + *k;
|
||||
i__4 = min(i__1, i__2);
|
||||
for (i__ = j + 1; i__ <= i__4; ++i__) {
|
||||
r_cnjg(&q__3, &a[l + i__ + j * a_dim1]);
|
||||
i__1 = ix;
|
||||
q__2.r = q__3.r * x[i__1].r - q__3.i * x[i__1].i, q__2.i = q__3.r * x[i__1].i + q__3.i * x[i__1].r;
|
||||
q__1.r = temp.r + q__2.r, q__1.i = temp.i + q__2.i;
|
||||
temp.r = q__1.r, temp.i = q__1.i;
|
||||
ix += *incx;
|
||||
/* L190: */
|
||||
}
|
||||
}
|
||||
i__4 = jx;
|
||||
x[i__4].r = temp.r, x[i__4].i = temp.i;
|
||||
jx += *incx;
|
||||
/* L200: */
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* End of CTBMV . */
|
||||
|
||||
} /* ctbmv_ */
|
||||
27
blas/f2c/datatypes.h
Normal file
27
blas/f2c/datatypes.h
Normal file
@@ -0,0 +1,27 @@
|
||||
/* This contains a limited subset of the typedefs exposed by f2c
|
||||
for use by the Eigen BLAS C-only implementation.
|
||||
*/
|
||||
|
||||
#ifndef __EIGEN_DATATYPES_H__
|
||||
#define __EIGEN_DATATYPES_H__
|
||||
|
||||
typedef int integer;
|
||||
typedef unsigned int uinteger;
|
||||
typedef float real;
|
||||
typedef double doublereal;
|
||||
typedef struct {
|
||||
real r, i;
|
||||
} complex;
|
||||
typedef struct {
|
||||
doublereal r, i;
|
||||
} doublecomplex;
|
||||
typedef int logical;
|
||||
|
||||
#define abs(x) ((x) >= 0 ? (x) : -(x))
|
||||
#define dabs(x) (doublereal) abs(x)
|
||||
#define min(a, b) ((a) <= (b) ? (a) : (b))
|
||||
#define max(a, b) ((a) >= (b) ? (a) : (b))
|
||||
#define dmin(a, b) (doublereal) min(a, b)
|
||||
#define dmax(a, b) (doublereal) max(a, b)
|
||||
|
||||
#endif
|
||||
213
blas/f2c/drotm.c
Normal file
213
blas/f2c/drotm.c
Normal file
@@ -0,0 +1,213 @@
|
||||
/* drotm.f -- translated by f2c (version 20100827).
|
||||
You must link the resulting object file with libf2c:
|
||||
on Microsoft Windows system, link with libf2c.lib;
|
||||
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
|
||||
or, if you install libf2c.a in a standard place, with -lf2c -lm
|
||||
-- in that order, at the end of the command line, as in
|
||||
cc *.o -lf2c -lm
|
||||
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
|
||||
|
||||
http://www.netlib.org/f2c/libf2c.zip
|
||||
*/
|
||||
|
||||
#include "datatypes.h"
|
||||
|
||||
/* Subroutine */ void drotm_(integer *n, doublereal *dx, integer *incx, doublereal *dy, integer *incy,
|
||||
doublereal *dparam) {
|
||||
/* Initialized data */
|
||||
|
||||
static doublereal zero = 0.;
|
||||
static doublereal two = 2.;
|
||||
|
||||
/* System generated locals */
|
||||
integer i__1, i__2;
|
||||
|
||||
/* Local variables */
|
||||
integer i__;
|
||||
doublereal w, z__;
|
||||
integer kx, ky;
|
||||
doublereal dh11, dh12, dh21, dh22, dflag;
|
||||
integer nsteps;
|
||||
|
||||
/* .. Scalar Arguments .. */
|
||||
/* .. */
|
||||
/* .. Array Arguments .. */
|
||||
/* .. */
|
||||
|
||||
/* Purpose */
|
||||
/* ======= */
|
||||
|
||||
/* APPLY THE MODIFIED GIVENS TRANSFORMATION, H, TO THE 2 BY N MATRIX */
|
||||
|
||||
/* (DX**T) , WHERE **T INDICATES TRANSPOSE. THE ELEMENTS OF DX ARE IN */
|
||||
/* (DY**T) */
|
||||
|
||||
/* DX(LX+I*INCX), I = 0 TO N-1, WHERE LX = 1 IF INCX .GE. 0, ELSE */
|
||||
/* LX = (-INCX)*N, AND SIMILARLY FOR SY USING LY AND INCY. */
|
||||
/* WITH DPARAM(1)=DFLAG, H HAS ONE OF THE FOLLOWING FORMS.. */
|
||||
|
||||
/* DFLAG=-1.D0 DFLAG=0.D0 DFLAG=1.D0 DFLAG=-2.D0 */
|
||||
|
||||
/* (DH11 DH12) (1.D0 DH12) (DH11 1.D0) (1.D0 0.D0) */
|
||||
/* H=( ) ( ) ( ) ( ) */
|
||||
/* (DH21 DH22), (DH21 1.D0), (-1.D0 DH22), (0.D0 1.D0). */
|
||||
/* SEE DROTMG FOR A DESCRIPTION OF DATA STORAGE IN DPARAM. */
|
||||
|
||||
/* Arguments */
|
||||
/* ========= */
|
||||
|
||||
/* N (input) INTEGER */
|
||||
/* number of elements in input vector(s) */
|
||||
|
||||
/* DX (input/output) DOUBLE PRECISION array, dimension N */
|
||||
/* double precision vector with N elements */
|
||||
|
||||
/* INCX (input) INTEGER */
|
||||
/* storage spacing between elements of DX */
|
||||
|
||||
/* DY (input/output) DOUBLE PRECISION array, dimension N */
|
||||
/* double precision vector with N elements */
|
||||
|
||||
/* INCY (input) INTEGER */
|
||||
/* storage spacing between elements of DY */
|
||||
|
||||
/* DPARAM (input/output) DOUBLE PRECISION array, dimension 5 */
|
||||
/* DPARAM(1)=DFLAG */
|
||||
/* DPARAM(2)=DH11 */
|
||||
/* DPARAM(3)=DH21 */
|
||||
/* DPARAM(4)=DH12 */
|
||||
/* DPARAM(5)=DH22 */
|
||||
|
||||
/* ===================================================================== */
|
||||
|
||||
/* .. Local Scalars .. */
|
||||
/* .. */
|
||||
/* .. Data statements .. */
|
||||
/* Parameter adjustments */
|
||||
--dparam;
|
||||
--dy;
|
||||
--dx;
|
||||
|
||||
/* Function Body */
|
||||
/* .. */
|
||||
|
||||
dflag = dparam[1];
|
||||
if (*n <= 0 || dflag + two == zero) {
|
||||
goto L140;
|
||||
}
|
||||
if (!(*incx == *incy && *incx > 0)) {
|
||||
goto L70;
|
||||
}
|
||||
|
||||
nsteps = *n * *incx;
|
||||
if (dflag < 0.) {
|
||||
goto L50;
|
||||
} else if (dflag == 0) {
|
||||
goto L10;
|
||||
} else {
|
||||
goto L30;
|
||||
}
|
||||
L10:
|
||||
dh12 = dparam[4];
|
||||
dh21 = dparam[3];
|
||||
i__1 = nsteps;
|
||||
i__2 = *incx;
|
||||
for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
|
||||
w = dx[i__];
|
||||
z__ = dy[i__];
|
||||
dx[i__] = w + z__ * dh12;
|
||||
dy[i__] = w * dh21 + z__;
|
||||
/* L20: */
|
||||
}
|
||||
goto L140;
|
||||
L30:
|
||||
dh11 = dparam[2];
|
||||
dh22 = dparam[5];
|
||||
i__2 = nsteps;
|
||||
i__1 = *incx;
|
||||
for (i__ = 1; i__1 < 0 ? i__ >= i__2 : i__ <= i__2; i__ += i__1) {
|
||||
w = dx[i__];
|
||||
z__ = dy[i__];
|
||||
dx[i__] = w * dh11 + z__;
|
||||
dy[i__] = -w + dh22 * z__;
|
||||
/* L40: */
|
||||
}
|
||||
goto L140;
|
||||
L50:
|
||||
dh11 = dparam[2];
|
||||
dh12 = dparam[4];
|
||||
dh21 = dparam[3];
|
||||
dh22 = dparam[5];
|
||||
i__1 = nsteps;
|
||||
i__2 = *incx;
|
||||
for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
|
||||
w = dx[i__];
|
||||
z__ = dy[i__];
|
||||
dx[i__] = w * dh11 + z__ * dh12;
|
||||
dy[i__] = w * dh21 + z__ * dh22;
|
||||
/* L60: */
|
||||
}
|
||||
goto L140;
|
||||
L70:
|
||||
kx = 1;
|
||||
ky = 1;
|
||||
if (*incx < 0) {
|
||||
kx = (1 - *n) * *incx + 1;
|
||||
}
|
||||
if (*incy < 0) {
|
||||
ky = (1 - *n) * *incy + 1;
|
||||
}
|
||||
|
||||
if (dflag < 0.) {
|
||||
goto L120;
|
||||
} else if (dflag == 0) {
|
||||
goto L80;
|
||||
} else {
|
||||
goto L100;
|
||||
}
|
||||
L80:
|
||||
dh12 = dparam[4];
|
||||
dh21 = dparam[3];
|
||||
i__2 = *n;
|
||||
for (i__ = 1; i__ <= i__2; ++i__) {
|
||||
w = dx[kx];
|
||||
z__ = dy[ky];
|
||||
dx[kx] = w + z__ * dh12;
|
||||
dy[ky] = w * dh21 + z__;
|
||||
kx += *incx;
|
||||
ky += *incy;
|
||||
/* L90: */
|
||||
}
|
||||
goto L140;
|
||||
L100:
|
||||
dh11 = dparam[2];
|
||||
dh22 = dparam[5];
|
||||
i__2 = *n;
|
||||
for (i__ = 1; i__ <= i__2; ++i__) {
|
||||
w = dx[kx];
|
||||
z__ = dy[ky];
|
||||
dx[kx] = w * dh11 + z__;
|
||||
dy[ky] = -w + dh22 * z__;
|
||||
kx += *incx;
|
||||
ky += *incy;
|
||||
/* L110: */
|
||||
}
|
||||
goto L140;
|
||||
L120:
|
||||
dh11 = dparam[2];
|
||||
dh12 = dparam[4];
|
||||
dh21 = dparam[3];
|
||||
dh22 = dparam[5];
|
||||
i__2 = *n;
|
||||
for (i__ = 1; i__ <= i__2; ++i__) {
|
||||
w = dx[kx];
|
||||
z__ = dy[ky];
|
||||
dx[kx] = w * dh11 + z__ * dh12;
|
||||
dy[ky] = w * dh21 + z__ * dh22;
|
||||
kx += *incx;
|
||||
ky += *incy;
|
||||
/* L130: */
|
||||
}
|
||||
L140:
|
||||
return;
|
||||
} /* drotm_ */
|
||||
293
blas/f2c/drotmg.c
Normal file
293
blas/f2c/drotmg.c
Normal file
@@ -0,0 +1,293 @@
|
||||
/* drotmg.f -- translated by f2c (version 20100827).
|
||||
You must link the resulting object file with libf2c:
|
||||
on Microsoft Windows system, link with libf2c.lib;
|
||||
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
|
||||
or, if you install libf2c.a in a standard place, with -lf2c -lm
|
||||
-- in that order, at the end of the command line, as in
|
||||
cc *.o -lf2c -lm
|
||||
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
|
||||
|
||||
http://www.netlib.org/f2c/libf2c.zip
|
||||
*/
|
||||
|
||||
#include "datatypes.h"
|
||||
|
||||
/* Subroutine */ void drotmg_(doublereal *dd1, doublereal *dd2, doublereal *dx1, doublereal *dy1, doublereal *dparam) {
|
||||
/* Initialized data */
|
||||
|
||||
static doublereal zero = 0.;
|
||||
static doublereal one = 1.;
|
||||
static doublereal two = 2.;
|
||||
static doublereal gam = 4096.;
|
||||
static doublereal gamsq = 16777216.;
|
||||
static doublereal rgamsq = 5.9604645e-8;
|
||||
|
||||
/* Format strings */
|
||||
static char fmt_120[] = "";
|
||||
static char fmt_150[] = "";
|
||||
static char fmt_180[] = "";
|
||||
static char fmt_210[] = "";
|
||||
|
||||
/* System generated locals */
|
||||
doublereal d__1;
|
||||
|
||||
/* Local variables */
|
||||
doublereal du, dp1, dp2, dq1, dq2, dh11, dh12, dh21, dh22;
|
||||
integer igo;
|
||||
doublereal dflag, dtemp;
|
||||
|
||||
/* Assigned format variables */
|
||||
static char *igo_fmt;
|
||||
(void)igo_fmt;
|
||||
|
||||
/* .. Scalar Arguments .. */
|
||||
/* .. */
|
||||
/* .. Array Arguments .. */
|
||||
/* .. */
|
||||
|
||||
/* Purpose */
|
||||
/* ======= */
|
||||
|
||||
/* CONSTRUCT THE MODIFIED GIVENS TRANSFORMATION MATRIX H WHICH ZEROS */
|
||||
/* THE SECOND COMPONENT OF THE 2-VECTOR (DSQRT(DD1)*DX1,DSQRT(DD2)* */
|
||||
/* DY2)**T. */
|
||||
/* WITH DPARAM(1)=DFLAG, H HAS ONE OF THE FOLLOWING FORMS.. */
|
||||
|
||||
/* DFLAG=-1.D0 DFLAG=0.D0 DFLAG=1.D0 DFLAG=-2.D0 */
|
||||
|
||||
/* (DH11 DH12) (1.D0 DH12) (DH11 1.D0) (1.D0 0.D0) */
|
||||
/* H=( ) ( ) ( ) ( ) */
|
||||
/* (DH21 DH22), (DH21 1.D0), (-1.D0 DH22), (0.D0 1.D0). */
|
||||
/* LOCATIONS 2-4 OF DPARAM CONTAIN DH11, DH21, DH12, AND DH22 */
|
||||
/* RESPECTIVELY. (VALUES OF 1.D0, -1.D0, OR 0.D0 IMPLIED BY THE */
|
||||
/* VALUE OF DPARAM(1) ARE NOT STORED IN DPARAM.) */
|
||||
|
||||
/* THE VALUES OF GAMSQ AND RGAMSQ SET IN THE DATA STATEMENT MAY BE */
|
||||
/* INEXACT. THIS IS OK AS THEY ARE ONLY USED FOR TESTING THE SIZE */
|
||||
/* OF DD1 AND DD2. ALL ACTUAL SCALING OF DATA IS DONE USING GAM. */
|
||||
|
||||
/* Arguments */
|
||||
/* ========= */
|
||||
|
||||
/* DD1 (input/output) DOUBLE PRECISION */
|
||||
|
||||
/* DD2 (input/output) DOUBLE PRECISION */
|
||||
|
||||
/* DX1 (input/output) DOUBLE PRECISION */
|
||||
|
||||
/* DY1 (input) DOUBLE PRECISION */
|
||||
|
||||
/* DPARAM (input/output) DOUBLE PRECISION array, dimension 5 */
|
||||
/* DPARAM(1)=DFLAG */
|
||||
/* DPARAM(2)=DH11 */
|
||||
/* DPARAM(3)=DH21 */
|
||||
/* DPARAM(4)=DH12 */
|
||||
/* DPARAM(5)=DH22 */
|
||||
|
||||
/* ===================================================================== */
|
||||
|
||||
/* .. Local Scalars .. */
|
||||
/* .. */
|
||||
/* .. Intrinsic Functions .. */
|
||||
/* .. */
|
||||
/* .. Data statements .. */
|
||||
|
||||
/* Parameter adjustments */
|
||||
--dparam;
|
||||
|
||||
/* Function Body */
|
||||
/* .. */
|
||||
if (!(*dd1 < zero)) {
|
||||
goto L10;
|
||||
}
|
||||
/* GO ZERO-H-D-AND-DX1.. */
|
||||
goto L60;
|
||||
L10:
|
||||
/* CASE-DD1-NONNEGATIVE */
|
||||
dp2 = *dd2 * *dy1;
|
||||
if (!(dp2 == zero)) {
|
||||
goto L20;
|
||||
}
|
||||
dflag = -two;
|
||||
goto L260;
|
||||
/* REGULAR-CASE.. */
|
||||
L20:
|
||||
dp1 = *dd1 * *dx1;
|
||||
dq2 = dp2 * *dy1;
|
||||
dq1 = dp1 * *dx1;
|
||||
|
||||
if (!(abs(dq1) > abs(dq2))) {
|
||||
goto L40;
|
||||
}
|
||||
dh21 = -(*dy1) / *dx1;
|
||||
dh12 = dp2 / dp1;
|
||||
|
||||
du = one - dh12 * dh21;
|
||||
|
||||
if (!(du <= zero)) {
|
||||
goto L30;
|
||||
}
|
||||
/* GO ZERO-H-D-AND-DX1.. */
|
||||
goto L60;
|
||||
L30:
|
||||
dflag = zero;
|
||||
*dd1 /= du;
|
||||
*dd2 /= du;
|
||||
*dx1 *= du;
|
||||
/* GO SCALE-CHECK.. */
|
||||
goto L100;
|
||||
L40:
|
||||
if (!(dq2 < zero)) {
|
||||
goto L50;
|
||||
}
|
||||
/* GO ZERO-H-D-AND-DX1.. */
|
||||
goto L60;
|
||||
L50:
|
||||
dflag = one;
|
||||
dh11 = dp1 / dp2;
|
||||
dh22 = *dx1 / *dy1;
|
||||
du = one + dh11 * dh22;
|
||||
dtemp = *dd2 / du;
|
||||
*dd2 = *dd1 / du;
|
||||
*dd1 = dtemp;
|
||||
*dx1 = *dy1 * du;
|
||||
/* GO SCALE-CHECK */
|
||||
goto L100;
|
||||
/* PROCEDURE..ZERO-H-D-AND-DX1.. */
|
||||
L60:
|
||||
dflag = -one;
|
||||
dh11 = zero;
|
||||
dh12 = zero;
|
||||
dh21 = zero;
|
||||
dh22 = zero;
|
||||
|
||||
*dd1 = zero;
|
||||
*dd2 = zero;
|
||||
*dx1 = zero;
|
||||
/* RETURN.. */
|
||||
goto L220;
|
||||
/* PROCEDURE..FIX-H.. */
|
||||
L70:
|
||||
if (!(dflag >= zero)) {
|
||||
goto L90;
|
||||
}
|
||||
|
||||
if (!(dflag == zero)) {
|
||||
goto L80;
|
||||
}
|
||||
dh11 = one;
|
||||
dh22 = one;
|
||||
dflag = -one;
|
||||
goto L90;
|
||||
L80:
|
||||
dh21 = -one;
|
||||
dh12 = one;
|
||||
dflag = -one;
|
||||
L90:
|
||||
switch (igo) {
|
||||
case 0:
|
||||
goto L120;
|
||||
case 1:
|
||||
goto L150;
|
||||
case 2:
|
||||
goto L180;
|
||||
case 3:
|
||||
goto L210;
|
||||
}
|
||||
/* PROCEDURE..SCALE-CHECK */
|
||||
L100:
|
||||
L110:
|
||||
if (!(*dd1 <= rgamsq)) {
|
||||
goto L130;
|
||||
}
|
||||
if (*dd1 == zero) {
|
||||
goto L160;
|
||||
}
|
||||
igo = 0;
|
||||
igo_fmt = fmt_120;
|
||||
/* FIX-H.. */
|
||||
goto L70;
|
||||
L120:
|
||||
/* Computing 2nd power */
|
||||
d__1 = gam;
|
||||
*dd1 *= d__1 * d__1;
|
||||
*dx1 /= gam;
|
||||
dh11 /= gam;
|
||||
dh12 /= gam;
|
||||
goto L110;
|
||||
L130:
|
||||
L140:
|
||||
if (!(*dd1 >= gamsq)) {
|
||||
goto L160;
|
||||
}
|
||||
igo = 1;
|
||||
igo_fmt = fmt_150;
|
||||
/* FIX-H.. */
|
||||
goto L70;
|
||||
L150:
|
||||
/* Computing 2nd power */
|
||||
d__1 = gam;
|
||||
*dd1 /= d__1 * d__1;
|
||||
*dx1 *= gam;
|
||||
dh11 *= gam;
|
||||
dh12 *= gam;
|
||||
goto L140;
|
||||
L160:
|
||||
L170:
|
||||
if (!(abs(*dd2) <= rgamsq)) {
|
||||
goto L190;
|
||||
}
|
||||
if (*dd2 == zero) {
|
||||
goto L220;
|
||||
}
|
||||
igo = 2;
|
||||
igo_fmt = fmt_180;
|
||||
/* FIX-H.. */
|
||||
goto L70;
|
||||
L180:
|
||||
/* Computing 2nd power */
|
||||
d__1 = gam;
|
||||
*dd2 *= d__1 * d__1;
|
||||
dh21 /= gam;
|
||||
dh22 /= gam;
|
||||
goto L170;
|
||||
L190:
|
||||
L200:
|
||||
if (!(abs(*dd2) >= gamsq)) {
|
||||
goto L220;
|
||||
}
|
||||
igo = 3;
|
||||
igo_fmt = fmt_210;
|
||||
/* FIX-H.. */
|
||||
goto L70;
|
||||
L210:
|
||||
/* Computing 2nd power */
|
||||
d__1 = gam;
|
||||
*dd2 /= d__1 * d__1;
|
||||
dh21 *= gam;
|
||||
dh22 *= gam;
|
||||
goto L200;
|
||||
L220:
|
||||
if (dflag < 0.) {
|
||||
goto L250;
|
||||
} else if (dflag == 0) {
|
||||
goto L230;
|
||||
} else {
|
||||
goto L240;
|
||||
}
|
||||
L230:
|
||||
dparam[3] = dh21;
|
||||
dparam[4] = dh12;
|
||||
goto L260;
|
||||
L240:
|
||||
dparam[2] = dh11;
|
||||
dparam[5] = dh22;
|
||||
goto L260;
|
||||
L250:
|
||||
dparam[2] = dh11;
|
||||
dparam[3] = dh21;
|
||||
dparam[4] = dh12;
|
||||
dparam[5] = dh22;
|
||||
L260:
|
||||
dparam[1] = dflag;
|
||||
} /* drotmg_ */
|
||||
356
blas/f2c/dsbmv.c
Normal file
356
blas/f2c/dsbmv.c
Normal file
@@ -0,0 +1,356 @@
|
||||
/* dsbmv.f -- translated by f2c (version 20100827).
|
||||
You must link the resulting object file with libf2c:
|
||||
on Microsoft Windows system, link with libf2c.lib;
|
||||
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
|
||||
or, if you install libf2c.a in a standard place, with -lf2c -lm
|
||||
-- in that order, at the end of the command line, as in
|
||||
cc *.o -lf2c -lm
|
||||
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
|
||||
|
||||
http://www.netlib.org/f2c/libf2c.zip
|
||||
*/
|
||||
|
||||
#include "datatypes.h"
|
||||
|
||||
/* Subroutine */ void dsbmv_(char *uplo, integer *n, integer *k, doublereal *alpha, doublereal *a, integer *lda,
|
||||
doublereal *x, integer *incx, doublereal *beta, doublereal *y, integer *incy) {
|
||||
/* System generated locals */
|
||||
integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
|
||||
|
||||
/* Local variables */
|
||||
integer i__, j, l, ix, iy, jx, jy, kx, ky, info;
|
||||
doublereal temp1, temp2;
|
||||
extern logical lsame_(char *, char *);
|
||||
integer kplus1;
|
||||
extern /* Subroutine */ void xerbla_(const char *, integer *);
|
||||
|
||||
/* .. Scalar Arguments .. */
|
||||
/* .. */
|
||||
/* .. Array Arguments .. */
|
||||
/* .. */
|
||||
|
||||
/* Purpose */
|
||||
/* ======= */
|
||||
|
||||
/* DSBMV performs the matrix-vector operation */
|
||||
|
||||
/* y := alpha*A*x + beta*y, */
|
||||
|
||||
/* where alpha and beta are scalars, x and y are n element vectors and */
|
||||
/* A is an n by n symmetric band matrix, with k super-diagonals. */
|
||||
|
||||
/* Arguments */
|
||||
/* ========== */
|
||||
|
||||
/* UPLO - CHARACTER*1. */
|
||||
/* On entry, UPLO specifies whether the upper or lower */
|
||||
/* triangular part of the band matrix A is being supplied as */
|
||||
/* follows: */
|
||||
|
||||
/* UPLO = 'U' or 'u' The upper triangular part of A is */
|
||||
/* being supplied. */
|
||||
|
||||
/* UPLO = 'L' or 'l' The lower triangular part of A is */
|
||||
/* being supplied. */
|
||||
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* N - INTEGER. */
|
||||
/* On entry, N specifies the order of the matrix A. */
|
||||
/* N must be at least zero. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* K - INTEGER. */
|
||||
/* On entry, K specifies the number of super-diagonals of the */
|
||||
/* matrix A. K must satisfy 0 .le. K. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* ALPHA - DOUBLE PRECISION. */
|
||||
/* On entry, ALPHA specifies the scalar alpha. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* A - DOUBLE PRECISION array of DIMENSION ( LDA, n ). */
|
||||
/* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */
|
||||
/* by n part of the array A must contain the upper triangular */
|
||||
/* band part of the symmetric matrix, supplied column by */
|
||||
/* column, with the leading diagonal of the matrix in row */
|
||||
/* ( k + 1 ) of the array, the first super-diagonal starting at */
|
||||
/* position 2 in row k, and so on. The top left k by k triangle */
|
||||
/* of the array A is not referenced. */
|
||||
/* The following program segment will transfer the upper */
|
||||
/* triangular part of a symmetric band matrix from conventional */
|
||||
/* full matrix storage to band storage: */
|
||||
|
||||
/* DO 20, J = 1, N */
|
||||
/* M = K + 1 - J */
|
||||
/* DO 10, I = MAX( 1, J - K ), J */
|
||||
/* A( M + I, J ) = matrix( I, J ) */
|
||||
/* 10 CONTINUE */
|
||||
/* 20 CONTINUE */
|
||||
|
||||
/* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */
|
||||
/* by n part of the array A must contain the lower triangular */
|
||||
/* band part of the symmetric matrix, supplied column by */
|
||||
/* column, with the leading diagonal of the matrix in row 1 of */
|
||||
/* the array, the first sub-diagonal starting at position 1 in */
|
||||
/* row 2, and so on. The bottom right k by k triangle of the */
|
||||
/* array A is not referenced. */
|
||||
/* The following program segment will transfer the lower */
|
||||
/* triangular part of a symmetric band matrix from conventional */
|
||||
/* full matrix storage to band storage: */
|
||||
|
||||
/* DO 20, J = 1, N */
|
||||
/* M = 1 - J */
|
||||
/* DO 10, I = J, MIN( N, J + K ) */
|
||||
/* A( M + I, J ) = matrix( I, J ) */
|
||||
/* 10 CONTINUE */
|
||||
/* 20 CONTINUE */
|
||||
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* LDA - INTEGER. */
|
||||
/* On entry, LDA specifies the first dimension of A as declared */
|
||||
/* in the calling (sub) program. LDA must be at least */
|
||||
/* ( k + 1 ). */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* X - DOUBLE PRECISION array of DIMENSION at least */
|
||||
/* ( 1 + ( n - 1 )*abs( INCX ) ). */
|
||||
/* Before entry, the incremented array X must contain the */
|
||||
/* vector x. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* INCX - INTEGER. */
|
||||
/* On entry, INCX specifies the increment for the elements of */
|
||||
/* X. INCX must not be zero. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* BETA - DOUBLE PRECISION. */
|
||||
/* On entry, BETA specifies the scalar beta. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* Y - DOUBLE PRECISION array of DIMENSION at least */
|
||||
/* ( 1 + ( n - 1 )*abs( INCY ) ). */
|
||||
/* Before entry, the incremented array Y must contain the */
|
||||
/* vector y. On exit, Y is overwritten by the updated vector y. */
|
||||
|
||||
/* INCY - INTEGER. */
|
||||
/* On entry, INCY specifies the increment for the elements of */
|
||||
/* Y. INCY must not be zero. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* Level 2 Blas routine. */
|
||||
|
||||
/* -- Written on 22-October-1986. */
|
||||
/* Jack Dongarra, Argonne National Lab. */
|
||||
/* Jeremy Du Croz, Nag Central Office. */
|
||||
/* Sven Hammarling, Nag Central Office. */
|
||||
/* Richard Hanson, Sandia National Labs. */
|
||||
|
||||
/* ===================================================================== */
|
||||
|
||||
/* .. Parameters .. */
|
||||
/* .. */
|
||||
/* .. Local Scalars .. */
|
||||
/* .. */
|
||||
/* .. External Functions .. */
|
||||
/* .. */
|
||||
/* .. External Subroutines .. */
|
||||
/* .. */
|
||||
/* .. Intrinsic Functions .. */
|
||||
/* .. */
|
||||
|
||||
/* Test the input parameters. */
|
||||
|
||||
/* Parameter adjustments */
|
||||
a_dim1 = *lda;
|
||||
a_offset = 1 + a_dim1;
|
||||
a -= a_offset;
|
||||
--x;
|
||||
--y;
|
||||
|
||||
/* Function Body */
|
||||
info = 0;
|
||||
if (!lsame_(uplo, "U") && !lsame_(uplo, "L")) {
|
||||
info = 1;
|
||||
} else if (*n < 0) {
|
||||
info = 2;
|
||||
} else if (*k < 0) {
|
||||
info = 3;
|
||||
} else if (*lda < *k + 1) {
|
||||
info = 6;
|
||||
} else if (*incx == 0) {
|
||||
info = 8;
|
||||
} else if (*incy == 0) {
|
||||
info = 11;
|
||||
}
|
||||
if (info != 0) {
|
||||
xerbla_("DSBMV ", &info);
|
||||
return;
|
||||
}
|
||||
|
||||
/* Quick return if possible. */
|
||||
|
||||
if (*n == 0 || (*alpha == 0. && *beta == 1.)) {
|
||||
return;
|
||||
}
|
||||
|
||||
/* Set up the start points in X and Y. */
|
||||
|
||||
if (*incx > 0) {
|
||||
kx = 1;
|
||||
} else {
|
||||
kx = 1 - (*n - 1) * *incx;
|
||||
}
|
||||
if (*incy > 0) {
|
||||
ky = 1;
|
||||
} else {
|
||||
ky = 1 - (*n - 1) * *incy;
|
||||
}
|
||||
|
||||
/* Start the operations. In this version the elements of the array A */
|
||||
/* are accessed sequentially with one pass through A. */
|
||||
|
||||
/* First form y := beta*y. */
|
||||
|
||||
if (*beta != 1.) {
|
||||
if (*incy == 1) {
|
||||
if (*beta == 0.) {
|
||||
i__1 = *n;
|
||||
for (i__ = 1; i__ <= i__1; ++i__) {
|
||||
y[i__] = 0.;
|
||||
/* L10: */
|
||||
}
|
||||
} else {
|
||||
i__1 = *n;
|
||||
for (i__ = 1; i__ <= i__1; ++i__) {
|
||||
y[i__] = *beta * y[i__];
|
||||
/* L20: */
|
||||
}
|
||||
}
|
||||
} else {
|
||||
iy = ky;
|
||||
if (*beta == 0.) {
|
||||
i__1 = *n;
|
||||
for (i__ = 1; i__ <= i__1; ++i__) {
|
||||
y[iy] = 0.;
|
||||
iy += *incy;
|
||||
/* L30: */
|
||||
}
|
||||
} else {
|
||||
i__1 = *n;
|
||||
for (i__ = 1; i__ <= i__1; ++i__) {
|
||||
y[iy] = *beta * y[iy];
|
||||
iy += *incy;
|
||||
/* L40: */
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (*alpha == 0.) {
|
||||
return;
|
||||
}
|
||||
if (lsame_(uplo, "U")) {
|
||||
/* Form y when upper triangle of A is stored. */
|
||||
|
||||
kplus1 = *k + 1;
|
||||
if (*incx == 1 && *incy == 1) {
|
||||
i__1 = *n;
|
||||
for (j = 1; j <= i__1; ++j) {
|
||||
temp1 = *alpha * x[j];
|
||||
temp2 = 0.;
|
||||
l = kplus1 - j;
|
||||
/* Computing MAX */
|
||||
i__2 = 1, i__3 = j - *k;
|
||||
i__4 = j - 1;
|
||||
for (i__ = max(i__2, i__3); i__ <= i__4; ++i__) {
|
||||
y[i__] += temp1 * a[l + i__ + j * a_dim1];
|
||||
temp2 += a[l + i__ + j * a_dim1] * x[i__];
|
||||
/* L50: */
|
||||
}
|
||||
y[j] = y[j] + temp1 * a[kplus1 + j * a_dim1] + *alpha * temp2;
|
||||
/* L60: */
|
||||
}
|
||||
} else {
|
||||
jx = kx;
|
||||
jy = ky;
|
||||
i__1 = *n;
|
||||
for (j = 1; j <= i__1; ++j) {
|
||||
temp1 = *alpha * x[jx];
|
||||
temp2 = 0.;
|
||||
ix = kx;
|
||||
iy = ky;
|
||||
l = kplus1 - j;
|
||||
/* Computing MAX */
|
||||
i__4 = 1, i__2 = j - *k;
|
||||
i__3 = j - 1;
|
||||
for (i__ = max(i__4, i__2); i__ <= i__3; ++i__) {
|
||||
y[iy] += temp1 * a[l + i__ + j * a_dim1];
|
||||
temp2 += a[l + i__ + j * a_dim1] * x[ix];
|
||||
ix += *incx;
|
||||
iy += *incy;
|
||||
/* L70: */
|
||||
}
|
||||
y[jy] = y[jy] + temp1 * a[kplus1 + j * a_dim1] + *alpha * temp2;
|
||||
jx += *incx;
|
||||
jy += *incy;
|
||||
if (j > *k) {
|
||||
kx += *incx;
|
||||
ky += *incy;
|
||||
}
|
||||
/* L80: */
|
||||
}
|
||||
}
|
||||
} else {
|
||||
/* Form y when lower triangle of A is stored. */
|
||||
|
||||
if (*incx == 1 && *incy == 1) {
|
||||
i__1 = *n;
|
||||
for (j = 1; j <= i__1; ++j) {
|
||||
temp1 = *alpha * x[j];
|
||||
temp2 = 0.;
|
||||
y[j] += temp1 * a[j * a_dim1 + 1];
|
||||
l = 1 - j;
|
||||
/* Computing MIN */
|
||||
i__4 = *n, i__2 = j + *k;
|
||||
i__3 = min(i__4, i__2);
|
||||
for (i__ = j + 1; i__ <= i__3; ++i__) {
|
||||
y[i__] += temp1 * a[l + i__ + j * a_dim1];
|
||||
temp2 += a[l + i__ + j * a_dim1] * x[i__];
|
||||
/* L90: */
|
||||
}
|
||||
y[j] += *alpha * temp2;
|
||||
/* L100: */
|
||||
}
|
||||
} else {
|
||||
jx = kx;
|
||||
jy = ky;
|
||||
i__1 = *n;
|
||||
for (j = 1; j <= i__1; ++j) {
|
||||
temp1 = *alpha * x[jx];
|
||||
temp2 = 0.;
|
||||
y[jy] += temp1 * a[j * a_dim1 + 1];
|
||||
l = 1 - j;
|
||||
ix = jx;
|
||||
iy = jy;
|
||||
/* Computing MIN */
|
||||
i__4 = *n, i__2 = j + *k;
|
||||
i__3 = min(i__4, i__2);
|
||||
for (i__ = j + 1; i__ <= i__3; ++i__) {
|
||||
ix += *incx;
|
||||
iy += *incy;
|
||||
y[iy] += temp1 * a[l + i__ + j * a_dim1];
|
||||
temp2 += a[l + i__ + j * a_dim1] * x[ix];
|
||||
/* L110: */
|
||||
}
|
||||
y[jy] += *alpha * temp2;
|
||||
jx += *incx;
|
||||
jy += *incy;
|
||||
/* L120: */
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* End of DSBMV . */
|
||||
|
||||
} /* dsbmv_ */
|
||||
308
blas/f2c/dspmv.c
Normal file
308
blas/f2c/dspmv.c
Normal file
@@ -0,0 +1,308 @@
|
||||
/* dspmv.f -- translated by f2c (version 20100827).
|
||||
You must link the resulting object file with libf2c:
|
||||
on Microsoft Windows system, link with libf2c.lib;
|
||||
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
|
||||
or, if you install libf2c.a in a standard place, with -lf2c -lm
|
||||
-- in that order, at the end of the command line, as in
|
||||
cc *.o -lf2c -lm
|
||||
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
|
||||
|
||||
http://www.netlib.org/f2c/libf2c.zip
|
||||
*/
|
||||
|
||||
#include "datatypes.h"
|
||||
|
||||
/* Subroutine */ void dspmv_(char *uplo, integer *n, doublereal *alpha, doublereal *ap, doublereal *x, integer *incx,
|
||||
doublereal *beta, doublereal *y, integer *incy) {
|
||||
/* System generated locals */
|
||||
integer i__1, i__2;
|
||||
|
||||
/* Local variables */
|
||||
integer i__, j, k, kk, ix, iy, jx, jy, kx, ky, info;
|
||||
doublereal temp1, temp2;
|
||||
extern logical lsame_(char *, char *);
|
||||
extern /* Subroutine */ void xerbla_(const char *, integer *);
|
||||
|
||||
/* .. Scalar Arguments .. */
|
||||
/* .. */
|
||||
/* .. Array Arguments .. */
|
||||
/* .. */
|
||||
|
||||
/* Purpose */
|
||||
/* ======= */
|
||||
|
||||
/* DSPMV performs the matrix-vector operation */
|
||||
|
||||
/* y := alpha*A*x + beta*y, */
|
||||
|
||||
/* where alpha and beta are scalars, x and y are n element vectors and */
|
||||
/* A is an n by n symmetric matrix, supplied in packed form. */
|
||||
|
||||
/* Arguments */
|
||||
/* ========== */
|
||||
|
||||
/* UPLO - CHARACTER*1. */
|
||||
/* On entry, UPLO specifies whether the upper or lower */
|
||||
/* triangular part of the matrix A is supplied in the packed */
|
||||
/* array AP as follows: */
|
||||
|
||||
/* UPLO = 'U' or 'u' The upper triangular part of A is */
|
||||
/* supplied in AP. */
|
||||
|
||||
/* UPLO = 'L' or 'l' The lower triangular part of A is */
|
||||
/* supplied in AP. */
|
||||
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* N - INTEGER. */
|
||||
/* On entry, N specifies the order of the matrix A. */
|
||||
/* N must be at least zero. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* ALPHA - DOUBLE PRECISION. */
|
||||
/* On entry, ALPHA specifies the scalar alpha. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* AP - DOUBLE PRECISION array of DIMENSION at least */
|
||||
/* ( ( n*( n + 1 ) )/2 ). */
|
||||
/* Before entry with UPLO = 'U' or 'u', the array AP must */
|
||||
/* contain the upper triangular part of the symmetric matrix */
|
||||
/* packed sequentially, column by column, so that AP( 1 ) */
|
||||
/* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) */
|
||||
/* and a( 2, 2 ) respectively, and so on. */
|
||||
/* Before entry with UPLO = 'L' or 'l', the array AP must */
|
||||
/* contain the lower triangular part of the symmetric matrix */
|
||||
/* packed sequentially, column by column, so that AP( 1 ) */
|
||||
/* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) */
|
||||
/* and a( 3, 1 ) respectively, and so on. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* X - DOUBLE PRECISION array of dimension at least */
|
||||
/* ( 1 + ( n - 1 )*abs( INCX ) ). */
|
||||
/* Before entry, the incremented array X must contain the n */
|
||||
/* element vector x. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* INCX - INTEGER. */
|
||||
/* On entry, INCX specifies the increment for the elements of */
|
||||
/* X. INCX must not be zero. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* BETA - DOUBLE PRECISION. */
|
||||
/* On entry, BETA specifies the scalar beta. When BETA is */
|
||||
/* supplied as zero then Y need not be set on input. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* Y - DOUBLE PRECISION array of dimension at least */
|
||||
/* ( 1 + ( n - 1 )*abs( INCY ) ). */
|
||||
/* Before entry, the incremented array Y must contain the n */
|
||||
/* element vector y. On exit, Y is overwritten by the updated */
|
||||
/* vector y. */
|
||||
|
||||
/* INCY - INTEGER. */
|
||||
/* On entry, INCY specifies the increment for the elements of */
|
||||
/* Y. INCY must not be zero. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* Further Details */
|
||||
/* =============== */
|
||||
|
||||
/* Level 2 Blas routine. */
|
||||
|
||||
/* -- Written on 22-October-1986. */
|
||||
/* Jack Dongarra, Argonne National Lab. */
|
||||
/* Jeremy Du Croz, Nag Central Office. */
|
||||
/* Sven Hammarling, Nag Central Office. */
|
||||
/* Richard Hanson, Sandia National Labs. */
|
||||
|
||||
/* ===================================================================== */
|
||||
|
||||
/* .. Parameters .. */
|
||||
/* .. */
|
||||
/* .. Local Scalars .. */
|
||||
/* .. */
|
||||
/* .. External Functions .. */
|
||||
/* .. */
|
||||
/* .. External Subroutines .. */
|
||||
/* .. */
|
||||
|
||||
/* Test the input parameters. */
|
||||
|
||||
/* Parameter adjustments */
|
||||
--y;
|
||||
--x;
|
||||
--ap;
|
||||
|
||||
/* Function Body */
|
||||
info = 0;
|
||||
if (!lsame_(uplo, "U") && !lsame_(uplo, "L")) {
|
||||
info = 1;
|
||||
} else if (*n < 0) {
|
||||
info = 2;
|
||||
} else if (*incx == 0) {
|
||||
info = 6;
|
||||
} else if (*incy == 0) {
|
||||
info = 9;
|
||||
}
|
||||
if (info != 0) {
|
||||
xerbla_("DSPMV ", &info);
|
||||
return;
|
||||
}
|
||||
|
||||
/* Quick return if possible. */
|
||||
|
||||
if (*n == 0 || (*alpha == 0. && *beta == 1.)) {
|
||||
return;
|
||||
}
|
||||
|
||||
/* Set up the start points in X and Y. */
|
||||
|
||||
if (*incx > 0) {
|
||||
kx = 1;
|
||||
} else {
|
||||
kx = 1 - (*n - 1) * *incx;
|
||||
}
|
||||
if (*incy > 0) {
|
||||
ky = 1;
|
||||
} else {
|
||||
ky = 1 - (*n - 1) * *incy;
|
||||
}
|
||||
|
||||
/* Start the operations. In this version the elements of the array AP */
|
||||
/* are accessed sequentially with one pass through AP. */
|
||||
|
||||
/* First form y := beta*y. */
|
||||
|
||||
if (*beta != 1.) {
|
||||
if (*incy == 1) {
|
||||
if (*beta == 0.) {
|
||||
i__1 = *n;
|
||||
for (i__ = 1; i__ <= i__1; ++i__) {
|
||||
y[i__] = 0.;
|
||||
/* L10: */
|
||||
}
|
||||
} else {
|
||||
i__1 = *n;
|
||||
for (i__ = 1; i__ <= i__1; ++i__) {
|
||||
y[i__] = *beta * y[i__];
|
||||
/* L20: */
|
||||
}
|
||||
}
|
||||
} else {
|
||||
iy = ky;
|
||||
if (*beta == 0.) {
|
||||
i__1 = *n;
|
||||
for (i__ = 1; i__ <= i__1; ++i__) {
|
||||
y[iy] = 0.;
|
||||
iy += *incy;
|
||||
/* L30: */
|
||||
}
|
||||
} else {
|
||||
i__1 = *n;
|
||||
for (i__ = 1; i__ <= i__1; ++i__) {
|
||||
y[iy] = *beta * y[iy];
|
||||
iy += *incy;
|
||||
/* L40: */
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (*alpha == 0.) {
|
||||
return;
|
||||
}
|
||||
kk = 1;
|
||||
if (lsame_(uplo, "U")) {
|
||||
/* Form y when AP contains the upper triangle. */
|
||||
|
||||
if (*incx == 1 && *incy == 1) {
|
||||
i__1 = *n;
|
||||
for (j = 1; j <= i__1; ++j) {
|
||||
temp1 = *alpha * x[j];
|
||||
temp2 = 0.;
|
||||
k = kk;
|
||||
i__2 = j - 1;
|
||||
for (i__ = 1; i__ <= i__2; ++i__) {
|
||||
y[i__] += temp1 * ap[k];
|
||||
temp2 += ap[k] * x[i__];
|
||||
++k;
|
||||
/* L50: */
|
||||
}
|
||||
y[j] = y[j] + temp1 * ap[kk + j - 1] + *alpha * temp2;
|
||||
kk += j;
|
||||
/* L60: */
|
||||
}
|
||||
} else {
|
||||
jx = kx;
|
||||
jy = ky;
|
||||
i__1 = *n;
|
||||
for (j = 1; j <= i__1; ++j) {
|
||||
temp1 = *alpha * x[jx];
|
||||
temp2 = 0.;
|
||||
ix = kx;
|
||||
iy = ky;
|
||||
i__2 = kk + j - 2;
|
||||
for (k = kk; k <= i__2; ++k) {
|
||||
y[iy] += temp1 * ap[k];
|
||||
temp2 += ap[k] * x[ix];
|
||||
ix += *incx;
|
||||
iy += *incy;
|
||||
/* L70: */
|
||||
}
|
||||
y[jy] = y[jy] + temp1 * ap[kk + j - 1] + *alpha * temp2;
|
||||
jx += *incx;
|
||||
jy += *incy;
|
||||
kk += j;
|
||||
/* L80: */
|
||||
}
|
||||
}
|
||||
} else {
|
||||
/* Form y when AP contains the lower triangle. */
|
||||
|
||||
if (*incx == 1 && *incy == 1) {
|
||||
i__1 = *n;
|
||||
for (j = 1; j <= i__1; ++j) {
|
||||
temp1 = *alpha * x[j];
|
||||
temp2 = 0.;
|
||||
y[j] += temp1 * ap[kk];
|
||||
k = kk + 1;
|
||||
i__2 = *n;
|
||||
for (i__ = j + 1; i__ <= i__2; ++i__) {
|
||||
y[i__] += temp1 * ap[k];
|
||||
temp2 += ap[k] * x[i__];
|
||||
++k;
|
||||
/* L90: */
|
||||
}
|
||||
y[j] += *alpha * temp2;
|
||||
kk += *n - j + 1;
|
||||
/* L100: */
|
||||
}
|
||||
} else {
|
||||
jx = kx;
|
||||
jy = ky;
|
||||
i__1 = *n;
|
||||
for (j = 1; j <= i__1; ++j) {
|
||||
temp1 = *alpha * x[jx];
|
||||
temp2 = 0.;
|
||||
y[jy] += temp1 * ap[kk];
|
||||
ix = jx;
|
||||
iy = jy;
|
||||
i__2 = kk + *n - j;
|
||||
for (k = kk + 1; k <= i__2; ++k) {
|
||||
ix += *incx;
|
||||
iy += *incy;
|
||||
y[iy] += temp1 * ap[k];
|
||||
temp2 += ap[k] * x[ix];
|
||||
/* L110: */
|
||||
}
|
||||
y[jy] += *alpha * temp2;
|
||||
jx += *incx;
|
||||
jy += *incy;
|
||||
kk += *n - j + 1;
|
||||
/* L120: */
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* End of DSPMV . */
|
||||
|
||||
} /* dspmv_ */
|
||||
417
blas/f2c/dtbmv.c
Normal file
417
blas/f2c/dtbmv.c
Normal file
@@ -0,0 +1,417 @@
|
||||
/* dtbmv.f -- translated by f2c (version 20100827).
|
||||
You must link the resulting object file with libf2c:
|
||||
on Microsoft Windows system, link with libf2c.lib;
|
||||
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
|
||||
or, if you install libf2c.a in a standard place, with -lf2c -lm
|
||||
-- in that order, at the end of the command line, as in
|
||||
cc *.o -lf2c -lm
|
||||
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
|
||||
|
||||
http://www.netlib.org/f2c/libf2c.zip
|
||||
*/
|
||||
|
||||
#include "datatypes.h"
|
||||
|
||||
/* Subroutine */ void dtbmv_(char *uplo, char *trans, char *diag, integer *n, integer *k, doublereal *a, integer *lda,
|
||||
doublereal *x, integer *incx) {
|
||||
/* System generated locals */
|
||||
integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
|
||||
|
||||
/* Local variables */
|
||||
integer i__, j, l, ix, jx, kx, info;
|
||||
doublereal temp;
|
||||
extern logical lsame_(char *, char *);
|
||||
integer kplus1;
|
||||
extern /* Subroutine */ void xerbla_(const char *, integer *);
|
||||
logical nounit;
|
||||
|
||||
/* .. Scalar Arguments .. */
|
||||
/* .. */
|
||||
/* .. Array Arguments .. */
|
||||
/* .. */
|
||||
|
||||
/* Purpose */
|
||||
/* ======= */
|
||||
|
||||
/* DTBMV performs one of the matrix-vector operations */
|
||||
|
||||
/* x := A*x, or x := A'*x, */
|
||||
|
||||
/* where x is an n element vector and A is an n by n unit, or non-unit, */
|
||||
/* upper or lower triangular band matrix, with ( k + 1 ) diagonals. */
|
||||
|
||||
/* Arguments */
|
||||
/* ========== */
|
||||
|
||||
/* UPLO - CHARACTER*1. */
|
||||
/* On entry, UPLO specifies whether the matrix is an upper or */
|
||||
/* lower triangular matrix as follows: */
|
||||
|
||||
/* UPLO = 'U' or 'u' A is an upper triangular matrix. */
|
||||
|
||||
/* UPLO = 'L' or 'l' A is a lower triangular matrix. */
|
||||
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* TRANS - CHARACTER*1. */
|
||||
/* On entry, TRANS specifies the operation to be performed as */
|
||||
/* follows: */
|
||||
|
||||
/* TRANS = 'N' or 'n' x := A*x. */
|
||||
|
||||
/* TRANS = 'T' or 't' x := A'*x. */
|
||||
|
||||
/* TRANS = 'C' or 'c' x := A'*x. */
|
||||
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* DIAG - CHARACTER*1. */
|
||||
/* On entry, DIAG specifies whether or not A is unit */
|
||||
/* triangular as follows: */
|
||||
|
||||
/* DIAG = 'U' or 'u' A is assumed to be unit triangular. */
|
||||
|
||||
/* DIAG = 'N' or 'n' A is not assumed to be unit */
|
||||
/* triangular. */
|
||||
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* N - INTEGER. */
|
||||
/* On entry, N specifies the order of the matrix A. */
|
||||
/* N must be at least zero. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* K - INTEGER. */
|
||||
/* On entry with UPLO = 'U' or 'u', K specifies the number of */
|
||||
/* super-diagonals of the matrix A. */
|
||||
/* On entry with UPLO = 'L' or 'l', K specifies the number of */
|
||||
/* sub-diagonals of the matrix A. */
|
||||
/* K must satisfy 0 .le. K. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* A - DOUBLE PRECISION array of DIMENSION ( LDA, n ). */
|
||||
/* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */
|
||||
/* by n part of the array A must contain the upper triangular */
|
||||
/* band part of the matrix of coefficients, supplied column by */
|
||||
/* column, with the leading diagonal of the matrix in row */
|
||||
/* ( k + 1 ) of the array, the first super-diagonal starting at */
|
||||
/* position 2 in row k, and so on. The top left k by k triangle */
|
||||
/* of the array A is not referenced. */
|
||||
/* The following program segment will transfer an upper */
|
||||
/* triangular band matrix from conventional full matrix storage */
|
||||
/* to band storage: */
|
||||
|
||||
/* DO 20, J = 1, N */
|
||||
/* M = K + 1 - J */
|
||||
/* DO 10, I = MAX( 1, J - K ), J */
|
||||
/* A( M + I, J ) = matrix( I, J ) */
|
||||
/* 10 CONTINUE */
|
||||
/* 20 CONTINUE */
|
||||
|
||||
/* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */
|
||||
/* by n part of the array A must contain the lower triangular */
|
||||
/* band part of the matrix of coefficients, supplied column by */
|
||||
/* column, with the leading diagonal of the matrix in row 1 of */
|
||||
/* the array, the first sub-diagonal starting at position 1 in */
|
||||
/* row 2, and so on. The bottom right k by k triangle of the */
|
||||
/* array A is not referenced. */
|
||||
/* The following program segment will transfer a lower */
|
||||
/* triangular band matrix from conventional full matrix storage */
|
||||
/* to band storage: */
|
||||
|
||||
/* DO 20, J = 1, N */
|
||||
/* M = 1 - J */
|
||||
/* DO 10, I = J, MIN( N, J + K ) */
|
||||
/* A( M + I, J ) = matrix( I, J ) */
|
||||
/* 10 CONTINUE */
|
||||
/* 20 CONTINUE */
|
||||
|
||||
/* Note that when DIAG = 'U' or 'u' the elements of the array A */
|
||||
/* corresponding to the diagonal elements of the matrix are not */
|
||||
/* referenced, but are assumed to be unity. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* LDA - INTEGER. */
|
||||
/* On entry, LDA specifies the first dimension of A as declared */
|
||||
/* in the calling (sub) program. LDA must be at least */
|
||||
/* ( k + 1 ). */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* X - DOUBLE PRECISION array of dimension at least */
|
||||
/* ( 1 + ( n - 1 )*abs( INCX ) ). */
|
||||
/* Before entry, the incremented array X must contain the n */
|
||||
/* element vector x. On exit, X is overwritten with the */
|
||||
/* transformed vector x. */
|
||||
|
||||
/* INCX - INTEGER. */
|
||||
/* On entry, INCX specifies the increment for the elements of */
|
||||
/* X. INCX must not be zero. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* Further Details */
|
||||
/* =============== */
|
||||
|
||||
/* Level 2 Blas routine. */
|
||||
|
||||
/* -- Written on 22-October-1986. */
|
||||
/* Jack Dongarra, Argonne National Lab. */
|
||||
/* Jeremy Du Croz, Nag Central Office. */
|
||||
/* Sven Hammarling, Nag Central Office. */
|
||||
/* Richard Hanson, Sandia National Labs. */
|
||||
|
||||
/* ===================================================================== */
|
||||
|
||||
/* .. Parameters .. */
|
||||
/* .. */
|
||||
/* .. Local Scalars .. */
|
||||
/* .. */
|
||||
/* .. External Functions .. */
|
||||
/* .. */
|
||||
/* .. External Subroutines .. */
|
||||
/* .. */
|
||||
/* .. Intrinsic Functions .. */
|
||||
/* .. */
|
||||
|
||||
/* Test the input parameters. */
|
||||
|
||||
/* Parameter adjustments */
|
||||
a_dim1 = *lda;
|
||||
a_offset = 1 + a_dim1;
|
||||
a -= a_offset;
|
||||
--x;
|
||||
|
||||
/* Function Body */
|
||||
info = 0;
|
||||
if (!lsame_(uplo, "U") && !lsame_(uplo, "L")) {
|
||||
info = 1;
|
||||
} else if (!lsame_(trans, "N") && !lsame_(trans, "T") && !lsame_(trans, "C")) {
|
||||
info = 2;
|
||||
} else if (!lsame_(diag, "U") && !lsame_(diag, "N")) {
|
||||
info = 3;
|
||||
} else if (*n < 0) {
|
||||
info = 4;
|
||||
} else if (*k < 0) {
|
||||
info = 5;
|
||||
} else if (*lda < *k + 1) {
|
||||
info = 7;
|
||||
} else if (*incx == 0) {
|
||||
info = 9;
|
||||
}
|
||||
if (info != 0) {
|
||||
xerbla_("DTBMV ", &info);
|
||||
return;
|
||||
}
|
||||
|
||||
/* Quick return if possible. */
|
||||
|
||||
if (*n == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
nounit = lsame_(diag, "N");
|
||||
|
||||
/* Set up the start point in X if the increment is not unity. This */
|
||||
/* will be ( N - 1 )*INCX too small for descending loops. */
|
||||
|
||||
if (*incx <= 0) {
|
||||
kx = 1 - (*n - 1) * *incx;
|
||||
} else if (*incx != 1) {
|
||||
kx = 1;
|
||||
}
|
||||
|
||||
/* Start the operations. In this version the elements of A are */
|
||||
/* accessed sequentially with one pass through A. */
|
||||
|
||||
if (lsame_(trans, "N")) {
|
||||
/* Form x := A*x. */
|
||||
|
||||
if (lsame_(uplo, "U")) {
|
||||
kplus1 = *k + 1;
|
||||
if (*incx == 1) {
|
||||
i__1 = *n;
|
||||
for (j = 1; j <= i__1; ++j) {
|
||||
if (x[j] != 0.) {
|
||||
temp = x[j];
|
||||
l = kplus1 - j;
|
||||
/* Computing MAX */
|
||||
i__2 = 1, i__3 = j - *k;
|
||||
i__4 = j - 1;
|
||||
for (i__ = max(i__2, i__3); i__ <= i__4; ++i__) {
|
||||
x[i__] += temp * a[l + i__ + j * a_dim1];
|
||||
/* L10: */
|
||||
}
|
||||
if (nounit) {
|
||||
x[j] *= a[kplus1 + j * a_dim1];
|
||||
}
|
||||
}
|
||||
/* L20: */
|
||||
}
|
||||
} else {
|
||||
jx = kx;
|
||||
i__1 = *n;
|
||||
for (j = 1; j <= i__1; ++j) {
|
||||
if (x[jx] != 0.) {
|
||||
temp = x[jx];
|
||||
ix = kx;
|
||||
l = kplus1 - j;
|
||||
/* Computing MAX */
|
||||
i__4 = 1, i__2 = j - *k;
|
||||
i__3 = j - 1;
|
||||
for (i__ = max(i__4, i__2); i__ <= i__3; ++i__) {
|
||||
x[ix] += temp * a[l + i__ + j * a_dim1];
|
||||
ix += *incx;
|
||||
/* L30: */
|
||||
}
|
||||
if (nounit) {
|
||||
x[jx] *= a[kplus1 + j * a_dim1];
|
||||
}
|
||||
}
|
||||
jx += *incx;
|
||||
if (j > *k) {
|
||||
kx += *incx;
|
||||
}
|
||||
/* L40: */
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if (*incx == 1) {
|
||||
for (j = *n; j >= 1; --j) {
|
||||
if (x[j] != 0.) {
|
||||
temp = x[j];
|
||||
l = 1 - j;
|
||||
/* Computing MIN */
|
||||
i__1 = *n, i__3 = j + *k;
|
||||
i__4 = j + 1;
|
||||
for (i__ = min(i__1, i__3); i__ >= i__4; --i__) {
|
||||
x[i__] += temp * a[l + i__ + j * a_dim1];
|
||||
/* L50: */
|
||||
}
|
||||
if (nounit) {
|
||||
x[j] *= a[j * a_dim1 + 1];
|
||||
}
|
||||
}
|
||||
/* L60: */
|
||||
}
|
||||
} else {
|
||||
kx += (*n - 1) * *incx;
|
||||
jx = kx;
|
||||
for (j = *n; j >= 1; --j) {
|
||||
if (x[jx] != 0.) {
|
||||
temp = x[jx];
|
||||
ix = kx;
|
||||
l = 1 - j;
|
||||
/* Computing MIN */
|
||||
i__4 = *n, i__1 = j + *k;
|
||||
i__3 = j + 1;
|
||||
for (i__ = min(i__4, i__1); i__ >= i__3; --i__) {
|
||||
x[ix] += temp * a[l + i__ + j * a_dim1];
|
||||
ix -= *incx;
|
||||
/* L70: */
|
||||
}
|
||||
if (nounit) {
|
||||
x[jx] *= a[j * a_dim1 + 1];
|
||||
}
|
||||
}
|
||||
jx -= *incx;
|
||||
if (*n - j >= *k) {
|
||||
kx -= *incx;
|
||||
}
|
||||
/* L80: */
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
/* Form x := A'*x. */
|
||||
|
||||
if (lsame_(uplo, "U")) {
|
||||
kplus1 = *k + 1;
|
||||
if (*incx == 1) {
|
||||
for (j = *n; j >= 1; --j) {
|
||||
temp = x[j];
|
||||
l = kplus1 - j;
|
||||
if (nounit) {
|
||||
temp *= a[kplus1 + j * a_dim1];
|
||||
}
|
||||
/* Computing MAX */
|
||||
i__4 = 1, i__1 = j - *k;
|
||||
i__3 = max(i__4, i__1);
|
||||
for (i__ = j - 1; i__ >= i__3; --i__) {
|
||||
temp += a[l + i__ + j * a_dim1] * x[i__];
|
||||
/* L90: */
|
||||
}
|
||||
x[j] = temp;
|
||||
/* L100: */
|
||||
}
|
||||
} else {
|
||||
kx += (*n - 1) * *incx;
|
||||
jx = kx;
|
||||
for (j = *n; j >= 1; --j) {
|
||||
temp = x[jx];
|
||||
kx -= *incx;
|
||||
ix = kx;
|
||||
l = kplus1 - j;
|
||||
if (nounit) {
|
||||
temp *= a[kplus1 + j * a_dim1];
|
||||
}
|
||||
/* Computing MAX */
|
||||
i__4 = 1, i__1 = j - *k;
|
||||
i__3 = max(i__4, i__1);
|
||||
for (i__ = j - 1; i__ >= i__3; --i__) {
|
||||
temp += a[l + i__ + j * a_dim1] * x[ix];
|
||||
ix -= *incx;
|
||||
/* L110: */
|
||||
}
|
||||
x[jx] = temp;
|
||||
jx -= *incx;
|
||||
/* L120: */
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if (*incx == 1) {
|
||||
i__3 = *n;
|
||||
for (j = 1; j <= i__3; ++j) {
|
||||
temp = x[j];
|
||||
l = 1 - j;
|
||||
if (nounit) {
|
||||
temp *= a[j * a_dim1 + 1];
|
||||
}
|
||||
/* Computing MIN */
|
||||
i__1 = *n, i__2 = j + *k;
|
||||
i__4 = min(i__1, i__2);
|
||||
for (i__ = j + 1; i__ <= i__4; ++i__) {
|
||||
temp += a[l + i__ + j * a_dim1] * x[i__];
|
||||
/* L130: */
|
||||
}
|
||||
x[j] = temp;
|
||||
/* L140: */
|
||||
}
|
||||
} else {
|
||||
jx = kx;
|
||||
i__3 = *n;
|
||||
for (j = 1; j <= i__3; ++j) {
|
||||
temp = x[jx];
|
||||
kx += *incx;
|
||||
ix = kx;
|
||||
l = 1 - j;
|
||||
if (nounit) {
|
||||
temp *= a[j * a_dim1 + 1];
|
||||
}
|
||||
/* Computing MIN */
|
||||
i__1 = *n, i__2 = j + *k;
|
||||
i__4 = min(i__1, i__2);
|
||||
for (i__ = j + 1; i__ <= i__4; ++i__) {
|
||||
temp += a[l + i__ + j * a_dim1] * x[ix];
|
||||
ix += *incx;
|
||||
/* L150: */
|
||||
}
|
||||
x[jx] = temp;
|
||||
jx += *incx;
|
||||
/* L160: */
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* End of DTBMV . */
|
||||
|
||||
} /* dtbmv_ */
|
||||
109
blas/f2c/lsame.c
Normal file
109
blas/f2c/lsame.c
Normal file
@@ -0,0 +1,109 @@
|
||||
/* lsame.f -- translated by f2c (version 20100827).
|
||||
You must link the resulting object file with libf2c:
|
||||
on Microsoft Windows system, link with libf2c.lib;
|
||||
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
|
||||
or, if you install libf2c.a in a standard place, with -lf2c -lm
|
||||
-- in that order, at the end of the command line, as in
|
||||
cc *.o -lf2c -lm
|
||||
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
|
||||
|
||||
http://www.netlib.org/f2c/libf2c.zip
|
||||
*/
|
||||
|
||||
#include "datatypes.h"
|
||||
|
||||
logical lsame_(char *ca, char *cb) {
|
||||
/* System generated locals */
|
||||
logical ret_val;
|
||||
|
||||
/* Local variables */
|
||||
integer inta, intb, zcode;
|
||||
|
||||
/* -- LAPACK auxiliary routine (version 3.1) -- */
|
||||
/* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. */
|
||||
/* November 2006 */
|
||||
|
||||
/* .. Scalar Arguments .. */
|
||||
/* .. */
|
||||
|
||||
/* Purpose */
|
||||
/* ======= */
|
||||
|
||||
/* LSAME returns .TRUE. if CA is the same letter as CB regardless of */
|
||||
/* case. */
|
||||
|
||||
/* Arguments */
|
||||
/* ========= */
|
||||
|
||||
/* CA (input) CHARACTER*1 */
|
||||
|
||||
/* CB (input) CHARACTER*1 */
|
||||
/* CA and CB specify the single characters to be compared. */
|
||||
|
||||
/* ===================================================================== */
|
||||
|
||||
/* .. Intrinsic Functions .. */
|
||||
/* .. */
|
||||
/* .. Local Scalars .. */
|
||||
/* .. */
|
||||
|
||||
/* Test if the characters are equal */
|
||||
|
||||
ret_val = *(unsigned char *)ca == *(unsigned char *)cb;
|
||||
if (ret_val) {
|
||||
return ret_val;
|
||||
}
|
||||
|
||||
/* Now test for equivalence if both characters are alphabetic. */
|
||||
|
||||
zcode = 'Z';
|
||||
|
||||
/* Use 'Z' rather than 'A' so that ASCII can be detected on Prime */
|
||||
/* machines, on which ICHAR returns a value with bit 8 set. */
|
||||
/* ICHAR('A') on Prime machines returns 193 which is the same as */
|
||||
/* ICHAR('A') on an EBCDIC machine. */
|
||||
|
||||
inta = *(unsigned char *)ca;
|
||||
intb = *(unsigned char *)cb;
|
||||
|
||||
if (zcode == 90 || zcode == 122) {
|
||||
/* ASCII is assumed - ZCODE is the ASCII code of either lower or */
|
||||
/* upper case 'Z'. */
|
||||
|
||||
if (inta >= 97 && inta <= 122) {
|
||||
inta += -32;
|
||||
}
|
||||
if (intb >= 97 && intb <= 122) {
|
||||
intb += -32;
|
||||
}
|
||||
|
||||
} else if (zcode == 233 || zcode == 169) {
|
||||
/* EBCDIC is assumed - ZCODE is the EBCDIC code of either lower or */
|
||||
/* upper case 'Z'. */
|
||||
|
||||
if ((inta >= 129 && inta <= 137) || (inta >= 145 && inta <= 153) || (inta >= 162 && inta <= 169)) {
|
||||
inta += 64;
|
||||
}
|
||||
if ((intb >= 129 && intb <= 137) || (intb >= 145 && intb <= 153) || (intb >= 162 && intb <= 169)) {
|
||||
intb += 64;
|
||||
}
|
||||
|
||||
} else if (zcode == 218 || zcode == 250) {
|
||||
/* ASCII is assumed, on Prime machines - ZCODE is the ASCII code */
|
||||
/* plus 128 of either lower or upper case 'Z'. */
|
||||
|
||||
if (inta >= 225 && inta <= 250) {
|
||||
inta += -32;
|
||||
}
|
||||
if (intb >= 225 && intb <= 250) {
|
||||
intb += -32;
|
||||
}
|
||||
}
|
||||
ret_val = inta == intb;
|
||||
|
||||
/* RETURN */
|
||||
|
||||
/* End of LSAME */
|
||||
|
||||
return ret_val;
|
||||
} /* lsame_ */
|
||||
212
blas/f2c/srotm.c
Normal file
212
blas/f2c/srotm.c
Normal file
@@ -0,0 +1,212 @@
|
||||
/* srotm.f -- translated by f2c (version 20100827).
|
||||
You must link the resulting object file with libf2c:
|
||||
on Microsoft Windows system, link with libf2c.lib;
|
||||
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
|
||||
or, if you install libf2c.a in a standard place, with -lf2c -lm
|
||||
-- in that order, at the end of the command line, as in
|
||||
cc *.o -lf2c -lm
|
||||
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
|
||||
|
||||
http://www.netlib.org/f2c/libf2c.zip
|
||||
*/
|
||||
|
||||
#include "datatypes.h"
|
||||
|
||||
/* Subroutine */ void srotm_(integer *n, real *sx, integer *incx, real *sy, integer *incy, real *sparam) {
|
||||
/* Initialized data */
|
||||
|
||||
static real zero = 0.f;
|
||||
static real two = 2.f;
|
||||
|
||||
/* System generated locals */
|
||||
integer i__1, i__2;
|
||||
|
||||
/* Local variables */
|
||||
integer i__;
|
||||
real w, z__;
|
||||
integer kx, ky;
|
||||
real sh11, sh12, sh21, sh22, sflag;
|
||||
integer nsteps;
|
||||
|
||||
/* .. Scalar Arguments .. */
|
||||
/* .. */
|
||||
/* .. Array Arguments .. */
|
||||
/* .. */
|
||||
|
||||
/* Purpose */
|
||||
/* ======= */
|
||||
|
||||
/* APPLY THE MODIFIED GIVENS TRANSFORMATION, H, TO THE 2 BY N MATRIX */
|
||||
|
||||
/* (SX**T) , WHERE **T INDICATES TRANSPOSE. THE ELEMENTS OF SX ARE IN */
|
||||
/* (DX**T) */
|
||||
|
||||
/* SX(LX+I*INCX), I = 0 TO N-1, WHERE LX = 1 IF INCX .GE. 0, ELSE */
|
||||
/* LX = (-INCX)*N, AND SIMILARLY FOR SY USING USING LY AND INCY. */
|
||||
/* WITH SPARAM(1)=SFLAG, H HAS ONE OF THE FOLLOWING FORMS.. */
|
||||
|
||||
/* SFLAG=-1.E0 SFLAG=0.E0 SFLAG=1.E0 SFLAG=-2.E0 */
|
||||
|
||||
/* (SH11 SH12) (1.E0 SH12) (SH11 1.E0) (1.E0 0.E0) */
|
||||
/* H=( ) ( ) ( ) ( ) */
|
||||
/* (SH21 SH22), (SH21 1.E0), (-1.E0 SH22), (0.E0 1.E0). */
|
||||
/* SEE SROTMG FOR A DESCRIPTION OF DATA STORAGE IN SPARAM. */
|
||||
|
||||
/* Arguments */
|
||||
/* ========= */
|
||||
|
||||
/* N (input) INTEGER */
|
||||
/* number of elements in input vector(s) */
|
||||
|
||||
/* SX (input/output) REAL array, dimension N */
|
||||
/* double precision vector with N elements */
|
||||
|
||||
/* INCX (input) INTEGER */
|
||||
/* storage spacing between elements of SX */
|
||||
|
||||
/* SY (input/output) REAL array, dimension N */
|
||||
/* double precision vector with N elements */
|
||||
|
||||
/* INCY (input) INTEGER */
|
||||
/* storage spacing between elements of SY */
|
||||
|
||||
/* SPARAM (input/output) REAL array, dimension 5 */
|
||||
/* SPARAM(1)=SFLAG */
|
||||
/* SPARAM(2)=SH11 */
|
||||
/* SPARAM(3)=SH21 */
|
||||
/* SPARAM(4)=SH12 */
|
||||
/* SPARAM(5)=SH22 */
|
||||
|
||||
/* ===================================================================== */
|
||||
|
||||
/* .. Local Scalars .. */
|
||||
/* .. */
|
||||
/* .. Data statements .. */
|
||||
/* Parameter adjustments */
|
||||
--sparam;
|
||||
--sy;
|
||||
--sx;
|
||||
|
||||
/* Function Body */
|
||||
/* .. */
|
||||
|
||||
sflag = sparam[1];
|
||||
if (*n <= 0 || sflag + two == zero) {
|
||||
goto L140;
|
||||
}
|
||||
if (!(*incx == *incy && *incx > 0)) {
|
||||
goto L70;
|
||||
}
|
||||
|
||||
nsteps = *n * *incx;
|
||||
if (sflag < 0.f) {
|
||||
goto L50;
|
||||
} else if (sflag == 0) {
|
||||
goto L10;
|
||||
} else {
|
||||
goto L30;
|
||||
}
|
||||
L10:
|
||||
sh12 = sparam[4];
|
||||
sh21 = sparam[3];
|
||||
i__1 = nsteps;
|
||||
i__2 = *incx;
|
||||
for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
|
||||
w = sx[i__];
|
||||
z__ = sy[i__];
|
||||
sx[i__] = w + z__ * sh12;
|
||||
sy[i__] = w * sh21 + z__;
|
||||
/* L20: */
|
||||
}
|
||||
goto L140;
|
||||
L30:
|
||||
sh11 = sparam[2];
|
||||
sh22 = sparam[5];
|
||||
i__2 = nsteps;
|
||||
i__1 = *incx;
|
||||
for (i__ = 1; i__1 < 0 ? i__ >= i__2 : i__ <= i__2; i__ += i__1) {
|
||||
w = sx[i__];
|
||||
z__ = sy[i__];
|
||||
sx[i__] = w * sh11 + z__;
|
||||
sy[i__] = -w + sh22 * z__;
|
||||
/* L40: */
|
||||
}
|
||||
goto L140;
|
||||
L50:
|
||||
sh11 = sparam[2];
|
||||
sh12 = sparam[4];
|
||||
sh21 = sparam[3];
|
||||
sh22 = sparam[5];
|
||||
i__1 = nsteps;
|
||||
i__2 = *incx;
|
||||
for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
|
||||
w = sx[i__];
|
||||
z__ = sy[i__];
|
||||
sx[i__] = w * sh11 + z__ * sh12;
|
||||
sy[i__] = w * sh21 + z__ * sh22;
|
||||
/* L60: */
|
||||
}
|
||||
goto L140;
|
||||
L70:
|
||||
kx = 1;
|
||||
ky = 1;
|
||||
if (*incx < 0) {
|
||||
kx = (1 - *n) * *incx + 1;
|
||||
}
|
||||
if (*incy < 0) {
|
||||
ky = (1 - *n) * *incy + 1;
|
||||
}
|
||||
|
||||
if (sflag < 0.f) {
|
||||
goto L120;
|
||||
} else if (sflag == 0) {
|
||||
goto L80;
|
||||
} else {
|
||||
goto L100;
|
||||
}
|
||||
L80:
|
||||
sh12 = sparam[4];
|
||||
sh21 = sparam[3];
|
||||
i__2 = *n;
|
||||
for (i__ = 1; i__ <= i__2; ++i__) {
|
||||
w = sx[kx];
|
||||
z__ = sy[ky];
|
||||
sx[kx] = w + z__ * sh12;
|
||||
sy[ky] = w * sh21 + z__;
|
||||
kx += *incx;
|
||||
ky += *incy;
|
||||
/* L90: */
|
||||
}
|
||||
goto L140;
|
||||
L100:
|
||||
sh11 = sparam[2];
|
||||
sh22 = sparam[5];
|
||||
i__2 = *n;
|
||||
for (i__ = 1; i__ <= i__2; ++i__) {
|
||||
w = sx[kx];
|
||||
z__ = sy[ky];
|
||||
sx[kx] = w * sh11 + z__;
|
||||
sy[ky] = -w + sh22 * z__;
|
||||
kx += *incx;
|
||||
ky += *incy;
|
||||
/* L110: */
|
||||
}
|
||||
goto L140;
|
||||
L120:
|
||||
sh11 = sparam[2];
|
||||
sh12 = sparam[4];
|
||||
sh21 = sparam[3];
|
||||
sh22 = sparam[5];
|
||||
i__2 = *n;
|
||||
for (i__ = 1; i__ <= i__2; ++i__) {
|
||||
w = sx[kx];
|
||||
z__ = sy[ky];
|
||||
sx[kx] = w * sh11 + z__ * sh12;
|
||||
sy[ky] = w * sh21 + z__ * sh22;
|
||||
kx += *incx;
|
||||
ky += *incy;
|
||||
/* L130: */
|
||||
}
|
||||
L140:
|
||||
return;
|
||||
} /* srotm_ */
|
||||
293
blas/f2c/srotmg.c
Normal file
293
blas/f2c/srotmg.c
Normal file
@@ -0,0 +1,293 @@
|
||||
/* srotmg.f -- translated by f2c (version 20100827).
|
||||
You must link the resulting object file with libf2c:
|
||||
on Microsoft Windows system, link with libf2c.lib;
|
||||
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
|
||||
or, if you install libf2c.a in a standard place, with -lf2c -lm
|
||||
-- in that order, at the end of the command line, as in
|
||||
cc *.o -lf2c -lm
|
||||
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
|
||||
|
||||
http://www.netlib.org/f2c/libf2c.zip
|
||||
*/
|
||||
|
||||
#include "datatypes.h"
|
||||
|
||||
/* Subroutine */ void srotmg_(real *sd1, real *sd2, real *sx1, real *sy1, real *sparam) {
|
||||
/* Initialized data */
|
||||
|
||||
static real zero = 0.f;
|
||||
static real one = 1.f;
|
||||
static real two = 2.f;
|
||||
static real gam = 4096.f;
|
||||
static real gamsq = 16777200.f;
|
||||
static real rgamsq = 5.96046e-8f;
|
||||
|
||||
/* Format strings */
|
||||
static char fmt_120[] = "";
|
||||
static char fmt_150[] = "";
|
||||
static char fmt_180[] = "";
|
||||
static char fmt_210[] = "";
|
||||
|
||||
/* System generated locals */
|
||||
real r__1;
|
||||
|
||||
/* Local variables */
|
||||
real su, sp1, sp2, sq1, sq2, sh11, sh12, sh21, sh22;
|
||||
integer igo;
|
||||
real sflag, stemp;
|
||||
|
||||
/* Assigned format variables */
|
||||
static char *igo_fmt;
|
||||
(void)igo_fmt;
|
||||
|
||||
/* .. Scalar Arguments .. */
|
||||
/* .. */
|
||||
/* .. Array Arguments .. */
|
||||
/* .. */
|
||||
|
||||
/* Purpose */
|
||||
/* ======= */
|
||||
|
||||
/* CONSTRUCT THE MODIFIED GIVENS TRANSFORMATION MATRIX H WHICH ZEROS */
|
||||
/* THE SECOND COMPONENT OF THE 2-VECTOR (SQRT(SD1)*SX1,SQRT(SD2)* */
|
||||
/* SY2)**T. */
|
||||
/* WITH SPARAM(1)=SFLAG, H HAS ONE OF THE FOLLOWING FORMS.. */
|
||||
|
||||
/* SFLAG=-1.E0 SFLAG=0.E0 SFLAG=1.E0 SFLAG=-2.E0 */
|
||||
|
||||
/* (SH11 SH12) (1.E0 SH12) (SH11 1.E0) (1.E0 0.E0) */
|
||||
/* H=( ) ( ) ( ) ( ) */
|
||||
/* (SH21 SH22), (SH21 1.E0), (-1.E0 SH22), (0.E0 1.E0). */
|
||||
/* LOCATIONS 2-4 OF SPARAM CONTAIN SH11,SH21,SH12, AND SH22 */
|
||||
/* RESPECTIVELY. (VALUES OF 1.E0, -1.E0, OR 0.E0 IMPLIED BY THE */
|
||||
/* VALUE OF SPARAM(1) ARE NOT STORED IN SPARAM.) */
|
||||
|
||||
/* THE VALUES OF GAMSQ AND RGAMSQ SET IN THE DATA STATEMENT MAY BE */
|
||||
/* INEXACT. THIS IS OK AS THEY ARE ONLY USED FOR TESTING THE SIZE */
|
||||
/* OF SD1 AND SD2. ALL ACTUAL SCALING OF DATA IS DONE USING GAM. */
|
||||
|
||||
/* Arguments */
|
||||
/* ========= */
|
||||
|
||||
/* SD1 (input/output) REAL */
|
||||
|
||||
/* SD2 (input/output) REAL */
|
||||
|
||||
/* SX1 (input/output) REAL */
|
||||
|
||||
/* SY1 (input) REAL */
|
||||
|
||||
/* SPARAM (input/output) REAL array, dimension 5 */
|
||||
/* SPARAM(1)=SFLAG */
|
||||
/* SPARAM(2)=SH11 */
|
||||
/* SPARAM(3)=SH21 */
|
||||
/* SPARAM(4)=SH12 */
|
||||
/* SPARAM(5)=SH22 */
|
||||
|
||||
/* ===================================================================== */
|
||||
|
||||
/* .. Local Scalars .. */
|
||||
/* .. */
|
||||
/* .. Intrinsic Functions .. */
|
||||
/* .. */
|
||||
/* .. Data statements .. */
|
||||
|
||||
/* Parameter adjustments */
|
||||
--sparam;
|
||||
|
||||
/* Function Body */
|
||||
/* .. */
|
||||
if (!(*sd1 < zero)) {
|
||||
goto L10;
|
||||
}
|
||||
/* GO ZERO-H-D-AND-SX1.. */
|
||||
goto L60;
|
||||
L10:
|
||||
/* CASE-SD1-NONNEGATIVE */
|
||||
sp2 = *sd2 * *sy1;
|
||||
if (!(sp2 == zero)) {
|
||||
goto L20;
|
||||
}
|
||||
sflag = -two;
|
||||
goto L260;
|
||||
/* REGULAR-CASE.. */
|
||||
L20:
|
||||
sp1 = *sd1 * *sx1;
|
||||
sq2 = sp2 * *sy1;
|
||||
sq1 = sp1 * *sx1;
|
||||
|
||||
if (!(dabs(sq1) > dabs(sq2))) {
|
||||
goto L40;
|
||||
}
|
||||
sh21 = -(*sy1) / *sx1;
|
||||
sh12 = sp2 / sp1;
|
||||
|
||||
su = one - sh12 * sh21;
|
||||
|
||||
if (!(su <= zero)) {
|
||||
goto L30;
|
||||
}
|
||||
/* GO ZERO-H-D-AND-SX1.. */
|
||||
goto L60;
|
||||
L30:
|
||||
sflag = zero;
|
||||
*sd1 /= su;
|
||||
*sd2 /= su;
|
||||
*sx1 *= su;
|
||||
/* GO SCALE-CHECK.. */
|
||||
goto L100;
|
||||
L40:
|
||||
if (!(sq2 < zero)) {
|
||||
goto L50;
|
||||
}
|
||||
/* GO ZERO-H-D-AND-SX1.. */
|
||||
goto L60;
|
||||
L50:
|
||||
sflag = one;
|
||||
sh11 = sp1 / sp2;
|
||||
sh22 = *sx1 / *sy1;
|
||||
su = one + sh11 * sh22;
|
||||
stemp = *sd2 / su;
|
||||
*sd2 = *sd1 / su;
|
||||
*sd1 = stemp;
|
||||
*sx1 = *sy1 * su;
|
||||
/* GO SCALE-CHECK */
|
||||
goto L100;
|
||||
/* PROCEDURE..ZERO-H-D-AND-SX1.. */
|
||||
L60:
|
||||
sflag = -one;
|
||||
sh11 = zero;
|
||||
sh12 = zero;
|
||||
sh21 = zero;
|
||||
sh22 = zero;
|
||||
|
||||
*sd1 = zero;
|
||||
*sd2 = zero;
|
||||
*sx1 = zero;
|
||||
/* RETURN.. */
|
||||
goto L220;
|
||||
/* PROCEDURE..FIX-H.. */
|
||||
L70:
|
||||
if (!(sflag >= zero)) {
|
||||
goto L90;
|
||||
}
|
||||
|
||||
if (!(sflag == zero)) {
|
||||
goto L80;
|
||||
}
|
||||
sh11 = one;
|
||||
sh22 = one;
|
||||
sflag = -one;
|
||||
goto L90;
|
||||
L80:
|
||||
sh21 = -one;
|
||||
sh12 = one;
|
||||
sflag = -one;
|
||||
L90:
|
||||
switch (igo) {
|
||||
case 0:
|
||||
goto L120;
|
||||
case 1:
|
||||
goto L150;
|
||||
case 2:
|
||||
goto L180;
|
||||
case 3:
|
||||
goto L210;
|
||||
}
|
||||
/* PROCEDURE..SCALE-CHECK */
|
||||
L100:
|
||||
L110:
|
||||
if (!(*sd1 <= rgamsq)) {
|
||||
goto L130;
|
||||
}
|
||||
if (*sd1 == zero) {
|
||||
goto L160;
|
||||
}
|
||||
igo = 0;
|
||||
igo_fmt = fmt_120;
|
||||
/* FIX-H.. */
|
||||
goto L70;
|
||||
L120:
|
||||
/* Computing 2nd power */
|
||||
r__1 = gam;
|
||||
*sd1 *= r__1 * r__1;
|
||||
*sx1 /= gam;
|
||||
sh11 /= gam;
|
||||
sh12 /= gam;
|
||||
goto L110;
|
||||
L130:
|
||||
L140:
|
||||
if (!(*sd1 >= gamsq)) {
|
||||
goto L160;
|
||||
}
|
||||
igo = 1;
|
||||
igo_fmt = fmt_150;
|
||||
/* FIX-H.. */
|
||||
goto L70;
|
||||
L150:
|
||||
/* Computing 2nd power */
|
||||
r__1 = gam;
|
||||
*sd1 /= r__1 * r__1;
|
||||
*sx1 *= gam;
|
||||
sh11 *= gam;
|
||||
sh12 *= gam;
|
||||
goto L140;
|
||||
L160:
|
||||
L170:
|
||||
if (!(dabs(*sd2) <= rgamsq)) {
|
||||
goto L190;
|
||||
}
|
||||
if (*sd2 == zero) {
|
||||
goto L220;
|
||||
}
|
||||
igo = 2;
|
||||
igo_fmt = fmt_180;
|
||||
/* FIX-H.. */
|
||||
goto L70;
|
||||
L180:
|
||||
/* Computing 2nd power */
|
||||
r__1 = gam;
|
||||
*sd2 *= r__1 * r__1;
|
||||
sh21 /= gam;
|
||||
sh22 /= gam;
|
||||
goto L170;
|
||||
L190:
|
||||
L200:
|
||||
if (!(dabs(*sd2) >= gamsq)) {
|
||||
goto L220;
|
||||
}
|
||||
igo = 3;
|
||||
igo_fmt = fmt_210;
|
||||
/* FIX-H.. */
|
||||
goto L70;
|
||||
L210:
|
||||
/* Computing 2nd power */
|
||||
r__1 = gam;
|
||||
*sd2 /= r__1 * r__1;
|
||||
sh21 *= gam;
|
||||
sh22 *= gam;
|
||||
goto L200;
|
||||
L220:
|
||||
if (sflag < 0.f) {
|
||||
goto L250;
|
||||
} else if (sflag == 0) {
|
||||
goto L230;
|
||||
} else {
|
||||
goto L240;
|
||||
}
|
||||
L230:
|
||||
sparam[3] = sh21;
|
||||
sparam[4] = sh12;
|
||||
goto L260;
|
||||
L240:
|
||||
sparam[2] = sh11;
|
||||
sparam[5] = sh22;
|
||||
goto L260;
|
||||
L250:
|
||||
sparam[2] = sh11;
|
||||
sparam[3] = sh21;
|
||||
sparam[4] = sh12;
|
||||
sparam[5] = sh22;
|
||||
L260:
|
||||
sparam[1] = sflag;
|
||||
} /* srotmg_ */
|
||||
359
blas/f2c/ssbmv.c
Normal file
359
blas/f2c/ssbmv.c
Normal file
@@ -0,0 +1,359 @@
|
||||
/* ssbmv.f -- translated by f2c (version 20100827).
|
||||
You must link the resulting object file with libf2c:
|
||||
on Microsoft Windows system, link with libf2c.lib;
|
||||
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
|
||||
or, if you install libf2c.a in a standard place, with -lf2c -lm
|
||||
-- in that order, at the end of the command line, as in
|
||||
cc *.o -lf2c -lm
|
||||
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
|
||||
|
||||
http://www.netlib.org/f2c/libf2c.zip
|
||||
*/
|
||||
|
||||
#include "datatypes.h"
|
||||
|
||||
/* Subroutine */ void ssbmv_(char *uplo, integer *n, integer *k, real *alpha, real *a, integer *lda, real *x,
|
||||
integer *incx, real *beta, real *y, integer *incy) {
|
||||
/* System generated locals */
|
||||
integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
|
||||
|
||||
/* Local variables */
|
||||
integer i__, j, l, ix, iy, jx, jy, kx, ky, info;
|
||||
real temp1, temp2;
|
||||
extern logical lsame_(char *, char *);
|
||||
integer kplus1;
|
||||
extern /* Subroutine */ void xerbla_(const char *, integer *);
|
||||
|
||||
/* .. Scalar Arguments .. */
|
||||
/* .. */
|
||||
/* .. Array Arguments .. */
|
||||
/* .. */
|
||||
|
||||
/* Purpose */
|
||||
/* ======= */
|
||||
|
||||
/* SSBMV performs the matrix-vector operation */
|
||||
|
||||
/* y := alpha*A*x + beta*y, */
|
||||
|
||||
/* where alpha and beta are scalars, x and y are n element vectors and */
|
||||
/* A is an n by n symmetric band matrix, with k super-diagonals. */
|
||||
|
||||
/* Arguments */
|
||||
/* ========== */
|
||||
|
||||
/* UPLO - CHARACTER*1. */
|
||||
/* On entry, UPLO specifies whether the upper or lower */
|
||||
/* triangular part of the band matrix A is being supplied as */
|
||||
/* follows: */
|
||||
|
||||
/* UPLO = 'U' or 'u' The upper triangular part of A is */
|
||||
/* being supplied. */
|
||||
|
||||
/* UPLO = 'L' or 'l' The lower triangular part of A is */
|
||||
/* being supplied. */
|
||||
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* N - INTEGER. */
|
||||
/* On entry, N specifies the order of the matrix A. */
|
||||
/* N must be at least zero. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* K - INTEGER. */
|
||||
/* On entry, K specifies the number of super-diagonals of the */
|
||||
/* matrix A. K must satisfy 0 .le. K. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* ALPHA - REAL . */
|
||||
/* On entry, ALPHA specifies the scalar alpha. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* A - REAL array of DIMENSION ( LDA, n ). */
|
||||
/* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */
|
||||
/* by n part of the array A must contain the upper triangular */
|
||||
/* band part of the symmetric matrix, supplied column by */
|
||||
/* column, with the leading diagonal of the matrix in row */
|
||||
/* ( k + 1 ) of the array, the first super-diagonal starting at */
|
||||
/* position 2 in row k, and so on. The top left k by k triangle */
|
||||
/* of the array A is not referenced. */
|
||||
/* The following program segment will transfer the upper */
|
||||
/* triangular part of a symmetric band matrix from conventional */
|
||||
/* full matrix storage to band storage: */
|
||||
|
||||
/* DO 20, J = 1, N */
|
||||
/* M = K + 1 - J */
|
||||
/* DO 10, I = MAX( 1, J - K ), J */
|
||||
/* A( M + I, J ) = matrix( I, J ) */
|
||||
/* 10 CONTINUE */
|
||||
/* 20 CONTINUE */
|
||||
|
||||
/* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */
|
||||
/* by n part of the array A must contain the lower triangular */
|
||||
/* band part of the symmetric matrix, supplied column by */
|
||||
/* column, with the leading diagonal of the matrix in row 1 of */
|
||||
/* the array, the first sub-diagonal starting at position 1 in */
|
||||
/* row 2, and so on. The bottom right k by k triangle of the */
|
||||
/* array A is not referenced. */
|
||||
/* The following program segment will transfer the lower */
|
||||
/* triangular part of a symmetric band matrix from conventional */
|
||||
/* full matrix storage to band storage: */
|
||||
|
||||
/* DO 20, J = 1, N */
|
||||
/* M = 1 - J */
|
||||
/* DO 10, I = J, MIN( N, J + K ) */
|
||||
/* A( M + I, J ) = matrix( I, J ) */
|
||||
/* 10 CONTINUE */
|
||||
/* 20 CONTINUE */
|
||||
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* LDA - INTEGER. */
|
||||
/* On entry, LDA specifies the first dimension of A as declared */
|
||||
/* in the calling (sub) program. LDA must be at least */
|
||||
/* ( k + 1 ). */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* X - REAL array of DIMENSION at least */
|
||||
/* ( 1 + ( n - 1 )*abs( INCX ) ). */
|
||||
/* Before entry, the incremented array X must contain the */
|
||||
/* vector x. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* INCX - INTEGER. */
|
||||
/* On entry, INCX specifies the increment for the elements of */
|
||||
/* X. INCX must not be zero. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* BETA - REAL . */
|
||||
/* On entry, BETA specifies the scalar beta. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* Y - REAL array of DIMENSION at least */
|
||||
/* ( 1 + ( n - 1 )*abs( INCY ) ). */
|
||||
/* Before entry, the incremented array Y must contain the */
|
||||
/* vector y. On exit, Y is overwritten by the updated vector y. */
|
||||
|
||||
/* INCY - INTEGER. */
|
||||
/* On entry, INCY specifies the increment for the elements of */
|
||||
/* Y. INCY must not be zero. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* Further Details */
|
||||
/* =============== */
|
||||
|
||||
/* Level 2 Blas routine. */
|
||||
|
||||
/* -- Written on 22-October-1986. */
|
||||
/* Jack Dongarra, Argonne National Lab. */
|
||||
/* Jeremy Du Croz, Nag Central Office. */
|
||||
/* Sven Hammarling, Nag Central Office. */
|
||||
/* Richard Hanson, Sandia National Labs. */
|
||||
|
||||
/* ===================================================================== */
|
||||
|
||||
/* .. Parameters .. */
|
||||
/* .. */
|
||||
/* .. Local Scalars .. */
|
||||
/* .. */
|
||||
/* .. External Functions .. */
|
||||
/* .. */
|
||||
/* .. External Subroutines .. */
|
||||
/* .. */
|
||||
/* .. Intrinsic Functions .. */
|
||||
/* .. */
|
||||
|
||||
/* Test the input parameters. */
|
||||
|
||||
/* Parameter adjustments */
|
||||
a_dim1 = *lda;
|
||||
a_offset = 1 + a_dim1;
|
||||
a -= a_offset;
|
||||
--x;
|
||||
--y;
|
||||
|
||||
/* Function Body */
|
||||
info = 0;
|
||||
if (!lsame_(uplo, "U") && !lsame_(uplo, "L")) {
|
||||
info = 1;
|
||||
} else if (*n < 0) {
|
||||
info = 2;
|
||||
} else if (*k < 0) {
|
||||
info = 3;
|
||||
} else if (*lda < *k + 1) {
|
||||
info = 6;
|
||||
} else if (*incx == 0) {
|
||||
info = 8;
|
||||
} else if (*incy == 0) {
|
||||
info = 11;
|
||||
}
|
||||
if (info != 0) {
|
||||
xerbla_("SSBMV ", &info);
|
||||
return;
|
||||
}
|
||||
|
||||
/* Quick return if possible. */
|
||||
|
||||
if (*n == 0 || (*alpha == 0.f && *beta == 1.f)) {
|
||||
return;
|
||||
}
|
||||
|
||||
/* Set up the start points in X and Y. */
|
||||
|
||||
if (*incx > 0) {
|
||||
kx = 1;
|
||||
} else {
|
||||
kx = 1 - (*n - 1) * *incx;
|
||||
}
|
||||
if (*incy > 0) {
|
||||
ky = 1;
|
||||
} else {
|
||||
ky = 1 - (*n - 1) * *incy;
|
||||
}
|
||||
|
||||
/* Start the operations. In this version the elements of the array A */
|
||||
/* are accessed sequentially with one pass through A. */
|
||||
|
||||
/* First form y := beta*y. */
|
||||
|
||||
if (*beta != 1.f) {
|
||||
if (*incy == 1) {
|
||||
if (*beta == 0.f) {
|
||||
i__1 = *n;
|
||||
for (i__ = 1; i__ <= i__1; ++i__) {
|
||||
y[i__] = 0.f;
|
||||
/* L10: */
|
||||
}
|
||||
} else {
|
||||
i__1 = *n;
|
||||
for (i__ = 1; i__ <= i__1; ++i__) {
|
||||
y[i__] = *beta * y[i__];
|
||||
/* L20: */
|
||||
}
|
||||
}
|
||||
} else {
|
||||
iy = ky;
|
||||
if (*beta == 0.f) {
|
||||
i__1 = *n;
|
||||
for (i__ = 1; i__ <= i__1; ++i__) {
|
||||
y[iy] = 0.f;
|
||||
iy += *incy;
|
||||
/* L30: */
|
||||
}
|
||||
} else {
|
||||
i__1 = *n;
|
||||
for (i__ = 1; i__ <= i__1; ++i__) {
|
||||
y[iy] = *beta * y[iy];
|
||||
iy += *incy;
|
||||
/* L40: */
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (*alpha == 0.f) {
|
||||
return;
|
||||
}
|
||||
if (lsame_(uplo, "U")) {
|
||||
/* Form y when upper triangle of A is stored. */
|
||||
|
||||
kplus1 = *k + 1;
|
||||
if (*incx == 1 && *incy == 1) {
|
||||
i__1 = *n;
|
||||
for (j = 1; j <= i__1; ++j) {
|
||||
temp1 = *alpha * x[j];
|
||||
temp2 = 0.f;
|
||||
l = kplus1 - j;
|
||||
/* Computing MAX */
|
||||
i__2 = 1, i__3 = j - *k;
|
||||
i__4 = j - 1;
|
||||
for (i__ = max(i__2, i__3); i__ <= i__4; ++i__) {
|
||||
y[i__] += temp1 * a[l + i__ + j * a_dim1];
|
||||
temp2 += a[l + i__ + j * a_dim1] * x[i__];
|
||||
/* L50: */
|
||||
}
|
||||
y[j] = y[j] + temp1 * a[kplus1 + j * a_dim1] + *alpha * temp2;
|
||||
/* L60: */
|
||||
}
|
||||
} else {
|
||||
jx = kx;
|
||||
jy = ky;
|
||||
i__1 = *n;
|
||||
for (j = 1; j <= i__1; ++j) {
|
||||
temp1 = *alpha * x[jx];
|
||||
temp2 = 0.f;
|
||||
ix = kx;
|
||||
iy = ky;
|
||||
l = kplus1 - j;
|
||||
/* Computing MAX */
|
||||
i__4 = 1, i__2 = j - *k;
|
||||
i__3 = j - 1;
|
||||
for (i__ = max(i__4, i__2); i__ <= i__3; ++i__) {
|
||||
y[iy] += temp1 * a[l + i__ + j * a_dim1];
|
||||
temp2 += a[l + i__ + j * a_dim1] * x[ix];
|
||||
ix += *incx;
|
||||
iy += *incy;
|
||||
/* L70: */
|
||||
}
|
||||
y[jy] = y[jy] + temp1 * a[kplus1 + j * a_dim1] + *alpha * temp2;
|
||||
jx += *incx;
|
||||
jy += *incy;
|
||||
if (j > *k) {
|
||||
kx += *incx;
|
||||
ky += *incy;
|
||||
}
|
||||
/* L80: */
|
||||
}
|
||||
}
|
||||
} else {
|
||||
/* Form y when lower triangle of A is stored. */
|
||||
|
||||
if (*incx == 1 && *incy == 1) {
|
||||
i__1 = *n;
|
||||
for (j = 1; j <= i__1; ++j) {
|
||||
temp1 = *alpha * x[j];
|
||||
temp2 = 0.f;
|
||||
y[j] += temp1 * a[j * a_dim1 + 1];
|
||||
l = 1 - j;
|
||||
/* Computing MIN */
|
||||
i__4 = *n, i__2 = j + *k;
|
||||
i__3 = min(i__4, i__2);
|
||||
for (i__ = j + 1; i__ <= i__3; ++i__) {
|
||||
y[i__] += temp1 * a[l + i__ + j * a_dim1];
|
||||
temp2 += a[l + i__ + j * a_dim1] * x[i__];
|
||||
/* L90: */
|
||||
}
|
||||
y[j] += *alpha * temp2;
|
||||
/* L100: */
|
||||
}
|
||||
} else {
|
||||
jx = kx;
|
||||
jy = ky;
|
||||
i__1 = *n;
|
||||
for (j = 1; j <= i__1; ++j) {
|
||||
temp1 = *alpha * x[jx];
|
||||
temp2 = 0.f;
|
||||
y[jy] += temp1 * a[j * a_dim1 + 1];
|
||||
l = 1 - j;
|
||||
ix = jx;
|
||||
iy = jy;
|
||||
/* Computing MIN */
|
||||
i__4 = *n, i__2 = j + *k;
|
||||
i__3 = min(i__4, i__2);
|
||||
for (i__ = j + 1; i__ <= i__3; ++i__) {
|
||||
ix += *incx;
|
||||
iy += *incy;
|
||||
y[iy] += temp1 * a[l + i__ + j * a_dim1];
|
||||
temp2 += a[l + i__ + j * a_dim1] * x[ix];
|
||||
/* L110: */
|
||||
}
|
||||
y[jy] += *alpha * temp2;
|
||||
jx += *incx;
|
||||
jy += *incy;
|
||||
/* L120: */
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* End of SSBMV . */
|
||||
|
||||
} /* ssbmv_ */
|
||||
308
blas/f2c/sspmv.c
Normal file
308
blas/f2c/sspmv.c
Normal file
@@ -0,0 +1,308 @@
|
||||
/* sspmv.f -- translated by f2c (version 20100827).
|
||||
You must link the resulting object file with libf2c:
|
||||
on Microsoft Windows system, link with libf2c.lib;
|
||||
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
|
||||
or, if you install libf2c.a in a standard place, with -lf2c -lm
|
||||
-- in that order, at the end of the command line, as in
|
||||
cc *.o -lf2c -lm
|
||||
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
|
||||
|
||||
http://www.netlib.org/f2c/libf2c.zip
|
||||
*/
|
||||
|
||||
#include "datatypes.h"
|
||||
|
||||
/* Subroutine */ void sspmv_(char *uplo, integer *n, real *alpha, real *ap, real *x, integer *incx, real *beta, real *y,
|
||||
integer *incy) {
|
||||
/* System generated locals */
|
||||
integer i__1, i__2;
|
||||
|
||||
/* Local variables */
|
||||
integer i__, j, k, kk, ix, iy, jx, jy, kx, ky, info;
|
||||
real temp1, temp2;
|
||||
extern logical lsame_(char *, char *);
|
||||
extern /* Subroutine */ void xerbla_(const char *, integer *);
|
||||
|
||||
/* .. Scalar Arguments .. */
|
||||
/* .. */
|
||||
/* .. Array Arguments .. */
|
||||
/* .. */
|
||||
|
||||
/* Purpose */
|
||||
/* ======= */
|
||||
|
||||
/* SSPMV performs the matrix-vector operation */
|
||||
|
||||
/* y := alpha*A*x + beta*y, */
|
||||
|
||||
/* where alpha and beta are scalars, x and y are n element vectors and */
|
||||
/* A is an n by n symmetric matrix, supplied in packed form. */
|
||||
|
||||
/* Arguments */
|
||||
/* ========== */
|
||||
|
||||
/* UPLO - CHARACTER*1. */
|
||||
/* On entry, UPLO specifies whether the upper or lower */
|
||||
/* triangular part of the matrix A is supplied in the packed */
|
||||
/* array AP as follows: */
|
||||
|
||||
/* UPLO = 'U' or 'u' The upper triangular part of A is */
|
||||
/* supplied in AP. */
|
||||
|
||||
/* UPLO = 'L' or 'l' The lower triangular part of A is */
|
||||
/* supplied in AP. */
|
||||
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* N - INTEGER. */
|
||||
/* On entry, N specifies the order of the matrix A. */
|
||||
/* N must be at least zero. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* ALPHA - REAL . */
|
||||
/* On entry, ALPHA specifies the scalar alpha. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* AP - REAL array of DIMENSION at least */
|
||||
/* ( ( n*( n + 1 ) )/2 ). */
|
||||
/* Before entry with UPLO = 'U' or 'u', the array AP must */
|
||||
/* contain the upper triangular part of the symmetric matrix */
|
||||
/* packed sequentially, column by column, so that AP( 1 ) */
|
||||
/* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) */
|
||||
/* and a( 2, 2 ) respectively, and so on. */
|
||||
/* Before entry with UPLO = 'L' or 'l', the array AP must */
|
||||
/* contain the lower triangular part of the symmetric matrix */
|
||||
/* packed sequentially, column by column, so that AP( 1 ) */
|
||||
/* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) */
|
||||
/* and a( 3, 1 ) respectively, and so on. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* X - REAL array of dimension at least */
|
||||
/* ( 1 + ( n - 1 )*abs( INCX ) ). */
|
||||
/* Before entry, the incremented array X must contain the n */
|
||||
/* element vector x. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* INCX - INTEGER. */
|
||||
/* On entry, INCX specifies the increment for the elements of */
|
||||
/* X. INCX must not be zero. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* BETA - REAL . */
|
||||
/* On entry, BETA specifies the scalar beta. When BETA is */
|
||||
/* supplied as zero then Y need not be set on input. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* Y - REAL array of dimension at least */
|
||||
/* ( 1 + ( n - 1 )*abs( INCY ) ). */
|
||||
/* Before entry, the incremented array Y must contain the n */
|
||||
/* element vector y. On exit, Y is overwritten by the updated */
|
||||
/* vector y. */
|
||||
|
||||
/* INCY - INTEGER. */
|
||||
/* On entry, INCY specifies the increment for the elements of */
|
||||
/* Y. INCY must not be zero. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* Further Details */
|
||||
/* =============== */
|
||||
|
||||
/* Level 2 Blas routine. */
|
||||
|
||||
/* -- Written on 22-October-1986. */
|
||||
/* Jack Dongarra, Argonne National Lab. */
|
||||
/* Jeremy Du Croz, Nag Central Office. */
|
||||
/* Sven Hammarling, Nag Central Office. */
|
||||
/* Richard Hanson, Sandia National Labs. */
|
||||
|
||||
/* ===================================================================== */
|
||||
|
||||
/* .. Parameters .. */
|
||||
/* .. */
|
||||
/* .. Local Scalars .. */
|
||||
/* .. */
|
||||
/* .. External Functions .. */
|
||||
/* .. */
|
||||
/* .. External Subroutines .. */
|
||||
/* .. */
|
||||
|
||||
/* Test the input parameters. */
|
||||
|
||||
/* Parameter adjustments */
|
||||
--y;
|
||||
--x;
|
||||
--ap;
|
||||
|
||||
/* Function Body */
|
||||
info = 0;
|
||||
if (!lsame_(uplo, "U") && !lsame_(uplo, "L")) {
|
||||
info = 1;
|
||||
} else if (*n < 0) {
|
||||
info = 2;
|
||||
} else if (*incx == 0) {
|
||||
info = 6;
|
||||
} else if (*incy == 0) {
|
||||
info = 9;
|
||||
}
|
||||
if (info != 0) {
|
||||
xerbla_("SSPMV ", &info);
|
||||
return;
|
||||
}
|
||||
|
||||
/* Quick return if possible. */
|
||||
|
||||
if (*n == 0 || (*alpha == 0.f && *beta == 1.f)) {
|
||||
return;
|
||||
}
|
||||
|
||||
/* Set up the start points in X and Y. */
|
||||
|
||||
if (*incx > 0) {
|
||||
kx = 1;
|
||||
} else {
|
||||
kx = 1 - (*n - 1) * *incx;
|
||||
}
|
||||
if (*incy > 0) {
|
||||
ky = 1;
|
||||
} else {
|
||||
ky = 1 - (*n - 1) * *incy;
|
||||
}
|
||||
|
||||
/* Start the operations. In this version the elements of the array AP */
|
||||
/* are accessed sequentially with one pass through AP. */
|
||||
|
||||
/* First form y := beta*y. */
|
||||
|
||||
if (*beta != 1.f) {
|
||||
if (*incy == 1) {
|
||||
if (*beta == 0.f) {
|
||||
i__1 = *n;
|
||||
for (i__ = 1; i__ <= i__1; ++i__) {
|
||||
y[i__] = 0.f;
|
||||
/* L10: */
|
||||
}
|
||||
} else {
|
||||
i__1 = *n;
|
||||
for (i__ = 1; i__ <= i__1; ++i__) {
|
||||
y[i__] = *beta * y[i__];
|
||||
/* L20: */
|
||||
}
|
||||
}
|
||||
} else {
|
||||
iy = ky;
|
||||
if (*beta == 0.f) {
|
||||
i__1 = *n;
|
||||
for (i__ = 1; i__ <= i__1; ++i__) {
|
||||
y[iy] = 0.f;
|
||||
iy += *incy;
|
||||
/* L30: */
|
||||
}
|
||||
} else {
|
||||
i__1 = *n;
|
||||
for (i__ = 1; i__ <= i__1; ++i__) {
|
||||
y[iy] = *beta * y[iy];
|
||||
iy += *incy;
|
||||
/* L40: */
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (*alpha == 0.f) {
|
||||
return;
|
||||
}
|
||||
kk = 1;
|
||||
if (lsame_(uplo, "U")) {
|
||||
/* Form y when AP contains the upper triangle. */
|
||||
|
||||
if (*incx == 1 && *incy == 1) {
|
||||
i__1 = *n;
|
||||
for (j = 1; j <= i__1; ++j) {
|
||||
temp1 = *alpha * x[j];
|
||||
temp2 = 0.f;
|
||||
k = kk;
|
||||
i__2 = j - 1;
|
||||
for (i__ = 1; i__ <= i__2; ++i__) {
|
||||
y[i__] += temp1 * ap[k];
|
||||
temp2 += ap[k] * x[i__];
|
||||
++k;
|
||||
/* L50: */
|
||||
}
|
||||
y[j] = y[j] + temp1 * ap[kk + j - 1] + *alpha * temp2;
|
||||
kk += j;
|
||||
/* L60: */
|
||||
}
|
||||
} else {
|
||||
jx = kx;
|
||||
jy = ky;
|
||||
i__1 = *n;
|
||||
for (j = 1; j <= i__1; ++j) {
|
||||
temp1 = *alpha * x[jx];
|
||||
temp2 = 0.f;
|
||||
ix = kx;
|
||||
iy = ky;
|
||||
i__2 = kk + j - 2;
|
||||
for (k = kk; k <= i__2; ++k) {
|
||||
y[iy] += temp1 * ap[k];
|
||||
temp2 += ap[k] * x[ix];
|
||||
ix += *incx;
|
||||
iy += *incy;
|
||||
/* L70: */
|
||||
}
|
||||
y[jy] = y[jy] + temp1 * ap[kk + j - 1] + *alpha * temp2;
|
||||
jx += *incx;
|
||||
jy += *incy;
|
||||
kk += j;
|
||||
/* L80: */
|
||||
}
|
||||
}
|
||||
} else {
|
||||
/* Form y when AP contains the lower triangle. */
|
||||
|
||||
if (*incx == 1 && *incy == 1) {
|
||||
i__1 = *n;
|
||||
for (j = 1; j <= i__1; ++j) {
|
||||
temp1 = *alpha * x[j];
|
||||
temp2 = 0.f;
|
||||
y[j] += temp1 * ap[kk];
|
||||
k = kk + 1;
|
||||
i__2 = *n;
|
||||
for (i__ = j + 1; i__ <= i__2; ++i__) {
|
||||
y[i__] += temp1 * ap[k];
|
||||
temp2 += ap[k] * x[i__];
|
||||
++k;
|
||||
/* L90: */
|
||||
}
|
||||
y[j] += *alpha * temp2;
|
||||
kk += *n - j + 1;
|
||||
/* L100: */
|
||||
}
|
||||
} else {
|
||||
jx = kx;
|
||||
jy = ky;
|
||||
i__1 = *n;
|
||||
for (j = 1; j <= i__1; ++j) {
|
||||
temp1 = *alpha * x[jx];
|
||||
temp2 = 0.f;
|
||||
y[jy] += temp1 * ap[kk];
|
||||
ix = jx;
|
||||
iy = jy;
|
||||
i__2 = kk + *n - j;
|
||||
for (k = kk + 1; k <= i__2; ++k) {
|
||||
ix += *incx;
|
||||
iy += *incy;
|
||||
y[iy] += temp1 * ap[k];
|
||||
temp2 += ap[k] * x[ix];
|
||||
/* L110: */
|
||||
}
|
||||
y[jy] += *alpha * temp2;
|
||||
jx += *incx;
|
||||
jy += *incy;
|
||||
kk += *n - j + 1;
|
||||
/* L120: */
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* End of SSPMV . */
|
||||
|
||||
} /* sspmv_ */
|
||||
417
blas/f2c/stbmv.c
Normal file
417
blas/f2c/stbmv.c
Normal file
@@ -0,0 +1,417 @@
|
||||
/* stbmv.f -- translated by f2c (version 20100827).
|
||||
You must link the resulting object file with libf2c:
|
||||
on Microsoft Windows system, link with libf2c.lib;
|
||||
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
|
||||
or, if you install libf2c.a in a standard place, with -lf2c -lm
|
||||
-- in that order, at the end of the command line, as in
|
||||
cc *.o -lf2c -lm
|
||||
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
|
||||
|
||||
http://www.netlib.org/f2c/libf2c.zip
|
||||
*/
|
||||
|
||||
#include "datatypes.h"
|
||||
|
||||
/* Subroutine */ void stbmv_(char *uplo, char *trans, char *diag, integer *n, integer *k, real *a, integer *lda,
|
||||
real *x, integer *incx) {
|
||||
/* System generated locals */
|
||||
integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
|
||||
|
||||
/* Local variables */
|
||||
integer i__, j, l, ix, jx, kx, info;
|
||||
real temp;
|
||||
extern logical lsame_(char *, char *);
|
||||
integer kplus1;
|
||||
extern /* Subroutine */ void xerbla_(const char *, integer *);
|
||||
logical nounit;
|
||||
|
||||
/* .. Scalar Arguments .. */
|
||||
/* .. */
|
||||
/* .. Array Arguments .. */
|
||||
/* .. */
|
||||
|
||||
/* Purpose */
|
||||
/* ======= */
|
||||
|
||||
/* STBMV performs one of the matrix-vector operations */
|
||||
|
||||
/* x := A*x, or x := A'*x, */
|
||||
|
||||
/* where x is an n element vector and A is an n by n unit, or non-unit, */
|
||||
/* upper or lower triangular band matrix, with ( k + 1 ) diagonals. */
|
||||
|
||||
/* Arguments */
|
||||
/* ========== */
|
||||
|
||||
/* UPLO - CHARACTER*1. */
|
||||
/* On entry, UPLO specifies whether the matrix is an upper or */
|
||||
/* lower triangular matrix as follows: */
|
||||
|
||||
/* UPLO = 'U' or 'u' A is an upper triangular matrix. */
|
||||
|
||||
/* UPLO = 'L' or 'l' A is a lower triangular matrix. */
|
||||
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* TRANS - CHARACTER*1. */
|
||||
/* On entry, TRANS specifies the operation to be performed as */
|
||||
/* follows: */
|
||||
|
||||
/* TRANS = 'N' or 'n' x := A*x. */
|
||||
|
||||
/* TRANS = 'T' or 't' x := A'*x. */
|
||||
|
||||
/* TRANS = 'C' or 'c' x := A'*x. */
|
||||
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* DIAG - CHARACTER*1. */
|
||||
/* On entry, DIAG specifies whether or not A is unit */
|
||||
/* triangular as follows: */
|
||||
|
||||
/* DIAG = 'U' or 'u' A is assumed to be unit triangular. */
|
||||
|
||||
/* DIAG = 'N' or 'n' A is not assumed to be unit */
|
||||
/* triangular. */
|
||||
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* N - INTEGER. */
|
||||
/* On entry, N specifies the order of the matrix A. */
|
||||
/* N must be at least zero. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* K - INTEGER. */
|
||||
/* On entry with UPLO = 'U' or 'u', K specifies the number of */
|
||||
/* super-diagonals of the matrix A. */
|
||||
/* On entry with UPLO = 'L' or 'l', K specifies the number of */
|
||||
/* sub-diagonals of the matrix A. */
|
||||
/* K must satisfy 0 .le. K. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* A - REAL array of DIMENSION ( LDA, n ). */
|
||||
/* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */
|
||||
/* by n part of the array A must contain the upper triangular */
|
||||
/* band part of the matrix of coefficients, supplied column by */
|
||||
/* column, with the leading diagonal of the matrix in row */
|
||||
/* ( k + 1 ) of the array, the first super-diagonal starting at */
|
||||
/* position 2 in row k, and so on. The top left k by k triangle */
|
||||
/* of the array A is not referenced. */
|
||||
/* The following program segment will transfer an upper */
|
||||
/* triangular band matrix from conventional full matrix storage */
|
||||
/* to band storage: */
|
||||
|
||||
/* DO 20, J = 1, N */
|
||||
/* M = K + 1 - J */
|
||||
/* DO 10, I = MAX( 1, J - K ), J */
|
||||
/* A( M + I, J ) = matrix( I, J ) */
|
||||
/* 10 CONTINUE */
|
||||
/* 20 CONTINUE */
|
||||
|
||||
/* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */
|
||||
/* by n part of the array A must contain the lower triangular */
|
||||
/* band part of the matrix of coefficients, supplied column by */
|
||||
/* column, with the leading diagonal of the matrix in row 1 of */
|
||||
/* the array, the first sub-diagonal starting at position 1 in */
|
||||
/* row 2, and so on. The bottom right k by k triangle of the */
|
||||
/* array A is not referenced. */
|
||||
/* The following program segment will transfer a lower */
|
||||
/* triangular band matrix from conventional full matrix storage */
|
||||
/* to band storage: */
|
||||
|
||||
/* DO 20, J = 1, N */
|
||||
/* M = 1 - J */
|
||||
/* DO 10, I = J, MIN( N, J + K ) */
|
||||
/* A( M + I, J ) = matrix( I, J ) */
|
||||
/* 10 CONTINUE */
|
||||
/* 20 CONTINUE */
|
||||
|
||||
/* Note that when DIAG = 'U' or 'u' the elements of the array A */
|
||||
/* corresponding to the diagonal elements of the matrix are not */
|
||||
/* referenced, but are assumed to be unity. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* LDA - INTEGER. */
|
||||
/* On entry, LDA specifies the first dimension of A as declared */
|
||||
/* in the calling (sub) program. LDA must be at least */
|
||||
/* ( k + 1 ). */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* X - REAL array of dimension at least */
|
||||
/* ( 1 + ( n - 1 )*abs( INCX ) ). */
|
||||
/* Before entry, the incremented array X must contain the n */
|
||||
/* element vector x. On exit, X is overwritten with the */
|
||||
/* transformed vector x. */
|
||||
|
||||
/* INCX - INTEGER. */
|
||||
/* On entry, INCX specifies the increment for the elements of */
|
||||
/* X. INCX must not be zero. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* Further Details */
|
||||
/* =============== */
|
||||
|
||||
/* Level 2 Blas routine. */
|
||||
|
||||
/* -- Written on 22-October-1986. */
|
||||
/* Jack Dongarra, Argonne National Lab. */
|
||||
/* Jeremy Du Croz, Nag Central Office. */
|
||||
/* Sven Hammarling, Nag Central Office. */
|
||||
/* Richard Hanson, Sandia National Labs. */
|
||||
|
||||
/* ===================================================================== */
|
||||
|
||||
/* .. Parameters .. */
|
||||
/* .. */
|
||||
/* .. Local Scalars .. */
|
||||
/* .. */
|
||||
/* .. External Functions .. */
|
||||
/* .. */
|
||||
/* .. External Subroutines .. */
|
||||
/* .. */
|
||||
/* .. Intrinsic Functions .. */
|
||||
/* .. */
|
||||
|
||||
/* Test the input parameters. */
|
||||
|
||||
/* Parameter adjustments */
|
||||
a_dim1 = *lda;
|
||||
a_offset = 1 + a_dim1;
|
||||
a -= a_offset;
|
||||
--x;
|
||||
|
||||
/* Function Body */
|
||||
info = 0;
|
||||
if (!lsame_(uplo, "U") && !lsame_(uplo, "L")) {
|
||||
info = 1;
|
||||
} else if (!lsame_(trans, "N") && !lsame_(trans, "T") && !lsame_(trans, "C")) {
|
||||
info = 2;
|
||||
} else if (!lsame_(diag, "U") && !lsame_(diag, "N")) {
|
||||
info = 3;
|
||||
} else if (*n < 0) {
|
||||
info = 4;
|
||||
} else if (*k < 0) {
|
||||
info = 5;
|
||||
} else if (*lda < *k + 1) {
|
||||
info = 7;
|
||||
} else if (*incx == 0) {
|
||||
info = 9;
|
||||
}
|
||||
if (info != 0) {
|
||||
xerbla_("STBMV ", &info);
|
||||
return;
|
||||
}
|
||||
|
||||
/* Quick return if possible. */
|
||||
|
||||
if (*n == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
nounit = lsame_(diag, "N");
|
||||
|
||||
/* Set up the start point in X if the increment is not unity. This */
|
||||
/* will be ( N - 1 )*INCX too small for descending loops. */
|
||||
|
||||
if (*incx <= 0) {
|
||||
kx = 1 - (*n - 1) * *incx;
|
||||
} else if (*incx != 1) {
|
||||
kx = 1;
|
||||
}
|
||||
|
||||
/* Start the operations. In this version the elements of A are */
|
||||
/* accessed sequentially with one pass through A. */
|
||||
|
||||
if (lsame_(trans, "N")) {
|
||||
/* Form x := A*x. */
|
||||
|
||||
if (lsame_(uplo, "U")) {
|
||||
kplus1 = *k + 1;
|
||||
if (*incx == 1) {
|
||||
i__1 = *n;
|
||||
for (j = 1; j <= i__1; ++j) {
|
||||
if (x[j] != 0.f) {
|
||||
temp = x[j];
|
||||
l = kplus1 - j;
|
||||
/* Computing MAX */
|
||||
i__2 = 1, i__3 = j - *k;
|
||||
i__4 = j - 1;
|
||||
for (i__ = max(i__2, i__3); i__ <= i__4; ++i__) {
|
||||
x[i__] += temp * a[l + i__ + j * a_dim1];
|
||||
/* L10: */
|
||||
}
|
||||
if (nounit) {
|
||||
x[j] *= a[kplus1 + j * a_dim1];
|
||||
}
|
||||
}
|
||||
/* L20: */
|
||||
}
|
||||
} else {
|
||||
jx = kx;
|
||||
i__1 = *n;
|
||||
for (j = 1; j <= i__1; ++j) {
|
||||
if (x[jx] != 0.f) {
|
||||
temp = x[jx];
|
||||
ix = kx;
|
||||
l = kplus1 - j;
|
||||
/* Computing MAX */
|
||||
i__4 = 1, i__2 = j - *k;
|
||||
i__3 = j - 1;
|
||||
for (i__ = max(i__4, i__2); i__ <= i__3; ++i__) {
|
||||
x[ix] += temp * a[l + i__ + j * a_dim1];
|
||||
ix += *incx;
|
||||
/* L30: */
|
||||
}
|
||||
if (nounit) {
|
||||
x[jx] *= a[kplus1 + j * a_dim1];
|
||||
}
|
||||
}
|
||||
jx += *incx;
|
||||
if (j > *k) {
|
||||
kx += *incx;
|
||||
}
|
||||
/* L40: */
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if (*incx == 1) {
|
||||
for (j = *n; j >= 1; --j) {
|
||||
if (x[j] != 0.f) {
|
||||
temp = x[j];
|
||||
l = 1 - j;
|
||||
/* Computing MIN */
|
||||
i__1 = *n, i__3 = j + *k;
|
||||
i__4 = j + 1;
|
||||
for (i__ = min(i__1, i__3); i__ >= i__4; --i__) {
|
||||
x[i__] += temp * a[l + i__ + j * a_dim1];
|
||||
/* L50: */
|
||||
}
|
||||
if (nounit) {
|
||||
x[j] *= a[j * a_dim1 + 1];
|
||||
}
|
||||
}
|
||||
/* L60: */
|
||||
}
|
||||
} else {
|
||||
kx += (*n - 1) * *incx;
|
||||
jx = kx;
|
||||
for (j = *n; j >= 1; --j) {
|
||||
if (x[jx] != 0.f) {
|
||||
temp = x[jx];
|
||||
ix = kx;
|
||||
l = 1 - j;
|
||||
/* Computing MIN */
|
||||
i__4 = *n, i__1 = j + *k;
|
||||
i__3 = j + 1;
|
||||
for (i__ = min(i__4, i__1); i__ >= i__3; --i__) {
|
||||
x[ix] += temp * a[l + i__ + j * a_dim1];
|
||||
ix -= *incx;
|
||||
/* L70: */
|
||||
}
|
||||
if (nounit) {
|
||||
x[jx] *= a[j * a_dim1 + 1];
|
||||
}
|
||||
}
|
||||
jx -= *incx;
|
||||
if (*n - j >= *k) {
|
||||
kx -= *incx;
|
||||
}
|
||||
/* L80: */
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
/* Form x := A'*x. */
|
||||
|
||||
if (lsame_(uplo, "U")) {
|
||||
kplus1 = *k + 1;
|
||||
if (*incx == 1) {
|
||||
for (j = *n; j >= 1; --j) {
|
||||
temp = x[j];
|
||||
l = kplus1 - j;
|
||||
if (nounit) {
|
||||
temp *= a[kplus1 + j * a_dim1];
|
||||
}
|
||||
/* Computing MAX */
|
||||
i__4 = 1, i__1 = j - *k;
|
||||
i__3 = max(i__4, i__1);
|
||||
for (i__ = j - 1; i__ >= i__3; --i__) {
|
||||
temp += a[l + i__ + j * a_dim1] * x[i__];
|
||||
/* L90: */
|
||||
}
|
||||
x[j] = temp;
|
||||
/* L100: */
|
||||
}
|
||||
} else {
|
||||
kx += (*n - 1) * *incx;
|
||||
jx = kx;
|
||||
for (j = *n; j >= 1; --j) {
|
||||
temp = x[jx];
|
||||
kx -= *incx;
|
||||
ix = kx;
|
||||
l = kplus1 - j;
|
||||
if (nounit) {
|
||||
temp *= a[kplus1 + j * a_dim1];
|
||||
}
|
||||
/* Computing MAX */
|
||||
i__4 = 1, i__1 = j - *k;
|
||||
i__3 = max(i__4, i__1);
|
||||
for (i__ = j - 1; i__ >= i__3; --i__) {
|
||||
temp += a[l + i__ + j * a_dim1] * x[ix];
|
||||
ix -= *incx;
|
||||
/* L110: */
|
||||
}
|
||||
x[jx] = temp;
|
||||
jx -= *incx;
|
||||
/* L120: */
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if (*incx == 1) {
|
||||
i__3 = *n;
|
||||
for (j = 1; j <= i__3; ++j) {
|
||||
temp = x[j];
|
||||
l = 1 - j;
|
||||
if (nounit) {
|
||||
temp *= a[j * a_dim1 + 1];
|
||||
}
|
||||
/* Computing MIN */
|
||||
i__1 = *n, i__2 = j + *k;
|
||||
i__4 = min(i__1, i__2);
|
||||
for (i__ = j + 1; i__ <= i__4; ++i__) {
|
||||
temp += a[l + i__ + j * a_dim1] * x[i__];
|
||||
/* L130: */
|
||||
}
|
||||
x[j] = temp;
|
||||
/* L140: */
|
||||
}
|
||||
} else {
|
||||
jx = kx;
|
||||
i__3 = *n;
|
||||
for (j = 1; j <= i__3; ++j) {
|
||||
temp = x[jx];
|
||||
kx += *incx;
|
||||
ix = kx;
|
||||
l = 1 - j;
|
||||
if (nounit) {
|
||||
temp *= a[j * a_dim1 + 1];
|
||||
}
|
||||
/* Computing MIN */
|
||||
i__1 = *n, i__2 = j + *k;
|
||||
i__4 = min(i__1, i__2);
|
||||
for (i__ = j + 1; i__ <= i__4; ++i__) {
|
||||
temp += a[l + i__ + j * a_dim1] * x[ix];
|
||||
ix += *incx;
|
||||
/* L150: */
|
||||
}
|
||||
x[jx] = temp;
|
||||
jx += *incx;
|
||||
/* L160: */
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* End of STBMV . */
|
||||
|
||||
} /* stbmv_ */
|
||||
456
blas/f2c/zhbmv.c
Normal file
456
blas/f2c/zhbmv.c
Normal file
@@ -0,0 +1,456 @@
|
||||
/* zhbmv.f -- translated by f2c (version 20100827).
|
||||
You must link the resulting object file with libf2c:
|
||||
on Microsoft Windows system, link with libf2c.lib;
|
||||
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
|
||||
or, if you install libf2c.a in a standard place, with -lf2c -lm
|
||||
-- in that order, at the end of the command line, as in
|
||||
cc *.o -lf2c -lm
|
||||
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
|
||||
|
||||
http://www.netlib.org/f2c/libf2c.zip
|
||||
*/
|
||||
|
||||
#include "datatypes.h"
|
||||
|
||||
static inline void d_cnjg(doublecomplex *r, doublecomplex *z) {
|
||||
r->r = z->r;
|
||||
r->i = -(z->i);
|
||||
}
|
||||
|
||||
/* Subroutine */ void zhbmv_(char *uplo, integer *n, integer *k, doublecomplex *alpha, doublecomplex *a, integer *lda,
|
||||
doublecomplex *x, integer *incx, doublecomplex *beta, doublecomplex *y, integer *incy) {
|
||||
/* System generated locals */
|
||||
integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5;
|
||||
doublereal d__1;
|
||||
doublecomplex z__1, z__2, z__3, z__4;
|
||||
|
||||
/* Local variables */
|
||||
integer i__, j, l, ix, iy, jx, jy, kx, ky, info;
|
||||
doublecomplex temp1, temp2;
|
||||
extern logical lsame_(char *, char *);
|
||||
integer kplus1;
|
||||
extern /* Subroutine */ void xerbla_(const char *, integer *);
|
||||
|
||||
/* .. Scalar Arguments .. */
|
||||
/* .. */
|
||||
/* .. Array Arguments .. */
|
||||
/* .. */
|
||||
|
||||
/* Purpose */
|
||||
/* ======= */
|
||||
|
||||
/* ZHBMV performs the matrix-vector operation */
|
||||
|
||||
/* y := alpha*A*x + beta*y, */
|
||||
|
||||
/* where alpha and beta are scalars, x and y are n element vectors and */
|
||||
/* A is an n by n hermitian band matrix, with k super-diagonals. */
|
||||
|
||||
/* Arguments */
|
||||
/* ========== */
|
||||
|
||||
/* UPLO - CHARACTER*1. */
|
||||
/* On entry, UPLO specifies whether the upper or lower */
|
||||
/* triangular part of the band matrix A is being supplied as */
|
||||
/* follows: */
|
||||
|
||||
/* UPLO = 'U' or 'u' The upper triangular part of A is */
|
||||
/* being supplied. */
|
||||
|
||||
/* UPLO = 'L' or 'l' The lower triangular part of A is */
|
||||
/* being supplied. */
|
||||
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* N - INTEGER. */
|
||||
/* On entry, N specifies the order of the matrix A. */
|
||||
/* N must be at least zero. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* K - INTEGER. */
|
||||
/* On entry, K specifies the number of super-diagonals of the */
|
||||
/* matrix A. K must satisfy 0 .le. K. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* ALPHA - COMPLEX*16 . */
|
||||
/* On entry, ALPHA specifies the scalar alpha. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* A - COMPLEX*16 array of DIMENSION ( LDA, n ). */
|
||||
/* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */
|
||||
/* by n part of the array A must contain the upper triangular */
|
||||
/* band part of the hermitian matrix, supplied column by */
|
||||
/* column, with the leading diagonal of the matrix in row */
|
||||
/* ( k + 1 ) of the array, the first super-diagonal starting at */
|
||||
/* position 2 in row k, and so on. The top left k by k triangle */
|
||||
/* of the array A is not referenced. */
|
||||
/* The following program segment will transfer the upper */
|
||||
/* triangular part of a hermitian band matrix from conventional */
|
||||
/* full matrix storage to band storage: */
|
||||
|
||||
/* DO 20, J = 1, N */
|
||||
/* M = K + 1 - J */
|
||||
/* DO 10, I = MAX( 1, J - K ), J */
|
||||
/* A( M + I, J ) = matrix( I, J ) */
|
||||
/* 10 CONTINUE */
|
||||
/* 20 CONTINUE */
|
||||
|
||||
/* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */
|
||||
/* by n part of the array A must contain the lower triangular */
|
||||
/* band part of the hermitian matrix, supplied column by */
|
||||
/* column, with the leading diagonal of the matrix in row 1 of */
|
||||
/* the array, the first sub-diagonal starting at position 1 in */
|
||||
/* row 2, and so on. The bottom right k by k triangle of the */
|
||||
/* array A is not referenced. */
|
||||
/* The following program segment will transfer the lower */
|
||||
/* triangular part of a hermitian band matrix from conventional */
|
||||
/* full matrix storage to band storage: */
|
||||
|
||||
/* DO 20, J = 1, N */
|
||||
/* M = 1 - J */
|
||||
/* DO 10, I = J, MIN( N, J + K ) */
|
||||
/* A( M + I, J ) = matrix( I, J ) */
|
||||
/* 10 CONTINUE */
|
||||
/* 20 CONTINUE */
|
||||
|
||||
/* Note that the imaginary parts of the diagonal elements need */
|
||||
/* not be set and are assumed to be zero. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* LDA - INTEGER. */
|
||||
/* On entry, LDA specifies the first dimension of A as declared */
|
||||
/* in the calling (sub) program. LDA must be at least */
|
||||
/* ( k + 1 ). */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* X - COMPLEX*16 array of DIMENSION at least */
|
||||
/* ( 1 + ( n - 1 )*abs( INCX ) ). */
|
||||
/* Before entry, the incremented array X must contain the */
|
||||
/* vector x. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* INCX - INTEGER. */
|
||||
/* On entry, INCX specifies the increment for the elements of */
|
||||
/* X. INCX must not be zero. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* BETA - COMPLEX*16 . */
|
||||
/* On entry, BETA specifies the scalar beta. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* Y - COMPLEX*16 array of DIMENSION at least */
|
||||
/* ( 1 + ( n - 1 )*abs( INCY ) ). */
|
||||
/* Before entry, the incremented array Y must contain the */
|
||||
/* vector y. On exit, Y is overwritten by the updated vector y. */
|
||||
|
||||
/* INCY - INTEGER. */
|
||||
/* On entry, INCY specifies the increment for the elements of */
|
||||
/* Y. INCY must not be zero. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* Further Details */
|
||||
/* =============== */
|
||||
|
||||
/* Level 2 Blas routine. */
|
||||
|
||||
/* -- Written on 22-October-1986. */
|
||||
/* Jack Dongarra, Argonne National Lab. */
|
||||
/* Jeremy Du Croz, Nag Central Office. */
|
||||
/* Sven Hammarling, Nag Central Office. */
|
||||
/* Richard Hanson, Sandia National Labs. */
|
||||
|
||||
/* ===================================================================== */
|
||||
|
||||
/* .. Parameters .. */
|
||||
/* .. */
|
||||
/* .. Local Scalars .. */
|
||||
/* .. */
|
||||
/* .. External Functions .. */
|
||||
/* .. */
|
||||
/* .. External Subroutines .. */
|
||||
/* .. */
|
||||
/* .. Intrinsic Functions .. */
|
||||
/* .. */
|
||||
|
||||
/* Test the input parameters. */
|
||||
|
||||
/* Parameter adjustments */
|
||||
a_dim1 = *lda;
|
||||
a_offset = 1 + a_dim1;
|
||||
a -= a_offset;
|
||||
--x;
|
||||
--y;
|
||||
|
||||
/* Function Body */
|
||||
info = 0;
|
||||
if (!lsame_(uplo, "U") && !lsame_(uplo, "L")) {
|
||||
info = 1;
|
||||
} else if (*n < 0) {
|
||||
info = 2;
|
||||
} else if (*k < 0) {
|
||||
info = 3;
|
||||
} else if (*lda < *k + 1) {
|
||||
info = 6;
|
||||
} else if (*incx == 0) {
|
||||
info = 8;
|
||||
} else if (*incy == 0) {
|
||||
info = 11;
|
||||
}
|
||||
if (info != 0) {
|
||||
xerbla_("ZHBMV ", &info);
|
||||
return;
|
||||
}
|
||||
|
||||
/* Quick return if possible. */
|
||||
|
||||
if (*n == 0 || (alpha->r == 0. && alpha->i == 0. && (beta->r == 1. && beta->i == 0.))) {
|
||||
return;
|
||||
}
|
||||
|
||||
/* Set up the start points in X and Y. */
|
||||
|
||||
if (*incx > 0) {
|
||||
kx = 1;
|
||||
} else {
|
||||
kx = 1 - (*n - 1) * *incx;
|
||||
}
|
||||
if (*incy > 0) {
|
||||
ky = 1;
|
||||
} else {
|
||||
ky = 1 - (*n - 1) * *incy;
|
||||
}
|
||||
|
||||
/* Start the operations. In this version the elements of the array A */
|
||||
/* are accessed sequentially with one pass through A. */
|
||||
|
||||
/* First form y := beta*y. */
|
||||
|
||||
if (beta->r != 1. || beta->i != 0.) {
|
||||
if (*incy == 1) {
|
||||
if (beta->r == 0. && beta->i == 0.) {
|
||||
i__1 = *n;
|
||||
for (i__ = 1; i__ <= i__1; ++i__) {
|
||||
i__2 = i__;
|
||||
y[i__2].r = 0., y[i__2].i = 0.;
|
||||
/* L10: */
|
||||
}
|
||||
} else {
|
||||
i__1 = *n;
|
||||
for (i__ = 1; i__ <= i__1; ++i__) {
|
||||
i__2 = i__;
|
||||
i__3 = i__;
|
||||
z__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i, z__1.i = beta->r * y[i__3].i + beta->i * y[i__3].r;
|
||||
y[i__2].r = z__1.r, y[i__2].i = z__1.i;
|
||||
/* L20: */
|
||||
}
|
||||
}
|
||||
} else {
|
||||
iy = ky;
|
||||
if (beta->r == 0. && beta->i == 0.) {
|
||||
i__1 = *n;
|
||||
for (i__ = 1; i__ <= i__1; ++i__) {
|
||||
i__2 = iy;
|
||||
y[i__2].r = 0., y[i__2].i = 0.;
|
||||
iy += *incy;
|
||||
/* L30: */
|
||||
}
|
||||
} else {
|
||||
i__1 = *n;
|
||||
for (i__ = 1; i__ <= i__1; ++i__) {
|
||||
i__2 = iy;
|
||||
i__3 = iy;
|
||||
z__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i, z__1.i = beta->r * y[i__3].i + beta->i * y[i__3].r;
|
||||
y[i__2].r = z__1.r, y[i__2].i = z__1.i;
|
||||
iy += *incy;
|
||||
/* L40: */
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (alpha->r == 0. && alpha->i == 0.) {
|
||||
return;
|
||||
}
|
||||
if (lsame_(uplo, "U")) {
|
||||
/* Form y when upper triangle of A is stored. */
|
||||
|
||||
kplus1 = *k + 1;
|
||||
if (*incx == 1 && *incy == 1) {
|
||||
i__1 = *n;
|
||||
for (j = 1; j <= i__1; ++j) {
|
||||
i__2 = j;
|
||||
z__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, z__1.i = alpha->r * x[i__2].i + alpha->i * x[i__2].r;
|
||||
temp1.r = z__1.r, temp1.i = z__1.i;
|
||||
temp2.r = 0., temp2.i = 0.;
|
||||
l = kplus1 - j;
|
||||
/* Computing MAX */
|
||||
i__2 = 1, i__3 = j - *k;
|
||||
i__4 = j - 1;
|
||||
for (i__ = max(i__2, i__3); i__ <= i__4; ++i__) {
|
||||
i__2 = i__;
|
||||
i__3 = i__;
|
||||
i__5 = l + i__ + j * a_dim1;
|
||||
z__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i, z__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5].r;
|
||||
z__1.r = y[i__3].r + z__2.r, z__1.i = y[i__3].i + z__2.i;
|
||||
y[i__2].r = z__1.r, y[i__2].i = z__1.i;
|
||||
d_cnjg(&z__3, &a[l + i__ + j * a_dim1]);
|
||||
i__2 = i__;
|
||||
z__2.r = z__3.r * x[i__2].r - z__3.i * x[i__2].i, z__2.i = z__3.r * x[i__2].i + z__3.i * x[i__2].r;
|
||||
z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i;
|
||||
temp2.r = z__1.r, temp2.i = z__1.i;
|
||||
/* L50: */
|
||||
}
|
||||
i__4 = j;
|
||||
i__2 = j;
|
||||
i__3 = kplus1 + j * a_dim1;
|
||||
d__1 = a[i__3].r;
|
||||
z__3.r = d__1 * temp1.r, z__3.i = d__1 * temp1.i;
|
||||
z__2.r = y[i__2].r + z__3.r, z__2.i = y[i__2].i + z__3.i;
|
||||
z__4.r = alpha->r * temp2.r - alpha->i * temp2.i, z__4.i = alpha->r * temp2.i + alpha->i * temp2.r;
|
||||
z__1.r = z__2.r + z__4.r, z__1.i = z__2.i + z__4.i;
|
||||
y[i__4].r = z__1.r, y[i__4].i = z__1.i;
|
||||
/* L60: */
|
||||
}
|
||||
} else {
|
||||
jx = kx;
|
||||
jy = ky;
|
||||
i__1 = *n;
|
||||
for (j = 1; j <= i__1; ++j) {
|
||||
i__4 = jx;
|
||||
z__1.r = alpha->r * x[i__4].r - alpha->i * x[i__4].i, z__1.i = alpha->r * x[i__4].i + alpha->i * x[i__4].r;
|
||||
temp1.r = z__1.r, temp1.i = z__1.i;
|
||||
temp2.r = 0., temp2.i = 0.;
|
||||
ix = kx;
|
||||
iy = ky;
|
||||
l = kplus1 - j;
|
||||
/* Computing MAX */
|
||||
i__4 = 1, i__2 = j - *k;
|
||||
i__3 = j - 1;
|
||||
for (i__ = max(i__4, i__2); i__ <= i__3; ++i__) {
|
||||
i__4 = iy;
|
||||
i__2 = iy;
|
||||
i__5 = l + i__ + j * a_dim1;
|
||||
z__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i, z__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5].r;
|
||||
z__1.r = y[i__2].r + z__2.r, z__1.i = y[i__2].i + z__2.i;
|
||||
y[i__4].r = z__1.r, y[i__4].i = z__1.i;
|
||||
d_cnjg(&z__3, &a[l + i__ + j * a_dim1]);
|
||||
i__4 = ix;
|
||||
z__2.r = z__3.r * x[i__4].r - z__3.i * x[i__4].i, z__2.i = z__3.r * x[i__4].i + z__3.i * x[i__4].r;
|
||||
z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i;
|
||||
temp2.r = z__1.r, temp2.i = z__1.i;
|
||||
ix += *incx;
|
||||
iy += *incy;
|
||||
/* L70: */
|
||||
}
|
||||
i__3 = jy;
|
||||
i__4 = jy;
|
||||
i__2 = kplus1 + j * a_dim1;
|
||||
d__1 = a[i__2].r;
|
||||
z__3.r = d__1 * temp1.r, z__3.i = d__1 * temp1.i;
|
||||
z__2.r = y[i__4].r + z__3.r, z__2.i = y[i__4].i + z__3.i;
|
||||
z__4.r = alpha->r * temp2.r - alpha->i * temp2.i, z__4.i = alpha->r * temp2.i + alpha->i * temp2.r;
|
||||
z__1.r = z__2.r + z__4.r, z__1.i = z__2.i + z__4.i;
|
||||
y[i__3].r = z__1.r, y[i__3].i = z__1.i;
|
||||
jx += *incx;
|
||||
jy += *incy;
|
||||
if (j > *k) {
|
||||
kx += *incx;
|
||||
ky += *incy;
|
||||
}
|
||||
/* L80: */
|
||||
}
|
||||
}
|
||||
} else {
|
||||
/* Form y when lower triangle of A is stored. */
|
||||
|
||||
if (*incx == 1 && *incy == 1) {
|
||||
i__1 = *n;
|
||||
for (j = 1; j <= i__1; ++j) {
|
||||
i__3 = j;
|
||||
z__1.r = alpha->r * x[i__3].r - alpha->i * x[i__3].i, z__1.i = alpha->r * x[i__3].i + alpha->i * x[i__3].r;
|
||||
temp1.r = z__1.r, temp1.i = z__1.i;
|
||||
temp2.r = 0., temp2.i = 0.;
|
||||
i__3 = j;
|
||||
i__4 = j;
|
||||
i__2 = j * a_dim1 + 1;
|
||||
d__1 = a[i__2].r;
|
||||
z__2.r = d__1 * temp1.r, z__2.i = d__1 * temp1.i;
|
||||
z__1.r = y[i__4].r + z__2.r, z__1.i = y[i__4].i + z__2.i;
|
||||
y[i__3].r = z__1.r, y[i__3].i = z__1.i;
|
||||
l = 1 - j;
|
||||
/* Computing MIN */
|
||||
i__4 = *n, i__2 = j + *k;
|
||||
i__3 = min(i__4, i__2);
|
||||
for (i__ = j + 1; i__ <= i__3; ++i__) {
|
||||
i__4 = i__;
|
||||
i__2 = i__;
|
||||
i__5 = l + i__ + j * a_dim1;
|
||||
z__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i, z__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5].r;
|
||||
z__1.r = y[i__2].r + z__2.r, z__1.i = y[i__2].i + z__2.i;
|
||||
y[i__4].r = z__1.r, y[i__4].i = z__1.i;
|
||||
d_cnjg(&z__3, &a[l + i__ + j * a_dim1]);
|
||||
i__4 = i__;
|
||||
z__2.r = z__3.r * x[i__4].r - z__3.i * x[i__4].i, z__2.i = z__3.r * x[i__4].i + z__3.i * x[i__4].r;
|
||||
z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i;
|
||||
temp2.r = z__1.r, temp2.i = z__1.i;
|
||||
/* L90: */
|
||||
}
|
||||
i__3 = j;
|
||||
i__4 = j;
|
||||
z__2.r = alpha->r * temp2.r - alpha->i * temp2.i, z__2.i = alpha->r * temp2.i + alpha->i * temp2.r;
|
||||
z__1.r = y[i__4].r + z__2.r, z__1.i = y[i__4].i + z__2.i;
|
||||
y[i__3].r = z__1.r, y[i__3].i = z__1.i;
|
||||
/* L100: */
|
||||
}
|
||||
} else {
|
||||
jx = kx;
|
||||
jy = ky;
|
||||
i__1 = *n;
|
||||
for (j = 1; j <= i__1; ++j) {
|
||||
i__3 = jx;
|
||||
z__1.r = alpha->r * x[i__3].r - alpha->i * x[i__3].i, z__1.i = alpha->r * x[i__3].i + alpha->i * x[i__3].r;
|
||||
temp1.r = z__1.r, temp1.i = z__1.i;
|
||||
temp2.r = 0., temp2.i = 0.;
|
||||
i__3 = jy;
|
||||
i__4 = jy;
|
||||
i__2 = j * a_dim1 + 1;
|
||||
d__1 = a[i__2].r;
|
||||
z__2.r = d__1 * temp1.r, z__2.i = d__1 * temp1.i;
|
||||
z__1.r = y[i__4].r + z__2.r, z__1.i = y[i__4].i + z__2.i;
|
||||
y[i__3].r = z__1.r, y[i__3].i = z__1.i;
|
||||
l = 1 - j;
|
||||
ix = jx;
|
||||
iy = jy;
|
||||
/* Computing MIN */
|
||||
i__4 = *n, i__2 = j + *k;
|
||||
i__3 = min(i__4, i__2);
|
||||
for (i__ = j + 1; i__ <= i__3; ++i__) {
|
||||
ix += *incx;
|
||||
iy += *incy;
|
||||
i__4 = iy;
|
||||
i__2 = iy;
|
||||
i__5 = l + i__ + j * a_dim1;
|
||||
z__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i, z__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5].r;
|
||||
z__1.r = y[i__2].r + z__2.r, z__1.i = y[i__2].i + z__2.i;
|
||||
y[i__4].r = z__1.r, y[i__4].i = z__1.i;
|
||||
d_cnjg(&z__3, &a[l + i__ + j * a_dim1]);
|
||||
i__4 = ix;
|
||||
z__2.r = z__3.r * x[i__4].r - z__3.i * x[i__4].i, z__2.i = z__3.r * x[i__4].i + z__3.i * x[i__4].r;
|
||||
z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i;
|
||||
temp2.r = z__1.r, temp2.i = z__1.i;
|
||||
/* L110: */
|
||||
}
|
||||
i__3 = jy;
|
||||
i__4 = jy;
|
||||
z__2.r = alpha->r * temp2.r - alpha->i * temp2.i, z__2.i = alpha->r * temp2.i + alpha->i * temp2.r;
|
||||
z__1.r = y[i__4].r + z__2.r, z__1.i = y[i__4].i + z__2.i;
|
||||
y[i__3].r = z__1.r, y[i__3].i = z__1.i;
|
||||
jx += *incx;
|
||||
jy += *incy;
|
||||
/* L120: */
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* End of ZHBMV . */
|
||||
|
||||
} /* zhbmv_ */
|
||||
407
blas/f2c/zhpmv.c
Normal file
407
blas/f2c/zhpmv.c
Normal file
@@ -0,0 +1,407 @@
|
||||
/* zhpmv.f -- translated by f2c (version 20100827).
|
||||
You must link the resulting object file with libf2c:
|
||||
on Microsoft Windows system, link with libf2c.lib;
|
||||
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
|
||||
or, if you install libf2c.a in a standard place, with -lf2c -lm
|
||||
-- in that order, at the end of the command line, as in
|
||||
cc *.o -lf2c -lm
|
||||
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
|
||||
|
||||
http://www.netlib.org/f2c/libf2c.zip
|
||||
*/
|
||||
|
||||
#include "datatypes.h"
|
||||
|
||||
static inline void d_cnjg(doublecomplex *r, doublecomplex *z) {
|
||||
r->r = z->r;
|
||||
r->i = -(z->i);
|
||||
}
|
||||
|
||||
/* Subroutine */ void zhpmv_(char *uplo, integer *n, doublecomplex *alpha, doublecomplex *ap, doublecomplex *x,
|
||||
integer *incx, doublecomplex *beta, doublecomplex *y, integer *incy) {
|
||||
/* System generated locals */
|
||||
integer i__1, i__2, i__3, i__4, i__5;
|
||||
doublereal d__1;
|
||||
doublecomplex z__1, z__2, z__3, z__4;
|
||||
|
||||
/* Local variables */
|
||||
integer i__, j, k, kk, ix, iy, jx, jy, kx, ky, info;
|
||||
doublecomplex temp1, temp2;
|
||||
extern logical lsame_(char *, char *);
|
||||
extern /* Subroutine */ void xerbla_(const char *, integer *);
|
||||
|
||||
/* .. Scalar Arguments .. */
|
||||
/* .. */
|
||||
/* .. Array Arguments .. */
|
||||
/* .. */
|
||||
|
||||
/* Purpose */
|
||||
/* ======= */
|
||||
|
||||
/* ZHPMV performs the matrix-vector operation */
|
||||
|
||||
/* y := alpha*A*x + beta*y, */
|
||||
|
||||
/* where alpha and beta are scalars, x and y are n element vectors and */
|
||||
/* A is an n by n hermitian matrix, supplied in packed form. */
|
||||
|
||||
/* Arguments */
|
||||
/* ========== */
|
||||
|
||||
/* UPLO - CHARACTER*1. */
|
||||
/* On entry, UPLO specifies whether the upper or lower */
|
||||
/* triangular part of the matrix A is supplied in the packed */
|
||||
/* array AP as follows: */
|
||||
|
||||
/* UPLO = 'U' or 'u' The upper triangular part of A is */
|
||||
/* supplied in AP. */
|
||||
|
||||
/* UPLO = 'L' or 'l' The lower triangular part of A is */
|
||||
/* supplied in AP. */
|
||||
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* N - INTEGER. */
|
||||
/* On entry, N specifies the order of the matrix A. */
|
||||
/* N must be at least zero. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* ALPHA - COMPLEX*16 . */
|
||||
/* On entry, ALPHA specifies the scalar alpha. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* AP - COMPLEX*16 array of DIMENSION at least */
|
||||
/* ( ( n*( n + 1 ) )/2 ). */
|
||||
/* Before entry with UPLO = 'U' or 'u', the array AP must */
|
||||
/* contain the upper triangular part of the hermitian matrix */
|
||||
/* packed sequentially, column by column, so that AP( 1 ) */
|
||||
/* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) */
|
||||
/* and a( 2, 2 ) respectively, and so on. */
|
||||
/* Before entry with UPLO = 'L' or 'l', the array AP must */
|
||||
/* contain the lower triangular part of the hermitian matrix */
|
||||
/* packed sequentially, column by column, so that AP( 1 ) */
|
||||
/* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) */
|
||||
/* and a( 3, 1 ) respectively, and so on. */
|
||||
/* Note that the imaginary parts of the diagonal elements need */
|
||||
/* not be set and are assumed to be zero. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* X - COMPLEX*16 array of dimension at least */
|
||||
/* ( 1 + ( n - 1 )*abs( INCX ) ). */
|
||||
/* Before entry, the incremented array X must contain the n */
|
||||
/* element vector x. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* INCX - INTEGER. */
|
||||
/* On entry, INCX specifies the increment for the elements of */
|
||||
/* X. INCX must not be zero. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* BETA - COMPLEX*16 . */
|
||||
/* On entry, BETA specifies the scalar beta. When BETA is */
|
||||
/* supplied as zero then Y need not be set on input. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* Y - COMPLEX*16 array of dimension at least */
|
||||
/* ( 1 + ( n - 1 )*abs( INCY ) ). */
|
||||
/* Before entry, the incremented array Y must contain the n */
|
||||
/* element vector y. On exit, Y is overwritten by the updated */
|
||||
/* vector y. */
|
||||
|
||||
/* INCY - INTEGER. */
|
||||
/* On entry, INCY specifies the increment for the elements of */
|
||||
/* Y. INCY must not be zero. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* Further Details */
|
||||
/* =============== */
|
||||
|
||||
/* Level 2 Blas routine. */
|
||||
|
||||
/* -- Written on 22-October-1986. */
|
||||
/* Jack Dongarra, Argonne National Lab. */
|
||||
/* Jeremy Du Croz, Nag Central Office. */
|
||||
/* Sven Hammarling, Nag Central Office. */
|
||||
/* Richard Hanson, Sandia National Labs. */
|
||||
|
||||
/* ===================================================================== */
|
||||
|
||||
/* .. Parameters .. */
|
||||
/* .. */
|
||||
/* .. Local Scalars .. */
|
||||
/* .. */
|
||||
/* .. External Functions .. */
|
||||
/* .. */
|
||||
/* .. External Subroutines .. */
|
||||
/* .. */
|
||||
/* .. Intrinsic Functions .. */
|
||||
/* .. */
|
||||
|
||||
/* Test the input parameters. */
|
||||
|
||||
/* Parameter adjustments */
|
||||
--y;
|
||||
--x;
|
||||
--ap;
|
||||
|
||||
/* Function Body */
|
||||
info = 0;
|
||||
if (!lsame_(uplo, "U") && !lsame_(uplo, "L")) {
|
||||
info = 1;
|
||||
} else if (*n < 0) {
|
||||
info = 2;
|
||||
} else if (*incx == 0) {
|
||||
info = 6;
|
||||
} else if (*incy == 0) {
|
||||
info = 9;
|
||||
}
|
||||
if (info != 0) {
|
||||
xerbla_("ZHPMV ", &info);
|
||||
return;
|
||||
}
|
||||
|
||||
/* Quick return if possible. */
|
||||
|
||||
if (*n == 0 || (alpha->r == 0. && alpha->i == 0. && (beta->r == 1. && beta->i == 0.))) {
|
||||
return;
|
||||
}
|
||||
|
||||
/* Set up the start points in X and Y. */
|
||||
|
||||
if (*incx > 0) {
|
||||
kx = 1;
|
||||
} else {
|
||||
kx = 1 - (*n - 1) * *incx;
|
||||
}
|
||||
if (*incy > 0) {
|
||||
ky = 1;
|
||||
} else {
|
||||
ky = 1 - (*n - 1) * *incy;
|
||||
}
|
||||
|
||||
/* Start the operations. In this version the elements of the array AP */
|
||||
/* are accessed sequentially with one pass through AP. */
|
||||
|
||||
/* First form y := beta*y. */
|
||||
|
||||
if (beta->r != 1. || beta->i != 0.) {
|
||||
if (*incy == 1) {
|
||||
if (beta->r == 0. && beta->i == 0.) {
|
||||
i__1 = *n;
|
||||
for (i__ = 1; i__ <= i__1; ++i__) {
|
||||
i__2 = i__;
|
||||
y[i__2].r = 0., y[i__2].i = 0.;
|
||||
/* L10: */
|
||||
}
|
||||
} else {
|
||||
i__1 = *n;
|
||||
for (i__ = 1; i__ <= i__1; ++i__) {
|
||||
i__2 = i__;
|
||||
i__3 = i__;
|
||||
z__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i, z__1.i = beta->r * y[i__3].i + beta->i * y[i__3].r;
|
||||
y[i__2].r = z__1.r, y[i__2].i = z__1.i;
|
||||
/* L20: */
|
||||
}
|
||||
}
|
||||
} else {
|
||||
iy = ky;
|
||||
if (beta->r == 0. && beta->i == 0.) {
|
||||
i__1 = *n;
|
||||
for (i__ = 1; i__ <= i__1; ++i__) {
|
||||
i__2 = iy;
|
||||
y[i__2].r = 0., y[i__2].i = 0.;
|
||||
iy += *incy;
|
||||
/* L30: */
|
||||
}
|
||||
} else {
|
||||
i__1 = *n;
|
||||
for (i__ = 1; i__ <= i__1; ++i__) {
|
||||
i__2 = iy;
|
||||
i__3 = iy;
|
||||
z__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i, z__1.i = beta->r * y[i__3].i + beta->i * y[i__3].r;
|
||||
y[i__2].r = z__1.r, y[i__2].i = z__1.i;
|
||||
iy += *incy;
|
||||
/* L40: */
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (alpha->r == 0. && alpha->i == 0.) {
|
||||
return;
|
||||
}
|
||||
kk = 1;
|
||||
if (lsame_(uplo, "U")) {
|
||||
/* Form y when AP contains the upper triangle. */
|
||||
|
||||
if (*incx == 1 && *incy == 1) {
|
||||
i__1 = *n;
|
||||
for (j = 1; j <= i__1; ++j) {
|
||||
i__2 = j;
|
||||
z__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, z__1.i = alpha->r * x[i__2].i + alpha->i * x[i__2].r;
|
||||
temp1.r = z__1.r, temp1.i = z__1.i;
|
||||
temp2.r = 0., temp2.i = 0.;
|
||||
k = kk;
|
||||
i__2 = j - 1;
|
||||
for (i__ = 1; i__ <= i__2; ++i__) {
|
||||
i__3 = i__;
|
||||
i__4 = i__;
|
||||
i__5 = k;
|
||||
z__2.r = temp1.r * ap[i__5].r - temp1.i * ap[i__5].i, z__2.i = temp1.r * ap[i__5].i + temp1.i * ap[i__5].r;
|
||||
z__1.r = y[i__4].r + z__2.r, z__1.i = y[i__4].i + z__2.i;
|
||||
y[i__3].r = z__1.r, y[i__3].i = z__1.i;
|
||||
d_cnjg(&z__3, &ap[k]);
|
||||
i__3 = i__;
|
||||
z__2.r = z__3.r * x[i__3].r - z__3.i * x[i__3].i, z__2.i = z__3.r * x[i__3].i + z__3.i * x[i__3].r;
|
||||
z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i;
|
||||
temp2.r = z__1.r, temp2.i = z__1.i;
|
||||
++k;
|
||||
/* L50: */
|
||||
}
|
||||
i__2 = j;
|
||||
i__3 = j;
|
||||
i__4 = kk + j - 1;
|
||||
d__1 = ap[i__4].r;
|
||||
z__3.r = d__1 * temp1.r, z__3.i = d__1 * temp1.i;
|
||||
z__2.r = y[i__3].r + z__3.r, z__2.i = y[i__3].i + z__3.i;
|
||||
z__4.r = alpha->r * temp2.r - alpha->i * temp2.i, z__4.i = alpha->r * temp2.i + alpha->i * temp2.r;
|
||||
z__1.r = z__2.r + z__4.r, z__1.i = z__2.i + z__4.i;
|
||||
y[i__2].r = z__1.r, y[i__2].i = z__1.i;
|
||||
kk += j;
|
||||
/* L60: */
|
||||
}
|
||||
} else {
|
||||
jx = kx;
|
||||
jy = ky;
|
||||
i__1 = *n;
|
||||
for (j = 1; j <= i__1; ++j) {
|
||||
i__2 = jx;
|
||||
z__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, z__1.i = alpha->r * x[i__2].i + alpha->i * x[i__2].r;
|
||||
temp1.r = z__1.r, temp1.i = z__1.i;
|
||||
temp2.r = 0., temp2.i = 0.;
|
||||
ix = kx;
|
||||
iy = ky;
|
||||
i__2 = kk + j - 2;
|
||||
for (k = kk; k <= i__2; ++k) {
|
||||
i__3 = iy;
|
||||
i__4 = iy;
|
||||
i__5 = k;
|
||||
z__2.r = temp1.r * ap[i__5].r - temp1.i * ap[i__5].i, z__2.i = temp1.r * ap[i__5].i + temp1.i * ap[i__5].r;
|
||||
z__1.r = y[i__4].r + z__2.r, z__1.i = y[i__4].i + z__2.i;
|
||||
y[i__3].r = z__1.r, y[i__3].i = z__1.i;
|
||||
d_cnjg(&z__3, &ap[k]);
|
||||
i__3 = ix;
|
||||
z__2.r = z__3.r * x[i__3].r - z__3.i * x[i__3].i, z__2.i = z__3.r * x[i__3].i + z__3.i * x[i__3].r;
|
||||
z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i;
|
||||
temp2.r = z__1.r, temp2.i = z__1.i;
|
||||
ix += *incx;
|
||||
iy += *incy;
|
||||
/* L70: */
|
||||
}
|
||||
i__2 = jy;
|
||||
i__3 = jy;
|
||||
i__4 = kk + j - 1;
|
||||
d__1 = ap[i__4].r;
|
||||
z__3.r = d__1 * temp1.r, z__3.i = d__1 * temp1.i;
|
||||
z__2.r = y[i__3].r + z__3.r, z__2.i = y[i__3].i + z__3.i;
|
||||
z__4.r = alpha->r * temp2.r - alpha->i * temp2.i, z__4.i = alpha->r * temp2.i + alpha->i * temp2.r;
|
||||
z__1.r = z__2.r + z__4.r, z__1.i = z__2.i + z__4.i;
|
||||
y[i__2].r = z__1.r, y[i__2].i = z__1.i;
|
||||
jx += *incx;
|
||||
jy += *incy;
|
||||
kk += j;
|
||||
/* L80: */
|
||||
}
|
||||
}
|
||||
} else {
|
||||
/* Form y when AP contains the lower triangle. */
|
||||
|
||||
if (*incx == 1 && *incy == 1) {
|
||||
i__1 = *n;
|
||||
for (j = 1; j <= i__1; ++j) {
|
||||
i__2 = j;
|
||||
z__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, z__1.i = alpha->r * x[i__2].i + alpha->i * x[i__2].r;
|
||||
temp1.r = z__1.r, temp1.i = z__1.i;
|
||||
temp2.r = 0., temp2.i = 0.;
|
||||
i__2 = j;
|
||||
i__3 = j;
|
||||
i__4 = kk;
|
||||
d__1 = ap[i__4].r;
|
||||
z__2.r = d__1 * temp1.r, z__2.i = d__1 * temp1.i;
|
||||
z__1.r = y[i__3].r + z__2.r, z__1.i = y[i__3].i + z__2.i;
|
||||
y[i__2].r = z__1.r, y[i__2].i = z__1.i;
|
||||
k = kk + 1;
|
||||
i__2 = *n;
|
||||
for (i__ = j + 1; i__ <= i__2; ++i__) {
|
||||
i__3 = i__;
|
||||
i__4 = i__;
|
||||
i__5 = k;
|
||||
z__2.r = temp1.r * ap[i__5].r - temp1.i * ap[i__5].i, z__2.i = temp1.r * ap[i__5].i + temp1.i * ap[i__5].r;
|
||||
z__1.r = y[i__4].r + z__2.r, z__1.i = y[i__4].i + z__2.i;
|
||||
y[i__3].r = z__1.r, y[i__3].i = z__1.i;
|
||||
d_cnjg(&z__3, &ap[k]);
|
||||
i__3 = i__;
|
||||
z__2.r = z__3.r * x[i__3].r - z__3.i * x[i__3].i, z__2.i = z__3.r * x[i__3].i + z__3.i * x[i__3].r;
|
||||
z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i;
|
||||
temp2.r = z__1.r, temp2.i = z__1.i;
|
||||
++k;
|
||||
/* L90: */
|
||||
}
|
||||
i__2 = j;
|
||||
i__3 = j;
|
||||
z__2.r = alpha->r * temp2.r - alpha->i * temp2.i, z__2.i = alpha->r * temp2.i + alpha->i * temp2.r;
|
||||
z__1.r = y[i__3].r + z__2.r, z__1.i = y[i__3].i + z__2.i;
|
||||
y[i__2].r = z__1.r, y[i__2].i = z__1.i;
|
||||
kk += *n - j + 1;
|
||||
/* L100: */
|
||||
}
|
||||
} else {
|
||||
jx = kx;
|
||||
jy = ky;
|
||||
i__1 = *n;
|
||||
for (j = 1; j <= i__1; ++j) {
|
||||
i__2 = jx;
|
||||
z__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, z__1.i = alpha->r * x[i__2].i + alpha->i * x[i__2].r;
|
||||
temp1.r = z__1.r, temp1.i = z__1.i;
|
||||
temp2.r = 0., temp2.i = 0.;
|
||||
i__2 = jy;
|
||||
i__3 = jy;
|
||||
i__4 = kk;
|
||||
d__1 = ap[i__4].r;
|
||||
z__2.r = d__1 * temp1.r, z__2.i = d__1 * temp1.i;
|
||||
z__1.r = y[i__3].r + z__2.r, z__1.i = y[i__3].i + z__2.i;
|
||||
y[i__2].r = z__1.r, y[i__2].i = z__1.i;
|
||||
ix = jx;
|
||||
iy = jy;
|
||||
i__2 = kk + *n - j;
|
||||
for (k = kk + 1; k <= i__2; ++k) {
|
||||
ix += *incx;
|
||||
iy += *incy;
|
||||
i__3 = iy;
|
||||
i__4 = iy;
|
||||
i__5 = k;
|
||||
z__2.r = temp1.r * ap[i__5].r - temp1.i * ap[i__5].i, z__2.i = temp1.r * ap[i__5].i + temp1.i * ap[i__5].r;
|
||||
z__1.r = y[i__4].r + z__2.r, z__1.i = y[i__4].i + z__2.i;
|
||||
y[i__3].r = z__1.r, y[i__3].i = z__1.i;
|
||||
d_cnjg(&z__3, &ap[k]);
|
||||
i__3 = ix;
|
||||
z__2.r = z__3.r * x[i__3].r - z__3.i * x[i__3].i, z__2.i = z__3.r * x[i__3].i + z__3.i * x[i__3].r;
|
||||
z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i;
|
||||
temp2.r = z__1.r, temp2.i = z__1.i;
|
||||
/* L110: */
|
||||
}
|
||||
i__2 = jy;
|
||||
i__3 = jy;
|
||||
z__2.r = alpha->r * temp2.r - alpha->i * temp2.i, z__2.i = alpha->r * temp2.i + alpha->i * temp2.r;
|
||||
z__1.r = y[i__3].r + z__2.r, z__1.i = y[i__3].i + z__2.i;
|
||||
y[i__2].r = z__1.r, y[i__2].i = z__1.i;
|
||||
jx += *incx;
|
||||
jy += *incy;
|
||||
kk += *n - j + 1;
|
||||
/* L120: */
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* End of ZHPMV . */
|
||||
|
||||
} /* zhpmv_ */
|
||||
586
blas/f2c/ztbmv.c
Normal file
586
blas/f2c/ztbmv.c
Normal file
@@ -0,0 +1,586 @@
|
||||
/* ztbmv.f -- translated by f2c (version 20100827).
|
||||
You must link the resulting object file with libf2c:
|
||||
on Microsoft Windows system, link with libf2c.lib;
|
||||
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
|
||||
or, if you install libf2c.a in a standard place, with -lf2c -lm
|
||||
-- in that order, at the end of the command line, as in
|
||||
cc *.o -lf2c -lm
|
||||
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
|
||||
|
||||
http://www.netlib.org/f2c/libf2c.zip
|
||||
*/
|
||||
|
||||
#include "datatypes.h"
|
||||
|
||||
static inline void d_cnjg(doublecomplex *r, doublecomplex *z) {
|
||||
r->r = z->r;
|
||||
r->i = -(z->i);
|
||||
}
|
||||
|
||||
/* Subroutine */ void ztbmv_(char *uplo, char *trans, char *diag, integer *n, integer *k, doublecomplex *a,
|
||||
integer *lda, doublecomplex *x, integer *incx) {
|
||||
/* System generated locals */
|
||||
integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5;
|
||||
doublecomplex z__1, z__2, z__3;
|
||||
|
||||
/* Local variables */
|
||||
integer i__, j, l, ix, jx, kx, info;
|
||||
doublecomplex temp;
|
||||
extern logical lsame_(char *, char *);
|
||||
integer kplus1;
|
||||
extern /* Subroutine */ void xerbla_(const char *, integer *);
|
||||
logical noconj, nounit;
|
||||
|
||||
/* .. Scalar Arguments .. */
|
||||
/* .. */
|
||||
/* .. Array Arguments .. */
|
||||
/* .. */
|
||||
|
||||
/* Purpose */
|
||||
/* ======= */
|
||||
|
||||
/* ZTBMV performs one of the matrix-vector operations */
|
||||
|
||||
/* x := A*x, or x := A'*x, or x := conjg( A' )*x, */
|
||||
|
||||
/* where x is an n element vector and A is an n by n unit, or non-unit, */
|
||||
/* upper or lower triangular band matrix, with ( k + 1 ) diagonals. */
|
||||
|
||||
/* Arguments */
|
||||
/* ========== */
|
||||
|
||||
/* UPLO - CHARACTER*1. */
|
||||
/* On entry, UPLO specifies whether the matrix is an upper or */
|
||||
/* lower triangular matrix as follows: */
|
||||
|
||||
/* UPLO = 'U' or 'u' A is an upper triangular matrix. */
|
||||
|
||||
/* UPLO = 'L' or 'l' A is a lower triangular matrix. */
|
||||
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* TRANS - CHARACTER*1. */
|
||||
/* On entry, TRANS specifies the operation to be performed as */
|
||||
/* follows: */
|
||||
|
||||
/* TRANS = 'N' or 'n' x := A*x. */
|
||||
|
||||
/* TRANS = 'T' or 't' x := A'*x. */
|
||||
|
||||
/* TRANS = 'C' or 'c' x := conjg( A' )*x. */
|
||||
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* DIAG - CHARACTER*1. */
|
||||
/* On entry, DIAG specifies whether or not A is unit */
|
||||
/* triangular as follows: */
|
||||
|
||||
/* DIAG = 'U' or 'u' A is assumed to be unit triangular. */
|
||||
|
||||
/* DIAG = 'N' or 'n' A is not assumed to be unit */
|
||||
/* triangular. */
|
||||
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* N - INTEGER. */
|
||||
/* On entry, N specifies the order of the matrix A. */
|
||||
/* N must be at least zero. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* K - INTEGER. */
|
||||
/* On entry with UPLO = 'U' or 'u', K specifies the number of */
|
||||
/* super-diagonals of the matrix A. */
|
||||
/* On entry with UPLO = 'L' or 'l', K specifies the number of */
|
||||
/* sub-diagonals of the matrix A. */
|
||||
/* K must satisfy 0 .le. K. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* A - COMPLEX*16 array of DIMENSION ( LDA, n ). */
|
||||
/* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */
|
||||
/* by n part of the array A must contain the upper triangular */
|
||||
/* band part of the matrix of coefficients, supplied column by */
|
||||
/* column, with the leading diagonal of the matrix in row */
|
||||
/* ( k + 1 ) of the array, the first super-diagonal starting at */
|
||||
/* position 2 in row k, and so on. The top left k by k triangle */
|
||||
/* of the array A is not referenced. */
|
||||
/* The following program segment will transfer an upper */
|
||||
/* triangular band matrix from conventional full matrix storage */
|
||||
/* to band storage: */
|
||||
|
||||
/* DO 20, J = 1, N */
|
||||
/* M = K + 1 - J */
|
||||
/* DO 10, I = MAX( 1, J - K ), J */
|
||||
/* A( M + I, J ) = matrix( I, J ) */
|
||||
/* 10 CONTINUE */
|
||||
/* 20 CONTINUE */
|
||||
|
||||
/* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */
|
||||
/* by n part of the array A must contain the lower triangular */
|
||||
/* band part of the matrix of coefficients, supplied column by */
|
||||
/* column, with the leading diagonal of the matrix in row 1 of */
|
||||
/* the array, the first sub-diagonal starting at position 1 in */
|
||||
/* row 2, and so on. The bottom right k by k triangle of the */
|
||||
/* array A is not referenced. */
|
||||
/* The following program segment will transfer a lower */
|
||||
/* triangular band matrix from conventional full matrix storage */
|
||||
/* to band storage: */
|
||||
|
||||
/* DO 20, J = 1, N */
|
||||
/* M = 1 - J */
|
||||
/* DO 10, I = J, MIN( N, J + K ) */
|
||||
/* A( M + I, J ) = matrix( I, J ) */
|
||||
/* 10 CONTINUE */
|
||||
/* 20 CONTINUE */
|
||||
|
||||
/* Note that when DIAG = 'U' or 'u' the elements of the array A */
|
||||
/* corresponding to the diagonal elements of the matrix are not */
|
||||
/* referenced, but are assumed to be unity. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* LDA - INTEGER. */
|
||||
/* On entry, LDA specifies the first dimension of A as declared */
|
||||
/* in the calling (sub) program. LDA must be at least */
|
||||
/* ( k + 1 ). */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* X - COMPLEX*16 array of dimension at least */
|
||||
/* ( 1 + ( n - 1 )*abs( INCX ) ). */
|
||||
/* Before entry, the incremented array X must contain the n */
|
||||
/* element vector x. On exit, X is overwritten with the */
|
||||
/* transformed vector x. */
|
||||
|
||||
/* INCX - INTEGER. */
|
||||
/* On entry, INCX specifies the increment for the elements of */
|
||||
/* X. INCX must not be zero. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* Further Details */
|
||||
/* =============== */
|
||||
|
||||
/* Level 2 Blas routine. */
|
||||
|
||||
/* -- Written on 22-October-1986. */
|
||||
/* Jack Dongarra, Argonne National Lab. */
|
||||
/* Jeremy Du Croz, Nag Central Office. */
|
||||
/* Sven Hammarling, Nag Central Office. */
|
||||
/* Richard Hanson, Sandia National Labs. */
|
||||
|
||||
/* ===================================================================== */
|
||||
|
||||
/* .. Parameters .. */
|
||||
/* .. */
|
||||
/* .. Local Scalars .. */
|
||||
/* .. */
|
||||
/* .. External Functions .. */
|
||||
/* .. */
|
||||
/* .. External Subroutines .. */
|
||||
/* .. */
|
||||
/* .. Intrinsic Functions .. */
|
||||
/* .. */
|
||||
|
||||
/* Test the input parameters. */
|
||||
|
||||
/* Parameter adjustments */
|
||||
a_dim1 = *lda;
|
||||
a_offset = 1 + a_dim1;
|
||||
a -= a_offset;
|
||||
--x;
|
||||
|
||||
/* Function Body */
|
||||
info = 0;
|
||||
if (!lsame_(uplo, "U") && !lsame_(uplo, "L")) {
|
||||
info = 1;
|
||||
} else if (!lsame_(trans, "N") && !lsame_(trans, "T") && !lsame_(trans, "C")) {
|
||||
info = 2;
|
||||
} else if (!lsame_(diag, "U") && !lsame_(diag, "N")) {
|
||||
info = 3;
|
||||
} else if (*n < 0) {
|
||||
info = 4;
|
||||
} else if (*k < 0) {
|
||||
info = 5;
|
||||
} else if (*lda < *k + 1) {
|
||||
info = 7;
|
||||
} else if (*incx == 0) {
|
||||
info = 9;
|
||||
}
|
||||
if (info != 0) {
|
||||
xerbla_("ZTBMV ", &info);
|
||||
return;
|
||||
}
|
||||
|
||||
/* Quick return if possible. */
|
||||
|
||||
if (*n == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
noconj = lsame_(trans, "T");
|
||||
nounit = lsame_(diag, "N");
|
||||
|
||||
/* Set up the start point in X if the increment is not unity. This */
|
||||
/* will be ( N - 1 )*INCX too small for descending loops. */
|
||||
|
||||
if (*incx <= 0) {
|
||||
kx = 1 - (*n - 1) * *incx;
|
||||
} else if (*incx != 1) {
|
||||
kx = 1;
|
||||
}
|
||||
|
||||
/* Start the operations. In this version the elements of A are */
|
||||
/* accessed sequentially with one pass through A. */
|
||||
|
||||
if (lsame_(trans, "N")) {
|
||||
/* Form x := A*x. */
|
||||
|
||||
if (lsame_(uplo, "U")) {
|
||||
kplus1 = *k + 1;
|
||||
if (*incx == 1) {
|
||||
i__1 = *n;
|
||||
for (j = 1; j <= i__1; ++j) {
|
||||
i__2 = j;
|
||||
if (x[i__2].r != 0. || x[i__2].i != 0.) {
|
||||
i__2 = j;
|
||||
temp.r = x[i__2].r, temp.i = x[i__2].i;
|
||||
l = kplus1 - j;
|
||||
/* Computing MAX */
|
||||
i__2 = 1, i__3 = j - *k;
|
||||
i__4 = j - 1;
|
||||
for (i__ = max(i__2, i__3); i__ <= i__4; ++i__) {
|
||||
i__2 = i__;
|
||||
i__3 = i__;
|
||||
i__5 = l + i__ + j * a_dim1;
|
||||
z__2.r = temp.r * a[i__5].r - temp.i * a[i__5].i, z__2.i = temp.r * a[i__5].i + temp.i * a[i__5].r;
|
||||
z__1.r = x[i__3].r + z__2.r, z__1.i = x[i__3].i + z__2.i;
|
||||
x[i__2].r = z__1.r, x[i__2].i = z__1.i;
|
||||
/* L10: */
|
||||
}
|
||||
if (nounit) {
|
||||
i__4 = j;
|
||||
i__2 = j;
|
||||
i__3 = kplus1 + j * a_dim1;
|
||||
z__1.r = x[i__2].r * a[i__3].r - x[i__2].i * a[i__3].i,
|
||||
z__1.i = x[i__2].r * a[i__3].i + x[i__2].i * a[i__3].r;
|
||||
x[i__4].r = z__1.r, x[i__4].i = z__1.i;
|
||||
}
|
||||
}
|
||||
/* L20: */
|
||||
}
|
||||
} else {
|
||||
jx = kx;
|
||||
i__1 = *n;
|
||||
for (j = 1; j <= i__1; ++j) {
|
||||
i__4 = jx;
|
||||
if (x[i__4].r != 0. || x[i__4].i != 0.) {
|
||||
i__4 = jx;
|
||||
temp.r = x[i__4].r, temp.i = x[i__4].i;
|
||||
ix = kx;
|
||||
l = kplus1 - j;
|
||||
/* Computing MAX */
|
||||
i__4 = 1, i__2 = j - *k;
|
||||
i__3 = j - 1;
|
||||
for (i__ = max(i__4, i__2); i__ <= i__3; ++i__) {
|
||||
i__4 = ix;
|
||||
i__2 = ix;
|
||||
i__5 = l + i__ + j * a_dim1;
|
||||
z__2.r = temp.r * a[i__5].r - temp.i * a[i__5].i, z__2.i = temp.r * a[i__5].i + temp.i * a[i__5].r;
|
||||
z__1.r = x[i__2].r + z__2.r, z__1.i = x[i__2].i + z__2.i;
|
||||
x[i__4].r = z__1.r, x[i__4].i = z__1.i;
|
||||
ix += *incx;
|
||||
/* L30: */
|
||||
}
|
||||
if (nounit) {
|
||||
i__3 = jx;
|
||||
i__4 = jx;
|
||||
i__2 = kplus1 + j * a_dim1;
|
||||
z__1.r = x[i__4].r * a[i__2].r - x[i__4].i * a[i__2].i,
|
||||
z__1.i = x[i__4].r * a[i__2].i + x[i__4].i * a[i__2].r;
|
||||
x[i__3].r = z__1.r, x[i__3].i = z__1.i;
|
||||
}
|
||||
}
|
||||
jx += *incx;
|
||||
if (j > *k) {
|
||||
kx += *incx;
|
||||
}
|
||||
/* L40: */
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if (*incx == 1) {
|
||||
for (j = *n; j >= 1; --j) {
|
||||
i__1 = j;
|
||||
if (x[i__1].r != 0. || x[i__1].i != 0.) {
|
||||
i__1 = j;
|
||||
temp.r = x[i__1].r, temp.i = x[i__1].i;
|
||||
l = 1 - j;
|
||||
/* Computing MIN */
|
||||
i__1 = *n, i__3 = j + *k;
|
||||
i__4 = j + 1;
|
||||
for (i__ = min(i__1, i__3); i__ >= i__4; --i__) {
|
||||
i__1 = i__;
|
||||
i__3 = i__;
|
||||
i__2 = l + i__ + j * a_dim1;
|
||||
z__2.r = temp.r * a[i__2].r - temp.i * a[i__2].i, z__2.i = temp.r * a[i__2].i + temp.i * a[i__2].r;
|
||||
z__1.r = x[i__3].r + z__2.r, z__1.i = x[i__3].i + z__2.i;
|
||||
x[i__1].r = z__1.r, x[i__1].i = z__1.i;
|
||||
/* L50: */
|
||||
}
|
||||
if (nounit) {
|
||||
i__4 = j;
|
||||
i__1 = j;
|
||||
i__3 = j * a_dim1 + 1;
|
||||
z__1.r = x[i__1].r * a[i__3].r - x[i__1].i * a[i__3].i,
|
||||
z__1.i = x[i__1].r * a[i__3].i + x[i__1].i * a[i__3].r;
|
||||
x[i__4].r = z__1.r, x[i__4].i = z__1.i;
|
||||
}
|
||||
}
|
||||
/* L60: */
|
||||
}
|
||||
} else {
|
||||
kx += (*n - 1) * *incx;
|
||||
jx = kx;
|
||||
for (j = *n; j >= 1; --j) {
|
||||
i__4 = jx;
|
||||
if (x[i__4].r != 0. || x[i__4].i != 0.) {
|
||||
i__4 = jx;
|
||||
temp.r = x[i__4].r, temp.i = x[i__4].i;
|
||||
ix = kx;
|
||||
l = 1 - j;
|
||||
/* Computing MIN */
|
||||
i__4 = *n, i__1 = j + *k;
|
||||
i__3 = j + 1;
|
||||
for (i__ = min(i__4, i__1); i__ >= i__3; --i__) {
|
||||
i__4 = ix;
|
||||
i__1 = ix;
|
||||
i__2 = l + i__ + j * a_dim1;
|
||||
z__2.r = temp.r * a[i__2].r - temp.i * a[i__2].i, z__2.i = temp.r * a[i__2].i + temp.i * a[i__2].r;
|
||||
z__1.r = x[i__1].r + z__2.r, z__1.i = x[i__1].i + z__2.i;
|
||||
x[i__4].r = z__1.r, x[i__4].i = z__1.i;
|
||||
ix -= *incx;
|
||||
/* L70: */
|
||||
}
|
||||
if (nounit) {
|
||||
i__3 = jx;
|
||||
i__4 = jx;
|
||||
i__1 = j * a_dim1 + 1;
|
||||
z__1.r = x[i__4].r * a[i__1].r - x[i__4].i * a[i__1].i,
|
||||
z__1.i = x[i__4].r * a[i__1].i + x[i__4].i * a[i__1].r;
|
||||
x[i__3].r = z__1.r, x[i__3].i = z__1.i;
|
||||
}
|
||||
}
|
||||
jx -= *incx;
|
||||
if (*n - j >= *k) {
|
||||
kx -= *incx;
|
||||
}
|
||||
/* L80: */
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
/* Form x := A'*x or x := conjg( A' )*x. */
|
||||
|
||||
if (lsame_(uplo, "U")) {
|
||||
kplus1 = *k + 1;
|
||||
if (*incx == 1) {
|
||||
for (j = *n; j >= 1; --j) {
|
||||
i__3 = j;
|
||||
temp.r = x[i__3].r, temp.i = x[i__3].i;
|
||||
l = kplus1 - j;
|
||||
if (noconj) {
|
||||
if (nounit) {
|
||||
i__3 = kplus1 + j * a_dim1;
|
||||
z__1.r = temp.r * a[i__3].r - temp.i * a[i__3].i, z__1.i = temp.r * a[i__3].i + temp.i * a[i__3].r;
|
||||
temp.r = z__1.r, temp.i = z__1.i;
|
||||
}
|
||||
/* Computing MAX */
|
||||
i__4 = 1, i__1 = j - *k;
|
||||
i__3 = max(i__4, i__1);
|
||||
for (i__ = j - 1; i__ >= i__3; --i__) {
|
||||
i__4 = l + i__ + j * a_dim1;
|
||||
i__1 = i__;
|
||||
z__2.r = a[i__4].r * x[i__1].r - a[i__4].i * x[i__1].i,
|
||||
z__2.i = a[i__4].r * x[i__1].i + a[i__4].i * x[i__1].r;
|
||||
z__1.r = temp.r + z__2.r, z__1.i = temp.i + z__2.i;
|
||||
temp.r = z__1.r, temp.i = z__1.i;
|
||||
/* L90: */
|
||||
}
|
||||
} else {
|
||||
if (nounit) {
|
||||
d_cnjg(&z__2, &a[kplus1 + j * a_dim1]);
|
||||
z__1.r = temp.r * z__2.r - temp.i * z__2.i, z__1.i = temp.r * z__2.i + temp.i * z__2.r;
|
||||
temp.r = z__1.r, temp.i = z__1.i;
|
||||
}
|
||||
/* Computing MAX */
|
||||
i__4 = 1, i__1 = j - *k;
|
||||
i__3 = max(i__4, i__1);
|
||||
for (i__ = j - 1; i__ >= i__3; --i__) {
|
||||
d_cnjg(&z__3, &a[l + i__ + j * a_dim1]);
|
||||
i__4 = i__;
|
||||
z__2.r = z__3.r * x[i__4].r - z__3.i * x[i__4].i, z__2.i = z__3.r * x[i__4].i + z__3.i * x[i__4].r;
|
||||
z__1.r = temp.r + z__2.r, z__1.i = temp.i + z__2.i;
|
||||
temp.r = z__1.r, temp.i = z__1.i;
|
||||
/* L100: */
|
||||
}
|
||||
}
|
||||
i__3 = j;
|
||||
x[i__3].r = temp.r, x[i__3].i = temp.i;
|
||||
/* L110: */
|
||||
}
|
||||
} else {
|
||||
kx += (*n - 1) * *incx;
|
||||
jx = kx;
|
||||
for (j = *n; j >= 1; --j) {
|
||||
i__3 = jx;
|
||||
temp.r = x[i__3].r, temp.i = x[i__3].i;
|
||||
kx -= *incx;
|
||||
ix = kx;
|
||||
l = kplus1 - j;
|
||||
if (noconj) {
|
||||
if (nounit) {
|
||||
i__3 = kplus1 + j * a_dim1;
|
||||
z__1.r = temp.r * a[i__3].r - temp.i * a[i__3].i, z__1.i = temp.r * a[i__3].i + temp.i * a[i__3].r;
|
||||
temp.r = z__1.r, temp.i = z__1.i;
|
||||
}
|
||||
/* Computing MAX */
|
||||
i__4 = 1, i__1 = j - *k;
|
||||
i__3 = max(i__4, i__1);
|
||||
for (i__ = j - 1; i__ >= i__3; --i__) {
|
||||
i__4 = l + i__ + j * a_dim1;
|
||||
i__1 = ix;
|
||||
z__2.r = a[i__4].r * x[i__1].r - a[i__4].i * x[i__1].i,
|
||||
z__2.i = a[i__4].r * x[i__1].i + a[i__4].i * x[i__1].r;
|
||||
z__1.r = temp.r + z__2.r, z__1.i = temp.i + z__2.i;
|
||||
temp.r = z__1.r, temp.i = z__1.i;
|
||||
ix -= *incx;
|
||||
/* L120: */
|
||||
}
|
||||
} else {
|
||||
if (nounit) {
|
||||
d_cnjg(&z__2, &a[kplus1 + j * a_dim1]);
|
||||
z__1.r = temp.r * z__2.r - temp.i * z__2.i, z__1.i = temp.r * z__2.i + temp.i * z__2.r;
|
||||
temp.r = z__1.r, temp.i = z__1.i;
|
||||
}
|
||||
/* Computing MAX */
|
||||
i__4 = 1, i__1 = j - *k;
|
||||
i__3 = max(i__4, i__1);
|
||||
for (i__ = j - 1; i__ >= i__3; --i__) {
|
||||
d_cnjg(&z__3, &a[l + i__ + j * a_dim1]);
|
||||
i__4 = ix;
|
||||
z__2.r = z__3.r * x[i__4].r - z__3.i * x[i__4].i, z__2.i = z__3.r * x[i__4].i + z__3.i * x[i__4].r;
|
||||
z__1.r = temp.r + z__2.r, z__1.i = temp.i + z__2.i;
|
||||
temp.r = z__1.r, temp.i = z__1.i;
|
||||
ix -= *incx;
|
||||
/* L130: */
|
||||
}
|
||||
}
|
||||
i__3 = jx;
|
||||
x[i__3].r = temp.r, x[i__3].i = temp.i;
|
||||
jx -= *incx;
|
||||
/* L140: */
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if (*incx == 1) {
|
||||
i__3 = *n;
|
||||
for (j = 1; j <= i__3; ++j) {
|
||||
i__4 = j;
|
||||
temp.r = x[i__4].r, temp.i = x[i__4].i;
|
||||
l = 1 - j;
|
||||
if (noconj) {
|
||||
if (nounit) {
|
||||
i__4 = j * a_dim1 + 1;
|
||||
z__1.r = temp.r * a[i__4].r - temp.i * a[i__4].i, z__1.i = temp.r * a[i__4].i + temp.i * a[i__4].r;
|
||||
temp.r = z__1.r, temp.i = z__1.i;
|
||||
}
|
||||
/* Computing MIN */
|
||||
i__1 = *n, i__2 = j + *k;
|
||||
i__4 = min(i__1, i__2);
|
||||
for (i__ = j + 1; i__ <= i__4; ++i__) {
|
||||
i__1 = l + i__ + j * a_dim1;
|
||||
i__2 = i__;
|
||||
z__2.r = a[i__1].r * x[i__2].r - a[i__1].i * x[i__2].i,
|
||||
z__2.i = a[i__1].r * x[i__2].i + a[i__1].i * x[i__2].r;
|
||||
z__1.r = temp.r + z__2.r, z__1.i = temp.i + z__2.i;
|
||||
temp.r = z__1.r, temp.i = z__1.i;
|
||||
/* L150: */
|
||||
}
|
||||
} else {
|
||||
if (nounit) {
|
||||
d_cnjg(&z__2, &a[j * a_dim1 + 1]);
|
||||
z__1.r = temp.r * z__2.r - temp.i * z__2.i, z__1.i = temp.r * z__2.i + temp.i * z__2.r;
|
||||
temp.r = z__1.r, temp.i = z__1.i;
|
||||
}
|
||||
/* Computing MIN */
|
||||
i__1 = *n, i__2 = j + *k;
|
||||
i__4 = min(i__1, i__2);
|
||||
for (i__ = j + 1; i__ <= i__4; ++i__) {
|
||||
d_cnjg(&z__3, &a[l + i__ + j * a_dim1]);
|
||||
i__1 = i__;
|
||||
z__2.r = z__3.r * x[i__1].r - z__3.i * x[i__1].i, z__2.i = z__3.r * x[i__1].i + z__3.i * x[i__1].r;
|
||||
z__1.r = temp.r + z__2.r, z__1.i = temp.i + z__2.i;
|
||||
temp.r = z__1.r, temp.i = z__1.i;
|
||||
/* L160: */
|
||||
}
|
||||
}
|
||||
i__4 = j;
|
||||
x[i__4].r = temp.r, x[i__4].i = temp.i;
|
||||
/* L170: */
|
||||
}
|
||||
} else {
|
||||
jx = kx;
|
||||
i__3 = *n;
|
||||
for (j = 1; j <= i__3; ++j) {
|
||||
i__4 = jx;
|
||||
temp.r = x[i__4].r, temp.i = x[i__4].i;
|
||||
kx += *incx;
|
||||
ix = kx;
|
||||
l = 1 - j;
|
||||
if (noconj) {
|
||||
if (nounit) {
|
||||
i__4 = j * a_dim1 + 1;
|
||||
z__1.r = temp.r * a[i__4].r - temp.i * a[i__4].i, z__1.i = temp.r * a[i__4].i + temp.i * a[i__4].r;
|
||||
temp.r = z__1.r, temp.i = z__1.i;
|
||||
}
|
||||
/* Computing MIN */
|
||||
i__1 = *n, i__2 = j + *k;
|
||||
i__4 = min(i__1, i__2);
|
||||
for (i__ = j + 1; i__ <= i__4; ++i__) {
|
||||
i__1 = l + i__ + j * a_dim1;
|
||||
i__2 = ix;
|
||||
z__2.r = a[i__1].r * x[i__2].r - a[i__1].i * x[i__2].i,
|
||||
z__2.i = a[i__1].r * x[i__2].i + a[i__1].i * x[i__2].r;
|
||||
z__1.r = temp.r + z__2.r, z__1.i = temp.i + z__2.i;
|
||||
temp.r = z__1.r, temp.i = z__1.i;
|
||||
ix += *incx;
|
||||
/* L180: */
|
||||
}
|
||||
} else {
|
||||
if (nounit) {
|
||||
d_cnjg(&z__2, &a[j * a_dim1 + 1]);
|
||||
z__1.r = temp.r * z__2.r - temp.i * z__2.i, z__1.i = temp.r * z__2.i + temp.i * z__2.r;
|
||||
temp.r = z__1.r, temp.i = z__1.i;
|
||||
}
|
||||
/* Computing MIN */
|
||||
i__1 = *n, i__2 = j + *k;
|
||||
i__4 = min(i__1, i__2);
|
||||
for (i__ = j + 1; i__ <= i__4; ++i__) {
|
||||
d_cnjg(&z__3, &a[l + i__ + j * a_dim1]);
|
||||
i__1 = ix;
|
||||
z__2.r = z__3.r * x[i__1].r - z__3.i * x[i__1].i, z__2.i = z__3.r * x[i__1].i + z__3.i * x[i__1].r;
|
||||
z__1.r = temp.r + z__2.r, z__1.i = temp.i + z__2.i;
|
||||
temp.r = z__1.r, temp.i = z__1.i;
|
||||
ix += *incx;
|
||||
/* L190: */
|
||||
}
|
||||
}
|
||||
i__4 = jx;
|
||||
x[i__4].r = temp.r, x[i__4].i = temp.i;
|
||||
jx += *incx;
|
||||
/* L200: */
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* End of ZTBMV . */
|
||||
|
||||
} /* ztbmv_ */
|
||||
@@ -25,19 +25,15 @@ struct functor_traits<scalar_norm1_op> {
|
||||
// computes the sum of magnitudes of all vector elements or, for a complex vector x, the sum
|
||||
// res = |Rex1| + |Imx1| + |Rex2| + |Imx2| + ... + |Rexn| + |Imxn|, where x is a vector of order n
|
||||
extern "C" RealScalar EIGEN_CAT(REAL_SCALAR_SUFFIX, EIGEN_BLAS_FUNC_NAME(asum))(int *n, RealScalar *px, int *incx) {
|
||||
// std::cerr << "__asum " << *n << " " << *incx << "\n";
|
||||
Complex *x = reinterpret_cast<Complex *>(px);
|
||||
|
||||
if (*n <= 0) return 0;
|
||||
|
||||
// std::complex<T> is layout-compatible with T[2], so we can reinterpret
|
||||
// a complex vector of length n as a real vector of length 2*n and use
|
||||
// the fully vectorized cwiseAbs().sum() path.
|
||||
if (*incx == 1)
|
||||
return make_vector(px, 2 * *n).cwiseAbs().sum();
|
||||
else {
|
||||
// For non-unit stride, fall back to the scalar_norm1_op approach since
|
||||
// the real components are not contiguous across complex elements.
|
||||
Complex *x = reinterpret_cast<Complex *>(px);
|
||||
return make_vector(x, *n).unaryExpr<scalar_norm1_op>().sum();
|
||||
else
|
||||
return make_vector(x, *n, std::abs(*incx)).unaryExpr<scalar_norm1_op>().sum();
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" int EIGEN_CAT(i, EIGEN_BLAS_FUNC_NAME(amax))(int *n, RealScalar *px, int *incx) {
|
||||
|
||||
@@ -69,21 +69,15 @@ EIGEN_BLAS_FUNC(copy)(int *n, RealScalar *px, int *incx, RealScalar *py, int *in
|
||||
// be careful, *incx==0 is allowed !!
|
||||
if (*incx == 1 && *incy == 1)
|
||||
make_vector(y, *n) = make_vector(x, *n);
|
||||
else if (*incx == 0) {
|
||||
// Broadcast: copy x[0] to all elements of y.
|
||||
else {
|
||||
if (*incx < 0) x = x - (*n - 1) * (*incx);
|
||||
if (*incy < 0) y = y - (*n - 1) * (*incy);
|
||||
for (int i = 0; i < *n; ++i) {
|
||||
*y = *x;
|
||||
x += *incx;
|
||||
y += *incy;
|
||||
}
|
||||
} else if (*incx > 0 && *incy > 0)
|
||||
make_vector(y, *n, *incy) = make_vector(x, *n, *incx);
|
||||
else if (*incx > 0 && *incy < 0)
|
||||
make_vector(y, *n, -*incy).reverse() = make_vector(x, *n, *incx);
|
||||
else if (*incx < 0 && *incy > 0)
|
||||
make_vector(y, *n, *incy) = make_vector(x, *n, -*incx).reverse();
|
||||
else if (*incx < 0 && *incy < 0)
|
||||
make_vector(y, *n, -*incy) = make_vector(x, *n, -*incx);
|
||||
}
|
||||
}
|
||||
|
||||
EIGEN_BLAS_FUNC(rotg)(RealScalar *pa, RealScalar *pb, RealScalar *pc, RealScalar *ps) {
|
||||
|
||||
@@ -58,21 +58,23 @@ extern "C" Scalar EIGEN_BLAS_FUNC_NAME(dot)(int *n, Scalar *px, int *incx, Scala
|
||||
Scalar *y = reinterpret_cast<Scalar *>(py);
|
||||
|
||||
if (*incx == 1 && *incy == 1)
|
||||
return make_vector(x, *n).dot(make_vector(y, *n));
|
||||
return (make_vector(x, *n).cwiseProduct(make_vector(y, *n))).sum();
|
||||
else if (*incx > 0 && *incy > 0)
|
||||
return make_vector(x, *n, *incx).dot(make_vector(y, *n, *incy));
|
||||
return (make_vector(x, *n, *incx).cwiseProduct(make_vector(y, *n, *incy))).sum();
|
||||
else if (*incx < 0 && *incy > 0)
|
||||
return make_vector(x, *n, -*incx).reverse().dot(make_vector(y, *n, *incy));
|
||||
return (make_vector(x, *n, -*incx).reverse().cwiseProduct(make_vector(y, *n, *incy))).sum();
|
||||
else if (*incx > 0 && *incy < 0)
|
||||
return make_vector(x, *n, *incx).dot(make_vector(y, *n, -*incy).reverse());
|
||||
return (make_vector(x, *n, *incx).cwiseProduct(make_vector(y, *n, -*incy).reverse())).sum();
|
||||
else if (*incx < 0 && *incy < 0)
|
||||
return make_vector(x, *n, -*incx).reverse().dot(make_vector(y, *n, -*incy).reverse());
|
||||
return (make_vector(x, *n, -*incx).reverse().cwiseProduct(make_vector(y, *n, -*incy).reverse())).sum();
|
||||
else
|
||||
return 0;
|
||||
}
|
||||
|
||||
// computes the Euclidean norm of a vector.
|
||||
// FIXME
|
||||
extern "C" Scalar EIGEN_BLAS_FUNC_NAME(nrm2)(int *n, Scalar *px, int *incx) {
|
||||
// std::cerr << "_nrm2 " << *n << " " << *incx << "\n";
|
||||
if (*n <= 0) return 0;
|
||||
|
||||
Scalar *x = reinterpret_cast<Scalar *>(px);
|
||||
@@ -106,171 +108,23 @@ EIGEN_BLAS_FUNC(rot)(int *n, Scalar *px, int *incx, Scalar *py, int *incy, Scala
|
||||
Eigen::internal::apply_rotation_in_the_plane(vx, vy, Eigen::JacobiRotation<Scalar>(c, s));
|
||||
}
|
||||
|
||||
// Applies modified Givens rotation H to vectors x and y.
|
||||
// param[0] = flag:
|
||||
// -1: H = [[h11, h12], [h21, h22]] (all 4 elements from param)
|
||||
// 0: H = [[1, h12], [h21, 1]] (h12, h21 from param)
|
||||
// 1: H = [[h11, 1], [-1, h22]] (h11, h22 from param)
|
||||
// -2: H = identity (no-op)
|
||||
// param[1..4] = h11, h21, h12, h22
|
||||
EIGEN_BLAS_FUNC(rotm)(int *n, Scalar *px, int *incx, Scalar *py, int *incy, Scalar *param) {
|
||||
Scalar *x = reinterpret_cast<Scalar *>(px);
|
||||
Scalar *y = reinterpret_cast<Scalar *>(py);
|
||||
/*
|
||||
// performs rotation of points in the modified plane.
|
||||
EIGEN_BLAS_FUNC(rotm)(int *n, Scalar *px, int *incx, Scalar *py, int *incy, Scalar *param)
|
||||
{
|
||||
Scalar* x = reinterpret_cast<Scalar*>(px);
|
||||
Scalar* y = reinterpret_cast<Scalar*>(py);
|
||||
|
||||
Scalar flag = param[0];
|
||||
if (*n <= 0 || flag == Scalar(-2)) return;
|
||||
// TODO
|
||||
|
||||
Scalar h11, h12, h21, h22;
|
||||
if (flag < Scalar(0)) {
|
||||
h11 = param[1];
|
||||
h21 = param[2];
|
||||
h12 = param[3];
|
||||
h22 = param[4];
|
||||
} else if (flag == Scalar(0)) {
|
||||
h11 = Scalar(1);
|
||||
h21 = param[2];
|
||||
h12 = param[3];
|
||||
h22 = Scalar(1);
|
||||
} else {
|
||||
h11 = param[1];
|
||||
h21 = Scalar(-1);
|
||||
h12 = Scalar(1);
|
||||
h22 = param[4];
|
||||
}
|
||||
|
||||
int kx = *incx > 0 ? 0 : (1 - *n) * *incx;
|
||||
int ky = *incy > 0 ? 0 : (1 - *n) * *incy;
|
||||
|
||||
for (int i = 0; i < *n; ++i) {
|
||||
Scalar w = x[kx];
|
||||
Scalar z = y[ky];
|
||||
x[kx] = h11 * w + h12 * z;
|
||||
y[ky] = h21 * w + h22 * z;
|
||||
kx += *incx;
|
||||
ky += *incy;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Constructs the modified Givens transformation matrix H which zeros the second
|
||||
// component of (sqrt(d1)*x1, sqrt(d2)*y1)^T.
|
||||
EIGEN_BLAS_FUNC(rotmg)(Scalar *d1, Scalar *d2, Scalar *x1, Scalar *y1, Scalar *param) {
|
||||
using std::abs;
|
||||
// computes the modified parameters for a Givens rotation.
|
||||
EIGEN_BLAS_FUNC(rotmg)(Scalar *d1, Scalar *d2, Scalar *x1, Scalar *x2, Scalar *param)
|
||||
{
|
||||
// TODO
|
||||
|
||||
const Scalar gam = Scalar(4096);
|
||||
const Scalar gamsq = gam * gam;
|
||||
const Scalar rgamsq = Scalar(1) / gamsq;
|
||||
|
||||
Scalar flag, h11 = Scalar(0), h12 = Scalar(0), h21 = Scalar(0), h22 = Scalar(0);
|
||||
|
||||
if (*d1 < Scalar(0)) {
|
||||
// Negative d1: zero everything.
|
||||
flag = Scalar(-1);
|
||||
*d1 = *d2 = *x1 = Scalar(0);
|
||||
} else {
|
||||
Scalar p2 = *d2 * *y1;
|
||||
if (p2 == Scalar(0)) {
|
||||
// d2*y1 == 0: identity transform.
|
||||
param[0] = Scalar(-2);
|
||||
return;
|
||||
}
|
||||
|
||||
Scalar p1 = *d1 * *x1;
|
||||
Scalar q2 = p2 * *y1;
|
||||
Scalar q1 = p1 * *x1;
|
||||
bool do_scale = true;
|
||||
|
||||
if (abs(q1) > abs(q2)) {
|
||||
h21 = -(*y1) / *x1;
|
||||
h12 = p2 / p1;
|
||||
Scalar u = Scalar(1) - h12 * h21;
|
||||
if (u <= Scalar(0)) {
|
||||
flag = Scalar(-1);
|
||||
h11 = h12 = h21 = h22 = Scalar(0);
|
||||
*d1 = *d2 = *x1 = Scalar(0);
|
||||
do_scale = false;
|
||||
} else {
|
||||
flag = Scalar(0);
|
||||
*d1 /= u;
|
||||
*d2 /= u;
|
||||
*x1 *= u;
|
||||
}
|
||||
} else if (q2 < Scalar(0)) {
|
||||
flag = Scalar(-1);
|
||||
h11 = h12 = h21 = h22 = Scalar(0);
|
||||
*d1 = *d2 = *x1 = Scalar(0);
|
||||
do_scale = false;
|
||||
} else {
|
||||
flag = Scalar(1);
|
||||
h11 = p1 / p2;
|
||||
h22 = *x1 / *y1;
|
||||
Scalar u = Scalar(1) + h11 * h22;
|
||||
Scalar temp = *d2 / u;
|
||||
*d2 = *d1 / u;
|
||||
*d1 = temp;
|
||||
*x1 = *y1 * u;
|
||||
}
|
||||
|
||||
if (do_scale) {
|
||||
// Converts compact H representation (flag 0 or 1) to full form (flag -1)
|
||||
// so that scaling factors can be absorbed into all four elements.
|
||||
auto fix_h = [&]() {
|
||||
if (flag >= Scalar(0)) {
|
||||
if (flag == Scalar(0)) {
|
||||
h11 = Scalar(1);
|
||||
h22 = Scalar(1);
|
||||
} else {
|
||||
h21 = Scalar(-1);
|
||||
h12 = Scalar(1);
|
||||
}
|
||||
flag = Scalar(-1);
|
||||
}
|
||||
};
|
||||
|
||||
// Scale d1 up if too small.
|
||||
while (*d1 <= rgamsq && *d1 != Scalar(0)) {
|
||||
fix_h();
|
||||
*d1 *= gamsq;
|
||||
*x1 /= gam;
|
||||
h11 /= gam;
|
||||
h12 /= gam;
|
||||
}
|
||||
// Scale d1 down if too large.
|
||||
while (*d1 >= gamsq) {
|
||||
fix_h();
|
||||
*d1 /= gamsq;
|
||||
*x1 *= gam;
|
||||
h11 *= gam;
|
||||
h12 *= gam;
|
||||
}
|
||||
// Scale |d2| up if too small.
|
||||
while (abs(*d2) <= rgamsq && *d2 != Scalar(0)) {
|
||||
fix_h();
|
||||
*d2 *= gamsq;
|
||||
h21 /= gam;
|
||||
h22 /= gam;
|
||||
}
|
||||
// Scale |d2| down if too large.
|
||||
while (abs(*d2) >= gamsq) {
|
||||
fix_h();
|
||||
*d2 /= gamsq;
|
||||
h21 *= gam;
|
||||
h22 *= gam;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Store result in param array.
|
||||
if (flag < Scalar(0)) {
|
||||
param[1] = h11;
|
||||
param[2] = h21;
|
||||
param[3] = h12;
|
||||
param[4] = h22;
|
||||
} else if (flag == Scalar(0)) {
|
||||
param[2] = h21;
|
||||
param[3] = h12;
|
||||
} else {
|
||||
param[1] = h11;
|
||||
param[4] = h22;
|
||||
}
|
||||
param[0] = flag;
|
||||
return 0;
|
||||
}
|
||||
*/
|
||||
|
||||
@@ -72,193 +72,31 @@ EIGEN_BLAS_FUNC(hemv)
|
||||
if (actual_y != y) delete[] copy_back(actual_y, y, *n, *incy);
|
||||
}
|
||||
|
||||
/** HBMV performs the matrix-vector operation
|
||||
/** ZHBMV performs the matrix-vector operation
|
||||
*
|
||||
* y := alpha*A*x + beta*y,
|
||||
*
|
||||
* where alpha and beta are scalars, x and y are n element vectors and
|
||||
* A is an n by n hermitian band matrix, with k super-diagonals.
|
||||
* Diagonal elements are real; off-diagonal contributions use conjugation.
|
||||
*/
|
||||
EIGEN_BLAS_FUNC(hbmv)
|
||||
(char *uplo, int *n, int *k, RealScalar *palpha, RealScalar *pa, int *lda, RealScalar *px, int *incx, RealScalar *pbeta,
|
||||
RealScalar *py, int *incy) {
|
||||
const Scalar alpha = *reinterpret_cast<const Scalar *>(palpha);
|
||||
const Scalar beta = *reinterpret_cast<const Scalar *>(pbeta);
|
||||
const Scalar *a = reinterpret_cast<const Scalar *>(pa);
|
||||
const Scalar *x = reinterpret_cast<const Scalar *>(px);
|
||||
Scalar *y = reinterpret_cast<Scalar *>(py);
|
||||
// EIGEN_BLAS_FUNC(hbmv)(char *uplo, int *n, int *k, RealScalar *alpha, RealScalar *a, int *lda,
|
||||
// RealScalar *x, int *incx, RealScalar *beta, RealScalar *y, int *incy)
|
||||
// {
|
||||
// return 1;
|
||||
// }
|
||||
|
||||
int info = 0;
|
||||
if (UPLO(*uplo) == INVALID)
|
||||
info = 1;
|
||||
else if (*n < 0)
|
||||
info = 2;
|
||||
else if (*k < 0)
|
||||
info = 3;
|
||||
else if (*lda < *k + 1)
|
||||
info = 6;
|
||||
else if (*incx == 0)
|
||||
info = 8;
|
||||
else if (*incy == 0)
|
||||
info = 11;
|
||||
if (info) return xerbla_(SCALAR_SUFFIX_UP "HBMV ", &info);
|
||||
|
||||
if (*n == 0 || (alpha == Scalar(0) && beta == Scalar(1))) return;
|
||||
|
||||
const Scalar *actual_x = get_compact_vector(x, *n, *incx);
|
||||
Scalar *actual_y = get_compact_vector(y, *n, *incy);
|
||||
|
||||
// First form y := beta*y.
|
||||
if (beta != Scalar(1)) {
|
||||
if (beta == Scalar(0))
|
||||
make_vector(actual_y, *n).setZero();
|
||||
else
|
||||
make_vector(actual_y, *n) *= beta;
|
||||
}
|
||||
|
||||
if (alpha == Scalar(0)) {
|
||||
if (actual_x != x) delete[] actual_x;
|
||||
if (actual_y != y) delete[] copy_back(actual_y, y, *n, *incy);
|
||||
return;
|
||||
}
|
||||
|
||||
if (*k >= 8) {
|
||||
// Vectorized path: use Eigen Map segments for the inner band operations.
|
||||
ConstMatrixType band(a, *k + 1, *n, *lda);
|
||||
if (UPLO(*uplo) == UP) {
|
||||
for (int j = 0; j < *n; ++j) {
|
||||
int start = std::max(0, j - *k);
|
||||
int len = j - start;
|
||||
int offset = *k - (j - start);
|
||||
Scalar temp1 = alpha * actual_x[j];
|
||||
actual_y[j] += Scalar(Eigen::numext::real(band(*k, j))) * temp1;
|
||||
if (len > 0) {
|
||||
make_vector(actual_y + start, len) += temp1 * band.col(j).segment(offset, len);
|
||||
actual_y[j] += alpha * band.col(j).segment(offset, len).dot(make_vector(actual_x + start, len));
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (int j = 0; j < *n; ++j) {
|
||||
int len = std::min(*n - 1, j + *k) - j;
|
||||
Scalar temp1 = alpha * actual_x[j];
|
||||
actual_y[j] += Scalar(Eigen::numext::real(band(0, j))) * temp1;
|
||||
if (len > 0) {
|
||||
make_vector(actual_y + j + 1, len) += temp1 * band.col(j).segment(1, len);
|
||||
actual_y[j] += alpha * band.col(j).segment(1, len).dot(make_vector(actual_x + j + 1, len));
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Scalar path: for narrow bandwidth, avoid Map overhead.
|
||||
if (UPLO(*uplo) == UP) {
|
||||
for (int j = 0; j < *n; ++j) {
|
||||
Scalar temp1 = alpha * actual_x[j];
|
||||
Scalar temp2 = Scalar(0);
|
||||
for (int i = std::max(0, j - *k); i < j; ++i) {
|
||||
Scalar aij = a[(*k + i - j) + j * *lda];
|
||||
actual_y[i] += temp1 * aij;
|
||||
temp2 += Eigen::numext::conj(aij) * actual_x[i];
|
||||
}
|
||||
actual_y[j] += Scalar(Eigen::numext::real(a[*k + j * *lda])) * temp1 + alpha * temp2;
|
||||
}
|
||||
} else {
|
||||
for (int j = 0; j < *n; ++j) {
|
||||
Scalar temp1 = alpha * actual_x[j];
|
||||
Scalar temp2 = Scalar(0);
|
||||
actual_y[j] += Scalar(Eigen::numext::real(a[j * *lda])) * temp1;
|
||||
for (int i = j + 1; i <= std::min(*n - 1, j + *k); ++i) {
|
||||
Scalar aij = a[(i - j) + j * *lda];
|
||||
actual_y[i] += temp1 * aij;
|
||||
temp2 += Eigen::numext::conj(aij) * actual_x[i];
|
||||
}
|
||||
actual_y[j] += alpha * temp2;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (actual_x != x) delete[] actual_x;
|
||||
if (actual_y != y) delete[] copy_back(actual_y, y, *n, *incy);
|
||||
}
|
||||
|
||||
/** HPMV performs the matrix-vector operation
|
||||
/** ZHPMV performs the matrix-vector operation
|
||||
*
|
||||
* y := alpha*A*x + beta*y,
|
||||
*
|
||||
* where alpha and beta are scalars, x and y are n element vectors and
|
||||
* A is an n by n hermitian matrix, supplied in packed form.
|
||||
* Diagonal elements are real; off-diagonal contributions use conjugation.
|
||||
*/
|
||||
EIGEN_BLAS_FUNC(hpmv)
|
||||
(char *uplo, int *n, RealScalar *palpha, RealScalar *pap, RealScalar *px, int *incx, RealScalar *pbeta, RealScalar *py,
|
||||
int *incy) {
|
||||
const Scalar alpha = *reinterpret_cast<const Scalar *>(palpha);
|
||||
const Scalar beta = *reinterpret_cast<const Scalar *>(pbeta);
|
||||
const Scalar *ap = reinterpret_cast<const Scalar *>(pap);
|
||||
const Scalar *x = reinterpret_cast<const Scalar *>(px);
|
||||
Scalar *y = reinterpret_cast<Scalar *>(py);
|
||||
|
||||
int info = 0;
|
||||
if (UPLO(*uplo) == INVALID)
|
||||
info = 1;
|
||||
else if (*n < 0)
|
||||
info = 2;
|
||||
else if (*incx == 0)
|
||||
info = 6;
|
||||
else if (*incy == 0)
|
||||
info = 9;
|
||||
if (info) return xerbla_(SCALAR_SUFFIX_UP "HPMV ", &info);
|
||||
|
||||
if (*n == 0 || (alpha == Scalar(0) && beta == Scalar(1))) return;
|
||||
|
||||
const Scalar *actual_x = get_compact_vector(x, *n, *incx);
|
||||
Scalar *actual_y = get_compact_vector(y, *n, *incy);
|
||||
|
||||
// First form y := beta*y.
|
||||
if (beta != Scalar(1)) {
|
||||
if (beta == Scalar(0))
|
||||
make_vector(actual_y, *n).setZero();
|
||||
else
|
||||
make_vector(actual_y, *n) *= beta;
|
||||
}
|
||||
|
||||
if (alpha == Scalar(0)) {
|
||||
if (actual_x != x) delete[] actual_x;
|
||||
if (actual_y != y) delete[] copy_back(actual_y, y, *n, *incy);
|
||||
return;
|
||||
}
|
||||
|
||||
int kk = 0;
|
||||
if (UPLO(*uplo) == UP) {
|
||||
// Upper triangle packed: column j occupies ap[kk..kk+j].
|
||||
for (int j = 0; j < *n; ++j) {
|
||||
Scalar temp1 = alpha * actual_x[j];
|
||||
// Diagonal is real.
|
||||
actual_y[j] += Scalar(Eigen::numext::real(ap[kk + j])) * temp1;
|
||||
if (j > 0) {
|
||||
make_vector(actual_y, j) += temp1 * make_vector(ap + kk, j);
|
||||
actual_y[j] += alpha * make_vector(ap + kk, j).dot(make_vector(actual_x, j));
|
||||
}
|
||||
kk += j + 1;
|
||||
}
|
||||
} else {
|
||||
// Lower triangle packed: column j occupies ap[kk..kk+(n-j-1)].
|
||||
for (int j = 0; j < *n; ++j) {
|
||||
int len = *n - j - 1;
|
||||
Scalar temp1 = alpha * actual_x[j];
|
||||
// Diagonal is real.
|
||||
actual_y[j] += Scalar(Eigen::numext::real(ap[kk])) * temp1;
|
||||
if (len > 0) {
|
||||
make_vector(actual_y + j + 1, len) += temp1 * make_vector(ap + kk + 1, len);
|
||||
actual_y[j] += alpha * make_vector(ap + kk + 1, len).dot(make_vector(actual_x + j + 1, len));
|
||||
}
|
||||
kk += *n - j;
|
||||
}
|
||||
}
|
||||
|
||||
if (actual_x != x) delete[] actual_x;
|
||||
if (actual_y != y) delete[] copy_back(actual_y, y, *n, *incy);
|
||||
}
|
||||
// EIGEN_BLAS_FUNC(hpmv)(char *uplo, int *n, RealScalar *alpha, RealScalar *ap, RealScalar *x, int *incx, RealScalar
|
||||
// *beta, RealScalar *y, int *incy)
|
||||
// {
|
||||
// return 1;
|
||||
// }
|
||||
|
||||
/** ZHPR performs the hermitian rank 1 operation
|
||||
*
|
||||
|
||||
@@ -303,158 +303,61 @@ EIGEN_BLAS_FUNC(gbmv)
|
||||
if (actual_y != y) delete[] copy_back(actual_y, y, actual_m, *incy);
|
||||
}
|
||||
|
||||
#if 0
|
||||
/** TBMV performs one of the matrix-vector operations
|
||||
*
|
||||
* x := A*x, or x := A'*x, or x := conjg(A')*x,
|
||||
*
|
||||
* where x is an n element vector and A is an n by n unit, or non-unit,
|
||||
* upper or lower triangular band matrix, with ( k + 1 ) diagonals.
|
||||
*
|
||||
* Band storage: upper triangle stores A[i,j] at a[(k+i-j) + j*lda],
|
||||
* lower triangle stores A[i,j] at a[(i-j) + j*lda].
|
||||
*/
|
||||
EIGEN_BLAS_FUNC(tbmv)
|
||||
(char *uplo, char *opa, char *diag, int *n, int *k, RealScalar *pa, int *lda, RealScalar *px, int *incx) {
|
||||
Scalar *a = reinterpret_cast<Scalar *>(pa);
|
||||
Scalar *x = reinterpret_cast<Scalar *>(px);
|
||||
*
|
||||
* x := A*x, or x := A'*x,
|
||||
*
|
||||
* where x is an n element vector and A is an n by n unit, or non-unit,
|
||||
* upper or lower triangular band matrix, with ( k + 1 ) diagonals.
|
||||
*/
|
||||
EIGEN_BLAS_FUNC(tbmv)(char *uplo, char *opa, char *diag, int *n, int *k, RealScalar *pa, int *lda, RealScalar *px, int *incx)
|
||||
{
|
||||
Scalar* a = reinterpret_cast<Scalar*>(pa);
|
||||
Scalar* x = reinterpret_cast<Scalar*>(px);
|
||||
int coeff_rows = *k + 1;
|
||||
|
||||
int info = 0;
|
||||
if (UPLO(*uplo) == INVALID)
|
||||
info = 1;
|
||||
else if (OP(*opa) == INVALID)
|
||||
info = 2;
|
||||
else if (DIAG(*diag) == INVALID)
|
||||
info = 3;
|
||||
else if (*n < 0)
|
||||
info = 4;
|
||||
else if (*k < 0)
|
||||
info = 5;
|
||||
else if (*lda < *k + 1)
|
||||
info = 7;
|
||||
else if (*incx == 0)
|
||||
info = 9;
|
||||
if (info) return xerbla_(SCALAR_SUFFIX_UP "TBMV ", &info);
|
||||
if(UPLO(*uplo)==INVALID) info = 1;
|
||||
else if(OP(*opa)==INVALID) info = 2;
|
||||
else if(DIAG(*diag)==INVALID) info = 3;
|
||||
else if(*n<0) info = 4;
|
||||
else if(*k<0) info = 5;
|
||||
else if(*lda<coeff_rows) info = 7;
|
||||
else if(*incx==0) info = 9;
|
||||
if(info)
|
||||
return xerbla_(SCALAR_SUFFIX_UP"TBMV ",&info,6);
|
||||
|
||||
if (*n == 0) return;
|
||||
if(*n==0) return;
|
||||
|
||||
Scalar *actual_x = get_compact_vector(x, *n, *incx);
|
||||
int actual_n = *n;
|
||||
|
||||
bool upper = (UPLO(*uplo) == UP);
|
||||
int op = OP(*opa);
|
||||
bool unit = (DIAG(*diag) == UNIT);
|
||||
Scalar* actual_x = get_compact_vector(x,actual_n,*incx);
|
||||
|
||||
if (*k >= 8) {
|
||||
// Vectorized path: use Eigen Map segments for the inner band operations.
|
||||
ConstMatrixType band(a, *k + 1, *n, *lda);
|
||||
if (op == NOTR) {
|
||||
if (upper) {
|
||||
for (int j = 0; j < *n; ++j) {
|
||||
if (actual_x[j] != Scalar(0)) {
|
||||
int start = std::max(0, j - *k);
|
||||
int len = j - start;
|
||||
int offset = *k - (j - start);
|
||||
Scalar temp = actual_x[j];
|
||||
if (len > 0) make_vector(actual_x + start, len) += temp * band.col(j).segment(offset, len);
|
||||
if (!unit) actual_x[j] = temp * band(*k, j);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (int j = *n - 1; j >= 0; --j) {
|
||||
if (actual_x[j] != Scalar(0)) {
|
||||
int len = std::min(*n - 1, j + *k) - j;
|
||||
Scalar temp = actual_x[j];
|
||||
if (len > 0) make_vector(actual_x + j + 1, len) += temp * band.col(j).segment(1, len);
|
||||
if (!unit) actual_x[j] = temp * band(0, j);
|
||||
}
|
||||
}
|
||||
}
|
||||
} else if (op == TR) {
|
||||
if (upper) {
|
||||
for (int j = *n - 1; j >= 0; --j) {
|
||||
int start = std::max(0, j - *k);
|
||||
int len = j - start;
|
||||
int offset = *k - (j - start);
|
||||
Scalar temp = actual_x[j];
|
||||
if (!unit) temp *= band(*k, j);
|
||||
if (len > 0)
|
||||
temp += (band.col(j).segment(offset, len).cwiseProduct(make_vector(actual_x + start, len))).sum();
|
||||
actual_x[j] = temp;
|
||||
}
|
||||
} else {
|
||||
for (int j = 0; j < *n; ++j) {
|
||||
int len = std::min(*n - 1, j + *k) - j;
|
||||
Scalar temp = actual_x[j];
|
||||
if (!unit) temp *= band(0, j);
|
||||
if (len > 0) temp += (band.col(j).segment(1, len).cwiseProduct(make_vector(actual_x + j + 1, len))).sum();
|
||||
actual_x[j] = temp;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Conjugate transpose: .dot() computes conj(lhs) . rhs.
|
||||
if (upper) {
|
||||
for (int j = *n - 1; j >= 0; --j) {
|
||||
int start = std::max(0, j - *k);
|
||||
int len = j - start;
|
||||
int offset = *k - (j - start);
|
||||
Scalar temp = actual_x[j];
|
||||
if (!unit) temp *= Eigen::numext::conj(band(*k, j));
|
||||
if (len > 0) temp += band.col(j).segment(offset, len).dot(make_vector(actual_x + start, len));
|
||||
actual_x[j] = temp;
|
||||
}
|
||||
} else {
|
||||
for (int j = 0; j < *n; ++j) {
|
||||
int len = std::min(*n - 1, j + *k) - j;
|
||||
Scalar temp = actual_x[j];
|
||||
if (!unit) temp *= Eigen::numext::conj(band(0, j));
|
||||
if (len > 0) temp += band.col(j).segment(1, len).dot(make_vector(actual_x + j + 1, len));
|
||||
actual_x[j] = temp;
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Scalar path: for narrow bandwidth, avoid Map overhead.
|
||||
if (op == NOTR) {
|
||||
if (upper) {
|
||||
for (int j = 0; j < *n; ++j) {
|
||||
if (actual_x[j] != Scalar(0)) {
|
||||
Scalar temp = actual_x[j];
|
||||
for (int i = std::max(0, j - *k); i < j; ++i) actual_x[i] += temp * a[(*k + i - j) + j * *lda];
|
||||
if (!unit) actual_x[j] = temp * a[*k + j * *lda];
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (int j = *n - 1; j >= 0; --j) {
|
||||
if (actual_x[j] != Scalar(0)) {
|
||||
Scalar temp = actual_x[j];
|
||||
for (int i = j + 1; i <= std::min(*n - 1, j + *k); ++i) actual_x[i] += temp * a[(i - j) + j * *lda];
|
||||
if (!unit) actual_x[j] = temp * a[j * *lda];
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Transpose or conjugate transpose.
|
||||
auto maybe_conj = [op](Scalar val) -> Scalar { return op == ADJ ? Eigen::numext::conj(val) : val; };
|
||||
if (upper) {
|
||||
for (int j = *n - 1; j >= 0; --j) {
|
||||
Scalar temp = actual_x[j];
|
||||
if (!unit) temp *= maybe_conj(a[*k + j * *lda]);
|
||||
for (int i = std::max(0, j - *k); i < j; ++i) temp += maybe_conj(a[(*k + i - j) + j * *lda]) * actual_x[i];
|
||||
actual_x[j] = temp;
|
||||
}
|
||||
} else {
|
||||
for (int j = 0; j < *n; ++j) {
|
||||
Scalar temp = actual_x[j];
|
||||
if (!unit) temp *= maybe_conj(a[j * *lda]);
|
||||
for (int i = j + 1; i <= std::min(*n - 1, j + *k); ++i)
|
||||
temp += maybe_conj(a[(i - j) + j * *lda]) * actual_x[i];
|
||||
actual_x[j] = temp;
|
||||
}
|
||||
}
|
||||
}
|
||||
MatrixType mat_coeffs(a,coeff_rows,*n,*lda);
|
||||
|
||||
int ku = UPLO(*uplo)==UPPER ? *k : 0;
|
||||
int kl = UPLO(*uplo)==LOWER ? *k : 0;
|
||||
|
||||
for(int j=0; j<*n; ++j)
|
||||
{
|
||||
int start = std::max(0,j - ku);
|
||||
int end = std::min((*m)-1,j + kl);
|
||||
int len = end - start + 1;
|
||||
int offset = (ku) - j + start;
|
||||
|
||||
if(OP(*trans)==NOTR)
|
||||
make_vector(actual_y+start,len) += (alpha*actual_x[j]) * mat_coeffs.col(j).segment(offset,len);
|
||||
else if(OP(*trans)==TR)
|
||||
actual_y[j] += alpha * ( mat_coeffs.col(j).segment(offset,len).transpose() * make_vector(actual_x+start,len) ).value();
|
||||
else
|
||||
actual_y[j] += alpha * ( mat_coeffs.col(j).segment(offset,len).adjoint() * make_vector(actual_x+start,len) ).value();
|
||||
}
|
||||
|
||||
if (actual_x != x) delete[] copy_back(actual_x, x, *n, *incx);
|
||||
if(actual_x!=x) delete[] actual_x;
|
||||
if(actual_y!=y) delete[] copy_back(actual_y,y,actual_m,*incy);
|
||||
}
|
||||
#endif
|
||||
|
||||
/** DTBSV solves one of the systems of equations
|
||||
*
|
||||
|
||||
@@ -158,196 +158,32 @@ EIGEN_BLAS_FUNC(syr2)
|
||||
// func[code](*n, a, *inca, b, *incb, c, *ldc, alpha);
|
||||
}
|
||||
|
||||
/** SBMV performs the matrix-vector operation
|
||||
/** DSBMV performs the matrix-vector operation
|
||||
*
|
||||
* y := alpha*A*x + beta*y,
|
||||
*
|
||||
* where alpha and beta are scalars, x and y are n element vectors and
|
||||
* A is an n by n symmetric band matrix, with k super-diagonals.
|
||||
*
|
||||
* Band storage: upper triangle stores A[i,j] at a[(k+i-j) + j*lda],
|
||||
* lower triangle stores A[i,j] at a[(i-j) + j*lda].
|
||||
*/
|
||||
EIGEN_BLAS_FUNC(sbmv)
|
||||
(char *uplo, int *n, int *k, RealScalar *palpha, RealScalar *pa, int *lda, RealScalar *px, int *incx, RealScalar *pbeta,
|
||||
RealScalar *py, int *incy) {
|
||||
const Scalar alpha = *reinterpret_cast<const Scalar *>(palpha);
|
||||
const Scalar beta = *reinterpret_cast<const Scalar *>(pbeta);
|
||||
const Scalar *a = reinterpret_cast<const Scalar *>(pa);
|
||||
const Scalar *x = reinterpret_cast<const Scalar *>(px);
|
||||
Scalar *y = reinterpret_cast<Scalar *>(py);
|
||||
// EIGEN_BLAS_FUNC(sbmv)( char *uplo, int *n, int *k, RealScalar *alpha, RealScalar *a, int *lda,
|
||||
// RealScalar *x, int *incx, RealScalar *beta, RealScalar *y, int *incy)
|
||||
// {
|
||||
// return 1;
|
||||
// }
|
||||
|
||||
int info = 0;
|
||||
if (UPLO(*uplo) == INVALID)
|
||||
info = 1;
|
||||
else if (*n < 0)
|
||||
info = 2;
|
||||
else if (*k < 0)
|
||||
info = 3;
|
||||
else if (*lda < *k + 1)
|
||||
info = 6;
|
||||
else if (*incx == 0)
|
||||
info = 8;
|
||||
else if (*incy == 0)
|
||||
info = 11;
|
||||
if (info) return xerbla_(SCALAR_SUFFIX_UP "SBMV ", &info);
|
||||
|
||||
if (*n == 0 || (alpha == Scalar(0) && beta == Scalar(1))) return;
|
||||
|
||||
const Scalar *actual_x = get_compact_vector(x, *n, *incx);
|
||||
Scalar *actual_y = get_compact_vector(y, *n, *incy);
|
||||
|
||||
// First form y := beta*y.
|
||||
if (beta != Scalar(1)) {
|
||||
if (beta == Scalar(0))
|
||||
make_vector(actual_y, *n).setZero();
|
||||
else
|
||||
make_vector(actual_y, *n) *= beta;
|
||||
}
|
||||
|
||||
if (alpha == Scalar(0)) {
|
||||
if (actual_x != x) delete[] actual_x;
|
||||
if (actual_y != y) delete[] copy_back(actual_y, y, *n, *incy);
|
||||
return;
|
||||
}
|
||||
|
||||
if (*k >= 8) {
|
||||
// Vectorized path: use Eigen Map segments for the inner band operations.
|
||||
ConstMatrixType band(a, *k + 1, *n, *lda);
|
||||
if (UPLO(*uplo) == UP) {
|
||||
for (int j = 0; j < *n; ++j) {
|
||||
int start = std::max(0, j - *k);
|
||||
int len = j - start;
|
||||
int offset = *k - (j - start);
|
||||
Scalar temp1 = alpha * actual_x[j];
|
||||
actual_y[j] += temp1 * band(*k, j);
|
||||
if (len > 0) {
|
||||
make_vector(actual_y + start, len) += temp1 * band.col(j).segment(offset, len);
|
||||
actual_y[j] += alpha * band.col(j).segment(offset, len).dot(make_vector(actual_x + start, len));
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (int j = 0; j < *n; ++j) {
|
||||
int len = std::min(*n - 1, j + *k) - j;
|
||||
Scalar temp1 = alpha * actual_x[j];
|
||||
actual_y[j] += temp1 * band(0, j);
|
||||
if (len > 0) {
|
||||
make_vector(actual_y + j + 1, len) += temp1 * band.col(j).segment(1, len);
|
||||
actual_y[j] += alpha * band.col(j).segment(1, len).dot(make_vector(actual_x + j + 1, len));
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Scalar path: for narrow bandwidth, avoid Map overhead.
|
||||
if (UPLO(*uplo) == UP) {
|
||||
for (int j = 0; j < *n; ++j) {
|
||||
Scalar temp1 = alpha * actual_x[j];
|
||||
Scalar temp2 = Scalar(0);
|
||||
for (int i = std::max(0, j - *k); i < j; ++i) {
|
||||
Scalar aij = a[(*k + i - j) + j * *lda];
|
||||
actual_y[i] += temp1 * aij;
|
||||
temp2 += aij * actual_x[i];
|
||||
}
|
||||
actual_y[j] += temp1 * a[*k + j * *lda] + alpha * temp2;
|
||||
}
|
||||
} else {
|
||||
for (int j = 0; j < *n; ++j) {
|
||||
Scalar temp1 = alpha * actual_x[j];
|
||||
Scalar temp2 = Scalar(0);
|
||||
actual_y[j] += temp1 * a[j * *lda];
|
||||
for (int i = j + 1; i <= std::min(*n - 1, j + *k); ++i) {
|
||||
Scalar aij = a[(i - j) + j * *lda];
|
||||
actual_y[i] += temp1 * aij;
|
||||
temp2 += aij * actual_x[i];
|
||||
}
|
||||
actual_y[j] += alpha * temp2;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (actual_x != x) delete[] actual_x;
|
||||
if (actual_y != y) delete[] copy_back(actual_y, y, *n, *incy);
|
||||
}
|
||||
|
||||
/** SPMV performs the matrix-vector operation
|
||||
/** DSPMV performs the matrix-vector operation
|
||||
*
|
||||
* y := alpha*A*x + beta*y,
|
||||
*
|
||||
* where alpha and beta are scalars, x and y are n element vectors and
|
||||
* A is an n by n symmetric matrix, supplied in packed form.
|
||||
*
|
||||
* Packed storage: upper triangle stores columns sequentially so that
|
||||
* column j occupies positions kk..kk+j (where kk = j*(j+1)/2),
|
||||
* lower triangle stores column j at positions kk..kk+(n-j-1).
|
||||
*/
|
||||
EIGEN_BLAS_FUNC(spmv)
|
||||
(char *uplo, int *n, RealScalar *palpha, RealScalar *pap, RealScalar *px, int *incx, RealScalar *pbeta, RealScalar *py,
|
||||
int *incy) {
|
||||
const Scalar alpha = *reinterpret_cast<const Scalar *>(palpha);
|
||||
const Scalar beta = *reinterpret_cast<const Scalar *>(pbeta);
|
||||
const Scalar *ap = reinterpret_cast<const Scalar *>(pap);
|
||||
const Scalar *x = reinterpret_cast<const Scalar *>(px);
|
||||
Scalar *y = reinterpret_cast<Scalar *>(py);
|
||||
|
||||
int info = 0;
|
||||
if (UPLO(*uplo) == INVALID)
|
||||
info = 1;
|
||||
else if (*n < 0)
|
||||
info = 2;
|
||||
else if (*incx == 0)
|
||||
info = 6;
|
||||
else if (*incy == 0)
|
||||
info = 9;
|
||||
if (info) return xerbla_(SCALAR_SUFFIX_UP "SPMV ", &info);
|
||||
|
||||
if (*n == 0 || (alpha == Scalar(0) && beta == Scalar(1))) return;
|
||||
|
||||
const Scalar *actual_x = get_compact_vector(x, *n, *incx);
|
||||
Scalar *actual_y = get_compact_vector(y, *n, *incy);
|
||||
|
||||
// First form y := beta*y.
|
||||
if (beta != Scalar(1)) {
|
||||
if (beta == Scalar(0))
|
||||
make_vector(actual_y, *n).setZero();
|
||||
else
|
||||
make_vector(actual_y, *n) *= beta;
|
||||
}
|
||||
|
||||
if (alpha == Scalar(0)) {
|
||||
if (actual_x != x) delete[] actual_x;
|
||||
if (actual_y != y) delete[] copy_back(actual_y, y, *n, *incy);
|
||||
return;
|
||||
}
|
||||
|
||||
int kk = 0;
|
||||
if (UPLO(*uplo) == UP) {
|
||||
// Upper triangle packed: column j occupies ap[kk..kk+j].
|
||||
for (int j = 0; j < *n; ++j) {
|
||||
Scalar temp1 = alpha * actual_x[j];
|
||||
actual_y[j] += temp1 * ap[kk + j];
|
||||
if (j > 0) {
|
||||
make_vector(actual_y, j) += temp1 * make_vector(ap + kk, j);
|
||||
actual_y[j] += alpha * make_vector(ap + kk, j).dot(make_vector(actual_x, j));
|
||||
}
|
||||
kk += j + 1;
|
||||
}
|
||||
} else {
|
||||
// Lower triangle packed: column j occupies ap[kk..kk+(n-j-1)].
|
||||
for (int j = 0; j < *n; ++j) {
|
||||
int len = *n - j - 1;
|
||||
Scalar temp1 = alpha * actual_x[j];
|
||||
actual_y[j] += temp1 * ap[kk];
|
||||
if (len > 0) {
|
||||
make_vector(actual_y + j + 1, len) += temp1 * make_vector(ap + kk + 1, len);
|
||||
actual_y[j] += alpha * make_vector(ap + kk + 1, len).dot(make_vector(actual_x + j + 1, len));
|
||||
}
|
||||
kk += *n - j;
|
||||
}
|
||||
}
|
||||
|
||||
if (actual_x != x) delete[] actual_x;
|
||||
if (actual_y != y) delete[] copy_back(actual_y, y, *n, *incy);
|
||||
}
|
||||
// EIGEN_BLAS_FUNC(spmv)(char *uplo, int *n, RealScalar *alpha, RealScalar *ap, RealScalar *x, int *incx, RealScalar
|
||||
// *beta, RealScalar *y, int *incy)
|
||||
// {
|
||||
// return 1;
|
||||
// }
|
||||
|
||||
/** DSPR performs the symmetric rank 1 operation
|
||||
*
|
||||
|
||||
@@ -1,15 +0,0 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#include <cctype>
|
||||
|
||||
#include "blas.h"
|
||||
|
||||
// LSAME returns true if ca and cb are the same letter, regardless of case.
|
||||
extern "C" EIGEN_BLAS_API int lsame_(const char *ca, const char *cb) {
|
||||
return std::toupper(static_cast<unsigned char>(*ca)) == std::toupper(static_cast<unsigned char>(*cb));
|
||||
}
|
||||
@@ -386,6 +386,6 @@ build:linux:cross:x86-64:clang-14:sanitizer:smoketest:
|
||||
rules:
|
||||
- if: $CI_PIPELINE_SOURCE == "merge_request_event"
|
||||
tags:
|
||||
- saas-linux-large-amd64
|
||||
- saas-linux-medium-amd64
|
||||
allow_failure: true
|
||||
timeout: 30m
|
||||
|
||||
@@ -488,6 +488,7 @@ test:linux:x86-64:clang-14:sanitizer:smoketest:
|
||||
variables:
|
||||
EIGEN_CI_INSTALL: clang-14 llvm-14 libclang-rt-14-dev
|
||||
EIGEN_CI_CTEST_LABEL: smoketest
|
||||
EIGEN_CI_CTEST_PARALLEL: "2"
|
||||
EIGEN_CI_CTEST_ARGS: --timeout 120
|
||||
ASAN_OPTIONS: "detect_leaks=0:halt_on_error=1:abort_on_error=1:allocator_may_return_null=1:print_stacktrace=1:detect_stack_use_after_return=0"
|
||||
ASAN_SYMBOLIZER_PATH: "/usr/lib/llvm-14/bin/llvm-symbolizer"
|
||||
@@ -495,6 +496,6 @@ test:linux:x86-64:clang-14:sanitizer:smoketest:
|
||||
rules:
|
||||
- if: $CI_PIPELINE_SOURCE == "merge_request_event"
|
||||
tags:
|
||||
- saas-linux-large-amd64
|
||||
- saas-linux-medium-amd64
|
||||
allow_failure: true
|
||||
timeout: 30m
|
||||
|
||||
@@ -30,11 +30,10 @@ Timings are in \b milliseconds, and factors are relative to the LLT decompositio
|
||||
<a name="note_ls">\b *: </a> This decomposition do not support direct least-square solving for over-constrained problems, and the reported timing include the cost to form the symmetric covariance matrix \f$ A^T A \f$.
|
||||
|
||||
\b Observations:
|
||||
+ LLT is always the fastest solver.
|
||||
+ LLT is always the fastest solvers.
|
||||
+ For largely over-constrained problems, the cost of Cholesky/LU decompositions is dominated by the computation of the symmetric covariance matrix.
|
||||
+ For large problem sizes, only the decompositions implementing a cache-friendly blocking strategy scale well. Those include LLT, PartialPivLU, HouseholderQR, and BDCSVD. This explains why for a 4k x 4k matrix, HouseholderQR is faster than LDLT.
|
||||
+ For large problem sizes, only the decomposition implementing a cache-friendly blocking strategy scale well. Those include LLT, PartialPivLU, HouseholderQR, and BDCSVD. This explain why for a 4k x 4k matrix, HouseholderQR is faster than LDLT. In the future, LDLT and ColPivHouseholderQR will also implement blocking strategies.
|
||||
+ CompleteOrthogonalDecomposition is based on ColPivHouseholderQR and they thus achieve the same level of performance.
|
||||
+ FullPivLU and FullPivHouseholderQR are dramatically slower for large matrices due to the lack of blocking, and are not shown for the 4k x 4k case.
|
||||
|
||||
The above table was originally generated by a benchmark tool. Feel free to write your own benchmark to generate a table matching your hardware, compiler, and favorite problem sizes.
|
||||
|
||||
|
||||
@@ -7,33 +7,13 @@ of equations, say \a Ax = \a b, has no solutions. In this case, it makes sense t
|
||||
vector \a x which is closest to being a solution, in the sense that the difference \a Ax - \a b is
|
||||
as small as possible. This \a x is called the least square solution (if the Euclidean norm is used).
|
||||
|
||||
The methods discussed on this page are the complete orthogonal decomposition (COD), the SVD
|
||||
decomposition, other QR decompositions, and normal equations. For most problems, we recommend
|
||||
CompleteOrthogonalDecomposition: it robustly computes the minimum-norm least squares solution
|
||||
(like the SVD) for both over- and under-determined systems, including rank-deficient ones, but at
|
||||
QR-like speed. The SVD is the most robust but also the slowest; use it when you also need singular
|
||||
values or vectors. Normal equations are the fastest but least robust.
|
||||
The three methods discussed on this page are the SVD decomposition, the QR decomposition and normal
|
||||
equations. Of these, the SVD decomposition is generally the most accurate but the slowest, normal
|
||||
equations is the fastest but least accurate, and the QR decomposition is in between.
|
||||
|
||||
\eigenAutoToc
|
||||
|
||||
|
||||
\section LeastSquaresCOD Using the complete orthogonal decomposition (recommended)
|
||||
|
||||
CompleteOrthogonalDecomposition is the recommended method for least squares problems. It handles the
|
||||
widest class of problems — overdetermined, underdetermined, and rank-deficient systems — and computes
|
||||
the minimum-norm solution when the system is rank-deficient or underdetermined, just like the SVD.
|
||||
It is based on a rank-revealing QR factorization (ColPivHouseholderQR) followed by a post-processing
|
||||
step, so it is significantly faster than SVD while providing comparable robustness.
|
||||
|
||||
<table class="example">
|
||||
<tr><th>Example:</th><th>Output:</th></tr>
|
||||
<tr>
|
||||
<td>\include LeastSquaresCOD.cpp </td>
|
||||
<td>\verbinclude LeastSquaresCOD.out </td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
|
||||
\section LeastSquaresSVD Using the SVD decomposition
|
||||
|
||||
The \link BDCSVD::solve() solve() \endlink method in the BDCSVD class can be directly used to
|
||||
@@ -50,19 +30,16 @@ computing least squares solutions:
|
||||
</table>
|
||||
|
||||
This is example from the page \link TutorialLinearAlgebra Linear algebra and decompositions \endlink.
|
||||
The SVD gives you singular values and vectors in addition to the least squares solution, but if you
|
||||
only need the solution, CompleteOrthogonalDecomposition (above) is faster.
|
||||
If you just need to solve the least squares problem, but are not interested in the SVD per se, a
|
||||
faster alternative method is CompleteOrthogonalDecomposition.
|
||||
|
||||
|
||||
\section LeastSquaresQR Using other QR decompositions
|
||||
\section LeastSquaresQR Using the QR decomposition
|
||||
|
||||
The solve() method in QR decomposition classes also computes the least squares solution. Besides
|
||||
CompleteOrthogonalDecomposition (above), there are three other QR decomposition classes:
|
||||
HouseholderQR (no pivoting, so fast but unreliable if your matrix is not full rank),
|
||||
ColPivHouseholderQR (column pivoting, a bit slower but rank-revealing), and FullPivHouseholderQR
|
||||
(full pivoting, significantly slower and rarely needed in practice).
|
||||
Note that only CompleteOrthogonalDecomposition and the SVD-based solvers compute minimum-norm
|
||||
solutions for rank-deficient or underdetermined problems; the other QR variants do not.
|
||||
The solve() method in QR decomposition classes also computes the least squares solution. There are
|
||||
three QR decomposition classes: HouseholderQR (no pivoting, fast but unstable if your matrix is
|
||||
not rull rank), ColPivHouseholderQR (column pivoting, thus a bit slower but more stable) and
|
||||
FullPivHouseholderQR (full pivoting, so slowest and slightly more stable than ColPivHouseholderQR).
|
||||
Here is an example with column pivoting:
|
||||
|
||||
<table class="example">
|
||||
|
||||
@@ -42,10 +42,10 @@ To get an overview of the true relative speed of the different decompositions, c
|
||||
<tr class="alt">
|
||||
<td>FullPivLU</td>
|
||||
<td>-</td>
|
||||
<td>Slow (no blocking)</td>
|
||||
<td>Slow</td>
|
||||
<td>Proven</td>
|
||||
<td>Yes</td>
|
||||
<td>Rank, kernel, image</td>
|
||||
<td>-</td>
|
||||
<td>Yes</td>
|
||||
<td>Excellent</td>
|
||||
<td>-</td>
|
||||
@@ -78,7 +78,7 @@ To get an overview of the true relative speed of the different decompositions, c
|
||||
<tr>
|
||||
<td>FullPivHouseholderQR</td>
|
||||
<td>-</td>
|
||||
<td>Slow (no blocking)</td>
|
||||
<td>Slow</td>
|
||||
<td>Proven</td>
|
||||
<td>Yes</td>
|
||||
<td>Orthogonalization</td>
|
||||
@@ -120,7 +120,7 @@ To get an overview of the true relative speed of the different decompositions, c
|
||||
<td>-</td>
|
||||
<td>Yes</td>
|
||||
<td>Excellent</td>
|
||||
<td>-</td>
|
||||
<td><em>Soon: blocking</em></td>
|
||||
</tr>
|
||||
|
||||
<tr><th class="inter" colspan="9">\n Singular values and eigenvalues decompositions</th></tr>
|
||||
@@ -232,7 +232,7 @@ To get an overview of the true relative speed of the different decompositions, c
|
||||
<td>-</td>
|
||||
<td>-</td>
|
||||
<td>Good</td>
|
||||
<td>-</td>
|
||||
<td><em>Soon: blocking</em></td>
|
||||
</tr>
|
||||
|
||||
<tr>
|
||||
@@ -244,7 +244,7 @@ To get an overview of the true relative speed of the different decompositions, c
|
||||
<td>-</td>
|
||||
<td>-</td>
|
||||
<td>Good</td>
|
||||
<td>-</td>
|
||||
<td><em>Soon: blocking</em></td>
|
||||
</tr>
|
||||
|
||||
</table>
|
||||
@@ -253,32 +253,9 @@ To get an overview of the true relative speed of the different decompositions, c
|
||||
<ul>
|
||||
<li><a name="note1">\b 1: </a>There exist two variants of the LDLT algorithm. Eigen's one produces a pure diagonal D matrix, and therefore it cannot handle indefinite matrices, unlike Lapack's one which produces a block diagonal D matrix.</li>
|
||||
<li><a name="note2">\b 2: </a>Eigenvalues, SVD and Schur decompositions rely on iterative algorithms. Their convergence speed depends on how well the eigenvalues are separated.</li>
|
||||
<li><a name="note3">\b 3: </a>Our JacobiSVD is two-sided, making for proven and optimal precision for square matrices. For non-square matrices, we have to use a QR preconditioner first. The default choice, ColPivHouseholderQR, is already very reliable, but if you want it to be proven, use FullPivHouseholderQR instead.</li>
|
||||
<li><a name="note3">\b 3: </a>Our JacobiSVD is two-sided, making for proven and optimal precision for square matrices. For non-square matrices, we have to use a QR preconditioner first. The default choice, ColPivHouseholderQR, is already very reliable, but if you want it to be proven, use FullPivHouseholderQR instead.
|
||||
</ul>
|
||||
|
||||
\section TopicLinAlgPracticalGuidance Practical guidance
|
||||
|
||||
The following recommendations apply to the most common use cases:
|
||||
|
||||
\li <b>Symmetric positive definite systems:</b> Use \b LLT. It is the fastest solver and has excellent
|
||||
numerical properties for this class of problems. For semidefinite or nearly singular symmetric systems,
|
||||
use \b LDLT.
|
||||
\li <b>General invertible systems:</b> Use \b PartialPivLU. It uses cache-friendly blocking and implicit
|
||||
multi-threading, making it the fastest general-purpose solver. Partial pivoting is sufficient for
|
||||
virtually all practical problems.
|
||||
\li <b>Least squares (over- or under-determined systems):</b> Use \b CompleteOrthogonalDecomposition as
|
||||
the default. Like the SVD, it robustly computes the minimum-norm solution for rank-deficient and
|
||||
under-determined problems, but at QR-like speed. Use \b BDCSVD when you also need singular values
|
||||
or vectors, not just the least squares solution.
|
||||
\li <b>Full-rank least squares (overdetermined systems):</b> When the matrix is known to be full rank,
|
||||
\b HouseholderQR is the fastest option. For very tall and skinny well-conditioned matrices,
|
||||
solving via the normal equations with \b LLT can be faster still.
|
||||
\li <b>FullPivLU and FullPivHouseholderQR</b> use complete pivoting, which prevents the use of
|
||||
cache-friendly blocking algorithms and makes them significantly slower than their partial/column
|
||||
pivoting counterparts. In practice, complete pivoting rarely provides meaningful accuracy benefits.
|
||||
These decompositions are primarily useful for debugging, pedagogy, or the very rare case
|
||||
where column pivoting is insufficient.
|
||||
|
||||
\section TopicLinAlgTerminology Terminology
|
||||
|
||||
<dl>
|
||||
|
||||
@@ -43,23 +43,7 @@ depending on your matrix, the problem you are trying to solve, and the trade-off
|
||||
<th>Requirements<br/>on the matrix</th>
|
||||
<th>Speed<br/> (small-to-medium)</th>
|
||||
<th>Speed<br/> (large)</th>
|
||||
<th>Robustness<sup><a href="#note_robust">*</a></sup></th>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>LLT</td>
|
||||
<td>llt()</td>
|
||||
<td>Positive definite</td>
|
||||
<td>+++</td>
|
||||
<td>+++</td>
|
||||
<td>+</td>
|
||||
</tr>
|
||||
<tr class="alt">
|
||||
<td>LDLT</td>
|
||||
<td>ldlt()</td>
|
||||
<td>Positive or negative<br/> semidefinite</td>
|
||||
<td>+++</td>
|
||||
<td>+</td>
|
||||
<td>++</td>
|
||||
<th>Accuracy</th>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>PartialPivLU</td>
|
||||
@@ -70,6 +54,14 @@ depending on your matrix, the problem you are trying to solve, and the trade-off
|
||||
<td>+</td>
|
||||
</tr>
|
||||
<tr class="alt">
|
||||
<td>FullPivLU</td>
|
||||
<td>fullPivLu()</td>
|
||||
<td>None</td>
|
||||
<td>-</td>
|
||||
<td>- -</td>
|
||||
<td>+++</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>HouseholderQR</td>
|
||||
<td>householderQr()</td>
|
||||
<td>None</td>
|
||||
@@ -77,7 +69,7 @@ depending on your matrix, the problem you are trying to solve, and the trade-off
|
||||
<td>++</td>
|
||||
<td>+</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<tr class="alt">
|
||||
<td>ColPivHouseholderQR</td>
|
||||
<td>colPivHouseholderQr()</td>
|
||||
<td>None</td>
|
||||
@@ -85,6 +77,14 @@ depending on your matrix, the problem you are trying to solve, and the trade-off
|
||||
<td>-</td>
|
||||
<td>+++</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>FullPivHouseholderQR</td>
|
||||
<td>fullPivHouseholderQr()</td>
|
||||
<td>None</td>
|
||||
<td>-</td>
|
||||
<td>- -</td>
|
||||
<td>+++</td>
|
||||
</tr>
|
||||
<tr class="alt">
|
||||
<td>CompleteOrthogonalDecomposition</td>
|
||||
<td>completeOrthogonalDecomposition()</td>
|
||||
@@ -93,7 +93,23 @@ depending on your matrix, the problem you are trying to solve, and the trade-off
|
||||
<td>-</td>
|
||||
<td>+++</td>
|
||||
</tr>
|
||||
<tr class="alt">
|
||||
<td>LLT</td>
|
||||
<td>llt()</td>
|
||||
<td>Positive definite</td>
|
||||
<td>+++</td>
|
||||
<td>+++</td>
|
||||
<td>+</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>LDLT</td>
|
||||
<td>ldlt()</td>
|
||||
<td>Positive or negative<br/> semidefinite</td>
|
||||
<td>+++</td>
|
||||
<td>+</td>
|
||||
<td>++</td>
|
||||
</tr>
|
||||
<tr class="alt">
|
||||
<td>BDCSVD</td>
|
||||
<td>bdcSvd()</td>
|
||||
<td>None</td>
|
||||
@@ -110,36 +126,15 @@ depending on your matrix, the problem you are trying to solve, and the trade-off
|
||||
<td>+++</td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
<a name="note_robust"><b>*</b></a> The <b>Robustness</b> column indicates how well the decomposition handles
|
||||
ill-conditioned or rank-deficient matrices. All decompositions give excellent accuracy when their
|
||||
requirements on the matrix are met and the problem is well-conditioned.
|
||||
|
||||
To get an overview of the true relative speed of the different decompositions, check this \link DenseDecompositionBenchmark benchmark \endlink.
|
||||
|
||||
All of these decompositions offer a solve() method that works as in the above example.
|
||||
All of these decompositions offer a solve() method that works as in the above example.
|
||||
|
||||
\b Practical \b recommendations:
|
||||
\li If your matrix is symmetric positive definite, use \b LLT. It is the fastest and is perfectly accurate
|
||||
for this class of problems. If your matrix is only positive or negative semidefinite, use \b LDLT.
|
||||
\li For a general invertible matrix, \b PartialPivLU is the best choice. It is fast (uses cache-friendly
|
||||
blocking) and reliable for the vast majority of problems.
|
||||
\li For least squares problems (over- or under-determined systems), \b CompleteOrthogonalDecomposition
|
||||
is the recommended default. Like the SVD, it robustly computes the minimum-norm solution for
|
||||
rank-deficient and under-determined problems, but at the cost of a QR decomposition rather than
|
||||
an SVD. Use \b ColPivHouseholderQR if you only need least squares for full-rank overdetermined
|
||||
systems and don't need the minimum-norm property.
|
||||
\li \b SVD decompositions (BDCSVD, JacobiSVD) are the most robust but also the slowest. Use these when
|
||||
you need singular values/vectors, not just the solution.
|
||||
\li \b HouseholderQR is the fastest option for full-rank least squares problems, but it does not
|
||||
reveal rank and cannot compute minimum-norm solutions for rank-deficient problems.
|
||||
\li FullPivLU and FullPivHouseholderQR use complete pivoting, which is significantly slower due to
|
||||
lack of blocking. In practice, they rarely provide meaningful benefits over PartialPivLU and
|
||||
ColPivHouseholderQR, respectively, and are not recommended for general use. They are primarily useful
|
||||
for debugging or for pedagogical purposes.
|
||||
|
||||
Here's an example showing the use of LLT for a symmetric positive definite system, also demonstrating
|
||||
that using a general matrix (not a vector) as right hand side is possible:
|
||||
If you know more about the properties of your matrix, you can use the above table to select the best method.
|
||||
For example, a good choice for solving linear systems with a non-symmetric matrix of full rank is PartialPivLU.
|
||||
If you know that your matrix is also symmetric and positive definite, the above table says that
|
||||
a very good choice is the LLT or LDLT decomposition. Here's an example, also demonstrating that using a general
|
||||
matrix (not a vector) as right hand side is possible:
|
||||
|
||||
<table class="example">
|
||||
<tr><th>Example:</th><th>Output:</th></tr>
|
||||
@@ -156,15 +151,14 @@ supports many other decompositions), see our special page on
|
||||
|
||||
\section TutorialLinAlgLeastsquares Least squares solving
|
||||
|
||||
The recommended method to solve under- or over-determined linear systems in the least squares sense is
|
||||
\b CompleteOrthogonalDecomposition. Like the SVD, it robustly computes the minimum-norm least squares
|
||||
solution, correctly handling rank-deficient and under-determined problems, but it is significantly faster
|
||||
since it is based on a rank-revealing QR decomposition rather than a full SVD.
|
||||
The most general and accurate method to solve under- or over-determined linear systems
|
||||
in the least squares sense, is the SVD decomposition. Eigen provides two implementations.
|
||||
The recommended one is the BDCSVD class, which scales well for large problems
|
||||
and automatically falls back to the JacobiSVD class for smaller problems.
|
||||
For both classes, their solve() method solved the linear system in the least-squares
|
||||
sense.
|
||||
|
||||
If you also need the singular values or vectors themselves (not just the least squares solution), use
|
||||
\b BDCSVD, which scales well for large problems and automatically falls back to JacobiSVD for smaller ones.
|
||||
|
||||
Here is an example using the SVD:
|
||||
Here is an example:
|
||||
<table class="example">
|
||||
<tr><th>Example:</th><th>Output:</th></tr>
|
||||
<tr>
|
||||
@@ -173,9 +167,11 @@ Here is an example using the SVD:
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
If you know more about the problem, faster methods are available.
|
||||
If your matrix is full rank, HouseholderQR is the fastest method. If your matrix is full rank and
|
||||
well conditioned, using the Cholesky decomposition (LLT) on the normal equations can be faster still.
|
||||
An alternative to the SVD, which is usually faster and about as accurate, is CompleteOrthogonalDecomposition.
|
||||
|
||||
Again, if you know more about the problem, the table above contains methods that are potentially faster.
|
||||
If your matrix is full rank, HouseHolderQR is the method of choice. If your matrix is full rank and well conditioned,
|
||||
using the Cholesky decomposition (LLT) on the matrix of the normal equations can be faster still.
|
||||
Our page on \link LeastSquares least squares solving \endlink has more details.
|
||||
|
||||
|
||||
@@ -271,9 +267,8 @@ singular matrix). On \ref TopicLinearAlgebraDecompositions "this table" you can
|
||||
whether they are rank-revealing or not.
|
||||
|
||||
Rank-revealing decompositions offer at least a rank() method. They can also offer convenience methods such as isInvertible(),
|
||||
and some are also providing methods to compute the kernel (null-space) and image (column-space) of the matrix.
|
||||
ColPivHouseholderQR, CompleteOrthogonalDecomposition, and FullPivLU all provide these methods. Here is an example using
|
||||
FullPivLU:
|
||||
and some are also providing methods to compute the kernel (null-space) and image (column-space) of the matrix, as is the
|
||||
case with FullPivLU:
|
||||
|
||||
<table class="example">
|
||||
<tr><th>Example:</th><th>Output:</th></tr>
|
||||
|
||||
@@ -1,3 +0,0 @@
|
||||
MatrixXf A = MatrixXf::Random(3, 2);
|
||||
VectorXf b = VectorXf::Random(3);
|
||||
cout << "The solution using the COD is:\n" << A.completeOrthogonalDecomposition().solve(b) << endl;
|
||||
155
test/bdcsvd.cpp
155
test/bdcsvd.cpp
@@ -15,7 +15,6 @@
|
||||
#define EIGEN_RUNTIME_NO_MALLOC
|
||||
|
||||
#include "main.h"
|
||||
#include "tridiag_test_matrices.h"
|
||||
#include <Eigen/SVD>
|
||||
|
||||
#define SVD_DEFAULT(M) BDCSVD<M>
|
||||
@@ -147,26 +146,148 @@ void verify_bidiagonal_vs_matrix_svd(const Matrix<RealScalar, Dynamic, 1>& diag,
|
||||
|
||||
template <typename RealScalar>
|
||||
void bdcsvd_bidiagonal_hard_cases() {
|
||||
using std::abs;
|
||||
using std::cos;
|
||||
using std::pow;
|
||||
using std::sin;
|
||||
typedef Matrix<RealScalar, Dynamic, 1> VectorXr;
|
||||
|
||||
Eigen::internal::set_is_malloc_allowed(true);
|
||||
|
||||
// Use the shared tridiagonal test matrix generators.
|
||||
// Each generator fills (diag, offdiag) which we treat as (diagonal, superdiagonal)
|
||||
// of a bidiagonal matrix.
|
||||
test::for_all_tridiag_test_matrices<RealScalar>(
|
||||
[](const auto& diag, const auto& offdiag) { verify_bidiagonal_svd<RealScalar>(diag, offdiag); });
|
||||
const RealScalar eps = NumTraits<RealScalar>::epsilon();
|
||||
|
||||
// Additional SVD-specific test: identity with cross-validation against full matrix SVD.
|
||||
test::for_tridiag_sizes<RealScalar>([](auto& diag, auto& offdiag) {
|
||||
test::tridiag_identity(diag, offdiag);
|
||||
verify_bidiagonal_vs_matrix_svd<RealScalar>(diag, offdiag);
|
||||
});
|
||||
// Test sizes: cover n=1, very small, below/above algoSwap (16), and larger.
|
||||
const int sizes[] = {1, 2, 3, 5, 10, 16, 20, 50, 100};
|
||||
const int numSizes = sizeof(sizes) / sizeof(sizes[0]);
|
||||
|
||||
// Additional SVD-specific test: scalar for n=1.
|
||||
{
|
||||
typedef Matrix<RealScalar, Dynamic, 1> VectorXr;
|
||||
VectorXr diag(1), offdiag(0);
|
||||
diag(0) = RealScalar(3.14);
|
||||
verify_bidiagonal_svd<RealScalar>(diag, offdiag);
|
||||
for (int si = 0; si < numSizes; ++si) {
|
||||
const Index n = sizes[si];
|
||||
VectorXr diag(n), superdiag(n > 1 ? n - 1 : 0);
|
||||
|
||||
// 1. Identity: d=[1,...,1], e=[0,...,0]
|
||||
diag.setOnes();
|
||||
superdiag.setZero();
|
||||
verify_bidiagonal_svd<RealScalar>(diag, superdiag);
|
||||
verify_bidiagonal_vs_matrix_svd<RealScalar>(diag, superdiag);
|
||||
|
||||
// 2. Zero: d=[0,...,0], e=[0,...,0]
|
||||
diag.setZero();
|
||||
superdiag.setZero();
|
||||
verify_bidiagonal_svd<RealScalar>(diag, superdiag);
|
||||
|
||||
// 3. Scalar (only meaningful for n=1, but runs for all)
|
||||
if (n == 1) {
|
||||
diag(0) = RealScalar(3.14);
|
||||
verify_bidiagonal_svd<RealScalar>(diag, superdiag);
|
||||
}
|
||||
|
||||
// 4. Golub-Kahan: d=[1,...,1], e=[1,...,1]
|
||||
diag.setOnes();
|
||||
if (n > 1) superdiag.setOnes();
|
||||
verify_bidiagonal_svd<RealScalar>(diag, superdiag);
|
||||
|
||||
// 5. Kahan matrix: d_i = s^(i-1), e_i = -c*s^(i-1)
|
||||
// Clamp exponents so condition number stays bounded by 1/eps.
|
||||
{
|
||||
const RealScalar theta = RealScalar(0.3);
|
||||
const RealScalar s = sin(theta);
|
||||
const RealScalar c = cos(theta);
|
||||
using std::log;
|
||||
const RealScalar maxPower = -log(eps) / (-log(s));
|
||||
for (Index i = 0; i < n; ++i) diag(i) = pow(s, numext::mini(RealScalar(i), maxPower));
|
||||
for (Index i = 0; i < n - 1; ++i) superdiag(i) = -c * pow(s, numext::mini(RealScalar(i), maxPower));
|
||||
verify_bidiagonal_svd<RealScalar>(diag, superdiag);
|
||||
}
|
||||
|
||||
// 6. Geometric decay diagonal: d_i = 0.5^i, e=[0,...,0]
|
||||
// Clamp so condition number stays bounded by 1/eps.
|
||||
{
|
||||
using std::log;
|
||||
const RealScalar base = RealScalar(0.5);
|
||||
const RealScalar maxPower = -log(eps) / (-log(base));
|
||||
for (Index i = 0; i < n; ++i) diag(i) = pow(base, numext::mini(RealScalar(i), maxPower));
|
||||
superdiag.setZero();
|
||||
verify_bidiagonal_svd<RealScalar>(diag, superdiag);
|
||||
}
|
||||
|
||||
// 7. Geometric decay superdiagonal: d=[1,...,1], e_i = 0.5^i
|
||||
diag.setOnes();
|
||||
for (Index i = 0; i < n - 1; ++i) superdiag(i) = pow(RealScalar(0.5), RealScalar(i));
|
||||
verify_bidiagonal_svd<RealScalar>(diag, superdiag);
|
||||
|
||||
// 8. Clustered at 1: d_i = 1 + i*eps, e=[0,...,0]
|
||||
for (Index i = 0; i < n; ++i) diag(i) = RealScalar(1) + RealScalar(i) * eps;
|
||||
superdiag.setZero();
|
||||
verify_bidiagonal_svd<RealScalar>(diag, superdiag);
|
||||
|
||||
// 9. Two clusters: half ≈ 1, half ≈ eps
|
||||
for (Index i = 0; i < n; ++i) diag(i) = (i < n / 2) ? RealScalar(1) : eps;
|
||||
superdiag.setZero();
|
||||
verify_bidiagonal_svd<RealScalar>(diag, superdiag);
|
||||
|
||||
// 10. Single tiny singular value: d=[1,...,1,eps], e=[eps^2,...]
|
||||
diag.setOnes();
|
||||
diag(n - 1) = eps;
|
||||
for (Index i = 0; i < n - 1; ++i) superdiag(i) = eps * eps;
|
||||
verify_bidiagonal_svd<RealScalar>(diag, superdiag);
|
||||
|
||||
// 11. Graded: d_i = 10^(-i), e_i = 10^(-i)
|
||||
for (Index i = 0; i < n; ++i) diag(i) = pow(RealScalar(10), -RealScalar(i));
|
||||
for (Index i = 0; i < n - 1; ++i) superdiag(i) = pow(RealScalar(10), -RealScalar(i));
|
||||
verify_bidiagonal_svd<RealScalar>(diag, superdiag);
|
||||
|
||||
// 12. Nearly diagonal: random diag, eps * random superdiag
|
||||
diag = VectorXr::Random(n).cwiseAbs() + VectorXr::Constant(n, RealScalar(0.1));
|
||||
for (Index i = 0; i < n - 1; ++i) superdiag(i) = eps * (RealScalar(0.5) + abs(internal::random<RealScalar>()));
|
||||
verify_bidiagonal_svd<RealScalar>(diag, superdiag);
|
||||
|
||||
// 13. All equal: d=[c,...,c], e=[c,...,c]
|
||||
diag.setConstant(RealScalar(2.5));
|
||||
if (n > 1) superdiag.setConstant(RealScalar(2.5));
|
||||
verify_bidiagonal_svd<RealScalar>(diag, superdiag);
|
||||
|
||||
// 14. Wilkinson: d_i = |n/2 - i|, e=[1,...,1]
|
||||
for (Index i = 0; i < n; ++i) diag(i) = abs(RealScalar(n / 2) - RealScalar(i));
|
||||
if (n > 1) superdiag.setOnes();
|
||||
verify_bidiagonal_svd<RealScalar>(diag, superdiag);
|
||||
|
||||
// 15. Overflow/underflow: alternating big/tiny diagonal, tiny/big superdiagonal
|
||||
{
|
||||
const RealScalar big = (std::numeric_limits<RealScalar>::max)() / RealScalar(1000);
|
||||
const RealScalar tiny = (std::numeric_limits<RealScalar>::min)() * RealScalar(1000);
|
||||
for (Index i = 0; i < n; ++i) diag(i) = (i % 2 == 0) ? big : tiny;
|
||||
for (Index i = 0; i < n - 1; ++i) superdiag(i) = (i % 2 == 0) ? tiny : big;
|
||||
verify_bidiagonal_svd<RealScalar>(diag, superdiag);
|
||||
}
|
||||
|
||||
// 16. Prescribed condition number: d_i = kappa^(-i/(n-1)), e_i = eps * random
|
||||
if (n > 1) {
|
||||
const RealScalar kappa = RealScalar(1) / eps;
|
||||
for (Index i = 0; i < n; ++i) diag(i) = pow(kappa, -RealScalar(i) / RealScalar(n - 1));
|
||||
for (Index i = 0; i < n - 1; ++i) superdiag(i) = eps * abs(internal::random<RealScalar>());
|
||||
verify_bidiagonal_svd<RealScalar>(diag, superdiag);
|
||||
}
|
||||
|
||||
// 17. Rank-deficient: d=[1,..,0,..,0,..,1], e=[0,...,0]
|
||||
for (Index i = 0; i < n; ++i) diag(i) = (i < n / 3 || i >= 2 * n / 3) ? RealScalar(1) : RealScalar(0);
|
||||
superdiag.setZero();
|
||||
verify_bidiagonal_svd<RealScalar>(diag, superdiag);
|
||||
|
||||
// 18. Arrowhead stress: d_i = linspace(1, n), e_i = 1/(i+1)
|
||||
for (Index i = 0; i < n; ++i) diag(i) = RealScalar(1) + RealScalar(i);
|
||||
for (Index i = 0; i < n - 1; ++i) superdiag(i) = RealScalar(1) / RealScalar(i + 1);
|
||||
verify_bidiagonal_svd<RealScalar>(diag, superdiag);
|
||||
|
||||
// 19. Repeated singular values: d=[1,2,3,1,2,3,...], e=[0,...,0]
|
||||
for (Index i = 0; i < n; ++i) diag(i) = RealScalar((i % 3) + 1);
|
||||
superdiag.setZero();
|
||||
verify_bidiagonal_svd<RealScalar>(diag, superdiag);
|
||||
|
||||
// 20. Glued identity: d=[1,...,1], e=0 except e[n/2-1]=eps
|
||||
diag.setOnes();
|
||||
superdiag.setZero();
|
||||
if (n > 2) superdiag(n / 2 - 1) = eps;
|
||||
verify_bidiagonal_svd<RealScalar>(diag, superdiag);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -10,7 +10,6 @@
|
||||
|
||||
#include "main.h"
|
||||
#include "svd_fill.h"
|
||||
#include "tridiag_test_matrices.h"
|
||||
#include <limits>
|
||||
#include <Eigen/Eigenvalues>
|
||||
#include <Eigen/SparseCore>
|
||||
@@ -32,19 +31,8 @@ void selfadjointeigensolver_essential_check(const MatrixType& m) {
|
||||
if (scaling < (std::numeric_limits<RealScalar>::min)()) {
|
||||
VERIFY(eiSymm.eigenvalues().cwiseAbs().maxCoeff() <= (std::numeric_limits<RealScalar>::min)());
|
||||
} else {
|
||||
// Columnwise residual check: for each eigenpair (lambda_i, v_i),
|
||||
// ||A*v_i - lambda_i*v_i|| / ||A||_max <= c * n * eps
|
||||
// This ensures accuracy for every eigenpair, including those corresponding
|
||||
// to small eigenvalues (which a Frobenius norm check would miss).
|
||||
// Computed in scaled space (dividing by ||A||_max) to avoid overflow.
|
||||
MatrixType scaledA = m.template selfadjointView<Lower>();
|
||||
scaledA /= scaling;
|
||||
MatrixType residual =
|
||||
scaledA * eiSymm.eigenvectors() - eiSymm.eigenvectors() * (eiSymm.eigenvalues() / scaling).asDiagonal();
|
||||
RealScalar tol = RealScalar(4) * RealScalar(numext::maxi(Index(1), n)) * NumTraits<RealScalar>::epsilon();
|
||||
for (Index i = 0; i < n; ++i) {
|
||||
VERIFY(residual.col(i).norm() <= tol);
|
||||
}
|
||||
VERIFY_IS_APPROX((m.template selfadjointView<Lower>() * eiSymm.eigenvectors()) / scaling,
|
||||
(eiSymm.eigenvectors() * eiSymm.eigenvalues().asDiagonal()) / scaling);
|
||||
}
|
||||
VERIFY_IS_APPROX(m.template selfadjointView<Lower>().eigenvalues(), eiSymm.eigenvalues());
|
||||
|
||||
@@ -76,8 +64,6 @@ void selfadjointeigensolver_essential_check(const MatrixType& m) {
|
||||
VERIFY(eiDirect.eigenvalues().cwiseAbs().maxCoeff() <= (std::numeric_limits<RealScalar>::min)());
|
||||
} else {
|
||||
VERIFY_IS_APPROX(eiSymm.eigenvalues() / scaling, eiDirect.eigenvalues() / scaling);
|
||||
// TODO: the direct 3x3 solver can produce large backward errors (>>n*eps*||A||)
|
||||
// on some matrices. Investigate and fix, then tighten this to a Frobenius norm check.
|
||||
VERIFY_IS_APPROX((m.template selfadjointView<Lower>() * eiDirect.eigenvectors()) / scaling,
|
||||
(eiDirect.eigenvectors() * eiDirect.eigenvalues().asDiagonal()) / scaling);
|
||||
VERIFY_IS_APPROX(m.template selfadjointView<Lower>().eigenvalues() / scaling, eiDirect.eigenvalues() / scaling);
|
||||
@@ -422,137 +408,6 @@ void selfadjointeigensolver_tridiagonal_scaled(const MatrixType& m) {
|
||||
VERIFY_IS_APPROX(eig2.eigenvalues(), eig2v.eigenvalues());
|
||||
}
|
||||
|
||||
// Test computeFromTridiagonal with wide dynamic range across decoupled blocks.
|
||||
// This exercises the per-block scaling in computeFromTridiagonal_impl: a zero on the
|
||||
// subdiagonal decouples the matrix into blocks with vastly different scales. Global
|
||||
// scaling would underflow the small block; per-block scaling handles both correctly.
|
||||
template <typename RealScalar>
|
||||
void selfadjointeigensolver_tridiagonal_wide_range() {
|
||||
using std::sqrt;
|
||||
typedef Matrix<RealScalar, Dynamic, Dynamic> MatrixType;
|
||||
typedef Matrix<RealScalar, Dynamic, 1> VectorType;
|
||||
|
||||
// Block 1: entries near overflow threshold.
|
||||
// Block 2: entries near 1.
|
||||
// Separated by a zero subdiagonal entry.
|
||||
const RealScalar big = sqrt(NumTraits<RealScalar>::highest()) / RealScalar(10);
|
||||
const Index n = 6;
|
||||
VectorType diag(n), subdiag(n - 1);
|
||||
|
||||
// First block: [0..2], large scale.
|
||||
diag(0) = big;
|
||||
diag(1) = big * RealScalar(1.1);
|
||||
diag(2) = big * RealScalar(0.9);
|
||||
subdiag(0) = big * RealScalar(0.01);
|
||||
subdiag(1) = big * RealScalar(0.02);
|
||||
// Zero subdiagonal decouples the two blocks.
|
||||
subdiag(2) = RealScalar(0);
|
||||
// Second block: [3..5], O(1) scale.
|
||||
diag(3) = RealScalar(1);
|
||||
diag(4) = RealScalar(2);
|
||||
diag(5) = RealScalar(3);
|
||||
subdiag(3) = RealScalar(0.5);
|
||||
subdiag(4) = RealScalar(0.3);
|
||||
|
||||
// Build the full tridiagonal matrix for residual checking.
|
||||
MatrixType T = MatrixType::Zero(n, n);
|
||||
T.diagonal() = diag;
|
||||
T.template diagonal<1>() = subdiag;
|
||||
T.template diagonal<-1>() = subdiag;
|
||||
|
||||
SelfAdjointEigenSolver<MatrixType> eig;
|
||||
eig.computeFromTridiagonal(diag, subdiag, ComputeEigenvectors);
|
||||
VERIFY_IS_EQUAL(eig.info(), Success);
|
||||
|
||||
// Eigenvalues must be sorted.
|
||||
for (Index i = 1; i < n; ++i) {
|
||||
VERIFY(eig.eigenvalues()(i) >= eig.eigenvalues()(i - 1));
|
||||
}
|
||||
|
||||
// Eigenvectors must be orthonormal.
|
||||
RealScalar unitary_tol = RealScalar(4) * RealScalar(n) * NumTraits<RealScalar>::epsilon();
|
||||
VERIFY(eig.eigenvectors().isUnitary(unitary_tol));
|
||||
|
||||
// Full residual check in scaled coordinates.
|
||||
RealScalar Tnorm = T.cwiseAbs().maxCoeff();
|
||||
MatrixType Tscaled = T / Tnorm;
|
||||
MatrixType residual = Tscaled * eig.eigenvectors() - eig.eigenvectors() * (eig.eigenvalues() / Tnorm).asDiagonal();
|
||||
RealScalar rel_err = residual.norm() / Tscaled.norm();
|
||||
VERIFY(rel_err <= RealScalar(8) * RealScalar(n) * NumTraits<RealScalar>::epsilon());
|
||||
|
||||
// The small eigenvalues (~1,2,3) must be accurate, not lost to underflow.
|
||||
// With global scaling to [-1,1], dividing by 'big' would underflow these to zero.
|
||||
// Verify the small eigenvalues are within O(eps) of their true values.
|
||||
// The small block is exactly [[1, 0.5, 0], [0.5, 2, 0.3], [0, 0.3, 3]].
|
||||
MatrixType T_small(3, 3);
|
||||
T_small << RealScalar(1), RealScalar(0.5), RealScalar(0), RealScalar(0.5), RealScalar(2), RealScalar(0.3),
|
||||
RealScalar(0), RealScalar(0.3), RealScalar(3);
|
||||
SelfAdjointEigenSolver<MatrixType> eig_small(T_small);
|
||||
VectorType small_evals = eig_small.eigenvalues();
|
||||
|
||||
// Find the 3 smallest eigenvalues from the combined solver (they should be sorted first).
|
||||
VectorType combined_small = eig.eigenvalues().head(3);
|
||||
VERIFY_IS_APPROX(combined_small, small_evals);
|
||||
|
||||
// Eigenvalues-only mode must agree.
|
||||
SelfAdjointEigenSolver<MatrixType> eig_vals;
|
||||
eig_vals.computeFromTridiagonal(diag, subdiag, EigenvaluesOnly);
|
||||
VERIFY_IS_EQUAL(eig_vals.info(), Success);
|
||||
VERIFY_IS_APPROX(eig.eigenvalues() / Tnorm, eig_vals.eigenvalues() / Tnorm);
|
||||
}
|
||||
|
||||
// Test computeFromTridiagonal with structured hard-case matrices from the literature.
|
||||
template <typename RealScalar>
|
||||
void selfadjointeigensolver_structured_tridiagonal() {
|
||||
typedef Matrix<RealScalar, Dynamic, Dynamic> MatrixType;
|
||||
|
||||
test::for_all_symmetric_tridiag_test_matrices<RealScalar>([](const auto& diag, const auto& offdiag) {
|
||||
Index n = diag.size();
|
||||
|
||||
// Build the full symmetric tridiagonal matrix for residual checking.
|
||||
MatrixType T = MatrixType::Zero(n, n);
|
||||
T.diagonal() = diag;
|
||||
if (n > 1) {
|
||||
T.template diagonal<1>() = offdiag;
|
||||
T.template diagonal<-1>() = offdiag;
|
||||
}
|
||||
RealScalar Tnorm = T.cwiseAbs().maxCoeff();
|
||||
|
||||
// Test with eigenvectors.
|
||||
SelfAdjointEigenSolver<MatrixType> eig;
|
||||
eig.computeFromTridiagonal(diag, offdiag, ComputeEigenvectors);
|
||||
VERIFY_IS_EQUAL(eig.info(), Success);
|
||||
|
||||
// Eigenvalues must be sorted.
|
||||
for (Index i = 1; i < n; ++i) {
|
||||
VERIFY(eig.eigenvalues()(i) >= eig.eigenvalues()(i - 1));
|
||||
}
|
||||
|
||||
// Eigenvectors must be orthonormal.
|
||||
RealScalar unitary_tol =
|
||||
numext::maxi(RealScalar(4) * RealScalar(n) * NumTraits<RealScalar>::epsilon(), test_precision<RealScalar>());
|
||||
VERIFY(eig.eigenvectors().isUnitary(unitary_tol));
|
||||
|
||||
// Residual check: ||T*V - V*D||_F / ||T||_max should be O(n*eps).
|
||||
// Scale T to avoid overflow in the matrix product when entries span extreme ranges.
|
||||
if (Tnorm > (std::numeric_limits<RealScalar>::min)()) {
|
||||
MatrixType Tscaled = T / Tnorm;
|
||||
MatrixType residual =
|
||||
Tscaled * eig.eigenvectors() - eig.eigenvectors() * (eig.eigenvalues() / Tnorm).asDiagonal();
|
||||
RealScalar rel_err = residual.norm() / Tscaled.norm();
|
||||
VERIFY(rel_err <= RealScalar(8) * RealScalar(n) * NumTraits<RealScalar>::epsilon());
|
||||
}
|
||||
|
||||
// Eigenvalues-only mode must produce the same eigenvalues.
|
||||
SelfAdjointEigenSolver<MatrixType> eig_vals;
|
||||
eig_vals.computeFromTridiagonal(diag, offdiag, EigenvaluesOnly);
|
||||
VERIFY_IS_EQUAL(eig_vals.info(), Success);
|
||||
if (Tnorm > (std::numeric_limits<RealScalar>::min)()) {
|
||||
VERIFY_IS_APPROX(eig.eigenvalues() / Tnorm, eig_vals.eigenvalues() / Tnorm);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
// Test with diagonal matrices (tridiagonalization is trivial).
|
||||
template <typename MatrixType>
|
||||
void selfadjointeigensolver_diagonal(const MatrixType& m) {
|
||||
@@ -709,78 +564,11 @@ void bug_1225() {
|
||||
VERIFY_IS_APPROX(eig1.eigenvalues(), eig2.eigenvalues());
|
||||
}
|
||||
|
||||
// Verify that non-finite inputs are detected for all sizes, including 1x1.
|
||||
template <int>
|
||||
void selfadjointeigensolver_nonfinite() {
|
||||
const double inf = std::numeric_limits<double>::infinity();
|
||||
const double nan = std::numeric_limits<double>::quiet_NaN();
|
||||
|
||||
// 1x1 Inf.
|
||||
{
|
||||
Matrix<double, 1, 1> m;
|
||||
m << inf;
|
||||
SelfAdjointEigenSolver<Matrix<double, 1, 1>> eig(m);
|
||||
VERIFY_IS_EQUAL(eig.info(), NoConvergence);
|
||||
}
|
||||
// 1x1 NaN.
|
||||
{
|
||||
Matrix<double, 1, 1> m;
|
||||
m << nan;
|
||||
SelfAdjointEigenSolver<Matrix<double, 1, 1>> eig(m);
|
||||
VERIFY_IS_EQUAL(eig.info(), NoConvergence);
|
||||
}
|
||||
// 1x1 -Inf.
|
||||
{
|
||||
Matrix<double, 1, 1> m;
|
||||
m << -inf;
|
||||
SelfAdjointEigenSolver<Matrix<double, 1, 1>> eig(m);
|
||||
VERIFY_IS_EQUAL(eig.info(), NoConvergence);
|
||||
}
|
||||
// 3x3 with Inf.
|
||||
{
|
||||
Matrix3d m = Matrix3d::Identity();
|
||||
m(1, 1) = inf;
|
||||
SelfAdjointEigenSolver<Matrix3d> eig(m);
|
||||
VERIFY_IS_EQUAL(eig.info(), NoConvergence);
|
||||
}
|
||||
// 3x3 with NaN.
|
||||
{
|
||||
Matrix3d m = Matrix3d::Identity();
|
||||
m(0, 1) = m(1, 0) = nan;
|
||||
SelfAdjointEigenSolver<Matrix3d> eig(m);
|
||||
VERIFY_IS_EQUAL(eig.info(), NoConvergence);
|
||||
}
|
||||
// Dynamic size with Inf.
|
||||
{
|
||||
MatrixXd m = MatrixXd::Identity(5, 5);
|
||||
m(3, 3) = inf;
|
||||
SelfAdjointEigenSolver<MatrixXd> eig(m);
|
||||
VERIFY_IS_EQUAL(eig.info(), NoConvergence);
|
||||
}
|
||||
}
|
||||
|
||||
template <int>
|
||||
void bug_1204() {
|
||||
SparseMatrix<double> A(2, 2);
|
||||
A.setIdentity();
|
||||
SelfAdjointEigenSolver<Eigen::SparseMatrix<double>> eig(A);
|
||||
}
|
||||
|
||||
template <int>
|
||||
void selfadjointeigensolver_tridiagonal_zerosized() {
|
||||
SelfAdjointEigenSolver<MatrixXd> eig;
|
||||
VectorXd diag(0), subdiag(0);
|
||||
|
||||
eig.computeFromTridiagonal(diag, subdiag, EigenvaluesOnly);
|
||||
VERIFY_IS_EQUAL(eig.info(), Success);
|
||||
VERIFY_IS_EQUAL(eig.eigenvalues().size(), 0);
|
||||
VERIFY_RAISES_ASSERT(eig.eigenvectors());
|
||||
|
||||
eig.computeFromTridiagonal(diag, subdiag, ComputeEigenvectors);
|
||||
VERIFY_IS_EQUAL(eig.info(), Success);
|
||||
VERIFY_IS_EQUAL(eig.eigenvalues().size(), 0);
|
||||
VERIFY_IS_EQUAL(eig.eigenvectors().rows(), 0);
|
||||
VERIFY_IS_EQUAL(eig.eigenvectors().cols(), 0);
|
||||
SelfAdjointEigenSolver<Eigen::SparseMatrix<double> > eig(A);
|
||||
}
|
||||
|
||||
// Specific 3x3 test cases that stress the direct solver.
|
||||
@@ -918,14 +706,6 @@ EIGEN_DECLARE_TEST(eigensolver_selfadjoint) {
|
||||
CALL_SUBTEST_4(selfadjointeigensolver_tridiagonal_scaled(MatrixXd(s, s)));
|
||||
CALL_SUBTEST_3(selfadjointeigensolver_tridiagonal_scaled(MatrixXf(s, s)));
|
||||
|
||||
// structured tridiagonal hard cases from the literature
|
||||
CALL_SUBTEST_4(selfadjointeigensolver_structured_tridiagonal<double>());
|
||||
CALL_SUBTEST_3(selfadjointeigensolver_structured_tridiagonal<float>());
|
||||
|
||||
// wide dynamic range tridiagonal (per-block scaling regression)
|
||||
CALL_SUBTEST_4(selfadjointeigensolver_tridiagonal_wide_range<double>());
|
||||
CALL_SUBTEST_3(selfadjointeigensolver_tridiagonal_wide_range<float>());
|
||||
|
||||
// diagonal matrices
|
||||
CALL_SUBTEST_17(selfadjointeigensolver_diagonal(Matrix3d()));
|
||||
CALL_SUBTEST_4(selfadjointeigensolver_diagonal(MatrixXd(s, s)));
|
||||
@@ -944,8 +724,6 @@ EIGEN_DECLARE_TEST(eigensolver_selfadjoint) {
|
||||
CALL_SUBTEST_17(bug_1014<0>());
|
||||
CALL_SUBTEST_17(bug_1204<0>());
|
||||
CALL_SUBTEST_17(bug_1225<0>());
|
||||
CALL_SUBTEST_17(selfadjointeigensolver_nonfinite<0>());
|
||||
CALL_SUBTEST_8(selfadjointeigensolver_tridiagonal_zerosized<0>());
|
||||
|
||||
// Stress tests for direct 3x3 and 2x2 solvers.
|
||||
CALL_SUBTEST_17(direct_3x3_stress<0>());
|
||||
|
||||
@@ -1,383 +0,0 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2025 Rasmus Munk Larsen <rmlarsen@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#ifndef EIGEN_TEST_TRIDIAG_TEST_MATRICES_H
|
||||
#define EIGEN_TEST_TRIDIAG_TEST_MATRICES_H
|
||||
|
||||
// Structured tridiagonal test matrices from the numerical linear algebra
|
||||
// literature. Used by both the bidiagonal SVD and symmetric eigenvalue tests.
|
||||
//
|
||||
// Each generator writes into pre-allocated (diag, offdiag) vectors.
|
||||
// For SVD, offdiag is the superdiagonal of a bidiagonal matrix.
|
||||
// For eigenvalues, offdiag is the subdiagonal of a symmetric tridiagonal matrix.
|
||||
//
|
||||
// Usage:
|
||||
// Matrix<RealScalar, Dynamic, 1> diag(n), offdiag(n-1);
|
||||
// tridiag_identity(diag, offdiag); // fills diag and offdiag
|
||||
// my_verify(diag, offdiag); // solver-specific verification
|
||||
|
||||
#include <Eigen/Core>
|
||||
|
||||
namespace Eigen {
|
||||
namespace test {
|
||||
|
||||
// 1. Identity: d=[1,...,1], e=[0,...,0]
|
||||
template <typename VectorType>
|
||||
void tridiag_identity(VectorType& diag, VectorType& offdiag) {
|
||||
diag.setOnes();
|
||||
offdiag.setZero();
|
||||
}
|
||||
|
||||
// 2. Zero: d=[0,...,0], e=[0,...,0]
|
||||
template <typename VectorType>
|
||||
void tridiag_zero(VectorType& diag, VectorType& offdiag) {
|
||||
diag.setZero();
|
||||
offdiag.setZero();
|
||||
}
|
||||
|
||||
// 3. Constant: d=[c,...,c], e=[c,...,c]
|
||||
template <typename VectorType>
|
||||
void tridiag_constant(VectorType& diag, VectorType& offdiag,
|
||||
typename VectorType::Scalar c = typename VectorType::Scalar(2.5)) {
|
||||
diag.setConstant(c);
|
||||
offdiag.setConstant(c);
|
||||
}
|
||||
|
||||
// 4. 1-2-1 Toeplitz: d=[2,...,2], e=[1,...,1]
|
||||
// Eigenvalues: 2 - 2*cos(k*pi/(n+1)) for k=1,...,n
|
||||
template <typename VectorType>
|
||||
void tridiag_1_2_1(VectorType& diag, VectorType& offdiag) {
|
||||
typedef typename VectorType::Scalar Scalar;
|
||||
diag.setConstant(Scalar(2));
|
||||
offdiag.setOnes();
|
||||
}
|
||||
|
||||
// 5. Wilkinson W_{2m+1}: d_i = |m - i|, e=[1,...,1]
|
||||
// Has pairs of eigenvalues agreeing to many digits; stresses deflation.
|
||||
template <typename VectorType>
|
||||
void tridiag_wilkinson(VectorType& diag, VectorType& offdiag) {
|
||||
typedef typename VectorType::Scalar Scalar;
|
||||
Index n = diag.size();
|
||||
for (Index i = 0; i < n; ++i) diag(i) = numext::abs(Scalar(n / 2) - Scalar(i));
|
||||
offdiag.setOnes();
|
||||
}
|
||||
|
||||
// 6. Clement matrix: d=[0,...,0], e_i = sqrt(i*(n-1-i))
|
||||
// Known eigenvalues: -(n-1), -(n-3), ..., (n-3), (n-1)
|
||||
template <typename VectorType>
|
||||
void tridiag_clement(VectorType& diag, VectorType& offdiag) {
|
||||
EIGEN_USING_STD(sqrt);
|
||||
typedef typename VectorType::Scalar Scalar;
|
||||
Index n = diag.size();
|
||||
diag.setZero();
|
||||
for (Index i = 0; i < n - 1; ++i) offdiag(i) = sqrt(Scalar(i + 1) * Scalar(n - 1 - i));
|
||||
}
|
||||
|
||||
// 7. Kahan-style: d_i = s^i, e_i = -c*s^i with s=sin(theta), c=cos(theta).
|
||||
// Geometric decay with controlled condition number.
|
||||
template <typename VectorType>
|
||||
void tridiag_kahan(VectorType& diag, VectorType& offdiag,
|
||||
typename VectorType::Scalar theta = typename VectorType::Scalar(0.3)) {
|
||||
EIGEN_USING_STD(sin);
|
||||
EIGEN_USING_STD(cos);
|
||||
EIGEN_USING_STD(pow);
|
||||
EIGEN_USING_STD(log);
|
||||
typedef typename VectorType::Scalar Scalar;
|
||||
Index n = diag.size();
|
||||
const Scalar eps = NumTraits<Scalar>::epsilon();
|
||||
const Scalar s = sin(theta);
|
||||
const Scalar c = cos(theta);
|
||||
const Scalar maxPower = -log(eps) / (-log(s));
|
||||
for (Index i = 0; i < n; ++i) diag(i) = pow(s, numext::mini(Scalar(i), maxPower));
|
||||
for (Index i = 0; i < n - 1; ++i) offdiag(i) = -c * pow(s, numext::mini(Scalar(i), maxPower));
|
||||
}
|
||||
|
||||
// 8. Graded: d_i = base^(-i), e_i = base^(-i)
|
||||
template <typename VectorType>
|
||||
void tridiag_graded(VectorType& diag, VectorType& offdiag,
|
||||
typename VectorType::Scalar base = typename VectorType::Scalar(10)) {
|
||||
EIGEN_USING_STD(pow);
|
||||
typedef typename VectorType::Scalar Scalar;
|
||||
Index n = diag.size();
|
||||
for (Index i = 0; i < n; ++i) diag(i) = pow(base, -Scalar(i));
|
||||
for (Index i = 0; i < n - 1; ++i) offdiag(i) = pow(base, -Scalar(i));
|
||||
}
|
||||
|
||||
// 9. Geometric decay diagonal: d_i = base^i, e=[0,...,0]
|
||||
template <typename VectorType>
|
||||
void tridiag_geometric_diagonal(VectorType& diag, VectorType& offdiag,
|
||||
typename VectorType::Scalar base = typename VectorType::Scalar(0.5)) {
|
||||
EIGEN_USING_STD(pow);
|
||||
EIGEN_USING_STD(log);
|
||||
typedef typename VectorType::Scalar Scalar;
|
||||
Index n = diag.size();
|
||||
const Scalar eps = NumTraits<Scalar>::epsilon();
|
||||
const Scalar maxPower = -log(eps) / (-log(base));
|
||||
for (Index i = 0; i < n; ++i) diag(i) = pow(base, numext::mini(Scalar(i), maxPower));
|
||||
offdiag.setZero();
|
||||
}
|
||||
|
||||
// 10. Geometric decay offdiagonal: d=[1,...,1], e_i = base^i
|
||||
template <typename VectorType>
|
||||
void tridiag_geometric_offdiag(VectorType& diag, VectorType& offdiag,
|
||||
typename VectorType::Scalar base = typename VectorType::Scalar(0.5)) {
|
||||
EIGEN_USING_STD(pow);
|
||||
typedef typename VectorType::Scalar Scalar;
|
||||
Index n = diag.size();
|
||||
diag.setOnes();
|
||||
for (Index i = 0; i < n - 1; ++i) offdiag(i) = pow(base, Scalar(i));
|
||||
}
|
||||
|
||||
// 11. Clustered eigenvalues: d_i = 1 + i*eps, e=[0,...,0]
|
||||
template <typename VectorType>
|
||||
void tridiag_clustered(VectorType& diag, VectorType& offdiag) {
|
||||
typedef typename VectorType::Scalar Scalar;
|
||||
Index n = diag.size();
|
||||
const Scalar eps = NumTraits<Scalar>::epsilon();
|
||||
for (Index i = 0; i < n; ++i) diag(i) = Scalar(1) + Scalar(i) * eps;
|
||||
offdiag.setZero();
|
||||
}
|
||||
|
||||
// 12. Two clusters: half at 1, half at eps.
|
||||
template <typename VectorType>
|
||||
void tridiag_two_clusters(VectorType& diag, VectorType& offdiag) {
|
||||
typedef typename VectorType::Scalar Scalar;
|
||||
Index n = diag.size();
|
||||
const Scalar eps = NumTraits<Scalar>::epsilon();
|
||||
for (Index i = 0; i < n; ++i) diag(i) = (i < n / 2) ? Scalar(1) : eps;
|
||||
offdiag.setZero();
|
||||
}
|
||||
|
||||
// 13. Single tiny value: d=[1,...,1,eps], e=[eps^2,...,eps^2]
|
||||
template <typename VectorType>
|
||||
void tridiag_single_tiny(VectorType& diag, VectorType& offdiag) {
|
||||
typedef typename VectorType::Scalar Scalar;
|
||||
Index n = diag.size();
|
||||
const Scalar eps = NumTraits<Scalar>::epsilon();
|
||||
diag.setOnes();
|
||||
diag(n - 1) = eps;
|
||||
offdiag.setConstant(eps * eps);
|
||||
}
|
||||
|
||||
// 14. Overflow/underflow: alternating big/tiny diagonal and offdiagonal.
|
||||
template <typename VectorType>
|
||||
void tridiag_overflow_underflow(VectorType& diag, VectorType& offdiag) {
|
||||
typedef typename VectorType::Scalar Scalar;
|
||||
Index n = diag.size();
|
||||
const Scalar big = (std::numeric_limits<Scalar>::max)() / Scalar(1000);
|
||||
const Scalar tiny = (std::numeric_limits<Scalar>::min)() * Scalar(1000);
|
||||
for (Index i = 0; i < n; ++i) diag(i) = (i % 2 == 0) ? big : tiny;
|
||||
for (Index i = 0; i < n - 1; ++i) offdiag(i) = (i % 2 == 0) ? tiny : big;
|
||||
}
|
||||
|
||||
// 15. Prescribed condition number: d_i = kappa^(-i/(n-1)), e_i = eps * random.
|
||||
template <typename VectorType>
|
||||
void tridiag_prescribed_cond(VectorType& diag, VectorType& offdiag) {
|
||||
EIGEN_USING_STD(pow);
|
||||
EIGEN_USING_STD(abs);
|
||||
typedef typename VectorType::Scalar Scalar;
|
||||
Index n = diag.size();
|
||||
const Scalar eps = NumTraits<Scalar>::epsilon();
|
||||
const Scalar kappa = Scalar(1) / eps;
|
||||
for (Index i = 0; i < n; ++i) diag(i) = pow(kappa, -Scalar(i) / Scalar(n - 1));
|
||||
for (Index i = 0; i < n - 1; ++i) offdiag(i) = eps * abs(internal::random<Scalar>());
|
||||
}
|
||||
|
||||
// 16. Rank-deficient: d=[1,..,0,..,0,..,1], e=[0,...,0]
|
||||
template <typename VectorType>
|
||||
void tridiag_rank_deficient(VectorType& diag, VectorType& offdiag) {
|
||||
typedef typename VectorType::Scalar Scalar;
|
||||
Index n = diag.size();
|
||||
for (Index i = 0; i < n; ++i) diag(i) = (i < n / 3 || i >= 2 * n / 3) ? Scalar(1) : Scalar(0);
|
||||
offdiag.setZero();
|
||||
}
|
||||
|
||||
// 17. Arrowhead-like: d_i = linspace(1,n), e_i = 1/(i+1)
|
||||
template <typename VectorType>
|
||||
void tridiag_arrowhead(VectorType& diag, VectorType& offdiag) {
|
||||
typedef typename VectorType::Scalar Scalar;
|
||||
Index n = diag.size();
|
||||
for (Index i = 0; i < n; ++i) diag(i) = Scalar(1) + Scalar(i);
|
||||
for (Index i = 0; i < n - 1; ++i) offdiag(i) = Scalar(1) / Scalar(i + 1);
|
||||
}
|
||||
|
||||
// 18. Repeated values: d=[1,2,3,1,2,3,...], e=[0,...,0]
|
||||
template <typename VectorType>
|
||||
void tridiag_repeated(VectorType& diag, VectorType& offdiag) {
|
||||
typedef typename VectorType::Scalar Scalar;
|
||||
Index n = diag.size();
|
||||
for (Index i = 0; i < n; ++i) diag(i) = Scalar((i % 3) + 1);
|
||||
offdiag.setZero();
|
||||
}
|
||||
|
||||
// 19. Glued blocks: d=[1,...,1], e=0 except e[n/2-1]=eps.
|
||||
// Two identity blocks coupled by a tiny off-diagonal entry.
|
||||
template <typename VectorType>
|
||||
void tridiag_glued(VectorType& diag, VectorType& offdiag) {
|
||||
typedef typename VectorType::Scalar Scalar;
|
||||
Index n = diag.size();
|
||||
diag.setOnes();
|
||||
offdiag.setZero();
|
||||
if (n > 2) offdiag(n / 2 - 1) = NumTraits<Scalar>::epsilon();
|
||||
}
|
||||
|
||||
// 20. Nearly diagonal: random diag, eps * random offdiag.
|
||||
template <typename VectorType>
|
||||
void tridiag_nearly_diagonal(VectorType& diag, VectorType& offdiag) {
|
||||
EIGEN_USING_STD(abs);
|
||||
typedef typename VectorType::Scalar Scalar;
|
||||
Index n = diag.size();
|
||||
const Scalar eps = NumTraits<Scalar>::epsilon();
|
||||
diag = VectorType::Random(n).cwiseAbs() + VectorType::Constant(n, Scalar(0.1));
|
||||
for (Index i = 0; i < n - 1; ++i) offdiag(i) = eps * (Scalar(0.5) + abs(internal::random<Scalar>()));
|
||||
}
|
||||
|
||||
// 21. Negative eigenvalues: d_i = -i, e=[1,...,1]
|
||||
// (Only meaningful for symmetric eigenvalue problems, not SVD.)
|
||||
template <typename VectorType>
|
||||
void tridiag_negative(VectorType& diag, VectorType& offdiag) {
|
||||
typedef typename VectorType::Scalar Scalar;
|
||||
Index n = diag.size();
|
||||
for (Index i = 0; i < n; ++i) diag(i) = -Scalar(i + 1);
|
||||
offdiag.setOnes();
|
||||
}
|
||||
|
||||
// 22. Mixed sign diagonal: d_i = (-1)^i * (i+1), e=[1,...,1]
|
||||
// (Only meaningful for symmetric eigenvalue problems, not SVD.)
|
||||
template <typename VectorType>
|
||||
void tridiag_mixed_sign(VectorType& diag, VectorType& offdiag) {
|
||||
typedef typename VectorType::Scalar Scalar;
|
||||
Index n = diag.size();
|
||||
for (Index i = 0; i < n; ++i) diag(i) = ((i % 2 == 0) ? Scalar(1) : Scalar(-1)) * Scalar(i + 1);
|
||||
offdiag.setOnes();
|
||||
}
|
||||
|
||||
// Helper: iterate over a set of sizes and call a functor with each (diag, offdiag) pair
|
||||
// generated by a generator function.
|
||||
//
|
||||
// Usage:
|
||||
// for_tridiag_sizes([](auto& diag, auto& offdiag) {
|
||||
// tridiag_wilkinson(diag, offdiag);
|
||||
// my_verify(diag, offdiag);
|
||||
// });
|
||||
template <typename Scalar, typename Func>
|
||||
void for_tridiag_sizes(Func&& func) {
|
||||
const int sizes[] = {1, 2, 3, 5, 10, 16, 20, 50, 100};
|
||||
typedef Matrix<Scalar, Dynamic, 1> VectorType;
|
||||
for (int si = 0; si < int(sizeof(sizes) / sizeof(sizes[0])); ++si) {
|
||||
const Index n = sizes[si];
|
||||
VectorType diag(n), offdiag(n > 1 ? n - 1 : 0);
|
||||
func(diag, offdiag);
|
||||
}
|
||||
}
|
||||
|
||||
// Helper: run all generators (suitable for both SVD and eigenvalue problems).
|
||||
// The callback receives (diag, offdiag) after each generator fills them.
|
||||
template <typename Scalar, typename Func>
|
||||
void for_all_tridiag_test_matrices(Func&& verify) {
|
||||
const int sizes[] = {1, 2, 3, 5, 10, 16, 20, 50, 100};
|
||||
typedef Matrix<Scalar, Dynamic, 1> VectorType;
|
||||
|
||||
for (int si = 0; si < int(sizeof(sizes) / sizeof(sizes[0])); ++si) {
|
||||
const Index n = sizes[si];
|
||||
VectorType diag(n), offdiag(n > 1 ? n - 1 : 0);
|
||||
|
||||
tridiag_identity(diag, offdiag);
|
||||
verify(diag, offdiag);
|
||||
|
||||
tridiag_zero(diag, offdiag);
|
||||
verify(diag, offdiag);
|
||||
|
||||
tridiag_constant(diag, offdiag);
|
||||
verify(diag, offdiag);
|
||||
|
||||
tridiag_1_2_1(diag, offdiag);
|
||||
verify(diag, offdiag);
|
||||
|
||||
tridiag_wilkinson(diag, offdiag);
|
||||
verify(diag, offdiag);
|
||||
|
||||
if (n > 1) {
|
||||
tridiag_clement(diag, offdiag);
|
||||
verify(diag, offdiag);
|
||||
}
|
||||
|
||||
tridiag_kahan(diag, offdiag);
|
||||
verify(diag, offdiag);
|
||||
|
||||
tridiag_graded(diag, offdiag);
|
||||
verify(diag, offdiag);
|
||||
|
||||
tridiag_geometric_diagonal(diag, offdiag);
|
||||
verify(diag, offdiag);
|
||||
|
||||
if (n > 1) {
|
||||
tridiag_geometric_offdiag(diag, offdiag);
|
||||
verify(diag, offdiag);
|
||||
}
|
||||
|
||||
tridiag_clustered(diag, offdiag);
|
||||
verify(diag, offdiag);
|
||||
|
||||
tridiag_two_clusters(diag, offdiag);
|
||||
verify(diag, offdiag);
|
||||
|
||||
tridiag_single_tiny(diag, offdiag);
|
||||
verify(diag, offdiag);
|
||||
|
||||
tridiag_overflow_underflow(diag, offdiag);
|
||||
verify(diag, offdiag);
|
||||
|
||||
if (n > 1) {
|
||||
tridiag_prescribed_cond(diag, offdiag);
|
||||
verify(diag, offdiag);
|
||||
}
|
||||
|
||||
tridiag_rank_deficient(diag, offdiag);
|
||||
verify(diag, offdiag);
|
||||
|
||||
tridiag_arrowhead(diag, offdiag);
|
||||
verify(diag, offdiag);
|
||||
|
||||
tridiag_repeated(diag, offdiag);
|
||||
verify(diag, offdiag);
|
||||
|
||||
tridiag_glued(diag, offdiag);
|
||||
verify(diag, offdiag);
|
||||
|
||||
tridiag_nearly_diagonal(diag, offdiag);
|
||||
verify(diag, offdiag);
|
||||
}
|
||||
}
|
||||
|
||||
// Helper: run all generators, including those with negative values
|
||||
// (suitable only for symmetric eigenvalue problems, not SVD).
|
||||
template <typename Scalar, typename Func>
|
||||
void for_all_symmetric_tridiag_test_matrices(Func&& verify) {
|
||||
for_all_tridiag_test_matrices<Scalar>(verify);
|
||||
|
||||
const int sizes[] = {1, 2, 3, 5, 10, 16, 20, 50, 100};
|
||||
typedef Matrix<Scalar, Dynamic, 1> VectorType;
|
||||
|
||||
for (int si = 0; si < int(sizeof(sizes) / sizeof(sizes[0])); ++si) {
|
||||
const Index n = sizes[si];
|
||||
VectorType diag(n), offdiag(n > 1 ? n - 1 : 0);
|
||||
|
||||
tridiag_negative(diag, offdiag);
|
||||
verify(diag, offdiag);
|
||||
|
||||
tridiag_mixed_sign(diag, offdiag);
|
||||
verify(diag, offdiag);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace test
|
||||
} // namespace Eigen
|
||||
|
||||
#endif // EIGEN_TEST_TRIDIAG_TEST_MATRICES_H
|
||||
Reference in New Issue
Block a user