Compare commits

...

10 Commits

Author SHA1 Message Date
Rasmus Munk Larsen
6a9405bf7a GPU: Raise CUDA/HIP minimum and remove legacy guards
- Raise CUDA minimum from 9.0 to 11.4 (sm_70/Volta).
- Raise HIP minimum to GFX906 (Vega 20/MI50) / ROCm 5.6.
- Remove EIGEN_HAS_{CUDA,HIP,GPU}_FP16 guards — FP16 is always available
  on sm_70+ and GFX906+.
- Remove obsolete __HIP_ARCH_HAS_* preprocessor branches.
- C++14 cleanup: remove pre-C++14 workarounds in GPU code.
- Fix NVCC warnings (deprecated register keyword, unreachable code,
  tautological comparisons).
- Fix HIP test execution on gfx1151.
- Update CI configuration for new minimum versions.
2026-04-09 15:21:39 -07:00
Rasmus Munk Larsen
e055e4e415 Add plog_core_double with fallback for AVX without AVX2
libeigen/eigen!2407

Co-authored-by: Rasmus Munk Larsen <rlarsen@nvidia.com>
2026-04-08 19:41:07 -07:00
Rasmus Munk Larsen
b1d2ce4c85 Revert "Speed up plog_double ~1.7x with fast integer range reduction"
This reverts merge request !2385
2026-04-08 13:03:48 -07:00
Rasmus Munk Larsen
ab70739c9c Speed up plog_double ~1.7x with fast integer range reduction
libeigen/eigen!2385

Co-authored-by: Rasmus Munk Larsen <rmlarsen@gmail.com>
2026-04-07 21:48:25 -07:00
Rasmus Munk Larsen
e778b5d22b Switch ASAN/UBSAN smoketest pipelines to large runners
libeigen/eigen!2405

Co-authored-by: Rasmus Munk Larsen <rmlarsen@gmail.com>
2026-04-07 21:37:58 -07:00
Rasmus Munk Larsen
def45c5e1e Improve psincos_double: faster polynomials + accurate range reduction
libeigen/eigen!2389

Closes #3052

Co-authored-by: Rasmus Munk Larsen <rmlarsen@gmail.com>
2026-04-07 21:24:24 -07:00
Rasmus Munk Larsen
110530a4d8 Fix bugs and improve robustness of SelfAdjointEigenSolver, improve test coverage
libeigen/eigen!2396

Co-authored-by: Rasmus Munk Larsen <rmlarsen@gmail.com>
2026-04-07 21:08:29 -07:00
Rasmus Munk Larsen
bde3a68bae Improve dense linear solver docs with practical guidance
libeigen/eigen!2395

Co-authored-by: Rasmus Munk Larsen <rmlarsen@gmail.com>
2026-04-05 21:40:42 -07:00
Rasmus Munk Larsen
8eabfb5342 Vectorize BLAS level 1/2 routines with Eigen expressions
libeigen/eigen!2404

Co-authored-by: Rasmus Munk Larsen <rmlarsen@gmail.com>
2026-04-05 18:53:11 -07:00
Rasmus Munk Larsen
4ad90a60f1 Replace blas/f2c with clean C++ implementations
libeigen/eigen!2402

Co-authored-by: Rasmus Munk Larsen <rmlarsen@gmail.com>
2026-04-05 16:04:41 -07:00
73 changed files with 3488 additions and 7536 deletions

View File

@@ -672,7 +672,7 @@ if (EIGEN_BUILD_TESTING)
endif()
set(EIGEN_CUDA_CXX_FLAGS "" CACHE STRING "Additional flags to pass to the cuda compiler.")
set(EIGEN_CUDA_COMPUTE_ARCH 30 CACHE STRING "The CUDA compute architecture(s) to target when compiling CUDA code")
set(EIGEN_CUDA_COMPUTE_ARCH 70 CACHE STRING "The CUDA compute architecture(s) to target when compiling CUDA code")
option(EIGEN_TEST_SYCL "Add Sycl support." OFF)
if(EIGEN_TEST_SYCL)
@@ -817,4 +817,3 @@ endif()
message(STATUS "")
message(STATUS "Configured Eigen ${EIGEN_VERSION_STRING}")
message(STATUS "")

View File

@@ -50,9 +50,9 @@
#include "src/Core/util/AOCL_Support.h"
#if defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)
#define EIGEN_HAS_GPU_FP16
#endif
// EIGEN_HAS_GPU_FP16 is now always true when compiling with CUDA or HIP.
// Use EIGEN_GPUCC (compile-time) or EIGEN_GPU_COMPILE_PHASE (device phase) instead.
// TODO: Remove EIGEN_HAS_GPU_BF16 similarly once HIP bf16 guards are cleaned up.
#if defined(EIGEN_HAS_CUDA_BF16) || defined(EIGEN_HAS_HIP_BF16)
#define EIGEN_HAS_GPU_BF16

View File

@@ -858,16 +858,8 @@ struct hash<Eigen::bfloat16> {
} // namespace std
#endif
// Add the missing shfl* intrinsics.
// The __shfl* functions are only valid on HIP or _CUDA_ARCH_ >= 300.
// CUDA defines them for (__CUDA_ARCH__ >= 300 || !defined(__CUDA_ARCH__))
//
// HIP and CUDA prior to SDK 9.0 define
// __shfl, __shfl_up, __shfl_down, __shfl_xor for int and float
// CUDA since 9.0 deprecates those and instead defines
// __shfl_sync, __shfl_up_sync, __shfl_down_sync, __shfl_xor_sync,
// with native support for __half and __nv_bfloat16
//
// Warp shuffle overloads for Eigen::bfloat16.
// HIP uses non-sync __shfl variants; CUDA has native __nv_bfloat16 support in __shfl_sync.
// Note that the following are __device__ - only functions.
#if defined(EIGEN_HIPCC)

View File

@@ -141,6 +141,140 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog2_float(const Pac
return plog_impl_float<Packet, /* base2 */ true>(_x);
}
// -----------------------------------------------------------------------
// Double logarithm: shared polynomial + two range-reduction backends
// -----------------------------------------------------------------------
// Cephes rational-polynomial approximation of log(1+f) for
// f in [sqrt(0.5)-1, sqrt(2)-1].
// Evaluates x - 0.5*x^2 + x^3 * P(x)/Q(x) where P and Q are degree-5.
// See: http://www.netlib.org/cephes/
template <typename Packet>
EIGEN_STRONG_INLINE Packet plog_mantissa_double(const Packet x) {
const Packet cst_cephes_log_p0 = pset1<Packet>(1.01875663804580931796E-4);
const Packet cst_cephes_log_p1 = pset1<Packet>(4.97494994976747001425E-1);
const Packet cst_cephes_log_p2 = pset1<Packet>(4.70579119878881725854E0);
const Packet cst_cephes_log_p3 = pset1<Packet>(1.44989225341610930846E1);
const Packet cst_cephes_log_p4 = pset1<Packet>(1.79368678507819816313E1);
const Packet cst_cephes_log_p5 = pset1<Packet>(7.70838733755885391666E0);
// Q0 = 1.0; pmadd(1, x, q1) simplifies to padd(x, q1).
const Packet cst_cephes_log_q1 = pset1<Packet>(1.12873587189167450590E1);
const Packet cst_cephes_log_q2 = pset1<Packet>(4.52279145837532221105E1);
const Packet cst_cephes_log_q3 = pset1<Packet>(8.29875266912776603211E1);
const Packet cst_cephes_log_q4 = pset1<Packet>(7.11544750618563894466E1);
const Packet cst_cephes_log_q5 = pset1<Packet>(2.31251620126765340583E1);
Packet x2 = pmul(x, x);
Packet x3 = pmul(x2, x);
// Evaluate P and Q simultaneously for better ILP.
Packet y, y1, y_;
y = pmadd(cst_cephes_log_p0, x, cst_cephes_log_p1);
y1 = pmadd(cst_cephes_log_p3, x, cst_cephes_log_p4);
y = pmadd(y, x, cst_cephes_log_p2);
y1 = pmadd(y1, x, cst_cephes_log_p5);
y_ = pmadd(y, x3, y1);
y = padd(x, cst_cephes_log_q1);
y1 = pmadd(cst_cephes_log_q3, x, cst_cephes_log_q4);
y = pmadd(y, x, cst_cephes_log_q2);
y1 = pmadd(y1, x, cst_cephes_log_q5);
y = pmadd(y, x3, y1);
y_ = pmul(y_, x3);
y = pdiv(y_, y);
y = pnmadd(pset1<Packet>(0.5), x2, y);
return padd(x, y);
}
// Detect whether unpacket_traits<Packet>::integer_packet is defined.
template <typename Packet, typename = void>
struct packet_has_integer_packet : std::false_type {};
template <typename Packet>
struct packet_has_integer_packet<Packet, void_t<typename unpacket_traits<Packet>::integer_packet>> : std::true_type {};
// Dispatch struct for double-precision range reduction.
// Primary template: pfrexp-based fallback (used when integer_packet is absent).
template <typename Packet, bool UseIntegerPacket>
struct plog_range_reduce_double {
EIGEN_STRONG_INLINE static void run(const Packet v, Packet& f, Packet& e) {
const Packet one = pset1<Packet>(1.0);
const Packet cst_cephes_SQRTHF = pset1<Packet>(0.70710678118654752440E0);
// pfrexp: f in [0.5, 1), e = unbiased exponent as double.
f = pfrexp(v, e);
// Shift [0.5,1) -> [sqrt(0.5)-1, sqrt(2)-1] with exponent correction:
// if f < sqrt(0.5): f = f + f - 1, e -= 1 (giving f in [0, sqrt(2)-1))
// else: f = f - 1 (giving f in [sqrt(0.5)-1, 0))
Packet mask = pcmp_lt(f, cst_cephes_SQRTHF);
Packet tmp = pand(f, mask);
f = psub(f, one);
e = psub(e, pand(one, mask));
f = padd(f, tmp);
}
};
// Specialisation: fast integer-bit-manipulation path (musl-inspired).
// Requires unpacket_traits<Packet>::integer_packet to be a 64-bit integer packet.
template <typename Packet>
struct plog_range_reduce_double<Packet, true> {
EIGEN_STRONG_INLINE static void run(const Packet v, Packet& f, Packet& e) {
typedef typename unpacket_traits<Packet>::integer_packet PacketI;
// 2^-1022: smallest positive normal double.
const PacketI cst_min_normal = pset1<PacketI>(static_cast<int64_t>(0x0010000000000000LL));
// Lower 52-bit mask (IEEE mantissa field).
const PacketI cst_mant_mask = pset1<PacketI>(static_cast<int64_t>(0x000FFFFFFFFFFFFFLL));
// Offset = 1.0_bits - sqrt(0.5)_bits. Adding this to the integer
// representation shifts the exponent field so that the [sqrt(0.5), sqrt(2))
// half-octave boundary falls on an exact biased-exponent boundary, letting
// us extract e with a single right shift. The constant is:
// 0x3FF0000000000000 - 0x3FE6A09E667F3BCD = 0x00095F619980C433
const PacketI cst_sqrt_half_offset =
pset1<PacketI>(static_cast<int64_t>(0x3FF0000000000000LL - 0x3FE6A09E667F3BCDLL));
// IEEE double exponent bias (1023).
const PacketI cst_exp_bias = pset1<PacketI>(static_cast<int64_t>(1023));
// sqrt(0.5) IEEE bits — used to reconstruct f from biased mantissa.
const PacketI cst_half_mant = pset1<PacketI>(static_cast<int64_t>(0x3FE6A09E667F3BCDLL));
// Reinterpret v as a 64-bit integer vector.
PacketI vi = preinterpret<PacketI>(v);
// Normalise denormals: multiply by 2^52 and correct the exponent by -52.
PacketI is_denormal = pcmp_lt(vi, cst_min_normal);
// 2^52 via bit pattern: biased exponent = 52 + 1023 = 0x433, mantissa = 0.
Packet v_norm = pmul(v, pset1frombits<Packet>(static_cast<uint64_t>(int64_t(52 + 0x3ff) << 52)));
vi = pselect(is_denormal, preinterpret<PacketI>(v_norm), vi);
PacketI denorm_adj = pand(is_denormal, pset1<PacketI>(static_cast<int64_t>(52)));
// Bias the integer representation so the exponent field directly encodes
// the half-octave index.
PacketI vi_biased = padd(vi, cst_sqrt_half_offset);
// Extract unbiased exponent: shift out mantissa bits, subtract IEEE bias
// and denormal adjustment.
PacketI e_int = psub(psub(plogical_shift_right<52>(vi_biased), cst_exp_bias), denorm_adj);
// Convert integer exponent to floating-point.
e = pcast<PacketI, Packet>(e_int);
// Reconstruct mantissa in [sqrt(0.5), sqrt(2)) via integer arithmetic.
// The integer addition of the masked mantissa bits and the sqrt(0.5) bit
// pattern carries into the exponent field, yielding a value in that range.
// Then subtract 1 to centre on 0: f in [sqrt(0.5)-1, sqrt(2)-1].
f = psub(preinterpret<Packet>(padd(pand(vi_biased, cst_mant_mask), cst_half_mant)), pset1<Packet>(1.0));
}
};
// Core range reduction and polynomial for double logarithm.
// Input: v > 0 (zero / negative / inf / nan are handled by the caller).
// Output: log_mantissa ≈ log(mantissa of v in [sqrt(0.5), sqrt(2))),
// e = unbiased exponent of v as a double.
// Selects the fast integer path when integer_packet is available, otherwise
// falls back to pfrexp.
template <typename Packet>
EIGEN_STRONG_INLINE void plog_core_double(const Packet v, Packet& log_mantissa, Packet& e) {
Packet f;
plog_range_reduce_double<Packet, packet_has_integer_packet<Packet>::value>::run(v, f, e);
log_mantissa = plog_mantissa_double(f);
}
/* Returns the base e (2.718...) or base 2 logarithm of x.
* The argument is separated into its exponent and fractional parts.
* The logarithm of the fraction in the interval [sqrt(1/2), sqrt(2)],
@@ -152,87 +286,29 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog2_float(const Pac
*/
template <typename Packet, bool base2>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog_impl_double(const Packet _x) {
Packet x = _x;
const Packet cst_1 = pset1<Packet>(1.0);
const Packet cst_neg_half = pset1<Packet>(-0.5);
const Packet cst_minus_inf = pset1frombits<Packet>(static_cast<uint64_t>(0xfff0000000000000ull));
const Packet cst_pos_inf = pset1frombits<Packet>(static_cast<uint64_t>(0x7ff0000000000000ull));
// Polynomial Coefficients for log(1+x) = x - x**2/2 + x**3 P(x)/Q(x)
// 1/sqrt(2) <= x < sqrt(2)
const Packet cst_cephes_SQRTHF = pset1<Packet>(0.70710678118654752440E0);
const Packet cst_cephes_log_p0 = pset1<Packet>(1.01875663804580931796E-4);
const Packet cst_cephes_log_p1 = pset1<Packet>(4.97494994976747001425E-1);
const Packet cst_cephes_log_p2 = pset1<Packet>(4.70579119878881725854E0);
const Packet cst_cephes_log_p3 = pset1<Packet>(1.44989225341610930846E1);
const Packet cst_cephes_log_p4 = pset1<Packet>(1.79368678507819816313E1);
const Packet cst_cephes_log_p5 = pset1<Packet>(7.70838733755885391666E0);
Packet log_mantissa, e;
plog_core_double(_x, log_mantissa, e);
const Packet cst_cephes_log_q0 = pset1<Packet>(1.0);
const Packet cst_cephes_log_q1 = pset1<Packet>(1.12873587189167450590E1);
const Packet cst_cephes_log_q2 = pset1<Packet>(4.52279145837532221105E1);
const Packet cst_cephes_log_q3 = pset1<Packet>(8.29875266912776603211E1);
const Packet cst_cephes_log_q4 = pset1<Packet>(7.11544750618563894466E1);
const Packet cst_cephes_log_q5 = pset1<Packet>(2.31251620126765340583E1);
Packet e;
// extract significant in the range [0.5,1) and exponent
x = pfrexp(x, e);
// Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2))
// and shift by -1. The values are then centered around 0, which improves
// the stability of the polynomial evaluation.
// if( x < SQRTHF ) {
// e -= 1;
// x = x + x - 1.0;
// } else { x = x - 1.0; }
Packet mask = pcmp_lt(x, cst_cephes_SQRTHF);
Packet tmp = pand(x, mask);
x = psub(x, cst_1);
e = psub(e, pand(cst_1, mask));
x = padd(x, tmp);
Packet x2 = pmul(x, x);
Packet x3 = pmul(x2, x);
// Evaluate the polynomial in factored form for better instruction-level parallelism.
// y = x - 0.5*x^2 + x^3 * polevl( x, P, 5 ) / p1evl( x, Q, 5 ) );
Packet y, y1, y_;
y = pmadd(cst_cephes_log_p0, x, cst_cephes_log_p1);
y1 = pmadd(cst_cephes_log_p3, x, cst_cephes_log_p4);
y = pmadd(y, x, cst_cephes_log_p2);
y1 = pmadd(y1, x, cst_cephes_log_p5);
y_ = pmadd(y, x3, y1);
y = pmadd(cst_cephes_log_q0, x, cst_cephes_log_q1);
y1 = pmadd(cst_cephes_log_q3, x, cst_cephes_log_q4);
y = pmadd(y, x, cst_cephes_log_q2);
y1 = pmadd(y1, x, cst_cephes_log_q5);
y = pmadd(y, x3, y1);
y_ = pmul(y_, x3);
y = pdiv(y_, y);
y = pmadd(cst_neg_half, x2, y);
x = padd(x, y);
// Add the logarithm of the exponent back to the result of the interpolation.
// Combine: log(x) = e * ln2 + log(mantissa), or log2(x) = log(mantissa)*log2e + e.
Packet x;
if (base2) {
const Packet cst_log2e = pset1<Packet>(static_cast<double>(EIGEN_LOG2E));
x = pmadd(x, cst_log2e, e);
x = pmadd(log_mantissa, cst_log2e, e);
} else {
const Packet cst_ln2 = pset1<Packet>(static_cast<double>(EIGEN_LN2));
x = pmadd(e, cst_ln2, x);
x = pmadd(e, cst_ln2, log_mantissa);
}
Packet invalid_mask = pcmp_lt_or_nan(_x, pzero(_x));
Packet iszero_mask = pcmp_eq(_x, pzero(_x));
Packet pos_inf_mask = pcmp_eq(_x, cst_pos_inf);
// Filter out invalid inputs, i.e.:
// - negative arg will be NAN
// - 0 will be -INF
// - +INF will be +INF
// Filter out invalid inputs:
// - negative arg NAN
// - 0 -INF
// - +INF +INF
return pselect(iszero_mask, cst_minus_inf, por(pselect(pos_inf_mask, cst_pos_inf, x), invalid_mask));
}
@@ -286,8 +362,11 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_log1p_float(c
return result;
}
/** \internal \returns log(1 + x) for double precision float.
Same direct approach as the float version.
/** \internal \returns log(1 + x) for double precision.
Computes log(1+x) using plog_core_double for the core range reduction and
polynomial evaluation. The rounding error from forming u = fl(1+x) is
recovered as dx = x - (u - 1) and folded in as a first-order correction
dx/u after the polynomial evaluation.
*/
template <typename Packet>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_log1p_double(const Packet& x) {
@@ -295,67 +374,31 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_log1p_double(
const Packet cst_minus_inf = pset1frombits<Packet>(static_cast<uint64_t>(0xfff0000000000000ull));
const Packet cst_pos_inf = pset1frombits<Packet>(static_cast<uint64_t>(0x7ff0000000000000ull));
// u = 1 + x, with rounding. Recover the lost low bits: dx = x - (u - 1).
Packet u = padd(one, x);
Packet dx = psub(x, psub(u, one));
// For |x| tiny enough that u rounds to 1, return x directly.
Packet small_mask = pcmp_eq(u, one);
// For u = +inf (x very large), return +inf.
Packet inf_mask = pcmp_eq(u, cst_pos_inf);
const Packet cst_cephes_SQRTHF = pset1<Packet>(0.70710678118654752440E0);
Packet e;
Packet m = pfrexp(u, e);
Packet mask = pcmp_lt(m, cst_cephes_SQRTHF);
Packet tmp = pand(m, mask);
m = psub(m, one);
e = psub(e, pand(one, mask));
m = padd(m, tmp);
// Core range reduction and polynomial on u.
Packet log_u, e;
plog_core_double(u, log_u, e);
// Same polynomial as plog_double.
const Packet cst_neg_half = pset1<Packet>(-0.5);
const Packet cst_cephes_log_p0 = pset1<Packet>(1.01875663804580931796E-4);
const Packet cst_cephes_log_p1 = pset1<Packet>(4.97494994976747001425E-1);
const Packet cst_cephes_log_p2 = pset1<Packet>(4.70579119878881725854E0);
const Packet cst_cephes_log_p3 = pset1<Packet>(1.44989225341610930846E1);
const Packet cst_cephes_log_p4 = pset1<Packet>(1.79368678507819816313E1);
const Packet cst_cephes_log_p5 = pset1<Packet>(7.70838733755885391666E0);
const Packet cst_cephes_log_q0 = pset1<Packet>(1.0);
const Packet cst_cephes_log_q1 = pset1<Packet>(1.12873587189167450590E1);
const Packet cst_cephes_log_q2 = pset1<Packet>(4.52279145837532221105E1);
const Packet cst_cephes_log_q3 = pset1<Packet>(8.29875266912776603211E1);
const Packet cst_cephes_log_q4 = pset1<Packet>(7.11544750618563894466E1);
const Packet cst_cephes_log_q5 = pset1<Packet>(2.31251620126765340583E1);
Packet m2 = pmul(m, m);
Packet m3 = pmul(m2, m);
Packet y, y1, y_;
y = pmadd(cst_cephes_log_p0, m, cst_cephes_log_p1);
y1 = pmadd(cst_cephes_log_p3, m, cst_cephes_log_p4);
y = pmadd(y, m, cst_cephes_log_p2);
y1 = pmadd(y1, m, cst_cephes_log_p5);
y_ = pmadd(y, m3, y1);
y = pmadd(cst_cephes_log_q0, m, cst_cephes_log_q1);
y1 = pmadd(cst_cephes_log_q3, m, cst_cephes_log_q4);
y = pmadd(y, m, cst_cephes_log_q2);
y1 = pmadd(y1, m, cst_cephes_log_q5);
y = pmadd(y, m3, y1);
y_ = pmul(y_, m3);
Packet log_m = pdiv(y_, y);
log_m = pmadd(cst_neg_half, m2, log_m);
log_m = padd(m, log_m);
// result = e * ln2 + log(m) + dx/u.
// result = e * ln2 + log(u) + dx/u.
// The dx/u term corrects for the rounding error in u = fl(1+x).
const Packet cst_ln2 = pset1<Packet>(static_cast<double>(EIGEN_LN2));
Packet result = pmadd(e, cst_ln2, padd(log_m, pdiv(dx, u)));
Packet result = pmadd(e, cst_ln2, padd(log_u, pdiv(dx, u)));
// Handle special cases.
Packet neg_mask = pcmp_lt(u, pzero(u));
Packet zero_mask = pcmp_eq(x, pset1<Packet>(-1.0));
result = pselect(small_mask, x, result);
result = pselect(inf_mask, cst_pos_inf, result);
result = pselect(zero_mask, cst_minus_inf, result);
result = por(neg_mask, result);
result = por(neg_mask, result); // NaN for x < -1
return result;
}

View File

@@ -230,40 +230,31 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet ptan_float(const Pack
return psincos_float<TrigFunction::Tan>(x);
}
// Trigonometric argument reduction for double for inputs smaller than 15.
// Reduces trigonometric arguments for double inputs where x < 15. Given an argument x and its corresponding quadrant
// count n, the function computes and returns the reduced argument t such that x = n * pi/2 + t.
// Pi/2 split into 3 double-precision parts (triple-double).
// c1 + c2 + c3 = pi/2 to ~159 bits. Computed by Sollya.
// c1 = RD(pi/2), c2 = RD(pi/2 - c1), c3 = RD(pi/2 - c1 - c2).
template <typename Packet>
Packet trig_reduce_small_double(const Packet& x, const Packet& q) {
// Pi/2 split into 2 values
const Packet cst_pio2_a = pset1<Packet>(-1.570796325802803);
const Packet cst_pio2_b = pset1<Packet>(-9.920935184482005e-10);
Packet t;
t = pmadd(cst_pio2_a, q, x);
t = pmadd(cst_pio2_b, q, t);
return t;
Packet cst_pio2_1() {
return pset1<Packet>(-1.5707963267948965579989817342720925807952880859375); // -0x1.921fb54442d18p0
}
template <typename Packet>
Packet cst_pio2_2() {
return pset1<Packet>(-6.12323399573676603586882014729198302312846062338790e-17); // -0x1.1a62633145c07p-54
}
template <typename Packet>
Packet cst_pio2_3() {
return pset1<Packet>(1.4973849048591698329435081771059920083527504761695190e-33); // 0x1.f1976b7ed8fbcp-110
}
// Trigonometric argument reduction for double for inputs smaller than 1e14.
// Reduces trigonometric arguments for double inputs where x < 1e14. Given an argument x and its corresponding quadrant
// count n, the function computes and returns the reduced argument t such that x = n * pi/2 + t.
// Trigonometric argument reduction for double, small inputs (|x| < small_th).
// Reduces x to t such that x = q * pi/2 + t, where |t| <= pi/4.
// Uses a triple-double split of pi/2 with FMA for high accuracy.
template <typename Packet>
Packet trig_reduce_medium_double(const Packet& x, const Packet& q_high, const Packet& q_low) {
// Pi/2 split into 4 values
const Packet cst_pio2_a = pset1<Packet>(-1.570796325802803);
const Packet cst_pio2_b = pset1<Packet>(-9.920935184482005e-10);
const Packet cst_pio2_c = pset1<Packet>(-6.123234014771656e-17);
const Packet cst_pio2_d = pset1<Packet>(1.903488962019325e-25);
Packet trig_reduce_small_double(const Packet& x, const Packet& q) {
Packet t;
t = pmadd(cst_pio2_a, q_high, x);
t = pmadd(cst_pio2_a, q_low, t);
t = pmadd(cst_pio2_b, q_high, t);
t = pmadd(cst_pio2_b, q_low, t);
t = pmadd(cst_pio2_c, q_high, t);
t = pmadd(cst_pio2_c, q_low, t);
t = pmadd(cst_pio2_d, padd(q_low, q_high), t);
t = pmadd(cst_pio2_1<Packet>(), q, x);
t = pmadd(cst_pio2_2<Packet>(), q, t);
t = pmadd(cst_pio2_3<Packet>(), q, t);
return t;
}
@@ -284,11 +275,13 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
// If the argument is bigger than this value, use the non-vectorized std version
const double huge_th = 1e14;
const Packet cst_2oPI = pset1<Packet>(0.63661977236758134307553505349006); // 2/PI
// 2/PI as a double-word: hi + lo = 2/pi to ~107 bits. Computed by Sollya.
const Packet cst_2oPI_hi =
pset1<Packet>(0.63661977236758138243288840385503135621547698974609375); // 0x1.45f306dc9c883p-1
const Packet cst_2oPI_lo =
pset1<Packet>(-3.9357353350364971763790381828183628368294820823718866e-17); // -0x1.6b01ec5417056p-55
// Integer Packet constants
const PacketI cst_one = pset1<PacketI>(ScalarI(1));
// Constant for splitting
const Packet cst_split = pset1<Packet>(1 << 24);
Packet x_abs = pabs(x);
@@ -298,76 +291,56 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
// TODO Implement huge angle argument reduction
if (EIGEN_PREDICT_FALSE(predux_any(pcmp_le(pset1<Packet>(small_th), x_abs)))) {
Packet q_high = pmul(pfloor(pmul(x_abs, pdiv(cst_2oPI, cst_split))), cst_split);
Packet q_low_noround = psub(pmul(x_abs, cst_2oPI), q_high);
q_int = pcast<Packet, PacketI>(padd(q_low_noround, pset1<Packet>(0.5)));
Packet q_low = pcast<PacketI, Packet>(q_int);
s = trig_reduce_medium_double(x_abs, q_high, q_low);
// Medium path: use double-word product x * (2/pi) for precise quadrant computation.
Packet prod_hi, prod_lo;
twoprod(x_abs, cst_2oPI_hi, prod_hi, prod_lo);
// Correction for 2/pi truncation: add x * lo(2/pi)
prod_lo = pmadd(x_abs, cst_2oPI_lo, prod_lo);
// Round the double-word (prod_hi, prod_lo) to the nearest integer.
Packet q = pround(prod_hi);
// Compute exact fractional part to check if rounding was correct.
Packet frac = padd(psub(prod_hi, q), prod_lo);
// Correct if fractional part crossed +-0.5 boundary.
q = padd(q, pand(pcmp_lt(pset1<Packet>(0.5), frac), pset1<Packet>(1.0)));
q = padd(q, pand(pcmp_lt(frac, pset1<Packet>(-0.5)), pset1<Packet>(-1.0)));
q_int = pcast<Packet, PacketI>(q);
s = trig_reduce_small_double(x_abs, q);
} else {
Packet qval_noround = pmul(x_abs, cst_2oPI);
// Small path: simple reduction with triple-double pi/2 split.
Packet qval_noround = pmul(x_abs, cst_2oPI_hi);
q_int = pcast<Packet, PacketI>(padd(qval_noround, pset1<Packet>(0.5)));
Packet q = pcast<PacketI, Packet>(q_int);
s = trig_reduce_small_double(x_abs, q);
}
// All the upcoming approximating polynomials have even exponents
Packet ss = pmul(s, s);
// Padé approximant of cos(x)
// Assuring < 1 ULP error on the interval [-pi/4, pi/4]
// cos(x) ~= (80737373*x^8 - 13853547000*x^6 + 727718024880*x^4 - 11275015752000*x^2 + 23594700729600)/(147173*x^8 +
// 39328920*x^6 + 5772800880*x^4 + 522334612800*x^2 + 23594700729600)
// MATLAB code to compute those coefficients:
// syms x;
// cosf = @(x) cos(x);
// pade_cosf = pade(cosf(x), x, 0, 'Order', 8)
const Packet cn4 = pset1<Packet>(80737373);
const Packet cn3 = pset1<Packet>(-13853547000);
const Packet cn2 = pset1<Packet>(727718024880);
const Packet cn1 = pset1<Packet>(-11275015752000);
const Packet cn0 = pset1<Packet>(23594700729600); // shared with cd0
const Packet cd3 = pset1<Packet>(147173);
const Packet cd2 = pset1<Packet>(39328920);
const Packet cd1 = pset1<Packet>(5772800880);
const Packet cd0 = pset1<Packet>(522334612800);
Packet sc1_num = pmadd(ss, cn4, cn3);
Packet sc2_num = pmadd(sc1_num, ss, cn2);
Packet sc3_num = pmadd(sc2_num, ss, cn1);
Packet sc4_num = pmadd(sc3_num, ss, cn0);
Packet sc1_denum = pmadd(ss, cd3, cd2);
Packet sc2_denum = pmadd(sc1_denum, ss, cd1);
Packet sc3_denum = pmadd(sc2_denum, ss, cd0);
Packet sc4_denum = pmadd(sc3_denum, ss, cn0);
Packet scos = pdiv(sc4_num, sc4_denum);
// Minimax polynomial approximation of cos(x) on [-pi/4, pi/4].
// cos(x) = 1 + u * P(u), where u = x^2 and P is degree 6 (7 FMAs total).
// Coefficients computed by Sollya fpminimax. Max polynomial error ~1.3e-19.
Packet scos = pset1<Packet>(-1.1368926065317776472832699312119132152576472805094454088248312473297119140625e-11);
scos = pmadd(scos, ss, pset1<Packet>(2.0875905481768720039634091158002593413556269297259859740734100341796875e-09));
scos = pmadd(scos, ss, pset1<Packet>(-2.7557315712466412785356544880299711763882442028261721134185791015625e-07));
scos = pmadd(scos, ss, pset1<Packet>(2.480158729424286522739599714082459058772656135261058807373046875e-05));
scos = pmadd(scos, ss, pset1<Packet>(-1.388888888888178789471350427220386336557567119598388671875e-03));
scos = pmadd(scos, ss, pset1<Packet>(4.166666666666664353702032030923874117434024810791015625e-02));
scos = pmadd(scos, ss, pset1<Packet>(-0.5));
scos = pmadd(scos, ss, pset1<Packet>(1.0));
// Padé approximant of sin(x)
// Assuring < 1 ULP error on the interval [-pi/4, pi/4]
// sin(x) ~= (x*(4585922449*x^8 - 1066023933480*x^6 + 83284044283440*x^4 - 2303682236856000*x^2 +
// 15605159573203200))/(45*(1029037*x^8 + 345207016*x^6 + 61570292784*x^4 + 6603948711360*x^2 + 346781323848960))
// MATLAB code to compute those coefficients:
// syms x;
// sinf = @(x) sin(x);
// pade_sinf = pade(sinf(x), x, 0, 'Order', 8, 'OrderMode', 'relative')
const Packet sn4 = pset1<Packet>(4585922449);
const Packet sn3 = pset1<Packet>(-1066023933480);
const Packet sn2 = pset1<Packet>(83284044283440);
const Packet sn1 = pset1<Packet>(-2303682236856000);
const Packet sn0 = pset1<Packet>(15605159573203200);
const Packet sd3 = pset1<Packet>(1029037);
const Packet sd2 = pset1<Packet>(345207016);
const Packet sd1 = pset1<Packet>(61570292784);
const Packet sd0_inner = pset1<Packet>(6603948711360);
const Packet sd0 = pset1<Packet>(346781323848960);
const Packet cst_45 = pset1<Packet>(45);
Packet ss1_num = pmadd(ss, sn4, sn3);
Packet ss2_num = pmadd(ss1_num, ss, sn2);
Packet ss3_num = pmadd(ss2_num, ss, sn1);
Packet ss4_num = pmadd(ss3_num, ss, sn0);
Packet ss1_denum = pmadd(ss, sd3, sd2);
Packet ss2_denum = pmadd(ss1_denum, ss, sd1);
Packet ss3_denum = pmadd(ss2_denum, ss, sd0_inner);
Packet ss4_denum = pmadd(ss3_denum, ss, sd0);
Packet ssin = pdiv(pmul(s, ss4_num), pmul(cst_45, ss4_denum));
// Minimax polynomial approximation of sin(x) on [-pi/4, pi/4].
// sin(x) = x * (1 + u * R(u)), where u = x^2 and R is degree 5.
// Computed as: x + x * u * R(u) (6 FMAs + 1 mul).
// Coefficients computed by Sollya fpminimax. Max polynomial error ~1.0e-17.
Packet ssin = pset1<Packet>(1.59193066075142890698150587293845624470289834562208852730691432952880859375e-10);
ssin = pmadd(ssin, ss, pset1<Packet>(-2.50511517945670206974594627392927126408039839589037001132965087890625e-08));
ssin = pmadd(ssin, ss, pset1<Packet>(2.755731622544328228235042954619160582296899519860744476318359375e-06));
ssin = pmadd(ssin, ss, pset1<Packet>(-1.9841269837089632013978068858506276228581555187702178955078125e-04));
ssin = pmadd(ssin, ss, pset1<Packet>(8.333333333331312264835588621281203813850879669189453125e-03));
ssin = pmadd(ssin, ss, pset1<Packet>(-0.1666666666666666574148081281236954964697360992431640625));
ssin = pmul(ssin, ss);
ssin = pmadd(ssin, s, s);
Packet poly_mask = preinterpret<Packet>(pcmp_eq(pand(q_int, cst_one), pzero(q_int)));

View File

@@ -45,7 +45,7 @@
// Eigen with GPU support.
// Any functions that require `numext::bit_cast` may also not be constexpr,
// including any native types when setting via raw bit values.
#if defined(EIGEN_HAS_GPU_FP16) || defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) || defined(EIGEN_HAS_BUILTIN_FLOAT16)
#if defined(EIGEN_GPUCC) || defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) || defined(EIGEN_HAS_BUILTIN_FLOAT16)
#define _EIGEN_MAYBE_CONSTEXPR
#else
#define _EIGEN_MAYBE_CONSTEXPR constexpr
@@ -121,12 +121,12 @@ namespace half_impl {
//
// Making the host side compile phase of hipcc use the same Eigen::half impl, as the gcc compile, resolves
// this error, and hence the following convoluted #if condition
#if !defined(EIGEN_HAS_GPU_FP16) || !defined(EIGEN_GPU_COMPILE_PHASE)
#if !defined(EIGEN_GPUCC) || !defined(EIGEN_GPU_COMPILE_PHASE)
// Make our own __half_raw definition that is similar to CUDA's.
struct __half_raw {
struct construct_from_rep_tag {};
#if (defined(EIGEN_HAS_GPU_FP16) && !defined(EIGEN_GPU_COMPILE_PHASE))
#if (defined(EIGEN_GPUCC) && !defined(EIGEN_GPU_COMPILE_PHASE))
// Eigen::half can be used as the datatype for shared memory declarations (in Eigen and TF)
// The element type for shared memory cannot have non-trivial constructors
// and hence the following special casing (which skips the zero-initilization).
@@ -152,16 +152,12 @@ struct __half_raw {
#endif
};
#elif defined(EIGEN_HAS_HIP_FP16)
#elif defined(EIGEN_HIPCC)
// HIP GPU compile phase: nothing to do here.
// HIP fp16 header file has a definition for __half_raw
#elif defined(EIGEN_HAS_CUDA_FP16)
#elif defined(EIGEN_CUDACC)
// CUDA GPU compile phase.
#if EIGEN_CUDA_SDK_VER < 90000
// In CUDA < 9.0, __half is the equivalent of CUDA 9's __half_raw
typedef __half __half_raw;
#endif // defined(EIGEN_HAS_CUDA_FP16)
#elif defined(SYCL_DEVICE_ONLY)
typedef cl::sycl::half __half_raw;
@@ -175,15 +171,13 @@ struct half_base : public __half_raw {
EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half_base() {}
EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half_base(const __half_raw& h) : __half_raw(h) {}
#if defined(EIGEN_HAS_GPU_FP16)
#if defined(EIGEN_HAS_HIP_FP16)
#if defined(EIGEN_GPUCC)
#if defined(EIGEN_HIPCC)
EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half_base(const __half& h) { x = __half_as_ushort(h); }
#elif defined(EIGEN_HAS_CUDA_FP16)
#if EIGEN_CUDA_SDK_VER >= 90000
#elif defined(EIGEN_CUDACC)
EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half_base(const __half& h) : __half_raw(*(__half_raw*)&h) {}
#endif
#endif
#endif
};
} // namespace half_impl
@@ -192,36 +186,29 @@ struct half_base : public __half_raw {
struct half : public half_impl::half_base {
// Writing this out as separate #if-else blocks to make the code easier to follow
// The same applies to most #if-else blocks in this file
#if !defined(EIGEN_HAS_GPU_FP16) || !defined(EIGEN_GPU_COMPILE_PHASE)
#if !defined(EIGEN_GPUCC) || !defined(EIGEN_GPU_COMPILE_PHASE)
// Use the same base class for the following two scenarios
// * when compiling without GPU support enabled
// * during host compile phase when compiling with GPU support enabled
typedef half_impl::__half_raw __half_raw;
#elif defined(EIGEN_HAS_HIP_FP16)
#elif defined(EIGEN_HIPCC)
// Nothing to do here
// HIP fp16 header file has a definition for __half_raw
#elif defined(EIGEN_HAS_CUDA_FP16)
// Note that EIGEN_CUDA_SDK_VER is set to 0 even when compiling with HIP, so
// (EIGEN_CUDA_SDK_VER < 90000) is true even for HIP! So keeping this within
// #if defined(EIGEN_HAS_CUDA_FP16) is needed
#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
typedef half_impl::__half_raw __half_raw;
#endif
#elif defined(EIGEN_CUDACC)
// Nothing to do here.
#endif
EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half() {}
EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half(const __half_raw& h) : half_impl::half_base(h) {}
#if defined(EIGEN_HAS_GPU_FP16)
#if defined(EIGEN_HAS_HIP_FP16)
#if defined(EIGEN_GPUCC)
#if defined(EIGEN_HIPCC)
EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half(const __half& h) : half_impl::half_base(h) {}
#elif defined(EIGEN_HAS_CUDA_FP16)
#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000
#elif defined(EIGEN_CUDACC)
EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half(const __half& h) : half_impl::half_base(h) {}
#endif
#endif
#endif
#if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
explicit EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half(__fp16 b)
@@ -248,7 +235,7 @@ struct half : public half_impl::half_base {
return half_impl::half_to_float(*this);
}
#if defined(EIGEN_HAS_GPU_FP16) && !defined(EIGEN_GPU_COMPILE_PHASE)
#if defined(EIGEN_GPUCC) && !defined(EIGEN_GPU_COMPILE_PHASE)
EIGEN_DEVICE_FUNC operator __half() const {
::__half_raw hr;
hr.x = x;
@@ -380,8 +367,7 @@ namespace Eigen {
namespace half_impl {
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
(defined(EIGEN_HAS_HIP_FP16) && defined(HIP_DEVICE_COMPILE))
#if defined(EIGEN_GPU_COMPILE_PHASE)
// Note: We deliberately do *not* define this to 1 even if we have Arm's native
// fp16 type since GPU half types are rather different from native CPU half types.
#define EIGEN_HAS_NATIVE_GPU_FP16
@@ -393,24 +379,10 @@ namespace half_impl {
// conversion steps back and forth.
#if defined(EIGEN_HAS_NATIVE_GPU_FP16)
EIGEN_STRONG_INLINE __device__ half operator+(const half& a, const half& b) {
#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000
return __hadd(::__half(a), ::__half(b));
#else
return __hadd(a, b);
#endif
}
EIGEN_STRONG_INLINE __device__ half operator+(const half& a, const half& b) { return __hadd(::__half(a), ::__half(b)); }
EIGEN_STRONG_INLINE __device__ half operator*(const half& a, const half& b) { return __hmul(a, b); }
EIGEN_STRONG_INLINE __device__ half operator-(const half& a, const half& b) { return __hsub(a, b); }
EIGEN_STRONG_INLINE __device__ half operator/(const half& a, const half& b) {
#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000
return __hdiv(a, b);
#else
float num = __half2float(a);
float denom = __half2float(b);
return __float2half(num / denom);
#endif
}
EIGEN_STRONG_INLINE __device__ half operator/(const half& a, const half& b) { return __hdiv(a, b); }
EIGEN_STRONG_INLINE __device__ half operator-(const half& a) { return __hneg(a); }
EIGEN_STRONG_INLINE __device__ half& operator+=(half& a, const half& b) {
a = a + b;
@@ -505,7 +477,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator>=(const half& a, const half&
// We need to provide emulated *host-side* FP16 operators for clang.
#pragma push_macro("EIGEN_DEVICE_FUNC")
#undef EIGEN_DEVICE_FUNC
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_HAS_NATIVE_GPU_FP16)
#if defined(EIGEN_CUDACC) && defined(EIGEN_HAS_NATIVE_GPU_FP16)
#define EIGEN_DEVICE_FUNC __host__
#else // both host and device need emulated ops.
#define EIGEN_DEVICE_FUNC __host__ __device__
@@ -636,7 +608,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR __half_raw raw_uint
// because this is constexpr function.
// Fortunately, since we need to disable EIGEN_CONSTEXPR for GPU anyway, we can get out
// of this catch22 by having separate bodies for GPU / non GPU
#if defined(EIGEN_HAS_GPU_FP16)
#if defined(EIGEN_GPUCC)
__half_raw h;
h.x = x;
return h;
@@ -661,8 +633,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC numext::uint16_t raw_half_as_uint16(const
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) {
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
#if defined(EIGEN_GPU_COMPILE_PHASE)
__half tmp_ff = __float2half(ff);
return *(__half_raw*)&tmp_ff;
@@ -735,8 +706,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) {
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h) {
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
#if defined(EIGEN_GPU_COMPILE_PHASE)
return __half2float(h);
#elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) || defined(EIGEN_HAS_BUILTIN_FLOAT16)
return static_cast<float>(h.x);
@@ -778,8 +748,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool(isinf)(const half& a) {
#endif
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool(isnan)(const half& a) {
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
#if defined(EIGEN_GPU_COMPILE_PHASE)
return __hisnan(a);
#elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) || defined(EIGEN_HAS_BUILTIN_FLOAT16)
return (numext::bit_cast<numext::uint16_t>(a.x) & 0x7fff) > 0x7c00;
@@ -810,16 +779,14 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half abs(const half& a) {
#endif
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp(const half& a) {
#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
defined(EIGEN_HIP_DEVICE_COMPILE)
#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE)
return half(hexp(a));
#else
return half(::expf(float(a)));
#endif
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp2(const half& a) {
#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
defined(EIGEN_HIP_DEVICE_COMPILE)
#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE)
return half(hexp2(a));
#else
return half(::exp2f(float(a)));
@@ -827,9 +794,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp2(const half& a) {
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half expm1(const half& a) { return half(numext::expm1(float(a))); }
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log(const half& a) {
#if (defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDA_SDK_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && \
EIGEN_CUDA_ARCH >= 530) || \
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
#if defined(EIGEN_GPU_COMPILE_PHASE)
return half(hlog(a));
#else
return half(::logf(float(a)));
@@ -842,8 +807,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log2(const half& a) {
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sqrt(const half& a) {
#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
defined(EIGEN_HIP_DEVICE_COMPILE)
#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE)
return half(hsqrt(a));
#else
return half(::sqrtf(float(a)));
@@ -864,16 +828,14 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half acos(const half& a) { return half(::a
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half atan(const half& a) { return half(::atanf(float(a))); }
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half atanh(const half& a) { return half(::atanhf(float(a))); }
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) {
#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \
defined(EIGEN_HIP_DEVICE_COMPILE)
#if (defined(EIGEN_CUDA_ARCH)) || defined(EIGEN_HIP_DEVICE_COMPILE)
return half(hfloor(a));
#else
return half(::floorf(float(a)));
#endif
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) {
#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \
defined(EIGEN_HIP_DEVICE_COMPILE)
#if (defined(EIGEN_CUDA_ARCH)) || defined(EIGEN_HIP_DEVICE_COMPILE)
return half(hceil(a));
#else
return half(::ceilf(float(a)));
@@ -1007,20 +969,12 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half madd<Eigen::half>(const Eigen:
} // namespace numext
} // namespace Eigen
// Add the missing shfl* intrinsics.
// The __shfl* functions are only valid on HIP or _CUDA_ARCH_ >= 300.
// CUDA defines them for (__CUDA_ARCH__ >= 300 || !defined(__CUDA_ARCH__))
//
// HIP and CUDA prior to SDK 9.0 define
// __shfl, __shfl_up, __shfl_down, __shfl_xor for int and float
// CUDA since 9.0 deprecates those and instead defines
// __shfl_sync, __shfl_up_sync, __shfl_down_sync, __shfl_xor_sync,
// with native support for __half and __nv_bfloat16
//
// Warp shuffle overloads for Eigen::half.
// CUDA uses __shfl_*_sync (with mask); HIP uses __shfl_* (no mask).
// Note that the following are __device__ - only functions.
#if (defined(EIGEN_CUDACC) && (!defined(EIGEN_CUDA_ARCH) || EIGEN_CUDA_ARCH >= 300)) || defined(EIGEN_HIPCC)
#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
#if defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDA_SDK_VER >= 90000
#if defined(EIGEN_CUDACC)
__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_sync(unsigned mask, Eigen::half var, int srcLane,
int width = warpSize) {
@@ -1046,7 +1000,7 @@ __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor_sync(unsigned mask, Eigen:
return static_cast<Eigen::half>(__shfl_xor_sync(mask, h, laneMask, width));
}
#else // HIP or CUDA SDK < 9.0
#else // HIP
__device__ EIGEN_STRONG_INLINE Eigen::half __shfl(Eigen::half var, int srcLane, int width = warpSize) {
const int ivar = static_cast<int>(Eigen::numext::bit_cast<Eigen::numext::uint16_t>(var));
@@ -1072,7 +1026,7 @@ __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneM
#endif // __shfl*
// ldg() has an overload for __half_raw, but we also need one for Eigen::half.
#if (defined(EIGEN_CUDACC) && (!defined(EIGEN_CUDA_ARCH) || EIGEN_CUDA_ARCH >= 350)) || defined(EIGEN_HIPCC)
#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
EIGEN_STRONG_INLINE __device__ Eigen::half __ldg(const Eigen::half* ptr) {
return Eigen::half_impl::raw_uint16_to_half(__ldg(reinterpret_cast<const Eigen::numext::uint16_t*>(ptr)));
}
@@ -1095,8 +1049,7 @@ namespace internal {
template <>
struct cast_impl<float, half> {
EIGEN_DEVICE_FUNC static inline half run(const float& a) {
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
#if defined(EIGEN_GPU_COMPILE_PHASE)
return __float2half(a);
#else
return half(a);
@@ -1107,8 +1060,7 @@ struct cast_impl<float, half> {
template <>
struct cast_impl<int, half> {
EIGEN_DEVICE_FUNC static inline half run(const int& a) {
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
#if defined(EIGEN_GPU_COMPILE_PHASE)
return __float2half(static_cast<float>(a));
#else
return half(static_cast<float>(a));
@@ -1119,8 +1071,7 @@ struct cast_impl<int, half> {
template <>
struct cast_impl<half, float> {
EIGEN_DEVICE_FUNC static inline float run(const half& a) {
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
#if defined(EIGEN_GPU_COMPILE_PHASE)
return __half2float(a);
#else
return static_cast<float>(a);

View File

@@ -17,19 +17,8 @@ namespace Eigen {
namespace internal {
// Read-only data cached load available.
#if defined(EIGEN_HIP_DEVICE_COMPILE) || (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350)
#define EIGEN_GPU_HAS_LDG 1
#endif
// FP16 math available.
#if (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530)
#define EIGEN_CUDA_HAS_FP16_ARITHMETIC 1
#endif
#if defined(EIGEN_HIP_DEVICE_COMPILE) || defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
#define EIGEN_GPU_HAS_FP16_ARITHMETIC 1
#endif
// Read-only data cached load (__ldg) and native FP16 arithmetic are available
// on all supported GPU architectures (sm_70+ for CUDA, GFX906+ for HIP).
// We need to distinguish clang as the CUDA compiler from clang as the host compiler,
// invoked by NVCC (e.g. on MacOS). The former needs to see both host and device implementation
@@ -56,92 +45,84 @@ struct is_arithmetic<double2> {
template <>
struct packet_traits<float> : default_packet_traits {
typedef float4 type;
typedef float4 half;
enum {
Vectorizable = 1,
AlignedOnScalar = 1,
size = 4,
using type = float4;
using half = float4;
static constexpr int Vectorizable = 1;
static constexpr int AlignedOnScalar = 1;
static constexpr int size = 4;
HasDiv = 1,
HasSin = 0,
HasCos = 0,
HasLog = 1,
HasExp = 1,
HasSqrt = 1,
HasRsqrt = 1,
HasLGamma = 1,
HasDiGamma = 1,
HasZeta = 1,
HasPolygamma = 1,
HasErf = 1,
HasErfc = 1,
HasNdtri = 1,
HasBessel = 1,
HasIGamma = 1,
HasIGammaDerA = 1,
HasGammaSampleDerAlpha = 1,
HasIGammac = 1,
HasBetaInc = 1,
static constexpr int HasDiv = 1;
static constexpr int HasSin = 0;
static constexpr int HasCos = 0;
static constexpr int HasLog = 1;
static constexpr int HasExp = 1;
static constexpr int HasSqrt = 1;
static constexpr int HasRsqrt = 1;
static constexpr int HasLGamma = 1;
static constexpr int HasDiGamma = 1;
static constexpr int HasZeta = 1;
static constexpr int HasPolygamma = 1;
static constexpr int HasErf = 1;
static constexpr int HasErfc = 1;
static constexpr int HasNdtri = 1;
static constexpr int HasBessel = 1;
static constexpr int HasIGamma = 1;
static constexpr int HasIGammaDerA = 1;
static constexpr int HasGammaSampleDerAlpha = 1;
static constexpr int HasIGammac = 1;
static constexpr int HasBetaInc = 1;
HasFloor = 1,
HasCmp = EIGEN_HAS_GPU_DEVICE_FUNCTIONS
};
static constexpr int HasFloor = 1;
static constexpr int HasCmp = EIGEN_HAS_GPU_DEVICE_FUNCTIONS;
};
template <>
struct packet_traits<double> : default_packet_traits {
typedef double2 type;
typedef double2 half;
enum {
Vectorizable = 1,
AlignedOnScalar = 1,
size = 2,
using type = double2;
using half = double2;
static constexpr int Vectorizable = 1;
static constexpr int AlignedOnScalar = 1;
static constexpr int size = 2;
HasDiv = 1,
HasLog = 1,
HasExp = 1,
HasSqrt = 1,
HasRsqrt = 1,
HasLGamma = 1,
HasDiGamma = 1,
HasZeta = 1,
HasPolygamma = 1,
HasErf = 1,
HasErfc = 1,
HasNdtri = 1,
HasBessel = 1,
HasIGamma = 1,
HasIGammaDerA = 1,
HasGammaSampleDerAlpha = 1,
HasIGammac = 1,
HasBetaInc = 1,
};
static constexpr int HasDiv = 1;
static constexpr int HasLog = 1;
static constexpr int HasExp = 1;
static constexpr int HasSqrt = 1;
static constexpr int HasRsqrt = 1;
static constexpr int HasLGamma = 1;
static constexpr int HasDiGamma = 1;
static constexpr int HasZeta = 1;
static constexpr int HasPolygamma = 1;
static constexpr int HasErf = 1;
static constexpr int HasErfc = 1;
static constexpr int HasNdtri = 1;
static constexpr int HasBessel = 1;
static constexpr int HasIGamma = 1;
static constexpr int HasIGammaDerA = 1;
static constexpr int HasGammaSampleDerAlpha = 1;
static constexpr int HasIGammac = 1;
static constexpr int HasBetaInc = 1;
};
template <>
struct unpacket_traits<float4> {
typedef float type;
enum {
size = 4,
alignment = Aligned16,
vectorizable = true,
masked_load_available = false,
masked_store_available = false
};
typedef float4 half;
using type = float;
static constexpr int size = 4;
static constexpr int alignment = Aligned16;
static constexpr bool vectorizable = true;
static constexpr bool masked_load_available = false;
static constexpr bool masked_store_available = false;
using half = float4;
};
template <>
struct unpacket_traits<double2> {
typedef double type;
enum {
size = 2,
alignment = Aligned16,
vectorizable = true,
masked_load_available = false,
masked_store_available = false
};
typedef double2 half;
using type = double;
static constexpr int size = 2;
static constexpr int alignment = Aligned16;
static constexpr bool vectorizable = true;
static constexpr bool masked_load_available = false;
static constexpr bool masked_store_available = false;
using half = double2;
};
template <>
@@ -403,7 +384,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const dou
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const float* from) {
#if defined(EIGEN_GPU_HAS_LDG)
#if defined(EIGEN_GPU_COMPILE_PHASE)
return __ldg(reinterpret_cast<const float4*>(from));
#else
return make_float4(from[0], from[1], from[2], from[3]);
@@ -411,7 +392,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const fl
}
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const double* from) {
#if defined(EIGEN_GPU_HAS_LDG)
#if defined(EIGEN_GPU_COMPILE_PHASE)
return __ldg(reinterpret_cast<const double2*>(from));
#else
return make_double2(from[0], from[1]);
@@ -420,7 +401,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Unaligned>(const float* from) {
#if defined(EIGEN_GPU_HAS_LDG)
#if defined(EIGEN_GPU_COMPILE_PHASE)
return make_float4(__ldg(from + 0), __ldg(from + 1), __ldg(from + 2), __ldg(from + 3));
#else
return make_float4(from[0], from[1], from[2], from[3]);
@@ -428,7 +409,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Unaligned>(const
}
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Unaligned>(const double* from) {
#if defined(EIGEN_GPU_HAS_LDG)
#if defined(EIGEN_GPU_COMPILE_PHASE)
return make_double2(__ldg(from + 0), __ldg(from + 1));
#else
return make_double2(from[0], from[1]);
@@ -591,23 +572,20 @@ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<double2, 2>& kernel) {
#endif // defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)
// Half-packet functions are not available on the host for CUDA 9.0-9.2, only
// on device. There is no benefit to using them on the host anyways, since they are
// emulated.
#if (defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)) && defined(EIGEN_GPU_COMPILE_PHASE)
// Half-packet functions are only available in GPU device compilation — they use
// intrinsics (__half2, etc.) that have no host-side benefit.
#if defined(EIGEN_GPU_COMPILE_PHASE)
typedef ulonglong2 Packet4h2;
using Packet4h2 = ulonglong2;
template <>
struct unpacket_traits<Packet4h2> {
typedef Eigen::half type;
enum {
size = 8,
alignment = Aligned16,
vectorizable = true,
masked_load_available = false,
masked_store_available = false
};
typedef Packet4h2 half;
using type = Eigen::half;
static constexpr int size = 8;
static constexpr int alignment = Aligned16;
static constexpr bool vectorizable = true;
static constexpr bool masked_load_available = false;
static constexpr bool masked_store_available = false;
using half = Packet4h2;
};
template <>
struct is_arithmetic<Packet4h2> {
@@ -616,15 +594,13 @@ struct is_arithmetic<Packet4h2> {
template <>
struct unpacket_traits<half2> {
typedef Eigen::half type;
enum {
size = 2,
alignment = Aligned16,
vectorizable = true,
masked_load_available = false,
masked_store_available = false
};
typedef half2 half;
using type = Eigen::half;
static constexpr int size = 2;
static constexpr int alignment = Aligned16;
static constexpr bool vectorizable = true;
static constexpr bool masked_load_available = false;
static constexpr bool masked_store_available = false;
using half = half2;
};
template <>
struct is_arithmetic<half2> {
@@ -633,23 +609,21 @@ struct is_arithmetic<half2> {
template <>
struct packet_traits<Eigen::half> : default_packet_traits {
typedef Packet4h2 type;
typedef Packet4h2 half;
enum {
Vectorizable = 1,
AlignedOnScalar = 1,
size = 8,
HasAdd = 1,
HasSub = 1,
HasMul = 1,
HasDiv = 1,
HasSqrt = 1,
HasRsqrt = 1,
HasExp = 1,
HasExpm1 = 1,
HasLog = 1,
HasLog1p = 1
};
using type = Packet4h2;
using half = Packet4h2;
static constexpr int Vectorizable = 1;
static constexpr int AlignedOnScalar = 1;
static constexpr int size = 8;
static constexpr int HasAdd = 1;
static constexpr int HasSub = 1;
static constexpr int HasMul = 1;
static constexpr int HasDiv = 1;
static constexpr int HasSqrt = 1;
static constexpr int HasRsqrt = 1;
static constexpr int HasExp = 1;
static constexpr int HasExpm1 = 1;
static constexpr int HasLog = 1;
static constexpr int HasLog1p = 1;
};
template <>
@@ -690,7 +664,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const half2&
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_aligned(const Eigen::half* from) {
#if defined(EIGEN_GPU_HAS_LDG)
#if defined(EIGEN_GPU_COMPILE_PHASE)
// Input is guaranteed to be properly aligned.
return __ldg(reinterpret_cast<const half2*>(from));
#else
@@ -699,7 +673,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_aligned(const Eigen::half*
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_unaligned(const Eigen::half* from) {
#if defined(EIGEN_GPU_HAS_LDG)
#if defined(EIGEN_GPU_COMPILE_PHASE)
return __halves2half2(__ldg(from + 0), __ldg(from + 1));
#else
return __halves2half2(*(from + 0), *(from + 1));
@@ -745,12 +719,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<half2, 2>& ker
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset(const Eigen::half& a) {
#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
return __halves2half2(a, __hadd(a, __float2half(1.0f)));
#else
float f = __half2float(a) + 1.0f;
return __halves2half2(a, __float2half(f));
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pselect(const half2& mask, const half2& a, const half2& b) {
@@ -837,89 +806,21 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pandnot(const half2& a, const half2&
return __halves2half2(result1, result2);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd(const half2& a, const half2& b) {
#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
return __hadd2(a, b);
#else
float a1 = __low2float(a);
float a2 = __high2float(a);
float b1 = __low2float(b);
float b2 = __high2float(b);
float r1 = a1 + b1;
float r2 = a2 + b2;
return __floats2half2_rn(r1, r2);
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd(const half2& a, const half2& b) { return __hadd2(a, b); }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub(const half2& a, const half2& b) {
#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
return __hsub2(a, b);
#else
float a1 = __low2float(a);
float a2 = __high2float(a);
float b1 = __low2float(b);
float b2 = __high2float(b);
float r1 = a1 - b1;
float r2 = a2 - b2;
return __floats2half2_rn(r1, r2);
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub(const half2& a, const half2& b) { return __hsub2(a, b); }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
return __hneg2(a);
#else
float a1 = __low2float(a);
float a2 = __high2float(a);
return __floats2half2_rn(-a1, -a2);
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(const half2& a) { return __hneg2(a); }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul(const half2& a, const half2& b) {
#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
return __hmul2(a, b);
#else
float a1 = __low2float(a);
float a2 = __high2float(a);
float b1 = __low2float(b);
float b2 = __high2float(b);
float r1 = a1 * b1;
float r2 = a2 * b2;
return __floats2half2_rn(r1, r2);
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul(const half2& a, const half2& b) { return __hmul2(a, b); }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd(const half2& a, const half2& b, const half2& c) {
#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
return __hfma2(a, b, c);
#else
float a1 = __low2float(a);
float a2 = __high2float(a);
float b1 = __low2float(b);
float b2 = __high2float(b);
float c1 = __low2float(c);
float c2 = __high2float(c);
float r1 = a1 * b1 + c1;
float r2 = a2 * b2 + c2;
return __floats2half2_rn(r1, r2);
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv(const half2& a, const half2& b) {
#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
return __h2div(a, b);
#else
float a1 = __low2float(a);
float a2 = __high2float(a);
float b1 = __low2float(b);
float b2 = __high2float(b);
float r1 = a1 / b1;
float r2 = a2 / b2;
return __floats2half2_rn(r1, r2);
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv(const half2& a, const half2& b) { return __h2div(a, b); }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin(const half2& a, const half2& b) {
float a1 = __low2float(a);
@@ -942,47 +843,23 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax(const half2& a, const half2& b)
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux(const half2& a) {
#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
return __hadd(__low2half(a), __high2half(a));
#else
float a1 = __low2float(a);
float a2 = __high2float(a);
return Eigen::half(__float2half(a1 + a2));
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max(const half2& a) {
#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
__half first = __low2half(a);
__half second = __high2half(a);
return __hgt(first, second) ? first : second;
#else
float a1 = __low2float(a);
float a2 = __high2float(a);
return a1 > a2 ? __low2half(a) : __high2half(a);
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min(const half2& a) {
#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
__half first = __low2half(a);
__half second = __high2half(a);
return __hlt(first, second) ? first : second;
#else
float a1 = __low2float(a);
float a2 = __high2float(a);
return a1 < a2 ? __low2half(a) : __high2half(a);
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_mul(const half2& a) {
#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
return __hmul(__low2half(a), __high2half(a));
#else
float a1 = __low2float(a);
float a2 = __high2float(a);
return Eigen::half(__float2half(a1 * a2));
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog1p(const half2& a) {
@@ -1001,8 +878,6 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexpm1(const half2& a) {
return __floats2half2_rn(r1, r2);
}
#if (EIGEN_CUDA_SDK_VER >= 80000 && defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)) || defined(EIGEN_HIP_DEVICE_COMPILE)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog(const half2& a) { return h2log(a); }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexp(const half2& a) { return h2exp(a); }
@@ -1010,41 +885,6 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexp(const half2& a) { return h2exp(
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psqrt(const half2& a) { return h2sqrt(a); }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 prsqrt(const half2& a) { return h2rsqrt(a); }
#else
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog(const half2& a) {
float a1 = __low2float(a);
float a2 = __high2float(a);
float r1 = logf(a1);
float r2 = logf(a2);
return __floats2half2_rn(r1, r2);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexp(const half2& a) {
float a1 = __low2float(a);
float a2 = __high2float(a);
float r1 = expf(a1);
float r2 = expf(a2);
return __floats2half2_rn(r1, r2);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psqrt(const half2& a) {
float a1 = __low2float(a);
float a2 = __high2float(a);
float r1 = sqrtf(a1);
float r2 = sqrtf(a2);
return __floats2half2_rn(r1, r2);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 prsqrt(const half2& a) {
float a1 = __low2float(a);
float a2 = __high2float(a);
float r1 = rsqrtf(a1);
float r2 = rsqrtf(a2);
return __floats2half2_rn(r1, r2);
}
#endif
} // namespace
template <>
@@ -1091,19 +931,17 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to,
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4h2 ploadt_ro<Packet4h2, Aligned>(const Eigen::half* from) {
#if defined(EIGEN_GPU_HAS_LDG)
Packet4h2 r;
#if defined(EIGEN_GPU_COMPILE_PHASE)
r = __ldg(reinterpret_cast<const Packet4h2*>(from));
return r;
#else
Packet4h2 r;
half2* r_alias = reinterpret_cast<half2*>(&r);
r_alias[0] = ploadt_ro_aligned(from + 0);
r_alias[1] = ploadt_ro_aligned(from + 2);
r_alias[2] = ploadt_ro_aligned(from + 4);
r_alias[3] = ploadt_ro_aligned(from + 6);
return r;
#endif
return r;
}
template <>
@@ -1272,7 +1110,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 plset<Packet4h2>(const Eigen::ha
p_alias[2] = __halves2half2(__hadd(a, __float2half(4.0f)), __hadd(a, __float2half(5.0f)));
p_alias[3] = __halves2half2(__hadd(a, __float2half(6.0f)), __hadd(a, __float2half(7.0f)));
return r;
#elif defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
#elif defined(EIGEN_CUDA_ARCH)
Packet4h2 r;
half2* r_alias = reinterpret_cast<half2*>(&r);
@@ -1290,16 +1128,6 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 plset<Packet4h2>(const Eigen::ha
r_alias[3] = plset(__high2half(c));
return r;
#else
float f = __half2float(a);
Packet4h2 r;
half2* p_alias = reinterpret_cast<half2*>(&r);
p_alias[0] = __halves2half2(a, __float2half(f + 1.0f));
p_alias[1] = __halves2half2(__float2half(f + 2.0f), __float2half(f + 3.0f));
p_alias[2] = __halves2half2(__float2half(f + 4.0f), __float2half(f + 5.0f));
p_alias[3] = __halves2half2(__float2half(f + 6.0f), __float2half(f + 7.0f));
return r;
#endif
}
@@ -1533,7 +1361,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max<Packet4h2>(const Pa
half2 m1 = __halves2half2(predux_max(a_alias[2]), predux_max(a_alias[3]));
__half first = predux_max(m0);
__half second = predux_max(m1);
#if defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
#if defined(EIGEN_CUDA_ARCH)
return (__hgt(first, second) ? first : second);
#else
float ffirst = __half2float(first);
@@ -1549,7 +1377,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min<Packet4h2>(const Pa
half2 m1 = __halves2half2(predux_min(a_alias[2]), predux_min(a_alias[3]));
__half first = predux_min(m0);
__half second = predux_min(m1);
#if defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
#if defined(EIGEN_CUDA_ARCH)
return (__hlt(first, second) ? first : second);
#else
float ffirst = __half2float(first);
@@ -1641,47 +1469,17 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 prsqrt<Packet4h2>(const Packet4h
// the implementation of GPU half reduction.
template <>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
return __hadd2(a, b);
#else
float a1 = __low2float(a);
float a2 = __high2float(a);
float b1 = __low2float(b);
float b2 = __high2float(b);
float r1 = a1 + b1;
float r2 = a2 + b2;
return __floats2half2_rn(r1, r2);
#endif
}
template <>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) {
#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
return __hmul2(a, b);
#else
float a1 = __low2float(a);
float a2 = __high2float(a);
float b1 = __low2float(b);
float b2 = __high2float(b);
float r1 = a1 * b1;
float r2 = a2 * b2;
return __floats2half2_rn(r1, r2);
#endif
}
template <>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) {
#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
return __h2div(a, b);
#else
float a1 = __low2float(a);
float a2 = __high2float(a);
float b1 = __low2float(b);
float b2 = __high2float(b);
float r1 = a1 / b1;
float r2 = a2 / b2;
return __floats2half2_rn(r1, r2);
#endif
}
template <>
@@ -1706,11 +1504,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, const ha
return __halves2half2(r1, r2);
}
#endif // (defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)) && defined(EIGEN_GPU_COMPILE_PHASE)
#undef EIGEN_GPU_HAS_LDG
#undef EIGEN_CUDA_HAS_FP16_ARITHMETIC
#undef EIGEN_GPU_HAS_FP16_ARITHMETIC
#endif // defined(EIGEN_GPU_COMPILE_PHASE)
} // end namespace internal

View File

@@ -17,8 +17,7 @@ namespace Eigen {
namespace internal {
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
#if defined(EIGEN_GPU_COMPILE_PHASE)
template <>
struct type_casting_traits<Eigen::half, float> {

View File

@@ -541,12 +541,6 @@ extern "C" {
#if defined EIGEN_CUDACC
#define EIGEN_VECTORIZE_GPU
#include <vector_types.h>
#if EIGEN_CUDA_SDK_VER >= 70500
#define EIGEN_HAS_CUDA_FP16
#endif
#endif
#if defined(EIGEN_HAS_CUDA_FP16)
#include <cuda_runtime_api.h>
#include <cuda_fp16.h>
#endif
@@ -554,7 +548,6 @@ extern "C" {
#if defined(EIGEN_HIPCC)
#define EIGEN_VECTORIZE_GPU
#include <hip/hip_vector_types.h>
#define EIGEN_HAS_HIP_FP16
#include <hip/hip_fp16.h>
#define EIGEN_HAS_HIP_BF16
#include <hip/hip_bfloat16.h>

View File

@@ -84,8 +84,7 @@
#endif
#if defined __NVCC__ && defined __CUDACC__
// MSVC 14.16 (required by CUDA 9.*) does not support the _Pragma keyword, so
// we instead use Microsoft's __pragma extension.
// MSVC does not support the _Pragma keyword, so we use Microsoft's __pragma extension.
#if defined _MSC_VER
#define EIGEN_MAKE_PRAGMA(X) __pragma(#X)
#else

View File

@@ -148,13 +148,8 @@
#endif
#if defined(__NVCC__)
#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
// CUDA 11.4+ always defines __CUDACC_VER_MAJOR__.
#define EIGEN_COMP_NVCC ((__CUDACC_VER_MAJOR__ * 10000) + (__CUDACC_VER_MINOR__ * 100))
#elif defined(__CUDACC_VER__)
#define EIGEN_COMP_NVCC __CUDACC_VER__
#else
#error "NVCC did not define compiler version."
#endif
#else
#define EIGEN_COMP_NVCC 0
#endif
@@ -575,6 +570,10 @@
#define EIGEN_CUDA_SDK_VER 0
#endif
#if defined(EIGEN_CUDACC) && EIGEN_CUDA_SDK_VER > 0 && EIGEN_CUDA_SDK_VER < 110400
#error "Eigen requires CUDA 11.4 or later."
#endif
#if defined(__HIPCC__) && !defined(EIGEN_NO_HIP) && !defined(__SYCL_DEVICE_ONLY__)
// Means the compiler is HIPCC (analogous to EIGEN_CUDACC, but for HIP)
#define EIGEN_HIPCC __HIPCC__
@@ -584,22 +583,20 @@
// ++ host_defines.h which contains the defines for the __host__ and __device__ macros
#include <hip/hip_runtime.h>
// Eigen requires ROCm/HIP >= 5.6 (GFX906 minimum architecture).
// This floor exists to allow simplifying shared CUDA/HIP preprocessor guards —
// all __HIP_ARCH_HAS_WARP_SHUFFLE__, __HIP_ARCH_HAS_FP16__, etc. are always true on GFX906+.
#if defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR < 5 || (HIP_VERSION_MAJOR == 5 && HIP_VERSION_MINOR < 6))
#error "Eigen requires ROCm/HIP >= 5.6."
#endif
#if defined(__HIP_DEVICE_COMPILE__) && !defined(__SYCL_DEVICE_ONLY__)
// analogous to EIGEN_CUDA_ARCH, but for HIP
#define EIGEN_HIP_DEVICE_COMPILE __HIP_DEVICE_COMPILE__
#endif
// For HIP (ROCm 3.5 and higher), we need to explicitly set the launch_bounds attribute
// value to 1024. The compiler assigns a default value of 256 when the attribute is not
// specified. This results in failures on the HIP platform, for cases when a GPU kernel
// without an explicit launch_bounds attribute is called with a threads_per_block value
// greater than 256.
//
// This is a regression in functionality and is expected to be fixed within the next
// couple of ROCm releases (compiler will go back to using 1024 value as the default)
//
// In the meantime, we will use a "only enabled for HIP" macro to set the launch_bounds
// attribute.
// HIP compilers default to launch_bounds(256), which causes failures when kernels
// are called with more than 256 threads per block. Explicitly set to 1024 for HIP.
#define EIGEN_HIP_LAUNCH_BOUNDS_1024 __launch_bounds__(1024)

View File

@@ -25,7 +25,7 @@ namespace internal {
template <typename SolverType, int Size, bool IsComplex>
struct direct_selfadjoint_eigenvalues;
template <typename MatrixType, typename DiagType, typename SubDiagType>
template <bool PerBlockScaling, typename MatrixType, typename DiagType, typename SubDiagType>
EIGEN_DEVICE_FUNC ComputationInfo computeFromTridiagonal_impl(DiagType& diag, SubDiagType& subdiag,
const Index maxIterations, bool computeEigenvectors,
MatrixType& eivec);
@@ -438,7 +438,7 @@ EIGEN_DEVICE_FUNC SelfAdjointEigenSolver<MatrixType>& SelfAdjointEigenSolver<Mat
m_eivec = matrix;
m_eivalues.coeffRef(0, 0) = numext::real(m_eivec.coeff(0, 0));
if (computeEigenvectors) m_eivec.setOnes(n, n);
m_info = Success;
m_info = (numext::isfinite)(m_eivalues.coeffRef(0, 0)) ? Success : NoConvergence;
m_isInitialized = true;
m_eigenvectorsOk = computeEigenvectors;
return *this;
@@ -448,18 +448,29 @@ EIGEN_DEVICE_FUNC SelfAdjointEigenSolver<MatrixType>& SelfAdjointEigenSolver<Mat
RealVectorType& diag = m_eivalues;
EigenvectorsType& mat = m_eivec;
// map the matrix coefficients to [-1:1] to avoid over- and underflow.
// Scale the matrix to [-1:1] to avoid overflow/underflow during tridiagonalization
// and subsequent QR iteration. This uniform scaling ensures the tridiagonal output is
// well-conditioned. Note: for block-diagonal matrices with widely separated scales, this
// can underflow small blocks. Users with such matrices should tridiagonalize separately
// and call computeFromTridiagonal(), which uses per-block scaling.
mat = matrix.template triangularView<Lower>();
RealScalar scale = mat.cwiseAbs().maxCoeff();
if (!(numext::isfinite)(scale)) {
// Input contains Inf or NaN.
m_info = NoConvergence;
m_isInitialized = true;
m_eigenvectorsOk = false;
return *this;
}
if (numext::is_exactly_zero(scale)) scale = RealScalar(1);
mat.template triangularView<Lower>() /= scale;
m_subdiag.resize(n - 1);
m_hcoeffs.resize(n - 1);
internal::tridiagonalization_inplace(mat, diag, m_subdiag, m_hcoeffs, m_workspace, computeEigenvectors);
m_info = internal::computeFromTridiagonal_impl(diag, m_subdiag, m_maxIterations, computeEigenvectors, m_eivec);
m_info = internal::computeFromTridiagonal_impl<false>(diag, m_subdiag, m_maxIterations, computeEigenvectors, m_eivec);
// scale back the eigen values
// Scale back the eigenvalues.
m_eivalues *= scale;
m_isInitialized = true;
@@ -470,15 +481,31 @@ EIGEN_DEVICE_FUNC SelfAdjointEigenSolver<MatrixType>& SelfAdjointEigenSolver<Mat
template <typename MatrixType>
SelfAdjointEigenSolver<MatrixType>& SelfAdjointEigenSolver<MatrixType>::computeFromTridiagonal(
const RealVectorType& diag, const SubDiagonalType& subdiag, int options) {
// TODO : Add an option to scale the values beforehand
bool computeEigenvectors = (options & ComputeEigenvectors) == ComputeEigenvectors;
m_eivalues = diag;
m_subdiag = subdiag;
// Check for Inf/NaN in the input.
{
RealScalar scale = RealScalar(0);
if (m_eivalues.size() > 0) scale = m_eivalues.cwiseAbs().maxCoeff();
if (m_subdiag.size() > 0) scale = numext::maxi(scale, m_subdiag.cwiseAbs().maxCoeff());
if (!(numext::isfinite)(scale)) {
m_info = NoConvergence;
m_isInitialized = true;
m_eigenvectorsOk = false;
return *this;
}
}
if (computeEigenvectors) {
m_eivec.setIdentity(diag.size(), diag.size());
}
m_info = internal::computeFromTridiagonal_impl(m_eivalues, m_subdiag, m_maxIterations, computeEigenvectors, m_eivec);
// Use per-deflation-block scaling (like LAPACK's DSTERF) to avoid losing
// precision when the tridiagonal entries span a wide range of magnitudes.
m_info =
internal::computeFromTridiagonal_impl<true>(m_eivalues, m_subdiag, m_maxIterations, computeEigenvectors, m_eivec);
m_isInitialized = true;
m_eigenvectorsOk = computeEigenvectors;
@@ -490,6 +517,10 @@ namespace internal {
* \internal
* \brief Compute the eigendecomposition from a tridiagonal matrix
*
* \tparam PerBlockScaling If true, each deflation block is independently scaled to [-1,1] before
* QR iteration, following LAPACK's DSTERF approach. This prevents precision loss when entries
* span a wide range of magnitudes. When false, the caller is responsible for ensuring the
* entries are in a safe range (e.g. by pre-scaling the dense matrix before tridiagonalization).
* \param[in,out] diag : On input, the diagonal of the matrix, on output the eigenvalues
* \param[in,out] subdiag : The subdiagonal part of the matrix (entries are modified during the decomposition)
* \param[in] maxIterations : the maximum number of iterations
@@ -497,7 +528,7 @@ namespace internal {
* \param[out] eivec : The matrix to store the eigenvectors if computeEigenvectors==true. Must be allocated on input.
* \returns \c Success or \c NoConvergence
*/
template <typename MatrixType, typename DiagType, typename SubDiagType>
template <bool PerBlockScaling, typename MatrixType, typename DiagType, typename SubDiagType>
EIGEN_DEVICE_FUNC ComputationInfo computeFromTridiagonal_impl(DiagType& diag, SubDiagType& subdiag,
const Index maxIterations, bool computeEigenvectors,
MatrixType& eivec) {
@@ -512,21 +543,32 @@ EIGEN_DEVICE_FUNC ComputationInfo computeFromTridiagonal_impl(DiagType& diag, Su
typedef typename DiagType::RealScalar RealScalar;
const RealScalar considerAsZero = (std::numeric_limits<RealScalar>::min)();
const RealScalar precision_inv = RealScalar(1) / NumTraits<RealScalar>::epsilon();
while (end > 0) {
for (Index i = start; i < end; ++i) {
// Helper lambda for the deflation test.
auto deflate = [&](Index lo, Index hi) {
for (Index i = lo; i < hi; ++i) {
if (numext::abs(subdiag[i]) < considerAsZero) {
subdiag[i] = RealScalar(0);
} else {
// abs(subdiag[i]) <= epsilon * sqrt(abs(diag[i]) + abs(diag[i+1]))
// Scaled to prevent underflows.
const RealScalar scaled_subdiag = precision_inv * subdiag[i];
if (scaled_subdiag * scaled_subdiag <= (numext::abs(diag[i]) + numext::abs(diag[i + 1]))) {
subdiag[i] = RealScalar(0);
}
}
}
};
// find the largest unreduced block at the end of the matrix.
// For per-block scaling, track the currently scaled block and its scale factor.
// When the outer loop identifies a block outside the scaled region, unscale the old
// block and scale the new one. This keeps the same outer loop structure (one QR step
// per iteration) while ensuring each block is processed in scaled coordinates.
Index scaled_start = -1, scaled_end = -1;
RealScalar block_scale = RealScalar(1);
while (end > 0) {
deflate(start, end);
// Find the largest unreduced block at the end of the matrix.
while (end > 0 && numext::is_exactly_zero(subdiag[end - 1])) {
end--;
}
@@ -539,9 +581,42 @@ EIGEN_DEVICE_FUNC ComputationInfo computeFromTridiagonal_impl(DiagType& diag, Su
start = end - 1;
while (start > 0 && !numext::is_exactly_zero(subdiag[start - 1])) start--;
if (PerBlockScaling) {
// Check if we've moved to a different block than the one currently scaled.
if (start != scaled_start || end != scaled_end) {
// Unscale the previous block if it was scaled.
if (block_scale != RealScalar(1)) {
for (Index i = scaled_start; i <= scaled_end; ++i) diag[i] /= block_scale;
for (Index i = scaled_start; i < scaled_end; ++i) {
if (!numext::is_exactly_zero(subdiag[i])) subdiag[i] /= block_scale;
}
block_scale = RealScalar(1);
}
// Compute the norm and scale the new block to [-1:1].
RealScalar block_norm = RealScalar(0);
for (Index i = start; i <= end; ++i) block_norm = numext::maxi(block_norm, numext::abs(diag[i]));
for (Index i = start; i < end; ++i) block_norm = numext::maxi(block_norm, numext::abs(subdiag[i]));
if (block_norm > RealScalar(0) && block_norm != RealScalar(1)) {
block_scale = RealScalar(1) / block_norm;
for (Index i = start; i <= end; ++i) diag[i] *= block_scale;
for (Index i = start; i < end; ++i) subdiag[i] *= block_scale;
}
scaled_start = start;
scaled_end = end;
}
}
internal::tridiagonal_qr_step<MatrixType::Flags & RowMajorBit ? RowMajor : ColMajor>(
diag.data(), subdiag.data(), start, end, computeEigenvectors ? eivec.data() : (Scalar*)0, n);
}
// Unscale any remaining scaled block.
if (PerBlockScaling && block_scale != RealScalar(1)) {
for (Index i = scaled_start; i <= scaled_end; ++i) diag[i] /= block_scale;
for (Index i = scaled_start; i < scaled_end; ++i) {
if (!numext::is_exactly_zero(subdiag[i])) subdiag[i] /= block_scale;
}
}
if (iter <= maxIterations * n)
info = Success;
else
@@ -662,6 +737,28 @@ struct direct_selfadjoint_eigenvalues<SolverType, 3, false> {
// compute the eigenvalues
computeRoots(scaledMat, eivals);
// computeRoots produces theoretically sorted roots, but floating-point
// rounding in the trigonometric formulas can break the ordering.
// Enforce sorting with a branchless min/max network (3 elements).
{
Scalar tmp;
if (eivals(0) > eivals(1)) {
tmp = eivals(0);
eivals(0) = eivals(1);
eivals(1) = tmp;
}
if (eivals(1) > eivals(2)) {
tmp = eivals(1);
eivals(1) = eivals(2);
eivals(2) = tmp;
}
if (eivals(0) > eivals(1)) {
tmp = eivals(0);
eivals(0) = eivals(1);
eivals(1) = tmp;
}
}
// compute the eigenvectors
if (computeEigenvectors) {
if ((eivals(2) - eivals(0)) <= Eigen::NumTraits<Scalar>::epsilon()) {
@@ -691,7 +788,7 @@ struct direct_selfadjoint_eigenvalues<SolverType, 3, false> {
if (d0 <= 2 * Eigen::NumTraits<Scalar>::epsilon() * d1) {
// If d0 is too small, then the two other eigenvalues are numerically the same,
// and thus we only have to ortho-normalize the near orthogonal vector we saved above.
eivecs.col(l) -= eivecs.col(k).dot(eivecs.col(l)) * eivecs.col(l);
eivecs.col(l) -= eivecs.col(k).dot(eivecs.col(l)) * eivecs.col(k);
eivecs.col(l).normalize();
} else {
tmp = scaledMat;

View File

@@ -0,0 +1,17 @@
# Benchmarks for Eigen's built-in BLAS implementation.
# Compiles the Eigen BLAS sources directly into the benchmark executable
# so there is no external BLAS dependency.
set(EIGEN_BLAS_SRCS
${EIGEN_SOURCE_DIR}/blas/single.cpp
${EIGEN_SOURCE_DIR}/blas/double.cpp
${EIGEN_SOURCE_DIR}/blas/complex_single.cpp
${EIGEN_SOURCE_DIR}/blas/complex_double.cpp
${EIGEN_SOURCE_DIR}/blas/xerbla.cpp
${EIGEN_SOURCE_DIR}/blas/lsame.cpp
${EIGEN_SOURCE_DIR}/blas/complexdots.cpp
)
eigen_add_benchmark(bench_blas bench_blas.cpp)
target_sources(bench_blas PRIVATE ${EIGEN_BLAS_SRCS})
target_include_directories(bench_blas PRIVATE ${EIGEN_SOURCE_DIR}/blas)

View File

@@ -0,0 +1,488 @@
// Benchmark for Eigen's BLAS implementation.
//
// Calls the Eigen BLAS C interface directly (the extern "C" functions defined
// in blas/{single,double,complex_single,complex_double}.cpp).
//
// Covers Level 1, 2, and 3 routines — with emphasis on the routines that
// were recently rewritten from f2c to C++: rotm, rotmg, spmv, sbmv, hbmv,
// hpmv, tbmv, lsame, and complex dot products.
#include <benchmark/benchmark.h>
#include <Eigen/Core>
#include <complex>
#include <vector>
#include "blas/blas.h"
using Eigen::Index;
// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------
// Flop-rate counter (units = individual flops per call).
static benchmark::Counter GflopsCounter(double flops) {
return benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
}
// Fill a vector with random values in [-1, 1].
template <typename T>
static void fillRand(T* data, Index n) {
Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, 1>>(data, n).setRandom();
}
// Fill a symmetric band matrix A in BLAS band storage (column-major).
// Upper triangle: A[i,j] stored at a[(k+i-j) + j*lda], 0 <= j-i <= k.
template <typename T>
static void fillSymBandUpper(T* a, int n, int k, int lda) {
std::fill(a, a + lda * n, T(0));
for (int j = 0; j < n; ++j)
for (int i = std::max(0, j - k); i <= j; ++i) a[(k + i - j) + j * lda] = T(std::rand()) / T(RAND_MAX) - T(0.5);
}
// Fill a packed symmetric matrix (upper triangle, column-major).
template <typename T>
static void fillSymPacked(T* ap, int n) {
int sz = n * (n + 1) / 2;
for (int i = 0; i < sz; ++i) ap[i] = T(std::rand()) / T(RAND_MAX) - T(0.5);
}
// Fill a triangular band matrix in BLAS band storage (upper, column-major).
template <typename T>
static void fillTriBandUpper(T* a, int n, int k, int lda) {
std::fill(a, a + lda * n, T(0));
for (int j = 0; j < n; ++j)
for (int i = std::max(0, j - k); i <= j; ++i) {
T val = T(std::rand()) / T(RAND_MAX) - T(0.5);
if (i == j) val += T(n); // diagonal dominance
a[(k + i - j) + j * lda] = val;
}
}
// ---------------------------------------------------------------------------
// Type-dispatched BLAS wrappers
// ---------------------------------------------------------------------------
inline float blas_dot(int* n, float* x, int* incx, float* y, int* incy) { return sdot_(n, x, incx, y, incy); }
inline double blas_dot(int* n, double* x, int* incx, double* y, int* incy) { return ddot_(n, x, incx, y, incy); }
inline void blas_axpy(int* n, float* a, float* x, int* incx, float* y, int* incy) { saxpy_(n, a, x, incx, y, incy); }
inline void blas_axpy(int* n, double* a, double* x, int* incx, double* y, int* incy) { daxpy_(n, a, x, incx, y, incy); }
inline float blas_nrm2(int* n, float* x, int* incx) { return snrm2_(n, x, incx); }
inline double blas_nrm2(int* n, double* x, int* incx) { return dnrm2_(n, x, incx); }
inline void blas_rotm(int* n, float* x, int* incx, float* y, int* incy, float* p) { srotm_(n, x, incx, y, incy, p); }
inline void blas_rotm(int* n, double* x, int* incx, double* y, int* incy, double* p) { drotm_(n, x, incx, y, incy, p); }
inline void blas_rotmg(float* d1, float* d2, float* x1, float* y1, float* p) { srotmg_(d1, d2, x1, y1, p); }
inline void blas_rotmg(double* d1, double* d2, double* x1, double* y1, double* p) { drotmg_(d1, d2, x1, y1, p); }
inline void blas_dotcw(int* n, float* cx, int* incx, float* cy, int* incy, float* res) {
cdotcw_(n, cx, incx, cy, incy, res);
}
inline void blas_dotcw(int* n, double* cx, int* incx, double* cy, int* incy, double* res) {
zdotcw_(n, cx, incx, cy, incy, res);
}
inline void blas_gemv(char* t, int* m, int* n, float* a, float* A, int* lda, float* x, int* incx, float* b, float* y,
int* incy) {
sgemv_(t, m, n, a, A, lda, x, incx, b, y, incy);
}
inline void blas_gemv(char* t, int* m, int* n, double* a, double* A, int* lda, double* x, int* incx, double* b,
double* y, int* incy) {
dgemv_(t, m, n, a, A, lda, x, incx, b, y, incy);
}
inline void blas_spmv(char* uplo, int* n, float* alpha, float* ap, float* x, int* incx, float* beta, float* y,
int* incy) {
sspmv_(uplo, n, alpha, ap, x, incx, beta, y, incy);
}
inline void blas_spmv(char* uplo, int* n, double* alpha, double* ap, double* x, int* incx, double* beta, double* y,
int* incy) {
dspmv_(uplo, n, alpha, ap, x, incx, beta, y, incy);
}
inline void blas_sbmv(char* uplo, int* n, int* k, float* alpha, float* a, int* lda, float* x, int* incx, float* beta,
float* y, int* incy) {
ssbmv_(uplo, n, k, alpha, a, lda, x, incx, beta, y, incy);
}
inline void blas_sbmv(char* uplo, int* n, int* k, double* alpha, double* a, int* lda, double* x, int* incx,
double* beta, double* y, int* incy) {
dsbmv_(uplo, n, k, alpha, a, lda, x, incx, beta, y, incy);
}
inline void blas_tbmv(char* uplo, char* trans, char* diag, int* n, int* k, float* a, int* lda, float* x, int* incx) {
stbmv_(uplo, trans, diag, n, k, a, lda, x, incx);
}
inline void blas_tbmv(char* uplo, char* trans, char* diag, int* n, int* k, double* a, int* lda, double* x, int* incx) {
dtbmv_(uplo, trans, diag, n, k, a, lda, x, incx);
}
inline void blas_hbmv(char* uplo, int* n, int* k, float* alpha, float* a, int* lda, float* x, int* incx, float* beta,
float* y, int* incy) {
chbmv_(uplo, n, k, alpha, a, lda, x, incx, beta, y, incy);
}
inline void blas_hbmv(char* uplo, int* n, int* k, double* alpha, double* a, int* lda, double* x, int* incx,
double* beta, double* y, int* incy) {
zhbmv_(uplo, n, k, alpha, a, lda, x, incx, beta, y, incy);
}
inline void blas_hpmv(char* uplo, int* n, float* alpha, float* ap, float* x, int* incx, float* beta, float* y,
int* incy) {
chpmv_(uplo, n, alpha, ap, x, incx, beta, y, incy);
}
inline void blas_hpmv(char* uplo, int* n, double* alpha, double* ap, double* x, int* incx, double* beta, double* y,
int* incy) {
zhpmv_(uplo, n, alpha, ap, x, incx, beta, y, incy);
}
inline void blas_gemm(char* ta, char* tb, int* m, int* n, int* k, float* alpha, float* a, int* lda, float* b, int* ldb,
float* beta, float* c, int* ldc) {
sgemm_(ta, tb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
}
inline void blas_gemm(char* ta, char* tb, int* m, int* n, int* k, double* alpha, double* a, int* lda, double* b,
int* ldb, double* beta, double* c, int* ldc) {
dgemm_(ta, tb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
}
// =========================================================================
// Level 1 — Real
// =========================================================================
// ----- SDOT / DDOT -----
template <typename T>
static void BM_dot(benchmark::State& state) {
int n = static_cast<int>(state.range(0));
int one = 1;
std::vector<T> x(n), y(n);
fillRand(x.data(), n);
fillRand(y.data(), n);
for (auto _ : state) {
T r = blas_dot(&n, x.data(), &one, y.data(), &one);
benchmark::DoNotOptimize(r);
}
state.counters["GFLOPS"] = GflopsCounter(2.0 * n);
}
// ----- SAXPY / DAXPY -----
template <typename T>
static void BM_axpy(benchmark::State& state) {
int n = static_cast<int>(state.range(0));
int one = 1;
T alpha = T(2.5);
std::vector<T> x(n), y(n);
fillRand(x.data(), n);
fillRand(y.data(), n);
for (auto _ : state) {
blas_axpy(&n, &alpha, x.data(), &one, y.data(), &one);
benchmark::DoNotOptimize(y.data());
}
state.counters["GFLOPS"] = GflopsCounter(2.0 * n);
}
// ----- SNRM2 / DNRM2 -----
template <typename T>
static void BM_nrm2(benchmark::State& state) {
int n = static_cast<int>(state.range(0));
int one = 1;
std::vector<T> x(n);
fillRand(x.data(), n);
for (auto _ : state) {
T r = blas_nrm2(&n, x.data(), &one);
benchmark::DoNotOptimize(r);
}
// Nominal flops; Eigen's stableNorm() does more work internally.
state.counters["GFLOPS"] = GflopsCounter(2.0 * n - 1);
}
// ----- SROTM / DROTM -----
template <typename T>
static void BM_rotm(benchmark::State& state) {
int n = static_cast<int>(state.range(0));
int one = 1;
std::vector<T> x(n), y(n);
T param[5] = {T(-1), T(0.6), T(-0.8), T(0.8), T(0.6)}; // full rotation
fillRand(x.data(), n);
fillRand(y.data(), n);
for (auto _ : state) {
blas_rotm(&n, x.data(), &one, y.data(), &one, param);
benchmark::DoNotOptimize(x.data());
benchmark::DoNotOptimize(y.data());
}
// 4 muls + 2 adds per element pair.
state.counters["GFLOPS"] = GflopsCounter(6.0 * n);
}
// ----- SROTMG / DROTMG -----
template <typename T>
static void BM_rotmg(benchmark::State& state) {
T d1 = T(2), d2 = T(3), x1 = T(1), y1 = T(0.5);
T param[5];
for (auto _ : state) {
T td1 = d1, td2 = d2, tx1 = x1;
blas_rotmg(&td1, &td2, &tx1, &y1, param);
benchmark::DoNotOptimize(param);
}
}
// =========================================================================
// Level 1 — Complex
// =========================================================================
// Complex conjugate dot product via the worker functions (cdotcw_ / zdotcw_)
// which use an output pointer, avoiding the ABI ambiguity of the struct-returning
// cdotc_ / zdotc_ wrappers.
template <typename T>
static void BM_dotc(benchmark::State& state) {
int n = static_cast<int>(state.range(0));
int one = 1;
std::vector<T> x(2 * n), y(2 * n); // interleaved real/imag
fillRand(x.data(), 2 * n);
fillRand(y.data(), 2 * n);
T res[2];
for (auto _ : state) {
blas_dotcw(&n, x.data(), &one, y.data(), &one, res);
benchmark::DoNotOptimize(res);
}
// Conjugate dot: 6 mul + 2 add per element = 8n flops.
state.counters["GFLOPS"] = GflopsCounter(8.0 * n);
}
// =========================================================================
// Level 2 — General Matrix-Vector (SGEMV / DGEMV)
// =========================================================================
template <typename T>
static void BM_gemv(benchmark::State& state) {
int m = static_cast<int>(state.range(0));
int n = static_cast<int>(state.range(1));
int one = 1;
T alpha = T(1), beta = T(0);
char trans = 'N';
std::vector<T> a(m * n), x(n), y(m);
fillRand(a.data(), m * n);
fillRand(x.data(), n);
fillRand(y.data(), m);
for (auto _ : state) {
blas_gemv(&trans, &m, &n, &alpha, a.data(), &m, x.data(), &one, &beta, y.data(), &one);
benchmark::DoNotOptimize(y.data());
}
state.counters["GFLOPS"] = GflopsCounter(2.0 * m * n);
}
// =========================================================================
// Level 2 — Symmetric Packed (SSPMV / DSPMV)
// =========================================================================
template <typename T>
static void BM_spmv(benchmark::State& state) {
int n = static_cast<int>(state.range(0));
int one = 1;
T alpha = T(1), beta = T(0);
char uplo = 'U';
int sz = n * (n + 1) / 2;
std::vector<T> ap(sz), x(n), y(n);
fillSymPacked(ap.data(), n);
fillRand(x.data(), n);
fillRand(y.data(), n);
for (auto _ : state) {
blas_spmv(&uplo, &n, &alpha, ap.data(), x.data(), &one, &beta, y.data(), &one);
benchmark::DoNotOptimize(y.data());
}
// Symmetric: each off-diag element contributes to two y entries.
state.counters["GFLOPS"] = GflopsCounter(2.0 * n * n);
}
// =========================================================================
// Level 2 — Symmetric Band (SSBMV / DSBMV)
// =========================================================================
template <typename T>
static void BM_sbmv(benchmark::State& state) {
int n = static_cast<int>(state.range(0));
int k = static_cast<int>(state.range(1));
int lda = k + 1;
int one = 1;
T alpha = T(1), beta = T(0);
char uplo = 'U';
std::vector<T> a(lda * n), x(n), y(n);
fillSymBandUpper(a.data(), n, k, lda);
fillRand(x.data(), n);
fillRand(y.data(), n);
for (auto _ : state) {
blas_sbmv(&uplo, &n, &k, &alpha, a.data(), &lda, x.data(), &one, &beta, y.data(), &one);
benchmark::DoNotOptimize(y.data());
}
state.counters["GFLOPS"] = GflopsCounter(2.0 * n * (2 * k + 1));
}
// =========================================================================
// Level 2 — Triangular Band (STBMV / DTBMV)
// =========================================================================
template <typename T>
static void BM_tbmv(benchmark::State& state) {
int n = static_cast<int>(state.range(0));
int k = static_cast<int>(state.range(1));
int lda = k + 1;
int one = 1;
char uplo = 'U', trans = 'N', diag = 'N';
std::vector<T> a(lda * n), x(n), x_orig(n);
fillTriBandUpper(a.data(), n, k, lda);
fillRand(x_orig.data(), n);
for (auto _ : state) {
state.PauseTiming();
std::copy(x_orig.begin(), x_orig.end(), x.begin());
state.ResumeTiming();
blas_tbmv(&uplo, &trans, &diag, &n, &k, a.data(), &lda, x.data(), &one);
benchmark::DoNotOptimize(x.data());
}
state.counters["GFLOPS"] = GflopsCounter(1.0 * n * (k + 1));
}
// =========================================================================
// Level 2 — Hermitian Band (CHBMV / ZHBMV)
// =========================================================================
template <typename T>
static void BM_hbmv(benchmark::State& state) {
int n = static_cast<int>(state.range(0));
int k = static_cast<int>(state.range(1));
int lda = k + 1;
int one = 1;
char uplo = 'U';
// Complex: each element is 2 reals.
std::vector<T> a(2 * lda * n), x(2 * n), y(2 * n);
T alpha[2] = {T(1), T(0)};
T beta[2] = {T(0), T(0)};
fillRand(a.data(), 2 * lda * n);
// Make diagonal real (imag part = 0).
for (int j = 0; j < n; ++j) a[2 * (k + j * lda) + 1] = T(0);
fillRand(x.data(), 2 * n);
fillRand(y.data(), 2 * n);
for (auto _ : state) {
blas_hbmv(&uplo, &n, &k, alpha, a.data(), &lda, x.data(), &one, beta, y.data(), &one);
benchmark::DoNotOptimize(y.data());
}
// Complex hermitian band: 8*n*(2k+1) flops approximately.
state.counters["GFLOPS"] = GflopsCounter(8.0 * n * (2 * k + 1));
}
// =========================================================================
// Level 2 — Hermitian Packed (CHPMV / ZHPMV)
// =========================================================================
template <typename T>
static void BM_hpmv(benchmark::State& state) {
int n = static_cast<int>(state.range(0));
int one = 1;
char uplo = 'U';
int sz = n * (n + 1) / 2;
std::vector<T> ap(2 * sz), x(2 * n), y(2 * n);
T alpha[2] = {T(1), T(0)};
T beta[2] = {T(0), T(0)};
fillRand(ap.data(), 2 * sz);
// Make diagonal real.
int kk = 0;
for (int j = 0; j < n; ++j) {
ap[2 * (kk + j) + 1] = T(0);
kk += j + 1;
}
fillRand(x.data(), 2 * n);
fillRand(y.data(), 2 * n);
for (auto _ : state) {
blas_hpmv(&uplo, &n, alpha, ap.data(), x.data(), &one, beta, y.data(), &one);
benchmark::DoNotOptimize(y.data());
}
state.counters["GFLOPS"] = GflopsCounter(8.0 * n * n);
}
// =========================================================================
// Level 3 — General Matrix Multiply (SGEMM / DGEMM)
// =========================================================================
template <typename T>
static void BM_gemm(benchmark::State& state) {
int n = static_cast<int>(state.range(0));
T alpha = T(1), beta = T(0);
char trans = 'N';
std::vector<T> a(n * n), b(n * n), c(n * n);
fillRand(a.data(), n * n);
fillRand(b.data(), n * n);
fillRand(c.data(), n * n);
for (auto _ : state) {
blas_gemm(&trans, &trans, &n, &n, &n, &alpha, a.data(), &n, b.data(), &n, &beta, c.data(), &n);
benchmark::DoNotOptimize(c.data());
}
state.counters["GFLOPS"] = GflopsCounter(2.0 * n * n * n);
}
// =========================================================================
// Register benchmarks
// =========================================================================
// clang-format off
// --- Vector sizes for Level 1 ---
#define L1_SIZES ->Arg(64)->Arg(256)->Arg(1024)->Arg(4096)->Arg(16384)->Arg(65536)
BENCHMARK(BM_dot<float>) L1_SIZES ->Name("sdot");
BENCHMARK(BM_dot<double>) L1_SIZES ->Name("ddot");
BENCHMARK(BM_axpy<float>) L1_SIZES ->Name("saxpy");
BENCHMARK(BM_axpy<double>) L1_SIZES ->Name("daxpy");
BENCHMARK(BM_nrm2<float>) L1_SIZES ->Name("snrm2");
BENCHMARK(BM_nrm2<double>) L1_SIZES ->Name("dnrm2");
BENCHMARK(BM_rotm<float>) L1_SIZES ->Name("srotm");
BENCHMARK(BM_rotm<double>) L1_SIZES ->Name("drotm");
BENCHMARK(BM_rotmg<float>) ->Name("srotmg");
BENCHMARK(BM_rotmg<double>) ->Name("drotmg");
BENCHMARK(BM_dotc<float>) L1_SIZES ->Name("cdotc");
BENCHMARK(BM_dotc<double>) L1_SIZES ->Name("zdotc");
#undef L1_SIZES
// --- Matrix sizes for Level 2 ---
// GEMV: {m, n}
#define GEMV_SIZES \
->Args({64, 64})->Args({256, 256})->Args({1024, 1024})->Args({4096, 4096}) \
->Args({4096, 64})->Args({64, 4096})
BENCHMARK(BM_gemv<float>) GEMV_SIZES ->Name("sgemv");
BENCHMARK(BM_gemv<double>) GEMV_SIZES ->Name("dgemv");
#undef GEMV_SIZES
// Symmetric packed: {n}
#define SPM_SIZES ->Arg(64)->Arg(256)->Arg(1024)->Arg(4096)
BENCHMARK(BM_spmv<float>) SPM_SIZES ->Name("sspmv");
BENCHMARK(BM_spmv<double>) SPM_SIZES ->Name("dspmv");
BENCHMARK(BM_hpmv<float>) SPM_SIZES ->Name("chpmv");
BENCHMARK(BM_hpmv<double>) SPM_SIZES ->Name("zhpmv");
#undef SPM_SIZES
// Band: {n, k}
#define BAND_SIZES \
->Args({256, 4})->Args({256, 32})->Args({1024, 4})->Args({1024, 32}) \
->Args({4096, 4})->Args({4096, 32})->Args({4096, 128})
BENCHMARK(BM_sbmv<float>) BAND_SIZES ->Name("ssbmv");
BENCHMARK(BM_sbmv<double>) BAND_SIZES ->Name("dsbmv");
BENCHMARK(BM_tbmv<float>) BAND_SIZES ->Name("stbmv");
BENCHMARK(BM_tbmv<double>) BAND_SIZES ->Name("dtbmv");
BENCHMARK(BM_hbmv<float>) BAND_SIZES ->Name("chbmv");
BENCHMARK(BM_hbmv<double>) BAND_SIZES ->Name("zhbmv");
#undef BAND_SIZES
// --- Square sizes for Level 3 ---
#define GEMM_SIZES ->Arg(32)->Arg(64)->Arg(128)->Arg(256)->Arg(512)->Arg(1024)
BENCHMARK(BM_gemm<float>) GEMM_SIZES ->Name("sgemm");
BENCHMARK(BM_gemm<double>) GEMM_SIZES ->Name("dgemm");
#undef GEMM_SIZES
// clang-format on

View File

@@ -20,7 +20,11 @@ function(eigen_add_benchmark name source)
if(BENCH_LIBRARIES)
target_link_libraries(${name} PRIVATE ${BENCH_LIBRARIES})
endif()
target_compile_options(${name} PRIVATE -O3 -DNDEBUG)
target_compile_options(${name} PRIVATE
$<$<CXX_COMPILER_ID:MSVC>:/O2>
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-O3>
)
target_compile_definitions(${name} PRIVATE NDEBUG)
if(BENCH_DEFINITIONS)
target_compile_definitions(${name} PRIVATE ${BENCH_DEFINITIONS})
endif()
@@ -38,3 +42,4 @@ add_subdirectory(FFT)
add_subdirectory(Householder)
add_subdirectory(Solvers)
add_subdirectory(Tuning)
add_subdirectory(BLAS)

View File

@@ -21,3 +21,4 @@ eigen_add_benchmark(bench_syr2 bench_syr2.cpp)
eigen_add_benchmark(bench_construction bench_construction.cpp)
eigen_add_benchmark(bench_fixed_size bench_fixed_size.cpp)
eigen_add_benchmark(bench_fixed_size_double bench_fixed_size.cpp DEFINITIONS SCALAR=double)
eigen_add_benchmark(bench_small_matrix bench_small_matrix.cpp)

View File

@@ -0,0 +1,316 @@
#include <benchmark/benchmark.h>
#include <Eigen/Core>
#include <Eigen/LU>
#include <Eigen/Cholesky>
#include <Eigen/QR>
#include <Eigen/SVD>
#include <Eigen/Eigenvalues>
using namespace Eigen;
// ============================================================================
// Fixed-size matrix multiply (the fundamental operation)
// ============================================================================
template <typename Scalar, int N>
static void BM_MatMul(benchmark::State& state) {
Matrix<Scalar, N, N> a, b, c;
a.setRandom();
b.setRandom();
for (auto _ : state) {
c.noalias() = a * b;
benchmark::DoNotOptimize(c.data());
}
}
// Matrix-vector multiply
template <typename Scalar, int N>
static void BM_MatVec(benchmark::State& state) {
Matrix<Scalar, N, N> a;
Matrix<Scalar, N, 1> v, r;
a.setRandom();
v.setRandom();
for (auto _ : state) {
r.noalias() = a * v;
benchmark::DoNotOptimize(r.data());
}
}
// ============================================================================
// Fixed-size inverse (critical for transform operations)
// ============================================================================
template <typename Scalar, int N>
EIGEN_DONT_INLINE void do_inverse(const Matrix<Scalar, N, N>& a, Matrix<Scalar, N, N>& r) {
r = a.inverse();
}
template <typename Scalar, int N>
static void BM_Inverse(benchmark::State& state) {
Matrix<Scalar, N, N> a, r;
a.setRandom();
a += Matrix<Scalar, N, N>::Identity() * Scalar(N); // ensure well-conditioned
for (auto _ : state) {
do_inverse(a, r);
benchmark::DoNotOptimize(r.data());
}
}
// ============================================================================
// Fixed-size determinant
// ============================================================================
template <typename Scalar, int N>
static void BM_Determinant(benchmark::State& state) {
Matrix<Scalar, N, N> a;
a.setRandom();
Scalar d;
for (auto _ : state) {
d = a.determinant();
benchmark::DoNotOptimize(d);
}
}
// ============================================================================
// LLT (Cholesky) — for SPD matrices (covariance, mass matrices)
// ============================================================================
template <typename Scalar, int N>
static void BM_LLT_Compute(benchmark::State& state) {
Matrix<Scalar, N, N> a;
a.setRandom();
a = a.transpose() * a + Matrix<Scalar, N, N>::Identity(); // SPD
LLT<Matrix<Scalar, N, N>> llt;
for (auto _ : state) {
llt.compute(a);
benchmark::DoNotOptimize(&llt);
}
}
template <typename Scalar, int N>
static void BM_LLT_Solve(benchmark::State& state) {
Matrix<Scalar, N, N> a;
a.setRandom();
a = a.transpose() * a + Matrix<Scalar, N, N>::Identity();
Matrix<Scalar, N, 1> b = Matrix<Scalar, N, 1>::Random();
LLT<Matrix<Scalar, N, N>> llt(a);
Matrix<Scalar, N, 1> x;
for (auto _ : state) {
x = llt.solve(b);
benchmark::DoNotOptimize(x.data());
}
}
// ============================================================================
// LDLT — for semi-definite matrices
// ============================================================================
template <typename Scalar, int N>
static void BM_LDLT_Compute(benchmark::State& state) {
Matrix<Scalar, N, N> a;
a.setRandom();
a = a.transpose() * a + Matrix<Scalar, N, N>::Identity();
LDLT<Matrix<Scalar, N, N>> ldlt;
for (auto _ : state) {
ldlt.compute(a);
benchmark::DoNotOptimize(&ldlt);
}
}
// ============================================================================
// PartialPivLU — for general square systems
// ============================================================================
template <typename Scalar, int N>
static void BM_PartialPivLU_Compute(benchmark::State& state) {
Matrix<Scalar, N, N> a;
a.setRandom();
a += Matrix<Scalar, N, N>::Identity() * Scalar(N);
PartialPivLU<Matrix<Scalar, N, N>> lu;
for (auto _ : state) {
lu.compute(a);
benchmark::DoNotOptimize(lu.matrixLU().data());
}
}
template <typename Scalar, int N>
static void BM_PartialPivLU_Solve(benchmark::State& state) {
Matrix<Scalar, N, N> a;
a.setRandom();
a += Matrix<Scalar, N, N>::Identity() * Scalar(N);
Matrix<Scalar, N, 1> b = Matrix<Scalar, N, 1>::Random();
PartialPivLU<Matrix<Scalar, N, N>> lu(a);
Matrix<Scalar, N, 1> x;
for (auto _ : state) {
x = lu.solve(b);
benchmark::DoNotOptimize(x.data());
}
}
// ============================================================================
// ColPivHouseholderQR — for least-squares (camera calibration, etc.)
// ============================================================================
template <typename Scalar, int Rows, int Cols>
static void BM_ColPivQR_Compute(benchmark::State& state) {
Matrix<Scalar, Rows, Cols> a;
a.setRandom();
ColPivHouseholderQR<Matrix<Scalar, Rows, Cols>> qr;
for (auto _ : state) {
qr.compute(a);
benchmark::DoNotOptimize(qr.matrixR().data());
}
}
// ============================================================================
// JacobiSVD — the workhorse for small matrices in CV
// ============================================================================
template <typename Scalar, int Rows, int Cols, int Options = ComputeThinU | ComputeThinV>
static void BM_JacobiSVD_Compute(benchmark::State& state) {
Matrix<Scalar, Rows, Cols> a;
a.setRandom();
JacobiSVD<Matrix<Scalar, Rows, Cols>, Options> svd;
for (auto _ : state) {
svd.compute(a);
benchmark::DoNotOptimize(svd.singularValues().data());
}
}
template <typename Scalar, int Rows, int Cols>
static void BM_JacobiSVD_Solve(benchmark::State& state) {
Matrix<Scalar, Rows, Cols> a;
a.setRandom();
Matrix<Scalar, Rows, 1> b = Matrix<Scalar, Rows, 1>::Random();
JacobiSVD<Matrix<Scalar, Rows, Cols>, ComputeThinU | ComputeThinV> svd(a);
Matrix<Scalar, Cols, 1> x;
for (auto _ : state) {
x = svd.solve(b);
benchmark::DoNotOptimize(x.data());
}
}
// ============================================================================
// SelfAdjointEigenSolver — PCA, normal estimation
// ============================================================================
template <typename Scalar, int N>
static void BM_SelfAdjointEig_Compute(benchmark::State& state) {
Matrix<Scalar, N, N> a;
a.setRandom();
a = a.transpose() * a;
SelfAdjointEigenSolver<Matrix<Scalar, N, N>> eig;
for (auto _ : state) {
eig.compute(a);
benchmark::DoNotOptimize(eig.eigenvalues().data());
}
}
// SelfAdjointEigenSolver::computeDirect — closed-form for 2x2 and 3x3
template <typename Scalar, int N>
static void BM_SelfAdjointEig_ComputeDirect(benchmark::State& state) {
Matrix<Scalar, N, N> a;
a.setRandom();
a = a.transpose() * a;
SelfAdjointEigenSolver<Matrix<Scalar, N, N>> eig;
for (auto _ : state) {
eig.computeDirect(a);
benchmark::DoNotOptimize(eig.eigenvalues().data());
}
}
// ============================================================================
// Registration — focus on robotics/CV sizes
// ============================================================================
// Matrix multiply: 2x2, 3x3, 4x4, 6x6
BENCHMARK(BM_MatMul<float, 2>);
BENCHMARK(BM_MatMul<float, 3>);
BENCHMARK(BM_MatMul<float, 4>);
BENCHMARK(BM_MatMul<float, 6>);
BENCHMARK(BM_MatMul<double, 2>);
BENCHMARK(BM_MatMul<double, 3>);
BENCHMARK(BM_MatMul<double, 4>);
BENCHMARK(BM_MatMul<double, 6>);
// Matrix-vector multiply
BENCHMARK(BM_MatVec<float, 3>);
BENCHMARK(BM_MatVec<float, 4>);
BENCHMARK(BM_MatVec<float, 6>);
BENCHMARK(BM_MatVec<double, 3>);
BENCHMARK(BM_MatVec<double, 4>);
BENCHMARK(BM_MatVec<double, 6>);
// Inverse
BENCHMARK(BM_Inverse<float, 2>);
BENCHMARK(BM_Inverse<float, 3>);
BENCHMARK(BM_Inverse<float, 4>);
BENCHMARK(BM_Inverse<double, 2>);
BENCHMARK(BM_Inverse<double, 3>);
BENCHMARK(BM_Inverse<double, 4>);
// Determinant
BENCHMARK(BM_Determinant<float, 2>);
BENCHMARK(BM_Determinant<float, 3>);
BENCHMARK(BM_Determinant<float, 4>);
BENCHMARK(BM_Determinant<double, 2>);
BENCHMARK(BM_Determinant<double, 3>);
BENCHMARK(BM_Determinant<double, 4>);
// LLT (Cholesky)
BENCHMARK(BM_LLT_Compute<float, 3>);
BENCHMARK(BM_LLT_Compute<float, 4>);
BENCHMARK(BM_LLT_Compute<float, 6>);
BENCHMARK(BM_LLT_Compute<double, 3>);
BENCHMARK(BM_LLT_Compute<double, 4>);
BENCHMARK(BM_LLT_Compute<double, 6>);
BENCHMARK(BM_LLT_Solve<double, 3>);
BENCHMARK(BM_LLT_Solve<double, 6>);
// LDLT
BENCHMARK(BM_LDLT_Compute<double, 3>);
BENCHMARK(BM_LDLT_Compute<double, 6>);
// PartialPivLU
BENCHMARK(BM_PartialPivLU_Compute<float, 3>);
BENCHMARK(BM_PartialPivLU_Compute<float, 4>);
BENCHMARK(BM_PartialPivLU_Compute<double, 3>);
BENCHMARK(BM_PartialPivLU_Compute<double, 4>);
BENCHMARK(BM_PartialPivLU_Solve<double, 3>);
BENCHMARK(BM_PartialPivLU_Solve<double, 4>);
// ColPivHouseholderQR
BENCHMARK(BM_ColPivQR_Compute<float, 3, 3>);
BENCHMARK(BM_ColPivQR_Compute<double, 3, 3>);
BENCHMARK(BM_ColPivQR_Compute<double, 6, 6>);
BENCHMARK(BM_ColPivQR_Compute<double, 8, 3>); // overdetermined least-squares
// JacobiSVD — the key CV sizes
BENCHMARK(BM_JacobiSVD_Compute<float, 2, 2>);
BENCHMARK(BM_JacobiSVD_Compute<float, 3, 3>);
BENCHMARK(BM_JacobiSVD_Compute<float, 4, 4>);
BENCHMARK(BM_JacobiSVD_Compute<double, 2, 2>);
BENCHMARK(BM_JacobiSVD_Compute<double, 3, 3>);
BENCHMARK(BM_JacobiSVD_Compute<double, 4, 4>);
BENCHMARK(BM_JacobiSVD_Compute<double, 3, 4>); // projection matrix
BENCHMARK(BM_JacobiSVD_Compute<double, 6, 6>); // manipulator Jacobian
BENCHMARK(BM_JacobiSVD_Compute<double, 8, 9>); // fundamental matrix (8-point)
BENCHMARK(BM_JacobiSVD_Solve<double, 3, 3>);
BENCHMARK(BM_JacobiSVD_Solve<double, 6, 6>);
// Values-only SVD (when you just need singular values)
BENCHMARK((BM_JacobiSVD_Compute<double, 3, 3, 0>));
BENCHMARK((BM_JacobiSVD_Compute<double, 6, 6, 0>));
// SelfAdjointEigenSolver — PCA, normal estimation
BENCHMARK(BM_SelfAdjointEig_Compute<float, 3>);
BENCHMARK(BM_SelfAdjointEig_Compute<float, 4>);
BENCHMARK(BM_SelfAdjointEig_Compute<double, 3>);
BENCHMARK(BM_SelfAdjointEig_Compute<double, 4>);
BENCHMARK(BM_SelfAdjointEig_Compute<double, 6>);
// SelfAdjointEigenSolver::computeDirect (closed-form, 2x2 and 3x3 only)
BENCHMARK(BM_SelfAdjointEig_ComputeDirect<float, 2>);
BENCHMARK(BM_SelfAdjointEig_ComputeDirect<float, 3>);
BENCHMARK(BM_SelfAdjointEig_ComputeDirect<double, 2>);
BENCHMARK(BM_SelfAdjointEig_ComputeDirect<double, 3>);

View File

@@ -6,11 +6,7 @@ if(EIGEN_BUILD_BLAS)
add_custom_target(blas)
set(EigenBlas_SRCS single.cpp double.cpp complex_single.cpp complex_double.cpp xerbla.cpp
f2c/srotm.c f2c/srotmg.c f2c/drotm.c f2c/drotmg.c
f2c/lsame.c f2c/dspmv.c f2c/ssbmv.c f2c/chbmv.c
f2c/sspmv.c f2c/zhbmv.c f2c/chpmv.c f2c/dsbmv.c
f2c/zhpmv.c f2c/dtbmv.c f2c/stbmv.c f2c/ctbmv.c
f2c/ztbmv.c f2c/complexdots.c
lsame.cpp complexdots.cpp
)
set(EIGEN_BLAS_TARGETS "")

72
blas/complexdots.cpp Normal file
View File

@@ -0,0 +1,72 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// C++ replacements for the f2c complex dot product wrappers.
// These are thin wrappers around the worker functions (cdotcw_, etc.)
// defined in level1_cplx_impl.h.
//
// Note: blas.h declares these as void, but gfortran expects complex functions
// to return by value. We define the correct signatures here and do not include
// blas.h to avoid the conflicting declarations.
#if defined(_WIN32)
#if defined(EIGEN_BLAS_BUILD_DLL)
#define EIGEN_BLAS_CDOT_API __declspec(dllexport)
#else
#define EIGEN_BLAS_CDOT_API
#endif
#elif ((defined(__GNUC__) && __GNUC__ >= 4) || defined(__clang__)) && defined(EIGEN_BLAS_BUILD_DLL)
#define EIGEN_BLAS_CDOT_API __attribute__((visibility("default")))
#else
#define EIGEN_BLAS_CDOT_API
#endif
extern "C" {
// Worker function declarations (defined in level1_cplx_impl.h via complex_single.cpp / complex_double.cpp).
void cdotcw_(int *n, float *cx, int *incx, float *cy, int *incy, float *res);
void cdotuw_(int *n, float *cx, int *incx, float *cy, int *incy, float *res);
void zdotcw_(int *n, double *cx, int *incx, double *cy, int *incy, double *res);
void zdotuw_(int *n, double *cx, int *incx, double *cy, int *incy, double *res);
// POD complex types for C-compatible return values (matches Fortran complex layout).
struct eigen_blas_complex_float {
float r, i;
};
struct eigen_blas_complex_double {
double r, i;
};
// CDOTC computes the conjugated dot product of two single-precision complex vectors.
EIGEN_BLAS_CDOT_API eigen_blas_complex_float cdotc_(int *n, float *cx, int *incx, float *cy, int *incy) {
eigen_blas_complex_float res = {0.0f, 0.0f};
cdotcw_(n, cx, incx, cy, incy, &res.r);
return res;
}
// CDOTU computes the unconjugated dot product of two single-precision complex vectors.
EIGEN_BLAS_CDOT_API eigen_blas_complex_float cdotu_(int *n, float *cx, int *incx, float *cy, int *incy) {
eigen_blas_complex_float res = {0.0f, 0.0f};
cdotuw_(n, cx, incx, cy, incy, &res.r);
return res;
}
// ZDOTC computes the conjugated dot product of two double-precision complex vectors.
EIGEN_BLAS_CDOT_API eigen_blas_complex_double zdotc_(int *n, double *cx, int *incx, double *cy, int *incy) {
eigen_blas_complex_double res = {0.0, 0.0};
zdotcw_(n, cx, incx, cy, incy, &res.r);
return res;
}
// ZDOTU computes the unconjugated dot product of two double-precision complex vectors.
EIGEN_BLAS_CDOT_API eigen_blas_complex_double zdotu_(int *n, double *cx, int *incx, double *cy, int *incy) {
eigen_blas_complex_double res = {0.0, 0.0};
zdotuw_(n, cx, incx, cy, incy, &res.r);
return res;
}
} // extern "C"

View File

@@ -1,456 +0,0 @@
/* chbmv.f -- translated by f2c (version 20100827).
You must link the resulting object file with libf2c:
on Microsoft Windows system, link with libf2c.lib;
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
or, if you install libf2c.a in a standard place, with -lf2c -lm
-- in that order, at the end of the command line, as in
cc *.o -lf2c -lm
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
http://www.netlib.org/f2c/libf2c.zip
*/
#include "datatypes.h"
static inline void r_cnjg(complex *r, complex *z) {
r->r = z->r;
r->i = -(z->i);
}
/* Subroutine */ void chbmv_(char *uplo, integer *n, integer *k, complex *alpha, complex *a, integer *lda, complex *x,
integer *incx, complex *beta, complex *y, integer *incy) {
/* System generated locals */
integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5;
real r__1;
complex q__1, q__2, q__3, q__4;
/* Local variables */
integer i__, j, l, ix, iy, jx, jy, kx, ky, info;
complex temp1, temp2;
extern logical lsame_(char *, char *);
integer kplus1;
extern /* Subroutine */ void xerbla_(const char *, integer *);
/* .. Scalar Arguments .. */
/* .. */
/* .. Array Arguments .. */
/* .. */
/* Purpose */
/* ======= */
/* CHBMV performs the matrix-vector operation */
/* y := alpha*A*x + beta*y, */
/* where alpha and beta are scalars, x and y are n element vectors and */
/* A is an n by n hermitian band matrix, with k super-diagonals. */
/* Arguments */
/* ========== */
/* UPLO - CHARACTER*1. */
/* On entry, UPLO specifies whether the upper or lower */
/* triangular part of the band matrix A is being supplied as */
/* follows: */
/* UPLO = 'U' or 'u' The upper triangular part of A is */
/* being supplied. */
/* UPLO = 'L' or 'l' The lower triangular part of A is */
/* being supplied. */
/* Unchanged on exit. */
/* N - INTEGER. */
/* On entry, N specifies the order of the matrix A. */
/* N must be at least zero. */
/* Unchanged on exit. */
/* K - INTEGER. */
/* On entry, K specifies the number of super-diagonals of the */
/* matrix A. K must satisfy 0 .le. K. */
/* Unchanged on exit. */
/* ALPHA - COMPLEX . */
/* On entry, ALPHA specifies the scalar alpha. */
/* Unchanged on exit. */
/* A - COMPLEX array of DIMENSION ( LDA, n ). */
/* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */
/* by n part of the array A must contain the upper triangular */
/* band part of the hermitian matrix, supplied column by */
/* column, with the leading diagonal of the matrix in row */
/* ( k + 1 ) of the array, the first super-diagonal starting at */
/* position 2 in row k, and so on. The top left k by k triangle */
/* of the array A is not referenced. */
/* The following program segment will transfer the upper */
/* triangular part of a hermitian band matrix from conventional */
/* full matrix storage to band storage: */
/* DO 20, J = 1, N */
/* M = K + 1 - J */
/* DO 10, I = MAX( 1, J - K ), J */
/* A( M + I, J ) = matrix( I, J ) */
/* 10 CONTINUE */
/* 20 CONTINUE */
/* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */
/* by n part of the array A must contain the lower triangular */
/* band part of the hermitian matrix, supplied column by */
/* column, with the leading diagonal of the matrix in row 1 of */
/* the array, the first sub-diagonal starting at position 1 in */
/* row 2, and so on. The bottom right k by k triangle of the */
/* array A is not referenced. */
/* The following program segment will transfer the lower */
/* triangular part of a hermitian band matrix from conventional */
/* full matrix storage to band storage: */
/* DO 20, J = 1, N */
/* M = 1 - J */
/* DO 10, I = J, MIN( N, J + K ) */
/* A( M + I, J ) = matrix( I, J ) */
/* 10 CONTINUE */
/* 20 CONTINUE */
/* Note that the imaginary parts of the diagonal elements need */
/* not be set and are assumed to be zero. */
/* Unchanged on exit. */
/* LDA - INTEGER. */
/* On entry, LDA specifies the first dimension of A as declared */
/* in the calling (sub) program. LDA must be at least */
/* ( k + 1 ). */
/* Unchanged on exit. */
/* X - COMPLEX array of DIMENSION at least */
/* ( 1 + ( n - 1 )*abs( INCX ) ). */
/* Before entry, the incremented array X must contain the */
/* vector x. */
/* Unchanged on exit. */
/* INCX - INTEGER. */
/* On entry, INCX specifies the increment for the elements of */
/* X. INCX must not be zero. */
/* Unchanged on exit. */
/* BETA - COMPLEX . */
/* On entry, BETA specifies the scalar beta. */
/* Unchanged on exit. */
/* Y - COMPLEX array of DIMENSION at least */
/* ( 1 + ( n - 1 )*abs( INCY ) ). */
/* Before entry, the incremented array Y must contain the */
/* vector y. On exit, Y is overwritten by the updated vector y. */
/* INCY - INTEGER. */
/* On entry, INCY specifies the increment for the elements of */
/* Y. INCY must not be zero. */
/* Unchanged on exit. */
/* Further Details */
/* =============== */
/* Level 2 Blas routine. */
/* -- Written on 22-October-1986. */
/* Jack Dongarra, Argonne National Lab. */
/* Jeremy Du Croz, Nag Central Office. */
/* Sven Hammarling, Nag Central Office. */
/* Richard Hanson, Sandia National Labs. */
/* ===================================================================== */
/* .. Parameters .. */
/* .. */
/* .. Local Scalars .. */
/* .. */
/* .. External Functions .. */
/* .. */
/* .. External Subroutines .. */
/* .. */
/* .. Intrinsic Functions .. */
/* .. */
/* Test the input parameters. */
/* Parameter adjustments */
a_dim1 = *lda;
a_offset = 1 + a_dim1;
a -= a_offset;
--x;
--y;
/* Function Body */
info = 0;
if (!lsame_(uplo, "U") && !lsame_(uplo, "L")) {
info = 1;
} else if (*n < 0) {
info = 2;
} else if (*k < 0) {
info = 3;
} else if (*lda < *k + 1) {
info = 6;
} else if (*incx == 0) {
info = 8;
} else if (*incy == 0) {
info = 11;
}
if (info != 0) {
xerbla_("CHBMV ", &info);
return;
}
/* Quick return if possible. */
if (*n == 0 || (alpha->r == 0.f && alpha->i == 0.f && (beta->r == 1.f && beta->i == 0.f))) {
return;
}
/* Set up the start points in X and Y. */
if (*incx > 0) {
kx = 1;
} else {
kx = 1 - (*n - 1) * *incx;
}
if (*incy > 0) {
ky = 1;
} else {
ky = 1 - (*n - 1) * *incy;
}
/* Start the operations. In this version the elements of the array A */
/* are accessed sequentially with one pass through A. */
/* First form y := beta*y. */
if (beta->r != 1.f || beta->i != 0.f) {
if (*incy == 1) {
if (beta->r == 0.f && beta->i == 0.f) {
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
i__2 = i__;
y[i__2].r = 0.f, y[i__2].i = 0.f;
/* L10: */
}
} else {
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
i__2 = i__;
i__3 = i__;
q__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i, q__1.i = beta->r * y[i__3].i + beta->i * y[i__3].r;
y[i__2].r = q__1.r, y[i__2].i = q__1.i;
/* L20: */
}
}
} else {
iy = ky;
if (beta->r == 0.f && beta->i == 0.f) {
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
i__2 = iy;
y[i__2].r = 0.f, y[i__2].i = 0.f;
iy += *incy;
/* L30: */
}
} else {
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
i__2 = iy;
i__3 = iy;
q__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i, q__1.i = beta->r * y[i__3].i + beta->i * y[i__3].r;
y[i__2].r = q__1.r, y[i__2].i = q__1.i;
iy += *incy;
/* L40: */
}
}
}
}
if (alpha->r == 0.f && alpha->i == 0.f) {
return;
}
if (lsame_(uplo, "U")) {
/* Form y when upper triangle of A is stored. */
kplus1 = *k + 1;
if (*incx == 1 && *incy == 1) {
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
i__2 = j;
q__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, q__1.i = alpha->r * x[i__2].i + alpha->i * x[i__2].r;
temp1.r = q__1.r, temp1.i = q__1.i;
temp2.r = 0.f, temp2.i = 0.f;
l = kplus1 - j;
/* Computing MAX */
i__2 = 1, i__3 = j - *k;
i__4 = j - 1;
for (i__ = max(i__2, i__3); i__ <= i__4; ++i__) {
i__2 = i__;
i__3 = i__;
i__5 = l + i__ + j * a_dim1;
q__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i, q__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5].r;
q__1.r = y[i__3].r + q__2.r, q__1.i = y[i__3].i + q__2.i;
y[i__2].r = q__1.r, y[i__2].i = q__1.i;
r_cnjg(&q__3, &a[l + i__ + j * a_dim1]);
i__2 = i__;
q__2.r = q__3.r * x[i__2].r - q__3.i * x[i__2].i, q__2.i = q__3.r * x[i__2].i + q__3.i * x[i__2].r;
q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i;
temp2.r = q__1.r, temp2.i = q__1.i;
/* L50: */
}
i__4 = j;
i__2 = j;
i__3 = kplus1 + j * a_dim1;
r__1 = a[i__3].r;
q__3.r = r__1 * temp1.r, q__3.i = r__1 * temp1.i;
q__2.r = y[i__2].r + q__3.r, q__2.i = y[i__2].i + q__3.i;
q__4.r = alpha->r * temp2.r - alpha->i * temp2.i, q__4.i = alpha->r * temp2.i + alpha->i * temp2.r;
q__1.r = q__2.r + q__4.r, q__1.i = q__2.i + q__4.i;
y[i__4].r = q__1.r, y[i__4].i = q__1.i;
/* L60: */
}
} else {
jx = kx;
jy = ky;
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
i__4 = jx;
q__1.r = alpha->r * x[i__4].r - alpha->i * x[i__4].i, q__1.i = alpha->r * x[i__4].i + alpha->i * x[i__4].r;
temp1.r = q__1.r, temp1.i = q__1.i;
temp2.r = 0.f, temp2.i = 0.f;
ix = kx;
iy = ky;
l = kplus1 - j;
/* Computing MAX */
i__4 = 1, i__2 = j - *k;
i__3 = j - 1;
for (i__ = max(i__4, i__2); i__ <= i__3; ++i__) {
i__4 = iy;
i__2 = iy;
i__5 = l + i__ + j * a_dim1;
q__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i, q__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5].r;
q__1.r = y[i__2].r + q__2.r, q__1.i = y[i__2].i + q__2.i;
y[i__4].r = q__1.r, y[i__4].i = q__1.i;
r_cnjg(&q__3, &a[l + i__ + j * a_dim1]);
i__4 = ix;
q__2.r = q__3.r * x[i__4].r - q__3.i * x[i__4].i, q__2.i = q__3.r * x[i__4].i + q__3.i * x[i__4].r;
q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i;
temp2.r = q__1.r, temp2.i = q__1.i;
ix += *incx;
iy += *incy;
/* L70: */
}
i__3 = jy;
i__4 = jy;
i__2 = kplus1 + j * a_dim1;
r__1 = a[i__2].r;
q__3.r = r__1 * temp1.r, q__3.i = r__1 * temp1.i;
q__2.r = y[i__4].r + q__3.r, q__2.i = y[i__4].i + q__3.i;
q__4.r = alpha->r * temp2.r - alpha->i * temp2.i, q__4.i = alpha->r * temp2.i + alpha->i * temp2.r;
q__1.r = q__2.r + q__4.r, q__1.i = q__2.i + q__4.i;
y[i__3].r = q__1.r, y[i__3].i = q__1.i;
jx += *incx;
jy += *incy;
if (j > *k) {
kx += *incx;
ky += *incy;
}
/* L80: */
}
}
} else {
/* Form y when lower triangle of A is stored. */
if (*incx == 1 && *incy == 1) {
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
i__3 = j;
q__1.r = alpha->r * x[i__3].r - alpha->i * x[i__3].i, q__1.i = alpha->r * x[i__3].i + alpha->i * x[i__3].r;
temp1.r = q__1.r, temp1.i = q__1.i;
temp2.r = 0.f, temp2.i = 0.f;
i__3 = j;
i__4 = j;
i__2 = j * a_dim1 + 1;
r__1 = a[i__2].r;
q__2.r = r__1 * temp1.r, q__2.i = r__1 * temp1.i;
q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i;
y[i__3].r = q__1.r, y[i__3].i = q__1.i;
l = 1 - j;
/* Computing MIN */
i__4 = *n, i__2 = j + *k;
i__3 = min(i__4, i__2);
for (i__ = j + 1; i__ <= i__3; ++i__) {
i__4 = i__;
i__2 = i__;
i__5 = l + i__ + j * a_dim1;
q__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i, q__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5].r;
q__1.r = y[i__2].r + q__2.r, q__1.i = y[i__2].i + q__2.i;
y[i__4].r = q__1.r, y[i__4].i = q__1.i;
r_cnjg(&q__3, &a[l + i__ + j * a_dim1]);
i__4 = i__;
q__2.r = q__3.r * x[i__4].r - q__3.i * x[i__4].i, q__2.i = q__3.r * x[i__4].i + q__3.i * x[i__4].r;
q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i;
temp2.r = q__1.r, temp2.i = q__1.i;
/* L90: */
}
i__3 = j;
i__4 = j;
q__2.r = alpha->r * temp2.r - alpha->i * temp2.i, q__2.i = alpha->r * temp2.i + alpha->i * temp2.r;
q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i;
y[i__3].r = q__1.r, y[i__3].i = q__1.i;
/* L100: */
}
} else {
jx = kx;
jy = ky;
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
i__3 = jx;
q__1.r = alpha->r * x[i__3].r - alpha->i * x[i__3].i, q__1.i = alpha->r * x[i__3].i + alpha->i * x[i__3].r;
temp1.r = q__1.r, temp1.i = q__1.i;
temp2.r = 0.f, temp2.i = 0.f;
i__3 = jy;
i__4 = jy;
i__2 = j * a_dim1 + 1;
r__1 = a[i__2].r;
q__2.r = r__1 * temp1.r, q__2.i = r__1 * temp1.i;
q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i;
y[i__3].r = q__1.r, y[i__3].i = q__1.i;
l = 1 - j;
ix = jx;
iy = jy;
/* Computing MIN */
i__4 = *n, i__2 = j + *k;
i__3 = min(i__4, i__2);
for (i__ = j + 1; i__ <= i__3; ++i__) {
ix += *incx;
iy += *incy;
i__4 = iy;
i__2 = iy;
i__5 = l + i__ + j * a_dim1;
q__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i, q__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5].r;
q__1.r = y[i__2].r + q__2.r, q__1.i = y[i__2].i + q__2.i;
y[i__4].r = q__1.r, y[i__4].i = q__1.i;
r_cnjg(&q__3, &a[l + i__ + j * a_dim1]);
i__4 = ix;
q__2.r = q__3.r * x[i__4].r - q__3.i * x[i__4].i, q__2.i = q__3.r * x[i__4].i + q__3.i * x[i__4].r;
q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i;
temp2.r = q__1.r, temp2.i = q__1.i;
/* L110: */
}
i__3 = jy;
i__4 = jy;
q__2.r = alpha->r * temp2.r - alpha->i * temp2.i, q__2.i = alpha->r * temp2.i + alpha->i * temp2.r;
q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i;
y[i__3].r = q__1.r, y[i__3].i = q__1.i;
jx += *incx;
jy += *incy;
/* L120: */
}
}
}
/* End of CHBMV . */
} /* chbmv_ */

View File

@@ -1,407 +0,0 @@
/* chpmv.f -- translated by f2c (version 20100827).
You must link the resulting object file with libf2c:
on Microsoft Windows system, link with libf2c.lib;
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
or, if you install libf2c.a in a standard place, with -lf2c -lm
-- in that order, at the end of the command line, as in
cc *.o -lf2c -lm
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
http://www.netlib.org/f2c/libf2c.zip
*/
#include "datatypes.h"
static inline void r_cnjg(complex *r, complex *z) {
r->r = z->r;
r->i = -(z->i);
}
/* Subroutine */ void chpmv_(char *uplo, integer *n, complex *alpha, complex *ap, complex *x, integer *incx,
complex *beta, complex *y, integer *incy) {
/* System generated locals */
integer i__1, i__2, i__3, i__4, i__5;
real r__1;
complex q__1, q__2, q__3, q__4;
/* Local variables */
integer i__, j, k, kk, ix, iy, jx, jy, kx, ky, info;
complex temp1, temp2;
extern logical lsame_(char *, char *);
extern /* Subroutine */ void xerbla_(const char *, integer *);
/* .. Scalar Arguments .. */
/* .. */
/* .. Array Arguments .. */
/* .. */
/* Purpose */
/* ======= */
/* CHPMV performs the matrix-vector operation */
/* y := alpha*A*x + beta*y, */
/* where alpha and beta are scalars, x and y are n element vectors and */
/* A is an n by n hermitian matrix, supplied in packed form. */
/* Arguments */
/* ========== */
/* UPLO - CHARACTER*1. */
/* On entry, UPLO specifies whether the upper or lower */
/* triangular part of the matrix A is supplied in the packed */
/* array AP as follows: */
/* UPLO = 'U' or 'u' The upper triangular part of A is */
/* supplied in AP. */
/* UPLO = 'L' or 'l' The lower triangular part of A is */
/* supplied in AP. */
/* Unchanged on exit. */
/* N - INTEGER. */
/* On entry, N specifies the order of the matrix A. */
/* N must be at least zero. */
/* Unchanged on exit. */
/* ALPHA - COMPLEX . */
/* On entry, ALPHA specifies the scalar alpha. */
/* Unchanged on exit. */
/* AP - COMPLEX array of DIMENSION at least */
/* ( ( n*( n + 1 ) )/2 ). */
/* Before entry with UPLO = 'U' or 'u', the array AP must */
/* contain the upper triangular part of the hermitian matrix */
/* packed sequentially, column by column, so that AP( 1 ) */
/* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) */
/* and a( 2, 2 ) respectively, and so on. */
/* Before entry with UPLO = 'L' or 'l', the array AP must */
/* contain the lower triangular part of the hermitian matrix */
/* packed sequentially, column by column, so that AP( 1 ) */
/* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) */
/* and a( 3, 1 ) respectively, and so on. */
/* Note that the imaginary parts of the diagonal elements need */
/* not be set and are assumed to be zero. */
/* Unchanged on exit. */
/* X - COMPLEX array of dimension at least */
/* ( 1 + ( n - 1 )*abs( INCX ) ). */
/* Before entry, the incremented array X must contain the n */
/* element vector x. */
/* Unchanged on exit. */
/* INCX - INTEGER. */
/* On entry, INCX specifies the increment for the elements of */
/* X. INCX must not be zero. */
/* Unchanged on exit. */
/* BETA - COMPLEX . */
/* On entry, BETA specifies the scalar beta. When BETA is */
/* supplied as zero then Y need not be set on input. */
/* Unchanged on exit. */
/* Y - COMPLEX array of dimension at least */
/* ( 1 + ( n - 1 )*abs( INCY ) ). */
/* Before entry, the incremented array Y must contain the n */
/* element vector y. On exit, Y is overwritten by the updated */
/* vector y. */
/* INCY - INTEGER. */
/* On entry, INCY specifies the increment for the elements of */
/* Y. INCY must not be zero. */
/* Unchanged on exit. */
/* Further Details */
/* =============== */
/* Level 2 Blas routine. */
/* -- Written on 22-October-1986. */
/* Jack Dongarra, Argonne National Lab. */
/* Jeremy Du Croz, Nag Central Office. */
/* Sven Hammarling, Nag Central Office. */
/* Richard Hanson, Sandia National Labs. */
/* ===================================================================== */
/* .. Parameters .. */
/* .. */
/* .. Local Scalars .. */
/* .. */
/* .. External Functions .. */
/* .. */
/* .. External Subroutines .. */
/* .. */
/* .. Intrinsic Functions .. */
/* .. */
/* Test the input parameters. */
/* Parameter adjustments */
--y;
--x;
--ap;
/* Function Body */
info = 0;
if (!lsame_(uplo, "U") && !lsame_(uplo, "L")) {
info = 1;
} else if (*n < 0) {
info = 2;
} else if (*incx == 0) {
info = 6;
} else if (*incy == 0) {
info = 9;
}
if (info != 0) {
xerbla_("CHPMV ", &info);
return;
}
/* Quick return if possible. */
if (*n == 0 || (alpha->r == 0.f && alpha->i == 0.f && (beta->r == 1.f && beta->i == 0.f))) {
return;
}
/* Set up the start points in X and Y. */
if (*incx > 0) {
kx = 1;
} else {
kx = 1 - (*n - 1) * *incx;
}
if (*incy > 0) {
ky = 1;
} else {
ky = 1 - (*n - 1) * *incy;
}
/* Start the operations. In this version the elements of the array AP */
/* are accessed sequentially with one pass through AP. */
/* First form y := beta*y. */
if (beta->r != 1.f || beta->i != 0.f) {
if (*incy == 1) {
if (beta->r == 0.f && beta->i == 0.f) {
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
i__2 = i__;
y[i__2].r = 0.f, y[i__2].i = 0.f;
/* L10: */
}
} else {
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
i__2 = i__;
i__3 = i__;
q__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i, q__1.i = beta->r * y[i__3].i + beta->i * y[i__3].r;
y[i__2].r = q__1.r, y[i__2].i = q__1.i;
/* L20: */
}
}
} else {
iy = ky;
if (beta->r == 0.f && beta->i == 0.f) {
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
i__2 = iy;
y[i__2].r = 0.f, y[i__2].i = 0.f;
iy += *incy;
/* L30: */
}
} else {
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
i__2 = iy;
i__3 = iy;
q__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i, q__1.i = beta->r * y[i__3].i + beta->i * y[i__3].r;
y[i__2].r = q__1.r, y[i__2].i = q__1.i;
iy += *incy;
/* L40: */
}
}
}
}
if (alpha->r == 0.f && alpha->i == 0.f) {
return;
}
kk = 1;
if (lsame_(uplo, "U")) {
/* Form y when AP contains the upper triangle. */
if (*incx == 1 && *incy == 1) {
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
i__2 = j;
q__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, q__1.i = alpha->r * x[i__2].i + alpha->i * x[i__2].r;
temp1.r = q__1.r, temp1.i = q__1.i;
temp2.r = 0.f, temp2.i = 0.f;
k = kk;
i__2 = j - 1;
for (i__ = 1; i__ <= i__2; ++i__) {
i__3 = i__;
i__4 = i__;
i__5 = k;
q__2.r = temp1.r * ap[i__5].r - temp1.i * ap[i__5].i, q__2.i = temp1.r * ap[i__5].i + temp1.i * ap[i__5].r;
q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i;
y[i__3].r = q__1.r, y[i__3].i = q__1.i;
r_cnjg(&q__3, &ap[k]);
i__3 = i__;
q__2.r = q__3.r * x[i__3].r - q__3.i * x[i__3].i, q__2.i = q__3.r * x[i__3].i + q__3.i * x[i__3].r;
q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i;
temp2.r = q__1.r, temp2.i = q__1.i;
++k;
/* L50: */
}
i__2 = j;
i__3 = j;
i__4 = kk + j - 1;
r__1 = ap[i__4].r;
q__3.r = r__1 * temp1.r, q__3.i = r__1 * temp1.i;
q__2.r = y[i__3].r + q__3.r, q__2.i = y[i__3].i + q__3.i;
q__4.r = alpha->r * temp2.r - alpha->i * temp2.i, q__4.i = alpha->r * temp2.i + alpha->i * temp2.r;
q__1.r = q__2.r + q__4.r, q__1.i = q__2.i + q__4.i;
y[i__2].r = q__1.r, y[i__2].i = q__1.i;
kk += j;
/* L60: */
}
} else {
jx = kx;
jy = ky;
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
i__2 = jx;
q__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, q__1.i = alpha->r * x[i__2].i + alpha->i * x[i__2].r;
temp1.r = q__1.r, temp1.i = q__1.i;
temp2.r = 0.f, temp2.i = 0.f;
ix = kx;
iy = ky;
i__2 = kk + j - 2;
for (k = kk; k <= i__2; ++k) {
i__3 = iy;
i__4 = iy;
i__5 = k;
q__2.r = temp1.r * ap[i__5].r - temp1.i * ap[i__5].i, q__2.i = temp1.r * ap[i__5].i + temp1.i * ap[i__5].r;
q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i;
y[i__3].r = q__1.r, y[i__3].i = q__1.i;
r_cnjg(&q__3, &ap[k]);
i__3 = ix;
q__2.r = q__3.r * x[i__3].r - q__3.i * x[i__3].i, q__2.i = q__3.r * x[i__3].i + q__3.i * x[i__3].r;
q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i;
temp2.r = q__1.r, temp2.i = q__1.i;
ix += *incx;
iy += *incy;
/* L70: */
}
i__2 = jy;
i__3 = jy;
i__4 = kk + j - 1;
r__1 = ap[i__4].r;
q__3.r = r__1 * temp1.r, q__3.i = r__1 * temp1.i;
q__2.r = y[i__3].r + q__3.r, q__2.i = y[i__3].i + q__3.i;
q__4.r = alpha->r * temp2.r - alpha->i * temp2.i, q__4.i = alpha->r * temp2.i + alpha->i * temp2.r;
q__1.r = q__2.r + q__4.r, q__1.i = q__2.i + q__4.i;
y[i__2].r = q__1.r, y[i__2].i = q__1.i;
jx += *incx;
jy += *incy;
kk += j;
/* L80: */
}
}
} else {
/* Form y when AP contains the lower triangle. */
if (*incx == 1 && *incy == 1) {
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
i__2 = j;
q__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, q__1.i = alpha->r * x[i__2].i + alpha->i * x[i__2].r;
temp1.r = q__1.r, temp1.i = q__1.i;
temp2.r = 0.f, temp2.i = 0.f;
i__2 = j;
i__3 = j;
i__4 = kk;
r__1 = ap[i__4].r;
q__2.r = r__1 * temp1.r, q__2.i = r__1 * temp1.i;
q__1.r = y[i__3].r + q__2.r, q__1.i = y[i__3].i + q__2.i;
y[i__2].r = q__1.r, y[i__2].i = q__1.i;
k = kk + 1;
i__2 = *n;
for (i__ = j + 1; i__ <= i__2; ++i__) {
i__3 = i__;
i__4 = i__;
i__5 = k;
q__2.r = temp1.r * ap[i__5].r - temp1.i * ap[i__5].i, q__2.i = temp1.r * ap[i__5].i + temp1.i * ap[i__5].r;
q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i;
y[i__3].r = q__1.r, y[i__3].i = q__1.i;
r_cnjg(&q__3, &ap[k]);
i__3 = i__;
q__2.r = q__3.r * x[i__3].r - q__3.i * x[i__3].i, q__2.i = q__3.r * x[i__3].i + q__3.i * x[i__3].r;
q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i;
temp2.r = q__1.r, temp2.i = q__1.i;
++k;
/* L90: */
}
i__2 = j;
i__3 = j;
q__2.r = alpha->r * temp2.r - alpha->i * temp2.i, q__2.i = alpha->r * temp2.i + alpha->i * temp2.r;
q__1.r = y[i__3].r + q__2.r, q__1.i = y[i__3].i + q__2.i;
y[i__2].r = q__1.r, y[i__2].i = q__1.i;
kk += *n - j + 1;
/* L100: */
}
} else {
jx = kx;
jy = ky;
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
i__2 = jx;
q__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, q__1.i = alpha->r * x[i__2].i + alpha->i * x[i__2].r;
temp1.r = q__1.r, temp1.i = q__1.i;
temp2.r = 0.f, temp2.i = 0.f;
i__2 = jy;
i__3 = jy;
i__4 = kk;
r__1 = ap[i__4].r;
q__2.r = r__1 * temp1.r, q__2.i = r__1 * temp1.i;
q__1.r = y[i__3].r + q__2.r, q__1.i = y[i__3].i + q__2.i;
y[i__2].r = q__1.r, y[i__2].i = q__1.i;
ix = jx;
iy = jy;
i__2 = kk + *n - j;
for (k = kk + 1; k <= i__2; ++k) {
ix += *incx;
iy += *incy;
i__3 = iy;
i__4 = iy;
i__5 = k;
q__2.r = temp1.r * ap[i__5].r - temp1.i * ap[i__5].i, q__2.i = temp1.r * ap[i__5].i + temp1.i * ap[i__5].r;
q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i;
y[i__3].r = q__1.r, y[i__3].i = q__1.i;
r_cnjg(&q__3, &ap[k]);
i__3 = ix;
q__2.r = q__3.r * x[i__3].r - q__3.i * x[i__3].i, q__2.i = q__3.r * x[i__3].i + q__3.i * x[i__3].r;
q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i;
temp2.r = q__1.r, temp2.i = q__1.i;
/* L110: */
}
i__2 = jy;
i__3 = jy;
q__2.r = alpha->r * temp2.r - alpha->i * temp2.i, q__2.i = alpha->r * temp2.i + alpha->i * temp2.r;
q__1.r = y[i__3].r + q__2.r, q__1.i = y[i__3].i + q__2.i;
y[i__2].r = q__1.r, y[i__2].i = q__1.i;
jx += *incx;
jy += *incy;
kk += *n - j + 1;
/* L120: */
}
}
}
/* End of CHPMV . */
} /* chpmv_ */

View File

@@ -1,73 +0,0 @@
/* This file has been modified to use the standard gfortran calling
convention, rather than the f2c calling convention.
It does not require -ff2c when compiled with gfortran.
*/
/* complexdots.f -- translated by f2c (version 20100827).
You must link the resulting object file with libf2c:
on Microsoft Windows system, link with libf2c.lib;
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
or, if you install libf2c.a in a standard place, with -lf2c -lm
-- in that order, at the end of the command line, as in
cc *.o -lf2c -lm
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
http://www.netlib.org/f2c/libf2c.zip
*/
#include "datatypes.h"
complex cdotc_(integer *n, complex *cx, integer *incx, complex *cy, integer *incy) {
complex res;
extern /* Subroutine */ void cdotcw_(integer *, complex *, integer *, complex *, integer *, complex *);
/* Parameter adjustments */
--cy;
--cx;
/* Function Body */
cdotcw_(n, &cx[1], incx, &cy[1], incy, &res);
return res;
} /* cdotc_ */
complex cdotu_(integer *n, complex *cx, integer *incx, complex *cy, integer *incy) {
complex res;
extern /* Subroutine */ void cdotuw_(integer *, complex *, integer *, complex *, integer *, complex *);
/* Parameter adjustments */
--cy;
--cx;
/* Function Body */
cdotuw_(n, &cx[1], incx, &cy[1], incy, &res);
return res;
} /* cdotu_ */
doublecomplex zdotc_(integer *n, doublecomplex *cx, integer *incx, doublecomplex *cy, integer *incy) {
doublecomplex res;
extern /* Subroutine */ void zdotcw_(integer *, doublecomplex *, integer *, doublecomplex *, integer *,
doublecomplex *);
/* Parameter adjustments */
--cy;
--cx;
/* Function Body */
zdotcw_(n, &cx[1], incx, &cy[1], incy, &res);
return res;
} /* zdotc_ */
doublecomplex zdotu_(integer *n, doublecomplex *cx, integer *incx, doublecomplex *cy, integer *incy) {
doublecomplex res;
extern /* Subroutine */ void zdotuw_(integer *, doublecomplex *, integer *, doublecomplex *, integer *,
doublecomplex *);
/* Parameter adjustments */
--cy;
--cx;
/* Function Body */
zdotuw_(n, &cx[1], incx, &cy[1], incy, &res);
return res;
} /* zdotu_ */

View File

@@ -1,586 +0,0 @@
/* ctbmv.f -- translated by f2c (version 20100827).
You must link the resulting object file with libf2c:
on Microsoft Windows system, link with libf2c.lib;
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
or, if you install libf2c.a in a standard place, with -lf2c -lm
-- in that order, at the end of the command line, as in
cc *.o -lf2c -lm
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
http://www.netlib.org/f2c/libf2c.zip
*/
#include "datatypes.h"
static inline void r_cnjg(complex *r, complex *z) {
r->r = z->r;
r->i = -(z->i);
}
/* Subroutine */ void ctbmv_(char *uplo, char *trans, char *diag, integer *n, integer *k, complex *a, integer *lda,
complex *x, integer *incx) {
/* System generated locals */
integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5;
complex q__1, q__2, q__3;
/* Local variables */
integer i__, j, l, ix, jx, kx, info;
complex temp;
extern logical lsame_(char *, char *);
integer kplus1;
extern /* Subroutine */ void xerbla_(const char *, integer *);
logical noconj, nounit;
/* .. Scalar Arguments .. */
/* .. */
/* .. Array Arguments .. */
/* .. */
/* Purpose */
/* ======= */
/* CTBMV performs one of the matrix-vector operations */
/* x := A*x, or x := A'*x, or x := conjg( A' )*x, */
/* where x is an n element vector and A is an n by n unit, or non-unit, */
/* upper or lower triangular band matrix, with ( k + 1 ) diagonals. */
/* Arguments */
/* ========== */
/* UPLO - CHARACTER*1. */
/* On entry, UPLO specifies whether the matrix is an upper or */
/* lower triangular matrix as follows: */
/* UPLO = 'U' or 'u' A is an upper triangular matrix. */
/* UPLO = 'L' or 'l' A is a lower triangular matrix. */
/* Unchanged on exit. */
/* TRANS - CHARACTER*1. */
/* On entry, TRANS specifies the operation to be performed as */
/* follows: */
/* TRANS = 'N' or 'n' x := A*x. */
/* TRANS = 'T' or 't' x := A'*x. */
/* TRANS = 'C' or 'c' x := conjg( A' )*x. */
/* Unchanged on exit. */
/* DIAG - CHARACTER*1. */
/* On entry, DIAG specifies whether or not A is unit */
/* triangular as follows: */
/* DIAG = 'U' or 'u' A is assumed to be unit triangular. */
/* DIAG = 'N' or 'n' A is not assumed to be unit */
/* triangular. */
/* Unchanged on exit. */
/* N - INTEGER. */
/* On entry, N specifies the order of the matrix A. */
/* N must be at least zero. */
/* Unchanged on exit. */
/* K - INTEGER. */
/* On entry with UPLO = 'U' or 'u', K specifies the number of */
/* super-diagonals of the matrix A. */
/* On entry with UPLO = 'L' or 'l', K specifies the number of */
/* sub-diagonals of the matrix A. */
/* K must satisfy 0 .le. K. */
/* Unchanged on exit. */
/* A - COMPLEX array of DIMENSION ( LDA, n ). */
/* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */
/* by n part of the array A must contain the upper triangular */
/* band part of the matrix of coefficients, supplied column by */
/* column, with the leading diagonal of the matrix in row */
/* ( k + 1 ) of the array, the first super-diagonal starting at */
/* position 2 in row k, and so on. The top left k by k triangle */
/* of the array A is not referenced. */
/* The following program segment will transfer an upper */
/* triangular band matrix from conventional full matrix storage */
/* to band storage: */
/* DO 20, J = 1, N */
/* M = K + 1 - J */
/* DO 10, I = MAX( 1, J - K ), J */
/* A( M + I, J ) = matrix( I, J ) */
/* 10 CONTINUE */
/* 20 CONTINUE */
/* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */
/* by n part of the array A must contain the lower triangular */
/* band part of the matrix of coefficients, supplied column by */
/* column, with the leading diagonal of the matrix in row 1 of */
/* the array, the first sub-diagonal starting at position 1 in */
/* row 2, and so on. The bottom right k by k triangle of the */
/* array A is not referenced. */
/* The following program segment will transfer a lower */
/* triangular band matrix from conventional full matrix storage */
/* to band storage: */
/* DO 20, J = 1, N */
/* M = 1 - J */
/* DO 10, I = J, MIN( N, J + K ) */
/* A( M + I, J ) = matrix( I, J ) */
/* 10 CONTINUE */
/* 20 CONTINUE */
/* Note that when DIAG = 'U' or 'u' the elements of the array A */
/* corresponding to the diagonal elements of the matrix are not */
/* referenced, but are assumed to be unity. */
/* Unchanged on exit. */
/* LDA - INTEGER. */
/* On entry, LDA specifies the first dimension of A as declared */
/* in the calling (sub) program. LDA must be at least */
/* ( k + 1 ). */
/* Unchanged on exit. */
/* X - COMPLEX array of dimension at least */
/* ( 1 + ( n - 1 )*abs( INCX ) ). */
/* Before entry, the incremented array X must contain the n */
/* element vector x. On exit, X is overwritten with the */
/* transformed vector x. */
/* INCX - INTEGER. */
/* On entry, INCX specifies the increment for the elements of */
/* X. INCX must not be zero. */
/* Unchanged on exit. */
/* Further Details */
/* =============== */
/* Level 2 Blas routine. */
/* -- Written on 22-October-1986. */
/* Jack Dongarra, Argonne National Lab. */
/* Jeremy Du Croz, Nag Central Office. */
/* Sven Hammarling, Nag Central Office. */
/* Richard Hanson, Sandia National Labs. */
/* ===================================================================== */
/* .. Parameters .. */
/* .. */
/* .. Local Scalars .. */
/* .. */
/* .. External Functions .. */
/* .. */
/* .. External Subroutines .. */
/* .. */
/* .. Intrinsic Functions .. */
/* .. */
/* Test the input parameters. */
/* Parameter adjustments */
a_dim1 = *lda;
a_offset = 1 + a_dim1;
a -= a_offset;
--x;
/* Function Body */
info = 0;
if (!lsame_(uplo, "U") && !lsame_(uplo, "L")) {
info = 1;
} else if (!lsame_(trans, "N") && !lsame_(trans, "T") && !lsame_(trans, "C")) {
info = 2;
} else if (!lsame_(diag, "U") && !lsame_(diag, "N")) {
info = 3;
} else if (*n < 0) {
info = 4;
} else if (*k < 0) {
info = 5;
} else if (*lda < *k + 1) {
info = 7;
} else if (*incx == 0) {
info = 9;
}
if (info != 0) {
xerbla_("CTBMV ", &info);
return;
}
/* Quick return if possible. */
if (*n == 0) {
return;
}
noconj = lsame_(trans, "T");
nounit = lsame_(diag, "N");
/* Set up the start point in X if the increment is not unity. This */
/* will be ( N - 1 )*INCX too small for descending loops. */
if (*incx <= 0) {
kx = 1 - (*n - 1) * *incx;
} else if (*incx != 1) {
kx = 1;
}
/* Start the operations. In this version the elements of A are */
/* accessed sequentially with one pass through A. */
if (lsame_(trans, "N")) {
/* Form x := A*x. */
if (lsame_(uplo, "U")) {
kplus1 = *k + 1;
if (*incx == 1) {
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
i__2 = j;
if (x[i__2].r != 0.f || x[i__2].i != 0.f) {
i__2 = j;
temp.r = x[i__2].r, temp.i = x[i__2].i;
l = kplus1 - j;
/* Computing MAX */
i__2 = 1, i__3 = j - *k;
i__4 = j - 1;
for (i__ = max(i__2, i__3); i__ <= i__4; ++i__) {
i__2 = i__;
i__3 = i__;
i__5 = l + i__ + j * a_dim1;
q__2.r = temp.r * a[i__5].r - temp.i * a[i__5].i, q__2.i = temp.r * a[i__5].i + temp.i * a[i__5].r;
q__1.r = x[i__3].r + q__2.r, q__1.i = x[i__3].i + q__2.i;
x[i__2].r = q__1.r, x[i__2].i = q__1.i;
/* L10: */
}
if (nounit) {
i__4 = j;
i__2 = j;
i__3 = kplus1 + j * a_dim1;
q__1.r = x[i__2].r * a[i__3].r - x[i__2].i * a[i__3].i,
q__1.i = x[i__2].r * a[i__3].i + x[i__2].i * a[i__3].r;
x[i__4].r = q__1.r, x[i__4].i = q__1.i;
}
}
/* L20: */
}
} else {
jx = kx;
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
i__4 = jx;
if (x[i__4].r != 0.f || x[i__4].i != 0.f) {
i__4 = jx;
temp.r = x[i__4].r, temp.i = x[i__4].i;
ix = kx;
l = kplus1 - j;
/* Computing MAX */
i__4 = 1, i__2 = j - *k;
i__3 = j - 1;
for (i__ = max(i__4, i__2); i__ <= i__3; ++i__) {
i__4 = ix;
i__2 = ix;
i__5 = l + i__ + j * a_dim1;
q__2.r = temp.r * a[i__5].r - temp.i * a[i__5].i, q__2.i = temp.r * a[i__5].i + temp.i * a[i__5].r;
q__1.r = x[i__2].r + q__2.r, q__1.i = x[i__2].i + q__2.i;
x[i__4].r = q__1.r, x[i__4].i = q__1.i;
ix += *incx;
/* L30: */
}
if (nounit) {
i__3 = jx;
i__4 = jx;
i__2 = kplus1 + j * a_dim1;
q__1.r = x[i__4].r * a[i__2].r - x[i__4].i * a[i__2].i,
q__1.i = x[i__4].r * a[i__2].i + x[i__4].i * a[i__2].r;
x[i__3].r = q__1.r, x[i__3].i = q__1.i;
}
}
jx += *incx;
if (j > *k) {
kx += *incx;
}
/* L40: */
}
}
} else {
if (*incx == 1) {
for (j = *n; j >= 1; --j) {
i__1 = j;
if (x[i__1].r != 0.f || x[i__1].i != 0.f) {
i__1 = j;
temp.r = x[i__1].r, temp.i = x[i__1].i;
l = 1 - j;
/* Computing MIN */
i__1 = *n, i__3 = j + *k;
i__4 = j + 1;
for (i__ = min(i__1, i__3); i__ >= i__4; --i__) {
i__1 = i__;
i__3 = i__;
i__2 = l + i__ + j * a_dim1;
q__2.r = temp.r * a[i__2].r - temp.i * a[i__2].i, q__2.i = temp.r * a[i__2].i + temp.i * a[i__2].r;
q__1.r = x[i__3].r + q__2.r, q__1.i = x[i__3].i + q__2.i;
x[i__1].r = q__1.r, x[i__1].i = q__1.i;
/* L50: */
}
if (nounit) {
i__4 = j;
i__1 = j;
i__3 = j * a_dim1 + 1;
q__1.r = x[i__1].r * a[i__3].r - x[i__1].i * a[i__3].i,
q__1.i = x[i__1].r * a[i__3].i + x[i__1].i * a[i__3].r;
x[i__4].r = q__1.r, x[i__4].i = q__1.i;
}
}
/* L60: */
}
} else {
kx += (*n - 1) * *incx;
jx = kx;
for (j = *n; j >= 1; --j) {
i__4 = jx;
if (x[i__4].r != 0.f || x[i__4].i != 0.f) {
i__4 = jx;
temp.r = x[i__4].r, temp.i = x[i__4].i;
ix = kx;
l = 1 - j;
/* Computing MIN */
i__4 = *n, i__1 = j + *k;
i__3 = j + 1;
for (i__ = min(i__4, i__1); i__ >= i__3; --i__) {
i__4 = ix;
i__1 = ix;
i__2 = l + i__ + j * a_dim1;
q__2.r = temp.r * a[i__2].r - temp.i * a[i__2].i, q__2.i = temp.r * a[i__2].i + temp.i * a[i__2].r;
q__1.r = x[i__1].r + q__2.r, q__1.i = x[i__1].i + q__2.i;
x[i__4].r = q__1.r, x[i__4].i = q__1.i;
ix -= *incx;
/* L70: */
}
if (nounit) {
i__3 = jx;
i__4 = jx;
i__1 = j * a_dim1 + 1;
q__1.r = x[i__4].r * a[i__1].r - x[i__4].i * a[i__1].i,
q__1.i = x[i__4].r * a[i__1].i + x[i__4].i * a[i__1].r;
x[i__3].r = q__1.r, x[i__3].i = q__1.i;
}
}
jx -= *incx;
if (*n - j >= *k) {
kx -= *incx;
}
/* L80: */
}
}
}
} else {
/* Form x := A'*x or x := conjg( A' )*x. */
if (lsame_(uplo, "U")) {
kplus1 = *k + 1;
if (*incx == 1) {
for (j = *n; j >= 1; --j) {
i__3 = j;
temp.r = x[i__3].r, temp.i = x[i__3].i;
l = kplus1 - j;
if (noconj) {
if (nounit) {
i__3 = kplus1 + j * a_dim1;
q__1.r = temp.r * a[i__3].r - temp.i * a[i__3].i, q__1.i = temp.r * a[i__3].i + temp.i * a[i__3].r;
temp.r = q__1.r, temp.i = q__1.i;
}
/* Computing MAX */
i__4 = 1, i__1 = j - *k;
i__3 = max(i__4, i__1);
for (i__ = j - 1; i__ >= i__3; --i__) {
i__4 = l + i__ + j * a_dim1;
i__1 = i__;
q__2.r = a[i__4].r * x[i__1].r - a[i__4].i * x[i__1].i,
q__2.i = a[i__4].r * x[i__1].i + a[i__4].i * x[i__1].r;
q__1.r = temp.r + q__2.r, q__1.i = temp.i + q__2.i;
temp.r = q__1.r, temp.i = q__1.i;
/* L90: */
}
} else {
if (nounit) {
r_cnjg(&q__2, &a[kplus1 + j * a_dim1]);
q__1.r = temp.r * q__2.r - temp.i * q__2.i, q__1.i = temp.r * q__2.i + temp.i * q__2.r;
temp.r = q__1.r, temp.i = q__1.i;
}
/* Computing MAX */
i__4 = 1, i__1 = j - *k;
i__3 = max(i__4, i__1);
for (i__ = j - 1; i__ >= i__3; --i__) {
r_cnjg(&q__3, &a[l + i__ + j * a_dim1]);
i__4 = i__;
q__2.r = q__3.r * x[i__4].r - q__3.i * x[i__4].i, q__2.i = q__3.r * x[i__4].i + q__3.i * x[i__4].r;
q__1.r = temp.r + q__2.r, q__1.i = temp.i + q__2.i;
temp.r = q__1.r, temp.i = q__1.i;
/* L100: */
}
}
i__3 = j;
x[i__3].r = temp.r, x[i__3].i = temp.i;
/* L110: */
}
} else {
kx += (*n - 1) * *incx;
jx = kx;
for (j = *n; j >= 1; --j) {
i__3 = jx;
temp.r = x[i__3].r, temp.i = x[i__3].i;
kx -= *incx;
ix = kx;
l = kplus1 - j;
if (noconj) {
if (nounit) {
i__3 = kplus1 + j * a_dim1;
q__1.r = temp.r * a[i__3].r - temp.i * a[i__3].i, q__1.i = temp.r * a[i__3].i + temp.i * a[i__3].r;
temp.r = q__1.r, temp.i = q__1.i;
}
/* Computing MAX */
i__4 = 1, i__1 = j - *k;
i__3 = max(i__4, i__1);
for (i__ = j - 1; i__ >= i__3; --i__) {
i__4 = l + i__ + j * a_dim1;
i__1 = ix;
q__2.r = a[i__4].r * x[i__1].r - a[i__4].i * x[i__1].i,
q__2.i = a[i__4].r * x[i__1].i + a[i__4].i * x[i__1].r;
q__1.r = temp.r + q__2.r, q__1.i = temp.i + q__2.i;
temp.r = q__1.r, temp.i = q__1.i;
ix -= *incx;
/* L120: */
}
} else {
if (nounit) {
r_cnjg(&q__2, &a[kplus1 + j * a_dim1]);
q__1.r = temp.r * q__2.r - temp.i * q__2.i, q__1.i = temp.r * q__2.i + temp.i * q__2.r;
temp.r = q__1.r, temp.i = q__1.i;
}
/* Computing MAX */
i__4 = 1, i__1 = j - *k;
i__3 = max(i__4, i__1);
for (i__ = j - 1; i__ >= i__3; --i__) {
r_cnjg(&q__3, &a[l + i__ + j * a_dim1]);
i__4 = ix;
q__2.r = q__3.r * x[i__4].r - q__3.i * x[i__4].i, q__2.i = q__3.r * x[i__4].i + q__3.i * x[i__4].r;
q__1.r = temp.r + q__2.r, q__1.i = temp.i + q__2.i;
temp.r = q__1.r, temp.i = q__1.i;
ix -= *incx;
/* L130: */
}
}
i__3 = jx;
x[i__3].r = temp.r, x[i__3].i = temp.i;
jx -= *incx;
/* L140: */
}
}
} else {
if (*incx == 1) {
i__3 = *n;
for (j = 1; j <= i__3; ++j) {
i__4 = j;
temp.r = x[i__4].r, temp.i = x[i__4].i;
l = 1 - j;
if (noconj) {
if (nounit) {
i__4 = j * a_dim1 + 1;
q__1.r = temp.r * a[i__4].r - temp.i * a[i__4].i, q__1.i = temp.r * a[i__4].i + temp.i * a[i__4].r;
temp.r = q__1.r, temp.i = q__1.i;
}
/* Computing MIN */
i__1 = *n, i__2 = j + *k;
i__4 = min(i__1, i__2);
for (i__ = j + 1; i__ <= i__4; ++i__) {
i__1 = l + i__ + j * a_dim1;
i__2 = i__;
q__2.r = a[i__1].r * x[i__2].r - a[i__1].i * x[i__2].i,
q__2.i = a[i__1].r * x[i__2].i + a[i__1].i * x[i__2].r;
q__1.r = temp.r + q__2.r, q__1.i = temp.i + q__2.i;
temp.r = q__1.r, temp.i = q__1.i;
/* L150: */
}
} else {
if (nounit) {
r_cnjg(&q__2, &a[j * a_dim1 + 1]);
q__1.r = temp.r * q__2.r - temp.i * q__2.i, q__1.i = temp.r * q__2.i + temp.i * q__2.r;
temp.r = q__1.r, temp.i = q__1.i;
}
/* Computing MIN */
i__1 = *n, i__2 = j + *k;
i__4 = min(i__1, i__2);
for (i__ = j + 1; i__ <= i__4; ++i__) {
r_cnjg(&q__3, &a[l + i__ + j * a_dim1]);
i__1 = i__;
q__2.r = q__3.r * x[i__1].r - q__3.i * x[i__1].i, q__2.i = q__3.r * x[i__1].i + q__3.i * x[i__1].r;
q__1.r = temp.r + q__2.r, q__1.i = temp.i + q__2.i;
temp.r = q__1.r, temp.i = q__1.i;
/* L160: */
}
}
i__4 = j;
x[i__4].r = temp.r, x[i__4].i = temp.i;
/* L170: */
}
} else {
jx = kx;
i__3 = *n;
for (j = 1; j <= i__3; ++j) {
i__4 = jx;
temp.r = x[i__4].r, temp.i = x[i__4].i;
kx += *incx;
ix = kx;
l = 1 - j;
if (noconj) {
if (nounit) {
i__4 = j * a_dim1 + 1;
q__1.r = temp.r * a[i__4].r - temp.i * a[i__4].i, q__1.i = temp.r * a[i__4].i + temp.i * a[i__4].r;
temp.r = q__1.r, temp.i = q__1.i;
}
/* Computing MIN */
i__1 = *n, i__2 = j + *k;
i__4 = min(i__1, i__2);
for (i__ = j + 1; i__ <= i__4; ++i__) {
i__1 = l + i__ + j * a_dim1;
i__2 = ix;
q__2.r = a[i__1].r * x[i__2].r - a[i__1].i * x[i__2].i,
q__2.i = a[i__1].r * x[i__2].i + a[i__1].i * x[i__2].r;
q__1.r = temp.r + q__2.r, q__1.i = temp.i + q__2.i;
temp.r = q__1.r, temp.i = q__1.i;
ix += *incx;
/* L180: */
}
} else {
if (nounit) {
r_cnjg(&q__2, &a[j * a_dim1 + 1]);
q__1.r = temp.r * q__2.r - temp.i * q__2.i, q__1.i = temp.r * q__2.i + temp.i * q__2.r;
temp.r = q__1.r, temp.i = q__1.i;
}
/* Computing MIN */
i__1 = *n, i__2 = j + *k;
i__4 = min(i__1, i__2);
for (i__ = j + 1; i__ <= i__4; ++i__) {
r_cnjg(&q__3, &a[l + i__ + j * a_dim1]);
i__1 = ix;
q__2.r = q__3.r * x[i__1].r - q__3.i * x[i__1].i, q__2.i = q__3.r * x[i__1].i + q__3.i * x[i__1].r;
q__1.r = temp.r + q__2.r, q__1.i = temp.i + q__2.i;
temp.r = q__1.r, temp.i = q__1.i;
ix += *incx;
/* L190: */
}
}
i__4 = jx;
x[i__4].r = temp.r, x[i__4].i = temp.i;
jx += *incx;
/* L200: */
}
}
}
}
/* End of CTBMV . */
} /* ctbmv_ */

View File

@@ -1,27 +0,0 @@
/* This contains a limited subset of the typedefs exposed by f2c
for use by the Eigen BLAS C-only implementation.
*/
#ifndef __EIGEN_DATATYPES_H__
#define __EIGEN_DATATYPES_H__
typedef int integer;
typedef unsigned int uinteger;
typedef float real;
typedef double doublereal;
typedef struct {
real r, i;
} complex;
typedef struct {
doublereal r, i;
} doublecomplex;
typedef int logical;
#define abs(x) ((x) >= 0 ? (x) : -(x))
#define dabs(x) (doublereal) abs(x)
#define min(a, b) ((a) <= (b) ? (a) : (b))
#define max(a, b) ((a) >= (b) ? (a) : (b))
#define dmin(a, b) (doublereal) min(a, b)
#define dmax(a, b) (doublereal) max(a, b)
#endif

View File

@@ -1,213 +0,0 @@
/* drotm.f -- translated by f2c (version 20100827).
You must link the resulting object file with libf2c:
on Microsoft Windows system, link with libf2c.lib;
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
or, if you install libf2c.a in a standard place, with -lf2c -lm
-- in that order, at the end of the command line, as in
cc *.o -lf2c -lm
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
http://www.netlib.org/f2c/libf2c.zip
*/
#include "datatypes.h"
/* Subroutine */ void drotm_(integer *n, doublereal *dx, integer *incx, doublereal *dy, integer *incy,
doublereal *dparam) {
/* Initialized data */
static doublereal zero = 0.;
static doublereal two = 2.;
/* System generated locals */
integer i__1, i__2;
/* Local variables */
integer i__;
doublereal w, z__;
integer kx, ky;
doublereal dh11, dh12, dh21, dh22, dflag;
integer nsteps;
/* .. Scalar Arguments .. */
/* .. */
/* .. Array Arguments .. */
/* .. */
/* Purpose */
/* ======= */
/* APPLY THE MODIFIED GIVENS TRANSFORMATION, H, TO THE 2 BY N MATRIX */
/* (DX**T) , WHERE **T INDICATES TRANSPOSE. THE ELEMENTS OF DX ARE IN */
/* (DY**T) */
/* DX(LX+I*INCX), I = 0 TO N-1, WHERE LX = 1 IF INCX .GE. 0, ELSE */
/* LX = (-INCX)*N, AND SIMILARLY FOR SY USING LY AND INCY. */
/* WITH DPARAM(1)=DFLAG, H HAS ONE OF THE FOLLOWING FORMS.. */
/* DFLAG=-1.D0 DFLAG=0.D0 DFLAG=1.D0 DFLAG=-2.D0 */
/* (DH11 DH12) (1.D0 DH12) (DH11 1.D0) (1.D0 0.D0) */
/* H=( ) ( ) ( ) ( ) */
/* (DH21 DH22), (DH21 1.D0), (-1.D0 DH22), (0.D0 1.D0). */
/* SEE DROTMG FOR A DESCRIPTION OF DATA STORAGE IN DPARAM. */
/* Arguments */
/* ========= */
/* N (input) INTEGER */
/* number of elements in input vector(s) */
/* DX (input/output) DOUBLE PRECISION array, dimension N */
/* double precision vector with N elements */
/* INCX (input) INTEGER */
/* storage spacing between elements of DX */
/* DY (input/output) DOUBLE PRECISION array, dimension N */
/* double precision vector with N elements */
/* INCY (input) INTEGER */
/* storage spacing between elements of DY */
/* DPARAM (input/output) DOUBLE PRECISION array, dimension 5 */
/* DPARAM(1)=DFLAG */
/* DPARAM(2)=DH11 */
/* DPARAM(3)=DH21 */
/* DPARAM(4)=DH12 */
/* DPARAM(5)=DH22 */
/* ===================================================================== */
/* .. Local Scalars .. */
/* .. */
/* .. Data statements .. */
/* Parameter adjustments */
--dparam;
--dy;
--dx;
/* Function Body */
/* .. */
dflag = dparam[1];
if (*n <= 0 || dflag + two == zero) {
goto L140;
}
if (!(*incx == *incy && *incx > 0)) {
goto L70;
}
nsteps = *n * *incx;
if (dflag < 0.) {
goto L50;
} else if (dflag == 0) {
goto L10;
} else {
goto L30;
}
L10:
dh12 = dparam[4];
dh21 = dparam[3];
i__1 = nsteps;
i__2 = *incx;
for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
w = dx[i__];
z__ = dy[i__];
dx[i__] = w + z__ * dh12;
dy[i__] = w * dh21 + z__;
/* L20: */
}
goto L140;
L30:
dh11 = dparam[2];
dh22 = dparam[5];
i__2 = nsteps;
i__1 = *incx;
for (i__ = 1; i__1 < 0 ? i__ >= i__2 : i__ <= i__2; i__ += i__1) {
w = dx[i__];
z__ = dy[i__];
dx[i__] = w * dh11 + z__;
dy[i__] = -w + dh22 * z__;
/* L40: */
}
goto L140;
L50:
dh11 = dparam[2];
dh12 = dparam[4];
dh21 = dparam[3];
dh22 = dparam[5];
i__1 = nsteps;
i__2 = *incx;
for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
w = dx[i__];
z__ = dy[i__];
dx[i__] = w * dh11 + z__ * dh12;
dy[i__] = w * dh21 + z__ * dh22;
/* L60: */
}
goto L140;
L70:
kx = 1;
ky = 1;
if (*incx < 0) {
kx = (1 - *n) * *incx + 1;
}
if (*incy < 0) {
ky = (1 - *n) * *incy + 1;
}
if (dflag < 0.) {
goto L120;
} else if (dflag == 0) {
goto L80;
} else {
goto L100;
}
L80:
dh12 = dparam[4];
dh21 = dparam[3];
i__2 = *n;
for (i__ = 1; i__ <= i__2; ++i__) {
w = dx[kx];
z__ = dy[ky];
dx[kx] = w + z__ * dh12;
dy[ky] = w * dh21 + z__;
kx += *incx;
ky += *incy;
/* L90: */
}
goto L140;
L100:
dh11 = dparam[2];
dh22 = dparam[5];
i__2 = *n;
for (i__ = 1; i__ <= i__2; ++i__) {
w = dx[kx];
z__ = dy[ky];
dx[kx] = w * dh11 + z__;
dy[ky] = -w + dh22 * z__;
kx += *incx;
ky += *incy;
/* L110: */
}
goto L140;
L120:
dh11 = dparam[2];
dh12 = dparam[4];
dh21 = dparam[3];
dh22 = dparam[5];
i__2 = *n;
for (i__ = 1; i__ <= i__2; ++i__) {
w = dx[kx];
z__ = dy[ky];
dx[kx] = w * dh11 + z__ * dh12;
dy[ky] = w * dh21 + z__ * dh22;
kx += *incx;
ky += *incy;
/* L130: */
}
L140:
return;
} /* drotm_ */

View File

@@ -1,293 +0,0 @@
/* drotmg.f -- translated by f2c (version 20100827).
You must link the resulting object file with libf2c:
on Microsoft Windows system, link with libf2c.lib;
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
or, if you install libf2c.a in a standard place, with -lf2c -lm
-- in that order, at the end of the command line, as in
cc *.o -lf2c -lm
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
http://www.netlib.org/f2c/libf2c.zip
*/
#include "datatypes.h"
/* Subroutine */ void drotmg_(doublereal *dd1, doublereal *dd2, doublereal *dx1, doublereal *dy1, doublereal *dparam) {
/* Initialized data */
static doublereal zero = 0.;
static doublereal one = 1.;
static doublereal two = 2.;
static doublereal gam = 4096.;
static doublereal gamsq = 16777216.;
static doublereal rgamsq = 5.9604645e-8;
/* Format strings */
static char fmt_120[] = "";
static char fmt_150[] = "";
static char fmt_180[] = "";
static char fmt_210[] = "";
/* System generated locals */
doublereal d__1;
/* Local variables */
doublereal du, dp1, dp2, dq1, dq2, dh11, dh12, dh21, dh22;
integer igo;
doublereal dflag, dtemp;
/* Assigned format variables */
static char *igo_fmt;
(void)igo_fmt;
/* .. Scalar Arguments .. */
/* .. */
/* .. Array Arguments .. */
/* .. */
/* Purpose */
/* ======= */
/* CONSTRUCT THE MODIFIED GIVENS TRANSFORMATION MATRIX H WHICH ZEROS */
/* THE SECOND COMPONENT OF THE 2-VECTOR (DSQRT(DD1)*DX1,DSQRT(DD2)* */
/* DY2)**T. */
/* WITH DPARAM(1)=DFLAG, H HAS ONE OF THE FOLLOWING FORMS.. */
/* DFLAG=-1.D0 DFLAG=0.D0 DFLAG=1.D0 DFLAG=-2.D0 */
/* (DH11 DH12) (1.D0 DH12) (DH11 1.D0) (1.D0 0.D0) */
/* H=( ) ( ) ( ) ( ) */
/* (DH21 DH22), (DH21 1.D0), (-1.D0 DH22), (0.D0 1.D0). */
/* LOCATIONS 2-4 OF DPARAM CONTAIN DH11, DH21, DH12, AND DH22 */
/* RESPECTIVELY. (VALUES OF 1.D0, -1.D0, OR 0.D0 IMPLIED BY THE */
/* VALUE OF DPARAM(1) ARE NOT STORED IN DPARAM.) */
/* THE VALUES OF GAMSQ AND RGAMSQ SET IN THE DATA STATEMENT MAY BE */
/* INEXACT. THIS IS OK AS THEY ARE ONLY USED FOR TESTING THE SIZE */
/* OF DD1 AND DD2. ALL ACTUAL SCALING OF DATA IS DONE USING GAM. */
/* Arguments */
/* ========= */
/* DD1 (input/output) DOUBLE PRECISION */
/* DD2 (input/output) DOUBLE PRECISION */
/* DX1 (input/output) DOUBLE PRECISION */
/* DY1 (input) DOUBLE PRECISION */
/* DPARAM (input/output) DOUBLE PRECISION array, dimension 5 */
/* DPARAM(1)=DFLAG */
/* DPARAM(2)=DH11 */
/* DPARAM(3)=DH21 */
/* DPARAM(4)=DH12 */
/* DPARAM(5)=DH22 */
/* ===================================================================== */
/* .. Local Scalars .. */
/* .. */
/* .. Intrinsic Functions .. */
/* .. */
/* .. Data statements .. */
/* Parameter adjustments */
--dparam;
/* Function Body */
/* .. */
if (!(*dd1 < zero)) {
goto L10;
}
/* GO ZERO-H-D-AND-DX1.. */
goto L60;
L10:
/* CASE-DD1-NONNEGATIVE */
dp2 = *dd2 * *dy1;
if (!(dp2 == zero)) {
goto L20;
}
dflag = -two;
goto L260;
/* REGULAR-CASE.. */
L20:
dp1 = *dd1 * *dx1;
dq2 = dp2 * *dy1;
dq1 = dp1 * *dx1;
if (!(abs(dq1) > abs(dq2))) {
goto L40;
}
dh21 = -(*dy1) / *dx1;
dh12 = dp2 / dp1;
du = one - dh12 * dh21;
if (!(du <= zero)) {
goto L30;
}
/* GO ZERO-H-D-AND-DX1.. */
goto L60;
L30:
dflag = zero;
*dd1 /= du;
*dd2 /= du;
*dx1 *= du;
/* GO SCALE-CHECK.. */
goto L100;
L40:
if (!(dq2 < zero)) {
goto L50;
}
/* GO ZERO-H-D-AND-DX1.. */
goto L60;
L50:
dflag = one;
dh11 = dp1 / dp2;
dh22 = *dx1 / *dy1;
du = one + dh11 * dh22;
dtemp = *dd2 / du;
*dd2 = *dd1 / du;
*dd1 = dtemp;
*dx1 = *dy1 * du;
/* GO SCALE-CHECK */
goto L100;
/* PROCEDURE..ZERO-H-D-AND-DX1.. */
L60:
dflag = -one;
dh11 = zero;
dh12 = zero;
dh21 = zero;
dh22 = zero;
*dd1 = zero;
*dd2 = zero;
*dx1 = zero;
/* RETURN.. */
goto L220;
/* PROCEDURE..FIX-H.. */
L70:
if (!(dflag >= zero)) {
goto L90;
}
if (!(dflag == zero)) {
goto L80;
}
dh11 = one;
dh22 = one;
dflag = -one;
goto L90;
L80:
dh21 = -one;
dh12 = one;
dflag = -one;
L90:
switch (igo) {
case 0:
goto L120;
case 1:
goto L150;
case 2:
goto L180;
case 3:
goto L210;
}
/* PROCEDURE..SCALE-CHECK */
L100:
L110:
if (!(*dd1 <= rgamsq)) {
goto L130;
}
if (*dd1 == zero) {
goto L160;
}
igo = 0;
igo_fmt = fmt_120;
/* FIX-H.. */
goto L70;
L120:
/* Computing 2nd power */
d__1 = gam;
*dd1 *= d__1 * d__1;
*dx1 /= gam;
dh11 /= gam;
dh12 /= gam;
goto L110;
L130:
L140:
if (!(*dd1 >= gamsq)) {
goto L160;
}
igo = 1;
igo_fmt = fmt_150;
/* FIX-H.. */
goto L70;
L150:
/* Computing 2nd power */
d__1 = gam;
*dd1 /= d__1 * d__1;
*dx1 *= gam;
dh11 *= gam;
dh12 *= gam;
goto L140;
L160:
L170:
if (!(abs(*dd2) <= rgamsq)) {
goto L190;
}
if (*dd2 == zero) {
goto L220;
}
igo = 2;
igo_fmt = fmt_180;
/* FIX-H.. */
goto L70;
L180:
/* Computing 2nd power */
d__1 = gam;
*dd2 *= d__1 * d__1;
dh21 /= gam;
dh22 /= gam;
goto L170;
L190:
L200:
if (!(abs(*dd2) >= gamsq)) {
goto L220;
}
igo = 3;
igo_fmt = fmt_210;
/* FIX-H.. */
goto L70;
L210:
/* Computing 2nd power */
d__1 = gam;
*dd2 /= d__1 * d__1;
dh21 *= gam;
dh22 *= gam;
goto L200;
L220:
if (dflag < 0.) {
goto L250;
} else if (dflag == 0) {
goto L230;
} else {
goto L240;
}
L230:
dparam[3] = dh21;
dparam[4] = dh12;
goto L260;
L240:
dparam[2] = dh11;
dparam[5] = dh22;
goto L260;
L250:
dparam[2] = dh11;
dparam[3] = dh21;
dparam[4] = dh12;
dparam[5] = dh22;
L260:
dparam[1] = dflag;
} /* drotmg_ */

View File

@@ -1,356 +0,0 @@
/* dsbmv.f -- translated by f2c (version 20100827).
You must link the resulting object file with libf2c:
on Microsoft Windows system, link with libf2c.lib;
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
or, if you install libf2c.a in a standard place, with -lf2c -lm
-- in that order, at the end of the command line, as in
cc *.o -lf2c -lm
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
http://www.netlib.org/f2c/libf2c.zip
*/
#include "datatypes.h"
/* Subroutine */ void dsbmv_(char *uplo, integer *n, integer *k, doublereal *alpha, doublereal *a, integer *lda,
doublereal *x, integer *incx, doublereal *beta, doublereal *y, integer *incy) {
/* System generated locals */
integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
/* Local variables */
integer i__, j, l, ix, iy, jx, jy, kx, ky, info;
doublereal temp1, temp2;
extern logical lsame_(char *, char *);
integer kplus1;
extern /* Subroutine */ void xerbla_(const char *, integer *);
/* .. Scalar Arguments .. */
/* .. */
/* .. Array Arguments .. */
/* .. */
/* Purpose */
/* ======= */
/* DSBMV performs the matrix-vector operation */
/* y := alpha*A*x + beta*y, */
/* where alpha and beta are scalars, x and y are n element vectors and */
/* A is an n by n symmetric band matrix, with k super-diagonals. */
/* Arguments */
/* ========== */
/* UPLO - CHARACTER*1. */
/* On entry, UPLO specifies whether the upper or lower */
/* triangular part of the band matrix A is being supplied as */
/* follows: */
/* UPLO = 'U' or 'u' The upper triangular part of A is */
/* being supplied. */
/* UPLO = 'L' or 'l' The lower triangular part of A is */
/* being supplied. */
/* Unchanged on exit. */
/* N - INTEGER. */
/* On entry, N specifies the order of the matrix A. */
/* N must be at least zero. */
/* Unchanged on exit. */
/* K - INTEGER. */
/* On entry, K specifies the number of super-diagonals of the */
/* matrix A. K must satisfy 0 .le. K. */
/* Unchanged on exit. */
/* ALPHA - DOUBLE PRECISION. */
/* On entry, ALPHA specifies the scalar alpha. */
/* Unchanged on exit. */
/* A - DOUBLE PRECISION array of DIMENSION ( LDA, n ). */
/* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */
/* by n part of the array A must contain the upper triangular */
/* band part of the symmetric matrix, supplied column by */
/* column, with the leading diagonal of the matrix in row */
/* ( k + 1 ) of the array, the first super-diagonal starting at */
/* position 2 in row k, and so on. The top left k by k triangle */
/* of the array A is not referenced. */
/* The following program segment will transfer the upper */
/* triangular part of a symmetric band matrix from conventional */
/* full matrix storage to band storage: */
/* DO 20, J = 1, N */
/* M = K + 1 - J */
/* DO 10, I = MAX( 1, J - K ), J */
/* A( M + I, J ) = matrix( I, J ) */
/* 10 CONTINUE */
/* 20 CONTINUE */
/* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */
/* by n part of the array A must contain the lower triangular */
/* band part of the symmetric matrix, supplied column by */
/* column, with the leading diagonal of the matrix in row 1 of */
/* the array, the first sub-diagonal starting at position 1 in */
/* row 2, and so on. The bottom right k by k triangle of the */
/* array A is not referenced. */
/* The following program segment will transfer the lower */
/* triangular part of a symmetric band matrix from conventional */
/* full matrix storage to band storage: */
/* DO 20, J = 1, N */
/* M = 1 - J */
/* DO 10, I = J, MIN( N, J + K ) */
/* A( M + I, J ) = matrix( I, J ) */
/* 10 CONTINUE */
/* 20 CONTINUE */
/* Unchanged on exit. */
/* LDA - INTEGER. */
/* On entry, LDA specifies the first dimension of A as declared */
/* in the calling (sub) program. LDA must be at least */
/* ( k + 1 ). */
/* Unchanged on exit. */
/* X - DOUBLE PRECISION array of DIMENSION at least */
/* ( 1 + ( n - 1 )*abs( INCX ) ). */
/* Before entry, the incremented array X must contain the */
/* vector x. */
/* Unchanged on exit. */
/* INCX - INTEGER. */
/* On entry, INCX specifies the increment for the elements of */
/* X. INCX must not be zero. */
/* Unchanged on exit. */
/* BETA - DOUBLE PRECISION. */
/* On entry, BETA specifies the scalar beta. */
/* Unchanged on exit. */
/* Y - DOUBLE PRECISION array of DIMENSION at least */
/* ( 1 + ( n - 1 )*abs( INCY ) ). */
/* Before entry, the incremented array Y must contain the */
/* vector y. On exit, Y is overwritten by the updated vector y. */
/* INCY - INTEGER. */
/* On entry, INCY specifies the increment for the elements of */
/* Y. INCY must not be zero. */
/* Unchanged on exit. */
/* Level 2 Blas routine. */
/* -- Written on 22-October-1986. */
/* Jack Dongarra, Argonne National Lab. */
/* Jeremy Du Croz, Nag Central Office. */
/* Sven Hammarling, Nag Central Office. */
/* Richard Hanson, Sandia National Labs. */
/* ===================================================================== */
/* .. Parameters .. */
/* .. */
/* .. Local Scalars .. */
/* .. */
/* .. External Functions .. */
/* .. */
/* .. External Subroutines .. */
/* .. */
/* .. Intrinsic Functions .. */
/* .. */
/* Test the input parameters. */
/* Parameter adjustments */
a_dim1 = *lda;
a_offset = 1 + a_dim1;
a -= a_offset;
--x;
--y;
/* Function Body */
info = 0;
if (!lsame_(uplo, "U") && !lsame_(uplo, "L")) {
info = 1;
} else if (*n < 0) {
info = 2;
} else if (*k < 0) {
info = 3;
} else if (*lda < *k + 1) {
info = 6;
} else if (*incx == 0) {
info = 8;
} else if (*incy == 0) {
info = 11;
}
if (info != 0) {
xerbla_("DSBMV ", &info);
return;
}
/* Quick return if possible. */
if (*n == 0 || (*alpha == 0. && *beta == 1.)) {
return;
}
/* Set up the start points in X and Y. */
if (*incx > 0) {
kx = 1;
} else {
kx = 1 - (*n - 1) * *incx;
}
if (*incy > 0) {
ky = 1;
} else {
ky = 1 - (*n - 1) * *incy;
}
/* Start the operations. In this version the elements of the array A */
/* are accessed sequentially with one pass through A. */
/* First form y := beta*y. */
if (*beta != 1.) {
if (*incy == 1) {
if (*beta == 0.) {
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
y[i__] = 0.;
/* L10: */
}
} else {
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
y[i__] = *beta * y[i__];
/* L20: */
}
}
} else {
iy = ky;
if (*beta == 0.) {
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
y[iy] = 0.;
iy += *incy;
/* L30: */
}
} else {
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
y[iy] = *beta * y[iy];
iy += *incy;
/* L40: */
}
}
}
}
if (*alpha == 0.) {
return;
}
if (lsame_(uplo, "U")) {
/* Form y when upper triangle of A is stored. */
kplus1 = *k + 1;
if (*incx == 1 && *incy == 1) {
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
temp1 = *alpha * x[j];
temp2 = 0.;
l = kplus1 - j;
/* Computing MAX */
i__2 = 1, i__3 = j - *k;
i__4 = j - 1;
for (i__ = max(i__2, i__3); i__ <= i__4; ++i__) {
y[i__] += temp1 * a[l + i__ + j * a_dim1];
temp2 += a[l + i__ + j * a_dim1] * x[i__];
/* L50: */
}
y[j] = y[j] + temp1 * a[kplus1 + j * a_dim1] + *alpha * temp2;
/* L60: */
}
} else {
jx = kx;
jy = ky;
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
temp1 = *alpha * x[jx];
temp2 = 0.;
ix = kx;
iy = ky;
l = kplus1 - j;
/* Computing MAX */
i__4 = 1, i__2 = j - *k;
i__3 = j - 1;
for (i__ = max(i__4, i__2); i__ <= i__3; ++i__) {
y[iy] += temp1 * a[l + i__ + j * a_dim1];
temp2 += a[l + i__ + j * a_dim1] * x[ix];
ix += *incx;
iy += *incy;
/* L70: */
}
y[jy] = y[jy] + temp1 * a[kplus1 + j * a_dim1] + *alpha * temp2;
jx += *incx;
jy += *incy;
if (j > *k) {
kx += *incx;
ky += *incy;
}
/* L80: */
}
}
} else {
/* Form y when lower triangle of A is stored. */
if (*incx == 1 && *incy == 1) {
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
temp1 = *alpha * x[j];
temp2 = 0.;
y[j] += temp1 * a[j * a_dim1 + 1];
l = 1 - j;
/* Computing MIN */
i__4 = *n, i__2 = j + *k;
i__3 = min(i__4, i__2);
for (i__ = j + 1; i__ <= i__3; ++i__) {
y[i__] += temp1 * a[l + i__ + j * a_dim1];
temp2 += a[l + i__ + j * a_dim1] * x[i__];
/* L90: */
}
y[j] += *alpha * temp2;
/* L100: */
}
} else {
jx = kx;
jy = ky;
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
temp1 = *alpha * x[jx];
temp2 = 0.;
y[jy] += temp1 * a[j * a_dim1 + 1];
l = 1 - j;
ix = jx;
iy = jy;
/* Computing MIN */
i__4 = *n, i__2 = j + *k;
i__3 = min(i__4, i__2);
for (i__ = j + 1; i__ <= i__3; ++i__) {
ix += *incx;
iy += *incy;
y[iy] += temp1 * a[l + i__ + j * a_dim1];
temp2 += a[l + i__ + j * a_dim1] * x[ix];
/* L110: */
}
y[jy] += *alpha * temp2;
jx += *incx;
jy += *incy;
/* L120: */
}
}
}
/* End of DSBMV . */
} /* dsbmv_ */

View File

@@ -1,308 +0,0 @@
/* dspmv.f -- translated by f2c (version 20100827).
You must link the resulting object file with libf2c:
on Microsoft Windows system, link with libf2c.lib;
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
or, if you install libf2c.a in a standard place, with -lf2c -lm
-- in that order, at the end of the command line, as in
cc *.o -lf2c -lm
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
http://www.netlib.org/f2c/libf2c.zip
*/
#include "datatypes.h"
/* Subroutine */ void dspmv_(char *uplo, integer *n, doublereal *alpha, doublereal *ap, doublereal *x, integer *incx,
doublereal *beta, doublereal *y, integer *incy) {
/* System generated locals */
integer i__1, i__2;
/* Local variables */
integer i__, j, k, kk, ix, iy, jx, jy, kx, ky, info;
doublereal temp1, temp2;
extern logical lsame_(char *, char *);
extern /* Subroutine */ void xerbla_(const char *, integer *);
/* .. Scalar Arguments .. */
/* .. */
/* .. Array Arguments .. */
/* .. */
/* Purpose */
/* ======= */
/* DSPMV performs the matrix-vector operation */
/* y := alpha*A*x + beta*y, */
/* where alpha and beta are scalars, x and y are n element vectors and */
/* A is an n by n symmetric matrix, supplied in packed form. */
/* Arguments */
/* ========== */
/* UPLO - CHARACTER*1. */
/* On entry, UPLO specifies whether the upper or lower */
/* triangular part of the matrix A is supplied in the packed */
/* array AP as follows: */
/* UPLO = 'U' or 'u' The upper triangular part of A is */
/* supplied in AP. */
/* UPLO = 'L' or 'l' The lower triangular part of A is */
/* supplied in AP. */
/* Unchanged on exit. */
/* N - INTEGER. */
/* On entry, N specifies the order of the matrix A. */
/* N must be at least zero. */
/* Unchanged on exit. */
/* ALPHA - DOUBLE PRECISION. */
/* On entry, ALPHA specifies the scalar alpha. */
/* Unchanged on exit. */
/* AP - DOUBLE PRECISION array of DIMENSION at least */
/* ( ( n*( n + 1 ) )/2 ). */
/* Before entry with UPLO = 'U' or 'u', the array AP must */
/* contain the upper triangular part of the symmetric matrix */
/* packed sequentially, column by column, so that AP( 1 ) */
/* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) */
/* and a( 2, 2 ) respectively, and so on. */
/* Before entry with UPLO = 'L' or 'l', the array AP must */
/* contain the lower triangular part of the symmetric matrix */
/* packed sequentially, column by column, so that AP( 1 ) */
/* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) */
/* and a( 3, 1 ) respectively, and so on. */
/* Unchanged on exit. */
/* X - DOUBLE PRECISION array of dimension at least */
/* ( 1 + ( n - 1 )*abs( INCX ) ). */
/* Before entry, the incremented array X must contain the n */
/* element vector x. */
/* Unchanged on exit. */
/* INCX - INTEGER. */
/* On entry, INCX specifies the increment for the elements of */
/* X. INCX must not be zero. */
/* Unchanged on exit. */
/* BETA - DOUBLE PRECISION. */
/* On entry, BETA specifies the scalar beta. When BETA is */
/* supplied as zero then Y need not be set on input. */
/* Unchanged on exit. */
/* Y - DOUBLE PRECISION array of dimension at least */
/* ( 1 + ( n - 1 )*abs( INCY ) ). */
/* Before entry, the incremented array Y must contain the n */
/* element vector y. On exit, Y is overwritten by the updated */
/* vector y. */
/* INCY - INTEGER. */
/* On entry, INCY specifies the increment for the elements of */
/* Y. INCY must not be zero. */
/* Unchanged on exit. */
/* Further Details */
/* =============== */
/* Level 2 Blas routine. */
/* -- Written on 22-October-1986. */
/* Jack Dongarra, Argonne National Lab. */
/* Jeremy Du Croz, Nag Central Office. */
/* Sven Hammarling, Nag Central Office. */
/* Richard Hanson, Sandia National Labs. */
/* ===================================================================== */
/* .. Parameters .. */
/* .. */
/* .. Local Scalars .. */
/* .. */
/* .. External Functions .. */
/* .. */
/* .. External Subroutines .. */
/* .. */
/* Test the input parameters. */
/* Parameter adjustments */
--y;
--x;
--ap;
/* Function Body */
info = 0;
if (!lsame_(uplo, "U") && !lsame_(uplo, "L")) {
info = 1;
} else if (*n < 0) {
info = 2;
} else if (*incx == 0) {
info = 6;
} else if (*incy == 0) {
info = 9;
}
if (info != 0) {
xerbla_("DSPMV ", &info);
return;
}
/* Quick return if possible. */
if (*n == 0 || (*alpha == 0. && *beta == 1.)) {
return;
}
/* Set up the start points in X and Y. */
if (*incx > 0) {
kx = 1;
} else {
kx = 1 - (*n - 1) * *incx;
}
if (*incy > 0) {
ky = 1;
} else {
ky = 1 - (*n - 1) * *incy;
}
/* Start the operations. In this version the elements of the array AP */
/* are accessed sequentially with one pass through AP. */
/* First form y := beta*y. */
if (*beta != 1.) {
if (*incy == 1) {
if (*beta == 0.) {
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
y[i__] = 0.;
/* L10: */
}
} else {
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
y[i__] = *beta * y[i__];
/* L20: */
}
}
} else {
iy = ky;
if (*beta == 0.) {
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
y[iy] = 0.;
iy += *incy;
/* L30: */
}
} else {
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
y[iy] = *beta * y[iy];
iy += *incy;
/* L40: */
}
}
}
}
if (*alpha == 0.) {
return;
}
kk = 1;
if (lsame_(uplo, "U")) {
/* Form y when AP contains the upper triangle. */
if (*incx == 1 && *incy == 1) {
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
temp1 = *alpha * x[j];
temp2 = 0.;
k = kk;
i__2 = j - 1;
for (i__ = 1; i__ <= i__2; ++i__) {
y[i__] += temp1 * ap[k];
temp2 += ap[k] * x[i__];
++k;
/* L50: */
}
y[j] = y[j] + temp1 * ap[kk + j - 1] + *alpha * temp2;
kk += j;
/* L60: */
}
} else {
jx = kx;
jy = ky;
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
temp1 = *alpha * x[jx];
temp2 = 0.;
ix = kx;
iy = ky;
i__2 = kk + j - 2;
for (k = kk; k <= i__2; ++k) {
y[iy] += temp1 * ap[k];
temp2 += ap[k] * x[ix];
ix += *incx;
iy += *incy;
/* L70: */
}
y[jy] = y[jy] + temp1 * ap[kk + j - 1] + *alpha * temp2;
jx += *incx;
jy += *incy;
kk += j;
/* L80: */
}
}
} else {
/* Form y when AP contains the lower triangle. */
if (*incx == 1 && *incy == 1) {
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
temp1 = *alpha * x[j];
temp2 = 0.;
y[j] += temp1 * ap[kk];
k = kk + 1;
i__2 = *n;
for (i__ = j + 1; i__ <= i__2; ++i__) {
y[i__] += temp1 * ap[k];
temp2 += ap[k] * x[i__];
++k;
/* L90: */
}
y[j] += *alpha * temp2;
kk += *n - j + 1;
/* L100: */
}
} else {
jx = kx;
jy = ky;
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
temp1 = *alpha * x[jx];
temp2 = 0.;
y[jy] += temp1 * ap[kk];
ix = jx;
iy = jy;
i__2 = kk + *n - j;
for (k = kk + 1; k <= i__2; ++k) {
ix += *incx;
iy += *incy;
y[iy] += temp1 * ap[k];
temp2 += ap[k] * x[ix];
/* L110: */
}
y[jy] += *alpha * temp2;
jx += *incx;
jy += *incy;
kk += *n - j + 1;
/* L120: */
}
}
}
/* End of DSPMV . */
} /* dspmv_ */

View File

@@ -1,417 +0,0 @@
/* dtbmv.f -- translated by f2c (version 20100827).
You must link the resulting object file with libf2c:
on Microsoft Windows system, link with libf2c.lib;
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
or, if you install libf2c.a in a standard place, with -lf2c -lm
-- in that order, at the end of the command line, as in
cc *.o -lf2c -lm
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
http://www.netlib.org/f2c/libf2c.zip
*/
#include "datatypes.h"
/* Subroutine */ void dtbmv_(char *uplo, char *trans, char *diag, integer *n, integer *k, doublereal *a, integer *lda,
doublereal *x, integer *incx) {
/* System generated locals */
integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
/* Local variables */
integer i__, j, l, ix, jx, kx, info;
doublereal temp;
extern logical lsame_(char *, char *);
integer kplus1;
extern /* Subroutine */ void xerbla_(const char *, integer *);
logical nounit;
/* .. Scalar Arguments .. */
/* .. */
/* .. Array Arguments .. */
/* .. */
/* Purpose */
/* ======= */
/* DTBMV performs one of the matrix-vector operations */
/* x := A*x, or x := A'*x, */
/* where x is an n element vector and A is an n by n unit, or non-unit, */
/* upper or lower triangular band matrix, with ( k + 1 ) diagonals. */
/* Arguments */
/* ========== */
/* UPLO - CHARACTER*1. */
/* On entry, UPLO specifies whether the matrix is an upper or */
/* lower triangular matrix as follows: */
/* UPLO = 'U' or 'u' A is an upper triangular matrix. */
/* UPLO = 'L' or 'l' A is a lower triangular matrix. */
/* Unchanged on exit. */
/* TRANS - CHARACTER*1. */
/* On entry, TRANS specifies the operation to be performed as */
/* follows: */
/* TRANS = 'N' or 'n' x := A*x. */
/* TRANS = 'T' or 't' x := A'*x. */
/* TRANS = 'C' or 'c' x := A'*x. */
/* Unchanged on exit. */
/* DIAG - CHARACTER*1. */
/* On entry, DIAG specifies whether or not A is unit */
/* triangular as follows: */
/* DIAG = 'U' or 'u' A is assumed to be unit triangular. */
/* DIAG = 'N' or 'n' A is not assumed to be unit */
/* triangular. */
/* Unchanged on exit. */
/* N - INTEGER. */
/* On entry, N specifies the order of the matrix A. */
/* N must be at least zero. */
/* Unchanged on exit. */
/* K - INTEGER. */
/* On entry with UPLO = 'U' or 'u', K specifies the number of */
/* super-diagonals of the matrix A. */
/* On entry with UPLO = 'L' or 'l', K specifies the number of */
/* sub-diagonals of the matrix A. */
/* K must satisfy 0 .le. K. */
/* Unchanged on exit. */
/* A - DOUBLE PRECISION array of DIMENSION ( LDA, n ). */
/* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */
/* by n part of the array A must contain the upper triangular */
/* band part of the matrix of coefficients, supplied column by */
/* column, with the leading diagonal of the matrix in row */
/* ( k + 1 ) of the array, the first super-diagonal starting at */
/* position 2 in row k, and so on. The top left k by k triangle */
/* of the array A is not referenced. */
/* The following program segment will transfer an upper */
/* triangular band matrix from conventional full matrix storage */
/* to band storage: */
/* DO 20, J = 1, N */
/* M = K + 1 - J */
/* DO 10, I = MAX( 1, J - K ), J */
/* A( M + I, J ) = matrix( I, J ) */
/* 10 CONTINUE */
/* 20 CONTINUE */
/* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */
/* by n part of the array A must contain the lower triangular */
/* band part of the matrix of coefficients, supplied column by */
/* column, with the leading diagonal of the matrix in row 1 of */
/* the array, the first sub-diagonal starting at position 1 in */
/* row 2, and so on. The bottom right k by k triangle of the */
/* array A is not referenced. */
/* The following program segment will transfer a lower */
/* triangular band matrix from conventional full matrix storage */
/* to band storage: */
/* DO 20, J = 1, N */
/* M = 1 - J */
/* DO 10, I = J, MIN( N, J + K ) */
/* A( M + I, J ) = matrix( I, J ) */
/* 10 CONTINUE */
/* 20 CONTINUE */
/* Note that when DIAG = 'U' or 'u' the elements of the array A */
/* corresponding to the diagonal elements of the matrix are not */
/* referenced, but are assumed to be unity. */
/* Unchanged on exit. */
/* LDA - INTEGER. */
/* On entry, LDA specifies the first dimension of A as declared */
/* in the calling (sub) program. LDA must be at least */
/* ( k + 1 ). */
/* Unchanged on exit. */
/* X - DOUBLE PRECISION array of dimension at least */
/* ( 1 + ( n - 1 )*abs( INCX ) ). */
/* Before entry, the incremented array X must contain the n */
/* element vector x. On exit, X is overwritten with the */
/* transformed vector x. */
/* INCX - INTEGER. */
/* On entry, INCX specifies the increment for the elements of */
/* X. INCX must not be zero. */
/* Unchanged on exit. */
/* Further Details */
/* =============== */
/* Level 2 Blas routine. */
/* -- Written on 22-October-1986. */
/* Jack Dongarra, Argonne National Lab. */
/* Jeremy Du Croz, Nag Central Office. */
/* Sven Hammarling, Nag Central Office. */
/* Richard Hanson, Sandia National Labs. */
/* ===================================================================== */
/* .. Parameters .. */
/* .. */
/* .. Local Scalars .. */
/* .. */
/* .. External Functions .. */
/* .. */
/* .. External Subroutines .. */
/* .. */
/* .. Intrinsic Functions .. */
/* .. */
/* Test the input parameters. */
/* Parameter adjustments */
a_dim1 = *lda;
a_offset = 1 + a_dim1;
a -= a_offset;
--x;
/* Function Body */
info = 0;
if (!lsame_(uplo, "U") && !lsame_(uplo, "L")) {
info = 1;
} else if (!lsame_(trans, "N") && !lsame_(trans, "T") && !lsame_(trans, "C")) {
info = 2;
} else if (!lsame_(diag, "U") && !lsame_(diag, "N")) {
info = 3;
} else if (*n < 0) {
info = 4;
} else if (*k < 0) {
info = 5;
} else if (*lda < *k + 1) {
info = 7;
} else if (*incx == 0) {
info = 9;
}
if (info != 0) {
xerbla_("DTBMV ", &info);
return;
}
/* Quick return if possible. */
if (*n == 0) {
return;
}
nounit = lsame_(diag, "N");
/* Set up the start point in X if the increment is not unity. This */
/* will be ( N - 1 )*INCX too small for descending loops. */
if (*incx <= 0) {
kx = 1 - (*n - 1) * *incx;
} else if (*incx != 1) {
kx = 1;
}
/* Start the operations. In this version the elements of A are */
/* accessed sequentially with one pass through A. */
if (lsame_(trans, "N")) {
/* Form x := A*x. */
if (lsame_(uplo, "U")) {
kplus1 = *k + 1;
if (*incx == 1) {
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
if (x[j] != 0.) {
temp = x[j];
l = kplus1 - j;
/* Computing MAX */
i__2 = 1, i__3 = j - *k;
i__4 = j - 1;
for (i__ = max(i__2, i__3); i__ <= i__4; ++i__) {
x[i__] += temp * a[l + i__ + j * a_dim1];
/* L10: */
}
if (nounit) {
x[j] *= a[kplus1 + j * a_dim1];
}
}
/* L20: */
}
} else {
jx = kx;
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
if (x[jx] != 0.) {
temp = x[jx];
ix = kx;
l = kplus1 - j;
/* Computing MAX */
i__4 = 1, i__2 = j - *k;
i__3 = j - 1;
for (i__ = max(i__4, i__2); i__ <= i__3; ++i__) {
x[ix] += temp * a[l + i__ + j * a_dim1];
ix += *incx;
/* L30: */
}
if (nounit) {
x[jx] *= a[kplus1 + j * a_dim1];
}
}
jx += *incx;
if (j > *k) {
kx += *incx;
}
/* L40: */
}
}
} else {
if (*incx == 1) {
for (j = *n; j >= 1; --j) {
if (x[j] != 0.) {
temp = x[j];
l = 1 - j;
/* Computing MIN */
i__1 = *n, i__3 = j + *k;
i__4 = j + 1;
for (i__ = min(i__1, i__3); i__ >= i__4; --i__) {
x[i__] += temp * a[l + i__ + j * a_dim1];
/* L50: */
}
if (nounit) {
x[j] *= a[j * a_dim1 + 1];
}
}
/* L60: */
}
} else {
kx += (*n - 1) * *incx;
jx = kx;
for (j = *n; j >= 1; --j) {
if (x[jx] != 0.) {
temp = x[jx];
ix = kx;
l = 1 - j;
/* Computing MIN */
i__4 = *n, i__1 = j + *k;
i__3 = j + 1;
for (i__ = min(i__4, i__1); i__ >= i__3; --i__) {
x[ix] += temp * a[l + i__ + j * a_dim1];
ix -= *incx;
/* L70: */
}
if (nounit) {
x[jx] *= a[j * a_dim1 + 1];
}
}
jx -= *incx;
if (*n - j >= *k) {
kx -= *incx;
}
/* L80: */
}
}
}
} else {
/* Form x := A'*x. */
if (lsame_(uplo, "U")) {
kplus1 = *k + 1;
if (*incx == 1) {
for (j = *n; j >= 1; --j) {
temp = x[j];
l = kplus1 - j;
if (nounit) {
temp *= a[kplus1 + j * a_dim1];
}
/* Computing MAX */
i__4 = 1, i__1 = j - *k;
i__3 = max(i__4, i__1);
for (i__ = j - 1; i__ >= i__3; --i__) {
temp += a[l + i__ + j * a_dim1] * x[i__];
/* L90: */
}
x[j] = temp;
/* L100: */
}
} else {
kx += (*n - 1) * *incx;
jx = kx;
for (j = *n; j >= 1; --j) {
temp = x[jx];
kx -= *incx;
ix = kx;
l = kplus1 - j;
if (nounit) {
temp *= a[kplus1 + j * a_dim1];
}
/* Computing MAX */
i__4 = 1, i__1 = j - *k;
i__3 = max(i__4, i__1);
for (i__ = j - 1; i__ >= i__3; --i__) {
temp += a[l + i__ + j * a_dim1] * x[ix];
ix -= *incx;
/* L110: */
}
x[jx] = temp;
jx -= *incx;
/* L120: */
}
}
} else {
if (*incx == 1) {
i__3 = *n;
for (j = 1; j <= i__3; ++j) {
temp = x[j];
l = 1 - j;
if (nounit) {
temp *= a[j * a_dim1 + 1];
}
/* Computing MIN */
i__1 = *n, i__2 = j + *k;
i__4 = min(i__1, i__2);
for (i__ = j + 1; i__ <= i__4; ++i__) {
temp += a[l + i__ + j * a_dim1] * x[i__];
/* L130: */
}
x[j] = temp;
/* L140: */
}
} else {
jx = kx;
i__3 = *n;
for (j = 1; j <= i__3; ++j) {
temp = x[jx];
kx += *incx;
ix = kx;
l = 1 - j;
if (nounit) {
temp *= a[j * a_dim1 + 1];
}
/* Computing MIN */
i__1 = *n, i__2 = j + *k;
i__4 = min(i__1, i__2);
for (i__ = j + 1; i__ <= i__4; ++i__) {
temp += a[l + i__ + j * a_dim1] * x[ix];
ix += *incx;
/* L150: */
}
x[jx] = temp;
jx += *incx;
/* L160: */
}
}
}
}
/* End of DTBMV . */
} /* dtbmv_ */

View File

@@ -1,109 +0,0 @@
/* lsame.f -- translated by f2c (version 20100827).
You must link the resulting object file with libf2c:
on Microsoft Windows system, link with libf2c.lib;
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
or, if you install libf2c.a in a standard place, with -lf2c -lm
-- in that order, at the end of the command line, as in
cc *.o -lf2c -lm
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
http://www.netlib.org/f2c/libf2c.zip
*/
#include "datatypes.h"
logical lsame_(char *ca, char *cb) {
/* System generated locals */
logical ret_val;
/* Local variables */
integer inta, intb, zcode;
/* -- LAPACK auxiliary routine (version 3.1) -- */
/* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. */
/* November 2006 */
/* .. Scalar Arguments .. */
/* .. */
/* Purpose */
/* ======= */
/* LSAME returns .TRUE. if CA is the same letter as CB regardless of */
/* case. */
/* Arguments */
/* ========= */
/* CA (input) CHARACTER*1 */
/* CB (input) CHARACTER*1 */
/* CA and CB specify the single characters to be compared. */
/* ===================================================================== */
/* .. Intrinsic Functions .. */
/* .. */
/* .. Local Scalars .. */
/* .. */
/* Test if the characters are equal */
ret_val = *(unsigned char *)ca == *(unsigned char *)cb;
if (ret_val) {
return ret_val;
}
/* Now test for equivalence if both characters are alphabetic. */
zcode = 'Z';
/* Use 'Z' rather than 'A' so that ASCII can be detected on Prime */
/* machines, on which ICHAR returns a value with bit 8 set. */
/* ICHAR('A') on Prime machines returns 193 which is the same as */
/* ICHAR('A') on an EBCDIC machine. */
inta = *(unsigned char *)ca;
intb = *(unsigned char *)cb;
if (zcode == 90 || zcode == 122) {
/* ASCII is assumed - ZCODE is the ASCII code of either lower or */
/* upper case 'Z'. */
if (inta >= 97 && inta <= 122) {
inta += -32;
}
if (intb >= 97 && intb <= 122) {
intb += -32;
}
} else if (zcode == 233 || zcode == 169) {
/* EBCDIC is assumed - ZCODE is the EBCDIC code of either lower or */
/* upper case 'Z'. */
if ((inta >= 129 && inta <= 137) || (inta >= 145 && inta <= 153) || (inta >= 162 && inta <= 169)) {
inta += 64;
}
if ((intb >= 129 && intb <= 137) || (intb >= 145 && intb <= 153) || (intb >= 162 && intb <= 169)) {
intb += 64;
}
} else if (zcode == 218 || zcode == 250) {
/* ASCII is assumed, on Prime machines - ZCODE is the ASCII code */
/* plus 128 of either lower or upper case 'Z'. */
if (inta >= 225 && inta <= 250) {
inta += -32;
}
if (intb >= 225 && intb <= 250) {
intb += -32;
}
}
ret_val = inta == intb;
/* RETURN */
/* End of LSAME */
return ret_val;
} /* lsame_ */

View File

@@ -1,212 +0,0 @@
/* srotm.f -- translated by f2c (version 20100827).
You must link the resulting object file with libf2c:
on Microsoft Windows system, link with libf2c.lib;
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
or, if you install libf2c.a in a standard place, with -lf2c -lm
-- in that order, at the end of the command line, as in
cc *.o -lf2c -lm
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
http://www.netlib.org/f2c/libf2c.zip
*/
#include "datatypes.h"
/* Subroutine */ void srotm_(integer *n, real *sx, integer *incx, real *sy, integer *incy, real *sparam) {
/* Initialized data */
static real zero = 0.f;
static real two = 2.f;
/* System generated locals */
integer i__1, i__2;
/* Local variables */
integer i__;
real w, z__;
integer kx, ky;
real sh11, sh12, sh21, sh22, sflag;
integer nsteps;
/* .. Scalar Arguments .. */
/* .. */
/* .. Array Arguments .. */
/* .. */
/* Purpose */
/* ======= */
/* APPLY THE MODIFIED GIVENS TRANSFORMATION, H, TO THE 2 BY N MATRIX */
/* (SX**T) , WHERE **T INDICATES TRANSPOSE. THE ELEMENTS OF SX ARE IN */
/* (DX**T) */
/* SX(LX+I*INCX), I = 0 TO N-1, WHERE LX = 1 IF INCX .GE. 0, ELSE */
/* LX = (-INCX)*N, AND SIMILARLY FOR SY USING USING LY AND INCY. */
/* WITH SPARAM(1)=SFLAG, H HAS ONE OF THE FOLLOWING FORMS.. */
/* SFLAG=-1.E0 SFLAG=0.E0 SFLAG=1.E0 SFLAG=-2.E0 */
/* (SH11 SH12) (1.E0 SH12) (SH11 1.E0) (1.E0 0.E0) */
/* H=( ) ( ) ( ) ( ) */
/* (SH21 SH22), (SH21 1.E0), (-1.E0 SH22), (0.E0 1.E0). */
/* SEE SROTMG FOR A DESCRIPTION OF DATA STORAGE IN SPARAM. */
/* Arguments */
/* ========= */
/* N (input) INTEGER */
/* number of elements in input vector(s) */
/* SX (input/output) REAL array, dimension N */
/* double precision vector with N elements */
/* INCX (input) INTEGER */
/* storage spacing between elements of SX */
/* SY (input/output) REAL array, dimension N */
/* double precision vector with N elements */
/* INCY (input) INTEGER */
/* storage spacing between elements of SY */
/* SPARAM (input/output) REAL array, dimension 5 */
/* SPARAM(1)=SFLAG */
/* SPARAM(2)=SH11 */
/* SPARAM(3)=SH21 */
/* SPARAM(4)=SH12 */
/* SPARAM(5)=SH22 */
/* ===================================================================== */
/* .. Local Scalars .. */
/* .. */
/* .. Data statements .. */
/* Parameter adjustments */
--sparam;
--sy;
--sx;
/* Function Body */
/* .. */
sflag = sparam[1];
if (*n <= 0 || sflag + two == zero) {
goto L140;
}
if (!(*incx == *incy && *incx > 0)) {
goto L70;
}
nsteps = *n * *incx;
if (sflag < 0.f) {
goto L50;
} else if (sflag == 0) {
goto L10;
} else {
goto L30;
}
L10:
sh12 = sparam[4];
sh21 = sparam[3];
i__1 = nsteps;
i__2 = *incx;
for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
w = sx[i__];
z__ = sy[i__];
sx[i__] = w + z__ * sh12;
sy[i__] = w * sh21 + z__;
/* L20: */
}
goto L140;
L30:
sh11 = sparam[2];
sh22 = sparam[5];
i__2 = nsteps;
i__1 = *incx;
for (i__ = 1; i__1 < 0 ? i__ >= i__2 : i__ <= i__2; i__ += i__1) {
w = sx[i__];
z__ = sy[i__];
sx[i__] = w * sh11 + z__;
sy[i__] = -w + sh22 * z__;
/* L40: */
}
goto L140;
L50:
sh11 = sparam[2];
sh12 = sparam[4];
sh21 = sparam[3];
sh22 = sparam[5];
i__1 = nsteps;
i__2 = *incx;
for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
w = sx[i__];
z__ = sy[i__];
sx[i__] = w * sh11 + z__ * sh12;
sy[i__] = w * sh21 + z__ * sh22;
/* L60: */
}
goto L140;
L70:
kx = 1;
ky = 1;
if (*incx < 0) {
kx = (1 - *n) * *incx + 1;
}
if (*incy < 0) {
ky = (1 - *n) * *incy + 1;
}
if (sflag < 0.f) {
goto L120;
} else if (sflag == 0) {
goto L80;
} else {
goto L100;
}
L80:
sh12 = sparam[4];
sh21 = sparam[3];
i__2 = *n;
for (i__ = 1; i__ <= i__2; ++i__) {
w = sx[kx];
z__ = sy[ky];
sx[kx] = w + z__ * sh12;
sy[ky] = w * sh21 + z__;
kx += *incx;
ky += *incy;
/* L90: */
}
goto L140;
L100:
sh11 = sparam[2];
sh22 = sparam[5];
i__2 = *n;
for (i__ = 1; i__ <= i__2; ++i__) {
w = sx[kx];
z__ = sy[ky];
sx[kx] = w * sh11 + z__;
sy[ky] = -w + sh22 * z__;
kx += *incx;
ky += *incy;
/* L110: */
}
goto L140;
L120:
sh11 = sparam[2];
sh12 = sparam[4];
sh21 = sparam[3];
sh22 = sparam[5];
i__2 = *n;
for (i__ = 1; i__ <= i__2; ++i__) {
w = sx[kx];
z__ = sy[ky];
sx[kx] = w * sh11 + z__ * sh12;
sy[ky] = w * sh21 + z__ * sh22;
kx += *incx;
ky += *incy;
/* L130: */
}
L140:
return;
} /* srotm_ */

View File

@@ -1,293 +0,0 @@
/* srotmg.f -- translated by f2c (version 20100827).
You must link the resulting object file with libf2c:
on Microsoft Windows system, link with libf2c.lib;
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
or, if you install libf2c.a in a standard place, with -lf2c -lm
-- in that order, at the end of the command line, as in
cc *.o -lf2c -lm
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
http://www.netlib.org/f2c/libf2c.zip
*/
#include "datatypes.h"
/* Subroutine */ void srotmg_(real *sd1, real *sd2, real *sx1, real *sy1, real *sparam) {
/* Initialized data */
static real zero = 0.f;
static real one = 1.f;
static real two = 2.f;
static real gam = 4096.f;
static real gamsq = 16777200.f;
static real rgamsq = 5.96046e-8f;
/* Format strings */
static char fmt_120[] = "";
static char fmt_150[] = "";
static char fmt_180[] = "";
static char fmt_210[] = "";
/* System generated locals */
real r__1;
/* Local variables */
real su, sp1, sp2, sq1, sq2, sh11, sh12, sh21, sh22;
integer igo;
real sflag, stemp;
/* Assigned format variables */
static char *igo_fmt;
(void)igo_fmt;
/* .. Scalar Arguments .. */
/* .. */
/* .. Array Arguments .. */
/* .. */
/* Purpose */
/* ======= */
/* CONSTRUCT THE MODIFIED GIVENS TRANSFORMATION MATRIX H WHICH ZEROS */
/* THE SECOND COMPONENT OF THE 2-VECTOR (SQRT(SD1)*SX1,SQRT(SD2)* */
/* SY2)**T. */
/* WITH SPARAM(1)=SFLAG, H HAS ONE OF THE FOLLOWING FORMS.. */
/* SFLAG=-1.E0 SFLAG=0.E0 SFLAG=1.E0 SFLAG=-2.E0 */
/* (SH11 SH12) (1.E0 SH12) (SH11 1.E0) (1.E0 0.E0) */
/* H=( ) ( ) ( ) ( ) */
/* (SH21 SH22), (SH21 1.E0), (-1.E0 SH22), (0.E0 1.E0). */
/* LOCATIONS 2-4 OF SPARAM CONTAIN SH11,SH21,SH12, AND SH22 */
/* RESPECTIVELY. (VALUES OF 1.E0, -1.E0, OR 0.E0 IMPLIED BY THE */
/* VALUE OF SPARAM(1) ARE NOT STORED IN SPARAM.) */
/* THE VALUES OF GAMSQ AND RGAMSQ SET IN THE DATA STATEMENT MAY BE */
/* INEXACT. THIS IS OK AS THEY ARE ONLY USED FOR TESTING THE SIZE */
/* OF SD1 AND SD2. ALL ACTUAL SCALING OF DATA IS DONE USING GAM. */
/* Arguments */
/* ========= */
/* SD1 (input/output) REAL */
/* SD2 (input/output) REAL */
/* SX1 (input/output) REAL */
/* SY1 (input) REAL */
/* SPARAM (input/output) REAL array, dimension 5 */
/* SPARAM(1)=SFLAG */
/* SPARAM(2)=SH11 */
/* SPARAM(3)=SH21 */
/* SPARAM(4)=SH12 */
/* SPARAM(5)=SH22 */
/* ===================================================================== */
/* .. Local Scalars .. */
/* .. */
/* .. Intrinsic Functions .. */
/* .. */
/* .. Data statements .. */
/* Parameter adjustments */
--sparam;
/* Function Body */
/* .. */
if (!(*sd1 < zero)) {
goto L10;
}
/* GO ZERO-H-D-AND-SX1.. */
goto L60;
L10:
/* CASE-SD1-NONNEGATIVE */
sp2 = *sd2 * *sy1;
if (!(sp2 == zero)) {
goto L20;
}
sflag = -two;
goto L260;
/* REGULAR-CASE.. */
L20:
sp1 = *sd1 * *sx1;
sq2 = sp2 * *sy1;
sq1 = sp1 * *sx1;
if (!(dabs(sq1) > dabs(sq2))) {
goto L40;
}
sh21 = -(*sy1) / *sx1;
sh12 = sp2 / sp1;
su = one - sh12 * sh21;
if (!(su <= zero)) {
goto L30;
}
/* GO ZERO-H-D-AND-SX1.. */
goto L60;
L30:
sflag = zero;
*sd1 /= su;
*sd2 /= su;
*sx1 *= su;
/* GO SCALE-CHECK.. */
goto L100;
L40:
if (!(sq2 < zero)) {
goto L50;
}
/* GO ZERO-H-D-AND-SX1.. */
goto L60;
L50:
sflag = one;
sh11 = sp1 / sp2;
sh22 = *sx1 / *sy1;
su = one + sh11 * sh22;
stemp = *sd2 / su;
*sd2 = *sd1 / su;
*sd1 = stemp;
*sx1 = *sy1 * su;
/* GO SCALE-CHECK */
goto L100;
/* PROCEDURE..ZERO-H-D-AND-SX1.. */
L60:
sflag = -one;
sh11 = zero;
sh12 = zero;
sh21 = zero;
sh22 = zero;
*sd1 = zero;
*sd2 = zero;
*sx1 = zero;
/* RETURN.. */
goto L220;
/* PROCEDURE..FIX-H.. */
L70:
if (!(sflag >= zero)) {
goto L90;
}
if (!(sflag == zero)) {
goto L80;
}
sh11 = one;
sh22 = one;
sflag = -one;
goto L90;
L80:
sh21 = -one;
sh12 = one;
sflag = -one;
L90:
switch (igo) {
case 0:
goto L120;
case 1:
goto L150;
case 2:
goto L180;
case 3:
goto L210;
}
/* PROCEDURE..SCALE-CHECK */
L100:
L110:
if (!(*sd1 <= rgamsq)) {
goto L130;
}
if (*sd1 == zero) {
goto L160;
}
igo = 0;
igo_fmt = fmt_120;
/* FIX-H.. */
goto L70;
L120:
/* Computing 2nd power */
r__1 = gam;
*sd1 *= r__1 * r__1;
*sx1 /= gam;
sh11 /= gam;
sh12 /= gam;
goto L110;
L130:
L140:
if (!(*sd1 >= gamsq)) {
goto L160;
}
igo = 1;
igo_fmt = fmt_150;
/* FIX-H.. */
goto L70;
L150:
/* Computing 2nd power */
r__1 = gam;
*sd1 /= r__1 * r__1;
*sx1 *= gam;
sh11 *= gam;
sh12 *= gam;
goto L140;
L160:
L170:
if (!(dabs(*sd2) <= rgamsq)) {
goto L190;
}
if (*sd2 == zero) {
goto L220;
}
igo = 2;
igo_fmt = fmt_180;
/* FIX-H.. */
goto L70;
L180:
/* Computing 2nd power */
r__1 = gam;
*sd2 *= r__1 * r__1;
sh21 /= gam;
sh22 /= gam;
goto L170;
L190:
L200:
if (!(dabs(*sd2) >= gamsq)) {
goto L220;
}
igo = 3;
igo_fmt = fmt_210;
/* FIX-H.. */
goto L70;
L210:
/* Computing 2nd power */
r__1 = gam;
*sd2 /= r__1 * r__1;
sh21 *= gam;
sh22 *= gam;
goto L200;
L220:
if (sflag < 0.f) {
goto L250;
} else if (sflag == 0) {
goto L230;
} else {
goto L240;
}
L230:
sparam[3] = sh21;
sparam[4] = sh12;
goto L260;
L240:
sparam[2] = sh11;
sparam[5] = sh22;
goto L260;
L250:
sparam[2] = sh11;
sparam[3] = sh21;
sparam[4] = sh12;
sparam[5] = sh22;
L260:
sparam[1] = sflag;
} /* srotmg_ */

View File

@@ -1,359 +0,0 @@
/* ssbmv.f -- translated by f2c (version 20100827).
You must link the resulting object file with libf2c:
on Microsoft Windows system, link with libf2c.lib;
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
or, if you install libf2c.a in a standard place, with -lf2c -lm
-- in that order, at the end of the command line, as in
cc *.o -lf2c -lm
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
http://www.netlib.org/f2c/libf2c.zip
*/
#include "datatypes.h"
/* Subroutine */ void ssbmv_(char *uplo, integer *n, integer *k, real *alpha, real *a, integer *lda, real *x,
integer *incx, real *beta, real *y, integer *incy) {
/* System generated locals */
integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
/* Local variables */
integer i__, j, l, ix, iy, jx, jy, kx, ky, info;
real temp1, temp2;
extern logical lsame_(char *, char *);
integer kplus1;
extern /* Subroutine */ void xerbla_(const char *, integer *);
/* .. Scalar Arguments .. */
/* .. */
/* .. Array Arguments .. */
/* .. */
/* Purpose */
/* ======= */
/* SSBMV performs the matrix-vector operation */
/* y := alpha*A*x + beta*y, */
/* where alpha and beta are scalars, x and y are n element vectors and */
/* A is an n by n symmetric band matrix, with k super-diagonals. */
/* Arguments */
/* ========== */
/* UPLO - CHARACTER*1. */
/* On entry, UPLO specifies whether the upper or lower */
/* triangular part of the band matrix A is being supplied as */
/* follows: */
/* UPLO = 'U' or 'u' The upper triangular part of A is */
/* being supplied. */
/* UPLO = 'L' or 'l' The lower triangular part of A is */
/* being supplied. */
/* Unchanged on exit. */
/* N - INTEGER. */
/* On entry, N specifies the order of the matrix A. */
/* N must be at least zero. */
/* Unchanged on exit. */
/* K - INTEGER. */
/* On entry, K specifies the number of super-diagonals of the */
/* matrix A. K must satisfy 0 .le. K. */
/* Unchanged on exit. */
/* ALPHA - REAL . */
/* On entry, ALPHA specifies the scalar alpha. */
/* Unchanged on exit. */
/* A - REAL array of DIMENSION ( LDA, n ). */
/* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */
/* by n part of the array A must contain the upper triangular */
/* band part of the symmetric matrix, supplied column by */
/* column, with the leading diagonal of the matrix in row */
/* ( k + 1 ) of the array, the first super-diagonal starting at */
/* position 2 in row k, and so on. The top left k by k triangle */
/* of the array A is not referenced. */
/* The following program segment will transfer the upper */
/* triangular part of a symmetric band matrix from conventional */
/* full matrix storage to band storage: */
/* DO 20, J = 1, N */
/* M = K + 1 - J */
/* DO 10, I = MAX( 1, J - K ), J */
/* A( M + I, J ) = matrix( I, J ) */
/* 10 CONTINUE */
/* 20 CONTINUE */
/* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */
/* by n part of the array A must contain the lower triangular */
/* band part of the symmetric matrix, supplied column by */
/* column, with the leading diagonal of the matrix in row 1 of */
/* the array, the first sub-diagonal starting at position 1 in */
/* row 2, and so on. The bottom right k by k triangle of the */
/* array A is not referenced. */
/* The following program segment will transfer the lower */
/* triangular part of a symmetric band matrix from conventional */
/* full matrix storage to band storage: */
/* DO 20, J = 1, N */
/* M = 1 - J */
/* DO 10, I = J, MIN( N, J + K ) */
/* A( M + I, J ) = matrix( I, J ) */
/* 10 CONTINUE */
/* 20 CONTINUE */
/* Unchanged on exit. */
/* LDA - INTEGER. */
/* On entry, LDA specifies the first dimension of A as declared */
/* in the calling (sub) program. LDA must be at least */
/* ( k + 1 ). */
/* Unchanged on exit. */
/* X - REAL array of DIMENSION at least */
/* ( 1 + ( n - 1 )*abs( INCX ) ). */
/* Before entry, the incremented array X must contain the */
/* vector x. */
/* Unchanged on exit. */
/* INCX - INTEGER. */
/* On entry, INCX specifies the increment for the elements of */
/* X. INCX must not be zero. */
/* Unchanged on exit. */
/* BETA - REAL . */
/* On entry, BETA specifies the scalar beta. */
/* Unchanged on exit. */
/* Y - REAL array of DIMENSION at least */
/* ( 1 + ( n - 1 )*abs( INCY ) ). */
/* Before entry, the incremented array Y must contain the */
/* vector y. On exit, Y is overwritten by the updated vector y. */
/* INCY - INTEGER. */
/* On entry, INCY specifies the increment for the elements of */
/* Y. INCY must not be zero. */
/* Unchanged on exit. */
/* Further Details */
/* =============== */
/* Level 2 Blas routine. */
/* -- Written on 22-October-1986. */
/* Jack Dongarra, Argonne National Lab. */
/* Jeremy Du Croz, Nag Central Office. */
/* Sven Hammarling, Nag Central Office. */
/* Richard Hanson, Sandia National Labs. */
/* ===================================================================== */
/* .. Parameters .. */
/* .. */
/* .. Local Scalars .. */
/* .. */
/* .. External Functions .. */
/* .. */
/* .. External Subroutines .. */
/* .. */
/* .. Intrinsic Functions .. */
/* .. */
/* Test the input parameters. */
/* Parameter adjustments */
a_dim1 = *lda;
a_offset = 1 + a_dim1;
a -= a_offset;
--x;
--y;
/* Function Body */
info = 0;
if (!lsame_(uplo, "U") && !lsame_(uplo, "L")) {
info = 1;
} else if (*n < 0) {
info = 2;
} else if (*k < 0) {
info = 3;
} else if (*lda < *k + 1) {
info = 6;
} else if (*incx == 0) {
info = 8;
} else if (*incy == 0) {
info = 11;
}
if (info != 0) {
xerbla_("SSBMV ", &info);
return;
}
/* Quick return if possible. */
if (*n == 0 || (*alpha == 0.f && *beta == 1.f)) {
return;
}
/* Set up the start points in X and Y. */
if (*incx > 0) {
kx = 1;
} else {
kx = 1 - (*n - 1) * *incx;
}
if (*incy > 0) {
ky = 1;
} else {
ky = 1 - (*n - 1) * *incy;
}
/* Start the operations. In this version the elements of the array A */
/* are accessed sequentially with one pass through A. */
/* First form y := beta*y. */
if (*beta != 1.f) {
if (*incy == 1) {
if (*beta == 0.f) {
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
y[i__] = 0.f;
/* L10: */
}
} else {
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
y[i__] = *beta * y[i__];
/* L20: */
}
}
} else {
iy = ky;
if (*beta == 0.f) {
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
y[iy] = 0.f;
iy += *incy;
/* L30: */
}
} else {
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
y[iy] = *beta * y[iy];
iy += *incy;
/* L40: */
}
}
}
}
if (*alpha == 0.f) {
return;
}
if (lsame_(uplo, "U")) {
/* Form y when upper triangle of A is stored. */
kplus1 = *k + 1;
if (*incx == 1 && *incy == 1) {
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
temp1 = *alpha * x[j];
temp2 = 0.f;
l = kplus1 - j;
/* Computing MAX */
i__2 = 1, i__3 = j - *k;
i__4 = j - 1;
for (i__ = max(i__2, i__3); i__ <= i__4; ++i__) {
y[i__] += temp1 * a[l + i__ + j * a_dim1];
temp2 += a[l + i__ + j * a_dim1] * x[i__];
/* L50: */
}
y[j] = y[j] + temp1 * a[kplus1 + j * a_dim1] + *alpha * temp2;
/* L60: */
}
} else {
jx = kx;
jy = ky;
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
temp1 = *alpha * x[jx];
temp2 = 0.f;
ix = kx;
iy = ky;
l = kplus1 - j;
/* Computing MAX */
i__4 = 1, i__2 = j - *k;
i__3 = j - 1;
for (i__ = max(i__4, i__2); i__ <= i__3; ++i__) {
y[iy] += temp1 * a[l + i__ + j * a_dim1];
temp2 += a[l + i__ + j * a_dim1] * x[ix];
ix += *incx;
iy += *incy;
/* L70: */
}
y[jy] = y[jy] + temp1 * a[kplus1 + j * a_dim1] + *alpha * temp2;
jx += *incx;
jy += *incy;
if (j > *k) {
kx += *incx;
ky += *incy;
}
/* L80: */
}
}
} else {
/* Form y when lower triangle of A is stored. */
if (*incx == 1 && *incy == 1) {
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
temp1 = *alpha * x[j];
temp2 = 0.f;
y[j] += temp1 * a[j * a_dim1 + 1];
l = 1 - j;
/* Computing MIN */
i__4 = *n, i__2 = j + *k;
i__3 = min(i__4, i__2);
for (i__ = j + 1; i__ <= i__3; ++i__) {
y[i__] += temp1 * a[l + i__ + j * a_dim1];
temp2 += a[l + i__ + j * a_dim1] * x[i__];
/* L90: */
}
y[j] += *alpha * temp2;
/* L100: */
}
} else {
jx = kx;
jy = ky;
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
temp1 = *alpha * x[jx];
temp2 = 0.f;
y[jy] += temp1 * a[j * a_dim1 + 1];
l = 1 - j;
ix = jx;
iy = jy;
/* Computing MIN */
i__4 = *n, i__2 = j + *k;
i__3 = min(i__4, i__2);
for (i__ = j + 1; i__ <= i__3; ++i__) {
ix += *incx;
iy += *incy;
y[iy] += temp1 * a[l + i__ + j * a_dim1];
temp2 += a[l + i__ + j * a_dim1] * x[ix];
/* L110: */
}
y[jy] += *alpha * temp2;
jx += *incx;
jy += *incy;
/* L120: */
}
}
}
/* End of SSBMV . */
} /* ssbmv_ */

View File

@@ -1,308 +0,0 @@
/* sspmv.f -- translated by f2c (version 20100827).
You must link the resulting object file with libf2c:
on Microsoft Windows system, link with libf2c.lib;
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
or, if you install libf2c.a in a standard place, with -lf2c -lm
-- in that order, at the end of the command line, as in
cc *.o -lf2c -lm
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
http://www.netlib.org/f2c/libf2c.zip
*/
#include "datatypes.h"
/* Subroutine */ void sspmv_(char *uplo, integer *n, real *alpha, real *ap, real *x, integer *incx, real *beta, real *y,
integer *incy) {
/* System generated locals */
integer i__1, i__2;
/* Local variables */
integer i__, j, k, kk, ix, iy, jx, jy, kx, ky, info;
real temp1, temp2;
extern logical lsame_(char *, char *);
extern /* Subroutine */ void xerbla_(const char *, integer *);
/* .. Scalar Arguments .. */
/* .. */
/* .. Array Arguments .. */
/* .. */
/* Purpose */
/* ======= */
/* SSPMV performs the matrix-vector operation */
/* y := alpha*A*x + beta*y, */
/* where alpha and beta are scalars, x and y are n element vectors and */
/* A is an n by n symmetric matrix, supplied in packed form. */
/* Arguments */
/* ========== */
/* UPLO - CHARACTER*1. */
/* On entry, UPLO specifies whether the upper or lower */
/* triangular part of the matrix A is supplied in the packed */
/* array AP as follows: */
/* UPLO = 'U' or 'u' The upper triangular part of A is */
/* supplied in AP. */
/* UPLO = 'L' or 'l' The lower triangular part of A is */
/* supplied in AP. */
/* Unchanged on exit. */
/* N - INTEGER. */
/* On entry, N specifies the order of the matrix A. */
/* N must be at least zero. */
/* Unchanged on exit. */
/* ALPHA - REAL . */
/* On entry, ALPHA specifies the scalar alpha. */
/* Unchanged on exit. */
/* AP - REAL array of DIMENSION at least */
/* ( ( n*( n + 1 ) )/2 ). */
/* Before entry with UPLO = 'U' or 'u', the array AP must */
/* contain the upper triangular part of the symmetric matrix */
/* packed sequentially, column by column, so that AP( 1 ) */
/* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) */
/* and a( 2, 2 ) respectively, and so on. */
/* Before entry with UPLO = 'L' or 'l', the array AP must */
/* contain the lower triangular part of the symmetric matrix */
/* packed sequentially, column by column, so that AP( 1 ) */
/* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) */
/* and a( 3, 1 ) respectively, and so on. */
/* Unchanged on exit. */
/* X - REAL array of dimension at least */
/* ( 1 + ( n - 1 )*abs( INCX ) ). */
/* Before entry, the incremented array X must contain the n */
/* element vector x. */
/* Unchanged on exit. */
/* INCX - INTEGER. */
/* On entry, INCX specifies the increment for the elements of */
/* X. INCX must not be zero. */
/* Unchanged on exit. */
/* BETA - REAL . */
/* On entry, BETA specifies the scalar beta. When BETA is */
/* supplied as zero then Y need not be set on input. */
/* Unchanged on exit. */
/* Y - REAL array of dimension at least */
/* ( 1 + ( n - 1 )*abs( INCY ) ). */
/* Before entry, the incremented array Y must contain the n */
/* element vector y. On exit, Y is overwritten by the updated */
/* vector y. */
/* INCY - INTEGER. */
/* On entry, INCY specifies the increment for the elements of */
/* Y. INCY must not be zero. */
/* Unchanged on exit. */
/* Further Details */
/* =============== */
/* Level 2 Blas routine. */
/* -- Written on 22-October-1986. */
/* Jack Dongarra, Argonne National Lab. */
/* Jeremy Du Croz, Nag Central Office. */
/* Sven Hammarling, Nag Central Office. */
/* Richard Hanson, Sandia National Labs. */
/* ===================================================================== */
/* .. Parameters .. */
/* .. */
/* .. Local Scalars .. */
/* .. */
/* .. External Functions .. */
/* .. */
/* .. External Subroutines .. */
/* .. */
/* Test the input parameters. */
/* Parameter adjustments */
--y;
--x;
--ap;
/* Function Body */
info = 0;
if (!lsame_(uplo, "U") && !lsame_(uplo, "L")) {
info = 1;
} else if (*n < 0) {
info = 2;
} else if (*incx == 0) {
info = 6;
} else if (*incy == 0) {
info = 9;
}
if (info != 0) {
xerbla_("SSPMV ", &info);
return;
}
/* Quick return if possible. */
if (*n == 0 || (*alpha == 0.f && *beta == 1.f)) {
return;
}
/* Set up the start points in X and Y. */
if (*incx > 0) {
kx = 1;
} else {
kx = 1 - (*n - 1) * *incx;
}
if (*incy > 0) {
ky = 1;
} else {
ky = 1 - (*n - 1) * *incy;
}
/* Start the operations. In this version the elements of the array AP */
/* are accessed sequentially with one pass through AP. */
/* First form y := beta*y. */
if (*beta != 1.f) {
if (*incy == 1) {
if (*beta == 0.f) {
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
y[i__] = 0.f;
/* L10: */
}
} else {
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
y[i__] = *beta * y[i__];
/* L20: */
}
}
} else {
iy = ky;
if (*beta == 0.f) {
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
y[iy] = 0.f;
iy += *incy;
/* L30: */
}
} else {
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
y[iy] = *beta * y[iy];
iy += *incy;
/* L40: */
}
}
}
}
if (*alpha == 0.f) {
return;
}
kk = 1;
if (lsame_(uplo, "U")) {
/* Form y when AP contains the upper triangle. */
if (*incx == 1 && *incy == 1) {
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
temp1 = *alpha * x[j];
temp2 = 0.f;
k = kk;
i__2 = j - 1;
for (i__ = 1; i__ <= i__2; ++i__) {
y[i__] += temp1 * ap[k];
temp2 += ap[k] * x[i__];
++k;
/* L50: */
}
y[j] = y[j] + temp1 * ap[kk + j - 1] + *alpha * temp2;
kk += j;
/* L60: */
}
} else {
jx = kx;
jy = ky;
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
temp1 = *alpha * x[jx];
temp2 = 0.f;
ix = kx;
iy = ky;
i__2 = kk + j - 2;
for (k = kk; k <= i__2; ++k) {
y[iy] += temp1 * ap[k];
temp2 += ap[k] * x[ix];
ix += *incx;
iy += *incy;
/* L70: */
}
y[jy] = y[jy] + temp1 * ap[kk + j - 1] + *alpha * temp2;
jx += *incx;
jy += *incy;
kk += j;
/* L80: */
}
}
} else {
/* Form y when AP contains the lower triangle. */
if (*incx == 1 && *incy == 1) {
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
temp1 = *alpha * x[j];
temp2 = 0.f;
y[j] += temp1 * ap[kk];
k = kk + 1;
i__2 = *n;
for (i__ = j + 1; i__ <= i__2; ++i__) {
y[i__] += temp1 * ap[k];
temp2 += ap[k] * x[i__];
++k;
/* L90: */
}
y[j] += *alpha * temp2;
kk += *n - j + 1;
/* L100: */
}
} else {
jx = kx;
jy = ky;
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
temp1 = *alpha * x[jx];
temp2 = 0.f;
y[jy] += temp1 * ap[kk];
ix = jx;
iy = jy;
i__2 = kk + *n - j;
for (k = kk + 1; k <= i__2; ++k) {
ix += *incx;
iy += *incy;
y[iy] += temp1 * ap[k];
temp2 += ap[k] * x[ix];
/* L110: */
}
y[jy] += *alpha * temp2;
jx += *incx;
jy += *incy;
kk += *n - j + 1;
/* L120: */
}
}
}
/* End of SSPMV . */
} /* sspmv_ */

View File

@@ -1,417 +0,0 @@
/* stbmv.f -- translated by f2c (version 20100827).
You must link the resulting object file with libf2c:
on Microsoft Windows system, link with libf2c.lib;
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
or, if you install libf2c.a in a standard place, with -lf2c -lm
-- in that order, at the end of the command line, as in
cc *.o -lf2c -lm
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
http://www.netlib.org/f2c/libf2c.zip
*/
#include "datatypes.h"
/* Subroutine */ void stbmv_(char *uplo, char *trans, char *diag, integer *n, integer *k, real *a, integer *lda,
real *x, integer *incx) {
/* System generated locals */
integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
/* Local variables */
integer i__, j, l, ix, jx, kx, info;
real temp;
extern logical lsame_(char *, char *);
integer kplus1;
extern /* Subroutine */ void xerbla_(const char *, integer *);
logical nounit;
/* .. Scalar Arguments .. */
/* .. */
/* .. Array Arguments .. */
/* .. */
/* Purpose */
/* ======= */
/* STBMV performs one of the matrix-vector operations */
/* x := A*x, or x := A'*x, */
/* where x is an n element vector and A is an n by n unit, or non-unit, */
/* upper or lower triangular band matrix, with ( k + 1 ) diagonals. */
/* Arguments */
/* ========== */
/* UPLO - CHARACTER*1. */
/* On entry, UPLO specifies whether the matrix is an upper or */
/* lower triangular matrix as follows: */
/* UPLO = 'U' or 'u' A is an upper triangular matrix. */
/* UPLO = 'L' or 'l' A is a lower triangular matrix. */
/* Unchanged on exit. */
/* TRANS - CHARACTER*1. */
/* On entry, TRANS specifies the operation to be performed as */
/* follows: */
/* TRANS = 'N' or 'n' x := A*x. */
/* TRANS = 'T' or 't' x := A'*x. */
/* TRANS = 'C' or 'c' x := A'*x. */
/* Unchanged on exit. */
/* DIAG - CHARACTER*1. */
/* On entry, DIAG specifies whether or not A is unit */
/* triangular as follows: */
/* DIAG = 'U' or 'u' A is assumed to be unit triangular. */
/* DIAG = 'N' or 'n' A is not assumed to be unit */
/* triangular. */
/* Unchanged on exit. */
/* N - INTEGER. */
/* On entry, N specifies the order of the matrix A. */
/* N must be at least zero. */
/* Unchanged on exit. */
/* K - INTEGER. */
/* On entry with UPLO = 'U' or 'u', K specifies the number of */
/* super-diagonals of the matrix A. */
/* On entry with UPLO = 'L' or 'l', K specifies the number of */
/* sub-diagonals of the matrix A. */
/* K must satisfy 0 .le. K. */
/* Unchanged on exit. */
/* A - REAL array of DIMENSION ( LDA, n ). */
/* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */
/* by n part of the array A must contain the upper triangular */
/* band part of the matrix of coefficients, supplied column by */
/* column, with the leading diagonal of the matrix in row */
/* ( k + 1 ) of the array, the first super-diagonal starting at */
/* position 2 in row k, and so on. The top left k by k triangle */
/* of the array A is not referenced. */
/* The following program segment will transfer an upper */
/* triangular band matrix from conventional full matrix storage */
/* to band storage: */
/* DO 20, J = 1, N */
/* M = K + 1 - J */
/* DO 10, I = MAX( 1, J - K ), J */
/* A( M + I, J ) = matrix( I, J ) */
/* 10 CONTINUE */
/* 20 CONTINUE */
/* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */
/* by n part of the array A must contain the lower triangular */
/* band part of the matrix of coefficients, supplied column by */
/* column, with the leading diagonal of the matrix in row 1 of */
/* the array, the first sub-diagonal starting at position 1 in */
/* row 2, and so on. The bottom right k by k triangle of the */
/* array A is not referenced. */
/* The following program segment will transfer a lower */
/* triangular band matrix from conventional full matrix storage */
/* to band storage: */
/* DO 20, J = 1, N */
/* M = 1 - J */
/* DO 10, I = J, MIN( N, J + K ) */
/* A( M + I, J ) = matrix( I, J ) */
/* 10 CONTINUE */
/* 20 CONTINUE */
/* Note that when DIAG = 'U' or 'u' the elements of the array A */
/* corresponding to the diagonal elements of the matrix are not */
/* referenced, but are assumed to be unity. */
/* Unchanged on exit. */
/* LDA - INTEGER. */
/* On entry, LDA specifies the first dimension of A as declared */
/* in the calling (sub) program. LDA must be at least */
/* ( k + 1 ). */
/* Unchanged on exit. */
/* X - REAL array of dimension at least */
/* ( 1 + ( n - 1 )*abs( INCX ) ). */
/* Before entry, the incremented array X must contain the n */
/* element vector x. On exit, X is overwritten with the */
/* transformed vector x. */
/* INCX - INTEGER. */
/* On entry, INCX specifies the increment for the elements of */
/* X. INCX must not be zero. */
/* Unchanged on exit. */
/* Further Details */
/* =============== */
/* Level 2 Blas routine. */
/* -- Written on 22-October-1986. */
/* Jack Dongarra, Argonne National Lab. */
/* Jeremy Du Croz, Nag Central Office. */
/* Sven Hammarling, Nag Central Office. */
/* Richard Hanson, Sandia National Labs. */
/* ===================================================================== */
/* .. Parameters .. */
/* .. */
/* .. Local Scalars .. */
/* .. */
/* .. External Functions .. */
/* .. */
/* .. External Subroutines .. */
/* .. */
/* .. Intrinsic Functions .. */
/* .. */
/* Test the input parameters. */
/* Parameter adjustments */
a_dim1 = *lda;
a_offset = 1 + a_dim1;
a -= a_offset;
--x;
/* Function Body */
info = 0;
if (!lsame_(uplo, "U") && !lsame_(uplo, "L")) {
info = 1;
} else if (!lsame_(trans, "N") && !lsame_(trans, "T") && !lsame_(trans, "C")) {
info = 2;
} else if (!lsame_(diag, "U") && !lsame_(diag, "N")) {
info = 3;
} else if (*n < 0) {
info = 4;
} else if (*k < 0) {
info = 5;
} else if (*lda < *k + 1) {
info = 7;
} else if (*incx == 0) {
info = 9;
}
if (info != 0) {
xerbla_("STBMV ", &info);
return;
}
/* Quick return if possible. */
if (*n == 0) {
return;
}
nounit = lsame_(diag, "N");
/* Set up the start point in X if the increment is not unity. This */
/* will be ( N - 1 )*INCX too small for descending loops. */
if (*incx <= 0) {
kx = 1 - (*n - 1) * *incx;
} else if (*incx != 1) {
kx = 1;
}
/* Start the operations. In this version the elements of A are */
/* accessed sequentially with one pass through A. */
if (lsame_(trans, "N")) {
/* Form x := A*x. */
if (lsame_(uplo, "U")) {
kplus1 = *k + 1;
if (*incx == 1) {
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
if (x[j] != 0.f) {
temp = x[j];
l = kplus1 - j;
/* Computing MAX */
i__2 = 1, i__3 = j - *k;
i__4 = j - 1;
for (i__ = max(i__2, i__3); i__ <= i__4; ++i__) {
x[i__] += temp * a[l + i__ + j * a_dim1];
/* L10: */
}
if (nounit) {
x[j] *= a[kplus1 + j * a_dim1];
}
}
/* L20: */
}
} else {
jx = kx;
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
if (x[jx] != 0.f) {
temp = x[jx];
ix = kx;
l = kplus1 - j;
/* Computing MAX */
i__4 = 1, i__2 = j - *k;
i__3 = j - 1;
for (i__ = max(i__4, i__2); i__ <= i__3; ++i__) {
x[ix] += temp * a[l + i__ + j * a_dim1];
ix += *incx;
/* L30: */
}
if (nounit) {
x[jx] *= a[kplus1 + j * a_dim1];
}
}
jx += *incx;
if (j > *k) {
kx += *incx;
}
/* L40: */
}
}
} else {
if (*incx == 1) {
for (j = *n; j >= 1; --j) {
if (x[j] != 0.f) {
temp = x[j];
l = 1 - j;
/* Computing MIN */
i__1 = *n, i__3 = j + *k;
i__4 = j + 1;
for (i__ = min(i__1, i__3); i__ >= i__4; --i__) {
x[i__] += temp * a[l + i__ + j * a_dim1];
/* L50: */
}
if (nounit) {
x[j] *= a[j * a_dim1 + 1];
}
}
/* L60: */
}
} else {
kx += (*n - 1) * *incx;
jx = kx;
for (j = *n; j >= 1; --j) {
if (x[jx] != 0.f) {
temp = x[jx];
ix = kx;
l = 1 - j;
/* Computing MIN */
i__4 = *n, i__1 = j + *k;
i__3 = j + 1;
for (i__ = min(i__4, i__1); i__ >= i__3; --i__) {
x[ix] += temp * a[l + i__ + j * a_dim1];
ix -= *incx;
/* L70: */
}
if (nounit) {
x[jx] *= a[j * a_dim1 + 1];
}
}
jx -= *incx;
if (*n - j >= *k) {
kx -= *incx;
}
/* L80: */
}
}
}
} else {
/* Form x := A'*x. */
if (lsame_(uplo, "U")) {
kplus1 = *k + 1;
if (*incx == 1) {
for (j = *n; j >= 1; --j) {
temp = x[j];
l = kplus1 - j;
if (nounit) {
temp *= a[kplus1 + j * a_dim1];
}
/* Computing MAX */
i__4 = 1, i__1 = j - *k;
i__3 = max(i__4, i__1);
for (i__ = j - 1; i__ >= i__3; --i__) {
temp += a[l + i__ + j * a_dim1] * x[i__];
/* L90: */
}
x[j] = temp;
/* L100: */
}
} else {
kx += (*n - 1) * *incx;
jx = kx;
for (j = *n; j >= 1; --j) {
temp = x[jx];
kx -= *incx;
ix = kx;
l = kplus1 - j;
if (nounit) {
temp *= a[kplus1 + j * a_dim1];
}
/* Computing MAX */
i__4 = 1, i__1 = j - *k;
i__3 = max(i__4, i__1);
for (i__ = j - 1; i__ >= i__3; --i__) {
temp += a[l + i__ + j * a_dim1] * x[ix];
ix -= *incx;
/* L110: */
}
x[jx] = temp;
jx -= *incx;
/* L120: */
}
}
} else {
if (*incx == 1) {
i__3 = *n;
for (j = 1; j <= i__3; ++j) {
temp = x[j];
l = 1 - j;
if (nounit) {
temp *= a[j * a_dim1 + 1];
}
/* Computing MIN */
i__1 = *n, i__2 = j + *k;
i__4 = min(i__1, i__2);
for (i__ = j + 1; i__ <= i__4; ++i__) {
temp += a[l + i__ + j * a_dim1] * x[i__];
/* L130: */
}
x[j] = temp;
/* L140: */
}
} else {
jx = kx;
i__3 = *n;
for (j = 1; j <= i__3; ++j) {
temp = x[jx];
kx += *incx;
ix = kx;
l = 1 - j;
if (nounit) {
temp *= a[j * a_dim1 + 1];
}
/* Computing MIN */
i__1 = *n, i__2 = j + *k;
i__4 = min(i__1, i__2);
for (i__ = j + 1; i__ <= i__4; ++i__) {
temp += a[l + i__ + j * a_dim1] * x[ix];
ix += *incx;
/* L150: */
}
x[jx] = temp;
jx += *incx;
/* L160: */
}
}
}
}
/* End of STBMV . */
} /* stbmv_ */

View File

@@ -1,456 +0,0 @@
/* zhbmv.f -- translated by f2c (version 20100827).
You must link the resulting object file with libf2c:
on Microsoft Windows system, link with libf2c.lib;
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
or, if you install libf2c.a in a standard place, with -lf2c -lm
-- in that order, at the end of the command line, as in
cc *.o -lf2c -lm
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
http://www.netlib.org/f2c/libf2c.zip
*/
#include "datatypes.h"
static inline void d_cnjg(doublecomplex *r, doublecomplex *z) {
r->r = z->r;
r->i = -(z->i);
}
/* Subroutine */ void zhbmv_(char *uplo, integer *n, integer *k, doublecomplex *alpha, doublecomplex *a, integer *lda,
doublecomplex *x, integer *incx, doublecomplex *beta, doublecomplex *y, integer *incy) {
/* System generated locals */
integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5;
doublereal d__1;
doublecomplex z__1, z__2, z__3, z__4;
/* Local variables */
integer i__, j, l, ix, iy, jx, jy, kx, ky, info;
doublecomplex temp1, temp2;
extern logical lsame_(char *, char *);
integer kplus1;
extern /* Subroutine */ void xerbla_(const char *, integer *);
/* .. Scalar Arguments .. */
/* .. */
/* .. Array Arguments .. */
/* .. */
/* Purpose */
/* ======= */
/* ZHBMV performs the matrix-vector operation */
/* y := alpha*A*x + beta*y, */
/* where alpha and beta are scalars, x and y are n element vectors and */
/* A is an n by n hermitian band matrix, with k super-diagonals. */
/* Arguments */
/* ========== */
/* UPLO - CHARACTER*1. */
/* On entry, UPLO specifies whether the upper or lower */
/* triangular part of the band matrix A is being supplied as */
/* follows: */
/* UPLO = 'U' or 'u' The upper triangular part of A is */
/* being supplied. */
/* UPLO = 'L' or 'l' The lower triangular part of A is */
/* being supplied. */
/* Unchanged on exit. */
/* N - INTEGER. */
/* On entry, N specifies the order of the matrix A. */
/* N must be at least zero. */
/* Unchanged on exit. */
/* K - INTEGER. */
/* On entry, K specifies the number of super-diagonals of the */
/* matrix A. K must satisfy 0 .le. K. */
/* Unchanged on exit. */
/* ALPHA - COMPLEX*16 . */
/* On entry, ALPHA specifies the scalar alpha. */
/* Unchanged on exit. */
/* A - COMPLEX*16 array of DIMENSION ( LDA, n ). */
/* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */
/* by n part of the array A must contain the upper triangular */
/* band part of the hermitian matrix, supplied column by */
/* column, with the leading diagonal of the matrix in row */
/* ( k + 1 ) of the array, the first super-diagonal starting at */
/* position 2 in row k, and so on. The top left k by k triangle */
/* of the array A is not referenced. */
/* The following program segment will transfer the upper */
/* triangular part of a hermitian band matrix from conventional */
/* full matrix storage to band storage: */
/* DO 20, J = 1, N */
/* M = K + 1 - J */
/* DO 10, I = MAX( 1, J - K ), J */
/* A( M + I, J ) = matrix( I, J ) */
/* 10 CONTINUE */
/* 20 CONTINUE */
/* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */
/* by n part of the array A must contain the lower triangular */
/* band part of the hermitian matrix, supplied column by */
/* column, with the leading diagonal of the matrix in row 1 of */
/* the array, the first sub-diagonal starting at position 1 in */
/* row 2, and so on. The bottom right k by k triangle of the */
/* array A is not referenced. */
/* The following program segment will transfer the lower */
/* triangular part of a hermitian band matrix from conventional */
/* full matrix storage to band storage: */
/* DO 20, J = 1, N */
/* M = 1 - J */
/* DO 10, I = J, MIN( N, J + K ) */
/* A( M + I, J ) = matrix( I, J ) */
/* 10 CONTINUE */
/* 20 CONTINUE */
/* Note that the imaginary parts of the diagonal elements need */
/* not be set and are assumed to be zero. */
/* Unchanged on exit. */
/* LDA - INTEGER. */
/* On entry, LDA specifies the first dimension of A as declared */
/* in the calling (sub) program. LDA must be at least */
/* ( k + 1 ). */
/* Unchanged on exit. */
/* X - COMPLEX*16 array of DIMENSION at least */
/* ( 1 + ( n - 1 )*abs( INCX ) ). */
/* Before entry, the incremented array X must contain the */
/* vector x. */
/* Unchanged on exit. */
/* INCX - INTEGER. */
/* On entry, INCX specifies the increment for the elements of */
/* X. INCX must not be zero. */
/* Unchanged on exit. */
/* BETA - COMPLEX*16 . */
/* On entry, BETA specifies the scalar beta. */
/* Unchanged on exit. */
/* Y - COMPLEX*16 array of DIMENSION at least */
/* ( 1 + ( n - 1 )*abs( INCY ) ). */
/* Before entry, the incremented array Y must contain the */
/* vector y. On exit, Y is overwritten by the updated vector y. */
/* INCY - INTEGER. */
/* On entry, INCY specifies the increment for the elements of */
/* Y. INCY must not be zero. */
/* Unchanged on exit. */
/* Further Details */
/* =============== */
/* Level 2 Blas routine. */
/* -- Written on 22-October-1986. */
/* Jack Dongarra, Argonne National Lab. */
/* Jeremy Du Croz, Nag Central Office. */
/* Sven Hammarling, Nag Central Office. */
/* Richard Hanson, Sandia National Labs. */
/* ===================================================================== */
/* .. Parameters .. */
/* .. */
/* .. Local Scalars .. */
/* .. */
/* .. External Functions .. */
/* .. */
/* .. External Subroutines .. */
/* .. */
/* .. Intrinsic Functions .. */
/* .. */
/* Test the input parameters. */
/* Parameter adjustments */
a_dim1 = *lda;
a_offset = 1 + a_dim1;
a -= a_offset;
--x;
--y;
/* Function Body */
info = 0;
if (!lsame_(uplo, "U") && !lsame_(uplo, "L")) {
info = 1;
} else if (*n < 0) {
info = 2;
} else if (*k < 0) {
info = 3;
} else if (*lda < *k + 1) {
info = 6;
} else if (*incx == 0) {
info = 8;
} else if (*incy == 0) {
info = 11;
}
if (info != 0) {
xerbla_("ZHBMV ", &info);
return;
}
/* Quick return if possible. */
if (*n == 0 || (alpha->r == 0. && alpha->i == 0. && (beta->r == 1. && beta->i == 0.))) {
return;
}
/* Set up the start points in X and Y. */
if (*incx > 0) {
kx = 1;
} else {
kx = 1 - (*n - 1) * *incx;
}
if (*incy > 0) {
ky = 1;
} else {
ky = 1 - (*n - 1) * *incy;
}
/* Start the operations. In this version the elements of the array A */
/* are accessed sequentially with one pass through A. */
/* First form y := beta*y. */
if (beta->r != 1. || beta->i != 0.) {
if (*incy == 1) {
if (beta->r == 0. && beta->i == 0.) {
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
i__2 = i__;
y[i__2].r = 0., y[i__2].i = 0.;
/* L10: */
}
} else {
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
i__2 = i__;
i__3 = i__;
z__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i, z__1.i = beta->r * y[i__3].i + beta->i * y[i__3].r;
y[i__2].r = z__1.r, y[i__2].i = z__1.i;
/* L20: */
}
}
} else {
iy = ky;
if (beta->r == 0. && beta->i == 0.) {
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
i__2 = iy;
y[i__2].r = 0., y[i__2].i = 0.;
iy += *incy;
/* L30: */
}
} else {
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
i__2 = iy;
i__3 = iy;
z__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i, z__1.i = beta->r * y[i__3].i + beta->i * y[i__3].r;
y[i__2].r = z__1.r, y[i__2].i = z__1.i;
iy += *incy;
/* L40: */
}
}
}
}
if (alpha->r == 0. && alpha->i == 0.) {
return;
}
if (lsame_(uplo, "U")) {
/* Form y when upper triangle of A is stored. */
kplus1 = *k + 1;
if (*incx == 1 && *incy == 1) {
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
i__2 = j;
z__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, z__1.i = alpha->r * x[i__2].i + alpha->i * x[i__2].r;
temp1.r = z__1.r, temp1.i = z__1.i;
temp2.r = 0., temp2.i = 0.;
l = kplus1 - j;
/* Computing MAX */
i__2 = 1, i__3 = j - *k;
i__4 = j - 1;
for (i__ = max(i__2, i__3); i__ <= i__4; ++i__) {
i__2 = i__;
i__3 = i__;
i__5 = l + i__ + j * a_dim1;
z__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i, z__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5].r;
z__1.r = y[i__3].r + z__2.r, z__1.i = y[i__3].i + z__2.i;
y[i__2].r = z__1.r, y[i__2].i = z__1.i;
d_cnjg(&z__3, &a[l + i__ + j * a_dim1]);
i__2 = i__;
z__2.r = z__3.r * x[i__2].r - z__3.i * x[i__2].i, z__2.i = z__3.r * x[i__2].i + z__3.i * x[i__2].r;
z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i;
temp2.r = z__1.r, temp2.i = z__1.i;
/* L50: */
}
i__4 = j;
i__2 = j;
i__3 = kplus1 + j * a_dim1;
d__1 = a[i__3].r;
z__3.r = d__1 * temp1.r, z__3.i = d__1 * temp1.i;
z__2.r = y[i__2].r + z__3.r, z__2.i = y[i__2].i + z__3.i;
z__4.r = alpha->r * temp2.r - alpha->i * temp2.i, z__4.i = alpha->r * temp2.i + alpha->i * temp2.r;
z__1.r = z__2.r + z__4.r, z__1.i = z__2.i + z__4.i;
y[i__4].r = z__1.r, y[i__4].i = z__1.i;
/* L60: */
}
} else {
jx = kx;
jy = ky;
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
i__4 = jx;
z__1.r = alpha->r * x[i__4].r - alpha->i * x[i__4].i, z__1.i = alpha->r * x[i__4].i + alpha->i * x[i__4].r;
temp1.r = z__1.r, temp1.i = z__1.i;
temp2.r = 0., temp2.i = 0.;
ix = kx;
iy = ky;
l = kplus1 - j;
/* Computing MAX */
i__4 = 1, i__2 = j - *k;
i__3 = j - 1;
for (i__ = max(i__4, i__2); i__ <= i__3; ++i__) {
i__4 = iy;
i__2 = iy;
i__5 = l + i__ + j * a_dim1;
z__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i, z__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5].r;
z__1.r = y[i__2].r + z__2.r, z__1.i = y[i__2].i + z__2.i;
y[i__4].r = z__1.r, y[i__4].i = z__1.i;
d_cnjg(&z__3, &a[l + i__ + j * a_dim1]);
i__4 = ix;
z__2.r = z__3.r * x[i__4].r - z__3.i * x[i__4].i, z__2.i = z__3.r * x[i__4].i + z__3.i * x[i__4].r;
z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i;
temp2.r = z__1.r, temp2.i = z__1.i;
ix += *incx;
iy += *incy;
/* L70: */
}
i__3 = jy;
i__4 = jy;
i__2 = kplus1 + j * a_dim1;
d__1 = a[i__2].r;
z__3.r = d__1 * temp1.r, z__3.i = d__1 * temp1.i;
z__2.r = y[i__4].r + z__3.r, z__2.i = y[i__4].i + z__3.i;
z__4.r = alpha->r * temp2.r - alpha->i * temp2.i, z__4.i = alpha->r * temp2.i + alpha->i * temp2.r;
z__1.r = z__2.r + z__4.r, z__1.i = z__2.i + z__4.i;
y[i__3].r = z__1.r, y[i__3].i = z__1.i;
jx += *incx;
jy += *incy;
if (j > *k) {
kx += *incx;
ky += *incy;
}
/* L80: */
}
}
} else {
/* Form y when lower triangle of A is stored. */
if (*incx == 1 && *incy == 1) {
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
i__3 = j;
z__1.r = alpha->r * x[i__3].r - alpha->i * x[i__3].i, z__1.i = alpha->r * x[i__3].i + alpha->i * x[i__3].r;
temp1.r = z__1.r, temp1.i = z__1.i;
temp2.r = 0., temp2.i = 0.;
i__3 = j;
i__4 = j;
i__2 = j * a_dim1 + 1;
d__1 = a[i__2].r;
z__2.r = d__1 * temp1.r, z__2.i = d__1 * temp1.i;
z__1.r = y[i__4].r + z__2.r, z__1.i = y[i__4].i + z__2.i;
y[i__3].r = z__1.r, y[i__3].i = z__1.i;
l = 1 - j;
/* Computing MIN */
i__4 = *n, i__2 = j + *k;
i__3 = min(i__4, i__2);
for (i__ = j + 1; i__ <= i__3; ++i__) {
i__4 = i__;
i__2 = i__;
i__5 = l + i__ + j * a_dim1;
z__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i, z__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5].r;
z__1.r = y[i__2].r + z__2.r, z__1.i = y[i__2].i + z__2.i;
y[i__4].r = z__1.r, y[i__4].i = z__1.i;
d_cnjg(&z__3, &a[l + i__ + j * a_dim1]);
i__4 = i__;
z__2.r = z__3.r * x[i__4].r - z__3.i * x[i__4].i, z__2.i = z__3.r * x[i__4].i + z__3.i * x[i__4].r;
z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i;
temp2.r = z__1.r, temp2.i = z__1.i;
/* L90: */
}
i__3 = j;
i__4 = j;
z__2.r = alpha->r * temp2.r - alpha->i * temp2.i, z__2.i = alpha->r * temp2.i + alpha->i * temp2.r;
z__1.r = y[i__4].r + z__2.r, z__1.i = y[i__4].i + z__2.i;
y[i__3].r = z__1.r, y[i__3].i = z__1.i;
/* L100: */
}
} else {
jx = kx;
jy = ky;
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
i__3 = jx;
z__1.r = alpha->r * x[i__3].r - alpha->i * x[i__3].i, z__1.i = alpha->r * x[i__3].i + alpha->i * x[i__3].r;
temp1.r = z__1.r, temp1.i = z__1.i;
temp2.r = 0., temp2.i = 0.;
i__3 = jy;
i__4 = jy;
i__2 = j * a_dim1 + 1;
d__1 = a[i__2].r;
z__2.r = d__1 * temp1.r, z__2.i = d__1 * temp1.i;
z__1.r = y[i__4].r + z__2.r, z__1.i = y[i__4].i + z__2.i;
y[i__3].r = z__1.r, y[i__3].i = z__1.i;
l = 1 - j;
ix = jx;
iy = jy;
/* Computing MIN */
i__4 = *n, i__2 = j + *k;
i__3 = min(i__4, i__2);
for (i__ = j + 1; i__ <= i__3; ++i__) {
ix += *incx;
iy += *incy;
i__4 = iy;
i__2 = iy;
i__5 = l + i__ + j * a_dim1;
z__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i, z__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5].r;
z__1.r = y[i__2].r + z__2.r, z__1.i = y[i__2].i + z__2.i;
y[i__4].r = z__1.r, y[i__4].i = z__1.i;
d_cnjg(&z__3, &a[l + i__ + j * a_dim1]);
i__4 = ix;
z__2.r = z__3.r * x[i__4].r - z__3.i * x[i__4].i, z__2.i = z__3.r * x[i__4].i + z__3.i * x[i__4].r;
z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i;
temp2.r = z__1.r, temp2.i = z__1.i;
/* L110: */
}
i__3 = jy;
i__4 = jy;
z__2.r = alpha->r * temp2.r - alpha->i * temp2.i, z__2.i = alpha->r * temp2.i + alpha->i * temp2.r;
z__1.r = y[i__4].r + z__2.r, z__1.i = y[i__4].i + z__2.i;
y[i__3].r = z__1.r, y[i__3].i = z__1.i;
jx += *incx;
jy += *incy;
/* L120: */
}
}
}
/* End of ZHBMV . */
} /* zhbmv_ */

View File

@@ -1,407 +0,0 @@
/* zhpmv.f -- translated by f2c (version 20100827).
You must link the resulting object file with libf2c:
on Microsoft Windows system, link with libf2c.lib;
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
or, if you install libf2c.a in a standard place, with -lf2c -lm
-- in that order, at the end of the command line, as in
cc *.o -lf2c -lm
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
http://www.netlib.org/f2c/libf2c.zip
*/
#include "datatypes.h"
static inline void d_cnjg(doublecomplex *r, doublecomplex *z) {
r->r = z->r;
r->i = -(z->i);
}
/* Subroutine */ void zhpmv_(char *uplo, integer *n, doublecomplex *alpha, doublecomplex *ap, doublecomplex *x,
integer *incx, doublecomplex *beta, doublecomplex *y, integer *incy) {
/* System generated locals */
integer i__1, i__2, i__3, i__4, i__5;
doublereal d__1;
doublecomplex z__1, z__2, z__3, z__4;
/* Local variables */
integer i__, j, k, kk, ix, iy, jx, jy, kx, ky, info;
doublecomplex temp1, temp2;
extern logical lsame_(char *, char *);
extern /* Subroutine */ void xerbla_(const char *, integer *);
/* .. Scalar Arguments .. */
/* .. */
/* .. Array Arguments .. */
/* .. */
/* Purpose */
/* ======= */
/* ZHPMV performs the matrix-vector operation */
/* y := alpha*A*x + beta*y, */
/* where alpha and beta are scalars, x and y are n element vectors and */
/* A is an n by n hermitian matrix, supplied in packed form. */
/* Arguments */
/* ========== */
/* UPLO - CHARACTER*1. */
/* On entry, UPLO specifies whether the upper or lower */
/* triangular part of the matrix A is supplied in the packed */
/* array AP as follows: */
/* UPLO = 'U' or 'u' The upper triangular part of A is */
/* supplied in AP. */
/* UPLO = 'L' or 'l' The lower triangular part of A is */
/* supplied in AP. */
/* Unchanged on exit. */
/* N - INTEGER. */
/* On entry, N specifies the order of the matrix A. */
/* N must be at least zero. */
/* Unchanged on exit. */
/* ALPHA - COMPLEX*16 . */
/* On entry, ALPHA specifies the scalar alpha. */
/* Unchanged on exit. */
/* AP - COMPLEX*16 array of DIMENSION at least */
/* ( ( n*( n + 1 ) )/2 ). */
/* Before entry with UPLO = 'U' or 'u', the array AP must */
/* contain the upper triangular part of the hermitian matrix */
/* packed sequentially, column by column, so that AP( 1 ) */
/* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) */
/* and a( 2, 2 ) respectively, and so on. */
/* Before entry with UPLO = 'L' or 'l', the array AP must */
/* contain the lower triangular part of the hermitian matrix */
/* packed sequentially, column by column, so that AP( 1 ) */
/* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) */
/* and a( 3, 1 ) respectively, and so on. */
/* Note that the imaginary parts of the diagonal elements need */
/* not be set and are assumed to be zero. */
/* Unchanged on exit. */
/* X - COMPLEX*16 array of dimension at least */
/* ( 1 + ( n - 1 )*abs( INCX ) ). */
/* Before entry, the incremented array X must contain the n */
/* element vector x. */
/* Unchanged on exit. */
/* INCX - INTEGER. */
/* On entry, INCX specifies the increment for the elements of */
/* X. INCX must not be zero. */
/* Unchanged on exit. */
/* BETA - COMPLEX*16 . */
/* On entry, BETA specifies the scalar beta. When BETA is */
/* supplied as zero then Y need not be set on input. */
/* Unchanged on exit. */
/* Y - COMPLEX*16 array of dimension at least */
/* ( 1 + ( n - 1 )*abs( INCY ) ). */
/* Before entry, the incremented array Y must contain the n */
/* element vector y. On exit, Y is overwritten by the updated */
/* vector y. */
/* INCY - INTEGER. */
/* On entry, INCY specifies the increment for the elements of */
/* Y. INCY must not be zero. */
/* Unchanged on exit. */
/* Further Details */
/* =============== */
/* Level 2 Blas routine. */
/* -- Written on 22-October-1986. */
/* Jack Dongarra, Argonne National Lab. */
/* Jeremy Du Croz, Nag Central Office. */
/* Sven Hammarling, Nag Central Office. */
/* Richard Hanson, Sandia National Labs. */
/* ===================================================================== */
/* .. Parameters .. */
/* .. */
/* .. Local Scalars .. */
/* .. */
/* .. External Functions .. */
/* .. */
/* .. External Subroutines .. */
/* .. */
/* .. Intrinsic Functions .. */
/* .. */
/* Test the input parameters. */
/* Parameter adjustments */
--y;
--x;
--ap;
/* Function Body */
info = 0;
if (!lsame_(uplo, "U") && !lsame_(uplo, "L")) {
info = 1;
} else if (*n < 0) {
info = 2;
} else if (*incx == 0) {
info = 6;
} else if (*incy == 0) {
info = 9;
}
if (info != 0) {
xerbla_("ZHPMV ", &info);
return;
}
/* Quick return if possible. */
if (*n == 0 || (alpha->r == 0. && alpha->i == 0. && (beta->r == 1. && beta->i == 0.))) {
return;
}
/* Set up the start points in X and Y. */
if (*incx > 0) {
kx = 1;
} else {
kx = 1 - (*n - 1) * *incx;
}
if (*incy > 0) {
ky = 1;
} else {
ky = 1 - (*n - 1) * *incy;
}
/* Start the operations. In this version the elements of the array AP */
/* are accessed sequentially with one pass through AP. */
/* First form y := beta*y. */
if (beta->r != 1. || beta->i != 0.) {
if (*incy == 1) {
if (beta->r == 0. && beta->i == 0.) {
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
i__2 = i__;
y[i__2].r = 0., y[i__2].i = 0.;
/* L10: */
}
} else {
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
i__2 = i__;
i__3 = i__;
z__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i, z__1.i = beta->r * y[i__3].i + beta->i * y[i__3].r;
y[i__2].r = z__1.r, y[i__2].i = z__1.i;
/* L20: */
}
}
} else {
iy = ky;
if (beta->r == 0. && beta->i == 0.) {
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
i__2 = iy;
y[i__2].r = 0., y[i__2].i = 0.;
iy += *incy;
/* L30: */
}
} else {
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
i__2 = iy;
i__3 = iy;
z__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i, z__1.i = beta->r * y[i__3].i + beta->i * y[i__3].r;
y[i__2].r = z__1.r, y[i__2].i = z__1.i;
iy += *incy;
/* L40: */
}
}
}
}
if (alpha->r == 0. && alpha->i == 0.) {
return;
}
kk = 1;
if (lsame_(uplo, "U")) {
/* Form y when AP contains the upper triangle. */
if (*incx == 1 && *incy == 1) {
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
i__2 = j;
z__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, z__1.i = alpha->r * x[i__2].i + alpha->i * x[i__2].r;
temp1.r = z__1.r, temp1.i = z__1.i;
temp2.r = 0., temp2.i = 0.;
k = kk;
i__2 = j - 1;
for (i__ = 1; i__ <= i__2; ++i__) {
i__3 = i__;
i__4 = i__;
i__5 = k;
z__2.r = temp1.r * ap[i__5].r - temp1.i * ap[i__5].i, z__2.i = temp1.r * ap[i__5].i + temp1.i * ap[i__5].r;
z__1.r = y[i__4].r + z__2.r, z__1.i = y[i__4].i + z__2.i;
y[i__3].r = z__1.r, y[i__3].i = z__1.i;
d_cnjg(&z__3, &ap[k]);
i__3 = i__;
z__2.r = z__3.r * x[i__3].r - z__3.i * x[i__3].i, z__2.i = z__3.r * x[i__3].i + z__3.i * x[i__3].r;
z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i;
temp2.r = z__1.r, temp2.i = z__1.i;
++k;
/* L50: */
}
i__2 = j;
i__3 = j;
i__4 = kk + j - 1;
d__1 = ap[i__4].r;
z__3.r = d__1 * temp1.r, z__3.i = d__1 * temp1.i;
z__2.r = y[i__3].r + z__3.r, z__2.i = y[i__3].i + z__3.i;
z__4.r = alpha->r * temp2.r - alpha->i * temp2.i, z__4.i = alpha->r * temp2.i + alpha->i * temp2.r;
z__1.r = z__2.r + z__4.r, z__1.i = z__2.i + z__4.i;
y[i__2].r = z__1.r, y[i__2].i = z__1.i;
kk += j;
/* L60: */
}
} else {
jx = kx;
jy = ky;
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
i__2 = jx;
z__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, z__1.i = alpha->r * x[i__2].i + alpha->i * x[i__2].r;
temp1.r = z__1.r, temp1.i = z__1.i;
temp2.r = 0., temp2.i = 0.;
ix = kx;
iy = ky;
i__2 = kk + j - 2;
for (k = kk; k <= i__2; ++k) {
i__3 = iy;
i__4 = iy;
i__5 = k;
z__2.r = temp1.r * ap[i__5].r - temp1.i * ap[i__5].i, z__2.i = temp1.r * ap[i__5].i + temp1.i * ap[i__5].r;
z__1.r = y[i__4].r + z__2.r, z__1.i = y[i__4].i + z__2.i;
y[i__3].r = z__1.r, y[i__3].i = z__1.i;
d_cnjg(&z__3, &ap[k]);
i__3 = ix;
z__2.r = z__3.r * x[i__3].r - z__3.i * x[i__3].i, z__2.i = z__3.r * x[i__3].i + z__3.i * x[i__3].r;
z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i;
temp2.r = z__1.r, temp2.i = z__1.i;
ix += *incx;
iy += *incy;
/* L70: */
}
i__2 = jy;
i__3 = jy;
i__4 = kk + j - 1;
d__1 = ap[i__4].r;
z__3.r = d__1 * temp1.r, z__3.i = d__1 * temp1.i;
z__2.r = y[i__3].r + z__3.r, z__2.i = y[i__3].i + z__3.i;
z__4.r = alpha->r * temp2.r - alpha->i * temp2.i, z__4.i = alpha->r * temp2.i + alpha->i * temp2.r;
z__1.r = z__2.r + z__4.r, z__1.i = z__2.i + z__4.i;
y[i__2].r = z__1.r, y[i__2].i = z__1.i;
jx += *incx;
jy += *incy;
kk += j;
/* L80: */
}
}
} else {
/* Form y when AP contains the lower triangle. */
if (*incx == 1 && *incy == 1) {
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
i__2 = j;
z__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, z__1.i = alpha->r * x[i__2].i + alpha->i * x[i__2].r;
temp1.r = z__1.r, temp1.i = z__1.i;
temp2.r = 0., temp2.i = 0.;
i__2 = j;
i__3 = j;
i__4 = kk;
d__1 = ap[i__4].r;
z__2.r = d__1 * temp1.r, z__2.i = d__1 * temp1.i;
z__1.r = y[i__3].r + z__2.r, z__1.i = y[i__3].i + z__2.i;
y[i__2].r = z__1.r, y[i__2].i = z__1.i;
k = kk + 1;
i__2 = *n;
for (i__ = j + 1; i__ <= i__2; ++i__) {
i__3 = i__;
i__4 = i__;
i__5 = k;
z__2.r = temp1.r * ap[i__5].r - temp1.i * ap[i__5].i, z__2.i = temp1.r * ap[i__5].i + temp1.i * ap[i__5].r;
z__1.r = y[i__4].r + z__2.r, z__1.i = y[i__4].i + z__2.i;
y[i__3].r = z__1.r, y[i__3].i = z__1.i;
d_cnjg(&z__3, &ap[k]);
i__3 = i__;
z__2.r = z__3.r * x[i__3].r - z__3.i * x[i__3].i, z__2.i = z__3.r * x[i__3].i + z__3.i * x[i__3].r;
z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i;
temp2.r = z__1.r, temp2.i = z__1.i;
++k;
/* L90: */
}
i__2 = j;
i__3 = j;
z__2.r = alpha->r * temp2.r - alpha->i * temp2.i, z__2.i = alpha->r * temp2.i + alpha->i * temp2.r;
z__1.r = y[i__3].r + z__2.r, z__1.i = y[i__3].i + z__2.i;
y[i__2].r = z__1.r, y[i__2].i = z__1.i;
kk += *n - j + 1;
/* L100: */
}
} else {
jx = kx;
jy = ky;
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
i__2 = jx;
z__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, z__1.i = alpha->r * x[i__2].i + alpha->i * x[i__2].r;
temp1.r = z__1.r, temp1.i = z__1.i;
temp2.r = 0., temp2.i = 0.;
i__2 = jy;
i__3 = jy;
i__4 = kk;
d__1 = ap[i__4].r;
z__2.r = d__1 * temp1.r, z__2.i = d__1 * temp1.i;
z__1.r = y[i__3].r + z__2.r, z__1.i = y[i__3].i + z__2.i;
y[i__2].r = z__1.r, y[i__2].i = z__1.i;
ix = jx;
iy = jy;
i__2 = kk + *n - j;
for (k = kk + 1; k <= i__2; ++k) {
ix += *incx;
iy += *incy;
i__3 = iy;
i__4 = iy;
i__5 = k;
z__2.r = temp1.r * ap[i__5].r - temp1.i * ap[i__5].i, z__2.i = temp1.r * ap[i__5].i + temp1.i * ap[i__5].r;
z__1.r = y[i__4].r + z__2.r, z__1.i = y[i__4].i + z__2.i;
y[i__3].r = z__1.r, y[i__3].i = z__1.i;
d_cnjg(&z__3, &ap[k]);
i__3 = ix;
z__2.r = z__3.r * x[i__3].r - z__3.i * x[i__3].i, z__2.i = z__3.r * x[i__3].i + z__3.i * x[i__3].r;
z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i;
temp2.r = z__1.r, temp2.i = z__1.i;
/* L110: */
}
i__2 = jy;
i__3 = jy;
z__2.r = alpha->r * temp2.r - alpha->i * temp2.i, z__2.i = alpha->r * temp2.i + alpha->i * temp2.r;
z__1.r = y[i__3].r + z__2.r, z__1.i = y[i__3].i + z__2.i;
y[i__2].r = z__1.r, y[i__2].i = z__1.i;
jx += *incx;
jy += *incy;
kk += *n - j + 1;
/* L120: */
}
}
}
/* End of ZHPMV . */
} /* zhpmv_ */

View File

@@ -1,586 +0,0 @@
/* ztbmv.f -- translated by f2c (version 20100827).
You must link the resulting object file with libf2c:
on Microsoft Windows system, link with libf2c.lib;
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
or, if you install libf2c.a in a standard place, with -lf2c -lm
-- in that order, at the end of the command line, as in
cc *.o -lf2c -lm
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
http://www.netlib.org/f2c/libf2c.zip
*/
#include "datatypes.h"
static inline void d_cnjg(doublecomplex *r, doublecomplex *z) {
r->r = z->r;
r->i = -(z->i);
}
/* Subroutine */ void ztbmv_(char *uplo, char *trans, char *diag, integer *n, integer *k, doublecomplex *a,
integer *lda, doublecomplex *x, integer *incx) {
/* System generated locals */
integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5;
doublecomplex z__1, z__2, z__3;
/* Local variables */
integer i__, j, l, ix, jx, kx, info;
doublecomplex temp;
extern logical lsame_(char *, char *);
integer kplus1;
extern /* Subroutine */ void xerbla_(const char *, integer *);
logical noconj, nounit;
/* .. Scalar Arguments .. */
/* .. */
/* .. Array Arguments .. */
/* .. */
/* Purpose */
/* ======= */
/* ZTBMV performs one of the matrix-vector operations */
/* x := A*x, or x := A'*x, or x := conjg( A' )*x, */
/* where x is an n element vector and A is an n by n unit, or non-unit, */
/* upper or lower triangular band matrix, with ( k + 1 ) diagonals. */
/* Arguments */
/* ========== */
/* UPLO - CHARACTER*1. */
/* On entry, UPLO specifies whether the matrix is an upper or */
/* lower triangular matrix as follows: */
/* UPLO = 'U' or 'u' A is an upper triangular matrix. */
/* UPLO = 'L' or 'l' A is a lower triangular matrix. */
/* Unchanged on exit. */
/* TRANS - CHARACTER*1. */
/* On entry, TRANS specifies the operation to be performed as */
/* follows: */
/* TRANS = 'N' or 'n' x := A*x. */
/* TRANS = 'T' or 't' x := A'*x. */
/* TRANS = 'C' or 'c' x := conjg( A' )*x. */
/* Unchanged on exit. */
/* DIAG - CHARACTER*1. */
/* On entry, DIAG specifies whether or not A is unit */
/* triangular as follows: */
/* DIAG = 'U' or 'u' A is assumed to be unit triangular. */
/* DIAG = 'N' or 'n' A is not assumed to be unit */
/* triangular. */
/* Unchanged on exit. */
/* N - INTEGER. */
/* On entry, N specifies the order of the matrix A. */
/* N must be at least zero. */
/* Unchanged on exit. */
/* K - INTEGER. */
/* On entry with UPLO = 'U' or 'u', K specifies the number of */
/* super-diagonals of the matrix A. */
/* On entry with UPLO = 'L' or 'l', K specifies the number of */
/* sub-diagonals of the matrix A. */
/* K must satisfy 0 .le. K. */
/* Unchanged on exit. */
/* A - COMPLEX*16 array of DIMENSION ( LDA, n ). */
/* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */
/* by n part of the array A must contain the upper triangular */
/* band part of the matrix of coefficients, supplied column by */
/* column, with the leading diagonal of the matrix in row */
/* ( k + 1 ) of the array, the first super-diagonal starting at */
/* position 2 in row k, and so on. The top left k by k triangle */
/* of the array A is not referenced. */
/* The following program segment will transfer an upper */
/* triangular band matrix from conventional full matrix storage */
/* to band storage: */
/* DO 20, J = 1, N */
/* M = K + 1 - J */
/* DO 10, I = MAX( 1, J - K ), J */
/* A( M + I, J ) = matrix( I, J ) */
/* 10 CONTINUE */
/* 20 CONTINUE */
/* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */
/* by n part of the array A must contain the lower triangular */
/* band part of the matrix of coefficients, supplied column by */
/* column, with the leading diagonal of the matrix in row 1 of */
/* the array, the first sub-diagonal starting at position 1 in */
/* row 2, and so on. The bottom right k by k triangle of the */
/* array A is not referenced. */
/* The following program segment will transfer a lower */
/* triangular band matrix from conventional full matrix storage */
/* to band storage: */
/* DO 20, J = 1, N */
/* M = 1 - J */
/* DO 10, I = J, MIN( N, J + K ) */
/* A( M + I, J ) = matrix( I, J ) */
/* 10 CONTINUE */
/* 20 CONTINUE */
/* Note that when DIAG = 'U' or 'u' the elements of the array A */
/* corresponding to the diagonal elements of the matrix are not */
/* referenced, but are assumed to be unity. */
/* Unchanged on exit. */
/* LDA - INTEGER. */
/* On entry, LDA specifies the first dimension of A as declared */
/* in the calling (sub) program. LDA must be at least */
/* ( k + 1 ). */
/* Unchanged on exit. */
/* X - COMPLEX*16 array of dimension at least */
/* ( 1 + ( n - 1 )*abs( INCX ) ). */
/* Before entry, the incremented array X must contain the n */
/* element vector x. On exit, X is overwritten with the */
/* transformed vector x. */
/* INCX - INTEGER. */
/* On entry, INCX specifies the increment for the elements of */
/* X. INCX must not be zero. */
/* Unchanged on exit. */
/* Further Details */
/* =============== */
/* Level 2 Blas routine. */
/* -- Written on 22-October-1986. */
/* Jack Dongarra, Argonne National Lab. */
/* Jeremy Du Croz, Nag Central Office. */
/* Sven Hammarling, Nag Central Office. */
/* Richard Hanson, Sandia National Labs. */
/* ===================================================================== */
/* .. Parameters .. */
/* .. */
/* .. Local Scalars .. */
/* .. */
/* .. External Functions .. */
/* .. */
/* .. External Subroutines .. */
/* .. */
/* .. Intrinsic Functions .. */
/* .. */
/* Test the input parameters. */
/* Parameter adjustments */
a_dim1 = *lda;
a_offset = 1 + a_dim1;
a -= a_offset;
--x;
/* Function Body */
info = 0;
if (!lsame_(uplo, "U") && !lsame_(uplo, "L")) {
info = 1;
} else if (!lsame_(trans, "N") && !lsame_(trans, "T") && !lsame_(trans, "C")) {
info = 2;
} else if (!lsame_(diag, "U") && !lsame_(diag, "N")) {
info = 3;
} else if (*n < 0) {
info = 4;
} else if (*k < 0) {
info = 5;
} else if (*lda < *k + 1) {
info = 7;
} else if (*incx == 0) {
info = 9;
}
if (info != 0) {
xerbla_("ZTBMV ", &info);
return;
}
/* Quick return if possible. */
if (*n == 0) {
return;
}
noconj = lsame_(trans, "T");
nounit = lsame_(diag, "N");
/* Set up the start point in X if the increment is not unity. This */
/* will be ( N - 1 )*INCX too small for descending loops. */
if (*incx <= 0) {
kx = 1 - (*n - 1) * *incx;
} else if (*incx != 1) {
kx = 1;
}
/* Start the operations. In this version the elements of A are */
/* accessed sequentially with one pass through A. */
if (lsame_(trans, "N")) {
/* Form x := A*x. */
if (lsame_(uplo, "U")) {
kplus1 = *k + 1;
if (*incx == 1) {
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
i__2 = j;
if (x[i__2].r != 0. || x[i__2].i != 0.) {
i__2 = j;
temp.r = x[i__2].r, temp.i = x[i__2].i;
l = kplus1 - j;
/* Computing MAX */
i__2 = 1, i__3 = j - *k;
i__4 = j - 1;
for (i__ = max(i__2, i__3); i__ <= i__4; ++i__) {
i__2 = i__;
i__3 = i__;
i__5 = l + i__ + j * a_dim1;
z__2.r = temp.r * a[i__5].r - temp.i * a[i__5].i, z__2.i = temp.r * a[i__5].i + temp.i * a[i__5].r;
z__1.r = x[i__3].r + z__2.r, z__1.i = x[i__3].i + z__2.i;
x[i__2].r = z__1.r, x[i__2].i = z__1.i;
/* L10: */
}
if (nounit) {
i__4 = j;
i__2 = j;
i__3 = kplus1 + j * a_dim1;
z__1.r = x[i__2].r * a[i__3].r - x[i__2].i * a[i__3].i,
z__1.i = x[i__2].r * a[i__3].i + x[i__2].i * a[i__3].r;
x[i__4].r = z__1.r, x[i__4].i = z__1.i;
}
}
/* L20: */
}
} else {
jx = kx;
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
i__4 = jx;
if (x[i__4].r != 0. || x[i__4].i != 0.) {
i__4 = jx;
temp.r = x[i__4].r, temp.i = x[i__4].i;
ix = kx;
l = kplus1 - j;
/* Computing MAX */
i__4 = 1, i__2 = j - *k;
i__3 = j - 1;
for (i__ = max(i__4, i__2); i__ <= i__3; ++i__) {
i__4 = ix;
i__2 = ix;
i__5 = l + i__ + j * a_dim1;
z__2.r = temp.r * a[i__5].r - temp.i * a[i__5].i, z__2.i = temp.r * a[i__5].i + temp.i * a[i__5].r;
z__1.r = x[i__2].r + z__2.r, z__1.i = x[i__2].i + z__2.i;
x[i__4].r = z__1.r, x[i__4].i = z__1.i;
ix += *incx;
/* L30: */
}
if (nounit) {
i__3 = jx;
i__4 = jx;
i__2 = kplus1 + j * a_dim1;
z__1.r = x[i__4].r * a[i__2].r - x[i__4].i * a[i__2].i,
z__1.i = x[i__4].r * a[i__2].i + x[i__4].i * a[i__2].r;
x[i__3].r = z__1.r, x[i__3].i = z__1.i;
}
}
jx += *incx;
if (j > *k) {
kx += *incx;
}
/* L40: */
}
}
} else {
if (*incx == 1) {
for (j = *n; j >= 1; --j) {
i__1 = j;
if (x[i__1].r != 0. || x[i__1].i != 0.) {
i__1 = j;
temp.r = x[i__1].r, temp.i = x[i__1].i;
l = 1 - j;
/* Computing MIN */
i__1 = *n, i__3 = j + *k;
i__4 = j + 1;
for (i__ = min(i__1, i__3); i__ >= i__4; --i__) {
i__1 = i__;
i__3 = i__;
i__2 = l + i__ + j * a_dim1;
z__2.r = temp.r * a[i__2].r - temp.i * a[i__2].i, z__2.i = temp.r * a[i__2].i + temp.i * a[i__2].r;
z__1.r = x[i__3].r + z__2.r, z__1.i = x[i__3].i + z__2.i;
x[i__1].r = z__1.r, x[i__1].i = z__1.i;
/* L50: */
}
if (nounit) {
i__4 = j;
i__1 = j;
i__3 = j * a_dim1 + 1;
z__1.r = x[i__1].r * a[i__3].r - x[i__1].i * a[i__3].i,
z__1.i = x[i__1].r * a[i__3].i + x[i__1].i * a[i__3].r;
x[i__4].r = z__1.r, x[i__4].i = z__1.i;
}
}
/* L60: */
}
} else {
kx += (*n - 1) * *incx;
jx = kx;
for (j = *n; j >= 1; --j) {
i__4 = jx;
if (x[i__4].r != 0. || x[i__4].i != 0.) {
i__4 = jx;
temp.r = x[i__4].r, temp.i = x[i__4].i;
ix = kx;
l = 1 - j;
/* Computing MIN */
i__4 = *n, i__1 = j + *k;
i__3 = j + 1;
for (i__ = min(i__4, i__1); i__ >= i__3; --i__) {
i__4 = ix;
i__1 = ix;
i__2 = l + i__ + j * a_dim1;
z__2.r = temp.r * a[i__2].r - temp.i * a[i__2].i, z__2.i = temp.r * a[i__2].i + temp.i * a[i__2].r;
z__1.r = x[i__1].r + z__2.r, z__1.i = x[i__1].i + z__2.i;
x[i__4].r = z__1.r, x[i__4].i = z__1.i;
ix -= *incx;
/* L70: */
}
if (nounit) {
i__3 = jx;
i__4 = jx;
i__1 = j * a_dim1 + 1;
z__1.r = x[i__4].r * a[i__1].r - x[i__4].i * a[i__1].i,
z__1.i = x[i__4].r * a[i__1].i + x[i__4].i * a[i__1].r;
x[i__3].r = z__1.r, x[i__3].i = z__1.i;
}
}
jx -= *incx;
if (*n - j >= *k) {
kx -= *incx;
}
/* L80: */
}
}
}
} else {
/* Form x := A'*x or x := conjg( A' )*x. */
if (lsame_(uplo, "U")) {
kplus1 = *k + 1;
if (*incx == 1) {
for (j = *n; j >= 1; --j) {
i__3 = j;
temp.r = x[i__3].r, temp.i = x[i__3].i;
l = kplus1 - j;
if (noconj) {
if (nounit) {
i__3 = kplus1 + j * a_dim1;
z__1.r = temp.r * a[i__3].r - temp.i * a[i__3].i, z__1.i = temp.r * a[i__3].i + temp.i * a[i__3].r;
temp.r = z__1.r, temp.i = z__1.i;
}
/* Computing MAX */
i__4 = 1, i__1 = j - *k;
i__3 = max(i__4, i__1);
for (i__ = j - 1; i__ >= i__3; --i__) {
i__4 = l + i__ + j * a_dim1;
i__1 = i__;
z__2.r = a[i__4].r * x[i__1].r - a[i__4].i * x[i__1].i,
z__2.i = a[i__4].r * x[i__1].i + a[i__4].i * x[i__1].r;
z__1.r = temp.r + z__2.r, z__1.i = temp.i + z__2.i;
temp.r = z__1.r, temp.i = z__1.i;
/* L90: */
}
} else {
if (nounit) {
d_cnjg(&z__2, &a[kplus1 + j * a_dim1]);
z__1.r = temp.r * z__2.r - temp.i * z__2.i, z__1.i = temp.r * z__2.i + temp.i * z__2.r;
temp.r = z__1.r, temp.i = z__1.i;
}
/* Computing MAX */
i__4 = 1, i__1 = j - *k;
i__3 = max(i__4, i__1);
for (i__ = j - 1; i__ >= i__3; --i__) {
d_cnjg(&z__3, &a[l + i__ + j * a_dim1]);
i__4 = i__;
z__2.r = z__3.r * x[i__4].r - z__3.i * x[i__4].i, z__2.i = z__3.r * x[i__4].i + z__3.i * x[i__4].r;
z__1.r = temp.r + z__2.r, z__1.i = temp.i + z__2.i;
temp.r = z__1.r, temp.i = z__1.i;
/* L100: */
}
}
i__3 = j;
x[i__3].r = temp.r, x[i__3].i = temp.i;
/* L110: */
}
} else {
kx += (*n - 1) * *incx;
jx = kx;
for (j = *n; j >= 1; --j) {
i__3 = jx;
temp.r = x[i__3].r, temp.i = x[i__3].i;
kx -= *incx;
ix = kx;
l = kplus1 - j;
if (noconj) {
if (nounit) {
i__3 = kplus1 + j * a_dim1;
z__1.r = temp.r * a[i__3].r - temp.i * a[i__3].i, z__1.i = temp.r * a[i__3].i + temp.i * a[i__3].r;
temp.r = z__1.r, temp.i = z__1.i;
}
/* Computing MAX */
i__4 = 1, i__1 = j - *k;
i__3 = max(i__4, i__1);
for (i__ = j - 1; i__ >= i__3; --i__) {
i__4 = l + i__ + j * a_dim1;
i__1 = ix;
z__2.r = a[i__4].r * x[i__1].r - a[i__4].i * x[i__1].i,
z__2.i = a[i__4].r * x[i__1].i + a[i__4].i * x[i__1].r;
z__1.r = temp.r + z__2.r, z__1.i = temp.i + z__2.i;
temp.r = z__1.r, temp.i = z__1.i;
ix -= *incx;
/* L120: */
}
} else {
if (nounit) {
d_cnjg(&z__2, &a[kplus1 + j * a_dim1]);
z__1.r = temp.r * z__2.r - temp.i * z__2.i, z__1.i = temp.r * z__2.i + temp.i * z__2.r;
temp.r = z__1.r, temp.i = z__1.i;
}
/* Computing MAX */
i__4 = 1, i__1 = j - *k;
i__3 = max(i__4, i__1);
for (i__ = j - 1; i__ >= i__3; --i__) {
d_cnjg(&z__3, &a[l + i__ + j * a_dim1]);
i__4 = ix;
z__2.r = z__3.r * x[i__4].r - z__3.i * x[i__4].i, z__2.i = z__3.r * x[i__4].i + z__3.i * x[i__4].r;
z__1.r = temp.r + z__2.r, z__1.i = temp.i + z__2.i;
temp.r = z__1.r, temp.i = z__1.i;
ix -= *incx;
/* L130: */
}
}
i__3 = jx;
x[i__3].r = temp.r, x[i__3].i = temp.i;
jx -= *incx;
/* L140: */
}
}
} else {
if (*incx == 1) {
i__3 = *n;
for (j = 1; j <= i__3; ++j) {
i__4 = j;
temp.r = x[i__4].r, temp.i = x[i__4].i;
l = 1 - j;
if (noconj) {
if (nounit) {
i__4 = j * a_dim1 + 1;
z__1.r = temp.r * a[i__4].r - temp.i * a[i__4].i, z__1.i = temp.r * a[i__4].i + temp.i * a[i__4].r;
temp.r = z__1.r, temp.i = z__1.i;
}
/* Computing MIN */
i__1 = *n, i__2 = j + *k;
i__4 = min(i__1, i__2);
for (i__ = j + 1; i__ <= i__4; ++i__) {
i__1 = l + i__ + j * a_dim1;
i__2 = i__;
z__2.r = a[i__1].r * x[i__2].r - a[i__1].i * x[i__2].i,
z__2.i = a[i__1].r * x[i__2].i + a[i__1].i * x[i__2].r;
z__1.r = temp.r + z__2.r, z__1.i = temp.i + z__2.i;
temp.r = z__1.r, temp.i = z__1.i;
/* L150: */
}
} else {
if (nounit) {
d_cnjg(&z__2, &a[j * a_dim1 + 1]);
z__1.r = temp.r * z__2.r - temp.i * z__2.i, z__1.i = temp.r * z__2.i + temp.i * z__2.r;
temp.r = z__1.r, temp.i = z__1.i;
}
/* Computing MIN */
i__1 = *n, i__2 = j + *k;
i__4 = min(i__1, i__2);
for (i__ = j + 1; i__ <= i__4; ++i__) {
d_cnjg(&z__3, &a[l + i__ + j * a_dim1]);
i__1 = i__;
z__2.r = z__3.r * x[i__1].r - z__3.i * x[i__1].i, z__2.i = z__3.r * x[i__1].i + z__3.i * x[i__1].r;
z__1.r = temp.r + z__2.r, z__1.i = temp.i + z__2.i;
temp.r = z__1.r, temp.i = z__1.i;
/* L160: */
}
}
i__4 = j;
x[i__4].r = temp.r, x[i__4].i = temp.i;
/* L170: */
}
} else {
jx = kx;
i__3 = *n;
for (j = 1; j <= i__3; ++j) {
i__4 = jx;
temp.r = x[i__4].r, temp.i = x[i__4].i;
kx += *incx;
ix = kx;
l = 1 - j;
if (noconj) {
if (nounit) {
i__4 = j * a_dim1 + 1;
z__1.r = temp.r * a[i__4].r - temp.i * a[i__4].i, z__1.i = temp.r * a[i__4].i + temp.i * a[i__4].r;
temp.r = z__1.r, temp.i = z__1.i;
}
/* Computing MIN */
i__1 = *n, i__2 = j + *k;
i__4 = min(i__1, i__2);
for (i__ = j + 1; i__ <= i__4; ++i__) {
i__1 = l + i__ + j * a_dim1;
i__2 = ix;
z__2.r = a[i__1].r * x[i__2].r - a[i__1].i * x[i__2].i,
z__2.i = a[i__1].r * x[i__2].i + a[i__1].i * x[i__2].r;
z__1.r = temp.r + z__2.r, z__1.i = temp.i + z__2.i;
temp.r = z__1.r, temp.i = z__1.i;
ix += *incx;
/* L180: */
}
} else {
if (nounit) {
d_cnjg(&z__2, &a[j * a_dim1 + 1]);
z__1.r = temp.r * z__2.r - temp.i * z__2.i, z__1.i = temp.r * z__2.i + temp.i * z__2.r;
temp.r = z__1.r, temp.i = z__1.i;
}
/* Computing MIN */
i__1 = *n, i__2 = j + *k;
i__4 = min(i__1, i__2);
for (i__ = j + 1; i__ <= i__4; ++i__) {
d_cnjg(&z__3, &a[l + i__ + j * a_dim1]);
i__1 = ix;
z__2.r = z__3.r * x[i__1].r - z__3.i * x[i__1].i, z__2.i = z__3.r * x[i__1].i + z__3.i * x[i__1].r;
z__1.r = temp.r + z__2.r, z__1.i = temp.i + z__2.i;
temp.r = z__1.r, temp.i = z__1.i;
ix += *incx;
/* L190: */
}
}
i__4 = jx;
x[i__4].r = temp.r, x[i__4].i = temp.i;
jx += *incx;
/* L200: */
}
}
}
}
/* End of ZTBMV . */
} /* ztbmv_ */

View File

@@ -25,15 +25,19 @@ struct functor_traits<scalar_norm1_op> {
// computes the sum of magnitudes of all vector elements or, for a complex vector x, the sum
// res = |Rex1| + |Imx1| + |Rex2| + |Imx2| + ... + |Rexn| + |Imxn|, where x is a vector of order n
extern "C" RealScalar EIGEN_CAT(REAL_SCALAR_SUFFIX, EIGEN_BLAS_FUNC_NAME(asum))(int *n, RealScalar *px, int *incx) {
// std::cerr << "__asum " << *n << " " << *incx << "\n";
Complex *x = reinterpret_cast<Complex *>(px);
if (*n <= 0) return 0;
// std::complex<T> is layout-compatible with T[2], so we can reinterpret
// a complex vector of length n as a real vector of length 2*n and use
// the fully vectorized cwiseAbs().sum() path.
if (*incx == 1)
return make_vector(x, *n).unaryExpr<scalar_norm1_op>().sum();
else
return make_vector(px, 2 * *n).cwiseAbs().sum();
else {
// For non-unit stride, fall back to the scalar_norm1_op approach since
// the real components are not contiguous across complex elements.
Complex *x = reinterpret_cast<Complex *>(px);
return make_vector(x, *n, std::abs(*incx)).unaryExpr<scalar_norm1_op>().sum();
}
}
extern "C" int EIGEN_CAT(i, EIGEN_BLAS_FUNC_NAME(amax))(int *n, RealScalar *px, int *incx) {

View File

@@ -69,15 +69,21 @@ EIGEN_BLAS_FUNC(copy)(int *n, RealScalar *px, int *incx, RealScalar *py, int *in
// be careful, *incx==0 is allowed !!
if (*incx == 1 && *incy == 1)
make_vector(y, *n) = make_vector(x, *n);
else {
if (*incx < 0) x = x - (*n - 1) * (*incx);
else if (*incx == 0) {
// Broadcast: copy x[0] to all elements of y.
if (*incy < 0) y = y - (*n - 1) * (*incy);
for (int i = 0; i < *n; ++i) {
*y = *x;
x += *incx;
y += *incy;
}
}
} else if (*incx > 0 && *incy > 0)
make_vector(y, *n, *incy) = make_vector(x, *n, *incx);
else if (*incx > 0 && *incy < 0)
make_vector(y, *n, -*incy).reverse() = make_vector(x, *n, *incx);
else if (*incx < 0 && *incy > 0)
make_vector(y, *n, *incy) = make_vector(x, *n, -*incx).reverse();
else if (*incx < 0 && *incy < 0)
make_vector(y, *n, -*incy) = make_vector(x, *n, -*incx);
}
EIGEN_BLAS_FUNC(rotg)(RealScalar *pa, RealScalar *pb, RealScalar *pc, RealScalar *ps) {

View File

@@ -58,23 +58,21 @@ extern "C" Scalar EIGEN_BLAS_FUNC_NAME(dot)(int *n, Scalar *px, int *incx, Scala
Scalar *y = reinterpret_cast<Scalar *>(py);
if (*incx == 1 && *incy == 1)
return (make_vector(x, *n).cwiseProduct(make_vector(y, *n))).sum();
return make_vector(x, *n).dot(make_vector(y, *n));
else if (*incx > 0 && *incy > 0)
return (make_vector(x, *n, *incx).cwiseProduct(make_vector(y, *n, *incy))).sum();
return make_vector(x, *n, *incx).dot(make_vector(y, *n, *incy));
else if (*incx < 0 && *incy > 0)
return (make_vector(x, *n, -*incx).reverse().cwiseProduct(make_vector(y, *n, *incy))).sum();
return make_vector(x, *n, -*incx).reverse().dot(make_vector(y, *n, *incy));
else if (*incx > 0 && *incy < 0)
return (make_vector(x, *n, *incx).cwiseProduct(make_vector(y, *n, -*incy).reverse())).sum();
return make_vector(x, *n, *incx).dot(make_vector(y, *n, -*incy).reverse());
else if (*incx < 0 && *incy < 0)
return (make_vector(x, *n, -*incx).reverse().cwiseProduct(make_vector(y, *n, -*incy).reverse())).sum();
return make_vector(x, *n, -*incx).reverse().dot(make_vector(y, *n, -*incy).reverse());
else
return 0;
}
// computes the Euclidean norm of a vector.
// FIXME
extern "C" Scalar EIGEN_BLAS_FUNC_NAME(nrm2)(int *n, Scalar *px, int *incx) {
// std::cerr << "_nrm2 " << *n << " " << *incx << "\n";
if (*n <= 0) return 0;
Scalar *x = reinterpret_cast<Scalar *>(px);
@@ -108,23 +106,171 @@ EIGEN_BLAS_FUNC(rot)(int *n, Scalar *px, int *incx, Scalar *py, int *incy, Scala
Eigen::internal::apply_rotation_in_the_plane(vx, vy, Eigen::JacobiRotation<Scalar>(c, s));
}
/*
// performs rotation of points in the modified plane.
EIGEN_BLAS_FUNC(rotm)(int *n, Scalar *px, int *incx, Scalar *py, int *incy, Scalar *param)
{
Scalar* x = reinterpret_cast<Scalar*>(px);
Scalar* y = reinterpret_cast<Scalar*>(py);
// Applies modified Givens rotation H to vectors x and y.
// param[0] = flag:
// -1: H = [[h11, h12], [h21, h22]] (all 4 elements from param)
// 0: H = [[1, h12], [h21, 1]] (h12, h21 from param)
// 1: H = [[h11, 1], [-1, h22]] (h11, h22 from param)
// -2: H = identity (no-op)
// param[1..4] = h11, h21, h12, h22
EIGEN_BLAS_FUNC(rotm)(int *n, Scalar *px, int *incx, Scalar *py, int *incy, Scalar *param) {
Scalar *x = reinterpret_cast<Scalar *>(px);
Scalar *y = reinterpret_cast<Scalar *>(py);
// TODO
Scalar flag = param[0];
if (*n <= 0 || flag == Scalar(-2)) return;
return 0;
Scalar h11, h12, h21, h22;
if (flag < Scalar(0)) {
h11 = param[1];
h21 = param[2];
h12 = param[3];
h22 = param[4];
} else if (flag == Scalar(0)) {
h11 = Scalar(1);
h21 = param[2];
h12 = param[3];
h22 = Scalar(1);
} else {
h11 = param[1];
h21 = Scalar(-1);
h12 = Scalar(1);
h22 = param[4];
}
int kx = *incx > 0 ? 0 : (1 - *n) * *incx;
int ky = *incy > 0 ? 0 : (1 - *n) * *incy;
for (int i = 0; i < *n; ++i) {
Scalar w = x[kx];
Scalar z = y[ky];
x[kx] = h11 * w + h12 * z;
y[ky] = h21 * w + h22 * z;
kx += *incx;
ky += *incy;
}
}
// computes the modified parameters for a Givens rotation.
EIGEN_BLAS_FUNC(rotmg)(Scalar *d1, Scalar *d2, Scalar *x1, Scalar *x2, Scalar *param)
{
// TODO
// Constructs the modified Givens transformation matrix H which zeros the second
// component of (sqrt(d1)*x1, sqrt(d2)*y1)^T.
EIGEN_BLAS_FUNC(rotmg)(Scalar *d1, Scalar *d2, Scalar *x1, Scalar *y1, Scalar *param) {
using std::abs;
return 0;
const Scalar gam = Scalar(4096);
const Scalar gamsq = gam * gam;
const Scalar rgamsq = Scalar(1) / gamsq;
Scalar flag, h11 = Scalar(0), h12 = Scalar(0), h21 = Scalar(0), h22 = Scalar(0);
if (*d1 < Scalar(0)) {
// Negative d1: zero everything.
flag = Scalar(-1);
*d1 = *d2 = *x1 = Scalar(0);
} else {
Scalar p2 = *d2 * *y1;
if (p2 == Scalar(0)) {
// d2*y1 == 0: identity transform.
param[0] = Scalar(-2);
return;
}
Scalar p1 = *d1 * *x1;
Scalar q2 = p2 * *y1;
Scalar q1 = p1 * *x1;
bool do_scale = true;
if (abs(q1) > abs(q2)) {
h21 = -(*y1) / *x1;
h12 = p2 / p1;
Scalar u = Scalar(1) - h12 * h21;
if (u <= Scalar(0)) {
flag = Scalar(-1);
h11 = h12 = h21 = h22 = Scalar(0);
*d1 = *d2 = *x1 = Scalar(0);
do_scale = false;
} else {
flag = Scalar(0);
*d1 /= u;
*d2 /= u;
*x1 *= u;
}
} else if (q2 < Scalar(0)) {
flag = Scalar(-1);
h11 = h12 = h21 = h22 = Scalar(0);
*d1 = *d2 = *x1 = Scalar(0);
do_scale = false;
} else {
flag = Scalar(1);
h11 = p1 / p2;
h22 = *x1 / *y1;
Scalar u = Scalar(1) + h11 * h22;
Scalar temp = *d2 / u;
*d2 = *d1 / u;
*d1 = temp;
*x1 = *y1 * u;
}
if (do_scale) {
// Converts compact H representation (flag 0 or 1) to full form (flag -1)
// so that scaling factors can be absorbed into all four elements.
auto fix_h = [&]() {
if (flag >= Scalar(0)) {
if (flag == Scalar(0)) {
h11 = Scalar(1);
h22 = Scalar(1);
} else {
h21 = Scalar(-1);
h12 = Scalar(1);
}
flag = Scalar(-1);
}
};
// Scale d1 up if too small.
while (*d1 <= rgamsq && *d1 != Scalar(0)) {
fix_h();
*d1 *= gamsq;
*x1 /= gam;
h11 /= gam;
h12 /= gam;
}
// Scale d1 down if too large.
while (*d1 >= gamsq) {
fix_h();
*d1 /= gamsq;
*x1 *= gam;
h11 *= gam;
h12 *= gam;
}
// Scale |d2| up if too small.
while (abs(*d2) <= rgamsq && *d2 != Scalar(0)) {
fix_h();
*d2 *= gamsq;
h21 /= gam;
h22 /= gam;
}
// Scale |d2| down if too large.
while (abs(*d2) >= gamsq) {
fix_h();
*d2 /= gamsq;
h21 *= gam;
h22 *= gam;
}
}
}
// Store result in param array.
if (flag < Scalar(0)) {
param[1] = h11;
param[2] = h21;
param[3] = h12;
param[4] = h22;
} else if (flag == Scalar(0)) {
param[2] = h21;
param[3] = h12;
} else {
param[1] = h11;
param[4] = h22;
}
param[0] = flag;
}
*/

View File

@@ -72,31 +72,193 @@ EIGEN_BLAS_FUNC(hemv)
if (actual_y != y) delete[] copy_back(actual_y, y, *n, *incy);
}
/** ZHBMV performs the matrix-vector operation
/** HBMV performs the matrix-vector operation
*
* y := alpha*A*x + beta*y,
*
* where alpha and beta are scalars, x and y are n element vectors and
* A is an n by n hermitian band matrix, with k super-diagonals.
* Diagonal elements are real; off-diagonal contributions use conjugation.
*/
// EIGEN_BLAS_FUNC(hbmv)(char *uplo, int *n, int *k, RealScalar *alpha, RealScalar *a, int *lda,
// RealScalar *x, int *incx, RealScalar *beta, RealScalar *y, int *incy)
// {
// return 1;
// }
EIGEN_BLAS_FUNC(hbmv)
(char *uplo, int *n, int *k, RealScalar *palpha, RealScalar *pa, int *lda, RealScalar *px, int *incx, RealScalar *pbeta,
RealScalar *py, int *incy) {
const Scalar alpha = *reinterpret_cast<const Scalar *>(palpha);
const Scalar beta = *reinterpret_cast<const Scalar *>(pbeta);
const Scalar *a = reinterpret_cast<const Scalar *>(pa);
const Scalar *x = reinterpret_cast<const Scalar *>(px);
Scalar *y = reinterpret_cast<Scalar *>(py);
/** ZHPMV performs the matrix-vector operation
int info = 0;
if (UPLO(*uplo) == INVALID)
info = 1;
else if (*n < 0)
info = 2;
else if (*k < 0)
info = 3;
else if (*lda < *k + 1)
info = 6;
else if (*incx == 0)
info = 8;
else if (*incy == 0)
info = 11;
if (info) return xerbla_(SCALAR_SUFFIX_UP "HBMV ", &info);
if (*n == 0 || (alpha == Scalar(0) && beta == Scalar(1))) return;
const Scalar *actual_x = get_compact_vector(x, *n, *incx);
Scalar *actual_y = get_compact_vector(y, *n, *incy);
// First form y := beta*y.
if (beta != Scalar(1)) {
if (beta == Scalar(0))
make_vector(actual_y, *n).setZero();
else
make_vector(actual_y, *n) *= beta;
}
if (alpha == Scalar(0)) {
if (actual_x != x) delete[] actual_x;
if (actual_y != y) delete[] copy_back(actual_y, y, *n, *incy);
return;
}
if (*k >= 8) {
// Vectorized path: use Eigen Map segments for the inner band operations.
ConstMatrixType band(a, *k + 1, *n, *lda);
if (UPLO(*uplo) == UP) {
for (int j = 0; j < *n; ++j) {
int start = std::max(0, j - *k);
int len = j - start;
int offset = *k - (j - start);
Scalar temp1 = alpha * actual_x[j];
actual_y[j] += Scalar(Eigen::numext::real(band(*k, j))) * temp1;
if (len > 0) {
make_vector(actual_y + start, len) += temp1 * band.col(j).segment(offset, len);
actual_y[j] += alpha * band.col(j).segment(offset, len).dot(make_vector(actual_x + start, len));
}
}
} else {
for (int j = 0; j < *n; ++j) {
int len = std::min(*n - 1, j + *k) - j;
Scalar temp1 = alpha * actual_x[j];
actual_y[j] += Scalar(Eigen::numext::real(band(0, j))) * temp1;
if (len > 0) {
make_vector(actual_y + j + 1, len) += temp1 * band.col(j).segment(1, len);
actual_y[j] += alpha * band.col(j).segment(1, len).dot(make_vector(actual_x + j + 1, len));
}
}
}
} else {
// Scalar path: for narrow bandwidth, avoid Map overhead.
if (UPLO(*uplo) == UP) {
for (int j = 0; j < *n; ++j) {
Scalar temp1 = alpha * actual_x[j];
Scalar temp2 = Scalar(0);
for (int i = std::max(0, j - *k); i < j; ++i) {
Scalar aij = a[(*k + i - j) + j * *lda];
actual_y[i] += temp1 * aij;
temp2 += Eigen::numext::conj(aij) * actual_x[i];
}
actual_y[j] += Scalar(Eigen::numext::real(a[*k + j * *lda])) * temp1 + alpha * temp2;
}
} else {
for (int j = 0; j < *n; ++j) {
Scalar temp1 = alpha * actual_x[j];
Scalar temp2 = Scalar(0);
actual_y[j] += Scalar(Eigen::numext::real(a[j * *lda])) * temp1;
for (int i = j + 1; i <= std::min(*n - 1, j + *k); ++i) {
Scalar aij = a[(i - j) + j * *lda];
actual_y[i] += temp1 * aij;
temp2 += Eigen::numext::conj(aij) * actual_x[i];
}
actual_y[j] += alpha * temp2;
}
}
}
if (actual_x != x) delete[] actual_x;
if (actual_y != y) delete[] copy_back(actual_y, y, *n, *incy);
}
/** HPMV performs the matrix-vector operation
*
* y := alpha*A*x + beta*y,
*
* where alpha and beta are scalars, x and y are n element vectors and
* A is an n by n hermitian matrix, supplied in packed form.
* Diagonal elements are real; off-diagonal contributions use conjugation.
*/
// EIGEN_BLAS_FUNC(hpmv)(char *uplo, int *n, RealScalar *alpha, RealScalar *ap, RealScalar *x, int *incx, RealScalar
// *beta, RealScalar *y, int *incy)
// {
// return 1;
// }
EIGEN_BLAS_FUNC(hpmv)
(char *uplo, int *n, RealScalar *palpha, RealScalar *pap, RealScalar *px, int *incx, RealScalar *pbeta, RealScalar *py,
int *incy) {
const Scalar alpha = *reinterpret_cast<const Scalar *>(palpha);
const Scalar beta = *reinterpret_cast<const Scalar *>(pbeta);
const Scalar *ap = reinterpret_cast<const Scalar *>(pap);
const Scalar *x = reinterpret_cast<const Scalar *>(px);
Scalar *y = reinterpret_cast<Scalar *>(py);
int info = 0;
if (UPLO(*uplo) == INVALID)
info = 1;
else if (*n < 0)
info = 2;
else if (*incx == 0)
info = 6;
else if (*incy == 0)
info = 9;
if (info) return xerbla_(SCALAR_SUFFIX_UP "HPMV ", &info);
if (*n == 0 || (alpha == Scalar(0) && beta == Scalar(1))) return;
const Scalar *actual_x = get_compact_vector(x, *n, *incx);
Scalar *actual_y = get_compact_vector(y, *n, *incy);
// First form y := beta*y.
if (beta != Scalar(1)) {
if (beta == Scalar(0))
make_vector(actual_y, *n).setZero();
else
make_vector(actual_y, *n) *= beta;
}
if (alpha == Scalar(0)) {
if (actual_x != x) delete[] actual_x;
if (actual_y != y) delete[] copy_back(actual_y, y, *n, *incy);
return;
}
int kk = 0;
if (UPLO(*uplo) == UP) {
// Upper triangle packed: column j occupies ap[kk..kk+j].
for (int j = 0; j < *n; ++j) {
Scalar temp1 = alpha * actual_x[j];
// Diagonal is real.
actual_y[j] += Scalar(Eigen::numext::real(ap[kk + j])) * temp1;
if (j > 0) {
make_vector(actual_y, j) += temp1 * make_vector(ap + kk, j);
actual_y[j] += alpha * make_vector(ap + kk, j).dot(make_vector(actual_x, j));
}
kk += j + 1;
}
} else {
// Lower triangle packed: column j occupies ap[kk..kk+(n-j-1)].
for (int j = 0; j < *n; ++j) {
int len = *n - j - 1;
Scalar temp1 = alpha * actual_x[j];
// Diagonal is real.
actual_y[j] += Scalar(Eigen::numext::real(ap[kk])) * temp1;
if (len > 0) {
make_vector(actual_y + j + 1, len) += temp1 * make_vector(ap + kk + 1, len);
actual_y[j] += alpha * make_vector(ap + kk + 1, len).dot(make_vector(actual_x + j + 1, len));
}
kk += *n - j;
}
}
if (actual_x != x) delete[] actual_x;
if (actual_y != y) delete[] copy_back(actual_y, y, *n, *incy);
}
/** ZHPR performs the hermitian rank 1 operation
*

View File

@@ -303,61 +303,158 @@ EIGEN_BLAS_FUNC(gbmv)
if (actual_y != y) delete[] copy_back(actual_y, y, actual_m, *incy);
}
#if 0
/** TBMV performs one of the matrix-vector operations
*
* x := A*x, or x := A'*x,
*
* where x is an n element vector and A is an n by n unit, or non-unit,
* upper or lower triangular band matrix, with ( k + 1 ) diagonals.
*/
EIGEN_BLAS_FUNC(tbmv)(char *uplo, char *opa, char *diag, int *n, int *k, RealScalar *pa, int *lda, RealScalar *px, int *incx)
{
Scalar* a = reinterpret_cast<Scalar*>(pa);
Scalar* x = reinterpret_cast<Scalar*>(px);
int coeff_rows = *k + 1;
*
* x := A*x, or x := A'*x, or x := conjg(A')*x,
*
* where x is an n element vector and A is an n by n unit, or non-unit,
* upper or lower triangular band matrix, with ( k + 1 ) diagonals.
*
* Band storage: upper triangle stores A[i,j] at a[(k+i-j) + j*lda],
* lower triangle stores A[i,j] at a[(i-j) + j*lda].
*/
EIGEN_BLAS_FUNC(tbmv)
(char *uplo, char *opa, char *diag, int *n, int *k, RealScalar *pa, int *lda, RealScalar *px, int *incx) {
Scalar *a = reinterpret_cast<Scalar *>(pa);
Scalar *x = reinterpret_cast<Scalar *>(px);
int info = 0;
if(UPLO(*uplo)==INVALID) info = 1;
else if(OP(*opa)==INVALID) info = 2;
else if(DIAG(*diag)==INVALID) info = 3;
else if(*n<0) info = 4;
else if(*k<0) info = 5;
else if(*lda<coeff_rows) info = 7;
else if(*incx==0) info = 9;
if(info)
return xerbla_(SCALAR_SUFFIX_UP"TBMV ",&info,6);
if (UPLO(*uplo) == INVALID)
info = 1;
else if (OP(*opa) == INVALID)
info = 2;
else if (DIAG(*diag) == INVALID)
info = 3;
else if (*n < 0)
info = 4;
else if (*k < 0)
info = 5;
else if (*lda < *k + 1)
info = 7;
else if (*incx == 0)
info = 9;
if (info) return xerbla_(SCALAR_SUFFIX_UP "TBMV ", &info);
if(*n==0) return;
if (*n == 0) return;
int actual_n = *n;
Scalar *actual_x = get_compact_vector(x, *n, *incx);
Scalar* actual_x = get_compact_vector(x,actual_n,*incx);
bool upper = (UPLO(*uplo) == UP);
int op = OP(*opa);
bool unit = (DIAG(*diag) == UNIT);
MatrixType mat_coeffs(a,coeff_rows,*n,*lda);
int ku = UPLO(*uplo)==UPPER ? *k : 0;
int kl = UPLO(*uplo)==LOWER ? *k : 0;
for(int j=0; j<*n; ++j)
{
int start = std::max(0,j - ku);
int end = std::min((*m)-1,j + kl);
int len = end - start + 1;
int offset = (ku) - j + start;
if(OP(*trans)==NOTR)
make_vector(actual_y+start,len) += (alpha*actual_x[j]) * mat_coeffs.col(j).segment(offset,len);
else if(OP(*trans)==TR)
actual_y[j] += alpha * ( mat_coeffs.col(j).segment(offset,len).transpose() * make_vector(actual_x+start,len) ).value();
else
actual_y[j] += alpha * ( mat_coeffs.col(j).segment(offset,len).adjoint() * make_vector(actual_x+start,len) ).value();
if (*k >= 8) {
// Vectorized path: use Eigen Map segments for the inner band operations.
ConstMatrixType band(a, *k + 1, *n, *lda);
if (op == NOTR) {
if (upper) {
for (int j = 0; j < *n; ++j) {
if (actual_x[j] != Scalar(0)) {
int start = std::max(0, j - *k);
int len = j - start;
int offset = *k - (j - start);
Scalar temp = actual_x[j];
if (len > 0) make_vector(actual_x + start, len) += temp * band.col(j).segment(offset, len);
if (!unit) actual_x[j] = temp * band(*k, j);
}
}
} else {
for (int j = *n - 1; j >= 0; --j) {
if (actual_x[j] != Scalar(0)) {
int len = std::min(*n - 1, j + *k) - j;
Scalar temp = actual_x[j];
if (len > 0) make_vector(actual_x + j + 1, len) += temp * band.col(j).segment(1, len);
if (!unit) actual_x[j] = temp * band(0, j);
}
}
}
} else if (op == TR) {
if (upper) {
for (int j = *n - 1; j >= 0; --j) {
int start = std::max(0, j - *k);
int len = j - start;
int offset = *k - (j - start);
Scalar temp = actual_x[j];
if (!unit) temp *= band(*k, j);
if (len > 0)
temp += (band.col(j).segment(offset, len).cwiseProduct(make_vector(actual_x + start, len))).sum();
actual_x[j] = temp;
}
} else {
for (int j = 0; j < *n; ++j) {
int len = std::min(*n - 1, j + *k) - j;
Scalar temp = actual_x[j];
if (!unit) temp *= band(0, j);
if (len > 0) temp += (band.col(j).segment(1, len).cwiseProduct(make_vector(actual_x + j + 1, len))).sum();
actual_x[j] = temp;
}
}
} else {
// Conjugate transpose: .dot() computes conj(lhs) . rhs.
if (upper) {
for (int j = *n - 1; j >= 0; --j) {
int start = std::max(0, j - *k);
int len = j - start;
int offset = *k - (j - start);
Scalar temp = actual_x[j];
if (!unit) temp *= Eigen::numext::conj(band(*k, j));
if (len > 0) temp += band.col(j).segment(offset, len).dot(make_vector(actual_x + start, len));
actual_x[j] = temp;
}
} else {
for (int j = 0; j < *n; ++j) {
int len = std::min(*n - 1, j + *k) - j;
Scalar temp = actual_x[j];
if (!unit) temp *= Eigen::numext::conj(band(0, j));
if (len > 0) temp += band.col(j).segment(1, len).dot(make_vector(actual_x + j + 1, len));
actual_x[j] = temp;
}
}
}
} else {
// Scalar path: for narrow bandwidth, avoid Map overhead.
if (op == NOTR) {
if (upper) {
for (int j = 0; j < *n; ++j) {
if (actual_x[j] != Scalar(0)) {
Scalar temp = actual_x[j];
for (int i = std::max(0, j - *k); i < j; ++i) actual_x[i] += temp * a[(*k + i - j) + j * *lda];
if (!unit) actual_x[j] = temp * a[*k + j * *lda];
}
}
} else {
for (int j = *n - 1; j >= 0; --j) {
if (actual_x[j] != Scalar(0)) {
Scalar temp = actual_x[j];
for (int i = j + 1; i <= std::min(*n - 1, j + *k); ++i) actual_x[i] += temp * a[(i - j) + j * *lda];
if (!unit) actual_x[j] = temp * a[j * *lda];
}
}
}
} else {
// Transpose or conjugate transpose.
auto maybe_conj = [op](Scalar val) -> Scalar { return op == ADJ ? Eigen::numext::conj(val) : val; };
if (upper) {
for (int j = *n - 1; j >= 0; --j) {
Scalar temp = actual_x[j];
if (!unit) temp *= maybe_conj(a[*k + j * *lda]);
for (int i = std::max(0, j - *k); i < j; ++i) temp += maybe_conj(a[(*k + i - j) + j * *lda]) * actual_x[i];
actual_x[j] = temp;
}
} else {
for (int j = 0; j < *n; ++j) {
Scalar temp = actual_x[j];
if (!unit) temp *= maybe_conj(a[j * *lda]);
for (int i = j + 1; i <= std::min(*n - 1, j + *k); ++i)
temp += maybe_conj(a[(i - j) + j * *lda]) * actual_x[i];
actual_x[j] = temp;
}
}
}
}
if(actual_x!=x) delete[] actual_x;
if(actual_y!=y) delete[] copy_back(actual_y,y,actual_m,*incy);
if (actual_x != x) delete[] copy_back(actual_x, x, *n, *incx);
}
#endif
/** DTBSV solves one of the systems of equations
*

View File

@@ -158,32 +158,196 @@ EIGEN_BLAS_FUNC(syr2)
// func[code](*n, a, *inca, b, *incb, c, *ldc, alpha);
}
/** DSBMV performs the matrix-vector operation
/** SBMV performs the matrix-vector operation
*
* y := alpha*A*x + beta*y,
*
* where alpha and beta are scalars, x and y are n element vectors and
* A is an n by n symmetric band matrix, with k super-diagonals.
*
* Band storage: upper triangle stores A[i,j] at a[(k+i-j) + j*lda],
* lower triangle stores A[i,j] at a[(i-j) + j*lda].
*/
// EIGEN_BLAS_FUNC(sbmv)( char *uplo, int *n, int *k, RealScalar *alpha, RealScalar *a, int *lda,
// RealScalar *x, int *incx, RealScalar *beta, RealScalar *y, int *incy)
// {
// return 1;
// }
EIGEN_BLAS_FUNC(sbmv)
(char *uplo, int *n, int *k, RealScalar *palpha, RealScalar *pa, int *lda, RealScalar *px, int *incx, RealScalar *pbeta,
RealScalar *py, int *incy) {
const Scalar alpha = *reinterpret_cast<const Scalar *>(palpha);
const Scalar beta = *reinterpret_cast<const Scalar *>(pbeta);
const Scalar *a = reinterpret_cast<const Scalar *>(pa);
const Scalar *x = reinterpret_cast<const Scalar *>(px);
Scalar *y = reinterpret_cast<Scalar *>(py);
/** DSPMV performs the matrix-vector operation
int info = 0;
if (UPLO(*uplo) == INVALID)
info = 1;
else if (*n < 0)
info = 2;
else if (*k < 0)
info = 3;
else if (*lda < *k + 1)
info = 6;
else if (*incx == 0)
info = 8;
else if (*incy == 0)
info = 11;
if (info) return xerbla_(SCALAR_SUFFIX_UP "SBMV ", &info);
if (*n == 0 || (alpha == Scalar(0) && beta == Scalar(1))) return;
const Scalar *actual_x = get_compact_vector(x, *n, *incx);
Scalar *actual_y = get_compact_vector(y, *n, *incy);
// First form y := beta*y.
if (beta != Scalar(1)) {
if (beta == Scalar(0))
make_vector(actual_y, *n).setZero();
else
make_vector(actual_y, *n) *= beta;
}
if (alpha == Scalar(0)) {
if (actual_x != x) delete[] actual_x;
if (actual_y != y) delete[] copy_back(actual_y, y, *n, *incy);
return;
}
if (*k >= 8) {
// Vectorized path: use Eigen Map segments for the inner band operations.
ConstMatrixType band(a, *k + 1, *n, *lda);
if (UPLO(*uplo) == UP) {
for (int j = 0; j < *n; ++j) {
int start = std::max(0, j - *k);
int len = j - start;
int offset = *k - (j - start);
Scalar temp1 = alpha * actual_x[j];
actual_y[j] += temp1 * band(*k, j);
if (len > 0) {
make_vector(actual_y + start, len) += temp1 * band.col(j).segment(offset, len);
actual_y[j] += alpha * band.col(j).segment(offset, len).dot(make_vector(actual_x + start, len));
}
}
} else {
for (int j = 0; j < *n; ++j) {
int len = std::min(*n - 1, j + *k) - j;
Scalar temp1 = alpha * actual_x[j];
actual_y[j] += temp1 * band(0, j);
if (len > 0) {
make_vector(actual_y + j + 1, len) += temp1 * band.col(j).segment(1, len);
actual_y[j] += alpha * band.col(j).segment(1, len).dot(make_vector(actual_x + j + 1, len));
}
}
}
} else {
// Scalar path: for narrow bandwidth, avoid Map overhead.
if (UPLO(*uplo) == UP) {
for (int j = 0; j < *n; ++j) {
Scalar temp1 = alpha * actual_x[j];
Scalar temp2 = Scalar(0);
for (int i = std::max(0, j - *k); i < j; ++i) {
Scalar aij = a[(*k + i - j) + j * *lda];
actual_y[i] += temp1 * aij;
temp2 += aij * actual_x[i];
}
actual_y[j] += temp1 * a[*k + j * *lda] + alpha * temp2;
}
} else {
for (int j = 0; j < *n; ++j) {
Scalar temp1 = alpha * actual_x[j];
Scalar temp2 = Scalar(0);
actual_y[j] += temp1 * a[j * *lda];
for (int i = j + 1; i <= std::min(*n - 1, j + *k); ++i) {
Scalar aij = a[(i - j) + j * *lda];
actual_y[i] += temp1 * aij;
temp2 += aij * actual_x[i];
}
actual_y[j] += alpha * temp2;
}
}
}
if (actual_x != x) delete[] actual_x;
if (actual_y != y) delete[] copy_back(actual_y, y, *n, *incy);
}
/** SPMV performs the matrix-vector operation
*
* y := alpha*A*x + beta*y,
*
* where alpha and beta are scalars, x and y are n element vectors and
* A is an n by n symmetric matrix, supplied in packed form.
*
* Packed storage: upper triangle stores columns sequentially so that
* column j occupies positions kk..kk+j (where kk = j*(j+1)/2),
* lower triangle stores column j at positions kk..kk+(n-j-1).
*/
// EIGEN_BLAS_FUNC(spmv)(char *uplo, int *n, RealScalar *alpha, RealScalar *ap, RealScalar *x, int *incx, RealScalar
// *beta, RealScalar *y, int *incy)
// {
// return 1;
// }
EIGEN_BLAS_FUNC(spmv)
(char *uplo, int *n, RealScalar *palpha, RealScalar *pap, RealScalar *px, int *incx, RealScalar *pbeta, RealScalar *py,
int *incy) {
const Scalar alpha = *reinterpret_cast<const Scalar *>(palpha);
const Scalar beta = *reinterpret_cast<const Scalar *>(pbeta);
const Scalar *ap = reinterpret_cast<const Scalar *>(pap);
const Scalar *x = reinterpret_cast<const Scalar *>(px);
Scalar *y = reinterpret_cast<Scalar *>(py);
int info = 0;
if (UPLO(*uplo) == INVALID)
info = 1;
else if (*n < 0)
info = 2;
else if (*incx == 0)
info = 6;
else if (*incy == 0)
info = 9;
if (info) return xerbla_(SCALAR_SUFFIX_UP "SPMV ", &info);
if (*n == 0 || (alpha == Scalar(0) && beta == Scalar(1))) return;
const Scalar *actual_x = get_compact_vector(x, *n, *incx);
Scalar *actual_y = get_compact_vector(y, *n, *incy);
// First form y := beta*y.
if (beta != Scalar(1)) {
if (beta == Scalar(0))
make_vector(actual_y, *n).setZero();
else
make_vector(actual_y, *n) *= beta;
}
if (alpha == Scalar(0)) {
if (actual_x != x) delete[] actual_x;
if (actual_y != y) delete[] copy_back(actual_y, y, *n, *incy);
return;
}
int kk = 0;
if (UPLO(*uplo) == UP) {
// Upper triangle packed: column j occupies ap[kk..kk+j].
for (int j = 0; j < *n; ++j) {
Scalar temp1 = alpha * actual_x[j];
actual_y[j] += temp1 * ap[kk + j];
if (j > 0) {
make_vector(actual_y, j) += temp1 * make_vector(ap + kk, j);
actual_y[j] += alpha * make_vector(ap + kk, j).dot(make_vector(actual_x, j));
}
kk += j + 1;
}
} else {
// Lower triangle packed: column j occupies ap[kk..kk+(n-j-1)].
for (int j = 0; j < *n; ++j) {
int len = *n - j - 1;
Scalar temp1 = alpha * actual_x[j];
actual_y[j] += temp1 * ap[kk];
if (len > 0) {
make_vector(actual_y + j + 1, len) += temp1 * make_vector(ap + kk + 1, len);
actual_y[j] += alpha * make_vector(ap + kk + 1, len).dot(make_vector(actual_x + j + 1, len));
}
kk += *n - j;
}
}
if (actual_x != x) delete[] actual_x;
if (actual_y != y) delete[] copy_back(actual_y, y, *n, *incy);
}
/** DSPR performs the symmetric rank 1 operation
*

15
blas/lsame.cpp Normal file
View File

@@ -0,0 +1,15 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#include <cctype>
#include "blas.h"
// LSAME returns true if ca and cb are the same letter, regardless of case.
extern "C" EIGEN_BLAS_API int lsame_(const char *ca, const char *cb) {
return std::toupper(static_cast<unsigned char>(*ca)) == std::toupper(static_cast<unsigned char>(*cb));
}

View File

@@ -197,7 +197,7 @@ build:linux:x86-64:nvhpc-26.1:default:unsupported:
# Additional flags passed to the cuda compiler.
EIGEN_CI_CUDA_CXX_FLAGS: ""
# Compute architectures present in the GitLab CI runners.
EIGEN_CI_CUDA_COMPUTE_ARCH: "50;75"
EIGEN_CI_CUDA_COMPUTE_ARCH: "70;75"
EIGEN_CI_BUILD_TARGET: buildtests_gpu
EIGEN_CI_TEST_CUDA_CLANG: "off"
EIGEN_CI_TEST_CUDA_NVC: "off"
@@ -234,7 +234,7 @@ build:linux:cuda-12.2:clang-12:
# ROCm HIP
build:linux:rocm-latest:gcc-10:
extends: .build:linux:cross
image: rocm/dev-ubuntu-24.04:latest
image: rocm/dev-ubuntu-24.04:6.3.1
variables:
EIGEN_CI_C_COMPILER: gcc-10
EIGEN_CI_CXX_COMPILER: g++-10
@@ -386,6 +386,6 @@ build:linux:cross:x86-64:clang-14:sanitizer:smoketest:
rules:
- if: $CI_PIPELINE_SOURCE == "merge_request_event"
tags:
- saas-linux-medium-amd64
- saas-linux-large-amd64
allow_failure: true
timeout: 30m

View File

@@ -55,7 +55,7 @@ build:windows:x86-64:msvc-14.29:avx512dq:
extends: .build:windows
variables:
# Compute architectures present in the GitLab CI runners.
EIGEN_CI_CUDA_COMPUTE_ARCH: "50;75"
EIGEN_CI_CUDA_COMPUTE_ARCH: "70;75"
EIGEN_CI_BUILD_TARGET: buildtests_gpu
EIGEN_CI_ADDITIONAL_ARGS:
-DEIGEN_TEST_CUDA=on
@@ -66,8 +66,8 @@ build:windows:x86-64:msvc-14.29:avx512dq:
- x86-64
- cuda
# MSVC 14.29 + CUDA 11.4
build:windows:x86-64:cuda-11.4:msvc-14.29:
# MSVC 14.29 + CUDA 12.2
build:windows:x86-64:cuda-12.2:msvc-14.29:
extends: .build:windows:cuda
variables:
EIGEN_CI_BEFORE_SCRIPT: $$env:CUDA_PATH=$$env:CUDA_PATH_V11_4
EIGEN_CI_BEFORE_SCRIPT: $$env:CUDA_PATH=$$env:CUDA_PATH_V12_2

View File

@@ -488,7 +488,6 @@ test:linux:x86-64:clang-14:sanitizer:smoketest:
variables:
EIGEN_CI_INSTALL: clang-14 llvm-14 libclang-rt-14-dev
EIGEN_CI_CTEST_LABEL: smoketest
EIGEN_CI_CTEST_PARALLEL: "2"
EIGEN_CI_CTEST_ARGS: --timeout 120
ASAN_OPTIONS: "detect_leaks=0:halt_on_error=1:abort_on_error=1:allocator_may_return_null=1:print_stacktrace=1:detect_stack_use_after_return=0"
ASAN_SYMBOLIZER_PATH: "/usr/lib/llvm-14/bin/llvm-symbolizer"
@@ -496,6 +495,6 @@ test:linux:x86-64:clang-14:sanitizer:smoketest:
rules:
- if: $CI_PIPELINE_SOURCE == "merge_request_event"
tags:
- saas-linux-medium-amd64
- saas-linux-large-amd64
allow_failure: true
timeout: 30m

View File

@@ -71,7 +71,7 @@ test:windows:x86-64:msvc-14.29:avx512dq:unsupported:
- x86-64
- cuda
# MSVC 14.29 + CUDA 11.4
test:windows:x86-64:cuda-11.4:msvc-14.29:
# MSVC 14.29 + CUDA 12.2
test:windows:x86-64:cuda-12.2:msvc-14.29:
extends: .test:windows:cuda
needs: [ build:windows:x86-64:cuda-11.4:msvc-14.29 ]
needs: [ build:windows:x86-64:cuda-12.2:msvc-14.29 ]

View File

@@ -20,7 +20,8 @@ add_dependencies(check buildtests)
# Convenience target for only building GPU tests.
add_custom_target(buildtests_gpu)
add_custom_target(check_gpu COMMAND "ctest" "--output-on-failure"
add_custom_target(check_gpu COMMAND "ctest" ${EIGEN_CTEST_ARGS}
"--output-on-failure"
"--no-compress-output"
"--build-no-clean"
"-T" "test"
@@ -71,4 +72,3 @@ elseif(MSVC)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /D_CRT_SECURE_NO_WARNINGS /D_SCL_SECURE_NO_WARNINGS")
endif()

View File

@@ -8,6 +8,12 @@ macro(ei_add_property prop value)
endif()
endmacro()
if(EIGEN_TEST_HIP AND NOT DEFINED EIGEN_HIP_ARCHITECTURES)
set(EIGEN_HIP_ARCHITECTURES
gfx900;gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1150;gfx1151
CACHE STRING "HIP GPU architectures to build Eigen's HIP tests for.")
endif()
#internal. See documentation of ei_add_test for details.
macro(ei_add_test_internal testname testname_with_suffix)
set(targetname ${testname_with_suffix})
@@ -30,7 +36,7 @@ macro(ei_add_test_internal testname testname_with_suffix)
hip_reset_flags()
hip_add_executable(${targetname} ${filename} HIPCC_OPTIONS -std=c++14)
target_compile_definitions(${targetname} PRIVATE -DEIGEN_USE_HIP)
set_property(TARGET ${targetname} PROPERTY HIP_ARCHITECTURES gfx900 gfx906 gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030)
set_property(TARGET ${targetname} PROPERTY HIP_ARCHITECTURES "${EIGEN_HIP_ARCHITECTURES}")
elseif(EIGEN_TEST_CUDA_CLANG)
set_source_files_properties(${filename} PROPERTIES LANGUAGE CXX)
@@ -134,6 +140,7 @@ macro(ei_add_test_internal testname testname_with_suffix)
if (is_gpu_test)
# Add gpu tag for testing only GPU tests.
set_property(TEST ${testname_with_suffix} APPEND PROPERTY LABELS "gpu")
set_property(TEST ${testname_with_suffix} PROPERTY SKIP_RETURN_CODE 77)
endif()
if(EIGEN_SYCL)

View File

@@ -30,10 +30,11 @@ Timings are in \b milliseconds, and factors are relative to the LLT decompositio
<a name="note_ls">\b *: </a> This decomposition do not support direct least-square solving for over-constrained problems, and the reported timing include the cost to form the symmetric covariance matrix \f$ A^T A \f$.
\b Observations:
+ LLT is always the fastest solvers.
+ LLT is always the fastest solver.
+ For largely over-constrained problems, the cost of Cholesky/LU decompositions is dominated by the computation of the symmetric covariance matrix.
+ For large problem sizes, only the decomposition implementing a cache-friendly blocking strategy scale well. Those include LLT, PartialPivLU, HouseholderQR, and BDCSVD. This explain why for a 4k x 4k matrix, HouseholderQR is faster than LDLT. In the future, LDLT and ColPivHouseholderQR will also implement blocking strategies.
+ For large problem sizes, only the decompositions implementing a cache-friendly blocking strategy scale well. Those include LLT, PartialPivLU, HouseholderQR, and BDCSVD. This explains why for a 4k x 4k matrix, HouseholderQR is faster than LDLT.
+ CompleteOrthogonalDecomposition is based on ColPivHouseholderQR and they thus achieve the same level of performance.
+ FullPivLU and FullPivHouseholderQR are dramatically slower for large matrices due to the lack of blocking, and are not shown for the 4k x 4k case.
The above table was originally generated by a benchmark tool. Feel free to write your own benchmark to generate a table matching your hardware, compiler, and favorite problem sizes.

View File

@@ -7,13 +7,33 @@ of equations, say \a Ax = \a b, has no solutions. In this case, it makes sense t
vector \a x which is closest to being a solution, in the sense that the difference \a Ax - \a b is
as small as possible. This \a x is called the least square solution (if the Euclidean norm is used).
The three methods discussed on this page are the SVD decomposition, the QR decomposition and normal
equations. Of these, the SVD decomposition is generally the most accurate but the slowest, normal
equations is the fastest but least accurate, and the QR decomposition is in between.
The methods discussed on this page are the complete orthogonal decomposition (COD), the SVD
decomposition, other QR decompositions, and normal equations. For most problems, we recommend
CompleteOrthogonalDecomposition: it robustly computes the minimum-norm least squares solution
(like the SVD) for both over- and under-determined systems, including rank-deficient ones, but at
QR-like speed. The SVD is the most robust but also the slowest; use it when you also need singular
values or vectors. Normal equations are the fastest but least robust.
\eigenAutoToc
\section LeastSquaresCOD Using the complete orthogonal decomposition (recommended)
CompleteOrthogonalDecomposition is the recommended method for least squares problems. It handles the
widest class of problems — overdetermined, underdetermined, and rank-deficient systems — and computes
the minimum-norm solution when the system is rank-deficient or underdetermined, just like the SVD.
It is based on a rank-revealing QR factorization (ColPivHouseholderQR) followed by a post-processing
step, so it is significantly faster than SVD while providing comparable robustness.
<table class="example">
<tr><th>Example:</th><th>Output:</th></tr>
<tr>
<td>\include LeastSquaresCOD.cpp </td>
<td>\verbinclude LeastSquaresCOD.out </td>
</tr>
</table>
\section LeastSquaresSVD Using the SVD decomposition
The \link BDCSVD::solve() solve() \endlink method in the BDCSVD class can be directly used to
@@ -30,16 +50,19 @@ computing least squares solutions:
</table>
This is example from the page \link TutorialLinearAlgebra Linear algebra and decompositions \endlink.
If you just need to solve the least squares problem, but are not interested in the SVD per se, a
faster alternative method is CompleteOrthogonalDecomposition.
The SVD gives you singular values and vectors in addition to the least squares solution, but if you
only need the solution, CompleteOrthogonalDecomposition (above) is faster.
\section LeastSquaresQR Using the QR decomposition
\section LeastSquaresQR Using other QR decompositions
The solve() method in QR decomposition classes also computes the least squares solution. There are
three QR decomposition classes: HouseholderQR (no pivoting, fast but unstable if your matrix is
not rull rank), ColPivHouseholderQR (column pivoting, thus a bit slower but more stable) and
FullPivHouseholderQR (full pivoting, so slowest and slightly more stable than ColPivHouseholderQR).
The solve() method in QR decomposition classes also computes the least squares solution. Besides
CompleteOrthogonalDecomposition (above), there are three other QR decomposition classes:
HouseholderQR (no pivoting, so fast but unreliable if your matrix is not full rank),
ColPivHouseholderQR (column pivoting, a bit slower but rank-revealing), and FullPivHouseholderQR
(full pivoting, significantly slower and rarely needed in practice).
Note that only CompleteOrthogonalDecomposition and the SVD-based solvers compute minimum-norm
solutions for rank-deficient or underdetermined problems; the other QR variants do not.
Here is an example with column pivoting:
<table class="example">

View File

@@ -42,10 +42,10 @@ To get an overview of the true relative speed of the different decompositions, c
<tr class="alt">
<td>FullPivLU</td>
<td>-</td>
<td>Slow</td>
<td>Slow (no blocking)</td>
<td>Proven</td>
<td>Yes</td>
<td>-</td>
<td>Rank, kernel, image</td>
<td>Yes</td>
<td>Excellent</td>
<td>-</td>
@@ -78,7 +78,7 @@ To get an overview of the true relative speed of the different decompositions, c
<tr>
<td>FullPivHouseholderQR</td>
<td>-</td>
<td>Slow</td>
<td>Slow (no blocking)</td>
<td>Proven</td>
<td>Yes</td>
<td>Orthogonalization</td>
@@ -120,7 +120,7 @@ To get an overview of the true relative speed of the different decompositions, c
<td>-</td>
<td>Yes</td>
<td>Excellent</td>
<td><em>Soon: blocking</em></td>
<td>-</td>
</tr>
<tr><th class="inter" colspan="9">\n Singular values and eigenvalues decompositions</th></tr>
@@ -232,7 +232,7 @@ To get an overview of the true relative speed of the different decompositions, c
<td>-</td>
<td>-</td>
<td>Good</td>
<td><em>Soon: blocking</em></td>
<td>-</td>
</tr>
<tr>
@@ -244,7 +244,7 @@ To get an overview of the true relative speed of the different decompositions, c
<td>-</td>
<td>-</td>
<td>Good</td>
<td><em>Soon: blocking</em></td>
<td>-</td>
</tr>
</table>
@@ -253,9 +253,32 @@ To get an overview of the true relative speed of the different decompositions, c
<ul>
<li><a name="note1">\b 1: </a>There exist two variants of the LDLT algorithm. Eigen's one produces a pure diagonal D matrix, and therefore it cannot handle indefinite matrices, unlike Lapack's one which produces a block diagonal D matrix.</li>
<li><a name="note2">\b 2: </a>Eigenvalues, SVD and Schur decompositions rely on iterative algorithms. Their convergence speed depends on how well the eigenvalues are separated.</li>
<li><a name="note3">\b 3: </a>Our JacobiSVD is two-sided, making for proven and optimal precision for square matrices. For non-square matrices, we have to use a QR preconditioner first. The default choice, ColPivHouseholderQR, is already very reliable, but if you want it to be proven, use FullPivHouseholderQR instead.
<li><a name="note3">\b 3: </a>Our JacobiSVD is two-sided, making for proven and optimal precision for square matrices. For non-square matrices, we have to use a QR preconditioner first. The default choice, ColPivHouseholderQR, is already very reliable, but if you want it to be proven, use FullPivHouseholderQR instead.</li>
</ul>
\section TopicLinAlgPracticalGuidance Practical guidance
The following recommendations apply to the most common use cases:
\li <b>Symmetric positive definite systems:</b> Use \b LLT. It is the fastest solver and has excellent
numerical properties for this class of problems. For semidefinite or nearly singular symmetric systems,
use \b LDLT.
\li <b>General invertible systems:</b> Use \b PartialPivLU. It uses cache-friendly blocking and implicit
multi-threading, making it the fastest general-purpose solver. Partial pivoting is sufficient for
virtually all practical problems.
\li <b>Least squares (over- or under-determined systems):</b> Use \b CompleteOrthogonalDecomposition as
the default. Like the SVD, it robustly computes the minimum-norm solution for rank-deficient and
under-determined problems, but at QR-like speed. Use \b BDCSVD when you also need singular values
or vectors, not just the least squares solution.
\li <b>Full-rank least squares (overdetermined systems):</b> When the matrix is known to be full rank,
\b HouseholderQR is the fastest option. For very tall and skinny well-conditioned matrices,
solving via the normal equations with \b LLT can be faster still.
\li <b>FullPivLU and FullPivHouseholderQR</b> use complete pivoting, which prevents the use of
cache-friendly blocking algorithms and makes them significantly slower than their partial/column
pivoting counterparts. In practice, complete pivoting rarely provides meaningful accuracy benefits.
These decompositions are primarily useful for debugging, pedagogy, or the very rare case
where column pivoting is insufficient.
\section TopicLinAlgTerminology Terminology
<dl>

View File

@@ -43,7 +43,23 @@ depending on your matrix, the problem you are trying to solve, and the trade-off
<th>Requirements<br/>on the matrix</th>
<th>Speed<br/> (small-to-medium)</th>
<th>Speed<br/> (large)</th>
<th>Accuracy</th>
<th>Robustness<sup><a href="#note_robust">*</a></sup></th>
</tr>
<tr>
<td>LLT</td>
<td>llt()</td>
<td>Positive definite</td>
<td>+++</td>
<td>+++</td>
<td>+</td>
</tr>
<tr class="alt">
<td>LDLT</td>
<td>ldlt()</td>
<td>Positive or negative<br/> semidefinite</td>
<td>+++</td>
<td>+</td>
<td>++</td>
</tr>
<tr>
<td>PartialPivLU</td>
@@ -54,14 +70,6 @@ depending on your matrix, the problem you are trying to solve, and the trade-off
<td>+</td>
</tr>
<tr class="alt">
<td>FullPivLU</td>
<td>fullPivLu()</td>
<td>None</td>
<td>-</td>
<td>- -</td>
<td>+++</td>
</tr>
<tr>
<td>HouseholderQR</td>
<td>householderQr()</td>
<td>None</td>
@@ -69,7 +77,7 @@ depending on your matrix, the problem you are trying to solve, and the trade-off
<td>++</td>
<td>+</td>
</tr>
<tr class="alt">
<tr>
<td>ColPivHouseholderQR</td>
<td>colPivHouseholderQr()</td>
<td>None</td>
@@ -77,14 +85,6 @@ depending on your matrix, the problem you are trying to solve, and the trade-off
<td>-</td>
<td>+++</td>
</tr>
<tr>
<td>FullPivHouseholderQR</td>
<td>fullPivHouseholderQr()</td>
<td>None</td>
<td>-</td>
<td>- -</td>
<td>+++</td>
</tr>
<tr class="alt">
<td>CompleteOrthogonalDecomposition</td>
<td>completeOrthogonalDecomposition()</td>
@@ -93,23 +93,7 @@ depending on your matrix, the problem you are trying to solve, and the trade-off
<td>-</td>
<td>+++</td>
</tr>
<tr class="alt">
<td>LLT</td>
<td>llt()</td>
<td>Positive definite</td>
<td>+++</td>
<td>+++</td>
<td>+</td>
</tr>
<tr>
<td>LDLT</td>
<td>ldlt()</td>
<td>Positive or negative<br/> semidefinite</td>
<td>+++</td>
<td>+</td>
<td>++</td>
</tr>
<tr class="alt">
<td>BDCSVD</td>
<td>bdcSvd()</td>
<td>None</td>
@@ -126,15 +110,36 @@ depending on your matrix, the problem you are trying to solve, and the trade-off
<td>+++</td>
</tr>
</table>
<a name="note_robust"><b>*</b></a> The <b>Robustness</b> column indicates how well the decomposition handles
ill-conditioned or rank-deficient matrices. All decompositions give excellent accuracy when their
requirements on the matrix are met and the problem is well-conditioned.
To get an overview of the true relative speed of the different decompositions, check this \link DenseDecompositionBenchmark benchmark \endlink.
All of these decompositions offer a solve() method that works as in the above example.
All of these decompositions offer a solve() method that works as in the above example.
If you know more about the properties of your matrix, you can use the above table to select the best method.
For example, a good choice for solving linear systems with a non-symmetric matrix of full rank is PartialPivLU.
If you know that your matrix is also symmetric and positive definite, the above table says that
a very good choice is the LLT or LDLT decomposition. Here's an example, also demonstrating that using a general
matrix (not a vector) as right hand side is possible:
\b Practical \b recommendations:
\li If your matrix is symmetric positive definite, use \b LLT. It is the fastest and is perfectly accurate
for this class of problems. If your matrix is only positive or negative semidefinite, use \b LDLT.
\li For a general invertible matrix, \b PartialPivLU is the best choice. It is fast (uses cache-friendly
blocking) and reliable for the vast majority of problems.
\li For least squares problems (over- or under-determined systems), \b CompleteOrthogonalDecomposition
is the recommended default. Like the SVD, it robustly computes the minimum-norm solution for
rank-deficient and under-determined problems, but at the cost of a QR decomposition rather than
an SVD. Use \b ColPivHouseholderQR if you only need least squares for full-rank overdetermined
systems and don't need the minimum-norm property.
\li \b SVD decompositions (BDCSVD, JacobiSVD) are the most robust but also the slowest. Use these when
you need singular values/vectors, not just the solution.
\li \b HouseholderQR is the fastest option for full-rank least squares problems, but it does not
reveal rank and cannot compute minimum-norm solutions for rank-deficient problems.
\li FullPivLU and FullPivHouseholderQR use complete pivoting, which is significantly slower due to
lack of blocking. In practice, they rarely provide meaningful benefits over PartialPivLU and
ColPivHouseholderQR, respectively, and are not recommended for general use. They are primarily useful
for debugging or for pedagogical purposes.
Here's an example showing the use of LLT for a symmetric positive definite system, also demonstrating
that using a general matrix (not a vector) as right hand side is possible:
<table class="example">
<tr><th>Example:</th><th>Output:</th></tr>
@@ -151,14 +156,15 @@ supports many other decompositions), see our special page on
\section TutorialLinAlgLeastsquares Least squares solving
The most general and accurate method to solve under- or over-determined linear systems
in the least squares sense, is the SVD decomposition. Eigen provides two implementations.
The recommended one is the BDCSVD class, which scales well for large problems
and automatically falls back to the JacobiSVD class for smaller problems.
For both classes, their solve() method solved the linear system in the least-squares
sense.
The recommended method to solve under- or over-determined linear systems in the least squares sense is
\b CompleteOrthogonalDecomposition. Like the SVD, it robustly computes the minimum-norm least squares
solution, correctly handling rank-deficient and under-determined problems, but it is significantly faster
since it is based on a rank-revealing QR decomposition rather than a full SVD.
Here is an example:
If you also need the singular values or vectors themselves (not just the least squares solution), use
\b BDCSVD, which scales well for large problems and automatically falls back to JacobiSVD for smaller ones.
Here is an example using the SVD:
<table class="example">
<tr><th>Example:</th><th>Output:</th></tr>
<tr>
@@ -167,11 +173,9 @@ Here is an example:
</tr>
</table>
An alternative to the SVD, which is usually faster and about as accurate, is CompleteOrthogonalDecomposition.
Again, if you know more about the problem, the table above contains methods that are potentially faster.
If your matrix is full rank, HouseHolderQR is the method of choice. If your matrix is full rank and well conditioned,
using the Cholesky decomposition (LLT) on the matrix of the normal equations can be faster still.
If you know more about the problem, faster methods are available.
If your matrix is full rank, HouseholderQR is the fastest method. If your matrix is full rank and
well conditioned, using the Cholesky decomposition (LLT) on the normal equations can be faster still.
Our page on \link LeastSquares least squares solving \endlink has more details.
@@ -267,8 +271,9 @@ singular matrix). On \ref TopicLinearAlgebraDecompositions "this table" you can
whether they are rank-revealing or not.
Rank-revealing decompositions offer at least a rank() method. They can also offer convenience methods such as isInvertible(),
and some are also providing methods to compute the kernel (null-space) and image (column-space) of the matrix, as is the
case with FullPivLU:
and some are also providing methods to compute the kernel (null-space) and image (column-space) of the matrix.
ColPivHouseholderQR, CompleteOrthogonalDecomposition, and FullPivLU all provide these methods. Here is an example using
FullPivLU:
<table class="example">
<tr><th>Example:</th><th>Output:</th></tr>

View File

@@ -0,0 +1,3 @@
MatrixXf A = MatrixXf::Random(3, 2);
VectorXf b = VectorXf::Random(3);
cout << "The solution using the COD is:\n" << A.completeOrthogonalDecomposition().solve(b) << endl;

View File

@@ -433,7 +433,7 @@ if(EIGEN_TEST_CUDA_NVC AND NOT CMAKE_CXX_COMPILER_ID MATCHES "NVHPC")
message(WARNING "EIGEN_TEST_CUDA_NVC is set, but CMAKE_CXX_COMPILER does not appear to be nvc++.")
endif()
find_package(CUDA 9.0)
find_package(CUDA 11.4)
if(CUDA_FOUND AND EIGEN_TEST_CUDA)
# Make sure to compile without the -pedantic, -Wundef, -Wnon-virtual-dtor
# and -fno-check-new flags since they trigger thousands of compilation warnings
@@ -502,6 +502,9 @@ if (EIGEN_TEST_HIP)
endif()
find_package(HIP REQUIRED)
if (HIP_FOUND AND HIP_VERSION VERSION_LESS "5.6")
message(FATAL_ERROR "Eigen requires ROCm/HIP >= 5.6, found ${HIP_VERSION}")
endif()
if (HIP_FOUND)
execute_process(COMMAND ${HIP_PATH}/bin/hipconfig --platform OUTPUT_VARIABLE HIP_PLATFORM)

View File

@@ -15,6 +15,7 @@
#define EIGEN_RUNTIME_NO_MALLOC
#include "main.h"
#include "tridiag_test_matrices.h"
#include <Eigen/SVD>
#define SVD_DEFAULT(M) BDCSVD<M>
@@ -146,148 +147,26 @@ void verify_bidiagonal_vs_matrix_svd(const Matrix<RealScalar, Dynamic, 1>& diag,
template <typename RealScalar>
void bdcsvd_bidiagonal_hard_cases() {
using std::abs;
using std::cos;
using std::pow;
using std::sin;
typedef Matrix<RealScalar, Dynamic, 1> VectorXr;
Eigen::internal::set_is_malloc_allowed(true);
const RealScalar eps = NumTraits<RealScalar>::epsilon();
// Use the shared tridiagonal test matrix generators.
// Each generator fills (diag, offdiag) which we treat as (diagonal, superdiagonal)
// of a bidiagonal matrix.
test::for_all_tridiag_test_matrices<RealScalar>(
[](const auto& diag, const auto& offdiag) { verify_bidiagonal_svd<RealScalar>(diag, offdiag); });
// Test sizes: cover n=1, very small, below/above algoSwap (16), and larger.
const int sizes[] = {1, 2, 3, 5, 10, 16, 20, 50, 100};
const int numSizes = sizeof(sizes) / sizeof(sizes[0]);
// Additional SVD-specific test: identity with cross-validation against full matrix SVD.
test::for_tridiag_sizes<RealScalar>([](auto& diag, auto& offdiag) {
test::tridiag_identity(diag, offdiag);
verify_bidiagonal_vs_matrix_svd<RealScalar>(diag, offdiag);
});
for (int si = 0; si < numSizes; ++si) {
const Index n = sizes[si];
VectorXr diag(n), superdiag(n > 1 ? n - 1 : 0);
// 1. Identity: d=[1,...,1], e=[0,...,0]
diag.setOnes();
superdiag.setZero();
verify_bidiagonal_svd<RealScalar>(diag, superdiag);
verify_bidiagonal_vs_matrix_svd<RealScalar>(diag, superdiag);
// 2. Zero: d=[0,...,0], e=[0,...,0]
diag.setZero();
superdiag.setZero();
verify_bidiagonal_svd<RealScalar>(diag, superdiag);
// 3. Scalar (only meaningful for n=1, but runs for all)
if (n == 1) {
diag(0) = RealScalar(3.14);
verify_bidiagonal_svd<RealScalar>(diag, superdiag);
}
// 4. Golub-Kahan: d=[1,...,1], e=[1,...,1]
diag.setOnes();
if (n > 1) superdiag.setOnes();
verify_bidiagonal_svd<RealScalar>(diag, superdiag);
// 5. Kahan matrix: d_i = s^(i-1), e_i = -c*s^(i-1)
// Clamp exponents so condition number stays bounded by 1/eps.
{
const RealScalar theta = RealScalar(0.3);
const RealScalar s = sin(theta);
const RealScalar c = cos(theta);
using std::log;
const RealScalar maxPower = -log(eps) / (-log(s));
for (Index i = 0; i < n; ++i) diag(i) = pow(s, numext::mini(RealScalar(i), maxPower));
for (Index i = 0; i < n - 1; ++i) superdiag(i) = -c * pow(s, numext::mini(RealScalar(i), maxPower));
verify_bidiagonal_svd<RealScalar>(diag, superdiag);
}
// 6. Geometric decay diagonal: d_i = 0.5^i, e=[0,...,0]
// Clamp so condition number stays bounded by 1/eps.
{
using std::log;
const RealScalar base = RealScalar(0.5);
const RealScalar maxPower = -log(eps) / (-log(base));
for (Index i = 0; i < n; ++i) diag(i) = pow(base, numext::mini(RealScalar(i), maxPower));
superdiag.setZero();
verify_bidiagonal_svd<RealScalar>(diag, superdiag);
}
// 7. Geometric decay superdiagonal: d=[1,...,1], e_i = 0.5^i
diag.setOnes();
for (Index i = 0; i < n - 1; ++i) superdiag(i) = pow(RealScalar(0.5), RealScalar(i));
verify_bidiagonal_svd<RealScalar>(diag, superdiag);
// 8. Clustered at 1: d_i = 1 + i*eps, e=[0,...,0]
for (Index i = 0; i < n; ++i) diag(i) = RealScalar(1) + RealScalar(i) * eps;
superdiag.setZero();
verify_bidiagonal_svd<RealScalar>(diag, superdiag);
// 9. Two clusters: half ≈ 1, half ≈ eps
for (Index i = 0; i < n; ++i) diag(i) = (i < n / 2) ? RealScalar(1) : eps;
superdiag.setZero();
verify_bidiagonal_svd<RealScalar>(diag, superdiag);
// 10. Single tiny singular value: d=[1,...,1,eps], e=[eps^2,...]
diag.setOnes();
diag(n - 1) = eps;
for (Index i = 0; i < n - 1; ++i) superdiag(i) = eps * eps;
verify_bidiagonal_svd<RealScalar>(diag, superdiag);
// 11. Graded: d_i = 10^(-i), e_i = 10^(-i)
for (Index i = 0; i < n; ++i) diag(i) = pow(RealScalar(10), -RealScalar(i));
for (Index i = 0; i < n - 1; ++i) superdiag(i) = pow(RealScalar(10), -RealScalar(i));
verify_bidiagonal_svd<RealScalar>(diag, superdiag);
// 12. Nearly diagonal: random diag, eps * random superdiag
diag = VectorXr::Random(n).cwiseAbs() + VectorXr::Constant(n, RealScalar(0.1));
for (Index i = 0; i < n - 1; ++i) superdiag(i) = eps * (RealScalar(0.5) + abs(internal::random<RealScalar>()));
verify_bidiagonal_svd<RealScalar>(diag, superdiag);
// 13. All equal: d=[c,...,c], e=[c,...,c]
diag.setConstant(RealScalar(2.5));
if (n > 1) superdiag.setConstant(RealScalar(2.5));
verify_bidiagonal_svd<RealScalar>(diag, superdiag);
// 14. Wilkinson: d_i = |n/2 - i|, e=[1,...,1]
for (Index i = 0; i < n; ++i) diag(i) = abs(RealScalar(n / 2) - RealScalar(i));
if (n > 1) superdiag.setOnes();
verify_bidiagonal_svd<RealScalar>(diag, superdiag);
// 15. Overflow/underflow: alternating big/tiny diagonal, tiny/big superdiagonal
{
const RealScalar big = (std::numeric_limits<RealScalar>::max)() / RealScalar(1000);
const RealScalar tiny = (std::numeric_limits<RealScalar>::min)() * RealScalar(1000);
for (Index i = 0; i < n; ++i) diag(i) = (i % 2 == 0) ? big : tiny;
for (Index i = 0; i < n - 1; ++i) superdiag(i) = (i % 2 == 0) ? tiny : big;
verify_bidiagonal_svd<RealScalar>(diag, superdiag);
}
// 16. Prescribed condition number: d_i = kappa^(-i/(n-1)), e_i = eps * random
if (n > 1) {
const RealScalar kappa = RealScalar(1) / eps;
for (Index i = 0; i < n; ++i) diag(i) = pow(kappa, -RealScalar(i) / RealScalar(n - 1));
for (Index i = 0; i < n - 1; ++i) superdiag(i) = eps * abs(internal::random<RealScalar>());
verify_bidiagonal_svd<RealScalar>(diag, superdiag);
}
// 17. Rank-deficient: d=[1,..,0,..,0,..,1], e=[0,...,0]
for (Index i = 0; i < n; ++i) diag(i) = (i < n / 3 || i >= 2 * n / 3) ? RealScalar(1) : RealScalar(0);
superdiag.setZero();
verify_bidiagonal_svd<RealScalar>(diag, superdiag);
// 18. Arrowhead stress: d_i = linspace(1, n), e_i = 1/(i+1)
for (Index i = 0; i < n; ++i) diag(i) = RealScalar(1) + RealScalar(i);
for (Index i = 0; i < n - 1; ++i) superdiag(i) = RealScalar(1) / RealScalar(i + 1);
verify_bidiagonal_svd<RealScalar>(diag, superdiag);
// 19. Repeated singular values: d=[1,2,3,1,2,3,...], e=[0,...,0]
for (Index i = 0; i < n; ++i) diag(i) = RealScalar((i % 3) + 1);
superdiag.setZero();
verify_bidiagonal_svd<RealScalar>(diag, superdiag);
// 20. Glued identity: d=[1,...,1], e=0 except e[n/2-1]=eps
diag.setOnes();
superdiag.setZero();
if (n > 2) superdiag(n / 2 - 1) = eps;
verify_bidiagonal_svd<RealScalar>(diag, superdiag);
// Additional SVD-specific test: scalar for n=1.
{
typedef Matrix<RealScalar, Dynamic, 1> VectorXr;
VectorXr diag(1), offdiag(0);
diag(0) = RealScalar(3.14);
verify_bidiagonal_svd<RealScalar>(diag, offdiag);
}
}

View File

@@ -10,6 +10,7 @@
#include "main.h"
#include "svd_fill.h"
#include "tridiag_test_matrices.h"
#include <limits>
#include <Eigen/Eigenvalues>
#include <Eigen/SparseCore>
@@ -25,17 +26,39 @@ void selfadjointeigensolver_essential_check(const MatrixType& m) {
SelfAdjointEigenSolver<MatrixType> eiSymm(m);
VERIFY_IS_EQUAL(eiSymm.info(), Success);
Index n = m.cols();
RealScalar scaling = m.cwiseAbs().maxCoeff();
RealScalar unitary_error_factor = RealScalar(32);
if (scaling < (std::numeric_limits<RealScalar>::min)()) {
VERIFY(eiSymm.eigenvalues().cwiseAbs().maxCoeff() <= (std::numeric_limits<RealScalar>::min)());
} else {
VERIFY_IS_APPROX((m.template selfadjointView<Lower>() * eiSymm.eigenvectors()) / scaling,
(eiSymm.eigenvectors() * eiSymm.eigenvalues().asDiagonal()) / scaling);
// Columnwise residual check: for each eigenpair (lambda_i, v_i),
// ||A*v_i - lambda_i*v_i|| / ||A||_max <= c * n * eps
// This ensures accuracy for every eigenpair, including those corresponding
// to small eigenvalues (which a Frobenius norm check would miss).
// Computed in scaled space (dividing by ||A||_max) to avoid overflow.
MatrixType scaledA = m.template selfadjointView<Lower>();
scaledA /= scaling;
MatrixType residual =
scaledA * eiSymm.eigenvectors() - eiSymm.eigenvectors() * (eiSymm.eigenvalues() / scaling).asDiagonal();
RealScalar tol = RealScalar(4) * RealScalar(numext::maxi(Index(1), n)) * NumTraits<RealScalar>::epsilon();
for (Index i = 0; i < n; ++i) {
VERIFY(residual.col(i).norm() <= tol);
}
}
VERIFY_IS_APPROX(m.template selfadjointView<Lower>().eigenvalues(), eiSymm.eigenvalues());
VERIFY(eiSymm.eigenvectors().isUnitary(test_precision<RealScalar>() * unitary_error_factor));
// Eigenvectors must be unitary. Use a tolerance proportional to n*epsilon,
// which is the expected rounding error for Householder-based orthogonal transformations.
RealScalar unitary_tol = RealScalar(4) * RealScalar(numext::maxi(Index(1), n)) * NumTraits<RealScalar>::epsilon();
// But don't go below the test_precision floor (matters for float).
unitary_tol = numext::maxi(unitary_tol, test_precision<RealScalar>());
VERIFY(eiSymm.eigenvectors().isUnitary(unitary_tol));
// Verify eigenvalues are sorted in non-decreasing order.
for (Index i = 1; i < n; ++i) {
VERIFY(eiSymm.eigenvalues()(i) >= eiSymm.eigenvalues()(i - 1));
}
if (m.cols() <= 4) {
SelfAdjointEigenSolver<MatrixType> eiDirect;
@@ -53,12 +76,20 @@ void selfadjointeigensolver_essential_check(const MatrixType& m) {
VERIFY(eiDirect.eigenvalues().cwiseAbs().maxCoeff() <= (std::numeric_limits<RealScalar>::min)());
} else {
VERIFY_IS_APPROX(eiSymm.eigenvalues() / scaling, eiDirect.eigenvalues() / scaling);
// TODO: the direct 3x3 solver can produce large backward errors (>>n*eps*||A||)
// on some matrices. Investigate and fix, then tighten this to a Frobenius norm check.
VERIFY_IS_APPROX((m.template selfadjointView<Lower>() * eiDirect.eigenvectors()) / scaling,
(eiDirect.eigenvectors() * eiDirect.eigenvalues().asDiagonal()) / scaling);
VERIFY_IS_APPROX(m.template selfadjointView<Lower>().eigenvalues() / scaling, eiDirect.eigenvalues() / scaling);
}
VERIFY(eiDirect.eigenvectors().isUnitary(test_precision<RealScalar>() * unitary_error_factor));
// Direct solver eigenvectors must also be unitary.
VERIFY(eiDirect.eigenvectors().isUnitary(unitary_tol));
// Direct solver eigenvalues must also be sorted.
for (Index i = 1; i < n; ++i) {
VERIFY(eiDirect.eigenvalues()(i) >= eiDirect.eigenvalues()(i - 1));
}
}
}
@@ -149,9 +180,15 @@ void selfadjointeigensolver(const MatrixType& m) {
VERIFY_IS_APPROX(tridiag.diagonal(), tridiag.matrixT().diagonal());
VERIFY_IS_APPROX(tridiag.subDiagonal(), tridiag.matrixT().template diagonal<-1>());
Matrix<RealScalar, Dynamic, Dynamic> T = tridiag.matrixT();
if (rows > 1 && cols > 1) {
// FIXME check that upper and lower part are 0:
// VERIFY(T.topRightCorner(rows-2, cols-2).template triangularView<Upper>().isZero());
if (rows > 2) {
// Verify that the tridiagonal matrix is actually tridiagonal (zero outside the three central diagonals).
for (Index i = 0; i < rows; ++i) {
for (Index j = 0; j < cols; ++j) {
if (numext::abs(i - j) > 1) {
VERIFY(numext::is_exactly_zero(T(i, j)));
}
}
}
}
VERIFY_IS_APPROX(tridiag.diagonal(), T.diagonal());
VERIFY_IS_APPROX(tridiag.subDiagonal(), T.template diagonal<1>());
@@ -170,7 +207,7 @@ void selfadjointeigensolver(const MatrixType& m) {
eiSymmTridiag.eigenvectors().real().transpose());
}
// Test matrix expponential from eigendecomposition.
// Test matrix exponential from eigendecomposition.
// First scale to avoid overflow.
symmB = symmB / symmB.norm();
eiSymm.compute(symmB);
@@ -202,6 +239,451 @@ void selfadjointeigensolver(const MatrixType& m) {
}
}
// Test matrices with exact eigenvalue multiplicities.
template <typename MatrixType>
void selfadjointeigensolver_repeated_eigenvalues(const MatrixType& m) {
typedef typename MatrixType::Scalar Scalar;
typedef typename NumTraits<Scalar>::Real RealScalar;
Index n = m.rows();
if (n < 2) return;
// Create a random unitary matrix via QR.
MatrixType q = MatrixType::Random(n, n);
HouseholderQR<MatrixType> qr(q);
q = qr.householderQ();
// All eigenvalues equal (scalar multiple of identity).
{
RealScalar lambda = internal::random<RealScalar>(-10, 10);
MatrixType A = lambda * MatrixType::Identity(n, n);
selfadjointeigensolver_essential_check(A);
}
// Eigenvalue of multiplicity n-1 (one distinct, rest equal).
{
Matrix<RealScalar, Dynamic, 1> d = Matrix<RealScalar, Dynamic, 1>::Constant(n, RealScalar(3));
d(0) = RealScalar(-2);
MatrixType A = (q * d.template cast<Scalar>().asDiagonal() * q.adjoint()).eval();
A.template triangularView<StrictlyUpper>().setZero();
selfadjointeigensolver_essential_check(A);
}
// Two clusters: first half one value, second half another.
if (n >= 4) {
Matrix<RealScalar, Dynamic, 1> d(n);
for (Index i = 0; i < n / 2; ++i) d(i) = RealScalar(1);
for (Index i = n / 2; i < n; ++i) d(i) = RealScalar(5);
MatrixType A = (q * d.template cast<Scalar>().asDiagonal() * q.adjoint()).eval();
A.template triangularView<StrictlyUpper>().setZero();
selfadjointeigensolver_essential_check(A);
}
// Nearly repeated eigenvalues: separated by O(epsilon).
{
Matrix<RealScalar, Dynamic, 1> d(n);
for (Index i = 0; i < n; ++i) {
d(i) = RealScalar(1) + RealScalar(i) * NumTraits<RealScalar>::epsilon() * RealScalar(10);
}
MatrixType A = (q * d.template cast<Scalar>().asDiagonal() * q.adjoint()).eval();
A.template triangularView<StrictlyUpper>().setZero();
selfadjointeigensolver_essential_check(A);
}
}
// Test matrices with extreme condition numbers and eigenvalue ranges.
template <typename MatrixType>
void selfadjointeigensolver_extreme_eigenvalues(const MatrixType& m) {
using std::pow;
typedef typename MatrixType::Scalar Scalar;
typedef typename NumTraits<Scalar>::Real RealScalar;
Index n = m.rows();
if (n < 2) return;
// Create a random unitary matrix.
MatrixType q = MatrixType::Random(n, n);
HouseholderQR<MatrixType> qr(q);
q = qr.householderQ();
// Eigenvalues spanning many orders of magnitude (high condition number).
{
RealScalar maxExp = RealScalar(std::numeric_limits<RealScalar>::max_exponent10) / RealScalar(4);
Matrix<RealScalar, Dynamic, 1> d(n);
for (Index i = 0; i < n; ++i) {
RealScalar exponent = -maxExp + RealScalar(2) * maxExp * RealScalar(i) / RealScalar(n - 1);
d(i) = pow(RealScalar(10), exponent);
}
MatrixType A = (q * d.template cast<Scalar>().asDiagonal() * q.adjoint()).eval();
A.template triangularView<StrictlyUpper>().setZero();
SelfAdjointEigenSolver<MatrixType> eig(A);
VERIFY_IS_EQUAL(eig.info(), Success);
// For ill-conditioned matrices we can only check the relative residual.
// ||A*V - V*D|| / ||A|| should be O(n * epsilon).
RealScalar Anorm = A.template selfadjointView<Lower>().operatorNorm();
if (Anorm > (std::numeric_limits<RealScalar>::min)()) {
MatrixType residual = A.template selfadjointView<Lower>() * eig.eigenvectors() -
eig.eigenvectors() * eig.eigenvalues().asDiagonal();
RealScalar rel_err = residual.norm() / Anorm;
RealScalar tol = RealScalar(4) * RealScalar(n) * NumTraits<RealScalar>::epsilon();
VERIFY(rel_err <= tol);
}
// Eigenvalues must still be sorted.
for (Index i = 1; i < n; ++i) {
VERIFY(eig.eigenvalues()(i) >= eig.eigenvalues()(i - 1));
}
}
// Very tiny eigenvalues (near underflow).
{
RealScalar tiny = (std::numeric_limits<RealScalar>::min)() * RealScalar(100);
Matrix<RealScalar, Dynamic, 1> d(n);
for (Index i = 0; i < n; ++i) {
d(i) = tiny * (RealScalar(1) + RealScalar(i));
}
MatrixType A = (q * d.template cast<Scalar>().asDiagonal() * q.adjoint()).eval();
A.template triangularView<StrictlyUpper>().setZero();
selfadjointeigensolver_essential_check(A);
}
// Very large eigenvalues (near overflow).
{
RealScalar huge = (std::numeric_limits<RealScalar>::max)() / (RealScalar(n) * RealScalar(100));
Matrix<RealScalar, Dynamic, 1> d(n);
for (Index i = 0; i < n; ++i) {
d(i) = huge * (RealScalar(1) + RealScalar(i) * RealScalar(0.01));
}
MatrixType A = (q * d.template cast<Scalar>().asDiagonal() * q.adjoint()).eval();
A.template triangularView<StrictlyUpper>().setZero();
selfadjointeigensolver_essential_check(A);
}
// Mix of positive and negative eigenvalues.
{
Matrix<RealScalar, Dynamic, 1> d(n);
for (Index i = 0; i < n; ++i) {
d(i) = (i % 2 == 0) ? RealScalar(i + 1) : RealScalar(-(i + 1));
}
MatrixType A = (q * d.template cast<Scalar>().asDiagonal() * q.adjoint()).eval();
A.template triangularView<StrictlyUpper>().setZero();
selfadjointeigensolver_essential_check(A);
}
// One zero eigenvalue among non-zero ones (rank-deficient).
{
Matrix<RealScalar, Dynamic, 1> d = Matrix<RealScalar, Dynamic, 1>::LinSpaced(n, RealScalar(0), RealScalar(n - 1));
MatrixType A = (q * d.template cast<Scalar>().asDiagonal() * q.adjoint()).eval();
A.template triangularView<StrictlyUpper>().setZero();
selfadjointeigensolver_essential_check(A);
}
}
// Test computeFromTridiagonal with scaled inputs (regression for missing scaling).
template <typename MatrixType>
void selfadjointeigensolver_tridiagonal_scaled(const MatrixType& m) {
typedef typename MatrixType::Scalar Scalar;
typedef typename NumTraits<Scalar>::Real RealScalar;
Index n = m.rows();
if (n < 2) return;
// Create a tridiagonal matrix with large entries.
typedef Matrix<RealScalar, Dynamic, 1> RealVectorType;
RealVectorType diag(n), subdiag(n - 1);
// Case 1: Large values.
RealScalar scale = (std::numeric_limits<RealScalar>::max)() / (RealScalar(n) * RealScalar(100));
for (Index i = 0; i < n; ++i) diag(i) = scale * RealScalar(i + 1);
for (Index i = 0; i < n - 1; ++i) subdiag(i) = scale * RealScalar(0.5);
SelfAdjointEigenSolver<MatrixType> eig1;
eig1.computeFromTridiagonal(diag, subdiag, ComputeEigenvectors);
VERIFY_IS_EQUAL(eig1.info(), Success);
// Reconstruct tridiagonal and check residual.
Matrix<RealScalar, Dynamic, Dynamic> T = Matrix<RealScalar, Dynamic, Dynamic>::Zero(n, n);
T.diagonal() = diag;
T.template diagonal<1>() = subdiag;
T.template diagonal<-1>() = subdiag;
VERIFY_IS_APPROX(
T, eig1.eigenvectors().real() * eig1.eigenvalues().asDiagonal() * eig1.eigenvectors().real().transpose());
// Case 2: Tiny values.
scale = (std::numeric_limits<RealScalar>::min)() * RealScalar(100);
for (Index i = 0; i < n; ++i) diag(i) = scale * RealScalar(i + 1);
for (Index i = 0; i < n - 1; ++i) subdiag(i) = scale * RealScalar(0.5);
SelfAdjointEigenSolver<MatrixType> eig2;
eig2.computeFromTridiagonal(diag, subdiag, ComputeEigenvectors);
VERIFY_IS_EQUAL(eig2.info(), Success);
// Eigenvalues-only mode should produce the same eigenvalues.
SelfAdjointEigenSolver<MatrixType> eig2v;
eig2v.computeFromTridiagonal(diag, subdiag, EigenvaluesOnly);
VERIFY_IS_EQUAL(eig2v.info(), Success);
VERIFY_IS_APPROX(eig2.eigenvalues(), eig2v.eigenvalues());
}
// Test computeFromTridiagonal with wide dynamic range across decoupled blocks.
// This exercises the per-block scaling in computeFromTridiagonal_impl: a zero on the
// subdiagonal decouples the matrix into blocks with vastly different scales. Global
// scaling would underflow the small block; per-block scaling handles both correctly.
template <typename RealScalar>
void selfadjointeigensolver_tridiagonal_wide_range() {
using std::sqrt;
typedef Matrix<RealScalar, Dynamic, Dynamic> MatrixType;
typedef Matrix<RealScalar, Dynamic, 1> VectorType;
// Block 1: entries near overflow threshold.
// Block 2: entries near 1.
// Separated by a zero subdiagonal entry.
const RealScalar big = sqrt(NumTraits<RealScalar>::highest()) / RealScalar(10);
const Index n = 6;
VectorType diag(n), subdiag(n - 1);
// First block: [0..2], large scale.
diag(0) = big;
diag(1) = big * RealScalar(1.1);
diag(2) = big * RealScalar(0.9);
subdiag(0) = big * RealScalar(0.01);
subdiag(1) = big * RealScalar(0.02);
// Zero subdiagonal decouples the two blocks.
subdiag(2) = RealScalar(0);
// Second block: [3..5], O(1) scale.
diag(3) = RealScalar(1);
diag(4) = RealScalar(2);
diag(5) = RealScalar(3);
subdiag(3) = RealScalar(0.5);
subdiag(4) = RealScalar(0.3);
// Build the full tridiagonal matrix for residual checking.
MatrixType T = MatrixType::Zero(n, n);
T.diagonal() = diag;
T.template diagonal<1>() = subdiag;
T.template diagonal<-1>() = subdiag;
SelfAdjointEigenSolver<MatrixType> eig;
eig.computeFromTridiagonal(diag, subdiag, ComputeEigenvectors);
VERIFY_IS_EQUAL(eig.info(), Success);
// Eigenvalues must be sorted.
for (Index i = 1; i < n; ++i) {
VERIFY(eig.eigenvalues()(i) >= eig.eigenvalues()(i - 1));
}
// Eigenvectors must be orthonormal.
RealScalar unitary_tol = RealScalar(4) * RealScalar(n) * NumTraits<RealScalar>::epsilon();
VERIFY(eig.eigenvectors().isUnitary(unitary_tol));
// Full residual check in scaled coordinates.
RealScalar Tnorm = T.cwiseAbs().maxCoeff();
MatrixType Tscaled = T / Tnorm;
MatrixType residual = Tscaled * eig.eigenvectors() - eig.eigenvectors() * (eig.eigenvalues() / Tnorm).asDiagonal();
RealScalar rel_err = residual.norm() / Tscaled.norm();
VERIFY(rel_err <= RealScalar(8) * RealScalar(n) * NumTraits<RealScalar>::epsilon());
// The small eigenvalues (~1,2,3) must be accurate, not lost to underflow.
// With global scaling to [-1,1], dividing by 'big' would underflow these to zero.
// Verify the small eigenvalues are within O(eps) of their true values.
// The small block is exactly [[1, 0.5, 0], [0.5, 2, 0.3], [0, 0.3, 3]].
MatrixType T_small(3, 3);
T_small << RealScalar(1), RealScalar(0.5), RealScalar(0), RealScalar(0.5), RealScalar(2), RealScalar(0.3),
RealScalar(0), RealScalar(0.3), RealScalar(3);
SelfAdjointEigenSolver<MatrixType> eig_small(T_small);
VectorType small_evals = eig_small.eigenvalues();
// Find the 3 smallest eigenvalues from the combined solver (they should be sorted first).
VectorType combined_small = eig.eigenvalues().head(3);
VERIFY_IS_APPROX(combined_small, small_evals);
// Eigenvalues-only mode must agree.
SelfAdjointEigenSolver<MatrixType> eig_vals;
eig_vals.computeFromTridiagonal(diag, subdiag, EigenvaluesOnly);
VERIFY_IS_EQUAL(eig_vals.info(), Success);
VERIFY_IS_APPROX(eig.eigenvalues() / Tnorm, eig_vals.eigenvalues() / Tnorm);
}
// Test computeFromTridiagonal with structured hard-case matrices from the literature.
template <typename RealScalar>
void selfadjointeigensolver_structured_tridiagonal() {
typedef Matrix<RealScalar, Dynamic, Dynamic> MatrixType;
test::for_all_symmetric_tridiag_test_matrices<RealScalar>([](const auto& diag, const auto& offdiag) {
Index n = diag.size();
// Build the full symmetric tridiagonal matrix for residual checking.
MatrixType T = MatrixType::Zero(n, n);
T.diagonal() = diag;
if (n > 1) {
T.template diagonal<1>() = offdiag;
T.template diagonal<-1>() = offdiag;
}
RealScalar Tnorm = T.cwiseAbs().maxCoeff();
// Test with eigenvectors.
SelfAdjointEigenSolver<MatrixType> eig;
eig.computeFromTridiagonal(diag, offdiag, ComputeEigenvectors);
VERIFY_IS_EQUAL(eig.info(), Success);
// Eigenvalues must be sorted.
for (Index i = 1; i < n; ++i) {
VERIFY(eig.eigenvalues()(i) >= eig.eigenvalues()(i - 1));
}
// Eigenvectors must be orthonormal.
RealScalar unitary_tol =
numext::maxi(RealScalar(4) * RealScalar(n) * NumTraits<RealScalar>::epsilon(), test_precision<RealScalar>());
VERIFY(eig.eigenvectors().isUnitary(unitary_tol));
// Residual check: ||T*V - V*D||_F / ||T||_max should be O(n*eps).
// Scale T to avoid overflow in the matrix product when entries span extreme ranges.
if (Tnorm > (std::numeric_limits<RealScalar>::min)()) {
MatrixType Tscaled = T / Tnorm;
MatrixType residual =
Tscaled * eig.eigenvectors() - eig.eigenvectors() * (eig.eigenvalues() / Tnorm).asDiagonal();
RealScalar rel_err = residual.norm() / Tscaled.norm();
VERIFY(rel_err <= RealScalar(8) * RealScalar(n) * NumTraits<RealScalar>::epsilon());
}
// Eigenvalues-only mode must produce the same eigenvalues.
SelfAdjointEigenSolver<MatrixType> eig_vals;
eig_vals.computeFromTridiagonal(diag, offdiag, EigenvaluesOnly);
VERIFY_IS_EQUAL(eig_vals.info(), Success);
if (Tnorm > (std::numeric_limits<RealScalar>::min)()) {
VERIFY_IS_APPROX(eig.eigenvalues() / Tnorm, eig_vals.eigenvalues() / Tnorm);
}
});
}
// Test with diagonal matrices (tridiagonalization is trivial).
template <typename MatrixType>
void selfadjointeigensolver_diagonal(const MatrixType& m) {
typedef typename MatrixType::Scalar Scalar;
typedef typename NumTraits<Scalar>::Real RealScalar;
Index n = m.rows();
// Random diagonal matrix.
MatrixType diag = MatrixType::Zero(n, n);
for (Index i = 0; i < n; ++i) {
diag(i, i) = internal::random<RealScalar>(-100, 100);
}
selfadjointeigensolver_essential_check(diag);
// The eigenvalues should be the diagonal entries, sorted.
SelfAdjointEigenSolver<MatrixType> eig(diag);
VERIFY_IS_EQUAL(eig.info(), Success);
Matrix<RealScalar, Dynamic, 1> expected_evals(n);
for (Index i = 0; i < n; ++i) expected_evals(i) = numext::real(diag(i, i));
std::sort(expected_evals.data(), expected_evals.data() + n);
VERIFY_IS_APPROX(eig.eigenvalues(), expected_evals);
}
// Test operatorInverseSqrt more thoroughly.
template <typename MatrixType>
void selfadjointeigensolver_inverse_sqrt(const MatrixType& m) {
Index n = m.rows();
if (n < 1) return;
// Create a positive-definite matrix.
MatrixType a = MatrixType::Random(n, n);
MatrixType spd = a.adjoint() * a + MatrixType::Identity(n, n);
spd.template triangularView<StrictlyUpper>().setZero();
SelfAdjointEigenSolver<MatrixType> eig(spd);
VERIFY_IS_EQUAL(eig.info(), Success);
MatrixType sqrtA = eig.operatorSqrt();
MatrixType invSqrtA = eig.operatorInverseSqrt();
// sqrtA * invSqrtA should be identity.
VERIFY_IS_APPROX(sqrtA * invSqrtA, MatrixType::Identity(n, n));
// invSqrtA * A * invSqrtA should be identity.
VERIFY_IS_APPROX(invSqrtA * spd.template selfadjointView<Lower>() * invSqrtA, MatrixType::Identity(n, n));
// invSqrtA should be symmetric/selfadjoint.
VERIFY_IS_APPROX(invSqrtA, invSqrtA.adjoint());
}
// Test that RowMajor matrices work correctly with computeDirect.
template <int>
void selfadjointeigensolver_rowmajor() {
typedef Matrix<double, 3, 3, RowMajor> RowMajorMatrix3d;
typedef Matrix<double, 2, 2, RowMajor> RowMajorMatrix2d;
typedef Matrix<float, 3, 3, RowMajor> RowMajorMatrix3f;
typedef Matrix<float, 2, 2, RowMajor> RowMajorMatrix2f;
// 3x3 RowMajor double
{
RowMajorMatrix3d a = RowMajorMatrix3d::Random();
RowMajorMatrix3d symmA = a.transpose() * a;
SelfAdjointEigenSolver<RowMajorMatrix3d> eig;
eig.computeDirect(symmA);
VERIFY_IS_EQUAL(eig.info(), Success);
// Compare with iterative solver.
SelfAdjointEigenSolver<RowMajorMatrix3d> eigRef(symmA);
VERIFY_IS_APPROX(eigRef.eigenvalues(), eig.eigenvalues());
}
// 2x2 RowMajor double
{
RowMajorMatrix2d a = RowMajorMatrix2d::Random();
RowMajorMatrix2d symmA = a.transpose() * a;
SelfAdjointEigenSolver<RowMajorMatrix2d> eig;
eig.computeDirect(symmA);
VERIFY_IS_EQUAL(eig.info(), Success);
SelfAdjointEigenSolver<RowMajorMatrix2d> eigRef(symmA);
VERIFY_IS_APPROX(eigRef.eigenvalues(), eig.eigenvalues());
}
// 3x3 RowMajor float
{
RowMajorMatrix3f a = RowMajorMatrix3f::Random();
RowMajorMatrix3f symmA = a.transpose() * a;
SelfAdjointEigenSolver<RowMajorMatrix3f> eig;
eig.computeDirect(symmA);
VERIFY_IS_EQUAL(eig.info(), Success);
SelfAdjointEigenSolver<RowMajorMatrix3f> eigRef(symmA);
VERIFY_IS_APPROX(eigRef.eigenvalues(), eig.eigenvalues());
}
// 2x2 RowMajor float
{
RowMajorMatrix2f a = RowMajorMatrix2f::Random();
RowMajorMatrix2f symmA = a.transpose() * a;
SelfAdjointEigenSolver<RowMajorMatrix2f> eig;
eig.computeDirect(symmA);
VERIFY_IS_EQUAL(eig.info(), Success);
SelfAdjointEigenSolver<RowMajorMatrix2f> eigRef(symmA);
VERIFY_IS_APPROX(eigRef.eigenvalues(), eig.eigenvalues());
}
// Dynamic RowMajor with iterative solver
{
typedef Matrix<double, Dynamic, Dynamic, RowMajor> RowMajorMatrixXd;
int s = internal::random<int>(2, 20);
RowMajorMatrixXd a = RowMajorMatrixXd::Random(s, s);
RowMajorMatrixXd symmA = a.transpose() * a;
SelfAdjointEigenSolver<RowMajorMatrixXd> eig(symmA);
VERIFY_IS_EQUAL(eig.info(), Success);
double scaling = symmA.cwiseAbs().maxCoeff();
if (scaling > (std::numeric_limits<double>::min)()) {
VERIFY_IS_APPROX((symmA.template selfadjointView<Lower>() * eig.eigenvectors()) / scaling,
(eig.eigenvectors() * eig.eigenvalues().asDiagonal()) / scaling);
}
}
}
// Test matrix with Inf entries returns NoConvergence (similar to NaN test).
template <int>
void selfadjointeigensolver_inf() {
Matrix3d m;
m.setRandom();
m = m * m.transpose();
m(1, 1) = std::numeric_limits<double>::infinity();
SelfAdjointEigenSolver<Matrix3d> eig(m);
VERIFY_IS_EQUAL(eig.info(), NoConvergence);
}
template <int>
void bug_854() {
Matrix3d m;
@@ -227,11 +709,160 @@ void bug_1225() {
VERIFY_IS_APPROX(eig1.eigenvalues(), eig2.eigenvalues());
}
// Verify that non-finite inputs are detected for all sizes, including 1x1.
template <int>
void selfadjointeigensolver_nonfinite() {
const double inf = std::numeric_limits<double>::infinity();
const double nan = std::numeric_limits<double>::quiet_NaN();
// 1x1 Inf.
{
Matrix<double, 1, 1> m;
m << inf;
SelfAdjointEigenSolver<Matrix<double, 1, 1>> eig(m);
VERIFY_IS_EQUAL(eig.info(), NoConvergence);
}
// 1x1 NaN.
{
Matrix<double, 1, 1> m;
m << nan;
SelfAdjointEigenSolver<Matrix<double, 1, 1>> eig(m);
VERIFY_IS_EQUAL(eig.info(), NoConvergence);
}
// 1x1 -Inf.
{
Matrix<double, 1, 1> m;
m << -inf;
SelfAdjointEigenSolver<Matrix<double, 1, 1>> eig(m);
VERIFY_IS_EQUAL(eig.info(), NoConvergence);
}
// 3x3 with Inf.
{
Matrix3d m = Matrix3d::Identity();
m(1, 1) = inf;
SelfAdjointEigenSolver<Matrix3d> eig(m);
VERIFY_IS_EQUAL(eig.info(), NoConvergence);
}
// 3x3 with NaN.
{
Matrix3d m = Matrix3d::Identity();
m(0, 1) = m(1, 0) = nan;
SelfAdjointEigenSolver<Matrix3d> eig(m);
VERIFY_IS_EQUAL(eig.info(), NoConvergence);
}
// Dynamic size with Inf.
{
MatrixXd m = MatrixXd::Identity(5, 5);
m(3, 3) = inf;
SelfAdjointEigenSolver<MatrixXd> eig(m);
VERIFY_IS_EQUAL(eig.info(), NoConvergence);
}
}
template <int>
void bug_1204() {
SparseMatrix<double> A(2, 2);
A.setIdentity();
SelfAdjointEigenSolver<Eigen::SparseMatrix<double> > eig(A);
SelfAdjointEigenSolver<Eigen::SparseMatrix<double>> eig(A);
}
template <int>
void selfadjointeigensolver_tridiagonal_zerosized() {
SelfAdjointEigenSolver<MatrixXd> eig;
VectorXd diag(0), subdiag(0);
eig.computeFromTridiagonal(diag, subdiag, EigenvaluesOnly);
VERIFY_IS_EQUAL(eig.info(), Success);
VERIFY_IS_EQUAL(eig.eigenvalues().size(), 0);
VERIFY_RAISES_ASSERT(eig.eigenvectors());
eig.computeFromTridiagonal(diag, subdiag, ComputeEigenvectors);
VERIFY_IS_EQUAL(eig.info(), Success);
VERIFY_IS_EQUAL(eig.eigenvalues().size(), 0);
VERIFY_IS_EQUAL(eig.eigenvectors().rows(), 0);
VERIFY_IS_EQUAL(eig.eigenvectors().cols(), 0);
}
// Specific 3x3 test cases that stress the direct solver.
template <int>
void direct_3x3_stress() {
// Near-planar point cloud covariance: two large eigenvalues, one near-zero.
{
Matrix3d m;
m << 100, 50, 0.001, 50, 100, 0.002, 0.001, 0.002, 1e-10;
selfadjointeigensolver_essential_check(m);
}
// All equal diagonal entries (triple eigenvalue).
{
Matrix3d m = Matrix3d::Identity() * 7.0;
selfadjointeigensolver_essential_check(m);
}
// Two exactly equal eigenvalues (from explicit construction).
{
Matrix3d q;
q << 1, 0, 0, 0, 1.0 / std::sqrt(2.0), 1.0 / std::sqrt(2.0), 0, -1.0 / std::sqrt(2.0), 1.0 / std::sqrt(2.0);
Vector3d d(1.0, 5.0, 5.0);
Matrix3d m = q * d.asDiagonal() * q.transpose();
selfadjointeigensolver_essential_check(m);
}
// Large off-diagonal relative to diagonal.
{
Matrix3d m;
m << 1, 1000, 1000, 1000, 1, 1000, 1000, 1000, 1;
selfadjointeigensolver_essential_check(m);
}
// Nearly singular: one eigenvalue much smaller than others.
{
Matrix3d m;
m << 1, 0.5, 0.3, 0.5, 1, 0.4, 0.3, 0.4, 1;
m *= 1e15;
Matrix3d perturbation = Matrix3d::Zero();
perturbation(0, 0) = 1e-15;
m += perturbation;
selfadjointeigensolver_essential_check(m);
}
}
// Specific 2x2 test cases that stress the direct solver.
template <int>
void direct_2x2_stress() {
// Equal eigenvalues.
{
Matrix2d m = Matrix2d::Identity() * 42.0;
selfadjointeigensolver_essential_check(m);
}
// Very small off-diagonal.
{
Matrix2d m;
m << 1.0, 1e-15, 1e-15, 1.0;
selfadjointeigensolver_essential_check(m);
}
// Huge ratio between diagonal entries.
{
Matrix2d m;
m << 1e100, 0, 0, 1e-100;
selfadjointeigensolver_essential_check(m);
}
// Anti-diagonal dominant.
{
Matrix2d m;
m << 0, 1e10, 1e10, 0;
selfadjointeigensolver_essential_check(m);
}
// Negative entries.
{
Matrix2d m;
m << -5.0, 3.0, 3.0, -5.0;
selfadjointeigensolver_essential_check(m);
}
}
EIGEN_DECLARE_TEST(eigensolver_selfadjoint) {
@@ -266,12 +897,62 @@ EIGEN_DECLARE_TEST(eigensolver_selfadjoint) {
CALL_SUBTEST_5(selfadjointeigensolver(MatrixXcd(2, 2)));
CALL_SUBTEST_6(selfadjointeigensolver(Matrix<double, 1, 1>()));
CALL_SUBTEST_7(selfadjointeigensolver(Matrix<double, 2, 2>()));
// repeated eigenvalues
CALL_SUBTEST_17(selfadjointeigensolver_repeated_eigenvalues(Matrix3d()));
CALL_SUBTEST_15(selfadjointeigensolver_repeated_eigenvalues(Matrix2d()));
CALL_SUBTEST_2(selfadjointeigensolver_repeated_eigenvalues(Matrix4d()));
CALL_SUBTEST_4(selfadjointeigensolver_repeated_eigenvalues(MatrixXd(s, s)));
CALL_SUBTEST_13(selfadjointeigensolver_repeated_eigenvalues(Matrix3f()));
CALL_SUBTEST_12(selfadjointeigensolver_repeated_eigenvalues(Matrix2f()));
CALL_SUBTEST_18(selfadjointeigensolver_repeated_eigenvalues(Matrix3cd()));
// extreme eigenvalues (near overflow/underflow, high condition number)
CALL_SUBTEST_17(selfadjointeigensolver_extreme_eigenvalues(Matrix3d()));
CALL_SUBTEST_2(selfadjointeigensolver_extreme_eigenvalues(Matrix4d()));
CALL_SUBTEST_4(selfadjointeigensolver_extreme_eigenvalues(MatrixXd(s, s)));
CALL_SUBTEST_13(selfadjointeigensolver_extreme_eigenvalues(Matrix3f()));
CALL_SUBTEST_3(selfadjointeigensolver_extreme_eigenvalues(MatrixXf(s, s)));
// computeFromTridiagonal with scaled inputs
CALL_SUBTEST_4(selfadjointeigensolver_tridiagonal_scaled(MatrixXd(s, s)));
CALL_SUBTEST_3(selfadjointeigensolver_tridiagonal_scaled(MatrixXf(s, s)));
// structured tridiagonal hard cases from the literature
CALL_SUBTEST_4(selfadjointeigensolver_structured_tridiagonal<double>());
CALL_SUBTEST_3(selfadjointeigensolver_structured_tridiagonal<float>());
// wide dynamic range tridiagonal (per-block scaling regression)
CALL_SUBTEST_4(selfadjointeigensolver_tridiagonal_wide_range<double>());
CALL_SUBTEST_3(selfadjointeigensolver_tridiagonal_wide_range<float>());
// diagonal matrices
CALL_SUBTEST_17(selfadjointeigensolver_diagonal(Matrix3d()));
CALL_SUBTEST_4(selfadjointeigensolver_diagonal(MatrixXd(s, s)));
// operatorInverseSqrt
CALL_SUBTEST_17(selfadjointeigensolver_inverse_sqrt(Matrix3d()));
CALL_SUBTEST_2(selfadjointeigensolver_inverse_sqrt(Matrix4d()));
CALL_SUBTEST_4(selfadjointeigensolver_inverse_sqrt(MatrixXd(s, s)));
CALL_SUBTEST_13(selfadjointeigensolver_inverse_sqrt(Matrix3f()));
// RowMajor
CALL_SUBTEST_19(selfadjointeigensolver_rowmajor<0>());
}
CALL_SUBTEST_17(bug_854<0>());
CALL_SUBTEST_17(bug_1014<0>());
CALL_SUBTEST_17(bug_1204<0>());
CALL_SUBTEST_17(bug_1225<0>());
CALL_SUBTEST_17(selfadjointeigensolver_nonfinite<0>());
CALL_SUBTEST_8(selfadjointeigensolver_tridiagonal_zerosized<0>());
// Stress tests for direct 3x3 and 2x2 solvers.
CALL_SUBTEST_17(direct_3x3_stress<0>());
CALL_SUBTEST_15(direct_2x2_stress<0>());
// Test Inf input handling.
CALL_SUBTEST_17(selfadjointeigensolver_inf<0>());
// Test problem size constructors
s = internal::random<int>(1, EIGEN_TEST_MAX_SIZE / 4);

View File

@@ -7,12 +7,6 @@
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// workaround issue between gcc >= 4.7 and cuda 5.5
#if (defined __GNUC__) && (__GNUC__ > 4 || __GNUC_MINOR__ >= 7)
#undef _GLIBCXX_ATOMIC_BUILTINS
#undef _GLIBCXX_USE_INT128
#endif
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int

View File

@@ -6,10 +6,8 @@
// Allow gpu** macros for generic tests.
#include <Eigen/src/Core/util/GpuHipCudaDefines.inc>
// std::tuple cannot be used on device, and there is a bug in cuda < 9.2 that
// doesn't allow std::tuple to compile for host code either. In these cases,
// use our custom implementation.
#if defined(EIGEN_GPU_COMPILE_PHASE) || (defined(EIGEN_CUDACC) && EIGEN_CUDA_SDK_VER < 92000)
// std::tuple cannot be used on device, so use our custom implementation there.
#if defined(EIGEN_GPU_COMPILE_PHASE)
#define EIGEN_USE_CUSTOM_TUPLE 1
#else
#define EIGEN_USE_CUSTOM_TUPLE 0
@@ -42,6 +40,12 @@ using tuple_impl::tuple;
#undef EIGEN_USE_CUSTOM_TUPLE
} // namespace test_detail
template <typename T>
using decay_t = typename std::decay<T>::type;
template <typename Func, typename... Args>
using kernel_result_t = decltype(std::declval<Func>()(std::declval<Args>()...));
template <size_t N, size_t Idx, typename OutputIndexSequence, typename... Ts>
struct extract_output_indices_helper;
@@ -90,14 +94,15 @@ struct void_helper {
// Non-void return value.
template <typename Func, typename... Args>
static EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC auto call(Func&& func, Args&&... args)
-> std::enable_if_t<!std::is_same<decltype(func(args...)), void>::value, decltype(func(args...))> {
-> std::enable_if_t<!std::is_same<kernel_result_t<Func&&, Args&&...>, void>::value,
kernel_result_t<Func&&, Args&&...>> {
return func(std::forward<Args>(args)...);
}
// Void return value.
template <typename Func, typename... Args>
static EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC auto call(Func&& func, Args&&... args)
-> std::enable_if_t<std::is_same<decltype(func(args...)), void>::value, Void> {
-> std::enable_if_t<std::is_same<kernel_result_t<Func&&, Args&&...>, void>::value, Void> {
func(std::forward<Args>(args)...);
return Void{};
}
@@ -135,18 +140,18 @@ EIGEN_DEVICE_FUNC void run_serialized(std::index_sequence<Indices...>, std::inde
const uint8_t* read_end = buffer + capacity;
read_ptr = Eigen::deserialize(read_ptr, read_end, input_size);
// Create value-type instances to populate.
auto args = make_tuple(typename std::decay<Args>::type{}...);
auto args = make_tuple(decay_t<Args>{}...);
EIGEN_UNUSED_VARIABLE(args); // Avoid NVCC compile warning.
// NVCC 9.1 requires us to spell out the template parameters explicitly.
read_ptr = Eigen::deserialize(read_ptr, read_end, get<Indices, typename std::decay<Args>::type...>(args)...);
read_ptr = Eigen::deserialize(read_ptr, read_end, get<Indices, decay_t<Args>...>(args)...);
// Call function, with void->Void conversion so we are guaranteed a complete
// output type.
auto result = void_helper::call(kernel, get<Indices, typename std::decay<Args>::type...>(args)...);
auto result = void_helper::call(kernel, get<Indices, decay_t<Args>...>(args)...);
// Determine required output size.
size_t output_size = Eigen::serialize_size(capacity);
output_size += Eigen::serialize_size(get<OutputIndices, typename std::decay<Args>::type...>(args)...);
output_size += Eigen::serialize_size(get<OutputIndices, decay_t<Args>...>(args)...);
output_size += Eigen::serialize_size(result);
// Always serialize required buffer size.
@@ -157,7 +162,7 @@ EIGEN_DEVICE_FUNC void run_serialized(std::index_sequence<Indices...>, std::inde
// Serialize outputs if they fit in the buffer.
if (output_size <= capacity) {
// Collect outputs and result.
write_ptr = Eigen::serialize(write_ptr, write_end, get<OutputIndices, typename std::decay<Args>::type...>(args)...);
write_ptr = Eigen::serialize(write_ptr, write_end, get<OutputIndices, decay_t<Args>...>(args)...);
write_ptr = Eigen::serialize(write_ptr, write_end, result);
}
}
@@ -282,7 +287,7 @@ auto run_serialized_on_gpu(size_t buffer_capacity_hint, std::index_sequence<Indi
* \return kernel(args...).
*/
template <typename Kernel, typename... Args>
auto run_on_cpu(Kernel kernel, Args&&... args) -> decltype(kernel(args...)) {
auto run_on_cpu(Kernel kernel, Args&&... args) -> internal::kernel_result_t<Kernel, Args&&...> {
return kernel(std::forward<Args>(args)...);
}
@@ -301,7 +306,7 @@ auto run_on_cpu(Kernel kernel, Args&&... args) -> decltype(kernel(args...)) {
* \return kernel(args...).
*/
template <typename Kernel, typename... Args>
auto run_on_gpu(Kernel kernel, Args&&... args) -> decltype(kernel(args...)) {
auto run_on_gpu(Kernel kernel, Args&&... args) -> internal::kernel_result_t<Kernel, Args&&...> {
return internal::run_serialized_on_gpu<Kernel, Args...>(
/*buffer_capacity_hint=*/0, std::make_index_sequence<sizeof...(Args)>{},
internal::extract_output_indices<Args...>{}, kernel, std::forward<Args>(args)...);
@@ -322,7 +327,8 @@ auto run_on_gpu(Kernel kernel, Args&&... args) -> decltype(kernel(args...)) {
* \sa run_on_gpu
*/
template <typename Kernel, typename... Args>
auto run_on_gpu_with_hint(size_t buffer_capacity_hint, Kernel kernel, Args&&... args) -> decltype(kernel(args...)) {
auto run_on_gpu_with_hint(size_t buffer_capacity_hint, Kernel kernel, Args&&... args)
-> internal::kernel_result_t<Kernel, Args&&...> {
return internal::run_serialized_on_gpu<Kernel, Args...>(
buffer_capacity_hint, std::make_index_sequence<sizeof...(Args)>{}, internal::extract_output_indices<Args...>{},
kernel, std::forward<Args>(args)...);
@@ -409,7 +415,7 @@ void print_gpu_device_info() {
* \return kernel(args...).
*/
template <typename Kernel, typename... Args>
auto run(Kernel kernel, Args&&... args) -> decltype(kernel(args...)) {
auto run(Kernel kernel, Args&&... args) -> internal::kernel_result_t<Kernel, Args&&...> {
#ifdef EIGEN_GPUCC
return run_on_gpu(kernel, std::forward<Args>(args)...);
#else
@@ -432,7 +438,8 @@ auto run(Kernel kernel, Args&&... args) -> decltype(kernel(args...)) {
* \sa run
*/
template <typename Kernel, typename... Args>
auto run_with_hint(size_t buffer_capacity_hint, Kernel kernel, Args&&... args) -> decltype(kernel(args...)) {
auto run_with_hint(size_t buffer_capacity_hint, Kernel kernel, Args&&... args)
-> internal::kernel_result_t<Kernel, Args&&...> {
#ifdef EIGEN_GPUCC
return run_on_gpu_with_hint(buffer_capacity_hint, kernel, std::forward<Args>(args)...);
#else

View File

@@ -76,10 +76,8 @@
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#if CUDA_VERSION >= 7050
#include <cuda_fp16.h>
#endif
#endif
#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
#define EIGEN_TEST_NO_LONGDOUBLE
@@ -949,6 +947,37 @@ inline void set_seed_from_time() {
g_seed = static_cast<decltype(g_seed)>(ns);
}
#if defined(EIGEN_USE_GPU)
inline int maybe_skip_gpu_tests() {
#if defined(EIGEN_USE_HIP)
int device_count = 0;
hipError_t status = hipGetDeviceCount(&device_count);
if (status != hipSuccess) {
std::cout << "SKIP: HIP GPU tests require a visible ROCm device. hipGetDeviceCount failed with: "
<< hipGetErrorString(status) << std::endl;
return 77;
}
if (device_count <= 0) {
std::cout << "SKIP: HIP GPU tests require a visible ROCm device." << std::endl;
return 77;
}
#elif defined(EIGEN_CUDACC)
int device_count = 0;
cudaError_t status = cudaGetDeviceCount(&device_count);
if (status != cudaSuccess) {
std::cout << "SKIP: CUDA GPU tests require a visible CUDA device. cudaGetDeviceCount failed with: "
<< cudaGetErrorString(status) << std::endl;
return 77;
}
if (device_count <= 0) {
std::cout << "SKIP: CUDA GPU tests require a visible CUDA device." << std::endl;
return 77;
}
#endif
return 0;
}
#endif
int main(int argc, char* argv[]) {
g_has_set_repeat = false;
g_has_set_seed = false;
@@ -997,6 +1026,13 @@ int main(int argc, char* argv[]) {
srand(g_seed);
std::cout << "Repeating each test " << g_repeat << " times" << std::endl;
#if defined(EIGEN_USE_GPU)
{
const int skip_code = maybe_skip_gpu_tests();
if (skip_code != 0) return skip_code;
}
#endif
VERIFY(EigenTest::all().size() > 0);
for (std::size_t i = 0; i < EigenTest::all().size(); ++i) {

View File

@@ -0,0 +1,383 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2025 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_TEST_TRIDIAG_TEST_MATRICES_H
#define EIGEN_TEST_TRIDIAG_TEST_MATRICES_H
// Structured tridiagonal test matrices from the numerical linear algebra
// literature. Used by both the bidiagonal SVD and symmetric eigenvalue tests.
//
// Each generator writes into pre-allocated (diag, offdiag) vectors.
// For SVD, offdiag is the superdiagonal of a bidiagonal matrix.
// For eigenvalues, offdiag is the subdiagonal of a symmetric tridiagonal matrix.
//
// Usage:
// Matrix<RealScalar, Dynamic, 1> diag(n), offdiag(n-1);
// tridiag_identity(diag, offdiag); // fills diag and offdiag
// my_verify(diag, offdiag); // solver-specific verification
#include <Eigen/Core>
namespace Eigen {
namespace test {
// 1. Identity: d=[1,...,1], e=[0,...,0]
template <typename VectorType>
void tridiag_identity(VectorType& diag, VectorType& offdiag) {
diag.setOnes();
offdiag.setZero();
}
// 2. Zero: d=[0,...,0], e=[0,...,0]
template <typename VectorType>
void tridiag_zero(VectorType& diag, VectorType& offdiag) {
diag.setZero();
offdiag.setZero();
}
// 3. Constant: d=[c,...,c], e=[c,...,c]
template <typename VectorType>
void tridiag_constant(VectorType& diag, VectorType& offdiag,
typename VectorType::Scalar c = typename VectorType::Scalar(2.5)) {
diag.setConstant(c);
offdiag.setConstant(c);
}
// 4. 1-2-1 Toeplitz: d=[2,...,2], e=[1,...,1]
// Eigenvalues: 2 - 2*cos(k*pi/(n+1)) for k=1,...,n
template <typename VectorType>
void tridiag_1_2_1(VectorType& diag, VectorType& offdiag) {
typedef typename VectorType::Scalar Scalar;
diag.setConstant(Scalar(2));
offdiag.setOnes();
}
// 5. Wilkinson W_{2m+1}: d_i = |m - i|, e=[1,...,1]
// Has pairs of eigenvalues agreeing to many digits; stresses deflation.
template <typename VectorType>
void tridiag_wilkinson(VectorType& diag, VectorType& offdiag) {
typedef typename VectorType::Scalar Scalar;
Index n = diag.size();
for (Index i = 0; i < n; ++i) diag(i) = numext::abs(Scalar(n / 2) - Scalar(i));
offdiag.setOnes();
}
// 6. Clement matrix: d=[0,...,0], e_i = sqrt(i*(n-1-i))
// Known eigenvalues: -(n-1), -(n-3), ..., (n-3), (n-1)
template <typename VectorType>
void tridiag_clement(VectorType& diag, VectorType& offdiag) {
EIGEN_USING_STD(sqrt);
typedef typename VectorType::Scalar Scalar;
Index n = diag.size();
diag.setZero();
for (Index i = 0; i < n - 1; ++i) offdiag(i) = sqrt(Scalar(i + 1) * Scalar(n - 1 - i));
}
// 7. Kahan-style: d_i = s^i, e_i = -c*s^i with s=sin(theta), c=cos(theta).
// Geometric decay with controlled condition number.
template <typename VectorType>
void tridiag_kahan(VectorType& diag, VectorType& offdiag,
typename VectorType::Scalar theta = typename VectorType::Scalar(0.3)) {
EIGEN_USING_STD(sin);
EIGEN_USING_STD(cos);
EIGEN_USING_STD(pow);
EIGEN_USING_STD(log);
typedef typename VectorType::Scalar Scalar;
Index n = diag.size();
const Scalar eps = NumTraits<Scalar>::epsilon();
const Scalar s = sin(theta);
const Scalar c = cos(theta);
const Scalar maxPower = -log(eps) / (-log(s));
for (Index i = 0; i < n; ++i) diag(i) = pow(s, numext::mini(Scalar(i), maxPower));
for (Index i = 0; i < n - 1; ++i) offdiag(i) = -c * pow(s, numext::mini(Scalar(i), maxPower));
}
// 8. Graded: d_i = base^(-i), e_i = base^(-i)
template <typename VectorType>
void tridiag_graded(VectorType& diag, VectorType& offdiag,
typename VectorType::Scalar base = typename VectorType::Scalar(10)) {
EIGEN_USING_STD(pow);
typedef typename VectorType::Scalar Scalar;
Index n = diag.size();
for (Index i = 0; i < n; ++i) diag(i) = pow(base, -Scalar(i));
for (Index i = 0; i < n - 1; ++i) offdiag(i) = pow(base, -Scalar(i));
}
// 9. Geometric decay diagonal: d_i = base^i, e=[0,...,0]
template <typename VectorType>
void tridiag_geometric_diagonal(VectorType& diag, VectorType& offdiag,
typename VectorType::Scalar base = typename VectorType::Scalar(0.5)) {
EIGEN_USING_STD(pow);
EIGEN_USING_STD(log);
typedef typename VectorType::Scalar Scalar;
Index n = diag.size();
const Scalar eps = NumTraits<Scalar>::epsilon();
const Scalar maxPower = -log(eps) / (-log(base));
for (Index i = 0; i < n; ++i) diag(i) = pow(base, numext::mini(Scalar(i), maxPower));
offdiag.setZero();
}
// 10. Geometric decay offdiagonal: d=[1,...,1], e_i = base^i
template <typename VectorType>
void tridiag_geometric_offdiag(VectorType& diag, VectorType& offdiag,
typename VectorType::Scalar base = typename VectorType::Scalar(0.5)) {
EIGEN_USING_STD(pow);
typedef typename VectorType::Scalar Scalar;
Index n = diag.size();
diag.setOnes();
for (Index i = 0; i < n - 1; ++i) offdiag(i) = pow(base, Scalar(i));
}
// 11. Clustered eigenvalues: d_i = 1 + i*eps, e=[0,...,0]
template <typename VectorType>
void tridiag_clustered(VectorType& diag, VectorType& offdiag) {
typedef typename VectorType::Scalar Scalar;
Index n = diag.size();
const Scalar eps = NumTraits<Scalar>::epsilon();
for (Index i = 0; i < n; ++i) diag(i) = Scalar(1) + Scalar(i) * eps;
offdiag.setZero();
}
// 12. Two clusters: half at 1, half at eps.
template <typename VectorType>
void tridiag_two_clusters(VectorType& diag, VectorType& offdiag) {
typedef typename VectorType::Scalar Scalar;
Index n = diag.size();
const Scalar eps = NumTraits<Scalar>::epsilon();
for (Index i = 0; i < n; ++i) diag(i) = (i < n / 2) ? Scalar(1) : eps;
offdiag.setZero();
}
// 13. Single tiny value: d=[1,...,1,eps], e=[eps^2,...,eps^2]
template <typename VectorType>
void tridiag_single_tiny(VectorType& diag, VectorType& offdiag) {
typedef typename VectorType::Scalar Scalar;
Index n = diag.size();
const Scalar eps = NumTraits<Scalar>::epsilon();
diag.setOnes();
diag(n - 1) = eps;
offdiag.setConstant(eps * eps);
}
// 14. Overflow/underflow: alternating big/tiny diagonal and offdiagonal.
template <typename VectorType>
void tridiag_overflow_underflow(VectorType& diag, VectorType& offdiag) {
typedef typename VectorType::Scalar Scalar;
Index n = diag.size();
const Scalar big = (std::numeric_limits<Scalar>::max)() / Scalar(1000);
const Scalar tiny = (std::numeric_limits<Scalar>::min)() * Scalar(1000);
for (Index i = 0; i < n; ++i) diag(i) = (i % 2 == 0) ? big : tiny;
for (Index i = 0; i < n - 1; ++i) offdiag(i) = (i % 2 == 0) ? tiny : big;
}
// 15. Prescribed condition number: d_i = kappa^(-i/(n-1)), e_i = eps * random.
template <typename VectorType>
void tridiag_prescribed_cond(VectorType& diag, VectorType& offdiag) {
EIGEN_USING_STD(pow);
EIGEN_USING_STD(abs);
typedef typename VectorType::Scalar Scalar;
Index n = diag.size();
const Scalar eps = NumTraits<Scalar>::epsilon();
const Scalar kappa = Scalar(1) / eps;
for (Index i = 0; i < n; ++i) diag(i) = pow(kappa, -Scalar(i) / Scalar(n - 1));
for (Index i = 0; i < n - 1; ++i) offdiag(i) = eps * abs(internal::random<Scalar>());
}
// 16. Rank-deficient: d=[1,..,0,..,0,..,1], e=[0,...,0]
template <typename VectorType>
void tridiag_rank_deficient(VectorType& diag, VectorType& offdiag) {
typedef typename VectorType::Scalar Scalar;
Index n = diag.size();
for (Index i = 0; i < n; ++i) diag(i) = (i < n / 3 || i >= 2 * n / 3) ? Scalar(1) : Scalar(0);
offdiag.setZero();
}
// 17. Arrowhead-like: d_i = linspace(1,n), e_i = 1/(i+1)
template <typename VectorType>
void tridiag_arrowhead(VectorType& diag, VectorType& offdiag) {
typedef typename VectorType::Scalar Scalar;
Index n = diag.size();
for (Index i = 0; i < n; ++i) diag(i) = Scalar(1) + Scalar(i);
for (Index i = 0; i < n - 1; ++i) offdiag(i) = Scalar(1) / Scalar(i + 1);
}
// 18. Repeated values: d=[1,2,3,1,2,3,...], e=[0,...,0]
template <typename VectorType>
void tridiag_repeated(VectorType& diag, VectorType& offdiag) {
typedef typename VectorType::Scalar Scalar;
Index n = diag.size();
for (Index i = 0; i < n; ++i) diag(i) = Scalar((i % 3) + 1);
offdiag.setZero();
}
// 19. Glued blocks: d=[1,...,1], e=0 except e[n/2-1]=eps.
// Two identity blocks coupled by a tiny off-diagonal entry.
template <typename VectorType>
void tridiag_glued(VectorType& diag, VectorType& offdiag) {
typedef typename VectorType::Scalar Scalar;
Index n = diag.size();
diag.setOnes();
offdiag.setZero();
if (n > 2) offdiag(n / 2 - 1) = NumTraits<Scalar>::epsilon();
}
// 20. Nearly diagonal: random diag, eps * random offdiag.
template <typename VectorType>
void tridiag_nearly_diagonal(VectorType& diag, VectorType& offdiag) {
EIGEN_USING_STD(abs);
typedef typename VectorType::Scalar Scalar;
Index n = diag.size();
const Scalar eps = NumTraits<Scalar>::epsilon();
diag = VectorType::Random(n).cwiseAbs() + VectorType::Constant(n, Scalar(0.1));
for (Index i = 0; i < n - 1; ++i) offdiag(i) = eps * (Scalar(0.5) + abs(internal::random<Scalar>()));
}
// 21. Negative eigenvalues: d_i = -i, e=[1,...,1]
// (Only meaningful for symmetric eigenvalue problems, not SVD.)
template <typename VectorType>
void tridiag_negative(VectorType& diag, VectorType& offdiag) {
typedef typename VectorType::Scalar Scalar;
Index n = diag.size();
for (Index i = 0; i < n; ++i) diag(i) = -Scalar(i + 1);
offdiag.setOnes();
}
// 22. Mixed sign diagonal: d_i = (-1)^i * (i+1), e=[1,...,1]
// (Only meaningful for symmetric eigenvalue problems, not SVD.)
template <typename VectorType>
void tridiag_mixed_sign(VectorType& diag, VectorType& offdiag) {
typedef typename VectorType::Scalar Scalar;
Index n = diag.size();
for (Index i = 0; i < n; ++i) diag(i) = ((i % 2 == 0) ? Scalar(1) : Scalar(-1)) * Scalar(i + 1);
offdiag.setOnes();
}
// Helper: iterate over a set of sizes and call a functor with each (diag, offdiag) pair
// generated by a generator function.
//
// Usage:
// for_tridiag_sizes([](auto& diag, auto& offdiag) {
// tridiag_wilkinson(diag, offdiag);
// my_verify(diag, offdiag);
// });
template <typename Scalar, typename Func>
void for_tridiag_sizes(Func&& func) {
const int sizes[] = {1, 2, 3, 5, 10, 16, 20, 50, 100};
typedef Matrix<Scalar, Dynamic, 1> VectorType;
for (int si = 0; si < int(sizeof(sizes) / sizeof(sizes[0])); ++si) {
const Index n = sizes[si];
VectorType diag(n), offdiag(n > 1 ? n - 1 : 0);
func(diag, offdiag);
}
}
// Helper: run all generators (suitable for both SVD and eigenvalue problems).
// The callback receives (diag, offdiag) after each generator fills them.
template <typename Scalar, typename Func>
void for_all_tridiag_test_matrices(Func&& verify) {
const int sizes[] = {1, 2, 3, 5, 10, 16, 20, 50, 100};
typedef Matrix<Scalar, Dynamic, 1> VectorType;
for (int si = 0; si < int(sizeof(sizes) / sizeof(sizes[0])); ++si) {
const Index n = sizes[si];
VectorType diag(n), offdiag(n > 1 ? n - 1 : 0);
tridiag_identity(diag, offdiag);
verify(diag, offdiag);
tridiag_zero(diag, offdiag);
verify(diag, offdiag);
tridiag_constant(diag, offdiag);
verify(diag, offdiag);
tridiag_1_2_1(diag, offdiag);
verify(diag, offdiag);
tridiag_wilkinson(diag, offdiag);
verify(diag, offdiag);
if (n > 1) {
tridiag_clement(diag, offdiag);
verify(diag, offdiag);
}
tridiag_kahan(diag, offdiag);
verify(diag, offdiag);
tridiag_graded(diag, offdiag);
verify(diag, offdiag);
tridiag_geometric_diagonal(diag, offdiag);
verify(diag, offdiag);
if (n > 1) {
tridiag_geometric_offdiag(diag, offdiag);
verify(diag, offdiag);
}
tridiag_clustered(diag, offdiag);
verify(diag, offdiag);
tridiag_two_clusters(diag, offdiag);
verify(diag, offdiag);
tridiag_single_tiny(diag, offdiag);
verify(diag, offdiag);
tridiag_overflow_underflow(diag, offdiag);
verify(diag, offdiag);
if (n > 1) {
tridiag_prescribed_cond(diag, offdiag);
verify(diag, offdiag);
}
tridiag_rank_deficient(diag, offdiag);
verify(diag, offdiag);
tridiag_arrowhead(diag, offdiag);
verify(diag, offdiag);
tridiag_repeated(diag, offdiag);
verify(diag, offdiag);
tridiag_glued(diag, offdiag);
verify(diag, offdiag);
tridiag_nearly_diagonal(diag, offdiag);
verify(diag, offdiag);
}
}
// Helper: run all generators, including those with negative values
// (suitable only for symmetric eigenvalue problems, not SVD).
template <typename Scalar, typename Func>
void for_all_symmetric_tridiag_test_matrices(Func&& verify) {
for_all_tridiag_test_matrices<Scalar>(verify);
const int sizes[] = {1, 2, 3, 5, 10, 16, 20, 50, 100};
typedef Matrix<Scalar, Dynamic, 1> VectorType;
for (int si = 0; si < int(sizeof(sizes) / sizeof(sizes[0])); ++si) {
const Index n = sizes[si];
VectorType diag(n), offdiag(n > 1 ? n - 1 : 0);
tridiag_negative(diag, offdiag);
verify(diag, offdiag);
tridiag_mixed_sign(diag, offdiag);
verify(diag, offdiag);
}
}
} // namespace test
} // namespace Eigen
#endif // EIGEN_TEST_TRIDIAG_TEST_MATRICES_H

View File

@@ -393,7 +393,8 @@ __device__ EIGEN_STRONG_INLINE void EigenContractionKernelInternal(const LhsMapp
// the sum across all big k blocks of the product of little k block of index (x, y)
// with block of index (y, z). To compute the final output, we need to reduce
// the 8 threads over y by summation.
#if defined(EIGEN_HIPCC) || (defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000)
// HIP uses non-sync warp shuffles; CUDA requires the _sync variants.
#if defined(EIGEN_HIPCC)
#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor(res(i, j), mask)
#else
#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor_sync(0xFFFFFFFF, res(i, j), mask)
@@ -622,7 +623,7 @@ __device__ __forceinline__ void EigenFloatContractionKernelInternal16x16(const L
x1 = rhs_pf0.x;
x2 = rhs_pf0.z;
}
#if defined(EIGEN_HIPCC) || (defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000)
#if defined(EIGEN_HIPCC)
x1 = __shfl_xor(x1, 4);
x2 = __shfl_xor(x2, 4);
#else
@@ -1377,13 +1378,6 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
this->m_right_contracting_strides, this->m_k_strides);
OutputMapper output(buffer, m);
#if defined(EIGEN_USE_HIP)
setGpuSharedMemConfig(hipSharedMemBankSizeEightByte);
#else
setGpuSharedMemConfig(cudaSharedMemBankSizeEightByte);
#endif
LaunchKernels<LhsScalar, RhsScalar, Index, LhsMapper, RhsMapper, OutputMapper>::Run(lhs, rhs, output, m, n, k,
this->m_device);
}

View File

@@ -89,7 +89,7 @@ class IndexMapper {
}
} else {
for (int i = NumDims - 1; i >= 0; --i) {
if (static_cast<size_t>(i + 1) < offset) {
if (i + 1 < static_cast<int>(offset)) {
m_gpuInputStrides[i] = m_gpuInputStrides[i + 1] * gpuInputDimensions[i + 1];
m_gpuOutputStrides[i] = m_gpuOutputStrides[i + 1] * gpuOutputDimensions[i + 1];
} else {

View File

@@ -342,19 +342,6 @@ struct GpuDevice {
#endif
// FIXME: Should be device and kernel specific.
#ifdef EIGEN_GPUCC
static EIGEN_DEVICE_FUNC inline void setGpuSharedMemConfig(gpuSharedMemConfig config) {
#ifndef EIGEN_GPU_COMPILE_PHASE
gpuError_t status = gpuDeviceSetSharedMemConfig(config);
EIGEN_UNUSED_VARIABLE(status);
gpu_assert(status == gpuSuccess);
#else
EIGEN_UNUSED_VARIABLE(config);
#endif
}
#endif
} // end namespace Eigen
// undefine all the gpu* macros we defined at the beginning of the file

View File

@@ -175,7 +175,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T loadConstant(const T* address) {
return *address;
}
// Use the texture cache on CUDA devices whenever possible
#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
#if defined(EIGEN_CUDA_ARCH)
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float loadConstant(const float* address) {
return __ldg(address);

View File

@@ -49,7 +49,7 @@ struct PacketType : internal::packet_traits<Scalar> {
};
// For CUDA packet types when using a GpuDevice
#if defined(EIGEN_USE_GPU) && defined(EIGEN_HAS_GPU_FP16) && defined(EIGEN_GPU_COMPILE_PHASE)
#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPU_COMPILE_PHASE)
typedef ulonglong2 Packet4h2;
template <>

View File

@@ -453,7 +453,7 @@ template <int B, int N, typename S, typename R, typename I_>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*,
unsigned int*);
#if defined(EIGEN_HAS_GPU_FP16)
#if defined(EIGEN_GPUCC)
template <typename S, typename R, typename I_>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat(
R, const S, I_, internal::packet_traits<half>::type*);
@@ -883,7 +883,7 @@ struct TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, M
#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
template <int B, int N, typename S, typename R, typename I_>
KERNEL_FRIEND void internal::FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*, unsigned int*);
#if defined(EIGEN_HAS_GPU_FP16)
#if defined(EIGEN_GPUCC)
template <typename S, typename R, typename I_>
KERNEL_FRIEND void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I_,
internal::packet_traits<Eigen::half>::type*);

View File

@@ -25,7 +25,6 @@ namespace internal {
// updated the content of the output address it will try again.
template <typename T, typename R>
__device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) {
#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
if (sizeof(T) == 4) {
unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
unsigned int newval = oldval;
@@ -61,12 +60,6 @@ __device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer)
} else {
gpu_assert(0 && "Wordsize not supported");
}
#else // EIGEN_CUDA_ARCH >= 300
EIGEN_UNUSED_VARIABLE(output);
EIGEN_UNUSED_VARIABLE(accum);
EIGEN_UNUSED_VARIABLE(reducer);
gpu_assert(0 && "Shouldn't be called on unsupported device");
#endif // EIGEN_CUDA_ARCH >= 300
}
// We extend atomicExch to support extra data types
@@ -75,13 +68,42 @@ __device__ inline Type atomicExchCustom(Type* address, Type val) {
return atomicExch(address, val);
}
template <typename T>
EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR auto reduction_shuffle_mask() {
#if defined(EIGEN_HIP_DEVICE_COMPILE)
return 0xFFFFFFFFFFFFFFFFull;
#else
return 0xFFFFFFFFu;
#endif
}
template <typename T>
__device__ EIGEN_ALWAYS_INLINE T reduction_shuffle_down(T value, int offset) {
return __shfl_down_sync(reduction_shuffle_mask<T>(), value, offset, warpSize);
}
template <>
__device__ EIGEN_ALWAYS_INLINE int reduction_shuffle_down<int>(int value, int offset) {
return __shfl_down_sync(reduction_shuffle_mask<int>(), value, offset, warpSize);
}
template <>
__device__ EIGEN_ALWAYS_INLINE float reduction_shuffle_down<float>(float value, int offset) {
return __shfl_down_sync(reduction_shuffle_mask<float>(), value, offset, warpSize);
}
template <>
__device__ EIGEN_ALWAYS_INLINE double reduction_shuffle_down<double>(double value, int offset) {
return __shfl_down_sync(reduction_shuffle_mask<double>(), value, offset, warpSize);
}
template <>
__device__ inline double atomicExchCustom(double* address, double val) {
unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(address);
return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val)));
}
#ifdef EIGEN_HAS_GPU_FP16
// Half-float reduction specializations.
template <typename R>
__device__ inline void atomicReduce(half2* output, half2 accum, R& reducer) {
unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
@@ -111,17 +133,10 @@ __device__ inline void atomicReduce(Packet4h2* output, Packet4h2 accum, R& reduc
}
}
#endif // EIGEN_GPU_COMPILE_PHASE
#endif // EIGEN_HAS_GPU_FP16
template <>
__device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) {
#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
atomicAdd(output, accum);
#else // EIGEN_CUDA_ARCH >= 300
EIGEN_UNUSED_VARIABLE(output);
EIGEN_UNUSED_VARIABLE(accum);
gpu_assert(0 && "Shouldn't be called on unsupported device");
#endif // EIGEN_CUDA_ARCH >= 300
}
template <typename CoeffType, typename Index>
@@ -138,7 +153,6 @@ template <int BlockSize, int NumPerThread, typename Self, typename Reducer, type
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
typename Self::CoeffReturnType* output,
unsigned int* semaphore) {
#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
// Initialize the output value
const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x;
if (gridDim.x == 1) {
@@ -179,20 +193,7 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(Reducer reducer
#pragma unroll
for (int offset = warpSize / 2; offset > 0; offset /= 2) {
#if defined(EIGEN_HIPCC)
// use std::is_floating_point to determine the type of reduced_val
// This is needed because when Type == double, hipcc will give a "call to __shfl_down is ambiguous" error
// and list the float and int versions of __shfl_down as the candidate functions.
if (std::is_floating_point<typename Self::CoeffReturnType>::value) {
reducer.reduce(__shfl_down(static_cast<float>(accum), offset, warpSize), &accum);
} else {
reducer.reduce(__shfl_down(static_cast<int>(accum), offset, warpSize), &accum);
}
#elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
reducer.reduce(__shfl_down(accum, offset, warpSize), &accum);
#else
reducer.reduce(__shfl_down_sync(0xFFFFFFFF, accum, offset, warpSize), &accum);
#endif
reducer.reduce(reduction_shuffle_down(accum, offset), &accum);
}
if ((threadIdx.x & (warpSize - 1)) == 0) {
@@ -206,17 +207,9 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(Reducer reducer
__threadfence_system();
#endif
}
#else // EIGEN_CUDA_ARCH >= 300
EIGEN_UNUSED_VARIABLE(reducer);
EIGEN_UNUSED_VARIABLE(input);
EIGEN_UNUSED_VARIABLE(num_coeffs);
EIGEN_UNUSED_VARIABLE(output);
EIGEN_UNUSED_VARIABLE(semaphore);
gpu_assert(0 && "Shouldn't be called on unsupported device");
#endif // EIGEN_CUDA_ARCH >= 300
}
#ifdef EIGEN_HAS_GPU_FP16
// Half-float reduction specializations.
template <typename Self, typename Reducer, typename Index>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Self input,
Index num_coeffs, half* scratch) {
@@ -319,14 +312,6 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(Reduce
hr[i] = wka_out.h;
}
reducer.reducePacket(r1, &accum);
#elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
PacketType r1;
half2* hr = reinterpret_cast<half2*>(&r1);
half2* hacc = reinterpret_cast<half2*>(&accum);
for (int i = 0; i < packet_width / 2; i++) {
hr[i] = __shfl_down(hacc[i], offset, warpSize);
}
reducer.reducePacket(r1, &accum);
#else
PacketType r1;
half2* hr = reinterpret_cast<half2*>(&r1);
@@ -377,8 +362,6 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionCleanupKernelHalfFloat(Op
}
}
#endif // EIGEN_HAS_GPU_FP16
template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
struct FullReductionLauncher {
static void run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index) {
@@ -409,7 +392,7 @@ struct FullReductionLauncher<
}
};
#ifdef EIGEN_HAS_GPU_FP16
// Half-float reduction specializations.
template <typename Self, typename Op>
struct FullReductionLauncher<Self, Op, Eigen::half, false> {
static void run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index) {
@@ -443,24 +426,18 @@ struct FullReductionLauncher<Self, Op, Eigen::half, true> {
}
}
};
#endif // EIGEN_HAS_GPU_FP16
template <typename Self, typename Op, bool Vectorizable>
struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
// Unfortunately nvidia doesn't support well exotic types such as complex,
// so reduce the scope of the optimized version of the code to the simple cases
// of doubles, floats and half floats
#ifdef EIGEN_HAS_GPU_FP16
// Half-float reduction specializations.
static constexpr bool HasOptimizedImplementation =
!Self::ReducerTraits::IsStateful && (internal::is_same<typename Self::CoeffReturnType, float>::value ||
internal::is_same<typename Self::CoeffReturnType, double>::value ||
(internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value &&
reducer_traits<Op, GpuDevice>::PacketAccess));
#else // EIGEN_HAS_GPU_FP16
static constexpr bool HasOptimizedImplementation =
!Self::ReducerTraits::IsStateful && (internal::is_same<typename Self::CoeffReturnType, float>::value ||
internal::is_same<typename Self::CoeffReturnType, double>::value);
#endif // EIGEN_HAS_GPU_FP16
template <typename OutputType>
static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
@@ -481,7 +458,6 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(Reducer reduce
Index num_coeffs_to_reduce,
Index num_preserved_coeffs,
typename Self::CoeffReturnType* output) {
#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
typedef typename Self::CoeffReturnType Type;
eigen_assert(blockDim.y == 1);
eigen_assert(blockDim.z == 1);
@@ -534,20 +510,7 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(Reducer reduce
#pragma unroll
for (int offset = warpSize / 2; offset > 0; offset /= 2) {
#if defined(EIGEN_HIPCC)
// use std::is_floating_point to determine the type of reduced_val
// This is needed because when Type == double, hipcc will give a "call to __shfl_down is ambiguous" error
// and list the float and int versions of __shfl_down as the candidate functions.
if (std::is_floating_point<Type>::value) {
reducer.reduce(__shfl_down(static_cast<float>(reduced_val), offset), &reduced_val);
} else {
reducer.reduce(__shfl_down(static_cast<int>(reduced_val), offset), &reduced_val);
}
#elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
reducer.reduce(__shfl_down(reduced_val, offset), &reduced_val);
#else
reducer.reduce(__shfl_down_sync(0xFFFFFFFF, reduced_val, offset), &reduced_val);
#endif
reducer.reduce(reduction_shuffle_down(reduced_val, offset), &reduced_val);
}
if ((threadIdx.x & (warpSize - 1)) == 0) {
@@ -555,17 +518,9 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(Reducer reduce
}
}
}
#else // EIGEN_CUDA_ARCH >= 300
EIGEN_UNUSED_VARIABLE(reducer);
EIGEN_UNUSED_VARIABLE(input);
EIGEN_UNUSED_VARIABLE(num_coeffs_to_reduce);
EIGEN_UNUSED_VARIABLE(num_preserved_coeffs);
EIGEN_UNUSED_VARIABLE(output);
gpu_assert(0 && "Shouldn't be called on unsupported device");
#endif // EIGEN_CUDA_ARCH >= 300
}
#ifdef EIGEN_HAS_GPU_FP16
// Half-float reduction specializations.
template <int NumPerThread, typename Self, typename Reducer, typename Index>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(Reducer reducer, const Self input,
@@ -688,19 +643,6 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(Reduc
}
reducer.reducePacket(r1, &reduced_val1);
reducer.reducePacket(r2, &reduced_val2);
#elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
PacketType r1;
PacketType r2;
half2* hr1 = reinterpret_cast<half2*>(&r1);
half2* hr2 = reinterpret_cast<half2*>(&r2);
half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
for (int i = 0; i < packet_width / 2; i++) {
hr1[i] = __shfl_down(rv1[i], offset, warpSize);
hr2[i] = __shfl_down(rv2[i], offset, warpSize);
}
reducer.reducePacket(r1, &reduced_val1);
reducer.reducePacket(r2, &reduced_val2);
#else
PacketType r1;
PacketType r2;
@@ -741,8 +683,6 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(Reduc
}
}
#endif // EIGEN_HAS_GPU_FP16
template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
struct InnerReductionLauncher {
static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index,
@@ -786,7 +726,7 @@ struct InnerReductionLauncher<
}
};
#ifdef EIGEN_HAS_GPU_FP16
// Half-float reduction specializations.
template <typename Self, typename Op>
struct InnerReductionLauncher<Self, Op, Eigen::half, false> {
static bool run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index, typename Self::Index) {
@@ -826,24 +766,18 @@ struct InnerReductionLauncher<Self, Op, Eigen::half, true> {
return false;
}
};
#endif // EIGEN_HAS_GPU_FP16
template <typename Self, typename Op>
struct InnerReducer<Self, Op, GpuDevice> {
// Unfortunately nvidia doesn't support well exotic types such as complex,
// so reduce the scope of the optimized version of the code to the simple case
// of floats and half floats.
#ifdef EIGEN_HAS_GPU_FP16
// Half-float reduction specializations.
static constexpr bool HasOptimizedImplementation =
!Self::ReducerTraits::IsStateful && (internal::is_same<typename Self::CoeffReturnType, float>::value ||
internal::is_same<typename Self::CoeffReturnType, double>::value ||
(internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value &&
reducer_traits<Op, GpuDevice>::PacketAccess));
#else // EIGEN_HAS_GPU_FP16
static constexpr bool HasOptimizedImplementation =
!Self::ReducerTraits::IsStateful && (internal::is_same<typename Self::CoeffReturnType, float>::value ||
internal::is_same<typename Self::CoeffReturnType, double>::value);
#endif // EIGEN_HAS_GPU_FP16
template <typename OutputType>
static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output,

View File

@@ -237,7 +237,7 @@ if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8" AND NOT CMAKE_CXX_COMPILER_ID STREQUAL "MS
ei_add_test(cxx11_tensor_uint128)
endif()
find_package(CUDA 9.0)
find_package(CUDA 11.4)
if(CUDA_FOUND AND EIGEN_TEST_CUDA)
# Make sure to compile without the -pedantic, -Wundef, -Wnon-virtual-dtor
# and -fno-check-new flags since they trigger thousands of compilation warnings
@@ -281,26 +281,11 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA)
ei_add_test(cxx11_tensor_argmax_gpu)
ei_add_test(cxx11_tensor_cast_float16_gpu)
ei_add_test(cxx11_tensor_scan_gpu)
set(EIGEN_CUDA_OLDEST_COMPUTE_ARCH 9999)
foreach(ARCH IN LISTS EIGEN_CUDA_COMPUTE_ARCH)
if(${ARCH} LESS ${EIGEN_CUDA_OLDEST_COMPUTE_ARCH})
set(EIGEN_CUDA_OLDEST_COMPUTE_ARCH ${ARCH})
endif()
endforeach()
# Contractions require arch 3.0 or higher
if (${EIGEN_CUDA_OLDEST_COMPUTE_ARCH} GREATER 29)
ei_add_test(cxx11_tensor_device)
ei_add_test(cxx11_tensor_gpu)
ei_add_test(cxx11_tensor_contract_gpu)
ei_add_test(cxx11_tensor_of_float16_gpu)
endif()
# The random number generation code requires arch 3.5 or greater.
if (${EIGEN_CUDA_OLDEST_COMPUTE_ARCH} GREATER 34)
ei_add_test(cxx11_tensor_random_gpu)
endif()
ei_add_test(cxx11_tensor_device)
ei_add_test(cxx11_tensor_gpu)
ei_add_test(cxx11_tensor_contract_gpu)
ei_add_test(cxx11_tensor_of_float16_gpu)
ei_add_test(cxx11_tensor_random_gpu)
unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
endif()
@@ -341,7 +326,6 @@ if (EIGEN_TEST_HIP)
ei_add_test(cxx11_tensor_cast_float16_gpu)
ei_add_test(cxx11_tensor_scan_gpu)
ei_add_test(cxx11_tensor_device)
ei_add_test(cxx11_tensor_gpu)
ei_add_test(cxx11_tensor_contract_gpu)
ei_add_test(cxx11_tensor_of_float16_gpu)

View File

@@ -850,6 +850,7 @@ void test_gpu_igamma() {
Tensor<Scalar, 2> a(6, 6);
Tensor<Scalar, 2> x(6, 6);
Tensor<Scalar, 2> out(6, 6);
Tensor<Scalar, 2> expected_out(6, 6);
out.setZero();
Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
@@ -862,14 +863,11 @@ void test_gpu_igamma() {
}
}
Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
Scalar igamma_s[][6] = {
{0.0, nan, nan, nan, nan, nan},
{0.0, 0.6321205588285578, 0.7768698398515702, 0.9816843611112658, 9.999500016666262e-05, 1.0},
{0.0, 0.4275932955291202, 0.608374823728911, 0.9539882943107686, 7.522076445089201e-07, 1.0},
{0.0, 0.01898815687615381, 0.06564245437845008, 0.5665298796332909, 4.166333347221828e-18, 1.0},
{0.0, 0.9999780593618628, 0.9999899967080838, 0.9999996219837988, 0.9991370418689945, 1.0},
{0.0, 0.0, 0.0, 0.0, 0.0, 0.5042041932513908}};
for (int i = 0; i < 6; ++i) {
for (int j = 0; j < 6; ++j) {
expected_out(i, j) = numext::igamma(a(i, j), x(i, j));
}
}
std::size_t bytes = a.size() * sizeof(Scalar);
@@ -897,10 +895,10 @@ void test_gpu_igamma() {
for (int i = 0; i < 6; ++i) {
for (int j = 0; j < 6; ++j) {
if ((std::isnan)(igamma_s[i][j])) {
if ((std::isnan)(expected_out(i, j))) {
VERIFY((std::isnan)(out(i, j)));
} else {
VERIFY_IS_APPROX(out(i, j), igamma_s[i][j]);
VERIFY_IS_APPROX(out(i, j), expected_out(i, j));
}
}
}
@@ -915,6 +913,7 @@ void test_gpu_igammac() {
Tensor<Scalar, 2> a(6, 6);
Tensor<Scalar, 2> x(6, 6);
Tensor<Scalar, 2> out(6, 6);
Tensor<Scalar, 2> expected_out(6, 6);
out.setZero();
Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
@@ -927,14 +926,11 @@ void test_gpu_igammac() {
}
}
Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
Scalar igammac_s[][6] = {
{nan, nan, nan, nan, nan, nan},
{1.0, 0.36787944117144233, 0.22313016014842982, 0.018315638888734182, 0.9999000049998333, 0.0},
{1.0, 0.5724067044708798, 0.3916251762710878, 0.04601170568923136, 0.9999992477923555, 0.0},
{1.0, 0.9810118431238462, 0.9343575456215499, 0.4334701203667089, 1.0, 0.0},
{1.0, 2.1940638138146658e-05, 1.0003291916285e-05, 3.7801620118431334e-07, 0.0008629581310054535, 0.0},
{1.0, 1.0, 1.0, 1.0, 1.0, 0.49579580674813944}};
for (int i = 0; i < 6; ++i) {
for (int j = 0; j < 6; ++j) {
expected_out(i, j) = numext::igammac(a(i, j), x(i, j));
}
}
std::size_t bytes = a.size() * sizeof(Scalar);
@@ -962,10 +958,10 @@ void test_gpu_igammac() {
for (int i = 0; i < 6; ++i) {
for (int j = 0; j < 6; ++j) {
if ((std::isnan)(igammac_s[i][j])) {
if ((std::isnan)(expected_out(i, j))) {
VERIFY((std::isnan)(out(i, j)));
} else {
VERIFY_IS_APPROX(out(i, j), igammac_s[i][j]);
VERIFY_IS_APPROX(out(i, j), expected_out(i, j));
}
}
}
@@ -1068,15 +1064,9 @@ void test_gpu_ndtri() {
in_x(7) = Scalar(0.99);
in_x(8) = Scalar(0.01);
expected_out(0) = std::numeric_limits<Scalar>::infinity();
expected_out(1) = -std::numeric_limits<Scalar>::infinity();
expected_out(2) = Scalar(0.0);
expected_out(3) = Scalar(-0.8416212335729142);
expected_out(4) = Scalar(0.8416212335729142);
expected_out(5) = Scalar(1.2815515655446004);
expected_out(6) = Scalar(-1.2815515655446004);
expected_out(7) = Scalar(2.3263478740408408);
expected_out(8) = Scalar(-2.3263478740408408);
for (int i = 0; i < 9; ++i) {
expected_out(i) = numext::ndtri(in_x(i));
}
std::size_t bytes = in_x.size() * sizeof(Scalar);
@@ -1090,15 +1080,15 @@ void test_gpu_ndtri() {
Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 6);
Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 6);
Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 9);
Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 9);
gpu_out.device(gpu_device) = gpu_in_x.ndtri();
assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
for (int i = 0; i < 6; ++i) {
for (int i = 0; i < 9; ++i) {
VERIFY_IS_CWISE_APPROX(out(i), expected_out(i));
}
@@ -1115,12 +1105,9 @@ void test_gpu_betainc() {
Tensor<Scalar, 1> expected_out(125);
out.setZero();
Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
Array<Scalar, 1, Dynamic> x(125);
Array<Scalar, 1, Dynamic> a(125);
Array<Scalar, 1, Dynamic> b(125);
Array<Scalar, 1, Dynamic> v(125);
a << 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
@@ -1160,25 +1147,11 @@ void test_gpu_betainc() {
0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8,
1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1;
v << nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
nan, nan, nan, nan, nan, nan, nan, nan, nan, 0.47972119876364683, 0.5, 0.5202788012363533, nan, nan,
0.9518683957740043, 0.9789663010413743, 0.9931729188073435, nan, nan, 0.999995949033062, 0.9999999999993698,
0.9999999999999999, nan, nan, 0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan, nan, nan, nan, nan,
nan, nan, 0.006827081192655869, 0.0210336989586256, 0.04813160422599567, nan, nan, 0.20014344256217678,
0.5000000000000001, 0.7998565574378232, nan, nan, 0.9991401428435834, 0.999999999698403, 0.9999999999999999, nan,
nan, 0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan, nan, nan, nan, nan, nan, nan,
1.0646600232370887e-25, 6.301722877826246e-13, 4.050966937974938e-06, nan, nan, 7.864342668429763e-23,
3.015969667594166e-10, 0.0008598571564165444, nan, nan, 6.031987710123844e-08, 0.5000000000000007,
0.9999999396801229, nan, nan, 0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan, nan, nan, nan, nan,
nan, nan, 0.0, 7.029920380986636e-306, 2.2450728208591345e-101, nan, nan, 0.0, 9.275871147869727e-302,
1.2232913026152827e-97, nan, nan, 0.0, 3.0891393081932924e-252, 2.9303043666183996e-60, nan, nan,
2.248913486879199e-196, 0.5000000000004947, 0.9999999999999999, nan;
for (int i = 0; i < 125; ++i) {
in_x(i) = x(i);
in_a(i) = a(i);
in_b(i) = b(i);
expected_out(i) = v(i);
expected_out(i) = numext::betainc(a(i), b(i), x(i));
}
std::size_t bytes = in_x.size() * sizeof(Scalar);

View File

@@ -53,8 +53,6 @@ void test_gpu_numext() {
gpu_device.deallocate(d_res_float);
}
#ifdef EIGEN_HAS_GPU_FP16
template <typename>
void test_gpu_conversion() {
Eigen::GpuStreamDevice stream;
@@ -442,12 +440,10 @@ void test_gpu_forced_evals() {
gpu_device.deallocate(d_res_half2);
gpu_device.deallocate(d_res_float);
}
#endif
EIGEN_DECLARE_TEST(cxx11_tensor_of_float16_gpu) {
CALL_SUBTEST_1(test_gpu_numext<void>());
#ifdef EIGEN_HAS_GPU_FP16
CALL_SUBTEST_1(test_gpu_conversion<void>());
CALL_SUBTEST_1(test_gpu_unary<void>());
CALL_SUBTEST_1(test_gpu_elementwise<void>());
@@ -456,7 +452,4 @@ EIGEN_DECLARE_TEST(cxx11_tensor_of_float16_gpu) {
CALL_SUBTEST_3(test_gpu_reductions<void>());
CALL_SUBTEST_4(test_gpu_full_reductions<void>());
CALL_SUBTEST_5(test_gpu_forced_evals<void>());
#else
std::cout << "Half floats are not supported by this version of gpu: skipping the test" << std::endl;
#endif
}