GPU: Raise CUDA/HIP minimum and remove legacy guards

- Raise CUDA minimum from 9.0 to 11.4 (sm_70/Volta). - Raise HIP minimum to GFX906 (Vega 20/MI50) / ROCm 5.6. - Remove EIGEN_HAS_{CUDA,HIP,GPU}_FP16 guards — FP16 is always available on sm_70+ and GFX906+. - Remove obsolete __HIP_ARCH_HAS_* preprocessor branches. - C++14 cleanup: remove pre-C++14 workarounds in GPU code. - Fix NVCC warnings (deprecated register keyword, unreachable code, tautological comparisons). - Fix HIP test execution on gfx1151. - Update CI configuration for new minimum versions.
Add plog_core_double with fallback for AVX without AVX2
2026-04-10 11:34:33 +08:00 · 2026-04-09 15:21:39 -07:00 · 2026-04-08 19:41:07 -07:00 · 2026-04-08 13:03:48 -07:00 · 2026-04-07 21:48:25 -07:00 · 2026-04-07 21:37:58 -07:00
73 changed files with 3488 additions and 7536 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -672,7 +672,7 @@ if (EIGEN_BUILD_TESTING)
  endif()

  set(EIGEN_CUDA_CXX_FLAGS "" CACHE STRING "Additional flags to pass to the cuda compiler.")
-  set(EIGEN_CUDA_COMPUTE_ARCH 30 CACHE STRING "The CUDA compute architecture(s) to target when compiling CUDA code")
+  set(EIGEN_CUDA_COMPUTE_ARCH 70 CACHE STRING "The CUDA compute architecture(s) to target when compiling CUDA code")

  option(EIGEN_TEST_SYCL "Add Sycl support." OFF)
  if(EIGEN_TEST_SYCL)
@@ -817,4 +817,3 @@ endif()
 message(STATUS "")
 message(STATUS "Configured Eigen ${EIGEN_VERSION_STRING}")
 message(STATUS "")
-
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -50,9 +50,9 @@
 #include "src/Core/util/AOCL_Support.h"


-#if defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)
-#define EIGEN_HAS_GPU_FP16
-#endif
+// EIGEN_HAS_GPU_FP16 is now always true when compiling with CUDA or HIP.
+// Use EIGEN_GPUCC (compile-time) or EIGEN_GPU_COMPILE_PHASE (device phase) instead.
+// TODO: Remove EIGEN_HAS_GPU_BF16 similarly once HIP bf16 guards are cleaned up.

 #if defined(EIGEN_HAS_CUDA_BF16) || defined(EIGEN_HAS_HIP_BF16)
 #define EIGEN_HAS_GPU_BF16
--- a/Eigen/src/Core/arch/Default/BFloat16.h
+++ b/Eigen/src/Core/arch/Default/BFloat16.h
@@ -858,16 +858,8 @@ struct hash<Eigen::bfloat16> {
 }  // namespace std
 #endif

-// Add the missing shfl* intrinsics.
-// The __shfl* functions are only valid on HIP or _CUDA_ARCH_ >= 300.
-//   CUDA defines them for (__CUDA_ARCH__ >= 300 || !defined(__CUDA_ARCH__))
-//
-// HIP and CUDA prior to SDK 9.0 define
-//    __shfl, __shfl_up, __shfl_down, __shfl_xor for int and float
-// CUDA since 9.0 deprecates those and instead defines
-//    __shfl_sync, __shfl_up_sync, __shfl_down_sync, __shfl_xor_sync,
-//    with native support for __half and __nv_bfloat16
-//
+// Warp shuffle overloads for Eigen::bfloat16.
+// HIP uses non-sync __shfl variants; CUDA has native __nv_bfloat16 support in __shfl_sync.
 // Note that the following are __device__ - only functions.
 #if defined(EIGEN_HIPCC)

--- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
+++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
@@ -141,6 +141,140 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog2_float(const Pac
  return plog_impl_float<Packet, /* base2 */ true>(_x);
 }

+// -----------------------------------------------------------------------
+// Double logarithm: shared polynomial + two range-reduction backends
+// -----------------------------------------------------------------------
+
+// Cephes rational-polynomial approximation of log(1+f) for
+// f in [sqrt(0.5)-1, sqrt(2)-1].
+// Evaluates x - 0.5*x^2 + x^3 * P(x)/Q(x) where P and Q are degree-5.
+// See: http://www.netlib.org/cephes/
+template <typename Packet>
+EIGEN_STRONG_INLINE Packet plog_mantissa_double(const Packet x) {
+  const Packet cst_cephes_log_p0 = pset1<Packet>(1.01875663804580931796E-4);
+  const Packet cst_cephes_log_p1 = pset1<Packet>(4.97494994976747001425E-1);
+  const Packet cst_cephes_log_p2 = pset1<Packet>(4.70579119878881725854E0);
+  const Packet cst_cephes_log_p3 = pset1<Packet>(1.44989225341610930846E1);
+  const Packet cst_cephes_log_p4 = pset1<Packet>(1.79368678507819816313E1);
+  const Packet cst_cephes_log_p5 = pset1<Packet>(7.70838733755885391666E0);
+  // Q0 = 1.0; pmadd(1, x, q1) simplifies to padd(x, q1).
+  const Packet cst_cephes_log_q1 = pset1<Packet>(1.12873587189167450590E1);
+  const Packet cst_cephes_log_q2 = pset1<Packet>(4.52279145837532221105E1);
+  const Packet cst_cephes_log_q3 = pset1<Packet>(8.29875266912776603211E1);
+  const Packet cst_cephes_log_q4 = pset1<Packet>(7.11544750618563894466E1);
+  const Packet cst_cephes_log_q5 = pset1<Packet>(2.31251620126765340583E1);
+
+  Packet x2 = pmul(x, x);
+  Packet x3 = pmul(x2, x);
+
+  // Evaluate P and Q simultaneously for better ILP.
+  Packet y, y1, y_;
+  y = pmadd(cst_cephes_log_p0, x, cst_cephes_log_p1);
+  y1 = pmadd(cst_cephes_log_p3, x, cst_cephes_log_p4);
+  y = pmadd(y, x, cst_cephes_log_p2);
+  y1 = pmadd(y1, x, cst_cephes_log_p5);
+  y_ = pmadd(y, x3, y1);
+
+  y = padd(x, cst_cephes_log_q1);
+  y1 = pmadd(cst_cephes_log_q3, x, cst_cephes_log_q4);
+  y = pmadd(y, x, cst_cephes_log_q2);
+  y1 = pmadd(y1, x, cst_cephes_log_q5);
+  y = pmadd(y, x3, y1);
+
+  y_ = pmul(y_, x3);
+  y = pdiv(y_, y);
+  y = pnmadd(pset1<Packet>(0.5), x2, y);
+  return padd(x, y);
+}
+
+// Detect whether unpacket_traits<Packet>::integer_packet is defined.
+template <typename Packet, typename = void>
+struct packet_has_integer_packet : std::false_type {};
+template <typename Packet>
+struct packet_has_integer_packet<Packet, void_t<typename unpacket_traits<Packet>::integer_packet>> : std::true_type {};
+
+// Dispatch struct for double-precision range reduction.
+// Primary template: pfrexp-based fallback (used when integer_packet is absent).
+template <typename Packet, bool UseIntegerPacket>
+struct plog_range_reduce_double {
+  EIGEN_STRONG_INLINE static void run(const Packet v, Packet& f, Packet& e) {
+    const Packet one = pset1<Packet>(1.0);
+    const Packet cst_cephes_SQRTHF = pset1<Packet>(0.70710678118654752440E0);
+    // pfrexp: f in [0.5, 1), e = unbiased exponent as double.
+    f = pfrexp(v, e);
+    // Shift [0.5,1) -> [sqrt(0.5)-1, sqrt(2)-1] with exponent correction:
+    //   if f < sqrt(0.5): f = f + f - 1, e -= 1   (giving f in [0, sqrt(2)-1))
+    //   else:             f = f - 1                (giving f in [sqrt(0.5)-1, 0))
+    Packet mask = pcmp_lt(f, cst_cephes_SQRTHF);
+    Packet tmp = pand(f, mask);
+    f = psub(f, one);
+    e = psub(e, pand(one, mask));
+    f = padd(f, tmp);
+  }
+};
+
+// Specialisation: fast integer-bit-manipulation path (musl-inspired).
+// Requires unpacket_traits<Packet>::integer_packet to be a 64-bit integer packet.
+template <typename Packet>
+struct plog_range_reduce_double<Packet, true> {
+  EIGEN_STRONG_INLINE static void run(const Packet v, Packet& f, Packet& e) {
+    typedef typename unpacket_traits<Packet>::integer_packet PacketI;
+    // 2^-1022: smallest positive normal double.
+    const PacketI cst_min_normal = pset1<PacketI>(static_cast<int64_t>(0x0010000000000000LL));
+    // Lower 52-bit mask (IEEE mantissa field).
+    const PacketI cst_mant_mask = pset1<PacketI>(static_cast<int64_t>(0x000FFFFFFFFFFFFFLL));
+    // Offset = 1.0_bits - sqrt(0.5)_bits.  Adding this to the integer
+    // representation shifts the exponent field so that the [sqrt(0.5), sqrt(2))
+    // half-octave boundary falls on an exact biased-exponent boundary, letting
+    // us extract e with a single right shift.  The constant is:
+    //   0x3FF0000000000000 - 0x3FE6A09E667F3BCD = 0x00095F619980C433
+    const PacketI cst_sqrt_half_offset =
+        pset1<PacketI>(static_cast<int64_t>(0x3FF0000000000000LL - 0x3FE6A09E667F3BCDLL));
+    // IEEE double exponent bias (1023).
+    const PacketI cst_exp_bias = pset1<PacketI>(static_cast<int64_t>(1023));
+    // sqrt(0.5) IEEE bits — used to reconstruct f from biased mantissa.
+    const PacketI cst_half_mant = pset1<PacketI>(static_cast<int64_t>(0x3FE6A09E667F3BCDLL));
+
+    // Reinterpret v as a 64-bit integer vector.
+    PacketI vi = preinterpret<PacketI>(v);
+
+    // Normalise denormals: multiply by 2^52 and correct the exponent by -52.
+    PacketI is_denormal = pcmp_lt(vi, cst_min_normal);
+    // 2^52 via bit pattern: biased exponent = 52 + 1023 = 0x433, mantissa = 0.
+    Packet v_norm = pmul(v, pset1frombits<Packet>(static_cast<uint64_t>(int64_t(52 + 0x3ff) << 52)));
+    vi = pselect(is_denormal, preinterpret<PacketI>(v_norm), vi);
+    PacketI denorm_adj = pand(is_denormal, pset1<PacketI>(static_cast<int64_t>(52)));
+
+    // Bias the integer representation so the exponent field directly encodes
+    // the half-octave index.
+    PacketI vi_biased = padd(vi, cst_sqrt_half_offset);
+    // Extract unbiased exponent: shift out mantissa bits, subtract IEEE bias
+    // and denormal adjustment.
+    PacketI e_int = psub(psub(plogical_shift_right<52>(vi_biased), cst_exp_bias), denorm_adj);
+    // Convert integer exponent to floating-point.
+    e = pcast<PacketI, Packet>(e_int);
+
+    // Reconstruct mantissa in [sqrt(0.5), sqrt(2)) via integer arithmetic.
+    // The integer addition of the masked mantissa bits and the sqrt(0.5) bit
+    // pattern carries into the exponent field, yielding a value in that range.
+    // Then subtract 1 to centre on 0: f in [sqrt(0.5)-1, sqrt(2)-1].
+    f = psub(preinterpret<Packet>(padd(pand(vi_biased, cst_mant_mask), cst_half_mant)), pset1<Packet>(1.0));
+  }
+};
+
+// Core range reduction and polynomial for double logarithm.
+// Input:  v > 0 (zero / negative / inf / nan are handled by the caller).
+// Output: log_mantissa ≈ log(mantissa of v in [sqrt(0.5), sqrt(2))),
+//         e            = unbiased exponent of v as a double.
+// Selects the fast integer path when integer_packet is available, otherwise
+// falls back to pfrexp.
+template <typename Packet>
+EIGEN_STRONG_INLINE void plog_core_double(const Packet v, Packet& log_mantissa, Packet& e) {
+  Packet f;
+  plog_range_reduce_double<Packet, packet_has_integer_packet<Packet>::value>::run(v, f, e);
+  log_mantissa = plog_mantissa_double(f);
+}
+
 /* Returns the base e (2.718...) or base 2 logarithm of x.
 * The argument is separated into its exponent and fractional parts.
 * The logarithm of the fraction in the interval [sqrt(1/2), sqrt(2)],
@@ -152,87 +286,29 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog2_float(const Pac
 */
 template <typename Packet, bool base2>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog_impl_double(const Packet _x) {
-  Packet x = _x;
-
-  const Packet cst_1 = pset1<Packet>(1.0);
-  const Packet cst_neg_half = pset1<Packet>(-0.5);
  const Packet cst_minus_inf = pset1frombits<Packet>(static_cast<uint64_t>(0xfff0000000000000ull));
  const Packet cst_pos_inf = pset1frombits<Packet>(static_cast<uint64_t>(0x7ff0000000000000ull));

-  // Polynomial Coefficients for log(1+x) = x - x**2/2 + x**3 P(x)/Q(x)
-  //                             1/sqrt(2) <= x < sqrt(2)
-  const Packet cst_cephes_SQRTHF = pset1<Packet>(0.70710678118654752440E0);
-  const Packet cst_cephes_log_p0 = pset1<Packet>(1.01875663804580931796E-4);
-  const Packet cst_cephes_log_p1 = pset1<Packet>(4.97494994976747001425E-1);
-  const Packet cst_cephes_log_p2 = pset1<Packet>(4.70579119878881725854E0);
-  const Packet cst_cephes_log_p3 = pset1<Packet>(1.44989225341610930846E1);
-  const Packet cst_cephes_log_p4 = pset1<Packet>(1.79368678507819816313E1);
-  const Packet cst_cephes_log_p5 = pset1<Packet>(7.70838733755885391666E0);
+  Packet log_mantissa, e;
+  plog_core_double(_x, log_mantissa, e);

-  const Packet cst_cephes_log_q0 = pset1<Packet>(1.0);
-  const Packet cst_cephes_log_q1 = pset1<Packet>(1.12873587189167450590E1);
-  const Packet cst_cephes_log_q2 = pset1<Packet>(4.52279145837532221105E1);
-  const Packet cst_cephes_log_q3 = pset1<Packet>(8.29875266912776603211E1);
-  const Packet cst_cephes_log_q4 = pset1<Packet>(7.11544750618563894466E1);
-  const Packet cst_cephes_log_q5 = pset1<Packet>(2.31251620126765340583E1);
-
-  Packet e;
-  // extract significant in the range [0.5,1) and exponent
-  x = pfrexp(x, e);
-
-  // Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2))
-  // and shift by -1. The values are then centered around 0, which improves
-  // the stability of the polynomial evaluation.
-  //   if( x < SQRTHF ) {
-  //     e -= 1;
-  //     x = x + x - 1.0;
-  //   } else { x = x - 1.0; }
-  Packet mask = pcmp_lt(x, cst_cephes_SQRTHF);
-  Packet tmp = pand(x, mask);
-  x = psub(x, cst_1);
-  e = psub(e, pand(cst_1, mask));
-  x = padd(x, tmp);
-
-  Packet x2 = pmul(x, x);
-  Packet x3 = pmul(x2, x);
-
-  // Evaluate the polynomial in factored form for better instruction-level parallelism.
-  // y = x - 0.5*x^2 + x^3 * polevl( x, P, 5 ) / p1evl( x, Q, 5 ) );
-  Packet y, y1, y_;
-  y = pmadd(cst_cephes_log_p0, x, cst_cephes_log_p1);
-  y1 = pmadd(cst_cephes_log_p3, x, cst_cephes_log_p4);
-  y = pmadd(y, x, cst_cephes_log_p2);
-  y1 = pmadd(y1, x, cst_cephes_log_p5);
-  y_ = pmadd(y, x3, y1);
-
-  y = pmadd(cst_cephes_log_q0, x, cst_cephes_log_q1);
-  y1 = pmadd(cst_cephes_log_q3, x, cst_cephes_log_q4);
-  y = pmadd(y, x, cst_cephes_log_q2);
-  y1 = pmadd(y1, x, cst_cephes_log_q5);
-  y = pmadd(y, x3, y1);
-
-  y_ = pmul(y_, x3);
-  y = pdiv(y_, y);
-
-  y = pmadd(cst_neg_half, x2, y);
-  x = padd(x, y);
-
-  // Add the logarithm of the exponent back to the result of the interpolation.
+  // Combine: log(x) = e * ln2 + log(mantissa), or log2(x) = log(mantissa)*log2e + e.
+  Packet x;
  if (base2) {
    const Packet cst_log2e = pset1<Packet>(static_cast<double>(EIGEN_LOG2E));
-    x = pmadd(x, cst_log2e, e);
+    x = pmadd(log_mantissa, cst_log2e, e);
  } else {
    const Packet cst_ln2 = pset1<Packet>(static_cast<double>(EIGEN_LN2));
-    x = pmadd(e, cst_ln2, x);
+    x = pmadd(e, cst_ln2, log_mantissa);
  }

  Packet invalid_mask = pcmp_lt_or_nan(_x, pzero(_x));
  Packet iszero_mask = pcmp_eq(_x, pzero(_x));
  Packet pos_inf_mask = pcmp_eq(_x, cst_pos_inf);
-  // Filter out invalid inputs, i.e.:
-  //  - negative arg will be NAN
-  //  - 0 will be -INF
-  //  - +INF will be +INF
+  // Filter out invalid inputs:
+  //  - negative arg → NAN
+  //  - 0            → -INF
+  //  - +INF         → +INF
  return pselect(iszero_mask, cst_minus_inf, por(pselect(pos_inf_mask, cst_pos_inf, x), invalid_mask));
 }

@@ -286,8 +362,11 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_log1p_float(c
  return result;
 }

-/** \internal \returns log(1 + x) for double precision float.
-    Same direct approach as the float version.
+/** \internal \returns log(1 + x) for double precision.
+    Computes log(1+x) using plog_core_double for the core range reduction and
+    polynomial evaluation.  The rounding error from forming u = fl(1+x) is
+    recovered as dx = x - (u - 1) and folded in as a first-order correction
+    dx/u after the polynomial evaluation.
 */
 template <typename Packet>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_log1p_double(const Packet& x) {
@@ -295,67 +374,31 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_log1p_double(
  const Packet cst_minus_inf = pset1frombits<Packet>(static_cast<uint64_t>(0xfff0000000000000ull));
  const Packet cst_pos_inf = pset1frombits<Packet>(static_cast<uint64_t>(0x7ff0000000000000ull));

+  // u = 1 + x, with rounding.  Recover the lost low bits: dx = x - (u - 1).
  Packet u = padd(one, x);
  Packet dx = psub(x, psub(u, one));

+  // For |x| tiny enough that u rounds to 1, return x directly.
  Packet small_mask = pcmp_eq(u, one);
+  // For u = +inf (x very large), return +inf.
  Packet inf_mask = pcmp_eq(u, cst_pos_inf);

-  const Packet cst_cephes_SQRTHF = pset1<Packet>(0.70710678118654752440E0);
-  Packet e;
-  Packet m = pfrexp(u, e);
-  Packet mask = pcmp_lt(m, cst_cephes_SQRTHF);
-  Packet tmp = pand(m, mask);
-  m = psub(m, one);
-  e = psub(e, pand(one, mask));
-  m = padd(m, tmp);
+  // Core range reduction and polynomial on u.
+  Packet log_u, e;
+  plog_core_double(u, log_u, e);

-  // Same polynomial as plog_double.
-  const Packet cst_neg_half = pset1<Packet>(-0.5);
-  const Packet cst_cephes_log_p0 = pset1<Packet>(1.01875663804580931796E-4);
-  const Packet cst_cephes_log_p1 = pset1<Packet>(4.97494994976747001425E-1);
-  const Packet cst_cephes_log_p2 = pset1<Packet>(4.70579119878881725854E0);
-  const Packet cst_cephes_log_p3 = pset1<Packet>(1.44989225341610930846E1);
-  const Packet cst_cephes_log_p4 = pset1<Packet>(1.79368678507819816313E1);
-  const Packet cst_cephes_log_p5 = pset1<Packet>(7.70838733755885391666E0);
-  const Packet cst_cephes_log_q0 = pset1<Packet>(1.0);
-  const Packet cst_cephes_log_q1 = pset1<Packet>(1.12873587189167450590E1);
-  const Packet cst_cephes_log_q2 = pset1<Packet>(4.52279145837532221105E1);
-  const Packet cst_cephes_log_q3 = pset1<Packet>(8.29875266912776603211E1);
-  const Packet cst_cephes_log_q4 = pset1<Packet>(7.11544750618563894466E1);
-  const Packet cst_cephes_log_q5 = pset1<Packet>(2.31251620126765340583E1);
-
-  Packet m2 = pmul(m, m);
-  Packet m3 = pmul(m2, m);
-
-  Packet y, y1, y_;
-  y = pmadd(cst_cephes_log_p0, m, cst_cephes_log_p1);
-  y1 = pmadd(cst_cephes_log_p3, m, cst_cephes_log_p4);
-  y = pmadd(y, m, cst_cephes_log_p2);
-  y1 = pmadd(y1, m, cst_cephes_log_p5);
-  y_ = pmadd(y, m3, y1);
-
-  y = pmadd(cst_cephes_log_q0, m, cst_cephes_log_q1);
-  y1 = pmadd(cst_cephes_log_q3, m, cst_cephes_log_q4);
-  y = pmadd(y, m, cst_cephes_log_q2);
-  y1 = pmadd(y1, m, cst_cephes_log_q5);
-  y = pmadd(y, m3, y1);
-
-  y_ = pmul(y_, m3);
-  Packet log_m = pdiv(y_, y);
-  log_m = pmadd(cst_neg_half, m2, log_m);
-  log_m = padd(m, log_m);
-
-  // result = e * ln2 + log(m) + dx/u.
+  // result = e * ln2 + log(u) + dx/u.
+  // The dx/u term corrects for the rounding error in u = fl(1+x).
  const Packet cst_ln2 = pset1<Packet>(static_cast<double>(EIGEN_LN2));
-  Packet result = pmadd(e, cst_ln2, padd(log_m, pdiv(dx, u)));
+  Packet result = pmadd(e, cst_ln2, padd(log_u, pdiv(dx, u)));

+  // Handle special cases.
  Packet neg_mask = pcmp_lt(u, pzero(u));
  Packet zero_mask = pcmp_eq(x, pset1<Packet>(-1.0));
  result = pselect(small_mask, x, result);
  result = pselect(inf_mask, cst_pos_inf, result);
  result = pselect(zero_mask, cst_minus_inf, result);
-  result = por(neg_mask, result);
+  result = por(neg_mask, result);  // NaN for x < -1
  return result;
 }

--- a/Eigen/src/Core/arch/Default/GenericPacketMathTrig.h
+++ b/Eigen/src/Core/arch/Default/GenericPacketMathTrig.h
@@ -230,40 +230,31 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet ptan_float(const Pack
  return psincos_float<TrigFunction::Tan>(x);
 }

-// Trigonometric argument reduction for double for inputs smaller than 15.
-// Reduces trigonometric arguments for double inputs where x < 15. Given an argument x and its corresponding quadrant
-// count n, the function computes and returns the reduced argument t such that x = n * pi/2 + t.
+// Pi/2 split into 3 double-precision parts (triple-double).
+// c1 + c2 + c3 = pi/2 to ~159 bits. Computed by Sollya.
+// c1 = RD(pi/2), c2 = RD(pi/2 - c1), c3 = RD(pi/2 - c1 - c2).
 template <typename Packet>
-Packet trig_reduce_small_double(const Packet& x, const Packet& q) {
-  // Pi/2 split into 2 values
-  const Packet cst_pio2_a = pset1<Packet>(-1.570796325802803);
-  const Packet cst_pio2_b = pset1<Packet>(-9.920935184482005e-10);
-
-  Packet t;
-  t = pmadd(cst_pio2_a, q, x);
-  t = pmadd(cst_pio2_b, q, t);
-  return t;
+Packet cst_pio2_1() {
+  return pset1<Packet>(-1.5707963267948965579989817342720925807952880859375);  // -0x1.921fb54442d18p0
+}
+template <typename Packet>
+Packet cst_pio2_2() {
+  return pset1<Packet>(-6.12323399573676603586882014729198302312846062338790e-17);  // -0x1.1a62633145c07p-54
+}
+template <typename Packet>
+Packet cst_pio2_3() {
+  return pset1<Packet>(1.4973849048591698329435081771059920083527504761695190e-33);  //  0x1.f1976b7ed8fbcp-110
 }

-// Trigonometric argument reduction for double for inputs smaller than 1e14.
-// Reduces trigonometric arguments for double inputs where x < 1e14. Given an argument x and its corresponding quadrant
-// count n, the function computes and returns the reduced argument t such that x = n * pi/2 + t.
+// Trigonometric argument reduction for double, small inputs (|x| < small_th).
+// Reduces x to t such that x = q * pi/2 + t, where |t| <= pi/4.
+// Uses a triple-double split of pi/2 with FMA for high accuracy.
 template <typename Packet>
-Packet trig_reduce_medium_double(const Packet& x, const Packet& q_high, const Packet& q_low) {
-  // Pi/2 split into 4 values
-  const Packet cst_pio2_a = pset1<Packet>(-1.570796325802803);
-  const Packet cst_pio2_b = pset1<Packet>(-9.920935184482005e-10);
-  const Packet cst_pio2_c = pset1<Packet>(-6.123234014771656e-17);
-  const Packet cst_pio2_d = pset1<Packet>(1.903488962019325e-25);
-
+Packet trig_reduce_small_double(const Packet& x, const Packet& q) {
  Packet t;
-  t = pmadd(cst_pio2_a, q_high, x);
-  t = pmadd(cst_pio2_a, q_low, t);
-  t = pmadd(cst_pio2_b, q_high, t);
-  t = pmadd(cst_pio2_b, q_low, t);
-  t = pmadd(cst_pio2_c, q_high, t);
-  t = pmadd(cst_pio2_c, q_low, t);
-  t = pmadd(cst_pio2_d, padd(q_low, q_high), t);
+  t = pmadd(cst_pio2_1<Packet>(), q, x);
+  t = pmadd(cst_pio2_2<Packet>(), q, t);
+  t = pmadd(cst_pio2_3<Packet>(), q, t);
  return t;
 }

@@ -284,11 +275,13 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
  // If the argument is bigger than this value, use the non-vectorized std version
  const double huge_th = 1e14;

-  const Packet cst_2oPI = pset1<Packet>(0.63661977236758134307553505349006);  // 2/PI
+  // 2/PI as a double-word: hi + lo = 2/pi to ~107 bits. Computed by Sollya.
+  const Packet cst_2oPI_hi =
+      pset1<Packet>(0.63661977236758138243288840385503135621547698974609375);  // 0x1.45f306dc9c883p-1
+  const Packet cst_2oPI_lo =
+      pset1<Packet>(-3.9357353350364971763790381828183628368294820823718866e-17);  // -0x1.6b01ec5417056p-55
  // Integer Packet constants
  const PacketI cst_one = pset1<PacketI>(ScalarI(1));
-  // Constant for splitting
-  const Packet cst_split = pset1<Packet>(1 << 24);

  Packet x_abs = pabs(x);

@@ -298,76 +291,56 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS

  // TODO Implement huge angle argument reduction
  if (EIGEN_PREDICT_FALSE(predux_any(pcmp_le(pset1<Packet>(small_th), x_abs)))) {
-    Packet q_high = pmul(pfloor(pmul(x_abs, pdiv(cst_2oPI, cst_split))), cst_split);
-    Packet q_low_noround = psub(pmul(x_abs, cst_2oPI), q_high);
-    q_int = pcast<Packet, PacketI>(padd(q_low_noround, pset1<Packet>(0.5)));
-    Packet q_low = pcast<PacketI, Packet>(q_int);
-    s = trig_reduce_medium_double(x_abs, q_high, q_low);
+    // Medium path: use double-word product x * (2/pi) for precise quadrant computation.
+    Packet prod_hi, prod_lo;
+    twoprod(x_abs, cst_2oPI_hi, prod_hi, prod_lo);
+    // Correction for 2/pi truncation: add x * lo(2/pi)
+    prod_lo = pmadd(x_abs, cst_2oPI_lo, prod_lo);
+
+    // Round the double-word (prod_hi, prod_lo) to the nearest integer.
+    Packet q = pround(prod_hi);
+    // Compute exact fractional part to check if rounding was correct.
+    Packet frac = padd(psub(prod_hi, q), prod_lo);
+    // Correct if fractional part crossed +-0.5 boundary.
+    q = padd(q, pand(pcmp_lt(pset1<Packet>(0.5), frac), pset1<Packet>(1.0)));
+    q = padd(q, pand(pcmp_lt(frac, pset1<Packet>(-0.5)), pset1<Packet>(-1.0)));
+
+    q_int = pcast<Packet, PacketI>(q);
+    s = trig_reduce_small_double(x_abs, q);
  } else {
-    Packet qval_noround = pmul(x_abs, cst_2oPI);
+    // Small path: simple reduction with triple-double pi/2 split.
+    Packet qval_noround = pmul(x_abs, cst_2oPI_hi);
    q_int = pcast<Packet, PacketI>(padd(qval_noround, pset1<Packet>(0.5)));
    Packet q = pcast<PacketI, Packet>(q_int);
    s = trig_reduce_small_double(x_abs, q);
  }

-  // All the upcoming approximating polynomials have even exponents
  Packet ss = pmul(s, s);

-  // Padé approximant of cos(x)
-  // Assuring < 1 ULP error on the interval [-pi/4, pi/4]
-  // cos(x) ~= (80737373*x^8 - 13853547000*x^6 + 727718024880*x^4 - 11275015752000*x^2 + 23594700729600)/(147173*x^8 +
-  // 39328920*x^6 + 5772800880*x^4 + 522334612800*x^2 + 23594700729600)
-  // MATLAB code to compute those coefficients:
-  //    syms x;
-  //    cosf = @(x) cos(x);
-  //    pade_cosf = pade(cosf(x), x, 0, 'Order', 8)
-  const Packet cn4 = pset1<Packet>(80737373);
-  const Packet cn3 = pset1<Packet>(-13853547000);
-  const Packet cn2 = pset1<Packet>(727718024880);
-  const Packet cn1 = pset1<Packet>(-11275015752000);
-  const Packet cn0 = pset1<Packet>(23594700729600);  // shared with cd0
-  const Packet cd3 = pset1<Packet>(147173);
-  const Packet cd2 = pset1<Packet>(39328920);
-  const Packet cd1 = pset1<Packet>(5772800880);
-  const Packet cd0 = pset1<Packet>(522334612800);
-  Packet sc1_num = pmadd(ss, cn4, cn3);
-  Packet sc2_num = pmadd(sc1_num, ss, cn2);
-  Packet sc3_num = pmadd(sc2_num, ss, cn1);
-  Packet sc4_num = pmadd(sc3_num, ss, cn0);
-  Packet sc1_denum = pmadd(ss, cd3, cd2);
-  Packet sc2_denum = pmadd(sc1_denum, ss, cd1);
-  Packet sc3_denum = pmadd(sc2_denum, ss, cd0);
-  Packet sc4_denum = pmadd(sc3_denum, ss, cn0);
-  Packet scos = pdiv(sc4_num, sc4_denum);
+  // Minimax polynomial approximation of cos(x) on [-pi/4, pi/4].
+  // cos(x) = 1 + u * P(u), where u = x^2 and P is degree 6 (7 FMAs total).
+  // Coefficients computed by Sollya fpminimax. Max polynomial error ~1.3e-19.
+  Packet scos = pset1<Packet>(-1.1368926065317776472832699312119132152576472805094454088248312473297119140625e-11);
+  scos = pmadd(scos, ss, pset1<Packet>(2.0875905481768720039634091158002593413556269297259859740734100341796875e-09));
+  scos = pmadd(scos, ss, pset1<Packet>(-2.7557315712466412785356544880299711763882442028261721134185791015625e-07));
+  scos = pmadd(scos, ss, pset1<Packet>(2.480158729424286522739599714082459058772656135261058807373046875e-05));
+  scos = pmadd(scos, ss, pset1<Packet>(-1.388888888888178789471350427220386336557567119598388671875e-03));
+  scos = pmadd(scos, ss, pset1<Packet>(4.166666666666664353702032030923874117434024810791015625e-02));
+  scos = pmadd(scos, ss, pset1<Packet>(-0.5));
+  scos = pmadd(scos, ss, pset1<Packet>(1.0));

-  // Padé approximant of sin(x)
-  // Assuring < 1 ULP error on the interval [-pi/4, pi/4]
-  // sin(x) ~= (x*(4585922449*x^8 - 1066023933480*x^6 + 83284044283440*x^4 - 2303682236856000*x^2 +
-  // 15605159573203200))/(45*(1029037*x^8 + 345207016*x^6 + 61570292784*x^4 + 6603948711360*x^2 + 346781323848960))
-  // MATLAB code to compute those coefficients:
-  //    syms x;
-  //    sinf = @(x) sin(x);
-  //    pade_sinf = pade(sinf(x), x, 0, 'Order', 8, 'OrderMode', 'relative')
-  const Packet sn4 = pset1<Packet>(4585922449);
-  const Packet sn3 = pset1<Packet>(-1066023933480);
-  const Packet sn2 = pset1<Packet>(83284044283440);
-  const Packet sn1 = pset1<Packet>(-2303682236856000);
-  const Packet sn0 = pset1<Packet>(15605159573203200);
-  const Packet sd3 = pset1<Packet>(1029037);
-  const Packet sd2 = pset1<Packet>(345207016);
-  const Packet sd1 = pset1<Packet>(61570292784);
-  const Packet sd0_inner = pset1<Packet>(6603948711360);
-  const Packet sd0 = pset1<Packet>(346781323848960);
-  const Packet cst_45 = pset1<Packet>(45);
-  Packet ss1_num = pmadd(ss, sn4, sn3);
-  Packet ss2_num = pmadd(ss1_num, ss, sn2);
-  Packet ss3_num = pmadd(ss2_num, ss, sn1);
-  Packet ss4_num = pmadd(ss3_num, ss, sn0);
-  Packet ss1_denum = pmadd(ss, sd3, sd2);
-  Packet ss2_denum = pmadd(ss1_denum, ss, sd1);
-  Packet ss3_denum = pmadd(ss2_denum, ss, sd0_inner);
-  Packet ss4_denum = pmadd(ss3_denum, ss, sd0);
-  Packet ssin = pdiv(pmul(s, ss4_num), pmul(cst_45, ss4_denum));
+  // Minimax polynomial approximation of sin(x) on [-pi/4, pi/4].
+  // sin(x) = x * (1 + u * R(u)), where u = x^2 and R is degree 5.
+  // Computed as: x + x * u * R(u) (6 FMAs + 1 mul).
+  // Coefficients computed by Sollya fpminimax. Max polynomial error ~1.0e-17.
+  Packet ssin = pset1<Packet>(1.59193066075142890698150587293845624470289834562208852730691432952880859375e-10);
+  ssin = pmadd(ssin, ss, pset1<Packet>(-2.50511517945670206974594627392927126408039839589037001132965087890625e-08));
+  ssin = pmadd(ssin, ss, pset1<Packet>(2.755731622544328228235042954619160582296899519860744476318359375e-06));
+  ssin = pmadd(ssin, ss, pset1<Packet>(-1.9841269837089632013978068858506276228581555187702178955078125e-04));
+  ssin = pmadd(ssin, ss, pset1<Packet>(8.333333333331312264835588621281203813850879669189453125e-03));
+  ssin = pmadd(ssin, ss, pset1<Packet>(-0.1666666666666666574148081281236954964697360992431640625));
+  ssin = pmul(ssin, ss);
+  ssin = pmadd(ssin, s, s);

  Packet poly_mask = preinterpret<Packet>(pcmp_eq(pand(q_int, cst_one), pzero(q_int)));

--- a/Eigen/src/Core/arch/Default/Half.h
+++ b/Eigen/src/Core/arch/Default/Half.h
@@ -45,7 +45,7 @@
 // Eigen with GPU support.
 // Any functions that require `numext::bit_cast` may also not be constexpr,
 // including any native types when setting via raw bit values.
-#if defined(EIGEN_HAS_GPU_FP16) || defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) || defined(EIGEN_HAS_BUILTIN_FLOAT16)
+#if defined(EIGEN_GPUCC) || defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) || defined(EIGEN_HAS_BUILTIN_FLOAT16)
 #define _EIGEN_MAYBE_CONSTEXPR
 #else
 #define _EIGEN_MAYBE_CONSTEXPR constexpr
@@ -121,12 +121,12 @@ namespace half_impl {
 //
 // Making the host side compile phase of hipcc use the same Eigen::half impl, as the gcc compile, resolves
 // this error, and hence the following convoluted #if condition
-#if !defined(EIGEN_HAS_GPU_FP16) || !defined(EIGEN_GPU_COMPILE_PHASE)
+#if !defined(EIGEN_GPUCC) || !defined(EIGEN_GPU_COMPILE_PHASE)

 // Make our own __half_raw definition that is similar to CUDA's.
 struct __half_raw {
  struct construct_from_rep_tag {};
-#if (defined(EIGEN_HAS_GPU_FP16) && !defined(EIGEN_GPU_COMPILE_PHASE))
+#if (defined(EIGEN_GPUCC) && !defined(EIGEN_GPU_COMPILE_PHASE))
  // Eigen::half can be used as the datatype for shared memory declarations (in Eigen and TF)
  // The element type for shared memory cannot have non-trivial constructors
  // and hence the following special casing (which skips the zero-initilization).
@@ -152,16 +152,12 @@ struct __half_raw {
 #endif
 };

-#elif defined(EIGEN_HAS_HIP_FP16)
+#elif defined(EIGEN_HIPCC)
 // HIP GPU compile phase: nothing to do here.
 // HIP fp16 header file has a definition for __half_raw
-#elif defined(EIGEN_HAS_CUDA_FP16)
+#elif defined(EIGEN_CUDACC)

 // CUDA GPU compile phase.
-#if EIGEN_CUDA_SDK_VER < 90000
-// In CUDA < 9.0, __half is the equivalent of CUDA 9's __half_raw
-typedef __half __half_raw;
-#endif  // defined(EIGEN_HAS_CUDA_FP16)

 #elif defined(SYCL_DEVICE_ONLY)
 typedef cl::sycl::half __half_raw;
@@ -175,15 +171,13 @@ struct half_base : public __half_raw {
  EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half_base() {}
  EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half_base(const __half_raw& h) : __half_raw(h) {}

-#if defined(EIGEN_HAS_GPU_FP16)
-#if defined(EIGEN_HAS_HIP_FP16)
+#if defined(EIGEN_GPUCC)
+#if defined(EIGEN_HIPCC)
  EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half_base(const __half& h) { x = __half_as_ushort(h); }
-#elif defined(EIGEN_HAS_CUDA_FP16)
-#if EIGEN_CUDA_SDK_VER >= 90000
+#elif defined(EIGEN_CUDACC)
  EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half_base(const __half& h) : __half_raw(*(__half_raw*)&h) {}
 #endif
 #endif
-#endif
 };

 }  // namespace half_impl
@@ -192,36 +186,29 @@ struct half_base : public __half_raw {
 struct half : public half_impl::half_base {
  // Writing this out as separate #if-else blocks to make the code easier to follow
  // The same applies to most #if-else blocks in this file
-#if !defined(EIGEN_HAS_GPU_FP16) || !defined(EIGEN_GPU_COMPILE_PHASE)
+#if !defined(EIGEN_GPUCC) || !defined(EIGEN_GPU_COMPILE_PHASE)
  // Use the same base class for the following two scenarios
  // * when compiling without GPU support enabled
  // * during host compile phase when compiling with GPU support enabled
  typedef half_impl::__half_raw __half_raw;
-#elif defined(EIGEN_HAS_HIP_FP16)
+#elif defined(EIGEN_HIPCC)
  // Nothing to do here
  // HIP fp16 header file has a definition for __half_raw
-#elif defined(EIGEN_HAS_CUDA_FP16)
-// Note that EIGEN_CUDA_SDK_VER is set to 0 even when compiling with HIP, so
-// (EIGEN_CUDA_SDK_VER < 90000) is true even for HIP!  So keeping this within
-// #if defined(EIGEN_HAS_CUDA_FP16) is needed
-#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
-  typedef half_impl::__half_raw __half_raw;
-#endif
+#elif defined(EIGEN_CUDACC)
+  // Nothing to do here.
 #endif

  EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half() {}

  EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half(const __half_raw& h) : half_impl::half_base(h) {}

-#if defined(EIGEN_HAS_GPU_FP16)
-#if defined(EIGEN_HAS_HIP_FP16)
+#if defined(EIGEN_GPUCC)
+#if defined(EIGEN_HIPCC)
  EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half(const __half& h) : half_impl::half_base(h) {}
-#elif defined(EIGEN_HAS_CUDA_FP16)
-#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000
+#elif defined(EIGEN_CUDACC)
  EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half(const __half& h) : half_impl::half_base(h) {}
 #endif
 #endif
-#endif

 #if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
  explicit EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half(__fp16 b)
@@ -248,7 +235,7 @@ struct half : public half_impl::half_base {
    return half_impl::half_to_float(*this);
  }

-#if defined(EIGEN_HAS_GPU_FP16) && !defined(EIGEN_GPU_COMPILE_PHASE)
+#if defined(EIGEN_GPUCC) && !defined(EIGEN_GPU_COMPILE_PHASE)
  EIGEN_DEVICE_FUNC operator __half() const {
    ::__half_raw hr;
    hr.x = x;
@@ -380,8 +367,7 @@ namespace Eigen {

 namespace half_impl {

-#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
-    (defined(EIGEN_HAS_HIP_FP16) && defined(HIP_DEVICE_COMPILE))
+#if defined(EIGEN_GPU_COMPILE_PHASE)
 // Note: We deliberately do *not* define this to 1 even if we have Arm's native
 // fp16 type since GPU half types are rather different from native CPU half types.
 #define EIGEN_HAS_NATIVE_GPU_FP16
@@ -393,24 +379,10 @@ namespace half_impl {
 // conversion steps back and forth.

 #if defined(EIGEN_HAS_NATIVE_GPU_FP16)
-EIGEN_STRONG_INLINE __device__ half operator+(const half& a, const half& b) {
-#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000
-  return __hadd(::__half(a), ::__half(b));
-#else
-  return __hadd(a, b);
-#endif
-}
+EIGEN_STRONG_INLINE __device__ half operator+(const half& a, const half& b) { return __hadd(::__half(a), ::__half(b)); }
 EIGEN_STRONG_INLINE __device__ half operator*(const half& a, const half& b) { return __hmul(a, b); }
 EIGEN_STRONG_INLINE __device__ half operator-(const half& a, const half& b) { return __hsub(a, b); }
-EIGEN_STRONG_INLINE __device__ half operator/(const half& a, const half& b) {
-#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000
-  return __hdiv(a, b);
-#else
-  float num = __half2float(a);
-  float denom = __half2float(b);
-  return __float2half(num / denom);
-#endif
-}
+EIGEN_STRONG_INLINE __device__ half operator/(const half& a, const half& b) { return __hdiv(a, b); }
 EIGEN_STRONG_INLINE __device__ half operator-(const half& a) { return __hneg(a); }
 EIGEN_STRONG_INLINE __device__ half& operator+=(half& a, const half& b) {
  a = a + b;
@@ -505,7 +477,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator>=(const half& a, const half&
 // We need to provide emulated *host-side* FP16 operators for clang.
 #pragma push_macro("EIGEN_DEVICE_FUNC")
 #undef EIGEN_DEVICE_FUNC
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_HAS_NATIVE_GPU_FP16)
+#if defined(EIGEN_CUDACC) && defined(EIGEN_HAS_NATIVE_GPU_FP16)
 #define EIGEN_DEVICE_FUNC __host__
 #else  // both host and device need emulated ops.
 #define EIGEN_DEVICE_FUNC __host__ __device__
@@ -636,7 +608,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR __half_raw raw_uint
  // because this is constexpr function.
  // Fortunately, since we need to disable EIGEN_CONSTEXPR for GPU anyway, we can get out
  // of this catch22 by having separate bodies for GPU / non GPU
-#if defined(EIGEN_HAS_GPU_FP16)
+#if defined(EIGEN_GPUCC)
  __half_raw h;
  h.x = x;
  return h;
@@ -661,8 +633,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC numext::uint16_t raw_half_as_uint16(const
 }

 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) {
-#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
-    (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+#if defined(EIGEN_GPU_COMPILE_PHASE)
  __half tmp_ff = __float2half(ff);
  return *(__half_raw*)&tmp_ff;

@@ -735,8 +706,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) {
 }

 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h) {
-#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
-    (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+#if defined(EIGEN_GPU_COMPILE_PHASE)
  return __half2float(h);
 #elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) || defined(EIGEN_HAS_BUILTIN_FLOAT16)
  return static_cast<float>(h.x);
@@ -778,8 +748,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool(isinf)(const half& a) {
 #endif
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool(isnan)(const half& a) {
-#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
-    (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+#if defined(EIGEN_GPU_COMPILE_PHASE)
  return __hisnan(a);
 #elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) || defined(EIGEN_HAS_BUILTIN_FLOAT16)
  return (numext::bit_cast<numext::uint16_t>(a.x) & 0x7fff) > 0x7c00;
@@ -810,16 +779,14 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half abs(const half& a) {
 #endif
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp(const half& a) {
-#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
-    defined(EIGEN_HIP_DEVICE_COMPILE)
+#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE)
  return half(hexp(a));
 #else
  return half(::expf(float(a)));
 #endif
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp2(const half& a) {
-#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
-    defined(EIGEN_HIP_DEVICE_COMPILE)
+#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE)
  return half(hexp2(a));
 #else
  return half(::exp2f(float(a)));
@@ -827,9 +794,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp2(const half& a) {
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half expm1(const half& a) { return half(numext::expm1(float(a))); }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log(const half& a) {
-#if (defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDA_SDK_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && \
-     EIGEN_CUDA_ARCH >= 530) ||                                                                 \
-    (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+#if defined(EIGEN_GPU_COMPILE_PHASE)
  return half(hlog(a));
 #else
  return half(::logf(float(a)));
@@ -842,8 +807,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log2(const half& a) {
 }

 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sqrt(const half& a) {
-#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
-    defined(EIGEN_HIP_DEVICE_COMPILE)
+#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE)
  return half(hsqrt(a));
 #else
  return half(::sqrtf(float(a)));
@@ -864,16 +828,14 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half acos(const half& a) { return half(::a
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half atan(const half& a) { return half(::atanf(float(a))); }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half atanh(const half& a) { return half(::atanhf(float(a))); }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) {
-#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \
-    defined(EIGEN_HIP_DEVICE_COMPILE)
+#if (defined(EIGEN_CUDA_ARCH)) || defined(EIGEN_HIP_DEVICE_COMPILE)
  return half(hfloor(a));
 #else
  return half(::floorf(float(a)));
 #endif
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) {
-#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \
-    defined(EIGEN_HIP_DEVICE_COMPILE)
+#if (defined(EIGEN_CUDA_ARCH)) || defined(EIGEN_HIP_DEVICE_COMPILE)
  return half(hceil(a));
 #else
  return half(::ceilf(float(a)));
@@ -1007,20 +969,12 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half madd<Eigen::half>(const Eigen:
 }  // namespace numext
 }  // namespace Eigen

-// Add the missing shfl* intrinsics.
-// The __shfl* functions are only valid on HIP or _CUDA_ARCH_ >= 300.
-//   CUDA defines them for (__CUDA_ARCH__ >= 300 || !defined(__CUDA_ARCH__))
-//
-// HIP and CUDA prior to SDK 9.0 define
-//    __shfl, __shfl_up, __shfl_down, __shfl_xor for int and float
-// CUDA since 9.0 deprecates those and instead defines
-//    __shfl_sync, __shfl_up_sync, __shfl_down_sync, __shfl_xor_sync,
-//    with native support for __half and __nv_bfloat16
-//
+// Warp shuffle overloads for Eigen::half.
+// CUDA uses __shfl_*_sync (with mask); HIP uses __shfl_* (no mask).
 // Note that the following are __device__ - only functions.
-#if (defined(EIGEN_CUDACC) && (!defined(EIGEN_CUDA_ARCH) || EIGEN_CUDA_ARCH >= 300)) || defined(EIGEN_HIPCC)
+#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)

-#if defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDA_SDK_VER >= 90000
+#if defined(EIGEN_CUDACC)

 __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_sync(unsigned mask, Eigen::half var, int srcLane,
                                                       int width = warpSize) {
@@ -1046,7 +1000,7 @@ __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor_sync(unsigned mask, Eigen:
  return static_cast<Eigen::half>(__shfl_xor_sync(mask, h, laneMask, width));
 }

-#else  // HIP or CUDA SDK < 9.0
+#else  // HIP

 __device__ EIGEN_STRONG_INLINE Eigen::half __shfl(Eigen::half var, int srcLane, int width = warpSize) {
  const int ivar = static_cast<int>(Eigen::numext::bit_cast<Eigen::numext::uint16_t>(var));
@@ -1072,7 +1026,7 @@ __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneM
 #endif  // __shfl*

 // ldg() has an overload for __half_raw, but we also need one for Eigen::half.
-#if (defined(EIGEN_CUDACC) && (!defined(EIGEN_CUDA_ARCH) || EIGEN_CUDA_ARCH >= 350)) || defined(EIGEN_HIPCC)
+#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
 EIGEN_STRONG_INLINE __device__ Eigen::half __ldg(const Eigen::half* ptr) {
  return Eigen::half_impl::raw_uint16_to_half(__ldg(reinterpret_cast<const Eigen::numext::uint16_t*>(ptr)));
 }
@@ -1095,8 +1049,7 @@ namespace internal {
 template <>
 struct cast_impl<float, half> {
  EIGEN_DEVICE_FUNC static inline half run(const float& a) {
-#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
-    (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+#if defined(EIGEN_GPU_COMPILE_PHASE)
    return __float2half(a);
 #else
    return half(a);
@@ -1107,8 +1060,7 @@ struct cast_impl<float, half> {
 template <>
 struct cast_impl<int, half> {
  EIGEN_DEVICE_FUNC static inline half run(const int& a) {
-#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
-    (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+#if defined(EIGEN_GPU_COMPILE_PHASE)
    return __float2half(static_cast<float>(a));
 #else
    return half(static_cast<float>(a));
@@ -1119,8 +1071,7 @@ struct cast_impl<int, half> {
 template <>
 struct cast_impl<half, float> {
  EIGEN_DEVICE_FUNC static inline float run(const half& a) {
-#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
-    (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+#if defined(EIGEN_GPU_COMPILE_PHASE)
    return __half2float(a);
 #else
    return static_cast<float>(a);
--- a/Eigen/src/Core/arch/GPU/PacketMath.h
+++ b/Eigen/src/Core/arch/GPU/PacketMath.h
@@ -17,19 +17,8 @@ namespace Eigen {

 namespace internal {

-// Read-only data cached load available.
-#if defined(EIGEN_HIP_DEVICE_COMPILE) || (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350)
-#define EIGEN_GPU_HAS_LDG 1
-#endif
-
-// FP16 math available.
-#if (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530)
-#define EIGEN_CUDA_HAS_FP16_ARITHMETIC 1
-#endif
-
-#if defined(EIGEN_HIP_DEVICE_COMPILE) || defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
-#define EIGEN_GPU_HAS_FP16_ARITHMETIC 1
-#endif
+// Read-only data cached load (__ldg) and native FP16 arithmetic are available
+// on all supported GPU architectures (sm_70+ for CUDA, GFX906+ for HIP).

 // We need to distinguish ‘clang as the CUDA compiler’ from ‘clang as the host compiler,
 // invoked by NVCC’ (e.g. on MacOS). The former needs to see both host and device implementation
@@ -56,92 +45,84 @@ struct is_arithmetic<double2> {

 template <>
 struct packet_traits<float> : default_packet_traits {
-  typedef float4 type;
-  typedef float4 half;
-  enum {
-    Vectorizable = 1,
-    AlignedOnScalar = 1,
-    size = 4,
+  using type = float4;
+  using half = float4;
+  static constexpr int Vectorizable = 1;
+  static constexpr int AlignedOnScalar = 1;
+  static constexpr int size = 4;

-    HasDiv = 1,
-    HasSin = 0,
-    HasCos = 0,
-    HasLog = 1,
-    HasExp = 1,
-    HasSqrt = 1,
-    HasRsqrt = 1,
-    HasLGamma = 1,
-    HasDiGamma = 1,
-    HasZeta = 1,
-    HasPolygamma = 1,
-    HasErf = 1,
-    HasErfc = 1,
-    HasNdtri = 1,
-    HasBessel = 1,
-    HasIGamma = 1,
-    HasIGammaDerA = 1,
-    HasGammaSampleDerAlpha = 1,
-    HasIGammac = 1,
-    HasBetaInc = 1,
+  static constexpr int HasDiv = 1;
+  static constexpr int HasSin = 0;
+  static constexpr int HasCos = 0;
+  static constexpr int HasLog = 1;
+  static constexpr int HasExp = 1;
+  static constexpr int HasSqrt = 1;
+  static constexpr int HasRsqrt = 1;
+  static constexpr int HasLGamma = 1;
+  static constexpr int HasDiGamma = 1;
+  static constexpr int HasZeta = 1;
+  static constexpr int HasPolygamma = 1;
+  static constexpr int HasErf = 1;
+  static constexpr int HasErfc = 1;
+  static constexpr int HasNdtri = 1;
+  static constexpr int HasBessel = 1;
+  static constexpr int HasIGamma = 1;
+  static constexpr int HasIGammaDerA = 1;
+  static constexpr int HasGammaSampleDerAlpha = 1;
+  static constexpr int HasIGammac = 1;
+  static constexpr int HasBetaInc = 1;

-    HasFloor = 1,
-    HasCmp = EIGEN_HAS_GPU_DEVICE_FUNCTIONS
-  };
+  static constexpr int HasFloor = 1;
+  static constexpr int HasCmp = EIGEN_HAS_GPU_DEVICE_FUNCTIONS;
 };

 template <>
 struct packet_traits<double> : default_packet_traits {
-  typedef double2 type;
-  typedef double2 half;
-  enum {
-    Vectorizable = 1,
-    AlignedOnScalar = 1,
-    size = 2,
+  using type = double2;
+  using half = double2;
+  static constexpr int Vectorizable = 1;
+  static constexpr int AlignedOnScalar = 1;
+  static constexpr int size = 2;

-    HasDiv = 1,
-    HasLog = 1,
-    HasExp = 1,
-    HasSqrt = 1,
-    HasRsqrt = 1,
-    HasLGamma = 1,
-    HasDiGamma = 1,
-    HasZeta = 1,
-    HasPolygamma = 1,
-    HasErf = 1,
-    HasErfc = 1,
-    HasNdtri = 1,
-    HasBessel = 1,
-    HasIGamma = 1,
-    HasIGammaDerA = 1,
-    HasGammaSampleDerAlpha = 1,
-    HasIGammac = 1,
-    HasBetaInc = 1,
-  };
+  static constexpr int HasDiv = 1;
+  static constexpr int HasLog = 1;
+  static constexpr int HasExp = 1;
+  static constexpr int HasSqrt = 1;
+  static constexpr int HasRsqrt = 1;
+  static constexpr int HasLGamma = 1;
+  static constexpr int HasDiGamma = 1;
+  static constexpr int HasZeta = 1;
+  static constexpr int HasPolygamma = 1;
+  static constexpr int HasErf = 1;
+  static constexpr int HasErfc = 1;
+  static constexpr int HasNdtri = 1;
+  static constexpr int HasBessel = 1;
+  static constexpr int HasIGamma = 1;
+  static constexpr int HasIGammaDerA = 1;
+  static constexpr int HasGammaSampleDerAlpha = 1;
+  static constexpr int HasIGammac = 1;
+  static constexpr int HasBetaInc = 1;
 };

 template <>
 struct unpacket_traits<float4> {
-  typedef float type;
-  enum {
-    size = 4,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
-  typedef float4 half;
+  using type = float;
+  static constexpr int size = 4;
+  static constexpr int alignment = Aligned16;
+  static constexpr bool vectorizable = true;
+  static constexpr bool masked_load_available = false;
+  static constexpr bool masked_store_available = false;
+  using half = float4;
 };
 template <>
 struct unpacket_traits<double2> {
-  typedef double type;
-  enum {
-    size = 2,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
-  typedef double2 half;
+  using type = double;
+  static constexpr int size = 2;
+  static constexpr int alignment = Aligned16;
+  static constexpr bool vectorizable = true;
+  static constexpr bool masked_load_available = false;
+  static constexpr bool masked_store_available = false;
+  using half = double2;
 };

 template <>
@@ -403,7 +384,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const dou

 template <>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const float* from) {
-#if defined(EIGEN_GPU_HAS_LDG)
+#if defined(EIGEN_GPU_COMPILE_PHASE)
  return __ldg(reinterpret_cast<const float4*>(from));
 #else
  return make_float4(from[0], from[1], from[2], from[3]);
@@ -411,7 +392,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const fl
 }
 template <>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const double* from) {
-#if defined(EIGEN_GPU_HAS_LDG)
+#if defined(EIGEN_GPU_COMPILE_PHASE)
  return __ldg(reinterpret_cast<const double2*>(from));
 #else
  return make_double2(from[0], from[1]);
@@ -420,7 +401,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const

 template <>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Unaligned>(const float* from) {
-#if defined(EIGEN_GPU_HAS_LDG)
+#if defined(EIGEN_GPU_COMPILE_PHASE)
  return make_float4(__ldg(from + 0), __ldg(from + 1), __ldg(from + 2), __ldg(from + 3));
 #else
  return make_float4(from[0], from[1], from[2], from[3]);
@@ -428,7 +409,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Unaligned>(const
 }
 template <>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Unaligned>(const double* from) {
-#if defined(EIGEN_GPU_HAS_LDG)
+#if defined(EIGEN_GPU_COMPILE_PHASE)
  return make_double2(__ldg(from + 0), __ldg(from + 1));
 #else
  return make_double2(from[0], from[1]);
@@ -591,23 +572,20 @@ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<double2, 2>& kernel) {

 #endif  // defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)

-// Half-packet functions are not available on the host for CUDA 9.0-9.2, only
-// on device. There is no benefit to using them on the host anyways, since they are
-// emulated.
-#if (defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)) && defined(EIGEN_GPU_COMPILE_PHASE)
+// Half-packet functions are only available in GPU device compilation — they use
+// intrinsics (__half2, etc.) that have no host-side benefit.
+#if defined(EIGEN_GPU_COMPILE_PHASE)

-typedef ulonglong2 Packet4h2;
+using Packet4h2 = ulonglong2;
 template <>
 struct unpacket_traits<Packet4h2> {
-  typedef Eigen::half type;
-  enum {
-    size = 8,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
-  typedef Packet4h2 half;
+  using type = Eigen::half;
+  static constexpr int size = 8;
+  static constexpr int alignment = Aligned16;
+  static constexpr bool vectorizable = true;
+  static constexpr bool masked_load_available = false;
+  static constexpr bool masked_store_available = false;
+  using half = Packet4h2;
 };
 template <>
 struct is_arithmetic<Packet4h2> {
@@ -616,15 +594,13 @@ struct is_arithmetic<Packet4h2> {

 template <>
 struct unpacket_traits<half2> {
-  typedef Eigen::half type;
-  enum {
-    size = 2,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
-  typedef half2 half;
+  using type = Eigen::half;
+  static constexpr int size = 2;
+  static constexpr int alignment = Aligned16;
+  static constexpr bool vectorizable = true;
+  static constexpr bool masked_load_available = false;
+  static constexpr bool masked_store_available = false;
+  using half = half2;
 };
 template <>
 struct is_arithmetic<half2> {
@@ -633,23 +609,21 @@ struct is_arithmetic<half2> {

 template <>
 struct packet_traits<Eigen::half> : default_packet_traits {
-  typedef Packet4h2 type;
-  typedef Packet4h2 half;
-  enum {
-    Vectorizable = 1,
-    AlignedOnScalar = 1,
-    size = 8,
-    HasAdd = 1,
-    HasSub = 1,
-    HasMul = 1,
-    HasDiv = 1,
-    HasSqrt = 1,
-    HasRsqrt = 1,
-    HasExp = 1,
-    HasExpm1 = 1,
-    HasLog = 1,
-    HasLog1p = 1
-  };
+  using type = Packet4h2;
+  using half = Packet4h2;
+  static constexpr int Vectorizable = 1;
+  static constexpr int AlignedOnScalar = 1;
+  static constexpr int size = 8;
+  static constexpr int HasAdd = 1;
+  static constexpr int HasSub = 1;
+  static constexpr int HasMul = 1;
+  static constexpr int HasDiv = 1;
+  static constexpr int HasSqrt = 1;
+  static constexpr int HasRsqrt = 1;
+  static constexpr int HasExp = 1;
+  static constexpr int HasExpm1 = 1;
+  static constexpr int HasLog = 1;
+  static constexpr int HasLog1p = 1;
 };

 template <>
@@ -690,7 +664,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const half2&
 }

 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_aligned(const Eigen::half* from) {
-#if defined(EIGEN_GPU_HAS_LDG)
+#if defined(EIGEN_GPU_COMPILE_PHASE)
  // Input is guaranteed to be properly aligned.
  return __ldg(reinterpret_cast<const half2*>(from));
 #else
@@ -699,7 +673,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_aligned(const Eigen::half*
 }

 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_unaligned(const Eigen::half* from) {
-#if defined(EIGEN_GPU_HAS_LDG)
+#if defined(EIGEN_GPU_COMPILE_PHASE)
  return __halves2half2(__ldg(from + 0), __ldg(from + 1));
 #else
  return __halves2half2(*(from + 0), *(from + 1));
@@ -745,12 +719,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<half2, 2>& ker
 }

 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset(const Eigen::half& a) {
-#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
  return __halves2half2(a, __hadd(a, __float2half(1.0f)));
-#else
-  float f = __half2float(a) + 1.0f;
-  return __halves2half2(a, __float2half(f));
-#endif
 }

 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pselect(const half2& mask, const half2& a, const half2& b) {
@@ -837,89 +806,21 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pandnot(const half2& a, const half2&
  return __halves2half2(result1, result2);
 }

-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd(const half2& a, const half2& b) {
-#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
-  return __hadd2(a, b);
-#else
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  float b1 = __low2float(b);
-  float b2 = __high2float(b);
-  float r1 = a1 + b1;
-  float r2 = a2 + b2;
-  return __floats2half2_rn(r1, r2);
-#endif
-}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd(const half2& a, const half2& b) { return __hadd2(a, b); }

-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub(const half2& a, const half2& b) {
-#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
-  return __hsub2(a, b);
-#else
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  float b1 = __low2float(b);
-  float b2 = __high2float(b);
-  float r1 = a1 - b1;
-  float r2 = a2 - b2;
-  return __floats2half2_rn(r1, r2);
-#endif
-}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub(const half2& a, const half2& b) { return __hsub2(a, b); }

-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
-#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
-  return __hneg2(a);
-#else
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  return __floats2half2_rn(-a1, -a2);
-#endif
-}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(const half2& a) { return __hneg2(a); }

 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }

-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul(const half2& a, const half2& b) {
-#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
-  return __hmul2(a, b);
-#else
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  float b1 = __low2float(b);
-  float b2 = __high2float(b);
-  float r1 = a1 * b1;
-  float r2 = a2 * b2;
-  return __floats2half2_rn(r1, r2);
-#endif
-}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul(const half2& a, const half2& b) { return __hmul2(a, b); }

 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd(const half2& a, const half2& b, const half2& c) {
-#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
  return __hfma2(a, b, c);
-#else
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  float b1 = __low2float(b);
-  float b2 = __high2float(b);
-  float c1 = __low2float(c);
-  float c2 = __high2float(c);
-  float r1 = a1 * b1 + c1;
-  float r2 = a2 * b2 + c2;
-  return __floats2half2_rn(r1, r2);
-#endif
 }

-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv(const half2& a, const half2& b) {
-#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
-  return __h2div(a, b);
-#else
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  float b1 = __low2float(b);
-  float b2 = __high2float(b);
-  float r1 = a1 / b1;
-  float r2 = a2 / b2;
-  return __floats2half2_rn(r1, r2);
-#endif
-}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv(const half2& a, const half2& b) { return __h2div(a, b); }

 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin(const half2& a, const half2& b) {
  float a1 = __low2float(a);
@@ -942,47 +843,23 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax(const half2& a, const half2& b)
 }

 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux(const half2& a) {
-#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
  return __hadd(__low2half(a), __high2half(a));
-#else
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  return Eigen::half(__float2half(a1 + a2));
-#endif
 }

 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max(const half2& a) {
-#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
  __half first = __low2half(a);
  __half second = __high2half(a);
  return __hgt(first, second) ? first : second;
-#else
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  return a1 > a2 ? __low2half(a) : __high2half(a);
-#endif
 }

 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min(const half2& a) {
-#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
  __half first = __low2half(a);
  __half second = __high2half(a);
  return __hlt(first, second) ? first : second;
-#else
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  return a1 < a2 ? __low2half(a) : __high2half(a);
-#endif
 }

 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_mul(const half2& a) {
-#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
  return __hmul(__low2half(a), __high2half(a));
-#else
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  return Eigen::half(__float2half(a1 * a2));
-#endif
 }

 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog1p(const half2& a) {
@@ -1001,8 +878,6 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexpm1(const half2& a) {
  return __floats2half2_rn(r1, r2);
 }

-#if (EIGEN_CUDA_SDK_VER >= 80000 && defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)) || defined(EIGEN_HIP_DEVICE_COMPILE)
-
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog(const half2& a) { return h2log(a); }

 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexp(const half2& a) { return h2exp(a); }
@@ -1010,41 +885,6 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexp(const half2& a) { return h2exp(
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psqrt(const half2& a) { return h2sqrt(a); }

 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 prsqrt(const half2& a) { return h2rsqrt(a); }
-
-#else
-
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog(const half2& a) {
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  float r1 = logf(a1);
-  float r2 = logf(a2);
-  return __floats2half2_rn(r1, r2);
-}
-
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexp(const half2& a) {
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  float r1 = expf(a1);
-  float r2 = expf(a2);
-  return __floats2half2_rn(r1, r2);
-}
-
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psqrt(const half2& a) {
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  float r1 = sqrtf(a1);
-  float r2 = sqrtf(a2);
-  return __floats2half2_rn(r1, r2);
-}
-
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 prsqrt(const half2& a) {
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  float r1 = rsqrtf(a1);
-  float r2 = rsqrtf(a2);
-  return __floats2half2_rn(r1, r2);
-}
-#endif
 }  // namespace

 template <>
@@ -1091,19 +931,17 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to,

 template <>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4h2 ploadt_ro<Packet4h2, Aligned>(const Eigen::half* from) {
-#if defined(EIGEN_GPU_HAS_LDG)
  Packet4h2 r;
+#if defined(EIGEN_GPU_COMPILE_PHASE)
  r = __ldg(reinterpret_cast<const Packet4h2*>(from));
-  return r;
 #else
-  Packet4h2 r;
  half2* r_alias = reinterpret_cast<half2*>(&r);
  r_alias[0] = ploadt_ro_aligned(from + 0);
  r_alias[1] = ploadt_ro_aligned(from + 2);
  r_alias[2] = ploadt_ro_aligned(from + 4);
  r_alias[3] = ploadt_ro_aligned(from + 6);
-  return r;
 #endif
+  return r;
 }

 template <>
@@ -1272,7 +1110,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 plset<Packet4h2>(const Eigen::ha
  p_alias[2] = __halves2half2(__hadd(a, __float2half(4.0f)), __hadd(a, __float2half(5.0f)));
  p_alias[3] = __halves2half2(__hadd(a, __float2half(6.0f)), __hadd(a, __float2half(7.0f)));
  return r;
-#elif defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
+#elif defined(EIGEN_CUDA_ARCH)
  Packet4h2 r;
  half2* r_alias = reinterpret_cast<half2*>(&r);

@@ -1290,16 +1128,6 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 plset<Packet4h2>(const Eigen::ha
  r_alias[3] = plset(__high2half(c));

  return r;
-
-#else
-  float f = __half2float(a);
-  Packet4h2 r;
-  half2* p_alias = reinterpret_cast<half2*>(&r);
-  p_alias[0] = __halves2half2(a, __float2half(f + 1.0f));
-  p_alias[1] = __halves2half2(__float2half(f + 2.0f), __float2half(f + 3.0f));
-  p_alias[2] = __halves2half2(__float2half(f + 4.0f), __float2half(f + 5.0f));
-  p_alias[3] = __halves2half2(__float2half(f + 6.0f), __float2half(f + 7.0f));
-  return r;
 #endif
 }

@@ -1533,7 +1361,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max<Packet4h2>(const Pa
  half2 m1 = __halves2half2(predux_max(a_alias[2]), predux_max(a_alias[3]));
  __half first = predux_max(m0);
  __half second = predux_max(m1);
-#if defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
+#if defined(EIGEN_CUDA_ARCH)
  return (__hgt(first, second) ? first : second);
 #else
  float ffirst = __half2float(first);
@@ -1549,7 +1377,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min<Packet4h2>(const Pa
  half2 m1 = __halves2half2(predux_min(a_alias[2]), predux_min(a_alias[3]));
  __half first = predux_min(m0);
  __half second = predux_min(m1);
-#if defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
+#if defined(EIGEN_CUDA_ARCH)
  return (__hlt(first, second) ? first : second);
 #else
  float ffirst = __half2float(first);
@@ -1641,47 +1469,17 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 prsqrt<Packet4h2>(const Packet4h
 // the implementation of GPU half reduction.
 template <>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
-#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
  return __hadd2(a, b);
-#else
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  float b1 = __low2float(b);
-  float b2 = __high2float(b);
-  float r1 = a1 + b1;
-  float r2 = a2 + b2;
-  return __floats2half2_rn(r1, r2);
-#endif
 }

 template <>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) {
-#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
  return __hmul2(a, b);
-#else
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  float b1 = __low2float(b);
-  float b2 = __high2float(b);
-  float r1 = a1 * b1;
-  float r2 = a2 * b2;
-  return __floats2half2_rn(r1, r2);
-#endif
 }

 template <>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) {
-#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
  return __h2div(a, b);
-#else
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  float b1 = __low2float(b);
-  float b2 = __high2float(b);
-  float r1 = a1 / b1;
-  float r2 = a2 / b2;
-  return __floats2half2_rn(r1, r2);
-#endif
 }

 template <>
@@ -1706,11 +1504,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, const ha
  return __halves2half2(r1, r2);
 }

-#endif  // (defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)) && defined(EIGEN_GPU_COMPILE_PHASE)
-
-#undef EIGEN_GPU_HAS_LDG
-#undef EIGEN_CUDA_HAS_FP16_ARITHMETIC
-#undef EIGEN_GPU_HAS_FP16_ARITHMETIC
+#endif  // defined(EIGEN_GPU_COMPILE_PHASE)

 }  // end namespace internal

--- a/Eigen/src/Core/arch/GPU/TypeCasting.h
+++ b/Eigen/src/Core/arch/GPU/TypeCasting.h
@@ -17,8 +17,7 @@ namespace Eigen {

 namespace internal {

-#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
-    (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+#if defined(EIGEN_GPU_COMPILE_PHASE)

 template <>
 struct type_casting_traits<Eigen::half, float> {
--- a/Eigen/src/Core/util/ConfigureVectorization.h
+++ b/Eigen/src/Core/util/ConfigureVectorization.h
@@ -541,12 +541,6 @@ extern "C" {
 #if defined EIGEN_CUDACC
 #define EIGEN_VECTORIZE_GPU
 #include <vector_types.h>
-#if EIGEN_CUDA_SDK_VER >= 70500
-#define EIGEN_HAS_CUDA_FP16
-#endif
-#endif
-
-#if defined(EIGEN_HAS_CUDA_FP16)
 #include <cuda_runtime_api.h>
 #include <cuda_fp16.h>
 #endif
@@ -554,7 +548,6 @@ extern "C" {
 #if defined(EIGEN_HIPCC)
 #define EIGEN_VECTORIZE_GPU
 #include <hip/hip_vector_types.h>
-#define EIGEN_HAS_HIP_FP16
 #include <hip/hip_fp16.h>
 #define EIGEN_HAS_HIP_BF16
 #include <hip/hip_bfloat16.h>
--- a/Eigen/src/Core/util/DisableStupidWarnings.h
+++ b/Eigen/src/Core/util/DisableStupidWarnings.h
@@ -84,8 +84,7 @@
 #endif

 #if defined __NVCC__ && defined __CUDACC__
-// MSVC 14.16 (required by CUDA 9.*) does not support the _Pragma keyword, so
-// we instead use Microsoft's __pragma extension.
+// MSVC does not support the _Pragma keyword, so we use Microsoft's __pragma extension.
 #if defined _MSC_VER
 #define EIGEN_MAKE_PRAGMA(X) __pragma(#X)
 #else
--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h
@@ -148,13 +148,8 @@
 #endif

 #if defined(__NVCC__)
-#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
+// CUDA 11.4+ always defines __CUDACC_VER_MAJOR__.
 #define EIGEN_COMP_NVCC ((__CUDACC_VER_MAJOR__ * 10000) + (__CUDACC_VER_MINOR__ * 100))
-#elif defined(__CUDACC_VER__)
-#define EIGEN_COMP_NVCC __CUDACC_VER__
-#else
-#error "NVCC did not define compiler version."
-#endif
 #else
 #define EIGEN_COMP_NVCC 0
 #endif
@@ -575,6 +570,10 @@
 #define EIGEN_CUDA_SDK_VER 0
 #endif

+#if defined(EIGEN_CUDACC) && EIGEN_CUDA_SDK_VER > 0 && EIGEN_CUDA_SDK_VER < 110400
+#error "Eigen requires CUDA 11.4 or later."
+#endif
+
 #if defined(__HIPCC__) && !defined(EIGEN_NO_HIP) && !defined(__SYCL_DEVICE_ONLY__)
 // Means the compiler is HIPCC (analogous to EIGEN_CUDACC, but for HIP)
 #define EIGEN_HIPCC __HIPCC__
@@ -584,22 +583,20 @@
 // ++ host_defines.h which contains the defines for the __host__ and __device__ macros
 #include <hip/hip_runtime.h>

+// Eigen requires ROCm/HIP >= 5.6 (GFX906 minimum architecture).
+// This floor exists to allow simplifying shared CUDA/HIP preprocessor guards —
+// all __HIP_ARCH_HAS_WARP_SHUFFLE__, __HIP_ARCH_HAS_FP16__, etc. are always true on GFX906+.
+#if defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR < 5 || (HIP_VERSION_MAJOR == 5 && HIP_VERSION_MINOR < 6))
+#error "Eigen requires ROCm/HIP >= 5.6."
+#endif
+
 #if defined(__HIP_DEVICE_COMPILE__) && !defined(__SYCL_DEVICE_ONLY__)
 // analogous to EIGEN_CUDA_ARCH, but for HIP
 #define EIGEN_HIP_DEVICE_COMPILE __HIP_DEVICE_COMPILE__
 #endif

-// For HIP (ROCm 3.5 and higher), we need to explicitly set the launch_bounds attribute
-// value to 1024. The compiler assigns a default value of 256 when the attribute is not
-// specified. This results in failures on the HIP platform, for cases when a GPU kernel
-// without an explicit launch_bounds attribute is called with a threads_per_block value
-// greater than 256.
-//
-// This is a regression in functionality and is expected to be fixed within the next
-// couple of ROCm releases (compiler will go back to using 1024 value as the default)
-//
-// In the meantime, we will use a "only enabled for HIP" macro to set the launch_bounds
-// attribute.
+// HIP compilers default to launch_bounds(256), which causes failures when kernels
+// are called with more than 256 threads per block. Explicitly set to 1024 for HIP.

 #define EIGEN_HIP_LAUNCH_BOUNDS_1024 __launch_bounds__(1024)

--- a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h
+++ b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h
@@ -25,7 +25,7 @@ namespace internal {
 template <typename SolverType, int Size, bool IsComplex>
 struct direct_selfadjoint_eigenvalues;

-template <typename MatrixType, typename DiagType, typename SubDiagType>
+template <bool PerBlockScaling, typename MatrixType, typename DiagType, typename SubDiagType>
 EIGEN_DEVICE_FUNC ComputationInfo computeFromTridiagonal_impl(DiagType& diag, SubDiagType& subdiag,
                                                              const Index maxIterations, bool computeEigenvectors,
                                                              MatrixType& eivec);
@@ -438,7 +438,7 @@ EIGEN_DEVICE_FUNC SelfAdjointEigenSolver<MatrixType>& SelfAdjointEigenSolver<Mat
    m_eivec = matrix;
    m_eivalues.coeffRef(0, 0) = numext::real(m_eivec.coeff(0, 0));
    if (computeEigenvectors) m_eivec.setOnes(n, n);
-    m_info = Success;
+    m_info = (numext::isfinite)(m_eivalues.coeffRef(0, 0)) ? Success : NoConvergence;
    m_isInitialized = true;
    m_eigenvectorsOk = computeEigenvectors;
    return *this;
@@ -448,18 +448,29 @@ EIGEN_DEVICE_FUNC SelfAdjointEigenSolver<MatrixType>& SelfAdjointEigenSolver<Mat
  RealVectorType& diag = m_eivalues;
  EigenvectorsType& mat = m_eivec;

-  // map the matrix coefficients to [-1:1] to avoid over- and underflow.
+  // Scale the matrix to [-1:1] to avoid overflow/underflow during tridiagonalization
+  // and subsequent QR iteration. This uniform scaling ensures the tridiagonal output is
+  // well-conditioned. Note: for block-diagonal matrices with widely separated scales, this
+  // can underflow small blocks. Users with such matrices should tridiagonalize separately
+  // and call computeFromTridiagonal(), which uses per-block scaling.
  mat = matrix.template triangularView<Lower>();
  RealScalar scale = mat.cwiseAbs().maxCoeff();
+  if (!(numext::isfinite)(scale)) {
+    // Input contains Inf or NaN.
+    m_info = NoConvergence;
+    m_isInitialized = true;
+    m_eigenvectorsOk = false;
+    return *this;
+  }
  if (numext::is_exactly_zero(scale)) scale = RealScalar(1);
  mat.template triangularView<Lower>() /= scale;
  m_subdiag.resize(n - 1);
  m_hcoeffs.resize(n - 1);
  internal::tridiagonalization_inplace(mat, diag, m_subdiag, m_hcoeffs, m_workspace, computeEigenvectors);

-  m_info = internal::computeFromTridiagonal_impl(diag, m_subdiag, m_maxIterations, computeEigenvectors, m_eivec);
+  m_info = internal::computeFromTridiagonal_impl<false>(diag, m_subdiag, m_maxIterations, computeEigenvectors, m_eivec);

-  // scale back the eigen values
+  // Scale back the eigenvalues.
  m_eivalues *= scale;

  m_isInitialized = true;
@@ -470,15 +481,31 @@ EIGEN_DEVICE_FUNC SelfAdjointEigenSolver<MatrixType>& SelfAdjointEigenSolver<Mat
 template <typename MatrixType>
 SelfAdjointEigenSolver<MatrixType>& SelfAdjointEigenSolver<MatrixType>::computeFromTridiagonal(
    const RealVectorType& diag, const SubDiagonalType& subdiag, int options) {
-  // TODO : Add an option to scale the values beforehand
  bool computeEigenvectors = (options & ComputeEigenvectors) == ComputeEigenvectors;

  m_eivalues = diag;
  m_subdiag = subdiag;
+
+  // Check for Inf/NaN in the input.
+  {
+    RealScalar scale = RealScalar(0);
+    if (m_eivalues.size() > 0) scale = m_eivalues.cwiseAbs().maxCoeff();
+    if (m_subdiag.size() > 0) scale = numext::maxi(scale, m_subdiag.cwiseAbs().maxCoeff());
+    if (!(numext::isfinite)(scale)) {
+      m_info = NoConvergence;
+      m_isInitialized = true;
+      m_eigenvectorsOk = false;
+      return *this;
+    }
+  }
+
  if (computeEigenvectors) {
    m_eivec.setIdentity(diag.size(), diag.size());
  }
-  m_info = internal::computeFromTridiagonal_impl(m_eivalues, m_subdiag, m_maxIterations, computeEigenvectors, m_eivec);
+  // Use per-deflation-block scaling (like LAPACK's DSTERF) to avoid losing
+  // precision when the tridiagonal entries span a wide range of magnitudes.
+  m_info =
+      internal::computeFromTridiagonal_impl<true>(m_eivalues, m_subdiag, m_maxIterations, computeEigenvectors, m_eivec);

  m_isInitialized = true;
  m_eigenvectorsOk = computeEigenvectors;
@@ -490,6 +517,10 @@ namespace internal {
 * \internal
 * \brief Compute the eigendecomposition from a tridiagonal matrix
 *
+ * \tparam PerBlockScaling If true, each deflation block is independently scaled to [-1,1] before
+ *         QR iteration, following LAPACK's DSTERF approach. This prevents precision loss when entries
+ *         span a wide range of magnitudes. When false, the caller is responsible for ensuring the
+ *         entries are in a safe range (e.g. by pre-scaling the dense matrix before tridiagonalization).
 * \param[in,out] diag : On input, the diagonal of the matrix, on output the eigenvalues
 * \param[in,out] subdiag : The subdiagonal part of the matrix (entries are modified during the decomposition)
 * \param[in] maxIterations : the maximum number of iterations
@@ -497,7 +528,7 @@ namespace internal {
 * \param[out] eivec : The matrix to store the eigenvectors if computeEigenvectors==true. Must be allocated on input.
 * \returns \c Success or \c NoConvergence
 */
-template <typename MatrixType, typename DiagType, typename SubDiagType>
+template <bool PerBlockScaling, typename MatrixType, typename DiagType, typename SubDiagType>
 EIGEN_DEVICE_FUNC ComputationInfo computeFromTridiagonal_impl(DiagType& diag, SubDiagType& subdiag,
                                                              const Index maxIterations, bool computeEigenvectors,
                                                              MatrixType& eivec) {
@@ -512,21 +543,32 @@ EIGEN_DEVICE_FUNC ComputationInfo computeFromTridiagonal_impl(DiagType& diag, Su
  typedef typename DiagType::RealScalar RealScalar;
  const RealScalar considerAsZero = (std::numeric_limits<RealScalar>::min)();
  const RealScalar precision_inv = RealScalar(1) / NumTraits<RealScalar>::epsilon();
-  while (end > 0) {
-    for (Index i = start; i < end; ++i) {
+
+  // Helper lambda for the deflation test.
+  auto deflate = [&](Index lo, Index hi) {
+    for (Index i = lo; i < hi; ++i) {
      if (numext::abs(subdiag[i]) < considerAsZero) {
        subdiag[i] = RealScalar(0);
      } else {
-        // abs(subdiag[i]) <= epsilon * sqrt(abs(diag[i]) + abs(diag[i+1]))
-        // Scaled to prevent underflows.
        const RealScalar scaled_subdiag = precision_inv * subdiag[i];
        if (scaled_subdiag * scaled_subdiag <= (numext::abs(diag[i]) + numext::abs(diag[i + 1]))) {
          subdiag[i] = RealScalar(0);
        }
      }
    }
+  };

-    // find the largest unreduced block at the end of the matrix.
+  // For per-block scaling, track the currently scaled block and its scale factor.
+  // When the outer loop identifies a block outside the scaled region, unscale the old
+  // block and scale the new one. This keeps the same outer loop structure (one QR step
+  // per iteration) while ensuring each block is processed in scaled coordinates.
+  Index scaled_start = -1, scaled_end = -1;
+  RealScalar block_scale = RealScalar(1);
+
+  while (end > 0) {
+    deflate(start, end);
+
+    // Find the largest unreduced block at the end of the matrix.
    while (end > 0 && numext::is_exactly_zero(subdiag[end - 1])) {
      end--;
    }
@@ -539,9 +581,42 @@ EIGEN_DEVICE_FUNC ComputationInfo computeFromTridiagonal_impl(DiagType& diag, Su
    start = end - 1;
    while (start > 0 && !numext::is_exactly_zero(subdiag[start - 1])) start--;

+    if (PerBlockScaling) {
+      // Check if we've moved to a different block than the one currently scaled.
+      if (start != scaled_start || end != scaled_end) {
+        // Unscale the previous block if it was scaled.
+        if (block_scale != RealScalar(1)) {
+          for (Index i = scaled_start; i <= scaled_end; ++i) diag[i] /= block_scale;
+          for (Index i = scaled_start; i < scaled_end; ++i) {
+            if (!numext::is_exactly_zero(subdiag[i])) subdiag[i] /= block_scale;
+          }
+          block_scale = RealScalar(1);
+        }
+        // Compute the norm and scale the new block to [-1:1].
+        RealScalar block_norm = RealScalar(0);
+        for (Index i = start; i <= end; ++i) block_norm = numext::maxi(block_norm, numext::abs(diag[i]));
+        for (Index i = start; i < end; ++i) block_norm = numext::maxi(block_norm, numext::abs(subdiag[i]));
+        if (block_norm > RealScalar(0) && block_norm != RealScalar(1)) {
+          block_scale = RealScalar(1) / block_norm;
+          for (Index i = start; i <= end; ++i) diag[i] *= block_scale;
+          for (Index i = start; i < end; ++i) subdiag[i] *= block_scale;
+        }
+        scaled_start = start;
+        scaled_end = end;
+      }
+    }
+
    internal::tridiagonal_qr_step<MatrixType::Flags & RowMajorBit ? RowMajor : ColMajor>(
        diag.data(), subdiag.data(), start, end, computeEigenvectors ? eivec.data() : (Scalar*)0, n);
  }
+
+  // Unscale any remaining scaled block.
+  if (PerBlockScaling && block_scale != RealScalar(1)) {
+    for (Index i = scaled_start; i <= scaled_end; ++i) diag[i] /= block_scale;
+    for (Index i = scaled_start; i < scaled_end; ++i) {
+      if (!numext::is_exactly_zero(subdiag[i])) subdiag[i] /= block_scale;
+    }
+  }
  if (iter <= maxIterations * n)
    info = Success;
  else
@@ -662,6 +737,28 @@ struct direct_selfadjoint_eigenvalues<SolverType, 3, false> {
    // compute the eigenvalues
    computeRoots(scaledMat, eivals);

+    // computeRoots produces theoretically sorted roots, but floating-point
+    // rounding in the trigonometric formulas can break the ordering.
+    // Enforce sorting with a branchless min/max network (3 elements).
+    {
+      Scalar tmp;
+      if (eivals(0) > eivals(1)) {
+        tmp = eivals(0);
+        eivals(0) = eivals(1);
+        eivals(1) = tmp;
+      }
+      if (eivals(1) > eivals(2)) {
+        tmp = eivals(1);
+        eivals(1) = eivals(2);
+        eivals(2) = tmp;
+      }
+      if (eivals(0) > eivals(1)) {
+        tmp = eivals(0);
+        eivals(0) = eivals(1);
+        eivals(1) = tmp;
+      }
+    }
+
    // compute the eigenvectors
    if (computeEigenvectors) {
      if ((eivals(2) - eivals(0)) <= Eigen::NumTraits<Scalar>::epsilon()) {
@@ -691,7 +788,7 @@ struct direct_selfadjoint_eigenvalues<SolverType, 3, false> {
        if (d0 <= 2 * Eigen::NumTraits<Scalar>::epsilon() * d1) {
          // If d0 is too small, then the two other eigenvalues are numerically the same,
          // and thus we only have to ortho-normalize the near orthogonal vector we saved above.
-          eivecs.col(l) -= eivecs.col(k).dot(eivecs.col(l)) * eivecs.col(l);
+          eivecs.col(l) -= eivecs.col(k).dot(eivecs.col(l)) * eivecs.col(k);
          eivecs.col(l).normalize();
        } else {
          tmp = scaledMat;
--- a/benchmarks/BLAS/CMakeLists.txt
+++ b/benchmarks/BLAS/CMakeLists.txt
@@ -0,0 +1,17 @@
+# Benchmarks for Eigen's built-in BLAS implementation.
+# Compiles the Eigen BLAS sources directly into the benchmark executable
+# so there is no external BLAS dependency.
+
+set(EIGEN_BLAS_SRCS
+  ${EIGEN_SOURCE_DIR}/blas/single.cpp
+  ${EIGEN_SOURCE_DIR}/blas/double.cpp
+  ${EIGEN_SOURCE_DIR}/blas/complex_single.cpp
+  ${EIGEN_SOURCE_DIR}/blas/complex_double.cpp
+  ${EIGEN_SOURCE_DIR}/blas/xerbla.cpp
+  ${EIGEN_SOURCE_DIR}/blas/lsame.cpp
+  ${EIGEN_SOURCE_DIR}/blas/complexdots.cpp
+)
+
+eigen_add_benchmark(bench_blas bench_blas.cpp)
+target_sources(bench_blas PRIVATE ${EIGEN_BLAS_SRCS})
+target_include_directories(bench_blas PRIVATE ${EIGEN_SOURCE_DIR}/blas)
--- a/benchmarks/BLAS/bench_blas.cpp
+++ b/benchmarks/BLAS/bench_blas.cpp
@@ -0,0 +1,488 @@
+// Benchmark for Eigen's BLAS implementation.
+//
+// Calls the Eigen BLAS C interface directly (the extern "C" functions defined
+// in blas/{single,double,complex_single,complex_double}.cpp).
+//
+// Covers Level 1, 2, and 3 routines — with emphasis on the routines that
+// were recently rewritten from f2c to C++: rotm, rotmg, spmv, sbmv, hbmv,
+// hpmv, tbmv, lsame, and complex dot products.
+
+#include <benchmark/benchmark.h>
+
+#include <Eigen/Core>
+#include <complex>
+#include <vector>
+
+#include "blas/blas.h"
+
+using Eigen::Index;
+
+// ---------------------------------------------------------------------------
+// Helpers
+// ---------------------------------------------------------------------------
+
+// Flop-rate counter (units = individual flops per call).
+static benchmark::Counter GflopsCounter(double flops) {
+  return benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
+}
+
+// Fill a vector with random values in [-1, 1].
+template <typename T>
+static void fillRand(T* data, Index n) {
+  Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, 1>>(data, n).setRandom();
+}
+
+// Fill a symmetric band matrix A in BLAS band storage (column-major).
+// Upper triangle: A[i,j] stored at a[(k+i-j) + j*lda], 0 <= j-i <= k.
+template <typename T>
+static void fillSymBandUpper(T* a, int n, int k, int lda) {
+  std::fill(a, a + lda * n, T(0));
+  for (int j = 0; j < n; ++j)
+    for (int i = std::max(0, j - k); i <= j; ++i) a[(k + i - j) + j * lda] = T(std::rand()) / T(RAND_MAX) - T(0.5);
+}
+
+// Fill a packed symmetric matrix (upper triangle, column-major).
+template <typename T>
+static void fillSymPacked(T* ap, int n) {
+  int sz = n * (n + 1) / 2;
+  for (int i = 0; i < sz; ++i) ap[i] = T(std::rand()) / T(RAND_MAX) - T(0.5);
+}
+
+// Fill a triangular band matrix in BLAS band storage (upper, column-major).
+template <typename T>
+static void fillTriBandUpper(T* a, int n, int k, int lda) {
+  std::fill(a, a + lda * n, T(0));
+  for (int j = 0; j < n; ++j)
+    for (int i = std::max(0, j - k); i <= j; ++i) {
+      T val = T(std::rand()) / T(RAND_MAX) - T(0.5);
+      if (i == j) val += T(n);  // diagonal dominance
+      a[(k + i - j) + j * lda] = val;
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Type-dispatched BLAS wrappers
+// ---------------------------------------------------------------------------
+
+inline float blas_dot(int* n, float* x, int* incx, float* y, int* incy) { return sdot_(n, x, incx, y, incy); }
+inline double blas_dot(int* n, double* x, int* incx, double* y, int* incy) { return ddot_(n, x, incx, y, incy); }
+
+inline void blas_axpy(int* n, float* a, float* x, int* incx, float* y, int* incy) { saxpy_(n, a, x, incx, y, incy); }
+inline void blas_axpy(int* n, double* a, double* x, int* incx, double* y, int* incy) { daxpy_(n, a, x, incx, y, incy); }
+
+inline float blas_nrm2(int* n, float* x, int* incx) { return snrm2_(n, x, incx); }
+inline double blas_nrm2(int* n, double* x, int* incx) { return dnrm2_(n, x, incx); }
+
+inline void blas_rotm(int* n, float* x, int* incx, float* y, int* incy, float* p) { srotm_(n, x, incx, y, incy, p); }
+inline void blas_rotm(int* n, double* x, int* incx, double* y, int* incy, double* p) { drotm_(n, x, incx, y, incy, p); }
+
+inline void blas_rotmg(float* d1, float* d2, float* x1, float* y1, float* p) { srotmg_(d1, d2, x1, y1, p); }
+inline void blas_rotmg(double* d1, double* d2, double* x1, double* y1, double* p) { drotmg_(d1, d2, x1, y1, p); }
+
+inline void blas_dotcw(int* n, float* cx, int* incx, float* cy, int* incy, float* res) {
+  cdotcw_(n, cx, incx, cy, incy, res);
+}
+inline void blas_dotcw(int* n, double* cx, int* incx, double* cy, int* incy, double* res) {
+  zdotcw_(n, cx, incx, cy, incy, res);
+}
+
+inline void blas_gemv(char* t, int* m, int* n, float* a, float* A, int* lda, float* x, int* incx, float* b, float* y,
+                      int* incy) {
+  sgemv_(t, m, n, a, A, lda, x, incx, b, y, incy);
+}
+inline void blas_gemv(char* t, int* m, int* n, double* a, double* A, int* lda, double* x, int* incx, double* b,
+                      double* y, int* incy) {
+  dgemv_(t, m, n, a, A, lda, x, incx, b, y, incy);
+}
+
+inline void blas_spmv(char* uplo, int* n, float* alpha, float* ap, float* x, int* incx, float* beta, float* y,
+                      int* incy) {
+  sspmv_(uplo, n, alpha, ap, x, incx, beta, y, incy);
+}
+inline void blas_spmv(char* uplo, int* n, double* alpha, double* ap, double* x, int* incx, double* beta, double* y,
+                      int* incy) {
+  dspmv_(uplo, n, alpha, ap, x, incx, beta, y, incy);
+}
+
+inline void blas_sbmv(char* uplo, int* n, int* k, float* alpha, float* a, int* lda, float* x, int* incx, float* beta,
+                      float* y, int* incy) {
+  ssbmv_(uplo, n, k, alpha, a, lda, x, incx, beta, y, incy);
+}
+inline void blas_sbmv(char* uplo, int* n, int* k, double* alpha, double* a, int* lda, double* x, int* incx,
+                      double* beta, double* y, int* incy) {
+  dsbmv_(uplo, n, k, alpha, a, lda, x, incx, beta, y, incy);
+}
+
+inline void blas_tbmv(char* uplo, char* trans, char* diag, int* n, int* k, float* a, int* lda, float* x, int* incx) {
+  stbmv_(uplo, trans, diag, n, k, a, lda, x, incx);
+}
+inline void blas_tbmv(char* uplo, char* trans, char* diag, int* n, int* k, double* a, int* lda, double* x, int* incx) {
+  dtbmv_(uplo, trans, diag, n, k, a, lda, x, incx);
+}
+
+inline void blas_hbmv(char* uplo, int* n, int* k, float* alpha, float* a, int* lda, float* x, int* incx, float* beta,
+                      float* y, int* incy) {
+  chbmv_(uplo, n, k, alpha, a, lda, x, incx, beta, y, incy);
+}
+inline void blas_hbmv(char* uplo, int* n, int* k, double* alpha, double* a, int* lda, double* x, int* incx,
+                      double* beta, double* y, int* incy) {
+  zhbmv_(uplo, n, k, alpha, a, lda, x, incx, beta, y, incy);
+}
+
+inline void blas_hpmv(char* uplo, int* n, float* alpha, float* ap, float* x, int* incx, float* beta, float* y,
+                      int* incy) {
+  chpmv_(uplo, n, alpha, ap, x, incx, beta, y, incy);
+}
+inline void blas_hpmv(char* uplo, int* n, double* alpha, double* ap, double* x, int* incx, double* beta, double* y,
+                      int* incy) {
+  zhpmv_(uplo, n, alpha, ap, x, incx, beta, y, incy);
+}
+
+inline void blas_gemm(char* ta, char* tb, int* m, int* n, int* k, float* alpha, float* a, int* lda, float* b, int* ldb,
+                      float* beta, float* c, int* ldc) {
+  sgemm_(ta, tb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+}
+inline void blas_gemm(char* ta, char* tb, int* m, int* n, int* k, double* alpha, double* a, int* lda, double* b,
+                      int* ldb, double* beta, double* c, int* ldc) {
+  dgemm_(ta, tb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+}
+
+// =========================================================================
+//  Level 1 — Real
+// =========================================================================
+
+// ----- SDOT / DDOT -----
+template <typename T>
+static void BM_dot(benchmark::State& state) {
+  int n = static_cast<int>(state.range(0));
+  int one = 1;
+  std::vector<T> x(n), y(n);
+  fillRand(x.data(), n);
+  fillRand(y.data(), n);
+  for (auto _ : state) {
+    T r = blas_dot(&n, x.data(), &one, y.data(), &one);
+    benchmark::DoNotOptimize(r);
+  }
+  state.counters["GFLOPS"] = GflopsCounter(2.0 * n);
+}
+
+// ----- SAXPY / DAXPY -----
+template <typename T>
+static void BM_axpy(benchmark::State& state) {
+  int n = static_cast<int>(state.range(0));
+  int one = 1;
+  T alpha = T(2.5);
+  std::vector<T> x(n), y(n);
+  fillRand(x.data(), n);
+  fillRand(y.data(), n);
+  for (auto _ : state) {
+    blas_axpy(&n, &alpha, x.data(), &one, y.data(), &one);
+    benchmark::DoNotOptimize(y.data());
+  }
+  state.counters["GFLOPS"] = GflopsCounter(2.0 * n);
+}
+
+// ----- SNRM2 / DNRM2 -----
+template <typename T>
+static void BM_nrm2(benchmark::State& state) {
+  int n = static_cast<int>(state.range(0));
+  int one = 1;
+  std::vector<T> x(n);
+  fillRand(x.data(), n);
+  for (auto _ : state) {
+    T r = blas_nrm2(&n, x.data(), &one);
+    benchmark::DoNotOptimize(r);
+  }
+  // Nominal flops; Eigen's stableNorm() does more work internally.
+  state.counters["GFLOPS"] = GflopsCounter(2.0 * n - 1);
+}
+
+// ----- SROTM / DROTM -----
+template <typename T>
+static void BM_rotm(benchmark::State& state) {
+  int n = static_cast<int>(state.range(0));
+  int one = 1;
+  std::vector<T> x(n), y(n);
+  T param[5] = {T(-1), T(0.6), T(-0.8), T(0.8), T(0.6)};  // full rotation
+  fillRand(x.data(), n);
+  fillRand(y.data(), n);
+  for (auto _ : state) {
+    blas_rotm(&n, x.data(), &one, y.data(), &one, param);
+    benchmark::DoNotOptimize(x.data());
+    benchmark::DoNotOptimize(y.data());
+  }
+  // 4 muls + 2 adds per element pair.
+  state.counters["GFLOPS"] = GflopsCounter(6.0 * n);
+}
+
+// ----- SROTMG / DROTMG -----
+template <typename T>
+static void BM_rotmg(benchmark::State& state) {
+  T d1 = T(2), d2 = T(3), x1 = T(1), y1 = T(0.5);
+  T param[5];
+  for (auto _ : state) {
+    T td1 = d1, td2 = d2, tx1 = x1;
+    blas_rotmg(&td1, &td2, &tx1, &y1, param);
+    benchmark::DoNotOptimize(param);
+  }
+}
+
+// =========================================================================
+//  Level 1 — Complex
+// =========================================================================
+
+// Complex conjugate dot product via the worker functions (cdotcw_ / zdotcw_)
+// which use an output pointer, avoiding the ABI ambiguity of the struct-returning
+// cdotc_ / zdotc_ wrappers.
+template <typename T>
+static void BM_dotc(benchmark::State& state) {
+  int n = static_cast<int>(state.range(0));
+  int one = 1;
+  std::vector<T> x(2 * n), y(2 * n);  // interleaved real/imag
+  fillRand(x.data(), 2 * n);
+  fillRand(y.data(), 2 * n);
+  T res[2];
+  for (auto _ : state) {
+    blas_dotcw(&n, x.data(), &one, y.data(), &one, res);
+    benchmark::DoNotOptimize(res);
+  }
+  // Conjugate dot: 6 mul + 2 add per element = 8n flops.
+  state.counters["GFLOPS"] = GflopsCounter(8.0 * n);
+}
+
+// =========================================================================
+//  Level 2 — General Matrix-Vector (SGEMV / DGEMV)
+// =========================================================================
+
+template <typename T>
+static void BM_gemv(benchmark::State& state) {
+  int m = static_cast<int>(state.range(0));
+  int n = static_cast<int>(state.range(1));
+  int one = 1;
+  T alpha = T(1), beta = T(0);
+  char trans = 'N';
+  std::vector<T> a(m * n), x(n), y(m);
+  fillRand(a.data(), m * n);
+  fillRand(x.data(), n);
+  fillRand(y.data(), m);
+  for (auto _ : state) {
+    blas_gemv(&trans, &m, &n, &alpha, a.data(), &m, x.data(), &one, &beta, y.data(), &one);
+    benchmark::DoNotOptimize(y.data());
+  }
+  state.counters["GFLOPS"] = GflopsCounter(2.0 * m * n);
+}
+
+// =========================================================================
+//  Level 2 — Symmetric Packed (SSPMV / DSPMV)
+// =========================================================================
+
+template <typename T>
+static void BM_spmv(benchmark::State& state) {
+  int n = static_cast<int>(state.range(0));
+  int one = 1;
+  T alpha = T(1), beta = T(0);
+  char uplo = 'U';
+  int sz = n * (n + 1) / 2;
+  std::vector<T> ap(sz), x(n), y(n);
+  fillSymPacked(ap.data(), n);
+  fillRand(x.data(), n);
+  fillRand(y.data(), n);
+  for (auto _ : state) {
+    blas_spmv(&uplo, &n, &alpha, ap.data(), x.data(), &one, &beta, y.data(), &one);
+    benchmark::DoNotOptimize(y.data());
+  }
+  // Symmetric: each off-diag element contributes to two y entries.
+  state.counters["GFLOPS"] = GflopsCounter(2.0 * n * n);
+}
+
+// =========================================================================
+//  Level 2 — Symmetric Band (SSBMV / DSBMV)
+// =========================================================================
+
+template <typename T>
+static void BM_sbmv(benchmark::State& state) {
+  int n = static_cast<int>(state.range(0));
+  int k = static_cast<int>(state.range(1));
+  int lda = k + 1;
+  int one = 1;
+  T alpha = T(1), beta = T(0);
+  char uplo = 'U';
+  std::vector<T> a(lda * n), x(n), y(n);
+  fillSymBandUpper(a.data(), n, k, lda);
+  fillRand(x.data(), n);
+  fillRand(y.data(), n);
+  for (auto _ : state) {
+    blas_sbmv(&uplo, &n, &k, &alpha, a.data(), &lda, x.data(), &one, &beta, y.data(), &one);
+    benchmark::DoNotOptimize(y.data());
+  }
+  state.counters["GFLOPS"] = GflopsCounter(2.0 * n * (2 * k + 1));
+}
+
+// =========================================================================
+//  Level 2 — Triangular Band (STBMV / DTBMV)
+// =========================================================================
+
+template <typename T>
+static void BM_tbmv(benchmark::State& state) {
+  int n = static_cast<int>(state.range(0));
+  int k = static_cast<int>(state.range(1));
+  int lda = k + 1;
+  int one = 1;
+  char uplo = 'U', trans = 'N', diag = 'N';
+  std::vector<T> a(lda * n), x(n), x_orig(n);
+  fillTriBandUpper(a.data(), n, k, lda);
+  fillRand(x_orig.data(), n);
+  for (auto _ : state) {
+    state.PauseTiming();
+    std::copy(x_orig.begin(), x_orig.end(), x.begin());
+    state.ResumeTiming();
+    blas_tbmv(&uplo, &trans, &diag, &n, &k, a.data(), &lda, x.data(), &one);
+    benchmark::DoNotOptimize(x.data());
+  }
+  state.counters["GFLOPS"] = GflopsCounter(1.0 * n * (k + 1));
+}
+
+// =========================================================================
+//  Level 2 — Hermitian Band (CHBMV / ZHBMV)
+// =========================================================================
+
+template <typename T>
+static void BM_hbmv(benchmark::State& state) {
+  int n = static_cast<int>(state.range(0));
+  int k = static_cast<int>(state.range(1));
+  int lda = k + 1;
+  int one = 1;
+  char uplo = 'U';
+  // Complex: each element is 2 reals.
+  std::vector<T> a(2 * lda * n), x(2 * n), y(2 * n);
+  T alpha[2] = {T(1), T(0)};
+  T beta[2] = {T(0), T(0)};
+  fillRand(a.data(), 2 * lda * n);
+  // Make diagonal real (imag part = 0).
+  for (int j = 0; j < n; ++j) a[2 * (k + j * lda) + 1] = T(0);
+  fillRand(x.data(), 2 * n);
+  fillRand(y.data(), 2 * n);
+  for (auto _ : state) {
+    blas_hbmv(&uplo, &n, &k, alpha, a.data(), &lda, x.data(), &one, beta, y.data(), &one);
+    benchmark::DoNotOptimize(y.data());
+  }
+  // Complex hermitian band: 8*n*(2k+1) flops approximately.
+  state.counters["GFLOPS"] = GflopsCounter(8.0 * n * (2 * k + 1));
+}
+
+// =========================================================================
+//  Level 2 — Hermitian Packed (CHPMV / ZHPMV)
+// =========================================================================
+
+template <typename T>
+static void BM_hpmv(benchmark::State& state) {
+  int n = static_cast<int>(state.range(0));
+  int one = 1;
+  char uplo = 'U';
+  int sz = n * (n + 1) / 2;
+  std::vector<T> ap(2 * sz), x(2 * n), y(2 * n);
+  T alpha[2] = {T(1), T(0)};
+  T beta[2] = {T(0), T(0)};
+  fillRand(ap.data(), 2 * sz);
+  // Make diagonal real.
+  int kk = 0;
+  for (int j = 0; j < n; ++j) {
+    ap[2 * (kk + j) + 1] = T(0);
+    kk += j + 1;
+  }
+  fillRand(x.data(), 2 * n);
+  fillRand(y.data(), 2 * n);
+  for (auto _ : state) {
+    blas_hpmv(&uplo, &n, alpha, ap.data(), x.data(), &one, beta, y.data(), &one);
+    benchmark::DoNotOptimize(y.data());
+  }
+  state.counters["GFLOPS"] = GflopsCounter(8.0 * n * n);
+}
+
+// =========================================================================
+//  Level 3 — General Matrix Multiply (SGEMM / DGEMM)
+// =========================================================================
+
+template <typename T>
+static void BM_gemm(benchmark::State& state) {
+  int n = static_cast<int>(state.range(0));
+  T alpha = T(1), beta = T(0);
+  char trans = 'N';
+  std::vector<T> a(n * n), b(n * n), c(n * n);
+  fillRand(a.data(), n * n);
+  fillRand(b.data(), n * n);
+  fillRand(c.data(), n * n);
+  for (auto _ : state) {
+    blas_gemm(&trans, &trans, &n, &n, &n, &alpha, a.data(), &n, b.data(), &n, &beta, c.data(), &n);
+    benchmark::DoNotOptimize(c.data());
+  }
+  state.counters["GFLOPS"] = GflopsCounter(2.0 * n * n * n);
+}
+
+// =========================================================================
+//  Register benchmarks
+// =========================================================================
+
+// clang-format off
+
+// --- Vector sizes for Level 1 ---
+#define L1_SIZES ->Arg(64)->Arg(256)->Arg(1024)->Arg(4096)->Arg(16384)->Arg(65536)
+
+BENCHMARK(BM_dot<float>)    L1_SIZES ->Name("sdot");
+BENCHMARK(BM_dot<double>)   L1_SIZES ->Name("ddot");
+BENCHMARK(BM_axpy<float>)   L1_SIZES ->Name("saxpy");
+BENCHMARK(BM_axpy<double>)  L1_SIZES ->Name("daxpy");
+BENCHMARK(BM_nrm2<float>)   L1_SIZES ->Name("snrm2");
+BENCHMARK(BM_nrm2<double>)  L1_SIZES ->Name("dnrm2");
+BENCHMARK(BM_rotm<float>)   L1_SIZES ->Name("srotm");
+BENCHMARK(BM_rotm<double>)  L1_SIZES ->Name("drotm");
+BENCHMARK(BM_rotmg<float>)  ->Name("srotmg");
+BENCHMARK(BM_rotmg<double>) ->Name("drotmg");
+BENCHMARK(BM_dotc<float>)   L1_SIZES ->Name("cdotc");
+BENCHMARK(BM_dotc<double>)  L1_SIZES ->Name("zdotc");
+
+#undef L1_SIZES
+
+// --- Matrix sizes for Level 2 ---
+// GEMV: {m, n}
+#define GEMV_SIZES \
+    ->Args({64, 64})->Args({256, 256})->Args({1024, 1024})->Args({4096, 4096}) \
+    ->Args({4096, 64})->Args({64, 4096})
+
+BENCHMARK(BM_gemv<float>)   GEMV_SIZES ->Name("sgemv");
+BENCHMARK(BM_gemv<double>)  GEMV_SIZES ->Name("dgemv");
+#undef GEMV_SIZES
+
+// Symmetric packed: {n}
+#define SPM_SIZES ->Arg(64)->Arg(256)->Arg(1024)->Arg(4096)
+
+BENCHMARK(BM_spmv<float>)  SPM_SIZES ->Name("sspmv");
+BENCHMARK(BM_spmv<double>) SPM_SIZES ->Name("dspmv");
+BENCHMARK(BM_hpmv<float>)  SPM_SIZES ->Name("chpmv");
+BENCHMARK(BM_hpmv<double>) SPM_SIZES ->Name("zhpmv");
+
+#undef SPM_SIZES
+
+// Band: {n, k}
+#define BAND_SIZES \
+    ->Args({256, 4})->Args({256, 32})->Args({1024, 4})->Args({1024, 32}) \
+    ->Args({4096, 4})->Args({4096, 32})->Args({4096, 128})
+
+BENCHMARK(BM_sbmv<float>)  BAND_SIZES ->Name("ssbmv");
+BENCHMARK(BM_sbmv<double>) BAND_SIZES ->Name("dsbmv");
+BENCHMARK(BM_tbmv<float>)  BAND_SIZES ->Name("stbmv");
+BENCHMARK(BM_tbmv<double>) BAND_SIZES ->Name("dtbmv");
+BENCHMARK(BM_hbmv<float>)  BAND_SIZES ->Name("chbmv");
+BENCHMARK(BM_hbmv<double>) BAND_SIZES ->Name("zhbmv");
+
+#undef BAND_SIZES
+
+// --- Square sizes for Level 3 ---
+#define GEMM_SIZES ->Arg(32)->Arg(64)->Arg(128)->Arg(256)->Arg(512)->Arg(1024)
+
+BENCHMARK(BM_gemm<float>)  GEMM_SIZES ->Name("sgemm");
+BENCHMARK(BM_gemm<double>) GEMM_SIZES ->Name("dgemm");
+
+#undef GEMM_SIZES
+
+// clang-format on
--- a/benchmarks/CMakeLists.txt
+++ b/benchmarks/CMakeLists.txt
@@ -20,7 +20,11 @@ function(eigen_add_benchmark name source)
  if(BENCH_LIBRARIES)
    target_link_libraries(${name} PRIVATE ${BENCH_LIBRARIES})
  endif()
-  target_compile_options(${name} PRIVATE -O3 -DNDEBUG)
+  target_compile_options(${name} PRIVATE
+    $<$<CXX_COMPILER_ID:MSVC>:/O2>
+    $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-O3>
+  )
+  target_compile_definitions(${name} PRIVATE NDEBUG)
  if(BENCH_DEFINITIONS)
    target_compile_definitions(${name} PRIVATE ${BENCH_DEFINITIONS})
  endif()
@@ -38,3 +42,4 @@ add_subdirectory(FFT)
 add_subdirectory(Householder)
 add_subdirectory(Solvers)
 add_subdirectory(Tuning)
+add_subdirectory(BLAS)
--- a/benchmarks/Core/CMakeLists.txt
+++ b/benchmarks/Core/CMakeLists.txt
@@ -21,3 +21,4 @@ eigen_add_benchmark(bench_syr2 bench_syr2.cpp)
 eigen_add_benchmark(bench_construction bench_construction.cpp)
 eigen_add_benchmark(bench_fixed_size bench_fixed_size.cpp)
 eigen_add_benchmark(bench_fixed_size_double bench_fixed_size.cpp DEFINITIONS SCALAR=double)
+eigen_add_benchmark(bench_small_matrix bench_small_matrix.cpp)
--- a/benchmarks/Core/bench_small_matrix.cpp
+++ b/benchmarks/Core/bench_small_matrix.cpp
@@ -0,0 +1,316 @@
+#include <benchmark/benchmark.h>
+#include <Eigen/Core>
+#include <Eigen/LU>
+#include <Eigen/Cholesky>
+#include <Eigen/QR>
+#include <Eigen/SVD>
+#include <Eigen/Eigenvalues>
+
+using namespace Eigen;
+
+// ============================================================================
+// Fixed-size matrix multiply (the fundamental operation)
+// ============================================================================
+
+template <typename Scalar, int N>
+static void BM_MatMul(benchmark::State& state) {
+  Matrix<Scalar, N, N> a, b, c;
+  a.setRandom();
+  b.setRandom();
+  for (auto _ : state) {
+    c.noalias() = a * b;
+    benchmark::DoNotOptimize(c.data());
+  }
+}
+
+// Matrix-vector multiply
+template <typename Scalar, int N>
+static void BM_MatVec(benchmark::State& state) {
+  Matrix<Scalar, N, N> a;
+  Matrix<Scalar, N, 1> v, r;
+  a.setRandom();
+  v.setRandom();
+  for (auto _ : state) {
+    r.noalias() = a * v;
+    benchmark::DoNotOptimize(r.data());
+  }
+}
+
+// ============================================================================
+// Fixed-size inverse (critical for transform operations)
+// ============================================================================
+
+template <typename Scalar, int N>
+EIGEN_DONT_INLINE void do_inverse(const Matrix<Scalar, N, N>& a, Matrix<Scalar, N, N>& r) {
+  r = a.inverse();
+}
+
+template <typename Scalar, int N>
+static void BM_Inverse(benchmark::State& state) {
+  Matrix<Scalar, N, N> a, r;
+  a.setRandom();
+  a += Matrix<Scalar, N, N>::Identity() * Scalar(N);  // ensure well-conditioned
+  for (auto _ : state) {
+    do_inverse(a, r);
+    benchmark::DoNotOptimize(r.data());
+  }
+}
+
+// ============================================================================
+// Fixed-size determinant
+// ============================================================================
+
+template <typename Scalar, int N>
+static void BM_Determinant(benchmark::State& state) {
+  Matrix<Scalar, N, N> a;
+  a.setRandom();
+  Scalar d;
+  for (auto _ : state) {
+    d = a.determinant();
+    benchmark::DoNotOptimize(d);
+  }
+}
+
+// ============================================================================
+// LLT (Cholesky) — for SPD matrices (covariance, mass matrices)
+// ============================================================================
+
+template <typename Scalar, int N>
+static void BM_LLT_Compute(benchmark::State& state) {
+  Matrix<Scalar, N, N> a;
+  a.setRandom();
+  a = a.transpose() * a + Matrix<Scalar, N, N>::Identity();  // SPD
+  LLT<Matrix<Scalar, N, N>> llt;
+  for (auto _ : state) {
+    llt.compute(a);
+    benchmark::DoNotOptimize(&llt);
+  }
+}
+
+template <typename Scalar, int N>
+static void BM_LLT_Solve(benchmark::State& state) {
+  Matrix<Scalar, N, N> a;
+  a.setRandom();
+  a = a.transpose() * a + Matrix<Scalar, N, N>::Identity();
+  Matrix<Scalar, N, 1> b = Matrix<Scalar, N, 1>::Random();
+  LLT<Matrix<Scalar, N, N>> llt(a);
+  Matrix<Scalar, N, 1> x;
+  for (auto _ : state) {
+    x = llt.solve(b);
+    benchmark::DoNotOptimize(x.data());
+  }
+}
+
+// ============================================================================
+// LDLT — for semi-definite matrices
+// ============================================================================
+
+template <typename Scalar, int N>
+static void BM_LDLT_Compute(benchmark::State& state) {
+  Matrix<Scalar, N, N> a;
+  a.setRandom();
+  a = a.transpose() * a + Matrix<Scalar, N, N>::Identity();
+  LDLT<Matrix<Scalar, N, N>> ldlt;
+  for (auto _ : state) {
+    ldlt.compute(a);
+    benchmark::DoNotOptimize(&ldlt);
+  }
+}
+
+// ============================================================================
+// PartialPivLU — for general square systems
+// ============================================================================
+
+template <typename Scalar, int N>
+static void BM_PartialPivLU_Compute(benchmark::State& state) {
+  Matrix<Scalar, N, N> a;
+  a.setRandom();
+  a += Matrix<Scalar, N, N>::Identity() * Scalar(N);
+  PartialPivLU<Matrix<Scalar, N, N>> lu;
+  for (auto _ : state) {
+    lu.compute(a);
+    benchmark::DoNotOptimize(lu.matrixLU().data());
+  }
+}
+
+template <typename Scalar, int N>
+static void BM_PartialPivLU_Solve(benchmark::State& state) {
+  Matrix<Scalar, N, N> a;
+  a.setRandom();
+  a += Matrix<Scalar, N, N>::Identity() * Scalar(N);
+  Matrix<Scalar, N, 1> b = Matrix<Scalar, N, 1>::Random();
+  PartialPivLU<Matrix<Scalar, N, N>> lu(a);
+  Matrix<Scalar, N, 1> x;
+  for (auto _ : state) {
+    x = lu.solve(b);
+    benchmark::DoNotOptimize(x.data());
+  }
+}
+
+// ============================================================================
+// ColPivHouseholderQR — for least-squares (camera calibration, etc.)
+// ============================================================================
+
+template <typename Scalar, int Rows, int Cols>
+static void BM_ColPivQR_Compute(benchmark::State& state) {
+  Matrix<Scalar, Rows, Cols> a;
+  a.setRandom();
+  ColPivHouseholderQR<Matrix<Scalar, Rows, Cols>> qr;
+  for (auto _ : state) {
+    qr.compute(a);
+    benchmark::DoNotOptimize(qr.matrixR().data());
+  }
+}
+
+// ============================================================================
+// JacobiSVD — the workhorse for small matrices in CV
+// ============================================================================
+
+template <typename Scalar, int Rows, int Cols, int Options = ComputeThinU | ComputeThinV>
+static void BM_JacobiSVD_Compute(benchmark::State& state) {
+  Matrix<Scalar, Rows, Cols> a;
+  a.setRandom();
+  JacobiSVD<Matrix<Scalar, Rows, Cols>, Options> svd;
+  for (auto _ : state) {
+    svd.compute(a);
+    benchmark::DoNotOptimize(svd.singularValues().data());
+  }
+}
+
+template <typename Scalar, int Rows, int Cols>
+static void BM_JacobiSVD_Solve(benchmark::State& state) {
+  Matrix<Scalar, Rows, Cols> a;
+  a.setRandom();
+  Matrix<Scalar, Rows, 1> b = Matrix<Scalar, Rows, 1>::Random();
+  JacobiSVD<Matrix<Scalar, Rows, Cols>, ComputeThinU | ComputeThinV> svd(a);
+  Matrix<Scalar, Cols, 1> x;
+  for (auto _ : state) {
+    x = svd.solve(b);
+    benchmark::DoNotOptimize(x.data());
+  }
+}
+
+// ============================================================================
+// SelfAdjointEigenSolver — PCA, normal estimation
+// ============================================================================
+
+template <typename Scalar, int N>
+static void BM_SelfAdjointEig_Compute(benchmark::State& state) {
+  Matrix<Scalar, N, N> a;
+  a.setRandom();
+  a = a.transpose() * a;
+  SelfAdjointEigenSolver<Matrix<Scalar, N, N>> eig;
+  for (auto _ : state) {
+    eig.compute(a);
+    benchmark::DoNotOptimize(eig.eigenvalues().data());
+  }
+}
+
+// SelfAdjointEigenSolver::computeDirect — closed-form for 2x2 and 3x3
+template <typename Scalar, int N>
+static void BM_SelfAdjointEig_ComputeDirect(benchmark::State& state) {
+  Matrix<Scalar, N, N> a;
+  a.setRandom();
+  a = a.transpose() * a;
+  SelfAdjointEigenSolver<Matrix<Scalar, N, N>> eig;
+  for (auto _ : state) {
+    eig.computeDirect(a);
+    benchmark::DoNotOptimize(eig.eigenvalues().data());
+  }
+}
+
+// ============================================================================
+// Registration — focus on robotics/CV sizes
+// ============================================================================
+
+// Matrix multiply: 2x2, 3x3, 4x4, 6x6
+BENCHMARK(BM_MatMul<float, 2>);
+BENCHMARK(BM_MatMul<float, 3>);
+BENCHMARK(BM_MatMul<float, 4>);
+BENCHMARK(BM_MatMul<float, 6>);
+BENCHMARK(BM_MatMul<double, 2>);
+BENCHMARK(BM_MatMul<double, 3>);
+BENCHMARK(BM_MatMul<double, 4>);
+BENCHMARK(BM_MatMul<double, 6>);
+
+// Matrix-vector multiply
+BENCHMARK(BM_MatVec<float, 3>);
+BENCHMARK(BM_MatVec<float, 4>);
+BENCHMARK(BM_MatVec<float, 6>);
+BENCHMARK(BM_MatVec<double, 3>);
+BENCHMARK(BM_MatVec<double, 4>);
+BENCHMARK(BM_MatVec<double, 6>);
+
+// Inverse
+BENCHMARK(BM_Inverse<float, 2>);
+BENCHMARK(BM_Inverse<float, 3>);
+BENCHMARK(BM_Inverse<float, 4>);
+BENCHMARK(BM_Inverse<double, 2>);
+BENCHMARK(BM_Inverse<double, 3>);
+BENCHMARK(BM_Inverse<double, 4>);
+
+// Determinant
+BENCHMARK(BM_Determinant<float, 2>);
+BENCHMARK(BM_Determinant<float, 3>);
+BENCHMARK(BM_Determinant<float, 4>);
+BENCHMARK(BM_Determinant<double, 2>);
+BENCHMARK(BM_Determinant<double, 3>);
+BENCHMARK(BM_Determinant<double, 4>);
+
+// LLT (Cholesky)
+BENCHMARK(BM_LLT_Compute<float, 3>);
+BENCHMARK(BM_LLT_Compute<float, 4>);
+BENCHMARK(BM_LLT_Compute<float, 6>);
+BENCHMARK(BM_LLT_Compute<double, 3>);
+BENCHMARK(BM_LLT_Compute<double, 4>);
+BENCHMARK(BM_LLT_Compute<double, 6>);
+BENCHMARK(BM_LLT_Solve<double, 3>);
+BENCHMARK(BM_LLT_Solve<double, 6>);
+
+// LDLT
+BENCHMARK(BM_LDLT_Compute<double, 3>);
+BENCHMARK(BM_LDLT_Compute<double, 6>);
+
+// PartialPivLU
+BENCHMARK(BM_PartialPivLU_Compute<float, 3>);
+BENCHMARK(BM_PartialPivLU_Compute<float, 4>);
+BENCHMARK(BM_PartialPivLU_Compute<double, 3>);
+BENCHMARK(BM_PartialPivLU_Compute<double, 4>);
+BENCHMARK(BM_PartialPivLU_Solve<double, 3>);
+BENCHMARK(BM_PartialPivLU_Solve<double, 4>);
+
+// ColPivHouseholderQR
+BENCHMARK(BM_ColPivQR_Compute<float, 3, 3>);
+BENCHMARK(BM_ColPivQR_Compute<double, 3, 3>);
+BENCHMARK(BM_ColPivQR_Compute<double, 6, 6>);
+BENCHMARK(BM_ColPivQR_Compute<double, 8, 3>);  // overdetermined least-squares
+
+// JacobiSVD — the key CV sizes
+BENCHMARK(BM_JacobiSVD_Compute<float, 2, 2>);
+BENCHMARK(BM_JacobiSVD_Compute<float, 3, 3>);
+BENCHMARK(BM_JacobiSVD_Compute<float, 4, 4>);
+BENCHMARK(BM_JacobiSVD_Compute<double, 2, 2>);
+BENCHMARK(BM_JacobiSVD_Compute<double, 3, 3>);
+BENCHMARK(BM_JacobiSVD_Compute<double, 4, 4>);
+BENCHMARK(BM_JacobiSVD_Compute<double, 3, 4>);  // projection matrix
+BENCHMARK(BM_JacobiSVD_Compute<double, 6, 6>);  // manipulator Jacobian
+BENCHMARK(BM_JacobiSVD_Compute<double, 8, 9>);  // fundamental matrix (8-point)
+BENCHMARK(BM_JacobiSVD_Solve<double, 3, 3>);
+BENCHMARK(BM_JacobiSVD_Solve<double, 6, 6>);
+
+// Values-only SVD (when you just need singular values)
+BENCHMARK((BM_JacobiSVD_Compute<double, 3, 3, 0>));
+BENCHMARK((BM_JacobiSVD_Compute<double, 6, 6, 0>));
+
+// SelfAdjointEigenSolver — PCA, normal estimation
+BENCHMARK(BM_SelfAdjointEig_Compute<float, 3>);
+BENCHMARK(BM_SelfAdjointEig_Compute<float, 4>);
+BENCHMARK(BM_SelfAdjointEig_Compute<double, 3>);
+BENCHMARK(BM_SelfAdjointEig_Compute<double, 4>);
+BENCHMARK(BM_SelfAdjointEig_Compute<double, 6>);
+
+// SelfAdjointEigenSolver::computeDirect (closed-form, 2x2 and 3x3 only)
+BENCHMARK(BM_SelfAdjointEig_ComputeDirect<float, 2>);
+BENCHMARK(BM_SelfAdjointEig_ComputeDirect<float, 3>);
+BENCHMARK(BM_SelfAdjointEig_ComputeDirect<double, 2>);
+BENCHMARK(BM_SelfAdjointEig_ComputeDirect<double, 3>);
--- a/blas/CMakeLists.txt
+++ b/blas/CMakeLists.txt
@@ -6,11 +6,7 @@ if(EIGEN_BUILD_BLAS)
 add_custom_target(blas)

 set(EigenBlas_SRCS  single.cpp double.cpp complex_single.cpp complex_double.cpp xerbla.cpp
-                    f2c/srotm.c   f2c/srotmg.c  f2c/drotm.c f2c/drotmg.c
-                    f2c/lsame.c   f2c/dspmv.c   f2c/ssbmv.c f2c/chbmv.c
-                    f2c/sspmv.c   f2c/zhbmv.c   f2c/chpmv.c f2c/dsbmv.c
-                    f2c/zhpmv.c   f2c/dtbmv.c   f2c/stbmv.c f2c/ctbmv.c
-                    f2c/ztbmv.c   f2c/complexdots.c
+                    lsame.cpp complexdots.cpp
   )

 set(EIGEN_BLAS_TARGETS "")
--- a/blas/complexdots.cpp
+++ b/blas/complexdots.cpp
@@ -0,0 +1,72 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// C++ replacements for the f2c complex dot product wrappers.
+// These are thin wrappers around the worker functions (cdotcw_, etc.)
+// defined in level1_cplx_impl.h.
+//
+// Note: blas.h declares these as void, but gfortran expects complex functions
+// to return by value. We define the correct signatures here and do not include
+// blas.h to avoid the conflicting declarations.
+
+#if defined(_WIN32)
+#if defined(EIGEN_BLAS_BUILD_DLL)
+#define EIGEN_BLAS_CDOT_API __declspec(dllexport)
+#else
+#define EIGEN_BLAS_CDOT_API
+#endif
+#elif ((defined(__GNUC__) && __GNUC__ >= 4) || defined(__clang__)) && defined(EIGEN_BLAS_BUILD_DLL)
+#define EIGEN_BLAS_CDOT_API __attribute__((visibility("default")))
+#else
+#define EIGEN_BLAS_CDOT_API
+#endif
+
+extern "C" {
+
+// Worker function declarations (defined in level1_cplx_impl.h via complex_single.cpp / complex_double.cpp).
+void cdotcw_(int *n, float *cx, int *incx, float *cy, int *incy, float *res);
+void cdotuw_(int *n, float *cx, int *incx, float *cy, int *incy, float *res);
+void zdotcw_(int *n, double *cx, int *incx, double *cy, int *incy, double *res);
+void zdotuw_(int *n, double *cx, int *incx, double *cy, int *incy, double *res);
+
+// POD complex types for C-compatible return values (matches Fortran complex layout).
+struct eigen_blas_complex_float {
+  float r, i;
+};
+struct eigen_blas_complex_double {
+  double r, i;
+};
+
+// CDOTC computes the conjugated dot product of two single-precision complex vectors.
+EIGEN_BLAS_CDOT_API eigen_blas_complex_float cdotc_(int *n, float *cx, int *incx, float *cy, int *incy) {
+  eigen_blas_complex_float res = {0.0f, 0.0f};
+  cdotcw_(n, cx, incx, cy, incy, &res.r);
+  return res;
+}
+
+// CDOTU computes the unconjugated dot product of two single-precision complex vectors.
+EIGEN_BLAS_CDOT_API eigen_blas_complex_float cdotu_(int *n, float *cx, int *incx, float *cy, int *incy) {
+  eigen_blas_complex_float res = {0.0f, 0.0f};
+  cdotuw_(n, cx, incx, cy, incy, &res.r);
+  return res;
+}
+
+// ZDOTC computes the conjugated dot product of two double-precision complex vectors.
+EIGEN_BLAS_CDOT_API eigen_blas_complex_double zdotc_(int *n, double *cx, int *incx, double *cy, int *incy) {
+  eigen_blas_complex_double res = {0.0, 0.0};
+  zdotcw_(n, cx, incx, cy, incy, &res.r);
+  return res;
+}
+
+// ZDOTU computes the unconjugated dot product of two double-precision complex vectors.
+EIGEN_BLAS_CDOT_API eigen_blas_complex_double zdotu_(int *n, double *cx, int *incx, double *cy, int *incy) {
+  eigen_blas_complex_double res = {0.0, 0.0};
+  zdotuw_(n, cx, incx, cy, incy, &res.r);
+  return res;
+}
+
+}  // extern "C"
--- a/blas/f2c/chbmv.c
+++ b/blas/f2c/chbmv.c
@@ -1,456 +0,0 @@
-/* chbmv.f -- translated by f2c (version 20100827).
-   You must link the resulting object file with libf2c:
-        on Microsoft Windows system, link with libf2c.lib;
-        on Linux or Unix systems, link with .../path/to/libf2c.a -lm
-        or, if you install libf2c.a in a standard place, with -lf2c -lm
-        -- in that order, at the end of the command line, as in
-                cc *.o -lf2c -lm
-        Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
-
-                http://www.netlib.org/f2c/libf2c.zip
-*/
-
-#include "datatypes.h"
-
-static inline void r_cnjg(complex *r, complex *z) {
-  r->r = z->r;
-  r->i = -(z->i);
-}
-
-/* Subroutine */ void chbmv_(char *uplo, integer *n, integer *k, complex *alpha, complex *a, integer *lda, complex *x,
-                             integer *incx, complex *beta, complex *y, integer *incy) {
-  /* System generated locals */
-  integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5;
-  real r__1;
-  complex q__1, q__2, q__3, q__4;
-
-  /* Local variables */
-  integer i__, j, l, ix, iy, jx, jy, kx, ky, info;
-  complex temp1, temp2;
-  extern logical lsame_(char *, char *);
-  integer kplus1;
-  extern /* Subroutine */ void xerbla_(const char *, integer *);
-
-  /*     .. Scalar Arguments .. */
-  /*     .. */
-  /*     .. Array Arguments .. */
-  /*     .. */
-
-  /*  Purpose */
-  /*  ======= */
-
-  /*  CHBMV  performs the matrix-vector  operation */
-
-  /*     y := alpha*A*x + beta*y, */
-
-  /*  where alpha and beta are scalars, x and y are n element vectors and */
-  /*  A is an n by n hermitian band matrix, with k super-diagonals. */
-
-  /*  Arguments */
-  /*  ========== */
-
-  /*  UPLO   - CHARACTER*1. */
-  /*           On entry, UPLO specifies whether the upper or lower */
-  /*           triangular part of the band matrix A is being supplied as */
-  /*           follows: */
-
-  /*              UPLO = 'U' or 'u'   The upper triangular part of A is */
-  /*                                  being supplied. */
-
-  /*              UPLO = 'L' or 'l'   The lower triangular part of A is */
-  /*                                  being supplied. */
-
-  /*           Unchanged on exit. */
-
-  /*  N      - INTEGER. */
-  /*           On entry, N specifies the order of the matrix A. */
-  /*           N must be at least zero. */
-  /*           Unchanged on exit. */
-
-  /*  K      - INTEGER. */
-  /*           On entry, K specifies the number of super-diagonals of the */
-  /*           matrix A. K must satisfy  0 .le. K. */
-  /*           Unchanged on exit. */
-
-  /*  ALPHA  - COMPLEX         . */
-  /*           On entry, ALPHA specifies the scalar alpha. */
-  /*           Unchanged on exit. */
-
-  /*  A      - COMPLEX          array of DIMENSION ( LDA, n ). */
-  /*           Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */
-  /*           by n part of the array A must contain the upper triangular */
-  /*           band part of the hermitian matrix, supplied column by */
-  /*           column, with the leading diagonal of the matrix in row */
-  /*           ( k + 1 ) of the array, the first super-diagonal starting at */
-  /*           position 2 in row k, and so on. The top left k by k triangle */
-  /*           of the array A is not referenced. */
-  /*           The following program segment will transfer the upper */
-  /*           triangular part of a hermitian band matrix from conventional */
-  /*           full matrix storage to band storage: */
-
-  /*                 DO 20, J = 1, N */
-  /*                    M = K + 1 - J */
-  /*                    DO 10, I = MAX( 1, J - K ), J */
-  /*                       A( M + I, J ) = matrix( I, J ) */
-  /*              10    CONTINUE */
-  /*              20 CONTINUE */
-
-  /*           Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */
-  /*           by n part of the array A must contain the lower triangular */
-  /*           band part of the hermitian matrix, supplied column by */
-  /*           column, with the leading diagonal of the matrix in row 1 of */
-  /*           the array, the first sub-diagonal starting at position 1 in */
-  /*           row 2, and so on. The bottom right k by k triangle of the */
-  /*           array A is not referenced. */
-  /*           The following program segment will transfer the lower */
-  /*           triangular part of a hermitian band matrix from conventional */
-  /*           full matrix storage to band storage: */
-
-  /*                 DO 20, J = 1, N */
-  /*                    M = 1 - J */
-  /*                    DO 10, I = J, MIN( N, J + K ) */
-  /*                       A( M + I, J ) = matrix( I, J ) */
-  /*              10    CONTINUE */
-  /*              20 CONTINUE */
-
-  /*           Note that the imaginary parts of the diagonal elements need */
-  /*           not be set and are assumed to be zero. */
-  /*           Unchanged on exit. */
-
-  /*  LDA    - INTEGER. */
-  /*           On entry, LDA specifies the first dimension of A as declared */
-  /*           in the calling (sub) program. LDA must be at least */
-  /*           ( k + 1 ). */
-  /*           Unchanged on exit. */
-
-  /*  X      - COMPLEX          array of DIMENSION at least */
-  /*           ( 1 + ( n - 1 )*abs( INCX ) ). */
-  /*           Before entry, the incremented array X must contain the */
-  /*           vector x. */
-  /*           Unchanged on exit. */
-
-  /*  INCX   - INTEGER. */
-  /*           On entry, INCX specifies the increment for the elements of */
-  /*           X. INCX must not be zero. */
-  /*           Unchanged on exit. */
-
-  /*  BETA   - COMPLEX         . */
-  /*           On entry, BETA specifies the scalar beta. */
-  /*           Unchanged on exit. */
-
-  /*  Y      - COMPLEX          array of DIMENSION at least */
-  /*           ( 1 + ( n - 1 )*abs( INCY ) ). */
-  /*           Before entry, the incremented array Y must contain the */
-  /*           vector y. On exit, Y is overwritten by the updated vector y. */
-
-  /*  INCY   - INTEGER. */
-  /*           On entry, INCY specifies the increment for the elements of */
-  /*           Y. INCY must not be zero. */
-  /*           Unchanged on exit. */
-
-  /*  Further Details */
-  /*  =============== */
-
-  /*  Level 2 Blas routine. */
-
-  /*  -- Written on 22-October-1986. */
-  /*     Jack Dongarra, Argonne National Lab. */
-  /*     Jeremy Du Croz, Nag Central Office. */
-  /*     Sven Hammarling, Nag Central Office. */
-  /*     Richard Hanson, Sandia National Labs. */
-
-  /*  ===================================================================== */
-
-  /*     .. Parameters .. */
-  /*     .. */
-  /*     .. Local Scalars .. */
-  /*     .. */
-  /*     .. External Functions .. */
-  /*     .. */
-  /*     .. External Subroutines .. */
-  /*     .. */
-  /*     .. Intrinsic Functions .. */
-  /*     .. */
-
-  /*     Test the input parameters. */
-
-  /* Parameter adjustments */
-  a_dim1 = *lda;
-  a_offset = 1 + a_dim1;
-  a -= a_offset;
-  --x;
-  --y;
-
-  /* Function Body */
-  info = 0;
-  if (!lsame_(uplo, "U") && !lsame_(uplo, "L")) {
-    info = 1;
-  } else if (*n < 0) {
-    info = 2;
-  } else if (*k < 0) {
-    info = 3;
-  } else if (*lda < *k + 1) {
-    info = 6;
-  } else if (*incx == 0) {
-    info = 8;
-  } else if (*incy == 0) {
-    info = 11;
-  }
-  if (info != 0) {
-    xerbla_("CHBMV ", &info);
-    return;
-  }
-
-  /*     Quick return if possible. */
-
-  if (*n == 0 || (alpha->r == 0.f && alpha->i == 0.f && (beta->r == 1.f && beta->i == 0.f))) {
-    return;
-  }
-
-  /*     Set up the start points in  X  and  Y. */
-
-  if (*incx > 0) {
-    kx = 1;
-  } else {
-    kx = 1 - (*n - 1) * *incx;
-  }
-  if (*incy > 0) {
-    ky = 1;
-  } else {
-    ky = 1 - (*n - 1) * *incy;
-  }
-
-  /*     Start the operations. In this version the elements of the array A */
-  /*     are accessed sequentially with one pass through A. */
-
-  /*     First form  y := beta*y. */
-
-  if (beta->r != 1.f || beta->i != 0.f) {
-    if (*incy == 1) {
-      if (beta->r == 0.f && beta->i == 0.f) {
-        i__1 = *n;
-        for (i__ = 1; i__ <= i__1; ++i__) {
-          i__2 = i__;
-          y[i__2].r = 0.f, y[i__2].i = 0.f;
-          /* L10: */
-        }
-      } else {
-        i__1 = *n;
-        for (i__ = 1; i__ <= i__1; ++i__) {
-          i__2 = i__;
-          i__3 = i__;
-          q__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i, q__1.i = beta->r * y[i__3].i + beta->i * y[i__3].r;
-          y[i__2].r = q__1.r, y[i__2].i = q__1.i;
-          /* L20: */
-        }
-      }
-    } else {
-      iy = ky;
-      if (beta->r == 0.f && beta->i == 0.f) {
-        i__1 = *n;
-        for (i__ = 1; i__ <= i__1; ++i__) {
-          i__2 = iy;
-          y[i__2].r = 0.f, y[i__2].i = 0.f;
-          iy += *incy;
-          /* L30: */
-        }
-      } else {
-        i__1 = *n;
-        for (i__ = 1; i__ <= i__1; ++i__) {
-          i__2 = iy;
-          i__3 = iy;
-          q__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i, q__1.i = beta->r * y[i__3].i + beta->i * y[i__3].r;
-          y[i__2].r = q__1.r, y[i__2].i = q__1.i;
-          iy += *incy;
-          /* L40: */
-        }
-      }
-    }
-  }
-  if (alpha->r == 0.f && alpha->i == 0.f) {
-    return;
-  }
-  if (lsame_(uplo, "U")) {
-    /*        Form  y  when upper triangle of A is stored. */
-
-    kplus1 = *k + 1;
-    if (*incx == 1 && *incy == 1) {
-      i__1 = *n;
-      for (j = 1; j <= i__1; ++j) {
-        i__2 = j;
-        q__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, q__1.i = alpha->r * x[i__2].i + alpha->i * x[i__2].r;
-        temp1.r = q__1.r, temp1.i = q__1.i;
-        temp2.r = 0.f, temp2.i = 0.f;
-        l = kplus1 - j;
-        /* Computing MAX */
-        i__2 = 1, i__3 = j - *k;
-        i__4 = j - 1;
-        for (i__ = max(i__2, i__3); i__ <= i__4; ++i__) {
-          i__2 = i__;
-          i__3 = i__;
-          i__5 = l + i__ + j * a_dim1;
-          q__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i, q__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5].r;
-          q__1.r = y[i__3].r + q__2.r, q__1.i = y[i__3].i + q__2.i;
-          y[i__2].r = q__1.r, y[i__2].i = q__1.i;
-          r_cnjg(&q__3, &a[l + i__ + j * a_dim1]);
-          i__2 = i__;
-          q__2.r = q__3.r * x[i__2].r - q__3.i * x[i__2].i, q__2.i = q__3.r * x[i__2].i + q__3.i * x[i__2].r;
-          q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i;
-          temp2.r = q__1.r, temp2.i = q__1.i;
-          /* L50: */
-        }
-        i__4 = j;
-        i__2 = j;
-        i__3 = kplus1 + j * a_dim1;
-        r__1 = a[i__3].r;
-        q__3.r = r__1 * temp1.r, q__3.i = r__1 * temp1.i;
-        q__2.r = y[i__2].r + q__3.r, q__2.i = y[i__2].i + q__3.i;
-        q__4.r = alpha->r * temp2.r - alpha->i * temp2.i, q__4.i = alpha->r * temp2.i + alpha->i * temp2.r;
-        q__1.r = q__2.r + q__4.r, q__1.i = q__2.i + q__4.i;
-        y[i__4].r = q__1.r, y[i__4].i = q__1.i;
-        /* L60: */
-      }
-    } else {
-      jx = kx;
-      jy = ky;
-      i__1 = *n;
-      for (j = 1; j <= i__1; ++j) {
-        i__4 = jx;
-        q__1.r = alpha->r * x[i__4].r - alpha->i * x[i__4].i, q__1.i = alpha->r * x[i__4].i + alpha->i * x[i__4].r;
-        temp1.r = q__1.r, temp1.i = q__1.i;
-        temp2.r = 0.f, temp2.i = 0.f;
-        ix = kx;
-        iy = ky;
-        l = kplus1 - j;
-        /* Computing MAX */
-        i__4 = 1, i__2 = j - *k;
-        i__3 = j - 1;
-        for (i__ = max(i__4, i__2); i__ <= i__3; ++i__) {
-          i__4 = iy;
-          i__2 = iy;
-          i__5 = l + i__ + j * a_dim1;
-          q__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i, q__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5].r;
-          q__1.r = y[i__2].r + q__2.r, q__1.i = y[i__2].i + q__2.i;
-          y[i__4].r = q__1.r, y[i__4].i = q__1.i;
-          r_cnjg(&q__3, &a[l + i__ + j * a_dim1]);
-          i__4 = ix;
-          q__2.r = q__3.r * x[i__4].r - q__3.i * x[i__4].i, q__2.i = q__3.r * x[i__4].i + q__3.i * x[i__4].r;
-          q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i;
-          temp2.r = q__1.r, temp2.i = q__1.i;
-          ix += *incx;
-          iy += *incy;
-          /* L70: */
-        }
-        i__3 = jy;
-        i__4 = jy;
-        i__2 = kplus1 + j * a_dim1;
-        r__1 = a[i__2].r;
-        q__3.r = r__1 * temp1.r, q__3.i = r__1 * temp1.i;
-        q__2.r = y[i__4].r + q__3.r, q__2.i = y[i__4].i + q__3.i;
-        q__4.r = alpha->r * temp2.r - alpha->i * temp2.i, q__4.i = alpha->r * temp2.i + alpha->i * temp2.r;
-        q__1.r = q__2.r + q__4.r, q__1.i = q__2.i + q__4.i;
-        y[i__3].r = q__1.r, y[i__3].i = q__1.i;
-        jx += *incx;
-        jy += *incy;
-        if (j > *k) {
-          kx += *incx;
-          ky += *incy;
-        }
-        /* L80: */
-      }
-    }
-  } else {
-    /*        Form  y  when lower triangle of A is stored. */
-
-    if (*incx == 1 && *incy == 1) {
-      i__1 = *n;
-      for (j = 1; j <= i__1; ++j) {
-        i__3 = j;
-        q__1.r = alpha->r * x[i__3].r - alpha->i * x[i__3].i, q__1.i = alpha->r * x[i__3].i + alpha->i * x[i__3].r;
-        temp1.r = q__1.r, temp1.i = q__1.i;
-        temp2.r = 0.f, temp2.i = 0.f;
-        i__3 = j;
-        i__4 = j;
-        i__2 = j * a_dim1 + 1;
-        r__1 = a[i__2].r;
-        q__2.r = r__1 * temp1.r, q__2.i = r__1 * temp1.i;
-        q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i;
-        y[i__3].r = q__1.r, y[i__3].i = q__1.i;
-        l = 1 - j;
-        /* Computing MIN */
-        i__4 = *n, i__2 = j + *k;
-        i__3 = min(i__4, i__2);
-        for (i__ = j + 1; i__ <= i__3; ++i__) {
-          i__4 = i__;
-          i__2 = i__;
-          i__5 = l + i__ + j * a_dim1;
-          q__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i, q__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5].r;
-          q__1.r = y[i__2].r + q__2.r, q__1.i = y[i__2].i + q__2.i;
-          y[i__4].r = q__1.r, y[i__4].i = q__1.i;
-          r_cnjg(&q__3, &a[l + i__ + j * a_dim1]);
-          i__4 = i__;
-          q__2.r = q__3.r * x[i__4].r - q__3.i * x[i__4].i, q__2.i = q__3.r * x[i__4].i + q__3.i * x[i__4].r;
-          q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i;
-          temp2.r = q__1.r, temp2.i = q__1.i;
-          /* L90: */
-        }
-        i__3 = j;
-        i__4 = j;
-        q__2.r = alpha->r * temp2.r - alpha->i * temp2.i, q__2.i = alpha->r * temp2.i + alpha->i * temp2.r;
-        q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i;
-        y[i__3].r = q__1.r, y[i__3].i = q__1.i;
-        /* L100: */
-      }
-    } else {
-      jx = kx;
-      jy = ky;
-      i__1 = *n;
-      for (j = 1; j <= i__1; ++j) {
-        i__3 = jx;
-        q__1.r = alpha->r * x[i__3].r - alpha->i * x[i__3].i, q__1.i = alpha->r * x[i__3].i + alpha->i * x[i__3].r;
-        temp1.r = q__1.r, temp1.i = q__1.i;
-        temp2.r = 0.f, temp2.i = 0.f;
-        i__3 = jy;
-        i__4 = jy;
-        i__2 = j * a_dim1 + 1;
-        r__1 = a[i__2].r;
-        q__2.r = r__1 * temp1.r, q__2.i = r__1 * temp1.i;
-        q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i;
-        y[i__3].r = q__1.r, y[i__3].i = q__1.i;
-        l = 1 - j;
-        ix = jx;
-        iy = jy;
-        /* Computing MIN */
-        i__4 = *n, i__2 = j + *k;
-        i__3 = min(i__4, i__2);
-        for (i__ = j + 1; i__ <= i__3; ++i__) {
-          ix += *incx;
-          iy += *incy;
-          i__4 = iy;
-          i__2 = iy;
-          i__5 = l + i__ + j * a_dim1;
-          q__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i, q__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5].r;
-          q__1.r = y[i__2].r + q__2.r, q__1.i = y[i__2].i + q__2.i;
-          y[i__4].r = q__1.r, y[i__4].i = q__1.i;
-          r_cnjg(&q__3, &a[l + i__ + j * a_dim1]);
-          i__4 = ix;
-          q__2.r = q__3.r * x[i__4].r - q__3.i * x[i__4].i, q__2.i = q__3.r * x[i__4].i + q__3.i * x[i__4].r;
-          q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i;
-          temp2.r = q__1.r, temp2.i = q__1.i;
-          /* L110: */
-        }
-        i__3 = jy;
-        i__4 = jy;
-        q__2.r = alpha->r * temp2.r - alpha->i * temp2.i, q__2.i = alpha->r * temp2.i + alpha->i * temp2.r;
-        q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i;
-        y[i__3].r = q__1.r, y[i__3].i = q__1.i;
-        jx += *incx;
-        jy += *incy;
-        /* L120: */
-      }
-    }
-  }
-
-  /*     End of CHBMV . */
-
-} /* chbmv_ */
--- a/blas/f2c/chpmv.c
+++ b/blas/f2c/chpmv.c
@@ -1,407 +0,0 @@
-/* chpmv.f -- translated by f2c (version 20100827).
-   You must link the resulting object file with libf2c:
-        on Microsoft Windows system, link with libf2c.lib;
-        on Linux or Unix systems, link with .../path/to/libf2c.a -lm
-        or, if you install libf2c.a in a standard place, with -lf2c -lm
-        -- in that order, at the end of the command line, as in
-                cc *.o -lf2c -lm
-        Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
-
-                http://www.netlib.org/f2c/libf2c.zip
-*/
-
-#include "datatypes.h"
-
-static inline void r_cnjg(complex *r, complex *z) {
-  r->r = z->r;
-  r->i = -(z->i);
-}
-
-/* Subroutine */ void chpmv_(char *uplo, integer *n, complex *alpha, complex *ap, complex *x, integer *incx,
-                             complex *beta, complex *y, integer *incy) {
-  /* System generated locals */
-  integer i__1, i__2, i__3, i__4, i__5;
-  real r__1;
-  complex q__1, q__2, q__3, q__4;
-
-  /* Local variables */
-  integer i__, j, k, kk, ix, iy, jx, jy, kx, ky, info;
-  complex temp1, temp2;
-  extern logical lsame_(char *, char *);
-  extern /* Subroutine */ void xerbla_(const char *, integer *);
-
-  /*     .. Scalar Arguments .. */
-  /*     .. */
-  /*     .. Array Arguments .. */
-  /*     .. */
-
-  /*  Purpose */
-  /*  ======= */
-
-  /*  CHPMV  performs the matrix-vector operation */
-
-  /*     y := alpha*A*x + beta*y, */
-
-  /*  where alpha and beta are scalars, x and y are n element vectors and */
-  /*  A is an n by n hermitian matrix, supplied in packed form. */
-
-  /*  Arguments */
-  /*  ========== */
-
-  /*  UPLO   - CHARACTER*1. */
-  /*           On entry, UPLO specifies whether the upper or lower */
-  /*           triangular part of the matrix A is supplied in the packed */
-  /*           array AP as follows: */
-
-  /*              UPLO = 'U' or 'u'   The upper triangular part of A is */
-  /*                                  supplied in AP. */
-
-  /*              UPLO = 'L' or 'l'   The lower triangular part of A is */
-  /*                                  supplied in AP. */
-
-  /*           Unchanged on exit. */
-
-  /*  N      - INTEGER. */
-  /*           On entry, N specifies the order of the matrix A. */
-  /*           N must be at least zero. */
-  /*           Unchanged on exit. */
-
-  /*  ALPHA  - COMPLEX         . */
-  /*           On entry, ALPHA specifies the scalar alpha. */
-  /*           Unchanged on exit. */
-
-  /*  AP     - COMPLEX          array of DIMENSION at least */
-  /*           ( ( n*( n + 1 ) )/2 ). */
-  /*           Before entry with UPLO = 'U' or 'u', the array AP must */
-  /*           contain the upper triangular part of the hermitian matrix */
-  /*           packed sequentially, column by column, so that AP( 1 ) */
-  /*           contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) */
-  /*           and a( 2, 2 ) respectively, and so on. */
-  /*           Before entry with UPLO = 'L' or 'l', the array AP must */
-  /*           contain the lower triangular part of the hermitian matrix */
-  /*           packed sequentially, column by column, so that AP( 1 ) */
-  /*           contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) */
-  /*           and a( 3, 1 ) respectively, and so on. */
-  /*           Note that the imaginary parts of the diagonal elements need */
-  /*           not be set and are assumed to be zero. */
-  /*           Unchanged on exit. */
-
-  /*  X      - COMPLEX          array of dimension at least */
-  /*           ( 1 + ( n - 1 )*abs( INCX ) ). */
-  /*           Before entry, the incremented array X must contain the n */
-  /*           element vector x. */
-  /*           Unchanged on exit. */
-
-  /*  INCX   - INTEGER. */
-  /*           On entry, INCX specifies the increment for the elements of */
-  /*           X. INCX must not be zero. */
-  /*           Unchanged on exit. */
-
-  /*  BETA   - COMPLEX         . */
-  /*           On entry, BETA specifies the scalar beta. When BETA is */
-  /*           supplied as zero then Y need not be set on input. */
-  /*           Unchanged on exit. */
-
-  /*  Y      - COMPLEX          array of dimension at least */
-  /*           ( 1 + ( n - 1 )*abs( INCY ) ). */
-  /*           Before entry, the incremented array Y must contain the n */
-  /*           element vector y. On exit, Y is overwritten by the updated */
-  /*           vector y. */
-
-  /*  INCY   - INTEGER. */
-  /*           On entry, INCY specifies the increment for the elements of */
-  /*           Y. INCY must not be zero. */
-  /*           Unchanged on exit. */
-
-  /*  Further Details */
-  /*  =============== */
-
-  /*  Level 2 Blas routine. */
-
-  /*  -- Written on 22-October-1986. */
-  /*     Jack Dongarra, Argonne National Lab. */
-  /*     Jeremy Du Croz, Nag Central Office. */
-  /*     Sven Hammarling, Nag Central Office. */
-  /*     Richard Hanson, Sandia National Labs. */
-
-  /*  ===================================================================== */
-
-  /*     .. Parameters .. */
-  /*     .. */
-  /*     .. Local Scalars .. */
-  /*     .. */
-  /*     .. External Functions .. */
-  /*     .. */
-  /*     .. External Subroutines .. */
-  /*     .. */
-  /*     .. Intrinsic Functions .. */
-  /*     .. */
-
-  /*     Test the input parameters. */
-
-  /* Parameter adjustments */
-  --y;
-  --x;
-  --ap;
-
-  /* Function Body */
-  info = 0;
-  if (!lsame_(uplo, "U") && !lsame_(uplo, "L")) {
-    info = 1;
-  } else if (*n < 0) {
-    info = 2;
-  } else if (*incx == 0) {
-    info = 6;
-  } else if (*incy == 0) {
-    info = 9;
-  }
-  if (info != 0) {
-    xerbla_("CHPMV ", &info);
-    return;
-  }
-
-  /*     Quick return if possible. */
-
-  if (*n == 0 || (alpha->r == 0.f && alpha->i == 0.f && (beta->r == 1.f && beta->i == 0.f))) {
-    return;
-  }
-
-  /*     Set up the start points in  X  and  Y. */
-
-  if (*incx > 0) {
-    kx = 1;
-  } else {
-    kx = 1 - (*n - 1) * *incx;
-  }
-  if (*incy > 0) {
-    ky = 1;
-  } else {
-    ky = 1 - (*n - 1) * *incy;
-  }
-
-  /*     Start the operations. In this version the elements of the array AP */
-  /*     are accessed sequentially with one pass through AP. */
-
-  /*     First form  y := beta*y. */
-
-  if (beta->r != 1.f || beta->i != 0.f) {
-    if (*incy == 1) {
-      if (beta->r == 0.f && beta->i == 0.f) {
-        i__1 = *n;
-        for (i__ = 1; i__ <= i__1; ++i__) {
-          i__2 = i__;
-          y[i__2].r = 0.f, y[i__2].i = 0.f;
-          /* L10: */
-        }
-      } else {
-        i__1 = *n;
-        for (i__ = 1; i__ <= i__1; ++i__) {
-          i__2 = i__;
-          i__3 = i__;
-          q__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i, q__1.i = beta->r * y[i__3].i + beta->i * y[i__3].r;
-          y[i__2].r = q__1.r, y[i__2].i = q__1.i;
-          /* L20: */
-        }
-      }
-    } else {
-      iy = ky;
-      if (beta->r == 0.f && beta->i == 0.f) {
-        i__1 = *n;
-        for (i__ = 1; i__ <= i__1; ++i__) {
-          i__2 = iy;
-          y[i__2].r = 0.f, y[i__2].i = 0.f;
-          iy += *incy;
-          /* L30: */
-        }
-      } else {
-        i__1 = *n;
-        for (i__ = 1; i__ <= i__1; ++i__) {
-          i__2 = iy;
-          i__3 = iy;
-          q__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i, q__1.i = beta->r * y[i__3].i + beta->i * y[i__3].r;
-          y[i__2].r = q__1.r, y[i__2].i = q__1.i;
-          iy += *incy;
-          /* L40: */
-        }
-      }
-    }
-  }
-  if (alpha->r == 0.f && alpha->i == 0.f) {
-    return;
-  }
-  kk = 1;
-  if (lsame_(uplo, "U")) {
-    /*        Form  y  when AP contains the upper triangle. */
-
-    if (*incx == 1 && *incy == 1) {
-      i__1 = *n;
-      for (j = 1; j <= i__1; ++j) {
-        i__2 = j;
-        q__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, q__1.i = alpha->r * x[i__2].i + alpha->i * x[i__2].r;
-        temp1.r = q__1.r, temp1.i = q__1.i;
-        temp2.r = 0.f, temp2.i = 0.f;
-        k = kk;
-        i__2 = j - 1;
-        for (i__ = 1; i__ <= i__2; ++i__) {
-          i__3 = i__;
-          i__4 = i__;
-          i__5 = k;
-          q__2.r = temp1.r * ap[i__5].r - temp1.i * ap[i__5].i, q__2.i = temp1.r * ap[i__5].i + temp1.i * ap[i__5].r;
-          q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i;
-          y[i__3].r = q__1.r, y[i__3].i = q__1.i;
-          r_cnjg(&q__3, &ap[k]);
-          i__3 = i__;
-          q__2.r = q__3.r * x[i__3].r - q__3.i * x[i__3].i, q__2.i = q__3.r * x[i__3].i + q__3.i * x[i__3].r;
-          q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i;
-          temp2.r = q__1.r, temp2.i = q__1.i;
-          ++k;
-          /* L50: */
-        }
-        i__2 = j;
-        i__3 = j;
-        i__4 = kk + j - 1;
-        r__1 = ap[i__4].r;
-        q__3.r = r__1 * temp1.r, q__3.i = r__1 * temp1.i;
-        q__2.r = y[i__3].r + q__3.r, q__2.i = y[i__3].i + q__3.i;
-        q__4.r = alpha->r * temp2.r - alpha->i * temp2.i, q__4.i = alpha->r * temp2.i + alpha->i * temp2.r;
-        q__1.r = q__2.r + q__4.r, q__1.i = q__2.i + q__4.i;
-        y[i__2].r = q__1.r, y[i__2].i = q__1.i;
-        kk += j;
-        /* L60: */
-      }
-    } else {
-      jx = kx;
-      jy = ky;
-      i__1 = *n;
-      for (j = 1; j <= i__1; ++j) {
-        i__2 = jx;
-        q__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, q__1.i = alpha->r * x[i__2].i + alpha->i * x[i__2].r;
-        temp1.r = q__1.r, temp1.i = q__1.i;
-        temp2.r = 0.f, temp2.i = 0.f;
-        ix = kx;
-        iy = ky;
-        i__2 = kk + j - 2;
-        for (k = kk; k <= i__2; ++k) {
-          i__3 = iy;
-          i__4 = iy;
-          i__5 = k;
-          q__2.r = temp1.r * ap[i__5].r - temp1.i * ap[i__5].i, q__2.i = temp1.r * ap[i__5].i + temp1.i * ap[i__5].r;
-          q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i;
-          y[i__3].r = q__1.r, y[i__3].i = q__1.i;
-          r_cnjg(&q__3, &ap[k]);
-          i__3 = ix;
-          q__2.r = q__3.r * x[i__3].r - q__3.i * x[i__3].i, q__2.i = q__3.r * x[i__3].i + q__3.i * x[i__3].r;
-          q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i;
-          temp2.r = q__1.r, temp2.i = q__1.i;
-          ix += *incx;
-          iy += *incy;
-          /* L70: */
-        }
-        i__2 = jy;
-        i__3 = jy;
-        i__4 = kk + j - 1;
-        r__1 = ap[i__4].r;
-        q__3.r = r__1 * temp1.r, q__3.i = r__1 * temp1.i;
-        q__2.r = y[i__3].r + q__3.r, q__2.i = y[i__3].i + q__3.i;
-        q__4.r = alpha->r * temp2.r - alpha->i * temp2.i, q__4.i = alpha->r * temp2.i + alpha->i * temp2.r;
-        q__1.r = q__2.r + q__4.r, q__1.i = q__2.i + q__4.i;
-        y[i__2].r = q__1.r, y[i__2].i = q__1.i;
-        jx += *incx;
-        jy += *incy;
-        kk += j;
-        /* L80: */
-      }
-    }
-  } else {
-    /*        Form  y  when AP contains the lower triangle. */
-
-    if (*incx == 1 && *incy == 1) {
-      i__1 = *n;
-      for (j = 1; j <= i__1; ++j) {
-        i__2 = j;
-        q__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, q__1.i = alpha->r * x[i__2].i + alpha->i * x[i__2].r;
-        temp1.r = q__1.r, temp1.i = q__1.i;
-        temp2.r = 0.f, temp2.i = 0.f;
-        i__2 = j;
-        i__3 = j;
-        i__4 = kk;
-        r__1 = ap[i__4].r;
-        q__2.r = r__1 * temp1.r, q__2.i = r__1 * temp1.i;
-        q__1.r = y[i__3].r + q__2.r, q__1.i = y[i__3].i + q__2.i;
-        y[i__2].r = q__1.r, y[i__2].i = q__1.i;
-        k = kk + 1;
-        i__2 = *n;
-        for (i__ = j + 1; i__ <= i__2; ++i__) {
-          i__3 = i__;
-          i__4 = i__;
-          i__5 = k;
-          q__2.r = temp1.r * ap[i__5].r - temp1.i * ap[i__5].i, q__2.i = temp1.r * ap[i__5].i + temp1.i * ap[i__5].r;
-          q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i;
-          y[i__3].r = q__1.r, y[i__3].i = q__1.i;
-          r_cnjg(&q__3, &ap[k]);
-          i__3 = i__;
-          q__2.r = q__3.r * x[i__3].r - q__3.i * x[i__3].i, q__2.i = q__3.r * x[i__3].i + q__3.i * x[i__3].r;
-          q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i;
-          temp2.r = q__1.r, temp2.i = q__1.i;
-          ++k;
-          /* L90: */
-        }
-        i__2 = j;
-        i__3 = j;
-        q__2.r = alpha->r * temp2.r - alpha->i * temp2.i, q__2.i = alpha->r * temp2.i + alpha->i * temp2.r;
-        q__1.r = y[i__3].r + q__2.r, q__1.i = y[i__3].i + q__2.i;
-        y[i__2].r = q__1.r, y[i__2].i = q__1.i;
-        kk += *n - j + 1;
-        /* L100: */
-      }
-    } else {
-      jx = kx;
-      jy = ky;
-      i__1 = *n;
-      for (j = 1; j <= i__1; ++j) {
-        i__2 = jx;
-        q__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, q__1.i = alpha->r * x[i__2].i + alpha->i * x[i__2].r;
-        temp1.r = q__1.r, temp1.i = q__1.i;
-        temp2.r = 0.f, temp2.i = 0.f;
-        i__2 = jy;
-        i__3 = jy;
-        i__4 = kk;
-        r__1 = ap[i__4].r;
-        q__2.r = r__1 * temp1.r, q__2.i = r__1 * temp1.i;
-        q__1.r = y[i__3].r + q__2.r, q__1.i = y[i__3].i + q__2.i;
-        y[i__2].r = q__1.r, y[i__2].i = q__1.i;
-        ix = jx;
-        iy = jy;
-        i__2 = kk + *n - j;
-        for (k = kk + 1; k <= i__2; ++k) {
-          ix += *incx;
-          iy += *incy;
-          i__3 = iy;
-          i__4 = iy;
-          i__5 = k;
-          q__2.r = temp1.r * ap[i__5].r - temp1.i * ap[i__5].i, q__2.i = temp1.r * ap[i__5].i + temp1.i * ap[i__5].r;
-          q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i;
-          y[i__3].r = q__1.r, y[i__3].i = q__1.i;
-          r_cnjg(&q__3, &ap[k]);
-          i__3 = ix;
-          q__2.r = q__3.r * x[i__3].r - q__3.i * x[i__3].i, q__2.i = q__3.r * x[i__3].i + q__3.i * x[i__3].r;
-          q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i;
-          temp2.r = q__1.r, temp2.i = q__1.i;
-          /* L110: */
-        }
-        i__2 = jy;
-        i__3 = jy;
-        q__2.r = alpha->r * temp2.r - alpha->i * temp2.i, q__2.i = alpha->r * temp2.i + alpha->i * temp2.r;
-        q__1.r = y[i__3].r + q__2.r, q__1.i = y[i__3].i + q__2.i;
-        y[i__2].r = q__1.r, y[i__2].i = q__1.i;
-        jx += *incx;
-        jy += *incy;
-        kk += *n - j + 1;
-        /* L120: */
-      }
-    }
-  }
-
-  /*     End of CHPMV . */
-
-} /* chpmv_ */
--- a/blas/f2c/complexdots.c
+++ b/blas/f2c/complexdots.c
@@ -1,73 +0,0 @@
-/* This file has been modified to use the standard gfortran calling
-   convention, rather than the f2c calling convention.
-
-   It does not require -ff2c when compiled with gfortran.
-*/
-
-/* complexdots.f -- translated by f2c (version 20100827).
-   You must link the resulting object file with libf2c:
-        on Microsoft Windows system, link with libf2c.lib;
-        on Linux or Unix systems, link with .../path/to/libf2c.a -lm
-        or, if you install libf2c.a in a standard place, with -lf2c -lm
-        -- in that order, at the end of the command line, as in
-                cc *.o -lf2c -lm
-        Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
-
-                http://www.netlib.org/f2c/libf2c.zip
-*/
-
-#include "datatypes.h"
-
-complex cdotc_(integer *n, complex *cx, integer *incx, complex *cy, integer *incy) {
-  complex res;
-  extern /* Subroutine */ void cdotcw_(integer *, complex *, integer *, complex *, integer *, complex *);
-
-  /* Parameter adjustments */
-  --cy;
-  --cx;
-
-  /* Function Body */
-  cdotcw_(n, &cx[1], incx, &cy[1], incy, &res);
-  return res;
-} /* cdotc_ */
-
-complex cdotu_(integer *n, complex *cx, integer *incx, complex *cy, integer *incy) {
-  complex res;
-  extern /* Subroutine */ void cdotuw_(integer *, complex *, integer *, complex *, integer *, complex *);
-
-  /* Parameter adjustments */
-  --cy;
-  --cx;
-
-  /* Function Body */
-  cdotuw_(n, &cx[1], incx, &cy[1], incy, &res);
-  return res;
-} /* cdotu_ */
-
-doublecomplex zdotc_(integer *n, doublecomplex *cx, integer *incx, doublecomplex *cy, integer *incy) {
-  doublecomplex res;
-  extern /* Subroutine */ void zdotcw_(integer *, doublecomplex *, integer *, doublecomplex *, integer *,
-                                       doublecomplex *);
-
-  /* Parameter adjustments */
-  --cy;
-  --cx;
-
-  /* Function Body */
-  zdotcw_(n, &cx[1], incx, &cy[1], incy, &res);
-  return res;
-} /* zdotc_ */
-
-doublecomplex zdotu_(integer *n, doublecomplex *cx, integer *incx, doublecomplex *cy, integer *incy) {
-  doublecomplex res;
-  extern /* Subroutine */ void zdotuw_(integer *, doublecomplex *, integer *, doublecomplex *, integer *,
-                                       doublecomplex *);
-
-  /* Parameter adjustments */
-  --cy;
-  --cx;
-
-  /* Function Body */
-  zdotuw_(n, &cx[1], incx, &cy[1], incy, &res);
-  return res;
-} /* zdotu_ */
--- a/blas/f2c/ctbmv.c
+++ b/blas/f2c/ctbmv.c
@@ -1,586 +0,0 @@
-/* ctbmv.f -- translated by f2c (version 20100827).
-   You must link the resulting object file with libf2c:
-        on Microsoft Windows system, link with libf2c.lib;
-        on Linux or Unix systems, link with .../path/to/libf2c.a -lm
-        or, if you install libf2c.a in a standard place, with -lf2c -lm
-        -- in that order, at the end of the command line, as in
-                cc *.o -lf2c -lm
-        Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
-
-                http://www.netlib.org/f2c/libf2c.zip
-*/
-
-#include "datatypes.h"
-
-static inline void r_cnjg(complex *r, complex *z) {
-  r->r = z->r;
-  r->i = -(z->i);
-}
-
-/* Subroutine */ void ctbmv_(char *uplo, char *trans, char *diag, integer *n, integer *k, complex *a, integer *lda,
-                             complex *x, integer *incx) {
-  /* System generated locals */
-  integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5;
-  complex q__1, q__2, q__3;
-
-  /* Local variables */
-  integer i__, j, l, ix, jx, kx, info;
-  complex temp;
-  extern logical lsame_(char *, char *);
-  integer kplus1;
-  extern /* Subroutine */ void xerbla_(const char *, integer *);
-  logical noconj, nounit;
-
-  /*     .. Scalar Arguments .. */
-  /*     .. */
-  /*     .. Array Arguments .. */
-  /*     .. */
-
-  /*  Purpose */
-  /*  ======= */
-
-  /*  CTBMV  performs one of the matrix-vector operations */
-
-  /*     x := A*x,   or   x := A'*x,   or   x := conjg( A' )*x, */
-
-  /*  where x is an n element vector and  A is an n by n unit, or non-unit, */
-  /*  upper or lower triangular band matrix, with ( k + 1 ) diagonals. */
-
-  /*  Arguments */
-  /*  ========== */
-
-  /*  UPLO   - CHARACTER*1. */
-  /*           On entry, UPLO specifies whether the matrix is an upper or */
-  /*           lower triangular matrix as follows: */
-
-  /*              UPLO = 'U' or 'u'   A is an upper triangular matrix. */
-
-  /*              UPLO = 'L' or 'l'   A is a lower triangular matrix. */
-
-  /*           Unchanged on exit. */
-
-  /*  TRANS  - CHARACTER*1. */
-  /*           On entry, TRANS specifies the operation to be performed as */
-  /*           follows: */
-
-  /*              TRANS = 'N' or 'n'   x := A*x. */
-
-  /*              TRANS = 'T' or 't'   x := A'*x. */
-
-  /*              TRANS = 'C' or 'c'   x := conjg( A' )*x. */
-
-  /*           Unchanged on exit. */
-
-  /*  DIAG   - CHARACTER*1. */
-  /*           On entry, DIAG specifies whether or not A is unit */
-  /*           triangular as follows: */
-
-  /*              DIAG = 'U' or 'u'   A is assumed to be unit triangular. */
-
-  /*              DIAG = 'N' or 'n'   A is not assumed to be unit */
-  /*                                  triangular. */
-
-  /*           Unchanged on exit. */
-
-  /*  N      - INTEGER. */
-  /*           On entry, N specifies the order of the matrix A. */
-  /*           N must be at least zero. */
-  /*           Unchanged on exit. */
-
-  /*  K      - INTEGER. */
-  /*           On entry with UPLO = 'U' or 'u', K specifies the number of */
-  /*           super-diagonals of the matrix A. */
-  /*           On entry with UPLO = 'L' or 'l', K specifies the number of */
-  /*           sub-diagonals of the matrix A. */
-  /*           K must satisfy  0 .le. K. */
-  /*           Unchanged on exit. */
-
-  /*  A      - COMPLEX          array of DIMENSION ( LDA, n ). */
-  /*           Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */
-  /*           by n part of the array A must contain the upper triangular */
-  /*           band part of the matrix of coefficients, supplied column by */
-  /*           column, with the leading diagonal of the matrix in row */
-  /*           ( k + 1 ) of the array, the first super-diagonal starting at */
-  /*           position 2 in row k, and so on. The top left k by k triangle */
-  /*           of the array A is not referenced. */
-  /*           The following program segment will transfer an upper */
-  /*           triangular band matrix from conventional full matrix storage */
-  /*           to band storage: */
-
-  /*                 DO 20, J = 1, N */
-  /*                    M = K + 1 - J */
-  /*                    DO 10, I = MAX( 1, J - K ), J */
-  /*                       A( M + I, J ) = matrix( I, J ) */
-  /*              10    CONTINUE */
-  /*              20 CONTINUE */
-
-  /*           Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */
-  /*           by n part of the array A must contain the lower triangular */
-  /*           band part of the matrix of coefficients, supplied column by */
-  /*           column, with the leading diagonal of the matrix in row 1 of */
-  /*           the array, the first sub-diagonal starting at position 1 in */
-  /*           row 2, and so on. The bottom right k by k triangle of the */
-  /*           array A is not referenced. */
-  /*           The following program segment will transfer a lower */
-  /*           triangular band matrix from conventional full matrix storage */
-  /*           to band storage: */
-
-  /*                 DO 20, J = 1, N */
-  /*                    M = 1 - J */
-  /*                    DO 10, I = J, MIN( N, J + K ) */
-  /*                       A( M + I, J ) = matrix( I, J ) */
-  /*              10    CONTINUE */
-  /*              20 CONTINUE */
-
-  /*           Note that when DIAG = 'U' or 'u' the elements of the array A */
-  /*           corresponding to the diagonal elements of the matrix are not */
-  /*           referenced, but are assumed to be unity. */
-  /*           Unchanged on exit. */
-
-  /*  LDA    - INTEGER. */
-  /*           On entry, LDA specifies the first dimension of A as declared */
-  /*           in the calling (sub) program. LDA must be at least */
-  /*           ( k + 1 ). */
-  /*           Unchanged on exit. */
-
-  /*  X      - COMPLEX          array of dimension at least */
-  /*           ( 1 + ( n - 1 )*abs( INCX ) ). */
-  /*           Before entry, the incremented array X must contain the n */
-  /*           element vector x. On exit, X is overwritten with the */
-  /*           transformed vector x. */
-
-  /*  INCX   - INTEGER. */
-  /*           On entry, INCX specifies the increment for the elements of */
-  /*           X. INCX must not be zero. */
-  /*           Unchanged on exit. */
-
-  /*  Further Details */
-  /*  =============== */
-
-  /*  Level 2 Blas routine. */
-
-  /*  -- Written on 22-October-1986. */
-  /*     Jack Dongarra, Argonne National Lab. */
-  /*     Jeremy Du Croz, Nag Central Office. */
-  /*     Sven Hammarling, Nag Central Office. */
-  /*     Richard Hanson, Sandia National Labs. */
-
-  /*  ===================================================================== */
-
-  /*     .. Parameters .. */
-  /*     .. */
-  /*     .. Local Scalars .. */
-  /*     .. */
-  /*     .. External Functions .. */
-  /*     .. */
-  /*     .. External Subroutines .. */
-  /*     .. */
-  /*     .. Intrinsic Functions .. */
-  /*     .. */
-
-  /*     Test the input parameters. */
-
-  /* Parameter adjustments */
-  a_dim1 = *lda;
-  a_offset = 1 + a_dim1;
-  a -= a_offset;
-  --x;
-
-  /* Function Body */
-  info = 0;
-  if (!lsame_(uplo, "U") && !lsame_(uplo, "L")) {
-    info = 1;
-  } else if (!lsame_(trans, "N") && !lsame_(trans, "T") && !lsame_(trans, "C")) {
-    info = 2;
-  } else if (!lsame_(diag, "U") && !lsame_(diag, "N")) {
-    info = 3;
-  } else if (*n < 0) {
-    info = 4;
-  } else if (*k < 0) {
-    info = 5;
-  } else if (*lda < *k + 1) {
-    info = 7;
-  } else if (*incx == 0) {
-    info = 9;
-  }
-  if (info != 0) {
-    xerbla_("CTBMV ", &info);
-    return;
-  }
-
-  /*     Quick return if possible. */
-
-  if (*n == 0) {
-    return;
-  }
-
-  noconj = lsame_(trans, "T");
-  nounit = lsame_(diag, "N");
-
-  /*     Set up the start point in X if the increment is not unity. This */
-  /*     will be  ( N - 1 )*INCX   too small for descending loops. */
-
-  if (*incx <= 0) {
-    kx = 1 - (*n - 1) * *incx;
-  } else if (*incx != 1) {
-    kx = 1;
-  }
-
-  /*     Start the operations. In this version the elements of A are */
-  /*     accessed sequentially with one pass through A. */
-
-  if (lsame_(trans, "N")) {
-    /*         Form  x := A*x. */
-
-    if (lsame_(uplo, "U")) {
-      kplus1 = *k + 1;
-      if (*incx == 1) {
-        i__1 = *n;
-        for (j = 1; j <= i__1; ++j) {
-          i__2 = j;
-          if (x[i__2].r != 0.f || x[i__2].i != 0.f) {
-            i__2 = j;
-            temp.r = x[i__2].r, temp.i = x[i__2].i;
-            l = kplus1 - j;
-            /* Computing MAX */
-            i__2 = 1, i__3 = j - *k;
-            i__4 = j - 1;
-            for (i__ = max(i__2, i__3); i__ <= i__4; ++i__) {
-              i__2 = i__;
-              i__3 = i__;
-              i__5 = l + i__ + j * a_dim1;
-              q__2.r = temp.r * a[i__5].r - temp.i * a[i__5].i, q__2.i = temp.r * a[i__5].i + temp.i * a[i__5].r;
-              q__1.r = x[i__3].r + q__2.r, q__1.i = x[i__3].i + q__2.i;
-              x[i__2].r = q__1.r, x[i__2].i = q__1.i;
-              /* L10: */
-            }
-            if (nounit) {
-              i__4 = j;
-              i__2 = j;
-              i__3 = kplus1 + j * a_dim1;
-              q__1.r = x[i__2].r * a[i__3].r - x[i__2].i * a[i__3].i,
-              q__1.i = x[i__2].r * a[i__3].i + x[i__2].i * a[i__3].r;
-              x[i__4].r = q__1.r, x[i__4].i = q__1.i;
-            }
-          }
-          /* L20: */
-        }
-      } else {
-        jx = kx;
-        i__1 = *n;
-        for (j = 1; j <= i__1; ++j) {
-          i__4 = jx;
-          if (x[i__4].r != 0.f || x[i__4].i != 0.f) {
-            i__4 = jx;
-            temp.r = x[i__4].r, temp.i = x[i__4].i;
-            ix = kx;
-            l = kplus1 - j;
-            /* Computing MAX */
-            i__4 = 1, i__2 = j - *k;
-            i__3 = j - 1;
-            for (i__ = max(i__4, i__2); i__ <= i__3; ++i__) {
-              i__4 = ix;
-              i__2 = ix;
-              i__5 = l + i__ + j * a_dim1;
-              q__2.r = temp.r * a[i__5].r - temp.i * a[i__5].i, q__2.i = temp.r * a[i__5].i + temp.i * a[i__5].r;
-              q__1.r = x[i__2].r + q__2.r, q__1.i = x[i__2].i + q__2.i;
-              x[i__4].r = q__1.r, x[i__4].i = q__1.i;
-              ix += *incx;
-              /* L30: */
-            }
-            if (nounit) {
-              i__3 = jx;
-              i__4 = jx;
-              i__2 = kplus1 + j * a_dim1;
-              q__1.r = x[i__4].r * a[i__2].r - x[i__4].i * a[i__2].i,
-              q__1.i = x[i__4].r * a[i__2].i + x[i__4].i * a[i__2].r;
-              x[i__3].r = q__1.r, x[i__3].i = q__1.i;
-            }
-          }
-          jx += *incx;
-          if (j > *k) {
-            kx += *incx;
-          }
-          /* L40: */
-        }
-      }
-    } else {
-      if (*incx == 1) {
-        for (j = *n; j >= 1; --j) {
-          i__1 = j;
-          if (x[i__1].r != 0.f || x[i__1].i != 0.f) {
-            i__1 = j;
-            temp.r = x[i__1].r, temp.i = x[i__1].i;
-            l = 1 - j;
-            /* Computing MIN */
-            i__1 = *n, i__3 = j + *k;
-            i__4 = j + 1;
-            for (i__ = min(i__1, i__3); i__ >= i__4; --i__) {
-              i__1 = i__;
-              i__3 = i__;
-              i__2 = l + i__ + j * a_dim1;
-              q__2.r = temp.r * a[i__2].r - temp.i * a[i__2].i, q__2.i = temp.r * a[i__2].i + temp.i * a[i__2].r;
-              q__1.r = x[i__3].r + q__2.r, q__1.i = x[i__3].i + q__2.i;
-              x[i__1].r = q__1.r, x[i__1].i = q__1.i;
-              /* L50: */
-            }
-            if (nounit) {
-              i__4 = j;
-              i__1 = j;
-              i__3 = j * a_dim1 + 1;
-              q__1.r = x[i__1].r * a[i__3].r - x[i__1].i * a[i__3].i,
-              q__1.i = x[i__1].r * a[i__3].i + x[i__1].i * a[i__3].r;
-              x[i__4].r = q__1.r, x[i__4].i = q__1.i;
-            }
-          }
-          /* L60: */
-        }
-      } else {
-        kx += (*n - 1) * *incx;
-        jx = kx;
-        for (j = *n; j >= 1; --j) {
-          i__4 = jx;
-          if (x[i__4].r != 0.f || x[i__4].i != 0.f) {
-            i__4 = jx;
-            temp.r = x[i__4].r, temp.i = x[i__4].i;
-            ix = kx;
-            l = 1 - j;
-            /* Computing MIN */
-            i__4 = *n, i__1 = j + *k;
-            i__3 = j + 1;
-            for (i__ = min(i__4, i__1); i__ >= i__3; --i__) {
-              i__4 = ix;
-              i__1 = ix;
-              i__2 = l + i__ + j * a_dim1;
-              q__2.r = temp.r * a[i__2].r - temp.i * a[i__2].i, q__2.i = temp.r * a[i__2].i + temp.i * a[i__2].r;
-              q__1.r = x[i__1].r + q__2.r, q__1.i = x[i__1].i + q__2.i;
-              x[i__4].r = q__1.r, x[i__4].i = q__1.i;
-              ix -= *incx;
-              /* L70: */
-            }
-            if (nounit) {
-              i__3 = jx;
-              i__4 = jx;
-              i__1 = j * a_dim1 + 1;
-              q__1.r = x[i__4].r * a[i__1].r - x[i__4].i * a[i__1].i,
-              q__1.i = x[i__4].r * a[i__1].i + x[i__4].i * a[i__1].r;
-              x[i__3].r = q__1.r, x[i__3].i = q__1.i;
-            }
-          }
-          jx -= *incx;
-          if (*n - j >= *k) {
-            kx -= *incx;
-          }
-          /* L80: */
-        }
-      }
-    }
-  } else {
-    /*        Form  x := A'*x  or  x := conjg( A' )*x. */
-
-    if (lsame_(uplo, "U")) {
-      kplus1 = *k + 1;
-      if (*incx == 1) {
-        for (j = *n; j >= 1; --j) {
-          i__3 = j;
-          temp.r = x[i__3].r, temp.i = x[i__3].i;
-          l = kplus1 - j;
-          if (noconj) {
-            if (nounit) {
-              i__3 = kplus1 + j * a_dim1;
-              q__1.r = temp.r * a[i__3].r - temp.i * a[i__3].i, q__1.i = temp.r * a[i__3].i + temp.i * a[i__3].r;
-              temp.r = q__1.r, temp.i = q__1.i;
-            }
-            /* Computing MAX */
-            i__4 = 1, i__1 = j - *k;
-            i__3 = max(i__4, i__1);
-            for (i__ = j - 1; i__ >= i__3; --i__) {
-              i__4 = l + i__ + j * a_dim1;
-              i__1 = i__;
-              q__2.r = a[i__4].r * x[i__1].r - a[i__4].i * x[i__1].i,
-              q__2.i = a[i__4].r * x[i__1].i + a[i__4].i * x[i__1].r;
-              q__1.r = temp.r + q__2.r, q__1.i = temp.i + q__2.i;
-              temp.r = q__1.r, temp.i = q__1.i;
-              /* L90: */
-            }
-          } else {
-            if (nounit) {
-              r_cnjg(&q__2, &a[kplus1 + j * a_dim1]);
-              q__1.r = temp.r * q__2.r - temp.i * q__2.i, q__1.i = temp.r * q__2.i + temp.i * q__2.r;
-              temp.r = q__1.r, temp.i = q__1.i;
-            }
-            /* Computing MAX */
-            i__4 = 1, i__1 = j - *k;
-            i__3 = max(i__4, i__1);
-            for (i__ = j - 1; i__ >= i__3; --i__) {
-              r_cnjg(&q__3, &a[l + i__ + j * a_dim1]);
-              i__4 = i__;
-              q__2.r = q__3.r * x[i__4].r - q__3.i * x[i__4].i, q__2.i = q__3.r * x[i__4].i + q__3.i * x[i__4].r;
-              q__1.r = temp.r + q__2.r, q__1.i = temp.i + q__2.i;
-              temp.r = q__1.r, temp.i = q__1.i;
-              /* L100: */
-            }
-          }
-          i__3 = j;
-          x[i__3].r = temp.r, x[i__3].i = temp.i;
-          /* L110: */
-        }
-      } else {
-        kx += (*n - 1) * *incx;
-        jx = kx;
-        for (j = *n; j >= 1; --j) {
-          i__3 = jx;
-          temp.r = x[i__3].r, temp.i = x[i__3].i;
-          kx -= *incx;
-          ix = kx;
-          l = kplus1 - j;
-          if (noconj) {
-            if (nounit) {
-              i__3 = kplus1 + j * a_dim1;
-              q__1.r = temp.r * a[i__3].r - temp.i * a[i__3].i, q__1.i = temp.r * a[i__3].i + temp.i * a[i__3].r;
-              temp.r = q__1.r, temp.i = q__1.i;
-            }
-            /* Computing MAX */
-            i__4 = 1, i__1 = j - *k;
-            i__3 = max(i__4, i__1);
-            for (i__ = j - 1; i__ >= i__3; --i__) {
-              i__4 = l + i__ + j * a_dim1;
-              i__1 = ix;
-              q__2.r = a[i__4].r * x[i__1].r - a[i__4].i * x[i__1].i,
-              q__2.i = a[i__4].r * x[i__1].i + a[i__4].i * x[i__1].r;
-              q__1.r = temp.r + q__2.r, q__1.i = temp.i + q__2.i;
-              temp.r = q__1.r, temp.i = q__1.i;
-              ix -= *incx;
-              /* L120: */
-            }
-          } else {
-            if (nounit) {
-              r_cnjg(&q__2, &a[kplus1 + j * a_dim1]);
-              q__1.r = temp.r * q__2.r - temp.i * q__2.i, q__1.i = temp.r * q__2.i + temp.i * q__2.r;
-              temp.r = q__1.r, temp.i = q__1.i;
-            }
-            /* Computing MAX */
-            i__4 = 1, i__1 = j - *k;
-            i__3 = max(i__4, i__1);
-            for (i__ = j - 1; i__ >= i__3; --i__) {
-              r_cnjg(&q__3, &a[l + i__ + j * a_dim1]);
-              i__4 = ix;
-              q__2.r = q__3.r * x[i__4].r - q__3.i * x[i__4].i, q__2.i = q__3.r * x[i__4].i + q__3.i * x[i__4].r;
-              q__1.r = temp.r + q__2.r, q__1.i = temp.i + q__2.i;
-              temp.r = q__1.r, temp.i = q__1.i;
-              ix -= *incx;
-              /* L130: */
-            }
-          }
-          i__3 = jx;
-          x[i__3].r = temp.r, x[i__3].i = temp.i;
-          jx -= *incx;
-          /* L140: */
-        }
-      }
-    } else {
-      if (*incx == 1) {
-        i__3 = *n;
-        for (j = 1; j <= i__3; ++j) {
-          i__4 = j;
-          temp.r = x[i__4].r, temp.i = x[i__4].i;
-          l = 1 - j;
-          if (noconj) {
-            if (nounit) {
-              i__4 = j * a_dim1 + 1;
-              q__1.r = temp.r * a[i__4].r - temp.i * a[i__4].i, q__1.i = temp.r * a[i__4].i + temp.i * a[i__4].r;
-              temp.r = q__1.r, temp.i = q__1.i;
-            }
-            /* Computing MIN */
-            i__1 = *n, i__2 = j + *k;
-            i__4 = min(i__1, i__2);
-            for (i__ = j + 1; i__ <= i__4; ++i__) {
-              i__1 = l + i__ + j * a_dim1;
-              i__2 = i__;
-              q__2.r = a[i__1].r * x[i__2].r - a[i__1].i * x[i__2].i,
-              q__2.i = a[i__1].r * x[i__2].i + a[i__1].i * x[i__2].r;
-              q__1.r = temp.r + q__2.r, q__1.i = temp.i + q__2.i;
-              temp.r = q__1.r, temp.i = q__1.i;
-              /* L150: */
-            }
-          } else {
-            if (nounit) {
-              r_cnjg(&q__2, &a[j * a_dim1 + 1]);
-              q__1.r = temp.r * q__2.r - temp.i * q__2.i, q__1.i = temp.r * q__2.i + temp.i * q__2.r;
-              temp.r = q__1.r, temp.i = q__1.i;
-            }
-            /* Computing MIN */
-            i__1 = *n, i__2 = j + *k;
-            i__4 = min(i__1, i__2);
-            for (i__ = j + 1; i__ <= i__4; ++i__) {
-              r_cnjg(&q__3, &a[l + i__ + j * a_dim1]);
-              i__1 = i__;
-              q__2.r = q__3.r * x[i__1].r - q__3.i * x[i__1].i, q__2.i = q__3.r * x[i__1].i + q__3.i * x[i__1].r;
-              q__1.r = temp.r + q__2.r, q__1.i = temp.i + q__2.i;
-              temp.r = q__1.r, temp.i = q__1.i;
-              /* L160: */
-            }
-          }
-          i__4 = j;
-          x[i__4].r = temp.r, x[i__4].i = temp.i;
-          /* L170: */
-        }
-      } else {
-        jx = kx;
-        i__3 = *n;
-        for (j = 1; j <= i__3; ++j) {
-          i__4 = jx;
-          temp.r = x[i__4].r, temp.i = x[i__4].i;
-          kx += *incx;
-          ix = kx;
-          l = 1 - j;
-          if (noconj) {
-            if (nounit) {
-              i__4 = j * a_dim1 + 1;
-              q__1.r = temp.r * a[i__4].r - temp.i * a[i__4].i, q__1.i = temp.r * a[i__4].i + temp.i * a[i__4].r;
-              temp.r = q__1.r, temp.i = q__1.i;
-            }
-            /* Computing MIN */
-            i__1 = *n, i__2 = j + *k;
-            i__4 = min(i__1, i__2);
-            for (i__ = j + 1; i__ <= i__4; ++i__) {
-              i__1 = l + i__ + j * a_dim1;
-              i__2 = ix;
-              q__2.r = a[i__1].r * x[i__2].r - a[i__1].i * x[i__2].i,
-              q__2.i = a[i__1].r * x[i__2].i + a[i__1].i * x[i__2].r;
-              q__1.r = temp.r + q__2.r, q__1.i = temp.i + q__2.i;
-              temp.r = q__1.r, temp.i = q__1.i;
-              ix += *incx;
-              /* L180: */
-            }
-          } else {
-            if (nounit) {
-              r_cnjg(&q__2, &a[j * a_dim1 + 1]);
-              q__1.r = temp.r * q__2.r - temp.i * q__2.i, q__1.i = temp.r * q__2.i + temp.i * q__2.r;
-              temp.r = q__1.r, temp.i = q__1.i;
-            }
-            /* Computing MIN */
-            i__1 = *n, i__2 = j + *k;
-            i__4 = min(i__1, i__2);
-            for (i__ = j + 1; i__ <= i__4; ++i__) {
-              r_cnjg(&q__3, &a[l + i__ + j * a_dim1]);
-              i__1 = ix;
-              q__2.r = q__3.r * x[i__1].r - q__3.i * x[i__1].i, q__2.i = q__3.r * x[i__1].i + q__3.i * x[i__1].r;
-              q__1.r = temp.r + q__2.r, q__1.i = temp.i + q__2.i;
-              temp.r = q__1.r, temp.i = q__1.i;
-              ix += *incx;
-              /* L190: */
-            }
-          }
-          i__4 = jx;
-          x[i__4].r = temp.r, x[i__4].i = temp.i;
-          jx += *incx;
-          /* L200: */
-        }
-      }
-    }
-  }
-
-  /*     End of CTBMV . */
-
-} /* ctbmv_ */
--- a/blas/f2c/datatypes.h
+++ b/blas/f2c/datatypes.h
@@ -1,27 +0,0 @@
-/* This contains a limited subset of the typedefs exposed by f2c
-   for use by the Eigen BLAS C-only implementation.
-*/
-
-#ifndef __EIGEN_DATATYPES_H__
-#define __EIGEN_DATATYPES_H__
-
-typedef int integer;
-typedef unsigned int uinteger;
-typedef float real;
-typedef double doublereal;
-typedef struct {
-  real r, i;
-} complex;
-typedef struct {
-  doublereal r, i;
-} doublecomplex;
-typedef int logical;
-
-#define abs(x) ((x) >= 0 ? (x) : -(x))
-#define dabs(x) (doublereal) abs(x)
-#define min(a, b) ((a) <= (b) ? (a) : (b))
-#define max(a, b) ((a) >= (b) ? (a) : (b))
-#define dmin(a, b) (doublereal) min(a, b)
-#define dmax(a, b) (doublereal) max(a, b)
-
-#endif
--- a/blas/f2c/drotm.c
+++ b/blas/f2c/drotm.c
@@ -1,213 +0,0 @@
-/* drotm.f -- translated by f2c (version 20100827).
-   You must link the resulting object file with libf2c:
-        on Microsoft Windows system, link with libf2c.lib;
-        on Linux or Unix systems, link with .../path/to/libf2c.a -lm
-        or, if you install libf2c.a in a standard place, with -lf2c -lm
-        -- in that order, at the end of the command line, as in
-                cc *.o -lf2c -lm
-        Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
-
-                http://www.netlib.org/f2c/libf2c.zip
-*/
-
-#include "datatypes.h"
-
-/* Subroutine */ void drotm_(integer *n, doublereal *dx, integer *incx, doublereal *dy, integer *incy,
-                             doublereal *dparam) {
-  /* Initialized data */
-
-  static doublereal zero = 0.;
-  static doublereal two = 2.;
-
-  /* System generated locals */
-  integer i__1, i__2;
-
-  /* Local variables */
-  integer i__;
-  doublereal w, z__;
-  integer kx, ky;
-  doublereal dh11, dh12, dh21, dh22, dflag;
-  integer nsteps;
-
-  /*     .. Scalar Arguments .. */
-  /*     .. */
-  /*     .. Array Arguments .. */
-  /*     .. */
-
-  /*  Purpose */
-  /*  ======= */
-
-  /*     APPLY THE MODIFIED GIVENS TRANSFORMATION, H, TO THE 2 BY N MATRIX */
-
-  /*     (DX**T) , WHERE **T INDICATES TRANSPOSE. THE ELEMENTS OF DX ARE IN */
-  /*     (DY**T) */
-
-  /*     DX(LX+I*INCX), I = 0 TO N-1, WHERE LX = 1 IF INCX .GE. 0, ELSE */
-  /*     LX = (-INCX)*N, AND SIMILARLY FOR SY USING LY AND INCY. */
-  /*     WITH DPARAM(1)=DFLAG, H HAS ONE OF THE FOLLOWING FORMS.. */
-
-  /*     DFLAG=-1.D0     DFLAG=0.D0        DFLAG=1.D0     DFLAG=-2.D0 */
-
-  /*       (DH11  DH12)    (1.D0  DH12)    (DH11  1.D0)    (1.D0  0.D0) */
-  /*     H=(          )    (          )    (          )    (          ) */
-  /*       (DH21  DH22),   (DH21  1.D0),   (-1.D0 DH22),   (0.D0  1.D0). */
-  /*     SEE DROTMG FOR A DESCRIPTION OF DATA STORAGE IN DPARAM. */
-
-  /*  Arguments */
-  /*  ========= */
-
-  /*  N      (input) INTEGER */
-  /*         number of elements in input vector(s) */
-
-  /*  DX     (input/output) DOUBLE PRECISION array, dimension N */
-  /*         double precision vector with N elements */
-
-  /*  INCX   (input) INTEGER */
-  /*         storage spacing between elements of DX */
-
-  /*  DY     (input/output) DOUBLE PRECISION array, dimension N */
-  /*         double precision vector with N elements */
-
-  /*  INCY   (input) INTEGER */
-  /*         storage spacing between elements of DY */
-
-  /*  DPARAM (input/output)  DOUBLE PRECISION array, dimension 5 */
-  /*     DPARAM(1)=DFLAG */
-  /*     DPARAM(2)=DH11 */
-  /*     DPARAM(3)=DH21 */
-  /*     DPARAM(4)=DH12 */
-  /*     DPARAM(5)=DH22 */
-
-  /*  ===================================================================== */
-
-  /*     .. Local Scalars .. */
-  /*     .. */
-  /*     .. Data statements .. */
-  /* Parameter adjustments */
-  --dparam;
-  --dy;
-  --dx;
-
-  /* Function Body */
-  /*     .. */
-
-  dflag = dparam[1];
-  if (*n <= 0 || dflag + two == zero) {
-    goto L140;
-  }
-  if (!(*incx == *incy && *incx > 0)) {
-    goto L70;
-  }
-
-  nsteps = *n * *incx;
-  if (dflag < 0.) {
-    goto L50;
-  } else if (dflag == 0) {
-    goto L10;
-  } else {
-    goto L30;
-  }
-L10:
-  dh12 = dparam[4];
-  dh21 = dparam[3];
-  i__1 = nsteps;
-  i__2 = *incx;
-  for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
-    w = dx[i__];
-    z__ = dy[i__];
-    dx[i__] = w + z__ * dh12;
-    dy[i__] = w * dh21 + z__;
-    /* L20: */
-  }
-  goto L140;
-L30:
-  dh11 = dparam[2];
-  dh22 = dparam[5];
-  i__2 = nsteps;
-  i__1 = *incx;
-  for (i__ = 1; i__1 < 0 ? i__ >= i__2 : i__ <= i__2; i__ += i__1) {
-    w = dx[i__];
-    z__ = dy[i__];
-    dx[i__] = w * dh11 + z__;
-    dy[i__] = -w + dh22 * z__;
-    /* L40: */
-  }
-  goto L140;
-L50:
-  dh11 = dparam[2];
-  dh12 = dparam[4];
-  dh21 = dparam[3];
-  dh22 = dparam[5];
-  i__1 = nsteps;
-  i__2 = *incx;
-  for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
-    w = dx[i__];
-    z__ = dy[i__];
-    dx[i__] = w * dh11 + z__ * dh12;
-    dy[i__] = w * dh21 + z__ * dh22;
-    /* L60: */
-  }
-  goto L140;
-L70:
-  kx = 1;
-  ky = 1;
-  if (*incx < 0) {
-    kx = (1 - *n) * *incx + 1;
-  }
-  if (*incy < 0) {
-    ky = (1 - *n) * *incy + 1;
-  }
-
-  if (dflag < 0.) {
-    goto L120;
-  } else if (dflag == 0) {
-    goto L80;
-  } else {
-    goto L100;
-  }
-L80:
-  dh12 = dparam[4];
-  dh21 = dparam[3];
-  i__2 = *n;
-  for (i__ = 1; i__ <= i__2; ++i__) {
-    w = dx[kx];
-    z__ = dy[ky];
-    dx[kx] = w + z__ * dh12;
-    dy[ky] = w * dh21 + z__;
-    kx += *incx;
-    ky += *incy;
-    /* L90: */
-  }
-  goto L140;
-L100:
-  dh11 = dparam[2];
-  dh22 = dparam[5];
-  i__2 = *n;
-  for (i__ = 1; i__ <= i__2; ++i__) {
-    w = dx[kx];
-    z__ = dy[ky];
-    dx[kx] = w * dh11 + z__;
-    dy[ky] = -w + dh22 * z__;
-    kx += *incx;
-    ky += *incy;
-    /* L110: */
-  }
-  goto L140;
-L120:
-  dh11 = dparam[2];
-  dh12 = dparam[4];
-  dh21 = dparam[3];
-  dh22 = dparam[5];
-  i__2 = *n;
-  for (i__ = 1; i__ <= i__2; ++i__) {
-    w = dx[kx];
-    z__ = dy[ky];
-    dx[kx] = w * dh11 + z__ * dh12;
-    dy[ky] = w * dh21 + z__ * dh22;
-    kx += *incx;
-    ky += *incy;
-    /* L130: */
-  }
-L140:
-  return;
-} /* drotm_ */
--- a/blas/f2c/drotmg.c
+++ b/blas/f2c/drotmg.c
@@ -1,293 +0,0 @@
-/* drotmg.f -- translated by f2c (version 20100827).
-   You must link the resulting object file with libf2c:
-        on Microsoft Windows system, link with libf2c.lib;
-        on Linux or Unix systems, link with .../path/to/libf2c.a -lm
-        or, if you install libf2c.a in a standard place, with -lf2c -lm
-        -- in that order, at the end of the command line, as in
-                cc *.o -lf2c -lm
-        Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
-
-                http://www.netlib.org/f2c/libf2c.zip
-*/
-
-#include "datatypes.h"
-
-/* Subroutine */ void drotmg_(doublereal *dd1, doublereal *dd2, doublereal *dx1, doublereal *dy1, doublereal *dparam) {
-  /* Initialized data */
-
-  static doublereal zero = 0.;
-  static doublereal one = 1.;
-  static doublereal two = 2.;
-  static doublereal gam = 4096.;
-  static doublereal gamsq = 16777216.;
-  static doublereal rgamsq = 5.9604645e-8;
-
-  /* Format strings */
-  static char fmt_120[] = "";
-  static char fmt_150[] = "";
-  static char fmt_180[] = "";
-  static char fmt_210[] = "";
-
-  /* System generated locals */
-  doublereal d__1;
-
-  /* Local variables */
-  doublereal du, dp1, dp2, dq1, dq2, dh11, dh12, dh21, dh22;
-  integer igo;
-  doublereal dflag, dtemp;
-
-  /* Assigned format variables */
-  static char *igo_fmt;
-  (void)igo_fmt;
-
-  /*     .. Scalar Arguments .. */
-  /*     .. */
-  /*     .. Array Arguments .. */
-  /*     .. */
-
-  /*  Purpose */
-  /*  ======= */
-
-  /*     CONSTRUCT THE MODIFIED GIVENS TRANSFORMATION MATRIX H WHICH ZEROS */
-  /*     THE SECOND COMPONENT OF THE 2-VECTOR  (DSQRT(DD1)*DX1,DSQRT(DD2)* */
-  /*     DY2)**T. */
-  /*     WITH DPARAM(1)=DFLAG, H HAS ONE OF THE FOLLOWING FORMS.. */
-
-  /*     DFLAG=-1.D0     DFLAG=0.D0        DFLAG=1.D0     DFLAG=-2.D0 */
-
-  /*       (DH11  DH12)    (1.D0  DH12)    (DH11  1.D0)    (1.D0  0.D0) */
-  /*     H=(          )    (          )    (          )    (          ) */
-  /*       (DH21  DH22),   (DH21  1.D0),   (-1.D0 DH22),   (0.D0  1.D0). */
-  /*     LOCATIONS 2-4 OF DPARAM CONTAIN DH11, DH21, DH12, AND DH22 */
-  /*     RESPECTIVELY. (VALUES OF 1.D0, -1.D0, OR 0.D0 IMPLIED BY THE */
-  /*     VALUE OF DPARAM(1) ARE NOT STORED IN DPARAM.) */
-
-  /*     THE VALUES OF GAMSQ AND RGAMSQ SET IN THE DATA STATEMENT MAY BE */
-  /*     INEXACT.  THIS IS OK AS THEY ARE ONLY USED FOR TESTING THE SIZE */
-  /*     OF DD1 AND DD2.  ALL ACTUAL SCALING OF DATA IS DONE USING GAM. */
-
-  /*  Arguments */
-  /*  ========= */
-
-  /*  DD1    (input/output) DOUBLE PRECISION */
-
-  /*  DD2    (input/output) DOUBLE PRECISION */
-
-  /*  DX1    (input/output) DOUBLE PRECISION */
-
-  /*  DY1    (input) DOUBLE PRECISION */
-
-  /*  DPARAM (input/output)  DOUBLE PRECISION array, dimension 5 */
-  /*     DPARAM(1)=DFLAG */
-  /*     DPARAM(2)=DH11 */
-  /*     DPARAM(3)=DH21 */
-  /*     DPARAM(4)=DH12 */
-  /*     DPARAM(5)=DH22 */
-
-  /*  ===================================================================== */
-
-  /*     .. Local Scalars .. */
-  /*     .. */
-  /*     .. Intrinsic Functions .. */
-  /*     .. */
-  /*     .. Data statements .. */
-
-  /* Parameter adjustments */
-  --dparam;
-
-  /* Function Body */
-  /*     .. */
-  if (!(*dd1 < zero)) {
-    goto L10;
-  }
-  /*       GO ZERO-H-D-AND-DX1.. */
-  goto L60;
-L10:
-  /*     CASE-DD1-NONNEGATIVE */
-  dp2 = *dd2 * *dy1;
-  if (!(dp2 == zero)) {
-    goto L20;
-  }
-  dflag = -two;
-  goto L260;
-/*     REGULAR-CASE.. */
-L20:
-  dp1 = *dd1 * *dx1;
-  dq2 = dp2 * *dy1;
-  dq1 = dp1 * *dx1;
-
-  if (!(abs(dq1) > abs(dq2))) {
-    goto L40;
-  }
-  dh21 = -(*dy1) / *dx1;
-  dh12 = dp2 / dp1;
-
-  du = one - dh12 * dh21;
-
-  if (!(du <= zero)) {
-    goto L30;
-  }
-  /*         GO ZERO-H-D-AND-DX1.. */
-  goto L60;
-L30:
-  dflag = zero;
-  *dd1 /= du;
-  *dd2 /= du;
-  *dx1 *= du;
-  /*         GO SCALE-CHECK.. */
-  goto L100;
-L40:
-  if (!(dq2 < zero)) {
-    goto L50;
-  }
-  /*         GO ZERO-H-D-AND-DX1.. */
-  goto L60;
-L50:
-  dflag = one;
-  dh11 = dp1 / dp2;
-  dh22 = *dx1 / *dy1;
-  du = one + dh11 * dh22;
-  dtemp = *dd2 / du;
-  *dd2 = *dd1 / du;
-  *dd1 = dtemp;
-  *dx1 = *dy1 * du;
-  /*         GO SCALE-CHECK */
-  goto L100;
-/*     PROCEDURE..ZERO-H-D-AND-DX1.. */
-L60:
-  dflag = -one;
-  dh11 = zero;
-  dh12 = zero;
-  dh21 = zero;
-  dh22 = zero;
-
-  *dd1 = zero;
-  *dd2 = zero;
-  *dx1 = zero;
-  /*         RETURN.. */
-  goto L220;
-/*     PROCEDURE..FIX-H.. */
-L70:
-  if (!(dflag >= zero)) {
-    goto L90;
-  }
-
-  if (!(dflag == zero)) {
-    goto L80;
-  }
-  dh11 = one;
-  dh22 = one;
-  dflag = -one;
-  goto L90;
-L80:
-  dh21 = -one;
-  dh12 = one;
-  dflag = -one;
-L90:
-  switch (igo) {
-    case 0:
-      goto L120;
-    case 1:
-      goto L150;
-    case 2:
-      goto L180;
-    case 3:
-      goto L210;
-  }
-/*     PROCEDURE..SCALE-CHECK */
-L100:
-L110:
-  if (!(*dd1 <= rgamsq)) {
-    goto L130;
-  }
-  if (*dd1 == zero) {
-    goto L160;
-  }
-  igo = 0;
-  igo_fmt = fmt_120;
-  /*              FIX-H.. */
-  goto L70;
-L120:
-  /* Computing 2nd power */
-  d__1 = gam;
-  *dd1 *= d__1 * d__1;
-  *dx1 /= gam;
-  dh11 /= gam;
-  dh12 /= gam;
-  goto L110;
-L130:
-L140:
-  if (!(*dd1 >= gamsq)) {
-    goto L160;
-  }
-  igo = 1;
-  igo_fmt = fmt_150;
-  /*              FIX-H.. */
-  goto L70;
-L150:
-  /* Computing 2nd power */
-  d__1 = gam;
-  *dd1 /= d__1 * d__1;
-  *dx1 *= gam;
-  dh11 *= gam;
-  dh12 *= gam;
-  goto L140;
-L160:
-L170:
-  if (!(abs(*dd2) <= rgamsq)) {
-    goto L190;
-  }
-  if (*dd2 == zero) {
-    goto L220;
-  }
-  igo = 2;
-  igo_fmt = fmt_180;
-  /*              FIX-H.. */
-  goto L70;
-L180:
-  /* Computing 2nd power */
-  d__1 = gam;
-  *dd2 *= d__1 * d__1;
-  dh21 /= gam;
-  dh22 /= gam;
-  goto L170;
-L190:
-L200:
-  if (!(abs(*dd2) >= gamsq)) {
-    goto L220;
-  }
-  igo = 3;
-  igo_fmt = fmt_210;
-  /*              FIX-H.. */
-  goto L70;
-L210:
-  /* Computing 2nd power */
-  d__1 = gam;
-  *dd2 /= d__1 * d__1;
-  dh21 *= gam;
-  dh22 *= gam;
-  goto L200;
-L220:
-  if (dflag < 0.) {
-    goto L250;
-  } else if (dflag == 0) {
-    goto L230;
-  } else {
-    goto L240;
-  }
-L230:
-  dparam[3] = dh21;
-  dparam[4] = dh12;
-  goto L260;
-L240:
-  dparam[2] = dh11;
-  dparam[5] = dh22;
-  goto L260;
-L250:
-  dparam[2] = dh11;
-  dparam[3] = dh21;
-  dparam[4] = dh12;
-  dparam[5] = dh22;
-L260:
-  dparam[1] = dflag;
-} /* drotmg_ */
--- a/blas/f2c/dsbmv.c
+++ b/blas/f2c/dsbmv.c
@@ -1,356 +0,0 @@
-/* dsbmv.f -- translated by f2c (version 20100827).
-   You must link the resulting object file with libf2c:
-        on Microsoft Windows system, link with libf2c.lib;
-        on Linux or Unix systems, link with .../path/to/libf2c.a -lm
-        or, if you install libf2c.a in a standard place, with -lf2c -lm
-        -- in that order, at the end of the command line, as in
-                cc *.o -lf2c -lm
-        Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
-
-                http://www.netlib.org/f2c/libf2c.zip
-*/
-
-#include "datatypes.h"
-
-/* Subroutine */ void dsbmv_(char *uplo, integer *n, integer *k, doublereal *alpha, doublereal *a, integer *lda,
-                             doublereal *x, integer *incx, doublereal *beta, doublereal *y, integer *incy) {
-  /* System generated locals */
-  integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
-
-  /* Local variables */
-  integer i__, j, l, ix, iy, jx, jy, kx, ky, info;
-  doublereal temp1, temp2;
-  extern logical lsame_(char *, char *);
-  integer kplus1;
-  extern /* Subroutine */ void xerbla_(const char *, integer *);
-
-  /*     .. Scalar Arguments .. */
-  /*     .. */
-  /*     .. Array Arguments .. */
-  /*     .. */
-
-  /*  Purpose */
-  /*  ======= */
-
-  /*  DSBMV  performs the matrix-vector  operation */
-
-  /*     y := alpha*A*x + beta*y, */
-
-  /*  where alpha and beta are scalars, x and y are n element vectors and */
-  /*  A is an n by n symmetric band matrix, with k super-diagonals. */
-
-  /*  Arguments */
-  /*  ========== */
-
-  /*  UPLO   - CHARACTER*1. */
-  /*           On entry, UPLO specifies whether the upper or lower */
-  /*           triangular part of the band matrix A is being supplied as */
-  /*           follows: */
-
-  /*              UPLO = 'U' or 'u'   The upper triangular part of A is */
-  /*                                  being supplied. */
-
-  /*              UPLO = 'L' or 'l'   The lower triangular part of A is */
-  /*                                  being supplied. */
-
-  /*           Unchanged on exit. */
-
-  /*  N      - INTEGER. */
-  /*           On entry, N specifies the order of the matrix A. */
-  /*           N must be at least zero. */
-  /*           Unchanged on exit. */
-
-  /*  K      - INTEGER. */
-  /*           On entry, K specifies the number of super-diagonals of the */
-  /*           matrix A. K must satisfy  0 .le. K. */
-  /*           Unchanged on exit. */
-
-  /*  ALPHA  - DOUBLE PRECISION. */
-  /*           On entry, ALPHA specifies the scalar alpha. */
-  /*           Unchanged on exit. */
-
-  /*  A      - DOUBLE PRECISION array of DIMENSION ( LDA, n ). */
-  /*           Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */
-  /*           by n part of the array A must contain the upper triangular */
-  /*           band part of the symmetric matrix, supplied column by */
-  /*           column, with the leading diagonal of the matrix in row */
-  /*           ( k + 1 ) of the array, the first super-diagonal starting at */
-  /*           position 2 in row k, and so on. The top left k by k triangle */
-  /*           of the array A is not referenced. */
-  /*           The following program segment will transfer the upper */
-  /*           triangular part of a symmetric band matrix from conventional */
-  /*           full matrix storage to band storage: */
-
-  /*                 DO 20, J = 1, N */
-  /*                    M = K + 1 - J */
-  /*                    DO 10, I = MAX( 1, J - K ), J */
-  /*                       A( M + I, J ) = matrix( I, J ) */
-  /*              10    CONTINUE */
-  /*              20 CONTINUE */
-
-  /*           Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */
-  /*           by n part of the array A must contain the lower triangular */
-  /*           band part of the symmetric matrix, supplied column by */
-  /*           column, with the leading diagonal of the matrix in row 1 of */
-  /*           the array, the first sub-diagonal starting at position 1 in */
-  /*           row 2, and so on. The bottom right k by k triangle of the */
-  /*           array A is not referenced. */
-  /*           The following program segment will transfer the lower */
-  /*           triangular part of a symmetric band matrix from conventional */
-  /*           full matrix storage to band storage: */
-
-  /*                 DO 20, J = 1, N */
-  /*                    M = 1 - J */
-  /*                    DO 10, I = J, MIN( N, J + K ) */
-  /*                       A( M + I, J ) = matrix( I, J ) */
-  /*              10    CONTINUE */
-  /*              20 CONTINUE */
-
-  /*           Unchanged on exit. */
-
-  /*  LDA    - INTEGER. */
-  /*           On entry, LDA specifies the first dimension of A as declared */
-  /*           in the calling (sub) program. LDA must be at least */
-  /*           ( k + 1 ). */
-  /*           Unchanged on exit. */
-
-  /*  X      - DOUBLE PRECISION array of DIMENSION at least */
-  /*           ( 1 + ( n - 1 )*abs( INCX ) ). */
-  /*           Before entry, the incremented array X must contain the */
-  /*           vector x. */
-  /*           Unchanged on exit. */
-
-  /*  INCX   - INTEGER. */
-  /*           On entry, INCX specifies the increment for the elements of */
-  /*           X. INCX must not be zero. */
-  /*           Unchanged on exit. */
-
-  /*  BETA   - DOUBLE PRECISION. */
-  /*           On entry, BETA specifies the scalar beta. */
-  /*           Unchanged on exit. */
-
-  /*  Y      - DOUBLE PRECISION array of DIMENSION at least */
-  /*           ( 1 + ( n - 1 )*abs( INCY ) ). */
-  /*           Before entry, the incremented array Y must contain the */
-  /*           vector y. On exit, Y is overwritten by the updated vector y. */
-
-  /*  INCY   - INTEGER. */
-  /*           On entry, INCY specifies the increment for the elements of */
-  /*           Y. INCY must not be zero. */
-  /*           Unchanged on exit. */
-
-  /*  Level 2 Blas routine. */
-
-  /*  -- Written on 22-October-1986. */
-  /*     Jack Dongarra, Argonne National Lab. */
-  /*     Jeremy Du Croz, Nag Central Office. */
-  /*     Sven Hammarling, Nag Central Office. */
-  /*     Richard Hanson, Sandia National Labs. */
-
-  /*  ===================================================================== */
-
-  /*     .. Parameters .. */
-  /*     .. */
-  /*     .. Local Scalars .. */
-  /*     .. */
-  /*     .. External Functions .. */
-  /*     .. */
-  /*     .. External Subroutines .. */
-  /*     .. */
-  /*     .. Intrinsic Functions .. */
-  /*     .. */
-
-  /*     Test the input parameters. */
-
-  /* Parameter adjustments */
-  a_dim1 = *lda;
-  a_offset = 1 + a_dim1;
-  a -= a_offset;
-  --x;
-  --y;
-
-  /* Function Body */
-  info = 0;
-  if (!lsame_(uplo, "U") && !lsame_(uplo, "L")) {
-    info = 1;
-  } else if (*n < 0) {
-    info = 2;
-  } else if (*k < 0) {
-    info = 3;
-  } else if (*lda < *k + 1) {
-    info = 6;
-  } else if (*incx == 0) {
-    info = 8;
-  } else if (*incy == 0) {
-    info = 11;
-  }
-  if (info != 0) {
-    xerbla_("DSBMV ", &info);
-    return;
-  }
-
-  /*     Quick return if possible. */
-
-  if (*n == 0 || (*alpha == 0. && *beta == 1.)) {
-    return;
-  }
-
-  /*     Set up the start points in  X  and  Y. */
-
-  if (*incx > 0) {
-    kx = 1;
-  } else {
-    kx = 1 - (*n - 1) * *incx;
-  }
-  if (*incy > 0) {
-    ky = 1;
-  } else {
-    ky = 1 - (*n - 1) * *incy;
-  }
-
-  /*     Start the operations. In this version the elements of the array A */
-  /*     are accessed sequentially with one pass through A. */
-
-  /*     First form  y := beta*y. */
-
-  if (*beta != 1.) {
-    if (*incy == 1) {
-      if (*beta == 0.) {
-        i__1 = *n;
-        for (i__ = 1; i__ <= i__1; ++i__) {
-          y[i__] = 0.;
-          /* L10: */
-        }
-      } else {
-        i__1 = *n;
-        for (i__ = 1; i__ <= i__1; ++i__) {
-          y[i__] = *beta * y[i__];
-          /* L20: */
-        }
-      }
-    } else {
-      iy = ky;
-      if (*beta == 0.) {
-        i__1 = *n;
-        for (i__ = 1; i__ <= i__1; ++i__) {
-          y[iy] = 0.;
-          iy += *incy;
-          /* L30: */
-        }
-      } else {
-        i__1 = *n;
-        for (i__ = 1; i__ <= i__1; ++i__) {
-          y[iy] = *beta * y[iy];
-          iy += *incy;
-          /* L40: */
-        }
-      }
-    }
-  }
-  if (*alpha == 0.) {
-    return;
-  }
-  if (lsame_(uplo, "U")) {
-    /*        Form  y  when upper triangle of A is stored. */
-
-    kplus1 = *k + 1;
-    if (*incx == 1 && *incy == 1) {
-      i__1 = *n;
-      for (j = 1; j <= i__1; ++j) {
-        temp1 = *alpha * x[j];
-        temp2 = 0.;
-        l = kplus1 - j;
-        /* Computing MAX */
-        i__2 = 1, i__3 = j - *k;
-        i__4 = j - 1;
-        for (i__ = max(i__2, i__3); i__ <= i__4; ++i__) {
-          y[i__] += temp1 * a[l + i__ + j * a_dim1];
-          temp2 += a[l + i__ + j * a_dim1] * x[i__];
-          /* L50: */
-        }
-        y[j] = y[j] + temp1 * a[kplus1 + j * a_dim1] + *alpha * temp2;
-        /* L60: */
-      }
-    } else {
-      jx = kx;
-      jy = ky;
-      i__1 = *n;
-      for (j = 1; j <= i__1; ++j) {
-        temp1 = *alpha * x[jx];
-        temp2 = 0.;
-        ix = kx;
-        iy = ky;
-        l = kplus1 - j;
-        /* Computing MAX */
-        i__4 = 1, i__2 = j - *k;
-        i__3 = j - 1;
-        for (i__ = max(i__4, i__2); i__ <= i__3; ++i__) {
-          y[iy] += temp1 * a[l + i__ + j * a_dim1];
-          temp2 += a[l + i__ + j * a_dim1] * x[ix];
-          ix += *incx;
-          iy += *incy;
-          /* L70: */
-        }
-        y[jy] = y[jy] + temp1 * a[kplus1 + j * a_dim1] + *alpha * temp2;
-        jx += *incx;
-        jy += *incy;
-        if (j > *k) {
-          kx += *incx;
-          ky += *incy;
-        }
-        /* L80: */
-      }
-    }
-  } else {
-    /*        Form  y  when lower triangle of A is stored. */
-
-    if (*incx == 1 && *incy == 1) {
-      i__1 = *n;
-      for (j = 1; j <= i__1; ++j) {
-        temp1 = *alpha * x[j];
-        temp2 = 0.;
-        y[j] += temp1 * a[j * a_dim1 + 1];
-        l = 1 - j;
-        /* Computing MIN */
-        i__4 = *n, i__2 = j + *k;
-        i__3 = min(i__4, i__2);
-        for (i__ = j + 1; i__ <= i__3; ++i__) {
-          y[i__] += temp1 * a[l + i__ + j * a_dim1];
-          temp2 += a[l + i__ + j * a_dim1] * x[i__];
-          /* L90: */
-        }
-        y[j] += *alpha * temp2;
-        /* L100: */
-      }
-    } else {
-      jx = kx;
-      jy = ky;
-      i__1 = *n;
-      for (j = 1; j <= i__1; ++j) {
-        temp1 = *alpha * x[jx];
-        temp2 = 0.;
-        y[jy] += temp1 * a[j * a_dim1 + 1];
-        l = 1 - j;
-        ix = jx;
-        iy = jy;
-        /* Computing MIN */
-        i__4 = *n, i__2 = j + *k;
-        i__3 = min(i__4, i__2);
-        for (i__ = j + 1; i__ <= i__3; ++i__) {
-          ix += *incx;
-          iy += *incy;
-          y[iy] += temp1 * a[l + i__ + j * a_dim1];
-          temp2 += a[l + i__ + j * a_dim1] * x[ix];
-          /* L110: */
-        }
-        y[jy] += *alpha * temp2;
-        jx += *incx;
-        jy += *incy;
-        /* L120: */
-      }
-    }
-  }
-
-  /*     End of DSBMV . */
-
-} /* dsbmv_ */
--- a/blas/f2c/dspmv.c
+++ b/blas/f2c/dspmv.c
@@ -1,308 +0,0 @@
-/* dspmv.f -- translated by f2c (version 20100827).
-   You must link the resulting object file with libf2c:
-        on Microsoft Windows system, link with libf2c.lib;
-        on Linux or Unix systems, link with .../path/to/libf2c.a -lm
-        or, if you install libf2c.a in a standard place, with -lf2c -lm
-        -- in that order, at the end of the command line, as in
-                cc *.o -lf2c -lm
-        Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
-
-                http://www.netlib.org/f2c/libf2c.zip
-*/
-
-#include "datatypes.h"
-
-/* Subroutine */ void dspmv_(char *uplo, integer *n, doublereal *alpha, doublereal *ap, doublereal *x, integer *incx,
-                             doublereal *beta, doublereal *y, integer *incy) {
-  /* System generated locals */
-  integer i__1, i__2;
-
-  /* Local variables */
-  integer i__, j, k, kk, ix, iy, jx, jy, kx, ky, info;
-  doublereal temp1, temp2;
-  extern logical lsame_(char *, char *);
-  extern /* Subroutine */ void xerbla_(const char *, integer *);
-
-  /*     .. Scalar Arguments .. */
-  /*     .. */
-  /*     .. Array Arguments .. */
-  /*     .. */
-
-  /*  Purpose */
-  /*  ======= */
-
-  /*  DSPMV  performs the matrix-vector operation */
-
-  /*     y := alpha*A*x + beta*y, */
-
-  /*  where alpha and beta are scalars, x and y are n element vectors and */
-  /*  A is an n by n symmetric matrix, supplied in packed form. */
-
-  /*  Arguments */
-  /*  ========== */
-
-  /*  UPLO   - CHARACTER*1. */
-  /*           On entry, UPLO specifies whether the upper or lower */
-  /*           triangular part of the matrix A is supplied in the packed */
-  /*           array AP as follows: */
-
-  /*              UPLO = 'U' or 'u'   The upper triangular part of A is */
-  /*                                  supplied in AP. */
-
-  /*              UPLO = 'L' or 'l'   The lower triangular part of A is */
-  /*                                  supplied in AP. */
-
-  /*           Unchanged on exit. */
-
-  /*  N      - INTEGER. */
-  /*           On entry, N specifies the order of the matrix A. */
-  /*           N must be at least zero. */
-  /*           Unchanged on exit. */
-
-  /*  ALPHA  - DOUBLE PRECISION. */
-  /*           On entry, ALPHA specifies the scalar alpha. */
-  /*           Unchanged on exit. */
-
-  /*  AP     - DOUBLE PRECISION array of DIMENSION at least */
-  /*           ( ( n*( n + 1 ) )/2 ). */
-  /*           Before entry with UPLO = 'U' or 'u', the array AP must */
-  /*           contain the upper triangular part of the symmetric matrix */
-  /*           packed sequentially, column by column, so that AP( 1 ) */
-  /*           contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) */
-  /*           and a( 2, 2 ) respectively, and so on. */
-  /*           Before entry with UPLO = 'L' or 'l', the array AP must */
-  /*           contain the lower triangular part of the symmetric matrix */
-  /*           packed sequentially, column by column, so that AP( 1 ) */
-  /*           contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) */
-  /*           and a( 3, 1 ) respectively, and so on. */
-  /*           Unchanged on exit. */
-
-  /*  X      - DOUBLE PRECISION array of dimension at least */
-  /*           ( 1 + ( n - 1 )*abs( INCX ) ). */
-  /*           Before entry, the incremented array X must contain the n */
-  /*           element vector x. */
-  /*           Unchanged on exit. */
-
-  /*  INCX   - INTEGER. */
-  /*           On entry, INCX specifies the increment for the elements of */
-  /*           X. INCX must not be zero. */
-  /*           Unchanged on exit. */
-
-  /*  BETA   - DOUBLE PRECISION. */
-  /*           On entry, BETA specifies the scalar beta. When BETA is */
-  /*           supplied as zero then Y need not be set on input. */
-  /*           Unchanged on exit. */
-
-  /*  Y      - DOUBLE PRECISION array of dimension at least */
-  /*           ( 1 + ( n - 1 )*abs( INCY ) ). */
-  /*           Before entry, the incremented array Y must contain the n */
-  /*           element vector y. On exit, Y is overwritten by the updated */
-  /*           vector y. */
-
-  /*  INCY   - INTEGER. */
-  /*           On entry, INCY specifies the increment for the elements of */
-  /*           Y. INCY must not be zero. */
-  /*           Unchanged on exit. */
-
-  /*  Further Details */
-  /*  =============== */
-
-  /*  Level 2 Blas routine. */
-
-  /*  -- Written on 22-October-1986. */
-  /*     Jack Dongarra, Argonne National Lab. */
-  /*     Jeremy Du Croz, Nag Central Office. */
-  /*     Sven Hammarling, Nag Central Office. */
-  /*     Richard Hanson, Sandia National Labs. */
-
-  /*  ===================================================================== */
-
-  /*     .. Parameters .. */
-  /*     .. */
-  /*     .. Local Scalars .. */
-  /*     .. */
-  /*     .. External Functions .. */
-  /*     .. */
-  /*     .. External Subroutines .. */
-  /*     .. */
-
-  /*     Test the input parameters. */
-
-  /* Parameter adjustments */
-  --y;
-  --x;
-  --ap;
-
-  /* Function Body */
-  info = 0;
-  if (!lsame_(uplo, "U") && !lsame_(uplo, "L")) {
-    info = 1;
-  } else if (*n < 0) {
-    info = 2;
-  } else if (*incx == 0) {
-    info = 6;
-  } else if (*incy == 0) {
-    info = 9;
-  }
-  if (info != 0) {
-    xerbla_("DSPMV ", &info);
-    return;
-  }
-
-  /*     Quick return if possible. */
-
-  if (*n == 0 || (*alpha == 0. && *beta == 1.)) {
-    return;
-  }
-
-  /*     Set up the start points in  X  and  Y. */
-
-  if (*incx > 0) {
-    kx = 1;
-  } else {
-    kx = 1 - (*n - 1) * *incx;
-  }
-  if (*incy > 0) {
-    ky = 1;
-  } else {
-    ky = 1 - (*n - 1) * *incy;
-  }
-
-  /*     Start the operations. In this version the elements of the array AP */
-  /*     are accessed sequentially with one pass through AP. */
-
-  /*     First form  y := beta*y. */
-
-  if (*beta != 1.) {
-    if (*incy == 1) {
-      if (*beta == 0.) {
-        i__1 = *n;
-        for (i__ = 1; i__ <= i__1; ++i__) {
-          y[i__] = 0.;
-          /* L10: */
-        }
-      } else {
-        i__1 = *n;
-        for (i__ = 1; i__ <= i__1; ++i__) {
-          y[i__] = *beta * y[i__];
-          /* L20: */
-        }
-      }
-    } else {
-      iy = ky;
-      if (*beta == 0.) {
-        i__1 = *n;
-        for (i__ = 1; i__ <= i__1; ++i__) {
-          y[iy] = 0.;
-          iy += *incy;
-          /* L30: */
-        }
-      } else {
-        i__1 = *n;
-        for (i__ = 1; i__ <= i__1; ++i__) {
-          y[iy] = *beta * y[iy];
-          iy += *incy;
-          /* L40: */
-        }
-      }
-    }
-  }
-  if (*alpha == 0.) {
-    return;
-  }
-  kk = 1;
-  if (lsame_(uplo, "U")) {
-    /*        Form  y  when AP contains the upper triangle. */
-
-    if (*incx == 1 && *incy == 1) {
-      i__1 = *n;
-      for (j = 1; j <= i__1; ++j) {
-        temp1 = *alpha * x[j];
-        temp2 = 0.;
-        k = kk;
-        i__2 = j - 1;
-        for (i__ = 1; i__ <= i__2; ++i__) {
-          y[i__] += temp1 * ap[k];
-          temp2 += ap[k] * x[i__];
-          ++k;
-          /* L50: */
-        }
-        y[j] = y[j] + temp1 * ap[kk + j - 1] + *alpha * temp2;
-        kk += j;
-        /* L60: */
-      }
-    } else {
-      jx = kx;
-      jy = ky;
-      i__1 = *n;
-      for (j = 1; j <= i__1; ++j) {
-        temp1 = *alpha * x[jx];
-        temp2 = 0.;
-        ix = kx;
-        iy = ky;
-        i__2 = kk + j - 2;
-        for (k = kk; k <= i__2; ++k) {
-          y[iy] += temp1 * ap[k];
-          temp2 += ap[k] * x[ix];
-          ix += *incx;
-          iy += *incy;
-          /* L70: */
-        }
-        y[jy] = y[jy] + temp1 * ap[kk + j - 1] + *alpha * temp2;
-        jx += *incx;
-        jy += *incy;
-        kk += j;
-        /* L80: */
-      }
-    }
-  } else {
-    /*        Form  y  when AP contains the lower triangle. */
-
-    if (*incx == 1 && *incy == 1) {
-      i__1 = *n;
-      for (j = 1; j <= i__1; ++j) {
-        temp1 = *alpha * x[j];
-        temp2 = 0.;
-        y[j] += temp1 * ap[kk];
-        k = kk + 1;
-        i__2 = *n;
-        for (i__ = j + 1; i__ <= i__2; ++i__) {
-          y[i__] += temp1 * ap[k];
-          temp2 += ap[k] * x[i__];
-          ++k;
-          /* L90: */
-        }
-        y[j] += *alpha * temp2;
-        kk += *n - j + 1;
-        /* L100: */
-      }
-    } else {
-      jx = kx;
-      jy = ky;
-      i__1 = *n;
-      for (j = 1; j <= i__1; ++j) {
-        temp1 = *alpha * x[jx];
-        temp2 = 0.;
-        y[jy] += temp1 * ap[kk];
-        ix = jx;
-        iy = jy;
-        i__2 = kk + *n - j;
-        for (k = kk + 1; k <= i__2; ++k) {
-          ix += *incx;
-          iy += *incy;
-          y[iy] += temp1 * ap[k];
-          temp2 += ap[k] * x[ix];
-          /* L110: */
-        }
-        y[jy] += *alpha * temp2;
-        jx += *incx;
-        jy += *incy;
-        kk += *n - j + 1;
-        /* L120: */
-      }
-    }
-  }
-
-  /*     End of DSPMV . */
-
-} /* dspmv_ */
--- a/blas/f2c/dtbmv.c
+++ b/blas/f2c/dtbmv.c
@@ -1,417 +0,0 @@
-/* dtbmv.f -- translated by f2c (version 20100827).
-   You must link the resulting object file with libf2c:
-        on Microsoft Windows system, link with libf2c.lib;
-        on Linux or Unix systems, link with .../path/to/libf2c.a -lm
-        or, if you install libf2c.a in a standard place, with -lf2c -lm
-        -- in that order, at the end of the command line, as in
-                cc *.o -lf2c -lm
-        Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
-
-                http://www.netlib.org/f2c/libf2c.zip
-*/
-
-#include "datatypes.h"
-
-/* Subroutine */ void dtbmv_(char *uplo, char *trans, char *diag, integer *n, integer *k, doublereal *a, integer *lda,
-                             doublereal *x, integer *incx) {
-  /* System generated locals */
-  integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
-
-  /* Local variables */
-  integer i__, j, l, ix, jx, kx, info;
-  doublereal temp;
-  extern logical lsame_(char *, char *);
-  integer kplus1;
-  extern /* Subroutine */ void xerbla_(const char *, integer *);
-  logical nounit;
-
-  /*     .. Scalar Arguments .. */
-  /*     .. */
-  /*     .. Array Arguments .. */
-  /*     .. */
-
-  /*  Purpose */
-  /*  ======= */
-
-  /*  DTBMV  performs one of the matrix-vector operations */
-
-  /*     x := A*x,   or   x := A'*x, */
-
-  /*  where x is an n element vector and  A is an n by n unit, or non-unit, */
-  /*  upper or lower triangular band matrix, with ( k + 1 ) diagonals. */
-
-  /*  Arguments */
-  /*  ========== */
-
-  /*  UPLO   - CHARACTER*1. */
-  /*           On entry, UPLO specifies whether the matrix is an upper or */
-  /*           lower triangular matrix as follows: */
-
-  /*              UPLO = 'U' or 'u'   A is an upper triangular matrix. */
-
-  /*              UPLO = 'L' or 'l'   A is a lower triangular matrix. */
-
-  /*           Unchanged on exit. */
-
-  /*  TRANS  - CHARACTER*1. */
-  /*           On entry, TRANS specifies the operation to be performed as */
-  /*           follows: */
-
-  /*              TRANS = 'N' or 'n'   x := A*x. */
-
-  /*              TRANS = 'T' or 't'   x := A'*x. */
-
-  /*              TRANS = 'C' or 'c'   x := A'*x. */
-
-  /*           Unchanged on exit. */
-
-  /*  DIAG   - CHARACTER*1. */
-  /*           On entry, DIAG specifies whether or not A is unit */
-  /*           triangular as follows: */
-
-  /*              DIAG = 'U' or 'u'   A is assumed to be unit triangular. */
-
-  /*              DIAG = 'N' or 'n'   A is not assumed to be unit */
-  /*                                  triangular. */
-
-  /*           Unchanged on exit. */
-
-  /*  N      - INTEGER. */
-  /*           On entry, N specifies the order of the matrix A. */
-  /*           N must be at least zero. */
-  /*           Unchanged on exit. */
-
-  /*  K      - INTEGER. */
-  /*           On entry with UPLO = 'U' or 'u', K specifies the number of */
-  /*           super-diagonals of the matrix A. */
-  /*           On entry with UPLO = 'L' or 'l', K specifies the number of */
-  /*           sub-diagonals of the matrix A. */
-  /*           K must satisfy  0 .le. K. */
-  /*           Unchanged on exit. */
-
-  /*  A      - DOUBLE PRECISION array of DIMENSION ( LDA, n ). */
-  /*           Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */
-  /*           by n part of the array A must contain the upper triangular */
-  /*           band part of the matrix of coefficients, supplied column by */
-  /*           column, with the leading diagonal of the matrix in row */
-  /*           ( k + 1 ) of the array, the first super-diagonal starting at */
-  /*           position 2 in row k, and so on. The top left k by k triangle */
-  /*           of the array A is not referenced. */
-  /*           The following program segment will transfer an upper */
-  /*           triangular band matrix from conventional full matrix storage */
-  /*           to band storage: */
-
-  /*                 DO 20, J = 1, N */
-  /*                    M = K + 1 - J */
-  /*                    DO 10, I = MAX( 1, J - K ), J */
-  /*                       A( M + I, J ) = matrix( I, J ) */
-  /*              10    CONTINUE */
-  /*              20 CONTINUE */
-
-  /*           Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */
-  /*           by n part of the array A must contain the lower triangular */
-  /*           band part of the matrix of coefficients, supplied column by */
-  /*           column, with the leading diagonal of the matrix in row 1 of */
-  /*           the array, the first sub-diagonal starting at position 1 in */
-  /*           row 2, and so on. The bottom right k by k triangle of the */
-  /*           array A is not referenced. */
-  /*           The following program segment will transfer a lower */
-  /*           triangular band matrix from conventional full matrix storage */
-  /*           to band storage: */
-
-  /*                 DO 20, J = 1, N */
-  /*                    M = 1 - J */
-  /*                    DO 10, I = J, MIN( N, J + K ) */
-  /*                       A( M + I, J ) = matrix( I, J ) */
-  /*              10    CONTINUE */
-  /*              20 CONTINUE */
-
-  /*           Note that when DIAG = 'U' or 'u' the elements of the array A */
-  /*           corresponding to the diagonal elements of the matrix are not */
-  /*           referenced, but are assumed to be unity. */
-  /*           Unchanged on exit. */
-
-  /*  LDA    - INTEGER. */
-  /*           On entry, LDA specifies the first dimension of A as declared */
-  /*           in the calling (sub) program. LDA must be at least */
-  /*           ( k + 1 ). */
-  /*           Unchanged on exit. */
-
-  /*  X      - DOUBLE PRECISION array of dimension at least */
-  /*           ( 1 + ( n - 1 )*abs( INCX ) ). */
-  /*           Before entry, the incremented array X must contain the n */
-  /*           element vector x. On exit, X is overwritten with the */
-  /*           transformed vector x. */
-
-  /*  INCX   - INTEGER. */
-  /*           On entry, INCX specifies the increment for the elements of */
-  /*           X. INCX must not be zero. */
-  /*           Unchanged on exit. */
-
-  /*  Further Details */
-  /*  =============== */
-
-  /*  Level 2 Blas routine. */
-
-  /*  -- Written on 22-October-1986. */
-  /*     Jack Dongarra, Argonne National Lab. */
-  /*     Jeremy Du Croz, Nag Central Office. */
-  /*     Sven Hammarling, Nag Central Office. */
-  /*     Richard Hanson, Sandia National Labs. */
-
-  /*  ===================================================================== */
-
-  /*     .. Parameters .. */
-  /*     .. */
-  /*     .. Local Scalars .. */
-  /*     .. */
-  /*     .. External Functions .. */
-  /*     .. */
-  /*     .. External Subroutines .. */
-  /*     .. */
-  /*     .. Intrinsic Functions .. */
-  /*     .. */
-
-  /*     Test the input parameters. */
-
-  /* Parameter adjustments */
-  a_dim1 = *lda;
-  a_offset = 1 + a_dim1;
-  a -= a_offset;
-  --x;
-
-  /* Function Body */
-  info = 0;
-  if (!lsame_(uplo, "U") && !lsame_(uplo, "L")) {
-    info = 1;
-  } else if (!lsame_(trans, "N") && !lsame_(trans, "T") && !lsame_(trans, "C")) {
-    info = 2;
-  } else if (!lsame_(diag, "U") && !lsame_(diag, "N")) {
-    info = 3;
-  } else if (*n < 0) {
-    info = 4;
-  } else if (*k < 0) {
-    info = 5;
-  } else if (*lda < *k + 1) {
-    info = 7;
-  } else if (*incx == 0) {
-    info = 9;
-  }
-  if (info != 0) {
-    xerbla_("DTBMV ", &info);
-    return;
-  }
-
-  /*     Quick return if possible. */
-
-  if (*n == 0) {
-    return;
-  }
-
-  nounit = lsame_(diag, "N");
-
-  /*     Set up the start point in X if the increment is not unity. This */
-  /*     will be  ( N - 1 )*INCX   too small for descending loops. */
-
-  if (*incx <= 0) {
-    kx = 1 - (*n - 1) * *incx;
-  } else if (*incx != 1) {
-    kx = 1;
-  }
-
-  /*     Start the operations. In this version the elements of A are */
-  /*     accessed sequentially with one pass through A. */
-
-  if (lsame_(trans, "N")) {
-    /*         Form  x := A*x. */
-
-    if (lsame_(uplo, "U")) {
-      kplus1 = *k + 1;
-      if (*incx == 1) {
-        i__1 = *n;
-        for (j = 1; j <= i__1; ++j) {
-          if (x[j] != 0.) {
-            temp = x[j];
-            l = kplus1 - j;
-            /* Computing MAX */
-            i__2 = 1, i__3 = j - *k;
-            i__4 = j - 1;
-            for (i__ = max(i__2, i__3); i__ <= i__4; ++i__) {
-              x[i__] += temp * a[l + i__ + j * a_dim1];
-              /* L10: */
-            }
-            if (nounit) {
-              x[j] *= a[kplus1 + j * a_dim1];
-            }
-          }
-          /* L20: */
-        }
-      } else {
-        jx = kx;
-        i__1 = *n;
-        for (j = 1; j <= i__1; ++j) {
-          if (x[jx] != 0.) {
-            temp = x[jx];
-            ix = kx;
-            l = kplus1 - j;
-            /* Computing MAX */
-            i__4 = 1, i__2 = j - *k;
-            i__3 = j - 1;
-            for (i__ = max(i__4, i__2); i__ <= i__3; ++i__) {
-              x[ix] += temp * a[l + i__ + j * a_dim1];
-              ix += *incx;
-              /* L30: */
-            }
-            if (nounit) {
-              x[jx] *= a[kplus1 + j * a_dim1];
-            }
-          }
-          jx += *incx;
-          if (j > *k) {
-            kx += *incx;
-          }
-          /* L40: */
-        }
-      }
-    } else {
-      if (*incx == 1) {
-        for (j = *n; j >= 1; --j) {
-          if (x[j] != 0.) {
-            temp = x[j];
-            l = 1 - j;
-            /* Computing MIN */
-            i__1 = *n, i__3 = j + *k;
-            i__4 = j + 1;
-            for (i__ = min(i__1, i__3); i__ >= i__4; --i__) {
-              x[i__] += temp * a[l + i__ + j * a_dim1];
-              /* L50: */
-            }
-            if (nounit) {
-              x[j] *= a[j * a_dim1 + 1];
-            }
-          }
-          /* L60: */
-        }
-      } else {
-        kx += (*n - 1) * *incx;
-        jx = kx;
-        for (j = *n; j >= 1; --j) {
-          if (x[jx] != 0.) {
-            temp = x[jx];
-            ix = kx;
-            l = 1 - j;
-            /* Computing MIN */
-            i__4 = *n, i__1 = j + *k;
-            i__3 = j + 1;
-            for (i__ = min(i__4, i__1); i__ >= i__3; --i__) {
-              x[ix] += temp * a[l + i__ + j * a_dim1];
-              ix -= *incx;
-              /* L70: */
-            }
-            if (nounit) {
-              x[jx] *= a[j * a_dim1 + 1];
-            }
-          }
-          jx -= *incx;
-          if (*n - j >= *k) {
-            kx -= *incx;
-          }
-          /* L80: */
-        }
-      }
-    }
-  } else {
-    /*        Form  x := A'*x. */
-
-    if (lsame_(uplo, "U")) {
-      kplus1 = *k + 1;
-      if (*incx == 1) {
-        for (j = *n; j >= 1; --j) {
-          temp = x[j];
-          l = kplus1 - j;
-          if (nounit) {
-            temp *= a[kplus1 + j * a_dim1];
-          }
-          /* Computing MAX */
-          i__4 = 1, i__1 = j - *k;
-          i__3 = max(i__4, i__1);
-          for (i__ = j - 1; i__ >= i__3; --i__) {
-            temp += a[l + i__ + j * a_dim1] * x[i__];
-            /* L90: */
-          }
-          x[j] = temp;
-          /* L100: */
-        }
-      } else {
-        kx += (*n - 1) * *incx;
-        jx = kx;
-        for (j = *n; j >= 1; --j) {
-          temp = x[jx];
-          kx -= *incx;
-          ix = kx;
-          l = kplus1 - j;
-          if (nounit) {
-            temp *= a[kplus1 + j * a_dim1];
-          }
-          /* Computing MAX */
-          i__4 = 1, i__1 = j - *k;
-          i__3 = max(i__4, i__1);
-          for (i__ = j - 1; i__ >= i__3; --i__) {
-            temp += a[l + i__ + j * a_dim1] * x[ix];
-            ix -= *incx;
-            /* L110: */
-          }
-          x[jx] = temp;
-          jx -= *incx;
-          /* L120: */
-        }
-      }
-    } else {
-      if (*incx == 1) {
-        i__3 = *n;
-        for (j = 1; j <= i__3; ++j) {
-          temp = x[j];
-          l = 1 - j;
-          if (nounit) {
-            temp *= a[j * a_dim1 + 1];
-          }
-          /* Computing MIN */
-          i__1 = *n, i__2 = j + *k;
-          i__4 = min(i__1, i__2);
-          for (i__ = j + 1; i__ <= i__4; ++i__) {
-            temp += a[l + i__ + j * a_dim1] * x[i__];
-            /* L130: */
-          }
-          x[j] = temp;
-          /* L140: */
-        }
-      } else {
-        jx = kx;
-        i__3 = *n;
-        for (j = 1; j <= i__3; ++j) {
-          temp = x[jx];
-          kx += *incx;
-          ix = kx;
-          l = 1 - j;
-          if (nounit) {
-            temp *= a[j * a_dim1 + 1];
-          }
-          /* Computing MIN */
-          i__1 = *n, i__2 = j + *k;
-          i__4 = min(i__1, i__2);
-          for (i__ = j + 1; i__ <= i__4; ++i__) {
-            temp += a[l + i__ + j * a_dim1] * x[ix];
-            ix += *incx;
-            /* L150: */
-          }
-          x[jx] = temp;
-          jx += *incx;
-          /* L160: */
-        }
-      }
-    }
-  }
-
-  /*     End of DTBMV . */
-
-} /* dtbmv_ */
--- a/blas/f2c/lsame.c
+++ b/blas/f2c/lsame.c
@@ -1,109 +0,0 @@
-/* lsame.f -- translated by f2c (version 20100827).
-   You must link the resulting object file with libf2c:
-        on Microsoft Windows system, link with libf2c.lib;
-        on Linux or Unix systems, link with .../path/to/libf2c.a -lm
-        or, if you install libf2c.a in a standard place, with -lf2c -lm
-        -- in that order, at the end of the command line, as in
-                cc *.o -lf2c -lm
-        Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
-
-                http://www.netlib.org/f2c/libf2c.zip
-*/
-
-#include "datatypes.h"
-
-logical lsame_(char *ca, char *cb) {
-  /* System generated locals */
-  logical ret_val;
-
-  /* Local variables */
-  integer inta, intb, zcode;
-
-  /*  -- LAPACK auxiliary routine (version 3.1) -- */
-  /*     Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. */
-  /*     November 2006 */
-
-  /*     .. Scalar Arguments .. */
-  /*     .. */
-
-  /*  Purpose */
-  /*  ======= */
-
-  /*  LSAME returns .TRUE. if CA is the same letter as CB regardless of */
-  /*  case. */
-
-  /*  Arguments */
-  /*  ========= */
-
-  /*  CA      (input) CHARACTER*1 */
-
-  /*  CB      (input) CHARACTER*1 */
-  /*          CA and CB specify the single characters to be compared. */
-
-  /* ===================================================================== */
-
-  /*     .. Intrinsic Functions .. */
-  /*     .. */
-  /*     .. Local Scalars .. */
-  /*     .. */
-
-  /*     Test if the characters are equal */
-
-  ret_val = *(unsigned char *)ca == *(unsigned char *)cb;
-  if (ret_val) {
-    return ret_val;
-  }
-
-  /*     Now test for equivalence if both characters are alphabetic. */
-
-  zcode = 'Z';
-
-  /*     Use 'Z' rather than 'A' so that ASCII can be detected on Prime */
-  /*     machines, on which ICHAR returns a value with bit 8 set. */
-  /*     ICHAR('A') on Prime machines returns 193 which is the same as */
-  /*     ICHAR('A') on an EBCDIC machine. */
-
-  inta = *(unsigned char *)ca;
-  intb = *(unsigned char *)cb;
-
-  if (zcode == 90 || zcode == 122) {
-    /*        ASCII is assumed - ZCODE is the ASCII code of either lower or */
-    /*        upper case 'Z'. */
-
-    if (inta >= 97 && inta <= 122) {
-      inta += -32;
-    }
-    if (intb >= 97 && intb <= 122) {
-      intb += -32;
-    }
-
-  } else if (zcode == 233 || zcode == 169) {
-    /*        EBCDIC is assumed - ZCODE is the EBCDIC code of either lower or */
-    /*        upper case 'Z'. */
-
-    if ((inta >= 129 && inta <= 137) || (inta >= 145 && inta <= 153) || (inta >= 162 && inta <= 169)) {
-      inta += 64;
-    }
-    if ((intb >= 129 && intb <= 137) || (intb >= 145 && intb <= 153) || (intb >= 162 && intb <= 169)) {
-      intb += 64;
-    }
-
-  } else if (zcode == 218 || zcode == 250) {
-    /*        ASCII is assumed, on Prime machines - ZCODE is the ASCII code */
-    /*        plus 128 of either lower or upper case 'Z'. */
-
-    if (inta >= 225 && inta <= 250) {
-      inta += -32;
-    }
-    if (intb >= 225 && intb <= 250) {
-      intb += -32;
-    }
-  }
-  ret_val = inta == intb;
-
-  /*     RETURN */
-
-  /*     End of LSAME */
-
-  return ret_val;
-} /* lsame_ */
--- a/blas/f2c/srotm.c
+++ b/blas/f2c/srotm.c
@@ -1,212 +0,0 @@
-/* srotm.f -- translated by f2c (version 20100827).
-   You must link the resulting object file with libf2c:
-        on Microsoft Windows system, link with libf2c.lib;
-        on Linux or Unix systems, link with .../path/to/libf2c.a -lm
-        or, if you install libf2c.a in a standard place, with -lf2c -lm
-        -- in that order, at the end of the command line, as in
-                cc *.o -lf2c -lm
-        Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
-
-                http://www.netlib.org/f2c/libf2c.zip
-*/
-
-#include "datatypes.h"
-
-/* Subroutine */ void srotm_(integer *n, real *sx, integer *incx, real *sy, integer *incy, real *sparam) {
-  /* Initialized data */
-
-  static real zero = 0.f;
-  static real two = 2.f;
-
-  /* System generated locals */
-  integer i__1, i__2;
-
-  /* Local variables */
-  integer i__;
-  real w, z__;
-  integer kx, ky;
-  real sh11, sh12, sh21, sh22, sflag;
-  integer nsteps;
-
-  /*     .. Scalar Arguments .. */
-  /*     .. */
-  /*     .. Array Arguments .. */
-  /*     .. */
-
-  /*  Purpose */
-  /*  ======= */
-
-  /*     APPLY THE MODIFIED GIVENS TRANSFORMATION, H, TO THE 2 BY N MATRIX */
-
-  /*     (SX**T) , WHERE **T INDICATES TRANSPOSE. THE ELEMENTS OF SX ARE IN */
-  /*     (DX**T) */
-
-  /*     SX(LX+I*INCX), I = 0 TO N-1, WHERE LX = 1 IF INCX .GE. 0, ELSE */
-  /*     LX = (-INCX)*N, AND SIMILARLY FOR SY USING USING LY AND INCY. */
-  /*     WITH SPARAM(1)=SFLAG, H HAS ONE OF THE FOLLOWING FORMS.. */
-
-  /*     SFLAG=-1.E0     SFLAG=0.E0        SFLAG=1.E0     SFLAG=-2.E0 */
-
-  /*       (SH11  SH12)    (1.E0  SH12)    (SH11  1.E0)    (1.E0  0.E0) */
-  /*     H=(          )    (          )    (          )    (          ) */
-  /*       (SH21  SH22),   (SH21  1.E0),   (-1.E0 SH22),   (0.E0  1.E0). */
-  /*     SEE  SROTMG FOR A DESCRIPTION OF DATA STORAGE IN SPARAM. */
-
-  /*  Arguments */
-  /*  ========= */
-
-  /*  N      (input) INTEGER */
-  /*         number of elements in input vector(s) */
-
-  /*  SX     (input/output) REAL array, dimension N */
-  /*         double precision vector with N elements */
-
-  /*  INCX   (input) INTEGER */
-  /*         storage spacing between elements of SX */
-
-  /*  SY     (input/output) REAL array, dimension N */
-  /*         double precision vector with N elements */
-
-  /*  INCY   (input) INTEGER */
-  /*         storage spacing between elements of SY */
-
-  /*  SPARAM (input/output)  REAL array, dimension 5 */
-  /*     SPARAM(1)=SFLAG */
-  /*     SPARAM(2)=SH11 */
-  /*     SPARAM(3)=SH21 */
-  /*     SPARAM(4)=SH12 */
-  /*     SPARAM(5)=SH22 */
-
-  /*  ===================================================================== */
-
-  /*     .. Local Scalars .. */
-  /*     .. */
-  /*     .. Data statements .. */
-  /* Parameter adjustments */
-  --sparam;
-  --sy;
-  --sx;
-
-  /* Function Body */
-  /*     .. */
-
-  sflag = sparam[1];
-  if (*n <= 0 || sflag + two == zero) {
-    goto L140;
-  }
-  if (!(*incx == *incy && *incx > 0)) {
-    goto L70;
-  }
-
-  nsteps = *n * *incx;
-  if (sflag < 0.f) {
-    goto L50;
-  } else if (sflag == 0) {
-    goto L10;
-  } else {
-    goto L30;
-  }
-L10:
-  sh12 = sparam[4];
-  sh21 = sparam[3];
-  i__1 = nsteps;
-  i__2 = *incx;
-  for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
-    w = sx[i__];
-    z__ = sy[i__];
-    sx[i__] = w + z__ * sh12;
-    sy[i__] = w * sh21 + z__;
-    /* L20: */
-  }
-  goto L140;
-L30:
-  sh11 = sparam[2];
-  sh22 = sparam[5];
-  i__2 = nsteps;
-  i__1 = *incx;
-  for (i__ = 1; i__1 < 0 ? i__ >= i__2 : i__ <= i__2; i__ += i__1) {
-    w = sx[i__];
-    z__ = sy[i__];
-    sx[i__] = w * sh11 + z__;
-    sy[i__] = -w + sh22 * z__;
-    /* L40: */
-  }
-  goto L140;
-L50:
-  sh11 = sparam[2];
-  sh12 = sparam[4];
-  sh21 = sparam[3];
-  sh22 = sparam[5];
-  i__1 = nsteps;
-  i__2 = *incx;
-  for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
-    w = sx[i__];
-    z__ = sy[i__];
-    sx[i__] = w * sh11 + z__ * sh12;
-    sy[i__] = w * sh21 + z__ * sh22;
-    /* L60: */
-  }
-  goto L140;
-L70:
-  kx = 1;
-  ky = 1;
-  if (*incx < 0) {
-    kx = (1 - *n) * *incx + 1;
-  }
-  if (*incy < 0) {
-    ky = (1 - *n) * *incy + 1;
-  }
-
-  if (sflag < 0.f) {
-    goto L120;
-  } else if (sflag == 0) {
-    goto L80;
-  } else {
-    goto L100;
-  }
-L80:
-  sh12 = sparam[4];
-  sh21 = sparam[3];
-  i__2 = *n;
-  for (i__ = 1; i__ <= i__2; ++i__) {
-    w = sx[kx];
-    z__ = sy[ky];
-    sx[kx] = w + z__ * sh12;
-    sy[ky] = w * sh21 + z__;
-    kx += *incx;
-    ky += *incy;
-    /* L90: */
-  }
-  goto L140;
-L100:
-  sh11 = sparam[2];
-  sh22 = sparam[5];
-  i__2 = *n;
-  for (i__ = 1; i__ <= i__2; ++i__) {
-    w = sx[kx];
-    z__ = sy[ky];
-    sx[kx] = w * sh11 + z__;
-    sy[ky] = -w + sh22 * z__;
-    kx += *incx;
-    ky += *incy;
-    /* L110: */
-  }
-  goto L140;
-L120:
-  sh11 = sparam[2];
-  sh12 = sparam[4];
-  sh21 = sparam[3];
-  sh22 = sparam[5];
-  i__2 = *n;
-  for (i__ = 1; i__ <= i__2; ++i__) {
-    w = sx[kx];
-    z__ = sy[ky];
-    sx[kx] = w * sh11 + z__ * sh12;
-    sy[ky] = w * sh21 + z__ * sh22;
-    kx += *incx;
-    ky += *incy;
-    /* L130: */
-  }
-L140:
-  return;
-} /* srotm_ */
--- a/blas/f2c/srotmg.c
+++ b/blas/f2c/srotmg.c
@@ -1,293 +0,0 @@
-/* srotmg.f -- translated by f2c (version 20100827).
-   You must link the resulting object file with libf2c:
-        on Microsoft Windows system, link with libf2c.lib;
-        on Linux or Unix systems, link with .../path/to/libf2c.a -lm
-        or, if you install libf2c.a in a standard place, with -lf2c -lm
-        -- in that order, at the end of the command line, as in
-                cc *.o -lf2c -lm
-        Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
-
-                http://www.netlib.org/f2c/libf2c.zip
-*/
-
-#include "datatypes.h"
-
-/* Subroutine */ void srotmg_(real *sd1, real *sd2, real *sx1, real *sy1, real *sparam) {
-  /* Initialized data */
-
-  static real zero = 0.f;
-  static real one = 1.f;
-  static real two = 2.f;
-  static real gam = 4096.f;
-  static real gamsq = 16777200.f;
-  static real rgamsq = 5.96046e-8f;
-
-  /* Format strings */
-  static char fmt_120[] = "";
-  static char fmt_150[] = "";
-  static char fmt_180[] = "";
-  static char fmt_210[] = "";
-
-  /* System generated locals */
-  real r__1;
-
-  /* Local variables */
-  real su, sp1, sp2, sq1, sq2, sh11, sh12, sh21, sh22;
-  integer igo;
-  real sflag, stemp;
-
-  /* Assigned format variables */
-  static char *igo_fmt;
-  (void)igo_fmt;
-
-  /*     .. Scalar Arguments .. */
-  /*     .. */
-  /*     .. Array Arguments .. */
-  /*     .. */
-
-  /*  Purpose */
-  /*  ======= */
-
-  /*     CONSTRUCT THE MODIFIED GIVENS TRANSFORMATION MATRIX H WHICH ZEROS */
-  /*     THE SECOND COMPONENT OF THE 2-VECTOR  (SQRT(SD1)*SX1,SQRT(SD2)* */
-  /*     SY2)**T. */
-  /*     WITH SPARAM(1)=SFLAG, H HAS ONE OF THE FOLLOWING FORMS.. */
-
-  /*     SFLAG=-1.E0     SFLAG=0.E0        SFLAG=1.E0     SFLAG=-2.E0 */
-
-  /*       (SH11  SH12)    (1.E0  SH12)    (SH11  1.E0)    (1.E0  0.E0) */
-  /*     H=(          )    (          )    (          )    (          ) */
-  /*       (SH21  SH22),   (SH21  1.E0),   (-1.E0 SH22),   (0.E0  1.E0). */
-  /*     LOCATIONS 2-4 OF SPARAM CONTAIN SH11,SH21,SH12, AND SH22 */
-  /*     RESPECTIVELY. (VALUES OF 1.E0, -1.E0, OR 0.E0 IMPLIED BY THE */
-  /*     VALUE OF SPARAM(1) ARE NOT STORED IN SPARAM.) */
-
-  /*     THE VALUES OF GAMSQ AND RGAMSQ SET IN THE DATA STATEMENT MAY BE */
-  /*     INEXACT.  THIS IS OK AS THEY ARE ONLY USED FOR TESTING THE SIZE */
-  /*     OF SD1 AND SD2.  ALL ACTUAL SCALING OF DATA IS DONE USING GAM. */
-
-  /*  Arguments */
-  /*  ========= */
-
-  /*  SD1    (input/output) REAL */
-
-  /*  SD2    (input/output) REAL */
-
-  /*  SX1    (input/output) REAL */
-
-  /*  SY1    (input) REAL */
-
-  /*  SPARAM (input/output)  REAL array, dimension 5 */
-  /*     SPARAM(1)=SFLAG */
-  /*     SPARAM(2)=SH11 */
-  /*     SPARAM(3)=SH21 */
-  /*     SPARAM(4)=SH12 */
-  /*     SPARAM(5)=SH22 */
-
-  /*  ===================================================================== */
-
-  /*     .. Local Scalars .. */
-  /*     .. */
-  /*     .. Intrinsic Functions .. */
-  /*     .. */
-  /*     .. Data statements .. */
-
-  /* Parameter adjustments */
-  --sparam;
-
-  /* Function Body */
-  /*     .. */
-  if (!(*sd1 < zero)) {
-    goto L10;
-  }
-  /*       GO ZERO-H-D-AND-SX1.. */
-  goto L60;
-L10:
-  /*     CASE-SD1-NONNEGATIVE */
-  sp2 = *sd2 * *sy1;
-  if (!(sp2 == zero)) {
-    goto L20;
-  }
-  sflag = -two;
-  goto L260;
-/*     REGULAR-CASE.. */
-L20:
-  sp1 = *sd1 * *sx1;
-  sq2 = sp2 * *sy1;
-  sq1 = sp1 * *sx1;
-
-  if (!(dabs(sq1) > dabs(sq2))) {
-    goto L40;
-  }
-  sh21 = -(*sy1) / *sx1;
-  sh12 = sp2 / sp1;
-
-  su = one - sh12 * sh21;
-
-  if (!(su <= zero)) {
-    goto L30;
-  }
-  /*         GO ZERO-H-D-AND-SX1.. */
-  goto L60;
-L30:
-  sflag = zero;
-  *sd1 /= su;
-  *sd2 /= su;
-  *sx1 *= su;
-  /*         GO SCALE-CHECK.. */
-  goto L100;
-L40:
-  if (!(sq2 < zero)) {
-    goto L50;
-  }
-  /*         GO ZERO-H-D-AND-SX1.. */
-  goto L60;
-L50:
-  sflag = one;
-  sh11 = sp1 / sp2;
-  sh22 = *sx1 / *sy1;
-  su = one + sh11 * sh22;
-  stemp = *sd2 / su;
-  *sd2 = *sd1 / su;
-  *sd1 = stemp;
-  *sx1 = *sy1 * su;
-  /*         GO SCALE-CHECK */
-  goto L100;
-/*     PROCEDURE..ZERO-H-D-AND-SX1.. */
-L60:
-  sflag = -one;
-  sh11 = zero;
-  sh12 = zero;
-  sh21 = zero;
-  sh22 = zero;
-
-  *sd1 = zero;
-  *sd2 = zero;
-  *sx1 = zero;
-  /*         RETURN.. */
-  goto L220;
-/*     PROCEDURE..FIX-H.. */
-L70:
-  if (!(sflag >= zero)) {
-    goto L90;
-  }
-
-  if (!(sflag == zero)) {
-    goto L80;
-  }
-  sh11 = one;
-  sh22 = one;
-  sflag = -one;
-  goto L90;
-L80:
-  sh21 = -one;
-  sh12 = one;
-  sflag = -one;
-L90:
-  switch (igo) {
-    case 0:
-      goto L120;
-    case 1:
-      goto L150;
-    case 2:
-      goto L180;
-    case 3:
-      goto L210;
-  }
-/*     PROCEDURE..SCALE-CHECK */
-L100:
-L110:
-  if (!(*sd1 <= rgamsq)) {
-    goto L130;
-  }
-  if (*sd1 == zero) {
-    goto L160;
-  }
-  igo = 0;
-  igo_fmt = fmt_120;
-  /*              FIX-H.. */
-  goto L70;
-L120:
-  /* Computing 2nd power */
-  r__1 = gam;
-  *sd1 *= r__1 * r__1;
-  *sx1 /= gam;
-  sh11 /= gam;
-  sh12 /= gam;
-  goto L110;
-L130:
-L140:
-  if (!(*sd1 >= gamsq)) {
-    goto L160;
-  }
-  igo = 1;
-  igo_fmt = fmt_150;
-  /*              FIX-H.. */
-  goto L70;
-L150:
-  /* Computing 2nd power */
-  r__1 = gam;
-  *sd1 /= r__1 * r__1;
-  *sx1 *= gam;
-  sh11 *= gam;
-  sh12 *= gam;
-  goto L140;
-L160:
-L170:
-  if (!(dabs(*sd2) <= rgamsq)) {
-    goto L190;
-  }
-  if (*sd2 == zero) {
-    goto L220;
-  }
-  igo = 2;
-  igo_fmt = fmt_180;
-  /*              FIX-H.. */
-  goto L70;
-L180:
-  /* Computing 2nd power */
-  r__1 = gam;
-  *sd2 *= r__1 * r__1;
-  sh21 /= gam;
-  sh22 /= gam;
-  goto L170;
-L190:
-L200:
-  if (!(dabs(*sd2) >= gamsq)) {
-    goto L220;
-  }
-  igo = 3;
-  igo_fmt = fmt_210;
-  /*              FIX-H.. */
-  goto L70;
-L210:
-  /* Computing 2nd power */
-  r__1 = gam;
-  *sd2 /= r__1 * r__1;
-  sh21 *= gam;
-  sh22 *= gam;
-  goto L200;
-L220:
-  if (sflag < 0.f) {
-    goto L250;
-  } else if (sflag == 0) {
-    goto L230;
-  } else {
-    goto L240;
-  }
-L230:
-  sparam[3] = sh21;
-  sparam[4] = sh12;
-  goto L260;
-L240:
-  sparam[2] = sh11;
-  sparam[5] = sh22;
-  goto L260;
-L250:
-  sparam[2] = sh11;
-  sparam[3] = sh21;
-  sparam[4] = sh12;
-  sparam[5] = sh22;
-L260:
-  sparam[1] = sflag;
-} /* srotmg_ */
--- a/blas/f2c/ssbmv.c
+++ b/blas/f2c/ssbmv.c
@@ -1,359 +0,0 @@
-/* ssbmv.f -- translated by f2c (version 20100827).
-   You must link the resulting object file with libf2c:
-        on Microsoft Windows system, link with libf2c.lib;
-        on Linux or Unix systems, link with .../path/to/libf2c.a -lm
-        or, if you install libf2c.a in a standard place, with -lf2c -lm
-        -- in that order, at the end of the command line, as in
-                cc *.o -lf2c -lm
-        Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
-
-                http://www.netlib.org/f2c/libf2c.zip
-*/
-
-#include "datatypes.h"
-
-/* Subroutine */ void ssbmv_(char *uplo, integer *n, integer *k, real *alpha, real *a, integer *lda, real *x,
-                             integer *incx, real *beta, real *y, integer *incy) {
-  /* System generated locals */
-  integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
-
-  /* Local variables */
-  integer i__, j, l, ix, iy, jx, jy, kx, ky, info;
-  real temp1, temp2;
-  extern logical lsame_(char *, char *);
-  integer kplus1;
-  extern /* Subroutine */ void xerbla_(const char *, integer *);
-
-  /*     .. Scalar Arguments .. */
-  /*     .. */
-  /*     .. Array Arguments .. */
-  /*     .. */
-
-  /*  Purpose */
-  /*  ======= */
-
-  /*  SSBMV  performs the matrix-vector  operation */
-
-  /*     y := alpha*A*x + beta*y, */
-
-  /*  where alpha and beta are scalars, x and y are n element vectors and */
-  /*  A is an n by n symmetric band matrix, with k super-diagonals. */
-
-  /*  Arguments */
-  /*  ========== */
-
-  /*  UPLO   - CHARACTER*1. */
-  /*           On entry, UPLO specifies whether the upper or lower */
-  /*           triangular part of the band matrix A is being supplied as */
-  /*           follows: */
-
-  /*              UPLO = 'U' or 'u'   The upper triangular part of A is */
-  /*                                  being supplied. */
-
-  /*              UPLO = 'L' or 'l'   The lower triangular part of A is */
-  /*                                  being supplied. */
-
-  /*           Unchanged on exit. */
-
-  /*  N      - INTEGER. */
-  /*           On entry, N specifies the order of the matrix A. */
-  /*           N must be at least zero. */
-  /*           Unchanged on exit. */
-
-  /*  K      - INTEGER. */
-  /*           On entry, K specifies the number of super-diagonals of the */
-  /*           matrix A. K must satisfy  0 .le. K. */
-  /*           Unchanged on exit. */
-
-  /*  ALPHA  - REAL            . */
-  /*           On entry, ALPHA specifies the scalar alpha. */
-  /*           Unchanged on exit. */
-
-  /*  A      - REAL             array of DIMENSION ( LDA, n ). */
-  /*           Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */
-  /*           by n part of the array A must contain the upper triangular */
-  /*           band part of the symmetric matrix, supplied column by */
-  /*           column, with the leading diagonal of the matrix in row */
-  /*           ( k + 1 ) of the array, the first super-diagonal starting at */
-  /*           position 2 in row k, and so on. The top left k by k triangle */
-  /*           of the array A is not referenced. */
-  /*           The following program segment will transfer the upper */
-  /*           triangular part of a symmetric band matrix from conventional */
-  /*           full matrix storage to band storage: */
-
-  /*                 DO 20, J = 1, N */
-  /*                    M = K + 1 - J */
-  /*                    DO 10, I = MAX( 1, J - K ), J */
-  /*                       A( M + I, J ) = matrix( I, J ) */
-  /*              10    CONTINUE */
-  /*              20 CONTINUE */
-
-  /*           Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */
-  /*           by n part of the array A must contain the lower triangular */
-  /*           band part of the symmetric matrix, supplied column by */
-  /*           column, with the leading diagonal of the matrix in row 1 of */
-  /*           the array, the first sub-diagonal starting at position 1 in */
-  /*           row 2, and so on. The bottom right k by k triangle of the */
-  /*           array A is not referenced. */
-  /*           The following program segment will transfer the lower */
-  /*           triangular part of a symmetric band matrix from conventional */
-  /*           full matrix storage to band storage: */
-
-  /*                 DO 20, J = 1, N */
-  /*                    M = 1 - J */
-  /*                    DO 10, I = J, MIN( N, J + K ) */
-  /*                       A( M + I, J ) = matrix( I, J ) */
-  /*              10    CONTINUE */
-  /*              20 CONTINUE */
-
-  /*           Unchanged on exit. */
-
-  /*  LDA    - INTEGER. */
-  /*           On entry, LDA specifies the first dimension of A as declared */
-  /*           in the calling (sub) program. LDA must be at least */
-  /*           ( k + 1 ). */
-  /*           Unchanged on exit. */
-
-  /*  X      - REAL             array of DIMENSION at least */
-  /*           ( 1 + ( n - 1 )*abs( INCX ) ). */
-  /*           Before entry, the incremented array X must contain the */
-  /*           vector x. */
-  /*           Unchanged on exit. */
-
-  /*  INCX   - INTEGER. */
-  /*           On entry, INCX specifies the increment for the elements of */
-  /*           X. INCX must not be zero. */
-  /*           Unchanged on exit. */
-
-  /*  BETA   - REAL            . */
-  /*           On entry, BETA specifies the scalar beta. */
-  /*           Unchanged on exit. */
-
-  /*  Y      - REAL             array of DIMENSION at least */
-  /*           ( 1 + ( n - 1 )*abs( INCY ) ). */
-  /*           Before entry, the incremented array Y must contain the */
-  /*           vector y. On exit, Y is overwritten by the updated vector y. */
-
-  /*  INCY   - INTEGER. */
-  /*           On entry, INCY specifies the increment for the elements of */
-  /*           Y. INCY must not be zero. */
-  /*           Unchanged on exit. */
-
-  /*  Further Details */
-  /*  =============== */
-
-  /*  Level 2 Blas routine. */
-
-  /*  -- Written on 22-October-1986. */
-  /*     Jack Dongarra, Argonne National Lab. */
-  /*     Jeremy Du Croz, Nag Central Office. */
-  /*     Sven Hammarling, Nag Central Office. */
-  /*     Richard Hanson, Sandia National Labs. */
-
-  /*  ===================================================================== */
-
-  /*     .. Parameters .. */
-  /*     .. */
-  /*     .. Local Scalars .. */
-  /*     .. */
-  /*     .. External Functions .. */
-  /*     .. */
-  /*     .. External Subroutines .. */
-  /*     .. */
-  /*     .. Intrinsic Functions .. */
-  /*     .. */
-
-  /*     Test the input parameters. */
-
-  /* Parameter adjustments */
-  a_dim1 = *lda;
-  a_offset = 1 + a_dim1;
-  a -= a_offset;
-  --x;
-  --y;
-
-  /* Function Body */
-  info = 0;
-  if (!lsame_(uplo, "U") && !lsame_(uplo, "L")) {
-    info = 1;
-  } else if (*n < 0) {
-    info = 2;
-  } else if (*k < 0) {
-    info = 3;
-  } else if (*lda < *k + 1) {
-    info = 6;
-  } else if (*incx == 0) {
-    info = 8;
-  } else if (*incy == 0) {
-    info = 11;
-  }
-  if (info != 0) {
-    xerbla_("SSBMV ", &info);
-    return;
-  }
-
-  /*     Quick return if possible. */
-
-  if (*n == 0 || (*alpha == 0.f && *beta == 1.f)) {
-    return;
-  }
-
-  /*     Set up the start points in  X  and  Y. */
-
-  if (*incx > 0) {
-    kx = 1;
-  } else {
-    kx = 1 - (*n - 1) * *incx;
-  }
-  if (*incy > 0) {
-    ky = 1;
-  } else {
-    ky = 1 - (*n - 1) * *incy;
-  }
-
-  /*     Start the operations. In this version the elements of the array A */
-  /*     are accessed sequentially with one pass through A. */
-
-  /*     First form  y := beta*y. */
-
-  if (*beta != 1.f) {
-    if (*incy == 1) {
-      if (*beta == 0.f) {
-        i__1 = *n;
-        for (i__ = 1; i__ <= i__1; ++i__) {
-          y[i__] = 0.f;
-          /* L10: */
-        }
-      } else {
-        i__1 = *n;
-        for (i__ = 1; i__ <= i__1; ++i__) {
-          y[i__] = *beta * y[i__];
-          /* L20: */
-        }
-      }
-    } else {
-      iy = ky;
-      if (*beta == 0.f) {
-        i__1 = *n;
-        for (i__ = 1; i__ <= i__1; ++i__) {
-          y[iy] = 0.f;
-          iy += *incy;
-          /* L30: */
-        }
-      } else {
-        i__1 = *n;
-        for (i__ = 1; i__ <= i__1; ++i__) {
-          y[iy] = *beta * y[iy];
-          iy += *incy;
-          /* L40: */
-        }
-      }
-    }
-  }
-  if (*alpha == 0.f) {
-    return;
-  }
-  if (lsame_(uplo, "U")) {
-    /*        Form  y  when upper triangle of A is stored. */
-
-    kplus1 = *k + 1;
-    if (*incx == 1 && *incy == 1) {
-      i__1 = *n;
-      for (j = 1; j <= i__1; ++j) {
-        temp1 = *alpha * x[j];
-        temp2 = 0.f;
-        l = kplus1 - j;
-        /* Computing MAX */
-        i__2 = 1, i__3 = j - *k;
-        i__4 = j - 1;
-        for (i__ = max(i__2, i__3); i__ <= i__4; ++i__) {
-          y[i__] += temp1 * a[l + i__ + j * a_dim1];
-          temp2 += a[l + i__ + j * a_dim1] * x[i__];
-          /* L50: */
-        }
-        y[j] = y[j] + temp1 * a[kplus1 + j * a_dim1] + *alpha * temp2;
-        /* L60: */
-      }
-    } else {
-      jx = kx;
-      jy = ky;
-      i__1 = *n;
-      for (j = 1; j <= i__1; ++j) {
-        temp1 = *alpha * x[jx];
-        temp2 = 0.f;
-        ix = kx;
-        iy = ky;
-        l = kplus1 - j;
-        /* Computing MAX */
-        i__4 = 1, i__2 = j - *k;
-        i__3 = j - 1;
-        for (i__ = max(i__4, i__2); i__ <= i__3; ++i__) {
-          y[iy] += temp1 * a[l + i__ + j * a_dim1];
-          temp2 += a[l + i__ + j * a_dim1] * x[ix];
-          ix += *incx;
-          iy += *incy;
-          /* L70: */
-        }
-        y[jy] = y[jy] + temp1 * a[kplus1 + j * a_dim1] + *alpha * temp2;
-        jx += *incx;
-        jy += *incy;
-        if (j > *k) {
-          kx += *incx;
-          ky += *incy;
-        }
-        /* L80: */
-      }
-    }
-  } else {
-    /*        Form  y  when lower triangle of A is stored. */
-
-    if (*incx == 1 && *incy == 1) {
-      i__1 = *n;
-      for (j = 1; j <= i__1; ++j) {
-        temp1 = *alpha * x[j];
-        temp2 = 0.f;
-        y[j] += temp1 * a[j * a_dim1 + 1];
-        l = 1 - j;
-        /* Computing MIN */
-        i__4 = *n, i__2 = j + *k;
-        i__3 = min(i__4, i__2);
-        for (i__ = j + 1; i__ <= i__3; ++i__) {
-          y[i__] += temp1 * a[l + i__ + j * a_dim1];
-          temp2 += a[l + i__ + j * a_dim1] * x[i__];
-          /* L90: */
-        }
-        y[j] += *alpha * temp2;
-        /* L100: */
-      }
-    } else {
-      jx = kx;
-      jy = ky;
-      i__1 = *n;
-      for (j = 1; j <= i__1; ++j) {
-        temp1 = *alpha * x[jx];
-        temp2 = 0.f;
-        y[jy] += temp1 * a[j * a_dim1 + 1];
-        l = 1 - j;
-        ix = jx;
-        iy = jy;
-        /* Computing MIN */
-        i__4 = *n, i__2 = j + *k;
-        i__3 = min(i__4, i__2);
-        for (i__ = j + 1; i__ <= i__3; ++i__) {
-          ix += *incx;
-          iy += *incy;
-          y[iy] += temp1 * a[l + i__ + j * a_dim1];
-          temp2 += a[l + i__ + j * a_dim1] * x[ix];
-          /* L110: */
-        }
-        y[jy] += *alpha * temp2;
-        jx += *incx;
-        jy += *incy;
-        /* L120: */
-      }
-    }
-  }
-
-  /*     End of SSBMV . */
-
-} /* ssbmv_ */
--- a/blas/f2c/sspmv.c
+++ b/blas/f2c/sspmv.c
@@ -1,308 +0,0 @@
-/* sspmv.f -- translated by f2c (version 20100827).
-   You must link the resulting object file with libf2c:
-        on Microsoft Windows system, link with libf2c.lib;
-        on Linux or Unix systems, link with .../path/to/libf2c.a -lm
-        or, if you install libf2c.a in a standard place, with -lf2c -lm
-        -- in that order, at the end of the command line, as in
-                cc *.o -lf2c -lm
-        Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
-
-                http://www.netlib.org/f2c/libf2c.zip
-*/
-
-#include "datatypes.h"
-
-/* Subroutine */ void sspmv_(char *uplo, integer *n, real *alpha, real *ap, real *x, integer *incx, real *beta, real *y,
-                             integer *incy) {
-  /* System generated locals */
-  integer i__1, i__2;
-
-  /* Local variables */
-  integer i__, j, k, kk, ix, iy, jx, jy, kx, ky, info;
-  real temp1, temp2;
-  extern logical lsame_(char *, char *);
-  extern /* Subroutine */ void xerbla_(const char *, integer *);
-
-  /*     .. Scalar Arguments .. */
-  /*     .. */
-  /*     .. Array Arguments .. */
-  /*     .. */
-
-  /*  Purpose */
-  /*  ======= */
-
-  /*  SSPMV  performs the matrix-vector operation */
-
-  /*     y := alpha*A*x + beta*y, */
-
-  /*  where alpha and beta are scalars, x and y are n element vectors and */
-  /*  A is an n by n symmetric matrix, supplied in packed form. */
-
-  /*  Arguments */
-  /*  ========== */
-
-  /*  UPLO   - CHARACTER*1. */
-  /*           On entry, UPLO specifies whether the upper or lower */
-  /*           triangular part of the matrix A is supplied in the packed */
-  /*           array AP as follows: */
-
-  /*              UPLO = 'U' or 'u'   The upper triangular part of A is */
-  /*                                  supplied in AP. */
-
-  /*              UPLO = 'L' or 'l'   The lower triangular part of A is */
-  /*                                  supplied in AP. */
-
-  /*           Unchanged on exit. */
-
-  /*  N      - INTEGER. */
-  /*           On entry, N specifies the order of the matrix A. */
-  /*           N must be at least zero. */
-  /*           Unchanged on exit. */
-
-  /*  ALPHA  - REAL            . */
-  /*           On entry, ALPHA specifies the scalar alpha. */
-  /*           Unchanged on exit. */
-
-  /*  AP     - REAL             array of DIMENSION at least */
-  /*           ( ( n*( n + 1 ) )/2 ). */
-  /*           Before entry with UPLO = 'U' or 'u', the array AP must */
-  /*           contain the upper triangular part of the symmetric matrix */
-  /*           packed sequentially, column by column, so that AP( 1 ) */
-  /*           contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) */
-  /*           and a( 2, 2 ) respectively, and so on. */
-  /*           Before entry with UPLO = 'L' or 'l', the array AP must */
-  /*           contain the lower triangular part of the symmetric matrix */
-  /*           packed sequentially, column by column, so that AP( 1 ) */
-  /*           contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) */
-  /*           and a( 3, 1 ) respectively, and so on. */
-  /*           Unchanged on exit. */
-
-  /*  X      - REAL             array of dimension at least */
-  /*           ( 1 + ( n - 1 )*abs( INCX ) ). */
-  /*           Before entry, the incremented array X must contain the n */
-  /*           element vector x. */
-  /*           Unchanged on exit. */
-
-  /*  INCX   - INTEGER. */
-  /*           On entry, INCX specifies the increment for the elements of */
-  /*           X. INCX must not be zero. */
-  /*           Unchanged on exit. */
-
-  /*  BETA   - REAL            . */
-  /*           On entry, BETA specifies the scalar beta. When BETA is */
-  /*           supplied as zero then Y need not be set on input. */
-  /*           Unchanged on exit. */
-
-  /*  Y      - REAL             array of dimension at least */
-  /*           ( 1 + ( n - 1 )*abs( INCY ) ). */
-  /*           Before entry, the incremented array Y must contain the n */
-  /*           element vector y. On exit, Y is overwritten by the updated */
-  /*           vector y. */
-
-  /*  INCY   - INTEGER. */
-  /*           On entry, INCY specifies the increment for the elements of */
-  /*           Y. INCY must not be zero. */
-  /*           Unchanged on exit. */
-
-  /*  Further Details */
-  /*  =============== */
-
-  /*  Level 2 Blas routine. */
-
-  /*  -- Written on 22-October-1986. */
-  /*     Jack Dongarra, Argonne National Lab. */
-  /*     Jeremy Du Croz, Nag Central Office. */
-  /*     Sven Hammarling, Nag Central Office. */
-  /*     Richard Hanson, Sandia National Labs. */
-
-  /*  ===================================================================== */
-
-  /*     .. Parameters .. */
-  /*     .. */
-  /*     .. Local Scalars .. */
-  /*     .. */
-  /*     .. External Functions .. */
-  /*     .. */
-  /*     .. External Subroutines .. */
-  /*     .. */
-
-  /*     Test the input parameters. */
-
-  /* Parameter adjustments */
-  --y;
-  --x;
-  --ap;
-
-  /* Function Body */
-  info = 0;
-  if (!lsame_(uplo, "U") && !lsame_(uplo, "L")) {
-    info = 1;
-  } else if (*n < 0) {
-    info = 2;
-  } else if (*incx == 0) {
-    info = 6;
-  } else if (*incy == 0) {
-    info = 9;
-  }
-  if (info != 0) {
-    xerbla_("SSPMV ", &info);
-    return;
-  }
-
-  /*     Quick return if possible. */
-
-  if (*n == 0 || (*alpha == 0.f && *beta == 1.f)) {
-    return;
-  }
-
-  /*     Set up the start points in  X  and  Y. */
-
-  if (*incx > 0) {
-    kx = 1;
-  } else {
-    kx = 1 - (*n - 1) * *incx;
-  }
-  if (*incy > 0) {
-    ky = 1;
-  } else {
-    ky = 1 - (*n - 1) * *incy;
-  }
-
-  /*     Start the operations. In this version the elements of the array AP */
-  /*     are accessed sequentially with one pass through AP. */
-
-  /*     First form  y := beta*y. */
-
-  if (*beta != 1.f) {
-    if (*incy == 1) {
-      if (*beta == 0.f) {
-        i__1 = *n;
-        for (i__ = 1; i__ <= i__1; ++i__) {
-          y[i__] = 0.f;
-          /* L10: */
-        }
-      } else {
-        i__1 = *n;
-        for (i__ = 1; i__ <= i__1; ++i__) {
-          y[i__] = *beta * y[i__];
-          /* L20: */
-        }
-      }
-    } else {
-      iy = ky;
-      if (*beta == 0.f) {
-        i__1 = *n;
-        for (i__ = 1; i__ <= i__1; ++i__) {
-          y[iy] = 0.f;
-          iy += *incy;
-          /* L30: */
-        }
-      } else {
-        i__1 = *n;
-        for (i__ = 1; i__ <= i__1; ++i__) {
-          y[iy] = *beta * y[iy];
-          iy += *incy;
-          /* L40: */
-        }
-      }
-    }
-  }
-  if (*alpha == 0.f) {
-    return;
-  }
-  kk = 1;
-  if (lsame_(uplo, "U")) {
-    /*        Form  y  when AP contains the upper triangle. */
-
-    if (*incx == 1 && *incy == 1) {
-      i__1 = *n;
-      for (j = 1; j <= i__1; ++j) {
-        temp1 = *alpha * x[j];
-        temp2 = 0.f;
-        k = kk;
-        i__2 = j - 1;
-        for (i__ = 1; i__ <= i__2; ++i__) {
-          y[i__] += temp1 * ap[k];
-          temp2 += ap[k] * x[i__];
-          ++k;
-          /* L50: */
-        }
-        y[j] = y[j] + temp1 * ap[kk + j - 1] + *alpha * temp2;
-        kk += j;
-        /* L60: */
-      }
-    } else {
-      jx = kx;
-      jy = ky;
-      i__1 = *n;
-      for (j = 1; j <= i__1; ++j) {
-        temp1 = *alpha * x[jx];
-        temp2 = 0.f;
-        ix = kx;
-        iy = ky;
-        i__2 = kk + j - 2;
-        for (k = kk; k <= i__2; ++k) {
-          y[iy] += temp1 * ap[k];
-          temp2 += ap[k] * x[ix];
-          ix += *incx;
-          iy += *incy;
-          /* L70: */
-        }
-        y[jy] = y[jy] + temp1 * ap[kk + j - 1] + *alpha * temp2;
-        jx += *incx;
-        jy += *incy;
-        kk += j;
-        /* L80: */
-      }
-    }
-  } else {
-    /*        Form  y  when AP contains the lower triangle. */
-
-    if (*incx == 1 && *incy == 1) {
-      i__1 = *n;
-      for (j = 1; j <= i__1; ++j) {
-        temp1 = *alpha * x[j];
-        temp2 = 0.f;
-        y[j] += temp1 * ap[kk];
-        k = kk + 1;
-        i__2 = *n;
-        for (i__ = j + 1; i__ <= i__2; ++i__) {
-          y[i__] += temp1 * ap[k];
-          temp2 += ap[k] * x[i__];
-          ++k;
-          /* L90: */
-        }
-        y[j] += *alpha * temp2;
-        kk += *n - j + 1;
-        /* L100: */
-      }
-    } else {
-      jx = kx;
-      jy = ky;
-      i__1 = *n;
-      for (j = 1; j <= i__1; ++j) {
-        temp1 = *alpha * x[jx];
-        temp2 = 0.f;
-        y[jy] += temp1 * ap[kk];
-        ix = jx;
-        iy = jy;
-        i__2 = kk + *n - j;
-        for (k = kk + 1; k <= i__2; ++k) {
-          ix += *incx;
-          iy += *incy;
-          y[iy] += temp1 * ap[k];
-          temp2 += ap[k] * x[ix];
-          /* L110: */
-        }
-        y[jy] += *alpha * temp2;
-        jx += *incx;
-        jy += *incy;
-        kk += *n - j + 1;
-        /* L120: */
-      }
-    }
-  }
-
-  /*     End of SSPMV . */
-
-} /* sspmv_ */
--- a/blas/f2c/stbmv.c
+++ b/blas/f2c/stbmv.c
@@ -1,417 +0,0 @@
-/* stbmv.f -- translated by f2c (version 20100827).
-   You must link the resulting object file with libf2c:
-        on Microsoft Windows system, link with libf2c.lib;
-        on Linux or Unix systems, link with .../path/to/libf2c.a -lm
-        or, if you install libf2c.a in a standard place, with -lf2c -lm
-        -- in that order, at the end of the command line, as in
-                cc *.o -lf2c -lm
-        Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
-
-                http://www.netlib.org/f2c/libf2c.zip
-*/
-
-#include "datatypes.h"
-
-/* Subroutine */ void stbmv_(char *uplo, char *trans, char *diag, integer *n, integer *k, real *a, integer *lda,
-                             real *x, integer *incx) {
-  /* System generated locals */
-  integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
-
-  /* Local variables */
-  integer i__, j, l, ix, jx, kx, info;
-  real temp;
-  extern logical lsame_(char *, char *);
-  integer kplus1;
-  extern /* Subroutine */ void xerbla_(const char *, integer *);
-  logical nounit;
-
-  /*     .. Scalar Arguments .. */
-  /*     .. */
-  /*     .. Array Arguments .. */
-  /*     .. */
-
-  /*  Purpose */
-  /*  ======= */
-
-  /*  STBMV  performs one of the matrix-vector operations */
-
-  /*     x := A*x,   or   x := A'*x, */
-
-  /*  where x is an n element vector and  A is an n by n unit, or non-unit, */
-  /*  upper or lower triangular band matrix, with ( k + 1 ) diagonals. */
-
-  /*  Arguments */
-  /*  ========== */
-
-  /*  UPLO   - CHARACTER*1. */
-  /*           On entry, UPLO specifies whether the matrix is an upper or */
-  /*           lower triangular matrix as follows: */
-
-  /*              UPLO = 'U' or 'u'   A is an upper triangular matrix. */
-
-  /*              UPLO = 'L' or 'l'   A is a lower triangular matrix. */
-
-  /*           Unchanged on exit. */
-
-  /*  TRANS  - CHARACTER*1. */
-  /*           On entry, TRANS specifies the operation to be performed as */
-  /*           follows: */
-
-  /*              TRANS = 'N' or 'n'   x := A*x. */
-
-  /*              TRANS = 'T' or 't'   x := A'*x. */
-
-  /*              TRANS = 'C' or 'c'   x := A'*x. */
-
-  /*           Unchanged on exit. */
-
-  /*  DIAG   - CHARACTER*1. */
-  /*           On entry, DIAG specifies whether or not A is unit */
-  /*           triangular as follows: */
-
-  /*              DIAG = 'U' or 'u'   A is assumed to be unit triangular. */
-
-  /*              DIAG = 'N' or 'n'   A is not assumed to be unit */
-  /*                                  triangular. */
-
-  /*           Unchanged on exit. */
-
-  /*  N      - INTEGER. */
-  /*           On entry, N specifies the order of the matrix A. */
-  /*           N must be at least zero. */
-  /*           Unchanged on exit. */
-
-  /*  K      - INTEGER. */
-  /*           On entry with UPLO = 'U' or 'u', K specifies the number of */
-  /*           super-diagonals of the matrix A. */
-  /*           On entry with UPLO = 'L' or 'l', K specifies the number of */
-  /*           sub-diagonals of the matrix A. */
-  /*           K must satisfy  0 .le. K. */
-  /*           Unchanged on exit. */
-
-  /*  A      - REAL             array of DIMENSION ( LDA, n ). */
-  /*           Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */
-  /*           by n part of the array A must contain the upper triangular */
-  /*           band part of the matrix of coefficients, supplied column by */
-  /*           column, with the leading diagonal of the matrix in row */
-  /*           ( k + 1 ) of the array, the first super-diagonal starting at */
-  /*           position 2 in row k, and so on. The top left k by k triangle */
-  /*           of the array A is not referenced. */
-  /*           The following program segment will transfer an upper */
-  /*           triangular band matrix from conventional full matrix storage */
-  /*           to band storage: */
-
-  /*                 DO 20, J = 1, N */
-  /*                    M = K + 1 - J */
-  /*                    DO 10, I = MAX( 1, J - K ), J */
-  /*                       A( M + I, J ) = matrix( I, J ) */
-  /*              10    CONTINUE */
-  /*              20 CONTINUE */
-
-  /*           Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */
-  /*           by n part of the array A must contain the lower triangular */
-  /*           band part of the matrix of coefficients, supplied column by */
-  /*           column, with the leading diagonal of the matrix in row 1 of */
-  /*           the array, the first sub-diagonal starting at position 1 in */
-  /*           row 2, and so on. The bottom right k by k triangle of the */
-  /*           array A is not referenced. */
-  /*           The following program segment will transfer a lower */
-  /*           triangular band matrix from conventional full matrix storage */
-  /*           to band storage: */
-
-  /*                 DO 20, J = 1, N */
-  /*                    M = 1 - J */
-  /*                    DO 10, I = J, MIN( N, J + K ) */
-  /*                       A( M + I, J ) = matrix( I, J ) */
-  /*              10    CONTINUE */
-  /*              20 CONTINUE */
-
-  /*           Note that when DIAG = 'U' or 'u' the elements of the array A */
-  /*           corresponding to the diagonal elements of the matrix are not */
-  /*           referenced, but are assumed to be unity. */
-  /*           Unchanged on exit. */
-
-  /*  LDA    - INTEGER. */
-  /*           On entry, LDA specifies the first dimension of A as declared */
-  /*           in the calling (sub) program. LDA must be at least */
-  /*           ( k + 1 ). */
-  /*           Unchanged on exit. */
-
-  /*  X      - REAL             array of dimension at least */
-  /*           ( 1 + ( n - 1 )*abs( INCX ) ). */
-  /*           Before entry, the incremented array X must contain the n */
-  /*           element vector x. On exit, X is overwritten with the */
-  /*           transformed vector x. */
-
-  /*  INCX   - INTEGER. */
-  /*           On entry, INCX specifies the increment for the elements of */
-  /*           X. INCX must not be zero. */
-  /*           Unchanged on exit. */
-
-  /*  Further Details */
-  /*  =============== */
-
-  /*  Level 2 Blas routine. */
-
-  /*  -- Written on 22-October-1986. */
-  /*     Jack Dongarra, Argonne National Lab. */
-  /*     Jeremy Du Croz, Nag Central Office. */
-  /*     Sven Hammarling, Nag Central Office. */
-  /*     Richard Hanson, Sandia National Labs. */
-
-  /*  ===================================================================== */
-
-  /*     .. Parameters .. */
-  /*     .. */
-  /*     .. Local Scalars .. */
-  /*     .. */
-  /*     .. External Functions .. */
-  /*     .. */
-  /*     .. External Subroutines .. */
-  /*     .. */
-  /*     .. Intrinsic Functions .. */
-  /*     .. */
-
-  /*     Test the input parameters. */
-
-  /* Parameter adjustments */
-  a_dim1 = *lda;
-  a_offset = 1 + a_dim1;
-  a -= a_offset;
-  --x;
-
-  /* Function Body */
-  info = 0;
-  if (!lsame_(uplo, "U") && !lsame_(uplo, "L")) {
-    info = 1;
-  } else if (!lsame_(trans, "N") && !lsame_(trans, "T") && !lsame_(trans, "C")) {
-    info = 2;
-  } else if (!lsame_(diag, "U") && !lsame_(diag, "N")) {
-    info = 3;
-  } else if (*n < 0) {
-    info = 4;
-  } else if (*k < 0) {
-    info = 5;
-  } else if (*lda < *k + 1) {
-    info = 7;
-  } else if (*incx == 0) {
-    info = 9;
-  }
-  if (info != 0) {
-    xerbla_("STBMV ", &info);
-    return;
-  }
-
-  /*     Quick return if possible. */
-
-  if (*n == 0) {
-    return;
-  }
-
-  nounit = lsame_(diag, "N");
-
-  /*     Set up the start point in X if the increment is not unity. This */
-  /*     will be  ( N - 1 )*INCX   too small for descending loops. */
-
-  if (*incx <= 0) {
-    kx = 1 - (*n - 1) * *incx;
-  } else if (*incx != 1) {
-    kx = 1;
-  }
-
-  /*     Start the operations. In this version the elements of A are */
-  /*     accessed sequentially with one pass through A. */
-
-  if (lsame_(trans, "N")) {
-    /*         Form  x := A*x. */
-
-    if (lsame_(uplo, "U")) {
-      kplus1 = *k + 1;
-      if (*incx == 1) {
-        i__1 = *n;
-        for (j = 1; j <= i__1; ++j) {
-          if (x[j] != 0.f) {
-            temp = x[j];
-            l = kplus1 - j;
-            /* Computing MAX */
-            i__2 = 1, i__3 = j - *k;
-            i__4 = j - 1;
-            for (i__ = max(i__2, i__3); i__ <= i__4; ++i__) {
-              x[i__] += temp * a[l + i__ + j * a_dim1];
-              /* L10: */
-            }
-            if (nounit) {
-              x[j] *= a[kplus1 + j * a_dim1];
-            }
-          }
-          /* L20: */
-        }
-      } else {
-        jx = kx;
-        i__1 = *n;
-        for (j = 1; j <= i__1; ++j) {
-          if (x[jx] != 0.f) {
-            temp = x[jx];
-            ix = kx;
-            l = kplus1 - j;
-            /* Computing MAX */
-            i__4 = 1, i__2 = j - *k;
-            i__3 = j - 1;
-            for (i__ = max(i__4, i__2); i__ <= i__3; ++i__) {
-              x[ix] += temp * a[l + i__ + j * a_dim1];
-              ix += *incx;
-              /* L30: */
-            }
-            if (nounit) {
-              x[jx] *= a[kplus1 + j * a_dim1];
-            }
-          }
-          jx += *incx;
-          if (j > *k) {
-            kx += *incx;
-          }
-          /* L40: */
-        }
-      }
-    } else {
-      if (*incx == 1) {
-        for (j = *n; j >= 1; --j) {
-          if (x[j] != 0.f) {
-            temp = x[j];
-            l = 1 - j;
-            /* Computing MIN */
-            i__1 = *n, i__3 = j + *k;
-            i__4 = j + 1;
-            for (i__ = min(i__1, i__3); i__ >= i__4; --i__) {
-              x[i__] += temp * a[l + i__ + j * a_dim1];
-              /* L50: */
-            }
-            if (nounit) {
-              x[j] *= a[j * a_dim1 + 1];
-            }
-          }
-          /* L60: */
-        }
-      } else {
-        kx += (*n - 1) * *incx;
-        jx = kx;
-        for (j = *n; j >= 1; --j) {
-          if (x[jx] != 0.f) {
-            temp = x[jx];
-            ix = kx;
-            l = 1 - j;
-            /* Computing MIN */
-            i__4 = *n, i__1 = j + *k;
-            i__3 = j + 1;
-            for (i__ = min(i__4, i__1); i__ >= i__3; --i__) {
-              x[ix] += temp * a[l + i__ + j * a_dim1];
-              ix -= *incx;
-              /* L70: */
-            }
-            if (nounit) {
-              x[jx] *= a[j * a_dim1 + 1];
-            }
-          }
-          jx -= *incx;
-          if (*n - j >= *k) {
-            kx -= *incx;
-          }
-          /* L80: */
-        }
-      }
-    }
-  } else {
-    /*        Form  x := A'*x. */
-
-    if (lsame_(uplo, "U")) {
-      kplus1 = *k + 1;
-      if (*incx == 1) {
-        for (j = *n; j >= 1; --j) {
-          temp = x[j];
-          l = kplus1 - j;
-          if (nounit) {
-            temp *= a[kplus1 + j * a_dim1];
-          }
-          /* Computing MAX */
-          i__4 = 1, i__1 = j - *k;
-          i__3 = max(i__4, i__1);
-          for (i__ = j - 1; i__ >= i__3; --i__) {
-            temp += a[l + i__ + j * a_dim1] * x[i__];
-            /* L90: */
-          }
-          x[j] = temp;
-          /* L100: */
-        }
-      } else {
-        kx += (*n - 1) * *incx;
-        jx = kx;
-        for (j = *n; j >= 1; --j) {
-          temp = x[jx];
-          kx -= *incx;
-          ix = kx;
-          l = kplus1 - j;
-          if (nounit) {
-            temp *= a[kplus1 + j * a_dim1];
-          }
-          /* Computing MAX */
-          i__4 = 1, i__1 = j - *k;
-          i__3 = max(i__4, i__1);
-          for (i__ = j - 1; i__ >= i__3; --i__) {
-            temp += a[l + i__ + j * a_dim1] * x[ix];
-            ix -= *incx;
-            /* L110: */
-          }
-          x[jx] = temp;
-          jx -= *incx;
-          /* L120: */
-        }
-      }
-    } else {
-      if (*incx == 1) {
-        i__3 = *n;
-        for (j = 1; j <= i__3; ++j) {
-          temp = x[j];
-          l = 1 - j;
-          if (nounit) {
-            temp *= a[j * a_dim1 + 1];
-          }
-          /* Computing MIN */
-          i__1 = *n, i__2 = j + *k;
-          i__4 = min(i__1, i__2);
-          for (i__ = j + 1; i__ <= i__4; ++i__) {
-            temp += a[l + i__ + j * a_dim1] * x[i__];
-            /* L130: */
-          }
-          x[j] = temp;
-          /* L140: */
-        }
-      } else {
-        jx = kx;
-        i__3 = *n;
-        for (j = 1; j <= i__3; ++j) {
-          temp = x[jx];
-          kx += *incx;
-          ix = kx;
-          l = 1 - j;
-          if (nounit) {
-            temp *= a[j * a_dim1 + 1];
-          }
-          /* Computing MIN */
-          i__1 = *n, i__2 = j + *k;
-          i__4 = min(i__1, i__2);
-          for (i__ = j + 1; i__ <= i__4; ++i__) {
-            temp += a[l + i__ + j * a_dim1] * x[ix];
-            ix += *incx;
-            /* L150: */
-          }
-          x[jx] = temp;
-          jx += *incx;
-          /* L160: */
-        }
-      }
-    }
-  }
-
-  /*     End of STBMV . */
-
-} /* stbmv_ */
--- a/blas/f2c/zhbmv.c
+++ b/blas/f2c/zhbmv.c
@@ -1,456 +0,0 @@
-/* zhbmv.f -- translated by f2c (version 20100827).
-   You must link the resulting object file with libf2c:
-        on Microsoft Windows system, link with libf2c.lib;
-        on Linux or Unix systems, link with .../path/to/libf2c.a -lm
-        or, if you install libf2c.a in a standard place, with -lf2c -lm
-        -- in that order, at the end of the command line, as in
-                cc *.o -lf2c -lm
-        Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
-
-                http://www.netlib.org/f2c/libf2c.zip
-*/
-
-#include "datatypes.h"
-
-static inline void d_cnjg(doublecomplex *r, doublecomplex *z) {
-  r->r = z->r;
-  r->i = -(z->i);
-}
-
-/* Subroutine */ void zhbmv_(char *uplo, integer *n, integer *k, doublecomplex *alpha, doublecomplex *a, integer *lda,
-                             doublecomplex *x, integer *incx, doublecomplex *beta, doublecomplex *y, integer *incy) {
-  /* System generated locals */
-  integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5;
-  doublereal d__1;
-  doublecomplex z__1, z__2, z__3, z__4;
-
-  /* Local variables */
-  integer i__, j, l, ix, iy, jx, jy, kx, ky, info;
-  doublecomplex temp1, temp2;
-  extern logical lsame_(char *, char *);
-  integer kplus1;
-  extern /* Subroutine */ void xerbla_(const char *, integer *);
-
-  /*     .. Scalar Arguments .. */
-  /*     .. */
-  /*     .. Array Arguments .. */
-  /*     .. */
-
-  /*  Purpose */
-  /*  ======= */
-
-  /*  ZHBMV  performs the matrix-vector  operation */
-
-  /*     y := alpha*A*x + beta*y, */
-
-  /*  where alpha and beta are scalars, x and y are n element vectors and */
-  /*  A is an n by n hermitian band matrix, with k super-diagonals. */
-
-  /*  Arguments */
-  /*  ========== */
-
-  /*  UPLO   - CHARACTER*1. */
-  /*           On entry, UPLO specifies whether the upper or lower */
-  /*           triangular part of the band matrix A is being supplied as */
-  /*           follows: */
-
-  /*              UPLO = 'U' or 'u'   The upper triangular part of A is */
-  /*                                  being supplied. */
-
-  /*              UPLO = 'L' or 'l'   The lower triangular part of A is */
-  /*                                  being supplied. */
-
-  /*           Unchanged on exit. */
-
-  /*  N      - INTEGER. */
-  /*           On entry, N specifies the order of the matrix A. */
-  /*           N must be at least zero. */
-  /*           Unchanged on exit. */
-
-  /*  K      - INTEGER. */
-  /*           On entry, K specifies the number of super-diagonals of the */
-  /*           matrix A. K must satisfy  0 .le. K. */
-  /*           Unchanged on exit. */
-
-  /*  ALPHA  - COMPLEX*16      . */
-  /*           On entry, ALPHA specifies the scalar alpha. */
-  /*           Unchanged on exit. */
-
-  /*  A      - COMPLEX*16       array of DIMENSION ( LDA, n ). */
-  /*           Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */
-  /*           by n part of the array A must contain the upper triangular */
-  /*           band part of the hermitian matrix, supplied column by */
-  /*           column, with the leading diagonal of the matrix in row */
-  /*           ( k + 1 ) of the array, the first super-diagonal starting at */
-  /*           position 2 in row k, and so on. The top left k by k triangle */
-  /*           of the array A is not referenced. */
-  /*           The following program segment will transfer the upper */
-  /*           triangular part of a hermitian band matrix from conventional */
-  /*           full matrix storage to band storage: */
-
-  /*                 DO 20, J = 1, N */
-  /*                    M = K + 1 - J */
-  /*                    DO 10, I = MAX( 1, J - K ), J */
-  /*                       A( M + I, J ) = matrix( I, J ) */
-  /*              10    CONTINUE */
-  /*              20 CONTINUE */
-
-  /*           Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */
-  /*           by n part of the array A must contain the lower triangular */
-  /*           band part of the hermitian matrix, supplied column by */
-  /*           column, with the leading diagonal of the matrix in row 1 of */
-  /*           the array, the first sub-diagonal starting at position 1 in */
-  /*           row 2, and so on. The bottom right k by k triangle of the */
-  /*           array A is not referenced. */
-  /*           The following program segment will transfer the lower */
-  /*           triangular part of a hermitian band matrix from conventional */
-  /*           full matrix storage to band storage: */
-
-  /*                 DO 20, J = 1, N */
-  /*                    M = 1 - J */
-  /*                    DO 10, I = J, MIN( N, J + K ) */
-  /*                       A( M + I, J ) = matrix( I, J ) */
-  /*              10    CONTINUE */
-  /*              20 CONTINUE */
-
-  /*           Note that the imaginary parts of the diagonal elements need */
-  /*           not be set and are assumed to be zero. */
-  /*           Unchanged on exit. */
-
-  /*  LDA    - INTEGER. */
-  /*           On entry, LDA specifies the first dimension of A as declared */
-  /*           in the calling (sub) program. LDA must be at least */
-  /*           ( k + 1 ). */
-  /*           Unchanged on exit. */
-
-  /*  X      - COMPLEX*16       array of DIMENSION at least */
-  /*           ( 1 + ( n - 1 )*abs( INCX ) ). */
-  /*           Before entry, the incremented array X must contain the */
-  /*           vector x. */
-  /*           Unchanged on exit. */
-
-  /*  INCX   - INTEGER. */
-  /*           On entry, INCX specifies the increment for the elements of */
-  /*           X. INCX must not be zero. */
-  /*           Unchanged on exit. */
-
-  /*  BETA   - COMPLEX*16      . */
-  /*           On entry, BETA specifies the scalar beta. */
-  /*           Unchanged on exit. */
-
-  /*  Y      - COMPLEX*16       array of DIMENSION at least */
-  /*           ( 1 + ( n - 1 )*abs( INCY ) ). */
-  /*           Before entry, the incremented array Y must contain the */
-  /*           vector y. On exit, Y is overwritten by the updated vector y. */
-
-  /*  INCY   - INTEGER. */
-  /*           On entry, INCY specifies the increment for the elements of */
-  /*           Y. INCY must not be zero. */
-  /*           Unchanged on exit. */
-
-  /*  Further Details */
-  /*  =============== */
-
-  /*  Level 2 Blas routine. */
-
-  /*  -- Written on 22-October-1986. */
-  /*     Jack Dongarra, Argonne National Lab. */
-  /*     Jeremy Du Croz, Nag Central Office. */
-  /*     Sven Hammarling, Nag Central Office. */
-  /*     Richard Hanson, Sandia National Labs. */
-
-  /*  ===================================================================== */
-
-  /*     .. Parameters .. */
-  /*     .. */
-  /*     .. Local Scalars .. */
-  /*     .. */
-  /*     .. External Functions .. */
-  /*     .. */
-  /*     .. External Subroutines .. */
-  /*     .. */
-  /*     .. Intrinsic Functions .. */
-  /*     .. */
-
-  /*     Test the input parameters. */
-
-  /* Parameter adjustments */
-  a_dim1 = *lda;
-  a_offset = 1 + a_dim1;
-  a -= a_offset;
-  --x;
-  --y;
-
-  /* Function Body */
-  info = 0;
-  if (!lsame_(uplo, "U") && !lsame_(uplo, "L")) {
-    info = 1;
-  } else if (*n < 0) {
-    info = 2;
-  } else if (*k < 0) {
-    info = 3;
-  } else if (*lda < *k + 1) {
-    info = 6;
-  } else if (*incx == 0) {
-    info = 8;
-  } else if (*incy == 0) {
-    info = 11;
-  }
-  if (info != 0) {
-    xerbla_("ZHBMV ", &info);
-    return;
-  }
-
-  /*     Quick return if possible. */
-
-  if (*n == 0 || (alpha->r == 0. && alpha->i == 0. && (beta->r == 1. && beta->i == 0.))) {
-    return;
-  }
-
-  /*     Set up the start points in  X  and  Y. */
-
-  if (*incx > 0) {
-    kx = 1;
-  } else {
-    kx = 1 - (*n - 1) * *incx;
-  }
-  if (*incy > 0) {
-    ky = 1;
-  } else {
-    ky = 1 - (*n - 1) * *incy;
-  }
-
-  /*     Start the operations. In this version the elements of the array A */
-  /*     are accessed sequentially with one pass through A. */
-
-  /*     First form  y := beta*y. */
-
-  if (beta->r != 1. || beta->i != 0.) {
-    if (*incy == 1) {
-      if (beta->r == 0. && beta->i == 0.) {
-        i__1 = *n;
-        for (i__ = 1; i__ <= i__1; ++i__) {
-          i__2 = i__;
-          y[i__2].r = 0., y[i__2].i = 0.;
-          /* L10: */
-        }
-      } else {
-        i__1 = *n;
-        for (i__ = 1; i__ <= i__1; ++i__) {
-          i__2 = i__;
-          i__3 = i__;
-          z__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i, z__1.i = beta->r * y[i__3].i + beta->i * y[i__3].r;
-          y[i__2].r = z__1.r, y[i__2].i = z__1.i;
-          /* L20: */
-        }
-      }
-    } else {
-      iy = ky;
-      if (beta->r == 0. && beta->i == 0.) {
-        i__1 = *n;
-        for (i__ = 1; i__ <= i__1; ++i__) {
-          i__2 = iy;
-          y[i__2].r = 0., y[i__2].i = 0.;
-          iy += *incy;
-          /* L30: */
-        }
-      } else {
-        i__1 = *n;
-        for (i__ = 1; i__ <= i__1; ++i__) {
-          i__2 = iy;
-          i__3 = iy;
-          z__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i, z__1.i = beta->r * y[i__3].i + beta->i * y[i__3].r;
-          y[i__2].r = z__1.r, y[i__2].i = z__1.i;
-          iy += *incy;
-          /* L40: */
-        }
-      }
-    }
-  }
-  if (alpha->r == 0. && alpha->i == 0.) {
-    return;
-  }
-  if (lsame_(uplo, "U")) {
-    /*        Form  y  when upper triangle of A is stored. */
-
-    kplus1 = *k + 1;
-    if (*incx == 1 && *incy == 1) {
-      i__1 = *n;
-      for (j = 1; j <= i__1; ++j) {
-        i__2 = j;
-        z__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, z__1.i = alpha->r * x[i__2].i + alpha->i * x[i__2].r;
-        temp1.r = z__1.r, temp1.i = z__1.i;
-        temp2.r = 0., temp2.i = 0.;
-        l = kplus1 - j;
-        /* Computing MAX */
-        i__2 = 1, i__3 = j - *k;
-        i__4 = j - 1;
-        for (i__ = max(i__2, i__3); i__ <= i__4; ++i__) {
-          i__2 = i__;
-          i__3 = i__;
-          i__5 = l + i__ + j * a_dim1;
-          z__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i, z__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5].r;
-          z__1.r = y[i__3].r + z__2.r, z__1.i = y[i__3].i + z__2.i;
-          y[i__2].r = z__1.r, y[i__2].i = z__1.i;
-          d_cnjg(&z__3, &a[l + i__ + j * a_dim1]);
-          i__2 = i__;
-          z__2.r = z__3.r * x[i__2].r - z__3.i * x[i__2].i, z__2.i = z__3.r * x[i__2].i + z__3.i * x[i__2].r;
-          z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i;
-          temp2.r = z__1.r, temp2.i = z__1.i;
-          /* L50: */
-        }
-        i__4 = j;
-        i__2 = j;
-        i__3 = kplus1 + j * a_dim1;
-        d__1 = a[i__3].r;
-        z__3.r = d__1 * temp1.r, z__3.i = d__1 * temp1.i;
-        z__2.r = y[i__2].r + z__3.r, z__2.i = y[i__2].i + z__3.i;
-        z__4.r = alpha->r * temp2.r - alpha->i * temp2.i, z__4.i = alpha->r * temp2.i + alpha->i * temp2.r;
-        z__1.r = z__2.r + z__4.r, z__1.i = z__2.i + z__4.i;
-        y[i__4].r = z__1.r, y[i__4].i = z__1.i;
-        /* L60: */
-      }
-    } else {
-      jx = kx;
-      jy = ky;
-      i__1 = *n;
-      for (j = 1; j <= i__1; ++j) {
-        i__4 = jx;
-        z__1.r = alpha->r * x[i__4].r - alpha->i * x[i__4].i, z__1.i = alpha->r * x[i__4].i + alpha->i * x[i__4].r;
-        temp1.r = z__1.r, temp1.i = z__1.i;
-        temp2.r = 0., temp2.i = 0.;
-        ix = kx;
-        iy = ky;
-        l = kplus1 - j;
-        /* Computing MAX */
-        i__4 = 1, i__2 = j - *k;
-        i__3 = j - 1;
-        for (i__ = max(i__4, i__2); i__ <= i__3; ++i__) {
-          i__4 = iy;
-          i__2 = iy;
-          i__5 = l + i__ + j * a_dim1;
-          z__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i, z__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5].r;
-          z__1.r = y[i__2].r + z__2.r, z__1.i = y[i__2].i + z__2.i;
-          y[i__4].r = z__1.r, y[i__4].i = z__1.i;
-          d_cnjg(&z__3, &a[l + i__ + j * a_dim1]);
-          i__4 = ix;
-          z__2.r = z__3.r * x[i__4].r - z__3.i * x[i__4].i, z__2.i = z__3.r * x[i__4].i + z__3.i * x[i__4].r;
-          z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i;
-          temp2.r = z__1.r, temp2.i = z__1.i;
-          ix += *incx;
-          iy += *incy;
-          /* L70: */
-        }
-        i__3 = jy;
-        i__4 = jy;
-        i__2 = kplus1 + j * a_dim1;
-        d__1 = a[i__2].r;
-        z__3.r = d__1 * temp1.r, z__3.i = d__1 * temp1.i;
-        z__2.r = y[i__4].r + z__3.r, z__2.i = y[i__4].i + z__3.i;
-        z__4.r = alpha->r * temp2.r - alpha->i * temp2.i, z__4.i = alpha->r * temp2.i + alpha->i * temp2.r;
-        z__1.r = z__2.r + z__4.r, z__1.i = z__2.i + z__4.i;
-        y[i__3].r = z__1.r, y[i__3].i = z__1.i;
-        jx += *incx;
-        jy += *incy;
-        if (j > *k) {
-          kx += *incx;
-          ky += *incy;
-        }
-        /* L80: */
-      }
-    }
-  } else {
-    /*        Form  y  when lower triangle of A is stored. */
-
-    if (*incx == 1 && *incy == 1) {
-      i__1 = *n;
-      for (j = 1; j <= i__1; ++j) {
-        i__3 = j;
-        z__1.r = alpha->r * x[i__3].r - alpha->i * x[i__3].i, z__1.i = alpha->r * x[i__3].i + alpha->i * x[i__3].r;
-        temp1.r = z__1.r, temp1.i = z__1.i;
-        temp2.r = 0., temp2.i = 0.;
-        i__3 = j;
-        i__4 = j;
-        i__2 = j * a_dim1 + 1;
-        d__1 = a[i__2].r;
-        z__2.r = d__1 * temp1.r, z__2.i = d__1 * temp1.i;
-        z__1.r = y[i__4].r + z__2.r, z__1.i = y[i__4].i + z__2.i;
-        y[i__3].r = z__1.r, y[i__3].i = z__1.i;
-        l = 1 - j;
-        /* Computing MIN */
-        i__4 = *n, i__2 = j + *k;
-        i__3 = min(i__4, i__2);
-        for (i__ = j + 1; i__ <= i__3; ++i__) {
-          i__4 = i__;
-          i__2 = i__;
-          i__5 = l + i__ + j * a_dim1;
-          z__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i, z__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5].r;
-          z__1.r = y[i__2].r + z__2.r, z__1.i = y[i__2].i + z__2.i;
-          y[i__4].r = z__1.r, y[i__4].i = z__1.i;
-          d_cnjg(&z__3, &a[l + i__ + j * a_dim1]);
-          i__4 = i__;
-          z__2.r = z__3.r * x[i__4].r - z__3.i * x[i__4].i, z__2.i = z__3.r * x[i__4].i + z__3.i * x[i__4].r;
-          z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i;
-          temp2.r = z__1.r, temp2.i = z__1.i;
-          /* L90: */
-        }
-        i__3 = j;
-        i__4 = j;
-        z__2.r = alpha->r * temp2.r - alpha->i * temp2.i, z__2.i = alpha->r * temp2.i + alpha->i * temp2.r;
-        z__1.r = y[i__4].r + z__2.r, z__1.i = y[i__4].i + z__2.i;
-        y[i__3].r = z__1.r, y[i__3].i = z__1.i;
-        /* L100: */
-      }
-    } else {
-      jx = kx;
-      jy = ky;
-      i__1 = *n;
-      for (j = 1; j <= i__1; ++j) {
-        i__3 = jx;
-        z__1.r = alpha->r * x[i__3].r - alpha->i * x[i__3].i, z__1.i = alpha->r * x[i__3].i + alpha->i * x[i__3].r;
-        temp1.r = z__1.r, temp1.i = z__1.i;
-        temp2.r = 0., temp2.i = 0.;
-        i__3 = jy;
-        i__4 = jy;
-        i__2 = j * a_dim1 + 1;
-        d__1 = a[i__2].r;
-        z__2.r = d__1 * temp1.r, z__2.i = d__1 * temp1.i;
-        z__1.r = y[i__4].r + z__2.r, z__1.i = y[i__4].i + z__2.i;
-        y[i__3].r = z__1.r, y[i__3].i = z__1.i;
-        l = 1 - j;
-        ix = jx;
-        iy = jy;
-        /* Computing MIN */
-        i__4 = *n, i__2 = j + *k;
-        i__3 = min(i__4, i__2);
-        for (i__ = j + 1; i__ <= i__3; ++i__) {
-          ix += *incx;
-          iy += *incy;
-          i__4 = iy;
-          i__2 = iy;
-          i__5 = l + i__ + j * a_dim1;
-          z__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i, z__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5].r;
-          z__1.r = y[i__2].r + z__2.r, z__1.i = y[i__2].i + z__2.i;
-          y[i__4].r = z__1.r, y[i__4].i = z__1.i;
-          d_cnjg(&z__3, &a[l + i__ + j * a_dim1]);
-          i__4 = ix;
-          z__2.r = z__3.r * x[i__4].r - z__3.i * x[i__4].i, z__2.i = z__3.r * x[i__4].i + z__3.i * x[i__4].r;
-          z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i;
-          temp2.r = z__1.r, temp2.i = z__1.i;
-          /* L110: */
-        }
-        i__3 = jy;
-        i__4 = jy;
-        z__2.r = alpha->r * temp2.r - alpha->i * temp2.i, z__2.i = alpha->r * temp2.i + alpha->i * temp2.r;
-        z__1.r = y[i__4].r + z__2.r, z__1.i = y[i__4].i + z__2.i;
-        y[i__3].r = z__1.r, y[i__3].i = z__1.i;
-        jx += *incx;
-        jy += *incy;
-        /* L120: */
-      }
-    }
-  }
-
-  /*     End of ZHBMV . */
-
-} /* zhbmv_ */
--- a/blas/f2c/zhpmv.c
+++ b/blas/f2c/zhpmv.c
@@ -1,407 +0,0 @@
-/* zhpmv.f -- translated by f2c (version 20100827).
-   You must link the resulting object file with libf2c:
-        on Microsoft Windows system, link with libf2c.lib;
-        on Linux or Unix systems, link with .../path/to/libf2c.a -lm
-        or, if you install libf2c.a in a standard place, with -lf2c -lm
-        -- in that order, at the end of the command line, as in
-                cc *.o -lf2c -lm
-        Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
-
-                http://www.netlib.org/f2c/libf2c.zip
-*/
-
-#include "datatypes.h"
-
-static inline void d_cnjg(doublecomplex *r, doublecomplex *z) {
-  r->r = z->r;
-  r->i = -(z->i);
-}
-
-/* Subroutine */ void zhpmv_(char *uplo, integer *n, doublecomplex *alpha, doublecomplex *ap, doublecomplex *x,
-                             integer *incx, doublecomplex *beta, doublecomplex *y, integer *incy) {
-  /* System generated locals */
-  integer i__1, i__2, i__3, i__4, i__5;
-  doublereal d__1;
-  doublecomplex z__1, z__2, z__3, z__4;
-
-  /* Local variables */
-  integer i__, j, k, kk, ix, iy, jx, jy, kx, ky, info;
-  doublecomplex temp1, temp2;
-  extern logical lsame_(char *, char *);
-  extern /* Subroutine */ void xerbla_(const char *, integer *);
-
-  /*     .. Scalar Arguments .. */
-  /*     .. */
-  /*     .. Array Arguments .. */
-  /*     .. */
-
-  /*  Purpose */
-  /*  ======= */
-
-  /*  ZHPMV  performs the matrix-vector operation */
-
-  /*     y := alpha*A*x + beta*y, */
-
-  /*  where alpha and beta are scalars, x and y are n element vectors and */
-  /*  A is an n by n hermitian matrix, supplied in packed form. */
-
-  /*  Arguments */
-  /*  ========== */
-
-  /*  UPLO   - CHARACTER*1. */
-  /*           On entry, UPLO specifies whether the upper or lower */
-  /*           triangular part of the matrix A is supplied in the packed */
-  /*           array AP as follows: */
-
-  /*              UPLO = 'U' or 'u'   The upper triangular part of A is */
-  /*                                  supplied in AP. */
-
-  /*              UPLO = 'L' or 'l'   The lower triangular part of A is */
-  /*                                  supplied in AP. */
-
-  /*           Unchanged on exit. */
-
-  /*  N      - INTEGER. */
-  /*           On entry, N specifies the order of the matrix A. */
-  /*           N must be at least zero. */
-  /*           Unchanged on exit. */
-
-  /*  ALPHA  - COMPLEX*16      . */
-  /*           On entry, ALPHA specifies the scalar alpha. */
-  /*           Unchanged on exit. */
-
-  /*  AP     - COMPLEX*16       array of DIMENSION at least */
-  /*           ( ( n*( n + 1 ) )/2 ). */
-  /*           Before entry with UPLO = 'U' or 'u', the array AP must */
-  /*           contain the upper triangular part of the hermitian matrix */
-  /*           packed sequentially, column by column, so that AP( 1 ) */
-  /*           contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) */
-  /*           and a( 2, 2 ) respectively, and so on. */
-  /*           Before entry with UPLO = 'L' or 'l', the array AP must */
-  /*           contain the lower triangular part of the hermitian matrix */
-  /*           packed sequentially, column by column, so that AP( 1 ) */
-  /*           contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) */
-  /*           and a( 3, 1 ) respectively, and so on. */
-  /*           Note that the imaginary parts of the diagonal elements need */
-  /*           not be set and are assumed to be zero. */
-  /*           Unchanged on exit. */
-
-  /*  X      - COMPLEX*16       array of dimension at least */
-  /*           ( 1 + ( n - 1 )*abs( INCX ) ). */
-  /*           Before entry, the incremented array X must contain the n */
-  /*           element vector x. */
-  /*           Unchanged on exit. */
-
-  /*  INCX   - INTEGER. */
-  /*           On entry, INCX specifies the increment for the elements of */
-  /*           X. INCX must not be zero. */
-  /*           Unchanged on exit. */
-
-  /*  BETA   - COMPLEX*16      . */
-  /*           On entry, BETA specifies the scalar beta. When BETA is */
-  /*           supplied as zero then Y need not be set on input. */
-  /*           Unchanged on exit. */
-
-  /*  Y      - COMPLEX*16       array of dimension at least */
-  /*           ( 1 + ( n - 1 )*abs( INCY ) ). */
-  /*           Before entry, the incremented array Y must contain the n */
-  /*           element vector y. On exit, Y is overwritten by the updated */
-  /*           vector y. */
-
-  /*  INCY   - INTEGER. */
-  /*           On entry, INCY specifies the increment for the elements of */
-  /*           Y. INCY must not be zero. */
-  /*           Unchanged on exit. */
-
-  /*  Further Details */
-  /*  =============== */
-
-  /*  Level 2 Blas routine. */
-
-  /*  -- Written on 22-October-1986. */
-  /*     Jack Dongarra, Argonne National Lab. */
-  /*     Jeremy Du Croz, Nag Central Office. */
-  /*     Sven Hammarling, Nag Central Office. */
-  /*     Richard Hanson, Sandia National Labs. */
-
-  /*  ===================================================================== */
-
-  /*     .. Parameters .. */
-  /*     .. */
-  /*     .. Local Scalars .. */
-  /*     .. */
-  /*     .. External Functions .. */
-  /*     .. */
-  /*     .. External Subroutines .. */
-  /*     .. */
-  /*     .. Intrinsic Functions .. */
-  /*     .. */
-
-  /*     Test the input parameters. */
-
-  /* Parameter adjustments */
-  --y;
-  --x;
-  --ap;
-
-  /* Function Body */
-  info = 0;
-  if (!lsame_(uplo, "U") && !lsame_(uplo, "L")) {
-    info = 1;
-  } else if (*n < 0) {
-    info = 2;
-  } else if (*incx == 0) {
-    info = 6;
-  } else if (*incy == 0) {
-    info = 9;
-  }
-  if (info != 0) {
-    xerbla_("ZHPMV ", &info);
-    return;
-  }
-
-  /*     Quick return if possible. */
-
-  if (*n == 0 || (alpha->r == 0. && alpha->i == 0. && (beta->r == 1. && beta->i == 0.))) {
-    return;
-  }
-
-  /*     Set up the start points in  X  and  Y. */
-
-  if (*incx > 0) {
-    kx = 1;
-  } else {
-    kx = 1 - (*n - 1) * *incx;
-  }
-  if (*incy > 0) {
-    ky = 1;
-  } else {
-    ky = 1 - (*n - 1) * *incy;
-  }
-
-  /*     Start the operations. In this version the elements of the array AP */
-  /*     are accessed sequentially with one pass through AP. */
-
-  /*     First form  y := beta*y. */
-
-  if (beta->r != 1. || beta->i != 0.) {
-    if (*incy == 1) {
-      if (beta->r == 0. && beta->i == 0.) {
-        i__1 = *n;
-        for (i__ = 1; i__ <= i__1; ++i__) {
-          i__2 = i__;
-          y[i__2].r = 0., y[i__2].i = 0.;
-          /* L10: */
-        }
-      } else {
-        i__1 = *n;
-        for (i__ = 1; i__ <= i__1; ++i__) {
-          i__2 = i__;
-          i__3 = i__;
-          z__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i, z__1.i = beta->r * y[i__3].i + beta->i * y[i__3].r;
-          y[i__2].r = z__1.r, y[i__2].i = z__1.i;
-          /* L20: */
-        }
-      }
-    } else {
-      iy = ky;
-      if (beta->r == 0. && beta->i == 0.) {
-        i__1 = *n;
-        for (i__ = 1; i__ <= i__1; ++i__) {
-          i__2 = iy;
-          y[i__2].r = 0., y[i__2].i = 0.;
-          iy += *incy;
-          /* L30: */
-        }
-      } else {
-        i__1 = *n;
-        for (i__ = 1; i__ <= i__1; ++i__) {
-          i__2 = iy;
-          i__3 = iy;
-          z__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i, z__1.i = beta->r * y[i__3].i + beta->i * y[i__3].r;
-          y[i__2].r = z__1.r, y[i__2].i = z__1.i;
-          iy += *incy;
-          /* L40: */
-        }
-      }
-    }
-  }
-  if (alpha->r == 0. && alpha->i == 0.) {
-    return;
-  }
-  kk = 1;
-  if (lsame_(uplo, "U")) {
-    /*        Form  y  when AP contains the upper triangle. */
-
-    if (*incx == 1 && *incy == 1) {
-      i__1 = *n;
-      for (j = 1; j <= i__1; ++j) {
-        i__2 = j;
-        z__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, z__1.i = alpha->r * x[i__2].i + alpha->i * x[i__2].r;
-        temp1.r = z__1.r, temp1.i = z__1.i;
-        temp2.r = 0., temp2.i = 0.;
-        k = kk;
-        i__2 = j - 1;
-        for (i__ = 1; i__ <= i__2; ++i__) {
-          i__3 = i__;
-          i__4 = i__;
-          i__5 = k;
-          z__2.r = temp1.r * ap[i__5].r - temp1.i * ap[i__5].i, z__2.i = temp1.r * ap[i__5].i + temp1.i * ap[i__5].r;
-          z__1.r = y[i__4].r + z__2.r, z__1.i = y[i__4].i + z__2.i;
-          y[i__3].r = z__1.r, y[i__3].i = z__1.i;
-          d_cnjg(&z__3, &ap[k]);
-          i__3 = i__;
-          z__2.r = z__3.r * x[i__3].r - z__3.i * x[i__3].i, z__2.i = z__3.r * x[i__3].i + z__3.i * x[i__3].r;
-          z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i;
-          temp2.r = z__1.r, temp2.i = z__1.i;
-          ++k;
-          /* L50: */
-        }
-        i__2 = j;
-        i__3 = j;
-        i__4 = kk + j - 1;
-        d__1 = ap[i__4].r;
-        z__3.r = d__1 * temp1.r, z__3.i = d__1 * temp1.i;
-        z__2.r = y[i__3].r + z__3.r, z__2.i = y[i__3].i + z__3.i;
-        z__4.r = alpha->r * temp2.r - alpha->i * temp2.i, z__4.i = alpha->r * temp2.i + alpha->i * temp2.r;
-        z__1.r = z__2.r + z__4.r, z__1.i = z__2.i + z__4.i;
-        y[i__2].r = z__1.r, y[i__2].i = z__1.i;
-        kk += j;
-        /* L60: */
-      }
-    } else {
-      jx = kx;
-      jy = ky;
-      i__1 = *n;
-      for (j = 1; j <= i__1; ++j) {
-        i__2 = jx;
-        z__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, z__1.i = alpha->r * x[i__2].i + alpha->i * x[i__2].r;
-        temp1.r = z__1.r, temp1.i = z__1.i;
-        temp2.r = 0., temp2.i = 0.;
-        ix = kx;
-        iy = ky;
-        i__2 = kk + j - 2;
-        for (k = kk; k <= i__2; ++k) {
-          i__3 = iy;
-          i__4 = iy;
-          i__5 = k;
-          z__2.r = temp1.r * ap[i__5].r - temp1.i * ap[i__5].i, z__2.i = temp1.r * ap[i__5].i + temp1.i * ap[i__5].r;
-          z__1.r = y[i__4].r + z__2.r, z__1.i = y[i__4].i + z__2.i;
-          y[i__3].r = z__1.r, y[i__3].i = z__1.i;
-          d_cnjg(&z__3, &ap[k]);
-          i__3 = ix;
-          z__2.r = z__3.r * x[i__3].r - z__3.i * x[i__3].i, z__2.i = z__3.r * x[i__3].i + z__3.i * x[i__3].r;
-          z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i;
-          temp2.r = z__1.r, temp2.i = z__1.i;
-          ix += *incx;
-          iy += *incy;
-          /* L70: */
-        }
-        i__2 = jy;
-        i__3 = jy;
-        i__4 = kk + j - 1;
-        d__1 = ap[i__4].r;
-        z__3.r = d__1 * temp1.r, z__3.i = d__1 * temp1.i;
-        z__2.r = y[i__3].r + z__3.r, z__2.i = y[i__3].i + z__3.i;
-        z__4.r = alpha->r * temp2.r - alpha->i * temp2.i, z__4.i = alpha->r * temp2.i + alpha->i * temp2.r;
-        z__1.r = z__2.r + z__4.r, z__1.i = z__2.i + z__4.i;
-        y[i__2].r = z__1.r, y[i__2].i = z__1.i;
-        jx += *incx;
-        jy += *incy;
-        kk += j;
-        /* L80: */
-      }
-    }
-  } else {
-    /*        Form  y  when AP contains the lower triangle. */
-
-    if (*incx == 1 && *incy == 1) {
-      i__1 = *n;
-      for (j = 1; j <= i__1; ++j) {
-        i__2 = j;
-        z__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, z__1.i = alpha->r * x[i__2].i + alpha->i * x[i__2].r;
-        temp1.r = z__1.r, temp1.i = z__1.i;
-        temp2.r = 0., temp2.i = 0.;
-        i__2 = j;
-        i__3 = j;
-        i__4 = kk;
-        d__1 = ap[i__4].r;
-        z__2.r = d__1 * temp1.r, z__2.i = d__1 * temp1.i;
-        z__1.r = y[i__3].r + z__2.r, z__1.i = y[i__3].i + z__2.i;
-        y[i__2].r = z__1.r, y[i__2].i = z__1.i;
-        k = kk + 1;
-        i__2 = *n;
-        for (i__ = j + 1; i__ <= i__2; ++i__) {
-          i__3 = i__;
-          i__4 = i__;
-          i__5 = k;
-          z__2.r = temp1.r * ap[i__5].r - temp1.i * ap[i__5].i, z__2.i = temp1.r * ap[i__5].i + temp1.i * ap[i__5].r;
-          z__1.r = y[i__4].r + z__2.r, z__1.i = y[i__4].i + z__2.i;
-          y[i__3].r = z__1.r, y[i__3].i = z__1.i;
-          d_cnjg(&z__3, &ap[k]);
-          i__3 = i__;
-          z__2.r = z__3.r * x[i__3].r - z__3.i * x[i__3].i, z__2.i = z__3.r * x[i__3].i + z__3.i * x[i__3].r;
-          z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i;
-          temp2.r = z__1.r, temp2.i = z__1.i;
-          ++k;
-          /* L90: */
-        }
-        i__2 = j;
-        i__3 = j;
-        z__2.r = alpha->r * temp2.r - alpha->i * temp2.i, z__2.i = alpha->r * temp2.i + alpha->i * temp2.r;
-        z__1.r = y[i__3].r + z__2.r, z__1.i = y[i__3].i + z__2.i;
-        y[i__2].r = z__1.r, y[i__2].i = z__1.i;
-        kk += *n - j + 1;
-        /* L100: */
-      }
-    } else {
-      jx = kx;
-      jy = ky;
-      i__1 = *n;
-      for (j = 1; j <= i__1; ++j) {
-        i__2 = jx;
-        z__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, z__1.i = alpha->r * x[i__2].i + alpha->i * x[i__2].r;
-        temp1.r = z__1.r, temp1.i = z__1.i;
-        temp2.r = 0., temp2.i = 0.;
-        i__2 = jy;
-        i__3 = jy;
-        i__4 = kk;
-        d__1 = ap[i__4].r;
-        z__2.r = d__1 * temp1.r, z__2.i = d__1 * temp1.i;
-        z__1.r = y[i__3].r + z__2.r, z__1.i = y[i__3].i + z__2.i;
-        y[i__2].r = z__1.r, y[i__2].i = z__1.i;
-        ix = jx;
-        iy = jy;
-        i__2 = kk + *n - j;
-        for (k = kk + 1; k <= i__2; ++k) {
-          ix += *incx;
-          iy += *incy;
-          i__3 = iy;
-          i__4 = iy;
-          i__5 = k;
-          z__2.r = temp1.r * ap[i__5].r - temp1.i * ap[i__5].i, z__2.i = temp1.r * ap[i__5].i + temp1.i * ap[i__5].r;
-          z__1.r = y[i__4].r + z__2.r, z__1.i = y[i__4].i + z__2.i;
-          y[i__3].r = z__1.r, y[i__3].i = z__1.i;
-          d_cnjg(&z__3, &ap[k]);
-          i__3 = ix;
-          z__2.r = z__3.r * x[i__3].r - z__3.i * x[i__3].i, z__2.i = z__3.r * x[i__3].i + z__3.i * x[i__3].r;
-          z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i;
-          temp2.r = z__1.r, temp2.i = z__1.i;
-          /* L110: */
-        }
-        i__2 = jy;
-        i__3 = jy;
-        z__2.r = alpha->r * temp2.r - alpha->i * temp2.i, z__2.i = alpha->r * temp2.i + alpha->i * temp2.r;
-        z__1.r = y[i__3].r + z__2.r, z__1.i = y[i__3].i + z__2.i;
-        y[i__2].r = z__1.r, y[i__2].i = z__1.i;
-        jx += *incx;
-        jy += *incy;
-        kk += *n - j + 1;
-        /* L120: */
-      }
-    }
-  }
-
-  /*     End of ZHPMV . */
-
-} /* zhpmv_ */
--- a/blas/f2c/ztbmv.c
+++ b/blas/f2c/ztbmv.c
@@ -1,586 +0,0 @@
-/* ztbmv.f -- translated by f2c (version 20100827).
-   You must link the resulting object file with libf2c:
-        on Microsoft Windows system, link with libf2c.lib;
-        on Linux or Unix systems, link with .../path/to/libf2c.a -lm
-        or, if you install libf2c.a in a standard place, with -lf2c -lm
-        -- in that order, at the end of the command line, as in
-                cc *.o -lf2c -lm
-        Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
-
-                http://www.netlib.org/f2c/libf2c.zip
-*/
-
-#include "datatypes.h"
-
-static inline void d_cnjg(doublecomplex *r, doublecomplex *z) {
-  r->r = z->r;
-  r->i = -(z->i);
-}
-
-/* Subroutine */ void ztbmv_(char *uplo, char *trans, char *diag, integer *n, integer *k, doublecomplex *a,
-                             integer *lda, doublecomplex *x, integer *incx) {
-  /* System generated locals */
-  integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5;
-  doublecomplex z__1, z__2, z__3;
-
-  /* Local variables */
-  integer i__, j, l, ix, jx, kx, info;
-  doublecomplex temp;
-  extern logical lsame_(char *, char *);
-  integer kplus1;
-  extern /* Subroutine */ void xerbla_(const char *, integer *);
-  logical noconj, nounit;
-
-  /*     .. Scalar Arguments .. */
-  /*     .. */
-  /*     .. Array Arguments .. */
-  /*     .. */
-
-  /*  Purpose */
-  /*  ======= */
-
-  /*  ZTBMV  performs one of the matrix-vector operations */
-
-  /*     x := A*x,   or   x := A'*x,   or   x := conjg( A' )*x, */
-
-  /*  where x is an n element vector and  A is an n by n unit, or non-unit, */
-  /*  upper or lower triangular band matrix, with ( k + 1 ) diagonals. */
-
-  /*  Arguments */
-  /*  ========== */
-
-  /*  UPLO   - CHARACTER*1. */
-  /*           On entry, UPLO specifies whether the matrix is an upper or */
-  /*           lower triangular matrix as follows: */
-
-  /*              UPLO = 'U' or 'u'   A is an upper triangular matrix. */
-
-  /*              UPLO = 'L' or 'l'   A is a lower triangular matrix. */
-
-  /*           Unchanged on exit. */
-
-  /*  TRANS  - CHARACTER*1. */
-  /*           On entry, TRANS specifies the operation to be performed as */
-  /*           follows: */
-
-  /*              TRANS = 'N' or 'n'   x := A*x. */
-
-  /*              TRANS = 'T' or 't'   x := A'*x. */
-
-  /*              TRANS = 'C' or 'c'   x := conjg( A' )*x. */
-
-  /*           Unchanged on exit. */
-
-  /*  DIAG   - CHARACTER*1. */
-  /*           On entry, DIAG specifies whether or not A is unit */
-  /*           triangular as follows: */
-
-  /*              DIAG = 'U' or 'u'   A is assumed to be unit triangular. */
-
-  /*              DIAG = 'N' or 'n'   A is not assumed to be unit */
-  /*                                  triangular. */
-
-  /*           Unchanged on exit. */
-
-  /*  N      - INTEGER. */
-  /*           On entry, N specifies the order of the matrix A. */
-  /*           N must be at least zero. */
-  /*           Unchanged on exit. */
-
-  /*  K      - INTEGER. */
-  /*           On entry with UPLO = 'U' or 'u', K specifies the number of */
-  /*           super-diagonals of the matrix A. */
-  /*           On entry with UPLO = 'L' or 'l', K specifies the number of */
-  /*           sub-diagonals of the matrix A. */
-  /*           K must satisfy  0 .le. K. */
-  /*           Unchanged on exit. */
-
-  /*  A      - COMPLEX*16       array of DIMENSION ( LDA, n ). */
-  /*           Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */
-  /*           by n part of the array A must contain the upper triangular */
-  /*           band part of the matrix of coefficients, supplied column by */
-  /*           column, with the leading diagonal of the matrix in row */
-  /*           ( k + 1 ) of the array, the first super-diagonal starting at */
-  /*           position 2 in row k, and so on. The top left k by k triangle */
-  /*           of the array A is not referenced. */
-  /*           The following program segment will transfer an upper */
-  /*           triangular band matrix from conventional full matrix storage */
-  /*           to band storage: */
-
-  /*                 DO 20, J = 1, N */
-  /*                    M = K + 1 - J */
-  /*                    DO 10, I = MAX( 1, J - K ), J */
-  /*                       A( M + I, J ) = matrix( I, J ) */
-  /*              10    CONTINUE */
-  /*              20 CONTINUE */
-
-  /*           Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */
-  /*           by n part of the array A must contain the lower triangular */
-  /*           band part of the matrix of coefficients, supplied column by */
-  /*           column, with the leading diagonal of the matrix in row 1 of */
-  /*           the array, the first sub-diagonal starting at position 1 in */
-  /*           row 2, and so on. The bottom right k by k triangle of the */
-  /*           array A is not referenced. */
-  /*           The following program segment will transfer a lower */
-  /*           triangular band matrix from conventional full matrix storage */
-  /*           to band storage: */
-
-  /*                 DO 20, J = 1, N */
-  /*                    M = 1 - J */
-  /*                    DO 10, I = J, MIN( N, J + K ) */
-  /*                       A( M + I, J ) = matrix( I, J ) */
-  /*              10    CONTINUE */
-  /*              20 CONTINUE */
-
-  /*           Note that when DIAG = 'U' or 'u' the elements of the array A */
-  /*           corresponding to the diagonal elements of the matrix are not */
-  /*           referenced, but are assumed to be unity. */
-  /*           Unchanged on exit. */
-
-  /*  LDA    - INTEGER. */
-  /*           On entry, LDA specifies the first dimension of A as declared */
-  /*           in the calling (sub) program. LDA must be at least */
-  /*           ( k + 1 ). */
-  /*           Unchanged on exit. */
-
-  /*  X      - COMPLEX*16       array of dimension at least */
-  /*           ( 1 + ( n - 1 )*abs( INCX ) ). */
-  /*           Before entry, the incremented array X must contain the n */
-  /*           element vector x. On exit, X is overwritten with the */
-  /*           transformed vector x. */
-
-  /*  INCX   - INTEGER. */
-  /*           On entry, INCX specifies the increment for the elements of */
-  /*           X. INCX must not be zero. */
-  /*           Unchanged on exit. */
-
-  /*  Further Details */
-  /*  =============== */
-
-  /*  Level 2 Blas routine. */
-
-  /*  -- Written on 22-October-1986. */
-  /*     Jack Dongarra, Argonne National Lab. */
-  /*     Jeremy Du Croz, Nag Central Office. */
-  /*     Sven Hammarling, Nag Central Office. */
-  /*     Richard Hanson, Sandia National Labs. */
-
-  /*  ===================================================================== */
-
-  /*     .. Parameters .. */
-  /*     .. */
-  /*     .. Local Scalars .. */
-  /*     .. */
-  /*     .. External Functions .. */
-  /*     .. */
-  /*     .. External Subroutines .. */
-  /*     .. */
-  /*     .. Intrinsic Functions .. */
-  /*     .. */
-
-  /*     Test the input parameters. */
-
-  /* Parameter adjustments */
-  a_dim1 = *lda;
-  a_offset = 1 + a_dim1;
-  a -= a_offset;
-  --x;
-
-  /* Function Body */
-  info = 0;
-  if (!lsame_(uplo, "U") && !lsame_(uplo, "L")) {
-    info = 1;
-  } else if (!lsame_(trans, "N") && !lsame_(trans, "T") && !lsame_(trans, "C")) {
-    info = 2;
-  } else if (!lsame_(diag, "U") && !lsame_(diag, "N")) {
-    info = 3;
-  } else if (*n < 0) {
-    info = 4;
-  } else if (*k < 0) {
-    info = 5;
-  } else if (*lda < *k + 1) {
-    info = 7;
-  } else if (*incx == 0) {
-    info = 9;
-  }
-  if (info != 0) {
-    xerbla_("ZTBMV ", &info);
-    return;
-  }
-
-  /*     Quick return if possible. */
-
-  if (*n == 0) {
-    return;
-  }
-
-  noconj = lsame_(trans, "T");
-  nounit = lsame_(diag, "N");
-
-  /*     Set up the start point in X if the increment is not unity. This */
-  /*     will be  ( N - 1 )*INCX   too small for descending loops. */
-
-  if (*incx <= 0) {
-    kx = 1 - (*n - 1) * *incx;
-  } else if (*incx != 1) {
-    kx = 1;
-  }
-
-  /*     Start the operations. In this version the elements of A are */
-  /*     accessed sequentially with one pass through A. */
-
-  if (lsame_(trans, "N")) {
-    /*         Form  x := A*x. */
-
-    if (lsame_(uplo, "U")) {
-      kplus1 = *k + 1;
-      if (*incx == 1) {
-        i__1 = *n;
-        for (j = 1; j <= i__1; ++j) {
-          i__2 = j;
-          if (x[i__2].r != 0. || x[i__2].i != 0.) {
-            i__2 = j;
-            temp.r = x[i__2].r, temp.i = x[i__2].i;
-            l = kplus1 - j;
-            /* Computing MAX */
-            i__2 = 1, i__3 = j - *k;
-            i__4 = j - 1;
-            for (i__ = max(i__2, i__3); i__ <= i__4; ++i__) {
-              i__2 = i__;
-              i__3 = i__;
-              i__5 = l + i__ + j * a_dim1;
-              z__2.r = temp.r * a[i__5].r - temp.i * a[i__5].i, z__2.i = temp.r * a[i__5].i + temp.i * a[i__5].r;
-              z__1.r = x[i__3].r + z__2.r, z__1.i = x[i__3].i + z__2.i;
-              x[i__2].r = z__1.r, x[i__2].i = z__1.i;
-              /* L10: */
-            }
-            if (nounit) {
-              i__4 = j;
-              i__2 = j;
-              i__3 = kplus1 + j * a_dim1;
-              z__1.r = x[i__2].r * a[i__3].r - x[i__2].i * a[i__3].i,
-              z__1.i = x[i__2].r * a[i__3].i + x[i__2].i * a[i__3].r;
-              x[i__4].r = z__1.r, x[i__4].i = z__1.i;
-            }
-          }
-          /* L20: */
-        }
-      } else {
-        jx = kx;
-        i__1 = *n;
-        for (j = 1; j <= i__1; ++j) {
-          i__4 = jx;
-          if (x[i__4].r != 0. || x[i__4].i != 0.) {
-            i__4 = jx;
-            temp.r = x[i__4].r, temp.i = x[i__4].i;
-            ix = kx;
-            l = kplus1 - j;
-            /* Computing MAX */
-            i__4 = 1, i__2 = j - *k;
-            i__3 = j - 1;
-            for (i__ = max(i__4, i__2); i__ <= i__3; ++i__) {
-              i__4 = ix;
-              i__2 = ix;
-              i__5 = l + i__ + j * a_dim1;
-              z__2.r = temp.r * a[i__5].r - temp.i * a[i__5].i, z__2.i = temp.r * a[i__5].i + temp.i * a[i__5].r;
-              z__1.r = x[i__2].r + z__2.r, z__1.i = x[i__2].i + z__2.i;
-              x[i__4].r = z__1.r, x[i__4].i = z__1.i;
-              ix += *incx;
-              /* L30: */
-            }
-            if (nounit) {
-              i__3 = jx;
-              i__4 = jx;
-              i__2 = kplus1 + j * a_dim1;
-              z__1.r = x[i__4].r * a[i__2].r - x[i__4].i * a[i__2].i,
-              z__1.i = x[i__4].r * a[i__2].i + x[i__4].i * a[i__2].r;
-              x[i__3].r = z__1.r, x[i__3].i = z__1.i;
-            }
-          }
-          jx += *incx;
-          if (j > *k) {
-            kx += *incx;
-          }
-          /* L40: */
-        }
-      }
-    } else {
-      if (*incx == 1) {
-        for (j = *n; j >= 1; --j) {
-          i__1 = j;
-          if (x[i__1].r != 0. || x[i__1].i != 0.) {
-            i__1 = j;
-            temp.r = x[i__1].r, temp.i = x[i__1].i;
-            l = 1 - j;
-            /* Computing MIN */
-            i__1 = *n, i__3 = j + *k;
-            i__4 = j + 1;
-            for (i__ = min(i__1, i__3); i__ >= i__4; --i__) {
-              i__1 = i__;
-              i__3 = i__;
-              i__2 = l + i__ + j * a_dim1;
-              z__2.r = temp.r * a[i__2].r - temp.i * a[i__2].i, z__2.i = temp.r * a[i__2].i + temp.i * a[i__2].r;
-              z__1.r = x[i__3].r + z__2.r, z__1.i = x[i__3].i + z__2.i;
-              x[i__1].r = z__1.r, x[i__1].i = z__1.i;
-              /* L50: */
-            }
-            if (nounit) {
-              i__4 = j;
-              i__1 = j;
-              i__3 = j * a_dim1 + 1;
-              z__1.r = x[i__1].r * a[i__3].r - x[i__1].i * a[i__3].i,
-              z__1.i = x[i__1].r * a[i__3].i + x[i__1].i * a[i__3].r;
-              x[i__4].r = z__1.r, x[i__4].i = z__1.i;
-            }
-          }
-          /* L60: */
-        }
-      } else {
-        kx += (*n - 1) * *incx;
-        jx = kx;
-        for (j = *n; j >= 1; --j) {
-          i__4 = jx;
-          if (x[i__4].r != 0. || x[i__4].i != 0.) {
-            i__4 = jx;
-            temp.r = x[i__4].r, temp.i = x[i__4].i;
-            ix = kx;
-            l = 1 - j;
-            /* Computing MIN */
-            i__4 = *n, i__1 = j + *k;
-            i__3 = j + 1;
-            for (i__ = min(i__4, i__1); i__ >= i__3; --i__) {
-              i__4 = ix;
-              i__1 = ix;
-              i__2 = l + i__ + j * a_dim1;
-              z__2.r = temp.r * a[i__2].r - temp.i * a[i__2].i, z__2.i = temp.r * a[i__2].i + temp.i * a[i__2].r;
-              z__1.r = x[i__1].r + z__2.r, z__1.i = x[i__1].i + z__2.i;
-              x[i__4].r = z__1.r, x[i__4].i = z__1.i;
-              ix -= *incx;
-              /* L70: */
-            }
-            if (nounit) {
-              i__3 = jx;
-              i__4 = jx;
-              i__1 = j * a_dim1 + 1;
-              z__1.r = x[i__4].r * a[i__1].r - x[i__4].i * a[i__1].i,
-              z__1.i = x[i__4].r * a[i__1].i + x[i__4].i * a[i__1].r;
-              x[i__3].r = z__1.r, x[i__3].i = z__1.i;
-            }
-          }
-          jx -= *incx;
-          if (*n - j >= *k) {
-            kx -= *incx;
-          }
-          /* L80: */
-        }
-      }
-    }
-  } else {
-    /*        Form  x := A'*x  or  x := conjg( A' )*x. */
-
-    if (lsame_(uplo, "U")) {
-      kplus1 = *k + 1;
-      if (*incx == 1) {
-        for (j = *n; j >= 1; --j) {
-          i__3 = j;
-          temp.r = x[i__3].r, temp.i = x[i__3].i;
-          l = kplus1 - j;
-          if (noconj) {
-            if (nounit) {
-              i__3 = kplus1 + j * a_dim1;
-              z__1.r = temp.r * a[i__3].r - temp.i * a[i__3].i, z__1.i = temp.r * a[i__3].i + temp.i * a[i__3].r;
-              temp.r = z__1.r, temp.i = z__1.i;
-            }
-            /* Computing MAX */
-            i__4 = 1, i__1 = j - *k;
-            i__3 = max(i__4, i__1);
-            for (i__ = j - 1; i__ >= i__3; --i__) {
-              i__4 = l + i__ + j * a_dim1;
-              i__1 = i__;
-              z__2.r = a[i__4].r * x[i__1].r - a[i__4].i * x[i__1].i,
-              z__2.i = a[i__4].r * x[i__1].i + a[i__4].i * x[i__1].r;
-              z__1.r = temp.r + z__2.r, z__1.i = temp.i + z__2.i;
-              temp.r = z__1.r, temp.i = z__1.i;
-              /* L90: */
-            }
-          } else {
-            if (nounit) {
-              d_cnjg(&z__2, &a[kplus1 + j * a_dim1]);
-              z__1.r = temp.r * z__2.r - temp.i * z__2.i, z__1.i = temp.r * z__2.i + temp.i * z__2.r;
-              temp.r = z__1.r, temp.i = z__1.i;
-            }
-            /* Computing MAX */
-            i__4 = 1, i__1 = j - *k;
-            i__3 = max(i__4, i__1);
-            for (i__ = j - 1; i__ >= i__3; --i__) {
-              d_cnjg(&z__3, &a[l + i__ + j * a_dim1]);
-              i__4 = i__;
-              z__2.r = z__3.r * x[i__4].r - z__3.i * x[i__4].i, z__2.i = z__3.r * x[i__4].i + z__3.i * x[i__4].r;
-              z__1.r = temp.r + z__2.r, z__1.i = temp.i + z__2.i;
-              temp.r = z__1.r, temp.i = z__1.i;
-              /* L100: */
-            }
-          }
-          i__3 = j;
-          x[i__3].r = temp.r, x[i__3].i = temp.i;
-          /* L110: */
-        }
-      } else {
-        kx += (*n - 1) * *incx;
-        jx = kx;
-        for (j = *n; j >= 1; --j) {
-          i__3 = jx;
-          temp.r = x[i__3].r, temp.i = x[i__3].i;
-          kx -= *incx;
-          ix = kx;
-          l = kplus1 - j;
-          if (noconj) {
-            if (nounit) {
-              i__3 = kplus1 + j * a_dim1;
-              z__1.r = temp.r * a[i__3].r - temp.i * a[i__3].i, z__1.i = temp.r * a[i__3].i + temp.i * a[i__3].r;
-              temp.r = z__1.r, temp.i = z__1.i;
-            }
-            /* Computing MAX */
-            i__4 = 1, i__1 = j - *k;
-            i__3 = max(i__4, i__1);
-            for (i__ = j - 1; i__ >= i__3; --i__) {
-              i__4 = l + i__ + j * a_dim1;
-              i__1 = ix;
-              z__2.r = a[i__4].r * x[i__1].r - a[i__4].i * x[i__1].i,
-              z__2.i = a[i__4].r * x[i__1].i + a[i__4].i * x[i__1].r;
-              z__1.r = temp.r + z__2.r, z__1.i = temp.i + z__2.i;
-              temp.r = z__1.r, temp.i = z__1.i;
-              ix -= *incx;
-              /* L120: */
-            }
-          } else {
-            if (nounit) {
-              d_cnjg(&z__2, &a[kplus1 + j * a_dim1]);
-              z__1.r = temp.r * z__2.r - temp.i * z__2.i, z__1.i = temp.r * z__2.i + temp.i * z__2.r;
-              temp.r = z__1.r, temp.i = z__1.i;
-            }
-            /* Computing MAX */
-            i__4 = 1, i__1 = j - *k;
-            i__3 = max(i__4, i__1);
-            for (i__ = j - 1; i__ >= i__3; --i__) {
-              d_cnjg(&z__3, &a[l + i__ + j * a_dim1]);
-              i__4 = ix;
-              z__2.r = z__3.r * x[i__4].r - z__3.i * x[i__4].i, z__2.i = z__3.r * x[i__4].i + z__3.i * x[i__4].r;
-              z__1.r = temp.r + z__2.r, z__1.i = temp.i + z__2.i;
-              temp.r = z__1.r, temp.i = z__1.i;
-              ix -= *incx;
-              /* L130: */
-            }
-          }
-          i__3 = jx;
-          x[i__3].r = temp.r, x[i__3].i = temp.i;
-          jx -= *incx;
-          /* L140: */
-        }
-      }
-    } else {
-      if (*incx == 1) {
-        i__3 = *n;
-        for (j = 1; j <= i__3; ++j) {
-          i__4 = j;
-          temp.r = x[i__4].r, temp.i = x[i__4].i;
-          l = 1 - j;
-          if (noconj) {
-            if (nounit) {
-              i__4 = j * a_dim1 + 1;
-              z__1.r = temp.r * a[i__4].r - temp.i * a[i__4].i, z__1.i = temp.r * a[i__4].i + temp.i * a[i__4].r;
-              temp.r = z__1.r, temp.i = z__1.i;
-            }
-            /* Computing MIN */
-            i__1 = *n, i__2 = j + *k;
-            i__4 = min(i__1, i__2);
-            for (i__ = j + 1; i__ <= i__4; ++i__) {
-              i__1 = l + i__ + j * a_dim1;
-              i__2 = i__;
-              z__2.r = a[i__1].r * x[i__2].r - a[i__1].i * x[i__2].i,
-              z__2.i = a[i__1].r * x[i__2].i + a[i__1].i * x[i__2].r;
-              z__1.r = temp.r + z__2.r, z__1.i = temp.i + z__2.i;
-              temp.r = z__1.r, temp.i = z__1.i;
-              /* L150: */
-            }
-          } else {
-            if (nounit) {
-              d_cnjg(&z__2, &a[j * a_dim1 + 1]);
-              z__1.r = temp.r * z__2.r - temp.i * z__2.i, z__1.i = temp.r * z__2.i + temp.i * z__2.r;
-              temp.r = z__1.r, temp.i = z__1.i;
-            }
-            /* Computing MIN */
-            i__1 = *n, i__2 = j + *k;
-            i__4 = min(i__1, i__2);
-            for (i__ = j + 1; i__ <= i__4; ++i__) {
-              d_cnjg(&z__3, &a[l + i__ + j * a_dim1]);
-              i__1 = i__;
-              z__2.r = z__3.r * x[i__1].r - z__3.i * x[i__1].i, z__2.i = z__3.r * x[i__1].i + z__3.i * x[i__1].r;
-              z__1.r = temp.r + z__2.r, z__1.i = temp.i + z__2.i;
-              temp.r = z__1.r, temp.i = z__1.i;
-              /* L160: */
-            }
-          }
-          i__4 = j;
-          x[i__4].r = temp.r, x[i__4].i = temp.i;
-          /* L170: */
-        }
-      } else {
-        jx = kx;
-        i__3 = *n;
-        for (j = 1; j <= i__3; ++j) {
-          i__4 = jx;
-          temp.r = x[i__4].r, temp.i = x[i__4].i;
-          kx += *incx;
-          ix = kx;
-          l = 1 - j;
-          if (noconj) {
-            if (nounit) {
-              i__4 = j * a_dim1 + 1;
-              z__1.r = temp.r * a[i__4].r - temp.i * a[i__4].i, z__1.i = temp.r * a[i__4].i + temp.i * a[i__4].r;
-              temp.r = z__1.r, temp.i = z__1.i;
-            }
-            /* Computing MIN */
-            i__1 = *n, i__2 = j + *k;
-            i__4 = min(i__1, i__2);
-            for (i__ = j + 1; i__ <= i__4; ++i__) {
-              i__1 = l + i__ + j * a_dim1;
-              i__2 = ix;
-              z__2.r = a[i__1].r * x[i__2].r - a[i__1].i * x[i__2].i,
-              z__2.i = a[i__1].r * x[i__2].i + a[i__1].i * x[i__2].r;
-              z__1.r = temp.r + z__2.r, z__1.i = temp.i + z__2.i;
-              temp.r = z__1.r, temp.i = z__1.i;
-              ix += *incx;
-              /* L180: */
-            }
-          } else {
-            if (nounit) {
-              d_cnjg(&z__2, &a[j * a_dim1 + 1]);
-              z__1.r = temp.r * z__2.r - temp.i * z__2.i, z__1.i = temp.r * z__2.i + temp.i * z__2.r;
-              temp.r = z__1.r, temp.i = z__1.i;
-            }
-            /* Computing MIN */
-            i__1 = *n, i__2 = j + *k;
-            i__4 = min(i__1, i__2);
-            for (i__ = j + 1; i__ <= i__4; ++i__) {
-              d_cnjg(&z__3, &a[l + i__ + j * a_dim1]);
-              i__1 = ix;
-              z__2.r = z__3.r * x[i__1].r - z__3.i * x[i__1].i, z__2.i = z__3.r * x[i__1].i + z__3.i * x[i__1].r;
-              z__1.r = temp.r + z__2.r, z__1.i = temp.i + z__2.i;
-              temp.r = z__1.r, temp.i = z__1.i;
-              ix += *incx;
-              /* L190: */
-            }
-          }
-          i__4 = jx;
-          x[i__4].r = temp.r, x[i__4].i = temp.i;
-          jx += *incx;
-          /* L200: */
-        }
-      }
-    }
-  }
-
-  /*     End of ZTBMV . */
-
-} /* ztbmv_ */
--- a/blas/level1_cplx_impl.h
+++ b/blas/level1_cplx_impl.h
@@ -25,15 +25,19 @@ struct functor_traits<scalar_norm1_op> {
 // computes the sum of magnitudes of all vector elements or, for a complex vector x, the sum
 // res = |Rex1| + |Imx1| + |Rex2| + |Imx2| + ... + |Rexn| + |Imxn|, where x is a vector of order n
 extern "C" RealScalar EIGEN_CAT(REAL_SCALAR_SUFFIX, EIGEN_BLAS_FUNC_NAME(asum))(int *n, RealScalar *px, int *incx) {
-  //   std::cerr << "__asum " << *n << " " << *incx << "\n";
-  Complex *x = reinterpret_cast<Complex *>(px);
-
  if (*n <= 0) return 0;

+  // std::complex<T> is layout-compatible with T[2], so we can reinterpret
+  // a complex vector of length n as a real vector of length 2*n and use
+  // the fully vectorized cwiseAbs().sum() path.
  if (*incx == 1)
-    return make_vector(x, *n).unaryExpr<scalar_norm1_op>().sum();
-  else
+    return make_vector(px, 2 * *n).cwiseAbs().sum();
+  else {
+    // For non-unit stride, fall back to the scalar_norm1_op approach since
+    // the real components are not contiguous across complex elements.
+    Complex *x = reinterpret_cast<Complex *>(px);
    return make_vector(x, *n, std::abs(*incx)).unaryExpr<scalar_norm1_op>().sum();
+  }
 }

 extern "C" int EIGEN_CAT(i, EIGEN_BLAS_FUNC_NAME(amax))(int *n, RealScalar *px, int *incx) {
--- a/blas/level1_impl.h
+++ b/blas/level1_impl.h
@@ -69,15 +69,21 @@ EIGEN_BLAS_FUNC(copy)(int *n, RealScalar *px, int *incx, RealScalar *py, int *in
  // be careful, *incx==0 is allowed !!
  if (*incx == 1 && *incy == 1)
    make_vector(y, *n) = make_vector(x, *n);
-  else {
-    if (*incx < 0) x = x - (*n - 1) * (*incx);
+  else if (*incx == 0) {
+    // Broadcast: copy x[0] to all elements of y.
    if (*incy < 0) y = y - (*n - 1) * (*incy);
    for (int i = 0; i < *n; ++i) {
      *y = *x;
-      x += *incx;
      y += *incy;
    }
-  }
+  } else if (*incx > 0 && *incy > 0)
+    make_vector(y, *n, *incy) = make_vector(x, *n, *incx);
+  else if (*incx > 0 && *incy < 0)
+    make_vector(y, *n, -*incy).reverse() = make_vector(x, *n, *incx);
+  else if (*incx < 0 && *incy > 0)
+    make_vector(y, *n, *incy) = make_vector(x, *n, -*incx).reverse();
+  else if (*incx < 0 && *incy < 0)
+    make_vector(y, *n, -*incy) = make_vector(x, *n, -*incx);
 }

 EIGEN_BLAS_FUNC(rotg)(RealScalar *pa, RealScalar *pb, RealScalar *pc, RealScalar *ps) {
--- a/blas/level1_real_impl.h
+++ b/blas/level1_real_impl.h
@@ -58,23 +58,21 @@ extern "C" Scalar EIGEN_BLAS_FUNC_NAME(dot)(int *n, Scalar *px, int *incx, Scala
  Scalar *y = reinterpret_cast<Scalar *>(py);

  if (*incx == 1 && *incy == 1)
-    return (make_vector(x, *n).cwiseProduct(make_vector(y, *n))).sum();
+    return make_vector(x, *n).dot(make_vector(y, *n));
  else if (*incx > 0 && *incy > 0)
-    return (make_vector(x, *n, *incx).cwiseProduct(make_vector(y, *n, *incy))).sum();
+    return make_vector(x, *n, *incx).dot(make_vector(y, *n, *incy));
  else if (*incx < 0 && *incy > 0)
-    return (make_vector(x, *n, -*incx).reverse().cwiseProduct(make_vector(y, *n, *incy))).sum();
+    return make_vector(x, *n, -*incx).reverse().dot(make_vector(y, *n, *incy));
  else if (*incx > 0 && *incy < 0)
-    return (make_vector(x, *n, *incx).cwiseProduct(make_vector(y, *n, -*incy).reverse())).sum();
+    return make_vector(x, *n, *incx).dot(make_vector(y, *n, -*incy).reverse());
  else if (*incx < 0 && *incy < 0)
-    return (make_vector(x, *n, -*incx).reverse().cwiseProduct(make_vector(y, *n, -*incy).reverse())).sum();
+    return make_vector(x, *n, -*incx).reverse().dot(make_vector(y, *n, -*incy).reverse());
  else
    return 0;
 }

 // computes the Euclidean norm of a vector.
-// FIXME
 extern "C" Scalar EIGEN_BLAS_FUNC_NAME(nrm2)(int *n, Scalar *px, int *incx) {
-  //   std::cerr << "_nrm2 " << *n << " " << *incx << "\n";
  if (*n <= 0) return 0;

  Scalar *x = reinterpret_cast<Scalar *>(px);
@@ -108,23 +106,171 @@ EIGEN_BLAS_FUNC(rot)(int *n, Scalar *px, int *incx, Scalar *py, int *incy, Scala
    Eigen::internal::apply_rotation_in_the_plane(vx, vy, Eigen::JacobiRotation<Scalar>(c, s));
 }

-/*
-// performs rotation of points in the modified plane.
-EIGEN_BLAS_FUNC(rotm)(int *n, Scalar *px, int *incx, Scalar *py, int *incy, Scalar *param)
-{
-  Scalar* x = reinterpret_cast<Scalar*>(px);
-  Scalar* y = reinterpret_cast<Scalar*>(py);
+// Applies modified Givens rotation H to vectors x and y.
+//   param[0] = flag:
+//     -1: H = [[h11, h12], [h21, h22]]         (all 4 elements from param)
+//      0: H = [[1, h12], [h21, 1]]              (h12, h21 from param)
+//      1: H = [[h11, 1], [-1, h22]]             (h11, h22 from param)
+//     -2: H = identity                           (no-op)
+//   param[1..4] = h11, h21, h12, h22
+EIGEN_BLAS_FUNC(rotm)(int *n, Scalar *px, int *incx, Scalar *py, int *incy, Scalar *param) {
+  Scalar *x = reinterpret_cast<Scalar *>(px);
+  Scalar *y = reinterpret_cast<Scalar *>(py);

-  // TODO
+  Scalar flag = param[0];
+  if (*n <= 0 || flag == Scalar(-2)) return;

-  return 0;
+  Scalar h11, h12, h21, h22;
+  if (flag < Scalar(0)) {
+    h11 = param[1];
+    h21 = param[2];
+    h12 = param[3];
+    h22 = param[4];
+  } else if (flag == Scalar(0)) {
+    h11 = Scalar(1);
+    h21 = param[2];
+    h12 = param[3];
+    h22 = Scalar(1);
+  } else {
+    h11 = param[1];
+    h21 = Scalar(-1);
+    h12 = Scalar(1);
+    h22 = param[4];
+  }
+
+  int kx = *incx > 0 ? 0 : (1 - *n) * *incx;
+  int ky = *incy > 0 ? 0 : (1 - *n) * *incy;
+
+  for (int i = 0; i < *n; ++i) {
+    Scalar w = x[kx];
+    Scalar z = y[ky];
+    x[kx] = h11 * w + h12 * z;
+    y[ky] = h21 * w + h22 * z;
+    kx += *incx;
+    ky += *incy;
+  }
 }

-// computes the modified parameters for a Givens rotation.
-EIGEN_BLAS_FUNC(rotmg)(Scalar *d1, Scalar *d2, Scalar *x1, Scalar *x2, Scalar *param)
-{
-  // TODO
+// Constructs the modified Givens transformation matrix H which zeros the second
+// component of (sqrt(d1)*x1, sqrt(d2)*y1)^T.
+EIGEN_BLAS_FUNC(rotmg)(Scalar *d1, Scalar *d2, Scalar *x1, Scalar *y1, Scalar *param) {
+  using std::abs;

-  return 0;
+  const Scalar gam = Scalar(4096);
+  const Scalar gamsq = gam * gam;
+  const Scalar rgamsq = Scalar(1) / gamsq;
+
+  Scalar flag, h11 = Scalar(0), h12 = Scalar(0), h21 = Scalar(0), h22 = Scalar(0);
+
+  if (*d1 < Scalar(0)) {
+    // Negative d1: zero everything.
+    flag = Scalar(-1);
+    *d1 = *d2 = *x1 = Scalar(0);
+  } else {
+    Scalar p2 = *d2 * *y1;
+    if (p2 == Scalar(0)) {
+      // d2*y1 == 0: identity transform.
+      param[0] = Scalar(-2);
+      return;
+    }
+
+    Scalar p1 = *d1 * *x1;
+    Scalar q2 = p2 * *y1;
+    Scalar q1 = p1 * *x1;
+    bool do_scale = true;
+
+    if (abs(q1) > abs(q2)) {
+      h21 = -(*y1) / *x1;
+      h12 = p2 / p1;
+      Scalar u = Scalar(1) - h12 * h21;
+      if (u <= Scalar(0)) {
+        flag = Scalar(-1);
+        h11 = h12 = h21 = h22 = Scalar(0);
+        *d1 = *d2 = *x1 = Scalar(0);
+        do_scale = false;
+      } else {
+        flag = Scalar(0);
+        *d1 /= u;
+        *d2 /= u;
+        *x1 *= u;
+      }
+    } else if (q2 < Scalar(0)) {
+      flag = Scalar(-1);
+      h11 = h12 = h21 = h22 = Scalar(0);
+      *d1 = *d2 = *x1 = Scalar(0);
+      do_scale = false;
+    } else {
+      flag = Scalar(1);
+      h11 = p1 / p2;
+      h22 = *x1 / *y1;
+      Scalar u = Scalar(1) + h11 * h22;
+      Scalar temp = *d2 / u;
+      *d2 = *d1 / u;
+      *d1 = temp;
+      *x1 = *y1 * u;
+    }
+
+    if (do_scale) {
+      // Converts compact H representation (flag 0 or 1) to full form (flag -1)
+      // so that scaling factors can be absorbed into all four elements.
+      auto fix_h = [&]() {
+        if (flag >= Scalar(0)) {
+          if (flag == Scalar(0)) {
+            h11 = Scalar(1);
+            h22 = Scalar(1);
+          } else {
+            h21 = Scalar(-1);
+            h12 = Scalar(1);
+          }
+          flag = Scalar(-1);
+        }
+      };
+
+      // Scale d1 up if too small.
+      while (*d1 <= rgamsq && *d1 != Scalar(0)) {
+        fix_h();
+        *d1 *= gamsq;
+        *x1 /= gam;
+        h11 /= gam;
+        h12 /= gam;
+      }
+      // Scale d1 down if too large.
+      while (*d1 >= gamsq) {
+        fix_h();
+        *d1 /= gamsq;
+        *x1 *= gam;
+        h11 *= gam;
+        h12 *= gam;
+      }
+      // Scale |d2| up if too small.
+      while (abs(*d2) <= rgamsq && *d2 != Scalar(0)) {
+        fix_h();
+        *d2 *= gamsq;
+        h21 /= gam;
+        h22 /= gam;
+      }
+      // Scale |d2| down if too large.
+      while (abs(*d2) >= gamsq) {
+        fix_h();
+        *d2 /= gamsq;
+        h21 *= gam;
+        h22 *= gam;
+      }
+    }
+  }
+
+  // Store result in param array.
+  if (flag < Scalar(0)) {
+    param[1] = h11;
+    param[2] = h21;
+    param[3] = h12;
+    param[4] = h22;
+  } else if (flag == Scalar(0)) {
+    param[2] = h21;
+    param[3] = h12;
+  } else {
+    param[1] = h11;
+    param[4] = h22;
+  }
+  param[0] = flag;
 }
-*/
--- a/blas/level2_cplx_impl.h
+++ b/blas/level2_cplx_impl.h
@@ -72,31 +72,193 @@ EIGEN_BLAS_FUNC(hemv)
  if (actual_y != y) delete[] copy_back(actual_y, y, *n, *incy);
 }

-/**  ZHBMV  performs the matrix-vector  operation
+/**  HBMV  performs the matrix-vector operation
 *
 *     y := alpha*A*x + beta*y,
 *
 *  where alpha and beta are scalars, x and y are n element vectors and
 *  A is an n by n hermitian band matrix, with k super-diagonals.
+ *  Diagonal elements are real; off-diagonal contributions use conjugation.
 */
-// EIGEN_BLAS_FUNC(hbmv)(char *uplo, int *n, int *k, RealScalar *alpha, RealScalar *a, int *lda,
-//                           RealScalar *x, int *incx, RealScalar *beta, RealScalar *y, int *incy)
-// {
-//   return 1;
-// }
+EIGEN_BLAS_FUNC(hbmv)
+(char *uplo, int *n, int *k, RealScalar *palpha, RealScalar *pa, int *lda, RealScalar *px, int *incx, RealScalar *pbeta,
+ RealScalar *py, int *incy) {
+  const Scalar alpha = *reinterpret_cast<const Scalar *>(palpha);
+  const Scalar beta = *reinterpret_cast<const Scalar *>(pbeta);
+  const Scalar *a = reinterpret_cast<const Scalar *>(pa);
+  const Scalar *x = reinterpret_cast<const Scalar *>(px);
+  Scalar *y = reinterpret_cast<Scalar *>(py);

-/**  ZHPMV  performs the matrix-vector operation
+  int info = 0;
+  if (UPLO(*uplo) == INVALID)
+    info = 1;
+  else if (*n < 0)
+    info = 2;
+  else if (*k < 0)
+    info = 3;
+  else if (*lda < *k + 1)
+    info = 6;
+  else if (*incx == 0)
+    info = 8;
+  else if (*incy == 0)
+    info = 11;
+  if (info) return xerbla_(SCALAR_SUFFIX_UP "HBMV ", &info);
+
+  if (*n == 0 || (alpha == Scalar(0) && beta == Scalar(1))) return;
+
+  const Scalar *actual_x = get_compact_vector(x, *n, *incx);
+  Scalar *actual_y = get_compact_vector(y, *n, *incy);
+
+  // First form y := beta*y.
+  if (beta != Scalar(1)) {
+    if (beta == Scalar(0))
+      make_vector(actual_y, *n).setZero();
+    else
+      make_vector(actual_y, *n) *= beta;
+  }
+
+  if (alpha == Scalar(0)) {
+    if (actual_x != x) delete[] actual_x;
+    if (actual_y != y) delete[] copy_back(actual_y, y, *n, *incy);
+    return;
+  }
+
+  if (*k >= 8) {
+    // Vectorized path: use Eigen Map segments for the inner band operations.
+    ConstMatrixType band(a, *k + 1, *n, *lda);
+    if (UPLO(*uplo) == UP) {
+      for (int j = 0; j < *n; ++j) {
+        int start = std::max(0, j - *k);
+        int len = j - start;
+        int offset = *k - (j - start);
+        Scalar temp1 = alpha * actual_x[j];
+        actual_y[j] += Scalar(Eigen::numext::real(band(*k, j))) * temp1;
+        if (len > 0) {
+          make_vector(actual_y + start, len) += temp1 * band.col(j).segment(offset, len);
+          actual_y[j] += alpha * band.col(j).segment(offset, len).dot(make_vector(actual_x + start, len));
+        }
+      }
+    } else {
+      for (int j = 0; j < *n; ++j) {
+        int len = std::min(*n - 1, j + *k) - j;
+        Scalar temp1 = alpha * actual_x[j];
+        actual_y[j] += Scalar(Eigen::numext::real(band(0, j))) * temp1;
+        if (len > 0) {
+          make_vector(actual_y + j + 1, len) += temp1 * band.col(j).segment(1, len);
+          actual_y[j] += alpha * band.col(j).segment(1, len).dot(make_vector(actual_x + j + 1, len));
+        }
+      }
+    }
+  } else {
+    // Scalar path: for narrow bandwidth, avoid Map overhead.
+    if (UPLO(*uplo) == UP) {
+      for (int j = 0; j < *n; ++j) {
+        Scalar temp1 = alpha * actual_x[j];
+        Scalar temp2 = Scalar(0);
+        for (int i = std::max(0, j - *k); i < j; ++i) {
+          Scalar aij = a[(*k + i - j) + j * *lda];
+          actual_y[i] += temp1 * aij;
+          temp2 += Eigen::numext::conj(aij) * actual_x[i];
+        }
+        actual_y[j] += Scalar(Eigen::numext::real(a[*k + j * *lda])) * temp1 + alpha * temp2;
+      }
+    } else {
+      for (int j = 0; j < *n; ++j) {
+        Scalar temp1 = alpha * actual_x[j];
+        Scalar temp2 = Scalar(0);
+        actual_y[j] += Scalar(Eigen::numext::real(a[j * *lda])) * temp1;
+        for (int i = j + 1; i <= std::min(*n - 1, j + *k); ++i) {
+          Scalar aij = a[(i - j) + j * *lda];
+          actual_y[i] += temp1 * aij;
+          temp2 += Eigen::numext::conj(aij) * actual_x[i];
+        }
+        actual_y[j] += alpha * temp2;
+      }
+    }
+  }
+
+  if (actual_x != x) delete[] actual_x;
+  if (actual_y != y) delete[] copy_back(actual_y, y, *n, *incy);
+}
+
+/**  HPMV  performs the matrix-vector operation
 *
 *     y := alpha*A*x + beta*y,
 *
 *  where alpha and beta are scalars, x and y are n element vectors and
 *  A is an n by n hermitian matrix, supplied in packed form.
+ *  Diagonal elements are real; off-diagonal contributions use conjugation.
 */
-// EIGEN_BLAS_FUNC(hpmv)(char *uplo, int *n, RealScalar *alpha, RealScalar *ap, RealScalar *x, int *incx, RealScalar
-// *beta, RealScalar *y, int *incy)
-// {
-//   return 1;
-// }
+EIGEN_BLAS_FUNC(hpmv)
+(char *uplo, int *n, RealScalar *palpha, RealScalar *pap, RealScalar *px, int *incx, RealScalar *pbeta, RealScalar *py,
+ int *incy) {
+  const Scalar alpha = *reinterpret_cast<const Scalar *>(palpha);
+  const Scalar beta = *reinterpret_cast<const Scalar *>(pbeta);
+  const Scalar *ap = reinterpret_cast<const Scalar *>(pap);
+  const Scalar *x = reinterpret_cast<const Scalar *>(px);
+  Scalar *y = reinterpret_cast<Scalar *>(py);
+
+  int info = 0;
+  if (UPLO(*uplo) == INVALID)
+    info = 1;
+  else if (*n < 0)
+    info = 2;
+  else if (*incx == 0)
+    info = 6;
+  else if (*incy == 0)
+    info = 9;
+  if (info) return xerbla_(SCALAR_SUFFIX_UP "HPMV ", &info);
+
+  if (*n == 0 || (alpha == Scalar(0) && beta == Scalar(1))) return;
+
+  const Scalar *actual_x = get_compact_vector(x, *n, *incx);
+  Scalar *actual_y = get_compact_vector(y, *n, *incy);
+
+  // First form y := beta*y.
+  if (beta != Scalar(1)) {
+    if (beta == Scalar(0))
+      make_vector(actual_y, *n).setZero();
+    else
+      make_vector(actual_y, *n) *= beta;
+  }
+
+  if (alpha == Scalar(0)) {
+    if (actual_x != x) delete[] actual_x;
+    if (actual_y != y) delete[] copy_back(actual_y, y, *n, *incy);
+    return;
+  }
+
+  int kk = 0;
+  if (UPLO(*uplo) == UP) {
+    // Upper triangle packed: column j occupies ap[kk..kk+j].
+    for (int j = 0; j < *n; ++j) {
+      Scalar temp1 = alpha * actual_x[j];
+      // Diagonal is real.
+      actual_y[j] += Scalar(Eigen::numext::real(ap[kk + j])) * temp1;
+      if (j > 0) {
+        make_vector(actual_y, j) += temp1 * make_vector(ap + kk, j);
+        actual_y[j] += alpha * make_vector(ap + kk, j).dot(make_vector(actual_x, j));
+      }
+      kk += j + 1;
+    }
+  } else {
+    // Lower triangle packed: column j occupies ap[kk..kk+(n-j-1)].
+    for (int j = 0; j < *n; ++j) {
+      int len = *n - j - 1;
+      Scalar temp1 = alpha * actual_x[j];
+      // Diagonal is real.
+      actual_y[j] += Scalar(Eigen::numext::real(ap[kk])) * temp1;
+      if (len > 0) {
+        make_vector(actual_y + j + 1, len) += temp1 * make_vector(ap + kk + 1, len);
+        actual_y[j] += alpha * make_vector(ap + kk + 1, len).dot(make_vector(actual_x + j + 1, len));
+      }
+      kk += *n - j;
+    }
+  }
+
+  if (actual_x != x) delete[] actual_x;
+  if (actual_y != y) delete[] copy_back(actual_y, y, *n, *incy);
+}

 /**  ZHPR    performs the hermitian rank 1 operation
 *
--- a/blas/level2_impl.h
+++ b/blas/level2_impl.h
@@ -303,61 +303,158 @@ EIGEN_BLAS_FUNC(gbmv)
  if (actual_y != y) delete[] copy_back(actual_y, y, actual_m, *incy);
 }

-#if 0
 /**  TBMV  performs one of the matrix-vector operations
-  *
-  *     x := A*x,   or   x := A'*x,
-  *
-  *  where x is an n element vector and  A is an n by n unit, or non-unit,
-  *  upper or lower triangular band matrix, with ( k + 1 ) diagonals.
-  */
-EIGEN_BLAS_FUNC(tbmv)(char *uplo, char *opa, char *diag, int *n, int *k, RealScalar *pa, int *lda, RealScalar *px, int *incx)
-{
-  Scalar* a = reinterpret_cast<Scalar*>(pa);
-  Scalar* x = reinterpret_cast<Scalar*>(px);
-  int coeff_rows = *k + 1;
+ *
+ *     x := A*x,   or   x := A'*x,   or   x := conjg(A')*x,
+ *
+ *  where x is an n element vector and A is an n by n unit, or non-unit,
+ *  upper or lower triangular band matrix, with ( k + 1 ) diagonals.
+ *
+ *  Band storage: upper triangle stores A[i,j] at a[(k+i-j) + j*lda],
+ *  lower triangle stores A[i,j] at a[(i-j) + j*lda].
+ */
+EIGEN_BLAS_FUNC(tbmv)
+(char *uplo, char *opa, char *diag, int *n, int *k, RealScalar *pa, int *lda, RealScalar *px, int *incx) {
+  Scalar *a = reinterpret_cast<Scalar *>(pa);
+  Scalar *x = reinterpret_cast<Scalar *>(px);

  int info = 0;
-       if(UPLO(*uplo)==INVALID)                                       info = 1;
-  else if(OP(*opa)==INVALID)                                          info = 2;
-  else if(DIAG(*diag)==INVALID)                                       info = 3;
-  else if(*n<0)                                                       info = 4;
-  else if(*k<0)                                                       info = 5;
-  else if(*lda<coeff_rows)                                            info = 7;
-  else if(*incx==0)                                                   info = 9;
-  if(info)
-    return xerbla_(SCALAR_SUFFIX_UP"TBMV ",&info,6);
+  if (UPLO(*uplo) == INVALID)
+    info = 1;
+  else if (OP(*opa) == INVALID)
+    info = 2;
+  else if (DIAG(*diag) == INVALID)
+    info = 3;
+  else if (*n < 0)
+    info = 4;
+  else if (*k < 0)
+    info = 5;
+  else if (*lda < *k + 1)
+    info = 7;
+  else if (*incx == 0)
+    info = 9;
+  if (info) return xerbla_(SCALAR_SUFFIX_UP "TBMV ", &info);

-  if(*n==0) return;
+  if (*n == 0) return;

-  int actual_n = *n;
+  Scalar *actual_x = get_compact_vector(x, *n, *incx);

-  Scalar* actual_x = get_compact_vector(x,actual_n,*incx);
+  bool upper = (UPLO(*uplo) == UP);
+  int op = OP(*opa);
+  bool unit = (DIAG(*diag) == UNIT);

-  MatrixType mat_coeffs(a,coeff_rows,*n,*lda);
-
-  int ku = UPLO(*uplo)==UPPER ? *k : 0;
-  int kl = UPLO(*uplo)==LOWER ? *k : 0;
-
-  for(int j=0; j<*n; ++j)
-  {
-    int start = std::max(0,j - ku);
-    int end = std::min((*m)-1,j + kl);
-    int len = end - start + 1;
-    int offset = (ku) - j + start;
-
-    if(OP(*trans)==NOTR)
-      make_vector(actual_y+start,len) += (alpha*actual_x[j]) * mat_coeffs.col(j).segment(offset,len);
-    else if(OP(*trans)==TR)
-      actual_y[j] += alpha * ( mat_coeffs.col(j).segment(offset,len).transpose() * make_vector(actual_x+start,len) ).value();
-    else
-      actual_y[j] += alpha * ( mat_coeffs.col(j).segment(offset,len).adjoint()   * make_vector(actual_x+start,len) ).value();
+  if (*k >= 8) {
+    // Vectorized path: use Eigen Map segments for the inner band operations.
+    ConstMatrixType band(a, *k + 1, *n, *lda);
+    if (op == NOTR) {
+      if (upper) {
+        for (int j = 0; j < *n; ++j) {
+          if (actual_x[j] != Scalar(0)) {
+            int start = std::max(0, j - *k);
+            int len = j - start;
+            int offset = *k - (j - start);
+            Scalar temp = actual_x[j];
+            if (len > 0) make_vector(actual_x + start, len) += temp * band.col(j).segment(offset, len);
+            if (!unit) actual_x[j] = temp * band(*k, j);
+          }
+        }
+      } else {
+        for (int j = *n - 1; j >= 0; --j) {
+          if (actual_x[j] != Scalar(0)) {
+            int len = std::min(*n - 1, j + *k) - j;
+            Scalar temp = actual_x[j];
+            if (len > 0) make_vector(actual_x + j + 1, len) += temp * band.col(j).segment(1, len);
+            if (!unit) actual_x[j] = temp * band(0, j);
+          }
+        }
+      }
+    } else if (op == TR) {
+      if (upper) {
+        for (int j = *n - 1; j >= 0; --j) {
+          int start = std::max(0, j - *k);
+          int len = j - start;
+          int offset = *k - (j - start);
+          Scalar temp = actual_x[j];
+          if (!unit) temp *= band(*k, j);
+          if (len > 0)
+            temp += (band.col(j).segment(offset, len).cwiseProduct(make_vector(actual_x + start, len))).sum();
+          actual_x[j] = temp;
+        }
+      } else {
+        for (int j = 0; j < *n; ++j) {
+          int len = std::min(*n - 1, j + *k) - j;
+          Scalar temp = actual_x[j];
+          if (!unit) temp *= band(0, j);
+          if (len > 0) temp += (band.col(j).segment(1, len).cwiseProduct(make_vector(actual_x + j + 1, len))).sum();
+          actual_x[j] = temp;
+        }
+      }
+    } else {
+      // Conjugate transpose: .dot() computes conj(lhs) . rhs.
+      if (upper) {
+        for (int j = *n - 1; j >= 0; --j) {
+          int start = std::max(0, j - *k);
+          int len = j - start;
+          int offset = *k - (j - start);
+          Scalar temp = actual_x[j];
+          if (!unit) temp *= Eigen::numext::conj(band(*k, j));
+          if (len > 0) temp += band.col(j).segment(offset, len).dot(make_vector(actual_x + start, len));
+          actual_x[j] = temp;
+        }
+      } else {
+        for (int j = 0; j < *n; ++j) {
+          int len = std::min(*n - 1, j + *k) - j;
+          Scalar temp = actual_x[j];
+          if (!unit) temp *= Eigen::numext::conj(band(0, j));
+          if (len > 0) temp += band.col(j).segment(1, len).dot(make_vector(actual_x + j + 1, len));
+          actual_x[j] = temp;
+        }
+      }
+    }
+  } else {
+    // Scalar path: for narrow bandwidth, avoid Map overhead.
+    if (op == NOTR) {
+      if (upper) {
+        for (int j = 0; j < *n; ++j) {
+          if (actual_x[j] != Scalar(0)) {
+            Scalar temp = actual_x[j];
+            for (int i = std::max(0, j - *k); i < j; ++i) actual_x[i] += temp * a[(*k + i - j) + j * *lda];
+            if (!unit) actual_x[j] = temp * a[*k + j * *lda];
+          }
+        }
+      } else {
+        for (int j = *n - 1; j >= 0; --j) {
+          if (actual_x[j] != Scalar(0)) {
+            Scalar temp = actual_x[j];
+            for (int i = j + 1; i <= std::min(*n - 1, j + *k); ++i) actual_x[i] += temp * a[(i - j) + j * *lda];
+            if (!unit) actual_x[j] = temp * a[j * *lda];
+          }
+        }
+      }
+    } else {
+      // Transpose or conjugate transpose.
+      auto maybe_conj = [op](Scalar val) -> Scalar { return op == ADJ ? Eigen::numext::conj(val) : val; };
+      if (upper) {
+        for (int j = *n - 1; j >= 0; --j) {
+          Scalar temp = actual_x[j];
+          if (!unit) temp *= maybe_conj(a[*k + j * *lda]);
+          for (int i = std::max(0, j - *k); i < j; ++i) temp += maybe_conj(a[(*k + i - j) + j * *lda]) * actual_x[i];
+          actual_x[j] = temp;
+        }
+      } else {
+        for (int j = 0; j < *n; ++j) {
+          Scalar temp = actual_x[j];
+          if (!unit) temp *= maybe_conj(a[j * *lda]);
+          for (int i = j + 1; i <= std::min(*n - 1, j + *k); ++i)
+            temp += maybe_conj(a[(i - j) + j * *lda]) * actual_x[i];
+          actual_x[j] = temp;
+        }
+      }
+    }
  }

-  if(actual_x!=x) delete[] actual_x;
-  if(actual_y!=y) delete[] copy_back(actual_y,y,actual_m,*incy);
+  if (actual_x != x) delete[] copy_back(actual_x, x, *n, *incx);
 }
-#endif

 /**  DTBSV  solves one of the systems of equations
 *
--- a/blas/level2_real_impl.h
+++ b/blas/level2_real_impl.h
@@ -158,32 +158,196 @@ EIGEN_BLAS_FUNC(syr2)
  //   func[code](*n, a, *inca, b, *incb, c, *ldc, alpha);
 }

-/**  DSBMV  performs the matrix-vector  operation
+/**  SBMV  performs the matrix-vector operation
 *
 *     y := alpha*A*x + beta*y,
 *
 *  where alpha and beta are scalars, x and y are n element vectors and
 *  A is an n by n symmetric band matrix, with k super-diagonals.
+ *
+ *  Band storage: upper triangle stores A[i,j] at a[(k+i-j) + j*lda],
+ *  lower triangle stores A[i,j] at a[(i-j) + j*lda].
 */
-// EIGEN_BLAS_FUNC(sbmv)( char *uplo, int *n, int *k, RealScalar *alpha, RealScalar *a, int *lda,
-//                            RealScalar *x, int *incx, RealScalar *beta, RealScalar *y, int *incy)
-// {
-//   return 1;
-// }
+EIGEN_BLAS_FUNC(sbmv)
+(char *uplo, int *n, int *k, RealScalar *palpha, RealScalar *pa, int *lda, RealScalar *px, int *incx, RealScalar *pbeta,
+ RealScalar *py, int *incy) {
+  const Scalar alpha = *reinterpret_cast<const Scalar *>(palpha);
+  const Scalar beta = *reinterpret_cast<const Scalar *>(pbeta);
+  const Scalar *a = reinterpret_cast<const Scalar *>(pa);
+  const Scalar *x = reinterpret_cast<const Scalar *>(px);
+  Scalar *y = reinterpret_cast<Scalar *>(py);

-/**  DSPMV  performs the matrix-vector operation
+  int info = 0;
+  if (UPLO(*uplo) == INVALID)
+    info = 1;
+  else if (*n < 0)
+    info = 2;
+  else if (*k < 0)
+    info = 3;
+  else if (*lda < *k + 1)
+    info = 6;
+  else if (*incx == 0)
+    info = 8;
+  else if (*incy == 0)
+    info = 11;
+  if (info) return xerbla_(SCALAR_SUFFIX_UP "SBMV ", &info);
+
+  if (*n == 0 || (alpha == Scalar(0) && beta == Scalar(1))) return;
+
+  const Scalar *actual_x = get_compact_vector(x, *n, *incx);
+  Scalar *actual_y = get_compact_vector(y, *n, *incy);
+
+  // First form y := beta*y.
+  if (beta != Scalar(1)) {
+    if (beta == Scalar(0))
+      make_vector(actual_y, *n).setZero();
+    else
+      make_vector(actual_y, *n) *= beta;
+  }
+
+  if (alpha == Scalar(0)) {
+    if (actual_x != x) delete[] actual_x;
+    if (actual_y != y) delete[] copy_back(actual_y, y, *n, *incy);
+    return;
+  }
+
+  if (*k >= 8) {
+    // Vectorized path: use Eigen Map segments for the inner band operations.
+    ConstMatrixType band(a, *k + 1, *n, *lda);
+    if (UPLO(*uplo) == UP) {
+      for (int j = 0; j < *n; ++j) {
+        int start = std::max(0, j - *k);
+        int len = j - start;
+        int offset = *k - (j - start);
+        Scalar temp1 = alpha * actual_x[j];
+        actual_y[j] += temp1 * band(*k, j);
+        if (len > 0) {
+          make_vector(actual_y + start, len) += temp1 * band.col(j).segment(offset, len);
+          actual_y[j] += alpha * band.col(j).segment(offset, len).dot(make_vector(actual_x + start, len));
+        }
+      }
+    } else {
+      for (int j = 0; j < *n; ++j) {
+        int len = std::min(*n - 1, j + *k) - j;
+        Scalar temp1 = alpha * actual_x[j];
+        actual_y[j] += temp1 * band(0, j);
+        if (len > 0) {
+          make_vector(actual_y + j + 1, len) += temp1 * band.col(j).segment(1, len);
+          actual_y[j] += alpha * band.col(j).segment(1, len).dot(make_vector(actual_x + j + 1, len));
+        }
+      }
+    }
+  } else {
+    // Scalar path: for narrow bandwidth, avoid Map overhead.
+    if (UPLO(*uplo) == UP) {
+      for (int j = 0; j < *n; ++j) {
+        Scalar temp1 = alpha * actual_x[j];
+        Scalar temp2 = Scalar(0);
+        for (int i = std::max(0, j - *k); i < j; ++i) {
+          Scalar aij = a[(*k + i - j) + j * *lda];
+          actual_y[i] += temp1 * aij;
+          temp2 += aij * actual_x[i];
+        }
+        actual_y[j] += temp1 * a[*k + j * *lda] + alpha * temp2;
+      }
+    } else {
+      for (int j = 0; j < *n; ++j) {
+        Scalar temp1 = alpha * actual_x[j];
+        Scalar temp2 = Scalar(0);
+        actual_y[j] += temp1 * a[j * *lda];
+        for (int i = j + 1; i <= std::min(*n - 1, j + *k); ++i) {
+          Scalar aij = a[(i - j) + j * *lda];
+          actual_y[i] += temp1 * aij;
+          temp2 += aij * actual_x[i];
+        }
+        actual_y[j] += alpha * temp2;
+      }
+    }
+  }
+
+  if (actual_x != x) delete[] actual_x;
+  if (actual_y != y) delete[] copy_back(actual_y, y, *n, *incy);
+}
+
+/**  SPMV  performs the matrix-vector operation
 *
 *     y := alpha*A*x + beta*y,
 *
 *  where alpha and beta are scalars, x and y are n element vectors and
 *  A is an n by n symmetric matrix, supplied in packed form.
 *
+ *  Packed storage: upper triangle stores columns sequentially so that
+ *  column j occupies positions kk..kk+j (where kk = j*(j+1)/2),
+ *  lower triangle stores column j at positions kk..kk+(n-j-1).
 */
-// EIGEN_BLAS_FUNC(spmv)(char *uplo, int *n, RealScalar *alpha, RealScalar *ap, RealScalar *x, int *incx, RealScalar
-// *beta, RealScalar *y, int *incy)
-// {
-//   return 1;
-// }
+EIGEN_BLAS_FUNC(spmv)
+(char *uplo, int *n, RealScalar *palpha, RealScalar *pap, RealScalar *px, int *incx, RealScalar *pbeta, RealScalar *py,
+ int *incy) {
+  const Scalar alpha = *reinterpret_cast<const Scalar *>(palpha);
+  const Scalar beta = *reinterpret_cast<const Scalar *>(pbeta);
+  const Scalar *ap = reinterpret_cast<const Scalar *>(pap);
+  const Scalar *x = reinterpret_cast<const Scalar *>(px);
+  Scalar *y = reinterpret_cast<Scalar *>(py);
+
+  int info = 0;
+  if (UPLO(*uplo) == INVALID)
+    info = 1;
+  else if (*n < 0)
+    info = 2;
+  else if (*incx == 0)
+    info = 6;
+  else if (*incy == 0)
+    info = 9;
+  if (info) return xerbla_(SCALAR_SUFFIX_UP "SPMV ", &info);
+
+  if (*n == 0 || (alpha == Scalar(0) && beta == Scalar(1))) return;
+
+  const Scalar *actual_x = get_compact_vector(x, *n, *incx);
+  Scalar *actual_y = get_compact_vector(y, *n, *incy);
+
+  // First form y := beta*y.
+  if (beta != Scalar(1)) {
+    if (beta == Scalar(0))
+      make_vector(actual_y, *n).setZero();
+    else
+      make_vector(actual_y, *n) *= beta;
+  }
+
+  if (alpha == Scalar(0)) {
+    if (actual_x != x) delete[] actual_x;
+    if (actual_y != y) delete[] copy_back(actual_y, y, *n, *incy);
+    return;
+  }
+
+  int kk = 0;
+  if (UPLO(*uplo) == UP) {
+    // Upper triangle packed: column j occupies ap[kk..kk+j].
+    for (int j = 0; j < *n; ++j) {
+      Scalar temp1 = alpha * actual_x[j];
+      actual_y[j] += temp1 * ap[kk + j];
+      if (j > 0) {
+        make_vector(actual_y, j) += temp1 * make_vector(ap + kk, j);
+        actual_y[j] += alpha * make_vector(ap + kk, j).dot(make_vector(actual_x, j));
+      }
+      kk += j + 1;
+    }
+  } else {
+    // Lower triangle packed: column j occupies ap[kk..kk+(n-j-1)].
+    for (int j = 0; j < *n; ++j) {
+      int len = *n - j - 1;
+      Scalar temp1 = alpha * actual_x[j];
+      actual_y[j] += temp1 * ap[kk];
+      if (len > 0) {
+        make_vector(actual_y + j + 1, len) += temp1 * make_vector(ap + kk + 1, len);
+        actual_y[j] += alpha * make_vector(ap + kk + 1, len).dot(make_vector(actual_x + j + 1, len));
+      }
+      kk += *n - j;
+    }
+  }
+
+  if (actual_x != x) delete[] actual_x;
+  if (actual_y != y) delete[] copy_back(actual_y, y, *n, *incy);
+}

 /**  DSPR    performs the symmetric rank 1 operation
 *
--- a/blas/lsame.cpp
+++ b/blas/lsame.cpp
@@ -0,0 +1,15 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include <cctype>
+
+#include "blas.h"
+
+// LSAME returns true if ca and cb are the same letter, regardless of case.
+extern "C" EIGEN_BLAS_API int lsame_(const char *ca, const char *cb) {
+  return std::toupper(static_cast<unsigned char>(*ca)) == std::toupper(static_cast<unsigned char>(*cb));
+}
--- a/ci/build.linux.gitlab-ci.yml
+++ b/ci/build.linux.gitlab-ci.yml
@@ -197,7 +197,7 @@ build:linux:x86-64:nvhpc-26.1:default:unsupported:
    # Additional flags passed to the cuda compiler.
    EIGEN_CI_CUDA_CXX_FLAGS: ""
    # Compute architectures present in the GitLab CI runners.
-    EIGEN_CI_CUDA_COMPUTE_ARCH: "50;75"
+    EIGEN_CI_CUDA_COMPUTE_ARCH: "70;75"
    EIGEN_CI_BUILD_TARGET: buildtests_gpu
    EIGEN_CI_TEST_CUDA_CLANG: "off"
    EIGEN_CI_TEST_CUDA_NVC: "off"
@@ -234,7 +234,7 @@ build:linux:cuda-12.2:clang-12:
 # ROCm HIP
 build:linux:rocm-latest:gcc-10:
  extends: .build:linux:cross
-  image: rocm/dev-ubuntu-24.04:latest
+  image: rocm/dev-ubuntu-24.04:6.3.1
  variables:
    EIGEN_CI_C_COMPILER: gcc-10
    EIGEN_CI_CXX_COMPILER: g++-10
@@ -386,6 +386,6 @@ build:linux:cross:x86-64:clang-14:sanitizer:smoketest:
  rules:
    - if: $CI_PIPELINE_SOURCE == "merge_request_event"
  tags:
-    - saas-linux-medium-amd64
+    - saas-linux-large-amd64
  allow_failure: true
  timeout: 30m
--- a/ci/build.windows.gitlab-ci.yml
+++ b/ci/build.windows.gitlab-ci.yml
@@ -55,7 +55,7 @@ build:windows:x86-64:msvc-14.29:avx512dq:
  extends: .build:windows
  variables:
    # Compute architectures present in the GitLab CI runners.
-    EIGEN_CI_CUDA_COMPUTE_ARCH: "50;75"
+    EIGEN_CI_CUDA_COMPUTE_ARCH: "70;75"
    EIGEN_CI_BUILD_TARGET: buildtests_gpu
    EIGEN_CI_ADDITIONAL_ARGS:
      -DEIGEN_TEST_CUDA=on
@@ -66,8 +66,8 @@ build:windows:x86-64:msvc-14.29:avx512dq:
    - x86-64
    - cuda

-# MSVC 14.29 + CUDA 11.4
-build:windows:x86-64:cuda-11.4:msvc-14.29:
+# MSVC 14.29 + CUDA 12.2
+build:windows:x86-64:cuda-12.2:msvc-14.29:
  extends: .build:windows:cuda
  variables:
-    EIGEN_CI_BEFORE_SCRIPT: $$env:CUDA_PATH=$$env:CUDA_PATH_V11_4
+    EIGEN_CI_BEFORE_SCRIPT: $$env:CUDA_PATH=$$env:CUDA_PATH_V12_2
--- a/ci/test.linux.gitlab-ci.yml
+++ b/ci/test.linux.gitlab-ci.yml
@@ -488,7 +488,6 @@ test:linux:x86-64:clang-14:sanitizer:smoketest:
  variables:
    EIGEN_CI_INSTALL: clang-14 llvm-14 libclang-rt-14-dev
    EIGEN_CI_CTEST_LABEL: smoketest
-    EIGEN_CI_CTEST_PARALLEL: "2"
    EIGEN_CI_CTEST_ARGS: --timeout 120
    ASAN_OPTIONS: "detect_leaks=0:halt_on_error=1:abort_on_error=1:allocator_may_return_null=1:print_stacktrace=1:detect_stack_use_after_return=0"
    ASAN_SYMBOLIZER_PATH: "/usr/lib/llvm-14/bin/llvm-symbolizer"
@@ -496,6 +495,6 @@ test:linux:x86-64:clang-14:sanitizer:smoketest:
  rules:
    - if: $CI_PIPELINE_SOURCE == "merge_request_event"
  tags:
-    - saas-linux-medium-amd64
+    - saas-linux-large-amd64
  allow_failure: true
  timeout: 30m
--- a/ci/test.windows.gitlab-ci.yml
+++ b/ci/test.windows.gitlab-ci.yml
@@ -71,7 +71,7 @@ test:windows:x86-64:msvc-14.29:avx512dq:unsupported:
    - x86-64
    - cuda

-# MSVC 14.29 + CUDA 11.4
-test:windows:x86-64:cuda-11.4:msvc-14.29:
+# MSVC 14.29 + CUDA 12.2
+test:windows:x86-64:cuda-12.2:msvc-14.29:
  extends: .test:windows:cuda
-  needs: [ build:windows:x86-64:cuda-11.4:msvc-14.29 ]
+  needs: [ build:windows:x86-64:cuda-12.2:msvc-14.29 ]
--- a/cmake/EigenConfigureTesting.cmake
+++ b/cmake/EigenConfigureTesting.cmake
@@ -20,7 +20,8 @@ add_dependencies(check buildtests)

 # Convenience target for only building GPU tests.
 add_custom_target(buildtests_gpu)
-add_custom_target(check_gpu COMMAND "ctest" "--output-on-failure"
+add_custom_target(check_gpu COMMAND "ctest" ${EIGEN_CTEST_ARGS}
+                                            "--output-on-failure"
                                            "--no-compress-output"
                                            "--build-no-clean"
                                            "-T" "test"
@@ -71,4 +72,3 @@ elseif(MSVC)
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /D_CRT_SECURE_NO_WARNINGS /D_SCL_SECURE_NO_WARNINGS")
 endif()

-
--- a/cmake/EigenTesting.cmake
+++ b/cmake/EigenTesting.cmake
@@ -8,6 +8,12 @@ macro(ei_add_property prop value)
  endif()
 endmacro()

+if(EIGEN_TEST_HIP AND NOT DEFINED EIGEN_HIP_ARCHITECTURES)
+  set(EIGEN_HIP_ARCHITECTURES
+      gfx900;gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1150;gfx1151
+      CACHE STRING "HIP GPU architectures to build Eigen's HIP tests for.")
+endif()
+
 #internal. See documentation of ei_add_test for details.
 macro(ei_add_test_internal testname testname_with_suffix)
  set(targetname ${testname_with_suffix})
@@ -30,7 +36,7 @@ macro(ei_add_test_internal testname testname_with_suffix)
      hip_reset_flags()
      hip_add_executable(${targetname} ${filename} HIPCC_OPTIONS -std=c++14)
      target_compile_definitions(${targetname} PRIVATE -DEIGEN_USE_HIP)
-      set_property(TARGET ${targetname} PROPERTY HIP_ARCHITECTURES gfx900 gfx906 gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030)
+      set_property(TARGET ${targetname} PROPERTY HIP_ARCHITECTURES "${EIGEN_HIP_ARCHITECTURES}")
    elseif(EIGEN_TEST_CUDA_CLANG)
      set_source_files_properties(${filename} PROPERTIES LANGUAGE CXX)

@@ -134,6 +140,7 @@ macro(ei_add_test_internal testname testname_with_suffix)
  if (is_gpu_test)
    # Add gpu tag for testing only GPU tests.
    set_property(TEST ${testname_with_suffix} APPEND PROPERTY LABELS "gpu")
+    set_property(TEST ${testname_with_suffix} PROPERTY SKIP_RETURN_CODE 77)
  endif()

  if(EIGEN_SYCL)
--- a/doc/DenseDecompositionBenchmark.dox
+++ b/doc/DenseDecompositionBenchmark.dox
@@ -30,10 +30,11 @@ Timings are in \b milliseconds, and factors are relative to the LLT decompositio
 <a name="note_ls">\b *: </a> This decomposition do not support direct least-square solving for over-constrained problems, and the reported timing include the cost to form the symmetric covariance matrix \f$ A^T A \f$.

 \b Observations:
- + LLT is always the fastest solvers.
+ + LLT is always the fastest solver.
 + For largely over-constrained problems, the cost of Cholesky/LU decompositions is dominated by the computation of the symmetric covariance matrix.
- + For large problem sizes, only the decomposition implementing a cache-friendly blocking strategy scale well. Those include LLT, PartialPivLU, HouseholderQR, and BDCSVD. This explain why for a 4k x 4k matrix, HouseholderQR is faster than LDLT. In the future, LDLT and ColPivHouseholderQR will also implement blocking strategies.
+ + For large problem sizes, only the decompositions implementing a cache-friendly blocking strategy scale well. Those include LLT, PartialPivLU, HouseholderQR, and BDCSVD. This explains why for a 4k x 4k matrix, HouseholderQR is faster than LDLT.
 + CompleteOrthogonalDecomposition is based on ColPivHouseholderQR and they thus achieve the same level of performance.
+ + FullPivLU and FullPivHouseholderQR are dramatically slower for large matrices due to the lack of blocking, and are not shown for the 4k x 4k case.

 The above table was originally generated by a benchmark tool. Feel free to write your own benchmark to generate a table matching your hardware, compiler, and favorite problem sizes.

--- a/doc/LeastSquares.dox
+++ b/doc/LeastSquares.dox
@@ -7,13 +7,33 @@ of equations, say \a Ax = \a b, has no solutions. In this case, it makes sense t
 vector \a x which is closest to being a solution, in the sense that the difference \a Ax - \a b is
 as small as possible. This \a x is called the least square solution (if the Euclidean norm is used).

-The three methods discussed on this page are the SVD decomposition, the QR decomposition and normal
-equations. Of these, the SVD decomposition is generally the most accurate but the slowest, normal
-equations is the fastest but least accurate, and the QR decomposition is in between.
+The methods discussed on this page are the complete orthogonal decomposition (COD), the SVD
+decomposition, other QR decompositions, and normal equations. For most problems, we recommend
+CompleteOrthogonalDecomposition: it robustly computes the minimum-norm least squares solution
+(like the SVD) for both over- and under-determined systems, including rank-deficient ones, but at
+QR-like speed. The SVD is the most robust but also the slowest; use it when you also need singular
+values or vectors. Normal equations are the fastest but least robust.

 \eigenAutoToc


+\section LeastSquaresCOD Using the complete orthogonal decomposition (recommended)
+
+CompleteOrthogonalDecomposition is the recommended method for least squares problems. It handles the
+widest class of problems — overdetermined, underdetermined, and rank-deficient systems — and computes
+the minimum-norm solution when the system is rank-deficient or underdetermined, just like the SVD.
+It is based on a rank-revealing QR factorization (ColPivHouseholderQR) followed by a post-processing
+step, so it is significantly faster than SVD while providing comparable robustness.
+
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr>
+  <td>\include LeastSquaresCOD.cpp </td>
+  <td>\verbinclude LeastSquaresCOD.out </td>
+</tr>
+</table>
+
+
 \section LeastSquaresSVD Using the SVD decomposition

 The \link BDCSVD::solve() solve() \endlink method in the BDCSVD class can be directly used to
@@ -30,16 +50,19 @@ computing least squares solutions:
 </table>

 This is example from the page \link TutorialLinearAlgebra Linear algebra and decompositions \endlink.
-If you just need to solve the least squares problem, but are not interested in the SVD per se, a
-faster alternative method is CompleteOrthogonalDecomposition. 
+The SVD gives you singular values and vectors in addition to the least squares solution, but if you
+only need the solution, CompleteOrthogonalDecomposition (above) is faster.


-\section LeastSquaresQR Using the QR decomposition
+\section LeastSquaresQR Using other QR decompositions

-The solve() method in QR decomposition classes also computes the least squares solution. There are
-three QR decomposition classes: HouseholderQR (no pivoting, fast but unstable if your matrix is
-not rull rank), ColPivHouseholderQR (column pivoting, thus a bit slower but more stable) and
-FullPivHouseholderQR (full pivoting, so slowest and slightly more stable than ColPivHouseholderQR).
+The solve() method in QR decomposition classes also computes the least squares solution. Besides
+CompleteOrthogonalDecomposition (above), there are three other QR decomposition classes:
+HouseholderQR (no pivoting, so fast but unreliable if your matrix is not full rank),
+ColPivHouseholderQR (column pivoting, a bit slower but rank-revealing), and FullPivHouseholderQR
+(full pivoting, significantly slower and rarely needed in practice).
+Note that only CompleteOrthogonalDecomposition and the SVD-based solvers compute minimum-norm
+solutions for rank-deficient or underdetermined problems; the other QR variants do not.
 Here is an example with column pivoting:

 <table class="example">
--- a/doc/TopicLinearAlgebraDecompositions.dox
+++ b/doc/TopicLinearAlgebraDecompositions.dox
@@ -42,10 +42,10 @@ To get an overview of the true relative speed of the different decompositions, c
    <tr class="alt">
        <td>FullPivLU</td>
        <td>-</td>
-        <td>Slow</td>
+        <td>Slow (no blocking)</td>
        <td>Proven</td>
        <td>Yes</td>
-        <td>-</td>
+        <td>Rank, kernel, image</td>
        <td>Yes</td>
        <td>Excellent</td>
        <td>-</td>
@@ -78,7 +78,7 @@ To get an overview of the true relative speed of the different decompositions, c
    <tr>
        <td>FullPivHouseholderQR</td>
        <td>-</td>
-        <td>Slow</td>
+        <td>Slow (no blocking)</td>
        <td>Proven</td>
        <td>Yes</td>
        <td>Orthogonalization</td>
@@ -120,7 +120,7 @@ To get an overview of the true relative speed of the different decompositions, c
        <td>-</td>
        <td>Yes</td>
        <td>Excellent</td>
-        <td><em>Soon: blocking</em></td>
+        <td>-</td>
    </tr>

    <tr><th class="inter" colspan="9">\n Singular values and eigenvalues decompositions</th></tr>
@@ -232,7 +232,7 @@ To get an overview of the true relative speed of the different decompositions, c
        <td>-</td>
        <td>-</td>
        <td>Good</td>
-        <td><em>Soon: blocking</em></td>
+        <td>-</td>
    </tr>

    <tr>
@@ -244,7 +244,7 @@ To get an overview of the true relative speed of the different decompositions, c
        <td>-</td>
        <td>-</td>
        <td>Good</td>
-        <td><em>Soon: blocking</em></td>
+        <td>-</td>
    </tr>

 </table>
@@ -253,9 +253,32 @@ To get an overview of the true relative speed of the different decompositions, c
 <ul>
 <li><a name="note1">\b 1: </a>There exist two variants of the LDLT algorithm. Eigen's one produces a pure diagonal D matrix, and therefore it cannot handle indefinite matrices, unlike Lapack's one which produces a block diagonal D matrix.</li>
 <li><a name="note2">\b 2: </a>Eigenvalues, SVD and Schur decompositions rely on iterative algorithms. Their convergence speed depends on how well the eigenvalues are separated.</li>
-<li><a name="note3">\b 3: </a>Our JacobiSVD is two-sided, making for proven and optimal precision for square matrices. For non-square matrices, we have to use a QR preconditioner first. The default choice, ColPivHouseholderQR, is already very reliable, but if you want it to be proven, use FullPivHouseholderQR instead.
+<li><a name="note3">\b 3: </a>Our JacobiSVD is two-sided, making for proven and optimal precision for square matrices. For non-square matrices, we have to use a QR preconditioner first. The default choice, ColPivHouseholderQR, is already very reliable, but if you want it to be proven, use FullPivHouseholderQR instead.</li>
 </ul>

+\section TopicLinAlgPracticalGuidance Practical guidance
+
+The following recommendations apply to the most common use cases:
+
+\li <b>Symmetric positive definite systems:</b> Use \b LLT. It is the fastest solver and has excellent
+    numerical properties for this class of problems. For semidefinite or nearly singular symmetric systems,
+    use \b LDLT.
+\li <b>General invertible systems:</b> Use \b PartialPivLU. It uses cache-friendly blocking and implicit
+    multi-threading, making it the fastest general-purpose solver. Partial pivoting is sufficient for
+    virtually all practical problems.
+\li <b>Least squares (over- or under-determined systems):</b> Use \b CompleteOrthogonalDecomposition as
+    the default. Like the SVD, it robustly computes the minimum-norm solution for rank-deficient and
+    under-determined problems, but at QR-like speed. Use \b BDCSVD when you also need singular values
+    or vectors, not just the least squares solution.
+\li <b>Full-rank least squares (overdetermined systems):</b> When the matrix is known to be full rank,
+    \b HouseholderQR is the fastest option. For very tall and skinny well-conditioned matrices,
+    solving via the normal equations with \b LLT can be faster still.
+\li <b>FullPivLU and FullPivHouseholderQR</b> use complete pivoting, which prevents the use of
+    cache-friendly blocking algorithms and makes them significantly slower than their partial/column
+    pivoting counterparts. In practice, complete pivoting rarely provides meaningful accuracy benefits.
+    These decompositions are primarily useful for debugging, pedagogy, or the very rare case
+    where column pivoting is insufficient.
+
 \section TopicLinAlgTerminology Terminology

 <dl>
--- a/doc/TutorialLinearAlgebra.dox
+++ b/doc/TutorialLinearAlgebra.dox
@@ -43,7 +43,23 @@ depending on your matrix, the problem you are trying to solve, and the trade-off
        <th>Requirements<br/>on the matrix</th>
        <th>Speed<br/> (small-to-medium)</th>
        <th>Speed<br/> (large)</th>
-        <th>Accuracy</th>
+        <th>Robustness<sup><a href="#note_robust">*</a></sup></th>
+    </tr>
+    <tr>
+        <td>LLT</td>
+        <td>llt()</td>
+        <td>Positive definite</td>
+        <td>+++</td>
+        <td>+++</td>
+        <td>+</td>
+    </tr>
+    <tr class="alt">
+        <td>LDLT</td>
+        <td>ldlt()</td>
+        <td>Positive or negative<br/> semidefinite</td>
+        <td>+++</td>
+        <td>+</td>
+        <td>++</td>
    </tr>
    <tr>
        <td>PartialPivLU</td>
@@ -54,14 +70,6 @@ depending on your matrix, the problem you are trying to solve, and the trade-off
        <td>+</td>
    </tr>
    <tr class="alt">
-        <td>FullPivLU</td>
-        <td>fullPivLu()</td>
-        <td>None</td>
-        <td>-</td>
-        <td>- -</td>
-        <td>+++</td>
-    </tr>
-    <tr>
        <td>HouseholderQR</td>
        <td>householderQr()</td>
        <td>None</td>
@@ -69,7 +77,7 @@ depending on your matrix, the problem you are trying to solve, and the trade-off
        <td>++</td>
        <td>+</td>
    </tr>
-    <tr class="alt">
+    <tr>
        <td>ColPivHouseholderQR</td>
        <td>colPivHouseholderQr()</td>
        <td>None</td>
@@ -77,14 +85,6 @@ depending on your matrix, the problem you are trying to solve, and the trade-off
        <td>-</td>
        <td>+++</td>
    </tr>
-    <tr>
-        <td>FullPivHouseholderQR</td>
-        <td>fullPivHouseholderQr()</td>
-        <td>None</td>
-        <td>-</td>
-        <td>- -</td>
-        <td>+++</td>
-    </tr>
    <tr class="alt">
        <td>CompleteOrthogonalDecomposition</td>
        <td>completeOrthogonalDecomposition()</td>
@@ -93,23 +93,7 @@ depending on your matrix, the problem you are trying to solve, and the trade-off
        <td>-</td>
        <td>+++</td>
    </tr>
-    <tr class="alt">
-        <td>LLT</td>
-        <td>llt()</td>
-        <td>Positive definite</td>
-        <td>+++</td>
-        <td>+++</td>
-        <td>+</td>
-    </tr>
    <tr>
-        <td>LDLT</td>
-        <td>ldlt()</td>
-        <td>Positive or negative<br/> semidefinite</td>
-        <td>+++</td>
-        <td>+</td>
-        <td>++</td>
-    </tr>
-    <tr class="alt">
        <td>BDCSVD</td>
        <td>bdcSvd()</td>
        <td>None</td>
@@ -126,15 +110,36 @@ depending on your matrix, the problem you are trying to solve, and the trade-off
        <td>+++</td>
    </tr>
 </table>
+
+<a name="note_robust"><b>*</b></a> The <b>Robustness</b> column indicates how well the decomposition handles
+ill-conditioned or rank-deficient matrices. All decompositions give excellent accuracy when their
+requirements on the matrix are met and the problem is well-conditioned.
+
 To get an overview of the true relative speed of the different decompositions, check this \link DenseDecompositionBenchmark benchmark \endlink.

-All of these decompositions offer a solve() method that works as in the above example. 
+All of these decompositions offer a solve() method that works as in the above example.

-If you know more about the properties of your matrix, you can use the above table to select the best method.
-For example, a good choice for solving linear systems with a non-symmetric matrix of full rank is PartialPivLU.
-If you know that your matrix is also symmetric and positive definite, the above table says that
-a very good choice is the LLT or LDLT decomposition. Here's an example, also demonstrating that using a general
-matrix (not a vector) as right hand side is possible:
+\b Practical \b recommendations:
+\li If your matrix is symmetric positive definite, use \b LLT. It is the fastest and is perfectly accurate
+    for this class of problems. If your matrix is only positive or negative semidefinite, use \b LDLT.
+\li For a general invertible matrix, \b PartialPivLU is the best choice. It is fast (uses cache-friendly
+    blocking) and reliable for the vast majority of problems.
+\li For least squares problems (over- or under-determined systems), \b CompleteOrthogonalDecomposition
+    is the recommended default. Like the SVD, it robustly computes the minimum-norm solution for
+    rank-deficient and under-determined problems, but at the cost of a QR decomposition rather than
+    an SVD. Use \b ColPivHouseholderQR if you only need least squares for full-rank overdetermined
+    systems and don't need the minimum-norm property.
+\li \b SVD decompositions (BDCSVD, JacobiSVD) are the most robust but also the slowest. Use these when
+    you need singular values/vectors, not just the solution.
+\li \b HouseholderQR is the fastest option for full-rank least squares problems, but it does not
+    reveal rank and cannot compute minimum-norm solutions for rank-deficient problems.
+\li FullPivLU and FullPivHouseholderQR use complete pivoting, which is significantly slower due to
+    lack of blocking. In practice, they rarely provide meaningful benefits over PartialPivLU and
+    ColPivHouseholderQR, respectively, and are not recommended for general use. They are primarily useful
+    for debugging or for pedagogical purposes.
+
+Here's an example showing the use of LLT for a symmetric positive definite system, also demonstrating
+that using a general matrix (not a vector) as right hand side is possible:

 <table class="example">
 <tr><th>Example:</th><th>Output:</th></tr>
@@ -151,14 +156,15 @@ supports many other decompositions), see our special page on

 \section TutorialLinAlgLeastsquares Least squares solving

-The most general and accurate method to solve under- or over-determined linear systems
-in the least squares sense, is the SVD decomposition. Eigen provides two implementations.
-The recommended one is the BDCSVD class, which scales well for large problems
-and automatically falls back to the JacobiSVD class for smaller problems.
-For both classes, their solve() method solved the linear system in the least-squares
-sense. 
+The recommended method to solve under- or over-determined linear systems in the least squares sense is
+\b CompleteOrthogonalDecomposition. Like the SVD, it robustly computes the minimum-norm least squares
+solution, correctly handling rank-deficient and under-determined problems, but it is significantly faster
+since it is based on a rank-revealing QR decomposition rather than a full SVD.

-Here is an example:
+If you also need the singular values or vectors themselves (not just the least squares solution), use
+\b BDCSVD, which scales well for large problems and automatically falls back to JacobiSVD for smaller ones.
+
+Here is an example using the SVD:
 <table class="example">
 <tr><th>Example:</th><th>Output:</th></tr>
 <tr>
@@ -167,11 +173,9 @@ Here is an example:
 </tr>
 </table>

-An alternative to the SVD, which is usually faster and about as accurate, is CompleteOrthogonalDecomposition. 
-
-Again, if you know more about the problem, the table above contains methods that are potentially faster.
-If your matrix is full rank, HouseHolderQR is the method of choice. If your matrix is full rank and well conditioned,
-using the Cholesky decomposition (LLT) on the matrix of the normal equations can be faster still.
+If you know more about the problem, faster methods are available.
+If your matrix is full rank, HouseholderQR is the fastest method. If your matrix is full rank and
+well conditioned, using the Cholesky decomposition (LLT) on the normal equations can be faster still.
 Our page on \link LeastSquares least squares solving \endlink has more details.


@@ -267,8 +271,9 @@ singular matrix). On \ref TopicLinearAlgebraDecompositions "this table" you can
 whether they are rank-revealing or not.

 Rank-revealing decompositions offer at least a rank() method. They can also offer convenience methods such as isInvertible(),
-and some are also providing methods to compute the kernel (null-space) and image (column-space) of the matrix, as is the
-case with FullPivLU:
+and some are also providing methods to compute the kernel (null-space) and image (column-space) of the matrix.
+ColPivHouseholderQR, CompleteOrthogonalDecomposition, and FullPivLU all provide these methods. Here is an example using
+FullPivLU:

 <table class="example">
 <tr><th>Example:</th><th>Output:</th></tr>
--- a/doc/snippets/LeastSquaresCOD.cpp
+++ b/doc/snippets/LeastSquaresCOD.cpp
@@ -0,0 +1,3 @@
+MatrixXf A = MatrixXf::Random(3, 2);
+VectorXf b = VectorXf::Random(3);
+cout << "The solution using the COD is:\n" << A.completeOrthogonalDecomposition().solve(b) << endl;
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -433,7 +433,7 @@ if(EIGEN_TEST_CUDA_NVC AND NOT CMAKE_CXX_COMPILER_ID MATCHES "NVHPC")
  message(WARNING "EIGEN_TEST_CUDA_NVC is set, but CMAKE_CXX_COMPILER does not appear to be nvc++.")
 endif()

-find_package(CUDA 9.0)
+find_package(CUDA 11.4)
 if(CUDA_FOUND AND EIGEN_TEST_CUDA)
  # Make sure to compile without the -pedantic, -Wundef, -Wnon-virtual-dtor
  # and -fno-check-new flags since they trigger thousands of compilation warnings
@@ -502,6 +502,9 @@ if (EIGEN_TEST_HIP)
  endif()

  find_package(HIP REQUIRED)
+  if (HIP_FOUND AND HIP_VERSION VERSION_LESS "5.6")
+    message(FATAL_ERROR "Eigen requires ROCm/HIP >= 5.6, found ${HIP_VERSION}")
+  endif()
  if (HIP_FOUND)
    execute_process(COMMAND ${HIP_PATH}/bin/hipconfig --platform OUTPUT_VARIABLE HIP_PLATFORM)

--- a/test/bdcsvd.cpp
+++ b/test/bdcsvd.cpp
@@ -15,6 +15,7 @@
 #define EIGEN_RUNTIME_NO_MALLOC

 #include "main.h"
+#include "tridiag_test_matrices.h"
 #include <Eigen/SVD>

 #define SVD_DEFAULT(M) BDCSVD<M>
@@ -146,148 +147,26 @@ void verify_bidiagonal_vs_matrix_svd(const Matrix<RealScalar, Dynamic, 1>& diag,

 template <typename RealScalar>
 void bdcsvd_bidiagonal_hard_cases() {
-  using std::abs;
-  using std::cos;
-  using std::pow;
-  using std::sin;
-  typedef Matrix<RealScalar, Dynamic, 1> VectorXr;
-
  Eigen::internal::set_is_malloc_allowed(true);

-  const RealScalar eps = NumTraits<RealScalar>::epsilon();
+  // Use the shared tridiagonal test matrix generators.
+  // Each generator fills (diag, offdiag) which we treat as (diagonal, superdiagonal)
+  // of a bidiagonal matrix.
+  test::for_all_tridiag_test_matrices<RealScalar>(
+      [](const auto& diag, const auto& offdiag) { verify_bidiagonal_svd<RealScalar>(diag, offdiag); });

-  // Test sizes: cover n=1, very small, below/above algoSwap (16), and larger.
-  const int sizes[] = {1, 2, 3, 5, 10, 16, 20, 50, 100};
-  const int numSizes = sizeof(sizes) / sizeof(sizes[0]);
+  // Additional SVD-specific test: identity with cross-validation against full matrix SVD.
+  test::for_tridiag_sizes<RealScalar>([](auto& diag, auto& offdiag) {
+    test::tridiag_identity(diag, offdiag);
+    verify_bidiagonal_vs_matrix_svd<RealScalar>(diag, offdiag);
+  });

-  for (int si = 0; si < numSizes; ++si) {
-    const Index n = sizes[si];
-    VectorXr diag(n), superdiag(n > 1 ? n - 1 : 0);
-
-    // 1. Identity: d=[1,...,1], e=[0,...,0]
-    diag.setOnes();
-    superdiag.setZero();
-    verify_bidiagonal_svd<RealScalar>(diag, superdiag);
-    verify_bidiagonal_vs_matrix_svd<RealScalar>(diag, superdiag);
-
-    // 2. Zero: d=[0,...,0], e=[0,...,0]
-    diag.setZero();
-    superdiag.setZero();
-    verify_bidiagonal_svd<RealScalar>(diag, superdiag);
-
-    // 3. Scalar (only meaningful for n=1, but runs for all)
-    if (n == 1) {
-      diag(0) = RealScalar(3.14);
-      verify_bidiagonal_svd<RealScalar>(diag, superdiag);
-    }
-
-    // 4. Golub-Kahan: d=[1,...,1], e=[1,...,1]
-    diag.setOnes();
-    if (n > 1) superdiag.setOnes();
-    verify_bidiagonal_svd<RealScalar>(diag, superdiag);
-
-    // 5. Kahan matrix: d_i = s^(i-1), e_i = -c*s^(i-1)
-    // Clamp exponents so condition number stays bounded by 1/eps.
-    {
-      const RealScalar theta = RealScalar(0.3);
-      const RealScalar s = sin(theta);
-      const RealScalar c = cos(theta);
-      using std::log;
-      const RealScalar maxPower = -log(eps) / (-log(s));
-      for (Index i = 0; i < n; ++i) diag(i) = pow(s, numext::mini(RealScalar(i), maxPower));
-      for (Index i = 0; i < n - 1; ++i) superdiag(i) = -c * pow(s, numext::mini(RealScalar(i), maxPower));
-      verify_bidiagonal_svd<RealScalar>(diag, superdiag);
-    }
-
-    // 6. Geometric decay diagonal: d_i = 0.5^i, e=[0,...,0]
-    // Clamp so condition number stays bounded by 1/eps.
-    {
-      using std::log;
-      const RealScalar base = RealScalar(0.5);
-      const RealScalar maxPower = -log(eps) / (-log(base));
-      for (Index i = 0; i < n; ++i) diag(i) = pow(base, numext::mini(RealScalar(i), maxPower));
-      superdiag.setZero();
-      verify_bidiagonal_svd<RealScalar>(diag, superdiag);
-    }
-
-    // 7. Geometric decay superdiagonal: d=[1,...,1], e_i = 0.5^i
-    diag.setOnes();
-    for (Index i = 0; i < n - 1; ++i) superdiag(i) = pow(RealScalar(0.5), RealScalar(i));
-    verify_bidiagonal_svd<RealScalar>(diag, superdiag);
-
-    // 8. Clustered at 1: d_i = 1 + i*eps, e=[0,...,0]
-    for (Index i = 0; i < n; ++i) diag(i) = RealScalar(1) + RealScalar(i) * eps;
-    superdiag.setZero();
-    verify_bidiagonal_svd<RealScalar>(diag, superdiag);
-
-    // 9. Two clusters: half ≈ 1, half ≈ eps
-    for (Index i = 0; i < n; ++i) diag(i) = (i < n / 2) ? RealScalar(1) : eps;
-    superdiag.setZero();
-    verify_bidiagonal_svd<RealScalar>(diag, superdiag);
-
-    // 10. Single tiny singular value: d=[1,...,1,eps], e=[eps^2,...]
-    diag.setOnes();
-    diag(n - 1) = eps;
-    for (Index i = 0; i < n - 1; ++i) superdiag(i) = eps * eps;
-    verify_bidiagonal_svd<RealScalar>(diag, superdiag);
-
-    // 11. Graded: d_i = 10^(-i), e_i = 10^(-i)
-    for (Index i = 0; i < n; ++i) diag(i) = pow(RealScalar(10), -RealScalar(i));
-    for (Index i = 0; i < n - 1; ++i) superdiag(i) = pow(RealScalar(10), -RealScalar(i));
-    verify_bidiagonal_svd<RealScalar>(diag, superdiag);
-
-    // 12. Nearly diagonal: random diag, eps * random superdiag
-    diag = VectorXr::Random(n).cwiseAbs() + VectorXr::Constant(n, RealScalar(0.1));
-    for (Index i = 0; i < n - 1; ++i) superdiag(i) = eps * (RealScalar(0.5) + abs(internal::random<RealScalar>()));
-    verify_bidiagonal_svd<RealScalar>(diag, superdiag);
-
-    // 13. All equal: d=[c,...,c], e=[c,...,c]
-    diag.setConstant(RealScalar(2.5));
-    if (n > 1) superdiag.setConstant(RealScalar(2.5));
-    verify_bidiagonal_svd<RealScalar>(diag, superdiag);
-
-    // 14. Wilkinson: d_i = |n/2 - i|, e=[1,...,1]
-    for (Index i = 0; i < n; ++i) diag(i) = abs(RealScalar(n / 2) - RealScalar(i));
-    if (n > 1) superdiag.setOnes();
-    verify_bidiagonal_svd<RealScalar>(diag, superdiag);
-
-    // 15. Overflow/underflow: alternating big/tiny diagonal, tiny/big superdiagonal
-    {
-      const RealScalar big = (std::numeric_limits<RealScalar>::max)() / RealScalar(1000);
-      const RealScalar tiny = (std::numeric_limits<RealScalar>::min)() * RealScalar(1000);
-      for (Index i = 0; i < n; ++i) diag(i) = (i % 2 == 0) ? big : tiny;
-      for (Index i = 0; i < n - 1; ++i) superdiag(i) = (i % 2 == 0) ? tiny : big;
-      verify_bidiagonal_svd<RealScalar>(diag, superdiag);
-    }
-
-    // 16. Prescribed condition number: d_i = kappa^(-i/(n-1)), e_i = eps * random
-    if (n > 1) {
-      const RealScalar kappa = RealScalar(1) / eps;
-      for (Index i = 0; i < n; ++i) diag(i) = pow(kappa, -RealScalar(i) / RealScalar(n - 1));
-      for (Index i = 0; i < n - 1; ++i) superdiag(i) = eps * abs(internal::random<RealScalar>());
-      verify_bidiagonal_svd<RealScalar>(diag, superdiag);
-    }
-
-    // 17. Rank-deficient: d=[1,..,0,..,0,..,1], e=[0,...,0]
-    for (Index i = 0; i < n; ++i) diag(i) = (i < n / 3 || i >= 2 * n / 3) ? RealScalar(1) : RealScalar(0);
-    superdiag.setZero();
-    verify_bidiagonal_svd<RealScalar>(diag, superdiag);
-
-    // 18. Arrowhead stress: d_i = linspace(1, n), e_i = 1/(i+1)
-    for (Index i = 0; i < n; ++i) diag(i) = RealScalar(1) + RealScalar(i);
-    for (Index i = 0; i < n - 1; ++i) superdiag(i) = RealScalar(1) / RealScalar(i + 1);
-    verify_bidiagonal_svd<RealScalar>(diag, superdiag);
-
-    // 19. Repeated singular values: d=[1,2,3,1,2,3,...], e=[0,...,0]
-    for (Index i = 0; i < n; ++i) diag(i) = RealScalar((i % 3) + 1);
-    superdiag.setZero();
-    verify_bidiagonal_svd<RealScalar>(diag, superdiag);
-
-    // 20. Glued identity: d=[1,...,1], e=0 except e[n/2-1]=eps
-    diag.setOnes();
-    superdiag.setZero();
-    if (n > 2) superdiag(n / 2 - 1) = eps;
-    verify_bidiagonal_svd<RealScalar>(diag, superdiag);
+  // Additional SVD-specific test: scalar for n=1.
+  {
+    typedef Matrix<RealScalar, Dynamic, 1> VectorXr;
+    VectorXr diag(1), offdiag(0);
+    diag(0) = RealScalar(3.14);
+    verify_bidiagonal_svd<RealScalar>(diag, offdiag);
  }
 }

--- a/test/eigensolver_selfadjoint.cpp
+++ b/test/eigensolver_selfadjoint.cpp
@@ -10,6 +10,7 @@

 #include "main.h"
 #include "svd_fill.h"
+#include "tridiag_test_matrices.h"
 #include <limits>
 #include <Eigen/Eigenvalues>
 #include <Eigen/SparseCore>
@@ -25,17 +26,39 @@ void selfadjointeigensolver_essential_check(const MatrixType& m) {
  SelfAdjointEigenSolver<MatrixType> eiSymm(m);
  VERIFY_IS_EQUAL(eiSymm.info(), Success);

+  Index n = m.cols();
  RealScalar scaling = m.cwiseAbs().maxCoeff();
-  RealScalar unitary_error_factor = RealScalar(32);

  if (scaling < (std::numeric_limits<RealScalar>::min)()) {
    VERIFY(eiSymm.eigenvalues().cwiseAbs().maxCoeff() <= (std::numeric_limits<RealScalar>::min)());
  } else {
-    VERIFY_IS_APPROX((m.template selfadjointView<Lower>() * eiSymm.eigenvectors()) / scaling,
-                     (eiSymm.eigenvectors() * eiSymm.eigenvalues().asDiagonal()) / scaling);
+    // Columnwise residual check: for each eigenpair (lambda_i, v_i),
+    //   ||A*v_i - lambda_i*v_i|| / ||A||_max <= c * n * eps
+    // This ensures accuracy for every eigenpair, including those corresponding
+    // to small eigenvalues (which a Frobenius norm check would miss).
+    // Computed in scaled space (dividing by ||A||_max) to avoid overflow.
+    MatrixType scaledA = m.template selfadjointView<Lower>();
+    scaledA /= scaling;
+    MatrixType residual =
+        scaledA * eiSymm.eigenvectors() - eiSymm.eigenvectors() * (eiSymm.eigenvalues() / scaling).asDiagonal();
+    RealScalar tol = RealScalar(4) * RealScalar(numext::maxi(Index(1), n)) * NumTraits<RealScalar>::epsilon();
+    for (Index i = 0; i < n; ++i) {
+      VERIFY(residual.col(i).norm() <= tol);
+    }
  }
  VERIFY_IS_APPROX(m.template selfadjointView<Lower>().eigenvalues(), eiSymm.eigenvalues());
-  VERIFY(eiSymm.eigenvectors().isUnitary(test_precision<RealScalar>() * unitary_error_factor));
+
+  // Eigenvectors must be unitary. Use a tolerance proportional to n*epsilon,
+  // which is the expected rounding error for Householder-based orthogonal transformations.
+  RealScalar unitary_tol = RealScalar(4) * RealScalar(numext::maxi(Index(1), n)) * NumTraits<RealScalar>::epsilon();
+  // But don't go below the test_precision floor (matters for float).
+  unitary_tol = numext::maxi(unitary_tol, test_precision<RealScalar>());
+  VERIFY(eiSymm.eigenvectors().isUnitary(unitary_tol));
+
+  // Verify eigenvalues are sorted in non-decreasing order.
+  for (Index i = 1; i < n; ++i) {
+    VERIFY(eiSymm.eigenvalues()(i) >= eiSymm.eigenvalues()(i - 1));
+  }

  if (m.cols() <= 4) {
    SelfAdjointEigenSolver<MatrixType> eiDirect;
@@ -53,12 +76,20 @@ void selfadjointeigensolver_essential_check(const MatrixType& m) {
      VERIFY(eiDirect.eigenvalues().cwiseAbs().maxCoeff() <= (std::numeric_limits<RealScalar>::min)());
    } else {
      VERIFY_IS_APPROX(eiSymm.eigenvalues() / scaling, eiDirect.eigenvalues() / scaling);
+      // TODO: the direct 3x3 solver can produce large backward errors (>>n*eps*||A||)
+      // on some matrices. Investigate and fix, then tighten this to a Frobenius norm check.
      VERIFY_IS_APPROX((m.template selfadjointView<Lower>() * eiDirect.eigenvectors()) / scaling,
                       (eiDirect.eigenvectors() * eiDirect.eigenvalues().asDiagonal()) / scaling);
      VERIFY_IS_APPROX(m.template selfadjointView<Lower>().eigenvalues() / scaling, eiDirect.eigenvalues() / scaling);
    }

-    VERIFY(eiDirect.eigenvectors().isUnitary(test_precision<RealScalar>() * unitary_error_factor));
+    // Direct solver eigenvectors must also be unitary.
+    VERIFY(eiDirect.eigenvectors().isUnitary(unitary_tol));
+
+    // Direct solver eigenvalues must also be sorted.
+    for (Index i = 1; i < n; ++i) {
+      VERIFY(eiDirect.eigenvalues()(i) >= eiDirect.eigenvalues()(i - 1));
+    }
  }
 }

@@ -149,9 +180,15 @@ void selfadjointeigensolver(const MatrixType& m) {
  VERIFY_IS_APPROX(tridiag.diagonal(), tridiag.matrixT().diagonal());
  VERIFY_IS_APPROX(tridiag.subDiagonal(), tridiag.matrixT().template diagonal<-1>());
  Matrix<RealScalar, Dynamic, Dynamic> T = tridiag.matrixT();
-  if (rows > 1 && cols > 1) {
-    // FIXME check that upper and lower part are 0:
-    // VERIFY(T.topRightCorner(rows-2, cols-2).template triangularView<Upper>().isZero());
+  if (rows > 2) {
+    // Verify that the tridiagonal matrix is actually tridiagonal (zero outside the three central diagonals).
+    for (Index i = 0; i < rows; ++i) {
+      for (Index j = 0; j < cols; ++j) {
+        if (numext::abs(i - j) > 1) {
+          VERIFY(numext::is_exactly_zero(T(i, j)));
+        }
+      }
+    }
  }
  VERIFY_IS_APPROX(tridiag.diagonal(), T.diagonal());
  VERIFY_IS_APPROX(tridiag.subDiagonal(), T.template diagonal<1>());
@@ -170,7 +207,7 @@ void selfadjointeigensolver(const MatrixType& m) {
                                            eiSymmTridiag.eigenvectors().real().transpose());
  }

-  // Test matrix expponential from eigendecomposition.
+  // Test matrix exponential from eigendecomposition.
  // First scale to avoid overflow.
  symmB = symmB / symmB.norm();
  eiSymm.compute(symmB);
@@ -202,6 +239,451 @@ void selfadjointeigensolver(const MatrixType& m) {
  }
 }

+// Test matrices with exact eigenvalue multiplicities.
+template <typename MatrixType>
+void selfadjointeigensolver_repeated_eigenvalues(const MatrixType& m) {
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  Index n = m.rows();
+  if (n < 2) return;
+
+  // Create a random unitary matrix via QR.
+  MatrixType q = MatrixType::Random(n, n);
+  HouseholderQR<MatrixType> qr(q);
+  q = qr.householderQ();
+
+  // All eigenvalues equal (scalar multiple of identity).
+  {
+    RealScalar lambda = internal::random<RealScalar>(-10, 10);
+    MatrixType A = lambda * MatrixType::Identity(n, n);
+    selfadjointeigensolver_essential_check(A);
+  }
+
+  // Eigenvalue of multiplicity n-1 (one distinct, rest equal).
+  {
+    Matrix<RealScalar, Dynamic, 1> d = Matrix<RealScalar, Dynamic, 1>::Constant(n, RealScalar(3));
+    d(0) = RealScalar(-2);
+    MatrixType A = (q * d.template cast<Scalar>().asDiagonal() * q.adjoint()).eval();
+    A.template triangularView<StrictlyUpper>().setZero();
+    selfadjointeigensolver_essential_check(A);
+  }
+
+  // Two clusters: first half one value, second half another.
+  if (n >= 4) {
+    Matrix<RealScalar, Dynamic, 1> d(n);
+    for (Index i = 0; i < n / 2; ++i) d(i) = RealScalar(1);
+    for (Index i = n / 2; i < n; ++i) d(i) = RealScalar(5);
+    MatrixType A = (q * d.template cast<Scalar>().asDiagonal() * q.adjoint()).eval();
+    A.template triangularView<StrictlyUpper>().setZero();
+    selfadjointeigensolver_essential_check(A);
+  }
+
+  // Nearly repeated eigenvalues: separated by O(epsilon).
+  {
+    Matrix<RealScalar, Dynamic, 1> d(n);
+    for (Index i = 0; i < n; ++i) {
+      d(i) = RealScalar(1) + RealScalar(i) * NumTraits<RealScalar>::epsilon() * RealScalar(10);
+    }
+    MatrixType A = (q * d.template cast<Scalar>().asDiagonal() * q.adjoint()).eval();
+    A.template triangularView<StrictlyUpper>().setZero();
+    selfadjointeigensolver_essential_check(A);
+  }
+}
+
+// Test matrices with extreme condition numbers and eigenvalue ranges.
+template <typename MatrixType>
+void selfadjointeigensolver_extreme_eigenvalues(const MatrixType& m) {
+  using std::pow;
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  Index n = m.rows();
+  if (n < 2) return;
+
+  // Create a random unitary matrix.
+  MatrixType q = MatrixType::Random(n, n);
+  HouseholderQR<MatrixType> qr(q);
+  q = qr.householderQ();
+
+  // Eigenvalues spanning many orders of magnitude (high condition number).
+  {
+    RealScalar maxExp = RealScalar(std::numeric_limits<RealScalar>::max_exponent10) / RealScalar(4);
+    Matrix<RealScalar, Dynamic, 1> d(n);
+    for (Index i = 0; i < n; ++i) {
+      RealScalar exponent = -maxExp + RealScalar(2) * maxExp * RealScalar(i) / RealScalar(n - 1);
+      d(i) = pow(RealScalar(10), exponent);
+    }
+    MatrixType A = (q * d.template cast<Scalar>().asDiagonal() * q.adjoint()).eval();
+    A.template triangularView<StrictlyUpper>().setZero();
+
+    SelfAdjointEigenSolver<MatrixType> eig(A);
+    VERIFY_IS_EQUAL(eig.info(), Success);
+    // For ill-conditioned matrices we can only check the relative residual.
+    // ||A*V - V*D|| / ||A|| should be O(n * epsilon).
+    RealScalar Anorm = A.template selfadjointView<Lower>().operatorNorm();
+    if (Anorm > (std::numeric_limits<RealScalar>::min)()) {
+      MatrixType residual = A.template selfadjointView<Lower>() * eig.eigenvectors() -
+                            eig.eigenvectors() * eig.eigenvalues().asDiagonal();
+      RealScalar rel_err = residual.norm() / Anorm;
+      RealScalar tol = RealScalar(4) * RealScalar(n) * NumTraits<RealScalar>::epsilon();
+      VERIFY(rel_err <= tol);
+    }
+    // Eigenvalues must still be sorted.
+    for (Index i = 1; i < n; ++i) {
+      VERIFY(eig.eigenvalues()(i) >= eig.eigenvalues()(i - 1));
+    }
+  }
+
+  // Very tiny eigenvalues (near underflow).
+  {
+    RealScalar tiny = (std::numeric_limits<RealScalar>::min)() * RealScalar(100);
+    Matrix<RealScalar, Dynamic, 1> d(n);
+    for (Index i = 0; i < n; ++i) {
+      d(i) = tiny * (RealScalar(1) + RealScalar(i));
+    }
+    MatrixType A = (q * d.template cast<Scalar>().asDiagonal() * q.adjoint()).eval();
+    A.template triangularView<StrictlyUpper>().setZero();
+    selfadjointeigensolver_essential_check(A);
+  }
+
+  // Very large eigenvalues (near overflow).
+  {
+    RealScalar huge = (std::numeric_limits<RealScalar>::max)() / (RealScalar(n) * RealScalar(100));
+    Matrix<RealScalar, Dynamic, 1> d(n);
+    for (Index i = 0; i < n; ++i) {
+      d(i) = huge * (RealScalar(1) + RealScalar(i) * RealScalar(0.01));
+    }
+    MatrixType A = (q * d.template cast<Scalar>().asDiagonal() * q.adjoint()).eval();
+    A.template triangularView<StrictlyUpper>().setZero();
+    selfadjointeigensolver_essential_check(A);
+  }
+
+  // Mix of positive and negative eigenvalues.
+  {
+    Matrix<RealScalar, Dynamic, 1> d(n);
+    for (Index i = 0; i < n; ++i) {
+      d(i) = (i % 2 == 0) ? RealScalar(i + 1) : RealScalar(-(i + 1));
+    }
+    MatrixType A = (q * d.template cast<Scalar>().asDiagonal() * q.adjoint()).eval();
+    A.template triangularView<StrictlyUpper>().setZero();
+    selfadjointeigensolver_essential_check(A);
+  }
+
+  // One zero eigenvalue among non-zero ones (rank-deficient).
+  {
+    Matrix<RealScalar, Dynamic, 1> d = Matrix<RealScalar, Dynamic, 1>::LinSpaced(n, RealScalar(0), RealScalar(n - 1));
+    MatrixType A = (q * d.template cast<Scalar>().asDiagonal() * q.adjoint()).eval();
+    A.template triangularView<StrictlyUpper>().setZero();
+    selfadjointeigensolver_essential_check(A);
+  }
+}
+
+// Test computeFromTridiagonal with scaled inputs (regression for missing scaling).
+template <typename MatrixType>
+void selfadjointeigensolver_tridiagonal_scaled(const MatrixType& m) {
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  Index n = m.rows();
+  if (n < 2) return;
+
+  // Create a tridiagonal matrix with large entries.
+  typedef Matrix<RealScalar, Dynamic, 1> RealVectorType;
+  RealVectorType diag(n), subdiag(n - 1);
+
+  // Case 1: Large values.
+  RealScalar scale = (std::numeric_limits<RealScalar>::max)() / (RealScalar(n) * RealScalar(100));
+  for (Index i = 0; i < n; ++i) diag(i) = scale * RealScalar(i + 1);
+  for (Index i = 0; i < n - 1; ++i) subdiag(i) = scale * RealScalar(0.5);
+
+  SelfAdjointEigenSolver<MatrixType> eig1;
+  eig1.computeFromTridiagonal(diag, subdiag, ComputeEigenvectors);
+  VERIFY_IS_EQUAL(eig1.info(), Success);
+
+  // Reconstruct tridiagonal and check residual.
+  Matrix<RealScalar, Dynamic, Dynamic> T = Matrix<RealScalar, Dynamic, Dynamic>::Zero(n, n);
+  T.diagonal() = diag;
+  T.template diagonal<1>() = subdiag;
+  T.template diagonal<-1>() = subdiag;
+  VERIFY_IS_APPROX(
+      T, eig1.eigenvectors().real() * eig1.eigenvalues().asDiagonal() * eig1.eigenvectors().real().transpose());
+
+  // Case 2: Tiny values.
+  scale = (std::numeric_limits<RealScalar>::min)() * RealScalar(100);
+  for (Index i = 0; i < n; ++i) diag(i) = scale * RealScalar(i + 1);
+  for (Index i = 0; i < n - 1; ++i) subdiag(i) = scale * RealScalar(0.5);
+
+  SelfAdjointEigenSolver<MatrixType> eig2;
+  eig2.computeFromTridiagonal(diag, subdiag, ComputeEigenvectors);
+  VERIFY_IS_EQUAL(eig2.info(), Success);
+
+  // Eigenvalues-only mode should produce the same eigenvalues.
+  SelfAdjointEigenSolver<MatrixType> eig2v;
+  eig2v.computeFromTridiagonal(diag, subdiag, EigenvaluesOnly);
+  VERIFY_IS_EQUAL(eig2v.info(), Success);
+  VERIFY_IS_APPROX(eig2.eigenvalues(), eig2v.eigenvalues());
+}
+
+// Test computeFromTridiagonal with wide dynamic range across decoupled blocks.
+// This exercises the per-block scaling in computeFromTridiagonal_impl: a zero on the
+// subdiagonal decouples the matrix into blocks with vastly different scales. Global
+// scaling would underflow the small block; per-block scaling handles both correctly.
+template <typename RealScalar>
+void selfadjointeigensolver_tridiagonal_wide_range() {
+  using std::sqrt;
+  typedef Matrix<RealScalar, Dynamic, Dynamic> MatrixType;
+  typedef Matrix<RealScalar, Dynamic, 1> VectorType;
+
+  // Block 1: entries near overflow threshold.
+  // Block 2: entries near 1.
+  // Separated by a zero subdiagonal entry.
+  const RealScalar big = sqrt(NumTraits<RealScalar>::highest()) / RealScalar(10);
+  const Index n = 6;
+  VectorType diag(n), subdiag(n - 1);
+
+  // First block: [0..2], large scale.
+  diag(0) = big;
+  diag(1) = big * RealScalar(1.1);
+  diag(2) = big * RealScalar(0.9);
+  subdiag(0) = big * RealScalar(0.01);
+  subdiag(1) = big * RealScalar(0.02);
+  // Zero subdiagonal decouples the two blocks.
+  subdiag(2) = RealScalar(0);
+  // Second block: [3..5], O(1) scale.
+  diag(3) = RealScalar(1);
+  diag(4) = RealScalar(2);
+  diag(5) = RealScalar(3);
+  subdiag(3) = RealScalar(0.5);
+  subdiag(4) = RealScalar(0.3);
+
+  // Build the full tridiagonal matrix for residual checking.
+  MatrixType T = MatrixType::Zero(n, n);
+  T.diagonal() = diag;
+  T.template diagonal<1>() = subdiag;
+  T.template diagonal<-1>() = subdiag;
+
+  SelfAdjointEigenSolver<MatrixType> eig;
+  eig.computeFromTridiagonal(diag, subdiag, ComputeEigenvectors);
+  VERIFY_IS_EQUAL(eig.info(), Success);
+
+  // Eigenvalues must be sorted.
+  for (Index i = 1; i < n; ++i) {
+    VERIFY(eig.eigenvalues()(i) >= eig.eigenvalues()(i - 1));
+  }
+
+  // Eigenvectors must be orthonormal.
+  RealScalar unitary_tol = RealScalar(4) * RealScalar(n) * NumTraits<RealScalar>::epsilon();
+  VERIFY(eig.eigenvectors().isUnitary(unitary_tol));
+
+  // Full residual check in scaled coordinates.
+  RealScalar Tnorm = T.cwiseAbs().maxCoeff();
+  MatrixType Tscaled = T / Tnorm;
+  MatrixType residual = Tscaled * eig.eigenvectors() - eig.eigenvectors() * (eig.eigenvalues() / Tnorm).asDiagonal();
+  RealScalar rel_err = residual.norm() / Tscaled.norm();
+  VERIFY(rel_err <= RealScalar(8) * RealScalar(n) * NumTraits<RealScalar>::epsilon());
+
+  // The small eigenvalues (~1,2,3) must be accurate, not lost to underflow.
+  // With global scaling to [-1,1], dividing by 'big' would underflow these to zero.
+  // Verify the small eigenvalues are within O(eps) of their true values.
+  // The small block is exactly [[1, 0.5, 0], [0.5, 2, 0.3], [0, 0.3, 3]].
+  MatrixType T_small(3, 3);
+  T_small << RealScalar(1), RealScalar(0.5), RealScalar(0), RealScalar(0.5), RealScalar(2), RealScalar(0.3),
+      RealScalar(0), RealScalar(0.3), RealScalar(3);
+  SelfAdjointEigenSolver<MatrixType> eig_small(T_small);
+  VectorType small_evals = eig_small.eigenvalues();
+
+  // Find the 3 smallest eigenvalues from the combined solver (they should be sorted first).
+  VectorType combined_small = eig.eigenvalues().head(3);
+  VERIFY_IS_APPROX(combined_small, small_evals);
+
+  // Eigenvalues-only mode must agree.
+  SelfAdjointEigenSolver<MatrixType> eig_vals;
+  eig_vals.computeFromTridiagonal(diag, subdiag, EigenvaluesOnly);
+  VERIFY_IS_EQUAL(eig_vals.info(), Success);
+  VERIFY_IS_APPROX(eig.eigenvalues() / Tnorm, eig_vals.eigenvalues() / Tnorm);
+}
+
+// Test computeFromTridiagonal with structured hard-case matrices from the literature.
+template <typename RealScalar>
+void selfadjointeigensolver_structured_tridiagonal() {
+  typedef Matrix<RealScalar, Dynamic, Dynamic> MatrixType;
+
+  test::for_all_symmetric_tridiag_test_matrices<RealScalar>([](const auto& diag, const auto& offdiag) {
+    Index n = diag.size();
+
+    // Build the full symmetric tridiagonal matrix for residual checking.
+    MatrixType T = MatrixType::Zero(n, n);
+    T.diagonal() = diag;
+    if (n > 1) {
+      T.template diagonal<1>() = offdiag;
+      T.template diagonal<-1>() = offdiag;
+    }
+    RealScalar Tnorm = T.cwiseAbs().maxCoeff();
+
+    // Test with eigenvectors.
+    SelfAdjointEigenSolver<MatrixType> eig;
+    eig.computeFromTridiagonal(diag, offdiag, ComputeEigenvectors);
+    VERIFY_IS_EQUAL(eig.info(), Success);
+
+    // Eigenvalues must be sorted.
+    for (Index i = 1; i < n; ++i) {
+      VERIFY(eig.eigenvalues()(i) >= eig.eigenvalues()(i - 1));
+    }
+
+    // Eigenvectors must be orthonormal.
+    RealScalar unitary_tol =
+        numext::maxi(RealScalar(4) * RealScalar(n) * NumTraits<RealScalar>::epsilon(), test_precision<RealScalar>());
+    VERIFY(eig.eigenvectors().isUnitary(unitary_tol));
+
+    // Residual check: ||T*V - V*D||_F / ||T||_max should be O(n*eps).
+    // Scale T to avoid overflow in the matrix product when entries span extreme ranges.
+    if (Tnorm > (std::numeric_limits<RealScalar>::min)()) {
+      MatrixType Tscaled = T / Tnorm;
+      MatrixType residual =
+          Tscaled * eig.eigenvectors() - eig.eigenvectors() * (eig.eigenvalues() / Tnorm).asDiagonal();
+      RealScalar rel_err = residual.norm() / Tscaled.norm();
+      VERIFY(rel_err <= RealScalar(8) * RealScalar(n) * NumTraits<RealScalar>::epsilon());
+    }
+
+    // Eigenvalues-only mode must produce the same eigenvalues.
+    SelfAdjointEigenSolver<MatrixType> eig_vals;
+    eig_vals.computeFromTridiagonal(diag, offdiag, EigenvaluesOnly);
+    VERIFY_IS_EQUAL(eig_vals.info(), Success);
+    if (Tnorm > (std::numeric_limits<RealScalar>::min)()) {
+      VERIFY_IS_APPROX(eig.eigenvalues() / Tnorm, eig_vals.eigenvalues() / Tnorm);
+    }
+  });
+}
+
+// Test with diagonal matrices (tridiagonalization is trivial).
+template <typename MatrixType>
+void selfadjointeigensolver_diagonal(const MatrixType& m) {
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  Index n = m.rows();
+
+  // Random diagonal matrix.
+  MatrixType diag = MatrixType::Zero(n, n);
+  for (Index i = 0; i < n; ++i) {
+    diag(i, i) = internal::random<RealScalar>(-100, 100);
+  }
+  selfadjointeigensolver_essential_check(diag);
+
+  // The eigenvalues should be the diagonal entries, sorted.
+  SelfAdjointEigenSolver<MatrixType> eig(diag);
+  VERIFY_IS_EQUAL(eig.info(), Success);
+
+  Matrix<RealScalar, Dynamic, 1> expected_evals(n);
+  for (Index i = 0; i < n; ++i) expected_evals(i) = numext::real(diag(i, i));
+  std::sort(expected_evals.data(), expected_evals.data() + n);
+  VERIFY_IS_APPROX(eig.eigenvalues(), expected_evals);
+}
+
+// Test operatorInverseSqrt more thoroughly.
+template <typename MatrixType>
+void selfadjointeigensolver_inverse_sqrt(const MatrixType& m) {
+  Index n = m.rows();
+  if (n < 1) return;
+
+  // Create a positive-definite matrix.
+  MatrixType a = MatrixType::Random(n, n);
+  MatrixType spd = a.adjoint() * a + MatrixType::Identity(n, n);
+  spd.template triangularView<StrictlyUpper>().setZero();
+
+  SelfAdjointEigenSolver<MatrixType> eig(spd);
+  VERIFY_IS_EQUAL(eig.info(), Success);
+
+  MatrixType sqrtA = eig.operatorSqrt();
+  MatrixType invSqrtA = eig.operatorInverseSqrt();
+
+  // sqrtA * invSqrtA should be identity.
+  VERIFY_IS_APPROX(sqrtA * invSqrtA, MatrixType::Identity(n, n));
+
+  // invSqrtA * A * invSqrtA should be identity.
+  VERIFY_IS_APPROX(invSqrtA * spd.template selfadjointView<Lower>() * invSqrtA, MatrixType::Identity(n, n));
+
+  // invSqrtA should be symmetric/selfadjoint.
+  VERIFY_IS_APPROX(invSqrtA, invSqrtA.adjoint());
+}
+
+// Test that RowMajor matrices work correctly with computeDirect.
+template <int>
+void selfadjointeigensolver_rowmajor() {
+  typedef Matrix<double, 3, 3, RowMajor> RowMajorMatrix3d;
+  typedef Matrix<double, 2, 2, RowMajor> RowMajorMatrix2d;
+  typedef Matrix<float, 3, 3, RowMajor> RowMajorMatrix3f;
+  typedef Matrix<float, 2, 2, RowMajor> RowMajorMatrix2f;
+
+  // 3x3 RowMajor double
+  {
+    RowMajorMatrix3d a = RowMajorMatrix3d::Random();
+    RowMajorMatrix3d symmA = a.transpose() * a;
+    SelfAdjointEigenSolver<RowMajorMatrix3d> eig;
+    eig.computeDirect(symmA);
+    VERIFY_IS_EQUAL(eig.info(), Success);
+    // Compare with iterative solver.
+    SelfAdjointEigenSolver<RowMajorMatrix3d> eigRef(symmA);
+    VERIFY_IS_APPROX(eigRef.eigenvalues(), eig.eigenvalues());
+  }
+
+  // 2x2 RowMajor double
+  {
+    RowMajorMatrix2d a = RowMajorMatrix2d::Random();
+    RowMajorMatrix2d symmA = a.transpose() * a;
+    SelfAdjointEigenSolver<RowMajorMatrix2d> eig;
+    eig.computeDirect(symmA);
+    VERIFY_IS_EQUAL(eig.info(), Success);
+    SelfAdjointEigenSolver<RowMajorMatrix2d> eigRef(symmA);
+    VERIFY_IS_APPROX(eigRef.eigenvalues(), eig.eigenvalues());
+  }
+
+  // 3x3 RowMajor float
+  {
+    RowMajorMatrix3f a = RowMajorMatrix3f::Random();
+    RowMajorMatrix3f symmA = a.transpose() * a;
+    SelfAdjointEigenSolver<RowMajorMatrix3f> eig;
+    eig.computeDirect(symmA);
+    VERIFY_IS_EQUAL(eig.info(), Success);
+    SelfAdjointEigenSolver<RowMajorMatrix3f> eigRef(symmA);
+    VERIFY_IS_APPROX(eigRef.eigenvalues(), eig.eigenvalues());
+  }
+
+  // 2x2 RowMajor float
+  {
+    RowMajorMatrix2f a = RowMajorMatrix2f::Random();
+    RowMajorMatrix2f symmA = a.transpose() * a;
+    SelfAdjointEigenSolver<RowMajorMatrix2f> eig;
+    eig.computeDirect(symmA);
+    VERIFY_IS_EQUAL(eig.info(), Success);
+    SelfAdjointEigenSolver<RowMajorMatrix2f> eigRef(symmA);
+    VERIFY_IS_APPROX(eigRef.eigenvalues(), eig.eigenvalues());
+  }
+
+  // Dynamic RowMajor with iterative solver
+  {
+    typedef Matrix<double, Dynamic, Dynamic, RowMajor> RowMajorMatrixXd;
+    int s = internal::random<int>(2, 20);
+    RowMajorMatrixXd a = RowMajorMatrixXd::Random(s, s);
+    RowMajorMatrixXd symmA = a.transpose() * a;
+    SelfAdjointEigenSolver<RowMajorMatrixXd> eig(symmA);
+    VERIFY_IS_EQUAL(eig.info(), Success);
+    double scaling = symmA.cwiseAbs().maxCoeff();
+    if (scaling > (std::numeric_limits<double>::min)()) {
+      VERIFY_IS_APPROX((symmA.template selfadjointView<Lower>() * eig.eigenvectors()) / scaling,
+                       (eig.eigenvectors() * eig.eigenvalues().asDiagonal()) / scaling);
+    }
+  }
+}
+
+// Test matrix with Inf entries returns NoConvergence (similar to NaN test).
+template <int>
+void selfadjointeigensolver_inf() {
+  Matrix3d m;
+  m.setRandom();
+  m = m * m.transpose();
+  m(1, 1) = std::numeric_limits<double>::infinity();
+  SelfAdjointEigenSolver<Matrix3d> eig(m);
+  VERIFY_IS_EQUAL(eig.info(), NoConvergence);
+}
+
 template <int>
 void bug_854() {
  Matrix3d m;
@@ -227,11 +709,160 @@ void bug_1225() {
  VERIFY_IS_APPROX(eig1.eigenvalues(), eig2.eigenvalues());
 }

+// Verify that non-finite inputs are detected for all sizes, including 1x1.
+template <int>
+void selfadjointeigensolver_nonfinite() {
+  const double inf = std::numeric_limits<double>::infinity();
+  const double nan = std::numeric_limits<double>::quiet_NaN();
+
+  // 1x1 Inf.
+  {
+    Matrix<double, 1, 1> m;
+    m << inf;
+    SelfAdjointEigenSolver<Matrix<double, 1, 1>> eig(m);
+    VERIFY_IS_EQUAL(eig.info(), NoConvergence);
+  }
+  // 1x1 NaN.
+  {
+    Matrix<double, 1, 1> m;
+    m << nan;
+    SelfAdjointEigenSolver<Matrix<double, 1, 1>> eig(m);
+    VERIFY_IS_EQUAL(eig.info(), NoConvergence);
+  }
+  // 1x1 -Inf.
+  {
+    Matrix<double, 1, 1> m;
+    m << -inf;
+    SelfAdjointEigenSolver<Matrix<double, 1, 1>> eig(m);
+    VERIFY_IS_EQUAL(eig.info(), NoConvergence);
+  }
+  // 3x3 with Inf.
+  {
+    Matrix3d m = Matrix3d::Identity();
+    m(1, 1) = inf;
+    SelfAdjointEigenSolver<Matrix3d> eig(m);
+    VERIFY_IS_EQUAL(eig.info(), NoConvergence);
+  }
+  // 3x3 with NaN.
+  {
+    Matrix3d m = Matrix3d::Identity();
+    m(0, 1) = m(1, 0) = nan;
+    SelfAdjointEigenSolver<Matrix3d> eig(m);
+    VERIFY_IS_EQUAL(eig.info(), NoConvergence);
+  }
+  // Dynamic size with Inf.
+  {
+    MatrixXd m = MatrixXd::Identity(5, 5);
+    m(3, 3) = inf;
+    SelfAdjointEigenSolver<MatrixXd> eig(m);
+    VERIFY_IS_EQUAL(eig.info(), NoConvergence);
+  }
+}
+
 template <int>
 void bug_1204() {
  SparseMatrix<double> A(2, 2);
  A.setIdentity();
-  SelfAdjointEigenSolver<Eigen::SparseMatrix<double> > eig(A);
+  SelfAdjointEigenSolver<Eigen::SparseMatrix<double>> eig(A);
+}
+
+template <int>
+void selfadjointeigensolver_tridiagonal_zerosized() {
+  SelfAdjointEigenSolver<MatrixXd> eig;
+  VectorXd diag(0), subdiag(0);
+
+  eig.computeFromTridiagonal(diag, subdiag, EigenvaluesOnly);
+  VERIFY_IS_EQUAL(eig.info(), Success);
+  VERIFY_IS_EQUAL(eig.eigenvalues().size(), 0);
+  VERIFY_RAISES_ASSERT(eig.eigenvectors());
+
+  eig.computeFromTridiagonal(diag, subdiag, ComputeEigenvectors);
+  VERIFY_IS_EQUAL(eig.info(), Success);
+  VERIFY_IS_EQUAL(eig.eigenvalues().size(), 0);
+  VERIFY_IS_EQUAL(eig.eigenvectors().rows(), 0);
+  VERIFY_IS_EQUAL(eig.eigenvectors().cols(), 0);
+}
+
+// Specific 3x3 test cases that stress the direct solver.
+template <int>
+void direct_3x3_stress() {
+  // Near-planar point cloud covariance: two large eigenvalues, one near-zero.
+  {
+    Matrix3d m;
+    m << 100, 50, 0.001, 50, 100, 0.002, 0.001, 0.002, 1e-10;
+    selfadjointeigensolver_essential_check(m);
+  }
+
+  // All equal diagonal entries (triple eigenvalue).
+  {
+    Matrix3d m = Matrix3d::Identity() * 7.0;
+    selfadjointeigensolver_essential_check(m);
+  }
+
+  // Two exactly equal eigenvalues (from explicit construction).
+  {
+    Matrix3d q;
+    q << 1, 0, 0, 0, 1.0 / std::sqrt(2.0), 1.0 / std::sqrt(2.0), 0, -1.0 / std::sqrt(2.0), 1.0 / std::sqrt(2.0);
+    Vector3d d(1.0, 5.0, 5.0);
+    Matrix3d m = q * d.asDiagonal() * q.transpose();
+    selfadjointeigensolver_essential_check(m);
+  }
+
+  // Large off-diagonal relative to diagonal.
+  {
+    Matrix3d m;
+    m << 1, 1000, 1000, 1000, 1, 1000, 1000, 1000, 1;
+    selfadjointeigensolver_essential_check(m);
+  }
+
+  // Nearly singular: one eigenvalue much smaller than others.
+  {
+    Matrix3d m;
+    m << 1, 0.5, 0.3, 0.5, 1, 0.4, 0.3, 0.4, 1;
+    m *= 1e15;
+    Matrix3d perturbation = Matrix3d::Zero();
+    perturbation(0, 0) = 1e-15;
+    m += perturbation;
+    selfadjointeigensolver_essential_check(m);
+  }
+}
+
+// Specific 2x2 test cases that stress the direct solver.
+template <int>
+void direct_2x2_stress() {
+  // Equal eigenvalues.
+  {
+    Matrix2d m = Matrix2d::Identity() * 42.0;
+    selfadjointeigensolver_essential_check(m);
+  }
+
+  // Very small off-diagonal.
+  {
+    Matrix2d m;
+    m << 1.0, 1e-15, 1e-15, 1.0;
+    selfadjointeigensolver_essential_check(m);
+  }
+
+  // Huge ratio between diagonal entries.
+  {
+    Matrix2d m;
+    m << 1e100, 0, 0, 1e-100;
+    selfadjointeigensolver_essential_check(m);
+  }
+
+  // Anti-diagonal dominant.
+  {
+    Matrix2d m;
+    m << 0, 1e10, 1e10, 0;
+    selfadjointeigensolver_essential_check(m);
+  }
+
+  // Negative entries.
+  {
+    Matrix2d m;
+    m << -5.0, 3.0, 3.0, -5.0;
+    selfadjointeigensolver_essential_check(m);
+  }
 }

 EIGEN_DECLARE_TEST(eigensolver_selfadjoint) {
@@ -266,12 +897,62 @@ EIGEN_DECLARE_TEST(eigensolver_selfadjoint) {
    CALL_SUBTEST_5(selfadjointeigensolver(MatrixXcd(2, 2)));
    CALL_SUBTEST_6(selfadjointeigensolver(Matrix<double, 1, 1>()));
    CALL_SUBTEST_7(selfadjointeigensolver(Matrix<double, 2, 2>()));
+
+    // repeated eigenvalues
+    CALL_SUBTEST_17(selfadjointeigensolver_repeated_eigenvalues(Matrix3d()));
+    CALL_SUBTEST_15(selfadjointeigensolver_repeated_eigenvalues(Matrix2d()));
+    CALL_SUBTEST_2(selfadjointeigensolver_repeated_eigenvalues(Matrix4d()));
+    CALL_SUBTEST_4(selfadjointeigensolver_repeated_eigenvalues(MatrixXd(s, s)));
+    CALL_SUBTEST_13(selfadjointeigensolver_repeated_eigenvalues(Matrix3f()));
+    CALL_SUBTEST_12(selfadjointeigensolver_repeated_eigenvalues(Matrix2f()));
+    CALL_SUBTEST_18(selfadjointeigensolver_repeated_eigenvalues(Matrix3cd()));
+
+    // extreme eigenvalues (near overflow/underflow, high condition number)
+    CALL_SUBTEST_17(selfadjointeigensolver_extreme_eigenvalues(Matrix3d()));
+    CALL_SUBTEST_2(selfadjointeigensolver_extreme_eigenvalues(Matrix4d()));
+    CALL_SUBTEST_4(selfadjointeigensolver_extreme_eigenvalues(MatrixXd(s, s)));
+    CALL_SUBTEST_13(selfadjointeigensolver_extreme_eigenvalues(Matrix3f()));
+    CALL_SUBTEST_3(selfadjointeigensolver_extreme_eigenvalues(MatrixXf(s, s)));
+
+    // computeFromTridiagonal with scaled inputs
+    CALL_SUBTEST_4(selfadjointeigensolver_tridiagonal_scaled(MatrixXd(s, s)));
+    CALL_SUBTEST_3(selfadjointeigensolver_tridiagonal_scaled(MatrixXf(s, s)));
+
+    // structured tridiagonal hard cases from the literature
+    CALL_SUBTEST_4(selfadjointeigensolver_structured_tridiagonal<double>());
+    CALL_SUBTEST_3(selfadjointeigensolver_structured_tridiagonal<float>());
+
+    // wide dynamic range tridiagonal (per-block scaling regression)
+    CALL_SUBTEST_4(selfadjointeigensolver_tridiagonal_wide_range<double>());
+    CALL_SUBTEST_3(selfadjointeigensolver_tridiagonal_wide_range<float>());
+
+    // diagonal matrices
+    CALL_SUBTEST_17(selfadjointeigensolver_diagonal(Matrix3d()));
+    CALL_SUBTEST_4(selfadjointeigensolver_diagonal(MatrixXd(s, s)));
+
+    // operatorInverseSqrt
+    CALL_SUBTEST_17(selfadjointeigensolver_inverse_sqrt(Matrix3d()));
+    CALL_SUBTEST_2(selfadjointeigensolver_inverse_sqrt(Matrix4d()));
+    CALL_SUBTEST_4(selfadjointeigensolver_inverse_sqrt(MatrixXd(s, s)));
+    CALL_SUBTEST_13(selfadjointeigensolver_inverse_sqrt(Matrix3f()));
+
+    // RowMajor
+    CALL_SUBTEST_19(selfadjointeigensolver_rowmajor<0>());
  }

  CALL_SUBTEST_17(bug_854<0>());
  CALL_SUBTEST_17(bug_1014<0>());
  CALL_SUBTEST_17(bug_1204<0>());
  CALL_SUBTEST_17(bug_1225<0>());
+  CALL_SUBTEST_17(selfadjointeigensolver_nonfinite<0>());
+  CALL_SUBTEST_8(selfadjointeigensolver_tridiagonal_zerosized<0>());
+
+  // Stress tests for direct 3x3 and 2x2 solvers.
+  CALL_SUBTEST_17(direct_3x3_stress<0>());
+  CALL_SUBTEST_15(direct_2x2_stress<0>());
+
+  // Test Inf input handling.
+  CALL_SUBTEST_17(selfadjointeigensolver_inf<0>());

  // Test problem size constructors
  s = internal::random<int>(1, EIGEN_TEST_MAX_SIZE / 4);
--- a/test/gpu_basic.cu
+++ b/test/gpu_basic.cu
@@ -7,12 +7,6 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

-// workaround issue between gcc >= 4.7 and cuda 5.5
-#if (defined __GNUC__) && (__GNUC__ > 4 || __GNUC_MINOR__ >= 7)
-#undef _GLIBCXX_ATOMIC_BUILTINS
-#undef _GLIBCXX_USE_INT128
-#endif
-
 #define EIGEN_TEST_NO_LONGDOUBLE
 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int

--- a/test/gpu_test_helper.h
+++ b/test/gpu_test_helper.h
@@ -6,10 +6,8 @@
 // Allow gpu** macros for generic tests.
 #include <Eigen/src/Core/util/GpuHipCudaDefines.inc>

-// std::tuple cannot be used on device, and there is a bug in cuda < 9.2 that
-// doesn't allow std::tuple to compile for host code either. In these cases,
-// use our custom implementation.
-#if defined(EIGEN_GPU_COMPILE_PHASE) || (defined(EIGEN_CUDACC) && EIGEN_CUDA_SDK_VER < 92000)
+// std::tuple cannot be used on device, so use our custom implementation there.
+#if defined(EIGEN_GPU_COMPILE_PHASE)
 #define EIGEN_USE_CUSTOM_TUPLE 1
 #else
 #define EIGEN_USE_CUSTOM_TUPLE 0
@@ -42,6 +40,12 @@ using tuple_impl::tuple;
 #undef EIGEN_USE_CUSTOM_TUPLE
 }  // namespace test_detail

+template <typename T>
+using decay_t = typename std::decay<T>::type;
+
+template <typename Func, typename... Args>
+using kernel_result_t = decltype(std::declval<Func>()(std::declval<Args>()...));
+
 template <size_t N, size_t Idx, typename OutputIndexSequence, typename... Ts>
 struct extract_output_indices_helper;

@@ -90,14 +94,15 @@ struct void_helper {
  // Non-void return value.
  template <typename Func, typename... Args>
  static EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC auto call(Func&& func, Args&&... args)
-      -> std::enable_if_t<!std::is_same<decltype(func(args...)), void>::value, decltype(func(args...))> {
+      -> std::enable_if_t<!std::is_same<kernel_result_t<Func&&, Args&&...>, void>::value,
+                          kernel_result_t<Func&&, Args&&...>> {
    return func(std::forward<Args>(args)...);
  }

  // Void return value.
  template <typename Func, typename... Args>
  static EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC auto call(Func&& func, Args&&... args)
-      -> std::enable_if_t<std::is_same<decltype(func(args...)), void>::value, Void> {
+      -> std::enable_if_t<std::is_same<kernel_result_t<Func&&, Args&&...>, void>::value, Void> {
    func(std::forward<Args>(args)...);
    return Void{};
  }
@@ -135,18 +140,18 @@ EIGEN_DEVICE_FUNC void run_serialized(std::index_sequence<Indices...>, std::inde
  const uint8_t* read_end = buffer + capacity;
  read_ptr = Eigen::deserialize(read_ptr, read_end, input_size);
  // Create value-type instances to populate.
-  auto args = make_tuple(typename std::decay<Args>::type{}...);
+  auto args = make_tuple(decay_t<Args>{}...);
  EIGEN_UNUSED_VARIABLE(args);  // Avoid NVCC compile warning.
  // NVCC 9.1 requires us to spell out the template parameters explicitly.
-  read_ptr = Eigen::deserialize(read_ptr, read_end, get<Indices, typename std::decay<Args>::type...>(args)...);
+  read_ptr = Eigen::deserialize(read_ptr, read_end, get<Indices, decay_t<Args>...>(args)...);

  // Call function, with void->Void conversion so we are guaranteed a complete
  // output type.
-  auto result = void_helper::call(kernel, get<Indices, typename std::decay<Args>::type...>(args)...);
+  auto result = void_helper::call(kernel, get<Indices, decay_t<Args>...>(args)...);

  // Determine required output size.
  size_t output_size = Eigen::serialize_size(capacity);
-  output_size += Eigen::serialize_size(get<OutputIndices, typename std::decay<Args>::type...>(args)...);
+  output_size += Eigen::serialize_size(get<OutputIndices, decay_t<Args>...>(args)...);
  output_size += Eigen::serialize_size(result);

  // Always serialize required buffer size.
@@ -157,7 +162,7 @@ EIGEN_DEVICE_FUNC void run_serialized(std::index_sequence<Indices...>, std::inde
  // Serialize outputs if they fit in the buffer.
  if (output_size <= capacity) {
    // Collect outputs and result.
-    write_ptr = Eigen::serialize(write_ptr, write_end, get<OutputIndices, typename std::decay<Args>::type...>(args)...);
+    write_ptr = Eigen::serialize(write_ptr, write_end, get<OutputIndices, decay_t<Args>...>(args)...);
    write_ptr = Eigen::serialize(write_ptr, write_end, result);
  }
 }
@@ -282,7 +287,7 @@ auto run_serialized_on_gpu(size_t buffer_capacity_hint, std::index_sequence<Indi
 * \return kernel(args...).
 */
 template <typename Kernel, typename... Args>
-auto run_on_cpu(Kernel kernel, Args&&... args) -> decltype(kernel(args...)) {
+auto run_on_cpu(Kernel kernel, Args&&... args) -> internal::kernel_result_t<Kernel, Args&&...> {
  return kernel(std::forward<Args>(args)...);
 }

@@ -301,7 +306,7 @@ auto run_on_cpu(Kernel kernel, Args&&... args) -> decltype(kernel(args...)) {
 * \return kernel(args...).
 */
 template <typename Kernel, typename... Args>
-auto run_on_gpu(Kernel kernel, Args&&... args) -> decltype(kernel(args...)) {
+auto run_on_gpu(Kernel kernel, Args&&... args) -> internal::kernel_result_t<Kernel, Args&&...> {
  return internal::run_serialized_on_gpu<Kernel, Args...>(
      /*buffer_capacity_hint=*/0, std::make_index_sequence<sizeof...(Args)>{},
      internal::extract_output_indices<Args...>{}, kernel, std::forward<Args>(args)...);
@@ -322,7 +327,8 @@ auto run_on_gpu(Kernel kernel, Args&&... args) -> decltype(kernel(args...)) {
 * \sa run_on_gpu
 */
 template <typename Kernel, typename... Args>
-auto run_on_gpu_with_hint(size_t buffer_capacity_hint, Kernel kernel, Args&&... args) -> decltype(kernel(args...)) {
+auto run_on_gpu_with_hint(size_t buffer_capacity_hint, Kernel kernel, Args&&... args)
+    -> internal::kernel_result_t<Kernel, Args&&...> {
  return internal::run_serialized_on_gpu<Kernel, Args...>(
      buffer_capacity_hint, std::make_index_sequence<sizeof...(Args)>{}, internal::extract_output_indices<Args...>{},
      kernel, std::forward<Args>(args)...);
@@ -409,7 +415,7 @@ void print_gpu_device_info() {
 * \return kernel(args...).
 */
 template <typename Kernel, typename... Args>
-auto run(Kernel kernel, Args&&... args) -> decltype(kernel(args...)) {
+auto run(Kernel kernel, Args&&... args) -> internal::kernel_result_t<Kernel, Args&&...> {
 #ifdef EIGEN_GPUCC
  return run_on_gpu(kernel, std::forward<Args>(args)...);
 #else
@@ -432,7 +438,8 @@ auto run(Kernel kernel, Args&&... args) -> decltype(kernel(args...)) {
 * \sa run
 */
 template <typename Kernel, typename... Args>
-auto run_with_hint(size_t buffer_capacity_hint, Kernel kernel, Args&&... args) -> decltype(kernel(args...)) {
+auto run_with_hint(size_t buffer_capacity_hint, Kernel kernel, Args&&... args)
+    -> internal::kernel_result_t<Kernel, Args&&...> {
 #ifdef EIGEN_GPUCC
  return run_on_gpu_with_hint(buffer_capacity_hint, kernel, std::forward<Args>(args)...);
 #else
--- a/test/main.h
+++ b/test/main.h
@@ -76,10 +76,8 @@
 #include <cuda.h>
 #include <cuda_runtime.h>
 #include <cuda_runtime_api.h>
-#if CUDA_VERSION >= 7050
 #include <cuda_fp16.h>
 #endif
-#endif

 #if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
 #define EIGEN_TEST_NO_LONGDOUBLE
@@ -949,6 +947,37 @@ inline void set_seed_from_time() {
  g_seed = static_cast<decltype(g_seed)>(ns);
 }

+#if defined(EIGEN_USE_GPU)
+inline int maybe_skip_gpu_tests() {
+#if defined(EIGEN_USE_HIP)
+  int device_count = 0;
+  hipError_t status = hipGetDeviceCount(&device_count);
+  if (status != hipSuccess) {
+    std::cout << "SKIP: HIP GPU tests require a visible ROCm device. hipGetDeviceCount failed with: "
+              << hipGetErrorString(status) << std::endl;
+    return 77;
+  }
+  if (device_count <= 0) {
+    std::cout << "SKIP: HIP GPU tests require a visible ROCm device." << std::endl;
+    return 77;
+  }
+#elif defined(EIGEN_CUDACC)
+  int device_count = 0;
+  cudaError_t status = cudaGetDeviceCount(&device_count);
+  if (status != cudaSuccess) {
+    std::cout << "SKIP: CUDA GPU tests require a visible CUDA device. cudaGetDeviceCount failed with: "
+              << cudaGetErrorString(status) << std::endl;
+    return 77;
+  }
+  if (device_count <= 0) {
+    std::cout << "SKIP: CUDA GPU tests require a visible CUDA device." << std::endl;
+    return 77;
+  }
+#endif
+  return 0;
+}
+#endif
+
 int main(int argc, char* argv[]) {
  g_has_set_repeat = false;
  g_has_set_seed = false;
@@ -997,6 +1026,13 @@ int main(int argc, char* argv[]) {
  srand(g_seed);
  std::cout << "Repeating each test " << g_repeat << " times" << std::endl;

+#if defined(EIGEN_USE_GPU)
+  {
+    const int skip_code = maybe_skip_gpu_tests();
+    if (skip_code != 0) return skip_code;
+  }
+#endif
+
  VERIFY(EigenTest::all().size() > 0);

  for (std::size_t i = 0; i < EigenTest::all().size(); ++i) {
--- a/test/tridiag_test_matrices.h
+++ b/test/tridiag_test_matrices.h
@@ -0,0 +1,383 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2025 Rasmus Munk Larsen <rmlarsen@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TEST_TRIDIAG_TEST_MATRICES_H
+#define EIGEN_TEST_TRIDIAG_TEST_MATRICES_H
+
+// Structured tridiagonal test matrices from the numerical linear algebra
+// literature. Used by both the bidiagonal SVD and symmetric eigenvalue tests.
+//
+// Each generator writes into pre-allocated (diag, offdiag) vectors.
+// For SVD, offdiag is the superdiagonal of a bidiagonal matrix.
+// For eigenvalues, offdiag is the subdiagonal of a symmetric tridiagonal matrix.
+//
+// Usage:
+//   Matrix<RealScalar, Dynamic, 1> diag(n), offdiag(n-1);
+//   tridiag_identity(diag, offdiag);           // fills diag and offdiag
+//   my_verify(diag, offdiag);                  // solver-specific verification
+
+#include <Eigen/Core>
+
+namespace Eigen {
+namespace test {
+
+// 1. Identity: d=[1,...,1], e=[0,...,0]
+template <typename VectorType>
+void tridiag_identity(VectorType& diag, VectorType& offdiag) {
+  diag.setOnes();
+  offdiag.setZero();
+}
+
+// 2. Zero: d=[0,...,0], e=[0,...,0]
+template <typename VectorType>
+void tridiag_zero(VectorType& diag, VectorType& offdiag) {
+  diag.setZero();
+  offdiag.setZero();
+}
+
+// 3. Constant: d=[c,...,c], e=[c,...,c]
+template <typename VectorType>
+void tridiag_constant(VectorType& diag, VectorType& offdiag,
+                      typename VectorType::Scalar c = typename VectorType::Scalar(2.5)) {
+  diag.setConstant(c);
+  offdiag.setConstant(c);
+}
+
+// 4. 1-2-1 Toeplitz: d=[2,...,2], e=[1,...,1]
+// Eigenvalues: 2 - 2*cos(k*pi/(n+1)) for k=1,...,n
+template <typename VectorType>
+void tridiag_1_2_1(VectorType& diag, VectorType& offdiag) {
+  typedef typename VectorType::Scalar Scalar;
+  diag.setConstant(Scalar(2));
+  offdiag.setOnes();
+}
+
+// 5. Wilkinson W_{2m+1}: d_i = |m - i|, e=[1,...,1]
+// Has pairs of eigenvalues agreeing to many digits; stresses deflation.
+template <typename VectorType>
+void tridiag_wilkinson(VectorType& diag, VectorType& offdiag) {
+  typedef typename VectorType::Scalar Scalar;
+  Index n = diag.size();
+  for (Index i = 0; i < n; ++i) diag(i) = numext::abs(Scalar(n / 2) - Scalar(i));
+  offdiag.setOnes();
+}
+
+// 6. Clement matrix: d=[0,...,0], e_i = sqrt(i*(n-1-i))
+// Known eigenvalues: -(n-1), -(n-3), ..., (n-3), (n-1)
+template <typename VectorType>
+void tridiag_clement(VectorType& diag, VectorType& offdiag) {
+  EIGEN_USING_STD(sqrt);
+  typedef typename VectorType::Scalar Scalar;
+  Index n = diag.size();
+  diag.setZero();
+  for (Index i = 0; i < n - 1; ++i) offdiag(i) = sqrt(Scalar(i + 1) * Scalar(n - 1 - i));
+}
+
+// 7. Kahan-style: d_i = s^i, e_i = -c*s^i with s=sin(theta), c=cos(theta).
+// Geometric decay with controlled condition number.
+template <typename VectorType>
+void tridiag_kahan(VectorType& diag, VectorType& offdiag,
+                   typename VectorType::Scalar theta = typename VectorType::Scalar(0.3)) {
+  EIGEN_USING_STD(sin);
+  EIGEN_USING_STD(cos);
+  EIGEN_USING_STD(pow);
+  EIGEN_USING_STD(log);
+  typedef typename VectorType::Scalar Scalar;
+  Index n = diag.size();
+  const Scalar eps = NumTraits<Scalar>::epsilon();
+  const Scalar s = sin(theta);
+  const Scalar c = cos(theta);
+  const Scalar maxPower = -log(eps) / (-log(s));
+  for (Index i = 0; i < n; ++i) diag(i) = pow(s, numext::mini(Scalar(i), maxPower));
+  for (Index i = 0; i < n - 1; ++i) offdiag(i) = -c * pow(s, numext::mini(Scalar(i), maxPower));
+}
+
+// 8. Graded: d_i = base^(-i), e_i = base^(-i)
+template <typename VectorType>
+void tridiag_graded(VectorType& diag, VectorType& offdiag,
+                    typename VectorType::Scalar base = typename VectorType::Scalar(10)) {
+  EIGEN_USING_STD(pow);
+  typedef typename VectorType::Scalar Scalar;
+  Index n = diag.size();
+  for (Index i = 0; i < n; ++i) diag(i) = pow(base, -Scalar(i));
+  for (Index i = 0; i < n - 1; ++i) offdiag(i) = pow(base, -Scalar(i));
+}
+
+// 9. Geometric decay diagonal: d_i = base^i, e=[0,...,0]
+template <typename VectorType>
+void tridiag_geometric_diagonal(VectorType& diag, VectorType& offdiag,
+                                typename VectorType::Scalar base = typename VectorType::Scalar(0.5)) {
+  EIGEN_USING_STD(pow);
+  EIGEN_USING_STD(log);
+  typedef typename VectorType::Scalar Scalar;
+  Index n = diag.size();
+  const Scalar eps = NumTraits<Scalar>::epsilon();
+  const Scalar maxPower = -log(eps) / (-log(base));
+  for (Index i = 0; i < n; ++i) diag(i) = pow(base, numext::mini(Scalar(i), maxPower));
+  offdiag.setZero();
+}
+
+// 10. Geometric decay offdiagonal: d=[1,...,1], e_i = base^i
+template <typename VectorType>
+void tridiag_geometric_offdiag(VectorType& diag, VectorType& offdiag,
+                               typename VectorType::Scalar base = typename VectorType::Scalar(0.5)) {
+  EIGEN_USING_STD(pow);
+  typedef typename VectorType::Scalar Scalar;
+  Index n = diag.size();
+  diag.setOnes();
+  for (Index i = 0; i < n - 1; ++i) offdiag(i) = pow(base, Scalar(i));
+}
+
+// 11. Clustered eigenvalues: d_i = 1 + i*eps, e=[0,...,0]
+template <typename VectorType>
+void tridiag_clustered(VectorType& diag, VectorType& offdiag) {
+  typedef typename VectorType::Scalar Scalar;
+  Index n = diag.size();
+  const Scalar eps = NumTraits<Scalar>::epsilon();
+  for (Index i = 0; i < n; ++i) diag(i) = Scalar(1) + Scalar(i) * eps;
+  offdiag.setZero();
+}
+
+// 12. Two clusters: half at 1, half at eps.
+template <typename VectorType>
+void tridiag_two_clusters(VectorType& diag, VectorType& offdiag) {
+  typedef typename VectorType::Scalar Scalar;
+  Index n = diag.size();
+  const Scalar eps = NumTraits<Scalar>::epsilon();
+  for (Index i = 0; i < n; ++i) diag(i) = (i < n / 2) ? Scalar(1) : eps;
+  offdiag.setZero();
+}
+
+// 13. Single tiny value: d=[1,...,1,eps], e=[eps^2,...,eps^2]
+template <typename VectorType>
+void tridiag_single_tiny(VectorType& diag, VectorType& offdiag) {
+  typedef typename VectorType::Scalar Scalar;
+  Index n = diag.size();
+  const Scalar eps = NumTraits<Scalar>::epsilon();
+  diag.setOnes();
+  diag(n - 1) = eps;
+  offdiag.setConstant(eps * eps);
+}
+
+// 14. Overflow/underflow: alternating big/tiny diagonal and offdiagonal.
+template <typename VectorType>
+void tridiag_overflow_underflow(VectorType& diag, VectorType& offdiag) {
+  typedef typename VectorType::Scalar Scalar;
+  Index n = diag.size();
+  const Scalar big = (std::numeric_limits<Scalar>::max)() / Scalar(1000);
+  const Scalar tiny = (std::numeric_limits<Scalar>::min)() * Scalar(1000);
+  for (Index i = 0; i < n; ++i) diag(i) = (i % 2 == 0) ? big : tiny;
+  for (Index i = 0; i < n - 1; ++i) offdiag(i) = (i % 2 == 0) ? tiny : big;
+}
+
+// 15. Prescribed condition number: d_i = kappa^(-i/(n-1)), e_i = eps * random.
+template <typename VectorType>
+void tridiag_prescribed_cond(VectorType& diag, VectorType& offdiag) {
+  EIGEN_USING_STD(pow);
+  EIGEN_USING_STD(abs);
+  typedef typename VectorType::Scalar Scalar;
+  Index n = diag.size();
+  const Scalar eps = NumTraits<Scalar>::epsilon();
+  const Scalar kappa = Scalar(1) / eps;
+  for (Index i = 0; i < n; ++i) diag(i) = pow(kappa, -Scalar(i) / Scalar(n - 1));
+  for (Index i = 0; i < n - 1; ++i) offdiag(i) = eps * abs(internal::random<Scalar>());
+}
+
+// 16. Rank-deficient: d=[1,..,0,..,0,..,1], e=[0,...,0]
+template <typename VectorType>
+void tridiag_rank_deficient(VectorType& diag, VectorType& offdiag) {
+  typedef typename VectorType::Scalar Scalar;
+  Index n = diag.size();
+  for (Index i = 0; i < n; ++i) diag(i) = (i < n / 3 || i >= 2 * n / 3) ? Scalar(1) : Scalar(0);
+  offdiag.setZero();
+}
+
+// 17. Arrowhead-like: d_i = linspace(1,n), e_i = 1/(i+1)
+template <typename VectorType>
+void tridiag_arrowhead(VectorType& diag, VectorType& offdiag) {
+  typedef typename VectorType::Scalar Scalar;
+  Index n = diag.size();
+  for (Index i = 0; i < n; ++i) diag(i) = Scalar(1) + Scalar(i);
+  for (Index i = 0; i < n - 1; ++i) offdiag(i) = Scalar(1) / Scalar(i + 1);
+}
+
+// 18. Repeated values: d=[1,2,3,1,2,3,...], e=[0,...,0]
+template <typename VectorType>
+void tridiag_repeated(VectorType& diag, VectorType& offdiag) {
+  typedef typename VectorType::Scalar Scalar;
+  Index n = diag.size();
+  for (Index i = 0; i < n; ++i) diag(i) = Scalar((i % 3) + 1);
+  offdiag.setZero();
+}
+
+// 19. Glued blocks: d=[1,...,1], e=0 except e[n/2-1]=eps.
+// Two identity blocks coupled by a tiny off-diagonal entry.
+template <typename VectorType>
+void tridiag_glued(VectorType& diag, VectorType& offdiag) {
+  typedef typename VectorType::Scalar Scalar;
+  Index n = diag.size();
+  diag.setOnes();
+  offdiag.setZero();
+  if (n > 2) offdiag(n / 2 - 1) = NumTraits<Scalar>::epsilon();
+}
+
+// 20. Nearly diagonal: random diag, eps * random offdiag.
+template <typename VectorType>
+void tridiag_nearly_diagonal(VectorType& diag, VectorType& offdiag) {
+  EIGEN_USING_STD(abs);
+  typedef typename VectorType::Scalar Scalar;
+  Index n = diag.size();
+  const Scalar eps = NumTraits<Scalar>::epsilon();
+  diag = VectorType::Random(n).cwiseAbs() + VectorType::Constant(n, Scalar(0.1));
+  for (Index i = 0; i < n - 1; ++i) offdiag(i) = eps * (Scalar(0.5) + abs(internal::random<Scalar>()));
+}
+
+// 21. Negative eigenvalues: d_i = -i, e=[1,...,1]
+// (Only meaningful for symmetric eigenvalue problems, not SVD.)
+template <typename VectorType>
+void tridiag_negative(VectorType& diag, VectorType& offdiag) {
+  typedef typename VectorType::Scalar Scalar;
+  Index n = diag.size();
+  for (Index i = 0; i < n; ++i) diag(i) = -Scalar(i + 1);
+  offdiag.setOnes();
+}
+
+// 22. Mixed sign diagonal: d_i = (-1)^i * (i+1), e=[1,...,1]
+// (Only meaningful for symmetric eigenvalue problems, not SVD.)
+template <typename VectorType>
+void tridiag_mixed_sign(VectorType& diag, VectorType& offdiag) {
+  typedef typename VectorType::Scalar Scalar;
+  Index n = diag.size();
+  for (Index i = 0; i < n; ++i) diag(i) = ((i % 2 == 0) ? Scalar(1) : Scalar(-1)) * Scalar(i + 1);
+  offdiag.setOnes();
+}
+
+// Helper: iterate over a set of sizes and call a functor with each (diag, offdiag) pair
+// generated by a generator function.
+//
+// Usage:
+//   for_tridiag_sizes([](auto& diag, auto& offdiag) {
+//       tridiag_wilkinson(diag, offdiag);
+//       my_verify(diag, offdiag);
+//   });
+template <typename Scalar, typename Func>
+void for_tridiag_sizes(Func&& func) {
+  const int sizes[] = {1, 2, 3, 5, 10, 16, 20, 50, 100};
+  typedef Matrix<Scalar, Dynamic, 1> VectorType;
+  for (int si = 0; si < int(sizeof(sizes) / sizeof(sizes[0])); ++si) {
+    const Index n = sizes[si];
+    VectorType diag(n), offdiag(n > 1 ? n - 1 : 0);
+    func(diag, offdiag);
+  }
+}
+
+// Helper: run all generators (suitable for both SVD and eigenvalue problems).
+// The callback receives (diag, offdiag) after each generator fills them.
+template <typename Scalar, typename Func>
+void for_all_tridiag_test_matrices(Func&& verify) {
+  const int sizes[] = {1, 2, 3, 5, 10, 16, 20, 50, 100};
+  typedef Matrix<Scalar, Dynamic, 1> VectorType;
+
+  for (int si = 0; si < int(sizeof(sizes) / sizeof(sizes[0])); ++si) {
+    const Index n = sizes[si];
+    VectorType diag(n), offdiag(n > 1 ? n - 1 : 0);
+
+    tridiag_identity(diag, offdiag);
+    verify(diag, offdiag);
+
+    tridiag_zero(diag, offdiag);
+    verify(diag, offdiag);
+
+    tridiag_constant(diag, offdiag);
+    verify(diag, offdiag);
+
+    tridiag_1_2_1(diag, offdiag);
+    verify(diag, offdiag);
+
+    tridiag_wilkinson(diag, offdiag);
+    verify(diag, offdiag);
+
+    if (n > 1) {
+      tridiag_clement(diag, offdiag);
+      verify(diag, offdiag);
+    }
+
+    tridiag_kahan(diag, offdiag);
+    verify(diag, offdiag);
+
+    tridiag_graded(diag, offdiag);
+    verify(diag, offdiag);
+
+    tridiag_geometric_diagonal(diag, offdiag);
+    verify(diag, offdiag);
+
+    if (n > 1) {
+      tridiag_geometric_offdiag(diag, offdiag);
+      verify(diag, offdiag);
+    }
+
+    tridiag_clustered(diag, offdiag);
+    verify(diag, offdiag);
+
+    tridiag_two_clusters(diag, offdiag);
+    verify(diag, offdiag);
+
+    tridiag_single_tiny(diag, offdiag);
+    verify(diag, offdiag);
+
+    tridiag_overflow_underflow(diag, offdiag);
+    verify(diag, offdiag);
+
+    if (n > 1) {
+      tridiag_prescribed_cond(diag, offdiag);
+      verify(diag, offdiag);
+    }
+
+    tridiag_rank_deficient(diag, offdiag);
+    verify(diag, offdiag);
+
+    tridiag_arrowhead(diag, offdiag);
+    verify(diag, offdiag);
+
+    tridiag_repeated(diag, offdiag);
+    verify(diag, offdiag);
+
+    tridiag_glued(diag, offdiag);
+    verify(diag, offdiag);
+
+    tridiag_nearly_diagonal(diag, offdiag);
+    verify(diag, offdiag);
+  }
+}
+
+// Helper: run all generators, including those with negative values
+// (suitable only for symmetric eigenvalue problems, not SVD).
+template <typename Scalar, typename Func>
+void for_all_symmetric_tridiag_test_matrices(Func&& verify) {
+  for_all_tridiag_test_matrices<Scalar>(verify);
+
+  const int sizes[] = {1, 2, 3, 5, 10, 16, 20, 50, 100};
+  typedef Matrix<Scalar, Dynamic, 1> VectorType;
+
+  for (int si = 0; si < int(sizeof(sizes) / sizeof(sizes[0])); ++si) {
+    const Index n = sizes[si];
+    VectorType diag(n), offdiag(n > 1 ? n - 1 : 0);
+
+    tridiag_negative(diag, offdiag);
+    verify(diag, offdiag);
+
+    tridiag_mixed_sign(diag, offdiag);
+    verify(diag, offdiag);
+  }
+}
+
+}  // namespace test
+}  // namespace Eigen
+
+#endif  // EIGEN_TEST_TRIDIAG_TEST_MATRICES_H
--- a/unsupported/Eigen/src/Tensor/TensorContractionGpu.h
+++ b/unsupported/Eigen/src/Tensor/TensorContractionGpu.h
@@ -393,7 +393,8 @@ __device__ EIGEN_STRONG_INLINE void EigenContractionKernelInternal(const LhsMapp
  // the sum across all big k blocks of the product of little k block of index (x, y)
  // with block of index (y, z). To compute the final output, we need to reduce
  // the 8 threads over y by summation.
-#if defined(EIGEN_HIPCC) || (defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000)
+  // HIP uses non-sync warp shuffles; CUDA requires the _sync variants.
+#if defined(EIGEN_HIPCC)
 #define shuffleInc(i, j, mask) res(i, j) += __shfl_xor(res(i, j), mask)
 #else
 #define shuffleInc(i, j, mask) res(i, j) += __shfl_xor_sync(0xFFFFFFFF, res(i, j), mask)
@@ -622,7 +623,7 @@ __device__ __forceinline__ void EigenFloatContractionKernelInternal16x16(const L
      x1 = rhs_pf0.x;
      x2 = rhs_pf0.z;
    }
-#if defined(EIGEN_HIPCC) || (defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000)
+#if defined(EIGEN_HIPCC)
    x1 = __shfl_xor(x1, 4);
    x2 = __shfl_xor(x2, 4);
 #else
@@ -1377,13 +1378,6 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
                  this->m_right_contracting_strides, this->m_k_strides);

    OutputMapper output(buffer, m);
-
-#if defined(EIGEN_USE_HIP)
-    setGpuSharedMemConfig(hipSharedMemBankSizeEightByte);
-#else
-    setGpuSharedMemConfig(cudaSharedMemBankSizeEightByte);
-#endif
-
    LaunchKernels<LhsScalar, RhsScalar, Index, LhsMapper, RhsMapper, OutputMapper>::Run(lhs, rhs, output, m, n, k,
                                                                                        this->m_device);
  }
--- a/unsupported/Eigen/src/Tensor/TensorConvolution.h
+++ b/unsupported/Eigen/src/Tensor/TensorConvolution.h
@@ -89,7 +89,7 @@ class IndexMapper {
      }
    } else {
      for (int i = NumDims - 1; i >= 0; --i) {
-        if (static_cast<size_t>(i + 1) < offset) {
+        if (i + 1 < static_cast<int>(offset)) {
          m_gpuInputStrides[i] = m_gpuInputStrides[i + 1] * gpuInputDimensions[i + 1];
          m_gpuOutputStrides[i] = m_gpuOutputStrides[i + 1] * gpuOutputDimensions[i + 1];
        } else {
--- a/unsupported/Eigen/src/Tensor/TensorDeviceGpu.h
+++ b/unsupported/Eigen/src/Tensor/TensorDeviceGpu.h
@@ -342,19 +342,6 @@ struct GpuDevice {

 #endif

-// FIXME: Should be device and kernel specific.
-#ifdef EIGEN_GPUCC
-static EIGEN_DEVICE_FUNC inline void setGpuSharedMemConfig(gpuSharedMemConfig config) {
-#ifndef EIGEN_GPU_COMPILE_PHASE
-  gpuError_t status = gpuDeviceSetSharedMemConfig(config);
-  EIGEN_UNUSED_VARIABLE(status);
-  gpu_assert(status == gpuSuccess);
-#else
-  EIGEN_UNUSED_VARIABLE(config);
-#endif
-}
-#endif
-
 }  // end namespace Eigen

 // undefine all the gpu* macros we defined at the beginning of the file
--- a/unsupported/Eigen/src/Tensor/TensorEvaluator.h
+++ b/unsupported/Eigen/src/Tensor/TensorEvaluator.h
@@ -175,7 +175,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T loadConstant(const T* address) {
  return *address;
 }
 // Use the texture cache on CUDA devices whenever possible
-#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
+#if defined(EIGEN_CUDA_ARCH)
 template <>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float loadConstant(const float* address) {
  return __ldg(address);
--- a/unsupported/Eigen/src/Tensor/TensorMeta.h
+++ b/unsupported/Eigen/src/Tensor/TensorMeta.h
@@ -49,7 +49,7 @@ struct PacketType : internal::packet_traits<Scalar> {
 };

 // For CUDA packet types when using a GpuDevice
-#if defined(EIGEN_USE_GPU) && defined(EIGEN_HAS_GPU_FP16) && defined(EIGEN_GPU_COMPILE_PHASE)
+#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPU_COMPILE_PHASE)

 typedef ulonglong2 Packet4h2;
 template <>
--- a/unsupported/Eigen/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/src/Tensor/TensorReduction.h
@@ -453,7 +453,7 @@ template <int B, int N, typename S, typename R, typename I_>
 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*,
                                                                 unsigned int*);

-#if defined(EIGEN_HAS_GPU_FP16)
+#if defined(EIGEN_GPUCC)
 template <typename S, typename R, typename I_>
 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat(
    R, const S, I_, internal::packet_traits<half>::type*);
@@ -883,7 +883,7 @@ struct TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, M
 #if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
  template <int B, int N, typename S, typename R, typename I_>
  KERNEL_FRIEND void internal::FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*, unsigned int*);
-#if defined(EIGEN_HAS_GPU_FP16)
+#if defined(EIGEN_GPUCC)
  template <typename S, typename R, typename I_>
  KERNEL_FRIEND void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I_,
                                                                     internal::packet_traits<Eigen::half>::type*);
--- a/unsupported/Eigen/src/Tensor/TensorReductionGpu.h
+++ b/unsupported/Eigen/src/Tensor/TensorReductionGpu.h
@@ -25,7 +25,6 @@ namespace internal {
 // updated the content of the output address it will try again.
 template <typename T, typename R>
 __device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) {
-#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
  if (sizeof(T) == 4) {
    unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
    unsigned int newval = oldval;
@@ -61,12 +60,6 @@ __device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer)
  } else {
    gpu_assert(0 && "Wordsize not supported");
  }
-#else   // EIGEN_CUDA_ARCH >= 300
-  EIGEN_UNUSED_VARIABLE(output);
-  EIGEN_UNUSED_VARIABLE(accum);
-  EIGEN_UNUSED_VARIABLE(reducer);
-  gpu_assert(0 && "Shouldn't be called on unsupported device");
-#endif  // EIGEN_CUDA_ARCH >= 300
 }

 // We extend atomicExch to support extra data types
@@ -75,13 +68,42 @@ __device__ inline Type atomicExchCustom(Type* address, Type val) {
  return atomicExch(address, val);
 }

+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR auto reduction_shuffle_mask() {
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+  return 0xFFFFFFFFFFFFFFFFull;
+#else
+  return 0xFFFFFFFFu;
+#endif
+}
+
+template <typename T>
+__device__ EIGEN_ALWAYS_INLINE T reduction_shuffle_down(T value, int offset) {
+  return __shfl_down_sync(reduction_shuffle_mask<T>(), value, offset, warpSize);
+}
+
+template <>
+__device__ EIGEN_ALWAYS_INLINE int reduction_shuffle_down<int>(int value, int offset) {
+  return __shfl_down_sync(reduction_shuffle_mask<int>(), value, offset, warpSize);
+}
+
+template <>
+__device__ EIGEN_ALWAYS_INLINE float reduction_shuffle_down<float>(float value, int offset) {
+  return __shfl_down_sync(reduction_shuffle_mask<float>(), value, offset, warpSize);
+}
+
+template <>
+__device__ EIGEN_ALWAYS_INLINE double reduction_shuffle_down<double>(double value, int offset) {
+  return __shfl_down_sync(reduction_shuffle_mask<double>(), value, offset, warpSize);
+}
+
 template <>
 __device__ inline double atomicExchCustom(double* address, double val) {
  unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(address);
  return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val)));
 }

-#ifdef EIGEN_HAS_GPU_FP16
+// Half-float reduction specializations.
 template <typename R>
 __device__ inline void atomicReduce(half2* output, half2 accum, R& reducer) {
  unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
@@ -111,17 +133,10 @@ __device__ inline void atomicReduce(Packet4h2* output, Packet4h2 accum, R& reduc
  }
 }
 #endif  // EIGEN_GPU_COMPILE_PHASE
-#endif  // EIGEN_HAS_GPU_FP16

 template <>
 __device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) {
-#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
  atomicAdd(output, accum);
-#else   // EIGEN_CUDA_ARCH >= 300
-  EIGEN_UNUSED_VARIABLE(output);
-  EIGEN_UNUSED_VARIABLE(accum);
-  gpu_assert(0 && "Shouldn't be called on unsupported device");
-#endif  // EIGEN_CUDA_ARCH >= 300
 }

 template <typename CoeffType, typename Index>
@@ -138,7 +153,6 @@ template <int BlockSize, int NumPerThread, typename Self, typename Reducer, type
 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
                                                                 typename Self::CoeffReturnType* output,
                                                                 unsigned int* semaphore) {
-#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
  // Initialize the output value
  const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x;
  if (gridDim.x == 1) {
@@ -179,20 +193,7 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(Reducer reducer

 #pragma unroll
  for (int offset = warpSize / 2; offset > 0; offset /= 2) {
-#if defined(EIGEN_HIPCC)
-    // use std::is_floating_point to determine the type of reduced_val
-    // This is needed because when Type == double, hipcc will give a "call to __shfl_down is ambiguous" error
-    // and list the float and int versions of __shfl_down as the candidate functions.
-    if (std::is_floating_point<typename Self::CoeffReturnType>::value) {
-      reducer.reduce(__shfl_down(static_cast<float>(accum), offset, warpSize), &accum);
-    } else {
-      reducer.reduce(__shfl_down(static_cast<int>(accum), offset, warpSize), &accum);
-    }
-#elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
-    reducer.reduce(__shfl_down(accum, offset, warpSize), &accum);
-#else
-    reducer.reduce(__shfl_down_sync(0xFFFFFFFF, accum, offset, warpSize), &accum);
-#endif
+    reducer.reduce(reduction_shuffle_down(accum, offset), &accum);
  }

  if ((threadIdx.x & (warpSize - 1)) == 0) {
@@ -206,17 +207,9 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(Reducer reducer
    __threadfence_system();
 #endif
  }
-#else   // EIGEN_CUDA_ARCH >= 300
-  EIGEN_UNUSED_VARIABLE(reducer);
-  EIGEN_UNUSED_VARIABLE(input);
-  EIGEN_UNUSED_VARIABLE(num_coeffs);
-  EIGEN_UNUSED_VARIABLE(output);
-  EIGEN_UNUSED_VARIABLE(semaphore);
-  gpu_assert(0 && "Shouldn't be called on unsupported device");
-#endif  // EIGEN_CUDA_ARCH >= 300
 }

-#ifdef EIGEN_HAS_GPU_FP16
+// Half-float reduction specializations.
 template <typename Self, typename Reducer, typename Index>
 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Self input,
                                                                                   Index num_coeffs, half* scratch) {
@@ -319,14 +312,6 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(Reduce
      hr[i] = wka_out.h;
    }
    reducer.reducePacket(r1, &accum);
-#elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
-    PacketType r1;
-    half2* hr = reinterpret_cast<half2*>(&r1);
-    half2* hacc = reinterpret_cast<half2*>(&accum);
-    for (int i = 0; i < packet_width / 2; i++) {
-      hr[i] = __shfl_down(hacc[i], offset, warpSize);
-    }
-    reducer.reducePacket(r1, &accum);
 #else
    PacketType r1;
    half2* hr = reinterpret_cast<half2*>(&r1);
@@ -377,8 +362,6 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionCleanupKernelHalfFloat(Op
  }
 }

-#endif  // EIGEN_HAS_GPU_FP16
-
 template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
 struct FullReductionLauncher {
  static void run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index) {
@@ -409,7 +392,7 @@ struct FullReductionLauncher<
  }
 };

-#ifdef EIGEN_HAS_GPU_FP16
+// Half-float reduction specializations.
 template <typename Self, typename Op>
 struct FullReductionLauncher<Self, Op, Eigen::half, false> {
  static void run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index) {
@@ -443,24 +426,18 @@ struct FullReductionLauncher<Self, Op, Eigen::half, true> {
    }
  }
 };
-#endif  // EIGEN_HAS_GPU_FP16

 template <typename Self, typename Op, bool Vectorizable>
 struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
  // Unfortunately nvidia doesn't support well exotic types such as complex,
  // so reduce the scope of the optimized version of the code to the simple cases
  // of doubles, floats and half floats
-#ifdef EIGEN_HAS_GPU_FP16
+  // Half-float reduction specializations.
  static constexpr bool HasOptimizedImplementation =
      !Self::ReducerTraits::IsStateful && (internal::is_same<typename Self::CoeffReturnType, float>::value ||
                                           internal::is_same<typename Self::CoeffReturnType, double>::value ||
                                           (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value &&
                                            reducer_traits<Op, GpuDevice>::PacketAccess));
-#else   // EIGEN_HAS_GPU_FP16
-  static constexpr bool HasOptimizedImplementation =
-      !Self::ReducerTraits::IsStateful && (internal::is_same<typename Self::CoeffReturnType, float>::value ||
-                                           internal::is_same<typename Self::CoeffReturnType, double>::value);
-#endif  // EIGEN_HAS_GPU_FP16

  template <typename OutputType>
  static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
@@ -481,7 +458,6 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(Reducer reduce
                                                                  Index num_coeffs_to_reduce,
                                                                  Index num_preserved_coeffs,
                                                                  typename Self::CoeffReturnType* output) {
-#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
  typedef typename Self::CoeffReturnType Type;
  eigen_assert(blockDim.y == 1);
  eigen_assert(blockDim.z == 1);
@@ -534,20 +510,7 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(Reducer reduce

 #pragma unroll
      for (int offset = warpSize / 2; offset > 0; offset /= 2) {
-#if defined(EIGEN_HIPCC)
-        // use std::is_floating_point to determine the type of reduced_val
-        // This is needed because when Type == double, hipcc will give a "call to __shfl_down is ambiguous" error
-        // and list the float and int versions of __shfl_down as the candidate functions.
-        if (std::is_floating_point<Type>::value) {
-          reducer.reduce(__shfl_down(static_cast<float>(reduced_val), offset), &reduced_val);
-        } else {
-          reducer.reduce(__shfl_down(static_cast<int>(reduced_val), offset), &reduced_val);
-        }
-#elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
-        reducer.reduce(__shfl_down(reduced_val, offset), &reduced_val);
-#else
-        reducer.reduce(__shfl_down_sync(0xFFFFFFFF, reduced_val, offset), &reduced_val);
-#endif
+        reducer.reduce(reduction_shuffle_down(reduced_val, offset), &reduced_val);
      }

      if ((threadIdx.x & (warpSize - 1)) == 0) {
@@ -555,17 +518,9 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(Reducer reduce
      }
    }
  }
-#else   // EIGEN_CUDA_ARCH >= 300
-  EIGEN_UNUSED_VARIABLE(reducer);
-  EIGEN_UNUSED_VARIABLE(input);
-  EIGEN_UNUSED_VARIABLE(num_coeffs_to_reduce);
-  EIGEN_UNUSED_VARIABLE(num_preserved_coeffs);
-  EIGEN_UNUSED_VARIABLE(output);
-  gpu_assert(0 && "Shouldn't be called on unsupported device");
-#endif  // EIGEN_CUDA_ARCH >= 300
 }

-#ifdef EIGEN_HAS_GPU_FP16
+// Half-float reduction specializations.

 template <int NumPerThread, typename Self, typename Reducer, typename Index>
 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(Reducer reducer, const Self input,
@@ -688,19 +643,6 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(Reduc
        }
        reducer.reducePacket(r1, &reduced_val1);
        reducer.reducePacket(r2, &reduced_val2);
-#elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
-        PacketType r1;
-        PacketType r2;
-        half2* hr1 = reinterpret_cast<half2*>(&r1);
-        half2* hr2 = reinterpret_cast<half2*>(&r2);
-        half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
-        half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
-        for (int i = 0; i < packet_width / 2; i++) {
-          hr1[i] = __shfl_down(rv1[i], offset, warpSize);
-          hr2[i] = __shfl_down(rv2[i], offset, warpSize);
-        }
-        reducer.reducePacket(r1, &reduced_val1);
-        reducer.reducePacket(r2, &reduced_val2);
 #else
        PacketType r1;
        PacketType r2;
@@ -741,8 +683,6 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(Reduc
  }
 }

-#endif  // EIGEN_HAS_GPU_FP16
-
 template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
 struct InnerReductionLauncher {
  static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index,
@@ -786,7 +726,7 @@ struct InnerReductionLauncher<
  }
 };

-#ifdef EIGEN_HAS_GPU_FP16
+// Half-float reduction specializations.
 template <typename Self, typename Op>
 struct InnerReductionLauncher<Self, Op, Eigen::half, false> {
  static bool run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index, typename Self::Index) {
@@ -826,24 +766,18 @@ struct InnerReductionLauncher<Self, Op, Eigen::half, true> {
    return false;
  }
 };
-#endif  // EIGEN_HAS_GPU_FP16

 template <typename Self, typename Op>
 struct InnerReducer<Self, Op, GpuDevice> {
  // Unfortunately nvidia doesn't support well exotic types such as complex,
  // so reduce the scope of the optimized version of the code to the simple case
  // of floats and half floats.
-#ifdef EIGEN_HAS_GPU_FP16
+  // Half-float reduction specializations.
  static constexpr bool HasOptimizedImplementation =
      !Self::ReducerTraits::IsStateful && (internal::is_same<typename Self::CoeffReturnType, float>::value ||
                                           internal::is_same<typename Self::CoeffReturnType, double>::value ||
                                           (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value &&
                                            reducer_traits<Op, GpuDevice>::PacketAccess));
-#else   // EIGEN_HAS_GPU_FP16
-  static constexpr bool HasOptimizedImplementation =
-      !Self::ReducerTraits::IsStateful && (internal::is_same<typename Self::CoeffReturnType, float>::value ||
-                                           internal::is_same<typename Self::CoeffReturnType, double>::value);
-#endif  // EIGEN_HAS_GPU_FP16

  template <typename OutputType>
  static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output,
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -237,7 +237,7 @@ if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8" AND NOT CMAKE_CXX_COMPILER_ID STREQUAL "MS
  ei_add_test(cxx11_tensor_uint128)
 endif()

-find_package(CUDA 9.0)
+find_package(CUDA 11.4)
 if(CUDA_FOUND AND EIGEN_TEST_CUDA)
  # Make sure to compile without the -pedantic, -Wundef, -Wnon-virtual-dtor
  # and -fno-check-new flags since they trigger thousands of compilation warnings
@@ -281,26 +281,11 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA)
  ei_add_test(cxx11_tensor_argmax_gpu)
  ei_add_test(cxx11_tensor_cast_float16_gpu)
  ei_add_test(cxx11_tensor_scan_gpu)
-
-  set(EIGEN_CUDA_OLDEST_COMPUTE_ARCH 9999)
-  foreach(ARCH IN LISTS EIGEN_CUDA_COMPUTE_ARCH)
-    if(${ARCH} LESS ${EIGEN_CUDA_OLDEST_COMPUTE_ARCH})
-      set(EIGEN_CUDA_OLDEST_COMPUTE_ARCH ${ARCH})
-    endif()
-  endforeach()
-
-  # Contractions require arch 3.0 or higher
-  if (${EIGEN_CUDA_OLDEST_COMPUTE_ARCH} GREATER 29)
-    ei_add_test(cxx11_tensor_device)
-    ei_add_test(cxx11_tensor_gpu)
-    ei_add_test(cxx11_tensor_contract_gpu)
-    ei_add_test(cxx11_tensor_of_float16_gpu)
-  endif()
-
-  # The random number generation code requires arch 3.5 or greater.
-  if (${EIGEN_CUDA_OLDEST_COMPUTE_ARCH} GREATER 34)
-    ei_add_test(cxx11_tensor_random_gpu)
-  endif()
+  ei_add_test(cxx11_tensor_device)
+  ei_add_test(cxx11_tensor_gpu)
+  ei_add_test(cxx11_tensor_contract_gpu)
+  ei_add_test(cxx11_tensor_of_float16_gpu)
+  ei_add_test(cxx11_tensor_random_gpu)

  unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
 endif()
@@ -341,7 +326,6 @@ if (EIGEN_TEST_HIP)
      ei_add_test(cxx11_tensor_cast_float16_gpu)
      ei_add_test(cxx11_tensor_scan_gpu)
      ei_add_test(cxx11_tensor_device)
-
      ei_add_test(cxx11_tensor_gpu)
      ei_add_test(cxx11_tensor_contract_gpu)
      ei_add_test(cxx11_tensor_of_float16_gpu)
--- a/unsupported/test/cxx11_tensor_gpu.cu
+++ b/unsupported/test/cxx11_tensor_gpu.cu
@@ -850,6 +850,7 @@ void test_gpu_igamma() {
  Tensor<Scalar, 2> a(6, 6);
  Tensor<Scalar, 2> x(6, 6);
  Tensor<Scalar, 2> out(6, 6);
+  Tensor<Scalar, 2> expected_out(6, 6);
  out.setZero();

  Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
@@ -862,14 +863,11 @@ void test_gpu_igamma() {
    }
  }

-  Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
-  Scalar igamma_s[][6] = {
-      {0.0, nan, nan, nan, nan, nan},
-      {0.0, 0.6321205588285578, 0.7768698398515702, 0.9816843611112658, 9.999500016666262e-05, 1.0},
-      {0.0, 0.4275932955291202, 0.608374823728911, 0.9539882943107686, 7.522076445089201e-07, 1.0},
-      {0.0, 0.01898815687615381, 0.06564245437845008, 0.5665298796332909, 4.166333347221828e-18, 1.0},
-      {0.0, 0.9999780593618628, 0.9999899967080838, 0.9999996219837988, 0.9991370418689945, 1.0},
-      {0.0, 0.0, 0.0, 0.0, 0.0, 0.5042041932513908}};
+  for (int i = 0; i < 6; ++i) {
+    for (int j = 0; j < 6; ++j) {
+      expected_out(i, j) = numext::igamma(a(i, j), x(i, j));
+    }
+  }

  std::size_t bytes = a.size() * sizeof(Scalar);

@@ -897,10 +895,10 @@ void test_gpu_igamma() {

  for (int i = 0; i < 6; ++i) {
    for (int j = 0; j < 6; ++j) {
-      if ((std::isnan)(igamma_s[i][j])) {
+      if ((std::isnan)(expected_out(i, j))) {
        VERIFY((std::isnan)(out(i, j)));
      } else {
-        VERIFY_IS_APPROX(out(i, j), igamma_s[i][j]);
+        VERIFY_IS_APPROX(out(i, j), expected_out(i, j));
      }
    }
  }
@@ -915,6 +913,7 @@ void test_gpu_igammac() {
  Tensor<Scalar, 2> a(6, 6);
  Tensor<Scalar, 2> x(6, 6);
  Tensor<Scalar, 2> out(6, 6);
+  Tensor<Scalar, 2> expected_out(6, 6);
  out.setZero();

  Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
@@ -927,14 +926,11 @@ void test_gpu_igammac() {
    }
  }

-  Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
-  Scalar igammac_s[][6] = {
-      {nan, nan, nan, nan, nan, nan},
-      {1.0, 0.36787944117144233, 0.22313016014842982, 0.018315638888734182, 0.9999000049998333, 0.0},
-      {1.0, 0.5724067044708798, 0.3916251762710878, 0.04601170568923136, 0.9999992477923555, 0.0},
-      {1.0, 0.9810118431238462, 0.9343575456215499, 0.4334701203667089, 1.0, 0.0},
-      {1.0, 2.1940638138146658e-05, 1.0003291916285e-05, 3.7801620118431334e-07, 0.0008629581310054535, 0.0},
-      {1.0, 1.0, 1.0, 1.0, 1.0, 0.49579580674813944}};
+  for (int i = 0; i < 6; ++i) {
+    for (int j = 0; j < 6; ++j) {
+      expected_out(i, j) = numext::igammac(a(i, j), x(i, j));
+    }
+  }

  std::size_t bytes = a.size() * sizeof(Scalar);

@@ -962,10 +958,10 @@ void test_gpu_igammac() {

  for (int i = 0; i < 6; ++i) {
    for (int j = 0; j < 6; ++j) {
-      if ((std::isnan)(igammac_s[i][j])) {
+      if ((std::isnan)(expected_out(i, j))) {
        VERIFY((std::isnan)(out(i, j)));
      } else {
-        VERIFY_IS_APPROX(out(i, j), igammac_s[i][j]);
+        VERIFY_IS_APPROX(out(i, j), expected_out(i, j));
      }
    }
  }
@@ -1068,15 +1064,9 @@ void test_gpu_ndtri() {
  in_x(7) = Scalar(0.99);
  in_x(8) = Scalar(0.01);

-  expected_out(0) = std::numeric_limits<Scalar>::infinity();
-  expected_out(1) = -std::numeric_limits<Scalar>::infinity();
-  expected_out(2) = Scalar(0.0);
-  expected_out(3) = Scalar(-0.8416212335729142);
-  expected_out(4) = Scalar(0.8416212335729142);
-  expected_out(5) = Scalar(1.2815515655446004);
-  expected_out(6) = Scalar(-1.2815515655446004);
-  expected_out(7) = Scalar(2.3263478740408408);
-  expected_out(8) = Scalar(-2.3263478740408408);
+  for (int i = 0; i < 9; ++i) {
+    expected_out(i) = numext::ndtri(in_x(i));
+  }

  std::size_t bytes = in_x.size() * sizeof(Scalar);

@@ -1090,15 +1080,15 @@ void test_gpu_ndtri() {
  Eigen::GpuStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

-  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 6);
-  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 6);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 9);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 9);

  gpu_out.device(gpu_device) = gpu_in_x.ndtri();

  assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);

-  for (int i = 0; i < 6; ++i) {
+  for (int i = 0; i < 9; ++i) {
    VERIFY_IS_CWISE_APPROX(out(i), expected_out(i));
  }

@@ -1115,12 +1105,9 @@ void test_gpu_betainc() {
  Tensor<Scalar, 1> expected_out(125);
  out.setZero();

-  Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
-
  Array<Scalar, 1, Dynamic> x(125);
  Array<Scalar, 1, Dynamic> a(125);
  Array<Scalar, 1, Dynamic> b(125);
-  Array<Scalar, 1, Dynamic> v(125);

  a << 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
      0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
@@ -1160,25 +1147,11 @@ void test_gpu_betainc() {
      0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8,
      1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1;

-  v << nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
-      nan, nan, nan, nan, nan, nan, nan, nan, nan, 0.47972119876364683, 0.5, 0.5202788012363533, nan, nan,
-      0.9518683957740043, 0.9789663010413743, 0.9931729188073435, nan, nan, 0.999995949033062, 0.9999999999993698,
-      0.9999999999999999, nan, nan, 0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan, nan, nan, nan, nan,
-      nan, nan, 0.006827081192655869, 0.0210336989586256, 0.04813160422599567, nan, nan, 0.20014344256217678,
-      0.5000000000000001, 0.7998565574378232, nan, nan, 0.9991401428435834, 0.999999999698403, 0.9999999999999999, nan,
-      nan, 0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan, nan, nan, nan, nan, nan, nan,
-      1.0646600232370887e-25, 6.301722877826246e-13, 4.050966937974938e-06, nan, nan, 7.864342668429763e-23,
-      3.015969667594166e-10, 0.0008598571564165444, nan, nan, 6.031987710123844e-08, 0.5000000000000007,
-      0.9999999396801229, nan, nan, 0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan, nan, nan, nan, nan,
-      nan, nan, 0.0, 7.029920380986636e-306, 2.2450728208591345e-101, nan, nan, 0.0, 9.275871147869727e-302,
-      1.2232913026152827e-97, nan, nan, 0.0, 3.0891393081932924e-252, 2.9303043666183996e-60, nan, nan,
-      2.248913486879199e-196, 0.5000000000004947, 0.9999999999999999, nan;
-
  for (int i = 0; i < 125; ++i) {
    in_x(i) = x(i);
    in_a(i) = a(i);
    in_b(i) = b(i);
-    expected_out(i) = v(i);
+    expected_out(i) = numext::betainc(a(i), b(i), x(i));
  }

  std::size_t bytes = in_x.size() * sizeof(Scalar);
--- a/unsupported/test/cxx11_tensor_of_float16_gpu.cu
+++ b/unsupported/test/cxx11_tensor_of_float16_gpu.cu
@@ -53,8 +53,6 @@ void test_gpu_numext() {
  gpu_device.deallocate(d_res_float);
 }

-#ifdef EIGEN_HAS_GPU_FP16
-
 template <typename>
 void test_gpu_conversion() {
  Eigen::GpuStreamDevice stream;
@@ -442,12 +440,10 @@ void test_gpu_forced_evals() {
  gpu_device.deallocate(d_res_half2);
  gpu_device.deallocate(d_res_float);
 }
-#endif

 EIGEN_DECLARE_TEST(cxx11_tensor_of_float16_gpu) {
  CALL_SUBTEST_1(test_gpu_numext<void>());

-#ifdef EIGEN_HAS_GPU_FP16
  CALL_SUBTEST_1(test_gpu_conversion<void>());
  CALL_SUBTEST_1(test_gpu_unary<void>());
  CALL_SUBTEST_1(test_gpu_elementwise<void>());
@@ -456,7 +452,4 @@ EIGEN_DECLARE_TEST(cxx11_tensor_of_float16_gpu) {
  CALL_SUBTEST_3(test_gpu_reductions<void>());
  CALL_SUBTEST_4(test_gpu_full_reductions<void>());
  CALL_SUBTEST_5(test_gpu_forced_evals<void>());
-#else
-  std::cout << "Half floats are not supported by this version of gpu: skipping the test" << std::endl;
-#endif
 }
Author	SHA1	Message	Date
Rasmus Munk Larsen	6a9405bf7a	GPU: Raise CUDA/HIP minimum and remove legacy guards - Raise CUDA minimum from 9.0 to 11.4 (sm_70/Volta). - Raise HIP minimum to GFX906 (Vega 20/MI50) / ROCm 5.6. - Remove EIGEN_HAS_{CUDA,HIP,GPU}_FP16 guards — FP16 is always available on sm_70+ and GFX906+. - Remove obsolete __HIP_ARCH_HAS_* preprocessor branches. - C++14 cleanup: remove pre-C++14 workarounds in GPU code. - Fix NVCC warnings (deprecated register keyword, unreachable code, tautological comparisons). - Fix HIP test execution on gfx1151. - Update CI configuration for new minimum versions.	2026-04-09 15:21:39 -07:00
Rasmus Munk Larsen	e055e4e415	Add plog_core_double with fallback for AVX without AVX2 libeigen/eigen!2407 Co-authored-by: Rasmus Munk Larsen <rlarsen@nvidia.com>	2026-04-08 19:41:07 -07:00
Rasmus Munk Larsen	b1d2ce4c85	Revert "Speed up plog_double ~1.7x with fast integer range reduction" This reverts merge request !2385	2026-04-08 13:03:48 -07:00
Rasmus Munk Larsen	ab70739c9c	Speed up plog_double ~1.7x with fast integer range reduction libeigen/eigen!2385 Co-authored-by: Rasmus Munk Larsen <rmlarsen@gmail.com>	2026-04-07 21:48:25 -07:00
Rasmus Munk Larsen	e778b5d22b	Switch ASAN/UBSAN smoketest pipelines to large runners libeigen/eigen!2405 Co-authored-by: Rasmus Munk Larsen <rmlarsen@gmail.com>	2026-04-07 21:37:58 -07:00
Rasmus Munk Larsen	def45c5e1e	Improve psincos_double: faster polynomials + accurate range reduction libeigen/eigen!2389 Closes #3052 Co-authored-by: Rasmus Munk Larsen <rmlarsen@gmail.com>	2026-04-07 21:24:24 -07:00
Rasmus Munk Larsen	110530a4d8	Fix bugs and improve robustness of SelfAdjointEigenSolver, improve test coverage libeigen/eigen!2396 Co-authored-by: Rasmus Munk Larsen <rmlarsen@gmail.com>	2026-04-07 21:08:29 -07:00
Rasmus Munk Larsen	bde3a68bae	Improve dense linear solver docs with practical guidance libeigen/eigen!2395 Co-authored-by: Rasmus Munk Larsen <rmlarsen@gmail.com>	2026-04-05 21:40:42 -07:00
Rasmus Munk Larsen	8eabfb5342	Vectorize BLAS level 1/2 routines with Eigen expressions libeigen/eigen!2404 Co-authored-by: Rasmus Munk Larsen <rmlarsen@gmail.com>	2026-04-05 18:53:11 -07:00
Rasmus Munk Larsen	4ad90a60f1	Replace blas/f2c with clean C++ implementations libeigen/eigen!2402 Co-authored-by: Rasmus Munk Larsen <rmlarsen@gmail.com>	2026-04-05 16:04:41 -07:00