diff --git a/Eigen/src/Core/arch/NEON/Complex.h b/Eigen/src/Core/arch/NEON/Complex.h
index 154daa7a7..c7fb12fe8 100644
--- a/Eigen/src/Core/arch/NEON/Complex.h
+++ b/Eigen/src/Core/arch/NEON/Complex.h
@@ -272,7 +272,7 @@ ptranspose(PacketBlock<Packet2cf,2>& kernel) {
 }
 
 //---------- double ----------
-#if EIGEN_ARCH_ARM64
+#if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
 
 static uint64x2_t p2ul_CONJ_XOR = EIGEN_INIT_NEON_PACKET2(0x0, 0x8000000000000000);
 
diff --git a/Eigen/src/Core/functors/UnaryFunctors.h b/Eigen/src/Core/functors/UnaryFunctors.h
index ec42e6850..f32f0f113 100644
--- a/Eigen/src/Core/functors/UnaryFunctors.h
+++ b/Eigen/src/Core/functors/UnaryFunctors.h
@@ -55,6 +55,34 @@ struct functor_traits<scalar_abs_op<Scalar> >
   };
 };
 
+/** \internal
+  * \brief Template functor to compute the score of a scalar, to chose a pivot
+  *
+  * \sa class CwiseUnaryOp
+  */
+template<typename Scalar> struct scalar_score_coeff_op : scalar_abs_op<Scalar>
+{
+  typedef void Score_is_abs;
+};
+template<typename Scalar>
+struct functor_traits<scalar_score_coeff_op<Scalar> > : functor_traits<scalar_abs_op<Scalar> > {};
+
+/* Avoid recomputing abs when we know the score and they are the same. Not a true Eigen functor.  */
+template<typename Scalar, typename=void> struct abs_knowing_score
+{
+  EIGEN_EMPTY_STRUCT_CTOR(abs_knowing_score)
+  typedef typename NumTraits<Scalar>::Real result_type;
+  template<typename Score>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const Scalar& a, const Score&) const { using std::abs; return abs(a); }
+};
+template<typename Scalar> struct abs_knowing_score<Scalar, typename scalar_score_coeff_op<Scalar>::Score_is_abs>
+{
+  EIGEN_EMPTY_STRUCT_CTOR(abs_knowing_score)
+  typedef typename NumTraits<Scalar>::Real result_type;
+  template<typename Scal>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const Scal&, const result_type& a) const { return a; }
+};
+
 /** \internal
   * \brief Template functor to compute the squared absolute value of a scalar
   *
diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index 0fc7289be..df9f66995 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -86,19 +86,21 @@ void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads
   typedef gebp_traits<LhsScalar,RhsScalar> Traits;
 
 #ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES
-  EIGEN_UNUSED_VARIABLE(num_threads);
-  enum {
-    kr = 8,
-    mr = Traits::mr,
-    nr = Traits::nr
-  };
-  k = std::min<Index>(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K);
-  if (k > kr) k -= k % kr;
-  m = std::min<Index>(m, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M);
-  if (m > mr) m -= m % mr;
-  n = std::min<Index>(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N);
-  if (n > nr) n -= n % nr;
-  return;
+  if (EIGEN_TEST_SPECIFIC_BLOCKING_SIZES) {
+    EIGEN_UNUSED_VARIABLE(num_threads);
+    enum {
+      kr = 8,
+      mr = Traits::mr,
+      nr = Traits::nr
+    };
+    k = std::min<Index>(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K);
+    if (k > kr) k -= k % kr;
+    m = std::min<Index>(m, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M);
+    if (m > mr) m -= m % mr;
+    n = std::min<Index>(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N);
+    if (n > nr) n -= n % nr;
+    return;
+  }
 #endif
 
   // Explanations:
@@ -1479,17 +1481,17 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
 
           for(Index k=0; k<peeled_kc; k+=pk)
           {
-            EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX1");
+            EIGEN_ASM_COMMENT("begin gebp micro kernel 1pX1");
             RhsPacket B_0;
         
 #define EIGEN_GEBGP_ONESTEP(K) \
             do {                                                                \
-              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX1");        \
+              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX1");        \
               EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
               traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0);                    \
               traits.loadRhs(&blB[(0+K)*RhsProgress], B_0);                     \
               traits.madd(A0, B_0, C0, B_0);                                    \
-              EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX1");          \
+              EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX1");          \
             } while(false);
 
             EIGEN_GEBGP_ONESTEP(0);
@@ -1504,7 +1506,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
             blB += pk*RhsProgress;
             blA += pk*1*Traits::LhsProgress;
 
-            EIGEN_ASM_COMMENT("end gebp micro kernel 2pX1");
+            EIGEN_ASM_COMMENT("end gebp micro kernel 1pX1");
           }
 
           // process remaining peeled loop
diff --git a/Eigen/src/LU/FullPivLU.h b/Eigen/src/LU/FullPivLU.h
index 49c8b183d..d1a260a37 100644
--- a/Eigen/src/LU/FullPivLU.h
+++ b/Eigen/src/LU/FullPivLU.h
@@ -459,14 +459,16 @@ FullPivLU<MatrixType>& FullPivLU<MatrixType>::compute(const MatrixType& matrix)
 
     // biggest coefficient in the remaining bottom-right corner (starting at row k, col k)
     Index row_of_biggest_in_corner, col_of_biggest_in_corner;
-    RealScalar biggest_in_corner;
+    typedef internal::scalar_score_coeff_op<Scalar> Scoring;
+    typedef typename Scoring::result_type Score;
+    Score biggest_in_corner;
     biggest_in_corner = m_lu.bottomRightCorner(rows-k, cols-k)
-                        .cwiseAbs()
+                        .unaryExpr(Scoring())
                         .maxCoeff(&row_of_biggest_in_corner, &col_of_biggest_in_corner);
     row_of_biggest_in_corner += k; // correct the values! since they were computed in the corner,
     col_of_biggest_in_corner += k; // need to add k to them.
 
-    if(biggest_in_corner==RealScalar(0))
+    if(biggest_in_corner==Score(0))
     {
       // before exiting, make sure to initialize the still uninitialized transpositions
       // in a sane state without destroying what we already have.
@@ -479,7 +481,8 @@ FullPivLU<MatrixType>& FullPivLU<MatrixType>::compute(const MatrixType& matrix)
       break;
     }
 
-    if(biggest_in_corner > m_maxpivot) m_maxpivot = biggest_in_corner;
+    RealScalar abs_pivot = internal::abs_knowing_score<Scalar>()(m_lu(row_of_biggest_in_corner, col_of_biggest_in_corner), biggest_in_corner);
+    if(abs_pivot > m_maxpivot) m_maxpivot = abs_pivot;
 
     // Now that we've found the pivot, we need to apply the row/col swaps to
     // bring it to the location (k,k).
diff --git a/Eigen/src/LU/PartialPivLU.h b/Eigen/src/LU/PartialPivLU.h
index e57b36bc5..3d8825a97 100644
--- a/Eigen/src/LU/PartialPivLU.h
+++ b/Eigen/src/LU/PartialPivLU.h
@@ -275,6 +275,8 @@ struct partial_lu_impl
     */
   static Index unblocked_lu(MatrixType& lu, PivIndex* row_transpositions, PivIndex& nb_transpositions)
   {
+    typedef scalar_score_coeff_op<Scalar> Scoring;
+    typedef typename Scoring::result_type Score;
     const Index rows = lu.rows();
     const Index cols = lu.cols();
     const Index size = (std::min)(rows,cols);
@@ -286,13 +288,13 @@ struct partial_lu_impl
       Index rcols = cols-k-1;
         
       Index row_of_biggest_in_col;
-      RealScalar biggest_in_corner
-        = lu.col(k).tail(rows-k).cwiseAbs().maxCoeff(&row_of_biggest_in_col);
+      Score biggest_in_corner
+        = lu.col(k).tail(rows-k).unaryExpr(Scoring()).maxCoeff(&row_of_biggest_in_col);
       row_of_biggest_in_col += k;
 
       row_transpositions[k] = PivIndex(row_of_biggest_in_col);
 
-      if(biggest_in_corner != RealScalar(0))
+      if(biggest_in_corner != Score(0))
       {
         if(k != row_of_biggest_in_col)
         {
diff --git a/Eigen/src/QR/FullPivHouseholderQR.h b/Eigen/src/QR/FullPivHouseholderQR.h
index 7d5e58d2f..4952fbb46 100644
--- a/Eigen/src/QR/FullPivHouseholderQR.h
+++ b/Eigen/src/QR/FullPivHouseholderQR.h
@@ -443,13 +443,15 @@ FullPivHouseholderQR<MatrixType>& FullPivHouseholderQR<MatrixType>::compute(cons
   for (Index k = 0; k < size; ++k)
   {
     Index row_of_biggest_in_corner, col_of_biggest_in_corner;
-    RealScalar biggest_in_corner;
+    typedef internal::scalar_score_coeff_op<Scalar> Scoring;
+    typedef typename Scoring::result_type Score;
 
-    biggest_in_corner = m_qr.bottomRightCorner(rows-k, cols-k)
-                            .cwiseAbs()
-                            .maxCoeff(&row_of_biggest_in_corner, &col_of_biggest_in_corner);
+    Score score = m_qr.bottomRightCorner(rows-k, cols-k)
+                      .unaryExpr(Scoring())
+                      .maxCoeff(&row_of_biggest_in_corner, &col_of_biggest_in_corner);
     row_of_biggest_in_corner += k;
     col_of_biggest_in_corner += k;
+    RealScalar biggest_in_corner = internal::abs_knowing_score<Scalar>()(m_qr(row_of_biggest_in_corner, col_of_biggest_in_corner), score);
     if(k==0) biggest = biggest_in_corner;
 
     // if the corner is negligible, then we have less than full rank, and we can finish early
diff --git a/bench/analyze-blocking-sizes.cpp b/bench/analyze-blocking-sizes.cpp
index f507eab38..3316f8cbf 100644
--- a/bench/analyze-blocking-sizes.cpp
+++ b/bench/analyze-blocking-sizes.cpp
@@ -1,3 +1,12 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Jacob <benoitjacob@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #include <iostream>
 #include <cstdint>
 #include <cstdlib>
@@ -559,8 +568,8 @@ void show_usage_and_exit(int argc, char* argv[],
 
 int main(int argc, char* argv[])
 {
-  cout.precision(3);
-  cerr.precision(3);
+  cout.precision(4);
+  cerr.precision(4);
 
   vector<unique_ptr<action_t>> available_actions;
   available_actions.emplace_back(new partition_action_t);
diff --git a/bench/benchmark-blocking-sizes.cpp b/bench/benchmark-blocking-sizes.cpp
index 04244575a..f1fef1ede 100644
--- a/bench/benchmark-blocking-sizes.cpp
+++ b/bench/benchmark-blocking-sizes.cpp
@@ -1,11 +1,22 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Jacob <benoitjacob@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #include <iostream>
 #include <cstdint>
 #include <cstdlib>
 #include <vector>
 #include <fstream>
+#include <memory>
 
+bool eigen_use_specific_block_size;
 int eigen_block_size_k, eigen_block_size_m, eigen_block_size_n;
-#define EIGEN_TEST_SPECIFIC_BLOCKING_SIZES
+#define EIGEN_TEST_SPECIFIC_BLOCKING_SIZES eigen_use_specific_block_size
 #define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K eigen_block_size_k
 #define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M eigen_block_size_m
 #define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N eigen_block_size_n
@@ -82,16 +93,25 @@ struct benchmark_t
 {
   uint16_t compact_product_size;
   uint16_t compact_block_size;
+  bool use_default_block_size;
   float gflops;
   benchmark_t()
     : compact_product_size(0)
     , compact_block_size(0)
     , gflops(0)
+    , use_default_block_size(false)
   {}
   benchmark_t(size_t pk, size_t pm, size_t pn,
               size_t bk, size_t bm, size_t bn)
     : compact_product_size(compact_size_triple(pk, pm, pn))
     , compact_block_size(compact_size_triple(bk, bm, bn))
+    , use_default_block_size(false)
+    , gflops(0)
+  {}
+  benchmark_t(size_t pk, size_t pm, size_t pn)
+    : compact_product_size(compact_size_triple(pk, pm, pn))
+    , compact_block_size(0)
+    , use_default_block_size(true)
     , gflops(0)
   {}
 
@@ -100,10 +120,12 @@ struct benchmark_t
 
 ostream& operator<<(ostream& s, const benchmark_t& b)
 {
-  s << hex;
-  s << b.compact_product_size
-    << " " << b.compact_block_size;
-  s << dec;
+  s << hex << b.compact_product_size << dec;
+  if (b.use_default_block_size) {
+    s << " default";
+  } else {
+    s << " " << hex << b.compact_block_size << dec;
+  }
   s << " " << b.gflops;
   return s;
 }
@@ -121,14 +143,18 @@ bool operator<(const benchmark_t& b1, const benchmark_t& b2)
 
 void benchmark_t::run()
 {
-  // expand our compact benchmark params into proper triples
   size_triple_t productsizes(compact_product_size);
-  size_triple_t blocksizes(compact_block_size);
-  
-  // feed eigen with our custom blocking params
-  eigen_block_size_k = blocksizes.k;
-  eigen_block_size_m = blocksizes.m;
-  eigen_block_size_n = blocksizes.n;
+
+  if (use_default_block_size) {
+    eigen_use_specific_block_size = false;
+  } else {
+    // feed eigen with our custom blocking params
+    eigen_use_specific_block_size = true;
+    size_triple_t blocksizes(compact_block_size);
+    eigen_block_size_k = blocksizes.k;
+    eigen_block_size_m = blocksizes.m;
+    eigen_block_size_n = blocksizes.n;
+  }
 
   // set up the matrix pool
 
@@ -231,9 +257,23 @@ string type_name<double>()
   return "double";
 }
 
-void show_usage_and_exit(const char *progname)
+struct action_t
 {
-  cerr << "usage: " << progname << " [--min-working-set-size=N]" << endl << endl;
+  virtual const char* invokation_name() const { abort(); return nullptr; }
+  virtual void run() const { abort(); }
+  virtual ~action_t() {}
+};
+
+void show_usage_and_exit(int argc, char* argv[],
+                         const vector<unique_ptr<action_t>>& available_actions)
+{
+  cerr << "usage: " << argv[0] << " <action> [options...]" << endl << endl;
+  cerr << "available actions:" << endl << endl;
+  for (auto it = available_actions.begin(); it != available_actions.end(); ++it) {
+    cerr << "  " << (*it)->invokation_name() << endl;
+  }
+  cerr << endl;
+  cerr << "options:" << endl << endl;
   cerr << "  --min-working-set-size=N:" << endl;
   cerr << "       Set the minimum working set size to N bytes." << endl;
   cerr << "       This is rounded up as needed to a multiple of matrix size." << endl;
@@ -245,56 +285,8 @@ void show_usage_and_exit(const char *progname)
   exit(1);
 }
 
-int main(int argc, char* argv[])
+void run_benchmarks(vector<benchmark_t>& benchmarks)
 {
-  for (int i = 1; i < argc; i++) {
-    if (argv[i] == strstr(argv[i], "--min-working-set-size=")) {
-      const char* equals_sign = strchr(argv[i], '=');
-      min_working_set_size = strtoul(equals_sign+1, nullptr, 10);
-    } else {
-      cerr << "unrecognized option: " << argv[i] << endl << endl;
-      show_usage_and_exit(argv[0]);
-    }
-  }
-
-  cout.precision(4);
-
-  print_cpuinfo();
-
-  cout << "benchmark parameters:" << endl;
-  cout << "pointer size: " << 8*sizeof(void*) << " bits" << endl;
-  cout << "scalar type: " << type_name<MatrixType::Scalar>() << endl;
-  cout << "packet size: " << internal::packet_traits<MatrixType::Scalar>::size << endl;
-  cout << "minsize = " << minsize << endl;
-  cout << "maxsize = " << maxsize << endl;
-  cout << "measurement_repetitions = " << measurement_repetitions << endl;
-  cout << "min_accurate_time = " << min_accurate_time << endl;
-  cout << "min_working_set_size = " << min_working_set_size;
-  if (min_working_set_size == 0) {
-    cout << " (try to outsize caches)";
-  }
-  cout << endl << endl;
-
-
-  // assemble the array of benchmarks without running them at first
-  vector<benchmark_t> benchmarks;
-  for (int repetition = 0; repetition < measurement_repetitions; repetition++) {
-    for (size_t ksize = minsize; ksize <= maxsize; ksize *= 2) {
-      for (size_t msize = minsize; msize <= maxsize; msize *= 2) {
-        for (size_t nsize = minsize; nsize <= maxsize; nsize *= 2) {
-          for (size_t kblock = minsize; kblock <= ksize; kblock *= 2) {
-            for (size_t mblock = minsize; mblock <= msize; mblock *= 2) {
-              for (size_t nblock = minsize; nblock <= nsize; nblock *= 2) {
-                benchmark_t b(ksize, msize, nsize, kblock, mblock, nblock);
-                benchmarks.push_back(b);
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-
   // randomly shuffling benchmarks allows us to get accurate enough progress info,
   // as now the cheap/expensive benchmarks are randomly mixed so they average out.
   random_shuffle(benchmarks.begin(), benchmarks.end());
@@ -315,14 +307,23 @@ int main(int argc, char* argv[])
 
       if (i > 10) {
         cerr << ", ETA ";
-        float eta = float(time_now - time_start) * (1.0f - ratio_done) / ratio_done;
-        if (eta > 3600)
-          cerr << eta/3600 << " hours";
-        else if (eta > 60)
-          cerr << eta/60 << " minutes";
-        else cerr << eta << " seconds";
+        int eta = int(float(time_now - time_start) * (1.0f - ratio_done) / ratio_done);
+        int eta_remainder = eta;
+        if (eta_remainder > 3600) {
+          int hours = eta_remainder / 3600;
+          cerr << hours << " h ";
+          eta_remainder -= hours * 3600;
+        }
+        if (eta_remainder > 60) {
+          int minutes = eta_remainder / 60;
+          cerr << minutes << " min ";
+          eta_remainder -= minutes * 60;
+        }
+        if (eta < 600 && eta_remainder) {
+          cerr << eta_remainder << " s";
+        }
       }
-      cerr << "                                              \r" << flush;
+      cerr << "                                                \r" << flush;
     }
 
     // This is where we actually run a benchmark!
@@ -348,9 +349,116 @@ int main(int argc, char* argv[])
     }
   }
 
-  // Output data.
-  cout << "BEGIN MEASUREMENTS" << endl;
-  for (auto it = best_benchmarks.begin(); it != best_benchmarks.end(); ++it) {
-    cout << *it << endl;
-  }
+  // keep and return only the best benchmarks
+  benchmarks = best_benchmarks;
+}
+
+struct measure_all_pot_sizes_action_t : action_t
+{
+  virtual const char* invokation_name() const { return "measure-all-pot-sizes"; }
+  virtual void run() const
+  {
+    vector<benchmark_t> benchmarks;
+    for (int repetition = 0; repetition < measurement_repetitions; repetition++) {
+      for (size_t ksize = minsize; ksize <= maxsize; ksize *= 2) {
+        for (size_t msize = minsize; msize <= maxsize; msize *= 2) {
+          for (size_t nsize = minsize; nsize <= maxsize; nsize *= 2) {
+            for (size_t kblock = minsize; kblock <= ksize; kblock *= 2) {
+              for (size_t mblock = minsize; mblock <= msize; mblock *= 2) {
+                for (size_t nblock = minsize; nblock <= nsize; nblock *= 2) {
+                  benchmarks.emplace_back(ksize, msize, nsize, kblock, mblock, nblock);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+
+    run_benchmarks(benchmarks);
+
+    cout << "BEGIN MEASUREMENTS ALL POT SIZES" << endl;
+    for (auto it = benchmarks.begin(); it != benchmarks.end(); ++it) {
+      cout << *it << endl;
+    }
+  }
+};
+
+struct measure_default_sizes_action_t : action_t
+{
+  virtual const char* invokation_name() const { return "measure-default-sizes"; }
+  virtual void run() const
+  {
+    vector<benchmark_t> benchmarks;
+    for (int repetition = 0; repetition < measurement_repetitions; repetition++) {
+      for (size_t ksize = minsize; ksize <= maxsize; ksize *= 2) {
+        for (size_t msize = minsize; msize <= maxsize; msize *= 2) {
+          for (size_t nsize = minsize; nsize <= maxsize; nsize *= 2) {
+            benchmarks.emplace_back(ksize, msize, nsize);
+          }
+        }
+      }
+    }
+
+    run_benchmarks(benchmarks);
+
+    cout << "BEGIN MEASUREMENTS DEFAULT SIZES" << endl;
+    for (auto it = benchmarks.begin(); it != benchmarks.end(); ++it) {
+      cout << *it << endl;
+    }
+  }
+};
+
+int main(int argc, char* argv[])
+{
+  cout.precision(4);
+  cerr.precision(4);
+
+  vector<unique_ptr<action_t>> available_actions;
+  available_actions.emplace_back(new measure_all_pot_sizes_action_t);
+  available_actions.emplace_back(new measure_default_sizes_action_t);
+
+  auto action = available_actions.end();
+
+  if (argc <= 1) {
+    show_usage_and_exit(argc, argv, available_actions);
+  }
+  for (auto it = available_actions.begin(); it != available_actions.end(); ++it) {
+    if (!strcmp(argv[1], (*it)->invokation_name())) {
+      action = it;
+      break;
+    }
+  }
+
+  if (action == available_actions.end()) {
+    show_usage_and_exit(argc, argv, available_actions);
+  }
+
+  for (int i = 2; i < argc; i++) {
+    if (argv[i] == strstr(argv[i], "--min-working-set-size=")) {
+      const char* equals_sign = strchr(argv[i], '=');
+      min_working_set_size = strtoul(equals_sign+1, nullptr, 10);
+    } else {
+      cerr << "unrecognized option: " << argv[i] << endl << endl;
+      show_usage_and_exit(argc, argv, available_actions);
+    }
+  }
+
+  print_cpuinfo();
+
+  cout << "benchmark parameters:" << endl;
+  cout << "pointer size: " << 8*sizeof(void*) << " bits" << endl;
+  cout << "scalar type: " << type_name<MatrixType::Scalar>() << endl;
+  cout << "packet size: " << internal::packet_traits<MatrixType::Scalar>::size << endl;
+  cout << "minsize = " << minsize << endl;
+  cout << "maxsize = " << maxsize << endl;
+  cout << "measurement_repetitions = " << measurement_repetitions << endl;
+  cout << "min_accurate_time = " << min_accurate_time << endl;
+  cout << "min_working_set_size = " << min_working_set_size;
+  if (min_working_set_size == 0) {
+    cout << " (try to outsize caches)";
+  }
+  cout << endl << endl;
+
+  (*action)->run();
 }
diff --git a/doc/CustomizingEigen.dox b/doc/CustomizingEigen.dox
index 0850863aa..cb25f4ec9 100644
--- a/doc/CustomizingEigen.dox
+++ b/doc/CustomizingEigen.dox
@@ -157,6 +157,67 @@ inline adouble abs2(const adouble& x)  { return x*x; }
 #endif // ADOLCSUPPORT_H
 \endcode
 
+This other example adds support for the \c mpq_class type from <a href="https://gmplib.org/">GMP</a>. It shows in particular how to change the way Eigen picks the best pivot during LU factorization. It selects the coefficient with the highest score, where the score is by default the absolute value of a number, but we can define a different score, for instance to prefer pivots with a more compact representation (this is an example, not a recommendation). Note that the scores should always be non-negative and only zero is allowed to have a score of zero. Also, this can interact badly with thresholds for inexact scalar types.
+
+\code
+#include <gmpxx.h>
+#include <Eigen/Core>
+#include <boost/operators.hpp>
+
+namespace Eigen {
+  template<class> struct NumTraits;
+  template<> struct NumTraits<mpq_class>
+  {
+    typedef mpq_class Real;
+    typedef mpq_class NonInteger;
+    typedef mpq_class Nested;
+
+    static inline Real epsilon() { return 0; }
+    static inline Real dummy_precision() { return 0; }
+
+    enum {
+      IsInteger = 0,
+      IsSigned = 1,
+      IsComplex = 0,
+      RequireInitialization = 1,
+      ReadCost = 6,
+      AddCost = 150,
+      MulCost = 100
+    };
+  };
+
+  namespace internal {
+    template<>
+      struct significant_decimals_impl<mpq_class>
+      {
+	// Infinite precision when printing
+	static inline int run() { return 0; }
+      };
+
+    template<> struct scalar_score_coeff_op<mpq_class> {
+      struct result_type : boost::totally_ordered1<result_type> {
+	std::size_t len;
+	result_type(int i = 0) : len(i) {} // Eigen uses Score(0) and Score()
+	result_type(mpq_class const& q) :
+	  len(mpz_size(q.get_num_mpz_t())+
+	      mpz_size(q.get_den_mpz_t())-1) {}
+	friend bool operator<(result_type x, result_type y) {
+	  // 0 is the worst possible pivot
+	  if (x.len == 0) return y.len > 0;
+	  if (y.len == 0) return false;
+	  // Prefer a pivot with a small representation
+	  return x.len > y.len;
+	}
+	friend bool operator==(result_type x, result_type y) {
+	  // Only used to test if the score is 0
+	  return x.len == y.len;
+	}
+      };
+      result_type operator()(mpq_class const& x) const { return x; }
+    };
+  }
+}
+\endcode
 
 \sa \ref TopicPreprocessorDirectives
 
diff --git a/unsupported/test/cxx11_tensor_expr.cpp b/unsupported/test/cxx11_tensor_expr.cpp
index 695565e9b..8389e9840 100644
--- a/unsupported/test/cxx11_tensor_expr.cpp
+++ b/unsupported/test/cxx11_tensor_expr.cpp
@@ -260,7 +260,7 @@ static void test_type_casting()
   mat1.setRandom();
   mat2.setRandom();
 
-  mat3 = mat1.template cast<double>();
+  mat3 = mat1.cast<double>();
   for (int i = 0; i < 2; ++i) {
     for (int j = 0; j < 3; ++j) {
       for (int k = 0; k < 7; ++k) {
@@ -269,7 +269,7 @@ static void test_type_casting()
     }
   }
 
-  mat3 = mat2.template cast<double>();
+  mat3 = mat2.cast<double>();
   for (int i = 0; i < 2; ++i) {
     for (int j = 0; j < 3; ++j) {
       for (int k = 0; k < 7; ++k) {