Thread pool

2026-04-10 11:34:33 +08:00 · 2023-05-05 16:23:34 +00:00
parent 9eb8e2afba
commit 94f57867fe
25 changed files with 937 additions and 446 deletions
--- a/Eigen/src/Core/util/EmulateArray.h
+++ b/Eigen/src/Core/util/EmulateArray.h
@@ -0,0 +1,286 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_EMULATE_ARRAY_H
+#define EIGEN_EMULATE_ARRAY_H
+
+// CUDA doesn't support the STL containers, so we use our own instead.
+#if defined(EIGEN_GPUCC) || defined(EIGEN_AVOID_STL_ARRAY)
+
+namespace Eigen {
+template <typename T, size_t n> class array {
+
+ public:
+  typedef T value_type;
+  typedef T* iterator;
+  typedef const T* const_iterator;
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE iterator begin() { return values; }
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE const_iterator begin() const { return values; }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE iterator end() { return values + n; }
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE const_iterator end() const { return values + n; }
+
+
+#if !defined(EIGEN_GPUCC)
+  typedef std::reverse_iterator<iterator> reverse_iterator;
+  typedef std::reverse_iterator<const_iterator> const_reverse_iterator;
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE reverse_iterator rbegin() { return reverse_iterator(end());}
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE const_reverse_iterator rbegin() const { return const_reverse_iterator(end()); }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE reverse_iterator rend() { return reverse_iterator(begin()); }
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE const_reverse_iterator rend() const { return const_reverse_iterator(begin()); }
+#endif
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE T& operator[] (size_t index) { eigen_internal_assert(index < size()); return values[index]; }
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE const T& operator[] (size_t index) const { eigen_internal_assert(index < size()); return values[index]; }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE T& at(size_t index) { eigen_assert(index < size()); return values[index]; }
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE const T& at(size_t index) const { eigen_assert(index < size()); return values[index]; }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE T& front() { return values[0]; }
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE const T& front() const { return values[0]; }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE T& back() { return values[n-1]; }
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE const T& back() const { return values[n-1]; }
+
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+  static std::size_t size() { return n; }
+
+  T values[n];
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE array() { }
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE array(const T& v) {
+    EIGEN_STATIC_ASSERT(n==1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    values[0] = v;
+  }
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE array(const T& v1, const T& v2) {
+    EIGEN_STATIC_ASSERT(n==2, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    values[0] = v1;
+    values[1] = v2;
+  }
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3) {
+    EIGEN_STATIC_ASSERT(n==3, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    values[0] = v1;
+    values[1] = v2;
+    values[2] = v3;
+  }
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3,
+                            const T& v4) {
+    EIGEN_STATIC_ASSERT(n==4, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    values[0] = v1;
+    values[1] = v2;
+    values[2] = v3;
+    values[3] = v4;
+  }
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4,
+                            const T& v5) {
+    EIGEN_STATIC_ASSERT(n==5, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    values[0] = v1;
+    values[1] = v2;
+    values[2] = v3;
+    values[3] = v4;
+    values[4] = v5;
+  }
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4,
+                            const T& v5, const T& v6) {
+    EIGEN_STATIC_ASSERT(n==6, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    values[0] = v1;
+    values[1] = v2;
+    values[2] = v3;
+    values[3] = v4;
+    values[4] = v5;
+    values[5] = v6;
+  }
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4,
+                            const T& v5, const T& v6, const T& v7) {
+    EIGEN_STATIC_ASSERT(n==7, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    values[0] = v1;
+    values[1] = v2;
+    values[2] = v3;
+    values[3] = v4;
+    values[4] = v5;
+    values[5] = v6;
+    values[6] = v7;
+  }
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE array(
+      const T& v1, const T& v2, const T& v3, const T& v4,
+      const T& v5, const T& v6, const T& v7, const T& v8) {
+    EIGEN_STATIC_ASSERT(n==8, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    values[0] = v1;
+    values[1] = v2;
+    values[2] = v3;
+    values[3] = v4;
+    values[4] = v5;
+    values[5] = v6;
+    values[6] = v7;
+    values[7] = v8;
+  }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE array(std::initializer_list<T> l) {
+    eigen_assert(l.size() == n);
+    internal::smart_copy(l.begin(), l.end(), values);
+  }
+};
+
+
+// Specialize array for zero size
+template <typename T> class array<T, 0> {
+ public:
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE T& operator[] (size_t) {
+    eigen_assert(false && "Can't index a zero size array");
+    return dummy;
+  }
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE const T& operator[] (size_t) const {
+    eigen_assert(false && "Can't index a zero size array");
+    return dummy;
+  }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE T& front() {
+    eigen_assert(false && "Can't index a zero size array");
+    return dummy;
+  }
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE const T& front() const {
+    eigen_assert(false && "Can't index a zero size array");
+    return dummy;
+  }
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE T& back() {
+    eigen_assert(false && "Can't index a zero size array");
+    return dummy;
+  }
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE const T& back() const {
+    eigen_assert(false && "Can't index a zero size array");
+    return dummy;
+  }
+
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::size_t size() { return 0; }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE array() : dummy() { }
+
+  EIGEN_DEVICE_FUNC array(std::initializer_list<T> l) : dummy() {
+    EIGEN_UNUSED_VARIABLE(l);
+    eigen_assert(l.size() == 0);
+  }
+
+ private:
+  T dummy;
+};
+
+// Comparison operator
+// Todo: implement !=, <, <=, >,  and >=
+template<class T, std::size_t N>
+EIGEN_DEVICE_FUNC bool operator==(const array<T,N>& lhs, const array<T,N>& rhs) {
+  for (std::size_t i = 0; i < N; ++i) {
+    if (lhs[i] != rhs[i]) {
+      return false;
+    }
+  }
+  return true;
+}
+
+
+namespace internal {
+template<std::size_t I_, class T, std::size_t N>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T& array_get(array<T,N>& a) {
+  return a[I_];
+}
+template<std::size_t I_, class T, std::size_t N>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& array_get(const array<T,N>& a) {
+  return a[I_];
+}
+
+template<class T, std::size_t N> struct array_size<array<T,N> > {
+  enum { value = N };
+};
+template<class T, std::size_t N> struct array_size<array<T,N>& > {
+  enum { value = N };
+};
+template<class T, std::size_t N> struct array_size<const array<T,N> > {
+  enum { value = N };
+};
+template<class T, std::size_t N> struct array_size<const array<T,N>& > {
+  enum { value = N };
+};
+
+}  // end namespace internal
+}  // end namespace Eigen
+
+#else
+
+// The compiler supports c++11, and we're not targeting cuda: use std::array as Eigen::array
+#include <array>
+
+namespace Eigen {
+
+template <typename T, std::size_t N> using array = std::array<T, N>;
+
+namespace internal {
+/* std::get is only constexpr in C++14, not yet in C++11
+ *     - libstdc++ from version 4.7 onwards has it nevertheless,
+ *                                          so use that
+ *     - libstdc++ older versions: use _M_instance directly
+ *     - libc++ all versions so far: use __elems_ directly
+ *     - all other libs: use std::get to be portable, but
+ *                       this may not be constexpr
+ */
+#if defined(__GLIBCXX__) && __GLIBCXX__ < 20120322
+#define STD_GET_ARR_HACK             a._M_instance[I_]
+#elif defined(_LIBCPP_VERSION)
+#define STD_GET_ARR_HACK             a.__elems_[I_]
+#else
+#define STD_GET_ARR_HACK             std::template get<I_, T, N>(a)
+#endif
+
+template<std::size_t I_, class T, std::size_t N> constexpr inline T&       array_get(std::array<T,N>&       a) { return (T&)       STD_GET_ARR_HACK; }
+template<std::size_t I_, class T, std::size_t N> constexpr inline T&&      array_get(std::array<T,N>&&      a) { return (T&&)      STD_GET_ARR_HACK; }
+template<std::size_t I_, class T, std::size_t N> constexpr inline T const& array_get(std::array<T,N> const& a) { return (T const&) STD_GET_ARR_HACK; }
+
+#undef STD_GET_ARR_HACK
+
+}  // end namespace internal
+}  // end namespace Eigen
+
+#endif
+
+#endif  // EIGEN_EMULATE_ARRAY_H
--- a/Eigen/src/Core/util/MaxSizeVector.h
+++ b/Eigen/src/Core/util/MaxSizeVector.h
@@ -0,0 +1,158 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_FIXEDSIZEVECTOR_H
+#define EIGEN_FIXEDSIZEVECTOR_H
+
+namespace Eigen {
+
+/** \class MaxSizeVector
+  * \ingroup Core
+  *
+  * \brief The MaxSizeVector class.
+  *
+  * The %MaxSizeVector provides a subset of std::vector functionality.
+  *
+  * The goal is to provide basic std::vector operations when using
+  * std::vector is not an option (e.g. on GPU or when compiling using
+  * FMA/AVX, as this can cause either compilation failures or illegal
+  * instruction failures).
+  *
+  * Beware: The constructors are not API compatible with these of
+  * std::vector.
+  */
+template <typename T>
+class MaxSizeVector {
+  static const size_t alignment = internal::plain_enum_max(EIGEN_ALIGNOF(T), sizeof(void*));
+ public:
+  // Construct a new MaxSizeVector, reserve n elements.
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit MaxSizeVector(size_t n)
+      : reserve_(n), size_(0),
+        data_(static_cast<T*>(internal::handmade_aligned_malloc(n * sizeof(T), alignment))) {
+  }
+
+  // Construct a new MaxSizeVector, reserve and resize to n.
+  // Copy the init value to all elements.
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  MaxSizeVector(size_t n, const T& init)
+      : reserve_(n), size_(n),
+        data_(static_cast<T*>(internal::handmade_aligned_malloc(n * sizeof(T), alignment))) {
+    size_t i = 0;
+    EIGEN_TRY
+    {
+      for(; i < size_; ++i) { new (&data_[i]) T(init); }
+    }
+    EIGEN_CATCH(...)
+    {
+      // Construction failed, destruct in reverse order:
+      for(; (i+1) > 0; --i) { data_[i-1].~T(); }
+      internal::handmade_aligned_free(data_);
+      EIGEN_THROW;
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  ~MaxSizeVector() {
+    for (size_t i = size_; i > 0; --i) {
+      data_[i-1].~T();
+    }
+    internal::handmade_aligned_free(data_);
+  }
+
+  void resize(size_t n) {
+    eigen_assert(n <= reserve_);
+    for (; size_ < n; ++size_) {
+      new (&data_[size_]) T;
+    }
+    for (; size_ > n; --size_) {
+      data_[size_-1].~T();
+    }
+    eigen_assert(size_ == n);
+  }
+
+  // Append new elements (up to reserved size).
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  void push_back(const T& t) {
+    eigen_assert(size_ < reserve_);
+    new (&data_[size_++]) T(t);
+  }
+
+  // For C++03 compatibility this only takes one argument
+  template<class X>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  void emplace_back(const X& x) {
+    eigen_assert(size_ < reserve_);
+    new (&data_[size_++]) T(x);
+  }
+
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  const T& operator[] (size_t i) const {
+    eigen_assert(i < size_);
+    return data_[i];
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  T& operator[] (size_t i) {
+    eigen_assert(i < size_);
+    return data_[i];
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  T& back() {
+    eigen_assert(size_ > 0);
+    return data_[size_ - 1];
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  const T& back() const {
+    eigen_assert(size_ > 0);
+    return data_[size_ - 1];
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  void pop_back() {
+    eigen_assert(size_ > 0);
+    data_[--size_].~T();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  size_t size() const { return size_; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  bool empty() const { return size_ == 0; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  T* data() { return data_; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  const T* data() const { return data_; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  T* begin() { return data_; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  T* end() { return data_ + size_; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  const T* begin() const { return data_; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  const T* end() const { return data_ + size_; }
+
+ private:
+  size_t reserve_;
+  size_t size_;
+  T* data_;
+};
+
+}  // namespace Eigen
+
+#endif  // EIGEN_FIXEDSIZEVECTOR_H
--- a/Eigen/src/Core/util/Meta.h
+++ b/Eigen/src/Core/util/Meta.h
@@ -27,6 +27,8 @@

 #endif

+#include "EmulateArray.h"
+
 // Define portable (u)int{32,64} types
 #include <cstdint>

@@ -563,6 +565,513 @@ using std::is_constant_evaluated;
 constexpr bool is_constant_evaluated() { return false; }
 #endif

+template<typename... tt>
+struct type_list { constexpr static int count = sizeof...(tt); };
+
+template<typename t, typename... tt>
+struct type_list<t, tt...> { constexpr static int count = sizeof...(tt) + 1; typedef t first_type; };
+
+template<typename T, T... nn>
+struct numeric_list { constexpr static std::size_t count = sizeof...(nn); };
+
+template<typename T, T n, T... nn>
+struct numeric_list<T, n, nn...> { static constexpr std::size_t count = sizeof...(nn) + 1;
+                                   static constexpr T first_value = n; };
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+/* numeric list constructors
+ *
+ * equivalencies:
+ *     constructor                                              result
+ *     typename gen_numeric_list<int, 5>::type                  numeric_list<int, 0,1,2,3,4>
+ *     typename gen_numeric_list_reversed<int, 5>::type         numeric_list<int, 4,3,2,1,0>
+ *     typename gen_numeric_list_swapped_pair<int, 5,1,2>::type numeric_list<int, 0,2,1,3,4>
+ *     typename gen_numeric_list_repeated<int, 0, 5>::type      numeric_list<int, 0,0,0,0,0>
+ */
+
+template<typename T, std::size_t n, T start = 0, T... ii> struct gen_numeric_list                     : gen_numeric_list<T, n-1, start, start + n-1, ii...> {};
+template<typename T, T start, T... ii>                    struct gen_numeric_list<T, 0, start, ii...> { typedef numeric_list<T, ii...> type; };
+
+template<typename T, std::size_t n, T start = 0, T... ii> struct gen_numeric_list_reversed                     : gen_numeric_list_reversed<T, n-1, start, ii..., start + n-1> {};
+template<typename T, T start, T... ii>                    struct gen_numeric_list_reversed<T, 0, start, ii...> { typedef numeric_list<T, ii...> type; };
+
+template<typename T, std::size_t n, T a, T b, T start = 0, T... ii> struct gen_numeric_list_swapped_pair                           : gen_numeric_list_swapped_pair<T, n-1, a, b, start, (start + n-1) == a ? b : ((start + n-1) == b ? a : (start + n-1)), ii...> {};
+template<typename T, T a, T b, T start, T... ii>                    struct gen_numeric_list_swapped_pair<T, 0, a, b, start, ii...> { typedef numeric_list<T, ii...> type; };
+
+template<typename T, std::size_t n, T V, T... nn> struct gen_numeric_list_repeated                 : gen_numeric_list_repeated<T, n-1, V, V, nn...> {};
+template<typename T, T V, T... nn>                struct gen_numeric_list_repeated<T, 0, V, nn...> { typedef numeric_list<T, nn...> type; };
+
+/* list manipulation: concatenate */
+
+template<class a, class b> struct concat;
+
+template<typename... as, typename... bs> struct concat<type_list<as...>,       type_list<bs...>>        { typedef type_list<as..., bs...> type; };
+template<typename T, T... as, T... bs>   struct concat<numeric_list<T, as...>, numeric_list<T, bs...> > { typedef numeric_list<T, as..., bs...> type; };
+
+template<typename... p> struct mconcat;
+template<typename a>                             struct mconcat<a>           { typedef a type; };
+template<typename a, typename b>                 struct mconcat<a, b>        : concat<a, b> {};
+template<typename a, typename b, typename... cs> struct mconcat<a, b, cs...> : concat<a, typename mconcat<b, cs...>::type> {};
+
+/* list manipulation: extract slices */
+
+template<int n, typename x> struct take;
+template<int n, typename a, typename... as> struct take<n, type_list<a, as...>> : concat<type_list<a>, typename take<n-1, type_list<as...>>::type> {};
+template<int n>                             struct take<n, type_list<>>         { typedef type_list<> type; };
+template<typename a, typename... as>        struct take<0, type_list<a, as...>> { typedef type_list<> type; };
+template<>                                  struct take<0, type_list<>>         { typedef type_list<> type; };
+
+template<typename T, int n, T a, T... as> struct take<n, numeric_list<T, a, as...>> : concat<numeric_list<T, a>, typename take<n-1, numeric_list<T, as...>>::type> {};
+// XXX The following breaks in gcc-11, and is invalid anyways.
+// template<typename T, int n>               struct take<n, numeric_list<T>>           { typedef numeric_list<T> type; };
+template<typename T, T a, T... as>        struct take<0, numeric_list<T, a, as...>> { typedef numeric_list<T> type; };
+template<typename T>                      struct take<0, numeric_list<T>>           { typedef numeric_list<T> type; };
+
+template<typename T, int n, T... ii>      struct h_skip_helper_numeric;
+template<typename T, int n, T i, T... ii> struct h_skip_helper_numeric<T, n, i, ii...> : h_skip_helper_numeric<T, n-1, ii...> {};
+template<typename T, T i, T... ii>        struct h_skip_helper_numeric<T, 0, i, ii...> { typedef numeric_list<T, i, ii...> type; };
+template<typename T, int n>               struct h_skip_helper_numeric<T, n>           { typedef numeric_list<T> type; };
+template<typename T>                      struct h_skip_helper_numeric<T, 0>           { typedef numeric_list<T> type; };
+
+template<int n, typename... tt>             struct h_skip_helper_type;
+template<int n, typename t, typename... tt> struct h_skip_helper_type<n, t, tt...> : h_skip_helper_type<n-1, tt...> {};
+template<typename t, typename... tt>        struct h_skip_helper_type<0, t, tt...> { typedef type_list<t, tt...> type; };
+template<int n>                             struct h_skip_helper_type<n>           { typedef type_list<> type; };
+template<>                                  struct h_skip_helper_type<0>           { typedef type_list<> type; };
+#endif //not EIGEN_PARSED_BY_DOXYGEN
+
+template<int n>
+struct h_skip {
+  template<typename T, T... ii>
+  constexpr static EIGEN_STRONG_INLINE typename h_skip_helper_numeric<T, n, ii...>::type helper(numeric_list<T, ii...>) { return typename h_skip_helper_numeric<T, n, ii...>::type(); }
+  template<typename... tt>
+  constexpr static EIGEN_STRONG_INLINE typename h_skip_helper_type<n, tt...>::type helper(type_list<tt...>) { return typename h_skip_helper_type<n, tt...>::type(); }
+};
+
+template<int n, typename a> struct skip { typedef decltype(h_skip<n>::helper(a())) type; };
+
+template<int start, int count, typename a> struct slice : take<count, typename skip<start, a>::type> {};
+
+/* list manipulation: retrieve single element from list */
+
+template<int n, typename x> struct get;
+
+template<int n, typename a, typename... as>               struct get<n, type_list<a, as...>>   : get<n-1, type_list<as...>> {};
+template<typename a, typename... as>                      struct get<0, type_list<a, as...>>   { typedef a type; };
+
+template<typename T, int n, T a, T... as>                        struct get<n, numeric_list<T, a, as...>>   : get<n-1, numeric_list<T, as...>> {};
+template<typename T, T a, T... as>                               struct get<0, numeric_list<T, a, as...>>   { constexpr static T value = a; };
+
+template<std::size_t n, typename T, T a, T... as> constexpr T       array_get(const numeric_list<T, a, as...>&) {
+   return get<(int)n, numeric_list<T, a, as...>>::value;
+}
+
+/* always get type, regardless of dummy; good for parameter pack expansion */
+
+template<typename T, T dummy, typename t> struct id_numeric  { typedef t type; };
+template<typename dummy, typename t>      struct id_type     { typedef t type; };
+
+/* equality checking, flagged version */
+
+template<typename a, typename b> struct is_same_gf : is_same<a, b> { constexpr static int global_flags = 0; };
+
+/* apply_op to list */
+
+template<
+  bool from_left, // false
+  template<typename, typename> class op,
+  typename additional_param,
+  typename... values
+>
+struct h_apply_op_helper                                        { typedef type_list<typename op<values, additional_param>::type...> type; };
+template<
+  template<typename, typename> class op,
+  typename additional_param,
+  typename... values
+>
+struct h_apply_op_helper<true, op, additional_param, values...> { typedef type_list<typename op<additional_param, values>::type...> type; };
+
+template<
+  bool from_left,
+  template<typename, typename> class op,
+  typename additional_param
+>
+struct h_apply_op
+{
+  template<typename... values>
+  constexpr static typename h_apply_op_helper<from_left, op, additional_param, values...>::type helper(type_list<values...>)
+  { return typename h_apply_op_helper<from_left, op, additional_param, values...>::type(); }
+};
+
+template<
+  template<typename, typename> class op,
+  typename additional_param,
+  typename a
+>
+struct apply_op_from_left { typedef decltype(h_apply_op<true, op, additional_param>::helper(a())) type; };
+
+template<
+  template<typename, typename> class op,
+  typename additional_param,
+  typename a
+>
+struct apply_op_from_right { typedef decltype(h_apply_op<false, op, additional_param>::helper(a())) type; };
+
+/* see if an element is in a list */
+
+template<
+  template<typename, typename> class test,
+  typename check_against,
+  typename h_list,
+  bool last_check_positive = false
+>
+struct contained_in_list;
+
+template<
+  template<typename, typename> class test,
+  typename check_against,
+  typename h_list
+>
+struct contained_in_list<test, check_against, h_list, true>
+{
+  constexpr static bool value = true;
+};
+
+template<
+  template<typename, typename> class test,
+  typename check_against,
+  typename a,
+  typename... as
+>
+struct contained_in_list<test, check_against, type_list<a, as...>, false> : contained_in_list<test, check_against, type_list<as...>, test<check_against, a>::value> {};
+
+template<
+  template<typename, typename> class test,
+  typename check_against,
+  typename... empty
+>
+struct contained_in_list<test, check_against, type_list<empty...>, false> { constexpr static bool value = false; };
+
+/* see if an element is in a list and check for global flags */
+
+template<
+  template<typename, typename> class test,
+  typename check_against,
+  typename h_list,
+  int default_flags = 0,
+  bool last_check_positive = false,
+  int last_check_flags = default_flags
+>
+struct contained_in_list_gf;
+
+template<
+  template<typename, typename> class test,
+  typename check_against,
+  typename h_list,
+  int default_flags,
+  int last_check_flags
+>
+struct contained_in_list_gf<test, check_against, h_list, default_flags, true, last_check_flags>
+{
+  constexpr static bool value = true;
+  constexpr static int global_flags = last_check_flags;
+};
+
+template<
+  template<typename, typename> class test,
+  typename check_against,
+  typename a,
+  typename... as,
+  int default_flags,
+  int last_check_flags
+>
+struct contained_in_list_gf<test, check_against, type_list<a, as...>, default_flags, false, last_check_flags> : contained_in_list_gf<test, check_against, type_list<as...>, default_flags, test<check_against, a>::value, test<check_against, a>::global_flags> {};
+
+template<
+  template<typename, typename> class test,
+  typename check_against,
+  typename... empty,
+  int default_flags,
+  int last_check_flags
+>
+struct contained_in_list_gf<test, check_against, type_list<empty...>, default_flags, false, last_check_flags> { constexpr static bool value = false; constexpr static int global_flags = default_flags; };
+
+/* generic reductions */
+
+template<
+  typename Reducer,
+  typename... Ts
+> struct reduce;
+
+template<
+  typename Reducer
+> struct reduce<Reducer>
+{
+  EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE int run() { return Reducer::Identity; }
+};
+
+template<
+  typename Reducer,
+  typename A
+> struct reduce<Reducer, A>
+{
+  EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE A run(A a) { return a; }
+};
+
+template<
+  typename Reducer,
+  typename A,
+  typename... Ts
+> struct reduce<Reducer, A, Ts...>
+{
+  EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE auto run(A a, Ts... ts) -> decltype(Reducer::run(a, reduce<Reducer, Ts...>::run(ts...))) {
+    return Reducer::run(a, reduce<Reducer, Ts...>::run(ts...));
+  }
+};
+
+/* generic binary operations */
+
+struct sum_op           {
+  template<typename A, typename B> EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a + b)   { return a + b;   }
+  static constexpr int Identity = 0;
+};
+struct product_op       {
+  template<typename A, typename B> EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a * b)   { return a * b;   }
+  static constexpr int Identity = 1;
+};
+
+struct logical_and_op   { template<typename A, typename B> constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a && b)  { return a && b;  } };
+struct logical_or_op    { template<typename A, typename B> constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a || b)  { return a || b;  } };
+
+struct equal_op         { template<typename A, typename B> constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a == b)  { return a == b;  } };
+struct not_equal_op     { template<typename A, typename B> constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a != b)  { return a != b;  } };
+struct lesser_op        { template<typename A, typename B> constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a < b)   { return a < b;   } };
+struct lesser_equal_op  { template<typename A, typename B> constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a <= b)  { return a <= b;  } };
+struct greater_op       { template<typename A, typename B> constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a > b)   { return a > b;   } };
+struct greater_equal_op { template<typename A, typename B> constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a >= b)  { return a >= b;  } };
+
+/* generic unary operations */
+
+struct not_op                { template<typename A> constexpr static EIGEN_STRONG_INLINE auto run(A a) -> decltype(!a)      { return !a;      } };
+struct negation_op           { template<typename A> constexpr static EIGEN_STRONG_INLINE auto run(A a) -> decltype(-a)      { return -a;      } };
+struct greater_equal_zero_op { template<typename A> constexpr static EIGEN_STRONG_INLINE auto run(A a) -> decltype(a >= 0)  { return a >= 0;  } };
+
+
+/* reductions for lists */
+
+// using auto -> return value spec makes ICC 13.0 and 13.1 crash here, so we have to hack it
+// together in front... (13.0 doesn't work with array_prod/array_reduce/... anyway, but 13.1
+// does...
+template<typename... Ts>
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE decltype(reduce<product_op, Ts...>::run((*((Ts*)0))...)) arg_prod(Ts... ts)
+{
+  return reduce<product_op, Ts...>::run(ts...);
+}
+
+template<typename... Ts>
+constexpr EIGEN_STRONG_INLINE decltype(reduce<sum_op, Ts...>::run((*((Ts*)0))...)) arg_sum(Ts... ts)
+{
+  return reduce<sum_op, Ts...>::run(ts...);
+}
+
+/* reverse arrays */
+
+template<typename Array, int... n>
+constexpr EIGEN_STRONG_INLINE Array h_array_reverse(Array arr, numeric_list<int, n...>)
+{
+  return {{array_get<sizeof...(n) - n - 1>(arr)...}};
+}
+
+template<typename T, std::size_t N>
+constexpr EIGEN_STRONG_INLINE array<T, N> array_reverse(array<T, N> arr)
+{
+  return h_array_reverse(arr, typename gen_numeric_list<int, N>::type());
+}
+
+
+/* generic array reductions */
+
+// can't reuse standard reduce() interface above because Intel's Compiler
+// *really* doesn't like it, so we just reimplement the stuff
+// (start from N - 1 and work down to 0 because specialization for
+// n == N - 1 also doesn't work in Intel's compiler, so it goes into
+// an infinite loop)
+template<typename Reducer, typename T, std::size_t N, std::size_t n = N - 1>
+struct h_array_reduce {
+  EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE auto run(array<T, N> arr, T identity) -> decltype(Reducer::run(h_array_reduce<Reducer, T, N, n - 1>::run(arr, identity), array_get<n>(arr)))
+  {
+    return Reducer::run(h_array_reduce<Reducer, T, N, n - 1>::run(arr, identity), array_get<n>(arr));
+  }
+};
+
+template<typename Reducer, typename T, std::size_t N>
+struct h_array_reduce<Reducer, T, N, 0>
+{
+  EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE T run(const array<T, N>& arr, T)
+  {
+    return array_get<0>(arr);
+  }
+};
+
+template<typename Reducer, typename T>
+struct h_array_reduce<Reducer, T, 0>
+{
+  EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE T run(const array<T, 0>&, T identity)
+  {
+    return identity;
+  }
+};
+
+template<typename Reducer, typename T, std::size_t N>
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE auto array_reduce(const array<T, N>& arr, T identity) -> decltype(h_array_reduce<Reducer, T, N>::run(arr, identity))
+{
+  return h_array_reduce<Reducer, T, N>::run(arr, identity);
+}
+
+/* standard array reductions */
+
+template<typename T, std::size_t N>
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE auto array_sum(const array<T, N>& arr) -> decltype(array_reduce<sum_op, T, N>(arr, static_cast<T>(0)))
+{
+  return array_reduce<sum_op, T, N>(arr, static_cast<T>(0));
+}
+
+template<typename T, std::size_t N>
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE auto array_prod(const array<T, N>& arr) -> decltype(array_reduce<product_op, T, N>(arr, static_cast<T>(1)))
+{
+  return array_reduce<product_op, T, N>(arr, static_cast<T>(1));
+}
+
+template<typename t>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const std::vector<t>& a) {
+  eigen_assert(a.size() > 0);
+  t prod = 1;
+  for (size_t i = 0; i < a.size(); ++i) { prod *= a[i]; }
+  return prod;
+}
+
+/* zip an array */
+
+template<typename Op, typename A, typename B, std::size_t N, int... n>
+constexpr EIGEN_STRONG_INLINE array<decltype(Op::run(A(), B())),N> h_array_zip(array<A, N> a, array<B, N> b, numeric_list<int, n...>)
+{
+  return array<decltype(Op::run(A(), B())),N>{{ Op::run(array_get<n>(a), array_get<n>(b))... }};
+}
+
+template<typename Op, typename A, typename B, std::size_t N>
+constexpr EIGEN_STRONG_INLINE array<decltype(Op::run(A(), B())),N> array_zip(array<A, N> a, array<B, N> b)
+{
+  return h_array_zip<Op>(a, b, typename gen_numeric_list<int, N>::type());
+}
+
+/* zip an array and reduce the result */
+
+template<typename Reducer, typename Op, typename A, typename B, std::size_t N, int... n>
+constexpr EIGEN_STRONG_INLINE auto h_array_zip_and_reduce(array<A, N> a, array<B, N> b, numeric_list<int, n...>) -> decltype(reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A(), B()))>::type...>::run(Op::run(array_get<n>(a), array_get<n>(b))...))
+{
+  return reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A(), B()))>::type...>::run(Op::run(array_get<n>(a), array_get<n>(b))...);
+}
+
+template<typename Reducer, typename Op, typename A, typename B, std::size_t N>
+constexpr EIGEN_STRONG_INLINE auto array_zip_and_reduce(array<A, N> a, array<B, N> b) -> decltype(h_array_zip_and_reduce<Reducer, Op, A, B, N>(a, b, typename gen_numeric_list<int, N>::type()))
+{
+  return h_array_zip_and_reduce<Reducer, Op, A, B, N>(a, b, typename gen_numeric_list<int, N>::type());
+}
+
+/* apply stuff to an array */
+
+template<typename Op, typename A, std::size_t N, int... n>
+constexpr EIGEN_STRONG_INLINE array<decltype(Op::run(A())),N> h_array_apply(array<A, N> a, numeric_list<int, n...>)
+{
+  return array<decltype(Op::run(A())),N>{{ Op::run(array_get<n>(a))... }};
+}
+
+template<typename Op, typename A, std::size_t N>
+constexpr EIGEN_STRONG_INLINE array<decltype(Op::run(A())),N> array_apply(array<A, N> a)
+{
+  return h_array_apply<Op>(a, typename gen_numeric_list<int, N>::type());
+}
+
+/* apply stuff to an array and reduce */
+
+template<typename Reducer, typename Op, typename A, std::size_t N, int... n>
+constexpr EIGEN_STRONG_INLINE auto h_array_apply_and_reduce(array<A, N> arr, numeric_list<int, n...>) -> decltype(reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A()))>::type...>::run(Op::run(array_get<n>(arr))...))
+{
+  return reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A()))>::type...>::run(Op::run(array_get<n>(arr))...);
+}
+
+template<typename Reducer, typename Op, typename A, std::size_t N>
+constexpr EIGEN_STRONG_INLINE auto array_apply_and_reduce(array<A, N> a) -> decltype(h_array_apply_and_reduce<Reducer, Op, A, N>(a, typename gen_numeric_list<int, N>::type()))
+{
+  return h_array_apply_and_reduce<Reducer, Op, A, N>(a, typename gen_numeric_list<int, N>::type());
+}
+
+/* repeat a value n times (and make an array out of it
+ * usage:
+ *   array<int, 16> = repeat<16>(42);
+ */
+
+template<int n>
+struct h_repeat
+{
+  template<typename t, int... ii>
+  constexpr static EIGEN_STRONG_INLINE array<t, n> run(t v, numeric_list<int, ii...>)
+  {
+    return {{ typename id_numeric<int, ii, t>::type(v)... }};
+  }
+};
+
+template<int n, typename t>
+constexpr array<t, n> repeat(t v) { return h_repeat<n>::run(v, typename gen_numeric_list<int, n>::type()); }
+
+/* instantiate a class by a C-style array */
+template<class InstType, typename ArrType, std::size_t N, bool Reverse, typename... Ps>
+struct h_instantiate_by_c_array;
+
+template<class InstType, typename ArrType, std::size_t N, typename... Ps>
+struct h_instantiate_by_c_array<InstType, ArrType, N, false, Ps...>
+{
+  static InstType run(ArrType* arr, Ps... args)
+  {
+    return h_instantiate_by_c_array<InstType, ArrType, N - 1, false, Ps..., ArrType>::run(arr + 1, args..., arr[0]);
+  }
+};
+
+template<class InstType, typename ArrType, std::size_t N, typename... Ps>
+struct h_instantiate_by_c_array<InstType, ArrType, N, true, Ps...>
+{
+  static InstType run(ArrType* arr, Ps... args)
+  {
+    return h_instantiate_by_c_array<InstType, ArrType, N - 1, false, ArrType, Ps...>::run(arr + 1, arr[0], args...);
+  }
+};
+
+template<class InstType, typename ArrType, typename... Ps>
+struct h_instantiate_by_c_array<InstType, ArrType, 0, false, Ps...>
+{
+  static InstType run(ArrType* arr, Ps... args)
+  {
+    (void)arr;
+    return InstType(args...);
+  }
+};
+
+template<class InstType, typename ArrType, typename... Ps>
+struct h_instantiate_by_c_array<InstType, ArrType, 0, true, Ps...>
+{
+  static InstType run(ArrType* arr, Ps... args)
+  {
+    (void)arr;
+    return InstType(args...);
+  }
+};
+
+template<class InstType, typename ArrType, std::size_t N, bool Reverse = false>
+InstType instantiate_by_c_array(ArrType* arr)
+{
+  return h_instantiate_by_c_array<InstType, ArrType, N, Reverse>::run(arr);
+}
+
 } // end namespace internal

 } // end namespace Eigen
--- a/Eigen/src/ThreadPool/Barrier.h
+++ b/Eigen/src/ThreadPool/Barrier.h
@@ -0,0 +1,69 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2018 Rasmus Munk Larsen <rmlarsen@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// Barrier is an object that allows one or more threads to wait until
+// Notify has been called a specified number of times.
+
+#ifndef EIGEN_CXX11_THREADPOOL_BARRIER_H
+#define EIGEN_CXX11_THREADPOOL_BARRIER_H
+
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+class Barrier {
+ public:
+  Barrier(unsigned int count) : state_(count << 1), notified_(false) {
+    eigen_plain_assert(((count << 1) >> 1) == count);
+  }
+  ~Barrier() { eigen_plain_assert((state_ >> 1) == 0); }
+
+  void Notify() {
+    unsigned int v = state_.fetch_sub(2, std::memory_order_acq_rel) - 2;
+    if (v != 1) {
+      // Clear the lowest bit (waiter flag) and check that the original state
+      // value was not zero. If it was zero, it means that notify was called
+      // more times than the original count.
+      eigen_plain_assert(((v + 2) & ~1) != 0);
+      return;  // either count has not dropped to 0, or waiter is not waiting
+    }
+    std::unique_lock<std::mutex> l(mu_);
+    eigen_plain_assert(!notified_);
+    notified_ = true;
+    cv_.notify_all();
+  }
+
+  void Wait() {
+    unsigned int v = state_.fetch_or(1, std::memory_order_acq_rel);
+    if ((v >> 1) == 0) return;
+    std::unique_lock<std::mutex> l(mu_);
+    while (!notified_) {
+      cv_.wait(l);
+    }
+  }
+
+ private:
+  std::mutex mu_;
+  std::condition_variable cv_;
+  std::atomic<unsigned int> state_;  // low bit is waiter flag
+  bool notified_;
+};
+
+// Notification is an object that allows a user to to wait for another
+// thread to signal a notification that an event has occurred.
+//
+// Multiple threads can wait on the same Notification object,
+// but only one caller must call Notify() on the object.
+struct Notification : Barrier {
+  Notification() : Barrier(1){};
+};
+
+}  // namespace Eigen
+
+#endif  // EIGEN_CXX11_THREADPOOL_BARRIER_H
--- a/Eigen/src/ThreadPool/EventCount.h
+++ b/Eigen/src/ThreadPool/EventCount.h
@@ -0,0 +1,251 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Dmitry Vyukov <dvyukov@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_THREADPOOL_EVENTCOUNT_H
+#define EIGEN_CXX11_THREADPOOL_EVENTCOUNT_H
+
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+// EventCount allows to wait for arbitrary predicates in non-blocking
+// algorithms. Think of condition variable, but wait predicate does not need to
+// be protected by a mutex. Usage:
+// Waiting thread does:
+//
+//   if (predicate)
+//     return act();
+//   EventCount::Waiter& w = waiters[my_index];
+//   ec.Prewait(&w);
+//   if (predicate) {
+//     ec.CancelWait(&w);
+//     return act();
+//   }
+//   ec.CommitWait(&w);
+//
+// Notifying thread does:
+//
+//   predicate = true;
+//   ec.Notify(true);
+//
+// Notify is cheap if there are no waiting threads. Prewait/CommitWait are not
+// cheap, but they are executed only if the preceding predicate check has
+// failed.
+//
+// Algorithm outline:
+// There are two main variables: predicate (managed by user) and state_.
+// Operation closely resembles Dekker mutual algorithm:
+// https://en.wikipedia.org/wiki/Dekker%27s_algorithm
+// Waiting thread sets state_ then checks predicate, Notifying thread sets
+// predicate then checks state_. Due to seq_cst fences in between these
+// operations it is guaranteed than either waiter will see predicate change
+// and won't block, or notifying thread will see state_ change and will unblock
+// the waiter, or both. But it can't happen that both threads don't see each
+// other changes, which would lead to deadlock.
+class EventCount {
+ public:
+  class Waiter;
+
+  EventCount(MaxSizeVector<Waiter>& waiters)
+      : state_(kStackMask), waiters_(waiters) {
+    eigen_plain_assert(waiters.size() < (1 << kWaiterBits) - 1);
+  }
+
+  ~EventCount() {
+    // Ensure there are no waiters.
+    eigen_plain_assert(state_.load() == kStackMask);
+  }
+
+  // Prewait prepares for waiting.
+  // After calling Prewait, the thread must re-check the wait predicate
+  // and then call either CancelWait or CommitWait.
+  void Prewait() {
+    uint64_t state = state_.load(std::memory_order_relaxed);
+    for (;;) {
+      CheckState(state);
+      uint64_t newstate = state + kWaiterInc;
+      CheckState(newstate);
+      if (state_.compare_exchange_weak(state, newstate,
+                                       std::memory_order_seq_cst))
+        return;
+    }
+  }
+
+  // CommitWait commits waiting after Prewait.
+  void CommitWait(Waiter* w) {
+    eigen_plain_assert((w->epoch & ~kEpochMask) == 0);
+    w->state = Waiter::kNotSignaled;
+    const uint64_t me = (w - &waiters_[0]) | w->epoch;
+    uint64_t state = state_.load(std::memory_order_seq_cst);
+    for (;;) {
+      CheckState(state, true);
+      uint64_t newstate;
+      if ((state & kSignalMask) != 0) {
+        // Consume the signal and return immediately.
+        newstate = state - kWaiterInc - kSignalInc;
+      } else {
+        // Remove this thread from pre-wait counter and add to the waiter stack.
+        newstate = ((state & kWaiterMask) - kWaiterInc) | me;
+        w->next.store(state & (kStackMask | kEpochMask),
+                      std::memory_order_relaxed);
+      }
+      CheckState(newstate);
+      if (state_.compare_exchange_weak(state, newstate,
+                                       std::memory_order_acq_rel)) {
+        if ((state & kSignalMask) == 0) {
+          w->epoch += kEpochInc;
+          Park(w);
+        }
+        return;
+      }
+    }
+  }
+
+  // CancelWait cancels effects of the previous Prewait call.
+  void CancelWait() {
+    uint64_t state = state_.load(std::memory_order_relaxed);
+    for (;;) {
+      CheckState(state, true);
+      uint64_t newstate = state - kWaiterInc;
+      // We don't know if the thread was also notified or not,
+      // so we should not consume a signal unconditionally.
+      // Only if number of waiters is equal to number of signals,
+      // we know that the thread was notified and we must take away the signal.
+      if (((state & kWaiterMask) >> kWaiterShift) ==
+          ((state & kSignalMask) >> kSignalShift))
+        newstate -= kSignalInc;
+      CheckState(newstate);
+      if (state_.compare_exchange_weak(state, newstate,
+                                       std::memory_order_acq_rel))
+        return;
+    }
+  }
+
+  // Notify wakes one or all waiting threads.
+  // Must be called after changing the associated wait predicate.
+  void Notify(bool notifyAll) {
+    std::atomic_thread_fence(std::memory_order_seq_cst);
+    uint64_t state = state_.load(std::memory_order_acquire);
+    for (;;) {
+      CheckState(state);
+      const uint64_t waiters = (state & kWaiterMask) >> kWaiterShift;
+      const uint64_t signals = (state & kSignalMask) >> kSignalShift;
+      // Easy case: no waiters.
+      if ((state & kStackMask) == kStackMask && waiters == signals) return;
+      uint64_t newstate;
+      if (notifyAll) {
+        // Empty wait stack and set signal to number of pre-wait threads.
+        newstate =
+            (state & kWaiterMask) | (waiters << kSignalShift) | kStackMask;
+      } else if (signals < waiters) {
+        // There is a thread in pre-wait state, unblock it.
+        newstate = state + kSignalInc;
+      } else {
+        // Pop a waiter from list and unpark it.
+        Waiter* w = &waiters_[state & kStackMask];
+        uint64_t next = w->next.load(std::memory_order_relaxed);
+        newstate = (state & (kWaiterMask | kSignalMask)) | next;
+      }
+      CheckState(newstate);
+      if (state_.compare_exchange_weak(state, newstate,
+                                       std::memory_order_acq_rel)) {
+        if (!notifyAll && (signals < waiters))
+          return;  // unblocked pre-wait thread
+        if ((state & kStackMask) == kStackMask) return;
+        Waiter* w = &waiters_[state & kStackMask];
+        if (!notifyAll) w->next.store(kStackMask, std::memory_order_relaxed);
+        Unpark(w);
+        return;
+      }
+    }
+  }
+
+  class Waiter {
+    friend class EventCount;
+    // Align to 128 byte boundary to prevent false sharing with other Waiter
+    // objects in the same vector.
+    EIGEN_ALIGN_TO_BOUNDARY(128) std::atomic<uint64_t> next;
+    std::mutex mu;
+    std::condition_variable cv;
+    uint64_t epoch = 0;
+    unsigned state = kNotSignaled;
+    enum {
+      kNotSignaled,
+      kWaiting,
+      kSignaled,
+    };
+  };
+
+ private:
+  // State_ layout:
+  // - low kWaiterBits is a stack of waiters committed wait
+  //   (indexes in waiters_ array are used as stack elements,
+  //   kStackMask means empty stack).
+  // - next kWaiterBits is count of waiters in prewait state.
+  // - next kWaiterBits is count of pending signals.
+  // - remaining bits are ABA counter for the stack.
+  //   (stored in Waiter node and incremented on push).
+  static const uint64_t kWaiterBits = 14;
+  static const uint64_t kStackMask = (1ull << kWaiterBits) - 1;
+  static const uint64_t kWaiterShift = kWaiterBits;
+  static const uint64_t kWaiterMask = ((1ull << kWaiterBits) - 1)
+                                      << kWaiterShift;
+  static const uint64_t kWaiterInc = 1ull << kWaiterShift;
+  static const uint64_t kSignalShift = 2 * kWaiterBits;
+  static const uint64_t kSignalMask = ((1ull << kWaiterBits) - 1)
+                                      << kSignalShift;
+  static const uint64_t kSignalInc = 1ull << kSignalShift;
+  static const uint64_t kEpochShift = 3 * kWaiterBits;
+  static const uint64_t kEpochBits = 64 - kEpochShift;
+  static const uint64_t kEpochMask = ((1ull << kEpochBits) - 1) << kEpochShift;
+  static const uint64_t kEpochInc = 1ull << kEpochShift;
+  std::atomic<uint64_t> state_;
+  MaxSizeVector<Waiter>& waiters_;
+
+  static void CheckState(uint64_t state, bool waiter = false) {
+    static_assert(kEpochBits >= 20, "not enough bits to prevent ABA problem");
+    const uint64_t waiters = (state & kWaiterMask) >> kWaiterShift;
+    const uint64_t signals = (state & kSignalMask) >> kSignalShift;
+    eigen_plain_assert(waiters >= signals);
+    eigen_plain_assert(waiters < (1 << kWaiterBits) - 1);
+    eigen_plain_assert(!waiter || waiters > 0);
+    (void)waiters;
+    (void)signals;
+  }
+
+  void Park(Waiter* w) {
+    std::unique_lock<std::mutex> lock(w->mu);
+    while (w->state != Waiter::kSignaled) {
+      w->state = Waiter::kWaiting;
+      w->cv.wait(lock);
+    }
+  }
+
+  void Unpark(Waiter* w) {
+    for (Waiter* next; w; w = next) {
+      uint64_t wnext = w->next.load(std::memory_order_relaxed) & kStackMask;
+      next = wnext == kStackMask ? nullptr : &waiters_[wnext];
+      unsigned state;
+      {
+        std::unique_lock<std::mutex> lock(w->mu);
+        state = w->state;
+        w->state = Waiter::kSignaled;
+      }
+      // Avoid notifying if it wasn't waiting.
+      if (state == Waiter::kWaiting) w->cv.notify_one();
+    }
+  }
+
+  EventCount(const EventCount&) = delete;
+  void operator=(const EventCount&) = delete;
+};
+
+}  // namespace Eigen
+
+#endif  // EIGEN_CXX11_THREADPOOL_EVENTCOUNT_H
--- a/Eigen/src/ThreadPool/InternalHeaderCheck.h
+++ b/Eigen/src/ThreadPool/InternalHeaderCheck.h
@@ -0,0 +1,3 @@
+#ifndef EIGEN_THREADPOOL_MODULE_H
+#error "Please include unsupported/Eigen/CXX11/ThreadPool instead of including headers inside the src directory directly."
+#endif
--- a/Eigen/src/ThreadPool/NonBlockingThreadPool.h
+++ b/Eigen/src/ThreadPool/NonBlockingThreadPool.h
@@ -0,0 +1,488 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Dmitry Vyukov <dvyukov@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_THREADPOOL_NONBLOCKING_THREAD_POOL_H
+#define EIGEN_CXX11_THREADPOOL_NONBLOCKING_THREAD_POOL_H
+
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+template <typename Environment>
+class ThreadPoolTempl : public Eigen::ThreadPoolInterface {
+ public:
+  typedef typename Environment::Task Task;
+  typedef RunQueue<Task, 1024> Queue;
+
+  ThreadPoolTempl(int num_threads, Environment env = Environment())
+      : ThreadPoolTempl(num_threads, true, env) {}
+
+  ThreadPoolTempl(int num_threads, bool allow_spinning,
+                  Environment env = Environment())
+      : env_(env),
+        num_threads_(num_threads),
+        allow_spinning_(allow_spinning),
+        thread_data_(num_threads),
+        all_coprimes_(num_threads),
+        waiters_(num_threads),
+        global_steal_partition_(EncodePartition(0, num_threads_)),
+        blocked_(0),
+        spinning_(0),
+        done_(false),
+        cancelled_(false),
+        ec_(waiters_) {
+    waiters_.resize(num_threads_);
+    // Calculate coprimes of all numbers [1, num_threads].
+    // Coprimes are used for random walks over all threads in Steal
+    // and NonEmptyQueueIndex. Iteration is based on the fact that if we take
+    // a random starting thread index t and calculate num_threads - 1 subsequent
+    // indices as (t + coprime) % num_threads, we will cover all threads without
+    // repetitions (effectively getting a presudo-random permutation of thread
+    // indices).
+    eigen_plain_assert(num_threads_ < kMaxThreads);
+    for (int i = 1; i <= num_threads_; ++i) {
+      all_coprimes_.emplace_back(i);
+      ComputeCoprimes(i, &all_coprimes_.back());
+    }
+#ifndef EIGEN_THREAD_LOCAL
+    init_barrier_.reset(new Barrier(num_threads_));
+#endif
+    thread_data_.resize(num_threads_);
+    for (int i = 0; i < num_threads_; i++) {
+      SetStealPartition(i, EncodePartition(0, num_threads_));
+      thread_data_[i].thread.reset(
+          env_.CreateThread([this, i]() { WorkerLoop(i); }));
+    }
+#ifndef EIGEN_THREAD_LOCAL
+    // Wait for workers to initialize per_thread_map_. Otherwise we might race
+    // with them in Schedule or CurrentThreadId.
+    init_barrier_->Wait();
+#endif
+  }
+
+  ~ThreadPoolTempl() {
+    done_ = true;
+
+    // Now if all threads block without work, they will start exiting.
+    // But note that threads can continue to work arbitrary long,
+    // block, submit new work, unblock and otherwise live full life.
+    if (!cancelled_) {
+      ec_.Notify(true);
+    } else {
+      // Since we were cancelled, there might be entries in the queues.
+      // Empty them to prevent their destructor from asserting.
+      for (size_t i = 0; i < thread_data_.size(); i++) {
+        thread_data_[i].queue.Flush();
+      }
+    }
+    // Join threads explicitly (by destroying) to avoid destruction order within
+    // this class.
+    for (size_t i = 0; i < thread_data_.size(); ++i)
+      thread_data_[i].thread.reset();
+  }
+
+  void SetStealPartitions(const std::vector<std::pair<unsigned, unsigned>>& partitions) {
+    eigen_plain_assert(partitions.size() == static_cast<std::size_t>(num_threads_));
+
+    // Pass this information to each thread queue.
+    for (int i = 0; i < num_threads_; i++) {
+      const auto& pair = partitions[i];
+      unsigned start = pair.first, end = pair.second;
+      AssertBounds(start, end);
+      unsigned val = EncodePartition(start, end);
+      SetStealPartition(i, val);
+    }
+  }
+
+  void Schedule(std::function<void()> fn) EIGEN_OVERRIDE {
+    ScheduleWithHint(std::move(fn), 0, num_threads_);
+  }
+
+  void ScheduleWithHint(std::function<void()> fn, int start,
+                        int limit) override {
+    Task t = env_.CreateTask(std::move(fn));
+    PerThread* pt = GetPerThread();
+    if (pt->pool == this) {
+      // Worker thread of this pool, push onto the thread's queue.
+      Queue& q = thread_data_[pt->thread_id].queue;
+      t = q.PushFront(std::move(t));
+    } else {
+      // A free-standing thread (or worker of another pool), push onto a random
+      // queue.
+      eigen_plain_assert(start < limit);
+      eigen_plain_assert(limit <= num_threads_);
+      int num_queues = limit - start;
+      int rnd = Rand(&pt->rand) % num_queues;
+      eigen_plain_assert(start + rnd < limit);
+      Queue& q = thread_data_[start + rnd].queue;
+      t = q.PushBack(std::move(t));
+    }
+    // Note: below we touch this after making w available to worker threads.
+    // Strictly speaking, this can lead to a racy-use-after-free. Consider that
+    // Schedule is called from a thread that is neither main thread nor a worker
+    // thread of this pool. Then, execution of w directly or indirectly
+    // completes overall computations, which in turn leads to destruction of
+    // this. We expect that such scenario is prevented by program, that is,
+    // this is kept alive while any threads can potentially be in Schedule.
+    if (!t.f) {
+      ec_.Notify(false);
+    } else {
+      env_.ExecuteTask(t);  // Push failed, execute directly.
+    }
+  }
+
+  void Cancel() EIGEN_OVERRIDE {
+    cancelled_ = true;
+    done_ = true;
+
+    // Let each thread know it's been cancelled.
+#ifdef EIGEN_THREAD_ENV_SUPPORTS_CANCELLATION
+    for (size_t i = 0; i < thread_data_.size(); i++) {
+      thread_data_[i].thread->OnCancel();
+    }
+#endif
+
+    // Wake up the threads without work to let them exit on their own.
+    ec_.Notify(true);
+  }
+
+  int NumThreads() const EIGEN_FINAL { return num_threads_; }
+
+  int CurrentThreadId() const EIGEN_FINAL {
+    const PerThread* pt = const_cast<ThreadPoolTempl*>(this)->GetPerThread();
+    if (pt->pool == this) {
+      return pt->thread_id;
+    } else {
+      return -1;
+    }
+  }
+
+ private:
+  // Create a single atomic<int> that encodes start and limit information for
+  // each thread.
+  // We expect num_threads_ < 65536, so we can store them in a single
+  // std::atomic<unsigned>.
+  // Exposed publicly as static functions so that external callers can reuse
+  // this encode/decode logic for maintaining their own thread-safe copies of
+  // scheduling and steal domain(s).
+  static const int kMaxPartitionBits = 16;
+  static const int kMaxThreads = 1 << kMaxPartitionBits;
+
+  inline unsigned EncodePartition(unsigned start, unsigned limit) {
+    return (start << kMaxPartitionBits) | limit;
+  }
+
+  inline void DecodePartition(unsigned val, unsigned* start, unsigned* limit) {
+    *limit = val & (kMaxThreads - 1);
+    val >>= kMaxPartitionBits;
+    *start = val;
+  }
+
+  void AssertBounds(int start, int end) {
+    eigen_plain_assert(start >= 0);
+    eigen_plain_assert(start < end);  // non-zero sized partition
+    eigen_plain_assert(end <= num_threads_);
+  }
+
+  inline void SetStealPartition(size_t i, unsigned val) {
+    thread_data_[i].steal_partition.store(val, std::memory_order_relaxed);
+  }
+
+  inline unsigned GetStealPartition(int i) {
+    return thread_data_[i].steal_partition.load(std::memory_order_relaxed);
+  }
+
+  void ComputeCoprimes(int N, MaxSizeVector<unsigned>* coprimes) {
+    for (int i = 1; i <= N; i++) {
+      unsigned a = i;
+      unsigned b = N;
+      // If GCD(a, b) == 1, then a and b are coprimes.
+      while (b != 0) {
+        unsigned tmp = a;
+        a = b;
+        b = tmp % b;
+      }
+      if (a == 1) {
+        coprimes->push_back(i);
+      }
+    }
+  }
+
+  typedef typename Environment::EnvThread Thread;
+
+  struct PerThread {
+    constexpr PerThread() : pool(NULL), rand(0), thread_id(-1) {}
+    ThreadPoolTempl* pool;  // Parent pool, or null for normal threads.
+    uint64_t rand;          // Random generator state.
+    int thread_id;          // Worker thread index in pool.
+#ifndef EIGEN_THREAD_LOCAL
+    // Prevent false sharing.
+    char pad_[128];
+#endif
+  };
+
+  struct ThreadData {
+    constexpr ThreadData() : thread(), steal_partition(0), queue() {}
+    std::unique_ptr<Thread> thread;
+    std::atomic<unsigned> steal_partition;
+    Queue queue;
+  };
+
+  Environment env_;
+  const int num_threads_;
+  const bool allow_spinning_;
+  MaxSizeVector<ThreadData> thread_data_;
+  MaxSizeVector<MaxSizeVector<unsigned>> all_coprimes_;
+  MaxSizeVector<EventCount::Waiter> waiters_;
+  unsigned global_steal_partition_;
+  std::atomic<unsigned> blocked_;
+  std::atomic<bool> spinning_;
+  std::atomic<bool> done_;
+  std::atomic<bool> cancelled_;
+  EventCount ec_;
+#ifndef EIGEN_THREAD_LOCAL
+  std::unique_ptr<Barrier> init_barrier_;
+  std::mutex per_thread_map_mutex_;  // Protects per_thread_map_.
+  std::unordered_map<uint64_t, std::unique_ptr<PerThread>> per_thread_map_;
+#endif
+
+  // Main worker thread loop.
+  void WorkerLoop(int thread_id) {
+#ifndef EIGEN_THREAD_LOCAL
+    std::unique_ptr<PerThread> new_pt(new PerThread());
+    per_thread_map_mutex_.lock();
+    bool insertOK = per_thread_map_.emplace(GlobalThreadIdHash(), std::move(new_pt)).second;
+    eigen_plain_assert(insertOK);
+    EIGEN_UNUSED_VARIABLE(insertOK);
+    per_thread_map_mutex_.unlock();
+    init_barrier_->Notify();
+    init_barrier_->Wait();
+#endif
+    PerThread* pt = GetPerThread();
+    pt->pool = this;
+    pt->rand = GlobalThreadIdHash();
+    pt->thread_id = thread_id;
+    Queue& q = thread_data_[thread_id].queue;
+    EventCount::Waiter* waiter = &waiters_[thread_id];
+    // TODO(dvyukov,rmlarsen): The time spent in NonEmptyQueueIndex() is
+    // proportional to num_threads_ and we assume that new work is scheduled at
+    // a constant rate, so we set spin_count to 5000 / num_threads_. The
+    // constant was picked based on a fair dice roll, tune it.
+    const int spin_count =
+        allow_spinning_ && num_threads_ > 0 ? 5000 / num_threads_ : 0;
+    if (num_threads_ == 1) {
+      // For num_threads_ == 1 there is no point in going through the expensive
+      // steal loop. Moreover, since NonEmptyQueueIndex() calls PopBack() on the
+      // victim queues it might reverse the order in which ops are executed
+      // compared to the order in which they are scheduled, which tends to be
+      // counter-productive for the types of I/O workloads the single thread
+      // pools tend to be used for.
+      while (!cancelled_) {
+        Task t = q.PopFront();
+        for (int i = 0; i < spin_count && !t.f; i++) {
+          if (!cancelled_.load(std::memory_order_relaxed)) {
+            t = q.PopFront();
+          }
+        }
+        if (!t.f) {
+          if (!WaitForWork(waiter, &t)) {
+            return;
+          }
+        }
+        if (t.f) {
+          env_.ExecuteTask(t);
+        }
+      }
+    } else {
+      while (!cancelled_) {
+        Task t = q.PopFront();
+        if (!t.f) {
+          t = LocalSteal();
+          if (!t.f) {
+            t = GlobalSteal();
+            if (!t.f) {
+              // Leave one thread spinning. This reduces latency.
+              if (allow_spinning_ && !spinning_ && !spinning_.exchange(true)) {
+                for (int i = 0; i < spin_count && !t.f; i++) {
+                  if (!cancelled_.load(std::memory_order_relaxed)) {
+                    t = GlobalSteal();
+                  } else {
+                    return;
+                  }
+                }
+                spinning_ = false;
+              }
+              if (!t.f) {
+                if (!WaitForWork(waiter, &t)) {
+                  return;
+                }
+              }
+            }
+          }
+        }
+        if (t.f) {
+          env_.ExecuteTask(t);
+        }
+      }
+    }
+  }
+
+  // Steal tries to steal work from other worker threads in the range [start,
+  // limit) in best-effort manner.
+  Task Steal(unsigned start, unsigned limit) {
+    PerThread* pt = GetPerThread();
+    const size_t size = limit - start;
+    unsigned r = Rand(&pt->rand);
+    // Reduce r into [0, size) range, this utilizes trick from
+    // https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/
+    eigen_plain_assert(all_coprimes_[size - 1].size() < (1<<30));
+    unsigned victim = ((uint64_t)r * (uint64_t)size) >> 32;
+    unsigned index = ((uint64_t) all_coprimes_[size - 1].size() * (uint64_t)r) >> 32;
+    unsigned inc = all_coprimes_[size - 1][index];
+
+    for (unsigned i = 0; i < size; i++) {
+      eigen_plain_assert(start + victim < limit);
+      Task t = thread_data_[start + victim].queue.PopBack();
+      if (t.f) {
+        return t;
+      }
+      victim += inc;
+      if (victim >= size) {
+        victim -= size;
+      }
+    }
+    return Task();
+  }
+
+  // Steals work within threads belonging to the partition.
+  Task LocalSteal() {
+    PerThread* pt = GetPerThread();
+    unsigned partition = GetStealPartition(pt->thread_id);
+    // If thread steal partition is the same as global partition, there is no
+    // need to go through the steal loop twice.
+    if (global_steal_partition_ == partition) return Task();
+    unsigned start, limit;
+    DecodePartition(partition, &start, &limit);
+    AssertBounds(start, limit);
+
+    return Steal(start, limit);
+  }
+
+  // Steals work from any other thread in the pool.
+  Task GlobalSteal() {
+    return Steal(0, num_threads_);
+  }
+
+
+  // WaitForWork blocks until new work is available (returns true), or if it is
+  // time to exit (returns false). Can optionally return a task to execute in t
+  // (in such case t.f != nullptr on return).
+  bool WaitForWork(EventCount::Waiter* waiter, Task* t) {
+    eigen_plain_assert(!t->f);
+    // We already did best-effort emptiness check in Steal, so prepare for
+    // blocking.
+    ec_.Prewait();
+    // Now do a reliable emptiness check.
+    int victim = NonEmptyQueueIndex();
+    if (victim != -1) {
+      ec_.CancelWait();
+      if (cancelled_) {
+        return false;
+      } else {
+        *t = thread_data_[victim].queue.PopBack();
+        return true;
+      }
+    }
+    // Number of blocked threads is used as termination condition.
+    // If we are shutting down and all worker threads blocked without work,
+    // that's we are done.
+    blocked_++;
+    // TODO is blocked_ required to be unsigned?
+    if (done_ && blocked_ == static_cast<unsigned>(num_threads_)) {
+      ec_.CancelWait();
+      // Almost done, but need to re-check queues.
+      // Consider that all queues are empty and all worker threads are preempted
+      // right after incrementing blocked_ above. Now a free-standing thread
+      // submits work and calls destructor (which sets done_). If we don't
+      // re-check queues, we will exit leaving the work unexecuted.
+      if (NonEmptyQueueIndex() != -1) {
+        // Note: we must not pop from queues before we decrement blocked_,
+        // otherwise the following scenario is possible. Consider that instead
+        // of checking for emptiness we popped the only element from queues.
+        // Now other worker threads can start exiting, which is bad if the
+        // work item submits other work. So we just check emptiness here,
+        // which ensures that all worker threads exit at the same time.
+        blocked_--;
+        return true;
+      }
+      // Reached stable termination state.
+      ec_.Notify(true);
+      return false;
+    }
+    ec_.CommitWait(waiter);
+    blocked_--;
+    return true;
+  }
+
+  int NonEmptyQueueIndex() {
+    PerThread* pt = GetPerThread();
+    // We intentionally design NonEmptyQueueIndex to steal work from
+    // anywhere in the queue so threads don't block in WaitForWork() forever
+    // when all threads in their partition go to sleep. Steal is still local.
+    const size_t size = thread_data_.size();
+    unsigned r = Rand(&pt->rand);
+    unsigned inc = all_coprimes_[size - 1][r % all_coprimes_[size - 1].size()];
+    unsigned victim = r % size;
+    for (unsigned i = 0; i < size; i++) {
+      if (!thread_data_[victim].queue.Empty()) {
+        return victim;
+      }
+      victim += inc;
+      if (victim >= size) {
+        victim -= size;
+      }
+    }
+    return -1;
+  }
+
+  static EIGEN_STRONG_INLINE uint64_t GlobalThreadIdHash() {
+    return std::hash<std::thread::id>()(std::this_thread::get_id());
+  }
+
+  EIGEN_STRONG_INLINE PerThread* GetPerThread() {
+#ifndef EIGEN_THREAD_LOCAL
+    static PerThread dummy;
+    auto it = per_thread_map_.find(GlobalThreadIdHash());
+    if (it == per_thread_map_.end()) {
+      return &dummy;
+    } else {
+      return it->second.get();
+    }
+#else
+    EIGEN_THREAD_LOCAL PerThread per_thread_;
+    PerThread* pt = &per_thread_;
+    return pt;
+#endif
+  }
+
+  static EIGEN_STRONG_INLINE unsigned Rand(uint64_t* state) {
+    uint64_t current = *state;
+    // Update the internal state
+    *state = current * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL;
+    // Generate the random output (using the PCG-XSH-RS scheme)
+    return static_cast<unsigned>((current ^ (current >> 22)) >>
+                                 (22 + (current >> 61)));
+  }
+};
+
+typedef ThreadPoolTempl<StlThreadEnvironment> ThreadPool;
+
+}  // namespace Eigen
+
+#endif  // EIGEN_CXX11_THREADPOOL_NONBLOCKING_THREAD_POOL_H
--- a/Eigen/src/ThreadPool/RunQueue.h
+++ b/Eigen/src/ThreadPool/RunQueue.h
@@ -0,0 +1,238 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Dmitry Vyukov <dvyukov@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_THREADPOOL_RUNQUEUE_H
+#define EIGEN_CXX11_THREADPOOL_RUNQUEUE_H
+
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+// RunQueue is a fixed-size, partially non-blocking deque or Work items.
+// Operations on front of the queue must be done by a single thread (owner),
+// operations on back of the queue can be done by multiple threads concurrently.
+//
+// Algorithm outline:
+// All remote threads operating on the queue back are serialized by a mutex.
+// This ensures that at most two threads access state: owner and one remote
+// thread (Size aside). The algorithm ensures that the occupied region of the
+// underlying array is logically continuous (can wraparound, but no stray
+// occupied elements). Owner operates on one end of this region, remote thread
+// operates on the other end. Synchronization between these threads
+// (potential consumption of the last element and take up of the last empty
+// element) happens by means of state variable in each element. States are:
+// empty, busy (in process of insertion of removal) and ready. Threads claim
+// elements (empty->busy and ready->busy transitions) by means of a CAS
+// operation. The finishing transition (busy->empty and busy->ready) are done
+// with plain store as the element is exclusively owned by the current thread.
+//
+// Note: we could permit only pointers as elements, then we would not need
+// separate state variable as null/non-null pointer value would serve as state,
+// but that would require malloc/free per operation for large, complex values
+// (and this is designed to store std::function<()>).
+template <typename Work, unsigned kSize>
+class RunQueue {
+ public:
+  RunQueue() : front_(0), back_(0) {
+    // require power-of-two for fast masking
+    eigen_plain_assert((kSize & (kSize - 1)) == 0);
+    eigen_plain_assert(kSize > 2);            // why would you do this?
+    eigen_plain_assert(kSize <= (64 << 10));  // leave enough space for counter
+    for (unsigned i = 0; i < kSize; i++)
+      array_[i].state.store(kEmpty, std::memory_order_relaxed);
+  }
+
+  ~RunQueue() { eigen_plain_assert(Size() == 0); }
+
+  // PushFront inserts w at the beginning of the queue.
+  // If queue is full returns w, otherwise returns default-constructed Work.
+  Work PushFront(Work w) {
+    unsigned front = front_.load(std::memory_order_relaxed);
+    Elem* e = &array_[front & kMask];
+    uint8_t s = e->state.load(std::memory_order_relaxed);
+    if (s != kEmpty ||
+        !e->state.compare_exchange_strong(s, kBusy, std::memory_order_acquire))
+      return w;
+    front_.store(front + 1 + (kSize << 1), std::memory_order_relaxed);
+    e->w = std::move(w);
+    e->state.store(kReady, std::memory_order_release);
+    return Work();
+  }
+
+  // PopFront removes and returns the first element in the queue.
+  // If the queue was empty returns default-constructed Work.
+  Work PopFront() {
+    unsigned front = front_.load(std::memory_order_relaxed);
+    Elem* e = &array_[(front - 1) & kMask];
+    uint8_t s = e->state.load(std::memory_order_relaxed);
+    if (s != kReady ||
+        !e->state.compare_exchange_strong(s, kBusy, std::memory_order_acquire))
+      return Work();
+    Work w = std::move(e->w);
+    e->state.store(kEmpty, std::memory_order_release);
+    front = ((front - 1) & kMask2) | (front & ~kMask2);
+    front_.store(front, std::memory_order_relaxed);
+    return w;
+  }
+
+  // PushBack adds w at the end of the queue.
+  // If queue is full returns w, otherwise returns default-constructed Work.
+  Work PushBack(Work w) {
+    std::unique_lock<std::mutex> lock(mutex_);
+    unsigned back = back_.load(std::memory_order_relaxed);
+    Elem* e = &array_[(back - 1) & kMask];
+    uint8_t s = e->state.load(std::memory_order_relaxed);
+    if (s != kEmpty ||
+        !e->state.compare_exchange_strong(s, kBusy, std::memory_order_acquire))
+      return w;
+    back = ((back - 1) & kMask2) | (back & ~kMask2);
+    back_.store(back, std::memory_order_relaxed);
+    e->w = std::move(w);
+    e->state.store(kReady, std::memory_order_release);
+    return Work();
+  }
+
+  // PopBack removes and returns the last elements in the queue.
+  Work PopBack() {
+    if (Empty()) return Work();
+    std::unique_lock<std::mutex> lock(mutex_);
+    unsigned back = back_.load(std::memory_order_relaxed);
+    Elem* e = &array_[back & kMask];
+    uint8_t s = e->state.load(std::memory_order_relaxed);
+    if (s != kReady ||
+        !e->state.compare_exchange_strong(s, kBusy, std::memory_order_acquire))
+      return Work();
+    Work w = std::move(e->w);
+    e->state.store(kEmpty, std::memory_order_release);
+    back_.store(back + 1 + (kSize << 1), std::memory_order_relaxed);
+    return w;
+  }
+
+  // PopBackHalf removes and returns half last elements in the queue.
+  // Returns number of elements removed.
+  unsigned PopBackHalf(std::vector<Work>* result) {
+    if (Empty()) return 0;
+    std::unique_lock<std::mutex> lock(mutex_);
+    unsigned back = back_.load(std::memory_order_relaxed);
+    unsigned size = Size();
+    unsigned mid = back;
+    if (size > 1) mid = back + (size - 1) / 2;
+    unsigned n = 0;
+    unsigned start = 0;
+    for (; static_cast<int>(mid - back) >= 0; mid--) {
+      Elem* e = &array_[mid & kMask];
+      uint8_t s = e->state.load(std::memory_order_relaxed);
+      if (n == 0) {
+        if (s != kReady || !e->state.compare_exchange_strong(
+                               s, kBusy, std::memory_order_acquire))
+          continue;
+        start = mid;
+      } else {
+        // Note: no need to store temporal kBusy, we exclusively own these
+        // elements.
+        eigen_plain_assert(s == kReady);
+      }
+      result->push_back(std::move(e->w));
+      e->state.store(kEmpty, std::memory_order_release);
+      n++;
+    }
+    if (n != 0)
+      back_.store(start + 1 + (kSize << 1), std::memory_order_relaxed);
+    return n;
+  }
+
+  // Size returns current queue size.
+  // Can be called by any thread at any time.
+  unsigned Size() const { return SizeOrNotEmpty<true>(); }
+
+  // Empty tests whether container is empty.
+  // Can be called by any thread at any time.
+  bool Empty() const { return SizeOrNotEmpty<false>() == 0; }
+
+  // Delete all the elements from the queue.
+  void Flush() {
+    while (!Empty()) {
+      PopFront();
+    }
+  }
+
+ private:
+  static const unsigned kMask = kSize - 1;
+  static const unsigned kMask2 = (kSize << 1) - 1;
+  struct Elem {
+    std::atomic<uint8_t> state;
+    Work w;
+  };
+  enum {
+    kEmpty,
+    kBusy,
+    kReady,
+  };
+  std::mutex mutex_;
+  // Low log(kSize) + 1 bits in front_ and back_ contain rolling index of
+  // front/back, respectively. The remaining bits contain modification counters
+  // that are incremented on Push operations. This allows us to (1) distinguish
+  // between empty and full conditions (if we would use log(kSize) bits for
+  // position, these conditions would be indistinguishable); (2) obtain
+  // consistent snapshot of front_/back_ for Size operation using the
+  // modification counters.
+  std::atomic<unsigned> front_;
+  std::atomic<unsigned> back_;
+  Elem array_[kSize];
+
+  // SizeOrNotEmpty returns current queue size; if NeedSizeEstimate is false,
+  // only whether the size is 0 is guaranteed to be correct.
+  // Can be called by any thread at any time.
+  template<bool NeedSizeEstimate>
+  unsigned SizeOrNotEmpty() const {
+    // Emptiness plays critical role in thread pool blocking. So we go to great
+    // effort to not produce false positives (claim non-empty queue as empty).
+    unsigned front = front_.load(std::memory_order_acquire);
+    for (;;) {
+      // Capture a consistent snapshot of front/tail.
+      unsigned back = back_.load(std::memory_order_acquire);
+      unsigned front1 = front_.load(std::memory_order_relaxed);
+      if (front != front1) {
+        front = front1;
+        std::atomic_thread_fence(std::memory_order_acquire);
+        continue;
+      }
+      if (NeedSizeEstimate) {
+        return CalculateSize(front, back);
+      } else {
+        // This value will be 0 if the queue is empty, and undefined otherwise.
+        unsigned maybe_zero = ((front ^ back) & kMask2);
+        // Queue size estimate must agree with maybe zero check on the queue
+        // empty/non-empty state.
+        eigen_assert((CalculateSize(front, back) == 0) == (maybe_zero == 0));
+        return maybe_zero;
+      }
+    }
+  }
+
+  EIGEN_ALWAYS_INLINE
+  unsigned CalculateSize(unsigned front, unsigned back) const {
+    int size = (front & kMask2) - (back & kMask2);
+    // Fix overflow.
+    if (size < 0) size += 2 * kSize;
+    // Order of modification in push/pop is crafted to make the queue look
+    // larger than it is during concurrent modifications. E.g. push can
+    // increment size before the corresponding pop has decremented it.
+    // So the computed size can be up to kSize + 1, fix it.
+    if (size > static_cast<int>(kSize)) size = kSize;
+    return static_cast<unsigned>(size);
+  }
+
+  RunQueue(const RunQueue&) = delete;
+  void operator=(const RunQueue&) = delete;
+};
+
+}  // namespace Eigen
+
+#endif  // EIGEN_CXX11_THREADPOOL_RUNQUEUE_H
--- a/Eigen/src/ThreadPool/ThreadCancel.h
+++ b/Eigen/src/ThreadPool/ThreadCancel.h
@@ -0,0 +1,23 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_THREADPOOL_THREAD_CANCEL_H
+#define EIGEN_CXX11_THREADPOOL_THREAD_CANCEL_H
+
+// Try to come up with a portable way to cancel a thread
+#if EIGEN_OS_GNULINUX
+  #define EIGEN_THREAD_CANCEL(t)                  \
+    pthread_cancel(t.native_handle());
+  #define EIGEN_SUPPORTS_THREAD_CANCELLATION 1
+#else
+#define EIGEN_THREAD_CANCEL(t)
+#endif
+
+
+#endif  // EIGEN_CXX11_THREADPOOL_THREAD_CANCEL_H
--- a/Eigen/src/ThreadPool/ThreadEnvironment.h
+++ b/Eigen/src/ThreadPool/ThreadEnvironment.h
@@ -0,0 +1,42 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_THREADPOOL_THREAD_ENVIRONMENT_H
+#define EIGEN_CXX11_THREADPOOL_THREAD_ENVIRONMENT_H
+
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+struct StlThreadEnvironment {
+  struct Task {
+    std::function<void()> f;
+  };
+
+  // EnvThread constructor must start the thread,
+  // destructor must join the thread.
+  class EnvThread {
+   public:
+    EnvThread(std::function<void()> f) : thr_(std::move(f)) {}
+    ~EnvThread() { thr_.join(); }
+    // This function is called when the threadpool is cancelled.
+    void OnCancel() { }
+
+   private:
+    std::thread thr_;
+  };
+
+  EnvThread* CreateThread(std::function<void()> f) { return new EnvThread(std::move(f)); }
+  Task CreateTask(std::function<void()> f) { return Task{std::move(f)}; }
+  void ExecuteTask(const Task& t) { t.f(); }
+};
+
+}  // namespace Eigen
+
+#endif  // EIGEN_CXX11_THREADPOOL_THREAD_ENVIRONMENT_H
--- a/Eigen/src/ThreadPool/ThreadLocal.h
+++ b/Eigen/src/ThreadPool/ThreadLocal.h
@@ -0,0 +1,299 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_THREADPOOL_THREAD_LOCAL_H
+#define EIGEN_CXX11_THREADPOOL_THREAD_LOCAL_H
+
+#ifdef EIGEN_AVOID_THREAD_LOCAL
+
+#ifdef EIGEN_THREAD_LOCAL
+#undef EIGEN_THREAD_LOCAL
+#endif
+
+#else
+
+#if ((EIGEN_COMP_GNUC) || __has_feature(cxx_thread_local) || EIGEN_COMP_MSVC )
+#define EIGEN_THREAD_LOCAL static thread_local
+#endif
+
+// Disable TLS for Apple and Android builds with older toolchains.
+#if defined(__APPLE__)
+// Included for TARGET_OS_IPHONE, __IPHONE_OS_VERSION_MIN_REQUIRED,
+// __IPHONE_8_0.
+#include <Availability.h>
+#include <TargetConditionals.h>
+#endif
+// Checks whether C++11's `thread_local` storage duration specifier is
+// supported.
+#if EIGEN_COMP_CLANGAPPLE && ((EIGEN_COMP_CLANGAPPLE < 8000042) || \
+     (TARGET_OS_IPHONE && __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_9_0))
+// Notes: Xcode's clang did not support `thread_local` until version
+// 8, and even then not for all iOS < 9.0.
+#undef EIGEN_THREAD_LOCAL
+
+#elif defined(__ANDROID__) && EIGEN_COMP_CLANG
+// There are platforms for which TLS should not be used even though the compiler
+// makes it seem like it's supported (Android NDK < r12b for example).
+// This is primarily because of linker problems and toolchain misconfiguration:
+// TLS isn't supported until NDK r12b per
+// https://developer.android.com/ndk/downloads/revision_history.html
+// Since NDK r16, `__NDK_MAJOR__` and `__NDK_MINOR__` are defined in
+// <android/ndk-version.h>. For NDK < r16, users should define these macros,
+// e.g. `-D__NDK_MAJOR__=11 -D__NKD_MINOR__=0` for NDK r11.
+#if __has_include(<android/ndk-version.h>)
+#include <android/ndk-version.h>
+#endif  // __has_include(<android/ndk-version.h>)
+#if defined(__ANDROID__) && defined(__clang__) && defined(__NDK_MAJOR__) && \
+    defined(__NDK_MINOR__) &&                                               \
+    ((__NDK_MAJOR__ < 12) || ((__NDK_MAJOR__ == 12) && (__NDK_MINOR__ < 1)))
+#undef EIGEN_THREAD_LOCAL
+#endif
+#endif  // defined(__ANDROID__) && defined(__clang__)
+
+#endif  // EIGEN_AVOID_THREAD_LOCAL
+
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+template <typename T>
+struct ThreadLocalNoOpInitialize {
+  void operator()(T&) const {}
+};
+
+template <typename T>
+struct ThreadLocalNoOpRelease {
+  void operator()(T&) const {}
+};
+
+}  // namespace internal
+
+// Thread local container for elements of type T, that does not use thread local
+// storage. As long as the number of unique threads accessing this storage
+// is smaller than `capacity_`, it is lock-free and wait-free. Otherwise it will
+// use a mutex for synchronization.
+//
+// Type `T` has to be default constructible, and by default each thread will get
+// a default constructed value. It is possible to specify custom `initialize`
+// callable, that will be called lazily from each thread accessing this object,
+// and will be passed a default initialized object of type `T`. Also it's
+// possible to pass a custom `release` callable, that will be invoked before
+// calling ~T().
+//
+// Example:
+//
+//   struct Counter {
+//     int value = 0;
+//   }
+//
+//   Eigen::ThreadLocal<Counter> counter(10);
+//
+//   // Each thread will have access to it's own counter object.
+//   Counter& cnt = counter.local();
+//   cnt++;
+//
+// WARNING: Eigen::ThreadLocal uses the OS-specific value returned by
+// std::this_thread::get_id() to identify threads. This value is not guaranteed
+// to be unique except for the life of the thread. A newly created thread may
+// get an OS-specific ID equal to that of an already destroyed thread.
+//
+// Somewhat similar to TBB thread local storage, with similar restrictions:
+// https://www.threadingbuildingblocks.org/docs/help/reference/thread_local_storage/enumerable_thread_specific_cls.html
+//
+template <typename T,
+          typename Initialize = internal::ThreadLocalNoOpInitialize<T>,
+          typename Release = internal::ThreadLocalNoOpRelease<T>>
+class ThreadLocal {
+  // We preallocate default constructed elements in MaxSizedVector.
+  static_assert(std::is_default_constructible<T>::value,
+                "ThreadLocal data type must be default constructible");
+
+ public:
+  explicit ThreadLocal(int capacity)
+      : ThreadLocal(capacity, internal::ThreadLocalNoOpInitialize<T>(),
+                    internal::ThreadLocalNoOpRelease<T>()) {}
+
+  ThreadLocal(int capacity, Initialize initialize)
+      : ThreadLocal(capacity, std::move(initialize),
+                    internal::ThreadLocalNoOpRelease<T>()) {}
+
+  ThreadLocal(int capacity, Initialize initialize, Release release)
+      : initialize_(std::move(initialize)),
+        release_(std::move(release)),
+        capacity_(capacity),
+        data_(capacity_),
+        ptr_(capacity_),
+        filled_records_(0) {
+    eigen_assert(capacity_ >= 0);
+    data_.resize(capacity_);
+    for (int i = 0; i < capacity_; ++i) {
+      ptr_.emplace_back(nullptr);
+    }
+  }
+
+  T& local() {
+    std::thread::id this_thread = std::this_thread::get_id();
+    if (capacity_ == 0) return SpilledLocal(this_thread);
+
+    std::size_t h = std::hash<std::thread::id>()(this_thread);
+    const int start_idx = h % capacity_;
+
+    // NOTE: From the definition of `std::this_thread::get_id()` it is
+    // guaranteed that we never can have concurrent insertions with the same key
+    // to our hash-map like data structure. If we didn't find an element during
+    // the initial traversal, it's guaranteed that no one else could have
+    // inserted it while we are in this function. This allows to massively
+    // simplify out lock-free insert-only hash map.
+
+    // Check if we already have an element for `this_thread`.
+    int idx = start_idx;
+    while (ptr_[idx].load() != nullptr) {
+      ThreadIdAndValue& record = *(ptr_[idx].load());
+      if (record.thread_id == this_thread) return record.value;
+
+      idx += 1;
+      if (idx >= capacity_) idx -= capacity_;
+      if (idx == start_idx) break;
+    }
+
+    // If we are here, it means that we found an insertion point in lookup
+    // table at `idx`, or we did a full traversal and table is full.
+
+    // If lock-free storage is full, fallback on mutex.
+    if (filled_records_.load() >= capacity_) return SpilledLocal(this_thread);
+
+    // We double check that we still have space to insert an element into a lock
+    // free storage. If old value in `filled_records_` is larger than the
+    // records capacity, it means that some other thread added an element while
+    // we were traversing lookup table.
+    int insertion_index =
+        filled_records_.fetch_add(1, std::memory_order_relaxed);
+    if (insertion_index >= capacity_) return SpilledLocal(this_thread);
+
+    // At this point it's guaranteed that we can access to
+    // data_[insertion_index_] without a data race.
+    data_[insertion_index].thread_id = this_thread;
+    initialize_(data_[insertion_index].value);
+
+    // That's the pointer we'll put into the lookup table.
+    ThreadIdAndValue* inserted = &data_[insertion_index];
+
+    // We'll use nullptr pointer to ThreadIdAndValue in a compare-and-swap loop.
+    ThreadIdAndValue* empty = nullptr;
+
+    // Now we have to find an insertion point into the lookup table. We start
+    // from the `idx` that was identified as an insertion point above, it's
+    // guaranteed that we will have an empty record somewhere in a lookup table
+    // (because we created a record in the `data_`).
+    const int insertion_idx = idx;
+
+    do {
+      // Always start search from the original insertion candidate.
+      idx = insertion_idx;
+      while (ptr_[idx].load() != nullptr) {
+        idx += 1;
+        if (idx >= capacity_) idx -= capacity_;
+        // If we did a full loop, it means that we don't have any free entries
+        // in the lookup table, and this means that something is terribly wrong.
+        eigen_assert(idx != insertion_idx);
+      }
+      // Atomic CAS of the pointer guarantees that any other thread, that will
+      // follow this pointer will see all the mutations in the `data_`.
+    } while (!ptr_[idx].compare_exchange_weak(empty, inserted));
+
+    return inserted->value;
+  }
+
+  // WARN: It's not thread safe to call it concurrently with `local()`.
+  void ForEach(std::function<void(std::thread::id, T&)> f) {
+    // Reading directly from `data_` is unsafe, because only CAS to the
+    // record in `ptr_` makes all changes visible to other threads.
+    for (auto& ptr : ptr_) {
+      ThreadIdAndValue* record = ptr.load();
+      if (record == nullptr) continue;
+      f(record->thread_id, record->value);
+    }
+
+    // We did not spill into the map based storage.
+    if (filled_records_.load(std::memory_order_relaxed) < capacity_) return;
+
+    // Adds a happens before edge from the last call to SpilledLocal().
+    std::unique_lock<std::mutex> lock(mu_);
+    for (auto& kv : per_thread_map_) {
+      f(kv.first, kv.second);
+    }
+  }
+
+  // WARN: It's not thread safe to call it concurrently with `local()`.
+  ~ThreadLocal() {
+    // Reading directly from `data_` is unsafe, because only CAS to the record
+    // in `ptr_` makes all changes visible to other threads.
+    for (auto& ptr : ptr_) {
+      ThreadIdAndValue* record = ptr.load();
+      if (record == nullptr) continue;
+      release_(record->value);
+    }
+
+    // We did not spill into the map based storage.
+    if (filled_records_.load(std::memory_order_relaxed) < capacity_) return;
+
+    // Adds a happens before edge from the last call to SpilledLocal().
+    std::unique_lock<std::mutex> lock(mu_);
+    for (auto& kv : per_thread_map_) {
+      release_(kv.second);
+    }
+  }
+
+ private:
+  struct ThreadIdAndValue {
+    std::thread::id thread_id;
+    T value;
+  };
+
+  // Use unordered map guarded by a mutex when lock free storage is full.
+  T& SpilledLocal(std::thread::id this_thread) {
+    std::unique_lock<std::mutex> lock(mu_);
+
+    auto it = per_thread_map_.find(this_thread);
+    if (it == per_thread_map_.end()) {
+      auto result = per_thread_map_.emplace(this_thread, T());
+      eigen_assert(result.second);
+      initialize_((*result.first).second);
+      return (*result.first).second;
+    } else {
+      return it->second;
+    }
+  }
+
+  Initialize initialize_;
+  Release release_;
+  const int capacity_;
+
+  // Storage that backs lock-free lookup table `ptr_`. Records stored in this
+  // storage contiguously starting from index 0.
+  MaxSizeVector<ThreadIdAndValue> data_;
+
+  // Atomic pointers to the data stored in `data_`. Used as a lookup table for
+  // linear probing hash map (https://en.wikipedia.org/wiki/Linear_probing).
+  MaxSizeVector<std::atomic<ThreadIdAndValue*>> ptr_;
+
+  // Number of records stored in the `data_`.
+  std::atomic<int> filled_records_;
+
+  // We fallback on per thread map if lock-free storage is full. In practice
+  // this should never happen, if `capacity_` is a reasonable estimate of the
+  // number of threads running in a system.
+  std::mutex mu_;  // Protects per_thread_map_.
+  std::unordered_map<std::thread::id, T> per_thread_map_;
+};
+
+}  // namespace Eigen
+
+#endif  // EIGEN_CXX11_THREADPOOL_THREAD_LOCAL_H
--- a/Eigen/src/ThreadPool/ThreadPoolInterface.h
+++ b/Eigen/src/ThreadPool/ThreadPoolInterface.h
@@ -0,0 +1,50 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_THREADPOOL_THREAD_POOL_INTERFACE_H
+#define EIGEN_CXX11_THREADPOOL_THREAD_POOL_INTERFACE_H
+
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+// This defines an interface that ThreadPoolDevice can take to use
+// custom thread pools underneath.
+class ThreadPoolInterface {
+ public:
+  // Submits a closure to be run by a thread in the pool.
+  virtual void Schedule(std::function<void()> fn) = 0;
+
+  // Submits a closure to be run by threads in the range [start, end) in the
+  // pool.
+  virtual void ScheduleWithHint(std::function<void()> fn, int /*start*/,
+                                int /*end*/) {
+    // Just defer to Schedule in case sub-classes aren't interested in
+    // overriding this functionality.
+    Schedule(fn);
+  }
+
+  // If implemented, stop processing the closures that have been enqueued.
+  // Currently running closures may still be processed.
+  // If not implemented, does nothing.
+  virtual void Cancel() {}
+
+  // Returns the number of threads in the pool.
+  virtual int NumThreads() const = 0;
+
+  // Returns a logical thread index between 0 and NumThreads() - 1 if called
+  // from one of the threads in the pool. Returns -1 otherwise.
+  virtual int CurrentThreadId() const = 0;
+
+  virtual ~ThreadPoolInterface() {}
+};
+
+}  // namespace Eigen
+
+#endif  // EIGEN_CXX11_THREADPOOL_THREAD_POOL_INTERFACE_H
--- a/Eigen/src/ThreadPool/ThreadYield.h
+++ b/Eigen/src/ThreadPool/ThreadYield.h
@@ -0,0 +1,16 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_THREADPOOL_THREAD_YIELD_H
+#define EIGEN_CXX11_THREADPOOL_THREAD_YIELD_H
+
+// Try to come up with a portable way to yield
+#define EIGEN_THREAD_YIELD() std::this_thread::yield()
+
+#endif  // EIGEN_CXX11_THREADPOOL_THREAD_YIELD_H