Fix undefined behavior in PPC load.

VSX vec_xl is Causing a bunch of test failures and failing `-fsanitize=undefined` with g++. Removing the instruction allows tests to pass and eliminates the warning.
Fix CUDA clang again with new C++11 usages
2026-04-10 11:34:33 +08:00 · 2025-03-14 07:58:36 -07:00 · 2025-03-14 07:58:29 -07:00 · 2025-03-13 15:01:33 -07:00 · 2025-03-13 21:49:24 +00:00 · 2025-03-13 11:13:35 -07:00
190 changed files with 7167 additions and 4330 deletions
--- a/.clang-format
+++ b/.clang-format
@@ -0,0 +1,19 @@
+---
+BasedOnStyle: Google
+ColumnLimit:  120
+---
+Language:     Cpp
+BasedOnStyle: Google
+ColumnLimit:  120
+StatementMacros:
+  - EIGEN_STATIC_ASSERT
+  - EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
+  - EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
+SortIncludes: false
+AttributeMacros:
+- EIGEN_STRONG_INLINE
+- EIGEN_ALWAYS_INLINE
+- EIGEN_DEVICE_FUNC
+- EIGEN_DONT_INLINE
+- EIGEN_DEPRECATED
+- EIGEN_UNUSED
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -0,0 +1,34 @@
+# This file is part of Eigen, a lightweight C++ template library
+# for linear algebra.
+#
+# Copyright (C) 2023, The Eigen Authors
+#
+# This Source Code Form is subject to the terms of the Mozilla
+# Public License v. 2.0. If a copy of the MPL was not distributed
+# with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+stages:
+  - checkformat
+  - build
+  - test
+  - deploy
+
+variables:
+  # CMake build directory.
+  EIGEN_CI_BUILDDIR: .build
+  # Specify the CMake build target.
+  EIGEN_CI_BUILD_TARGET: ""
+  # If a test regex is specified, that will be selected.
+  # Otherwise, we will try a label if specified.
+  EIGEN_CI_CTEST_REGEX: ""
+  EIGEN_CI_CTEST_LABEL: ""
+  EIGEN_CI_CTEST_ARGS: ""
+
+include:
+  - "/ci/checkformat.gitlab-ci.yml"
+  - "/ci/common.gitlab-ci.yml"
+  - "/ci/build.linux.gitlab-ci.yml"
+  - "/ci/build.windows.gitlab-ci.yml"
+  - "/ci/test.linux.gitlab-ci.yml"
+  - "/ci/test.windows.gitlab-ci.yml"
+  - "/ci/deploy.gitlab-ci.yml"
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,7 +1,7 @@
-project(Eigen3)
-
 cmake_minimum_required(VERSION 2.8.5)

+project(Eigen3)
+
 # guard against in-source builds

 if(${CMAKE_SOURCE_DIR} STREQUAL ${CMAKE_BINARY_DIR})
@@ -19,14 +19,6 @@ if (NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE "Release")
 endif()

-string(TOLOWER "${CMAKE_BUILD_TYPE}" cmake_build_type_tolower)
-if(    NOT cmake_build_type_tolower STREQUAL "debug"
-   AND NOT cmake_build_type_tolower STREQUAL "release"
-   AND NOT cmake_build_type_tolower STREQUAL "relwithdebinfo")
-  message(FATAL_ERROR "Unknown build type \"${CMAKE_BUILD_TYPE}\". Allowed values are Debug, Release, RelWithDebInfo (case-insensitive).")
-endif()
-
-
 #############################################################################
 # retrieve version infomation                                               #
 #############################################################################
@@ -94,6 +86,20 @@ else()
  ei_add_cxx_compiler_flag("-std=c++03")
 endif()

+function(ei_maybe_separate_arguments variable mode args)
+  # Use separate_arguments if the input is a single string containing a space.
+  # Otherwise, if it is already a list or doesn't have a space, just propagate
+  # the original value.  This is to better support multi-argument lists.
+  list(LENGTH args list_length)
+  if (${list_length} EQUAL 1)
+    string(FIND "${args}" " " has_space)
+    if (${has_space} GREATER -1)
+      separate_arguments(args ${mode} "${args}")
+    endif()
+  endif()
+  set(${variable} ${args} PARENT_SCOPE)
+endfunction(ei_maybe_separate_arguments)
+
 #############################################################################
 # find how to link to the standard libraries                                #
 #############################################################################
@@ -104,6 +110,10 @@ find_package(StandardMathLibrary)
 set(EIGEN_TEST_CUSTOM_LINKER_FLAGS  "" CACHE STRING "Additional linker flags when linking unit tests.")
 set(EIGEN_TEST_CUSTOM_CXX_FLAGS     "" CACHE STRING "Additional compiler flags when compiling unit tests.")

+# Convert space-separated arguments into CMake lists for downstream consumption.
+ei_maybe_separate_arguments(EIGEN_TEST_CUSTOM_LINKER_FLAGS NATIVE_COMMAND "${EIGEN_TEST_CUSTOM_LINKER_FLAGS}")
+ei_maybe_separate_arguments(EIGEN_TEST_CUSTOM_CXX_FLAGS NATIVE_COMMAND "${EIGEN_TEST_CUSTOM_CXX_FLAGS}")
+
 set(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO "")

 if(NOT STANDARD_MATH_LIBRARY_FOUND)
@@ -158,7 +168,7 @@ if(NOT MSVC)
  ei_add_cxx_compiler_flag("-Wall")
  ei_add_cxx_compiler_flag("-Wextra")
  #ei_add_cxx_compiler_flag("-Weverything")              # clang
-  
+
  ei_add_cxx_compiler_flag("-Wundef")
  ei_add_cxx_compiler_flag("-Wcast-align")
  ei_add_cxx_compiler_flag("-Wchar-subscripts")
@@ -173,29 +183,29 @@ if(NOT MSVC)
  ei_add_cxx_compiler_flag("-Wc++11-extensions")
  ei_add_cxx_compiler_flag("-Wdouble-promotion")
 #  ei_add_cxx_compiler_flag("-Wconversion")
-  
+
  # -Wshadow is insanely too strict with gcc, hopefully it will become usable with gcc 6
  # if(NOT CMAKE_COMPILER_IS_GNUCXX OR (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "5.0.0"))
  if(NOT CMAKE_COMPILER_IS_GNUCXX)
    ei_add_cxx_compiler_flag("-Wshadow")
  endif()
-  
+
  ei_add_cxx_compiler_flag("-Wno-psabi")
  ei_add_cxx_compiler_flag("-Wno-variadic-macros")
  ei_add_cxx_compiler_flag("-Wno-long-long")
-  
+
  ei_add_cxx_compiler_flag("-fno-check-new")
  ei_add_cxx_compiler_flag("-fno-common")
  ei_add_cxx_compiler_flag("-fstrict-aliasing")
  ei_add_cxx_compiler_flag("-wd981")                    # disable ICC's "operands are evaluated in unspecified order" remark
  ei_add_cxx_compiler_flag("-wd2304")                   # disable ICC's "warning #2304: non-explicit constructor with single argument may cause implicit type conversion" produced by -Wnon-virtual-dtor
-  
-  
+
+
  # The -ansi flag must be added last, otherwise it is also used as a linker flag by check_cxx_compiler_flag making it fails
  # Moreover we should not set both -strict-ansi and -ansi
  check_cxx_compiler_flag("-strict-ansi" COMPILER_SUPPORT_STRICTANSI)
  ei_add_cxx_compiler_flag("-Qunused-arguments")        # disable clang warning: argument unused during compilation: '-ansi'
-  
+
  if(COMPILER_SUPPORT_STRICTANSI)
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -strict-ansi")
  else()
@@ -206,7 +216,7 @@ if(NOT MSVC)
    ei_add_cxx_compiler_flag("-pie")
    ei_add_cxx_compiler_flag("-fPIE")
  endif()
-  
+
  set(CMAKE_REQUIRED_FLAGS "")

  option(EIGEN_TEST_SSE2 "Enable/Disable SSE2 in tests/examples" OFF)
@@ -380,6 +390,7 @@ if(EIGEN_TEST_NO_EXCEPTIONS)
  message(STATUS "Disabling exceptions in tests/examples")
 endif()

+set(EIGEN_CUDA_CXX_FLAGS "" CACHE STRING "Additional flags to pass to the cuda compiler.")
 set(EIGEN_CUDA_COMPUTE_ARCH 30 CACHE STRING "The CUDA compute architecture level to target when compiling CUDA code")

 include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR})
@@ -451,13 +462,15 @@ if(BUILD_TESTING)
  endif()
 endif()

-if(EIGEN_LEAVE_TEST_IN_ALL_TARGET)
-  add_subdirectory(blas)
-  add_subdirectory(lapack)
-else()
-  add_subdirectory(blas EXCLUDE_FROM_ALL)
-  add_subdirectory(lapack EXCLUDE_FROM_ALL)
-endif()
+if (NOT CMAKE_CROSSCOMPILING)
+  if(EIGEN_LEAVE_TEST_IN_ALL_TARGET)
+    add_subdirectory(blas)
+    add_subdirectory(lapack)
+  else()
+    add_subdirectory(blas EXCLUDE_FROM_ALL)
+    add_subdirectory(lapack EXCLUDE_FROM_ALL)
+  endif()
+endif(NOT CMAKE_CROSSCOMPILING)

 # add SYCL
 option(EIGEN_TEST_SYCL "Add Sycl support." OFF)
@@ -540,6 +553,7 @@ if (NOT CMAKE_VERSION VERSION_LESS 3.0)

  # Imported target support
  add_library (eigen INTERFACE)
+  add_library (Eigen3::Eigen ALIAS eigen)

  target_compile_definitions (eigen INTERFACE ${EIGEN_DEFINITIONS})
  target_include_directories (eigen INTERFACE
@@ -581,11 +595,11 @@ if (NOT CMAKE_VERSION VERSION_LESS 3.0)

 else (NOT CMAKE_VERSION VERSION_LESS 3.0)
  # Fallback to legacy Eigen3Config.cmake without the imported target
-  
+
  # If CMakePackageConfigHelpers module is available (CMake >= 2.8.8)
-  # create a relocatable Config file, otherwise leave the hardcoded paths       
+  # create a relocatable Config file, otherwise leave the hardcoded paths
  include(CMakePackageConfigHelpers OPTIONAL RESULT_VARIABLE CPCH_PATH)
-  
+
  if(CPCH_PATH)
    configure_package_config_file (
      ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Eigen3ConfigLegacy.cmake.in
@@ -594,7 +608,7 @@ else (NOT CMAKE_VERSION VERSION_LESS 3.0)
      INSTALL_DESTINATION ${CMAKEPACKAGE_INSTALL_DIR}
      NO_CHECK_REQUIRED_COMPONENTS_MACRO # Eigen does not provide components
    )
-  else() 
+  else()
    # The PACKAGE_* variables are defined by the configure_package_config_file
    # but without it we define them manually to the hardcoded paths
    set(PACKAGE_INIT "")
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -31,7 +31,7 @@
 #endif

 // Handle NVCC/CUDA/SYCL
-#if defined(__CUDACC__) || defined(__SYCL_DEVICE_ONLY__)
+#if defined(EIGEN_CUDACC) || defined(__SYCL_DEVICE_ONLY__)
  // Do not try asserts on CUDA and SYCL!
  #ifndef EIGEN_NO_DEBUG
  #define EIGEN_NO_DEBUG
@@ -46,7 +46,7 @@
  #endif

  // All functions callable from CUDA code must be qualified with __device__
-  #ifdef __CUDACC__
+  #ifdef EIGEN_CUDACC
    // Do not try to vectorize on CUDA and SYCL!
    #ifndef EIGEN_DONT_VECTORIZE
    #define EIGEN_DONT_VECTORIZE
@@ -62,9 +62,16 @@

 #else
  #define EIGEN_DEVICE_FUNC
-
 #endif

+#if defined(EIGEN_CUDACC)
+  #include <cuda.h>
+  #define EIGEN_CUDA_SDK_VER (CUDA_VERSION * 10)
+#else
+  #define EIGEN_CUDA_SDK_VER 0
+#endif
+
+
 // When compiling CUDA device code with NVCC, pull in math functions from the
 // global namespace.  In host mode, and when device doee with clang, use the
 // std versions.
@@ -123,7 +130,7 @@
  #endif
 #endif

-#ifndef EIGEN_DONT_VECTORIZE
+#if !defined(EIGEN_DONT_VECTORIZE) && !defined(EIGEN_CUDACC)

  #if defined (EIGEN_SSE2_ON_NON_MSVC_BUT_NOT_OLD_GCC) || defined(EIGEN_SSE2_ON_MSVC_2008_OR_LATER)

@@ -213,6 +220,7 @@
    } // end extern "C"
  #elif defined __VSX__
    #define EIGEN_VECTORIZE
+    #define EIGEN_VECTORIZE_FMA
    #define EIGEN_VECTORIZE_VSX
    #include <altivec.h>
    // We need to #undef all these ugly tokens defined in <altivec.h>
@@ -222,6 +230,7 @@
    #undef pixel
  #elif defined __ALTIVEC__
    #define EIGEN_VECTORIZE
+    #define EIGEN_VECTORIZE_FMA
    #define EIGEN_VECTORIZE_ALTIVEC
    #include <altivec.h>
    // We need to #undef all these ugly tokens defined in <altivec.h>
@@ -233,6 +242,10 @@
    #define EIGEN_VECTORIZE
    #define EIGEN_VECTORIZE_NEON
    #include <arm_neon.h>
+    // Enable FMA for ARM.
+    #if defined(__ARM_FEATURE_FMA)
+      #define EIGEN_VECTORIZE_FMA
+    #endif
  #elif (defined __s390x__ && defined __VEC__)
    #define EIGEN_VECTORIZE
    #define EIGEN_VECTORIZE_ZVECTOR
@@ -245,16 +258,16 @@
  #define EIGEN_HAS_FP16_C
 #endif

-#if defined __CUDACC__
+#if defined EIGEN_CUDACC
  #define EIGEN_VECTORIZE_CUDA
  #include <vector_types.h>
-  #if EIGEN_CUDACC_VER >= 70500
+  #if EIGEN_CUDA_SDK_VER >= 70500
    #define EIGEN_HAS_CUDA_FP16
  #endif
 #endif

 #if defined EIGEN_HAS_CUDA_FP16
-  #include <host_defines.h>
+  #include <cuda_runtime_api.h>
  #include <cuda_fp16.h>
 #endif

--- a/Eigen/src/Cholesky/LDLT.h
+++ b/Eigen/src/Cholesky/LDLT.h
@@ -44,7 +44,7 @@ namespace internal {
  * decomposition to determine whether a system of equations has a solution.
  *
  * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism.
-  * 
+  *
  * \sa MatrixBase::ldlt(), SelfAdjointView::ldlt(), class LLT
  */
 template<typename _MatrixType, int _UpLo> class LDLT
@@ -558,7 +558,7 @@ LDLT<MatrixType,_UpLo>& LDLT<MatrixType,_UpLo>::rankUpdate(const MatrixBase<Deri
 #ifndef EIGEN_PARSED_BY_DOXYGEN
 template<typename _MatrixType, int _UpLo>
 template<typename RhsType, typename DstType>
-void LDLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) const
+EIGEN_DEVICE_FUNC void LDLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) const
 {
  eigen_assert(rhs.rows() == rows());
  // dst = P b
--- a/Eigen/src/Cholesky/LLT.h
+++ b/Eigen/src/Cholesky/LLT.h
@@ -475,7 +475,7 @@ LLT<_MatrixType,_UpLo> LLT<_MatrixType,_UpLo>::rankUpdate(const VectorType& v, c
 #ifndef EIGEN_PARSED_BY_DOXYGEN
 template<typename _MatrixType,int _UpLo>
 template<typename RhsType, typename DstType>
-void LLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) const
+EIGEN_DEVICE_FUNC void LLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) const
 {
  dst = rhs;
  solveInPlace(dst);
--- a/Eigen/src/Core/Assign.h
+++ b/Eigen/src/Core/Assign.h
@@ -16,7 +16,7 @@ namespace Eigen {

 template<typename Derived>
 template<typename OtherDerived>
-EIGEN_STRONG_INLINE Derived& DenseBase<Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>
  ::lazyAssign(const DenseBase<OtherDerived>& other)
 {
  enum{
@@ -29,7 +29,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase<Derived>

  eigen_assert(rows() == other.rows() && cols() == other.cols());
  internal::call_assignment_no_alias(derived(),other.derived());
-  
+
  return derived();
 }

--- a/Eigen/src/Core/AssignEvaluator.h
+++ b/Eigen/src/Core/AssignEvaluator.h
@@ -17,7 +17,7 @@ namespace Eigen {
 // This implementation is based on Assign.h

 namespace internal {
-  
+
 /***************************************************************************
 * Part 1 : the logic deciding a strategy for traversal and unrolling       *
 ***************************************************************************/
@@ -29,12 +29,12 @@ struct copy_using_evaluator_traits
 {
  typedef typename DstEvaluator::XprType Dst;
  typedef typename Dst::Scalar DstScalar;
-  
+
  enum {
    DstFlags = DstEvaluator::Flags,
    SrcFlags = SrcEvaluator::Flags
  };
-  
+
 public:
  enum {
    DstAlignment = DstEvaluator::Alignment,
@@ -135,7 +135,7 @@ public:
                          ? int(CompleteUnrolling)
                          : int(NoUnrolling) )
              : int(Traversal) == int(LinearTraversal)
-                ? ( bool(MayUnrollCompletely) ? int(CompleteUnrolling) 
+                ? ( bool(MayUnrollCompletely) ? int(CompleteUnrolling)
                                              : int(NoUnrolling) )
 #if EIGEN_UNALIGNED_VECTORIZE
              : int(Traversal) == int(SliceVectorizedTraversal)
@@ -195,7 +195,7 @@ struct copy_using_evaluator_DefaultTraversal_CompleteUnrolling
  // FIXME: this is not very clean, perhaps this information should be provided by the kernel?
  typedef typename Kernel::DstEvaluatorType DstEvaluatorType;
  typedef typename DstEvaluatorType::XprType DstXprType;
-  
+
  enum {
    outer = Index / DstXprType::InnerSizeAtCompileTime,
    inner = Index % DstXprType::InnerSizeAtCompileTime
@@ -261,7 +261,7 @@ struct copy_using_evaluator_innervec_CompleteUnrolling
  typedef typename Kernel::DstEvaluatorType DstEvaluatorType;
  typedef typename DstEvaluatorType::XprType DstXprType;
  typedef typename Kernel::PacketType PacketType;
-  
+
  enum {
    outer = Index / DstXprType::InnerSizeAtCompileTime,
    inner = Index % DstXprType::InnerSizeAtCompileTime,
@@ -426,7 +426,7 @@ struct dense_assignment_loop<Kernel, LinearVectorizedTraversal, CompleteUnrollin
  {
    typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
    typedef typename Kernel::PacketType PacketType;
-    
+
    enum { size = DstXprType::SizeAtCompileTime,
           packetSize =unpacket_traits<PacketType>::size,
           alignedSize = (size/packetSize)*packetSize };
@@ -599,14 +599,14 @@ protected:
  typedef typename DstEvaluatorTypeT::XprType DstXprType;
  typedef typename SrcEvaluatorTypeT::XprType SrcXprType;
 public:
-  
+
  typedef DstEvaluatorTypeT DstEvaluatorType;
  typedef SrcEvaluatorTypeT SrcEvaluatorType;
  typedef typename DstEvaluatorType::Scalar Scalar;
  typedef copy_using_evaluator_traits<DstEvaluatorTypeT, SrcEvaluatorTypeT, Functor> AssignmentTraits;
  typedef typename AssignmentTraits::PacketType PacketType;
-  
-  
+
+
  EIGEN_DEVICE_FUNC generic_dense_assignment_kernel(DstEvaluatorType &dst, const SrcEvaluatorType &src, const Functor &func, DstXprType& dstExpr)
    : m_dst(dst), m_src(src), m_functor(func), m_dstExpr(dstExpr)
  {
@@ -614,58 +614,58 @@ public:
    AssignmentTraits::debug();
    #endif
  }
-  
+
  EIGEN_DEVICE_FUNC Index size() const        { return m_dstExpr.size(); }
  EIGEN_DEVICE_FUNC Index innerSize() const   { return m_dstExpr.innerSize(); }
  EIGEN_DEVICE_FUNC Index outerSize() const   { return m_dstExpr.outerSize(); }
  EIGEN_DEVICE_FUNC Index rows() const        { return m_dstExpr.rows(); }
  EIGEN_DEVICE_FUNC Index cols() const        { return m_dstExpr.cols(); }
  EIGEN_DEVICE_FUNC Index outerStride() const { return m_dstExpr.outerStride(); }
-  
+
  EIGEN_DEVICE_FUNC DstEvaluatorType& dstEvaluator() { return m_dst; }
  EIGEN_DEVICE_FUNC const SrcEvaluatorType& srcEvaluator() const { return m_src; }
-  
+
  /// Assign src(row,col) to dst(row,col) through the assignment functor.
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Index row, Index col)
  {
    m_functor.assignCoeff(m_dst.coeffRef(row,col), m_src.coeff(row,col));
  }
-  
+
  /// \sa assignCoeff(Index,Index)
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Index index)
  {
    m_functor.assignCoeff(m_dst.coeffRef(index), m_src.coeff(index));
  }
-  
+
  /// \sa assignCoeff(Index,Index)
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeffByOuterInner(Index outer, Index inner)
  {
-    Index row = rowIndexByOuterInner(outer, inner); 
-    Index col = colIndexByOuterInner(outer, inner); 
+    Index row = rowIndexByOuterInner(outer, inner);
+    Index col = colIndexByOuterInner(outer, inner);
    assignCoeff(row, col);
  }
-  
-  
+
+
  template<int StoreMode, int LoadMode, typename PacketType>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPacket(Index row, Index col)
  {
    m_functor.template assignPacket<StoreMode>(&m_dst.coeffRef(row,col), m_src.template packet<LoadMode,PacketType>(row,col));
  }
-  
+
  template<int StoreMode, int LoadMode, typename PacketType>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPacket(Index index)
  {
    m_functor.template assignPacket<StoreMode>(&m_dst.coeffRef(index), m_src.template packet<LoadMode,PacketType>(index));
  }
-  
+
  template<int StoreMode, int LoadMode, typename PacketType>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPacketByOuterInner(Index outer, Index inner)
  {
-    Index row = rowIndexByOuterInner(outer, inner); 
+    Index row = rowIndexByOuterInner(outer, inner);
    Index col = colIndexByOuterInner(outer, inner);
    assignPacket<StoreMode,LoadMode,PacketType>(row, col);
  }
-  
+
  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Index rowIndexByOuterInner(Index outer, Index inner)
  {
    typedef typename DstEvaluatorType::ExpressionTraits Traits;
@@ -688,7 +688,7 @@ public:
  {
    return m_dstExpr.data();
  }
-  
+
 protected:
  DstEvaluatorType& m_dst;
  const SrcEvaluatorType& m_src;
@@ -734,7 +734,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(DstXprType
  resize_if_allowed(dst, src, func);

  DstEvaluatorType dstEvaluator(dst);
-    
+
  typedef generic_dense_assignment_kernel<DstEvaluatorType,SrcEvaluatorType,Functor> Kernel;
  Kernel kernel(dstEvaluator, srcEvaluator, func, dst.const_cast_derived());

@@ -762,7 +762,7 @@ struct EigenBase2EigenBase {};

 template<typename,typename> struct AssignmentKind { typedef EigenBase2EigenBase Kind; };
 template<> struct AssignmentKind<DenseShape,DenseShape> { typedef Dense2Dense Kind; };
-    
+
 // This is the main assignment class
 template< typename DstXprType, typename SrcXprType, typename Functor,
          typename Kind = typename AssignmentKind< typename evaluator_traits<DstXprType>::Shape , typename evaluator_traits<SrcXprType>::Shape >::Kind,
@@ -787,7 +787,7 @@ void call_assignment(const Dst& dst, const Src& src)
 {
  call_assignment(dst, src, internal::assign_op<typename Dst::Scalar,typename Src::Scalar>());
 }
-                     
+
 // Deal with "assume-aliasing"
 template<typename Dst, typename Src, typename Func>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
@@ -827,12 +827,12 @@ void call_assignment_no_alias(Dst& dst, const Src& src, const Func& func)
  typedef typename internal::conditional<NeedToTranspose, Transpose<Dst>, Dst>::type ActualDstTypeCleaned;
  typedef typename internal::conditional<NeedToTranspose, Transpose<Dst>, Dst&>::type ActualDstType;
  ActualDstType actualDst(dst);
-  
+
  // TODO check whether this is the right place to perform these checks:
  EIGEN_STATIC_ASSERT_LVALUE(Dst)
  EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(ActualDstTypeCleaned,Src)
  EIGEN_CHECK_BINARY_COMPATIBILIY(Func,typename ActualDstTypeCleaned::Scalar,typename Src::Scalar);
-  
+
  Assignment<ActualDstTypeCleaned,Src,Func>::run(actualDst, src, func);
 }
 template<typename Dst, typename Src>
@@ -869,13 +869,12 @@ template<typename Dst, typename Src> void check_for_aliasing(const Dst &dst, con
 template< typename DstXprType, typename SrcXprType, typename Functor, typename Weak>
 struct Assignment<DstXprType, SrcXprType, Functor, Dense2Dense, Weak>
 {
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const Functor &func)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const Functor &func)
  {
 #ifndef EIGEN_NO_DEBUG
    internal::check_for_aliasing(dst, src);
 #endif
-    
+
    call_dense_assignment_loop(dst, src, func);
  }
 };
@@ -887,8 +886,7 @@ struct Assignment<DstXprType, SrcXprType, Functor, Dense2Dense, Weak>
 template< typename DstXprType, typename SrcXprType, typename Functor, typename Weak>
 struct Assignment<DstXprType, SrcXprType, Functor, EigenBase2EigenBase, Weak>
 {
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
  {
    Index dstRows = src.rows();
    Index dstCols = src.cols();
--- a/Eigen/src/Core/BooleanRedux.h
+++ b/Eigen/src/Core/BooleanRedux.h
@@ -23,7 +23,7 @@ struct all_unroller
    row = (UnrollCount-1) % Traits::RowsAtCompileTime
  };

-  static inline bool run(const Derived &mat)
+  EIGEN_DEVICE_FUNC static inline bool run(const Derived &mat)
  {
    return all_unroller<Derived, UnrollCount-1>::run(mat) && mat.coeff(row, col);
  }
@@ -32,13 +32,13 @@ struct all_unroller
 template<typename Derived>
 struct all_unroller<Derived, 0>
 {
-  static inline bool run(const Derived &/*mat*/) { return true; }
+  EIGEN_DEVICE_FUNC static inline bool run(const Derived &/*mat*/) { return true; }
 };

 template<typename Derived>
 struct all_unroller<Derived, Dynamic>
 {
-  static inline bool run(const Derived &) { return false; }
+  EIGEN_DEVICE_FUNC static inline bool run(const Derived &) { return false; }
 };

 template<typename Derived, int UnrollCount>
@@ -50,7 +50,7 @@ struct any_unroller
    row = (UnrollCount-1) % Traits::RowsAtCompileTime
  };
  
-  static inline bool run(const Derived &mat)
+  EIGEN_DEVICE_FUNC static inline bool run(const Derived &mat)
  {
    return any_unroller<Derived, UnrollCount-1>::run(mat) || mat.coeff(row, col);
  }
@@ -59,13 +59,13 @@ struct any_unroller
 template<typename Derived>
 struct any_unroller<Derived, 0>
 {
-  static inline bool run(const Derived & /*mat*/) { return false; }
+  EIGEN_DEVICE_FUNC static inline bool run(const Derived & /*mat*/) { return false; }
 };

 template<typename Derived>
 struct any_unroller<Derived, Dynamic>
 {
-  static inline bool run(const Derived &) { return false; }
+  EIGEN_DEVICE_FUNC static inline bool run(const Derived &) { return false; }
 };

 } // end namespace internal
@@ -78,7 +78,7 @@ struct any_unroller<Derived, Dynamic>
  * \sa any(), Cwise::operator<()
  */
 template<typename Derived>
-inline bool DenseBase<Derived>::all() const
+EIGEN_DEVICE_FUNC inline bool DenseBase<Derived>::all() const
 {
  typedef internal::evaluator<Derived> Evaluator;
  enum {
@@ -102,7 +102,7 @@ inline bool DenseBase<Derived>::all() const
  * \sa all()
  */
 template<typename Derived>
-inline bool DenseBase<Derived>::any() const
+EIGEN_DEVICE_FUNC inline bool DenseBase<Derived>::any() const
 {
  typedef internal::evaluator<Derived> Evaluator;
  enum {
@@ -126,7 +126,7 @@ inline bool DenseBase<Derived>::any() const
  * \sa all(), any()
  */
 template<typename Derived>
-inline Eigen::Index DenseBase<Derived>::count() const
+EIGEN_DEVICE_FUNC inline Eigen::Index DenseBase<Derived>::count() const
 {
  return derived().template cast<bool>().template cast<Index>().sum();
 }
--- a/Eigen/src/Core/CommaInitializer.h
+++ b/Eigen/src/Core/CommaInitializer.h
@@ -11,7 +11,7 @@
 #ifndef EIGEN_COMMAINITIALIZER_H
 #define EIGEN_COMMAINITIALIZER_H

-namespace Eigen { 
+namespace Eigen {

 /** \class CommaInitializer
  * \ingroup Core_Module
@@ -44,7 +44,7 @@ struct CommaInitializer
    m_xpr.block(0, 0, other.rows(), other.cols()) = other;
  }

-  /* Copy/Move constructor which transfers ownership. This is crucial in 
+  /* Copy/Move constructor which transfers ownership. This is crucial in
   * absence of return value optimization to avoid assertions during destruction. */
  // FIXME in C++11 mode this could be replaced by a proper RValue constructor
  EIGEN_DEVICE_FUNC
@@ -135,13 +135,13 @@ struct CommaInitializer
  *
  * Example: \include MatrixBase_set.cpp
  * Output: \verbinclude MatrixBase_set.out
-  * 
+  *
  * \note According the c++ standard, the argument expressions of this comma initializer are evaluated in arbitrary order.
  *
  * \sa CommaInitializer::finished(), class CommaInitializer
  */
 template<typename Derived>
-inline CommaInitializer<Derived> DenseBase<Derived>::operator<< (const Scalar& s)
+EIGEN_DEVICE_FUNC inline CommaInitializer<Derived> DenseBase<Derived>::operator<< (const Scalar& s)
 {
  return CommaInitializer<Derived>(*static_cast<Derived*>(this), s);
 }
@@ -149,7 +149,7 @@ inline CommaInitializer<Derived> DenseBase<Derived>::operator<< (const Scalar& s
 /** \sa operator<<(const Scalar&) */
 template<typename Derived>
 template<typename OtherDerived>
-inline CommaInitializer<Derived>
+EIGEN_DEVICE_FUNC inline CommaInitializer<Derived>
 DenseBase<Derived>::operator<<(const DenseBase<OtherDerived>& other)
 {
  return CommaInitializer<Derived>(*static_cast<Derived *>(this), other);
--- a/Eigen/src/Core/CwiseBinaryOp.h
+++ b/Eigen/src/Core/CwiseBinaryOp.h
@@ -74,7 +74,7 @@ class CwiseBinaryOpImpl;
  * \sa MatrixBase::binaryExpr(const MatrixBase<OtherDerived> &,const CustomBinaryOp &) const, class CwiseUnaryOp, class CwiseNullaryOp
  */
 template<typename BinaryOp, typename LhsType, typename RhsType>
-class CwiseBinaryOp : 
+class CwiseBinaryOp :
  public CwiseBinaryOpImpl<
          BinaryOp, LhsType, RhsType,
          typename internal::cwise_promote_storage_type<typename internal::traits<LhsType>::StorageKind,
@@ -83,7 +83,7 @@ class CwiseBinaryOp :
  internal::no_assignment_operator
 {
  public:
-    
+
    typedef typename internal::remove_all<BinaryOp>::type Functor;
    typedef typename internal::remove_all<LhsType>::type Lhs;
    typedef typename internal::remove_all<RhsType>::type Rhs;
@@ -158,7 +158,7 @@ public:
  */
 template<typename Derived>
 template<typename OtherDerived>
-EIGEN_STRONG_INLINE Derived &
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived &
 MatrixBase<Derived>::operator-=(const MatrixBase<OtherDerived> &other)
 {
  call_assignment(derived(), other.derived(), internal::sub_assign_op<Scalar,typename OtherDerived::Scalar>());
@@ -171,7 +171,7 @@ MatrixBase<Derived>::operator-=(const MatrixBase<OtherDerived> &other)
  */
 template<typename Derived>
 template<typename OtherDerived>
-EIGEN_STRONG_INLINE Derived &
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived &
 MatrixBase<Derived>::operator+=(const MatrixBase<OtherDerived>& other)
 {
  call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar,typename OtherDerived::Scalar>());
--- a/Eigen/src/Core/CwiseNullaryOp.h
+++ b/Eigen/src/Core/CwiseNullaryOp.h
@@ -126,12 +126,12 @@ DenseBase<Derived>::NullaryExpr(Index rows, Index cols, const CustomNullaryOp& f
  *
  * Here is an example with C++11 random generators: \include random_cpp11.cpp
  * Output: \verbinclude random_cpp11.out
-  * 
+  *
  * \sa class CwiseNullaryOp
  */
 template<typename Derived>
 template<typename CustomNullaryOp>
-EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
 DenseBase<Derived>::NullaryExpr(Index size, const CustomNullaryOp& func)
 {
  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
@@ -170,7 +170,7 @@ DenseBase<Derived>::NullaryExpr(const CustomNullaryOp& func)
  * \sa class CwiseNullaryOp
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
 DenseBase<Derived>::Constant(Index rows, Index cols, const Scalar& value)
 {
  return DenseBase<Derived>::NullaryExpr(rows, cols, internal::scalar_constant_op<Scalar>(value));
@@ -272,7 +272,7 @@ DenseBase<Derived>::LinSpaced(Index size, const Scalar& low, const Scalar& high)
 }

 /**
-  * \copydoc DenseBase::LinSpaced(Index, const Scalar&, const Scalar&)
+  * \copydoc DenseBase::LinSpaced(Index, const DenseBase::Scalar&, const DenseBase::Scalar&)
  * Special version for fixed size types which does not require the size parameter.
  */
 template<typename Derived>
--- a/Eigen/src/Core/Diagonal.h
+++ b/Eigen/src/Core/Diagonal.h
@@ -11,7 +11,7 @@
 #ifndef EIGEN_DIAGONAL_H
 #define EIGEN_DIAGONAL_H

-namespace Eigen { 
+namespace Eigen {

 /** \class Diagonal
  * \ingroup Core_Module
@@ -149,8 +149,8 @@ template<typename MatrixType, int _DiagIndex> class Diagonal
    }

    EIGEN_DEVICE_FUNC
-    inline const typename internal::remove_all<typename MatrixType::Nested>::type& 
-    nestedExpression() const 
+    inline const typename internal::remove_all<typename MatrixType::Nested>::type&
+    nestedExpression() const
    {
      return m_matrix;
    }
@@ -187,7 +187,7 @@ template<typename MatrixType, int _DiagIndex> class Diagonal
  *
  * \sa class Diagonal */
 template<typename Derived>
-inline typename MatrixBase<Derived>::DiagonalReturnType
+EIGEN_DEVICE_FUNC inline typename MatrixBase<Derived>::DiagonalReturnType
 MatrixBase<Derived>::diagonal()
 {
  return DiagonalReturnType(derived());
@@ -195,7 +195,7 @@ MatrixBase<Derived>::diagonal()

 /** This is the const version of diagonal(). */
 template<typename Derived>
-inline typename MatrixBase<Derived>::ConstDiagonalReturnType
+EIGEN_DEVICE_FUNC inline typename MatrixBase<Derived>::ConstDiagonalReturnType
 MatrixBase<Derived>::diagonal() const
 {
  return ConstDiagonalReturnType(derived());
@@ -213,7 +213,7 @@ MatrixBase<Derived>::diagonal() const
  *
  * \sa MatrixBase::diagonal(), class Diagonal */
 template<typename Derived>
-inline typename MatrixBase<Derived>::DiagonalDynamicIndexReturnType
+EIGEN_DEVICE_FUNC inline typename MatrixBase<Derived>::DiagonalDynamicIndexReturnType
 MatrixBase<Derived>::diagonal(Index index)
 {
  return DiagonalDynamicIndexReturnType(derived(), index);
@@ -221,7 +221,7 @@ MatrixBase<Derived>::diagonal(Index index)

 /** This is the const version of diagonal(Index). */
 template<typename Derived>
-inline typename MatrixBase<Derived>::ConstDiagonalDynamicIndexReturnType
+EIGEN_DEVICE_FUNC inline typename MatrixBase<Derived>::ConstDiagonalDynamicIndexReturnType
 MatrixBase<Derived>::diagonal(Index index) const
 {
  return ConstDiagonalDynamicIndexReturnType(derived(), index);
@@ -240,7 +240,7 @@ MatrixBase<Derived>::diagonal(Index index) const
  * \sa MatrixBase::diagonal(), class Diagonal */
 template<typename Derived>
 template<int Index_>
-inline typename MatrixBase<Derived>::template DiagonalIndexReturnType<Index_>::Type
+EIGEN_DEVICE_FUNC inline typename MatrixBase<Derived>::template DiagonalIndexReturnType<Index_>::Type
 MatrixBase<Derived>::diagonal()
 {
  return typename DiagonalIndexReturnType<Index_>::Type(derived());
@@ -249,7 +249,7 @@ MatrixBase<Derived>::diagonal()
 /** This is the const version of diagonal<int>(). */
 template<typename Derived>
 template<int Index_>
-inline typename MatrixBase<Derived>::template ConstDiagonalIndexReturnType<Index_>::Type
+EIGEN_DEVICE_FUNC inline typename MatrixBase<Derived>::template ConstDiagonalIndexReturnType<Index_>::Type
 MatrixBase<Derived>::diagonal() const
 {
  return typename ConstDiagonalIndexReturnType<Index_>::Type(derived());
--- a/Eigen/src/Core/DiagonalMatrix.h
+++ b/Eigen/src/Core/DiagonalMatrix.h
@@ -11,7 +11,7 @@
 #ifndef EIGEN_DIAGONALMATRIX_H
 #define EIGEN_DIAGONALMATRIX_H

-namespace Eigen { 
+namespace Eigen {

 #ifndef EIGEN_PARSED_BY_DOXYGEN
 template<typename Derived>
@@ -44,7 +44,7 @@ class DiagonalBase : public EigenBase<Derived>

    EIGEN_DEVICE_FUNC
    DenseMatrixType toDenseMatrix() const { return derived(); }
-    
+
    EIGEN_DEVICE_FUNC
    inline const DiagonalVectorType& diagonal() const { return derived().diagonal(); }
    EIGEN_DEVICE_FUNC
@@ -70,7 +70,7 @@ class DiagonalBase : public EigenBase<Derived>
    {
      return InverseReturnType(diagonal().cwiseInverse());
    }
-    
+
    EIGEN_DEVICE_FUNC
    inline const DiagonalWrapper<const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(DiagonalVectorType,Scalar,product) >
    operator*(const Scalar& scalar) const
@@ -273,7 +273,7 @@ class DiagonalWrapper
  * \sa class DiagonalWrapper, class DiagonalMatrix, diagonal(), isDiagonal()
  **/
 template<typename Derived>
-inline const DiagonalWrapper<const Derived>
+EIGEN_DEVICE_FUNC inline const DiagonalWrapper<const Derived>
 MatrixBase<Derived>::asDiagonal() const
 {
  return DiagonalWrapper<const Derived>(derived());
@@ -318,20 +318,20 @@ template<> struct AssignmentKind<DenseShape,DiagonalShape> { typedef Diagonal2De
 template< typename DstXprType, typename SrcXprType, typename Functor>
 struct Assignment<DstXprType, SrcXprType, Functor, Diagonal2Dense>
 {
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
+  static EIGEN_DEVICE_FUNC void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
  {
    Index dstRows = src.rows();
    Index dstCols = src.cols();
    if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
      dst.resize(dstRows, dstCols);
-    
+
    dst.setZero();
    dst.diagonal() = src.diagonal();
  }
-  
+
  static void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
  { dst.diagonal() += src.diagonal(); }
-  
+
  static void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
  { dst.diagonal() -= src.diagonal(); }
 };
--- a/Eigen/src/Core/DiagonalProduct.h
+++ b/Eigen/src/Core/DiagonalProduct.h
@@ -17,7 +17,7 @@ namespace Eigen {
  */
 template<typename Derived>
 template<typename DiagonalDerived>
-inline const Product<Derived, DiagonalDerived, LazyProduct>
+EIGEN_DEVICE_FUNC inline const Product<Derived, DiagonalDerived, LazyProduct>
 MatrixBase<Derived>::operator*(const DiagonalBase<DiagonalDerived> &a_diagonal) const
 {
  return Product<Derived, DiagonalDerived, LazyProduct>(derived(),a_diagonal.derived());
--- a/Eigen/src/Core/Dot.h
+++ b/Eigen/src/Core/Dot.h
@@ -10,7 +10,7 @@
 #ifndef EIGEN_DOT_H
 #define EIGEN_DOT_H

-namespace Eigen { 
+namespace Eigen {

 namespace internal {

@@ -78,7 +78,7 @@ MatrixBase<Derived>::dot(const MatrixBase<OtherDerived>& other) const
  typedef internal::scalar_conj_product_op<Scalar,typename OtherDerived::Scalar> func;
  EIGEN_CHECK_BINARY_COMPATIBILIY(func,Scalar,typename OtherDerived::Scalar);
 #endif
-  
+
  eigen_assert(size() == other.size());

  return internal::dot_nocheck<Derived,OtherDerived>::run(*this, other);
@@ -86,14 +86,14 @@ MatrixBase<Derived>::dot(const MatrixBase<OtherDerived>& other) const

 //---------- implementation of L2 norm and related functions ----------

-/** \returns, for vectors, the squared \em l2 norm of \c *this, and for matrices the Frobenius norm.
+/** \returns, for vectors, the squared \em l2 norm of \c *this, and for matrices the squared Frobenius norm.
  * In both cases, it consists in the sum of the square of all the matrix entries.
  * For vectors, this is also equals to the dot product of \c *this with itself.
  *
  * \sa dot(), norm(), lpNorm()
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE typename NumTraits<typename internal::traits<Derived>::Scalar>::Real MatrixBase<Derived>::squaredNorm() const
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NumTraits<typename internal::traits<Derived>::Scalar>::Real MatrixBase<Derived>::squaredNorm() const
 {
  return numext::real((*this).cwiseAbs2().sum());
 }
@@ -105,7 +105,7 @@ EIGEN_STRONG_INLINE typename NumTraits<typename internal::traits<Derived>::Scala
  * \sa lpNorm(), dot(), squaredNorm()
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE typename NumTraits<typename internal::traits<Derived>::Scalar>::Real MatrixBase<Derived>::norm() const
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NumTraits<typename internal::traits<Derived>::Scalar>::Real MatrixBase<Derived>::norm() const
 {
  return numext::sqrt(squaredNorm());
 }
@@ -120,7 +120,7 @@ EIGEN_STRONG_INLINE typename NumTraits<typename internal::traits<Derived>::Scala
  * \sa norm(), normalize()
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::PlainObject
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::PlainObject
 MatrixBase<Derived>::normalized() const
 {
  typedef typename internal::nested_eval<Derived,2>::type _Nested;
@@ -142,7 +142,7 @@ MatrixBase<Derived>::normalized() const
  * \sa norm(), normalized()
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE void MatrixBase<Derived>::normalize()
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void MatrixBase<Derived>::normalize()
 {
  RealScalar z = squaredNorm();
  // NOTE: after extensive benchmarking, this conditional does not impact performance, at least on recent x86 CPU
@@ -163,7 +163,7 @@ EIGEN_STRONG_INLINE void MatrixBase<Derived>::normalize()
  * \sa stableNorm(), stableNormalize(), normalized()
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::PlainObject
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::PlainObject
 MatrixBase<Derived>::stableNormalized() const
 {
  typedef typename internal::nested_eval<Derived,3>::type _Nested;
@@ -188,7 +188,7 @@ MatrixBase<Derived>::stableNormalized() const
  * \sa stableNorm(), stableNormalized(), normalize()
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE void MatrixBase<Derived>::stableNormalize()
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void MatrixBase<Derived>::stableNormalize()
 {
  RealScalar w = cwiseAbs().maxCoeff();
  RealScalar z = (derived()/w).squaredNorm();
@@ -260,9 +260,9 @@ struct lpNorm_selector<Derived, Infinity>
 template<typename Derived>
 template<int p>
 #ifndef EIGEN_PARSED_BY_DOXYGEN
-inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real
+EIGEN_DEVICE_FUNC inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real
 #else
-MatrixBase<Derived>::RealScalar
+EIGEN_DEVICE_FUNC MatrixBase<Derived>::RealScalar
 #endif
 MatrixBase<Derived>::lpNorm() const
 {
--- a/Eigen/src/Core/Fuzzy.h
+++ b/Eigen/src/Core/Fuzzy.h
@@ -11,7 +11,7 @@
 #ifndef EIGEN_FUZZY_H
 #define EIGEN_FUZZY_H

-namespace Eigen { 
+namespace Eigen {

 namespace internal
 {
@@ -100,7 +100,7 @@ struct isMuchSmallerThan_scalar_selector<Derived, true>
  */
 template<typename Derived>
 template<typename OtherDerived>
-bool DenseBase<Derived>::isApprox(
+EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isApprox(
  const DenseBase<OtherDerived>& other,
  const RealScalar& prec
 ) const
@@ -122,7 +122,7 @@ bool DenseBase<Derived>::isApprox(
  * \sa isApprox(), isMuchSmallerThan(const DenseBase<OtherDerived>&, RealScalar) const
  */
 template<typename Derived>
-bool DenseBase<Derived>::isMuchSmallerThan(
+EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isMuchSmallerThan(
  const typename NumTraits<Scalar>::Real& other,
  const RealScalar& prec
 ) const
@@ -142,7 +142,7 @@ bool DenseBase<Derived>::isMuchSmallerThan(
  */
 template<typename Derived>
 template<typename OtherDerived>
-bool DenseBase<Derived>::isMuchSmallerThan(
+EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isMuchSmallerThan(
  const DenseBase<OtherDerived>& other,
  const RealScalar& prec
 ) const
--- a/Eigen/src/Core/GeneralProduct.h
+++ b/Eigen/src/Core/GeneralProduct.h
@@ -207,12 +207,12 @@ template<> struct gemv_dense_selector<OnTheRight,ColMajor,true>
    typedef typename Rhs::Scalar   RhsScalar;
    typedef typename Dest::Scalar  ResScalar;
    typedef typename Dest::RealScalar  RealScalar;
-    
+
    typedef internal::blas_traits<Lhs> LhsBlasTraits;
    typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
    typedef internal::blas_traits<Rhs> RhsBlasTraits;
    typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
-  
+
    typedef Map<Matrix<ResScalar,Dynamic,1>, EIGEN_PLAIN_ENUM_MIN(AlignedMax,internal::packet_traits<ResScalar>::size)> MappedDest;

    ActualLhsType actualLhs = LhsBlasTraits::extract(lhs);
@@ -300,7 +300,7 @@ template<> struct gemv_dense_selector<OnTheRight,RowMajor,true>
    typedef typename Lhs::Scalar   LhsScalar;
    typedef typename Rhs::Scalar   RhsScalar;
    typedef typename Dest::Scalar  ResScalar;
-    
+
    typedef internal::blas_traits<Lhs> LhsBlasTraits;
    typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
    typedef internal::blas_traits<Rhs> RhsBlasTraits;
@@ -386,7 +386,7 @@ template<> struct gemv_dense_selector<OnTheRight,RowMajor,false>
  */
 template<typename Derived>
 template<typename OtherDerived>
-inline const Product<Derived, OtherDerived>
+EIGEN_DEVICE_FUNC inline const Product<Derived, OtherDerived>
 MatrixBase<Derived>::operator*(const MatrixBase<OtherDerived> &other) const
 {
  // A note regarding the function declaration: In MSVC, this function will sometimes
@@ -428,7 +428,7 @@ MatrixBase<Derived>::operator*(const MatrixBase<OtherDerived> &other) const
  */
 template<typename Derived>
 template<typename OtherDerived>
-const Product<Derived,OtherDerived,LazyProduct>
+EIGEN_DEVICE_FUNC const Product<Derived,OtherDerived,LazyProduct>
 MatrixBase<Derived>::lazyProduct(const MatrixBase<OtherDerived> &other) const
 {
  enum {
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@@ -237,7 +237,7 @@ ploaddup(const typename unpacket_traits<Packet>::type* from) { return *from; }
  * For instance, for a packet of 8 elements, 2 scalars will be read from \a *from and
  * replicated to form: {from[0],from[0],from[0],from[0],from[1],from[1],from[1],from[1]}
  * Currently, this function is only used in matrix products.
-  * For packet-size smaller or equal to 4, this function is equivalent to pload1 
+  * For packet-size smaller or equal to 4, this function is equivalent to pload1
  */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 ploadquad(const typename unpacket_traits<Packet>::type* from)
@@ -299,7 +299,7 @@ template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstoreu
 template<typename Scalar> EIGEN_DEVICE_FUNC inline void prefetch(const Scalar* addr)
 {
 #ifdef __CUDA_ARCH__
-#if defined(__LP64__)
+#if defined(__LP64__) || EIGEN_OS_WIN64
  // 64-bit pointer operand constraint for inlined asm
  asm(" prefetch.L1 [ %1 ];" : "=l"(addr) : "l"(addr));
 #else
@@ -359,77 +359,77 @@ template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pcplxflip(const Packet
 ***************************/

 /** \internal \returns the sine of \a a (coeff-wise) */
-template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet psin(const Packet& a) { using std::sin; return sin(a); }

 /** \internal \returns the cosine of \a a (coeff-wise) */
-template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet pcos(const Packet& a) { using std::cos; return cos(a); }

 /** \internal \returns the tan of \a a (coeff-wise) */
-template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet ptan(const Packet& a) { using std::tan; return tan(a); }

 /** \internal \returns the arc sine of \a a (coeff-wise) */
-template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet pasin(const Packet& a) { using std::asin; return asin(a); }

 /** \internal \returns the arc cosine of \a a (coeff-wise) */
-template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet pacos(const Packet& a) { using std::acos; return acos(a); }

 /** \internal \returns the arc tangent of \a a (coeff-wise) */
-template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet patan(const Packet& a) { using std::atan; return atan(a); }

 /** \internal \returns the hyperbolic sine of \a a (coeff-wise) */
-template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet psinh(const Packet& a) { using std::sinh; return sinh(a); }

 /** \internal \returns the hyperbolic cosine of \a a (coeff-wise) */
-template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet pcosh(const Packet& a) { using std::cosh; return cosh(a); }

 /** \internal \returns the hyperbolic tan of \a a (coeff-wise) */
-template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet ptanh(const Packet& a) { using std::tanh; return tanh(a); }

 /** \internal \returns the exp of \a a (coeff-wise) */
-template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet pexp(const Packet& a) { using std::exp; return exp(a); }

 /** \internal \returns the log of \a a (coeff-wise) */
-template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet plog(const Packet& a) { using std::log; return log(a); }

 /** \internal \returns the log1p of \a a (coeff-wise) */
-template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet plog1p(const Packet& a) { return numext::log1p(a); }

 /** \internal \returns the log10 of \a a (coeff-wise) */
-template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet plog10(const Packet& a) { using std::log10; return log10(a); }

 /** \internal \returns the square-root of \a a (coeff-wise) */
-template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet psqrt(const Packet& a) { using std::sqrt; return sqrt(a); }

 /** \internal \returns the reciprocal square-root of \a a (coeff-wise) */
-template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet prsqrt(const Packet& a) {
  return pdiv(pset1<Packet>(1), psqrt(a));
 }

 /** \internal \returns the rounded value of \a a (coeff-wise) */
-template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet pround(const Packet& a) { using numext::round; return round(a); }

 /** \internal \returns the floor of \a a (coeff-wise) */
-template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet pfloor(const Packet& a) { using numext::floor; return floor(a); }

 /** \internal \returns the ceil of \a a (coeff-wise) */
-template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet pceil(const Packet& a) { using numext::ceil; return ceil(a); }

 /***************************************************************************
@@ -494,14 +494,14 @@ struct palign_impl

 /** \internal update \a first using the concatenation of the packet_size minus \a Offset last elements
  * of \a first and \a Offset first elements of \a second.
-  * 
+  *
  * This function is currently only used to optimize matrix-vector products on unligned matrices.
  * It takes 2 packets that represent a contiguous memory array, and returns a packet starting
  * at the position \a Offset. For instance, for packets of 4 elements, we have:
  *  Input:
  *  - first = {f0,f1,f2,f3}
  *  - second = {s0,s1,s2,s3}
-  * Output: 
+  * Output:
  *   - if Offset==0 then {f0,f1,f2,f3}
  *   - if Offset==1 then {f1,f2,f3,s0}
  *   - if Offset==2 then {f2,f3,s0,s1}
@@ -518,7 +518,7 @@ inline void palign(PacketType& first, const PacketType& second)
 ***************************************************************************/

 // Eigen+CUDA does not support complexes.
-#ifndef __CUDACC__
+#ifndef EIGEN_CUDACC

 template<> inline std::complex<float> pmul(const std::complex<float>& a, const std::complex<float>& b)
 { return std::complex<float>(a.real()*b.real() - a.imag()*b.imag(), a.imag()*b.real() + a.real()*b.imag()); }
--- a/Eigen/src/Core/MathFunctions.h
+++ b/Eigen/src/Core/MathFunctions.h
@@ -461,6 +461,65 @@ struct arg_retval
  typedef typename NumTraits<Scalar>::Real type;
 };

+/****************************************************************************
+* Implementation of expm1                                                   *
+****************************************************************************/
+
+// This implementation is based on GSL Math's expm1.
+namespace std_fallback {
+  // fallback expm1 implementation in case there is no expm1(Scalar) function in namespace of Scalar,
+  // or that there is no suitable std::expm1 function available. Implementation
+  // attributed to Kahan. See: http://www.plunk.org/~hatch/rightway.php.
+  template<typename Scalar>
+  EIGEN_DEVICE_FUNC inline Scalar expm1(const Scalar& x) {
+    EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
+    typedef typename NumTraits<Scalar>::Real RealScalar;
+
+    EIGEN_USING_STD_MATH(exp);
+    Scalar u = exp(x);
+    if (numext::equal_strict(u, Scalar(1))) {
+      return x;
+    }
+    Scalar um1 = u - RealScalar(1);
+    if (numext::equal_strict(um1, Scalar(-1))) {
+      return RealScalar(-1);
+    }
+
+    EIGEN_USING_STD_MATH(log);
+    return (u - RealScalar(1)) * x / log(u);
+  }
+}
+
+template<typename Scalar>
+struct expm1_impl {
+  EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x)
+  {
+    EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
+    #if EIGEN_HAS_CXX11_MATH
+    using std::expm1;
+    #else
+    using std_fallback::expm1;
+    #endif
+    return expm1(x);
+  }
+};
+
+// Specialization for complex types that are not supported by std::expm1.
+template <typename RealScalar>
+struct expm1_impl<std::complex<RealScalar> > {
+  EIGEN_DEVICE_FUNC static inline std::complex<RealScalar> run(
+      const std::complex<RealScalar>& x) {
+    EIGEN_STATIC_ASSERT_NON_INTEGER(RealScalar)
+    return std_fallback::expm1(x);
+  }
+};
+
+template<typename Scalar>
+struct expm1_retval
+{
+  typedef Scalar type;
+};
+
 /****************************************************************************
 * Implementation of log1p                                                   *
 ****************************************************************************/
@@ -480,17 +539,27 @@ namespace std_fallback {

 template<typename Scalar>
 struct log1p_impl {
-  static inline Scalar run(const Scalar& x)
+  static EIGEN_DEVICE_FUNC inline Scalar run(const Scalar& x)
  {
    EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
    #if EIGEN_HAS_CXX11_MATH
    using std::log1p;
-    #endif
+    #else
    using std_fallback::log1p;
+    #endif
    return log1p(x);
  }
 };

+// Specialization for complex types that are not supported by std::log1p.
+template <typename RealScalar>
+struct log1p_impl<std::complex<RealScalar> > {
+  EIGEN_DEVICE_FUNC static inline std::complex<RealScalar> run(
+      const std::complex<RealScalar>& x) {
+    EIGEN_STATIC_ASSERT_NON_INTEGER(RealScalar)
+    return std_fallback::log1p(x);
+  }
+};

 template<typename Scalar>
 struct log1p_retval
@@ -555,19 +624,6 @@ struct random_retval
 template<typename Scalar> inline EIGEN_MATHFUNC_RETVAL(random, Scalar) random(const Scalar& x, const Scalar& y);
 template<typename Scalar> inline EIGEN_MATHFUNC_RETVAL(random, Scalar) random();

-template<typename Scalar>
-struct random_default_impl<Scalar, false, false>
-{
-  static inline Scalar run(const Scalar& x, const Scalar& y)
-  {
-    return x + (y-x) * Scalar(std::rand()) / Scalar(RAND_MAX);
-  }
-  static inline Scalar run()
-  {
-    return run(Scalar(NumTraits<Scalar>::IsSigned ? -1 : 0), Scalar(1));
-  }
-};
-
 enum {
  meta_floor_log2_terminate,
  meta_floor_log2_move_up,
@@ -615,6 +671,38 @@ struct meta_floor_log2<n, lower, upper, meta_floor_log2_bogus>
  // no value, error at compile time
 };

+#define EIGEN_RAND_MAX INT_MAX
+// Fill a signed positive int with random bits.
+// This is to overcome issues in MSVC which limits RAND_MAX to 32767.
+inline int random_int() {
+#if RAND_MAX == INT_MAX
+  return std::rand();
+#else
+  enum {
+    rand_bits = meta_floor_log2<(unsigned int)(RAND_MAX)+1>::value,
+    int_bits = meta_floor_log2<(unsigned int)(INT_MAX)+1>::value,
+  };
+  unsigned int out = std::rand();
+  for (int bit = rand_bits; bit < int(int_bits); bit += rand_bits) {
+    out = (out << rand_bits) ^ std::rand();
+  }
+  return static_cast<int>(out & INT_MAX);
+#endif
+}
+
+template<typename Scalar>
+struct random_default_impl<Scalar, false, false>
+{
+  static inline Scalar run(const Scalar& x, const Scalar& y)
+  {
+    return x + (y-x) * Scalar(random_int()) / Scalar(EIGEN_RAND_MAX);
+  }
+  static inline Scalar run()
+  {
+    return run(Scalar(NumTraits<Scalar>::IsSigned ? -1 : 0), Scalar(1));
+  }
+};
+
 template<typename Scalar>
 struct random_default_impl<Scalar, false, true>
 {
@@ -635,12 +723,12 @@ struct random_default_impl<Scalar, false, true>
    ScalarX offset = 0;
    ScalarX divisor = 1;
    ScalarX multiplier = 1;
-    const unsigned rand_max = RAND_MAX;
+    const unsigned rand_max = EIGEN_RAND_MAX;
    if (range <= rand_max) divisor = (rand_max + 1) / (range + 1);
    else                   multiplier = 1 + range / (rand_max + 1);
    // Rejection sampling.
    do {
-      offset = (unsigned(std::rand()) * multiplier) / divisor;
+      offset = (unsigned(random_int()) * multiplier) / divisor;
    } while (offset > range);
    return Scalar(ScalarX(x) + offset);
  }
@@ -650,12 +738,12 @@ struct random_default_impl<Scalar, false, true>
 #ifdef EIGEN_MAKING_DOCS
    return run(Scalar(NumTraits<Scalar>::IsSigned ? -10 : 0), Scalar(10));
 #else
-    enum { rand_bits = meta_floor_log2<(unsigned int)(RAND_MAX)+1>::value,
+    enum { rand_bits = meta_floor_log2<(unsigned int)(EIGEN_RAND_MAX)+1>::value,
           scalar_bits = sizeof(Scalar) * CHAR_BIT,
           shift = EIGEN_PLAIN_ENUM_MAX(0, int(rand_bits) - int(scalar_bits)),
           offset = NumTraits<Scalar>::IsSigned ? (1 << (EIGEN_PLAIN_ENUM_MIN(rand_bits,scalar_bits)-1)) : 0
    };
-    return Scalar((std::rand() >> shift) - offset);
+    return Scalar((random_int() >> shift) - offset);
 #endif
  }
 };
@@ -943,7 +1031,7 @@ inline EIGEN_MATHFUNC_RETVAL(log1p, Scalar) log1p(const Scalar& x)
  return EIGEN_MATHFUNC_IMPL(log1p, Scalar)::run(x);
 }

-#ifdef __CUDACC__
+#ifdef EIGEN_CUDACC
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float log1p(const float &x) { return ::log1pf(x); }

@@ -977,7 +1065,7 @@ T (floor)(const T& x)
  return floor(x);
 }

-#ifdef __CUDACC__
+#ifdef EIGEN_CUDACC
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float floor(const float &x) { return ::floorf(x); }

@@ -993,7 +1081,7 @@ T (ceil)(const T& x)
  return ceil(x);
 }

-#ifdef __CUDACC__
+#ifdef EIGEN_CUDACC
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float ceil(const float &x) { return ::ceilf(x); }

@@ -1041,7 +1129,7 @@ T log(const T &x) {
  return log(x);
 }

-#ifdef __CUDACC__
+#ifdef EIGEN_CUDACC
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float log(const float &x) { return ::logf(x); }

@@ -1069,7 +1157,7 @@ EIGEN_ALWAYS_INLINE float   abs(float x) { return cl::sycl::fabs(x); }
 EIGEN_ALWAYS_INLINE double  abs(double x) { return cl::sycl::fabs(x); }
 #endif // defined(__SYCL_DEVICE_ONLY__)

-#ifdef __CUDACC__
+#ifdef EIGEN_CUDACC
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float abs(const float &x) { return ::fabsf(x); }

@@ -1094,7 +1182,7 @@ T exp(const T &x) {
  return exp(x);
 }

-#ifdef __CUDACC__
+#ifdef EIGEN_CUDACC
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float exp(const float &x) { return ::expf(x); }

@@ -1109,7 +1197,7 @@ T cos(const T &x) {
  return cos(x);
 }

-#ifdef __CUDACC__
+#ifdef EIGEN_CUDACC
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float cos(const float &x) { return ::cosf(x); }

@@ -1124,7 +1212,7 @@ T sin(const T &x) {
  return sin(x);
 }

-#ifdef __CUDACC__
+#ifdef EIGEN_CUDACC
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float sin(const float &x) { return ::sinf(x); }

@@ -1139,7 +1227,7 @@ T tan(const T &x) {
  return tan(x);
 }

-#ifdef __CUDACC__
+#ifdef EIGEN_CUDACC
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float tan(const float &x) { return ::tanf(x); }

@@ -1154,7 +1242,7 @@ T acos(const T &x) {
  return acos(x);
 }

-#ifdef __CUDACC__
+#ifdef EIGEN_CUDACC
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float acos(const float &x) { return ::acosf(x); }

@@ -1169,7 +1257,7 @@ T asin(const T &x) {
  return asin(x);
 }

-#ifdef __CUDACC__
+#ifdef EIGEN_CUDACC
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float asin(const float &x) { return ::asinf(x); }

@@ -1184,7 +1272,7 @@ T atan(const T &x) {
  return atan(x);
 }

-#ifdef __CUDACC__
+#ifdef EIGEN_CUDACC
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float atan(const float &x) { return ::atanf(x); }

@@ -1200,7 +1288,7 @@ T cosh(const T &x) {
  return cosh(x);
 }

-#ifdef __CUDACC__
+#ifdef EIGEN_CUDACC
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float cosh(const float &x) { return ::coshf(x); }

@@ -1215,7 +1303,7 @@ T sinh(const T &x) {
  return sinh(x);
 }

-#ifdef __CUDACC__
+#ifdef EIGEN_CUDACC
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float sinh(const float &x) { return ::sinhf(x); }

@@ -1230,12 +1318,12 @@ T tanh(const T &x) {
  return tanh(x);
 }

-#if (!defined(__CUDACC__)) && EIGEN_FAST_MATH
+#if (!defined(EIGEN_CUDACC)) && EIGEN_FAST_MATH
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float tanh(float x) { return internal::generic_fast_tanh_float(x); }
 #endif

-#ifdef __CUDACC__
+#ifdef EIGEN_CUDACC
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float tanh(const float &x) { return ::tanhf(x); }

@@ -1250,7 +1338,7 @@ T fmod(const T& a, const T& b) {
  return fmod(a, b);
 }

-#ifdef __CUDACC__
+#ifdef EIGEN_CUDACC
 template <>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float fmod(const float& a, const float& b) {
--- a/Eigen/src/Core/NestByValue.h
+++ b/Eigen/src/Core/NestByValue.h
@@ -99,7 +99,7 @@ template<typename ExpressionType> class NestByValue
 /** \returns an expression of the temporary version of *this.
  */
 template<typename Derived>
-inline const NestByValue<Derived>
+EIGEN_DEVICE_FUNC inline const NestByValue<Derived>
 DenseBase<Derived>::nestByValue() const
 {
  return NestByValue<Derived>(derived());
--- a/Eigen/src/Core/NumTraits.h
+++ b/Eigen/src/Core/NumTraits.h
@@ -54,34 +54,34 @@ struct default_digits10_impl<T,false,true> // Integer
  *
  * The provided data consists of:
  * \li A typedef \c Real, giving the "real part" type of \a T. If \a T is already real,
-  *     then \c Real is just a typedef to \a T. If \a T is \c std::complex<U> then \c Real
+  *     then \c Real is just a typedef to \a T. If \a T is `std::complex<U>` then \c Real
  *     is a typedef to \a U.
  * \li A typedef \c NonInteger, giving the type that should be used for operations producing non-integral values,
  *     such as quotients, square roots, etc. If \a T is a floating-point type, then this typedef just gives
  *     \a T again. Note however that many Eigen functions such as internal::sqrt simply refuse to
  *     take integers. Outside of a few cases, Eigen doesn't do automatic type promotion. Thus, this typedef is
  *     only intended as a helper for code that needs to explicitly promote types.
-  * \li A typedef \c Literal giving the type to use for numeric literals such as "2" or "0.5". For instance, for \c std::complex<U>, Literal is defined as \c U.
+  * \li A typedef \c Literal giving the type to use for numeric literals such as "2" or "0.5". For instance, for `std::complex<U>`, Literal is defined as \c U.
  *     Of course, this type must be fully compatible with \a T. In doubt, just use \a T here.
-  * \li A typedef \a Nested giving the type to use to nest a value inside of the expression tree. If you don't know what
+  * \li A typedef \c Nested giving the type to use to nest a value inside of the expression tree. If you don't know what
  *     this means, just use \a T here.
-  * \li An enum value \a IsComplex. It is equal to 1 if \a T is a \c std::complex
+  * \li An enum value \c IsComplex. It is equal to 1 if \a T is a `std::complex`
  *     type, and to 0 otherwise.
-  * \li An enum value \a IsInteger. It is equal to \c 1 if \a T is an integer type such as \c int,
+  * \li An enum value \c IsInteger. It is equal to \c 1 if \a T is an integer type such as \c int,
  *     and to \c 0 otherwise.
-  * \li Enum values ReadCost, AddCost and MulCost representing a rough estimate of the number of CPU cycles needed
+  * \li Enum values \c ReadCost, \c AddCost and \c MulCost representing a rough estimate of the number of CPU cycles needed
  *     to by move / add / mul instructions respectively, assuming the data is already stored in CPU registers.
  *     Stay vague here. No need to do architecture-specific stuff.
-  * \li An enum value \a IsSigned. It is equal to \c 1 if \a T is a signed type and to 0 if \a T is unsigned.
-  * \li An enum value \a RequireInitialization. It is equal to \c 1 if the constructor of the numeric type \a T must
+  * \li An enum value \c IsSigned. It is equal to \c 1 if \a T is a signed type and to 0 if \a T is unsigned.
+  * \li An enum value \c RequireInitialization. It is equal to \c 1 if the constructor of the numeric type \a T must
  *     be called, and to 0 if it is safe not to call it. Default is 0 if \a T is an arithmetic type, and 1 otherwise.
-  * \li An epsilon() function which, unlike <a href="http://en.cppreference.com/w/cpp/types/numeric_limits/epsilon">std::numeric_limits::epsilon()</a>,
+  * \li An `epsilon()` function which, unlike <a href="http://en.cppreference.com/w/cpp/types/numeric_limits/epsilon">`std::numeric_limits::epsilon()`</a>,
  *     it returns a \a Real instead of a \a T.
-  * \li A dummy_precision() function returning a weak epsilon value. It is mainly used as a default
+  * \li A `dummy_precision()` function returning a weak epsilon value. It is mainly used as a default
  *     value by the fuzzy comparison operators.
-  * \li highest() and lowest() functions returning the highest and lowest possible values respectively.
-  * \li digits10() function returning the number of decimal digits that can be represented without change. This is
-  *     the analogue of <a href="http://en.cppreference.com/w/cpp/types/numeric_limits/digits10">std::numeric_limits<T>::digits10</a>
+  * \li `highest()` and `lowest()` functions returning the highest and lowest possible values respectively.
+  * \li `digits10()` function returning the number of decimal digits that can be represented without change. This is
+  *     the analogue of <a href="http://en.cppreference.com/w/cpp/types/numeric_limits/digits10">`std::numeric_limits<T>::digits10`</a>
  *     which is used as the default implementation if specialized.
  */

@@ -166,7 +166,16 @@ template<> struct NumTraits<double> : GenericNumTraits<double>
 template<> struct NumTraits<long double>
  : GenericNumTraits<long double>
 {
-  static inline long double dummy_precision() { return 1e-15l; }
+  static inline long double dummy_precision() { return static_cast<long double>(1e-15l); }
+
+#if defined(EIGEN_ARCH_PPC) && (__LDBL_MANT_DIG__ == 106)
+  // PowerPC double double causes issues with some values
+  static inline long double epsilon()
+  {
+    // 2^(-(__LDBL_MANT_DIG__)+1)
+    return static_cast<long double>(2.4651903288156618919116517665087e-32l);
+  }
+#endif
 };

 template<typename _Real> struct NumTraits<std::complex<_Real> >
--- a/Eigen/src/Core/PermutationMatrix.h
+++ b/Eigen/src/Core/PermutationMatrix.h
@@ -87,17 +87,6 @@ class PermutationBase : public EigenBase<Derived>
      return derived();
    }

-    #ifndef EIGEN_PARSED_BY_DOXYGEN
-    /** This is a special case of the templated operator=. Its purpose is to
-      * prevent a default operator= from hiding the templated operator=.
-      */
-    Derived& operator=(const PermutationBase& other)
-    {
-      indices() = other.indices();
-      return derived();
-    }
-    #endif
-
    /** \returns the number of rows */
    inline Index rows() const { return Index(indices().size()); }

@@ -333,12 +322,6 @@ class PermutationMatrix : public PermutationBase<PermutationMatrix<SizeAtCompile
    inline PermutationMatrix(const PermutationBase<OtherDerived>& other)
      : m_indices(other.indices()) {}

-    #ifndef EIGEN_PARSED_BY_DOXYGEN
-    /** Standard copy constructor. Defined only to prevent a default copy constructor
-      * from hiding the other templated constructor */
-    inline PermutationMatrix(const PermutationMatrix& other) : m_indices(other.indices()) {}
-    #endif
-
    /** Generic constructor from expression of the indices. The indices
      * array has the meaning that the permutations sends each integer i to indices[i].
      *
@@ -373,17 +356,6 @@ class PermutationMatrix : public PermutationBase<PermutationMatrix<SizeAtCompile
      return Base::operator=(tr.derived());
    }

-    #ifndef EIGEN_PARSED_BY_DOXYGEN
-    /** This is a special case of the templated operator=. Its purpose is to
-      * prevent a default operator= from hiding the templated operator=.
-      */
-    PermutationMatrix& operator=(const PermutationMatrix& other)
-    {
-      m_indices = other.m_indices;
-      return *this;
-    }
-    #endif
-
    /** const version of indices(). */
    const IndicesType& indices() const { return m_indices; }
    /** \returns a reference to the stored array representing the permutation. */
--- a/Eigen/src/Core/ProductEvaluators.h
+++ b/Eigen/src/Core/ProductEvaluators.h
@@ -14,7 +14,7 @@
 #define EIGEN_PRODUCTEVALUATORS_H

 namespace Eigen {
-  
+
 namespace internal {

 /** \internal
@@ -22,19 +22,19 @@ namespace internal {
  * Since products require special treatments to handle all possible cases,
  * we simply deffer the evaluation logic to a product_evaluator class
  * which offers more partial specialization possibilities.
-  * 
+  *
  * \sa class product_evaluator
  */
 template<typename Lhs, typename Rhs, int Options>
-struct evaluator<Product<Lhs, Rhs, Options> > 
+struct evaluator<Product<Lhs, Rhs, Options> >
 : public product_evaluator<Product<Lhs, Rhs, Options> >
 {
  typedef Product<Lhs, Rhs, Options> XprType;
  typedef product_evaluator<XprType> Base;
-  
+
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& xpr) : Base(xpr) {}
 };
- 
+
 // Catch "scalar * ( A * B )" and transform it to "(A*scalar) * B"
 // TODO we should apply that rule only if that's really helpful
 template<typename Lhs, typename Rhs, typename Scalar1, typename Scalar2, typename Plain1>
@@ -62,12 +62,12 @@ struct evaluator<CwiseBinaryOp<internal::scalar_product_op<Scalar1,Scalar2>,


 template<typename Lhs, typename Rhs, int DiagIndex>
-struct evaluator<Diagonal<const Product<Lhs, Rhs, DefaultProduct>, DiagIndex> > 
+struct evaluator<Diagonal<const Product<Lhs, Rhs, DefaultProduct>, DiagIndex> >
 : public evaluator<Diagonal<const Product<Lhs, Rhs, LazyProduct>, DiagIndex> >
 {
  typedef Diagonal<const Product<Lhs, Rhs, DefaultProduct>, DiagIndex> XprType;
  typedef evaluator<Diagonal<const Product<Lhs, Rhs, LazyProduct>, DiagIndex> > Base;
-  
+
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& xpr)
    : Base(Diagonal<const Product<Lhs, Rhs, LazyProduct>, DiagIndex>(
        Product<Lhs, Rhs, LazyProduct>(xpr.nestedExpression().lhs(), xpr.nestedExpression().rhs()),
@@ -108,23 +108,23 @@ struct product_evaluator<Product<Lhs, Rhs, Options>, ProductTag, LhsShape, RhsSh
    : m_result(xpr.rows(), xpr.cols())
  {
    ::new (static_cast<Base*>(this)) Base(m_result);
-    
+
 // FIXME shall we handle nested_eval here?,
 // if so, then we must take care at removing the call to nested_eval in the specializations (e.g., in permutation_matrix_product, transposition_matrix_product, etc.)
 //     typedef typename internal::nested_eval<Lhs,Rhs::ColsAtCompileTime>::type LhsNested;
 //     typedef typename internal::nested_eval<Rhs,Lhs::RowsAtCompileTime>::type RhsNested;
 //     typedef typename internal::remove_all<LhsNested>::type LhsNestedCleaned;
 //     typedef typename internal::remove_all<RhsNested>::type RhsNestedCleaned;
-//     
+//
 //     const LhsNested lhs(xpr.lhs());
 //     const RhsNested rhs(xpr.rhs());
-//   
+//
 //     generic_product_impl<LhsNestedCleaned, RhsNestedCleaned>::evalTo(m_result, lhs, rhs);

    generic_product_impl<Lhs, Rhs, LhsShape, RhsShape, ProductTag>::evalTo(m_result, xpr.lhs(), xpr.rhs());
  }
-  
-protected:  
+
+protected:
  PlainObject m_result;
 };

@@ -137,7 +137,7 @@ struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::assign_op<Scal
  typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type>
 {
  typedef Product<Lhs,Rhs,Options> SrcXprType;
-  static EIGEN_STRONG_INLINE
+  static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
  void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &)
  {
    Index dstRows = src.rows();
@@ -155,7 +155,7 @@ struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::add_assign_op<
  typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type>
 {
  typedef Product<Lhs,Rhs,Options> SrcXprType;
-  static EIGEN_STRONG_INLINE
+  static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
  void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<Scalar,Scalar> &)
  {
    eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
@@ -170,7 +170,7 @@ struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::sub_assign_op<
  typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type>
 {
  typedef Product<Lhs,Rhs,Options> SrcXprType;
-  static EIGEN_STRONG_INLINE
+  static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
  void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<Scalar,Scalar> &)
  {
    eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
@@ -190,7 +190,7 @@ struct Assignment<DstXprType, CwiseBinaryOp<internal::scalar_product_op<ScalarBi
  typedef CwiseBinaryOp<internal::scalar_product_op<ScalarBis,Scalar>,
                        const CwiseNullaryOp<internal::scalar_constant_op<ScalarBis>,Plain>,
                        const Product<Lhs,Rhs,DefaultProduct> > SrcXprType;
-  static EIGEN_STRONG_INLINE
+  static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
  void run(DstXprType &dst, const SrcXprType &src, const AssignFunc& func)
  {
    call_assignment_no_alias(dst, (src.lhs().functor().m_other * src.rhs().lhs())*src.rhs().rhs(), func);
@@ -250,13 +250,13 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,InnerProduct>
  {
    dst.coeffRef(0,0) = (lhs.transpose().cwiseProduct(rhs)).sum();
  }
-  
+
  template<typename Dst>
  static EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
  {
    dst.coeffRef(0,0) += (lhs.transpose().cwiseProduct(rhs)).sum();
  }
-  
+
  template<typename Dst>
  static EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
  { dst.coeffRef(0,0) -= (lhs.transpose().cwiseProduct(rhs)).sum(); }
@@ -298,7 +298,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,OuterProduct>
 {
  template<typename T> struct is_row_major : internal::conditional<(int(T::Flags)&RowMajorBit), internal::true_type, internal::false_type>::type {};
  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
-  
+
  // TODO it would be nice to be able to exploit our *_assign_op functors for that purpose
  struct set  { template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived()  = src; } };
  struct add  { template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() += src; } };
@@ -310,31 +310,31 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,OuterProduct>
      dst.const_cast_derived() += m_scale * src;
    }
  };
-  
+
  template<typename Dst>
  static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
  {
    internal::outer_product_selector_run(dst, lhs, rhs, set(), is_row_major<Dst>());
  }
-  
+
  template<typename Dst>
  static EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
  {
    internal::outer_product_selector_run(dst, lhs, rhs, add(), is_row_major<Dst>());
  }
-  
+
  template<typename Dst>
  static EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
  {
    internal::outer_product_selector_run(dst, lhs, rhs, sub(), is_row_major<Dst>());
  }
-  
+
  template<typename Dst>
  static EIGEN_STRONG_INLINE void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
  {
    internal::outer_product_selector_run(dst, lhs, rhs, adds(alpha), is_row_major<Dst>());
  }
-  
+
 };


@@ -343,7 +343,7 @@ template<typename Lhs, typename Rhs, typename Derived>
 struct generic_product_impl_base
 {
  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
-  
+
  template<typename Dst>
  static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
  { dst.setZero(); scaleAndAddTo(dst, lhs, rhs, Scalar(1)); }
@@ -355,7 +355,7 @@ struct generic_product_impl_base
  template<typename Dst>
  static EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
  { scaleAndAddTo(dst, lhs, rhs, Scalar(-1)); }
-  
+
  template<typename Dst>
  static EIGEN_STRONG_INLINE void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
  { Derived::scaleAndAddTo(dst,lhs,rhs,alpha); }
@@ -385,12 +385,12 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemvProduct>
 };

 template<typename Lhs, typename Rhs>
-struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,CoeffBasedProductMode> 
+struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,CoeffBasedProductMode>
 {
  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
-  
+
  template<typename Dst>
-  static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
  {
    // Same as: dst.noalias() = lhs.lazyProduct(rhs);
    // but easier on the compiler side
@@ -403,7 +403,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,CoeffBasedProductMode>
    // dst.noalias() += lhs.lazyProduct(rhs);
    call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::add_assign_op<typename Dst::Scalar,Scalar>());
  }
-  
+
  template<typename Dst>
  static EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
  {
@@ -435,8 +435,8 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,CoeffBasedProductMode>
  {
    call_assignment_no_alias(dst, lhs.lazyProduct(rhs), func);
  }
-  
-  
+
+
 //   template<typename Dst>
 //   static inline void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
 //   { dst.noalias() += alpha * lhs.lazyProduct(rhs); }
@@ -497,7 +497,7 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,

  typedef typename internal::nested_eval<Lhs,Rhs::ColsAtCompileTime>::type LhsNested;
  typedef typename internal::nested_eval<Rhs,Lhs::RowsAtCompileTime>::type RhsNested;
-  
+
  typedef typename internal::remove_all<LhsNested>::type LhsNestedCleaned;
  typedef typename internal::remove_all<RhsNested>::type RhsNestedCleaned;

@@ -516,7 +516,7 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
  typedef typename find_best_packet<Scalar,ColsAtCompileTime>::type RhsVecPacketType;

  enum {
-      
+
    LhsCoeffReadCost = LhsEtorType::CoeffReadCost,
    RhsCoeffReadCost = RhsEtorType::CoeffReadCost,
    CoeffReadCost = InnerSize==0 ? NumTraits<Scalar>::ReadCost
@@ -525,10 +525,10 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
                    + (InnerSize - 1) * NumTraits<Scalar>::AddCost,

    Unroll = CoeffReadCost <= EIGEN_UNROLLING_LIMIT,
-    
+
    LhsFlags = LhsEtorType::Flags,
    RhsFlags = RhsEtorType::Flags,
-    
+
    LhsRowMajor = LhsFlags & RowMajorBit,
    RhsRowMajor = RhsFlags & RowMajorBit,

@@ -538,7 +538,7 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
    // Here, we don't care about alignment larger than the usable packet size.
    LhsAlignment = EIGEN_PLAIN_ENUM_MIN(LhsEtorType::Alignment,LhsVecPacketSize*int(sizeof(typename LhsNestedCleaned::Scalar))),
    RhsAlignment = EIGEN_PLAIN_ENUM_MIN(RhsEtorType::Alignment,RhsVecPacketSize*int(sizeof(typename RhsNestedCleaned::Scalar))),
-      
+
    SameType = is_same<typename LhsNestedCleaned::Scalar,typename RhsNestedCleaned::Scalar>::value,

    CanVectorizeRhs = bool(RhsRowMajor) && (RhsFlags & PacketAccessBit) && (ColsAtCompileTime!=1),
@@ -553,7 +553,7 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
          // TODO enable vectorization for mixed types
          | (SameType && (CanVectorizeLhs || CanVectorizeRhs) ? PacketAccessBit : 0)
          | (XprType::IsVectorAtCompileTime ? LinearAccessBit : 0),
-          
+
    LhsOuterStrideBytes = int(LhsNestedCleaned::OuterStrideAtCompileTime) * int(sizeof(typename LhsNestedCleaned::Scalar)),
    RhsOuterStrideBytes = int(RhsNestedCleaned::OuterStrideAtCompileTime) * int(sizeof(typename RhsNestedCleaned::Scalar)),

@@ -572,7 +572,7 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
                        && (LhsFlags & RhsFlags & ActualPacketAccessBit)
                        && (InnerSize % packet_traits<Scalar>::size == 0)
  };
-  
+
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CoeffReturnType coeff(Index row, Index col) const
  {
    return (m_lhs.row(row).transpose().cwiseProduct( m_rhs.col(col) )).sum();
@@ -611,7 +611,7 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
 protected:
  typename internal::add_const_on_value_type<LhsNested>::type m_lhs;
  typename internal::add_const_on_value_type<RhsNested>::type m_rhs;
-  
+
  LhsEtorType m_lhsImpl;
  RhsEtorType m_rhsImpl;

@@ -730,7 +730,7 @@ struct generic_product_impl<Lhs,Rhs,TriangularShape,DenseShape,ProductTag>
  : generic_product_impl_base<Lhs,Rhs,generic_product_impl<Lhs,Rhs,TriangularShape,DenseShape,ProductTag> >
 {
  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
-  
+
  template<typename Dest>
  static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
  {
@@ -744,7 +744,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,TriangularShape,ProductTag>
 : generic_product_impl_base<Lhs,Rhs,generic_product_impl<Lhs,Rhs,DenseShape,TriangularShape,ProductTag> >
 {
  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
-  
+
  template<typename Dest>
  static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
  {
@@ -765,7 +765,7 @@ struct generic_product_impl<Lhs,Rhs,SelfAdjointShape,DenseShape,ProductTag>
  : generic_product_impl_base<Lhs,Rhs,generic_product_impl<Lhs,Rhs,SelfAdjointShape,DenseShape,ProductTag> >
 {
  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
-  
+
  template<typename Dest>
  static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
  {
@@ -778,7 +778,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,SelfAdjointShape,ProductTag>
 : generic_product_impl_base<Lhs,Rhs,generic_product_impl<Lhs,Rhs,DenseShape,SelfAdjointShape,ProductTag> >
 {
  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
-  
+
  template<typename Dest>
  static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
  {
@@ -790,7 +790,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,SelfAdjointShape,ProductTag>
 /***************************************************************************
 * Diagonal products
 ***************************************************************************/
-  
+
 template<typename MatrixType, typename DiagonalType, typename Derived, int ProductOrder>
 struct diagonal_product_evaluator_base
  : evaluator_base<Derived>
@@ -799,7 +799,7 @@ struct diagonal_product_evaluator_base
 public:
  enum {
    CoeffReadCost = NumTraits<Scalar>::MulCost + evaluator<MatrixType>::CoeffReadCost + evaluator<DiagonalType>::CoeffReadCost,
-    
+
    MatrixFlags = evaluator<MatrixType>::Flags,
    DiagFlags = evaluator<DiagonalType>::Flags,
    _StorageOrder = MatrixFlags & RowMajorBit ? RowMajor : ColMajor,
@@ -817,14 +817,14 @@ public:
                      ||  (DiagonalType::SizeAtCompileTime==Dynamic && MatrixType::RowsAtCompileTime==1 && ProductOrder==OnTheLeft)
                      ||  (DiagonalType::SizeAtCompileTime==Dynamic && MatrixType::ColsAtCompileTime==1 && ProductOrder==OnTheRight)
  };
-  
+
  diagonal_product_evaluator_base(const MatrixType &mat, const DiagonalType &diag)
    : m_diagImpl(diag), m_matImpl(mat)
  {
    EIGEN_INTERNAL_CHECK_COST_VALUE(NumTraits<Scalar>::MulCost);
    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
  }
-  
+
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index idx) const
  {
    if(AsScalarProduct)
@@ -832,7 +832,7 @@ public:
    else
      return m_diagImpl.coeff(idx) * m_matImpl.coeff(idx);
  }
-  
+
 protected:
  template<int LoadMode,typename PacketType>
  EIGEN_STRONG_INLINE PacketType packet_impl(Index row, Index col, Index id, internal::true_type) const
@@ -840,7 +840,7 @@ protected:
    return internal::pmul(m_matImpl.template packet<LoadMode,PacketType>(row, col),
                          internal::pset1<PacketType>(m_diagImpl.coeff(id)));
  }
-  
+
  template<int LoadMode,typename PacketType>
  EIGEN_STRONG_INLINE PacketType packet_impl(Index row, Index col, Index id, internal::false_type) const
  {
@@ -851,7 +851,7 @@ protected:
    return internal::pmul(m_matImpl.template packet<LoadMode,PacketType>(row, col),
                          m_diagImpl.template packet<DiagonalPacketLoadMode,PacketType>(id));
  }
-  
+
  evaluator<DiagonalType> m_diagImpl;
  evaluator<MatrixType>   m_matImpl;
 };
@@ -866,10 +866,10 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DiagonalSha
  using Base::m_matImpl;
  using Base::coeff;
  typedef typename Base::Scalar Scalar;
-  
+
  typedef Product<Lhs, Rhs, ProductKind> XprType;
  typedef typename XprType::PlainObject PlainObject;
-  
+
  enum {
    StorageOrder = int(Rhs::Flags) & RowMajorBit ? RowMajor : ColMajor
  };
@@ -878,12 +878,12 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DiagonalSha
    : Base(xpr.rhs(), xpr.lhs().diagonal())
  {
  }
-  
+
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index row, Index col) const
  {
    return m_diagImpl.coeff(row) * m_matImpl.coeff(row, col);
  }
-  
+
 #ifndef __CUDACC__
  template<int LoadMode,typename PacketType>
  EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const
@@ -893,7 +893,7 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DiagonalSha
    return this->template packet_impl<LoadMode,PacketType>(row,col, row,
                                 typename internal::conditional<int(StorageOrder)==RowMajor, internal::true_type, internal::false_type>::type());
  }
-  
+
  template<int LoadMode,typename PacketType>
  EIGEN_STRONG_INLINE PacketType packet(Index idx) const
  {
@@ -912,22 +912,22 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DenseShape,
  using Base::m_matImpl;
  using Base::coeff;
  typedef typename Base::Scalar Scalar;
-  
+
  typedef Product<Lhs, Rhs, ProductKind> XprType;
  typedef typename XprType::PlainObject PlainObject;
-  
+
  enum { StorageOrder = int(Lhs::Flags) & RowMajorBit ? RowMajor : ColMajor };

  EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr)
    : Base(xpr.lhs(), xpr.rhs().diagonal())
  {
  }
-  
+
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index row, Index col) const
  {
    return m_matImpl.coeff(row, col) * m_diagImpl.coeff(col);
  }
-  
+
 #ifndef __CUDACC__
  template<int LoadMode,typename PacketType>
  EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const
@@ -935,7 +935,7 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DenseShape,
    return this->template packet_impl<LoadMode,PacketType>(row,col, col,
                                 typename internal::conditional<int(StorageOrder)==ColMajor, internal::true_type, internal::false_type>::type());
  }
-  
+
  template<int LoadMode,typename PacketType>
  EIGEN_STRONG_INLINE PacketType packet(Index idx) const
  {
@@ -1017,7 +1017,7 @@ template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
 struct generic_product_impl<Lhs, Rhs, PermutationShape, MatrixShape, ProductTag>
 {
  template<typename Dest>
-  static void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_DEVICE_FUNC void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs)
  {
    permutation_matrix_product<Rhs, OnTheLeft, false, MatrixShape>::run(dst, lhs, rhs);
  }
@@ -1027,7 +1027,7 @@ template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
 struct generic_product_impl<Lhs, Rhs, MatrixShape, PermutationShape, ProductTag>
 {
  template<typename Dest>
-  static void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_DEVICE_FUNC void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs)
  {
    permutation_matrix_product<Lhs, OnTheRight, false, MatrixShape>::run(dst, rhs, lhs);
  }
@@ -1037,7 +1037,7 @@ template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
 struct generic_product_impl<Inverse<Lhs>, Rhs, PermutationShape, MatrixShape, ProductTag>
 {
  template<typename Dest>
-  static void evalTo(Dest& dst, const Inverse<Lhs>& lhs, const Rhs& rhs)
+  static EIGEN_DEVICE_FUNC void evalTo(Dest& dst, const Inverse<Lhs>& lhs, const Rhs& rhs)
  {
    permutation_matrix_product<Rhs, OnTheLeft, true, MatrixShape>::run(dst, lhs.nestedExpression(), rhs);
  }
@@ -1047,7 +1047,7 @@ template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
 struct generic_product_impl<Lhs, Inverse<Rhs>, MatrixShape, PermutationShape, ProductTag>
 {
  template<typename Dest>
-  static void evalTo(Dest& dst, const Lhs& lhs, const Inverse<Rhs>& rhs)
+  static EIGEN_DEVICE_FUNC void evalTo(Dest& dst, const Lhs& lhs, const Inverse<Rhs>& rhs)
  {
    permutation_matrix_product<Lhs, OnTheRight, true, MatrixShape>::run(dst, rhs.nestedExpression(), lhs);
  }
@@ -1069,7 +1069,7 @@ struct transposition_matrix_product
 {
  typedef typename nested_eval<ExpressionType, 1>::type MatrixType;
  typedef typename remove_all<MatrixType>::type MatrixTypeCleaned;
-  
+
  template<typename Dest, typename TranspositionType>
  static inline void run(Dest& dst, const TranspositionType& tr, const ExpressionType& xpr)
  {
@@ -1094,7 +1094,7 @@ template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
 struct generic_product_impl<Lhs, Rhs, TranspositionsShape, MatrixShape, ProductTag>
 {
  template<typename Dest>
-  static void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_DEVICE_FUNC void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs)
  {
    transposition_matrix_product<Rhs, OnTheLeft, false, MatrixShape>::run(dst, lhs, rhs);
  }
@@ -1104,7 +1104,7 @@ template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
 struct generic_product_impl<Lhs, Rhs, MatrixShape, TranspositionsShape, ProductTag>
 {
  template<typename Dest>
-  static void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_DEVICE_FUNC void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs)
  {
    transposition_matrix_product<Lhs, OnTheRight, false, MatrixShape>::run(dst, rhs, lhs);
  }
@@ -1115,7 +1115,7 @@ template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
 struct generic_product_impl<Transpose<Lhs>, Rhs, TranspositionsShape, MatrixShape, ProductTag>
 {
  template<typename Dest>
-  static void evalTo(Dest& dst, const Transpose<Lhs>& lhs, const Rhs& rhs)
+  static EIGEN_DEVICE_FUNC void evalTo(Dest& dst, const Transpose<Lhs>& lhs, const Rhs& rhs)
  {
    transposition_matrix_product<Rhs, OnTheLeft, true, MatrixShape>::run(dst, lhs.nestedExpression(), rhs);
  }
@@ -1125,7 +1125,7 @@ template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
 struct generic_product_impl<Lhs, Transpose<Rhs>, MatrixShape, TranspositionsShape, ProductTag>
 {
  template<typename Dest>
-  static void evalTo(Dest& dst, const Lhs& lhs, const Transpose<Rhs>& rhs)
+  static EIGEN_DEVICE_FUNC void evalTo(Dest& dst, const Lhs& lhs, const Transpose<Rhs>& rhs)
  {
    transposition_matrix_product<Lhs, OnTheRight, true, MatrixShape>::run(dst, rhs.nestedExpression(), lhs);
  }
--- a/Eigen/src/Core/Random.h
+++ b/Eigen/src/Core/Random.h
@@ -10,7 +10,7 @@
 #ifndef EIGEN_RANDOM_H
 #define EIGEN_RANDOM_H

-namespace Eigen { 
+namespace Eigen {

 namespace internal {

@@ -29,16 +29,16 @@ struct functor_traits<scalar_random_op<Scalar> >
  *
  * Numbers are uniformly spread through their whole definition range for integer types,
  * and in the [-1:1] range for floating point scalar types.
-  * 
+  *
  * The parameters \a rows and \a cols are the number of rows and of columns of
  * the returned matrix. Must be compatible with this MatrixBase type.
  *
  * \not_reentrant
-  * 
+  *
  * This variant is meant to be used for dynamic-size matrix types. For fixed-size types,
  * it is redundant to pass \a rows and \a cols as arguments, so Random() should be used
  * instead.
-  * 
+  *
  *
  * Example: \include MatrixBase_random_int_int.cpp
  * Output: \verbinclude MatrixBase_random_int_int.out
@@ -46,7 +46,7 @@ struct functor_traits<scalar_random_op<Scalar> >
  * This expression has the "evaluate before nesting" flag so that it will be evaluated into
  * a temporary matrix whenever it is nested in a larger expression. This prevents unexpected
  * behavior with expressions involving random matrices.
-  * 
+  *
  * See DenseBase::NullaryExpr(Index, const CustomNullaryOp&) for an example using C++11 random generators.
  *
  * \sa DenseBase::setRandom(), DenseBase::Random(Index), DenseBase::Random()
@@ -93,7 +93,7 @@ DenseBase<Derived>::Random(Index size)
  *
  * Numbers are uniformly spread through their whole definition range for integer types,
  * and in the [-1:1] range for floating point scalar types.
-  * 
+  *
  * This variant is only for fixed-size MatrixBase types. For dynamic-size types, you
  * need to use the variants taking size arguments.
  *
@@ -103,7 +103,7 @@ DenseBase<Derived>::Random(Index size)
  * This expression has the "evaluate before nesting" flag so that it will be evaluated into
  * a temporary matrix whenever it is nested in a larger expression. This prevents unexpected
  * behavior with expressions involving random matrices.
-  * 
+  *
  * \not_reentrant
  *
  * \sa DenseBase::setRandom(), DenseBase::Random(Index,Index), DenseBase::Random(Index)
@@ -119,16 +119,16 @@ DenseBase<Derived>::Random()
  *
  * Numbers are uniformly spread through their whole definition range for integer types,
  * and in the [-1:1] range for floating point scalar types.
-  * 
+  *
  * \not_reentrant
-  * 
+  *
  * Example: \include MatrixBase_setRandom.cpp
  * Output: \verbinclude MatrixBase_setRandom.out
  *
  * \sa class CwiseNullaryOp, setRandom(Index), setRandom(Index,Index)
  */
 template<typename Derived>
-inline Derived& DenseBase<Derived>::setRandom()
+EIGEN_DEVICE_FUNC inline Derived& DenseBase<Derived>::setRandom()
 {
  return *this = Random(rows(), cols());
 }
@@ -137,7 +137,7 @@ inline Derived& DenseBase<Derived>::setRandom()
  *
  * Numbers are uniformly spread through their whole definition range for integer types,
  * and in the [-1:1] range for floating point scalar types.
-  * 
+  *
  * \only_for_vectors
  * \not_reentrant
  *
@@ -160,7 +160,7 @@ PlainObjectBase<Derived>::setRandom(Index newSize)
  * and in the [-1:1] range for floating point scalar types.
  *
  * \not_reentrant
-  * 
+  *
  * \param rows the new number of rows
  * \param cols the new number of columns
  *
--- a/Eigen/src/Core/Redux.h
+++ b/Eigen/src/Core/Redux.h
@@ -11,7 +11,7 @@
 #ifndef EIGEN_REDUX_H
 #define EIGEN_REDUX_H

-namespace Eigen { 
+namespace Eigen {

 namespace internal {

@@ -60,7 +60,7 @@ public:
  enum {
    Unrolling = Cost <= UnrollingLimit ? CompleteUnrolling : NoUnrolling
  };
-  
+
 #ifdef EIGEN_DEBUG_ASSIGN
  static void debug()
  {
@@ -128,7 +128,7 @@ template<typename Func, typename Derived, int Start>
 struct redux_novec_unroller<Func, Derived, Start, 0>
 {
  typedef typename Derived::Scalar Scalar;
-  EIGEN_DEVICE_FUNC 
+  EIGEN_DEVICE_FUNC
  static EIGEN_STRONG_INLINE Scalar run(const Derived&, const Func&) { return Scalar(); }
 };

@@ -215,7 +215,7 @@ struct redux_impl<Func, Derived, LinearVectorizedTraversal, NoUnrolling>
  static Scalar run(const Derived &mat, const Func& func)
  {
    const Index size = mat.size();
-    
+
    const Index packetSize = redux_traits<Func, Derived>::PacketSize;
    const int packetAlignment = unpacket_traits<PacketScalar>::alignment;
    enum {
@@ -336,12 +336,12 @@ class redux_evaluator
 public:
  typedef _XprType XprType;
  EIGEN_DEVICE_FUNC explicit redux_evaluator(const XprType &xpr) : m_evaluator(xpr), m_xpr(xpr) {}
-  
+
  typedef typename XprType::Scalar Scalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename XprType::PacketScalar PacketScalar;
  typedef typename XprType::PacketReturnType PacketReturnType;
-  
+
  enum {
    MaxRowsAtCompileTime = XprType::MaxRowsAtCompileTime,
    MaxColsAtCompileTime = XprType::MaxColsAtCompileTime,
@@ -353,7 +353,7 @@ public:
    CoeffReadCost = evaluator<XprType>::CoeffReadCost,
    Alignment = evaluator<XprType>::Alignment
  };
-  
+
  EIGEN_DEVICE_FUNC Index rows() const { return m_xpr.rows(); }
  EIGEN_DEVICE_FUNC Index cols() const { return m_xpr.cols(); }
  EIGEN_DEVICE_FUNC Index size() const { return m_xpr.size(); }
@@ -375,17 +375,17 @@ public:
  template<int LoadMode, typename PacketType>
  PacketType packet(Index index) const
  { return m_evaluator.template packet<LoadMode,PacketType>(index); }
-  
+
  EIGEN_DEVICE_FUNC
  CoeffReturnType coeffByOuterInner(Index outer, Index inner) const
  { return m_evaluator.coeff(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }
-  
+
  template<int LoadMode, typename PacketType>
  PacketType packetByOuterInner(Index outer, Index inner) const
  { return m_evaluator.template packet<LoadMode,PacketType>(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }
-  
+
  const XprType & nestedExpression() const { return m_xpr; }
-  
+
 protected:
  internal::evaluator<XprType> m_evaluator;
  const XprType &m_xpr;
@@ -407,14 +407,14 @@ protected:
  */
 template<typename Derived>
 template<typename Func>
-EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::redux(const Func& func) const
 {
  eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix");

  typedef typename internal::redux_evaluator<Derived> ThisEvaluator;
  ThisEvaluator thisEval(derived());
-  
+
  return internal::redux_impl<Func, ThisEvaluator>::run(thisEval, func);
 }

@@ -422,7 +422,7 @@ DenseBase<Derived>::redux(const Func& func) const
  * \warning the result is undefined if \c *this contains NaN.
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::minCoeff() const
 {
  return derived().redux(Eigen::internal::scalar_min_op<Scalar,Scalar>());
@@ -432,7 +432,7 @@ DenseBase<Derived>::minCoeff() const
  * \warning the result is undefined if \c *this contains NaN.
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::maxCoeff() const
 {
  return derived().redux(Eigen::internal::scalar_max_op<Scalar,Scalar>());
@@ -445,7 +445,7 @@ DenseBase<Derived>::maxCoeff() const
  * \sa trace(), prod(), mean()
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::sum() const
 {
  if(SizeAtCompileTime==0 || (SizeAtCompileTime==Dynamic && size()==0))
@@ -458,7 +458,7 @@ DenseBase<Derived>::sum() const
 * \sa trace(), prod(), sum()
 */
 template<typename Derived>
-EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::mean() const
 {
 #ifdef __INTEL_COMPILER
@@ -479,7 +479,7 @@ DenseBase<Derived>::mean() const
  * \sa sum(), mean(), trace()
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
+EIGEN_DEVICE_FUNC  EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::prod() const
 {
  if(SizeAtCompileTime==0 || (SizeAtCompileTime==Dynamic && size()==0))
@@ -494,7 +494,7 @@ DenseBase<Derived>::prod() const
  * \sa diagonal(), sum()
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
+EIGEN_DEVICE_FUNC  EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
 MatrixBase<Derived>::trace() const
 {
  return derived().diagonal().sum();
--- a/Eigen/src/Core/Replicate.h
+++ b/Eigen/src/Core/Replicate.h
@@ -10,7 +10,7 @@
 #ifndef EIGEN_REPLICATE_H
 #define EIGEN_REPLICATE_H

-namespace Eigen { 
+namespace Eigen {

 namespace internal {
 template<typename MatrixType,int RowFactor,int ColFactor>
@@ -35,7 +35,7 @@ struct traits<Replicate<MatrixType,RowFactor,ColFactor> >
    IsRowMajor = MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1 ? 1
               : MaxColsAtCompileTime==1 && MaxRowsAtCompileTime!=1 ? 0
               : (MatrixType::Flags & RowMajorBit) ? 1 : 0,
-    
+
    // FIXME enable DirectAccess with negative strides?
    Flags = IsRowMajor ? RowMajorBit : 0
  };
@@ -95,8 +95,8 @@ template<typename MatrixType,int RowFactor,int ColFactor> class Replicate

    EIGEN_DEVICE_FUNC
    const _MatrixTypeNested& nestedExpression() const
-    { 
-      return m_matrix; 
+    {
+      return m_matrix;
    }

  protected:
@@ -115,7 +115,7 @@ template<typename MatrixType,int RowFactor,int ColFactor> class Replicate
  */
 template<typename Derived>
 template<int RowFactor, int ColFactor>
-const Replicate<Derived,RowFactor,ColFactor>
+EIGEN_DEVICE_FUNC const Replicate<Derived,RowFactor,ColFactor>
 DenseBase<Derived>::replicate() const
 {
  return Replicate<Derived,RowFactor,ColFactor>(derived());
@@ -130,7 +130,7 @@ DenseBase<Derived>::replicate() const
  * \sa VectorwiseOp::replicate(), DenseBase::replicate(), class Replicate
  */
 template<typename ExpressionType, int Direction>
-const typename VectorwiseOp<ExpressionType,Direction>::ReplicateReturnType
+EIGEN_DEVICE_FUNC const typename VectorwiseOp<ExpressionType,Direction>::ReplicateReturnType
 VectorwiseOp<ExpressionType,Direction>::replicate(Index factor) const
 {
  return typename VectorwiseOp<ExpressionType,Direction>::ReplicateReturnType
--- a/Eigen/src/Core/ReturnByValue.h
+++ b/Eigen/src/Core/ReturnByValue.h
@@ -79,7 +79,7 @@ template<typename Derived> class ReturnByValue

 template<typename Derived>
 template<typename OtherDerived>
-Derived& DenseBase<Derived>::operator=(const ReturnByValue<OtherDerived>& other)
+EIGEN_DEVICE_FUNC Derived& DenseBase<Derived>::operator=(const ReturnByValue<OtherDerived>& other)
 {
  other.evalTo(derived());
  return derived();
@@ -90,7 +90,7 @@ namespace internal {
 // Expression is evaluated in a temporary; default implementation of Assignment is bypassed so that
 // when a ReturnByValue expression is assigned, the evaluator is not constructed.
 // TODO: Finalize port to new regime; ReturnByValue should not exist in the expression world
-  
+
 template<typename Derived>
 struct evaluator<ReturnByValue<Derived> >
  : public evaluator<typename internal::traits<Derived>::ReturnType>
@@ -98,7 +98,7 @@ struct evaluator<ReturnByValue<Derived> >
  typedef ReturnByValue<Derived> XprType;
  typedef typename internal::traits<Derived>::ReturnType PlainObject;
  typedef evaluator<PlainObject> Base;
-  
+
  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr)
    : m_result(xpr.rows(), xpr.cols())
  {
--- a/Eigen/src/Core/Reverse.h
+++ b/Eigen/src/Core/Reverse.h
@@ -12,7 +12,7 @@
 #ifndef EIGEN_REVERSE_H
 #define EIGEN_REVERSE_H

-namespace Eigen { 
+namespace Eigen {

 namespace internal {

@@ -44,7 +44,7 @@ template<typename PacketType> struct reverse_packet_cond<PacketType,false>
  static inline PacketType run(const PacketType& x) { return x; }
 };

-} // end namespace internal 
+} // end namespace internal

 /** \class Reverse
  * \ingroup Core_Module
@@ -98,7 +98,7 @@ template<typename MatrixType, int Direction> class Reverse
    }

    EIGEN_DEVICE_FUNC const typename internal::remove_all<typename MatrixType::Nested>::type&
-    nestedExpression() const 
+    nestedExpression() const
    {
      return m_matrix;
    }
@@ -114,7 +114,7 @@ template<typename MatrixType, int Direction> class Reverse
  *
  */
 template<typename Derived>
-inline typename DenseBase<Derived>::ReverseReturnType
+EIGEN_DEVICE_FUNC inline typename DenseBase<Derived>::ReverseReturnType
 DenseBase<Derived>::reverse()
 {
  return ReverseReturnType(derived());
@@ -136,7 +136,7 @@ DenseBase<Derived>::reverse()
  *
  * \sa VectorwiseOp::reverseInPlace(), reverse() */
 template<typename Derived>
-inline void DenseBase<Derived>::reverseInPlace()
+EIGEN_DEVICE_FUNC inline void DenseBase<Derived>::reverseInPlace()
 {
  if(cols()>rows())
  {
@@ -161,7 +161,7 @@ inline void DenseBase<Derived>::reverseInPlace()
 }

 namespace internal {
-  
+
 template<int Direction>
 struct vectorwise_reverse_inplace_impl;

@@ -201,7 +201,7 @@ struct vectorwise_reverse_inplace_impl<Horizontal>
  *
  * \sa DenseBase::reverseInPlace(), reverse() */
 template<typename ExpressionType, int Direction>
-void VectorwiseOp<ExpressionType,Direction>::reverseInPlace()
+EIGEN_DEVICE_FUNC void VectorwiseOp<ExpressionType,Direction>::reverseInPlace()
 {
  internal::vectorwise_reverse_inplace_impl<Direction>::run(_expression().const_cast_derived());
 }
--- a/Eigen/src/Core/SelfAdjointView.h
+++ b/Eigen/src/Core/SelfAdjointView.h
@@ -10,7 +10,7 @@
 #ifndef EIGEN_SELFADJOINTMATRIX_H
 #define EIGEN_SELFADJOINTMATRIX_H

-namespace Eigen { 
+namespace Eigen {

 /** \class SelfAdjointView
  * \ingroup Core_Module
@@ -58,7 +58,7 @@ template<typename _MatrixType, unsigned int UpLo> class SelfAdjointView
    typedef MatrixTypeNestedCleaned NestedExpression;

    /** \brief The type of coefficients in this matrix */
-    typedef typename internal::traits<SelfAdjointView>::Scalar Scalar; 
+    typedef typename internal::traits<SelfAdjointView>::Scalar Scalar;
    typedef typename MatrixType::StorageIndex StorageIndex;
    typedef typename internal::remove_all<typename MatrixType::ConjugateReturnType>::type MatrixConjugateReturnType;

@@ -131,7 +131,7 @@ template<typename _MatrixType, unsigned int UpLo> class SelfAdjointView
    {
      return Product<OtherDerived,SelfAdjointView>(lhs.derived(),rhs);
    }
-    
+
    friend EIGEN_DEVICE_FUNC
    const SelfAdjointView<const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar,MatrixType,product),UpLo>
    operator*(const Scalar& s, const SelfAdjointView& mat)
@@ -287,17 +287,17 @@ protected:
  using Base::m_src;
  using Base::m_functor;
 public:
-  
+
  typedef typename Base::DstEvaluatorType DstEvaluatorType;
  typedef typename Base::SrcEvaluatorType SrcEvaluatorType;
  typedef typename Base::Scalar Scalar;
  typedef typename Base::AssignmentTraits AssignmentTraits;
-  
-  
+
+
  EIGEN_DEVICE_FUNC triangular_dense_assignment_kernel(DstEvaluatorType &dst, const SrcEvaluatorType &src, const Functor &func, DstXprType& dstExpr)
    : Base(dst, src, func, dstExpr)
  {}
-  
+
  EIGEN_DEVICE_FUNC void assignCoeff(Index row, Index col)
  {
    eigen_internal_assert(row!=col);
@@ -305,12 +305,12 @@ public:
    m_functor.assignCoeff(m_dst.coeffRef(row,col), tmp);
    m_functor.assignCoeff(m_dst.coeffRef(col,row), numext::conj(tmp));
  }
-  
+
  EIGEN_DEVICE_FUNC void assignDiagonalCoeff(Index id)
  {
    Base::assignCoeff(id,id);
  }
-  
+
  EIGEN_DEVICE_FUNC void assignOppositeCoeff(Index, Index)
  { eigen_internal_assert(false && "should never be called"); }
 };
@@ -324,7 +324,7 @@ public:
 /** This is the const version of MatrixBase::selfadjointView() */
 template<typename Derived>
 template<unsigned int UpLo>
-typename MatrixBase<Derived>::template ConstSelfAdjointViewReturnType<UpLo>::Type
+EIGEN_DEVICE_FUNC typename MatrixBase<Derived>::template ConstSelfAdjointViewReturnType<UpLo>::Type
 MatrixBase<Derived>::selfadjointView() const
 {
  return typename ConstSelfAdjointViewReturnType<UpLo>::Type(derived());
@@ -341,7 +341,7 @@ MatrixBase<Derived>::selfadjointView() const
  */
 template<typename Derived>
 template<unsigned int UpLo>
-typename MatrixBase<Derived>::template SelfAdjointViewReturnType<UpLo>::Type
+EIGEN_DEVICE_FUNC typename MatrixBase<Derived>::template SelfAdjointViewReturnType<UpLo>::Type
 MatrixBase<Derived>::selfadjointView()
 {
  return typename SelfAdjointViewReturnType<UpLo>::Type(derived());
--- a/Eigen/src/Core/Solve.h
+++ b/Eigen/src/Core/Solve.h
@@ -13,7 +13,7 @@
 namespace Eigen {

 template<typename Decomposition, typename RhsType, typename StorageKind> class SolveImpl;
-  
+
 /** \class Solve
  * \ingroup Core_Module
  *
@@ -64,11 +64,11 @@ class Solve : public SolveImpl<Decomposition,RhsType,typename internal::traits<R
 public:
  typedef typename internal::traits<Solve>::PlainObject PlainObject;
  typedef typename internal::traits<Solve>::StorageIndex StorageIndex;
-  
+
  Solve(const Decomposition &dec, const RhsType &rhs)
    : m_dec(dec), m_rhs(rhs)
  {}
-  
+
  EIGEN_DEVICE_FUNC Index rows() const { return m_dec.cols(); }
  EIGEN_DEVICE_FUNC Index cols() const { return m_rhs.cols(); }

@@ -87,14 +87,14 @@ class SolveImpl<Decomposition,RhsType,Dense>
  : public MatrixBase<Solve<Decomposition,RhsType> >
 {
  typedef Solve<Decomposition,RhsType> Derived;
-  
+
 public:
-  
+
  typedef MatrixBase<Solve<Decomposition,RhsType> > Base;
  EIGEN_DENSE_PUBLIC_INTERFACE(Derived)

 private:
-  
+
  Scalar coeff(Index row, Index col) const;
  Scalar coeff(Index i) const;
 };
@@ -119,15 +119,15 @@ struct evaluator<Solve<Decomposition,RhsType> >
  typedef evaluator<PlainObject> Base;

  enum { Flags = Base::Flags | EvalBeforeNestingBit };
-  
+
  EIGEN_DEVICE_FUNC explicit evaluator(const SolveType& solve)
    : m_result(solve.rows(), solve.cols())
  {
    ::new (static_cast<Base*>(this)) Base(m_result);
    solve.dec()._solve_impl(solve.rhs(), m_result);
  }
-  
-protected:  
+
+protected:
  PlainObject m_result;
 };

@@ -137,7 +137,7 @@ template<typename DstXprType, typename DecType, typename RhsType, typename Scala
 struct Assignment<DstXprType, Solve<DecType,RhsType>, internal::assign_op<Scalar,Scalar>, Dense2Dense>
 {
  typedef Solve<DecType,RhsType> SrcXprType;
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &)
+  static EIGEN_DEVICE_FUNC void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &)
  {
    Index dstRows = src.rows();
    Index dstCols = src.cols();
@@ -153,7 +153,7 @@ template<typename DstXprType, typename DecType, typename RhsType, typename Scala
 struct Assignment<DstXprType, Solve<Transpose<const DecType>,RhsType>, internal::assign_op<Scalar,Scalar>, Dense2Dense>
 {
  typedef Solve<Transpose<const DecType>,RhsType> SrcXprType;
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &)
+  static EIGEN_DEVICE_FUNC void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &)
  {
    Index dstRows = src.rows();
    Index dstCols = src.cols();
@@ -170,13 +170,13 @@ struct Assignment<DstXprType, Solve<CwiseUnaryOp<internal::scalar_conjugate_op<t
                  internal::assign_op<Scalar,Scalar>, Dense2Dense>
 {
  typedef Solve<CwiseUnaryOp<internal::scalar_conjugate_op<typename DecType::Scalar>, const Transpose<const DecType> >,RhsType> SrcXprType;
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &)
+  static EIGEN_DEVICE_FUNC void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &)
  {
    Index dstRows = src.rows();
    Index dstCols = src.cols();
    if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
      dst.resize(dstRows, dstCols);
-    
+
    src.dec().nestedExpression().nestedExpression().template _solve_impl_transposed<true>(src.rhs(), dst);
  }
 };
--- a/Eigen/src/Core/SolveTriangular.h
+++ b/Eigen/src/Core/SolveTriangular.h
@@ -10,7 +10,7 @@
 #ifndef EIGEN_SOLVETRIANGULAR_H
 #define EIGEN_SOLVETRIANGULAR_H

-namespace Eigen { 
+namespace Eigen {

 namespace internal {

@@ -64,7 +64,7 @@ struct triangular_solver_selector<Lhs,Rhs,Side,Mode,NoUnrolling,1>

    ei_declare_aligned_stack_constructed_variable(RhsScalar,actualRhs,rhs.size(),
                                                  (useRhsDirectly ? rhs.data() : 0));
-                                                  
+
    if(!useRhsDirectly)
      MappedRhs(actualRhs,rhs.size()) = rhs;

@@ -148,7 +148,7 @@ struct triangular_solver_selector<Lhs,Rhs,OnTheRight,Mode,CompleteUnrolling,1> {
  {
    Transpose<const Lhs> trLhs(lhs);
    Transpose<Rhs> trRhs(rhs);
-    
+
    triangular_solver_unroller<Transpose<const Lhs>,Transpose<Rhs>,
                              ((Mode&Upper)==Upper ? Lower : Upper) | (Mode&UnitDiag),
                              0,Rhs::SizeAtCompileTime>::run(trLhs,trRhs);
@@ -164,7 +164,7 @@ struct triangular_solver_selector<Lhs,Rhs,OnTheRight,Mode,CompleteUnrolling,1> {
 #ifndef EIGEN_PARSED_BY_DOXYGEN
 template<typename MatrixType, unsigned int Mode>
 template<int Side, typename OtherDerived>
-void TriangularViewImpl<MatrixType,Mode,Dense>::solveInPlace(const MatrixBase<OtherDerived>& _other) const
+EIGEN_DEVICE_FUNC void TriangularViewImpl<MatrixType,Mode,Dense>::solveInPlace(const MatrixBase<OtherDerived>& _other) const
 {
  OtherDerived& other = _other.const_cast_derived();
  eigen_assert( derived().cols() == derived().rows() && ((Side==OnTheLeft && derived().cols() == other.rows()) || (Side==OnTheRight && derived().cols() == other.cols())) );
@@ -187,7 +187,7 @@ void TriangularViewImpl<MatrixType,Mode,Dense>::solveInPlace(const MatrixBase<Ot

 template<typename Derived, unsigned int Mode>
 template<int Side, typename Other>
-const internal::triangular_solve_retval<Side,TriangularView<Derived,Mode>,Other>
+EIGEN_DEVICE_FUNC const internal::triangular_solve_retval<Side,TriangularView<Derived,Mode>,Other>
 TriangularViewImpl<Derived,Mode,Dense>::solve(const MatrixBase<Other>& other) const
 {
  return internal::triangular_solve_retval<Side,TriangularViewType,Other>(derived(), other.derived());
--- a/Eigen/src/Core/Transpose.h
+++ b/Eigen/src/Core/Transpose.h
@@ -11,7 +11,7 @@
 #ifndef EIGEN_TRANSPOSE_H
 #define EIGEN_TRANSPOSE_H

-namespace Eigen { 
+namespace Eigen {

 namespace internal {
 template<typename MatrixType>
@@ -170,7 +170,7 @@ template<typename MatrixType> class TransposeImpl<MatrixType,Dense>
  *
  * \sa transposeInPlace(), adjoint() */
 template<typename Derived>
-inline Transpose<Derived>
+EIGEN_DEVICE_FUNC inline Transpose<Derived>
 DenseBase<Derived>::transpose()
 {
  return TransposeReturnType(derived());
@@ -182,7 +182,7 @@ DenseBase<Derived>::transpose()
  *
  * \sa transposeInPlace(), adjoint() */
 template<typename Derived>
-inline typename DenseBase<Derived>::ConstTransposeReturnType
+EIGEN_DEVICE_FUNC inline typename DenseBase<Derived>::ConstTransposeReturnType
 DenseBase<Derived>::transpose() const
 {
  return ConstTransposeReturnType(derived());
@@ -208,7 +208,7 @@ DenseBase<Derived>::transpose() const
  *
  * \sa adjointInPlace(), transpose(), conjugate(), class Transpose, class internal::scalar_conjugate_op */
 template<typename Derived>
-inline const typename MatrixBase<Derived>::AdjointReturnType
+EIGEN_DEVICE_FUNC inline const typename MatrixBase<Derived>::AdjointReturnType
 MatrixBase<Derived>::adjoint() const
 {
  return AdjointReturnType(this->transpose());
@@ -278,12 +278,12 @@ struct inplace_transpose_selector<MatrixType,false,MatchPacketSize> { // non squ
  * Notice however that this method is only useful if you want to replace a matrix by its own transpose.
  * If you just need the transpose of a matrix, use transpose().
  *
-  * \note if the matrix is not square, then \c *this must be a resizable matrix. 
+  * \note if the matrix is not square, then \c *this must be a resizable matrix.
  * This excludes (non-square) fixed-size matrices, block-expressions and maps.
  *
  * \sa transpose(), adjoint(), adjointInPlace() */
 template<typename Derived>
-inline void DenseBase<Derived>::transposeInPlace()
+EIGEN_DEVICE_FUNC inline void DenseBase<Derived>::transposeInPlace()
 {
  eigen_assert((rows() == cols() || (RowsAtCompileTime == Dynamic && ColsAtCompileTime == Dynamic))
               && "transposeInPlace() called on a non-square non-resizable matrix");
@@ -314,7 +314,7 @@ inline void DenseBase<Derived>::transposeInPlace()
  *
  * \sa transpose(), adjoint(), transposeInPlace() */
 template<typename Derived>
-inline void MatrixBase<Derived>::adjointInPlace()
+EIGEN_DEVICE_FUNC inline void MatrixBase<Derived>::adjointInPlace()
 {
  derived() = adjoint().eval();
 }
--- a/Eigen/src/Core/Transpositions.h
+++ b/Eigen/src/Core/Transpositions.h
@@ -33,17 +33,6 @@ class TranspositionsBase
      indices() = other.indices();
      return derived();
    }
-    
-    #ifndef EIGEN_PARSED_BY_DOXYGEN
-    /** This is a special case of the templated operator=. Its purpose is to
-      * prevent a default operator= from hiding the templated operator=.
-      */
-    Derived& operator=(const TranspositionsBase& other)
-    {
-      indices() = other.indices();
-      return derived();
-    }
-    #endif

    /** \returns the number of transpositions */
    Index size() const { return indices().size(); }
@@ -171,12 +160,6 @@ class Transpositions : public TranspositionsBase<Transpositions<SizeAtCompileTim
    inline Transpositions(const TranspositionsBase<OtherDerived>& other)
      : m_indices(other.indices()) {}

-    #ifndef EIGEN_PARSED_BY_DOXYGEN
-    /** Standard copy constructor. Defined only to prevent a default copy constructor
-      * from hiding the other templated constructor */
-    inline Transpositions(const Transpositions& other) : m_indices(other.indices()) {}
-    #endif
-
    /** Generic constructor from expression of the transposition indices. */
    template<typename Other>
    explicit inline Transpositions(const MatrixBase<Other>& indices) : m_indices(indices)
@@ -189,17 +172,6 @@ class Transpositions : public TranspositionsBase<Transpositions<SizeAtCompileTim
      return Base::operator=(other);
    }

-    #ifndef EIGEN_PARSED_BY_DOXYGEN
-    /** This is a special case of the templated operator=. Its purpose is to
-      * prevent a default operator= from hiding the templated operator=.
-      */
-    Transpositions& operator=(const Transpositions& other)
-    {
-      m_indices = other.m_indices;
-      return *this;
-    }
-    #endif
-
    /** Constructs an uninitialized permutation matrix of given size.
      */
    inline Transpositions(Index size) : m_indices(size)
@@ -306,17 +278,6 @@ class TranspositionsWrapper
      return Base::operator=(other);
    }

-    #ifndef EIGEN_PARSED_BY_DOXYGEN
-    /** This is a special case of the templated operator=. Its purpose is to
-      * prevent a default operator= from hiding the templated operator=.
-      */
-    TranspositionsWrapper& operator=(const TranspositionsWrapper& other)
-    {
-      m_indices = other.m_indices;
-      return *this;
-    }
-    #endif
-
    /** const version of indices(). */
    const IndicesType& indices() const { return m_indices; }

--- a/Eigen/src/Core/TriangularMatrix.h
+++ b/Eigen/src/Core/TriangularMatrix.h
@@ -11,12 +11,12 @@
 #ifndef EIGEN_TRIANGULARMATRIX_H
 #define EIGEN_TRIANGULARMATRIX_H

-namespace Eigen { 
+namespace Eigen {

 namespace internal {
-  
+
 template<int Side, typename TriangularType, typename Rhs> struct triangular_solve_retval;
-  
+
 }

 /** \class TriangularBase
@@ -34,16 +34,16 @@ template<typename Derived> class TriangularBase : public EigenBase<Derived>
      ColsAtCompileTime = internal::traits<Derived>::ColsAtCompileTime,
      MaxRowsAtCompileTime = internal::traits<Derived>::MaxRowsAtCompileTime,
      MaxColsAtCompileTime = internal::traits<Derived>::MaxColsAtCompileTime,
-      
+
      SizeAtCompileTime = (internal::size_at_compile_time<internal::traits<Derived>::RowsAtCompileTime,
                                                   internal::traits<Derived>::ColsAtCompileTime>::ret),
      /**< This is equal to the number of coefficients, i.e. the number of
          * rows times the number of columns, or to \a Dynamic if this is not
          * known at compile-time. \sa RowsAtCompileTime, ColsAtCompileTime */
-      
+
      MaxSizeAtCompileTime = (internal::size_at_compile_time<internal::traits<Derived>::MaxRowsAtCompileTime,
                                                   internal::traits<Derived>::MaxColsAtCompileTime>::ret)
-        
+
    };
    typedef typename internal::traits<Derived>::Scalar Scalar;
    typedef typename internal::traits<Derived>::StorageKind StorageKind;
@@ -63,7 +63,7 @@ template<typename Derived> class TriangularBase : public EigenBase<Derived>
    inline Index outerStride() const { return derived().outerStride(); }
    EIGEN_DEVICE_FUNC
    inline Index innerStride() const { return derived().innerStride(); }
-    
+
    // dummy resize function
    void resize(Index rows, Index cols)
    {
@@ -155,7 +155,7 @@ template<typename Derived> class TriangularBase : public EigenBase<Derived>
  * \param MatrixType the type of the object in which we are taking the triangular part
  * \param Mode the kind of triangular matrix expression to construct. Can be #Upper,
  *             #Lower, #UnitUpper, #UnitLower, #StrictlyUpper, or #StrictlyLower.
-  *             This is in fact a bit field; it must have either #Upper or #Lower, 
+  *             This is in fact a bit field; it must have either #Upper or #Lower,
  *             and additionally it may have #UnitDiag or #ZeroDiag or neither.
  *
  * This class represents a triangular part of a matrix, not necessarily square. Strictly speaking, for rectangular
@@ -197,7 +197,7 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularView
    typedef typename internal::traits<TriangularView>::MatrixTypeNestedNonRef MatrixTypeNestedNonRef;

    typedef typename internal::remove_all<typename MatrixType::ConjugateReturnType>::type MatrixConjugateReturnType;
-    
+
  public:

    typedef typename internal::traits<TriangularView>::StorageKind StorageKind;
@@ -216,7 +216,7 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularView
    EIGEN_DEVICE_FUNC
    explicit inline TriangularView(MatrixType& matrix) : m_matrix(matrix)
    {}
-    
+
    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(TriangularView)

    /** \copydoc EigenBase::rows() */
@@ -233,7 +233,7 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularView
    /** \returns a reference to the nested expression */
    EIGEN_DEVICE_FUNC
    NestedExpression& nestedExpression() { return m_matrix; }
-    
+
    typedef TriangularView<const MatrixConjugateReturnType,Mode> ConjugateReturnType;
    /** \sa MatrixBase::conjugate() const */
    EIGEN_DEVICE_FUNC
@@ -255,7 +255,7 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularView
      typename MatrixType::TransposeReturnType tmp(m_matrix);
      return TransposeReturnType(tmp);
    }
-    
+
    typedef TriangularView<const typename MatrixType::ConstTransposeReturnType,TransposeMode> ConstTransposeReturnType;
    /** \sa MatrixBase::transpose() const */
    EIGEN_DEVICE_FUNC
@@ -266,10 +266,10 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularView

    template<typename Other>
    EIGEN_DEVICE_FUNC
-    inline const Solve<TriangularView, Other> 
+    inline const Solve<TriangularView, Other>
    solve(const MatrixBase<Other>& other) const
    { return Solve<TriangularView, Other>(*this, other.derived()); }
-    
+
  // workaround MSVC ICE
  #if EIGEN_COMP_MSVC
    template<int Side, typename Other>
@@ -313,7 +313,7 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularView
      else
        return m_matrix.diagonal().prod();
    }
-      
+
  protected:

    MatrixTypeNested m_matrix;
@@ -375,7 +375,7 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_Mat
      internal::call_assignment_no_alias(derived(), other.derived(), internal::sub_assign_op<Scalar,typename Other::Scalar>());
      return derived();
    }
-    
+
    /** \sa MatrixBase::operator*=() */
    EIGEN_DEVICE_FUNC
    TriangularViewType&  operator*=(const typename internal::traits<MatrixType>::Scalar& other) { return *this = derived().nestedExpression() * other; }
@@ -556,7 +556,7 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_Mat
 // FIXME should we keep that possibility
 template<typename MatrixType, unsigned int Mode>
 template<typename OtherDerived>
-inline TriangularView<MatrixType, Mode>&
+EIGEN_DEVICE_FUNC inline TriangularView<MatrixType, Mode>&
 TriangularViewImpl<MatrixType, Mode, Dense>::operator=(const MatrixBase<OtherDerived>& other)
 {
  internal::call_assignment_no_alias(derived(), other.derived(), internal::assign_op<Scalar,typename OtherDerived::Scalar>());
@@ -566,7 +566,7 @@ TriangularViewImpl<MatrixType, Mode, Dense>::operator=(const MatrixBase<OtherDer
 // FIXME should we keep that possibility
 template<typename MatrixType, unsigned int Mode>
 template<typename OtherDerived>
-void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(const MatrixBase<OtherDerived>& other)
+EIGEN_DEVICE_FUNC void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(const MatrixBase<OtherDerived>& other)
 {
  internal::call_assignment_no_alias(derived(), other.template triangularView<Mode>());
 }
@@ -575,7 +575,7 @@ void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(const MatrixBase<Ot

 template<typename MatrixType, unsigned int Mode>
 template<typename OtherDerived>
-inline TriangularView<MatrixType, Mode>&
+EIGEN_DEVICE_FUNC inline TriangularView<MatrixType, Mode>&
 TriangularViewImpl<MatrixType, Mode, Dense>::operator=(const TriangularBase<OtherDerived>& other)
 {
  eigen_assert(Mode == int(OtherDerived::Mode));
@@ -585,7 +585,7 @@ TriangularViewImpl<MatrixType, Mode, Dense>::operator=(const TriangularBase<Othe

 template<typename MatrixType, unsigned int Mode>
 template<typename OtherDerived>
-void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(const TriangularBase<OtherDerived>& other)
+EIGEN_DEVICE_FUNC void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(const TriangularBase<OtherDerived>& other)
 {
  eigen_assert(Mode == int(OtherDerived::Mode));
  internal::call_assignment_no_alias(derived(), other.derived());
@@ -600,7 +600,7 @@ void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(const TriangularBas
  * If the matrix is triangular, the opposite part is set to zero. */
 template<typename Derived>
 template<typename DenseDerived>
-void TriangularBase<Derived>::evalTo(MatrixBase<DenseDerived> &other) const
+EIGEN_DEVICE_FUNC void TriangularBase<Derived>::evalTo(MatrixBase<DenseDerived> &other) const
 {
  evalToLazy(other.derived());
 }
@@ -626,7 +626,7 @@ void TriangularBase<Derived>::evalTo(MatrixBase<DenseDerived> &other) const
  */
 template<typename Derived>
 template<unsigned int Mode>
-typename MatrixBase<Derived>::template TriangularViewReturnType<Mode>::Type
+EIGEN_DEVICE_FUNC typename MatrixBase<Derived>::template TriangularViewReturnType<Mode>::Type
 MatrixBase<Derived>::triangularView()
 {
  return typename TriangularViewReturnType<Mode>::Type(derived());
@@ -635,7 +635,7 @@ MatrixBase<Derived>::triangularView()
 /** This is the const version of MatrixBase::triangularView() */
 template<typename Derived>
 template<unsigned int Mode>
-typename MatrixBase<Derived>::template ConstTriangularViewReturnType<Mode>::Type
+EIGEN_DEVICE_FUNC typename MatrixBase<Derived>::template ConstTriangularViewReturnType<Mode>::Type
 MatrixBase<Derived>::triangularView() const
 {
  return typename ConstTriangularViewReturnType<Mode>::Type(derived());
@@ -700,7 +700,7 @@ bool MatrixBase<Derived>::isLowerTriangular(const RealScalar& prec) const

 namespace internal {

-  
+
 // TODO currently a triangular expression has the form TriangularView<.,.>
 //      in the future triangular-ness should be defined by the expression traits
 //      such that Transpose<TriangularView<.,.> > is valid. (currently TriangularBase::transpose() is overloaded to make it work)
@@ -728,7 +728,7 @@ struct Dense2Triangular         {};

 template<typename Kernel, unsigned int Mode, int UnrollCount, bool ClearOpposite> struct triangular_assignment_loop;

- 
+
 /** \internal Specialization of the dense assignment kernel for triangular matrices.
  * The main difference is that the triangular, diagonal, and opposite parts are processed through three different functions.
  * \tparam UpLo must be either Lower or Upper
@@ -745,17 +745,17 @@ protected:
  using Base::m_src;
  using Base::m_functor;
 public:
-  
+
  typedef typename Base::DstEvaluatorType DstEvaluatorType;
  typedef typename Base::SrcEvaluatorType SrcEvaluatorType;
  typedef typename Base::Scalar Scalar;
  typedef typename Base::AssignmentTraits AssignmentTraits;
-  
-  
+
+
  EIGEN_DEVICE_FUNC triangular_dense_assignment_kernel(DstEvaluatorType &dst, const SrcEvaluatorType &src, const Functor &func, DstXprType& dstExpr)
    : Base(dst, src, func, dstExpr)
  {}
-  
+
 #ifdef EIGEN_INTERNAL_DEBUGGING
  EIGEN_DEVICE_FUNC void assignCoeff(Index row, Index col)
  {
@@ -765,16 +765,16 @@ public:
 #else
  using Base::assignCoeff;
 #endif
-  
+
  EIGEN_DEVICE_FUNC void assignDiagonalCoeff(Index id)
  {
         if(Mode==UnitDiag && SetOpposite) m_functor.assignCoeff(m_dst.coeffRef(id,id), Scalar(1));
    else if(Mode==ZeroDiag && SetOpposite) m_functor.assignCoeff(m_dst.coeffRef(id,id), Scalar(0));
    else if(Mode==0)                       Base::assignCoeff(id,id);
  }
-  
+
  EIGEN_DEVICE_FUNC void assignOppositeCoeff(Index row, Index col)
-  { 
+  {
    eigen_internal_assert(row!=col);
    if(SetOpposite)
      m_functor.assignCoeff(m_dst.coeffRef(row,col), Scalar(0));
@@ -795,17 +795,17 @@ void call_triangular_assignment_loop(DstXprType& dst, const SrcXprType& src, con
  if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
    dst.resize(dstRows, dstCols);
  DstEvaluatorType dstEvaluator(dst);
-    
+
  typedef triangular_dense_assignment_kernel< Mode&(Lower|Upper),Mode&(UnitDiag|ZeroDiag|SelfAdjoint),SetOpposite,
                                              DstEvaluatorType,SrcEvaluatorType,Functor> Kernel;
  Kernel kernel(dstEvaluator, srcEvaluator, func, dst.const_cast_derived());
-  
+
  enum {
      unroll = DstXprType::SizeAtCompileTime != Dynamic
            && SrcEvaluatorType::CoeffReadCost < HugeCost
            && DstXprType::SizeAtCompileTime * (DstEvaluatorType::CoeffReadCost+SrcEvaluatorType::CoeffReadCost) / 2 <= EIGEN_UNROLLING_LIMIT
    };
-  
+
  triangular_assignment_loop<Kernel, Mode, unroll ? int(DstXprType::SizeAtCompileTime) : Dynamic, SetOpposite>::run(kernel);
 }

@@ -827,8 +827,8 @@ struct Assignment<DstXprType, SrcXprType, Functor, Triangular2Triangular>
  EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const Functor &func)
  {
    eigen_assert(int(DstXprType::Mode) == int(SrcXprType::Mode));
-    
-    call_triangular_assignment_loop<DstXprType::Mode, false>(dst, src, func);  
+
+    call_triangular_assignment_loop<DstXprType::Mode, false>(dst, src, func);
  }
 };

@@ -837,7 +837,7 @@ struct Assignment<DstXprType, SrcXprType, Functor, Triangular2Dense>
 {
  EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const Functor &func)
  {
-    call_triangular_assignment_loop<SrcXprType::Mode, (SrcXprType::Mode&SelfAdjoint)==0>(dst, src, func);  
+    call_triangular_assignment_loop<SrcXprType::Mode, (SrcXprType::Mode&SelfAdjoint)==0>(dst, src, func);
  }
 };

@@ -846,7 +846,7 @@ struct Assignment<DstXprType, SrcXprType, Functor, Dense2Triangular>
 {
  EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const Functor &func)
  {
-    call_triangular_assignment_loop<DstXprType::Mode, false>(dst, src, func);  
+    call_triangular_assignment_loop<DstXprType::Mode, false>(dst, src, func);
  }
 };

@@ -857,19 +857,19 @@ struct triangular_assignment_loop
  // FIXME: this is not very clean, perhaps this information should be provided by the kernel?
  typedef typename Kernel::DstEvaluatorType DstEvaluatorType;
  typedef typename DstEvaluatorType::XprType DstXprType;
-  
+
  enum {
    col = (UnrollCount-1) / DstXprType::RowsAtCompileTime,
    row = (UnrollCount-1) % DstXprType::RowsAtCompileTime
  };
-  
+
  typedef typename Kernel::Scalar Scalar;

  EIGEN_DEVICE_FUNC
  static inline void run(Kernel &kernel)
  {
    triangular_assignment_loop<Kernel, Mode, UnrollCount-1, SetOpposite>::run(kernel);
-    
+
    if(row==col)
      kernel.assignDiagonalCoeff(row);
    else if( ((Mode&Lower) && row>col) || ((Mode&Upper) && row<col) )
@@ -912,10 +912,10 @@ struct triangular_assignment_loop<Kernel, Mode, Dynamic, SetOpposite>
      }
      else
        i = maxi;
-      
+
      if(i<kernel.rows()) // then i==j
        kernel.assignDiagonalCoeff(i++);
-      
+
      if (((Mode&Upper) && SetOpposite) || (Mode&Lower))
      {
        for(; i < kernel.rows(); ++i)
@@ -932,20 +932,20 @@ struct triangular_assignment_loop<Kernel, Mode, Dynamic, SetOpposite>
  * If the matrix is triangular, the opposite part is set to zero. */
 template<typename Derived>
 template<typename DenseDerived>
-void TriangularBase<Derived>::evalToLazy(MatrixBase<DenseDerived> &other) const
+EIGEN_DEVICE_FUNC void TriangularBase<Derived>::evalToLazy(MatrixBase<DenseDerived> &other) const
 {
  other.derived().resize(this->rows(), this->cols());
  internal::call_triangular_assignment_loop<Derived::Mode,(Derived::Mode&SelfAdjoint)==0 /* SetOpposite */>(other.derived(), derived().nestedExpression());
 }

 namespace internal {
-  
+
 // Triangular = Product
 template< typename DstXprType, typename Lhs, typename Rhs, typename Scalar>
 struct Assignment<DstXprType, Product<Lhs,Rhs,DefaultProduct>, internal::assign_op<Scalar,typename Product<Lhs,Rhs,DefaultProduct>::Scalar>, Dense2Triangular>
 {
  typedef Product<Lhs,Rhs,DefaultProduct> SrcXprType;
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,typename SrcXprType::Scalar> &)
+  static EIGEN_DEVICE_FUNC void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,typename SrcXprType::Scalar> &)
  {
    Index dstRows = src.rows();
    Index dstCols = src.cols();
@@ -961,7 +961,7 @@ template< typename DstXprType, typename Lhs, typename Rhs, typename Scalar>
 struct Assignment<DstXprType, Product<Lhs,Rhs,DefaultProduct>, internal::add_assign_op<Scalar,typename Product<Lhs,Rhs,DefaultProduct>::Scalar>, Dense2Triangular>
 {
  typedef Product<Lhs,Rhs,DefaultProduct> SrcXprType;
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<Scalar,typename SrcXprType::Scalar> &)
+  static EIGEN_DEVICE_FUNC void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<Scalar,typename SrcXprType::Scalar> &)
  {
    dst._assignProduct(src, 1, 1);
  }
@@ -972,7 +972,7 @@ template< typename DstXprType, typename Lhs, typename Rhs, typename Scalar>
 struct Assignment<DstXprType, Product<Lhs,Rhs,DefaultProduct>, internal::sub_assign_op<Scalar,typename Product<Lhs,Rhs,DefaultProduct>::Scalar>, Dense2Triangular>
 {
  typedef Product<Lhs,Rhs,DefaultProduct> SrcXprType;
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<Scalar,typename SrcXprType::Scalar> &)
+  static EIGEN_DEVICE_FUNC void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<Scalar,typename SrcXprType::Scalar> &)
  {
    dst._assignProduct(src, -1, 1);
  }
--- a/Eigen/src/Core/VectorwiseOp.h
+++ b/Eigen/src/Core/VectorwiseOp.h
@@ -670,7 +670,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
  * \sa rowwise(), class VectorwiseOp, \ref TutorialReductionsVisitorsBroadcasting
  */
 template<typename Derived>
-inline typename DenseBase<Derived>::ColwiseReturnType
+EIGEN_DEVICE_FUNC inline typename DenseBase<Derived>::ColwiseReturnType
 DenseBase<Derived>::colwise()
 {
  return ColwiseReturnType(derived());
@@ -684,7 +684,7 @@ DenseBase<Derived>::colwise()
  * \sa colwise(), class VectorwiseOp, \ref TutorialReductionsVisitorsBroadcasting
  */
 template<typename Derived>
-inline typename DenseBase<Derived>::RowwiseReturnType
+EIGEN_DEVICE_FUNC inline typename DenseBase<Derived>::RowwiseReturnType
 DenseBase<Derived>::rowwise()
 {
  return RowwiseReturnType(derived());
--- a/Eigen/src/Core/arch/AltiVec/Complex.h
+++ b/Eigen/src/Core/arch/AltiVec/Complex.h
@@ -15,7 +15,9 @@ namespace Eigen {

 namespace internal {

-static Packet4ui  p4ui_CONJ_XOR = vec_mergeh((Packet4ui)p4i_ZERO, (Packet4ui)p4f_MZERO);//{ 0x00000000, 0x80000000, 0x00000000, 0x80000000 };
+inline Packet4ui  p4ui_CONJ_XOR() {
+  return vec_mergeh((Packet4ui)p4i_ZERO, (Packet4ui)p4f_MZERO);//{ 0x00000000, 0x80000000, 0x00000000, 0x80000000 };
+}
 #ifdef __VSX__
 #if defined(_BIG_ENDIAN)
 static Packet2ul  p2ul_CONJ_XOR1 = (Packet2ul) vec_sld((Packet4ui) p2d_MZERO, (Packet4ui) p2l_ZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 };
@@ -29,8 +31,54 @@ static Packet2ul  p2ul_CONJ_XOR2 = (Packet2ul) vec_sld((Packet4ui) p2d_MZERO, (P
 //---------- float ----------
 struct Packet2cf
 {
-  EIGEN_STRONG_INLINE explicit Packet2cf() : v(p4f_ZERO) {}
+  EIGEN_STRONG_INLINE explicit Packet2cf() {}
  EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {}
+
+  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b)
+  {
+    Packet4f v1, v2;
+
+    // Permute and multiply the real parts of a and b
+    v1 = vec_perm(a.v, a.v, p16uc_PSET32_WODD);
+    // Get the imaginary parts of a
+    v2 = vec_perm(a.v, a.v, p16uc_PSET32_WEVEN);
+    // multiply a_re * b
+    v1 = vec_madd(v1, b.v, p4f_ZERO);
+    // multiply a_im * b and get the conjugate result
+    v2 = vec_madd(v2, b.v, p4f_ZERO);
+    v2 = reinterpret_cast<Packet4f>(pxor(v2, reinterpret_cast<Packet4f>(p4ui_CONJ_XOR())));
+    // permute back to a proper order
+    v2 = vec_perm(v2, v2, p16uc_COMPLEX32_REV);
+
+    return Packet2cf(padd<Packet4f>(v1, v2));
+  }
+
+  EIGEN_STRONG_INLINE Packet2cf& operator*=(const Packet2cf& b) {
+    v = pmul(Packet2cf(*this), b).v;
+    return *this;
+  }
+  EIGEN_STRONG_INLINE Packet2cf operator*(const Packet2cf& b) const {
+    return Packet2cf(*this) *= b;
+  }
+
+  EIGEN_STRONG_INLINE Packet2cf& operator+=(const Packet2cf& b) {
+    v = padd(v, b.v);
+    return *this;
+  }
+  EIGEN_STRONG_INLINE Packet2cf operator+(const Packet2cf& b) const {
+    return Packet2cf(*this) += b;
+  }
+  EIGEN_STRONG_INLINE Packet2cf& operator-=(const Packet2cf& b) {
+    v = psub(v, b.v);
+    return *this;
+  }
+  EIGEN_STRONG_INLINE Packet2cf operator-(const Packet2cf& b) const {
+    return Packet2cf(*this) -= b;
+  }
+  EIGEN_STRONG_INLINE Packet2cf operator-(void) const {
+    return Packet2cf(-v);
+  }
+
  Packet4f  v;
 };

@@ -82,14 +130,14 @@ template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<f

 template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, Index stride)
 {
-  std::complex<float> EIGEN_ALIGN16 af[2];
+  EIGEN_ALIGN16 std::complex<float> af[2];
  af[0] = from[0*stride];
  af[1] = from[1*stride];
  return pload<Packet2cf>(af);
 }
 template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, Index stride)
 {
-  std::complex<float> EIGEN_ALIGN16 af[2];
+  EIGEN_ALIGN16 std::complex<float> af[2];
  pstore<std::complex<float> >((std::complex<float> *) af, from);
  to[0*stride] = af[0];
  to[1*stride] = af[1];
@@ -98,26 +146,7 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf
 template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(a.v + b.v); }
 template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(a.v - b.v); }
 template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate(a.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { return Packet2cf(pxor<Packet4f>(a.v, reinterpret_cast<Packet4f>(p4ui_CONJ_XOR))); }
-
-template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
-{
-  Packet4f v1, v2;
-
-  // Permute and multiply the real parts of a and b
-  v1 = vec_perm(a.v, a.v, p16uc_PSET32_WODD);
-  // Get the imaginary parts of a
-  v2 = vec_perm(a.v, a.v, p16uc_PSET32_WEVEN);
-  // multiply a_re * b 
-  v1 = vec_madd(v1, b.v, p4f_ZERO);
-  // multiply a_im * b and get the conjugate result
-  v2 = vec_madd(v2, b.v, p4f_ZERO);
-  v2 = reinterpret_cast<Packet4f>(pxor(v2, reinterpret_cast<Packet4f>(p4ui_CONJ_XOR)));
-  // permute back to a proper order
-  v2 = vec_perm(v2, v2, p16uc_COMPLEX32_REV);
-  
-  return Packet2cf(padd<Packet4f>(v1, v2));
-}
+template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { return Packet2cf(pxor<Packet4f>(a.v, reinterpret_cast<Packet4f>(p4ui_CONJ_XOR()))); }

 template<> EIGEN_STRONG_INLINE Packet2cf pand   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pand<Packet4f>(a.v, b.v)); }
 template<> EIGEN_STRONG_INLINE Packet2cf por    <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(por<Packet4f>(a.v, b.v)); }
@@ -128,7 +157,7 @@ template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::co

 template<> EIGEN_STRONG_INLINE std::complex<float>  pfirst<Packet2cf>(const Packet2cf& a)
 {
-  std::complex<float> EIGEN_ALIGN16 res[2];
+  EIGEN_ALIGN16 std::complex<float> res[2];
  pstore((float *)&res, a.v);

  return res[0];
@@ -152,7 +181,7 @@ template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packe
 template<> EIGEN_STRONG_INLINE Packet2cf preduxp<Packet2cf>(const Packet2cf* vecs)
 {
  Packet4f b1, b2;
-#ifdef _BIG_ENDIAN  
+#ifdef _BIG_ENDIAN
  b1 = vec_sld(vecs[0].v, vecs[1].v, 8);
  b2 = vec_sld(vecs[1].v, vecs[0].v, 8);
 #else
@@ -260,6 +289,51 @@ struct Packet1cd
 {
  EIGEN_STRONG_INLINE Packet1cd() {}
  EIGEN_STRONG_INLINE explicit Packet1cd(const Packet2d& a) : v(a) {}
+
+  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b)
+  {
+    Packet2d a_re, a_im, v1, v2;
+
+    // Permute and multiply the real parts of a and b
+    a_re = vec_perm(a.v, a.v, p16uc_PSET64_HI);
+    // Get the imaginary parts of a
+    a_im = vec_perm(a.v, a.v, p16uc_PSET64_LO);
+    // multiply a_re * b
+    v1 = vec_madd(a_re, b.v, p2d_ZERO);
+    // multiply a_im * b and get the conjugate result
+    v2 = vec_madd(a_im, b.v, p2d_ZERO);
+    v2 = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(v2), reinterpret_cast<Packet4ui>(v2), 8));
+    v2 = pxor(v2, reinterpret_cast<Packet2d>(p2ul_CONJ_XOR1));
+
+    return Packet1cd(padd<Packet2d>(v1, v2));
+  }
+
+  EIGEN_STRONG_INLINE Packet1cd& operator*=(const Packet1cd& b) {
+    v = pmul(Packet1cd(*this), b).v;
+    return *this;
+  }
+  EIGEN_STRONG_INLINE Packet1cd operator*(const Packet1cd& b) const {
+    return Packet1cd(*this) *= b;
+  }
+
+  EIGEN_STRONG_INLINE Packet1cd& operator+=(const Packet1cd& b) {
+    v = padd(v, b.v);
+    return *this;
+  }
+  EIGEN_STRONG_INLINE Packet1cd operator+(const Packet1cd& b) const {
+    return Packet1cd(*this) += b;
+  }
+  EIGEN_STRONG_INLINE Packet1cd& operator-=(const Packet1cd& b) {
+    v = psub(v, b.v);
+    return *this;
+  }
+  EIGEN_STRONG_INLINE Packet1cd operator-(const Packet1cd& b) const {
+    return Packet1cd(*this) -= b;
+  }
+  EIGEN_STRONG_INLINE Packet1cd operator-(void) const {
+    return Packet1cd(-v);
+  }
+
  Packet2d v;
 };

@@ -296,19 +370,13 @@ template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<
 template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>&  from)
 { /* here we really have to use unaligned loads :( */ return ploadu<Packet1cd>(&from); }

-template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index stride)
+template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index)
 {
-  std::complex<double> EIGEN_ALIGN16 af[2];
-  af[0] = from[0*stride];
-  af[1] = from[1*stride];
-  return pload<Packet1cd>(af);
+  return pload<Packet1cd>(from);
 }
-template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from, Index stride)
+template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from, Index)
 {
-  std::complex<double> EIGEN_ALIGN16 af[2];
-  pstore<std::complex<double> >(af, from);
-  to[0*stride] = af[0];
-  to[1*stride] = af[1];
+  pstore<std::complex<double> >(to, from);
 }

 template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v + b.v); }
@@ -316,24 +384,6 @@ template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, con
 template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(Packet2d(a.v))); }
 template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { return Packet1cd(pxor(a.v, reinterpret_cast<Packet2d>(p2ul_CONJ_XOR2))); }

-template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
-{
-  Packet2d a_re, a_im, v1, v2;
-
-  // Permute and multiply the real parts of a and b
-  a_re = vec_perm(a.v, a.v, p16uc_PSET64_HI);
-  // Get the imaginary parts of a
-  a_im = vec_perm(a.v, a.v, p16uc_PSET64_LO);
-  // multiply a_re * b
-  v1 = vec_madd(a_re, b.v, p2d_ZERO);
-  // multiply a_im * b and get the conjugate result
-  v2 = vec_madd(a_im, b.v, p2d_ZERO);
-  v2 = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(v2), reinterpret_cast<Packet4ui>(v2), 8));
-  v2 = pxor(v2, reinterpret_cast<Packet2d>(p2ul_CONJ_XOR1));
-
-  return Packet1cd(padd<Packet2d>(v1, v2));
-}
-
 template<> EIGEN_STRONG_INLINE Packet1cd pand   <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(pand(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet1cd por    <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(por(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet1cd pxor   <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(pxor(a.v,b.v)); }
@@ -345,7 +395,7 @@ template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::c

 template<> EIGEN_STRONG_INLINE std::complex<double>  pfirst<Packet1cd>(const Packet1cd& a)
 {
-  std::complex<double> EIGEN_ALIGN16 res[2];
+  EIGEN_ALIGN16 std::complex<double> res[2];
  pstore<std::complex<double> >(res, a);

  return res[0];
--- a/Eigen/src/Core/arch/AltiVec/PacketMath.h
+++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h
@@ -22,10 +22,6 @@ namespace internal {
 #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
 #endif

-#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
-#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
-#endif
-
 // NOTE Altivec has 32 registers, but Eigen only accepts a value of 8 or 16
 #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
 #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS  32
@@ -40,9 +36,8 @@ typedef __vector unsigned char  Packet16uc;

 // We don't want to write the same code all the time, but we need to reuse the constants
 // and it doesn't really work to declare them global, so we define macros instead
-
 #define _EIGEN_DECLARE_CONST_FAST_Packet4f(NAME,X) \
-  Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(vec_splat_s32(X))
+  Packet4f p4f_##NAME = {X, X, X, X}

 #define _EIGEN_DECLARE_CONST_FAST_Packet4i(NAME,X) \
  Packet4i p4i_##NAME = vec_splat_s32(X)
@@ -64,7 +59,7 @@ typedef __vector unsigned char  Packet16uc;

 #define DST_CHAN 1
 #define DST_CTRL(size, count, stride) (((size) << 24) | ((count) << 16) | (stride))
-
+#define __UNPACK_TYPE__(PACKETNAME) typename unpacket_traits<PACKETNAME>::type

 // These constants are endian-agnostic
 static _EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0); //{ 0.0, 0.0, 0.0, 0.0}
@@ -77,21 +72,15 @@ static Packet4f p4f_MZERO = (Packet4f) vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)
 static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0); //{ 1.0, 1.0, 1.0, 1.0}
 #endif

+static Packet4ui p4ui_SIGN = {0x80000000u, 0x80000000u, 0x80000000u, 0x80000000u};
+static Packet4ui p4ui_PREV0DOT5 = {0x3EFFFFFFu, 0x3EFFFFFFu, 0x3EFFFFFFu, 0x3EFFFFFFu};
+
 static Packet4f p4f_COUNTDOWN = { 0.0, 1.0, 2.0, 3.0 };
 static Packet4i p4i_COUNTDOWN = { 0, 1, 2, 3 };

 static Packet16uc p16uc_REVERSE32 = { 12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3 };
 static Packet16uc p16uc_DUPLICATE32_HI = { 0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7 };

-// Mask alignment
-#ifdef __PPC64__
-#define _EIGEN_MASK_ALIGNMENT	0xfffffffffffffff0
-#else
-#define _EIGEN_MASK_ALIGNMENT	0xfffffff0
-#endif
-
-#define _EIGEN_ALIGNED_PTR(x)	((std::ptrdiff_t)(x) & _EIGEN_MASK_ALIGNMENT)
-
 // Handle endianness properly while loading constants
 // Define global static constants:
 #ifdef _BIG_ENDIAN
@@ -235,112 +224,127 @@ inline std::ostream & operator <<(std::ostream & s, const Packet4ui & v)
  return s;
 }

+template <typename Packet, typename Scalar>
+EIGEN_STRONG_INLINE Packet pload_common(const Scalar* from)
+{
+  // some versions of GCC throw "unused-but-set-parameter".
+  // ignoring these warnings for now.
+  EIGEN_UNUSED_VARIABLE(from);
+  EIGEN_DEBUG_ALIGNED_LOAD
+  return vec_ld(0, from);
+}
+
 // Need to define them first or we get specialization after instantiation errors
 template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from)
 {
-  EIGEN_DEBUG_ALIGNED_LOAD
-#ifdef __VSX__
-  return vec_vsx_ld(0, from);
-#else
-  return vec_ld(0, from);
-#endif
+  return pload_common<Packet4f, float>(from);
 }

 template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int*     from)
 {
-  EIGEN_DEBUG_ALIGNED_LOAD
+  return pload_common<Packet4i, int>(from);
+}
+
+template <typename Packet>
+EIGEN_STRONG_INLINE void pstore_common(__UNPACK_TYPE__(Packet)* to, const Packet& from){
+  // some versions of GCC throw "unused-but-set-parameter" (float *to).
+  // ignoring these warnings for now.
+  EIGEN_UNUSED_VARIABLE(to);
+  EIGEN_DEBUG_ALIGNED_STORE
 #ifdef __VSX__
-  return vec_vsx_ld(0, from);
+  vec_xst(from, 0, to);
 #else
-  return vec_ld(0, from);
+  vec_st(from, 0, to);
 #endif
 }

 template<> EIGEN_STRONG_INLINE void pstore<float>(float*   to, const Packet4f& from)
 {
-  EIGEN_DEBUG_ALIGNED_STORE
-#ifdef __VSX__
-  vec_vsx_st(from, 0, to);
-#else
-  vec_st(from, 0, to);
-#endif
+  pstore_common<Packet4f>(to, from);
 }

 template<> EIGEN_STRONG_INLINE void pstore<int>(int*       to, const Packet4i& from)
 {
-  EIGEN_DEBUG_ALIGNED_STORE
-#ifdef __VSX__
-  vec_vsx_st(from, 0, to);
-#else
-  vec_st(from, 0, to);
-#endif
+  pstore_common<Packet4i>(to, from);
+}
+
+template<typename Packet>
+EIGEN_STRONG_INLINE Packet pset1_size4(const __UNPACK_TYPE__(Packet)& from)
+{
+  Packet v = {from, from, from, from};
+  return v;
 }

 template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float&  from) {
-  Packet4f v = {from, from, from, from};
-  return v;
+  return pset1_size4<Packet4f>(from);
 }

 template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int&    from)   {
-  Packet4i v = {from, from, from, from};
-  return v;
+  return pset1_size4<Packet4i>(from);
 }
-template<> EIGEN_STRONG_INLINE void
-pbroadcast4<Packet4f>(const float *a,
-                      Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
+
+template<typename Packet> EIGEN_STRONG_INLINE void
+pbroadcast4_common(const __UNPACK_TYPE__(Packet) *a,
+                      Packet& a0, Packet& a1, Packet& a2, Packet& a3)
 {
-  a3 = pload<Packet4f>(a);
+  a3 = pload<Packet>(a);
  a0 = vec_splat(a3, 0);
  a1 = vec_splat(a3, 1);
  a2 = vec_splat(a3, 2);
  a3 = vec_splat(a3, 3);
 }
+
+template<> EIGEN_STRONG_INLINE void
+pbroadcast4<Packet4f>(const float *a,
+                      Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
+{
+  pbroadcast4_common<Packet4f>(a, a0, a1, a2, a3);
+}
 template<> EIGEN_STRONG_INLINE void
 pbroadcast4<Packet4i>(const int *a,
                      Packet4i& a0, Packet4i& a1, Packet4i& a2, Packet4i& a3)
 {
-  a3 = pload<Packet4i>(a);
-  a0 = vec_splat(a3, 0);
-  a1 = vec_splat(a3, 1);
-  a2 = vec_splat(a3, 2);
-  a3 = vec_splat(a3, 3);
+  pbroadcast4_common<Packet4i>(a, a0, a1, a2, a3);
+}
+
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pgather_common(const __UNPACK_TYPE__(Packet)* from, Index stride)
+{
+  EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[4];
+  a[0] = from[0*stride];
+  a[1] = from[1*stride];
+  a[2] = from[2*stride];
+  a[3] = from[3*stride];
+  return pload<Packet>(a);
 }

 template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
 {
-  float EIGEN_ALIGN16 af[4];
-  af[0] = from[0*stride];
-  af[1] = from[1*stride];
-  af[2] = from[2*stride];
-  af[3] = from[3*stride];
- return pload<Packet4f>(af);
+  return pgather_common<Packet4f>(from, stride);
 }
+
 template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride)
 {
-  int EIGEN_ALIGN16 ai[4];
-  ai[0] = from[0*stride];
-  ai[1] = from[1*stride];
-  ai[2] = from[2*stride];
-  ai[3] = from[3*stride];
- return pload<Packet4i>(ai);
+  return pgather_common<Packet4i>(from, stride);
 }
+
+template<typename Packet> EIGEN_DEVICE_FUNC inline void pscatter_size4(__UNPACK_TYPE__(Packet)* to, const Packet& from, Index stride)
+{
+  EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[4];
+  pstore<__UNPACK_TYPE__(Packet)>(a, from);
+  to[0*stride] = a[0];
+  to[1*stride] = a[1];
+  to[2*stride] = a[2];
+  to[3*stride] = a[3];
+}
+
 template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
 {
-  float EIGEN_ALIGN16 af[4];
-  pstore<float>(af, from);
-  to[0*stride] = af[0];
-  to[1*stride] = af[1];
-  to[2*stride] = af[2];
-  to[3*stride] = af[3];
+  pscatter_size4<Packet4f>(to, from, stride);
 }
+
 template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride)
 {
-  int EIGEN_ALIGN16 ai[4];
-  pstore<int>((int *)ai, from);
-  to[0*stride] = ai[0];
-  to[1*stride] = ai[1];
-  to[2*stride] = ai[2];
-  to[3*stride] = ai[3];
+  pscatter_size4<Packet4i>(to, from, stride);
 }

 template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) { return pset1<Packet4f>(a) + p4f_COUNTDOWN; }
@@ -424,66 +428,67 @@ template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const
 template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, vec_nor(b, b)); }
 template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, vec_nor(b, b)); }

-template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) { return vec_round(a); }
+template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a)
+{
+    Packet4f t = vec_add(reinterpret_cast<Packet4f>(vec_or(vec_and(reinterpret_cast<Packet4ui>(a), p4ui_SIGN), p4ui_PREV0DOT5)), a);
+    Packet4f res;
+
+#ifdef __VSX__
+    __asm__("xvrspiz %x0, %x1\n\t"
+        : "=&wa" (res)
+        : "wa" (t));
+#else
+    __asm__("vrfiz %0, %1\n\t"
+        : "=v" (res)
+        : "v" (t));
+#endif
+
+    return res;
+}
 template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const  Packet4f& a) { return vec_ceil(a); }
 template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) { return vec_floor(a); }

-#ifdef _BIG_ENDIAN
+template<typename Packet> EIGEN_STRONG_INLINE Packet ploadu_common(const __UNPACK_TYPE__(Packet)* from)
+{
+  EIGEN_DEBUG_UNALIGNED_LOAD
+  Packet16uc mask = vec_lvsl(0, from);                 // create the permute mask
+  Packet16uc MSQ = vec_ld(0, (unsigned char *)from);   // most significant quadword
+  Packet16uc LSQ = vec_ld(15, (unsigned char *)from);  // least significant quadword
+  //TODO: Add static_cast here
+  return (Packet) vec_perm(MSQ, LSQ, mask);            // align the data
+}
+
 template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
 {
-  EIGEN_DEBUG_ALIGNED_LOAD
-  Packet16uc MSQ, LSQ;
-  Packet16uc mask;
-  MSQ = vec_ld(0, (unsigned char *)from);          // most significant quadword
-  LSQ = vec_ld(15, (unsigned char *)from);         // least significant quadword
-  mask = vec_lvsl(0, from);                        // create the permute mask
-  return (Packet4f) vec_perm(MSQ, LSQ, mask);           // align the data
-
+  return ploadu_common<Packet4f>(from);
 }
 template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
 {
-  EIGEN_DEBUG_ALIGNED_LOAD
-  // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
-  Packet16uc MSQ, LSQ;
-  Packet16uc mask;
-  MSQ = vec_ld(0, (unsigned char *)from);          // most significant quadword
-  LSQ = vec_ld(15, (unsigned char *)from);         // least significant quadword
-  mask = vec_lvsl(0, from);                        // create the permute mask
-  return (Packet4i) vec_perm(MSQ, LSQ, mask);    // align the data
+  return ploadu_common<Packet4i>(from);
 }
-#else
-// We also need ot redefine little endian loading of Packet4i/Packet4f using VSX
-template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
-{
-  EIGEN_DEBUG_UNALIGNED_LOAD
-  return (Packet4i) vec_vsx_ld((long)from & 15, (const int*) _EIGEN_ALIGNED_PTR(from));
-}
-template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
-{
-  EIGEN_DEBUG_UNALIGNED_LOAD
-  return (Packet4f) vec_vsx_ld((long)from & 15, (const float*) _EIGEN_ALIGNED_PTR(from));
-}
-#endif

+template<typename Packet> EIGEN_STRONG_INLINE Packet ploaddup_common(const __UNPACK_TYPE__(Packet)*   from)
+{
+  Packet p;
+  if((std::ptrdiff_t(from) % 16) == 0)  p = pload<Packet>(from);
+  else                                  p = ploadu<Packet>(from);
+  return vec_perm(p, p, p16uc_DUPLICATE32_HI);
+}
 template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float*   from)
 {
-  Packet4f p;
-  if((std::ptrdiff_t(from) % 16) == 0)  p = pload<Packet4f>(from);
-  else                                  p = ploadu<Packet4f>(from);
-  return vec_perm(p, p, p16uc_DUPLICATE32_HI);
+  return ploaddup_common<Packet4f>(from);
 }
 template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int*     from)
 {
-  Packet4i p;
-  if((std::ptrdiff_t(from) % 16) == 0)  p = pload<Packet4i>(from);
-  else                                  p = ploadu<Packet4i>(from);
-  return vec_perm(p, p, p16uc_DUPLICATE32_HI);
+  return ploaddup_common<Packet4i>(from);
 }

-#ifdef _BIG_ENDIAN
-template<> EIGEN_STRONG_INLINE void pstoreu<float>(float*  to, const Packet4f& from)
+template<typename Packet> EIGEN_STRONG_INLINE void pstoreu_common(__UNPACK_TYPE__(Packet)*  to, const Packet& from)
 {
  EIGEN_DEBUG_UNALIGNED_STORE
+#ifdef __VSX__
+  vec_xst(from, 0, to);
+#else
  // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
  // Warning: not thread safe!
  Packet16uc MSQ, LSQ, edges;
@@ -497,45 +502,23 @@ template<> EIGEN_STRONG_INLINE void pstoreu<float>(float*  to, const Packet4f& f
  MSQ = vec_perm(edges,(Packet16uc)from,align);             // misalign the data (MSQ)
  LSQ = vec_perm((Packet16uc)from,edges,align);             // misalign the data (LSQ)
  vec_st( LSQ, 15, (unsigned char *)to );                   // Store the LSQ part first
-  vec_st( MSQ, 0, (unsigned char *)to );                    // Store the MSQ part
+  vec_st( MSQ, 0, (unsigned char *)to );                   // Store the MSQ part second
+#endif
+}
+template<> EIGEN_STRONG_INLINE void pstoreu<float>(float*  to, const Packet4f& from)
+{
+  pstoreu_common<Packet4f>(to, from);
 }
 template<> EIGEN_STRONG_INLINE void pstoreu<int>(int*      to, const Packet4i& from)
 {
-  EIGEN_DEBUG_UNALIGNED_STORE
-  // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
-  // Warning: not thread safe!
-  Packet16uc MSQ, LSQ, edges;
-  Packet16uc edgeAlign, align;
-
-  MSQ = vec_ld(0, (unsigned char *)to);                     // most significant quadword
-  LSQ = vec_ld(15, (unsigned char *)to);                    // least significant quadword
-  edgeAlign = vec_lvsl(0, to);                              // permute map to extract edges
-  edges=vec_perm(LSQ, MSQ, edgeAlign);                      // extract the edges
-  align = vec_lvsr( 0, to );                                // permute map to misalign data
-  MSQ = vec_perm(edges, (Packet16uc) from, align);          // misalign the data (MSQ)
-  LSQ = vec_perm((Packet16uc) from, edges, align);          // misalign the data (LSQ)
-  vec_st( LSQ, 15, (unsigned char *)to );                   // Store the LSQ part first
-  vec_st( MSQ, 0, (unsigned char *)to );                    // Store the MSQ part
+  pstoreu_common<Packet4i>(to, from);
 }
-#else
-// We also need ot redefine little endian loading of Packet4i/Packet4f using VSX
-template<> EIGEN_STRONG_INLINE void pstoreu<int>(int*       to, const Packet4i& from)
-{
-  EIGEN_DEBUG_ALIGNED_STORE
-  vec_vsx_st(from, (long)to & 15, (int*) _EIGEN_ALIGNED_PTR(to));
-}
-template<> EIGEN_STRONG_INLINE void pstoreu<float>(float*   to, const Packet4f& from)
-{
-  EIGEN_DEBUG_ALIGNED_STORE
-  vec_vsx_st(from, (long)to & 15, (float*) _EIGEN_ALIGNED_PTR(to));
-}
-#endif

 template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr)    { EIGEN_PPC_PREFETCH(addr); }
 template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*     addr)    { EIGEN_PPC_PREFETCH(addr); }

-template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x; vec_ste(a, 0, &x); return x; }
-template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { int   EIGEN_ALIGN16 x; vec_ste(a, 0, &x); return x; }
+template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { EIGEN_ALIGN16 float x; vec_ste(a, 0, &x); return x; }
+template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { EIGEN_ALIGN16 int   x; vec_ste(a, 0, &x); return x; }

 template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
 {
@@ -643,37 +626,42 @@ template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
 }

 // min
-template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
+template<typename Packet> EIGEN_STRONG_INLINE
+__UNPACK_TYPE__(Packet) predux_min4(const Packet& a)
 {
-  Packet4f b, res;
+  Packet b, res;
  b = vec_min(a, vec_sld(a, a, 8));
  res = vec_min(b, vec_sld(b, b, 4));
  return pfirst(res);
 }

+
+template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
+{
+  return predux_min4<Packet4f>(a);
+}
+
 template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
 {
-  Packet4i b, res;
-  b = vec_min(a, vec_sld(a, a, 8));
-  res = vec_min(b, vec_sld(b, b, 4));
-  return pfirst(res);
+  return predux_min4<Packet4i>(a);
 }
-
 // max
-template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
+template<typename Packet> EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_max4(const Packet& a)
 {
-  Packet4f b, res;
+  Packet b, res;
  b = vec_max(a, vec_sld(a, a, 8));
  res = vec_max(b, vec_sld(b, b, 4));
  return pfirst(res);
 }

+template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
+{
+  return predux_max4<Packet4f>(a);
+}
+
 template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
 {
-  Packet4i b, res;
-  b = vec_max(a, vec_sld(a, a, 8));
-  res = vec_max(b, vec_sld(b, b, 4));
-  return pfirst(res);
+  return predux_max4<Packet4i>(a);
 }

 template<int Offset>
@@ -730,9 +718,9 @@ struct palign_impl<Offset,Packet4i>
  }
 };

-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet4f,4>& kernel) {
-  Packet4f t0, t1, t2, t3;
+template <typename T>
+EIGEN_DEVICE_FUNC inline void ptranpose_common(PacketBlock<T, 4>& kernel) {
+  T t0, t1, t2, t3;
  t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
  t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
  t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
@@ -743,29 +731,23 @@ ptranspose(PacketBlock<Packet4f,4>& kernel) {
  kernel.packet[3] = vec_mergel(t1, t3);
 }

-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet4i,4>& kernel) {
-  Packet4i t0, t1, t2, t3;
-  t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
-  t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
-  t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
-  t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
-  kernel.packet[0] = vec_mergeh(t0, t2);
-  kernel.packet[1] = vec_mergel(t0, t2);
-  kernel.packet[2] = vec_mergeh(t1, t3);
-  kernel.packet[3] = vec_mergel(t1, t3);
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4f, 4>& kernel) { ptranpose_common<Packet4f>(kernel); }
+
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4i, 4>& kernel) { ptranpose_common<Packet4i>(kernel); }
+
+template<typename Packet> EIGEN_STRONG_INLINE
+Packet pblend4(const Selector<4>& ifPacket, const Packet& thenPacket, const Packet& elsePacket) {
+  Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] };
+  Packet4ui mask = reinterpret_cast<Packet4ui>(vec_cmpeq(reinterpret_cast<Packet4ui>(select), reinterpret_cast<Packet4ui>(p4i_ONE)));
+  return vec_sel(elsePacket, thenPacket, mask);
 }

 template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) {
-  Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] };
-  Packet4ui mask = reinterpret_cast<Packet4ui>(vec_cmpeq(reinterpret_cast<Packet4ui>(select), reinterpret_cast<Packet4ui>(p4i_ONE)));
-  return vec_sel(elsePacket, thenPacket, mask);
+  return pblend4<Packet4i>(ifPacket, thenPacket, elsePacket);
 }

 template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) {
-  Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] };
-  Packet4ui mask = reinterpret_cast<Packet4ui>(vec_cmpeq(reinterpret_cast<Packet4ui>(select), reinterpret_cast<Packet4ui>(p4i_ONE)));
-  return vec_sel(elsePacket, thenPacket, mask);
+  return pblend4<Packet4f>(ifPacket, thenPacket, elsePacket);
 }


@@ -785,6 +767,8 @@ static Packet2l  p2l_ZERO = reinterpret_cast<Packet2l>(p4i_ZERO);
 static Packet2d  p2d_ONE  = { 1.0, 1.0 };
 static Packet2d  p2d_ZERO = reinterpret_cast<Packet2d>(p4f_ZERO);
 static Packet2d  p2d_MZERO = { -0.0, -0.0 };
+static Packet2ul p2ul_SIGN = {0x8000000000000000ull, 0x8000000000000000ull};
+static Packet2ul p2ul_PREV0DOT5 = {0x3FDFFFFFFFFFFFFFull, 0x3FDFFFFFFFFFFFFFull};

 #ifdef _BIG_ENDIAN
 static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(p2d_ZERO), reinterpret_cast<Packet4f>(p2d_ONE), 8));
@@ -792,16 +776,9 @@ static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(vec_sld(reinterpret_c
 static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(p2d_ONE), reinterpret_cast<Packet4f>(p2d_ZERO), 8));
 #endif

-template<int index> Packet2d vec_splat_dbl(Packet2d& a);
-
-template<> EIGEN_STRONG_INLINE Packet2d vec_splat_dbl<0>(Packet2d& a)
+template<int index> Packet2d vec_splat_dbl(Packet2d& a)
 {
-  return reinterpret_cast<Packet2d>(vec_perm(a, a, p16uc_PSET64_HI));
-}
-
-template<> EIGEN_STRONG_INLINE Packet2d vec_splat_dbl<1>(Packet2d& a)
-{
-  return reinterpret_cast<Packet2d>(vec_perm(a, a, p16uc_PSET64_LO));
+  return vec_splat(a, index);
 }

 template<> struct packet_traits<double> : default_packet_traits
@@ -826,7 +803,11 @@ template<> struct packet_traits<double> : default_packet_traits
    HasLog  = 0,
    HasExp  = 1,
    HasSqrt = 1,
+#if !EIGEN_COMP_CLANG
    HasRsqrt = 1,
+#else
+    HasRsqrt = 0,
+#endif
    HasRound = 1,
    HasFloor = 1,
    HasCeil = 1,
@@ -863,21 +844,13 @@ inline std::ostream & operator <<(std::ostream & s, const Packet2d & v)
 template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from)
 {
  EIGEN_DEBUG_ALIGNED_LOAD
-#ifdef __VSX__
-  return vec_vsx_ld(0, from);
-#else
-  return vec_ld(0, from);
-#endif
+  return vec_xl(0, const_cast<double *>(from)); // cast needed by Clang
 }

 template<> EIGEN_STRONG_INLINE void pstore<double>(double*   to, const Packet2d& from)
 {
  EIGEN_DEBUG_ALIGNED_STORE
-#ifdef __VSX__
-  vec_vsx_st(from, 0, to);
-#else
-  vec_st(from, 0, to);
-#endif
+  vec_xst(from, 0, to);
 }

 template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double&  from) {
@@ -889,24 +862,23 @@ template<> EIGEN_STRONG_INLINE void
 pbroadcast4<Packet2d>(const double *a,
                      Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3)
 {
-  a1 = pload<Packet2d>(a);
-  a0 = vec_splat_dbl<0>(a1);
-  a1 = vec_splat_dbl<1>(a1);
-  a3 = pload<Packet2d>(a+2);
-  a2 = vec_splat_dbl<0>(a3);
-  a3 = vec_splat_dbl<1>(a3);
+  //This way is faster than vec_splat (at least for doubles in Power 9)
+  a0 = pset1<Packet2d>(a[0]);
+  a1 = pset1<Packet2d>(a[1]);
+  a2 = pset1<Packet2d>(a[2]);
+  a3 = pset1<Packet2d>(a[3]);
 }

 template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride)
 {
-  double EIGEN_ALIGN16 af[2];
+  EIGEN_ALIGN16 double af[2];
  af[0] = from[0*stride];
  af[1] = from[1*stride];
 return pload<Packet2d>(af);
 }
 template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
 {
-  double EIGEN_ALIGN16 af[2];
+  EIGEN_ALIGN16 double af[2];
  pstore<double>(af, from);
  to[0*stride] = af[0];
  to[1*stride] = af[1];
@@ -918,7 +890,14 @@ template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const

 template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return a - b; }

-template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return p2d_ZERO - a; }
+template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a)
+{
+#ifdef __POWER8_VECTOR__
+  return vec_neg(a);
+#else
+  return vec_xor(a, p2d_MZERO);
+#endif
+}

 template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }

@@ -950,14 +929,24 @@ template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const

 template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, vec_nor(b, b)); }

-template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) { return vec_round(a); }
+template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a)
+{
+    Packet2d t = vec_add(reinterpret_cast<Packet2d>(vec_or(vec_and(reinterpret_cast<Packet2ul>(a), p2ul_SIGN), p2ul_PREV0DOT5)), a);
+    Packet2d res;
+
+    __asm__("xvrdpiz %x0, %x1\n\t"
+        : "=&wa" (res)
+        : "wa" (t));
+
+    return res;
+}
 template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const  Packet2d& a) { return vec_ceil(a); }
 template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { return vec_floor(a); }

 template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from)
 {
-  EIGEN_DEBUG_ALIGNED_LOAD
-  return (Packet2d) vec_vsx_ld((long)from & 15, (const double*) _EIGEN_ALIGNED_PTR(from));
+  EIGEN_DEBUG_UNALIGNED_LOAD
+  return vec_xl(0, const_cast<double*>(from));
 }

 template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double*   from)
@@ -970,13 +959,13 @@ template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double*   from)

 template<> EIGEN_STRONG_INLINE void pstoreu<double>(double*  to, const Packet2d& from)
 {
-  EIGEN_DEBUG_ALIGNED_STORE
-  vec_vsx_st((Packet4f)from, (long)to & 15, (float*) _EIGEN_ALIGNED_PTR(to));
+  EIGEN_DEBUG_UNALIGNED_STORE
+  vec_xst(from, 0, to);
 }

 template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_PPC_PREFETCH(addr); }

-template<> EIGEN_STRONG_INLINE double  pfirst<Packet2d>(const Packet2d& a) { double EIGEN_ALIGN16 x[2]; pstore<double>(x, a); return x[0]; }
+template<> EIGEN_STRONG_INLINE double  pfirst<Packet2d>(const Packet2d& a) { EIGEN_ALIGN16 double x[2]; pstore<double>(x, a); return x[0]; }

 template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
 {
--- a/Eigen/src/Core/arch/CUDA/Complex.h
+++ b/Eigen/src/Core/arch/CUDA/Complex.h
@@ -16,7 +16,7 @@ namespace Eigen {

 namespace internal {

-#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
+#if defined(EIGEN_CUDACC) && defined(EIGEN_USE_GPU)

 // Many std::complex methods such as operator+, operator-, operator* and
 // operator/ are not constexpr. Due to this, clang does not treat them as device
@@ -55,7 +55,7 @@ template<typename T> struct scalar_difference_op<std::complex<T>, std::complex<T
 // Product
 template<typename T> struct scalar_product_op<const std::complex<T>, const std::complex<T> >  : binary_op_base<const std::complex<T>, const std::complex<T> > {
  enum {
-    Vectorizable = packet_traits<std::complex<T>>::HasMul
+    Vectorizable = packet_traits<std::complex<T> >::HasMul
  };
  typedef typename std::complex<T> result_type;

@@ -76,7 +76,7 @@ template<typename T> struct scalar_product_op<std::complex<T>, std::complex<T> >
 // Quotient
 template<typename T> struct scalar_quotient_op<const std::complex<T>, const std::complex<T> > : binary_op_base<const std::complex<T>, const std::complex<T> > {
  enum {
-    Vectorizable = packet_traits<std::complex<T>>::HasDiv
+    Vectorizable = packet_traits<std::complex<T> >::HasDiv
  };
  typedef typename std::complex<T> result_type;

--- a/Eigen/src/Core/arch/CUDA/Half.h
+++ b/Eigen/src/Core/arch/CUDA/Half.h
@@ -57,7 +57,7 @@ struct __half_raw {
  explicit EIGEN_DEVICE_FUNC __half_raw(unsigned short raw) : x(raw) {}
  unsigned short x;
 };
-#elif defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
+#elif EIGEN_CUDA_SDK_VER < 90000
 // In CUDA < 9.0, __half is the equivalent of CUDA 9's __half_raw
 typedef __half __half_raw;
 #endif
@@ -70,7 +70,7 @@ struct half_base : public __half_raw {
  EIGEN_DEVICE_FUNC half_base() {}
  EIGEN_DEVICE_FUNC half_base(const half_base& h) : __half_raw(h) {}
  EIGEN_DEVICE_FUNC half_base(const __half_raw& h) : __half_raw(h) {}
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000
+#if defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDA_SDK_VER >= 90000
  EIGEN_DEVICE_FUNC half_base(const __half& h) : __half_raw(*(__half_raw*)&h) {}
 #endif
 };
@@ -79,7 +79,7 @@ struct half_base : public __half_raw {

 // Class definition.
 struct half : public half_impl::half_base {
-  #if !defined(EIGEN_HAS_CUDA_FP16) || (defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000)
+  #if !defined(EIGEN_HAS_CUDA_FP16) || (EIGEN_CUDA_SDK_VER < 90000)
    typedef half_impl::__half_raw __half_raw;
  #endif

@@ -87,7 +87,7 @@ struct half : public half_impl::half_base {

  EIGEN_DEVICE_FUNC half(const __half_raw& h) : half_impl::half_base(h) {}
  EIGEN_DEVICE_FUNC half(const half& h) : half_impl::half_base(h) {}
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000
+#if defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDA_SDK_VER >= 90000
  EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {}
 #endif

@@ -209,56 +209,56 @@ namespace half_impl {
 // versions to get the ALU speed increased), but you do save the
 // conversion steps back and forth.

-EIGEN_STRONG_INLINE __device__ half operator + (const half& a, const half& b) {
-  return __hadd(a, b);
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator + (const half& a, const half& b) {
+  return __hadd(static_cast<__half>(a), static_cast<__half>(b));
 }
-EIGEN_STRONG_INLINE __device__ half operator * (const half& a, const half& b) {
-  return __hmul(a, b);
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator * (const half& a, const half& b) {
+  return __hmul(static_cast<__half>(a), static_cast<__half>(b));
 }
-EIGEN_STRONG_INLINE __device__ half operator - (const half& a, const half& b) {
-  return __hsub(a, b);
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a, const half& b) {
+  return __hsub(static_cast<__half>(a), static_cast<__half>(b));
 }
-EIGEN_STRONG_INLINE __device__ half operator / (const half& a, const half& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, const half& b) {
  float num = __half2float(a);
  float denom = __half2float(b);
  return __float2half(num / denom);
 }
-EIGEN_STRONG_INLINE __device__ half operator - (const half& a) {
-  return __hneg(a);
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a) {
+  return __hneg(static_cast<__half>(a));
 }
-EIGEN_STRONG_INLINE __device__ half& operator += (half& a, const half& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator += (half& a, const half& b) {
  a = a + b;
  return a;
 }
-EIGEN_STRONG_INLINE __device__ half& operator *= (half& a, const half& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator *= (half& a, const half& b) {
  a = a * b;
  return a;
 }
-EIGEN_STRONG_INLINE __device__ half& operator -= (half& a, const half& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator -= (half& a, const half& b) {
  a = a - b;
  return a;
 }
-EIGEN_STRONG_INLINE __device__ half& operator /= (half& a, const half& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator /= (half& a, const half& b) {
  a = a / b;
  return a;
 }
-EIGEN_STRONG_INLINE __device__ bool operator == (const half& a, const half& b) {
-  return __heq(a, b);
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator == (const half& a, const half& b) {
+  return __heq(static_cast<__half>(a), static_cast<__half>(b));
 }
-EIGEN_STRONG_INLINE __device__ bool operator != (const half& a, const half& b) {
-  return __hne(a, b);
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator != (const half& a, const half& b) {
+  return __hne(static_cast<__half>(a), static_cast<__half>(b));
 }
-EIGEN_STRONG_INLINE __device__ bool operator < (const half& a, const half& b) {
-  return __hlt(a, b);
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator < (const half& a, const half& b) {
+  return __hlt(static_cast<__half>(a), static_cast<__half>(b));
 }
-EIGEN_STRONG_INLINE __device__ bool operator <= (const half& a, const half& b) {
-  return __hle(a, b);
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator <= (const half& a, const half& b) {
+  return __hle(static_cast<__half>(a), static_cast<__half>(b));
 }
-EIGEN_STRONG_INLINE __device__ bool operator > (const half& a, const half& b) {
-  return __hgt(a, b);
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator > (const half& a, const half& b) {
+  return __hgt(static_cast<__half>(a), static_cast<__half>(b));
 }
-EIGEN_STRONG_INLINE __device__ bool operator >= (const half& a, const half& b) {
-  return __hge(a, b);
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator >= (const half& a, const half& b) {
+  return __hge(static_cast<__half>(a), static_cast<__half>(b));
 }

 #else  // Emulate support for half floats
@@ -449,14 +449,14 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half abs(const half& a) {
  return result;
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp(const half& a) {
-#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530
+#if EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530
  return half(hexp(a));
 #else
   return half(::expf(float(a)));
 #endif
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log(const half& a) {
-#if defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
+#if defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDA_SDK_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
  return half(::hlog(a));
 #else
  return half(::logf(float(a)));
@@ -469,7 +469,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log10(const half& a) {
  return half(::log10f(float(a)));
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sqrt(const half& a) {
-#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530
+#if EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530
  return half(hsqrt(a));
 #else
    return half(::sqrtf(float(a)));
@@ -491,14 +491,14 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tanh(const half& a) {
  return half(::tanhf(float(a)));
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) {
-#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300
+#if EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300
  return half(hfloor(a));
 #else
  return half(::floorf(float(a)));
 #endif
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) {
-#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300
+#if EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300
  return half(hceil(a));
 #else
  return half(::ceilf(float(a)));
@@ -541,7 +541,7 @@ struct random_default_impl<half, false, false>
 {
  static inline half run(const half& x, const half& y)
  {
-    return x + (y-x) * half(float(std::rand()) / float(RAND_MAX));
+    return x + (y-x) * half(random<float>());
  }
  static inline half run()
  {
@@ -593,7 +593,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half exph(const Eigen::half& a) {
  return Eigen::half(::expf(float(a)));
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half logh(const Eigen::half& a) {
-#if EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
+#if EIGEN_CUDA_SDK_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
  return Eigen::half(::hlog(a));
 #else
  return Eigen::half(::logf(float(a)));
@@ -626,25 +626,71 @@ struct hash<Eigen::half> {
 } // end namespace std


-// Add the missing shfl_xor intrinsic
-#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
-__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) {
-  #if EIGEN_CUDACC_VER < 90000
-  return static_cast<Eigen::half>(__shfl_xor(static_cast<float>(var), laneMask, width));
-  #else
-  return static_cast<Eigen::half>(__shfl_xor_sync(0xFFFFFFFF, static_cast<float>(var), laneMask, width));
-  #endif
+// Add the missing shfl* intrinsics.
+// The __shfl* functions are only valid on HIP or _CUDA_ARCH_ >= 300.
+//   CUDA defines them for (__CUDA_ARCH__ >= 300 || !defined(__CUDA_ARCH__))
+//
+// HIP and CUDA prior to SDK 9.0 define
+//    __shfl, __shfl_up, __shfl_down, __shfl_xor for int and float
+// CUDA since 9.0 deprecates those and instead defines
+//    __shfl_sync, __shfl_up_sync, __shfl_down_sync, __shfl_xor_sync,
+//    with native support for __half and __nv_bfloat16
+//
+// Note that the following are __device__ - only functions.
+#if defined(EIGEN_CUDACC) && (!defined(EIGEN_CUDA_ARCH) || EIGEN_CUDA_ARCH >= 300)
+#if defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDA_SDK_VER >= 90000
+
+__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_sync(unsigned mask, Eigen::half var, int srcLane,
+                                                       int width = warpSize) {
+  const __half h = var;
+  return static_cast<Eigen::half>(__shfl_sync(mask, h, srcLane, width));
 }
+
+__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_up_sync(unsigned mask, Eigen::half var, unsigned int delta,
+                                                          int width = warpSize) {
+  const __half h = var;
+  return static_cast<Eigen::half>(__shfl_up_sync(mask, h, delta, width));
+}
+
+__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_down_sync(unsigned mask, Eigen::half var, unsigned int delta,
+                                                            int width = warpSize) {
+  const __half h = var;
+  return static_cast<Eigen::half>(__shfl_down_sync(mask, h, delta, width));
+}
+
+__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor_sync(unsigned mask, Eigen::half var, int laneMask,
+                                                           int width = warpSize) {
+  const __half h = var;
+  return static_cast<Eigen::half>(__shfl_xor_sync(mask, h, laneMask, width));
+}
+
+#else  // CUDA SDK < 9.0
+
+__device__ EIGEN_STRONG_INLINE Eigen::half __shfl(Eigen::half var, int srcLane, int width = warpSize) {
+  return static_cast<Eigen::half>(__shfl(static_cast<float>(var), srcLane, width));
+}
+
+__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_up(Eigen::half var, unsigned int delta, int width = warpSize) {
+  return static_cast<Eigen::half>(__shfl_up(static_cast<float>(var), delta, width));
+}
+
+__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_down(Eigen::half var, unsigned int delta, int width = warpSize) {
+  return static_cast<Eigen::half>(__shfl_down(static_cast<float>(var), delta, width));
+}
+
+__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width = warpSize) {
+  return static_cast<Eigen::half>(__shfl_xor(static_cast<float>(var), laneMask, width));
+}
+
 #endif
+#endif  // __shfl*

 // ldg() has an overload for __half_raw, but we also need one for Eigen::half.
-#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr) {
-  return Eigen::half_impl::raw_uint16_to_half(
-      __ldg(reinterpret_cast<const unsigned short*>(ptr)));
+#if defined(EIGEN_CUDACC) && (!defined(EIGEN_CUDA_ARCH) || EIGEN_CUDA_ARCH >= 350)
+EIGEN_STRONG_INLINE __device__ Eigen::half __ldg(const Eigen::half* ptr) {
+  return Eigen::half_impl::raw_uint16_to_half(__ldg(reinterpret_cast<const Eigen::numext::uint16_t*>(ptr)));
 }
-#endif
-
+#endif  // __ldg

 #if defined(EIGEN_CUDA_ARCH)
 namespace Eigen {
--- a/Eigen/src/Core/arch/CUDA/MathFunctions.h
+++ b/Eigen/src/Core/arch/CUDA/MathFunctions.h
@@ -17,7 +17,7 @@ namespace internal {
 // Make sure this is only available when targeting a GPU: we don't want to
 // introduce conflicts between these packet_traits definitions and the ones
 // we'll use on the host side (SSE, AVX, ...)
-#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
+#if defined(EIGEN_CUDACC) && defined(EIGEN_USE_GPU)
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 float4 plog<float4>(const float4& a)
 {
--- a/Eigen/src/Core/arch/CUDA/PacketMath.h
+++ b/Eigen/src/Core/arch/CUDA/PacketMath.h
@@ -17,7 +17,7 @@ namespace internal {
 // Make sure this is only available when targeting a GPU: we don't want to
 // introduce conflicts between these packet_traits definitions and the ones
 // we'll use on the host side (SSE, AVX, ...)
-#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
+#if defined(EIGEN_CUDACC) && defined(EIGEN_USE_GPU)
 template<> struct is_arithmetic<float4>  { enum { value = true }; };
 template<> struct is_arithmetic<double2> { enum { value = true }; };

@@ -167,10 +167,10 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploadu<double2>(const d
  return make_double2(from[0], from[1]);
 }

-template<> EIGEN_STRONG_INLINE float4 ploaddup<float4>(const float*   from) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploaddup<float4>(const float*   from) {
  return make_float4(from[0], from[0], from[1], from[1]);
 }
-template<> EIGEN_STRONG_INLINE double2 ploaddup<double2>(const double*  from) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploaddup<double2>(const double*  from) {
  return make_double2(from[0], from[0]);
 }

@@ -197,7 +197,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<double>(double* to
 template<>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const float* from) {
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
-  return __ldg((const float4*)from);
+  return __ldg(reinterpret_cast<const float4*>(from));
 #else
  return make_float4(from[0], from[1], from[2], from[3]);
 #endif
@@ -205,7 +205,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const fl
 template<>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const double* from) {
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
-  return __ldg((const double2*)from);
+  return __ldg(reinterpret_cast<const double2*>(from));
 #else
  return make_double2(from[0], from[1]);
 #endif
--- a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
+++ b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
@@ -15,7 +15,7 @@ namespace Eigen {
 namespace internal {

 // Most of the following operations require arch >= 3.0
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDACC__) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300

 template<> struct is_arithmetic<half2> { enum { value = true }; };

@@ -41,42 +41,42 @@ template<> struct packet_traits<Eigen::half> : default_packet_traits

 template<> struct unpacket_traits<half2> { typedef Eigen::half type; enum {size=2, alignment=Aligned16}; typedef half2 half; };

-template<> __device__ EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) {
  return __half2half2(from);
 }

-template<> __device__ EIGEN_STRONG_INLINE half2 pload<half2>(const Eigen::half* from) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload<half2>(const Eigen::half* from) {
  return *reinterpret_cast<const half2*>(from);
 }

-template<> __device__ EIGEN_STRONG_INLINE half2 ploadu<half2>(const Eigen::half* from) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploadu<half2>(const Eigen::half* from) {
  return __halves2half2(from[0], from[1]);
 }

-template<> EIGEN_STRONG_INLINE half2 ploaddup<half2>(const Eigen::half*  from) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploaddup<half2>(const Eigen::half*  from) {
  return __halves2half2(from[0], from[0]);
 }

-template<> __device__ EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const half2& from) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const half2& from) {
  *reinterpret_cast<half2*>(to) = from;
 }

-template<> __device__ EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const half2& from) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const half2& from) {
  to[0] = __low2half(from);
  to[1] = __high2half(from);
 }

 template<>
- __device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const Eigen::half* from) {
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const Eigen::half* from) {
 #if __CUDA_ARCH__ >= 350
-   return __ldg((const half2*)from);
+   return __ldg(reinterpret_cast<const half2*>(from));
 #else
  return __halves2half2(*(from+0), *(from+1));
 #endif
 }

 template<>
-__device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const Eigen::half* from) {
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const Eigen::half* from) {
 #if __CUDA_ARCH__ >= 350
   return __halves2half2(__ldg(from+0), __ldg(from+1));
 #else
@@ -84,20 +84,20 @@ __device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const Eigen::ha
 #endif
 }

-template<> __device__ EIGEN_STRONG_INLINE half2 pgather<Eigen::half, half2>(const Eigen::half* from, Index stride) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pgather<Eigen::half, half2>(const Eigen::half* from, Index stride) {
  return __halves2half2(from[0*stride], from[1*stride]);
 }

-template<> __device__ EIGEN_STRONG_INLINE void pscatter<Eigen::half, half2>(Eigen::half* to, const half2& from, Index stride) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<Eigen::half, half2>(Eigen::half* to, const half2& from, Index stride) {
  to[stride*0] = __low2half(from);
  to[stride*1] = __high2half(from);
 }

-template<> __device__ EIGEN_STRONG_INLINE Eigen::half pfirst<half2>(const half2& a) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst<half2>(const half2& a) {
  return __low2half(a);
 }

-template<> __device__ EIGEN_STRONG_INLINE half2 pabs<half2>(const half2& a) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pabs<half2>(const half2& a) {
  half2 result;
  unsigned temp = *(reinterpret_cast<const unsigned*>(&(a)));
  *(reinterpret_cast<unsigned*>(&(result))) = temp & 0x7FFF7FFF;
@@ -105,7 +105,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pabs<half2>(const half2& a) {
 }


-__device__ EIGEN_STRONG_INLINE void
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
 ptranspose(PacketBlock<half2,2>& kernel) {
  __half a1 = __low2half(kernel.packet[0]);
  __half a2 = __high2half(kernel.packet[0]);
@@ -115,7 +115,7 @@ ptranspose(PacketBlock<half2,2>& kernel) {
  kernel.packet[1] = __halves2half2(a2, b2);
 }

-template<> __device__ EIGEN_STRONG_INLINE half2 plset<half2>(const Eigen::half& a) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset<half2>(const Eigen::half& a) {
 #if __CUDA_ARCH__ >= 530
  return __halves2half2(a, __hadd(a, __float2half(1.0f)));
 #else
@@ -124,7 +124,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 plset<half2>(const Eigen::half&
 #endif
 }

-template<> __device__ EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
 #if __CUDA_ARCH__ >= 530
  return __hadd2(a, b);
 #else
@@ -138,7 +138,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, cons
 #endif
 }

-template<> __device__ EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, const half2& b) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, const half2& b) {
 #if __CUDA_ARCH__ >= 530
  return __hsub2(a, b);
 #else
@@ -152,7 +152,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, cons
 #endif
 }

-template<> __device__ EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
 #if __CUDA_ARCH__ >= 530
  return __hneg2(a);
 #else
@@ -162,9 +162,9 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
 #endif
 }

-template<> __device__ EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }

-template<> __device__ EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) {
 #if __CUDA_ARCH__ >= 530
  return __hmul2(a, b);
 #else
@@ -178,7 +178,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, cons
 #endif
 }

-template<> __device__ EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, const half2& b, const half2& c) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, const half2& b, const half2& c) {
 #if __CUDA_ARCH__ >= 530
   return __hfma2(a, b, c);
 #else
@@ -194,7 +194,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, con
 #endif
 }

-template<> __device__ EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) {
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  float b1 = __low2float(b);
@@ -204,7 +204,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, cons
  return __floats2half2_rn(r1, r2);
 }

-template<> __device__ EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, const half2& b) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, const half2& b) {
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  float b1 = __low2float(b);
@@ -214,7 +214,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, cons
  return __halves2half2(r1, r2);
 }

-template<> __device__ EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, const half2& b) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, const half2& b) {
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  float b1 = __low2float(b);
@@ -224,7 +224,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, cons
  return __halves2half2(r1, r2);
 }

-template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux<half2>(const half2& a) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux<half2>(const half2& a) {
 #if __CUDA_ARCH__ >= 530
  return __hadd(__low2half(a), __high2half(a));
 #else
@@ -234,7 +234,7 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux<half2>(const half2&
 #endif
 }

-template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_max<half2>(const half2& a) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max<half2>(const half2& a) {
 #if __CUDA_ARCH__ >= 530
  __half first = __low2half(a);
  __half second = __high2half(a);
@@ -246,7 +246,7 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_max<half2>(const ha
 #endif
 }

-template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_min<half2>(const half2& a) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min<half2>(const half2& a) {
 #if __CUDA_ARCH__ >= 530
  __half first = __low2half(a);
  __half second = __high2half(a);
@@ -258,7 +258,7 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_min<half2>(const ha
 #endif
 }

-template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_mul<half2>(const half2& a) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_mul<half2>(const half2& a) {
 #if __CUDA_ARCH__ >= 530
  return __hmul(__low2half(a), __high2half(a));
 #else
@@ -268,7 +268,7 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_mul<half2>(const ha
 #endif
 }

-template<> __device__ EIGEN_STRONG_INLINE half2 plog1p<half2>(const half2& a) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog1p<half2>(const half2& a) {
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  float r1 = log1pf(a1);
@@ -276,31 +276,31 @@ template<> __device__ EIGEN_STRONG_INLINE half2 plog1p<half2>(const half2& a) {
  return __floats2half2_rn(r1, r2);
 }

-#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530
+#if EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530

-template<>  __device__ EIGEN_STRONG_INLINE
+template<>  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 half2 plog<half2>(const half2& a) {
  return h2log(a);
 }

-template<> __device__ EIGEN_STRONG_INLINE
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 half2 pexp<half2>(const half2& a) {
  return h2exp(a);
 }

-template<> __device__ EIGEN_STRONG_INLINE
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 half2 psqrt<half2>(const half2& a) {
  return h2sqrt(a);
 }

-template<> __device__ EIGEN_STRONG_INLINE
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 half2 prsqrt<half2>(const half2& a) {
  return h2rsqrt(a);
 }

 #else

-template<> __device__ EIGEN_STRONG_INLINE half2 plog<half2>(const half2& a) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog<half2>(const half2& a) {
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  float r1 = logf(a1);
@@ -308,7 +308,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 plog<half2>(const half2& a) {
  return __floats2half2_rn(r1, r2);
 }

-template<> __device__ EIGEN_STRONG_INLINE half2 pexp<half2>(const half2& a) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexp<half2>(const half2& a) {
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  float r1 = expf(a1);
@@ -316,7 +316,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pexp<half2>(const half2& a) {
  return __floats2half2_rn(r1, r2);
 }

-template<> __device__ EIGEN_STRONG_INLINE half2 psqrt<half2>(const half2& a) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psqrt<half2>(const half2& a) {
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  float r1 = sqrtf(a1);
@@ -324,7 +324,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 psqrt<half2>(const half2& a) {
  return __floats2half2_rn(r1, r2);
 }

-template<> __device__ EIGEN_STRONG_INLINE half2 prsqrt<half2>(const half2& a) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 prsqrt<half2>(const half2& a) {
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  float r1 = rsqrtf(a1);
--- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
+++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -32,7 +32,7 @@ namespace internal {
 #if EIGEN_ARCH_ARM64
 #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
 #else
-#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16 
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16
 #endif
 #endif

@@ -107,7 +107,7 @@ template<> struct packet_traits<float>  : default_packet_traits
    AlignedOnScalar = 1,
    size = 4,
    HasHalfPacket=0, // Packet2f intrinsics not implemented yet
-   
+
    HasDiv  = 1,
    // FIXME check the Has*
    HasSin  = 0,
@@ -173,32 +173,48 @@ template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
 template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return vmulq_f32(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) { return vmulq_s32(a,b); }

-template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
-#if EIGEN_ARCH_ARM64
-  return vdivq_f32(a,b);
-#else
-  Packet4f inv, restep, div;
-
-  // NEON does not offer a divide instruction, we have to do a reciprocal approximation
-  // However NEON in contrast to other SIMD engines (AltiVec/SSE), offers
-  // a reciprocal estimate AND a reciprocal step -which saves a few instructions
-  // vrecpeq_f32() returns an estimate to 1/b, which we will finetune with
-  // Newton-Raphson and vrecpsq_f32()
-  inv = vrecpeq_f32(b);
-
-  // This returns a differential, by which we will have to multiply inv to get a better
-  // approximation of 1/b.
-  restep = vrecpsq_f32(b, inv);
-  inv = vmulq_f32(restep, inv);
-
-  // Finally, multiply a by 1/b and get the wanted result of the division.
-  div = vmulq_f32(a, inv);
-
-  return div;
-#endif
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) {
+  return vbslq_f32(vreinterpretq_u32_f32(mask), a, b);
 }

+EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) {
+  return vreinterpretq_f32_u32(vcleq_f32(a, b));
+}
+
+EIGEN_STRONG_INLINE Packet4f preciprocal(const Packet4f& a)
+{
+  // Compute approximate reciprocal.
+  float32x4_t result = vrecpeq_f32(a);
+  result = vmulq_f32(vrecpsq_f32(a, result), result);
+  result = vmulq_f32(vrecpsq_f32(a, result), result);
+  return result;
+}
+
+#if EIGEN_ARCH_ARM64
+template<> EIGEN_STRONG_INLINE Packet4f pdiv(const Packet4f& a, const Packet4f& b) { return vdivq_f32(a, b); }
+template<> EIGEN_STRONG_INLINE Packet2f pdiv(const Packet2f& a, const Packet2f& b) { return vdiv_f32(a, b); }
+#else
+template<typename Packet>
+EIGEN_STRONG_INLINE Packet pdiv_float_common(const Packet& a, const Packet& b) {
+  // if b is large, NEON intrinsics will flush preciprocal(b) to zero
+  // avoid underflow with the following manipulation:
+  // a / b = f * (a * reciprocal(f * b))
+  const Packet cst_one = pset1<Packet>(1.0f);
+  const Packet cst_quarter = pset1<Packet>(0.25f);
+  const Packet cst_thresh = pset1<Packet>(NumTraits<float>::highest() / 4.0f);
+
+  Packet b_will_underflow = pcmp_le(cst_thresh, pabs(b));
+  Packet f = pselect(b_will_underflow, cst_quarter, cst_one);
+  Packet result = pmul(f, pmul(a, preciprocal(pmul(b, f))));
+  return result;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return pdiv_float_common(a, b);
+}
+
+#endif
+
 template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/)
 { eigen_assert(false && "packet integer division are not supported by NEON");
  return pset1<Packet4i>(0);
@@ -208,7 +224,7 @@ template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, co
 // then implements a slow software scalar fallback calling fmaf()!
 // Filed LLVM bug:
 //     https://llvm.org/bugs/show_bug.cgi?id=27216
-#if (defined __ARM_FEATURE_FMA) && !(EIGEN_COMP_CLANG && EIGEN_ARCH_ARM)
+#if (defined EIGEN_VECTORIZE_FMA) && !(EIGEN_COMP_CLANG && EIGEN_ARCH_ARM)
 // See bug 936.
 // FMA is available on VFPv4 i.e. when compiling with -mfpu=neon-vfpv4.
 // FMA is a true fused multiply-add i.e. only 1 rounding at the end, no intermediate rounding.
@@ -478,7 +494,7 @@ template<> EIGEN_STRONG_INLINE int32_t predux_min<Packet4i>(const Packet4i& a)
  a_hi = vget_high_s32(a);
  min = vpmin_s32(a_lo, a_hi);
  min = vpmin_s32(min, min);
-  
+
  return vget_lane_s32(min, 0);
 }

@@ -595,7 +611,7 @@ template<> struct packet_traits<double>  : default_packet_traits
    AlignedOnScalar = 1,
    size = 2,
    HasHalfPacket=0,
-   
+
    HasDiv  = 1,
    // FIXME check the Has*
    HasSin  = 0,
@@ -628,7 +644,7 @@ template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const

 template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return vdivq_f64(a,b); }

-#ifdef __ARM_FEATURE_FMA
+#ifdef EIGEN_VECTORIZE_FMA
 // See bug 936. See above comment about FMA for float.
 template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vfmaq_f64(c,a,b); }
 #else
@@ -751,7 +767,7 @@ ptranspose(PacketBlock<Packet2d,2>& kernel) {
  kernel.packet[0] = trn1;
  kernel.packet[1] = trn2;
 }
-#endif // EIGEN_ARCH_ARM64 
+#endif // EIGEN_ARCH_ARM64

 } // end namespace internal

--- a/Eigen/src/Core/arch/ZVector/PacketMath.h
+++ b/Eigen/src/Core/arch/ZVector/PacketMath.h
@@ -10,8 +10,6 @@
 #ifndef EIGEN_PACKET_MATH_ZVECTOR_H
 #define EIGEN_PACKET_MATH_ZVECTOR_H

-#include <stdint.h>
-
 namespace Eigen {

 namespace internal {
@@ -46,10 +44,10 @@ typedef struct {
 } Packet4f;

 typedef union {
-  int32_t   i[4];
-  uint32_t ui[4];
-  int64_t   l[2];
-  uint64_t ul[2];
+  numext::int32_t   i[4];
+  numext::uint32_t ui[4];
+  numext::int64_t   l[2];
+  numext::uint64_t ul[2];
  double    d[2];
  Packet4i  v4i;
  Packet4ui v4ui;
--- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h
@@ -10,7 +10,7 @@
 #ifndef EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_H
 #define EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_H

-namespace Eigen { 
+namespace Eigen {

 template<typename Scalar, typename Index, int StorageOrder, int UpLo, bool ConjLhs, bool ConjRhs>
 struct selfadjoint_rank1_update;
@@ -27,7 +27,7 @@ namespace internal {
 // forward declarations (defined at the end of this file)
 template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjLhs, bool ConjRhs, int ResInnerStride, int UpLo>
 struct tribb_kernel;
-  
+
 /* Optimized matrix-matrix product evaluating only one triangular half */
 template <typename Index,
          typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs,
@@ -164,7 +164,7 @@ struct tribb_kernel
      if(UpLo==Upper)
        gebp_kernel1(res.getSubMapper(0, j), blockA, actual_b, j, depth, actualBlockSize, alpha,
                     -1, -1, 0, 0);
-      
+
      // selfadjoint micro block
      {
        Index i = j;
@@ -186,7 +186,7 @@ struct tribb_kernel
      if(UpLo==Lower)
      {
        Index i = j+actualBlockSize;
-        gebp_kernel1(res.getSubMapper(i, j), blockA+depth*i, actual_b, size-i, 
+        gebp_kernel1(res.getSubMapper(i, j), blockA+depth*i, actual_b, size-i,
                     depth, actualBlockSize, alpha, -1, -1, 0, 0);
      }
    }
@@ -207,13 +207,13 @@ struct general_product_to_triangular_selector<MatrixType,ProductType,UpLo,true>
  static void run(MatrixType& mat, const ProductType& prod, const typename MatrixType::Scalar& alpha, bool beta)
  {
    typedef typename MatrixType::Scalar Scalar;
-    
+
    typedef typename internal::remove_all<typename ProductType::LhsNested>::type Lhs;
    typedef internal::blas_traits<Lhs> LhsBlasTraits;
    typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhs;
    typedef typename internal::remove_all<ActualLhs>::type _ActualLhs;
    typename internal::add_const_on_value_type<ActualLhs>::type actualLhs = LhsBlasTraits::extract(prod.lhs());
-    
+
    typedef typename internal::remove_all<typename ProductType::RhsNested>::type Rhs;
    typedef internal::blas_traits<Rhs> RhsBlasTraits;
    typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhs;
@@ -230,18 +230,18 @@ struct general_product_to_triangular_selector<MatrixType,ProductType,UpLo,true>
      UseLhsDirectly = _ActualLhs::InnerStrideAtCompileTime==1,
      UseRhsDirectly = _ActualRhs::InnerStrideAtCompileTime==1
    };
-    
+
    internal::gemv_static_vector_if<Scalar,Lhs::SizeAtCompileTime,Lhs::MaxSizeAtCompileTime,!UseLhsDirectly> static_lhs;
    ei_declare_aligned_stack_constructed_variable(Scalar, actualLhsPtr, actualLhs.size(),
      (UseLhsDirectly ? const_cast<Scalar*>(actualLhs.data()) : static_lhs.data()));
    if(!UseLhsDirectly) Map<typename _ActualLhs::PlainObject>(actualLhsPtr, actualLhs.size()) = actualLhs;
-    
+
    internal::gemv_static_vector_if<Scalar,Rhs::SizeAtCompileTime,Rhs::MaxSizeAtCompileTime,!UseRhsDirectly> static_rhs;
    ei_declare_aligned_stack_constructed_variable(Scalar, actualRhsPtr, actualRhs.size(),
      (UseRhsDirectly ? const_cast<Scalar*>(actualRhs.data()) : static_rhs.data()));
    if(!UseRhsDirectly) Map<typename _ActualRhs::PlainObject>(actualRhsPtr, actualRhs.size()) = actualRhs;
-    
-    
+
+
    selfadjoint_rank1_update<Scalar,Index,StorageOrder,UpLo,
                              LhsBlasTraits::NeedToConjugate && NumTraits<Scalar>::IsComplex,
                              RhsBlasTraits::NeedToConjugate && NumTraits<Scalar>::IsComplex>
@@ -259,7 +259,7 @@ struct general_product_to_triangular_selector<MatrixType,ProductType,UpLo,false>
    typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhs;
    typedef typename internal::remove_all<ActualLhs>::type _ActualLhs;
    typename internal::add_const_on_value_type<ActualLhs>::type actualLhs = LhsBlasTraits::extract(prod.lhs());
-    
+
    typedef typename internal::remove_all<typename ProductType::RhsNested>::type Rhs;
    typedef internal::blas_traits<Rhs> RhsBlasTraits;
    typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhs;
@@ -300,15 +300,15 @@ struct general_product_to_triangular_selector<MatrixType,ProductType,UpLo,false>
  }
 };

-template<typename MatrixType, unsigned int UpLo>
+template<typename _MatrixType, unsigned int _Mode>
 template<typename ProductType>
-TriangularView<MatrixType,UpLo>& TriangularViewImpl<MatrixType,UpLo,Dense>::_assignProduct(const ProductType& prod, const Scalar& alpha, bool beta)
+EIGEN_DEVICE_FUNC TriangularView<_MatrixType,_Mode>& TriangularViewImpl<_MatrixType,_Mode,Dense>::_assignProduct(const ProductType& prod, const Scalar& alpha, bool beta)
 {
-  EIGEN_STATIC_ASSERT((UpLo&UnitDiag)==0, WRITING_TO_TRIANGULAR_PART_WITH_UNIT_DIAGONAL_IS_NOT_SUPPORTED);
+  EIGEN_STATIC_ASSERT((_Mode&UnitDiag)==0, WRITING_TO_TRIANGULAR_PART_WITH_UNIT_DIAGONAL_IS_NOT_SUPPORTED);
  eigen_assert(derived().nestedExpression().rows() == prod.rows() && derived().cols() == prod.cols());
-  
-  general_product_to_triangular_selector<MatrixType, ProductType, UpLo, internal::traits<ProductType>::InnerSize==1>::run(derived().nestedExpression().const_cast_derived(), prod, alpha, beta);
-  
+
+  general_product_to_triangular_selector<_MatrixType, ProductType, _Mode, internal::traits<ProductType>::InnerSize==1>::run(derived().nestedExpression().const_cast_derived(), prod, alpha, beta);
+
  return derived();
 }

--- a/Eigen/src/Core/products/Parallelizer.h
+++ b/Eigen/src/Core/products/Parallelizer.h
@@ -132,8 +132,7 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, Index depth,

  ei_declare_aligned_stack_constructed_variable(GemmParallelInfo<Index>,info,threads,0);

-  int errorCount = 0;
-  #pragma omp parallel num_threads(threads) reduction(+: errorCount)
+  #pragma omp parallel num_threads(threads)
  {
    Index i = omp_get_thread_num();
    // Note that the actual number of threads might be lower than the number of request ones.
@@ -152,14 +151,11 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, Index depth,
    info[i].lhs_start = r0;
    info[i].lhs_length = actualBlockRows;

-    EIGEN_TRY {
-      if(transpose) func(c0, actualBlockCols, 0, rows, info);
-      else          func(0, rows, c0, actualBlockCols, info);
-    } EIGEN_CATCH(...) {
-      ++errorCount;
-    }
+    if(transpose)
+      func(c0, actualBlockCols, 0, rows, info);
+    else
+      func(0, rows, c0, actualBlockCols, info);
  }
-  if (errorCount) EIGEN_THROW_X(Eigen::eigen_assert_exception());
 #endif
 }

--- a/Eigen/src/Core/products/SelfadjointProduct.h
+++ b/Eigen/src/Core/products/SelfadjointProduct.h
@@ -16,7 +16,7 @@
 * It corresponds to the level 3 SYRK and level 2 SYR Blas routines.
 **********************************************************************/

-namespace Eigen { 
+namespace Eigen {


 template<typename Scalar, typename Index, int UpLo, bool ConjLhs, bool ConjRhs>
@@ -68,10 +68,10 @@ struct selfadjoint_product_selector<MatrixType,OtherType,UpLo,true>

    ei_declare_aligned_stack_constructed_variable(Scalar, actualOtherPtr, other.size(),
      (UseOtherDirectly ? const_cast<Scalar*>(actualOther.data()) : static_other.data()));
-      
+
    if(!UseOtherDirectly)
      Map<typename _ActualOtherType::PlainObject>(actualOtherPtr, actualOther.size()) = actualOther;
-    
+
    selfadjoint_rank1_update<Scalar,Index,StorageOrder,UpLo,
                              OtherBlasTraits::NeedToConjugate  && NumTraits<Scalar>::IsComplex,
                            (!OtherBlasTraits::NeedToConjugate) && NumTraits<Scalar>::IsComplex>
@@ -120,7 +120,7 @@ struct selfadjoint_product_selector<MatrixType,OtherType,UpLo,false>

 template<typename MatrixType, unsigned int UpLo>
 template<typename DerivedU>
-SelfAdjointView<MatrixType,UpLo>& SelfAdjointView<MatrixType,UpLo>
+EIGEN_DEVICE_FUNC SelfAdjointView<MatrixType,UpLo>& SelfAdjointView<MatrixType,UpLo>
 ::rankUpdate(const MatrixBase<DerivedU>& u, const Scalar& alpha)
 {
  selfadjoint_product_selector<MatrixType,DerivedU,UpLo>::run(_expression().const_cast_derived(), u.derived(), alpha);
--- a/Eigen/src/Core/products/SelfadjointRank2Update.h
+++ b/Eigen/src/Core/products/SelfadjointRank2Update.h
@@ -10,7 +10,7 @@
 #ifndef EIGEN_SELFADJOINTRANK2UPTADE_H
 #define EIGEN_SELFADJOINTRANK2UPTADE_H

-namespace Eigen { 
+namespace Eigen {

 namespace internal {

@@ -57,7 +57,7 @@ template<bool Cond, typename T> struct conj_expr_if

 template<typename MatrixType, unsigned int UpLo>
 template<typename DerivedU, typename DerivedV>
-SelfAdjointView<MatrixType,UpLo>& SelfAdjointView<MatrixType,UpLo>
+EIGEN_DEVICE_FUNC SelfAdjointView<MatrixType,UpLo>& SelfAdjointView<MatrixType,UpLo>
 ::rankUpdate(const MatrixBase<DerivedU>& u, const MatrixBase<DerivedV>& v, const Scalar& alpha)
 {
  typedef internal::blas_traits<DerivedU> UBlasTraits;
--- a/Eigen/src/Core/util/ConfigureVectorization.h
+++ b/Eigen/src/Core/util/ConfigureVectorization.h
@@ -0,0 +1,521 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2018 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2020, Arm Limited and Contributors
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CONFIGURE_VECTORIZATION_H
+#define EIGEN_CONFIGURE_VECTORIZATION_H
+
+//------------------------------------------------------------------------------------------
+// Static and dynamic alignment control
+//
+// The main purpose of this section is to define EIGEN_MAX_ALIGN_BYTES and EIGEN_MAX_STATIC_ALIGN_BYTES
+// as the maximal boundary in bytes on which dynamically and statically allocated data may be alignment respectively.
+// The values of EIGEN_MAX_ALIGN_BYTES and EIGEN_MAX_STATIC_ALIGN_BYTES can be specified by the user. If not,
+// a default value is automatically computed based on architecture, compiler, and OS.
+//
+// This section also defines macros EIGEN_ALIGN_TO_BOUNDARY(N) and the shortcuts EIGEN_ALIGN{8,16,32,_MAX}
+// to be used to declare statically aligned buffers.
+//------------------------------------------------------------------------------------------
+
+
+/* EIGEN_ALIGN_TO_BOUNDARY(n) forces data to be n-byte aligned. This is used to satisfy SIMD requirements.
+ * However, we do that EVEN if vectorization (EIGEN_VECTORIZE) is disabled,
+ * so that vectorization doesn't affect binary compatibility.
+ *
+ * If we made alignment depend on whether or not EIGEN_VECTORIZE is defined, it would be impossible to link
+ * vectorized and non-vectorized code.
+ * 
+ * FIXME: this code can be cleaned up once we switch to proper C++11 only.
+ */
+#if (defined EIGEN_CUDACC)
+  #define EIGEN_ALIGN_TO_BOUNDARY(n) __align__(n)
+  #define EIGEN_ALIGNOF(x) __alignof(x)
+#elif EIGEN_HAS_ALIGNAS
+  #define EIGEN_ALIGN_TO_BOUNDARY(n) alignas(n)
+  #define EIGEN_ALIGNOF(x) alignof(x)
+#elif EIGEN_COMP_GNUC || EIGEN_COMP_PGI || EIGEN_COMP_IBM || EIGEN_COMP_ARM
+  #define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n)))
+  #define EIGEN_ALIGNOF(x) __alignof(x)
+#elif EIGEN_COMP_MSVC
+  #define EIGEN_ALIGN_TO_BOUNDARY(n) __declspec(align(n))
+  #define EIGEN_ALIGNOF(x) __alignof(x)
+#elif EIGEN_COMP_SUNCC
+  // FIXME not sure about this one:
+  #define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n)))
+  #define EIGEN_ALIGNOF(x) __alignof(x)
+#else
+  #error Please tell me what is the equivalent of alignas(n) and alignof(x) for your compiler
+#endif
+
+// If the user explicitly disable vectorization, then we also disable alignment
+#if defined(EIGEN_DONT_VECTORIZE)
+  #if defined(EIGEN_GPUCC)
+    // GPU code is always vectorized and requires memory alignment for
+    // statically allocated buffers.
+    #define EIGEN_IDEAL_MAX_ALIGN_BYTES 16
+  #else
+    #define EIGEN_IDEAL_MAX_ALIGN_BYTES 0
+  #endif
+#elif defined(__AVX512F__)
+  // 64 bytes static alignment is preferred only if really required
+  #define EIGEN_IDEAL_MAX_ALIGN_BYTES 64
+#elif defined(__AVX__)
+  // 32 bytes static alignment is preferred only if really required
+  #define EIGEN_IDEAL_MAX_ALIGN_BYTES 32
+#else
+  #define EIGEN_IDEAL_MAX_ALIGN_BYTES 16
+#endif
+
+
+// EIGEN_MIN_ALIGN_BYTES defines the minimal value for which the notion of explicit alignment makes sense
+#define EIGEN_MIN_ALIGN_BYTES 16
+
+// Defined the boundary (in bytes) on which the data needs to be aligned. Note
+// that unless EIGEN_ALIGN is defined and not equal to 0, the data may not be
+// aligned at all regardless of the value of this #define.
+
+#if (defined(EIGEN_DONT_ALIGN_STATICALLY) || defined(EIGEN_DONT_ALIGN))  && defined(EIGEN_MAX_STATIC_ALIGN_BYTES) && EIGEN_MAX_STATIC_ALIGN_BYTES>0
+#error EIGEN_MAX_STATIC_ALIGN_BYTES and EIGEN_DONT_ALIGN[_STATICALLY] are both defined with EIGEN_MAX_STATIC_ALIGN_BYTES!=0. Use EIGEN_MAX_STATIC_ALIGN_BYTES=0 as a synonym of EIGEN_DONT_ALIGN_STATICALLY.
+#endif
+
+// EIGEN_DONT_ALIGN_STATICALLY and EIGEN_DONT_ALIGN are deprecated
+// They imply EIGEN_MAX_STATIC_ALIGN_BYTES=0
+#if defined(EIGEN_DONT_ALIGN_STATICALLY) || defined(EIGEN_DONT_ALIGN)
+  #ifdef EIGEN_MAX_STATIC_ALIGN_BYTES
+    #undef EIGEN_MAX_STATIC_ALIGN_BYTES
+  #endif
+  #define EIGEN_MAX_STATIC_ALIGN_BYTES 0
+#endif
+
+#ifndef EIGEN_MAX_STATIC_ALIGN_BYTES
+
+  // Try to automatically guess what is the best default value for EIGEN_MAX_STATIC_ALIGN_BYTES
+
+  // 16 byte alignment is only useful for vectorization. Since it affects the ABI, we need to enable
+  // 16 byte alignment on all platforms where vectorization might be enabled. In theory we could always
+  // enable alignment, but it can be a cause of problems on some platforms, so we just disable it in
+  // certain common platform (compiler+architecture combinations) to avoid these problems.
+  // Only static alignment is really problematic (relies on nonstandard compiler extensions),
+  // try to keep heap alignment even when we have to disable static alignment.
+  #if EIGEN_COMP_GNUC && !(EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM_OR_ARM64 || EIGEN_ARCH_PPC || EIGEN_ARCH_IA64 || EIGEN_ARCH_MIPS)
+  #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1
+  #elif EIGEN_ARCH_ARM_OR_ARM64 && EIGEN_COMP_GNUC_STRICT && EIGEN_GNUC_AT_MOST(4, 6)
+  // Old versions of GCC on ARM, at least 4.4, were once seen to have buggy static alignment support.
+  // Not sure which version fixed it, hopefully it doesn't affect 4.7, which is still somewhat in use.
+  // 4.8 and newer seem definitely unaffected.
+  #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1
+  #else
+  #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 0
+  #endif
+
+  // static alignment is completely disabled with GCC 3, Sun Studio, and QCC/QNX
+  #if !EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT \
+  && !EIGEN_GCC3_OR_OLDER \
+  && !EIGEN_COMP_SUNCC \
+  && !EIGEN_OS_QNX
+    #define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 1
+  #else
+    #define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 0
+  #endif
+
+  #if EIGEN_ARCH_WANTS_STACK_ALIGNMENT
+    #define EIGEN_MAX_STATIC_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES
+  #else
+    #define EIGEN_MAX_STATIC_ALIGN_BYTES 0
+  #endif
+
+#endif
+
+// If EIGEN_MAX_ALIGN_BYTES is defined, then it is considered as an upper bound for EIGEN_MAX_STATIC_ALIGN_BYTES
+#if defined(EIGEN_MAX_ALIGN_BYTES) && EIGEN_MAX_ALIGN_BYTES<EIGEN_MAX_STATIC_ALIGN_BYTES
+#undef EIGEN_MAX_STATIC_ALIGN_BYTES
+#define EIGEN_MAX_STATIC_ALIGN_BYTES EIGEN_MAX_ALIGN_BYTES
+#endif
+
+#if EIGEN_MAX_STATIC_ALIGN_BYTES==0 && !defined(EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT)
+  #define EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT
+#endif
+
+// At this stage, EIGEN_MAX_STATIC_ALIGN_BYTES>0 is the true test whether we want to align arrays on the stack or not.
+// It takes into account both the user choice to explicitly enable/disable alignment (by setting EIGEN_MAX_STATIC_ALIGN_BYTES)
+// and the architecture config (EIGEN_ARCH_WANTS_STACK_ALIGNMENT).
+// Henceforth, only EIGEN_MAX_STATIC_ALIGN_BYTES should be used.
+
+
+// Shortcuts to EIGEN_ALIGN_TO_BOUNDARY
+#define EIGEN_ALIGN8  EIGEN_ALIGN_TO_BOUNDARY(8)
+#define EIGEN_ALIGN16 EIGEN_ALIGN_TO_BOUNDARY(16)
+#define EIGEN_ALIGN32 EIGEN_ALIGN_TO_BOUNDARY(32)
+#define EIGEN_ALIGN64 EIGEN_ALIGN_TO_BOUNDARY(64)
+#if EIGEN_MAX_STATIC_ALIGN_BYTES>0
+#define EIGEN_ALIGN_MAX EIGEN_ALIGN_TO_BOUNDARY(EIGEN_MAX_STATIC_ALIGN_BYTES)
+#else
+#define EIGEN_ALIGN_MAX
+#endif
+
+
+// Dynamic alignment control
+
+#if defined(EIGEN_DONT_ALIGN) && defined(EIGEN_MAX_ALIGN_BYTES) && EIGEN_MAX_ALIGN_BYTES>0
+#error EIGEN_MAX_ALIGN_BYTES and EIGEN_DONT_ALIGN are both defined with EIGEN_MAX_ALIGN_BYTES!=0. Use EIGEN_MAX_ALIGN_BYTES=0 as a synonym of EIGEN_DONT_ALIGN.
+#endif
+
+#ifdef EIGEN_DONT_ALIGN
+  #ifdef EIGEN_MAX_ALIGN_BYTES
+    #undef EIGEN_MAX_ALIGN_BYTES
+  #endif
+  #define EIGEN_MAX_ALIGN_BYTES 0
+#elif !defined(EIGEN_MAX_ALIGN_BYTES)
+  #define EIGEN_MAX_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES
+#endif
+
+#if EIGEN_IDEAL_MAX_ALIGN_BYTES > EIGEN_MAX_ALIGN_BYTES
+#define EIGEN_DEFAULT_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES
+#else
+#define EIGEN_DEFAULT_ALIGN_BYTES EIGEN_MAX_ALIGN_BYTES
+#endif
+
+
+#ifndef EIGEN_UNALIGNED_VECTORIZE
+#define EIGEN_UNALIGNED_VECTORIZE 1
+#endif
+
+//----------------------------------------------------------------------
+
+// if alignment is disabled, then disable vectorization. Note: EIGEN_MAX_ALIGN_BYTES is the proper check, it takes into
+// account both the user's will (EIGEN_MAX_ALIGN_BYTES,EIGEN_DONT_ALIGN) and our own platform checks
+#if EIGEN_MAX_ALIGN_BYTES==0
+  #ifndef EIGEN_DONT_VECTORIZE
+    #define EIGEN_DONT_VECTORIZE
+  #endif
+#endif
+
+
+// The following (except #include <malloc.h> and _M_IX86_FP ??) can likely be
+// removed as gcc 4.1 and msvc 2008 are not supported anyways.
+#if EIGEN_COMP_MSVC
+  #include <malloc.h> // for _aligned_malloc -- need it regardless of whether vectorization is enabled
+  #if (EIGEN_COMP_MSVC >= 1500) // 2008 or later
+    // a user reported that in 64-bit mode, MSVC doesn't care to define _M_IX86_FP.
+    #if (defined(_M_IX86_FP) && (_M_IX86_FP >= 2)) || EIGEN_ARCH_x86_64
+      #define EIGEN_SSE2_ON_MSVC_2008_OR_LATER
+    #endif
+  #endif
+#else
+  #if (defined __SSE2__) && ( (!EIGEN_COMP_GNUC) || EIGEN_COMP_ICC || EIGEN_GNUC_AT_LEAST(4,2) )
+    #define EIGEN_SSE2_ON_NON_MSVC_BUT_NOT_OLD_GCC
+  #endif
+#endif
+
+#if !(defined(EIGEN_DONT_VECTORIZE) || defined(EIGEN_GPUCC))
+
+  #if defined (EIGEN_SSE2_ON_NON_MSVC_BUT_NOT_OLD_GCC) || defined(EIGEN_SSE2_ON_MSVC_2008_OR_LATER)
+
+    // Defines symbols for compile-time detection of which instructions are
+    // used.
+    // EIGEN_VECTORIZE_YY is defined if and only if the instruction set YY is used
+    #define EIGEN_VECTORIZE
+    #define EIGEN_VECTORIZE_SSE
+    #define EIGEN_VECTORIZE_SSE2
+
+    // Detect sse3/ssse3/sse4:
+    // gcc and icc defines __SSE3__, ...
+    // there is no way to know about this on msvc. You can define EIGEN_VECTORIZE_SSE* if you
+    // want to force the use of those instructions with msvc.
+    #ifdef __SSE3__
+      #define EIGEN_VECTORIZE_SSE3
+    #endif
+    #ifdef __SSSE3__
+      #define EIGEN_VECTORIZE_SSSE3
+    #endif
+    #ifdef __SSE4_1__
+      #define EIGEN_VECTORIZE_SSE4_1
+    #endif
+    #ifdef __SSE4_2__
+      #define EIGEN_VECTORIZE_SSE4_2
+    #endif
+    #ifdef __AVX__
+      #ifndef EIGEN_USE_SYCL 
+        #define EIGEN_VECTORIZE_AVX
+      #endif
+      #define EIGEN_VECTORIZE_SSE3
+      #define EIGEN_VECTORIZE_SSSE3
+      #define EIGEN_VECTORIZE_SSE4_1
+      #define EIGEN_VECTORIZE_SSE4_2
+    #endif
+    #ifdef __AVX2__
+      #ifndef EIGEN_USE_SYCL 
+        #define EIGEN_VECTORIZE_AVX2
+        #define EIGEN_VECTORIZE_AVX
+      #endif
+      #define EIGEN_VECTORIZE_SSE3
+      #define EIGEN_VECTORIZE_SSSE3
+      #define EIGEN_VECTORIZE_SSE4_1
+      #define EIGEN_VECTORIZE_SSE4_2
+    #endif
+    #if defined(__FMA__) || (EIGEN_COMP_MSVC && defined(__AVX2__))
+      // MSVC does not expose a switch dedicated for FMA
+      // For MSVC, AVX2 => FMA
+      #define EIGEN_VECTORIZE_FMA
+    #endif
+    #if defined(__AVX512F__)
+      #ifndef EIGEN_VECTORIZE_FMA
+      #if EIGEN_COMP_GNUC
+      #error Please add -mfma to your compiler flags: compiling with -mavx512f alone without SSE/AVX FMA is not supported (bug 1638).
+      #else
+      #error Please enable FMA in your compiler flags (e.g. -mfma): compiling with AVX512 alone without SSE/AVX FMA is not supported (bug 1638).
+      #endif
+      #endif
+      #ifndef EIGEN_USE_SYCL
+        #define EIGEN_VECTORIZE_AVX512
+        #define EIGEN_VECTORIZE_AVX2
+        #define EIGEN_VECTORIZE_AVX
+      #endif
+      #define EIGEN_VECTORIZE_FMA
+      #define EIGEN_VECTORIZE_SSE3
+      #define EIGEN_VECTORIZE_SSSE3
+      #define EIGEN_VECTORIZE_SSE4_1
+      #define EIGEN_VECTORIZE_SSE4_2
+      #ifndef EIGEN_USE_SYCL
+        #ifdef __AVX512DQ__
+          #define EIGEN_VECTORIZE_AVX512DQ
+        #endif
+        #ifdef __AVX512ER__
+          #define EIGEN_VECTORIZE_AVX512ER
+        #endif
+        #ifdef __AVX512BF16__
+          #define EIGEN_VECTORIZE_AVX512BF16
+        #endif
+      #endif
+    #endif
+
+    // Disable AVX support on broken xcode versions
+    #if defined(__apple_build_version__) && (__apple_build_version__ == 11000033 ) && ( __MAC_OS_X_VERSION_MIN_REQUIRED == 101500 )
+      // A nasty bug in the clang compiler shipped with xcode in a common compilation situation
+      // when XCode 11.0 and Mac deployment target macOS 10.15 is https://trac.macports.org/ticket/58776#no1
+      #ifdef EIGEN_VECTORIZE_AVX
+        #undef EIGEN_VECTORIZE_AVX
+        #warning "Disabling AVX support: clang compiler shipped with XCode 11.[012] generates broken assembly with -macosx-version-min=10.15 and AVX enabled. "
+        #ifdef EIGEN_VECTORIZE_AVX2
+          #undef EIGEN_VECTORIZE_AVX2
+        #endif
+        #ifdef EIGEN_VECTORIZE_FMA
+          #undef EIGEN_VECTORIZE_FMA
+        #endif
+        #ifdef EIGEN_VECTORIZE_AVX512
+          #undef EIGEN_VECTORIZE_AVX512
+        #endif
+        #ifdef EIGEN_VECTORIZE_AVX512DQ
+          #undef EIGEN_VECTORIZE_AVX512DQ
+        #endif
+        #ifdef EIGEN_VECTORIZE_AVX512ER
+          #undef EIGEN_VECTORIZE_AVX512ER
+        #endif
+      #endif
+      // NOTE: Confirmed test failures in XCode 11.0, and XCode 11.2 with  -macosx-version-min=10.15 and AVX
+      // NOTE using -macosx-version-min=10.15 with Xcode 11.0 results in runtime segmentation faults in many tests, 11.2 produce core dumps in 3 tests
+      // NOTE using -macosx-version-min=10.14 produces functioning and passing tests in all cases
+      // NOTE __clang_version__ "11.0.0 (clang-1100.0.33.8)"  XCode 11.0 <- Produces many segfault and core dumping tests
+      //                                                                    with  -macosx-version-min=10.15 and AVX
+      // NOTE __clang_version__ "11.0.0 (clang-1100.0.33.12)" XCode 11.2 <- Produces 3 core dumping tests with  
+      //                                                                    -macosx-version-min=10.15 and AVX
+    #endif
+
+    // include files
+
+    // This extern "C" works around a MINGW-w64 compilation issue
+    // https://sourceforge.net/tracker/index.php?func=detail&aid=3018394&group_id=202880&atid=983354
+    // In essence, intrin.h is included by windows.h and also declares intrinsics (just as emmintrin.h etc. below do).
+    // However, intrin.h uses an extern "C" declaration, and g++ thus complains of duplicate declarations
+    // with conflicting linkage.  The linkage for intrinsics doesn't matter, but at that stage the compiler doesn't know;
+    // so, to avoid compile errors when windows.h is included after Eigen/Core, ensure intrinsics are extern "C" here too.
+    // notice that since these are C headers, the extern "C" is theoretically needed anyways.
+    extern "C" {
+      // In theory we should only include immintrin.h and not the other *mmintrin.h header files directly.
+      // Doing so triggers some issues with ICC. However old gcc versions seems to not have this file, thus:
+      #if EIGEN_COMP_ICC >= 1110 || EIGEN_COMP_EMSCRIPTEN
+        #include <immintrin.h>
+      #else
+        #include <mmintrin.h>
+        #include <emmintrin.h>
+        #include <xmmintrin.h>
+        #ifdef  EIGEN_VECTORIZE_SSE3
+        #include <pmmintrin.h>
+        #endif
+        #ifdef EIGEN_VECTORIZE_SSSE3
+        #include <tmmintrin.h>
+        #endif
+        #ifdef EIGEN_VECTORIZE_SSE4_1
+        #include <smmintrin.h>
+        #endif
+        #ifdef EIGEN_VECTORIZE_SSE4_2
+        #include <nmmintrin.h>
+        #endif
+        #if defined(EIGEN_VECTORIZE_AVX) || defined(EIGEN_VECTORIZE_AVX512)
+        #include <immintrin.h>
+        #endif
+      #endif
+    } // end extern "C"
+
+  #elif defined(__VSX__) && !defined(__APPLE__)
+
+    #define EIGEN_VECTORIZE
+    #define EIGEN_VECTORIZE_VSX 1
+    #define EIGEN_VECTORIZE_FMA
+    #include <altivec.h>
+    // We need to #undef all these ugly tokens defined in <altivec.h>
+    // => use __vector instead of vector
+    #undef bool
+    #undef vector
+    #undef pixel
+
+  #elif defined __ALTIVEC__
+
+    #define EIGEN_VECTORIZE
+    #define EIGEN_VECTORIZE_ALTIVEC
+    #define EIGEN_VECTORIZE_FMA
+    #include <altivec.h>
+    // We need to #undef all these ugly tokens defined in <altivec.h>
+    // => use __vector instead of vector
+    #undef bool
+    #undef vector
+    #undef pixel
+
+  #elif ((defined  __ARM_NEON) || (defined __ARM_NEON__)) && !(defined EIGEN_ARM64_USE_SVE)
+
+    #define EIGEN_VECTORIZE
+    #define EIGEN_VECTORIZE_NEON
+    #include <arm_neon.h>
+
+  // We currently require SVE to be enabled explicitly via EIGEN_ARM64_USE_SVE and
+  // will not select the backend automatically
+  #elif (defined __ARM_FEATURE_SVE) && (defined EIGEN_ARM64_USE_SVE)
+
+    #define EIGEN_VECTORIZE
+    #define EIGEN_VECTORIZE_SVE
+    #include <arm_sve.h>
+
+    // Since we depend on knowing SVE vector lengths at compile-time, we need
+    // to ensure a fixed lengths is set
+    #if defined __ARM_FEATURE_SVE_BITS
+      #define EIGEN_ARM64_SVE_VL __ARM_FEATURE_SVE_BITS
+    #else
+#error "Eigen requires a fixed SVE lector length but EIGEN_ARM64_SVE_VL is not set."
+#endif
+
+#elif (defined __s390x__ && defined __VEC__)
+
+#define EIGEN_VECTORIZE
+#define EIGEN_VECTORIZE_ZVECTOR
+#include <vecintrin.h>
+
+#elif defined __mips_msa
+
+// Limit MSA optimizations to little-endian CPUs for now.
+// TODO: Perhaps, eventually support MSA optimizations on big-endian CPUs?
+#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+#if defined(__LP64__)
+#define EIGEN_MIPS_64
+#else
+#define EIGEN_MIPS_32
+#endif
+#define EIGEN_VECTORIZE
+#define EIGEN_VECTORIZE_MSA
+#include <msa.h>
+#endif
+
+#endif
+#endif
+
+// Following the Arm ACLE arm_neon.h should also include arm_fp16.h but not all
+// compilers seem to follow this. We therefore include it explicitly.
+// See also: https://bugs.llvm.org/show_bug.cgi?id=47955
+#if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
+  #include <arm_fp16.h>
+#endif
+
+// Enable FMA for ARM.
+#if defined(__ARM_FEATURE_FMA)
+#define EIGEN_VECTORIZE_FMA
+#endif
+
+#if defined(__F16C__) && !defined(EIGEN_GPUCC) && (!EIGEN_COMP_CLANG_STRICT || EIGEN_COMP_CLANG>=380)
+  // We can use the optimized fp16 to float and float to fp16 conversion routines
+  #define EIGEN_HAS_FP16_C
+
+  #if EIGEN_COMP_GNUC
+    // Make sure immintrin.h is included, even if e.g. vectorization is
+    // explicitly disabled (see also issue #2395).
+    // Note that FP16C intrinsics for gcc and clang are included by immintrin.h,
+    // as opposed to emmintrin.h as suggested by Intel:
+    // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#othertechs=FP16C&expand=1711
+    #include <immintrin.h>
+  #endif
+#endif
+
+#if defined EIGEN_CUDACC
+  #define EIGEN_VECTORIZE_GPU
+  #include <vector_types.h>
+  #if EIGEN_CUDA_SDK_VER >= 70500
+    #define EIGEN_HAS_CUDA_FP16
+  #endif
+#endif
+
+#if defined(EIGEN_HAS_CUDA_FP16)
+  #include <cuda_runtime_api.h>
+  #include <cuda_fp16.h>
+#endif
+
+#if defined(EIGEN_HIPCC)
+  #define EIGEN_VECTORIZE_GPU
+  #include <hip/hip_vector_types.h>
+  #define EIGEN_HAS_HIP_FP16
+  #include <hip/hip_fp16.h>
+#endif
+
+
+/** \brief Namespace containing all symbols from the %Eigen library. */
+namespace Eigen {
+
+inline static const char *SimdInstructionSetsInUse(void) {
+#if defined(EIGEN_VECTORIZE_AVX512)
+  return "AVX512, FMA, AVX2, AVX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
+#elif defined(EIGEN_VECTORIZE_AVX)
+  return "AVX SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
+#elif defined(EIGEN_VECTORIZE_SSE4_2)
+  return "SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
+#elif defined(EIGEN_VECTORIZE_SSE4_1)
+  return "SSE, SSE2, SSE3, SSSE3, SSE4.1";
+#elif defined(EIGEN_VECTORIZE_SSSE3)
+  return "SSE, SSE2, SSE3, SSSE3";
+#elif defined(EIGEN_VECTORIZE_SSE3)
+  return "SSE, SSE2, SSE3";
+#elif defined(EIGEN_VECTORIZE_SSE2)
+  return "SSE, SSE2";
+#elif defined(EIGEN_VECTORIZE_ALTIVEC)
+  return "AltiVec";
+#elif defined(EIGEN_VECTORIZE_VSX)
+  return "VSX";
+#elif defined(EIGEN_VECTORIZE_NEON)
+  return "ARM NEON";
+#elif defined(EIGEN_VECTORIZE_SVE)
+  return "ARM SVE";
+#elif defined(EIGEN_VECTORIZE_ZVECTOR)
+  return "S390X ZVECTOR";
+#elif defined(EIGEN_VECTORIZE_MSA)
+  return "MIPS MSA";
+#else
+  return "None";
+#endif
+}
+
+} // end namespace Eigen
+
+
+#endif // EIGEN_CONFIGURE_VECTORIZATION_H
--- a/Eigen/src/Core/util/DisableStupidWarnings.h
+++ b/Eigen/src/Core/util/DisableStupidWarnings.h
@@ -1,94 +1,146 @@
 #ifndef EIGEN_WARNINGS_DISABLED
 #define EIGEN_WARNINGS_DISABLED

-#ifdef _MSC_VER
-  // 4100 - unreferenced formal parameter (occurred e.g. in aligned_allocator::destroy(pointer p))
-  // 4101 - unreferenced local variable
-  // 4127 - conditional expression is constant
-  // 4181 - qualifier applied to reference type ignored
-  // 4211 - nonstandard extension used : redefined extern to static
-  // 4244 - 'argument' : conversion from 'type1' to 'type2', possible loss of data
-  // 4273 - QtAlignedMalloc, inconsistent DLL linkage
-  // 4324 - structure was padded due to declspec(align())
-  // 4503 - decorated name length exceeded, name was truncated
-  // 4512 - assignment operator could not be generated
-  // 4522 - 'class' : multiple assignment operators specified
-  // 4700 - uninitialized local variable 'xyz' used
-  // 4714 - function marked as __forceinline not inlined
-  // 4717 - 'function' : recursive on all control paths, function will cause runtime stack overflow
-  // 4800 - 'type' : forcing value to bool 'true' or 'false' (performance warning)
-  #ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
-    #pragma warning( push )
-  #endif
-  #pragma warning( disable : 4100 4101 4127 4181 4211 4244 4273 4324 4503 4512 4522 4700 4714 4717 4800)
-
-#elif defined __INTEL_COMPILER
-  // 2196 - routine is both "inline" and "noinline" ("noinline" assumed)
-  //        ICC 12 generates this warning even without any inline keyword, when defining class methods 'inline' i.e. inside of class body
-  //        typedef that may be a reference type.
-  // 279  - controlling expression is constant
-  //        ICC 12 generates this warning on assert(constant_expression_depending_on_template_params) and frankly this is a legitimate use case.
-  // 1684 - conversion from pointer to same-sized integral type (potential portability problem)
-  // 2259 - non-pointer conversion from "Eigen::Index={ptrdiff_t={long}}" to "int" may lose significant bits
-  #ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
-    #pragma warning push
-  #endif
-  #pragma warning disable 2196 279 1684 2259
-
-#elif defined __clang__
-  // -Wconstant-logical-operand - warning: use of logical && with constant operand; switch to bitwise & or remove constant
-  //     this is really a stupid warning as it warns on compile-time expressions involving enums
-  #ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
-    #pragma clang diagnostic push
-  #endif
-  #pragma clang diagnostic ignored "-Wconstant-logical-operand"
-
-#elif defined __GNUC__
-
-  #if (!defined(EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS)) &&  (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6))
-    #pragma GCC diagnostic push
-  #endif
-  // g++ warns about local variables shadowing member functions, which is too strict
-  #pragma GCC diagnostic ignored "-Wshadow"
-  #if __GNUC__ == 4 && __GNUC_MINOR__ < 8
-    // Until g++-4.7 there are warnings when comparing unsigned int vs 0, even in templated functions:
-    #pragma GCC diagnostic ignored "-Wtype-limits"
-  #endif
-  #if __GNUC__>=6
-    #pragma GCC diagnostic ignored "-Wignored-attributes"
-  #endif
-  #if __GNUC__==7
-    // See: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89325
-    #pragma GCC diagnostic ignored "-Wattributes"
-  #endif
+#if defined(_MSC_VER)
+// 4100 - unreferenced formal parameter (occurred e.g. in aligned_allocator::destroy(pointer p))
+// 4101 - unreferenced local variable
+// 4127 - conditional expression is constant
+// 4181 - qualifier applied to reference type ignored
+// 4211 - nonstandard extension used : redefined extern to static
+// 4244 - 'argument' : conversion from 'type1' to 'type2', possible loss of data
+// 4273 - QtAlignedMalloc, inconsistent DLL linkage
+// 4324 - structure was padded due to declspec(align())
+// 4503 - decorated name length exceeded, name was truncated
+// 4512 - assignment operator could not be generated
+// 4522 - 'class' : multiple assignment operators specified
+// 4700 - uninitialized local variable 'xyz' used
+// 4714 - function marked as __forceinline not inlined
+// 4717 - 'function' : recursive on all control paths, function will cause runtime stack overflow
+// 4800 - 'type' : forcing value to bool 'true' or 'false' (performance warning)
+#ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
+#pragma warning(push)
+#endif
+#pragma warning(disable : 4100 4101 4127 4181 4211 4244 4273 4324 4503 4512 4522 4700 4714 4717 4800)
+// We currently rely on has_denorm in tests, and need it defined correctly for half/bfloat16.
+#ifndef _SILENCE_CXX23_DENORM_DEPRECATION_WARNING
+#define EIGEN_REENABLE_CXX23_DENORM_DEPRECATION_WARNING 1
+#define _SILENCE_CXX23_DENORM_DEPRECATION_WARNING
 #endif

-#if defined __NVCC__
-  // Disable the "statement is unreachable" message
-  #pragma diag_suppress code_is_unreachable
-  // Disable the "dynamic initialization in unreachable code" message
-  #pragma diag_suppress initialization_not_reachable
-  // Disable the "invalid error number" message that we get with older versions of nvcc
-  #pragma diag_suppress 1222
-  // Disable the "calling a __host__ function from a __host__ __device__ function is not allowed" messages (yes, there are many of them and they seem to change with every version of the compiler)
-  #pragma diag_suppress 2527
-  #pragma diag_suppress 2529
-  #pragma diag_suppress 2651
-  #pragma diag_suppress 2653
-  #pragma diag_suppress 2668
-  #pragma diag_suppress 2669
-  #pragma diag_suppress 2670
-  #pragma diag_suppress 2671
-  #pragma diag_suppress 2735
-  #pragma diag_suppress 2737
+#elif defined __INTEL_COMPILER
+// 2196 - routine is both "inline" and "noinline" ("noinline" assumed)
+//        ICC 12 generates this warning even without any inline keyword, when defining class methods 'inline' i.e.
+//        inside of class body typedef that may be a reference type.
+// 279  - controlling expression is constant
+//        ICC 12 generates this warning on assert(constant_expression_depending_on_template_params) and frankly this is
+//        a legitimate use case.
+// 1684 - conversion from pointer to same-sized integral type (potential portability problem)
+// 2259 - non-pointer conversion from "Eigen::Index={ptrdiff_t={long}}" to "int" may lose significant bits
+#ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
+#pragma warning push
+#endif
+#pragma warning disable 2196 279 1684 2259
+
+#elif defined __clang__
+#ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
+#pragma clang diagnostic push
+#endif
+#if defined(__has_warning)
+// -Wconstant-logical-operand - warning: use of logical && with constant operand; switch to bitwise & or remove constant
+//     this is really a stupid warning as it warns on compile-time expressions involving enums
+#if __has_warning("-Wconstant-logical-operand")
+#pragma clang diagnostic ignored "-Wconstant-logical-operand"
+#endif
+#if __has_warning("-Wimplicit-int-float-conversion")
+#pragma clang diagnostic ignored "-Wimplicit-int-float-conversion"
+#endif
+#if (defined(__ALTIVEC__) || defined(__VSX__)) && (!defined(__STDC_VERSION__) || (__STDC_VERSION__ < 201112L))
+// warning: generic selections are a C11-specific feature
+// ignoring warnings thrown at vec_ctf in Altivec/PacketMath.h
+#if __has_warning("-Wc11-extensions")
+#pragma clang diagnostic ignored "-Wc11-extensions"
+#endif
+#endif
+#endif
+
+#elif defined __GNUC__ && !defined(__FUJITSU)
+
+#if (!defined(EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS)) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6))
+#pragma GCC diagnostic push
+#endif
+// g++ warns about local variables shadowing member functions, which is too strict
+#pragma GCC diagnostic ignored "-Wshadow"
+#if __GNUC__ == 4 && __GNUC_MINOR__ < 8
+// Until g++-4.7 there are warnings when comparing unsigned int vs 0, even in templated functions:
+#pragma GCC diagnostic ignored "-Wtype-limits"
+#endif
+#if __GNUC__ >= 6
+#pragma GCC diagnostic ignored "-Wignored-attributes"
+#endif
+#if __GNUC__ == 7
+// See: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89325
+#pragma GCC diagnostic ignored "-Wattributes"
+#endif
+#endif
+
+#if defined __NVCC__ && defined __CUDACC__
+// MSVC 14.16 (required by CUDA 9.*) does not support the _Pragma keyword, so
+// we instead use Microsoft's __pragma extension.
+#if defined _MSC_VER
+#define EIGEN_MAKE_PRAGMA(X) __pragma(#X)
+#else
+#define EIGEN_MAKE_PRAGMA(X) _Pragma(#X)
+#endif
+#if defined __NVCC_DIAG_PRAGMA_SUPPORT__
+#define EIGEN_NV_DIAG_SUPPRESS(X) EIGEN_MAKE_PRAGMA(nv_diag_suppress X)
+#else
+#define EIGEN_NV_DIAG_SUPPRESS(X) EIGEN_MAKE_PRAGMA(diag_suppress X)
+#endif
+
+EIGEN_NV_DIAG_SUPPRESS(boolean_controlling_expr_is_constant)
+// Disable the "statement is unreachable" message
+EIGEN_NV_DIAG_SUPPRESS(code_is_unreachable)
+// Disable the "dynamic initialization in unreachable code" message
+EIGEN_NV_DIAG_SUPPRESS(initialization_not_reachable)
+// Disable the "invalid error number" message that we get with older versions of nvcc
+EIGEN_NV_DIAG_SUPPRESS(1222)
+// Disable the "calling a __host__ function from a __host__ __device__ function is not allowed" messages (yes, there are
+// many of them and they seem to change with every version of the compiler)
+EIGEN_NV_DIAG_SUPPRESS(2527)
+EIGEN_NV_DIAG_SUPPRESS(2529)
+EIGEN_NV_DIAG_SUPPRESS(2651)
+EIGEN_NV_DIAG_SUPPRESS(2653)
+EIGEN_NV_DIAG_SUPPRESS(2668)
+EIGEN_NV_DIAG_SUPPRESS(2669)
+EIGEN_NV_DIAG_SUPPRESS(2670)
+EIGEN_NV_DIAG_SUPPRESS(2671)
+EIGEN_NV_DIAG_SUPPRESS(2735)
+EIGEN_NV_DIAG_SUPPRESS(2737)
+EIGEN_NV_DIAG_SUPPRESS(2739)
+EIGEN_NV_DIAG_SUPPRESS(2885)
+EIGEN_NV_DIAG_SUPPRESS(2888)
+EIGEN_NV_DIAG_SUPPRESS(2976)
+EIGEN_NV_DIAG_SUPPRESS(2979)
+EIGEN_NV_DIAG_SUPPRESS(20011)
+EIGEN_NV_DIAG_SUPPRESS(20014)
+// Disable the "// __device__ annotation is ignored on a function(...) that is
+//              explicitly defaulted on its first declaration" message.
+// The __device__ annotation seems to actually be needed in some cases,
+// otherwise resulting in kernel runtime errors.
+EIGEN_NV_DIAG_SUPPRESS(2886)
+EIGEN_NV_DIAG_SUPPRESS(2929)
+EIGEN_NV_DIAG_SUPPRESS(2977)
+EIGEN_NV_DIAG_SUPPRESS(20012)
+#undef EIGEN_NV_DIAG_SUPPRESS
+#undef EIGEN_MAKE_PRAGMA
 #endif

 #else
 // warnings already disabled:
-# ifndef EIGEN_WARNINGS_DISABLED_2
-#  define EIGEN_WARNINGS_DISABLED_2
-# elif defined(EIGEN_INTERNAL_DEBUGGING)
-#  error "Do not include \"DisableStupidWarnings.h\" recursively more than twice!"
-# endif
+#ifndef EIGEN_WARNINGS_DISABLED_2
+#define EIGEN_WARNINGS_DISABLED_2
+#elif defined(EIGEN_INTERNAL_DEBUGGING)
+#error "Do not include \"DisableStupidWarnings.h\" recursively more than twice!"
+#endif

-#endif // not EIGEN_WARNINGS_DISABLED
+#endif  // not EIGEN_WARNINGS_DISABLED
--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h
@@ -13,7 +13,7 @@

 #define EIGEN_WORLD_VERSION 3
 #define EIGEN_MAJOR_VERSION 3
-#define EIGEN_MINOR_VERSION 8
+#define EIGEN_MINOR_VERSION 9

 #define EIGEN_VERSION_AT_LEAST(x,y,z) (EIGEN_WORLD_VERSION>x || (EIGEN_WORLD_VERSION>=x && \
                                      (EIGEN_MAJOR_VERSION>y || (EIGEN_MAJOR_VERSION>=y && \
--- a/Eigen/src/Core/util/Meta.h
+++ b/Eigen/src/Core/util/Meta.h
@@ -16,8 +16,40 @@
 #include <math_constants.h>
 #endif

-#if EIGEN_COMP_ICC>=1600 &&  __cplusplus >= 201103L
+// Recent versions of ICC require <cstdint> for pointer types below.
+#define EIGEN_ICC_NEEDS_CSTDINT (EIGEN_COMP_ICC>=1600 && EIGEN_COMP_CXXVER >= 11)
+
+// Define portable (u)int{32,64} types
+#if EIGEN_HAS_CXX11 || EIGEN_ICC_NEEDS_CSTDINT
 #include <cstdint>
+namespace Eigen {
+namespace numext {
+typedef std::uint8_t  uint8_t;
+typedef std::int8_t   int8_t;
+typedef std::uint16_t uint16_t;
+typedef std::int16_t  int16_t;
+typedef std::uint32_t uint32_t;
+typedef std::int32_t  int32_t;
+typedef std::uint64_t uint64_t;
+typedef std::int64_t  int64_t;
+}
+}
+#else
+// Without c++11, all compilers able to compile Eigen also
+// provide the C99 stdint.h header file.
+#include <stdint.h>
+namespace Eigen {
+namespace numext {
+typedef ::uint8_t  uint8_t;
+typedef ::int8_t   int8_t;
+typedef ::uint16_t uint16_t;
+typedef ::int16_t  int16_t;
+typedef ::uint32_t uint32_t;
+typedef ::int32_t  int32_t;
+typedef ::uint64_t uint64_t;
+typedef ::int64_t  int64_t;
+}
+}
 #endif

 namespace Eigen {
@@ -43,13 +75,14 @@ namespace internal {

 // Only recent versions of ICC complain about using ptrdiff_t to hold pointers,
 // and older versions do not provide *intptr_t types.
-#if EIGEN_COMP_ICC>=1600 &&  __cplusplus >= 201103L
+#if EIGEN_ICC_NEEDS_CSTDINT
 typedef std::intptr_t  IntPtr;
 typedef std::uintptr_t UIntPtr;
 #else
 typedef std::ptrdiff_t IntPtr;
 typedef std::size_t UIntPtr;
 #endif
+#undef EIGEN_ICC_NEEDS_CSTDINT

 struct true_type {  enum { value = 1 }; };
 struct false_type { enum { value = 0 }; };
@@ -133,10 +166,8 @@ template<> struct make_unsigned<signed int>       { typedef unsigned int type; }
 template<> struct make_unsigned<unsigned int>     { typedef unsigned int type; };
 template<> struct make_unsigned<signed long>      { typedef unsigned long type; };
 template<> struct make_unsigned<unsigned long>    { typedef unsigned long type; };
-#if EIGEN_COMP_MSVC
-template<> struct make_unsigned<signed __int64>   { typedef unsigned __int64 type; };
-template<> struct make_unsigned<unsigned __int64> { typedef unsigned __int64 type; };
-#endif
+template<> struct make_unsigned<signed long long> { typedef unsigned long type; };
+template<> struct make_unsigned<unsigned long long> { typedef unsigned long type; };
 #endif

 template <typename T> struct add_const { typedef const T type; };
@@ -494,7 +525,7 @@ template<typename T, typename U> struct scalar_product_traits
 } // end namespace internal

 namespace numext {
-  
+
 #if defined(__CUDA_ARCH__)
 template<typename T> EIGEN_DEVICE_FUNC   void swap(T &a, T &b) { T tmp = b; b = a; a = tmp; }
 #else
@@ -510,7 +541,7 @@ using std::numeric_limits;
 // Integer division with rounding up.
 // T is assumed to be an integer type with a>=0, and b>0
 template<typename T>
-T div_ceil(const T &a, const T &b)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T div_ceil(const T &a, const T &b)
 {
  return (a+b-1) / b;
 }
@@ -539,30 +570,4 @@ bool not_equal_strict(const double& x,const double& y) { return std::not_equal_t

 } // end namespace Eigen

-// Define portable (u)int{32,64} types
-#if EIGEN_HAS_CXX11
-#include <cstdint>
-namespace Eigen {
-namespace numext {
-typedef std::uint32_t uint32_t;
-typedef std::int32_t  int32_t;
-typedef std::uint64_t uint64_t;
-typedef std::int64_t  int64_t;
-}
-}
-#else
-// Without c++11, all compilers able to compile Eigen also
-// provides the C99 stdint.h header file.
-#include <stdint.h>
-namespace Eigen {
-namespace numext {
-typedef ::uint32_t uint32_t;
-typedef ::int32_t  int32_t;
-typedef ::uint64_t uint64_t;
-typedef ::int64_t  int64_t;
-}
-}
-#endif
-
-
 #endif // EIGEN_META_H
--- a/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h
+++ b/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h
@@ -11,7 +11,7 @@
 #ifndef EIGEN_MATRIXBASEEIGENVALUES_H
 #define EIGEN_MATRIXBASEEIGENVALUES_H

-namespace Eigen { 
+namespace Eigen {

 namespace internal {

@@ -42,13 +42,13 @@ struct eigenvalues_selector<Derived, false>

 } // end namespace internal

-/** \brief Computes the eigenvalues of a matrix 
+/** \brief Computes the eigenvalues of a matrix
  * \returns Column vector containing the eigenvalues.
  *
  * \eigenvalues_module
  * This function computes the eigenvalues with the help of the EigenSolver
  * class (for real matrices) or the ComplexEigenSolver class (for complex
-  * matrices). 
+  * matrices).
  *
  * The eigenvalues are repeated according to their algebraic multiplicity,
  * so there are as many eigenvalues as rows in the matrix.
@@ -83,8 +83,8 @@ MatrixBase<Derived>::eigenvalues() const
  *
  * \sa SelfAdjointEigenSolver::eigenvalues(), MatrixBase::eigenvalues()
  */
-template<typename MatrixType, unsigned int UpLo> 
-inline typename SelfAdjointView<MatrixType, UpLo>::EigenvaluesReturnType
+template<typename MatrixType, unsigned int UpLo>
+EIGEN_DEVICE_FUNC inline typename SelfAdjointView<MatrixType, UpLo>::EigenvaluesReturnType
 SelfAdjointView<MatrixType, UpLo>::eigenvalues() const
 {
  PlainObject thisAsMatrix(*this);
@@ -147,7 +147,7 @@ MatrixBase<Derived>::operatorNorm() const
  * \sa eigenvalues(), MatrixBase::operatorNorm()
  */
 template<typename MatrixType, unsigned int UpLo>
-inline typename SelfAdjointView<MatrixType, UpLo>::RealScalar
+EIGEN_DEVICE_FUNC inline typename SelfAdjointView<MatrixType, UpLo>::RealScalar
 SelfAdjointView<MatrixType, UpLo>::operatorNorm() const
 {
  return eigenvalues().cwiseAbs().maxCoeff();
--- a/Eigen/src/Geometry/Scaling.h
+++ b/Eigen/src/Geometry/Scaling.h
@@ -14,7 +14,7 @@ namespace Eigen {

 /** \geometry_module \ingroup Geometry_Module
  *
-  * \class Scaling
+  * \class UniformScaling
  *
  * \brief Represents a generic uniform scaling transformation
  *
--- a/Eigen/src/Householder/HouseholderSequence.h
+++ b/Eigen/src/Householder/HouseholderSequence.h
@@ -443,7 +443,8 @@ typename internal::matrix_type_times_scalar_type<typename VectorsType::Scalar,Ot
  return res;
 }

-/** \ingroup Householder_Module \householder_module
+/** \ingroup Householder_Module
+  * \householder_module
  * \brief Convenience function for constructing a Householder sequence. 
  * \returns A HouseholderSequence constructed from the specified arguments.
  */
@@ -453,7 +454,8 @@ HouseholderSequence<VectorsType,CoeffsType> householderSequence(const VectorsTyp
  return HouseholderSequence<VectorsType,CoeffsType,OnTheLeft>(v, h);
 }

-/** \ingroup Householder_Module \householder_module
+/** \ingroup Householder_Module
+  * \householder_module
  * \brief Convenience function for constructing a Householder sequence. 
  * \returns A HouseholderSequence constructed from the specified arguments.
  * \details This function differs from householderSequence() in that the template argument \p OnTheSide of
--- a/Eigen/src/LU/FullPivLU.h
+++ b/Eigen/src/LU/FullPivLU.h
@@ -53,7 +53,7 @@ template<typename _MatrixType> struct traits<FullPivLU<_MatrixType> >
  * Output: \verbinclude class_FullPivLU.out
  *
  * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism.
-  * 
+  *
  * \sa MatrixBase::fullPivLu(), MatrixBase::determinant(), MatrixBase::inverse()
  */
 template<typename _MatrixType> class FullPivLU
@@ -744,7 +744,7 @@ struct image_retval<FullPivLU<_MatrixType> >
 #ifndef EIGEN_PARSED_BY_DOXYGEN
 template<typename _MatrixType>
 template<typename RhsType, typename DstType>
-void FullPivLU<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) const
+EIGEN_DEVICE_FUNC void FullPivLU<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) const
 {
  /* The decomposition PAQ = LU can be rewritten as A = P^{-1} L U Q^{-1}.
  * So we proceed as follows:
@@ -792,7 +792,7 @@ void FullPivLU<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) const

 template<typename _MatrixType>
 template<bool Conjugate, typename RhsType, typename DstType>
-void FullPivLU<_MatrixType>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const
+EIGEN_DEVICE_FUNC void FullPivLU<_MatrixType>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const
 {
  /* The decomposition PAQ = LU can be rewritten as A = P^{-1} L U Q^{-1},
   * and since permutations are real and unitary, we can write this
@@ -864,7 +864,7 @@ struct Assignment<DstXprType, Inverse<FullPivLU<MatrixType> >, internal::assign_
 {
  typedef FullPivLU<MatrixType> LuType;
  typedef Inverse<LuType> SrcXprType;
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename MatrixType::Scalar> &)
+  static EIGEN_DEVICE_FUNC void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename MatrixType::Scalar> &)
  {
    dst = src.nestedExpression().solve(MatrixType::Identity(src.rows(), src.cols()));
  }
--- a/Eigen/src/LU/InverseImpl.h
+++ b/Eigen/src/LU/InverseImpl.h
@@ -11,7 +11,7 @@
 #ifndef EIGEN_INVERSE_IMPL_H
 #define EIGEN_INVERSE_IMPL_H

-namespace Eigen { 
+namespace Eigen {

 namespace internal {

@@ -72,7 +72,7 @@ struct compute_inverse_and_det_with_check<MatrixType, ResultType, 1>
 ****************************/

 template<typename MatrixType, typename ResultType>
-EIGEN_DEVICE_FUNC 
+EIGEN_DEVICE_FUNC
 inline void compute_inverse_size2_helper(
    const MatrixType& matrix, const typename ResultType::Scalar& invdet,
    ResultType& result)
@@ -122,7 +122,7 @@ struct compute_inverse_and_det_with_check<MatrixType, ResultType, 2>
 ****************************/

 template<typename MatrixType, int i, int j>
-EIGEN_DEVICE_FUNC 
+EIGEN_DEVICE_FUNC
 inline typename MatrixType::Scalar cofactor_3x3(const MatrixType& m)
 {
  enum {
@@ -200,7 +200,7 @@ struct compute_inverse_and_det_with_check<MatrixType, ResultType, 3>
 ****************************/

 template<typename Derived>
-EIGEN_DEVICE_FUNC 
+EIGEN_DEVICE_FUNC
 inline const typename Derived::Scalar general_det3_helper
 (const MatrixBase<Derived>& matrix, int i1, int i2, int i3, int j1, int j2, int j3)
 {
@@ -209,7 +209,7 @@ inline const typename Derived::Scalar general_det3_helper
 }

 template<typename MatrixType, int i, int j>
-EIGEN_DEVICE_FUNC 
+EIGEN_DEVICE_FUNC
 inline typename MatrixType::Scalar cofactor_4x4(const MatrixType& matrix)
 {
  enum {
@@ -290,13 +290,13 @@ template<typename DstXprType, typename XprType>
 struct Assignment<DstXprType, Inverse<XprType>, internal::assign_op<typename DstXprType::Scalar,typename XprType::Scalar>, Dense2Dense>
 {
  typedef Inverse<XprType> SrcXprType;
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename XprType::Scalar> &)
+  static EIGEN_DEVICE_FUNC void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename XprType::Scalar> &)
  {
    Index dstRows = src.rows();
    Index dstCols = src.cols();
    if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
      dst.resize(dstRows, dstCols);
-    
+
    const int Size = EIGEN_PLAIN_ENUM_MIN(XprType::ColsAtCompileTime,DstXprType::ColsAtCompileTime);
    EIGEN_ONLY_USED_FOR_DEBUG(Size);
    eigen_assert(( (Size<=1) || (Size>4) || (extract_data(src.nestedExpression())!=extract_data(dst)))
@@ -304,14 +304,14 @@ struct Assignment<DstXprType, Inverse<XprType>, internal::assign_op<typename Dst

    typedef typename internal::nested_eval<XprType,XprType::ColsAtCompileTime>::type  ActualXprType;
    typedef typename internal::remove_all<ActualXprType>::type                        ActualXprTypeCleanded;
-    
+
    ActualXprType actual_xpr(src.nestedExpression());
-    
+
    compute_inverse<ActualXprTypeCleanded, DstXprType>::run(actual_xpr, dst);
  }
 };

-  
+
 } // end namespace internal

 /** \lu_module
--- a/Eigen/src/LU/PartialPivLU.h
+++ b/Eigen/src/LU/PartialPivLU.h
@@ -69,7 +69,7 @@ struct enable_if_ref<Ref<T>,Derived> {
  * The data of the LU decomposition can be directly accessed through the methods matrixLU(), permutationP().
  *
  * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism.
-  * 
+  *
  * \sa MatrixBase::partialPivLu(), MatrixBase::determinant(), MatrixBase::inverse(), MatrixBase::computeInverse(), class FullPivLU
  */
 template<typename _MatrixType> class PartialPivLU
@@ -572,7 +572,7 @@ struct Assignment<DstXprType, Inverse<PartialPivLU<MatrixType> >, internal::assi
 {
  typedef PartialPivLU<MatrixType> LuType;
  typedef Inverse<LuType> SrcXprType;
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename LuType::Scalar> &)
+  static EIGEN_DEVICE_FUNC void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename LuType::Scalar> &)
  {
    dst = src.nestedExpression().solve(MatrixType::Identity(src.rows(), src.cols()));
  }
--- a/Eigen/src/PardisoSupport/PardisoSupport.h
+++ b/Eigen/src/PardisoSupport/PardisoSupport.h
@@ -192,7 +192,7 @@ class PardisoImpl : public SparseSolverBase<Derived>
    void pardisoInit(int type)
    {
      m_type = type;
-      EIGEN_USING_STD(abs);
+      EIGEN_USING_STD_MATH(abs);
      bool symmetric = abs(m_type) < 10;
      m_iparm[0] = 1;   // No solver default
      m_iparm[1] = 2;   // use Metis for the ordering
--- a/Eigen/src/QR/ColPivHouseholderQR.h
+++ b/Eigen/src/QR/ColPivHouseholderQR.h
@@ -42,7 +42,7 @@ template<typename _MatrixType> struct traits<ColPivHouseholderQR<_MatrixType> >
  * numerical stability. It is slower than HouseholderQR, and faster than FullPivHouseholderQR.
  *
  * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism.
-  * 
+  *
  * \sa MatrixBase::colPivHouseholderQr()
  */
 template<typename _MatrixType> class ColPivHouseholderQR
@@ -582,7 +582,7 @@ void ColPivHouseholderQR<MatrixType>::computeInPlace()
 #ifndef EIGEN_PARSED_BY_DOXYGEN
 template<typename _MatrixType>
 template<typename RhsType, typename DstType>
-void ColPivHouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) const
+EIGEN_DEVICE_FUNC void ColPivHouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) const
 {
  eigen_assert(rhs.rows() == rows());

@@ -618,7 +618,7 @@ struct Assignment<DstXprType, Inverse<ColPivHouseholderQR<MatrixType> >, interna
 {
  typedef ColPivHouseholderQR<MatrixType> QrType;
  typedef Inverse<QrType> SrcXprType;
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename QrType::Scalar> &)
+  static EIGEN_DEVICE_FUNC void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename QrType::Scalar> &)
  {
    dst = src.nestedExpression().solve(MatrixType::Identity(src.rows(), src.cols()));
  }
--- a/Eigen/src/QR/CompleteOrthogonalDecomposition.h
+++ b/Eigen/src/QR/CompleteOrthogonalDecomposition.h
@@ -41,7 +41,7 @@ struct traits<CompleteOrthogonalDecomposition<_MatrixType> >
  * size rank-by-rank. \b A may be rank deficient.
  *
  * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism.
-  * 
+  *
  * \sa MatrixBase::completeOrthogonalDecomposition()
  */
 template <typename _MatrixType>
@@ -489,7 +489,7 @@ void CompleteOrthogonalDecomposition<MatrixType>::applyZAdjointOnTheLeftInPlace(
 #ifndef EIGEN_PARSED_BY_DOXYGEN
 template <typename _MatrixType>
 template <typename RhsType, typename DstType>
-void CompleteOrthogonalDecomposition<_MatrixType>::_solve_impl(
+EIGEN_DEVICE_FUNC void CompleteOrthogonalDecomposition<_MatrixType>::_solve_impl(
    const RhsType& rhs, DstType& dst) const {
  eigen_assert(rhs.rows() == this->rows());

@@ -532,7 +532,7 @@ struct Assignment<DstXprType, Inverse<CompleteOrthogonalDecomposition<MatrixType
 {
  typedef CompleteOrthogonalDecomposition<MatrixType> CodType;
  typedef Inverse<CodType> SrcXprType;
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename CodType::Scalar> &)
+  static EIGEN_DEVICE_FUNC void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename CodType::Scalar> &)
  {
    dst = src.nestedExpression().solve(MatrixType::Identity(src.rows(), src.rows()));
  }
--- a/Eigen/src/QR/FullPivHouseholderQR.h
+++ b/Eigen/src/QR/FullPivHouseholderQR.h
@@ -11,7 +11,7 @@
 #ifndef EIGEN_FULLPIVOTINGHOUSEHOLDERQR_H
 #define EIGEN_FULLPIVOTINGHOUSEHOLDERQR_H

-namespace Eigen { 
+namespace Eigen {

 namespace internal {

@@ -40,18 +40,18 @@ struct traits<FullPivHouseholderQRMatrixQReturnType<MatrixType> >
  * \tparam _MatrixType the type of the matrix of which we are computing the QR decomposition
  *
  * This class performs a rank-revealing QR decomposition of a matrix \b A into matrices \b P, \b P', \b Q and \b R
-  * such that 
+  * such that
  * \f[
  *  \mathbf{P} \, \mathbf{A} \, \mathbf{P}' = \mathbf{Q} \, \mathbf{R}
  * \f]
-  * by using Householder transformations. Here, \b P and \b P' are permutation matrices, \b Q a unitary matrix 
+  * by using Householder transformations. Here, \b P and \b P' are permutation matrices, \b Q a unitary matrix
  * and \b R an upper triangular matrix.
  *
  * This decomposition performs a very prudent full pivoting in order to be rank-revealing and achieve optimal
  * numerical stability. The trade-off is that it is slower than HouseholderQR and ColPivHouseholderQR.
  *
  * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism.
-  * 
+  *
  * \sa MatrixBase::fullPivHouseholderQr()
  */
 template<typename _MatrixType> class FullPivHouseholderQR
@@ -114,12 +114,12 @@ template<typename _MatrixType> class FullPivHouseholderQR
      *
      * This constructor computes the QR factorization of the matrix \a matrix by calling
      * the method compute(). It is a short cut for:
-      * 
+      *
      * \code
      * FullPivHouseholderQR<MatrixType> qr(matrix.rows(), matrix.cols());
      * qr.compute(matrix);
      * \endcode
-      * 
+      *
      * \sa compute()
      */
    template<typename InputType>
@@ -317,9 +317,9 @@ template<typename _MatrixType> class FullPivHouseholderQR

    inline Index rows() const { return m_qr.rows(); }
    inline Index cols() const { return m_qr.cols(); }
-    
+
    /** \returns a const reference to the vector of Householder coefficients used to represent the factor \c Q.
-      * 
+      *
      * For advanced uses only.
      */
    const HCoeffsType& hCoeffs() const { return m_hCoeffs; }
@@ -392,7 +392,7 @@ template<typename _MatrixType> class FullPivHouseholderQR
      *          diagonal coefficient of U.
      */
    RealScalar maxPivot() const { return m_maxpivot; }
-    
+
    #ifndef EIGEN_PARSED_BY_DOXYGEN
    template<typename RhsType, typename DstType>
    EIGEN_DEVICE_FUNC
@@ -400,14 +400,14 @@ template<typename _MatrixType> class FullPivHouseholderQR
    #endif

  protected:
-    
+
    static void check_template_parameters()
    {
      EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
    }
-    
+
    void computeInPlace();
-    
+
    MatrixType m_qr;
    HCoeffsType m_hCoeffs;
    IntDiagSizeVectorType m_rows_transpositions;
@@ -463,7 +463,7 @@ void FullPivHouseholderQR<MatrixType>::computeInPlace()
  Index cols = m_qr.cols();
  Index size = (std::min)(rows,cols);

-  
+
  m_hCoeffs.resize(size);

  m_temp.resize(cols);
@@ -539,7 +539,7 @@ void FullPivHouseholderQR<MatrixType>::computeInPlace()
 #ifndef EIGEN_PARSED_BY_DOXYGEN
 template<typename _MatrixType>
 template<typename RhsType, typename DstType>
-void FullPivHouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) const
+EIGEN_DEVICE_FUNC void FullPivHouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) const
 {
  eigen_assert(rhs.rows() == rows());
  const Index l_rank = rank();
@@ -574,14 +574,14 @@ void FullPivHouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType
 #endif

 namespace internal {
-  
+
 template<typename DstXprType, typename MatrixType>
 struct Assignment<DstXprType, Inverse<FullPivHouseholderQR<MatrixType> >, internal::assign_op<typename DstXprType::Scalar,typename FullPivHouseholderQR<MatrixType>::Scalar>, Dense2Dense>
 {
  typedef FullPivHouseholderQR<MatrixType> QrType;
  typedef Inverse<QrType> SrcXprType;
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename QrType::Scalar> &)
-  {    
+  static EIGEN_DEVICE_FUNC void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename QrType::Scalar> &)
+  {
    dst = src.nestedExpression().solve(MatrixType::Identity(src.rows(), src.cols()));
  }
 };
--- a/Eigen/src/QR/HouseholderQR.h
+++ b/Eigen/src/QR/HouseholderQR.h
@@ -12,7 +12,7 @@
 #ifndef EIGEN_QR_H
 #define EIGEN_QR_H

-namespace Eigen { 
+namespace Eigen {

 /** \ingroup QR_Module
  *
@@ -24,7 +24,7 @@ namespace Eigen {
  * \tparam _MatrixType the type of the matrix of which we are computing the QR decomposition
  *
  * This class performs a QR decomposition of a matrix \b A into matrices \b Q and \b R
-  * such that 
+  * such that
  * \f[
  *  \mathbf{A} = \mathbf{Q} \, \mathbf{R}
  * \f]
@@ -85,12 +85,12 @@ template<typename _MatrixType> class HouseholderQR
      *
      * This constructor computes the QR factorization of the matrix \a matrix by calling
      * the method compute(). It is a short cut for:
-      * 
+      *
      * \code
      * HouseholderQR<MatrixType> qr(matrix.rows(), matrix.cols());
      * qr.compute(matrix);
      * \endcode
-      * 
+      *
      * \sa compute()
      */
    template<typename InputType>
@@ -204,13 +204,13 @@ template<typename _MatrixType> class HouseholderQR

    inline Index rows() const { return m_qr.rows(); }
    inline Index cols() const { return m_qr.cols(); }
-    
+
    /** \returns a const reference to the vector of Householder coefficients used to represent the factor \c Q.
-      * 
+      *
      * For advanced uses only.
      */
    const HCoeffsType& hCoeffs() const { return m_hCoeffs; }
-    
+
    #ifndef EIGEN_PARSED_BY_DOXYGEN
    template<typename RhsType, typename DstType>
    EIGEN_DEVICE_FUNC
@@ -218,14 +218,14 @@ template<typename _MatrixType> class HouseholderQR
    #endif

  protected:
-    
+
    static void check_template_parameters()
    {
      EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
    }

    void computeInPlace();
-    
+
    MatrixType m_qr;
    HCoeffsType m_hCoeffs;
    RowVectorType m_temp;
@@ -347,7 +347,7 @@ struct householder_qr_inplace_blocked
 #ifndef EIGEN_PARSED_BY_DOXYGEN
 template<typename _MatrixType>
 template<typename RhsType, typename DstType>
-void HouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) const
+EIGEN_DEVICE_FUNC void HouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) const
 {
  const Index rank = (std::min)(rows(), cols());
  eigen_assert(rhs.rows() == rows());
@@ -379,7 +379,7 @@ template<typename MatrixType>
 void HouseholderQR<MatrixType>::computeInPlace()
 {
  check_template_parameters();
-  
+
  Index rows = m_qr.rows();
  Index cols = m_qr.cols();
  Index size = (std::min)(rows,cols);
--- a/Eigen/src/SVD/SVDBase.h
+++ b/Eigen/src/SVD/SVDBase.h
@@ -34,12 +34,12 @@ namespace Eigen {
 *
 * Singular values are always sorted in decreasing order.
 *
- * 
+ *
 * You can ask for only \em thin \a U or \a V to be computed, meaning the following. In case of a rectangular n-by-p matrix, letting \a m be the
 * smaller value among \a n and \a p, there are only \a m singular vectors; the remaining columns of \a U and \a V do not correspond to actual
 * singular vectors. Asking for \em thin \a U or \a V means asking for only their \a m first columns to be formed. So \a U is then a n-by-m matrix,
 * and \a V is then a p-by-m matrix. Notice that thin \a U and \a V are all you need for (least squares) solving.
- *  
+ *
 * If the input matrix has inf or nan coefficients, the result of the computation is undefined, but the computation is guaranteed to
 * terminate in finite (and reasonable) time.
 * \sa class BDCSVD, class JacobiSVD
@@ -67,7 +67,7 @@ public:
  typedef Matrix<Scalar, RowsAtCompileTime, RowsAtCompileTime, MatrixOptions, MaxRowsAtCompileTime, MaxRowsAtCompileTime> MatrixUType;
  typedef Matrix<Scalar, ColsAtCompileTime, ColsAtCompileTime, MatrixOptions, MaxColsAtCompileTime, MaxColsAtCompileTime> MatrixVType;
  typedef typename internal::plain_diag_type<MatrixType, RealScalar>::type SingularValuesType;
-  
+
  Derived& derived() { return *static_cast<Derived*>(this); }
  const Derived& derived() const { return *static_cast<const Derived*>(this); }

@@ -120,7 +120,7 @@ public:
    eigen_assert(m_isInitialized && "SVD is not initialized.");
    return m_nonzeroSingularValues;
  }
-  
+
  /** \returns the rank of the matrix of which \c *this is the SVD.
    *
    * \note This method has to determine which singular values should be considered nonzero.
@@ -137,7 +137,7 @@ public:
    while(i>=0 && m_singularValues.coeff(i) < premultiplied_threshold) --i;
    return i+1;
  }
-  
+
  /** Allows to prescribe a threshold to be used by certain methods, such as rank() and solve(),
    * which need to determine when singular values are to be considered nonzero.
    * This is not used for the SVD decomposition itself.
@@ -193,7 +193,7 @@ public:

  inline Index rows() const { return m_rows; }
  inline Index cols() const { return m_cols; }
-  
+
  /** \returns a (least squares) solution of \f$ A x = b \f$ using the current SVD decomposition of A.
    *
    * \param b the right-hand-side of the equation to solve.
@@ -211,7 +211,7 @@ public:
    eigen_assert(computeU() && computeV() && "SVD::solve() requires both unitaries U and V to be computed (thin unitaries suffice).");
    return Solve<Derived, Rhs>(derived(), b.derived());
  }
-  
+
  #ifndef EIGEN_PARSED_BY_DOXYGEN
  template<typename RhsType, typename DstType>
  EIGEN_DEVICE_FUNC
@@ -219,12 +219,12 @@ public:
  #endif

 protected:
-  
+
  static void check_template_parameters()
  {
    EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
  }
-  
+
  // return true if already allocated
  bool allocate(Index rows, Index cols, unsigned int computationOptions) ;

@@ -258,7 +258,7 @@ protected:
 #ifndef EIGEN_PARSED_BY_DOXYGEN
 template<typename Derived>
 template<typename RhsType, typename DstType>
-void SVDBase<Derived>::_solve_impl(const RhsType &rhs, DstType &dst) const
+EIGEN_DEVICE_FUNC void SVDBase<Derived>::_solve_impl(const RhsType &rhs, DstType &dst) const
 {
  eigen_assert(rhs.rows() == rows());

--- a/Eigen/src/SparseCore/SparseAssign.h
+++ b/Eigen/src/SparseCore/SparseAssign.h
@@ -10,9 +10,9 @@
 #ifndef EIGEN_SPARSEASSIGN_H
 #define EIGEN_SPARSEASSIGN_H

-namespace Eigen { 
+namespace Eigen {

-template<typename Derived>    
+template<typename Derived>
 template<typename OtherDerived>
 Derived& SparseMatrixBase<Derived>::operator=(const EigenBase<OtherDerived> &other)
 {
@@ -104,7 +104,7 @@ void assign_sparse_to_sparse(DstXprType &dst, const SrcXprType &src)

    enum { Flip = (DstEvaluatorType::Flags & RowMajorBit) != (SrcEvaluatorType::Flags & RowMajorBit) };

-    
+
    DstXprType temp(src.rows(), src.cols());

    temp.reserve((std::max)(src.rows(),src.cols())*2);
@@ -127,7 +127,7 @@ void assign_sparse_to_sparse(DstXprType &dst, const SrcXprType &src)
 template< typename DstXprType, typename SrcXprType, typename Functor>
 struct Assignment<DstXprType, SrcXprType, Functor, Sparse2Sparse>
 {
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
+  static EIGEN_DEVICE_FUNC void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
  {
    assign_sparse_to_sparse(dst.derived(), src.derived());
  }
@@ -137,15 +137,15 @@ struct Assignment<DstXprType, SrcXprType, Functor, Sparse2Sparse>
 template< typename DstXprType, typename SrcXprType, typename Functor>
 struct Assignment<DstXprType, SrcXprType, Functor, Sparse2Dense>
 {
-  static void run(DstXprType &dst, const SrcXprType &src, const Functor &func)
+  static EIGEN_DEVICE_FUNC void run(DstXprType &dst, const SrcXprType &src, const Functor &func)
  {
    if(internal::is_same<Functor,internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> >::value)
      dst.setZero();
-    
+
    internal::evaluator<SrcXprType> srcEval(src);
    resize_if_allowed(dst, src, func);
    internal::evaluator<DstXprType> dstEval(dst);
-    
+
    const Index outerEvaluationSize = (internal::evaluator<SrcXprType>::Flags&RowMajorBit) ? src.rows() : src.cols();
    for (Index j=0; j<outerEvaluationSize; ++j)
      for (typename internal::evaluator<SrcXprType>::InnerIterator i(srcEval,j); i; ++i)
@@ -159,7 +159,7 @@ template<typename DstXprType, typename DecType, typename RhsType, typename Scala
 struct Assignment<DstXprType, Solve<DecType,RhsType>, internal::assign_op<Scalar,Scalar>, Sparse2Sparse>
 {
  typedef Solve<DecType,RhsType> SrcXprType;
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &)
+  static EIGEN_DEVICE_FUNC void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &)
  {
    Index dstRows = src.rows();
    Index dstCols = src.cols();
@@ -182,7 +182,7 @@ struct Assignment<DstXprType, SrcXprType, Functor, Diagonal2Sparse>
  typedef Array<StorageIndex,Dynamic,1> ArrayXI;
  typedef Array<Scalar,Dynamic,1> ArrayXS;
  template<int Options>
-  static void run(SparseMatrix<Scalar,Options,StorageIndex> &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
+  static EIGEN_DEVICE_FUNC void run(SparseMatrix<Scalar,Options,StorageIndex> &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
  {
    Index dstRows = src.rows();
    Index dstCols = src.cols();
@@ -196,16 +196,16 @@ struct Assignment<DstXprType, SrcXprType, Functor, Diagonal2Sparse>
    Map<ArrayXI>(dst.outerIndexPtr(), size+1).setLinSpaced(0,StorageIndex(size));
    Map<ArrayXS>(dst.valuePtr(), size) = src.diagonal();
  }
-  
+
  template<typename DstDerived>
  static void run(SparseMatrixBase<DstDerived> &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
  {
    dst.diagonal() = src.diagonal();
  }
-  
+
  static void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
  { dst.diagonal() += src.diagonal(); }
-  
+
  static void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
  { dst.diagonal() -= src.diagonal(); }
 };
--- a/Eigen/src/SparseCore/SparseCwiseBinaryOp.h
+++ b/Eigen/src/SparseCore/SparseCwiseBinaryOp.h
@@ -101,7 +101,7 @@ public:
      }
      else
      {
-        m_value = 0; // this is to avoid a compilation warning
+        m_value = Scalar(0); // this is to avoid a compilation warning
        m_id = -1;
      }
      return *this;
--- a/Eigen/src/SparseCore/SparseMatrix.h
+++ b/Eigen/src/SparseCore/SparseMatrix.h
@@ -577,10 +577,12 @@ class SparseMatrix
      else if (innerChange < 0) 
      {
        // Inner size decreased: allocate a new m_innerNonZeros
-        m_innerNonZeros = static_cast<StorageIndex*>(std::malloc((m_outerSize+outerChange+1) * sizeof(StorageIndex)));
+        m_innerNonZeros = static_cast<StorageIndex*>(std::malloc((m_outerSize + outerChange) * sizeof(StorageIndex)));
        if (!m_innerNonZeros) internal::throw_std_bad_alloc();
-        for(Index i = 0; i < m_outerSize; i++)
+        for(Index i = 0; i < m_outerSize + (std::min)(outerChange, Index(0)); i++)
          m_innerNonZeros[i] = m_outerIndex[i+1] - m_outerIndex[i];
+        for(Index i = m_outerSize; i < m_outerSize + outerChange; i++)
+          m_innerNonZeros[i] = 0;
      }
      
      // Change the m_innerNonZeros in case of a decrease of inner size
@@ -974,7 +976,7 @@ void set_from_triplets(const InputIterator& begin, const InputIterator& end, Spa
  * \code
    typedef Triplet<double> T;
    std::vector<T> tripletList;
-    triplets.reserve(estimation_of_entries);
+    tripletList.reserve(estimation_of_entries);
    for(...)
    {
      // ...
@@ -1233,7 +1235,7 @@ typename SparseMatrix<_Scalar,_Options,_StorageIndex>::Scalar& SparseMatrix<_Sca
    }
    
    m_data.index(p) = convert_index(inner);
-    return (m_data.value(p) = 0);
+    return (m_data.value(p) = Scalar(0));
  }
  
  if(m_data.size() != m_data.allocatedSize())
--- a/Eigen/src/SparseCore/SparseProduct.h
+++ b/Eigen/src/SparseCore/SparseProduct.h
@@ -10,7 +10,7 @@
 #ifndef EIGEN_SPARSEPRODUCT_H
 #define EIGEN_SPARSEPRODUCT_H

-namespace Eigen { 
+namespace Eigen {

 /** \returns an expression of the product of two sparse matrices.
  * By default a conservative product preserving the symbolic non zeros is performed.
@@ -102,13 +102,13 @@ template< typename DstXprType, typename Lhs, typename Rhs>
 struct Assignment<DstXprType, Product<Lhs,Rhs,AliasFreeProduct>, internal::assign_op<typename DstXprType::Scalar,typename Product<Lhs,Rhs,AliasFreeProduct>::Scalar>, Sparse2Dense>
 {
  typedef Product<Lhs,Rhs,AliasFreeProduct> SrcXprType;
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &)
+  static EIGEN_DEVICE_FUNC void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &)
  {
    Index dstRows = src.rows();
    Index dstCols = src.cols();
    if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
      dst.resize(dstRows, dstCols);
-    
+
    generic_product_impl<Lhs, Rhs>::evalTo(dst,src.lhs(),src.rhs());
  }
 };
@@ -118,7 +118,7 @@ template< typename DstXprType, typename Lhs, typename Rhs>
 struct Assignment<DstXprType, Product<Lhs,Rhs,AliasFreeProduct>, internal::add_assign_op<typename DstXprType::Scalar,typename Product<Lhs,Rhs,AliasFreeProduct>::Scalar>, Sparse2Dense>
 {
  typedef Product<Lhs,Rhs,AliasFreeProduct> SrcXprType;
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &)
+  static EIGEN_DEVICE_FUNC void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &)
  {
    generic_product_impl<Lhs, Rhs>::addTo(dst,src.lhs(),src.rhs());
  }
@@ -129,7 +129,7 @@ template< typename DstXprType, typename Lhs, typename Rhs>
 struct Assignment<DstXprType, Product<Lhs,Rhs,AliasFreeProduct>, internal::sub_assign_op<typename DstXprType::Scalar,typename Product<Lhs,Rhs,AliasFreeProduct>::Scalar>, Sparse2Dense>
 {
  typedef Product<Lhs,Rhs,AliasFreeProduct> SrcXprType;
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &)
+  static EIGEN_DEVICE_FUNC void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &)
  {
    generic_product_impl<Lhs, Rhs>::subTo(dst,src.lhs(),src.rhs());
  }
--- a/Eigen/src/SparseCore/SparseSelfAdjointView.h
+++ b/Eigen/src/SparseCore/SparseSelfAdjointView.h
@@ -10,8 +10,8 @@
 #ifndef EIGEN_SPARSE_SELFADJOINTVIEW_H
 #define EIGEN_SPARSE_SELFADJOINTVIEW_H

-namespace Eigen { 
-  
+namespace Eigen {
+
 /** \ingroup SparseCore_Module
  * \class SparseSelfAdjointView
  *
@@ -27,7 +27,7 @@ namespace Eigen {
  * \sa SparseMatrixBase::selfadjointView()
  */
 namespace internal {
-  
+
 template<typename MatrixType, unsigned int Mode>
 struct traits<SparseSelfAdjointView<MatrixType,Mode> > : traits<MatrixType> {
 };
@@ -44,7 +44,7 @@ template<typename MatrixType, unsigned int _Mode> class SparseSelfAdjointView
  : public EigenBase<SparseSelfAdjointView<MatrixType,_Mode> >
 {
  public:
-    
+
    enum {
      Mode = _Mode,
      TransposeMode = ((Mode & Upper) ? Lower : 0) | ((Mode & Lower) ? Upper : 0),
@@ -58,7 +58,7 @@ template<typename MatrixType, unsigned int _Mode> class SparseSelfAdjointView
    typedef Matrix<StorageIndex,Dynamic,1> VectorI;
    typedef typename internal::ref_selector<MatrixType>::non_const_type MatrixTypeNested;
    typedef typename internal::remove_all<MatrixTypeNested>::type _MatrixTypeNested;
-    
+
    explicit inline SparseSelfAdjointView(MatrixType& matrix) : m_matrix(matrix)
    {
      eigen_assert(rows()==cols() && "SelfAdjointView is only for squared matrices");
@@ -94,7 +94,7 @@ template<typename MatrixType, unsigned int _Mode> class SparseSelfAdjointView
    {
      return Product<OtherDerived, SparseSelfAdjointView>(lhs.derived(), rhs);
    }
-    
+
    /** Efficient sparse self-adjoint matrix times dense vector/matrix product */
    template<typename OtherDerived>
    Product<SparseSelfAdjointView,OtherDerived>
@@ -121,7 +121,7 @@ template<typename MatrixType, unsigned int _Mode> class SparseSelfAdjointView
      */
    template<typename DerivedU>
    SparseSelfAdjointView& rankUpdate(const SparseMatrixBase<DerivedU>& u, const Scalar& alpha = Scalar(1));
-    
+
    /** \returns an expression of P H P^-1 */
    // TODO implement twists in a more evaluator friendly fashion
    SparseSymmetricPermutationProduct<_MatrixTypeNested,Mode> twistedBy(const PermutationMatrix<Dynamic,Dynamic,StorageIndex>& perm) const
@@ -148,7 +148,7 @@ template<typename MatrixType, unsigned int _Mode> class SparseSelfAdjointView
      PermutationMatrix<Dynamic,Dynamic,StorageIndex> pnull;
      return *this = src.twistedBy(pnull);
    }
-    
+
    void resize(Index rows, Index cols)
    {
      EIGEN_ONLY_USED_FOR_DEBUG(rows);
@@ -156,7 +156,7 @@ template<typename MatrixType, unsigned int _Mode> class SparseSelfAdjointView
      eigen_assert(rows == this->rows() && cols == this->cols()
                && "SparseSelfadjointView::resize() does not actually allow to resize.");
    }
-    
+
  protected:

    MatrixTypeNested m_matrix;
@@ -203,7 +203,7 @@ SparseSelfAdjointView<MatrixType,Mode>::rankUpdate(const SparseMatrixBase<Derive
 }

 namespace internal {
-  
+
 // TODO currently a selfadjoint expression has the form SelfAdjointView<.,.>
 //      in the future selfadjoint-ness should be defined by the expression traits
 //      such that Transpose<SelfAdjointView<.,.> > is valid. (currently TriangularBase::transpose() is overloaded to make it work)
@@ -226,7 +226,7 @@ struct Assignment<DstXprType, SrcXprType, Functor, SparseSelfAdjoint2Sparse>
  typedef internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> AssignOpType;

  template<typename DestScalar,int StorageOrder>
-  static void run(SparseMatrix<DestScalar,StorageOrder,StorageIndex> &dst, const SrcXprType &src, const AssignOpType&/*func*/)
+  static EIGEN_DEVICE_FUNC void run(SparseMatrix<DestScalar,StorageOrder,StorageIndex> &dst, const SrcXprType &src, const AssignOpType&/*func*/)
  {
    internal::permute_symm_to_fullsymm<SrcXprType::Mode>(src.matrix(), dst);
  }
@@ -257,7 +257,7 @@ struct Assignment<DstXprType, SrcXprType, Functor, SparseSelfAdjoint2Sparse>
    run(tmp, src, AssignOpType());
    dst -= tmp;
  }
-  
+
  template<typename DestScalar>
  static void run(DynamicSparseMatrix<DestScalar,ColMajor,StorageIndex>& dst, const SrcXprType &src, const AssignOpType&/*func*/)
  {
@@ -280,13 +280,13 @@ template<int Mode, typename SparseLhsType, typename DenseRhsType, typename Dense
 inline void sparse_selfadjoint_time_dense_product(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const AlphaType& alpha)
 {
  EIGEN_ONLY_USED_FOR_DEBUG(alpha);
-  
+
  typedef typename internal::nested_eval<SparseLhsType,DenseRhsType::MaxColsAtCompileTime>::type SparseLhsTypeNested;
  typedef typename internal::remove_all<SparseLhsTypeNested>::type SparseLhsTypeNestedCleaned;
  typedef evaluator<SparseLhsTypeNestedCleaned> LhsEval;
  typedef typename LhsEval::InnerIterator LhsIterator;
  typedef typename SparseLhsType::Scalar LhsScalar;
-  
+
  enum {
    LhsIsRowMajor = (LhsEval::Flags&RowMajorBit)==RowMajorBit,
    ProcessFirstHalf =
@@ -295,7 +295,7 @@ inline void sparse_selfadjoint_time_dense_product(const SparseLhsType& lhs, cons
          || ( (Mode&Lower) && LhsIsRowMajor),
    ProcessSecondHalf = !ProcessFirstHalf
  };
-  
+
  SparseLhsTypeNested lhs_nested(lhs);
  LhsEval lhsEval(lhs_nested);

@@ -349,7 +349,7 @@ struct generic_product_impl<LhsView, Rhs, SparseSelfAdjointShape, DenseShape, Pr
    typedef typename nested_eval<Rhs,Dynamic>::type RhsNested;
    LhsNested lhsNested(lhsView.matrix());
    RhsNested rhsNested(rhs);
-    
+
    internal::sparse_selfadjoint_time_dense_product<LhsView::Mode>(lhsNested, rhsNested, dst, alpha);
  }
 };
@@ -366,7 +366,7 @@ struct generic_product_impl<Lhs, RhsView, DenseShape, SparseSelfAdjointShape, Pr
    typedef typename nested_eval<Rhs,Dynamic>::type RhsNested;
    LhsNested lhsNested(lhs);
    RhsNested rhsNested(rhsView.matrix());
-    
+
    // transpose everything
    Transpose<Dest> dstT(dst);
    internal::sparse_selfadjoint_time_dense_product<RhsView::TransposeMode>(rhsNested.transpose(), lhsNested.transpose(), dstT, alpha);
@@ -390,7 +390,7 @@ struct product_evaluator<Product<LhsView, Rhs, DefaultProduct>, ProductTag, Spar
    ::new (static_cast<Base*>(this)) Base(m_result);
    generic_product_impl<typename Rhs::PlainObject, Rhs, SparseShape, SparseShape, ProductTag>::evalTo(m_result, m_lhs, xpr.rhs());
  }
-  
+
 protected:
  typename Rhs::PlainObject m_lhs;
  PlainObject m_result;
@@ -410,7 +410,7 @@ struct product_evaluator<Product<Lhs, RhsView, DefaultProduct>, ProductTag, Spar
    ::new (static_cast<Base*>(this)) Base(m_result);
    generic_product_impl<Lhs, typename Lhs::PlainObject, SparseShape, SparseShape, ProductTag>::evalTo(m_result, xpr.lhs(), m_rhs);
  }
-  
+
 protected:
  typename Lhs::PlainObject m_rhs;
  PlainObject m_result;
@@ -432,13 +432,13 @@ void permute_symm_to_fullsymm(const MatrixType& mat, SparseMatrix<typename Matri
  typedef Matrix<StorageIndex,Dynamic,1> VectorI;
  typedef evaluator<MatrixType> MatEval;
  typedef typename evaluator<MatrixType>::InnerIterator MatIterator;
-  
+
  MatEval matEval(mat);
  Dest& dest(_dest.derived());
  enum {
    StorageOrderMatch = int(Dest::IsRowMajor) == int(MatrixType::IsRowMajor)
  };
-  
+
  Index size = mat.rows();
  VectorI count;
  count.resize(size);
@@ -465,7 +465,7 @@ void permute_symm_to_fullsymm(const MatrixType& mat, SparseMatrix<typename Matri
    }
  }
  Index nnz = count.sum();
-  
+
  // reserve space
  dest.resizeNonZeros(nnz);
  dest.outerIndexPtr()[0] = 0;
@@ -473,7 +473,7 @@ void permute_symm_to_fullsymm(const MatrixType& mat, SparseMatrix<typename Matri
    dest.outerIndexPtr()[j+1] = dest.outerIndexPtr()[j] + count[j];
  for(Index j=0; j<size; ++j)
    count[j] = dest.outerIndexPtr()[j];
-  
+
  // copy data
  for(StorageIndex j = 0; j<size; ++j)
  {
@@ -482,10 +482,10 @@ void permute_symm_to_fullsymm(const MatrixType& mat, SparseMatrix<typename Matri
      StorageIndex i = internal::convert_index<StorageIndex>(it.index());
      Index r = it.row();
      Index c = it.col();
-      
+
      StorageIndex jp = perm ? perm[j] : j;
      StorageIndex ip = perm ? perm[i] : i;
-      
+
      if(Mode==int(Upper|Lower))
      {
        Index k = count[StorageOrderMatch ? jp : ip]++;
@@ -531,7 +531,7 @@ void permute_symm_to_symm(const MatrixType& mat, SparseMatrix<typename MatrixTyp
  };

  MatEval matEval(mat);
-  
+
  Index size = mat.rows();
  VectorI count(size);
  count.setZero();
@@ -544,7 +544,7 @@ void permute_symm_to_symm(const MatrixType& mat, SparseMatrix<typename MatrixTyp
      StorageIndex i = it.index();
      if((int(SrcMode)==int(Lower) && i<j) || (int(SrcMode)==int(Upper) && i>j))
        continue;
-                  
+
      StorageIndex ip = perm ? perm[i] : i;
      count[int(DstMode)==int(Lower) ? (std::min)(ip,jp) : (std::max)(ip,jp)]++;
    }
@@ -555,22 +555,22 @@ void permute_symm_to_symm(const MatrixType& mat, SparseMatrix<typename MatrixTyp
  dest.resizeNonZeros(dest.outerIndexPtr()[size]);
  for(Index j=0; j<size; ++j)
    count[j] = dest.outerIndexPtr()[j];
-  
+
  for(StorageIndex j = 0; j<size; ++j)
  {
-    
+
    for(MatIterator it(matEval,j); it; ++it)
    {
      StorageIndex i = it.index();
      if((int(SrcMode)==int(Lower) && i<j) || (int(SrcMode)==int(Upper) && i>j))
        continue;
-                  
+
      StorageIndex jp = perm ? perm[j] : j;
      StorageIndex ip = perm? perm[i] : i;
-      
+
      Index k = count[int(DstMode)==int(Lower) ? (std::min)(ip,jp) : (std::max)(ip,jp)]++;
      dest.innerIndexPtr()[k] = int(DstMode)==int(Lower) ? (std::max)(ip,jp) : (std::min)(ip,jp);
-      
+
      if(!StorageOrderMatch) std::swap(ip,jp);
      if( ((int(DstMode)==int(Lower) && ip<jp) || (int(DstMode)==int(Upper) && ip>jp)))
        dest.valuePtr()[k] = numext::conj(it.value());
@@ -609,17 +609,17 @@ class SparseSymmetricPermutationProduct
    typedef Matrix<StorageIndex,Dynamic,1> VectorI;
    typedef typename MatrixType::Nested MatrixTypeNested;
    typedef typename internal::remove_all<MatrixTypeNested>::type NestedExpression;
-    
+
    SparseSymmetricPermutationProduct(const MatrixType& mat, const Perm& perm)
      : m_matrix(mat), m_perm(perm)
    {}
-    
+
    inline Index rows() const { return m_matrix.rows(); }
    inline Index cols() const { return m_matrix.cols(); }
-        
+
    const NestedExpression& matrix() const { return m_matrix; }
    const Perm& perm() const { return m_perm; }
-    
+
  protected:
    MatrixTypeNested m_matrix;
    const Perm& m_perm;
@@ -627,21 +627,21 @@ class SparseSymmetricPermutationProduct
 };

 namespace internal {
-  
+
 template<typename DstXprType, typename MatrixType, int Mode, typename Scalar>
 struct Assignment<DstXprType, SparseSymmetricPermutationProduct<MatrixType,Mode>, internal::assign_op<Scalar,typename MatrixType::Scalar>, Sparse2Sparse>
 {
  typedef SparseSymmetricPermutationProduct<MatrixType,Mode> SrcXprType;
  typedef typename DstXprType::StorageIndex DstIndex;
  template<int Options>
-  static void run(SparseMatrix<Scalar,Options,DstIndex> &dst, const SrcXprType &src, const internal::assign_op<Scalar,typename MatrixType::Scalar> &)
+  static EIGEN_DEVICE_FUNC void run(SparseMatrix<Scalar,Options,DstIndex> &dst, const SrcXprType &src, const internal::assign_op<Scalar,typename MatrixType::Scalar> &)
  {
    // internal::permute_symm_to_fullsymm<Mode>(m_matrix,_dest,m_perm.indices().data());
    SparseMatrix<Scalar,(Options&RowMajor)==RowMajor ? ColMajor : RowMajor, DstIndex> tmp;
    internal::permute_symm_to_fullsymm<Mode>(src.matrix(),tmp,src.perm().indices().data());
    dst = tmp;
  }
-  
+
  template<typename DestType,unsigned int DestMode>
  static void run(SparseSelfAdjointView<DestType,DestMode>& dst, const SrcXprType &src, const internal::assign_op<Scalar,typename MatrixType::Scalar> &)
  {
--- a/Eigen/src/SparseQR/SparseQR.h
+++ b/Eigen/src/SparseQR/SparseQR.h
@@ -42,31 +42,31 @@ namespace internal {
  * \ingroup SparseQR_Module
  * \class SparseQR
  * \brief Sparse left-looking rank-revealing QR factorization
-  * 
-  * This class implements a left-looking rank-revealing QR decomposition 
+  *
+  * This class implements a left-looking rank-revealing QR decomposition
  * of sparse matrices. When a column has a norm less than a given tolerance
-  * it is implicitly permuted to the end. The QR factorization thus obtained is 
-  * given by A*P = Q*R where R is upper triangular or trapezoidal. 
-  * 
+  * it is implicitly permuted to the end. The QR factorization thus obtained is
+  * given by A*P = Q*R where R is upper triangular or trapezoidal.
+  *
  * P is the column permutation which is the product of the fill-reducing and the
  * rank-revealing permutations. Use colsPermutation() to get it.
-  * 
-  * Q is the orthogonal matrix represented as products of Householder reflectors. 
+  *
+  * Q is the orthogonal matrix represented as products of Householder reflectors.
  * Use matrixQ() to get an expression and matrixQ().adjoint() to get the adjoint.
  * You can then apply it to a vector.
-  * 
+  *
  * R is the sparse triangular or trapezoidal matrix. The later occurs when A is rank-deficient.
  * matrixR().topLeftCorner(rank(), rank()) always returns a triangular factor of full rank.
-  * 
+  *
  * \tparam _MatrixType The type of the sparse matrix A, must be a column-major SparseMatrix<>
-  * \tparam _OrderingType The fill-reducing ordering method. See the \link OrderingMethods_Module 
+  * \tparam _OrderingType The fill-reducing ordering method. See the \link OrderingMethods_Module
  *  OrderingMethods \endlink module for the list of built-in and external ordering methods.
-  * 
+  *
  * \implsparsesolverconcept
  *
  * \warning The input sparse matrix A must be in compressed mode (see SparseMatrix::makeCompressed()).
  * \warning For complex matrices matrixQ().transpose() will actually return the adjoint matrix.
-  * 
+  *
  */
 template<typename _MatrixType, typename _OrderingType>
 class SparseQR : public SparseSolverBase<SparseQR<_MatrixType,_OrderingType> >
@@ -90,26 +90,26 @@ class SparseQR : public SparseSolverBase<SparseQR<_MatrixType,_OrderingType> >
      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
      MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
    };
-    
+
  public:
    SparseQR () :  m_analysisIsok(false), m_lastError(""), m_useDefaultThreshold(true),m_isQSorted(false),m_isEtreeOk(false)
    { }
-    
+
    /** Construct a QR factorization of the matrix \a mat.
-      * 
+      *
      * \warning The matrix \a mat must be in compressed mode (see SparseMatrix::makeCompressed()).
-      * 
+      *
      * \sa compute()
      */
    explicit SparseQR(const MatrixType& mat) : m_analysisIsok(false), m_lastError(""), m_useDefaultThreshold(true),m_isQSorted(false),m_isEtreeOk(false)
    {
      compute(mat);
    }
-    
+
    /** Computes the QR factorization of the sparse matrix \a mat.
-      * 
+      *
      * \warning The matrix \a mat must be in compressed mode (see SparseMatrix::makeCompressed()).
-      * 
+      *
      * \sa analyzePattern(), factorize()
      */
    void compute(const MatrixType& mat)
@@ -119,15 +119,15 @@ class SparseQR : public SparseSolverBase<SparseQR<_MatrixType,_OrderingType> >
    }
    void analyzePattern(const MatrixType& mat);
    void factorize(const MatrixType& mat);
-    
-    /** \returns the number of rows of the represented matrix. 
+
+    /** \returns the number of rows of the represented matrix.
      */
    inline Index rows() const { return m_pmat.rows(); }
-    
-    /** \returns the number of columns of the represented matrix. 
+
+    /** \returns the number of columns of the represented matrix.
      */
    inline Index cols() const { return m_pmat.cols();}
-    
+
    /** \returns a const reference to the \b sparse upper triangular matrix R of the QR factorization.
      * \warning The entries of the returned matrix are not sorted. This means that using it in algorithms
      *          expecting sorted entries will fail. This include random coefficient accesses (SpaseMatrix::coeff()),
@@ -142,7 +142,7 @@ class SparseQR : public SparseSolverBase<SparseQR<_MatrixType,_OrderingType> >
      * \endcode
      */
    const QRMatrixType& matrixR() const { return m_R; }
-    
+
    /** \returns the number of non linearly dependent columns as determined by the pivoting threshold.
      *
      * \sa setPivotThreshold()
@@ -150,9 +150,9 @@ class SparseQR : public SparseSolverBase<SparseQR<_MatrixType,_OrderingType> >
    Index rank() const
    {
      eigen_assert(m_isInitialized && "The factorization should be called first, use compute()");
-      return m_nonzeropivots; 
+      return m_nonzeropivots;
    }
-    
+
    /** \returns an expression of the matrix Q as products of sparse Householder reflectors.
    * The common usage of this function is to apply it to a dense matrix or vector
    * \code
@@ -171,23 +171,23 @@ class SparseQR : public SparseSolverBase<SparseQR<_MatrixType,_OrderingType> >
    * reflectors are stored unsorted, two transpositions are needed to sort
    * them before performing the product.
    */
-    SparseQRMatrixQReturnType<SparseQR> matrixQ() const 
+    SparseQRMatrixQReturnType<SparseQR> matrixQ() const
    { return SparseQRMatrixQReturnType<SparseQR>(*this); }
-    
+
    /** \returns a const reference to the column permutation P that was applied to A such that A*P = Q*R
      * It is the combination of the fill-in reducing permutation and numerical column pivoting.
      */
    const PermutationType& colsPermutation() const
-    { 
+    {
      eigen_assert(m_isInitialized && "Decomposition is not initialized.");
      return m_outputPerm_c;
    }
-    
+
    /** \returns A string describing the type of error.
      * This method is provided to ease debugging, not to handle errors.
      */
    std::string lastErrorMessage() const { return m_lastError; }
-    
+
    /** \internal */
    template<typename Rhs, typename Dest>
    bool _solve_impl(const MatrixBase<Rhs> &B, MatrixBase<Dest> &dest) const
@@ -196,21 +196,21 @@ class SparseQR : public SparseSolverBase<SparseQR<_MatrixType,_OrderingType> >
      eigen_assert(this->rows() == B.rows() && "SparseQR::solve() : invalid number of rows in the right hand side matrix");

      Index rank = this->rank();
-      
+
      // Compute Q^* * b;
      typename Dest::PlainObject y, b;
      y = this->matrixQ().adjoint() * B;
      b = y;
-      
+
      // Solve with the triangular matrix R
      y.resize((std::max<Index>)(cols(),y.rows()),y.cols());
      y.topRows(rank) = this->matrixR().topLeftCorner(rank, rank).template triangularView<Upper>().solve(b.topRows(rank));
      y.bottomRows(y.rows()-rank).setZero();
-      
+
      // Apply the column permutation
      if (m_perm_c.size())  dest = colsPermutation() * y.topRows(cols());
      else                  dest = y.topRows(cols());
-      
+
      m_info = Success;
      return true;
    }
@@ -225,13 +225,13 @@ class SparseQR : public SparseSolverBase<SparseQR<_MatrixType,_OrderingType> >
      m_useDefaultThreshold = false;
      m_threshold = threshold;
    }
-    
+
    /** \returns the solution X of \f$ A X = B \f$ using the current decomposition of A.
      *
      * \sa compute()
      */
    template<typename Rhs>
-    inline const Solve<SparseQR, Rhs> solve(const MatrixBase<Rhs>& B) const 
+    inline const Solve<SparseQR, Rhs> solve(const MatrixBase<Rhs>& B) const
    {
      eigen_assert(m_isInitialized && "The factorization should be called first, use compute()");
      eigen_assert(this->rows() == B.rows() && "SparseQR::solve() : invalid number of rows in the right hand side matrix");
@@ -244,14 +244,14 @@ class SparseQR : public SparseSolverBase<SparseQR<_MatrixType,_OrderingType> >
          eigen_assert(this->rows() == B.rows() && "SparseQR::solve() : invalid number of rows in the right hand side matrix");
          return Solve<SparseQR, Rhs>(*this, B.derived());
    }
-    
+
    /** \brief Reports whether previous computation was successful.
      *
      * \returns \c Success if computation was successful,
      *          \c NumericalIssue if the QR factorization reports a numerical problem
      *          \c InvalidInput if the input matrix is invalid
      *
-      * \sa iparm()          
+      * \sa iparm()
      */
    ComputationInfo info() const
    {
@@ -270,7 +270,7 @@ class SparseQR : public SparseSolverBase<SparseQR<_MatrixType,_OrderingType> >
      this->m_isQSorted = true;
    }

-    
+
  protected:
    bool m_analysisIsok;
    bool m_factorizationIsok;
@@ -290,18 +290,18 @@ class SparseQR : public SparseSolverBase<SparseQR<_MatrixType,_OrderingType> >
    IndexVector m_firstRowElt;      // First element in each row
    bool m_isQSorted;               // whether Q is sorted or not
    bool m_isEtreeOk;               // whether the elimination tree match the initial input matrix
-    
+
    template <typename, typename > friend struct SparseQR_QProduct;
-    
+
 };

-/** \brief Preprocessing step of a QR factorization 
-  * 
+/** \brief Preprocessing step of a QR factorization
+  *
  * \warning The matrix \a mat must be in compressed mode (see SparseMatrix::makeCompressed()).
-  * 
+  *
  * In this step, the fill-reducing permutation is computed and applied to the columns of A
  * and the column elimination tree is computed as well. Only the sparsity pattern of \a mat is exploited.
-  * 
+  *
  * \note In this step it is assumed that there is no empty row in the matrix \a mat.
  */
 template <typename MatrixType, typename OrderingType>
@@ -311,26 +311,26 @@ void SparseQR<MatrixType,OrderingType>::analyzePattern(const MatrixType& mat)
  // Copy to a column major matrix if the input is rowmajor
  typename internal::conditional<MatrixType::IsRowMajor,QRMatrixType,const MatrixType&>::type matCpy(mat);
  // Compute the column fill reducing ordering
-  OrderingType ord; 
-  ord(matCpy, m_perm_c); 
+  OrderingType ord;
+  ord(matCpy, m_perm_c);
  Index n = mat.cols();
  Index m = mat.rows();
  Index diagSize = (std::min)(m,n);
-  
+
  if (!m_perm_c.size())
  {
    m_perm_c.resize(n);
    m_perm_c.indices().setLinSpaced(n, 0,StorageIndex(n-1));
  }
-  
+
  // Compute the column elimination tree of the permuted matrix
  m_outputPerm_c = m_perm_c.inverse();
  internal::coletree(matCpy, m_etree, m_firstRowElt, m_outputPerm_c.indices().data());
  m_isEtreeOk = true;
-  
+
  m_R.resize(m, n);
  m_Q.resize(m, diagSize);
-  
+
  // Allocate space for nonzero elements : rough estimation
  m_R.reserve(2*mat.nonZeros()); //FIXME Get a more accurate estimation through symbolic factorization with the etree
  m_Q.reserve(2*mat.nonZeros());
@@ -339,17 +339,17 @@ void SparseQR<MatrixType,OrderingType>::analyzePattern(const MatrixType& mat)
 }

 /** \brief Performs the numerical QR factorization of the input matrix
-  * 
+  *
  * The function SparseQR::analyzePattern(const MatrixType&) must have been called beforehand with
  * a matrix having the same sparsity pattern than \a mat.
-  * 
+  *
  * \param mat The sparse column-major matrix
  */
 template <typename MatrixType, typename OrderingType>
 void SparseQR<MatrixType,OrderingType>::factorize(const MatrixType& mat)
 {
  using std::abs;
-  
+
  eigen_assert(m_analysisIsok && "analyzePattern() should be called before this step");
  StorageIndex m = StorageIndex(mat.rows());
  StorageIndex n = StorageIndex(mat.cols());
@@ -359,7 +359,7 @@ void SparseQR<MatrixType,OrderingType>::factorize(const MatrixType& mat)
  Index nzcolR, nzcolQ;                                     // Number of nonzero for the current column of R and Q
  ScalarVector tval(m);                                     // The dense vector used to compute the current column
  RealScalar pivotThreshold = m_threshold;
-  
+
  m_R.setZero();
  m_Q.setZero();
  m_pmat = mat;
@@ -371,12 +371,12 @@ void SparseQR<MatrixType,OrderingType>::factorize(const MatrixType& mat)
  }

  m_pmat.uncompress(); // To have the innerNonZeroPtr allocated
-  
+
  // Apply the fill-in reducing permutation lazily:
  {
    // If the input is row major, copy the original column indices,
    // otherwise directly use the input matrix
-    // 
+    //
    IndexVector originalOuterIndicesCpy;
    const StorageIndex *originalOuterIndices = mat.outerIndexPtr();
    if(MatrixType::IsRowMajor)
@@ -384,20 +384,20 @@ void SparseQR<MatrixType,OrderingType>::factorize(const MatrixType& mat)
      originalOuterIndicesCpy = IndexVector::Map(m_pmat.outerIndexPtr(),n+1);
      originalOuterIndices = originalOuterIndicesCpy.data();
    }
-    
+
    for (int i = 0; i < n; i++)
    {
      Index p = m_perm_c.size() ? m_perm_c.indices()(i) : i;
-      m_pmat.outerIndexPtr()[p] = originalOuterIndices[i]; 
-      m_pmat.innerNonZeroPtr()[p] = originalOuterIndices[i+1] - originalOuterIndices[i]; 
+      m_pmat.outerIndexPtr()[p] = originalOuterIndices[i];
+      m_pmat.innerNonZeroPtr()[p] = originalOuterIndices[i+1] - originalOuterIndices[i];
    }
  }
-  
+
  /* Compute the default threshold as in MatLab, see:
   * Tim Davis, "Algorithm 915, SuiteSparseQR: Multifrontal Multithreaded Rank-Revealing
-   * Sparse QR Factorization, ACM Trans. on Math. Soft. 38(1), 2011, Page 8:3 
+   * Sparse QR Factorization, ACM Trans. on Math. Soft. 38(1), 2011, Page 8:3
   */
-  if(m_useDefaultThreshold) 
+  if(m_useDefaultThreshold)
  {
    RealScalar max2Norm = 0.0;
    for (int j = 0; j < n; j++) max2Norm = numext::maxi(max2Norm, m_pmat.col(j).norm());
@@ -405,10 +405,10 @@ void SparseQR<MatrixType,OrderingType>::factorize(const MatrixType& mat)
      max2Norm = RealScalar(1);
    pivotThreshold = 20 * (m + n) * max2Norm * NumTraits<RealScalar>::epsilon();
  }
-  
+
  // Initialize the numerical permutation
  m_pivotperm.setIdentity(n);
-  
+
  StorageIndex nonzeroCol = 0; // Record the number of valid pivots
  m_Q.startVec(0);

@@ -421,8 +421,8 @@ void SparseQR<MatrixType,OrderingType>::factorize(const MatrixType& mat)
    Qidx(0) = nonzeroCol;
    nzcolR = 0; nzcolQ = 1;
    bool found_diag = nonzeroCol>=m;
-    tval.setZero(); 
-    
+    tval.setZero();
+
    // Symbolic factorization: find the nonzero locations of the column k of the factors R and Q, i.e.,
    // all the nodes (with indexes lower than rank) reachable through the column elimination tree (etree) rooted at node k.
    // Note: if the diagonal entry does not exist, then its contribution must be explicitly added,
@@ -432,7 +432,7 @@ void SparseQR<MatrixType,OrderingType>::factorize(const MatrixType& mat)
      StorageIndex curIdx = nonzeroCol;
      if(itp) curIdx = StorageIndex(itp.row());
      if(curIdx == nonzeroCol) found_diag = true;
-      
+
      // Get the nonzeros indexes of the current column of R
      StorageIndex st = m_firstRowElt(curIdx); // The traversal of the etree starts here
      if (st < 0 )
@@ -442,7 +442,7 @@ void SparseQR<MatrixType,OrderingType>::factorize(const MatrixType& mat)
        return;
      }

-      // Traverse the etree 
+      // Traverse the etree
      Index bi = nzcolR;
      for (; mark(st) != col; st = m_etree(st))
      {
@@ -454,13 +454,13 @@ void SparseQR<MatrixType,OrderingType>::factorize(const MatrixType& mat)
      // Reverse the list to get the topological ordering
      Index nt = nzcolR-bi;
      for(Index i = 0; i < nt/2; i++) std::swap(Ridx(bi+i), Ridx(nzcolR-i-1));
-       
+
      // Copy the current (curIdx,pcol) value of the input matrix
      if(itp) tval(curIdx) = itp.value();
      else    tval(curIdx) = Scalar(0);
-      
+
      // Compute the pattern of Q(:,k)
-      if(curIdx > nonzeroCol && mark(curIdx) != col ) 
+      if(curIdx > nonzeroCol && mark(curIdx) != col )
      {
        Qidx(nzcolQ) = curIdx;  // Add this row to the pattern of Q,
        mark(curIdx) = col;     // and mark it as visited
@@ -472,15 +472,15 @@ void SparseQR<MatrixType,OrderingType>::factorize(const MatrixType& mat)
    for (Index i = nzcolR-1; i >= 0; i--)
    {
      Index curIdx = Ridx(i);
-      
+
      // Apply the curIdx-th householder vector to the current column (temporarily stored into tval)
      Scalar tdot(0);
-      
+
      // First compute q' * tval
      tdot = m_Q.col(curIdx).dot(tval);

      tdot *= m_hcoeffs(curIdx);
-      
+
      // Then update tval = tval - q * tau
      // FIXME: tval -= tdot * m_Q.col(curIdx) should amount to the same (need to check/add support for efficient "dense ?= sparse")
      for (typename QRMatrixType::InnerIterator itq(m_Q, curIdx); itq; ++itq)
@@ -500,16 +500,16 @@ void SparseQR<MatrixType,OrderingType>::factorize(const MatrixType& mat)
        }
      }
    } // End update current column
-    
+
    Scalar tau = RealScalar(0);
    RealScalar beta = 0;
-    
+
    if(nonzeroCol < diagSize)
    {
      // Compute the Householder reflection that eliminate the current column
      // FIXME this step should call the Householder module.
      Scalar c0 = nzcolQ ? tval(Qidx(0)) : Scalar(0);
-      
+
      // First, the squared norm of Q((col+1):m, col)
      RealScalar sqrNorm = 0.;
      for (Index itq = 1; itq < nzcolQ; ++itq) sqrNorm += numext::abs2(tval(Qidx(itq)));
@@ -528,7 +528,7 @@ void SparseQR<MatrixType,OrderingType>::factorize(const MatrixType& mat)
        for (Index itq = 1; itq < nzcolQ; ++itq)
          tval(Qidx(itq)) /= (c0 - beta);
        tau = numext::conj((beta-c0) / beta);
-          
+
      }
    }

@@ -536,7 +536,7 @@ void SparseQR<MatrixType,OrderingType>::factorize(const MatrixType& mat)
    for (Index  i = nzcolR-1; i >= 0; i--)
    {
      Index curIdx = Ridx(i);
-      if(curIdx < nonzeroCol) 
+      if(curIdx < nonzeroCol)
      {
        m_R.insertBackByOuterInnerUnordered(col, curIdx) = tval(curIdx);
        tval(curIdx) = Scalar(0.);
@@ -562,17 +562,17 @@ void SparseQR<MatrixType,OrderingType>::factorize(const MatrixType& mat)
    else
    {
      // Zero pivot found: move implicitly this column to the end
-      for (Index j = nonzeroCol; j < n-1; j++) 
+      for (Index j = nonzeroCol; j < n-1; j++)
        std::swap(m_pivotperm.indices()(j), m_pivotperm.indices()[j+1]);
-      
+
      // Recompute the column elimination tree
      internal::coletree(m_pmat, m_etree, m_firstRowElt, m_pivotperm.indices().data());
      m_isEtreeOk = false;
    }
  }
-  
+
  m_hcoeffs.tail(diagSize-nonzeroCol).setZero();
-  
+
  // Finalize the column pointers of the sparse matrices R and Q
  m_Q.finalize();
  m_Q.makeCompressed();
@@ -581,18 +581,18 @@ void SparseQR<MatrixType,OrderingType>::factorize(const MatrixType& mat)
  m_isQSorted = false;

  m_nonzeropivots = nonzeroCol;
-  
+
  if(nonzeroCol<n)
  {
    // Permute the triangular factor to put the 'dead' columns to the end
    QRMatrixType tempR(m_R);
    m_R = tempR * m_pivotperm;
-    
+
    // Update the column permutation
    m_outputPerm_c = m_outputPerm_c * m_pivotperm;
  }
-  
-  m_isInitialized = true; 
+
+  m_isInitialized = true;
  m_factorizationIsok = true;
  m_info = Success;
 }
@@ -602,12 +602,12 @@ struct SparseQR_QProduct : ReturnByValue<SparseQR_QProduct<SparseQRType, Derived
 {
  typedef typename SparseQRType::QRMatrixType MatrixType;
  typedef typename SparseQRType::Scalar Scalar;
-  // Get the references 
-  SparseQR_QProduct(const SparseQRType& qr, const Derived& other, bool transpose) : 
+  // Get the references
+  SparseQR_QProduct(const SparseQRType& qr, const Derived& other, bool transpose) :
  m_qr(qr),m_other(other),m_transpose(transpose) {}
  inline Index rows() const { return m_qr.matrixQ().rows(); }
  inline Index cols() const { return m_other.cols(); }
-  
+
  // Assign to a vector
  template<typename DesType>
  void evalTo(DesType& res) const
@@ -651,7 +651,7 @@ struct SparseQR_QProduct : ReturnByValue<SparseQR_QProduct<SparseQRType, Derived
      }
    }
  }
-  
+
  const SparseQRType& m_qr;
  const Derived& m_other;
  bool m_transpose; // TODO this actually means adjoint
@@ -659,7 +659,7 @@ struct SparseQR_QProduct : ReturnByValue<SparseQR_QProduct<SparseQRType, Derived

 template<typename SparseQRType>
 struct SparseQRMatrixQReturnType : public EigenBase<SparseQRMatrixQReturnType<SparseQRType> >
-{  
+{
  typedef typename SparseQRType::Scalar Scalar;
  typedef Matrix<Scalar,Dynamic,Dynamic> DenseMatrix;
  enum {
@@ -701,7 +701,7 @@ struct SparseQRMatrixQTransposeReturnType
 };

 namespace internal {
-  
+
 template<typename SparseQRType>
 struct evaluator_traits<SparseQRMatrixQReturnType<SparseQRType> >
 {
@@ -716,7 +716,7 @@ struct Assignment<DstXprType, SparseQRMatrixQReturnType<SparseQRType>, internal:
  typedef SparseQRMatrixQReturnType<SparseQRType> SrcXprType;
  typedef typename DstXprType::Scalar Scalar;
  typedef typename DstXprType::StorageIndex StorageIndex;
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &/*func*/)
+  static EIGEN_DEVICE_FUNC void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &/*func*/)
  {
    typename DstXprType::PlainObject idMat(src.rows(), src.cols());
    idMat.setIdentity();
@@ -732,7 +732,7 @@ struct Assignment<DstXprType, SparseQRMatrixQReturnType<SparseQRType>, internal:
  typedef SparseQRMatrixQReturnType<SparseQRType> SrcXprType;
  typedef typename DstXprType::Scalar Scalar;
  typedef typename DstXprType::StorageIndex StorageIndex;
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &/*func*/)
+  static EIGEN_DEVICE_FUNC void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &/*func*/)
  {
    dst = src.m_qr.matrixQ() * DstXprType::Identity(src.m_qr.rows(), src.m_qr.rows());
  }
--- a/Eigen/src/StlSupport/StdDeque.h
+++ b/Eigen/src/StlSupport/StdDeque.h
@@ -98,19 +98,7 @@ namespace std {
  { return deque_base::insert(position,x); }
  void insert(const_iterator position, size_type new_size, const value_type& x)
  { deque_base::insert(position, new_size, x); }
-#elif defined(_GLIBCXX_DEQUE) && EIGEN_GNUC_AT_LEAST(4,2) && !EIGEN_GNUC_AT_LEAST(10, 1)
-  // workaround GCC std::deque implementation
-  // GCC 10.1 doesn't let us access _Deque_impl _M_impl anymore and we have to
-  // fall-back to the default case
-  void resize(size_type new_size, const value_type& x)
-  {
-    if (new_size < deque_base::size())
-      deque_base::_M_erase_at_end(this->_M_impl._M_start + new_size);
-    else
-      deque_base::insert(deque_base::end(), new_size - deque_base::size(), x);
-  }
 #else
-  // either non-GCC or GCC between 4.1 and 10.1
  // default implementation which should always work.
  void resize(size_type new_size, const value_type& x)
  {
--- a/Eigen/src/plugins/ArrayCwiseBinaryOps.h
+++ b/Eigen/src/plugins/ArrayCwiseBinaryOps.h
@@ -119,7 +119,7 @@ OP(const Scalar& s) const { \
  return this->OP(Derived::PlainObject::Constant(rows(), cols(), s)); \
 } \
 EIGEN_DEVICE_FUNC friend EIGEN_STRONG_INLINE const RCmp ## COMPARATOR ## ReturnType \
-OP(const Scalar& s, const Derived& d) { \
+OP(const Scalar& s, const EIGEN_CURRENT_STORAGE_BASE_CLASS<Derived>& d) { \
  return Derived::PlainObject::Constant(d.rows(), d.cols(), s).OP(d); \
 }

@@ -314,7 +314,8 @@ polygamma(const EIGEN_CURRENT_STORAGE_BASE_CLASS<DerivedN> &n) const
  *
  * It returns the Riemann zeta function of two arguments \c *this and \a q:
  *
-  * \param *this is the exposent, it must be > 1
+  * `*this` is the exponent, it must be > 1
+  * 
  * \param q is the shift, it must be > 0
  *
  * \note This function supports only float and double scalar types. To support other scalar types, the user has
--- a/blas/CMakeLists.txt
+++ b/blas/CMakeLists.txt
@@ -4,7 +4,6 @@ project(EigenBlas CXX)
 include("../cmake/language_support.cmake")

 workaround_9220(Fortran EIGEN_Fortran_COMPILER_WORKS)
-
 if(EIGEN_Fortran_COMPILER_WORKS)
  enable_language(Fortran OPTIONAL)
  if(NOT CMAKE_Fortran_COMPILER)
--- a/ci/CTest2JUnit.xsl
+++ b/ci/CTest2JUnit.xsl
@@ -0,0 +1,120 @@
+<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0">
+<xsl:output method="xml" indent="yes"/>
+    <xsl:template match="/Site">
+	<xsl:variable name="Name"><xsl:value-of select="@Name"/></xsl:variable>
+	<xsl:variable name="Hostname"><xsl:value-of select="@Hostname"/></xsl:variable>
+	<xsl:variable name="TestCount"><xsl:value-of select="count(//TestList/Test)"/> </xsl:variable>
+	<xsl:variable name="ErrorCount"><xsl:value-of select="count(//TestList/Test[@Status='error'])"/> </xsl:variable>
+	<xsl:variable name="FailureCount"><xsl:value-of select="count(//Testing/Test[@Status='failed'])"/> </xsl:variable>
+	<testsuite name="{$Name}" hostname="{$Hostname}" errors="0" failures="{$FailureCount}" tests="{$TestCount}">
+	    <xsl:variable name="BuildName"><xsl:value-of select="@BuildName"/></xsl:variable>
+	    <xsl:variable name="BuildStamp"><xsl:value-of select="@BuildStamp"/></xsl:variable>
+	    <xsl:variable name="Generator"><xsl:value-of select="@Generator"/></xsl:variable>
+	    <xsl:variable name="CompilerName"><xsl:value-of select="@CompilerName"/></xsl:variable>
+	    <xsl:variable name="OSName"><xsl:value-of select="@OSName"/></xsl:variable>
+	    <xsl:variable name="OSRelease"><xsl:value-of select="@OSRelease"/></xsl:variable>
+	    <xsl:variable name="OSVersion"><xsl:value-of select="@OSVersion"/></xsl:variable>
+	    <xsl:variable name="OSPlatform"><xsl:value-of select="@OSPlatform"/></xsl:variable>
+	    <xsl:variable name="Is64Bits"><xsl:value-of select="@Is64Bits"/></xsl:variable>
+	    <xsl:variable name="VendorString"><xsl:value-of select="@VendorString"/></xsl:variable>
+	    <xsl:variable name="VendorID"><xsl:value-of select="@VendorID"/></xsl:variable>
+	    <xsl:variable name="FamilyID"><xsl:value-of select="@FamilyID"/></xsl:variable>
+	    <xsl:variable name="ModelID"><xsl:value-of select="@ModelID"/></xsl:variable>
+	    <xsl:variable name="ProcessorCacheSize"><xsl:value-of select="@ProcessorCacheSize"/></xsl:variable>
+	    <xsl:variable name="NumberOfLogicalCPU"><xsl:value-of select="@NumberOfLogicalCPU"/></xsl:variable>
+	    <xsl:variable name="NumberOfPhysicalCPU"><xsl:value-of select="@NumberOfPhysicalCPU"/></xsl:variable>
+	    <xsl:variable name="TotalVirtualMemory"><xsl:value-of select="@TotalVirtualMemory"/></xsl:variable>
+	    <xsl:variable name="TotalPhysicalMemory"><xsl:value-of select="@TotalPhysicalMemory"/></xsl:variable>
+	    <xsl:variable name="LogicalProcessorsPerPhysical"><xsl:value-of select="@LogicalProcessorsPerPhysical"/></xsl:variable>
+	    <xsl:variable name="ProcessorClockFrequency"><xsl:value-of select="@ProcessorClockFrequency"/></xsl:variable>
+	    <properties>
+		<property name="BuildName" value="{$BuildName}" />
+		<property name="BuildStamp" value="{$BuildStamp}" />
+		<property name="Name" value="{$Name}" />
+		<property name="Generator" value="{$Generator}" />
+		<property name="CompilerName" value="{$CompilerName}" />
+		<property name="OSName" value="{$OSName}" />
+		<property name="Hostname" value="{$Hostname}" />
+		<property name="OSRelease" value="{$OSRelease}" />
+		<property name="OSVersion" value="{$OSVersion}" />
+		<property name="OSPlatform" value="{$OSPlatform}" />
+		<property name="Is64Bits" value="{$Is64Bits}" />
+		<property name="VendorString" value="{$VendorString}" />
+		<property name="VendorID" value="{$VendorID}" />
+		<property name="FamilyID" value="{$FamilyID}" />
+		<property name="ModelID" value="{$ModelID}" />
+		<property name="ProcessorCacheSize" value="{$ProcessorCacheSize}" />
+		<property name="NumberOfLogicalCPU" value="{$NumberOfLogicalCPU}" />
+		<property name="NumberOfPhysicalCPU" value="{$NumberOfPhysicalCPU}" />
+		<property name="TotalVirtualMemory" value="{$TotalVirtualMemory}" />
+		<property name="TotalPhysicalMemory" value="{$TotalPhysicalMemory}" />
+		<property name="LogicalProcessorsPerPhysical" value="{$LogicalProcessorsPerPhysical}" />
+		<property name="ProcessorClockFrequency" value="{$ProcessorClockFrequency}" />
+	    </properties>
+	    <xsl:apply-templates select="Testing/Test"/>
+
+	    <system-out>
+		BuildName: <xsl:value-of select="$BuildName" />
+		BuildStamp: <xsl:value-of select="$BuildStamp" />
+		Name: <xsl:value-of select="$Name" />
+		Generator: <xsl:value-of select="$Generator" />
+		CompilerName: <xsl:value-of select="$CompilerName" />
+		OSName: <xsl:value-of select="$OSName" />
+		Hostname: <xsl:value-of select="$Hostname" />
+		OSRelease: <xsl:value-of select="$OSRelease" />
+		OSVersion: <xsl:value-of select="$OSVersion" />
+		OSPlatform: <xsl:value-of select="$OSPlatform" />
+		Is64Bits: <xsl:value-of select="$Is64Bits" />
+		VendorString: <xsl:value-of select="$VendorString" />
+		VendorID: <xsl:value-of select="$VendorID" />
+		FamilyID: <xsl:value-of select="$FamilyID" />
+		ModelID: <xsl:value-of select="$ModelID" />
+		ProcessorCacheSize: <xsl:value-of select="$ProcessorCacheSize" />
+		NumberOfLogicalCPU: <xsl:value-of select="$NumberOfLogicalCPU" />
+		NumberOfPhysicalCPU: <xsl:value-of select="$NumberOfPhysicalCPU" />
+		TotalVirtualMemory: <xsl:value-of select="$TotalVirtualMemory" />
+		TotalPhysicalMemory: <xsl:value-of select="$TotalPhysicalMemory" />
+		LogicalProcessorsPerPhysical: <xsl:value-of select="$LogicalProcessorsPerPhysical" />
+		ProcessorClockFrequency: <xsl:value-of select="$ProcessorClockFrequency" />
+	    </system-out>
+	</testsuite>
+    </xsl:template>
+
+    <xsl:template match="Testing/Test">
+	<xsl:variable name="testcasename"><xsl:value-of select= "Name"/></xsl:variable>
+	<xsl:variable name="testclassname"><xsl:value-of select= " concat('this', substring(Path,2))"/></xsl:variable>
+	<xsl:variable name="exectime">
+	    <xsl:for-each select="Results/NamedMeasurement">
+		<xsl:if test="@name = 'Execution Time'">
+		    <xsl:value-of select="."/>
+		</xsl:if>
+	    </xsl:for-each>
+	</xsl:variable>
+
+	<testcase name="{$testcasename}" classname="{$testclassname}" time="{$exectime}">
+	    <xsl:if test="@Status = 'passed'">
+	    </xsl:if>
+	    <xsl:if test="@Status = 'failed'">
+		<xsl:variable name="failtype">
+		    <xsl:for-each select="Results/NamedMeasurement">
+			<xsl:if test="@name = 'Exit Code'">
+			    <xsl:value-of select="."/>
+			</xsl:if>
+		    </xsl:for-each>
+		</xsl:variable>
+		<xsl:variable name="failcode">
+		    <xsl:for-each select="Results/NamedMeasurement">
+			<xsl:if test="@name = 'Exit Value'">
+			    <xsl:value-of select="."/>
+			</xsl:if>
+		    </xsl:for-each>
+		</xsl:variable>
+		<failure message="{$failtype} ({$failcode})"><xsl:value-of select="Results/Measurement/Value/text()" /></failure>
+	    </xsl:if>
+	    <xsl:if test="@Status = 'notrun'">
+		<skipped><xsl:value-of select="Results/Measurement/Value/text()" /></skipped>
+	    </xsl:if>
+	</testcase>
+    </xsl:template>
+
+</xsl:stylesheet>
--- a/ci/README.md
+++ b/ci/README.md
@@ -0,0 +1,12 @@
+## Eigen CI infrastructure
+
+Eigen's CI infrastructure uses three stages:
+  1. A `checkformat` stage to verify MRs satisfy proper formatting style, as
+     defined by `clang-format`.
+  2. A `build` stage to build the unit-tests.
+  3. A `test` stage to run the unit-tests.
+
+For merge requests, only a small subset of tests are built/run, and only on a
+small subset of platforms.  This is to reduce our overall testing infrastructure
+resource usage.  In addition, we have nightly jobs that build and run the full
+suite of tests on most officially supported platforms.
--- a/ci/build.linux.gitlab-ci.yml
+++ b/ci/build.linux.gitlab-ci.yml
@@ -0,0 +1,331 @@
+# Base configuration for linux cross-compilation.
+.build:linux:cross:
+  extends: .common:linux:cross
+  stage: build
+  variables:
+    EIGEN_CI_BUILD_TARGET: buildtests
+  script:
+    - . ci/scripts/build.linux.script.sh
+  tags:
+    - linux
+    - eigen-runner
+    - cross-compiler
+  rules:
+    - if: $CI_PIPELINE_SOURCE == "schedule" && $CI_PROJECT_NAMESPACE == "libeigen"
+    - if: $CI_PIPELINE_SOURCE == "web" && $CI_PROJECT_NAMESPACE == "libeigen"
+    - if: $CI_PIPELINE_SOURCE == "push" && $CI_PROJECT_NAMESPACE == "libeigen"
+  cache:
+    key: "$CI_JOB_NAME_SLUG-$CI_COMMIT_REF_SLUG-BUILD"
+    paths:
+      - ${EIGEN_CI_BUILDDIR}/
+
+######## x86-64 ################################################################
+
+.build:linux:cross:x86-64:
+  extends: .build:linux:cross
+  variables:
+    EIGEN_CI_TARGET_ARCH: x86_64
+    EIGEN_CI_CROSS_TARGET_TRIPLE: x86_64-linux-gnu
+
+# GCC-6 (minimum on Ubuntu 18.04)
+build:linux:cross:x86-64:gcc-6:default:
+  extends: .build:linux:cross:x86-64
+  image: ubuntu:18.04
+  variables:
+    EIGEN_CI_C_COMPILER: gcc-6
+    EIGEN_CI_CXX_COMPILER: g++-6
+    EIGEN_CI_CROSS_INSTALL: g++-6-x86-64-linux-gnu
+    EIGEN_CI_CROSS_C_COMPILER: x86_64-linux-gnu-gcc-6
+    EIGEN_CI_CROSS_CXX_COMPILER: x86_64-linux-gnu-g++-6
+
+# GCC-10 (stable recent version)
+build:linux:cross:x86-64:gcc-10:default:
+  extends: .build:linux:cross:x86-64
+  variables:
+    EIGEN_CI_C_COMPILER: gcc-10
+    EIGEN_CI_CXX_COMPILER: g++-10
+    EIGEN_CI_CROSS_INSTALL: g++-10-x86-64-linux-gnu
+    EIGEN_CI_CROSS_C_COMPILER: x86_64-linux-gnu-gcc-10
+    EIGEN_CI_CROSS_CXX_COMPILER: x86_64-linux-gnu-g++-10
+
+build:linux:cross:x86-64:gcc-10:cxx03:
+  extends: build:linux:cross:x86-64:gcc-10:default
+  variables:
+    EIGEN_CI_ADDITIONAL_ARGS: "-DEIGEN_TEST_CXX11=off"
+
+build:linux:cross:x86-64:gcc-10:avx:
+  extends: build:linux:cross:x86-64:gcc-10:default
+  variables:
+    EIGEN_CI_ADDITIONAL_ARGS: "-DEIGEN_TEST_AVX=on"
+
+build:linux:cross:x86-64:gcc-10:avx2:
+  extends: build:linux:cross:x86-64:gcc-10:default
+  variables:
+    EIGEN_CI_ADDITIONAL_ARGS: "-DEIGEN_TEST_AVX2=on"
+
+build:linux:cross:x86-64:gcc-10:avx512dq:
+  extends: build:linux:cross:x86-64:gcc-10:default
+  variables:
+    EIGEN_CI_ADDITIONAL_ARGS: "-DEIGEN_TEST_AVX512DQ=on"
+
+# Clang-6 (minimum on Ubuntu 20.04)
+build:linux:cross:x86-64:clang-6:default:
+  extends: .build:linux:cross:x86-64
+  image: ubuntu:20.04
+  variables:
+    EIGEN_CI_INSTALL: g++-8 clang-6.0 lld-6.0
+    EIGEN_CI_C_COMPILER: clang-6.0
+    EIGEN_CI_CXX_COMPILER: clang++-6.0
+    EIGEN_CI_CROSS_INSTALL: g++-8-x86-64-linux-gnu clang-6.0 lld-6.0
+    EIGEN_CI_ADDITIONAL_ARGS: -DCMAKE_EXE_LINKER_FLAGS=-fuse-ld=lld-6.0
+
+# Clang-12 (stable recent version)
+build:linux:cross:x86-64:clang-12:default:
+  extends: .build:linux:cross:x86-64
+  variables:
+    EIGEN_CI_INSTALL: clang-12
+    EIGEN_CI_C_COMPILER: clang-12
+    EIGEN_CI_CXX_COMPILER: clang++-12
+    EIGEN_CI_CROSS_INSTALL: g++-10-x86-64-linux-gnu clang-12
+
+build:linux:cross:x86-64:clang-12:avx:
+  extends: build:linux:cross:x86-64:clang-12:default
+  variables:
+    EIGEN_CI_ADDITIONAL_ARGS: "-DEIGEN_TEST_AVX=on"
+
+build:linux:cross:x86-64:clang-12:avx2:
+  extends: build:linux:cross:x86-64:clang-12:default
+  variables:
+    EIGEN_CI_ADDITIONAL_ARGS: "-DEIGEN_TEST_AVX2=on"
+
+build:linux:cross:x86-64:clang-12:avx512dq:
+  extends: build:linux:cross:x86-64:clang-12:default
+  variables:
+    EIGEN_CI_ADDITIONAL_ARGS: "-DEIGEN_TEST_AVX512DQ=on"
+
+build:linux:doc:
+  extends: .build:linux:cross
+  variables:
+    EIGEN_CI_TARGET_ARCH: any
+    EIGEN_CI_BUILD_TARGET: doc
+    EIGEN_CI_INSTALL: ca-certificates clang flex python3 bison graphviz
+    EIGEN_CI_C_COMPILER: clang
+    EIGEN_CI_CXX_COMPILER: clang++
+    EIGEN_CI_BEFORE_SCRIPT: ". ci/scripts/build_and_install_doxygen.sh Release_1_13_2"
+  rules:
+    - if: $CI_PIPELINE_SOURCE == "push" && $CI_PROJECT_NAMESPACE == "libeigen"
+
+# # Sanitizers (Disabled because ASAN hangs and MSAN requires instrumented libc++)
+# build:linux:cross:x86-64:clang-12:default:asan:
+#   extends: build:linux:cross:x86-64:clang-12:default
+#   variables:
+#     EIGEN_CI_ADDITIONAL_ARGS:
+#       -DEIGEN_TEST_CUSTOM_CXX_FLAGS=-fsanitize=address,undefined
+#       -DEIGEN_TEST_CUSTOM_LINKER_FLAGS=-fsanitize=address,undefined
+
+# build:linux:cross:x86-64:clang-12:default:msan:
+#   extends: build:linux:cross:x86-64:clang-12:default
+#   variables:
+#     EIGEN_CI_ADDITIONAL_ARGS:
+#       -DEIGEN_TEST_CUSTOM_CXX_FLAGS=-fsanitize=memory
+#       -DEIGEN_TEST_CUSTOM_LINKER_FLAGS=-fsanitize=memory
+
+######## CUDA ##################################################################
+
+.build:linux:cuda:
+  extends: .build:linux:cross:x86-64
+  variables:
+    # Additional flags passed to the cuda compiler.
+    EIGEN_CI_CUDA_CXX_FLAGS: ""
+    # Compute architectures present in the GitLab CI runners.
+    EIGEN_CI_CUDA_COMPUTE_ARCH: "50;75"
+    EIGEN_CI_BUILD_TARGET: buildtests_gpu
+    EIGEN_CI_TEST_CUDA_CLANG: "off"
+    EIGEN_CI_ADDITIONAL_ARGS:
+      -DEIGEN_TEST_CUDA=on
+      -DEIGEN_CUDA_CXX_FLAGS=${EIGEN_CI_CUDA_CXX_FLAGS}
+      -DEIGEN_CUDA_COMPUTE_ARCH=${EIGEN_CI_CUDA_COMPUTE_ARCH}
+      -DEIGEN_TEST_CUDA_CLANG=${EIGEN_CI_TEST_CUDA_CLANG}
+  tags:
+    - eigen-runner
+    - linux
+    # Requires intel runner for cuda docker image support.
+    - x86-64
+
+# NVidia no longer provides docker images < CUDA 11.0.3.
+# # GCC-7, CUDA-9.2
+# build:linux:cuda-9.2:gcc-7:
+#   extends: .build:linux:cuda
+#   image: nvidia/cuda:9.2-devel-ubuntu18.04
+#   variables:
+#     # cuda 9.2 doesn't support sm_75, so lower to 70.
+#     EIGEN_CI_CUDA_COMPUTE_ARCH: "50;70"
+#     EIGEN_CI_C_COMPILER: gcc-7
+#     EIGEN_CI_CXX_COMPILER: g++-7
+
+# # Clang-10, CUDA-9.2
+# build:linux:cuda-9.2:clang-10:
+#   extends: build:linux:cuda-9.2:gcc-7
+#   variables:
+#     EIGEN_CI_C_COMPILER: clang-10
+#     EIGEN_CI_CXX_COMPILER: clang++-10
+#     EIGEN_CI_TEST_CUDA_CLANG: "on"
+
+# # GCC-8, CUDA-10.2
+# build:linux:cuda-10.2:gcc-8:
+#   extends: .build:linux:cuda
+#   image: nvidia/cuda:10.2-devel-ubuntu18.04
+#   variables:
+#     EIGEN_CI_C_COMPILER: gcc-8
+#     EIGEN_CI_CXX_COMPILER: g++-8
+
+# # Clang-10, CUDA-10.2
+# build:linux:cuda-10.2:clang-10:
+#   extends: build:linux:cuda-10.2:gcc-8
+#   variables:
+#     EIGEN_CI_C_COMPILER: clang-10
+#     EIGEN_CI_CXX_COMPILER: clang++-10
+#     EIGEN_CI_TEST_CUDA_CLANG: "on"
+
+# GCC-10, CUDA-11.4
+build:linux:cuda-11.4:gcc-10:
+  extends: .build:linux:cuda
+  image: nvidia/cuda:11.4.3-devel-ubuntu20.04
+  variables:
+    EIGEN_CI_C_COMPILER: gcc-10
+    EIGEN_CI_CXX_COMPILER: g++-10
+
+# Clang-12, CUDA-11.4
+build:linux:cuda-11.4:clang-12:
+  extends: build:linux:cuda-11.4:gcc-10
+  variables:
+    EIGEN_CI_C_COMPILER: clang-12
+    EIGEN_CI_CXX_COMPILER: clang++-12
+    EIGEN_CI_TEST_CUDA_CLANG: "on"
+
+# GCC-10, CUDA-12.2
+build:linux:cuda-12.2:gcc-10:
+  extends: .build:linux:cuda
+  image: nvidia/cuda:12.2.0-devel-ubuntu20.04
+  variables:
+    EIGEN_CI_C_COMPILER: gcc-10
+    EIGEN_CI_CXX_COMPILER: g++-10
+
+# Clang-12, CUDA-12.2
+build:linux:cuda-12.2:clang-12:
+  extends: build:linux:cuda-12.2:gcc-10
+  variables:
+    EIGEN_CI_C_COMPILER: clang-12
+    EIGEN_CI_CXX_COMPILER: clang++-12
+    EIGEN_CI_TEST_CUDA_CLANG: "on"
+
+# ######## HIP ###################################################################
+# Note: these are currently build-only, until we get an AMD-supported runner.
+
+# ROCm HIP
+build:linux:rocm-latest:gcc-10:
+  extends: .build:linux:cross
+  image: rocm/dev-ubuntu-24.04:latest
+  variables:
+    EIGEN_CI_C_COMPILER: gcc-10
+    EIGEN_CI_CXX_COMPILER: g++-10
+    EIGEN_CI_BUILD_TARGET: buildtests_gpu
+    EIGEN_CI_ADDITIONAL_ARGS: -DEIGEN_TEST_HIP=on
+  tags:
+    - eigen-runner
+    - linux
+    - x86-64
+
+######## Arm ###################################################################
+
+.build:linux:cross:arm:
+  extends: .build:linux:cross
+  variables:
+    EIGEN_CI_TARGET_ARCH: arm
+    EIGEN_CI_CROSS_TARGET_TRIPLE: arm-linux-gnueabihf
+    EIGEN_CI_ADDITIONAL_ARGS: >
+      -DEIGEN_TEST_CUSTOM_CXX_FLAGS=-march=armv7-a;-mfpu=neon-vfpv4
+
+build:linux:cross:arm:gcc-10:default:
+  extends: .build:linux:cross:arm
+  variables:
+    EIGEN_CI_CROSS_INSTALL: g++-10-arm-linux-gnueabihf
+    EIGEN_CI_CROSS_C_COMPILER: arm-linux-gnueabihf-gcc-10
+    EIGEN_CI_CROSS_CXX_COMPILER: arm-linux-gnueabihf-g++-10
+
+build:linux:cross:arm:clang-12:default:
+  extends: .build:linux:cross:arm
+  variables:
+    EIGEN_CI_INSTALL: clang-12
+    EIGEN_CI_C_COMPILER: clang-12
+    EIGEN_CI_CXX_COMPILER: clang++-12
+    EIGEN_CI_CROSS_INSTALL: g++-10-arm-linux-gnueabihf clang-12
+
+######## aarch64 ###############################################################
+
+.build:linux:cross:aarch64:
+  extends: .build:linux:cross
+  variables:
+    EIGEN_CI_TARGET_ARCH: aarch64
+    EIGEN_CI_CROSS_TARGET_TRIPLE: aarch64-linux-gnu
+    EIGEN_CI_ADDITIONAL_ARGS: -DEIGEN_TEST_CUSTOM_CXX_FLAGS=-march=armv8.2-a+fp16
+
+build:linux:cross:aarch64:gcc-10:default:
+  extends: .build:linux:cross:aarch64
+  variables:
+    EIGEN_CI_C_COMPILER: gcc-10
+    EIGEN_CI_CXX_COMPILER: g++-10
+    EIGEN_CI_CROSS_INSTALL: g++-10-aarch64-linux-gnu
+    EIGEN_CI_CROSS_C_COMPILER: aarch64-linux-gnu-gcc-10
+    EIGEN_CI_CROSS_CXX_COMPILER: aarch64-linux-gnu-g++-10
+
+build:linux:cross:aarch64:clang-12:default:
+  extends: .build:linux:cross:aarch64
+  variables:
+    EIGEN_CI_INSTALL: clang-12
+    EIGEN_CI_C_COMPILER: clang-12
+    EIGEN_CI_CXX_COMPILER: clang++-12
+    EIGEN_CI_CROSS_INSTALL: g++-10-aarch64-linux-gnu clang-12
+
+######## ppc64le ###############################################################
+
+.build:linux:cross:ppc64le:
+  extends: .build:linux:cross
+  variables:
+    EIGEN_CI_TARGET_ARCH: ppc64le
+    EIGEN_CI_CROSS_TARGET_TRIPLE: powerpc64le-linux-gnu
+
+build:linux:cross:ppc64le:gcc-10:default:
+  extends: .build:linux:cross:ppc64le
+  variables:
+    EIGEN_CI_C_COMPILER: gcc-10
+    EIGEN_CI_CXX_COMPILER: g++-10
+    EIGEN_CI_CROSS_INSTALL: g++-10-powerpc64le-linux-gnu
+    EIGEN_CI_CROSS_C_COMPILER: powerpc64le-linux-gnu-gcc-10
+    EIGEN_CI_CROSS_CXX_COMPILER: powerpc64le-linux-gnu-g++-10
+    # Temporarily disable MMA until #2457 is resolved.
+    EIGEN_CI_ADDITIONAL_ARGS: "-DEIGEN_ALTIVEC_DISABLE_MMA=1"
+
+build:linux:cross:ppc64le:clang-12:default:
+  extends: .build:linux:cross:ppc64le
+  variables:
+    EIGEN_CI_INSTALL: clang-12
+    EIGEN_CI_C_COMPILER: clang-12
+    EIGEN_CI_CXX_COMPILER: clang++-12
+    EIGEN_CI_CROSS_INSTALL: g++-10-powerpc64le-linux-gnu clang-12
+
+######## MR Smoke Tests ########################################################
+
+build:linux:cross:x86-64:gcc-10:default:smoketest:
+  extends: build:linux:cross:x86-64:gcc-10:default
+  variables:
+    EIGEN_CI_BUILD_TARGET: buildsmoketests
+  rules:
+    - if: $CI_PIPELINE_SOURCE == "merge_request_event"
+
+build:linux:cross:x86-64:clang-12:default:smoketest:
+  extends: build:linux:cross:x86-64:clang-12:default
+  variables:
+    EIGEN_CI_BUILD_TARGET: buildsmoketests
+  rules:
+    - if: $CI_PIPELINE_SOURCE == "merge_request_event"
--- a/ci/build.windows.gitlab-ci.yml
+++ b/ci/build.windows.gitlab-ci.yml
@@ -0,0 +1,114 @@
+# Base configuration for windows builds.
+.build:windows:
+  extends: .common:windows
+  stage: build
+  variables:
+    EIGEN_CI_BUILD_TARGET: buildtests
+    # Reduce overall build size and compile time.
+    # Note: /d2ReducedOptimizeHugeFunctions is only available in VS 2019.
+    EIGEN_CI_TEST_CUSTOM_CXX_FLAGS: "/d2ReducedOptimizeHugeFunctions;/DEIGEN_STRONG_INLINE=inline;/Os"
+  script:
+     - ./ci/scripts/build.windows.script.ps1
+  tags:
+    - eigen-runner
+    - windows
+    - x86-64
+  rules:
+    - if: $CI_PIPELINE_SOURCE == "schedule" && $CI_PROJECT_NAMESPACE == "libeigen"
+    - if: $CI_PIPELINE_SOURCE == "web" && $CI_PROJECT_NAMESPACE == "libeigen"
+    - if: $CI_PIPELINE_SOURCE == "push" && $CI_PROJECT_NAMESPACE == "libeigen"
+  cache:
+    key: "$CI_JOB_NAME-$CI_COMMIT_REF_SLUG-BUILD"
+    paths:
+      - ${EIGEN_CI_BUILDDIR}/
+
+######### MSVC #################################################################
+
+# MSVC 14.16 (VS 2017)
+build:windows:x86-64:msvc-14.16:default:
+  extends: .build:windows
+  variables:
+    EIGEN_CI_MSVC_VER: "14.16"
+    # Override to remove unsupported /d2ReducedOptimizeHugeFunctions.
+    EIGEN_CI_TEST_CUSTOM_CXX_FLAGS: "/DEIGEN_STRONG_INLINE=inline;/Os"
+
+# MSVC 14.29 (VS 2019)
+build:windows:x86-64:msvc-14.29:default:
+  extends: .build:windows
+  variables:
+    EIGEN_CI_MSVC_VER: "14.29"
+
+build:windows:x86-64:msvc-14.29:cxx03:
+  extends: build:windows:x86-64:msvc-14.29:default
+  variables:
+    EIGEN_CI_ADDITIONAL_ARGS: "-DEIGEN_TEST_CXX11=off"
+
+build:windows:x86-64:msvc-14.29:avx2:
+  extends: build:windows:x86-64:msvc-14.29:default
+  variables:
+    EIGEN_CI_ADDITIONAL_ARGS: "-DEIGEN_TEST_AVX2=on"
+
+build:windows:x86-64:msvc-14.29:avx512dq:
+  extends: build:windows:x86-64:msvc-14.29:default
+  variables:
+    EIGEN_CI_ADDITIONAL_ARGS: "-DEIGEN_TEST_AVX512DQ=on"
+
+######### MSVC + CUDA ##########################################################
+.build:windows:cuda:
+  extends: .build:windows
+  variables:
+    # Compute architectures present in the GitLab CI runners.
+    EIGEN_CI_CUDA_COMPUTE_ARCH: "50;75"
+    EIGEN_CI_BUILD_TARGET: buildtests_gpu
+    EIGEN_CI_ADDITIONAL_ARGS:
+      -DEIGEN_TEST_CUDA=on
+      -DEIGEN_CUDA_COMPUTE_ARCH=${EIGEN_CI_CUDA_COMPUTE_ARCH}
+  tags:
+    - eigen-runner
+    - windows
+    - x86-64
+    - cuda
+
+# The CUDA 9.2 compiler crashes with an internal error.
+# # MSVC 14.16 + CUDA 9.2
+# build:windows:x86-64:cuda-9.2:msvc-14.16:
+#   extends: .build:windows:cuda
+#   variables:
+#     # CUDA 9.2 doesn't support sm_75.
+#     EIGEN_CI_CUDA_COMPUTE_ARCH: "50;70"
+#     # CUDA 9.2 only supports up to VS 2017.
+#     EIGEN_CI_MSVC_VER: "14.16"
+#     EIGEN_CI_TEST_CUSTOM_CXX_FLAGS: "/DEIGEN_STRONG_INLINE=inline;/Os"
+#     EIGEN_CI_BEFORE_SCRIPT: $$env:CUDA_PATH=$$env:CUDA_PATH_V9_2
+
+# MSVC 14.29 + CUDA 10.2
+build:windows:x86-64:cuda-10.2:msvc-14.29:
+  extends: .build:windows:cuda
+  variables:
+    EIGEN_CI_MSVC_VER: "14.29"
+    EIGEN_CI_BEFORE_SCRIPT: $$env:CUDA_PATH=$$env:CUDA_PATH_V10_2
+
+# MSVC 14.29 + CUDA 11.4
+build:windows:x86-64:cuda-11.4:msvc-14.29:
+  extends: .build:windows:cuda
+  variables:
+    EIGEN_CI_MSVC_VER: "14.29"
+    EIGEN_CI_BEFORE_SCRIPT: $$env:CUDA_PATH=$$env:CUDA_PATH_V11_4
+
+######## MR Smoke Tests ########################################################
+
+# MSVC 14.29 64-bit (VS 2019)
+build:windows:x86-64:msvc-14.29:avx512dq:smoketest:
+  extends: build:windows:x86-64:msvc-14.29:avx512dq
+  variables:
+    EIGEN_CI_BUILD_TARGET: buildsmoketests
+  rules:
+    - if: $CI_PIPELINE_SOURCE == "merge_request_event"
+
+# MSVC 14.29 32-bit (VS 2019)
+build:windows:x86:msvc-14.29:avx512dq:smoketest:
+  extends: build:windows:x86-64:msvc-14.29:avx512dq:smoketest
+  variables:
+    EIGEN_CI_MSVC_ARCH: "x86"
+  rules:
+    - if: $CI_PIPELINE_SOURCE == "merge_request_event"
--- a/ci/checkformat.gitlab-ci.yml
+++ b/ci/checkformat.gitlab-ci.yml
@@ -0,0 +1,10 @@
+checkformat:clangformat:
+  stage: checkformat
+  image: alpine:3.19
+  only:
+    - merge_requests
+  allow_failure: true
+  before_script:
+    - apk add --no-cache git clang17-extra-tools python3
+  script:
+    - git clang-format --diff --commit ${CI_MERGE_REQUEST_DIFF_BASE_SHA}
--- a/ci/common.gitlab-ci.yml
+++ b/ci/common.gitlab-ci.yml
@@ -0,0 +1,40 @@
+# Base configuration for linux builds and tests.
+.common:linux:cross:
+  image: ubuntu:20.04
+  variables:
+    EIGEN_CI_TARGET_ARCH: ""
+    EIGEN_CI_ADDITIONAL_ARGS: ""
+    # If host matches target, use the following:
+    EIGEN_CI_C_COMPILER: ""
+    EIGEN_CI_CXX_COMPILER: ""
+    EIGEN_CI_INSTALL: "${EIGEN_CI_C_COMPILER} ${EIGEN_CI_CXX_COMPILER}"
+    # If host does not match the target, use the following:
+    EIGEN_CI_CROSS_TARGET_TRIPLE: ""
+    EIGEN_CI_CROSS_C_COMPILER: ${EIGEN_CI_C_COMPILER}
+    EIGEN_CI_CROSS_CXX_COMPILER: ${EIGEN_CI_CXX_COMPILER}
+    EIGEN_CI_CROSS_INSTALL: "${EIGEN_CI_CROSS_C_COMPILER} ${EIGEN_CI_CROSS_CXX_COMPILER}"
+  before_script:
+    # Call script in current shell - it sets up some environment variables.
+    - . ci/scripts/common.linux.before_script.sh
+  artifacts:
+    when: always
+    name: "$CI_JOB_NAME_SLUG-$CI_COMMIT_REF_SLUG"
+    paths:
+      - ${EIGEN_CI_BUILDDIR}/
+    expire_in: 5 days
+
+# Base configuration for Windows builds and tests.
+.common:windows:
+  variables:
+    EIGEN_CI_MSVC_ARCH: x64
+    EIGEN_CI_MSVC_VER: "14.29"
+    EIGEN_CI_ADDITIONAL_ARGS: ""
+    EIGEN_CI_BEFORE_SCRIPT: ""
+  before_script:
+     - . ci/scripts/common.windows.before_script.ps1
+  artifacts:
+    when: always
+    name: "$CI_JOB_NAME_SLUG-$CI_COMMIT_REF_NAME"
+    paths:
+      - ${EIGEN_CI_BUILDDIR}/
+    expire_in: 5 days
--- a/ci/deploy.gitlab-ci.yml
+++ b/ci/deploy.gitlab-ci.yml
@@ -0,0 +1,43 @@
+ # Push a nightly tag if the pipeline succeeded.
+deploy:tag:nightly:
+  stage: deploy
+  image: alpine:edge
+  dependencies: []
+  before_script:
+    - apk add git
+  script:
+    - git tag -f nightly $CI_COMMIT_SHORT_SHA
+    - git push -f $EIGEN_CI_GIT_PUSH_URL tag nightly
+  tags:
+    - linux
+    - eigen-runner
+  rules:
+    - if: $CI_PIPELINE_SOURCE == "schedule" && $CI_PROJECT_NAMESPACE == "libeigen" && $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH
+    - if: $CI_PIPELINE_SOURCE == "web" && $CI_PROJECT_NAMESPACE == "libeigen" && $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH
+
+# Upload docs if pipeline succeeded.
+deploy:docs:
+  stage: deploy
+  image: busybox
+  dependencies: [ build:linux:doc ]
+  variables:
+    PAGES_PREFIX: docs-nightly
+  script:
+    - echo "Deploying site to $CI_PAGES_URL"
+    - mv ${EIGEN_CI_BUILDDIR}/doc/html public
+  pages:
+    path_prefix: $PAGES_PREFIX
+    expire_in: never
+  artifacts:
+    name: "$CI_JOB_NAME_SLUG-$CI_COMMIT_REF_SLUG"
+    paths:
+      - public
+  tags:
+    - eigen-runner
+    - linux
+  rules:
+    - if: $CI_PIPELINE_SOURCE == "schedule" && $CI_PROJECT_NAMESPACE == "libeigen"
+    - if: $CI_PIPELINE_SOURCE == "web" && $CI_PROJECT_NAMESPACE == "libeigen"
+    - if: $CI_PIPELINE_SOURCE == "push" && $CI_PROJECT_NAMESPACE == "libeigen"
+      variables:
+        PAGES_PREFIX: docs-$CI_COMMIT_REF_NAME
--- a/ci/scripts/build.linux.script.sh
+++ b/ci/scripts/build.linux.script.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+
+set -x
+
+# Create and enter build directory.
+rootdir=`pwd`
+mkdir -p ${EIGEN_CI_BUILDDIR}
+cd ${EIGEN_CI_BUILDDIR}
+
+# Configure build.
+cmake -G Ninja                                                   \
+  -DCMAKE_CXX_COMPILER=${EIGEN_CI_CXX_COMPILER}                  \
+  -DCMAKE_C_COMPILER=${EIGEN_CI_C_COMPILER}                      \
+  -DCMAKE_CXX_COMPILER_TARGET=${EIGEN_CI_CXX_COMPILER_TARGET}    \
+  ${EIGEN_CI_ADDITIONAL_ARGS} ${rootdir}
+
+target=""
+if [[ ${EIGEN_CI_BUILD_TARGET} ]]; then
+  target="--target ${EIGEN_CI_BUILD_TARGET}"
+fi
+
+# Builds (particularly gcc) sometimes get killed, potentially when running
+# out of resources.  In that case, keep trying to build the remaining
+# targets (k0), then try to build again with a single thread (j1) to minimize
+# resource use.
+cmake --build . ${target} -- -k0 || cmake --build . ${target} -- -k0 -j1
+
+# Return to root directory.
+cd ${rootdir}
+
+set +x
--- a/ci/scripts/build.windows.script.ps1
+++ b/ci/scripts/build.windows.script.ps1
@@ -0,0 +1,44 @@
+# Find Visual Studio installation directory.
+$VS_INSTALL_DIR = &"${Env:ProgramFiles(x86)}\Microsoft Visual Studio\Installer\vswhere.exe" -latest -property installationPath
+
+# Run VCVarsAll.bat initialization script and extract environment variables.
+# http://allen-mack.blogspot.com/2008/03/replace-visual-studio-command-prompt.html
+cmd.exe /c "`"${VS_INSTALL_DIR}\VC\Auxiliary\Build\vcvarsall.bat`" $EIGEN_CI_MSVC_ARCH -vcvars_ver=$EIGEN_CI_MSVC_VER & set" |
+  foreach {
+    if ($_ -match "=") {
+      $v = $_.split("="); set-item -force -path "ENV:\$($v[0])" -value "$($v[1])"
+    }
+  }
+
+# Create and enter build directory.
+$rootdir = Get-Location
+if (-Not (Test-Path ${EIGEN_CI_BUILDDIR})) {
+    mkdir $EIGEN_CI_BUILDDIR
+}
+cd $EIGEN_CI_BUILDDIR
+
+# We need to split EIGEN_CI_ADDITIONAL_ARGS, otherwise they are interpreted
+# as a single argument.  Split by space, unless double-quoted.
+$split_args = [regex]::Split(${EIGEN_CI_ADDITIONAL_ARGS}, ' (?=(?:[^"]|"[^"]*")*$)' )
+
+# Configure build.
+cmake -G Ninja -DCMAKE_BUILD_TYPE=MinSizeRel `
+      -DEIGEN_TEST_CUSTOM_CXX_FLAGS="${EIGEN_CI_TEST_CUSTOM_CXX_FLAGS}" `
+      ${split_args} "${rootdir}"
+
+$target = ""
+if (${EIGEN_CI_BUILD_TARGET}) {
+  $target = "--target ${EIGEN_CI_BUILD_TARGET}"
+}
+
+# Windows builds sometimes fail due heap errors. In that case, try
+# building the rest, then try to build again with a single thread.
+cmake --build . ${target} -- -k0 || cmake --build . ${target} -- -k0 -j1
+
+$success = $LASTEXITCODE
+
+# Return to root directory.
+cd ${rootdir}
+
+# Explicitly propagate exit code to indicate pass/failure of build command.
+if($success -ne 0) { Exit $success }
--- a/ci/scripts/build_and_install_doxygen.sh
+++ b/ci/scripts/build_and_install_doxygen.sh
@@ -0,0 +1,6 @@
+git clone --depth 1 --branch $1 https://github.com/doxygen/doxygen.git
+cmake -B doxygen/.build -G Ninja                                 \
+  -DCMAKE_CXX_COMPILER=${EIGEN_CI_CXX_COMPILER}                  \
+  -DCMAKE_C_COMPILER=${EIGEN_CI_C_COMPILER}                      \
+  doxygen
+cmake --build doxygen/.build -t install
--- a/ci/scripts/common.linux.before_script.sh
+++ b/ci/scripts/common.linux.before_script.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+
+set -x
+
+echo "Running ${CI_JOB_NAME}"
+
+# Get architecture and display CI configuration.
+export ARCH=`uname -m`
+export NPROC=`nproc`
+echo "arch=$ARCH, target=${EIGEN_CI_TARGET_ARCH}"
+echo "Processors: ${NPROC}"
+echo "CI Variables:"
+export | grep EIGEN
+
+# Set noninteractive, otherwise tzdata may be installed and prompt for a
+# geographical region.
+export DEBIAN_FRONTEND=noninteractive
+apt-get update -y > /dev/null
+apt-get install -y --no-install-recommends ninja-build cmake git > /dev/null
+
+# Install required dependencies and set up compilers.
+# These are required even for testing to ensure that dynamic runtime libraries
+# are available.
+if [[ "$ARCH" == "${EIGEN_CI_TARGET_ARCH}" || "${EIGEN_CI_TARGET_ARCH}" == "any" ]]; then
+  apt-get install -y --no-install-recommends ${EIGEN_CI_INSTALL} > /dev/null;
+  export EIGEN_CI_CXX_IMPLICIT_INCLUDE_DIRECTORIES="";
+  export EIGEN_CI_CXX_COMPILER_TARGET="";
+else
+  apt-get install -y --no-install-recommends ${EIGEN_CI_CROSS_INSTALL} > /dev/null;
+  export EIGEN_CI_C_COMPILER=${EIGEN_CI_CROSS_C_COMPILER};
+  export EIGEN_CI_CXX_COMPILER=${EIGEN_CI_CROSS_CXX_COMPILER};
+  export EIGEN_CI_CXX_COMPILER_TARGET=${EIGEN_CI_CROSS_TARGET_TRIPLE};
+  # Tell the compiler where to find headers and libraries if using clang.
+  # NOTE: this breaks GCC since it messes with include path order
+  #       (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70129)
+  if [[ "${EIGEN_CI_CROSS_CXX_COMPILER}" == *"clang"* ]]; then
+    export CPLUS_INCLUDE_PATH="/usr/${EIGEN_CI_CROSS_TARGET_TRIPLE}/include";
+    export LIBRARY_PATH="/usr/${EIGEN_CI_CROSS_TARGET_TRIPLE}/lib64";
+  fi
+fi
+
+echo "Compilers: ${EIGEN_CI_C_COMPILER} ${EIGEN_CI_CXX_COMPILER}"
+
+if [ -n "$EIGEN_CI_BEFORE_SCRIPT" ]; then eval "$EIGEN_CI_BEFORE_SCRIPT"; fi
+
+set +x
--- a/ci/scripts/common.windows.before_script.ps1
+++ b/ci/scripts/common.windows.before_script.ps1
@@ -0,0 +1,8 @@
+echo "Running ${CI_JOB_NAME}"
+
+# Print configuration variables.
+Get-Variable EIGEN* | Format-Table -Wrap
+Get-Variable CMAKE* | Format-Table -Wrap
+
+# Run a custom before-script command.
+if ("${EIGEN_CI_BEFORE_SCRIPT}") { Invoke-Expression -Command "${EIGEN_CI_BEFORE_SCRIPT}" }
--- a/ci/scripts/test.linux.after_script.sh
+++ b/ci/scripts/test.linux.after_script.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+set -x
+
+# Enter build directory.
+rootdir=`pwd`
+cd ${EIGEN_CI_BUILDDIR}
+
+# Install xml processor.
+apt-get update -y
+apt-get install --no-install-recommends -y xsltproc
+
+# Generate test results.
+xsltproc ${rootdir}/ci/CTest2JUnit.xsl Testing/`head -n 1 < Testing/TAG`/Test.xml > "JUnitTestResults_$CI_JOB_ID.xml"
+
+# Return to root directory.
+cd ${rootdir}
+
+set +x
--- a/ci/scripts/test.linux.script.sh
+++ b/ci/scripts/test.linux.script.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+
+set -x
+
+# Enter build directory.
+rootdir=`pwd`
+cd ${EIGEN_CI_BUILDDIR}
+
+target=""
+if [[ ${EIGEN_CI_CTEST_REGEX} ]]; then
+  target="-R ${EIGEN_CI_CTEST_REGEX}"
+elif [[ ${EIGEN_CI_CTEST_LABEL} ]]; then
+  target="-L ${EIGEN_CI_CTEST_LABEL}"
+fi
+
+# Repeat tests up to three times to ignore flakes.  Do not re-run with -T test,
+# otherwise we lose test results for those that passed.
+# Note: starting with CMake 3.17, we can use --repeat until-pass:3, but we have
+#       no way of easily installing this on ppc64le.
+ctest ${EIGEN_CI_CTEST_ARGS} --parallel ${NPROC} \
+      --output-on-failure --no-compress-output  \
+      --build-no-clean -T test ${target} || \
+  ctest ${EIGEN_CI_CTEST_ARGS} --parallel ${NPROC} \
+      --output-on-failure --no-compress-output --rerun-failed || \
+  ctest ${EIGEN_CI_CTEST_ARGS} --parallel ${NPROC} \
+      --output-on-failure --no-compress-output --rerun-failed
+
+# Return to root directory.
+cd ${rootdir}
+
+set +x
--- a/ci/scripts/test.windows.after_script.ps1
+++ b/ci/scripts/test.windows.after_script.ps1
@@ -0,0 +1,17 @@
+# Change to build directory.
+$rootdir = Get-Location
+cd ${EIGEN_CI_BUILDDIR}
+
+# Determine the appropriate test tag and results file.
+$TEST_TAG = Get-Content Testing\TAG | select -first 1
+
+# PowerShell equivalent to xsltproc:
+$XSL_FILE = Resolve-Path "..\ci\CTest2JUnit.xsl"
+$INPUT_FILE = Resolve-Path Testing\$TEST_TAG\Test.xml
+$OUTPUT_FILE = Join-Path -Path $pwd -ChildPath JUnitTestResults_$CI_JOB_ID.xml
+$xslt = New-Object System.Xml.Xsl.XslCompiledTransform;
+$xslt.Load($XSL_FILE)
+$xslt.Transform($INPUT_FILE,$OUTPUT_FILE)
+
+# Return to root directory.
+cd ${rootdir}
--- a/ci/scripts/test.windows.script.ps1
+++ b/ci/scripts/test.windows.script.ps1
@@ -0,0 +1,30 @@
+# Change to build directory.
+$rootdir = Get-Location
+cd $EIGEN_CI_BUILDDIR
+
+# Determine number of processors for parallel tests.
+$NPROC=${Env:NUMBER_OF_PROCESSORS}
+
+# Set target based on regex or label.
+$target = ""
+if (${EIGEN_CI_CTEST_REGEX}) {
+  $target = "-R","${EIGEN_CI_CTEST_REGEX}"
+} elseif (${EIGEN_CI_CTEST_LABEL}) {
+  $target = "-L","${EIGEN_CI_CTEST_LABEL}"
+}
+
+# Repeat tests up to three times to ignore flakes.  Do not re-run with -T test,
+# otherwise we lose test results for those that passed.
+# Note: starting with CMake 3.17, we can use --repeat until-pass:3, but we have
+#       no way of easily installing this on ppc64le.
+ctest $EIGEN_CI_CTEST_ARGS -j$NPROC --output-on-failure --no-compress-output --build-no-clean -T test $target || `
+  ctest $EIGEN_CI_CTEST_ARGS -j$NPROC --output-on-failure --no-compress-output --rerun-failed || `
+  ctest $EIGEN_CI_CTEST_ARGS -j$NPROC --output-on-failure --no-compress-output --rerun-failed
+
+$success = $LASTEXITCODE
+
+# Return to root directory.
+cd ${rootdir}
+
+# Explicitly propagate exit code to indicate pass/failure of test command.
+if($success -ne 0) { Exit $success }
--- a/ci/scripts/vars.linux.sh
+++ b/ci/scripts/vars.linux.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+
+# Initialize default variables used by the CI.
+export EIGEN_CI_ADDITIONAL_ARGS=""
+export EIGEN_CI_BEFORE_SCRIPT=""
+export EIGEN_CI_BUILD_TARGET="buildtests"
+export EIGEN_CI_BUILDDIR=".build"
+export EIGEN_CI_C_COMPILER="clang"
+export EIGEN_CI_CXX_COMPILER="clang++"
+export EIGEN_CI_TEST_CUSTOM_CXX_FLAGS=""
+export EIGEN_CI_CTEST_LABEL="Official"
+export EIGEN_CI_CTEST_REGEX=""
+export EIGEN_CI_CTEST_ARGS=""
+
--- a/ci/scripts/vars.windows.ps1
+++ b/ci/scripts/vars.windows.ps1
@@ -0,0 +1,11 @@
+# Initialize default variables used by the CI.
+$EIGEN_CI_ADDITIONAL_ARGS       = ""
+$EIGEN_CI_BEFORE_SCRIPT         = ""
+$EIGEN_CI_BUILD_TARGET          = "buildtests"
+$EIGEN_CI_BUILDDIR              = ".build"
+$EIGEN_CI_MSVC_ARCH             = "x64"
+$EIGEN_CI_MSVC_VER              = "14.29"
+$EIGEN_CI_TEST_CUSTOM_CXX_FLAGS = "/d2ReducedOptimizeHugeFunctions /DEIGEN_STRONG_INLINE=inline /Os"
+$EIGEN_CI_CTEST_LABEL           = "Official"
+$EIGEN_CI_CTEST_REGEX           = ""
+$EIGEN_CI_CTEST_ARGS            = ""
--- a/ci/smoketests.gitlab-ci.yml
+++ b/ci/smoketests.gitlab-ci.yml
@@ -0,0 +1,107 @@
+.buildsmoketests:linux:base:
+  stage: buildsmoketests
+  image: ubuntu:18.04
+  before_script:
+    - apt-get update -y
+    - apt-get install -y --no-install-recommends software-properties-common
+    - add-apt-repository -y  ppa:ubuntu-toolchain-r/test
+    - apt-get update
+    - apt-get install --no-install-recommends -y ${EIGEN_CI_CXX_COMPILER}
+      ${EIGEN_CI_CC_COMPILER} cmake ninja-build
+  script:
+    - mkdir -p ${BUILDDIR} && cd ${BUILDDIR}
+    - CXX=${EIGEN_CI_CXX_COMPILER} CC=${EIGEN_CI_CC_COMPILER} cmake -G
+      ${EIGEN_CI_CMAKE_GENEATOR} -DEIGEN_TEST_CXX11=${EIGEN_TEST_CXX11}
+      ${EIGEN_CI_ADDITIONAL_ARGS} ..
+    - cmake --build . --target buildsmoketests
+  artifacts:
+    name: "$CI_JOB_NAME-$CI_COMMIT_REF_NAME"
+    paths:
+      - ${BUILDDIR}/
+    expire_in: 5 days
+  only:
+    - merge_requests
+
+buildsmoketests:x86-64:linux:gcc-10:cxx11-off:
+  extends: .buildsmoketests:linux:base
+  variables:
+    EIGEN_CI_CXX_COMPILER: "g++-10"
+    EIGEN_CI_CC_COMPILER: "gcc-10"
+    EIGEN_TEST_CXX11: "off"
+
+buildsmoketests:x86-64:linux:gcc-10:cxx11-on:
+  extends: .buildsmoketests:linux:base
+  variables:
+    EIGEN_CI_CXX_COMPILER: "g++-10"
+    EIGEN_CI_CC_COMPILER: "gcc-10"
+    EIGEN_TEST_CXX11: "on"
+
+buildsmoketests:x86-64:linux:clang-10:cxx11-off:
+  extends: .buildsmoketests:linux:base
+  variables:
+    EIGEN_CI_CXX_COMPILER: "clang++-10"
+    EIGEN_CI_CC_COMPILER: "clang-10"
+    EIGEN_TEST_CXX11: "off"
+
+buildsmoketests:x86-64:linux:clang-10:cxx11-on:
+  extends: .buildsmoketests:linux:base
+  variables:
+    EIGEN_CI_CXX_COMPILER: "clang++-10"
+    EIGEN_CI_CC_COMPILER: "clang-10"
+    EIGEN_TEST_CXX11: "on"
+
+.smoketests:linux:base:
+  stage: smoketests
+  image: ubuntu:18.04
+  before_script:
+    - apt-get update -y
+    - apt-get install -y --no-install-recommends software-properties-common
+    - add-apt-repository -y ppa:ubuntu-toolchain-r/test
+    - apt-get update
+    - apt-get install --no-install-recommends -y ${EIGEN_CI_CXX_COMPILER}
+      ${EIGEN_CI_CC_COMPILER} cmake ninja-build xsltproc
+  script:
+    - export CXX=${EIGEN_CI_CXX_COMPILER}
+    - export CC=${EIGEN_CI_CC_COMPILER}
+    - cd ${BUILDDIR} && ctest --output-on-failure --no-compress-output
+      --build-no-clean -T test -L smoketest
+  after_script:
+    - apt-get update -y
+    - apt-get install --no-install-recommends -y xsltproc
+    - cd ${BUILDDIR}
+    - xsltproc ../ci/CTest2JUnit.xsl Testing/`head -n 1 < Testing/TAG`/Test.xml > "JUnitTestResults_$CI_JOB_ID.xml"
+  artifacts:
+    reports:
+      junit:
+        - ${BUILDDIR}/JUnitTestResults_$CI_JOB_ID.xml
+    expire_in: 5 days
+  only:
+    - merge_requests
+
+smoketests:x86-64:linux:gcc-10:cxx11-off:
+  extends: .smoketests:linux:base
+  variables:
+    EIGEN_CI_CXX_COMPILER: g++-10
+    EIGEN_CI_CC_COMPILER: gcc-10
+  needs: [ "buildsmoketests:x86-64:linux:gcc-10:cxx11-off" ]
+
+smoketests:x86-64:linux:gcc-10:cxx11-on:
+  extends: .smoketests:linux:base
+  variables:
+    EIGEN_CI_CXX_COMPILER: g++-10
+    EIGEN_CI_CC_COMPILER: gcc-10
+  needs: [ "buildsmoketests:x86-64:linux:gcc-10:cxx11-on" ]
+
+smoketests:x86-64:linux:clang-10:cxx11-off:
+  extends: .smoketests:linux:base
+  variables:
+    EIGEN_CI_CXX_COMPILER: clang++-10
+    EIGEN_CI_CC_COMPILER: clang-10
+  needs: [ "buildsmoketests:x86-64:linux:clang-10:cxx11-off" ]
+
+smoketests:x86-64:linux:clang-10:cxx11-on:
+  extends: .smoketests:linux:base
+  variables:
+    EIGEN_CI_CXX_COMPILER: clang++-10
+    EIGEN_CI_CC_COMPILER: clang-10
+  needs: [ "buildsmoketests:x86-64:linux:clang-10:cxx11-on" ]
--- a/ci/test.linux.gitlab-ci.yml
+++ b/ci/test.linux.gitlab-ci.yml
@@ -0,0 +1,451 @@
+.test:linux:
+  extends: .common:linux:cross
+  stage: test
+  script:
+    - . ci/scripts/test.linux.script.sh
+  after_script:
+    - . ci/scripts/test.linux.after_script.sh
+  rules:
+    - if: $CI_PIPELINE_SOURCE == "schedule" && $CI_PROJECT_NAMESPACE == "libeigen"
+    - if: $CI_PIPELINE_SOURCE == "web" && $CI_PROJECT_NAMESPACE == "libeigen"
+    - if: $CI_PIPELINE_SOURCE == "push" && $CI_PROJECT_NAMESPACE == "libeigen"
+
+##### x86-64 ###################################################################
+.test:linux:x86-64:
+  extends: .test:linux
+  variables:
+    EIGEN_CI_TARGET_ARCH: x86_64
+    EIGEN_CI_CROSS_TARGET_TRIPLE: x86_64-linux-gnu
+  tags:
+    - eigen-runner
+    - linux
+    - x86-64
+
+# GCC-6 (minimum on Ubuntu 18.04)
+.test:linux:x86-64:gcc-6:default:
+  extends: .test:linux:x86-64
+  image: ubuntu:18.04
+  needs: [ build:linux:cross:x86-64:gcc-6:default ]
+  variables:
+    EIGEN_CI_INSTALL: g++-6
+
+test:linux:x86-64:gcc-6:default:official:
+  extends: .test:linux:x86-64:gcc-6:default
+  variables:
+    EIGEN_CI_CTEST_LABEL: Official
+
+test:linux:x86-64:gcc-6:default:unsupported:
+  extends: .test:linux:x86-64:gcc-6:default
+  variables:
+    EIGEN_CI_CTEST_LABEL: Unsupported
+
+# GCC-10 (modern stable)
+.test:linux:x86-64:gcc-10:default:
+  extends: .test:linux:x86-64
+  needs: [ build:linux:cross:x86-64:gcc-10:default ]
+  variables:
+    EIGEN_CI_INSTALL: g++-10
+
+test:linux:x86-64:gcc-10:default:official:
+  extends: .test:linux:x86-64:gcc-10:default
+  variables:
+    EIGEN_CI_CTEST_LABEL: Official
+
+test:linux:x86-64:gcc-10:default:unsupported:
+  extends: .test:linux:x86-64:gcc-10:default
+  variables:
+    EIGEN_CI_CTEST_LABEL: Unsupported
+
+.test:linux:x86-64:gcc-10:cxx03:
+  extends: .test:linux:x86-64
+  needs: [ build:linux:cross:x86-64:gcc-10:cxx03 ]
+  variables:
+    EIGEN_CI_INSTALL: g++-10
+
+test:linux:x86-64:gcc-10:cxx03:official:
+  extends: .test:linux:x86-64:gcc-10:cxx03
+  variables:
+    EIGEN_CI_CTEST_LABEL: Official
+
+test:linux:x86-64:gcc-10:cxx03:unsupported:
+  extends: .test:linux:x86-64:gcc-10:cxx03
+  variables:
+    EIGEN_CI_CTEST_LABEL: Unsupported
+
+.test:linux:x86-64:gcc-10:avx:
+  extends: .test:linux:x86-64
+  needs: [ build:linux:cross:x86-64:gcc-10:avx ]
+  variables:
+    EIGEN_CI_INSTALL: g++-10
+
+test:linux:x86-64:gcc-10:avx:official:
+  extends: .test:linux:x86-64:gcc-10:avx
+  variables:
+    EIGEN_CI_CTEST_LABEL: Official
+
+test:linux:x86-64:gcc-10:avx:unsupported:
+  extends: .test:linux:x86-64:gcc-10:avx
+  variables:
+    EIGEN_CI_CTEST_LABEL: Unsupported
+
+.test:linux:x86-64:gcc-10:avx2:
+  extends: .test:linux:x86-64
+  needs: [ build:linux:cross:x86-64:gcc-10:avx2 ]
+  variables:
+    EIGEN_CI_INSTALL: g++-10
+
+test:linux:x86-64:gcc-10:avx2:official:
+  extends: .test:linux:x86-64:gcc-10:avx2
+  variables:
+    EIGEN_CI_CTEST_LABEL: Official
+
+test:linux:x86-64:gcc-10:avx2:unsupported:
+  extends: .test:linux:x86-64:gcc-10:avx2
+  variables:
+    EIGEN_CI_CTEST_LABEL: Unsupported
+
+.test:linux:x86-64:gcc-10:avx512dq:
+  extends: .test:linux:x86-64
+  needs: [ build:linux:cross:x86-64:gcc-10:avx512dq ]
+  variables:
+    EIGEN_CI_INSTALL: g++-10
+  tags:
+    - eigen-runner
+    - linux
+    - x86-64
+    - avx512
+
+test:linux:x86-64:gcc-10:avx512dq:official:
+  extends: .test:linux:x86-64:gcc-10:avx512dq
+  variables:
+    EIGEN_CI_CTEST_LABEL: Official
+
+test:linux:x86-64:gcc-10:avx512dq:unsupported:
+  extends: .test:linux:x86-64:gcc-10:avx512dq
+  variables:
+    EIGEN_CI_CTEST_LABEL: Unsupported
+
+# Clang-6 (minimum on Ubuntu 20.04)
+.test:linux:x86-64:clang-6:default:
+  extends: .test:linux:x86-64
+  image: ubuntu:20.04
+  needs: [ build:linux:cross:x86-64:clang-6:default ]
+  variables:
+    EIGEN_CI_INSTALL: g++-8 clang-6.0 lld-6.0
+
+test:linux:x86-64:clang-6:default:official:
+  extends: .test:linux:x86-64:clang-6:default
+  variables:
+    EIGEN_CI_CTEST_LABEL: Official
+
+test:linux:x86-64:clang-6:default:unsupported:
+  extends: .test:linux:x86-64:clang-6:default
+  variables:
+    EIGEN_CI_CTEST_LABEL: Unsupported
+
+# Clang-12 (modern stable)
+.test:linux:x86-64:clang-12:default:
+  extends: .test:linux:x86-64
+  needs: [ build:linux:cross:x86-64:clang-12:default ]
+  variables:
+    EIGEN_CI_INSTALL: clang-12
+
+test:linux:x86-64:clang-12:default:official:
+  extends: .test:linux:x86-64:clang-12:default
+  variables:
+    EIGEN_CI_CTEST_LABEL: Official
+
+test:linux:x86-64:clang-12:default:unsupported:
+  extends: .test:linux:x86-64:clang-12:default
+  variables:
+    EIGEN_CI_CTEST_LABEL: Unsupported
+
+.test:linux:x86-64:clang-12:avx:
+  extends: .test:linux:x86-64
+  needs: [ build:linux:cross:x86-64:clang-12:avx ]
+  variables:
+    EIGEN_CI_INSTALL: clang-12
+
+test:linux:x86-64:clang-12:avx:official:
+  extends: .test:linux:x86-64:clang-12:avx
+  variables:
+    EIGEN_CI_CTEST_LABEL: Official
+
+test:linux:x86-64:clang-12:avx:unsupported:
+  extends: .test:linux:x86-64:clang-12:avx
+  variables:
+    EIGEN_CI_CTEST_LABEL: Unsupported
+
+.test:linux:x86-64:clang-12:avx2:
+  extends: .test:linux:x86-64
+  needs: [ build:linux:cross:x86-64:clang-12:avx2 ]
+  variables:
+    EIGEN_CI_INSTALL: clang-12
+
+test:linux:x86-64:clang-12:avx2:official:
+  extends: .test:linux:x86-64:clang-12:avx2
+  variables:
+    EIGEN_CI_CTEST_LABEL: Official
+
+test:linux:x86-64:clang-12:avx2:unsupported:
+  extends: .test:linux:x86-64:clang-12:avx2
+  variables:
+    EIGEN_CI_CTEST_LABEL: Unsupported
+
+.test:linux:x86-64:clang-12:avx512dq:
+  extends: .test:linux:x86-64
+  needs: [ build:linux:cross:x86-64:clang-12:avx512dq ]
+  variables:
+    EIGEN_CI_INSTALL: clang-12
+  tags:
+    - eigen-runner
+    - linux
+    - x86-64
+    - avx512
+
+test:linux:x86-64:clang-12:avx512dq:official:
+  extends: .test:linux:x86-64:clang-12:avx512dq
+  variables:
+    EIGEN_CI_CTEST_LABEL: Official
+
+test:linux:x86-64:clang-12:avx512dq:unsupported:
+  extends: .test:linux:x86-64:clang-12:avx512dq
+  variables:
+    EIGEN_CI_CTEST_LABEL: Unsupported
+
+##### CUDA #####################################################################
+.test:linux:cuda:
+  extends: .test:linux
+  allow_failure: true
+  variables:
+    EIGEN_CI_CTEST_LABEL: gpu
+  tags:
+    - eigen-runner
+    - linux
+    - x86-64
+    - cuda
+
+# NVidia no longer provides docker images < CUDA 11.0.3.
+# # GCC-7, CUDA-9.2
+# test:linux:cuda-9.2:gcc-7:
+#   extends: .test:linux:cuda
+#   image: nvidia/cuda:9.2-devel-ubuntu18.04
+#   needs: [ build:linux:cuda-9.2:gcc-7 ]
+#   variables:
+#     EIGEN_CI_CXX_COMPILER: g++-7
+#     EIGEN_CI_CC_COMPILER: gcc-7
+
+# # Clang-10, CUDA-9.2
+# test:linux:cuda-9.2:clang-10:
+#   extends: .test:linux:cuda
+#   image: nvidia/cuda:9.2-devel-ubuntu18.04
+#   needs: [ build:linux:cuda-9.2:clang-10 ]
+#   variables:
+#     EIGEN_CI_CXX_COMPILER: clang++-10
+#     EIGEN_CI_CC_COMPILER: clang-10
+
+# # GCC-8, CUDA-10.2
+# test:linux:cuda-10.2:gcc-8:
+#   extends: .test:linux:cuda
+#   image: nvidia/cuda:10.2-devel-ubuntu18.04
+#   needs: [ build:linux:cuda-10.2:gcc-8 ]
+#   variables:
+#     EIGEN_CI_CXX_COMPILER: g++-8
+#     EIGEN_CI_CC_COMPILER: gcc-8
+
+# # Clang-10, CUDA-10.2
+# test:linux:cuda-10.2:clang-10:
+#   extends: .test:linux:cuda
+#   image: nvidia/cuda:10.2-devel-ubuntu18.04
+#   needs: [ build:linux:cuda-10.2:clang-10 ]
+#   variables:
+#     EIGEN_CI_CXX_COMPILER: clang++-10
+#     EIGEN_CI_CC_COMPILER: clang-10
+
+# GCC-10, CUDA-11.4
+test:linux:cuda-11.4:gcc-10:
+  extends: .test:linux:cuda
+  image: nvidia/cuda:11.4.3-devel-ubuntu20.04
+  needs: [ build:linux:cuda-11.4:gcc-10 ]
+  variables:
+    EIGEN_CI_CXX_COMPILER: g++-10
+    EIGEN_CI_CC_COMPILER: gcc-10
+
+# Clang-12, CUDA-11.4
+test:linux:cuda-11.4:clang-12:
+  extends: .test:linux:cuda
+  image: nvidia/cuda:11.4.3-devel-ubuntu20.04
+  needs: [ build:linux:cuda-11.4:clang-12 ]
+  variables:
+    EIGEN_CI_CXX_COMPILER: clang++-12
+    EIGEN_CI_CC_COMPILER: clang-12
+
+# GCC-10, CUDA-12.2
+test:linux:cuda-12.2:gcc-10:
+  extends: .test:linux:cuda
+  image: nvidia/cuda:12.2.0-devel-ubuntu20.04
+  needs: [ build:linux:cuda-12.2:gcc-10 ]
+  variables:
+    EIGEN_CI_CXX_COMPILER: g++-10
+    EIGEN_CI_CC_COMPILER: gcc-10
+
+# Clang-12, CUDA-12.2
+test:linux:cuda-12.2:clang-12:
+  extends: .test:linux:cuda
+  image: nvidia/cuda:12.2.0-devel-ubuntu20.04
+  needs: [ build:linux:cuda-12.2:clang-12 ]
+  variables:
+    EIGEN_CI_CXX_COMPILER: clang++-12
+    EIGEN_CI_CC_COMPILER: clang-12
+
+##### arm ######################################################################
+
+.test:linux:arm:
+  extends: .test:linux
+  variables:
+    EIGEN_CI_TARGET_ARCH: arm
+    EIGEN_CI_CROSS_TARGET_TRIPLE: arm-linux-gnueabihf
+    # Enable cross-compiled arm binary to run on aarch64.
+    EIGEN_CI_BEFORE_SCRIPT: "ln -s /usr/arm-linux-gnueabihf/lib/ld-linux-armhf.so.3 /lib/ && export LD_LIBRARY_PATH=/usr/arm-linux-gnueabihf/lib/"
+  tags:
+    - eigen-runner
+    - linux
+    - aarch64
+
+.test:linux:arm:gcc-10:default:
+  extends: .test:linux:arm
+  needs: [ build:linux:cross:arm:gcc-10:default ]
+  variables:
+    EIGEN_CI_CROSS_INSTALL: g++-10-arm-linux-gnueabihf
+
+test:linux:arm:gcc-10:default:official:
+  extends: .test:linux:arm:gcc-10:default
+  variables:
+    EIGEN_CI_CTEST_LABEL: Official
+
+test:linux:arm:gcc-10:default:unsupported:
+  extends: .test:linux:arm:gcc-10:default
+  variables:
+    EIGEN_CI_CTEST_LABEL: Unsupported
+
+.test:linux:arm:clang-12:default:
+  extends: .test:linux:arm
+  needs: [ build:linux:cross:arm:clang-12:default ]
+  variables:
+    EIGEN_CI_CROSS_INSTALL: g++-10-arm-linux-gnueabihf clang-12
+
+test:linux:arm:clang-12:default:official:
+  extends: .test:linux:arm:clang-12:default
+  variables:
+    EIGEN_CI_CTEST_LABEL: Official
+
+test:linux:arm:clang-12:default:unsupported:
+  extends: .test:linux:arm:clang-12:default
+  variables:
+    EIGEN_CI_CTEST_LABEL: Unsupported
+
+##### aarch64 ##################################################################
+
+.test:linux:aarch64:
+  extends: .test:linux
+  variables:
+    EIGEN_CI_TARGET_ARCH: aarch64
+    EIGEN_CI_CROSS_TARGET_TRIPLE: aarch64-linux-gnu
+  tags:
+    - eigen-runner
+    - linux
+    - aarch64
+
+.test:linux:aarch64:gcc-10:default:
+  extends: .test:linux:aarch64
+  needs: [ build:linux:cross:aarch64:gcc-10:default ]
+  variables:
+    EIGEN_CI_INSTALL: g++-10
+
+test:linux:aarch64:gcc-10:default:official:
+  extends: .test:linux:aarch64:gcc-10:default
+  variables:
+    EIGEN_CI_CTEST_LABEL: Official
+
+test:linux:aarch64:gcc-10:default:unsupported:
+  extends: .test:linux:aarch64:gcc-10:default
+  variables:
+    EIGEN_CI_CTEST_LABEL: Unsupported
+
+.test:linux:aarch64:clang-12:default:
+  extends: .test:linux:aarch64
+  needs: [ build:linux:cross:aarch64:clang-12:default ]
+  variables:
+    EIGEN_CI_INSTALL: clang-12
+
+test:linux:aarch64:clang-12:default:official:
+  extends: .test:linux:aarch64:clang-12:default
+  variables:
+    EIGEN_CI_CTEST_LABEL: Official
+
+test:linux:aarch64:clang-12:default:unsupported:
+  extends: .test:linux:aarch64:clang-12:default
+  variables:
+    EIGEN_CI_CTEST_LABEL: Unsupported
+
+##### ppc64le ##################################################################
+
+.test:linux:ppc64le:
+  extends: .test:linux
+  variables:
+    EIGEN_CI_TARGET_ARCH: ppc64le
+    EIGEN_CI_CROSS_TARGET_TRIPLE: powerpc64le-linux-gnu
+  tags:
+    - eigen-runner
+    - linux
+    - ppc64le
+
+.test:linux:ppc64le:gcc-10:default:
+  extends: .test:linux:ppc64le
+  needs: [ build:linux:cross:ppc64le:gcc-10:default ]
+  variables:
+    EIGEN_CI_INSTALL: g++-10
+
+test:linux:ppc64le:gcc-10:default:official:
+  extends: .test:linux:ppc64le:gcc-10:default
+  variables:
+    EIGEN_CI_CTEST_LABEL: Official
+
+test:linux:ppc64le:gcc-10:default:unsupported:
+  extends: .test:linux:ppc64le:gcc-10:default
+  variables:
+    EIGEN_CI_CTEST_LABEL: Unsupported
+
+.test:linux:ppc64le:clang-12:default:
+  extends: .test:linux:ppc64le
+  needs: [ build:linux:cross:ppc64le:clang-12:default ]
+  variables:
+    EIGEN_CI_INSTALL: clang-12
+
+test:linux:ppc64le:clang-12:default:official:
+  extends: .test:linux:ppc64le:clang-12:default
+  variables:
+    EIGEN_CI_CTEST_LABEL: Official
+
+test:linux:ppc64le:clang-12:default:unsupported:
+  extends: .test:linux:ppc64le:clang-12:default
+  variables:
+    EIGEN_CI_CTEST_LABEL: Unsupported
+
+##### MR Smoke Tests ###########################################################
+
+test:linux:x86-64:gcc-10:default:smoketest:
+  extends: .test:linux:x86-64:gcc-10:default
+  needs: [ build:linux:cross:x86-64:gcc-10:default:smoketest ]
+  variables:
+    EIGEN_CI_CTEST_LABEL: smoketest
+  rules:
+    - if: $CI_PIPELINE_SOURCE == "merge_request_event"
+
+test:linux:x86-64:clang-12:default:smoketest:
+  extends: .test:linux:x86-64:clang-12:default
+  needs: [ build:linux:cross:x86-64:clang-12:default:smoketest ]
+  variables:
+    EIGEN_CI_CTEST_LABEL: smoketest
+  rules:
+    - if: $CI_PIPELINE_SOURCE == "merge_request_event"
--- a/ci/test.windows.gitlab-ci.yml
+++ b/ci/test.windows.gitlab-ci.yml
@@ -0,0 +1,112 @@
+.test:windows:
+  extends: .common:windows
+  stage: test
+  script:
+    - ./ci/scripts/test.windows.script.ps1
+  after_script:
+    - ./ci/scripts/test.windows.after_script.ps1
+  rules:
+    - if: $CI_PIPELINE_SOURCE == "schedule" && $CI_PROJECT_NAMESPACE == "libeigen"
+    - if: $CI_PIPELINE_SOURCE == "web" && $CI_PROJECT_NAMESPACE == "libeigen"
+    - if: $CI_PIPELINE_SOURCE == "push" && $CI_PROJECT_NAMESPACE == "libeigen"
+  tags: 
+    - eigen-runner
+    - windows
+    - x86-64
+
+##### MSVC #####################################################################
+
+# MSVC 14.16 (VS 2017)
+.test:windows:x86-64:msvc-14.16:default:
+  extends: .test:windows
+  needs: [ build:windows:x86-64:msvc-14.16:default ]
+
+test:windows:x86-64:msvc-14.16:default:official:
+  extends: .test:windows:x86-64:msvc-14.16:default
+  variables:
+    EIGEN_CI_CTEST_LABEL: Official
+
+test:windows:x86-64:msvc-14.16:default:unsupported:
+  extends: .test:windows:x86-64:msvc-14.16:default
+  variables:
+    EIGEN_CI_CTEST_LABEL: Unsupported
+
+# MSVC 14.29 (VS 2019)
+.test:windows:x86-64:msvc-14.29:default:
+  extends: .test:windows
+  needs: [ build:windows:x86-64:msvc-14.29:default ]
+
+test:windows:x86-64:msvc-14.29:default:official:
+  extends: .test:windows:x86-64:msvc-14.29:default
+  needs: [ build:windows:x86-64:msvc-14.29:cxx03 ]
+
+test:windows:x86-64:msvc-14.29:default:official:
+  extends: .test:windows:x86-64:msvc-14.29:default
+  variables:
+    EIGEN_CI_CTEST_LABEL: Official
+
+test:windows:x86-64:msvc-14.29:default:unsupported:
+  extends: .test:windows:x86-64:msvc-14.29:default
+  variables:
+    EIGEN_CI_CTEST_LABEL: Unsupported
+
+.test:windows:x86-64:msvc-14.29:avx2:
+  extends: .test:windows
+  needs: [ build:windows:x86-64:msvc-14.29:avx2 ]
+
+test:windows:x86-64:msvc-14.29:avx2:official:
+  extends: .test:windows:x86-64:msvc-14.29:avx2
+  variables:
+    EIGEN_CI_CTEST_LABEL: Official
+
+test:windows:x86-64:msvc-14.29:avx2:unsupported:
+  extends: .test:windows:x86-64:msvc-14.29:avx2
+  variables:
+    EIGEN_CI_CTEST_LABEL: Unsupported
+
+.test:windows:x86-64:msvc-14.29:avx512dq:
+  extends: .test:windows
+  needs: [ build:windows:x86-64:msvc-14.29:avx512dq ]
+  tags: 
+    - eigen-runner
+    - windows
+    - x86-64
+    - avx512
+
+test:windows:x86-64:msvc-14.29:avx512dq:official:
+  extends: .test:windows:x86-64:msvc-14.29:avx512dq
+  variables:
+    EIGEN_CI_CTEST_LABEL: Official
+
+test:windows:x86-64:msvc-14.29:avx512dq:unsupported:
+  extends: .test:windows:x86-64:msvc-14.29:avx512dq
+  variables:
+    EIGEN_CI_CTEST_LABEL: Unsupported
+
+##### MSVC + CUDA ##############################################################
+.test:windows:cuda:
+  extends: .test:windows
+  allow_failure: true
+  variables:
+    EIGEN_CI_CTEST_LABEL: gpu
+  tags: 
+    - eigen-runner
+    - windows
+    - x86-64
+    - cuda
+
+# The CUDA 9.2 compiler crashes with an internal error.
+# # MSVC 14.16 + CUDA 9.2
+# test:windows:x86-64:cuda-9.2:msvc-14.16:
+#   extends: .test:windows:cuda
+#   needs: [ build:windows:x86-64:cuda-9.2:msvc-14.16 ]
+  
+# MSVC 14.29 + CUDA 10.2
+test:windows:x86-64:cuda-10.2:msvc-14.29:
+  extends: .test:windows:cuda
+  needs: [ build:windows:x86-64:cuda-10.2:msvc-14.29 ]
+  
+# MSVC 14.29 + CUDA 11.4
+test:windows:x86-64:cuda-11.4:msvc-14.29:
+  extends: .test:windows:cuda
+  needs: [ build:windows:x86-64:cuda-11.4:msvc-14.29 ]
--- a/cmake/Eigen3Config.cmake.in
+++ b/cmake/Eigen3Config.cmake.in
@@ -3,7 +3,9 @@

@PACKAGE_INIT@

-include ("${CMAKE_CURRENT_LIST_DIR}/Eigen3Targets.cmake")
+if (NOT TARGET eigen)
+  include ("${CMAKE_CURRENT_LIST_DIR}/Eigen3Targets.cmake")
+endif ()

 # Legacy variables, do *not* use. May be removed in the future.

--- a/cmake/EigenConfigureTesting.cmake
+++ b/cmake/EigenConfigureTesting.cmake
@@ -1,7 +1,7 @@
 include(EigenTesting)
 include(CheckCXXSourceCompiles)

-# configure the "site" and "buildname" 
+# configure the "site" and "buildname"
 ei_set_sitename()

 # retrieve and store the build string
@@ -11,6 +11,15 @@ add_custom_target(buildtests)
 add_custom_target(check COMMAND "ctest")
 add_dependencies(check buildtests)

+# Convenience target for only building GPU tests.
+add_custom_target(buildtests_gpu)
+add_custom_target(check_gpu COMMAND "ctest" "--output-on-failure"
+                                            "--no-compress-output"
+                                            "--build-no-clean"
+                                            "-T" "test"
+                                            "-L" "gpu")
+add_dependencies(check_gpu buildtests_gpu)
+
 # check whether /bin/bash exists (disabled as not used anymore)
 # find_file(EIGEN_BIN_BASH_EXISTS "/bin/bash" PATHS "/" NO_DEFAULT_PATH)

@@ -50,7 +59,7 @@ if(CMAKE_COMPILER_IS_GNUCXX)
    set(CTEST_CUSTOM_COVERAGE_EXCLUDE "/test/")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${COVERAGE_FLAGS}")
  endif(EIGEN_COVERAGE_TESTING)
-  
+
 elseif(MSVC)
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /D_CRT_SECURE_NO_WARNINGS /D_SCL_SECURE_NO_WARNINGS")
 endif(CMAKE_COMPILER_IS_GNUCXX)
--- a/cmake/EigenTesting.cmake
+++ b/cmake/EigenTesting.cmake
@@ -18,26 +18,26 @@ macro(ei_add_test_internal testname testname_with_suffix)
    set(filename ${testname}.cpp)
  endif()

+  set(is_gpu_test OFF)
  if(EIGEN_ADD_TEST_FILENAME_EXTENSION STREQUAL cu)
+    set(is_gpu_test ON)
    if(EIGEN_TEST_CUDA_CLANG)
      set_source_files_properties(${filename} PROPERTIES LANGUAGE CXX)
-      if(CUDA_64_BIT_DEVICE_CODE)
+      
+      if(CUDA_64_BIT_DEVICE_CODE AND (EXISTS "${CUDA_TOOLKIT_ROOT_DIR}/lib64"))
        link_directories("${CUDA_TOOLKIT_ROOT_DIR}/lib64")
      else()
        link_directories("${CUDA_TOOLKIT_ROOT_DIR}/lib")
      endif()
-      if (${ARGC} GREATER 2)
-        add_executable(${targetname} ${filename})
-      else()
-        add_executable(${targetname} ${filename} OPTIONS ${ARGV2})
+
+      add_executable(${targetname} ${filename})
+      set(CUDA_CLANG_LINK_LIBRARIES "cudart_static" "cuda" "dl" "pthread")
+      if (CMAKE_SYSTEM_NAME STREQUAL "Linux")
+        set(CUDA_CLANG_LINK_LIBRARIES ${CUDA_CLANG_LINK_LIBRARIES} "rt")
      endif()
-      target_link_libraries(${targetname} "cudart_static" "cuda" "dl" "rt" "pthread")
+      target_link_libraries(${targetname} ${CUDA_CLANG_LINK_LIBRARIES})
    else()
-      if (${ARGC} GREATER 2)
-        cuda_add_executable(${targetname} ${filename} OPTIONS ${ARGV2})
-      else()
-        cuda_add_executable(${targetname} ${filename})
-      endif()
+      cuda_add_executable(${targetname} ${filename})
    endif()
  else()
    add_executable(${targetname} ${filename})
@@ -46,7 +46,10 @@ macro(ei_add_test_internal testname testname_with_suffix)
  if (targetname MATCHES "^eigen2_")
    add_dependencies(eigen2_buildtests ${targetname})
  else()
-    add_dependencies(buildtests ${targetname})
+  add_dependencies(buildtests ${targetname})
+  endif()
+  if (is_gpu_test)
+    add_dependencies(buildtests_gpu ${targetname})
  endif()

  if(EIGEN_NO_ASSERTION_CHECKING)
@@ -71,7 +74,7 @@ macro(ei_add_test_internal testname testname_with_suffix)
  endif(${ARGC} GREATER 2)

  if(EIGEN_TEST_CUSTOM_CXX_FLAGS)
-    ei_add_target_property(${targetname} COMPILE_FLAGS "${EIGEN_TEST_CUSTOM_CXX_FLAGS}")
+    target_compile_options(${targetname} PRIVATE ${EIGEN_TEST_CUSTOM_CXX_FLAGS})
  endif()

  if(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO)
@@ -97,7 +100,11 @@ macro(ei_add_test_internal testname testname_with_suffix)
    endif()
  endif()

-  add_test(${testname_with_suffix} "${targetname}")
+  add_test(NAME ${testname_with_suffix} COMMAND "${targetname}")
+  if (is_gpu_test)
+    # Add gpu tag for testing only GPU tests.
+    set_property(TEST ${testname_with_suffix} APPEND PROPERTY LABELS "gpu")
+  endif()

  # Specify target and test labels accoirding to EIGEN_CURRENT_SUBPROJECT
  get_property(current_subproject GLOBAL PROPERTY EIGEN_CURRENT_SUBPROJECT)
@@ -145,10 +152,10 @@ macro(ei_add_test_internal_sycl testname testname_with_suffix)
  endif()

  if(EIGEN_NO_ASSERTION_CHECKING)
-    ei_add_target_property(${targetname} COMPILE_FLAGS "-DEIGEN_NO_ASSERTION_CHECKING=1")
+    target_compile_options(${targetname} PRIVATE "-DEIGEN_NO_ASSERTION_CHECKING=1")
  else(EIGEN_NO_ASSERTION_CHECKING)
    if(EIGEN_DEBUG_ASSERTS)
-      ei_add_target_property(${targetname} COMPILE_FLAGS "-DEIGEN_DEBUG_ASSERTS=1")
+      target_compile_options(${targetname} PRIVATE "-DEIGEN_DEBUG_ASSERTS=1")
    endif(EIGEN_DEBUG_ASSERTS)
  endif(EIGEN_NO_ASSERTION_CHECKING)

@@ -166,7 +173,7 @@ macro(ei_add_test_internal_sycl testname testname_with_suffix)
  endif(${ARGC} GREATER 2)

  if(EIGEN_TEST_CUSTOM_CXX_FLAGS)
-    ei_add_target_property(${targetname} COMPILE_FLAGS "${EIGEN_TEST_CUSTOM_CXX_FLAGS}")
+    target_compile_options(${targetname} PRIVATE ${EIGEN_TEST_CUSTOM_CXX_FLAGS})
  endif()

  if(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO)
@@ -192,7 +199,7 @@ macro(ei_add_test_internal_sycl testname testname_with_suffix)
    endif()
  endif()

-  add_test(${testname_with_suffix} "${targetname}")
+  add_test(NAME ${testname_with_suffix} COMMAND "${targetname}")

  # Specify target and test labels according to EIGEN_CURRENT_SUBPROJECT
  get_property(current_subproject GLOBAL PROPERTY EIGEN_CURRENT_SUBPROJECT)
--- a/cmake/FindStandardMathLibrary.cmake
+++ b/cmake/FindStandardMathLibrary.cmake
@@ -19,8 +19,11 @@ include(CheckCXXSourceCompiles)
 # notice the std:: is required on some platforms such as QNX

 set(find_standard_math_library_test_program
-"#include<cmath>
-int main() { std::sin(0.0); std::log(0.0f); }")
+"
+#include<cmath>
+int main(int argc, char **){
+  return int(std::sin(double(argc)) + std::log(double(argc)));
+}")

 # first try compiling/linking the test program without any linker flags

--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Antonio Sanchez	5e8edd2186	Fix undefined behavior in PPC load. VSX vec_xl is Causing a bunch of test failures and failing `-fsanitize=undefined` with g++. Removing the instruction allows tests to pass and eliminates the warning.	2025-03-14 07:58:36 -07:00
Antonio Sanchez	0ac1fc52dd	Fix CUDA clang again with new C++11 usages	2025-03-14 07:58:29 -07:00
Antonio Sánchez	6aa0143851	Judge unitary-ness relative to scaling. (cherry picked from commit `c1d637433e`)	2025-03-13 15:01:33 -07:00
Antonio Sanchez	c7f6f8315f	Update CUDA testing infra to match master branch.	2025-03-13 21:49:24 +00:00
Antonio Sanchez	b0448fc6e0	Fix cxx03 testing job configuration	2025-03-13 11:13:35 -07:00
Antonio Sanchez	3b8644da50	Better rand to fix MSVC random tests	2025-03-13 08:50:26 -07:00
Antonio Sanchez	414c42bfcf	Fix cuda clang builds	2025-03-12 21:47:26 -07:00
Antonio Sanchez	952eda443b	Fix GPU build failures.	2025-03-09 17:04:41 -07:00
Chip Kerchner	6a4a0b66bd	Fix epsilon and dummy_precision values in long double for double doubles. Prevented some algorithms from converging on PPC. (cherry picked from commit `54459214a1`)	2025-03-07 21:19:41 -08:00
Antonio Sanchez	079de53fa5	Adjust tolerance of matrix_power test for MSVC. (cherry picked from commit `1c2690ed24`)	2025-03-07 21:02:39 -08:00
Antonio Sanchez	ce950ca2db	Patch PPC PacketMath from 3.4. This is to fix failing tests for PPC due to UB on loads.	2025-03-07 20:59:30 -08:00
Antonio Sanchez	49bd503308	Fix merge conflict error	2025-03-03 15:21:02 -08:00
Charles Schlosser	5b20d9f326	Fix arm32 float division and related bugs (cherry picked from commit `81b48065ea`) (cherry picked from commit `72f77ccb3e`)	2025-03-03 13:11:37 -08:00
Antonio Sánchez	5f8f69020b	Remove poor non-convergence checks in NonLinearOptimization. (cherry picked from commit `d819a33bf6`) (cherry picked from commit `b30a2a527e`)	2025-03-03 07:56:31 -08:00
Antonio Sánchez	dc9325848a	Fix arm32 issues. (cherry picked from commit `a73970a864`) (cherry picked from commit `c23abcf25c`)	2025-03-03 07:38:59 -08:00
Antonio Sanchez	9df4c76bb8	Fix emulated builds cmake configuration	2025-03-03 07:26:26 -08:00
Antonio Sánchez	0071c2e8a8	Fix more hard-coded magic bounds. (cherry picked from commit `ae5280aa8d`)	2025-03-03 07:26:02 -08:00
Antonio Sánchez	03727bdf55	Slightly adjust error bound for nonlinear tests. (cherry picked from commit `42aa3d17cd`)	2025-03-03 07:24:35 -08:00
Antonio Sánchez	5e39ba6642	Fix emulated tests. (cherry picked from commit `9589cc4e7f`)	2025-03-02 16:38:13 -08:00
Antonio Sanchez	d2ce4faa5a	Fix cuda 9+ builds Fix removed `shfl_` intrinsics, disable warnings, update CUDA header inclusion.	2025-03-02 16:21:48 -08:00
Antonio Sanchez	43b7aa2412	Don't check for build type	2025-02-28 22:13:46 -08:00
Antonio Sanchez	23b1682723	Fix cuda device warnings	2025-02-28 22:09:30 -08:00
Antonio Sanchez	c53002f5fb	Fix failing tests on arm/ppc	2025-02-28 13:15:33 -08:00
Antonio Sanchez	ea37d9e73e	Remove private access of std::deque::_M_impl. This no longer works on gcc or clang, so we should just remove the hack. The default should compile to similar code anyways. (cherry picked from commit `82c0c18a83`)	2025-02-28 11:21:13 -08:00
Antonio Sánchez	ece7cec604	Fix parsing of command-line arguments when already specified as a cmake list. (cherry picked from commit `555cec17ed`)	2025-02-26 07:49:23 -08:00
Antonio Sanchez	2e708d48ca	Merge CI from 3.4	2025-02-25 21:23:42 -08:00
C. Antonio Sanchez	109935bfce	Fix Tensor docs (cherry picked from commit `42d9cc0b1d`)	2025-02-25 21:21:49 -08:00
Antonio Sanchez	339d7188ed	Fix up all doxygen warnings.	2025-02-25 21:05:40 -08:00
Jean-Christophe Fillion-Robin	02f420012a	[PATCH] cmake: Support source include with add_subdirectory and find_package use This commit allows the sources of the project to be included in a parent project CMakeLists.txt and support use of "find_package(Eigen3 CONFIG REQUIRED)" Here is an example allowing to test the changes. It is not particularly useful in itself. This change will allow to support one of the scenario allowing to create custom 3D Slicer application bundling associated plugins. /tmp/eigen-git-mirror # Eigen sources /tmp/test/CMakeLists.txt: cmake_minimum_required(VERSION 3.12) project(test) add_subdirectory("/tmp/eigen-git-mirror" "eigen-git-mirror") find_package(Eigen3 CONFIG REQUIRED) and configuring it using: mkdir /tmp/test-build && cd $_ cmake \ -DCMAKE_FIND_PACKAGE_NO_PACKAGE_REGISTRY:BOOL=1 \ -DEigen3_DIR:PATH=/tmp/test-build/eigen-git-mirror \ /tmp/test Co-authored-by: Pablo Hernandez <pablo.hernandez@kitware.com> --- CMakeLists.txt \| 1 + cmake/Eigen3Config.cmake.in \| 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) (cherry picked from commit `2cbd9dd498`)	2022-02-13 21:22:14 +00:00
Antonio Sánchez	d45ac54008	Correct use of EIGEN_CUDACC to respect EIGEN_NO_CUDA.	2022-02-04 22:24:31 +00:00
Jakob Struye	d9585478d9	Clearer doc for squaredNorm (cherry picked from commit `53a29c7e35`)	2021-08-18 15:13:11 +00:00
Benoit Steiner	01421e31a2	Added missing EIGEN_DEVICE_FUNC qualifiers (cherry picked from commit `c36bc2d445`)	2021-07-20 21:21:53 +00:00
René Wagner	2f81b6363f	BooleanRedux.h: Add more EIGEN_DEVICE_FUNC qualifiers. This enables operator== on Eigen matrices in device code. (cherry picked from commit `0aebe19aca`)	2021-07-20 21:20:54 +00:00
Adam Shapiro	53a7864c48	Fixed sparse conservativeResize() when both num cols and rows decreased. The previous implementation caused a buffer overflow trying to calculate non- zero counts for columns that no longer exist. (cherry picked from commit `2ac0b78739`) (cherry picked from commit f4b67691c42952b44ce7dae62f5c18ed93b53521)	2021-02-23 21:35:46 +00:00
Gael Guennebaud	9fc3d9f3ca	Fix some implicit literal to Scalar conversions in SparseCore (cherry picked from commit `afa8d13532`)	2021-02-19 18:54:23 +00:00
Antonio Sanchez	84911f9c05	Include `<cstdint>` in one place, remove custom typedefs Originating from [this SO issue](https://stackoverflow.com/questions/65901014/how-to-solve-this-all-error-2-in-this-case), some win32 compilers define `__int32` as a `long`, but MinGW defines `std::int32_t` as an `int`, leading to a type conflict. To avoid this, we remove the custom `typedef` definitions for win32. The Tensor module requires C++11 anyways, so we are guaranteed to have included `<cstdint>` already in `Eigen/Core`. Also re-arranged the headers to only include `<cstdint>` in one place to avoid this type of error again.	2021-01-28 11:10:13 -08:00
Rasmus Munk Larsen	77dc6dbb44	Fix bugs in log1p and expm1 where repeated using statements would clobber each other. Add specializations for complex types since std::log1p and std::exp1m do not support complex. (cherry picked from commit `d55d392e7b`)	2021-01-21 11:22:36 +01:00
David Tellenbach	a36d19c4fc	Fix a typo in SparseMatrix documentation. This fixes issue #2091. (cherry picked from commit `2e8f850c78`)	2020-12-09 14:53:09 +01:00
David Tellenbach	0fd6b4f71d	Bump to 3.3.9	2020-12-04 22:53:41 +01:00
Florian Maurin	52207cf6f9	Fix typo in doc (cherry picked from commit `c5985c46f5`)	2020-11-30 12:09:43 +00:00
Jim Lersch	0c26611d2d	Workaround for doxygen class template titles in which the template part of the class signature is lost due to a problem with forward declarations. The problem is probably caused by doxygen bug #7689. It is confirmed to be fixed in doxygen >= 1.8.19. (cherry picked from commit `68f69414f7`)	2020-11-28 16:21:12 +01:00
Christoph Hertzberg	2a4fcb2c31	Fix doxygen class block that was wrongly named. Manually cherry-picked from `a7170f2aca`	2020-11-27 19:41:19 +01:00
Christoph Hertzberg	54930b6b55	Remove unused variable	2020-11-25 17:59:18 +01:00
Martin Vonheim Larsen	4e5385c905	Enable MathJax in Doxygen.in Note that HTTPS must be used against the MathJax CDN when hosted on `eigen.tuxfamily.org` (which uses HTTPS) in order to avoid `Mixed Content`-errors from browsers. Using HTTPS for MathJax also works if the Eigen docs are hosted on plain HTTP. (cherry picked from commit `280f4f2407`)	2020-11-17 15:39:44 +01:00
Christoph Hertzberg	ac632f663e	bug #1746 : Removed implementation of standard copy-constructor and standard copy-assign-operator from PermutationMatrix and Transpositions to allow malloc-less std::move. Added unit-test to rvalue_types (cherry picked from commit `efd9867ff0`)	2020-11-12 11:54:51 +01:00
Christoph Hertzberg	3620371c5c	Bug #2036 make sure find_standard_math_library_test_program actually compiles (and is guaranteed to call math functions) (cherry picked from commit `ecb7bc9514`)	2020-11-04 13:38:17 +01:00
David Tellenbach	5dda502f84	Rename test/array.cpp to test/array_cwise.cpp Having a test named "array" can clash with the standard library header "array". Fixes issue #2046	2020-11-04 13:01:17 +01:00
Gael Guennebaud	590aec8fab	check two ctors (cherry picked from commit `572d62697d`)	2020-10-28 09:53:23 +01:00
David Tellenbach	75f8b06e50	Mention problems when using potentially throwing scalars and OpenMP (cherry picked from commit `9022f5aa8a`)	2020-10-09 17:41:41 +02:00
Karl Ljungkvist	e91e5d8c87	Fix typo in Tutorial_BlockOperations_block_assignment.cpp (cherry picked from commit `d199c17b14`)	2020-10-09 14:19:23 +02:00
Luke Peterson	ef3cc72cb6	Remove error counting in OpenMP parallelize_gemm This resolves a compilation error associated with Eigen::eigen_assert_exception. It also eliminates the counting of exceptions that may occur in the OpenMP parallel section. If an unhandled exception occurs in this section, the behavior is non-conforming according to the OpenMP specification.	2020-10-08 18:50:33 -07:00
David Tellenbach	7a0a2a5001	Define coeff-wise binary array operators for base class This fixes #2012.	2020-10-09 00:53:34 +02:00
szczepaniak bartek	bfdd4a9903	Fix Paradiso. EIGEN_USING_STD -> EIGEN_USING_STD_MATH	2020-10-08 19:38:35 +00:00