mirror of
https://gitlab.com/libeigen/eigen.git
synced 2026-04-10 11:34:33 +08:00
Compare commits
340 Commits
before-git
...
3.3.6
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
f8d653d1f9 | ||
|
|
b942bb0043 | ||
|
|
f1ffadb6e0 | ||
|
|
204d1f1456 | ||
|
|
c285ed1033 | ||
|
|
818bf74b18 | ||
|
|
9d56215db8 | ||
|
|
c4ea9a916f | ||
|
|
24d56f2e0e | ||
|
|
b9a2a8d2aa | ||
|
|
5c97b48c29 | ||
|
|
a2d6c106a4 | ||
|
|
40ddac243e | ||
|
|
065c366b40 | ||
|
|
116dbf2c28 | ||
|
|
0ee9dede55 | ||
|
|
d107a371c6 | ||
|
|
a4afa90d16 | ||
|
|
e154c87504 | ||
|
|
fcc41f1b9a | ||
|
|
9a53659b08 | ||
|
|
9ccbaaf3dd | ||
|
|
1d5581ead2 | ||
|
|
3636a64667 | ||
|
|
148e579cc0 | ||
|
|
64ec5a1a6b | ||
|
|
2c932556fc | ||
|
|
bc000deaae | ||
|
|
92cd158c01 | ||
|
|
80473b48bb | ||
|
|
3b92f547f5 | ||
|
|
718e954df4 | ||
|
|
1eef23a1eb | ||
|
|
af3656d4ca | ||
|
|
7c6ed911b3 | ||
|
|
5be00b0e29 | ||
|
|
03326d9155 | ||
|
|
6111dce0e8 | ||
|
|
f98992725c | ||
|
|
c5198249a9 | ||
|
|
e6c8d0b72d | ||
|
|
caf7e6e7a7 | ||
|
|
ea7f12ebb5 | ||
|
|
a9508054c3 | ||
|
|
7f3fff3fec | ||
|
|
6ce4be6f84 | ||
|
|
ab95a8c1ef | ||
|
|
461620668c | ||
|
|
e4127b0f7d | ||
|
|
8180e13926 | ||
|
|
6eb4ce5f8e | ||
|
|
b89d81b2a8 | ||
|
|
73b1c0a660 | ||
|
|
4d05b107cf | ||
|
|
7621bbc2a5 | ||
|
|
c15d736be3 | ||
|
|
81bdde705c | ||
|
|
06fc5761fa | ||
|
|
a185bc485c | ||
|
|
96134409fc | ||
|
|
ab3fa2e123 | ||
|
|
ae6e5caa40 | ||
|
|
483beabab9 | ||
|
|
5c59564bfb | ||
|
|
1939c971a3 | ||
|
|
c2f9e6cb37 | ||
|
|
1641a6cdd5 | ||
|
|
fea50d40ea | ||
|
|
c1128efb6c | ||
|
|
20ca86888e | ||
|
|
36a1cd87d9 | ||
|
|
523e442a7b | ||
|
|
48048172e5 | ||
|
|
e9bd839b13 | ||
|
|
3df78d5afc | ||
|
|
352489edbe | ||
|
|
450c5e5d27 | ||
|
|
64cc5f8512 | ||
|
|
656712d48f | ||
|
|
971b32440c | ||
|
|
bb87f618bf | ||
|
|
2f9de52245 | ||
|
|
2136cfa17e | ||
|
|
39125654ce | ||
|
|
927d023cea | ||
|
|
1e2d2693b9 | ||
|
|
7634a44bfe | ||
|
|
2480d04ac7 | ||
|
|
c92536d926 | ||
|
|
80af7d6a47 | ||
|
|
87f9e301f9 | ||
|
|
542fb03968 | ||
|
|
f90d136c84 | ||
|
|
877a2b64c9 | ||
|
|
e6577f3c30 | ||
|
|
69e01a2999 | ||
|
|
5f71579a2d | ||
|
|
686e0749a5 | ||
|
|
385d8b5e42 | ||
|
|
4662c610c1 | ||
|
|
906a98fe39 | ||
|
|
1c4fdad7bd | ||
|
|
3f711f3356 | ||
|
|
b02ab76847 | ||
|
|
5fec52ced1 | ||
|
|
bde2bfcee8 | ||
|
|
eab7afe252 | ||
|
|
81e94eea02 | ||
|
|
a2a2c3c865 | ||
|
|
90cd199d4b | ||
|
|
b18e2d422b | ||
|
|
892c0a79ce | ||
|
|
59398aa2bb | ||
|
|
170914dbbc | ||
|
|
866d222d60 | ||
|
|
86a939451c | ||
|
|
9ff3150243 | ||
|
|
a7144f8d6a | ||
|
|
273738ba6f | ||
|
|
3fb42ff7b2 | ||
|
|
e90a14609a | ||
|
|
ece56baba0 | ||
|
|
1724dae8b8 | ||
|
|
190b46dd1f | ||
|
|
74daf12e52 | ||
|
|
c24844195d | ||
|
|
15752027ec | ||
|
|
bfc66e8b9a | ||
|
|
b60cbbef37 | ||
|
|
33b972d8b3 | ||
|
|
bb28a2aada | ||
|
|
acd0ce11aa | ||
|
|
01fb621733 | ||
|
|
71d1198ccd | ||
|
|
95ec3232c6 | ||
|
|
243249718b | ||
|
|
32a6db0f8c | ||
|
|
6fc0f2be70 | ||
|
|
70ac6c9230 | ||
|
|
609e425166 | ||
|
|
4ead16cdd6 | ||
|
|
361102f88b | ||
|
|
5d40715db6 | ||
|
|
e7c065ec71 | ||
|
|
18868228ad | ||
|
|
fbb0c510c5 | ||
|
|
a8d2459f8e | ||
|
|
9a266e5118 | ||
|
|
51e1aa1539 | ||
|
|
0137ed4f19 | ||
|
|
9d03711df8 | ||
|
|
1ca9072b51 | ||
|
|
9fd138e2b3 | ||
|
|
55fbf4fedd | ||
|
|
b87875abf8 | ||
|
|
ac2c97edff | ||
|
|
292dea7922 | ||
|
|
070b5958e0 | ||
|
|
3108fbf767 | ||
|
|
9df7f3d8e9 | ||
|
|
782fd81dee | ||
|
|
fa77d71335 | ||
|
|
3d1795da28 | ||
|
|
d1c2d6683c | ||
|
|
d8cf158e06 | ||
|
|
bc837b7975 | ||
|
|
68e8f2b833 | ||
|
|
3dc3a0ea2d | ||
|
|
79120a4c63 | ||
|
|
e0412f18fd | ||
|
|
40b0c43bda | ||
|
|
72f3e20e74 | ||
|
|
676a7a3271 | ||
|
|
f843239452 | ||
|
|
a4ab0c6b6a | ||
|
|
ef955ea8e5 | ||
|
|
8bd392ca0e | ||
|
|
8d2ac85797 | ||
|
|
6d6e5fcd43 | ||
|
|
9c9e90f6db | ||
|
|
7ffa27f347 | ||
|
|
c20043c8fd | ||
|
|
d18877f18d | ||
|
|
02c0cef97f | ||
|
|
c8e663fe87 | ||
|
|
7a875acfb0 | ||
|
|
3ec11d8f17 | ||
|
|
ec067ac5e3 | ||
|
|
316969d839 | ||
|
|
7a0a9581b5 | ||
|
|
8880be60fa | ||
|
|
e41713d52e | ||
|
|
b69e465d7a | ||
|
|
0db83fc571 | ||
|
|
1ac703f641 | ||
|
|
2c32368642 | ||
|
|
db40309e70 | ||
|
|
e36c1f7501 | ||
|
|
3aef5c1a2f | ||
|
|
ab6bb89980 | ||
|
|
983ace99d4 | ||
|
|
72fa6775e8 | ||
|
|
9f25cdf4f6 | ||
|
|
6e5edd68d3 | ||
|
|
e8978ffa99 | ||
|
|
c753fe7cc3 | ||
|
|
e59e345720 | ||
|
|
07c2244440 | ||
|
|
1865dccd58 | ||
|
|
f2e6ee9687 | ||
|
|
9219307e13 | ||
|
|
f2e8f96151 | ||
|
|
faf8af25ed | ||
|
|
106ba41c2a | ||
|
|
87939ea0dd | ||
|
|
e813640aa1 | ||
|
|
612b8f2749 | ||
|
|
ead8e1b796 | ||
|
|
3d4265f2d5 | ||
|
|
d66586ac90 | ||
|
|
44920624fb | ||
|
|
208058b9ad | ||
|
|
b4218b8473 | ||
|
|
3c2f0812f6 | ||
|
|
17bbd82f7d | ||
|
|
e1385337ff | ||
|
|
d367ecb475 | ||
|
|
c3b658b2c9 | ||
|
|
f9d655a8c8 | ||
|
|
ad3e4d1a49 | ||
|
|
222ed66f79 | ||
|
|
6bceebfabf | ||
|
|
2ca3eb8407 | ||
|
|
698205cddf | ||
|
|
2ecb33820f | ||
|
|
a0de6eb4ce | ||
|
|
7962ac1a58 | ||
|
|
9c97b053f3 | ||
|
|
f61b0d56f0 | ||
|
|
5087e016eb | ||
|
|
fa9f5d7170 | ||
|
|
6975534cb2 | ||
|
|
95c6d8db75 | ||
|
|
e0548e9ff3 | ||
|
|
c289ef20f3 | ||
|
|
b8cf157e8c | ||
|
|
b4d2b404b0 | ||
|
|
70fcaf9bd8 | ||
|
|
2f31c6b1d8 | ||
|
|
9e55467b4c | ||
|
|
35bf99c63e | ||
|
|
f9b8729597 | ||
|
|
4b2e7f26aa | ||
|
|
5202bc92e6 | ||
|
|
9d83411cc4 | ||
|
|
556c03a09d | ||
|
|
ce463b9fa4 | ||
|
|
477d1e8192 | ||
|
|
0eaff8fdf2 | ||
|
|
582c96691b | ||
|
|
0b22158d9f | ||
|
|
dafdb0d8a8 | ||
|
|
1d1686c62b | ||
|
|
ad95b924d0 | ||
|
|
9499684320 | ||
|
|
5b6a31626b | ||
|
|
bc3fee2d8e | ||
|
|
eaa9223277 | ||
|
|
c9ba1165e7 | ||
|
|
dd2d5d67ff | ||
|
|
404322b64f | ||
|
|
ce37bae2cd | ||
|
|
3900dbc341 | ||
|
|
5f586c2bd0 | ||
|
|
215f88a417 | ||
|
|
2257f40f4a | ||
|
|
9e0fa0ef6d | ||
|
|
0fddbf3dc7 | ||
|
|
eda635bd58 | ||
|
|
26197bb467 | ||
|
|
772e59d475 | ||
|
|
e8f83cbb5d | ||
|
|
dce584d799 | ||
|
|
0bcef9557d | ||
|
|
2b3c876b2a | ||
|
|
a05f6aad0e | ||
|
|
59187285e1 | ||
|
|
1dd074ea7e | ||
|
|
24fa7a01bd | ||
|
|
e236d3443c | ||
|
|
4ec8833220 | ||
|
|
dd3685cc6a | ||
|
|
487a6e6515 | ||
|
|
75f0b8aae3 | ||
|
|
23aca8a586 | ||
|
|
28bf2bf070 | ||
|
|
0164f4c682 | ||
|
|
bbff608a42 | ||
|
|
ea56d2ff2c | ||
|
|
a4c8701e9a | ||
|
|
a9bb9796e0 | ||
|
|
449883be74 | ||
|
|
0a08d4c60b | ||
|
|
4086187e49 | ||
|
|
91864f85d3 | ||
|
|
c3597106ab | ||
|
|
aed1d6597f | ||
|
|
b6f04a2dd4 | ||
|
|
a9aa3bcf50 | ||
|
|
32b8da66e3 | ||
|
|
eb94179ea3 | ||
|
|
52a7386aef | ||
|
|
8cada1d894 | ||
|
|
6e4a664c42 | ||
|
|
1cd1a96d56 | ||
|
|
86ab00cdcf | ||
|
|
65f09be8d2 | ||
|
|
400d756b82 | ||
|
|
9d31798a84 | ||
|
|
723ed92e0e | ||
|
|
0a7de0b273 | ||
|
|
d6b9bc1ccd | ||
|
|
0eff51e2ed | ||
|
|
1b7dd46d94 | ||
|
|
b2eb1bf3dc | ||
|
|
fe48c25682 | ||
|
|
0ba6da3470 | ||
|
|
a287140f72 | ||
|
|
4d89ec8a00 | ||
|
|
441760f239 | ||
|
|
664162fb8a | ||
|
|
aa3c761002 | ||
|
|
94f2cfc9c7 | ||
|
|
4a13d79df6 | ||
|
|
463176cc44 | ||
|
|
5aab97fba6 | ||
|
|
89abc6806d | ||
|
|
baf793ebaa | ||
|
|
b4ddafcfac | ||
|
|
1079967710 |
@@ -28,11 +28,7 @@ activity.png
|
||||
*.rej
|
||||
log
|
||||
patch
|
||||
*.patch
|
||||
a
|
||||
a.*
|
||||
lapack/testing
|
||||
lapack/reference
|
||||
.*project
|
||||
.settings
|
||||
Makefile
|
||||
|
||||
144
CMakeLists.txt
144
CMakeLists.txt
@@ -1,6 +1,6 @@
|
||||
project(Eigen3)
|
||||
|
||||
cmake_minimum_required(VERSION 2.8.11)
|
||||
cmake_minimum_required(VERSION 2.8.5)
|
||||
|
||||
# guard against in-source builds
|
||||
|
||||
@@ -8,7 +8,6 @@ if(${CMAKE_SOURCE_DIR} STREQUAL ${CMAKE_BINARY_DIR})
|
||||
message(FATAL_ERROR "In-source builds not allowed. Please make a new directory (called a build directory) and run CMake from there. You may need to remove CMakeCache.txt. ")
|
||||
endif()
|
||||
|
||||
|
||||
# Alias Eigen_*_DIR to Eigen3_*_DIR:
|
||||
|
||||
set(Eigen_SOURCE_DIR ${Eigen3_SOURCE_DIR})
|
||||
@@ -29,7 +28,7 @@ endif()
|
||||
|
||||
|
||||
#############################################################################
|
||||
# retrieve version information #
|
||||
# retrieve version infomation #
|
||||
#############################################################################
|
||||
|
||||
# automatically parse the version number
|
||||
@@ -54,13 +53,13 @@ endif()
|
||||
if(EIGEN_BRANCH_OUTPUT MATCHES "default")
|
||||
string(REGEX MATCH "^changeset: *[0-9]*:([0-9;a-f]+).*" EIGEN_HG_CHANGESET_MATCH "${EIGEN_HGTIP_OUTPUT}")
|
||||
set(EIGEN_HG_CHANGESET "${CMAKE_MATCH_1}")
|
||||
endif()
|
||||
endif(EIGEN_BRANCH_OUTPUT MATCHES "default")
|
||||
#...and show it next to the version number
|
||||
if(EIGEN_HG_CHANGESET)
|
||||
set(EIGEN_VERSION "${EIGEN_VERSION_NUMBER} (mercurial changeset ${EIGEN_HG_CHANGESET})")
|
||||
else()
|
||||
else(EIGEN_HG_CHANGESET)
|
||||
set(EIGEN_VERSION "${EIGEN_VERSION_NUMBER}")
|
||||
endif()
|
||||
endif(EIGEN_HG_CHANGESET)
|
||||
|
||||
|
||||
include(CheckCXXCompilerFlag)
|
||||
@@ -79,7 +78,7 @@ macro(ei_add_cxx_compiler_flag FLAG)
|
||||
if(COMPILER_SUPPORT_${SFLAG})
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLAG}")
|
||||
endif()
|
||||
endmacro()
|
||||
endmacro(ei_add_cxx_compiler_flag)
|
||||
|
||||
check_cxx_compiler_flag("-std=c++11" EIGEN_COMPILER_SUPPORT_CPP11)
|
||||
|
||||
@@ -135,7 +134,7 @@ if(NOT WIN32 OR NOT CMAKE_HOST_SYSTEM_NAME MATCHES Windows)
|
||||
option(EIGEN_BUILD_PKGCONFIG "Build pkg-config .pc file for Eigen" ON)
|
||||
endif()
|
||||
|
||||
set(CMAKE_INCLUDE_CURRENT_DIR OFF)
|
||||
set(CMAKE_INCLUDE_CURRENT_DIR ON)
|
||||
|
||||
option(EIGEN_SPLIT_LARGE_TESTS "Split large tests into smaller executables" ON)
|
||||
|
||||
@@ -159,7 +158,7 @@ if(NOT MSVC)
|
||||
ei_add_cxx_compiler_flag("-Wall")
|
||||
ei_add_cxx_compiler_flag("-Wextra")
|
||||
#ei_add_cxx_compiler_flag("-Weverything") # clang
|
||||
|
||||
|
||||
ei_add_cxx_compiler_flag("-Wundef")
|
||||
ei_add_cxx_compiler_flag("-Wcast-align")
|
||||
ei_add_cxx_compiler_flag("-Wchar-subscripts")
|
||||
@@ -174,25 +173,29 @@ if(NOT MSVC)
|
||||
ei_add_cxx_compiler_flag("-Wc++11-extensions")
|
||||
ei_add_cxx_compiler_flag("-Wdouble-promotion")
|
||||
# ei_add_cxx_compiler_flag("-Wconversion")
|
||||
|
||||
ei_add_cxx_compiler_flag("-Wshadow")
|
||||
|
||||
|
||||
# -Wshadow is insanely too strict with gcc, hopefully it will become usable with gcc 6
|
||||
# if(NOT CMAKE_COMPILER_IS_GNUCXX OR (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "5.0.0"))
|
||||
if(NOT CMAKE_COMPILER_IS_GNUCXX)
|
||||
ei_add_cxx_compiler_flag("-Wshadow")
|
||||
endif()
|
||||
|
||||
ei_add_cxx_compiler_flag("-Wno-psabi")
|
||||
ei_add_cxx_compiler_flag("-Wno-variadic-macros")
|
||||
ei_add_cxx_compiler_flag("-Wno-long-long")
|
||||
|
||||
|
||||
ei_add_cxx_compiler_flag("-fno-check-new")
|
||||
ei_add_cxx_compiler_flag("-fno-common")
|
||||
ei_add_cxx_compiler_flag("-fstrict-aliasing")
|
||||
ei_add_cxx_compiler_flag("-wd981") # disable ICC's "operands are evaluated in unspecified order" remark
|
||||
ei_add_cxx_compiler_flag("-wd2304") # disable ICC's "warning #2304: non-explicit constructor with single argument may cause implicit type conversion" produced by -Wnon-virtual-dtor
|
||||
|
||||
|
||||
|
||||
|
||||
# The -ansi flag must be added last, otherwise it is also used as a linker flag by check_cxx_compiler_flag making it fails
|
||||
# Moreover we should not set both -strict-ansi and -ansi
|
||||
check_cxx_compiler_flag("-strict-ansi" COMPILER_SUPPORT_STRICTANSI)
|
||||
ei_add_cxx_compiler_flag("-Qunused-arguments") # disable clang warning: argument unused during compilation: '-ansi'
|
||||
|
||||
|
||||
if(COMPILER_SUPPORT_STRICTANSI)
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -strict-ansi")
|
||||
else()
|
||||
@@ -203,7 +206,7 @@ if(NOT MSVC)
|
||||
ei_add_cxx_compiler_flag("-pie")
|
||||
ei_add_cxx_compiler_flag("-fPIE")
|
||||
endif()
|
||||
|
||||
|
||||
set(CMAKE_REQUIRED_FLAGS "")
|
||||
|
||||
option(EIGEN_TEST_SSE2 "Enable/Disable SSE2 in tests/examples" OFF)
|
||||
@@ -250,10 +253,7 @@ if(NOT MSVC)
|
||||
|
||||
option(EIGEN_TEST_AVX512 "Enable/Disable AVX512 in tests/examples" OFF)
|
||||
if(EIGEN_TEST_AVX512)
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512f -mfma -DEIGEN_ENABLE_AVX512")
|
||||
if (NOT "${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fabi-version=6")
|
||||
endif()
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512f -fabi-version=6 -DEIGEN_ENABLE_AVX512")
|
||||
message(STATUS "Enabling AVX512 in tests/examples")
|
||||
endif()
|
||||
|
||||
@@ -275,12 +275,6 @@ if(NOT MSVC)
|
||||
message(STATUS "Enabling VSX in tests/examples")
|
||||
endif()
|
||||
|
||||
option(EIGEN_TEST_MSA "Enable/Disable MSA in tests/examples" OFF)
|
||||
if(EIGEN_TEST_MSA)
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mmsa")
|
||||
message(STATUS "Enabling MSA in tests/examples")
|
||||
endif()
|
||||
|
||||
option(EIGEN_TEST_NEON "Enable/Disable Neon in tests/examples" OFF)
|
||||
if(EIGEN_TEST_NEON)
|
||||
if(EIGEN_TEST_FMA)
|
||||
@@ -298,18 +292,12 @@ if(NOT MSVC)
|
||||
message(STATUS "Enabling NEON in tests/examples")
|
||||
endif()
|
||||
|
||||
option(EIGEN_TEST_Z13 "Enable/Disable S390X(zEC13) ZVECTOR in tests/examples" OFF)
|
||||
if(EIGEN_TEST_Z13)
|
||||
option(EIGEN_TEST_ZVECTOR "Enable/Disable S390X(zEC13) ZVECTOR in tests/examples" OFF)
|
||||
if(EIGEN_TEST_ZVECTOR)
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=z13 -mzvector")
|
||||
message(STATUS "Enabling S390X(zEC13) ZVECTOR in tests/examples")
|
||||
endif()
|
||||
|
||||
option(EIGEN_TEST_Z14 "Enable/Disable S390X(zEC14) ZVECTOR in tests/examples" OFF)
|
||||
if(EIGEN_TEST_Z14)
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=z14 -mzvector")
|
||||
message(STATUS "Enabling S390X(zEC13) ZVECTOR in tests/examples")
|
||||
endif()
|
||||
|
||||
check_cxx_compiler_flag("-fopenmp" COMPILER_SUPPORT_OPENMP)
|
||||
if(COMPILER_SUPPORT_OPENMP)
|
||||
option(EIGEN_TEST_OPENMP "Enable/Disable OpenMP in tests/examples" OFF)
|
||||
@@ -319,7 +307,7 @@ if(NOT MSVC)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
else()
|
||||
else(NOT MSVC)
|
||||
|
||||
# C4127 - conditional expression is constant
|
||||
# C4714 - marked as __forceinline not inlined (I failed to deactivate it selectively)
|
||||
@@ -327,7 +315,7 @@ else()
|
||||
# because we are oftentimes returning objects that have a destructor or may
|
||||
# throw exceptions - in particular in the unit tests we are throwing extra many
|
||||
# exceptions to cover indexing errors.
|
||||
# C4505 - unreferenced local function has been removed (impossible to deactivate selectively)
|
||||
# C4505 - unreferenced local function has been removed (impossible to deactive selectively)
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /EHsc /wd4127 /wd4505 /wd4714")
|
||||
|
||||
# replace all /Wx by /W4
|
||||
@@ -347,23 +335,10 @@ else()
|
||||
if(NOT CMAKE_CL_64)
|
||||
# arch is not supported on 64 bit systems, SSE is enabled automatically.
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:SSE2")
|
||||
endif()
|
||||
endif(NOT CMAKE_CL_64)
|
||||
message(STATUS "Enabling SSE2 in tests/examples")
|
||||
endif()
|
||||
|
||||
option(EIGEN_TEST_AVX "Enable/Disable AVX in tests/examples" OFF)
|
||||
if(EIGEN_TEST_AVX)
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX")
|
||||
message(STATUS "Enabling AVX in tests/examples")
|
||||
endif()
|
||||
|
||||
option(EIGEN_TEST_FMA "Enable/Disable FMA/AVX2 in tests/examples" OFF)
|
||||
if(EIGEN_TEST_FMA AND NOT EIGEN_TEST_NEON)
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2")
|
||||
message(STATUS "Enabling FMA/AVX2 in tests/examples")
|
||||
endif()
|
||||
|
||||
endif()
|
||||
endif(EIGEN_TEST_SSE2)
|
||||
endif(NOT MSVC)
|
||||
|
||||
option(EIGEN_TEST_NO_EXPLICIT_VECTORIZATION "Disable explicit vectorization in tests/examples" OFF)
|
||||
option(EIGEN_TEST_X87 "Force using X87 instructions. Implies no vectorization." OFF)
|
||||
@@ -407,7 +382,7 @@ endif()
|
||||
|
||||
set(EIGEN_CUDA_COMPUTE_ARCH 30 CACHE STRING "The CUDA compute architecture level to target when compiling CUDA code")
|
||||
|
||||
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
|
||||
include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR})
|
||||
|
||||
# Backward compatibility support for EIGEN_INCLUDE_INSTALL_DIR
|
||||
if(EIGEN_INCLUDE_INSTALL_DIR)
|
||||
@@ -440,9 +415,9 @@ macro(ei_add_target_property target prop value)
|
||||
# if the property wasn't previously set, ${previous} is now "previous-NOTFOUND" which cmake allows catching with plain if()
|
||||
if(NOT previous)
|
||||
set(previous "")
|
||||
endif()
|
||||
endif(NOT previous)
|
||||
set_target_properties(${target} PROPERTIES ${prop} "${previous} ${value}")
|
||||
endmacro()
|
||||
endmacro(ei_add_target_property)
|
||||
|
||||
install(FILES
|
||||
signature_of_eigen3_matrix_library
|
||||
@@ -456,7 +431,7 @@ if(EIGEN_BUILD_PKGCONFIG)
|
||||
)
|
||||
endif()
|
||||
|
||||
install(DIRECTORY Eigen DESTINATION ${INCLUDE_INSTALL_DIR} COMPONENT Devel)
|
||||
add_subdirectory(Eigen)
|
||||
|
||||
add_subdirectory(doc EXCLUDE_FROM_ALL)
|
||||
|
||||
@@ -469,8 +444,6 @@ if(BUILD_TESTING)
|
||||
else()
|
||||
add_subdirectory(test EXCLUDE_FROM_ALL)
|
||||
endif()
|
||||
|
||||
add_subdirectory(failtest)
|
||||
endif()
|
||||
|
||||
if(EIGEN_LEAVE_TEST_IN_ALL_TARGET)
|
||||
@@ -483,31 +456,9 @@ endif()
|
||||
|
||||
# add SYCL
|
||||
option(EIGEN_TEST_SYCL "Add Sycl support." OFF)
|
||||
option(EIGEN_SYCL_TRISYCL "Use the triSYCL Sycl implementation (ComputeCPP by default)." OFF)
|
||||
if(EIGEN_TEST_SYCL)
|
||||
set (CMAKE_MODULE_PATH "${CMAKE_ROOT}/Modules" "cmake/Modules/" "${CMAKE_MODULE_PATH}")
|
||||
if(EIGEN_SYCL_TRISYCL)
|
||||
message(STATUS "Using triSYCL")
|
||||
include(FindTriSYCL)
|
||||
else()
|
||||
message(STATUS "Using ComputeCPP SYCL")
|
||||
include(FindComputeCpp)
|
||||
set(COMPUTECPP_DRIVER_DEFAULT_VALUE OFF)
|
||||
if (NOT MSVC)
|
||||
set(COMPUTECPP_DRIVER_DEFAULT_VALUE ON)
|
||||
endif()
|
||||
option(COMPUTECPP_USE_COMPILER_DRIVER
|
||||
"Use ComputeCpp driver instead of a 2 steps compilation"
|
||||
${COMPUTECPP_DRIVER_DEFAULT_VALUE}
|
||||
)
|
||||
endif(EIGEN_SYCL_TRISYCL)
|
||||
option(EIGEN_DONT_VECTORIZE_SYCL "Don't use vectorisation in the SYCL tests." OFF)
|
||||
if(EIGEN_DONT_VECTORIZE_SYCL)
|
||||
message(STATUS "Disabling SYCL vectorization in tests/examples")
|
||||
# When disabling SYCL vectorization, also disable Eigen default vectorization
|
||||
add_definitions(-DEIGEN_DONT_VECTORIZE=1)
|
||||
add_definitions(-DEIGEN_DONT_VECTORIZE_SYCL=1)
|
||||
endif()
|
||||
include(FindComputeCpp)
|
||||
endif()
|
||||
|
||||
add_subdirectory(unsupported)
|
||||
@@ -520,11 +471,11 @@ add_subdirectory(scripts EXCLUDE_FROM_ALL)
|
||||
# TODO: consider also replacing EIGEN_BUILD_BTL by a custom target "make btl"?
|
||||
if(EIGEN_BUILD_BTL)
|
||||
add_subdirectory(bench/btl EXCLUDE_FROM_ALL)
|
||||
endif()
|
||||
endif(EIGEN_BUILD_BTL)
|
||||
|
||||
if(NOT WIN32)
|
||||
add_subdirectory(bench/spbench EXCLUDE_FROM_ALL)
|
||||
endif()
|
||||
endif(NOT WIN32)
|
||||
|
||||
configure_file(scripts/cdashtesting.cmake.in cdashtesting.cmake @ONLY)
|
||||
|
||||
@@ -536,6 +487,11 @@ message(STATUS "")
|
||||
message(STATUS "Configured Eigen ${EIGEN_VERSION_NUMBER}")
|
||||
message(STATUS "")
|
||||
|
||||
option(EIGEN_FAILTEST "Enable failtests." OFF)
|
||||
if(EIGEN_FAILTEST)
|
||||
add_subdirectory(failtest)
|
||||
endif()
|
||||
|
||||
string(TOLOWER "${CMAKE_GENERATOR}" cmake_generator_tolower)
|
||||
if(cmake_generator_tolower MATCHES "makefile")
|
||||
message(STATUS "Some things you can do now:")
|
||||
@@ -552,10 +508,8 @@ if(cmake_generator_tolower MATCHES "makefile")
|
||||
message(STATUS " | Or:")
|
||||
message(STATUS " | cmake . -DINCLUDE_INSTALL_DIR=yourdir")
|
||||
message(STATUS "make doc | Generate the API documentation, requires Doxygen & LaTeX")
|
||||
if(BUILD_TESTING)
|
||||
message(STATUS "make check | Build and run the unit-tests. Read this page:")
|
||||
message(STATUS " | http://eigen.tuxfamily.org/index.php?title=Tests")
|
||||
endif()
|
||||
message(STATUS "make check | Build and run the unit-tests. Read this page:")
|
||||
message(STATUS " | http://eigen.tuxfamily.org/index.php?title=Tests")
|
||||
message(STATUS "make blas | Build BLAS library (not the same thing as Eigen)")
|
||||
message(STATUS "make uninstall| Removes files installed by make install")
|
||||
message(STATUS "--------------+--------------------------------------------------------------")
|
||||
@@ -581,7 +535,7 @@ if (NOT CMAKE_VERSION VERSION_LESS 3.0)
|
||||
|
||||
# Imported target support
|
||||
add_library (eigen INTERFACE)
|
||||
add_library (Eigen3::Eigen ALIAS eigen)
|
||||
|
||||
target_compile_definitions (eigen INTERFACE ${EIGEN_DEFINITIONS})
|
||||
target_include_directories (eigen INTERFACE
|
||||
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
|
||||
@@ -620,13 +574,13 @@ if (NOT CMAKE_VERSION VERSION_LESS 3.0)
|
||||
|
||||
install (EXPORT Eigen3Targets NAMESPACE Eigen3:: DESTINATION ${CMAKEPACKAGE_INSTALL_DIR})
|
||||
|
||||
else ()
|
||||
else (NOT CMAKE_VERSION VERSION_LESS 3.0)
|
||||
# Fallback to legacy Eigen3Config.cmake without the imported target
|
||||
|
||||
|
||||
# If CMakePackageConfigHelpers module is available (CMake >= 2.8.8)
|
||||
# create a relocatable Config file, otherwise leave the hardcoded paths
|
||||
# create a relocatable Config file, otherwise leave the hardcoded paths
|
||||
include(CMakePackageConfigHelpers OPTIONAL RESULT_VARIABLE CPCH_PATH)
|
||||
|
||||
|
||||
if(CPCH_PATH)
|
||||
configure_package_config_file (
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/cmake/Eigen3ConfigLegacy.cmake.in
|
||||
@@ -635,7 +589,7 @@ else ()
|
||||
INSTALL_DESTINATION ${CMAKEPACKAGE_INSTALL_DIR}
|
||||
NO_CHECK_REQUIRED_COMPONENTS_MACRO # Eigen does not provide components
|
||||
)
|
||||
else()
|
||||
else()
|
||||
# The PACKAGE_* variables are defined by the configure_package_config_file
|
||||
# but without it we define them manually to the hardcoded paths
|
||||
set(PACKAGE_INIT "")
|
||||
@@ -650,7 +604,7 @@ else ()
|
||||
VERSION ${EIGEN_VERSION_NUMBER}
|
||||
COMPATIBILITY SameMajorVersion )
|
||||
|
||||
endif ()
|
||||
endif (NOT CMAKE_VERSION VERSION_LESS 3.0)
|
||||
|
||||
install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/cmake/UseEigen3.cmake
|
||||
${CMAKE_CURRENT_BINARY_DIR}/Eigen3Config.cmake
|
||||
|
||||
@@ -2,16 +2,12 @@
|
||||
## Then modify the CMakeLists.txt file in the root directory of your
|
||||
## project to incorporate the testing dashboard.
|
||||
## # The following are required to uses Dart and the Cdash dashboard
|
||||
## enable_testing()
|
||||
## include(CTest)
|
||||
set(CTEST_PROJECT_NAME "Eigen")
|
||||
## ENABLE_TESTING()
|
||||
## INCLUDE(CTest)
|
||||
set(CTEST_PROJECT_NAME "Eigen 3.3")
|
||||
set(CTEST_NIGHTLY_START_TIME "00:00:00 UTC")
|
||||
|
||||
set(CTEST_DROP_METHOD "http")
|
||||
set(CTEST_DROP_SITE "my.cdash.org")
|
||||
set(CTEST_DROP_LOCATION "/submit.php?project=Eigen")
|
||||
set(CTEST_DROP_SITE "manao.inria.fr")
|
||||
set(CTEST_DROP_LOCATION "/CDash/submit.php?project=Eigen+3.3")
|
||||
set(CTEST_DROP_SITE_CDASH TRUE)
|
||||
#set(CTEST_PROJECT_SUBPROJECTS
|
||||
#Official
|
||||
#Unsupported
|
||||
#)
|
||||
|
||||
19
Eigen/CMakeLists.txt
Normal file
19
Eigen/CMakeLists.txt
Normal file
@@ -0,0 +1,19 @@
|
||||
include(RegexUtils)
|
||||
test_escape_string_as_regex()
|
||||
|
||||
file(GLOB Eigen_directory_files "*")
|
||||
|
||||
escape_string_as_regex(ESCAPED_CMAKE_CURRENT_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}")
|
||||
|
||||
foreach(f ${Eigen_directory_files})
|
||||
if(NOT f MATCHES "\\.txt" AND NOT f MATCHES "${ESCAPED_CMAKE_CURRENT_SOURCE_DIR}/[.].+" AND NOT f MATCHES "${ESCAPED_CMAKE_CURRENT_SOURCE_DIR}/src")
|
||||
list(APPEND Eigen_directory_files_to_install ${f})
|
||||
endif()
|
||||
endforeach(f ${Eigen_directory_files})
|
||||
|
||||
install(FILES
|
||||
${Eigen_directory_files_to_install}
|
||||
DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen COMPONENT Devel
|
||||
)
|
||||
|
||||
install(DIRECTORY src DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen COMPONENT Devel FILES_MATCHING PATTERN "*.h")
|
||||
360
Eigen/Core
360
Eigen/Core
@@ -14,26 +14,79 @@
|
||||
// first thing Eigen does: stop the compiler from committing suicide
|
||||
#include "src/Core/util/DisableStupidWarnings.h"
|
||||
|
||||
// then include this file where all our macros are defined. It's really important to do it first because
|
||||
// it's where we do all the compiler/OS/arch detections and define most defaults.
|
||||
#include "src/Core/util/Macros.h"
|
||||
|
||||
// This detects SSE/AVX/NEON/etc. and configure alignment settings
|
||||
#include "src/Core/util/ConfigureVectorization.h"
|
||||
|
||||
// We need cuda_runtime.h/hip_runtime.h to ensure that
|
||||
// the EIGEN_USING_STD_MATH macro works properly on the device side
|
||||
#if defined(EIGEN_CUDACC)
|
||||
#include <cuda_runtime.h>
|
||||
#elif defined(EIGEN_HIPCC)
|
||||
#include <hip/hip_runtime.h>
|
||||
#if defined(__CUDACC__) && !defined(EIGEN_NO_CUDA)
|
||||
#define EIGEN_CUDACC __CUDACC__
|
||||
#endif
|
||||
|
||||
#if defined(__CUDA_ARCH__) && !defined(EIGEN_NO_CUDA)
|
||||
#define EIGEN_CUDA_ARCH __CUDA_ARCH__
|
||||
#endif
|
||||
|
||||
#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
|
||||
#define EIGEN_CUDACC_VER ((__CUDACC_VER_MAJOR__ * 10000) + (__CUDACC_VER_MINOR__ * 100))
|
||||
#elif defined(__CUDACC_VER__)
|
||||
#define EIGEN_CUDACC_VER __CUDACC_VER__
|
||||
#else
|
||||
#define EIGEN_CUDACC_VER 0
|
||||
#endif
|
||||
|
||||
// Handle NVCC/CUDA/SYCL
|
||||
#if defined(__CUDACC__) || defined(__SYCL_DEVICE_ONLY__)
|
||||
// Do not try asserts on CUDA and SYCL!
|
||||
#ifndef EIGEN_NO_DEBUG
|
||||
#define EIGEN_NO_DEBUG
|
||||
#endif
|
||||
|
||||
#ifdef EIGEN_INTERNAL_DEBUGGING
|
||||
#undef EIGEN_INTERNAL_DEBUGGING
|
||||
#endif
|
||||
|
||||
#ifdef EIGEN_EXCEPTIONS
|
||||
#undef EIGEN_EXCEPTIONS
|
||||
#endif
|
||||
|
||||
// All functions callable from CUDA code must be qualified with __device__
|
||||
#ifdef __CUDACC__
|
||||
// Do not try to vectorize on CUDA and SYCL!
|
||||
#ifndef EIGEN_DONT_VECTORIZE
|
||||
#define EIGEN_DONT_VECTORIZE
|
||||
#endif
|
||||
|
||||
#define EIGEN_DEVICE_FUNC __host__ __device__
|
||||
// We need cuda_runtime.h to ensure that that EIGEN_USING_STD_MATH macro
|
||||
// works properly on the device side
|
||||
#include <cuda_runtime.h>
|
||||
#else
|
||||
#define EIGEN_DEVICE_FUNC
|
||||
#endif
|
||||
|
||||
#else
|
||||
#define EIGEN_DEVICE_FUNC
|
||||
|
||||
#endif
|
||||
|
||||
// When compiling CUDA device code with NVCC, pull in math functions from the
|
||||
// global namespace. In host mode, and when device doee with clang, use the
|
||||
// std versions.
|
||||
#if defined(__CUDA_ARCH__) && defined(__NVCC__)
|
||||
#define EIGEN_USING_STD_MATH(FUNC) using ::FUNC;
|
||||
#else
|
||||
#define EIGEN_USING_STD_MATH(FUNC) using std::FUNC;
|
||||
#endif
|
||||
|
||||
#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(__CUDA_ARCH__) && !defined(EIGEN_EXCEPTIONS) && !defined(EIGEN_USE_SYCL)
|
||||
#define EIGEN_EXCEPTIONS
|
||||
#endif
|
||||
|
||||
#ifdef EIGEN_EXCEPTIONS
|
||||
#include <new>
|
||||
#endif
|
||||
|
||||
// then include this file where all our macros are defined. It's really important to do it first because
|
||||
// it's where we do all the alignment settings (platform detection and honoring the user's will if he
|
||||
// defined e.g. EIGEN_DONT_ALIGN) so it needs to be done before we do anything with vectorization.
|
||||
#include "src/Core/util/Macros.h"
|
||||
|
||||
// Disable the ipa-cp-clone optimization flag with MinGW 6.x or newer (enabled by default with -O3)
|
||||
// See http://eigen.tuxfamily.org/bz/show_bug.cgi?id=556 for details.
|
||||
#if EIGEN_COMP_MINGW && EIGEN_GNUC_AT_LEAST(4,6)
|
||||
@@ -46,9 +99,163 @@
|
||||
// and inclusion of their respective header files
|
||||
#include "src/Core/util/MKL_support.h"
|
||||
|
||||
// if alignment is disabled, then disable vectorization. Note: EIGEN_MAX_ALIGN_BYTES is the proper check, it takes into
|
||||
// account both the user's will (EIGEN_MAX_ALIGN_BYTES,EIGEN_DONT_ALIGN) and our own platform checks
|
||||
#if EIGEN_MAX_ALIGN_BYTES==0
|
||||
#ifndef EIGEN_DONT_VECTORIZE
|
||||
#define EIGEN_DONT_VECTORIZE
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)
|
||||
#define EIGEN_HAS_GPU_FP16
|
||||
#if EIGEN_COMP_MSVC
|
||||
#include <malloc.h> // for _aligned_malloc -- need it regardless of whether vectorization is enabled
|
||||
#if (EIGEN_COMP_MSVC >= 1500) // 2008 or later
|
||||
// Remember that usage of defined() in a #define is undefined by the standard.
|
||||
// a user reported that in 64-bit mode, MSVC doesn't care to define _M_IX86_FP.
|
||||
#if (defined(_M_IX86_FP) && (_M_IX86_FP >= 2)) || EIGEN_ARCH_x86_64
|
||||
#define EIGEN_SSE2_ON_MSVC_2008_OR_LATER
|
||||
#endif
|
||||
#endif
|
||||
#else
|
||||
// Remember that usage of defined() in a #define is undefined by the standard
|
||||
#if (defined __SSE2__) && ( (!EIGEN_COMP_GNUC) || EIGEN_COMP_ICC || EIGEN_GNUC_AT_LEAST(4,2) )
|
||||
#define EIGEN_SSE2_ON_NON_MSVC_BUT_NOT_OLD_GCC
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef EIGEN_DONT_VECTORIZE
|
||||
|
||||
#if defined (EIGEN_SSE2_ON_NON_MSVC_BUT_NOT_OLD_GCC) || defined(EIGEN_SSE2_ON_MSVC_2008_OR_LATER)
|
||||
|
||||
// Defines symbols for compile-time detection of which instructions are
|
||||
// used.
|
||||
// EIGEN_VECTORIZE_YY is defined if and only if the instruction set YY is used
|
||||
#define EIGEN_VECTORIZE
|
||||
#define EIGEN_VECTORIZE_SSE
|
||||
#define EIGEN_VECTORIZE_SSE2
|
||||
|
||||
// Detect sse3/ssse3/sse4:
|
||||
// gcc and icc defines __SSE3__, ...
|
||||
// there is no way to know about this on msvc. You can define EIGEN_VECTORIZE_SSE* if you
|
||||
// want to force the use of those instructions with msvc.
|
||||
#ifdef __SSE3__
|
||||
#define EIGEN_VECTORIZE_SSE3
|
||||
#endif
|
||||
#ifdef __SSSE3__
|
||||
#define EIGEN_VECTORIZE_SSSE3
|
||||
#endif
|
||||
#ifdef __SSE4_1__
|
||||
#define EIGEN_VECTORIZE_SSE4_1
|
||||
#endif
|
||||
#ifdef __SSE4_2__
|
||||
#define EIGEN_VECTORIZE_SSE4_2
|
||||
#endif
|
||||
#ifdef __AVX__
|
||||
#define EIGEN_VECTORIZE_AVX
|
||||
#define EIGEN_VECTORIZE_SSE3
|
||||
#define EIGEN_VECTORIZE_SSSE3
|
||||
#define EIGEN_VECTORIZE_SSE4_1
|
||||
#define EIGEN_VECTORIZE_SSE4_2
|
||||
#endif
|
||||
#ifdef __AVX2__
|
||||
#define EIGEN_VECTORIZE_AVX2
|
||||
#endif
|
||||
#ifdef __FMA__
|
||||
#define EIGEN_VECTORIZE_FMA
|
||||
#endif
|
||||
#if defined(__AVX512F__) && defined(EIGEN_ENABLE_AVX512)
|
||||
#define EIGEN_VECTORIZE_AVX512
|
||||
#define EIGEN_VECTORIZE_AVX2
|
||||
#define EIGEN_VECTORIZE_AVX
|
||||
#define EIGEN_VECTORIZE_FMA
|
||||
#ifdef __AVX512DQ__
|
||||
#define EIGEN_VECTORIZE_AVX512DQ
|
||||
#endif
|
||||
#ifdef __AVX512ER__
|
||||
#define EIGEN_VECTORIZE_AVX512ER
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// include files
|
||||
|
||||
// This extern "C" works around a MINGW-w64 compilation issue
|
||||
// https://sourceforge.net/tracker/index.php?func=detail&aid=3018394&group_id=202880&atid=983354
|
||||
// In essence, intrin.h is included by windows.h and also declares intrinsics (just as emmintrin.h etc. below do).
|
||||
// However, intrin.h uses an extern "C" declaration, and g++ thus complains of duplicate declarations
|
||||
// with conflicting linkage. The linkage for intrinsics doesn't matter, but at that stage the compiler doesn't know;
|
||||
// so, to avoid compile errors when windows.h is included after Eigen/Core, ensure intrinsics are extern "C" here too.
|
||||
// notice that since these are C headers, the extern "C" is theoretically needed anyways.
|
||||
extern "C" {
|
||||
// In theory we should only include immintrin.h and not the other *mmintrin.h header files directly.
|
||||
// Doing so triggers some issues with ICC. However old gcc versions seems to not have this file, thus:
|
||||
#if EIGEN_COMP_ICC >= 1110
|
||||
#include <immintrin.h>
|
||||
#else
|
||||
#include <mmintrin.h>
|
||||
#include <emmintrin.h>
|
||||
#include <xmmintrin.h>
|
||||
#ifdef EIGEN_VECTORIZE_SSE3
|
||||
#include <pmmintrin.h>
|
||||
#endif
|
||||
#ifdef EIGEN_VECTORIZE_SSSE3
|
||||
#include <tmmintrin.h>
|
||||
#endif
|
||||
#ifdef EIGEN_VECTORIZE_SSE4_1
|
||||
#include <smmintrin.h>
|
||||
#endif
|
||||
#ifdef EIGEN_VECTORIZE_SSE4_2
|
||||
#include <nmmintrin.h>
|
||||
#endif
|
||||
#if defined(EIGEN_VECTORIZE_AVX) || defined(EIGEN_VECTORIZE_AVX512)
|
||||
#include <immintrin.h>
|
||||
#endif
|
||||
#endif
|
||||
} // end extern "C"
|
||||
#elif defined __VSX__
|
||||
#define EIGEN_VECTORIZE
|
||||
#define EIGEN_VECTORIZE_VSX
|
||||
#include <altivec.h>
|
||||
// We need to #undef all these ugly tokens defined in <altivec.h>
|
||||
// => use __vector instead of vector
|
||||
#undef bool
|
||||
#undef vector
|
||||
#undef pixel
|
||||
#elif defined __ALTIVEC__
|
||||
#define EIGEN_VECTORIZE
|
||||
#define EIGEN_VECTORIZE_ALTIVEC
|
||||
#include <altivec.h>
|
||||
// We need to #undef all these ugly tokens defined in <altivec.h>
|
||||
// => use __vector instead of vector
|
||||
#undef bool
|
||||
#undef vector
|
||||
#undef pixel
|
||||
#elif (defined __ARM_NEON) || (defined __ARM_NEON__)
|
||||
#define EIGEN_VECTORIZE
|
||||
#define EIGEN_VECTORIZE_NEON
|
||||
#include <arm_neon.h>
|
||||
#elif (defined __s390x__ && defined __VEC__)
|
||||
#define EIGEN_VECTORIZE
|
||||
#define EIGEN_VECTORIZE_ZVECTOR
|
||||
#include <vecintrin.h>
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(__F16C__) && !defined(EIGEN_COMP_CLANG)
|
||||
// We can use the optimized fp16 to float and float to fp16 conversion routines
|
||||
#define EIGEN_HAS_FP16_C
|
||||
#endif
|
||||
|
||||
#if defined __CUDACC__
|
||||
#define EIGEN_VECTORIZE_CUDA
|
||||
#include <vector_types.h>
|
||||
#if EIGEN_CUDACC_VER >= 70500
|
||||
#define EIGEN_HAS_CUDA_FP16
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined EIGEN_HAS_CUDA_FP16
|
||||
#include <host_defines.h>
|
||||
#include <cuda_fp16.h>
|
||||
#endif
|
||||
|
||||
#if (defined _OPENMP) && (!defined EIGEN_DONT_PARALLELIZE)
|
||||
@@ -72,9 +279,7 @@
|
||||
#include <cmath>
|
||||
#include <cassert>
|
||||
#include <functional>
|
||||
#ifndef EIGEN_NO_IO
|
||||
#include <iosfwd>
|
||||
#endif
|
||||
#include <iosfwd>
|
||||
#include <cstring>
|
||||
#include <string>
|
||||
#include <limits>
|
||||
@@ -82,10 +287,6 @@
|
||||
// for min/max:
|
||||
#include <algorithm>
|
||||
|
||||
#if EIGEN_HAS_CXX11
|
||||
#include <array>
|
||||
#endif
|
||||
|
||||
// for std::is_nothrow_move_assignable
|
||||
#ifdef EIGEN_INCLUDE_TYPE_TRAITS
|
||||
#include <type_traits>
|
||||
@@ -101,25 +302,38 @@
|
||||
#include <intrin.h>
|
||||
#endif
|
||||
|
||||
#if defined(EIGEN_USE_SYCL)
|
||||
#undef min
|
||||
#undef max
|
||||
#undef isnan
|
||||
#undef isinf
|
||||
#undef isfinite
|
||||
#include <SYCL/sycl.hpp>
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
#include <thread>
|
||||
#ifndef EIGEN_SYCL_LOCAL_THREAD_DIM0
|
||||
#define EIGEN_SYCL_LOCAL_THREAD_DIM0 16
|
||||
#endif
|
||||
#ifndef EIGEN_SYCL_LOCAL_THREAD_DIM1
|
||||
#define EIGEN_SYCL_LOCAL_THREAD_DIM1 16
|
||||
#endif
|
||||
#endif
|
||||
/** \brief Namespace containing all symbols from the %Eigen library. */
|
||||
namespace Eigen {
|
||||
|
||||
inline static const char *SimdInstructionSetsInUse(void) {
|
||||
#if defined(EIGEN_VECTORIZE_AVX512)
|
||||
return "AVX512, FMA, AVX2, AVX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
|
||||
#elif defined(EIGEN_VECTORIZE_AVX)
|
||||
return "AVX SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
|
||||
#elif defined(EIGEN_VECTORIZE_SSE4_2)
|
||||
return "SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
|
||||
#elif defined(EIGEN_VECTORIZE_SSE4_1)
|
||||
return "SSE, SSE2, SSE3, SSSE3, SSE4.1";
|
||||
#elif defined(EIGEN_VECTORIZE_SSSE3)
|
||||
return "SSE, SSE2, SSE3, SSSE3";
|
||||
#elif defined(EIGEN_VECTORIZE_SSE3)
|
||||
return "SSE, SSE2, SSE3";
|
||||
#elif defined(EIGEN_VECTORIZE_SSE2)
|
||||
return "SSE, SSE2";
|
||||
#elif defined(EIGEN_VECTORIZE_ALTIVEC)
|
||||
return "AltiVec";
|
||||
#elif defined(EIGEN_VECTORIZE_VSX)
|
||||
return "VSX";
|
||||
#elif defined(EIGEN_VECTORIZE_NEON)
|
||||
return "ARM NEON";
|
||||
#elif defined(EIGEN_VECTORIZE_ZVECTOR)
|
||||
return "S390X ZVECTOR";
|
||||
#else
|
||||
return "None";
|
||||
#endif
|
||||
}
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#if defined EIGEN2_SUPPORT_STAGE40_FULL_EIGEN3_STRICTNESS || defined EIGEN2_SUPPORT_STAGE30_FULL_EIGEN3_API || defined EIGEN2_SUPPORT_STAGE20_RESOLVE_API_CONFLICTS || defined EIGEN2_SUPPORT_STAGE10_FULL_EIGEN2_API || defined EIGEN2_SUPPORT
|
||||
// This will generate an error message:
|
||||
@@ -128,7 +342,7 @@
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
// we use size_t frequently and we'll never remember to prepend it with std:: every time just to
|
||||
// we use size_t frequently and we'll never remember to prepend it with std:: everytime just to
|
||||
// ensure QNX/QCC support
|
||||
using std::size_t;
|
||||
// gcc 4.6.0 wants std:: for ptrdiff_t
|
||||
@@ -152,85 +366,58 @@ using std::ptrdiff_t;
|
||||
#include "src/Core/util/StaticAssert.h"
|
||||
#include "src/Core/util/XprHelper.h"
|
||||
#include "src/Core/util/Memory.h"
|
||||
#include "src/Core/util/IntegralConstant.h"
|
||||
#include "src/Core/util/SymbolicIndex.h"
|
||||
|
||||
#include "src/Core/NumTraits.h"
|
||||
#include "src/Core/MathFunctions.h"
|
||||
#include "src/Core/GenericPacketMath.h"
|
||||
#include "src/Core/MathFunctionsImpl.h"
|
||||
#include "src/Core/arch/Default/ConjHelper.h"
|
||||
// Generic half float support
|
||||
#include "src/Core/arch/Default/Half.h"
|
||||
#include "src/Core/arch/Default/TypeCasting.h"
|
||||
#include "src/Core/arch/Default/GenericPacketMathFunctionsFwd.h"
|
||||
|
||||
#if defined EIGEN_VECTORIZE_AVX512
|
||||
#include "src/Core/arch/SSE/PacketMath.h"
|
||||
#include "src/Core/arch/SSE/TypeCasting.h"
|
||||
#include "src/Core/arch/SSE/Complex.h"
|
||||
#include "src/Core/arch/AVX/PacketMath.h"
|
||||
#include "src/Core/arch/AVX/TypeCasting.h"
|
||||
#include "src/Core/arch/AVX/Complex.h"
|
||||
#include "src/Core/arch/AVX512/PacketMath.h"
|
||||
#include "src/Core/arch/AVX512/TypeCasting.h"
|
||||
#include "src/Core/arch/AVX512/Complex.h"
|
||||
#include "src/Core/arch/SSE/MathFunctions.h"
|
||||
#include "src/Core/arch/AVX/MathFunctions.h"
|
||||
#include "src/Core/arch/AVX512/MathFunctions.h"
|
||||
#elif defined EIGEN_VECTORIZE_AVX
|
||||
// Use AVX for floats and doubles, SSE for integers
|
||||
#include "src/Core/arch/SSE/PacketMath.h"
|
||||
#include "src/Core/arch/SSE/TypeCasting.h"
|
||||
#include "src/Core/arch/SSE/Complex.h"
|
||||
#include "src/Core/arch/AVX/PacketMath.h"
|
||||
#include "src/Core/arch/AVX/TypeCasting.h"
|
||||
#include "src/Core/arch/AVX/Complex.h"
|
||||
#include "src/Core/arch/SSE/MathFunctions.h"
|
||||
#include "src/Core/arch/AVX/PacketMath.h"
|
||||
#include "src/Core/arch/AVX/MathFunctions.h"
|
||||
#include "src/Core/arch/AVX/Complex.h"
|
||||
#include "src/Core/arch/AVX/TypeCasting.h"
|
||||
#include "src/Core/arch/SSE/TypeCasting.h"
|
||||
#elif defined EIGEN_VECTORIZE_SSE
|
||||
#include "src/Core/arch/SSE/PacketMath.h"
|
||||
#include "src/Core/arch/SSE/TypeCasting.h"
|
||||
#include "src/Core/arch/SSE/MathFunctions.h"
|
||||
#include "src/Core/arch/SSE/Complex.h"
|
||||
#include "src/Core/arch/SSE/TypeCasting.h"
|
||||
#elif defined(EIGEN_VECTORIZE_ALTIVEC) || defined(EIGEN_VECTORIZE_VSX)
|
||||
#include "src/Core/arch/AltiVec/PacketMath.h"
|
||||
#include "src/Core/arch/AltiVec/MathFunctions.h"
|
||||
#include "src/Core/arch/AltiVec/Complex.h"
|
||||
#elif defined EIGEN_VECTORIZE_NEON
|
||||
#include "src/Core/arch/NEON/PacketMath.h"
|
||||
#include "src/Core/arch/NEON/TypeCasting.h"
|
||||
#include "src/Core/arch/NEON/MathFunctions.h"
|
||||
#include "src/Core/arch/NEON/Complex.h"
|
||||
#elif defined EIGEN_VECTORIZE_ZVECTOR
|
||||
#include "src/Core/arch/ZVector/PacketMath.h"
|
||||
#include "src/Core/arch/ZVector/MathFunctions.h"
|
||||
#include "src/Core/arch/ZVector/Complex.h"
|
||||
#elif defined EIGEN_VECTORIZE_MSA
|
||||
#include "src/Core/arch/MSA/PacketMath.h"
|
||||
#include "src/Core/arch/MSA/MathFunctions.h"
|
||||
#include "src/Core/arch/MSA/Complex.h"
|
||||
#endif
|
||||
|
||||
#if defined EIGEN_VECTORIZE_GPU
|
||||
#include "src/Core/arch/GPU/PacketMath.h"
|
||||
#include "src/Core/arch/GPU/MathFunctions.h"
|
||||
#include "src/Core/arch/GPU/TypeCasting.h"
|
||||
#endif
|
||||
// Half float support
|
||||
#include "src/Core/arch/CUDA/Half.h"
|
||||
#include "src/Core/arch/CUDA/PacketMathHalf.h"
|
||||
#include "src/Core/arch/CUDA/TypeCasting.h"
|
||||
|
||||
#if defined(EIGEN_USE_SYCL)
|
||||
#include "src/Core/arch/SYCL/SyclMemoryModel.h"
|
||||
#include "src/Core/arch/SYCL/InteropHeaders.h"
|
||||
#if !defined(EIGEN_DONT_VECTORIZE_SYCL)
|
||||
#include "src/Core/arch/SYCL/PacketMath.h"
|
||||
#include "src/Core/arch/SYCL/MathFunctions.h"
|
||||
#include "src/Core/arch/SYCL/TypeCasting.h"
|
||||
#endif
|
||||
#if defined EIGEN_VECTORIZE_CUDA
|
||||
#include "src/Core/arch/CUDA/PacketMath.h"
|
||||
#include "src/Core/arch/CUDA/MathFunctions.h"
|
||||
#endif
|
||||
|
||||
#include "src/Core/arch/Default/Settings.h"
|
||||
// This file provides generic implementations valid for scalar as well
|
||||
#include "src/Core/arch/Default/GenericPacketMathFunctions.h"
|
||||
|
||||
#include "src/Core/functors/TernaryFunctors.h"
|
||||
#include "src/Core/functors/BinaryFunctors.h"
|
||||
@@ -241,16 +428,9 @@ using std::ptrdiff_t;
|
||||
|
||||
// Specialized functors to enable the processing of complex numbers
|
||||
// on CUDA devices
|
||||
#ifdef EIGEN_CUDACC
|
||||
#include "src/Core/arch/CUDA/Complex.h"
|
||||
#endif
|
||||
|
||||
#include "src/Core/util/IndexedViewHelper.h"
|
||||
#include "src/Core/util/ReshapedHelper.h"
|
||||
#include "src/Core/ArithmeticSequence.h"
|
||||
#ifndef EIGEN_NO_IO
|
||||
#include "src/Core/IO.h"
|
||||
#endif
|
||||
#include "src/Core/IO.h"
|
||||
#include "src/Core/DenseCoeffsBase.h"
|
||||
#include "src/Core/DenseBase.h"
|
||||
#include "src/Core/MatrixBase.h"
|
||||
@@ -291,8 +471,6 @@ using std::ptrdiff_t;
|
||||
#include "src/Core/Ref.h"
|
||||
#include "src/Core/Block.h"
|
||||
#include "src/Core/VectorBlock.h"
|
||||
#include "src/Core/IndexedView.h"
|
||||
#include "src/Core/Reshaped.h"
|
||||
#include "src/Core/Transpose.h"
|
||||
#include "src/Core/DiagonalMatrix.h"
|
||||
#include "src/Core/Diagonal.h"
|
||||
@@ -332,12 +510,10 @@ using std::ptrdiff_t;
|
||||
#include "src/Core/BooleanRedux.h"
|
||||
#include "src/Core/Select.h"
|
||||
#include "src/Core/VectorwiseOp.h"
|
||||
#include "src/Core/PartialReduxEvaluator.h"
|
||||
#include "src/Core/Random.h"
|
||||
#include "src/Core/Replicate.h"
|
||||
#include "src/Core/Reverse.h"
|
||||
#include "src/Core/ArrayWrapper.h"
|
||||
#include "src/Core/StlIterators.h"
|
||||
|
||||
#ifdef EIGEN_USE_BLAS
|
||||
#include "src/Core/products/GeneralMatrixMatrix_BLAS.h"
|
||||
|
||||
@@ -10,14 +10,14 @@
|
||||
|
||||
#include "Core"
|
||||
|
||||
#include "src/Core/util/DisableStupidWarnings.h"
|
||||
|
||||
#include "Cholesky"
|
||||
#include "Jacobi"
|
||||
#include "Householder"
|
||||
#include "LU"
|
||||
#include "Geometry"
|
||||
|
||||
#include "src/Core/util/DisableStupidWarnings.h"
|
||||
|
||||
/** \defgroup Eigenvalues_Module Eigenvalues module
|
||||
*
|
||||
*
|
||||
|
||||
@@ -10,12 +10,12 @@
|
||||
|
||||
#include "Core"
|
||||
|
||||
#include "src/Core/util/DisableStupidWarnings.h"
|
||||
|
||||
#include "SVD"
|
||||
#include "LU"
|
||||
#include <limits>
|
||||
|
||||
#include "src/Core/util/DisableStupidWarnings.h"
|
||||
|
||||
/** \defgroup Geometry_Module Geometry module
|
||||
*
|
||||
* This module provides support for:
|
||||
@@ -49,8 +49,9 @@
|
||||
#include "src/Geometry/AlignedBox.h"
|
||||
#include "src/Geometry/Umeyama.h"
|
||||
|
||||
// Use the SSE optimized version whenever possible.
|
||||
#if defined EIGEN_VECTORIZE_SSE
|
||||
// Use the SSE optimized version whenever possible. At the moment the
|
||||
// SSE version doesn't compile when AVX is enabled
|
||||
#if defined EIGEN_VECTORIZE_SSE && !defined EIGEN_VECTORIZE_AVX
|
||||
#include "src/Geometry/arch/Geometry_SSE.h"
|
||||
#endif
|
||||
|
||||
@@ -58,3 +59,4 @@
|
||||
|
||||
#endif // EIGEN_GEOMETRY_MODULE_H
|
||||
/* vim: set filetype=cpp et sw=2 ts=2 ai: */
|
||||
|
||||
|
||||
@@ -1,41 +0,0 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#ifndef EIGEN_KLUSUPPORT_MODULE_H
|
||||
#define EIGEN_KLUSUPPORT_MODULE_H
|
||||
|
||||
#include <Eigen/SparseCore>
|
||||
|
||||
#include <Eigen/src/Core/util/DisableStupidWarnings.h>
|
||||
|
||||
extern "C" {
|
||||
#include <btf.h>
|
||||
#include <klu.h>
|
||||
}
|
||||
|
||||
/** \ingroup Support_modules
|
||||
* \defgroup KLUSupport_Module KLUSupport module
|
||||
*
|
||||
* This module provides an interface to the KLU library which is part of the <a href="http://www.suitesparse.com">suitesparse</a> package.
|
||||
* It provides the following factorization class:
|
||||
* - class KLU: a sparse LU factorization, well-suited for circuit simulation.
|
||||
*
|
||||
* \code
|
||||
* #include <Eigen/KLUSupport>
|
||||
* \endcode
|
||||
*
|
||||
* In order to use this module, the klu and btf headers must be accessible from the include paths, and your binary must be linked to the klu library and its dependencies.
|
||||
* The dependencies depend on how umfpack has been compiled.
|
||||
* For a cmake based project, you can use our FindKLU.cmake module to help you in this task.
|
||||
*
|
||||
*/
|
||||
|
||||
#include "src/KLUSupport/KLUSupport.h"
|
||||
|
||||
#include <Eigen/src/Core/util/ReenableStupidWarnings.h>
|
||||
|
||||
#endif // EIGEN_KLUSUPPORT_MODULE_H
|
||||
@@ -63,7 +63,10 @@
|
||||
* \endcode
|
||||
*/
|
||||
|
||||
#ifndef EIGEN_MPL2_ONLY
|
||||
#include "src/OrderingMethods/Amd.h"
|
||||
#endif
|
||||
|
||||
#include "src/OrderingMethods/Ordering.h"
|
||||
#include "src/Core/util/ReenableStupidWarnings.h"
|
||||
|
||||
|
||||
@@ -36,7 +36,6 @@ extern "C" {
|
||||
* \endcode
|
||||
*
|
||||
* In order to use this module, the PaSTiX headers must be accessible from the include paths, and your binary must be linked to the PaSTiX library and its dependencies.
|
||||
* This wrapper resuires PaStiX version 5.x compiled without MPI support.
|
||||
* The dependencies depend on how PaSTiX has been compiled.
|
||||
* For a cmake based project, you can use our FindPaSTiX.cmake module to help you in this task.
|
||||
*
|
||||
|
||||
0
Eigen/PardisoSupport
Normal file → Executable file
0
Eigen/PardisoSupport
Normal file → Executable file
4
Eigen/QR
4
Eigen/QR
@@ -10,12 +10,12 @@
|
||||
|
||||
#include "Core"
|
||||
|
||||
#include "src/Core/util/DisableStupidWarnings.h"
|
||||
|
||||
#include "Cholesky"
|
||||
#include "Jacobi"
|
||||
#include "Householder"
|
||||
|
||||
#include "src/Core/util/DisableStupidWarnings.h"
|
||||
|
||||
/** \defgroup QR_Module QR module
|
||||
*
|
||||
*
|
||||
|
||||
@@ -25,7 +25,9 @@
|
||||
|
||||
#include "SparseCore"
|
||||
#include "OrderingMethods"
|
||||
#ifndef EIGEN_MPL2_ONLY
|
||||
#include "SparseCholesky"
|
||||
#endif
|
||||
#include "SparseLU"
|
||||
#include "SparseQR"
|
||||
#include "IterativeLinearSolvers"
|
||||
|
||||
@@ -30,8 +30,16 @@
|
||||
* \endcode
|
||||
*/
|
||||
|
||||
#ifdef EIGEN_MPL2_ONLY
|
||||
#error The SparseCholesky module has nothing to offer in MPL2 only mode
|
||||
#endif
|
||||
|
||||
#include "src/SparseCholesky/SimplicialCholesky.h"
|
||||
|
||||
#ifndef EIGEN_MPL2_ONLY
|
||||
#include "src/SparseCholesky/SimplicialCholesky_impl.h"
|
||||
#endif
|
||||
|
||||
#include "src/Core/util/ReenableStupidWarnings.h"
|
||||
|
||||
#endif // EIGEN_SPARSECHOLESKY_MODULE_H
|
||||
|
||||
@@ -23,8 +23,6 @@
|
||||
// Ordering interface
|
||||
#include "OrderingMethods"
|
||||
|
||||
#include "src/Core/util/DisableStupidWarnings.h"
|
||||
|
||||
#include "src/SparseLU/SparseLU_gemm_kernel.h"
|
||||
|
||||
#include "src/SparseLU/SparseLU_Structs.h"
|
||||
@@ -45,6 +43,4 @@
|
||||
#include "src/SparseLU/SparseLU_Utils.h"
|
||||
#include "src/SparseLU/SparseLU.h"
|
||||
|
||||
#include "src/Core/util/ReenableStupidWarnings.h"
|
||||
|
||||
#endif // EIGEN_SPARSELU_MODULE_H
|
||||
|
||||
@@ -28,6 +28,7 @@
|
||||
*
|
||||
*/
|
||||
|
||||
#include "OrderingMethods"
|
||||
#include "src/SparseCore/SparseColEtree.h"
|
||||
#include "src/SparseQR/SparseQR.h"
|
||||
|
||||
|
||||
@@ -16,15 +16,6 @@
|
||||
namespace Eigen {
|
||||
|
||||
namespace internal {
|
||||
template<typename _MatrixType, int _UpLo> struct traits<LDLT<_MatrixType, _UpLo> >
|
||||
: traits<_MatrixType>
|
||||
{
|
||||
typedef MatrixXpr XprKind;
|
||||
typedef SolverStorage StorageKind;
|
||||
typedef int StorageIndex;
|
||||
enum { Flags = 0 };
|
||||
};
|
||||
|
||||
template<typename MatrixType, int UpLo> struct LDLT_Traits;
|
||||
|
||||
// PositiveSemiDef means positive semi-definite and non-zero; same for NegativeSemiDef
|
||||
@@ -57,19 +48,20 @@ namespace internal {
|
||||
* \sa MatrixBase::ldlt(), SelfAdjointView::ldlt(), class LLT
|
||||
*/
|
||||
template<typename _MatrixType, int _UpLo> class LDLT
|
||||
: public SolverBase<LDLT<_MatrixType, _UpLo> >
|
||||
{
|
||||
public:
|
||||
typedef _MatrixType MatrixType;
|
||||
typedef SolverBase<LDLT> Base;
|
||||
friend class SolverBase<LDLT>;
|
||||
|
||||
EIGEN_GENERIC_PUBLIC_INTERFACE(LDLT)
|
||||
enum {
|
||||
RowsAtCompileTime = MatrixType::RowsAtCompileTime,
|
||||
ColsAtCompileTime = MatrixType::ColsAtCompileTime,
|
||||
MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
|
||||
MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime,
|
||||
UpLo = _UpLo
|
||||
};
|
||||
typedef typename MatrixType::Scalar Scalar;
|
||||
typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
|
||||
typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
|
||||
typedef typename MatrixType::StorageIndex StorageIndex;
|
||||
typedef Matrix<Scalar, RowsAtCompileTime, 1, 0, MaxRowsAtCompileTime, 1> TmpMatrixType;
|
||||
|
||||
typedef Transpositions<RowsAtCompileTime, MaxRowsAtCompileTime> TranspositionType;
|
||||
@@ -188,7 +180,6 @@ template<typename _MatrixType, int _UpLo> class LDLT
|
||||
return m_sign == internal::NegativeSemiDef || m_sign == internal::ZeroSign;
|
||||
}
|
||||
|
||||
#ifdef EIGEN_PARSED_BY_DOXYGEN
|
||||
/** \returns a solution x of \f$ A x = b \f$ using the current decomposition of A.
|
||||
*
|
||||
* This function also supports in-place solves using the syntax <tt>x = decompositionObject.solve(x)</tt> .
|
||||
@@ -206,8 +197,13 @@ template<typename _MatrixType, int _UpLo> class LDLT
|
||||
*/
|
||||
template<typename Rhs>
|
||||
inline const Solve<LDLT, Rhs>
|
||||
solve(const MatrixBase<Rhs>& b) const;
|
||||
#endif
|
||||
solve(const MatrixBase<Rhs>& b) const
|
||||
{
|
||||
eigen_assert(m_isInitialized && "LDLT is not initialized.");
|
||||
eigen_assert(m_matrix.rows()==b.rows()
|
||||
&& "LDLT::solve(): invalid number of rows of the right hand side matrix b");
|
||||
return Solve<LDLT, Rhs>(*this, b.derived());
|
||||
}
|
||||
|
||||
template<typename Derived>
|
||||
bool solveInPlace(MatrixBase<Derived> &bAndX) const;
|
||||
@@ -251,7 +247,7 @@ template<typename _MatrixType, int _UpLo> class LDLT
|
||||
|
||||
/** \brief Reports whether previous computation was successful.
|
||||
*
|
||||
* \returns \c Success if computation was successful,
|
||||
* \returns \c Success if computation was succesful,
|
||||
* \c NumericalIssue if the factorization failed because of a zero pivot.
|
||||
*/
|
||||
ComputationInfo info() const
|
||||
@@ -262,10 +258,8 @@ template<typename _MatrixType, int _UpLo> class LDLT
|
||||
|
||||
#ifndef EIGEN_PARSED_BY_DOXYGEN
|
||||
template<typename RhsType, typename DstType>
|
||||
EIGEN_DEVICE_FUNC
|
||||
void _solve_impl(const RhsType &rhs, DstType &dst) const;
|
||||
|
||||
template<bool Conjugate, typename RhsType, typename DstType>
|
||||
void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const;
|
||||
#endif
|
||||
|
||||
protected:
|
||||
@@ -566,22 +560,14 @@ template<typename _MatrixType, int _UpLo>
|
||||
template<typename RhsType, typename DstType>
|
||||
void LDLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) const
|
||||
{
|
||||
_solve_impl_transposed<true>(rhs, dst);
|
||||
}
|
||||
|
||||
template<typename _MatrixType,int _UpLo>
|
||||
template<bool Conjugate, typename RhsType, typename DstType>
|
||||
void LDLT<_MatrixType,_UpLo>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const
|
||||
{
|
||||
eigen_assert(rhs.rows() == rows());
|
||||
// dst = P b
|
||||
dst = m_transpositions * rhs;
|
||||
|
||||
// dst = L^-1 (P b)
|
||||
// dst = L^-*T (P b)
|
||||
matrixL().template conjugateIf<!Conjugate>().solveInPlace(dst);
|
||||
matrixL().solveInPlace(dst);
|
||||
|
||||
// dst = D^-* (L^-1 P b)
|
||||
// dst = D^-1 (L^-*T P b)
|
||||
// dst = D^-1 (L^-1 P b)
|
||||
// more precisely, use pseudo-inverse of D (see bug 241)
|
||||
using std::abs;
|
||||
const typename Diagonal<const MatrixType>::RealReturnType vecD(vectorD());
|
||||
@@ -593,6 +579,7 @@ void LDLT<_MatrixType,_UpLo>::_solve_impl_transposed(const RhsType &rhs, DstType
|
||||
// Moreover, Lapack's xSYTRS routines use 0 for the tolerance.
|
||||
// Using numeric_limits::min() gives us more robustness to denormals.
|
||||
RealScalar tolerance = (std::numeric_limits<RealScalar>::min)();
|
||||
|
||||
for (Index i = 0; i < vecD.size(); ++i)
|
||||
{
|
||||
if(abs(vecD(i)) > tolerance)
|
||||
@@ -601,12 +588,10 @@ void LDLT<_MatrixType,_UpLo>::_solve_impl_transposed(const RhsType &rhs, DstType
|
||||
dst.row(i).setZero();
|
||||
}
|
||||
|
||||
// dst = L^-* (D^-* L^-1 P b)
|
||||
// dst = L^-T (D^-1 L^-*T P b)
|
||||
matrixL().transpose().template conjugateIf<Conjugate>().solveInPlace(dst);
|
||||
// dst = L^-T (D^-1 L^-1 P b)
|
||||
matrixU().solveInPlace(dst);
|
||||
|
||||
// dst = P^T (L^-* D^-* L^-1 P b) = A^-1 b
|
||||
// dst = P^-T (L^-T D^-1 L^-*T P b) = A^-1 b
|
||||
// dst = P^-1 (L^-T D^-1 L^-1 P b) = A^-1 b
|
||||
dst = m_transpositions.transpose() * dst;
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -13,16 +13,6 @@
|
||||
namespace Eigen {
|
||||
|
||||
namespace internal{
|
||||
|
||||
template<typename _MatrixType, int _UpLo> struct traits<LLT<_MatrixType, _UpLo> >
|
||||
: traits<_MatrixType>
|
||||
{
|
||||
typedef MatrixXpr XprKind;
|
||||
typedef SolverStorage StorageKind;
|
||||
typedef int StorageIndex;
|
||||
enum { Flags = 0 };
|
||||
};
|
||||
|
||||
template<typename MatrixType, int UpLo> struct LLT_Traits;
|
||||
}
|
||||
|
||||
@@ -64,17 +54,18 @@ template<typename MatrixType, int UpLo> struct LLT_Traits;
|
||||
* \sa MatrixBase::llt(), SelfAdjointView::llt(), class LDLT
|
||||
*/
|
||||
template<typename _MatrixType, int _UpLo> class LLT
|
||||
: public SolverBase<LLT<_MatrixType, _UpLo> >
|
||||
{
|
||||
public:
|
||||
typedef _MatrixType MatrixType;
|
||||
typedef SolverBase<LLT> Base;
|
||||
friend class SolverBase<LLT>;
|
||||
|
||||
EIGEN_GENERIC_PUBLIC_INTERFACE(LLT)
|
||||
enum {
|
||||
RowsAtCompileTime = MatrixType::RowsAtCompileTime,
|
||||
ColsAtCompileTime = MatrixType::ColsAtCompileTime,
|
||||
MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
|
||||
};
|
||||
typedef typename MatrixType::Scalar Scalar;
|
||||
typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
|
||||
typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
|
||||
typedef typename MatrixType::StorageIndex StorageIndex;
|
||||
|
||||
enum {
|
||||
PacketSize = internal::packet_traits<Scalar>::size,
|
||||
@@ -109,7 +100,7 @@ template<typename _MatrixType, int _UpLo> class LLT
|
||||
compute(matrix.derived());
|
||||
}
|
||||
|
||||
/** \brief Constructs a LLT factorization from a given matrix
|
||||
/** \brief Constructs a LDLT factorization from a given matrix
|
||||
*
|
||||
* This overloaded constructor is provided for \link InplaceDecomposition inplace decomposition \endlink when
|
||||
* \c MatrixType is a Eigen::Ref.
|
||||
@@ -138,7 +129,6 @@ template<typename _MatrixType, int _UpLo> class LLT
|
||||
return Traits::getL(m_matrix);
|
||||
}
|
||||
|
||||
#ifdef EIGEN_PARSED_BY_DOXYGEN
|
||||
/** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A.
|
||||
*
|
||||
* Since this LLT class assumes anyway that the matrix A is invertible, the solution
|
||||
@@ -151,8 +141,13 @@ template<typename _MatrixType, int _UpLo> class LLT
|
||||
*/
|
||||
template<typename Rhs>
|
||||
inline const Solve<LLT, Rhs>
|
||||
solve(const MatrixBase<Rhs>& b) const;
|
||||
#endif
|
||||
solve(const MatrixBase<Rhs>& b) const
|
||||
{
|
||||
eigen_assert(m_isInitialized && "LLT is not initialized.");
|
||||
eigen_assert(m_matrix.rows()==b.rows()
|
||||
&& "LLT::solve(): invalid number of rows of the right hand side matrix b");
|
||||
return Solve<LLT, Rhs>(*this, b.derived());
|
||||
}
|
||||
|
||||
template<typename Derived>
|
||||
void solveInPlace(const MatrixBase<Derived> &bAndX) const;
|
||||
@@ -185,7 +180,7 @@ template<typename _MatrixType, int _UpLo> class LLT
|
||||
|
||||
/** \brief Reports whether previous computation was successful.
|
||||
*
|
||||
* \returns \c Success if computation was successful,
|
||||
* \returns \c Success if computation was succesful,
|
||||
* \c NumericalIssue if the matrix.appears not to be positive definite.
|
||||
*/
|
||||
ComputationInfo info() const
|
||||
@@ -205,14 +200,12 @@ template<typename _MatrixType, int _UpLo> class LLT
|
||||
inline Index cols() const { return m_matrix.cols(); }
|
||||
|
||||
template<typename VectorType>
|
||||
LLT & rankUpdate(const VectorType& vec, const RealScalar& sigma = 1);
|
||||
LLT rankUpdate(const VectorType& vec, const RealScalar& sigma = 1);
|
||||
|
||||
#ifndef EIGEN_PARSED_BY_DOXYGEN
|
||||
template<typename RhsType, typename DstType>
|
||||
EIGEN_DEVICE_FUNC
|
||||
void _solve_impl(const RhsType &rhs, DstType &dst) const;
|
||||
|
||||
template<bool Conjugate, typename RhsType, typename DstType>
|
||||
void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const;
|
||||
#endif
|
||||
|
||||
protected:
|
||||
@@ -466,7 +459,7 @@ LLT<MatrixType,_UpLo>& LLT<MatrixType,_UpLo>::compute(const EigenBase<InputType>
|
||||
*/
|
||||
template<typename _MatrixType, int _UpLo>
|
||||
template<typename VectorType>
|
||||
LLT<_MatrixType,_UpLo> & LLT<_MatrixType,_UpLo>::rankUpdate(const VectorType& v, const RealScalar& sigma)
|
||||
LLT<_MatrixType,_UpLo> LLT<_MatrixType,_UpLo>::rankUpdate(const VectorType& v, const RealScalar& sigma)
|
||||
{
|
||||
EIGEN_STATIC_ASSERT_VECTOR_ONLY(VectorType);
|
||||
eigen_assert(v.size()==m_matrix.cols());
|
||||
@@ -484,17 +477,8 @@ template<typename _MatrixType,int _UpLo>
|
||||
template<typename RhsType, typename DstType>
|
||||
void LLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) const
|
||||
{
|
||||
_solve_impl_transposed<true>(rhs, dst);
|
||||
}
|
||||
|
||||
template<typename _MatrixType,int _UpLo>
|
||||
template<bool Conjugate, typename RhsType, typename DstType>
|
||||
void LLT<_MatrixType,_UpLo>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const
|
||||
{
|
||||
dst = rhs;
|
||||
|
||||
matrixL().template conjugateIf<!Conjugate>().solveInPlace(dst);
|
||||
matrixU().template conjugateIf<!Conjugate>().solveInPlace(dst);
|
||||
dst = rhs;
|
||||
solveInPlace(dst);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
@@ -10,7 +10,7 @@
|
||||
#ifndef EIGEN_CHOLMODSUPPORT_H
|
||||
#define EIGEN_CHOLMODSUPPORT_H
|
||||
|
||||
namespace Eigen {
|
||||
namespace Eigen {
|
||||
|
||||
namespace internal {
|
||||
|
||||
@@ -32,7 +32,7 @@ template<> struct cholmod_configure_matrix<std::complex<double> > {
|
||||
}
|
||||
};
|
||||
|
||||
// Other scalar types are not yet supported by Cholmod
|
||||
// Other scalar types are not yet suppotred by Cholmod
|
||||
// template<> struct cholmod_configure_matrix<float> {
|
||||
// template<typename CholmodType>
|
||||
// static void run(CholmodType& mat) {
|
||||
@@ -79,12 +79,12 @@ cholmod_sparse viewAsCholmod(Ref<SparseMatrix<_Scalar,_Options,_StorageIndex> >
|
||||
|
||||
res.dtype = 0;
|
||||
res.stype = -1;
|
||||
|
||||
|
||||
if (internal::is_same<_StorageIndex,int>::value)
|
||||
{
|
||||
res.itype = CHOLMOD_INT;
|
||||
}
|
||||
else if (internal::is_same<_StorageIndex,SuiteSparse_long>::value)
|
||||
else if (internal::is_same<_StorageIndex,long>::value)
|
||||
{
|
||||
res.itype = CHOLMOD_LONG;
|
||||
}
|
||||
@@ -95,9 +95,9 @@ cholmod_sparse viewAsCholmod(Ref<SparseMatrix<_Scalar,_Options,_StorageIndex> >
|
||||
|
||||
// setup res.xtype
|
||||
internal::cholmod_configure_matrix<_Scalar>::run(res);
|
||||
|
||||
|
||||
res.stype = 0;
|
||||
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
@@ -121,12 +121,9 @@ template<typename _Scalar, int _Options, typename _Index, unsigned int UpLo>
|
||||
cholmod_sparse viewAsCholmod(const SparseSelfAdjointView<const SparseMatrix<_Scalar,_Options,_Index>, UpLo>& mat)
|
||||
{
|
||||
cholmod_sparse res = viewAsCholmod(Ref<SparseMatrix<_Scalar,_Options,_Index> >(mat.matrix().const_cast_derived()));
|
||||
|
||||
|
||||
if(UpLo==Upper) res.stype = 1;
|
||||
if(UpLo==Lower) res.stype = -1;
|
||||
// swap stype for rowmajor matrices (only works for real matrices)
|
||||
EIGEN_STATIC_ASSERT((_Options & RowMajorBit) == 0 || NumTraits<_Scalar>::IsComplex == 0, THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES);
|
||||
if(_Options & RowMajorBit) res.stype *=-1;
|
||||
|
||||
return res;
|
||||
}
|
||||
@@ -162,44 +159,6 @@ MappedSparseMatrix<Scalar,Flags,StorageIndex> viewAsEigen(cholmod_sparse& cm)
|
||||
static_cast<StorageIndex*>(cm.p), static_cast<StorageIndex*>(cm.i),static_cast<Scalar*>(cm.x) );
|
||||
}
|
||||
|
||||
namespace internal {
|
||||
|
||||
// template specializations for int and long that call the correct cholmod method
|
||||
|
||||
#define EIGEN_CHOLMOD_SPECIALIZE0(ret, name) \
|
||||
template<typename _StorageIndex> inline ret cm_ ## name (cholmod_common &Common) { return cholmod_ ## name (&Common); } \
|
||||
template<> inline ret cm_ ## name<SuiteSparse_long> (cholmod_common &Common) { return cholmod_l_ ## name (&Common); }
|
||||
|
||||
#define EIGEN_CHOLMOD_SPECIALIZE1(ret, name, t1, a1) \
|
||||
template<typename _StorageIndex> inline ret cm_ ## name (t1& a1, cholmod_common &Common) { return cholmod_ ## name (&a1, &Common); } \
|
||||
template<> inline ret cm_ ## name<SuiteSparse_long> (t1& a1, cholmod_common &Common) { return cholmod_l_ ## name (&a1, &Common); }
|
||||
|
||||
EIGEN_CHOLMOD_SPECIALIZE0(int, start)
|
||||
EIGEN_CHOLMOD_SPECIALIZE0(int, finish)
|
||||
|
||||
EIGEN_CHOLMOD_SPECIALIZE1(int, free_factor, cholmod_factor*, L)
|
||||
EIGEN_CHOLMOD_SPECIALIZE1(int, free_dense, cholmod_dense*, X)
|
||||
EIGEN_CHOLMOD_SPECIALIZE1(int, free_sparse, cholmod_sparse*, A)
|
||||
|
||||
EIGEN_CHOLMOD_SPECIALIZE1(cholmod_factor*, analyze, cholmod_sparse, A)
|
||||
|
||||
template<typename _StorageIndex> inline cholmod_dense* cm_solve (int sys, cholmod_factor& L, cholmod_dense& B, cholmod_common &Common) { return cholmod_solve (sys, &L, &B, &Common); }
|
||||
template<> inline cholmod_dense* cm_solve<SuiteSparse_long> (int sys, cholmod_factor& L, cholmod_dense& B, cholmod_common &Common) { return cholmod_l_solve (sys, &L, &B, &Common); }
|
||||
|
||||
template<typename _StorageIndex> inline cholmod_sparse* cm_spsolve (int sys, cholmod_factor& L, cholmod_sparse& B, cholmod_common &Common) { return cholmod_spsolve (sys, &L, &B, &Common); }
|
||||
template<> inline cholmod_sparse* cm_spsolve<SuiteSparse_long> (int sys, cholmod_factor& L, cholmod_sparse& B, cholmod_common &Common) { return cholmod_l_spsolve (sys, &L, &B, &Common); }
|
||||
|
||||
template<typename _StorageIndex>
|
||||
inline int cm_factorize_p (cholmod_sparse* A, double beta[2], _StorageIndex* fset, std::size_t fsize, cholmod_factor* L, cholmod_common &Common) { return cholmod_factorize_p (A, beta, fset, fsize, L, &Common); }
|
||||
template<>
|
||||
inline int cm_factorize_p<SuiteSparse_long> (cholmod_sparse* A, double beta[2], SuiteSparse_long* fset, std::size_t fsize, cholmod_factor* L, cholmod_common &Common) { return cholmod_l_factorize_p (A, beta, fset, fsize, L, &Common); }
|
||||
|
||||
#undef EIGEN_CHOLMOD_SPECIALIZE0
|
||||
#undef EIGEN_CHOLMOD_SPECIALIZE1
|
||||
|
||||
} // namespace internal
|
||||
|
||||
|
||||
enum CholmodMode {
|
||||
CholmodAuto, CholmodSimplicialLLt, CholmodSupernodalLLt, CholmodLDLt
|
||||
};
|
||||
@@ -236,7 +195,7 @@ class CholmodBase : public SparseSolverBase<Derived>
|
||||
{
|
||||
EIGEN_STATIC_ASSERT((internal::is_same<double,RealScalar>::value), CHOLMOD_SUPPORTS_DOUBLE_PRECISION_ONLY);
|
||||
m_shiftOffset[0] = m_shiftOffset[1] = 0.0;
|
||||
internal::cm_start<StorageIndex>(m_cholmod);
|
||||
cholmod_start(&m_cholmod);
|
||||
}
|
||||
|
||||
explicit CholmodBase(const MatrixType& matrix)
|
||||
@@ -244,23 +203,23 @@ class CholmodBase : public SparseSolverBase<Derived>
|
||||
{
|
||||
EIGEN_STATIC_ASSERT((internal::is_same<double,RealScalar>::value), CHOLMOD_SUPPORTS_DOUBLE_PRECISION_ONLY);
|
||||
m_shiftOffset[0] = m_shiftOffset[1] = 0.0;
|
||||
internal::cm_start<StorageIndex>(m_cholmod);
|
||||
cholmod_start(&m_cholmod);
|
||||
compute(matrix);
|
||||
}
|
||||
|
||||
~CholmodBase()
|
||||
{
|
||||
if(m_cholmodFactor)
|
||||
internal::cm_free_factor<StorageIndex>(m_cholmodFactor, m_cholmod);
|
||||
internal::cm_finish<StorageIndex>(m_cholmod);
|
||||
cholmod_free_factor(&m_cholmodFactor, &m_cholmod);
|
||||
cholmod_finish(&m_cholmod);
|
||||
}
|
||||
|
||||
|
||||
inline StorageIndex cols() const { return internal::convert_index<StorageIndex, Index>(m_cholmodFactor->n); }
|
||||
inline StorageIndex rows() const { return internal::convert_index<StorageIndex, Index>(m_cholmodFactor->n); }
|
||||
|
||||
|
||||
/** \brief Reports whether previous computation was successful.
|
||||
*
|
||||
* \returns \c Success if computation was successful,
|
||||
* \returns \c Success if computation was succesful,
|
||||
* \c NumericalIssue if the matrix.appears to be negative.
|
||||
*/
|
||||
ComputationInfo info() const
|
||||
@@ -276,29 +235,29 @@ class CholmodBase : public SparseSolverBase<Derived>
|
||||
factorize(matrix);
|
||||
return derived();
|
||||
}
|
||||
|
||||
|
||||
/** Performs a symbolic decomposition on the sparsity pattern of \a matrix.
|
||||
*
|
||||
* This function is particularly useful when solving for several problems having the same structure.
|
||||
*
|
||||
*
|
||||
* \sa factorize()
|
||||
*/
|
||||
void analyzePattern(const MatrixType& matrix)
|
||||
{
|
||||
if(m_cholmodFactor)
|
||||
{
|
||||
internal::cm_free_factor<StorageIndex>(m_cholmodFactor, m_cholmod);
|
||||
cholmod_free_factor(&m_cholmodFactor, &m_cholmod);
|
||||
m_cholmodFactor = 0;
|
||||
}
|
||||
cholmod_sparse A = viewAsCholmod(matrix.template selfadjointView<UpLo>());
|
||||
m_cholmodFactor = internal::cm_analyze<StorageIndex>(A, m_cholmod);
|
||||
|
||||
m_cholmodFactor = cholmod_analyze(&A, &m_cholmod);
|
||||
|
||||
this->m_isInitialized = true;
|
||||
this->m_info = Success;
|
||||
m_analysisIsOk = true;
|
||||
m_factorizationIsOk = false;
|
||||
}
|
||||
|
||||
|
||||
/** Performs a numeric decomposition of \a matrix
|
||||
*
|
||||
* The given matrix must have the same sparsity pattern as the matrix on which the symbolic decomposition has been performed.
|
||||
@@ -309,17 +268,17 @@ class CholmodBase : public SparseSolverBase<Derived>
|
||||
{
|
||||
eigen_assert(m_analysisIsOk && "You must first call analyzePattern()");
|
||||
cholmod_sparse A = viewAsCholmod(matrix.template selfadjointView<UpLo>());
|
||||
internal::cm_factorize_p<StorageIndex>(&A, m_shiftOffset, 0, 0, m_cholmodFactor, m_cholmod);
|
||||
cholmod_factorize_p(&A, m_shiftOffset, 0, 0, m_cholmodFactor, &m_cholmod);
|
||||
|
||||
// If the factorization failed, minor is the column at which it did. On success minor == n.
|
||||
this->m_info = (m_cholmodFactor->minor == m_cholmodFactor->n ? Success : NumericalIssue);
|
||||
m_factorizationIsOk = true;
|
||||
}
|
||||
|
||||
|
||||
/** Returns a reference to the Cholmod's configuration structure to get a full control over the performed operations.
|
||||
* See the Cholmod user guide for details. */
|
||||
cholmod_common& cholmod() { return m_cholmod; }
|
||||
|
||||
|
||||
#ifndef EIGEN_PARSED_BY_DOXYGEN
|
||||
/** \internal */
|
||||
template<typename Rhs,typename Dest>
|
||||
@@ -329,23 +288,22 @@ class CholmodBase : public SparseSolverBase<Derived>
|
||||
const Index size = m_cholmodFactor->n;
|
||||
EIGEN_UNUSED_VARIABLE(size);
|
||||
eigen_assert(size==b.rows());
|
||||
|
||||
// Cholmod needs column-major storage without inner-stride, which corresponds to the default behavior of Ref.
|
||||
|
||||
// Cholmod needs column-major stoarge without inner-stride, which corresponds to the default behavior of Ref.
|
||||
Ref<const Matrix<typename Rhs::Scalar,Dynamic,Dynamic,ColMajor> > b_ref(b.derived());
|
||||
|
||||
cholmod_dense b_cd = viewAsCholmod(b_ref);
|
||||
cholmod_dense* x_cd = internal::cm_solve<StorageIndex>(CHOLMOD_A, *m_cholmodFactor, b_cd, m_cholmod);
|
||||
cholmod_dense* x_cd = cholmod_solve(CHOLMOD_A, m_cholmodFactor, &b_cd, &m_cholmod);
|
||||
if(!x_cd)
|
||||
{
|
||||
this->m_info = NumericalIssue;
|
||||
return;
|
||||
}
|
||||
// TODO optimize this copy by swapping when possible (be careful with alignment, etc.)
|
||||
// NOTE Actually, the copy can be avoided by calling cholmod_solve2 instead of cholmod_solve
|
||||
dest = Matrix<Scalar,Dest::RowsAtCompileTime,Dest::ColsAtCompileTime>::Map(reinterpret_cast<Scalar*>(x_cd->x),b.rows(),b.cols());
|
||||
internal::cm_free_dense<StorageIndex>(x_cd, m_cholmod);
|
||||
cholmod_free_dense(&x_cd, &m_cholmod);
|
||||
}
|
||||
|
||||
|
||||
/** \internal */
|
||||
template<typename RhsDerived, typename DestDerived>
|
||||
void _solve_impl(const SparseMatrixBase<RhsDerived> &b, SparseMatrixBase<DestDerived> &dest) const
|
||||
@@ -358,20 +316,19 @@ class CholmodBase : public SparseSolverBase<Derived>
|
||||
// note: cs stands for Cholmod Sparse
|
||||
Ref<SparseMatrix<typename RhsDerived::Scalar,ColMajor,typename RhsDerived::StorageIndex> > b_ref(b.const_cast_derived());
|
||||
cholmod_sparse b_cs = viewAsCholmod(b_ref);
|
||||
cholmod_sparse* x_cs = internal::cm_spsolve<StorageIndex>(CHOLMOD_A, *m_cholmodFactor, b_cs, m_cholmod);
|
||||
cholmod_sparse* x_cs = cholmod_spsolve(CHOLMOD_A, m_cholmodFactor, &b_cs, &m_cholmod);
|
||||
if(!x_cs)
|
||||
{
|
||||
this->m_info = NumericalIssue;
|
||||
return;
|
||||
}
|
||||
// TODO optimize this copy by swapping when possible (be careful with alignment, etc.)
|
||||
// NOTE cholmod_spsolve in fact just calls the dense solver for blocks of 4 columns at a time (similar to Eigen's sparse solver)
|
||||
dest.derived() = viewAsEigen<typename DestDerived::Scalar,ColMajor,typename DestDerived::StorageIndex>(*x_cs);
|
||||
internal::cm_free_sparse<StorageIndex>(x_cs, m_cholmod);
|
||||
cholmod_free_sparse(&x_cs, &m_cholmod);
|
||||
}
|
||||
#endif // EIGEN_PARSED_BY_DOXYGEN
|
||||
|
||||
|
||||
|
||||
|
||||
/** Sets the shift parameter that will be used to adjust the diagonal coefficients during the numerical factorization.
|
||||
*
|
||||
* During the numerical factorization, an offset term is added to the diagonal coefficients:\n
|
||||
@@ -386,7 +343,7 @@ class CholmodBase : public SparseSolverBase<Derived>
|
||||
m_shiftOffset[0] = double(offset);
|
||||
return derived();
|
||||
}
|
||||
|
||||
|
||||
/** \returns the determinant of the underlying matrix from the current factorization */
|
||||
Scalar determinant() const
|
||||
{
|
||||
@@ -441,7 +398,7 @@ class CholmodBase : public SparseSolverBase<Derived>
|
||||
template<typename Stream>
|
||||
void dumpMemory(Stream& /*s*/)
|
||||
{}
|
||||
|
||||
|
||||
protected:
|
||||
mutable cholmod_common m_cholmod;
|
||||
cholmod_factor* m_cholmodFactor;
|
||||
@@ -478,11 +435,11 @@ class CholmodSimplicialLLT : public CholmodBase<_MatrixType, _UpLo, CholmodSimpl
|
||||
{
|
||||
typedef CholmodBase<_MatrixType, _UpLo, CholmodSimplicialLLT> Base;
|
||||
using Base::m_cholmod;
|
||||
|
||||
|
||||
public:
|
||||
|
||||
|
||||
typedef _MatrixType MatrixType;
|
||||
|
||||
|
||||
CholmodSimplicialLLT() : Base() { init(); }
|
||||
|
||||
CholmodSimplicialLLT(const MatrixType& matrix) : Base()
|
||||
@@ -529,11 +486,11 @@ class CholmodSimplicialLDLT : public CholmodBase<_MatrixType, _UpLo, CholmodSimp
|
||||
{
|
||||
typedef CholmodBase<_MatrixType, _UpLo, CholmodSimplicialLDLT> Base;
|
||||
using Base::m_cholmod;
|
||||
|
||||
|
||||
public:
|
||||
|
||||
|
||||
typedef _MatrixType MatrixType;
|
||||
|
||||
|
||||
CholmodSimplicialLDLT() : Base() { init(); }
|
||||
|
||||
CholmodSimplicialLDLT(const MatrixType& matrix) : Base()
|
||||
@@ -578,11 +535,11 @@ class CholmodSupernodalLLT : public CholmodBase<_MatrixType, _UpLo, CholmodSuper
|
||||
{
|
||||
typedef CholmodBase<_MatrixType, _UpLo, CholmodSupernodalLLT> Base;
|
||||
using Base::m_cholmod;
|
||||
|
||||
|
||||
public:
|
||||
|
||||
|
||||
typedef _MatrixType MatrixType;
|
||||
|
||||
|
||||
CholmodSupernodalLLT() : Base() { init(); }
|
||||
|
||||
CholmodSupernodalLLT(const MatrixType& matrix) : Base()
|
||||
@@ -629,11 +586,11 @@ class CholmodDecomposition : public CholmodBase<_MatrixType, _UpLo, CholmodDecom
|
||||
{
|
||||
typedef CholmodBase<_MatrixType, _UpLo, CholmodDecomposition> Base;
|
||||
using Base::m_cholmod;
|
||||
|
||||
|
||||
public:
|
||||
|
||||
|
||||
typedef _MatrixType MatrixType;
|
||||
|
||||
|
||||
CholmodDecomposition() : Base() { init(); }
|
||||
|
||||
CholmodDecomposition(const MatrixType& matrix) : Base()
|
||||
@@ -643,7 +600,7 @@ class CholmodDecomposition : public CholmodBase<_MatrixType, _UpLo, CholmodDecom
|
||||
}
|
||||
|
||||
~CholmodDecomposition() {}
|
||||
|
||||
|
||||
void setMode(CholmodMode mode)
|
||||
{
|
||||
switch(mode)
|
||||
|
||||
@@ -1,413 +0,0 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2017 Gael Guennebaud <gael.guennebaud@inria.fr>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#ifndef EIGEN_ARITHMETIC_SEQUENCE_H
|
||||
#define EIGEN_ARITHMETIC_SEQUENCE_H
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
namespace internal {
|
||||
|
||||
#if (!EIGEN_HAS_CXX11) || !((!EIGEN_COMP_GNUC) || EIGEN_COMP_GNUC>=48)
|
||||
template<typename T> struct aseq_negate {};
|
||||
|
||||
template<> struct aseq_negate<Index> {
|
||||
typedef Index type;
|
||||
};
|
||||
|
||||
template<int N> struct aseq_negate<FixedInt<N> > {
|
||||
typedef FixedInt<-N> type;
|
||||
};
|
||||
|
||||
// Compilation error in the following case:
|
||||
template<> struct aseq_negate<FixedInt<DynamicIndex> > {};
|
||||
|
||||
template<typename FirstType,typename SizeType,typename IncrType,
|
||||
bool FirstIsSymbolic=symbolic::is_symbolic<FirstType>::value,
|
||||
bool SizeIsSymbolic =symbolic::is_symbolic<SizeType>::value>
|
||||
struct aseq_reverse_first_type {
|
||||
typedef Index type;
|
||||
};
|
||||
|
||||
template<typename FirstType,typename SizeType,typename IncrType>
|
||||
struct aseq_reverse_first_type<FirstType,SizeType,IncrType,true,true> {
|
||||
typedef symbolic::AddExpr<FirstType,
|
||||
symbolic::ProductExpr<symbolic::AddExpr<SizeType,symbolic::ValueExpr<FixedInt<-1> > >,
|
||||
symbolic::ValueExpr<IncrType> >
|
||||
> type;
|
||||
};
|
||||
|
||||
template<typename SizeType,typename IncrType,typename EnableIf = void>
|
||||
struct aseq_reverse_first_type_aux {
|
||||
typedef Index type;
|
||||
};
|
||||
|
||||
template<typename SizeType,typename IncrType>
|
||||
struct aseq_reverse_first_type_aux<SizeType,IncrType,typename internal::enable_if<bool((SizeType::value+IncrType::value)|0x1)>::type> {
|
||||
typedef FixedInt<(SizeType::value-1)*IncrType::value> type;
|
||||
};
|
||||
|
||||
template<typename FirstType,typename SizeType,typename IncrType>
|
||||
struct aseq_reverse_first_type<FirstType,SizeType,IncrType,true,false> {
|
||||
typedef typename aseq_reverse_first_type_aux<SizeType,IncrType>::type Aux;
|
||||
typedef symbolic::AddExpr<FirstType,symbolic::ValueExpr<Aux> > type;
|
||||
};
|
||||
|
||||
template<typename FirstType,typename SizeType,typename IncrType>
|
||||
struct aseq_reverse_first_type<FirstType,SizeType,IncrType,false,true> {
|
||||
typedef symbolic::AddExpr<symbolic::ProductExpr<symbolic::AddExpr<SizeType,symbolic::ValueExpr<FixedInt<-1> > >,
|
||||
symbolic::ValueExpr<IncrType> >,
|
||||
symbolic::ValueExpr<> > type;
|
||||
};
|
||||
#endif
|
||||
|
||||
// Helper to cleanup the type of the increment:
|
||||
template<typename T> struct cleanup_seq_incr {
|
||||
typedef typename cleanup_index_type<T,DynamicIndex>::type type;
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
//--------------------------------------------------------------------------------
|
||||
// seq(first,last,incr) and seqN(first,size,incr)
|
||||
//--------------------------------------------------------------------------------
|
||||
|
||||
template<typename FirstType=Index,typename SizeType=Index,typename IncrType=internal::FixedInt<1> >
|
||||
class ArithmeticSequence;
|
||||
|
||||
template<typename FirstType,typename SizeType,typename IncrType>
|
||||
ArithmeticSequence<typename internal::cleanup_index_type<FirstType>::type,
|
||||
typename internal::cleanup_index_type<SizeType>::type,
|
||||
typename internal::cleanup_seq_incr<IncrType>::type >
|
||||
seqN(FirstType first, SizeType size, IncrType incr);
|
||||
|
||||
/** \class ArithmeticSequence
|
||||
* \ingroup Core_Module
|
||||
*
|
||||
* This class represents an arithmetic progression \f$ a_0, a_1, a_2, ..., a_{n-1}\f$ defined by
|
||||
* its \em first value \f$ a_0 \f$, its \em size (aka length) \em n, and the \em increment (aka stride)
|
||||
* that is equal to \f$ a_{i+1}-a_{i}\f$ for any \em i.
|
||||
*
|
||||
* It is internally used as the return type of the Eigen::seq and Eigen::seqN functions, and as the input arguments
|
||||
* of DenseBase::operator()(const RowIndices&, const ColIndices&), and most of the time this is the
|
||||
* only way it is used.
|
||||
*
|
||||
* \tparam FirstType type of the first element, usually an Index,
|
||||
* but internally it can be a symbolic expression
|
||||
* \tparam SizeType type representing the size of the sequence, usually an Index
|
||||
* or a compile time integral constant. Internally, it can also be a symbolic expression
|
||||
* \tparam IncrType type of the increment, can be a runtime Index, or a compile time integral constant (default is compile-time 1)
|
||||
*
|
||||
* \sa Eigen::seq, Eigen::seqN, DenseBase::operator()(const RowIndices&, const ColIndices&), class IndexedView
|
||||
*/
|
||||
template<typename FirstType,typename SizeType,typename IncrType>
|
||||
class ArithmeticSequence
|
||||
{
|
||||
public:
|
||||
ArithmeticSequence(FirstType first, SizeType size) : m_first(first), m_size(size) {}
|
||||
ArithmeticSequence(FirstType first, SizeType size, IncrType incr) : m_first(first), m_size(size), m_incr(incr) {}
|
||||
|
||||
enum {
|
||||
SizeAtCompileTime = internal::get_fixed_value<SizeType>::value,
|
||||
IncrAtCompileTime = internal::get_fixed_value<IncrType,DynamicIndex>::value
|
||||
};
|
||||
|
||||
/** \returns the size, i.e., number of elements, of the sequence */
|
||||
Index size() const { return m_size; }
|
||||
|
||||
/** \returns the first element \f$ a_0 \f$ in the sequence */
|
||||
Index first() const { return m_first; }
|
||||
|
||||
/** \returns the value \f$ a_i \f$ at index \a i in the sequence. */
|
||||
Index operator[](Index i) const { return m_first + i * m_incr; }
|
||||
|
||||
const FirstType& firstObject() const { return m_first; }
|
||||
const SizeType& sizeObject() const { return m_size; }
|
||||
const IncrType& incrObject() const { return m_incr; }
|
||||
|
||||
protected:
|
||||
FirstType m_first;
|
||||
SizeType m_size;
|
||||
IncrType m_incr;
|
||||
|
||||
public:
|
||||
|
||||
#if EIGEN_HAS_CXX11 && ((!EIGEN_COMP_GNUC) || EIGEN_COMP_GNUC>=48)
|
||||
auto reverse() const -> decltype(Eigen::seqN(m_first+(m_size+fix<-1>())*m_incr,m_size,-m_incr)) {
|
||||
return seqN(m_first+(m_size+fix<-1>())*m_incr,m_size,-m_incr);
|
||||
}
|
||||
#else
|
||||
protected:
|
||||
typedef typename internal::aseq_negate<IncrType>::type ReverseIncrType;
|
||||
typedef typename internal::aseq_reverse_first_type<FirstType,SizeType,IncrType>::type ReverseFirstType;
|
||||
public:
|
||||
ArithmeticSequence<ReverseFirstType,SizeType,ReverseIncrType>
|
||||
reverse() const {
|
||||
return seqN(m_first+(m_size+fix<-1>())*m_incr,m_size,-m_incr);
|
||||
}
|
||||
#endif
|
||||
};
|
||||
|
||||
/** \returns an ArithmeticSequence starting at \a first, of length \a size, and increment \a incr
|
||||
*
|
||||
* \sa seqN(FirstType,SizeType), seq(FirstType,LastType,IncrType) */
|
||||
template<typename FirstType,typename SizeType,typename IncrType>
|
||||
ArithmeticSequence<typename internal::cleanup_index_type<FirstType>::type,typename internal::cleanup_index_type<SizeType>::type,typename internal::cleanup_seq_incr<IncrType>::type >
|
||||
seqN(FirstType first, SizeType size, IncrType incr) {
|
||||
return ArithmeticSequence<typename internal::cleanup_index_type<FirstType>::type,typename internal::cleanup_index_type<SizeType>::type,typename internal::cleanup_seq_incr<IncrType>::type>(first,size,incr);
|
||||
}
|
||||
|
||||
/** \returns an ArithmeticSequence starting at \a first, of length \a size, and unit increment
|
||||
*
|
||||
* \sa seqN(FirstType,SizeType,IncrType), seq(FirstType,LastType) */
|
||||
template<typename FirstType,typename SizeType>
|
||||
ArithmeticSequence<typename internal::cleanup_index_type<FirstType>::type,typename internal::cleanup_index_type<SizeType>::type >
|
||||
seqN(FirstType first, SizeType size) {
|
||||
return ArithmeticSequence<typename internal::cleanup_index_type<FirstType>::type,typename internal::cleanup_index_type<SizeType>::type>(first,size);
|
||||
}
|
||||
|
||||
#ifdef EIGEN_PARSED_BY_DOXYGEN
|
||||
|
||||
/** \returns an ArithmeticSequence starting at \a f, up (or down) to \a l, and with positive (or negative) increment \a incr
|
||||
*
|
||||
* It is essentially an alias to:
|
||||
* \code
|
||||
* seqN(f, (l-f+incr)/incr, incr);
|
||||
* \endcode
|
||||
*
|
||||
* \sa seqN(FirstType,SizeType,IncrType), seq(FirstType,LastType)
|
||||
*/
|
||||
template<typename FirstType,typename LastType, typename IncrType>
|
||||
auto seq(FirstType f, LastType l, IncrType incr);
|
||||
|
||||
/** \returns an ArithmeticSequence starting at \a f, up (or down) to \a l, and unit increment
|
||||
*
|
||||
* It is essentially an alias to:
|
||||
* \code
|
||||
* seqN(f,l-f+1);
|
||||
* \endcode
|
||||
*
|
||||
* \sa seqN(FirstType,SizeType), seq(FirstType,LastType,IncrType)
|
||||
*/
|
||||
template<typename FirstType,typename LastType>
|
||||
auto seq(FirstType f, LastType l);
|
||||
|
||||
#else // EIGEN_PARSED_BY_DOXYGEN
|
||||
|
||||
#if EIGEN_HAS_CXX11
|
||||
template<typename FirstType,typename LastType>
|
||||
auto seq(FirstType f, LastType l) -> decltype(seqN(typename internal::cleanup_index_type<FirstType>::type(f),
|
||||
( typename internal::cleanup_index_type<LastType>::type(l)
|
||||
- typename internal::cleanup_index_type<FirstType>::type(f)+fix<1>())))
|
||||
{
|
||||
return seqN(typename internal::cleanup_index_type<FirstType>::type(f),
|
||||
(typename internal::cleanup_index_type<LastType>::type(l)
|
||||
-typename internal::cleanup_index_type<FirstType>::type(f)+fix<1>()));
|
||||
}
|
||||
|
||||
template<typename FirstType,typename LastType, typename IncrType>
|
||||
auto seq(FirstType f, LastType l, IncrType incr)
|
||||
-> decltype(seqN(typename internal::cleanup_index_type<FirstType>::type(f),
|
||||
( typename internal::cleanup_index_type<LastType>::type(l)
|
||||
- typename internal::cleanup_index_type<FirstType>::type(f)+typename internal::cleanup_seq_incr<IncrType>::type(incr)
|
||||
) / typename internal::cleanup_seq_incr<IncrType>::type(incr),
|
||||
typename internal::cleanup_seq_incr<IncrType>::type(incr)))
|
||||
{
|
||||
typedef typename internal::cleanup_seq_incr<IncrType>::type CleanedIncrType;
|
||||
return seqN(typename internal::cleanup_index_type<FirstType>::type(f),
|
||||
( typename internal::cleanup_index_type<LastType>::type(l)
|
||||
-typename internal::cleanup_index_type<FirstType>::type(f)+CleanedIncrType(incr)) / CleanedIncrType(incr),
|
||||
CleanedIncrType(incr));
|
||||
}
|
||||
|
||||
#else // EIGEN_HAS_CXX11
|
||||
|
||||
template<typename FirstType,typename LastType>
|
||||
typename internal::enable_if<!(symbolic::is_symbolic<FirstType>::value || symbolic::is_symbolic<LastType>::value),
|
||||
ArithmeticSequence<typename internal::cleanup_index_type<FirstType>::type,Index> >::type
|
||||
seq(FirstType f, LastType l)
|
||||
{
|
||||
return seqN(typename internal::cleanup_index_type<FirstType>::type(f),
|
||||
Index((typename internal::cleanup_index_type<LastType>::type(l)-typename internal::cleanup_index_type<FirstType>::type(f)+fix<1>())));
|
||||
}
|
||||
|
||||
template<typename FirstTypeDerived,typename LastType>
|
||||
typename internal::enable_if<!symbolic::is_symbolic<LastType>::value,
|
||||
ArithmeticSequence<FirstTypeDerived, symbolic::AddExpr<symbolic::AddExpr<symbolic::NegateExpr<FirstTypeDerived>,symbolic::ValueExpr<> >,
|
||||
symbolic::ValueExpr<internal::FixedInt<1> > > > >::type
|
||||
seq(const symbolic::BaseExpr<FirstTypeDerived> &f, LastType l)
|
||||
{
|
||||
return seqN(f.derived(),(typename internal::cleanup_index_type<LastType>::type(l)-f.derived()+fix<1>()));
|
||||
}
|
||||
|
||||
template<typename FirstType,typename LastTypeDerived>
|
||||
typename internal::enable_if<!symbolic::is_symbolic<FirstType>::value,
|
||||
ArithmeticSequence<typename internal::cleanup_index_type<FirstType>::type,
|
||||
symbolic::AddExpr<symbolic::AddExpr<LastTypeDerived,symbolic::ValueExpr<> >,
|
||||
symbolic::ValueExpr<internal::FixedInt<1> > > > >::type
|
||||
seq(FirstType f, const symbolic::BaseExpr<LastTypeDerived> &l)
|
||||
{
|
||||
return seqN(typename internal::cleanup_index_type<FirstType>::type(f),(l.derived()-typename internal::cleanup_index_type<FirstType>::type(f)+fix<1>()));
|
||||
}
|
||||
|
||||
template<typename FirstTypeDerived,typename LastTypeDerived>
|
||||
ArithmeticSequence<FirstTypeDerived,
|
||||
symbolic::AddExpr<symbolic::AddExpr<LastTypeDerived,symbolic::NegateExpr<FirstTypeDerived> >,symbolic::ValueExpr<internal::FixedInt<1> > > >
|
||||
seq(const symbolic::BaseExpr<FirstTypeDerived> &f, const symbolic::BaseExpr<LastTypeDerived> &l)
|
||||
{
|
||||
return seqN(f.derived(),(l.derived()-f.derived()+fix<1>()));
|
||||
}
|
||||
|
||||
|
||||
template<typename FirstType,typename LastType, typename IncrType>
|
||||
typename internal::enable_if<!(symbolic::is_symbolic<FirstType>::value || symbolic::is_symbolic<LastType>::value),
|
||||
ArithmeticSequence<typename internal::cleanup_index_type<FirstType>::type,Index,typename internal::cleanup_seq_incr<IncrType>::type> >::type
|
||||
seq(FirstType f, LastType l, IncrType incr)
|
||||
{
|
||||
typedef typename internal::cleanup_seq_incr<IncrType>::type CleanedIncrType;
|
||||
return seqN(typename internal::cleanup_index_type<FirstType>::type(f),
|
||||
Index((typename internal::cleanup_index_type<LastType>::type(l)-typename internal::cleanup_index_type<FirstType>::type(f)+CleanedIncrType(incr))/CleanedIncrType(incr)), incr);
|
||||
}
|
||||
|
||||
template<typename FirstTypeDerived,typename LastType, typename IncrType>
|
||||
typename internal::enable_if<!symbolic::is_symbolic<LastType>::value,
|
||||
ArithmeticSequence<FirstTypeDerived,
|
||||
symbolic::QuotientExpr<symbolic::AddExpr<symbolic::AddExpr<symbolic::NegateExpr<FirstTypeDerived>,
|
||||
symbolic::ValueExpr<> >,
|
||||
symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >,
|
||||
symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >,
|
||||
typename internal::cleanup_seq_incr<IncrType>::type> >::type
|
||||
seq(const symbolic::BaseExpr<FirstTypeDerived> &f, LastType l, IncrType incr)
|
||||
{
|
||||
typedef typename internal::cleanup_seq_incr<IncrType>::type CleanedIncrType;
|
||||
return seqN(f.derived(),(typename internal::cleanup_index_type<LastType>::type(l)-f.derived()+CleanedIncrType(incr))/CleanedIncrType(incr), incr);
|
||||
}
|
||||
|
||||
template<typename FirstType,typename LastTypeDerived, typename IncrType>
|
||||
typename internal::enable_if<!symbolic::is_symbolic<FirstType>::value,
|
||||
ArithmeticSequence<typename internal::cleanup_index_type<FirstType>::type,
|
||||
symbolic::QuotientExpr<symbolic::AddExpr<symbolic::AddExpr<LastTypeDerived,symbolic::ValueExpr<> >,
|
||||
symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >,
|
||||
symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >,
|
||||
typename internal::cleanup_seq_incr<IncrType>::type> >::type
|
||||
seq(FirstType f, const symbolic::BaseExpr<LastTypeDerived> &l, IncrType incr)
|
||||
{
|
||||
typedef typename internal::cleanup_seq_incr<IncrType>::type CleanedIncrType;
|
||||
return seqN(typename internal::cleanup_index_type<FirstType>::type(f),
|
||||
(l.derived()-typename internal::cleanup_index_type<FirstType>::type(f)+CleanedIncrType(incr))/CleanedIncrType(incr), incr);
|
||||
}
|
||||
|
||||
template<typename FirstTypeDerived,typename LastTypeDerived, typename IncrType>
|
||||
ArithmeticSequence<FirstTypeDerived,
|
||||
symbolic::QuotientExpr<symbolic::AddExpr<symbolic::AddExpr<LastTypeDerived,
|
||||
symbolic::NegateExpr<FirstTypeDerived> >,
|
||||
symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >,
|
||||
symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >,
|
||||
typename internal::cleanup_seq_incr<IncrType>::type>
|
||||
seq(const symbolic::BaseExpr<FirstTypeDerived> &f, const symbolic::BaseExpr<LastTypeDerived> &l, IncrType incr)
|
||||
{
|
||||
typedef typename internal::cleanup_seq_incr<IncrType>::type CleanedIncrType;
|
||||
return seqN(f.derived(),(l.derived()-f.derived()+CleanedIncrType(incr))/CleanedIncrType(incr), incr);
|
||||
}
|
||||
#endif // EIGEN_HAS_CXX11
|
||||
|
||||
#endif // EIGEN_PARSED_BY_DOXYGEN
|
||||
|
||||
|
||||
#if EIGEN_HAS_CXX11 || defined(EIGEN_PARSED_BY_DOXYGEN)
|
||||
/** \cpp11
|
||||
* \returns a symbolic ArithmeticSequence representing the last \a size elements with increment \a incr.
|
||||
*
|
||||
* It is a shortcut for: \code seqN(last-(size-fix<1>)*incr, size, incr) \endcode
|
||||
*
|
||||
* \sa lastN(SizeType), seqN(FirstType,SizeType), seq(FirstType,LastType,IncrType) */
|
||||
template<typename SizeType,typename IncrType>
|
||||
auto lastN(SizeType size, IncrType incr)
|
||||
-> decltype(seqN(Eigen::last-(size-fix<1>())*incr, size, incr))
|
||||
{
|
||||
return seqN(Eigen::last-(size-fix<1>())*incr, size, incr);
|
||||
}
|
||||
|
||||
/** \cpp11
|
||||
* \returns a symbolic ArithmeticSequence representing the last \a size elements with a unit increment.
|
||||
*
|
||||
* It is a shortcut for: \code seq(last+fix<1>-size, last) \endcode
|
||||
*
|
||||
* \sa lastN(SizeType,IncrType, seqN(FirstType,SizeType), seq(FirstType,LastType) */
|
||||
template<typename SizeType>
|
||||
auto lastN(SizeType size)
|
||||
-> decltype(seqN(Eigen::last+fix<1>()-size, size))
|
||||
{
|
||||
return seqN(Eigen::last+fix<1>()-size, size);
|
||||
}
|
||||
#endif
|
||||
|
||||
namespace internal {
|
||||
|
||||
// Convert a symbolic span into a usable one (i.e., remove last/end "keywords")
|
||||
template<typename T>
|
||||
struct make_size_type {
|
||||
typedef typename internal::conditional<symbolic::is_symbolic<T>::value, Index, T>::type type;
|
||||
};
|
||||
|
||||
template<typename FirstType,typename SizeType,typename IncrType,int XprSize>
|
||||
struct IndexedViewCompatibleType<ArithmeticSequence<FirstType,SizeType,IncrType>, XprSize> {
|
||||
typedef ArithmeticSequence<Index,typename make_size_type<SizeType>::type,IncrType> type;
|
||||
};
|
||||
|
||||
template<typename FirstType,typename SizeType,typename IncrType>
|
||||
ArithmeticSequence<Index,typename make_size_type<SizeType>::type,IncrType>
|
||||
makeIndexedViewCompatible(const ArithmeticSequence<FirstType,SizeType,IncrType>& ids, Index size,SpecializedType) {
|
||||
return ArithmeticSequence<Index,typename make_size_type<SizeType>::type,IncrType>(
|
||||
eval_expr_given_size(ids.firstObject(),size),eval_expr_given_size(ids.sizeObject(),size),ids.incrObject());
|
||||
}
|
||||
|
||||
template<typename FirstType,typename SizeType,typename IncrType>
|
||||
struct get_compile_time_incr<ArithmeticSequence<FirstType,SizeType,IncrType> > {
|
||||
enum { value = get_fixed_value<IncrType,DynamicIndex>::value };
|
||||
};
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
/** \namespace Eigen::indexing
|
||||
* \ingroup Core_Module
|
||||
*
|
||||
* The sole purpose of this namespace is to be able to import all functions
|
||||
* and symbols that are expected to be used within operator() for indexing
|
||||
* and slicing. If you already imported the whole Eigen namespace:
|
||||
* \code using namespace Eigen; \endcode
|
||||
* then you are already all set. Otherwise, if you don't want/cannot import
|
||||
* the whole Eigen namespace, the following line:
|
||||
* \code using namespace Eigen::indexing; \endcode
|
||||
* is equivalent to:
|
||||
* \code
|
||||
using Eigen::all;
|
||||
using Eigen::seq;
|
||||
using Eigen::seqN;
|
||||
using Eigen::lastN; // c++11 only
|
||||
using Eigen::last;
|
||||
using Eigen::lastp1;
|
||||
using Eigen::fix;
|
||||
\endcode
|
||||
*/
|
||||
namespace indexing {
|
||||
using Eigen::all;
|
||||
using Eigen::seq;
|
||||
using Eigen::seqN;
|
||||
#if EIGEN_HAS_CXX11
|
||||
using Eigen::lastN;
|
||||
#endif
|
||||
using Eigen::last;
|
||||
using Eigen::lastp1;
|
||||
using Eigen::fix;
|
||||
}
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_ARITHMETIC_SEQUENCE_H
|
||||
@@ -162,45 +162,6 @@ class Array
|
||||
}
|
||||
#endif
|
||||
|
||||
#if EIGEN_HAS_CXX11
|
||||
/** \copydoc PlainObjectBase(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args)
|
||||
*
|
||||
* Example: \include Array_variadic_ctor_cxx11.cpp
|
||||
* Output: \verbinclude Array_variadic_ctor_cxx11.out
|
||||
*
|
||||
* \sa Array(const std::initializer_list<std::initializer_list<Scalar>>&)
|
||||
* \sa Array(const Scalar&), Array(const Scalar&,const Scalar&)
|
||||
*/
|
||||
template <typename... ArgTypes>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args)
|
||||
: Base(a0, a1, a2, a3, args...) {}
|
||||
|
||||
/** \brief Constructs an array and initializes it from the coefficients given as initializer-lists grouped by row. \cpp11
|
||||
*
|
||||
* In the general case, the constructor takes a list of rows, each row being represented as a list of coefficients:
|
||||
*
|
||||
* Example: \include Array_initializer_list_23_cxx11.cpp
|
||||
* Output: \verbinclude Array_initializer_list_23_cxx11.out
|
||||
*
|
||||
* Each of the inner initializer lists must contain the exact same number of elements, otherwise an assertion is triggered.
|
||||
*
|
||||
* In the case of a compile-time column 1D array, implicit transposition from a single row is allowed.
|
||||
* Therefore <code> Array<int,Dynamic,1>{{1,2,3,4,5}}</code> is legal and the more verbose syntax
|
||||
* <code>Array<int,Dynamic,1>{{1},{2},{3},{4},{5}}</code> can be avoided:
|
||||
*
|
||||
* Example: \include Array_initializer_list_vector_cxx11.cpp
|
||||
* Output: \verbinclude Array_initializer_list_vector_cxx11.out
|
||||
*
|
||||
* In the case of fixed-sized arrays, the initializer list sizes must exactly match the array sizes,
|
||||
* and implicit transposition is allowed for compile-time 1D arrays only.
|
||||
*
|
||||
* \sa Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args)
|
||||
*/
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE Array(const std::initializer_list<std::initializer_list<Scalar>>& list) : Base(list) {}
|
||||
#endif // end EIGEN_HAS_CXX11
|
||||
|
||||
#ifndef EIGEN_PARSED_BY_DOXYGEN
|
||||
template<typename T>
|
||||
EIGEN_DEVICE_FUNC
|
||||
@@ -217,7 +178,6 @@ class Array
|
||||
Base::_check_template_params();
|
||||
this->template _init2<T0,T1>(val0, val1);
|
||||
}
|
||||
|
||||
#else
|
||||
/** \brief Constructs a fixed-sized array initialized with coefficients starting at \a data */
|
||||
EIGEN_DEVICE_FUNC explicit Array(const Scalar *data);
|
||||
@@ -229,8 +189,7 @@ class Array
|
||||
*/
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE explicit Array(Index dim);
|
||||
/** constructs an initialized 1x1 Array with the given coefficient
|
||||
* \sa const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args */
|
||||
/** constructs an initialized 1x1 Array with the given coefficient */
|
||||
Array(const Scalar& value);
|
||||
/** constructs an uninitialized array with \a rows rows and \a cols columns.
|
||||
*
|
||||
@@ -238,14 +197,11 @@ class Array
|
||||
* it is redundant to pass these parameters, so one should use the default constructor
|
||||
* Array() instead. */
|
||||
Array(Index rows, Index cols);
|
||||
/** constructs an initialized 2D vector with given coefficients
|
||||
* \sa Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args) */
|
||||
/** constructs an initialized 2D vector with given coefficients */
|
||||
Array(const Scalar& val0, const Scalar& val1);
|
||||
#endif // end EIGEN_PARSED_BY_DOXYGEN
|
||||
#endif
|
||||
|
||||
/** constructs an initialized 3D vector with given coefficients
|
||||
* \sa Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args)
|
||||
*/
|
||||
/** constructs an initialized 3D vector with given coefficients */
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE Array(const Scalar& val0, const Scalar& val1, const Scalar& val2)
|
||||
{
|
||||
@@ -255,9 +211,7 @@ class Array
|
||||
m_storage.data()[1] = val1;
|
||||
m_storage.data()[2] = val2;
|
||||
}
|
||||
/** constructs an initialized 4D vector with given coefficients
|
||||
* \sa Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args)
|
||||
*/
|
||||
/** constructs an initialized 4D vector with given coefficients */
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE Array(const Scalar& val0, const Scalar& val1, const Scalar& val2, const Scalar& val3)
|
||||
{
|
||||
@@ -304,7 +258,7 @@ class Array
|
||||
/** \defgroup arraytypedefs Global array typedefs
|
||||
* \ingroup Core_Module
|
||||
*
|
||||
* %Eigen defines several typedef shortcuts for most common 1D and 2D array types.
|
||||
* Eigen defines several typedef shortcuts for most common 1D and 2D array types.
|
||||
*
|
||||
* The general patterns are the following:
|
||||
*
|
||||
@@ -317,12 +271,6 @@ class Array
|
||||
* There are also \c ArraySizeType which are self-explanatory. For example, \c Array4cf is
|
||||
* a fixed-size 1D array of 4 complex floats.
|
||||
*
|
||||
* With \cpp11, template alias are also defined for common sizes.
|
||||
* They follow the same pattern as above except that the scalar type suffix is replaced by a
|
||||
* template parameter, i.e.:
|
||||
* - `ArrayRowsCols<Type>` where `Rows` and `Cols` can be \c 2,\c 3,\c 4, or \c X for fixed or dynamic size.
|
||||
* - `ArraySize<Type>` where `Size` can be \c 2,\c 3,\c 4 or \c X for fixed or dynamic size 1D arrays.
|
||||
*
|
||||
* \sa class Array
|
||||
*/
|
||||
|
||||
@@ -355,43 +303,9 @@ EIGEN_MAKE_ARRAY_TYPEDEFS_ALL_SIZES(std::complex<double>, cd)
|
||||
|
||||
#undef EIGEN_MAKE_ARRAY_TYPEDEFS_ALL_SIZES
|
||||
#undef EIGEN_MAKE_ARRAY_TYPEDEFS
|
||||
#undef EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS
|
||||
|
||||
#if EIGEN_HAS_CXX11
|
||||
#undef EIGEN_MAKE_ARRAY_TYPEDEFS_LARGE
|
||||
|
||||
#define EIGEN_MAKE_ARRAY_TYPEDEFS(Size, SizeSuffix) \
|
||||
/** \ingroup arraytypedefs */ \
|
||||
/** \brief \cpp11 */ \
|
||||
template <typename Type> \
|
||||
using Array##SizeSuffix##SizeSuffix = Array<Type, Size, Size>; \
|
||||
/** \ingroup arraytypedefs */ \
|
||||
/** \brief \cpp11 */ \
|
||||
template <typename Type> \
|
||||
using Array##SizeSuffix = Array<Type, Size, 1>;
|
||||
|
||||
#define EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS(Size) \
|
||||
/** \ingroup arraytypedefs */ \
|
||||
/** \brief \cpp11 */ \
|
||||
template <typename Type> \
|
||||
using Array##Size##X = Array<Type, Size, Dynamic>; \
|
||||
/** \ingroup arraytypedefs */ \
|
||||
/** \brief \cpp11 */ \
|
||||
template <typename Type> \
|
||||
using Array##X##Size = Array<Type, Dynamic, Size>;
|
||||
|
||||
EIGEN_MAKE_ARRAY_TYPEDEFS(2, 2)
|
||||
EIGEN_MAKE_ARRAY_TYPEDEFS(3, 3)
|
||||
EIGEN_MAKE_ARRAY_TYPEDEFS(4, 4)
|
||||
EIGEN_MAKE_ARRAY_TYPEDEFS(Dynamic, X)
|
||||
EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS(2)
|
||||
EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS(3)
|
||||
EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS(4)
|
||||
|
||||
#undef EIGEN_MAKE_ARRAY_TYPEDEFS
|
||||
#undef EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS
|
||||
|
||||
#endif // EIGEN_HAS_CXX11
|
||||
|
||||
#define EIGEN_USING_ARRAY_TYPEDEFS_FOR_TYPE_AND_SIZE(TypeSuffix, SizeSuffix) \
|
||||
using Eigen::Matrix##SizeSuffix##TypeSuffix; \
|
||||
using Eigen::Vector##SizeSuffix##TypeSuffix; \
|
||||
|
||||
@@ -69,7 +69,6 @@ template<typename Derived> class ArrayBase
|
||||
using Base::coeff;
|
||||
using Base::coeffRef;
|
||||
using Base::lazyAssign;
|
||||
using Base::operator-;
|
||||
using Base::operator=;
|
||||
using Base::operator+=;
|
||||
using Base::operator-=;
|
||||
@@ -89,6 +88,7 @@ template<typename Derived> class ArrayBase
|
||||
|
||||
#define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::ArrayBase
|
||||
#define EIGEN_DOC_UNARY_ADDONS(X,Y)
|
||||
# include "../plugins/CommonCwiseUnaryOps.h"
|
||||
# include "../plugins/MatrixCwiseUnaryOps.h"
|
||||
# include "../plugins/ArrayCwiseUnaryOps.h"
|
||||
# include "../plugins/CommonCwiseBinaryOps.h"
|
||||
|
||||
@@ -90,8 +90,8 @@ class ArrayWrapper : public ArrayBase<ArrayWrapper<ExpressionType> >
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline void evalTo(Dest& dst) const { dst = m_expression; }
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
const typename internal::remove_all<NestedExpressionType>::type&
|
||||
EIGEN_DEVICE_FUNC
|
||||
nestedExpression() const
|
||||
{
|
||||
return m_expression;
|
||||
|
||||
@@ -16,7 +16,7 @@ namespace Eigen {
|
||||
|
||||
template<typename Derived>
|
||||
template<typename OtherDerived>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>
|
||||
EIGEN_STRONG_INLINE Derived& DenseBase<Derived>
|
||||
::lazyAssign(const DenseBase<OtherDerived>& other)
|
||||
{
|
||||
enum{
|
||||
|
||||
@@ -24,7 +24,7 @@ namespace internal {
|
||||
|
||||
// copy_using_evaluator_traits is based on assign_traits
|
||||
|
||||
template <typename DstEvaluator, typename SrcEvaluator, typename AssignFunc, int MaxPacketSize = -1>
|
||||
template <typename DstEvaluator, typename SrcEvaluator, typename AssignFunc>
|
||||
struct copy_using_evaluator_traits
|
||||
{
|
||||
typedef typename DstEvaluator::XprType Dst;
|
||||
@@ -51,15 +51,13 @@ private:
|
||||
InnerMaxSize = int(Dst::IsVectorAtCompileTime) ? int(Dst::MaxSizeAtCompileTime)
|
||||
: int(DstFlags)&RowMajorBit ? int(Dst::MaxColsAtCompileTime)
|
||||
: int(Dst::MaxRowsAtCompileTime),
|
||||
RestrictedInnerSize = EIGEN_SIZE_MIN_PREFER_FIXED(InnerSize,MaxPacketSize),
|
||||
RestrictedLinearSize = EIGEN_SIZE_MIN_PREFER_FIXED(Dst::SizeAtCompileTime,MaxPacketSize),
|
||||
OuterStride = int(outer_stride_at_compile_time<Dst>::ret),
|
||||
MaxSizeAtCompileTime = Dst::SizeAtCompileTime
|
||||
};
|
||||
|
||||
// TODO distinguish between linear traversal and inner-traversals
|
||||
typedef typename find_best_packet<DstScalar,RestrictedLinearSize>::type LinearPacketType;
|
||||
typedef typename find_best_packet<DstScalar,RestrictedInnerSize>::type InnerPacketType;
|
||||
typedef typename find_best_packet<DstScalar,Dst::SizeAtCompileTime>::type LinearPacketType;
|
||||
typedef typename find_best_packet<DstScalar,InnerSize>::type InnerPacketType;
|
||||
|
||||
enum {
|
||||
LinearPacketSize = unpacket_traits<LinearPacketType>::size,
|
||||
@@ -99,7 +97,7 @@ private:
|
||||
|
||||
public:
|
||||
enum {
|
||||
Traversal = (int(MayLinearVectorize) && (LinearPacketSize>InnerPacketSize)) ? int(LinearVectorizedTraversal)
|
||||
Traversal = int(MayLinearVectorize) && (LinearPacketSize>InnerPacketSize) ? int(LinearVectorizedTraversal)
|
||||
: int(MayInnerVectorize) ? int(InnerVectorizedTraversal)
|
||||
: int(MayLinearVectorize) ? int(LinearVectorizedTraversal)
|
||||
: int(MaySliceVectorize) ? int(SliceVectorizedTraversal)
|
||||
@@ -174,8 +172,6 @@ public:
|
||||
EIGEN_DEBUG_VAR(MaySliceVectorize)
|
||||
std::cerr << "Traversal" << " = " << Traversal << " (" << demangle_traversal(Traversal) << ")" << std::endl;
|
||||
EIGEN_DEBUG_VAR(SrcEvaluator::CoeffReadCost)
|
||||
EIGEN_DEBUG_VAR(DstEvaluator::CoeffReadCost)
|
||||
EIGEN_DEBUG_VAR(Dst::SizeAtCompileTime)
|
||||
EIGEN_DEBUG_VAR(UnrollingLimit)
|
||||
EIGEN_DEBUG_VAR(MayUnrollCompletely)
|
||||
EIGEN_DEBUG_VAR(MayUnrollInner)
|
||||
@@ -534,7 +530,7 @@ struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, NoUnrolling>
|
||||
const Scalar *dst_ptr = kernel.dstDataPtr();
|
||||
if((!bool(dstIsAligned)) && (UIntPtr(dst_ptr) % sizeof(Scalar))>0)
|
||||
{
|
||||
// the pointer is not aligned-on scalar, so alignment is not possible
|
||||
// the pointer is not aligend-on scalar, so alignment is not possible
|
||||
return dense_assignment_loop<Kernel,DefaultTraversal,NoUnrolling>::run(kernel);
|
||||
}
|
||||
const Index packetAlignedMask = packetSize - 1;
|
||||
@@ -611,8 +607,7 @@ public:
|
||||
typedef typename AssignmentTraits::PacketType PacketType;
|
||||
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
generic_dense_assignment_kernel(DstEvaluatorType &dst, const SrcEvaluatorType &src, const Functor &func, DstXprType& dstExpr)
|
||||
EIGEN_DEVICE_FUNC generic_dense_assignment_kernel(DstEvaluatorType &dst, const SrcEvaluatorType &src, const Functor &func, DstXprType& dstExpr)
|
||||
: m_dst(dst), m_src(src), m_functor(func), m_dstExpr(dstExpr)
|
||||
{
|
||||
#ifdef EIGEN_DEBUG_ASSIGN
|
||||
@@ -702,27 +697,6 @@ protected:
|
||||
DstXprType& m_dstExpr;
|
||||
};
|
||||
|
||||
// Special kernel used when computing small products whose operands have dynamic dimensions. It ensures that the
|
||||
// PacketSize used is no larger than 4, thereby increasing the chance that vectorized instructions will be used
|
||||
// when computing the product.
|
||||
|
||||
template<typename DstEvaluatorTypeT, typename SrcEvaluatorTypeT, typename Functor>
|
||||
class restricted_packet_dense_assignment_kernel : public generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT, Functor, BuiltIn>
|
||||
{
|
||||
protected:
|
||||
typedef generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT, Functor, BuiltIn> Base;
|
||||
public:
|
||||
typedef typename Base::Scalar Scalar;
|
||||
typedef typename Base::DstXprType DstXprType;
|
||||
typedef copy_using_evaluator_traits<DstEvaluatorTypeT, SrcEvaluatorTypeT, Functor, 4> AssignmentTraits;
|
||||
typedef typename AssignmentTraits::PacketType PacketType;
|
||||
|
||||
EIGEN_DEVICE_FUNC restricted_packet_dense_assignment_kernel(DstEvaluatorTypeT &dst, const SrcEvaluatorTypeT &src, const Functor &func, DstXprType& dstExpr)
|
||||
: Base(dst, src, func, dstExpr)
|
||||
{
|
||||
}
|
||||
};
|
||||
|
||||
/***************************************************************************
|
||||
* Part 5 : Entry point for dense rectangular assignment
|
||||
***************************************************************************/
|
||||
@@ -782,7 +756,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(DstXprType
|
||||
// AssignmentKind must define a Kind typedef.
|
||||
template<typename DstShape, typename SrcShape> struct AssignmentKind;
|
||||
|
||||
// Assignment kind defined in this file:
|
||||
// Assignement kind defined in this file:
|
||||
struct Dense2Dense {};
|
||||
struct EigenBase2EigenBase {};
|
||||
|
||||
@@ -861,27 +835,6 @@ void call_assignment_no_alias(Dst& dst, const Src& src, const Func& func)
|
||||
|
||||
Assignment<ActualDstTypeCleaned,Src,Func>::run(actualDst, src, func);
|
||||
}
|
||||
|
||||
template<typename Dst, typename Src, typename Func>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
void call_restricted_packet_assignment_no_alias(Dst& dst, const Src& src, const Func& func)
|
||||
{
|
||||
typedef evaluator<Dst> DstEvaluatorType;
|
||||
typedef evaluator<Src> SrcEvaluatorType;
|
||||
typedef restricted_packet_dense_assignment_kernel<DstEvaluatorType,SrcEvaluatorType,Func> Kernel;
|
||||
|
||||
EIGEN_STATIC_ASSERT_LVALUE(Dst)
|
||||
EIGEN_CHECK_BINARY_COMPATIBILIY(Func,typename Dst::Scalar,typename Src::Scalar);
|
||||
|
||||
SrcEvaluatorType srcEvaluator(src);
|
||||
resize_if_allowed(dst, src, func);
|
||||
|
||||
DstEvaluatorType dstEvaluator(dst);
|
||||
Kernel kernel(dstEvaluator, srcEvaluator, func, dst.const_cast_derived());
|
||||
|
||||
dense_assignment_loop<Kernel>::run(kernel);
|
||||
}
|
||||
|
||||
template<typename Dst, typename Src>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
void call_assignment_no_alias(Dst& dst, const Src& src)
|
||||
@@ -946,7 +899,7 @@ struct Assignment<DstXprType, SrcXprType, Functor, EigenBase2EigenBase, Weak>
|
||||
src.evalTo(dst);
|
||||
}
|
||||
|
||||
// NOTE The following two functions are templated to avoid their instantiation if not needed
|
||||
// NOTE The following two functions are templated to avoid their instanciation if not needed
|
||||
// This is needed because some expressions supports evalTo only and/or have 'void' as scalar type.
|
||||
template<typename SrcScalarType>
|
||||
EIGEN_DEVICE_FUNC
|
||||
|
||||
@@ -68,16 +68,16 @@ class vml_assign_traits
|
||||
|
||||
#define EIGEN_PP_EXPAND(ARG) ARG
|
||||
#if !defined (EIGEN_FAST_MATH) || (EIGEN_FAST_MATH != 1)
|
||||
#define EIGEN_VMLMODE_EXPAND_xLA , VML_HA
|
||||
#define EIGEN_VMLMODE_EXPAND_LA , VML_HA
|
||||
#else
|
||||
#define EIGEN_VMLMODE_EXPAND_xLA , VML_LA
|
||||
#define EIGEN_VMLMODE_EXPAND_LA , VML_LA
|
||||
#endif
|
||||
|
||||
#define EIGEN_VMLMODE_EXPAND_x_
|
||||
#define EIGEN_VMLMODE_EXPAND__
|
||||
|
||||
#define EIGEN_VMLMODE_PREFIX_xLA vm
|
||||
#define EIGEN_VMLMODE_PREFIX_x_ v
|
||||
#define EIGEN_VMLMODE_PREFIX(VMLMODE) EIGEN_CAT(EIGEN_VMLMODE_PREFIX_x,VMLMODE)
|
||||
#define EIGEN_VMLMODE_PREFIX_LA vm
|
||||
#define EIGEN_VMLMODE_PREFIX__ v
|
||||
#define EIGEN_VMLMODE_PREFIX(VMLMODE) EIGEN_CAT(EIGEN_VMLMODE_PREFIX_,VMLMODE)
|
||||
|
||||
#define EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, VMLOP, EIGENTYPE, VMLTYPE, VMLMODE) \
|
||||
template< typename DstXprType, typename SrcXprNested> \
|
||||
@@ -89,7 +89,7 @@ class vml_assign_traits
|
||||
eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); \
|
||||
if(vml_assign_traits<DstXprType,SrcXprNested>::Traversal==LinearTraversal) { \
|
||||
VMLOP(dst.size(), (const VMLTYPE*)src.nestedExpression().data(), \
|
||||
(VMLTYPE*)dst.data() EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_x##VMLMODE) ); \
|
||||
(VMLTYPE*)dst.data() EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_##VMLMODE) ); \
|
||||
} else { \
|
||||
const Index outerSize = dst.outerSize(); \
|
||||
for(Index outer = 0; outer < outerSize; ++outer) { \
|
||||
@@ -97,7 +97,7 @@ class vml_assign_traits
|
||||
&(src.nestedExpression().coeffRef(0, outer)); \
|
||||
EIGENTYPE *dst_ptr = dst.IsRowMajor ? &(dst.coeffRef(outer,0)) : &(dst.coeffRef(0, outer)); \
|
||||
VMLOP( dst.innerSize(), (const VMLTYPE*)src_ptr, \
|
||||
(VMLTYPE*)dst_ptr EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_x##VMLMODE)); \
|
||||
(VMLTYPE*)dst_ptr EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_##VMLMODE)); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
@@ -152,7 +152,7 @@ EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(ceil, Ceil, _)
|
||||
if(vml_assign_traits<DstXprType,SrcXprNested>::Traversal==LinearTraversal) \
|
||||
{ \
|
||||
VMLOP( dst.size(), (const VMLTYPE*)src.lhs().data(), exponent, \
|
||||
(VMLTYPE*)dst.data() EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_x##VMLMODE) ); \
|
||||
(VMLTYPE*)dst.data() EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_##VMLMODE) ); \
|
||||
} else { \
|
||||
const Index outerSize = dst.outerSize(); \
|
||||
for(Index outer = 0; outer < outerSize; ++outer) { \
|
||||
@@ -160,7 +160,7 @@ EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(ceil, Ceil, _)
|
||||
&(src.lhs().coeffRef(0, outer)); \
|
||||
EIGENTYPE *dst_ptr = dst.IsRowMajor ? &(dst.coeffRef(outer,0)) : &(dst.coeffRef(0, outer)); \
|
||||
VMLOP( dst.innerSize(), (const VMLTYPE*)src_ptr, exponent, \
|
||||
(VMLTYPE*)dst_ptr EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_x##VMLMODE)); \
|
||||
(VMLTYPE*)dst_ptr EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_##VMLMODE)); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
|
||||
@@ -114,8 +114,8 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel> class
|
||||
|
||||
/** Column or Row constructor
|
||||
*/
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
Block(XprType& xpr, Index i) : Impl(xpr,i)
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline Block(XprType& xpr, Index i) : Impl(xpr,i)
|
||||
{
|
||||
eigen_assert( (i>=0) && (
|
||||
((BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) && i<xpr.rows())
|
||||
@@ -124,8 +124,8 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel> class
|
||||
|
||||
/** Fixed-size constructor
|
||||
*/
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
Block(XprType& xpr, Index startRow, Index startCol)
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline Block(XprType& xpr, Index startRow, Index startCol)
|
||||
: Impl(xpr, startRow, startCol)
|
||||
{
|
||||
EIGEN_STATIC_ASSERT(RowsAtCompileTime!=Dynamic && ColsAtCompileTime!=Dynamic,THIS_METHOD_IS_ONLY_FOR_FIXED_SIZE)
|
||||
@@ -135,8 +135,8 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel> class
|
||||
|
||||
/** Dynamic-size constructor
|
||||
*/
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
Block(XprType& xpr,
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline Block(XprType& xpr,
|
||||
Index startRow, Index startCol,
|
||||
Index blockRows, Index blockCols)
|
||||
: Impl(xpr, startRow, startCol, blockRows, blockCols)
|
||||
@@ -159,10 +159,10 @@ class BlockImpl<XprType, BlockRows, BlockCols, InnerPanel, Dense>
|
||||
public:
|
||||
typedef Impl Base;
|
||||
EIGEN_INHERIT_ASSIGNMENT_OPERATORS(BlockImpl)
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockImpl(XprType& xpr, Index i) : Impl(xpr,i) {}
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockImpl(XprType& xpr, Index startRow, Index startCol) : Impl(xpr, startRow, startCol) {}
|
||||
EIGEN_DEVICE_FUNC inline BlockImpl(XprType& xpr, Index i) : Impl(xpr,i) {}
|
||||
EIGEN_DEVICE_FUNC inline BlockImpl(XprType& xpr, Index startRow, Index startCol) : Impl(xpr, startRow, startCol) {}
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE BlockImpl(XprType& xpr, Index startRow, Index startCol, Index blockRows, Index blockCols)
|
||||
inline BlockImpl(XprType& xpr, Index startRow, Index startCol, Index blockRows, Index blockCols)
|
||||
: Impl(xpr, startRow, startCol, blockRows, blockCols) {}
|
||||
};
|
||||
|
||||
@@ -294,22 +294,22 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, bool H
|
||||
EIGEN_DEVICE_FUNC inline Index outerStride() const;
|
||||
#endif
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
EIGEN_DEVICE_FUNC
|
||||
const typename internal::remove_all<XprTypeNested>::type& nestedExpression() const
|
||||
{
|
||||
return m_xpr;
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
EIGEN_DEVICE_FUNC
|
||||
XprType& nestedExpression() { return m_xpr; }
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
EIGEN_DEVICE_FUNC
|
||||
StorageIndex startRow() const
|
||||
{
|
||||
return m_startRow.value();
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
EIGEN_DEVICE_FUNC
|
||||
StorageIndex startCol() const
|
||||
{
|
||||
return m_startCol.value();
|
||||
@@ -342,8 +342,8 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>
|
||||
|
||||
/** Column or Row constructor
|
||||
*/
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
BlockImpl_dense(XprType& xpr, Index i)
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline BlockImpl_dense(XprType& xpr, Index i)
|
||||
: Base(xpr.data() + i * ( ((BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) && (!XprTypeIsRowMajor))
|
||||
|| ((BlockRows==XprType::RowsAtCompileTime) && (BlockCols==1) && ( XprTypeIsRowMajor)) ? xpr.innerStride() : xpr.outerStride()),
|
||||
BlockRows==1 ? 1 : xpr.rows(),
|
||||
@@ -357,8 +357,8 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>
|
||||
|
||||
/** Fixed-size constructor
|
||||
*/
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
BlockImpl_dense(XprType& xpr, Index startRow, Index startCol)
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline BlockImpl_dense(XprType& xpr, Index startRow, Index startCol)
|
||||
: Base(xpr.data()+xpr.innerStride()*(XprTypeIsRowMajor?startCol:startRow) + xpr.outerStride()*(XprTypeIsRowMajor?startRow:startCol)),
|
||||
m_xpr(xpr), m_startRow(startRow), m_startCol(startCol)
|
||||
{
|
||||
@@ -367,8 +367,8 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>
|
||||
|
||||
/** Dynamic-size constructor
|
||||
*/
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
BlockImpl_dense(XprType& xpr,
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline BlockImpl_dense(XprType& xpr,
|
||||
Index startRow, Index startCol,
|
||||
Index blockRows, Index blockCols)
|
||||
: Base(xpr.data()+xpr.innerStride()*(XprTypeIsRowMajor?startCol:startRow) + xpr.outerStride()*(XprTypeIsRowMajor?startRow:startCol), blockRows, blockCols),
|
||||
@@ -377,18 +377,18 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>
|
||||
init();
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
EIGEN_DEVICE_FUNC
|
||||
const typename internal::remove_all<XprTypeNested>::type& nestedExpression() const
|
||||
{
|
||||
return m_xpr;
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
EIGEN_DEVICE_FUNC
|
||||
XprType& nestedExpression() { return m_xpr; }
|
||||
|
||||
/** \sa MapBase::innerStride() */
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
Index innerStride() const
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline Index innerStride() const
|
||||
{
|
||||
return internal::traits<BlockType>::HasSameStorageOrderAsXprType
|
||||
? m_xpr.innerStride()
|
||||
@@ -396,19 +396,19 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>
|
||||
}
|
||||
|
||||
/** \sa MapBase::outerStride() */
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
Index outerStride() const
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline Index outerStride() const
|
||||
{
|
||||
return m_outerStride;
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
EIGEN_DEVICE_FUNC
|
||||
StorageIndex startRow() const
|
||||
{
|
||||
return m_startRow.value();
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
EIGEN_DEVICE_FUNC
|
||||
StorageIndex startCol() const
|
||||
{
|
||||
return m_startCol.value();
|
||||
@@ -422,8 +422,8 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>
|
||||
|
||||
#ifndef EIGEN_PARSED_BY_DOXYGEN
|
||||
/** \internal used by allowAligned() */
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
BlockImpl_dense(XprType& xpr, const Scalar* data, Index blockRows, Index blockCols)
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline BlockImpl_dense(XprType& xpr, const Scalar* data, Index blockRows, Index blockCols)
|
||||
: Base(data, blockRows, blockCols), m_xpr(xpr)
|
||||
{
|
||||
init();
|
||||
@@ -431,7 +431,7 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>
|
||||
#endif
|
||||
|
||||
protected:
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
EIGEN_DEVICE_FUNC
|
||||
void init()
|
||||
{
|
||||
m_outerStride = internal::traits<BlockType>::HasSameStorageOrderAsXprType
|
||||
|
||||
@@ -14,54 +14,56 @@ namespace Eigen {
|
||||
|
||||
namespace internal {
|
||||
|
||||
template<typename Derived, int UnrollCount, int Rows>
|
||||
template<typename Derived, int UnrollCount>
|
||||
struct all_unroller
|
||||
{
|
||||
typedef typename Derived::ExpressionTraits Traits;
|
||||
enum {
|
||||
col = (UnrollCount-1) / Rows,
|
||||
row = (UnrollCount-1) % Rows
|
||||
col = (UnrollCount-1) / Traits::RowsAtCompileTime,
|
||||
row = (UnrollCount-1) % Traits::RowsAtCompileTime
|
||||
};
|
||||
|
||||
static inline bool run(const Derived &mat)
|
||||
{
|
||||
return all_unroller<Derived, UnrollCount-1, Rows>::run(mat) && mat.coeff(row, col);
|
||||
return all_unroller<Derived, UnrollCount-1>::run(mat) && mat.coeff(row, col);
|
||||
}
|
||||
};
|
||||
|
||||
template<typename Derived, int Rows>
|
||||
struct all_unroller<Derived, 0, Rows>
|
||||
template<typename Derived>
|
||||
struct all_unroller<Derived, 0>
|
||||
{
|
||||
static inline bool run(const Derived &/*mat*/) { return true; }
|
||||
};
|
||||
|
||||
template<typename Derived, int Rows>
|
||||
struct all_unroller<Derived, Dynamic, Rows>
|
||||
template<typename Derived>
|
||||
struct all_unroller<Derived, Dynamic>
|
||||
{
|
||||
static inline bool run(const Derived &) { return false; }
|
||||
};
|
||||
|
||||
template<typename Derived, int UnrollCount, int Rows>
|
||||
template<typename Derived, int UnrollCount>
|
||||
struct any_unroller
|
||||
{
|
||||
typedef typename Derived::ExpressionTraits Traits;
|
||||
enum {
|
||||
col = (UnrollCount-1) / Rows,
|
||||
row = (UnrollCount-1) % Rows
|
||||
col = (UnrollCount-1) / Traits::RowsAtCompileTime,
|
||||
row = (UnrollCount-1) % Traits::RowsAtCompileTime
|
||||
};
|
||||
|
||||
static inline bool run(const Derived &mat)
|
||||
{
|
||||
return any_unroller<Derived, UnrollCount-1, Rows>::run(mat) || mat.coeff(row, col);
|
||||
return any_unroller<Derived, UnrollCount-1>::run(mat) || mat.coeff(row, col);
|
||||
}
|
||||
};
|
||||
|
||||
template<typename Derived, int Rows>
|
||||
struct any_unroller<Derived, 0, Rows>
|
||||
template<typename Derived>
|
||||
struct any_unroller<Derived, 0>
|
||||
{
|
||||
static inline bool run(const Derived & /*mat*/) { return false; }
|
||||
};
|
||||
|
||||
template<typename Derived, int Rows>
|
||||
struct any_unroller<Derived, Dynamic, Rows>
|
||||
template<typename Derived>
|
||||
struct any_unroller<Derived, Dynamic>
|
||||
{
|
||||
static inline bool run(const Derived &) { return false; }
|
||||
};
|
||||
@@ -76,7 +78,7 @@ struct any_unroller<Derived, Dynamic, Rows>
|
||||
* \sa any(), Cwise::operator<()
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_DEVICE_FUNC inline bool DenseBase<Derived>::all() const
|
||||
inline bool DenseBase<Derived>::all() const
|
||||
{
|
||||
typedef internal::evaluator<Derived> Evaluator;
|
||||
enum {
|
||||
@@ -85,7 +87,7 @@ EIGEN_DEVICE_FUNC inline bool DenseBase<Derived>::all() const
|
||||
};
|
||||
Evaluator evaluator(derived());
|
||||
if(unroll)
|
||||
return internal::all_unroller<Evaluator, unroll ? int(SizeAtCompileTime) : Dynamic, internal::traits<Derived>::RowsAtCompileTime>::run(evaluator);
|
||||
return internal::all_unroller<Evaluator, unroll ? int(SizeAtCompileTime) : Dynamic>::run(evaluator);
|
||||
else
|
||||
{
|
||||
for(Index j = 0; j < cols(); ++j)
|
||||
@@ -100,7 +102,7 @@ EIGEN_DEVICE_FUNC inline bool DenseBase<Derived>::all() const
|
||||
* \sa all()
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_DEVICE_FUNC inline bool DenseBase<Derived>::any() const
|
||||
inline bool DenseBase<Derived>::any() const
|
||||
{
|
||||
typedef internal::evaluator<Derived> Evaluator;
|
||||
enum {
|
||||
@@ -109,7 +111,7 @@ EIGEN_DEVICE_FUNC inline bool DenseBase<Derived>::any() const
|
||||
};
|
||||
Evaluator evaluator(derived());
|
||||
if(unroll)
|
||||
return internal::any_unroller<Evaluator, unroll ? int(SizeAtCompileTime) : Dynamic, internal::traits<Derived>::RowsAtCompileTime>::run(evaluator);
|
||||
return internal::any_unroller<Evaluator, unroll ? int(SizeAtCompileTime) : Dynamic>::run(evaluator);
|
||||
else
|
||||
{
|
||||
for(Index j = 0; j < cols(); ++j)
|
||||
@@ -124,7 +126,7 @@ EIGEN_DEVICE_FUNC inline bool DenseBase<Derived>::any() const
|
||||
* \sa all(), any()
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_DEVICE_FUNC inline Eigen::Index DenseBase<Derived>::count() const
|
||||
inline Eigen::Index DenseBase<Derived>::count() const
|
||||
{
|
||||
return derived().template cast<bool>().template cast<Index>().sum();
|
||||
}
|
||||
|
||||
@@ -141,7 +141,7 @@ struct CommaInitializer
|
||||
* \sa CommaInitializer::finished(), class CommaInitializer
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_DEVICE_FUNC inline CommaInitializer<Derived> DenseBase<Derived>::operator<< (const Scalar& s)
|
||||
inline CommaInitializer<Derived> DenseBase<Derived>::operator<< (const Scalar& s)
|
||||
{
|
||||
return CommaInitializer<Derived>(*static_cast<Derived*>(this), s);
|
||||
}
|
||||
@@ -149,7 +149,7 @@ EIGEN_DEVICE_FUNC inline CommaInitializer<Derived> DenseBase<Derived>::operator<
|
||||
/** \sa operator<<(const Scalar&) */
|
||||
template<typename Derived>
|
||||
template<typename OtherDerived>
|
||||
EIGEN_DEVICE_FUNC inline CommaInitializer<Derived>
|
||||
inline CommaInitializer<Derived>
|
||||
DenseBase<Derived>::operator<<(const DenseBase<OtherDerived>& other)
|
||||
{
|
||||
return CommaInitializer<Derived>(*static_cast<Derived *>(this), other);
|
||||
|
||||
@@ -90,8 +90,7 @@ template<typename T>
|
||||
struct evaluator : public unary_evaluator<T>
|
||||
{
|
||||
typedef unary_evaluator<T> Base;
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
explicit evaluator(const T& xpr) : Base(xpr) {}
|
||||
EIGEN_DEVICE_FUNC explicit evaluator(const T& xpr) : Base(xpr) {}
|
||||
};
|
||||
|
||||
|
||||
@@ -100,14 +99,14 @@ template<typename T>
|
||||
struct evaluator<const T>
|
||||
: evaluator<T>
|
||||
{
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
EIGEN_DEVICE_FUNC
|
||||
explicit evaluator(const T& xpr) : evaluator<T>(xpr) {}
|
||||
};
|
||||
|
||||
// ---------- base class for all evaluators ----------
|
||||
|
||||
template<typename ExpressionType>
|
||||
struct evaluator_base
|
||||
struct evaluator_base : public noncopyable
|
||||
{
|
||||
// TODO that's not very nice to have to propagate all these traits. They are currently only needed to handle outer,inner indices.
|
||||
typedef traits<ExpressionType> ExpressionTraits;
|
||||
@@ -115,14 +114,6 @@ struct evaluator_base
|
||||
enum {
|
||||
Alignment = 0
|
||||
};
|
||||
// noncopyable:
|
||||
// Don't make this class inherit noncopyable as this kills EBO (Empty Base Optimization)
|
||||
// and make complex evaluator much larger than then should do.
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE evaluator_base() {}
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~evaluator_base() {}
|
||||
private:
|
||||
EIGEN_DEVICE_FUNC evaluator_base(const evaluator_base&);
|
||||
EIGEN_DEVICE_FUNC const evaluator_base& operator=(const evaluator_base&);
|
||||
};
|
||||
|
||||
// -------------------- Matrix and Array --------------------
|
||||
@@ -132,33 +123,6 @@ private:
|
||||
// Here we directly specialize evaluator. This is not really a unary expression, and it is, by definition, dense,
|
||||
// so no need for more sophisticated dispatching.
|
||||
|
||||
// this helper permits to completely eliminate m_outerStride if it is known at compiletime.
|
||||
template<typename Scalar,int OuterStride> class plainobjectbase_evaluator_data {
|
||||
public:
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride) : data(ptr)
|
||||
{
|
||||
#ifndef EIGEN_INTERNAL_DEBUGGING
|
||||
EIGEN_UNUSED_VARIABLE(outerStride);
|
||||
#endif
|
||||
eigen_internal_assert(outerStride==OuterStride);
|
||||
}
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
Index outerStride() const { return OuterStride; }
|
||||
const Scalar *data;
|
||||
};
|
||||
|
||||
template<typename Scalar> class plainobjectbase_evaluator_data<Scalar,Dynamic> {
|
||||
public:
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride) : data(ptr), m_outerStride(outerStride) {}
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
Index outerStride() const { return m_outerStride; }
|
||||
const Scalar *data;
|
||||
protected:
|
||||
Index m_outerStride;
|
||||
};
|
||||
|
||||
template<typename Derived>
|
||||
struct evaluator<PlainObjectBase<Derived> >
|
||||
: evaluator_base<Derived>
|
||||
@@ -177,23 +141,18 @@ struct evaluator<PlainObjectBase<Derived> >
|
||||
Flags = traits<Derived>::EvaluatorFlags,
|
||||
Alignment = traits<Derived>::Alignment
|
||||
};
|
||||
enum {
|
||||
// We do not need to know the outer stride for vectors
|
||||
OuterStrideAtCompileTime = IsVectorAtCompileTime ? 0
|
||||
: int(IsRowMajor) ? ColsAtCompileTime
|
||||
: RowsAtCompileTime
|
||||
};
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
evaluator()
|
||||
: m_d(0,OuterStrideAtCompileTime)
|
||||
|
||||
EIGEN_DEVICE_FUNC evaluator()
|
||||
: m_data(0),
|
||||
m_outerStride(IsVectorAtCompileTime ? 0
|
||||
: int(IsRowMajor) ? ColsAtCompileTime
|
||||
: RowsAtCompileTime)
|
||||
{
|
||||
EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
explicit evaluator(const PlainObjectType& m)
|
||||
: m_d(m.data(),IsVectorAtCompileTime ? 0 : m.outerStride())
|
||||
|
||||
EIGEN_DEVICE_FUNC explicit evaluator(const PlainObjectType& m)
|
||||
: m_data(m.data()), m_outerStride(IsVectorAtCompileTime ? 0 : m.outerStride())
|
||||
{
|
||||
EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
|
||||
}
|
||||
@@ -202,30 +161,30 @@ struct evaluator<PlainObjectBase<Derived> >
|
||||
CoeffReturnType coeff(Index row, Index col) const
|
||||
{
|
||||
if (IsRowMajor)
|
||||
return m_d.data[row * m_d.outerStride() + col];
|
||||
return m_data[row * m_outerStride.value() + col];
|
||||
else
|
||||
return m_d.data[row + col * m_d.outerStride()];
|
||||
return m_data[row + col * m_outerStride.value()];
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
CoeffReturnType coeff(Index index) const
|
||||
{
|
||||
return m_d.data[index];
|
||||
return m_data[index];
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
Scalar& coeffRef(Index row, Index col)
|
||||
{
|
||||
if (IsRowMajor)
|
||||
return const_cast<Scalar*>(m_d.data)[row * m_d.outerStride() + col];
|
||||
return const_cast<Scalar*>(m_data)[row * m_outerStride.value() + col];
|
||||
else
|
||||
return const_cast<Scalar*>(m_d.data)[row + col * m_d.outerStride()];
|
||||
return const_cast<Scalar*>(m_data)[row + col * m_outerStride.value()];
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
Scalar& coeffRef(Index index)
|
||||
{
|
||||
return const_cast<Scalar*>(m_d.data)[index];
|
||||
return const_cast<Scalar*>(m_data)[index];
|
||||
}
|
||||
|
||||
template<int LoadMode, typename PacketType>
|
||||
@@ -233,16 +192,16 @@ struct evaluator<PlainObjectBase<Derived> >
|
||||
PacketType packet(Index row, Index col) const
|
||||
{
|
||||
if (IsRowMajor)
|
||||
return ploadt<PacketType, LoadMode>(m_d.data + row * m_d.outerStride() + col);
|
||||
return ploadt<PacketType, LoadMode>(m_data + row * m_outerStride.value() + col);
|
||||
else
|
||||
return ploadt<PacketType, LoadMode>(m_d.data + row + col * m_d.outerStride());
|
||||
return ploadt<PacketType, LoadMode>(m_data + row + col * m_outerStride.value());
|
||||
}
|
||||
|
||||
template<int LoadMode, typename PacketType>
|
||||
EIGEN_STRONG_INLINE
|
||||
PacketType packet(Index index) const
|
||||
{
|
||||
return ploadt<PacketType, LoadMode>(m_d.data + index);
|
||||
return ploadt<PacketType, LoadMode>(m_data + index);
|
||||
}
|
||||
|
||||
template<int StoreMode,typename PacketType>
|
||||
@@ -251,22 +210,26 @@ struct evaluator<PlainObjectBase<Derived> >
|
||||
{
|
||||
if (IsRowMajor)
|
||||
return pstoret<Scalar, PacketType, StoreMode>
|
||||
(const_cast<Scalar*>(m_d.data) + row * m_d.outerStride() + col, x);
|
||||
(const_cast<Scalar*>(m_data) + row * m_outerStride.value() + col, x);
|
||||
else
|
||||
return pstoret<Scalar, PacketType, StoreMode>
|
||||
(const_cast<Scalar*>(m_d.data) + row + col * m_d.outerStride(), x);
|
||||
(const_cast<Scalar*>(m_data) + row + col * m_outerStride.value(), x);
|
||||
}
|
||||
|
||||
template<int StoreMode, typename PacketType>
|
||||
EIGEN_STRONG_INLINE
|
||||
void writePacket(Index index, const PacketType& x)
|
||||
{
|
||||
return pstoret<Scalar, PacketType, StoreMode>(const_cast<Scalar*>(m_d.data) + index, x);
|
||||
return pstoret<Scalar, PacketType, StoreMode>(const_cast<Scalar*>(m_data) + index, x);
|
||||
}
|
||||
|
||||
protected:
|
||||
const Scalar *m_data;
|
||||
|
||||
plainobjectbase_evaluator_data<Scalar,OuterStrideAtCompileTime> m_d;
|
||||
// We do not need to know the outer stride for vectors
|
||||
variable_if_dynamic<Index, IsVectorAtCompileTime ? 0
|
||||
: int(IsRowMajor) ? ColsAtCompileTime
|
||||
: RowsAtCompileTime> m_outerStride;
|
||||
};
|
||||
|
||||
template<typename Scalar, int Rows, int Cols, int Options, int MaxRows, int MaxCols>
|
||||
@@ -275,11 +238,9 @@ struct evaluator<Matrix<Scalar, Rows, Cols, Options, MaxRows, MaxCols> >
|
||||
{
|
||||
typedef Matrix<Scalar, Rows, Cols, Options, MaxRows, MaxCols> XprType;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
evaluator() {}
|
||||
EIGEN_DEVICE_FUNC evaluator() {}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
explicit evaluator(const XprType& m)
|
||||
EIGEN_DEVICE_FUNC explicit evaluator(const XprType& m)
|
||||
: evaluator<PlainObjectBase<XprType> >(m)
|
||||
{ }
|
||||
};
|
||||
@@ -290,11 +251,9 @@ struct evaluator<Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols> >
|
||||
{
|
||||
typedef Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols> XprType;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
evaluator() {}
|
||||
EIGEN_DEVICE_FUNC evaluator() {}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
explicit evaluator(const XprType& m)
|
||||
EIGEN_DEVICE_FUNC explicit evaluator(const XprType& m)
|
||||
: evaluator<PlainObjectBase<XprType> >(m)
|
||||
{ }
|
||||
};
|
||||
@@ -313,8 +272,7 @@ struct unary_evaluator<Transpose<ArgType>, IndexBased>
|
||||
Alignment = evaluator<ArgType>::Alignment
|
||||
};
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
explicit unary_evaluator(const XprType& t) : m_argImpl(t.nestedExpression()) {}
|
||||
EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& t) : m_argImpl(t.nestedExpression()) {}
|
||||
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
@@ -569,7 +527,9 @@ struct unary_evaluator<CwiseUnaryOp<UnaryOp, ArgType>, IndexBased >
|
||||
};
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
explicit unary_evaluator(const XprType& op) : m_d(op)
|
||||
explicit unary_evaluator(const XprType& op)
|
||||
: m_functor(op.functor()),
|
||||
m_argImpl(op.nestedExpression())
|
||||
{
|
||||
EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits<UnaryOp>::Cost);
|
||||
EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
|
||||
@@ -580,43 +540,32 @@ struct unary_evaluator<CwiseUnaryOp<UnaryOp, ArgType>, IndexBased >
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
CoeffReturnType coeff(Index row, Index col) const
|
||||
{
|
||||
return m_d.func()(m_d.argImpl.coeff(row, col));
|
||||
return m_functor(m_argImpl.coeff(row, col));
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
CoeffReturnType coeff(Index index) const
|
||||
{
|
||||
return m_d.func()(m_d.argImpl.coeff(index));
|
||||
return m_functor(m_argImpl.coeff(index));
|
||||
}
|
||||
|
||||
template<int LoadMode, typename PacketType>
|
||||
EIGEN_STRONG_INLINE
|
||||
PacketType packet(Index row, Index col) const
|
||||
{
|
||||
return m_d.func().packetOp(m_d.argImpl.template packet<LoadMode, PacketType>(row, col));
|
||||
return m_functor.packetOp(m_argImpl.template packet<LoadMode, PacketType>(row, col));
|
||||
}
|
||||
|
||||
template<int LoadMode, typename PacketType>
|
||||
EIGEN_STRONG_INLINE
|
||||
PacketType packet(Index index) const
|
||||
{
|
||||
return m_d.func().packetOp(m_d.argImpl.template packet<LoadMode, PacketType>(index));
|
||||
return m_functor.packetOp(m_argImpl.template packet<LoadMode, PacketType>(index));
|
||||
}
|
||||
|
||||
protected:
|
||||
|
||||
// this helper permits to completely eliminate the functor if it is empty
|
||||
class Data : private UnaryOp
|
||||
{
|
||||
public:
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
Data(const XprType& xpr) : UnaryOp(xpr.functor()), argImpl(xpr.nestedExpression()) {}
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
const UnaryOp& func() const { return static_cast<const UnaryOp&>(*this); }
|
||||
evaluator<ArgType> argImpl;
|
||||
};
|
||||
|
||||
Data m_d;
|
||||
const UnaryOp m_functor;
|
||||
evaluator<ArgType> m_argImpl;
|
||||
};
|
||||
|
||||
// -------------------- CwiseTernaryOp --------------------
|
||||
@@ -660,7 +609,11 @@ struct ternary_evaluator<CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3>, IndexBased
|
||||
evaluator<Arg3>::Alignment)
|
||||
};
|
||||
|
||||
EIGEN_DEVICE_FUNC explicit ternary_evaluator(const XprType& xpr) : m_d(xpr)
|
||||
EIGEN_DEVICE_FUNC explicit ternary_evaluator(const XprType& xpr)
|
||||
: m_functor(xpr.functor()),
|
||||
m_arg1Impl(xpr.arg1()),
|
||||
m_arg2Impl(xpr.arg2()),
|
||||
m_arg3Impl(xpr.arg3())
|
||||
{
|
||||
EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits<TernaryOp>::Cost);
|
||||
EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
|
||||
@@ -671,47 +624,38 @@ struct ternary_evaluator<CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3>, IndexBased
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
CoeffReturnType coeff(Index row, Index col) const
|
||||
{
|
||||
return m_d.func()(m_d.arg1Impl.coeff(row, col), m_d.arg2Impl.coeff(row, col), m_d.arg3Impl.coeff(row, col));
|
||||
return m_functor(m_arg1Impl.coeff(row, col), m_arg2Impl.coeff(row, col), m_arg3Impl.coeff(row, col));
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
CoeffReturnType coeff(Index index) const
|
||||
{
|
||||
return m_d.func()(m_d.arg1Impl.coeff(index), m_d.arg2Impl.coeff(index), m_d.arg3Impl.coeff(index));
|
||||
return m_functor(m_arg1Impl.coeff(index), m_arg2Impl.coeff(index), m_arg3Impl.coeff(index));
|
||||
}
|
||||
|
||||
template<int LoadMode, typename PacketType>
|
||||
EIGEN_STRONG_INLINE
|
||||
PacketType packet(Index row, Index col) const
|
||||
{
|
||||
return m_d.func().packetOp(m_d.arg1Impl.template packet<LoadMode,PacketType>(row, col),
|
||||
m_d.arg2Impl.template packet<LoadMode,PacketType>(row, col),
|
||||
m_d.arg3Impl.template packet<LoadMode,PacketType>(row, col));
|
||||
return m_functor.packetOp(m_arg1Impl.template packet<LoadMode,PacketType>(row, col),
|
||||
m_arg2Impl.template packet<LoadMode,PacketType>(row, col),
|
||||
m_arg3Impl.template packet<LoadMode,PacketType>(row, col));
|
||||
}
|
||||
|
||||
template<int LoadMode, typename PacketType>
|
||||
EIGEN_STRONG_INLINE
|
||||
PacketType packet(Index index) const
|
||||
{
|
||||
return m_d.func().packetOp(m_d.arg1Impl.template packet<LoadMode,PacketType>(index),
|
||||
m_d.arg2Impl.template packet<LoadMode,PacketType>(index),
|
||||
m_d.arg3Impl.template packet<LoadMode,PacketType>(index));
|
||||
return m_functor.packetOp(m_arg1Impl.template packet<LoadMode,PacketType>(index),
|
||||
m_arg2Impl.template packet<LoadMode,PacketType>(index),
|
||||
m_arg3Impl.template packet<LoadMode,PacketType>(index));
|
||||
}
|
||||
|
||||
protected:
|
||||
// this helper permits to completely eliminate the functor if it is empty
|
||||
struct Data : private TernaryOp
|
||||
{
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
Data(const XprType& xpr) : TernaryOp(xpr.functor()), arg1Impl(xpr.arg1()), arg2Impl(xpr.arg2()), arg3Impl(xpr.arg3()) {}
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
const TernaryOp& func() const { return static_cast<const TernaryOp&>(*this); }
|
||||
evaluator<Arg1> arg1Impl;
|
||||
evaluator<Arg2> arg2Impl;
|
||||
evaluator<Arg3> arg3Impl;
|
||||
};
|
||||
|
||||
Data m_d;
|
||||
const TernaryOp m_functor;
|
||||
evaluator<Arg1> m_arg1Impl;
|
||||
evaluator<Arg2> m_arg2Impl;
|
||||
evaluator<Arg3> m_arg3Impl;
|
||||
};
|
||||
|
||||
// -------------------- CwiseBinaryOp --------------------
|
||||
@@ -724,8 +668,7 @@ struct evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >
|
||||
typedef CwiseBinaryOp<BinaryOp, Lhs, Rhs> XprType;
|
||||
typedef binary_evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs> > Base;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
explicit evaluator(const XprType& xpr) : Base(xpr) {}
|
||||
EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) : Base(xpr) {}
|
||||
};
|
||||
|
||||
template<typename BinaryOp, typename Lhs, typename Rhs>
|
||||
@@ -753,8 +696,10 @@ struct binary_evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs>, IndexBased, IndexBase
|
||||
Alignment = EIGEN_PLAIN_ENUM_MIN(evaluator<Lhs>::Alignment,evaluator<Rhs>::Alignment)
|
||||
};
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
explicit binary_evaluator(const XprType& xpr) : m_d(xpr)
|
||||
EIGEN_DEVICE_FUNC explicit binary_evaluator(const XprType& xpr)
|
||||
: m_functor(xpr.functor()),
|
||||
m_lhsImpl(xpr.lhs()),
|
||||
m_rhsImpl(xpr.rhs())
|
||||
{
|
||||
EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits<BinaryOp>::Cost);
|
||||
EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
|
||||
@@ -765,45 +710,35 @@ struct binary_evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs>, IndexBased, IndexBase
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
CoeffReturnType coeff(Index row, Index col) const
|
||||
{
|
||||
return m_d.func()(m_d.lhsImpl.coeff(row, col), m_d.rhsImpl.coeff(row, col));
|
||||
return m_functor(m_lhsImpl.coeff(row, col), m_rhsImpl.coeff(row, col));
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
CoeffReturnType coeff(Index index) const
|
||||
{
|
||||
return m_d.func()(m_d.lhsImpl.coeff(index), m_d.rhsImpl.coeff(index));
|
||||
return m_functor(m_lhsImpl.coeff(index), m_rhsImpl.coeff(index));
|
||||
}
|
||||
|
||||
template<int LoadMode, typename PacketType>
|
||||
EIGEN_STRONG_INLINE
|
||||
PacketType packet(Index row, Index col) const
|
||||
{
|
||||
return m_d.func().packetOp(m_d.lhsImpl.template packet<LoadMode,PacketType>(row, col),
|
||||
m_d.rhsImpl.template packet<LoadMode,PacketType>(row, col));
|
||||
return m_functor.packetOp(m_lhsImpl.template packet<LoadMode,PacketType>(row, col),
|
||||
m_rhsImpl.template packet<LoadMode,PacketType>(row, col));
|
||||
}
|
||||
|
||||
template<int LoadMode, typename PacketType>
|
||||
EIGEN_STRONG_INLINE
|
||||
PacketType packet(Index index) const
|
||||
{
|
||||
return m_d.func().packetOp(m_d.lhsImpl.template packet<LoadMode,PacketType>(index),
|
||||
m_d.rhsImpl.template packet<LoadMode,PacketType>(index));
|
||||
return m_functor.packetOp(m_lhsImpl.template packet<LoadMode,PacketType>(index),
|
||||
m_rhsImpl.template packet<LoadMode,PacketType>(index));
|
||||
}
|
||||
|
||||
protected:
|
||||
|
||||
// this helper permits to completely eliminate the functor if it is empty
|
||||
struct Data : private BinaryOp
|
||||
{
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
Data(const XprType& xpr) : BinaryOp(xpr.functor()), lhsImpl(xpr.lhs()), rhsImpl(xpr.rhs()) {}
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
const BinaryOp& func() const { return static_cast<const BinaryOp&>(*this); }
|
||||
evaluator<Lhs> lhsImpl;
|
||||
evaluator<Rhs> rhsImpl;
|
||||
};
|
||||
|
||||
Data m_d;
|
||||
const BinaryOp m_functor;
|
||||
evaluator<Lhs> m_lhsImpl;
|
||||
evaluator<Rhs> m_rhsImpl;
|
||||
};
|
||||
|
||||
// -------------------- CwiseUnaryView --------------------
|
||||
@@ -822,7 +757,9 @@ struct unary_evaluator<CwiseUnaryView<UnaryOp, ArgType>, IndexBased>
|
||||
Alignment = 0 // FIXME it is not very clear why alignment is necessarily lost...
|
||||
};
|
||||
|
||||
EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& op) : m_d(op)
|
||||
EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& op)
|
||||
: m_unaryOp(op.functor()),
|
||||
m_argImpl(op.nestedExpression())
|
||||
{
|
||||
EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits<UnaryOp>::Cost);
|
||||
EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
|
||||
@@ -834,40 +771,30 @@ struct unary_evaluator<CwiseUnaryView<UnaryOp, ArgType>, IndexBased>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
CoeffReturnType coeff(Index row, Index col) const
|
||||
{
|
||||
return m_d.func()(m_d.argImpl.coeff(row, col));
|
||||
return m_unaryOp(m_argImpl.coeff(row, col));
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
CoeffReturnType coeff(Index index) const
|
||||
{
|
||||
return m_d.func()(m_d.argImpl.coeff(index));
|
||||
return m_unaryOp(m_argImpl.coeff(index));
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
Scalar& coeffRef(Index row, Index col)
|
||||
{
|
||||
return m_d.func()(m_d.argImpl.coeffRef(row, col));
|
||||
return m_unaryOp(m_argImpl.coeffRef(row, col));
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
Scalar& coeffRef(Index index)
|
||||
{
|
||||
return m_d.func()(m_d.argImpl.coeffRef(index));
|
||||
return m_unaryOp(m_argImpl.coeffRef(index));
|
||||
}
|
||||
|
||||
protected:
|
||||
|
||||
// this helper permits to completely eliminate the functor if it is empty
|
||||
struct Data : private UnaryOp
|
||||
{
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
Data(const XprType& xpr) : UnaryOp(xpr.functor()), argImpl(xpr.nestedExpression()) {}
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
const UnaryOp& func() const { return static_cast<const UnaryOp&>(*this); }
|
||||
evaluator<ArgType> argImpl;
|
||||
};
|
||||
|
||||
Data m_d;
|
||||
const UnaryOp m_unaryOp;
|
||||
evaluator<ArgType> m_argImpl;
|
||||
};
|
||||
|
||||
// -------------------- Map --------------------
|
||||
@@ -891,8 +818,7 @@ struct mapbase_evaluator : evaluator_base<Derived>
|
||||
CoeffReadCost = NumTraits<Scalar>::ReadCost
|
||||
};
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
explicit mapbase_evaluator(const XprType& map)
|
||||
EIGEN_DEVICE_FUNC explicit mapbase_evaluator(const XprType& map)
|
||||
: m_data(const_cast<PointerType>(map.data())),
|
||||
m_innerStride(map.innerStride()),
|
||||
m_outerStride(map.outerStride())
|
||||
@@ -956,10 +882,10 @@ struct mapbase_evaluator : evaluator_base<Derived>
|
||||
internal::pstoret<Scalar, PacketType, StoreMode>(m_data + index * m_innerStride.value(), x);
|
||||
}
|
||||
protected:
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
Index rowStride() const { return XprType::IsRowMajor ? m_outerStride.value() : m_innerStride.value(); }
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
Index colStride() const { return XprType::IsRowMajor ? m_innerStride.value() : m_outerStride.value(); }
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline Index rowStride() const { return XprType::IsRowMajor ? m_outerStride.value() : m_innerStride.value(); }
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline Index colStride() const { return XprType::IsRowMajor ? m_innerStride.value() : m_outerStride.value(); }
|
||||
|
||||
PointerType m_data;
|
||||
const internal::variable_if_dynamic<Index, XprType::InnerStrideAtCompileTime> m_innerStride;
|
||||
@@ -1012,8 +938,7 @@ struct evaluator<Ref<PlainObjectType, RefOptions, StrideType> >
|
||||
Alignment = evaluator<Map<PlainObjectType, RefOptions, StrideType> >::Alignment
|
||||
};
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
explicit evaluator(const XprType& ref)
|
||||
EIGEN_DEVICE_FUNC explicit evaluator(const XprType& ref)
|
||||
: mapbase_evaluator<XprType, PlainObjectType>(ref)
|
||||
{ }
|
||||
};
|
||||
@@ -1068,8 +993,7 @@ struct evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel> >
|
||||
Alignment = EIGEN_PLAIN_ENUM_MIN(evaluator<ArgType>::Alignment, Alignment0)
|
||||
};
|
||||
typedef block_evaluator<ArgType, BlockRows, BlockCols, InnerPanel> block_evaluator_type;
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
explicit evaluator(const XprType& block) : block_evaluator_type(block)
|
||||
EIGEN_DEVICE_FUNC explicit evaluator(const XprType& block) : block_evaluator_type(block)
|
||||
{
|
||||
EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
|
||||
}
|
||||
@@ -1082,8 +1006,7 @@ struct block_evaluator<ArgType, BlockRows, BlockCols, InnerPanel, /*HasDirectAcc
|
||||
{
|
||||
typedef Block<ArgType, BlockRows, BlockCols, InnerPanel> XprType;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
explicit block_evaluator(const XprType& block)
|
||||
EIGEN_DEVICE_FUNC explicit block_evaluator(const XprType& block)
|
||||
: unary_evaluator<XprType>(block)
|
||||
{}
|
||||
};
|
||||
@@ -1094,12 +1017,11 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
|
||||
{
|
||||
typedef Block<ArgType, BlockRows, BlockCols, InnerPanel> XprType;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
explicit unary_evaluator(const XprType& block)
|
||||
EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& block)
|
||||
: m_argImpl(block.nestedExpression()),
|
||||
m_startRow(block.startRow()),
|
||||
m_startCol(block.startCol()),
|
||||
m_linear_offset(ForwardLinearAccess?(ArgType::IsRowMajor ? block.startRow()*block.nestedExpression().cols() + block.startCol() : block.startCol()*block.nestedExpression().rows() + block.startRow()):0)
|
||||
m_linear_offset(InnerPanel?(XprType::IsRowMajor ? block.startRow()*block.cols() : block.startCol()*block.rows()):0)
|
||||
{ }
|
||||
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
@@ -1107,7 +1029,7 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
|
||||
|
||||
enum {
|
||||
RowsAtCompileTime = XprType::RowsAtCompileTime,
|
||||
ForwardLinearAccess = (InnerPanel || int(XprType::IsRowMajor)==int(ArgType::IsRowMajor)) && bool(evaluator<ArgType>::Flags&LinearAccessBit)
|
||||
ForwardLinearAccess = InnerPanel && bool(evaluator<ArgType>::Flags&LinearAccessBit)
|
||||
};
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
@@ -1118,8 +1040,11 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
CoeffReturnType coeff(Index index) const
|
||||
{
|
||||
return linear_coeff_impl(index, bool_constant<ForwardLinearAccess>());
|
||||
{
|
||||
if (ForwardLinearAccess)
|
||||
return m_argImpl.coeff(m_linear_offset.value() + index);
|
||||
else
|
||||
return coeff(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
@@ -1130,8 +1055,11 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
Scalar& coeffRef(Index index)
|
||||
{
|
||||
return linear_coeffRef_impl(index, bool_constant<ForwardLinearAccess>());
|
||||
{
|
||||
if (ForwardLinearAccess)
|
||||
return m_argImpl.coeffRef(m_linear_offset.value() + index);
|
||||
else
|
||||
return coeffRef(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0);
|
||||
}
|
||||
|
||||
template<int LoadMode, typename PacketType>
|
||||
@@ -1172,32 +1100,10 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
|
||||
}
|
||||
|
||||
protected:
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
CoeffReturnType linear_coeff_impl(Index index, internal::true_type /* ForwardLinearAccess */) const
|
||||
{
|
||||
return m_argImpl.coeff(m_linear_offset.value() + index);
|
||||
}
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
CoeffReturnType linear_coeff_impl(Index index, internal::false_type /* not ForwardLinearAccess */) const
|
||||
{
|
||||
return coeff(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
Scalar& linear_coeffRef_impl(Index index, internal::true_type /* ForwardLinearAccess */)
|
||||
{
|
||||
return m_argImpl.coeffRef(m_linear_offset.value() + index);
|
||||
}
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
Scalar& linear_coeffRef_impl(Index index, internal::false_type /* not ForwardLinearAccess */)
|
||||
{
|
||||
return coeffRef(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0);
|
||||
}
|
||||
|
||||
evaluator<ArgType> m_argImpl;
|
||||
const variable_if_dynamic<Index, (ArgType::RowsAtCompileTime == 1 && BlockRows==1) ? 0 : Dynamic> m_startRow;
|
||||
const variable_if_dynamic<Index, (ArgType::ColsAtCompileTime == 1 && BlockCols==1) ? 0 : Dynamic> m_startCol;
|
||||
const variable_if_dynamic<Index, ForwardLinearAccess ? Dynamic : 0> m_linear_offset;
|
||||
const variable_if_dynamic<Index, InnerPanel ? Dynamic : 0> m_linear_offset;
|
||||
};
|
||||
|
||||
// TODO: This evaluator does not actually use the child evaluator;
|
||||
@@ -1211,8 +1117,7 @@ struct block_evaluator<ArgType, BlockRows, BlockCols, InnerPanel, /* HasDirectAc
|
||||
typedef Block<ArgType, BlockRows, BlockCols, InnerPanel> XprType;
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
explicit block_evaluator(const XprType& block)
|
||||
EIGEN_DEVICE_FUNC explicit block_evaluator(const XprType& block)
|
||||
: mapbase_evaluator<XprType, typename XprType::PlainObject>(block)
|
||||
{
|
||||
// TODO: for the 3.3 release, this should be turned to an internal assertion, but let's keep it as is for the beta lifetime
|
||||
@@ -1240,8 +1145,7 @@ struct evaluator<Select<ConditionMatrixType, ThenMatrixType, ElseMatrixType> >
|
||||
Alignment = EIGEN_PLAIN_ENUM_MIN(evaluator<ThenMatrixType>::Alignment, evaluator<ElseMatrixType>::Alignment)
|
||||
};
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
explicit evaluator(const XprType& select)
|
||||
EIGEN_DEVICE_FUNC explicit evaluator(const XprType& select)
|
||||
: m_conditionImpl(select.conditionMatrix()),
|
||||
m_thenImpl(select.thenMatrix()),
|
||||
m_elseImpl(select.elseMatrix())
|
||||
@@ -1298,8 +1202,7 @@ struct unary_evaluator<Replicate<ArgType, RowFactor, ColFactor> >
|
||||
Alignment = evaluator<ArgTypeNestedCleaned>::Alignment
|
||||
};
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
explicit unary_evaluator(const XprType& replicate)
|
||||
EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& replicate)
|
||||
: m_arg(replicate.nestedExpression()),
|
||||
m_argImpl(m_arg),
|
||||
m_rows(replicate.nestedExpression().rows()),
|
||||
@@ -1363,6 +1266,64 @@ protected:
|
||||
const variable_if_dynamic<Index, ArgType::ColsAtCompileTime> m_cols;
|
||||
};
|
||||
|
||||
|
||||
// -------------------- PartialReduxExpr --------------------
|
||||
|
||||
template< typename ArgType, typename MemberOp, int Direction>
|
||||
struct evaluator<PartialReduxExpr<ArgType, MemberOp, Direction> >
|
||||
: evaluator_base<PartialReduxExpr<ArgType, MemberOp, Direction> >
|
||||
{
|
||||
typedef PartialReduxExpr<ArgType, MemberOp, Direction> XprType;
|
||||
typedef typename internal::nested_eval<ArgType,1>::type ArgTypeNested;
|
||||
typedef typename internal::remove_all<ArgTypeNested>::type ArgTypeNestedCleaned;
|
||||
typedef typename ArgType::Scalar InputScalar;
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
enum {
|
||||
TraversalSize = Direction==int(Vertical) ? int(ArgType::RowsAtCompileTime) : int(ArgType::ColsAtCompileTime)
|
||||
};
|
||||
typedef typename MemberOp::template Cost<InputScalar,int(TraversalSize)> CostOpType;
|
||||
enum {
|
||||
CoeffReadCost = TraversalSize==Dynamic ? HugeCost
|
||||
: TraversalSize * evaluator<ArgType>::CoeffReadCost + int(CostOpType::value),
|
||||
|
||||
Flags = (traits<XprType>::Flags&RowMajorBit) | (evaluator<ArgType>::Flags&(HereditaryBits&(~RowMajorBit))) | LinearAccessBit,
|
||||
|
||||
Alignment = 0 // FIXME this will need to be improved once PartialReduxExpr is vectorized
|
||||
};
|
||||
|
||||
EIGEN_DEVICE_FUNC explicit evaluator(const XprType xpr)
|
||||
: m_arg(xpr.nestedExpression()), m_functor(xpr.functor())
|
||||
{
|
||||
EIGEN_INTERNAL_CHECK_COST_VALUE(TraversalSize==Dynamic ? HugeCost : int(CostOpType::value));
|
||||
EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
|
||||
}
|
||||
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
const Scalar coeff(Index i, Index j) const
|
||||
{
|
||||
if (Direction==Vertical)
|
||||
return m_functor(m_arg.col(j));
|
||||
else
|
||||
return m_functor(m_arg.row(i));
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
const Scalar coeff(Index index) const
|
||||
{
|
||||
if (Direction==Vertical)
|
||||
return m_functor(m_arg.col(index));
|
||||
else
|
||||
return m_functor(m_arg.row(index));
|
||||
}
|
||||
|
||||
protected:
|
||||
typename internal::add_const_on_value_type<ArgTypeNested>::type m_arg;
|
||||
const MemberOp m_functor;
|
||||
};
|
||||
|
||||
|
||||
// -------------------- MatrixWrapper and ArrayWrapper --------------------
|
||||
//
|
||||
// evaluator_wrapper_base<T> is a common base class for the
|
||||
@@ -1379,8 +1340,7 @@ struct evaluator_wrapper_base
|
||||
Alignment = evaluator<ArgType>::Alignment
|
||||
};
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
explicit evaluator_wrapper_base(const ArgType& arg) : m_argImpl(arg) {}
|
||||
EIGEN_DEVICE_FUNC explicit evaluator_wrapper_base(const ArgType& arg) : m_argImpl(arg) {}
|
||||
|
||||
typedef typename ArgType::Scalar Scalar;
|
||||
typedef typename ArgType::CoeffReturnType CoeffReturnType;
|
||||
@@ -1447,8 +1407,7 @@ struct unary_evaluator<MatrixWrapper<TArgType> >
|
||||
{
|
||||
typedef MatrixWrapper<TArgType> XprType;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
explicit unary_evaluator(const XprType& wrapper)
|
||||
EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& wrapper)
|
||||
: evaluator_wrapper_base<MatrixWrapper<TArgType> >(wrapper.nestedExpression())
|
||||
{ }
|
||||
};
|
||||
@@ -1459,8 +1418,7 @@ struct unary_evaluator<ArrayWrapper<TArgType> >
|
||||
{
|
||||
typedef ArrayWrapper<TArgType> XprType;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
explicit unary_evaluator(const XprType& wrapper)
|
||||
EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& wrapper)
|
||||
: evaluator_wrapper_base<ArrayWrapper<TArgType> >(wrapper.nestedExpression())
|
||||
{ }
|
||||
};
|
||||
@@ -1502,8 +1460,7 @@ struct unary_evaluator<Reverse<ArgType, Direction> >
|
||||
Alignment = 0 // FIXME in some rare cases, Alignment could be preserved, like a Vector4f.
|
||||
};
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
explicit unary_evaluator(const XprType& reverse)
|
||||
EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& reverse)
|
||||
: m_argImpl(reverse.nestedExpression()),
|
||||
m_rows(ReverseRow ? reverse.nestedExpression().rows() : 1),
|
||||
m_cols(ReverseCol ? reverse.nestedExpression().cols() : 1)
|
||||
@@ -1610,8 +1567,7 @@ struct evaluator<Diagonal<ArgType, DiagIndex> >
|
||||
Alignment = 0
|
||||
};
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
explicit evaluator(const XprType& diagonal)
|
||||
EIGEN_DEVICE_FUNC explicit evaluator(const XprType& diagonal)
|
||||
: m_argImpl(diagonal.nestedExpression()),
|
||||
m_index(diagonal.index())
|
||||
{ }
|
||||
|
||||
@@ -48,11 +48,6 @@ public:
|
||||
* Explicit zeros are not skipped over. To skip explicit zeros, see class SparseView
|
||||
*/
|
||||
EIGEN_STRONG_INLINE InnerIterator& operator++() { m_iter.operator++(); return *this; }
|
||||
EIGEN_STRONG_INLINE InnerIterator& operator+=(Index i) { m_iter.operator+=(i); return *this; }
|
||||
EIGEN_STRONG_INLINE InnerIterator operator+(Index i)
|
||||
{ InnerIterator result(*this); result+=i; return result; }
|
||||
|
||||
|
||||
/// \returns the column or row index of the current coefficient.
|
||||
EIGEN_STRONG_INLINE Index index() const { return m_iter.index(); }
|
||||
/// \returns the row index of the current coefficient.
|
||||
|
||||
@@ -100,14 +100,8 @@ class CwiseBinaryOp :
|
||||
typedef typename internal::remove_reference<LhsNested>::type _LhsNested;
|
||||
typedef typename internal::remove_reference<RhsNested>::type _RhsNested;
|
||||
|
||||
#if EIGEN_COMP_MSVC && EIGEN_HAS_CXX11
|
||||
//Required for Visual Studio or the Copy constructor will probably not get inlined!
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
CwiseBinaryOp(const CwiseBinaryOp<BinaryOp,LhsType,RhsType>&) = default;
|
||||
#endif
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
CwiseBinaryOp(const Lhs& aLhs, const Rhs& aRhs, const BinaryOp& func = BinaryOp())
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE CwiseBinaryOp(const Lhs& aLhs, const Rhs& aRhs, const BinaryOp& func = BinaryOp())
|
||||
: m_lhs(aLhs), m_rhs(aRhs), m_functor(func)
|
||||
{
|
||||
EIGEN_CHECK_BINARY_COMPATIBILIY(BinaryOp,typename Lhs::Scalar,typename Rhs::Scalar);
|
||||
@@ -116,16 +110,16 @@ class CwiseBinaryOp :
|
||||
eigen_assert(aLhs.rows() == aRhs.rows() && aLhs.cols() == aRhs.cols());
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
Index rows() const {
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE Index rows() const {
|
||||
// return the fixed size type if available to enable compile time optimizations
|
||||
if (internal::traits<typename internal::remove_all<LhsNested>::type>::RowsAtCompileTime==Dynamic)
|
||||
return m_rhs.rows();
|
||||
else
|
||||
return m_lhs.rows();
|
||||
}
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
Index cols() const {
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE Index cols() const {
|
||||
// return the fixed size type if available to enable compile time optimizations
|
||||
if (internal::traits<typename internal::remove_all<LhsNested>::type>::ColsAtCompileTime==Dynamic)
|
||||
return m_rhs.cols();
|
||||
@@ -134,13 +128,13 @@ class CwiseBinaryOp :
|
||||
}
|
||||
|
||||
/** \returns the left hand side nested expression */
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
EIGEN_DEVICE_FUNC
|
||||
const _LhsNested& lhs() const { return m_lhs; }
|
||||
/** \returns the right hand side nested expression */
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
EIGEN_DEVICE_FUNC
|
||||
const _RhsNested& rhs() const { return m_rhs; }
|
||||
/** \returns the functor representing the binary operation */
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
EIGEN_DEVICE_FUNC
|
||||
const BinaryOp& functor() const { return m_functor; }
|
||||
|
||||
protected:
|
||||
@@ -164,7 +158,7 @@ public:
|
||||
*/
|
||||
template<typename Derived>
|
||||
template<typename OtherDerived>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived &
|
||||
EIGEN_STRONG_INLINE Derived &
|
||||
MatrixBase<Derived>::operator-=(const MatrixBase<OtherDerived> &other)
|
||||
{
|
||||
call_assignment(derived(), other.derived(), internal::sub_assign_op<Scalar,typename OtherDerived::Scalar>());
|
||||
@@ -177,7 +171,7 @@ MatrixBase<Derived>::operator-=(const MatrixBase<OtherDerived> &other)
|
||||
*/
|
||||
template<typename Derived>
|
||||
template<typename OtherDerived>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived &
|
||||
EIGEN_STRONG_INLINE Derived &
|
||||
MatrixBase<Derived>::operator+=(const MatrixBase<OtherDerived>& other)
|
||||
{
|
||||
call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar,typename OtherDerived::Scalar>());
|
||||
@@ -187,3 +181,4 @@ MatrixBase<Derived>::operator+=(const MatrixBase<OtherDerived>& other)
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_CWISE_BINARY_OP_H
|
||||
|
||||
|
||||
@@ -105,12 +105,7 @@ class CwiseNullaryOp : public internal::dense_xpr_base< CwiseNullaryOp<NullaryOp
|
||||
*/
|
||||
template<typename Derived>
|
||||
template<typename CustomNullaryOp>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
#ifndef EIGEN_PARSED_BY_DOXYGEN
|
||||
const CwiseNullaryOp<CustomNullaryOp,typename DenseBase<Derived>::PlainObject>
|
||||
#else
|
||||
const CwiseNullaryOp<CustomNullaryOp,PlainObject>
|
||||
#endif
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
|
||||
DenseBase<Derived>::NullaryExpr(Index rows, Index cols, const CustomNullaryOp& func)
|
||||
{
|
||||
return CwiseNullaryOp<CustomNullaryOp, PlainObject>(rows, cols, func);
|
||||
@@ -136,12 +131,7 @@ DenseBase<Derived>::NullaryExpr(Index rows, Index cols, const CustomNullaryOp& f
|
||||
*/
|
||||
template<typename Derived>
|
||||
template<typename CustomNullaryOp>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
#ifndef EIGEN_PARSED_BY_DOXYGEN
|
||||
const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
|
||||
#else
|
||||
const CwiseNullaryOp<CustomNullaryOp, PlainObject>
|
||||
#endif
|
||||
EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
|
||||
DenseBase<Derived>::NullaryExpr(Index size, const CustomNullaryOp& func)
|
||||
{
|
||||
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
|
||||
@@ -160,12 +150,7 @@ DenseBase<Derived>::NullaryExpr(Index size, const CustomNullaryOp& func)
|
||||
*/
|
||||
template<typename Derived>
|
||||
template<typename CustomNullaryOp>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
#ifndef EIGEN_PARSED_BY_DOXYGEN
|
||||
const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
|
||||
#else
|
||||
const CwiseNullaryOp<CustomNullaryOp, PlainObject>
|
||||
#endif
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
|
||||
DenseBase<Derived>::NullaryExpr(const CustomNullaryOp& func)
|
||||
{
|
||||
return CwiseNullaryOp<CustomNullaryOp, PlainObject>(RowsAtCompileTime, ColsAtCompileTime, func);
|
||||
@@ -185,7 +170,7 @@ DenseBase<Derived>::NullaryExpr(const CustomNullaryOp& func)
|
||||
* \sa class CwiseNullaryOp
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
|
||||
EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
|
||||
DenseBase<Derived>::Constant(Index rows, Index cols, const Scalar& value)
|
||||
{
|
||||
return DenseBase<Derived>::NullaryExpr(rows, cols, internal::scalar_constant_op<Scalar>(value));
|
||||
@@ -232,32 +217,27 @@ DenseBase<Derived>::Constant(const Scalar& value)
|
||||
|
||||
/** \deprecated because of accuracy loss. In Eigen 3.3, it is an alias for LinSpaced(Index,const Scalar&,const Scalar&)
|
||||
*
|
||||
* \only_for_vectors
|
||||
*
|
||||
* Example: \include DenseBase_LinSpaced_seq_deprecated.cpp
|
||||
* Output: \verbinclude DenseBase_LinSpaced_seq_deprecated.out
|
||||
*
|
||||
* \sa LinSpaced(Index,const Scalar&, const Scalar&), setLinSpaced(Index,const Scalar&,const Scalar&)
|
||||
* \sa LinSpaced(Index,Scalar,Scalar), setLinSpaced(Index,const Scalar&,const Scalar&)
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_DEPRECATED EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
|
||||
DenseBase<Derived>::LinSpaced(Sequential_t, Index size, const Scalar& low, const Scalar& high)
|
||||
{
|
||||
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
|
||||
return DenseBase<Derived>::NullaryExpr(size, internal::linspaced_op<Scalar>(low,high,size));
|
||||
return DenseBase<Derived>::NullaryExpr(size, internal::linspaced_op<Scalar,PacketScalar>(low,high,size));
|
||||
}
|
||||
|
||||
/** \deprecated because of accuracy loss. In Eigen 3.3, it is an alias for LinSpaced(const Scalar&,const Scalar&)
|
||||
*
|
||||
* \sa LinSpaced(const Scalar&, const Scalar&)
|
||||
* \sa LinSpaced(Scalar,Scalar)
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_DEPRECATED EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
|
||||
DenseBase<Derived>::LinSpaced(Sequential_t, const Scalar& low, const Scalar& high)
|
||||
{
|
||||
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
|
||||
EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived)
|
||||
return DenseBase<Derived>::NullaryExpr(Derived::SizeAtCompileTime, internal::linspaced_op<Scalar>(low,high,Derived::SizeAtCompileTime));
|
||||
return DenseBase<Derived>::NullaryExpr(Derived::SizeAtCompileTime, internal::linspaced_op<Scalar,PacketScalar>(low,high,Derived::SizeAtCompileTime));
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -288,7 +268,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomA
|
||||
DenseBase<Derived>::LinSpaced(Index size, const Scalar& low, const Scalar& high)
|
||||
{
|
||||
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
|
||||
return DenseBase<Derived>::NullaryExpr(size, internal::linspaced_op<Scalar>(low,high,size));
|
||||
return DenseBase<Derived>::NullaryExpr(size, internal::linspaced_op<Scalar,PacketScalar>(low,high,size));
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -301,7 +281,7 @@ DenseBase<Derived>::LinSpaced(const Scalar& low, const Scalar& high)
|
||||
{
|
||||
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
|
||||
EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived)
|
||||
return DenseBase<Derived>::NullaryExpr(Derived::SizeAtCompileTime, internal::linspaced_op<Scalar>(low,high,Derived::SizeAtCompileTime));
|
||||
return DenseBase<Derived>::NullaryExpr(Derived::SizeAtCompileTime, internal::linspaced_op<Scalar,PacketScalar>(low,high,Derived::SizeAtCompileTime));
|
||||
}
|
||||
|
||||
/** \returns true if all coefficients in this matrix are approximately equal to \a val, to within precision \a prec */
|
||||
@@ -403,7 +383,7 @@ template<typename Derived>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(Index newSize, const Scalar& low, const Scalar& high)
|
||||
{
|
||||
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
|
||||
return derived() = Derived::NullaryExpr(newSize, internal::linspaced_op<Scalar>(low,high,newSize));
|
||||
return derived() = Derived::NullaryExpr(newSize, internal::linspaced_op<Scalar,PacketScalar>(low,high,newSize));
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -881,42 +861,6 @@ template<typename Derived>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitW()
|
||||
{ return Derived::Unit(3); }
|
||||
|
||||
/** \brief Set the coefficients of \c *this to the i-th unit (basis) vector
|
||||
*
|
||||
* \param i index of the unique coefficient to be set to 1
|
||||
*
|
||||
* \only_for_vectors
|
||||
*
|
||||
* \sa MatrixBase::setIdentity(), class CwiseNullaryOp, MatrixBase::Unit(Index,Index)
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setUnit(Index i)
|
||||
{
|
||||
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
|
||||
eigen_assert(i<size());
|
||||
derived().setZero();
|
||||
derived().coeffRef(i) = Scalar(1);
|
||||
return derived();
|
||||
}
|
||||
|
||||
/** \brief Resizes to the given \a newSize, and writes the i-th unit (basis) vector into *this.
|
||||
*
|
||||
* \param newSize the new size of the vector
|
||||
* \param i index of the unique coefficient to be set to 1
|
||||
*
|
||||
* \only_for_vectors
|
||||
*
|
||||
* \sa MatrixBase::setIdentity(), class CwiseNullaryOp, MatrixBase::Unit(Index,Index)
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setUnit(Index newSize, Index i)
|
||||
{
|
||||
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
|
||||
eigen_assert(i<newSize);
|
||||
derived().resize(newSize);
|
||||
return setUnit(i);
|
||||
}
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_CWISE_NULLARY_OP_H
|
||||
|
||||
@@ -81,7 +81,7 @@ class CwiseUnaryView : public CwiseUnaryViewImpl<ViewOp, MatrixType, typename in
|
||||
|
||||
/** \returns the nested expression */
|
||||
typename internal::remove_reference<MatrixTypeNested>::type&
|
||||
nestedExpression() { return m_matrix; }
|
||||
nestedExpression() { return m_matrix.const_cast_derived(); }
|
||||
|
||||
protected:
|
||||
MatrixTypeNested m_matrix;
|
||||
|
||||
@@ -40,7 +40,7 @@ static inline void check_DenseIndex_is_signed() {
|
||||
*/
|
||||
template<typename Derived> class DenseBase
|
||||
#ifndef EIGEN_PARSED_BY_DOXYGEN
|
||||
: public DenseCoeffsBase<Derived, internal::accessors_level<Derived>::value>
|
||||
: public DenseCoeffsBase<Derived>
|
||||
#else
|
||||
: public DenseCoeffsBase<Derived,DirectWriteAccessors>
|
||||
#endif // not EIGEN_PARSED_BY_DOXYGEN
|
||||
@@ -71,7 +71,7 @@ template<typename Derived> class DenseBase
|
||||
typedef Scalar value_type;
|
||||
|
||||
typedef typename NumTraits<Scalar>::Real RealScalar;
|
||||
typedef DenseCoeffsBase<Derived, internal::accessors_level<Derived>::value> Base;
|
||||
typedef DenseCoeffsBase<Derived> Base;
|
||||
|
||||
using Base::derived;
|
||||
using Base::const_cast_derived;
|
||||
@@ -150,18 +150,13 @@ template<typename Derived> class DenseBase
|
||||
* \sa SizeAtCompileTime, MaxRowsAtCompileTime, MaxColsAtCompileTime
|
||||
*/
|
||||
|
||||
IsVectorAtCompileTime = internal::traits<Derived>::RowsAtCompileTime == 1
|
||||
|| internal::traits<Derived>::ColsAtCompileTime == 1,
|
||||
IsVectorAtCompileTime = internal::traits<Derived>::MaxRowsAtCompileTime == 1
|
||||
|| internal::traits<Derived>::MaxColsAtCompileTime == 1,
|
||||
/**< This is set to true if either the number of rows or the number of
|
||||
* columns is known at compile-time to be equal to 1. Indeed, in that case,
|
||||
* we are dealing with a column-vector (if there is only one column) or with
|
||||
* a row-vector (if there is only one row). */
|
||||
|
||||
NumDimensions = int(MaxSizeAtCompileTime) == 1 ? 0 : bool(IsVectorAtCompileTime) ? 1 : 2,
|
||||
/**< This value is equal to Tensor::NumDimensions, i.e. 0 for scalars, 1 for vectors,
|
||||
* and 2 for matrices.
|
||||
*/
|
||||
|
||||
Flags = internal::traits<Derived>::Flags,
|
||||
/**< This stores expression \ref flags flags which may or may not be inherited by new expressions
|
||||
* constructed from this one. See the \ref flags "list of flags".
|
||||
@@ -266,9 +261,9 @@ template<typename Derived> class DenseBase
|
||||
/** \internal Represents a matrix with all coefficients equal to one another*/
|
||||
typedef CwiseNullaryOp<internal::scalar_constant_op<Scalar>,PlainObject> ConstantReturnType;
|
||||
/** \internal \deprecated Represents a vector with linearly spaced coefficients that allows sequential access only. */
|
||||
EIGEN_DEPRECATED typedef CwiseNullaryOp<internal::linspaced_op<Scalar>,PlainObject> SequentialLinSpacedReturnType;
|
||||
typedef CwiseNullaryOp<internal::linspaced_op<Scalar,PacketScalar>,PlainObject> SequentialLinSpacedReturnType;
|
||||
/** \internal Represents a vector with linearly spaced coefficients that allows random access. */
|
||||
typedef CwiseNullaryOp<internal::linspaced_op<Scalar>,PlainObject> RandomAccessLinSpacedReturnType;
|
||||
typedef CwiseNullaryOp<internal::linspaced_op<Scalar,PacketScalar>,PlainObject> RandomAccessLinSpacedReturnType;
|
||||
/** \internal the return type of MatrixBase::eigenvalues() */
|
||||
typedef Matrix<typename NumTraits<typename internal::traits<Derived>::Scalar>::Real, internal::traits<Derived>::ColsAtCompileTime, 1> EigenvaluesReturnType;
|
||||
|
||||
@@ -302,17 +297,17 @@ template<typename Derived> class DenseBase
|
||||
Derived& operator=(const ReturnByValue<OtherDerived>& func);
|
||||
|
||||
/** \internal
|
||||
* Copies \a other into *this without evaluating other. \returns a reference to *this. */
|
||||
* Copies \a other into *this without evaluating other. \returns a reference to *this.
|
||||
* \deprecated */
|
||||
template<typename OtherDerived>
|
||||
/** \deprecated */
|
||||
EIGEN_DEPRECATED EIGEN_DEVICE_FUNC
|
||||
EIGEN_DEVICE_FUNC
|
||||
Derived& lazyAssign(const DenseBase<OtherDerived>& other);
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
CommaInitializer<Derived> operator<< (const Scalar& s);
|
||||
|
||||
template<unsigned int Added,unsigned int Removed>
|
||||
/** \deprecated it now returns \c *this */
|
||||
template<unsigned int Added,unsigned int Removed>
|
||||
EIGEN_DEPRECATED
|
||||
const Derived& flagged() const
|
||||
{ return derived(); }
|
||||
@@ -337,13 +332,12 @@ template<typename Derived> class DenseBase
|
||||
EIGEN_DEVICE_FUNC static const ConstantReturnType
|
||||
Constant(const Scalar& value);
|
||||
|
||||
EIGEN_DEPRECATED EIGEN_DEVICE_FUNC static const RandomAccessLinSpacedReturnType
|
||||
EIGEN_DEVICE_FUNC static const SequentialLinSpacedReturnType
|
||||
LinSpaced(Sequential_t, Index size, const Scalar& low, const Scalar& high);
|
||||
EIGEN_DEPRECATED EIGEN_DEVICE_FUNC static const RandomAccessLinSpacedReturnType
|
||||
LinSpaced(Sequential_t, const Scalar& low, const Scalar& high);
|
||||
|
||||
EIGEN_DEVICE_FUNC static const RandomAccessLinSpacedReturnType
|
||||
LinSpaced(Index size, const Scalar& low, const Scalar& high);
|
||||
EIGEN_DEVICE_FUNC static const SequentialLinSpacedReturnType
|
||||
LinSpaced(Sequential_t, const Scalar& low, const Scalar& high);
|
||||
EIGEN_DEVICE_FUNC static const RandomAccessLinSpacedReturnType
|
||||
LinSpaced(const Scalar& low, const Scalar& high);
|
||||
|
||||
@@ -375,7 +369,7 @@ template<typename Derived> class DenseBase
|
||||
template<typename OtherDerived> EIGEN_DEVICE_FUNC
|
||||
bool isApprox(const DenseBase<OtherDerived>& other,
|
||||
const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_DEVICE_FUNC
|
||||
bool isMuchSmallerThan(const RealScalar& other,
|
||||
const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
|
||||
template<typename OtherDerived> EIGEN_DEVICE_FUNC
|
||||
@@ -386,7 +380,7 @@ template<typename Derived> class DenseBase
|
||||
EIGEN_DEVICE_FUNC bool isConstant(const Scalar& value, const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
|
||||
EIGEN_DEVICE_FUNC bool isZero(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
|
||||
EIGEN_DEVICE_FUNC bool isOnes(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
|
||||
|
||||
|
||||
inline bool hasNaN() const;
|
||||
inline bool allFinite() const;
|
||||
|
||||
@@ -400,8 +394,8 @@ template<typename Derived> class DenseBase
|
||||
*
|
||||
* Notice that in the case of a plain matrix or vector (not an expression) this function just returns
|
||||
* a const reference, in order to avoid a useless copy.
|
||||
*
|
||||
* \warning Be careful with eval() and the auto C++ keyword, as detailed in this \link TopicPitfalls_auto_keyword page \endlink.
|
||||
*
|
||||
* \warning Be carefull with eval() and the auto C++ keyword, as detailed in this \link TopicPitfalls_auto_keyword page \endlink.
|
||||
*/
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE EvalReturnType eval() const
|
||||
@@ -416,7 +410,7 @@ template<typename Derived> class DenseBase
|
||||
*
|
||||
*/
|
||||
template<typename OtherDerived>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
EIGEN_DEVICE_FUNC
|
||||
void swap(const DenseBase<OtherDerived>& other)
|
||||
{
|
||||
EIGEN_STATIC_ASSERT(!OtherDerived::IsPlainObjectBase,THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY);
|
||||
@@ -428,7 +422,7 @@ template<typename Derived> class DenseBase
|
||||
*
|
||||
*/
|
||||
template<typename OtherDerived>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
EIGEN_DEVICE_FUNC
|
||||
void swap(PlainObjectBase<OtherDerived>& other)
|
||||
{
|
||||
eigen_assert(rows()==other.rows() && cols()==other.cols());
|
||||
@@ -499,7 +493,7 @@ template<typename Derived> class DenseBase
|
||||
typedef VectorwiseOp<Derived, Vertical> ColwiseReturnType;
|
||||
typedef const VectorwiseOp<const Derived, Vertical> ConstColwiseReturnType;
|
||||
|
||||
/** \returns a VectorwiseOp wrapper of *this for broadcasting and partial reductions
|
||||
/** \returns a VectorwiseOp wrapper of *this providing additional partial reduction operations
|
||||
*
|
||||
* Example: \include MatrixBase_rowwise.cpp
|
||||
* Output: \verbinclude MatrixBase_rowwise.out
|
||||
@@ -512,7 +506,7 @@ template<typename Derived> class DenseBase
|
||||
}
|
||||
EIGEN_DEVICE_FUNC RowwiseReturnType rowwise();
|
||||
|
||||
/** \returns a VectorwiseOp wrapper of *this broadcasting and partial reductions
|
||||
/** \returns a VectorwiseOp wrapper of *this providing additional partial reduction operations
|
||||
*
|
||||
* Example: \include MatrixBase_colwise.cpp
|
||||
* Output: \verbinclude MatrixBase_colwise.out
|
||||
@@ -573,59 +567,16 @@ template<typename Derived> class DenseBase
|
||||
}
|
||||
EIGEN_DEVICE_FUNC void reverseInPlace();
|
||||
|
||||
#ifdef EIGEN_PARSED_BY_DOXYGEN
|
||||
/** STL-like <a href="https://en.cppreference.com/w/cpp/named_req/RandomAccessIterator">RandomAccessIterator</a>
|
||||
* iterator type as returned by the begin() and end() methods.
|
||||
*/
|
||||
typedef random_access_iterator_type iterator;
|
||||
/** This is the const version of iterator (aka read-only) */
|
||||
typedef random_access_iterator_type const_iterator;
|
||||
#else
|
||||
typedef typename internal::conditional< (Flags&DirectAccessBit)==DirectAccessBit,
|
||||
internal::pointer_based_stl_iterator<Derived>,
|
||||
internal::generic_randaccess_stl_iterator<Derived>
|
||||
>::type iterator_type;
|
||||
|
||||
typedef typename internal::conditional< (Flags&DirectAccessBit)==DirectAccessBit,
|
||||
internal::pointer_based_stl_iterator<const Derived>,
|
||||
internal::generic_randaccess_stl_iterator<const Derived>
|
||||
>::type const_iterator_type;
|
||||
|
||||
// Stl-style iterators are supported only for vectors.
|
||||
|
||||
typedef typename internal::conditional< IsVectorAtCompileTime,
|
||||
iterator_type,
|
||||
void
|
||||
>::type iterator;
|
||||
|
||||
typedef typename internal::conditional< IsVectorAtCompileTime,
|
||||
const_iterator_type,
|
||||
void
|
||||
>::type const_iterator;
|
||||
#endif
|
||||
|
||||
inline iterator begin();
|
||||
inline const_iterator begin() const;
|
||||
inline const_iterator cbegin() const;
|
||||
inline iterator end();
|
||||
inline const_iterator end() const;
|
||||
inline const_iterator cend() const;
|
||||
|
||||
#define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::DenseBase
|
||||
#define EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
|
||||
#define EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(COND)
|
||||
#define EIGEN_DOC_UNARY_ADDONS(X,Y)
|
||||
# include "../plugins/CommonCwiseUnaryOps.h"
|
||||
# include "../plugins/BlockMethods.h"
|
||||
# include "../plugins/IndexedViewMethods.h"
|
||||
# include "../plugins/ReshapedMethods.h"
|
||||
# ifdef EIGEN_DENSEBASE_PLUGIN
|
||||
# include EIGEN_DENSEBASE_PLUGIN
|
||||
# endif
|
||||
#undef EIGEN_CURRENT_STORAGE_BASE_CLASS
|
||||
#undef EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
|
||||
#undef EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF
|
||||
#undef EIGEN_DOC_UNARY_ADDONS
|
||||
|
||||
// disable the use of evalTo for dense objects with a nice compilation error
|
||||
template<typename Dest>
|
||||
|
||||
@@ -22,8 +22,7 @@ template<typename T> struct add_const_on_value_type_if_arithmetic
|
||||
/** \brief Base class providing read-only coefficient access to matrices and arrays.
|
||||
* \ingroup Core_Module
|
||||
* \tparam Derived Type of the derived class
|
||||
*
|
||||
* \note #ReadOnlyAccessors Constant indicating read-only access
|
||||
* \tparam #ReadOnlyAccessors Constant indicating read-only access
|
||||
*
|
||||
* This class defines the \c operator() \c const function and friends, which can be used to read specific
|
||||
* entries of a matrix or array.
|
||||
@@ -289,8 +288,7 @@ class DenseCoeffsBase<Derived,ReadOnlyAccessors> : public EigenBase<Derived>
|
||||
/** \brief Base class providing read/write coefficient access to matrices and arrays.
|
||||
* \ingroup Core_Module
|
||||
* \tparam Derived Type of the derived class
|
||||
*
|
||||
* \note #WriteAccessors Constant indicating read/write access
|
||||
* \tparam #WriteAccessors Constant indicating read/write access
|
||||
*
|
||||
* This class defines the non-const \c operator() function and friends, which can be used to write specific
|
||||
* entries of a matrix or array. This class inherits DenseCoeffsBase<Derived, ReadOnlyAccessors> which
|
||||
@@ -468,8 +466,7 @@ class DenseCoeffsBase<Derived, WriteAccessors> : public DenseCoeffsBase<Derived,
|
||||
/** \brief Base class providing direct read-only coefficient access to matrices and arrays.
|
||||
* \ingroup Core_Module
|
||||
* \tparam Derived Type of the derived class
|
||||
*
|
||||
* \note #DirectAccessors Constant indicating direct access
|
||||
* \tparam #DirectAccessors Constant indicating direct access
|
||||
*
|
||||
* This class defines functions to work with strides which can be used to access entries directly. This class
|
||||
* inherits DenseCoeffsBase<Derived, ReadOnlyAccessors> which defines functions to access entries read-only using
|
||||
@@ -542,8 +539,7 @@ class DenseCoeffsBase<Derived, DirectAccessors> : public DenseCoeffsBase<Derived
|
||||
/** \brief Base class providing direct read/write coefficient access to matrices and arrays.
|
||||
* \ingroup Core_Module
|
||||
* \tparam Derived Type of the derived class
|
||||
*
|
||||
* \note #DirectWriteAccessors Constant indicating direct access
|
||||
* \tparam #DirectWriteAccessors Constant indicating direct access
|
||||
*
|
||||
* This class defines functions to work with strides which can be used to access entries directly. This class
|
||||
* inherits DenseCoeffsBase<Derived, WriteAccessors> which defines functions to access entries read/write using
|
||||
|
||||
@@ -61,7 +61,7 @@ struct plain_array
|
||||
#if defined(EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT)
|
||||
#define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(sizemask)
|
||||
#elif EIGEN_GNUC_AT_LEAST(4,7)
|
||||
// GCC 4.7 is too aggressive in its optimizations and remove the alignment test based on the fact the array is declared to be aligned.
|
||||
// GCC 4.7 is too aggressive in its optimizations and remove the alignement test based on the fact the array is declared to be aligned.
|
||||
// See this bug report: http://gcc.gnu.org/bugzilla/show_bug.cgi?id=53900
|
||||
// Hiding the origin of the array pointer behind a function argument seems to do the trick even if the function is inlined:
|
||||
template<typename PtrType>
|
||||
@@ -207,9 +207,7 @@ template<typename T, int Size, int _Rows, int _Cols, int _Options> class DenseSt
|
||||
EIGEN_UNUSED_VARIABLE(rows);
|
||||
EIGEN_UNUSED_VARIABLE(cols);
|
||||
}
|
||||
EIGEN_DEVICE_FUNC void swap(DenseStorage& other) {
|
||||
numext::swap(m_data, other.m_data);
|
||||
}
|
||||
EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); }
|
||||
EIGEN_DEVICE_FUNC static Index rows(void) {return _Rows;}
|
||||
EIGEN_DEVICE_FUNC static Index cols(void) {return _Cols;}
|
||||
EIGEN_DEVICE_FUNC void conservativeResize(Index,Index,Index) {}
|
||||
@@ -269,11 +267,7 @@ template<typename T, int Size, int _Options> class DenseStorage<T, Size, Dynamic
|
||||
}
|
||||
EIGEN_DEVICE_FUNC DenseStorage(Index, Index rows, Index cols) : m_rows(rows), m_cols(cols) {}
|
||||
EIGEN_DEVICE_FUNC void swap(DenseStorage& other)
|
||||
{
|
||||
numext::swap(m_data,other.m_data);
|
||||
numext::swap(m_rows,other.m_rows);
|
||||
numext::swap(m_cols,other.m_cols);
|
||||
}
|
||||
{ std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); std::swap(m_cols,other.m_cols); }
|
||||
EIGEN_DEVICE_FUNC Index rows() const {return m_rows;}
|
||||
EIGEN_DEVICE_FUNC Index cols() const {return m_cols;}
|
||||
EIGEN_DEVICE_FUNC void conservativeResize(Index, Index rows, Index cols) { m_rows = rows; m_cols = cols; }
|
||||
@@ -302,11 +296,7 @@ template<typename T, int Size, int _Cols, int _Options> class DenseStorage<T, Si
|
||||
return *this;
|
||||
}
|
||||
EIGEN_DEVICE_FUNC DenseStorage(Index, Index rows, Index) : m_rows(rows) {}
|
||||
EIGEN_DEVICE_FUNC void swap(DenseStorage& other)
|
||||
{
|
||||
numext::swap(m_data,other.m_data);
|
||||
numext::swap(m_rows,other.m_rows);
|
||||
}
|
||||
EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); }
|
||||
EIGEN_DEVICE_FUNC Index rows(void) const {return m_rows;}
|
||||
EIGEN_DEVICE_FUNC Index cols(void) const {return _Cols;}
|
||||
EIGEN_DEVICE_FUNC void conservativeResize(Index, Index rows, Index) { m_rows = rows; }
|
||||
@@ -335,14 +325,11 @@ template<typename T, int Size, int _Rows, int _Options> class DenseStorage<T, Si
|
||||
return *this;
|
||||
}
|
||||
EIGEN_DEVICE_FUNC DenseStorage(Index, Index, Index cols) : m_cols(cols) {}
|
||||
EIGEN_DEVICE_FUNC void swap(DenseStorage& other) {
|
||||
numext::swap(m_data,other.m_data);
|
||||
numext::swap(m_cols,other.m_cols);
|
||||
}
|
||||
EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_cols,other.m_cols); }
|
||||
EIGEN_DEVICE_FUNC Index rows(void) const {return _Rows;}
|
||||
EIGEN_DEVICE_FUNC Index cols(void) const {return m_cols;}
|
||||
EIGEN_DEVICE_FUNC void conservativeResize(Index, Index, Index cols) { m_cols = cols; }
|
||||
EIGEN_DEVICE_FUNC void resize(Index, Index, Index cols) { m_cols = cols; }
|
||||
void conservativeResize(Index, Index, Index cols) { m_cols = cols; }
|
||||
void resize(Index, Index, Index cols) { m_cols = cols; }
|
||||
EIGEN_DEVICE_FUNC const T *data() const { return m_data.array; }
|
||||
EIGEN_DEVICE_FUNC T *data() { return m_data.array; }
|
||||
};
|
||||
@@ -394,19 +381,16 @@ template<typename T, int _Options> class DenseStorage<T, Dynamic, Dynamic, Dynam
|
||||
EIGEN_DEVICE_FUNC
|
||||
DenseStorage& operator=(DenseStorage&& other) EIGEN_NOEXCEPT
|
||||
{
|
||||
numext::swap(m_data, other.m_data);
|
||||
numext::swap(m_rows, other.m_rows);
|
||||
numext::swap(m_cols, other.m_cols);
|
||||
using std::swap;
|
||||
swap(m_data, other.m_data);
|
||||
swap(m_rows, other.m_rows);
|
||||
swap(m_cols, other.m_cols);
|
||||
return *this;
|
||||
}
|
||||
#endif
|
||||
EIGEN_DEVICE_FUNC ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, m_rows*m_cols); }
|
||||
EIGEN_DEVICE_FUNC void swap(DenseStorage& other)
|
||||
{
|
||||
numext::swap(m_data,other.m_data);
|
||||
numext::swap(m_rows,other.m_rows);
|
||||
numext::swap(m_cols,other.m_cols);
|
||||
}
|
||||
{ std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); std::swap(m_cols,other.m_cols); }
|
||||
EIGEN_DEVICE_FUNC Index rows(void) const {return m_rows;}
|
||||
EIGEN_DEVICE_FUNC Index cols(void) const {return m_cols;}
|
||||
void conservativeResize(Index size, Index rows, Index cols)
|
||||
@@ -420,7 +404,7 @@ template<typename T, int _Options> class DenseStorage<T, Dynamic, Dynamic, Dynam
|
||||
if(size != m_rows*m_cols)
|
||||
{
|
||||
internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, m_rows*m_cols);
|
||||
if (size>0) // >0 and not simply !=0 to let the compiler knows that size cannot be negative
|
||||
if (size)
|
||||
m_data = internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size);
|
||||
else
|
||||
m_data = 0;
|
||||
@@ -475,16 +459,14 @@ template<typename T, int _Rows, int _Options> class DenseStorage<T, Dynamic, _Ro
|
||||
EIGEN_DEVICE_FUNC
|
||||
DenseStorage& operator=(DenseStorage&& other) EIGEN_NOEXCEPT
|
||||
{
|
||||
numext::swap(m_data, other.m_data);
|
||||
numext::swap(m_cols, other.m_cols);
|
||||
using std::swap;
|
||||
swap(m_data, other.m_data);
|
||||
swap(m_cols, other.m_cols);
|
||||
return *this;
|
||||
}
|
||||
#endif
|
||||
EIGEN_DEVICE_FUNC ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Rows*m_cols); }
|
||||
EIGEN_DEVICE_FUNC void swap(DenseStorage& other) {
|
||||
numext::swap(m_data,other.m_data);
|
||||
numext::swap(m_cols,other.m_cols);
|
||||
}
|
||||
EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_cols,other.m_cols); }
|
||||
EIGEN_DEVICE_FUNC static Index rows(void) {return _Rows;}
|
||||
EIGEN_DEVICE_FUNC Index cols(void) const {return m_cols;}
|
||||
EIGEN_DEVICE_FUNC void conservativeResize(Index size, Index, Index cols)
|
||||
@@ -497,7 +479,7 @@ template<typename T, int _Rows, int _Options> class DenseStorage<T, Dynamic, _Ro
|
||||
if(size != _Rows*m_cols)
|
||||
{
|
||||
internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Rows*m_cols);
|
||||
if (size>0) // >0 and not simply !=0 to let the compiler knows that size cannot be negative
|
||||
if (size)
|
||||
m_data = internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size);
|
||||
else
|
||||
m_data = 0;
|
||||
@@ -551,16 +533,14 @@ template<typename T, int _Cols, int _Options> class DenseStorage<T, Dynamic, Dyn
|
||||
EIGEN_DEVICE_FUNC
|
||||
DenseStorage& operator=(DenseStorage&& other) EIGEN_NOEXCEPT
|
||||
{
|
||||
numext::swap(m_data, other.m_data);
|
||||
numext::swap(m_rows, other.m_rows);
|
||||
using std::swap;
|
||||
swap(m_data, other.m_data);
|
||||
swap(m_rows, other.m_rows);
|
||||
return *this;
|
||||
}
|
||||
#endif
|
||||
EIGEN_DEVICE_FUNC ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Cols*m_rows); }
|
||||
EIGEN_DEVICE_FUNC void swap(DenseStorage& other) {
|
||||
numext::swap(m_data,other.m_data);
|
||||
numext::swap(m_rows,other.m_rows);
|
||||
}
|
||||
EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); }
|
||||
EIGEN_DEVICE_FUNC Index rows(void) const {return m_rows;}
|
||||
EIGEN_DEVICE_FUNC static Index cols(void) {return _Cols;}
|
||||
void conservativeResize(Index size, Index rows, Index)
|
||||
@@ -573,7 +553,7 @@ template<typename T, int _Cols, int _Options> class DenseStorage<T, Dynamic, Dyn
|
||||
if(size != m_rows*_Cols)
|
||||
{
|
||||
internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Cols*m_rows);
|
||||
if (size>0) // >0 and not simply !=0 to let the compiler knows that size cannot be negative
|
||||
if (size)
|
||||
m_data = internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size);
|
||||
else
|
||||
m_data = 0;
|
||||
|
||||
@@ -187,7 +187,7 @@ template<typename MatrixType, int _DiagIndex> class Diagonal
|
||||
*
|
||||
* \sa class Diagonal */
|
||||
template<typename Derived>
|
||||
EIGEN_DEVICE_FUNC inline typename MatrixBase<Derived>::DiagonalReturnType
|
||||
inline typename MatrixBase<Derived>::DiagonalReturnType
|
||||
MatrixBase<Derived>::diagonal()
|
||||
{
|
||||
return DiagonalReturnType(derived());
|
||||
@@ -195,7 +195,7 @@ MatrixBase<Derived>::diagonal()
|
||||
|
||||
/** This is the const version of diagonal(). */
|
||||
template<typename Derived>
|
||||
EIGEN_DEVICE_FUNC inline typename MatrixBase<Derived>::ConstDiagonalReturnType
|
||||
inline typename MatrixBase<Derived>::ConstDiagonalReturnType
|
||||
MatrixBase<Derived>::diagonal() const
|
||||
{
|
||||
return ConstDiagonalReturnType(derived());
|
||||
@@ -213,7 +213,7 @@ MatrixBase<Derived>::diagonal() const
|
||||
*
|
||||
* \sa MatrixBase::diagonal(), class Diagonal */
|
||||
template<typename Derived>
|
||||
EIGEN_DEVICE_FUNC inline typename MatrixBase<Derived>::DiagonalDynamicIndexReturnType
|
||||
inline typename MatrixBase<Derived>::DiagonalDynamicIndexReturnType
|
||||
MatrixBase<Derived>::diagonal(Index index)
|
||||
{
|
||||
return DiagonalDynamicIndexReturnType(derived(), index);
|
||||
@@ -221,7 +221,7 @@ MatrixBase<Derived>::diagonal(Index index)
|
||||
|
||||
/** This is the const version of diagonal(Index). */
|
||||
template<typename Derived>
|
||||
EIGEN_DEVICE_FUNC inline typename MatrixBase<Derived>::ConstDiagonalDynamicIndexReturnType
|
||||
inline typename MatrixBase<Derived>::ConstDiagonalDynamicIndexReturnType
|
||||
MatrixBase<Derived>::diagonal(Index index) const
|
||||
{
|
||||
return ConstDiagonalDynamicIndexReturnType(derived(), index);
|
||||
@@ -240,7 +240,6 @@ MatrixBase<Derived>::diagonal(Index index) const
|
||||
* \sa MatrixBase::diagonal(), class Diagonal */
|
||||
template<typename Derived>
|
||||
template<int Index_>
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline typename MatrixBase<Derived>::template DiagonalIndexReturnType<Index_>::Type
|
||||
MatrixBase<Derived>::diagonal()
|
||||
{
|
||||
@@ -250,7 +249,6 @@ MatrixBase<Derived>::diagonal()
|
||||
/** This is the const version of diagonal<int>(). */
|
||||
template<typename Derived>
|
||||
template<int Index_>
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline typename MatrixBase<Derived>::template ConstDiagonalIndexReturnType<Index_>::Type
|
||||
MatrixBase<Derived>::diagonal() const
|
||||
{
|
||||
|
||||
@@ -44,7 +44,7 @@ class DiagonalBase : public EigenBase<Derived>
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
DenseMatrixType toDenseMatrix() const { return derived(); }
|
||||
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline const DiagonalVectorType& diagonal() const { return derived().diagonal(); }
|
||||
EIGEN_DEVICE_FUNC
|
||||
@@ -83,30 +83,6 @@ class DiagonalBase : public EigenBase<Derived>
|
||||
{
|
||||
return DiagonalWrapper<const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar,DiagonalVectorType,product) >(scalar * other.diagonal());
|
||||
}
|
||||
|
||||
template<typename OtherDerived>
|
||||
EIGEN_DEVICE_FUNC
|
||||
#ifdef EIGEN_PARSED_BY_DOXYGEN
|
||||
inline unspecified_expression_type
|
||||
#else
|
||||
inline const DiagonalWrapper<const EIGEN_CWISE_BINARY_RETURN_TYPE(DiagonalVectorType,typename OtherDerived::DiagonalVectorType,sum) >
|
||||
#endif
|
||||
operator+(const DiagonalBase<OtherDerived>& other) const
|
||||
{
|
||||
return (diagonal() + other.diagonal()).asDiagonal();
|
||||
}
|
||||
|
||||
template<typename OtherDerived>
|
||||
EIGEN_DEVICE_FUNC
|
||||
#ifdef EIGEN_PARSED_BY_DOXYGEN
|
||||
inline unspecified_expression_type
|
||||
#else
|
||||
inline const DiagonalWrapper<const EIGEN_CWISE_BINARY_RETURN_TYPE(DiagonalVectorType,typename OtherDerived::DiagonalVectorType,difference) >
|
||||
#endif
|
||||
operator-(const DiagonalBase<OtherDerived>& other) const
|
||||
{
|
||||
return (diagonal() - other.diagonal()).asDiagonal();
|
||||
}
|
||||
};
|
||||
|
||||
#endif
|
||||
@@ -178,30 +154,6 @@ class DiagonalMatrix
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline DiagonalMatrix(const Scalar& x, const Scalar& y, const Scalar& z) : m_diagonal(x,y,z) {}
|
||||
|
||||
#if EIGEN_HAS_CXX11
|
||||
/** \brief Construct a diagonal matrix with fixed size from an arbitrary number of coefficients. \cpp11
|
||||
*
|
||||
* There exists C++98 anologue constructors for fixed-size diagonal matrices having 2 or 3 coefficients.
|
||||
*
|
||||
* \warning To construct a diagonal matrix of fixed size, the number of values passed to this
|
||||
* constructor must match the fixed dimension of \c *this.
|
||||
*
|
||||
* \sa DiagonalMatrix(const Scalar&, const Scalar&)
|
||||
* \sa DiagonalMatrix(const Scalar&, const Scalar&, const Scalar&)
|
||||
*/
|
||||
template <typename... ArgTypes>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
DiagonalMatrix(const Scalar& a0, const Scalar& a1, const Scalar& a2, const ArgTypes&... args)
|
||||
: m_diagonal(a0, a1, a2, args...) {}
|
||||
|
||||
/** \brief Constructs a DiagonalMatrix and initializes it by elements given by an initializer list of initializer
|
||||
* lists \cpp11
|
||||
*/
|
||||
EIGEN_DEVICE_FUNC
|
||||
explicit EIGEN_STRONG_INLINE DiagonalMatrix(const std::initializer_list<std::initializer_list<Scalar>>& list)
|
||||
: m_diagonal(list) {}
|
||||
#endif // EIGEN_HAS_CXX11
|
||||
|
||||
/** Copy constructor. */
|
||||
template<typename OtherDerived>
|
||||
EIGEN_DEVICE_FUNC
|
||||
@@ -321,7 +273,7 @@ class DiagonalWrapper
|
||||
* \sa class DiagonalWrapper, class DiagonalMatrix, diagonal(), isDiagonal()
|
||||
**/
|
||||
template<typename Derived>
|
||||
EIGEN_DEVICE_FUNC inline const DiagonalWrapper<const Derived>
|
||||
inline const DiagonalWrapper<const Derived>
|
||||
MatrixBase<Derived>::asDiagonal() const
|
||||
{
|
||||
return DiagonalWrapper<const Derived>(derived());
|
||||
|
||||
@@ -17,7 +17,7 @@ namespace Eigen {
|
||||
*/
|
||||
template<typename Derived>
|
||||
template<typename DiagonalDerived>
|
||||
EIGEN_DEVICE_FUNC inline const Product<Derived, DiagonalDerived, LazyProduct>
|
||||
inline const Product<Derived, DiagonalDerived, LazyProduct>
|
||||
MatrixBase<Derived>::operator*(const DiagonalBase<DiagonalDerived> &a_diagonal) const
|
||||
{
|
||||
return Product<Derived, DiagonalDerived, LazyProduct>(derived(),a_diagonal.derived());
|
||||
|
||||
@@ -93,7 +93,7 @@ MatrixBase<Derived>::dot(const MatrixBase<OtherDerived>& other) const
|
||||
* \sa dot(), norm(), lpNorm()
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NumTraits<typename internal::traits<Derived>::Scalar>::Real MatrixBase<Derived>::squaredNorm() const
|
||||
EIGEN_STRONG_INLINE typename NumTraits<typename internal::traits<Derived>::Scalar>::Real MatrixBase<Derived>::squaredNorm() const
|
||||
{
|
||||
return numext::real((*this).cwiseAbs2().sum());
|
||||
}
|
||||
@@ -105,7 +105,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NumTraits<typename internal::trai
|
||||
* \sa lpNorm(), dot(), squaredNorm()
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NumTraits<typename internal::traits<Derived>::Scalar>::Real MatrixBase<Derived>::norm() const
|
||||
EIGEN_STRONG_INLINE typename NumTraits<typename internal::traits<Derived>::Scalar>::Real MatrixBase<Derived>::norm() const
|
||||
{
|
||||
return numext::sqrt(squaredNorm());
|
||||
}
|
||||
@@ -120,7 +120,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NumTraits<typename internal::trai
|
||||
* \sa norm(), normalize()
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::PlainObject
|
||||
EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::PlainObject
|
||||
MatrixBase<Derived>::normalized() const
|
||||
{
|
||||
typedef typename internal::nested_eval<Derived,2>::type _Nested;
|
||||
@@ -142,7 +142,7 @@ MatrixBase<Derived>::normalized() const
|
||||
* \sa norm(), normalized()
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void MatrixBase<Derived>::normalize()
|
||||
EIGEN_STRONG_INLINE void MatrixBase<Derived>::normalize()
|
||||
{
|
||||
RealScalar z = squaredNorm();
|
||||
// NOTE: after extensive benchmarking, this conditional does not impact performance, at least on recent x86 CPU
|
||||
@@ -163,7 +163,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void MatrixBase<Derived>::normalize()
|
||||
* \sa stableNorm(), stableNormalize(), normalized()
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::PlainObject
|
||||
EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::PlainObject
|
||||
MatrixBase<Derived>::stableNormalized() const
|
||||
{
|
||||
typedef typename internal::nested_eval<Derived,3>::type _Nested;
|
||||
@@ -188,7 +188,7 @@ MatrixBase<Derived>::stableNormalized() const
|
||||
* \sa stableNorm(), stableNormalized(), normalize()
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void MatrixBase<Derived>::stableNormalize()
|
||||
EIGEN_STRONG_INLINE void MatrixBase<Derived>::stableNormalize()
|
||||
{
|
||||
RealScalar w = cwiseAbs().maxCoeff();
|
||||
RealScalar z = (derived()/w).squaredNorm();
|
||||
@@ -260,9 +260,9 @@ struct lpNorm_selector<Derived, Infinity>
|
||||
template<typename Derived>
|
||||
template<int p>
|
||||
#ifndef EIGEN_PARSED_BY_DOXYGEN
|
||||
EIGEN_DEVICE_FUNC inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real
|
||||
inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real
|
||||
#else
|
||||
EIGEN_DEVICE_FUNC MatrixBase<Derived>::RealScalar
|
||||
MatrixBase<Derived>::RealScalar
|
||||
#endif
|
||||
MatrixBase<Derived>::lpNorm() const
|
||||
{
|
||||
|
||||
@@ -32,9 +32,8 @@ template<typename Derived> struct EigenBase
|
||||
|
||||
/** \brief The interface type of indices
|
||||
* \details To change this, \c \#define the preprocessor symbol \c EIGEN_DEFAULT_DENSE_INDEX_TYPE.
|
||||
* \deprecated Since Eigen 3.3, its usage is deprecated. Use Eigen::Index instead.
|
||||
* \sa StorageIndex, \ref TopicPreprocessorDirectives.
|
||||
* DEPRECATED: Since Eigen 3.3, its usage is deprecated. Use Eigen::Index instead.
|
||||
* Deprecation is not marked with a doxygen comment because there are too many existing usages to add the deprecation attribute.
|
||||
*/
|
||||
typedef Eigen::Index Index;
|
||||
|
||||
|
||||
@@ -100,7 +100,7 @@ struct isMuchSmallerThan_scalar_selector<Derived, true>
|
||||
*/
|
||||
template<typename Derived>
|
||||
template<typename OtherDerived>
|
||||
EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isApprox(
|
||||
bool DenseBase<Derived>::isApprox(
|
||||
const DenseBase<OtherDerived>& other,
|
||||
const RealScalar& prec
|
||||
) const
|
||||
@@ -122,7 +122,7 @@ EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isApprox(
|
||||
* \sa isApprox(), isMuchSmallerThan(const DenseBase<OtherDerived>&, RealScalar) const
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isMuchSmallerThan(
|
||||
bool DenseBase<Derived>::isMuchSmallerThan(
|
||||
const typename NumTraits<Scalar>::Real& other,
|
||||
const RealScalar& prec
|
||||
) const
|
||||
@@ -142,7 +142,7 @@ EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isMuchSmallerThan(
|
||||
*/
|
||||
template<typename Derived>
|
||||
template<typename OtherDerived>
|
||||
EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isMuchSmallerThan(
|
||||
bool DenseBase<Derived>::isMuchSmallerThan(
|
||||
const DenseBase<OtherDerived>& other,
|
||||
const RealScalar& prec
|
||||
) const
|
||||
|
||||
@@ -18,16 +18,6 @@ enum {
|
||||
Small = 3
|
||||
};
|
||||
|
||||
// Define the threshold value to fallback from the generic matrix-matrix product
|
||||
// implementation (heavy) to the lightweight coeff-based product one.
|
||||
// See generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
|
||||
// in products/GeneralMatrixMatrix.h for more details.
|
||||
// TODO This threshold should also be used in the compile-time selector below.
|
||||
#ifndef EIGEN_GEMM_TO_COEFFBASED_THRESHOLD
|
||||
// This default value has been obtained on a Haswell architecture.
|
||||
#define EIGEN_GEMM_TO_COEFFBASED_THRESHOLD 20
|
||||
#endif
|
||||
|
||||
namespace internal {
|
||||
|
||||
template<int Rows, int Cols, int Depth> struct product_type_selector;
|
||||
@@ -35,7 +25,7 @@ template<int Rows, int Cols, int Depth> struct product_type_selector;
|
||||
template<int Size, int MaxSize> struct product_size_category
|
||||
{
|
||||
enum {
|
||||
#ifndef EIGEN_GPU_COMPILE_PHASE
|
||||
#ifndef EIGEN_CUDA_ARCH
|
||||
is_large = MaxSize == Dynamic ||
|
||||
Size >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD ||
|
||||
(Size==Dynamic && MaxSize>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD),
|
||||
@@ -163,13 +153,13 @@ template<typename Scalar,int Size,int MaxSize,bool Cond> struct gemv_static_vect
|
||||
template<typename Scalar,int Size,int MaxSize>
|
||||
struct gemv_static_vector_if<Scalar,Size,MaxSize,false>
|
||||
{
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Scalar* data() { eigen_internal_assert(false && "should never be called"); return 0; }
|
||||
EIGEN_STRONG_INLINE Scalar* data() { eigen_internal_assert(false && "should never be called"); return 0; }
|
||||
};
|
||||
|
||||
template<typename Scalar,int Size>
|
||||
struct gemv_static_vector_if<Scalar,Size,Dynamic,true>
|
||||
{
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Scalar* data() { return 0; }
|
||||
EIGEN_STRONG_INLINE Scalar* data() { return 0; }
|
||||
};
|
||||
|
||||
template<typename Scalar,int Size,int MaxSize>
|
||||
@@ -239,7 +229,7 @@ template<> struct gemv_dense_selector<OnTheRight,ColMajor,true>
|
||||
// on, the other hand it is good for the cache to pack the vector anyways...
|
||||
EvalToDestAtCompileTime = (ActualDest::InnerStrideAtCompileTime==1),
|
||||
ComplexByReal = (NumTraits<LhsScalar>::IsComplex) && (!NumTraits<RhsScalar>::IsComplex),
|
||||
MightCannotUseDest = ((!EvalToDestAtCompileTime) || ComplexByReal) && (ActualDest::MaxSizeAtCompileTime!=0)
|
||||
MightCannotUseDest = (!EvalToDestAtCompileTime) || ComplexByReal
|
||||
};
|
||||
|
||||
typedef const_blas_data_mapper<LhsScalar,Index,ColMajor> LhsMapper;
|
||||
@@ -326,7 +316,7 @@ template<> struct gemv_dense_selector<OnTheRight,RowMajor,true>
|
||||
enum {
|
||||
// FIXME find a way to allow an inner stride on the result if packet_traits<Scalar>::size==1
|
||||
// on, the other hand it is good for the cache to pack the vector anyways...
|
||||
DirectlyUseRhs = ActualRhsTypeCleaned::InnerStrideAtCompileTime==1 || ActualRhsTypeCleaned::MaxSizeAtCompileTime==0
|
||||
DirectlyUseRhs = ActualRhsTypeCleaned::InnerStrideAtCompileTime==1
|
||||
};
|
||||
|
||||
gemv_static_vector_if<RhsScalar,ActualRhsTypeCleaned::SizeAtCompileTime,ActualRhsTypeCleaned::MaxSizeAtCompileTime,!DirectlyUseRhs> static_rhs;
|
||||
@@ -396,8 +386,7 @@ template<> struct gemv_dense_selector<OnTheRight,RowMajor,false>
|
||||
*/
|
||||
template<typename Derived>
|
||||
template<typename OtherDerived>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
const Product<Derived, OtherDerived>
|
||||
inline const Product<Derived, OtherDerived>
|
||||
MatrixBase<Derived>::operator*(const MatrixBase<OtherDerived> &other) const
|
||||
{
|
||||
// A note regarding the function declaration: In MSVC, this function will sometimes
|
||||
@@ -439,7 +428,6 @@ MatrixBase<Derived>::operator*(const MatrixBase<OtherDerived> &other) const
|
||||
*/
|
||||
template<typename Derived>
|
||||
template<typename OtherDerived>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
const Product<Derived,OtherDerived,LazyProduct>
|
||||
MatrixBase<Derived>::lazyProduct(const MatrixBase<OtherDerived> &other) const
|
||||
{
|
||||
|
||||
@@ -56,13 +56,11 @@ struct default_packet_traits
|
||||
HasConj = 1,
|
||||
HasSetLinear = 1,
|
||||
HasBlend = 0,
|
||||
HasReduxp = 1,
|
||||
|
||||
HasDiv = 0,
|
||||
HasSqrt = 0,
|
||||
HasRsqrt = 0,
|
||||
HasExp = 0,
|
||||
HasExpm1 = 0,
|
||||
HasLog = 0,
|
||||
HasLog1p = 0,
|
||||
HasLog10 = 0,
|
||||
@@ -83,11 +81,7 @@ struct default_packet_traits
|
||||
HasPolygamma = 0,
|
||||
HasErf = 0,
|
||||
HasErfc = 0,
|
||||
HasNdtri = 0,
|
||||
HasBessel = 0,
|
||||
HasIGamma = 0,
|
||||
HasIGammaDerA = 0,
|
||||
HasGammaSampleDerAlpha = 0,
|
||||
HasIGammac = 0,
|
||||
HasBetaInc = 0,
|
||||
|
||||
@@ -152,18 +146,15 @@ pcast(const SrcPacket& a, const SrcPacket& /*b*/, const SrcPacket& /*c*/, const
|
||||
return static_cast<TgtPacket>(a);
|
||||
}
|
||||
|
||||
/** \internal \returns reinterpret_cast<Target>(a) */
|
||||
template <typename Target, typename Packet>
|
||||
EIGEN_DEVICE_FUNC inline Target
|
||||
preinterpret(const Packet& a); /* { return reinterpret_cast<const Target&>(a); } */
|
||||
|
||||
/** \internal \returns a + b (coeff-wise) */
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
|
||||
padd(const Packet& a, const Packet& b) { return a+b; }
|
||||
padd(const Packet& a,
|
||||
const Packet& b) { return a+b; }
|
||||
|
||||
/** \internal \returns a - b (coeff-wise) */
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
|
||||
psub(const Packet& a, const Packet& b) { return a-b; }
|
||||
psub(const Packet& a,
|
||||
const Packet& b) { return a-b; }
|
||||
|
||||
/** \internal \returns -a (coeff-wise) */
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
|
||||
@@ -176,19 +167,23 @@ pconj(const Packet& a) { return numext::conj(a); }
|
||||
|
||||
/** \internal \returns a * b (coeff-wise) */
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
|
||||
pmul(const Packet& a, const Packet& b) { return a*b; }
|
||||
pmul(const Packet& a,
|
||||
const Packet& b) { return a*b; }
|
||||
|
||||
/** \internal \returns a / b (coeff-wise) */
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
|
||||
pdiv(const Packet& a, const Packet& b) { return a/b; }
|
||||
pdiv(const Packet& a,
|
||||
const Packet& b) { return a/b; }
|
||||
|
||||
/** \internal \returns the min of \a a and \a b (coeff-wise) */
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
|
||||
pmin(const Packet& a, const Packet& b) { return numext::mini(a, b); }
|
||||
pmin(const Packet& a,
|
||||
const Packet& b) { return numext::mini(a, b); }
|
||||
|
||||
/** \internal \returns the max of \a a and \a b (coeff-wise) */
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
|
||||
pmax(const Packet& a, const Packet& b) { return numext::maxi(a, b); }
|
||||
pmax(const Packet& a,
|
||||
const Packet& b) { return numext::maxi(a, b); }
|
||||
|
||||
/** \internal \returns the absolute value of \a a */
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
|
||||
@@ -212,101 +207,7 @@ pxor(const Packet& a, const Packet& b) { return a ^ b; }
|
||||
|
||||
/** \internal \returns the bitwise andnot of \a a and \a b */
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
|
||||
pandnot(const Packet& a, const Packet& b) { return a & (~b); }
|
||||
|
||||
/** \internal \returns ones */
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
|
||||
ptrue(const Packet& /*a*/) { Packet b; memset((void*)&b, 0xff, sizeof(b)); return b;}
|
||||
|
||||
template <typename RealScalar>
|
||||
EIGEN_DEVICE_FUNC inline std::complex<RealScalar> ptrue(const std::complex<RealScalar>& /*a*/) {
|
||||
RealScalar b;
|
||||
b = ptrue(b);
|
||||
return std::complex<RealScalar>(b, b);
|
||||
}
|
||||
|
||||
/** \internal \returns the bitwise not of \a a */
|
||||
template <typename Packet> EIGEN_DEVICE_FUNC inline Packet
|
||||
pnot(const Packet& a) { return pxor(ptrue(a), a);}
|
||||
|
||||
/** \internal \returns \a a shifted by N bits to the right */
|
||||
template<int N> EIGEN_DEVICE_FUNC inline int
|
||||
pshiftright(const int& a) { return a >> N; }
|
||||
template<int N> EIGEN_DEVICE_FUNC inline long int
|
||||
pshiftright(const long int& a) { return a >> N; }
|
||||
|
||||
/** \internal \returns \a a shifted by N bits to the left */
|
||||
template<int N> EIGEN_DEVICE_FUNC inline int
|
||||
pshiftleft(const int& a) { return a << N; }
|
||||
template<int N> EIGEN_DEVICE_FUNC inline long int
|
||||
pshiftleft(const long int& a) { return a << N; }
|
||||
|
||||
/** \internal \returns the significant and exponent of the underlying floating point numbers
|
||||
* See https://en.cppreference.com/w/cpp/numeric/math/frexp
|
||||
*/
|
||||
template <typename Packet>
|
||||
EIGEN_DEVICE_FUNC inline Packet pfrexp(const Packet& a, Packet& exponent) {
|
||||
int exp;
|
||||
EIGEN_USING_STD_MATH(frexp);
|
||||
Packet result = frexp(a, &exp);
|
||||
exponent = static_cast<Packet>(exp);
|
||||
return result;
|
||||
}
|
||||
|
||||
/** \internal \returns a * 2^exponent
|
||||
* See https://en.cppreference.com/w/cpp/numeric/math/ldexp
|
||||
*/
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
|
||||
pldexp(const Packet &a, const Packet &exponent) {
|
||||
EIGEN_USING_STD_MATH(ldexp);
|
||||
return ldexp(a, static_cast<int>(exponent));
|
||||
}
|
||||
|
||||
/** \internal \returns zeros */
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
|
||||
pzero(const Packet& a) { return pxor(a,a); }
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC inline float pzero<float>(const float& a) {
|
||||
EIGEN_UNUSED_VARIABLE(a);
|
||||
return 0.f;
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC inline double pzero<double>(const double& a) {
|
||||
EIGEN_UNUSED_VARIABLE(a);
|
||||
return 0.;
|
||||
}
|
||||
|
||||
/** \internal \returns bits of \a or \b according to the input bit mask \a mask */
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
|
||||
pselect(const Packet& mask, const Packet& a, const Packet& b) {
|
||||
return por(pand(a,mask),pandnot(b,mask));
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC inline float pselect<float>(
|
||||
const float& mask, const float& a, const float&b) {
|
||||
return numext::equal_strict(mask,0.f) ? b : a;
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC inline double pselect<double>(
|
||||
const double& mask, const double& a, const double& b) {
|
||||
return numext::equal_strict(mask,0.) ? b : a;
|
||||
}
|
||||
|
||||
/** \internal \returns a <= b as a bit mask */
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
|
||||
pcmp_le(const Packet& a, const Packet& b) { return a<=b ? ptrue(a) : pzero(a); }
|
||||
|
||||
/** \internal \returns a < b as a bit mask */
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
|
||||
pcmp_lt(const Packet& a, const Packet& b) { return a<b ? ptrue(a) : pzero(a); }
|
||||
|
||||
/** \internal \returns a == b as a bit mask */
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
|
||||
pcmp_eq(const Packet& a, const Packet& b) { return a==b ? ptrue(a) : pzero(a); }
|
||||
|
||||
/** \internal \returns a < b or a==NaN or b==NaN as a bit mask */
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
|
||||
pcmp_lt_or_nan(const Packet& a, const Packet& b) { return pnot(pcmp_le(b,a)); }
|
||||
pandnot(const Packet& a, const Packet& b) { return a & (!b); }
|
||||
|
||||
/** \internal \returns a packet version of \a *from, from must be 16 bytes aligned */
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
|
||||
@@ -316,22 +217,10 @@ pload(const typename unpacket_traits<Packet>::type* from) { return *from; }
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
|
||||
ploadu(const typename unpacket_traits<Packet>::type* from) { return *from; }
|
||||
|
||||
/** \internal \returns a packet version of \a *from, (un-aligned masked load)
|
||||
* There is no generic implementation. We only have implementations for specialized
|
||||
* cases. Generic case should not be called.
|
||||
*/
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC inline
|
||||
typename enable_if<unpacket_traits<Packet>::masked_load_available, Packet>::type
|
||||
ploadu(const typename unpacket_traits<Packet>::type* from, typename unpacket_traits<Packet>::mask_t umask);
|
||||
|
||||
/** \internal \returns a packet with constant coefficients \a a, e.g.: (a,a,a,a) */
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
|
||||
pset1(const typename unpacket_traits<Packet>::type& a) { return a; }
|
||||
|
||||
/** \internal \returns a packet with constant coefficients set from bits */
|
||||
template<typename Packet,typename BitsType> EIGEN_DEVICE_FUNC inline Packet
|
||||
pset1frombits(BitsType a);
|
||||
|
||||
/** \internal \returns a packet with constant coefficients \a a[0], e.g.: (a[0],a[0],a[0],a[0]) */
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
|
||||
pload1(const typename unpacket_traits<Packet>::type *a) { return pset1<Packet>(*a); }
|
||||
@@ -400,15 +289,6 @@ template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstore(
|
||||
template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstoreu(Scalar* to, const Packet& from)
|
||||
{ (*to) = from; }
|
||||
|
||||
/** \internal copy the packet \a from to \a *to, (un-aligned store with a mask)
|
||||
* There is no generic implementation. We only have implementations for specialized
|
||||
* cases. Generic case should not be called.
|
||||
*/
|
||||
template<typename Scalar, typename Packet>
|
||||
EIGEN_DEVICE_FUNC inline
|
||||
typename enable_if<unpacket_traits<Packet>::masked_store_available, void>::type
|
||||
pstoreu(Scalar* to, const Packet& from, typename unpacket_traits<Packet>::mask_t umask);
|
||||
|
||||
template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline Packet pgather(const Scalar* from, Index /*stride*/)
|
||||
{ return ploadu<Packet>(from); }
|
||||
|
||||
@@ -418,9 +298,7 @@ pstoreu(Scalar* to, const Packet& from, typename unpacket_traits<Packet>::mask_t
|
||||
/** \internal tries to do cache prefetching of \a addr */
|
||||
template<typename Scalar> EIGEN_DEVICE_FUNC inline void prefetch(const Scalar* addr)
|
||||
{
|
||||
#if defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
// do nothing
|
||||
#elif defined(EIGEN_CUDA_ARCH)
|
||||
#ifdef __CUDA_ARCH__
|
||||
#if defined(__LP64__)
|
||||
// 64-bit pointer operand constraint for inlined asm
|
||||
asm(" prefetch.L1 [ %1 ];" : "=l"(addr) : "l"(addr));
|
||||
@@ -445,48 +323,27 @@ preduxp(const Packet* vecs) { return vecs[0]; }
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux(const Packet& a)
|
||||
{ return a; }
|
||||
|
||||
/** \internal \returns the sum of the elements of upper and lower half of \a a if \a a is larger than 4.
|
||||
/** \internal \returns the sum of the elements of \a a by block of 4 elements.
|
||||
* For a packet {a0, a1, a2, a3, a4, a5, a6, a7}, it returns a half packet {a0+a4, a1+a5, a2+a6, a3+a7}
|
||||
* For packet-size smaller or equal to 4, this boils down to a noop.
|
||||
*/
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC inline
|
||||
typename conditional<(unpacket_traits<Packet>::size%8)==0,typename unpacket_traits<Packet>::half,Packet>::type
|
||||
predux_half_dowto4(const Packet& a)
|
||||
predux_downto4(const Packet& a)
|
||||
{ return a; }
|
||||
|
||||
/** \internal \returns the product of the elements of \a a */
|
||||
/** \internal \returns the product of the elements of \a a*/
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_mul(const Packet& a)
|
||||
{ return a; }
|
||||
|
||||
/** \internal \returns the min of the elements of \a a */
|
||||
/** \internal \returns the min of the elements of \a a*/
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_min(const Packet& a)
|
||||
{ return a; }
|
||||
|
||||
/** \internal \returns the max of the elements of \a a */
|
||||
/** \internal \returns the max of the elements of \a a*/
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_max(const Packet& a)
|
||||
{ return a; }
|
||||
|
||||
/** \internal \returns true if all coeffs of \a a means "true"
|
||||
* It is supposed to be called on values returned by pcmp_*.
|
||||
*/
|
||||
// not needed yet
|
||||
// template<typename Packet> EIGEN_DEVICE_FUNC inline bool predux_all(const Packet& a)
|
||||
// { return bool(a); }
|
||||
|
||||
/** \internal \returns true if any coeffs of \a a means "true"
|
||||
* It is supposed to be called on values returned by pcmp_*.
|
||||
*/
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC inline bool predux_any(const Packet& a)
|
||||
{
|
||||
// Dirty but generic implementation where "true" is assumed to be non 0 and all the sames.
|
||||
// It is expected that "true" is either:
|
||||
// - Scalar(1)
|
||||
// - bits full of ones (NaN for floats),
|
||||
// - or first bit equals to 1 (1 for ints, smallest denormal for floats).
|
||||
// For all these cases, taking the sum is just fine, and this boils down to a no-op for scalars.
|
||||
return bool(predux(a));
|
||||
}
|
||||
|
||||
/** \internal \returns the reversed elements of \a a*/
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet preverse(const Packet& a)
|
||||
{ return a; }
|
||||
@@ -494,7 +351,10 @@ template<typename Packet> EIGEN_DEVICE_FUNC inline Packet preverse(const Packet&
|
||||
/** \internal \returns \a a with real and imaginary part flipped (for complex type only) */
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pcplxflip(const Packet& a)
|
||||
{
|
||||
return Packet(numext::imag(a),numext::real(a));
|
||||
// FIXME: uncomment the following in case we drop the internal imag and real functions.
|
||||
// using std::imag;
|
||||
// using std::real;
|
||||
return Packet(imag(a),real(a));
|
||||
}
|
||||
|
||||
/**************************
|
||||
@@ -503,51 +363,47 @@ template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pcplxflip(const Packet
|
||||
|
||||
/** \internal \returns the sine of \a a (coeff-wise) */
|
||||
template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
||||
Packet psin(const Packet& a) { EIGEN_USING_STD_MATH(sin); return sin(a); }
|
||||
Packet psin(const Packet& a) { using std::sin; return sin(a); }
|
||||
|
||||
/** \internal \returns the cosine of \a a (coeff-wise) */
|
||||
template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
||||
Packet pcos(const Packet& a) { EIGEN_USING_STD_MATH(cos); return cos(a); }
|
||||
Packet pcos(const Packet& a) { using std::cos; return cos(a); }
|
||||
|
||||
/** \internal \returns the tan of \a a (coeff-wise) */
|
||||
template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
||||
Packet ptan(const Packet& a) { EIGEN_USING_STD_MATH(tan); return tan(a); }
|
||||
Packet ptan(const Packet& a) { using std::tan; return tan(a); }
|
||||
|
||||
/** \internal \returns the arc sine of \a a (coeff-wise) */
|
||||
template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
||||
Packet pasin(const Packet& a) { EIGEN_USING_STD_MATH(asin); return asin(a); }
|
||||
Packet pasin(const Packet& a) { using std::asin; return asin(a); }
|
||||
|
||||
/** \internal \returns the arc cosine of \a a (coeff-wise) */
|
||||
template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
||||
Packet pacos(const Packet& a) { EIGEN_USING_STD_MATH(acos); return acos(a); }
|
||||
Packet pacos(const Packet& a) { using std::acos; return acos(a); }
|
||||
|
||||
/** \internal \returns the arc tangent of \a a (coeff-wise) */
|
||||
template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
||||
Packet patan(const Packet& a) { EIGEN_USING_STD_MATH(atan); return atan(a); }
|
||||
Packet patan(const Packet& a) { using std::atan; return atan(a); }
|
||||
|
||||
/** \internal \returns the hyperbolic sine of \a a (coeff-wise) */
|
||||
template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
||||
Packet psinh(const Packet& a) { EIGEN_USING_STD_MATH(sinh); return sinh(a); }
|
||||
Packet psinh(const Packet& a) { using std::sinh; return sinh(a); }
|
||||
|
||||
/** \internal \returns the hyperbolic cosine of \a a (coeff-wise) */
|
||||
template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
||||
Packet pcosh(const Packet& a) { EIGEN_USING_STD_MATH(cosh); return cosh(a); }
|
||||
Packet pcosh(const Packet& a) { using std::cosh; return cosh(a); }
|
||||
|
||||
/** \internal \returns the hyperbolic tan of \a a (coeff-wise) */
|
||||
template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
||||
Packet ptanh(const Packet& a) { EIGEN_USING_STD_MATH(tanh); return tanh(a); }
|
||||
Packet ptanh(const Packet& a) { using std::tanh; return tanh(a); }
|
||||
|
||||
/** \internal \returns the exp of \a a (coeff-wise) */
|
||||
template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
||||
Packet pexp(const Packet& a) { EIGEN_USING_STD_MATH(exp); return exp(a); }
|
||||
|
||||
/** \internal \returns the expm1 of \a a (coeff-wise) */
|
||||
template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
||||
Packet pexpm1(const Packet& a) { return numext::expm1(a); }
|
||||
Packet pexp(const Packet& a) { using std::exp; return exp(a); }
|
||||
|
||||
/** \internal \returns the log of \a a (coeff-wise) */
|
||||
template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
||||
Packet plog(const Packet& a) { EIGEN_USING_STD_MATH(log); return log(a); }
|
||||
Packet plog(const Packet& a) { using std::log; return log(a); }
|
||||
|
||||
/** \internal \returns the log1p of \a a (coeff-wise) */
|
||||
template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
||||
@@ -555,11 +411,11 @@ Packet plog1p(const Packet& a) { return numext::log1p(a); }
|
||||
|
||||
/** \internal \returns the log10 of \a a (coeff-wise) */
|
||||
template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
||||
Packet plog10(const Packet& a) { EIGEN_USING_STD_MATH(log10); return log10(a); }
|
||||
Packet plog10(const Packet& a) { using std::log10; return log10(a); }
|
||||
|
||||
/** \internal \returns the square-root of \a a (coeff-wise) */
|
||||
template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
||||
Packet psqrt(const Packet& a) { EIGEN_USING_STD_MATH(sqrt); return sqrt(a); }
|
||||
Packet psqrt(const Packet& a) { using std::sqrt; return sqrt(a); }
|
||||
|
||||
/** \internal \returns the reciprocal square-root of \a a (coeff-wise) */
|
||||
template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
||||
@@ -583,7 +439,7 @@ Packet pceil(const Packet& a) { using numext::ceil; return ceil(a); }
|
||||
* The following functions might not have to be overwritten for vectorized types
|
||||
***************************************************************************/
|
||||
|
||||
/** \internal copy a packet with constant coefficient \a a (e.g., [a,a,a,a]) to \a *to. \a to must be 16 bytes aligned */
|
||||
/** \internal copy a packet with constant coeficient \a a (e.g., [a,a,a,a]) to \a *to. \a to must be 16 bytes aligned */
|
||||
// NOTE: this function must really be templated on the packet type (think about different packet types for the same scalar type)
|
||||
template<typename Packet>
|
||||
inline void pstore1(typename unpacket_traits<Packet>::type* to, const typename unpacket_traits<Packet>::type& a)
|
||||
@@ -665,13 +521,13 @@ inline void palign(PacketType& first, const PacketType& second)
|
||||
***************************************************************************/
|
||||
|
||||
// Eigen+CUDA does not support complexes.
|
||||
#if !defined(EIGEN_GPUCC)
|
||||
#ifndef __CUDACC__
|
||||
|
||||
template<> inline std::complex<float> pmul(const std::complex<float>& a, const std::complex<float>& b)
|
||||
{ return std::complex<float>(a.real()*b.real() - a.imag()*b.imag(), a.imag()*b.real() + a.real()*b.imag()); }
|
||||
{ return std::complex<float>(real(a)*real(b) - imag(a)*imag(b), imag(a)*real(b) + real(a)*imag(b)); }
|
||||
|
||||
template<> inline std::complex<double> pmul(const std::complex<double>& a, const std::complex<double>& b)
|
||||
{ return std::complex<double>(a.real()*b.real() - a.imag()*b.imag(), a.imag()*b.real() + a.real()*b.imag()); }
|
||||
{ return std::complex<double>(real(a)*real(b) - imag(a)*imag(b), imag(a)*real(b) + real(a)*imag(b)); }
|
||||
|
||||
#endif
|
||||
|
||||
@@ -730,22 +586,6 @@ pinsertlast(const Packet& a, typename unpacket_traits<Packet>::type b)
|
||||
return pblend(mask, pset1<Packet>(b), a);
|
||||
}
|
||||
|
||||
/***************************************************************************
|
||||
* Some generic implementations to be used by implementors
|
||||
***************************************************************************/
|
||||
|
||||
/** Default implementation of pfrexp for float.
|
||||
* It is expected to be called by implementers of template<> pfrexp.
|
||||
*/
|
||||
template<typename Packet> EIGEN_STRONG_INLINE Packet
|
||||
pfrexp_float(const Packet& a, Packet& exponent);
|
||||
|
||||
/** Default implementation of pldexp for float.
|
||||
* It is expected to be called by implementers of template<> pldexp.
|
||||
*/
|
||||
template<typename Packet> EIGEN_STRONG_INLINE Packet
|
||||
pldexp_float(Packet a, Packet exponent);
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
@@ -66,19 +66,11 @@ namespace Eigen
|
||||
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sinh,scalar_sinh_op,hyperbolic sine,\sa ArrayBase::sinh)
|
||||
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(cosh,scalar_cosh_op,hyperbolic cosine,\sa ArrayBase::cosh)
|
||||
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(tanh,scalar_tanh_op,hyperbolic tangent,\sa ArrayBase::tanh)
|
||||
#if EIGEN_HAS_CXX11_MATH
|
||||
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(asinh,scalar_asinh_op,inverse hyperbolic sine,\sa ArrayBase::asinh)
|
||||
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(acosh,scalar_acosh_op,inverse hyperbolic cosine,\sa ArrayBase::acosh)
|
||||
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(atanh,scalar_atanh_op,inverse hyperbolic tangent,\sa ArrayBase::atanh)
|
||||
#endif
|
||||
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(logistic,scalar_logistic_op,logistic function,\sa ArrayBase::logistic)
|
||||
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(lgamma,scalar_lgamma_op,natural logarithm of the gamma function,\sa ArrayBase::lgamma)
|
||||
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(digamma,scalar_digamma_op,derivative of lgamma,\sa ArrayBase::digamma)
|
||||
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(erf,scalar_erf_op,error function,\sa ArrayBase::erf)
|
||||
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(erfc,scalar_erfc_op,complement error function,\sa ArrayBase::erfc)
|
||||
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(ndtri,scalar_ndtri_op,inverse normal distribution function,\sa ArrayBase::ndtri)
|
||||
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(exp,scalar_exp_op,exponential,\sa ArrayBase::exp)
|
||||
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(expm1,scalar_expm1_op,exponential of a value minus 1,\sa ArrayBase::expm1)
|
||||
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log,scalar_log_op,natural logarithm,\sa Eigen::log10 DOXCOMMA ArrayBase::log)
|
||||
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log1p,scalar_log1p_op,natural logarithm of 1 plus the value,\sa ArrayBase::log1p)
|
||||
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log10,scalar_log10_op,base 10 logarithm,\sa Eigen::log DOXCOMMA ArrayBase::log)
|
||||
@@ -96,7 +88,7 @@ namespace Eigen
|
||||
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(isinf,scalar_isinf_op,infinite value test,\sa Eigen::isnan DOXCOMMA Eigen::isfinite DOXCOMMA ArrayBase::isinf)
|
||||
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(isfinite,scalar_isfinite_op,finite value test,\sa Eigen::isinf DOXCOMMA Eigen::isnan DOXCOMMA ArrayBase::isfinite)
|
||||
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sign,scalar_sign_op,sign (or 0),\sa ArrayBase::sign)
|
||||
|
||||
|
||||
/** \returns an expression of the coefficient-wise power of \a x to the given constant \a exponent.
|
||||
*
|
||||
* \tparam ScalarExponent is the scalar type of \a exponent. It must be compatible with the scalar type of the given expression (\c Derived::Scalar).
|
||||
@@ -110,18 +102,17 @@ namespace Eigen
|
||||
inline const CwiseBinaryOp<internal::scalar_pow_op<Derived::Scalar,ScalarExponent>,Derived,Constant<ScalarExponent> >
|
||||
pow(const Eigen::ArrayBase<Derived>& x, const ScalarExponent& exponent);
|
||||
#else
|
||||
template <typename Derived,typename ScalarExponent>
|
||||
EIGEN_DEVICE_FUNC inline
|
||||
EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(
|
||||
const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,typename internal::promote_scalar_arg<typename Derived::Scalar
|
||||
EIGEN_COMMA ScalarExponent EIGEN_COMMA
|
||||
EIGEN_SCALAR_BINARY_SUPPORTED(pow,typename Derived::Scalar,ScalarExponent)>::type,pow))
|
||||
pow(const Eigen::ArrayBase<Derived>& x, const ScalarExponent& exponent)
|
||||
{
|
||||
typedef typename internal::promote_scalar_arg<typename Derived::Scalar,ScalarExponent,
|
||||
EIGEN_SCALAR_BINARY_SUPPORTED(pow,typename Derived::Scalar,ScalarExponent)>::type PromotedExponent;
|
||||
return EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,PromotedExponent,pow)(x.derived(),
|
||||
typename internal::plain_constant_type<Derived,PromotedExponent>::type(x.derived().rows(), x.derived().cols(), internal::scalar_constant_op<PromotedExponent>(exponent)));
|
||||
template<typename Derived,typename ScalarExponent>
|
||||
inline typename internal::enable_if< !(internal::is_same<typename Derived::Scalar,ScalarExponent>::value) && EIGEN_SCALAR_BINARY_SUPPORTED(pow,typename Derived::Scalar,ScalarExponent),
|
||||
const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,ScalarExponent,pow) >::type
|
||||
pow(const Eigen::ArrayBase<Derived>& x, const ScalarExponent& exponent) {
|
||||
return x.derived().pow(exponent);
|
||||
}
|
||||
|
||||
template<typename Derived>
|
||||
inline const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,typename Derived::Scalar,pow)
|
||||
pow(const Eigen::ArrayBase<Derived>& x, const typename Derived::Scalar& exponent) {
|
||||
return x.derived().pow(exponent);
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -131,21 +122,21 @@ namespace Eigen
|
||||
*
|
||||
* Example: \include Cwise_array_power_array.cpp
|
||||
* Output: \verbinclude Cwise_array_power_array.out
|
||||
*
|
||||
*
|
||||
* \sa ArrayBase::pow()
|
||||
*
|
||||
* \relates ArrayBase
|
||||
*/
|
||||
template<typename Derived,typename ExponentDerived>
|
||||
inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_pow_op<typename Derived::Scalar, typename ExponentDerived::Scalar>, const Derived, const ExponentDerived>
|
||||
pow(const Eigen::ArrayBase<Derived>& x, const Eigen::ArrayBase<ExponentDerived>& exponents)
|
||||
pow(const Eigen::ArrayBase<Derived>& x, const Eigen::ArrayBase<ExponentDerived>& exponents)
|
||||
{
|
||||
return Eigen::CwiseBinaryOp<Eigen::internal::scalar_pow_op<typename Derived::Scalar, typename ExponentDerived::Scalar>, const Derived, const ExponentDerived>(
|
||||
x.derived(),
|
||||
exponents.derived()
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
/** \returns an expression of the coefficient-wise power of the scalar \a x to the given array of \a exponents.
|
||||
*
|
||||
* This function computes the coefficient-wise power between a scalar and an array of exponents.
|
||||
@@ -154,7 +145,7 @@ namespace Eigen
|
||||
*
|
||||
* Example: \include Cwise_scalar_power_array.cpp
|
||||
* Output: \verbinclude Cwise_scalar_power_array.out
|
||||
*
|
||||
*
|
||||
* \sa ArrayBase::pow()
|
||||
*
|
||||
* \relates ArrayBase
|
||||
@@ -164,17 +155,21 @@ namespace Eigen
|
||||
inline const CwiseBinaryOp<internal::scalar_pow_op<Scalar,Derived::Scalar>,Constant<Scalar>,Derived>
|
||||
pow(const Scalar& x,const Eigen::ArrayBase<Derived>& x);
|
||||
#else
|
||||
template <typename Scalar, typename Derived>
|
||||
EIGEN_DEVICE_FUNC inline
|
||||
EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(
|
||||
const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(typename internal::promote_scalar_arg<typename Derived::Scalar
|
||||
EIGEN_COMMA Scalar EIGEN_COMMA
|
||||
EIGEN_SCALAR_BINARY_SUPPORTED(pow,Scalar,typename Derived::Scalar)>::type,Derived,pow))
|
||||
pow(const Scalar& x, const Eigen::ArrayBase<Derived>& exponents) {
|
||||
typedef typename internal::promote_scalar_arg<typename Derived::Scalar,Scalar,
|
||||
EIGEN_SCALAR_BINARY_SUPPORTED(pow,Scalar,typename Derived::Scalar)>::type PromotedScalar;
|
||||
return EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(PromotedScalar,Derived,pow)(
|
||||
typename internal::plain_constant_type<Derived,PromotedScalar>::type(exponents.derived().rows(), exponents.derived().cols(), internal::scalar_constant_op<PromotedScalar>(x)), exponents.derived());
|
||||
template<typename Scalar, typename Derived>
|
||||
inline typename internal::enable_if< !(internal::is_same<typename Derived::Scalar,Scalar>::value) && EIGEN_SCALAR_BINARY_SUPPORTED(pow,Scalar,typename Derived::Scalar),
|
||||
const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar,Derived,pow) >::type
|
||||
pow(const Scalar& x, const Eigen::ArrayBase<Derived>& exponents)
|
||||
{
|
||||
return EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar,Derived,pow)(
|
||||
typename internal::plain_constant_type<Derived,Scalar>::type(exponents.rows(), exponents.cols(), x), exponents.derived() );
|
||||
}
|
||||
|
||||
template<typename Derived>
|
||||
inline const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(typename Derived::Scalar,Derived,pow)
|
||||
pow(const typename Derived::Scalar& x, const Eigen::ArrayBase<Derived>& exponents)
|
||||
{
|
||||
return EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(typename Derived::Scalar,Derived,pow)(
|
||||
typename internal::plain_constant_type<Derived,typename Derived::Scalar>::type(exponents.rows(), exponents.cols(), x), exponents.derived() );
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
@@ -41,7 +41,6 @@ std::ostream & print_matrix(std::ostream & s, const Derived& _m, const IOFormat&
|
||||
* - \b rowSuffix string printed at the end of each row
|
||||
* - \b matPrefix string printed at the beginning of the matrix
|
||||
* - \b matSuffix string printed at the end of the matrix
|
||||
* - \b fill character printed to fill the empty space in aligned columns
|
||||
*
|
||||
* Example: \include IOFormat.cpp
|
||||
* Output: \verbinclude IOFormat.out
|
||||
@@ -54,9 +53,9 @@ struct IOFormat
|
||||
IOFormat(int _precision = StreamPrecision, int _flags = 0,
|
||||
const std::string& _coeffSeparator = " ",
|
||||
const std::string& _rowSeparator = "\n", const std::string& _rowPrefix="", const std::string& _rowSuffix="",
|
||||
const std::string& _matPrefix="", const std::string& _matSuffix="", const char _fill=' ')
|
||||
const std::string& _matPrefix="", const std::string& _matSuffix="")
|
||||
: matPrefix(_matPrefix), matSuffix(_matSuffix), rowPrefix(_rowPrefix), rowSuffix(_rowSuffix), rowSeparator(_rowSeparator),
|
||||
rowSpacer(""), coeffSeparator(_coeffSeparator), fill(_fill), precision(_precision), flags(_flags)
|
||||
rowSpacer(""), coeffSeparator(_coeffSeparator), precision(_precision), flags(_flags)
|
||||
{
|
||||
// TODO check if rowPrefix, rowSuffix or rowSeparator contains a newline
|
||||
// don't add rowSpacer if columns are not to be aligned
|
||||
@@ -72,7 +71,6 @@ struct IOFormat
|
||||
std::string matPrefix, matSuffix;
|
||||
std::string rowPrefix, rowSuffix, rowSeparator, rowSpacer;
|
||||
std::string coeffSeparator;
|
||||
char fill;
|
||||
int precision;
|
||||
int flags;
|
||||
};
|
||||
@@ -178,26 +176,18 @@ std::ostream & print_matrix(std::ostream & s, const Derived& _m, const IOFormat&
|
||||
width = std::max<Index>(width, Index(sstr.str().length()));
|
||||
}
|
||||
}
|
||||
std::streamsize old_width = s.width();
|
||||
char old_fill_character = s.fill();
|
||||
s << fmt.matPrefix;
|
||||
for(Index i = 0; i < m.rows(); ++i)
|
||||
{
|
||||
if (i)
|
||||
s << fmt.rowSpacer;
|
||||
s << fmt.rowPrefix;
|
||||
if(width) {
|
||||
s.fill(fmt.fill);
|
||||
s.width(width);
|
||||
}
|
||||
if(width) s.width(width);
|
||||
s << m.coeff(i, 0);
|
||||
for(Index j = 1; j < m.cols(); ++j)
|
||||
{
|
||||
s << fmt.coeffSeparator;
|
||||
if(width) {
|
||||
s.fill(fmt.fill);
|
||||
s.width(width);
|
||||
}
|
||||
if (width) s.width(width);
|
||||
s << m.coeff(i, j);
|
||||
}
|
||||
s << fmt.rowSuffix;
|
||||
@@ -206,10 +196,6 @@ std::ostream & print_matrix(std::ostream & s, const Derived& _m, const IOFormat&
|
||||
}
|
||||
s << fmt.matSuffix;
|
||||
if(explicit_precision) s.precision(old_precision);
|
||||
if(width) {
|
||||
s.fill(old_fill_character);
|
||||
s.width(old_width);
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
|
||||
@@ -1,207 +0,0 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2017 Gael Guennebaud <gael.guennebaud@inria.fr>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#ifndef EIGEN_INDEXED_VIEW_H
|
||||
#define EIGEN_INDEXED_VIEW_H
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
namespace internal {
|
||||
|
||||
template<typename XprType, typename RowIndices, typename ColIndices>
|
||||
struct traits<IndexedView<XprType, RowIndices, ColIndices> >
|
||||
: traits<XprType>
|
||||
{
|
||||
enum {
|
||||
RowsAtCompileTime = int(array_size<RowIndices>::value),
|
||||
ColsAtCompileTime = int(array_size<ColIndices>::value),
|
||||
MaxRowsAtCompileTime = RowsAtCompileTime != Dynamic ? int(RowsAtCompileTime) : Dynamic,
|
||||
MaxColsAtCompileTime = ColsAtCompileTime != Dynamic ? int(ColsAtCompileTime) : Dynamic,
|
||||
|
||||
XprTypeIsRowMajor = (int(traits<XprType>::Flags)&RowMajorBit) != 0,
|
||||
IsRowMajor = (MaxRowsAtCompileTime==1&&MaxColsAtCompileTime!=1) ? 1
|
||||
: (MaxColsAtCompileTime==1&&MaxRowsAtCompileTime!=1) ? 0
|
||||
: XprTypeIsRowMajor,
|
||||
|
||||
RowIncr = int(get_compile_time_incr<RowIndices>::value),
|
||||
ColIncr = int(get_compile_time_incr<ColIndices>::value),
|
||||
InnerIncr = IsRowMajor ? ColIncr : RowIncr,
|
||||
OuterIncr = IsRowMajor ? RowIncr : ColIncr,
|
||||
|
||||
HasSameStorageOrderAsXprType = (IsRowMajor == XprTypeIsRowMajor),
|
||||
XprInnerStride = HasSameStorageOrderAsXprType ? int(inner_stride_at_compile_time<XprType>::ret) : int(outer_stride_at_compile_time<XprType>::ret),
|
||||
XprOuterstride = HasSameStorageOrderAsXprType ? int(outer_stride_at_compile_time<XprType>::ret) : int(inner_stride_at_compile_time<XprType>::ret),
|
||||
|
||||
InnerSize = XprTypeIsRowMajor ? ColsAtCompileTime : RowsAtCompileTime,
|
||||
IsBlockAlike = InnerIncr==1 && OuterIncr==1,
|
||||
IsInnerPannel = HasSameStorageOrderAsXprType && is_same<AllRange<InnerSize>,typename conditional<XprTypeIsRowMajor,ColIndices,RowIndices>::type>::value,
|
||||
|
||||
InnerStrideAtCompileTime = InnerIncr<0 || InnerIncr==DynamicIndex || XprInnerStride==Dynamic ? Dynamic : XprInnerStride * InnerIncr,
|
||||
OuterStrideAtCompileTime = OuterIncr<0 || OuterIncr==DynamicIndex || XprOuterstride==Dynamic ? Dynamic : XprOuterstride * OuterIncr,
|
||||
|
||||
ReturnAsScalar = is_same<RowIndices,SingleRange>::value && is_same<ColIndices,SingleRange>::value,
|
||||
ReturnAsBlock = (!ReturnAsScalar) && IsBlockAlike,
|
||||
ReturnAsIndexedView = (!ReturnAsScalar) && (!ReturnAsBlock),
|
||||
|
||||
// FIXME we deal with compile-time strides if and only if we have DirectAccessBit flag,
|
||||
// but this is too strict regarding negative strides...
|
||||
DirectAccessMask = (int(InnerIncr)!=UndefinedIncr && int(OuterIncr)!=UndefinedIncr && InnerIncr>=0 && OuterIncr>=0) ? DirectAccessBit : 0,
|
||||
FlagsRowMajorBit = IsRowMajor ? RowMajorBit : 0,
|
||||
FlagsLvalueBit = is_lvalue<XprType>::value ? LvalueBit : 0,
|
||||
Flags = (traits<XprType>::Flags & (HereditaryBits | DirectAccessMask)) | FlagsLvalueBit | FlagsRowMajorBit
|
||||
};
|
||||
|
||||
typedef Block<XprType,RowsAtCompileTime,ColsAtCompileTime,IsInnerPannel> BlockType;
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
template<typename XprType, typename RowIndices, typename ColIndices, typename StorageKind>
|
||||
class IndexedViewImpl;
|
||||
|
||||
|
||||
/** \class IndexedView
|
||||
* \ingroup Core_Module
|
||||
*
|
||||
* \brief Expression of a non-sequential sub-matrix defined by arbitrary sequences of row and column indices
|
||||
*
|
||||
* \tparam XprType the type of the expression in which we are taking the intersections of sub-rows and sub-columns
|
||||
* \tparam RowIndices the type of the object defining the sequence of row indices
|
||||
* \tparam ColIndices the type of the object defining the sequence of column indices
|
||||
*
|
||||
* This class represents an expression of a sub-matrix (or sub-vector) defined as the intersection
|
||||
* of sub-sets of rows and columns, that are themself defined by generic sequences of row indices \f$ \{r_0,r_1,..r_{m-1}\} \f$
|
||||
* and column indices \f$ \{c_0,c_1,..c_{n-1} \}\f$. Let \f$ A \f$ be the nested matrix, then the resulting matrix \f$ B \f$ has \c m
|
||||
* rows and \c n columns, and its entries are given by: \f$ B(i,j) = A(r_i,c_j) \f$.
|
||||
*
|
||||
* The \c RowIndices and \c ColIndices types must be compatible with the following API:
|
||||
* \code
|
||||
* <integral type> operator[](Index) const;
|
||||
* Index size() const;
|
||||
* \endcode
|
||||
*
|
||||
* Typical supported types thus include:
|
||||
* - std::vector<int>
|
||||
* - std::valarray<int>
|
||||
* - std::array<int>
|
||||
* - Plain C arrays: int[N]
|
||||
* - Eigen::ArrayXi
|
||||
* - decltype(ArrayXi::LinSpaced(...))
|
||||
* - Any view/expressions of the previous types
|
||||
* - Eigen::ArithmeticSequence
|
||||
* - Eigen::internal::AllRange (helper for Eigen::all)
|
||||
* - Eigen::internal::SingleRange (helper for single index)
|
||||
* - etc.
|
||||
*
|
||||
* In typical usages of %Eigen, this class should never be used directly. It is the return type of
|
||||
* DenseBase::operator()(const RowIndices&, const ColIndices&).
|
||||
*
|
||||
* \sa class Block
|
||||
*/
|
||||
template<typename XprType, typename RowIndices, typename ColIndices>
|
||||
class IndexedView : public IndexedViewImpl<XprType, RowIndices, ColIndices, typename internal::traits<XprType>::StorageKind>
|
||||
{
|
||||
public:
|
||||
typedef typename IndexedViewImpl<XprType, RowIndices, ColIndices, typename internal::traits<XprType>::StorageKind>::Base Base;
|
||||
EIGEN_GENERIC_PUBLIC_INTERFACE(IndexedView)
|
||||
EIGEN_INHERIT_ASSIGNMENT_OPERATORS(IndexedView)
|
||||
|
||||
typedef typename internal::ref_selector<XprType>::non_const_type MatrixTypeNested;
|
||||
typedef typename internal::remove_all<XprType>::type NestedExpression;
|
||||
|
||||
template<typename T0, typename T1>
|
||||
IndexedView(XprType& xpr, const T0& rowIndices, const T1& colIndices)
|
||||
: m_xpr(xpr), m_rowIndices(rowIndices), m_colIndices(colIndices)
|
||||
{}
|
||||
|
||||
/** \returns number of rows */
|
||||
Index rows() const { return internal::size(m_rowIndices); }
|
||||
|
||||
/** \returns number of columns */
|
||||
Index cols() const { return internal::size(m_colIndices); }
|
||||
|
||||
/** \returns the nested expression */
|
||||
const typename internal::remove_all<XprType>::type&
|
||||
nestedExpression() const { return m_xpr; }
|
||||
|
||||
/** \returns the nested expression */
|
||||
typename internal::remove_reference<XprType>::type&
|
||||
nestedExpression() { return m_xpr; }
|
||||
|
||||
/** \returns a const reference to the object storing/generating the row indices */
|
||||
const RowIndices& rowIndices() const { return m_rowIndices; }
|
||||
|
||||
/** \returns a const reference to the object storing/generating the column indices */
|
||||
const ColIndices& colIndices() const { return m_colIndices; }
|
||||
|
||||
protected:
|
||||
MatrixTypeNested m_xpr;
|
||||
RowIndices m_rowIndices;
|
||||
ColIndices m_colIndices;
|
||||
};
|
||||
|
||||
|
||||
// Generic API dispatcher
|
||||
template<typename XprType, typename RowIndices, typename ColIndices, typename StorageKind>
|
||||
class IndexedViewImpl
|
||||
: public internal::generic_xpr_base<IndexedView<XprType, RowIndices, ColIndices> >::type
|
||||
{
|
||||
public:
|
||||
typedef typename internal::generic_xpr_base<IndexedView<XprType, RowIndices, ColIndices> >::type Base;
|
||||
};
|
||||
|
||||
namespace internal {
|
||||
|
||||
|
||||
template<typename ArgType, typename RowIndices, typename ColIndices>
|
||||
struct unary_evaluator<IndexedView<ArgType, RowIndices, ColIndices>, IndexBased>
|
||||
: evaluator_base<IndexedView<ArgType, RowIndices, ColIndices> >
|
||||
{
|
||||
typedef IndexedView<ArgType, RowIndices, ColIndices> XprType;
|
||||
|
||||
enum {
|
||||
CoeffReadCost = evaluator<ArgType>::CoeffReadCost /* TODO + cost of row/col index */,
|
||||
|
||||
Flags = (evaluator<ArgType>::Flags & (HereditaryBits /*| LinearAccessBit | DirectAccessBit*/)),
|
||||
|
||||
Alignment = 0
|
||||
};
|
||||
|
||||
EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& xpr) : m_argImpl(xpr.nestedExpression()), m_xpr(xpr)
|
||||
{
|
||||
EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
|
||||
}
|
||||
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
CoeffReturnType coeff(Index row, Index col) const
|
||||
{
|
||||
return m_argImpl.coeff(m_xpr.rowIndices()[row], m_xpr.colIndices()[col]);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
Scalar& coeffRef(Index row, Index col)
|
||||
{
|
||||
return m_argImpl.coeffRef(m_xpr.rowIndices()[row], m_xpr.colIndices()[col]);
|
||||
}
|
||||
|
||||
protected:
|
||||
|
||||
evaluator<ArgType> m_argImpl;
|
||||
const XprType& m_xpr;
|
||||
|
||||
};
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_INDEXED_VIEW_H
|
||||
@@ -1,7 +1,7 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2014-2019 Gael Guennebaud <gael.guennebaud@inria.fr>
|
||||
// Copyright (C) 2014 Gael Guennebaud <gael.guennebaud@inria.fr>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
@@ -44,6 +44,7 @@ class Inverse : public InverseImpl<XprType,typename internal::traits<XprType>::S
|
||||
{
|
||||
public:
|
||||
typedef typename XprType::StorageIndex StorageIndex;
|
||||
typedef typename XprType::PlainObject PlainObject;
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef typename internal::ref_selector<XprType>::type XprTypeNested;
|
||||
typedef typename internal::remove_all<XprTypeNested>::type XprTypeNestedCleaned;
|
||||
@@ -54,8 +55,8 @@ public:
|
||||
: m_xpr(xpr)
|
||||
{}
|
||||
|
||||
EIGEN_DEVICE_FUNC Index rows() const { return m_xpr.cols(); }
|
||||
EIGEN_DEVICE_FUNC Index cols() const { return m_xpr.rows(); }
|
||||
EIGEN_DEVICE_FUNC Index rows() const { return m_xpr.rows(); }
|
||||
EIGEN_DEVICE_FUNC Index cols() const { return m_xpr.cols(); }
|
||||
|
||||
EIGEN_DEVICE_FUNC const XprTypeNestedCleaned& nestedExpression() const { return m_xpr; }
|
||||
|
||||
|
||||
@@ -113,10 +113,10 @@ template<typename PlainObjectType, int MapOptions, typename StrideType> class Ma
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline Index outerStride() const
|
||||
{
|
||||
return StrideType::OuterStrideAtCompileTime != 0 ? m_stride.outer()
|
||||
: internal::traits<Map>::OuterStrideAtCompileTime != Dynamic ? Index(internal::traits<Map>::OuterStrideAtCompileTime)
|
||||
return int(StrideType::OuterStrideAtCompileTime) != 0 ? m_stride.outer()
|
||||
: int(internal::traits<Map>::OuterStrideAtCompileTime) != Dynamic ? Index(internal::traits<Map>::OuterStrideAtCompileTime)
|
||||
: IsVectorAtCompileTime ? (this->size() * innerStride())
|
||||
: int(Flags)&RowMajorBit ? (this->cols() * innerStride())
|
||||
: (int(Flags)&RowMajorBit) ? (this->cols() * innerStride())
|
||||
: (this->rows() * innerStride());
|
||||
}
|
||||
|
||||
|
||||
@@ -14,6 +14,7 @@
|
||||
// TODO this should better be moved to NumTraits
|
||||
#define EIGEN_PI 3.141592653589793238462643383279502884197169399375105820974944592307816406L
|
||||
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
// On WINCE, std::abs is defined for int only, so let's defined our own overloads:
|
||||
@@ -96,7 +97,7 @@ struct real_default_impl<Scalar,true>
|
||||
|
||||
template<typename Scalar> struct real_impl : real_default_impl<Scalar> {};
|
||||
|
||||
#if defined(EIGEN_GPU_COMPILE_PHASE)
|
||||
#ifdef __CUDA_ARCH__
|
||||
template<typename T>
|
||||
struct real_impl<std::complex<T> >
|
||||
{
|
||||
@@ -144,7 +145,7 @@ struct imag_default_impl<Scalar,true>
|
||||
|
||||
template<typename Scalar> struct imag_impl : imag_default_impl<Scalar> {};
|
||||
|
||||
#if defined(EIGEN_GPU_COMPILE_PHASE)
|
||||
#ifdef __CUDA_ARCH__
|
||||
template<typename T>
|
||||
struct imag_impl<std::complex<T> >
|
||||
{
|
||||
@@ -238,7 +239,7 @@ struct imag_ref_retval
|
||||
****************************************************************************/
|
||||
|
||||
template<typename Scalar, bool IsComplex = NumTraits<Scalar>::IsComplex>
|
||||
struct conj_default_impl
|
||||
struct conj_impl
|
||||
{
|
||||
EIGEN_DEVICE_FUNC
|
||||
static inline Scalar run(const Scalar& x)
|
||||
@@ -248,7 +249,7 @@ struct conj_default_impl
|
||||
};
|
||||
|
||||
template<typename Scalar>
|
||||
struct conj_default_impl<Scalar,true>
|
||||
struct conj_impl<Scalar,true>
|
||||
{
|
||||
EIGEN_DEVICE_FUNC
|
||||
static inline Scalar run(const Scalar& x)
|
||||
@@ -258,20 +259,6 @@ struct conj_default_impl<Scalar,true>
|
||||
}
|
||||
};
|
||||
|
||||
template<typename Scalar> struct conj_impl : conj_default_impl<Scalar> {};
|
||||
|
||||
#if defined(EIGEN_GPU_COMPILE_PHASE)
|
||||
template<typename T>
|
||||
struct conj_impl<std::complex<T> >
|
||||
{
|
||||
EIGEN_DEVICE_FUNC
|
||||
static inline std::complex<T> run(const std::complex<T>& x)
|
||||
{
|
||||
return std::complex<T>(x.real(), -x.imag());
|
||||
}
|
||||
};
|
||||
#endif
|
||||
|
||||
template<typename Scalar>
|
||||
struct conj_retval
|
||||
{
|
||||
@@ -300,7 +287,7 @@ struct abs2_impl_default<Scalar, true> // IsComplex
|
||||
EIGEN_DEVICE_FUNC
|
||||
static inline RealScalar run(const Scalar& x)
|
||||
{
|
||||
return x.real()*x.real() + x.imag()*x.imag();
|
||||
return real(x)*real(x) + imag(x)*imag(x);
|
||||
}
|
||||
};
|
||||
|
||||
@@ -326,17 +313,14 @@ struct abs2_retval
|
||||
****************************************************************************/
|
||||
|
||||
template<typename Scalar, bool IsComplex>
|
||||
struct norm1_default_impl;
|
||||
|
||||
template<typename Scalar>
|
||||
struct norm1_default_impl<Scalar,true>
|
||||
struct norm1_default_impl
|
||||
{
|
||||
typedef typename NumTraits<Scalar>::Real RealScalar;
|
||||
EIGEN_DEVICE_FUNC
|
||||
static inline RealScalar run(const Scalar& x)
|
||||
{
|
||||
EIGEN_USING_STD_MATH(abs);
|
||||
return abs(x.real()) + abs(x.imag());
|
||||
return abs(real(x)) + abs(imag(x));
|
||||
}
|
||||
};
|
||||
|
||||
@@ -405,7 +389,7 @@ inline NewType cast(const OldType& x)
|
||||
static inline Scalar run(const Scalar& x)
|
||||
{
|
||||
EIGEN_STATIC_ASSERT((!NumTraits<Scalar>::IsComplex), NUMERIC_TYPE_MUST_BE_REAL)
|
||||
EIGEN_USING_STD_MATH(round);
|
||||
using std::round;
|
||||
return round(x);
|
||||
}
|
||||
};
|
||||
@@ -438,12 +422,7 @@ struct round_retval
|
||||
struct arg_impl {
|
||||
static inline Scalar run(const Scalar& x)
|
||||
{
|
||||
#if defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
// HIP does not seem to have a native device side implementation for the math routine "arg"
|
||||
using std::arg;
|
||||
#else
|
||||
EIGEN_USING_STD_MATH(arg);
|
||||
#endif
|
||||
return arg(x);
|
||||
}
|
||||
};
|
||||
@@ -479,66 +458,6 @@ struct arg_retval
|
||||
typedef typename NumTraits<Scalar>::Real type;
|
||||
};
|
||||
|
||||
/****************************************************************************
|
||||
* Implementation of expm1 *
|
||||
****************************************************************************/
|
||||
|
||||
// This implementation is based on GSL Math's expm1.
|
||||
namespace std_fallback {
|
||||
// fallback expm1 implementation in case there is no expm1(Scalar) function in namespace of Scalar,
|
||||
// or that there is no suitable std::expm1 function available. Implementation
|
||||
// attributed to Kahan. See: http://www.plunk.org/~hatch/rightway.php.
|
||||
template<typename Scalar>
|
||||
EIGEN_DEVICE_FUNC inline Scalar expm1(const Scalar& x) {
|
||||
EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
|
||||
typedef typename NumTraits<Scalar>::Real RealScalar;
|
||||
|
||||
EIGEN_USING_STD_MATH(exp);
|
||||
Scalar u = exp(x);
|
||||
if (numext::equal_strict(u, Scalar(1))) {
|
||||
return x;
|
||||
}
|
||||
Scalar um1 = u - RealScalar(1);
|
||||
if (numext::equal_strict(um1, Scalar(-1))) {
|
||||
return RealScalar(-1);
|
||||
}
|
||||
|
||||
EIGEN_USING_STD_MATH(log);
|
||||
Scalar logu = log(u);
|
||||
return numext::equal_strict(u, logu) ? u : (u - RealScalar(1)) * x / logu;
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Scalar>
|
||||
struct expm1_impl {
|
||||
EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x)
|
||||
{
|
||||
EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
|
||||
#if EIGEN_HAS_CXX11_MATH
|
||||
using std::expm1;
|
||||
#else
|
||||
using std_fallback::expm1;
|
||||
#endif
|
||||
return expm1(x);
|
||||
}
|
||||
};
|
||||
|
||||
// Specialization for complex types that are not supported by std::expm1.
|
||||
template <typename RealScalar>
|
||||
struct expm1_impl<std::complex<RealScalar> > {
|
||||
EIGEN_DEVICE_FUNC static inline std::complex<RealScalar> run(
|
||||
const std::complex<RealScalar>& x) {
|
||||
EIGEN_STATIC_ASSERT_NON_INTEGER(RealScalar)
|
||||
return std_fallback::expm1(x);
|
||||
}
|
||||
};
|
||||
|
||||
template<typename Scalar>
|
||||
struct expm1_retval
|
||||
{
|
||||
typedef Scalar type;
|
||||
};
|
||||
|
||||
/****************************************************************************
|
||||
* Implementation of log1p *
|
||||
****************************************************************************/
|
||||
@@ -552,36 +471,23 @@ namespace std_fallback {
|
||||
typedef typename NumTraits<Scalar>::Real RealScalar;
|
||||
EIGEN_USING_STD_MATH(log);
|
||||
Scalar x1p = RealScalar(1) + x;
|
||||
Scalar log_1p = log(x1p);
|
||||
const bool is_small = numext::equal_strict(x1p, Scalar(1));
|
||||
const bool is_inf = numext::equal_strict(x1p, log_1p);
|
||||
return (is_small || is_inf) ? x : x * (log_1p / (x1p - RealScalar(1)));
|
||||
return numext::equal_strict(x1p, Scalar(1)) ? x : x * ( log(x1p) / (x1p - RealScalar(1)) );
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Scalar>
|
||||
struct log1p_impl {
|
||||
EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x)
|
||||
static inline Scalar run(const Scalar& x)
|
||||
{
|
||||
EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
|
||||
#if EIGEN_HAS_CXX11_MATH
|
||||
using std::log1p;
|
||||
#else
|
||||
using std_fallback::log1p;
|
||||
#endif
|
||||
using std_fallback::log1p;
|
||||
return log1p(x);
|
||||
}
|
||||
};
|
||||
|
||||
// Specialization for complex types that are not supported by std::log1p.
|
||||
template <typename RealScalar>
|
||||
struct log1p_impl<std::complex<RealScalar> > {
|
||||
EIGEN_DEVICE_FUNC static inline std::complex<RealScalar> run(
|
||||
const std::complex<RealScalar>& x) {
|
||||
EIGEN_STATIC_ASSERT_NON_INTEGER(RealScalar)
|
||||
return std_fallback::log1p(x);
|
||||
}
|
||||
};
|
||||
|
||||
template<typename Scalar>
|
||||
struct log1p_retval
|
||||
@@ -756,8 +662,8 @@ struct random_default_impl<Scalar, true, false>
|
||||
{
|
||||
static inline Scalar run(const Scalar& x, const Scalar& y)
|
||||
{
|
||||
return Scalar(random(x.real(), y.real()),
|
||||
random(x.imag(), y.imag()));
|
||||
return Scalar(random(real(x), real(y)),
|
||||
random(imag(x), imag(y)));
|
||||
}
|
||||
static inline Scalar run()
|
||||
{
|
||||
@@ -778,7 +684,7 @@ inline EIGEN_MATHFUNC_RETVAL(random, Scalar) random()
|
||||
return EIGEN_MATHFUNC_IMPL(random, Scalar)::run();
|
||||
}
|
||||
|
||||
// Implementation of is* functions
|
||||
// Implementatin of is* functions
|
||||
|
||||
// std::is* do not work with fast-math and gcc, std::is* are available on MSVC 2013 and newer, as well as in clang.
|
||||
#if (EIGEN_HAS_CXX11_MATH && !(EIGEN_COMP_GNUC_STRICT && __FINITE_MATH_ONLY__)) || (EIGEN_COMP_MSVC>=1800) || (EIGEN_COMP_CLANG)
|
||||
@@ -807,7 +713,7 @@ EIGEN_DEVICE_FUNC
|
||||
typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type
|
||||
isfinite_impl(const T& x)
|
||||
{
|
||||
#if defined(EIGEN_GPU_COMPILE_PHASE)
|
||||
#ifdef __CUDA_ARCH__
|
||||
return (::isfinite)(x);
|
||||
#elif EIGEN_USE_STD_FPCLASSIFY
|
||||
using std::isfinite;
|
||||
@@ -822,7 +728,7 @@ EIGEN_DEVICE_FUNC
|
||||
typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type
|
||||
isinf_impl(const T& x)
|
||||
{
|
||||
#if defined(EIGEN_GPU_COMPILE_PHASE)
|
||||
#ifdef __CUDA_ARCH__
|
||||
return (::isinf)(x);
|
||||
#elif EIGEN_USE_STD_FPCLASSIFY
|
||||
using std::isinf;
|
||||
@@ -837,7 +743,7 @@ EIGEN_DEVICE_FUNC
|
||||
typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type
|
||||
isnan_impl(const T& x)
|
||||
{
|
||||
#if defined(EIGEN_GPU_COMPILE_PHASE)
|
||||
#ifdef __CUDA_ARCH__
|
||||
return (::isnan)(x);
|
||||
#elif EIGEN_USE_STD_FPCLASSIFY
|
||||
using std::isnan;
|
||||
@@ -894,6 +800,7 @@ template<typename T> EIGEN_DEVICE_FUNC bool isnan_impl(const std::complex<T>& x)
|
||||
template<typename T> EIGEN_DEVICE_FUNC bool isinf_impl(const std::complex<T>& x);
|
||||
|
||||
template<typename T> T generic_fast_tanh_float(const T& a_x);
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
/****************************************************************************
|
||||
@@ -902,7 +809,7 @@ template<typename T> T generic_fast_tanh_float(const T& a_x);
|
||||
|
||||
namespace numext {
|
||||
|
||||
#if (!defined(EIGEN_GPUCC) || defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC))
|
||||
#ifndef __CUDA_ARCH__
|
||||
template<typename T>
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_ALWAYS_INLINE T mini(const T& x, const T& y)
|
||||
@@ -931,24 +838,6 @@ EIGEN_ALWAYS_INLINE float mini(const float& x, const float& y)
|
||||
{
|
||||
return fminf(x, y);
|
||||
}
|
||||
template<>
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_ALWAYS_INLINE double mini(const double& x, const double& y)
|
||||
{
|
||||
return fmin(x, y);
|
||||
}
|
||||
template<>
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_ALWAYS_INLINE long double mini(const long double& x, const long double& y)
|
||||
{
|
||||
#if defined(EIGEN_HIPCC)
|
||||
// no "fminl" on HIP yet
|
||||
return (x < y) ? x : y;
|
||||
#else
|
||||
return fminl(x, y);
|
||||
#endif
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_ALWAYS_INLINE T maxi(const T& x, const T& y)
|
||||
@@ -961,92 +850,6 @@ EIGEN_ALWAYS_INLINE float maxi(const float& x, const float& y)
|
||||
{
|
||||
return fmaxf(x, y);
|
||||
}
|
||||
template<>
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_ALWAYS_INLINE double maxi(const double& x, const double& y)
|
||||
{
|
||||
return fmax(x, y);
|
||||
}
|
||||
template<>
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_ALWAYS_INLINE long double maxi(const long double& x, const long double& y)
|
||||
{
|
||||
#if defined(EIGEN_HIPCC)
|
||||
// no "fmaxl" on HIP yet
|
||||
return (x > y) ? x : y;
|
||||
#else
|
||||
return fmaxl(x, y);
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(SYCL_DEVICE_ONLY)
|
||||
|
||||
|
||||
#define SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_BINARY(NAME, FUNC) \
|
||||
SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_char) \
|
||||
SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_short) \
|
||||
SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_int) \
|
||||
SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_long)
|
||||
#define SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_UNARY(NAME, FUNC) \
|
||||
SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_char) \
|
||||
SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_short) \
|
||||
SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_int) \
|
||||
SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_long)
|
||||
#define SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_BINARY(NAME, FUNC) \
|
||||
SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_uchar) \
|
||||
SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_ushort) \
|
||||
SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_uint) \
|
||||
SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_ulong)
|
||||
#define SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_UNARY(NAME, FUNC) \
|
||||
SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_uchar) \
|
||||
SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_ushort) \
|
||||
SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_uint) \
|
||||
SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_ulong)
|
||||
#define SYCL_SPECIALIZE_INTEGER_TYPES_BINARY(NAME, FUNC) \
|
||||
SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_BINARY(NAME, FUNC) \
|
||||
SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_BINARY(NAME, FUNC)
|
||||
#define SYCL_SPECIALIZE_INTEGER_TYPES_UNARY(NAME, FUNC) \
|
||||
SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_UNARY(NAME, FUNC) \
|
||||
SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_UNARY(NAME, FUNC)
|
||||
#define SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(NAME, FUNC) \
|
||||
SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_float) \
|
||||
SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC,cl::sycl::cl_double)
|
||||
#define SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(NAME, FUNC) \
|
||||
SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_float) \
|
||||
SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC,cl::sycl::cl_double)
|
||||
#define SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE(NAME, FUNC, RET_TYPE) \
|
||||
SYCL_SPECIALIZE_GEN_UNARY_FUNC(NAME, FUNC, RET_TYPE, cl::sycl::cl_float) \
|
||||
SYCL_SPECIALIZE_GEN_UNARY_FUNC(NAME, FUNC, RET_TYPE, cl::sycl::cl_double)
|
||||
|
||||
#define SYCL_SPECIALIZE_GEN_UNARY_FUNC(NAME, FUNC, RET_TYPE, ARG_TYPE) \
|
||||
template<> \
|
||||
EIGEN_DEVICE_FUNC \
|
||||
EIGEN_ALWAYS_INLINE RET_TYPE NAME(const ARG_TYPE& x) { \
|
||||
return cl::sycl::FUNC(x); \
|
||||
}
|
||||
|
||||
#define SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, TYPE) \
|
||||
SYCL_SPECIALIZE_GEN_UNARY_FUNC(NAME, FUNC, TYPE, TYPE)
|
||||
|
||||
#define SYCL_SPECIALIZE_GEN1_BINARY_FUNC(NAME, FUNC, RET_TYPE, ARG_TYPE1, ARG_TYPE2) \
|
||||
template<> \
|
||||
EIGEN_DEVICE_FUNC \
|
||||
EIGEN_ALWAYS_INLINE RET_TYPE NAME(const ARG_TYPE1& x, const ARG_TYPE2& y) { \
|
||||
return cl::sycl::FUNC(x, y); \
|
||||
}
|
||||
|
||||
#define SYCL_SPECIALIZE_GEN2_BINARY_FUNC(NAME, FUNC, RET_TYPE, ARG_TYPE) \
|
||||
SYCL_SPECIALIZE_GEN1_BINARY_FUNC(NAME, FUNC, RET_TYPE, ARG_TYPE, ARG_TYPE)
|
||||
|
||||
#define SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, TYPE) \
|
||||
SYCL_SPECIALIZE_GEN2_BINARY_FUNC(NAME, FUNC, TYPE, TYPE)
|
||||
|
||||
SYCL_SPECIALIZE_INTEGER_TYPES_BINARY(mini, min)
|
||||
SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(mini, fmin)
|
||||
SYCL_SPECIALIZE_INTEGER_TYPES_BINARY(maxi, max)
|
||||
SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(maxi, fmax)
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
@@ -1113,9 +916,6 @@ inline EIGEN_MATHFUNC_RETVAL(abs2, Scalar) abs2(const Scalar& x)
|
||||
return EIGEN_MATHFUNC_IMPL(abs2, Scalar)::run(x);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline bool abs2(bool x) { return x; }
|
||||
|
||||
template<typename Scalar>
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline EIGEN_MATHFUNC_RETVAL(norm1, Scalar) norm1(const Scalar& x)
|
||||
@@ -1130,10 +930,6 @@ inline EIGEN_MATHFUNC_RETVAL(hypot, Scalar) hypot(const Scalar& x, const Scalar&
|
||||
return EIGEN_MATHFUNC_IMPL(hypot, Scalar)::run(x, y);
|
||||
}
|
||||
|
||||
#if defined(SYCL_DEVICE_ONLY)
|
||||
SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(hypot, hypot)
|
||||
#endif
|
||||
|
||||
template<typename Scalar>
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline EIGEN_MATHFUNC_RETVAL(log1p, Scalar) log1p(const Scalar& x)
|
||||
@@ -1141,11 +937,7 @@ inline EIGEN_MATHFUNC_RETVAL(log1p, Scalar) log1p(const Scalar& x)
|
||||
return EIGEN_MATHFUNC_IMPL(log1p, Scalar)::run(x);
|
||||
}
|
||||
|
||||
#if defined(SYCL_DEVICE_ONLY)
|
||||
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(log1p, log1p)
|
||||
#endif
|
||||
|
||||
#if defined(EIGEN_GPUCC)
|
||||
#ifdef __CUDACC__
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float log1p(const float &x) { return ::log1pf(x); }
|
||||
|
||||
@@ -1160,20 +952,10 @@ inline typename internal::pow_impl<ScalarX,ScalarY>::result_type pow(const Scala
|
||||
return internal::pow_impl<ScalarX,ScalarY>::run(x, y);
|
||||
}
|
||||
|
||||
#if defined(SYCL_DEVICE_ONLY)
|
||||
SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(pow, pow)
|
||||
#endif
|
||||
|
||||
template<typename T> EIGEN_DEVICE_FUNC bool (isnan) (const T &x) { return internal::isnan_impl(x); }
|
||||
template<typename T> EIGEN_DEVICE_FUNC bool (isinf) (const T &x) { return internal::isinf_impl(x); }
|
||||
template<typename T> EIGEN_DEVICE_FUNC bool (isfinite)(const T &x) { return internal::isfinite_impl(x); }
|
||||
|
||||
#if defined(SYCL_DEVICE_ONLY)
|
||||
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE(isnan, isnan, bool)
|
||||
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE(isinf, isinf, bool)
|
||||
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE(isfinite, isfinite, bool)
|
||||
#endif
|
||||
|
||||
template<typename Scalar>
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline EIGEN_MATHFUNC_RETVAL(round, Scalar) round(const Scalar& x)
|
||||
@@ -1181,10 +963,6 @@ inline EIGEN_MATHFUNC_RETVAL(round, Scalar) round(const Scalar& x)
|
||||
return EIGEN_MATHFUNC_IMPL(round, Scalar)::run(x);
|
||||
}
|
||||
|
||||
#if defined(SYCL_DEVICE_ONLY)
|
||||
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(round, round)
|
||||
#endif
|
||||
|
||||
template<typename T>
|
||||
EIGEN_DEVICE_FUNC
|
||||
T (floor)(const T& x)
|
||||
@@ -1193,11 +971,7 @@ T (floor)(const T& x)
|
||||
return floor(x);
|
||||
}
|
||||
|
||||
#if defined(SYCL_DEVICE_ONLY)
|
||||
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(floor, floor)
|
||||
#endif
|
||||
|
||||
#if defined(EIGEN_GPUCC)
|
||||
#ifdef __CUDACC__
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float floor(const float &x) { return ::floorf(x); }
|
||||
|
||||
@@ -1213,11 +987,7 @@ T (ceil)(const T& x)
|
||||
return ceil(x);
|
||||
}
|
||||
|
||||
#if defined(SYCL_DEVICE_ONLY)
|
||||
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(ceil, ceil)
|
||||
#endif
|
||||
|
||||
#if defined(EIGEN_GPUCC)
|
||||
#ifdef __CUDACC__
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float ceil(const float &x) { return ::ceilf(x); }
|
||||
|
||||
@@ -1258,10 +1028,6 @@ T sqrt(const T &x)
|
||||
return sqrt(x);
|
||||
}
|
||||
|
||||
#if defined(SYCL_DEVICE_ONLY)
|
||||
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(sqrt, sqrt)
|
||||
#endif
|
||||
|
||||
template<typename T>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
T log(const T &x) {
|
||||
@@ -1269,12 +1035,7 @@ T log(const T &x) {
|
||||
return log(x);
|
||||
}
|
||||
|
||||
#if defined(SYCL_DEVICE_ONLY)
|
||||
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(log, log)
|
||||
#endif
|
||||
|
||||
|
||||
#if defined(EIGEN_GPUCC)
|
||||
#ifdef __CUDACC__
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float log(const float &x) { return ::logf(x); }
|
||||
|
||||
@@ -1297,12 +1058,12 @@ abs(const T &x) {
|
||||
return x;
|
||||
}
|
||||
|
||||
#if defined(SYCL_DEVICE_ONLY)
|
||||
SYCL_SPECIALIZE_INTEGER_TYPES_UNARY(abs, abs)
|
||||
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(abs, fabs)
|
||||
#endif
|
||||
#if defined(__SYCL_DEVICE_ONLY__)
|
||||
EIGEN_ALWAYS_INLINE float abs(float x) { return cl::sycl::fabs(x); }
|
||||
EIGEN_ALWAYS_INLINE double abs(double x) { return cl::sycl::fabs(x); }
|
||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||
|
||||
#if defined(EIGEN_GPUCC)
|
||||
#ifdef __CUDACC__
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float abs(const float &x) { return ::fabsf(x); }
|
||||
|
||||
@@ -1327,51 +1088,12 @@ T exp(const T &x) {
|
||||
return exp(x);
|
||||
}
|
||||
|
||||
#if defined(SYCL_DEVICE_ONLY)
|
||||
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(exp, exp)
|
||||
#endif
|
||||
|
||||
#if defined(EIGEN_GPUCC)
|
||||
#ifdef __CUDACC__
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float exp(const float &x) { return ::expf(x); }
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
double exp(const double &x) { return ::exp(x); }
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
std::complex<float> exp(const std::complex<float>& x) {
|
||||
float com = ::expf(x.real());
|
||||
float res_real = com * ::cosf(x.imag());
|
||||
float res_imag = com * ::sinf(x.imag());
|
||||
return std::complex<float>(res_real, res_imag);
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
std::complex<double> exp(const std::complex<double>& x) {
|
||||
double com = ::exp(x.real());
|
||||
double res_real = com * ::cos(x.imag());
|
||||
double res_imag = com * ::sin(x.imag());
|
||||
return std::complex<double>(res_real, res_imag);
|
||||
}
|
||||
#endif
|
||||
|
||||
template<typename Scalar>
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline EIGEN_MATHFUNC_RETVAL(expm1, Scalar) expm1(const Scalar& x)
|
||||
{
|
||||
return EIGEN_MATHFUNC_IMPL(expm1, Scalar)::run(x);
|
||||
}
|
||||
|
||||
#if defined(SYCL_DEVICE_ONLY)
|
||||
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(expm1, expm1)
|
||||
#endif
|
||||
|
||||
#if defined(EIGEN_GPUCC)
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float expm1(const float &x) { return ::expm1f(x); }
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
double expm1(const double &x) { return ::expm1(x); }
|
||||
#endif
|
||||
|
||||
template<typename T>
|
||||
@@ -1381,11 +1103,7 @@ T cos(const T &x) {
|
||||
return cos(x);
|
||||
}
|
||||
|
||||
#if defined(SYCL_DEVICE_ONLY)
|
||||
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(cos,cos)
|
||||
#endif
|
||||
|
||||
#if defined(EIGEN_GPUCC)
|
||||
#ifdef __CUDACC__
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float cos(const float &x) { return ::cosf(x); }
|
||||
|
||||
@@ -1400,11 +1118,7 @@ T sin(const T &x) {
|
||||
return sin(x);
|
||||
}
|
||||
|
||||
#if defined(SYCL_DEVICE_ONLY)
|
||||
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(sin, sin)
|
||||
#endif
|
||||
|
||||
#if defined(EIGEN_GPUCC)
|
||||
#ifdef __CUDACC__
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float sin(const float &x) { return ::sinf(x); }
|
||||
|
||||
@@ -1419,11 +1133,7 @@ T tan(const T &x) {
|
||||
return tan(x);
|
||||
}
|
||||
|
||||
#if defined(SYCL_DEVICE_ONLY)
|
||||
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(tan, tan)
|
||||
#endif
|
||||
|
||||
#if defined(EIGEN_GPUCC)
|
||||
#ifdef __CUDACC__
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float tan(const float &x) { return ::tanf(x); }
|
||||
|
||||
@@ -1438,21 +1148,7 @@ T acos(const T &x) {
|
||||
return acos(x);
|
||||
}
|
||||
|
||||
#if EIGEN_HAS_CXX11_MATH
|
||||
template<typename T>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
T acosh(const T &x) {
|
||||
EIGEN_USING_STD_MATH(acosh);
|
||||
return acosh(x);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(SYCL_DEVICE_ONLY)
|
||||
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(acos, acos)
|
||||
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(acosh, acosh)
|
||||
#endif
|
||||
|
||||
#if defined(EIGEN_GPUCC)
|
||||
#ifdef __CUDACC__
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float acos(const float &x) { return ::acosf(x); }
|
||||
|
||||
@@ -1467,21 +1163,7 @@ T asin(const T &x) {
|
||||
return asin(x);
|
||||
}
|
||||
|
||||
#if EIGEN_HAS_CXX11_MATH
|
||||
template<typename T>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
T asinh(const T &x) {
|
||||
EIGEN_USING_STD_MATH(asinh);
|
||||
return asinh(x);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(SYCL_DEVICE_ONLY)
|
||||
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(asin, asin)
|
||||
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(asinh, asinh)
|
||||
#endif
|
||||
|
||||
#if defined(EIGEN_GPUCC)
|
||||
#ifdef __CUDACC__
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float asin(const float &x) { return ::asinf(x); }
|
||||
|
||||
@@ -1496,21 +1178,7 @@ T atan(const T &x) {
|
||||
return atan(x);
|
||||
}
|
||||
|
||||
#if EIGEN_HAS_CXX11_MATH
|
||||
template<typename T>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
T atanh(const T &x) {
|
||||
EIGEN_USING_STD_MATH(atanh);
|
||||
return atanh(x);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(SYCL_DEVICE_ONLY)
|
||||
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(atan, atan)
|
||||
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(atanh, atanh)
|
||||
#endif
|
||||
|
||||
#if defined(EIGEN_GPUCC)
|
||||
#ifdef __CUDACC__
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float atan(const float &x) { return ::atanf(x); }
|
||||
|
||||
@@ -1526,11 +1194,7 @@ T cosh(const T &x) {
|
||||
return cosh(x);
|
||||
}
|
||||
|
||||
#if defined(SYCL_DEVICE_ONLY)
|
||||
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(cosh, cosh)
|
||||
#endif
|
||||
|
||||
#if defined(EIGEN_GPUCC)
|
||||
#ifdef __CUDACC__
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float cosh(const float &x) { return ::coshf(x); }
|
||||
|
||||
@@ -1545,11 +1209,7 @@ T sinh(const T &x) {
|
||||
return sinh(x);
|
||||
}
|
||||
|
||||
#if defined(SYCL_DEVICE_ONLY)
|
||||
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(sinh, sinh)
|
||||
#endif
|
||||
|
||||
#if defined(EIGEN_GPUCC)
|
||||
#ifdef __CUDACC__
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float sinh(const float &x) { return ::sinhf(x); }
|
||||
|
||||
@@ -1564,16 +1224,12 @@ T tanh(const T &x) {
|
||||
return tanh(x);
|
||||
}
|
||||
|
||||
#if (!defined(EIGEN_GPUCC)) && EIGEN_FAST_MATH && !defined(SYCL_DEVICE_ONLY)
|
||||
#if (!defined(__CUDACC__)) && EIGEN_FAST_MATH
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float tanh(float x) { return internal::generic_fast_tanh_float(x); }
|
||||
#endif
|
||||
|
||||
#if defined(SYCL_DEVICE_ONLY)
|
||||
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(tanh, tanh)
|
||||
#endif
|
||||
|
||||
#if defined(EIGEN_GPUCC)
|
||||
#ifdef __CUDACC__
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float tanh(const float &x) { return ::tanhf(x); }
|
||||
|
||||
@@ -1588,11 +1244,7 @@ T fmod(const T& a, const T& b) {
|
||||
return fmod(a, b);
|
||||
}
|
||||
|
||||
#if defined(SYCL_DEVICE_ONLY)
|
||||
SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(fmod, fmod)
|
||||
#endif
|
||||
|
||||
#if defined(EIGEN_GPUCC)
|
||||
#ifdef __CUDACC__
|
||||
template <>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float fmod(const float& a, const float& b) {
|
||||
@@ -1606,23 +1258,6 @@ double fmod(const double& a, const double& b) {
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(SYCL_DEVICE_ONLY)
|
||||
#undef SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_BINARY
|
||||
#undef SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_UNARY
|
||||
#undef SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_BINARY
|
||||
#undef SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_UNARY
|
||||
#undef SYCL_SPECIALIZE_INTEGER_TYPES_BINARY
|
||||
#undef SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_UNARY
|
||||
#undef SYCL_SPECIALIZE_FLOATING_TYPES_BINARY
|
||||
#undef SYCL_SPECIALIZE_FLOATING_TYPES_UNARY
|
||||
#undef SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE
|
||||
#undef SYCL_SPECIALIZE_GEN_UNARY_FUNC
|
||||
#undef SYCL_SPECIALIZE_UNARY_FUNC
|
||||
#undef SYCL_SPECIALIZE_GEN1_BINARY_FUNC
|
||||
#undef SYCL_SPECIALIZE_GEN2_BINARY_FUNC
|
||||
#undef SYCL_SPECIALIZE_BINARY_FUNC
|
||||
#endif
|
||||
|
||||
} // end namespace numext
|
||||
|
||||
namespace internal {
|
||||
@@ -1751,13 +1386,13 @@ template<> struct random_impl<bool>
|
||||
template<> struct scalar_fuzzy_impl<bool>
|
||||
{
|
||||
typedef bool RealScalar;
|
||||
|
||||
|
||||
template<typename OtherScalar> EIGEN_DEVICE_FUNC
|
||||
static inline bool isMuchSmallerThan(const bool& x, const bool&, const bool&)
|
||||
{
|
||||
return !x;
|
||||
}
|
||||
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
static inline bool isApprox(bool x, bool y, bool)
|
||||
{
|
||||
@@ -1769,10 +1404,10 @@ template<> struct scalar_fuzzy_impl<bool>
|
||||
{
|
||||
return (!x) || y;
|
||||
}
|
||||
|
||||
|
||||
};
|
||||
|
||||
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
@@ -29,7 +29,12 @@ T generic_fast_tanh_float(const T& a_x)
|
||||
// this range is +/-1.0f in single-precision.
|
||||
const T plus_9 = pset1<T>(9.f);
|
||||
const T minus_9 = pset1<T>(-9.f);
|
||||
const T x = pmax(pmin(a_x, plus_9), minus_9);
|
||||
// NOTE GCC prior to 6.3 might improperly optimize this max/min
|
||||
// step such that if a_x is nan, x will be either 9 or -9,
|
||||
// and tanh will return 1 or -1 instead of nan.
|
||||
// This is supposed to be fixed in gcc6.3,
|
||||
// see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
|
||||
const T x = pmax(minus_9,pmin(plus_9,a_x));
|
||||
// The monomial coefficients of the numerator polynomial (odd).
|
||||
const T alpha_1 = pset1<T>(4.89352455891786e-03f);
|
||||
const T alpha_3 = pset1<T>(6.37261928875436e-04f);
|
||||
@@ -67,14 +72,14 @@ T generic_fast_tanh_float(const T& a_x)
|
||||
}
|
||||
|
||||
template<typename RealScalar>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
EIGEN_STRONG_INLINE
|
||||
RealScalar positive_real_hypot(const RealScalar& x, const RealScalar& y)
|
||||
{
|
||||
EIGEN_USING_STD_MATH(sqrt);
|
||||
RealScalar p, qp;
|
||||
p = numext::maxi(x,y);
|
||||
if(p==RealScalar(0)) return RealScalar(0);
|
||||
qp = numext::mini(y,x) / p;
|
||||
qp = numext::mini(y,x) / p;
|
||||
return p * sqrt(RealScalar(1) + qp*qp);
|
||||
}
|
||||
|
||||
@@ -82,8 +87,7 @@ template<typename Scalar>
|
||||
struct hypot_impl
|
||||
{
|
||||
typedef typename NumTraits<Scalar>::Real RealScalar;
|
||||
static EIGEN_DEVICE_FUNC
|
||||
inline RealScalar run(const Scalar& x, const Scalar& y)
|
||||
static inline RealScalar run(const Scalar& x, const Scalar& y)
|
||||
{
|
||||
EIGEN_USING_STD_MATH(abs);
|
||||
return positive_real_hypot<RealScalar>(abs(x), abs(y));
|
||||
|
||||
@@ -255,27 +255,27 @@ class Matrix
|
||||
*
|
||||
* \sa resize(Index,Index)
|
||||
*/
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
Matrix() : Base()
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE Matrix() : Base()
|
||||
{
|
||||
Base::_check_template_params();
|
||||
EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
|
||||
}
|
||||
|
||||
// FIXME is it still needed
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
EIGEN_DEVICE_FUNC
|
||||
explicit Matrix(internal::constructor_without_unaligned_array_assert)
|
||||
: Base(internal::constructor_without_unaligned_array_assert())
|
||||
{ Base::_check_template_params(); EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED }
|
||||
|
||||
#if EIGEN_HAS_RVALUE_REFERENCES
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
EIGEN_DEVICE_FUNC
|
||||
Matrix(Matrix&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_constructible<Scalar>::value)
|
||||
: Base(std::move(other))
|
||||
{
|
||||
Base::_check_template_params();
|
||||
}
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
EIGEN_DEVICE_FUNC
|
||||
Matrix& operator=(Matrix&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_assignable<Scalar>::value)
|
||||
{
|
||||
other.swap(*this);
|
||||
@@ -283,65 +283,25 @@ class Matrix
|
||||
}
|
||||
#endif
|
||||
|
||||
#if EIGEN_HAS_CXX11
|
||||
/** \copydoc PlainObjectBase(const Scalar&, const Scalar&, const Scalar&, const Scalar&, const ArgTypes&... args)
|
||||
*
|
||||
* Example: \include Matrix_variadic_ctor_cxx11.cpp
|
||||
* Output: \verbinclude Matrix_variadic_ctor_cxx11.out
|
||||
*
|
||||
* \sa Matrix(const std::initializer_list<std::initializer_list<Scalar>>&)
|
||||
*/
|
||||
template <typename... ArgTypes>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
Matrix(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args)
|
||||
: Base(a0, a1, a2, a3, args...) {}
|
||||
|
||||
/** \brief Constructs a Matrix and initializes it from the coefficients given as initializer-lists grouped by row. \cpp11
|
||||
*
|
||||
* In the general case, the constructor takes a list of rows, each row being represented as a list of coefficients:
|
||||
*
|
||||
* Example: \include Matrix_initializer_list_23_cxx11.cpp
|
||||
* Output: \verbinclude Matrix_initializer_list_23_cxx11.out
|
||||
*
|
||||
* Each of the inner initializer lists must contain the exact same number of elements, otherwise an assertion is triggered.
|
||||
*
|
||||
* In the case of a compile-time column vector, implicit transposition from a single row is allowed.
|
||||
* Therefore <code>VectorXd{{1,2,3,4,5}}</code> is legal and the more verbose syntax
|
||||
* <code>RowVectorXd{{1},{2},{3},{4},{5}}</code> can be avoided:
|
||||
*
|
||||
* Example: \include Matrix_initializer_list_vector_cxx11.cpp
|
||||
* Output: \verbinclude Matrix_initializer_list_vector_cxx11.out
|
||||
*
|
||||
* In the case of fixed-sized matrices, the initializer list sizes must exactly match the matrix sizes,
|
||||
* and implicit transposition is allowed for compile-time vectors only.
|
||||
*
|
||||
* \sa Matrix(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args)
|
||||
*/
|
||||
EIGEN_DEVICE_FUNC
|
||||
explicit EIGEN_STRONG_INLINE Matrix(const std::initializer_list<std::initializer_list<Scalar>>& list) : Base(list) {}
|
||||
#endif // end EIGEN_HAS_CXX11
|
||||
|
||||
#ifndef EIGEN_PARSED_BY_DOXYGEN
|
||||
#ifndef EIGEN_PARSED_BY_DOXYGEN
|
||||
|
||||
// This constructor is for both 1x1 matrices and dynamic vectors
|
||||
template<typename T>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
explicit Matrix(const T& x)
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE explicit Matrix(const T& x)
|
||||
{
|
||||
Base::_check_template_params();
|
||||
Base::template _init1<T>(x);
|
||||
}
|
||||
|
||||
template<typename T0, typename T1>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
Matrix(const T0& x, const T1& y)
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE Matrix(const T0& x, const T1& y)
|
||||
{
|
||||
Base::_check_template_params();
|
||||
Base::template _init2<T0,T1>(x, y);
|
||||
}
|
||||
|
||||
|
||||
#else
|
||||
#else
|
||||
/** \brief Constructs a fixed-sized matrix initialized with coefficients starting at \a data */
|
||||
EIGEN_DEVICE_FUNC
|
||||
explicit Matrix(const Scalar *data);
|
||||
@@ -359,8 +319,7 @@ class Matrix
|
||||
* \c EIGEN_INITIALIZE_MATRICES_BY_{ZERO,\c NAN} macros (see \ref TopicPreprocessorDirectives).
|
||||
*/
|
||||
EIGEN_STRONG_INLINE explicit Matrix(Index dim);
|
||||
/** \brief Constructs an initialized 1x1 matrix with the given coefficient
|
||||
* \sa Matrix(const Scalar&, const Scalar&, const Scalar&, const Scalar&, const ArgTypes&...) */
|
||||
/** \brief Constructs an initialized 1x1 matrix with the given coefficient */
|
||||
Matrix(const Scalar& x);
|
||||
/** \brief Constructs an uninitialized matrix with \a rows rows and \a cols columns.
|
||||
*
|
||||
@@ -377,14 +336,11 @@ class Matrix
|
||||
EIGEN_DEVICE_FUNC
|
||||
Matrix(Index rows, Index cols);
|
||||
|
||||
/** \brief Constructs an initialized 2D vector with given coefficients
|
||||
* \sa Matrix(const Scalar&, const Scalar&, const Scalar&, const Scalar&, const ArgTypes&...) */
|
||||
/** \brief Constructs an initialized 2D vector with given coefficients */
|
||||
Matrix(const Scalar& x, const Scalar& y);
|
||||
#endif // end EIGEN_PARSED_BY_DOXYGEN
|
||||
#endif
|
||||
|
||||
/** \brief Constructs an initialized 3D vector with given coefficients
|
||||
* \sa Matrix(const Scalar&, const Scalar&, const Scalar&, const Scalar&, const ArgTypes&...)
|
||||
*/
|
||||
/** \brief Constructs an initialized 3D vector with given coefficients */
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE Matrix(const Scalar& x, const Scalar& y, const Scalar& z)
|
||||
{
|
||||
@@ -394,9 +350,7 @@ class Matrix
|
||||
m_storage.data()[1] = y;
|
||||
m_storage.data()[2] = z;
|
||||
}
|
||||
/** \brief Constructs an initialized 4D vector with given coefficients
|
||||
* \sa Matrix(const Scalar&, const Scalar&, const Scalar&, const Scalar&, const ArgTypes&...)
|
||||
*/
|
||||
/** \brief Constructs an initialized 4D vector with given coefficients */
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE Matrix(const Scalar& x, const Scalar& y, const Scalar& z, const Scalar& w)
|
||||
{
|
||||
@@ -451,7 +405,7 @@ class Matrix
|
||||
*
|
||||
* \ingroup Core_Module
|
||||
*
|
||||
* %Eigen defines several typedef shortcuts for most common matrix and vector types.
|
||||
* Eigen defines several typedef shortcuts for most common matrix and vector types.
|
||||
*
|
||||
* The general patterns are the following:
|
||||
*
|
||||
@@ -463,15 +417,6 @@ class Matrix
|
||||
*
|
||||
* There are also \c VectorSizeType and \c RowVectorSizeType which are self-explanatory. For example, \c Vector4cf is
|
||||
* a fixed-size vector of 4 complex floats.
|
||||
*
|
||||
* With \cpp11, template alias are also defined for common sizes.
|
||||
* They follow the same pattern as above except that the scalar type suffix is replaced by a
|
||||
* template parameter, i.e.:
|
||||
* - `MatrixSize<Type>` where `Size` can be \c 2,\c 3,\c 4 for fixed size square matrices or \c X for dynamic size.
|
||||
* - `MatrixXSize<Type>` and `MatrixSizeX<Type>` where `Size` can be \c 2,\c 3,\c 4 for hybrid dynamic/fixed matrices.
|
||||
* - `VectorSize<Type>` and `RowVectorSize<Type>` for column and row vectors.
|
||||
*
|
||||
* With \cpp11, you can also use fully generic column and row vector types: `Vector<Type,Size>` and `RowVector<Type,Size>`.
|
||||
*
|
||||
* \sa class Matrix
|
||||
*/
|
||||
@@ -509,55 +454,6 @@ EIGEN_MAKE_TYPEDEFS_ALL_SIZES(std::complex<double>, cd)
|
||||
#undef EIGEN_MAKE_TYPEDEFS
|
||||
#undef EIGEN_MAKE_FIXED_TYPEDEFS
|
||||
|
||||
#if EIGEN_HAS_CXX11
|
||||
|
||||
#define EIGEN_MAKE_TYPEDEFS(Size, SizeSuffix) \
|
||||
/** \ingroup matrixtypedefs */ \
|
||||
/** \brief \cpp11 */ \
|
||||
template <typename Type> \
|
||||
using Matrix##SizeSuffix = Matrix<Type, Size, Size>; \
|
||||
/** \ingroup matrixtypedefs */ \
|
||||
/** \brief \cpp11 */ \
|
||||
template <typename Type> \
|
||||
using Vector##SizeSuffix = Matrix<Type, Size, 1>; \
|
||||
/** \ingroup matrixtypedefs */ \
|
||||
/** \brief \cpp11 */ \
|
||||
template <typename Type> \
|
||||
using RowVector##SizeSuffix = Matrix<Type, 1, Size>;
|
||||
|
||||
#define EIGEN_MAKE_FIXED_TYPEDEFS(Size) \
|
||||
/** \ingroup matrixtypedefs */ \
|
||||
/** \brief \cpp11 */ \
|
||||
template <typename Type> \
|
||||
using Matrix##Size##X = Matrix<Type, Size, Dynamic>; \
|
||||
/** \ingroup matrixtypedefs */ \
|
||||
/** \brief \cpp11 */ \
|
||||
template <typename Type> \
|
||||
using Matrix##X##Size = Matrix<Type, Dynamic, Size>;
|
||||
|
||||
EIGEN_MAKE_TYPEDEFS(2, 2)
|
||||
EIGEN_MAKE_TYPEDEFS(3, 3)
|
||||
EIGEN_MAKE_TYPEDEFS(4, 4)
|
||||
EIGEN_MAKE_TYPEDEFS(Dynamic, X)
|
||||
EIGEN_MAKE_FIXED_TYPEDEFS(2)
|
||||
EIGEN_MAKE_FIXED_TYPEDEFS(3)
|
||||
EIGEN_MAKE_FIXED_TYPEDEFS(4)
|
||||
|
||||
/** \ingroup matrixtypedefs
|
||||
* \brief \cpp11 */
|
||||
template <typename Type, int Size>
|
||||
using Vector = Matrix<Type, Size, 1>;
|
||||
|
||||
/** \ingroup matrixtypedefs
|
||||
* \brief \cpp11 */
|
||||
template <typename Type, int Size>
|
||||
using RowVector = Matrix<Type, 1, Size>;
|
||||
|
||||
#undef EIGEN_MAKE_TYPEDEFS
|
||||
#undef EIGEN_MAKE_FIXED_TYPEDEFS
|
||||
|
||||
#endif // EIGEN_HAS_CXX11
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_MATRIX_H
|
||||
|
||||
@@ -76,7 +76,6 @@ template<typename Derived> class MatrixBase
|
||||
using Base::coeffRef;
|
||||
using Base::lazyAssign;
|
||||
using Base::eval;
|
||||
using Base::operator-;
|
||||
using Base::operator+=;
|
||||
using Base::operator-=;
|
||||
using Base::operator*=;
|
||||
@@ -123,6 +122,7 @@ template<typename Derived> class MatrixBase
|
||||
|
||||
#define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::MatrixBase
|
||||
#define EIGEN_DOC_UNARY_ADDONS(X,Y)
|
||||
# include "../plugins/CommonCwiseUnaryOps.h"
|
||||
# include "../plugins/CommonCwiseBinaryOps.h"
|
||||
# include "../plugins/MatrixCwiseUnaryOps.h"
|
||||
# include "../plugins/MatrixCwiseBinaryOps.h"
|
||||
@@ -268,8 +268,6 @@ template<typename Derived> class MatrixBase
|
||||
Derived& setIdentity();
|
||||
EIGEN_DEVICE_FUNC
|
||||
Derived& setIdentity(Index rows, Index cols);
|
||||
EIGEN_DEVICE_FUNC Derived& setUnit(Index i);
|
||||
EIGEN_DEVICE_FUNC Derived& setUnit(Index newSize, Index i);
|
||||
|
||||
bool isIdentity(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
|
||||
bool isDiagonal(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
|
||||
@@ -298,7 +296,7 @@ template<typename Derived> class MatrixBase
|
||||
EIGEN_DEVICE_FUNC inline bool operator!=(const MatrixBase<OtherDerived>& other) const
|
||||
{ return cwiseNotEqual(other).any(); }
|
||||
|
||||
NoAlias<Derived,Eigen::MatrixBase > EIGEN_DEVICE_FUNC noalias();
|
||||
NoAlias<Derived,Eigen::MatrixBase > noalias();
|
||||
|
||||
// TODO forceAlignedAccess is temporarily disabled
|
||||
// Need to find a nicer workaround.
|
||||
@@ -328,7 +326,6 @@ template<typename Derived> class MatrixBase
|
||||
|
||||
inline const PartialPivLU<PlainObject> lu() const;
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline const Inverse<Derived> inverse() const;
|
||||
|
||||
template<typename ResultType>
|
||||
@@ -338,15 +335,12 @@ template<typename Derived> class MatrixBase
|
||||
bool& invertible,
|
||||
const RealScalar& absDeterminantThreshold = NumTraits<Scalar>::dummy_precision()
|
||||
) const;
|
||||
|
||||
template<typename ResultType>
|
||||
inline void computeInverseWithCheck(
|
||||
ResultType& inverse,
|
||||
bool& invertible,
|
||||
const RealScalar& absDeterminantThreshold = NumTraits<Scalar>::dummy_precision()
|
||||
) const;
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
Scalar determinant() const;
|
||||
|
||||
/////////// Cholesky module ///////////
|
||||
@@ -418,19 +412,15 @@ template<typename Derived> class MatrixBase
|
||||
|
||||
////////// Householder module ///////////
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
void makeHouseholderInPlace(Scalar& tau, RealScalar& beta);
|
||||
template<typename EssentialPart>
|
||||
EIGEN_DEVICE_FUNC
|
||||
void makeHouseholder(EssentialPart& essential,
|
||||
Scalar& tau, RealScalar& beta) const;
|
||||
template<typename EssentialPart>
|
||||
EIGEN_DEVICE_FUNC
|
||||
void applyHouseholderOnTheLeft(const EssentialPart& essential,
|
||||
const Scalar& tau,
|
||||
Scalar* workspace);
|
||||
template<typename EssentialPart>
|
||||
EIGEN_DEVICE_FUNC
|
||||
void applyHouseholderOnTheRight(const EssentialPart& essential,
|
||||
const Scalar& tau,
|
||||
Scalar* workspace);
|
||||
@@ -438,10 +428,8 @@ template<typename Derived> class MatrixBase
|
||||
///////// Jacobi module /////////
|
||||
|
||||
template<typename OtherScalar>
|
||||
EIGEN_DEVICE_FUNC
|
||||
void applyOnTheLeft(Index p, Index q, const JacobiRotation<OtherScalar>& j);
|
||||
template<typename OtherScalar>
|
||||
EIGEN_DEVICE_FUNC
|
||||
void applyOnTheRight(Index p, Index q, const JacobiRotation<OtherScalar>& j);
|
||||
|
||||
///////// SparseCore module /////////
|
||||
@@ -468,11 +456,6 @@ template<typename Derived> class MatrixBase
|
||||
const MatrixFunctionReturnValue<Derived> matrixFunction(StemFunction f) const;
|
||||
EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, cosh, hyperbolic cosine)
|
||||
EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, sinh, hyperbolic sine)
|
||||
#if EIGEN_HAS_CXX11_MATH
|
||||
EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, atanh, inverse hyperbolic cosine)
|
||||
EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, acosh, inverse hyperbolic cosine)
|
||||
EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, asinh, inverse hyperbolic sine)
|
||||
#endif
|
||||
EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, cos, cosine)
|
||||
EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, sin, sine)
|
||||
EIGEN_MATRIX_FUNCTION(MatrixSquareRootReturnValue, sqrt, square root)
|
||||
|
||||
@@ -16,11 +16,7 @@ namespace Eigen {
|
||||
namespace internal {
|
||||
template<typename ExpressionType>
|
||||
struct traits<NestByValue<ExpressionType> > : public traits<ExpressionType>
|
||||
{
|
||||
enum {
|
||||
Flags = traits<ExpressionType>::Flags & ~NestByRefBit
|
||||
};
|
||||
};
|
||||
{};
|
||||
}
|
||||
|
||||
/** \class NestByValue
|
||||
@@ -47,11 +43,55 @@ template<typename ExpressionType> class NestByValue
|
||||
|
||||
EIGEN_DEVICE_FUNC inline Index rows() const { return m_expression.rows(); }
|
||||
EIGEN_DEVICE_FUNC inline Index cols() const { return m_expression.cols(); }
|
||||
EIGEN_DEVICE_FUNC inline Index outerStride() const { return m_expression.outerStride(); }
|
||||
EIGEN_DEVICE_FUNC inline Index innerStride() const { return m_expression.innerStride(); }
|
||||
|
||||
EIGEN_DEVICE_FUNC inline const CoeffReturnType coeff(Index row, Index col) const
|
||||
{
|
||||
return m_expression.coeff(row, col);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index row, Index col)
|
||||
{
|
||||
return m_expression.const_cast_derived().coeffRef(row, col);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC inline const CoeffReturnType coeff(Index index) const
|
||||
{
|
||||
return m_expression.coeff(index);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index index)
|
||||
{
|
||||
return m_expression.const_cast_derived().coeffRef(index);
|
||||
}
|
||||
|
||||
template<int LoadMode>
|
||||
inline const PacketScalar packet(Index row, Index col) const
|
||||
{
|
||||
return m_expression.template packet<LoadMode>(row, col);
|
||||
}
|
||||
|
||||
template<int LoadMode>
|
||||
inline void writePacket(Index row, Index col, const PacketScalar& x)
|
||||
{
|
||||
m_expression.const_cast_derived().template writePacket<LoadMode>(row, col, x);
|
||||
}
|
||||
|
||||
template<int LoadMode>
|
||||
inline const PacketScalar packet(Index index) const
|
||||
{
|
||||
return m_expression.template packet<LoadMode>(index);
|
||||
}
|
||||
|
||||
template<int LoadMode>
|
||||
inline void writePacket(Index index, const PacketScalar& x)
|
||||
{
|
||||
m_expression.const_cast_derived().template writePacket<LoadMode>(index, x);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC operator const ExpressionType&() const { return m_expression; }
|
||||
|
||||
EIGEN_DEVICE_FUNC const ExpressionType& nestedExpression() const { return m_expression; }
|
||||
|
||||
protected:
|
||||
const ExpressionType m_expression;
|
||||
};
|
||||
@@ -59,27 +99,12 @@ template<typename ExpressionType> class NestByValue
|
||||
/** \returns an expression of the temporary version of *this.
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_DEVICE_FUNC inline const NestByValue<Derived>
|
||||
inline const NestByValue<Derived>
|
||||
DenseBase<Derived>::nestByValue() const
|
||||
{
|
||||
return NestByValue<Derived>(derived());
|
||||
}
|
||||
|
||||
namespace internal {
|
||||
|
||||
// Evaluator of Solve -> eval into a temporary
|
||||
template<typename ArgType>
|
||||
struct evaluator<NestByValue<ArgType> >
|
||||
: public evaluator<ArgType>
|
||||
{
|
||||
typedef evaluator<ArgType> Base;
|
||||
|
||||
EIGEN_DEVICE_FUNC explicit evaluator(const NestByValue<ArgType>& xpr)
|
||||
: Base(xpr.nestedExpression())
|
||||
{}
|
||||
};
|
||||
}
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_NESTBYVALUE_H
|
||||
|
||||
@@ -33,7 +33,6 @@ class NoAlias
|
||||
public:
|
||||
typedef typename ExpressionType::Scalar Scalar;
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
explicit NoAlias(ExpressionType& expression) : m_expression(expression) {}
|
||||
|
||||
template<typename OtherDerived>
|
||||
@@ -75,10 +74,10 @@ class NoAlias
|
||||
*
|
||||
* More precisely, noalias() allows to bypass the EvalBeforeAssignBit flag.
|
||||
* Currently, even though several expressions may alias, only product
|
||||
* expressions have this flag. Therefore, noalias() is only useful when
|
||||
* expressions have this flag. Therefore, noalias() is only usefull when
|
||||
* the source expression contains a matrix product.
|
||||
*
|
||||
* Here are some examples where noalias is useful:
|
||||
* Here are some examples where noalias is usefull:
|
||||
* \code
|
||||
* D.noalias() = A * B;
|
||||
* D.noalias() += A.transpose() * B;
|
||||
@@ -99,7 +98,7 @@ class NoAlias
|
||||
* \sa class NoAlias
|
||||
*/
|
||||
template<typename Derived>
|
||||
NoAlias<Derived,MatrixBase> EIGEN_DEVICE_FUNC MatrixBase<Derived>::noalias()
|
||||
NoAlias<Derived,MatrixBase> MatrixBase<Derived>::noalias()
|
||||
{
|
||||
return NoAlias<Derived, Eigen::MatrixBase >(derived());
|
||||
}
|
||||
|
||||
@@ -21,14 +21,12 @@ template< typename T,
|
||||
bool is_integer = NumTraits<T>::IsInteger>
|
||||
struct default_digits10_impl
|
||||
{
|
||||
EIGEN_DEVICE_FUNC
|
||||
static int run() { return std::numeric_limits<T>::digits10; }
|
||||
};
|
||||
|
||||
template<typename T>
|
||||
struct default_digits10_impl<T,false,false> // Floating point
|
||||
{
|
||||
EIGEN_DEVICE_FUNC
|
||||
static int run() {
|
||||
using std::log10;
|
||||
using std::ceil;
|
||||
@@ -40,38 +38,6 @@ struct default_digits10_impl<T,false,false> // Floating point
|
||||
template<typename T>
|
||||
struct default_digits10_impl<T,false,true> // Integer
|
||||
{
|
||||
EIGEN_DEVICE_FUNC
|
||||
static int run() { return 0; }
|
||||
};
|
||||
|
||||
|
||||
// default implementation of digits(), based on numeric_limits if specialized,
|
||||
// 0 for integer types, and log2(epsilon()) otherwise.
|
||||
template< typename T,
|
||||
bool use_numeric_limits = std::numeric_limits<T>::is_specialized,
|
||||
bool is_integer = NumTraits<T>::IsInteger>
|
||||
struct default_digits_impl
|
||||
{
|
||||
EIGEN_DEVICE_FUNC
|
||||
static int run() { return std::numeric_limits<T>::digits; }
|
||||
};
|
||||
|
||||
template<typename T>
|
||||
struct default_digits_impl<T,false,false> // Floating point
|
||||
{
|
||||
EIGEN_DEVICE_FUNC
|
||||
static int run() {
|
||||
using std::log;
|
||||
using std::ceil;
|
||||
typedef typename NumTraits<T>::Real Real;
|
||||
return int(ceil(-log(NumTraits<Real>::epsilon())/log(static_cast<Real>(2))));
|
||||
}
|
||||
};
|
||||
|
||||
template<typename T>
|
||||
struct default_digits_impl<T,false,true> // Integer
|
||||
{
|
||||
EIGEN_DEVICE_FUNC
|
||||
static int run() { return 0; }
|
||||
};
|
||||
|
||||
@@ -105,7 +71,7 @@ struct default_digits_impl<T,false,true> // Integer
|
||||
* and to \c 0 otherwise.
|
||||
* \li Enum values ReadCost, AddCost and MulCost representing a rough estimate of the number of CPU cycles needed
|
||||
* to by move / add / mul instructions respectively, assuming the data is already stored in CPU registers.
|
||||
* Stay vague here. No need to do architecture-specific stuff. If you don't know what this means, just use \c Eigen::HugeCost.
|
||||
* Stay vague here. No need to do architecture-specific stuff.
|
||||
* \li An enum value \a IsSigned. It is equal to \c 1 if \a T is a signed type and to 0 if \a T is unsigned.
|
||||
* \li An enum value \a RequireInitialization. It is equal to \c 1 if the constructor of the numeric type \a T must
|
||||
* be called, and to 0 if it is safe not to call it. Default is 0 if \a T is an arithmetic type, and 1 otherwise.
|
||||
@@ -152,12 +118,6 @@ template<typename T> struct GenericNumTraits
|
||||
return internal::default_digits10_impl<T>::run();
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
static inline int digits()
|
||||
{
|
||||
return internal::default_digits_impl<T>::run();
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
static inline Real dummy_precision()
|
||||
{
|
||||
@@ -173,8 +133,7 @@ template<typename T> struct GenericNumTraits
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
static inline T lowest() {
|
||||
return IsInteger ? (numext::numeric_limits<T>::min)()
|
||||
: static_cast<T>(-(numext::numeric_limits<T>::max)());
|
||||
return IsInteger ? (numext::numeric_limits<T>::min)() : (-(numext::numeric_limits<T>::max)());
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
|
||||
@@ -1,232 +0,0 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2011-2018 Gael Guennebaud <gael.guennebaud@inria.fr>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#ifndef EIGEN_PARTIALREDUX_H
|
||||
#define EIGEN_PARTIALREDUX_H
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
namespace internal {
|
||||
|
||||
|
||||
/***************************************************************************
|
||||
*
|
||||
* This file provides evaluators for partial reductions.
|
||||
* There are two modes:
|
||||
*
|
||||
* - scalar path: simply calls the respective function on the column or row.
|
||||
* -> nothing special here, all the tricky part is handled by the return
|
||||
* types of VectorwiseOp's members. They embed the functor calling the
|
||||
* respective DenseBase's member function.
|
||||
*
|
||||
* - vectorized path: implements a packet-wise reductions followed by
|
||||
* some (optional) processing of the outcome, e.g., division by n for mean.
|
||||
*
|
||||
* For the vectorized path let's observe that the packet-size and outer-unrolling
|
||||
* are both decided by the assignement logic. So all we have to do is to decide
|
||||
* on the inner unrolling.
|
||||
*
|
||||
* For the unrolling, we can reuse "internal::redux_vec_unroller" from Redux.h,
|
||||
* but be need to be careful to specify correct increment.
|
||||
*
|
||||
***************************************************************************/
|
||||
|
||||
|
||||
/* logic deciding a strategy for unrolling of vectorized paths */
|
||||
template<typename Func, typename Evaluator>
|
||||
struct packetwise_redux_traits
|
||||
{
|
||||
enum {
|
||||
OuterSize = int(Evaluator::IsRowMajor) ? Evaluator::RowsAtCompileTime : Evaluator::ColsAtCompileTime,
|
||||
Cost = OuterSize == Dynamic ? HugeCost
|
||||
: OuterSize * Evaluator::CoeffReadCost + (OuterSize-1) * functor_traits<Func>::Cost,
|
||||
Unrolling = Cost <= EIGEN_UNROLLING_LIMIT ? CompleteUnrolling : NoUnrolling
|
||||
};
|
||||
|
||||
};
|
||||
|
||||
/* Value to be returned when size==0 , by default let's return 0 */
|
||||
template<typename PacketType,typename Func>
|
||||
EIGEN_DEVICE_FUNC
|
||||
PacketType packetwise_redux_empty_value(const Func& ) { return pset1<PacketType>(0); }
|
||||
|
||||
/* For products the default is 1 */
|
||||
template<typename PacketType,typename Scalar>
|
||||
EIGEN_DEVICE_FUNC
|
||||
PacketType packetwise_redux_empty_value(const scalar_product_op<Scalar,Scalar>& ) { return pset1<PacketType>(1); }
|
||||
|
||||
/* Perform the actual reduction */
|
||||
template<typename Func, typename Evaluator,
|
||||
int Unrolling = packetwise_redux_traits<Func, Evaluator>::Unrolling
|
||||
>
|
||||
struct packetwise_redux_impl;
|
||||
|
||||
/* Perform the actual reduction with unrolling */
|
||||
template<typename Func, typename Evaluator>
|
||||
struct packetwise_redux_impl<Func, Evaluator, CompleteUnrolling>
|
||||
{
|
||||
typedef redux_novec_unroller<Func,Evaluator, 0, Evaluator::SizeAtCompileTime> Base;
|
||||
typedef typename Evaluator::Scalar Scalar;
|
||||
|
||||
template<typename PacketType>
|
||||
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE
|
||||
PacketType run(const Evaluator &eval, const Func& func, Index /*size*/)
|
||||
{
|
||||
return redux_vec_unroller<Func, Evaluator, 0, packetwise_redux_traits<Func, Evaluator>::OuterSize>::template run<PacketType>(eval,func);
|
||||
}
|
||||
};
|
||||
|
||||
/* Add a specialization of redux_vec_unroller for size==0 at compiletime.
|
||||
* This specialization is not required for general reductions, which is
|
||||
* why it is defined here.
|
||||
*/
|
||||
template<typename Func, typename Evaluator, int Start>
|
||||
struct redux_vec_unroller<Func, Evaluator, Start, 0>
|
||||
{
|
||||
template<typename PacketType>
|
||||
EIGEN_DEVICE_FUNC
|
||||
static EIGEN_STRONG_INLINE PacketType run(const Evaluator &, const Func& f)
|
||||
{
|
||||
return packetwise_redux_empty_value<PacketType>(f);
|
||||
}
|
||||
};
|
||||
|
||||
/* Perform the actual reduction for dynamic sizes */
|
||||
template<typename Func, typename Evaluator>
|
||||
struct packetwise_redux_impl<Func, Evaluator, NoUnrolling>
|
||||
{
|
||||
typedef typename Evaluator::Scalar Scalar;
|
||||
typedef typename redux_traits<Func, Evaluator>::PacketType PacketScalar;
|
||||
|
||||
template<typename PacketType>
|
||||
EIGEN_DEVICE_FUNC
|
||||
static PacketType run(const Evaluator &eval, const Func& func, Index size)
|
||||
{
|
||||
if(size==0)
|
||||
return packetwise_redux_empty_value<PacketType>(func);
|
||||
|
||||
const Index size4 = (size-1)&(~3);
|
||||
PacketType p = eval.template packetByOuterInner<Unaligned,PacketType>(0,0);
|
||||
Index i = 1;
|
||||
// This loop is optimized for instruction pipelining:
|
||||
// - each iteration generates two independent instructions
|
||||
// - thanks to branch prediction and out-of-order execution we have independent instructions across loops
|
||||
for(; i<size4; i+=4)
|
||||
p = func.packetOp(p,
|
||||
func.packetOp(
|
||||
func.packetOp(eval.template packetByOuterInner<Unaligned,PacketType>(i+0,0),eval.template packetByOuterInner<Unaligned,PacketType>(i+1,0)),
|
||||
func.packetOp(eval.template packetByOuterInner<Unaligned,PacketType>(i+2,0),eval.template packetByOuterInner<Unaligned,PacketType>(i+3,0))));
|
||||
for(; i<size; ++i)
|
||||
p = func.packetOp(p, eval.template packetByOuterInner<Unaligned,PacketType>(i,0));
|
||||
return p;
|
||||
}
|
||||
};
|
||||
|
||||
template< typename ArgType, typename MemberOp, int Direction>
|
||||
struct evaluator<PartialReduxExpr<ArgType, MemberOp, Direction> >
|
||||
: evaluator_base<PartialReduxExpr<ArgType, MemberOp, Direction> >
|
||||
{
|
||||
typedef PartialReduxExpr<ArgType, MemberOp, Direction> XprType;
|
||||
typedef typename internal::nested_eval<ArgType,1>::type ArgTypeNested;
|
||||
typedef typename internal::add_const_on_value_type<ArgTypeNested>::type ConstArgTypeNested;
|
||||
typedef typename internal::remove_all<ArgTypeNested>::type ArgTypeNestedCleaned;
|
||||
typedef typename ArgType::Scalar InputScalar;
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
enum {
|
||||
TraversalSize = Direction==int(Vertical) ? int(ArgType::RowsAtCompileTime) : int(ArgType::ColsAtCompileTime)
|
||||
};
|
||||
typedef typename MemberOp::template Cost<int(TraversalSize)> CostOpType;
|
||||
enum {
|
||||
CoeffReadCost = TraversalSize==Dynamic ? HugeCost
|
||||
: TraversalSize==0 ? 1
|
||||
: TraversalSize * evaluator<ArgType>::CoeffReadCost + int(CostOpType::value),
|
||||
|
||||
_ArgFlags = evaluator<ArgType>::Flags,
|
||||
|
||||
_Vectorizable = bool(int(_ArgFlags)&PacketAccessBit)
|
||||
&& bool(MemberOp::Vectorizable)
|
||||
&& (Direction==int(Vertical) ? bool(_ArgFlags&RowMajorBit) : (_ArgFlags&RowMajorBit)==0)
|
||||
&& (TraversalSize!=0),
|
||||
|
||||
Flags = (traits<XprType>::Flags&RowMajorBit)
|
||||
| (evaluator<ArgType>::Flags&(HereditaryBits&(~RowMajorBit)))
|
||||
| (_Vectorizable ? PacketAccessBit : 0)
|
||||
| LinearAccessBit,
|
||||
|
||||
Alignment = 0 // FIXME this will need to be improved once PartialReduxExpr is vectorized
|
||||
};
|
||||
|
||||
EIGEN_DEVICE_FUNC explicit evaluator(const XprType xpr)
|
||||
: m_arg(xpr.nestedExpression()), m_functor(xpr.functor())
|
||||
{
|
||||
EIGEN_INTERNAL_CHECK_COST_VALUE(TraversalSize==Dynamic ? HugeCost : (TraversalSize==0 ? 1 : int(CostOpType::value)));
|
||||
EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
|
||||
}
|
||||
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
const Scalar coeff(Index i, Index j) const
|
||||
{
|
||||
return coeff(Direction==Vertical ? j : i);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
const Scalar coeff(Index index) const
|
||||
{
|
||||
return m_functor(m_arg.template subVector<DirectionType(Direction)>(index));
|
||||
}
|
||||
|
||||
template<int LoadMode,typename PacketType>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
PacketType packet(Index i, Index j) const
|
||||
{
|
||||
return packet<LoadMode,PacketType>(Direction==Vertical ? j : i);
|
||||
}
|
||||
|
||||
template<int LoadMode,typename PacketType>
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
|
||||
PacketType packet(Index idx) const
|
||||
{
|
||||
enum { PacketSize = internal::unpacket_traits<PacketType>::size };
|
||||
typedef Block<const ArgTypeNestedCleaned,
|
||||
Direction==Vertical ? int(ArgType::RowsAtCompileTime) : int(PacketSize),
|
||||
Direction==Vertical ? int(PacketSize) : int(ArgType::ColsAtCompileTime),
|
||||
true /* InnerPanel */> PanelType;
|
||||
|
||||
PanelType panel(m_arg,
|
||||
Direction==Vertical ? 0 : idx,
|
||||
Direction==Vertical ? idx : 0,
|
||||
Direction==Vertical ? m_arg.rows() : Index(PacketSize),
|
||||
Direction==Vertical ? Index(PacketSize) : m_arg.cols());
|
||||
|
||||
// FIXME
|
||||
// See bug 1612, currently if PacketSize==1 (i.e. complex<double> with 128bits registers) then the storage-order of panel get reversed
|
||||
// and methods like packetByOuterInner do not make sense anymore in this context.
|
||||
// So let's just by pass "vectorization" in this case:
|
||||
if(PacketSize==1)
|
||||
return internal::pset1<PacketType>(coeff(idx));
|
||||
|
||||
typedef typename internal::redux_evaluator<PanelType> PanelEvaluator;
|
||||
PanelEvaluator panel_eval(panel);
|
||||
typedef typename MemberOp::BinaryOp BinaryOp;
|
||||
PacketType p = internal::packetwise_redux_impl<BinaryOp,PanelEvaluator>::template run<PacketType>(panel_eval,m_functor.binaryFunc(),m_arg.outerSize());
|
||||
return p;
|
||||
}
|
||||
|
||||
protected:
|
||||
ConstArgTypeNested m_arg;
|
||||
const MemberOp m_functor;
|
||||
};
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_PARTIALREDUX_H
|
||||
@@ -87,14 +87,25 @@ class PermutationBase : public EigenBase<Derived>
|
||||
return derived();
|
||||
}
|
||||
|
||||
#ifndef EIGEN_PARSED_BY_DOXYGEN
|
||||
/** This is a special case of the templated operator=. Its purpose is to
|
||||
* prevent a default operator= from hiding the templated operator=.
|
||||
*/
|
||||
Derived& operator=(const PermutationBase& other)
|
||||
{
|
||||
indices() = other.indices();
|
||||
return derived();
|
||||
}
|
||||
#endif
|
||||
|
||||
/** \returns the number of rows */
|
||||
inline EIGEN_DEVICE_FUNC Index rows() const { return Index(indices().size()); }
|
||||
inline Index rows() const { return Index(indices().size()); }
|
||||
|
||||
/** \returns the number of columns */
|
||||
inline EIGEN_DEVICE_FUNC Index cols() const { return Index(indices().size()); }
|
||||
inline Index cols() const { return Index(indices().size()); }
|
||||
|
||||
/** \returns the size of a side of the respective square matrix, i.e., the number of indices */
|
||||
inline EIGEN_DEVICE_FUNC Index size() const { return Index(indices().size()); }
|
||||
inline Index size() const { return Index(indices().size()); }
|
||||
|
||||
#ifndef EIGEN_PARSED_BY_DOXYGEN
|
||||
template<typename DenseDerived>
|
||||
@@ -322,6 +333,12 @@ class PermutationMatrix : public PermutationBase<PermutationMatrix<SizeAtCompile
|
||||
inline PermutationMatrix(const PermutationBase<OtherDerived>& other)
|
||||
: m_indices(other.indices()) {}
|
||||
|
||||
#ifndef EIGEN_PARSED_BY_DOXYGEN
|
||||
/** Standard copy constructor. Defined only to prevent a default copy constructor
|
||||
* from hiding the other templated constructor */
|
||||
inline PermutationMatrix(const PermutationMatrix& other) : m_indices(other.indices()) {}
|
||||
#endif
|
||||
|
||||
/** Generic constructor from expression of the indices. The indices
|
||||
* array has the meaning that the permutations sends each integer i to indices[i].
|
||||
*
|
||||
@@ -356,6 +373,17 @@ class PermutationMatrix : public PermutationBase<PermutationMatrix<SizeAtCompile
|
||||
return Base::operator=(tr.derived());
|
||||
}
|
||||
|
||||
#ifndef EIGEN_PARSED_BY_DOXYGEN
|
||||
/** This is a special case of the templated operator=. Its purpose is to
|
||||
* prevent a default operator= from hiding the templated operator=.
|
||||
*/
|
||||
PermutationMatrix& operator=(const PermutationMatrix& other)
|
||||
{
|
||||
m_indices = other.m_indices;
|
||||
return *this;
|
||||
}
|
||||
#endif
|
||||
|
||||
/** const version of indices(). */
|
||||
const IndicesType& indices() const { return m_indices; }
|
||||
/** \returns a reference to the stored array representing the permutation. */
|
||||
|
||||
@@ -104,7 +104,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
|
||||
|
||||
typedef typename internal::traits<Derived>::StorageKind StorageKind;
|
||||
typedef typename internal::traits<Derived>::Scalar Scalar;
|
||||
|
||||
|
||||
typedef typename internal::packet_traits<Scalar>::type PacketScalar;
|
||||
typedef typename NumTraits<Scalar>::Real RealScalar;
|
||||
typedef Derived DenseType;
|
||||
@@ -358,7 +358,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
|
||||
* remain row-vectors and vectors remain vectors.
|
||||
*/
|
||||
template<typename OtherDerived>
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE void resizeLike(const EigenBase<OtherDerived>& _other)
|
||||
{
|
||||
const OtherDerived& other = _other.derived();
|
||||
@@ -383,7 +383,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
|
||||
* of rows and/or of columns, you can use conservativeResize(NoChange_t, Index) or
|
||||
* conservativeResize(Index, NoChange_t).
|
||||
*
|
||||
* Matrices are resized relative to the top-left element. In case values need to be
|
||||
* Matrices are resized relative to the top-left element. In case values need to be
|
||||
* appended to the matrix they will be uninitialized.
|
||||
*/
|
||||
EIGEN_DEVICE_FUNC
|
||||
@@ -440,7 +440,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
|
||||
* of rows and/or of columns, you can use conservativeResize(NoChange_t, Index) or
|
||||
* conservativeResize(Index, NoChange_t).
|
||||
*
|
||||
* Matrices are resized relative to the top-left element. In case values need to be
|
||||
* Matrices are resized relative to the top-left element. In case values need to be
|
||||
* appended to the matrix they will copied from \c other.
|
||||
*/
|
||||
template<typename OtherDerived>
|
||||
@@ -526,71 +526,6 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
|
||||
// EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
|
||||
}
|
||||
|
||||
#if EIGEN_HAS_CXX11
|
||||
/** \brief Construct a row of column vector with fixed size from an arbitrary number of coefficients. \cpp11
|
||||
*
|
||||
* \only_for_vectors
|
||||
*
|
||||
* This constructor is for 1D array or vectors with more than 4 coefficients.
|
||||
* There exists C++98 analogue constructors for fixed-size array/vector having 1, 2, 3, or 4 coefficients.
|
||||
*
|
||||
* \warning To construct a column (resp. row) vector of fixed length, the number of values passed to this
|
||||
* constructor must match the the fixed number of rows (resp. columns) of \c *this.
|
||||
*/
|
||||
template <typename... ArgTypes>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
PlainObjectBase(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args)
|
||||
: m_storage()
|
||||
{
|
||||
_check_template_params();
|
||||
EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(PlainObjectBase, sizeof...(args) + 4);
|
||||
m_storage.data()[0] = a0;
|
||||
m_storage.data()[1] = a1;
|
||||
m_storage.data()[2] = a2;
|
||||
m_storage.data()[3] = a3;
|
||||
int i = 4;
|
||||
auto x = {(m_storage.data()[i++] = args, 0)...};
|
||||
static_cast<void>(x);
|
||||
}
|
||||
|
||||
/** \brief Constructs a Matrix or Array and initializes it by elements given by an initializer list of initializer
|
||||
* lists \cpp11
|
||||
*/
|
||||
EIGEN_DEVICE_FUNC
|
||||
explicit EIGEN_STRONG_INLINE PlainObjectBase(const std::initializer_list<std::initializer_list<Scalar>>& list)
|
||||
: m_storage()
|
||||
{
|
||||
_check_template_params();
|
||||
|
||||
size_t list_size = 0;
|
||||
if (list.begin() != list.end()) {
|
||||
list_size = list.begin()->size();
|
||||
}
|
||||
|
||||
// This is to allow syntax like VectorXi {{1, 2, 3, 4}}
|
||||
if (ColsAtCompileTime == 1 && list.size() == 1) {
|
||||
eigen_assert(list_size == static_cast<size_t>(RowsAtCompileTime) || RowsAtCompileTime == Dynamic);
|
||||
resize(list_size, ColsAtCompileTime);
|
||||
std::copy(list.begin()->begin(), list.begin()->end(), m_storage.data());
|
||||
} else {
|
||||
eigen_assert(list.size() == static_cast<size_t>(RowsAtCompileTime) || RowsAtCompileTime == Dynamic);
|
||||
eigen_assert(list_size == static_cast<size_t>(ColsAtCompileTime) || ColsAtCompileTime == Dynamic);
|
||||
resize(list.size(), list_size);
|
||||
|
||||
Index row_index = 0;
|
||||
for (const std::initializer_list<Scalar>& row : list) {
|
||||
eigen_assert(list_size == row.size());
|
||||
Index col_index = 0;
|
||||
for (const Scalar& e : row) {
|
||||
coeffRef(row_index, col_index) = e;
|
||||
++col_index;
|
||||
}
|
||||
++row_index;
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif // end EIGEN_HAS_CXX11
|
||||
|
||||
/** \sa PlainObjectBase::operator=(const EigenBase<OtherDerived>&) */
|
||||
template<typename OtherDerived>
|
||||
EIGEN_DEVICE_FUNC
|
||||
@@ -629,7 +564,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
|
||||
* \copydetails DenseBase::operator=(const EigenBase<OtherDerived> &other)
|
||||
*/
|
||||
template<typename OtherDerived>
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE Derived& operator=(const EigenBase<OtherDerived> &other)
|
||||
{
|
||||
_resize_to_match(other);
|
||||
@@ -743,7 +678,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
|
||||
* remain row-vectors and vectors remain vectors.
|
||||
*/
|
||||
template<typename OtherDerived>
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE void _resize_to_match(const EigenBase<OtherDerived>& other)
|
||||
{
|
||||
#ifdef EIGEN_NO_AUTOMATIC_RESIZING
|
||||
@@ -770,10 +705,10 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
|
||||
*
|
||||
* \internal
|
||||
*/
|
||||
// aliasing is dealt once in internal::call_assignment
|
||||
// aliasing is dealt once in internall::call_assignment
|
||||
// so at this stage we have to assume aliasing... and resising has to be done later.
|
||||
template<typename OtherDerived>
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE Derived& _set(const DenseBase<OtherDerived>& other)
|
||||
{
|
||||
internal::call_assignment(this->derived(), other.derived());
|
||||
@@ -786,7 +721,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
|
||||
* \sa operator=(const MatrixBase<OtherDerived>&), _set()
|
||||
*/
|
||||
template<typename OtherDerived>
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE Derived& _set_noalias(const DenseBase<OtherDerived>& other)
|
||||
{
|
||||
// I don't think we need this resize call since the lazyAssign will anyways resize
|
||||
@@ -802,25 +737,23 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE void _init2(Index rows, Index cols, typename internal::enable_if<Base::SizeAtCompileTime!=2,T0>::type* = 0)
|
||||
{
|
||||
const bool t0_is_integer_alike = internal::is_valid_index_type<T0>::value;
|
||||
const bool t1_is_integer_alike = internal::is_valid_index_type<T1>::value;
|
||||
EIGEN_STATIC_ASSERT(t0_is_integer_alike &&
|
||||
t1_is_integer_alike,
|
||||
EIGEN_STATIC_ASSERT(bool(NumTraits<T0>::IsInteger) &&
|
||||
bool(NumTraits<T1>::IsInteger),
|
||||
FLOATING_POINT_ARGUMENT_PASSED__INTEGER_WAS_EXPECTED)
|
||||
resize(rows,cols);
|
||||
}
|
||||
|
||||
|
||||
template<typename T0, typename T1>
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE void _init2(const T0& val0, const T1& val1, typename internal::enable_if<Base::SizeAtCompileTime==2,T0>::type* = 0)
|
||||
{
|
||||
EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(PlainObjectBase, 2)
|
||||
m_storage.data()[0] = Scalar(val0);
|
||||
m_storage.data()[1] = Scalar(val1);
|
||||
}
|
||||
|
||||
|
||||
template<typename T0, typename T1>
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE void _init2(const Index& val0, const Index& val1,
|
||||
typename internal::enable_if< (!internal::is_same<Index,Scalar>::value)
|
||||
&& (internal::is_same<T0,Index>::value)
|
||||
@@ -840,14 +773,14 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
|
||||
&& ((!internal::is_same<typename internal::traits<Derived>::XprKind,ArrayXpr>::value || Base::SizeAtCompileTime==Dynamic)),T>::type* = 0)
|
||||
{
|
||||
// NOTE MSVC 2008 complains if we directly put bool(NumTraits<T>::IsInteger) as the EIGEN_STATIC_ASSERT argument.
|
||||
const bool is_integer_alike = internal::is_valid_index_type<T>::value;
|
||||
EIGEN_UNUSED_VARIABLE(is_integer_alike);
|
||||
EIGEN_STATIC_ASSERT(is_integer_alike,
|
||||
const bool is_integer = NumTraits<T>::IsInteger;
|
||||
EIGEN_UNUSED_VARIABLE(is_integer);
|
||||
EIGEN_STATIC_ASSERT(is_integer,
|
||||
FLOATING_POINT_ARGUMENT_PASSED__INTEGER_WAS_EXPECTED)
|
||||
resize(size);
|
||||
}
|
||||
|
||||
// We have a 1x1 matrix/array => the argument is interpreted as the value of the unique coefficient (case where scalar type can be implicitly converted)
|
||||
|
||||
// We have a 1x1 matrix/array => the argument is interpreted as the value of the unique coefficient (case where scalar type can be implicitely converted)
|
||||
template<typename T>
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE void _init1(const Scalar& val0, typename internal::enable_if<Base::SizeAtCompileTime==1 && internal::is_convertible<T, Scalar>::value,T>::type* = 0)
|
||||
@@ -855,7 +788,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
|
||||
EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(PlainObjectBase, 1)
|
||||
m_storage.data()[0] = val0;
|
||||
}
|
||||
|
||||
|
||||
// We have a 1x1 matrix/array => the argument is interpreted as the value of the unique coefficient (case where scalar type match the index type)
|
||||
template<typename T>
|
||||
EIGEN_DEVICE_FUNC
|
||||
@@ -911,7 +844,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
|
||||
{
|
||||
this->derived() = r;
|
||||
}
|
||||
|
||||
|
||||
// For fixed-size Array<Scalar,...>
|
||||
template<typename T>
|
||||
EIGEN_DEVICE_FUNC
|
||||
@@ -923,7 +856,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
|
||||
{
|
||||
Base::setConstant(val0);
|
||||
}
|
||||
|
||||
|
||||
// For fixed-size Array<Index,...>
|
||||
template<typename T>
|
||||
EIGEN_DEVICE_FUNC
|
||||
@@ -937,34 +870,34 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
|
||||
{
|
||||
Base::setConstant(val0);
|
||||
}
|
||||
|
||||
|
||||
template<typename MatrixTypeA, typename MatrixTypeB, bool SwapPointers>
|
||||
friend struct internal::matrix_swap_impl;
|
||||
|
||||
public:
|
||||
|
||||
|
||||
#ifndef EIGEN_PARSED_BY_DOXYGEN
|
||||
/** \internal
|
||||
* \brief Override DenseBase::swap() since for dynamic-sized matrices
|
||||
* of same type it is enough to swap the data pointers.
|
||||
*/
|
||||
template<typename OtherDerived>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
EIGEN_DEVICE_FUNC
|
||||
void swap(DenseBase<OtherDerived> & other)
|
||||
{
|
||||
enum { SwapPointers = internal::is_same<Derived, OtherDerived>::value && Base::SizeAtCompileTime==Dynamic };
|
||||
internal::matrix_swap_impl<Derived, OtherDerived, bool(SwapPointers)>::run(this->derived(), other.derived());
|
||||
}
|
||||
|
||||
|
||||
/** \internal
|
||||
* \brief const version forwarded to DenseBase::swap
|
||||
*/
|
||||
template<typename OtherDerived>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
EIGEN_DEVICE_FUNC
|
||||
void swap(DenseBase<OtherDerived> const & other)
|
||||
{ Base::swap(other.derived()); }
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
static EIGEN_STRONG_INLINE void _check_template_params()
|
||||
{
|
||||
EIGEN_STATIC_ASSERT((EIGEN_IMPLIES(MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1, (Options&RowMajor)==RowMajor)
|
||||
@@ -988,19 +921,13 @@ namespace internal {
|
||||
template <typename Derived, typename OtherDerived, bool IsVector>
|
||||
struct conservative_resize_like_impl
|
||||
{
|
||||
#if EIGEN_HAS_TYPE_TRAITS
|
||||
static const bool IsRelocatable = std::is_trivially_copyable<typename Derived::Scalar>::value;
|
||||
#else
|
||||
static const bool IsRelocatable = !NumTraits<typename Derived::Scalar>::RequireInitialization;
|
||||
#endif
|
||||
static void run(DenseBase<Derived>& _this, Index rows, Index cols)
|
||||
{
|
||||
if (_this.rows() == rows && _this.cols() == cols) return;
|
||||
EIGEN_STATIC_ASSERT_DYNAMIC_SIZE(Derived)
|
||||
|
||||
if ( IsRelocatable
|
||||
&& (( Derived::IsRowMajor && _this.cols() == cols) || // row-major and we change only the number of rows
|
||||
(!Derived::IsRowMajor && _this.rows() == rows) )) // column-major and we change only the number of columns
|
||||
if ( ( Derived::IsRowMajor && _this.cols() == cols) || // row-major and we change only the number of rows
|
||||
(!Derived::IsRowMajor && _this.rows() == rows) ) // column-major and we change only the number of columns
|
||||
{
|
||||
internal::check_rows_cols_for_overflow<Derived::MaxSizeAtCompileTime>::run(rows, cols);
|
||||
_this.derived().m_storage.conservativeResize(rows*cols,rows,cols);
|
||||
@@ -1028,9 +955,8 @@ struct conservative_resize_like_impl
|
||||
EIGEN_STATIC_ASSERT_DYNAMIC_SIZE(Derived)
|
||||
EIGEN_STATIC_ASSERT_DYNAMIC_SIZE(OtherDerived)
|
||||
|
||||
if ( IsRelocatable &&
|
||||
(( Derived::IsRowMajor && _this.cols() == other.cols()) || // row-major and we change only the number of rows
|
||||
(!Derived::IsRowMajor && _this.rows() == other.rows()) )) // column-major and we change only the number of columns
|
||||
if ( ( Derived::IsRowMajor && _this.cols() == other.cols()) || // row-major and we change only the number of rows
|
||||
(!Derived::IsRowMajor && _this.rows() == other.rows()) ) // column-major and we change only the number of columns
|
||||
{
|
||||
const Index new_rows = other.rows() - _this.rows();
|
||||
const Index new_cols = other.cols() - _this.cols();
|
||||
@@ -1058,18 +984,13 @@ template <typename Derived, typename OtherDerived>
|
||||
struct conservative_resize_like_impl<Derived,OtherDerived,true>
|
||||
: conservative_resize_like_impl<Derived,OtherDerived,false>
|
||||
{
|
||||
typedef conservative_resize_like_impl<Derived,OtherDerived,false> Base;
|
||||
using Base::run;
|
||||
using Base::IsRelocatable;
|
||||
|
||||
using conservative_resize_like_impl<Derived,OtherDerived,false>::run;
|
||||
|
||||
static void run(DenseBase<Derived>& _this, Index size)
|
||||
{
|
||||
const Index new_rows = Derived::RowsAtCompileTime==1 ? 1 : size;
|
||||
const Index new_cols = Derived::RowsAtCompileTime==1 ? size : 1;
|
||||
if(IsRelocatable)
|
||||
_this.derived().m_storage.conservativeResize(size,new_rows,new_cols);
|
||||
else
|
||||
Base::run(_this.derived(), new_rows, new_cols);
|
||||
_this.derived().m_storage.conservativeResize(size,new_rows,new_cols);
|
||||
}
|
||||
|
||||
static void run(DenseBase<Derived>& _this, const DenseBase<OtherDerived>& other)
|
||||
@@ -1080,10 +1001,7 @@ struct conservative_resize_like_impl<Derived,OtherDerived,true>
|
||||
|
||||
const Index new_rows = Derived::RowsAtCompileTime==1 ? 1 : other.rows();
|
||||
const Index new_cols = Derived::RowsAtCompileTime==1 ? other.cols() : 1;
|
||||
if(IsRelocatable)
|
||||
_this.derived().m_storage.conservativeResize(other.size(),new_rows,new_cols);
|
||||
else
|
||||
Base::run(_this.derived(), new_rows, new_cols);
|
||||
_this.derived().m_storage.conservativeResize(other.size(),new_rows,new_cols);
|
||||
|
||||
if (num_new_elements > 0)
|
||||
_this.tail(num_new_elements) = other.tail(num_new_elements);
|
||||
@@ -1094,7 +1012,7 @@ template<typename MatrixTypeA, typename MatrixTypeB, bool SwapPointers>
|
||||
struct matrix_swap_impl
|
||||
{
|
||||
EIGEN_DEVICE_FUNC
|
||||
static EIGEN_STRONG_INLINE void run(MatrixTypeA& a, MatrixTypeB& b)
|
||||
static inline void run(MatrixTypeA& a, MatrixTypeB& b)
|
||||
{
|
||||
a.base().swap(b);
|
||||
}
|
||||
|
||||
@@ -90,23 +90,18 @@ class Product : public ProductImpl<_Lhs,_Rhs,Option,
|
||||
typedef typename internal::remove_all<LhsNested>::type LhsNestedCleaned;
|
||||
typedef typename internal::remove_all<RhsNested>::type RhsNestedCleaned;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
Product(const Lhs& lhs, const Rhs& rhs) : m_lhs(lhs), m_rhs(rhs)
|
||||
EIGEN_DEVICE_FUNC Product(const Lhs& lhs, const Rhs& rhs) : m_lhs(lhs), m_rhs(rhs)
|
||||
{
|
||||
eigen_assert(lhs.cols() == rhs.rows()
|
||||
&& "invalid matrix product"
|
||||
&& "if you wanted a coeff-wise or a dot product use the respective explicit functions");
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
Index rows() const { return m_lhs.rows(); }
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
Index cols() const { return m_rhs.cols(); }
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rows() const { return m_lhs.rows(); }
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index cols() const { return m_rhs.cols(); }
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
const LhsNestedCleaned& lhs() const { return m_lhs; }
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
const RhsNestedCleaned& rhs() const { return m_rhs; }
|
||||
EIGEN_DEVICE_FUNC const LhsNestedCleaned& lhs() const { return m_lhs; }
|
||||
EIGEN_DEVICE_FUNC const RhsNestedCleaned& rhs() const { return m_rhs; }
|
||||
|
||||
protected:
|
||||
|
||||
@@ -121,7 +116,7 @@ class dense_product_base
|
||||
: public internal::dense_xpr_base<Product<Lhs,Rhs,Option> >::type
|
||||
{};
|
||||
|
||||
/** Conversion to scalar for inner-products */
|
||||
/** Convertion to scalar for inner-products */
|
||||
template<typename Lhs, typename Rhs, int Option>
|
||||
class dense_product_base<Lhs, Rhs, Option, InnerProduct>
|
||||
: public internal::dense_xpr_base<Product<Lhs,Rhs,Option> >::type
|
||||
@@ -132,7 +127,7 @@ public:
|
||||
using Base::derived;
|
||||
typedef typename Base::Scalar Scalar;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE operator const Scalar() const
|
||||
EIGEN_STRONG_INLINE operator const Scalar() const
|
||||
{
|
||||
return internal::evaluator<ProductXpr>(derived()).coeff(0,0);
|
||||
}
|
||||
|
||||
@@ -20,7 +20,7 @@ namespace internal {
|
||||
/** \internal
|
||||
* Evaluator of a product expression.
|
||||
* Since products require special treatments to handle all possible cases,
|
||||
* we simply defer the evaluation logic to a product_evaluator class
|
||||
* we simply deffer the evaluation logic to a product_evaluator class
|
||||
* which offers more partial specialization possibilities.
|
||||
*
|
||||
* \sa class product_evaluator
|
||||
@@ -128,7 +128,7 @@ protected:
|
||||
PlainObject m_result;
|
||||
};
|
||||
|
||||
// The following three shortcuts are enabled only if the scalar types match exactly.
|
||||
// The following three shortcuts are enabled only if the scalar types match excatly.
|
||||
// TODO: we could enable them for different scalar types when the product is not vectorized.
|
||||
|
||||
// Dense = Product
|
||||
@@ -137,7 +137,7 @@ struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::assign_op<Scal
|
||||
typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type>
|
||||
{
|
||||
typedef Product<Lhs,Rhs,Options> SrcXprType;
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
static EIGEN_STRONG_INLINE
|
||||
void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &)
|
||||
{
|
||||
Index dstRows = src.rows();
|
||||
@@ -155,7 +155,7 @@ struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::add_assign_op<
|
||||
typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type>
|
||||
{
|
||||
typedef Product<Lhs,Rhs,Options> SrcXprType;
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
static EIGEN_STRONG_INLINE
|
||||
void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<Scalar,Scalar> &)
|
||||
{
|
||||
eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
|
||||
@@ -170,7 +170,7 @@ struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::sub_assign_op<
|
||||
typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type>
|
||||
{
|
||||
typedef Product<Lhs,Rhs,Options> SrcXprType;
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
static EIGEN_STRONG_INLINE
|
||||
void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<Scalar,Scalar> &)
|
||||
{
|
||||
eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
|
||||
@@ -190,7 +190,7 @@ struct Assignment<DstXprType, CwiseBinaryOp<internal::scalar_product_op<ScalarBi
|
||||
typedef CwiseBinaryOp<internal::scalar_product_op<ScalarBis,Scalar>,
|
||||
const CwiseNullaryOp<internal::scalar_constant_op<ScalarBis>,Plain>,
|
||||
const Product<Lhs,Rhs,DefaultProduct> > SrcXprType;
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
static EIGEN_STRONG_INLINE
|
||||
void run(DstXprType &dst, const SrcXprType &src, const AssignFunc& func)
|
||||
{
|
||||
call_assignment_no_alias(dst, (src.lhs().functor().m_other * src.rhs().lhs())*src.rhs().rhs(), func);
|
||||
@@ -217,7 +217,7 @@ template<typename DstXprType, typename OtherXpr, typename ProductType, typename
|
||||
struct assignment_from_xpr_op_product
|
||||
{
|
||||
template<typename SrcXprType, typename InitialFunc>
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
static EIGEN_STRONG_INLINE
|
||||
void run(DstXprType &dst, const SrcXprType &src, const InitialFunc& /*func*/)
|
||||
{
|
||||
call_assignment_no_alias(dst, src.lhs(), Func1());
|
||||
@@ -246,19 +246,19 @@ template<typename Lhs, typename Rhs>
|
||||
struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,InnerProduct>
|
||||
{
|
||||
template<typename Dst>
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
|
||||
static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
|
||||
{
|
||||
dst.coeffRef(0,0) = (lhs.transpose().cwiseProduct(rhs)).sum();
|
||||
}
|
||||
|
||||
template<typename Dst>
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
|
||||
static EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
|
||||
{
|
||||
dst.coeffRef(0,0) += (lhs.transpose().cwiseProduct(rhs)).sum();
|
||||
}
|
||||
|
||||
template<typename Dst>
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
|
||||
static EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
|
||||
{ dst.coeffRef(0,0) -= (lhs.transpose().cwiseProduct(rhs)).sum(); }
|
||||
};
|
||||
|
||||
@@ -269,10 +269,10 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,InnerProduct>
|
||||
|
||||
// Column major result
|
||||
template<typename Dst, typename Lhs, typename Rhs, typename Func>
|
||||
void EIGEN_DEVICE_FUNC outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const false_type&)
|
||||
void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const false_type&)
|
||||
{
|
||||
evaluator<Rhs> rhsEval(rhs);
|
||||
ei_declare_local_nested_eval(Lhs,lhs,Rhs::SizeAtCompileTime,actual_lhs);
|
||||
typename nested_eval<Lhs,Rhs::SizeAtCompileTime>::type actual_lhs(lhs);
|
||||
// FIXME if cols is large enough, then it might be useful to make sure that lhs is sequentially stored
|
||||
// FIXME not very good if rhs is real and lhs complex while alpha is real too
|
||||
const Index cols = dst.cols();
|
||||
@@ -282,10 +282,10 @@ void EIGEN_DEVICE_FUNC outer_product_selector_run(Dst& dst, const Lhs &lhs, cons
|
||||
|
||||
// Row major result
|
||||
template<typename Dst, typename Lhs, typename Rhs, typename Func>
|
||||
void EIGEN_DEVICE_FUNC outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const true_type&)
|
||||
void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const true_type&)
|
||||
{
|
||||
evaluator<Lhs> lhsEval(lhs);
|
||||
ei_declare_local_nested_eval(Rhs,rhs,Lhs::SizeAtCompileTime,actual_rhs);
|
||||
typename nested_eval<Rhs,Lhs::SizeAtCompileTime>::type actual_rhs(rhs);
|
||||
// FIXME if rows is large enough, then it might be useful to make sure that rhs is sequentially stored
|
||||
// FIXME not very good if lhs is real and rhs complex while alpha is real too
|
||||
const Index rows = dst.rows();
|
||||
@@ -300,37 +300,37 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,OuterProduct>
|
||||
typedef typename Product<Lhs,Rhs>::Scalar Scalar;
|
||||
|
||||
// TODO it would be nice to be able to exploit our *_assign_op functors for that purpose
|
||||
struct set { template<typename Dst, typename Src> EIGEN_DEVICE_FUNC void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() = src; } };
|
||||
struct add { template<typename Dst, typename Src> EIGEN_DEVICE_FUNC void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() += src; } };
|
||||
struct sub { template<typename Dst, typename Src> EIGEN_DEVICE_FUNC void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() -= src; } };
|
||||
struct set { template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() = src; } };
|
||||
struct add { template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() += src; } };
|
||||
struct sub { template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() -= src; } };
|
||||
struct adds {
|
||||
Scalar m_scale;
|
||||
explicit adds(const Scalar& s) : m_scale(s) {}
|
||||
template<typename Dst, typename Src> void EIGEN_DEVICE_FUNC operator()(const Dst& dst, const Src& src) const {
|
||||
template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const {
|
||||
dst.const_cast_derived() += m_scale * src;
|
||||
}
|
||||
};
|
||||
|
||||
template<typename Dst>
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
|
||||
static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
|
||||
{
|
||||
internal::outer_product_selector_run(dst, lhs, rhs, set(), is_row_major<Dst>());
|
||||
}
|
||||
|
||||
template<typename Dst>
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
|
||||
static EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
|
||||
{
|
||||
internal::outer_product_selector_run(dst, lhs, rhs, add(), is_row_major<Dst>());
|
||||
}
|
||||
|
||||
template<typename Dst>
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
|
||||
static EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
|
||||
{
|
||||
internal::outer_product_selector_run(dst, lhs, rhs, sub(), is_row_major<Dst>());
|
||||
}
|
||||
|
||||
template<typename Dst>
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
|
||||
static EIGEN_STRONG_INLINE void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
|
||||
{
|
||||
internal::outer_product_selector_run(dst, lhs, rhs, adds(alpha), is_row_major<Dst>());
|
||||
}
|
||||
@@ -345,19 +345,19 @@ struct generic_product_impl_base
|
||||
typedef typename Product<Lhs,Rhs>::Scalar Scalar;
|
||||
|
||||
template<typename Dst>
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
|
||||
static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
|
||||
{ dst.setZero(); scaleAndAddTo(dst, lhs, rhs, Scalar(1)); }
|
||||
|
||||
template<typename Dst>
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
|
||||
static EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
|
||||
{ scaleAndAddTo(dst,lhs, rhs, Scalar(1)); }
|
||||
|
||||
template<typename Dst>
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
|
||||
static EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
|
||||
{ scaleAndAddTo(dst, lhs, rhs, Scalar(-1)); }
|
||||
|
||||
template<typename Dst>
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
|
||||
static EIGEN_STRONG_INLINE void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
|
||||
{ Derived::scaleAndAddTo(dst,lhs,rhs,alpha); }
|
||||
|
||||
};
|
||||
@@ -373,7 +373,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemvProduct>
|
||||
typedef typename internal::remove_all<typename internal::conditional<int(Side)==OnTheRight,LhsNested,RhsNested>::type>::type MatrixType;
|
||||
|
||||
template<typename Dest>
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
|
||||
static EIGEN_STRONG_INLINE void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
|
||||
{
|
||||
LhsNested actual_lhs(lhs);
|
||||
RhsNested actual_rhs(rhs);
|
||||
@@ -390,79 +390,30 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,CoeffBasedProductMode>
|
||||
typedef typename Product<Lhs,Rhs>::Scalar Scalar;
|
||||
|
||||
template<typename Dst>
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
|
||||
static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
|
||||
{
|
||||
// Same as: dst.noalias() = lhs.lazyProduct(rhs);
|
||||
// but easier on the compiler side
|
||||
call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::assign_op<typename Dst::Scalar,Scalar>());
|
||||
}
|
||||
|
||||
|
||||
template<typename Dst>
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
|
||||
static EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
|
||||
{
|
||||
// dst.noalias() += lhs.lazyProduct(rhs);
|
||||
call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::add_assign_op<typename Dst::Scalar,Scalar>());
|
||||
}
|
||||
|
||||
template<typename Dst>
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
|
||||
static EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
|
||||
{
|
||||
// dst.noalias() -= lhs.lazyProduct(rhs);
|
||||
call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::sub_assign_op<typename Dst::Scalar,Scalar>());
|
||||
}
|
||||
|
||||
// This is a special evaluation path called from generic_product_impl<...,GemmProduct> in file GeneralMatrixMatrix.h
|
||||
// This variant tries to extract scalar multiples from both the LHS and RHS and factor them out. For instance:
|
||||
// dst {,+,-}= (s1*A)*(B*s2)
|
||||
// will be rewritten as:
|
||||
// dst {,+,-}= (s1*s2) * (A.lazyProduct(B))
|
||||
// There are at least four benefits of doing so:
|
||||
// 1 - huge performance gain for heap-allocated matrix types as it save costly allocations.
|
||||
// 2 - it is faster than simply by-passing the heap allocation through stack allocation.
|
||||
// 3 - it makes this fallback consistent with the heavy GEMM routine.
|
||||
// 4 - it fully by-passes huge stack allocation attempts when multiplying huge fixed-size matrices.
|
||||
// (see https://stackoverflow.com/questions/54738495)
|
||||
// For small fixed sizes matrices, howver, the gains are less obvious, it is sometimes x2 faster, but sometimes x3 slower,
|
||||
// and the behavior depends also a lot on the compiler... This is why this re-writting strategy is currently
|
||||
// enabled only when falling back from the main GEMM.
|
||||
template<typename Dst, typename Func>
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
void eval_dynamic(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Func &func)
|
||||
{
|
||||
enum {
|
||||
HasScalarFactor = blas_traits<Lhs>::HasScalarFactor || blas_traits<Rhs>::HasScalarFactor,
|
||||
ConjLhs = blas_traits<Lhs>::NeedToConjugate,
|
||||
ConjRhs = blas_traits<Rhs>::NeedToConjugate
|
||||
};
|
||||
// FIXME: in c++11 this should be auto, and extractScalarFactor should also return auto
|
||||
// this is important for real*complex_mat
|
||||
Scalar actualAlpha = blas_traits<Lhs>::extractScalarFactor(lhs)
|
||||
* blas_traits<Rhs>::extractScalarFactor(rhs);
|
||||
eval_dynamic_impl(dst,
|
||||
blas_traits<Lhs>::extract(lhs).template conjugateIf<ConjLhs>(),
|
||||
blas_traits<Rhs>::extract(rhs).template conjugateIf<ConjRhs>(),
|
||||
func,
|
||||
actualAlpha,
|
||||
typename conditional<HasScalarFactor,true_type,false_type>::type());
|
||||
}
|
||||
|
||||
protected:
|
||||
|
||||
template<typename Dst, typename LhsT, typename RhsT, typename Func, typename Scalar>
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
void eval_dynamic_impl(Dst& dst, const LhsT& lhs, const RhsT& rhs, const Func &func, const Scalar& s /* == 1 */, false_type)
|
||||
{
|
||||
EIGEN_UNUSED_VARIABLE(s);
|
||||
eigen_internal_assert(s==Scalar(1));
|
||||
call_restricted_packet_assignment_no_alias(dst, lhs.lazyProduct(rhs), func);
|
||||
}
|
||||
|
||||
template<typename Dst, typename LhsT, typename RhsT, typename Func, typename Scalar>
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
void eval_dynamic_impl(Dst& dst, const LhsT& lhs, const RhsT& rhs, const Func &func, const Scalar& s, true_type)
|
||||
{
|
||||
call_restricted_packet_assignment_no_alias(dst, s * lhs.lazyProduct(rhs), func);
|
||||
}
|
||||
|
||||
// template<typename Dst>
|
||||
// static inline void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
|
||||
// { dst.noalias() += alpha * lhs.lazyProduct(rhs); }
|
||||
};
|
||||
|
||||
// This specialization enforces the use of a coefficient-based evaluation strategy
|
||||
@@ -605,8 +556,7 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
|
||||
* which is why we don't set the LinearAccessBit.
|
||||
* TODO: this seems possible when the result is a vector
|
||||
*/
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
const CoeffReturnType coeff(Index index) const
|
||||
EIGEN_DEVICE_FUNC const CoeffReturnType coeff(Index index) const
|
||||
{
|
||||
const Index row = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime==1) ? 0 : index;
|
||||
const Index col = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime==1) ? index : 0;
|
||||
@@ -614,7 +564,6 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
|
||||
}
|
||||
|
||||
template<int LoadMode, typename PacketType>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
const PacketType packet(Index row, Index col) const
|
||||
{
|
||||
PacketType res;
|
||||
@@ -626,7 +575,6 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
|
||||
}
|
||||
|
||||
template<int LoadMode, typename PacketType>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
const PacketType packet(Index index) const
|
||||
{
|
||||
const Index row = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime==1) ? 0 : index;
|
||||
@@ -655,8 +603,7 @@ struct product_evaluator<Product<Lhs, Rhs, DefaultProduct>, LazyCoeffBasedProduc
|
||||
enum {
|
||||
Flags = Base::Flags | EvalBeforeNestingBit
|
||||
};
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
explicit product_evaluator(const XprType& xpr)
|
||||
EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr)
|
||||
: Base(BaseProduct(xpr.lhs(),xpr.rhs()))
|
||||
{}
|
||||
};
|
||||
@@ -794,8 +741,7 @@ struct generic_product_impl<Lhs,Rhs,SelfAdjointShape,DenseShape,ProductTag>
|
||||
typedef typename Product<Lhs,Rhs>::Scalar Scalar;
|
||||
|
||||
template<typename Dest>
|
||||
static EIGEN_DEVICE_FUNC
|
||||
void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
|
||||
static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
|
||||
{
|
||||
selfadjoint_product_impl<typename Lhs::MatrixType,Lhs::Mode,false,Rhs,0,Rhs::IsVectorAtCompileTime>::run(dst, lhs.nestedExpression(), rhs, alpha);
|
||||
}
|
||||
@@ -830,21 +776,13 @@ public:
|
||||
|
||||
MatrixFlags = evaluator<MatrixType>::Flags,
|
||||
DiagFlags = evaluator<DiagonalType>::Flags,
|
||||
|
||||
_StorageOrder = (Derived::MaxRowsAtCompileTime==1 && Derived::MaxColsAtCompileTime!=1) ? RowMajor
|
||||
: (Derived::MaxColsAtCompileTime==1 && Derived::MaxRowsAtCompileTime!=1) ? ColMajor
|
||||
: MatrixFlags & RowMajorBit ? RowMajor : ColMajor,
|
||||
_SameStorageOrder = _StorageOrder == (MatrixFlags & RowMajorBit ? RowMajor : ColMajor),
|
||||
|
||||
_StorageOrder = MatrixFlags & RowMajorBit ? RowMajor : ColMajor,
|
||||
_ScalarAccessOnDiag = !((int(_StorageOrder) == ColMajor && int(ProductOrder) == OnTheLeft)
|
||||
||(int(_StorageOrder) == RowMajor && int(ProductOrder) == OnTheRight)),
|
||||
_SameTypes = is_same<typename MatrixType::Scalar, typename DiagonalType::Scalar>::value,
|
||||
// FIXME currently we need same types, but in the future the next rule should be the one
|
||||
//_Vectorizable = bool(int(MatrixFlags)&PacketAccessBit) && ((!_PacketOnDiag) || (_SameTypes && bool(int(DiagFlags)&PacketAccessBit))),
|
||||
_Vectorizable = bool(int(MatrixFlags)&PacketAccessBit)
|
||||
&& _SameTypes
|
||||
&& (_SameStorageOrder || (MatrixFlags&LinearAccessBit)==LinearAccessBit)
|
||||
&& (_ScalarAccessOnDiag || (bool(int(DiagFlags)&PacketAccessBit))),
|
||||
_Vectorizable = bool(int(MatrixFlags)&PacketAccessBit) && _SameTypes && (_ScalarAccessOnDiag || (bool(int(DiagFlags)&PacketAccessBit))),
|
||||
_LinearAccessMask = (MatrixType::RowsAtCompileTime==1 || MatrixType::ColsAtCompileTime==1) ? LinearAccessBit : 0,
|
||||
Flags = ((HereditaryBits|_LinearAccessMask) & (unsigned int)(MatrixFlags)) | (_Vectorizable ? PacketAccessBit : 0),
|
||||
Alignment = evaluator<MatrixType>::Alignment,
|
||||
@@ -905,10 +843,10 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DiagonalSha
|
||||
|
||||
typedef Product<Lhs, Rhs, ProductKind> XprType;
|
||||
typedef typename XprType::PlainObject PlainObject;
|
||||
typedef typename Lhs::DiagonalVectorType DiagonalType;
|
||||
|
||||
|
||||
enum { StorageOrder = Base::_StorageOrder };
|
||||
enum {
|
||||
StorageOrder = int(Rhs::Flags) & RowMajorBit ? RowMajor : ColMajor
|
||||
};
|
||||
|
||||
EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr)
|
||||
: Base(xpr.rhs(), xpr.lhs().diagonal())
|
||||
@@ -920,7 +858,7 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DiagonalSha
|
||||
return m_diagImpl.coeff(row) * m_matImpl.coeff(row, col);
|
||||
}
|
||||
|
||||
#ifndef EIGEN_GPUCC
|
||||
#ifndef __CUDACC__
|
||||
template<int LoadMode,typename PacketType>
|
||||
EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const
|
||||
{
|
||||
@@ -952,7 +890,7 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DenseShape,
|
||||
typedef Product<Lhs, Rhs, ProductKind> XprType;
|
||||
typedef typename XprType::PlainObject PlainObject;
|
||||
|
||||
enum { StorageOrder = Base::_StorageOrder };
|
||||
enum { StorageOrder = int(Lhs::Flags) & RowMajorBit ? RowMajor : ColMajor };
|
||||
|
||||
EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr)
|
||||
: Base(xpr.lhs(), xpr.rhs().diagonal())
|
||||
@@ -964,7 +902,7 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DenseShape,
|
||||
return m_matImpl.coeff(row, col) * m_diagImpl.coeff(col);
|
||||
}
|
||||
|
||||
#ifndef EIGEN_GPUCC
|
||||
#ifndef __CUDACC__
|
||||
template<int LoadMode,typename PacketType>
|
||||
EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const
|
||||
{
|
||||
|
||||
@@ -128,7 +128,7 @@ DenseBase<Derived>::Random()
|
||||
* \sa class CwiseNullaryOp, setRandom(Index), setRandom(Index,Index)
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_DEVICE_FUNC inline Derived& DenseBase<Derived>::setRandom()
|
||||
inline Derived& DenseBase<Derived>::setRandom()
|
||||
{
|
||||
return *this = Random(rows(), cols());
|
||||
}
|
||||
|
||||
@@ -23,29 +23,23 @@ namespace internal {
|
||||
* Part 1 : the logic deciding a strategy for vectorization and unrolling
|
||||
***************************************************************************/
|
||||
|
||||
template<typename Func, typename Evaluator>
|
||||
template<typename Func, typename Derived>
|
||||
struct redux_traits
|
||||
{
|
||||
public:
|
||||
typedef typename find_best_packet<typename Evaluator::Scalar,Evaluator::SizeAtCompileTime>::type PacketType;
|
||||
typedef typename find_best_packet<typename Derived::Scalar,Derived::SizeAtCompileTime>::type PacketType;
|
||||
enum {
|
||||
PacketSize = unpacket_traits<PacketType>::size,
|
||||
InnerMaxSize = int(Evaluator::IsRowMajor)
|
||||
? Evaluator::MaxColsAtCompileTime
|
||||
: Evaluator::MaxRowsAtCompileTime,
|
||||
OuterMaxSize = int(Evaluator::IsRowMajor)
|
||||
? Evaluator::MaxRowsAtCompileTime
|
||||
: Evaluator::MaxColsAtCompileTime,
|
||||
SliceVectorizedWork = int(InnerMaxSize)==Dynamic ? Dynamic
|
||||
: int(OuterMaxSize)==Dynamic ? (int(InnerMaxSize)>=int(PacketSize) ? Dynamic : 0)
|
||||
: (int(InnerMaxSize)/int(PacketSize)) * int(OuterMaxSize)
|
||||
InnerMaxSize = int(Derived::IsRowMajor)
|
||||
? Derived::MaxColsAtCompileTime
|
||||
: Derived::MaxRowsAtCompileTime
|
||||
};
|
||||
|
||||
enum {
|
||||
MightVectorize = (int(Evaluator::Flags)&ActualPacketAccessBit)
|
||||
MightVectorize = (int(Derived::Flags)&ActualPacketAccessBit)
|
||||
&& (functor_traits<Func>::PacketAccess),
|
||||
MayLinearVectorize = bool(MightVectorize) && (int(Evaluator::Flags)&LinearAccessBit),
|
||||
MaySliceVectorize = bool(MightVectorize) && (int(SliceVectorizedWork)==Dynamic || int(SliceVectorizedWork)>=3)
|
||||
MayLinearVectorize = bool(MightVectorize) && (int(Derived::Flags)&LinearAccessBit),
|
||||
MaySliceVectorize = bool(MightVectorize) && int(InnerMaxSize)>=3*PacketSize
|
||||
};
|
||||
|
||||
public:
|
||||
@@ -57,8 +51,8 @@ public:
|
||||
|
||||
public:
|
||||
enum {
|
||||
Cost = Evaluator::SizeAtCompileTime == Dynamic ? HugeCost
|
||||
: Evaluator::SizeAtCompileTime * Evaluator::CoeffReadCost + (Evaluator::SizeAtCompileTime-1) * functor_traits<Func>::Cost,
|
||||
Cost = Derived::SizeAtCompileTime == Dynamic ? HugeCost
|
||||
: Derived::SizeAtCompileTime * Derived::CoeffReadCost + (Derived::SizeAtCompileTime-1) * functor_traits<Func>::Cost,
|
||||
UnrollingLimit = EIGEN_UNROLLING_LIMIT * (int(Traversal) == int(DefaultTraversal) ? 1 : int(PacketSize))
|
||||
};
|
||||
|
||||
@@ -70,20 +64,18 @@ public:
|
||||
#ifdef EIGEN_DEBUG_ASSIGN
|
||||
static void debug()
|
||||
{
|
||||
std::cerr << "Xpr: " << typeid(typename Evaluator::XprType).name() << std::endl;
|
||||
std::cerr << "Xpr: " << typeid(typename Derived::XprType).name() << std::endl;
|
||||
std::cerr.setf(std::ios::hex, std::ios::basefield);
|
||||
EIGEN_DEBUG_VAR(Evaluator::Flags)
|
||||
EIGEN_DEBUG_VAR(Derived::Flags)
|
||||
std::cerr.unsetf(std::ios::hex);
|
||||
EIGEN_DEBUG_VAR(InnerMaxSize)
|
||||
EIGEN_DEBUG_VAR(OuterMaxSize)
|
||||
EIGEN_DEBUG_VAR(SliceVectorizedWork)
|
||||
EIGEN_DEBUG_VAR(PacketSize)
|
||||
EIGEN_DEBUG_VAR(MightVectorize)
|
||||
EIGEN_DEBUG_VAR(MayLinearVectorize)
|
||||
EIGEN_DEBUG_VAR(MaySliceVectorize)
|
||||
std::cerr << "Traversal" << " = " << Traversal << " (" << demangle_traversal(Traversal) << ")" << std::endl;
|
||||
EIGEN_DEBUG_VAR(Traversal)
|
||||
EIGEN_DEBUG_VAR(UnrollingLimit)
|
||||
std::cerr << "Unrolling" << " = " << Unrolling << " (" << demangle_unrolling(Unrolling) << ")" << std::endl;
|
||||
EIGEN_DEBUG_VAR(Unrolling)
|
||||
std::cerr << std::endl;
|
||||
}
|
||||
#endif
|
||||
@@ -95,86 +87,88 @@ public:
|
||||
|
||||
/*** no vectorization ***/
|
||||
|
||||
template<typename Func, typename Evaluator, int Start, int Length>
|
||||
template<typename Func, typename Derived, int Start, int Length>
|
||||
struct redux_novec_unroller
|
||||
{
|
||||
enum {
|
||||
HalfLength = Length/2
|
||||
};
|
||||
|
||||
typedef typename Evaluator::Scalar Scalar;
|
||||
typedef typename Derived::Scalar Scalar;
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
static EIGEN_STRONG_INLINE Scalar run(const Evaluator &eval, const Func& func)
|
||||
static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func)
|
||||
{
|
||||
return func(redux_novec_unroller<Func, Evaluator, Start, HalfLength>::run(eval,func),
|
||||
redux_novec_unroller<Func, Evaluator, Start+HalfLength, Length-HalfLength>::run(eval,func));
|
||||
return func(redux_novec_unroller<Func, Derived, Start, HalfLength>::run(mat,func),
|
||||
redux_novec_unroller<Func, Derived, Start+HalfLength, Length-HalfLength>::run(mat,func));
|
||||
}
|
||||
};
|
||||
|
||||
template<typename Func, typename Evaluator, int Start>
|
||||
struct redux_novec_unroller<Func, Evaluator, Start, 1>
|
||||
template<typename Func, typename Derived, int Start>
|
||||
struct redux_novec_unroller<Func, Derived, Start, 1>
|
||||
{
|
||||
enum {
|
||||
outer = Start / Evaluator::InnerSizeAtCompileTime,
|
||||
inner = Start % Evaluator::InnerSizeAtCompileTime
|
||||
outer = Start / Derived::InnerSizeAtCompileTime,
|
||||
inner = Start % Derived::InnerSizeAtCompileTime
|
||||
};
|
||||
|
||||
typedef typename Evaluator::Scalar Scalar;
|
||||
typedef typename Derived::Scalar Scalar;
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
static EIGEN_STRONG_INLINE Scalar run(const Evaluator &eval, const Func&)
|
||||
static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func&)
|
||||
{
|
||||
return eval.coeffByOuterInner(outer, inner);
|
||||
return mat.coeffByOuterInner(outer, inner);
|
||||
}
|
||||
};
|
||||
|
||||
// This is actually dead code and will never be called. It is required
|
||||
// to prevent false warnings regarding failed inlining though
|
||||
// for 0 length run() will never be called at all.
|
||||
template<typename Func, typename Evaluator, int Start>
|
||||
struct redux_novec_unroller<Func, Evaluator, Start, 0>
|
||||
template<typename Func, typename Derived, int Start>
|
||||
struct redux_novec_unroller<Func, Derived, Start, 0>
|
||||
{
|
||||
typedef typename Evaluator::Scalar Scalar;
|
||||
typedef typename Derived::Scalar Scalar;
|
||||
EIGEN_DEVICE_FUNC
|
||||
static EIGEN_STRONG_INLINE Scalar run(const Evaluator&, const Func&) { return Scalar(); }
|
||||
static EIGEN_STRONG_INLINE Scalar run(const Derived&, const Func&) { return Scalar(); }
|
||||
};
|
||||
|
||||
/*** vectorization ***/
|
||||
|
||||
template<typename Func, typename Evaluator, int Start, int Length>
|
||||
template<typename Func, typename Derived, int Start, int Length>
|
||||
struct redux_vec_unroller
|
||||
{
|
||||
template<typename PacketType>
|
||||
EIGEN_DEVICE_FUNC
|
||||
static EIGEN_STRONG_INLINE PacketType run(const Evaluator &eval, const Func& func)
|
||||
{
|
||||
enum {
|
||||
PacketSize = unpacket_traits<PacketType>::size,
|
||||
HalfLength = Length/2
|
||||
};
|
||||
enum {
|
||||
PacketSize = redux_traits<Func, Derived>::PacketSize,
|
||||
HalfLength = Length/2
|
||||
};
|
||||
|
||||
typedef typename Derived::Scalar Scalar;
|
||||
typedef typename redux_traits<Func, Derived>::PacketType PacketScalar;
|
||||
|
||||
static EIGEN_STRONG_INLINE PacketScalar run(const Derived &mat, const Func& func)
|
||||
{
|
||||
return func.packetOp(
|
||||
redux_vec_unroller<Func, Evaluator, Start, HalfLength>::template run<PacketType>(eval,func),
|
||||
redux_vec_unroller<Func, Evaluator, Start+HalfLength, Length-HalfLength>::template run<PacketType>(eval,func) );
|
||||
redux_vec_unroller<Func, Derived, Start, HalfLength>::run(mat,func),
|
||||
redux_vec_unroller<Func, Derived, Start+HalfLength, Length-HalfLength>::run(mat,func) );
|
||||
}
|
||||
};
|
||||
|
||||
template<typename Func, typename Evaluator, int Start>
|
||||
struct redux_vec_unroller<Func, Evaluator, Start, 1>
|
||||
template<typename Func, typename Derived, int Start>
|
||||
struct redux_vec_unroller<Func, Derived, Start, 1>
|
||||
{
|
||||
template<typename PacketType>
|
||||
EIGEN_DEVICE_FUNC
|
||||
static EIGEN_STRONG_INLINE PacketType run(const Evaluator &eval, const Func&)
|
||||
enum {
|
||||
index = Start * redux_traits<Func, Derived>::PacketSize,
|
||||
outer = index / int(Derived::InnerSizeAtCompileTime),
|
||||
inner = index % int(Derived::InnerSizeAtCompileTime),
|
||||
alignment = Derived::Alignment
|
||||
};
|
||||
|
||||
typedef typename Derived::Scalar Scalar;
|
||||
typedef typename redux_traits<Func, Derived>::PacketType PacketScalar;
|
||||
|
||||
static EIGEN_STRONG_INLINE PacketScalar run(const Derived &mat, const Func&)
|
||||
{
|
||||
enum {
|
||||
PacketSize = unpacket_traits<PacketType>::size,
|
||||
index = Start * PacketSize,
|
||||
outer = index / int(Evaluator::InnerSizeAtCompileTime),
|
||||
inner = index % int(Evaluator::InnerSizeAtCompileTime),
|
||||
alignment = Evaluator::Alignment
|
||||
};
|
||||
return eval.template packetByOuterInner<alignment,PacketType>(outer, inner);
|
||||
return mat.template packetByOuterInner<alignment,PacketScalar>(outer, inner);
|
||||
}
|
||||
};
|
||||
|
||||
@@ -182,65 +176,53 @@ struct redux_vec_unroller<Func, Evaluator, Start, 1>
|
||||
* Part 3 : implementation of all cases
|
||||
***************************************************************************/
|
||||
|
||||
template<typename Func, typename Evaluator,
|
||||
int Traversal = redux_traits<Func, Evaluator>::Traversal,
|
||||
int Unrolling = redux_traits<Func, Evaluator>::Unrolling
|
||||
template<typename Func, typename Derived,
|
||||
int Traversal = redux_traits<Func, Derived>::Traversal,
|
||||
int Unrolling = redux_traits<Func, Derived>::Unrolling
|
||||
>
|
||||
struct redux_impl;
|
||||
|
||||
template<typename Func, typename Evaluator>
|
||||
struct redux_impl<Func, Evaluator, DefaultTraversal, NoUnrolling>
|
||||
template<typename Func, typename Derived>
|
||||
struct redux_impl<Func, Derived, DefaultTraversal, NoUnrolling>
|
||||
{
|
||||
typedef typename Evaluator::Scalar Scalar;
|
||||
|
||||
template<typename XprType>
|
||||
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE
|
||||
Scalar run(const Evaluator &eval, const Func& func, const XprType& xpr)
|
||||
typedef typename Derived::Scalar Scalar;
|
||||
EIGEN_DEVICE_FUNC
|
||||
static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func)
|
||||
{
|
||||
eigen_assert(xpr.rows()>0 && xpr.cols()>0 && "you are using an empty matrix");
|
||||
eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix");
|
||||
Scalar res;
|
||||
res = eval.coeffByOuterInner(0, 0);
|
||||
for(Index i = 1; i < xpr.innerSize(); ++i)
|
||||
res = func(res, eval.coeffByOuterInner(0, i));
|
||||
for(Index i = 1; i < xpr.outerSize(); ++i)
|
||||
for(Index j = 0; j < xpr.innerSize(); ++j)
|
||||
res = func(res, eval.coeffByOuterInner(i, j));
|
||||
res = mat.coeffByOuterInner(0, 0);
|
||||
for(Index i = 1; i < mat.innerSize(); ++i)
|
||||
res = func(res, mat.coeffByOuterInner(0, i));
|
||||
for(Index i = 1; i < mat.outerSize(); ++i)
|
||||
for(Index j = 0; j < mat.innerSize(); ++j)
|
||||
res = func(res, mat.coeffByOuterInner(i, j));
|
||||
return res;
|
||||
}
|
||||
};
|
||||
|
||||
template<typename Func, typename Evaluator>
|
||||
struct redux_impl<Func,Evaluator, DefaultTraversal, CompleteUnrolling>
|
||||
: redux_novec_unroller<Func,Evaluator, 0, Evaluator::SizeAtCompileTime>
|
||||
{
|
||||
typedef redux_novec_unroller<Func,Evaluator, 0, Evaluator::SizeAtCompileTime> Base;
|
||||
typedef typename Evaluator::Scalar Scalar;
|
||||
template<typename XprType>
|
||||
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE
|
||||
Scalar run(const Evaluator &eval, const Func& func, const XprType& /*xpr*/)
|
||||
{
|
||||
return Base::run(eval,func);
|
||||
}
|
||||
};
|
||||
template<typename Func, typename Derived>
|
||||
struct redux_impl<Func,Derived, DefaultTraversal, CompleteUnrolling>
|
||||
: public redux_novec_unroller<Func,Derived, 0, Derived::SizeAtCompileTime>
|
||||
{};
|
||||
|
||||
template<typename Func, typename Evaluator>
|
||||
struct redux_impl<Func, Evaluator, LinearVectorizedTraversal, NoUnrolling>
|
||||
template<typename Func, typename Derived>
|
||||
struct redux_impl<Func, Derived, LinearVectorizedTraversal, NoUnrolling>
|
||||
{
|
||||
typedef typename Evaluator::Scalar Scalar;
|
||||
typedef typename redux_traits<Func, Evaluator>::PacketType PacketScalar;
|
||||
typedef typename Derived::Scalar Scalar;
|
||||
typedef typename redux_traits<Func, Derived>::PacketType PacketScalar;
|
||||
|
||||
template<typename XprType>
|
||||
static Scalar run(const Evaluator &eval, const Func& func, const XprType& xpr)
|
||||
static Scalar run(const Derived &mat, const Func& func)
|
||||
{
|
||||
const Index size = xpr.size();
|
||||
const Index size = mat.size();
|
||||
|
||||
const Index packetSize = redux_traits<Func, Evaluator>::PacketSize;
|
||||
const Index packetSize = redux_traits<Func, Derived>::PacketSize;
|
||||
const int packetAlignment = unpacket_traits<PacketScalar>::alignment;
|
||||
enum {
|
||||
alignment0 = (bool(Evaluator::Flags & DirectAccessBit) && bool(packet_traits<Scalar>::AlignedOnScalar)) ? int(packetAlignment) : int(Unaligned),
|
||||
alignment = EIGEN_PLAIN_ENUM_MAX(alignment0, Evaluator::Alignment)
|
||||
alignment0 = (bool(Derived::Flags & DirectAccessBit) && bool(packet_traits<Scalar>::AlignedOnScalar)) ? int(packetAlignment) : int(Unaligned),
|
||||
alignment = EIGEN_PLAIN_ENUM_MAX(alignment0, Derived::Alignment)
|
||||
};
|
||||
const Index alignedStart = internal::first_default_aligned(xpr);
|
||||
const Index alignedStart = internal::first_default_aligned(mat.nestedExpression());
|
||||
const Index alignedSize2 = ((size-alignedStart)/(2*packetSize))*(2*packetSize);
|
||||
const Index alignedSize = ((size-alignedStart)/(packetSize))*(packetSize);
|
||||
const Index alignedEnd2 = alignedStart + alignedSize2;
|
||||
@@ -248,34 +230,34 @@ struct redux_impl<Func, Evaluator, LinearVectorizedTraversal, NoUnrolling>
|
||||
Scalar res;
|
||||
if(alignedSize)
|
||||
{
|
||||
PacketScalar packet_res0 = eval.template packet<alignment,PacketScalar>(alignedStart);
|
||||
PacketScalar packet_res0 = mat.template packet<alignment,PacketScalar>(alignedStart);
|
||||
if(alignedSize>packetSize) // we have at least two packets to partly unroll the loop
|
||||
{
|
||||
PacketScalar packet_res1 = eval.template packet<alignment,PacketScalar>(alignedStart+packetSize);
|
||||
PacketScalar packet_res1 = mat.template packet<alignment,PacketScalar>(alignedStart+packetSize);
|
||||
for(Index index = alignedStart + 2*packetSize; index < alignedEnd2; index += 2*packetSize)
|
||||
{
|
||||
packet_res0 = func.packetOp(packet_res0, eval.template packet<alignment,PacketScalar>(index));
|
||||
packet_res1 = func.packetOp(packet_res1, eval.template packet<alignment,PacketScalar>(index+packetSize));
|
||||
packet_res0 = func.packetOp(packet_res0, mat.template packet<alignment,PacketScalar>(index));
|
||||
packet_res1 = func.packetOp(packet_res1, mat.template packet<alignment,PacketScalar>(index+packetSize));
|
||||
}
|
||||
|
||||
packet_res0 = func.packetOp(packet_res0,packet_res1);
|
||||
if(alignedEnd>alignedEnd2)
|
||||
packet_res0 = func.packetOp(packet_res0, eval.template packet<alignment,PacketScalar>(alignedEnd2));
|
||||
packet_res0 = func.packetOp(packet_res0, mat.template packet<alignment,PacketScalar>(alignedEnd2));
|
||||
}
|
||||
res = func.predux(packet_res0);
|
||||
|
||||
for(Index index = 0; index < alignedStart; ++index)
|
||||
res = func(res,eval.coeff(index));
|
||||
res = func(res,mat.coeff(index));
|
||||
|
||||
for(Index index = alignedEnd; index < size; ++index)
|
||||
res = func(res,eval.coeff(index));
|
||||
res = func(res,mat.coeff(index));
|
||||
}
|
||||
else // too small to vectorize anything.
|
||||
// since this is dynamic-size hence inefficient anyway for such small sizes, don't try to optimize.
|
||||
{
|
||||
res = eval.coeff(0);
|
||||
res = mat.coeff(0);
|
||||
for(Index index = 1; index < size; ++index)
|
||||
res = func(res,eval.coeff(index));
|
||||
res = func(res,mat.coeff(index));
|
||||
}
|
||||
|
||||
return res;
|
||||
@@ -283,108 +265,130 @@ struct redux_impl<Func, Evaluator, LinearVectorizedTraversal, NoUnrolling>
|
||||
};
|
||||
|
||||
// NOTE: for SliceVectorizedTraversal we simply bypass unrolling
|
||||
template<typename Func, typename Evaluator, int Unrolling>
|
||||
struct redux_impl<Func, Evaluator, SliceVectorizedTraversal, Unrolling>
|
||||
template<typename Func, typename Derived, int Unrolling>
|
||||
struct redux_impl<Func, Derived, SliceVectorizedTraversal, Unrolling>
|
||||
{
|
||||
typedef typename Evaluator::Scalar Scalar;
|
||||
typedef typename redux_traits<Func, Evaluator>::PacketType PacketType;
|
||||
typedef typename Derived::Scalar Scalar;
|
||||
typedef typename redux_traits<Func, Derived>::PacketType PacketType;
|
||||
|
||||
template<typename XprType>
|
||||
EIGEN_DEVICE_FUNC static Scalar run(const Evaluator &eval, const Func& func, const XprType& xpr)
|
||||
EIGEN_DEVICE_FUNC static Scalar run(const Derived &mat, const Func& func)
|
||||
{
|
||||
eigen_assert(xpr.rows()>0 && xpr.cols()>0 && "you are using an empty matrix");
|
||||
const Index innerSize = xpr.innerSize();
|
||||
const Index outerSize = xpr.outerSize();
|
||||
eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix");
|
||||
const Index innerSize = mat.innerSize();
|
||||
const Index outerSize = mat.outerSize();
|
||||
enum {
|
||||
packetSize = redux_traits<Func, Evaluator>::PacketSize
|
||||
packetSize = redux_traits<Func, Derived>::PacketSize
|
||||
};
|
||||
const Index packetedInnerSize = ((innerSize)/packetSize)*packetSize;
|
||||
Scalar res;
|
||||
if(packetedInnerSize)
|
||||
{
|
||||
PacketType packet_res = eval.template packet<Unaligned,PacketType>(0,0);
|
||||
PacketType packet_res = mat.template packet<Unaligned,PacketType>(0,0);
|
||||
for(Index j=0; j<outerSize; ++j)
|
||||
for(Index i=(j==0?packetSize:0); i<packetedInnerSize; i+=Index(packetSize))
|
||||
packet_res = func.packetOp(packet_res, eval.template packetByOuterInner<Unaligned,PacketType>(j,i));
|
||||
packet_res = func.packetOp(packet_res, mat.template packetByOuterInner<Unaligned,PacketType>(j,i));
|
||||
|
||||
res = func.predux(packet_res);
|
||||
for(Index j=0; j<outerSize; ++j)
|
||||
for(Index i=packetedInnerSize; i<innerSize; ++i)
|
||||
res = func(res, eval.coeffByOuterInner(j,i));
|
||||
res = func(res, mat.coeffByOuterInner(j,i));
|
||||
}
|
||||
else // too small to vectorize anything.
|
||||
// since this is dynamic-size hence inefficient anyway for such small sizes, don't try to optimize.
|
||||
{
|
||||
res = redux_impl<Func, Evaluator, DefaultTraversal, NoUnrolling>::run(eval, func, xpr);
|
||||
res = redux_impl<Func, Derived, DefaultTraversal, NoUnrolling>::run(mat, func);
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
};
|
||||
|
||||
template<typename Func, typename Evaluator>
|
||||
struct redux_impl<Func, Evaluator, LinearVectorizedTraversal, CompleteUnrolling>
|
||||
template<typename Func, typename Derived>
|
||||
struct redux_impl<Func, Derived, LinearVectorizedTraversal, CompleteUnrolling>
|
||||
{
|
||||
typedef typename Evaluator::Scalar Scalar;
|
||||
typedef typename Derived::Scalar Scalar;
|
||||
|
||||
typedef typename redux_traits<Func, Evaluator>::PacketType PacketType;
|
||||
typedef typename redux_traits<Func, Derived>::PacketType PacketScalar;
|
||||
enum {
|
||||
PacketSize = redux_traits<Func, Evaluator>::PacketSize,
|
||||
Size = Evaluator::SizeAtCompileTime,
|
||||
PacketSize = redux_traits<Func, Derived>::PacketSize,
|
||||
Size = Derived::SizeAtCompileTime,
|
||||
VectorizedSize = (Size / PacketSize) * PacketSize
|
||||
};
|
||||
|
||||
template<typename XprType>
|
||||
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE
|
||||
Scalar run(const Evaluator &eval, const Func& func, const XprType &xpr)
|
||||
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func)
|
||||
{
|
||||
EIGEN_ONLY_USED_FOR_DEBUG(xpr)
|
||||
eigen_assert(xpr.rows()>0 && xpr.cols()>0 && "you are using an empty matrix");
|
||||
eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix");
|
||||
if (VectorizedSize > 0) {
|
||||
Scalar res = func.predux(redux_vec_unroller<Func, Evaluator, 0, Size / PacketSize>::template run<PacketType>(eval,func));
|
||||
Scalar res = func.predux(redux_vec_unroller<Func, Derived, 0, Size / PacketSize>::run(mat,func));
|
||||
if (VectorizedSize != Size)
|
||||
res = func(res,redux_novec_unroller<Func, Evaluator, VectorizedSize, Size-VectorizedSize>::run(eval,func));
|
||||
res = func(res,redux_novec_unroller<Func, Derived, VectorizedSize, Size-VectorizedSize>::run(mat,func));
|
||||
return res;
|
||||
}
|
||||
else {
|
||||
return redux_novec_unroller<Func, Evaluator, 0, Size>::run(eval,func);
|
||||
return redux_novec_unroller<Func, Derived, 0, Size>::run(mat,func);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// evaluator adaptor
|
||||
template<typename _XprType>
|
||||
class redux_evaluator : public internal::evaluator<_XprType>
|
||||
class redux_evaluator
|
||||
{
|
||||
typedef internal::evaluator<_XprType> Base;
|
||||
public:
|
||||
typedef _XprType XprType;
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
explicit redux_evaluator(const XprType &xpr) : Base(xpr) {}
|
||||
EIGEN_DEVICE_FUNC explicit redux_evaluator(const XprType &xpr) : m_evaluator(xpr), m_xpr(xpr) {}
|
||||
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename XprType::PacketScalar PacketScalar;
|
||||
typedef typename XprType::PacketReturnType PacketReturnType;
|
||||
|
||||
enum {
|
||||
MaxRowsAtCompileTime = XprType::MaxRowsAtCompileTime,
|
||||
MaxColsAtCompileTime = XprType::MaxColsAtCompileTime,
|
||||
// TODO we should not remove DirectAccessBit and rather find an elegant way to query the alignment offset at runtime from the evaluator
|
||||
Flags = Base::Flags & ~DirectAccessBit,
|
||||
Flags = evaluator<XprType>::Flags & ~DirectAccessBit,
|
||||
IsRowMajor = XprType::IsRowMajor,
|
||||
SizeAtCompileTime = XprType::SizeAtCompileTime,
|
||||
InnerSizeAtCompileTime = XprType::InnerSizeAtCompileTime
|
||||
InnerSizeAtCompileTime = XprType::InnerSizeAtCompileTime,
|
||||
CoeffReadCost = evaluator<XprType>::CoeffReadCost,
|
||||
Alignment = evaluator<XprType>::Alignment
|
||||
};
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
EIGEN_DEVICE_FUNC Index rows() const { return m_xpr.rows(); }
|
||||
EIGEN_DEVICE_FUNC Index cols() const { return m_xpr.cols(); }
|
||||
EIGEN_DEVICE_FUNC Index size() const { return m_xpr.size(); }
|
||||
EIGEN_DEVICE_FUNC Index innerSize() const { return m_xpr.innerSize(); }
|
||||
EIGEN_DEVICE_FUNC Index outerSize() const { return m_xpr.outerSize(); }
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
CoeffReturnType coeff(Index row, Index col) const
|
||||
{ return m_evaluator.coeff(row, col); }
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
CoeffReturnType coeff(Index index) const
|
||||
{ return m_evaluator.coeff(index); }
|
||||
|
||||
template<int LoadMode, typename PacketType>
|
||||
PacketType packet(Index row, Index col) const
|
||||
{ return m_evaluator.template packet<LoadMode,PacketType>(row, col); }
|
||||
|
||||
template<int LoadMode, typename PacketType>
|
||||
PacketType packet(Index index) const
|
||||
{ return m_evaluator.template packet<LoadMode,PacketType>(index); }
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
CoeffReturnType coeffByOuterInner(Index outer, Index inner) const
|
||||
{ return Base::coeff(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }
|
||||
{ return m_evaluator.coeff(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }
|
||||
|
||||
template<int LoadMode, typename PacketType>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
PacketType packetByOuterInner(Index outer, Index inner) const
|
||||
{ return Base::template packet<LoadMode,PacketType>(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }
|
||||
{ return m_evaluator.template packet<LoadMode,PacketType>(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }
|
||||
|
||||
const XprType & nestedExpression() const { return m_xpr; }
|
||||
|
||||
protected:
|
||||
internal::evaluator<XprType> m_evaluator;
|
||||
const XprType &m_xpr;
|
||||
};
|
||||
|
||||
} // end namespace internal
|
||||
@@ -399,42 +403,36 @@ public:
|
||||
* The template parameter \a BinaryOp is the type of the functor \a func which must be
|
||||
* an associative operator. Both current C++98 and C++11 functor styles are handled.
|
||||
*
|
||||
* \warning the matrix must be not empty, otherwise an assertion is triggered.
|
||||
*
|
||||
* \sa DenseBase::sum(), DenseBase::minCoeff(), DenseBase::maxCoeff(), MatrixBase::colwise(), MatrixBase::rowwise()
|
||||
*/
|
||||
template<typename Derived>
|
||||
template<typename Func>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
|
||||
EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
|
||||
DenseBase<Derived>::redux(const Func& func) const
|
||||
{
|
||||
eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix");
|
||||
|
||||
typedef typename internal::redux_evaluator<Derived> ThisEvaluator;
|
||||
ThisEvaluator thisEval(derived());
|
||||
|
||||
// The initial expression is passed to the reducer as an additional argument instead of
|
||||
// passing it as a member of redux_evaluator to help
|
||||
return internal::redux_impl<Func, ThisEvaluator>::run(thisEval, func, derived());
|
||||
|
||||
return internal::redux_impl<Func, ThisEvaluator>::run(thisEval, func);
|
||||
}
|
||||
|
||||
/** \returns the minimum of all coefficients of \c *this.
|
||||
* \warning the matrix must be not empty, otherwise an assertion is triggered.
|
||||
* \warning the result is undefined if \c *this contains NaN.
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
|
||||
EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
|
||||
DenseBase<Derived>::minCoeff() const
|
||||
{
|
||||
return derived().redux(Eigen::internal::scalar_min_op<Scalar,Scalar>());
|
||||
}
|
||||
|
||||
/** \returns the maximum of all coefficients of \c *this.
|
||||
* \warning the matrix must be not empty, otherwise an assertion is triggered.
|
||||
* \warning the result is undefined if \c *this contains NaN.
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
|
||||
EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
|
||||
DenseBase<Derived>::maxCoeff() const
|
||||
{
|
||||
return derived().redux(Eigen::internal::scalar_max_op<Scalar,Scalar>());
|
||||
@@ -447,7 +445,7 @@ DenseBase<Derived>::maxCoeff() const
|
||||
* \sa trace(), prod(), mean()
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
|
||||
EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
|
||||
DenseBase<Derived>::sum() const
|
||||
{
|
||||
if(SizeAtCompileTime==0 || (SizeAtCompileTime==Dynamic && size()==0))
|
||||
@@ -460,7 +458,7 @@ DenseBase<Derived>::sum() const
|
||||
* \sa trace(), prod(), sum()
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
|
||||
EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
|
||||
DenseBase<Derived>::mean() const
|
||||
{
|
||||
#ifdef __INTEL_COMPILER
|
||||
@@ -481,7 +479,7 @@ DenseBase<Derived>::mean() const
|
||||
* \sa sum(), mean(), trace()
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
|
||||
EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
|
||||
DenseBase<Derived>::prod() const
|
||||
{
|
||||
if(SizeAtCompileTime==0 || (SizeAtCompileTime==Dynamic && size()==0))
|
||||
@@ -496,7 +494,7 @@ DenseBase<Derived>::prod() const
|
||||
* \sa diagonal(), sum()
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
|
||||
EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
|
||||
MatrixBase<Derived>::trace() const
|
||||
{
|
||||
return derived().diagonal().sum();
|
||||
|
||||
@@ -28,13 +28,12 @@ struct traits<Ref<_PlainObjectType, _Options, _StrideType> >
|
||||
|
||||
template<typename Derived> struct match {
|
||||
enum {
|
||||
IsVectorAtCompileTime = PlainObjectType::IsVectorAtCompileTime || Derived::IsVectorAtCompileTime,
|
||||
HasDirectAccess = internal::has_direct_access<Derived>::ret,
|
||||
StorageOrderMatch = IsVectorAtCompileTime || ((PlainObjectType::Flags&RowMajorBit)==(Derived::Flags&RowMajorBit)),
|
||||
StorageOrderMatch = PlainObjectType::IsVectorAtCompileTime || Derived::IsVectorAtCompileTime || ((PlainObjectType::Flags&RowMajorBit)==(Derived::Flags&RowMajorBit)),
|
||||
InnerStrideMatch = int(StrideType::InnerStrideAtCompileTime)==int(Dynamic)
|
||||
|| int(StrideType::InnerStrideAtCompileTime)==int(Derived::InnerStrideAtCompileTime)
|
||||
|| (int(StrideType::InnerStrideAtCompileTime)==0 && int(Derived::InnerStrideAtCompileTime)==1),
|
||||
OuterStrideMatch = IsVectorAtCompileTime
|
||||
OuterStrideMatch = Derived::IsVectorAtCompileTime
|
||||
|| int(StrideType::OuterStrideAtCompileTime)==int(Dynamic) || int(StrideType::OuterStrideAtCompileTime)==int(Derived::OuterStrideAtCompileTime),
|
||||
// NOTE, this indirection of evaluator<Derived>::Alignment is needed
|
||||
// to workaround a very strange bug in MSVC related to the instantiation
|
||||
@@ -187,8 +186,6 @@ protected:
|
||||
* void foo(const Ref<MatrixXf,0,Stride<> >& A) { foo_impl(A); }
|
||||
* \endcode
|
||||
*
|
||||
* See also the following stackoverflow questions for further references:
|
||||
* - <a href="http://stackoverflow.com/questions/21132538/correct-usage-of-the-eigenref-class">Correct usage of the Eigen::Ref<> class</a>
|
||||
*
|
||||
* \sa PlainObjectBase::Map(), \ref TopicStorageOrders
|
||||
*/
|
||||
|
||||
@@ -115,7 +115,7 @@ template<typename MatrixType,int RowFactor,int ColFactor> class Replicate
|
||||
*/
|
||||
template<typename Derived>
|
||||
template<int RowFactor, int ColFactor>
|
||||
EIGEN_DEVICE_FUNC const Replicate<Derived,RowFactor,ColFactor>
|
||||
const Replicate<Derived,RowFactor,ColFactor>
|
||||
DenseBase<Derived>::replicate() const
|
||||
{
|
||||
return Replicate<Derived,RowFactor,ColFactor>(derived());
|
||||
@@ -130,7 +130,7 @@ DenseBase<Derived>::replicate() const
|
||||
* \sa VectorwiseOp::replicate(), DenseBase::replicate(), class Replicate
|
||||
*/
|
||||
template<typename ExpressionType, int Direction>
|
||||
EIGEN_DEVICE_FUNC const typename VectorwiseOp<ExpressionType,Direction>::ReplicateReturnType
|
||||
const typename VectorwiseOp<ExpressionType,Direction>::ReplicateReturnType
|
||||
VectorwiseOp<ExpressionType,Direction>::replicate(Index factor) const
|
||||
{
|
||||
return typename VectorwiseOp<ExpressionType,Direction>::ReplicateReturnType
|
||||
|
||||
@@ -1,453 +0,0 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2008-2017 Gael Guennebaud <gael.guennebaud@inria.fr>
|
||||
// Copyright (C) 2014 yoco <peter.xiau@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#ifndef EIGEN_RESHAPED_H
|
||||
#define EIGEN_RESHAPED_H
|
||||
|
||||
namespace Eigen {
|
||||
namespace internal {
|
||||
|
||||
/** \class Reshaped
|
||||
* \ingroup Core_Module
|
||||
*
|
||||
* \brief Expression of a fixed-size or dynamic-size reshape
|
||||
*
|
||||
* \tparam XprType the type of the expression in which we are taking a reshape
|
||||
* \tparam Rows the number of rows of the reshape we are taking at compile time (optional)
|
||||
* \tparam Cols the number of columns of the reshape we are taking at compile time (optional)
|
||||
* \tparam Order can be ColMajor or RowMajor, default is ColMajor.
|
||||
*
|
||||
* This class represents an expression of either a fixed-size or dynamic-size reshape.
|
||||
* It is the return type of DenseBase::reshaped(NRowsType,NColsType) and
|
||||
* most of the time this is the only way it is used.
|
||||
*
|
||||
* However, in C++98, if you want to directly maniputate reshaped expressions,
|
||||
* for instance if you want to write a function returning such an expression, you
|
||||
* will need to use this class. In C++11, it is advised to use the \em auto
|
||||
* keyword for such use cases.
|
||||
*
|
||||
* Here is an example illustrating the dynamic case:
|
||||
* \include class_Reshaped.cpp
|
||||
* Output: \verbinclude class_Reshaped.out
|
||||
*
|
||||
* Here is an example illustrating the fixed-size case:
|
||||
* \include class_FixedReshaped.cpp
|
||||
* Output: \verbinclude class_FixedReshaped.out
|
||||
*
|
||||
* \sa DenseBase::reshaped(NRowsType,NColsType)
|
||||
*/
|
||||
|
||||
template<typename XprType, int Rows, int Cols, int Order>
|
||||
struct traits<Reshaped<XprType, Rows, Cols, Order> > : traits<XprType>
|
||||
{
|
||||
typedef typename traits<XprType>::Scalar Scalar;
|
||||
typedef typename traits<XprType>::StorageKind StorageKind;
|
||||
typedef typename traits<XprType>::XprKind XprKind;
|
||||
enum{
|
||||
MatrixRows = traits<XprType>::RowsAtCompileTime,
|
||||
MatrixCols = traits<XprType>::ColsAtCompileTime,
|
||||
RowsAtCompileTime = Rows,
|
||||
ColsAtCompileTime = Cols,
|
||||
MaxRowsAtCompileTime = Rows,
|
||||
MaxColsAtCompileTime = Cols,
|
||||
XpxStorageOrder = ((int(traits<XprType>::Flags) & RowMajorBit) == RowMajorBit) ? RowMajor : ColMajor,
|
||||
ReshapedStorageOrder = (RowsAtCompileTime == 1 && ColsAtCompileTime != 1) ? RowMajor
|
||||
: (ColsAtCompileTime == 1 && RowsAtCompileTime != 1) ? ColMajor
|
||||
: XpxStorageOrder,
|
||||
HasSameStorageOrderAsXprType = (ReshapedStorageOrder == XpxStorageOrder),
|
||||
InnerSize = (ReshapedStorageOrder==int(RowMajor)) ? int(ColsAtCompileTime) : int(RowsAtCompileTime),
|
||||
InnerStrideAtCompileTime = HasSameStorageOrderAsXprType
|
||||
? int(inner_stride_at_compile_time<XprType>::ret)
|
||||
: Dynamic,
|
||||
OuterStrideAtCompileTime = Dynamic,
|
||||
|
||||
HasDirectAccess = internal::has_direct_access<XprType>::ret
|
||||
&& (Order==int(XpxStorageOrder))
|
||||
&& ((evaluator<XprType>::Flags&LinearAccessBit)==LinearAccessBit),
|
||||
|
||||
MaskPacketAccessBit = (InnerSize == Dynamic || (InnerSize % packet_traits<Scalar>::size) == 0)
|
||||
&& (InnerStrideAtCompileTime == 1)
|
||||
? PacketAccessBit : 0,
|
||||
//MaskAlignedBit = ((OuterStrideAtCompileTime!=Dynamic) && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % 16) == 0)) ? AlignedBit : 0,
|
||||
FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1) ? LinearAccessBit : 0,
|
||||
FlagsLvalueBit = is_lvalue<XprType>::value ? LvalueBit : 0,
|
||||
FlagsRowMajorBit = (ReshapedStorageOrder==int(RowMajor)) ? RowMajorBit : 0,
|
||||
FlagsDirectAccessBit = HasDirectAccess ? DirectAccessBit : 0,
|
||||
Flags0 = traits<XprType>::Flags & ( (HereditaryBits & ~RowMajorBit) | MaskPacketAccessBit),
|
||||
|
||||
Flags = (Flags0 | FlagsLinearAccessBit | FlagsLvalueBit | FlagsRowMajorBit | FlagsDirectAccessBit)
|
||||
};
|
||||
};
|
||||
|
||||
template<typename XprType, int Rows, int Cols, int Order, bool HasDirectAccess> class ReshapedImpl_dense;
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
template<typename XprType, int Rows, int Cols, int Order, typename StorageKind> class ReshapedImpl;
|
||||
|
||||
template<typename XprType, int Rows, int Cols, int Order> class Reshaped
|
||||
: public ReshapedImpl<XprType, Rows, Cols, Order, typename internal::traits<XprType>::StorageKind>
|
||||
{
|
||||
typedef ReshapedImpl<XprType, Rows, Cols, Order, typename internal::traits<XprType>::StorageKind> Impl;
|
||||
public:
|
||||
//typedef typename Impl::Base Base;
|
||||
typedef Impl Base;
|
||||
EIGEN_GENERIC_PUBLIC_INTERFACE(Reshaped)
|
||||
EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Reshaped)
|
||||
|
||||
/** Fixed-size constructor
|
||||
*/
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline Reshaped(XprType& xpr)
|
||||
: Impl(xpr)
|
||||
{
|
||||
EIGEN_STATIC_ASSERT(RowsAtCompileTime!=Dynamic && ColsAtCompileTime!=Dynamic,THIS_METHOD_IS_ONLY_FOR_FIXED_SIZE)
|
||||
eigen_assert(Rows * Cols == xpr.rows() * xpr.cols());
|
||||
}
|
||||
|
||||
/** Dynamic-size constructor
|
||||
*/
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline Reshaped(XprType& xpr,
|
||||
Index reshapeRows, Index reshapeCols)
|
||||
: Impl(xpr, reshapeRows, reshapeCols)
|
||||
{
|
||||
eigen_assert((RowsAtCompileTime==Dynamic || RowsAtCompileTime==reshapeRows)
|
||||
&& (ColsAtCompileTime==Dynamic || ColsAtCompileTime==reshapeCols));
|
||||
eigen_assert(reshapeRows * reshapeCols == xpr.rows() * xpr.cols());
|
||||
}
|
||||
};
|
||||
|
||||
// The generic default implementation for dense reshape simply forward to the internal::ReshapedImpl_dense
|
||||
// that must be specialized for direct and non-direct access...
|
||||
template<typename XprType, int Rows, int Cols, int Order>
|
||||
class ReshapedImpl<XprType, Rows, Cols, Order, Dense>
|
||||
: public internal::ReshapedImpl_dense<XprType, Rows, Cols, Order,internal::traits<Reshaped<XprType,Rows,Cols,Order> >::HasDirectAccess>
|
||||
{
|
||||
typedef internal::ReshapedImpl_dense<XprType, Rows, Cols, Order,internal::traits<Reshaped<XprType,Rows,Cols,Order> >::HasDirectAccess> Impl;
|
||||
public:
|
||||
typedef Impl Base;
|
||||
EIGEN_INHERIT_ASSIGNMENT_OPERATORS(ReshapedImpl)
|
||||
EIGEN_DEVICE_FUNC inline ReshapedImpl(XprType& xpr) : Impl(xpr) {}
|
||||
EIGEN_DEVICE_FUNC inline ReshapedImpl(XprType& xpr, Index reshapeRows, Index reshapeCols)
|
||||
: Impl(xpr, reshapeRows, reshapeCols) {}
|
||||
};
|
||||
|
||||
namespace internal {
|
||||
|
||||
/** \internal Internal implementation of dense Reshaped in the general case. */
|
||||
template<typename XprType, int Rows, int Cols, int Order>
|
||||
class ReshapedImpl_dense<XprType,Rows,Cols,Order,false>
|
||||
: public internal::dense_xpr_base<Reshaped<XprType, Rows, Cols, Order> >::type
|
||||
{
|
||||
typedef Reshaped<XprType, Rows, Cols, Order> ReshapedType;
|
||||
public:
|
||||
|
||||
typedef typename internal::dense_xpr_base<ReshapedType>::type Base;
|
||||
EIGEN_DENSE_PUBLIC_INTERFACE(ReshapedType)
|
||||
EIGEN_INHERIT_ASSIGNMENT_OPERATORS(ReshapedImpl_dense)
|
||||
|
||||
typedef typename internal::ref_selector<XprType>::non_const_type MatrixTypeNested;
|
||||
typedef typename internal::remove_all<XprType>::type NestedExpression;
|
||||
|
||||
class InnerIterator;
|
||||
|
||||
/** Fixed-size constructor
|
||||
*/
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline ReshapedImpl_dense(XprType& xpr)
|
||||
: m_xpr(xpr), m_rows(Rows), m_cols(Cols)
|
||||
{}
|
||||
|
||||
/** Dynamic-size constructor
|
||||
*/
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline ReshapedImpl_dense(XprType& xpr, Index nRows, Index nCols)
|
||||
: m_xpr(xpr), m_rows(nRows), m_cols(nCols)
|
||||
{}
|
||||
|
||||
EIGEN_DEVICE_FUNC Index rows() const { return m_rows; }
|
||||
EIGEN_DEVICE_FUNC Index cols() const { return m_cols; }
|
||||
|
||||
#ifdef EIGEN_PARSED_BY_DOXYGEN
|
||||
/** \sa MapBase::data() */
|
||||
EIGEN_DEVICE_FUNC inline const Scalar* data() const;
|
||||
EIGEN_DEVICE_FUNC inline Index innerStride() const;
|
||||
EIGEN_DEVICE_FUNC inline Index outerStride() const;
|
||||
#endif
|
||||
|
||||
/** \returns the nested expression */
|
||||
EIGEN_DEVICE_FUNC
|
||||
const typename internal::remove_all<XprType>::type&
|
||||
nestedExpression() const { return m_xpr; }
|
||||
|
||||
/** \returns the nested expression */
|
||||
EIGEN_DEVICE_FUNC
|
||||
typename internal::remove_reference<XprType>::type&
|
||||
nestedExpression() { return m_xpr; }
|
||||
|
||||
protected:
|
||||
|
||||
MatrixTypeNested m_xpr;
|
||||
const internal::variable_if_dynamic<Index, Rows> m_rows;
|
||||
const internal::variable_if_dynamic<Index, Cols> m_cols;
|
||||
};
|
||||
|
||||
|
||||
/** \internal Internal implementation of dense Reshaped in the direct access case. */
|
||||
template<typename XprType, int Rows, int Cols, int Order>
|
||||
class ReshapedImpl_dense<XprType, Rows, Cols, Order, true>
|
||||
: public MapBase<Reshaped<XprType, Rows, Cols, Order> >
|
||||
{
|
||||
typedef Reshaped<XprType, Rows, Cols, Order> ReshapedType;
|
||||
typedef typename internal::ref_selector<XprType>::non_const_type XprTypeNested;
|
||||
public:
|
||||
|
||||
typedef MapBase<ReshapedType> Base;
|
||||
EIGEN_DENSE_PUBLIC_INTERFACE(ReshapedType)
|
||||
EIGEN_INHERIT_ASSIGNMENT_OPERATORS(ReshapedImpl_dense)
|
||||
|
||||
/** Fixed-size constructor
|
||||
*/
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline ReshapedImpl_dense(XprType& xpr)
|
||||
: Base(xpr.data()), m_xpr(xpr)
|
||||
{}
|
||||
|
||||
/** Dynamic-size constructor
|
||||
*/
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline ReshapedImpl_dense(XprType& xpr, Index nRows, Index nCols)
|
||||
: Base(xpr.data(), nRows, nCols),
|
||||
m_xpr(xpr)
|
||||
{}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
const typename internal::remove_all<XprTypeNested>::type& nestedExpression() const
|
||||
{
|
||||
return m_xpr;
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
XprType& nestedExpression() { return m_xpr; }
|
||||
|
||||
/** \sa MapBase::innerStride() */
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline Index innerStride() const
|
||||
{
|
||||
return m_xpr.innerStride();
|
||||
}
|
||||
|
||||
/** \sa MapBase::outerStride() */
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline Index outerStride() const
|
||||
{
|
||||
return ((Flags&RowMajorBit)==RowMajorBit) ? this->cols() : this->rows();
|
||||
}
|
||||
|
||||
protected:
|
||||
|
||||
XprTypeNested m_xpr;
|
||||
};
|
||||
|
||||
// Evaluators
|
||||
template<typename ArgType, int Rows, int Cols, int Order, bool HasDirectAccess> struct reshaped_evaluator;
|
||||
|
||||
template<typename ArgType, int Rows, int Cols, int Order>
|
||||
struct evaluator<Reshaped<ArgType, Rows, Cols, Order> >
|
||||
: reshaped_evaluator<ArgType, Rows, Cols, Order, traits<Reshaped<ArgType,Rows,Cols,Order> >::HasDirectAccess>
|
||||
{
|
||||
typedef Reshaped<ArgType, Rows, Cols, Order> XprType;
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
// TODO: should check for smaller packet types
|
||||
typedef typename packet_traits<Scalar>::type PacketScalar;
|
||||
|
||||
enum {
|
||||
CoeffReadCost = evaluator<ArgType>::CoeffReadCost,
|
||||
HasDirectAccess = traits<XprType>::HasDirectAccess,
|
||||
|
||||
// RowsAtCompileTime = traits<XprType>::RowsAtCompileTime,
|
||||
// ColsAtCompileTime = traits<XprType>::ColsAtCompileTime,
|
||||
// MaxRowsAtCompileTime = traits<XprType>::MaxRowsAtCompileTime,
|
||||
// MaxColsAtCompileTime = traits<XprType>::MaxColsAtCompileTime,
|
||||
//
|
||||
// InnerStrideAtCompileTime = traits<XprType>::HasSameStorageOrderAsXprType
|
||||
// ? int(inner_stride_at_compile_time<ArgType>::ret)
|
||||
// : Dynamic,
|
||||
// OuterStrideAtCompileTime = Dynamic,
|
||||
|
||||
FlagsLinearAccessBit = (traits<XprType>::RowsAtCompileTime == 1 || traits<XprType>::ColsAtCompileTime == 1 || HasDirectAccess) ? LinearAccessBit : 0,
|
||||
FlagsRowMajorBit = (traits<XprType>::ReshapedStorageOrder==int(RowMajor)) ? RowMajorBit : 0,
|
||||
FlagsDirectAccessBit = HasDirectAccess ? DirectAccessBit : 0,
|
||||
Flags0 = evaluator<ArgType>::Flags & (HereditaryBits & ~RowMajorBit),
|
||||
Flags = Flags0 | FlagsLinearAccessBit | FlagsRowMajorBit | FlagsDirectAccessBit,
|
||||
|
||||
PacketAlignment = unpacket_traits<PacketScalar>::alignment,
|
||||
Alignment = evaluator<ArgType>::Alignment
|
||||
};
|
||||
typedef reshaped_evaluator<ArgType, Rows, Cols, Order, HasDirectAccess> reshaped_evaluator_type;
|
||||
EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) : reshaped_evaluator_type(xpr)
|
||||
{
|
||||
EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
|
||||
}
|
||||
};
|
||||
|
||||
template<typename ArgType, int Rows, int Cols, int Order>
|
||||
struct reshaped_evaluator<ArgType, Rows, Cols, Order, /* HasDirectAccess */ false>
|
||||
: evaluator_base<Reshaped<ArgType, Rows, Cols, Order> >
|
||||
{
|
||||
typedef Reshaped<ArgType, Rows, Cols, Order> XprType;
|
||||
|
||||
enum {
|
||||
CoeffReadCost = evaluator<ArgType>::CoeffReadCost /* TODO + cost of index computations */,
|
||||
|
||||
Flags = (evaluator<ArgType>::Flags & (HereditaryBits /*| LinearAccessBit | DirectAccessBit*/)),
|
||||
|
||||
Alignment = 0
|
||||
};
|
||||
|
||||
EIGEN_DEVICE_FUNC explicit reshaped_evaluator(const XprType& xpr) : m_argImpl(xpr.nestedExpression()), m_xpr(xpr)
|
||||
{
|
||||
EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
|
||||
}
|
||||
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
|
||||
typedef std::pair<Index, Index> RowCol;
|
||||
|
||||
inline RowCol index_remap(Index rowId, Index colId) const
|
||||
{
|
||||
if(Order==ColMajor)
|
||||
{
|
||||
const Index nth_elem_idx = colId * m_xpr.rows() + rowId;
|
||||
return RowCol(nth_elem_idx % m_xpr.nestedExpression().rows(),
|
||||
nth_elem_idx / m_xpr.nestedExpression().rows());
|
||||
}
|
||||
else
|
||||
{
|
||||
const Index nth_elem_idx = colId + rowId * m_xpr.cols();
|
||||
return RowCol(nth_elem_idx / m_xpr.nestedExpression().cols(),
|
||||
nth_elem_idx % m_xpr.nestedExpression().cols());
|
||||
}
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline Scalar& coeffRef(Index rowId, Index colId)
|
||||
{
|
||||
EIGEN_STATIC_ASSERT_LVALUE(XprType)
|
||||
const RowCol row_col = index_remap(rowId, colId);
|
||||
return m_argImpl.coeffRef(row_col.first, row_col.second);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline const Scalar& coeffRef(Index rowId, Index colId) const
|
||||
{
|
||||
const RowCol row_col = index_remap(rowId, colId);
|
||||
return m_argImpl.coeffRef(row_col.first, row_col.second);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE const CoeffReturnType coeff(Index rowId, Index colId) const
|
||||
{
|
||||
const RowCol row_col = index_remap(rowId, colId);
|
||||
return m_argImpl.coeff(row_col.first, row_col.second);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline Scalar& coeffRef(Index index)
|
||||
{
|
||||
EIGEN_STATIC_ASSERT_LVALUE(XprType)
|
||||
const RowCol row_col = index_remap(Rows == 1 ? 0 : index,
|
||||
Rows == 1 ? index : 0);
|
||||
return m_argImpl.coeffRef(row_col.first, row_col.second);
|
||||
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline const Scalar& coeffRef(Index index) const
|
||||
{
|
||||
const RowCol row_col = index_remap(Rows == 1 ? 0 : index,
|
||||
Rows == 1 ? index : 0);
|
||||
return m_argImpl.coeffRef(row_col.first, row_col.second);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline const CoeffReturnType coeff(Index index) const
|
||||
{
|
||||
const RowCol row_col = index_remap(Rows == 1 ? 0 : index,
|
||||
Rows == 1 ? index : 0);
|
||||
return m_argImpl.coeff(row_col.first, row_col.second);
|
||||
}
|
||||
#if 0
|
||||
EIGEN_DEVICE_FUNC
|
||||
template<int LoadMode>
|
||||
inline PacketScalar packet(Index rowId, Index colId) const
|
||||
{
|
||||
const RowCol row_col = index_remap(rowId, colId);
|
||||
return m_argImpl.template packet<Unaligned>(row_col.first, row_col.second);
|
||||
|
||||
}
|
||||
|
||||
template<int LoadMode>
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline void writePacket(Index rowId, Index colId, const PacketScalar& val)
|
||||
{
|
||||
const RowCol row_col = index_remap(rowId, colId);
|
||||
m_argImpl.const_cast_derived().template writePacket<Unaligned>
|
||||
(row_col.first, row_col.second, val);
|
||||
}
|
||||
|
||||
template<int LoadMode>
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline PacketScalar packet(Index index) const
|
||||
{
|
||||
const RowCol row_col = index_remap(RowsAtCompileTime == 1 ? 0 : index,
|
||||
RowsAtCompileTime == 1 ? index : 0);
|
||||
return m_argImpl.template packet<Unaligned>(row_col.first, row_col.second);
|
||||
}
|
||||
|
||||
template<int LoadMode>
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline void writePacket(Index index, const PacketScalar& val)
|
||||
{
|
||||
const RowCol row_col = index_remap(RowsAtCompileTime == 1 ? 0 : index,
|
||||
RowsAtCompileTime == 1 ? index : 0);
|
||||
return m_argImpl.template packet<Unaligned>(row_col.first, row_col.second, val);
|
||||
}
|
||||
#endif
|
||||
protected:
|
||||
|
||||
evaluator<ArgType> m_argImpl;
|
||||
const XprType& m_xpr;
|
||||
|
||||
};
|
||||
|
||||
template<typename ArgType, int Rows, int Cols, int Order>
|
||||
struct reshaped_evaluator<ArgType, Rows, Cols, Order, /* HasDirectAccess */ true>
|
||||
: mapbase_evaluator<Reshaped<ArgType, Rows, Cols, Order>,
|
||||
typename Reshaped<ArgType, Rows, Cols, Order>::PlainObject>
|
||||
{
|
||||
typedef Reshaped<ArgType, Rows, Cols, Order> XprType;
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
|
||||
EIGEN_DEVICE_FUNC explicit reshaped_evaluator(const XprType& xpr)
|
||||
: mapbase_evaluator<XprType, typename XprType::PlainObject>(xpr)
|
||||
{
|
||||
// TODO: for the 3.4 release, this should be turned to an internal assertion, but let's keep it as is for the beta lifetime
|
||||
eigen_assert(((internal::UIntPtr(xpr.data()) % EIGEN_PLAIN_ENUM_MAX(1,evaluator<XprType>::Alignment)) == 0) && "data is not aligned");
|
||||
}
|
||||
};
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_RESHAPED_H
|
||||
@@ -79,7 +79,7 @@ template<typename Derived> class ReturnByValue
|
||||
|
||||
template<typename Derived>
|
||||
template<typename OtherDerived>
|
||||
EIGEN_DEVICE_FUNC Derived& DenseBase<Derived>::operator=(const ReturnByValue<OtherDerived>& other)
|
||||
Derived& DenseBase<Derived>::operator=(const ReturnByValue<OtherDerived>& other)
|
||||
{
|
||||
other.evalTo(derived());
|
||||
return derived();
|
||||
|
||||
@@ -114,7 +114,7 @@ template<typename MatrixType, int Direction> class Reverse
|
||||
*
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_DEVICE_FUNC inline typename DenseBase<Derived>::ReverseReturnType
|
||||
inline typename DenseBase<Derived>::ReverseReturnType
|
||||
DenseBase<Derived>::reverse()
|
||||
{
|
||||
return ReverseReturnType(derived());
|
||||
@@ -136,7 +136,7 @@ DenseBase<Derived>::reverse()
|
||||
*
|
||||
* \sa VectorwiseOp::reverseInPlace(), reverse() */
|
||||
template<typename Derived>
|
||||
EIGEN_DEVICE_FUNC inline void DenseBase<Derived>::reverseInPlace()
|
||||
inline void DenseBase<Derived>::reverseInPlace()
|
||||
{
|
||||
if(cols()>rows())
|
||||
{
|
||||
@@ -171,10 +171,8 @@ struct vectorwise_reverse_inplace_impl<Vertical>
|
||||
template<typename ExpressionType>
|
||||
static void run(ExpressionType &xpr)
|
||||
{
|
||||
const int HalfAtCompileTime = ExpressionType::RowsAtCompileTime==Dynamic?Dynamic:ExpressionType::RowsAtCompileTime/2;
|
||||
Index half = xpr.rows()/2;
|
||||
xpr.topRows(fix<HalfAtCompileTime>(half))
|
||||
.swap(xpr.bottomRows(fix<HalfAtCompileTime>(half)).colwise().reverse());
|
||||
xpr.topRows(half).swap(xpr.bottomRows(half).colwise().reverse());
|
||||
}
|
||||
};
|
||||
|
||||
@@ -184,10 +182,8 @@ struct vectorwise_reverse_inplace_impl<Horizontal>
|
||||
template<typename ExpressionType>
|
||||
static void run(ExpressionType &xpr)
|
||||
{
|
||||
const int HalfAtCompileTime = ExpressionType::ColsAtCompileTime==Dynamic?Dynamic:ExpressionType::ColsAtCompileTime/2;
|
||||
Index half = xpr.cols()/2;
|
||||
xpr.leftCols(fix<HalfAtCompileTime>(half))
|
||||
.swap(xpr.rightCols(fix<HalfAtCompileTime>(half)).rowwise().reverse());
|
||||
xpr.leftCols(half).swap(xpr.rightCols(half).rowwise().reverse());
|
||||
}
|
||||
};
|
||||
|
||||
@@ -205,9 +201,9 @@ struct vectorwise_reverse_inplace_impl<Horizontal>
|
||||
*
|
||||
* \sa DenseBase::reverseInPlace(), reverse() */
|
||||
template<typename ExpressionType, int Direction>
|
||||
EIGEN_DEVICE_FUNC void VectorwiseOp<ExpressionType,Direction>::reverseInPlace()
|
||||
void VectorwiseOp<ExpressionType,Direction>::reverseInPlace()
|
||||
{
|
||||
internal::vectorwise_reverse_inplace_impl<Direction>::run(m_matrix);
|
||||
internal::vectorwise_reverse_inplace_impl<Direction>::run(_expression().const_cast_derived());
|
||||
}
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
@@ -61,7 +61,6 @@ template<typename _MatrixType, unsigned int UpLo> class SelfAdjointView
|
||||
typedef typename internal::traits<SelfAdjointView>::Scalar Scalar;
|
||||
typedef typename MatrixType::StorageIndex StorageIndex;
|
||||
typedef typename internal::remove_all<typename MatrixType::ConjugateReturnType>::type MatrixConjugateReturnType;
|
||||
typedef SelfAdjointView<typename internal::add_const<MatrixType>::type, UpLo> ConstSelfAdjointView;
|
||||
|
||||
enum {
|
||||
Mode = internal::traits<SelfAdjointView>::Mode,
|
||||
@@ -198,18 +197,6 @@ template<typename _MatrixType, unsigned int UpLo> class SelfAdjointView
|
||||
inline const ConjugateReturnType conjugate() const
|
||||
{ return ConjugateReturnType(m_matrix.conjugate()); }
|
||||
|
||||
/** \returns an expression of the complex conjugate of \c *this if Cond==true,
|
||||
* returns \c *this otherwise.
|
||||
*/
|
||||
template<bool Cond>
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline typename internal::conditional<Cond,ConjugateReturnType,ConstSelfAdjointView>::type
|
||||
conjugateIf() const
|
||||
{
|
||||
typedef typename internal::conditional<Cond,ConjugateReturnType,ConstSelfAdjointView>::type ReturnType;
|
||||
return ReturnType(m_matrix.template conjugateIf<Cond>());
|
||||
}
|
||||
|
||||
typedef SelfAdjointView<const typename MatrixType::AdjointReturnType,TransposeMode> AdjointReturnType;
|
||||
/** \sa MatrixBase::adjoint() const */
|
||||
EIGEN_DEVICE_FUNC
|
||||
@@ -337,7 +324,7 @@ public:
|
||||
/** This is the const version of MatrixBase::selfadjointView() */
|
||||
template<typename Derived>
|
||||
template<unsigned int UpLo>
|
||||
EIGEN_DEVICE_FUNC typename MatrixBase<Derived>::template ConstSelfAdjointViewReturnType<UpLo>::Type
|
||||
typename MatrixBase<Derived>::template ConstSelfAdjointViewReturnType<UpLo>::Type
|
||||
MatrixBase<Derived>::selfadjointView() const
|
||||
{
|
||||
return typename ConstSelfAdjointViewReturnType<UpLo>::Type(derived());
|
||||
@@ -354,7 +341,7 @@ MatrixBase<Derived>::selfadjointView() const
|
||||
*/
|
||||
template<typename Derived>
|
||||
template<unsigned int UpLo>
|
||||
EIGEN_DEVICE_FUNC typename MatrixBase<Derived>::template SelfAdjointViewReturnType<UpLo>::Type
|
||||
typename MatrixBase<Derived>::template SelfAdjointViewReturnType<UpLo>::Type
|
||||
MatrixBase<Derived>::selfadjointView()
|
||||
{
|
||||
return typename SelfAdjointViewReturnType<UpLo>::Type(derived());
|
||||
|
||||
@@ -19,7 +19,7 @@ template<typename Decomposition, typename RhsType, typename StorageKind> class S
|
||||
*
|
||||
* \brief Pseudo expression representing a solving operation
|
||||
*
|
||||
* \tparam Decomposition the type of the matrix or decomposition object
|
||||
* \tparam Decomposition the type of the matrix or decomposion object
|
||||
* \tparam Rhstype the type of the right-hand side
|
||||
*
|
||||
* This class represents an expression of A.solve(B)
|
||||
@@ -181,7 +181,7 @@ struct Assignment<DstXprType, Solve<CwiseUnaryOp<internal::scalar_conjugate_op<t
|
||||
}
|
||||
};
|
||||
|
||||
} // end namespace internal
|
||||
} // end namepsace internal
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
|
||||
@@ -19,7 +19,7 @@ namespace internal {
|
||||
template<typename LhsScalar, typename RhsScalar, typename Index, int Side, int Mode, bool Conjugate, int StorageOrder>
|
||||
struct triangular_solve_vector;
|
||||
|
||||
template <typename Scalar, typename Index, int Side, int Mode, bool Conjugate, int TriStorageOrder, int OtherStorageOrder, int OtherInnerStride>
|
||||
template <typename Scalar, typename Index, int Side, int Mode, bool Conjugate, int TriStorageOrder, int OtherStorageOrder>
|
||||
struct triangular_solve_matrix;
|
||||
|
||||
// small helper struct extracting some traits on the underlying solver operation
|
||||
@@ -98,8 +98,8 @@ struct triangular_solver_selector<Lhs,Rhs,Side,Mode,NoUnrolling,Dynamic>
|
||||
BlockingType blocking(rhs.rows(), rhs.cols(), size, 1, false);
|
||||
|
||||
triangular_solve_matrix<Scalar,Index,Side,Mode,LhsProductTraits::NeedToConjugate,(int(Lhs::Flags) & RowMajorBit) ? RowMajor : ColMajor,
|
||||
(Rhs::Flags&RowMajorBit) ? RowMajor : ColMajor, Rhs::InnerStrideAtCompileTime>
|
||||
::run(size, othersize, &actualLhs.coeffRef(0,0), actualLhs.outerStride(), &rhs.coeffRef(0,0), rhs.innerStride(), rhs.outerStride(), blocking);
|
||||
(Rhs::Flags&RowMajorBit) ? RowMajor : ColMajor>
|
||||
::run(size, othersize, &actualLhs.coeffRef(0,0), actualLhs.outerStride(), &rhs.coeffRef(0,0), rhs.outerStride(), blocking);
|
||||
}
|
||||
};
|
||||
|
||||
@@ -164,7 +164,7 @@ struct triangular_solver_selector<Lhs,Rhs,OnTheRight,Mode,CompleteUnrolling,1> {
|
||||
#ifndef EIGEN_PARSED_BY_DOXYGEN
|
||||
template<typename MatrixType, unsigned int Mode>
|
||||
template<int Side, typename OtherDerived>
|
||||
EIGEN_DEVICE_FUNC void TriangularViewImpl<MatrixType,Mode,Dense>::solveInPlace(const MatrixBase<OtherDerived>& _other) const
|
||||
void TriangularViewImpl<MatrixType,Mode,Dense>::solveInPlace(const MatrixBase<OtherDerived>& _other) const
|
||||
{
|
||||
OtherDerived& other = _other.const_cast_derived();
|
||||
eigen_assert( derived().cols() == derived().rows() && ((Side==OnTheLeft && derived().cols() == other.rows()) || (Side==OnTheRight && derived().cols() == other.cols())) );
|
||||
|
||||
@@ -14,35 +14,8 @@ namespace Eigen {
|
||||
|
||||
namespace internal {
|
||||
|
||||
template<typename Derived>
|
||||
struct solve_assertion {
|
||||
template<bool Transpose_, typename Rhs>
|
||||
static void run(const Derived& solver, const Rhs& b) { solver.template _check_solve_assertion<Transpose_>(b); }
|
||||
};
|
||||
|
||||
template<typename Derived>
|
||||
struct solve_assertion<Transpose<Derived> >
|
||||
{
|
||||
typedef Transpose<Derived> type;
|
||||
|
||||
template<bool Transpose_, typename Rhs>
|
||||
static void run(const type& transpose, const Rhs& b)
|
||||
{
|
||||
internal::solve_assertion<typename internal::remove_all<Derived>::type>::template run<true>(transpose.nestedExpression(), b);
|
||||
}
|
||||
};
|
||||
|
||||
template<typename Scalar, typename Derived>
|
||||
struct solve_assertion<CwiseUnaryOp<Eigen::internal::scalar_conjugate_op<Scalar>, const Transpose<Derived> > >
|
||||
{
|
||||
typedef CwiseUnaryOp<Eigen::internal::scalar_conjugate_op<Scalar>, const Transpose<Derived> > type;
|
||||
|
||||
template<bool Transpose_, typename Rhs>
|
||||
static void run(const type& adjoint, const Rhs& b)
|
||||
{
|
||||
internal::solve_assertion<typename internal::remove_all<Transpose<Derived> >::type>::template run<true>(adjoint.nestedExpression(), b);
|
||||
}
|
||||
};
|
||||
} // end namespace internal
|
||||
|
||||
/** \class SolverBase
|
||||
@@ -62,7 +35,7 @@ struct solve_assertion<CwiseUnaryOp<Eigen::internal::scalar_conjugate_op<Scalar>
|
||||
*
|
||||
* \warning Currently, any other usage of transpose() and adjoint() are not supported and will produce compilation errors.
|
||||
*
|
||||
* \sa class PartialPivLU, class FullPivLU, class HouseholderQR, class ColPivHouseholderQR, class FullPivHouseholderQR, class CompleteOrthogonalDecomposition, class LLT, class LDLT, class SVDBase
|
||||
* \sa class PartialPivLU, class FullPivLU
|
||||
*/
|
||||
template<typename Derived>
|
||||
class SolverBase : public EigenBase<Derived>
|
||||
@@ -73,9 +46,6 @@ class SolverBase : public EigenBase<Derived>
|
||||
typedef typename internal::traits<Derived>::Scalar Scalar;
|
||||
typedef Scalar CoeffReturnType;
|
||||
|
||||
template<typename Derived_>
|
||||
friend struct internal::solve_assertion;
|
||||
|
||||
enum {
|
||||
RowsAtCompileTime = internal::traits<Derived>::RowsAtCompileTime,
|
||||
ColsAtCompileTime = internal::traits<Derived>::ColsAtCompileTime,
|
||||
@@ -86,8 +56,7 @@ class SolverBase : public EigenBase<Derived>
|
||||
MaxSizeAtCompileTime = (internal::size_at_compile_time<internal::traits<Derived>::MaxRowsAtCompileTime,
|
||||
internal::traits<Derived>::MaxColsAtCompileTime>::ret),
|
||||
IsVectorAtCompileTime = internal::traits<Derived>::MaxRowsAtCompileTime == 1
|
||||
|| internal::traits<Derived>::MaxColsAtCompileTime == 1,
|
||||
NumDimensions = int(MaxSizeAtCompileTime) == 1 ? 0 : bool(IsVectorAtCompileTime) ? 1 : 2
|
||||
|| internal::traits<Derived>::MaxColsAtCompileTime == 1
|
||||
};
|
||||
|
||||
/** Default constructor */
|
||||
@@ -105,7 +74,7 @@ class SolverBase : public EigenBase<Derived>
|
||||
inline const Solve<Derived, Rhs>
|
||||
solve(const MatrixBase<Rhs>& b) const
|
||||
{
|
||||
internal::solve_assertion<typename internal::remove_all<Derived>::type>::template run<false>(derived(), b);
|
||||
eigen_assert(derived().rows()==b.rows() && "solve(): invalid number of rows of the right hand side matrix b");
|
||||
return Solve<Derived, Rhs>(derived(), b.derived());
|
||||
}
|
||||
|
||||
@@ -143,13 +112,6 @@ class SolverBase : public EigenBase<Derived>
|
||||
}
|
||||
|
||||
protected:
|
||||
|
||||
template<bool Transpose_, typename Rhs>
|
||||
void _check_solve_assertion(const Rhs& b) const {
|
||||
EIGEN_ONLY_USED_FOR_DEBUG(b);
|
||||
eigen_assert(derived().m_isInitialized && "Solver is not initialized.");
|
||||
eigen_assert((Transpose_?derived().cols():derived().rows())==b.rows() && "SolverBase::solve(): invalid number of rows of the right hand side matrix b");
|
||||
}
|
||||
};
|
||||
|
||||
namespace internal {
|
||||
|
||||
@@ -50,71 +50,6 @@ inline void stable_norm_kernel(const ExpressionType& bl, Scalar& ssq, Scalar& sc
|
||||
ssq += (bl*invScale).squaredNorm();
|
||||
}
|
||||
|
||||
template<typename VectorType, typename RealScalar>
|
||||
void stable_norm_impl_inner_step(const VectorType &vec, RealScalar& ssq, RealScalar& scale, RealScalar& invScale)
|
||||
{
|
||||
typedef typename VectorType::Scalar Scalar;
|
||||
const Index blockSize = 4096;
|
||||
|
||||
typedef typename internal::nested_eval<VectorType,2>::type VectorTypeCopy;
|
||||
typedef typename internal::remove_all<VectorTypeCopy>::type VectorTypeCopyClean;
|
||||
const VectorTypeCopy copy(vec);
|
||||
|
||||
enum {
|
||||
CanAlign = ( (int(VectorTypeCopyClean::Flags)&DirectAccessBit)
|
||||
|| (int(internal::evaluator<VectorTypeCopyClean>::Alignment)>0) // FIXME Alignment)>0 might not be enough
|
||||
) && (blockSize*sizeof(Scalar)*2<EIGEN_STACK_ALLOCATION_LIMIT)
|
||||
&& (EIGEN_MAX_STATIC_ALIGN_BYTES>0) // if we cannot allocate on the stack, then let's not bother about this optimization
|
||||
};
|
||||
typedef typename internal::conditional<CanAlign, Ref<const Matrix<Scalar,Dynamic,1,0,blockSize,1>, internal::evaluator<VectorTypeCopyClean>::Alignment>,
|
||||
typename VectorTypeCopyClean::ConstSegmentReturnType>::type SegmentWrapper;
|
||||
Index n = vec.size();
|
||||
|
||||
Index bi = internal::first_default_aligned(copy);
|
||||
if (bi>0)
|
||||
internal::stable_norm_kernel(copy.head(bi), ssq, scale, invScale);
|
||||
for (; bi<n; bi+=blockSize)
|
||||
internal::stable_norm_kernel(SegmentWrapper(copy.segment(bi,numext::mini(blockSize, n - bi))), ssq, scale, invScale);
|
||||
}
|
||||
|
||||
template<typename VectorType>
|
||||
typename VectorType::RealScalar
|
||||
stable_norm_impl(const VectorType &vec, typename enable_if<VectorType::IsVectorAtCompileTime>::type* = 0 )
|
||||
{
|
||||
using std::sqrt;
|
||||
using std::abs;
|
||||
|
||||
Index n = vec.size();
|
||||
|
||||
if(n==1)
|
||||
return abs(vec.coeff(0));
|
||||
|
||||
typedef typename VectorType::RealScalar RealScalar;
|
||||
RealScalar scale(0);
|
||||
RealScalar invScale(1);
|
||||
RealScalar ssq(0); // sum of squares
|
||||
|
||||
stable_norm_impl_inner_step(vec, ssq, scale, invScale);
|
||||
|
||||
return scale * sqrt(ssq);
|
||||
}
|
||||
|
||||
template<typename MatrixType>
|
||||
typename MatrixType::RealScalar
|
||||
stable_norm_impl(const MatrixType &mat, typename enable_if<!MatrixType::IsVectorAtCompileTime>::type* = 0 )
|
||||
{
|
||||
using std::sqrt;
|
||||
|
||||
typedef typename MatrixType::RealScalar RealScalar;
|
||||
RealScalar scale(0);
|
||||
RealScalar invScale(1);
|
||||
RealScalar ssq(0); // sum of squares
|
||||
|
||||
for(Index j=0; j<mat.outerSize(); ++j)
|
||||
stable_norm_impl_inner_step(mat.innerVector(j), ssq, scale, invScale);
|
||||
return scale * sqrt(ssq);
|
||||
}
|
||||
|
||||
template<typename Derived>
|
||||
inline typename NumTraits<typename traits<Derived>::Scalar>::Real
|
||||
blueNorm_impl(const EigenBase<Derived>& _vec)
|
||||
@@ -139,7 +74,7 @@ blueNorm_impl(const EigenBase<Derived>& _vec)
|
||||
// are used. For any specific computer, each of the assignment
|
||||
// statements can be replaced
|
||||
ibeta = std::numeric_limits<RealScalar>::radix; // base for floating-point numbers
|
||||
it = NumTraits<RealScalar>::digits(); // number of base-beta digits in mantissa
|
||||
it = std::numeric_limits<RealScalar>::digits; // number of base-beta digits in mantissa
|
||||
iemin = std::numeric_limits<RealScalar>::min_exponent; // minimum exponent
|
||||
iemax = std::numeric_limits<RealScalar>::max_exponent; // maximum exponent
|
||||
rbig = (std::numeric_limits<RealScalar>::max)(); // largest floating-point number
|
||||
@@ -163,16 +98,12 @@ blueNorm_impl(const EigenBase<Derived>& _vec)
|
||||
RealScalar asml = RealScalar(0);
|
||||
RealScalar amed = RealScalar(0);
|
||||
RealScalar abig = RealScalar(0);
|
||||
|
||||
for(Index j=0; j<vec.outerSize(); ++j)
|
||||
for(typename Derived::InnerIterator it(vec, 0); it; ++it)
|
||||
{
|
||||
for(typename Derived::InnerIterator it(vec, j); it; ++it)
|
||||
{
|
||||
RealScalar ax = abs(it.value());
|
||||
if(ax > ab2) abig += numext::abs2(ax*s2m);
|
||||
else if(ax < b1) asml += numext::abs2(ax*s1m);
|
||||
else amed += numext::abs2(ax);
|
||||
}
|
||||
RealScalar ax = abs(it.value());
|
||||
if(ax > ab2) abig += numext::abs2(ax*s2m);
|
||||
else if(ax < b1) asml += numext::abs2(ax*s1m);
|
||||
else amed += numext::abs2(ax);
|
||||
}
|
||||
if(amed!=amed)
|
||||
return amed; // we got a NaN
|
||||
@@ -225,7 +156,36 @@ template<typename Derived>
|
||||
inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real
|
||||
MatrixBase<Derived>::stableNorm() const
|
||||
{
|
||||
return internal::stable_norm_impl(derived());
|
||||
using std::sqrt;
|
||||
using std::abs;
|
||||
const Index blockSize = 4096;
|
||||
RealScalar scale(0);
|
||||
RealScalar invScale(1);
|
||||
RealScalar ssq(0); // sum of square
|
||||
|
||||
typedef typename internal::nested_eval<Derived,2>::type DerivedCopy;
|
||||
typedef typename internal::remove_all<DerivedCopy>::type DerivedCopyClean;
|
||||
const DerivedCopy copy(derived());
|
||||
|
||||
enum {
|
||||
CanAlign = ( (int(DerivedCopyClean::Flags)&DirectAccessBit)
|
||||
|| (int(internal::evaluator<DerivedCopyClean>::Alignment)>0) // FIXME Alignment)>0 might not be enough
|
||||
) && (blockSize*sizeof(Scalar)*2<EIGEN_STACK_ALLOCATION_LIMIT)
|
||||
&& (EIGEN_MAX_STATIC_ALIGN_BYTES>0) // if we cannot allocate on the stack, then let's not bother about this optimization
|
||||
};
|
||||
typedef typename internal::conditional<CanAlign, Ref<const Matrix<Scalar,Dynamic,1,0,blockSize,1>, internal::evaluator<DerivedCopyClean>::Alignment>,
|
||||
typename DerivedCopyClean::ConstSegmentReturnType>::type SegmentWrapper;
|
||||
Index n = size();
|
||||
|
||||
if(n==1)
|
||||
return abs(this->coeff(0));
|
||||
|
||||
Index bi = internal::first_default_aligned(copy);
|
||||
if (bi>0)
|
||||
internal::stable_norm_kernel(copy.head(bi), ssq, scale, invScale);
|
||||
for (; bi<n; bi+=blockSize)
|
||||
internal::stable_norm_kernel(SegmentWrapper(copy.segment(bi,numext::mini(blockSize, n - bi))), ssq, scale, invScale);
|
||||
return scale * sqrt(ssq);
|
||||
}
|
||||
|
||||
/** \returns the \em l2 norm of \c *this using the Blue's algorithm.
|
||||
@@ -253,10 +213,7 @@ template<typename Derived>
|
||||
inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real
|
||||
MatrixBase<Derived>::hypotNorm() const
|
||||
{
|
||||
if(size()==1)
|
||||
return numext::abs(coeff(0,0));
|
||||
else
|
||||
return this->cwiseAbs().redux(internal::scalar_hypot_op<RealScalar>());
|
||||
return this->cwiseAbs().redux(internal::scalar_hypot_op<RealScalar>());
|
||||
}
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
@@ -1,331 +0,0 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2018 Gael Guennebaud <gael.guennebaud@inria.fr>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
namespace internal {
|
||||
|
||||
template<typename IteratorType>
|
||||
struct indexed_based_stl_iterator_traits;
|
||||
|
||||
template<typename Derived>
|
||||
class indexed_based_stl_iterator_base
|
||||
{
|
||||
protected:
|
||||
typedef indexed_based_stl_iterator_traits<Derived> traits;
|
||||
typedef typename traits::XprType XprType;
|
||||
typedef indexed_based_stl_iterator_base<typename traits::non_const_iterator> non_const_iterator;
|
||||
typedef indexed_based_stl_iterator_base<typename traits::const_iterator> const_iterator;
|
||||
typedef typename internal::conditional<internal::is_const<XprType>::value,non_const_iterator,const_iterator>::type other_iterator;
|
||||
// NOTE: in C++03 we cannot declare friend classes through typedefs because we need to write friend class:
|
||||
friend class indexed_based_stl_iterator_base<typename traits::const_iterator>;
|
||||
friend class indexed_based_stl_iterator_base<typename traits::non_const_iterator>;
|
||||
public:
|
||||
typedef Index difference_type;
|
||||
typedef std::random_access_iterator_tag iterator_category;
|
||||
|
||||
indexed_based_stl_iterator_base() : mp_xpr(0), m_index(0) {}
|
||||
indexed_based_stl_iterator_base(XprType& xpr, Index index) : mp_xpr(&xpr), m_index(index) {}
|
||||
|
||||
indexed_based_stl_iterator_base(const non_const_iterator& other)
|
||||
: mp_xpr(other.mp_xpr), m_index(other.m_index)
|
||||
{}
|
||||
|
||||
indexed_based_stl_iterator_base& operator=(const non_const_iterator& other)
|
||||
{
|
||||
mp_xpr = other.mp_xpr;
|
||||
m_index = other.m_index;
|
||||
return *this;
|
||||
}
|
||||
|
||||
Derived& operator++() { ++m_index; return derived(); }
|
||||
Derived& operator--() { --m_index; return derived(); }
|
||||
|
||||
Derived operator++(int) { Derived prev(derived()); operator++(); return prev;}
|
||||
Derived operator--(int) { Derived prev(derived()); operator--(); return prev;}
|
||||
|
||||
friend Derived operator+(const indexed_based_stl_iterator_base& a, Index b) { Derived ret(a.derived()); ret += b; return ret; }
|
||||
friend Derived operator-(const indexed_based_stl_iterator_base& a, Index b) { Derived ret(a.derived()); ret -= b; return ret; }
|
||||
friend Derived operator+(Index a, const indexed_based_stl_iterator_base& b) { Derived ret(b.derived()); ret += a; return ret; }
|
||||
friend Derived operator-(Index a, const indexed_based_stl_iterator_base& b) { Derived ret(b.derived()); ret -= a; return ret; }
|
||||
|
||||
Derived& operator+=(Index b) { m_index += b; return derived(); }
|
||||
Derived& operator-=(Index b) { m_index -= b; return derived(); }
|
||||
|
||||
difference_type operator-(const indexed_based_stl_iterator_base& other) const
|
||||
{
|
||||
eigen_assert(mp_xpr == other.mp_xpr);
|
||||
return m_index - other.m_index;
|
||||
}
|
||||
|
||||
difference_type operator-(const other_iterator& other) const
|
||||
{
|
||||
eigen_assert(mp_xpr == other.mp_xpr);
|
||||
return m_index - other.m_index;
|
||||
}
|
||||
|
||||
bool operator==(const indexed_based_stl_iterator_base& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index == other.m_index; }
|
||||
bool operator!=(const indexed_based_stl_iterator_base& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index != other.m_index; }
|
||||
bool operator< (const indexed_based_stl_iterator_base& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index < other.m_index; }
|
||||
bool operator<=(const indexed_based_stl_iterator_base& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index <= other.m_index; }
|
||||
bool operator> (const indexed_based_stl_iterator_base& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index > other.m_index; }
|
||||
bool operator>=(const indexed_based_stl_iterator_base& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index >= other.m_index; }
|
||||
|
||||
bool operator==(const other_iterator& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index == other.m_index; }
|
||||
bool operator!=(const other_iterator& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index != other.m_index; }
|
||||
bool operator< (const other_iterator& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index < other.m_index; }
|
||||
bool operator<=(const other_iterator& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index <= other.m_index; }
|
||||
bool operator> (const other_iterator& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index > other.m_index; }
|
||||
bool operator>=(const other_iterator& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index >= other.m_index; }
|
||||
|
||||
protected:
|
||||
|
||||
Derived& derived() { return static_cast<Derived&>(*this); }
|
||||
const Derived& derived() const { return static_cast<const Derived&>(*this); }
|
||||
|
||||
XprType *mp_xpr;
|
||||
Index m_index;
|
||||
};
|
||||
|
||||
template<typename XprType>
|
||||
class pointer_based_stl_iterator
|
||||
{
|
||||
enum { is_lvalue = internal::is_lvalue<XprType>::value };
|
||||
typedef pointer_based_stl_iterator<typename internal::remove_const<XprType>::type> non_const_iterator;
|
||||
typedef pointer_based_stl_iterator<typename internal::add_const<XprType>::type> const_iterator;
|
||||
typedef typename internal::conditional<internal::is_const<XprType>::value,non_const_iterator,const_iterator>::type other_iterator;
|
||||
// NOTE: in C++03 we cannot declare friend classes through typedefs because we need to write friend class:
|
||||
friend class pointer_based_stl_iterator<typename internal::add_const<XprType>::type>;
|
||||
friend class pointer_based_stl_iterator<typename internal::remove_const<XprType>::type>;
|
||||
public:
|
||||
typedef Index difference_type;
|
||||
typedef typename XprType::Scalar value_type;
|
||||
typedef std::random_access_iterator_tag iterator_category;
|
||||
typedef typename internal::conditional<bool(is_lvalue), value_type*, const value_type*>::type pointer;
|
||||
typedef typename internal::conditional<bool(is_lvalue), value_type&, const value_type&>::type reference;
|
||||
|
||||
|
||||
pointer_based_stl_iterator() : m_ptr(0) {}
|
||||
pointer_based_stl_iterator(XprType& xpr, Index index) : m_incr(xpr.innerStride())
|
||||
{
|
||||
m_ptr = xpr.data() + index * m_incr.value();
|
||||
}
|
||||
|
||||
pointer_based_stl_iterator(const non_const_iterator& other)
|
||||
: m_ptr(other.m_ptr), m_incr(other.m_incr)
|
||||
{}
|
||||
|
||||
pointer_based_stl_iterator& operator=(const non_const_iterator& other)
|
||||
{
|
||||
m_ptr = other.m_ptr;
|
||||
m_incr.setValue(other.m_incr);
|
||||
return *this;
|
||||
}
|
||||
|
||||
reference operator*() const { return *m_ptr; }
|
||||
reference operator[](Index i) const { return *(m_ptr+i*m_incr.value()); }
|
||||
pointer operator->() const { return m_ptr; }
|
||||
|
||||
pointer_based_stl_iterator& operator++() { m_ptr += m_incr.value(); return *this; }
|
||||
pointer_based_stl_iterator& operator--() { m_ptr -= m_incr.value(); return *this; }
|
||||
|
||||
pointer_based_stl_iterator operator++(int) { pointer_based_stl_iterator prev(*this); operator++(); return prev;}
|
||||
pointer_based_stl_iterator operator--(int) { pointer_based_stl_iterator prev(*this); operator--(); return prev;}
|
||||
|
||||
friend pointer_based_stl_iterator operator+(const pointer_based_stl_iterator& a, Index b) { pointer_based_stl_iterator ret(a); ret += b; return ret; }
|
||||
friend pointer_based_stl_iterator operator-(const pointer_based_stl_iterator& a, Index b) { pointer_based_stl_iterator ret(a); ret -= b; return ret; }
|
||||
friend pointer_based_stl_iterator operator+(Index a, const pointer_based_stl_iterator& b) { pointer_based_stl_iterator ret(b); ret += a; return ret; }
|
||||
friend pointer_based_stl_iterator operator-(Index a, const pointer_based_stl_iterator& b) { pointer_based_stl_iterator ret(b); ret -= a; return ret; }
|
||||
|
||||
pointer_based_stl_iterator& operator+=(Index b) { m_ptr += b*m_incr.value(); return *this; }
|
||||
pointer_based_stl_iterator& operator-=(Index b) { m_ptr -= b*m_incr.value(); return *this; }
|
||||
|
||||
difference_type operator-(const pointer_based_stl_iterator& other) const {
|
||||
return (m_ptr - other.m_ptr)/m_incr.value();
|
||||
}
|
||||
|
||||
difference_type operator-(const other_iterator& other) const {
|
||||
return (m_ptr - other.m_ptr)/m_incr.value();
|
||||
}
|
||||
|
||||
bool operator==(const pointer_based_stl_iterator& other) const { return m_ptr == other.m_ptr; }
|
||||
bool operator!=(const pointer_based_stl_iterator& other) const { return m_ptr != other.m_ptr; }
|
||||
bool operator< (const pointer_based_stl_iterator& other) const { return m_ptr < other.m_ptr; }
|
||||
bool operator<=(const pointer_based_stl_iterator& other) const { return m_ptr <= other.m_ptr; }
|
||||
bool operator> (const pointer_based_stl_iterator& other) const { return m_ptr > other.m_ptr; }
|
||||
bool operator>=(const pointer_based_stl_iterator& other) const { return m_ptr >= other.m_ptr; }
|
||||
|
||||
bool operator==(const other_iterator& other) const { return m_ptr == other.m_ptr; }
|
||||
bool operator!=(const other_iterator& other) const { return m_ptr != other.m_ptr; }
|
||||
bool operator< (const other_iterator& other) const { return m_ptr < other.m_ptr; }
|
||||
bool operator<=(const other_iterator& other) const { return m_ptr <= other.m_ptr; }
|
||||
bool operator> (const other_iterator& other) const { return m_ptr > other.m_ptr; }
|
||||
bool operator>=(const other_iterator& other) const { return m_ptr >= other.m_ptr; }
|
||||
|
||||
protected:
|
||||
|
||||
pointer m_ptr;
|
||||
internal::variable_if_dynamic<Index, XprType::InnerStrideAtCompileTime> m_incr;
|
||||
};
|
||||
|
||||
template<typename _XprType>
|
||||
struct indexed_based_stl_iterator_traits<generic_randaccess_stl_iterator<_XprType> >
|
||||
{
|
||||
typedef _XprType XprType;
|
||||
typedef generic_randaccess_stl_iterator<typename internal::remove_const<XprType>::type> non_const_iterator;
|
||||
typedef generic_randaccess_stl_iterator<typename internal::add_const<XprType>::type> const_iterator;
|
||||
};
|
||||
|
||||
template<typename XprType>
|
||||
class generic_randaccess_stl_iterator : public indexed_based_stl_iterator_base<generic_randaccess_stl_iterator<XprType> >
|
||||
{
|
||||
public:
|
||||
typedef typename XprType::Scalar value_type;
|
||||
|
||||
protected:
|
||||
|
||||
enum {
|
||||
has_direct_access = (internal::traits<XprType>::Flags & DirectAccessBit) ? 1 : 0,
|
||||
is_lvalue = internal::is_lvalue<XprType>::value
|
||||
};
|
||||
|
||||
typedef indexed_based_stl_iterator_base<generic_randaccess_stl_iterator> Base;
|
||||
using Base::m_index;
|
||||
using Base::mp_xpr;
|
||||
|
||||
// TODO currently const Transpose/Reshape expressions never returns const references,
|
||||
// so lets return by value too.
|
||||
//typedef typename internal::conditional<bool(has_direct_access), const value_type&, const value_type>::type read_only_ref_t;
|
||||
typedef const value_type read_only_ref_t;
|
||||
|
||||
public:
|
||||
|
||||
typedef typename internal::conditional<bool(is_lvalue), value_type *, const value_type *>::type pointer;
|
||||
typedef typename internal::conditional<bool(is_lvalue), value_type&, read_only_ref_t>::type reference;
|
||||
|
||||
generic_randaccess_stl_iterator() : Base() {}
|
||||
generic_randaccess_stl_iterator(XprType& xpr, Index index) : Base(xpr,index) {}
|
||||
generic_randaccess_stl_iterator(const typename Base::non_const_iterator& other) : Base(other) {}
|
||||
using Base::operator=;
|
||||
|
||||
reference operator*() const { return (*mp_xpr)(m_index); }
|
||||
reference operator[](Index i) const { return (*mp_xpr)(m_index+i); }
|
||||
pointer operator->() const { return &((*mp_xpr)(m_index)); }
|
||||
};
|
||||
|
||||
template<typename _XprType, DirectionType Direction>
|
||||
struct indexed_based_stl_iterator_traits<subvector_stl_iterator<_XprType,Direction> >
|
||||
{
|
||||
typedef _XprType XprType;
|
||||
typedef subvector_stl_iterator<typename internal::remove_const<XprType>::type, Direction> non_const_iterator;
|
||||
typedef subvector_stl_iterator<typename internal::add_const<XprType>::type, Direction> const_iterator;
|
||||
};
|
||||
|
||||
template<typename XprType, DirectionType Direction>
|
||||
class subvector_stl_iterator : public indexed_based_stl_iterator_base<subvector_stl_iterator<XprType,Direction> >
|
||||
{
|
||||
protected:
|
||||
|
||||
enum { is_lvalue = internal::is_lvalue<XprType>::value };
|
||||
|
||||
typedef indexed_based_stl_iterator_base<subvector_stl_iterator> Base;
|
||||
using Base::m_index;
|
||||
using Base::mp_xpr;
|
||||
|
||||
typedef typename internal::conditional<Direction==Vertical,typename XprType::ColXpr,typename XprType::RowXpr>::type SubVectorType;
|
||||
typedef typename internal::conditional<Direction==Vertical,typename XprType::ConstColXpr,typename XprType::ConstRowXpr>::type ConstSubVectorType;
|
||||
|
||||
|
||||
public:
|
||||
typedef typename internal::conditional<bool(is_lvalue), SubVectorType, ConstSubVectorType>::type reference;
|
||||
typedef typename reference::PlainObject value_type;
|
||||
|
||||
private:
|
||||
class subvector_stl_iterator_ptr
|
||||
{
|
||||
public:
|
||||
subvector_stl_iterator_ptr(const reference &subvector) : m_subvector(subvector) {}
|
||||
reference* operator->() { return &m_subvector; }
|
||||
private:
|
||||
reference m_subvector;
|
||||
};
|
||||
public:
|
||||
|
||||
typedef subvector_stl_iterator_ptr pointer;
|
||||
|
||||
subvector_stl_iterator() : Base() {}
|
||||
subvector_stl_iterator(XprType& xpr, Index index) : Base(xpr,index) {}
|
||||
|
||||
reference operator*() const { return (*mp_xpr).template subVector<Direction>(m_index); }
|
||||
reference operator[](Index i) const { return (*mp_xpr).template subVector<Direction>(m_index+i); }
|
||||
pointer operator->() const { return (*mp_xpr).template subVector<Direction>(m_index); }
|
||||
};
|
||||
|
||||
} // namespace internal
|
||||
|
||||
|
||||
/** returns an iterator to the first element of the 1D vector or array
|
||||
* \only_for_vectors
|
||||
* \sa end(), cbegin()
|
||||
*/
|
||||
template<typename Derived>
|
||||
inline typename DenseBase<Derived>::iterator DenseBase<Derived>::begin()
|
||||
{
|
||||
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
|
||||
return iterator(derived(), 0);
|
||||
}
|
||||
|
||||
/** const version of begin() */
|
||||
template<typename Derived>
|
||||
inline typename DenseBase<Derived>::const_iterator DenseBase<Derived>::begin() const
|
||||
{
|
||||
return cbegin();
|
||||
}
|
||||
|
||||
/** returns a read-only const_iterator to the first element of the 1D vector or array
|
||||
* \only_for_vectors
|
||||
* \sa cend(), begin()
|
||||
*/
|
||||
template<typename Derived>
|
||||
inline typename DenseBase<Derived>::const_iterator DenseBase<Derived>::cbegin() const
|
||||
{
|
||||
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
|
||||
return const_iterator(derived(), 0);
|
||||
}
|
||||
|
||||
/** returns an iterator to the element following the last element of the 1D vector or array
|
||||
* \only_for_vectors
|
||||
* \sa begin(), cend()
|
||||
*/
|
||||
template<typename Derived>
|
||||
inline typename DenseBase<Derived>::iterator DenseBase<Derived>::end()
|
||||
{
|
||||
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
|
||||
return iterator(derived(), size());
|
||||
}
|
||||
|
||||
/** const version of end() */
|
||||
template<typename Derived>
|
||||
inline typename DenseBase<Derived>::const_iterator DenseBase<Derived>::end() const
|
||||
{
|
||||
return cend();
|
||||
}
|
||||
|
||||
/** returns a read-only const_iterator to the element following the last element of the 1D vector or array
|
||||
* \only_for_vectors
|
||||
* \sa begin(), cend()
|
||||
*/
|
||||
template<typename Derived>
|
||||
inline typename DenseBase<Derived>::const_iterator DenseBase<Derived>::cend() const
|
||||
{
|
||||
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
|
||||
return const_iterator(derived(), size());
|
||||
}
|
||||
|
||||
} // namespace Eigen
|
||||
@@ -30,13 +30,12 @@ public:
|
||||
typedef typename Base::DstXprType DstXprType;
|
||||
typedef swap_assign_op<Scalar> Functor;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
generic_dense_assignment_kernel(DstEvaluatorTypeT &dst, const SrcEvaluatorTypeT &src, const Functor &func, DstXprType& dstExpr)
|
||||
EIGEN_DEVICE_FUNC generic_dense_assignment_kernel(DstEvaluatorTypeT &dst, const SrcEvaluatorTypeT &src, const Functor &func, DstXprType& dstExpr)
|
||||
: Base(dst, src, func, dstExpr)
|
||||
{}
|
||||
|
||||
template<int StoreMode, int LoadMode, typename PacketType>
|
||||
EIGEN_STRONG_INLINE void assignPacket(Index row, Index col)
|
||||
void assignPacket(Index row, Index col)
|
||||
{
|
||||
PacketType tmp = m_src.template packet<LoadMode,PacketType>(row,col);
|
||||
const_cast<SrcEvaluatorTypeT&>(m_src).template writePacket<LoadMode>(row,col, m_dst.template packet<StoreMode,PacketType>(row,col));
|
||||
@@ -44,7 +43,7 @@ public:
|
||||
}
|
||||
|
||||
template<int StoreMode, int LoadMode, typename PacketType>
|
||||
EIGEN_STRONG_INLINE void assignPacket(Index index)
|
||||
void assignPacket(Index index)
|
||||
{
|
||||
PacketType tmp = m_src.template packet<LoadMode,PacketType>(index);
|
||||
const_cast<SrcEvaluatorTypeT&>(m_src).template writePacket<LoadMode>(index, m_dst.template packet<StoreMode,PacketType>(index));
|
||||
@@ -53,7 +52,7 @@ public:
|
||||
|
||||
// TODO find a simple way not to have to copy/paste this function from generic_dense_assignment_kernel, by simple I mean no CRTP (Gael)
|
||||
template<int StoreMode, int LoadMode, typename PacketType>
|
||||
EIGEN_STRONG_INLINE void assignPacketByOuterInner(Index outer, Index inner)
|
||||
void assignPacketByOuterInner(Index outer, Index inner)
|
||||
{
|
||||
Index row = Base::rowIndexByOuterInner(outer, inner);
|
||||
Index col = Base::colIndexByOuterInner(outer, inner);
|
||||
|
||||
@@ -61,27 +61,24 @@ template<typename MatrixType> class Transpose
|
||||
typedef typename internal::remove_all<MatrixType>::type NestedExpression;
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
explicit EIGEN_STRONG_INLINE Transpose(MatrixType& matrix) : m_matrix(matrix) {}
|
||||
explicit inline Transpose(MatrixType& matrix) : m_matrix(matrix) {}
|
||||
|
||||
EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Transpose)
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
Index rows() const { return m_matrix.cols(); }
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
Index cols() const { return m_matrix.rows(); }
|
||||
EIGEN_DEVICE_FUNC inline Index rows() const { return m_matrix.cols(); }
|
||||
EIGEN_DEVICE_FUNC inline Index cols() const { return m_matrix.rows(); }
|
||||
|
||||
/** \returns the nested expression */
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
EIGEN_DEVICE_FUNC
|
||||
const typename internal::remove_all<MatrixTypeNested>::type&
|
||||
nestedExpression() const { return m_matrix; }
|
||||
|
||||
/** \returns the nested expression */
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
EIGEN_DEVICE_FUNC
|
||||
typename internal::remove_reference<MatrixTypeNested>::type&
|
||||
nestedExpression() { return m_matrix; }
|
||||
|
||||
/** \internal */
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
void resize(Index nrows, Index ncols) {
|
||||
m_matrix.resize(ncols,nrows);
|
||||
}
|
||||
@@ -125,10 +122,8 @@ template<typename MatrixType> class TransposeImpl<MatrixType,Dense>
|
||||
EIGEN_DENSE_PUBLIC_INTERFACE(Transpose<MatrixType>)
|
||||
EIGEN_INHERIT_ASSIGNMENT_OPERATORS(TransposeImpl)
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
Index innerStride() const { return derived().nestedExpression().innerStride(); }
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
Index outerStride() const { return derived().nestedExpression().outerStride(); }
|
||||
EIGEN_DEVICE_FUNC inline Index innerStride() const { return derived().nestedExpression().innerStride(); }
|
||||
EIGEN_DEVICE_FUNC inline Index outerStride() const { return derived().nestedExpression().outerStride(); }
|
||||
|
||||
typedef typename internal::conditional<
|
||||
internal::is_lvalue<MatrixType>::value,
|
||||
@@ -136,20 +131,18 @@ template<typename MatrixType> class TransposeImpl<MatrixType,Dense>
|
||||
const Scalar
|
||||
>::type ScalarWithConstIfNotLvalue;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
ScalarWithConstIfNotLvalue* data() { return derived().nestedExpression().data(); }
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
const Scalar* data() const { return derived().nestedExpression().data(); }
|
||||
EIGEN_DEVICE_FUNC inline ScalarWithConstIfNotLvalue* data() { return derived().nestedExpression().data(); }
|
||||
EIGEN_DEVICE_FUNC inline const Scalar* data() const { return derived().nestedExpression().data(); }
|
||||
|
||||
// FIXME: shall we keep the const version of coeffRef?
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
const Scalar& coeffRef(Index rowId, Index colId) const
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline const Scalar& coeffRef(Index rowId, Index colId) const
|
||||
{
|
||||
return derived().nestedExpression().coeffRef(colId, rowId);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
const Scalar& coeffRef(Index index) const
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline const Scalar& coeffRef(Index index) const
|
||||
{
|
||||
return derived().nestedExpression().coeffRef(index);
|
||||
}
|
||||
@@ -175,8 +168,7 @@ template<typename MatrixType> class TransposeImpl<MatrixType,Dense>
|
||||
*
|
||||
* \sa transposeInPlace(), adjoint() */
|
||||
template<typename Derived>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
Transpose<Derived>
|
||||
inline Transpose<Derived>
|
||||
DenseBase<Derived>::transpose()
|
||||
{
|
||||
return TransposeReturnType(derived());
|
||||
@@ -188,8 +180,7 @@ DenseBase<Derived>::transpose()
|
||||
*
|
||||
* \sa transposeInPlace(), adjoint() */
|
||||
template<typename Derived>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
typename DenseBase<Derived>::ConstTransposeReturnType
|
||||
inline typename DenseBase<Derived>::ConstTransposeReturnType
|
||||
DenseBase<Derived>::transpose() const
|
||||
{
|
||||
return ConstTransposeReturnType(derived());
|
||||
@@ -215,7 +206,7 @@ DenseBase<Derived>::transpose() const
|
||||
*
|
||||
* \sa adjointInPlace(), transpose(), conjugate(), class Transpose, class internal::scalar_conjugate_op */
|
||||
template<typename Derived>
|
||||
EIGEN_DEVICE_FUNC inline const typename MatrixBase<Derived>::AdjointReturnType
|
||||
inline const typename MatrixBase<Derived>::AdjointReturnType
|
||||
MatrixBase<Derived>::adjoint() const
|
||||
{
|
||||
return AdjointReturnType(this->transpose());
|
||||
@@ -237,7 +228,7 @@ struct inplace_transpose_selector;
|
||||
template<typename MatrixType>
|
||||
struct inplace_transpose_selector<MatrixType,true,false> { // square matrix
|
||||
static void run(MatrixType& m) {
|
||||
m.matrix().template triangularView<StrictlyUpper>().swap(m.matrix().transpose().template triangularView<StrictlyUpper>());
|
||||
m.matrix().template triangularView<StrictlyUpper>().swap(m.matrix().transpose());
|
||||
}
|
||||
};
|
||||
|
||||
@@ -262,7 +253,7 @@ template<typename MatrixType,bool MatchPacketSize>
|
||||
struct inplace_transpose_selector<MatrixType,false,MatchPacketSize> { // non square matrix
|
||||
static void run(MatrixType& m) {
|
||||
if (m.rows()==m.cols())
|
||||
m.matrix().template triangularView<StrictlyUpper>().swap(m.matrix().transpose().template triangularView<StrictlyUpper>());
|
||||
m.matrix().template triangularView<StrictlyUpper>().swap(m.matrix().transpose());
|
||||
else
|
||||
m = m.transpose().eval();
|
||||
}
|
||||
@@ -290,7 +281,7 @@ struct inplace_transpose_selector<MatrixType,false,MatchPacketSize> { // non squ
|
||||
*
|
||||
* \sa transpose(), adjoint(), adjointInPlace() */
|
||||
template<typename Derived>
|
||||
EIGEN_DEVICE_FUNC inline void DenseBase<Derived>::transposeInPlace()
|
||||
inline void DenseBase<Derived>::transposeInPlace()
|
||||
{
|
||||
eigen_assert((rows() == cols() || (RowsAtCompileTime == Dynamic && ColsAtCompileTime == Dynamic))
|
||||
&& "transposeInPlace() called on a non-square non-resizable matrix");
|
||||
@@ -321,7 +312,7 @@ EIGEN_DEVICE_FUNC inline void DenseBase<Derived>::transposeInPlace()
|
||||
*
|
||||
* \sa transpose(), adjoint(), transposeInPlace() */
|
||||
template<typename Derived>
|
||||
EIGEN_DEVICE_FUNC inline void MatrixBase<Derived>::adjointInPlace()
|
||||
inline void MatrixBase<Derived>::adjointInPlace()
|
||||
{
|
||||
derived() = adjoint().eval();
|
||||
}
|
||||
@@ -400,8 +391,7 @@ struct checkTransposeAliasing_impl<Derived, OtherDerived, false>
|
||||
template<typename Dst, typename Src>
|
||||
void check_for_aliasing(const Dst &dst, const Src &src)
|
||||
{
|
||||
if((!Dst::IsVectorAtCompileTime) && dst.rows()>1 && dst.cols()>1)
|
||||
internal::checkTransposeAliasing_impl<Dst, Src>::run(dst, src);
|
||||
internal::checkTransposeAliasing_impl<Dst, Src>::run(dst, src);
|
||||
}
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
@@ -33,6 +33,17 @@ class TranspositionsBase
|
||||
indices() = other.indices();
|
||||
return derived();
|
||||
}
|
||||
|
||||
#ifndef EIGEN_PARSED_BY_DOXYGEN
|
||||
/** This is a special case of the templated operator=. Its purpose is to
|
||||
* prevent a default operator= from hiding the templated operator=.
|
||||
*/
|
||||
Derived& operator=(const TranspositionsBase& other)
|
||||
{
|
||||
indices() = other.indices();
|
||||
return derived();
|
||||
}
|
||||
#endif
|
||||
|
||||
/** \returns the number of transpositions */
|
||||
Index size() const { return indices().size(); }
|
||||
@@ -73,7 +84,7 @@ class TranspositionsBase
|
||||
}
|
||||
|
||||
// FIXME: do we want such methods ?
|
||||
// might be useful when the target matrix expression is complex, e.g.:
|
||||
// might be usefull when the target matrix expression is complex, e.g.:
|
||||
// object.matrix().block(..,..,..,..) = trans * object.matrix().block(..,..,..,..);
|
||||
/*
|
||||
template<typename MatrixType>
|
||||
@@ -160,6 +171,12 @@ class Transpositions : public TranspositionsBase<Transpositions<SizeAtCompileTim
|
||||
inline Transpositions(const TranspositionsBase<OtherDerived>& other)
|
||||
: m_indices(other.indices()) {}
|
||||
|
||||
#ifndef EIGEN_PARSED_BY_DOXYGEN
|
||||
/** Standard copy constructor. Defined only to prevent a default copy constructor
|
||||
* from hiding the other templated constructor */
|
||||
inline Transpositions(const Transpositions& other) : m_indices(other.indices()) {}
|
||||
#endif
|
||||
|
||||
/** Generic constructor from expression of the transposition indices. */
|
||||
template<typename Other>
|
||||
explicit inline Transpositions(const MatrixBase<Other>& indices) : m_indices(indices)
|
||||
@@ -172,6 +189,17 @@ class Transpositions : public TranspositionsBase<Transpositions<SizeAtCompileTim
|
||||
return Base::operator=(other);
|
||||
}
|
||||
|
||||
#ifndef EIGEN_PARSED_BY_DOXYGEN
|
||||
/** This is a special case of the templated operator=. Its purpose is to
|
||||
* prevent a default operator= from hiding the templated operator=.
|
||||
*/
|
||||
Transpositions& operator=(const Transpositions& other)
|
||||
{
|
||||
m_indices = other.m_indices;
|
||||
return *this;
|
||||
}
|
||||
#endif
|
||||
|
||||
/** Constructs an uninitialized permutation matrix of given size.
|
||||
*/
|
||||
inline Transpositions(Index size) : m_indices(size)
|
||||
@@ -278,6 +306,17 @@ class TranspositionsWrapper
|
||||
return Base::operator=(other);
|
||||
}
|
||||
|
||||
#ifndef EIGEN_PARSED_BY_DOXYGEN
|
||||
/** This is a special case of the templated operator=. Its purpose is to
|
||||
* prevent a default operator= from hiding the templated operator=.
|
||||
*/
|
||||
TranspositionsWrapper& operator=(const TranspositionsWrapper& other)
|
||||
{
|
||||
m_indices = other.m_indices;
|
||||
return *this;
|
||||
}
|
||||
#endif
|
||||
|
||||
/** const version of indices(). */
|
||||
const IndicesType& indices() const { return m_indices; }
|
||||
|
||||
|
||||
@@ -65,7 +65,6 @@ template<typename Derived> class TriangularBase : public EigenBase<Derived>
|
||||
inline Index innerStride() const { return derived().innerStride(); }
|
||||
|
||||
// dummy resize function
|
||||
EIGEN_DEVICE_FUNC
|
||||
void resize(Index rows, Index cols)
|
||||
{
|
||||
EIGEN_UNUSED_VARIABLE(rows);
|
||||
@@ -198,7 +197,6 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularView
|
||||
typedef typename internal::traits<TriangularView>::MatrixTypeNestedNonRef MatrixTypeNestedNonRef;
|
||||
|
||||
typedef typename internal::remove_all<typename MatrixType::ConjugateReturnType>::type MatrixConjugateReturnType;
|
||||
typedef TriangularView<typename internal::add_const<MatrixType>::type, _Mode> ConstTriangularView;
|
||||
|
||||
public:
|
||||
|
||||
@@ -244,18 +242,6 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularView
|
||||
inline const ConjugateReturnType conjugate() const
|
||||
{ return ConjugateReturnType(m_matrix.conjugate()); }
|
||||
|
||||
/** \returns an expression of the complex conjugate of \c *this if Cond==true,
|
||||
* returns \c *this otherwise.
|
||||
*/
|
||||
template<bool Cond>
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline typename internal::conditional<Cond,ConjugateReturnType,ConstTriangularView>::type
|
||||
conjugateIf() const
|
||||
{
|
||||
typedef typename internal::conditional<Cond,ConjugateReturnType,ConstTriangularView>::type ReturnType;
|
||||
return ReturnType(m_matrix.template conjugateIf<Cond>());
|
||||
}
|
||||
|
||||
typedef TriangularView<const typename MatrixType::AdjointReturnType,TransposeMode> AdjointReturnType;
|
||||
/** \sa MatrixBase::adjoint() const */
|
||||
EIGEN_DEVICE_FUNC
|
||||
@@ -449,14 +435,14 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_Mat
|
||||
TriangularViewType& operator=(const TriangularViewImpl& other)
|
||||
{ return *this = other.derived().nestedExpression(); }
|
||||
|
||||
template<typename OtherDerived>
|
||||
/** \deprecated */
|
||||
EIGEN_DEPRECATED EIGEN_DEVICE_FUNC
|
||||
template<typename OtherDerived>
|
||||
EIGEN_DEVICE_FUNC
|
||||
void lazyAssign(const TriangularBase<OtherDerived>& other);
|
||||
|
||||
template<typename OtherDerived>
|
||||
/** \deprecated */
|
||||
EIGEN_DEPRECATED EIGEN_DEVICE_FUNC
|
||||
template<typename OtherDerived>
|
||||
EIGEN_DEVICE_FUNC
|
||||
void lazyAssign(const MatrixBase<OtherDerived>& other);
|
||||
#endif
|
||||
|
||||
@@ -484,7 +470,7 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_Mat
|
||||
* \a Side==OnTheLeft (the default), or the right-inverse-multiply \a other * inverse(\c *this) if
|
||||
* \a Side==OnTheRight.
|
||||
*
|
||||
* Note that the template parameter \c Side can be omitted, in which case \c Side==OnTheLeft
|
||||
* Note that the template parameter \c Side can be ommitted, in which case \c Side==OnTheLeft
|
||||
*
|
||||
* The matrix \c *this must be triangular and invertible (i.e., all the coefficients of the
|
||||
* diagonal must be non zero). It works as a forward (resp. backward) substitution if \c *this
|
||||
@@ -502,6 +488,7 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_Mat
|
||||
* \sa TriangularView::solveInPlace()
|
||||
*/
|
||||
template<int Side, typename Other>
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline const internal::triangular_solve_retval<Side,TriangularViewType, Other>
|
||||
solve(const MatrixBase<Other>& other) const;
|
||||
|
||||
@@ -510,7 +497,7 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_Mat
|
||||
* \warning The parameter is only marked 'const' to make the C++ compiler accept a temporary expression here.
|
||||
* This function will const_cast it, so constness isn't honored here.
|
||||
*
|
||||
* Note that the template parameter \c Side can be omitted, in which case \c Side==OnTheLeft
|
||||
* Note that the template parameter \c Side can be ommitted, in which case \c Side==OnTheLeft
|
||||
*
|
||||
* See TriangularView:solve() for the details.
|
||||
*/
|
||||
@@ -536,10 +523,10 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_Mat
|
||||
call_assignment(derived(), other.const_cast_derived(), internal::swap_assign_op<Scalar>());
|
||||
}
|
||||
|
||||
/** Shortcut for \code (*this).swap(other.triangularView<(*this)::Mode>()) \endcode */
|
||||
/** \deprecated
|
||||
* Shortcut for \code (*this).swap(other.triangularView<(*this)::Mode>()) \endcode */
|
||||
template<typename OtherDerived>
|
||||
/** \deprecated */
|
||||
EIGEN_DEPRECATED EIGEN_DEVICE_FUNC
|
||||
EIGEN_DEVICE_FUNC
|
||||
void swap(MatrixBase<OtherDerived> const & other)
|
||||
{
|
||||
EIGEN_STATIC_ASSERT_LVALUE(OtherDerived);
|
||||
@@ -567,7 +554,7 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_Mat
|
||||
// FIXME should we keep that possibility
|
||||
template<typename MatrixType, unsigned int Mode>
|
||||
template<typename OtherDerived>
|
||||
EIGEN_DEVICE_FUNC inline TriangularView<MatrixType, Mode>&
|
||||
inline TriangularView<MatrixType, Mode>&
|
||||
TriangularViewImpl<MatrixType, Mode, Dense>::operator=(const MatrixBase<OtherDerived>& other)
|
||||
{
|
||||
internal::call_assignment_no_alias(derived(), other.derived(), internal::assign_op<Scalar,typename OtherDerived::Scalar>());
|
||||
@@ -577,7 +564,7 @@ TriangularViewImpl<MatrixType, Mode, Dense>::operator=(const MatrixBase<OtherDer
|
||||
// FIXME should we keep that possibility
|
||||
template<typename MatrixType, unsigned int Mode>
|
||||
template<typename OtherDerived>
|
||||
EIGEN_DEVICE_FUNC void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(const MatrixBase<OtherDerived>& other)
|
||||
void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(const MatrixBase<OtherDerived>& other)
|
||||
{
|
||||
internal::call_assignment_no_alias(derived(), other.template triangularView<Mode>());
|
||||
}
|
||||
@@ -586,7 +573,7 @@ EIGEN_DEVICE_FUNC void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(c
|
||||
|
||||
template<typename MatrixType, unsigned int Mode>
|
||||
template<typename OtherDerived>
|
||||
EIGEN_DEVICE_FUNC inline TriangularView<MatrixType, Mode>&
|
||||
inline TriangularView<MatrixType, Mode>&
|
||||
TriangularViewImpl<MatrixType, Mode, Dense>::operator=(const TriangularBase<OtherDerived>& other)
|
||||
{
|
||||
eigen_assert(Mode == int(OtherDerived::Mode));
|
||||
@@ -596,7 +583,7 @@ TriangularViewImpl<MatrixType, Mode, Dense>::operator=(const TriangularBase<Othe
|
||||
|
||||
template<typename MatrixType, unsigned int Mode>
|
||||
template<typename OtherDerived>
|
||||
EIGEN_DEVICE_FUNC void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(const TriangularBase<OtherDerived>& other)
|
||||
void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(const TriangularBase<OtherDerived>& other)
|
||||
{
|
||||
eigen_assert(Mode == int(OtherDerived::Mode));
|
||||
internal::call_assignment_no_alias(derived(), other.derived());
|
||||
@@ -611,7 +598,7 @@ EIGEN_DEVICE_FUNC void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(c
|
||||
* If the matrix is triangular, the opposite part is set to zero. */
|
||||
template<typename Derived>
|
||||
template<typename DenseDerived>
|
||||
EIGEN_DEVICE_FUNC void TriangularBase<Derived>::evalTo(MatrixBase<DenseDerived> &other) const
|
||||
void TriangularBase<Derived>::evalTo(MatrixBase<DenseDerived> &other) const
|
||||
{
|
||||
evalToLazy(other.derived());
|
||||
}
|
||||
@@ -637,7 +624,6 @@ EIGEN_DEVICE_FUNC void TriangularBase<Derived>::evalTo(MatrixBase<DenseDerived>
|
||||
*/
|
||||
template<typename Derived>
|
||||
template<unsigned int Mode>
|
||||
EIGEN_DEVICE_FUNC
|
||||
typename MatrixBase<Derived>::template TriangularViewReturnType<Mode>::Type
|
||||
MatrixBase<Derived>::triangularView()
|
||||
{
|
||||
@@ -647,7 +633,6 @@ MatrixBase<Derived>::triangularView()
|
||||
/** This is the const version of MatrixBase::triangularView() */
|
||||
template<typename Derived>
|
||||
template<unsigned int Mode>
|
||||
EIGEN_DEVICE_FUNC
|
||||
typename MatrixBase<Derived>::template ConstTriangularViewReturnType<Mode>::Type
|
||||
MatrixBase<Derived>::triangularView() const
|
||||
{
|
||||
@@ -730,7 +715,6 @@ struct unary_evaluator<TriangularView<MatrixType,Mode>, IndexBased>
|
||||
{
|
||||
typedef TriangularView<MatrixType,Mode> XprType;
|
||||
typedef evaluator<typename internal::remove_all<MatrixType>::type> Base;
|
||||
EIGEN_DEVICE_FUNC
|
||||
unary_evaluator(const XprType &xpr) : Base(xpr.nestedExpression()) {}
|
||||
};
|
||||
|
||||
@@ -946,7 +930,7 @@ struct triangular_assignment_loop<Kernel, Mode, Dynamic, SetOpposite>
|
||||
* If the matrix is triangular, the opposite part is set to zero. */
|
||||
template<typename Derived>
|
||||
template<typename DenseDerived>
|
||||
EIGEN_DEVICE_FUNC void TriangularBase<Derived>::evalToLazy(MatrixBase<DenseDerived> &other) const
|
||||
void TriangularBase<Derived>::evalToLazy(MatrixBase<DenseDerived> &other) const
|
||||
{
|
||||
other.derived().resize(this->rows(), this->cols());
|
||||
internal::call_triangular_assignment_loop<Derived::Mode,(Derived::Mode&SelfAdjoint)==0 /* SetOpposite */>(other.derived(), derived().nestedExpression());
|
||||
|
||||
@@ -35,7 +35,7 @@ struct traits<VectorBlock<VectorType, Size> >
|
||||
* It is the return type of DenseBase::segment(Index,Index) and DenseBase::segment<int>(Index) and
|
||||
* most of the time this is the only way it is used.
|
||||
*
|
||||
* However, if you want to directly manipulate sub-vector expressions,
|
||||
* However, if you want to directly maniputate sub-vector expressions,
|
||||
* for instance if you want to write a function returning such an expression, you
|
||||
* will need to use this class.
|
||||
*
|
||||
@@ -71,8 +71,8 @@ template<typename VectorType, int Size> class VectorBlock
|
||||
|
||||
/** Dynamic-size constructor
|
||||
*/
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
VectorBlock(VectorType& vector, Index start, Index size)
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline VectorBlock(VectorType& vector, Index start, Index size)
|
||||
: Base(vector,
|
||||
IsColVector ? start : 0, IsColVector ? 0 : start,
|
||||
IsColVector ? size : 1, IsColVector ? 1 : size)
|
||||
@@ -82,8 +82,8 @@ template<typename VectorType, int Size> class VectorBlock
|
||||
|
||||
/** Fixed-size constructor
|
||||
*/
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
VectorBlock(VectorType& vector, Index start)
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline VectorBlock(VectorType& vector, Index start)
|
||||
: Base(vector, IsColVector ? start : 0, IsColVector ? 0 : start)
|
||||
{
|
||||
EIGEN_STATIC_ASSERT_VECTOR_ONLY(VectorBlock);
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2008-2019 Gael Guennebaud <gael.guennebaud@inria.fr>
|
||||
// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
|
||||
// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
@@ -81,46 +81,39 @@ class PartialReduxExpr : public internal::dense_xpr_base< PartialReduxExpr<Matri
|
||||
const MemberOp m_functor;
|
||||
};
|
||||
|
||||
template<typename A,typename B> struct partial_redux_dummy_func;
|
||||
|
||||
#define EIGEN_MAKE_PARTIAL_REDUX_FUNCTOR(MEMBER,COST,VECTORIZABLE,BINARYOP) \
|
||||
template <typename ResultType,typename Scalar> \
|
||||
struct member_##MEMBER { \
|
||||
EIGEN_EMPTY_STRUCT_CTOR(member_##MEMBER) \
|
||||
typedef ResultType result_type; \
|
||||
typedef BINARYOP<Scalar,Scalar> BinaryOp; \
|
||||
template<int Size> struct Cost { enum { value = COST }; }; \
|
||||
enum { Vectorizable = VECTORIZABLE }; \
|
||||
template<typename XprType> \
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
|
||||
ResultType operator()(const XprType& mat) const \
|
||||
{ return mat.MEMBER(); } \
|
||||
BinaryOp binaryFunc() const { return BinaryOp(); } \
|
||||
#define EIGEN_MEMBER_FUNCTOR(MEMBER,COST) \
|
||||
template <typename ResultType> \
|
||||
struct member_##MEMBER { \
|
||||
EIGEN_EMPTY_STRUCT_CTOR(member_##MEMBER) \
|
||||
typedef ResultType result_type; \
|
||||
template<typename Scalar, int Size> struct Cost \
|
||||
{ enum { value = COST }; }; \
|
||||
template<typename XprType> \
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
|
||||
ResultType operator()(const XprType& mat) const \
|
||||
{ return mat.MEMBER(); } \
|
||||
}
|
||||
|
||||
#define EIGEN_MEMBER_FUNCTOR(MEMBER,COST) \
|
||||
EIGEN_MAKE_PARTIAL_REDUX_FUNCTOR(MEMBER,COST,0,partial_redux_dummy_func)
|
||||
|
||||
namespace internal {
|
||||
|
||||
EIGEN_MEMBER_FUNCTOR(squaredNorm, Size * NumTraits<Scalar>::MulCost + (Size-1)*NumTraits<Scalar>::AddCost);
|
||||
EIGEN_MEMBER_FUNCTOR(norm, (Size+5) * NumTraits<Scalar>::MulCost + (Size-1)*NumTraits<Scalar>::AddCost);
|
||||
EIGEN_MEMBER_FUNCTOR(stableNorm, (Size+5) * NumTraits<Scalar>::MulCost + (Size-1)*NumTraits<Scalar>::AddCost);
|
||||
EIGEN_MEMBER_FUNCTOR(blueNorm, (Size+5) * NumTraits<Scalar>::MulCost + (Size-1)*NumTraits<Scalar>::AddCost);
|
||||
EIGEN_MEMBER_FUNCTOR(hypotNorm, (Size-1) * functor_traits<scalar_hypot_op<Scalar> >::Cost );
|
||||
EIGEN_MEMBER_FUNCTOR(sum, (Size-1)*NumTraits<Scalar>::AddCost);
|
||||
EIGEN_MEMBER_FUNCTOR(mean, (Size-1)*NumTraits<Scalar>::AddCost + NumTraits<Scalar>::MulCost);
|
||||
EIGEN_MEMBER_FUNCTOR(minCoeff, (Size-1)*NumTraits<Scalar>::AddCost);
|
||||
EIGEN_MEMBER_FUNCTOR(maxCoeff, (Size-1)*NumTraits<Scalar>::AddCost);
|
||||
EIGEN_MEMBER_FUNCTOR(all, (Size-1)*NumTraits<Scalar>::AddCost);
|
||||
EIGEN_MEMBER_FUNCTOR(any, (Size-1)*NumTraits<Scalar>::AddCost);
|
||||
EIGEN_MEMBER_FUNCTOR(count, (Size-1)*NumTraits<Scalar>::AddCost);
|
||||
EIGEN_MEMBER_FUNCTOR(prod, (Size-1)*NumTraits<Scalar>::MulCost);
|
||||
|
||||
EIGEN_MAKE_PARTIAL_REDUX_FUNCTOR(sum, (Size-1)*NumTraits<Scalar>::AddCost, 1, internal::scalar_sum_op);
|
||||
EIGEN_MAKE_PARTIAL_REDUX_FUNCTOR(minCoeff, (Size-1)*NumTraits<Scalar>::AddCost, 1, internal::scalar_min_op);
|
||||
EIGEN_MAKE_PARTIAL_REDUX_FUNCTOR(maxCoeff, (Size-1)*NumTraits<Scalar>::AddCost, 1, internal::scalar_max_op);
|
||||
EIGEN_MAKE_PARTIAL_REDUX_FUNCTOR(prod, (Size-1)*NumTraits<Scalar>::MulCost, 1, internal::scalar_product_op);
|
||||
|
||||
template <int p, typename ResultType,typename Scalar>
|
||||
template <int p, typename ResultType>
|
||||
struct member_lpnorm {
|
||||
typedef ResultType result_type;
|
||||
enum { Vectorizable = 0 };
|
||||
template<int Size> struct Cost
|
||||
template<typename Scalar, int Size> struct Cost
|
||||
{ enum { value = (Size+5) * NumTraits<Scalar>::MulCost + (Size-1)*NumTraits<Scalar>::AddCost }; };
|
||||
EIGEN_DEVICE_FUNC member_lpnorm() {}
|
||||
template<typename XprType>
|
||||
@@ -128,20 +121,17 @@ struct member_lpnorm {
|
||||
{ return mat.template lpNorm<p>(); }
|
||||
};
|
||||
|
||||
template <typename BinaryOpT, typename Scalar>
|
||||
template <typename BinaryOp, typename Scalar>
|
||||
struct member_redux {
|
||||
typedef BinaryOpT BinaryOp;
|
||||
typedef typename result_of<
|
||||
BinaryOp(const Scalar&,const Scalar&)
|
||||
>::type result_type;
|
||||
|
||||
enum { Vectorizable = functor_traits<BinaryOp>::PacketAccess };
|
||||
template<int Size> struct Cost { enum { value = (Size-1) * functor_traits<BinaryOp>::Cost }; };
|
||||
template<typename _Scalar, int Size> struct Cost
|
||||
{ enum { value = (Size-1) * functor_traits<BinaryOp>::Cost }; };
|
||||
EIGEN_DEVICE_FUNC explicit member_redux(const BinaryOp func) : m_functor(func) {}
|
||||
template<typename Derived>
|
||||
EIGEN_DEVICE_FUNC inline result_type operator()(const DenseBase<Derived>& mat) const
|
||||
{ return mat.redux(m_functor); }
|
||||
const BinaryOp& binaryFunc() const { return m_functor; }
|
||||
const BinaryOp m_functor;
|
||||
};
|
||||
}
|
||||
@@ -149,38 +139,18 @@ struct member_redux {
|
||||
/** \class VectorwiseOp
|
||||
* \ingroup Core_Module
|
||||
*
|
||||
* \brief Pseudo expression providing broadcasting and partial reduction operations
|
||||
* \brief Pseudo expression providing partial reduction operations
|
||||
*
|
||||
* \tparam ExpressionType the type of the object on which to do partial reductions
|
||||
* \tparam Direction indicates whether to operate on columns (#Vertical) or rows (#Horizontal)
|
||||
* \tparam Direction indicates the direction of the redux (#Vertical or #Horizontal)
|
||||
*
|
||||
* This class represents a pseudo expression with broadcasting and partial reduction features.
|
||||
* This class represents a pseudo expression with partial reduction features.
|
||||
* It is the return type of DenseBase::colwise() and DenseBase::rowwise()
|
||||
* and most of the time this is the only way it is explicitly used.
|
||||
* and most of the time this is the only way it is used.
|
||||
*
|
||||
* To understand the logic of rowwise/colwise expression, let's consider a generic case `A.colwise().foo()`
|
||||
* where `foo` is any method of `VectorwiseOp`. This expression is equivalent to applying `foo()` to each
|
||||
* column of `A` and then re-assemble the outputs in a matrix expression:
|
||||
* \code [A.col(0).foo(), A.col(1).foo(), ..., A.col(A.cols()-1).foo()] \endcode
|
||||
*
|
||||
* Example: \include MatrixBase_colwise.cpp
|
||||
* Output: \verbinclude MatrixBase_colwise.out
|
||||
*
|
||||
* The begin() and end() methods are obviously exceptions to the previous rule as they
|
||||
* return STL-compatible begin/end iterators to the rows or columns of the nested expression.
|
||||
* Typical use cases include for-range-loop and calls to STL algorithms:
|
||||
*
|
||||
* Example: \include MatrixBase_colwise_iterator_cxx11.cpp
|
||||
* Output: \verbinclude MatrixBase_colwise_iterator_cxx11.out
|
||||
*
|
||||
* For a partial reduction on an empty input, some rules apply.
|
||||
* For the sake of clarity, let's consider a vertical reduction:
|
||||
* - If the number of columns is zero, then a 1x0 row-major vector expression is returned.
|
||||
* - Otherwise, if the number of rows is zero, then
|
||||
* - a row vector of zeros is returned for sum-like reductions (sum, squaredNorm, norm, etc.)
|
||||
* - a row vector of ones is returned for a product reduction (e.g., <code>MatrixXd(n,0).colwise().prod()</code>)
|
||||
* - an assert is triggered for all other reductions (minCoeff,maxCoeff,redux(bin_op))
|
||||
*
|
||||
* \sa DenseBase::colwise(), DenseBase::rowwise(), class PartialReduxExpr
|
||||
*/
|
||||
template<typename ExpressionType, int Direction> class VectorwiseOp
|
||||
@@ -193,11 +163,11 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
|
||||
typedef typename internal::ref_selector<ExpressionType>::non_const_type ExpressionTypeNested;
|
||||
typedef typename internal::remove_all<ExpressionTypeNested>::type ExpressionTypeNestedCleaned;
|
||||
|
||||
template<template<typename OutScalar,typename InputScalar> class Functor,
|
||||
typename ReturnScalar=Scalar> struct ReturnType
|
||||
template<template<typename _Scalar> class Functor,
|
||||
typename Scalar_=Scalar> struct ReturnType
|
||||
{
|
||||
typedef PartialReduxExpr<ExpressionType,
|
||||
Functor<ReturnScalar,Scalar>,
|
||||
Functor<Scalar_>,
|
||||
Direction
|
||||
> Type;
|
||||
};
|
||||
@@ -216,7 +186,24 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
|
||||
};
|
||||
|
||||
protected:
|
||||
|
||||
|
||||
typedef typename internal::conditional<isVertical,
|
||||
typename ExpressionType::ColXpr,
|
||||
typename ExpressionType::RowXpr>::type SubVector;
|
||||
/** \internal
|
||||
* \returns the i-th subvector according to the \c Direction */
|
||||
EIGEN_DEVICE_FUNC
|
||||
SubVector subVector(Index i)
|
||||
{
|
||||
return SubVector(m_matrix.derived(),i);
|
||||
}
|
||||
|
||||
/** \internal
|
||||
* \returns the number of subvectors in the direction \c Direction */
|
||||
EIGEN_DEVICE_FUNC
|
||||
Index subVectors() const
|
||||
{ return isVertical?m_matrix.cols():m_matrix.rows(); }
|
||||
|
||||
template<typename OtherDerived> struct ExtendedType {
|
||||
typedef Replicate<OtherDerived,
|
||||
isVertical ? 1 : ExpressionType::RowsAtCompileTime,
|
||||
@@ -271,81 +258,42 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline const ExpressionType& _expression() const { return m_matrix; }
|
||||
|
||||
#ifdef EIGEN_PARSED_BY_DOXYGEN
|
||||
/** STL-like <a href="https://en.cppreference.com/w/cpp/named_req/RandomAccessIterator">RandomAccessIterator</a>
|
||||
* iterator type over the columns or rows as returned by the begin() and end() methods.
|
||||
*/
|
||||
random_access_iterator_type iterator;
|
||||
/** This is the const version of iterator (aka read-only) */
|
||||
random_access_iterator_type const_iterator;
|
||||
#else
|
||||
typedef internal::subvector_stl_iterator<ExpressionType, DirectionType(Direction)> iterator;
|
||||
typedef internal::subvector_stl_iterator<const ExpressionType, DirectionType(Direction)> const_iterator;
|
||||
#endif
|
||||
|
||||
/** returns an iterator to the first row (rowwise) or column (colwise) of the nested expression.
|
||||
* \sa end(), cbegin()
|
||||
*/
|
||||
iterator begin() { return iterator (m_matrix, 0); }
|
||||
/** const version of begin() */
|
||||
const_iterator begin() const { return const_iterator(m_matrix, 0); }
|
||||
/** const version of begin() */
|
||||
const_iterator cbegin() const { return const_iterator(m_matrix, 0); }
|
||||
|
||||
/** returns an iterator to the row (resp. column) following the last row (resp. column) of the nested expression
|
||||
* \sa begin(), cend()
|
||||
*/
|
||||
iterator end() { return iterator (m_matrix, m_matrix.template subVectors<DirectionType(Direction)>()); }
|
||||
/** const version of end() */
|
||||
const_iterator end() const { return const_iterator(m_matrix, m_matrix.template subVectors<DirectionType(Direction)>()); }
|
||||
/** const version of end() */
|
||||
const_iterator cend() const { return const_iterator(m_matrix, m_matrix.template subVectors<DirectionType(Direction)>()); }
|
||||
|
||||
/** \returns a row or column vector expression of \c *this reduxed by \a func
|
||||
*
|
||||
* The template parameter \a BinaryOp is the type of the functor
|
||||
* of the custom redux operator. Note that func must be an associative operator.
|
||||
*
|
||||
* \warning the size along the reduction direction must be strictly positive,
|
||||
* otherwise an assertion is triggered.
|
||||
*
|
||||
* \sa class VectorwiseOp, DenseBase::colwise(), DenseBase::rowwise()
|
||||
*/
|
||||
template<typename BinaryOp>
|
||||
EIGEN_DEVICE_FUNC
|
||||
const typename ReduxReturnType<BinaryOp>::Type
|
||||
redux(const BinaryOp& func = BinaryOp()) const
|
||||
{
|
||||
eigen_assert(redux_length()>0 && "you are using an empty matrix");
|
||||
return typename ReduxReturnType<BinaryOp>::Type(_expression(), internal::member_redux<BinaryOp,Scalar>(func));
|
||||
}
|
||||
{ return typename ReduxReturnType<BinaryOp>::Type(_expression(), internal::member_redux<BinaryOp,Scalar>(func)); }
|
||||
|
||||
typedef typename ReturnType<internal::member_minCoeff>::Type MinCoeffReturnType;
|
||||
typedef typename ReturnType<internal::member_maxCoeff>::Type MaxCoeffReturnType;
|
||||
typedef PartialReduxExpr<const CwiseUnaryOp<internal::scalar_abs2_op<Scalar>, const ExpressionTypeNestedCleaned>,internal::member_sum<RealScalar,RealScalar>,Direction> SquaredNormReturnType;
|
||||
typedef CwiseUnaryOp<internal::scalar_sqrt_op<RealScalar>, const SquaredNormReturnType> NormReturnType;
|
||||
typedef typename ReturnType<internal::member_squaredNorm,RealScalar>::Type SquaredNormReturnType;
|
||||
typedef typename ReturnType<internal::member_norm,RealScalar>::Type NormReturnType;
|
||||
typedef typename ReturnType<internal::member_blueNorm,RealScalar>::Type BlueNormReturnType;
|
||||
typedef typename ReturnType<internal::member_stableNorm,RealScalar>::Type StableNormReturnType;
|
||||
typedef typename ReturnType<internal::member_hypotNorm,RealScalar>::Type HypotNormReturnType;
|
||||
typedef typename ReturnType<internal::member_sum>::Type SumReturnType;
|
||||
typedef EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(SumReturnType,Scalar,quotient) MeanReturnType;
|
||||
typedef typename ReturnType<internal::member_mean>::Type MeanReturnType;
|
||||
typedef typename ReturnType<internal::member_all>::Type AllReturnType;
|
||||
typedef typename ReturnType<internal::member_any>::Type AnyReturnType;
|
||||
typedef PartialReduxExpr<ExpressionType, internal::member_count<Index,Scalar>, Direction> CountReturnType;
|
||||
typedef PartialReduxExpr<ExpressionType, internal::member_count<Index>, Direction> CountReturnType;
|
||||
typedef typename ReturnType<internal::member_prod>::Type ProdReturnType;
|
||||
typedef Reverse<const ExpressionType, Direction> ConstReverseReturnType;
|
||||
typedef Reverse<ExpressionType, Direction> ReverseReturnType;
|
||||
|
||||
template<int p> struct LpNormReturnType {
|
||||
typedef PartialReduxExpr<ExpressionType, internal::member_lpnorm<p,RealScalar,Scalar>,Direction> Type;
|
||||
typedef PartialReduxExpr<ExpressionType, internal::member_lpnorm<p,RealScalar>,Direction> Type;
|
||||
};
|
||||
|
||||
/** \returns a row (or column) vector expression of the smallest coefficient
|
||||
* of each column (or row) of the referenced expression.
|
||||
*
|
||||
* \warning the size along the reduction direction must be strictly positive,
|
||||
* otherwise an assertion is triggered.
|
||||
*
|
||||
* \warning the result is undefined if \c *this contains NaN.
|
||||
*
|
||||
* Example: \include PartialRedux_minCoeff.cpp
|
||||
@@ -354,17 +302,11 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
|
||||
* \sa DenseBase::minCoeff() */
|
||||
EIGEN_DEVICE_FUNC
|
||||
const MinCoeffReturnType minCoeff() const
|
||||
{
|
||||
eigen_assert(redux_length()>0 && "you are using an empty matrix");
|
||||
return MinCoeffReturnType(_expression());
|
||||
}
|
||||
{ return MinCoeffReturnType(_expression()); }
|
||||
|
||||
/** \returns a row (or column) vector expression of the largest coefficient
|
||||
* of each column (or row) of the referenced expression.
|
||||
*
|
||||
* \warning the size along the reduction direction must be strictly positive,
|
||||
* otherwise an assertion is triggered.
|
||||
*
|
||||
* \warning the result is undefined if \c *this contains NaN.
|
||||
*
|
||||
* Example: \include PartialRedux_maxCoeff.cpp
|
||||
@@ -373,10 +315,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
|
||||
* \sa DenseBase::maxCoeff() */
|
||||
EIGEN_DEVICE_FUNC
|
||||
const MaxCoeffReturnType maxCoeff() const
|
||||
{
|
||||
eigen_assert(redux_length()>0 && "you are using an empty matrix");
|
||||
return MaxCoeffReturnType(_expression());
|
||||
}
|
||||
{ return MaxCoeffReturnType(_expression()); }
|
||||
|
||||
/** \returns a row (or column) vector expression of the squared norm
|
||||
* of each column (or row) of the referenced expression.
|
||||
@@ -388,7 +327,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
|
||||
* \sa DenseBase::squaredNorm() */
|
||||
EIGEN_DEVICE_FUNC
|
||||
const SquaredNormReturnType squaredNorm() const
|
||||
{ return SquaredNormReturnType(m_matrix.cwiseAbs2()); }
|
||||
{ return SquaredNormReturnType(_expression()); }
|
||||
|
||||
/** \returns a row (or column) vector expression of the norm
|
||||
* of each column (or row) of the referenced expression.
|
||||
@@ -400,7 +339,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
|
||||
* \sa DenseBase::norm() */
|
||||
EIGEN_DEVICE_FUNC
|
||||
const NormReturnType norm() const
|
||||
{ return NormReturnType(squaredNorm()); }
|
||||
{ return NormReturnType(_expression()); }
|
||||
|
||||
/** \returns a row (or column) vector expression of the norm
|
||||
* of each column (or row) of the referenced expression.
|
||||
@@ -465,7 +404,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
|
||||
* \sa DenseBase::mean() */
|
||||
EIGEN_DEVICE_FUNC
|
||||
const MeanReturnType mean() const
|
||||
{ return sum() / Scalar(Direction==Vertical?m_matrix.rows():m_matrix.cols()); }
|
||||
{ return MeanReturnType(_expression()); }
|
||||
|
||||
/** \returns a row (or column) vector expression representing
|
||||
* whether \b all coefficients of each respective column (or row) are \c true.
|
||||
@@ -561,7 +500,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
|
||||
EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
|
||||
EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
|
||||
//eigen_assert((m_matrix.isNull()) == (other.isNull())); FIXME
|
||||
return m_matrix = extendedTo(other.derived());
|
||||
return const_cast<ExpressionType&>(m_matrix = extendedTo(other.derived()));
|
||||
}
|
||||
|
||||
/** Adds the vector \a other to each subvector of \c *this */
|
||||
@@ -571,7 +510,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
|
||||
{
|
||||
EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
|
||||
EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
|
||||
return m_matrix += extendedTo(other.derived());
|
||||
return const_cast<ExpressionType&>(m_matrix += extendedTo(other.derived()));
|
||||
}
|
||||
|
||||
/** Substracts the vector \a other to each subvector of \c *this */
|
||||
@@ -581,7 +520,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
|
||||
{
|
||||
EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
|
||||
EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
|
||||
return m_matrix -= extendedTo(other.derived());
|
||||
return const_cast<ExpressionType&>(m_matrix -= extendedTo(other.derived()));
|
||||
}
|
||||
|
||||
/** Multiples each subvector of \c *this by the vector \a other */
|
||||
@@ -593,7 +532,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
|
||||
EIGEN_STATIC_ASSERT_ARRAYXPR(ExpressionType)
|
||||
EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
|
||||
m_matrix *= extendedTo(other.derived());
|
||||
return m_matrix;
|
||||
return const_cast<ExpressionType&>(m_matrix);
|
||||
}
|
||||
|
||||
/** Divides each subvector of \c *this by the vector \a other */
|
||||
@@ -605,7 +544,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
|
||||
EIGEN_STATIC_ASSERT_ARRAYXPR(ExpressionType)
|
||||
EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
|
||||
m_matrix /= extendedTo(other.derived());
|
||||
return m_matrix;
|
||||
return const_cast<ExpressionType&>(m_matrix);
|
||||
}
|
||||
|
||||
/** Returns the expression of the sum of the vector \a other to each subvector of \c *this */
|
||||
@@ -670,7 +609,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
|
||||
EIGEN_DEVICE_FUNC
|
||||
CwiseBinaryOp<internal::scalar_quotient_op<Scalar>,
|
||||
const ExpressionTypeNestedCleaned,
|
||||
const typename OppositeExtendedType<NormReturnType>::Type>
|
||||
const typename OppositeExtendedType<typename ReturnType<internal::member_norm,RealScalar>::Type>::Type>
|
||||
normalized() const { return m_matrix.cwiseQuotient(extendedToOpposite(this->norm())); }
|
||||
|
||||
|
||||
@@ -720,10 +659,6 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
|
||||
const HNormalizedReturnType hnormalized() const;
|
||||
|
||||
protected:
|
||||
Index redux_length() const
|
||||
{
|
||||
return Direction==Vertical ? m_matrix.rows() : m_matrix.cols();
|
||||
}
|
||||
ExpressionTypeNested m_matrix;
|
||||
};
|
||||
|
||||
@@ -735,7 +670,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
|
||||
* \sa rowwise(), class VectorwiseOp, \ref TutorialReductionsVisitorsBroadcasting
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_DEVICE_FUNC inline typename DenseBase<Derived>::ColwiseReturnType
|
||||
inline typename DenseBase<Derived>::ColwiseReturnType
|
||||
DenseBase<Derived>::colwise()
|
||||
{
|
||||
return ColwiseReturnType(derived());
|
||||
@@ -749,7 +684,7 @@ DenseBase<Derived>::colwise()
|
||||
* \sa colwise(), class VectorwiseOp, \ref TutorialReductionsVisitorsBroadcasting
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_DEVICE_FUNC inline typename DenseBase<Derived>::RowwiseReturnType
|
||||
inline typename DenseBase<Derived>::RowwiseReturnType
|
||||
DenseBase<Derived>::rowwise()
|
||||
{
|
||||
return RowwiseReturnType(derived());
|
||||
|
||||
@@ -40,14 +40,6 @@ struct visitor_impl<Visitor, Derived, 1>
|
||||
}
|
||||
};
|
||||
|
||||
// This specialization enables visitors on empty matrices at compile-time
|
||||
template<typename Visitor, typename Derived>
|
||||
struct visitor_impl<Visitor, Derived, 0> {
|
||||
EIGEN_DEVICE_FUNC
|
||||
static inline void run(const Derived &/*mat*/, Visitor& /*visitor*/)
|
||||
{}
|
||||
};
|
||||
|
||||
template<typename Visitor, typename Derived>
|
||||
struct visitor_impl<Visitor, Derived, Dynamic>
|
||||
{
|
||||
@@ -106,8 +98,6 @@ protected:
|
||||
*
|
||||
* \note compared to one or two \em for \em loops, visitors offer automatic
|
||||
* unrolling for small fixed size matrix.
|
||||
*
|
||||
* \note if the matrix is empty, then the visitor is left unchanged.
|
||||
*
|
||||
* \sa minCoeff(Index*,Index*), maxCoeff(Index*,Index*), DenseBase::redux()
|
||||
*/
|
||||
@@ -116,9 +106,6 @@ template<typename Visitor>
|
||||
EIGEN_DEVICE_FUNC
|
||||
void DenseBase<Derived>::visit(Visitor& visitor) const
|
||||
{
|
||||
if(size()==0)
|
||||
return;
|
||||
|
||||
typedef typename internal::visitor_evaluator<Derived> ThisEvaluator;
|
||||
ThisEvaluator thisEval(derived());
|
||||
|
||||
@@ -137,9 +124,6 @@ namespace internal {
|
||||
template <typename Derived>
|
||||
struct coeff_visitor
|
||||
{
|
||||
// default initialization to avoid countless invalid maybe-uninitialized warnings by gcc
|
||||
EIGEN_DEVICE_FUNC
|
||||
coeff_visitor() : row(-1), col(-1), res(0) {}
|
||||
typedef typename Derived::Scalar Scalar;
|
||||
Index row, col;
|
||||
Scalar res;
|
||||
@@ -212,9 +196,6 @@ struct functor_traits<max_coeff_visitor<Scalar> > {
|
||||
|
||||
/** \fn DenseBase<Derived>::minCoeff(IndexType* rowId, IndexType* colId) const
|
||||
* \returns the minimum of all coefficients of *this and puts in *row and *col its location.
|
||||
*
|
||||
* \warning the matrix must be not empty, otherwise an assertion is triggered.
|
||||
*
|
||||
* \warning the result is undefined if \c *this contains NaN.
|
||||
*
|
||||
* \sa DenseBase::minCoeff(Index*), DenseBase::maxCoeff(Index*,Index*), DenseBase::visit(), DenseBase::minCoeff()
|
||||
@@ -225,8 +206,6 @@ EIGEN_DEVICE_FUNC
|
||||
typename internal::traits<Derived>::Scalar
|
||||
DenseBase<Derived>::minCoeff(IndexType* rowId, IndexType* colId) const
|
||||
{
|
||||
eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix");
|
||||
|
||||
internal::min_coeff_visitor<Derived> minVisitor;
|
||||
this->visit(minVisitor);
|
||||
*rowId = minVisitor.row;
|
||||
@@ -235,9 +214,6 @@ DenseBase<Derived>::minCoeff(IndexType* rowId, IndexType* colId) const
|
||||
}
|
||||
|
||||
/** \returns the minimum of all coefficients of *this and puts in *index its location.
|
||||
*
|
||||
* \warning the matrix must be not empty, otherwise an assertion is triggered.
|
||||
*
|
||||
* \warning the result is undefined if \c *this contains NaN.
|
||||
*
|
||||
* \sa DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::maxCoeff(IndexType*,IndexType*), DenseBase::visit(), DenseBase::minCoeff()
|
||||
@@ -248,8 +224,6 @@ EIGEN_DEVICE_FUNC
|
||||
typename internal::traits<Derived>::Scalar
|
||||
DenseBase<Derived>::minCoeff(IndexType* index) const
|
||||
{
|
||||
eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix");
|
||||
|
||||
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
|
||||
internal::min_coeff_visitor<Derived> minVisitor;
|
||||
this->visit(minVisitor);
|
||||
@@ -259,9 +233,6 @@ DenseBase<Derived>::minCoeff(IndexType* index) const
|
||||
|
||||
/** \fn DenseBase<Derived>::maxCoeff(IndexType* rowId, IndexType* colId) const
|
||||
* \returns the maximum of all coefficients of *this and puts in *row and *col its location.
|
||||
*
|
||||
* \warning the matrix must be not empty, otherwise an assertion is triggered.
|
||||
*
|
||||
* \warning the result is undefined if \c *this contains NaN.
|
||||
*
|
||||
* \sa DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::visit(), DenseBase::maxCoeff()
|
||||
@@ -272,8 +243,6 @@ EIGEN_DEVICE_FUNC
|
||||
typename internal::traits<Derived>::Scalar
|
||||
DenseBase<Derived>::maxCoeff(IndexType* rowPtr, IndexType* colPtr) const
|
||||
{
|
||||
eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix");
|
||||
|
||||
internal::max_coeff_visitor<Derived> maxVisitor;
|
||||
this->visit(maxVisitor);
|
||||
*rowPtr = maxVisitor.row;
|
||||
@@ -282,9 +251,6 @@ DenseBase<Derived>::maxCoeff(IndexType* rowPtr, IndexType* colPtr) const
|
||||
}
|
||||
|
||||
/** \returns the maximum of all coefficients of *this and puts in *index its location.
|
||||
*
|
||||
* \warning the matrix must be not empty, otherwise an assertion is triggered.
|
||||
*
|
||||
* \warning the result is undefined if \c *this contains NaN.
|
||||
*
|
||||
* \sa DenseBase::maxCoeff(IndexType*,IndexType*), DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::visitor(), DenseBase::maxCoeff()
|
||||
@@ -295,8 +261,6 @@ EIGEN_DEVICE_FUNC
|
||||
typename internal::traits<Derived>::Scalar
|
||||
DenseBase<Derived>::maxCoeff(IndexType* index) const
|
||||
{
|
||||
eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix");
|
||||
|
||||
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
|
||||
internal::max_coeff_visitor<Derived> maxVisitor;
|
||||
this->visit(maxVisitor);
|
||||
|
||||
@@ -22,7 +22,6 @@ struct Packet4cf
|
||||
__m256 v;
|
||||
};
|
||||
|
||||
#ifndef EIGEN_VECTORIZE_AVX512
|
||||
template<> struct packet_traits<std::complex<float> > : default_packet_traits
|
||||
{
|
||||
typedef Packet4cf type;
|
||||
@@ -45,9 +44,8 @@ template<> struct packet_traits<std::complex<float> > : default_packet_traits
|
||||
HasSetLinear = 0
|
||||
};
|
||||
};
|
||||
#endif
|
||||
|
||||
template<> struct unpacket_traits<Packet4cf> { typedef std::complex<float> type; enum {size=4, alignment=Aligned32, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet2cf half; };
|
||||
template<> struct unpacket_traits<Packet4cf> { typedef std::complex<float> type; enum {size=4, alignment=Aligned32}; typedef Packet2cf half; };
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet4cf padd<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_add_ps(a.v,b.v)); }
|
||||
template<> EIGEN_STRONG_INLINE Packet4cf psub<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_sub_ps(a.v,b.v)); }
|
||||
@@ -69,18 +67,10 @@ template<> EIGEN_STRONG_INLINE Packet4cf pmul<Packet4cf>(const Packet4cf& a, con
|
||||
return Packet4cf(result);
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet4cf pcmp_eq(const Packet4cf& a, const Packet4cf& b) {
|
||||
__m256 eq = _mm256_cmp_ps(a.v, b.v, _CMP_EQ_OQ);
|
||||
return Packet4cf(_mm256_and_ps(eq, _mm256_permute_ps(eq, 0xb1)));
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet4cf ptrue<Packet4cf>(const Packet4cf& a) { return Packet4cf(ptrue(Packet8f(a.v))); }
|
||||
template<> EIGEN_STRONG_INLINE Packet4cf pnot<Packet4cf>(const Packet4cf& a) { return Packet4cf(pnot(Packet8f(a.v))); }
|
||||
template<> EIGEN_STRONG_INLINE Packet4cf pand <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_and_ps(a.v,b.v)); }
|
||||
template<> EIGEN_STRONG_INLINE Packet4cf por <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_or_ps(a.v,b.v)); }
|
||||
template<> EIGEN_STRONG_INLINE Packet4cf pxor <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_xor_ps(a.v,b.v)); }
|
||||
template<> EIGEN_STRONG_INLINE Packet4cf pandnot<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_andnot_ps(b.v,a.v)); }
|
||||
template<> EIGEN_STRONG_INLINE Packet4cf pandnot<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_andnot_ps(a.v,b.v)); }
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet4cf pload <Packet4cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet4cf(pload<Packet8f>(&numext::real_ref(*from))); }
|
||||
template<> EIGEN_STRONG_INLINE Packet4cf ploadu<Packet4cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet4cf(ploadu<Packet8f>(&numext::real_ref(*from))); }
|
||||
@@ -238,7 +228,6 @@ struct Packet2cd
|
||||
__m256d v;
|
||||
};
|
||||
|
||||
#ifndef EIGEN_VECTORIZE_AVX512
|
||||
template<> struct packet_traits<std::complex<double> > : default_packet_traits
|
||||
{
|
||||
typedef Packet2cd type;
|
||||
@@ -261,9 +250,8 @@ template<> struct packet_traits<std::complex<double> > : default_packet_traits
|
||||
HasSetLinear = 0
|
||||
};
|
||||
};
|
||||
#endif
|
||||
|
||||
template<> struct unpacket_traits<Packet2cd> { typedef std::complex<double> type; enum {size=2, alignment=Aligned32, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet1cd half; };
|
||||
template<> struct unpacket_traits<Packet2cd> { typedef std::complex<double> type; enum {size=2, alignment=Aligned32}; typedef Packet1cd half; };
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet2cd padd<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_add_pd(a.v,b.v)); }
|
||||
template<> EIGEN_STRONG_INLINE Packet2cd psub<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_sub_pd(a.v,b.v)); }
|
||||
@@ -284,18 +272,10 @@ template<> EIGEN_STRONG_INLINE Packet2cd pmul<Packet2cd>(const Packet2cd& a, con
|
||||
return Packet2cd(_mm256_addsub_pd(even, odd));
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet2cd pcmp_eq(const Packet2cd& a, const Packet2cd& b) {
|
||||
__m256d eq = _mm256_cmp_pd(a.v, b.v, _CMP_EQ_OQ);
|
||||
return Packet2cd(pand(eq, _mm256_permute_pd(eq, 0x5)));
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet2cd ptrue<Packet2cd>(const Packet2cd& a) { return Packet2cd(ptrue(Packet4d(a.v))); }
|
||||
template<> EIGEN_STRONG_INLINE Packet2cd pnot<Packet2cd>(const Packet2cd& a) { return Packet2cd(pnot(Packet4d(a.v))); }
|
||||
template<> EIGEN_STRONG_INLINE Packet2cd pand <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_and_pd(a.v,b.v)); }
|
||||
template<> EIGEN_STRONG_INLINE Packet2cd por <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_or_pd(a.v,b.v)); }
|
||||
template<> EIGEN_STRONG_INLINE Packet2cd pxor <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_xor_pd(a.v,b.v)); }
|
||||
template<> EIGEN_STRONG_INLINE Packet2cd pandnot<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_andnot_pd(b.v,a.v)); }
|
||||
template<> EIGEN_STRONG_INLINE Packet2cd pandnot<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_andnot_pd(a.v,b.v)); }
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet2cd pload <Packet2cd>(const std::complex<double>* from)
|
||||
{ EIGEN_DEBUG_ALIGNED_LOAD return Packet2cd(pload<Packet4d>((const double*)from)); }
|
||||
|
||||
@@ -10,7 +10,7 @@
|
||||
#ifndef EIGEN_MATH_FUNCTIONS_AVX_H
|
||||
#define EIGEN_MATH_FUNCTIONS_AVX_H
|
||||
|
||||
/* The sin and cos functions of this file are loosely derived from
|
||||
/* The sin, cos, exp, and log functions of this file are loosely derived from
|
||||
* Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
|
||||
*/
|
||||
|
||||
@@ -18,32 +18,187 @@ namespace Eigen {
|
||||
|
||||
namespace internal {
|
||||
|
||||
inline Packet8i pshiftleft(Packet8i v, int n)
|
||||
{
|
||||
#ifdef EIGEN_VECTORIZE_AVX2
|
||||
return _mm256_slli_epi32(v, n);
|
||||
#else
|
||||
__m128i lo = _mm_slli_epi32(_mm256_extractf128_si256(v, 0), n);
|
||||
__m128i hi = _mm_slli_epi32(_mm256_extractf128_si256(v, 1), n);
|
||||
return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
|
||||
#endif
|
||||
}
|
||||
|
||||
inline Packet8f pshiftright(Packet8f v, int n)
|
||||
{
|
||||
#ifdef EIGEN_VECTORIZE_AVX2
|
||||
return _mm256_cvtepi32_ps(_mm256_srli_epi32(_mm256_castps_si256(v), n));
|
||||
#else
|
||||
__m128i lo = _mm_srli_epi32(_mm256_extractf128_si256(_mm256_castps_si256(v), 0), n);
|
||||
__m128i hi = _mm_srli_epi32(_mm256_extractf128_si256(_mm256_castps_si256(v), 1), n);
|
||||
return _mm256_cvtepi32_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1));
|
||||
#endif
|
||||
}
|
||||
|
||||
// Sine function
|
||||
// Computes sin(x) by wrapping x to the interval [-Pi/4,3*Pi/4] and
|
||||
// evaluating interpolants in [-Pi/4,Pi/4] or [Pi/4,3*Pi/4]. The interpolants
|
||||
// are (anti-)symmetric and thus have only odd/even coefficients
|
||||
template <>
|
||||
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
|
||||
psin<Packet8f>(const Packet8f& _x) {
|
||||
return psin_float(_x);
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
|
||||
pcos<Packet8f>(const Packet8f& _x) {
|
||||
return pcos_float(_x);
|
||||
Packet8f x = _x;
|
||||
|
||||
// Some useful values.
|
||||
_EIGEN_DECLARE_CONST_Packet8i(one, 1);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(one, 1.0f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(two, 2.0f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(one_over_four, 0.25f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(one_over_pi, 3.183098861837907e-01f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(neg_pi_first, -3.140625000000000e+00f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(neg_pi_second, -9.670257568359375e-04f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(neg_pi_third, -6.278329571784980e-07f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(four_over_pi, 1.273239544735163e+00f);
|
||||
|
||||
// Map x from [-Pi/4,3*Pi/4] to z in [-1,3] and subtract the shifted period.
|
||||
Packet8f z = pmul(x, p8f_one_over_pi);
|
||||
Packet8f shift = _mm256_floor_ps(padd(z, p8f_one_over_four));
|
||||
x = pmadd(shift, p8f_neg_pi_first, x);
|
||||
x = pmadd(shift, p8f_neg_pi_second, x);
|
||||
x = pmadd(shift, p8f_neg_pi_third, x);
|
||||
z = pmul(x, p8f_four_over_pi);
|
||||
|
||||
// Make a mask for the entries that need flipping, i.e. wherever the shift
|
||||
// is odd.
|
||||
Packet8i shift_ints = _mm256_cvtps_epi32(shift);
|
||||
Packet8i shift_isodd = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(shift_ints), _mm256_castsi256_ps(p8i_one)));
|
||||
Packet8i sign_flip_mask = pshiftleft(shift_isodd, 31);
|
||||
|
||||
// Create a mask for which interpolant to use, i.e. if z > 1, then the mask
|
||||
// is set to ones for that entry.
|
||||
Packet8f ival_mask = _mm256_cmp_ps(z, p8f_one, _CMP_GT_OQ);
|
||||
|
||||
// Evaluate the polynomial for the interval [1,3] in z.
|
||||
_EIGEN_DECLARE_CONST_Packet8f(coeff_right_0, 9.999999724233232e-01f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(coeff_right_2, -3.084242535619928e-01f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(coeff_right_4, 1.584991525700324e-02f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(coeff_right_6, -3.188805084631342e-04f);
|
||||
Packet8f z_minus_two = psub(z, p8f_two);
|
||||
Packet8f z_minus_two2 = pmul(z_minus_two, z_minus_two);
|
||||
Packet8f right = pmadd(p8f_coeff_right_6, z_minus_two2, p8f_coeff_right_4);
|
||||
right = pmadd(right, z_minus_two2, p8f_coeff_right_2);
|
||||
right = pmadd(right, z_minus_two2, p8f_coeff_right_0);
|
||||
|
||||
// Evaluate the polynomial for the interval [-1,1] in z.
|
||||
_EIGEN_DECLARE_CONST_Packet8f(coeff_left_1, 7.853981525427295e-01f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(coeff_left_3, -8.074536727092352e-02f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(coeff_left_5, 2.489871967827018e-03f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(coeff_left_7, -3.587725841214251e-05f);
|
||||
Packet8f z2 = pmul(z, z);
|
||||
Packet8f left = pmadd(p8f_coeff_left_7, z2, p8f_coeff_left_5);
|
||||
left = pmadd(left, z2, p8f_coeff_left_3);
|
||||
left = pmadd(left, z2, p8f_coeff_left_1);
|
||||
left = pmul(left, z);
|
||||
|
||||
// Assemble the results, i.e. select the left and right polynomials.
|
||||
left = _mm256_andnot_ps(ival_mask, left);
|
||||
right = _mm256_and_ps(ival_mask, right);
|
||||
Packet8f res = _mm256_or_ps(left, right);
|
||||
|
||||
// Flip the sign on the odd intervals and return the result.
|
||||
res = _mm256_xor_ps(res, _mm256_castsi256_ps(sign_flip_mask));
|
||||
return res;
|
||||
}
|
||||
|
||||
// Natural logarithm
|
||||
// Computes log(x) as log(2^e * m) = C*e + log(m), where the constant C =log(2)
|
||||
// and m is in the range [sqrt(1/2),sqrt(2)). In this range, the logarithm can
|
||||
// be easily approximated by a polynomial centered on m=1 for stability.
|
||||
// TODO(gonnet): Further reduce the interval allowing for lower-degree
|
||||
// polynomial interpolants -> ... -> profit!
|
||||
template <>
|
||||
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
|
||||
plog<Packet8f>(const Packet8f& _x) {
|
||||
return plog_float(_x);
|
||||
}
|
||||
Packet8f x = _x;
|
||||
_EIGEN_DECLARE_CONST_Packet8f(1, 1.0f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(half, 0.5f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(126f, 126.0f);
|
||||
|
||||
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
|
||||
Packet8f plog1p<Packet8f>(const Packet8f& _x) {
|
||||
return generic_plog1p(_x);
|
||||
}
|
||||
_EIGEN_DECLARE_CONST_Packet8f_FROM_INT(inv_mant_mask, ~0x7f800000);
|
||||
|
||||
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
|
||||
Packet8f pexpm1<Packet8f>(const Packet8f& _x) {
|
||||
return generic_expm1(_x);
|
||||
// The smallest non denormalized float number.
|
||||
_EIGEN_DECLARE_CONST_Packet8f_FROM_INT(min_norm_pos, 0x00800000);
|
||||
_EIGEN_DECLARE_CONST_Packet8f_FROM_INT(minus_inf, 0xff800000);
|
||||
|
||||
// Polynomial coefficients.
|
||||
_EIGEN_DECLARE_CONST_Packet8f(cephes_SQRTHF, 0.707106781186547524f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(cephes_log_p0, 7.0376836292E-2f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(cephes_log_p1, -1.1514610310E-1f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(cephes_log_p2, 1.1676998740E-1f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(cephes_log_p3, -1.2420140846E-1f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(cephes_log_p4, +1.4249322787E-1f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(cephes_log_p5, -1.6668057665E-1f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(cephes_log_p6, +2.0000714765E-1f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(cephes_log_p7, -2.4999993993E-1f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(cephes_log_p8, +3.3333331174E-1f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(cephes_log_q1, -2.12194440e-4f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(cephes_log_q2, 0.693359375f);
|
||||
|
||||
Packet8f invalid_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_NGE_UQ); // not greater equal is true if x is NaN
|
||||
Packet8f iszero_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_EQ_OQ);
|
||||
|
||||
// Truncate input values to the minimum positive normal.
|
||||
x = pmax(x, p8f_min_norm_pos);
|
||||
|
||||
Packet8f emm0 = pshiftright(x,23);
|
||||
Packet8f e = _mm256_sub_ps(emm0, p8f_126f);
|
||||
|
||||
// Set the exponents to -1, i.e. x are in the range [0.5,1).
|
||||
x = _mm256_and_ps(x, p8f_inv_mant_mask);
|
||||
x = _mm256_or_ps(x, p8f_half);
|
||||
|
||||
// part2: Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2))
|
||||
// and shift by -1. The values are then centered around 0, which improves
|
||||
// the stability of the polynomial evaluation.
|
||||
// if( x < SQRTHF ) {
|
||||
// e -= 1;
|
||||
// x = x + x - 1.0;
|
||||
// } else { x = x - 1.0; }
|
||||
Packet8f mask = _mm256_cmp_ps(x, p8f_cephes_SQRTHF, _CMP_LT_OQ);
|
||||
Packet8f tmp = _mm256_and_ps(x, mask);
|
||||
x = psub(x, p8f_1);
|
||||
e = psub(e, _mm256_and_ps(p8f_1, mask));
|
||||
x = padd(x, tmp);
|
||||
|
||||
Packet8f x2 = pmul(x, x);
|
||||
Packet8f x3 = pmul(x2, x);
|
||||
|
||||
// Evaluate the polynomial approximant of degree 8 in three parts, probably
|
||||
// to improve instruction-level parallelism.
|
||||
Packet8f y, y1, y2;
|
||||
y = pmadd(p8f_cephes_log_p0, x, p8f_cephes_log_p1);
|
||||
y1 = pmadd(p8f_cephes_log_p3, x, p8f_cephes_log_p4);
|
||||
y2 = pmadd(p8f_cephes_log_p6, x, p8f_cephes_log_p7);
|
||||
y = pmadd(y, x, p8f_cephes_log_p2);
|
||||
y1 = pmadd(y1, x, p8f_cephes_log_p5);
|
||||
y2 = pmadd(y2, x, p8f_cephes_log_p8);
|
||||
y = pmadd(y, x3, y1);
|
||||
y = pmadd(y, x3, y2);
|
||||
y = pmul(y, x3);
|
||||
|
||||
// Add the logarithm of the exponent back to the result of the interpolation.
|
||||
y1 = pmul(e, p8f_cephes_log_q1);
|
||||
tmp = pmul(x2, p8f_half);
|
||||
y = padd(y, y1);
|
||||
x = psub(x, tmp);
|
||||
y2 = pmul(e, p8f_cephes_log_q2);
|
||||
x = padd(x, y);
|
||||
x = padd(x, y2);
|
||||
|
||||
// Filter out invalid inputs, i.e. negative arg will be NAN, 0 will be -INF.
|
||||
return _mm256_or_ps(
|
||||
_mm256_andnot_ps(iszero_mask, _mm256_or_ps(x, invalid_mask)),
|
||||
_mm256_and_ps(iszero_mask, p8f_minus_inf));
|
||||
}
|
||||
|
||||
// Exponential function. Works by writing "x = m*log(2) + r" where
|
||||
@@ -52,7 +207,62 @@ Packet8f pexpm1<Packet8f>(const Packet8f& _x) {
|
||||
template <>
|
||||
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
|
||||
pexp<Packet8f>(const Packet8f& _x) {
|
||||
return pexp_float(_x);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(1, 1.0f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(half, 0.5f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(127, 127.0f);
|
||||
|
||||
_EIGEN_DECLARE_CONST_Packet8f(exp_hi, 88.3762626647950f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(exp_lo, -88.3762626647949f);
|
||||
|
||||
_EIGEN_DECLARE_CONST_Packet8f(cephes_LOG2EF, 1.44269504088896341f);
|
||||
|
||||
_EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p0, 1.9875691500E-4f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p1, 1.3981999507E-3f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p2, 8.3334519073E-3f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p3, 4.1665795894E-2f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p4, 1.6666665459E-1f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p5, 5.0000001201E-1f);
|
||||
|
||||
// Clamp x.
|
||||
Packet8f x = pmax(pmin(_x, p8f_exp_hi), p8f_exp_lo);
|
||||
|
||||
// Express exp(x) as exp(m*ln(2) + r), start by extracting
|
||||
// m = floor(x/ln(2) + 0.5).
|
||||
Packet8f m = _mm256_floor_ps(pmadd(x, p8f_cephes_LOG2EF, p8f_half));
|
||||
|
||||
// Get r = x - m*ln(2). If no FMA instructions are available, m*ln(2) is
|
||||
// subtracted out in two parts, m*C1+m*C2 = m*ln(2), to avoid accumulating
|
||||
// truncation errors. Note that we don't use the "pmadd" function here to
|
||||
// ensure that a precision-preserving FMA instruction is used.
|
||||
#ifdef EIGEN_VECTORIZE_FMA
|
||||
_EIGEN_DECLARE_CONST_Packet8f(nln2, -0.6931471805599453f);
|
||||
Packet8f r = _mm256_fmadd_ps(m, p8f_nln2, x);
|
||||
#else
|
||||
_EIGEN_DECLARE_CONST_Packet8f(cephes_exp_C1, 0.693359375f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(cephes_exp_C2, -2.12194440e-4f);
|
||||
Packet8f r = psub(x, pmul(m, p8f_cephes_exp_C1));
|
||||
r = psub(r, pmul(m, p8f_cephes_exp_C2));
|
||||
#endif
|
||||
|
||||
Packet8f r2 = pmul(r, r);
|
||||
|
||||
// TODO(gonnet): Split into odd/even polynomials and try to exploit
|
||||
// instruction-level parallelism.
|
||||
Packet8f y = p8f_cephes_exp_p0;
|
||||
y = pmadd(y, r, p8f_cephes_exp_p1);
|
||||
y = pmadd(y, r, p8f_cephes_exp_p2);
|
||||
y = pmadd(y, r, p8f_cephes_exp_p3);
|
||||
y = pmadd(y, r, p8f_cephes_exp_p4);
|
||||
y = pmadd(y, r, p8f_cephes_exp_p5);
|
||||
y = pmadd(y, r2, r);
|
||||
y = padd(y, p8f_1);
|
||||
|
||||
// Build emm0 = 2^m.
|
||||
Packet8i emm0 = _mm256_cvttps_epi32(padd(m, p8f_127));
|
||||
emm0 = pshiftleft(emm0, 23);
|
||||
|
||||
// Return 2^m * exp(r).
|
||||
return pmax(pmul(y, _mm256_castsi256_ps(emm0)), _x);
|
||||
}
|
||||
|
||||
// Hyperbolic Tangent function.
|
||||
@@ -62,11 +272,84 @@ ptanh<Packet8f>(const Packet8f& x) {
|
||||
return internal::generic_fast_tanh_float(x);
|
||||
}
|
||||
|
||||
// Exponential function for doubles.
|
||||
template <>
|
||||
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4d
|
||||
pexp<Packet4d>(const Packet4d& x) {
|
||||
return pexp_double(x);
|
||||
pexp<Packet4d>(const Packet4d& _x) {
|
||||
Packet4d x = _x;
|
||||
|
||||
_EIGEN_DECLARE_CONST_Packet4d(1, 1.0);
|
||||
_EIGEN_DECLARE_CONST_Packet4d(2, 2.0);
|
||||
_EIGEN_DECLARE_CONST_Packet4d(half, 0.5);
|
||||
|
||||
_EIGEN_DECLARE_CONST_Packet4d(exp_hi, 709.437);
|
||||
_EIGEN_DECLARE_CONST_Packet4d(exp_lo, -709.436139303);
|
||||
|
||||
_EIGEN_DECLARE_CONST_Packet4d(cephes_LOG2EF, 1.4426950408889634073599);
|
||||
|
||||
_EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p0, 1.26177193074810590878e-4);
|
||||
_EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p1, 3.02994407707441961300e-2);
|
||||
_EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p2, 9.99999999999999999910e-1);
|
||||
|
||||
_EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q0, 3.00198505138664455042e-6);
|
||||
_EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q1, 2.52448340349684104192e-3);
|
||||
_EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q2, 2.27265548208155028766e-1);
|
||||
_EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q3, 2.00000000000000000009e0);
|
||||
|
||||
_EIGEN_DECLARE_CONST_Packet4d(cephes_exp_C1, 0.693145751953125);
|
||||
_EIGEN_DECLARE_CONST_Packet4d(cephes_exp_C2, 1.42860682030941723212e-6);
|
||||
_EIGEN_DECLARE_CONST_Packet4i(1023, 1023);
|
||||
|
||||
Packet4d tmp, fx;
|
||||
|
||||
// clamp x
|
||||
x = pmax(pmin(x, p4d_exp_hi), p4d_exp_lo);
|
||||
// Express exp(x) as exp(g + n*log(2)).
|
||||
fx = pmadd(p4d_cephes_LOG2EF, x, p4d_half);
|
||||
|
||||
// Get the integer modulus of log(2), i.e. the "n" described above.
|
||||
fx = _mm256_floor_pd(fx);
|
||||
|
||||
// Get the remainder modulo log(2), i.e. the "g" described above. Subtract
|
||||
// n*log(2) out in two steps, i.e. n*C1 + n*C2, C1+C2=log2 to get the last
|
||||
// digits right.
|
||||
tmp = pmul(fx, p4d_cephes_exp_C1);
|
||||
Packet4d z = pmul(fx, p4d_cephes_exp_C2);
|
||||
x = psub(x, tmp);
|
||||
x = psub(x, z);
|
||||
|
||||
Packet4d x2 = pmul(x, x);
|
||||
|
||||
// Evaluate the numerator polynomial of the rational interpolant.
|
||||
Packet4d px = p4d_cephes_exp_p0;
|
||||
px = pmadd(px, x2, p4d_cephes_exp_p1);
|
||||
px = pmadd(px, x2, p4d_cephes_exp_p2);
|
||||
px = pmul(px, x);
|
||||
|
||||
// Evaluate the denominator polynomial of the rational interpolant.
|
||||
Packet4d qx = p4d_cephes_exp_q0;
|
||||
qx = pmadd(qx, x2, p4d_cephes_exp_q1);
|
||||
qx = pmadd(qx, x2, p4d_cephes_exp_q2);
|
||||
qx = pmadd(qx, x2, p4d_cephes_exp_q3);
|
||||
|
||||
// I don't really get this bit, copied from the SSE2 routines, so...
|
||||
// TODO(gonnet): Figure out what is going on here, perhaps find a better
|
||||
// rational interpolant?
|
||||
x = _mm256_div_pd(px, psub(qx, px));
|
||||
x = pmadd(p4d_2, x, p4d_1);
|
||||
|
||||
// Build e=2^n by constructing the exponents in a 128-bit vector and
|
||||
// shifting them to where they belong in double-precision values.
|
||||
__m128i emm0 = _mm256_cvtpd_epi32(fx);
|
||||
emm0 = _mm_add_epi32(emm0, p4i_1023);
|
||||
emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(3, 1, 2, 0));
|
||||
__m128i lo = _mm_slli_epi64(emm0, 52);
|
||||
__m128i hi = _mm_slli_epi64(_mm_srli_epi64(emm0, 32), 52);
|
||||
__m256i e = _mm256_insertf128_si256(_mm256_setzero_si256(), lo, 0);
|
||||
e = _mm256_insertf128_si256(e, hi, 1);
|
||||
|
||||
// Construct the result 2^n * exp(g) = e * x. The max is used to catch
|
||||
// non-finite values in the input.
|
||||
return pmax(pmul(x, _mm256_castsi256_pd(e)), _x);
|
||||
}
|
||||
|
||||
// Functions for sqrt.
|
||||
@@ -109,6 +392,7 @@ Packet4d psqrt<Packet4d>(const Packet4d& x) {
|
||||
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
|
||||
Packet8f prsqrt<Packet8f>(const Packet8f& _x) {
|
||||
_EIGEN_DECLARE_CONST_Packet8f_FROM_INT(inf, 0x7f800000);
|
||||
_EIGEN_DECLARE_CONST_Packet8f_FROM_INT(nan, 0x7fc00000);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(one_point_five, 1.5f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(minus_half, -0.5f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f_FROM_INT(flt_min, 0x00800000);
|
||||
@@ -117,25 +401,20 @@ Packet8f prsqrt<Packet8f>(const Packet8f& _x) {
|
||||
|
||||
// select only the inverse sqrt of positive normal inputs (denormals are
|
||||
// flushed to zero and cause infs as well).
|
||||
Packet8f lt_min_mask = _mm256_cmp_ps(_x, p8f_flt_min, _CMP_LT_OQ);
|
||||
Packet8f inf_mask = _mm256_cmp_ps(_x, p8f_inf, _CMP_EQ_OQ);
|
||||
Packet8f not_normal_finite_mask = _mm256_or_ps(lt_min_mask, inf_mask);
|
||||
Packet8f le_zero_mask = _mm256_cmp_ps(_x, p8f_flt_min, _CMP_LT_OQ);
|
||||
Packet8f x = _mm256_andnot_ps(le_zero_mask, _mm256_rsqrt_ps(_x));
|
||||
|
||||
// Compute an approximate result using the rsqrt intrinsic.
|
||||
Packet8f y_approx = _mm256_rsqrt_ps(_x);
|
||||
// Fill in NaNs and Infs for the negative/zero entries.
|
||||
Packet8f neg_mask = _mm256_cmp_ps(_x, _mm256_setzero_ps(), _CMP_LT_OQ);
|
||||
Packet8f zero_mask = _mm256_andnot_ps(neg_mask, le_zero_mask);
|
||||
Packet8f infs_and_nans = _mm256_or_ps(_mm256_and_ps(neg_mask, p8f_nan),
|
||||
_mm256_and_ps(zero_mask, p8f_inf));
|
||||
|
||||
// Do a single step of Newton-Raphson iteration to improve the approximation.
|
||||
// This uses the formula y_{n+1} = y_n * (1.5 - y_n * (0.5 * x) * y_n).
|
||||
// It is essential to evaluate the inner term like this because forming
|
||||
// y_n^2 may over- or underflow.
|
||||
Packet8f y_newton = pmul(y_approx, pmadd(y_approx, pmul(neg_half, y_approx), p8f_one_point_five));
|
||||
// Do a single step of Newton's iteration.
|
||||
x = pmul(x, pmadd(neg_half, pmul(x, x), p8f_one_point_five));
|
||||
|
||||
// Select the result of the Newton-Raphson step for positive normal arguments.
|
||||
// For other arguments, choose the output of the intrinsic. This will
|
||||
// return rsqrt(+inf) = 0, rsqrt(x) = NaN if x < 0, and rsqrt(x) = +inf if
|
||||
// x is zero or a positive denormalized float (equivalent to flushing positive
|
||||
// denormalized inputs to zero).
|
||||
return pselect<Packet8f>(not_normal_finite_mask, y_approx, y_newton);
|
||||
// Insert NaNs and Infs in all the right places.
|
||||
return _mm256_or_ps(x, infs_and_nans);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
@@ -18,11 +18,11 @@ namespace internal {
|
||||
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
|
||||
#endif
|
||||
|
||||
#if !defined(EIGEN_VECTORIZE_AVX512) && !defined(EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS)
|
||||
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16
|
||||
#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
|
||||
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*))
|
||||
#endif
|
||||
|
||||
#ifdef EIGEN_VECTORIZE_FMA
|
||||
#ifdef __FMA__
|
||||
#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
|
||||
#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
|
||||
#endif
|
||||
@@ -31,14 +31,10 @@ namespace internal {
|
||||
typedef __m256 Packet8f;
|
||||
typedef __m256i Packet8i;
|
||||
typedef __m256d Packet4d;
|
||||
typedef struct {
|
||||
__m128i x;
|
||||
} Packet8h;
|
||||
|
||||
template<> struct is_arithmetic<__m256> { enum { value = true }; };
|
||||
template<> struct is_arithmetic<__m256i> { enum { value = true }; };
|
||||
template<> struct is_arithmetic<__m256d> { enum { value = true }; };
|
||||
template<> struct is_arithmetic<Packet8h> { enum { value = true }; };
|
||||
|
||||
#define _EIGEN_DECLARE_CONST_Packet8f(NAME,X) \
|
||||
const Packet8f p8f_##NAME = pset1<Packet8f>(X)
|
||||
@@ -62,22 +58,17 @@ template<> struct packet_traits<float> : default_packet_traits
|
||||
enum {
|
||||
Vectorizable = 1,
|
||||
AlignedOnScalar = 1,
|
||||
size = 8,
|
||||
size=8,
|
||||
HasHalfPacket = 1,
|
||||
|
||||
HasDiv = 1,
|
||||
HasSin = EIGEN_FAST_MATH,
|
||||
HasCos = EIGEN_FAST_MATH,
|
||||
HasLog = 1,
|
||||
HasLog1p = 1,
|
||||
HasExpm1 = 1,
|
||||
HasExp = 1,
|
||||
HasNdtri = 1,
|
||||
HasBessel = 1,
|
||||
HasDiv = 1,
|
||||
HasSin = EIGEN_FAST_MATH,
|
||||
HasCos = 0,
|
||||
HasLog = 1,
|
||||
HasExp = 1,
|
||||
HasSqrt = 1,
|
||||
HasRsqrt = 1,
|
||||
HasTanh = EIGEN_FAST_MATH,
|
||||
HasErf = EIGEN_FAST_MATH,
|
||||
HasTanh = EIGEN_FAST_MATH,
|
||||
HasBlend = 1,
|
||||
HasRound = 1,
|
||||
HasFloor = 1,
|
||||
@@ -104,35 +95,6 @@ template<> struct packet_traits<double> : default_packet_traits
|
||||
HasCeil = 1
|
||||
};
|
||||
};
|
||||
|
||||
template <>
|
||||
struct packet_traits<Eigen::half> : default_packet_traits {
|
||||
typedef Packet8h type;
|
||||
// There is no half-size packet for Packet8h.
|
||||
typedef Packet8h half;
|
||||
enum {
|
||||
Vectorizable = 1,
|
||||
AlignedOnScalar = 1,
|
||||
size = 8,
|
||||
HasHalfPacket = 0,
|
||||
HasAdd = 1,
|
||||
HasSub = 1,
|
||||
HasMul = 1,
|
||||
HasDiv = 1,
|
||||
HasNegate = 1,
|
||||
HasAbs = 0,
|
||||
HasAbs2 = 0,
|
||||
HasMin = 0,
|
||||
HasMax = 0,
|
||||
HasConj = 0,
|
||||
HasSetLinear = 0,
|
||||
HasSqrt = 0,
|
||||
HasRsqrt = 0,
|
||||
HasExp = 0,
|
||||
HasLog = 0,
|
||||
HasBlend = 0
|
||||
};
|
||||
};
|
||||
#endif
|
||||
|
||||
template<> struct scalar_div_cost<float,true> { enum { value = 14 }; };
|
||||
@@ -151,30 +113,14 @@ template<> struct packet_traits<int> : default_packet_traits
|
||||
};
|
||||
*/
|
||||
|
||||
template<> struct unpacket_traits<Packet8f> {
|
||||
typedef float type;
|
||||
typedef Packet4f half;
|
||||
typedef Packet8i integer_packet;
|
||||
typedef uint8_t mask_t;
|
||||
enum {size=8, alignment=Aligned32, vectorizable=true, masked_load_available=true, masked_store_available=true};
|
||||
};
|
||||
template<> struct unpacket_traits<Packet4d> {
|
||||
typedef double type;
|
||||
typedef Packet2d half;
|
||||
enum {size=4, alignment=Aligned32, vectorizable=true, masked_load_available=false, masked_store_available=false};
|
||||
};
|
||||
template<> struct unpacket_traits<Packet8i> { typedef int type; typedef Packet4i half; enum {size=8, alignment=Aligned32, vectorizable=false, masked_load_available=false, masked_store_available=false}; };
|
||||
template<> struct unpacket_traits<Packet8f> { typedef float type; typedef Packet4f half; enum {size=8, alignment=Aligned32}; };
|
||||
template<> struct unpacket_traits<Packet4d> { typedef double type; typedef Packet2d half; enum {size=4, alignment=Aligned32}; };
|
||||
template<> struct unpacket_traits<Packet8i> { typedef int type; typedef Packet4i half; enum {size=8, alignment=Aligned32}; };
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8f pset1<Packet8f>(const float& from) { return _mm256_set1_ps(from); }
|
||||
template<> EIGEN_STRONG_INLINE Packet4d pset1<Packet4d>(const double& from) { return _mm256_set1_pd(from); }
|
||||
template<> EIGEN_STRONG_INLINE Packet8i pset1<Packet8i>(const int& from) { return _mm256_set1_epi32(from); }
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8f pset1frombits<Packet8f>(unsigned int from) { return _mm256_castsi256_ps(pset1<Packet8i>(from)); }
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8f pzero(const Packet8f& /*a*/) { return _mm256_setzero_ps(); }
|
||||
template<> EIGEN_STRONG_INLINE Packet4d pzero(const Packet4d& /*a*/) { return _mm256_setzero_pd(); }
|
||||
template<> EIGEN_STRONG_INLINE Packet8i pzero(const Packet8i& /*a*/) { return _mm256_setzero_si256(); }
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8f pload1<Packet8f>(const float* from) { return _mm256_broadcast_ss(from); }
|
||||
template<> EIGEN_STRONG_INLINE Packet4d pload1<Packet4d>(const double* from) { return _mm256_broadcast_sd(from); }
|
||||
|
||||
@@ -183,15 +129,6 @@ template<> EIGEN_STRONG_INLINE Packet4d plset<Packet4d>(const double& a) { retur
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8f padd<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_add_ps(a,b); }
|
||||
template<> EIGEN_STRONG_INLINE Packet4d padd<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_add_pd(a,b); }
|
||||
template<> EIGEN_STRONG_INLINE Packet8i padd<Packet8i>(const Packet8i& a, const Packet8i& b) {
|
||||
#ifdef EIGEN_VECTORIZE_AVX2
|
||||
return _mm256_add_epi32(a,b);
|
||||
#else
|
||||
__m128i lo = _mm_add_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
|
||||
__m128i hi = _mm_add_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
|
||||
return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
|
||||
#endif
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8f psub<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_sub_ps(a,b); }
|
||||
template<> EIGEN_STRONG_INLINE Packet4d psub<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_sub_pd(a,b); }
|
||||
@@ -220,7 +157,7 @@ template<> EIGEN_STRONG_INLINE Packet8i pdiv<Packet8i>(const Packet8i& /*a*/, co
|
||||
return pset1<Packet8i>(0);
|
||||
}
|
||||
|
||||
#ifdef EIGEN_VECTORIZE_FMA
|
||||
#ifdef __FMA__
|
||||
template<> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f& b, const Packet8f& c) {
|
||||
#if ( (EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC<80) || (EIGEN_COMP_CLANG) )
|
||||
// Clang stupidly generates a vfmadd213ps instruction plus some vmovaps on registers,
|
||||
@@ -247,69 +184,11 @@ template<> EIGEN_STRONG_INLINE Packet4d pmadd(const Packet4d& a, const Packet4d&
|
||||
}
|
||||
#endif
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8f pmin<Packet8f>(const Packet8f& a, const Packet8f& b) {
|
||||
#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63
|
||||
// There appears to be a bug in GCC, by which the optimizer may flip
|
||||
// the argument order in calls to _mm_min_ps/_mm_max_ps, so we have to
|
||||
// resort to inline ASM here. This is supposed to be fixed in gcc6.3,
|
||||
// see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
|
||||
Packet8f res;
|
||||
asm("vminps %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
|
||||
return res;
|
||||
#else
|
||||
// Arguments are swapped to match NaN propagation behavior of std::min.
|
||||
return _mm256_min_ps(b,a);
|
||||
#endif
|
||||
}
|
||||
template<> EIGEN_STRONG_INLINE Packet4d pmin<Packet4d>(const Packet4d& a, const Packet4d& b) {
|
||||
#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63
|
||||
// See pmin above
|
||||
Packet4d res;
|
||||
asm("vminpd %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
|
||||
return res;
|
||||
#else
|
||||
// Arguments are swapped to match NaN propagation behavior of std::min.
|
||||
return _mm256_min_pd(b,a);
|
||||
#endif
|
||||
}
|
||||
template<> EIGEN_STRONG_INLINE Packet8f pmax<Packet8f>(const Packet8f& a, const Packet8f& b) {
|
||||
#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63
|
||||
// See pmin above
|
||||
Packet8f res;
|
||||
asm("vmaxps %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
|
||||
return res;
|
||||
#else
|
||||
// Arguments are swapped to match NaN propagation behavior of std::max.
|
||||
return _mm256_max_ps(b,a);
|
||||
#endif
|
||||
}
|
||||
template<> EIGEN_STRONG_INLINE Packet4d pmax<Packet4d>(const Packet4d& a, const Packet4d& b) {
|
||||
#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63
|
||||
// See pmin above
|
||||
Packet4d res;
|
||||
asm("vmaxpd %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
|
||||
return res;
|
||||
#else
|
||||
// Arguments are swapped to match NaN propagation behavior of std::max.
|
||||
return _mm256_max_pd(b,a);
|
||||
#endif
|
||||
}
|
||||
template<> EIGEN_STRONG_INLINE Packet8f pmin<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_min_ps(a,b); }
|
||||
template<> EIGEN_STRONG_INLINE Packet4d pmin<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_min_pd(a,b); }
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8f pcmp_le(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a,b,_CMP_LE_OQ); }
|
||||
template<> EIGEN_STRONG_INLINE Packet8f pcmp_lt(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a,b,_CMP_LT_OQ); }
|
||||
template<> EIGEN_STRONG_INLINE Packet8f pcmp_eq(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a,b,_CMP_EQ_OQ); }
|
||||
template<> EIGEN_STRONG_INLINE Packet4d pcmp_eq(const Packet4d& a, const Packet4d& b) { return _mm256_cmp_pd(a,b,_CMP_EQ_OQ); }
|
||||
template<> EIGEN_STRONG_INLINE Packet8f pcmp_lt_or_nan(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a, b, _CMP_NGE_UQ); }
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8i pcmp_eq(const Packet8i& a, const Packet8i& b) {
|
||||
#ifdef EIGEN_VECTORIZE_AVX2
|
||||
return _mm256_cmpeq_epi32(a,b);
|
||||
#else
|
||||
__m128i lo = _mm_cmpeq_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
|
||||
__m128i hi = _mm_cmpeq_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
|
||||
return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
|
||||
#endif
|
||||
}
|
||||
template<> EIGEN_STRONG_INLINE Packet8f pmax<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_max_ps(a,b); }
|
||||
template<> EIGEN_STRONG_INLINE Packet4d pmax<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_max_pd(a,b); }
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8f pround<Packet8f>(const Packet8f& a) { return _mm256_round_ps(a, _MM_FROUND_CUR_DIRECTION); }
|
||||
template<> EIGEN_STRONG_INLINE Packet4d pround<Packet4d>(const Packet4d& a) { return _mm256_round_pd(a, _MM_FROUND_CUR_DIRECTION); }
|
||||
@@ -320,101 +199,17 @@ template<> EIGEN_STRONG_INLINE Packet4d pceil<Packet4d>(const Packet4d& a) { ret
|
||||
template<> EIGEN_STRONG_INLINE Packet8f pfloor<Packet8f>(const Packet8f& a) { return _mm256_floor_ps(a); }
|
||||
template<> EIGEN_STRONG_INLINE Packet4d pfloor<Packet4d>(const Packet4d& a) { return _mm256_floor_pd(a); }
|
||||
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8i ptrue<Packet8i>(const Packet8i& a) {
|
||||
#ifdef EIGEN_VECTORIZE_AVX2
|
||||
// vpcmpeqd has lower latency than the more general vcmpps
|
||||
return _mm256_cmpeq_epi32(a,a);
|
||||
#else
|
||||
const __m256 b = _mm256_castsi256_ps(a);
|
||||
return _mm256_castps_si256(_mm256_cmp_ps(b,b,_CMP_TRUE_UQ));
|
||||
#endif
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8f ptrue<Packet8f>(const Packet8f& a) {
|
||||
#ifdef EIGEN_VECTORIZE_AVX2
|
||||
// vpcmpeqd has lower latency than the more general vcmpps
|
||||
const __m256i b = _mm256_castps_si256(a);
|
||||
return _mm256_castsi256_ps(_mm256_cmpeq_epi32(b,b));
|
||||
#else
|
||||
return _mm256_cmp_ps(a,a,_CMP_TRUE_UQ);
|
||||
#endif
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet4d ptrue<Packet4d>(const Packet4d& a) {
|
||||
#ifdef EIGEN_VECTORIZE_AVX2
|
||||
// vpcmpeqq has lower latency than the more general vcmppd
|
||||
const __m256i b = _mm256_castpd_si256(a);
|
||||
return _mm256_castsi256_pd(_mm256_cmpeq_epi64(b,b));
|
||||
#else
|
||||
return _mm256_cmp_pd(a,a,_CMP_TRUE_UQ);
|
||||
#endif
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8f pand<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_and_ps(a,b); }
|
||||
template<> EIGEN_STRONG_INLINE Packet4d pand<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_and_pd(a,b); }
|
||||
template<> EIGEN_STRONG_INLINE Packet8i pand<Packet8i>(const Packet8i& a, const Packet8i& b) {
|
||||
#ifdef EIGEN_VECTORIZE_AVX2
|
||||
return _mm256_and_si256(a,b);
|
||||
#else
|
||||
return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(a),_mm256_castsi256_ps(b)));
|
||||
#endif
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8f por<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_or_ps(a,b); }
|
||||
template<> EIGEN_STRONG_INLINE Packet4d por<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_or_pd(a,b); }
|
||||
template<> EIGEN_STRONG_INLINE Packet8i por<Packet8i>(const Packet8i& a, const Packet8i& b) {
|
||||
#ifdef EIGEN_VECTORIZE_AVX2
|
||||
return _mm256_or_si256(a,b);
|
||||
#else
|
||||
return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(a),_mm256_castsi256_ps(b)));
|
||||
#endif
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8f pxor<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_xor_ps(a,b); }
|
||||
template<> EIGEN_STRONG_INLINE Packet4d pxor<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_xor_pd(a,b); }
|
||||
template<> EIGEN_STRONG_INLINE Packet8i pxor<Packet8i>(const Packet8i& a, const Packet8i& b) {
|
||||
#ifdef EIGEN_VECTORIZE_AVX2
|
||||
return _mm256_xor_si256(a,b);
|
||||
#else
|
||||
return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(a),_mm256_castsi256_ps(b)));
|
||||
#endif
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8f pandnot<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_andnot_ps(b,a); }
|
||||
template<> EIGEN_STRONG_INLINE Packet4d pandnot<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_andnot_pd(b,a); }
|
||||
template<> EIGEN_STRONG_INLINE Packet8i pandnot<Packet8i>(const Packet8i& a, const Packet8i& b) {
|
||||
#ifdef EIGEN_VECTORIZE_AVX2
|
||||
return _mm256_andnot_si256(b,a);
|
||||
#else
|
||||
return _mm256_castps_si256(_mm256_andnot_ps(_mm256_castsi256_ps(b),_mm256_castsi256_ps(a)));
|
||||
#endif
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8f pselect<Packet8f>(const Packet8f& mask, const Packet8f& a, const Packet8f& b)
|
||||
{ return _mm256_blendv_ps(b,a,mask); }
|
||||
template<> EIGEN_STRONG_INLINE Packet4d pselect<Packet4d>(const Packet4d& mask, const Packet4d& a, const Packet4d& b)
|
||||
{ return _mm256_blendv_pd(b,a,mask); }
|
||||
|
||||
template<int N> EIGEN_STRONG_INLINE Packet8i pshiftright(Packet8i a) {
|
||||
#ifdef EIGEN_VECTORIZE_AVX2
|
||||
return _mm256_srli_epi32(a, N);
|
||||
#else
|
||||
__m128i lo = _mm_srli_epi32(_mm256_extractf128_si256(a, 0), N);
|
||||
__m128i hi = _mm_srli_epi32(_mm256_extractf128_si256(a, 1), N);
|
||||
return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
|
||||
#endif
|
||||
}
|
||||
|
||||
template<int N> EIGEN_STRONG_INLINE Packet8i pshiftleft(Packet8i a) {
|
||||
#ifdef EIGEN_VECTORIZE_AVX2
|
||||
return _mm256_slli_epi32(a, N);
|
||||
#else
|
||||
__m128i lo = _mm_slli_epi32(_mm256_extractf128_si256(a, 0), N);
|
||||
__m128i hi = _mm_slli_epi32(_mm256_extractf128_si256(a, 1), N);
|
||||
return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
|
||||
#endif
|
||||
}
|
||||
template<> EIGEN_STRONG_INLINE Packet8f pandnot<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_andnot_ps(a,b); }
|
||||
template<> EIGEN_STRONG_INLINE Packet4d pandnot<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_andnot_pd(a,b); }
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8f pload<Packet8f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_ps(from); }
|
||||
template<> EIGEN_STRONG_INLINE Packet4d pload<Packet4d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_pd(from); }
|
||||
@@ -424,14 +219,6 @@ template<> EIGEN_STRONG_INLINE Packet8f ploadu<Packet8f>(const float* from) { EI
|
||||
template<> EIGEN_STRONG_INLINE Packet4d ploadu<Packet4d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_pd(from); }
|
||||
template<> EIGEN_STRONG_INLINE Packet8i ploadu<Packet8i>(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from)); }
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8f ploadu<Packet8f>(const float* from, uint8_t umask) {
|
||||
Packet8i mask = _mm256_set1_epi8(static_cast<char>(umask));
|
||||
const Packet8i bit_mask = _mm256_set_epi32(0xffffff7f, 0xffffffbf, 0xffffffdf, 0xffffffef, 0xfffffff7, 0xfffffffb, 0xfffffffd, 0xfffffffe);
|
||||
mask = por<Packet8i>(mask, bit_mask);
|
||||
mask = pcmp_eq<Packet8i>(mask, _mm256_set1_epi32(0xffffffff));
|
||||
EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_maskload_ps(from, mask);
|
||||
}
|
||||
|
||||
// Loads 4 floats from memory a returns the packet {a0, a0 a1, a1, a2, a2, a3, a3}
|
||||
template<> EIGEN_STRONG_INLINE Packet8f ploaddup<Packet8f>(const float* from)
|
||||
{
|
||||
@@ -439,7 +226,7 @@ template<> EIGEN_STRONG_INLINE Packet8f ploaddup<Packet8f>(const float* from)
|
||||
// Packet8f tmp = _mm256_castps128_ps256(_mm_loadu_ps(from));
|
||||
// tmp = _mm256_insertf128_ps(tmp, _mm_movehl_ps(_mm256_castps256_ps128(tmp),_mm256_castps256_ps128(tmp)), 1);
|
||||
// return _mm256_unpacklo_ps(tmp,tmp);
|
||||
|
||||
|
||||
// _mm256_insertf128_ps is very slow on Haswell, thus:
|
||||
Packet8f tmp = _mm256_broadcast_ps((const __m128*)(const void*)from);
|
||||
// mimic an "inplace" permutation of the lower 128bits using a blend
|
||||
@@ -469,14 +256,6 @@ template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet8f&
|
||||
template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet4d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_pd(to, from); }
|
||||
template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet8i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from); }
|
||||
|
||||
template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet8f& from, uint8_t umask) {
|
||||
Packet8i mask = _mm256_set1_epi8(static_cast<char>(umask));
|
||||
const Packet8i bit_mask = _mm256_set_epi32(0xffffff7f, 0xffffffbf, 0xffffffdf, 0xffffffef, 0xfffffff7, 0xfffffffb, 0xfffffffd, 0xfffffffe);
|
||||
mask = por<Packet8i>(mask, bit_mask);
|
||||
mask = pcmp_eq<Packet8i>(mask, _mm256_set1_epi32(0xffffffff));
|
||||
EIGEN_DEBUG_UNALIGNED_STORE return _mm256_maskstore_ps(to, mask, from);
|
||||
}
|
||||
|
||||
// NOTE: leverage _mm256_i32gather_ps and _mm256_i32gather_pd if AVX2 instructions are available
|
||||
// NOTE: for the record the following seems to be slower: return _mm256_i32gather_ps(from, _mm256_set1_epi32(stride), 4);
|
||||
template<> EIGEN_DEVICE_FUNC inline Packet8f pgather<float, Packet8f>(const float* from, Index stride)
|
||||
@@ -575,28 +354,6 @@ template<> EIGEN_STRONG_INLINE Packet4d pabs(const Packet4d& a)
|
||||
return _mm256_and_pd(a,mask);
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8f pfrexp<Packet8f>(const Packet8f& a, Packet8f& exponent) {
|
||||
return pfrexp_float(a,exponent);
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8f pldexp<Packet8f>(const Packet8f& a, const Packet8f& exponent) {
|
||||
return pldexp_float(a,exponent);
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet4d pldexp<Packet4d>(const Packet4d& a, const Packet4d& exponent) {
|
||||
// Build e=2^n by constructing the exponents in a 128-bit vector and
|
||||
// shifting them to where they belong in double-precision values.
|
||||
Packet4i cst_1023 = pset1<Packet4i>(1023);
|
||||
__m128i emm0 = _mm256_cvtpd_epi32(exponent);
|
||||
emm0 = _mm_add_epi32(emm0, cst_1023);
|
||||
emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(3, 1, 2, 0));
|
||||
__m128i lo = _mm_slli_epi64(emm0, 52);
|
||||
__m128i hi = _mm_slli_epi64(_mm_srli_epi64(emm0, 32), 52);
|
||||
__m256i e = _mm256_insertf128_si256(_mm256_setzero_si256(), lo, 0);
|
||||
e = _mm256_insertf128_si256(e, hi, 1);
|
||||
return pmul(a,_mm256_castsi256_pd(e));
|
||||
}
|
||||
|
||||
// preduxp should be ok
|
||||
// FIXME: why is this ok? why isn't the simply implementation working as expected?
|
||||
template<> EIGEN_STRONG_INLINE Packet8f preduxp<Packet8f>(const Packet8f* vecs)
|
||||
@@ -649,7 +406,7 @@ template<> EIGEN_STRONG_INLINE double predux<Packet4d>(const Packet4d& a)
|
||||
return predux(Packet2d(_mm_add_pd(_mm256_castpd256_pd128(a),_mm256_extractf128_pd(a,1))));
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet4f predux_half_dowto4<Packet8f>(const Packet8f& a)
|
||||
template<> EIGEN_STRONG_INLINE Packet4f predux_downto4<Packet8f>(const Packet8f& a)
|
||||
{
|
||||
return _mm_add_ps(_mm256_castps256_ps128(a),_mm256_extractf128_ps(a,1));
|
||||
}
|
||||
@@ -693,16 +450,6 @@ template<> EIGEN_STRONG_INLINE double predux_max<Packet4d>(const Packet4d& a)
|
||||
return pfirst(_mm256_max_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1)));
|
||||
}
|
||||
|
||||
// not needed yet
|
||||
// template<> EIGEN_STRONG_INLINE bool predux_all(const Packet8f& x)
|
||||
// {
|
||||
// return _mm256_movemask_ps(x)==0xFF;
|
||||
// }
|
||||
|
||||
template<> EIGEN_STRONG_INLINE bool predux_any(const Packet8f& x)
|
||||
{
|
||||
return _mm256_movemask_ps(x)!=0;
|
||||
}
|
||||
|
||||
template<int Offset>
|
||||
struct palign_impl<Offset,Packet8f>
|
||||
@@ -883,335 +630,6 @@ template<> EIGEN_STRONG_INLINE Packet4d pinsertlast(const Packet4d& a, double b)
|
||||
return _mm256_blend_pd(a,pset1<Packet4d>(b),(1<<3));
|
||||
}
|
||||
|
||||
|
||||
// Packet math for Eigen::half
|
||||
template<> struct unpacket_traits<Packet8h> { typedef Eigen::half type; enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet8h half; };
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8h pset1<Packet8h>(const Eigen::half& from) {
|
||||
Packet8h result;
|
||||
result.x = _mm_set1_epi16(from.x);
|
||||
return result;
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Eigen::half pfirst<Packet8h>(const Packet8h& from) {
|
||||
return half_impl::raw_uint16_to_half(static_cast<unsigned short>(_mm_extract_epi16(from.x, 0)));
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8h pload<Packet8h>(const Eigen::half* from) {
|
||||
Packet8h result;
|
||||
result.x = _mm_load_si128(reinterpret_cast<const __m128i*>(from));
|
||||
return result;
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8h ploadu<Packet8h>(const Eigen::half* from) {
|
||||
Packet8h result;
|
||||
result.x = _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
|
||||
return result;
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet8h& from) {
|
||||
_mm_store_si128(reinterpret_cast<__m128i*>(to), from.x);
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet8h& from) {
|
||||
_mm_storeu_si128(reinterpret_cast<__m128i*>(to), from.x);
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8h
|
||||
ploaddup<Packet8h>(const Eigen::half* from) {
|
||||
Packet8h result;
|
||||
unsigned short a = from[0].x;
|
||||
unsigned short b = from[1].x;
|
||||
unsigned short c = from[2].x;
|
||||
unsigned short d = from[3].x;
|
||||
result.x = _mm_set_epi16(d, d, c, c, b, b, a, a);
|
||||
return result;
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8h
|
||||
ploadquad<Packet8h>(const Eigen::half* from) {
|
||||
Packet8h result;
|
||||
unsigned short a = from[0].x;
|
||||
unsigned short b = from[1].x;
|
||||
result.x = _mm_set_epi16(b, b, b, b, a, a, a, a);
|
||||
return result;
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE Packet8f half2float(const Packet8h& a) {
|
||||
#ifdef EIGEN_HAS_FP16_C
|
||||
return _mm256_cvtph_ps(a.x);
|
||||
#else
|
||||
EIGEN_ALIGN32 Eigen::half aux[8];
|
||||
pstore(aux, a);
|
||||
float f0(aux[0]);
|
||||
float f1(aux[1]);
|
||||
float f2(aux[2]);
|
||||
float f3(aux[3]);
|
||||
float f4(aux[4]);
|
||||
float f5(aux[5]);
|
||||
float f6(aux[6]);
|
||||
float f7(aux[7]);
|
||||
|
||||
return _mm256_set_ps(f7, f6, f5, f4, f3, f2, f1, f0);
|
||||
#endif
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) {
|
||||
#ifdef EIGEN_HAS_FP16_C
|
||||
Packet8h result;
|
||||
result.x = _mm256_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
|
||||
return result;
|
||||
#else
|
||||
EIGEN_ALIGN32 float aux[8];
|
||||
pstore(aux, a);
|
||||
Eigen::half h0(aux[0]);
|
||||
Eigen::half h1(aux[1]);
|
||||
Eigen::half h2(aux[2]);
|
||||
Eigen::half h3(aux[3]);
|
||||
Eigen::half h4(aux[4]);
|
||||
Eigen::half h5(aux[5]);
|
||||
Eigen::half h6(aux[6]);
|
||||
Eigen::half h7(aux[7]);
|
||||
|
||||
Packet8h result;
|
||||
result.x = _mm_set_epi16(h7.x, h6.x, h5.x, h4.x, h3.x, h2.x, h1.x, h0.x);
|
||||
return result;
|
||||
#endif
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8h ptrue(const Packet8h& a) {
|
||||
Packet8h r; r.x = _mm_cmpeq_epi32(a.x, a.x); return r;
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8h por(const Packet8h& a,const Packet8h& b) {
|
||||
// in some cases Packet4i is a wrapper around __m128i, so we either need to
|
||||
// cast to Packet4i to directly call the intrinsics as below:
|
||||
Packet8h r; r.x = _mm_or_si128(a.x,b.x); return r;
|
||||
}
|
||||
template<> EIGEN_STRONG_INLINE Packet8h pxor(const Packet8h& a,const Packet8h& b) {
|
||||
Packet8h r; r.x = _mm_xor_si128(a.x,b.x); return r;
|
||||
}
|
||||
template<> EIGEN_STRONG_INLINE Packet8h pand(const Packet8h& a,const Packet8h& b) {
|
||||
Packet8h r; r.x = _mm_and_si128(a.x,b.x); return r;
|
||||
}
|
||||
template<> EIGEN_STRONG_INLINE Packet8h pandnot(const Packet8h& a,const Packet8h& b) {
|
||||
Packet8h r; r.x = _mm_andnot_si128(b.x,a.x); return r;
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8h pselect(const Packet8h& mask, const Packet8h& a, const Packet8h& b) {
|
||||
Packet8h r; r.x = _mm_blendv_epi8(b.x, a.x, mask.x); return r;
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8h pcmp_eq(const Packet8h& a,const Packet8h& b) {
|
||||
Packet8f af = half2float(a);
|
||||
Packet8f bf = half2float(b);
|
||||
Packet8f rf = pcmp_eq(af, bf);
|
||||
// Pack the 32-bit flags into 16-bits flags.
|
||||
Packet8h result; result.x = _mm_packs_epi32(_mm256_extractf128_si256(_mm256_castps_si256(rf), 0),
|
||||
_mm256_extractf128_si256(_mm256_castps_si256(rf), 1));
|
||||
return result;
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8h pconj(const Packet8h& a) { return a; }
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8h pnegate(const Packet8h& a) {
|
||||
Packet8h sign_mask; sign_mask.x = _mm_set1_epi16(static_cast<unsigned short>(0x8000));
|
||||
Packet8h result; result.x = _mm_xor_si128(a.x, sign_mask.x);
|
||||
return result;
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8h padd<Packet8h>(const Packet8h& a, const Packet8h& b) {
|
||||
Packet8f af = half2float(a);
|
||||
Packet8f bf = half2float(b);
|
||||
Packet8f rf = padd(af, bf);
|
||||
return float2half(rf);
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8h psub<Packet8h>(const Packet8h& a, const Packet8h& b) {
|
||||
Packet8f af = half2float(a);
|
||||
Packet8f bf = half2float(b);
|
||||
Packet8f rf = psub(af, bf);
|
||||
return float2half(rf);
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8h pmul<Packet8h>(const Packet8h& a, const Packet8h& b) {
|
||||
Packet8f af = half2float(a);
|
||||
Packet8f bf = half2float(b);
|
||||
Packet8f rf = pmul(af, bf);
|
||||
return float2half(rf);
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8h pdiv<Packet8h>(const Packet8h& a, const Packet8h& b) {
|
||||
Packet8f af = half2float(a);
|
||||
Packet8f bf = half2float(b);
|
||||
Packet8f rf = pdiv(af, bf);
|
||||
return float2half(rf);
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8h pgather<Eigen::half, Packet8h>(const Eigen::half* from, Index stride)
|
||||
{
|
||||
Packet8h result;
|
||||
result.x = _mm_set_epi16(from[7*stride].x, from[6*stride].x, from[5*stride].x, from[4*stride].x, from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x);
|
||||
return result;
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet8h>(Eigen::half* to, const Packet8h& from, Index stride)
|
||||
{
|
||||
EIGEN_ALIGN32 Eigen::half aux[8];
|
||||
pstore(aux, from);
|
||||
to[stride*0].x = aux[0].x;
|
||||
to[stride*1].x = aux[1].x;
|
||||
to[stride*2].x = aux[2].x;
|
||||
to[stride*3].x = aux[3].x;
|
||||
to[stride*4].x = aux[4].x;
|
||||
to[stride*5].x = aux[5].x;
|
||||
to[stride*6].x = aux[6].x;
|
||||
to[stride*7].x = aux[7].x;
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Eigen::half predux<Packet8h>(const Packet8h& a) {
|
||||
Packet8f af = half2float(a);
|
||||
float reduced = predux<Packet8f>(af);
|
||||
return Eigen::half(reduced);
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Eigen::half predux_max<Packet8h>(const Packet8h& a) {
|
||||
Packet8f af = half2float(a);
|
||||
float reduced = predux_max<Packet8f>(af);
|
||||
return Eigen::half(reduced);
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Eigen::half predux_min<Packet8h>(const Packet8h& a) {
|
||||
Packet8f af = half2float(a);
|
||||
float reduced = predux_min<Packet8f>(af);
|
||||
return Eigen::half(reduced);
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet8h>(const Packet8h& a) {
|
||||
Packet8f af = half2float(a);
|
||||
float reduced = predux_mul<Packet8f>(af);
|
||||
return Eigen::half(reduced);
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8h preduxp<Packet8h>(const Packet8h* p) {
|
||||
Packet8f pf[8];
|
||||
pf[0] = half2float(p[0]);
|
||||
pf[1] = half2float(p[1]);
|
||||
pf[2] = half2float(p[2]);
|
||||
pf[3] = half2float(p[3]);
|
||||
pf[4] = half2float(p[4]);
|
||||
pf[5] = half2float(p[5]);
|
||||
pf[6] = half2float(p[6]);
|
||||
pf[7] = half2float(p[7]);
|
||||
Packet8f reduced = preduxp<Packet8f>(pf);
|
||||
return float2half(reduced);
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8h preverse(const Packet8h& a)
|
||||
{
|
||||
__m128i m = _mm_setr_epi8(14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1);
|
||||
Packet8h res;
|
||||
res.x = _mm_shuffle_epi8(a.x,m);
|
||||
return res;
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8h pinsertfirst(const Packet8h& a, Eigen::half b)
|
||||
{
|
||||
Packet8h res;
|
||||
res.x = _mm_insert_epi16(a.x,int(b.x),0);
|
||||
return res;
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8h pinsertlast(const Packet8h& a, Eigen::half b)
|
||||
{
|
||||
Packet8h res;
|
||||
res.x = _mm_insert_epi16(a.x,int(b.x),7);
|
||||
return res;
|
||||
}
|
||||
|
||||
template<int Offset>
|
||||
struct palign_impl<Offset,Packet8h>
|
||||
{
|
||||
static EIGEN_STRONG_INLINE void run(Packet8h& first, const Packet8h& second)
|
||||
{
|
||||
if (Offset!=0)
|
||||
first.x = _mm_alignr_epi8(second.x,first.x, Offset*2);
|
||||
}
|
||||
};
|
||||
|
||||
EIGEN_STRONG_INLINE void
|
||||
ptranspose(PacketBlock<Packet8h,8>& kernel) {
|
||||
__m128i a = kernel.packet[0].x;
|
||||
__m128i b = kernel.packet[1].x;
|
||||
__m128i c = kernel.packet[2].x;
|
||||
__m128i d = kernel.packet[3].x;
|
||||
__m128i e = kernel.packet[4].x;
|
||||
__m128i f = kernel.packet[5].x;
|
||||
__m128i g = kernel.packet[6].x;
|
||||
__m128i h = kernel.packet[7].x;
|
||||
|
||||
__m128i a03b03 = _mm_unpacklo_epi16(a, b);
|
||||
__m128i c03d03 = _mm_unpacklo_epi16(c, d);
|
||||
__m128i e03f03 = _mm_unpacklo_epi16(e, f);
|
||||
__m128i g03h03 = _mm_unpacklo_epi16(g, h);
|
||||
__m128i a47b47 = _mm_unpackhi_epi16(a, b);
|
||||
__m128i c47d47 = _mm_unpackhi_epi16(c, d);
|
||||
__m128i e47f47 = _mm_unpackhi_epi16(e, f);
|
||||
__m128i g47h47 = _mm_unpackhi_epi16(g, h);
|
||||
|
||||
__m128i a01b01c01d01 = _mm_unpacklo_epi32(a03b03, c03d03);
|
||||
__m128i a23b23c23d23 = _mm_unpackhi_epi32(a03b03, c03d03);
|
||||
__m128i e01f01g01h01 = _mm_unpacklo_epi32(e03f03, g03h03);
|
||||
__m128i e23f23g23h23 = _mm_unpackhi_epi32(e03f03, g03h03);
|
||||
__m128i a45b45c45d45 = _mm_unpacklo_epi32(a47b47, c47d47);
|
||||
__m128i a67b67c67d67 = _mm_unpackhi_epi32(a47b47, c47d47);
|
||||
__m128i e45f45g45h45 = _mm_unpacklo_epi32(e47f47, g47h47);
|
||||
__m128i e67f67g67h67 = _mm_unpackhi_epi32(e47f47, g47h47);
|
||||
|
||||
__m128i a0b0c0d0e0f0g0h0 = _mm_unpacklo_epi64(a01b01c01d01, e01f01g01h01);
|
||||
__m128i a1b1c1d1e1f1g1h1 = _mm_unpackhi_epi64(a01b01c01d01, e01f01g01h01);
|
||||
__m128i a2b2c2d2e2f2g2h2 = _mm_unpacklo_epi64(a23b23c23d23, e23f23g23h23);
|
||||
__m128i a3b3c3d3e3f3g3h3 = _mm_unpackhi_epi64(a23b23c23d23, e23f23g23h23);
|
||||
__m128i a4b4c4d4e4f4g4h4 = _mm_unpacklo_epi64(a45b45c45d45, e45f45g45h45);
|
||||
__m128i a5b5c5d5e5f5g5h5 = _mm_unpackhi_epi64(a45b45c45d45, e45f45g45h45);
|
||||
__m128i a6b6c6d6e6f6g6h6 = _mm_unpacklo_epi64(a67b67c67d67, e67f67g67h67);
|
||||
__m128i a7b7c7d7e7f7g7h7 = _mm_unpackhi_epi64(a67b67c67d67, e67f67g67h67);
|
||||
|
||||
kernel.packet[0].x = a0b0c0d0e0f0g0h0;
|
||||
kernel.packet[1].x = a1b1c1d1e1f1g1h1;
|
||||
kernel.packet[2].x = a2b2c2d2e2f2g2h2;
|
||||
kernel.packet[3].x = a3b3c3d3e3f3g3h3;
|
||||
kernel.packet[4].x = a4b4c4d4e4f4g4h4;
|
||||
kernel.packet[5].x = a5b5c5d5e5f5g5h5;
|
||||
kernel.packet[6].x = a6b6c6d6e6f6g6h6;
|
||||
kernel.packet[7].x = a7b7c7d7e7f7g7h7;
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE void
|
||||
ptranspose(PacketBlock<Packet8h,4>& kernel) {
|
||||
EIGEN_ALIGN32 Eigen::half in[4][8];
|
||||
pstore<Eigen::half>(in[0], kernel.packet[0]);
|
||||
pstore<Eigen::half>(in[1], kernel.packet[1]);
|
||||
pstore<Eigen::half>(in[2], kernel.packet[2]);
|
||||
pstore<Eigen::half>(in[3], kernel.packet[3]);
|
||||
|
||||
EIGEN_ALIGN32 Eigen::half out[4][8];
|
||||
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
out[i][j] = in[j][2*i];
|
||||
}
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
out[i][j+4] = in[j][2*i+1];
|
||||
}
|
||||
}
|
||||
|
||||
kernel.packet[0] = pload<Packet8h>(out[0]);
|
||||
kernel.packet[1] = pload<Packet8h>(out[1]);
|
||||
kernel.packet[2] = pload<Packet8h>(out[2]);
|
||||
kernel.packet[3] = pload<Packet8h>(out[3]);
|
||||
}
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
@@ -37,51 +37,13 @@ struct type_casting_traits<int, float> {
|
||||
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8i pcast<Packet8f, Packet8i>(const Packet8f& a) {
|
||||
return _mm256_cvttps_epi32(a);
|
||||
return _mm256_cvtps_epi32(a);
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8f pcast<Packet8i, Packet8f>(const Packet8i& a) {
|
||||
return _mm256_cvtepi32_ps(a);
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8i preinterpret<Packet8i,Packet8f>(const Packet8f& a) {
|
||||
return _mm256_castps_si256(a);
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8f preinterpret<Packet8f,Packet8i>(const Packet8i& a) {
|
||||
return _mm256_castsi256_ps(a);
|
||||
}
|
||||
|
||||
#ifndef EIGEN_VECTORIZE_AVX512
|
||||
|
||||
template <>
|
||||
struct type_casting_traits<Eigen::half, float> {
|
||||
enum {
|
||||
VectorizedCast = 1,
|
||||
SrcCoeffRatio = 1,
|
||||
TgtCoeffRatio = 1
|
||||
};
|
||||
};
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8f pcast<Packet8h, Packet8f>(const Packet8h& a) {
|
||||
return half2float(a);
|
||||
}
|
||||
|
||||
template <>
|
||||
struct type_casting_traits<float, Eigen::half> {
|
||||
enum {
|
||||
VectorizedCast = 1,
|
||||
SrcCoeffRatio = 1,
|
||||
TgtCoeffRatio = 1
|
||||
};
|
||||
};
|
||||
|
||||
#endif // EIGEN_VECTORIZE_AVX512
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8h pcast<Packet8f, Packet8h>(const Packet8f& a) {
|
||||
return float2half(a);
|
||||
}
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
@@ -1,492 +0,0 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2018 Gael Guennebaud <gael.guennebaud@inria.fr>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#ifndef EIGEN_COMPLEX_AVX512_H
|
||||
#define EIGEN_COMPLEX_AVX512_H
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
namespace internal {
|
||||
|
||||
//---------- float ----------
|
||||
struct Packet8cf
|
||||
{
|
||||
EIGEN_STRONG_INLINE Packet8cf() {}
|
||||
EIGEN_STRONG_INLINE explicit Packet8cf(const __m512& a) : v(a) {}
|
||||
__m512 v;
|
||||
};
|
||||
|
||||
template<> struct packet_traits<std::complex<float> > : default_packet_traits
|
||||
{
|
||||
typedef Packet8cf type;
|
||||
typedef Packet4cf half;
|
||||
enum {
|
||||
Vectorizable = 1,
|
||||
AlignedOnScalar = 1,
|
||||
size = 8,
|
||||
HasHalfPacket = 1,
|
||||
|
||||
HasAdd = 1,
|
||||
HasSub = 1,
|
||||
HasMul = 1,
|
||||
HasDiv = 1,
|
||||
HasNegate = 1,
|
||||
HasAbs = 0,
|
||||
HasAbs2 = 0,
|
||||
HasMin = 0,
|
||||
HasMax = 0,
|
||||
HasSetLinear = 0,
|
||||
HasReduxp = 0
|
||||
};
|
||||
};
|
||||
|
||||
template<> struct unpacket_traits<Packet8cf> {
|
||||
typedef std::complex<float> type;
|
||||
enum {
|
||||
size = 8,
|
||||
alignment=unpacket_traits<Packet16f>::alignment,
|
||||
vectorizable=true,
|
||||
masked_load_available=false,
|
||||
masked_store_available=false
|
||||
};
|
||||
typedef Packet4cf half;
|
||||
};
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8cf ptrue<Packet8cf>(const Packet8cf& a) { return Packet8cf(ptrue(Packet16f(a.v))); }
|
||||
template<> EIGEN_STRONG_INLINE Packet8cf pnot<Packet8cf>(const Packet8cf& a) { return Packet8cf(pnot(Packet16f(a.v))); }
|
||||
template<> EIGEN_STRONG_INLINE Packet8cf padd<Packet8cf>(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(_mm512_add_ps(a.v,b.v)); }
|
||||
template<> EIGEN_STRONG_INLINE Packet8cf psub<Packet8cf>(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(_mm512_sub_ps(a.v,b.v)); }
|
||||
template<> EIGEN_STRONG_INLINE Packet8cf pnegate(const Packet8cf& a)
|
||||
{
|
||||
return Packet8cf(pnegate(a.v));
|
||||
}
|
||||
template<> EIGEN_STRONG_INLINE Packet8cf pconj(const Packet8cf& a)
|
||||
{
|
||||
const __m512 mask = _mm512_castsi512_ps(_mm512_setr_epi32(
|
||||
0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000,
|
||||
0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000));
|
||||
return Packet8cf(pxor(a.v,mask));
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8cf pmul<Packet8cf>(const Packet8cf& a, const Packet8cf& b)
|
||||
{
|
||||
__m512 tmp2 = _mm512_mul_ps(_mm512_movehdup_ps(a.v), _mm512_permute_ps(b.v, _MM_SHUFFLE(2,3,0,1)));
|
||||
return Packet8cf(_mm512_fmaddsub_ps(_mm512_moveldup_ps(a.v), b.v, tmp2));
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8cf pand <Packet8cf>(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(pand(a.v,b.v)); }
|
||||
template<> EIGEN_STRONG_INLINE Packet8cf por <Packet8cf>(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(por(a.v,b.v)); }
|
||||
template<> EIGEN_STRONG_INLINE Packet8cf pxor <Packet8cf>(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(pxor(a.v,b.v)); }
|
||||
template<> EIGEN_STRONG_INLINE Packet8cf pandnot<Packet8cf>(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(pandnot(a.v,b.v)); }
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet8cf pcmp_eq(const Packet8cf& a, const Packet8cf& b) {
|
||||
__m512 eq = pcmp_eq<Packet16f>(a.v, b.v);
|
||||
return Packet8cf(pand(eq, _mm512_permute_ps(eq, 0xB1)));
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8cf pload <Packet8cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet8cf(pload<Packet16f>(&numext::real_ref(*from))); }
|
||||
template<> EIGEN_STRONG_INLINE Packet8cf ploadu<Packet8cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet8cf(ploadu<Packet16f>(&numext::real_ref(*from))); }
|
||||
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8cf pset1<Packet8cf>(const std::complex<float>& from)
|
||||
{
|
||||
return Packet8cf(_mm512_castpd_ps(pload1<Packet8d>((const double*)(const void*)&from)));
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8cf ploaddup<Packet8cf>(const std::complex<float>* from)
|
||||
{
|
||||
return Packet8cf( _mm512_castpd_ps( ploaddup<Packet8d>((const double*)(const void*)from )) );
|
||||
}
|
||||
template<> EIGEN_STRONG_INLINE Packet8cf ploadquad<Packet8cf>(const std::complex<float>* from)
|
||||
{
|
||||
return Packet8cf( _mm512_castpd_ps( ploadquad<Packet8d>((const double*)(const void*)from )) );
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float>* to, const Packet8cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), from.v); }
|
||||
template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float>* to, const Packet8cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), from.v); }
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC inline Packet8cf pgather<std::complex<float>, Packet8cf>(const std::complex<float>* from, Index stride)
|
||||
{
|
||||
return Packet8cf(_mm512_castpd_ps(pgather<double,Packet8d>((const double*)(const void*)from, stride)));
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet8cf>(std::complex<float>* to, const Packet8cf& from, Index stride)
|
||||
{
|
||||
pscatter((double*)(void*)to, _mm512_castps_pd(from.v), stride);
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet8cf>(const Packet8cf& a)
|
||||
{
|
||||
return pfirst(Packet2cf(_mm512_castps512_ps128(a.v)));
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8cf preverse(const Packet8cf& a) {
|
||||
return Packet8cf(_mm512_castsi512_ps(
|
||||
_mm512_permutexvar_epi64( _mm512_set_epi32(0, 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7),
|
||||
_mm512_castps_si512(a.v))));
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet8cf>(const Packet8cf& a)
|
||||
{
|
||||
return predux(padd(Packet4cf(extract256<0>(a.v)),
|
||||
Packet4cf(extract256<1>(a.v))));
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet8cf>(const Packet8cf& a)
|
||||
{
|
||||
return predux_mul(pmul(Packet4cf(extract256<0>(a.v)),
|
||||
Packet4cf(extract256<1>(a.v))));
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet4cf predux_half_dowto4<Packet8cf>(const Packet8cf& a) {
|
||||
__m256 lane0 = extract256<0>(a.v);
|
||||
__m256 lane1 = extract256<1>(a.v);
|
||||
__m256 res = _mm256_add_ps(lane0, lane1);
|
||||
return Packet4cf(res);
|
||||
}
|
||||
|
||||
template<int Offset>
|
||||
struct palign_impl<Offset,Packet8cf>
|
||||
{
|
||||
static EIGEN_STRONG_INLINE void run(Packet8cf& first, const Packet8cf& second)
|
||||
{
|
||||
if (Offset==0) return;
|
||||
palign_impl<Offset*2,Packet16f>::run(first.v, second.v);
|
||||
}
|
||||
};
|
||||
|
||||
template<> struct conj_helper<Packet8cf, Packet8cf, false,true>
|
||||
{
|
||||
EIGEN_STRONG_INLINE Packet8cf pmadd(const Packet8cf& x, const Packet8cf& y, const Packet8cf& c) const
|
||||
{ return padd(pmul(x,y),c); }
|
||||
|
||||
EIGEN_STRONG_INLINE Packet8cf pmul(const Packet8cf& a, const Packet8cf& b) const
|
||||
{
|
||||
return internal::pmul(a, pconj(b));
|
||||
}
|
||||
};
|
||||
|
||||
template<> struct conj_helper<Packet8cf, Packet8cf, true,false>
|
||||
{
|
||||
EIGEN_STRONG_INLINE Packet8cf pmadd(const Packet8cf& x, const Packet8cf& y, const Packet8cf& c) const
|
||||
{ return padd(pmul(x,y),c); }
|
||||
|
||||
EIGEN_STRONG_INLINE Packet8cf pmul(const Packet8cf& a, const Packet8cf& b) const
|
||||
{
|
||||
return internal::pmul(pconj(a), b);
|
||||
}
|
||||
};
|
||||
|
||||
template<> struct conj_helper<Packet8cf, Packet8cf, true,true>
|
||||
{
|
||||
EIGEN_STRONG_INLINE Packet8cf pmadd(const Packet8cf& x, const Packet8cf& y, const Packet8cf& c) const
|
||||
{ return padd(pmul(x,y),c); }
|
||||
|
||||
EIGEN_STRONG_INLINE Packet8cf pmul(const Packet8cf& a, const Packet8cf& b) const
|
||||
{
|
||||
return pconj(internal::pmul(a, b));
|
||||
}
|
||||
};
|
||||
|
||||
EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet8cf,Packet16f)
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8cf pdiv<Packet8cf>(const Packet8cf& a, const Packet8cf& b)
|
||||
{
|
||||
Packet8cf num = pmul(a, pconj(b));
|
||||
__m512 tmp = _mm512_mul_ps(b.v, b.v);
|
||||
__m512 tmp2 = _mm512_shuffle_ps(tmp,tmp,0xB1);
|
||||
__m512 denom = _mm512_add_ps(tmp, tmp2);
|
||||
return Packet8cf(_mm512_div_ps(num.v, denom));
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8cf pcplxflip<Packet8cf>(const Packet8cf& x)
|
||||
{
|
||||
return Packet8cf(_mm512_shuffle_ps(x.v, x.v, _MM_SHUFFLE(2, 3, 0 ,1)));
|
||||
}
|
||||
|
||||
//---------- double ----------
|
||||
struct Packet4cd
|
||||
{
|
||||
EIGEN_STRONG_INLINE Packet4cd() {}
|
||||
EIGEN_STRONG_INLINE explicit Packet4cd(const __m512d& a) : v(a) {}
|
||||
__m512d v;
|
||||
};
|
||||
|
||||
template<> struct packet_traits<std::complex<double> > : default_packet_traits
|
||||
{
|
||||
typedef Packet4cd type;
|
||||
typedef Packet2cd half;
|
||||
enum {
|
||||
Vectorizable = 1,
|
||||
AlignedOnScalar = 0,
|
||||
size = 4,
|
||||
HasHalfPacket = 1,
|
||||
|
||||
HasAdd = 1,
|
||||
HasSub = 1,
|
||||
HasMul = 1,
|
||||
HasDiv = 1,
|
||||
HasNegate = 1,
|
||||
HasAbs = 0,
|
||||
HasAbs2 = 0,
|
||||
HasMin = 0,
|
||||
HasMax = 0,
|
||||
HasSetLinear = 0,
|
||||
HasReduxp = 0
|
||||
};
|
||||
};
|
||||
|
||||
template<> struct unpacket_traits<Packet4cd> {
|
||||
typedef std::complex<double> type;
|
||||
enum {
|
||||
size = 4,
|
||||
alignment = unpacket_traits<Packet8d>::alignment,
|
||||
vectorizable=true,
|
||||
masked_load_available=false,
|
||||
masked_store_available=false
|
||||
};
|
||||
typedef Packet2cd half;
|
||||
};
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet4cd padd<Packet4cd>(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(_mm512_add_pd(a.v,b.v)); }
|
||||
template<> EIGEN_STRONG_INLINE Packet4cd psub<Packet4cd>(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(_mm512_sub_pd(a.v,b.v)); }
|
||||
template<> EIGEN_STRONG_INLINE Packet4cd pnegate(const Packet4cd& a) { return Packet4cd(pnegate(a.v)); }
|
||||
template<> EIGEN_STRONG_INLINE Packet4cd pconj(const Packet4cd& a)
|
||||
{
|
||||
const __m512d mask = _mm512_castsi512_pd(
|
||||
_mm512_set_epi32(0x80000000,0x0,0x0,0x0,0x80000000,0x0,0x0,0x0,
|
||||
0x80000000,0x0,0x0,0x0,0x80000000,0x0,0x0,0x0));
|
||||
return Packet4cd(pxor(a.v,mask));
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet4cd pmul<Packet4cd>(const Packet4cd& a, const Packet4cd& b)
|
||||
{
|
||||
__m512d tmp1 = _mm512_shuffle_pd(a.v,a.v,0x0);
|
||||
__m512d tmp2 = _mm512_shuffle_pd(a.v,a.v,0xFF);
|
||||
__m512d tmp3 = _mm512_shuffle_pd(b.v,b.v,0x55);
|
||||
__m512d odd = _mm512_mul_pd(tmp2, tmp3);
|
||||
return Packet4cd(_mm512_fmaddsub_pd(tmp1, b.v, odd));
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet4cd ptrue<Packet4cd>(const Packet4cd& a) { return Packet4cd(ptrue(Packet8d(a.v))); }
|
||||
template<> EIGEN_STRONG_INLINE Packet4cd pnot<Packet4cd>(const Packet4cd& a) { return Packet4cd(pnot(Packet8d(a.v))); }
|
||||
template<> EIGEN_STRONG_INLINE Packet4cd pand <Packet4cd>(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(pand(a.v,b.v)); }
|
||||
template<> EIGEN_STRONG_INLINE Packet4cd por <Packet4cd>(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(por(a.v,b.v)); }
|
||||
template<> EIGEN_STRONG_INLINE Packet4cd pxor <Packet4cd>(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(pxor(a.v,b.v)); }
|
||||
template<> EIGEN_STRONG_INLINE Packet4cd pandnot<Packet4cd>(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(pandnot(a.v,b.v)); }
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet4cd pcmp_eq(const Packet4cd& a, const Packet4cd& b) {
|
||||
__m512d eq = pcmp_eq<Packet8d>(a.v, b.v);
|
||||
return Packet4cd(pand(eq, _mm512_permute_pd(eq, 0x55)));
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet4cd pload <Packet4cd>(const std::complex<double>* from)
|
||||
{ EIGEN_DEBUG_ALIGNED_LOAD return Packet4cd(pload<Packet8d>((const double*)from)); }
|
||||
template<> EIGEN_STRONG_INLINE Packet4cd ploadu<Packet4cd>(const std::complex<double>* from)
|
||||
{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet4cd(ploadu<Packet8d>((const double*)from)); }
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet4cd pset1<Packet4cd>(const std::complex<double>& from)
|
||||
{
|
||||
#ifdef EIGEN_VECTORIZE_AVX512DQ
|
||||
return Packet4cd(_mm512_broadcast_f64x2(pset1<Packet1cd>(from).v));
|
||||
#else
|
||||
return Packet4cd(_mm512_castps_pd(_mm512_broadcast_f32x4( _mm_castpd_ps(pset1<Packet1cd>(from).v))));
|
||||
#endif
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet4cd ploaddup<Packet4cd>(const std::complex<double>* from) {
|
||||
return Packet4cd(_mm512_insertf64x4(
|
||||
_mm512_castpd256_pd512(ploaddup<Packet2cd>(from).v), ploaddup<Packet2cd>(from+1).v, 1));
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> * to, const Packet4cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); }
|
||||
template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> * to, const Packet4cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); }
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC inline Packet4cd pgather<std::complex<double>, Packet4cd>(const std::complex<double>* from, Index stride)
|
||||
{
|
||||
return Packet4cd(_mm512_insertf64x4(_mm512_castpd256_pd512(
|
||||
_mm256_insertf128_pd(_mm256_castpd128_pd256(ploadu<Packet1cd>(from+0*stride).v), ploadu<Packet1cd>(from+1*stride).v,1)),
|
||||
_mm256_insertf128_pd(_mm256_castpd128_pd256(ploadu<Packet1cd>(from+2*stride).v), ploadu<Packet1cd>(from+3*stride).v,1), 1));
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet4cd>(std::complex<double>* to, const Packet4cd& from, Index stride)
|
||||
{
|
||||
__m512i fromi = _mm512_castpd_si512(from.v);
|
||||
double* tod = (double*)(void*)to;
|
||||
_mm_storeu_pd(tod+0*stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi,0)) );
|
||||
_mm_storeu_pd(tod+2*stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi,1)) );
|
||||
_mm_storeu_pd(tod+4*stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi,2)) );
|
||||
_mm_storeu_pd(tod+6*stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi,3)) );
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet4cd>(const Packet4cd& a)
|
||||
{
|
||||
__m128d low = extract128<0>(a.v);
|
||||
EIGEN_ALIGN16 double res[2];
|
||||
_mm_store_pd(res, low);
|
||||
return std::complex<double>(res[0],res[1]);
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet4cd preverse(const Packet4cd& a) {
|
||||
return Packet4cd(_mm512_shuffle_f64x2(a.v, a.v, EIGEN_SSE_SHUFFLE_MASK(3,2,1,0)));
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet4cd>(const Packet4cd& a)
|
||||
{
|
||||
return predux(padd(Packet2cd(_mm512_extractf64x4_pd(a.v,0)),
|
||||
Packet2cd(_mm512_extractf64x4_pd(a.v,1))));
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet4cd>(const Packet4cd& a)
|
||||
{
|
||||
return predux_mul(pmul(Packet2cd(_mm512_extractf64x4_pd(a.v,0)),
|
||||
Packet2cd(_mm512_extractf64x4_pd(a.v,1))));
|
||||
}
|
||||
|
||||
template<int Offset>
|
||||
struct palign_impl<Offset,Packet4cd>
|
||||
{
|
||||
static EIGEN_STRONG_INLINE void run(Packet4cd& first, const Packet4cd& second)
|
||||
{
|
||||
if (Offset==0) return;
|
||||
palign_impl<Offset*2,Packet8d>::run(first.v, second.v);
|
||||
}
|
||||
};
|
||||
|
||||
template<> struct conj_helper<Packet4cd, Packet4cd, false,true>
|
||||
{
|
||||
EIGEN_STRONG_INLINE Packet4cd pmadd(const Packet4cd& x, const Packet4cd& y, const Packet4cd& c) const
|
||||
{ return padd(pmul(x,y),c); }
|
||||
|
||||
EIGEN_STRONG_INLINE Packet4cd pmul(const Packet4cd& a, const Packet4cd& b) const
|
||||
{
|
||||
return internal::pmul(a, pconj(b));
|
||||
}
|
||||
};
|
||||
|
||||
template<> struct conj_helper<Packet4cd, Packet4cd, true,false>
|
||||
{
|
||||
EIGEN_STRONG_INLINE Packet4cd pmadd(const Packet4cd& x, const Packet4cd& y, const Packet4cd& c) const
|
||||
{ return padd(pmul(x,y),c); }
|
||||
|
||||
EIGEN_STRONG_INLINE Packet4cd pmul(const Packet4cd& a, const Packet4cd& b) const
|
||||
{
|
||||
return internal::pmul(pconj(a), b);
|
||||
}
|
||||
};
|
||||
|
||||
template<> struct conj_helper<Packet4cd, Packet4cd, true,true>
|
||||
{
|
||||
EIGEN_STRONG_INLINE Packet4cd pmadd(const Packet4cd& x, const Packet4cd& y, const Packet4cd& c) const
|
||||
{ return padd(pmul(x,y),c); }
|
||||
|
||||
EIGEN_STRONG_INLINE Packet4cd pmul(const Packet4cd& a, const Packet4cd& b) const
|
||||
{
|
||||
return pconj(internal::pmul(a, b));
|
||||
}
|
||||
};
|
||||
|
||||
EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet4cd,Packet8d)
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet4cd pdiv<Packet4cd>(const Packet4cd& a, const Packet4cd& b)
|
||||
{
|
||||
Packet4cd num = pmul(a, pconj(b));
|
||||
__m512d tmp = _mm512_mul_pd(b.v, b.v);
|
||||
__m512d denom = padd(_mm512_permute_pd(tmp,0x55), tmp);
|
||||
return Packet4cd(_mm512_div_pd(num.v, denom));
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet4cd pcplxflip<Packet4cd>(const Packet4cd& x)
|
||||
{
|
||||
return Packet4cd(_mm512_permute_pd(x.v,0x55));
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC inline void
|
||||
ptranspose(PacketBlock<Packet8cf,4>& kernel) {
|
||||
PacketBlock<Packet8d,4> pb;
|
||||
|
||||
pb.packet[0] = _mm512_castps_pd(kernel.packet[0].v);
|
||||
pb.packet[1] = _mm512_castps_pd(kernel.packet[1].v);
|
||||
pb.packet[2] = _mm512_castps_pd(kernel.packet[2].v);
|
||||
pb.packet[3] = _mm512_castps_pd(kernel.packet[3].v);
|
||||
ptranspose(pb);
|
||||
kernel.packet[0].v = _mm512_castpd_ps(pb.packet[0]);
|
||||
kernel.packet[1].v = _mm512_castpd_ps(pb.packet[1]);
|
||||
kernel.packet[2].v = _mm512_castpd_ps(pb.packet[2]);
|
||||
kernel.packet[3].v = _mm512_castpd_ps(pb.packet[3]);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC inline void
|
||||
ptranspose(PacketBlock<Packet8cf,8>& kernel) {
|
||||
PacketBlock<Packet8d,8> pb;
|
||||
|
||||
pb.packet[0] = _mm512_castps_pd(kernel.packet[0].v);
|
||||
pb.packet[1] = _mm512_castps_pd(kernel.packet[1].v);
|
||||
pb.packet[2] = _mm512_castps_pd(kernel.packet[2].v);
|
||||
pb.packet[3] = _mm512_castps_pd(kernel.packet[3].v);
|
||||
pb.packet[4] = _mm512_castps_pd(kernel.packet[4].v);
|
||||
pb.packet[5] = _mm512_castps_pd(kernel.packet[5].v);
|
||||
pb.packet[6] = _mm512_castps_pd(kernel.packet[6].v);
|
||||
pb.packet[7] = _mm512_castps_pd(kernel.packet[7].v);
|
||||
ptranspose(pb);
|
||||
kernel.packet[0].v = _mm512_castpd_ps(pb.packet[0]);
|
||||
kernel.packet[1].v = _mm512_castpd_ps(pb.packet[1]);
|
||||
kernel.packet[2].v = _mm512_castpd_ps(pb.packet[2]);
|
||||
kernel.packet[3].v = _mm512_castpd_ps(pb.packet[3]);
|
||||
kernel.packet[4].v = _mm512_castpd_ps(pb.packet[4]);
|
||||
kernel.packet[5].v = _mm512_castpd_ps(pb.packet[5]);
|
||||
kernel.packet[6].v = _mm512_castpd_ps(pb.packet[6]);
|
||||
kernel.packet[7].v = _mm512_castpd_ps(pb.packet[7]);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC inline void
|
||||
ptranspose(PacketBlock<Packet4cd,4>& kernel) {
|
||||
__m512d T0 = _mm512_shuffle_f64x2(kernel.packet[0].v, kernel.packet[1].v, EIGEN_SSE_SHUFFLE_MASK(0,1,0,1)); // [a0 a1 b0 b1]
|
||||
__m512d T1 = _mm512_shuffle_f64x2(kernel.packet[0].v, kernel.packet[1].v, EIGEN_SSE_SHUFFLE_MASK(2,3,2,3)); // [a2 a3 b2 b3]
|
||||
__m512d T2 = _mm512_shuffle_f64x2(kernel.packet[2].v, kernel.packet[3].v, EIGEN_SSE_SHUFFLE_MASK(0,1,0,1)); // [c0 c1 d0 d1]
|
||||
__m512d T3 = _mm512_shuffle_f64x2(kernel.packet[2].v, kernel.packet[3].v, EIGEN_SSE_SHUFFLE_MASK(2,3,2,3)); // [c2 c3 d2 d3]
|
||||
|
||||
kernel.packet[3] = Packet4cd(_mm512_shuffle_f64x2(T1, T3, EIGEN_SSE_SHUFFLE_MASK(1,3,1,3))); // [a3 b3 c3 d3]
|
||||
kernel.packet[2] = Packet4cd(_mm512_shuffle_f64x2(T1, T3, EIGEN_SSE_SHUFFLE_MASK(0,2,0,2))); // [a2 b2 c2 d2]
|
||||
kernel.packet[1] = Packet4cd(_mm512_shuffle_f64x2(T0, T2, EIGEN_SSE_SHUFFLE_MASK(1,3,1,3))); // [a1 b1 c1 d1]
|
||||
kernel.packet[0] = Packet4cd(_mm512_shuffle_f64x2(T0, T2, EIGEN_SSE_SHUFFLE_MASK(0,2,0,2))); // [a0 b0 c0 d0]
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8cf pinsertfirst(const Packet8cf& a, std::complex<float> b)
|
||||
{
|
||||
Packet2cf tmp = Packet2cf(_mm512_extractf32x4_ps(a.v,0));
|
||||
tmp = pinsertfirst(tmp, b);
|
||||
return Packet8cf( _mm512_insertf32x4(a.v, tmp.v, 0) );
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet4cd pinsertfirst(const Packet4cd& a, std::complex<double> b)
|
||||
{
|
||||
return Packet4cd(_mm512_castsi512_pd( _mm512_inserti32x4(_mm512_castpd_si512(a.v), _mm_castpd_si128(pset1<Packet1cd>(b).v), 0) ));
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8cf pinsertlast(const Packet8cf& a, std::complex<float> b)
|
||||
{
|
||||
Packet2cf tmp = Packet2cf(_mm512_extractf32x4_ps(a.v,3) );
|
||||
tmp = pinsertlast(tmp, b);
|
||||
return Packet8cf( _mm512_insertf32x4(a.v, tmp.v, 3) );
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet4cd pinsertlast(const Packet4cd& a, std::complex<double> b)
|
||||
{
|
||||
return Packet4cd(_mm512_castsi512_pd( _mm512_inserti32x4(_mm512_castpd_si512(a.v), _mm_castpd_si128(pset1<Packet1cd>(b).v), 3) ));
|
||||
}
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_COMPLEX_AVX512_H
|
||||
@@ -15,13 +15,13 @@ namespace Eigen {
|
||||
namespace internal {
|
||||
|
||||
// Disable the code for older versions of gcc that don't support many of the required avx512 instrinsics.
|
||||
#if EIGEN_GNUC_AT_LEAST(5, 3) || EIGEN_COMP_CLANG || EIGEN_COMP_MSVC >= 1923
|
||||
#if EIGEN_GNUC_AT_LEAST(5, 3)
|
||||
|
||||
#define _EIGEN_DECLARE_CONST_Packet16f(NAME, X) \
|
||||
const Packet16f p16f_##NAME = pset1<Packet16f>(X)
|
||||
|
||||
#define _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(NAME, X) \
|
||||
const Packet16f p16f_##NAME = preinterpret<Packet16f,Packet16i>(pset1<Packet16i>(X))
|
||||
const Packet16f p16f_##NAME = (__m512)pset1<Packet16i>(X)
|
||||
|
||||
#define _EIGEN_DECLARE_CONST_Packet8d(NAME, X) \
|
||||
const Packet8d p8d_##NAME = pset1<Packet8d>(X)
|
||||
@@ -47,7 +47,6 @@ plog<Packet16f>(const Packet16f& _x) {
|
||||
// The smallest non denormalized float number.
|
||||
_EIGEN_DECLARE_CONST_Packet16f_FROM_INT(min_norm_pos, 0x00800000);
|
||||
_EIGEN_DECLARE_CONST_Packet16f_FROM_INT(minus_inf, 0xff800000);
|
||||
_EIGEN_DECLARE_CONST_Packet16f_FROM_INT(pos_inf, 0x7f800000);
|
||||
_EIGEN_DECLARE_CONST_Packet16f_FROM_INT(nan, 0x7fc00000);
|
||||
|
||||
// Polynomial coefficients.
|
||||
@@ -65,14 +64,16 @@ plog<Packet16f>(const Packet16f& _x) {
|
||||
_EIGEN_DECLARE_CONST_Packet16f(cephes_log_q2, 0.693359375f);
|
||||
|
||||
// invalid_mask is set to true when x is NaN
|
||||
__mmask16 invalid_mask = _mm512_cmp_ps_mask(x, _mm512_setzero_ps(), _CMP_NGE_UQ);
|
||||
__mmask16 iszero_mask = _mm512_cmp_ps_mask(x, _mm512_setzero_ps(), _CMP_EQ_OQ);
|
||||
|
||||
__mmask16 invalid_mask =
|
||||
_mm512_cmp_ps_mask(x, _mm512_setzero_ps(), _CMP_NGE_UQ);
|
||||
__mmask16 iszero_mask =
|
||||
_mm512_cmp_ps_mask(x, _mm512_setzero_ps(), _CMP_EQ_UQ);
|
||||
|
||||
// Truncate input values to the minimum positive normal.
|
||||
x = pmax(x, p16f_min_norm_pos);
|
||||
|
||||
// Extract the shifted exponents.
|
||||
Packet16f emm0 = _mm512_cvtepi32_ps(_mm512_srli_epi32(preinterpret<Packet16i,Packet16f>(x), 23));
|
||||
Packet16f emm0 = _mm512_cvtepi32_ps(_mm512_srli_epi32((__m512i)x, 23));
|
||||
Packet16f e = _mm512_sub_ps(emm0, p16f_126f);
|
||||
|
||||
// Set the exponents to -1, i.e. x are in the range [0.5,1).
|
||||
@@ -117,16 +118,10 @@ plog<Packet16f>(const Packet16f& _x) {
|
||||
x = padd(x, y);
|
||||
x = padd(x, y2);
|
||||
|
||||
__mmask16 pos_inf_mask = _mm512_cmp_ps_mask(_x,p16f_pos_inf,_CMP_EQ_OQ);
|
||||
// Filter out invalid inputs, i.e.:
|
||||
// - negative arg will be NAN,
|
||||
// - 0 will be -INF.
|
||||
// - +INF will be +INF
|
||||
// Filter out invalid inputs, i.e. negative arg will be NAN, 0 will be -INF.
|
||||
return _mm512_mask_blend_ps(iszero_mask,
|
||||
_mm512_mask_blend_ps(invalid_mask,
|
||||
_mm512_mask_blend_ps(pos_inf_mask,x,p16f_pos_inf),
|
||||
p16f_nan),
|
||||
p16f_minus_inf);
|
||||
_mm512_mask_blend_ps(invalid_mask, x, p16f_nan),
|
||||
p16f_minus_inf);
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -253,7 +248,6 @@ pexp<Packet8d>(const Packet8d& _x) {
|
||||
return pmax(pmul(x, e), _x);
|
||||
}*/
|
||||
|
||||
|
||||
// Functions for sqrt.
|
||||
// The EIGEN_FAST_MATH version uses the _mm_rsqrt_ps approximation and one step
|
||||
// of Newton's method, at a cost of 1-2 bits of precision as opposed to the
|
||||
@@ -264,39 +258,48 @@ pexp<Packet8d>(const Packet8d& _x) {
|
||||
template <>
|
||||
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
|
||||
psqrt<Packet16f>(const Packet16f& _x) {
|
||||
Packet16f neg_half = pmul(_x, pset1<Packet16f>(-.5f));
|
||||
__mmask16 denormal_mask = _mm512_kand(
|
||||
_mm512_cmp_ps_mask(_x, pset1<Packet16f>((std::numeric_limits<float>::min)()),
|
||||
_CMP_LT_OQ),
|
||||
_mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_GE_OQ));
|
||||
_EIGEN_DECLARE_CONST_Packet16f(one_point_five, 1.5f);
|
||||
_EIGEN_DECLARE_CONST_Packet16f(minus_half, -0.5f);
|
||||
_EIGEN_DECLARE_CONST_Packet16f_FROM_INT(flt_min, 0x00800000);
|
||||
|
||||
Packet16f x = _mm512_rsqrt14_ps(_x);
|
||||
Packet16f neg_half = pmul(_x, p16f_minus_half);
|
||||
|
||||
// select only the inverse sqrt of positive normal inputs (denormals are
|
||||
// flushed to zero and cause infs as well).
|
||||
__mmask16 non_zero_mask = _mm512_cmp_ps_mask(_x, p16f_flt_min, _CMP_GE_OQ);
|
||||
Packet16f x = _mm512_mask_blend_ps(non_zero_mask, _mm512_setzero_ps(), _mm512_rsqrt14_ps(_x));
|
||||
|
||||
// Do a single step of Newton's iteration.
|
||||
x = pmul(x, pmadd(neg_half, pmul(x, x), pset1<Packet16f>(1.5f)));
|
||||
x = pmul(x, pmadd(neg_half, pmul(x, x), p16f_one_point_five));
|
||||
|
||||
// Flush results for denormals to zero.
|
||||
return _mm512_mask_blend_ps(denormal_mask, pmul(_x,x), _mm512_setzero_ps());
|
||||
// Multiply the original _x by it's reciprocal square root to extract the
|
||||
// square root.
|
||||
return pmul(_x, x);
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d
|
||||
psqrt<Packet8d>(const Packet8d& _x) {
|
||||
Packet8d neg_half = pmul(_x, pset1<Packet8d>(-.5));
|
||||
__mmask16 denormal_mask = _mm512_kand(
|
||||
_mm512_cmp_pd_mask(_x, pset1<Packet8d>((std::numeric_limits<double>::min)()),
|
||||
_CMP_LT_OQ),
|
||||
_mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_GE_OQ));
|
||||
_EIGEN_DECLARE_CONST_Packet8d(one_point_five, 1.5);
|
||||
_EIGEN_DECLARE_CONST_Packet8d(minus_half, -0.5);
|
||||
_EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(dbl_min, 0x0010000000000000LL);
|
||||
|
||||
Packet8d x = _mm512_rsqrt14_pd(_x);
|
||||
Packet8d neg_half = pmul(_x, p8d_minus_half);
|
||||
|
||||
// Do a single step of Newton's iteration.
|
||||
x = pmul(x, pmadd(neg_half, pmul(x, x), pset1<Packet8d>(1.5)));
|
||||
// select only the inverse sqrt of positive normal inputs (denormals are
|
||||
// flushed to zero and cause infs as well).
|
||||
__mmask8 non_zero_mask = _mm512_cmp_pd_mask(_x, p8d_dbl_min, _CMP_GE_OQ);
|
||||
Packet8d x = _mm512_mask_blend_pd(non_zero_mask, _mm512_setzero_pd(), _mm512_rsqrt14_pd(_x));
|
||||
|
||||
// Do a first step of Newton's iteration.
|
||||
x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
|
||||
|
||||
// Do a second step of Newton's iteration.
|
||||
x = pmul(x, pmadd(neg_half, pmul(x, x), pset1<Packet8d>(1.5)));
|
||||
x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
|
||||
|
||||
return _mm512_mask_blend_pd(denormal_mask, pmul(_x,x), _mm512_setzero_pd());
|
||||
// Multiply the original _x by it's reciprocal square root to extract the
|
||||
// square root.
|
||||
return pmul(_x, x);
|
||||
}
|
||||
#else
|
||||
template <>
|
||||
@@ -309,136 +312,78 @@ EIGEN_STRONG_INLINE Packet8d psqrt<Packet8d>(const Packet8d& x) {
|
||||
}
|
||||
#endif
|
||||
|
||||
// prsqrt for float.
|
||||
#if defined(EIGEN_VECTORIZE_AVX512ER)
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet16f prsqrt<Packet16f>(const Packet16f& x) {
|
||||
return _mm512_rsqrt28_ps(x);
|
||||
}
|
||||
|
||||
#elif EIGEN_FAST_MATH
|
||||
|
||||
// Functions for rsqrt.
|
||||
// Almost identical to the sqrt routine, just leave out the last multiplication
|
||||
// and fill in NaN/Inf where needed. Note that this function only exists as an
|
||||
// iterative version for doubles since there is no instruction for diretly
|
||||
// computing the reciprocal square root in AVX-512.
|
||||
#ifdef EIGEN_FAST_MATH
|
||||
template <>
|
||||
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
|
||||
prsqrt<Packet16f>(const Packet16f& _x) {
|
||||
_EIGEN_DECLARE_CONST_Packet16f_FROM_INT(inf, 0x7f800000);
|
||||
_EIGEN_DECLARE_CONST_Packet16f_FROM_INT(nan, 0x7fc00000);
|
||||
_EIGEN_DECLARE_CONST_Packet16f(one_point_five, 1.5f);
|
||||
_EIGEN_DECLARE_CONST_Packet16f(minus_half, -0.5f);
|
||||
_EIGEN_DECLARE_CONST_Packet16f_FROM_INT(flt_min, 0x00800000);
|
||||
|
||||
Packet16f neg_half = pmul(_x, p16f_minus_half);
|
||||
|
||||
// Identity infinite, negative and denormal arguments.
|
||||
__mmask16 inf_mask = _mm512_cmp_ps_mask(_x, p16f_inf, _CMP_EQ_OQ);
|
||||
__mmask16 not_pos_mask = _mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_LE_OQ);
|
||||
__mmask16 not_finite_pos_mask = not_pos_mask | inf_mask;
|
||||
|
||||
// Compute an approximate result using the rsqrt intrinsic, forcing +inf
|
||||
// for denormals for consistency with AVX and SSE implementations.
|
||||
Packet16f y_approx = _mm512_rsqrt14_ps(_x);
|
||||
// select only the inverse sqrt of positive normal inputs (denormals are
|
||||
// flushed to zero and cause infs as well).
|
||||
__mmask16 le_zero_mask = _mm512_cmp_ps_mask(_x, p16f_flt_min, _CMP_LT_OQ);
|
||||
Packet16f x = _mm512_mask_blend_ps(le_zero_mask, _mm512_rsqrt14_ps(_x), _mm512_setzero_ps());
|
||||
|
||||
// Do a single step of Newton-Raphson iteration to improve the approximation.
|
||||
// This uses the formula y_{n+1} = y_n * (1.5 - y_n * (0.5 * x) * y_n).
|
||||
// It is essential to evaluate the inner term like this because forming
|
||||
// y_n^2 may over- or underflow.
|
||||
Packet16f y_newton = pmul(y_approx, pmadd(y_approx, pmul(neg_half, y_approx), p16f_one_point_five));
|
||||
// Fill in NaNs and Infs for the negative/zero entries.
|
||||
__mmask16 neg_mask = _mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_LT_OQ);
|
||||
Packet16f infs_and_nans = _mm512_mask_blend_ps(
|
||||
neg_mask, _mm512_mask_blend_ps(le_zero_mask, _mm512_setzero_ps(), p16f_inf), p16f_nan);
|
||||
|
||||
// Select the result of the Newton-Raphson step for positive finite arguments.
|
||||
// For other arguments, choose the output of the intrinsic. This will
|
||||
// return rsqrt(+inf) = 0, rsqrt(x) = NaN if x < 0, and rsqrt(0) = +inf.
|
||||
return _mm512_mask_blend_ps(not_finite_pos_mask, y_newton, y_approx);
|
||||
}
|
||||
// Do a single step of Newton's iteration.
|
||||
x = pmul(x, pmadd(neg_half, pmul(x, x), p16f_one_point_five));
|
||||
|
||||
#else
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet16f prsqrt<Packet16f>(const Packet16f& x) {
|
||||
_EIGEN_DECLARE_CONST_Packet16f(one, 1.0f);
|
||||
return _mm512_div_ps(p16f_one, _mm512_sqrt_ps(x));
|
||||
// Insert NaNs and Infs in all the right places.
|
||||
return _mm512_mask_blend_ps(le_zero_mask, x, infs_and_nans);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
// prsqrt for double.
|
||||
#if EIGEN_FAST_MATH
|
||||
template <>
|
||||
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d
|
||||
prsqrt<Packet8d>(const Packet8d& _x) {
|
||||
_EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(inf, 0x7ff0000000000000LL);
|
||||
_EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(nan, 0x7ff1000000000000LL);
|
||||
_EIGEN_DECLARE_CONST_Packet8d(one_point_five, 1.5);
|
||||
_EIGEN_DECLARE_CONST_Packet8d(minus_half, -0.5);
|
||||
_EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(inf, 0x7ff0000000000000LL);
|
||||
_EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(dbl_min, 0x0010000000000000LL);
|
||||
|
||||
Packet8d neg_half = pmul(_x, p8d_minus_half);
|
||||
|
||||
// Identity infinite, negative and denormal arguments.
|
||||
__mmask8 inf_mask = _mm512_cmp_pd_mask(_x, p8d_inf, _CMP_EQ_OQ);
|
||||
__mmask8 not_pos_mask = _mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_LE_OQ);
|
||||
__mmask8 not_finite_pos_mask = not_pos_mask | inf_mask;
|
||||
// select only the inverse sqrt of positive normal inputs (denormals are
|
||||
// flushed to zero and cause infs as well).
|
||||
__mmask8 le_zero_mask = _mm512_cmp_pd_mask(_x, p8d_dbl_min, _CMP_LT_OQ);
|
||||
Packet8d x = _mm512_mask_blend_pd(le_zero_mask, _mm512_rsqrt14_pd(_x), _mm512_setzero_pd());
|
||||
|
||||
// Compute an approximate result using the rsqrt intrinsic, forcing +inf
|
||||
// for denormals for consistency with AVX and SSE implementations.
|
||||
#if defined(EIGEN_VECTORIZE_AVX512ER)
|
||||
Packet8d y_approx = _mm512_rsqrt28_pd(_x);
|
||||
#else
|
||||
Packet8d y_approx = _mm512_rsqrt14_pd(_x);
|
||||
#endif
|
||||
// Do one or two steps of Newton-Raphson's to improve the approximation, depending on the
|
||||
// starting accuracy (either 2^-14 or 2^-28, depending on whether AVX512ER is available).
|
||||
// The Newton-Raphson algorithm has quadratic convergence and roughly doubles the number
|
||||
// of correct digits for each step.
|
||||
// This uses the formula y_{n+1} = y_n * (1.5 - y_n * (0.5 * x) * y_n).
|
||||
// It is essential to evaluate the inner term like this because forming
|
||||
// y_n^2 may over- or underflow.
|
||||
Packet8d y_newton = pmul(y_approx, pmadd(neg_half, pmul(y_approx, y_approx), p8d_one_point_five));
|
||||
#if !defined(EIGEN_VECTORIZE_AVX512ER)
|
||||
y_newton = pmul(y_newton, pmadd(y_newton, pmul(neg_half, y_newton), p8d_one_point_five));
|
||||
#endif
|
||||
// Select the result of the Newton-Raphson step for positive finite arguments.
|
||||
// For other arguments, choose the output of the intrinsic. This will
|
||||
// return rsqrt(+inf) = 0, rsqrt(x) = NaN if x < 0, and rsqrt(0) = +inf.
|
||||
return _mm512_mask_blend_pd(not_finite_pos_mask, y_newton, y_approx);
|
||||
// Fill in NaNs and Infs for the negative/zero entries.
|
||||
__mmask8 neg_mask = _mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_LT_OQ);
|
||||
Packet8d infs_and_nans = _mm512_mask_blend_pd(
|
||||
neg_mask, _mm512_mask_blend_pd(le_zero_mask, _mm512_setzero_pd(), p8d_inf), p8d_nan);
|
||||
|
||||
// Do a first step of Newton's iteration.
|
||||
x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
|
||||
|
||||
// Do a second step of Newton's iteration.
|
||||
x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
|
||||
|
||||
// Insert NaNs and Infs in all the right places.
|
||||
return _mm512_mask_blend_pd(le_zero_mask, x, infs_and_nans);
|
||||
}
|
||||
#else
|
||||
#elif defined(EIGEN_VECTORIZE_AVX512ER)
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet8d prsqrt<Packet8d>(const Packet8d& x) {
|
||||
_EIGEN_DECLARE_CONST_Packet8d(one, 1.0f);
|
||||
return _mm512_div_pd(p8d_one, _mm512_sqrt_pd(x));
|
||||
EIGEN_STRONG_INLINE Packet16f prsqrt<Packet16f>(const Packet16f& x) {
|
||||
return _mm512_rsqrt28_ps(x);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(EIGEN_VECTORIZE_AVX512DQ)
|
||||
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
|
||||
Packet16f plog1p<Packet16f>(const Packet16f& _x) {
|
||||
return generic_plog1p(_x);
|
||||
}
|
||||
|
||||
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
|
||||
Packet16f pexpm1<Packet16f>(const Packet16f& _x) {
|
||||
return generic_expm1(_x);
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
template <>
|
||||
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
|
||||
psin<Packet16f>(const Packet16f& _x) {
|
||||
return psin_float(_x);
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
|
||||
pcos<Packet16f>(const Packet16f& _x) {
|
||||
return pcos_float(_x);
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
|
||||
ptanh<Packet16f>(const Packet16f& _x) {
|
||||
return internal::generic_fast_tanh_float(_x);
|
||||
}
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,47 +0,0 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2019 Rasmus Munk Larsen <rmlarsen@google.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#ifndef EIGEN_TYPE_CASTING_AVX512_H
|
||||
#define EIGEN_TYPE_CASTING_AVX512_H
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
namespace internal {
|
||||
|
||||
template <>
|
||||
struct type_casting_traits<half, float> {
|
||||
enum {
|
||||
VectorizedCast = 1,
|
||||
SrcCoeffRatio = 1,
|
||||
TgtCoeffRatio = 1
|
||||
};
|
||||
};
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet16f pcast<Packet16h, Packet16f>(const Packet16h& a) {
|
||||
return half2float(a);
|
||||
}
|
||||
|
||||
template <>
|
||||
struct type_casting_traits<float, half> {
|
||||
enum {
|
||||
VectorizedCast = 1,
|
||||
SrcCoeffRatio = 1,
|
||||
TgtCoeffRatio = 1
|
||||
};
|
||||
};
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet16h pcast<Packet16f, Packet16h>(const Packet16f& a) {
|
||||
return float2half(a);
|
||||
}
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_TYPE_CASTING_AVX512_H
|
||||
@@ -60,7 +60,7 @@ template<> struct packet_traits<std::complex<float> > : default_packet_traits
|
||||
};
|
||||
};
|
||||
|
||||
template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet2cf half; };
|
||||
template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2, alignment=Aligned16}; typedef Packet2cf half; };
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from)
|
||||
{
|
||||
@@ -82,14 +82,14 @@ template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<f
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, Index stride)
|
||||
{
|
||||
EIGEN_ALIGN16 std::complex<float> af[2];
|
||||
std::complex<float> EIGEN_ALIGN16 af[2];
|
||||
af[0] = from[0*stride];
|
||||
af[1] = from[1*stride];
|
||||
return pload<Packet2cf>(af);
|
||||
}
|
||||
template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, Index stride)
|
||||
{
|
||||
EIGEN_ALIGN16 std::complex<float> af[2];
|
||||
std::complex<float> EIGEN_ALIGN16 af[2];
|
||||
pstore<std::complex<float> >((std::complex<float> *) af, from);
|
||||
to[0*stride] = af[0];
|
||||
to[1*stride] = af[1];
|
||||
@@ -128,7 +128,7 @@ template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::co
|
||||
|
||||
template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a)
|
||||
{
|
||||
EIGEN_ALIGN16 std::complex<float> res[2];
|
||||
std::complex<float> EIGEN_ALIGN16 res[2];
|
||||
pstore((float *)&res, a.v);
|
||||
|
||||
return res[0];
|
||||
@@ -246,11 +246,6 @@ EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf,2>& kernel)
|
||||
kernel.packet[0].v = tmp;
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b) {
|
||||
Packet4f eq = reinterpret_cast<Packet4f>(vec_cmpeq(a.v,b.v));
|
||||
return Packet2cf(vec_and(eq, vec_perm(eq, eq, p16uc_COMPLEX32_REV)));
|
||||
}
|
||||
|
||||
#ifdef __VSX__
|
||||
template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) {
|
||||
Packet2cf result;
|
||||
@@ -291,7 +286,7 @@ template<> struct packet_traits<std::complex<double> > : default_packet_traits
|
||||
};
|
||||
};
|
||||
|
||||
template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet1cd half; };
|
||||
template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16}; typedef Packet1cd half; };
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet1cd pload <Packet1cd>(const std::complex<double>* from) { return Packet1cd(pload<Packet2d>((const double*)from)); }
|
||||
template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) { return Packet1cd(ploadu<Packet2d>((const double*)from)); }
|
||||
@@ -303,14 +298,14 @@ template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<dou
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index stride)
|
||||
{
|
||||
EIGEN_ALIGN16 std::complex<double> af[2];
|
||||
std::complex<double> EIGEN_ALIGN16 af[2];
|
||||
af[0] = from[0*stride];
|
||||
af[1] = from[1*stride];
|
||||
return pload<Packet1cd>(af);
|
||||
}
|
||||
template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from, Index stride)
|
||||
{
|
||||
EIGEN_ALIGN16 std::complex<double> af[2];
|
||||
std::complex<double> EIGEN_ALIGN16 af[2];
|
||||
pstore<std::complex<double> >(af, from);
|
||||
to[0*stride] = af[0];
|
||||
to[1*stride] = af[1];
|
||||
@@ -350,7 +345,7 @@ template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::c
|
||||
|
||||
template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a)
|
||||
{
|
||||
EIGEN_ALIGN16 std::complex<double> res[2];
|
||||
std::complex<double> EIGEN_ALIGN16 res[2];
|
||||
pstore<std::complex<double> >(res, a);
|
||||
|
||||
return res[0];
|
||||
@@ -427,18 +422,6 @@ EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet1cd,2>& kernel)
|
||||
kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_LO);
|
||||
kernel.packet[0].v = tmp;
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet1cd pcmp_eq(const Packet1cd& a, const Packet1cd& b) {
|
||||
// Compare real and imaginary parts of a and b to get the mask vector:
|
||||
// [re(a)==re(b), im(a)==im(b)]
|
||||
Packet2d eq = reinterpret_cast<Packet2d>(vec_cmpeq(a.v,b.v));
|
||||
// Swap real/imag elements in the mask in to get:
|
||||
// [im(a)==im(b), re(a)==re(b)]
|
||||
Packet2d eq_swapped = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(eq), reinterpret_cast<Packet4ui>(eq), 8));
|
||||
// Return re(a)==re(b) & im(a)==im(b) by computing bitwise AND of eq and eq_swapped
|
||||
return Packet1cd(vec_and(eq, eq_swapped));
|
||||
}
|
||||
|
||||
#endif // __VSX__
|
||||
} // end namespace internal
|
||||
|
||||
|
||||
@@ -9,6 +9,10 @@
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
/* The sin, cos, exp, and log functions of this file come from
|
||||
* Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
|
||||
*/
|
||||
|
||||
#ifndef EIGEN_MATH_FUNCTIONS_ALTIVEC_H
|
||||
#define EIGEN_MATH_FUNCTIONS_ALTIVEC_H
|
||||
|
||||
@@ -16,28 +20,180 @@ namespace Eigen {
|
||||
|
||||
namespace internal {
|
||||
|
||||
static _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
|
||||
static _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
|
||||
static _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
|
||||
static _EIGEN_DECLARE_CONST_Packet4i(23, 23);
|
||||
|
||||
static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inv_mant_mask, ~0x7f800000);
|
||||
|
||||
/* the smallest non denormalized float number */
|
||||
static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(min_norm_pos, 0x00800000);
|
||||
static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_inf, 0xff800000); // -1.f/0.f
|
||||
static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_nan, 0xffffffff);
|
||||
|
||||
/* natural logarithm computed for 4 simultaneous float
|
||||
return NaN for x <= 0
|
||||
*/
|
||||
static _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f);
|
||||
static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292E-2f);
|
||||
static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, - 1.1514610310E-1f);
|
||||
static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740E-1f);
|
||||
static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, - 1.2420140846E-1f);
|
||||
static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, + 1.4249322787E-1f);
|
||||
static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, - 1.6668057665E-1f);
|
||||
static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, + 2.0000714765E-1f);
|
||||
static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, - 2.4999993993E-1f);
|
||||
static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, + 3.3333331174E-1f);
|
||||
static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f);
|
||||
static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f);
|
||||
|
||||
static _EIGEN_DECLARE_CONST_Packet4f(exp_hi, 88.3762626647950f);
|
||||
static _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f);
|
||||
|
||||
static _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f);
|
||||
static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f);
|
||||
static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f);
|
||||
|
||||
static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f);
|
||||
static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f);
|
||||
static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f);
|
||||
static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f);
|
||||
static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f);
|
||||
static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f);
|
||||
|
||||
#ifdef __VSX__
|
||||
static _EIGEN_DECLARE_CONST_Packet2d(1 , 1.0);
|
||||
static _EIGEN_DECLARE_CONST_Packet2d(2 , 2.0);
|
||||
static _EIGEN_DECLARE_CONST_Packet2d(half, 0.5);
|
||||
|
||||
static _EIGEN_DECLARE_CONST_Packet2d(exp_hi, 709.437);
|
||||
static _EIGEN_DECLARE_CONST_Packet2d(exp_lo, -709.436139303);
|
||||
|
||||
static _EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599);
|
||||
|
||||
static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4);
|
||||
static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2);
|
||||
static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1);
|
||||
|
||||
static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6);
|
||||
static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3);
|
||||
static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1);
|
||||
static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0);
|
||||
|
||||
static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125);
|
||||
static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6);
|
||||
|
||||
#ifdef __POWER8_VECTOR__
|
||||
static Packet2l p2l_1023 = { 1023, 1023 };
|
||||
static Packet2ul p2ul_52 = { 52, 52 };
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
|
||||
Packet4f plog<Packet4f>(const Packet4f& _x)
|
||||
{
|
||||
return plog_float(_x);
|
||||
Packet4f x = _x;
|
||||
|
||||
Packet4i emm0;
|
||||
|
||||
/* isvalid_mask is 0 if x < 0 or x is NaN. */
|
||||
Packet4ui isvalid_mask = reinterpret_cast<Packet4ui>(vec_cmpge(x, p4f_ZERO));
|
||||
Packet4ui iszero_mask = reinterpret_cast<Packet4ui>(vec_cmpeq(x, p4f_ZERO));
|
||||
|
||||
x = pmax(x, p4f_min_norm_pos); /* cut off denormalized stuff */
|
||||
emm0 = vec_sr(reinterpret_cast<Packet4i>(x),
|
||||
reinterpret_cast<Packet4ui>(p4i_23));
|
||||
|
||||
/* keep only the fractional part */
|
||||
x = pand(x, p4f_inv_mant_mask);
|
||||
x = por(x, p4f_half);
|
||||
|
||||
emm0 = psub(emm0, p4i_0x7f);
|
||||
Packet4f e = padd(vec_ctf(emm0, 0), p4f_1);
|
||||
|
||||
/* part2:
|
||||
if( x < SQRTHF ) {
|
||||
e -= 1;
|
||||
x = x + x - 1.0;
|
||||
} else { x = x - 1.0; }
|
||||
*/
|
||||
Packet4f mask = reinterpret_cast<Packet4f>(vec_cmplt(x, p4f_cephes_SQRTHF));
|
||||
Packet4f tmp = pand(x, mask);
|
||||
x = psub(x, p4f_1);
|
||||
e = psub(e, pand(p4f_1, mask));
|
||||
x = padd(x, tmp);
|
||||
|
||||
Packet4f x2 = pmul(x,x);
|
||||
Packet4f x3 = pmul(x2,x);
|
||||
|
||||
Packet4f y, y1, y2;
|
||||
y = pmadd(p4f_cephes_log_p0, x, p4f_cephes_log_p1);
|
||||
y1 = pmadd(p4f_cephes_log_p3, x, p4f_cephes_log_p4);
|
||||
y2 = pmadd(p4f_cephes_log_p6, x, p4f_cephes_log_p7);
|
||||
y = pmadd(y , x, p4f_cephes_log_p2);
|
||||
y1 = pmadd(y1, x, p4f_cephes_log_p5);
|
||||
y2 = pmadd(y2, x, p4f_cephes_log_p8);
|
||||
y = pmadd(y, x3, y1);
|
||||
y = pmadd(y, x3, y2);
|
||||
y = pmul(y, x3);
|
||||
|
||||
y1 = pmul(e, p4f_cephes_log_q1);
|
||||
tmp = pmul(x2, p4f_half);
|
||||
y = padd(y, y1);
|
||||
x = psub(x, tmp);
|
||||
y2 = pmul(e, p4f_cephes_log_q2);
|
||||
x = padd(x, y);
|
||||
x = padd(x, y2);
|
||||
// negative arg will be NAN, 0 will be -INF
|
||||
x = vec_sel(x, p4f_minus_inf, iszero_mask);
|
||||
x = vec_sel(p4f_minus_nan, x, isvalid_mask);
|
||||
return x;
|
||||
}
|
||||
|
||||
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
|
||||
Packet4f pexp<Packet4f>(const Packet4f& _x)
|
||||
{
|
||||
return pexp_float(_x);
|
||||
}
|
||||
Packet4f x = _x;
|
||||
|
||||
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
|
||||
Packet4f psin<Packet4f>(const Packet4f& _x)
|
||||
{
|
||||
return psin_float(_x);
|
||||
}
|
||||
Packet4f tmp, fx;
|
||||
Packet4i emm0;
|
||||
|
||||
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
|
||||
Packet4f pcos<Packet4f>(const Packet4f& _x)
|
||||
{
|
||||
return pcos_float(_x);
|
||||
// clamp x
|
||||
x = pmax(pmin(x, p4f_exp_hi), p4f_exp_lo);
|
||||
|
||||
// express exp(x) as exp(g + n*log(2))
|
||||
fx = pmadd(x, p4f_cephes_LOG2EF, p4f_half);
|
||||
|
||||
fx = pfloor(fx);
|
||||
|
||||
tmp = pmul(fx, p4f_cephes_exp_C1);
|
||||
Packet4f z = pmul(fx, p4f_cephes_exp_C2);
|
||||
x = psub(x, tmp);
|
||||
x = psub(x, z);
|
||||
|
||||
z = pmul(x,x);
|
||||
|
||||
Packet4f y = p4f_cephes_exp_p0;
|
||||
y = pmadd(y, x, p4f_cephes_exp_p1);
|
||||
y = pmadd(y, x, p4f_cephes_exp_p2);
|
||||
y = pmadd(y, x, p4f_cephes_exp_p3);
|
||||
y = pmadd(y, x, p4f_cephes_exp_p4);
|
||||
y = pmadd(y, x, p4f_cephes_exp_p5);
|
||||
y = pmadd(y, z, x);
|
||||
y = padd(y, p4f_1);
|
||||
|
||||
// build 2^n
|
||||
emm0 = vec_cts(fx, 0);
|
||||
emm0 = vec_add(emm0, p4i_0x7f);
|
||||
emm0 = vec_sl(emm0, reinterpret_cast<Packet4ui>(p4i_23));
|
||||
|
||||
// Altivec's max & min operators just drop silent NaNs. Check NaNs in
|
||||
// inputs and return them unmodified.
|
||||
Packet4ui isnumber_mask = reinterpret_cast<Packet4ui>(vec_cmpeq(_x, _x));
|
||||
return vec_sel(_x, pmax(pmul(y, reinterpret_cast<Packet4f>(emm0)), _x),
|
||||
isnumber_mask);
|
||||
}
|
||||
|
||||
#ifndef EIGEN_COMP_CLANG
|
||||
@@ -69,19 +225,95 @@ Packet2d psqrt<Packet2d>(const Packet2d& x)
|
||||
return vec_sqrt(x);
|
||||
}
|
||||
|
||||
// VSX support varies between different compilers and even different
|
||||
// versions of the same compiler. For gcc version >= 4.9.3, we can use
|
||||
// vec_cts to efficiently convert Packet2d to Packet2l. Otherwise, use
|
||||
// a slow version that works with older compilers.
|
||||
// Update: apparently vec_cts/vec_ctf intrinsics for 64-bit doubles
|
||||
// are buggy, https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70963
|
||||
static inline Packet2l ConvertToPacket2l(const Packet2d& x) {
|
||||
#if EIGEN_GNUC_AT_LEAST(5, 4) || \
|
||||
(EIGEN_GNUC_AT(6, 1) && __GNUC_PATCHLEVEL__ >= 1)
|
||||
return vec_cts(x, 0); // TODO: check clang version.
|
||||
#else
|
||||
double tmp[2];
|
||||
memcpy(tmp, &x, sizeof(tmp));
|
||||
Packet2l l = { static_cast<long long>(tmp[0]),
|
||||
static_cast<long long>(tmp[1]) };
|
||||
return l;
|
||||
#endif
|
||||
}
|
||||
|
||||
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
|
||||
Packet2d pexp<Packet2d>(const Packet2d& _x)
|
||||
{
|
||||
return pexp_double(_x);
|
||||
}
|
||||
Packet2d x = _x;
|
||||
|
||||
Packet2d tmp, fx;
|
||||
Packet2l emm0;
|
||||
|
||||
// clamp x
|
||||
x = pmax(pmin(x, p2d_exp_hi), p2d_exp_lo);
|
||||
|
||||
/* express exp(x) as exp(g + n*log(2)) */
|
||||
fx = pmadd(x, p2d_cephes_LOG2EF, p2d_half);
|
||||
|
||||
fx = pfloor(fx);
|
||||
|
||||
tmp = pmul(fx, p2d_cephes_exp_C1);
|
||||
Packet2d z = pmul(fx, p2d_cephes_exp_C2);
|
||||
x = psub(x, tmp);
|
||||
x = psub(x, z);
|
||||
|
||||
Packet2d x2 = pmul(x,x);
|
||||
|
||||
Packet2d px = p2d_cephes_exp_p0;
|
||||
px = pmadd(px, x2, p2d_cephes_exp_p1);
|
||||
px = pmadd(px, x2, p2d_cephes_exp_p2);
|
||||
px = pmul (px, x);
|
||||
|
||||
Packet2d qx = p2d_cephes_exp_q0;
|
||||
qx = pmadd(qx, x2, p2d_cephes_exp_q1);
|
||||
qx = pmadd(qx, x2, p2d_cephes_exp_q2);
|
||||
qx = pmadd(qx, x2, p2d_cephes_exp_q3);
|
||||
|
||||
x = pdiv(px,psub(qx,px));
|
||||
x = pmadd(p2d_2,x,p2d_1);
|
||||
|
||||
// build 2^n
|
||||
emm0 = ConvertToPacket2l(fx);
|
||||
|
||||
#ifdef __POWER8_VECTOR__
|
||||
emm0 = vec_add(emm0, p2l_1023);
|
||||
emm0 = vec_sl(emm0, p2ul_52);
|
||||
#else
|
||||
// Code is a bit complex for POWER7. There is actually a
|
||||
// vec_xxsldi intrinsic but it is not supported by some gcc versions.
|
||||
// So we shift (52-32) bits and do a word swap with zeros.
|
||||
_EIGEN_DECLARE_CONST_Packet4i(1023, 1023);
|
||||
_EIGEN_DECLARE_CONST_Packet4i(20, 20); // 52 - 32
|
||||
|
||||
Packet4i emm04i = reinterpret_cast<Packet4i>(emm0);
|
||||
emm04i = vec_add(emm04i, p4i_1023);
|
||||
emm04i = vec_sl(emm04i, reinterpret_cast<Packet4ui>(p4i_20));
|
||||
static const Packet16uc perm = {
|
||||
0x14, 0x15, 0x16, 0x17, 0x00, 0x01, 0x02, 0x03,
|
||||
0x1c, 0x1d, 0x1e, 0x1f, 0x08, 0x09, 0x0a, 0x0b };
|
||||
#ifdef _BIG_ENDIAN
|
||||
emm0 = reinterpret_cast<Packet2l>(vec_perm(p4i_ZERO, emm04i, perm));
|
||||
#else
|
||||
emm0 = reinterpret_cast<Packet2l>(vec_perm(emm04i, p4i_ZERO, perm));
|
||||
#endif
|
||||
|
||||
// Hyperbolic Tangent function.
|
||||
template <>
|
||||
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f
|
||||
ptanh<Packet4f>(const Packet4f& x) {
|
||||
return internal::generic_fast_tanh_float(x);
|
||||
#endif
|
||||
|
||||
// Altivec's max & min operators just drop silent NaNs. Check NaNs in
|
||||
// inputs and return them unmodified.
|
||||
Packet2ul isnumber_mask = reinterpret_cast<Packet2ul>(vec_cmpeq(_x, _x));
|
||||
return vec_sel(_x, pmax(pmul(x, reinterpret_cast<Packet2d>(emm0)), _x),
|
||||
isnumber_mask);
|
||||
}
|
||||
#endif
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
|
||||
@@ -83,6 +83,15 @@ static Packet4i p4i_COUNTDOWN = { 0, 1, 2, 3 };
|
||||
static Packet16uc p16uc_REVERSE32 = { 12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3 };
|
||||
static Packet16uc p16uc_DUPLICATE32_HI = { 0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7 };
|
||||
|
||||
// Mask alignment
|
||||
#ifdef __PPC64__
|
||||
#define _EIGEN_MASK_ALIGNMENT 0xfffffffffffffff0
|
||||
#else
|
||||
#define _EIGEN_MASK_ALIGNMENT 0xfffffff0
|
||||
#endif
|
||||
|
||||
#define _EIGEN_ALIGNED_PTR(x) ((std::ptrdiff_t)(x) & _EIGEN_MASK_ALIGNMENT)
|
||||
|
||||
// Handle endianness properly while loading constants
|
||||
// Define global static constants:
|
||||
#ifdef _BIG_ENDIAN
|
||||
@@ -120,27 +129,27 @@ static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_PSET64_HI, p16uc_PSET64_L
|
||||
#define EIGEN_PPC_PREFETCH(ADDR) asm( " dcbt [%[addr]]\n" :: [addr] "r" (ADDR) : "cc" );
|
||||
#endif
|
||||
|
||||
template <>
|
||||
struct packet_traits<float> : default_packet_traits {
|
||||
template<> struct packet_traits<float> : default_packet_traits
|
||||
{
|
||||
typedef Packet4f type;
|
||||
typedef Packet4f half;
|
||||
enum {
|
||||
Vectorizable = 1,
|
||||
AlignedOnScalar = 1,
|
||||
size = 4,
|
||||
size=4,
|
||||
HasHalfPacket = 1,
|
||||
|
||||
HasAdd = 1,
|
||||
HasSub = 1,
|
||||
HasMul = 1,
|
||||
HasDiv = 1,
|
||||
HasMin = 1,
|
||||
HasMax = 1,
|
||||
HasAbs = 1,
|
||||
HasSin = EIGEN_FAST_MATH,
|
||||
HasCos = EIGEN_FAST_MATH,
|
||||
HasLog = 1,
|
||||
HasExp = 1,
|
||||
HasAdd = 1,
|
||||
HasSub = 1,
|
||||
HasMul = 1,
|
||||
HasDiv = 1,
|
||||
HasMin = 1,
|
||||
HasMax = 1,
|
||||
HasAbs = 1,
|
||||
HasSin = 0,
|
||||
HasCos = 0,
|
||||
HasLog = 0,
|
||||
HasExp = 1,
|
||||
#ifdef __VSX__
|
||||
HasSqrt = 1,
|
||||
#if !EIGEN_COMP_CLANG
|
||||
@@ -151,8 +160,6 @@ struct packet_traits<float> : default_packet_traits {
|
||||
#else
|
||||
HasSqrt = 0,
|
||||
HasRsqrt = 0,
|
||||
HasTanh = EIGEN_FAST_MATH,
|
||||
HasErf = EIGEN_FAST_MATH,
|
||||
#endif
|
||||
HasRound = 1,
|
||||
HasFloor = 1,
|
||||
@@ -161,8 +168,8 @@ struct packet_traits<float> : default_packet_traits {
|
||||
HasBlend = 1
|
||||
};
|
||||
};
|
||||
template <>
|
||||
struct packet_traits<int> : default_packet_traits {
|
||||
template<> struct packet_traits<int> : default_packet_traits
|
||||
{
|
||||
typedef Packet4i type;
|
||||
typedef Packet4i half;
|
||||
enum {
|
||||
@@ -179,19 +186,9 @@ struct packet_traits<int> : default_packet_traits {
|
||||
};
|
||||
};
|
||||
|
||||
template<> struct unpacket_traits<Packet4f>
|
||||
{
|
||||
typedef float type;
|
||||
typedef Packet4f half;
|
||||
typedef Packet4i integer_packet;
|
||||
enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
|
||||
};
|
||||
template<> struct unpacket_traits<Packet4i>
|
||||
{
|
||||
typedef int type;
|
||||
typedef Packet4i half;
|
||||
enum {size=4, alignment=Aligned16, vectorizable=false, masked_load_available=false, masked_store_available=false};
|
||||
};
|
||||
|
||||
template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; };
|
||||
template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; };
|
||||
|
||||
inline std::ostream & operator <<(std::ostream & s, const Packet16uc & v)
|
||||
{
|
||||
@@ -241,38 +238,42 @@ inline std::ostream & operator <<(std::ostream & s, const Packet4ui & v)
|
||||
// Need to define them first or we get specialization after instantiation errors
|
||||
template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from)
|
||||
{
|
||||
// some versions of GCC throw "unused-but-set-parameter".
|
||||
// ignoring these warnings for now.
|
||||
EIGEN_UNUSED_VARIABLE(from);
|
||||
EIGEN_DEBUG_ALIGNED_LOAD
|
||||
#ifdef __VSX__
|
||||
return vec_vsx_ld(0, from);
|
||||
#else
|
||||
return vec_ld(0, from);
|
||||
#endif
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from)
|
||||
{
|
||||
// some versions of GCC throw "unused-but-set-parameter".
|
||||
// ignoring these warnings for now.
|
||||
EIGEN_UNUSED_VARIABLE(from);
|
||||
EIGEN_DEBUG_ALIGNED_LOAD
|
||||
#ifdef __VSX__
|
||||
return vec_vsx_ld(0, from);
|
||||
#else
|
||||
return vec_ld(0, from);
|
||||
#endif
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from)
|
||||
{
|
||||
// some versions of GCC throw "unused-but-set-parameter" (float *to).
|
||||
// ignoring these warnings for now.
|
||||
EIGEN_UNUSED_VARIABLE(to);
|
||||
EIGEN_DEBUG_ALIGNED_STORE
|
||||
#ifdef __VSX__
|
||||
vec_vsx_st(from, 0, to);
|
||||
#else
|
||||
vec_st(from, 0, to);
|
||||
#endif
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from)
|
||||
{
|
||||
// some versions of GCC throw "unused-but-set-parameter" (float *to).
|
||||
// ignoring these warnings for now.
|
||||
EIGEN_UNUSED_VARIABLE(to);
|
||||
EIGEN_DEBUG_ALIGNED_STORE
|
||||
#ifdef __VSX__
|
||||
vec_vsx_st(from, 0, to);
|
||||
#else
|
||||
vec_st(from, 0, to);
|
||||
#endif
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) {
|
||||
@@ -284,11 +285,6 @@ template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) {
|
||||
Packet4i v = {from, from, from, from};
|
||||
return v;
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(unsigned int from) {
|
||||
return reinterpret_cast<Packet4f>(pset1<Packet4i>(from));
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE void
|
||||
pbroadcast4<Packet4f>(const float *a,
|
||||
Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
|
||||
@@ -312,7 +308,7 @@ pbroadcast4<Packet4i>(const int *a,
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
|
||||
{
|
||||
EIGEN_ALIGN16 float af[4];
|
||||
float EIGEN_ALIGN16 af[4];
|
||||
af[0] = from[0*stride];
|
||||
af[1] = from[1*stride];
|
||||
af[2] = from[2*stride];
|
||||
@@ -321,7 +317,7 @@ template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const floa
|
||||
}
|
||||
template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride)
|
||||
{
|
||||
EIGEN_ALIGN16 int ai[4];
|
||||
int EIGEN_ALIGN16 ai[4];
|
||||
ai[0] = from[0*stride];
|
||||
ai[1] = from[1*stride];
|
||||
ai[2] = from[2*stride];
|
||||
@@ -330,7 +326,7 @@ template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* f
|
||||
}
|
||||
template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
|
||||
{
|
||||
EIGEN_ALIGN16 float af[4];
|
||||
float EIGEN_ALIGN16 af[4];
|
||||
pstore<float>(af, from);
|
||||
to[0*stride] = af[0];
|
||||
to[1*stride] = af[1];
|
||||
@@ -339,7 +335,7 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, co
|
||||
}
|
||||
template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride)
|
||||
{
|
||||
EIGEN_ALIGN16 int ai[4];
|
||||
int EIGEN_ALIGN16 ai[4];
|
||||
pstore<int>((int *)ai, from);
|
||||
to[0*stride] = ai[0];
|
||||
to[1*stride] = ai[1];
|
||||
@@ -395,7 +391,6 @@ template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i&
|
||||
template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b)
|
||||
{
|
||||
#ifdef __VSX__
|
||||
// NOTE: about 10% slower than vec_min, but consistent with std::min and SSE regarding NaN
|
||||
Packet4f ret;
|
||||
__asm__ ("xvcmpgesp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
|
||||
return ret;
|
||||
@@ -408,7 +403,6 @@ template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const
|
||||
template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b)
|
||||
{
|
||||
#ifdef __VSX__
|
||||
// NOTE: about 10% slower than vec_max, but consistent with std::max and SSE regarding NaN
|
||||
Packet4f ret;
|
||||
__asm__ ("xvcmpgtsp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
|
||||
return ret;
|
||||
@@ -418,15 +412,6 @@ template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const
|
||||
}
|
||||
template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); }
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) { return reinterpret_cast<Packet4f>(vec_cmple(a,b)); }
|
||||
template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4f& b) { return reinterpret_cast<Packet4f>(vec_cmplt(a,b)); }
|
||||
template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) { return reinterpret_cast<Packet4f>(vec_cmpeq(a,b)); }
|
||||
template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) {
|
||||
Packet4f c = reinterpret_cast<Packet4f>(vec_cmpge(a,b));
|
||||
return vec_nor(c,c);
|
||||
}
|
||||
template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) { return reinterpret_cast<Packet4i>(vec_cmpeq(a,b)); }
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, b); }
|
||||
template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, b); }
|
||||
|
||||
@@ -439,10 +424,6 @@ template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const
|
||||
template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, vec_nor(b, b)); }
|
||||
template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, vec_nor(b, b)); }
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) {
|
||||
return vec_sel(b, a, reinterpret_cast<Packet4ui>(mask));
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) { return vec_round(a); }
|
||||
template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) { return vec_ceil(a); }
|
||||
template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) { return vec_floor(a); }
|
||||
@@ -471,16 +452,16 @@ template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
|
||||
return (Packet4i) vec_perm(MSQ, LSQ, mask); // align the data
|
||||
}
|
||||
#else
|
||||
// We also need to redefine little endian loading of Packet4i/Packet4f using VSX
|
||||
// We also need ot redefine little endian loading of Packet4i/Packet4f using VSX
|
||||
template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
|
||||
{
|
||||
EIGEN_DEBUG_UNALIGNED_LOAD
|
||||
return vec_vsx_ld(0, from);
|
||||
return (Packet4i) vec_vsx_ld((long)from & 15, (const int*) _EIGEN_ALIGNED_PTR(from));
|
||||
}
|
||||
template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
|
||||
{
|
||||
EIGEN_DEBUG_UNALIGNED_LOAD
|
||||
return vec_vsx_ld(0, from);
|
||||
return (Packet4f) vec_vsx_ld((long)from & 15, (const float*) _EIGEN_ALIGNED_PTR(from));
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -537,24 +518,24 @@ template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& f
|
||||
vec_st( MSQ, 0, (unsigned char *)to ); // Store the MSQ part
|
||||
}
|
||||
#else
|
||||
// We also need to redefine little endian loading of Packet4i/Packet4f using VSX
|
||||
// We also need ot redefine little endian loading of Packet4i/Packet4f using VSX
|
||||
template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from)
|
||||
{
|
||||
EIGEN_DEBUG_UNALIGNED_STORE
|
||||
vec_vsx_st(from, 0, to);
|
||||
EIGEN_DEBUG_ALIGNED_STORE
|
||||
vec_vsx_st(from, (long)to & 15, (int*) _EIGEN_ALIGNED_PTR(to));
|
||||
}
|
||||
template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from)
|
||||
{
|
||||
EIGEN_DEBUG_UNALIGNED_STORE
|
||||
vec_vsx_st(from, 0, to);
|
||||
EIGEN_DEBUG_ALIGNED_STORE
|
||||
vec_vsx_st(from, (long)to & 15, (float*) _EIGEN_ALIGNED_PTR(to));
|
||||
}
|
||||
#endif
|
||||
|
||||
template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { EIGEN_PPC_PREFETCH(addr); }
|
||||
template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { EIGEN_PPC_PREFETCH(addr); }
|
||||
|
||||
template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { EIGEN_ALIGN16 float x; vec_ste(a, 0, &x); return x; }
|
||||
template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { EIGEN_ALIGN16 int x; vec_ste(a, 0, &x); return x; }
|
||||
template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x; vec_ste(a, 0, &x); return x; }
|
||||
template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { int EIGEN_ALIGN16 x; vec_ste(a, 0, &x); return x; }
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
|
||||
{
|
||||
@@ -567,19 +548,6 @@ template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a)
|
||||
template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vec_abs(a); }
|
||||
template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vec_abs(a); }
|
||||
|
||||
template<int N> EIGEN_STRONG_INLINE Packet4i pshiftright(Packet4i a)
|
||||
{ return vec_sr(a,reinterpret_cast<Packet4ui>(pset1<Packet4i>(N))); }
|
||||
template<int N> EIGEN_STRONG_INLINE Packet4i pshiftleft(Packet4i a)
|
||||
{ return vec_sl(a,reinterpret_cast<Packet4ui>(pset1<Packet4i>(N))); }
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent) {
|
||||
return pfrexp_float(a,exponent);
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent) {
|
||||
return pldexp_float(a,exponent);
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
|
||||
{
|
||||
Packet4f b, sum;
|
||||
@@ -708,11 +676,6 @@ template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
|
||||
return pfirst(res);
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x)
|
||||
{
|
||||
return vec_any_ne(x, pzero(x));
|
||||
}
|
||||
|
||||
template<int Offset>
|
||||
struct palign_impl<Offset,Packet4f>
|
||||
{
|
||||
@@ -806,43 +769,6 @@ template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, cons
|
||||
}
|
||||
|
||||
|
||||
template <>
|
||||
struct type_casting_traits<float, int> {
|
||||
enum {
|
||||
VectorizedCast = 1,
|
||||
SrcCoeffRatio = 1,
|
||||
TgtCoeffRatio = 1
|
||||
};
|
||||
};
|
||||
|
||||
template <>
|
||||
struct type_casting_traits<int, float> {
|
||||
enum {
|
||||
VectorizedCast = 1,
|
||||
SrcCoeffRatio = 1,
|
||||
TgtCoeffRatio = 1
|
||||
};
|
||||
};
|
||||
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet4i pcast<Packet4f, Packet4i>(const Packet4f& a) {
|
||||
return vec_cts(a,0);
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4i, Packet4f>(const Packet4i& a) {
|
||||
return vec_ctf(a,0);
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i,Packet4f>(const Packet4f& a) {
|
||||
return reinterpret_cast<Packet4i>(a);
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f,Packet4i>(const Packet4i& a) {
|
||||
return reinterpret_cast<Packet4f>(a);
|
||||
}
|
||||
|
||||
|
||||
|
||||
//---------- double ----------
|
||||
#ifdef __VSX__
|
||||
typedef __vector double Packet2d;
|
||||
@@ -909,7 +835,7 @@ template<> struct packet_traits<double> : default_packet_traits
|
||||
};
|
||||
};
|
||||
|
||||
template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet2d half; };
|
||||
template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; };
|
||||
|
||||
inline std::ostream & operator <<(std::ostream & s, const Packet2l & v)
|
||||
{
|
||||
@@ -937,13 +863,21 @@ inline std::ostream & operator <<(std::ostream & s, const Packet2d & v)
|
||||
template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from)
|
||||
{
|
||||
EIGEN_DEBUG_ALIGNED_LOAD
|
||||
return vec_xl(0, const_cast<double *>(from)); // cast needed by Clang
|
||||
#ifdef __VSX__
|
||||
return vec_vsx_ld(0, from);
|
||||
#else
|
||||
return vec_ld(0, from);
|
||||
#endif
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from)
|
||||
{
|
||||
EIGEN_DEBUG_ALIGNED_STORE
|
||||
vec_xst(from, 0, to);
|
||||
#ifdef __VSX__
|
||||
vec_vsx_st(from, 0, to);
|
||||
#else
|
||||
vec_st(from, 0, to);
|
||||
#endif
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
|
||||
@@ -965,14 +899,14 @@ pbroadcast4<Packet2d>(const double *a,
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride)
|
||||
{
|
||||
EIGEN_ALIGN16 double af[2];
|
||||
double EIGEN_ALIGN16 af[2];
|
||||
af[0] = from[0*stride];
|
||||
af[1] = from[1*stride];
|
||||
return pload<Packet2d>(af);
|
||||
}
|
||||
template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
|
||||
{
|
||||
EIGEN_ALIGN16 double af[2];
|
||||
double EIGEN_ALIGN16 af[2];
|
||||
pstore<double>(af, from);
|
||||
to[0*stride] = af[0];
|
||||
to[1*stride] = af[1];
|
||||
@@ -996,7 +930,6 @@ template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d&
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b)
|
||||
{
|
||||
// NOTE: about 10% slower than vec_min, but consistent with std::min and SSE regarding NaN
|
||||
Packet2d ret;
|
||||
__asm__ ("xvcmpgedp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
|
||||
return ret;
|
||||
@@ -1004,20 +937,11 @@ template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b)
|
||||
{
|
||||
// NOTE: about 10% slower than vec_max, but consistent with std::max and SSE regarding NaN
|
||||
Packet2d ret;
|
||||
__asm__ ("xvcmpgtdp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
|
||||
return ret;
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet2d pcmp_le(const Packet2d& a, const Packet2d& b) { return reinterpret_cast<Packet2d>(vec_cmple(a,b)); }
|
||||
template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt(const Packet2d& a, const Packet2d& b) { return reinterpret_cast<Packet2d>(vec_cmplt(a,b)); }
|
||||
template<> EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) { return reinterpret_cast<Packet2d>(vec_cmpeq(a,b)); }
|
||||
template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan(const Packet2d& a, const Packet2d& b) {
|
||||
Packet2d c = reinterpret_cast<Packet2d>(vec_cmpge(a,b));
|
||||
return vec_nor(c,c);
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, b); }
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_or(a, b); }
|
||||
@@ -1032,8 +956,8 @@ template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { re
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from)
|
||||
{
|
||||
EIGEN_DEBUG_UNALIGNED_LOAD
|
||||
return vec_vsx_ld(0, from);
|
||||
EIGEN_DEBUG_ALIGNED_LOAD
|
||||
return (Packet2d) vec_vsx_ld((long)from & 15, (const double*) _EIGEN_ALIGNED_PTR(from));
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from)
|
||||
@@ -1046,13 +970,13 @@ template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from)
|
||||
|
||||
template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from)
|
||||
{
|
||||
EIGEN_DEBUG_UNALIGNED_STORE
|
||||
vec_vsx_st(from, 0, to);
|
||||
EIGEN_DEBUG_ALIGNED_STORE
|
||||
vec_vsx_st((Packet4f)from, (long)to & 15, (float*) _EIGEN_ALIGNED_PTR(to));
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_PPC_PREFETCH(addr); }
|
||||
|
||||
template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { EIGEN_ALIGN16 double x[2]; pstore<double>(x, a); return x[0]; }
|
||||
template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { double EIGEN_ALIGN16 x[2]; pstore<double>(x, a); return x[0]; }
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
|
||||
{
|
||||
@@ -1060,59 +984,6 @@ template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
|
||||
}
|
||||
template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vec_abs(a); }
|
||||
|
||||
// VSX support varies between different compilers and even different
|
||||
// versions of the same compiler. For gcc version >= 4.9.3, we can use
|
||||
// vec_cts to efficiently convert Packet2d to Packet2l. Otherwise, use
|
||||
// a slow version that works with older compilers.
|
||||
// Update: apparently vec_cts/vec_ctf intrinsics for 64-bit doubles
|
||||
// are buggy, https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70963
|
||||
static inline Packet2l ConvertToPacket2l(const Packet2d& x) {
|
||||
#if EIGEN_GNUC_AT_LEAST(5, 4) || \
|
||||
(EIGEN_GNUC_AT(6, 1) && __GNUC_PATCHLEVEL__ >= 1)
|
||||
return vec_cts(x, 0); // TODO: check clang version.
|
||||
#else
|
||||
double tmp[2];
|
||||
memcpy(tmp, &x, sizeof(tmp));
|
||||
Packet2l l = { static_cast<long long>(tmp[0]),
|
||||
static_cast<long long>(tmp[1]) };
|
||||
return l;
|
||||
#endif
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
|
||||
|
||||
// build 2^n
|
||||
Packet2l emm0 = ConvertToPacket2l(exponent);
|
||||
|
||||
#ifdef __POWER8_VECTOR__
|
||||
const Packet2l p2l_1023 = { 1023, 1023 };
|
||||
const Packet2ul p2ul_52 = { 52, 52 };
|
||||
emm0 = vec_add(emm0, p2l_1023);
|
||||
emm0 = vec_sl(emm0, p2ul_52);
|
||||
#else
|
||||
// Code is a bit complex for POWER7. There is actually a
|
||||
// vec_xxsldi intrinsic but it is not supported by some gcc versions.
|
||||
// So we shift (52-32) bits and do a word swap with zeros.
|
||||
const Packet4i p4i_1023 = pset1<Packet4i>(1023);
|
||||
const Packet4i p4i_20 = pset1<Packet4i>(20); // 52 - 32
|
||||
|
||||
Packet4i emm04i = reinterpret_cast<Packet4i>(emm0);
|
||||
emm04i = vec_add(emm04i, p4i_1023);
|
||||
emm04i = vec_sl(emm04i, reinterpret_cast<Packet4ui>(p4i_20));
|
||||
static const Packet16uc perm = {
|
||||
0x14, 0x15, 0x16, 0x17, 0x00, 0x01, 0x02, 0x03,
|
||||
0x1c, 0x1d, 0x1e, 0x1f, 0x08, 0x09, 0x0a, 0x0b };
|
||||
#ifdef _BIG_ENDIAN
|
||||
emm0 = reinterpret_cast<Packet2l>(vec_perm(p4i_ZERO, emm04i, perm));
|
||||
#else
|
||||
emm0 = reinterpret_cast<Packet2l>(vec_perm(emm04i, p4i_ZERO, perm));
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
return pmul(a, reinterpret_cast<Packet2d>(emm0));
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
|
||||
{
|
||||
Packet2d b, sum;
|
||||
|
||||
@@ -16,7 +16,7 @@ namespace Eigen {
|
||||
|
||||
namespace internal {
|
||||
|
||||
#if defined(EIGEN_CUDACC) && defined(EIGEN_USE_GPU)
|
||||
#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
|
||||
|
||||
// Many std::complex methods such as operator+, operator-, operator* and
|
||||
// operator/ are not constexpr. Due to this, clang does not treat them as device
|
||||
@@ -55,7 +55,7 @@ template<typename T> struct scalar_difference_op<std::complex<T>, std::complex<T
|
||||
// Product
|
||||
template<typename T> struct scalar_product_op<const std::complex<T>, const std::complex<T> > : binary_op_base<const std::complex<T>, const std::complex<T> > {
|
||||
enum {
|
||||
Vectorizable = packet_traits<std::complex<T> >::HasMul
|
||||
Vectorizable = packet_traits<std::complex<T>>::HasMul
|
||||
};
|
||||
typedef typename std::complex<T> result_type;
|
||||
|
||||
@@ -76,7 +76,7 @@ template<typename T> struct scalar_product_op<std::complex<T>, std::complex<T> >
|
||||
// Quotient
|
||||
template<typename T> struct scalar_quotient_op<const std::complex<T>, const std::complex<T> > : binary_op_base<const std::complex<T>, const std::complex<T> > {
|
||||
enum {
|
||||
Vectorizable = packet_traits<std::complex<T> >::HasDiv
|
||||
Vectorizable = packet_traits<std::complex<T>>::HasDiv
|
||||
};
|
||||
typedef typename std::complex<T> result_type;
|
||||
|
||||
|
||||
@@ -26,15 +26,15 @@
|
||||
|
||||
|
||||
// Standard 16-bit float type, mostly useful for GPUs. Defines a new
|
||||
// type Eigen::half (inheriting either from CUDA's or HIP's __half struct) with
|
||||
// type Eigen::half (inheriting from CUDA's __half struct) with
|
||||
// operator overloads such that it behaves basically as an arithmetic
|
||||
// type. It will be quite slow on CPUs (so it is recommended to stay
|
||||
// in fp32 for CPUs, except for simple parameter conversions, I/O
|
||||
// in float32_bits for CPUs, except for simple parameter conversions, I/O
|
||||
// to disk and the likes), but fast on GPUs.
|
||||
|
||||
|
||||
#ifndef EIGEN_HALF_H
|
||||
#define EIGEN_HALF_H
|
||||
#ifndef EIGEN_HALF_CUDA_H
|
||||
#define EIGEN_HALF_CUDA_H
|
||||
|
||||
#if __cplusplus > 199711L
|
||||
#define EIGEN_EXPLICIT_CAST(tgt_type) explicit operator tgt_type()
|
||||
@@ -49,25 +49,16 @@ struct half;
|
||||
|
||||
namespace half_impl {
|
||||
|
||||
#if !defined(EIGEN_HAS_GPU_FP16)
|
||||
#if !defined(EIGEN_HAS_CUDA_FP16)
|
||||
// Make our own __half_raw definition that is similar to CUDA's.
|
||||
struct __half_raw {
|
||||
EIGEN_DEVICE_FUNC __half_raw() : x(0) {}
|
||||
explicit EIGEN_DEVICE_FUNC __half_raw(unsigned short raw) : x(raw) {}
|
||||
unsigned short x;
|
||||
};
|
||||
#elif defined(EIGEN_HAS_HIP_FP16)
|
||||
// Nothing to do here
|
||||
// HIP fp16 header file has a definition for __half_raw
|
||||
#elif defined(EIGEN_HAS_CUDA_FP16)
|
||||
#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
|
||||
#elif defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
|
||||
// In CUDA < 9.0, __half is the equivalent of CUDA 9's __half_raw
|
||||
typedef __half __half_raw;
|
||||
#endif // defined(EIGEN_HAS_CUDA_FP16)
|
||||
|
||||
#elif defined(SYCL_DEVICE_ONLY)
|
||||
typedef cl::sycl::half __half_raw;
|
||||
|
||||
typedef __half __half_raw;
|
||||
#endif
|
||||
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw raw_uint16_to_half(unsigned short x);
|
||||
@@ -76,16 +67,10 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h);
|
||||
|
||||
struct half_base : public __half_raw {
|
||||
EIGEN_DEVICE_FUNC half_base() {}
|
||||
EIGEN_DEVICE_FUNC half_base(const half_base& h) : __half_raw(h) {}
|
||||
EIGEN_DEVICE_FUNC half_base(const __half_raw& h) : __half_raw(h) {}
|
||||
|
||||
#if defined(EIGEN_HAS_GPU_FP16)
|
||||
#if defined(EIGEN_HAS_HIP_FP16)
|
||||
EIGEN_DEVICE_FUNC half_base(const __half& h) { x = __half_as_ushort(h); }
|
||||
#elif defined(EIGEN_HAS_CUDA_FP16)
|
||||
#if (defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000)
|
||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000
|
||||
EIGEN_DEVICE_FUNC half_base(const __half& h) : __half_raw(*(__half_raw*)&h) {}
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
};
|
||||
|
||||
@@ -93,38 +78,18 @@ struct half_base : public __half_raw {
|
||||
|
||||
// Class definition.
|
||||
struct half : public half_impl::half_base {
|
||||
|
||||
// Writing this out as separate #if-else blocks to make the code easier to follow
|
||||
// The same applies to most #if-else blocks in this file
|
||||
#if !defined(EIGEN_HAS_GPU_FP16)
|
||||
typedef half_impl::__half_raw __half_raw;
|
||||
#elif defined(EIGEN_HAS_HIP_FP16)
|
||||
// Nothing to do here
|
||||
// HIP fp16 header file has a definition for __half_raw
|
||||
#elif defined(EIGEN_HAS_CUDA_FP16)
|
||||
// Note that EIGEN_CUDA_SDK_VER is set to 0 even when compiling with HIP, so
|
||||
// (EIGEN_CUDA_SDK_VER < 90000) is true even for HIP! So keeping this within
|
||||
// #if defined(EIGEN_HAS_CUDA_FP16) is needed
|
||||
#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
|
||||
#if !defined(EIGEN_HAS_CUDA_FP16) || (defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000)
|
||||
typedef half_impl::__half_raw __half_raw;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
EIGEN_DEVICE_FUNC half() {}
|
||||
|
||||
EIGEN_DEVICE_FUNC half(const __half_raw& h) : half_impl::half_base(h) {}
|
||||
|
||||
#if defined(EIGEN_HAS_GPU_FP16)
|
||||
#if defined(EIGEN_HAS_HIP_FP16)
|
||||
EIGEN_DEVICE_FUNC half(const half& h) : half_impl::half_base(h) {}
|
||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000
|
||||
EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {}
|
||||
#elif defined(EIGEN_HAS_CUDA_FP16)
|
||||
#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000
|
||||
EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {}
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
explicit EIGEN_DEVICE_FUNC half(bool b)
|
||||
: half_impl::half_base(half_impl::raw_uint16_to_half(b ? 0x3c00 : 0)) {}
|
||||
template<class T>
|
||||
@@ -173,6 +138,11 @@ struct half : public half_impl::half_base {
|
||||
EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(double) const {
|
||||
return static_cast<double>(half_impl::half_to_float(*this));
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC half& operator=(const half& other) {
|
||||
x = other.x;
|
||||
return *this;
|
||||
}
|
||||
};
|
||||
|
||||
} // end namespace Eigen
|
||||
@@ -231,24 +201,15 @@ namespace Eigen {
|
||||
|
||||
namespace half_impl {
|
||||
|
||||
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && \
|
||||
EIGEN_CUDA_ARCH >= 530) || \
|
||||
(defined(EIGEN_HAS_HIP_FP16) && defined(HIP_DEVICE_COMPILE))
|
||||
#define EIGEN_HAS_NATIVE_FP16
|
||||
#endif
|
||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
|
||||
|
||||
// Intrinsics for native fp16 support. Note that on current hardware,
|
||||
// these are no faster than fp32 arithmetic (you need to use the half2
|
||||
// these are no faster than float32_bits arithmetic (you need to use the half2
|
||||
// versions to get the ALU speed increased), but you do save the
|
||||
// conversion steps back and forth.
|
||||
|
||||
#if defined(EIGEN_HAS_NATIVE_FP16)
|
||||
EIGEN_STRONG_INLINE __device__ half operator + (const half& a, const half& b) {
|
||||
#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000
|
||||
return __hadd(::__half(a), ::__half(b));
|
||||
#else
|
||||
return __hadd(a, b);
|
||||
#endif
|
||||
}
|
||||
EIGEN_STRONG_INLINE __device__ half operator * (const half& a, const half& b) {
|
||||
return __hmul(a, b);
|
||||
@@ -257,13 +218,9 @@ EIGEN_STRONG_INLINE __device__ half operator - (const half& a, const half& b) {
|
||||
return __hsub(a, b);
|
||||
}
|
||||
EIGEN_STRONG_INLINE __device__ half operator / (const half& a, const half& b) {
|
||||
#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000
|
||||
return __hdiv(a, b);
|
||||
#else
|
||||
float num = __half2float(a);
|
||||
float denom = __half2float(b);
|
||||
return __float2half(num / denom);
|
||||
#endif
|
||||
}
|
||||
EIGEN_STRONG_INLINE __device__ half operator - (const half& a) {
|
||||
return __hneg(a);
|
||||
@@ -303,26 +260,10 @@ EIGEN_STRONG_INLINE __device__ bool operator >= (const half& a, const half& b) {
|
||||
return __hge(a, b);
|
||||
}
|
||||
|
||||
#endif
|
||||
#else // Emulate support for half floats
|
||||
|
||||
// We need to distinguish ‘clang as the CUDA compiler’ from ‘clang as the host compiler,
|
||||
// invoked by NVCC’ (e.g. on MacOS). The former needs to see both host and device implementation
|
||||
// of the functions, while the latter can only deal with one of them.
|
||||
#if !defined(EIGEN_HAS_NATIVE_FP16) || (EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC) // Emulate support for half floats
|
||||
|
||||
#if EIGEN_COMP_CLANG && defined(EIGEN_CUDACC)
|
||||
// We need to provide emulated *host-side* FP16 operators for clang.
|
||||
#pragma push_macro("EIGEN_DEVICE_FUNC")
|
||||
#undef EIGEN_DEVICE_FUNC
|
||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_HAS_NATIVE_FP16)
|
||||
#define EIGEN_DEVICE_FUNC __host__
|
||||
#else // both host and device need emulated ops.
|
||||
#define EIGEN_DEVICE_FUNC __host__ __device__
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// Definitions for CPUs and older HIP+CUDA, mostly working through conversion
|
||||
// to/from fp32.
|
||||
// Definitions for CPUs and older CUDA, mostly working through conversion
|
||||
// to/from float32_bits.
|
||||
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator + (const half& a, const half& b) {
|
||||
return half(float(a) + float(b));
|
||||
@@ -376,9 +317,6 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator >= (const half& a, const hal
|
||||
return float(a) >= float(b);
|
||||
}
|
||||
|
||||
#if defined(__clang__) && defined(__CUDA__)
|
||||
#pragma pop_macro("EIGEN_DEVICE_FUNC")
|
||||
#endif
|
||||
#endif // Emulate support for half floats
|
||||
|
||||
// Division by an index. Do it in full float precision to avoid accuracy
|
||||
@@ -404,8 +342,7 @@ union float32_bits {
|
||||
};
|
||||
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) {
|
||||
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
|
||||
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
|
||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
|
||||
__half tmp_ff = __float2half(ff);
|
||||
return *(__half_raw*)&tmp_ff;
|
||||
|
||||
@@ -461,8 +398,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) {
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h) {
|
||||
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
|
||||
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
|
||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
|
||||
return __half2float(h);
|
||||
|
||||
#elif defined(EIGEN_HAS_FP16_C)
|
||||
@@ -496,8 +432,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isinf)(const half& a) {
|
||||
return (a.x & 0x7fff) == 0x7c00;
|
||||
}
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isnan)(const half& a) {
|
||||
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
|
||||
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
|
||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
|
||||
return __hisnan(a);
|
||||
#else
|
||||
return (a.x & 0x7fff) > 0x7c00;
|
||||
@@ -513,19 +448,14 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half abs(const half& a) {
|
||||
return result;
|
||||
}
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp(const half& a) {
|
||||
#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
|
||||
defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530
|
||||
return half(hexp(a));
|
||||
#else
|
||||
return half(::expf(float(a)));
|
||||
#endif
|
||||
}
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half expm1(const half& a) {
|
||||
return half(numext::expm1(float(a)));
|
||||
}
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log(const half& a) {
|
||||
#if (defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDA_SDK_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
|
||||
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
|
||||
#if defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
|
||||
return half(::hlog(a));
|
||||
#else
|
||||
return half(::logf(float(a)));
|
||||
@@ -538,8 +468,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log10(const half& a) {
|
||||
return half(::log10f(float(a)));
|
||||
}
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sqrt(const half& a) {
|
||||
#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
|
||||
defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530
|
||||
return half(hsqrt(a));
|
||||
#else
|
||||
return half(::sqrtf(float(a)));
|
||||
@@ -561,16 +490,14 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tanh(const half& a) {
|
||||
return half(::tanhf(float(a)));
|
||||
}
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) {
|
||||
#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \
|
||||
defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300
|
||||
return half(hfloor(a));
|
||||
#else
|
||||
return half(::floorf(float(a)));
|
||||
#endif
|
||||
}
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) {
|
||||
#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \
|
||||
defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300
|
||||
return half(hceil(a));
|
||||
#else
|
||||
return half(::ceilf(float(a)));
|
||||
@@ -578,8 +505,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) {
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) {
|
||||
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
|
||||
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
|
||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
|
||||
return __hlt(b, a) ? b : a;
|
||||
#else
|
||||
const float f1 = static_cast<float>(a);
|
||||
@@ -588,8 +514,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) {
|
||||
#endif
|
||||
}
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (max)(const half& a, const half& b) {
|
||||
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
|
||||
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
|
||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
|
||||
return __hlt(a, b) ? b : a;
|
||||
#else
|
||||
const float f1 = static_cast<float>(a);
|
||||
@@ -598,12 +523,10 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (max)(const half& a, const half& b) {
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifndef EIGEN_NO_IO
|
||||
EIGEN_ALWAYS_INLINE std::ostream& operator << (std::ostream& os, const half& v) {
|
||||
os << static_cast<float>(v);
|
||||
return os;
|
||||
}
|
||||
#endif
|
||||
|
||||
} // end namespace half_impl
|
||||
|
||||
@@ -669,8 +592,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half exph(const Eigen::half& a) {
|
||||
return Eigen::half(::expf(float(a)));
|
||||
}
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half logh(const Eigen::half& a) {
|
||||
#if (EIGEN_CUDA_SDK_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
|
||||
defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
#if EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
|
||||
return Eigen::half(::hlog(a));
|
||||
#else
|
||||
return Eigen::half(::logf(float(a)));
|
||||
@@ -704,12 +626,9 @@ struct hash<Eigen::half> {
|
||||
|
||||
|
||||
// Add the missing shfl_xor intrinsic
|
||||
#if (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
|
||||
defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
|
||||
#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
|
||||
__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) {
|
||||
#if (EIGEN_CUDA_SDK_VER < 90000) || \
|
||||
defined(EIGEN_HAS_HIP_FP16)
|
||||
#if EIGEN_CUDACC_VER < 90000
|
||||
return static_cast<Eigen::half>(__shfl_xor(static_cast<float>(var), laneMask, width));
|
||||
#else
|
||||
return static_cast<Eigen::half>(__shfl_xor_sync(0xFFFFFFFF, static_cast<float>(var), laneMask, width));
|
||||
@@ -718,8 +637,7 @@ __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneM
|
||||
#endif
|
||||
|
||||
// ldg() has an overload for __half_raw, but we also need one for Eigen::half.
|
||||
#if (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350) || \
|
||||
defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr) {
|
||||
return Eigen::half_impl::raw_uint16_to_half(
|
||||
__ldg(reinterpret_cast<const unsigned short*>(ptr)));
|
||||
@@ -727,7 +645,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr)
|
||||
#endif
|
||||
|
||||
|
||||
#if defined(EIGEN_GPU_COMPILE_PHASE)
|
||||
#if defined(EIGEN_CUDA_ARCH)
|
||||
namespace Eigen {
|
||||
namespace numext {
|
||||
|
||||
@@ -753,4 +671,4 @@ bool (isfinite)(const Eigen::half& h) {
|
||||
} // namespace numext
|
||||
#endif
|
||||
|
||||
#endif // EIGEN_HALF_H
|
||||
#endif // EIGEN_HALF_CUDA_H
|
||||
@@ -7,8 +7,8 @@
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#ifndef EIGEN_MATH_FUNCTIONS_GPU_H
|
||||
#define EIGEN_MATH_FUNCTIONS_GPU_H
|
||||
#ifndef EIGEN_MATH_FUNCTIONS_CUDA_H
|
||||
#define EIGEN_MATH_FUNCTIONS_CUDA_H
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
@@ -17,7 +17,7 @@ namespace internal {
|
||||
// Make sure this is only available when targeting a GPU: we don't want to
|
||||
// introduce conflicts between these packet_traits definitions and the ones
|
||||
// we'll use on the host side (SSE, AVX, ...)
|
||||
#if defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)
|
||||
#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
float4 plog<float4>(const float4& a)
|
||||
{
|
||||
@@ -56,18 +56,6 @@ double2 pexp<double2>(const double2& a)
|
||||
return make_double2(exp(a.x), exp(a.y));
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
float4 pexpm1<float4>(const float4& a)
|
||||
{
|
||||
return make_float4(expm1f(a.x), expm1f(a.y), expm1f(a.z), expm1f(a.w));
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
double2 pexpm1<double2>(const double2& a)
|
||||
{
|
||||
return make_double2(expm1(a.x), expm1(a.y));
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
float4 psqrt<float4>(const float4& a)
|
||||
{
|
||||
@@ -100,4 +88,4 @@ double2 prsqrt<double2>(const double2& a)
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_MATH_FUNCTIONS_GPU_H
|
||||
#endif // EIGEN_MATH_FUNCTIONS_CUDA_H
|
||||
333
Eigen/src/Core/arch/CUDA/PacketMath.h
Normal file
333
Eigen/src/Core/arch/CUDA/PacketMath.h
Normal file
@@ -0,0 +1,333 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#ifndef EIGEN_PACKET_MATH_CUDA_H
|
||||
#define EIGEN_PACKET_MATH_CUDA_H
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
namespace internal {
|
||||
|
||||
// Make sure this is only available when targeting a GPU: we don't want to
|
||||
// introduce conflicts between these packet_traits definitions and the ones
|
||||
// we'll use on the host side (SSE, AVX, ...)
|
||||
#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
|
||||
template<> struct is_arithmetic<float4> { enum { value = true }; };
|
||||
template<> struct is_arithmetic<double2> { enum { value = true }; };
|
||||
|
||||
template<> struct packet_traits<float> : default_packet_traits
|
||||
{
|
||||
typedef float4 type;
|
||||
typedef float4 half;
|
||||
enum {
|
||||
Vectorizable = 1,
|
||||
AlignedOnScalar = 1,
|
||||
size=4,
|
||||
HasHalfPacket = 0,
|
||||
|
||||
HasDiv = 1,
|
||||
HasSin = 0,
|
||||
HasCos = 0,
|
||||
HasLog = 1,
|
||||
HasExp = 1,
|
||||
HasSqrt = 1,
|
||||
HasRsqrt = 1,
|
||||
HasLGamma = 1,
|
||||
HasDiGamma = 1,
|
||||
HasZeta = 1,
|
||||
HasPolygamma = 1,
|
||||
HasErf = 1,
|
||||
HasErfc = 1,
|
||||
HasIGamma = 1,
|
||||
HasIGammac = 1,
|
||||
HasBetaInc = 1,
|
||||
|
||||
HasBlend = 0,
|
||||
};
|
||||
};
|
||||
|
||||
template<> struct packet_traits<double> : default_packet_traits
|
||||
{
|
||||
typedef double2 type;
|
||||
typedef double2 half;
|
||||
enum {
|
||||
Vectorizable = 1,
|
||||
AlignedOnScalar = 1,
|
||||
size=2,
|
||||
HasHalfPacket = 0,
|
||||
|
||||
HasDiv = 1,
|
||||
HasLog = 1,
|
||||
HasExp = 1,
|
||||
HasSqrt = 1,
|
||||
HasRsqrt = 1,
|
||||
HasLGamma = 1,
|
||||
HasDiGamma = 1,
|
||||
HasZeta = 1,
|
||||
HasPolygamma = 1,
|
||||
HasErf = 1,
|
||||
HasErfc = 1,
|
||||
HasIGamma = 1,
|
||||
HasIGammac = 1,
|
||||
HasBetaInc = 1,
|
||||
|
||||
HasBlend = 0,
|
||||
};
|
||||
};
|
||||
|
||||
|
||||
template<> struct unpacket_traits<float4> { typedef float type; enum {size=4, alignment=Aligned16}; typedef float4 half; };
|
||||
template<> struct unpacket_traits<double2> { typedef double type; enum {size=2, alignment=Aligned16}; typedef double2 half; };
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pset1<float4>(const float& from) {
|
||||
return make_float4(from, from, from, from);
|
||||
}
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pset1<double2>(const double& from) {
|
||||
return make_double2(from, from);
|
||||
}
|
||||
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plset<float4>(const float& a) {
|
||||
return make_float4(a, a+1, a+2, a+3);
|
||||
}
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 plset<double2>(const double& a) {
|
||||
return make_double2(a, a+1);
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 padd<float4>(const float4& a, const float4& b) {
|
||||
return make_float4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w);
|
||||
}
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 padd<double2>(const double2& a, const double2& b) {
|
||||
return make_double2(a.x+b.x, a.y+b.y);
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 psub<float4>(const float4& a, const float4& b) {
|
||||
return make_float4(a.x-b.x, a.y-b.y, a.z-b.z, a.w-b.w);
|
||||
}
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 psub<double2>(const double2& a, const double2& b) {
|
||||
return make_double2(a.x-b.x, a.y-b.y);
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pnegate(const float4& a) {
|
||||
return make_float4(-a.x, -a.y, -a.z, -a.w);
|
||||
}
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pnegate(const double2& a) {
|
||||
return make_double2(-a.x, -a.y);
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pconj(const float4& a) { return a; }
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pconj(const double2& a) { return a; }
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmul<float4>(const float4& a, const float4& b) {
|
||||
return make_float4(a.x*b.x, a.y*b.y, a.z*b.z, a.w*b.w);
|
||||
}
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmul<double2>(const double2& a, const double2& b) {
|
||||
return make_double2(a.x*b.x, a.y*b.y);
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pdiv<float4>(const float4& a, const float4& b) {
|
||||
return make_float4(a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w);
|
||||
}
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pdiv<double2>(const double2& a, const double2& b) {
|
||||
return make_double2(a.x/b.x, a.y/b.y);
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmin<float4>(const float4& a, const float4& b) {
|
||||
return make_float4(fminf(a.x, b.x), fminf(a.y, b.y), fminf(a.z, b.z), fminf(a.w, b.w));
|
||||
}
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmin<double2>(const double2& a, const double2& b) {
|
||||
return make_double2(fmin(a.x, b.x), fmin(a.y, b.y));
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmax<float4>(const float4& a, const float4& b) {
|
||||
return make_float4(fmaxf(a.x, b.x), fmaxf(a.y, b.y), fmaxf(a.z, b.z), fmaxf(a.w, b.w));
|
||||
}
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmax<double2>(const double2& a, const double2& b) {
|
||||
return make_double2(fmax(a.x, b.x), fmax(a.y, b.y));
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pload<float4>(const float* from) {
|
||||
return *reinterpret_cast<const float4*>(from);
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pload<double2>(const double* from) {
|
||||
return *reinterpret_cast<const double2*>(from);
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploadu<float4>(const float* from) {
|
||||
return make_float4(from[0], from[1], from[2], from[3]);
|
||||
}
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploadu<double2>(const double* from) {
|
||||
return make_double2(from[0], from[1]);
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE float4 ploaddup<float4>(const float* from) {
|
||||
return make_float4(from[0], from[0], from[1], from[1]);
|
||||
}
|
||||
template<> EIGEN_STRONG_INLINE double2 ploaddup<double2>(const double* from) {
|
||||
return make_double2(from[0], from[0]);
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<float>(float* to, const float4& from) {
|
||||
*reinterpret_cast<float4*>(to) = from;
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<double>(double* to, const double2& from) {
|
||||
*reinterpret_cast<double2*>(to) = from;
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const float4& from) {
|
||||
to[0] = from.x;
|
||||
to[1] = from.y;
|
||||
to[2] = from.z;
|
||||
to[3] = from.w;
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const double2& from) {
|
||||
to[0] = from.x;
|
||||
to[1] = from.y;
|
||||
}
|
||||
|
||||
template<>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const float* from) {
|
||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
|
||||
return __ldg((const float4*)from);
|
||||
#else
|
||||
return make_float4(from[0], from[1], from[2], from[3]);
|
||||
#endif
|
||||
}
|
||||
template<>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const double* from) {
|
||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
|
||||
return __ldg((const double2*)from);
|
||||
#else
|
||||
return make_double2(from[0], from[1]);
|
||||
#endif
|
||||
}
|
||||
|
||||
template<>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Unaligned>(const float* from) {
|
||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
|
||||
return make_float4(__ldg(from+0), __ldg(from+1), __ldg(from+2), __ldg(from+3));
|
||||
#else
|
||||
return make_float4(from[0], from[1], from[2], from[3]);
|
||||
#endif
|
||||
}
|
||||
template<>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Unaligned>(const double* from) {
|
||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
|
||||
return make_double2(__ldg(from+0), __ldg(from+1));
|
||||
#else
|
||||
return make_double2(from[0], from[1]);
|
||||
#endif
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC inline float4 pgather<float, float4>(const float* from, Index stride) {
|
||||
return make_float4(from[0*stride], from[1*stride], from[2*stride], from[3*stride]);
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC inline double2 pgather<double, double2>(const double* from, Index stride) {
|
||||
return make_double2(from[0*stride], from[1*stride]);
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC inline void pscatter<float, float4>(float* to, const float4& from, Index stride) {
|
||||
to[stride*0] = from.x;
|
||||
to[stride*1] = from.y;
|
||||
to[stride*2] = from.z;
|
||||
to[stride*3] = from.w;
|
||||
}
|
||||
template<> EIGEN_DEVICE_FUNC inline void pscatter<double, double2>(double* to, const double2& from, Index stride) {
|
||||
to[stride*0] = from.x;
|
||||
to[stride*1] = from.y;
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC inline float pfirst<float4>(const float4& a) {
|
||||
return a.x;
|
||||
}
|
||||
template<> EIGEN_DEVICE_FUNC inline double pfirst<double2>(const double2& a) {
|
||||
return a.x;
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC inline float predux<float4>(const float4& a) {
|
||||
return a.x + a.y + a.z + a.w;
|
||||
}
|
||||
template<> EIGEN_DEVICE_FUNC inline double predux<double2>(const double2& a) {
|
||||
return a.x + a.y;
|
||||
}
|
||||
|
||||
// Horizontal max: pairwise reduction, same fmaxf call tree as a tournament.
template<> EIGEN_DEVICE_FUNC inline float predux_max<float4>(const float4& a) {
  const float m01 = fmaxf(a.x, a.y);
  const float m23 = fmaxf(a.z, a.w);
  return fmaxf(m01, m23);
}
template<> EIGEN_DEVICE_FUNC inline double predux_max<double2>(const double2& a) {
  return fmax(a.x, a.y);
}

// Horizontal min, mirroring the max reduction above.
template<> EIGEN_DEVICE_FUNC inline float predux_min<float4>(const float4& a) {
  const float m01 = fminf(a.x, a.y);
  const float m23 = fminf(a.z, a.w);
  return fminf(m01, m23);
}
template<> EIGEN_DEVICE_FUNC inline double predux_min<double2>(const double2& a) {
  return fmin(a.x, a.y);
}

// Horizontal product, accumulated left-to-right to keep the original rounding order.
template<> EIGEN_DEVICE_FUNC inline float predux_mul<float4>(const float4& a) {
  float acc = a.x;
  acc = acc * a.y;
  acc = acc * a.z;
  acc = acc * a.w;
  return acc;
}
template<> EIGEN_DEVICE_FUNC inline double predux_mul<double2>(const double2& a) {
  double acc = a.x;
  acc = acc * a.y;
  return acc;
}
|
||||
|
||||
// Lane-wise absolute value.
template<> EIGEN_DEVICE_FUNC inline float4 pabs<float4>(const float4& a) {
  const float ax = fabsf(a.x);
  const float ay = fabsf(a.y);
  const float az = fabsf(a.z);
  const float aw = fabsf(a.w);
  return make_float4(ax, ay, az, aw);
}
template<> EIGEN_DEVICE_FUNC inline double2 pabs<double2>(const double2& a) {
  const double ax = fabs(a.x);
  const double ay = fabs(a.y);
  return make_double2(ax, ay);
}
|
||||
|
||||
// In-place 4x4 transpose of a block of float4 packets: each strictly-upper
// element (r,c) is exchanged with its mirror (c,r); diagonal lanes stay put.
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<float4,4>& kernel) {
  // row 0 <-> column 0
  float t01 = kernel.packet[0].y;
  kernel.packet[0].y = kernel.packet[1].x;
  kernel.packet[1].x = t01;

  float t02 = kernel.packet[0].z;
  kernel.packet[0].z = kernel.packet[2].x;
  kernel.packet[2].x = t02;

  float t03 = kernel.packet[0].w;
  kernel.packet[0].w = kernel.packet[3].x;
  kernel.packet[3].x = t03;

  // row 1 <-> column 1
  float t12 = kernel.packet[1].z;
  kernel.packet[1].z = kernel.packet[2].y;
  kernel.packet[2].y = t12;

  float t13 = kernel.packet[1].w;
  kernel.packet[1].w = kernel.packet[3].y;
  kernel.packet[3].y = t13;

  // row 2 <-> column 2
  float t23 = kernel.packet[2].w;
  kernel.packet[2].w = kernel.packet[3].z;
  kernel.packet[3].z = t23;
}
|
||||
|
||||
// In-place 2x2 transpose: swap the single off-diagonal pair.
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<double2,2>& kernel) {
  double t01 = kernel.packet[0].y;
  kernel.packet[0].y = kernel.packet[1].x;
  kernel.packet[1].x = t01;
}
|
||||
|
||||
#endif
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
|
||||
#endif // EIGEN_PACKET_MATH_CUDA_H
|
||||
1124
Eigen/src/Core/arch/CUDA/PacketMathHalf.h
Normal file
1124
Eigen/src/Core/arch/CUDA/PacketMathHalf.h
Normal file
File diff suppressed because it is too large
Load Diff
212
Eigen/src/Core/arch/CUDA/TypeCasting.h
Normal file
212
Eigen/src/Core/arch/CUDA/TypeCasting.h
Normal file
@@ -0,0 +1,212 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#ifndef EIGEN_TYPE_CASTING_CUDA_H
|
||||
#define EIGEN_TYPE_CASTING_CUDA_H
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
namespace internal {
|
||||
|
||||
// Scalar functor casting float -> Eigen::half.
// On CUDA device code (sm_30+ with fp16 support) it uses the __float2half
// intrinsic; everywhere else it falls back to the Eigen::half constructor.
template<>
struct scalar_cast_op<float, Eigen::half> {
  EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
  typedef Eigen::half result_type;
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const float& a) const {
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
    return __float2half(a);
#else
    return Eigen::half(a);
#endif
  }
};

// Cost model for the cast above; no vectorized path is advertised here
// (packet casts are declared separately below).
template<>
struct functor_traits<scalar_cast_op<float, Eigen::half> >
{ enum { Cost = NumTraits<float>::AddCost, PacketAccess = false }; };
|
||||
|
||||
|
||||
// Scalar functor casting int -> Eigen::half, going through float in both
// branches (there is no direct int -> half conversion used here).
template<>
struct scalar_cast_op<int, Eigen::half> {
  EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
  typedef Eigen::half result_type;
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const int& a) const {
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
    return __float2half(static_cast<float>(a));
#else
    return Eigen::half(static_cast<float>(a));
#endif
  }
};

// Cost model for the int -> half cast; scalar only.
template<>
struct functor_traits<scalar_cast_op<int, Eigen::half> >
{ enum { Cost = NumTraits<float>::AddCost, PacketAccess = false }; };
|
||||
|
||||
|
||||
// Scalar functor casting Eigen::half -> float. Uses the __half2float
// intrinsic on CUDA device code, otherwise the half's float conversion.
template<>
struct scalar_cast_op<Eigen::half, float> {
  EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
  typedef float result_type;
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator() (const Eigen::half& a) const {
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
    return __half2float(a);
#else
    return static_cast<float>(a);
#endif
  }
};

// Cost model for the half -> float cast; scalar only.
template<>
struct functor_traits<scalar_cast_op<Eigen::half, float> >
{ enum { Cost = NumTraits<float>::AddCost, PacketAccess = false }; };
|
||||
|
||||
|
||||
|
||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
|
||||
|
||||
template <>
|
||||
struct type_casting_traits<Eigen::half, float> {
|
||||
enum {
|
||||
VectorizedCast = 1,
|
||||
SrcCoeffRatio = 2,
|
||||
TgtCoeffRatio = 1
|
||||
};
|
||||
};
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcast<half2, float4>(const half2& a, const half2& b) {
|
||||
float2 r1 = __half22float2(a);
|
||||
float2 r2 = __half22float2(b);
|
||||
return make_float4(r1.x, r1.y, r2.x, r2.y);
|
||||
}
|
||||
|
||||
template <>
|
||||
struct type_casting_traits<float, Eigen::half> {
|
||||
enum {
|
||||
VectorizedCast = 1,
|
||||
SrcCoeffRatio = 1,
|
||||
TgtCoeffRatio = 2
|
||||
};
|
||||
};
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcast<float4, half2>(const float4& a) {
|
||||
// Simply discard the second half of the input
|
||||
return __floats2half2_rn(a.x, a.y);
|
||||
}
|
||||
|
||||
#elif defined EIGEN_VECTORIZE_AVX512
|
||||
template <>
|
||||
struct type_casting_traits<half, float> {
|
||||
enum {
|
||||
VectorizedCast = 1,
|
||||
SrcCoeffRatio = 1,
|
||||
TgtCoeffRatio = 1
|
||||
};
|
||||
};
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet16f pcast<Packet16h, Packet16f>(const Packet16h& a) {
|
||||
return half2float(a);
|
||||
}
|
||||
|
||||
template <>
|
||||
struct type_casting_traits<float, half> {
|
||||
enum {
|
||||
VectorizedCast = 1,
|
||||
SrcCoeffRatio = 1,
|
||||
TgtCoeffRatio = 1
|
||||
};
|
||||
};
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet16h pcast<Packet16f, Packet16h>(const Packet16f& a) {
|
||||
return float2half(a);
|
||||
}
|
||||
|
||||
#elif defined EIGEN_VECTORIZE_AVX
|
||||
|
||||
template <>
|
||||
struct type_casting_traits<Eigen::half, float> {
|
||||
enum {
|
||||
VectorizedCast = 1,
|
||||
SrcCoeffRatio = 1,
|
||||
TgtCoeffRatio = 1
|
||||
};
|
||||
};
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8f pcast<Packet8h, Packet8f>(const Packet8h& a) {
|
||||
return half2float(a);
|
||||
}
|
||||
|
||||
template <>
|
||||
struct type_casting_traits<float, Eigen::half> {
|
||||
enum {
|
||||
VectorizedCast = 1,
|
||||
SrcCoeffRatio = 1,
|
||||
TgtCoeffRatio = 1
|
||||
};
|
||||
};
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8h pcast<Packet8f, Packet8h>(const Packet8f& a) {
|
||||
return float2half(a);
|
||||
}
|
||||
|
||||
// Disable the following code since it's broken on too many platforms / compilers.
|
||||
//#elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC)
|
||||
#elif 0
|
||||
|
||||
template <>
|
||||
struct type_casting_traits<Eigen::half, float> {
|
||||
enum {
|
||||
VectorizedCast = 1,
|
||||
SrcCoeffRatio = 1,
|
||||
TgtCoeffRatio = 1
|
||||
};
|
||||
};
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4h, Packet4f>(const Packet4h& a) {
|
||||
__int64_t a64 = _mm_cvtm64_si64(a.x);
|
||||
Eigen::half h = raw_uint16_to_half(static_cast<unsigned short>(a64));
|
||||
float f1 = static_cast<float>(h);
|
||||
h = raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
|
||||
float f2 = static_cast<float>(h);
|
||||
h = raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
|
||||
float f3 = static_cast<float>(h);
|
||||
h = raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
|
||||
float f4 = static_cast<float>(h);
|
||||
return _mm_set_ps(f4, f3, f2, f1);
|
||||
}
|
||||
|
||||
template <>
|
||||
struct type_casting_traits<float, Eigen::half> {
|
||||
enum {
|
||||
VectorizedCast = 1,
|
||||
SrcCoeffRatio = 1,
|
||||
TgtCoeffRatio = 1
|
||||
};
|
||||
};
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet4h pcast<Packet4f, Packet4h>(const Packet4f& a) {
|
||||
EIGEN_ALIGN16 float aux[4];
|
||||
pstore(aux, a);
|
||||
Eigen::half h0(aux[0]);
|
||||
Eigen::half h1(aux[1]);
|
||||
Eigen::half h2(aux[2]);
|
||||
Eigen::half h3(aux[3]);
|
||||
|
||||
Packet4h result;
|
||||
result.x = _mm_set_pi16(h3.x, h2.x, h1.x, h0.x);
|
||||
return result;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_TYPE_CASTING_CUDA_H
|
||||
@@ -1,655 +0,0 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2007 Julien Pommier
|
||||
// Copyright (C) 2014 Pedro Gonnet (pedro.gonnet@gmail.com)
|
||||
// Copyright (C) 2009-2019 Gael Guennebaud <gael.guennebaud@inria.fr>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
/* The exp and log functions of this file initially come from
|
||||
* Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
|
||||
*/
|
||||
|
||||
#ifndef EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_H
|
||||
#define EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_H
|
||||
|
||||
namespace Eigen {
|
||||
namespace internal {
|
||||
|
||||
// Decompose each lane of a into a mantissa in [0.5,1) (returned) and an
// integral exponent written to `exponent` (as a float packet), so that
// a = mantissa * 2^exponent.
template<typename Packet> EIGEN_STRONG_INLINE Packet
pfrexp_float(const Packet& a, Packet& exponent) {
  typedef typename unpacket_traits<Packet>::integer_packet PacketI;
  const Packet bias          = pset1<Packet>(126.0f);
  const Packet half          = pset1<Packet>(0.5f);
  // Mask keeping everything but the 8 exponent bits of an IEEE-754 float.
  const Packet mantissa_mask = pset1frombits<Packet>(~0x7f800000u);
  // Extract the biased exponent field and remove the bias (126, not 127,
  // because the mantissa is normalized to [0.5,1) rather than [1,2)).
  PacketI biased_exp = pshiftright<23>(preinterpret<PacketI>(a));
  exponent = psub(pcast<PacketI,Packet>(biased_exp), bias);
  // Clear the exponent bits and OR in 0.5's exponent to land in [0.5,1).
  return por(pand(a, mantissa_mask), half);
}
|
||||
|
||||
// Compute a * 2^exponent by constructing the float 2^exponent bit-wise:
// bias the exponent, shift it into the exponent field, and multiply.
template<typename Packet> EIGEN_STRONG_INLINE Packet
pldexp_float(Packet a, Packet exponent)
{
  typedef typename unpacket_traits<Packet>::integer_packet PacketI;
  const Packet exp_bias = pset1<Packet>(127.f);
  PacketI biased = pcast<Packet,PacketI>(padd(exponent, exp_bias));
  return pmul(a, preinterpret<Packet>(pshiftleft<23>(biased)));
}
|
||||
|
||||
// Natural logarithm
|
||||
// Computes log(x) as log(2^e * m) = C*e + log(m), where the constant C =log(2)
|
||||
// and m is in the range [sqrt(1/2),sqrt(2)). In this range, the logarithm can
|
||||
// be easily approximated by a polynomial centered on m=1 for stability.
|
||||
// TODO(gonnet): Further reduce the interval allowing for lower-degree
|
||||
// polynomial interpolants -> ... -> profit!
|
||||
// Vectorized single-precision natural logarithm (Cephes-style).
// Writes log(x) as C*e + log(m) with x = 2^e * m, m in [sqrt(1/2),sqrt(2)),
// then approximates log(m) by a degree-8 polynomial around m=1.
template <typename Packet>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
EIGEN_UNUSED
Packet plog_float(const Packet _x)
{
  Packet x = _x;

  const Packet cst_1 = pset1<Packet>(1.0f);
  const Packet cst_half = pset1<Packet>(0.5f);
  // The smallest non denormalized float number.
  const Packet cst_min_norm_pos = pset1frombits<Packet>( 0x00800000u);
  const Packet cst_minus_inf = pset1frombits<Packet>( 0xff800000u);
  const Packet cst_pos_inf = pset1frombits<Packet>( 0x7f800000u);

  // Polynomial coefficients.
  const Packet cst_cephes_SQRTHF = pset1<Packet>(0.707106781186547524f);
  const Packet cst_cephes_log_p0 = pset1<Packet>(7.0376836292E-2f);
  const Packet cst_cephes_log_p1 = pset1<Packet>(-1.1514610310E-1f);
  const Packet cst_cephes_log_p2 = pset1<Packet>(1.1676998740E-1f);
  const Packet cst_cephes_log_p3 = pset1<Packet>(-1.2420140846E-1f);
  const Packet cst_cephes_log_p4 = pset1<Packet>(+1.4249322787E-1f);
  const Packet cst_cephes_log_p5 = pset1<Packet>(-1.6668057665E-1f);
  const Packet cst_cephes_log_p6 = pset1<Packet>(+2.0000714765E-1f);
  const Packet cst_cephes_log_p7 = pset1<Packet>(-2.4999993993E-1f);
  const Packet cst_cephes_log_p8 = pset1<Packet>(+3.3333331174E-1f);
  const Packet cst_cephes_log_q1 = pset1<Packet>(-2.12194440e-4f);
  const Packet cst_cephes_log_q2 = pset1<Packet>(0.693359375f);

  // Truncate input values to the minimum positive normal.
  x = pmax(x, cst_min_norm_pos);

  Packet e;
  // extract significant in the range [0.5,1) and exponent
  x = pfrexp(x,e);

  // part2: Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2))
  // and shift by -1. The values are then centered around 0, which improves
  // the stability of the polynomial evaluation.
  //   if( x < SQRTHF ) {
  //     e -= 1;
  //     x = x + x - 1.0;
  //   } else { x = x - 1.0; }
  Packet mask = pcmp_lt(x, cst_cephes_SQRTHF);
  Packet tmp = pand(x, mask);
  x = psub(x, cst_1);
  e = psub(e, pand(cst_1, mask));
  x = padd(x, tmp);

  Packet x2 = pmul(x, x);
  Packet x3 = pmul(x2, x);

  // Evaluate the polynomial approximant of degree 8 in three parts, probably
  // to improve instruction-level parallelism.
  Packet y, y1, y2;
  y = pmadd(cst_cephes_log_p0, x, cst_cephes_log_p1);
  y1 = pmadd(cst_cephes_log_p3, x, cst_cephes_log_p4);
  y2 = pmadd(cst_cephes_log_p6, x, cst_cephes_log_p7);
  y = pmadd(y, x, cst_cephes_log_p2);
  y1 = pmadd(y1, x, cst_cephes_log_p5);
  y2 = pmadd(y2, x, cst_cephes_log_p8);
  y = pmadd(y, x3, y1);
  y = pmadd(y, x3, y2);
  y = pmul(y, x3);

  // Add the logarithm of the exponent back to the result of the interpolation.
  // log(2) is applied in two parts (q1 + q2) for extra precision.
  y1 = pmul(e, cst_cephes_log_q1);
  tmp = pmul(x2, cst_half);
  y = padd(y, y1);
  x = psub(x, tmp);
  y2 = pmul(e, cst_cephes_log_q2);
  x = padd(x, y);
  x = padd(x, y2);

  Packet invalid_mask = pcmp_lt_or_nan(_x, pzero(_x));
  Packet iszero_mask = pcmp_eq(_x,pzero(_x));
  Packet pos_inf_mask = pcmp_eq(_x,cst_pos_inf);
  // Filter out invalid inputs, i.e.:
  //  - negative arg will be NAN
  //  - 0 will be -INF
  //  - +INF will be +INF
  return pselect(iszero_mask, cst_minus_inf,
                 por(pselect(pos_inf_mask,cst_pos_inf,x), invalid_mask));
}
|
||||
|
||||
/** \internal \returns log(1 + x) computed using W. Kahan's formula.
|
||||
See: http://www.plunk.org/~hatch/rightway.php
|
||||
*/
|
||||
template<typename Packet>
|
||||
Packet generic_plog1p(const Packet& x)
|
||||
{
|
||||
typedef typename unpacket_traits<Packet>::type ScalarType;
|
||||
const Packet one = pset1<Packet>(ScalarType(1));
|
||||
Packet xp1 = padd(x, one);
|
||||
Packet small_mask = pcmp_eq(xp1, one);
|
||||
Packet log1 = plog(xp1);
|
||||
Packet inf_mask = pcmp_eq(xp1, log1);
|
||||
Packet log_large = pmul(x, pdiv(log1, psub(xp1, one)));
|
||||
return pselect(por(small_mask, inf_mask), x, log_large);
|
||||
}
|
||||
|
||||
/** \internal \returns exp(x)-1 computed using W. Kahan's formula.
|
||||
See: http://www.plunk.org/~hatch/rightway.php
|
||||
*/
|
||||
template<typename Packet>
|
||||
Packet generic_expm1(const Packet& x)
|
||||
{
|
||||
typedef typename unpacket_traits<Packet>::type ScalarType;
|
||||
const Packet one = pset1<Packet>(ScalarType(1));
|
||||
const Packet neg_one = pset1<Packet>(ScalarType(-1));
|
||||
Packet u = pexp(x);
|
||||
Packet one_mask = pcmp_eq(u, one);
|
||||
Packet u_minus_one = psub(u, one);
|
||||
Packet neg_one_mask = pcmp_eq(u_minus_one, neg_one);
|
||||
Packet logu = plog(u);
|
||||
// The following comparison is to catch the case where
|
||||
// exp(x) = +inf. It is written in this way to avoid having
|
||||
// to form the constant +inf, which depends on the packet
|
||||
// type.
|
||||
Packet pos_inf_mask = pcmp_eq(logu, u);
|
||||
Packet expm1 = pmul(u_minus_one, pdiv(x, logu));
|
||||
expm1 = pselect(pos_inf_mask, u, expm1);
|
||||
return pselect(one_mask,
|
||||
x,
|
||||
pselect(neg_one_mask,
|
||||
neg_one,
|
||||
expm1));
|
||||
}
|
||||
|
||||
|
||||
// Exponential function. Works by writing "x = m*log(2) + r" where
|
||||
// "m = floor(x/log(2)+1/2)" and "r" is the remainder. The result is then
|
||||
// "exp(x) = 2^m*exp(r)" where exp(r) is in the range [-1,1).
|
||||
// Vectorized single-precision exp (Cephes-style).
// Writes x = m*log(2) + r with m = floor(x/log(2) + 1/2), evaluates a
// degree-5 polynomial for exp(r), then scales by 2^m via pldexp.
template <typename Packet>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
EIGEN_UNUSED
Packet pexp_float(const Packet _x)
{
  const Packet cst_1 = pset1<Packet>(1.0f);
  const Packet cst_half = pset1<Packet>(0.5f);
  // Clamp bounds: beyond these, float exp overflows/underflows.
  const Packet cst_exp_hi = pset1<Packet>( 88.3762626647950f);
  const Packet cst_exp_lo = pset1<Packet>(-88.3762626647949f);

  const Packet cst_cephes_LOG2EF = pset1<Packet>(1.44269504088896341f);
  const Packet cst_cephes_exp_p0 = pset1<Packet>(1.9875691500E-4f);
  const Packet cst_cephes_exp_p1 = pset1<Packet>(1.3981999507E-3f);
  const Packet cst_cephes_exp_p2 = pset1<Packet>(8.3334519073E-3f);
  const Packet cst_cephes_exp_p3 = pset1<Packet>(4.1665795894E-2f);
  const Packet cst_cephes_exp_p4 = pset1<Packet>(1.6666665459E-1f);
  const Packet cst_cephes_exp_p5 = pset1<Packet>(5.0000001201E-1f);

  // Clamp x.
  Packet x = pmax(pmin(_x, cst_exp_hi), cst_exp_lo);

  // Express exp(x) as exp(m*ln(2) + r), start by extracting
  // m = floor(x/ln(2) + 0.5).
  Packet m = pfloor(pmadd(x, cst_cephes_LOG2EF, cst_half));

  // Get r = x - m*ln(2). If no FMA instructions are available, m*ln(2) is
  // subtracted out in two parts, m*C1+m*C2 = m*ln(2), to avoid accumulating
  // truncation errors.
  Packet r;
#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
  const Packet cst_nln2 = pset1<Packet>(-0.6931471805599453f);
  r = pmadd(m, cst_nln2, x);
#else
  const Packet cst_cephes_exp_C1 = pset1<Packet>(0.693359375f);
  const Packet cst_cephes_exp_C2 = pset1<Packet>(-2.12194440e-4f);
  r = psub(x, pmul(m, cst_cephes_exp_C1));
  r = psub(r, pmul(m, cst_cephes_exp_C2));
#endif

  Packet r2 = pmul(r, r);

  // TODO(gonnet): Split into odd/even polynomials and try to exploit
  // instruction-level parallelism.
  Packet y = cst_cephes_exp_p0;
  y = pmadd(y, r, cst_cephes_exp_p1);
  y = pmadd(y, r, cst_cephes_exp_p2);
  y = pmadd(y, r, cst_cephes_exp_p3);
  y = pmadd(y, r, cst_cephes_exp_p4);
  y = pmadd(y, r, cst_cephes_exp_p5);
  y = pmadd(y, r2, r);
  y = padd(y, cst_1);

  // Return 2^m * exp(r).
  // The max with _x propagates NaN and keeps exp(+inf) = +inf despite clamping.
  return pmax(pldexp(y,m), _x);
}
|
||||
|
||||
// make it the default path for scalar float
|
||||
template<>
|
||||
EIGEN_DEVICE_FUNC inline float pexp(const float& a) { return pexp_float(a); }
|
||||
|
||||
// Vectorized double-precision exp (Cephes-style).
// Writes x = n*log(2) + g, approximates exp(g) with a rational interpolant
// P(g)/Q(g) in even powers of g, then scales by 2^n via pldexp.
template <typename Packet>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
EIGEN_UNUSED
Packet pexp_double(const Packet _x)
{
  Packet x = _x;

  const Packet cst_1 = pset1<Packet>(1.0);
  const Packet cst_2 = pset1<Packet>(2.0);
  const Packet cst_half = pset1<Packet>(0.5);

  // Clamp bounds: beyond these, double exp overflows/underflows.
  const Packet cst_exp_hi = pset1<Packet>(709.437);
  const Packet cst_exp_lo = pset1<Packet>(-709.436139303);

  const Packet cst_cephes_LOG2EF = pset1<Packet>(1.4426950408889634073599);
  const Packet cst_cephes_exp_p0 = pset1<Packet>(1.26177193074810590878e-4);
  const Packet cst_cephes_exp_p1 = pset1<Packet>(3.02994407707441961300e-2);
  const Packet cst_cephes_exp_p2 = pset1<Packet>(9.99999999999999999910e-1);
  const Packet cst_cephes_exp_q0 = pset1<Packet>(3.00198505138664455042e-6);
  const Packet cst_cephes_exp_q1 = pset1<Packet>(2.52448340349684104192e-3);
  const Packet cst_cephes_exp_q2 = pset1<Packet>(2.27265548208155028766e-1);
  const Packet cst_cephes_exp_q3 = pset1<Packet>(2.00000000000000000009e0);
  const Packet cst_cephes_exp_C1 = pset1<Packet>(0.693145751953125);
  const Packet cst_cephes_exp_C2 = pset1<Packet>(1.42860682030941723212e-6);

  Packet tmp, fx;

  // clamp x
  x = pmax(pmin(x, cst_exp_hi), cst_exp_lo);
  // Express exp(x) as exp(g + n*log(2)).
  fx = pmadd(cst_cephes_LOG2EF, x, cst_half);

  // Get the integer modulus of log(2), i.e. the "n" described above.
  fx = pfloor(fx);

  // Get the remainder modulo log(2), i.e. the "g" described above. Subtract
  // n*log(2) out in two steps, i.e. n*C1 + n*C2, C1+C2=log2 to get the last
  // digits right.
  tmp = pmul(fx, cst_cephes_exp_C1);
  Packet z = pmul(fx, cst_cephes_exp_C2);
  x = psub(x, tmp);
  x = psub(x, z);

  Packet x2 = pmul(x, x);

  // Evaluate the numerator polynomial of the rational interpolant.
  Packet px = cst_cephes_exp_p0;
  px = pmadd(px, x2, cst_cephes_exp_p1);
  px = pmadd(px, x2, cst_cephes_exp_p2);
  px = pmul(px, x);

  // Evaluate the denominator polynomial of the rational interpolant.
  Packet qx = cst_cephes_exp_q0;
  qx = pmadd(qx, x2, cst_cephes_exp_q1);
  qx = pmadd(qx, x2, cst_cephes_exp_q2);
  qx = pmadd(qx, x2, cst_cephes_exp_q3);

  // I don't really get this bit, copied from the SSE2 routines, so...
  // TODO(gonnet): Figure out what is going on here, perhaps find a better
  // rational interpolant?
  x = pdiv(px, psub(qx, px));
  x = pmadd(cst_2, x, cst_1);

  // Construct the result 2^n * exp(g) = e * x. The max is used to catch
  // non-finite values in the input.
  return pmax(pldexp(x,fx), _x);
}
|
||||
|
||||
// make it the default path for scalar double
|
||||
template<>
|
||||
EIGEN_DEVICE_FUNC inline double pexp(const double& a) { return pexp_double(a); }
|
||||
|
||||
// The following code is inspired by the following stack-overflow answer:
|
||||
// https://stackoverflow.com/questions/30463616/payne-hanek-algorithm-implementation-in-c/30465751#30465751
|
||||
// It has been largely optimized:
|
||||
// - By-pass calls to frexp.
|
||||
// - Aligned loads of required 96 bits of 2/pi. This is accomplished by
|
||||
// (1) balancing the mantissa and exponent to the required bits of 2/pi are
|
||||
// aligned on 8-bits, and (2) replicating the storage of the bits of 2/pi.
|
||||
// - Avoid a branch in rounding and extraction of the remaining fractional part.
|
||||
// Overall, I measured a speed up higher than x2 on x86-64.
|
||||
// Payne-Hanek argument reduction for huge float inputs to sin/cos.
// Computes r = xf - q*pi/2 with q = round(xf * 2/pi); returns r and stores
// q in *quadrant. The product xf * 2/pi is carried out in 2.62-bit
// fixed-point arithmetic against precomputed bits of 2/pi.
// Precondition (from the caller in psincos_float): xf is finite, positive
// and large (>= huge_th), so its exponent field makes e >= 1 below.
inline float trig_reduce_huge (float xf, int *quadrant)
{
  using Eigen::numext::int32_t;
  using Eigen::numext::uint32_t;
  using Eigen::numext::int64_t;
  using Eigen::numext::uint64_t;

  const double pio2_62 = 3.4061215800865545e-19; // pi/2 * 2^-62
  const uint64_t zero_dot_five = uint64_t(1) << 61; // 0.5 in 2.62-bit fixed-point format

  // 192 bits of 2/pi for Payne-Hanek reduction
  // Bits are introduced by packet of 8 to enable aligned reads.
  static const uint32_t two_over_pi [] =
  {
    0x00000028, 0x000028be, 0x0028be60, 0x28be60db,
    0xbe60db93, 0x60db9391, 0xdb939105, 0x9391054a,
    0x91054a7f, 0x054a7f09, 0x4a7f09d5, 0x7f09d5f4,
    0x09d5f47d, 0xd5f47d4d, 0xf47d4d37, 0x7d4d3770,
    0x4d377036, 0x377036d8, 0x7036d8a5, 0x36d8a566,
    0xd8a5664f, 0xa5664f10, 0x664f10e4, 0x4f10e410,
    0x10e41000, 0xe4100000
  };

  uint32_t xi = numext::as_uint(xf);
  // Below, -118 = -126 + 8.
  //  -126 is to get the exponent,
  //  +8 is to enable alignment of 2/pi's bits on 8 bits.
  // This is possible because the fractional part of x has only 24 meaningful bits.
  uint32_t e = (xi >> 23) - 118;
  // Extract the mantissa and shift it to align it wrt the exponent
  xi = ((xi & 0x007fffffu)| 0x00800000u) << (e & 0x7);

  uint32_t i = e >> 3;
  // Three aligned 32-bit words covering the 96 bits of 2/pi that matter
  // for this exponent.
  uint32_t twoopi_1 = two_over_pi[i-1];
  uint32_t twoopi_2 = two_over_pi[i+3];
  uint32_t twoopi_3 = two_over_pi[i+7];

  // Compute x * 2/pi in 2.62-bit fixed-point format.
  uint64_t p;
  p = uint64_t(xi) * twoopi_3;
  p = uint64_t(xi) * twoopi_2 + (p >> 32);
  p = (uint64_t(xi * twoopi_1) << 32) + p;

  // Round to nearest: add 0.5 and extract integral part.
  uint64_t q = (p + zero_dot_five) >> 62;
  *quadrant = int(q);
  // Now it remains to compute "r = x - q*pi/2" with high accuracy,
  // since we have p=x/(pi/2) with high accuracy, we can more efficiently compute r as:
  //   r = (p-q)*pi/2,
  // where the product can be carried out with sufficient accuracy using double precision.
  p -= q<<62;
  return float(double(int64_t(p)) * pio2_62);
}
|
||||
|
||||
// Shared vectorized sin/cos kernel for single precision.
// ComputeSine selects sin (true) or cos (false). The input is reduced to
// [-pi/4, pi/4] by subtracting y*pi/2 (y = nearest integer to x*2/pi) in
// extended precision; huge finite inputs go through trig_reduce_huge.
// Both the sin and cos minimax polynomials are evaluated and the correct
// one is selected per lane, then the quadrant-derived sign is applied.
template<bool ComputeSine,typename Packet>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
EIGEN_UNUSED
#if EIGEN_GNUC_AT_LEAST(4,4) && EIGEN_COMP_GNUC_STRICT
__attribute__((optimize("-fno-unsafe-math-optimizations")))
#endif
Packet psincos_float(const Packet& _x)
{
// Workaround -ffast-math aggressive optimizations
// See bug 1674
#if EIGEN_COMP_CLANG && defined(EIGEN_VECTORIZE_SSE)
#define EIGEN_SINCOS_DONT_OPT(X) __asm__ ("" : "+x" (X));
#else
#define EIGEN_SINCOS_DONT_OPT(X)
#endif

  typedef typename unpacket_traits<Packet>::integer_packet PacketI;

  const Packet cst_2oPI = pset1<Packet>(0.636619746685028076171875f); // 2/PI
  const Packet cst_rounding_magic = pset1<Packet>(12582912); // 2^23 for rounding
  const PacketI csti_1 = pset1<PacketI>(1);
  const Packet cst_sign_mask = pset1frombits<Packet>(0x80000000u);

  Packet x = pabs(_x);

  // Scale x by 2/Pi to find x's octant.
  Packet y = pmul(x, cst_2oPI);

  // Rounding trick: adding 2^23 forces the fraction bits out of a float's
  // mantissa, leaving the rounded integer in the low bits.
  Packet y_round = padd(y, cst_rounding_magic);
  EIGEN_SINCOS_DONT_OPT(y_round)
  PacketI y_int = preinterpret<PacketI>(y_round); // last 23 digits represent integer (if abs(x)<2^24)
  y = psub(y_round, cst_rounding_magic); // nearest integer to x*2/pi

  // Reduce x by y octants to get: -Pi/4 <= x <= +Pi/4
  // using "Extended precision modular arithmetic"
#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD)
  // This version requires true FMA for high accuracy
  // It provides a max error of 1ULP up to (with absolute_error < 5.9605e-08):
  const float huge_th = ComputeSine ? 117435.992f : 71476.0625f;
  x = pmadd(y, pset1<Packet>(-1.57079601287841796875f), x);
  x = pmadd(y, pset1<Packet>(-3.1391647326017846353352069854736328125e-07f), x);
  x = pmadd(y, pset1<Packet>(-5.390302529957764765544681040410068817436695098876953125e-15f), x);
#else
  // Without true FMA, the previous set of coefficients maintain 1ULP accuracy
  // up to x<15.7 (for sin), but accuracy is immediately lost for x>15.7.
  // We thus use one more iteration to maintain 2ULPs up to reasonably large inputs.

  // The following set of coefficients maintain 1ULP up to 9.43 and 14.16 for sin and cos respectively.
  // and 2 ULP up to:
  const float huge_th = ComputeSine ? 25966.f : 18838.f;
  x = pmadd(y, pset1<Packet>(-1.5703125), x); // = 0xbfc90000
  EIGEN_SINCOS_DONT_OPT(x)
  x = pmadd(y, pset1<Packet>(-0.000483989715576171875), x); // = 0xb9fdc000
  EIGEN_SINCOS_DONT_OPT(x)
  x = pmadd(y, pset1<Packet>(1.62865035235881805419921875e-07), x); // = 0x342ee000
  x = pmadd(y, pset1<Packet>(5.5644315544167710640977020375430583953857421875e-11), x); // = 0x2e74b9ee

  // For the record, the following set of coefficients maintain 2ULP up
  // to a slightly larger range:
  // const float huge_th = ComputeSine ? 51981.f : 39086.125f;
  // but it slightly fails to maintain 1ULP for two values of sin below pi.
  // x = pmadd(y, pset1<Packet>(-3.140625/2.), x);
  // x = pmadd(y, pset1<Packet>(-0.00048351287841796875), x);
  // x = pmadd(y, pset1<Packet>(-3.13855707645416259765625e-07), x);
  // x = pmadd(y, pset1<Packet>(-6.0771006282767103812147979624569416046142578125e-11), x);

  // For the record, with only 3 iterations it is possible to maintain
  // 1 ULP up to 3PI (maybe more) and 2ULP up to 255.
  // The coefficients are: 0xbfc90f80, 0xb7354480, 0x2e74b9ee
#endif

  // Slow scalar path for lanes beyond the fast reduction's accuracy range:
  // spill to arrays, reduce each huge finite lane with Payne-Hanek, reload.
  if(predux_any(pcmp_le(pset1<Packet>(huge_th),pabs(_x))))
  {
    const int PacketSize = unpacket_traits<Packet>::size;
    EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) float vals[PacketSize];
    EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) float x_cpy[PacketSize];
    EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) int y_int2[PacketSize];
    pstoreu(vals, pabs(_x));
    pstoreu(x_cpy, x);
    pstoreu(y_int2, y_int);
    for(int k=0; k<PacketSize;++k)
    {
      float val = vals[k];
      if(val>=huge_th && (numext::isfinite)(val))
        x_cpy[k] = trig_reduce_huge(val,&y_int2[k]);
    }
    x = ploadu<Packet>(x_cpy);
    y_int = ploadu<PacketI>(y_int2);
  }

  // Compute the sign to apply to the polynomial.
  //  sin: sign = second_bit(y_int) xor signbit(_x)
  //  cos: sign = second_bit(y_int+1)
  Packet sign_bit = ComputeSine ? pxor(_x, preinterpret<Packet>(pshiftleft<30>(y_int)))
                                : preinterpret<Packet>(pshiftleft<30>(padd(y_int,csti_1)));
  sign_bit = pand(sign_bit, cst_sign_mask); // clear all but left most bit

  // Get the polynomial selection mask from the second bit of y_int
  // We'll calculate both (sin and cos) polynomials and then select from the two.
  Packet poly_mask = preinterpret<Packet>(pcmp_eq(pand(y_int, csti_1), pzero(y_int)));

  Packet x2 = pmul(x,x);

  // Evaluate the cos(x) polynomial. (-Pi/4 <= x <= Pi/4)
  Packet y1 = pset1<Packet>(2.4372266125283204019069671630859375e-05f);
  y1 = pmadd(y1, x2, pset1<Packet>(-0.00138865201734006404876708984375f ));
  y1 = pmadd(y1, x2, pset1<Packet>(0.041666619479656219482421875f ));
  y1 = pmadd(y1, x2, pset1<Packet>(-0.5f));
  y1 = pmadd(y1, x2, pset1<Packet>(1.f));

  // Evaluate the sin(x) polynomial. (Pi/4 <= x <= Pi/4)
  // octave/matlab code to compute those coefficients:
  //    x = (0:0.0001:pi/4)';
  //    A = [x.^3 x.^5 x.^7];
  //    w = ((1.-(x/(pi/4)).^2).^5)*2000+1;         # weights trading relative accuracy
  //    c = (A'*diag(w)*A)\(A'*diag(w)*(sin(x)-x)); # weighted LS, linear coeff forced to 1
  //    printf('%.64f\n %.64f\n%.64f\n', c(3), c(2), c(1))
  //
  Packet y2 = pset1<Packet>(-0.0001959234114083702898469196984621021329076029360294342041015625f);
  y2 = pmadd(y2, x2, pset1<Packet>( 0.0083326873655616851693794799871284340042620897293090820312500000f));
  y2 = pmadd(y2, x2, pset1<Packet>(-0.1666666203982298255503735617821803316473960876464843750000000000f));
  y2 = pmul(y2, x2);
  y2 = pmadd(y2, x, x);

  // Select the correct result from the two polynomials.
  y = ComputeSine ? pselect(poly_mask,y2,y1)
                  : pselect(poly_mask,y1,y2);

  // Update the sign and filter huge inputs
  return pxor(y, sign_bit);

#undef EIGEN_SINCOS_DONT_OPT
}
|
||||
|
||||
template<typename Packet>
|
||||
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
||||
EIGEN_UNUSED
|
||||
Packet psin_float(const Packet& x)
|
||||
{
|
||||
return psincos_float<true>(x);
|
||||
}
|
||||
|
||||
template<typename Packet>
|
||||
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
||||
EIGEN_UNUSED
|
||||
Packet pcos_float(const Packet& x)
|
||||
{
|
||||
return psincos_float<false>(x);
|
||||
}
|
||||
|
||||
/* polevl (modified for Eigen)
|
||||
*
|
||||
* Evaluate polynomial
|
||||
*
|
||||
*
|
||||
*
|
||||
* SYNOPSIS:
|
||||
*
|
||||
* int N;
|
||||
* Scalar x, y, coef[N+1];
|
||||
*
|
||||
* y = polevl<decltype(x), N>( x, coef);
|
||||
*
|
||||
*
|
||||
*
|
||||
* DESCRIPTION:
|
||||
*
|
||||
* Evaluates polynomial of degree N:
|
||||
*
|
||||
* 2 N
|
||||
* y = C + C x + C x +...+ C x
|
||||
* 0 1 2 N
|
||||
*
|
||||
* Coefficients are stored in reverse order:
|
||||
*
|
||||
* coef[0] = C , ..., coef[N] = C .
|
||||
* N 0
|
||||
*
|
||||
* The function p1evl() assumes that coef[N] = 1.0 and is
|
||||
* omitted from the array. Its calling arguments are
|
||||
* otherwise the same as polevl().
|
||||
*
|
||||
*
|
||||
* The Eigen implementation is templatized. For best speed, store
|
||||
* coef as a const array (constexpr), e.g.
|
||||
*
|
||||
* const double coef[] = {1.0, 2.0, 3.0, ...};
|
||||
*
|
||||
*/
|
||||
template <typename Packet, int N>
|
||||
struct ppolevl {
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x, const typename unpacket_traits<Packet>::type coeff[]) {
|
||||
EIGEN_STATIC_ASSERT((N > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
|
||||
return pmadd(ppolevl<Packet, N-1>::run(x, coeff), x, pset1<Packet>(coeff[N]));
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Packet>
|
||||
struct ppolevl<Packet, 0> {
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x, const typename unpacket_traits<Packet>::type coeff[]) {
|
||||
EIGEN_UNUSED_VARIABLE(x);
|
||||
return pset1<Packet>(coeff[0]);
|
||||
}
|
||||
};
|
||||
|
||||
/* chbevl (modified for Eigen)
|
||||
*
|
||||
* Evaluate Chebyshev series
|
||||
*
|
||||
*
|
||||
*
|
||||
* SYNOPSIS:
|
||||
*
|
||||
 * int N;
 * Scalar x, y, coef[N];
 *
 * y = pchebevl<Packet, N>::run( x, coef );
|
||||
*
|
||||
*
|
||||
*
|
||||
* DESCRIPTION:
|
||||
*
|
||||
* Evaluates the series
|
||||
*
|
||||
* N-1
|
||||
* - '
|
||||
* y = > coef[i] T (x/2)
|
||||
* - i
|
||||
* i=0
|
||||
*
|
||||
* of Chebyshev polynomials Ti at argument x/2.
|
||||
*
|
||||
* Coefficients are stored in reverse order, i.e. the zero
|
||||
* order term is last in the array. Note N is the number of
|
||||
* coefficients, not the order.
|
||||
*
|
||||
* If coefficients are for the interval a to b, x must
|
||||
* have been transformed to x -> 2(2x - b - a)/(b-a) before
|
||||
* entering the routine. This maps x from (a, b) to (-1, 1),
|
||||
* over which the Chebyshev polynomials are defined.
|
||||
*
|
||||
* If the coefficients are for the inverted interval, in
|
||||
* which (a, b) is mapped to (1/b, 1/a), the transformation
|
||||
* required is x -> 2(2ab/x - b - a)/(b-a). If b is infinity,
|
||||
* this becomes x -> 4a/x - 1.
|
||||
*
|
||||
*
|
||||
*
|
||||
* SPEED:
|
||||
*
|
||||
* Taking advantage of the recurrence properties of the
|
||||
* Chebyshev polynomials, the routine requires one more
|
||||
* addition per loop than evaluating a nested polynomial of
|
||||
* the same degree.
|
||||
*
|
||||
*/
|
||||
|
||||
template <typename Packet, int N>
|
||||
struct pchebevl {
|
||||
EIGEN_DEVICE_FUNC
|
||||
static EIGEN_STRONG_INLINE Packet run(Packet x, const typename unpacket_traits<Packet>::type coef[]) {
|
||||
typedef typename unpacket_traits<Packet>::type Scalar;
|
||||
Packet b0 = pset1<Packet>(coef[0]);
|
||||
Packet b1 = pset1<Packet>(static_cast<Scalar>(0.f));
|
||||
Packet b2;
|
||||
|
||||
for (int i = 1; i < N; i++) {
|
||||
b2 = b1;
|
||||
b1 = b0;
|
||||
b0 = psub(pmadd(x, b1, pset1<Packet>(coef[i])), b2);
|
||||
}
|
||||
|
||||
return pmul(pset1<Packet>(static_cast<Scalar>(0.5f)), psub(b0, b2));
|
||||
}
|
||||
};
|
||||
|
||||
} // end namespace internal
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_H
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user