Fix undefined behavior in PPC load.

VSX vec_xl is Causing a bunch of test failures and failing `-fsanitize=undefined` with g++. Removing the instruction allows tests to pass and eliminates the warning.
Fix CUDA clang again with new C++11 usages
2026-04-10 11:34:33 +08:00 · 2025-03-14 07:58:36 -07:00 · 2025-03-14 07:58:29 -07:00 · 2025-03-13 15:01:33 -07:00 · 2025-03-13 21:49:24 +00:00 · 2025-03-13 11:13:35 -07:00
486 changed files with 18238 additions and 8545 deletions
--- a/.clang-format
+++ b/.clang-format
@@ -0,0 +1,19 @@
+---
+BasedOnStyle: Google
+ColumnLimit:  120
+---
+Language:     Cpp
+BasedOnStyle: Google
+ColumnLimit:  120
+StatementMacros:
+  - EIGEN_STATIC_ASSERT
+  - EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
+  - EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
+SortIncludes: false
+AttributeMacros:
+- EIGEN_STRONG_INLINE
+- EIGEN_ALWAYS_INLINE
+- EIGEN_DEVICE_FUNC
+- EIGEN_DONT_INLINE
+- EIGEN_DEPRECATED
+- EIGEN_UNUSED
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,3 @@
-syntax: glob
 qrc_*cxx
 *.orig
 *.pyc
@@ -13,7 +12,7 @@ core
 core.*
 *.bak
 *~
-build*
+*build*
 *.moc.*
 *.moc
 ui_*
@@ -28,7 +27,11 @@ activity.png
 *.rej
 log
 patch
+*.patch
 a
 a.*
 lapack/testing
 lapack/reference
+.*project
+.settings
+Makefile
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -0,0 +1,34 @@
+# This file is part of Eigen, a lightweight C++ template library
+# for linear algebra.
+#
+# Copyright (C) 2023, The Eigen Authors
+#
+# This Source Code Form is subject to the terms of the Mozilla
+# Public License v. 2.0. If a copy of the MPL was not distributed
+# with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+stages:
+  - checkformat
+  - build
+  - test
+  - deploy
+
+variables:
+  # CMake build directory.
+  EIGEN_CI_BUILDDIR: .build
+  # Specify the CMake build target.
+  EIGEN_CI_BUILD_TARGET: ""
+  # If a test regex is specified, that will be selected.
+  # Otherwise, we will try a label if specified.
+  EIGEN_CI_CTEST_REGEX: ""
+  EIGEN_CI_CTEST_LABEL: ""
+  EIGEN_CI_CTEST_ARGS: ""
+
+include:
+  - "/ci/checkformat.gitlab-ci.yml"
+  - "/ci/common.gitlab-ci.yml"
+  - "/ci/build.linux.gitlab-ci.yml"
+  - "/ci/build.windows.gitlab-ci.yml"
+  - "/ci/test.linux.gitlab-ci.yml"
+  - "/ci/test.windows.gitlab-ci.yml"
+  - "/ci/deploy.gitlab-ci.yml"
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,7 +1,7 @@
-project(Eigen3)
-
 cmake_minimum_required(VERSION 2.8.5)

+project(Eigen3)
+
 # guard against in-source builds

 if(${CMAKE_SOURCE_DIR} STREQUAL ${CMAKE_BINARY_DIR})
@@ -19,14 +19,6 @@ if (NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE "Release")
 endif()

-string(TOLOWER "${CMAKE_BUILD_TYPE}" cmake_build_type_tolower)
-if(    NOT cmake_build_type_tolower STREQUAL "debug"
-   AND NOT cmake_build_type_tolower STREQUAL "release"
-   AND NOT cmake_build_type_tolower STREQUAL "relwithdebinfo")
-  message(FATAL_ERROR "Unknown build type \"${CMAKE_BUILD_TYPE}\". Allowed values are Debug, Release, RelWithDebInfo (case-insensitive).")
-endif()
-
-
 #############################################################################
 # retrieve version infomation                                               #
 #############################################################################
@@ -41,10 +33,13 @@ string(REGEX MATCH "define[ \t]+EIGEN_MINOR_VERSION[ \t]+([0-9]+)" _eigen_minor_
 set(EIGEN_MINOR_VERSION "${CMAKE_MATCH_1}")
 set(EIGEN_VERSION_NUMBER ${EIGEN_WORLD_VERSION}.${EIGEN_MAJOR_VERSION}.${EIGEN_MINOR_VERSION})

-# if the mercurial program is absent, this will leave the EIGEN_HG_CHANGESET string empty,
-# but won't stop CMake.
-execute_process(COMMAND hg tip -R ${CMAKE_SOURCE_DIR} OUTPUT_VARIABLE EIGEN_HGTIP_OUTPUT)
-execute_process(COMMAND hg branch -R ${CMAKE_SOURCE_DIR} OUTPUT_VARIABLE EIGEN_BRANCH_OUTPUT)
+# if we are not in a mercurial clone
+if(IS_DIRECTORY ${CMAKE_SOURCE_DIR}/.hg)
+  # if the mercurial program is absent or this will leave the EIGEN_HG_CHANGESET string empty,
+  # but won't stop CMake.
+  execute_process(COMMAND hg tip -R ${CMAKE_SOURCE_DIR} OUTPUT_VARIABLE EIGEN_HGTIP_OUTPUT)
+  execute_process(COMMAND hg branch -R ${CMAKE_SOURCE_DIR} OUTPUT_VARIABLE EIGEN_BRANCH_OUTPUT)
+endif()

 # if this is the default (aka development) branch, extract the mercurial changeset number from the hg tip output...
 if(EIGEN_BRANCH_OUTPUT MATCHES "default")
@@ -64,6 +59,47 @@ include(GNUInstallDirs)

 set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)

+
+option(EIGEN_TEST_CXX11 "Enable testing with C++11 and C++11 features (e.g. Tensor module)." OFF)
+
+
+macro(ei_add_cxx_compiler_flag FLAG)
+  string(REGEX REPLACE "-" "" SFLAG1 ${FLAG})
+  string(REGEX REPLACE "\\+" "p" SFLAG ${SFLAG1})
+  check_cxx_compiler_flag(${FLAG} COMPILER_SUPPORT_${SFLAG})
+  if(COMPILER_SUPPORT_${SFLAG})
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLAG}")
+  endif()
+endmacro(ei_add_cxx_compiler_flag)
+
+check_cxx_compiler_flag("-std=c++11" EIGEN_COMPILER_SUPPORT_CPP11)
+
+if(EIGEN_TEST_CXX11)
+  set(CMAKE_CXX_STANDARD 11)
+  set(CMAKE_CXX_EXTENSIONS OFF)
+  if(EIGEN_COMPILER_SUPPORT_CPP11)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
+  endif()
+else()
+  #set(CMAKE_CXX_STANDARD 03)
+  #set(CMAKE_CXX_EXTENSIONS OFF)
+  ei_add_cxx_compiler_flag("-std=c++03")
+endif()
+
+function(ei_maybe_separate_arguments variable mode args)
+  # Use separate_arguments if the input is a single string containing a space.
+  # Otherwise, if it is already a list or doesn't have a space, just propagate
+  # the original value.  This is to better support multi-argument lists.
+  list(LENGTH args list_length)
+  if (${list_length} EQUAL 1)
+    string(FIND "${args}" " " has_space)
+    if (${has_space} GREATER -1)
+      separate_arguments(args ${mode} "${args}")
+    endif()
+  endif()
+  set(${variable} ${args} PARENT_SCOPE)
+endfunction(ei_maybe_separate_arguments)
+
 #############################################################################
 # find how to link to the standard libraries                                #
 #############################################################################
@@ -74,6 +110,10 @@ find_package(StandardMathLibrary)
 set(EIGEN_TEST_CUSTOM_LINKER_FLAGS  "" CACHE STRING "Additional linker flags when linking unit tests.")
 set(EIGEN_TEST_CUSTOM_CXX_FLAGS     "" CACHE STRING "Additional compiler flags when compiling unit tests.")

+# Convert space-separated arguments into CMake lists for downstream consumption.
+ei_maybe_separate_arguments(EIGEN_TEST_CUSTOM_LINKER_FLAGS NATIVE_COMMAND "${EIGEN_TEST_CUSTOM_LINKER_FLAGS}")
+ei_maybe_separate_arguments(EIGEN_TEST_CUSTOM_CXX_FLAGS NATIVE_COMMAND "${EIGEN_TEST_CUSTOM_CXX_FLAGS}")
+
 set(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO "")

 if(NOT STANDARD_MATH_LIBRARY_FOUND)
@@ -115,15 +155,6 @@ endif()

 set(EIGEN_TEST_MAX_SIZE "320" CACHE STRING "Maximal matrix/vector size, default is 320")

-macro(ei_add_cxx_compiler_flag FLAG)
-  string(REGEX REPLACE "-" "" SFLAG1 ${FLAG})
-  string(REGEX REPLACE "\\+" "p" SFLAG ${SFLAG1})
-  check_cxx_compiler_flag(${FLAG} COMPILER_SUPPORT_${SFLAG})
-  if(COMPILER_SUPPORT_${SFLAG})
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLAG}")
-  endif()
-endmacro(ei_add_cxx_compiler_flag)
-
 if(NOT MSVC)
  # We assume that other compilers are partly compatible with GNUCC

@@ -137,7 +168,7 @@ if(NOT MSVC)
  ei_add_cxx_compiler_flag("-Wall")
  ei_add_cxx_compiler_flag("-Wextra")
  #ei_add_cxx_compiler_flag("-Weverything")              # clang
-  
+
  ei_add_cxx_compiler_flag("-Wundef")
  ei_add_cxx_compiler_flag("-Wcast-align")
  ei_add_cxx_compiler_flag("-Wchar-subscripts")
@@ -152,29 +183,29 @@ if(NOT MSVC)
  ei_add_cxx_compiler_flag("-Wc++11-extensions")
  ei_add_cxx_compiler_flag("-Wdouble-promotion")
 #  ei_add_cxx_compiler_flag("-Wconversion")
-  
+
  # -Wshadow is insanely too strict with gcc, hopefully it will become usable with gcc 6
  # if(NOT CMAKE_COMPILER_IS_GNUCXX OR (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "5.0.0"))
  if(NOT CMAKE_COMPILER_IS_GNUCXX)
    ei_add_cxx_compiler_flag("-Wshadow")
  endif()
-  
+
  ei_add_cxx_compiler_flag("-Wno-psabi")
  ei_add_cxx_compiler_flag("-Wno-variadic-macros")
  ei_add_cxx_compiler_flag("-Wno-long-long")
-  
+
  ei_add_cxx_compiler_flag("-fno-check-new")
  ei_add_cxx_compiler_flag("-fno-common")
  ei_add_cxx_compiler_flag("-fstrict-aliasing")
  ei_add_cxx_compiler_flag("-wd981")                    # disable ICC's "operands are evaluated in unspecified order" remark
  ei_add_cxx_compiler_flag("-wd2304")                   # disable ICC's "warning #2304: non-explicit constructor with single argument may cause implicit type conversion" produced by -Wnon-virtual-dtor
-  
-  
+
+
  # The -ansi flag must be added last, otherwise it is also used as a linker flag by check_cxx_compiler_flag making it fails
  # Moreover we should not set both -strict-ansi and -ansi
  check_cxx_compiler_flag("-strict-ansi" COMPILER_SUPPORT_STRICTANSI)
  ei_add_cxx_compiler_flag("-Qunused-arguments")        # disable clang warning: argument unused during compilation: '-ansi'
-  
+
  if(COMPILER_SUPPORT_STRICTANSI)
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -strict-ansi")
  else()
@@ -185,7 +216,7 @@ if(NOT MSVC)
    ei_add_cxx_compiler_flag("-pie")
    ei_add_cxx_compiler_flag("-fPIE")
  endif()
-  
+
  set(CMAKE_REQUIRED_FLAGS "")

  option(EIGEN_TEST_SSE2 "Enable/Disable SSE2 in tests/examples" OFF)
@@ -359,8 +390,7 @@ if(EIGEN_TEST_NO_EXCEPTIONS)
  message(STATUS "Disabling exceptions in tests/examples")
 endif()

-option(EIGEN_TEST_CXX11 "Enable testing with C++11 and C++11 features (e.g. Tensor module)." OFF)
-
+set(EIGEN_CUDA_CXX_FLAGS "" CACHE STRING "Additional flags to pass to the cuda compiler.")
 set(EIGEN_CUDA_COMPUTE_ARCH 30 CACHE STRING "The CUDA compute architecture level to target when compiling CUDA code")

 include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR})
@@ -372,22 +402,27 @@ endif()

 if(EIGEN_INCLUDE_INSTALL_DIR AND NOT INCLUDE_INSTALL_DIR)
  set(INCLUDE_INSTALL_DIR ${EIGEN_INCLUDE_INSTALL_DIR}
-      CACHE PATH "The directory relative to CMAKE_PREFIX_PATH where Eigen header files are installed")
+      CACHE STRING "The directory relative to CMAKE_PREFIX_PATH where Eigen header files are installed")
 else()
  set(INCLUDE_INSTALL_DIR
      "${CMAKE_INSTALL_INCLUDEDIR}/eigen3"
-      CACHE PATH "The directory relative to CMAKE_PREFIX_PATH where Eigen header files are installed"
+      CACHE STRING "The directory relative to CMAKE_PREFIX_PATH where Eigen header files are installed"
      )
 endif()
 set(CMAKEPACKAGE_INSTALL_DIR
-    "${CMAKE_INSTALL_LIBDIR}/cmake/eigen3"
-    CACHE PATH "The directory relative to CMAKE_PREFIX_PATH where Eigen3Config.cmake is installed"
+    "${CMAKE_INSTALL_DATADIR}/eigen3/cmake"
+    CACHE STRING "The directory relative to CMAKE_PREFIX_PATH where Eigen3Config.cmake is installed"
    )
 set(PKGCONFIG_INSTALL_DIR
    "${CMAKE_INSTALL_DATADIR}/pkgconfig"
-    CACHE PATH "The directory relative to CMAKE_PREFIX_PATH where eigen3.pc is installed"
+    CACHE STRING "The directory relative to CMAKE_PREFIX_PATH where eigen3.pc is installed"
    )

+foreach(var INCLUDE_INSTALL_DIR CMAKEPACKAGE_INSTALL_DIR PKGCONFIG_INSTALL_DIR)
+  if(IS_ABSOLUTE "${${var}}")
+    message(FATAL_ERROR "${var} must be relative to CMAKE_PREFIX_PATH. Got: ${${var}}")
+  endif()
+endforeach()

 # similar to set_target_properties but append the property instead of overwriting it
 macro(ei_add_target_property target prop value)
@@ -416,25 +451,26 @@ add_subdirectory(Eigen)

 add_subdirectory(doc EXCLUDE_FROM_ALL)

-include(EigenConfigureTesting)
+option(BUILD_TESTING "Enable creation of Eigen tests." ON)
+if(BUILD_TESTING)
+  include(EigenConfigureTesting)

-# fixme, not sure this line is still needed:
-enable_testing() # must be called from the root CMakeLists, see man page
-
-
-if(EIGEN_LEAVE_TEST_IN_ALL_TARGET)
-  add_subdirectory(test) # can't do EXCLUDE_FROM_ALL here, breaks CTest
-else()
-  add_subdirectory(test EXCLUDE_FROM_ALL)
+  if(EIGEN_LEAVE_TEST_IN_ALL_TARGET)
+    add_subdirectory(test) # can't do EXCLUDE_FROM_ALL here, breaks CTest
+  else()
+    add_subdirectory(test EXCLUDE_FROM_ALL)
+  endif()
 endif()

-if(EIGEN_LEAVE_TEST_IN_ALL_TARGET)
-  add_subdirectory(blas)
-  add_subdirectory(lapack)
-else()
-  add_subdirectory(blas EXCLUDE_FROM_ALL)
-  add_subdirectory(lapack EXCLUDE_FROM_ALL)
-endif()
+if (NOT CMAKE_CROSSCOMPILING)
+  if(EIGEN_LEAVE_TEST_IN_ALL_TARGET)
+    add_subdirectory(blas)
+    add_subdirectory(lapack)
+  else()
+    add_subdirectory(blas EXCLUDE_FROM_ALL)
+    add_subdirectory(lapack EXCLUDE_FROM_ALL)
+  endif()
+endif(NOT CMAKE_CROSSCOMPILING)

 # add SYCL
 option(EIGEN_TEST_SYCL "Add Sycl support." OFF)
@@ -461,7 +497,9 @@ endif(NOT WIN32)

 configure_file(scripts/cdashtesting.cmake.in cdashtesting.cmake @ONLY)

-ei_testing_print_summary()
+if(BUILD_TESTING)
+  ei_testing_print_summary()
+endif()

 message(STATUS "")
 message(STATUS "Configured Eigen ${EIGEN_VERSION_NUMBER}")
@@ -507,18 +545,90 @@ set ( EIGEN_VERSION_MINOR  ${EIGEN_MAJOR_VERSION} )
 set ( EIGEN_VERSION_PATCH  ${EIGEN_MINOR_VERSION} )
 set ( EIGEN_DEFINITIONS "")
 set ( EIGEN_INCLUDE_DIR "${CMAKE_INSTALL_PREFIX}/${INCLUDE_INSTALL_DIR}" )
-set ( EIGEN_INCLUDE_DIRS ${EIGEN_INCLUDE_DIR} )
 set ( EIGEN_ROOT_DIR ${CMAKE_INSTALL_PREFIX} )

-configure_file ( ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Eigen3Config.cmake.in
-                 ${CMAKE_CURRENT_BINARY_DIR}/Eigen3Config.cmake
-                 @ONLY ESCAPE_QUOTES
-               )
+# Interface libraries require at least CMake 3.0
+if (NOT CMAKE_VERSION VERSION_LESS 3.0)
+  include (CMakePackageConfigHelpers)
+
+  # Imported target support
+  add_library (eigen INTERFACE)
+  add_library (Eigen3::Eigen ALIAS eigen)
+
+  target_compile_definitions (eigen INTERFACE ${EIGEN_DEFINITIONS})
+  target_include_directories (eigen INTERFACE
+    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
+    $<INSTALL_INTERFACE:${INCLUDE_INSTALL_DIR}>
+  )
+
+  # Export as title case Eigen
+  set_target_properties (eigen PROPERTIES EXPORT_NAME Eigen)
+
+  install (TARGETS eigen EXPORT Eigen3Targets)
+
+  configure_package_config_file (
+    ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Eigen3Config.cmake.in
+    ${CMAKE_CURRENT_BINARY_DIR}/Eigen3Config.cmake
+    PATH_VARS EIGEN_INCLUDE_DIR EIGEN_ROOT_DIR
+    INSTALL_DESTINATION ${CMAKEPACKAGE_INSTALL_DIR}
+    NO_CHECK_REQUIRED_COMPONENTS_MACRO # Eigen does not provide components
+  )
+  # Remove CMAKE_SIZEOF_VOID_P from Eigen3ConfigVersion.cmake since Eigen does
+  # not depend on architecture specific settings or libraries. More
+  # specifically, an Eigen3Config.cmake generated from a 64 bit target can be
+  # used for 32 bit targets as well (and vice versa).
+  set (_Eigen3_CMAKE_SIZEOF_VOID_P ${CMAKE_SIZEOF_VOID_P})
+  unset (CMAKE_SIZEOF_VOID_P)
+  write_basic_package_version_file (Eigen3ConfigVersion.cmake
+                                    VERSION ${EIGEN_VERSION_NUMBER}
+                                    COMPATIBILITY SameMajorVersion)
+  set (CMAKE_SIZEOF_VOID_P ${_Eigen3_CMAKE_SIZEOF_VOID_P})
+
+  # The Eigen target will be located in the Eigen3 namespace. Other CMake
+  # targets can refer to it using Eigen3::Eigen.
+  export (TARGETS eigen NAMESPACE Eigen3:: FILE Eigen3Targets.cmake)
+  # Export Eigen3 package to CMake registry such that it can be easily found by
+  # CMake even if it has not been installed to a standard directory.
+  export (PACKAGE Eigen3)
+
+  install (EXPORT Eigen3Targets NAMESPACE Eigen3:: DESTINATION ${CMAKEPACKAGE_INSTALL_DIR})
+
+else (NOT CMAKE_VERSION VERSION_LESS 3.0)
+  # Fallback to legacy Eigen3Config.cmake without the imported target
+
+  # If CMakePackageConfigHelpers module is available (CMake >= 2.8.8)
+  # create a relocatable Config file, otherwise leave the hardcoded paths
+  include(CMakePackageConfigHelpers OPTIONAL RESULT_VARIABLE CPCH_PATH)
+
+  if(CPCH_PATH)
+    configure_package_config_file (
+      ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Eigen3ConfigLegacy.cmake.in
+      ${CMAKE_CURRENT_BINARY_DIR}/Eigen3Config.cmake
+      PATH_VARS EIGEN_INCLUDE_DIR EIGEN_ROOT_DIR
+      INSTALL_DESTINATION ${CMAKEPACKAGE_INSTALL_DIR}
+      NO_CHECK_REQUIRED_COMPONENTS_MACRO # Eigen does not provide components
+    )
+  else()
+    # The PACKAGE_* variables are defined by the configure_package_config_file
+    # but without it we define them manually to the hardcoded paths
+    set(PACKAGE_INIT "")
+    set(PACKAGE_EIGEN_INCLUDE_DIR ${EIGEN_INCLUDE_DIR})
+    set(PACKAGE_EIGEN_ROOT_DIR ${EIGEN_ROOT_DIR})
+    configure_file ( ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Eigen3ConfigLegacy.cmake.in
+                     ${CMAKE_CURRENT_BINARY_DIR}/Eigen3Config.cmake
+                     @ONLY ESCAPE_QUOTES )
+  endif()
+
+  write_basic_package_version_file( Eigen3ConfigVersion.cmake
+                                    VERSION ${EIGEN_VERSION_NUMBER}
+                                    COMPATIBILITY SameMajorVersion )
+
+endif (NOT CMAKE_VERSION VERSION_LESS 3.0)

 install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/cmake/UseEigen3.cmake
                ${CMAKE_CURRENT_BINARY_DIR}/Eigen3Config.cmake
-          DESTINATION ${CMAKEPACKAGE_INSTALL_DIR}
-        )
+                ${CMAKE_CURRENT_BINARY_DIR}/Eigen3ConfigVersion.cmake
+          DESTINATION ${CMAKEPACKAGE_INSTALL_DIR} )

 # Add uninstall target
 add_custom_target ( uninstall
--- a/CTestConfig.cmake
+++ b/CTestConfig.cmake
@@ -8,10 +8,6 @@ set(CTEST_PROJECT_NAME "Eigen")
 set(CTEST_NIGHTLY_START_TIME "00:00:00 UTC")

 set(CTEST_DROP_METHOD "http")
-set(CTEST_DROP_SITE "manao.inria.fr")
-set(CTEST_DROP_LOCATION "/CDash/submit.php?project=Eigen")
+set(CTEST_DROP_SITE "my.cdash.org")
+set(CTEST_DROP_LOCATION "/submit.php?project=Eigen")
 set(CTEST_DROP_SITE_CDASH TRUE)
-set(CTEST_PROJECT_SUBPROJECTS
-Official
-Unsupported
-)
--- a/CTestCustom.cmake.in
+++ b/CTestCustom.cmake.in
@@ -1,3 +1,4 @@

 set(CTEST_CUSTOM_MAXIMUM_NUMBER_OF_WARNINGS "2000")
 set(CTEST_CUSTOM_MAXIMUM_NUMBER_OF_ERRORS   "2000")
+list(APPEND CTEST_CUSTOM_ERROR_EXCEPTION    @EIGEN_CTEST_ERROR_EXCEPTION@)
--- a/Eigen/Cholesky
+++ b/Eigen/Cholesky
@@ -9,6 +9,7 @@
 #define EIGEN_CHOLESKY_MODULE_H

 #include "Core"
+#include "Jacobi"

 #include "src/Core/util/DisableStupidWarnings.h"

@@ -31,7 +32,11 @@
 #include "src/Cholesky/LLT.h"
 #include "src/Cholesky/LDLT.h"
 #ifdef EIGEN_USE_LAPACKE
+#ifdef EIGEN_USE_MKL
+#include "mkl_lapacke.h"
+#else
 #include "src/misc/lapacke.h"
+#endif
 #include "src/Cholesky/LLT_LAPACKE.h"
 #endif

--- a/Eigen/Core
+++ b/Eigen/Core
@@ -14,8 +14,24 @@
 // first thing Eigen does: stop the compiler from committing suicide
 #include "src/Core/util/DisableStupidWarnings.h"

+#if defined(__CUDACC__) && !defined(EIGEN_NO_CUDA)
+  #define EIGEN_CUDACC __CUDACC__
+#endif
+
+#if defined(__CUDA_ARCH__) && !defined(EIGEN_NO_CUDA)
+  #define EIGEN_CUDA_ARCH __CUDA_ARCH__
+#endif
+
+#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
+#define EIGEN_CUDACC_VER  ((__CUDACC_VER_MAJOR__ * 10000) + (__CUDACC_VER_MINOR__ * 100))
+#elif defined(__CUDACC_VER__)
+#define EIGEN_CUDACC_VER __CUDACC_VER__
+#else
+#define EIGEN_CUDACC_VER 0
+#endif
+
 // Handle NVCC/CUDA/SYCL
-#if defined(__CUDACC__) || defined(__SYCL_DEVICE_ONLY__)
+#if defined(EIGEN_CUDACC) || defined(__SYCL_DEVICE_ONLY__)
  // Do not try asserts on CUDA and SYCL!
  #ifndef EIGEN_NO_DEBUG
  #define EIGEN_NO_DEBUG
@@ -30,25 +46,32 @@
  #endif

  // All functions callable from CUDA code must be qualified with __device__
-  #ifdef __CUDACC__
+  #ifdef EIGEN_CUDACC
    // Do not try to vectorize on CUDA and SYCL!
    #ifndef EIGEN_DONT_VECTORIZE
    #define EIGEN_DONT_VECTORIZE
    #endif

    #define EIGEN_DEVICE_FUNC __host__ __device__
-    // We need math_functions.hpp to ensure that that EIGEN_USING_STD_MATH macro
+    // We need cuda_runtime.h to ensure that that EIGEN_USING_STD_MATH macro
    // works properly on the device side
-    #include <math_functions.hpp>
+    #include <cuda_runtime.h>
  #else
    #define EIGEN_DEVICE_FUNC
  #endif

 #else
  #define EIGEN_DEVICE_FUNC
-
 #endif

+#if defined(EIGEN_CUDACC)
+  #include <cuda.h>
+  #define EIGEN_CUDA_SDK_VER (CUDA_VERSION * 10)
+#else
+  #define EIGEN_CUDA_SDK_VER 0
+#endif
+
+
 // When compiling CUDA device code with NVCC, pull in math functions from the
 // global namespace.  In host mode, and when device doee with clang, use the
 // std versions.
@@ -107,7 +130,7 @@
  #endif
 #endif

-#ifndef EIGEN_DONT_VECTORIZE
+#if !defined(EIGEN_DONT_VECTORIZE) && !defined(EIGEN_CUDACC)

  #if defined (EIGEN_SSE2_ON_NON_MSVC_BUT_NOT_OLD_GCC) || defined(EIGEN_SSE2_ON_MSVC_2008_OR_LATER)

@@ -155,6 +178,9 @@
      #ifdef __AVX512DQ__
        #define EIGEN_VECTORIZE_AVX512DQ
      #endif
+      #ifdef __AVX512ER__
+        #define EIGEN_VECTORIZE_AVX512ER
+      #endif
    #endif

    // include files
@@ -194,6 +220,7 @@
    } // end extern "C"
  #elif defined __VSX__
    #define EIGEN_VECTORIZE
+    #define EIGEN_VECTORIZE_FMA
    #define EIGEN_VECTORIZE_VSX
    #include <altivec.h>
    // We need to #undef all these ugly tokens defined in <altivec.h>
@@ -203,6 +230,7 @@
    #undef pixel
  #elif defined __ALTIVEC__
    #define EIGEN_VECTORIZE
+    #define EIGEN_VECTORIZE_FMA
    #define EIGEN_VECTORIZE_ALTIVEC
    #include <altivec.h>
    // We need to #undef all these ugly tokens defined in <altivec.h>
@@ -214,6 +242,10 @@
    #define EIGEN_VECTORIZE
    #define EIGEN_VECTORIZE_NEON
    #include <arm_neon.h>
+    // Enable FMA for ARM.
+    #if defined(__ARM_FEATURE_FMA)
+      #define EIGEN_VECTORIZE_FMA
+    #endif
  #elif (defined __s390x__ && defined __VEC__)
    #define EIGEN_VECTORIZE
    #define EIGEN_VECTORIZE_ZVECTOR
@@ -226,16 +258,16 @@
  #define EIGEN_HAS_FP16_C
 #endif

-#if defined __CUDACC__
+#if defined EIGEN_CUDACC
  #define EIGEN_VECTORIZE_CUDA
  #include <vector_types.h>
-  #if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
+  #if EIGEN_CUDA_SDK_VER >= 70500
    #define EIGEN_HAS_CUDA_FP16
  #endif
 #endif

 #if defined EIGEN_HAS_CUDA_FP16
-  #include <host_defines.h>
+  #include <cuda_runtime_api.h>
  #include <cuda_fp16.h>
 #endif

@@ -260,7 +292,10 @@
 #include <cmath>
 #include <cassert>
 #include <functional>
-#include <iosfwd>
+#include <sstream>
+#ifndef EIGEN_NO_IO
+  #include <iosfwd>
+#endif
 #include <cstring>
 #include <string>
 #include <limits>
@@ -321,12 +356,16 @@ inline static const char *SimdInstructionSetsInUse(void) {
 #error Eigen2-support is only available up to version 3.2. Please go to "http://eigen.tuxfamily.org/index.php?title=Eigen2" for further information
 #endif

+namespace Eigen {
+
 // we use size_t frequently and we'll never remember to prepend it with std:: everytime just to
 // ensure QNX/QCC support
 using std::size_t;
 // gcc 4.6.0 wants std:: for ptrdiff_t
 using std::ptrdiff_t;

+}
+
 /** \defgroup Core_Module Core module
  * This is the main module of Eigen providing dense matrix and vector support
  * (both fixed and dynamic size) with all the features corresponding to a BLAS library
@@ -348,10 +387,13 @@ using std::ptrdiff_t;
 #include "src/Core/MathFunctions.h"
 #include "src/Core/GenericPacketMath.h"
 #include "src/Core/MathFunctionsImpl.h"
+#include "src/Core/arch/Default/ConjHelper.h"

 #if defined EIGEN_VECTORIZE_AVX512
  #include "src/Core/arch/SSE/PacketMath.h"
+  #include "src/Core/arch/SSE/MathFunctions.h"
  #include "src/Core/arch/AVX/PacketMath.h"
+  #include "src/Core/arch/AVX/MathFunctions.h"
  #include "src/Core/arch/AVX512/PacketMath.h"
  #include "src/Core/arch/AVX512/MathFunctions.h"
 #elif defined EIGEN_VECTORIZE_AVX
@@ -363,6 +405,7 @@ using std::ptrdiff_t;
  #include "src/Core/arch/AVX/MathFunctions.h"
  #include "src/Core/arch/AVX/Complex.h"
  #include "src/Core/arch/AVX/TypeCasting.h"
+  #include "src/Core/arch/SSE/TypeCasting.h"
 #elif defined EIGEN_VECTORIZE_SSE
  #include "src/Core/arch/SSE/PacketMath.h"
  #include "src/Core/arch/SSE/MathFunctions.h"
@@ -405,6 +448,7 @@ using std::ptrdiff_t;
 // on CUDA devices
 #include "src/Core/arch/CUDA/Complex.h"

+#include "src/Core/IO.h"
 #include "src/Core/DenseCoeffsBase.h"
 #include "src/Core/DenseBase.h"
 #include "src/Core/MatrixBase.h"
@@ -452,7 +496,6 @@ using std::ptrdiff_t;
 #include "src/Core/Redux.h"
 #include "src/Core/Visitor.h"
 #include "src/Core/Fuzzy.h"
-#include "src/Core/IO.h"
 #include "src/Core/Swap.h"
 #include "src/Core/CommaInitializer.h"
 #include "src/Core/GeneralProduct.h"
--- a/Eigen/Eigenvalues
+++ b/Eigen/Eigenvalues
@@ -10,14 +10,14 @@

 #include "Core"

-#include "src/Core/util/DisableStupidWarnings.h"
-
 #include "Cholesky"
 #include "Jacobi"
 #include "Householder"
 #include "LU"
 #include "Geometry"

+#include "src/Core/util/DisableStupidWarnings.h"
+
 /** \defgroup Eigenvalues_Module Eigenvalues module
  *
  *
@@ -45,7 +45,11 @@
 #include "src/Eigenvalues/GeneralizedEigenSolver.h"
 #include "src/Eigenvalues/MatrixBaseEigenvalues.h"
 #ifdef EIGEN_USE_LAPACKE
+#ifdef EIGEN_USE_MKL
+#include "mkl_lapacke.h"
+#else
 #include "src/misc/lapacke.h"
+#endif
 #include "src/Eigenvalues/RealSchur_LAPACKE.h"
 #include "src/Eigenvalues/ComplexSchur_LAPACKE.h"
 #include "src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h"
--- a/Eigen/Geometry
+++ b/Eigen/Geometry
@@ -10,12 +10,12 @@

 #include "Core"

-#include "src/Core/util/DisableStupidWarnings.h"
-
 #include "SVD"
 #include "LU"
 #include <limits>

+#include "src/Core/util/DisableStupidWarnings.h"
+
 /** \defgroup Geometry_Module Geometry module
  *
  * This module provides support for:
--- a/Eigen/LU
+++ b/Eigen/LU
@@ -28,7 +28,11 @@
 #include "src/LU/FullPivLU.h"
 #include "src/LU/PartialPivLU.h"
 #ifdef EIGEN_USE_LAPACKE
+#ifdef EIGEN_USE_MKL
+#include "mkl_lapacke.h"
+#else
 #include "src/misc/lapacke.h"
+#endif
 #include "src/LU/PartialPivLU_LAPACKE.h"
 #endif
 #include "src/LU/Determinant.h"
--- a/Eigen/QR
+++ b/Eigen/QR
@@ -10,12 +10,12 @@

 #include "Core"

-#include "src/Core/util/DisableStupidWarnings.h"
-
 #include "Cholesky"
 #include "Jacobi"
 #include "Householder"

+#include "src/Core/util/DisableStupidWarnings.h"
+
 /** \defgroup QR_Module QR module
  *
  *
@@ -36,7 +36,11 @@
 #include "src/QR/ColPivHouseholderQR.h"
 #include "src/QR/CompleteOrthogonalDecomposition.h"
 #ifdef EIGEN_USE_LAPACKE
+#ifdef EIGEN_USE_MKL
+#include "mkl_lapacke.h"
+#else
 #include "src/misc/lapacke.h"
+#endif
 #include "src/QR/HouseholderQR_LAPACKE.h"
 #include "src/QR/ColPivHouseholderQR_LAPACKE.h"
 #endif
--- a/Eigen/QtAlignedMalloc
+++ b/Eigen/QtAlignedMalloc
@@ -14,7 +14,7 @@

 #include "src/Core/util/DisableStupidWarnings.h"

-void *qMalloc(size_t size)
+void *qMalloc(std::size_t size)
 {
  return Eigen::internal::aligned_malloc(size);
 }
@@ -24,10 +24,10 @@ void qFree(void *ptr)
  Eigen::internal::aligned_free(ptr);
 }

-void *qRealloc(void *ptr, size_t size)
+void *qRealloc(void *ptr, std::size_t size)
 {
  void* newPtr = Eigen::internal::aligned_malloc(size);
-  memcpy(newPtr, ptr, size);
+  std::memcpy(newPtr, ptr, size);
  Eigen::internal::aligned_free(ptr);
  return newPtr;
 }
--- a/Eigen/SVD
+++ b/Eigen/SVD
@@ -37,7 +37,11 @@
 #include "src/SVD/JacobiSVD.h"
 #include "src/SVD/BDCSVD.h"
 #if defined(EIGEN_USE_LAPACKE) && !defined(EIGEN_USE_LAPACKE_STRICT)
+#ifdef EIGEN_USE_MKL
+#include "mkl_lapacke.h"
+#else
 #include "src/misc/lapacke.h"
+#endif
 #include "src/SVD/JacobiSVD_LAPACKE.h"
 #endif

--- a/Eigen/Sparse
+++ b/Eigen/Sparse
@@ -25,7 +25,9 @@

 #include "SparseCore"
 #include "OrderingMethods"
+#ifndef EIGEN_MPL2_ONLY
 #include "SparseCholesky"
+#endif
 #include "SparseLU"
 #include "SparseQR"
 #include "IterativeLinearSolvers"
--- a/Eigen/SparseQR
+++ b/Eigen/SparseQR
@@ -28,7 +28,6 @@
  * 
  */

-#include "OrderingMethods"
 #include "src/SparseCore/SparseColEtree.h"
 #include "src/SparseQR/SparseQR.h"

--- a/Eigen/StdDeque
+++ b/Eigen/StdDeque
@@ -14,7 +14,7 @@
 #include "Core"
 #include <deque>

-#if EIGEN_COMP_MSVC && EIGEN_OS_WIN64 /* MSVC auto aligns in 64 bit builds */
+#if EIGEN_COMP_MSVC && EIGEN_OS_WIN64 && (EIGEN_MAX_STATIC_ALIGN_BYTES<=16) /* MSVC auto aligns up to 16 bytes in 64 bit builds */

 #define EIGEN_DEFINE_STL_DEQUE_SPECIALIZATION(...)

--- a/Eigen/StdList
+++ b/Eigen/StdList
@@ -13,7 +13,7 @@
 #include "Core"
 #include <list>

-#if EIGEN_COMP_MSVC && EIGEN_OS_WIN64 /* MSVC auto aligns in 64 bit builds */    
+#if EIGEN_COMP_MSVC && EIGEN_OS_WIN64 && (EIGEN_MAX_STATIC_ALIGN_BYTES<=16) /* MSVC auto aligns up to 16 bytes in 64 bit builds */

 #define EIGEN_DEFINE_STL_LIST_SPECIALIZATION(...)

--- a/Eigen/StdVector
+++ b/Eigen/StdVector
@@ -14,7 +14,7 @@
 #include "Core"
 #include <vector>

-#if EIGEN_COMP_MSVC && EIGEN_OS_WIN64 /* MSVC auto aligns in 64 bit builds */
+#if EIGEN_COMP_MSVC && EIGEN_OS_WIN64 && (EIGEN_MAX_STATIC_ALIGN_BYTES<=16) /* MSVC auto aligns up to 16 bytes in 64 bit builds */

 #define EIGEN_DEFINE_STL_VECTOR_SPECIALIZATION(...)

--- a/Eigen/src/Cholesky/LDLT.h
+++ b/Eigen/src/Cholesky/LDLT.h
@@ -44,7 +44,7 @@ namespace internal {
  * decomposition to determine whether a system of equations has a solution.
  *
  * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism.
-  * 
+  *
  * \sa MatrixBase::ldlt(), SelfAdjointView::ldlt(), class LLT
  */
 template<typename _MatrixType, int _UpLo> class LDLT
@@ -248,7 +248,7 @@ template<typename _MatrixType, int _UpLo> class LDLT
    /** \brief Reports whether previous computation was successful.
      *
      * \returns \c Success if computation was succesful,
-      *          \c NumericalIssue if the matrix.appears to be negative.
+      *          \c NumericalIssue if the factorization failed because of a zero pivot.
      */
    ComputationInfo info() const
    {
@@ -305,7 +305,8 @@ template<> struct ldlt_inplace<Lower>
    if (size <= 1)
    {
      transpositions.setIdentity();
-      if (numext::real(mat.coeff(0,0)) > static_cast<RealScalar>(0) ) sign = PositiveSemiDef;
+      if(size==0) sign = ZeroSign;
+      else if (numext::real(mat.coeff(0,0)) > static_cast<RealScalar>(0) ) sign = PositiveSemiDef;
      else if (numext::real(mat.coeff(0,0)) < static_cast<RealScalar>(0)) sign = NegativeSemiDef;
      else sign = ZeroSign;
      return true;
@@ -376,6 +377,8 @@ template<> struct ldlt_inplace<Lower>

      if((rs>0) && pivot_is_valid)
        A21 /= realAkk;
+      else if(rs>0)
+        ret = ret && (A21.array()==Scalar(0)).all();

      if(found_zero_pivot && pivot_is_valid) ret = false; // factorization failed
      else if(!pivot_is_valid) found_zero_pivot = true;
@@ -555,7 +558,7 @@ LDLT<MatrixType,_UpLo>& LDLT<MatrixType,_UpLo>::rankUpdate(const MatrixBase<Deri
 #ifndef EIGEN_PARSED_BY_DOXYGEN
 template<typename _MatrixType, int _UpLo>
 template<typename RhsType, typename DstType>
-void LDLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) const
+EIGEN_DEVICE_FUNC void LDLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) const
 {
  eigen_assert(rhs.rows() == rows());
  // dst = P b
@@ -568,13 +571,14 @@ void LDLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) cons
  // more precisely, use pseudo-inverse of D (see bug 241)
  using std::abs;
  const typename Diagonal<const MatrixType>::RealReturnType vecD(vectorD());
-  // In some previous versions, tolerance was set to the max of 1/highest and the maximal diagonal entry * epsilon
-  // as motivated by LAPACK's xGELSS:
+  // In some previous versions, tolerance was set to the max of 1/highest (or rather numeric_limits::min())
+  // and the maximal diagonal entry * epsilon as motivated by LAPACK's xGELSS:
  // RealScalar tolerance = numext::maxi(vecD.array().abs().maxCoeff() * NumTraits<RealScalar>::epsilon(),RealScalar(1) / NumTraits<RealScalar>::highest());
  // However, LDLT is not rank revealing, and so adjusting the tolerance wrt to the highest
  // diagonal element is not well justified and leads to numerical issues in some cases.
  // Moreover, Lapack's xSYTRS routines use 0 for the tolerance.
-  RealScalar tolerance = RealScalar(1) / NumTraits<RealScalar>::highest();
+  // Using numeric_limits::min() gives us more robustness to denormals.
+  RealScalar tolerance = (std::numeric_limits<RealScalar>::min)();

  for (Index i = 0; i < vecD.size(); ++i)
  {
--- a/Eigen/src/Cholesky/LLT.h
+++ b/Eigen/src/Cholesky/LLT.h
@@ -24,7 +24,7 @@ template<typename MatrixType, int UpLo> struct LLT_Traits;
  *
  * \tparam _MatrixType the type of the matrix of which we are computing the LL^T Cholesky decomposition
  * \tparam _UpLo the triangular part that will be used for the decompositon: Lower (default) or Upper.
-  *             The other triangular part won't be read.
+  *               The other triangular part won't be read.
  *
  * This class performs a LL^T Cholesky decomposition of a symmetric, positive definite
  * matrix A such that A = LL^* = U^*U, where L is lower triangular.
@@ -41,14 +41,18 @@ template<typename MatrixType, int UpLo> struct LLT_Traits;
  * Example: \include LLT_example.cpp
  * Output: \verbinclude LLT_example.out
  *
+  * \b Performance: for best performance, it is recommended to use a column-major storage format
+  * with the Lower triangular part (the default), or, equivalently, a row-major storage format
+  * with the Upper triangular part. Otherwise, you might get a 20% slowdown for the full factorization
+  * step, and rank-updates can be up to 3 times slower.
+  *
  * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism.
  *
+  * Note that during the decomposition, only the lower (or upper, as defined by _UpLo) triangular part of A is considered.
+  * Therefore, the strict lower part does not have to store correct values.
+  *
  * \sa MatrixBase::llt(), SelfAdjointView::llt(), class LDLT
  */
- /* HEY THIS DOX IS DISABLED BECAUSE THERE's A BUG EITHER HERE OR IN LDLT ABOUT THAT (OR BOTH)
-  * Note that during the decomposition, only the upper triangular part of A is considered. Therefore,
-  * the strict lower part does not have to store correct values.
-  */
 template<typename _MatrixType, int _UpLo> class LLT
 {
  public:
@@ -146,7 +150,7 @@ template<typename _MatrixType, int _UpLo> class LLT
    }

    template<typename Derived>
-    void solveInPlace(MatrixBase<Derived> &bAndX) const;
+    void solveInPlace(const MatrixBase<Derived> &bAndX) const;

    template<typename InputType>
    LLT& compute(const EigenBase<InputType>& matrix);
@@ -177,7 +181,7 @@ template<typename _MatrixType, int _UpLo> class LLT
    /** \brief Reports whether previous computation was successful.
      *
      * \returns \c Success if computation was succesful,
-      *          \c NumericalIssue if the matrix.appears to be negative.
+      *          \c NumericalIssue if the matrix.appears not to be positive definite.
      */
    ComputationInfo info() const
    {
@@ -425,7 +429,8 @@ LLT<MatrixType,_UpLo>& LLT<MatrixType,_UpLo>::compute(const EigenBase<InputType>
  eigen_assert(a.rows()==a.cols());
  const Index size = a.rows();
  m_matrix.resize(size, size);
-  m_matrix = a.derived();
+  if (!internal::is_same_dense(m_matrix, a.derived()))
+    m_matrix = a.derived();

  // Compute matrix L1 norm = max abs column sum.
  m_l1_norm = RealScalar(0);
@@ -470,7 +475,7 @@ LLT<_MatrixType,_UpLo> LLT<_MatrixType,_UpLo>::rankUpdate(const VectorType& v, c
 #ifndef EIGEN_PARSED_BY_DOXYGEN
 template<typename _MatrixType,int _UpLo>
 template<typename RhsType, typename DstType>
-void LLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) const
+EIGEN_DEVICE_FUNC void LLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) const
 {
  dst = rhs;
  solveInPlace(dst);
@@ -485,11 +490,14 @@ void LLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) const
  *
  * This version avoids a copy when the right hand side matrix b is not needed anymore.
  *
+  * \warning The parameter is only marked 'const' to make the C++ compiler accept a temporary expression here.
+  * This function will const_cast it, so constness isn't honored here.
+  *
  * \sa LLT::solve(), MatrixBase::llt()
  */
 template<typename MatrixType, int _UpLo>
 template<typename Derived>
-void LLT<MatrixType,_UpLo>::solveInPlace(MatrixBase<Derived> &bAndX) const
+void LLT<MatrixType,_UpLo>::solveInPlace(const MatrixBase<Derived> &bAndX) const
 {
  eigen_assert(m_isInitialized && "LLT is not initialized.");
  eigen_assert(m_matrix.rows()==bAndX.rows());
--- a/Eigen/src/Core/Array.h
+++ b/Eigen/src/Core/Array.h
@@ -153,8 +153,6 @@ class Array
      : Base(std::move(other))
    {
      Base::_check_template_params();
-      if (RowsAtCompileTime!=Dynamic && ColsAtCompileTime!=Dynamic)
-        Base::_set_noalias(other);
    }
    EIGEN_DEVICE_FUNC
    Array& operator=(Array&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_assignable<Scalar>::value)
@@ -231,10 +229,16 @@ class Array
            : Base(other)
    { }

+  private:
+    struct PrivateType {};
+  public:
+
    /** \sa MatrixBase::operator=(const EigenBase<OtherDerived>&) */
    template<typename OtherDerived>
    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Array(const EigenBase<OtherDerived> &other)
+    EIGEN_STRONG_INLINE Array(const EigenBase<OtherDerived> &other,
+                              typename internal::enable_if<internal::is_convertible<typename OtherDerived::Scalar,Scalar>::value,
+                                                           PrivateType>::type = PrivateType())
      : Base(other.derived())
    { }

--- a/Eigen/src/Core/ArrayBase.h
+++ b/Eigen/src/Core/ArrayBase.h
@@ -153,8 +153,8 @@ template<typename Derived> class ArrayBase
 //     inline void evalTo(Dest& dst) const { dst = matrix(); }

  protected:
-    EIGEN_DEVICE_FUNC
-    ArrayBase() : Base() {}
+    EIGEN_DEFAULT_COPY_CONSTRUCTOR(ArrayBase)
+    EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(ArrayBase)

  private:
    explicit ArrayBase(Index);
@@ -175,7 +175,7 @@ template<typename Derived> class ArrayBase
  */
 template<typename Derived>
 template<typename OtherDerived>
-EIGEN_STRONG_INLINE Derived &
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived &
 ArrayBase<Derived>::operator-=(const ArrayBase<OtherDerived> &other)
 {
  call_assignment(derived(), other.derived(), internal::sub_assign_op<Scalar,typename OtherDerived::Scalar>());
@@ -188,7 +188,7 @@ ArrayBase<Derived>::operator-=(const ArrayBase<OtherDerived> &other)
  */
 template<typename Derived>
 template<typename OtherDerived>
-EIGEN_STRONG_INLINE Derived &
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived &
 ArrayBase<Derived>::operator+=(const ArrayBase<OtherDerived>& other)
 {
  call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar,typename OtherDerived::Scalar>());
@@ -201,7 +201,7 @@ ArrayBase<Derived>::operator+=(const ArrayBase<OtherDerived>& other)
  */
 template<typename Derived>
 template<typename OtherDerived>
-EIGEN_STRONG_INLINE Derived &
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived &
 ArrayBase<Derived>::operator*=(const ArrayBase<OtherDerived>& other)
 {
  call_assignment(derived(), other.derived(), internal::mul_assign_op<Scalar,typename OtherDerived::Scalar>());
@@ -214,7 +214,7 @@ ArrayBase<Derived>::operator*=(const ArrayBase<OtherDerived>& other)
  */
 template<typename Derived>
 template<typename OtherDerived>
-EIGEN_STRONG_INLINE Derived &
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived &
 ArrayBase<Derived>::operator/=(const ArrayBase<OtherDerived>& other)
 {
  call_assignment(derived(), other.derived(), internal::div_assign_op<Scalar,typename OtherDerived::Scalar>());
--- a/Eigen/src/Core/ArrayWrapper.h
+++ b/Eigen/src/Core/ArrayWrapper.h
@@ -32,7 +32,8 @@ struct traits<ArrayWrapper<ExpressionType> >
  // Let's remove NestByRefBit
  enum {
    Flags0 = traits<typename remove_all<typename ExpressionType::Nested>::type >::Flags,
-    Flags = Flags0 & ~NestByRefBit
+    LvalueBitFlag = is_lvalue<ExpressionType>::value ? LvalueBit : 0,
+    Flags = (Flags0 & ~(NestByRefBit | LvalueBit)) | LvalueBitFlag
  };
 };
 }
@@ -129,7 +130,8 @@ struct traits<MatrixWrapper<ExpressionType> >
  // Let's remove NestByRefBit
  enum {
    Flags0 = traits<typename remove_all<typename ExpressionType::Nested>::type >::Flags,
-    Flags = Flags0 & ~NestByRefBit
+    LvalueBitFlag = is_lvalue<ExpressionType>::value ? LvalueBit : 0,
+    Flags = (Flags0 & ~(NestByRefBit | LvalueBit)) | LvalueBitFlag
  };
 };
 }
--- a/Eigen/src/Core/Assign.h
+++ b/Eigen/src/Core/Assign.h
@@ -16,7 +16,7 @@ namespace Eigen {

 template<typename Derived>
 template<typename OtherDerived>
-EIGEN_STRONG_INLINE Derived& DenseBase<Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>
  ::lazyAssign(const DenseBase<OtherDerived>& other)
 {
  enum{
@@ -29,7 +29,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase<Derived>

  eigen_assert(rows() == other.rows() && cols() == other.cols());
  internal::call_assignment_no_alias(derived(),other.derived());
-  
+
  return derived();
 }

--- a/Eigen/src/Core/AssignEvaluator.h
+++ b/Eigen/src/Core/AssignEvaluator.h
@@ -17,7 +17,7 @@ namespace Eigen {
 // This implementation is based on Assign.h

 namespace internal {
-  
+
 /***************************************************************************
 * Part 1 : the logic deciding a strategy for traversal and unrolling       *
 ***************************************************************************/
@@ -29,17 +29,17 @@ struct copy_using_evaluator_traits
 {
  typedef typename DstEvaluator::XprType Dst;
  typedef typename Dst::Scalar DstScalar;
-  
+
  enum {
    DstFlags = DstEvaluator::Flags,
    SrcFlags = SrcEvaluator::Flags
  };
-  
+
 public:
  enum {
    DstAlignment = DstEvaluator::Alignment,
    SrcAlignment = SrcEvaluator::Alignment,
-    DstHasDirectAccess = DstFlags & DirectAccessBit,
+    DstHasDirectAccess = (DstFlags & DirectAccessBit) == DirectAccessBit,
    JointAlignment = EIGEN_PLAIN_ENUM_MIN(DstAlignment,SrcAlignment)
  };

@@ -83,7 +83,7 @@ private:
                       && int(OuterStride)!=Dynamic && int(OuterStride)%int(InnerPacketSize)==0
                       && (EIGEN_UNALIGNED_VECTORIZE  || int(JointAlignment)>=int(InnerRequiredAlignment)),
    MayLinearize = bool(StorageOrdersAgree) && (int(DstFlags) & int(SrcFlags) & LinearAccessBit),
-    MayLinearVectorize = bool(MightVectorize) && MayLinearize && DstHasDirectAccess
+    MayLinearVectorize = bool(MightVectorize) && bool(MayLinearize) && bool(DstHasDirectAccess)
                       && (EIGEN_UNALIGNED_VECTORIZE || (int(DstAlignment)>=int(LinearRequiredAlignment)) || MaxSizeAtCompileTime == Dynamic),
      /* If the destination isn't aligned, we have to do runtime checks and we don't unroll,
         so it's only good for large enough sizes. */
@@ -135,7 +135,7 @@ public:
                          ? int(CompleteUnrolling)
                          : int(NoUnrolling) )
              : int(Traversal) == int(LinearTraversal)
-                ? ( bool(MayUnrollCompletely) ? int(CompleteUnrolling) 
+                ? ( bool(MayUnrollCompletely) ? int(CompleteUnrolling)
                                              : int(NoUnrolling) )
 #if EIGEN_UNALIGNED_VECTORIZE
              : int(Traversal) == int(SliceVectorizedTraversal)
@@ -195,7 +195,7 @@ struct copy_using_evaluator_DefaultTraversal_CompleteUnrolling
  // FIXME: this is not very clean, perhaps this information should be provided by the kernel?
  typedef typename Kernel::DstEvaluatorType DstEvaluatorType;
  typedef typename DstEvaluatorType::XprType DstXprType;
-  
+
  enum {
    outer = Index / DstXprType::InnerSizeAtCompileTime,
    inner = Index % DstXprType::InnerSizeAtCompileTime
@@ -261,7 +261,7 @@ struct copy_using_evaluator_innervec_CompleteUnrolling
  typedef typename Kernel::DstEvaluatorType DstEvaluatorType;
  typedef typename DstEvaluatorType::XprType DstXprType;
  typedef typename Kernel::PacketType PacketType;
-  
+
  enum {
    outer = Index / DstXprType::InnerSizeAtCompileTime,
    inner = Index % DstXprType::InnerSizeAtCompileTime,
@@ -407,7 +407,7 @@ struct dense_assignment_loop<Kernel, LinearVectorizedTraversal, NoUnrolling>
                                                            : int(Kernel::AssignmentTraits::DstAlignment),
      srcAlignment = Kernel::AssignmentTraits::JointAlignment
    };
-    const Index alignedStart = dstIsAligned ? 0 : internal::first_aligned<requestedAlignment>(&kernel.dstEvaluator().coeffRef(0), size);
+    const Index alignedStart = dstIsAligned ? 0 : internal::first_aligned<requestedAlignment>(kernel.dstDataPtr(), size);
    const Index alignedEnd = alignedStart + ((size-alignedStart)/packetSize)*packetSize;

    unaligned_dense_assignment_loop<dstIsAligned!=0>::run(kernel, 0, alignedStart);
@@ -426,7 +426,7 @@ struct dense_assignment_loop<Kernel, LinearVectorizedTraversal, CompleteUnrollin
  {
    typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
    typedef typename Kernel::PacketType PacketType;
-    
+
    enum { size = DstXprType::SizeAtCompileTime,
           packetSize =unpacket_traits<PacketType>::size,
           alignedSize = (size/packetSize)*packetSize };
@@ -515,7 +515,7 @@ struct dense_assignment_loop<Kernel, LinearTraversal, CompleteUnrolling>
 template<typename Kernel>
 struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, NoUnrolling>
 {
-  EIGEN_DEVICE_FUNC static inline void run(Kernel &kernel)
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
  {
    typedef typename Kernel::Scalar Scalar;
    typedef typename Kernel::PacketType PacketType;
@@ -527,7 +527,7 @@ struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, NoUnrolling>
      dstAlignment = alignable ? int(requestedAlignment)
                               : int(Kernel::AssignmentTraits::DstAlignment)
    };
-    const Scalar *dst_ptr = &kernel.dstEvaluator().coeffRef(0,0);
+    const Scalar *dst_ptr = kernel.dstDataPtr();
    if((!bool(dstIsAligned)) && (UIntPtr(dst_ptr) % sizeof(Scalar))>0)
    {
      // the pointer is not aligend-on scalar, so alignment is not possible
@@ -563,7 +563,7 @@ struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, NoUnrolling>
 template<typename Kernel>
 struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, InnerUnrolling>
 {
-  EIGEN_DEVICE_FUNC static inline void run(Kernel &kernel)
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
  {
    typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
    typedef typename Kernel::PacketType PacketType;
@@ -599,14 +599,14 @@ protected:
  typedef typename DstEvaluatorTypeT::XprType DstXprType;
  typedef typename SrcEvaluatorTypeT::XprType SrcXprType;
 public:
-  
+
  typedef DstEvaluatorTypeT DstEvaluatorType;
  typedef SrcEvaluatorTypeT SrcEvaluatorType;
  typedef typename DstEvaluatorType::Scalar Scalar;
  typedef copy_using_evaluator_traits<DstEvaluatorTypeT, SrcEvaluatorTypeT, Functor> AssignmentTraits;
  typedef typename AssignmentTraits::PacketType PacketType;
-  
-  
+
+
  EIGEN_DEVICE_FUNC generic_dense_assignment_kernel(DstEvaluatorType &dst, const SrcEvaluatorType &src, const Functor &func, DstXprType& dstExpr)
    : m_dst(dst), m_src(src), m_functor(func), m_dstExpr(dstExpr)
  {
@@ -614,58 +614,58 @@ public:
    AssignmentTraits::debug();
    #endif
  }
-  
+
  EIGEN_DEVICE_FUNC Index size() const        { return m_dstExpr.size(); }
  EIGEN_DEVICE_FUNC Index innerSize() const   { return m_dstExpr.innerSize(); }
  EIGEN_DEVICE_FUNC Index outerSize() const   { return m_dstExpr.outerSize(); }
  EIGEN_DEVICE_FUNC Index rows() const        { return m_dstExpr.rows(); }
  EIGEN_DEVICE_FUNC Index cols() const        { return m_dstExpr.cols(); }
  EIGEN_DEVICE_FUNC Index outerStride() const { return m_dstExpr.outerStride(); }
-  
+
  EIGEN_DEVICE_FUNC DstEvaluatorType& dstEvaluator() { return m_dst; }
  EIGEN_DEVICE_FUNC const SrcEvaluatorType& srcEvaluator() const { return m_src; }
-  
+
  /// Assign src(row,col) to dst(row,col) through the assignment functor.
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Index row, Index col)
  {
    m_functor.assignCoeff(m_dst.coeffRef(row,col), m_src.coeff(row,col));
  }
-  
+
  /// \sa assignCoeff(Index,Index)
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Index index)
  {
    m_functor.assignCoeff(m_dst.coeffRef(index), m_src.coeff(index));
  }
-  
+
  /// \sa assignCoeff(Index,Index)
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeffByOuterInner(Index outer, Index inner)
  {
-    Index row = rowIndexByOuterInner(outer, inner); 
-    Index col = colIndexByOuterInner(outer, inner); 
+    Index row = rowIndexByOuterInner(outer, inner);
+    Index col = colIndexByOuterInner(outer, inner);
    assignCoeff(row, col);
  }
-  
-  
+
+
  template<int StoreMode, int LoadMode, typename PacketType>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPacket(Index row, Index col)
  {
    m_functor.template assignPacket<StoreMode>(&m_dst.coeffRef(row,col), m_src.template packet<LoadMode,PacketType>(row,col));
  }
-  
+
  template<int StoreMode, int LoadMode, typename PacketType>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPacket(Index index)
  {
    m_functor.template assignPacket<StoreMode>(&m_dst.coeffRef(index), m_src.template packet<LoadMode,PacketType>(index));
  }
-  
+
  template<int StoreMode, int LoadMode, typename PacketType>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPacketByOuterInner(Index outer, Index inner)
  {
-    Index row = rowIndexByOuterInner(outer, inner); 
+    Index row = rowIndexByOuterInner(outer, inner);
    Index col = colIndexByOuterInner(outer, inner);
    assignPacket<StoreMode,LoadMode,PacketType>(row, col);
  }
-  
+
  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Index rowIndexByOuterInner(Index outer, Index inner)
  {
    typedef typename DstEvaluatorType::ExpressionTraits Traits;
@@ -683,7 +683,12 @@ public:
      : int(DstEvaluatorType::Flags)&RowMajorBit ? inner
      : outer;
  }
-  
+
+  EIGEN_DEVICE_FUNC const Scalar* dstDataPtr() const
+  {
+    return m_dstExpr.data();
+  }
+
 protected:
  DstEvaluatorType& m_dst;
  const SrcEvaluatorType& m_src;
@@ -696,6 +701,26 @@ protected:
 * Part 5 : Entry point for dense rectangular assignment
 ***************************************************************************/

+template<typename DstXprType,typename SrcXprType, typename Functor>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+void resize_if_allowed(DstXprType &dst, const SrcXprType& src, const Functor &/*func*/)
+{
+  EIGEN_ONLY_USED_FOR_DEBUG(dst);
+  EIGEN_ONLY_USED_FOR_DEBUG(src);
+  eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
+}
+
+template<typename DstXprType,typename SrcXprType, typename T1, typename T2>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+void resize_if_allowed(DstXprType &dst, const SrcXprType& src, const internal::assign_op<T1,T2> &/*func*/)
+{
+  Index dstRows = src.rows();
+  Index dstCols = src.cols();
+  if(((dst.rows()!=dstRows) || (dst.cols()!=dstCols)))
+    dst.resize(dstRows, dstCols);
+  eigen_assert(dst.rows() == dstRows && dst.cols() == dstCols);
+}
+
 template<typename DstXprType, typename SrcXprType, typename Functor>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(DstXprType& dst, const SrcXprType& src, const Functor &func)
 {
@@ -706,13 +731,10 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(DstXprType

  // NOTE To properly handle A = (A*A.transpose())/s with A rectangular,
  // we need to resize the destination after the source evaluator has been created.
-  Index dstRows = src.rows();
-  Index dstCols = src.cols();
-  if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
-    dst.resize(dstRows, dstCols);
+  resize_if_allowed(dst, src, func);

  DstEvaluatorType dstEvaluator(dst);
-    
+
  typedef generic_dense_assignment_kernel<DstEvaluatorType,SrcEvaluatorType,Functor> Kernel;
  Kernel kernel(dstEvaluator, srcEvaluator, func, dst.const_cast_derived());

@@ -740,7 +762,7 @@ struct EigenBase2EigenBase {};

 template<typename,typename> struct AssignmentKind { typedef EigenBase2EigenBase Kind; };
 template<> struct AssignmentKind<DenseShape,DenseShape> { typedef Dense2Dense Kind; };
-    
+
 // This is the main assignment class
 template< typename DstXprType, typename SrcXprType, typename Functor,
          typename Kind = typename AssignmentKind< typename evaluator_traits<DstXprType>::Shape , typename evaluator_traits<SrcXprType>::Shape >::Kind,
@@ -765,7 +787,7 @@ void call_assignment(const Dst& dst, const Src& src)
 {
  call_assignment(dst, src, internal::assign_op<typename Dst::Scalar,typename Src::Scalar>());
 }
-                     
+
 // Deal with "assume-aliasing"
 template<typename Dst, typename Src, typename Func>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
@@ -805,12 +827,12 @@ void call_assignment_no_alias(Dst& dst, const Src& src, const Func& func)
  typedef typename internal::conditional<NeedToTranspose, Transpose<Dst>, Dst>::type ActualDstTypeCleaned;
  typedef typename internal::conditional<NeedToTranspose, Transpose<Dst>, Dst&>::type ActualDstType;
  ActualDstType actualDst(dst);
-  
+
  // TODO check whether this is the right place to perform these checks:
  EIGEN_STATIC_ASSERT_LVALUE(Dst)
  EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(ActualDstTypeCleaned,Src)
  EIGEN_CHECK_BINARY_COMPATIBILIY(Func,typename ActualDstTypeCleaned::Scalar,typename Src::Scalar);
-  
+
  Assignment<ActualDstTypeCleaned,Src,Func>::run(actualDst, src, func);
 }
 template<typename Dst, typename Src>
@@ -847,13 +869,12 @@ template<typename Dst, typename Src> void check_for_aliasing(const Dst &dst, con
 template< typename DstXprType, typename SrcXprType, typename Functor, typename Weak>
 struct Assignment<DstXprType, SrcXprType, Functor, Dense2Dense, Weak>
 {
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const Functor &func)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const Functor &func)
  {
 #ifndef EIGEN_NO_DEBUG
    internal::check_for_aliasing(dst, src);
 #endif
-    
+
    call_dense_assignment_loop(dst, src, func);
  }
 };
@@ -865,8 +886,7 @@ struct Assignment<DstXprType, SrcXprType, Functor, Dense2Dense, Weak>
 template< typename DstXprType, typename SrcXprType, typename Functor, typename Weak>
 struct Assignment<DstXprType, SrcXprType, Functor, EigenBase2EigenBase, Weak>
 {
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
  {
    Index dstRows = src.rows();
    Index dstCols = src.cols();
@@ -876,6 +896,34 @@ struct Assignment<DstXprType, SrcXprType, Functor, EigenBase2EigenBase, Weak>
    eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
    src.evalTo(dst);
  }
+
+  // NOTE The following two functions are templated to avoid their instanciation if not needed
+  //      This is needed because some expressions supports evalTo only and/or have 'void' as scalar type.
+  template<typename SrcScalarType>
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<typename DstXprType::Scalar,SrcScalarType> &/*func*/)
+  {
+    Index dstRows = src.rows();
+    Index dstCols = src.cols();
+    if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
+      dst.resize(dstRows, dstCols);
+
+    eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
+    src.addTo(dst);
+  }
+
+  template<typename SrcScalarType>
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<typename DstXprType::Scalar,SrcScalarType> &/*func*/)
+  {
+    Index dstRows = src.rows();
+    Index dstCols = src.cols();
+    if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
+      dst.resize(dstRows, dstCols);
+
+    eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
+    src.subTo(dst);
+  }
 };

 } // namespace internal
--- a/Eigen/src/Core/Assign_MKL.h
+++ b/Eigen/src/Core/Assign_MKL.h
@@ -84,7 +84,8 @@ class vml_assign_traits
  struct Assignment<DstXprType, CwiseUnaryOp<scalar_##EIGENOP##_op<EIGENTYPE>, SrcXprNested>, assign_op<EIGENTYPE,EIGENTYPE>,   \
                   Dense2Dense, typename enable_if<vml_assign_traits<DstXprType,SrcXprNested>::EnableVml>::type> {              \
    typedef CwiseUnaryOp<scalar_##EIGENOP##_op<EIGENTYPE>, SrcXprNested> SrcXprType;                                            \
-    static void run(DstXprType &dst, const SrcXprType &src, const assign_op<EIGENTYPE,EIGENTYPE> &/*func*/) {                   \
+    static void run(DstXprType &dst, const SrcXprType &src, const assign_op<EIGENTYPE,EIGENTYPE> &func) {                       \
+      resize_if_allowed(dst, src, func);                                                                                        \
      eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());                                                       \
      if(vml_assign_traits<DstXprType,SrcXprNested>::Traversal==LinearTraversal) {                                              \
        VMLOP(dst.size(), (const VMLTYPE*)src.nestedExpression().data(),                                                        \
@@ -144,7 +145,8 @@ EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(ceil,  Ceil,   _)
                   Dense2Dense, typename enable_if<vml_assign_traits<DstXprType,SrcXprNested>::EnableVml>::type> {            \
    typedef CwiseBinaryOp<scalar_##EIGENOP##_op<EIGENTYPE,EIGENTYPE>, SrcXprNested,                                           \
                    const CwiseNullaryOp<internal::scalar_constant_op<EIGENTYPE>,Plain> > SrcXprType;                         \
-    static void run(DstXprType &dst, const SrcXprType &src, const assign_op<EIGENTYPE,EIGENTYPE> &/*func*/) {                 \
+    static void run(DstXprType &dst, const SrcXprType &src, const assign_op<EIGENTYPE,EIGENTYPE> &func) {                     \
+      resize_if_allowed(dst, src, func);                                                                                      \
      eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());                                                     \
      VMLTYPE exponent = reinterpret_cast<const VMLTYPE&>(src.rhs().functor().m_other);                                       \
      if(vml_assign_traits<DstXprType,SrcXprNested>::Traversal==LinearTraversal)                                              \
--- a/Eigen/src/Core/BooleanRedux.h
+++ b/Eigen/src/Core/BooleanRedux.h
@@ -23,7 +23,7 @@ struct all_unroller
    row = (UnrollCount-1) % Traits::RowsAtCompileTime
  };

-  static inline bool run(const Derived &mat)
+  EIGEN_DEVICE_FUNC static inline bool run(const Derived &mat)
  {
    return all_unroller<Derived, UnrollCount-1>::run(mat) && mat.coeff(row, col);
  }
@@ -32,13 +32,13 @@ struct all_unroller
 template<typename Derived>
 struct all_unroller<Derived, 0>
 {
-  static inline bool run(const Derived &/*mat*/) { return true; }
+  EIGEN_DEVICE_FUNC static inline bool run(const Derived &/*mat*/) { return true; }
 };

 template<typename Derived>
 struct all_unroller<Derived, Dynamic>
 {
-  static inline bool run(const Derived &) { return false; }
+  EIGEN_DEVICE_FUNC static inline bool run(const Derived &) { return false; }
 };

 template<typename Derived, int UnrollCount>
@@ -50,7 +50,7 @@ struct any_unroller
    row = (UnrollCount-1) % Traits::RowsAtCompileTime
  };
  
-  static inline bool run(const Derived &mat)
+  EIGEN_DEVICE_FUNC static inline bool run(const Derived &mat)
  {
    return any_unroller<Derived, UnrollCount-1>::run(mat) || mat.coeff(row, col);
  }
@@ -59,13 +59,13 @@ struct any_unroller
 template<typename Derived>
 struct any_unroller<Derived, 0>
 {
-  static inline bool run(const Derived & /*mat*/) { return false; }
+  EIGEN_DEVICE_FUNC static inline bool run(const Derived & /*mat*/) { return false; }
 };

 template<typename Derived>
 struct any_unroller<Derived, Dynamic>
 {
-  static inline bool run(const Derived &) { return false; }
+  EIGEN_DEVICE_FUNC static inline bool run(const Derived &) { return false; }
 };

 } // end namespace internal
@@ -78,7 +78,7 @@ struct any_unroller<Derived, Dynamic>
  * \sa any(), Cwise::operator<()
  */
 template<typename Derived>
-inline bool DenseBase<Derived>::all() const
+EIGEN_DEVICE_FUNC inline bool DenseBase<Derived>::all() const
 {
  typedef internal::evaluator<Derived> Evaluator;
  enum {
@@ -102,7 +102,7 @@ inline bool DenseBase<Derived>::all() const
  * \sa all()
  */
 template<typename Derived>
-inline bool DenseBase<Derived>::any() const
+EIGEN_DEVICE_FUNC inline bool DenseBase<Derived>::any() const
 {
  typedef internal::evaluator<Derived> Evaluator;
  enum {
@@ -126,7 +126,7 @@ inline bool DenseBase<Derived>::any() const
  * \sa all(), any()
  */
 template<typename Derived>
-inline Eigen::Index DenseBase<Derived>::count() const
+EIGEN_DEVICE_FUNC inline Eigen::Index DenseBase<Derived>::count() const
 {
  return derived().template cast<bool>().template cast<Index>().sum();
 }
--- a/Eigen/src/Core/CommaInitializer.h
+++ b/Eigen/src/Core/CommaInitializer.h
@@ -11,7 +11,7 @@
 #ifndef EIGEN_COMMAINITIALIZER_H
 #define EIGEN_COMMAINITIALIZER_H

-namespace Eigen { 
+namespace Eigen {

 /** \class CommaInitializer
  * \ingroup Core_Module
@@ -44,7 +44,7 @@ struct CommaInitializer
    m_xpr.block(0, 0, other.rows(), other.cols()) = other;
  }

-  /* Copy/Move constructor which transfers ownership. This is crucial in 
+  /* Copy/Move constructor which transfers ownership. This is crucial in
   * absence of return value optimization to avoid assertions during destruction. */
  // FIXME in C++11 mode this could be replaced by a proper RValue constructor
  EIGEN_DEVICE_FUNC
@@ -135,13 +135,13 @@ struct CommaInitializer
  *
  * Example: \include MatrixBase_set.cpp
  * Output: \verbinclude MatrixBase_set.out
-  * 
+  *
  * \note According the c++ standard, the argument expressions of this comma initializer are evaluated in arbitrary order.
  *
  * \sa CommaInitializer::finished(), class CommaInitializer
  */
 template<typename Derived>
-inline CommaInitializer<Derived> DenseBase<Derived>::operator<< (const Scalar& s)
+EIGEN_DEVICE_FUNC inline CommaInitializer<Derived> DenseBase<Derived>::operator<< (const Scalar& s)
 {
  return CommaInitializer<Derived>(*static_cast<Derived*>(this), s);
 }
@@ -149,7 +149,7 @@ inline CommaInitializer<Derived> DenseBase<Derived>::operator<< (const Scalar& s
 /** \sa operator<<(const Scalar&) */
 template<typename Derived>
 template<typename OtherDerived>
-inline CommaInitializer<Derived>
+EIGEN_DEVICE_FUNC inline CommaInitializer<Derived>
 DenseBase<Derived>::operator<<(const DenseBase<OtherDerived>& other)
 {
  return CommaInitializer<Derived>(*static_cast<Derived *>(this), other);
--- a/Eigen/src/Core/ConditionEstimator.h
+++ b/Eigen/src/Core/ConditionEstimator.h
@@ -160,7 +160,7 @@ rcond_estimate_helper(typename Decomposition::RealScalar matrix_norm, const Deco
 {
  typedef typename Decomposition::RealScalar RealScalar;
  eigen_assert(dec.rows() == dec.cols());
-  if (dec.rows() == 0)              return RealScalar(1);
+  if (dec.rows() == 0)              return NumTraits<RealScalar>::infinity();
  if (matrix_norm == RealScalar(0)) return RealScalar(0);
  if (dec.rows() == 1)              return RealScalar(1);
  const RealScalar inverse_matrix_norm = rcond_invmatrix_L1_norm_estimate(dec);
--- a/Eigen/src/Core/CoreEvaluators.h
+++ b/Eigen/src/Core/CoreEvaluators.h
@@ -977,7 +977,7 @@ struct evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel> >
    OuterStrideAtCompileTime = HasSameStorageOrderAsArgType
                             ? int(outer_stride_at_compile_time<ArgType>::ret)
                             : int(inner_stride_at_compile_time<ArgType>::ret),
-    MaskPacketAccessBit = (InnerStrideAtCompileTime == 1) ? PacketAccessBit : 0,
+    MaskPacketAccessBit = (InnerStrideAtCompileTime == 1 || HasSameStorageOrderAsArgType) ? PacketAccessBit : 0,
    
    FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1 || (InnerPanel && (evaluator<ArgType>::Flags&LinearAccessBit))) ? LinearAccessBit : 0,    
    FlagsRowMajorBit = XprType::Flags&RowMajorBit,
@@ -987,7 +987,9 @@ struct evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel> >
    Flags = Flags0 | FlagsLinearAccessBit | FlagsRowMajorBit,
    
    PacketAlignment = unpacket_traits<PacketScalar>::alignment,
-    Alignment0 = (InnerPanel && (OuterStrideAtCompileTime!=Dynamic) && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % int(PacketAlignment)) == 0)) ? int(PacketAlignment) : 0,
+    Alignment0 = (InnerPanel && (OuterStrideAtCompileTime!=Dynamic)
+                             && (OuterStrideAtCompileTime!=0)
+                             && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % int(PacketAlignment)) == 0)) ? int(PacketAlignment) : 0,
    Alignment = EIGEN_PLAIN_ENUM_MIN(evaluator<ArgType>::Alignment, Alignment0)
  };
  typedef block_evaluator<ArgType, BlockRows, BlockCols, InnerPanel> block_evaluator_type;
@@ -1018,14 +1020,16 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
  EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& block)
    : m_argImpl(block.nestedExpression()), 
      m_startRow(block.startRow()), 
-      m_startCol(block.startCol()) 
+      m_startCol(block.startCol()),
+      m_linear_offset(InnerPanel?(XprType::IsRowMajor ? block.startRow()*block.cols() : block.startCol()*block.rows()):0)
  { }
 
  typedef typename XprType::Scalar Scalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;

  enum {
-    RowsAtCompileTime = XprType::RowsAtCompileTime
+    RowsAtCompileTime = XprType::RowsAtCompileTime,
+    ForwardLinearAccess = InnerPanel && bool(evaluator<ArgType>::Flags&LinearAccessBit)
  };
 
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
@@ -1037,7 +1041,10 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  CoeffReturnType coeff(Index index) const
  { 
-    return coeff(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0);
+    if (ForwardLinearAccess)
+      return m_argImpl.coeff(m_linear_offset.value() + index); 
+    else
+      return coeff(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
@@ -1049,7 +1056,10 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  Scalar& coeffRef(Index index)
  { 
-    return coeffRef(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0);
+    if (ForwardLinearAccess)
+      return m_argImpl.coeffRef(m_linear_offset.value() + index); 
+    else
+      return coeffRef(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0);
  }
 
  template<int LoadMode, typename PacketType>
@@ -1063,8 +1073,11 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
  EIGEN_STRONG_INLINE
  PacketType packet(Index index) const 
  { 
-    return packet<LoadMode,PacketType>(RowsAtCompileTime == 1 ? 0 : index,
-                                       RowsAtCompileTime == 1 ? index : 0);
+    if (ForwardLinearAccess)
+      return m_argImpl.template packet<LoadMode,PacketType>(m_linear_offset.value() + index);
+    else
+      return packet<LoadMode,PacketType>(RowsAtCompileTime == 1 ? 0 : index,
+                                         RowsAtCompileTime == 1 ? index : 0);
  }
  
  template<int StoreMode, typename PacketType>
@@ -1078,15 +1091,19 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
  EIGEN_STRONG_INLINE
  void writePacket(Index index, const PacketType& x) 
  {
-    return writePacket<StoreMode,PacketType>(RowsAtCompileTime == 1 ? 0 : index,
-                                             RowsAtCompileTime == 1 ? index : 0,
-                                             x);
+    if (ForwardLinearAccess)
+      return m_argImpl.template writePacket<StoreMode,PacketType>(m_linear_offset.value() + index, x);
+    else
+      return writePacket<StoreMode,PacketType>(RowsAtCompileTime == 1 ? 0 : index,
+                                              RowsAtCompileTime == 1 ? index : 0,
+                                              x);
  }
 
 protected:
  evaluator<ArgType> m_argImpl;
  const variable_if_dynamic<Index, (ArgType::RowsAtCompileTime == 1 && BlockRows==1) ? 0 : Dynamic> m_startRow;
  const variable_if_dynamic<Index, (ArgType::ColsAtCompileTime == 1 && BlockCols==1) ? 0 : Dynamic> m_startCol;
+  const variable_if_dynamic<Index, InnerPanel ? Dynamic : 0> m_linear_offset;
 };

 // TODO: This evaluator does not actually use the child evaluator; 
@@ -1556,9 +1573,7 @@ struct evaluator<Diagonal<ArgType, DiagIndex> >
  { }
 
  typedef typename XprType::Scalar Scalar;
-  // FIXME having to check whether ArgType is sparse here i not very nice.
-  typedef typename internal::conditional<!internal::is_same<typename ArgType::StorageKind,Sparse>::value,
-                                         typename XprType::CoeffReturnType,Scalar>::type CoeffReturnType;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  CoeffReturnType coeff(Index row, Index) const
--- a/Eigen/src/Core/CwiseBinaryOp.h
+++ b/Eigen/src/Core/CwiseBinaryOp.h
@@ -46,7 +46,7 @@ struct traits<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >
  typedef typename remove_reference<LhsNested>::type _LhsNested;
  typedef typename remove_reference<RhsNested>::type _RhsNested;
  enum {
-    Flags = _LhsNested::Flags & RowMajorBit
+    Flags = cwise_promote_storage_order<typename traits<Lhs>::StorageKind,typename traits<Rhs>::StorageKind,_LhsNested::Flags & RowMajorBit,_RhsNested::Flags & RowMajorBit>::value
  };
 };
 } // end namespace internal
@@ -74,7 +74,7 @@ class CwiseBinaryOpImpl;
  * \sa MatrixBase::binaryExpr(const MatrixBase<OtherDerived> &,const CustomBinaryOp &) const, class CwiseUnaryOp, class CwiseNullaryOp
  */
 template<typename BinaryOp, typename LhsType, typename RhsType>
-class CwiseBinaryOp : 
+class CwiseBinaryOp :
  public CwiseBinaryOpImpl<
          BinaryOp, LhsType, RhsType,
          typename internal::cwise_promote_storage_type<typename internal::traits<LhsType>::StorageKind,
@@ -83,7 +83,8 @@ class CwiseBinaryOp :
  internal::no_assignment_operator
 {
  public:
-    
+
+    typedef typename internal::remove_all<BinaryOp>::type Functor;
    typedef typename internal::remove_all<LhsType>::type Lhs;
    typedef typename internal::remove_all<RhsType>::type Rhs;

@@ -157,7 +158,7 @@ public:
  */
 template<typename Derived>
 template<typename OtherDerived>
-EIGEN_STRONG_INLINE Derived &
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived &
 MatrixBase<Derived>::operator-=(const MatrixBase<OtherDerived> &other)
 {
  call_assignment(derived(), other.derived(), internal::sub_assign_op<Scalar,typename OtherDerived::Scalar>());
@@ -170,7 +171,7 @@ MatrixBase<Derived>::operator-=(const MatrixBase<OtherDerived> &other)
  */
 template<typename Derived>
 template<typename OtherDerived>
-EIGEN_STRONG_INLINE Derived &
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived &
 MatrixBase<Derived>::operator+=(const MatrixBase<OtherDerived>& other)
 {
  call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar,typename OtherDerived::Scalar>());
--- a/Eigen/src/Core/CwiseNullaryOp.h
+++ b/Eigen/src/Core/CwiseNullaryOp.h
@@ -105,7 +105,7 @@ class CwiseNullaryOp : public internal::dense_xpr_base< CwiseNullaryOp<NullaryOp
  */
 template<typename Derived>
 template<typename CustomNullaryOp>
-EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
 DenseBase<Derived>::NullaryExpr(Index rows, Index cols, const CustomNullaryOp& func)
 {
  return CwiseNullaryOp<CustomNullaryOp, PlainObject>(rows, cols, func);
@@ -126,12 +126,12 @@ DenseBase<Derived>::NullaryExpr(Index rows, Index cols, const CustomNullaryOp& f
  *
  * Here is an example with C++11 random generators: \include random_cpp11.cpp
  * Output: \verbinclude random_cpp11.out
-  * 
+  *
  * \sa class CwiseNullaryOp
  */
 template<typename Derived>
 template<typename CustomNullaryOp>
-EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
 DenseBase<Derived>::NullaryExpr(Index size, const CustomNullaryOp& func)
 {
  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
@@ -150,7 +150,7 @@ DenseBase<Derived>::NullaryExpr(Index size, const CustomNullaryOp& func)
  */
 template<typename Derived>
 template<typename CustomNullaryOp>
-EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
 DenseBase<Derived>::NullaryExpr(const CustomNullaryOp& func)
 {
  return CwiseNullaryOp<CustomNullaryOp, PlainObject>(RowsAtCompileTime, ColsAtCompileTime, func);
@@ -170,7 +170,7 @@ DenseBase<Derived>::NullaryExpr(const CustomNullaryOp& func)
  * \sa class CwiseNullaryOp
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
 DenseBase<Derived>::Constant(Index rows, Index cols, const Scalar& value)
 {
  return DenseBase<Derived>::NullaryExpr(rows, cols, internal::scalar_constant_op<Scalar>(value));
@@ -192,7 +192,7 @@ DenseBase<Derived>::Constant(Index rows, Index cols, const Scalar& value)
  * \sa class CwiseNullaryOp
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
 DenseBase<Derived>::Constant(Index size, const Scalar& value)
 {
  return DenseBase<Derived>::NullaryExpr(size, internal::scalar_constant_op<Scalar>(value));
@@ -208,7 +208,7 @@ DenseBase<Derived>::Constant(Index size, const Scalar& value)
  * \sa class CwiseNullaryOp
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
 DenseBase<Derived>::Constant(const Scalar& value)
 {
  EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived)
@@ -220,7 +220,7 @@ DenseBase<Derived>::Constant(const Scalar& value)
  * \sa LinSpaced(Index,Scalar,Scalar), setLinSpaced(Index,const Scalar&,const Scalar&)
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
 DenseBase<Derived>::LinSpaced(Sequential_t, Index size, const Scalar& low, const Scalar& high)
 {
  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
@@ -232,7 +232,7 @@ DenseBase<Derived>::LinSpaced(Sequential_t, Index size, const Scalar& low, const
  * \sa LinSpaced(Scalar,Scalar)
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
 DenseBase<Derived>::LinSpaced(Sequential_t, const Scalar& low, const Scalar& high)
 {
  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
@@ -264,7 +264,7 @@ DenseBase<Derived>::LinSpaced(Sequential_t, const Scalar& low, const Scalar& hig
  * \sa setLinSpaced(Index,const Scalar&,const Scalar&), CwiseNullaryOp
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
 DenseBase<Derived>::LinSpaced(Index size, const Scalar& low, const Scalar& high)
 {
  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
@@ -272,11 +272,11 @@ DenseBase<Derived>::LinSpaced(Index size, const Scalar& low, const Scalar& high)
 }

 /**
-  * \copydoc DenseBase::LinSpaced(Index, const Scalar&, const Scalar&)
+  * \copydoc DenseBase::LinSpaced(Index, const DenseBase::Scalar&, const DenseBase::Scalar&)
  * Special version for fixed size types which does not require the size parameter.
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
 DenseBase<Derived>::LinSpaced(const Scalar& low, const Scalar& high)
 {
  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
@@ -286,7 +286,7 @@ DenseBase<Derived>::LinSpaced(const Scalar& low, const Scalar& high)

 /** \returns true if all coefficients in this matrix are approximately equal to \a val, to within precision \a prec */
 template<typename Derived>
-bool DenseBase<Derived>::isApproxToConstant
+EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isApproxToConstant
 (const Scalar& val, const RealScalar& prec) const
 {
  typename internal::nested_eval<Derived,1>::type self(derived());
@@ -301,7 +301,7 @@ bool DenseBase<Derived>::isApproxToConstant
  *
  * \returns true if all coefficients in this matrix are approximately equal to \a value, to within precision \a prec */
 template<typename Derived>
-bool DenseBase<Derived>::isConstant
+EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isConstant
 (const Scalar& val, const RealScalar& prec) const
 {
  return isApproxToConstant(val, prec);
@@ -312,7 +312,7 @@ bool DenseBase<Derived>::isConstant
  * \sa setConstant(), Constant(), class CwiseNullaryOp
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE void DenseBase<Derived>::fill(const Scalar& val)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void DenseBase<Derived>::fill(const Scalar& val)
 {
  setConstant(val);
 }
@@ -322,7 +322,7 @@ EIGEN_STRONG_INLINE void DenseBase<Derived>::fill(const Scalar& val)
  * \sa fill(), setConstant(Index,const Scalar&), setConstant(Index,Index,const Scalar&), setZero(), setOnes(), Constant(), class CwiseNullaryOp, setZero(), setOnes()
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setConstant(const Scalar& val)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setConstant(const Scalar& val)
 {
  return derived() = Constant(rows(), cols(), val);
 }
@@ -337,7 +337,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setConstant(const Scalar& val)
  * \sa MatrixBase::setConstant(const Scalar&), setConstant(Index,Index,const Scalar&), class CwiseNullaryOp, MatrixBase::Constant(const Scalar&)
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived&
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
 PlainObjectBase<Derived>::setConstant(Index size, const Scalar& val)
 {
  resize(size);
@@ -356,7 +356,7 @@ PlainObjectBase<Derived>::setConstant(Index size, const Scalar& val)
  * \sa MatrixBase::setConstant(const Scalar&), setConstant(Index,const Scalar&), class CwiseNullaryOp, MatrixBase::Constant(const Scalar&)
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived&
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
 PlainObjectBase<Derived>::setConstant(Index rows, Index cols, const Scalar& val)
 {
  resize(rows, cols);
@@ -380,7 +380,7 @@ PlainObjectBase<Derived>::setConstant(Index rows, Index cols, const Scalar& val)
  * \sa LinSpaced(Index,const Scalar&,const Scalar&), CwiseNullaryOp
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(Index newSize, const Scalar& low, const Scalar& high)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(Index newSize, const Scalar& low, const Scalar& high)
 {
  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
  return derived() = Derived::NullaryExpr(newSize, internal::linspaced_op<Scalar,PacketScalar>(low,high,newSize));
@@ -400,7 +400,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(Index newSize, con
  * \sa LinSpaced(Index,const Scalar&,const Scalar&), setLinSpaced(Index, const Scalar&, const Scalar&), CwiseNullaryOp
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(const Scalar& low, const Scalar& high)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(const Scalar& low, const Scalar& high)
 {
  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
  return setLinSpaced(size(), low, high);
@@ -423,7 +423,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(const Scalar& low,
  * \sa Zero(), Zero(Index)
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
 DenseBase<Derived>::Zero(Index rows, Index cols)
 {
  return Constant(rows, cols, Scalar(0));
@@ -446,7 +446,7 @@ DenseBase<Derived>::Zero(Index rows, Index cols)
  * \sa Zero(), Zero(Index,Index)
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
 DenseBase<Derived>::Zero(Index size)
 {
  return Constant(size, Scalar(0));
@@ -463,7 +463,7 @@ DenseBase<Derived>::Zero(Index size)
  * \sa Zero(Index), Zero(Index,Index)
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
 DenseBase<Derived>::Zero()
 {
  return Constant(Scalar(0));
@@ -478,7 +478,7 @@ DenseBase<Derived>::Zero()
  * \sa class CwiseNullaryOp, Zero()
  */
 template<typename Derived>
-bool DenseBase<Derived>::isZero(const RealScalar& prec) const
+EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isZero(const RealScalar& prec) const
 {
  typename internal::nested_eval<Derived,1>::type self(derived());
  for(Index j = 0; j < cols(); ++j)
@@ -496,7 +496,7 @@ bool DenseBase<Derived>::isZero(const RealScalar& prec) const
  * \sa class CwiseNullaryOp, Zero()
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setZero()
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setZero()
 {
  return setConstant(Scalar(0));
 }
@@ -511,7 +511,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setZero()
  * \sa DenseBase::setZero(), setZero(Index,Index), class CwiseNullaryOp, DenseBase::Zero()
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived&
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
 PlainObjectBase<Derived>::setZero(Index newSize)
 {
  resize(newSize);
@@ -529,7 +529,7 @@ PlainObjectBase<Derived>::setZero(Index newSize)
  * \sa DenseBase::setZero(), setZero(Index), class CwiseNullaryOp, DenseBase::Zero()
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived&
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
 PlainObjectBase<Derived>::setZero(Index rows, Index cols)
 {
  resize(rows, cols);
@@ -553,7 +553,7 @@ PlainObjectBase<Derived>::setZero(Index rows, Index cols)
  * \sa Ones(), Ones(Index), isOnes(), class Ones
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
 DenseBase<Derived>::Ones(Index rows, Index cols)
 {
  return Constant(rows, cols, Scalar(1));
@@ -576,7 +576,7 @@ DenseBase<Derived>::Ones(Index rows, Index cols)
  * \sa Ones(), Ones(Index,Index), isOnes(), class Ones
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
 DenseBase<Derived>::Ones(Index newSize)
 {
  return Constant(newSize, Scalar(1));
@@ -593,7 +593,7 @@ DenseBase<Derived>::Ones(Index newSize)
  * \sa Ones(Index), Ones(Index,Index), isOnes(), class Ones
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
 DenseBase<Derived>::Ones()
 {
  return Constant(Scalar(1));
@@ -608,7 +608,7 @@ DenseBase<Derived>::Ones()
  * \sa class CwiseNullaryOp, Ones()
  */
 template<typename Derived>
-bool DenseBase<Derived>::isOnes
+EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isOnes
 (const RealScalar& prec) const
 {
  return isApproxToConstant(Scalar(1), prec);
@@ -622,7 +622,7 @@ bool DenseBase<Derived>::isOnes
  * \sa class CwiseNullaryOp, Ones()
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setOnes()
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setOnes()
 {
  return setConstant(Scalar(1));
 }
@@ -637,7 +637,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setOnes()
  * \sa MatrixBase::setOnes(), setOnes(Index,Index), class CwiseNullaryOp, MatrixBase::Ones()
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived&
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
 PlainObjectBase<Derived>::setOnes(Index newSize)
 {
  resize(newSize);
@@ -655,7 +655,7 @@ PlainObjectBase<Derived>::setOnes(Index newSize)
  * \sa MatrixBase::setOnes(), setOnes(Index), class CwiseNullaryOp, MatrixBase::Ones()
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived&
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
 PlainObjectBase<Derived>::setOnes(Index rows, Index cols)
 {
  resize(rows, cols);
@@ -679,7 +679,7 @@ PlainObjectBase<Derived>::setOnes(Index rows, Index cols)
  * \sa Identity(), setIdentity(), isIdentity()
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::IdentityReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::IdentityReturnType
 MatrixBase<Derived>::Identity(Index rows, Index cols)
 {
  return DenseBase<Derived>::NullaryExpr(rows, cols, internal::scalar_identity_op<Scalar>());
@@ -696,7 +696,7 @@ MatrixBase<Derived>::Identity(Index rows, Index cols)
  * \sa Identity(Index,Index), setIdentity(), isIdentity()
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::IdentityReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::IdentityReturnType
 MatrixBase<Derived>::Identity()
 {
  EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived)
@@ -771,7 +771,7 @@ struct setIdentity_impl<Derived, true>
  * \sa class CwiseNullaryOp, Identity(), Identity(Index,Index), isIdentity()
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity()
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity()
 {
  return internal::setIdentity_impl<Derived>::run(derived());
 }
@@ -787,7 +787,7 @@ EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity()
  * \sa MatrixBase::setIdentity(), class CwiseNullaryOp, MatrixBase::Identity()
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity(Index rows, Index cols)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity(Index rows, Index cols)
 {
  derived().resize(rows, cols);
  return setIdentity();
@@ -800,7 +800,7 @@ EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity(Index rows, Index
  * \sa MatrixBase::Unit(Index), MatrixBase::UnitX(), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::Unit(Index newSize, Index i)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::Unit(Index newSize, Index i)
 {
  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
  return BasisReturnType(SquareMatrixType::Identity(newSize,newSize), i);
@@ -815,7 +815,7 @@ EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBa
  * \sa MatrixBase::Unit(Index,Index), MatrixBase::UnitX(), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::Unit(Index i)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::Unit(Index i)
 {
  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
  return BasisReturnType(SquareMatrixType::Identity(),i);
@@ -828,7 +828,7 @@ EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBa
  * \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitX()
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitX()
 { return Derived::Unit(0); }

 /** \returns an expression of the Y axis unit vector (0,1{,0}^*)
@@ -838,7 +838,7 @@ EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBa
  * \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitY()
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitY()
 { return Derived::Unit(1); }

 /** \returns an expression of the Z axis unit vector (0,0,1{,0}^*)
@@ -848,7 +848,7 @@ EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBa
  * \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitZ()
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitZ()
 { return Derived::Unit(2); }

 /** \returns an expression of the W axis unit vector (0,0,0,1)
@@ -858,7 +858,7 @@ EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBa
  * \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitW()
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitW()
 { return Derived::Unit(3); }

 } // end namespace Eigen
--- a/Eigen/src/Core/CwiseUnaryView.h
+++ b/Eigen/src/Core/CwiseUnaryView.h
@@ -121,6 +121,8 @@ class CwiseUnaryViewImpl<ViewOp,MatrixType,Dense>
    {
      return derived().nestedExpression().outerStride() * sizeof(typename internal::traits<MatrixType>::Scalar) / sizeof(Scalar);
    }
+  protected:
+    EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(CwiseUnaryViewImpl)
 };

 } // end namespace Eigen
--- a/Eigen/src/Core/DenseBase.h
+++ b/Eigen/src/Core/DenseBase.h
@@ -40,7 +40,7 @@ static inline void check_DenseIndex_is_signed() {
  */
 template<typename Derived> class DenseBase
 #ifndef EIGEN_PARSED_BY_DOXYGEN
-  : public DenseCoeffsBase<Derived>
+  : public DenseCoeffsBase<Derived, internal::accessors_level<Derived>::value>
 #else
  : public DenseCoeffsBase<Derived,DirectWriteAccessors>
 #endif // not EIGEN_PARSED_BY_DOXYGEN
@@ -71,7 +71,7 @@ template<typename Derived> class DenseBase
    typedef Scalar value_type;
    
    typedef typename NumTraits<Scalar>::Real RealScalar;
-    typedef DenseCoeffsBase<Derived> Base;
+    typedef DenseCoeffsBase<Derived, internal::accessors_level<Derived>::value> Base;

    using Base::derived;
    using Base::const_cast_derived;
@@ -296,7 +296,7 @@ template<typename Derived> class DenseBase
    EIGEN_DEVICE_FUNC
    Derived& operator=(const ReturnByValue<OtherDerived>& func);

-    /** \ínternal
+    /** \internal
      * Copies \a other into *this without evaluating other. \returns a reference to *this.
      * \deprecated */
    template<typename OtherDerived>
@@ -463,7 +463,17 @@ template<typename Derived> class DenseBase
    EIGEN_DEVICE_FUNC
    void visit(Visitor& func) const;

-    inline const WithFormat<Derived> format(const IOFormat& fmt) const;
+    /** \returns a WithFormat proxy object allowing to print a matrix the with given
+      * format \a fmt.
+      *
+      * See class IOFormat for some examples.
+      *
+      * \sa class IOFormat, class WithFormat
+      */
+    inline const WithFormat<Derived> format(const IOFormat& fmt) const
+    {
+      return WithFormat<Derived>(derived(), fmt);
+    }

    /** \returns the unique coefficient of a 1x1 expression */
    EIGEN_DEVICE_FUNC
@@ -474,9 +484,9 @@ template<typename Derived> class DenseBase
      return derived().coeff(0,0);
    }

-    bool all() const;
-    bool any() const;
-    Index count() const;
+    EIGEN_DEVICE_FUNC bool all() const;
+    EIGEN_DEVICE_FUNC bool any() const;
+    EIGEN_DEVICE_FUNC Index count() const;

    typedef VectorwiseOp<Derived, Horizontal> RowwiseReturnType;
    typedef const VectorwiseOp<const Derived, Horizontal> ConstRowwiseReturnType;
@@ -577,11 +587,12 @@ template<typename Derived> class DenseBase
    }

  protected:
+    EIGEN_DEFAULT_COPY_CONSTRUCTOR(DenseBase)
    /** Default constructor. Do nothing. */
    EIGEN_DEVICE_FUNC DenseBase()
    {
      /* Just checks for self-consistency of the flags.
-       * Only do it when debugging Eigen, as this borders on paranoiac and could slow compilation down
+       * Only do it when debugging Eigen, as this borders on paranoia and could slow compilation down
       */
 #ifdef EIGEN_INTERNAL_DEBUGGING
      EIGEN_STATIC_ASSERT((EIGEN_IMPLIES(MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1, int(IsRowMajor))
--- a/Eigen/src/Core/DenseStorage.h
+++ b/Eigen/src/Core/DenseStorage.h
@@ -13,9 +13,9 @@
 #define EIGEN_MATRIXSTORAGE_H

 #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
-  #define EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN EIGEN_DENSE_STORAGE_CTOR_PLUGIN;
+  #define EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(X) X; EIGEN_DENSE_STORAGE_CTOR_PLUGIN;
 #else
-  #define EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
+  #define EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(X)
 #endif

 namespace Eigen {
@@ -184,12 +184,16 @@ template<typename T, int Size, int _Rows, int _Cols, int _Options> class DenseSt
 {
    internal::plain_array<T,Size,_Options> m_data;
  public:
-    EIGEN_DEVICE_FUNC DenseStorage() {}
+    EIGEN_DEVICE_FUNC DenseStorage() {
+      EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = Size)
+    }
    EIGEN_DEVICE_FUNC
    explicit DenseStorage(internal::constructor_without_unaligned_array_assert)
      : m_data(internal::constructor_without_unaligned_array_assert()) {}
    EIGEN_DEVICE_FUNC 
-    DenseStorage(const DenseStorage& other) : m_data(other.m_data) {}
+    DenseStorage(const DenseStorage& other) : m_data(other.m_data) {
+      EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = Size)
+    }
    EIGEN_DEVICE_FUNC 
    DenseStorage& operator=(const DenseStorage& other)
    { 
@@ -197,7 +201,7 @@ template<typename T, int Size, int _Rows, int _Cols, int _Options> class DenseSt
      return *this; 
    }
    EIGEN_DEVICE_FUNC DenseStorage(Index size, Index rows, Index cols) {
-      EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
+      EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
      eigen_internal_assert(size==rows*cols && rows==_Rows && cols==_Cols);
      EIGEN_UNUSED_VARIABLE(size);
      EIGEN_UNUSED_VARIABLE(rows);
@@ -343,7 +347,7 @@ template<typename T, int _Options> class DenseStorage<T, Dynamic, Dynamic, Dynam
    EIGEN_DEVICE_FUNC DenseStorage(Index size, Index rows, Index cols)
      : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size)), m_rows(rows), m_cols(cols)
    {
-      EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
+      EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
      eigen_internal_assert(size==rows*cols && rows>=0 && cols >=0);
    }
    EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other)
@@ -351,6 +355,7 @@ template<typename T, int _Options> class DenseStorage<T, Dynamic, Dynamic, Dynam
      , m_rows(other.m_rows)
      , m_cols(other.m_cols)
    {
+      EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = m_rows*m_cols)
      internal::smart_copy(other.m_data, other.m_data+other.m_rows*other.m_cols, m_data);
    }
    EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other)
@@ -399,11 +404,11 @@ template<typename T, int _Options> class DenseStorage<T, Dynamic, Dynamic, Dynam
      if(size != m_rows*m_cols)
      {
        internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, m_rows*m_cols);
-        if (size)
+        if (size>0) // >0 and not simply !=0 to let the compiler knows that size cannot be negative
          m_data = internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size);
        else
          m_data = 0;
-        EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
+        EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
      }
      m_rows = rows;
      m_cols = cols;
@@ -422,7 +427,7 @@ template<typename T, int _Rows, int _Options> class DenseStorage<T, Dynamic, _Ro
    explicit DenseStorage(internal::constructor_without_unaligned_array_assert) : m_data(0), m_cols(0) {}
    EIGEN_DEVICE_FUNC DenseStorage(Index size, Index rows, Index cols) : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size)), m_cols(cols)
    {
-      EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
+      EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
      eigen_internal_assert(size==rows*cols && rows==_Rows && cols >=0);
      EIGEN_UNUSED_VARIABLE(rows);
    }
@@ -430,6 +435,7 @@ template<typename T, int _Rows, int _Options> class DenseStorage<T, Dynamic, _Ro
      : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(_Rows*other.m_cols))
      , m_cols(other.m_cols)
    {
+      EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = m_cols*_Rows)
      internal::smart_copy(other.m_data, other.m_data+_Rows*m_cols, m_data);
    }
    EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other)
@@ -473,11 +479,11 @@ template<typename T, int _Rows, int _Options> class DenseStorage<T, Dynamic, _Ro
      if(size != _Rows*m_cols)
      {
        internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Rows*m_cols);
-        if (size)
+        if (size>0) // >0 and not simply !=0 to let the compiler knows that size cannot be negative
          m_data = internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size);
        else
          m_data = 0;
-        EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
+        EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
      }
      m_cols = cols;
    }
@@ -495,7 +501,7 @@ template<typename T, int _Cols, int _Options> class DenseStorage<T, Dynamic, Dyn
    explicit DenseStorage(internal::constructor_without_unaligned_array_assert) : m_data(0), m_rows(0) {}
    EIGEN_DEVICE_FUNC DenseStorage(Index size, Index rows, Index cols) : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size)), m_rows(rows)
    {
-      EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
+      EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
      eigen_internal_assert(size==rows*cols && rows>=0 && cols == _Cols);
      EIGEN_UNUSED_VARIABLE(cols);
    }
@@ -503,6 +509,7 @@ template<typename T, int _Cols, int _Options> class DenseStorage<T, Dynamic, Dyn
      : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(other.m_rows*_Cols))
      , m_rows(other.m_rows)
    {
+      EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = m_rows*_Cols)
      internal::smart_copy(other.m_data, other.m_data+other.m_rows*_Cols, m_data);
    }
    EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other)
@@ -546,11 +553,11 @@ template<typename T, int _Cols, int _Options> class DenseStorage<T, Dynamic, Dyn
      if(size != m_rows*_Cols)
      {
        internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Cols*m_rows);
-        if (size)
+        if (size>0) // >0 and not simply !=0 to let the compiler knows that size cannot be negative
          m_data = internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size);
        else
          m_data = 0;
-        EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
+        EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
      }
      m_rows = rows;
    }
--- a/Eigen/src/Core/Diagonal.h
+++ b/Eigen/src/Core/Diagonal.h
@@ -11,7 +11,7 @@
 #ifndef EIGEN_DIAGONAL_H
 #define EIGEN_DIAGONAL_H

-namespace Eigen { 
+namespace Eigen {

 /** \class Diagonal
  * \ingroup Core_Module
@@ -21,7 +21,7 @@ namespace Eigen {
  * \param MatrixType the type of the object in which we are taking a sub/main/super diagonal
  * \param DiagIndex the index of the sub/super diagonal. The default is 0 and it means the main diagonal.
  *              A positive value means a superdiagonal, a negative value means a subdiagonal.
-  *              You can also use Dynamic so the index can be set at runtime.
+  *              You can also use DynamicIndex so the index can be set at runtime.
  *
  * The matrix is not required to be square.
  *
@@ -70,7 +70,10 @@ template<typename MatrixType, int _DiagIndex> class Diagonal
    EIGEN_DENSE_PUBLIC_INTERFACE(Diagonal)

    EIGEN_DEVICE_FUNC
-    explicit inline Diagonal(MatrixType& matrix, Index a_index = DiagIndex) : m_matrix(matrix), m_index(a_index) {}
+    explicit inline Diagonal(MatrixType& matrix, Index a_index = DiagIndex) : m_matrix(matrix), m_index(a_index)
+    {
+      eigen_assert( a_index <= m_matrix.cols() && -a_index <= m_matrix.rows() );
+    }

    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Diagonal)

@@ -146,8 +149,8 @@ template<typename MatrixType, int _DiagIndex> class Diagonal
    }

    EIGEN_DEVICE_FUNC
-    inline const typename internal::remove_all<typename MatrixType::Nested>::type& 
-    nestedExpression() const 
+    inline const typename internal::remove_all<typename MatrixType::Nested>::type&
+    nestedExpression() const
    {
      return m_matrix;
    }
@@ -184,7 +187,7 @@ template<typename MatrixType, int _DiagIndex> class Diagonal
  *
  * \sa class Diagonal */
 template<typename Derived>
-inline typename MatrixBase<Derived>::DiagonalReturnType
+EIGEN_DEVICE_FUNC inline typename MatrixBase<Derived>::DiagonalReturnType
 MatrixBase<Derived>::diagonal()
 {
  return DiagonalReturnType(derived());
@@ -192,7 +195,7 @@ MatrixBase<Derived>::diagonal()

 /** This is the const version of diagonal(). */
 template<typename Derived>
-inline typename MatrixBase<Derived>::ConstDiagonalReturnType
+EIGEN_DEVICE_FUNC inline typename MatrixBase<Derived>::ConstDiagonalReturnType
 MatrixBase<Derived>::diagonal() const
 {
  return ConstDiagonalReturnType(derived());
@@ -210,7 +213,7 @@ MatrixBase<Derived>::diagonal() const
  *
  * \sa MatrixBase::diagonal(), class Diagonal */
 template<typename Derived>
-inline typename MatrixBase<Derived>::DiagonalDynamicIndexReturnType
+EIGEN_DEVICE_FUNC inline typename MatrixBase<Derived>::DiagonalDynamicIndexReturnType
 MatrixBase<Derived>::diagonal(Index index)
 {
  return DiagonalDynamicIndexReturnType(derived(), index);
@@ -218,7 +221,7 @@ MatrixBase<Derived>::diagonal(Index index)

 /** This is the const version of diagonal(Index). */
 template<typename Derived>
-inline typename MatrixBase<Derived>::ConstDiagonalDynamicIndexReturnType
+EIGEN_DEVICE_FUNC inline typename MatrixBase<Derived>::ConstDiagonalDynamicIndexReturnType
 MatrixBase<Derived>::diagonal(Index index) const
 {
  return ConstDiagonalDynamicIndexReturnType(derived(), index);
@@ -237,7 +240,7 @@ MatrixBase<Derived>::diagonal(Index index) const
  * \sa MatrixBase::diagonal(), class Diagonal */
 template<typename Derived>
 template<int Index_>
-inline typename MatrixBase<Derived>::template DiagonalIndexReturnType<Index_>::Type
+EIGEN_DEVICE_FUNC inline typename MatrixBase<Derived>::template DiagonalIndexReturnType<Index_>::Type
 MatrixBase<Derived>::diagonal()
 {
  return typename DiagonalIndexReturnType<Index_>::Type(derived());
@@ -246,7 +249,7 @@ MatrixBase<Derived>::diagonal()
 /** This is the const version of diagonal<int>(). */
 template<typename Derived>
 template<int Index_>
-inline typename MatrixBase<Derived>::template ConstDiagonalIndexReturnType<Index_>::Type
+EIGEN_DEVICE_FUNC inline typename MatrixBase<Derived>::template ConstDiagonalIndexReturnType<Index_>::Type
 MatrixBase<Derived>::diagonal() const
 {
  return typename ConstDiagonalIndexReturnType<Index_>::Type(derived());
--- a/Eigen/src/Core/DiagonalMatrix.h
+++ b/Eigen/src/Core/DiagonalMatrix.h
@@ -11,7 +11,7 @@
 #ifndef EIGEN_DIAGONALMATRIX_H
 #define EIGEN_DIAGONALMATRIX_H

-namespace Eigen { 
+namespace Eigen {

 #ifndef EIGEN_PARSED_BY_DOXYGEN
 template<typename Derived>
@@ -44,7 +44,7 @@ class DiagonalBase : public EigenBase<Derived>

    EIGEN_DEVICE_FUNC
    DenseMatrixType toDenseMatrix() const { return derived(); }
-    
+
    EIGEN_DEVICE_FUNC
    inline const DiagonalVectorType& diagonal() const { return derived().diagonal(); }
    EIGEN_DEVICE_FUNC
@@ -70,7 +70,7 @@ class DiagonalBase : public EigenBase<Derived>
    {
      return InverseReturnType(diagonal().cwiseInverse());
    }
-    
+
    EIGEN_DEVICE_FUNC
    inline const DiagonalWrapper<const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(DiagonalVectorType,Scalar,product) >
    operator*(const Scalar& scalar) const
@@ -273,7 +273,7 @@ class DiagonalWrapper
  * \sa class DiagonalWrapper, class DiagonalMatrix, diagonal(), isDiagonal()
  **/
 template<typename Derived>
-inline const DiagonalWrapper<const Derived>
+EIGEN_DEVICE_FUNC inline const DiagonalWrapper<const Derived>
 MatrixBase<Derived>::asDiagonal() const
 {
  return DiagonalWrapper<const Derived>(derived());
@@ -318,20 +318,20 @@ template<> struct AssignmentKind<DenseShape,DiagonalShape> { typedef Diagonal2De
 template< typename DstXprType, typename SrcXprType, typename Functor>
 struct Assignment<DstXprType, SrcXprType, Functor, Diagonal2Dense>
 {
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
+  static EIGEN_DEVICE_FUNC void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
  {
    Index dstRows = src.rows();
    Index dstCols = src.cols();
    if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
      dst.resize(dstRows, dstCols);
-    
+
    dst.setZero();
    dst.diagonal() = src.diagonal();
  }
-  
+
  static void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
  { dst.diagonal() += src.diagonal(); }
-  
+
  static void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
  { dst.diagonal() -= src.diagonal(); }
 };
--- a/Eigen/src/Core/DiagonalProduct.h
+++ b/Eigen/src/Core/DiagonalProduct.h
@@ -17,7 +17,7 @@ namespace Eigen {
  */
 template<typename Derived>
 template<typename DiagonalDerived>
-inline const Product<Derived, DiagonalDerived, LazyProduct>
+EIGEN_DEVICE_FUNC inline const Product<Derived, DiagonalDerived, LazyProduct>
 MatrixBase<Derived>::operator*(const DiagonalBase<DiagonalDerived> &a_diagonal) const
 {
  return Product<Derived, DiagonalDerived, LazyProduct>(derived(),a_diagonal.derived());
--- a/Eigen/src/Core/Dot.h
+++ b/Eigen/src/Core/Dot.h
@@ -10,7 +10,7 @@
 #ifndef EIGEN_DOT_H
 #define EIGEN_DOT_H

-namespace Eigen { 
+namespace Eigen {

 namespace internal {

@@ -31,7 +31,8 @@ struct dot_nocheck
  typedef scalar_conj_product_op<typename traits<T>::Scalar,typename traits<U>::Scalar> conj_prod;
  typedef typename conj_prod::result_type ResScalar;
  EIGEN_DEVICE_FUNC
-  static inline ResScalar run(const MatrixBase<T>& a, const MatrixBase<U>& b)
+  EIGEN_STRONG_INLINE
+  static ResScalar run(const MatrixBase<T>& a, const MatrixBase<U>& b)
  {
    return a.template binaryExpr<conj_prod>(b).sum();
  }
@@ -43,7 +44,8 @@ struct dot_nocheck<T, U, true>
  typedef scalar_conj_product_op<typename traits<T>::Scalar,typename traits<U>::Scalar> conj_prod;
  typedef typename conj_prod::result_type ResScalar;
  EIGEN_DEVICE_FUNC
-  static inline ResScalar run(const MatrixBase<T>& a, const MatrixBase<U>& b)
+  EIGEN_STRONG_INLINE
+  static ResScalar run(const MatrixBase<T>& a, const MatrixBase<U>& b)
  {
    return a.transpose().template binaryExpr<conj_prod>(b).sum();
  }
@@ -51,7 +53,8 @@ struct dot_nocheck<T, U, true>

 } // end namespace internal

-/** \returns the dot product of *this with other.
+/** \fn MatrixBase::dot
+  * \returns the dot product of *this with other.
  *
  * \only_for_vectors
  *
@@ -64,14 +67,17 @@ struct dot_nocheck<T, U, true>
 template<typename Derived>
 template<typename OtherDerived>
 EIGEN_DEVICE_FUNC
+EIGEN_STRONG_INLINE
 typename ScalarBinaryOpTraits<typename internal::traits<Derived>::Scalar,typename internal::traits<OtherDerived>::Scalar>::ReturnType
 MatrixBase<Derived>::dot(const MatrixBase<OtherDerived>& other) const
 {
  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
  EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
  EIGEN_STATIC_ASSERT_SAME_VECTOR_SIZE(Derived,OtherDerived)
+#if !(defined(EIGEN_NO_STATIC_ASSERT) && defined(EIGEN_NO_DEBUG))
  typedef internal::scalar_conj_product_op<Scalar,typename OtherDerived::Scalar> func;
  EIGEN_CHECK_BINARY_COMPATIBILIY(func,Scalar,typename OtherDerived::Scalar);
+#endif

  eigen_assert(size() == other.size());

@@ -80,14 +86,14 @@ MatrixBase<Derived>::dot(const MatrixBase<OtherDerived>& other) const

 //---------- implementation of L2 norm and related functions ----------

-/** \returns, for vectors, the squared \em l2 norm of \c *this, and for matrices the Frobenius norm.
+/** \returns, for vectors, the squared \em l2 norm of \c *this, and for matrices the squared Frobenius norm.
  * In both cases, it consists in the sum of the square of all the matrix entries.
  * For vectors, this is also equals to the dot product of \c *this with itself.
  *
  * \sa dot(), norm(), lpNorm()
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE typename NumTraits<typename internal::traits<Derived>::Scalar>::Real MatrixBase<Derived>::squaredNorm() const
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NumTraits<typename internal::traits<Derived>::Scalar>::Real MatrixBase<Derived>::squaredNorm() const
 {
  return numext::real((*this).cwiseAbs2().sum());
 }
@@ -99,7 +105,7 @@ EIGEN_STRONG_INLINE typename NumTraits<typename internal::traits<Derived>::Scala
  * \sa lpNorm(), dot(), squaredNorm()
  */
 template<typename Derived>
-inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real MatrixBase<Derived>::norm() const
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NumTraits<typename internal::traits<Derived>::Scalar>::Real MatrixBase<Derived>::norm() const
 {
  return numext::sqrt(squaredNorm());
 }
@@ -114,7 +120,7 @@ inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real Matr
  * \sa norm(), normalize()
  */
 template<typename Derived>
-inline const typename MatrixBase<Derived>::PlainObject
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::PlainObject
 MatrixBase<Derived>::normalized() const
 {
  typedef typename internal::nested_eval<Derived,2>::type _Nested;
@@ -136,7 +142,7 @@ MatrixBase<Derived>::normalized() const
  * \sa norm(), normalized()
  */
 template<typename Derived>
-inline void MatrixBase<Derived>::normalize()
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void MatrixBase<Derived>::normalize()
 {
  RealScalar z = squaredNorm();
  // NOTE: after extensive benchmarking, this conditional does not impact performance, at least on recent x86 CPU
@@ -157,7 +163,7 @@ inline void MatrixBase<Derived>::normalize()
  * \sa stableNorm(), stableNormalize(), normalized()
  */
 template<typename Derived>
-inline const typename MatrixBase<Derived>::PlainObject
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::PlainObject
 MatrixBase<Derived>::stableNormalized() const
 {
  typedef typename internal::nested_eval<Derived,3>::type _Nested;
@@ -182,7 +188,7 @@ MatrixBase<Derived>::stableNormalized() const
  * \sa stableNorm(), stableNormalized(), normalize()
  */
 template<typename Derived>
-inline void MatrixBase<Derived>::stableNormalize()
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void MatrixBase<Derived>::stableNormalize()
 {
  RealScalar w = cwiseAbs().maxCoeff();
  RealScalar z = (derived()/w).squaredNorm();
@@ -254,9 +260,9 @@ struct lpNorm_selector<Derived, Infinity>
 template<typename Derived>
 template<int p>
 #ifndef EIGEN_PARSED_BY_DOXYGEN
-inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real
+EIGEN_DEVICE_FUNC inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real
 #else
-MatrixBase<Derived>::RealScalar
+EIGEN_DEVICE_FUNC MatrixBase<Derived>::RealScalar
 #endif
 MatrixBase<Derived>::lpNorm() const
 {
--- a/Eigen/src/Core/EigenBase.h
+++ b/Eigen/src/Core/EigenBase.h
@@ -14,6 +14,7 @@
 namespace Eigen {

 /** \class EigenBase
+  * \ingroup Core_Module
  * 
  * Common base class for all classes T such that MatrixBase has an operator=(T) and a constructor MatrixBase(T).
  *
@@ -128,6 +129,7 @@ template<typename Derived> struct EigenBase
  */
 template<typename Derived>
 template<typename OtherDerived>
+EIGEN_DEVICE_FUNC
 Derived& DenseBase<Derived>::operator=(const EigenBase<OtherDerived> &other)
 {
  call_assignment(derived(), other.derived());
@@ -136,6 +138,7 @@ Derived& DenseBase<Derived>::operator=(const EigenBase<OtherDerived> &other)

 template<typename Derived>
 template<typename OtherDerived>
+EIGEN_DEVICE_FUNC
 Derived& DenseBase<Derived>::operator+=(const EigenBase<OtherDerived> &other)
 {
  call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar,typename OtherDerived::Scalar>());
@@ -144,6 +147,7 @@ Derived& DenseBase<Derived>::operator+=(const EigenBase<OtherDerived> &other)

 template<typename Derived>
 template<typename OtherDerived>
+EIGEN_DEVICE_FUNC
 Derived& DenseBase<Derived>::operator-=(const EigenBase<OtherDerived> &other)
 {
  call_assignment(derived(), other.derived(), internal::sub_assign_op<Scalar,typename OtherDerived::Scalar>());
--- a/Eigen/src/Core/Fuzzy.h
+++ b/Eigen/src/Core/Fuzzy.h
@@ -11,7 +11,7 @@
 #ifndef EIGEN_FUZZY_H
 #define EIGEN_FUZZY_H

-namespace Eigen { 
+namespace Eigen {

 namespace internal
 {
@@ -100,7 +100,7 @@ struct isMuchSmallerThan_scalar_selector<Derived, true>
  */
 template<typename Derived>
 template<typename OtherDerived>
-bool DenseBase<Derived>::isApprox(
+EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isApprox(
  const DenseBase<OtherDerived>& other,
  const RealScalar& prec
 ) const
@@ -122,7 +122,7 @@ bool DenseBase<Derived>::isApprox(
  * \sa isApprox(), isMuchSmallerThan(const DenseBase<OtherDerived>&, RealScalar) const
  */
 template<typename Derived>
-bool DenseBase<Derived>::isMuchSmallerThan(
+EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isMuchSmallerThan(
  const typename NumTraits<Scalar>::Real& other,
  const RealScalar& prec
 ) const
@@ -142,7 +142,7 @@ bool DenseBase<Derived>::isMuchSmallerThan(
  */
 template<typename Derived>
 template<typename OtherDerived>
-bool DenseBase<Derived>::isMuchSmallerThan(
+EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isMuchSmallerThan(
  const DenseBase<OtherDerived>& other,
  const RealScalar& prec
 ) const
--- a/Eigen/src/Core/GeneralProduct.h
+++ b/Eigen/src/Core/GeneralProduct.h
@@ -24,11 +24,17 @@ template<int Rows, int Cols, int Depth> struct product_type_selector;

 template<int Size, int MaxSize> struct product_size_category
 {
-  enum { is_large = MaxSize == Dynamic ||
-                    Size >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD,
-         value = is_large  ? Large
-               : Size == 1 ? 1
-                           : Small
+  enum {
+    #ifndef EIGEN_CUDA_ARCH
+    is_large = MaxSize == Dynamic ||
+               Size >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD ||
+               (Size==Dynamic && MaxSize>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD),
+    #else
+    is_large = 0,
+    #endif
+    value = is_large  ? Large
+          : Size == 1 ? 1
+                      : Small
  };
 };

@@ -201,12 +207,12 @@ template<> struct gemv_dense_selector<OnTheRight,ColMajor,true>
    typedef typename Rhs::Scalar   RhsScalar;
    typedef typename Dest::Scalar  ResScalar;
    typedef typename Dest::RealScalar  RealScalar;
-    
+
    typedef internal::blas_traits<Lhs> LhsBlasTraits;
    typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
    typedef internal::blas_traits<Rhs> RhsBlasTraits;
    typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
-  
+
    typedef Map<Matrix<ResScalar,Dynamic,1>, EIGEN_PLAIN_ENUM_MIN(AlignedMax,internal::packet_traits<ResScalar>::size)> MappedDest;

    ActualLhsType actualLhs = LhsBlasTraits::extract(lhs);
@@ -223,50 +229,65 @@ template<> struct gemv_dense_selector<OnTheRight,ColMajor,true>
      // on, the other hand it is good for the cache to pack the vector anyways...
      EvalToDestAtCompileTime = (ActualDest::InnerStrideAtCompileTime==1),
      ComplexByReal = (NumTraits<LhsScalar>::IsComplex) && (!NumTraits<RhsScalar>::IsComplex),
-      MightCannotUseDest = (ActualDest::InnerStrideAtCompileTime!=1) || ComplexByReal
+      MightCannotUseDest = (!EvalToDestAtCompileTime) || ComplexByReal
    };

-    gemv_static_vector_if<ResScalar,ActualDest::SizeAtCompileTime,ActualDest::MaxSizeAtCompileTime,MightCannotUseDest> static_dest;
-
-    const bool alphaIsCompatible = (!ComplexByReal) || (numext::imag(actualAlpha)==RealScalar(0));
-    const bool evalToDest = EvalToDestAtCompileTime && alphaIsCompatible;
-
-    RhsScalar compatibleAlpha = get_factor<ResScalar,RhsScalar>::run(actualAlpha);
-
-    ei_declare_aligned_stack_constructed_variable(ResScalar,actualDestPtr,dest.size(),
-                                                  evalToDest ? dest.data() : static_dest.data());
-
-    if(!evalToDest)
-    {
-      #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
-      Index size = dest.size();
-      EIGEN_DENSE_STORAGE_CTOR_PLUGIN
-      #endif
-      if(!alphaIsCompatible)
-      {
-        MappedDest(actualDestPtr, dest.size()).setZero();
-        compatibleAlpha = RhsScalar(1);
-      }
-      else
-        MappedDest(actualDestPtr, dest.size()) = dest;
-    }
-
    typedef const_blas_data_mapper<LhsScalar,Index,ColMajor> LhsMapper;
    typedef const_blas_data_mapper<RhsScalar,Index,RowMajor> RhsMapper;
-    general_matrix_vector_product
-        <Index,LhsScalar,LhsMapper,ColMajor,LhsBlasTraits::NeedToConjugate,RhsScalar,RhsMapper,RhsBlasTraits::NeedToConjugate>::run(
-        actualLhs.rows(), actualLhs.cols(),
-        LhsMapper(actualLhs.data(), actualLhs.outerStride()),
-        RhsMapper(actualRhs.data(), actualRhs.innerStride()),
-        actualDestPtr, 1,
-        compatibleAlpha);
+    RhsScalar compatibleAlpha = get_factor<ResScalar,RhsScalar>::run(actualAlpha);

-    if (!evalToDest)
+    if(!MightCannotUseDest)
    {
-      if(!alphaIsCompatible)
-        dest += actualAlpha * MappedDest(actualDestPtr, dest.size());
-      else
-        dest = MappedDest(actualDestPtr, dest.size());
+      // shortcut if we are sure to be able to use dest directly,
+      // this ease the compiler to generate cleaner and more optimzized code for most common cases
+      general_matrix_vector_product
+          <Index,LhsScalar,LhsMapper,ColMajor,LhsBlasTraits::NeedToConjugate,RhsScalar,RhsMapper,RhsBlasTraits::NeedToConjugate>::run(
+          actualLhs.rows(), actualLhs.cols(),
+          LhsMapper(actualLhs.data(), actualLhs.outerStride()),
+          RhsMapper(actualRhs.data(), actualRhs.innerStride()),
+          dest.data(), 1,
+          compatibleAlpha);
+    }
+    else
+    {
+      gemv_static_vector_if<ResScalar,ActualDest::SizeAtCompileTime,ActualDest::MaxSizeAtCompileTime,MightCannotUseDest> static_dest;
+
+      const bool alphaIsCompatible = (!ComplexByReal) || (numext::imag(actualAlpha)==RealScalar(0));
+      const bool evalToDest = EvalToDestAtCompileTime && alphaIsCompatible;
+
+      ei_declare_aligned_stack_constructed_variable(ResScalar,actualDestPtr,dest.size(),
+                                                    evalToDest ? dest.data() : static_dest.data());
+
+      if(!evalToDest)
+      {
+        #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
+        Index size = dest.size();
+        EIGEN_DENSE_STORAGE_CTOR_PLUGIN
+        #endif
+        if(!alphaIsCompatible)
+        {
+          MappedDest(actualDestPtr, dest.size()).setZero();
+          compatibleAlpha = RhsScalar(1);
+        }
+        else
+          MappedDest(actualDestPtr, dest.size()) = dest;
+      }
+
+      general_matrix_vector_product
+          <Index,LhsScalar,LhsMapper,ColMajor,LhsBlasTraits::NeedToConjugate,RhsScalar,RhsMapper,RhsBlasTraits::NeedToConjugate>::run(
+          actualLhs.rows(), actualLhs.cols(),
+          LhsMapper(actualLhs.data(), actualLhs.outerStride()),
+          RhsMapper(actualRhs.data(), actualRhs.innerStride()),
+          actualDestPtr, 1,
+          compatibleAlpha);
+
+      if (!evalToDest)
+      {
+        if(!alphaIsCompatible)
+          dest.matrix() += actualAlpha * MappedDest(actualDestPtr, dest.size());
+        else
+          dest = MappedDest(actualDestPtr, dest.size());
+      }
    }
  }
 };
@@ -279,7 +300,7 @@ template<> struct gemv_dense_selector<OnTheRight,RowMajor,true>
    typedef typename Lhs::Scalar   LhsScalar;
    typedef typename Rhs::Scalar   RhsScalar;
    typedef typename Dest::Scalar  ResScalar;
-    
+
    typedef internal::blas_traits<Lhs> LhsBlasTraits;
    typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
    typedef internal::blas_traits<Rhs> RhsBlasTraits;
@@ -329,6 +350,7 @@ template<> struct gemv_dense_selector<OnTheRight,ColMajor,false>
  template<typename Lhs, typename Rhs, typename Dest>
  static void run(const Lhs &lhs, const Rhs &rhs, Dest& dest, const typename Dest::Scalar& alpha)
  {
+    EIGEN_STATIC_ASSERT((!nested_eval<Lhs,1>::Evaluate),EIGEN_INTERNAL_COMPILATION_ERROR_OR_YOU_MADE_A_PROGRAMMING_MISTAKE);
    // TODO if rhs is large enough it might be beneficial to make sure that dest is sequentially stored in memory, otherwise use a temp
    typename nested_eval<Rhs,1>::type actual_rhs(rhs);
    const Index size = rhs.rows();
@@ -342,6 +364,7 @@ template<> struct gemv_dense_selector<OnTheRight,RowMajor,false>
  template<typename Lhs, typename Rhs, typename Dest>
  static void run(const Lhs &lhs, const Rhs &rhs, Dest& dest, const typename Dest::Scalar& alpha)
  {
+    EIGEN_STATIC_ASSERT((!nested_eval<Lhs,1>::Evaluate),EIGEN_INTERNAL_COMPILATION_ERROR_OR_YOU_MADE_A_PROGRAMMING_MISTAKE);
    typename nested_eval<Rhs,Lhs::RowsAtCompileTime>::type actual_rhs(rhs);
    const Index rows = dest.rows();
    for(Index i=0; i<rows; ++i)
@@ -361,11 +384,9 @@ template<> struct gemv_dense_selector<OnTheRight,RowMajor,false>
  *
  * \sa lazyProduct(), operator*=(const MatrixBase&), Cwise::operator*()
  */
-#ifndef __CUDACC__
-
 template<typename Derived>
 template<typename OtherDerived>
-inline const Product<Derived, OtherDerived>
+EIGEN_DEVICE_FUNC inline const Product<Derived, OtherDerived>
 MatrixBase<Derived>::operator*(const MatrixBase<OtherDerived> &other) const
 {
  // A note regarding the function declaration: In MSVC, this function will sometimes
@@ -394,8 +415,6 @@ MatrixBase<Derived>::operator*(const MatrixBase<OtherDerived> &other) const
  return Product<Derived, OtherDerived>(derived(), other.derived());
 }

-#endif // __CUDACC__
-
 /** \returns an expression of the matrix product of \c *this and \a other without implicit evaluation.
  *
  * The returned product will behave like any other expressions: the coefficients of the product will be
@@ -409,7 +428,7 @@ MatrixBase<Derived>::operator*(const MatrixBase<OtherDerived> &other) const
  */
 template<typename Derived>
 template<typename OtherDerived>
-const Product<Derived,OtherDerived,LazyProduct>
+EIGEN_DEVICE_FUNC const Product<Derived,OtherDerived,LazyProduct>
 MatrixBase<Derived>::lazyProduct(const MatrixBase<OtherDerived> &other) const
 {
  enum {
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@@ -230,14 +230,14 @@ pload1(const typename unpacket_traits<Packet>::type  *a) { return pset1<Packet>(
  * duplicated to form: {from[0],from[0],from[1],from[1],from[2],from[2],from[3],from[3]}
  * Currently, this function is only used for scalar * complex products.
  */
-template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet
 ploaddup(const typename unpacket_traits<Packet>::type* from) { return *from; }

 /** \internal \returns a packet with elements of \a *from quadrupled.
  * For instance, for a packet of 8 elements, 2 scalars will be read from \a *from and
  * replicated to form: {from[0],from[0],from[0],from[0],from[1],from[1],from[1],from[1]}
  * Currently, this function is only used in matrix products.
-  * For packet-size smaller or equal to 4, this function is equivalent to pload1 
+  * For packet-size smaller or equal to 4, this function is equivalent to pload1
  */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 ploadquad(const typename unpacket_traits<Packet>::type* from)
@@ -278,7 +278,7 @@ inline void pbroadcast2(const typename unpacket_traits<Packet>::type *a,
 }

 /** \internal \brief Returns a packet with coefficients (a,a+1,...,a+packet_size-1). */
-template<typename Packet> inline Packet
+template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet
 plset(const typename unpacket_traits<Packet>::type& a) { return a; }

 /** \internal copy the packet \a from to \a *to, \a to must be 16 bytes aligned */
@@ -299,7 +299,7 @@ template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstoreu
 template<typename Scalar> EIGEN_DEVICE_FUNC inline void prefetch(const Scalar* addr)
 {
 #ifdef __CUDA_ARCH__
-#if defined(__LP64__)
+#if defined(__LP64__) || EIGEN_OS_WIN64
  // 64-bit pointer operand constraint for inlined asm
  asm(" prefetch.L1 [ %1 ];" : "=l"(addr) : "l"(addr));
 #else
@@ -351,10 +351,7 @@ template<typename Packet> EIGEN_DEVICE_FUNC inline Packet preverse(const Packet&
 /** \internal \returns \a a with real and imaginary part flipped (for complex type only) */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pcplxflip(const Packet& a)
 {
-  // FIXME: uncomment the following in case we drop the internal imag and real functions.
-//   using std::imag;
-//   using std::real;
-  return Packet(imag(a),real(a));
+  return Packet(a.imag(),a.real());
 }

 /**************************
@@ -362,77 +359,77 @@ template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pcplxflip(const Packet
 ***************************/

 /** \internal \returns the sine of \a a (coeff-wise) */
-template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet psin(const Packet& a) { using std::sin; return sin(a); }

 /** \internal \returns the cosine of \a a (coeff-wise) */
-template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet pcos(const Packet& a) { using std::cos; return cos(a); }

 /** \internal \returns the tan of \a a (coeff-wise) */
-template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet ptan(const Packet& a) { using std::tan; return tan(a); }

 /** \internal \returns the arc sine of \a a (coeff-wise) */
-template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet pasin(const Packet& a) { using std::asin; return asin(a); }

 /** \internal \returns the arc cosine of \a a (coeff-wise) */
-template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet pacos(const Packet& a) { using std::acos; return acos(a); }

 /** \internal \returns the arc tangent of \a a (coeff-wise) */
-template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet patan(const Packet& a) { using std::atan; return atan(a); }

 /** \internal \returns the hyperbolic sine of \a a (coeff-wise) */
-template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet psinh(const Packet& a) { using std::sinh; return sinh(a); }

 /** \internal \returns the hyperbolic cosine of \a a (coeff-wise) */
-template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet pcosh(const Packet& a) { using std::cosh; return cosh(a); }

 /** \internal \returns the hyperbolic tan of \a a (coeff-wise) */
-template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet ptanh(const Packet& a) { using std::tanh; return tanh(a); }

 /** \internal \returns the exp of \a a (coeff-wise) */
-template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet pexp(const Packet& a) { using std::exp; return exp(a); }

 /** \internal \returns the log of \a a (coeff-wise) */
-template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet plog(const Packet& a) { using std::log; return log(a); }

 /** \internal \returns the log1p of \a a (coeff-wise) */
-template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet plog1p(const Packet& a) { return numext::log1p(a); }

 /** \internal \returns the log10 of \a a (coeff-wise) */
-template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet plog10(const Packet& a) { using std::log10; return log10(a); }

 /** \internal \returns the square-root of \a a (coeff-wise) */
-template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet psqrt(const Packet& a) { using std::sqrt; return sqrt(a); }

 /** \internal \returns the reciprocal square-root of \a a (coeff-wise) */
-template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet prsqrt(const Packet& a) {
  return pdiv(pset1<Packet>(1), psqrt(a));
 }

 /** \internal \returns the rounded value of \a a (coeff-wise) */
-template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet pround(const Packet& a) { using numext::round; return round(a); }

 /** \internal \returns the floor of \a a (coeff-wise) */
-template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet pfloor(const Packet& a) { using numext::floor; return floor(a); }

 /** \internal \returns the ceil of \a a (coeff-wise) */
-template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet pceil(const Packet& a) { using numext::ceil; return ceil(a); }

 /***************************************************************************
@@ -482,7 +479,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret(Scalar* to, const Packet& fro
  * by the current computation.
  */
 template<typename Packet, int LoadMode>
-inline Packet ploadt_ro(const typename unpacket_traits<Packet>::type* from)
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet ploadt_ro(const typename unpacket_traits<Packet>::type* from)
 {
  return ploadt<Packet, LoadMode>(from);
 }
@@ -497,14 +494,14 @@ struct palign_impl

 /** \internal update \a first using the concatenation of the packet_size minus \a Offset last elements
  * of \a first and \a Offset first elements of \a second.
-  * 
+  *
  * This function is currently only used to optimize matrix-vector products on unligned matrices.
  * It takes 2 packets that represent a contiguous memory array, and returns a packet starting
  * at the position \a Offset. For instance, for packets of 4 elements, we have:
  *  Input:
  *  - first = {f0,f1,f2,f3}
  *  - second = {s0,s1,s2,s3}
-  * Output: 
+  * Output:
  *   - if Offset==0 then {f0,f1,f2,f3}
  *   - if Offset==1 then {f1,f2,f3,s0}
  *   - if Offset==2 then {f2,f3,s0,s1}
@@ -521,13 +518,13 @@ inline void palign(PacketType& first, const PacketType& second)
 ***************************************************************************/

 // Eigen+CUDA does not support complexes.
-#ifndef __CUDACC__
+#ifndef EIGEN_CUDACC

 template<> inline std::complex<float> pmul(const std::complex<float>& a, const std::complex<float>& b)
-{ return std::complex<float>(real(a)*real(b) - imag(a)*imag(b), imag(a)*real(b) + real(a)*imag(b)); }
+{ return std::complex<float>(a.real()*b.real() - a.imag()*b.imag(), a.imag()*b.real() + a.real()*b.imag()); }

 template<> inline std::complex<double> pmul(const std::complex<double>& a, const std::complex<double>& b)
-{ return std::complex<double>(real(a)*real(b) - imag(a)*imag(b), imag(a)*real(b) + real(a)*imag(b)); }
+{ return std::complex<double>(a.real()*b.real() - a.imag()*b.imag(), a.imag()*b.real() + a.real()*b.imag()); }

 #endif

--- a/Eigen/src/Core/IO.h
+++ b/Eigen/src/Core/IO.h
@@ -105,24 +105,10 @@ class WithFormat
    }

  protected:
-    const typename ExpressionType::Nested m_matrix;
+    typename ExpressionType::Nested m_matrix;
    IOFormat m_format;
 };

-/** \returns a WithFormat proxy object allowing to print a matrix the with given
-  * format \a fmt.
-  *
-  * See class IOFormat for some examples.
-  *
-  * \sa class IOFormat, class WithFormat
-  */
-template<typename Derived>
-inline const WithFormat<Derived>
-DenseBase<Derived>::format(const IOFormat& fmt) const
-{
-  return WithFormat<Derived>(derived(), fmt);
-}
-
 namespace internal {

 // NOTE: This helper is kept for backward compatibility with previous code specializing
--- a/Eigen/src/Core/Inverse.h
+++ b/Eigen/src/Core/Inverse.h
@@ -45,6 +45,7 @@ class Inverse : public InverseImpl<XprType,typename internal::traits<XprType>::S
 public:
  typedef typename XprType::StorageIndex StorageIndex;
  typedef typename XprType::PlainObject                       PlainObject;
+  typedef typename XprType::Scalar                            Scalar;
  typedef typename internal::ref_selector<XprType>::type      XprTypeNested;
  typedef typename internal::remove_all<XprTypeNested>::type  XprTypeNestedCleaned;
  typedef typename internal::ref_selector<Inverse>::type Nested;
--- a/Eigen/src/Core/Map.h
+++ b/Eigen/src/Core/Map.h
@@ -20,11 +20,17 @@ struct traits<Map<PlainObjectType, MapOptions, StrideType> >
 {
  typedef traits<PlainObjectType> TraitsBase;
  enum {
+    PlainObjectTypeInnerSize = ((traits<PlainObjectType>::Flags&RowMajorBit)==RowMajorBit)
+                             ? PlainObjectType::ColsAtCompileTime
+                             : PlainObjectType::RowsAtCompileTime,
+
    InnerStrideAtCompileTime = StrideType::InnerStrideAtCompileTime == 0
                             ? int(PlainObjectType::InnerStrideAtCompileTime)
                             : int(StrideType::InnerStrideAtCompileTime),
    OuterStrideAtCompileTime = StrideType::OuterStrideAtCompileTime == 0
-                             ? int(PlainObjectType::OuterStrideAtCompileTime)
+                             ? (InnerStrideAtCompileTime==Dynamic || PlainObjectTypeInnerSize==Dynamic
+                                ? Dynamic
+                                : int(InnerStrideAtCompileTime) * int(PlainObjectTypeInnerSize))
                             : int(StrideType::OuterStrideAtCompileTime),
    Alignment = int(MapOptions)&int(AlignedMask),
    Flags0 = TraitsBase::Flags & (~NestByRefBit),
@@ -107,10 +113,11 @@ template<typename PlainObjectType, int MapOptions, typename StrideType> class Ma
    EIGEN_DEVICE_FUNC
    inline Index outerStride() const
    {
-      return StrideType::OuterStrideAtCompileTime != 0 ? m_stride.outer()
-           : IsVectorAtCompileTime ? this->size()
-           : int(Flags)&RowMajorBit ? this->cols()
-           : this->rows();
+      return int(StrideType::OuterStrideAtCompileTime) != 0 ? m_stride.outer()
+           : int(internal::traits<Map>::OuterStrideAtCompileTime) != Dynamic ? Index(internal::traits<Map>::OuterStrideAtCompileTime)
+           : IsVectorAtCompileTime ? (this->size() * innerStride())
+           : (int(Flags)&RowMajorBit) ? (this->cols() * innerStride())
+           : (this->rows() * innerStride());
    }

    /** Constructor in the fixed-size case.
--- a/Eigen/src/Core/MapBase.h
+++ b/Eigen/src/Core/MapBase.h
@@ -43,6 +43,7 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
    enum {
      RowsAtCompileTime = internal::traits<Derived>::RowsAtCompileTime,
      ColsAtCompileTime = internal::traits<Derived>::ColsAtCompileTime,
+      InnerStrideAtCompileTime = internal::traits<Derived>::InnerStrideAtCompileTime,
      SizeAtCompileTime = Base::SizeAtCompileTime
    };

@@ -181,14 +182,19 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
    #endif

  protected:
+    EIGEN_DEFAULT_COPY_CONSTRUCTOR(MapBase)
+    EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(MapBase)

    template<typename T>
    EIGEN_DEVICE_FUNC
    void checkSanity(typename internal::enable_if<(internal::traits<T>::Alignment>0),void*>::type = 0) const
    {
 #if EIGEN_MAX_ALIGN_BYTES>0
+      // innerStride() is not set yet when this function is called, so we optimistically assume the lowest plausible value:
+      const Index minInnerStride = InnerStrideAtCompileTime == Dynamic ? 1 : Index(InnerStrideAtCompileTime);
+      EIGEN_ONLY_USED_FOR_DEBUG(minInnerStride);
      eigen_assert((   ((internal::UIntPtr(m_data) % internal::traits<Derived>::Alignment) == 0)
-                    || (cols() * rows() * innerStride() * sizeof(Scalar)) < internal::traits<Derived>::Alignment ) && "data is not aligned");
+                    || (cols() * rows() * minInnerStride * sizeof(Scalar)) < internal::traits<Derived>::Alignment ) && "data is not aligned");
 #endif
    }

@@ -290,6 +296,9 @@ template<typename Derived> class MapBase<Derived, WriteAccessors>
    // In theory we could simply refer to Base:Base::operator=, but MSVC does not like Base::Base,
    // see bugs 821 and 920.
    using ReadOnlyMapBase::Base::operator=;
+  protected:
+    EIGEN_DEFAULT_COPY_CONSTRUCTOR(MapBase)
+    EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(MapBase)
 };

 #undef EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS
--- a/Eigen/src/Core/MathFunctions.h
+++ b/Eigen/src/Core/MathFunctions.h
@@ -287,7 +287,7 @@ struct abs2_impl_default<Scalar, true> // IsComplex
  EIGEN_DEVICE_FUNC
  static inline RealScalar run(const Scalar& x)
  {
-    return real(x)*real(x) + imag(x)*imag(x);
+    return x.real()*x.real() + x.imag()*x.imag();
  }
 };

@@ -313,14 +313,17 @@ struct abs2_retval
 ****************************************************************************/

 template<typename Scalar, bool IsComplex>
-struct norm1_default_impl
+struct norm1_default_impl;
+
+template<typename Scalar>
+struct norm1_default_impl<Scalar,true>
 {
  typedef typename NumTraits<Scalar>::Real RealScalar;
  EIGEN_DEVICE_FUNC
  static inline RealScalar run(const Scalar& x)
  {
    EIGEN_USING_STD_MATH(abs);
-    return abs(real(x)) + abs(imag(x));
+    return abs(x.real()) + abs(x.imag());
  }
 };

@@ -348,31 +351,7 @@ struct norm1_retval
 * Implementation of hypot                                                *
 ****************************************************************************/

-template<typename Scalar>
-struct hypot_impl
-{
-  typedef typename NumTraits<Scalar>::Real RealScalar;
-  static inline RealScalar run(const Scalar& x, const Scalar& y)
-  {
-    EIGEN_USING_STD_MATH(abs);
-    EIGEN_USING_STD_MATH(sqrt);
-    RealScalar _x = abs(x);
-    RealScalar _y = abs(y);
-    Scalar p, qp;
-    if(_x>_y)
-    {
-      p = _x;
-      qp = _y / p;
-    }
-    else
-    {
-      p = _y;
-      qp = _x / p;
-    }
-    if(p==RealScalar(0)) return RealScalar(0);
-    return p * sqrt(RealScalar(1) + qp*qp);
-  }
-};
+template<typename Scalar> struct hypot_impl;

 template<typename Scalar>
 struct hypot_retval
@@ -482,6 +461,65 @@ struct arg_retval
  typedef typename NumTraits<Scalar>::Real type;
 };

+/****************************************************************************
+* Implementation of expm1                                                   *
+****************************************************************************/
+
+// This implementation is based on GSL Math's expm1.
+namespace std_fallback {
+  // fallback expm1 implementation in case there is no expm1(Scalar) function in namespace of Scalar,
+  // or that there is no suitable std::expm1 function available. Implementation
+  // attributed to Kahan. See: http://www.plunk.org/~hatch/rightway.php.
+  template<typename Scalar>
+  EIGEN_DEVICE_FUNC inline Scalar expm1(const Scalar& x) {
+    EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
+    typedef typename NumTraits<Scalar>::Real RealScalar;
+
+    EIGEN_USING_STD_MATH(exp);
+    Scalar u = exp(x);
+    if (numext::equal_strict(u, Scalar(1))) {
+      return x;
+    }
+    Scalar um1 = u - RealScalar(1);
+    if (numext::equal_strict(um1, Scalar(-1))) {
+      return RealScalar(-1);
+    }
+
+    EIGEN_USING_STD_MATH(log);
+    return (u - RealScalar(1)) * x / log(u);
+  }
+}
+
+template<typename Scalar>
+struct expm1_impl {
+  EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x)
+  {
+    EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
+    #if EIGEN_HAS_CXX11_MATH
+    using std::expm1;
+    #else
+    using std_fallback::expm1;
+    #endif
+    return expm1(x);
+  }
+};
+
+// Specialization for complex types that are not supported by std::expm1.
+template <typename RealScalar>
+struct expm1_impl<std::complex<RealScalar> > {
+  EIGEN_DEVICE_FUNC static inline std::complex<RealScalar> run(
+      const std::complex<RealScalar>& x) {
+    EIGEN_STATIC_ASSERT_NON_INTEGER(RealScalar)
+    return std_fallback::expm1(x);
+  }
+};
+
+template<typename Scalar>
+struct expm1_retval
+{
+  typedef Scalar type;
+};
+
 /****************************************************************************
 * Implementation of log1p                                                   *
 ****************************************************************************/
@@ -495,23 +533,33 @@ namespace std_fallback {
    typedef typename NumTraits<Scalar>::Real RealScalar;
    EIGEN_USING_STD_MATH(log);
    Scalar x1p = RealScalar(1) + x;
-    return ( x1p == Scalar(1) ) ? x : x * ( log(x1p) / (x1p - RealScalar(1)) );
+    return numext::equal_strict(x1p, Scalar(1)) ? x : x * ( log(x1p) / (x1p - RealScalar(1)) );
  }
 }

 template<typename Scalar>
 struct log1p_impl {
-  static inline Scalar run(const Scalar& x)
+  static EIGEN_DEVICE_FUNC inline Scalar run(const Scalar& x)
  {
    EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
    #if EIGEN_HAS_CXX11_MATH
    using std::log1p;
-    #endif
+    #else
    using std_fallback::log1p;
+    #endif
    return log1p(x);
  }
 };

+// Specialization for complex types that are not supported by std::log1p.
+template <typename RealScalar>
+struct log1p_impl<std::complex<RealScalar> > {
+  EIGEN_DEVICE_FUNC static inline std::complex<RealScalar> run(
+      const std::complex<RealScalar>& x) {
+    EIGEN_STATIC_ASSERT_NON_INTEGER(RealScalar)
+    return std_fallback::log1p(x);
+  }
+};

 template<typename Scalar>
 struct log1p_retval
@@ -576,19 +624,6 @@ struct random_retval
 template<typename Scalar> inline EIGEN_MATHFUNC_RETVAL(random, Scalar) random(const Scalar& x, const Scalar& y);
 template<typename Scalar> inline EIGEN_MATHFUNC_RETVAL(random, Scalar) random();

-template<typename Scalar>
-struct random_default_impl<Scalar, false, false>
-{
-  static inline Scalar run(const Scalar& x, const Scalar& y)
-  {
-    return x + (y-x) * Scalar(std::rand()) / Scalar(RAND_MAX);
-  }
-  static inline Scalar run()
-  {
-    return run(Scalar(NumTraits<Scalar>::IsSigned ? -1 : 0), Scalar(1));
-  }
-};
-
 enum {
  meta_floor_log2_terminate,
  meta_floor_log2_move_up,
@@ -636,25 +671,64 @@ struct meta_floor_log2<n, lower, upper, meta_floor_log2_bogus>
  // no value, error at compile time
 };

+#define EIGEN_RAND_MAX INT_MAX
+// Fill a signed positive int with random bits.
+// This is to overcome issues in MSVC which limits RAND_MAX to 32767.
+inline int random_int() {
+#if RAND_MAX == INT_MAX
+  return std::rand();
+#else
+  enum {
+    rand_bits = meta_floor_log2<(unsigned int)(RAND_MAX)+1>::value,
+    int_bits = meta_floor_log2<(unsigned int)(INT_MAX)+1>::value,
+  };
+  unsigned int out = std::rand();
+  for (int bit = rand_bits; bit < int(int_bits); bit += rand_bits) {
+    out = (out << rand_bits) ^ std::rand();
+  }
+  return static_cast<int>(out & INT_MAX);
+#endif
+}
+
+template<typename Scalar>
+struct random_default_impl<Scalar, false, false>
+{
+  static inline Scalar run(const Scalar& x, const Scalar& y)
+  {
+    return x + (y-x) * Scalar(random_int()) / Scalar(EIGEN_RAND_MAX);
+  }
+  static inline Scalar run()
+  {
+    return run(Scalar(NumTraits<Scalar>::IsSigned ? -1 : 0), Scalar(1));
+  }
+};
+
 template<typename Scalar>
 struct random_default_impl<Scalar, false, true>
 {
  static inline Scalar run(const Scalar& x, const Scalar& y)
-  { 
-    typedef typename conditional<NumTraits<Scalar>::IsSigned,std::ptrdiff_t,std::size_t>::type ScalarX;
-    if(y<x)
+  {
+    if (y <= x)
      return x;
-    // the following difference might overflow on a 32 bits system,
-    // but since y>=x the result converted to an unsigned long is still correct.
-    std::size_t range = ScalarX(y)-ScalarX(x);
-    std::size_t offset = 0;
-    // rejection sampling
-    std::size_t divisor = 1;
-    std::size_t multiplier = 1;
-    if(range<RAND_MAX) divisor = (std::size_t(RAND_MAX)+1)/(range+1);
-    else               multiplier = 1 + range/(std::size_t(RAND_MAX)+1);
+    // ScalarU is the unsigned counterpart of Scalar, possibly Scalar itself.
+    typedef typename make_unsigned<Scalar>::type ScalarU;
+    // ScalarX is the widest of ScalarU and unsigned int.
+    // We'll deal only with ScalarX and unsigned int below thus avoiding signed
+    // types and arithmetic and signed overflows (which are undefined behavior).
+    typedef typename conditional<(ScalarU(-1) > unsigned(-1)), ScalarU, unsigned>::type ScalarX;
+    // The following difference doesn't overflow, provided our integer types are two's
+    // complement and have the same number of padding bits in signed and unsigned variants.
+    // This is the case in most modern implementations of C++.
+    ScalarX range = ScalarX(y) - ScalarX(x);
+    ScalarX offset = 0;
+    ScalarX divisor = 1;
+    ScalarX multiplier = 1;
+    const unsigned rand_max = EIGEN_RAND_MAX;
+    if (range <= rand_max) divisor = (rand_max + 1) / (range + 1);
+    else                   multiplier = 1 + range / (rand_max + 1);
+    // Rejection sampling.
    do {
-      offset = (std::size_t(std::rand()) * multiplier) / divisor;
+      offset = (unsigned(random_int()) * multiplier) / divisor;
    } while (offset > range);
    return Scalar(ScalarX(x) + offset);
  }
@@ -664,12 +738,12 @@ struct random_default_impl<Scalar, false, true>
 #ifdef EIGEN_MAKING_DOCS
    return run(Scalar(NumTraits<Scalar>::IsSigned ? -10 : 0), Scalar(10));
 #else
-    enum { rand_bits = meta_floor_log2<(unsigned int)(RAND_MAX)+1>::value,
+    enum { rand_bits = meta_floor_log2<(unsigned int)(EIGEN_RAND_MAX)+1>::value,
           scalar_bits = sizeof(Scalar) * CHAR_BIT,
           shift = EIGEN_PLAIN_ENUM_MAX(0, int(rand_bits) - int(scalar_bits)),
           offset = NumTraits<Scalar>::IsSigned ? (1 << (EIGEN_PLAIN_ENUM_MIN(rand_bits,scalar_bits)-1)) : 0
    };
-    return Scalar((std::rand() >> shift) - offset);
+    return Scalar((random_int() >> shift) - offset);
 #endif
  }
 };
@@ -679,8 +753,8 @@ struct random_default_impl<Scalar, true, false>
 {
  static inline Scalar run(const Scalar& x, const Scalar& y)
  {
-    return Scalar(random(real(x), real(y)),
-                  random(imag(x), imag(y)));
+    return Scalar(random(x.real(), y.real()),
+                  random(x.imag(), y.imag()));
  }
  static inline Scalar run()
  {
@@ -933,6 +1007,9 @@ inline EIGEN_MATHFUNC_RETVAL(abs2, Scalar) abs2(const Scalar& x)
  return EIGEN_MATHFUNC_IMPL(abs2, Scalar)::run(x);
 }

+EIGEN_DEVICE_FUNC
+inline bool abs2(bool x) { return x; }
+
 template<typename Scalar>
 EIGEN_DEVICE_FUNC
 inline EIGEN_MATHFUNC_RETVAL(norm1, Scalar) norm1(const Scalar& x)
@@ -954,7 +1031,7 @@ inline EIGEN_MATHFUNC_RETVAL(log1p, Scalar) log1p(const Scalar& x)
  return EIGEN_MATHFUNC_IMPL(log1p, Scalar)::run(x);
 }

-#ifdef __CUDACC__
+#ifdef EIGEN_CUDACC
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float log1p(const float &x) { return ::log1pf(x); }

@@ -988,7 +1065,7 @@ T (floor)(const T& x)
  return floor(x);
 }

-#ifdef __CUDACC__
+#ifdef EIGEN_CUDACC
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float floor(const float &x) { return ::floorf(x); }

@@ -1004,7 +1081,7 @@ T (ceil)(const T& x)
  return ceil(x);
 }

-#ifdef __CUDACC__
+#ifdef EIGEN_CUDACC
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float ceil(const float &x) { return ::ceilf(x); }

@@ -1030,7 +1107,8 @@ inline int log2(int x)

 /** \returns the square root of \a x.
  *
-  * It is essentially equivalent to \code using std::sqrt; return sqrt(x); \endcode,
+  * It is essentially equivalent to
+  * \code using std::sqrt; return sqrt(x); \endcode
  * but slightly faster for float/double and some compilers (e.g., gcc), thanks to
  * specializations when SSE is enabled.
  *
@@ -1051,7 +1129,7 @@ T log(const T &x) {
  return log(x);
 }

-#ifdef __CUDACC__
+#ifdef EIGEN_CUDACC
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float log(const float &x) { return ::logf(x); }

@@ -1061,12 +1139,25 @@ double log(const double &x) { return ::log(x); }

 template<typename T>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-typename NumTraits<T>::Real abs(const T &x) {
+typename internal::enable_if<NumTraits<T>::IsSigned || NumTraits<T>::IsComplex,typename NumTraits<T>::Real>::type
+abs(const T &x) {
  EIGEN_USING_STD_MATH(abs);
  return abs(x);
 }

-#ifdef __CUDACC__
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+typename internal::enable_if<!(NumTraits<T>::IsSigned || NumTraits<T>::IsComplex),typename NumTraits<T>::Real>::type
+abs(const T &x) {
+  return x;
+}
+
+#if defined(__SYCL_DEVICE_ONLY__)
+EIGEN_ALWAYS_INLINE float   abs(float x) { return cl::sycl::fabs(x); }
+EIGEN_ALWAYS_INLINE double  abs(double x) { return cl::sycl::fabs(x); }
+#endif // defined(__SYCL_DEVICE_ONLY__)
+
+#ifdef EIGEN_CUDACC
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float abs(const float &x) { return ::fabsf(x); }

@@ -1091,7 +1182,7 @@ T exp(const T &x) {
  return exp(x);
 }

-#ifdef __CUDACC__
+#ifdef EIGEN_CUDACC
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float exp(const float &x) { return ::expf(x); }

@@ -1106,7 +1197,7 @@ T cos(const T &x) {
  return cos(x);
 }

-#ifdef __CUDACC__
+#ifdef EIGEN_CUDACC
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float cos(const float &x) { return ::cosf(x); }

@@ -1121,7 +1212,7 @@ T sin(const T &x) {
  return sin(x);
 }

-#ifdef __CUDACC__
+#ifdef EIGEN_CUDACC
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float sin(const float &x) { return ::sinf(x); }

@@ -1136,7 +1227,7 @@ T tan(const T &x) {
  return tan(x);
 }

-#ifdef __CUDACC__
+#ifdef EIGEN_CUDACC
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float tan(const float &x) { return ::tanf(x); }

@@ -1151,7 +1242,7 @@ T acos(const T &x) {
  return acos(x);
 }

-#ifdef __CUDACC__
+#ifdef EIGEN_CUDACC
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float acos(const float &x) { return ::acosf(x); }

@@ -1166,7 +1257,7 @@ T asin(const T &x) {
  return asin(x);
 }

-#ifdef __CUDACC__
+#ifdef EIGEN_CUDACC
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float asin(const float &x) { return ::asinf(x); }

@@ -1181,7 +1272,7 @@ T atan(const T &x) {
  return atan(x);
 }

-#ifdef __CUDACC__
+#ifdef EIGEN_CUDACC
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float atan(const float &x) { return ::atanf(x); }

@@ -1197,7 +1288,7 @@ T cosh(const T &x) {
  return cosh(x);
 }

-#ifdef __CUDACC__
+#ifdef EIGEN_CUDACC
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float cosh(const float &x) { return ::coshf(x); }

@@ -1212,7 +1303,7 @@ T sinh(const T &x) {
  return sinh(x);
 }

-#ifdef __CUDACC__
+#ifdef EIGEN_CUDACC
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float sinh(const float &x) { return ::sinhf(x); }

@@ -1227,12 +1318,12 @@ T tanh(const T &x) {
  return tanh(x);
 }

-#if (!defined(__CUDACC__)) && EIGEN_FAST_MATH
+#if (!defined(EIGEN_CUDACC)) && EIGEN_FAST_MATH
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float tanh(float x) { return internal::generic_fast_tanh_float(x); }
 #endif

-#ifdef __CUDACC__
+#ifdef EIGEN_CUDACC
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float tanh(const float &x) { return ::tanhf(x); }

@@ -1247,7 +1338,7 @@ T fmod(const T& a, const T& b) {
  return fmod(a, b);
 }

-#ifdef __CUDACC__
+#ifdef EIGEN_CUDACC
 template <>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float fmod(const float& a, const float& b) {
--- a/Eigen/src/Core/MathFunctionsImpl.h
+++ b/Eigen/src/Core/MathFunctionsImpl.h
@@ -71,6 +71,29 @@ T generic_fast_tanh_float(const T& a_x)
  return pdiv(p, q);
 }

+template<typename RealScalar>
+EIGEN_STRONG_INLINE
+RealScalar positive_real_hypot(const RealScalar& x, const RealScalar& y)
+{
+  EIGEN_USING_STD_MATH(sqrt);
+  RealScalar p, qp;
+  p = numext::maxi(x,y);
+  if(p==RealScalar(0)) return RealScalar(0);
+  qp = numext::mini(y,x) / p;    
+  return p * sqrt(RealScalar(1) + qp*qp);
+}
+
+template<typename Scalar>
+struct hypot_impl
+{
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  static inline RealScalar run(const Scalar& x, const Scalar& y)
+  {
+    EIGEN_USING_STD_MATH(abs);
+    return positive_real_hypot<RealScalar>(abs(x), abs(y));
+  }
+};
+
 } // end namespace internal

 } // end namespace Eigen
--- a/Eigen/src/Core/Matrix.h
+++ b/Eigen/src/Core/Matrix.h
@@ -274,8 +274,6 @@ class Matrix
      : Base(std::move(other))
    {
      Base::_check_template_params();
-      if (RowsAtCompileTime!=Dynamic && ColsAtCompileTime!=Dynamic)
-        Base::_set_noalias(other);
    }
    EIGEN_DEVICE_FUNC
    Matrix& operator=(Matrix&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_assignable<Scalar>::value)
--- a/Eigen/src/Core/MatrixBase.h
+++ b/Eigen/src/Core/MatrixBase.h
@@ -160,20 +160,11 @@ template<typename Derived> class MatrixBase
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    Derived& operator-=(const MatrixBase<OtherDerived>& other);

-#ifdef __CUDACC__
    template<typename OtherDerived>
    EIGEN_DEVICE_FUNC
-    const Product<Derived,OtherDerived,LazyProduct>
-    operator*(const MatrixBase<OtherDerived> &other) const
-    { return this->lazyProduct(other); }
-#else
-
-    template<typename OtherDerived>
    const Product<Derived,OtherDerived>
    operator*(const MatrixBase<OtherDerived> &other) const;

-#endif
-
    template<typename OtherDerived>
    EIGEN_DEVICE_FUNC
    const Product<Derived,OtherDerived,LazyProduct>
@@ -294,7 +285,7 @@ template<typename Derived> class MatrixBase
      *          fuzzy comparison such as isApprox()
      * \sa isApprox(), operator!= */
    template<typename OtherDerived>
-    inline bool operator==(const MatrixBase<OtherDerived>& other) const
+    EIGEN_DEVICE_FUNC inline bool operator==(const MatrixBase<OtherDerived>& other) const
    { return cwiseEqual(other).all(); }

    /** \returns true if at least one pair of coefficients of \c *this and \a other are not exactly equal to each other.
@@ -302,7 +293,7 @@ template<typename Derived> class MatrixBase
      *          fuzzy comparison such as isApprox()
      * \sa isApprox(), operator== */
    template<typename OtherDerived>
-    inline bool operator!=(const MatrixBase<OtherDerived>& other) const
+    EIGEN_DEVICE_FUNC inline bool operator!=(const MatrixBase<OtherDerived>& other) const
    { return cwiseNotEqual(other).any(); }

    NoAlias<Derived,Eigen::MatrixBase > noalias();
@@ -453,19 +444,28 @@ template<typename Derived> class MatrixBase
 ///////// MatrixFunctions module /////////

    typedef typename internal::stem_function<Scalar>::type StemFunction;
-    const MatrixExponentialReturnValue<Derived> exp() const;
+#define EIGEN_MATRIX_FUNCTION(ReturnType, Name, Description) \
+    /** \returns an expression of the matrix Description of \c *this. \brief This function requires the <a href="unsupported/group__MatrixFunctions__Module.html"> unsupported MatrixFunctions module</a>. To compute the coefficient-wise Description use ArrayBase::##Name . */ \
+    const ReturnType<Derived> Name() const;
+#define EIGEN_MATRIX_FUNCTION_1(ReturnType, Name, Description, Argument) \
+    /** \returns an expression of the matrix Description of \c *this. \brief This function requires the <a href="unsupported/group__MatrixFunctions__Module.html"> unsupported MatrixFunctions module</a>. To compute the coefficient-wise Description use ArrayBase::##Name . */ \
+    const ReturnType<Derived> Name(Argument) const;
+
+    EIGEN_MATRIX_FUNCTION(MatrixExponentialReturnValue, exp, exponential)
+    /** \brief Helper function for the <a href="unsupported/group__MatrixFunctions__Module.html"> unsupported MatrixFunctions module</a>.*/
    const MatrixFunctionReturnValue<Derived> matrixFunction(StemFunction f) const;
-    const MatrixFunctionReturnValue<Derived> cosh() const;
-    const MatrixFunctionReturnValue<Derived> sinh() const;
-    const MatrixFunctionReturnValue<Derived> cos() const;
-    const MatrixFunctionReturnValue<Derived> sin() const;
-    const MatrixSquareRootReturnValue<Derived> sqrt() const;
-    const MatrixLogarithmReturnValue<Derived> log() const;
-    const MatrixPowerReturnValue<Derived> pow(const RealScalar& p) const;
-    const MatrixComplexPowerReturnValue<Derived> pow(const std::complex<RealScalar>& p) const;
+    EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, cosh, hyperbolic cosine)
+    EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, sinh, hyperbolic sine)
+    EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, cos, cosine)
+    EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, sin, sine)
+    EIGEN_MATRIX_FUNCTION(MatrixSquareRootReturnValue, sqrt, square root)
+    EIGEN_MATRIX_FUNCTION(MatrixLogarithmReturnValue, log, logarithm)
+    EIGEN_MATRIX_FUNCTION_1(MatrixPowerReturnValue,        pow, power to \c p, const RealScalar& p)
+    EIGEN_MATRIX_FUNCTION_1(MatrixComplexPowerReturnValue, pow, power to \c p, const std::complex<RealScalar>& p)

  protected:
-    EIGEN_DEVICE_FUNC MatrixBase() : Base() {}
+    EIGEN_DEFAULT_COPY_CONSTRUCTOR(MatrixBase)
+    EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(MatrixBase)

  private:
    EIGEN_DEVICE_FUNC explicit MatrixBase(int);
--- a/Eigen/src/Core/NestByValue.h
+++ b/Eigen/src/Core/NestByValue.h
@@ -99,7 +99,7 @@ template<typename ExpressionType> class NestByValue
 /** \returns an expression of the temporary version of *this.
  */
 template<typename Derived>
-inline const NestByValue<Derived>
+EIGEN_DEVICE_FUNC inline const NestByValue<Derived>
 DenseBase<Derived>::nestByValue() const
 {
  return NestByValue<Derived>(derived());
--- a/Eigen/src/Core/NumTraits.h
+++ b/Eigen/src/Core/NumTraits.h
@@ -54,34 +54,34 @@ struct default_digits10_impl<T,false,true> // Integer
  *
  * The provided data consists of:
  * \li A typedef \c Real, giving the "real part" type of \a T. If \a T is already real,
-  *     then \c Real is just a typedef to \a T. If \a T is \c std::complex<U> then \c Real
+  *     then \c Real is just a typedef to \a T. If \a T is `std::complex<U>` then \c Real
  *     is a typedef to \a U.
  * \li A typedef \c NonInteger, giving the type that should be used for operations producing non-integral values,
  *     such as quotients, square roots, etc. If \a T is a floating-point type, then this typedef just gives
  *     \a T again. Note however that many Eigen functions such as internal::sqrt simply refuse to
  *     take integers. Outside of a few cases, Eigen doesn't do automatic type promotion. Thus, this typedef is
  *     only intended as a helper for code that needs to explicitly promote types.
-  * \li A typedef \c Literal giving the type to use for numeric literals such as "2" or "0.5". For instance, for \c std::complex<U>, Literal is defined as \c U.
+  * \li A typedef \c Literal giving the type to use for numeric literals such as "2" or "0.5". For instance, for `std::complex<U>`, Literal is defined as \c U.
  *     Of course, this type must be fully compatible with \a T. In doubt, just use \a T here.
-  * \li A typedef \a Nested giving the type to use to nest a value inside of the expression tree. If you don't know what
+  * \li A typedef \c Nested giving the type to use to nest a value inside of the expression tree. If you don't know what
  *     this means, just use \a T here.
-  * \li An enum value \a IsComplex. It is equal to 1 if \a T is a \c std::complex
+  * \li An enum value \c IsComplex. It is equal to 1 if \a T is a `std::complex`
  *     type, and to 0 otherwise.
-  * \li An enum value \a IsInteger. It is equal to \c 1 if \a T is an integer type such as \c int,
+  * \li An enum value \c IsInteger. It is equal to \c 1 if \a T is an integer type such as \c int,
  *     and to \c 0 otherwise.
-  * \li Enum values ReadCost, AddCost and MulCost representing a rough estimate of the number of CPU cycles needed
+  * \li Enum values \c ReadCost, \c AddCost and \c MulCost representing a rough estimate of the number of CPU cycles needed
  *     to by move / add / mul instructions respectively, assuming the data is already stored in CPU registers.
  *     Stay vague here. No need to do architecture-specific stuff.
-  * \li An enum value \a IsSigned. It is equal to \c 1 if \a T is a signed type and to 0 if \a T is unsigned.
-  * \li An enum value \a RequireInitialization. It is equal to \c 1 if the constructor of the numeric type \a T must
+  * \li An enum value \c IsSigned. It is equal to \c 1 if \a T is a signed type and to 0 if \a T is unsigned.
+  * \li An enum value \c RequireInitialization. It is equal to \c 1 if the constructor of the numeric type \a T must
  *     be called, and to 0 if it is safe not to call it. Default is 0 if \a T is an arithmetic type, and 1 otherwise.
-  * \li An epsilon() function which, unlike <a href="http://en.cppreference.com/w/cpp/types/numeric_limits/epsilon">std::numeric_limits::epsilon()</a>,
+  * \li An `epsilon()` function which, unlike <a href="http://en.cppreference.com/w/cpp/types/numeric_limits/epsilon">`std::numeric_limits::epsilon()`</a>,
  *     it returns a \a Real instead of a \a T.
-  * \li A dummy_precision() function returning a weak epsilon value. It is mainly used as a default
+  * \li A `dummy_precision()` function returning a weak epsilon value. It is mainly used as a default
  *     value by the fuzzy comparison operators.
-  * \li highest() and lowest() functions returning the highest and lowest possible values respectively.
-  * \li digits10() function returning the number of decimal digits that can be represented without change. This is
-  *     the analogue of <a href="http://en.cppreference.com/w/cpp/types/numeric_limits/digits10">std::numeric_limits<T>::digits10</a>
+  * \li `highest()` and `lowest()` functions returning the highest and lowest possible values respectively.
+  * \li `digits10()` function returning the number of decimal digits that can be represented without change. This is
+  *     the analogue of <a href="http://en.cppreference.com/w/cpp/types/numeric_limits/digits10">`std::numeric_limits<T>::digits10`</a>
  *     which is used as the default implementation if specialized.
  */

@@ -166,7 +166,16 @@ template<> struct NumTraits<double> : GenericNumTraits<double>
 template<> struct NumTraits<long double>
  : GenericNumTraits<long double>
 {
-  static inline long double dummy_precision() { return 1e-15l; }
+  static inline long double dummy_precision() { return static_cast<long double>(1e-15l); }
+
+#if defined(EIGEN_ARCH_PPC) && (__LDBL_MANT_DIG__ == 106)
+  // PowerPC double double causes issues with some values
+  static inline long double epsilon()
+  {
+    // 2^(-(__LDBL_MANT_DIG__)+1)
+    return static_cast<long double>(2.4651903288156618919116517665087e-32l);
+  }
+#endif
 };

 template<typename _Real> struct NumTraits<std::complex<_Real> >
@@ -215,6 +224,8 @@ struct NumTraits<Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols> >
  static inline RealScalar epsilon() { return NumTraits<RealScalar>::epsilon(); }
  EIGEN_DEVICE_FUNC
  static inline RealScalar dummy_precision() { return NumTraits<RealScalar>::dummy_precision(); }
+
+  static inline int digits10() { return NumTraits<Scalar>::digits10(); }
 };

 template<> struct NumTraits<std::string>
--- a/Eigen/src/Core/PermutationMatrix.h
+++ b/Eigen/src/Core/PermutationMatrix.h
@@ -87,17 +87,6 @@ class PermutationBase : public EigenBase<Derived>
      return derived();
    }

-    #ifndef EIGEN_PARSED_BY_DOXYGEN
-    /** This is a special case of the templated operator=. Its purpose is to
-      * prevent a default operator= from hiding the templated operator=.
-      */
-    Derived& operator=(const PermutationBase& other)
-    {
-      indices() = other.indices();
-      return derived();
-    }
-    #endif
-
    /** \returns the number of rows */
    inline Index rows() const { return Index(indices().size()); }

@@ -333,12 +322,6 @@ class PermutationMatrix : public PermutationBase<PermutationMatrix<SizeAtCompile
    inline PermutationMatrix(const PermutationBase<OtherDerived>& other)
      : m_indices(other.indices()) {}

-    #ifndef EIGEN_PARSED_BY_DOXYGEN
-    /** Standard copy constructor. Defined only to prevent a default copy constructor
-      * from hiding the other templated constructor */
-    inline PermutationMatrix(const PermutationMatrix& other) : m_indices(other.indices()) {}
-    #endif
-
    /** Generic constructor from expression of the indices. The indices
      * array has the meaning that the permutations sends each integer i to indices[i].
      *
@@ -373,17 +356,6 @@ class PermutationMatrix : public PermutationBase<PermutationMatrix<SizeAtCompile
      return Base::operator=(tr.derived());
    }

-    #ifndef EIGEN_PARSED_BY_DOXYGEN
-    /** This is a special case of the templated operator=. Its purpose is to
-      * prevent a default operator= from hiding the templated operator=.
-      */
-    PermutationMatrix& operator=(const PermutationMatrix& other)
-    {
-      m_indices = other.m_indices;
-      return *this;
-    }
-    #endif
-
    /** const version of indices(). */
    const IndicesType& indices() const { return m_indices; }
    /** \returns a reference to the stored array representing the permutation. */
--- a/Eigen/src/Core/PlainObjectBase.h
+++ b/Eigen/src/Core/PlainObjectBase.h
@@ -41,7 +41,7 @@ template<> struct check_rows_cols_for_overflow<Dynamic> {
  {
    // http://hg.mozilla.org/mozilla-central/file/6c8a909977d3/xpcom/ds/CheckedInt.h#l242
    // we assume Index is signed
-    Index max_index = (size_t(1) << (8 * sizeof(Index) - 1)) - 1; // assume Index is signed
+    Index max_index = (std::size_t(1) << (8 * sizeof(Index) - 1)) - 1; // assume Index is signed
    bool error = (rows == 0 || cols == 0) ? false
               : (rows > max_index / cols);
    if (error)
@@ -58,6 +58,28 @@ template<typename MatrixTypeA, typename MatrixTypeB, bool SwapPointers> struct m

 } // end namespace internal

+#ifdef EIGEN_PARSED_BY_DOXYGEN
+namespace doxygen {
+
+// This is a workaround to doxygen not being able to understand the inheritance logic
+// when it is hidden by the dense_xpr_base helper struct.
+// Moreover, doxygen fails to include members that are not documented in the declaration body of
+// MatrixBase if we inherits MatrixBase<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >,
+// this is why we simply inherits MatrixBase, though this does not make sense.
+
+/** This class is just a workaround for Doxygen and it does not not actually exist. */
+template<typename Derived> struct dense_xpr_base_dispatcher;
+/** This class is just a workaround for Doxygen and it does not not actually exist. */
+template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
+struct dense_xpr_base_dispatcher<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >
+    : public MatrixBase {};
+/** This class is just a workaround for Doxygen and it does not not actually exist. */
+template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
+struct dense_xpr_base_dispatcher<Array<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >
+    : public ArrayBase {};
+
+} // namespace doxygen
+
 /** \class PlainObjectBase
  * \ingroup Core_Module
  * \brief %Dense storage base class for matrices and arrays.
@@ -65,26 +87,10 @@ template<typename MatrixTypeA, typename MatrixTypeB, bool SwapPointers> struct m
  * This class can be extended with the help of the plugin mechanism described on the page
  * \ref TopicCustomizing_Plugins by defining the preprocessor symbol \c EIGEN_PLAINOBJECTBASE_PLUGIN.
  *
+  * \tparam Derived is the derived type, e.g., a Matrix or Array
+  *
  * \sa \ref TopicClassHierarchy
  */
-#ifdef EIGEN_PARSED_BY_DOXYGEN
-namespace doxygen {
-
-// this is a workaround to doxygen not being able to understand the inheritance logic
-// when it is hidden by the dense_xpr_base helper struct.
-/** This class is just a workaround for Doxygen and it does not not actually exist. */
-template<typename Derived> struct dense_xpr_base_dispatcher;
-/** This class is just a workaround for Doxygen and it does not not actually exist. */
-template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
-struct dense_xpr_base_dispatcher<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >
-    : public MatrixBase<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> > {};
-/** This class is just a workaround for Doxygen and it does not not actually exist. */
-template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
-struct dense_xpr_base_dispatcher<Array<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >
-    : public ArrayBase<Array<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> > {};
-
-} // namespace doxygen
-
 template<typename Derived>
 class PlainObjectBase : public doxygen::dense_xpr_base_dispatcher<Derived>
 #else
@@ -554,7 +560,8 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type

  public:

-    /** \copydoc DenseBase::operator=(const EigenBase<OtherDerived>&)
+    /** \brief Copies the generic expression \a other into *this.
+      * \copydetails DenseBase::operator=(const EigenBase<OtherDerived> &other)
      */
    template<typename OtherDerived>
    EIGEN_DEVICE_FUNC 
@@ -570,6 +577,10 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
      * while the AlignedMap() functions return aligned Map objects and thus should be called only with 16-byte-aligned
      * \a data pointers.
      *
+      * Here is an example using strides:
+      * \include Matrix_Map_stride.cpp
+      * Output: \verbinclude Matrix_Map_stride.out
+      *
      * \see class Map
      */
    //@{
@@ -726,8 +737,10 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE void _init2(Index rows, Index cols, typename internal::enable_if<Base::SizeAtCompileTime!=2,T0>::type* = 0)
    {
-      EIGEN_STATIC_ASSERT(bool(NumTraits<T0>::IsInteger) &&
-                          bool(NumTraits<T1>::IsInteger),
+      const bool t0_is_integer_alike = internal::is_valid_index_type<T0>::value;
+      const bool t1_is_integer_alike = internal::is_valid_index_type<T1>::value;
+      EIGEN_STATIC_ASSERT(t0_is_integer_alike &&
+                          t1_is_integer_alike,
                          FLOATING_POINT_ARGUMENT_PASSED__INTEGER_WAS_EXPECTED)
      resize(rows,cols);
    }
@@ -762,8 +775,9 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
                                                                              && ((!internal::is_same<typename internal::traits<Derived>::XprKind,ArrayXpr>::value || Base::SizeAtCompileTime==Dynamic)),T>::type* = 0)
    {
      // NOTE MSVC 2008 complains if we directly put bool(NumTraits<T>::IsInteger) as the EIGEN_STATIC_ASSERT argument.
-      const bool is_integer = NumTraits<T>::IsInteger;
-      EIGEN_STATIC_ASSERT(is_integer,
+      const bool is_integer_alike = internal::is_valid_index_type<T>::value;
+      EIGEN_UNUSED_VARIABLE(is_integer_alike);
+      EIGEN_STATIC_ASSERT(is_integer_alike,
                          FLOATING_POINT_ARGUMENT_PASSED__INTEGER_WAS_EXPECTED)
      resize(size);
    }
@@ -804,6 +818,13 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
      this->_set_noalias(other);
    }

+    // Initialize an arbitrary matrix from an object convertible to the Derived type.
+    template<typename T>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE void _init1(const Derived& other){
+      this->_set_noalias(other);
+    }
+
    // Initialize an arbitrary matrix from a generic Eigen expression
    template<typename T, typename OtherDerived>
    EIGEN_DEVICE_FUNC
@@ -826,7 +847,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
      this->derived() = r;
    }
    
-    // For fixed -size arrays:
+    // For fixed-size Array<Scalar,...>
    template<typename T>
    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE void _init1(const Scalar& val0,
@@ -838,6 +859,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
      Base::setConstant(val0);
    }
    
+    // For fixed-size Array<Index,...>
    template<typename T>
    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE void _init1(const Index& val0,
--- a/Eigen/src/Core/Product.h
+++ b/Eigen/src/Core/Product.h
@@ -97,8 +97,8 @@ class Product : public ProductImpl<_Lhs,_Rhs,Option,
        && "if you wanted a coeff-wise or a dot product use the respective explicit functions");
    }

-    EIGEN_DEVICE_FUNC inline Index rows() const { return m_lhs.rows(); }
-    EIGEN_DEVICE_FUNC inline Index cols() const { return m_rhs.cols(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rows() const { return m_lhs.rows(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index cols() const { return m_rhs.cols(); }

    EIGEN_DEVICE_FUNC const LhsNestedCleaned& lhs() const { return m_lhs; }
    EIGEN_DEVICE_FUNC const RhsNestedCleaned& rhs() const { return m_rhs; }
@@ -127,7 +127,7 @@ public:
  using Base::derived;
  typedef typename Base::Scalar Scalar;
  
-  operator const Scalar() const
+  EIGEN_STRONG_INLINE operator const Scalar() const
  {
    return internal::evaluator<ProductXpr>(derived()).coeff(0,0);
  }
@@ -162,7 +162,7 @@ class ProductImpl<Lhs,Rhs,Option,Dense>
    
  public:
  
-    EIGEN_DEVICE_FUNC Scalar coeff(Index row, Index col) const
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar coeff(Index row, Index col) const
    {
      EIGEN_STATIC_ASSERT(EnableCoeff, THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS);
      eigen_assert( (Option==LazyProduct) || (this->rows() == 1 && this->cols() == 1) );
@@ -170,7 +170,7 @@ class ProductImpl<Lhs,Rhs,Option,Dense>
      return internal::evaluator<Derived>(derived()).coeff(row,col);
    }

-    EIGEN_DEVICE_FUNC Scalar coeff(Index i) const
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar coeff(Index i) const
    {
      EIGEN_STATIC_ASSERT(EnableCoeff, THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS);
      eigen_assert( (Option==LazyProduct) || (this->rows() == 1 && this->cols() == 1) );
--- a/Eigen/src/Core/ProductEvaluators.h
+++ b/Eigen/src/Core/ProductEvaluators.h
@@ -14,7 +14,7 @@
 #define EIGEN_PRODUCTEVALUATORS_H

 namespace Eigen {
-  
+
 namespace internal {

 /** \internal
@@ -22,19 +22,19 @@ namespace internal {
  * Since products require special treatments to handle all possible cases,
  * we simply deffer the evaluation logic to a product_evaluator class
  * which offers more partial specialization possibilities.
-  * 
+  *
  * \sa class product_evaluator
  */
 template<typename Lhs, typename Rhs, int Options>
-struct evaluator<Product<Lhs, Rhs, Options> > 
+struct evaluator<Product<Lhs, Rhs, Options> >
 : public product_evaluator<Product<Lhs, Rhs, Options> >
 {
  typedef Product<Lhs, Rhs, Options> XprType;
  typedef product_evaluator<XprType> Base;
-  
-  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) : Base(xpr) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& xpr) : Base(xpr) {}
 };
- 
+
 // Catch "scalar * ( A * B )" and transform it to "(A*scalar) * B"
 // TODO we should apply that rule only if that's really helpful
 template<typename Lhs, typename Rhs, typename Scalar1, typename Scalar2, typename Plain1>
@@ -55,20 +55,20 @@ struct evaluator<CwiseBinaryOp<internal::scalar_product_op<Scalar1,Scalar2>,
                               const Product<Lhs, Rhs, DefaultProduct> > XprType;
  typedef evaluator<Product<EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar1,Lhs,product), Rhs, DefaultProduct> > Base;

-  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& xpr)
    : Base(xpr.lhs().functor().m_other * xpr.rhs().lhs() * xpr.rhs().rhs())
  {}
 };


 template<typename Lhs, typename Rhs, int DiagIndex>
-struct evaluator<Diagonal<const Product<Lhs, Rhs, DefaultProduct>, DiagIndex> > 
+struct evaluator<Diagonal<const Product<Lhs, Rhs, DefaultProduct>, DiagIndex> >
 : public evaluator<Diagonal<const Product<Lhs, Rhs, LazyProduct>, DiagIndex> >
 {
  typedef Diagonal<const Product<Lhs, Rhs, DefaultProduct>, DiagIndex> XprType;
  typedef evaluator<Diagonal<const Product<Lhs, Rhs, LazyProduct>, DiagIndex> > Base;
-  
-  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr)
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& xpr)
    : Base(Diagonal<const Product<Lhs, Rhs, LazyProduct>, DiagIndex>(
        Product<Lhs, Rhs, LazyProduct>(xpr.nestedExpression().lhs(), xpr.nestedExpression().rhs()),
        xpr.index() ))
@@ -108,23 +108,23 @@ struct product_evaluator<Product<Lhs, Rhs, Options>, ProductTag, LhsShape, RhsSh
    : m_result(xpr.rows(), xpr.cols())
  {
    ::new (static_cast<Base*>(this)) Base(m_result);
-    
+
 // FIXME shall we handle nested_eval here?,
 // if so, then we must take care at removing the call to nested_eval in the specializations (e.g., in permutation_matrix_product, transposition_matrix_product, etc.)
 //     typedef typename internal::nested_eval<Lhs,Rhs::ColsAtCompileTime>::type LhsNested;
 //     typedef typename internal::nested_eval<Rhs,Lhs::RowsAtCompileTime>::type RhsNested;
 //     typedef typename internal::remove_all<LhsNested>::type LhsNestedCleaned;
 //     typedef typename internal::remove_all<RhsNested>::type RhsNestedCleaned;
-//     
+//
 //     const LhsNested lhs(xpr.lhs());
 //     const RhsNested rhs(xpr.rhs());
-//   
+//
 //     generic_product_impl<LhsNestedCleaned, RhsNestedCleaned>::evalTo(m_result, lhs, rhs);

    generic_product_impl<Lhs, Rhs, LhsShape, RhsShape, ProductTag>::evalTo(m_result, xpr.lhs(), xpr.rhs());
  }
-  
-protected:  
+
+protected:
  PlainObject m_result;
 };

@@ -137,7 +137,7 @@ struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::assign_op<Scal
  typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type>
 {
  typedef Product<Lhs,Rhs,Options> SrcXprType;
-  static EIGEN_STRONG_INLINE
+  static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
  void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &)
  {
    Index dstRows = src.rows();
@@ -155,13 +155,10 @@ struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::add_assign_op<
  typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type>
 {
  typedef Product<Lhs,Rhs,Options> SrcXprType;
-  static EIGEN_STRONG_INLINE
+  static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
  void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<Scalar,Scalar> &)
  {
-    Index dstRows = src.rows();
-    Index dstCols = src.cols();
-    if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
-      dst.resize(dstRows, dstCols);
+    eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
    // FIXME shall we handle nested_eval here?
    generic_product_impl<Lhs, Rhs>::addTo(dst, src.lhs(), src.rhs());
  }
@@ -173,13 +170,10 @@ struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::sub_assign_op<
  typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type>
 {
  typedef Product<Lhs,Rhs,Options> SrcXprType;
-  static EIGEN_STRONG_INLINE
+  static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
  void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<Scalar,Scalar> &)
  {
-    Index dstRows = src.rows();
-    Index dstCols = src.cols();
-    if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
-      dst.resize(dstRows, dstCols);
+    eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
    // FIXME shall we handle nested_eval here?
    generic_product_impl<Lhs, Rhs>::subTo(dst, src.lhs(), src.rhs());
  }
@@ -196,7 +190,7 @@ struct Assignment<DstXprType, CwiseBinaryOp<internal::scalar_product_op<ScalarBi
  typedef CwiseBinaryOp<internal::scalar_product_op<ScalarBis,Scalar>,
                        const CwiseNullaryOp<internal::scalar_constant_op<ScalarBis>,Plain>,
                        const Product<Lhs,Rhs,DefaultProduct> > SrcXprType;
-  static EIGEN_STRONG_INLINE
+  static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
  void run(DstXprType &dst, const SrcXprType &src, const AssignFunc& func)
  {
    call_assignment_no_alias(dst, (src.lhs().functor().m_other * src.rhs().lhs())*src.rhs().rhs(), func);
@@ -213,6 +207,12 @@ struct evaluator_assume_aliasing<CwiseBinaryOp<internal::scalar_sum_op<typename
  static const bool value = true;
 };

+template<typename OtherXpr, typename Lhs, typename Rhs>
+struct evaluator_assume_aliasing<CwiseBinaryOp<internal::scalar_difference_op<typename OtherXpr::Scalar,typename Product<Lhs,Rhs,DefaultProduct>::Scalar>, const OtherXpr,
+                                               const Product<Lhs,Rhs,DefaultProduct> >, DenseShape > {
+  static const bool value = true;
+};
+
 template<typename DstXprType, typename OtherXpr, typename ProductType, typename Func1, typename Func2>
 struct assignment_from_xpr_op_product
 {
@@ -246,19 +246,19 @@ template<typename Lhs, typename Rhs>
 struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,InnerProduct>
 {
  template<typename Dst>
-  static inline void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
  {
    dst.coeffRef(0,0) = (lhs.transpose().cwiseProduct(rhs)).sum();
  }
-  
+
  template<typename Dst>
-  static inline void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
  {
    dst.coeffRef(0,0) += (lhs.transpose().cwiseProduct(rhs)).sum();
  }
-  
+
  template<typename Dst>
-  static void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
  { dst.coeffRef(0,0) -= (lhs.transpose().cwiseProduct(rhs)).sum(); }
 };

@@ -298,7 +298,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,OuterProduct>
 {
  template<typename T> struct is_row_major : internal::conditional<(int(T::Flags)&RowMajorBit), internal::true_type, internal::false_type>::type {};
  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
-  
+
  // TODO it would be nice to be able to exploit our *_assign_op functors for that purpose
  struct set  { template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived()  = src; } };
  struct add  { template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() += src; } };
@@ -310,31 +310,31 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,OuterProduct>
      dst.const_cast_derived() += m_scale * src;
    }
  };
-  
+
  template<typename Dst>
-  static inline void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
  {
    internal::outer_product_selector_run(dst, lhs, rhs, set(), is_row_major<Dst>());
  }
-  
+
  template<typename Dst>
-  static inline void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
  {
    internal::outer_product_selector_run(dst, lhs, rhs, add(), is_row_major<Dst>());
  }
-  
+
  template<typename Dst>
-  static inline void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
  {
    internal::outer_product_selector_run(dst, lhs, rhs, sub(), is_row_major<Dst>());
  }
-  
+
  template<typename Dst>
-  static inline void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
+  static EIGEN_STRONG_INLINE void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
  {
    internal::outer_product_selector_run(dst, lhs, rhs, adds(alpha), is_row_major<Dst>());
  }
-  
+
 };


@@ -343,7 +343,7 @@ template<typename Lhs, typename Rhs, typename Derived>
 struct generic_product_impl_base
 {
  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
-  
+
  template<typename Dst>
  static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
  { dst.setZero(); scaleAndAddTo(dst, lhs, rhs, Scalar(1)); }
@@ -355,7 +355,7 @@ struct generic_product_impl_base
  template<typename Dst>
  static EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
  { scaleAndAddTo(dst, lhs, rhs, Scalar(-1)); }
-  
+
  template<typename Dst>
  static EIGEN_STRONG_INLINE void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
  { Derived::scaleAndAddTo(dst,lhs,rhs,alpha); }
@@ -366,47 +366,77 @@ template<typename Lhs, typename Rhs>
 struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemvProduct>
  : generic_product_impl_base<Lhs,Rhs,generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemvProduct> >
 {
+  typedef typename nested_eval<Lhs,1>::type LhsNested;
+  typedef typename nested_eval<Rhs,1>::type RhsNested;
  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
  enum { Side = Lhs::IsVectorAtCompileTime ? OnTheLeft : OnTheRight };
-  typedef typename internal::conditional<int(Side)==OnTheRight,Lhs,Rhs>::type MatrixType;
+  typedef typename internal::remove_all<typename internal::conditional<int(Side)==OnTheRight,LhsNested,RhsNested>::type>::type MatrixType;

  template<typename Dest>
  static EIGEN_STRONG_INLINE void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
  {
+    LhsNested actual_lhs(lhs);
+    RhsNested actual_rhs(rhs);
    internal::gemv_dense_selector<Side,
                            (int(MatrixType::Flags)&RowMajorBit) ? RowMajor : ColMajor,
                            bool(internal::blas_traits<MatrixType>::HasUsableDirectAccess)
-                           >::run(lhs, rhs, dst, alpha);
+                           >::run(actual_lhs, actual_rhs, dst, alpha);
  }
 };

 template<typename Lhs, typename Rhs>
-struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,CoeffBasedProductMode> 
+struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,CoeffBasedProductMode>
 {
  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
-  
+
  template<typename Dst>
-  static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
  {
    // Same as: dst.noalias() = lhs.lazyProduct(rhs);
    // but easier on the compiler side
    call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::assign_op<typename Dst::Scalar,Scalar>());
  }
-  
+
  template<typename Dst>
  static EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
  {
    // dst.noalias() += lhs.lazyProduct(rhs);
    call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::add_assign_op<typename Dst::Scalar,Scalar>());
  }
-  
+
  template<typename Dst>
  static EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
  {
    // dst.noalias() -= lhs.lazyProduct(rhs);
    call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::sub_assign_op<typename Dst::Scalar,Scalar>());
  }
-  
+
+  // Catch "dst {,+,-}= (s*A)*B" and evaluate it lazily by moving out the scalar factor:
+  //    dst {,+,-}= s * (A.lazyProduct(B))
+  // This is a huge benefit for heap-allocated matrix types as it save one costly allocation.
+  // For them, this strategy is also faster than simply by-passing the heap allocation through
+  // stack allocation.
+  // For fixed sizes matrices, this is less obvious, it is sometimes x2 faster, but sometimes x3 slower,
+  // and the behavior depends also a lot on the compiler... so let's be conservative and enable them for dynamic-size only,
+  // that is when coming from generic_product_impl<...,GemmProduct> in file GeneralMatrixMatrix.h
+  template<typename Dst, typename Scalar1, typename Scalar2, typename Plain1, typename Xpr2, typename Func>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  void eval_dynamic(Dst& dst, const CwiseBinaryOp<internal::scalar_product_op<Scalar1,Scalar2>,
+                                           const CwiseNullaryOp<internal::scalar_constant_op<Scalar1>, Plain1>, Xpr2>& lhs, const Rhs& rhs, const Func &func)
+  {
+    call_assignment_no_alias(dst, lhs.lhs().functor().m_other * lhs.rhs().lazyProduct(rhs), func);
+  }
+
+  // Here, we we always have LhsT==Lhs, but we need to make it a template type to make the above
+  // overload more specialized.
+  template<typename Dst, typename LhsT, typename Func>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  void eval_dynamic(Dst& dst, const LhsT& lhs, const Rhs& rhs, const Func &func)
+  {
+    call_assignment_no_alias(dst, lhs.lazyProduct(rhs), func);
+  }
+
+
 //   template<typename Dst>
 //   static inline void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
 //   { dst.noalias() += alpha * lhs.lazyProduct(rhs); }
@@ -467,7 +497,7 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,

  typedef typename internal::nested_eval<Lhs,Rhs::ColsAtCompileTime>::type LhsNested;
  typedef typename internal::nested_eval<Rhs,Lhs::RowsAtCompileTime>::type RhsNested;
-  
+
  typedef typename internal::remove_all<LhsNested>::type LhsNestedCleaned;
  typedef typename internal::remove_all<RhsNested>::type RhsNestedCleaned;

@@ -486,7 +516,7 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
  typedef typename find_best_packet<Scalar,ColsAtCompileTime>::type RhsVecPacketType;

  enum {
-      
+
    LhsCoeffReadCost = LhsEtorType::CoeffReadCost,
    RhsCoeffReadCost = RhsEtorType::CoeffReadCost,
    CoeffReadCost = InnerSize==0 ? NumTraits<Scalar>::ReadCost
@@ -495,10 +525,10 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
                    + (InnerSize - 1) * NumTraits<Scalar>::AddCost,

    Unroll = CoeffReadCost <= EIGEN_UNROLLING_LIMIT,
-    
+
    LhsFlags = LhsEtorType::Flags,
    RhsFlags = RhsEtorType::Flags,
-    
+
    LhsRowMajor = LhsFlags & RowMajorBit,
    RhsRowMajor = RhsFlags & RowMajorBit,

@@ -508,7 +538,7 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
    // Here, we don't care about alignment larger than the usable packet size.
    LhsAlignment = EIGEN_PLAIN_ENUM_MIN(LhsEtorType::Alignment,LhsVecPacketSize*int(sizeof(typename LhsNestedCleaned::Scalar))),
    RhsAlignment = EIGEN_PLAIN_ENUM_MIN(RhsEtorType::Alignment,RhsVecPacketSize*int(sizeof(typename RhsNestedCleaned::Scalar))),
-      
+
    SameType = is_same<typename LhsNestedCleaned::Scalar,typename RhsNestedCleaned::Scalar>::value,

    CanVectorizeRhs = bool(RhsRowMajor) && (RhsFlags & PacketAccessBit) && (ColsAtCompileTime!=1),
@@ -523,7 +553,7 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
          // TODO enable vectorization for mixed types
          | (SameType && (CanVectorizeLhs || CanVectorizeRhs) ? PacketAccessBit : 0)
          | (XprType::IsVectorAtCompileTime ? LinearAccessBit : 0),
-          
+
    LhsOuterStrideBytes = int(LhsNestedCleaned::OuterStrideAtCompileTime) * int(sizeof(typename LhsNestedCleaned::Scalar)),
    RhsOuterStrideBytes = int(RhsNestedCleaned::OuterStrideAtCompileTime) * int(sizeof(typename RhsNestedCleaned::Scalar)),

@@ -542,7 +572,7 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
                        && (LhsFlags & RhsFlags & ActualPacketAccessBit)
                        && (InnerSize % packet_traits<Scalar>::size == 0)
  };
-  
+
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CoeffReturnType coeff(Index row, Index col) const
  {
    return (m_lhs.row(row).transpose().cwiseProduct( m_rhs.col(col) )).sum();
@@ -581,7 +611,7 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
 protected:
  typename internal::add_const_on_value_type<LhsNested>::type m_lhs;
  typename internal::add_const_on_value_type<RhsNested>::type m_rhs;
-  
+
  LhsEtorType m_lhsImpl;
  RhsEtorType m_rhsImpl;

@@ -700,7 +730,7 @@ struct generic_product_impl<Lhs,Rhs,TriangularShape,DenseShape,ProductTag>
  : generic_product_impl_base<Lhs,Rhs,generic_product_impl<Lhs,Rhs,TriangularShape,DenseShape,ProductTag> >
 {
  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
-  
+
  template<typename Dest>
  static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
  {
@@ -714,7 +744,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,TriangularShape,ProductTag>
 : generic_product_impl_base<Lhs,Rhs,generic_product_impl<Lhs,Rhs,DenseShape,TriangularShape,ProductTag> >
 {
  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
-  
+
  template<typename Dest>
  static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
  {
@@ -735,7 +765,7 @@ struct generic_product_impl<Lhs,Rhs,SelfAdjointShape,DenseShape,ProductTag>
  : generic_product_impl_base<Lhs,Rhs,generic_product_impl<Lhs,Rhs,SelfAdjointShape,DenseShape,ProductTag> >
 {
  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
-  
+
  template<typename Dest>
  static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
  {
@@ -748,7 +778,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,SelfAdjointShape,ProductTag>
 : generic_product_impl_base<Lhs,Rhs,generic_product_impl<Lhs,Rhs,DenseShape,SelfAdjointShape,ProductTag> >
 {
  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
-  
+
  template<typename Dest>
  static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
  {
@@ -760,7 +790,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,SelfAdjointShape,ProductTag>
 /***************************************************************************
 * Diagonal products
 ***************************************************************************/
-  
+
 template<typename MatrixType, typename DiagonalType, typename Derived, int ProductOrder>
 struct diagonal_product_evaluator_base
  : evaluator_base<Derived>
@@ -769,7 +799,7 @@ struct diagonal_product_evaluator_base
 public:
  enum {
    CoeffReadCost = NumTraits<Scalar>::MulCost + evaluator<MatrixType>::CoeffReadCost + evaluator<DiagonalType>::CoeffReadCost,
-    
+
    MatrixFlags = evaluator<MatrixType>::Flags,
    DiagFlags = evaluator<DiagonalType>::Flags,
    _StorageOrder = MatrixFlags & RowMajorBit ? RowMajor : ColMajor,
@@ -781,21 +811,28 @@ public:
    _Vectorizable = bool(int(MatrixFlags)&PacketAccessBit) && _SameTypes && (_ScalarAccessOnDiag || (bool(int(DiagFlags)&PacketAccessBit))),
    _LinearAccessMask = (MatrixType::RowsAtCompileTime==1 || MatrixType::ColsAtCompileTime==1) ? LinearAccessBit : 0,
    Flags = ((HereditaryBits|_LinearAccessMask) & (unsigned int)(MatrixFlags)) | (_Vectorizable ? PacketAccessBit : 0),
-    Alignment = evaluator<MatrixType>::Alignment
+    Alignment = evaluator<MatrixType>::Alignment,
+
+    AsScalarProduct =     (DiagonalType::SizeAtCompileTime==1)
+                      ||  (DiagonalType::SizeAtCompileTime==Dynamic && MatrixType::RowsAtCompileTime==1 && ProductOrder==OnTheLeft)
+                      ||  (DiagonalType::SizeAtCompileTime==Dynamic && MatrixType::ColsAtCompileTime==1 && ProductOrder==OnTheRight)
  };
-  
+
  diagonal_product_evaluator_base(const MatrixType &mat, const DiagonalType &diag)
    : m_diagImpl(diag), m_matImpl(mat)
  {
    EIGEN_INTERNAL_CHECK_COST_VALUE(NumTraits<Scalar>::MulCost);
    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
  }
-  
+
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index idx) const
  {
-    return m_diagImpl.coeff(idx) * m_matImpl.coeff(idx);
+    if(AsScalarProduct)
+      return m_diagImpl.coeff(0) * m_matImpl.coeff(idx);
+    else
+      return m_diagImpl.coeff(idx) * m_matImpl.coeff(idx);
  }
-  
+
 protected:
  template<int LoadMode,typename PacketType>
  EIGEN_STRONG_INLINE PacketType packet_impl(Index row, Index col, Index id, internal::true_type) const
@@ -803,7 +840,7 @@ protected:
    return internal::pmul(m_matImpl.template packet<LoadMode,PacketType>(row, col),
                          internal::pset1<PacketType>(m_diagImpl.coeff(id)));
  }
-  
+
  template<int LoadMode,typename PacketType>
  EIGEN_STRONG_INLINE PacketType packet_impl(Index row, Index col, Index id, internal::false_type) const
  {
@@ -814,7 +851,7 @@ protected:
    return internal::pmul(m_matImpl.template packet<LoadMode,PacketType>(row, col),
                          m_diagImpl.template packet<DiagonalPacketLoadMode,PacketType>(id));
  }
-  
+
  evaluator<DiagonalType> m_diagImpl;
  evaluator<MatrixType>   m_matImpl;
 };
@@ -829,10 +866,10 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DiagonalSha
  using Base::m_matImpl;
  using Base::coeff;
  typedef typename Base::Scalar Scalar;
-  
+
  typedef Product<Lhs, Rhs, ProductKind> XprType;
  typedef typename XprType::PlainObject PlainObject;
-  
+
  enum {
    StorageOrder = int(Rhs::Flags) & RowMajorBit ? RowMajor : ColMajor
  };
@@ -841,12 +878,12 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DiagonalSha
    : Base(xpr.rhs(), xpr.lhs().diagonal())
  {
  }
-  
+
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index row, Index col) const
  {
    return m_diagImpl.coeff(row) * m_matImpl.coeff(row, col);
  }
-  
+
 #ifndef __CUDACC__
  template<int LoadMode,typename PacketType>
  EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const
@@ -856,7 +893,7 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DiagonalSha
    return this->template packet_impl<LoadMode,PacketType>(row,col, row,
                                 typename internal::conditional<int(StorageOrder)==RowMajor, internal::true_type, internal::false_type>::type());
  }
-  
+
  template<int LoadMode,typename PacketType>
  EIGEN_STRONG_INLINE PacketType packet(Index idx) const
  {
@@ -875,22 +912,22 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DenseShape,
  using Base::m_matImpl;
  using Base::coeff;
  typedef typename Base::Scalar Scalar;
-  
+
  typedef Product<Lhs, Rhs, ProductKind> XprType;
  typedef typename XprType::PlainObject PlainObject;
-  
+
  enum { StorageOrder = int(Lhs::Flags) & RowMajorBit ? RowMajor : ColMajor };

  EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr)
    : Base(xpr.lhs(), xpr.rhs().diagonal())
  {
  }
-  
+
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index row, Index col) const
  {
    return m_matImpl.coeff(row, col) * m_diagImpl.coeff(col);
  }
-  
+
 #ifndef __CUDACC__
  template<int LoadMode,typename PacketType>
  EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const
@@ -898,7 +935,7 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DenseShape,
    return this->template packet_impl<LoadMode,PacketType>(row,col, col,
                                 typename internal::conditional<int(StorageOrder)==ColMajor, internal::true_type, internal::false_type>::type());
  }
-  
+
  template<int LoadMode,typename PacketType>
  EIGEN_STRONG_INLINE PacketType packet(Index idx) const
  {
@@ -980,7 +1017,7 @@ template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
 struct generic_product_impl<Lhs, Rhs, PermutationShape, MatrixShape, ProductTag>
 {
  template<typename Dest>
-  static void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_DEVICE_FUNC void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs)
  {
    permutation_matrix_product<Rhs, OnTheLeft, false, MatrixShape>::run(dst, lhs, rhs);
  }
@@ -990,7 +1027,7 @@ template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
 struct generic_product_impl<Lhs, Rhs, MatrixShape, PermutationShape, ProductTag>
 {
  template<typename Dest>
-  static void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_DEVICE_FUNC void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs)
  {
    permutation_matrix_product<Lhs, OnTheRight, false, MatrixShape>::run(dst, rhs, lhs);
  }
@@ -1000,7 +1037,7 @@ template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
 struct generic_product_impl<Inverse<Lhs>, Rhs, PermutationShape, MatrixShape, ProductTag>
 {
  template<typename Dest>
-  static void evalTo(Dest& dst, const Inverse<Lhs>& lhs, const Rhs& rhs)
+  static EIGEN_DEVICE_FUNC void evalTo(Dest& dst, const Inverse<Lhs>& lhs, const Rhs& rhs)
  {
    permutation_matrix_product<Rhs, OnTheLeft, true, MatrixShape>::run(dst, lhs.nestedExpression(), rhs);
  }
@@ -1010,7 +1047,7 @@ template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
 struct generic_product_impl<Lhs, Inverse<Rhs>, MatrixShape, PermutationShape, ProductTag>
 {
  template<typename Dest>
-  static void evalTo(Dest& dst, const Lhs& lhs, const Inverse<Rhs>& rhs)
+  static EIGEN_DEVICE_FUNC void evalTo(Dest& dst, const Lhs& lhs, const Inverse<Rhs>& rhs)
  {
    permutation_matrix_product<Lhs, OnTheRight, true, MatrixShape>::run(dst, rhs.nestedExpression(), lhs);
  }
@@ -1032,7 +1069,7 @@ struct transposition_matrix_product
 {
  typedef typename nested_eval<ExpressionType, 1>::type MatrixType;
  typedef typename remove_all<MatrixType>::type MatrixTypeCleaned;
-  
+
  template<typename Dest, typename TranspositionType>
  static inline void run(Dest& dst, const TranspositionType& tr, const ExpressionType& xpr)
  {
@@ -1057,7 +1094,7 @@ template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
 struct generic_product_impl<Lhs, Rhs, TranspositionsShape, MatrixShape, ProductTag>
 {
  template<typename Dest>
-  static void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_DEVICE_FUNC void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs)
  {
    transposition_matrix_product<Rhs, OnTheLeft, false, MatrixShape>::run(dst, lhs, rhs);
  }
@@ -1067,7 +1104,7 @@ template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
 struct generic_product_impl<Lhs, Rhs, MatrixShape, TranspositionsShape, ProductTag>
 {
  template<typename Dest>
-  static void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_DEVICE_FUNC void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs)
  {
    transposition_matrix_product<Lhs, OnTheRight, false, MatrixShape>::run(dst, rhs, lhs);
  }
@@ -1078,7 +1115,7 @@ template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
 struct generic_product_impl<Transpose<Lhs>, Rhs, TranspositionsShape, MatrixShape, ProductTag>
 {
  template<typename Dest>
-  static void evalTo(Dest& dst, const Transpose<Lhs>& lhs, const Rhs& rhs)
+  static EIGEN_DEVICE_FUNC void evalTo(Dest& dst, const Transpose<Lhs>& lhs, const Rhs& rhs)
  {
    transposition_matrix_product<Rhs, OnTheLeft, true, MatrixShape>::run(dst, lhs.nestedExpression(), rhs);
  }
@@ -1088,7 +1125,7 @@ template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
 struct generic_product_impl<Lhs, Transpose<Rhs>, MatrixShape, TranspositionsShape, ProductTag>
 {
  template<typename Dest>
-  static void evalTo(Dest& dst, const Lhs& lhs, const Transpose<Rhs>& rhs)
+  static EIGEN_DEVICE_FUNC void evalTo(Dest& dst, const Lhs& lhs, const Transpose<Rhs>& rhs)
  {
    transposition_matrix_product<Lhs, OnTheRight, true, MatrixShape>::run(dst, rhs.nestedExpression(), lhs);
  }
--- a/Eigen/src/Core/Random.h
+++ b/Eigen/src/Core/Random.h
@@ -10,7 +10,7 @@
 #ifndef EIGEN_RANDOM_H
 #define EIGEN_RANDOM_H

-namespace Eigen { 
+namespace Eigen {

 namespace internal {

@@ -29,16 +29,16 @@ struct functor_traits<scalar_random_op<Scalar> >
  *
  * Numbers are uniformly spread through their whole definition range for integer types,
  * and in the [-1:1] range for floating point scalar types.
-  * 
+  *
  * The parameters \a rows and \a cols are the number of rows and of columns of
  * the returned matrix. Must be compatible with this MatrixBase type.
  *
  * \not_reentrant
-  * 
+  *
  * This variant is meant to be used for dynamic-size matrix types. For fixed-size types,
  * it is redundant to pass \a rows and \a cols as arguments, so Random() should be used
  * instead.
-  * 
+  *
  *
  * Example: \include MatrixBase_random_int_int.cpp
  * Output: \verbinclude MatrixBase_random_int_int.out
@@ -46,7 +46,7 @@ struct functor_traits<scalar_random_op<Scalar> >
  * This expression has the "evaluate before nesting" flag so that it will be evaluated into
  * a temporary matrix whenever it is nested in a larger expression. This prevents unexpected
  * behavior with expressions involving random matrices.
-  * 
+  *
  * See DenseBase::NullaryExpr(Index, const CustomNullaryOp&) for an example using C++11 random generators.
  *
  * \sa DenseBase::setRandom(), DenseBase::Random(Index), DenseBase::Random()
@@ -93,7 +93,7 @@ DenseBase<Derived>::Random(Index size)
  *
  * Numbers are uniformly spread through their whole definition range for integer types,
  * and in the [-1:1] range for floating point scalar types.
-  * 
+  *
  * This variant is only for fixed-size MatrixBase types. For dynamic-size types, you
  * need to use the variants taking size arguments.
  *
@@ -103,7 +103,7 @@ DenseBase<Derived>::Random(Index size)
  * This expression has the "evaluate before nesting" flag so that it will be evaluated into
  * a temporary matrix whenever it is nested in a larger expression. This prevents unexpected
  * behavior with expressions involving random matrices.
-  * 
+  *
  * \not_reentrant
  *
  * \sa DenseBase::setRandom(), DenseBase::Random(Index,Index), DenseBase::Random(Index)
@@ -119,16 +119,16 @@ DenseBase<Derived>::Random()
  *
  * Numbers are uniformly spread through their whole definition range for integer types,
  * and in the [-1:1] range for floating point scalar types.
-  * 
+  *
  * \not_reentrant
-  * 
+  *
  * Example: \include MatrixBase_setRandom.cpp
  * Output: \verbinclude MatrixBase_setRandom.out
  *
  * \sa class CwiseNullaryOp, setRandom(Index), setRandom(Index,Index)
  */
 template<typename Derived>
-inline Derived& DenseBase<Derived>::setRandom()
+EIGEN_DEVICE_FUNC inline Derived& DenseBase<Derived>::setRandom()
 {
  return *this = Random(rows(), cols());
 }
@@ -137,7 +137,7 @@ inline Derived& DenseBase<Derived>::setRandom()
  *
  * Numbers are uniformly spread through their whole definition range for integer types,
  * and in the [-1:1] range for floating point scalar types.
-  * 
+  *
  * \only_for_vectors
  * \not_reentrant
  *
@@ -160,7 +160,7 @@ PlainObjectBase<Derived>::setRandom(Index newSize)
  * and in the [-1:1] range for floating point scalar types.
  *
  * \not_reentrant
-  * 
+  *
  * \param rows the new number of rows
  * \param cols the new number of columns
  *
--- a/Eigen/src/Core/Redux.h
+++ b/Eigen/src/Core/Redux.h
@@ -11,7 +11,7 @@
 #ifndef EIGEN_REDUX_H
 #define EIGEN_REDUX_H

-namespace Eigen { 
+namespace Eigen {

 namespace internal {

@@ -60,7 +60,7 @@ public:
  enum {
    Unrolling = Cost <= UnrollingLimit ? CompleteUnrolling : NoUnrolling
  };
-  
+
 #ifdef EIGEN_DEBUG_ASSIGN
  static void debug()
  {
@@ -128,7 +128,7 @@ template<typename Func, typename Derived, int Start>
 struct redux_novec_unroller<Func, Derived, Start, 0>
 {
  typedef typename Derived::Scalar Scalar;
-  EIGEN_DEVICE_FUNC 
+  EIGEN_DEVICE_FUNC
  static EIGEN_STRONG_INLINE Scalar run(const Derived&, const Func&) { return Scalar(); }
 };

@@ -215,7 +215,7 @@ struct redux_impl<Func, Derived, LinearVectorizedTraversal, NoUnrolling>
  static Scalar run(const Derived &mat, const Func& func)
  {
    const Index size = mat.size();
-    
+
    const Index packetSize = redux_traits<Func, Derived>::PacketSize;
    const int packetAlignment = unpacket_traits<PacketScalar>::alignment;
    enum {
@@ -336,12 +336,12 @@ class redux_evaluator
 public:
  typedef _XprType XprType;
  EIGEN_DEVICE_FUNC explicit redux_evaluator(const XprType &xpr) : m_evaluator(xpr), m_xpr(xpr) {}
-  
+
  typedef typename XprType::Scalar Scalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename XprType::PacketScalar PacketScalar;
  typedef typename XprType::PacketReturnType PacketReturnType;
-  
+
  enum {
    MaxRowsAtCompileTime = XprType::MaxRowsAtCompileTime,
    MaxColsAtCompileTime = XprType::MaxColsAtCompileTime,
@@ -353,7 +353,7 @@ public:
    CoeffReadCost = evaluator<XprType>::CoeffReadCost,
    Alignment = evaluator<XprType>::Alignment
  };
-  
+
  EIGEN_DEVICE_FUNC Index rows() const { return m_xpr.rows(); }
  EIGEN_DEVICE_FUNC Index cols() const { return m_xpr.cols(); }
  EIGEN_DEVICE_FUNC Index size() const { return m_xpr.size(); }
@@ -375,17 +375,17 @@ public:
  template<int LoadMode, typename PacketType>
  PacketType packet(Index index) const
  { return m_evaluator.template packet<LoadMode,PacketType>(index); }
-  
+
  EIGEN_DEVICE_FUNC
  CoeffReturnType coeffByOuterInner(Index outer, Index inner) const
  { return m_evaluator.coeff(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }
-  
+
  template<int LoadMode, typename PacketType>
  PacketType packetByOuterInner(Index outer, Index inner) const
  { return m_evaluator.template packet<LoadMode,PacketType>(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }
-  
+
  const XprType & nestedExpression() const { return m_xpr; }
-  
+
 protected:
  internal::evaluator<XprType> m_evaluator;
  const XprType &m_xpr;
@@ -407,14 +407,14 @@ protected:
  */
 template<typename Derived>
 template<typename Func>
-typename internal::traits<Derived>::Scalar
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::redux(const Func& func) const
 {
  eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix");

  typedef typename internal::redux_evaluator<Derived> ThisEvaluator;
  ThisEvaluator thisEval(derived());
-  
+
  return internal::redux_impl<Func, ThisEvaluator>::run(thisEval, func);
 }

@@ -422,7 +422,7 @@ DenseBase<Derived>::redux(const Func& func) const
  * \warning the result is undefined if \c *this contains NaN.
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::minCoeff() const
 {
  return derived().redux(Eigen::internal::scalar_min_op<Scalar,Scalar>());
@@ -432,7 +432,7 @@ DenseBase<Derived>::minCoeff() const
  * \warning the result is undefined if \c *this contains NaN.
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::maxCoeff() const
 {
  return derived().redux(Eigen::internal::scalar_max_op<Scalar,Scalar>());
@@ -445,7 +445,7 @@ DenseBase<Derived>::maxCoeff() const
  * \sa trace(), prod(), mean()
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::sum() const
 {
  if(SizeAtCompileTime==0 || (SizeAtCompileTime==Dynamic && size()==0))
@@ -458,7 +458,7 @@ DenseBase<Derived>::sum() const
 * \sa trace(), prod(), sum()
 */
 template<typename Derived>
-EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::mean() const
 {
 #ifdef __INTEL_COMPILER
@@ -479,7 +479,7 @@ DenseBase<Derived>::mean() const
  * \sa sum(), mean(), trace()
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
+EIGEN_DEVICE_FUNC  EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::prod() const
 {
  if(SizeAtCompileTime==0 || (SizeAtCompileTime==Dynamic && size()==0))
@@ -494,7 +494,7 @@ DenseBase<Derived>::prod() const
  * \sa diagonal(), sum()
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
+EIGEN_DEVICE_FUNC  EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
 MatrixBase<Derived>::trace() const
 {
  return derived().diagonal().sum();
--- a/Eigen/src/Core/Ref.h
+++ b/Eigen/src/Core/Ref.h
@@ -28,12 +28,13 @@ struct traits<Ref<_PlainObjectType, _Options, _StrideType> >

  template<typename Derived> struct match {
    enum {
+      IsVectorAtCompileTime = PlainObjectType::IsVectorAtCompileTime || Derived::IsVectorAtCompileTime,
      HasDirectAccess = internal::has_direct_access<Derived>::ret,
-      StorageOrderMatch = PlainObjectType::IsVectorAtCompileTime || Derived::IsVectorAtCompileTime || ((PlainObjectType::Flags&RowMajorBit)==(Derived::Flags&RowMajorBit)),
+      StorageOrderMatch = IsVectorAtCompileTime || ((PlainObjectType::Flags&RowMajorBit)==(Derived::Flags&RowMajorBit)),
      InnerStrideMatch = int(StrideType::InnerStrideAtCompileTime)==int(Dynamic)
                      || int(StrideType::InnerStrideAtCompileTime)==int(Derived::InnerStrideAtCompileTime)
                      || (int(StrideType::InnerStrideAtCompileTime)==0 && int(Derived::InnerStrideAtCompileTime)==1),
-      OuterStrideMatch = Derived::IsVectorAtCompileTime
+      OuterStrideMatch = IsVectorAtCompileTime
                      || int(StrideType::OuterStrideAtCompileTime)==int(Dynamic) || int(StrideType::OuterStrideAtCompileTime)==int(Derived::OuterStrideAtCompileTime),
      // NOTE, this indirection of evaluator<Derived>::Alignment is needed
      // to workaround a very strange bug in MSVC related to the instantiation
@@ -95,6 +96,8 @@ protected:
  template<typename Expression>
  EIGEN_DEVICE_FUNC void construct(Expression& expr)
  {
+    EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(PlainObjectType,Expression);
+
    if(PlainObjectType::RowsAtCompileTime==1)
    {
      eigen_assert(expr.rows()==1 || expr.cols()==1);
--- a/Eigen/src/Core/Replicate.h
+++ b/Eigen/src/Core/Replicate.h
@@ -10,7 +10,7 @@
 #ifndef EIGEN_REPLICATE_H
 #define EIGEN_REPLICATE_H

-namespace Eigen { 
+namespace Eigen {

 namespace internal {
 template<typename MatrixType,int RowFactor,int ColFactor>
@@ -35,7 +35,7 @@ struct traits<Replicate<MatrixType,RowFactor,ColFactor> >
    IsRowMajor = MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1 ? 1
               : MaxColsAtCompileTime==1 && MaxRowsAtCompileTime!=1 ? 0
               : (MatrixType::Flags & RowMajorBit) ? 1 : 0,
-    
+
    // FIXME enable DirectAccess with negative strides?
    Flags = IsRowMajor ? RowMajorBit : 0
  };
@@ -95,8 +95,8 @@ template<typename MatrixType,int RowFactor,int ColFactor> class Replicate

    EIGEN_DEVICE_FUNC
    const _MatrixTypeNested& nestedExpression() const
-    { 
-      return m_matrix; 
+    {
+      return m_matrix;
    }

  protected:
@@ -115,7 +115,7 @@ template<typename MatrixType,int RowFactor,int ColFactor> class Replicate
  */
 template<typename Derived>
 template<int RowFactor, int ColFactor>
-const Replicate<Derived,RowFactor,ColFactor>
+EIGEN_DEVICE_FUNC const Replicate<Derived,RowFactor,ColFactor>
 DenseBase<Derived>::replicate() const
 {
  return Replicate<Derived,RowFactor,ColFactor>(derived());
@@ -130,7 +130,7 @@ DenseBase<Derived>::replicate() const
  * \sa VectorwiseOp::replicate(), DenseBase::replicate(), class Replicate
  */
 template<typename ExpressionType, int Direction>
-const typename VectorwiseOp<ExpressionType,Direction>::ReplicateReturnType
+EIGEN_DEVICE_FUNC const typename VectorwiseOp<ExpressionType,Direction>::ReplicateReturnType
 VectorwiseOp<ExpressionType,Direction>::replicate(Index factor) const
 {
  return typename VectorwiseOp<ExpressionType,Direction>::ReplicateReturnType
--- a/Eigen/src/Core/ReturnByValue.h
+++ b/Eigen/src/Core/ReturnByValue.h
@@ -79,7 +79,7 @@ template<typename Derived> class ReturnByValue

 template<typename Derived>
 template<typename OtherDerived>
-Derived& DenseBase<Derived>::operator=(const ReturnByValue<OtherDerived>& other)
+EIGEN_DEVICE_FUNC Derived& DenseBase<Derived>::operator=(const ReturnByValue<OtherDerived>& other)
 {
  other.evalTo(derived());
  return derived();
@@ -90,7 +90,7 @@ namespace internal {
 // Expression is evaluated in a temporary; default implementation of Assignment is bypassed so that
 // when a ReturnByValue expression is assigned, the evaluator is not constructed.
 // TODO: Finalize port to new regime; ReturnByValue should not exist in the expression world
-  
+
 template<typename Derived>
 struct evaluator<ReturnByValue<Derived> >
  : public evaluator<typename internal::traits<Derived>::ReturnType>
@@ -98,7 +98,7 @@ struct evaluator<ReturnByValue<Derived> >
  typedef ReturnByValue<Derived> XprType;
  typedef typename internal::traits<Derived>::ReturnType PlainObject;
  typedef evaluator<PlainObject> Base;
-  
+
  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr)
    : m_result(xpr.rows(), xpr.cols())
  {
--- a/Eigen/src/Core/Reverse.h
+++ b/Eigen/src/Core/Reverse.h
@@ -12,7 +12,7 @@
 #ifndef EIGEN_REVERSE_H
 #define EIGEN_REVERSE_H

-namespace Eigen { 
+namespace Eigen {

 namespace internal {

@@ -44,7 +44,7 @@ template<typename PacketType> struct reverse_packet_cond<PacketType,false>
  static inline PacketType run(const PacketType& x) { return x; }
 };

-} // end namespace internal 
+} // end namespace internal

 /** \class Reverse
  * \ingroup Core_Module
@@ -98,7 +98,7 @@ template<typename MatrixType, int Direction> class Reverse
    }

    EIGEN_DEVICE_FUNC const typename internal::remove_all<typename MatrixType::Nested>::type&
-    nestedExpression() const 
+    nestedExpression() const
    {
      return m_matrix;
    }
@@ -114,7 +114,7 @@ template<typename MatrixType, int Direction> class Reverse
  *
  */
 template<typename Derived>
-inline typename DenseBase<Derived>::ReverseReturnType
+EIGEN_DEVICE_FUNC inline typename DenseBase<Derived>::ReverseReturnType
 DenseBase<Derived>::reverse()
 {
  return ReverseReturnType(derived());
@@ -136,7 +136,7 @@ DenseBase<Derived>::reverse()
  *
  * \sa VectorwiseOp::reverseInPlace(), reverse() */
 template<typename Derived>
-inline void DenseBase<Derived>::reverseInPlace()
+EIGEN_DEVICE_FUNC inline void DenseBase<Derived>::reverseInPlace()
 {
  if(cols()>rows())
  {
@@ -161,7 +161,7 @@ inline void DenseBase<Derived>::reverseInPlace()
 }

 namespace internal {
-  
+
 template<int Direction>
 struct vectorwise_reverse_inplace_impl;

@@ -201,7 +201,7 @@ struct vectorwise_reverse_inplace_impl<Horizontal>
  *
  * \sa DenseBase::reverseInPlace(), reverse() */
 template<typename ExpressionType, int Direction>
-void VectorwiseOp<ExpressionType,Direction>::reverseInPlace()
+EIGEN_DEVICE_FUNC void VectorwiseOp<ExpressionType,Direction>::reverseInPlace()
 {
  internal::vectorwise_reverse_inplace_impl<Direction>::run(_expression().const_cast_derived());
 }
--- a/Eigen/src/Core/SelfAdjointView.h
+++ b/Eigen/src/Core/SelfAdjointView.h
@@ -10,7 +10,7 @@
 #ifndef EIGEN_SELFADJOINTMATRIX_H
 #define EIGEN_SELFADJOINTMATRIX_H

-namespace Eigen { 
+namespace Eigen {

 /** \class SelfAdjointView
  * \ingroup Core_Module
@@ -45,7 +45,7 @@ struct traits<SelfAdjointView<MatrixType, UpLo> > : traits<MatrixType>
 };
 }

-// FIXME could also be called SelfAdjointWrapper to be consistent with DiagonalWrapper ??
+
 template<typename _MatrixType, unsigned int UpLo> class SelfAdjointView
  : public TriangularBase<SelfAdjointView<_MatrixType, UpLo> >
 {
@@ -58,18 +58,22 @@ template<typename _MatrixType, unsigned int UpLo> class SelfAdjointView
    typedef MatrixTypeNestedCleaned NestedExpression;

    /** \brief The type of coefficients in this matrix */
-    typedef typename internal::traits<SelfAdjointView>::Scalar Scalar; 
+    typedef typename internal::traits<SelfAdjointView>::Scalar Scalar;
    typedef typename MatrixType::StorageIndex StorageIndex;
+    typedef typename internal::remove_all<typename MatrixType::ConjugateReturnType>::type MatrixConjugateReturnType;

    enum {
      Mode = internal::traits<SelfAdjointView>::Mode,
-      Flags = internal::traits<SelfAdjointView>::Flags
+      Flags = internal::traits<SelfAdjointView>::Flags,
+      TransposeMode = ((Mode & Upper) ? Lower : 0) | ((Mode & Lower) ? Upper : 0)
    };
    typedef typename MatrixType::PlainObject PlainObject;

    EIGEN_DEVICE_FUNC
    explicit inline SelfAdjointView(MatrixType& matrix) : m_matrix(matrix)
-    {}
+    {
+      EIGEN_STATIC_ASSERT(UpLo==Lower || UpLo==Upper,SELFADJOINTVIEW_ACCEPTS_UPPER_AND_LOWER_MODE_ONLY);
+    }

    EIGEN_DEVICE_FUNC
    inline Index rows() const { return m_matrix.rows(); }
@@ -127,7 +131,7 @@ template<typename _MatrixType, unsigned int UpLo> class SelfAdjointView
    {
      return Product<OtherDerived,SelfAdjointView>(lhs.derived(),rhs);
    }
-    
+
    friend EIGEN_DEVICE_FUNC
    const SelfAdjointView<const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar,MatrixType,product),UpLo>
    operator*(const Scalar& s, const SelfAdjointView& mat)
@@ -187,6 +191,36 @@ template<typename _MatrixType, unsigned int UpLo> class SelfAdjointView
                                   TriangularView<typename MatrixType::AdjointReturnType,TriMode> >::type(tmp2);
    }

+    typedef SelfAdjointView<const MatrixConjugateReturnType,UpLo> ConjugateReturnType;
+    /** \sa MatrixBase::conjugate() const */
+    EIGEN_DEVICE_FUNC
+    inline const ConjugateReturnType conjugate() const
+    { return ConjugateReturnType(m_matrix.conjugate()); }
+
+    typedef SelfAdjointView<const typename MatrixType::AdjointReturnType,TransposeMode> AdjointReturnType;
+    /** \sa MatrixBase::adjoint() const */
+    EIGEN_DEVICE_FUNC
+    inline const AdjointReturnType adjoint() const
+    { return AdjointReturnType(m_matrix.adjoint()); }
+
+    typedef SelfAdjointView<typename MatrixType::TransposeReturnType,TransposeMode> TransposeReturnType;
+     /** \sa MatrixBase::transpose() */
+    EIGEN_DEVICE_FUNC
+    inline TransposeReturnType transpose()
+    {
+      EIGEN_STATIC_ASSERT_LVALUE(MatrixType)
+      typename MatrixType::TransposeReturnType tmp(m_matrix);
+      return TransposeReturnType(tmp);
+    }
+
+    typedef SelfAdjointView<const typename MatrixType::ConstTransposeReturnType,TransposeMode> ConstTransposeReturnType;
+    /** \sa MatrixBase::transpose() const */
+    EIGEN_DEVICE_FUNC
+    inline const ConstTransposeReturnType transpose() const
+    {
+      return ConstTransposeReturnType(m_matrix.transpose());
+    }
+
    /** \returns a const expression of the main diagonal of the matrix \c *this
      *
      * This method simply returns the diagonal of the nested expression, thus by-passing the SelfAdjointView decorator.
@@ -253,17 +287,17 @@ protected:
  using Base::m_src;
  using Base::m_functor;
 public:
-  
+
  typedef typename Base::DstEvaluatorType DstEvaluatorType;
  typedef typename Base::SrcEvaluatorType SrcEvaluatorType;
  typedef typename Base::Scalar Scalar;
  typedef typename Base::AssignmentTraits AssignmentTraits;
-  
-  
+
+
  EIGEN_DEVICE_FUNC triangular_dense_assignment_kernel(DstEvaluatorType &dst, const SrcEvaluatorType &src, const Functor &func, DstXprType& dstExpr)
    : Base(dst, src, func, dstExpr)
  {}
-  
+
  EIGEN_DEVICE_FUNC void assignCoeff(Index row, Index col)
  {
    eigen_internal_assert(row!=col);
@@ -271,12 +305,12 @@ public:
    m_functor.assignCoeff(m_dst.coeffRef(row,col), tmp);
    m_functor.assignCoeff(m_dst.coeffRef(col,row), numext::conj(tmp));
  }
-  
+
  EIGEN_DEVICE_FUNC void assignDiagonalCoeff(Index id)
  {
    Base::assignCoeff(id,id);
  }
-  
+
  EIGEN_DEVICE_FUNC void assignOppositeCoeff(Index, Index)
  { eigen_internal_assert(false && "should never be called"); }
 };
@@ -287,17 +321,27 @@ public:
 * Implementation of MatrixBase methods
 ***************************************************************************/

+/** This is the const version of MatrixBase::selfadjointView() */
 template<typename Derived>
 template<unsigned int UpLo>
-typename MatrixBase<Derived>::template ConstSelfAdjointViewReturnType<UpLo>::Type
+EIGEN_DEVICE_FUNC typename MatrixBase<Derived>::template ConstSelfAdjointViewReturnType<UpLo>::Type
 MatrixBase<Derived>::selfadjointView() const
 {
  return typename ConstSelfAdjointViewReturnType<UpLo>::Type(derived());
 }

+/** \returns an expression of a symmetric/self-adjoint view extracted from the upper or lower triangular part of the current matrix
+  *
+  * The parameter \a UpLo can be either \c #Upper or \c #Lower
+  *
+  * Example: \include MatrixBase_selfadjointView.cpp
+  * Output: \verbinclude MatrixBase_selfadjointView.out
+  *
+  * \sa class SelfAdjointView
+  */
 template<typename Derived>
 template<unsigned int UpLo>
-typename MatrixBase<Derived>::template SelfAdjointViewReturnType<UpLo>::Type
+EIGEN_DEVICE_FUNC typename MatrixBase<Derived>::template SelfAdjointViewReturnType<UpLo>::Type
 MatrixBase<Derived>::selfadjointView()
 {
  return typename SelfAdjointViewReturnType<UpLo>::Type(derived());
--- a/Eigen/src/Core/SelfCwiseBinaryOp.h
+++ b/Eigen/src/Core/SelfCwiseBinaryOp.h
@@ -15,33 +15,29 @@ namespace Eigen {
 // TODO generalize the scalar type of 'other'

 template<typename Derived>
-EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator*=(const Scalar& other)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator*=(const Scalar& other)
 {
-  typedef typename Derived::PlainObject PlainObject;
  internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::mul_assign_op<Scalar,Scalar>());
  return derived();
 }

 template<typename Derived>
-EIGEN_STRONG_INLINE Derived& ArrayBase<Derived>::operator+=(const Scalar& other)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& ArrayBase<Derived>::operator+=(const Scalar& other)
 {
-  typedef typename Derived::PlainObject PlainObject;
  internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::add_assign_op<Scalar,Scalar>());
  return derived();
 }

 template<typename Derived>
-EIGEN_STRONG_INLINE Derived& ArrayBase<Derived>::operator-=(const Scalar& other)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& ArrayBase<Derived>::operator-=(const Scalar& other)
 {
-  typedef typename Derived::PlainObject PlainObject;
  internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::sub_assign_op<Scalar,Scalar>());
  return derived();
 }

 template<typename Derived>
-EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator/=(const Scalar& other)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator/=(const Scalar& other)
 {
-  typedef typename Derived::PlainObject PlainObject;
  internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::div_assign_op<Scalar,Scalar>());
  return derived();
 }
--- a/Eigen/src/Core/Solve.h
+++ b/Eigen/src/Core/Solve.h
@@ -13,7 +13,7 @@
 namespace Eigen {

 template<typename Decomposition, typename RhsType, typename StorageKind> class SolveImpl;
-  
+
 /** \class Solve
  * \ingroup Core_Module
  *
@@ -34,12 +34,12 @@ template<typename Decomposition, typename RhsType,typename StorageKind> struct s
 template<typename Decomposition, typename RhsType>
 struct solve_traits<Decomposition,RhsType,Dense>
 {
-  typedef Matrix<typename RhsType::Scalar,
+  typedef typename make_proper_matrix_type<typename RhsType::Scalar,
                 Decomposition::ColsAtCompileTime,
                 RhsType::ColsAtCompileTime,
                 RhsType::PlainObject::Options,
                 Decomposition::MaxColsAtCompileTime,
-                 RhsType::MaxColsAtCompileTime> PlainObject;  
+                 RhsType::MaxColsAtCompileTime>::type PlainObject;
 };

 template<typename Decomposition, typename RhsType>
@@ -64,11 +64,11 @@ class Solve : public SolveImpl<Decomposition,RhsType,typename internal::traits<R
 public:
  typedef typename internal::traits<Solve>::PlainObject PlainObject;
  typedef typename internal::traits<Solve>::StorageIndex StorageIndex;
-  
+
  Solve(const Decomposition &dec, const RhsType &rhs)
    : m_dec(dec), m_rhs(rhs)
  {}
-  
+
  EIGEN_DEVICE_FUNC Index rows() const { return m_dec.cols(); }
  EIGEN_DEVICE_FUNC Index cols() const { return m_rhs.cols(); }

@@ -87,14 +87,14 @@ class SolveImpl<Decomposition,RhsType,Dense>
  : public MatrixBase<Solve<Decomposition,RhsType> >
 {
  typedef Solve<Decomposition,RhsType> Derived;
-  
+
 public:
-  
+
  typedef MatrixBase<Solve<Decomposition,RhsType> > Base;
  EIGEN_DENSE_PUBLIC_INTERFACE(Derived)

 private:
-  
+
  Scalar coeff(Index row, Index col) const;
  Scalar coeff(Index i) const;
 };
@@ -119,15 +119,15 @@ struct evaluator<Solve<Decomposition,RhsType> >
  typedef evaluator<PlainObject> Base;

  enum { Flags = Base::Flags | EvalBeforeNestingBit };
-  
+
  EIGEN_DEVICE_FUNC explicit evaluator(const SolveType& solve)
    : m_result(solve.rows(), solve.cols())
  {
    ::new (static_cast<Base*>(this)) Base(m_result);
    solve.dec()._solve_impl(solve.rhs(), m_result);
  }
-  
-protected:  
+
+protected:
  PlainObject m_result;
 };

@@ -137,7 +137,7 @@ template<typename DstXprType, typename DecType, typename RhsType, typename Scala
 struct Assignment<DstXprType, Solve<DecType,RhsType>, internal::assign_op<Scalar,Scalar>, Dense2Dense>
 {
  typedef Solve<DecType,RhsType> SrcXprType;
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &)
+  static EIGEN_DEVICE_FUNC void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &)
  {
    Index dstRows = src.rows();
    Index dstCols = src.cols();
@@ -153,7 +153,7 @@ template<typename DstXprType, typename DecType, typename RhsType, typename Scala
 struct Assignment<DstXprType, Solve<Transpose<const DecType>,RhsType>, internal::assign_op<Scalar,Scalar>, Dense2Dense>
 {
  typedef Solve<Transpose<const DecType>,RhsType> SrcXprType;
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &)
+  static EIGEN_DEVICE_FUNC void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &)
  {
    Index dstRows = src.rows();
    Index dstCols = src.cols();
@@ -170,13 +170,13 @@ struct Assignment<DstXprType, Solve<CwiseUnaryOp<internal::scalar_conjugate_op<t
                  internal::assign_op<Scalar,Scalar>, Dense2Dense>
 {
  typedef Solve<CwiseUnaryOp<internal::scalar_conjugate_op<typename DecType::Scalar>, const Transpose<const DecType> >,RhsType> SrcXprType;
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &)
+  static EIGEN_DEVICE_FUNC void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &)
  {
    Index dstRows = src.rows();
    Index dstCols = src.cols();
    if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
      dst.resize(dstRows, dstCols);
-    
+
    src.dec().nestedExpression().nestedExpression().template _solve_impl_transposed<true>(src.rhs(), dst);
  }
 };
--- a/Eigen/src/Core/SolveTriangular.h
+++ b/Eigen/src/Core/SolveTriangular.h
@@ -10,7 +10,7 @@
 #ifndef EIGEN_SOLVETRIANGULAR_H
 #define EIGEN_SOLVETRIANGULAR_H

-namespace Eigen { 
+namespace Eigen {

 namespace internal {

@@ -19,7 +19,7 @@ namespace internal {
 template<typename LhsScalar, typename RhsScalar, typename Index, int Side, int Mode, bool Conjugate, int StorageOrder>
 struct triangular_solve_vector;

-template <typename Scalar, typename Index, int Side, int Mode, bool Conjugate, int TriStorageOrder, int OtherStorageOrder>
+template <typename Scalar, typename Index, int Side, int Mode, bool Conjugate, int TriStorageOrder, int OtherStorageOrder, int OtherInnerStride>
 struct triangular_solve_matrix;

 // small helper struct extracting some traits on the underlying solver operation
@@ -64,7 +64,7 @@ struct triangular_solver_selector<Lhs,Rhs,Side,Mode,NoUnrolling,1>

    ei_declare_aligned_stack_constructed_variable(RhsScalar,actualRhs,rhs.size(),
                                                  (useRhsDirectly ? rhs.data() : 0));
-                                                  
+
    if(!useRhsDirectly)
      MappedRhs(actualRhs,rhs.size()) = rhs;

@@ -98,8 +98,8 @@ struct triangular_solver_selector<Lhs,Rhs,Side,Mode,NoUnrolling,Dynamic>
    BlockingType blocking(rhs.rows(), rhs.cols(), size, 1, false);

    triangular_solve_matrix<Scalar,Index,Side,Mode,LhsProductTraits::NeedToConjugate,(int(Lhs::Flags) & RowMajorBit) ? RowMajor : ColMajor,
-                               (Rhs::Flags&RowMajorBit) ? RowMajor : ColMajor>
-      ::run(size, othersize, &actualLhs.coeffRef(0,0), actualLhs.outerStride(), &rhs.coeffRef(0,0), rhs.outerStride(), blocking);
+                               (Rhs::Flags&RowMajorBit) ? RowMajor : ColMajor, Rhs::InnerStrideAtCompileTime>
+      ::run(size, othersize, &actualLhs.coeffRef(0,0), actualLhs.outerStride(), &rhs.coeffRef(0,0), rhs.innerStride(), rhs.outerStride(), blocking);
  }
 };

@@ -148,7 +148,7 @@ struct triangular_solver_selector<Lhs,Rhs,OnTheRight,Mode,CompleteUnrolling,1> {
  {
    Transpose<const Lhs> trLhs(lhs);
    Transpose<Rhs> trRhs(rhs);
-    
+
    triangular_solver_unroller<Transpose<const Lhs>,Transpose<Rhs>,
                              ((Mode&Upper)==Upper ? Lower : Upper) | (Mode&UnitDiag),
                              0,Rhs::SizeAtCompileTime>::run(trLhs,trRhs);
@@ -161,13 +161,17 @@ struct triangular_solver_selector<Lhs,Rhs,OnTheRight,Mode,CompleteUnrolling,1> {
 * TriangularView methods
 ***************************************************************************/

+#ifndef EIGEN_PARSED_BY_DOXYGEN
 template<typename MatrixType, unsigned int Mode>
 template<int Side, typename OtherDerived>
-void TriangularViewImpl<MatrixType,Mode,Dense>::solveInPlace(const MatrixBase<OtherDerived>& _other) const
+EIGEN_DEVICE_FUNC void TriangularViewImpl<MatrixType,Mode,Dense>::solveInPlace(const MatrixBase<OtherDerived>& _other) const
 {
  OtherDerived& other = _other.const_cast_derived();
  eigen_assert( derived().cols() == derived().rows() && ((Side==OnTheLeft && derived().cols() == other.rows()) || (Side==OnTheRight && derived().cols() == other.cols())) );
  eigen_assert((!(Mode & ZeroDiag)) && bool(Mode & (Upper|Lower)));
+  // If solving for a 0x0 matrix, nothing to do, simply return.
+  if (derived().cols() == 0)
+    return;

  enum { copy = (internal::traits<OtherDerived>::Flags & RowMajorBit)  && OtherDerived::IsVectorAtCompileTime && OtherDerived::SizeAtCompileTime!=1};
  typedef typename internal::conditional<copy,
@@ -183,11 +187,12 @@ void TriangularViewImpl<MatrixType,Mode,Dense>::solveInPlace(const MatrixBase<Ot

 template<typename Derived, unsigned int Mode>
 template<int Side, typename Other>
-const internal::triangular_solve_retval<Side,TriangularView<Derived,Mode>,Other>
+EIGEN_DEVICE_FUNC const internal::triangular_solve_retval<Side,TriangularView<Derived,Mode>,Other>
 TriangularViewImpl<Derived,Mode,Dense>::solve(const MatrixBase<Other>& other) const
 {
  return internal::triangular_solve_retval<Side,TriangularViewType,Other>(derived(), other.derived());
 }
+#endif

 namespace internal {

--- a/Eigen/src/Core/StableNorm.h
+++ b/Eigen/src/Core/StableNorm.h
@@ -165,12 +165,13 @@ MatrixBase<Derived>::stableNorm() const
  
  typedef typename internal::nested_eval<Derived,2>::type DerivedCopy;
  typedef typename internal::remove_all<DerivedCopy>::type DerivedCopyClean;
-  DerivedCopy copy(derived());
+  const DerivedCopy copy(derived());
  
  enum {
    CanAlign = (   (int(DerivedCopyClean::Flags)&DirectAccessBit)
                || (int(internal::evaluator<DerivedCopyClean>::Alignment)>0) // FIXME Alignment)>0 might not be enough
-               ) && (blockSize*sizeof(Scalar)*2<EIGEN_STACK_ALLOCATION_LIMIT) // ifwe cannot allocate on the stack, then let's not bother about this optimization
+               ) && (blockSize*sizeof(Scalar)*2<EIGEN_STACK_ALLOCATION_LIMIT)
+                 && (EIGEN_MAX_STATIC_ALIGN_BYTES>0) // if we cannot allocate on the stack, then let's not bother about this optimization
  };
  typedef typename internal::conditional<CanAlign, Ref<const Matrix<Scalar,Dynamic,1,0,blockSize,1>, internal::evaluator<DerivedCopyClean>::Alignment>,
                                                   typename DerivedCopyClean::ConstSegmentReturnType>::type SegmentWrapper;
--- a/Eigen/src/Core/Transpose.h
+++ b/Eigen/src/Core/Transpose.h
@@ -11,7 +11,7 @@
 #ifndef EIGEN_TRANSPOSE_H
 #define EIGEN_TRANSPOSE_H

-namespace Eigen { 
+namespace Eigen {

 namespace internal {
 template<typename MatrixType>
@@ -146,6 +146,8 @@ template<typename MatrixType> class TransposeImpl<MatrixType,Dense>
    {
      return derived().nestedExpression().coeffRef(index);
    }
+  protected:
+    EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(TransposeImpl)
 };

 /** \returns an expression of the transpose of *this.
@@ -168,7 +170,7 @@ template<typename MatrixType> class TransposeImpl<MatrixType,Dense>
  *
  * \sa transposeInPlace(), adjoint() */
 template<typename Derived>
-inline Transpose<Derived>
+EIGEN_DEVICE_FUNC inline Transpose<Derived>
 DenseBase<Derived>::transpose()
 {
  return TransposeReturnType(derived());
@@ -180,7 +182,7 @@ DenseBase<Derived>::transpose()
  *
  * \sa transposeInPlace(), adjoint() */
 template<typename Derived>
-inline typename DenseBase<Derived>::ConstTransposeReturnType
+EIGEN_DEVICE_FUNC inline typename DenseBase<Derived>::ConstTransposeReturnType
 DenseBase<Derived>::transpose() const
 {
  return ConstTransposeReturnType(derived());
@@ -206,7 +208,7 @@ DenseBase<Derived>::transpose() const
  *
  * \sa adjointInPlace(), transpose(), conjugate(), class Transpose, class internal::scalar_conjugate_op */
 template<typename Derived>
-inline const typename MatrixBase<Derived>::AdjointReturnType
+EIGEN_DEVICE_FUNC inline const typename MatrixBase<Derived>::AdjointReturnType
 MatrixBase<Derived>::adjoint() const
 {
  return AdjointReturnType(this->transpose());
@@ -276,12 +278,12 @@ struct inplace_transpose_selector<MatrixType,false,MatchPacketSize> { // non squ
  * Notice however that this method is only useful if you want to replace a matrix by its own transpose.
  * If you just need the transpose of a matrix, use transpose().
  *
-  * \note if the matrix is not square, then \c *this must be a resizable matrix. 
+  * \note if the matrix is not square, then \c *this must be a resizable matrix.
  * This excludes (non-square) fixed-size matrices, block-expressions and maps.
  *
  * \sa transpose(), adjoint(), adjointInPlace() */
 template<typename Derived>
-inline void DenseBase<Derived>::transposeInPlace()
+EIGEN_DEVICE_FUNC inline void DenseBase<Derived>::transposeInPlace()
 {
  eigen_assert((rows() == cols() || (RowsAtCompileTime == Dynamic && ColsAtCompileTime == Dynamic))
               && "transposeInPlace() called on a non-square non-resizable matrix");
@@ -312,7 +314,7 @@ inline void DenseBase<Derived>::transposeInPlace()
  *
  * \sa transpose(), adjoint(), transposeInPlace() */
 template<typename Derived>
-inline void MatrixBase<Derived>::adjointInPlace()
+EIGEN_DEVICE_FUNC inline void MatrixBase<Derived>::adjointInPlace()
 {
  derived() = adjoint().eval();
 }
--- a/Eigen/src/Core/Transpositions.h
+++ b/Eigen/src/Core/Transpositions.h
@@ -33,17 +33,6 @@ class TranspositionsBase
      indices() = other.indices();
      return derived();
    }
-    
-    #ifndef EIGEN_PARSED_BY_DOXYGEN
-    /** This is a special case of the templated operator=. Its purpose is to
-      * prevent a default operator= from hiding the templated operator=.
-      */
-    Derived& operator=(const TranspositionsBase& other)
-    {
-      indices() = other.indices();
-      return derived();
-    }
-    #endif

    /** \returns the number of transpositions */
    Index size() const { return indices().size(); }
@@ -171,12 +160,6 @@ class Transpositions : public TranspositionsBase<Transpositions<SizeAtCompileTim
    inline Transpositions(const TranspositionsBase<OtherDerived>& other)
      : m_indices(other.indices()) {}

-    #ifndef EIGEN_PARSED_BY_DOXYGEN
-    /** Standard copy constructor. Defined only to prevent a default copy constructor
-      * from hiding the other templated constructor */
-    inline Transpositions(const Transpositions& other) : m_indices(other.indices()) {}
-    #endif
-
    /** Generic constructor from expression of the transposition indices. */
    template<typename Other>
    explicit inline Transpositions(const MatrixBase<Other>& indices) : m_indices(indices)
@@ -189,17 +172,6 @@ class Transpositions : public TranspositionsBase<Transpositions<SizeAtCompileTim
      return Base::operator=(other);
    }

-    #ifndef EIGEN_PARSED_BY_DOXYGEN
-    /** This is a special case of the templated operator=. Its purpose is to
-      * prevent a default operator= from hiding the templated operator=.
-      */
-    Transpositions& operator=(const Transpositions& other)
-    {
-      m_indices = other.m_indices;
-      return *this;
-    }
-    #endif
-
    /** Constructs an uninitialized permutation matrix of given size.
      */
    inline Transpositions(Index size) : m_indices(size)
@@ -306,17 +278,6 @@ class TranspositionsWrapper
      return Base::operator=(other);
    }

-    #ifndef EIGEN_PARSED_BY_DOXYGEN
-    /** This is a special case of the templated operator=. Its purpose is to
-      * prevent a default operator= from hiding the templated operator=.
-      */
-    TranspositionsWrapper& operator=(const TranspositionsWrapper& other)
-    {
-      m_indices = other.m_indices;
-      return *this;
-    }
-    #endif
-
    /** const version of indices(). */
    const IndicesType& indices() const { return m_indices; }

@@ -384,7 +345,7 @@ class Transpose<TranspositionsBase<TranspositionsDerived> >
    const Product<OtherDerived, Transpose, AliasFreeProduct>
    operator*(const MatrixBase<OtherDerived>& matrix, const Transpose& trt)
    {
-      return Product<OtherDerived, Transpose, AliasFreeProduct>(matrix.derived(), trt.derived());
+      return Product<OtherDerived, Transpose, AliasFreeProduct>(matrix.derived(), trt);
    }

    /** \returns the \a matrix with the inverse transpositions applied to the rows.
--- a/Eigen/src/Core/TriangularMatrix.h
+++ b/Eigen/src/Core/TriangularMatrix.h
@@ -11,12 +11,12 @@
 #ifndef EIGEN_TRIANGULARMATRIX_H
 #define EIGEN_TRIANGULARMATRIX_H

-namespace Eigen { 
+namespace Eigen {

 namespace internal {
-  
+
 template<int Side, typename TriangularType, typename Rhs> struct triangular_solve_retval;
-  
+
 }

 /** \class TriangularBase
@@ -34,16 +34,16 @@ template<typename Derived> class TriangularBase : public EigenBase<Derived>
      ColsAtCompileTime = internal::traits<Derived>::ColsAtCompileTime,
      MaxRowsAtCompileTime = internal::traits<Derived>::MaxRowsAtCompileTime,
      MaxColsAtCompileTime = internal::traits<Derived>::MaxColsAtCompileTime,
-      
+
      SizeAtCompileTime = (internal::size_at_compile_time<internal::traits<Derived>::RowsAtCompileTime,
                                                   internal::traits<Derived>::ColsAtCompileTime>::ret),
      /**< This is equal to the number of coefficients, i.e. the number of
          * rows times the number of columns, or to \a Dynamic if this is not
          * known at compile-time. \sa RowsAtCompileTime, ColsAtCompileTime */
-      
+
      MaxSizeAtCompileTime = (internal::size_at_compile_time<internal::traits<Derived>::MaxRowsAtCompileTime,
                                                   internal::traits<Derived>::MaxColsAtCompileTime>::ret)
-        
+
    };
    typedef typename internal::traits<Derived>::Scalar Scalar;
    typedef typename internal::traits<Derived>::StorageKind StorageKind;
@@ -63,7 +63,7 @@ template<typename Derived> class TriangularBase : public EigenBase<Derived>
    inline Index outerStride() const { return derived().outerStride(); }
    EIGEN_DEVICE_FUNC
    inline Index innerStride() const { return derived().innerStride(); }
-    
+
    // dummy resize function
    void resize(Index rows, Index cols)
    {
@@ -155,7 +155,7 @@ template<typename Derived> class TriangularBase : public EigenBase<Derived>
  * \param MatrixType the type of the object in which we are taking the triangular part
  * \param Mode the kind of triangular matrix expression to construct. Can be #Upper,
  *             #Lower, #UnitUpper, #UnitLower, #StrictlyUpper, or #StrictlyLower.
-  *             This is in fact a bit field; it must have either #Upper or #Lower, 
+  *             This is in fact a bit field; it must have either #Upper or #Lower,
  *             and additionally it may have #UnitDiag or #ZeroDiag or neither.
  *
  * This class represents a triangular part of a matrix, not necessarily square. Strictly speaking, for rectangular
@@ -197,7 +197,7 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularView
    typedef typename internal::traits<TriangularView>::MatrixTypeNestedNonRef MatrixTypeNestedNonRef;

    typedef typename internal::remove_all<typename MatrixType::ConjugateReturnType>::type MatrixConjugateReturnType;
-    
+
  public:

    typedef typename internal::traits<TriangularView>::StorageKind StorageKind;
@@ -216,10 +216,8 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularView
    EIGEN_DEVICE_FUNC
    explicit inline TriangularView(MatrixType& matrix) : m_matrix(matrix)
    {}
-    
-    using Base::operator=;
-    TriangularView& operator=(const TriangularView &other)
-    { return Base::operator=(other); }
+
+    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(TriangularView)

    /** \copydoc EigenBase::rows() */
    EIGEN_DEVICE_FUNC
@@ -235,7 +233,7 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularView
    /** \returns a reference to the nested expression */
    EIGEN_DEVICE_FUNC
    NestedExpression& nestedExpression() { return m_matrix; }
-    
+
    typedef TriangularView<const MatrixConjugateReturnType,Mode> ConjugateReturnType;
    /** \sa MatrixBase::conjugate() const */
    EIGEN_DEVICE_FUNC
@@ -257,7 +255,7 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularView
      typename MatrixType::TransposeReturnType tmp(m_matrix);
      return TransposeReturnType(tmp);
    }
-    
+
    typedef TriangularView<const typename MatrixType::ConstTransposeReturnType,TransposeMode> ConstTransposeReturnType;
    /** \sa MatrixBase::transpose() const */
    EIGEN_DEVICE_FUNC
@@ -268,10 +266,10 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularView

    template<typename Other>
    EIGEN_DEVICE_FUNC
-    inline const Solve<TriangularView, Other> 
+    inline const Solve<TriangularView, Other>
    solve(const MatrixBase<Other>& other) const
    { return Solve<TriangularView, Other>(*this, other.derived()); }
-    
+
  // workaround MSVC ICE
  #if EIGEN_COMP_MSVC
    template<int Side, typename Other>
@@ -315,7 +313,7 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularView
      else
        return m_matrix.diagonal().prod();
    }
-      
+
  protected:

    MatrixTypeNested m_matrix;
@@ -377,7 +375,7 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_Mat
      internal::call_assignment_no_alias(derived(), other.derived(), internal::sub_assign_op<Scalar,typename Other::Scalar>());
      return derived();
    }
-    
+
    /** \sa MatrixBase::operator*=() */
    EIGEN_DEVICE_FUNC
    TriangularViewType&  operator*=(const typename internal::traits<MatrixType>::Scalar& other) { return *this = derived().nestedExpression() * other; }
@@ -470,6 +468,8 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_Mat
      * \a Side==OnTheLeft (the default), or the right-inverse-multiply  \a other * inverse(\c *this) if
      * \a Side==OnTheRight.
      *
+      * Note that the template parameter \c Side can be ommitted, in which case \c Side==OnTheLeft
+      *
      * The matrix \c *this must be triangular and invertible (i.e., all the coefficients of the
      * diagonal must be non zero). It works as a forward (resp. backward) substitution if \c *this
      * is an upper (resp. lower) triangular matrix.
@@ -495,6 +495,8 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_Mat
      * \warning The parameter is only marked 'const' to make the C++ compiler accept a temporary expression here.
      * This function will const_cast it, so constness isn't honored here.
      *
+      * Note that the template parameter \c Side can be ommitted, in which case \c Side==OnTheLeft
+      *
      * See TriangularView:solve() for the details.
      */
    template<int Side, typename OtherDerived>
@@ -539,17 +541,22 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_Mat

    template<typename ProductType>
    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TriangularViewType& _assignProduct(const ProductType& prod, const Scalar& alpha);
+    EIGEN_STRONG_INLINE TriangularViewType& _assignProduct(const ProductType& prod, const Scalar& alpha, bool beta);
+  protected:
+    EIGEN_DEFAULT_COPY_CONSTRUCTOR(TriangularViewImpl)
+    EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(TriangularViewImpl)
+
 };

 /***************************************************************************
 * Implementation of triangular evaluation/assignment
 ***************************************************************************/

+#ifndef EIGEN_PARSED_BY_DOXYGEN
 // FIXME should we keep that possibility
 template<typename MatrixType, unsigned int Mode>
 template<typename OtherDerived>
-inline TriangularView<MatrixType, Mode>&
+EIGEN_DEVICE_FUNC inline TriangularView<MatrixType, Mode>&
 TriangularViewImpl<MatrixType, Mode, Dense>::operator=(const MatrixBase<OtherDerived>& other)
 {
  internal::call_assignment_no_alias(derived(), other.derived(), internal::assign_op<Scalar,typename OtherDerived::Scalar>());
@@ -559,7 +566,7 @@ TriangularViewImpl<MatrixType, Mode, Dense>::operator=(const MatrixBase<OtherDer
 // FIXME should we keep that possibility
 template<typename MatrixType, unsigned int Mode>
 template<typename OtherDerived>
-void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(const MatrixBase<OtherDerived>& other)
+EIGEN_DEVICE_FUNC void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(const MatrixBase<OtherDerived>& other)
 {
  internal::call_assignment_no_alias(derived(), other.template triangularView<Mode>());
 }
@@ -568,7 +575,7 @@ void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(const MatrixBase<Ot

 template<typename MatrixType, unsigned int Mode>
 template<typename OtherDerived>
-inline TriangularView<MatrixType, Mode>&
+EIGEN_DEVICE_FUNC inline TriangularView<MatrixType, Mode>&
 TriangularViewImpl<MatrixType, Mode, Dense>::operator=(const TriangularBase<OtherDerived>& other)
 {
  eigen_assert(Mode == int(OtherDerived::Mode));
@@ -578,11 +585,12 @@ TriangularViewImpl<MatrixType, Mode, Dense>::operator=(const TriangularBase<Othe

 template<typename MatrixType, unsigned int Mode>
 template<typename OtherDerived>
-void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(const TriangularBase<OtherDerived>& other)
+EIGEN_DEVICE_FUNC void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(const TriangularBase<OtherDerived>& other)
 {
  eigen_assert(Mode == int(OtherDerived::Mode));
  internal::call_assignment_no_alias(derived(), other.derived());
 }
+#endif

 /***************************************************************************
 * Implementation of TriangularBase methods
@@ -592,7 +600,7 @@ void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(const TriangularBas
  * If the matrix is triangular, the opposite part is set to zero. */
 template<typename Derived>
 template<typename DenseDerived>
-void TriangularBase<Derived>::evalTo(MatrixBase<DenseDerived> &other) const
+EIGEN_DEVICE_FUNC void TriangularBase<Derived>::evalTo(MatrixBase<DenseDerived> &other) const
 {
  evalToLazy(other.derived());
 }
@@ -618,7 +626,7 @@ void TriangularBase<Derived>::evalTo(MatrixBase<DenseDerived> &other) const
  */
 template<typename Derived>
 template<unsigned int Mode>
-typename MatrixBase<Derived>::template TriangularViewReturnType<Mode>::Type
+EIGEN_DEVICE_FUNC typename MatrixBase<Derived>::template TriangularViewReturnType<Mode>::Type
 MatrixBase<Derived>::triangularView()
 {
  return typename TriangularViewReturnType<Mode>::Type(derived());
@@ -627,7 +635,7 @@ MatrixBase<Derived>::triangularView()
 /** This is the const version of MatrixBase::triangularView() */
 template<typename Derived>
 template<unsigned int Mode>
-typename MatrixBase<Derived>::template ConstTriangularViewReturnType<Mode>::Type
+EIGEN_DEVICE_FUNC typename MatrixBase<Derived>::template ConstTriangularViewReturnType<Mode>::Type
 MatrixBase<Derived>::triangularView() const
 {
  return typename ConstTriangularViewReturnType<Mode>::Type(derived());
@@ -692,7 +700,7 @@ bool MatrixBase<Derived>::isLowerTriangular(const RealScalar& prec) const

 namespace internal {

-  
+
 // TODO currently a triangular expression has the form TriangularView<.,.>
 //      in the future triangular-ness should be defined by the expression traits
 //      such that Transpose<TriangularView<.,.> > is valid. (currently TriangularBase::transpose() is overloaded to make it work)
@@ -720,7 +728,7 @@ struct Dense2Triangular         {};

 template<typename Kernel, unsigned int Mode, int UnrollCount, bool ClearOpposite> struct triangular_assignment_loop;

- 
+
 /** \internal Specialization of the dense assignment kernel for triangular matrices.
  * The main difference is that the triangular, diagonal, and opposite parts are processed through three different functions.
  * \tparam UpLo must be either Lower or Upper
@@ -737,17 +745,17 @@ protected:
  using Base::m_src;
  using Base::m_functor;
 public:
-  
+
  typedef typename Base::DstEvaluatorType DstEvaluatorType;
  typedef typename Base::SrcEvaluatorType SrcEvaluatorType;
  typedef typename Base::Scalar Scalar;
  typedef typename Base::AssignmentTraits AssignmentTraits;
-  
-  
+
+
  EIGEN_DEVICE_FUNC triangular_dense_assignment_kernel(DstEvaluatorType &dst, const SrcEvaluatorType &src, const Functor &func, DstXprType& dstExpr)
    : Base(dst, src, func, dstExpr)
  {}
-  
+
 #ifdef EIGEN_INTERNAL_DEBUGGING
  EIGEN_DEVICE_FUNC void assignCoeff(Index row, Index col)
  {
@@ -757,16 +765,16 @@ public:
 #else
  using Base::assignCoeff;
 #endif
-  
+
  EIGEN_DEVICE_FUNC void assignDiagonalCoeff(Index id)
  {
         if(Mode==UnitDiag && SetOpposite) m_functor.assignCoeff(m_dst.coeffRef(id,id), Scalar(1));
    else if(Mode==ZeroDiag && SetOpposite) m_functor.assignCoeff(m_dst.coeffRef(id,id), Scalar(0));
    else if(Mode==0)                       Base::assignCoeff(id,id);
  }
-  
+
  EIGEN_DEVICE_FUNC void assignOppositeCoeff(Index row, Index col)
-  { 
+  {
    eigen_internal_assert(row!=col);
    if(SetOpposite)
      m_functor.assignCoeff(m_dst.coeffRef(row,col), Scalar(0));
@@ -787,17 +795,17 @@ void call_triangular_assignment_loop(DstXprType& dst, const SrcXprType& src, con
  if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
    dst.resize(dstRows, dstCols);
  DstEvaluatorType dstEvaluator(dst);
-    
+
  typedef triangular_dense_assignment_kernel< Mode&(Lower|Upper),Mode&(UnitDiag|ZeroDiag|SelfAdjoint),SetOpposite,
                                              DstEvaluatorType,SrcEvaluatorType,Functor> Kernel;
  Kernel kernel(dstEvaluator, srcEvaluator, func, dst.const_cast_derived());
-  
+
  enum {
      unroll = DstXprType::SizeAtCompileTime != Dynamic
            && SrcEvaluatorType::CoeffReadCost < HugeCost
            && DstXprType::SizeAtCompileTime * (DstEvaluatorType::CoeffReadCost+SrcEvaluatorType::CoeffReadCost) / 2 <= EIGEN_UNROLLING_LIMIT
    };
-  
+
  triangular_assignment_loop<Kernel, Mode, unroll ? int(DstXprType::SizeAtCompileTime) : Dynamic, SetOpposite>::run(kernel);
 }

@@ -819,8 +827,8 @@ struct Assignment<DstXprType, SrcXprType, Functor, Triangular2Triangular>
  EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const Functor &func)
  {
    eigen_assert(int(DstXprType::Mode) == int(SrcXprType::Mode));
-    
-    call_triangular_assignment_loop<DstXprType::Mode, false>(dst, src, func);  
+
+    call_triangular_assignment_loop<DstXprType::Mode, false>(dst, src, func);
  }
 };

@@ -829,7 +837,7 @@ struct Assignment<DstXprType, SrcXprType, Functor, Triangular2Dense>
 {
  EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const Functor &func)
  {
-    call_triangular_assignment_loop<SrcXprType::Mode, (SrcXprType::Mode&SelfAdjoint)==0>(dst, src, func);  
+    call_triangular_assignment_loop<SrcXprType::Mode, (SrcXprType::Mode&SelfAdjoint)==0>(dst, src, func);
  }
 };

@@ -838,7 +846,7 @@ struct Assignment<DstXprType, SrcXprType, Functor, Dense2Triangular>
 {
  EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const Functor &func)
  {
-    call_triangular_assignment_loop<DstXprType::Mode, false>(dst, src, func);  
+    call_triangular_assignment_loop<DstXprType::Mode, false>(dst, src, func);
  }
 };

@@ -849,19 +857,19 @@ struct triangular_assignment_loop
  // FIXME: this is not very clean, perhaps this information should be provided by the kernel?
  typedef typename Kernel::DstEvaluatorType DstEvaluatorType;
  typedef typename DstEvaluatorType::XprType DstXprType;
-  
+
  enum {
    col = (UnrollCount-1) / DstXprType::RowsAtCompileTime,
    row = (UnrollCount-1) % DstXprType::RowsAtCompileTime
  };
-  
+
  typedef typename Kernel::Scalar Scalar;

  EIGEN_DEVICE_FUNC
  static inline void run(Kernel &kernel)
  {
    triangular_assignment_loop<Kernel, Mode, UnrollCount-1, SetOpposite>::run(kernel);
-    
+
    if(row==col)
      kernel.assignDiagonalCoeff(row);
    else if( ((Mode&Lower) && row>col) || ((Mode&Upper) && row<col) )
@@ -904,10 +912,10 @@ struct triangular_assignment_loop<Kernel, Mode, Dynamic, SetOpposite>
      }
      else
        i = maxi;
-      
+
      if(i<kernel.rows()) // then i==j
        kernel.assignDiagonalCoeff(i++);
-      
+
      if (((Mode&Upper) && SetOpposite) || (Mode&Lower))
      {
        for(; i < kernel.rows(); ++i)
@@ -924,28 +932,27 @@ struct triangular_assignment_loop<Kernel, Mode, Dynamic, SetOpposite>
  * If the matrix is triangular, the opposite part is set to zero. */
 template<typename Derived>
 template<typename DenseDerived>
-void TriangularBase<Derived>::evalToLazy(MatrixBase<DenseDerived> &other) const
+EIGEN_DEVICE_FUNC void TriangularBase<Derived>::evalToLazy(MatrixBase<DenseDerived> &other) const
 {
  other.derived().resize(this->rows(), this->cols());
  internal::call_triangular_assignment_loop<Derived::Mode,(Derived::Mode&SelfAdjoint)==0 /* SetOpposite */>(other.derived(), derived().nestedExpression());
 }

 namespace internal {
-  
+
 // Triangular = Product
 template< typename DstXprType, typename Lhs, typename Rhs, typename Scalar>
 struct Assignment<DstXprType, Product<Lhs,Rhs,DefaultProduct>, internal::assign_op<Scalar,typename Product<Lhs,Rhs,DefaultProduct>::Scalar>, Dense2Triangular>
 {
  typedef Product<Lhs,Rhs,DefaultProduct> SrcXprType;
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,typename SrcXprType::Scalar> &)
+  static EIGEN_DEVICE_FUNC void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,typename SrcXprType::Scalar> &)
  {
    Index dstRows = src.rows();
    Index dstCols = src.cols();
    if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
      dst.resize(dstRows, dstCols);

-    dst.setZero();
-    dst._assignProduct(src, 1);
+    dst._assignProduct(src, 1, 0);
  }
 };

@@ -954,9 +961,9 @@ template< typename DstXprType, typename Lhs, typename Rhs, typename Scalar>
 struct Assignment<DstXprType, Product<Lhs,Rhs,DefaultProduct>, internal::add_assign_op<Scalar,typename Product<Lhs,Rhs,DefaultProduct>::Scalar>, Dense2Triangular>
 {
  typedef Product<Lhs,Rhs,DefaultProduct> SrcXprType;
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<Scalar,typename SrcXprType::Scalar> &)
+  static EIGEN_DEVICE_FUNC void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<Scalar,typename SrcXprType::Scalar> &)
  {
-    dst._assignProduct(src, 1);
+    dst._assignProduct(src, 1, 1);
  }
 };

@@ -965,9 +972,9 @@ template< typename DstXprType, typename Lhs, typename Rhs, typename Scalar>
 struct Assignment<DstXprType, Product<Lhs,Rhs,DefaultProduct>, internal::sub_assign_op<Scalar,typename Product<Lhs,Rhs,DefaultProduct>::Scalar>, Dense2Triangular>
 {
  typedef Product<Lhs,Rhs,DefaultProduct> SrcXprType;
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<Scalar,typename SrcXprType::Scalar> &)
+  static EIGEN_DEVICE_FUNC void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<Scalar,typename SrcXprType::Scalar> &)
  {
-    dst._assignProduct(src, -1);
+    dst._assignProduct(src, -1, 1);
  }
 };

--- a/Eigen/src/Core/VectorwiseOp.h
+++ b/Eigen/src/Core/VectorwiseOp.h
@@ -670,7 +670,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
  * \sa rowwise(), class VectorwiseOp, \ref TutorialReductionsVisitorsBroadcasting
  */
 template<typename Derived>
-inline typename DenseBase<Derived>::ColwiseReturnType
+EIGEN_DEVICE_FUNC inline typename DenseBase<Derived>::ColwiseReturnType
 DenseBase<Derived>::colwise()
 {
  return ColwiseReturnType(derived());
@@ -684,7 +684,7 @@ DenseBase<Derived>::colwise()
  * \sa colwise(), class VectorwiseOp, \ref TutorialReductionsVisitorsBroadcasting
  */
 template<typename Derived>
-inline typename DenseBase<Derived>::RowwiseReturnType
+EIGEN_DEVICE_FUNC inline typename DenseBase<Derived>::RowwiseReturnType
 DenseBase<Derived>::rowwise()
 {
  return RowwiseReturnType(derived());
--- a/Eigen/src/Core/Visitor.h
+++ b/Eigen/src/Core/Visitor.h
@@ -194,7 +194,8 @@ struct functor_traits<max_coeff_visitor<Scalar> > {

 } // end namespace internal

-/** \returns the minimum of all coefficients of *this and puts in *row and *col its location.
+/** \fn DenseBase<Derived>::minCoeff(IndexType* rowId, IndexType* colId) const
+  * \returns the minimum of all coefficients of *this and puts in *row and *col its location.
  * \warning the result is undefined if \c *this contains NaN.
  *
  * \sa DenseBase::minCoeff(Index*), DenseBase::maxCoeff(Index*,Index*), DenseBase::visit(), DenseBase::minCoeff()
@@ -230,7 +231,8 @@ DenseBase<Derived>::minCoeff(IndexType* index) const
  return minVisitor.res;
 }

-/** \returns the maximum of all coefficients of *this and puts in *row and *col its location.
+/** \fn DenseBase<Derived>::maxCoeff(IndexType* rowId, IndexType* colId) const
+  * \returns the maximum of all coefficients of *this and puts in *row and *col its location.
  * \warning the result is undefined if \c *this contains NaN. 
  *
  * \sa DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::visit(), DenseBase::maxCoeff()
--- a/Eigen/src/Core/arch/AVX/Complex.h
+++ b/Eigen/src/Core/arch/AVX/Complex.h
@@ -204,23 +204,7 @@ template<> struct conj_helper<Packet4cf, Packet4cf, true,true>
  }
 };

-template<> struct conj_helper<Packet8f, Packet4cf, false,false>
-{
-  EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet8f& x, const Packet4cf& y, const Packet4cf& c) const
-  { return padd(c, pmul(x,y)); }
-
-  EIGEN_STRONG_INLINE Packet4cf pmul(const Packet8f& x, const Packet4cf& y) const
-  { return Packet4cf(Eigen::internal::pmul(x, y.v)); }
-};
-
-template<> struct conj_helper<Packet4cf, Packet8f, false,false>
-{
-  EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet8f& y, const Packet4cf& c) const
-  { return padd(c, pmul(x,y)); }
-
-  EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& x, const Packet8f& y) const
-  { return Packet4cf(Eigen::internal::pmul(x.v, y)); }
-};
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet4cf,Packet8f)

 template<> EIGEN_STRONG_INLINE Packet4cf pdiv<Packet4cf>(const Packet4cf& a, const Packet4cf& b)
 {
@@ -400,23 +384,7 @@ template<> struct conj_helper<Packet2cd, Packet2cd, true,true>
  }
 };

-template<> struct conj_helper<Packet4d, Packet2cd, false,false>
-{
-  EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet4d& x, const Packet2cd& y, const Packet2cd& c) const
-  { return padd(c, pmul(x,y)); }
-
-  EIGEN_STRONG_INLINE Packet2cd pmul(const Packet4d& x, const Packet2cd& y) const
-  { return Packet2cd(Eigen::internal::pmul(x, y.v)); }
-};
-
-template<> struct conj_helper<Packet2cd, Packet4d, false,false>
-{
-  EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet4d& y, const Packet2cd& c) const
-  { return padd(c, pmul(x,y)); }
-
-  EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& x, const Packet4d& y) const
-  { return Packet2cd(Eigen::internal::pmul(x.v, y)); }
-};
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cd,Packet4d)

 template<> EIGEN_STRONG_INLINE Packet2cd pdiv<Packet2cd>(const Packet2cd& a, const Packet2cd& b)
 {
--- a/Eigen/src/Core/arch/AVX/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX/PacketMath.h
@@ -159,11 +159,12 @@ template<> EIGEN_STRONG_INLINE Packet8i pdiv<Packet8i>(const Packet8i& /*a*/, co

 #ifdef __FMA__
 template<> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f& b, const Packet8f& c) {
-#if ( EIGEN_COMP_GNUC_STRICT || (EIGEN_COMP_CLANG && (EIGEN_COMP_CLANG<308)) )
-  // clang stupidly generates a vfmadd213ps instruction plus some vmovaps on registers,
-  // and gcc stupidly generates a vfmadd132ps instruction,
-  // so let's enforce it to generate a vfmadd231ps instruction since the most common use case is to accumulate
-  // the result of the product.
+#if ( (EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC<80) || (EIGEN_COMP_CLANG) )
+  // Clang stupidly generates a vfmadd213ps instruction plus some vmovaps on registers,
+  //  and even register spilling with clang>=6.0 (bug 1637).
+  // Gcc stupidly generates a vfmadd132ps instruction.
+  // So let's enforce it to generate a vfmadd231ps instruction since the most common use
+  //  case is to accumulate the result of the product.
  Packet8f res = c;
  __asm__("vfmadd231ps %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b));
  return res;
@@ -172,7 +173,7 @@ template<> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f&
 #endif
 }
 template<> EIGEN_STRONG_INLINE Packet4d pmadd(const Packet4d& a, const Packet4d& b, const Packet4d& c) {
-#if ( EIGEN_COMP_GNUC_STRICT || (EIGEN_COMP_CLANG && (EIGEN_COMP_CLANG<308)) )
+#if ( (EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC<80) || (EIGEN_COMP_CLANG) )
  // see above
  Packet4d res = c;
  __asm__("vfmadd231pd %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b));
@@ -308,9 +309,9 @@ template<> EIGEN_STRONG_INLINE void pstore1<Packet8i>(int* to, const int& a)
 }

 #ifndef EIGEN_VECTORIZE_AVX512
-template<> EIGEN_STRONG_INLINE void prefetch<float>(const float*   addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
-template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
-template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*       addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<float>(const float*   addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*       addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
 #endif

 template<> EIGEN_STRONG_INLINE float  pfirst<Packet8f>(const Packet8f& a) {
@@ -333,9 +334,12 @@ template<> EIGEN_STRONG_INLINE Packet4d preverse(const Packet4d& a)
 {
   __m256d tmp = _mm256_shuffle_pd(a,a,5);
  return _mm256_permute2f128_pd(tmp, tmp, 1);
-
+  #if 0
+  // This version is unlikely to be faster as _mm256_shuffle_ps and _mm256_permute_pd
+  // exhibit the same latency/throughput, but it is here for future reference/benchmarking...
  __m256d swap_halves = _mm256_permute2f128_pd(a,a,1);
    return _mm256_permute_pd(swap_halves,5);
+  #endif
 }

 // pabs should be ok
@@ -395,14 +399,11 @@ template<> EIGEN_STRONG_INLINE Packet4d preduxp<Packet4d>(const Packet4d* vecs)

 template<> EIGEN_STRONG_INLINE float predux<Packet8f>(const Packet8f& a)
 {
-  Packet8f tmp0 = _mm256_hadd_ps(a,_mm256_permute2f128_ps(a,a,1));
-  tmp0 = _mm256_hadd_ps(tmp0,tmp0);
-  return pfirst(_mm256_hadd_ps(tmp0, tmp0));
+  return predux(Packet4f(_mm_add_ps(_mm256_castps256_ps128(a),_mm256_extractf128_ps(a,1))));
 }
 template<> EIGEN_STRONG_INLINE double predux<Packet4d>(const Packet4d& a)
 {
-  Packet4d tmp0 = _mm256_hadd_pd(a,_mm256_permute2f128_pd(a,a,1));
-  return pfirst(_mm256_hadd_pd(tmp0,tmp0));
+  return predux(Packet2d(_mm_add_pd(_mm256_castpd256_pd128(a),_mm256_extractf128_pd(a,1))));
 }

 template<> EIGEN_STRONG_INLINE Packet4f predux_downto4<Packet8f>(const Packet8f& a)
--- a/Eigen/src/Core/arch/AVX512/MathFunctions.h
+++ b/Eigen/src/Core/arch/AVX512/MathFunctions.h
@@ -29,6 +29,7 @@ namespace internal {
 #define _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(NAME, X) \
  const Packet8d p8d_##NAME = _mm512_castsi512_pd(_mm512_set1_epi64(X))

+
 // Natural logarithm
 // Computes log(x) as log(2^e * m) = C*e + log(m), where the constant C =log(2)
 // and m is in the range [sqrt(1/2),sqrt(2)). In this range, the logarithm can
@@ -47,6 +48,7 @@ plog<Packet16f>(const Packet16f& _x) {
  // The smallest non denormalized float number.
  _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(min_norm_pos, 0x00800000);
  _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(minus_inf, 0xff800000);
+  _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(pos_inf, 0x7f800000);
  _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(nan, 0x7fc00000);

  // Polynomial coefficients.
@@ -64,11 +66,9 @@ plog<Packet16f>(const Packet16f& _x) {
  _EIGEN_DECLARE_CONST_Packet16f(cephes_log_q2, 0.693359375f);

  // invalid_mask is set to true when x is NaN
-  __mmask16 invalid_mask =
-      _mm512_cmp_ps_mask(x, _mm512_setzero_ps(), _CMP_NGE_UQ);
-  __mmask16 iszero_mask =
-      _mm512_cmp_ps_mask(x, _mm512_setzero_ps(), _CMP_EQ_UQ);
-
+  __mmask16 invalid_mask =  _mm512_cmp_ps_mask(x, _mm512_setzero_ps(), _CMP_NGE_UQ);
+  __mmask16 iszero_mask  =  _mm512_cmp_ps_mask(x, _mm512_setzero_ps(), _CMP_EQ_OQ);
+      
  // Truncate input values to the minimum positive normal.
  x = pmax(x, p16f_min_norm_pos);

@@ -88,9 +88,9 @@ plog<Packet16f>(const Packet16f& _x) {
  //     x = x + x - 1.0;
  //   } else { x = x - 1.0; }
  __mmask16 mask = _mm512_cmp_ps_mask(x, p16f_cephes_SQRTHF, _CMP_LT_OQ);
-  Packet16f tmp = _mm512_mask_blend_ps(mask, x, _mm512_setzero_ps());
+  Packet16f tmp = _mm512_mask_blend_ps(mask, _mm512_setzero_ps(), x);
  x = psub(x, p16f_1);
-  e = psub(e, _mm512_mask_blend_ps(mask, p16f_1, _mm512_setzero_ps()));
+  e = psub(e, _mm512_mask_blend_ps(mask, _mm512_setzero_ps(), p16f_1));
  x = padd(x, tmp);

  Packet16f x2 = pmul(x, x);
@@ -118,10 +118,18 @@ plog<Packet16f>(const Packet16f& _x) {
  x = padd(x, y);
  x = padd(x, y2);

-  // Filter out invalid inputs, i.e. negative arg will be NAN, 0 will be -INF.
-  return _mm512_mask_blend_ps(iszero_mask, p16f_minus_inf,
-                              _mm512_mask_blend_ps(invalid_mask, p16f_nan, x));
+  __mmask16 pos_inf_mask = _mm512_cmp_ps_mask(_x,p16f_pos_inf,_CMP_EQ_OQ);
+  // Filter out invalid inputs, i.e.:
+  //  - negative arg will be NAN,
+  //  - 0 will be -INF.
+  //  - +INF will be +INF
+  return _mm512_mask_blend_ps(iszero_mask,
+            _mm512_mask_blend_ps(invalid_mask,
+              _mm512_mask_blend_ps(pos_inf_mask,x,p16f_pos_inf),
+              p16f_nan),
+            p16f_minus_inf);
 }
+
 #endif

 // Exponential function. Works by writing "x = m*log(2) + r" where
@@ -257,50 +265,39 @@ pexp<Packet8d>(const Packet8d& _x) {
 template <>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
 psqrt<Packet16f>(const Packet16f& _x) {
-  _EIGEN_DECLARE_CONST_Packet16f(one_point_five, 1.5f);
-  _EIGEN_DECLARE_CONST_Packet16f(minus_half, -0.5f);
-  _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(flt_min, 0x00800000);
+  Packet16f neg_half = pmul(_x, pset1<Packet16f>(-.5f));
+  __mmask16 denormal_mask = _mm512_kand(
+      _mm512_cmp_ps_mask(_x, pset1<Packet16f>((std::numeric_limits<float>::min)()),
+                        _CMP_LT_OQ),
+      _mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_GE_OQ));

-  Packet16f neg_half = pmul(_x, p16f_minus_half);
-
-  // select only the inverse sqrt of positive normal inputs (denormals are
-  // flushed to zero and cause infs as well).
-  __mmask16 non_zero_mask = _mm512_cmp_ps_mask(_x, p16f_flt_min, _CMP_GE_OQ);
-  Packet16f x = _mm512_mask_blend_ps(non_zero_mask, _mm512_rsqrt14_ps(_x),
-                                     _mm512_setzero_ps());
+  Packet16f x = _mm512_rsqrt14_ps(_x);

  // Do a single step of Newton's iteration.
-  x = pmul(x, pmadd(neg_half, pmul(x, x), p16f_one_point_five));
+  x = pmul(x, pmadd(neg_half, pmul(x, x), pset1<Packet16f>(1.5f)));

-  // Multiply the original _x by it's reciprocal square root to extract the
-  // square root.
-  return pmul(_x, x);
+  // Flush results for denormals to zero.
+  return _mm512_mask_blend_ps(denormal_mask, pmul(_x,x), _mm512_setzero_ps());
 }

 template <>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d
 psqrt<Packet8d>(const Packet8d& _x) {
-  _EIGEN_DECLARE_CONST_Packet8d(one_point_five, 1.5);
-  _EIGEN_DECLARE_CONST_Packet8d(minus_half, -0.5);
-  _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(dbl_min, 0x0010000000000000LL);
+  Packet8d neg_half = pmul(_x, pset1<Packet8d>(-.5));
+  __mmask16 denormal_mask = _mm512_kand(
+      _mm512_cmp_pd_mask(_x, pset1<Packet8d>((std::numeric_limits<double>::min)()),
+                        _CMP_LT_OQ),
+      _mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_GE_OQ));

-  Packet8d neg_half = pmul(_x, p8d_minus_half);
+  Packet8d x = _mm512_rsqrt14_pd(_x);

-  // select only the inverse sqrt of positive normal inputs (denormals are
-  // flushed to zero and cause infs as well).
-  __mmask8 non_zero_mask = _mm512_cmp_pd_mask(_x, p8d_dbl_min, _CMP_GE_OQ);
-  Packet8d x = _mm512_mask_blend_pd(non_zero_mask, _mm512_rsqrt14_pd(_x),
-                                    _mm512_setzero_pd());
-
-  // Do a first step of Newton's iteration.
-  x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
+  // Do a single step of Newton's iteration.
+  x = pmul(x, pmadd(neg_half, pmul(x, x), pset1<Packet8d>(1.5)));

  // Do a second step of Newton's iteration.
-  x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
+  x = pmul(x, pmadd(neg_half, pmul(x, x), pset1<Packet8d>(1.5)));

-  // Multiply the original _x by it's reciprocal square root to extract the
-  // square root.
-  return pmul(_x, x);
+  return _mm512_mask_blend_pd(denormal_mask, pmul(_x,x), _mm512_setzero_pd());
 }
 #else
 template <>
@@ -333,20 +330,18 @@ prsqrt<Packet16f>(const Packet16f& _x) {
  // select only the inverse sqrt of positive normal inputs (denormals are
  // flushed to zero and cause infs as well).
  __mmask16 le_zero_mask = _mm512_cmp_ps_mask(_x, p16f_flt_min, _CMP_LT_OQ);
-  Packet16f x = _mm512_mask_blend_ps(le_zero_mask, _mm512_setzero_ps(),
-                                     _mm512_rsqrt14_ps(_x));
+  Packet16f x = _mm512_mask_blend_ps(le_zero_mask, _mm512_rsqrt14_ps(_x), _mm512_setzero_ps());

  // Fill in NaNs and Infs for the negative/zero entries.
  __mmask16 neg_mask = _mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_LT_OQ);
  Packet16f infs_and_nans = _mm512_mask_blend_ps(
-      neg_mask, p16f_nan,
-      _mm512_mask_blend_ps(le_zero_mask, p16f_inf, _mm512_setzero_ps()));
+      neg_mask, _mm512_mask_blend_ps(le_zero_mask, _mm512_setzero_ps(), p16f_inf), p16f_nan);

  // Do a single step of Newton's iteration.
  x = pmul(x, pmadd(neg_half, pmul(x, x), p16f_one_point_five));

  // Insert NaNs and Infs in all the right places.
-  return _mm512_mask_blend_ps(le_zero_mask, infs_and_nans, x);
+  return _mm512_mask_blend_ps(le_zero_mask, x, infs_and_nans);
 }

 template <>
@@ -363,14 +358,12 @@ prsqrt<Packet8d>(const Packet8d& _x) {
  // select only the inverse sqrt of positive normal inputs (denormals are
  // flushed to zero and cause infs as well).
  __mmask8 le_zero_mask = _mm512_cmp_pd_mask(_x, p8d_dbl_min, _CMP_LT_OQ);
-  Packet8d x = _mm512_mask_blend_pd(le_zero_mask, _mm512_setzero_pd(),
-                                    _mm512_rsqrt14_pd(_x));
+  Packet8d x = _mm512_mask_blend_pd(le_zero_mask, _mm512_rsqrt14_pd(_x), _mm512_setzero_pd());

  // Fill in NaNs and Infs for the negative/zero entries.
  __mmask8 neg_mask = _mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_LT_OQ);
  Packet8d infs_and_nans = _mm512_mask_blend_pd(
-      neg_mask, p8d_nan,
-      _mm512_mask_blend_pd(le_zero_mask, p8d_inf, _mm512_setzero_pd()));
+      neg_mask, _mm512_mask_blend_pd(le_zero_mask, _mm512_setzero_pd(), p8d_inf), p8d_nan);

  // Do a first step of Newton's iteration.
  x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
@@ -379,9 +372,9 @@ prsqrt<Packet8d>(const Packet8d& _x) {
  x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));

  // Insert NaNs and Infs in all the right places.
-  return _mm512_mask_blend_pd(le_zero_mask, infs_and_nans, x);
+  return _mm512_mask_blend_pd(le_zero_mask, x, infs_and_nans);
 }
-#else
+#elif defined(EIGEN_VECTORIZE_AVX512ER)
 template <>
 EIGEN_STRONG_INLINE Packet16f prsqrt<Packet16f>(const Packet16f& x) {
  return _mm512_rsqrt28_ps(x);
--- a/Eigen/src/Core/arch/AVX512/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX512/PacketMath.h
@@ -19,10 +19,10 @@ namespace internal {
 #endif

 #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
-#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*))
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
 #endif

-#ifdef __FMA__
+#ifdef EIGEN_VECTORIZE_FMA
 #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
 #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
 #endif
@@ -54,13 +54,14 @@ template<> struct packet_traits<float>  : default_packet_traits
    AlignedOnScalar = 1,
    size = 16,
    HasHalfPacket = 1,
-#if EIGEN_GNUC_AT_LEAST(5, 3)
+    HasBlend = 0,
+#if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT)
 #ifdef EIGEN_VECTORIZE_AVX512DQ
    HasLog = 1,
 #endif
    HasExp = 1,
-    HasSqrt = 1,
-    HasRsqrt = 1,
+    HasSqrt = EIGEN_FAST_MATH,
+    HasRsqrt = EIGEN_FAST_MATH,
 #endif
    HasDiv = 1
  };
@@ -74,8 +75,8 @@ template<> struct packet_traits<double> : default_packet_traits
    AlignedOnScalar = 1,
    size = 8,
    HasHalfPacket = 1,
-#if EIGEN_GNUC_AT_LEAST(5, 3)
-    HasSqrt = 1,
+#if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT)
+    HasSqrt = EIGEN_FAST_MATH,
    HasRsqrt = EIGEN_FAST_MATH,
 #endif
    HasDiv = 1
@@ -98,6 +99,7 @@ template <>
 struct unpacket_traits<Packet16f> {
  typedef float type;
  typedef Packet8f half;
+  typedef Packet16i integer_packet;
  enum { size = 16, alignment=Aligned64 };
 };
 template <>
@@ -132,7 +134,7 @@ EIGEN_STRONG_INLINE Packet16f pload1<Packet16f>(const float* from) {
 }
 template <>
 EIGEN_STRONG_INLINE Packet8d pload1<Packet8d>(const double* from) {
-  return _mm512_broadcastsd_pd(_mm_load_pd1(from));
+  return _mm512_set1_pd(*from);
 }

 template <>
@@ -158,6 +160,11 @@ EIGEN_STRONG_INLINE Packet8d padd<Packet8d>(const Packet8d& a,
                                            const Packet8d& b) {
  return _mm512_add_pd(a, b);
 }
+template <>
+EIGEN_STRONG_INLINE Packet16i padd<Packet16i>(const Packet16i& a,
+                                              const Packet16i& b) {
+  return _mm512_add_epi32(a, b);
+}

 template <>
 EIGEN_STRONG_INLINE Packet16f psub<Packet16f>(const Packet16f& a,
@@ -169,6 +176,11 @@ EIGEN_STRONG_INLINE Packet8d psub<Packet8d>(const Packet8d& a,
                                            const Packet8d& b) {
  return _mm512_sub_pd(a, b);
 }
+template <>
+EIGEN_STRONG_INLINE Packet16i psub<Packet16i>(const Packet16i& a,
+                                              const Packet16i& b) {
+  return _mm512_sub_epi32(a, b);
+}

 template <>
 EIGEN_STRONG_INLINE Packet16f pnegate(const Packet16f& a) {
@@ -202,6 +214,11 @@ EIGEN_STRONG_INLINE Packet8d pmul<Packet8d>(const Packet8d& a,
                                            const Packet8d& b) {
  return _mm512_mul_pd(a, b);
 }
+template <>
+EIGEN_STRONG_INLINE Packet16i pmul<Packet16i>(const Packet16i& a,
+                                              const Packet16i& b) {
+  return _mm512_mul_epi32(a, b);
+}

 template <>
 EIGEN_STRONG_INLINE Packet16f pdiv<Packet16f>(const Packet16f& a,
@@ -214,7 +231,7 @@ EIGEN_STRONG_INLINE Packet8d pdiv<Packet8d>(const Packet8d& a,
  return _mm512_div_pd(a, b);
 }

-#ifdef __FMA__
+#ifdef EIGEN_VECTORIZE_FMA
 template <>
 EIGEN_STRONG_INLINE Packet16f pmadd(const Packet16f& a, const Packet16f& b,
                                    const Packet16f& c) {
@@ -230,23 +247,73 @@ EIGEN_STRONG_INLINE Packet8d pmadd(const Packet8d& a, const Packet8d& b,
 template <>
 EIGEN_STRONG_INLINE Packet16f pmin<Packet16f>(const Packet16f& a,
                                              const Packet16f& b) {
-  return _mm512_min_ps(a, b);
+  // Arguments are reversed to match NaN propagation behavior of std::min.
+  return _mm512_min_ps(b, a);
 }
 template <>
 EIGEN_STRONG_INLINE Packet8d pmin<Packet8d>(const Packet8d& a,
                                            const Packet8d& b) {
-  return _mm512_min_pd(a, b);
+  // Arguments are reversed to match NaN propagation behavior of std::min.
+  return _mm512_min_pd(b, a);
 }

 template <>
 EIGEN_STRONG_INLINE Packet16f pmax<Packet16f>(const Packet16f& a,
                                              const Packet16f& b) {
-  return _mm512_max_ps(a, b);
+  // Arguments are reversed to match NaN propagation behavior of std::max.
+  return _mm512_max_ps(b, a);
 }
 template <>
 EIGEN_STRONG_INLINE Packet8d pmax<Packet8d>(const Packet8d& a,
                                            const Packet8d& b) {
-  return _mm512_max_pd(a, b);
+  // Arguments are reversed to match NaN propagation behavior of std::max.
+  return _mm512_max_pd(b, a);
+}
+
+#ifdef EIGEN_VECTORIZE_AVX512DQ
+template<int I_> EIGEN_STRONG_INLINE Packet8f extract256(Packet16f x) { return _mm512_extractf32x8_ps(x,I_); }
+template<int I_> EIGEN_STRONG_INLINE Packet2d extract128(Packet8d x) { return _mm512_extractf64x2_pd(x,I_); }
+EIGEN_STRONG_INLINE Packet16f cat256(Packet8f a, Packet8f b) { return _mm512_insertf32x8(_mm512_castps256_ps512(a),b,1); }
+#else
+// AVX512F does not define _mm512_extractf32x8_ps to extract _m256 from _m512
+template<int I_> EIGEN_STRONG_INLINE Packet8f extract256(Packet16f x) {
+  return  _mm256_castsi256_ps(_mm512_extracti64x4_epi64( _mm512_castps_si512(x),I_));
+}
+
+// AVX512F does not define _mm512_extractf64x2_pd to extract _m128 from _m512
+template<int I_> EIGEN_STRONG_INLINE Packet2d extract128(Packet8d x) {
+  return _mm_castsi128_pd(_mm512_extracti32x4_epi32( _mm512_castpd_si512(x),I_));
+}
+
+EIGEN_STRONG_INLINE Packet16f cat256(Packet8f a, Packet8f b) {
+  return _mm512_castsi512_ps(_mm512_inserti64x4(_mm512_castsi256_si512(_mm256_castps_si256(a)),
+                                                _mm256_castps_si256(b),1));
+}
+#endif
+
+// Helper function for bit packing snippet of low precision comparison.
+// It packs the flags from 32x16 to 16x16.
+EIGEN_STRONG_INLINE __m256i Pack32To16(Packet16f rf) {
+  // Split data into small pieces and handle with AVX instructions
+  // to guarantee internal order of vector.
+  // Operation:
+  //   dst[15:0]    := Saturate16(rf[31:0])
+  //   dst[31:16]   := Saturate16(rf[63:32])
+  //   ...
+  //   dst[255:240] := Saturate16(rf[255:224])
+  __m256i lo = _mm256_castps_si256(extract256<0>(rf));
+  __m256i hi = _mm256_castps_si256(extract256<1>(rf));
+  __m128i result_lo = _mm_packs_epi32(_mm256_extractf128_si256(lo, 0),
+                                      _mm256_extractf128_si256(lo, 1));
+  __m128i result_hi = _mm_packs_epi32(_mm256_extractf128_si256(hi, 0),
+                                      _mm256_extractf128_si256(hi, 1));
+  return _mm256_insertf128_si256(_mm256_castsi128_si256(result_lo), result_hi, 1);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16i pand<Packet16i>(const Packet16i& a,
+                                              const Packet16i& b) {
+  return _mm512_and_si512(a,b);
 }

 template <>
@@ -255,24 +322,7 @@ EIGEN_STRONG_INLINE Packet16f pand<Packet16f>(const Packet16f& a,
 #ifdef EIGEN_VECTORIZE_AVX512DQ
  return _mm512_and_ps(a, b);
 #else
-  Packet16f res = _mm512_undefined_ps();
-  Packet4f lane0_a = _mm512_extractf32x4_ps(a, 0);
-  Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0);
-  res = _mm512_insertf32x4(res, _mm_and_ps(lane0_a, lane0_b), 0);
-
-  Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1);
-  Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1);
-  res = _mm512_insertf32x4(res, _mm_and_ps(lane1_a, lane1_b), 1);
-
-  Packet4f lane2_a = _mm512_extractf32x4_ps(a, 2);
-  Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2);
-  res = _mm512_insertf32x4(res, _mm_and_ps(lane2_a, lane2_b), 2);
-
-  Packet4f lane3_a = _mm512_extractf32x4_ps(a, 3);
-  Packet4f lane3_b = _mm512_extractf32x4_ps(b, 3);
-  res = _mm512_insertf32x4(res, _mm_and_ps(lane3_a, lane3_b), 3);
-
-  return res;
+  return _mm512_castsi512_ps(pand(_mm512_castps_si512(a),_mm512_castps_si512(b)));
 #endif
 }
 template <>
@@ -288,35 +338,21 @@ EIGEN_STRONG_INLINE Packet8d pand<Packet8d>(const Packet8d& a,

  Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1);
  Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1);
-  res = _mm512_insertf64x4(res, _mm256_and_pd(lane1_a, lane1_b), 1);
-
-  return res;
+  return _mm512_insertf64x4(res, _mm256_and_pd(lane1_a, lane1_b), 1);
 #endif
 }
+
 template <>
-EIGEN_STRONG_INLINE Packet16f por<Packet16f>(const Packet16f& a,
-                                             const Packet16f& b) {
+EIGEN_STRONG_INLINE Packet16i por<Packet16i>(const Packet16i& a, const Packet16i& b) {
+  return _mm512_or_si512(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f por<Packet16f>(const Packet16f& a, const Packet16f& b) {
 #ifdef EIGEN_VECTORIZE_AVX512DQ
  return _mm512_or_ps(a, b);
 #else
-  Packet16f res = _mm512_undefined_ps();
-  Packet4f lane0_a = _mm512_extractf32x4_ps(a, 0);
-  Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0);
-  res = _mm512_insertf32x4(res, _mm_or_ps(lane0_a, lane0_b), 0);
-
-  Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1);
-  Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1);
-  res = _mm512_insertf32x4(res, _mm_or_ps(lane1_a, lane1_b), 1);
-
-  Packet4f lane2_a = _mm512_extractf32x4_ps(a, 2);
-  Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2);
-  res = _mm512_insertf32x4(res, _mm_or_ps(lane2_a, lane2_b), 2);
-
-  Packet4f lane3_a = _mm512_extractf32x4_ps(a, 3);
-  Packet4f lane3_b = _mm512_extractf32x4_ps(b, 3);
-  res = _mm512_insertf32x4(res, _mm_or_ps(lane3_a, lane3_b), 3);
-
-  return res;
+  return _mm512_castsi512_ps(por(_mm512_castps_si512(a),_mm512_castps_si512(b)));
 #endif
 }

@@ -326,109 +362,67 @@ EIGEN_STRONG_INLINE Packet8d por<Packet8d>(const Packet8d& a,
 #ifdef EIGEN_VECTORIZE_AVX512DQ
  return _mm512_or_pd(a, b);
 #else
-  Packet8d res = _mm512_undefined_pd();
-  Packet4d lane0_a = _mm512_extractf64x4_pd(a, 0);
-  Packet4d lane0_b = _mm512_extractf64x4_pd(b, 0);
-  res = _mm512_insertf64x4(res, _mm256_or_pd(lane0_a, lane0_b), 0);
-
-  Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1);
-  Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1);
-  res = _mm512_insertf64x4(res, _mm256_or_pd(lane1_a, lane1_b), 1);
-
-  return res;
+  return _mm512_castsi512_pd(por(_mm512_castpd_si512(a),_mm512_castpd_si512(b)));
 #endif
 }

 template <>
-EIGEN_STRONG_INLINE Packet16f pxor<Packet16f>(const Packet16f& a,
-                                              const Packet16f& b) {
+EIGEN_STRONG_INLINE Packet16i pxor<Packet16i>(const Packet16i& a, const Packet16i& b) {
+  return _mm512_xor_si512(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pxor<Packet16f>(const Packet16f& a, const Packet16f& b) {
 #ifdef EIGEN_VECTORIZE_AVX512DQ
  return _mm512_xor_ps(a, b);
 #else
-  Packet16f res = _mm512_undefined_ps();
-  Packet4f lane0_a = _mm512_extractf32x4_ps(a, 0);
-  Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0);
-  res = _mm512_insertf32x4(res, _mm_xor_ps(lane0_a, lane0_b), 0);
-
-  Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1);
-  Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1);
-  res = _mm512_insertf32x4(res, _mm_xor_ps(lane1_a, lane1_b), 1);
-
-  Packet4f lane2_a = _mm512_extractf32x4_ps(a, 2);
-  Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2);
-  res = _mm512_insertf32x4(res, _mm_xor_ps(lane2_a, lane2_b), 2);
-
-  Packet4f lane3_a = _mm512_extractf32x4_ps(a, 3);
-  Packet4f lane3_b = _mm512_extractf32x4_ps(b, 3);
-  res = _mm512_insertf32x4(res, _mm_xor_ps(lane3_a, lane3_b), 3);
-
-  return res;
+  return _mm512_castsi512_ps(pxor(_mm512_castps_si512(a),_mm512_castps_si512(b)));
 #endif
 }
+
 template <>
-EIGEN_STRONG_INLINE Packet8d pxor<Packet8d>(const Packet8d& a,
-                                            const Packet8d& b) {
+EIGEN_STRONG_INLINE Packet8d pxor<Packet8d>(const Packet8d& a, const Packet8d& b) {
 #ifdef EIGEN_VECTORIZE_AVX512DQ
  return _mm512_xor_pd(a, b);
 #else
-  Packet8d res = _mm512_undefined_pd();
-  Packet4d lane0_a = _mm512_extractf64x4_pd(a, 0);
-  Packet4d lane0_b = _mm512_extractf64x4_pd(b, 0);
-  res = _mm512_insertf64x4(res, _mm256_xor_pd(lane0_a, lane0_b), 0);
-
-  Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1);
-  Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1);
-  res = _mm512_insertf64x4(res, _mm256_xor_pd(lane1_a, lane1_b), 1);
-
-  return res;
+  return _mm512_castsi512_pd(pxor(_mm512_castpd_si512(a),_mm512_castpd_si512(b)));
 #endif
 }

 template <>
-EIGEN_STRONG_INLINE Packet16f pandnot<Packet16f>(const Packet16f& a,
-                                                 const Packet16f& b) {
+EIGEN_STRONG_INLINE Packet16i pandnot<Packet16i>(const Packet16i& a, const Packet16i& b) {
+  return _mm512_andnot_si512(b, a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pandnot<Packet16f>(const Packet16f& a, const Packet16f& b) {
 #ifdef EIGEN_VECTORIZE_AVX512DQ
-  return _mm512_andnot_ps(a, b);
+  return _mm512_andnot_ps(b, a);
 #else
-  Packet16f res = _mm512_undefined_ps();
-  Packet4f lane0_a = _mm512_extractf32x4_ps(a, 0);
-  Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0);
-  res = _mm512_insertf32x4(res, _mm_andnot_ps(lane0_a, lane0_b), 0);
-
-  Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1);
-  Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1);
-  res = _mm512_insertf32x4(res, _mm_andnot_ps(lane1_a, lane1_b), 1);
-
-  Packet4f lane2_a = _mm512_extractf32x4_ps(a, 2);
-  Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2);
-  res = _mm512_insertf32x4(res, _mm_andnot_ps(lane2_a, lane2_b), 2);
-
-  Packet4f lane3_a = _mm512_extractf32x4_ps(a, 3);
-  Packet4f lane3_b = _mm512_extractf32x4_ps(b, 3);
-  res = _mm512_insertf32x4(res, _mm_andnot_ps(lane3_a, lane3_b), 3);
-
-  return res;
+  return _mm512_castsi512_ps(pandnot(_mm512_castps_si512(a),_mm512_castps_si512(b)));
 #endif
 }
 template <>
-EIGEN_STRONG_INLINE Packet8d pandnot<Packet8d>(const Packet8d& a,
-                                               const Packet8d& b) {
+EIGEN_STRONG_INLINE Packet8d pandnot<Packet8d>(const Packet8d& a,const Packet8d& b) {
 #ifdef EIGEN_VECTORIZE_AVX512DQ
-  return _mm512_andnot_pd(a, b);
+  return _mm512_andnot_pd(b, a);
 #else
-  Packet8d res = _mm512_undefined_pd();
-  Packet4d lane0_a = _mm512_extractf64x4_pd(a, 0);
-  Packet4d lane0_b = _mm512_extractf64x4_pd(b, 0);
-  res = _mm512_insertf64x4(res, _mm256_andnot_pd(lane0_a, lane0_b), 0);
-
-  Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1);
-  Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1);
-  res = _mm512_insertf64x4(res, _mm256_andnot_pd(lane1_a, lane1_b), 1);
-
-  return res;
+  return _mm512_castsi512_pd(pandnot(_mm512_castpd_si512(a),_mm512_castpd_si512(b)));
 #endif
 }

+template<int N> EIGEN_STRONG_INLINE Packet16i parithmetic_shift_right(Packet16i a) {
+  return _mm512_srai_epi32(a, N);
+}
+
+template<int N> EIGEN_STRONG_INLINE Packet16i plogical_shift_right(Packet16i a) {
+  return _mm512_srli_epi32(a, N);
+}
+
+template<int N> EIGEN_STRONG_INLINE Packet16i plogical_shift_left(Packet16i a) {
+  return _mm512_slli_epi32(a, N);
+}
+
 template <>
 EIGEN_STRONG_INLINE Packet16f pload<Packet16f>(const float* from) {
  EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_ps(from);
@@ -461,75 +455,55 @@ EIGEN_STRONG_INLINE Packet16i ploadu<Packet16i>(const int* from) {
 // {a0, a0  a1, a1, a2, a2, a3, a3, a4, a4, a5, a5, a6, a6, a7, a7}
 template <>
 EIGEN_STRONG_INLINE Packet16f ploaddup<Packet16f>(const float* from) {
-  Packet8f lane0 = _mm256_broadcast_ps((const __m128*)(const void*)from);
-  // mimic an "inplace" permutation of the lower 128bits using a blend
-  lane0 = _mm256_blend_ps(
-      lane0, _mm256_castps128_ps256(_mm_permute_ps(
-                 _mm256_castps256_ps128(lane0), _MM_SHUFFLE(1, 0, 1, 0))),
-      15);
-  // then we can perform a consistent permutation on the global register to get
-  // everything in shape:
-  lane0 = _mm256_permute_ps(lane0, _MM_SHUFFLE(3, 3, 2, 2));
-
-  Packet8f lane1 = _mm256_broadcast_ps((const __m128*)(const void*)(from + 4));
-  // mimic an "inplace" permutation of the lower 128bits using a blend
-  lane1 = _mm256_blend_ps(
-      lane1, _mm256_castps128_ps256(_mm_permute_ps(
-                 _mm256_castps256_ps128(lane1), _MM_SHUFFLE(1, 0, 1, 0))),
-      15);
-  // then we can perform a consistent permutation on the global register to get
-  // everything in shape:
-  lane1 = _mm256_permute_ps(lane1, _MM_SHUFFLE(3, 3, 2, 2));
+  // an unaligned load is required here as there is no requirement
+  // on the alignment of input pointer 'from'
+  __m256i low_half = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from));
+  __m512 even_elements = _mm512_castsi512_ps(_mm512_cvtepu32_epi64(low_half));
+  __m512 pairs = _mm512_permute_ps(even_elements, _MM_SHUFFLE(2, 2, 0, 0));
+  return pairs;
+}

 #ifdef EIGEN_VECTORIZE_AVX512DQ
-  Packet16f res = _mm512_undefined_ps();
-  return _mm512_insertf32x8(res, lane0, 0);
-  return _mm512_insertf32x8(res, lane1, 1);
-  return res;
-#else
-  Packet16f res = _mm512_undefined_ps();
-  res = _mm512_insertf32x4(res, _mm256_extractf128_ps(lane0, 0), 0);
-  res = _mm512_insertf32x4(res, _mm256_extractf128_ps(lane0, 1), 1);
-  res = _mm512_insertf32x4(res, _mm256_extractf128_ps(lane1, 0), 2);
-  res = _mm512_insertf32x4(res, _mm256_extractf128_ps(lane1, 1), 3);
-  return res;
-#endif
-}
+// FIXME: this does not look optimal, better load a Packet4d and shuffle...
 // Loads 4 doubles from memory a returns the packet {a0, a0  a1, a1, a2, a2, a3,
 // a3}
 template <>
 EIGEN_STRONG_INLINE Packet8d ploaddup<Packet8d>(const double* from) {
-  Packet4d lane0 = _mm256_broadcast_pd((const __m128d*)(const void*)from);
-  lane0 = _mm256_permute_pd(lane0, 3 << 2);
-
-  Packet4d lane1 = _mm256_broadcast_pd((const __m128d*)(const void*)(from + 2));
-  lane1 = _mm256_permute_pd(lane1, 3 << 2);
-
-  Packet8d res = _mm512_undefined_pd();
-  res = _mm512_insertf64x4(res, lane0, 0);
-  return _mm512_insertf64x4(res, lane1, 1);
+ __m512d x = _mm512_setzero_pd();
+  x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[0]), 0);
+  x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[1]), 1);
+  x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[2]), 2);
+  x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[3]), 3);
+  return x;
 }
+#else
+template <>
+EIGEN_STRONG_INLINE Packet8d ploaddup<Packet8d>(const double* from) {
+  __m512d x = _mm512_setzero_pd();
+  x = _mm512_mask_broadcastsd_pd(x, 0x3<<0, _mm_load_sd(from+0));
+  x = _mm512_mask_broadcastsd_pd(x, 0x3<<2, _mm_load_sd(from+1));
+  x = _mm512_mask_broadcastsd_pd(x, 0x3<<4, _mm_load_sd(from+2));
+  x = _mm512_mask_broadcastsd_pd(x, 0x3<<6, _mm_load_sd(from+3));
+  return x;
+}
+#endif

 // Loads 4 floats from memory a returns the packet
 // {a0, a0  a0, a0, a1, a1, a1, a1, a2, a2, a2, a2, a3, a3, a3, a3}
 template <>
 EIGEN_STRONG_INLINE Packet16f ploadquad<Packet16f>(const float* from) {
-  Packet16f tmp = _mm512_undefined_ps();
-  tmp = _mm512_insertf32x4(tmp, _mm_load_ps1(from), 0);
-  tmp = _mm512_insertf32x4(tmp, _mm_load_ps1(from + 1), 1);
-  tmp = _mm512_insertf32x4(tmp, _mm_load_ps1(from + 2), 2);
-  tmp = _mm512_insertf32x4(tmp, _mm_load_ps1(from + 3), 3);
-  return tmp;
+  Packet16f tmp = _mm512_castps128_ps512(ploadu<Packet4f>(from));
+  const Packet16i scatter_mask = _mm512_set_epi32(3,3,3,3, 2,2,2,2, 1,1,1,1, 0,0,0,0);
+  return _mm512_permutexvar_ps(scatter_mask, tmp);
 }
+
 // Loads 2 doubles from memory a returns the packet
 // {a0, a0  a0, a0, a1, a1, a1, a1}
 template <>
 EIGEN_STRONG_INLINE Packet8d ploadquad<Packet8d>(const double* from) {
-  Packet8d tmp = _mm512_undefined_pd();
-  Packet2d tmp0 = _mm_load_pd1(from);
-  Packet2d tmp1 = _mm_load_pd1(from + 1);
-  Packet4d lane0 = _mm256_broadcastsd_pd(tmp0);
-  Packet4d lane1 = _mm256_broadcastsd_pd(tmp1);
+  __m256d lane0 = _mm256_set1_pd(*from);
+  __m256d lane1 = _mm256_set1_pd(*(from+1));
+  __m512d tmp = _mm512_undefined_pd();
  tmp = _mm512_insertf64x4(tmp, lane0, 0);
  return _mm512_insertf64x4(tmp, lane1, 1);
 }
@@ -565,7 +539,7 @@ EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet16i& from) {
 template <>
 EIGEN_DEVICE_FUNC inline Packet16f pgather<float, Packet16f>(const float* from,
                                                             Index stride) {
-  Packet16i stride_vector = _mm512_set1_epi32(stride);
+  Packet16i stride_vector = _mm512_set1_epi32(convert_index<int>(stride));
  Packet16i stride_multiplier =
      _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
  Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
@@ -575,7 +549,7 @@ EIGEN_DEVICE_FUNC inline Packet16f pgather<float, Packet16f>(const float* from,
 template <>
 EIGEN_DEVICE_FUNC inline Packet8d pgather<double, Packet8d>(const double* from,
                                                            Index stride) {
-  Packet8i stride_vector = _mm256_set1_epi32(stride);
+  Packet8i stride_vector = _mm256_set1_epi32(convert_index<int>(stride));
  Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
  Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier);

@@ -586,7 +560,7 @@ template <>
 EIGEN_DEVICE_FUNC inline void pscatter<float, Packet16f>(float* to,
                                                         const Packet16f& from,
                                                         Index stride) {
-  Packet16i stride_vector = _mm512_set1_epi32(stride);
+  Packet16i stride_vector = _mm512_set1_epi32(convert_index<int>(stride));
  Packet16i stride_multiplier =
      _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
  Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
@@ -596,7 +570,7 @@ template <>
 EIGEN_DEVICE_FUNC inline void pscatter<double, Packet8d>(double* to,
                                                         const Packet8d& from,
                                                         Index stride) {
-  Packet8i stride_vector = _mm256_set1_epi32(stride);
+  Packet8i stride_vector = _mm256_set1_epi32(convert_index<int>(stride));
  Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
  Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier);
  _mm512_i32scatter_pd(to, indices, from, 8);
@@ -618,9 +592,9 @@ EIGEN_STRONG_INLINE void pstore1<Packet16i>(int* to, const int& a) {
  pstore(to, pa);
 }

-template<> EIGEN_STRONG_INLINE void prefetch<float>(const float*   addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
-template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
-template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*       addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<float>(const float*   addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*       addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }

 template <>
 EIGEN_STRONG_INLINE float pfirst<Packet16f>(const Packet16f& a) {
@@ -648,20 +622,20 @@ template<> EIGEN_STRONG_INLINE Packet8d preverse(const Packet8d& a)
 template<> EIGEN_STRONG_INLINE Packet16f pabs(const Packet16f& a)
 {
  // _mm512_abs_ps intrinsic not found, so hack around it
-  return (__m512)_mm512_and_si512((__m512i)a, _mm512_set1_epi32(0x7fffffff));
+  return _mm512_castsi512_ps(_mm512_and_si512(_mm512_castps_si512(a), _mm512_set1_epi32(0x7fffffff)));
 }
 template <>
 EIGEN_STRONG_INLINE Packet8d pabs(const Packet8d& a) {
  // _mm512_abs_ps intrinsic not found, so hack around it
-  return (__m512d)_mm512_and_si512((__m512i)a,
-                                   _mm512_set1_epi64(0x7fffffffffffffff));
+  return _mm512_castsi512_pd(_mm512_and_si512(_mm512_castpd_si512(a),
+                                   _mm512_set1_epi64(0x7fffffffffffffff)));
 }

 #ifdef EIGEN_VECTORIZE_AVX512DQ
 // AVX512F does not define _mm512_extractf32x8_ps to extract _m256 from _m512
 #define EIGEN_EXTRACT_8f_FROM_16f(INPUT, OUTPUT)                           \
-  __m256 OUTPUT##_0 = _mm512_extractf32x8_ps(INPUT, 0) __m256 OUTPUT##_1 = \
-      _mm512_extractf32x8_ps(INPUT, 1)
+  __m256 OUTPUT##_0 = _mm512_extractf32x8_ps(INPUT, 0);                    \
+  __m256 OUTPUT##_1 = _mm512_extractf32x8_ps(INPUT, 1)
 #else
 #define EIGEN_EXTRACT_8f_FROM_16f(INPUT, OUTPUT)                \
  __m256 OUTPUT##_0 = _mm256_insertf128_ps(                     \
@@ -674,17 +648,136 @@ EIGEN_STRONG_INLINE Packet8d pabs(const Packet8d& a) {

 #ifdef EIGEN_VECTORIZE_AVX512DQ
 #define EIGEN_INSERT_8f_INTO_16f(OUTPUT, INPUTA, INPUTB) \
-  OUTPUT = _mm512_insertf32x8(OUTPUT, INPUTA, 0);        \
-  OUTPUT = _mm512_insertf32x8(OUTPUT, INPUTB, 1);
+  OUTPUT = _mm512_insertf32x8(_mm512_castps256_ps512(INPUTA), INPUTB, 1);
 #else
 #define EIGEN_INSERT_8f_INTO_16f(OUTPUT, INPUTA, INPUTB)                    \
+  OUTPUT = _mm512_undefined_ps();                                           \
  OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTA, 0), 0); \
  OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTA, 1), 1); \
  OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTB, 0), 2); \
  OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTB, 1), 3);
 #endif
-template<> EIGEN_STRONG_INLINE Packet16f preduxp<Packet16f>(const Packet16f*
-vecs)
+
+template <>
+EIGEN_STRONG_INLINE float predux<Packet16f>(const Packet16f& a) {
+#ifdef EIGEN_VECTORIZE_AVX512DQ
+  __m256 lane0 = _mm512_extractf32x8_ps(a, 0);
+  __m256 lane1 = _mm512_extractf32x8_ps(a, 1);
+  Packet8f x = _mm256_add_ps(lane0, lane1);
+  return predux<Packet8f>(x);
+#else
+  __m128 lane0 = _mm512_extractf32x4_ps(a, 0);
+  __m128 lane1 = _mm512_extractf32x4_ps(a, 1);
+  __m128 lane2 = _mm512_extractf32x4_ps(a, 2);
+  __m128 lane3 = _mm512_extractf32x4_ps(a, 3);
+  __m128 sum = _mm_add_ps(_mm_add_ps(lane0, lane1), _mm_add_ps(lane2, lane3));
+  sum = _mm_hadd_ps(sum, sum);
+  sum = _mm_hadd_ps(sum, _mm_permute_ps(sum, 1));
+  return _mm_cvtss_f32(sum);
+#endif
+}
+template <>
+EIGEN_STRONG_INLINE double predux<Packet8d>(const Packet8d& a) {
+  __m256d lane0 = _mm512_extractf64x4_pd(a, 0);
+  __m256d lane1 = _mm512_extractf64x4_pd(a, 1);
+  __m256d sum = _mm256_add_pd(lane0, lane1);
+  __m256d tmp0 = _mm256_hadd_pd(sum, _mm256_permute2f128_pd(sum, sum, 1));
+  return _mm_cvtsd_f64(_mm256_castpd256_pd128(_mm256_hadd_pd(tmp0, tmp0)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8f predux_downto4<Packet16f>(const Packet16f& a) {
+#ifdef EIGEN_VECTORIZE_AVX512DQ
+  Packet8f lane0 = _mm512_extractf32x8_ps(a, 0);
+  Packet8f lane1 = _mm512_extractf32x8_ps(a, 1);
+  return padd(lane0, lane1);
+#else
+  Packet4f lane0 = _mm512_extractf32x4_ps(a, 0);
+  Packet4f lane1 = _mm512_extractf32x4_ps(a, 1);
+  Packet4f lane2 = _mm512_extractf32x4_ps(a, 2);
+  Packet4f lane3 = _mm512_extractf32x4_ps(a, 3);
+  Packet4f sum0 = padd(lane0, lane2);
+  Packet4f sum1 = padd(lane1, lane3);
+  return _mm256_insertf128_ps(_mm256_castps128_ps256(sum0), sum1, 1);
+#endif
+}
+template <>
+EIGEN_STRONG_INLINE Packet4d predux_downto4<Packet8d>(const Packet8d& a) {
+  Packet4d lane0 = _mm512_extractf64x4_pd(a, 0);
+  Packet4d lane1 = _mm512_extractf64x4_pd(a, 1);
+  Packet4d res = padd(lane0, lane1);
+  return res;
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_mul<Packet16f>(const Packet16f& a) {
+//#ifdef EIGEN_VECTORIZE_AVX512DQ
+#if 0
+  Packet8f lane0 = _mm512_extractf32x8_ps(a, 0);
+  Packet8f lane1 = _mm512_extractf32x8_ps(a, 1);
+  Packet8f res = pmul(lane0, lane1);
+  res = pmul(res, _mm256_permute2f128_ps(res, res, 1));
+  res = pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
+  return pfirst(pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
+#else
+  __m128 lane0 = _mm512_extractf32x4_ps(a, 0);
+  __m128 lane1 = _mm512_extractf32x4_ps(a, 1);
+  __m128 lane2 = _mm512_extractf32x4_ps(a, 2);
+  __m128 lane3 = _mm512_extractf32x4_ps(a, 3);
+  __m128 res = pmul(pmul(lane0, lane1), pmul(lane2, lane3));
+  res = pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
+  return pfirst(pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
+#endif
+}
+template <>
+EIGEN_STRONG_INLINE double predux_mul<Packet8d>(const Packet8d& a) {
+  __m256d lane0 = _mm512_extractf64x4_pd(a, 0);
+  __m256d lane1 = _mm512_extractf64x4_pd(a, 1);
+  __m256d res = pmul(lane0, lane1);
+  res = pmul(res, _mm256_permute2f128_pd(res, res, 1));
+  return pfirst(pmul(res, _mm256_shuffle_pd(res, res, 1)));
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_min<Packet16f>(const Packet16f& a) {
+  __m128 lane0 = _mm512_extractf32x4_ps(a, 0);
+  __m128 lane1 = _mm512_extractf32x4_ps(a, 1);
+  __m128 lane2 = _mm512_extractf32x4_ps(a, 2);
+  __m128 lane3 = _mm512_extractf32x4_ps(a, 3);
+  __m128 res = _mm_min_ps(_mm_min_ps(lane0, lane1), _mm_min_ps(lane2, lane3));
+  res = _mm_min_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
+  return pfirst(_mm_min_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
+}
+template <>
+EIGEN_STRONG_INLINE double predux_min<Packet8d>(const Packet8d& a) {
+  __m256d lane0 = _mm512_extractf64x4_pd(a, 0);
+  __m256d lane1 = _mm512_extractf64x4_pd(a, 1);
+  __m256d res = _mm256_min_pd(lane0, lane1);
+  res = _mm256_min_pd(res, _mm256_permute2f128_pd(res, res, 1));
+  return pfirst(_mm256_min_pd(res, _mm256_shuffle_pd(res, res, 1)));
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_max<Packet16f>(const Packet16f& a) {
+  __m128 lane0 = _mm512_extractf32x4_ps(a, 0);
+  __m128 lane1 = _mm512_extractf32x4_ps(a, 1);
+  __m128 lane2 = _mm512_extractf32x4_ps(a, 2);
+  __m128 lane3 = _mm512_extractf32x4_ps(a, 3);
+  __m128 res = _mm_max_ps(_mm_max_ps(lane0, lane1), _mm_max_ps(lane2, lane3));
+  res = _mm_max_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
+  return pfirst(_mm_max_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
+}
+
+template <>
+EIGEN_STRONG_INLINE double predux_max<Packet8d>(const Packet8d& a) {
+  __m256d lane0 = _mm512_extractf64x4_pd(a, 0);
+  __m256d lane1 = _mm512_extractf64x4_pd(a, 1);
+  __m256d res = _mm256_max_pd(lane0, lane1);
+  res = _mm256_max_pd(res, _mm256_permute2f128_pd(res, res, 1));
+  return pfirst(_mm256_max_pd(res, _mm256_shuffle_pd(res, res, 1)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16f preduxp<Packet16f>(const Packet16f* vecs)
 {
  EIGEN_EXTRACT_8f_FROM_16f(vecs[0], vecs0);
  EIGEN_EXTRACT_8f_FROM_16f(vecs[1], vecs1);
@@ -873,174 +966,7 @@ template<> EIGEN_STRONG_INLINE Packet8d preduxp<Packet8d>(const Packet8d* vecs)

  return _mm512_insertf64x4(final_output, final_1, 1);
 }
-
-template <>
-EIGEN_STRONG_INLINE float predux<Packet16f>(const Packet16f& a) {
-  //#ifdef EIGEN_VECTORIZE_AVX512DQ
-#if 0
-  Packet8f lane0 = _mm512_extractf32x8_ps(a, 0);
-  Packet8f lane1 = _mm512_extractf32x8_ps(a, 1);
-  Packet8f sum = padd(lane0, lane1);
-  Packet8f tmp0 = _mm256_hadd_ps(sum, _mm256_permute2f128_ps(a, a, 1));
-  tmp0 = _mm256_hadd_ps(tmp0, tmp0);
-  return pfirst(_mm256_hadd_ps(tmp0, tmp0));
-#else
-  Packet4f lane0 = _mm512_extractf32x4_ps(a, 0);
-  Packet4f lane1 = _mm512_extractf32x4_ps(a, 1);
-  Packet4f lane2 = _mm512_extractf32x4_ps(a, 2);
-  Packet4f lane3 = _mm512_extractf32x4_ps(a, 3);
-  Packet4f sum = padd(padd(lane0, lane1), padd(lane2, lane3));
-  sum = _mm_hadd_ps(sum, sum);
-  sum = _mm_hadd_ps(sum, _mm_permute_ps(sum, 1));
-  return pfirst(sum);
-#endif
-}
-template <>
-EIGEN_STRONG_INLINE double predux<Packet8d>(const Packet8d& a) {
-  Packet4d lane0 = _mm512_extractf64x4_pd(a, 0);
-  Packet4d lane1 = _mm512_extractf64x4_pd(a, 1);
-  Packet4d sum = padd(lane0, lane1);
-  Packet4d tmp0 = _mm256_hadd_pd(sum, _mm256_permute2f128_pd(sum, sum, 1));
-  return pfirst(_mm256_hadd_pd(tmp0, tmp0));
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet8f predux_downto4<Packet16f>(const Packet16f& a) {
-#ifdef EIGEN_VECTORIZE_AVX512DQ
-  Packet8f lane0 = _mm512_extractf32x8_ps(a, 0);
-  Packet8f lane1 = _mm512_extractf32x8_ps(a, 1);
-  return padd(lane0, lane1);
-#else
-  Packet4f lane0 = _mm512_extractf32x4_ps(a, 0);
-  Packet4f lane1 = _mm512_extractf32x4_ps(a, 1);
-  Packet4f lane2 = _mm512_extractf32x4_ps(a, 2);
-  Packet4f lane3 = _mm512_extractf32x4_ps(a, 3);
-  Packet4f sum0 = padd(lane0, lane2);
-  Packet4f sum1 = padd(lane1, lane3);
-  return _mm256_insertf128_ps(_mm256_castps128_ps256(sum0), sum1, 1);
-#endif
-}
-template <>
-EIGEN_STRONG_INLINE Packet4d predux_downto4<Packet8d>(const Packet8d& a) {
-  Packet4d lane0 = _mm512_extractf64x4_pd(a, 0);
-  Packet4d lane1 = _mm512_extractf64x4_pd(a, 1);
-  Packet4d res = padd(lane0, lane1);
-  return res;
-}
-
-template <>
-EIGEN_STRONG_INLINE float predux_mul<Packet16f>(const Packet16f& a) {
-//#ifdef EIGEN_VECTORIZE_AVX512DQ
-#if 0
-  Packet8f lane0 = _mm512_extractf32x8_ps(a, 0);
-  Packet8f lane1 = _mm512_extractf32x8_ps(a, 1);
-  Packet8f res = pmul(lane0, lane1);
-  res = pmul(res, _mm256_permute2f128_ps(res, res, 1));
-  res = pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
-  return pfirst(pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
-#else
-  Packet4f lane0 = _mm512_extractf32x4_ps(a, 0);
-  Packet4f lane1 = _mm512_extractf32x4_ps(a, 1);
-  Packet4f lane2 = _mm512_extractf32x4_ps(a, 2);
-  Packet4f lane3 = _mm512_extractf32x4_ps(a, 3);
-  Packet4f res = pmul(pmul(lane0, lane1), pmul(lane2, lane3));
-  res = pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
-  return pfirst(pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
-#endif
-}
-template <>
-EIGEN_STRONG_INLINE double predux_mul<Packet8d>(const Packet8d& a) {
-  Packet4d lane0 = _mm512_extractf64x4_pd(a, 0);
-  Packet4d lane1 = _mm512_extractf64x4_pd(a, 1);
-  Packet4d res = pmul(lane0, lane1);
-  res = pmul(res, _mm256_permute2f128_pd(res, res, 1));
-  return pfirst(pmul(res, _mm256_shuffle_pd(res, res, 1)));
-}
-
-template <>
-EIGEN_STRONG_INLINE float predux_min<Packet16f>(const Packet16f& a) {
-  Packet4f lane0 = _mm512_extractf32x4_ps(a, 0);
-  Packet4f lane1 = _mm512_extractf32x4_ps(a, 1);
-  Packet4f lane2 = _mm512_extractf32x4_ps(a, 2);
-  Packet4f lane3 = _mm512_extractf32x4_ps(a, 3);
-  Packet4f res = _mm_min_ps(_mm_min_ps(lane0, lane1), _mm_min_ps(lane2, lane3));
-  res = _mm_min_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
-  return pfirst(_mm_min_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
-}
-template <>
-EIGEN_STRONG_INLINE double predux_min<Packet8d>(const Packet8d& a) {
-  Packet4d lane0 = _mm512_extractf64x4_pd(a, 0);
-  Packet4d lane1 = _mm512_extractf64x4_pd(a, 1);
-  Packet4d res = _mm256_min_pd(lane0, lane1);
-  res = _mm256_min_pd(res, _mm256_permute2f128_pd(res, res, 1));
-  return pfirst(_mm256_min_pd(res, _mm256_shuffle_pd(res, res, 1)));
-}
-
-template <>
-EIGEN_STRONG_INLINE float predux_max<Packet16f>(const Packet16f& a) {
-  Packet4f lane0 = _mm512_extractf32x4_ps(a, 0);
-  Packet4f lane1 = _mm512_extractf32x4_ps(a, 1);
-  Packet4f lane2 = _mm512_extractf32x4_ps(a, 2);
-  Packet4f lane3 = _mm512_extractf32x4_ps(a, 3);
-  Packet4f res = _mm_max_ps(_mm_max_ps(lane0, lane1), _mm_max_ps(lane2, lane3));
-  res = _mm_max_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
-  return pfirst(_mm_max_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
-}
-template <>
-EIGEN_STRONG_INLINE double predux_max<Packet8d>(const Packet8d& a) {
-  Packet4d lane0 = _mm512_extractf64x4_pd(a, 0);
-  Packet4d lane1 = _mm512_extractf64x4_pd(a, 1);
-  Packet4d res = _mm256_max_pd(lane0, lane1);
-  res = _mm256_max_pd(res, _mm256_permute2f128_pd(res, res, 1));
-  return pfirst(_mm256_max_pd(res, _mm256_shuffle_pd(res, res, 1)));
-}
-
-template <int Offset>
-struct palign_impl<Offset, Packet16f> {
-  static EIGEN_STRONG_INLINE void run(Packet16f& first,
-                                      const Packet16f& second) {
-    if (Offset != 0) {
-      __m512i first_idx = _mm512_set_epi32(
-          Offset + 15, Offset + 14, Offset + 13, Offset + 12, Offset + 11,
-          Offset + 10, Offset + 9, Offset + 8, Offset + 7, Offset + 6,
-          Offset + 5, Offset + 4, Offset + 3, Offset + 2, Offset + 1, Offset);
-
-      __m512i second_idx =
-          _mm512_set_epi32(Offset - 1, Offset - 2, Offset - 3, Offset - 4,
-                           Offset - 5, Offset - 6, Offset - 7, Offset - 8,
-                           Offset - 9, Offset - 10, Offset - 11, Offset - 12,
-                           Offset - 13, Offset - 14, Offset - 15, Offset - 16);
-
-      unsigned short mask = 0xFFFF;
-      mask <<= (16 - Offset);
-
-      first = _mm512_permutexvar_ps(first_idx, first);
-      Packet16f tmp = _mm512_permutexvar_ps(second_idx, second);
-      first = _mm512_mask_blend_ps(mask, first, tmp);
-    }
-  }
-};
-template <int Offset>
-struct palign_impl<Offset, Packet8d> {
-  static EIGEN_STRONG_INLINE void run(Packet8d& first, const Packet8d& second) {
-    if (Offset != 0) {
-      __m512i first_idx = _mm512_set_epi32(
-          0, Offset + 7, 0, Offset + 6, 0, Offset + 5, 0, Offset + 4, 0,
-          Offset + 3, 0, Offset + 2, 0, Offset + 1, 0, Offset);
-
-      __m512i second_idx = _mm512_set_epi32(
-          0, Offset - 1, 0, Offset - 2, 0, Offset - 3, 0, Offset - 4, 0,
-          Offset - 5, 0, Offset - 6, 0, Offset - 7, 0, Offset - 8);
-
-      unsigned char mask = 0xFF;
-      mask <<= (8 - Offset);
-
-      first = _mm512_permutexvar_pd(first_idx, first);
-      Packet8d tmp = _mm512_permutexvar_pd(second_idx, second);
-      first = _mm512_mask_blend_pd(mask, first, tmp);
-    }
-  }
-};
+ 


 #define PACK_OUTPUT(OUTPUT, INPUT, INDEX, STRIDE) \
@@ -1302,13 +1228,76 @@ EIGEN_STRONG_INLINE Packet16f pblend(const Selector<16>& /*ifPacket*/,
  return Packet16f();
 }
 template <>
-EIGEN_STRONG_INLINE Packet8d pblend(const Selector<8>& /*ifPacket*/,
-                                    const Packet8d& /*thenPacket*/,
-                                    const Packet8d& /*elsePacket*/) {
-  assert(false && "To be implemented");
-  return Packet8d();
+EIGEN_STRONG_INLINE Packet8d pblend(const Selector<8>& ifPacket,
+                                    const Packet8d& thenPacket,
+                                    const Packet8d& elsePacket) {
+  __mmask8 m = (ifPacket.select[0]   )
+             | (ifPacket.select[1]<<1)
+             | (ifPacket.select[2]<<2)
+             | (ifPacket.select[3]<<3)
+             | (ifPacket.select[4]<<4)
+             | (ifPacket.select[5]<<5)
+             | (ifPacket.select[6]<<6)
+             | (ifPacket.select[7]<<7);
+  return _mm512_mask_blend_pd(m, elsePacket, thenPacket);
 }

+template<> EIGEN_STRONG_INLINE Packet16i pcast<Packet16f, Packet16i>(const Packet16f& a) {
+  return _mm512_cvttps_epi32(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16f pcast<Packet16i, Packet16f>(const Packet16i& a) {
+  return _mm512_cvtepi32_ps(a);
+}
+
+template <int Offset>
+struct palign_impl<Offset, Packet16f> {
+  static EIGEN_STRONG_INLINE void run(Packet16f& first,
+                                      const Packet16f& second) {
+    if (Offset != 0) {
+      __m512i first_idx = _mm512_set_epi32(
+          Offset + 15, Offset + 14, Offset + 13, Offset + 12, Offset + 11,
+          Offset + 10, Offset + 9, Offset + 8, Offset + 7, Offset + 6,
+          Offset + 5, Offset + 4, Offset + 3, Offset + 2, Offset + 1, Offset);
+
+      __m512i second_idx =
+          _mm512_set_epi32(Offset - 1, Offset - 2, Offset - 3, Offset - 4,
+                           Offset - 5, Offset - 6, Offset - 7, Offset - 8,
+                           Offset - 9, Offset - 10, Offset - 11, Offset - 12,
+                           Offset - 13, Offset - 14, Offset - 15, Offset - 16);
+
+      unsigned short mask = 0xFFFF;
+      mask <<= (16 - Offset);
+
+      first = _mm512_permutexvar_ps(first_idx, first);
+      Packet16f tmp = _mm512_permutexvar_ps(second_idx, second);
+      first = _mm512_mask_blend_ps(mask, first, tmp);
+    }
+  }
+};
+template <int Offset>
+struct palign_impl<Offset, Packet8d> {
+  static EIGEN_STRONG_INLINE void run(Packet8d& first, const Packet8d& second) {
+    if (Offset != 0) {
+      __m512i first_idx = _mm512_set_epi32(
+          0, Offset + 7, 0, Offset + 6, 0, Offset + 5, 0, Offset + 4, 0,
+          Offset + 3, 0, Offset + 2, 0, Offset + 1, 0, Offset);
+
+      __m512i second_idx = _mm512_set_epi32(
+          0, Offset - 1, 0, Offset - 2, 0, Offset - 3, 0, Offset - 4, 0,
+          Offset - 5, 0, Offset - 6, 0, Offset - 7, 0, Offset - 8);
+
+      unsigned char mask = 0xFF;
+      mask <<= (8 - Offset);
+
+      first = _mm512_permutexvar_pd(first_idx, first);
+      Packet8d tmp = _mm512_permutexvar_pd(second_idx, second);
+      first = _mm512_mask_blend_pd(mask, first, tmp);
+    }
+  }
+};
+
+
 } // end namespace internal

 } // end namespace Eigen
--- a/Eigen/src/Core/arch/AltiVec/Complex.h
+++ b/Eigen/src/Core/arch/AltiVec/Complex.h
@@ -15,22 +15,70 @@ namespace Eigen {

 namespace internal {

-static Packet4ui  p4ui_CONJ_XOR = vec_mergeh((Packet4ui)p4i_ZERO, (Packet4ui)p4f_ZERO_);//{ 0x00000000, 0x80000000, 0x00000000, 0x80000000 };
+inline Packet4ui  p4ui_CONJ_XOR() {
+  return vec_mergeh((Packet4ui)p4i_ZERO, (Packet4ui)p4f_MZERO);//{ 0x00000000, 0x80000000, 0x00000000, 0x80000000 };
+}
 #ifdef __VSX__
 #if defined(_BIG_ENDIAN)
-static Packet2ul  p2ul_CONJ_XOR1 = (Packet2ul) vec_sld((Packet4ui) p2d_ZERO_, (Packet4ui) p2l_ZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 };
-static Packet2ul  p2ul_CONJ_XOR2 = (Packet2ul) vec_sld((Packet4ui) p2l_ZERO,  (Packet4ui) p2d_ZERO_, 8);//{ 0x8000000000000000, 0x0000000000000000 };
+static Packet2ul  p2ul_CONJ_XOR1 = (Packet2ul) vec_sld((Packet4ui) p2d_MZERO, (Packet4ui) p2l_ZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 };
+static Packet2ul  p2ul_CONJ_XOR2 = (Packet2ul) vec_sld((Packet4ui) p2l_ZERO,  (Packet4ui) p2d_MZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 };
 #else
-static Packet2ul  p2ul_CONJ_XOR1 = (Packet2ul) vec_sld((Packet4ui) p2l_ZERO,  (Packet4ui) p2d_ZERO_, 8);//{ 0x8000000000000000, 0x0000000000000000 };
-static Packet2ul  p2ul_CONJ_XOR2 = (Packet2ul) vec_sld((Packet4ui) p2d_ZERO_, (Packet4ui) p2l_ZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 };
+static Packet2ul  p2ul_CONJ_XOR1 = (Packet2ul) vec_sld((Packet4ui) p2l_ZERO,  (Packet4ui) p2d_MZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 };
+static Packet2ul  p2ul_CONJ_XOR2 = (Packet2ul) vec_sld((Packet4ui) p2d_MZERO, (Packet4ui) p2l_ZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 };
 #endif
 #endif

 //---------- float ----------
 struct Packet2cf
 {
-  EIGEN_STRONG_INLINE explicit Packet2cf() : v(p4f_ZERO) {}
+  EIGEN_STRONG_INLINE explicit Packet2cf() {}
  EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {}
+
+  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b)
+  {
+    Packet4f v1, v2;
+
+    // Permute and multiply the real parts of a and b
+    v1 = vec_perm(a.v, a.v, p16uc_PSET32_WODD);
+    // Get the imaginary parts of a
+    v2 = vec_perm(a.v, a.v, p16uc_PSET32_WEVEN);
+    // multiply a_re * b
+    v1 = vec_madd(v1, b.v, p4f_ZERO);
+    // multiply a_im * b and get the conjugate result
+    v2 = vec_madd(v2, b.v, p4f_ZERO);
+    v2 = reinterpret_cast<Packet4f>(pxor(v2, reinterpret_cast<Packet4f>(p4ui_CONJ_XOR())));
+    // permute back to a proper order
+    v2 = vec_perm(v2, v2, p16uc_COMPLEX32_REV);
+
+    return Packet2cf(padd<Packet4f>(v1, v2));
+  }
+
+  EIGEN_STRONG_INLINE Packet2cf& operator*=(const Packet2cf& b) {
+    v = pmul(Packet2cf(*this), b).v;
+    return *this;
+  }
+  EIGEN_STRONG_INLINE Packet2cf operator*(const Packet2cf& b) const {
+    return Packet2cf(*this) *= b;
+  }
+
+  EIGEN_STRONG_INLINE Packet2cf& operator+=(const Packet2cf& b) {
+    v = padd(v, b.v);
+    return *this;
+  }
+  EIGEN_STRONG_INLINE Packet2cf operator+(const Packet2cf& b) const {
+    return Packet2cf(*this) += b;
+  }
+  EIGEN_STRONG_INLINE Packet2cf& operator-=(const Packet2cf& b) {
+    v = psub(v, b.v);
+    return *this;
+  }
+  EIGEN_STRONG_INLINE Packet2cf operator-(const Packet2cf& b) const {
+    return Packet2cf(*this) -= b;
+  }
+  EIGEN_STRONG_INLINE Packet2cf operator-(void) const {
+    return Packet2cf(-v);
+  }
+
  Packet4f  v;
 };

@@ -65,7 +113,7 @@ template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type;
 template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>&  from)
 {
  Packet2cf res;
-  if((ptrdiff_t(&from) % 16) == 0)
+  if((std::ptrdiff_t(&from) % 16) == 0)
    res.v = pload<Packet4f>((const float *)&from);
  else
    res.v = ploadu<Packet4f>((const float *)&from);
@@ -82,14 +130,14 @@ template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<f

 template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, Index stride)
 {
-  std::complex<float> EIGEN_ALIGN16 af[2];
+  EIGEN_ALIGN16 std::complex<float> af[2];
  af[0] = from[0*stride];
  af[1] = from[1*stride];
  return pload<Packet2cf>(af);
 }
 template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, Index stride)
 {
-  std::complex<float> EIGEN_ALIGN16 af[2];
+  EIGEN_ALIGN16 std::complex<float> af[2];
  pstore<std::complex<float> >((std::complex<float> *) af, from);
  to[0*stride] = af[0];
  to[1*stride] = af[1];
@@ -98,26 +146,7 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf
 template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(a.v + b.v); }
 template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(a.v - b.v); }
 template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate(a.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { return Packet2cf(pxor<Packet4f>(a.v, reinterpret_cast<Packet4f>(p4ui_CONJ_XOR))); }
-
-template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
-{
-  Packet4f v1, v2;
-
-  // Permute and multiply the real parts of a and b
-  v1 = vec_perm(a.v, a.v, p16uc_PSET32_WODD);
-  // Get the imaginary parts of a
-  v2 = vec_perm(a.v, a.v, p16uc_PSET32_WEVEN);
-  // multiply a_re * b 
-  v1 = vec_madd(v1, b.v, p4f_ZERO);
-  // multiply a_im * b and get the conjugate result
-  v2 = vec_madd(v2, b.v, p4f_ZERO);
-  v2 = reinterpret_cast<Packet4f>(pxor(v2, reinterpret_cast<Packet4f>(p4ui_CONJ_XOR)));
-  // permute back to a proper order
-  v2 = vec_perm(v2, v2, p16uc_COMPLEX32_REV);
-  
-  return Packet2cf(padd<Packet4f>(v1, v2));
-}
+template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { return Packet2cf(pxor<Packet4f>(a.v, reinterpret_cast<Packet4f>(p4ui_CONJ_XOR()))); }

 template<> EIGEN_STRONG_INLINE Packet2cf pand   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pand<Packet4f>(a.v, b.v)); }
 template<> EIGEN_STRONG_INLINE Packet2cf por    <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(por<Packet4f>(a.v, b.v)); }
@@ -128,7 +157,7 @@ template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::co

 template<> EIGEN_STRONG_INLINE std::complex<float>  pfirst<Packet2cf>(const Packet2cf& a)
 {
-  std::complex<float> EIGEN_ALIGN16 res[2];
+  EIGEN_ALIGN16 std::complex<float> res[2];
  pstore((float *)&res, a.v);

  return res[0];
@@ -152,7 +181,7 @@ template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packe
 template<> EIGEN_STRONG_INLINE Packet2cf preduxp<Packet2cf>(const Packet2cf* vecs)
 {
  Packet4f b1, b2;
-#ifdef _BIG_ENDIAN  
+#ifdef _BIG_ENDIAN
  b1 = vec_sld(vecs[0].v, vecs[1].v, 8);
  b2 = vec_sld(vecs[1].v, vecs[0].v, 8);
 #else
@@ -224,23 +253,7 @@ template<> struct conj_helper<Packet2cf, Packet2cf, true,true>
  }
 };

-template<> struct conj_helper<Packet4f, Packet2cf, false,false>
-{
-  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet4f& x, const Packet2cf& y, const Packet2cf& c) const
-  { return padd(c, pmul(x,y)); }
-
-  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet4f& x, const Packet2cf& y) const
-  { return Packet2cf(internal::pmul<Packet4f>(x, y.v)); }
-};
-
-template<> struct conj_helper<Packet2cf, Packet4f, false,false>
-{
-  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet4f& y, const Packet2cf& c) const
-  { return padd(c, pmul(x,y)); }
-
-  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& x, const Packet4f& y) const
-  { return Packet2cf(internal::pmul<Packet4f>(x.v, y)); }
-};
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f)

 template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
 {
@@ -276,6 +289,51 @@ struct Packet1cd
 {
  EIGEN_STRONG_INLINE Packet1cd() {}
  EIGEN_STRONG_INLINE explicit Packet1cd(const Packet2d& a) : v(a) {}
+
+  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b)
+  {
+    Packet2d a_re, a_im, v1, v2;
+
+    // Permute and multiply the real parts of a and b
+    a_re = vec_perm(a.v, a.v, p16uc_PSET64_HI);
+    // Get the imaginary parts of a
+    a_im = vec_perm(a.v, a.v, p16uc_PSET64_LO);
+    // multiply a_re * b
+    v1 = vec_madd(a_re, b.v, p2d_ZERO);
+    // multiply a_im * b and get the conjugate result
+    v2 = vec_madd(a_im, b.v, p2d_ZERO);
+    v2 = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(v2), reinterpret_cast<Packet4ui>(v2), 8));
+    v2 = pxor(v2, reinterpret_cast<Packet2d>(p2ul_CONJ_XOR1));
+
+    return Packet1cd(padd<Packet2d>(v1, v2));
+  }
+
+  EIGEN_STRONG_INLINE Packet1cd& operator*=(const Packet1cd& b) {
+    v = pmul(Packet1cd(*this), b).v;
+    return *this;
+  }
+  EIGEN_STRONG_INLINE Packet1cd operator*(const Packet1cd& b) const {
+    return Packet1cd(*this) *= b;
+  }
+
+  EIGEN_STRONG_INLINE Packet1cd& operator+=(const Packet1cd& b) {
+    v = padd(v, b.v);
+    return *this;
+  }
+  EIGEN_STRONG_INLINE Packet1cd operator+(const Packet1cd& b) const {
+    return Packet1cd(*this) += b;
+  }
+  EIGEN_STRONG_INLINE Packet1cd& operator-=(const Packet1cd& b) {
+    v = psub(v, b.v);
+    return *this;
+  }
+  EIGEN_STRONG_INLINE Packet1cd operator-(const Packet1cd& b) const {
+    return Packet1cd(*this) -= b;
+  }
+  EIGEN_STRONG_INLINE Packet1cd operator-(void) const {
+    return Packet1cd(-v);
+  }
+
  Packet2d v;
 };

@@ -312,19 +370,13 @@ template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<
 template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>&  from)
 { /* here we really have to use unaligned loads :( */ return ploadu<Packet1cd>(&from); }

-template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index stride)
+template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index)
 {
-  std::complex<double> EIGEN_ALIGN16 af[2];
-  af[0] = from[0*stride];
-  af[1] = from[1*stride];
-  return pload<Packet1cd>(af);
+  return pload<Packet1cd>(from);
 }
-template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from, Index stride)
+template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from, Index)
 {
-  std::complex<double> EIGEN_ALIGN16 af[2];
-  pstore<std::complex<double> >(af, from);
-  to[0*stride] = af[0];
-  to[1*stride] = af[1];
+  pstore<std::complex<double> >(to, from);
 }

 template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v + b.v); }
@@ -332,24 +384,6 @@ template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, con
 template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(Packet2d(a.v))); }
 template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { return Packet1cd(pxor(a.v, reinterpret_cast<Packet2d>(p2ul_CONJ_XOR2))); }

-template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
-{
-  Packet2d a_re, a_im, v1, v2;
-
-  // Permute and multiply the real parts of a and b
-  a_re = vec_perm(a.v, a.v, p16uc_PSET64_HI);
-  // Get the imaginary parts of a
-  a_im = vec_perm(a.v, a.v, p16uc_PSET64_LO);
-  // multiply a_re * b
-  v1 = vec_madd(a_re, b.v, p2d_ZERO);
-  // multiply a_im * b and get the conjugate result
-  v2 = vec_madd(a_im, b.v, p2d_ZERO);
-  v2 = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(v2), reinterpret_cast<Packet4ui>(v2), 8));
-  v2 = pxor(v2, reinterpret_cast<Packet2d>(p2ul_CONJ_XOR1));
-
-  return Packet1cd(padd<Packet2d>(v1, v2));
-}
-
 template<> EIGEN_STRONG_INLINE Packet1cd pand   <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(pand(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet1cd por    <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(por(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet1cd pxor   <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(pxor(a.v,b.v)); }
@@ -361,7 +395,7 @@ template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::c

 template<> EIGEN_STRONG_INLINE std::complex<double>  pfirst<Packet1cd>(const Packet1cd& a)
 {
-  std::complex<double> EIGEN_ALIGN16 res[2];
+  EIGEN_ALIGN16 std::complex<double> res[2];
  pstore<std::complex<double> >(res, a);

  return res[0];
@@ -416,23 +450,8 @@ template<> struct conj_helper<Packet1cd, Packet1cd, true,true>
    return pconj(internal::pmul(a, b));
  }
 };
-template<> struct conj_helper<Packet2d, Packet1cd, false,false>
-{
-  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet2d& x, const Packet1cd& y, const Packet1cd& c) const
-  { return padd(c, pmul(x,y)); }

-  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet2d& x, const Packet1cd& y) const
-  { return Packet1cd(internal::pmul<Packet2d>(x, y.v)); }
-};
-
-template<> struct conj_helper<Packet1cd, Packet2d, false,false>
-{
-  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet2d& y, const Packet1cd& c) const
-  { return padd(c, pmul(x,y)); }
-
-  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& x, const Packet2d& y) const
-  { return Packet1cd(internal::pmul<Packet2d>(x.v, y)); }
-};
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d)

 template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
 {
--- a/Eigen/src/Core/arch/AltiVec/MathFunctions.h
+++ b/Eigen/src/Core/arch/AltiVec/MathFunctions.h
@@ -84,8 +84,10 @@ static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0);
 static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125);
 static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6);

+#ifdef __POWER8_VECTOR__
 static Packet2l p2l_1023 = { 1023, 1023 };
 static Packet2ul p2ul_52 = { 52, 52 };
+#endif

 #endif

--- a/Eigen/src/Core/arch/AltiVec/PacketMath.h
+++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h
@@ -22,10 +22,6 @@ namespace internal {
 #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
 #endif

-#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
-#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
-#endif
-
 // NOTE Altivec has 32 registers, but Eigen only accepts a value of 8 or 16
 #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
 #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS  32
@@ -40,9 +36,8 @@ typedef __vector unsigned char  Packet16uc;

 // We don't want to write the same code all the time, but we need to reuse the constants
 // and it doesn't really work to declare them global, so we define macros instead
-
 #define _EIGEN_DECLARE_CONST_FAST_Packet4f(NAME,X) \
-  Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(vec_splat_s32(X))
+  Packet4f p4f_##NAME = {X, X, X, X}

 #define _EIGEN_DECLARE_CONST_FAST_Packet4i(NAME,X) \
  Packet4i p4i_##NAME = vec_splat_s32(X)
@@ -64,7 +59,7 @@ typedef __vector unsigned char  Packet16uc;

 #define DST_CHAN 1
 #define DST_CTRL(size, count, stride) (((size) << 24) | ((count) << 16) | (stride))
-
+#define __UNPACK_TYPE__(PACKETNAME) typename unpacket_traits<PACKETNAME>::type

 // These constants are endian-agnostic
 static _EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0); //{ 0.0, 0.0, 0.0, 0.0}
@@ -72,26 +67,20 @@ static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,}
 static _EIGEN_DECLARE_CONST_FAST_Packet4i(ONE,1); //{ 1, 1, 1, 1}
 static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS16,-16); //{ -16, -16, -16, -16}
 static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1); //{ -1, -1, -1, -1}
-static Packet4f p4f_ZERO_ = (Packet4f) vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)p4i_MINUS1); //{ 0x80000000, 0x80000000, 0x80000000, 0x80000000}
+static Packet4f p4f_MZERO = (Packet4f) vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)p4i_MINUS1); //{ 0x80000000, 0x80000000, 0x80000000, 0x80000000}
 #ifndef __VSX__
 static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0); //{ 1.0, 1.0, 1.0, 1.0}
 #endif

+static Packet4ui p4ui_SIGN = {0x80000000u, 0x80000000u, 0x80000000u, 0x80000000u};
+static Packet4ui p4ui_PREV0DOT5 = {0x3EFFFFFFu, 0x3EFFFFFFu, 0x3EFFFFFFu, 0x3EFFFFFFu};
+
 static Packet4f p4f_COUNTDOWN = { 0.0, 1.0, 2.0, 3.0 };
 static Packet4i p4i_COUNTDOWN = { 0, 1, 2, 3 };

 static Packet16uc p16uc_REVERSE32 = { 12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3 };
 static Packet16uc p16uc_DUPLICATE32_HI = { 0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7 };

-// Mask alignment
-#ifdef __PPC64__
-#define _EIGEN_MASK_ALIGNMENT	0xfffffffffffffff0
-#else
-#define _EIGEN_MASK_ALIGNMENT	0xfffffff0
-#endif
-
-#define _EIGEN_ALIGNED_PTR(x)	((ptrdiff_t)(x) & _EIGEN_MASK_ALIGNMENT)
-
 // Handle endianness properly while loading constants
 // Define global static constants:
 #ifdef _BIG_ENDIAN
@@ -103,7 +92,7 @@ static Packet16uc p16uc_PSET32_WODD   = vec_sld((Packet16uc) vec_splat((Packet4u
 static Packet16uc p16uc_PSET32_WEVEN  = vec_sld(p16uc_DUPLICATE32_HI, (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
 static Packet16uc p16uc_HALF64_0_16 = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 3), 8);      //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};
 #else
-static Packet16uc p16uc_FORWARD = p16uc_REVERSE32; 
+static Packet16uc p16uc_FORWARD = p16uc_REVERSE32;
 static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
 static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 1), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
 static Packet16uc p16uc_PSET32_WEVEN = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
@@ -235,112 +224,127 @@ inline std::ostream & operator <<(std::ostream & s, const Packet4ui & v)
  return s;
 }

+template <typename Packet, typename Scalar>
+EIGEN_STRONG_INLINE Packet pload_common(const Scalar* from)
+{
+  // some versions of GCC throw "unused-but-set-parameter".
+  // ignoring these warnings for now.
+  EIGEN_UNUSED_VARIABLE(from);
+  EIGEN_DEBUG_ALIGNED_LOAD
+  return vec_ld(0, from);
+}
+
 // Need to define them first or we get specialization after instantiation errors
 template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from)
 {
-  EIGEN_DEBUG_ALIGNED_LOAD
-#ifdef __VSX__
-  return vec_vsx_ld(0, from);
-#else
-  return vec_ld(0, from);
-#endif
+  return pload_common<Packet4f, float>(from);
 }

 template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int*     from)
 {
-  EIGEN_DEBUG_ALIGNED_LOAD
+  return pload_common<Packet4i, int>(from);
+}
+
+template <typename Packet>
+EIGEN_STRONG_INLINE void pstore_common(__UNPACK_TYPE__(Packet)* to, const Packet& from){
+  // some versions of GCC throw "unused-but-set-parameter" (float *to).
+  // ignoring these warnings for now.
+  EIGEN_UNUSED_VARIABLE(to);
+  EIGEN_DEBUG_ALIGNED_STORE
 #ifdef __VSX__
-  return vec_vsx_ld(0, from);
+  vec_xst(from, 0, to);
 #else
-  return vec_ld(0, from);
+  vec_st(from, 0, to);
 #endif
 }

 template<> EIGEN_STRONG_INLINE void pstore<float>(float*   to, const Packet4f& from)
 {
-  EIGEN_DEBUG_ALIGNED_STORE
-#ifdef __VSX__
-  vec_vsx_st(from, 0, to);
-#else
-  vec_st(from, 0, to);
-#endif
+  pstore_common<Packet4f>(to, from);
 }

 template<> EIGEN_STRONG_INLINE void pstore<int>(int*       to, const Packet4i& from)
 {
-  EIGEN_DEBUG_ALIGNED_STORE
-#ifdef __VSX__
-  vec_vsx_st(from, 0, to);
-#else
-  vec_st(from, 0, to);
-#endif
+  pstore_common<Packet4i>(to, from);
+}
+
+template<typename Packet>
+EIGEN_STRONG_INLINE Packet pset1_size4(const __UNPACK_TYPE__(Packet)& from)
+{
+  Packet v = {from, from, from, from};
+  return v;
 }

 template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float&  from) {
-  Packet4f v = {from, from, from, from};
-  return v;
+  return pset1_size4<Packet4f>(from);
 }

 template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int&    from)   {
-  Packet4i v = {from, from, from, from};
-  return v;
+  return pset1_size4<Packet4i>(from);
 }
-template<> EIGEN_STRONG_INLINE void
-pbroadcast4<Packet4f>(const float *a,
-                      Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
+
+template<typename Packet> EIGEN_STRONG_INLINE void
+pbroadcast4_common(const __UNPACK_TYPE__(Packet) *a,
+                      Packet& a0, Packet& a1, Packet& a2, Packet& a3)
 {
-  a3 = pload<Packet4f>(a);
+  a3 = pload<Packet>(a);
  a0 = vec_splat(a3, 0);
  a1 = vec_splat(a3, 1);
  a2 = vec_splat(a3, 2);
  a3 = vec_splat(a3, 3);
 }
+
+template<> EIGEN_STRONG_INLINE void
+pbroadcast4<Packet4f>(const float *a,
+                      Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
+{
+  pbroadcast4_common<Packet4f>(a, a0, a1, a2, a3);
+}
 template<> EIGEN_STRONG_INLINE void
 pbroadcast4<Packet4i>(const int *a,
                      Packet4i& a0, Packet4i& a1, Packet4i& a2, Packet4i& a3)
 {
-  a3 = pload<Packet4i>(a);
-  a0 = vec_splat(a3, 0);
-  a1 = vec_splat(a3, 1);
-  a2 = vec_splat(a3, 2);
-  a3 = vec_splat(a3, 3);
+  pbroadcast4_common<Packet4i>(a, a0, a1, a2, a3);
+}
+
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pgather_common(const __UNPACK_TYPE__(Packet)* from, Index stride)
+{
+  EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[4];
+  a[0] = from[0*stride];
+  a[1] = from[1*stride];
+  a[2] = from[2*stride];
+  a[3] = from[3*stride];
+  return pload<Packet>(a);
 }

 template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
 {
-  float EIGEN_ALIGN16 af[4];
-  af[0] = from[0*stride];
-  af[1] = from[1*stride];
-  af[2] = from[2*stride];
-  af[3] = from[3*stride];
- return pload<Packet4f>(af);
+  return pgather_common<Packet4f>(from, stride);
 }
+
 template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride)
 {
-  int EIGEN_ALIGN16 ai[4];
-  ai[0] = from[0*stride];
-  ai[1] = from[1*stride];
-  ai[2] = from[2*stride];
-  ai[3] = from[3*stride];
- return pload<Packet4i>(ai);
+  return pgather_common<Packet4i>(from, stride);
 }
+
+template<typename Packet> EIGEN_DEVICE_FUNC inline void pscatter_size4(__UNPACK_TYPE__(Packet)* to, const Packet& from, Index stride)
+{
+  EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[4];
+  pstore<__UNPACK_TYPE__(Packet)>(a, from);
+  to[0*stride] = a[0];
+  to[1*stride] = a[1];
+  to[2*stride] = a[2];
+  to[3*stride] = a[3];
+}
+
 template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
 {
-  float EIGEN_ALIGN16 af[4];
-  pstore<float>(af, from);
-  to[0*stride] = af[0];
-  to[1*stride] = af[1];
-  to[2*stride] = af[2];
-  to[3*stride] = af[3];
+  pscatter_size4<Packet4f>(to, from, stride);
 }
+
 template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride)
 {
-  int EIGEN_ALIGN16 ai[4];
-  pstore<int>((int *)ai, from);
-  to[0*stride] = ai[0];
-  to[1*stride] = ai[1];
-  to[2*stride] = ai[2];
-  to[3*stride] = ai[3];
+  pscatter_size4<Packet4i>(to, from, stride);
 }

 template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) { return pset1<Packet4f>(a) + p4f_COUNTDOWN; }
@@ -358,7 +362,7 @@ template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return p4i_
 template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
 template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }

-template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_madd(a,b, p4f_ZERO); }
+template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_madd(a,b, p4f_MZERO); }
 template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) { return a * b; }

 template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
@@ -373,7 +377,7 @@ template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const
  t   = vec_nmsub(y_0, b, p4f_ONE);
  y_1 = vec_madd(y_0, t, y_0);

-  return vec_madd(a, y_1, p4f_ZERO);
+  return vec_madd(a, y_1, p4f_MZERO);
 #else
  return vec_div(a, b);
 #endif
@@ -388,10 +392,28 @@ template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, co
 template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_madd(a,b,c); }
 template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return a*b + c; }

-template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_min(a, b); }
+template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  #ifdef __VSX__
+  Packet4f ret;
+  __asm__ ("xvcmpgesp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
+  return ret;
+  #else
+  return vec_min(a, b);
+  #endif
+}
 template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); }

-template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_max(a, b); }
+template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  #ifdef __VSX__
+  Packet4f ret;
+  __asm__ ("xvcmpgtsp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
+  return ret;
+  #else
+  return vec_max(a, b);
+  #endif
+}
 template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); }

 template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, b); }
@@ -406,66 +428,67 @@ template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const
 template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, vec_nor(b, b)); }
 template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, vec_nor(b, b)); }

-template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) { return vec_round(a); }
+template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a)
+{
+    Packet4f t = vec_add(reinterpret_cast<Packet4f>(vec_or(vec_and(reinterpret_cast<Packet4ui>(a), p4ui_SIGN), p4ui_PREV0DOT5)), a);
+    Packet4f res;
+
+#ifdef __VSX__
+    __asm__("xvrspiz %x0, %x1\n\t"
+        : "=&wa" (res)
+        : "wa" (t));
+#else
+    __asm__("vrfiz %0, %1\n\t"
+        : "=v" (res)
+        : "v" (t));
+#endif
+
+    return res;
+}
 template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const  Packet4f& a) { return vec_ceil(a); }
 template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) { return vec_floor(a); }

-#ifdef _BIG_ENDIAN
+template<typename Packet> EIGEN_STRONG_INLINE Packet ploadu_common(const __UNPACK_TYPE__(Packet)* from)
+{
+  EIGEN_DEBUG_UNALIGNED_LOAD
+  Packet16uc mask = vec_lvsl(0, from);                 // create the permute mask
+  Packet16uc MSQ = vec_ld(0, (unsigned char *)from);   // most significant quadword
+  Packet16uc LSQ = vec_ld(15, (unsigned char *)from);  // least significant quadword
+  //TODO: Add static_cast here
+  return (Packet) vec_perm(MSQ, LSQ, mask);            // align the data
+}
+
 template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
 {
-  EIGEN_DEBUG_ALIGNED_LOAD
-  Packet16uc MSQ, LSQ;
-  Packet16uc mask;
-  MSQ = vec_ld(0, (unsigned char *)from);          // most significant quadword
-  LSQ = vec_ld(15, (unsigned char *)from);         // least significant quadword
-  mask = vec_lvsl(0, from);                        // create the permute mask
-  return (Packet4f) vec_perm(MSQ, LSQ, mask);           // align the data
-
+  return ploadu_common<Packet4f>(from);
 }
 template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
 {
-  EIGEN_DEBUG_ALIGNED_LOAD
-  // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
-  Packet16uc MSQ, LSQ;
-  Packet16uc mask;
-  MSQ = vec_ld(0, (unsigned char *)from);          // most significant quadword
-  LSQ = vec_ld(15, (unsigned char *)from);         // least significant quadword
-  mask = vec_lvsl(0, from);                        // create the permute mask
-  return (Packet4i) vec_perm(MSQ, LSQ, mask);    // align the data
+  return ploadu_common<Packet4i>(from);
 }
-#else
-// We also need ot redefine little endian loading of Packet4i/Packet4f using VSX
-template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
-{
-  EIGEN_DEBUG_UNALIGNED_LOAD
-  return (Packet4i) vec_vsx_ld((long)from & 15, (const int*) _EIGEN_ALIGNED_PTR(from));
-}
-template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
-{
-  EIGEN_DEBUG_UNALIGNED_LOAD
-  return (Packet4f) vec_vsx_ld((long)from & 15, (const float*) _EIGEN_ALIGNED_PTR(from));
-}
-#endif

+template<typename Packet> EIGEN_STRONG_INLINE Packet ploaddup_common(const __UNPACK_TYPE__(Packet)*   from)
+{
+  Packet p;
+  if((std::ptrdiff_t(from) % 16) == 0)  p = pload<Packet>(from);
+  else                                  p = ploadu<Packet>(from);
+  return vec_perm(p, p, p16uc_DUPLICATE32_HI);
+}
 template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float*   from)
 {
-  Packet4f p;
-  if((ptrdiff_t(from) % 16) == 0)  p = pload<Packet4f>(from);
-  else                             p = ploadu<Packet4f>(from);
-  return vec_perm(p, p, p16uc_DUPLICATE32_HI);
+  return ploaddup_common<Packet4f>(from);
 }
 template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int*     from)
 {
-  Packet4i p;
-  if((ptrdiff_t(from) % 16) == 0)  p = pload<Packet4i>(from);
-  else                             p = ploadu<Packet4i>(from);
-  return vec_perm(p, p, p16uc_DUPLICATE32_HI);
+  return ploaddup_common<Packet4i>(from);
 }

-#ifdef _BIG_ENDIAN
-template<> EIGEN_STRONG_INLINE void pstoreu<float>(float*  to, const Packet4f& from)
+template<typename Packet> EIGEN_STRONG_INLINE void pstoreu_common(__UNPACK_TYPE__(Packet)*  to, const Packet& from)
 {
  EIGEN_DEBUG_UNALIGNED_STORE
+#ifdef __VSX__
+  vec_xst(from, 0, to);
+#else
  // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
  // Warning: not thread safe!
  Packet16uc MSQ, LSQ, edges;
@@ -479,45 +502,23 @@ template<> EIGEN_STRONG_INLINE void pstoreu<float>(float*  to, const Packet4f& f
  MSQ = vec_perm(edges,(Packet16uc)from,align);             // misalign the data (MSQ)
  LSQ = vec_perm((Packet16uc)from,edges,align);             // misalign the data (LSQ)
  vec_st( LSQ, 15, (unsigned char *)to );                   // Store the LSQ part first
-  vec_st( MSQ, 0, (unsigned char *)to );                    // Store the MSQ part
+  vec_st( MSQ, 0, (unsigned char *)to );                   // Store the MSQ part second
+#endif
+}
+template<> EIGEN_STRONG_INLINE void pstoreu<float>(float*  to, const Packet4f& from)
+{
+  pstoreu_common<Packet4f>(to, from);
 }
 template<> EIGEN_STRONG_INLINE void pstoreu<int>(int*      to, const Packet4i& from)
 {
-  EIGEN_DEBUG_UNALIGNED_STORE
-  // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
-  // Warning: not thread safe!
-  Packet16uc MSQ, LSQ, edges;
-  Packet16uc edgeAlign, align;
-
-  MSQ = vec_ld(0, (unsigned char *)to);                     // most significant quadword
-  LSQ = vec_ld(15, (unsigned char *)to);                    // least significant quadword
-  edgeAlign = vec_lvsl(0, to);                              // permute map to extract edges
-  edges=vec_perm(LSQ, MSQ, edgeAlign);                      // extract the edges
-  align = vec_lvsr( 0, to );                                // permute map to misalign data
-  MSQ = vec_perm(edges, (Packet16uc) from, align);          // misalign the data (MSQ)
-  LSQ = vec_perm((Packet16uc) from, edges, align);          // misalign the data (LSQ)
-  vec_st( LSQ, 15, (unsigned char *)to );                   // Store the LSQ part first
-  vec_st( MSQ, 0, (unsigned char *)to );                    // Store the MSQ part
+  pstoreu_common<Packet4i>(to, from);
 }
-#else
-// We also need ot redefine little endian loading of Packet4i/Packet4f using VSX
-template<> EIGEN_STRONG_INLINE void pstoreu<int>(int*       to, const Packet4i& from)
-{
-  EIGEN_DEBUG_ALIGNED_STORE
-  vec_vsx_st(from, (long)to & 15, (int*) _EIGEN_ALIGNED_PTR(to));
-}
-template<> EIGEN_STRONG_INLINE void pstoreu<float>(float*   to, const Packet4f& from)
-{
-  EIGEN_DEBUG_ALIGNED_STORE
-  vec_vsx_st(from, (long)to & 15, (float*) _EIGEN_ALIGNED_PTR(to));
-}
-#endif

 template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr)    { EIGEN_PPC_PREFETCH(addr); }
 template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*     addr)    { EIGEN_PPC_PREFETCH(addr); }

-template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x; vec_ste(a, 0, &x); return x; }
-template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { int   EIGEN_ALIGN16 x; vec_ste(a, 0, &x); return x; }
+template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { EIGEN_ALIGN16 float x; vec_ste(a, 0, &x); return x; }
+template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { EIGEN_ALIGN16 int   x; vec_ste(a, 0, &x); return x; }

 template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
 {
@@ -625,37 +626,42 @@ template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
 }

 // min
-template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
+template<typename Packet> EIGEN_STRONG_INLINE
+__UNPACK_TYPE__(Packet) predux_min4(const Packet& a)
 {
-  Packet4f b, res;
+  Packet b, res;
  b = vec_min(a, vec_sld(a, a, 8));
  res = vec_min(b, vec_sld(b, b, 4));
  return pfirst(res);
 }

+
+template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
+{
+  return predux_min4<Packet4f>(a);
+}
+
 template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
 {
-  Packet4i b, res;
-  b = vec_min(a, vec_sld(a, a, 8));
-  res = vec_min(b, vec_sld(b, b, 4));
-  return pfirst(res);
+  return predux_min4<Packet4i>(a);
 }
-
 // max
-template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
+template<typename Packet> EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_max4(const Packet& a)
 {
-  Packet4f b, res;
+  Packet b, res;
  b = vec_max(a, vec_sld(a, a, 8));
  res = vec_max(b, vec_sld(b, b, 4));
  return pfirst(res);
 }

+template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
+{
+  return predux_max4<Packet4f>(a);
+}
+
 template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
 {
-  Packet4i b, res;
-  b = vec_max(a, vec_sld(a, a, 8));
-  res = vec_max(b, vec_sld(b, b, 4));
-  return pfirst(res);
+  return predux_max4<Packet4i>(a);
 }

 template<int Offset>
@@ -712,9 +718,9 @@ struct palign_impl<Offset,Packet4i>
  }
 };

-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet4f,4>& kernel) {
-  Packet4f t0, t1, t2, t3;
+template <typename T>
+EIGEN_DEVICE_FUNC inline void ptranpose_common(PacketBlock<T, 4>& kernel) {
+  T t0, t1, t2, t3;
  t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
  t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
  t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
@@ -725,29 +731,23 @@ ptranspose(PacketBlock<Packet4f,4>& kernel) {
  kernel.packet[3] = vec_mergel(t1, t3);
 }

-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet4i,4>& kernel) {
-  Packet4i t0, t1, t2, t3;
-  t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
-  t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
-  t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
-  t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
-  kernel.packet[0] = vec_mergeh(t0, t2);
-  kernel.packet[1] = vec_mergel(t0, t2);
-  kernel.packet[2] = vec_mergeh(t1, t3);
-  kernel.packet[3] = vec_mergel(t1, t3);
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4f, 4>& kernel) { ptranpose_common<Packet4f>(kernel); }
+
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4i, 4>& kernel) { ptranpose_common<Packet4i>(kernel); }
+
+template<typename Packet> EIGEN_STRONG_INLINE
+Packet pblend4(const Selector<4>& ifPacket, const Packet& thenPacket, const Packet& elsePacket) {
+  Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] };
+  Packet4ui mask = reinterpret_cast<Packet4ui>(vec_cmpeq(reinterpret_cast<Packet4ui>(select), reinterpret_cast<Packet4ui>(p4i_ONE)));
+  return vec_sel(elsePacket, thenPacket, mask);
 }

 template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) {
-  Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] };
-  Packet4ui mask = reinterpret_cast<Packet4ui>(vec_cmpeq(reinterpret_cast<Packet4ui>(select), reinterpret_cast<Packet4ui>(p4i_ONE)));
-  return vec_sel(elsePacket, thenPacket, mask);
+  return pblend4<Packet4i>(ifPacket, thenPacket, elsePacket);
 }

 template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) {
-  Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] };
-  Packet4ui mask = reinterpret_cast<Packet4ui>(vec_cmpeq(reinterpret_cast<Packet4ui>(select), reinterpret_cast<Packet4ui>(p4i_ONE)));
-  return vec_sel(elsePacket, thenPacket, mask);
+  return pblend4<Packet4f>(ifPacket, thenPacket, elsePacket);
 }


@@ -764,9 +764,11 @@ typedef __vector __bool long         Packet2bl;

 static Packet2l  p2l_ONE  = { 1, 1 };
 static Packet2l  p2l_ZERO = reinterpret_cast<Packet2l>(p4i_ZERO);
-static Packet2d  p2d_ONE  = { 1.0, 1.0 }; 
+static Packet2d  p2d_ONE  = { 1.0, 1.0 };
 static Packet2d  p2d_ZERO = reinterpret_cast<Packet2d>(p4f_ZERO);
-static Packet2d  p2d_ZERO_ = { -0.0, -0.0 };
+static Packet2d  p2d_MZERO = { -0.0, -0.0 };
+static Packet2ul p2ul_SIGN = {0x8000000000000000ull, 0x8000000000000000ull};
+static Packet2ul p2ul_PREV0DOT5 = {0x3FDFFFFFFFFFFFFFull, 0x3FDFFFFFFFFFFFFFull};

 #ifdef _BIG_ENDIAN
 static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(p2d_ZERO), reinterpret_cast<Packet4f>(p2d_ONE), 8));
@@ -774,16 +776,9 @@ static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(vec_sld(reinterpret_c
 static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(p2d_ONE), reinterpret_cast<Packet4f>(p2d_ZERO), 8));
 #endif

-template<int index> Packet2d vec_splat_dbl(Packet2d& a);
-
-template<> EIGEN_STRONG_INLINE Packet2d vec_splat_dbl<0>(Packet2d& a)
+template<int index> Packet2d vec_splat_dbl(Packet2d& a)
 {
-  return reinterpret_cast<Packet2d>(vec_perm(a, a, p16uc_PSET64_HI));
-}
-
-template<> EIGEN_STRONG_INLINE Packet2d vec_splat_dbl<1>(Packet2d& a)
-{
-  return reinterpret_cast<Packet2d>(vec_perm(a, a, p16uc_PSET64_LO));
+  return vec_splat(a, index);
 }

 template<> struct packet_traits<double> : default_packet_traits
@@ -808,7 +803,11 @@ template<> struct packet_traits<double> : default_packet_traits
    HasLog  = 0,
    HasExp  = 1,
    HasSqrt = 1,
+#if !EIGEN_COMP_CLANG
    HasRsqrt = 1,
+#else
+    HasRsqrt = 0,
+#endif
    HasRound = 1,
    HasFloor = 1,
    HasCeil = 1,
@@ -845,21 +844,13 @@ inline std::ostream & operator <<(std::ostream & s, const Packet2d & v)
 template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from)
 {
  EIGEN_DEBUG_ALIGNED_LOAD
-#ifdef __VSX__
-  return vec_vsx_ld(0, from);
-#else
-  return vec_ld(0, from);
-#endif
+  return vec_xl(0, const_cast<double *>(from)); // cast needed by Clang
 }

 template<> EIGEN_STRONG_INLINE void pstore<double>(double*   to, const Packet2d& from)
 {
  EIGEN_DEBUG_ALIGNED_STORE
-#ifdef __VSX__
-  vec_vsx_st(from, 0, to);
-#else
-  vec_st(from, 0, to);
-#endif
+  vec_xst(from, 0, to);
 }

 template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double&  from) {
@@ -871,24 +862,23 @@ template<> EIGEN_STRONG_INLINE void
 pbroadcast4<Packet2d>(const double *a,
                      Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3)
 {
-  a1 = pload<Packet2d>(a);
-  a0 = vec_splat_dbl<0>(a1);
-  a1 = vec_splat_dbl<1>(a1);
-  a3 = pload<Packet2d>(a+2);
-  a2 = vec_splat_dbl<0>(a3);
-  a3 = vec_splat_dbl<1>(a3);
+  //This way is faster than vec_splat (at least for doubles in Power 9)
+  a0 = pset1<Packet2d>(a[0]);
+  a1 = pset1<Packet2d>(a[1]);
+  a2 = pset1<Packet2d>(a[2]);
+  a3 = pset1<Packet2d>(a[3]);
 }

 template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride)
 {
-  double EIGEN_ALIGN16 af[2];
+  EIGEN_ALIGN16 double af[2];
  af[0] = from[0*stride];
  af[1] = from[1*stride];
 return pload<Packet2d>(af);
 }
 template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
 {
-  double EIGEN_ALIGN16 af[2];
+  EIGEN_ALIGN16 double af[2];
  pstore<double>(af, from);
  to[0*stride] = af[0];
  to[1*stride] = af[1];
@@ -900,19 +890,36 @@ template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const

 template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return a - b; }

-template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return p2d_ZERO - a; }
+template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a)
+{
+#ifdef __POWER8_VECTOR__
+  return vec_neg(a);
+#else
+  return vec_xor(a, p2d_MZERO);
+#endif
+}

 template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }

-template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_madd(a,b,p2d_ZERO); }
+template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_madd(a,b,p2d_MZERO); }
 template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_div(a,b); }

 // for some weird raisons, it has to be overloaded for packet of integers
 template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_madd(a, b, c); }

-template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_min(a, b); }
+template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b)
+{
+  Packet2d ret;
+  __asm__ ("xvcmpgedp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
+  return ret;
+ }

-template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_max(a, b); }
+template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b)
+{
+  Packet2d ret;
+  __asm__ ("xvcmpgtdp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
+  return ret;
+}

 template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, b); }

@@ -922,33 +929,43 @@ template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const

 template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, vec_nor(b, b)); }

-template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) { return vec_round(a); }
+template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a)
+{
+    Packet2d t = vec_add(reinterpret_cast<Packet2d>(vec_or(vec_and(reinterpret_cast<Packet2ul>(a), p2ul_SIGN), p2ul_PREV0DOT5)), a);
+    Packet2d res;
+
+    __asm__("xvrdpiz %x0, %x1\n\t"
+        : "=&wa" (res)
+        : "wa" (t));
+
+    return res;
+}
 template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const  Packet2d& a) { return vec_ceil(a); }
 template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { return vec_floor(a); }

 template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from)
 {
-  EIGEN_DEBUG_ALIGNED_LOAD
-  return (Packet2d) vec_vsx_ld((long)from & 15, (const double*) _EIGEN_ALIGNED_PTR(from));
+  EIGEN_DEBUG_UNALIGNED_LOAD
+  return vec_xl(0, const_cast<double*>(from));
 }

 template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double*   from)
 {
  Packet2d p;
-  if((ptrdiff_t(from) % 16) == 0)  p = pload<Packet2d>(from);
-  else                             p = ploadu<Packet2d>(from);
+  if((std::ptrdiff_t(from) % 16) == 0)  p = pload<Packet2d>(from);
+  else                                  p = ploadu<Packet2d>(from);
  return vec_splat_dbl<0>(p);
 }

 template<> EIGEN_STRONG_INLINE void pstoreu<double>(double*  to, const Packet2d& from)
 {
-  EIGEN_DEBUG_ALIGNED_STORE
-  vec_vsx_st((Packet4f)from, (long)to & 15, (float*) _EIGEN_ALIGNED_PTR(to));
+  EIGEN_DEBUG_UNALIGNED_STORE
+  vec_xst(from, 0, to);
 }

 template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_PPC_PREFETCH(addr); }

-template<> EIGEN_STRONG_INLINE double  pfirst<Packet2d>(const Packet2d& a) { double EIGEN_ALIGN16 x[2]; pstore<double>(x, a); return x[0]; }
+template<> EIGEN_STRONG_INLINE double  pfirst<Packet2d>(const Packet2d& a) { EIGEN_ALIGN16 double x[2]; pstore<double>(x, a); return x[0]; }

 template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
 {
@@ -969,7 +986,7 @@ template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
  Packet2d v[2], sum;
  v[0] = vecs[0] + reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(vecs[0]), reinterpret_cast<Packet4f>(vecs[0]), 8));
  v[1] = vecs[1] + reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(vecs[1]), reinterpret_cast<Packet4f>(vecs[1]), 8));
- 
+
 #ifdef _BIG_ENDIAN
  sum = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(v[0]), reinterpret_cast<Packet4f>(v[1]), 8));
 #else
@@ -1022,7 +1039,7 @@ ptranspose(PacketBlock<Packet2d,2>& kernel) {

 template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) {
  Packet2l select = { ifPacket.select[0], ifPacket.select[1] };
-  Packet2bl mask = vec_cmpeq(reinterpret_cast<Packet2d>(select), reinterpret_cast<Packet2d>(p2l_ONE));
+  Packet2bl mask = reinterpret_cast<Packet2bl>( vec_cmpeq(reinterpret_cast<Packet2d>(select), reinterpret_cast<Packet2d>(p2l_ONE)) );
  return vec_sel(elsePacket, thenPacket, mask);
 }
 #endif // __VSX__
--- a/Eigen/src/Core/arch/CUDA/Complex.h
+++ b/Eigen/src/Core/arch/CUDA/Complex.h
@@ -16,7 +16,7 @@ namespace Eigen {

 namespace internal {

-#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
+#if defined(EIGEN_CUDACC) && defined(EIGEN_USE_GPU)

 // Many std::complex methods such as operator+, operator-, operator* and
 // operator/ are not constexpr. Due to this, clang does not treat them as device
@@ -55,7 +55,7 @@ template<typename T> struct scalar_difference_op<std::complex<T>, std::complex<T
 // Product
 template<typename T> struct scalar_product_op<const std::complex<T>, const std::complex<T> >  : binary_op_base<const std::complex<T>, const std::complex<T> > {
  enum {
-    Vectorizable = packet_traits<std::complex<T>>::HasMul
+    Vectorizable = packet_traits<std::complex<T> >::HasMul
  };
  typedef typename std::complex<T> result_type;

@@ -76,7 +76,7 @@ template<typename T> struct scalar_product_op<std::complex<T>, std::complex<T> >
 // Quotient
 template<typename T> struct scalar_quotient_op<const std::complex<T>, const std::complex<T> > : binary_op_base<const std::complex<T>, const std::complex<T> > {
  enum {
-    Vectorizable = packet_traits<std::complex<T>>::HasDiv
+    Vectorizable = packet_traits<std::complex<T> >::HasDiv
  };
  typedef typename std::complex<T> result_type;

--- a/Eigen/src/Core/arch/CUDA/Half.h
+++ b/Eigen/src/Core/arch/CUDA/Half.h
@@ -13,7 +13,7 @@
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted.
 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 // HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
@@ -29,7 +29,7 @@
 // type Eigen::half (inheriting from CUDA's __half struct) with
 // operator overloads such that it behaves basically as an arithmetic
 // type. It will be quite slow on CPUs (so it is recommended to stay
-// in fp32 for CPUs, except for simple parameter conversions, I/O
+// in float32_bits for CPUs, except for simple parameter conversions, I/O
 // to disk and the likes), but fast on GPUs.


@@ -42,6 +42,7 @@
 #define EIGEN_EXPLICIT_CAST(tgt_type) operator tgt_type()
 #endif

+#include <sstream>

 namespace Eigen {

@@ -50,38 +51,45 @@ struct half;
 namespace half_impl {

 #if !defined(EIGEN_HAS_CUDA_FP16)
-
-// Make our own __half definition that is similar to CUDA's.
-struct __half {
-  EIGEN_DEVICE_FUNC __half() {}
-  explicit EIGEN_DEVICE_FUNC __half(unsigned short raw) : x(raw) {}
+// Make our own __half_raw definition that is similar to CUDA's.
+struct __half_raw {
+  EIGEN_DEVICE_FUNC __half_raw() : x(0) {}
+  explicit EIGEN_DEVICE_FUNC __half_raw(unsigned short raw) : x(raw) {}
  unsigned short x;
 };
-
+#elif EIGEN_CUDA_SDK_VER < 90000
+// In CUDA < 9.0, __half is the equivalent of CUDA 9's __half_raw
+typedef __half __half_raw;
 #endif

-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half raw_uint16_to_half(unsigned short x);
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff);
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h);
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw raw_uint16_to_half(unsigned short x);
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff);
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h);

-struct half_base : public __half {
+struct half_base : public __half_raw {
  EIGEN_DEVICE_FUNC half_base() {}
-  EIGEN_DEVICE_FUNC half_base(const half_base& h) : __half(h) {}
-  EIGEN_DEVICE_FUNC half_base(const __half& h) : __half(h) {}
+  EIGEN_DEVICE_FUNC half_base(const half_base& h) : __half_raw(h) {}
+  EIGEN_DEVICE_FUNC half_base(const __half_raw& h) : __half_raw(h) {}
+#if defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDA_SDK_VER >= 90000
+  EIGEN_DEVICE_FUNC half_base(const __half& h) : __half_raw(*(__half_raw*)&h) {}
+#endif
 };

 } // namespace half_impl

 // Class definition.
 struct half : public half_impl::half_base {
-  #if !defined(EIGEN_HAS_CUDA_FP16)
-    typedef half_impl::__half __half;
+  #if !defined(EIGEN_HAS_CUDA_FP16) || (EIGEN_CUDA_SDK_VER < 90000)
+    typedef half_impl::__half_raw __half_raw;
  #endif

  EIGEN_DEVICE_FUNC half() {}

-  EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {}
+  EIGEN_DEVICE_FUNC half(const __half_raw& h) : half_impl::half_base(h) {}
  EIGEN_DEVICE_FUNC half(const half& h) : half_impl::half_base(h) {}
+#if defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDA_SDK_VER >= 90000
+  EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {}
+#endif

  explicit EIGEN_DEVICE_FUNC half(bool b)
      : half_impl::half_base(half_impl::raw_uint16_to_half(b ? 0x3c00 : 0)) {}
@@ -138,71 +146,125 @@ struct half : public half_impl::half_base {
  }
 };

+} // end namespace Eigen
+
+namespace std {
+template<>
+struct numeric_limits<Eigen::half> {
+  static const bool is_specialized = true;
+  static const bool is_signed = true;
+  static const bool is_integer = false;
+  static const bool is_exact = false;
+  static const bool has_infinity = true;
+  static const bool has_quiet_NaN = true;
+  static const bool has_signaling_NaN = true;
+  static const float_denorm_style has_denorm = denorm_present;
+  static const bool has_denorm_loss = false;
+  static const std::float_round_style round_style = std::round_to_nearest;
+  static const bool is_iec559 = false;
+  static const bool is_bounded = false;
+  static const bool is_modulo = false;
+  static const int digits = 11;
+  static const int digits10 = 3;      // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html
+  static const int max_digits10 = 5;  // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html
+  static const int radix = 2;
+  static const int min_exponent = -13;
+  static const int min_exponent10 = -4;
+  static const int max_exponent = 16;
+  static const int max_exponent10 = 4;
+  static const bool traps = true;
+  static const bool tinyness_before = false;
+
+  static Eigen::half (min)() { return Eigen::half_impl::raw_uint16_to_half(0x400); }
+  static Eigen::half lowest() { return Eigen::half_impl::raw_uint16_to_half(0xfbff); }
+  static Eigen::half (max)() { return Eigen::half_impl::raw_uint16_to_half(0x7bff); }
+  static Eigen::half epsilon() { return Eigen::half_impl::raw_uint16_to_half(0x0800); }
+  static Eigen::half round_error() { return Eigen::half(0.5); }
+  static Eigen::half infinity() { return Eigen::half_impl::raw_uint16_to_half(0x7c00); }
+  static Eigen::half quiet_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7e00); }
+  static Eigen::half signaling_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7e00); }
+  static Eigen::half denorm_min() { return Eigen::half_impl::raw_uint16_to_half(0x1); }
+};
+
+// If std::numeric_limits<T> is specialized, should also specialize
+// std::numeric_limits<const T>, std::numeric_limits<volatile T>, and
+// std::numeric_limits<const volatile T>
+// https://stackoverflow.com/a/16519653/
+template<>
+struct numeric_limits<const Eigen::half> : numeric_limits<Eigen::half> {};
+template<>
+struct numeric_limits<volatile Eigen::half> : numeric_limits<Eigen::half> {};
+template<>
+struct numeric_limits<const volatile Eigen::half> : numeric_limits<Eigen::half> {};
+} // end namespace std
+
+namespace Eigen {
+
 namespace half_impl {

-#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530

 // Intrinsics for native fp16 support. Note that on current hardware,
-// these are no faster than fp32 arithmetic (you need to use the half2
+// these are no faster than float32_bits arithmetic (you need to use the half2
 // versions to get the ALU speed increased), but you do save the
 // conversion steps back and forth.

-__device__ half operator + (const half& a, const half& b) {
-  return __hadd(a, b);
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator + (const half& a, const half& b) {
+  return __hadd(static_cast<__half>(a), static_cast<__half>(b));
 }
-__device__ half operator * (const half& a, const half& b) {
-  return __hmul(a, b);
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator * (const half& a, const half& b) {
+  return __hmul(static_cast<__half>(a), static_cast<__half>(b));
 }
-__device__ half operator - (const half& a, const half& b) {
-  return __hsub(a, b);
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a, const half& b) {
+  return __hsub(static_cast<__half>(a), static_cast<__half>(b));
 }
-__device__ half operator / (const half& a, const half& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, const half& b) {
  float num = __half2float(a);
  float denom = __half2float(b);
  return __float2half(num / denom);
 }
-__device__ half operator - (const half& a) {
-  return __hneg(a);
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a) {
+  return __hneg(static_cast<__half>(a));
 }
-__device__ half& operator += (half& a, const half& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator += (half& a, const half& b) {
  a = a + b;
  return a;
 }
-__device__ half& operator *= (half& a, const half& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator *= (half& a, const half& b) {
  a = a * b;
  return a;
 }
-__device__ half& operator -= (half& a, const half& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator -= (half& a, const half& b) {
  a = a - b;
  return a;
 }
-__device__ half& operator /= (half& a, const half& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator /= (half& a, const half& b) {
  a = a / b;
  return a;
 }
-__device__ bool operator == (const half& a, const half& b) {
-  return __heq(a, b);
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator == (const half& a, const half& b) {
+  return __heq(static_cast<__half>(a), static_cast<__half>(b));
 }
-__device__ bool operator != (const half& a, const half& b) {
-  return __hne(a, b);
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator != (const half& a, const half& b) {
+  return __hne(static_cast<__half>(a), static_cast<__half>(b));
 }
-__device__ bool operator < (const half& a, const half& b) {
-  return __hlt(a, b);
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator < (const half& a, const half& b) {
+  return __hlt(static_cast<__half>(a), static_cast<__half>(b));
 }
-__device__ bool operator <= (const half& a, const half& b) {
-  return __hle(a, b);
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator <= (const half& a, const half& b) {
+  return __hle(static_cast<__half>(a), static_cast<__half>(b));
 }
-__device__ bool operator > (const half& a, const half& b) {
-  return __hgt(a, b);
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator > (const half& a, const half& b) {
+  return __hgt(static_cast<__half>(a), static_cast<__half>(b));
 }
-__device__ bool operator >= (const half& a, const half& b) {
-  return __hge(a, b);
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator >= (const half& a, const half& b) {
+  return __hge(static_cast<__half>(a), static_cast<__half>(b));
 }

 #else  // Emulate support for half floats

 // Definitions for CPUs and older CUDA, mostly working through conversion
-// to/from fp32.
+// to/from float32_bits.

 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator + (const half& a, const half& b) {
  return half(float(a) + float(b));
@@ -238,10 +300,10 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator /= (half& a, const half& b)
  return a;
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator == (const half& a, const half& b) {
-  return float(a) == float(b);
+  return numext::equal_strict(float(a),float(b));
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator != (const half& a, const half& b) {
-  return float(a) != float(b);
+  return numext::not_equal_strict(float(a), float(b));
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator < (const half& a, const half& b) {
  return float(a) < float(b);
@@ -269,34 +331,35 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, Index b) {
 // these in hardware. If we need more performance on older/other CPUs, they are
 // also possible to vectorize directly.

-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half raw_uint16_to_half(unsigned short x) {
-  __half h;
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw raw_uint16_to_half(unsigned short x) {
+  __half_raw h;
  h.x = x;
  return h;
 }

-union FP32 {
+union float32_bits {
  unsigned int u;
  float f;
 };

-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) {
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
-  return __float2half(ff);
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) {
+#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
+  __half tmp_ff = __float2half(ff);
+  return *(__half_raw*)&tmp_ff;

 #elif defined(EIGEN_HAS_FP16_C)
-  __half h;
+  __half_raw h;
  h.x = _cvtss_sh(ff, 0);
  return h;

 #else
-  FP32 f; f.f = ff;
+  float32_bits f; f.f = ff;

-  const FP32 f32infty = { 255 << 23 };
-  const FP32 f16max = { (127 + 16) << 23 };
-  const FP32 denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 };
+  const float32_bits f32infty = { 255 << 23 };
+  const float32_bits f16max = { (127 + 16) << 23 };
+  const float32_bits denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 };
  unsigned int sign_mask = 0x80000000u;
-  __half o;
+  __half_raw o;
  o.x = static_cast<unsigned short>(0x0u);

  unsigned int sign = f.u & sign_mask;
@@ -335,17 +398,17 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) {
 #endif
 }

-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h) {
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h) {
+#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
  return __half2float(h);

 #elif defined(EIGEN_HAS_FP16_C)
  return _cvtsh_ss(h.x);

 #else
-  const FP32 magic = { 113 << 23 };
+  const float32_bits magic = { 113 << 23 };
  const unsigned int shifted_exp = 0x7c00 << 13; // exponent mask after shift
-  FP32 o;
+  float32_bits o;

  o.u = (h.x & 0x7fff) << 13;             // exponent/mantissa bits
  unsigned int exp = shifted_exp & o.u;   // just the exponent
@@ -370,7 +433,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isinf)(const half& a) {
  return (a.x & 0x7fff) == 0x7c00;
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isnan)(const half& a) {
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
  return __hisnan(a);
 #else
  return (a.x & 0x7fff) > 0x7c00;
@@ -386,11 +449,15 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half abs(const half& a) {
  return result;
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp(const half& a) {
-  return half(::expf(float(a)));
+#if EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530
+  return half(hexp(a));
+#else
+   return half(::expf(float(a)));
+#endif
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log(const half& a) {
-#if defined(EIGEN_HAS_CUDA_FP16) && defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
-  return Eigen::half(::hlog(a));
+#if defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDA_SDK_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
+  return half(::hlog(a));
 #else
  return half(::logf(float(a)));
 #endif
@@ -402,7 +469,11 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log10(const half& a) {
  return half(::log10f(float(a)));
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sqrt(const half& a) {
-  return half(::sqrtf(float(a)));
+#if EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530
+  return half(hsqrt(a));
+#else
+    return half(::sqrtf(float(a)));
+#endif
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half pow(const half& a, const half& b) {
  return half(::powf(float(a), float(b)));
@@ -420,14 +491,22 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tanh(const half& a) {
  return half(::tanhf(float(a)));
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) {
+#if EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300
+  return half(hfloor(a));
+#else
  return half(::floorf(float(a)));
+#endif
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) {
+#if EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300
+  return half(hceil(a));
+#else
  return half(::ceilf(float(a)));
+#endif
 }

 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) {
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
  return __hlt(b, a) ? b : a;
 #else
  const float f1 = static_cast<float>(a);
@@ -436,7 +515,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) {
 #endif
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (max)(const half& a, const half& b) {
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
  return __hlt(a, b) ? b : a;
 #else
  const float f1 = static_cast<float>(a);
@@ -462,7 +541,7 @@ struct random_default_impl<half, false, false>
 {
  static inline half run(const half& x, const half& y)
  {
-    return x + (y-x) * half(float(std::rand()) / float(RAND_MAX));
+    return x + (y-x) * half(random<float>());
  }
  static inline half run()
  {
@@ -477,6 +556,13 @@ template<> struct is_arithmetic<half> { enum { value = true }; };
 template<> struct NumTraits<Eigen::half>
    : GenericNumTraits<Eigen::half>
 {
+  enum {
+    IsSigned = true,
+    IsInteger = false,
+    IsComplex = false,
+    RequireInitialization = false
+  };
+
  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half epsilon() {
    return half_impl::raw_uint16_to_half(0x0800);
  }
@@ -507,7 +593,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half exph(const Eigen::half& a) {
  return Eigen::half(::expf(float(a)));
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half logh(const Eigen::half& a) {
-#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+#if EIGEN_CUDA_SDK_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
  return Eigen::half(::hlog(a));
 #else
  return Eigen::half(::logf(float(a)));
@@ -540,23 +626,73 @@ struct hash<Eigen::half> {
 } // end namespace std


-// Add the missing shfl_xor intrinsic
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
-__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) {
+// Add the missing shfl* intrinsics.
+// The __shfl* functions are only valid on HIP or _CUDA_ARCH_ >= 300.
+//   CUDA defines them for (__CUDA_ARCH__ >= 300 || !defined(__CUDA_ARCH__))
+//
+// HIP and CUDA prior to SDK 9.0 define
+//    __shfl, __shfl_up, __shfl_down, __shfl_xor for int and float
+// CUDA since 9.0 deprecates those and instead defines
+//    __shfl_sync, __shfl_up_sync, __shfl_down_sync, __shfl_xor_sync,
+//    with native support for __half and __nv_bfloat16
+//
+// Note that the following are __device__ - only functions.
+#if defined(EIGEN_CUDACC) && (!defined(EIGEN_CUDA_ARCH) || EIGEN_CUDA_ARCH >= 300)
+#if defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDA_SDK_VER >= 90000
+
+__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_sync(unsigned mask, Eigen::half var, int srcLane,
+                                                       int width = warpSize) {
+  const __half h = var;
+  return static_cast<Eigen::half>(__shfl_sync(mask, h, srcLane, width));
+}
+
+__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_up_sync(unsigned mask, Eigen::half var, unsigned int delta,
+                                                          int width = warpSize) {
+  const __half h = var;
+  return static_cast<Eigen::half>(__shfl_up_sync(mask, h, delta, width));
+}
+
+__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_down_sync(unsigned mask, Eigen::half var, unsigned int delta,
+                                                            int width = warpSize) {
+  const __half h = var;
+  return static_cast<Eigen::half>(__shfl_down_sync(mask, h, delta, width));
+}
+
+__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor_sync(unsigned mask, Eigen::half var, int laneMask,
+                                                           int width = warpSize) {
+  const __half h = var;
+  return static_cast<Eigen::half>(__shfl_xor_sync(mask, h, laneMask, width));
+}
+
+#else  // CUDA SDK < 9.0
+
+__device__ EIGEN_STRONG_INLINE Eigen::half __shfl(Eigen::half var, int srcLane, int width = warpSize) {
+  return static_cast<Eigen::half>(__shfl(static_cast<float>(var), srcLane, width));
+}
+
+__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_up(Eigen::half var, unsigned int delta, int width = warpSize) {
+  return static_cast<Eigen::half>(__shfl_up(static_cast<float>(var), delta, width));
+}
+
+__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_down(Eigen::half var, unsigned int delta, int width = warpSize) {
+  return static_cast<Eigen::half>(__shfl_down(static_cast<float>(var), delta, width));
+}
+
+__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width = warpSize) {
  return static_cast<Eigen::half>(__shfl_xor(static_cast<float>(var), laneMask, width));
 }
-#endif

-// ldg() has an overload for __half, but we also need one for Eigen::half.
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr) {
-  return Eigen::half_impl::raw_uint16_to_half(
-      __ldg(reinterpret_cast<const unsigned short*>(ptr)));
+#endif
+#endif  // __shfl*
+
+// ldg() has an overload for __half_raw, but we also need one for Eigen::half.
+#if defined(EIGEN_CUDACC) && (!defined(EIGEN_CUDA_ARCH) || EIGEN_CUDA_ARCH >= 350)
+EIGEN_STRONG_INLINE __device__ Eigen::half __ldg(const Eigen::half* ptr) {
+  return Eigen::half_impl::raw_uint16_to_half(__ldg(reinterpret_cast<const Eigen::numext::uint16_t*>(ptr)));
 }
-#endif
+#endif  // __ldg

-
-#if defined(__CUDA_ARCH__)
+#if defined(EIGEN_CUDA_ARCH)
 namespace Eigen {
 namespace numext {

--- a/Eigen/src/Core/arch/CUDA/MathFunctions.h
+++ b/Eigen/src/Core/arch/CUDA/MathFunctions.h
@@ -17,7 +17,7 @@ namespace internal {
 // Make sure this is only available when targeting a GPU: we don't want to
 // introduce conflicts between these packet_traits definitions and the ones
 // we'll use on the host side (SSE, AVX, ...)
-#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
+#if defined(EIGEN_CUDACC) && defined(EIGEN_USE_GPU)
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 float4 plog<float4>(const float4& a)
 {
--- a/Eigen/src/Core/arch/CUDA/PacketMath.h
+++ b/Eigen/src/Core/arch/CUDA/PacketMath.h
@@ -17,7 +17,7 @@ namespace internal {
 // Make sure this is only available when targeting a GPU: we don't want to
 // introduce conflicts between these packet_traits definitions and the ones
 // we'll use on the host side (SSE, AVX, ...)
-#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
+#if defined(EIGEN_CUDACC) && defined(EIGEN_USE_GPU)
 template<> struct is_arithmetic<float4>  { enum { value = true }; };
 template<> struct is_arithmetic<double2> { enum { value = true }; };

@@ -167,10 +167,10 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploadu<double2>(const d
  return make_double2(from[0], from[1]);
 }

-template<> EIGEN_STRONG_INLINE float4 ploaddup<float4>(const float*   from) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploaddup<float4>(const float*   from) {
  return make_float4(from[0], from[0], from[1], from[1]);
 }
-template<> EIGEN_STRONG_INLINE double2 ploaddup<double2>(const double*  from) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploaddup<double2>(const double*  from) {
  return make_double2(from[0], from[0]);
 }

@@ -197,7 +197,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<double>(double* to
 template<>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const float* from) {
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
-  return __ldg((const float4*)from);
+  return __ldg(reinterpret_cast<const float4*>(from));
 #else
  return make_float4(from[0], from[1], from[2], from[3]);
 #endif
@@ -205,7 +205,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const fl
 template<>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const double* from) {
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
-  return __ldg((const double2*)from);
+  return __ldg(reinterpret_cast<const double2*>(from));
 #else
  return make_double2(from[0], from[1]);
 #endif
@@ -291,7 +291,7 @@ template<> EIGEN_DEVICE_FUNC inline double2 pabs<double2>(const double2& a) {

 EIGEN_DEVICE_FUNC inline void
 ptranspose(PacketBlock<float4,4>& kernel) {
-  double tmp = kernel.packet[0].y;
+  float tmp = kernel.packet[0].y;
  kernel.packet[0].y = kernel.packet[1].x;
  kernel.packet[1].x = tmp;

--- a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
+++ b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
@@ -15,7 +15,7 @@ namespace Eigen {
 namespace internal {

 // Most of the following operations require arch >= 3.0
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDACC__) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300

 template<> struct is_arithmetic<half2> { enum { value = true }; };

@@ -41,42 +41,42 @@ template<> struct packet_traits<Eigen::half> : default_packet_traits

 template<> struct unpacket_traits<half2> { typedef Eigen::half type; enum {size=2, alignment=Aligned16}; typedef half2 half; };

-template<> __device__ EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) {
  return __half2half2(from);
 }

-template<> __device__ EIGEN_STRONG_INLINE half2 pload<half2>(const Eigen::half* from) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload<half2>(const Eigen::half* from) {
  return *reinterpret_cast<const half2*>(from);
 }

-template<> __device__ EIGEN_STRONG_INLINE half2 ploadu<half2>(const Eigen::half* from) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploadu<half2>(const Eigen::half* from) {
  return __halves2half2(from[0], from[1]);
 }

-template<> EIGEN_STRONG_INLINE half2 ploaddup<half2>(const Eigen::half*  from) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploaddup<half2>(const Eigen::half*  from) {
  return __halves2half2(from[0], from[0]);
 }

-template<> __device__ EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const half2& from) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const half2& from) {
  *reinterpret_cast<half2*>(to) = from;
 }

-template<> __device__ EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const half2& from) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const half2& from) {
  to[0] = __low2half(from);
  to[1] = __high2half(from);
 }

 template<>
- __device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const Eigen::half* from) {
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const Eigen::half* from) {
 #if __CUDA_ARCH__ >= 350
-   return __ldg((const half2*)from);
+   return __ldg(reinterpret_cast<const half2*>(from));
 #else
  return __halves2half2(*(from+0), *(from+1));
 #endif
 }

 template<>
-__device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const Eigen::half* from) {
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const Eigen::half* from) {
 #if __CUDA_ARCH__ >= 350
   return __halves2half2(__ldg(from+0), __ldg(from+1));
 #else
@@ -84,27 +84,28 @@ __device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const Eigen::ha
 #endif
 }

-template<> __device__ EIGEN_STRONG_INLINE half2 pgather<Eigen::half, half2>(const Eigen::half* from, Index stride) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pgather<Eigen::half, half2>(const Eigen::half* from, Index stride) {
  return __halves2half2(from[0*stride], from[1*stride]);
 }

-template<> __device__ EIGEN_STRONG_INLINE void pscatter<Eigen::half, half2>(Eigen::half* to, const half2& from, Index stride) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<Eigen::half, half2>(Eigen::half* to, const half2& from, Index stride) {
  to[stride*0] = __low2half(from);
  to[stride*1] = __high2half(from);
 }

-template<> __device__ EIGEN_STRONG_INLINE Eigen::half pfirst<half2>(const half2& a) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst<half2>(const half2& a) {
  return __low2half(a);
 }

-template<> __device__ EIGEN_STRONG_INLINE half2 pabs<half2>(const half2& a) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pabs<half2>(const half2& a) {
  half2 result;
-  result.x = a.x & 0x7FFF7FFF;
+  unsigned temp = *(reinterpret_cast<const unsigned*>(&(a)));
+  *(reinterpret_cast<unsigned*>(&(result))) = temp & 0x7FFF7FFF;
  return result;
 }


-__device__ EIGEN_STRONG_INLINE void
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
 ptranspose(PacketBlock<half2,2>& kernel) {
  __half a1 = __low2half(kernel.packet[0]);
  __half a2 = __high2half(kernel.packet[0]);
@@ -114,7 +115,7 @@ ptranspose(PacketBlock<half2,2>& kernel) {
  kernel.packet[1] = __halves2half2(a2, b2);
 }

-template<> __device__ EIGEN_STRONG_INLINE half2 plset<half2>(const Eigen::half& a) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset<half2>(const Eigen::half& a) {
 #if __CUDA_ARCH__ >= 530
  return __halves2half2(a, __hadd(a, __float2half(1.0f)));
 #else
@@ -123,7 +124,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 plset<half2>(const Eigen::half&
 #endif
 }

-template<> __device__ EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
 #if __CUDA_ARCH__ >= 530
  return __hadd2(a, b);
 #else
@@ -137,7 +138,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, cons
 #endif
 }

-template<> __device__ EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, const half2& b) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, const half2& b) {
 #if __CUDA_ARCH__ >= 530
  return __hsub2(a, b);
 #else
@@ -151,7 +152,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, cons
 #endif
 }

-template<> __device__ EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
 #if __CUDA_ARCH__ >= 530
  return __hneg2(a);
 #else
@@ -161,9 +162,9 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
 #endif
 }

-template<> __device__ EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }

-template<> __device__ EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) {
 #if __CUDA_ARCH__ >= 530
  return __hmul2(a, b);
 #else
@@ -177,7 +178,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, cons
 #endif
 }

-template<> __device__ EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, const half2& b, const half2& c) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, const half2& b, const half2& c) {
 #if __CUDA_ARCH__ >= 530
   return __hfma2(a, b, c);
 #else
@@ -193,7 +194,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, con
 #endif
 }

-template<> __device__ EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) {
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  float b1 = __low2float(b);
@@ -203,7 +204,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, cons
  return __floats2half2_rn(r1, r2);
 }

-template<> __device__ EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, const half2& b) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, const half2& b) {
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  float b1 = __low2float(b);
@@ -213,7 +214,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, cons
  return __halves2half2(r1, r2);
 }

-template<> __device__ EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, const half2& b) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, const half2& b) {
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  float b1 = __low2float(b);
@@ -223,17 +224,17 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, cons
  return __halves2half2(r1, r2);
 }

-template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux<half2>(const half2& a) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux<half2>(const half2& a) {
 #if __CUDA_ARCH__ >= 530
  return __hadd(__low2half(a), __high2half(a));
 #else
  float a1 = __low2float(a);
  float a2 = __high2float(a);
-  return Eigen::half(half_impl::raw_uint16_to_half(__float2half_rn(a1 + a2)));
+  return Eigen::half(__float2half_rn(a1 + a2));
 #endif
 }

-template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_max<half2>(const half2& a) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max<half2>(const half2& a) {
 #if __CUDA_ARCH__ >= 530
  __half first = __low2half(a);
  __half second = __high2half(a);
@@ -245,7 +246,7 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_max<half2>(const ha
 #endif
 }

-template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_min<half2>(const half2& a) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min<half2>(const half2& a) {
 #if __CUDA_ARCH__ >= 530
  __half first = __low2half(a);
  __half second = __high2half(a);
@@ -257,17 +258,17 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_min<half2>(const ha
 #endif
 }

-template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_mul<half2>(const half2& a) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_mul<half2>(const half2& a) {
 #if __CUDA_ARCH__ >= 530
  return __hmul(__low2half(a), __high2half(a));
 #else
  float a1 = __low2float(a);
  float a2 = __high2float(a);
-  return Eigen::half(half_impl::raw_uint16_to_half(__float2half_rn(a1 * a2)));
+  return Eigen::half(__float2half_rn(a1 * a2));
 #endif
 }

-template<> __device__ EIGEN_STRONG_INLINE half2 plog1p<half2>(const half2& a) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog1p<half2>(const half2& a) {
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  float r1 = log1pf(a1);
@@ -275,31 +276,31 @@ template<> __device__ EIGEN_STRONG_INLINE half2 plog1p<half2>(const half2& a) {
  return __floats2half2_rn(r1, r2);
 }

-#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 530
+#if EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530

-template<>  __device__ EIGEN_STRONG_INLINE
+template<>  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 half2 plog<half2>(const half2& a) {
  return h2log(a);
 }

-template<> __device__ EIGEN_STRONG_INLINE
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 half2 pexp<half2>(const half2& a) {
  return h2exp(a);
 }

-template<> __device__ EIGEN_STRONG_INLINE
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 half2 psqrt<half2>(const half2& a) {
  return h2sqrt(a);
 }

-template<> __device__ EIGEN_STRONG_INLINE
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 half2 prsqrt<half2>(const half2& a) {
  return h2rsqrt(a);
 }

 #else

-template<> __device__ EIGEN_STRONG_INLINE half2 plog<half2>(const half2& a) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog<half2>(const half2& a) {
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  float r1 = logf(a1);
@@ -307,7 +308,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 plog<half2>(const half2& a) {
  return __floats2half2_rn(r1, r2);
 }

-template<> __device__ EIGEN_STRONG_INLINE half2 pexp<half2>(const half2& a) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexp<half2>(const half2& a) {
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  float r1 = expf(a1);
@@ -315,7 +316,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pexp<half2>(const half2& a) {
  return __floats2half2_rn(r1, r2);
 }

-template<> __device__ EIGEN_STRONG_INLINE half2 psqrt<half2>(const half2& a) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psqrt<half2>(const half2& a) {
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  float r1 = sqrtf(a1);
@@ -323,7 +324,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 psqrt<half2>(const half2& a) {
  return __floats2half2_rn(r1, r2);
 }

-template<> __device__ EIGEN_STRONG_INLINE half2 prsqrt<half2>(const half2& a) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 prsqrt<half2>(const half2& a) {
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  float r1 = rsqrtf(a1);
--- a/Eigen/src/Core/arch/Default/ConjHelper.h
+++ b/Eigen/src/Core/arch/Default/ConjHelper.h
@@ -0,0 +1,29 @@
+
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2017 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_ARCH_CONJ_HELPER_H
+#define EIGEN_ARCH_CONJ_HELPER_H
+
+#define EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(PACKET_CPLX, PACKET_REAL)                                                          \
+  template<> struct conj_helper<PACKET_REAL, PACKET_CPLX, false,false> {                                          \
+    EIGEN_STRONG_INLINE PACKET_CPLX pmadd(const PACKET_REAL& x, const PACKET_CPLX& y, const PACKET_CPLX& c) const \
+    { return padd(c, pmul(x,y)); }                                                                                \
+    EIGEN_STRONG_INLINE PACKET_CPLX pmul(const PACKET_REAL& x, const PACKET_CPLX& y) const                        \
+    { return PACKET_CPLX(Eigen::internal::pmul<PACKET_REAL>(x, y.v)); }                                           \
+  };                                                                                                              \
+                                                                                                                  \
+  template<> struct conj_helper<PACKET_CPLX, PACKET_REAL, false,false> {                                          \
+    EIGEN_STRONG_INLINE PACKET_CPLX pmadd(const PACKET_CPLX& x, const PACKET_REAL& y, const PACKET_CPLX& c) const \
+    { return padd(c, pmul(x,y)); }                                                                                \
+    EIGEN_STRONG_INLINE PACKET_CPLX pmul(const PACKET_CPLX& x, const PACKET_REAL& y) const                        \
+    { return PACKET_CPLX(Eigen::internal::pmul<PACKET_REAL>(x.v, y)); }                                           \
+  };
+
+#endif // EIGEN_ARCH_CONJ_HELPER_H
--- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
+++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
--- a/Eigen/src/Core/arch/NEON/Complex.h
+++ b/Eigen/src/Core/arch/NEON/Complex.h
@@ -67,7 +67,7 @@ template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type;
 template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>&  from)
 {
  float32x2_t r64;
-  r64 = vld1_f32((float *)&from);
+  r64 = vld1_f32((const float *)&from);

  return Packet2cf(vcombine_f32(r64, r64));
 }
@@ -142,7 +142,7 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf
  to[stride*1] = std::complex<float>(vgetq_lane_f32(from.v, 2), vgetq_lane_f32(from.v, 3));
 }

-template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> *   addr) { EIGEN_ARM_PREFETCH((float *)addr); }
+template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> *   addr) { EIGEN_ARM_PREFETCH((const float *)addr); }

 template<> EIGEN_STRONG_INLINE std::complex<float>  pfirst<Packet2cf>(const Packet2cf& a)
 {
@@ -265,6 +265,8 @@ template<> struct conj_helper<Packet2cf, Packet2cf, true,true>
  }
 };

+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f)
+
 template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
 {
  // TODO optimize it for NEON
@@ -275,7 +277,7 @@ template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, con
  s = vmulq_f32(b.v, b.v);
  rev_s = vrev64q_f32(s);

-  return Packet2cf(pdiv(res.v, vaddq_f32(s,rev_s)));
+  return Packet2cf(pdiv<Packet4f>(res.v, vaddq_f32(s,rev_s)));
 }

 EIGEN_DEVICE_FUNC inline void
@@ -381,7 +383,7 @@ template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<
 template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); }
 template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); }

-template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> *   addr) { EIGEN_ARM_PREFETCH((double *)addr); }
+template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> *   addr) { EIGEN_ARM_PREFETCH((const double *)addr); }

 template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index stride)
 {
@@ -456,6 +458,8 @@ template<> struct conj_helper<Packet1cd, Packet1cd, true,true>
  }
 };

+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d)
+
 template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
 {
  // TODO optimize it for NEON
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -28,11 +28,42 @@ namespace internal {
 #define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
 #endif

-// FIXME NEON has 16 quad registers, but since the current register allocator
-// is so bad, it is much better to reduce it to 8
 #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
-#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16 
+#if EIGEN_ARCH_ARM64
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
+#else
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16
 #endif
+#endif
+
+#if EIGEN_COMP_MSVC
+
+// In MSVC's arm_neon.h header file, all NEON vector types
+// are aliases to the same underlying type __n128.
+// We thus have to wrap them to make them different C++ types.
+// (See also bug 1428)
+
+template<typename T,int unique_id>
+struct eigen_packet_wrapper
+{
+  operator T&() { return m_val; }
+  operator const T&() const { return m_val; }
+  eigen_packet_wrapper() {}
+  eigen_packet_wrapper(const T &v) : m_val(v) {}
+  eigen_packet_wrapper& operator=(const T &v) {
+    m_val = v;
+    return *this;
+  }
+
+  T m_val;
+};
+typedef eigen_packet_wrapper<float32x2_t,0> Packet2f;
+typedef eigen_packet_wrapper<float32x4_t,1> Packet4f;
+typedef eigen_packet_wrapper<int32x4_t  ,2> Packet4i;
+typedef eigen_packet_wrapper<int32x2_t  ,3> Packet2i;
+typedef eigen_packet_wrapper<uint32x4_t ,4> Packet4ui;
+
+#else

 typedef float32x2_t Packet2f;
 typedef float32x4_t Packet4f;
@@ -40,23 +71,28 @@ typedef int32x4_t   Packet4i;
 typedef int32x2_t   Packet2i;
 typedef uint32x4_t  Packet4ui;

+#endif // EIGEN_COMP_MSVC
+
 #define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
  const Packet4f p4f_##NAME = pset1<Packet4f>(X)

 #define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
-  const Packet4f p4f_##NAME = vreinterpretq_f32_u32(pset1<int>(X))
+  const Packet4f p4f_##NAME = vreinterpretq_f32_u32(pset1<int32_t>(X))

 #define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
  const Packet4i p4i_##NAME = pset1<Packet4i>(X)

-// arm64 does have the pld instruction. If available, let's trust the __builtin_prefetch built-in function
-// which available on LLVM and GCC (at least)
-#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
+#if EIGEN_ARCH_ARM64
+  // __builtin_prefetch tends to do nothing on ARM64 compilers because the
+  // prefetch instructions there are too detailed for __builtin_prefetch to map
+  // meaningfully to them.
+  #define EIGEN_ARM_PREFETCH(ADDR)  __asm__ __volatile__("prfm pldl1keep, [%[addr]]\n" ::[addr] "r"(ADDR) : );
+#elif EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
  #define EIGEN_ARM_PREFETCH(ADDR) __builtin_prefetch(ADDR);
 #elif defined __pld
  #define EIGEN_ARM_PREFETCH(ADDR) __pld(ADDR)
-#elif !EIGEN_ARCH_ARM64
-  #define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__ ( "   pld [%[addr]]\n" :: [addr] "r" (ADDR) : "cc" );
+#elif EIGEN_ARCH_ARM32
+  #define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__ ("pld [%[addr]]\n" :: [addr] "r" (ADDR) : );
 #else
  // by default no explicit prefetching
  #define EIGEN_ARM_PREFETCH(ADDR)
@@ -71,7 +107,7 @@ template<> struct packet_traits<float>  : default_packet_traits
    AlignedOnScalar = 1,
    size = 4,
    HasHalfPacket=0, // Packet2f intrinsics not implemented yet
-   
+
    HasDiv  = 1,
    // FIXME check the Has*
    HasSin  = 0,
@@ -81,7 +117,7 @@ template<> struct packet_traits<float>  : default_packet_traits
    HasSqrt = 0
  };
 };
-template<> struct packet_traits<int>    : default_packet_traits
+template<> struct packet_traits<int32_t>    : default_packet_traits
 {
  typedef Packet4i type;
  typedef Packet4i half; // Packet2i intrinsics not implemented yet
@@ -103,19 +139,19 @@ EIGEN_STRONG_INLINE void        vst1q_f32(float* to, float32x4_t from) { ::vst1q
 EIGEN_STRONG_INLINE void        vst1_f32 (float* to, float32x2_t from) { ::vst1_f32 ((float32_t*)to,from); }
 #endif

-template<> struct unpacket_traits<Packet4f> { typedef float  type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; };
-template<> struct unpacket_traits<Packet4i> { typedef int    type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; };
+template<> struct unpacket_traits<Packet4f> { typedef float   type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; };
+template<> struct unpacket_traits<Packet4i> { typedef int32_t type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; };

 template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float&  from) { return vdupq_n_f32(from); }
-template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int&    from)   { return vdupq_n_s32(from); }
+template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int32_t&    from)   { return vdupq_n_s32(from); }

 template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a)
 {
-  const float32_t f[] = {0, 1, 2, 3};
+  const float f[] = {0, 1, 2, 3};
  Packet4f countdown = vld1q_f32(f);
  return vaddq_f32(pset1<Packet4f>(a), countdown);
 }
-template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a)
+template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int32_t& a)
 {
  const int32_t i[] = {0, 1, 2, 3};
  Packet4i countdown = vld1q_s32(i);
@@ -137,32 +173,48 @@ template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
 template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return vmulq_f32(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) { return vmulq_s32(a,b); }

-template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
-#if EIGEN_ARCH_ARM64
-  return vdivq_f32(a,b);
-#else
-  Packet4f inv, restep, div;
-
-  // NEON does not offer a divide instruction, we have to do a reciprocal approximation
-  // However NEON in contrast to other SIMD engines (AltiVec/SSE), offers
-  // a reciprocal estimate AND a reciprocal step -which saves a few instructions
-  // vrecpeq_f32() returns an estimate to 1/b, which we will finetune with
-  // Newton-Raphson and vrecpsq_f32()
-  inv = vrecpeq_f32(b);
-
-  // This returns a differential, by which we will have to multiply inv to get a better
-  // approximation of 1/b.
-  restep = vrecpsq_f32(b, inv);
-  inv = vmulq_f32(restep, inv);
-
-  // Finally, multiply a by 1/b and get the wanted result of the division.
-  div = vmulq_f32(a, inv);
-
-  return div;
-#endif
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) {
+  return vbslq_f32(vreinterpretq_u32_f32(mask), a, b);
 }

+EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) {
+  return vreinterpretq_f32_u32(vcleq_f32(a, b));
+}
+
+EIGEN_STRONG_INLINE Packet4f preciprocal(const Packet4f& a)
+{
+  // Compute approximate reciprocal.
+  float32x4_t result = vrecpeq_f32(a);
+  result = vmulq_f32(vrecpsq_f32(a, result), result);
+  result = vmulq_f32(vrecpsq_f32(a, result), result);
+  return result;
+}
+
+#if EIGEN_ARCH_ARM64
+template<> EIGEN_STRONG_INLINE Packet4f pdiv(const Packet4f& a, const Packet4f& b) { return vdivq_f32(a, b); }
+template<> EIGEN_STRONG_INLINE Packet2f pdiv(const Packet2f& a, const Packet2f& b) { return vdiv_f32(a, b); }
+#else
+template<typename Packet>
+EIGEN_STRONG_INLINE Packet pdiv_float_common(const Packet& a, const Packet& b) {
+  // if b is large, NEON intrinsics will flush preciprocal(b) to zero
+  // avoid underflow with the following manipulation:
+  // a / b = f * (a * reciprocal(f * b))
+  const Packet cst_one = pset1<Packet>(1.0f);
+  const Packet cst_quarter = pset1<Packet>(0.25f);
+  const Packet cst_thresh = pset1<Packet>(NumTraits<float>::highest() / 4.0f);
+
+  Packet b_will_underflow = pcmp_le(cst_thresh, pabs(b));
+  Packet f = pselect(b_will_underflow, cst_quarter, cst_one);
+  Packet result = pmul(f, pmul(a, preciprocal(pmul(b, f))));
+  return result;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return pdiv_float_common(a, b);
+}
+
+#endif
+
 template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/)
 { eigen_assert(false && "packet integer division are not supported by NEON");
  return pset1<Packet4i>(0);
@@ -172,7 +224,7 @@ template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, co
 // then implements a slow software scalar fallback calling fmaf()!
 // Filed LLVM bug:
 //     https://llvm.org/bugs/show_bug.cgi?id=27216
-#if (defined __ARM_FEATURE_FMA) && !(EIGEN_COMP_CLANG && EIGEN_ARCH_ARM)
+#if (defined EIGEN_VECTORIZE_FMA) && !(EIGEN_COMP_CLANG && EIGEN_ARCH_ARM)
 // See bug 936.
 // FMA is available on VFPv4 i.e. when compiling with -mfpu=neon-vfpv4.
 // FMA is a true fused multiply-add i.e. only 1 rounding at the end, no intermediate rounding.
@@ -238,20 +290,20 @@ template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, con
 }
 template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vbicq_s32(a,b); }

-template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from); }
-template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int*   from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from); }
+template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float*    from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from); }
+template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int32_t*  from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from); }

-template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f32(from); }
-template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)   { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s32(from); }
+template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float*   from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f32(from); }
+template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int32_t* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s32(from); }

-template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float*   from)
+template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
 {
  float32x2_t lo, hi;
  lo = vld1_dup_f32(from);
  hi = vld1_dup_f32(from+1);
  return vcombine_f32(lo, hi);
 }
-template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int*     from)
+template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int32_t* from)
 {
  int32x2_t lo, hi;
  lo = vld1_dup_s32(from);
@@ -259,11 +311,11 @@ template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int*     from)
  return vcombine_s32(lo, hi);
 }

-template<> EIGEN_STRONG_INLINE void pstore<float>(float*   to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(to, from); }
-template<> EIGEN_STRONG_INLINE void pstore<int>(int*       to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(to, from); }
+template<> EIGEN_STRONG_INLINE void pstore<float>  (float*    to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(to, from); }
+template<> EIGEN_STRONG_INLINE void pstore<int32_t>(int32_t*  to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(to, from); }

-template<> EIGEN_STRONG_INLINE void pstoreu<float>(float*  to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to, from); }
-template<> EIGEN_STRONG_INLINE void pstoreu<int>(int*      to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to, from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<float>  (float*   to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to, from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<int32_t>(int32_t* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to, from); }

 template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
 {
@@ -274,7 +326,7 @@ template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const floa
  res = vsetq_lane_f32(from[3*stride], res, 3);
  return res;
 }
-template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride)
+template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int32_t, Packet4i>(const int32_t* from, Index stride)
 {
  Packet4i res = pset1<Packet4i>(0);
  res = vsetq_lane_s32(from[0*stride], res, 0);
@@ -291,7 +343,7 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, co
  to[stride*2] = vgetq_lane_f32(from, 2);
  to[stride*3] = vgetq_lane_f32(from, 3);
 }
-template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride)
+template<> EIGEN_DEVICE_FUNC inline void pscatter<int32_t, Packet4i>(int32_t* to, const Packet4i& from, Index stride)
 {
  to[stride*0] = vgetq_lane_s32(from, 0);
  to[stride*1] = vgetq_lane_s32(from, 1);
@@ -299,12 +351,12 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const
  to[stride*3] = vgetq_lane_s32(from, 3);
 }

-template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { EIGEN_ARM_PREFETCH(addr); }
-template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*     addr) { EIGEN_ARM_PREFETCH(addr); }
+template<> EIGEN_STRONG_INLINE void prefetch<float>  (const float*    addr) { EIGEN_ARM_PREFETCH(addr); }
+template<> EIGEN_STRONG_INLINE void prefetch<int32_t>(const int32_t*  addr) { EIGEN_ARM_PREFETCH(addr); }

 // FIXME only store the 2 first elements ?
-template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; vst1q_f32(x, a); return x[0]; }
-template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { int   EIGEN_ALIGN16 x[4]; vst1q_s32(x, a); return x[0]; }
+template<> EIGEN_STRONG_INLINE float   pfirst<Packet4f>(const Packet4f& a) { float   EIGEN_ALIGN16 x[4]; vst1q_f32(x, a); return x[0]; }
+template<> EIGEN_STRONG_INLINE int32_t pfirst<Packet4i>(const Packet4i& a) { int32_t EIGEN_ALIGN16 x[4]; vst1q_s32(x, a); return x[0]; }

 template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) {
  float32x2_t a_lo, a_hi;
@@ -359,7 +411,7 @@ template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
  return sum;
 }

-template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
+template<> EIGEN_STRONG_INLINE int32_t predux<Packet4i>(const Packet4i& a)
 {
  int32x2_t a_lo, a_hi, sum;

@@ -406,7 +458,7 @@ template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)

  return vget_lane_f32(prod, 0);
 }
-template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
+template<> EIGEN_STRONG_INLINE int32_t predux_mul<Packet4i>(const Packet4i& a)
 {
  int32x2_t a_lo, a_hi, prod;

@@ -434,7 +486,7 @@ template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
  return vget_lane_f32(min, 0);
 }

-template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
+template<> EIGEN_STRONG_INLINE int32_t predux_min<Packet4i>(const Packet4i& a)
 {
  int32x2_t a_lo, a_hi, min;

@@ -442,7 +494,7 @@ template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
  a_hi = vget_high_s32(a);
  min = vpmin_s32(a_lo, a_hi);
  min = vpmin_s32(min, min);
-  
+
  return vget_lane_s32(min, 0);
 }

@@ -459,7 +511,7 @@ template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
  return vget_lane_f32(max, 0);
 }

-template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
+template<> EIGEN_STRONG_INLINE int32_t predux_max<Packet4i>(const Packet4i& a)
 {
  int32x2_t a_lo, a_hi, max;

@@ -559,7 +611,7 @@ template<> struct packet_traits<double>  : default_packet_traits
    AlignedOnScalar = 1,
    size = 2,
    HasHalfPacket=0,
-   
+
    HasDiv  = 1,
    // FIXME check the Has*
    HasSin  = 0,
@@ -592,7 +644,7 @@ template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const

 template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return vdivq_f64(a,b); }

-#ifdef __ARM_FEATURE_FMA
+#ifdef EIGEN_VECTORIZE_FMA
 // See bug 936. See above comment about FMA for float.
 template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vfmaq_f64(c,a,b); }
 #else
@@ -715,7 +767,7 @@ ptranspose(PacketBlock<Packet2d,2>& kernel) {
  kernel.packet[0] = trn1;
  kernel.packet[1] = trn2;
 }
-#endif // EIGEN_ARCH_ARM64 
+#endif // EIGEN_ARCH_ARM64

 } // end namespace internal

--- a/Eigen/src/Core/arch/SSE/Complex.h
+++ b/Eigen/src/Core/arch/SSE/Complex.h
@@ -128,7 +128,7 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf
                                     _mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 3)));
 }

-template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> *   addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> *   addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }

 template<> EIGEN_STRONG_INLINE std::complex<float>  pfirst<Packet2cf>(const Packet2cf& a)
 {
@@ -229,23 +229,7 @@ template<> struct conj_helper<Packet2cf, Packet2cf, true,true>
  }
 };

-template<> struct conj_helper<Packet4f, Packet2cf, false,false>
-{
-  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet4f& x, const Packet2cf& y, const Packet2cf& c) const
-  { return padd(c, pmul(x,y)); }
-
-  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet4f& x, const Packet2cf& y) const
-  { return Packet2cf(Eigen::internal::pmul<Packet4f>(x, y.v)); }
-};
-
-template<> struct conj_helper<Packet2cf, Packet4f, false,false>
-{
-  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet4f& y, const Packet2cf& c) const
-  { return padd(c, pmul(x,y)); }
-
-  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& x, const Packet4f& y) const
-  { return Packet2cf(Eigen::internal::pmul<Packet4f>(x.v, y)); }
-};
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f)

 template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
 {
@@ -340,7 +324,7 @@ template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<
 template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, Packet2d(from.v)); }
 template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, Packet2d(from.v)); }

-template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> *   addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> *   addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }

 template<> EIGEN_STRONG_INLINE std::complex<double>  pfirst<Packet1cd>(const Packet1cd& a)
 {
@@ -430,23 +414,7 @@ template<> struct conj_helper<Packet1cd, Packet1cd, true,true>
  }
 };

-template<> struct conj_helper<Packet2d, Packet1cd, false,false>
-{
-  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet2d& x, const Packet1cd& y, const Packet1cd& c) const
-  { return padd(c, pmul(x,y)); }
-
-  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet2d& x, const Packet1cd& y) const
-  { return Packet1cd(Eigen::internal::pmul<Packet2d>(x, y.v)); }
-};
-
-template<> struct conj_helper<Packet1cd, Packet2d, false,false>
-{
-  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet2d& y, const Packet1cd& c) const
-  { return padd(c, pmul(x,y)); }
-
-  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& x, const Packet2d& y) const
-  { return Packet1cd(Eigen::internal::pmul<Packet2d>(x.v, y)); }
-};
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d)

 template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
 {
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -28,7 +28,7 @@ namespace internal {
 #endif
 #endif

-#if (defined EIGEN_VECTORIZE_AVX) && EIGEN_COMP_GNUC_STRICT && (__GXX_ABI_VERSION < 1004)
+#if ((defined EIGEN_VECTORIZE_AVX) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_MINGW) && (__GXX_ABI_VERSION < 1004)) || EIGEN_OS_QNX
 // With GCC's default ABI version, a __m128 or __m256 are the same types and therefore we cannot
 // have overloads for both types without linking error.
 // One solution is to increase ABI version using -fabi-version=4 (or greater).
@@ -409,10 +409,16 @@ template<> EIGEN_STRONG_INLINE void pstore1<Packet2d>(double* to, const double&
  pstore(to, Packet2d(vec2d_swizzle1(pa,0,0)));
 }

+#if EIGEN_COMP_PGI
+typedef const void * SsePrefetchPtrType;
+#else
+typedef const char * SsePrefetchPtrType;
+#endif
+
 #ifndef EIGEN_VECTORIZE_AVX
-template<> EIGEN_STRONG_INLINE void prefetch<float>(const float*   addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
-template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
-template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*       addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<float>(const float*   addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*       addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
 #endif

 #if EIGEN_COMP_MSVC_STRICT && EIGEN_OS_WIN64
@@ -504,30 +510,13 @@ template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
 {
  return _mm_hadd_ps(_mm_hadd_ps(vecs[0], vecs[1]),_mm_hadd_ps(vecs[2], vecs[3]));
 }
+
 template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
 {
  return _mm_hadd_pd(vecs[0], vecs[1]);
 }

-template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
-{
-  Packet4f tmp0 = _mm_hadd_ps(a,a);
-  return pfirst<Packet4f>(_mm_hadd_ps(tmp0, tmp0));
-}
-
-template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return pfirst<Packet2d>(_mm_hadd_pd(a, a)); }
 #else
-// SSE2 versions
-template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
-{
-  Packet4f tmp = _mm_add_ps(a, _mm_movehl_ps(a,a));
-  return pfirst<Packet4f>(_mm_add_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
-}
-template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
-{
-  return pfirst<Packet2d>(_mm_add_sd(a, _mm_unpackhi_pd(a,a)));
-}
-
 template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
 {
  Packet4f tmp0, tmp1, tmp2;
@@ -548,6 +537,29 @@ template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
 }
 #endif  // SSE3

+template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
+{
+  // Disable SSE3 _mm_hadd_pd that is extremely slow on all existing Intel's architectures
+  // (from Nehalem to Haswell)
+// #ifdef EIGEN_VECTORIZE_SSE3
+//   Packet4f tmp = _mm_add_ps(a, vec4f_swizzle1(a,2,3,2,3));
+//   return pfirst<Packet4f>(_mm_hadd_ps(tmp, tmp));
+// #else
+  Packet4f tmp = _mm_add_ps(a, _mm_movehl_ps(a,a));
+  return pfirst<Packet4f>(_mm_add_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
+// #endif
+}
+
+template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
+{
+  // Disable SSE3 _mm_hadd_pd that is extremely slow on all existing Intel's architectures
+  // (from Nehalem to Haswell)
+// #ifdef EIGEN_VECTORIZE_SSE3
+//   return pfirst<Packet2d>(_mm_hadd_pd(a, a));
+// #else
+  return pfirst<Packet2d>(_mm_add_sd(a, _mm_unpackhi_pd(a,a)));
+// #endif
+}

 #ifdef EIGEN_VECTORIZE_SSSE3
 template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
@@ -870,4 +882,14 @@ template<> EIGEN_STRONG_INLINE double pmadd(const double& a, const double& b, co

 } // end namespace Eigen

+#if EIGEN_COMP_PGI
+// PGI++ does not define the following intrinsics in C++ mode.
+static inline __m128  _mm_castpd_ps   (__m128d x) { return reinterpret_cast<__m128&>(x);  }
+static inline __m128i _mm_castpd_si128(__m128d x) { return reinterpret_cast<__m128i&>(x); }
+static inline __m128d _mm_castps_pd   (__m128  x) { return reinterpret_cast<__m128d&>(x); }
+static inline __m128i _mm_castps_si128(__m128  x) { return reinterpret_cast<__m128i&>(x); }
+static inline __m128  _mm_castsi128_ps(__m128i x) { return reinterpret_cast<__m128&>(x);  }
+static inline __m128d _mm_castsi128_pd(__m128i x) { return reinterpret_cast<__m128d&>(x); }
+#endif
+
 #endif // EIGEN_PACKET_MATH_SSE_H
--- a/Eigen/src/Core/arch/SSE/TypeCasting.h
+++ b/Eigen/src/Core/arch/SSE/TypeCasting.h
@@ -14,6 +14,7 @@ namespace Eigen {

 namespace internal {

+#ifndef EIGEN_VECTORIZE_AVX
 template <>
 struct type_casting_traits<float, int> {
  enum {
@@ -23,11 +24,6 @@ struct type_casting_traits<float, int> {
  };
 };

-template<> EIGEN_STRONG_INLINE Packet4i pcast<Packet4f, Packet4i>(const Packet4f& a) {
-  return _mm_cvttps_epi32(a);
-}
-
-
 template <>
 struct type_casting_traits<int, float> {
  enum {
@@ -37,11 +33,6 @@ struct type_casting_traits<int, float> {
  };
 };

-template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4i, Packet4f>(const Packet4i& a) {
-  return _mm_cvtepi32_ps(a);
-}
-
-
 template <>
 struct type_casting_traits<double, float> {
  enum {
@@ -51,10 +42,6 @@ struct type_casting_traits<double, float> {
  };
 };

-template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet2d, Packet4f>(const Packet2d& a, const Packet2d& b) {
-  return _mm_shuffle_ps(_mm_cvtpd_ps(a), _mm_cvtpd_ps(b), (1 << 2) | (1 << 6));
-}
-
 template <>
 struct type_casting_traits<float, double> {
  enum {
@@ -63,6 +50,19 @@ struct type_casting_traits<float, double> {
    TgtCoeffRatio = 2
  };
 };
+#endif
+
+template<> EIGEN_STRONG_INLINE Packet4i pcast<Packet4f, Packet4i>(const Packet4f& a) {
+  return _mm_cvttps_epi32(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4i, Packet4f>(const Packet4i& a) {
+  return _mm_cvtepi32_ps(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet2d, Packet4f>(const Packet2d& a, const Packet2d& b) {
+  return _mm_shuffle_ps(_mm_cvtpd_ps(a), _mm_cvtpd_ps(b), (1 << 2) | (1 << 6));
+}

 template<> EIGEN_STRONG_INLINE Packet2d pcast<Packet4f, Packet2d>(const Packet4f& a) {
  // Simply discard the second half of the input
--- a/Eigen/src/Core/arch/ZVector/Complex.h
+++ b/Eigen/src/Core/arch/ZVector/Complex.h
@@ -2,6 +2,7 @@
 // for linear algebra.
 //
 // Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2016 Konstantinos Margaritis <markos@freevec.org>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -24,13 +25,48 @@ struct Packet1cd
  Packet2d v;
 };

+struct Packet2cf
+{
+  EIGEN_STRONG_INLINE Packet2cf() {}
+  EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {}
+  union {
+    Packet4f v;
+    Packet1cd cd[2];
+  };
+};
+
+template<> struct packet_traits<std::complex<float> >  : default_packet_traits
+{
+  typedef Packet2cf type;
+  typedef Packet2cf half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 2,
+    HasHalfPacket = 0,
+
+    HasAdd    = 1,
+    HasSub    = 1,
+    HasMul    = 1,
+    HasDiv    = 1,
+    HasNegate = 1,
+    HasAbs    = 0,
+    HasAbs2   = 0,
+    HasMin    = 0,
+    HasMax    = 0,
+    HasBlend  = 1,
+    HasSetLinear = 0
+  };
+};
+
+
 template<> struct packet_traits<std::complex<double> >  : default_packet_traits
 {
  typedef Packet1cd type;
  typedef Packet1cd half;
  enum {
    Vectorizable = 1,
-    AlignedOnScalar = 0,
+    AlignedOnScalar = 1,
    size = 1,
    HasHalfPacket = 0,

@@ -47,20 +83,68 @@ template<> struct packet_traits<std::complex<double> >  : default_packet_traits
  };
 };

+template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float>  type; enum {size=2, alignment=Aligned16}; typedef Packet2cf half; };
 template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16}; typedef Packet1cd half; };

+/* Forward declaration */
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf,2>& kernel);
+
+template<> EIGEN_STRONG_INLINE Packet2cf pload <Packet2cf>(const std::complex<float>* from)  { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>((const float*)from)); }
 template<> EIGEN_STRONG_INLINE Packet1cd pload <Packet1cd>(const std::complex<double>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>((const double*)from)); }
+template<> EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from)  { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu<Packet4f>((const float*)from)); }
 template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu<Packet2d>((const double*)from)); }
+template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> *     to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); }
 template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); }
+template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> *     to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); }
 template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); }

 template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>&  from)
 { /* here we really have to use unaligned loads :( */ return ploadu<Packet1cd>(&from); }

+template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>&  from)
+{
+  Packet2cf res;
+  res.cd[0] = Packet1cd(vec_ld2f((const float *)&from));
+  res.cd[1] = res.cd[0];
+  return res;
+}
+template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, Index stride)
+{
+  std::complex<float> EIGEN_ALIGN16 af[2];
+  af[0] = from[0*stride];
+  af[1] = from[1*stride];
+  return pload<Packet2cf>(af);
+}
+template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index stride EIGEN_UNUSED)
+{
+  return pload<Packet1cd>(from);
+}
+template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, Index stride)
+{
+  std::complex<float> EIGEN_ALIGN16 af[2];
+  pstore<std::complex<float> >((std::complex<float> *) af, from);
+  to[0*stride] = af[0];
+  to[1*stride] = af[1];
+}
+template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from, Index stride EIGEN_UNUSED)
+{
+  pstore<std::complex<double> >(to, from);
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(padd<Packet4f>(a.v, b.v)); }
 template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v + b.v); }
+template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(psub<Packet4f>(a.v, b.v)); }
 template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v - b.v); }
 template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(Packet2d(a.v))); }
+template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate(Packet4f(a.v))); }
 template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { return Packet1cd((Packet2d)vec_xor((Packet2d)a.v, (Packet2d)p2ul_CONJ_XOR2)); }
+template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a)
+{
+  Packet2cf res;
+  res.v.v4f[0] = pconj(Packet1cd(reinterpret_cast<Packet2d>(a.v.v4f[0]))).v;
+  res.v.v4f[1] = pconj(Packet1cd(reinterpret_cast<Packet2d>(a.v.v4f[1]))).v;
+  return res;
+}

 template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
 {
@@ -79,43 +163,90 @@ template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, con

  return Packet1cd(v1 + v2);
 }
-
-template<> EIGEN_STRONG_INLINE Packet1cd pand   <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd por    <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_or(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd pxor   <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_xor(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v, vec_nor(b.v,b.v))); }
-
-template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>*     from)
+template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
 {
-  return pset1<Packet1cd>(*from);
+  Packet2cf res;
+  res.v.v4f[0] = pmul(Packet1cd(reinterpret_cast<Packet2d>(a.v.v4f[0])), Packet1cd(reinterpret_cast<Packet2d>(b.v.v4f[0]))).v;
+  res.v.v4f[1] = pmul(Packet1cd(reinterpret_cast<Packet2d>(a.v.v4f[1])), Packet1cd(reinterpret_cast<Packet2d>(b.v.v4f[1]))).v;
+  return res;
 }

+template<> EIGEN_STRONG_INLINE Packet1cd pand   <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf pand   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pand<Packet4f>(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd por    <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_or(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf por    <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(por<Packet4f>(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd pxor   <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_xor(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf pxor   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pxor<Packet4f>(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v, vec_nor(b.v,b.v))); }
+template<> EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pandnot<Packet4f>(a.v,b.v)); }
+
+template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>*     from) {  return pset1<Packet1cd>(*from); }
+template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>*      from) {  return pset1<Packet2cf>(*from); }
+
+template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> *     addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
 template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> *   addr) { EIGEN_ZVECTOR_PREFETCH(addr); }

 template<> EIGEN_STRONG_INLINE std::complex<double>  pfirst<Packet1cd>(const Packet1cd& a)
 {
-  std::complex<double> EIGEN_ALIGN16 res[2];
-  pstore<std::complex<double> >(res, a);
+  std::complex<double> EIGEN_ALIGN16 res;
+  pstore<std::complex<double> >(&res, a);
+
+  return res;
+}
+template<> EIGEN_STRONG_INLINE std::complex<float>  pfirst<Packet2cf>(const Packet2cf& a)
+{
+  std::complex<float> EIGEN_ALIGN16 res[2];
+  pstore<std::complex<float> >(res, a);

  return res[0];
 }

 template<> EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a)
+{
+  Packet2cf res;
+  res.cd[0] = a.cd[1];
+  res.cd[1] = a.cd[0];
+  return res;
+}

 template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a)
 {
  return pfirst(a);
 }
+template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a)
+{
+  std::complex<float> res;
+  Packet1cd b = padd<Packet1cd>(a.cd[0], a.cd[1]);
+  vec_st2f(b.v, (float*)&res);
+  return res;
+}

 template<> EIGEN_STRONG_INLINE Packet1cd preduxp<Packet1cd>(const Packet1cd* vecs)
 {
  return vecs[0];
 }
+template<> EIGEN_STRONG_INLINE Packet2cf preduxp<Packet2cf>(const Packet2cf* vecs)
+{
+  PacketBlock<Packet2cf,2> transpose;
+  transpose.packet[0] = vecs[0];
+  transpose.packet[1] = vecs[1];
+  ptranspose(transpose);
+
+  return padd<Packet2cf>(transpose.packet[0], transpose.packet[1]);
+} 

 template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a)
 {
  return pfirst(a);
 }
+template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a)
+{
+  std::complex<float> res;
+  Packet1cd b = pmul<Packet1cd>(a.cd[0], a.cd[1]);
+  vec_st2f(b.v, (float*)&res);
+  return res;
+}

 template<int Offset>
 struct palign_impl<Offset,Packet1cd>
@@ -127,6 +258,18 @@ struct palign_impl<Offset,Packet1cd>
  }
 };

+template<int Offset>
+struct palign_impl<Offset,Packet2cf>
+{
+  static EIGEN_STRONG_INLINE void run(Packet2cf& first, const Packet2cf& second)
+  {
+    if (Offset == 1) {
+      first.cd[0] = first.cd[1];
+      first.cd[1] = second.cd[0];
+    }
+  }
+};
+
 template<> struct conj_helper<Packet1cd, Packet1cd, false,true>
 {
  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
@@ -160,6 +303,42 @@ template<> struct conj_helper<Packet1cd, Packet1cd, true,true>
  }
 };

+template<> struct conj_helper<Packet2cf, Packet2cf, false,true>
+{
+  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
+  { return padd(pmul(x,y),c); }
+
+  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
+  {
+    return internal::pmul(a, pconj(b));
+  }
+};
+
+template<> struct conj_helper<Packet2cf, Packet2cf, true,false>
+{
+  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
+  { return padd(pmul(x,y),c); }
+
+  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
+  {
+    return internal::pmul(pconj(a), b);
+  }
+};
+
+template<> struct conj_helper<Packet2cf, Packet2cf, true,true>
+{
+  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
+  { return padd(pmul(x,y),c); }
+
+  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
+  {
+    return pconj(internal::pmul(a, b));
+  }
+};
+
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f)
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d)
+
 template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
 {
  // TODO optimize it for AltiVec
@@ -168,17 +347,49 @@ template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, con
  return Packet1cd(pdiv(res.v, s + vec_perm(s, s, p16uc_REVERSE64)));
 }

+template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{
+  // TODO optimize it for AltiVec
+  Packet2cf res;
+  res.cd[0] = pdiv<Packet1cd>(a.cd[0], b.cd[0]);
+  res.cd[1] = pdiv<Packet1cd>(a.cd[1], b.cd[1]);
+  return res;
+}
+
 EIGEN_STRONG_INLINE Packet1cd pcplxflip/*<Packet1cd>*/(const Packet1cd& x)
 {
  return Packet1cd(preverse(Packet2d(x.v)));
 }

+EIGEN_STRONG_INLINE Packet2cf pcplxflip/*<Packet2cf>*/(const Packet2cf& x)
+{
+  Packet2cf res;
+  res.cd[0] = pcplxflip(x.cd[0]);
+  res.cd[1] = pcplxflip(x.cd[1]);
+  return res;
+}
+
 EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet1cd,2>& kernel)
 {
  Packet2d tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_HI);
  kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_LO);
  kernel.packet[0].v = tmp;
 }
+
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf,2>& kernel)
+{
+  Packet1cd tmp = kernel.packet[0].cd[1];
+  kernel.packet[0].cd[1] = kernel.packet[1].cd[0];
+  kernel.packet[1].cd[0] = tmp;
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) {
+  Packet2cf result;
+  const Selector<4> ifPacket4 = { ifPacket.select[0], ifPacket.select[0], ifPacket.select[1], ifPacket.select[1] };
+  result.v = pblend<Packet4f>(ifPacket4, thenPacket.v, elsePacket.v);
+  return result;
+}
+
 } // end namespace internal

 } // end namespace Eigen
--- a/Eigen/src/Core/arch/ZVector/MathFunctions.h
+++ b/Eigen/src/Core/arch/ZVector/MathFunctions.h
@@ -3,6 +3,7 @@
 //
 // Copyright (C) 2007 Julien Pommier
 // Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2016 Konstantinos Margaritis <markos@freevec.org>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -19,32 +20,32 @@ namespace Eigen {

 namespace internal {

+static _EIGEN_DECLARE_CONST_Packet2d(1 , 1.0);
+static _EIGEN_DECLARE_CONST_Packet2d(2 , 2.0);
+static _EIGEN_DECLARE_CONST_Packet2d(half, 0.5);
+
+static _EIGEN_DECLARE_CONST_Packet2d(exp_hi,  709.437);
+static _EIGEN_DECLARE_CONST_Packet2d(exp_lo, -709.436139303);
+
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599);
+
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4);
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2);
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1);
+
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6);
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3);
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1);
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0);
+
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125);
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6);
+
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet2d pexp<Packet2d>(const Packet2d& _x)
 {
  Packet2d x = _x;

-  _EIGEN_DECLARE_CONST_Packet2d(1 , 1.0);
-  _EIGEN_DECLARE_CONST_Packet2d(2 , 2.0);
-  _EIGEN_DECLARE_CONST_Packet2d(half, 0.5);
-
-  _EIGEN_DECLARE_CONST_Packet2d(exp_hi,  709.437);
-  _EIGEN_DECLARE_CONST_Packet2d(exp_lo, -709.436139303);
-
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599);
-
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4);
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2);
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1);
-
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6);
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3);
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1);
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0);
-
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125);
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6);
-
  Packet2d tmp, fx;
  Packet2l emm0;

@@ -91,18 +92,44 @@ Packet2d pexp<Packet2d>(const Packet2d& _x)
                 isnumber_mask);
 }

+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f pexp<Packet4f>(const Packet4f& x)
+{
+  Packet4f res;
+  res.v4f[0] = pexp<Packet2d>(x.v4f[0]);
+  res.v4f[1] = pexp<Packet2d>(x.v4f[1]);
+  return res;
+}
+
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet2d psqrt<Packet2d>(const Packet2d& x)
 {
  return  __builtin_s390_vfsqdb(x);
 }

+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f psqrt<Packet4f>(const Packet4f& x)
+{
+  Packet4f res;
+  res.v4f[0] = psqrt<Packet2d>(x.v4f[0]);
+  res.v4f[1] = psqrt<Packet2d>(x.v4f[1]);
+  return res;
+}
+
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet2d prsqrt<Packet2d>(const Packet2d& x) {
  // Unfortunately we can't use the much faster mm_rqsrt_pd since it only provides an approximation.
  return pset1<Packet2d>(1.0) / psqrt<Packet2d>(x);
 }

+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f prsqrt<Packet4f>(const Packet4f& x) {
+  Packet4f res;
+  res.v4f[0] = prsqrt<Packet2d>(x.v4f[0]);
+  res.v4f[1] = prsqrt<Packet2d>(x.v4f[1]);
+  return res;
+}
+
 }  // end namespace internal

 }  // end namespace Eigen
--- a/Eigen/src/Core/arch/ZVector/PacketMath.h
+++ b/Eigen/src/Core/arch/ZVector/PacketMath.h
@@ -10,8 +10,6 @@
 #ifndef EIGEN_PACKET_MATH_ZVECTOR_H
 #define EIGEN_PACKET_MATH_ZVECTOR_H

-#include <stdint.h>
-
 namespace Eigen {

 namespace internal {
@@ -28,9 +26,8 @@ namespace internal {
 #define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
 #endif

-// NOTE Altivec has 32 registers, but Eigen only accepts a value of 8 or 16
 #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
-#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS  32
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS  16
 #endif

 typedef __vector int                 Packet4i;
@@ -42,11 +39,15 @@ typedef __vector double              Packet2d;
 typedef __vector unsigned long long  Packet2ul;
 typedef __vector long long           Packet2l;

+typedef struct {
+	Packet2d  v4f[2];
+} Packet4f;
+
 typedef union {
-  int32_t   i[4];
-  uint32_t ui[4];
-  int64_t   l[2];
-  uint64_t ul[2];
+  numext::int32_t   i[4];
+  numext::uint32_t ui[4];
+  numext::int64_t   l[2];
+  numext::uint64_t ul[2];
  double    d[2];
  Packet4i  v4i;
  Packet4ui v4ui;
@@ -88,6 +89,7 @@ static Packet2d p2d_ONE = { 1.0, 1.0 };
 static Packet2d p2d_ZERO_ = { -0.0, -0.0 };

 static Packet4i p4i_COUNTDOWN = { 0, 1, 2, 3 };
+static Packet4f p4f_COUNTDOWN = { 0.0, 1.0, 2.0, 3.0 };
 static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet16uc>(p2d_ZERO), reinterpret_cast<Packet16uc>(p2d_ONE), 8));

 static Packet16uc p16uc_PSET64_HI = { 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 };
@@ -96,7 +98,7 @@ static Packet16uc p16uc_DUPLICATE32_HI = { 0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7 };
 // Mask alignment
 #define _EIGEN_MASK_ALIGNMENT	0xfffffffffffffff0

-#define _EIGEN_ALIGNED_PTR(x)	((ptrdiff_t)(x) & _EIGEN_MASK_ALIGNMENT)
+#define _EIGEN_ALIGNED_PTR(x)	((std::ptrdiff_t)(x) & _EIGEN_MASK_ALIGNMENT)

 // Handle endianness properly while loading constants
 // Define global static constants:
@@ -132,13 +134,11 @@ template<> struct packet_traits<int>    : default_packet_traits
  typedef Packet4i type;
  typedef Packet4i half;
  enum {
-    // FIXME check the Has*
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 4,
    HasHalfPacket = 0,

-    // FIXME check the Has*
    HasAdd  = 1,
    HasSub  = 1,
    HasMul  = 1,
@@ -147,6 +147,37 @@ template<> struct packet_traits<int>    : default_packet_traits
  };
 };

+template<> struct packet_traits<float> : default_packet_traits
+{
+  typedef Packet4f type;
+  typedef Packet4f half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size=4,
+    HasHalfPacket = 0,
+
+    HasAdd  = 1,
+    HasSub  = 1,
+    HasMul  = 1,
+    HasDiv  = 1,
+    HasMin  = 1,
+    HasMax  = 1,
+    HasAbs  = 1,
+    HasSin  = 0,
+    HasCos  = 0,
+    HasLog  = 0,
+    HasExp  = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasRound = 1,
+    HasFloor = 1,
+    HasCeil = 1,
+    HasNegate = 1,
+    HasBlend = 1
+  };
+};
+
 template<> struct packet_traits<double> : default_packet_traits
 {
  typedef Packet2d type;
@@ -157,7 +188,6 @@ template<> struct packet_traits<double> : default_packet_traits
    size=2,
    HasHalfPacket = 1,

-    // FIXME check the Has*
    HasAdd  = 1,
    HasSub  = 1,
    HasMul  = 1,
@@ -180,8 +210,12 @@ template<> struct packet_traits<double> : default_packet_traits
 };

 template<> struct unpacket_traits<Packet4i> { typedef int    type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; };
+template<> struct unpacket_traits<Packet4f> { typedef float  type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; };
 template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; };

+/* Forward declaration */
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4f,4>& kernel);
+ 
 inline std::ostream & operator <<(std::ostream & s, const Packet4i & v)
 {
  Packet vt;
@@ -222,6 +256,32 @@ inline std::ostream & operator <<(std::ostream & s, const Packet2d & v)
  return s;
 }

+/* Helper function to simulate a vec_splat_packet4f
+ */
+template<int element> EIGEN_STRONG_INLINE Packet4f vec_splat_packet4f(const Packet4f&   from)
+{
+  Packet4f splat;
+  switch (element) {
+  case 0:
+    splat.v4f[0] = vec_splat(from.v4f[0], 0);
+    splat.v4f[1] = splat.v4f[0];
+    break;
+  case 1:
+    splat.v4f[0] = vec_splat(from.v4f[0], 1);
+    splat.v4f[1] = splat.v4f[0];
+    break;
+  case 2:
+    splat.v4f[0] = vec_splat(from.v4f[1], 0);
+    splat.v4f[1] = splat.v4f[0];
+    break;
+  case 3:
+    splat.v4f[0] = vec_splat(from.v4f[1], 1);
+    splat.v4f[1] = splat.v4f[0];
+    break;
+  }
+  return splat;
+}
+
 template<int Offset>
 struct palign_impl<Offset,Packet4i>
 {
@@ -238,6 +298,31 @@ struct palign_impl<Offset,Packet4i>
  }
 };

+/* This is a tricky one, we have to translate float alignment to vector elements of sizeof double
+ */
+template<int Offset>
+struct palign_impl<Offset,Packet4f>
+{
+  static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second)
+  {
+    switch (Offset % 4) {
+    case 1:
+      first.v4f[0] = vec_sld(first.v4f[0], first.v4f[1], 8);
+      first.v4f[1] = vec_sld(first.v4f[1], second.v4f[0], 8);
+      break;
+    case 2:
+      first.v4f[0] = first.v4f[1];
+      first.v4f[1] = second.v4f[0];
+      break;
+    case 3:
+      first.v4f[0] = vec_sld(first.v4f[1],  second.v4f[0], 8);
+      first.v4f[1] = vec_sld(second.v4f[0], second.v4f[1], 8);
+      break;
+    }
+  }
+};
+
+
 template<int Offset>
 struct palign_impl<Offset,Packet2d>
 {
@@ -257,6 +342,16 @@ template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int*     from)
  return vfrom->v4i;
 }

+template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float*   from)
+{
+  // FIXME: No intrinsic yet
+  EIGEN_DEBUG_ALIGNED_LOAD
+  Packet4f vfrom;
+  vfrom.v4f[0] = vec_ld2f(&from[0]);
+  vfrom.v4f[1] = vec_ld2f(&from[2]);
+  return vfrom;
+}
+
 template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from)
 {
  // FIXME: No intrinsic yet
@@ -275,6 +370,15 @@ template<> EIGEN_STRONG_INLINE void pstore<int>(int*       to, const Packet4i& f
  vto->v4i = from;
 }

+template<> EIGEN_STRONG_INLINE void pstore<float>(float*   to, const Packet4f& from)
+{
+  // FIXME: No intrinsic yet
+  EIGEN_DEBUG_ALIGNED_STORE
+  vec_st2f(from.v4f[0], &to[0]);
+  vec_st2f(from.v4f[1], &to[2]);
+}
+
+
 template<> EIGEN_STRONG_INLINE void pstore<double>(double*   to, const Packet2d& from)
 {
  // FIXME: No intrinsic yet
@@ -288,10 +392,16 @@ template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int&    from)
 {
  return vec_splats(from);
 }
-
 template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
  return vec_splats(from);
 }
+template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float&    from)
+{
+  Packet4f to;
+  to.v4f[0] = pset1<Packet2d>(static_cast<const double&>(from));
+  to.v4f[1] = to.v4f[0];
+  return to;
+}

 template<> EIGEN_STRONG_INLINE void
 pbroadcast4<Packet4i>(const int *a,
@@ -304,6 +414,17 @@ pbroadcast4<Packet4i>(const int *a,
  a3 = vec_splat(a3, 3);
 }

+template<> EIGEN_STRONG_INLINE void
+pbroadcast4<Packet4f>(const float *a,
+                      Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
+{
+  a3 = pload<Packet4f>(a);
+  a0 = vec_splat_packet4f<0>(a3);
+  a1 = vec_splat_packet4f<1>(a3);
+  a2 = vec_splat_packet4f<2>(a3);
+  a3 = vec_splat_packet4f<3>(a3);
+}
+
 template<> EIGEN_STRONG_INLINE void
 pbroadcast4<Packet2d>(const double *a,
                      Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3)
@@ -326,6 +447,16 @@ template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* f
 return pload<Packet4i>(ai);
 }

+template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
+{
+  float EIGEN_ALIGN16 ai[4];
+  ai[0] = from[0*stride];
+  ai[1] = from[1*stride];
+  ai[2] = from[2*stride];
+  ai[3] = from[3*stride];
+ return pload<Packet4f>(ai);
+}
+
 template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride)
 {
  double EIGEN_ALIGN16 af[2];
@@ -344,6 +475,16 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const
  to[3*stride] = ai[3];
 }

+template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
+{
+  float EIGEN_ALIGN16 ai[4];
+  pstore<float>((float *)ai, from);
+  to[0*stride] = ai[0];
+  to[1*stride] = ai[1];
+  to[2*stride] = ai[2];
+  to[3*stride] = ai[3];
+}
+
 template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
 {
  double EIGEN_ALIGN16 af[2];
@@ -353,52 +494,160 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to,
 }

 template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return (a + b); }
+template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f c;
+  c.v4f[0] = a.v4f[0] + b.v4f[0];
+  c.v4f[1] = a.v4f[1] + b.v4f[1];
+  return c;
+}
 template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return (a + b); }

 template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return (a - b); }
+template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f c;
+  c.v4f[0] = a.v4f[0] - b.v4f[0];
+  c.v4f[1] = a.v4f[1] - b.v4f[1];
+  return c;
+}
 template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return (a - b); }

 template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) { return (a * b); }
+template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f c;
+  c.v4f[0] = a.v4f[0] * b.v4f[0];
+  c.v4f[1] = a.v4f[1] * b.v4f[1];
+  return c;
+}
 template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return (a * b); }

 template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& a, const Packet4i& b) { return (a / b); }
+template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f c;
+  c.v4f[0] = a.v4f[0] / b.v4f[0];
+  c.v4f[1] = a.v4f[1] / b.v4f[1];
+  return c;
+}
 template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return (a / b); }

 template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return (-a); }
+template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a)
+{
+  Packet4f c;
+  c.v4f[0] = -a.v4f[0];
+  c.v4f[1] = -a.v4f[1];
+  return c;
+}
 template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return (-a); }

 template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
 template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }

 template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd<Packet4i>(pmul<Packet4i>(a, b), c); }
+template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c)
+{
+  Packet4f res;
+  res.v4f[0] = vec_madd(a.v4f[0], b.v4f[0], c.v4f[0]);
+  res.v4f[1] = vec_madd(a.v4f[1], b.v4f[1], c.v4f[1]);
+  return res;
+}
 template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_madd(a, b, c); }

 template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a)    { return padd<Packet4i>(pset1<Packet4i>(a), p4i_COUNTDOWN); }
+template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a)  { return padd<Packet4f>(pset1<Packet4f>(a), p4f_COUNTDOWN); }
 template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) { return padd<Packet2d>(pset1<Packet2d>(a), p2d_COUNTDOWN); }

 template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); }
 template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_min(a, b); }
+template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f res;
+  res.v4f[0] = pmin(a.v4f[0], b.v4f[0]);
+  res.v4f[1] = pmin(a.v4f[1], b.v4f[1]);
+  return res;
+}

 template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); }
 template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_max(a, b); }
+template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f res;
+  res.v4f[0] = pmax(a.v4f[0], b.v4f[0]);
+  res.v4f[1] = pmax(a.v4f[1], b.v4f[1]);
+  return res;
+}

 template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, b); }
 template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, b); }
+template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f res;
+  res.v4f[0] = pand(a.v4f[0], b.v4f[0]);
+  res.v4f[1] = pand(a.v4f[1], b.v4f[1]);
+  return res;
+}

 template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_or(a, b); }
 template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_or(a, b); }
+template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f res;
+  res.v4f[0] = pand(a.v4f[0], b.v4f[0]);
+  res.v4f[1] = pand(a.v4f[1], b.v4f[1]);
+  return res;
+}

 template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_xor(a, b); }
 template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_xor(a, b); }
+template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f res;
+  res.v4f[0] = pand(a.v4f[0], b.v4f[0]);
+  res.v4f[1] = pand(a.v4f[1], b.v4f[1]);
+  return res;
+}

 template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return pand<Packet4i>(a, vec_nor(b, b)); }
 template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, vec_nor(b, b)); }
+template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f res;
+  res.v4f[0] = pandnot(a.v4f[0], b.v4f[0]);
+  res.v4f[1] = pandnot(a.v4f[1], b.v4f[1]);
+  return res;
+}

+template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a)
+{
+  Packet4f res;
+  res.v4f[0] = vec_round(a.v4f[0]);
+  res.v4f[1] = vec_round(a.v4f[1]);
+  return res;
+}
 template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) { return vec_round(a); }
+template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const  Packet4f& a)
+{
+  Packet4f res;
+  res.v4f[0] = vec_ceil(a.v4f[0]);
+  res.v4f[1] = vec_ceil(a.v4f[1]);
+  return res;
+}
 template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const  Packet2d& a) { return vec_ceil(a); }
+template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a)
+{
+  Packet4f res;
+  res.v4f[0] = vec_floor(a.v4f[0]);
+  res.v4f[1] = vec_floor(a.v4f[1]);
+  return res;
+}
 template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { return vec_floor(a); }

 template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int*       from) { return pload<Packet4i>(from); }
+template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float*     from) { return pload<Packet4f>(from); }
 template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double*    from) { return pload<Packet2d>(from); }


@@ -408,6 +657,14 @@ template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int*     from)
  return vec_perm(p, p, p16uc_DUPLICATE32_HI);
 }

+template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float*    from)
+{
+  Packet4f p = pload<Packet4f>(from);
+  p.v4f[1] = vec_splat(p.v4f[0], 1);
+  p.v4f[0] = vec_splat(p.v4f[0], 0);
+  return p;
+}
+
 template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double*   from)
 {
  Packet2d p = pload<Packet2d>(from);
@@ -415,12 +672,15 @@ template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double*   from)
 }

 template<> EIGEN_STRONG_INLINE void pstoreu<int>(int*        to, const Packet4i& from) { pstore<int>(to, from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<float>(float*    to, const Packet4f& from) { pstore<float>(to, from); }
 template<> EIGEN_STRONG_INLINE void pstoreu<double>(double*  to, const Packet2d& from) { pstore<double>(to, from); }

 template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*       addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
+template<> EIGEN_STRONG_INLINE void prefetch<float>(const float*   addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
 template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_ZVECTOR_PREFETCH(addr); }

 template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { int    EIGEN_ALIGN16 x[4]; pstore(x, a); return x[0]; }
+template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { float  EIGEN_ALIGN16 x[2]; vec_st2f(a.v4f[0], &x[0]); return x[0]; }
 template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { double EIGEN_ALIGN16 x[2]; pstore(x, a); return x[0]; }

 template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a)
@@ -433,8 +693,23 @@ template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
  return reinterpret_cast<Packet2d>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE64));
 }

-template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vec_abs(a); }
-template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vec_abs(a); }
+template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
+{
+  Packet4f rev;
+  rev.v4f[0] = preverse<Packet2d>(a.v4f[1]);
+  rev.v4f[1] = preverse<Packet2d>(a.v4f[0]);
+  return rev;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4i pabs<Packet4i>(const Packet4i& a) { return vec_abs(a); }
+template<> EIGEN_STRONG_INLINE Packet2d pabs<Packet2d>(const Packet2d& a) { return vec_abs(a); }
+template<> EIGEN_STRONG_INLINE Packet4f pabs<Packet4f>(const Packet4f& a)
+{
+  Packet4f res;
+  res.v4f[0] = pabs(a.v4f[0]);
+  res.v4f[1] = pabs(a.v4f[1]);
+  return res;
+}

 template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
 {
@@ -453,6 +728,13 @@ template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
  sum = padd<Packet2d>(a, b);
  return pfirst(sum);
 }
+template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
+{
+  Packet2d sum;
+  sum = padd<Packet2d>(a.v4f[0], a.v4f[1]);
+  double first = predux<Packet2d>(sum);
+  return static_cast<float>(first);
+}

 template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
 {
@@ -493,6 +775,21 @@ template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
  return sum;
 }

+template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
+{
+  PacketBlock<Packet4f,4> transpose;
+  transpose.packet[0] = vecs[0];
+  transpose.packet[1] = vecs[1];
+  transpose.packet[2] = vecs[2];
+  transpose.packet[3] = vecs[3];
+  ptranspose(transpose);
+
+  Packet4f sum = padd(transpose.packet[0], transpose.packet[1]);
+  sum = padd(sum, transpose.packet[2]);
+  sum = padd(sum, transpose.packet[3]);
+  return sum;
+}
+
 // Other reduction functions:
 // mul
 template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
@@ -507,6 +804,12 @@ template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
  return pfirst(pmul(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8))));
 }

+template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
+{
+  // Return predux_mul<Packet2d> of the subvectors product
+  return static_cast<float>(pfirst(predux_mul(pmul(a.v4f[0], a.v4f[1]))));
+}
+
 // min
 template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
 {
@@ -521,6 +824,14 @@ template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a)
  return pfirst(pmin<Packet2d>(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8))));
 }

+template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
+{
+  Packet2d b, res;
+  b   = pmin<Packet2d>(a.v4f[0], a.v4f[1]);
+  res = pmin<Packet2d>(b, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(b), reinterpret_cast<Packet4i>(b), 8)));
+  return static_cast<float>(pfirst(res));
+}
+
 // max
 template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
 {
@@ -536,6 +847,14 @@ template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
  return pfirst(pmax<Packet2d>(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8))));
 }

+template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
+{
+  Packet2d b, res;
+  b   = pmax<Packet2d>(a.v4f[0], a.v4f[1]);
+  res = pmax<Packet2d>(b, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(b), reinterpret_cast<Packet4i>(b), 8)));
+  return static_cast<float>(pfirst(res));
+}
+
 EIGEN_DEVICE_FUNC inline void
 ptranspose(PacketBlock<Packet4i,4>& kernel) {
  Packet4i t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
@@ -556,12 +875,61 @@ ptranspose(PacketBlock<Packet2d,2>& kernel) {
  kernel.packet[1] = t1;
 }

+/* Split the Packet4f PacketBlock into 4 Packet2d PacketBlocks and transpose each one
+ */
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet4f,4>& kernel) {
+  PacketBlock<Packet2d,2> t0,t1,t2,t3;
+  // copy top-left 2x2 Packet2d block
+  t0.packet[0] = kernel.packet[0].v4f[0];
+  t0.packet[1] = kernel.packet[1].v4f[0];
+
+  // copy top-right 2x2 Packet2d block
+  t1.packet[0] = kernel.packet[0].v4f[1];
+  t1.packet[1] = kernel.packet[1].v4f[1];
+
+  // copy bottom-left 2x2 Packet2d block
+  t2.packet[0] = kernel.packet[2].v4f[0];
+  t2.packet[1] = kernel.packet[3].v4f[0];
+
+  // copy bottom-right 2x2 Packet2d block
+  t3.packet[0] = kernel.packet[2].v4f[1];
+  t3.packet[1] = kernel.packet[3].v4f[1];
+
+  // Transpose all 2x2 blocks
+  ptranspose(t0);
+  ptranspose(t1);
+  ptranspose(t2);
+  ptranspose(t3);
+
+  // Copy back transposed blocks, but exchange t1 and t2 due to transposition
+  kernel.packet[0].v4f[0] = t0.packet[0];
+  kernel.packet[0].v4f[1] = t2.packet[0];
+  kernel.packet[1].v4f[0] = t0.packet[1];
+  kernel.packet[1].v4f[1] = t2.packet[1];
+  kernel.packet[2].v4f[0] = t1.packet[0];
+  kernel.packet[2].v4f[1] = t3.packet[0];
+  kernel.packet[3].v4f[0] = t1.packet[1];
+  kernel.packet[3].v4f[1] = t3.packet[1];
+}
+
 template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) {
  Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] };
  Packet4ui mask = vec_cmpeq(select, reinterpret_cast<Packet4ui>(p4i_ONE));
  return vec_sel(elsePacket, thenPacket, mask);
 }

+template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) {
+  Packet2ul select_hi = { ifPacket.select[0], ifPacket.select[1] };
+  Packet2ul select_lo = { ifPacket.select[2], ifPacket.select[3] };
+  Packet2ul mask_hi = vec_cmpeq(select_hi, reinterpret_cast<Packet2ul>(p2l_ONE));
+  Packet2ul mask_lo = vec_cmpeq(select_lo, reinterpret_cast<Packet2ul>(p2l_ONE));
+  Packet4f result;
+  result.v4f[0] = vec_sel(elsePacket.v4f[0], thenPacket.v4f[0], mask_hi);
+  result.v4f[1] = vec_sel(elsePacket.v4f[1], thenPacket.v4f[1], mask_lo);
+  return result;
+}
+
 template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) {
  Packet2ul select = { ifPacket.select[0], ifPacket.select[1] };
  Packet2ul mask = vec_cmpeq(select, reinterpret_cast<Packet2ul>(p2l_ONE));
--- a/Eigen/src/Core/functors/AssignmentFunctors.h
+++ b/Eigen/src/Core/functors/AssignmentFunctors.h
@@ -28,7 +28,7 @@ template<typename DstScalar,typename SrcScalar> struct assign_op {
  { internal::pstoret<DstScalar,Packet,Alignment>(a,b); }
 };

-// Empty overload for void type (used by PermutationMatrix
+// Empty overload for void type (used by PermutationMatrix)
 template<typename DstScalar> struct assign_op<DstScalar,void> {};

 template<typename DstScalar,typename SrcScalar>
--- a/Eigen/src/Core/functors/BinaryFunctors.h
+++ b/Eigen/src/Core/functors/BinaryFunctors.h
@@ -255,7 +255,7 @@ struct scalar_cmp_op<LhsScalar,RhsScalar, cmp_NEQ> : binary_op_base<LhsScalar,Rh


 /** \internal
-  * \brief Template functor to compute the hypot of two scalars
+  * \brief Template functor to compute the hypot of two \b positive \b and \b real scalars
  *
  * \sa MatrixBase::stableNorm(), class Redux
  */
@@ -263,22 +263,15 @@ template<typename Scalar>
 struct scalar_hypot_op<Scalar,Scalar> : binary_op_base<Scalar,Scalar>
 {
  EIGEN_EMPTY_STRUCT_CTOR(scalar_hypot_op)
-//   typedef typename NumTraits<Scalar>::Real result_type;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& _x, const Scalar& _y) const
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar &x, const Scalar &y) const
  {
-    EIGEN_USING_STD_MATH(sqrt)
-    Scalar p, qp;
-    if(_x>_y)
-    {
-      p = _x;
-      qp = _y / p;
-    }
-    else
-    {
-      p = _y;
-      qp = _x / p;
-    }
-    return p * sqrt(Scalar(1) + qp*qp);
+    // This functor is used by hypotNorm only for which it is faster to first apply abs
+    // on all coefficients prior to reduction through hypot.
+    // This way we avoid calling abs on positive and real entries, and this also permits
+    // to seamlessly handle complexes. Otherwise we would have to handle both real and complexes
+    // through the same functor...
+    return internal::positive_real_hypot(x,y);
  }
 };
 template<typename Scalar>
--- a/Eigen/src/Core/functors/NullaryFunctors.h
+++ b/Eigen/src/Core/functors/NullaryFunctors.h
@@ -44,16 +44,16 @@ struct linspaced_op_impl<Scalar,Packet,/*IsInteger*/false>
 {
  linspaced_op_impl(const Scalar& low, const Scalar& high, Index num_steps) :
    m_low(low), m_high(high), m_size1(num_steps==1 ? 1 : num_steps-1), m_step(num_steps==1 ? Scalar() : (high-low)/Scalar(num_steps-1)),
-    m_interPacket(plset<Packet>(0)),
-    m_flip(std::abs(high)<std::abs(low))
+    m_flip(numext::abs(high)<numext::abs(low))
  {}

  template<typename IndexType>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (IndexType i) const {
+    typedef typename NumTraits<Scalar>::Real RealScalar;
    if(m_flip)
-      return (i==0)? m_low : (m_high - (m_size1-i)*m_step);
+      return (i==0)? m_low : (m_high - RealScalar(m_size1-i)*m_step);
    else
-      return (i==m_size1)? m_high : (m_low + i*m_step);
+      return (i==m_size1)? m_high : (m_low + RealScalar(i)*m_step);
  }

  template<typename IndexType>
@@ -63,7 +63,7 @@ struct linspaced_op_impl<Scalar,Packet,/*IsInteger*/false>
    // [low, ..., low] + ( [step, ..., step] * ( [i, ..., i] + [0, ..., size] ) )
    if(m_flip)
    {
-      Packet pi = padd(pset1<Packet>(Scalar(i-m_size1)),m_interPacket);
+      Packet pi = plset<Packet>(Scalar(i-m_size1));
      Packet res = padd(pset1<Packet>(m_high), pmul(pset1<Packet>(m_step), pi));
      if(i==0)
        res = pinsertfirst(res, m_low);
@@ -71,7 +71,7 @@ struct linspaced_op_impl<Scalar,Packet,/*IsInteger*/false>
    }
    else
    {
-      Packet pi = padd(pset1<Packet>(Scalar(i)),m_interPacket);
+      Packet pi = plset<Packet>(Scalar(i));
      Packet res = padd(pset1<Packet>(m_low), pmul(pset1<Packet>(m_step), pi));
      if(i==m_size1-unpacket_traits<Packet>::size+1)
        res = pinsertlast(res, m_high);
@@ -83,7 +83,6 @@ struct linspaced_op_impl<Scalar,Packet,/*IsInteger*/false>
  const Scalar m_high;
  const Index m_size1;
  const Scalar m_step;
-  const Packet m_interPacket;
  const bool m_flip;
 };

@@ -93,8 +92,8 @@ struct linspaced_op_impl<Scalar,Packet,/*IsInteger*/true>
  linspaced_op_impl(const Scalar& low, const Scalar& high, Index num_steps) :
    m_low(low),
    m_multiplier((high-low)/convert_index<Scalar>(num_steps<=1 ? 1 : num_steps-1)),
-    m_divisor(convert_index<Scalar>(num_steps+high-low)/(high-low+1)),
-    m_use_divisor((high-low+1)<num_steps)
+    m_divisor(convert_index<Scalar>((high>=low?num_steps:-num_steps)+(high-low))/((numext::abs(high-low)+1)==0?1:(numext::abs(high-low)+1))),
+    m_use_divisor(num_steps>1 && (numext::abs(high-low)+1)<num_steps)
  {}

  template<typename IndexType>
@@ -173,6 +172,13 @@ template<typename Scalar, typename PacketType,typename IndexType>
 struct has_unary_operator<linspaced_op<Scalar,PacketType>,IndexType> { enum { value = 1}; };
 template<typename Scalar, typename PacketType,typename IndexType>
 struct has_binary_operator<linspaced_op<Scalar,PacketType>,IndexType> { enum { value = 0}; };
+
+template<typename Scalar,typename IndexType>
+struct has_nullary_operator<scalar_random_op<Scalar>,IndexType> { enum { value = 1}; };
+template<typename Scalar,typename IndexType>
+struct has_unary_operator<scalar_random_op<Scalar>,IndexType> { enum { value = 0}; };
+template<typename Scalar,typename IndexType>
+struct has_binary_operator<scalar_random_op<Scalar>,IndexType> { enum { value = 0}; };
 #endif

 } // end namespace internal
--- a/Eigen/src/Core/functors/StlFunctors.h
+++ b/Eigen/src/Core/functors/StlFunctors.h
@@ -72,7 +72,7 @@ template<typename T>
 struct functor_traits<std::not_equal_to<T> >
 { enum { Cost = 1, PacketAccess = false }; };

-#if(__cplusplus < 201103L)
+#if (__cplusplus < 201103L) && (EIGEN_COMP_MSVC <= 1900)
 // std::binder* are deprecated since c++11 and will be removed in c++17
 template<typename T>
 struct functor_traits<std::binder2nd<T> >
@@ -83,13 +83,17 @@ struct functor_traits<std::binder1st<T> >
 { enum { Cost = functor_traits<T>::Cost, PacketAccess = false }; };
 #endif

+#if (__cplusplus < 201703L) && (EIGEN_COMP_MSVC < 1910)
+// std::unary_negate is deprecated since c++17 and will be removed in c++20
 template<typename T>
 struct functor_traits<std::unary_negate<T> >
 { enum { Cost = 1 + functor_traits<T>::Cost, PacketAccess = false }; };

+// std::binary_negate is deprecated since c++17 and will be removed in c++20
 template<typename T>
 struct functor_traits<std::binary_negate<T> >
 { enum { Cost = 1 + functor_traits<T>::Cost, PacketAccess = false }; };
+#endif

 #ifdef EIGEN_STDEXT_SUPPORT

--- a/Show More
+++ b/Show More