Fix docs build job.

Extend the range of supported CMake package config versions
Modified to be backward-compatible with Eigen 3.4.0, in that the following will still accept 3.4.1: ``` find_package(Eigen3 3.3) ``` (cherry picked from commit 027dc5bc8d)
2026-04-10 11:34:33 +08:00 · 2025-09-29 16:25:14 -07:00 · 2025-09-29 10:47:01 -07:00 · 2025-09-23 13:22:29 -07:00 · 2025-09-12 14:14:52 -07:00 · 2025-08-29 11:34:35 -07:00
603 changed files with 44302 additions and 20954 deletions
--- a/.clang-format
+++ b/.clang-format
@@ -0,0 +1,19 @@
 ---
 BasedOnStyle: Google
 ColumnLimit:  120
 ---
 Language:     Cpp
 BasedOnStyle: Google
 ColumnLimit:  120
 StatementMacros:
  - EIGEN_STATIC_ASSERT
  - EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
  - EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
 SortIncludes: false
 AttributeMacros:
 - EIGEN_STRONG_INLINE
 - EIGEN_ALWAYS_INLINE
 - EIGEN_DEVICE_FUNC
 - EIGEN_DONT_INLINE
 - EIGEN_DEPRECATED
 - EIGEN_UNUSED
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,3 @@
 syntax: glob
 qrc_*cxx
 *.orig
 *.pyc
@@ -36,3 +35,7 @@ lapack/reference
 .*project
 .settings
 Makefile
 !ci/build.gitlab-ci.yml
 !scripts/buildtests.in
 !Eigen/Core
 !Eigen/src/Core
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -0,0 +1,34 @@
 # This file is part of Eigen, a lightweight C++ template library
 # for linear algebra.
 #
 # Copyright (C) 2023, The Eigen Authors
 #
 # This Source Code Form is subject to the terms of the Mozilla
 # Public License v. 2.0. If a copy of the MPL was not distributed
 # with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 stages:
  - checkformat
  - build
  - test
  - deploy
 variables:
  # CMake build directory.
  EIGEN_CI_BUILDDIR: .build
  # Specify the CMake build target.
  EIGEN_CI_BUILD_TARGET: ""
  # If a test regex is specified, that will be selected.
  # Otherwise, we will try a label if specified.
  EIGEN_CI_CTEST_REGEX: ""
  EIGEN_CI_CTEST_LABEL: ""
  EIGEN_CI_CTEST_ARGS: ""
 include:
  - "/ci/checkformat.gitlab-ci.yml"
  - "/ci/common.gitlab-ci.yml"
  - "/ci/build.linux.gitlab-ci.yml"
  - "/ci/build.windows.gitlab-ci.yml"
  - "/ci/test.linux.gitlab-ci.yml"
  - "/ci/test.windows.gitlab-ci.yml"
  - "/ci/deploy.gitlab-ci.yml"
--- a/.gitlab/issue_templates/Bug
+++ b/.gitlab/issue_templates/Bug
@@ -0,0 +1,69 @@
 <!--
 Please read this!
 Before opening a new issue, make sure to search for keywords in the issues
 filtered by "bug::confirmed" or "bug::unconfirmed" and "bugzilla" label:
 - https://gitlab.com/libeigen/eigen/-/issues?scope=all&utf8=%E2%9C%93&state=opened&label_name[]=bug%3A%3Aconfirmed
 - https://gitlab.com/libeigen/eigen/-/issues?scope=all&utf8=%E2%9C%93&state=opened&label_name[]=bug%3A%3Aunconfirmed
 - https://gitlab.com/libeigen/eigen/-/issues?scope=all&utf8=%E2%9C%93&state=opened&label_name[]=bugzilla
 and verify the issue you're about to submit isn't a duplicate. -->
 ### Summary
 <!-- Summarize the bug encountered concisely. -->
 ### Environment
 <!-- Please provide your development environment here -->
 - **Operating System** : Windows/Linux
 - **Architecture** : x64/Arm64/PowerPC ...
 - **Eigen Version** : 3.3.9
 - **Compiler Version** : Gcc7.0
 - **Compile Flags** : -O3 -march=native
 - **Vector Extension** : SSE/AVX/NEON ...
 ### Minimal Example
 <!-- If possible, please create a minimal example here that exhibits the problematic behavior.
 You can also link to [godbolt](https://godbolt.org). But please note that you need to click 
 the "Share" button in the top right-hand corner of the godbolt page where you reproduce the sample 
 code to get the share link instead of in your browser address bar. 
 You can read [the guidelines on stackoverflow](https://stackoverflow.com/help/minimal-reproducible-example)
 on how to create a good minimal example. -->
 ```cpp
 //show your code here
 ```
 ### Steps to reproduce
 <!-- Describe how one can reproduce the issue - this is very important. Please use an ordered list. -->
 1. first step
 2. second step
 3. ... 
 ### What is the current *bug* behavior?
 <!-- Describe what actually happens. -->
 ### What is the expected *correct* behavior?
 <!-- Describe what you should see instead. -->
 ### Relevant logs
 <!-- Add relevant code snippets or program output within blocks marked by " ``` " -->
 <!-- OPTIONAL: remove this section if you are not reporting a compilation warning issue.-->
 ### Warning Messages
 <!-- Show us the warning messages you got! -->
 <!-- OPTIONAL: remove this section if you are not reporting a performance issue. -->
 ### Benchmark scripts and results
 <!-- Please share any benchmark scripts - either standalone, or using [Google Benchmark](https://github.com/google/benchmark). -->
 ### Anything else that might help
 <!-- It would be helpful to provide more information to help narrow down the cause.
 Including but not limited to the following: 
 - lines of code that might help us diagnose the problem. 
 - potential ways to address the issue.
 - last known working/first broken version (release number or commit hash). --> 
 - [ ] Have a plan to fix this issue.
--- a/.gitlab/issue_templates/Feature
+++ b/.gitlab/issue_templates/Feature
@@ -0,0 +1,7 @@
 ### Describe the feature you would like to be implemented.
 ### Would such a feature be useful for other users? Why?
 ### Any hints on how to implement the requested feature?
 ### Additional resources
--- a/.gitlab/merge_request_templates/Merge
+++ b/.gitlab/merge_request_templates/Merge
@@ -0,0 +1,26 @@
 <!-- 
 Thanks for contributing a merge request! Please name and fully describe your MR as you would for a commit message.
 If the MR fixes an issue, please include "Fixes #issue" in the commit message and the MR description.
 In addition, we recommend that first-time contributors read our [contribution guidelines](https://eigen.tuxfamily.org/index.php?title=Contributing_to_Eigen) and [git page](https://eigen.tuxfamily.org/index.php?title=Git), which will help you submit a more standardized MR.
 Before submitting the MR, you also need to complete the following checks:
 - Make one PR per feature/bugfix (don't mix multiple changes into one PR). Avoid committing unrelated changes.
 - Rebase before committing
 - For code changes, run the test suite (at least the tests that are likely affected by the change).
  See our [test guidelines](https://eigen.tuxfamily.org/index.php?title=Tests).
 - If possible, add a test (both for bug-fixes as well as new features)
 - Make sure new features are documented
 Note that we are a team of volunteers; we appreciate your patience during the review process.
 Again, thanks for contributing! -->
 ### Reference issue
 <!-- You can link to a specific issue using the gitlab syntax #<issue number>  -->
 ### What does this implement/fix?
 <!--Please explain your changes.-->
 ### Additional information
 <!--Any additional information you think is important.-->
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,38 +1,101 @@
 cmake_minimum_required(VERSION 3.10.0)
 #==============================================================================
 # CMake Policy issues.
 #==============================================================================
 # Allow overriding options in a parent project via `set` before including Eigen.
 if (POLICY CMP0077)
  cmake_policy (SET CMP0077 NEW)
 endif (POLICY CMP0077)
 # NOTE Remove setting the policy once the minimum required CMake version is
 # increased to at least 3.15. Retain enabling the export to package registry.
 if (POLICY CMP0090)
  # The export command does not populate package registry by default
  cmake_policy (SET CMP0090 NEW)
  # Unless otherwise specified, always export to package registry to ensure
  # backwards compatibility.
  if (NOT DEFINED CMAKE_EXPORT_PACKAGE_REGISTRY)
    set (CMAKE_EXPORT_PACKAGE_REGISTRY ON)
  endif (NOT DEFINED CMAKE_EXPORT_PACKAGE_REGISTRY)
 endif (POLICY CMP0090)
 # Disable warning about find_package(CUDA).
 # CUDA language support is lacking for clang as the CUDA compiler
 # until at least cmake version 3.18.  Even then, there seem to be
 # issues on Windows+Ninja in passing build flags.  Continue using
 # the "old" way for now.
 if (POLICY CMP0146)
  cmake_policy(SET CMP0146 OLD)
 endif ()
 #==============================================================================
 # CMake Project.
 #==============================================================================
 project(Eigen3)
-cmake_minimum_required(VERSION 2.8.11)
+# Remove this block after bumping CMake to v3.21.0
-
+# PROJECT_IS_TOP_LEVEL is defined then by default
-# guard against in-source builds
+if(CMAKE_VERSION VERSION_LESS 3.21.0)
-
+  if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
-if(${CMAKE_SOURCE_DIR} STREQUAL ${CMAKE_BINARY_DIR})
+    set(PROJECT_IS_TOP_LEVEL ON)
-  message(FATAL_ERROR "In-source builds not allowed. Please make a new directory (called a build directory) and run CMake from there. You may need to remove CMakeCache.txt. ")
+  else()
    set(PROJECT_IS_TOP_LEVEL OFF)
  endif()
 endif()
 #==============================================================================
 # Build ON/OFF Settings.
 #==============================================================================
 # Determine if we should build tests.
 include(CMakeDependentOption)
 cmake_dependent_option(BUILD_TESTING "Enable creation of tests." ON "PROJECT_IS_TOP_LEVEL" OFF)
 option(EIGEN_BUILD_TESTING "Enable creation of Eigen tests." ${BUILD_TESTING})
 option(EIGEN_LEAVE_TEST_IN_ALL_TARGET "Leaves tests in the all target, needed by ctest for automatic building." OFF)
-# Alias Eigen_*_DIR to Eigen3_*_DIR:
+# Determine if we should build BLAS/LAPACK implementations.
-
+option(EIGEN_BUILD_BLAS "Toggles the building of the Eigen Blas library" ${PROJECT_IS_TOP_LEVEL})
-set(Eigen_SOURCE_DIR ${Eigen3_SOURCE_DIR})
+option(EIGEN_BUILD_LAPACK "Toggles the building of the included Eigen LAPACK library" ${PROJECT_IS_TOP_LEVEL})
-set(Eigen_BINARY_DIR ${Eigen3_BINARY_DIR})
+if (EIGEN_BUILD_BLAS OR EIGEN_BUILD_LAPACK)
-
+  # BLAS and LAPACK currently need a fortran compiler.
-# guard against bad build-type strings
+  include(CMakeDetermineFortranCompiler)
-
+  if (NOT CMAKE_Fortran_COMPILER)
-if (NOT CMAKE_BUILD_TYPE)
+    set(EIGEN_BUILD_BLAS OFF)
-  set(CMAKE_BUILD_TYPE "Release")
+    set(EIGEN_BUILD_LAPACK OFF)
  else()
    # Determine if we should build shared libraries for BLAS/LAPACK on this platform.
    get_cmake_property(EIGEN_BUILD_SHARED_LIBS TARGET_SUPPORTS_SHARED_LIBS)
  endif()
 endif()
-string(TOLOWER "${CMAKE_BUILD_TYPE}" cmake_build_type_tolower)
+option(EIGEN_BUILD_BTL "Build benchmark suite" OFF)
-if(    NOT cmake_build_type_tolower STREQUAL "debug"
+option(EIGEN_BUILD_SPBENCH "Build sparse benchmark suite" OFF)
-   AND NOT cmake_build_type_tolower STREQUAL "release"
+# Avoid building docs if included from another project.
-   AND NOT cmake_build_type_tolower STREQUAL "relwithdebinfo")
+# Building documentation requires creating and running executables on the host
-  message(FATAL_ERROR "Unknown build type \"${CMAKE_BUILD_TYPE}\". Allowed values are Debug, Release, RelWithDebInfo (case-insensitive).")
+# platform.  We shouldn't do this if cross-compiling.
 if (PROJECT_IS_TOP_LEVEL AND NOT CMAKE_CROSSCOMPILING)
  set(EIGEN_BUILD_DOC_DEFAULT ON)
 endif()
 option(EIGEN_BUILD_DOC "Enable creation of Eigen documentation" ${EIGEN_BUILD_DOC_DEFAULT})
 option(EIGEN_BUILD_DEMOS "Toggles the building of the Eigen demos" ${PROJECT_IS_TOP_LEVEL})
 # Disable pkgconfig only for native Windows builds
 if(NOT WIN32 OR NOT CMAKE_HOST_SYSTEM_NAME MATCHES Windows)
  option(EIGEN_BUILD_PKGCONFIG "Build pkg-config .pc file for Eigen" ${PROJECT_IS_TOP_LEVEL})
 endif()
 option(EIGEN_BUILD_CMAKE_PACKAGE "Enables the creation of EigenConfig.cmake and related files" ${PROJECT_IS_TOP_LEVEL})
 # Set EIGEN_IS_BUILDING_ when at least one buildable component (tests,
 # BLAS/LAPACK, benchmarks, docs or demos) is enabled.
 # The misspelled EIGEN_BUILT_BTL operand was dropped: no option of that name
 # is declared, and EIGEN_BUILD_BTL is already part of the condition.
 if (EIGEN_BUILD_TESTING OR EIGEN_BUILD_BLAS OR EIGEN_BUILD_LAPACK OR EIGEN_BUILD_BTL OR EIGEN_BUILD_SPBENCH OR EIGEN_BUILD_DOC OR EIGEN_BUILD_DEMOS)
  set(EIGEN_IS_BUILDING_ ON)
 endif()
 #==============================================================================
 # Version Info.
 #==============================================================================
-#############################################################################
+# Automatically parse the version number from header files.
 # retrieve version information                                               #
 #############################################################################
 # automatically parse the version number
 file(READ "${PROJECT_SOURCE_DIR}/Eigen/src/Core/util/Macros.h" _eigen_version_header)
 string(REGEX MATCH "define[ \t]+EIGEN_WORLD_VERSION[ \t]+([0-9]+)" _eigen_world_version_match "${_eigen_version_header}")
 set(EIGEN_WORLD_VERSION "${CMAKE_MATCH_1}")
@@ -42,36 +105,218 @@ string(REGEX MATCH "define[ \t]+EIGEN_MINOR_VERSION[ \t]+([0-9]+)" _eigen_minor_
 set(EIGEN_MINOR_VERSION "${CMAKE_MATCH_1}")
 set(EIGEN_VERSION_NUMBER ${EIGEN_WORLD_VERSION}.${EIGEN_MAJOR_VERSION}.${EIGEN_MINOR_VERSION})
-# if we are not in a mercurial clone
+# If we are in a git repo, extract a changeset.
-if(IS_DIRECTORY ${CMAKE_SOURCE_DIR}/.hg)
+if(IS_DIRECTORY ${CMAKE_SOURCE_DIR}/.git)
-  # if the mercurial program is absent or this will leave the EIGEN_HG_CHANGESET string empty,
+  # If the git program is absent, this will leave the EIGEN_GIT_REVNUM string empty,
  # but won't stop CMake.
-  execute_process(COMMAND hg tip -R ${CMAKE_SOURCE_DIR} OUTPUT_VARIABLE EIGEN_HGTIP_OUTPUT)
+  execute_process(COMMAND git ls-remote -q ${CMAKE_SOURCE_DIR} HEAD OUTPUT_VARIABLE EIGEN_GIT_OUTPUT)
  execute_process(COMMAND hg branch -R ${CMAKE_SOURCE_DIR} OUTPUT_VARIABLE EIGEN_BRANCH_OUTPUT)
 endif()
-# if this is the default (aka development) branch, extract the mercurial changeset number from the hg tip output...
+# extract the git rev number from the git output...
-if(EIGEN_BRANCH_OUTPUT MATCHES "default")
+if(EIGEN_GIT_OUTPUT)
-string(REGEX MATCH "^changeset: *[0-9]*:([0-9;a-f]+).*" EIGEN_HG_CHANGESET_MATCH "${EIGEN_HGTIP_OUTPUT}")
+string(REGEX MATCH "^([0-9;a-f]+).*" EIGEN_GIT_CHANGESET_MATCH "${EIGEN_GIT_OUTPUT}")
-set(EIGEN_HG_CHANGESET "${CMAKE_MATCH_1}")
+set(EIGEN_GIT_REVNUM "${CMAKE_MATCH_1}")
 endif()
 #...and show it next to the version number
-if(EIGEN_HG_CHANGESET)
+if(EIGEN_GIT_REVNUM)
-  set(EIGEN_VERSION "${EIGEN_VERSION_NUMBER} (mercurial changeset ${EIGEN_HG_CHANGESET})")
+  set(EIGEN_VERSION "${EIGEN_VERSION_NUMBER} (git rev ${EIGEN_GIT_REVNUM})")
 else()
  set(EIGEN_VERSION "${EIGEN_VERSION_NUMBER}")
 endif()
 #==============================================================================
 # Install Path Configuration.
 #==============================================================================
 # Unconditionally allow install of targets to support nested dependency
 # installations.
 #
 # Note: projects that depend on Eigen should _probably_ exclude installing
 # Eigen by default (e.g. by using EXCLUDE_FROM_ALL when using
 # FetchContent_Declare or add_subdirectory) to avoid overwriting a previous
 # installation.
 include(CheckCXXCompilerFlag)
 include(GNUInstallDirs)
 # Backward compatibility support for EIGEN_INCLUDE_INSTALL_DIR
 if(EIGEN_INCLUDE_INSTALL_DIR)
  message(WARNING "EIGEN_INCLUDE_INSTALL_DIR is deprecated. Use INCLUDE_INSTALL_DIR instead.")
 endif()
 if(EIGEN_INCLUDE_INSTALL_DIR AND NOT INCLUDE_INSTALL_DIR)
  set(INCLUDE_INSTALL_DIR ${EIGEN_INCLUDE_INSTALL_DIR}
      CACHE PATH "The directory relative to CMAKE_INSTALL_PREFIX where Eigen header files are installed")
 else()
  set(INCLUDE_INSTALL_DIR
      "${CMAKE_INSTALL_INCLUDEDIR}/eigen3"
      CACHE PATH "The directory relative to CMAKE_INSTALL_PREFIX where Eigen header files are installed"
      )
 endif()
 set(CMAKEPACKAGE_INSTALL_DIR
    "${CMAKE_INSTALL_DATADIR}/eigen3/cmake"
    CACHE PATH "The directory relative to CMAKE_INSTALL_PREFIX where Eigen3Config.cmake is installed"
    )
 set(PKGCONFIG_INSTALL_DIR
    "${CMAKE_INSTALL_DATADIR}/pkgconfig"
    CACHE PATH "The directory relative to CMAKE_INSTALL_PREFIX where eigen3.pc is installed"
    )
 foreach(var INCLUDE_INSTALL_DIR CMAKEPACKAGE_INSTALL_DIR PKGCONFIG_INSTALL_DIR)
  # If an absolute path is specified, make it relative to "{CMAKE_INSTALL_PREFIX}".
  if(IS_ABSOLUTE "${${var}}")
    file(RELATIVE_PATH "${var}" "${CMAKE_INSTALL_PREFIX}" "${${var}}")
  endif()
 endforeach()
 #==============================================================================
 # Eigen Library.
 #==============================================================================
 set ( EIGEN_VERSION_STRING ${EIGEN_VERSION_NUMBER} )
 set ( EIGEN_VERSION_MAJOR  ${EIGEN_WORLD_VERSION} )
 set ( EIGEN_VERSION_MINOR  ${EIGEN_MAJOR_VERSION} )
 set ( EIGEN_VERSION_PATCH  ${EIGEN_MINOR_VERSION} )
 set ( EIGEN_DEFINITIONS "")
 set ( EIGEN_INCLUDE_DIR "${CMAKE_INSTALL_PREFIX}/${INCLUDE_INSTALL_DIR}" )
 set ( EIGEN_ROOT_DIR ${CMAKE_INSTALL_PREFIX} )
 # Alias Eigen_*_DIR to Eigen3_*_DIR:
 set(Eigen_SOURCE_DIR ${Eigen3_SOURCE_DIR})
 set(Eigen_BINARY_DIR ${Eigen3_BINARY_DIR})
 # Imported target support
 add_library (eigen INTERFACE)
 add_library (Eigen3::Eigen ALIAS eigen)
 target_include_directories (eigen INTERFACE
  $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
  $<INSTALL_INTERFACE:${INCLUDE_INSTALL_DIR}>
 )
 # Export as title case Eigen
 set_target_properties (eigen PROPERTIES EXPORT_NAME Eigen)
 #==============================================================================
 # Install Rule Configuration.
 #==============================================================================
 install(FILES
  signature_of_eigen3_matrix_library
  DESTINATION ${INCLUDE_INSTALL_DIR} COMPONENT Devel
  )
 if(EIGEN_BUILD_PKGCONFIG)
    configure_file(eigen3.pc.in eigen3.pc @ONLY)
    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/eigen3.pc
        DESTINATION ${PKGCONFIG_INSTALL_DIR})
 endif()
 install(DIRECTORY Eigen DESTINATION ${INCLUDE_INSTALL_DIR} COMPONENT Devel)
 install(TARGETS eigen EXPORT Eigen3Targets)
 if(EIGEN_BUILD_CMAKE_PACKAGE)
  include (CMakePackageConfigHelpers)
  configure_package_config_file (
    ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Eigen3Config.cmake.in
    ${CMAKE_CURRENT_BINARY_DIR}/Eigen3Config.cmake
    PATH_VARS EIGEN_INCLUDE_DIR EIGEN_ROOT_DIR
    INSTALL_DESTINATION ${CMAKEPACKAGE_INSTALL_DIR}
    NO_SET_AND_CHECK_MACRO # Eigen does not provide legacy style defines
    NO_CHECK_REQUIRED_COMPONENTS_MACRO # Eigen does not provide components
  )
  set(CVF_VERSION "${EIGEN_VERSION_NUMBER}")
  configure_file("${CMAKE_CURRENT_SOURCE_DIR}/cmake/Eigen3ConfigVersion.cmake.in"
                 "Eigen3ConfigVersion.cmake"
                 @ONLY)
  # The Eigen target will be located in the Eigen3 namespace. Other CMake
  # targets can refer to it using Eigen3::Eigen.
  export (TARGETS eigen NAMESPACE Eigen3:: FILE Eigen3Targets.cmake)
  # Export Eigen3 package to CMake registry such that it can be easily found by
  # CMake even if it has not been installed to a standard directory.
  export (PACKAGE Eigen3)
  install (EXPORT Eigen3Targets NAMESPACE Eigen3:: DESTINATION ${CMAKEPACKAGE_INSTALL_DIR})
  install (FILES ${CMAKE_CURRENT_SOURCE_DIR}/cmake/UseEigen3.cmake
                ${CMAKE_CURRENT_BINARY_DIR}/Eigen3Config.cmake
                ${CMAKE_CURRENT_BINARY_DIR}/Eigen3ConfigVersion.cmake
          DESTINATION ${CMAKEPACKAGE_INSTALL_DIR})
  # Add uninstall target
  if(NOT TARGET uninstall)
    add_custom_target ( uninstall
        COMMAND ${CMAKE_COMMAND} -P ${CMAKE_CURRENT_SOURCE_DIR}/cmake/EigenUninstall.cmake)
  endif()
 endif()
 #==============================================================================
 # General Build Configuration.
 #==============================================================================
 # Guard against in-source builds
 if(${CMAKE_SOURCE_DIR} STREQUAL ${CMAKE_BINARY_DIR})
  message(FATAL_ERROR "In-source builds not allowed. Please make a new directory (called a build directory) and run CMake from there. You may need to remove CMakeCache.txt. ")
 endif()
 # Guard against bad build-type strings
 if (PROJECT_IS_TOP_LEVEL AND NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE "Release")
 endif()
 # Only try to figure out how to link the math library if we are building something.
 # Otherwise, let the parent project deal with dependencies.
 if (EIGEN_IS_BUILDING_)
  # Use Eigen's cmake files.
  set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)
  set(CMAKE_INCLUDE_CURRENT_DIR OFF)
-option(EIGEN_TEST_CXX11 "Enable testing with C++11 and C++11 features (e.g. Tensor module)." OFF)
+  find_package(StandardMathLibrary)
  set(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO "")
  if(NOT STANDARD_MATH_LIBRARY_FOUND)
    message(FATAL_ERROR
      "Can't link to the standard math library. Please report to the Eigen developers, telling them about your platform.")
  else()
    if(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO)
      set(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO "${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO} ${STANDARD_MATH_LIBRARY}")
    else()
      set(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO "${STANDARD_MATH_LIBRARY}")
    endif()
  endif()
  if(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO)
    message(STATUS "Standard libraries to link to explicitly: ${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO}")
  else()
    message(STATUS "Standard libraries to link to explicitly: none")
  endif()
  # Default tests/examples/libraries to row-major.
  option(EIGEN_DEFAULT_TO_ROW_MAJOR "Use row-major as default matrix storage order" OFF)
  if(EIGEN_DEFAULT_TO_ROW_MAJOR)
    add_definitions("-DEIGEN_DEFAULT_TO_ROW_MAJOR")
  endif()
 endif()
 #==============================================================================
 # Test Configuration.
 #==============================================================================
 if (EIGEN_BUILD_TESTING)
  # ei_maybe_separate_arguments(<variable> <mode> <args>)
  #
  # Stores <args> in <variable> in the caller's scope, splitting it first when
  # it looks like a single space-separated string.
  #
  #   variable - name of the output variable, set with PARENT_SCOPE.
  #   mode     - mode keyword forwarded unchanged to separate_arguments().
  #   args     - the value to normalize; either a CMake list or one string.
  function(ei_maybe_separate_arguments variable mode args)
    # Use separate_arguments if the input is a single string containing a space.
    # Otherwise, if it is already a list or doesn't have a space, just propagate
    # the original value.  This is to better support multi-argument lists.
    list(LENGTH args list_length)
    if (${list_length} EQUAL 1)
      # string(FIND) yields -1 when no space is present.
      string(FIND "${args}" " " has_space)
      if (${has_space} GREATER -1)
        separate_arguments(args ${mode} "${args}")
      endif()
    endif()
    # Hand the (possibly split) value back to the caller.
    set(${variable} ${args} PARENT_SCOPE)
  endfunction(ei_maybe_separate_arguments)
  include(CheckCXXCompilerFlag)
  macro(ei_add_cxx_compiler_flag FLAG)
    string(REGEX REPLACE "-" "" SFLAG1 ${FLAG})
    string(REGEX REPLACE "\\+" "p" SFLAG ${SFLAG1})
@@ -83,6 +328,7 @@ endmacro()
  check_cxx_compiler_flag("-std=c++11" EIGEN_COMPILER_SUPPORT_CPP11)
  option(EIGEN_TEST_CXX11 "Enable testing with C++11 and C++11 features (e.g. Tensor module)." ${EIGEN_COMPILER_SUPPORT_CPP11})
  if(EIGEN_TEST_CXX11)
    set(CMAKE_CXX_STANDARD 11)
    set(CMAKE_CXX_EXTENSIONS OFF)
@@ -90,62 +336,19 @@ if(EIGEN_TEST_CXX11)
      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
    endif()
  else()
  #set(CMAKE_CXX_STANDARD 03)
  #set(CMAKE_CXX_EXTENSIONS OFF)
    ei_add_cxx_compiler_flag("-std=c++03")
  endif()
 #############################################################################
 # find how to link to the standard libraries                                #
 #############################################################################
 find_package(StandardMathLibrary)
  set(EIGEN_TEST_CUSTOM_LINKER_FLAGS  "" CACHE STRING "Additional linker flags when linking unit tests.")
  set(EIGEN_TEST_CUSTOM_CXX_FLAGS     "" CACHE STRING "Additional compiler flags when compiling unit tests.")
-
+  # Convert space-separated arguments into CMake lists for downstream consumption.
-set(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO "")
+  ei_maybe_separate_arguments(EIGEN_TEST_CUSTOM_LINKER_FLAGS NATIVE_COMMAND "${EIGEN_TEST_CUSTOM_LINKER_FLAGS}")
-
+  ei_maybe_separate_arguments(EIGEN_TEST_CUSTOM_CXX_FLAGS NATIVE_COMMAND "${EIGEN_TEST_CUSTOM_CXX_FLAGS}")
 if(NOT STANDARD_MATH_LIBRARY_FOUND)
  message(FATAL_ERROR
    "Can't link to the standard math library. Please report to the Eigen developers, telling them about your platform.")
 else()
  if(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO)
    set(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO "${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO} ${STANDARD_MATH_LIBRARY}")
  else()
    set(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO "${STANDARD_MATH_LIBRARY}")
  endif()
 endif()
 if(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO)
  message(STATUS "Standard libraries to link to explicitly: ${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO}")
 else()
  message(STATUS "Standard libraries to link to explicitly: none")
 endif()
 option(EIGEN_BUILD_BTL "Build benchmark suite" OFF)
 # Disable pkgconfig only for native Windows builds
 if(NOT WIN32 OR NOT CMAKE_HOST_SYSTEM_NAME MATCHES Windows)
  option(EIGEN_BUILD_PKGCONFIG "Build pkg-config .pc file for Eigen" ON)
 endif()
 set(CMAKE_INCLUDE_CURRENT_DIR OFF)
  option(EIGEN_SPLIT_LARGE_TESTS "Split large tests into smaller executables" ON)
 option(EIGEN_DEFAULT_TO_ROW_MAJOR "Use row-major as default matrix storage order" OFF)
 if(EIGEN_DEFAULT_TO_ROW_MAJOR)
  add_definitions("-DEIGEN_DEFAULT_TO_ROW_MAJOR")
 endif()
  set(EIGEN_TEST_MAX_SIZE "320" CACHE STRING "Maximal matrix/vector size, default is 320")
  # Flags for tests.
  if(NOT MSVC)
    # We assume that other compilers are partly compatible with GNUCC
@@ -159,7 +362,6 @@ if(NOT MSVC)
    ei_add_cxx_compiler_flag("-Wall")
    ei_add_cxx_compiler_flag("-Wextra")
    # ei_add_cxx_compiler_flag("-Weverything") # clang
    ei_add_cxx_compiler_flag("-Wundef")
    ei_add_cxx_compiler_flag("-Wcast-align")
    ei_add_cxx_compiler_flag("-Wchar-subscripts")
@@ -174,31 +376,16 @@ if(NOT MSVC)
    ei_add_cxx_compiler_flag("-Wc++11-extensions")
    ei_add_cxx_compiler_flag("-Wdouble-promotion")
    # ei_add_cxx_compiler_flag("-Wconversion")
    ei_add_cxx_compiler_flag("-Wshadow")
    ei_add_cxx_compiler_flag("-Wno-psabi")
    ei_add_cxx_compiler_flag("-Wno-variadic-macros")
    ei_add_cxx_compiler_flag("-Wno-long-long")
    ei_add_cxx_compiler_flag("-fno-check-new")
    ei_add_cxx_compiler_flag("-fno-common")
    ei_add_cxx_compiler_flag("-fstrict-aliasing")
    ei_add_cxx_compiler_flag("-wd981")                    # disable ICC's "operands are evaluated in unspecified order" remark
    ei_add_cxx_compiler_flag("-wd2304")                   # disable ICC's "warning #2304: non-explicit constructor with single argument may cause implicit type conversion" produced by -Wnon-virtual-dtor
  # The -ansi flag must be added last; otherwise it is also used as a linker flag by check_cxx_compiler_flag, making it fail.
  # Moreover we should not set both -strict-ansi and -ansi
  check_cxx_compiler_flag("-strict-ansi" COMPILER_SUPPORT_STRICTANSI)
  ei_add_cxx_compiler_flag("-Qunused-arguments")        # disable clang warning: argument unused during compilation: '-ansi'
  if(COMPILER_SUPPORT_STRICTANSI)
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -strict-ansi")
  else()
    ei_add_cxx_compiler_flag("-ansi")
  endif()
    if(ANDROID_NDK)
      ei_add_cxx_compiler_flag("-pie")
      ei_add_cxx_compiler_flag("-fPIE")
@@ -248,15 +435,30 @@ if(NOT MSVC)
      message(STATUS "Enabling FMA in tests/examples")
    endif()
    option(EIGEN_TEST_AVX2 "Enable/Disable AVX2 in tests/examples" OFF)
    if(EIGEN_TEST_AVX2)
      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx2 -mfma")
      message(STATUS "Enabling AVX2 in tests/examples")
    endif()
    option(EIGEN_TEST_AVX512 "Enable/Disable AVX512 in tests/examples" OFF)
    if(EIGEN_TEST_AVX512)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512f -mfma -DEIGEN_ENABLE_AVX512")
+      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512f -mfma")
    if (NOT "${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fabi-version=6")
    endif()
      message(STATUS "Enabling AVX512 in tests/examples")
    endif()
    option(EIGEN_TEST_AVX512DQ "Enable/Disable AVX512DQ in tests/examples" OFF)
    if(EIGEN_TEST_AVX512DQ)
      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512dq -mfma")
      message(STATUS "Enabling AVX512DQ in tests/examples")
    endif()
    option(EIGEN_TEST_AVX512FP16 "Enable/Disable AVX512-FP16 in tests/examples" OFF)
    if(EIGEN_TEST_AVX512FP16)
      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512f -mfma -mavx512vl -mavx512fp16")
    message(STATUS "Enabling AVX512-FP16 in tests/examples")
    endif()
    option(EIGEN_TEST_F16C "Enable/Disable F16C in tests/examples" OFF)
    if(EIGEN_TEST_F16C)
      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mf16c")
@@ -320,7 +522,6 @@ if(NOT MSVC)
    endif()
  else()
    # C4127 - conditional expression is constant
    # C4714 - marked as __forceinline not inlined (I failed to deactivate it selectively)
    #         We can disable this warning in the unit tests since it is clear that it occurs
@@ -358,13 +559,21 @@ else()
    endif()
    option(EIGEN_TEST_FMA "Enable/Disable FMA/AVX2 in tests/examples" OFF)
-  if(EIGEN_TEST_FMA AND NOT EIGEN_TEST_NEON)
+    option(EIGEN_TEST_AVX2 "Enable/Disable FMA/AVX2 in tests/examples" OFF)
    if((EIGEN_TEST_FMA AND NOT EIGEN_TEST_NEON) OR EIGEN_TEST_AVX2)
      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2")
      message(STATUS "Enabling FMA/AVX2 in tests/examples")
    endif()
    option(EIGEN_TEST_AVX512 "Enable/Disable AVX512 in tests/examples" OFF)
    option(EIGEN_TEST_AVX512DQ "Enable/Disable AVX512DQ in tests/examples" OFF)
    if(EIGEN_TEST_AVX512 OR EIGEN_TEST_AVX512DQ)
      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX512")
      message(STATUS "Enabling AVX512 in tests/examples")
    endif()
  endif(NOT MSVC)
  option(EIGEN_TEST_NO_EXPLICIT_VECTORIZATION "Disable explicit vectorization in tests/examples" OFF)
  option(EIGEN_TEST_X87 "Force using X87 instructions. Implies no vectorization." OFF)
  option(EIGEN_TEST_32BIT "Force generating 32bit code." OFF)
@@ -405,91 +614,38 @@ if(EIGEN_TEST_NO_EXCEPTIONS)
    message(STATUS "Disabling exceptions in tests/examples")
  endif()
-set(EIGEN_CUDA_COMPUTE_ARCH 30 CACHE STRING "The CUDA compute architecture level to target when compiling CUDA code")
+  set(EIGEN_CUDA_CXX_FLAGS "" CACHE STRING "Additional flags to pass to the cuda compiler.")
  set(EIGEN_CUDA_COMPUTE_ARCH 30 CACHE STRING "The CUDA compute architecture(s) to target when compiling CUDA code")
 include_directories(${CMAKE_CURRENT_SOURCE_DIR})
 # Backward compatibility support for EIGEN_INCLUDE_INSTALL_DIR
 if(EIGEN_INCLUDE_INSTALL_DIR)
  message(WARNING "EIGEN_INCLUDE_INSTALL_DIR is deprecated. Use INCLUDE_INSTALL_DIR instead.")
 endif()
 if(EIGEN_INCLUDE_INSTALL_DIR AND NOT INCLUDE_INSTALL_DIR)
  set(INCLUDE_INSTALL_DIR ${EIGEN_INCLUDE_INSTALL_DIR}
      CACHE PATH "The directory relative to CMAKE_PREFIX_PATH where Eigen header files are installed")
 else()
  set(INCLUDE_INSTALL_DIR
      "${CMAKE_INSTALL_INCLUDEDIR}/eigen3"
      CACHE PATH "The directory relative to CMAKE_PREFIX_PATH where Eigen header files are installed"
      )
 endif()
 set(CMAKEPACKAGE_INSTALL_DIR
    "${CMAKE_INSTALL_DATADIR}/eigen3/cmake"
    CACHE PATH "The directory relative to CMAKE_PREFIX_PATH where Eigen3Config.cmake is installed"
    )
 set(PKGCONFIG_INSTALL_DIR
    "${CMAKE_INSTALL_DATADIR}/pkgconfig"
    CACHE PATH "The directory relative to CMAKE_PREFIX_PATH where eigen3.pc is installed"
    )
 # ei_add_target_property(<target> <prop> <value>)
 #
 # Similar to set_target_properties, but appends <value> to the property <prop>
 # of <target> instead of overwriting it. The previous and new values are joined
 # with a single space, so a property that was not set before ends up as
 # " <value>" (with a leading space).
 # NOTE: this is a macro, not a function, so the helper variable `previous` is
 # set in the caller's scope.
 macro(ei_add_target_property target prop value)
  get_target_property(previous ${target} ${prop})
  # If the property wasn't previously set, ${previous} is now "previous-NOTFOUND",
  # which CMake lets us catch with a plain if(); start from an empty string then.
  if(NOT previous)
    set(previous "")
  endif()
  set_target_properties(${target} PROPERTIES ${prop} "${previous} ${value}")
 endmacro()
 install(FILES
  signature_of_eigen3_matrix_library
  DESTINATION ${INCLUDE_INSTALL_DIR} COMPONENT Devel
  )
 if(EIGEN_BUILD_PKGCONFIG)
    configure_file(eigen3.pc.in eigen3.pc @ONLY)
    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/eigen3.pc
        DESTINATION ${PKGCONFIG_INSTALL_DIR}
        )
 endif()
 install(DIRECTORY Eigen DESTINATION ${INCLUDE_INSTALL_DIR} COMPONENT Devel)
 add_subdirectory(doc EXCLUDE_FROM_ALL)
 option(BUILD_TESTING "Enable creation of Eigen tests." ON)
 if(BUILD_TESTING)
  include(EigenConfigureTesting)
  if(EIGEN_LEAVE_TEST_IN_ALL_TARGET)
    add_subdirectory(test) # can't do EXCLUDE_FROM_ALL here, breaks CTest
  else()
    add_subdirectory(test EXCLUDE_FROM_ALL)
  endif()
  add_subdirectory(failtest)
 endif()
 if(EIGEN_LEAVE_TEST_IN_ALL_TARGET)
  add_subdirectory(blas)
  add_subdirectory(lapack)
 else()
  add_subdirectory(blas EXCLUDE_FROM_ALL)
  add_subdirectory(lapack EXCLUDE_FROM_ALL)
 endif()
 # add SYCL
  option(EIGEN_TEST_SYCL "Add Sycl support." OFF)
 option(EIGEN_SYCL_TRISYCL "Use the triSYCL Sycl implementation (ComputeCPP by default)." OFF)
  if(EIGEN_TEST_SYCL)
    option(EIGEN_SYCL_DPCPP "Use the DPCPP Sycl implementation (DPCPP is default SYCL-Compiler)." ON)
    option(EIGEN_SYCL_TRISYCL "Use the triSYCL Sycl implementation." OFF)
    option(EIGEN_SYCL_ComputeCpp "Use the ComputeCPP Sycl implementation." OFF)
    # Building options
    # https://developer.codeplay.com/products/computecpp/ce/2.11.0/guides/eigen-overview/options-for-building-eigen-sycl
    option(EIGEN_SYCL_USE_DEFAULT_SELECTOR "Use sycl default selector to select the preferred device." OFF)
    option(EIGEN_SYCL_NO_LOCAL_MEM "Build for devices without dedicated shared memory." OFF)
    option(EIGEN_SYCL_LOCAL_MEM "Allow the use of local memory (enabled by default)." ON)
    # Work-group sizes are integers, not booleans. option() declares a boolean
    # (ON/OFF) cache entry, so a default of 16 is not a valid use of it; declare
    # these as STRING cache variables instead. They still default to 16 and can
    # still be overridden with -DEIGEN_SYCL_LOCAL_THREAD_DIM<N>=<size>.
    set(EIGEN_SYCL_LOCAL_THREAD_DIM0 16 CACHE STRING "Set work group size for dimension 0.")
    set(EIGEN_SYCL_LOCAL_THREAD_DIM1 16 CACHE STRING "Set work group size for dimension 1.")
    option(EIGEN_SYCL_ASYNC_EXECUTION "Allow asynchronous execution (enabled by default)." ON)
    option(EIGEN_SYCL_DISABLE_SKINNY "Disable optimization for tall/skinny matrices." OFF)
    option(EIGEN_SYCL_DISABLE_DOUBLE_BUFFER "Disable double buffer." OFF)
    option(EIGEN_SYCL_DISABLE_SCALAR "Disable scalar contraction." OFF)
    option(EIGEN_SYCL_DISABLE_GEMV "Disable GEMV and create a single kernel to calculate contraction instead." OFF)
    set(EIGEN_SYCL ON)
    set(CMAKE_CXX_STANDARD 17)
    set(CMAKE_CXX_FLAGS  "${CMAKE_CXX_FLAGS} -Wno-deprecated-declarations -Wno-shorten-64-to-32 -Wno-cast-align")
    set(CMAKE_CXX_FLAGS  "${CMAKE_CXX_FLAGS} -Wno-deprecated-copy-with-user-provided-copy -Wno-unused-variable")
    set (CMAKE_MODULE_PATH "${CMAKE_ROOT}/Modules" "cmake/Modules/" "${CMAKE_MODULE_PATH}")
    find_package(Threads REQUIRED)
    if(EIGEN_SYCL_TRISYCL)
      message(STATUS "Using triSYCL")
      include(FindTriSYCL)
-  else()
+    elseif(EIGEN_SYCL_ComputeCpp)
      message(STATUS "Using ComputeCPP SYCL")
      include(FindComputeCpp)
      set(COMPUTECPP_DRIVER_DEFAULT_VALUE OFF)
@@ -500,8 +656,12 @@ if(EIGEN_TEST_SYCL)
              "Use ComputeCpp driver instead of a 2 steps compilation"
              ${COMPUTECPP_DRIVER_DEFAULT_VALUE}
              )
    else() #Default SYCL compiler is DPCPP (EIGEN_SYCL_DPCPP)
      set(DPCPP_SYCL_TARGET "spir64" CACHE STRING "Default target for Intel CPU/GPU")
      message(STATUS "Using DPCPP")
      find_package(DPCPP)
      add_definitions(-DSYCL_COMPILER_IS_DPCPP)
    endif(EIGEN_SYCL_TRISYCL)
  option(EIGEN_DONT_VECTORIZE_SYCL "Don't use vectorisation in the SYCL tests." OFF)
    if(EIGEN_DONT_VECTORIZE_SYCL)
      message(STATUS "Disabling SYCL vectorization in tests/examples")
      # When disabling SYCL vectorization, also disable Eigen default vectorization
@@ -510,39 +670,75 @@ if(EIGEN_TEST_SYCL)
    endif()
  endif()
  include(EigenConfigureTesting)
  if(EIGEN_LEAVE_TEST_IN_ALL_TARGET)
    # CTest automatic test building relies on the "all" target.
    add_subdirectory(test)
    add_subdirectory(failtest)
  else()
    add_subdirectory(test EXCLUDE_FROM_ALL)
    add_subdirectory(failtest EXCLUDE_FROM_ALL)
  endif()
  ei_testing_print_summary()
  if (EIGEN_SPLIT_TESTSUITE)
    ei_split_testsuite("${EIGEN_SPLIT_TESTSUITE}")
  endif()
 endif(EIGEN_BUILD_TESTING)
 #==============================================================================
 # Other Build Configurations.
 #==============================================================================
 add_subdirectory(unsupported)
-add_subdirectory(demos EXCLUDE_FROM_ALL)
+if(EIGEN_BUILD_BLAS)
  add_subdirectory(blas)
 endif()
-# must be after test and unsupported, for configuring buildtests.in
+if (EIGEN_BUILD_LAPACK)
-add_subdirectory(scripts EXCLUDE_FROM_ALL)
+  add_subdirectory(lapack)
 endif()
 if(EIGEN_BUILD_DOC)
  add_subdirectory(doc EXCLUDE_FROM_ALL)
 endif()
 # TODO: consider also replacing EIGEN_BUILD_BTL by a custom target "make btl"?
 if(EIGEN_BUILD_BTL)
  add_subdirectory(bench/btl EXCLUDE_FROM_ALL)
 endif()
-if(NOT WIN32)
+if(NOT WIN32 AND EIGEN_BUILD_SPBENCH)
  add_subdirectory(bench/spbench EXCLUDE_FROM_ALL)
 endif()
-configure_file(scripts/cdashtesting.cmake.in cdashtesting.cmake @ONLY)
+if (EIGEN_BUILD_DEMOS)
-
+  add_subdirectory(demos EXCLUDE_FROM_ALL)
 if(BUILD_TESTING)
  ei_testing_print_summary()
 endif()
-message(STATUS "")
+if (PROJECT_IS_TOP_LEVEL)
-message(STATUS "Configured Eigen ${EIGEN_VERSION_NUMBER}")
+  # must be after test and unsupported, for configuring buildtests.in
-message(STATUS "")
+  add_subdirectory(scripts EXCLUDE_FROM_ALL)
  configure_file(scripts/cdashtesting.cmake.in cdashtesting.cmake @ONLY)
 endif()
 #==============================================================================
 # Summary.
 #==============================================================================
 if(PROJECT_IS_TOP_LEVEL)
  string(TOLOWER "${CMAKE_GENERATOR}" cmake_generator_tolower)
  if(cmake_generator_tolower MATCHES "makefile")
-  message(STATUS "Some things you can do now:")
+    message(STATUS "Available targets (use: make TARGET):")
-  message(STATUS "--------------+--------------------------------------------------------------")
+  else()
-  message(STATUS "Command       |   Description")
+    message(STATUS "Available targets (use: cmake --build . --target TARGET):")
-  message(STATUS "--------------+--------------------------------------------------------------")
+  endif()
-  message(STATUS "make install  | Install Eigen. Headers will be installed to:")
+  message(STATUS "---------+--------------------------------------------------------------")
  message(STATUS "Target   |   Description")
  message(STATUS "---------+--------------------------------------------------------------")
  message(STATUS "install  | Install Eigen. Headers will be installed to:")
  message(STATUS "         |     <CMAKE_INSTALL_PREFIX>/<INCLUDE_INSTALL_DIR>")
  message(STATUS "         |   Using the following values:")
  message(STATUS "         |     CMAKE_INSTALL_PREFIX: ${CMAKE_INSTALL_PREFIX}")
@@ -551,112 +747,24 @@ if(cmake_generator_tolower MATCHES "makefile")
  message(STATUS "         |     cmake . -DCMAKE_INSTALL_PREFIX=yourprefix")
  message(STATUS "         |   Or:")
  message(STATUS "         |     cmake . -DINCLUDE_INSTALL_DIR=yourdir")
-  message(STATUS "make doc      | Generate the API documentation, requires Doxygen & LaTeX")
+  message(STATUS "uninstall| Remove files installed by the install target")
-  if(BUILD_TESTING)
+  if (EIGEN_BUILD_DOC)
-    message(STATUS "make check    | Build and run the unit-tests. Read this page:")
+    message(STATUS "doc      | Generate the API documentation, requires Doxygen & LaTeX")
  endif()
  if(EIGEN_BUILD_TESTING)
    message(STATUS "check    | Build and run the unit-tests. Read this page:")
    message(STATUS "         |   http://eigen.tuxfamily.org/index.php?title=Tests")
  endif()
-  message(STATUS "make blas     | Build BLAS library (not the same thing as Eigen)")
+  if (EIGEN_BUILD_BLAS)
-  message(STATUS "make uninstall| Removes files installed by make install")
+    message(STATUS "blas     | Build BLAS library (not the same thing as Eigen)")
-  message(STATUS "--------------+--------------------------------------------------------------")
+  endif()
-else()
+  if (EIGEN_BUILD_LAPACK)
-  message(STATUS "To build/run the unit tests, read this page:")
+    message(STATUS "lapack   | Build LAPACK subset library (not the same thing as Eigen)")
-  message(STATUS "  http://eigen.tuxfamily.org/index.php?title=Tests")
+  endif()
  message(STATUS "---------+--------------------------------------------------------------")
  message(STATUS "")
 endif()
 message(STATUS "")
-
+message(STATUS "Configured Eigen ${EIGEN_VERSION_NUMBER}")
-
+message(STATUS "")
 set ( EIGEN_VERSION_STRING ${EIGEN_VERSION_NUMBER} )
 set ( EIGEN_VERSION_MAJOR  ${EIGEN_WORLD_VERSION} )
 set ( EIGEN_VERSION_MINOR  ${EIGEN_MAJOR_VERSION} )
 set ( EIGEN_VERSION_PATCH  ${EIGEN_MINOR_VERSION} )
 set ( EIGEN_DEFINITIONS "")
 set ( EIGEN_INCLUDE_DIR "${CMAKE_INSTALL_PREFIX}/${INCLUDE_INSTALL_DIR}" )
 set ( EIGEN_ROOT_DIR ${CMAKE_INSTALL_PREFIX} )
 # Interface libraries require at least CMake 3.0
 if (NOT CMAKE_VERSION VERSION_LESS 3.0)
  include (CMakePackageConfigHelpers)
  # Imported target support
  add_library (eigen INTERFACE)
  add_library (Eigen3::Eigen ALIAS eigen)
  target_compile_definitions (eigen INTERFACE ${EIGEN_DEFINITIONS})
  target_include_directories (eigen INTERFACE
    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
    $<INSTALL_INTERFACE:${INCLUDE_INSTALL_DIR}>
  )
  # Export as title case Eigen
  set_target_properties (eigen PROPERTIES EXPORT_NAME Eigen)
  install (TARGETS eigen EXPORT Eigen3Targets)
  configure_package_config_file (
    ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Eigen3Config.cmake.in
    ${CMAKE_CURRENT_BINARY_DIR}/Eigen3Config.cmake
    PATH_VARS EIGEN_INCLUDE_DIR EIGEN_ROOT_DIR
    INSTALL_DESTINATION ${CMAKEPACKAGE_INSTALL_DIR}
    NO_CHECK_REQUIRED_COMPONENTS_MACRO # Eigen does not provide components
  )
  # Remove CMAKE_SIZEOF_VOID_P from Eigen3ConfigVersion.cmake since Eigen does
  # not depend on architecture specific settings or libraries. More
  # specifically, an Eigen3Config.cmake generated from a 64 bit target can be
  # used for 32 bit targets as well (and vice versa).
  set (_Eigen3_CMAKE_SIZEOF_VOID_P ${CMAKE_SIZEOF_VOID_P})
  unset (CMAKE_SIZEOF_VOID_P)
  write_basic_package_version_file (Eigen3ConfigVersion.cmake
                                    VERSION ${EIGEN_VERSION_NUMBER}
                                    COMPATIBILITY SameMajorVersion)
  set (CMAKE_SIZEOF_VOID_P ${_Eigen3_CMAKE_SIZEOF_VOID_P})
  # The Eigen target will be located in the Eigen3 namespace. Other CMake
  # targets can refer to it using Eigen3::Eigen.
  export (TARGETS eigen NAMESPACE Eigen3:: FILE Eigen3Targets.cmake)
  # Export Eigen3 package to CMake registry such that it can be easily found by
  # CMake even if it has not been installed to a standard directory.
  export (PACKAGE Eigen3)
  install (EXPORT Eigen3Targets NAMESPACE Eigen3:: DESTINATION ${CMAKEPACKAGE_INSTALL_DIR})
 else ()
  # Fallback to legacy Eigen3Config.cmake without the imported target
  # If CMakePackageConfigHelpers module is available (CMake >= 2.8.8)
  # create a relocatable Config file, otherwise leave the hardcoded paths
  include(CMakePackageConfigHelpers OPTIONAL RESULT_VARIABLE CPCH_PATH)
  if(CPCH_PATH)
    configure_package_config_file (
      ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Eigen3ConfigLegacy.cmake.in
      ${CMAKE_CURRENT_BINARY_DIR}/Eigen3Config.cmake
      PATH_VARS EIGEN_INCLUDE_DIR EIGEN_ROOT_DIR
      INSTALL_DESTINATION ${CMAKEPACKAGE_INSTALL_DIR}
      NO_CHECK_REQUIRED_COMPONENTS_MACRO # Eigen does not provide components
    )
  else()
    # The PACKAGE_* variables are defined by the configure_package_config_file
    # but without it we define them manually to the hardcoded paths
    set(PACKAGE_INIT "")
    set(PACKAGE_EIGEN_INCLUDE_DIR ${EIGEN_INCLUDE_DIR})
    set(PACKAGE_EIGEN_ROOT_DIR ${EIGEN_ROOT_DIR})
    configure_file ( ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Eigen3ConfigLegacy.cmake.in
                     ${CMAKE_CURRENT_BINARY_DIR}/Eigen3Config.cmake
                     @ONLY ESCAPE_QUOTES )
  endif()
  write_basic_package_version_file( Eigen3ConfigVersion.cmake
                                    VERSION ${EIGEN_VERSION_NUMBER}
                                    COMPATIBILITY SameMajorVersion )
 endif ()
 install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/cmake/UseEigen3.cmake
                ${CMAKE_CURRENT_BINARY_DIR}/Eigen3Config.cmake
                ${CMAKE_CURRENT_BINARY_DIR}/Eigen3ConfigVersion.cmake
          DESTINATION ${CMAKEPACKAGE_INSTALL_DIR} )
 # Add uninstall target
 add_custom_target ( uninstall
    COMMAND ${CMAKE_COMMAND} -P ${CMAKE_CURRENT_SOURCE_DIR}/cmake/EigenUninstall.cmake)
--- a/COPYING.APACHE
+++ b/COPYING.APACHE
@@ -0,0 +1,203 @@
 /*
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/
   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
   1. Definitions.
      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.
      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.
      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.
      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.
      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.
      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.
      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).
      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.
      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."
      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.
   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.
   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.
   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:
      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and
      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and
      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and
      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.
      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.
   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.
   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.
   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.
   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.
   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.
   END OF TERMS AND CONDITIONS
   APPENDIX: How to apply the Apache License to your work.
      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!)  The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.
   Copyright [yyyy] [name of copyright owner]
   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at
       http://www.apache.org/licenses/LICENSE-2.0
   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
 */
--- a/COPYING.MINPACK
+++ b/COPYING.MINPACK
@@ -49,4 +49,3 @@ SUCH LIABILITY IS ASSERTED ON THE BASIS OF CONTRACT, TORT
 (INCLUDING NEGLIGENCE OR STRICT LIABILITY), OR OTHERWISE,
 EVEN IF ANY OF SAID PARTIES HAS BEEN WARNED OF THE
 POSSIBILITY OF SUCH LOSS OR DAMAGES.
--- a/Eigen/Cholesky
+++ b/Eigen/Cholesky
@@ -43,4 +43,3 @@
 #include "src/Core/util/ReenableStupidWarnings.h"
 #endif // EIGEN_CHOLESKY_MODULE_H
 /* vim: set filetype=cpp et sw=2 ts=2 ai: */
--- a/Eigen/CholmodSupport
+++ b/Eigen/CholmodSupport
@@ -22,7 +22,7 @@ extern "C" {
  * This module provides an interface to the Cholmod library which is part of the <a href="http://www.suitesparse.com">suitesparse</a> package.
  * It provides the two following main factorization classes:
  * - class CholmodSupernodalLLT: a supernodal LLT Cholesky factorization.
-  * - class CholmodDecomposiiton: a general L(D)LT Cholesky factorization with automatic or explicit runtime selection of the underlying factorization method (supernodal or simplicial).
+  * - class CholmodDecomposition: a general L(D)LT Cholesky factorization with automatic or explicit runtime selection of the underlying factorization method (supernodal or simplicial).
  *
  * For the sake of completeness, this module also proposes the two following classes:
  * - class CholmodSimplicialLLT
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -11,7 +11,7 @@
 #ifndef EIGEN_CORE_H
 #define EIGEN_CORE_H
-// first thing Eigen does: stop the compiler from committing suicide
+// first thing Eigen does: stop the compiler from reporting useless warnings.
 #include "src/Core/util/DisableStupidWarnings.h"
 // then include this file where all our macros are defined. It's really important to do it first because
@@ -22,7 +22,7 @@
 #include "src/Core/util/ConfigureVectorization.h"
 // We need cuda_runtime.h/hip_runtime.h to ensure that
-// the EIGEN_USING_STD_MATH macro works properly on the device side
+// the EIGEN_USING_STD macro works properly on the device side
 #if defined(EIGEN_CUDACC)
  #include <cuda_runtime.h>
 #elif defined(EIGEN_HIPCC)
@@ -36,10 +36,17 @@
 // Disable the ipa-cp-clone optimization flag with MinGW 6.x or newer (enabled by default with -O3)
 // See http://eigen.tuxfamily.org/bz/show_bug.cgi?id=556 for details.
-#if EIGEN_COMP_MINGW && EIGEN_GNUC_AT_LEAST(4,6)
+#if EIGEN_COMP_MINGW && EIGEN_GNUC_AT_LEAST(4,6) && EIGEN_GNUC_AT_MOST(5,5)
  #pragma GCC optimize ("-fno-ipa-cp-clone")
 #endif
 // Prevent ICC from specializing std::complex operators that silently fail
 // on device. This allows us to use our own device-compatible specializations
 // instead.
 #if defined(EIGEN_COMP_ICC) && defined(EIGEN_GPU_COMPILE_PHASE) \
    && !defined(_OVERRIDE_COMPLEX_SPECIALIZATION_)
 #define _OVERRIDE_COMPLEX_SPECIALIZATION_ 1
 #endif
 #include <complex>
 // this include file manages BLAS and MKL related macros
@@ -51,6 +58,10 @@
  #define EIGEN_HAS_GPU_FP16
 #endif
 #if defined(EIGEN_HAS_CUDA_BF16) || defined(EIGEN_HAS_HIP_BF16)
  #define EIGEN_HAS_GPU_BF16
 #endif
 #if (defined _OPENMP) && (!defined EIGEN_DONT_PARALLELIZE)
  #define EIGEN_HAS_OPENMP
 #endif
@@ -73,6 +84,7 @@
 #include <cassert>
 #include <functional>
 #ifndef EIGEN_NO_IO
  #include <sstream>
  #include <iosfwd>
 #endif
 #include <cstring>
@@ -97,7 +109,8 @@
 #endif
 // required for __cpuid, needs to be included after cmath
-#if EIGEN_COMP_MSVC && EIGEN_ARCH_i386_OR_x86_64 && !EIGEN_OS_WINCE
+// also required for _BitScanReverse on Windows on ARM
 #if EIGEN_COMP_MSVC && (EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM64) && !EIGEN_OS_WINCE
  #include <intrin.h>
 #endif
@@ -107,7 +120,7 @@
  #undef isnan
  #undef isinf
  #undef isfinite
-  #include <SYCL/sycl.hpp>
+  #include <CL/sycl.hpp>
  #include <map>
  #include <memory>
  #include <utility>
@@ -162,6 +175,7 @@ using std::ptrdiff_t;
 #include "src/Core/arch/Default/ConjHelper.h"
 // Generic half float support
 #include "src/Core/arch/Default/Half.h"
 #include "src/Core/arch/Default/BFloat16.h"
 #include "src/Core/arch/Default/TypeCasting.h"
 #include "src/Core/arch/Default/GenericPacketMathFunctionsFwd.h"
@@ -202,6 +216,10 @@ using std::ptrdiff_t;
  #include "src/Core/arch/NEON/TypeCasting.h"
  #include "src/Core/arch/NEON/MathFunctions.h"
  #include "src/Core/arch/NEON/Complex.h"
 #elif defined EIGEN_VECTORIZE_SVE
  #include "src/Core/arch/SVE/PacketMath.h"
  #include "src/Core/arch/SVE/TypeCasting.h"
  #include "src/Core/arch/SVE/MathFunctions.h"
 #elif defined EIGEN_VECTORIZE_ZVECTOR
  #include "src/Core/arch/ZVector/PacketMath.h"
  #include "src/Core/arch/ZVector/MathFunctions.h"
@@ -329,6 +347,12 @@ using std::ptrdiff_t;
 #include "src/Core/CoreIterators.h"
 #include "src/Core/ConditionEstimator.h"
 #if defined(EIGEN_VECTORIZE_VSX)
  #include "src/Core/arch/AltiVec/MatrixProduct.h"
 #elif defined EIGEN_VECTORIZE_NEON
  #include "src/Core/arch/NEON/GeneralBlockPanelKernel.h"
 #endif
 #include "src/Core/BooleanRedux.h"
 #include "src/Core/Select.h"
 #include "src/Core/VectorwiseOp.h"
--- a/Eigen/Eigenvalues
+++ b/Eigen/Eigenvalues
@@ -58,4 +58,3 @@
 #include "src/Core/util/ReenableStupidWarnings.h"
 #endif // EIGEN_EIGENVALUES_MODULE_H
 /* vim: set filetype=cpp et sw=2 ts=2 ai: */
--- a/Eigen/Geometry
+++ b/Eigen/Geometry
@@ -50,11 +50,10 @@
 #include "src/Geometry/Umeyama.h"
 // Use the SSE optimized version whenever possible.
-#if defined EIGEN_VECTORIZE_SSE
+#if (defined EIGEN_VECTORIZE_SSE) || (defined EIGEN_VECTORIZE_NEON)
-#include "src/Geometry/arch/Geometry_SSE.h"
+#include "src/Geometry/arch/Geometry_SIMD.h"
 #endif
 #include "src/Core/util/ReenableStupidWarnings.h"
 #endif // EIGEN_GEOMETRY_MODULE_H
 /* vim: set filetype=cpp et sw=2 ts=2 ai: */
--- a/Eigen/Householder
+++ b/Eigen/Householder
@@ -27,4 +27,3 @@
 #include "src/Core/util/ReenableStupidWarnings.h"
 #endif // EIGEN_HOUSEHOLDER_MODULE_H
 /* vim: set filetype=cpp et sw=2 ts=2 ai: */
--- a/Eigen/Jacobi
+++ b/Eigen/Jacobi
@@ -29,5 +29,4 @@
 #include "src/Core/util/ReenableStupidWarnings.h"
 #endif // EIGEN_JACOBI_MODULE_H
 /* vim: set filetype=cpp et sw=2 ts=2 ai: */
--- a/Eigen/LU
+++ b/Eigen/LU
@@ -38,13 +38,10 @@
 #include "src/LU/Determinant.h"
 #include "src/LU/InverseImpl.h"
-// Use the SSE optimized version whenever possible. At the moment the
+#if defined EIGEN_VECTORIZE_SSE || defined EIGEN_VECTORIZE_NEON
-// SSE version doesn't compile when AVX is enabled
+  #include "src/LU/arch/InverseSize4.h"
 #if defined EIGEN_VECTORIZE_SSE && !defined EIGEN_VECTORIZE_AVX
  #include "src/LU/arch/Inverse_SSE.h"
 #endif
 #include "src/Core/util/ReenableStupidWarnings.h"
 #endif // EIGEN_LU_MODULE_H
 /* vim: set filetype=cpp et sw=2 ts=2 ai: */
--- a/Eigen/QR
+++ b/Eigen/QR
@@ -48,4 +48,3 @@
 #include "src/Core/util/ReenableStupidWarnings.h"
 #endif // EIGEN_QR_MODULE_H
 /* vim: set filetype=cpp et sw=2 ts=2 ai: */
--- a/Eigen/QtAlignedMalloc
+++ b/Eigen/QtAlignedMalloc
@@ -37,4 +37,3 @@ void *qRealloc(void *ptr, std::size_t size)
 #endif
 #endif // EIGEN_QTMALLOC_MODULE_H
 /* vim: set filetype=cpp et sw=2 ts=2 ai: */
--- a/Eigen/SVD
+++ b/Eigen/SVD
@@ -48,4 +48,3 @@
 #include "src/Core/util/ReenableStupidWarnings.h"
 #endif // EIGEN_SVD_MODULE_H
 /* vim: set filetype=cpp et sw=2 ts=2 ai: */
--- a/Eigen/SparseLU
+++ b/Eigen/SparseLU
@@ -25,8 +25,6 @@
 #include "src/Core/util/DisableStupidWarnings.h"
 #include "src/SparseLU/SparseLU_gemm_kernel.h"
 #include "src/SparseLU/SparseLU_Structs.h"
 #include "src/SparseLU/SparseLU_SupernodalMatrix.h"
 #include "src/SparseLU/SparseLUImpl.h"
--- a/Eigen/src/Cholesky/LDLT.h
+++ b/Eigen/src/Cholesky/LDLT.h
@@ -45,7 +45,7 @@ namespace internal {
  * matrix \f$ A \f$ such that \f$ A =  P^TLDL^*P \f$, where P is a permutation matrix, L
  * is lower triangular with a unit diagonal and D is a diagonal matrix.
  *
-  * The decomposition uses pivoting to ensure stability, so that L will have
+  * The decomposition uses pivoting to ensure stability, so that D will have
  * zeros in the bottom right rank(A) - n submatrix. Avoiding the square root
  * on D also stabilizes the computation.
  *
@@ -200,7 +200,7 @@ template<typename _MatrixType, int _UpLo> class LDLT
      * \f$ L^* y_4 = y_3 \f$ and \f$ P x = y_4 \f$ in succession. If the matrix \f$ A \f$ is singular, then
      * \f$ D \f$ will also be singular (all the other matrices are invertible). In that case, the
      * least-square solution of \f$ D y_3 = y_2 \f$ is computed. This does not mean that this function
-      * computes the least-square solution of \f$ A x = b \f$ is \f$ A \f$ is singular.
+      * computes the least-square solution of \f$ A x = b \f$ if \f$ A \f$ is singular.
      *
      * \sa MatrixBase::ldlt(), SelfAdjointView::ldlt()
      */
@@ -246,8 +246,8 @@ template<typename _MatrixType, int _UpLo> class LDLT
      */
    const LDLT& adjoint() const { return *this; };
-    inline Index rows() const { return m_matrix.rows(); }
+    EIGEN_DEVICE_FUNC inline EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows(); }
-    inline Index cols() const { return m_matrix.cols(); }
+    EIGEN_DEVICE_FUNC inline EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); }
    /** \brief Reports whether previous computation was successful.
      *
--- a/Eigen/src/Cholesky/LLT.h
+++ b/Eigen/src/Cholesky/LLT.h
@@ -199,10 +199,10 @@ template<typename _MatrixType, int _UpLo> class LLT
      * This method is provided for compatibility with other matrix decompositions, thus enabling generic code such as:
      * \code x = decomposition.adjoint().solve(b) \endcode
      */
-    const LLT& adjoint() const { return *this; };
+    const LLT& adjoint() const EIGEN_NOEXCEPT { return *this; };
-    inline Index rows() const { return m_matrix.rows(); }
+    inline EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows(); }
-    inline Index cols() const { return m_matrix.cols(); }
+    inline EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); }
    template<typename VectorType>
    LLT & rankUpdate(const VectorType& vec, const RealScalar& sigma = 1);
--- a/Eigen/src/Core/ArithmeticSequence.h
+++ b/Eigen/src/Core/ArithmeticSequence.h
@@ -172,7 +172,8 @@ seqN(FirstType first, SizeType size)  {
  return ArithmeticSequence<typename internal::cleanup_index_type<FirstType>::type,typename internal::cleanup_index_type<SizeType>::type>(first,size);
 }
-#ifdef EIGEN_PARSED_BY_DOXYGEN
+
 #if EIGEN_HAS_CXX11
 /** \returns an ArithmeticSequence starting at \a f, up (or down) to \a l, and with positive (or negative) increment \a incr
  *
@@ -183,24 +184,6 @@ seqN(FirstType first, SizeType size)  {
  *
  * \sa seqN(FirstType,SizeType,IncrType), seq(FirstType,LastType)
  */
 template<typename FirstType,typename LastType, typename IncrType>
 auto seq(FirstType f, LastType l, IncrType incr);
 /** \returns an ArithmeticSequence starting at \a f, up (or down) to \a l, and unit increment
  *
  * It is essentially an alias to:
  * \code
  * seqN(f,l-f+1);
  * \endcode
  *
  * \sa seqN(FirstType,SizeType), seq(FirstType,LastType,IncrType)
  */
 template<typename FirstType,typename LastType>
 auto seq(FirstType f, LastType l);
 #else // EIGEN_PARSED_BY_DOXYGEN
 #if EIGEN_HAS_CXX11
 template<typename FirstType,typename LastType>
 auto seq(FirstType f, LastType l) -> decltype(seqN(typename internal::cleanup_index_type<FirstType>::type(f),
                                                   (  typename internal::cleanup_index_type<LastType>::type(l)
@@ -211,6 +194,15 @@ auto seq(FirstType f, LastType l) -> decltype(seqN(typename internal::cleanup_in
               -typename internal::cleanup_index_type<FirstType>::type(f)+fix<1>()));
 }
 /** \returns an ArithmeticSequence starting at \a f, up (or down) to \a l, and unit increment
  *
  * It is essentially an alias to:
  * \code
  * seqN(f,l-f+1);
  * \endcode
  *
  * \sa seqN(FirstType,SizeType), seq(FirstType,LastType,IncrType)
  */
 template<typename FirstType,typename LastType, typename IncrType>
 auto seq(FirstType f, LastType l, IncrType incr)
  -> decltype(seqN(typename internal::cleanup_index_type<FirstType>::type(f),
@@ -317,26 +309,12 @@ seq(const symbolic::BaseExpr<FirstTypeDerived> &f, const symbolic::BaseExpr<Last
 }
 #endif // EIGEN_HAS_CXX11
-#endif // EIGEN_PARSED_BY_DOXYGEN
+#if EIGEN_HAS_CXX11
 #if EIGEN_HAS_CXX11 || defined(EIGEN_PARSED_BY_DOXYGEN)
 /** \cpp11
  * \returns a symbolic ArithmeticSequence representing the last \a size elements with increment \a incr.
  *
  * It is a shortcut for: \code seqN(last-(size-fix<1>)*incr, size, incr) \endcode
  *
  * \param size number of elements in the returned sequence
  * \param incr increment between two consecutive elements of the sequence
  *
  * \sa lastN(SizeType), seqN(FirstType,SizeType), seq(FirstType,LastType,IncrType) */
 template<typename SizeType,typename IncrType>
 auto lastN(SizeType size, IncrType incr)
 // The trailing return type repeats the returned expression so that the result type is deduced from it.
 -> decltype(seqN(Eigen::last-(size-fix<1>())*incr, size, incr))
 {
  // First index is last-(size-1)*incr, so the size-th element of the sequence lands on Eigen::last.
  return seqN(Eigen::last-(size-fix<1>())*incr, size, incr);
 }
 /** \cpp11
  * \returns a symbolic ArithmeticSequence representing the last \a size elements with a unit increment.
  *
  * \anchor indexing_lastN
  *
  *  It is a shortcut for: \code seq(last+fix<1>-size, last) \endcode
  * 
  * \sa lastN(SizeType,IncrType), seqN(FirstType,SizeType), seq(FirstType,LastType) */
@@ -346,6 +324,21 @@ auto lastN(SizeType size)
 {
  return seqN(Eigen::last+fix<1>()-size, size);
 }
 /** \cpp11
  * \returns a symbolic ArithmeticSequence representing the last \a size elements with increment \a incr.
  *
  * \anchor indexing_lastN_with_incr
  *
  * It is a shortcut for: \code seqN(last-(size-fix<1>)*incr, size, incr) \endcode
  *
  * \param size number of elements in the returned sequence
  * \param incr increment between two consecutive elements of the sequence
  *
  * \sa lastN(SizeType), seqN(FirstType,SizeType), seq(FirstType,LastType,IncrType) */
 template<typename SizeType,typename IncrType>
 auto lastN(SizeType size, IncrType incr)
 // The trailing return type repeats the returned expression so that the result type is deduced from it.
 -> decltype(seqN(Eigen::last-(size-fix<1>())*incr, size, incr))
 {
  // First index is last-(size-1)*incr, so the size-th element of the sequence lands on Eigen::last.
  return seqN(Eigen::last-(size-fix<1>())*incr, size, incr);
 }
 #endif
 namespace internal {
--- a/Eigen/src/Core/Array.h
+++ b/Eigen/src/Core/Array.h
@@ -157,13 +157,21 @@ class Array
    EIGEN_DEVICE_FUNC
    Array& operator=(Array&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_assignable<Scalar>::value)
    {
-      other.swap(*this);
+      Base::operator=(std::move(other));
      return *this;
    }
 #endif
    #if EIGEN_HAS_CXX11
-    /** \copydoc PlainObjectBase(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args)
+    /** \brief Construct a row or column vector with fixed size from an arbitrary number of coefficients. \cpp11
     *
     * \only_for_vectors
     *
     * This constructor is for 1D array or vectors with more than 4 coefficients.
     * There exists C++98 analogue constructors for fixed-size array/vector having 1, 2, 3, or 4 coefficients.
     *
     * \warning To construct a column (resp. row) vector of fixed length, the number of values passed to this
     * constructor must match the fixed number of rows (resp. columns) of \c *this.
     *
     * Example: \include Array_variadic_ctor_cxx11.cpp
     * Output: \verbinclude Array_variadic_ctor_cxx11.out
@@ -288,8 +296,10 @@ class Array
      : Base(other.derived())
    { }
-    EIGEN_DEVICE_FUNC inline Index innerStride() const { return 1; }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
-    EIGEN_DEVICE_FUNC inline Index outerStride() const { return this->innerSize(); }
+    inline Index innerStride() const EIGEN_NOEXCEPT{ return 1; }
    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
    inline Index outerStride() const EIGEN_NOEXCEPT { return this->innerSize(); }
    #ifdef EIGEN_ARRAY_PLUGIN
    #include EIGEN_ARRAY_PLUGIN
--- a/Eigen/src/Core/ArrayBase.h
+++ b/Eigen/src/Core/ArrayBase.h
@@ -153,8 +153,8 @@ template<typename Derived> class ArrayBase
 //     inline void evalTo(Dest& dst) const { dst = matrix(); }
  protected:
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEFAULT_COPY_CONSTRUCTOR(ArrayBase)
-    ArrayBase() : Base() {}
+    EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(ArrayBase)
  private:
    explicit ArrayBase(Index);
--- a/Eigen/src/Core/ArrayWrapper.h
+++ b/Eigen/src/Core/ArrayWrapper.h
@@ -60,14 +60,14 @@ class ArrayWrapper : public ArrayBase<ArrayWrapper<ExpressionType> >
    EIGEN_DEVICE_FUNC
    explicit EIGEN_STRONG_INLINE ArrayWrapper(ExpressionType& matrix) : m_expression(matrix) {}
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
-    inline Index rows() const { return m_expression.rows(); }
+    inline Index rows() const EIGEN_NOEXCEPT { return m_expression.rows(); }
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
-    inline Index cols() const { return m_expression.cols(); }
+    inline Index cols() const EIGEN_NOEXCEPT { return m_expression.cols(); }
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
-    inline Index outerStride() const { return m_expression.outerStride(); }
+    inline Index outerStride() const EIGEN_NOEXCEPT { return m_expression.outerStride(); }
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
-    inline Index innerStride() const { return m_expression.innerStride(); }
+    inline Index innerStride() const EIGEN_NOEXCEPT { return m_expression.innerStride(); }
    EIGEN_DEVICE_FUNC
    inline ScalarWithConstIfNotLvalue* data() { return m_expression.data(); }
@@ -158,14 +158,14 @@ class MatrixWrapper : public MatrixBase<MatrixWrapper<ExpressionType> >
    EIGEN_DEVICE_FUNC
    explicit inline MatrixWrapper(ExpressionType& matrix) : m_expression(matrix) {}
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
-    inline Index rows() const { return m_expression.rows(); }
+    inline Index rows() const EIGEN_NOEXCEPT { return m_expression.rows(); }
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
-    inline Index cols() const { return m_expression.cols(); }
+    inline Index cols() const EIGEN_NOEXCEPT { return m_expression.cols(); }
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
-    inline Index outerStride() const { return m_expression.outerStride(); }
+    inline Index outerStride() const EIGEN_NOEXCEPT { return m_expression.outerStride(); }
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
-    inline Index innerStride() const { return m_expression.innerStride(); }
+    inline Index innerStride() const EIGEN_NOEXCEPT { return m_expression.innerStride(); }
    EIGEN_DEVICE_FUNC
    inline ScalarWithConstIfNotLvalue* data() { return m_expression.data(); }
--- a/Eigen/src/Core/AssignEvaluator.h
+++ b/Eigen/src/Core/AssignEvaluator.h
@@ -99,7 +99,8 @@ private:
 public:
  enum {
-    Traversal = (int(MayLinearVectorize) && (LinearPacketSize>InnerPacketSize)) ? int(LinearVectorizedTraversal)
+    Traversal =  int(Dst::SizeAtCompileTime) == 0 ? int(AllAtOnceTraversal) // If compile-size is zero, traversing will fail at compile-time.
              : (int(MayLinearVectorize) && (LinearPacketSize>InnerPacketSize)) ? int(LinearVectorizedTraversal)
              : int(MayInnerVectorize)   ? int(InnerVectorizedTraversal)
              : int(MayLinearVectorize)  ? int(LinearVectorizedTraversal)
              : int(MaySliceVectorize)   ? int(SliceVectorizedTraversal)
@@ -316,6 +317,22 @@ template<typename Kernel,
         int Unrolling = Kernel::AssignmentTraits::Unrolling>
 struct dense_assignment_loop;
 /************************
 ***** Special Cases *****
 ************************/
 // Zero-sized assignment is a no-op.
 //
 // Partial specialization of dense_assignment_loop for AllAtOnceTraversal,
 // matching any Unrolling value. run() neither reads nor writes any
 // coefficient; its body consists solely of a compile-time check.
 template<typename Kernel, int Unrolling>
 struct dense_assignment_loop<Kernel, AllAtOnceTraversal, Unrolling>
 {
  // The kernel parameter is deliberately left unnamed because it is never used.
  EIGEN_DEVICE_FUNC static void EIGEN_STRONG_INLINE run(Kernel& /*kernel*/)
  {
    typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
    // Reject, at compile time, any instantiation whose destination does not
    // have a compile-time size of exactly zero.
    EIGEN_STATIC_ASSERT(int(DstXprType::SizeAtCompileTime) == 0,
      EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT)
  }
 };
 /************************
 *** Default traversal ***
 ************************/
@@ -433,7 +450,7 @@ struct dense_assignment_loop<Kernel, LinearVectorizedTraversal, CompleteUnrollin
    enum { size = DstXprType::SizeAtCompileTime,
           packetSize =unpacket_traits<PacketType>::size,
-           alignedSize = (size/packetSize)*packetSize };
+           alignedSize = (int(size)/packetSize)*packetSize };
    copy_using_evaluator_innervec_CompleteUnrolling<Kernel, 0, alignedSize>::run(kernel);
    copy_using_evaluator_DefaultTraversal_CompleteUnrolling<Kernel, alignedSize, size>::run(kernel);
@@ -572,14 +589,15 @@ struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, InnerUnrolling>
    typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
    typedef typename Kernel::PacketType PacketType;
-    enum { size = DstXprType::InnerSizeAtCompileTime,
+    enum { innerSize = DstXprType::InnerSizeAtCompileTime,
           packetSize =unpacket_traits<PacketType>::size,
-           vectorizableSize = (size/packetSize)*packetSize };
+           vectorizableSize = (int(innerSize) / int(packetSize)) * int(packetSize),
           size = DstXprType::SizeAtCompileTime };
    for(Index outer = 0; outer < kernel.outerSize(); ++outer)
    {
      copy_using_evaluator_innervec_InnerUnrolling<Kernel, 0, vectorizableSize, 0, 0>::run(kernel, outer);
-      copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, vectorizableSize, size>::run(kernel, outer);
+      copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, vectorizableSize, innerSize>::run(kernel, outer);
    }
  }
 };
@@ -620,15 +638,15 @@ public:
    #endif
  }
-  EIGEN_DEVICE_FUNC Index size() const        { return m_dstExpr.size(); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index size() const EIGEN_NOEXCEPT { return m_dstExpr.size(); }
-  EIGEN_DEVICE_FUNC Index innerSize() const   { return m_dstExpr.innerSize(); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index innerSize() const EIGEN_NOEXCEPT { return m_dstExpr.innerSize(); }
-  EIGEN_DEVICE_FUNC Index outerSize() const   { return m_dstExpr.outerSize(); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index outerSize() const EIGEN_NOEXCEPT { return m_dstExpr.outerSize(); }
-  EIGEN_DEVICE_FUNC Index rows() const        { return m_dstExpr.rows(); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_dstExpr.rows(); }
-  EIGEN_DEVICE_FUNC Index cols() const        { return m_dstExpr.cols(); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_dstExpr.cols(); }
-  EIGEN_DEVICE_FUNC Index outerStride() const { return m_dstExpr.outerStride(); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index outerStride() const EIGEN_NOEXCEPT { return m_dstExpr.outerStride(); }
-  EIGEN_DEVICE_FUNC DstEvaluatorType& dstEvaluator() { return m_dst; }
+  EIGEN_DEVICE_FUNC DstEvaluatorType& dstEvaluator() EIGEN_NOEXCEPT { return m_dst; }
-  EIGEN_DEVICE_FUNC const SrcEvaluatorType& srcEvaluator() const { return m_src; }
+  EIGEN_DEVICE_FUNC const SrcEvaluatorType& srcEvaluator() const EIGEN_NOEXCEPT { return m_src; }
  /// Assign src(row,col) to dst(row,col) through the assignment functor.
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Index row, Index col)
@@ -767,6 +785,16 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(DstXprType
  dense_assignment_loop<Kernel>::run(kernel);
 }
 // Specialization for filling the destination with a constant value.
 //
 // This overload matches when the source is a CwiseNullaryOp built on
 // scalar_constant_op<Scalar> whose second template argument is DstXprType
 // itself, and the functor is assign_op<Scalar,Scalar> on the destination's
 // scalar type. It calls resize_if_allowed(dst, src, func) and then writes
 // dst.size() copies of the value returned by src.functor()() starting at
 // dst.data(), using std::fill_n instead of the generic assignment kernel.
 // NOTE(review): the fill is a flat range starting at dst.data(); presumably
 // every DstXprType that can match this overload stores its coefficients
 // contiguously -- confirm.
 // The overload is compiled out when EIGEN_GPU_COMPILE_PHASE is defined.
 #ifndef EIGEN_GPU_COMPILE_PHASE
 template<typename DstXprType>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(DstXprType& dst, const Eigen::CwiseNullaryOp<Eigen::internal::scalar_constant_op<typename DstXprType::Scalar>, DstXprType>& src, const internal::assign_op<typename DstXprType::Scalar,typename DstXprType::Scalar>& func)
 {
  resize_if_allowed(dst, src, func);
  // src.functor()() evaluates the nullary functor once to obtain the constant.
  std::fill_n(dst.data(), dst.size(), src.functor()());
 }
 #endif
 template<typename DstXprType, typename SrcXprType>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(DstXprType& dst, const SrcXprType& src)
 {
--- a/Eigen/src/Core/BandMatrix.h
+++ b/Eigen/src/Core/BandMatrix.h
@@ -67,7 +67,7 @@ class BandMatrixBase : public EigenBase<Derived>
      * \warning the internal storage must be column major. */
    inline Block<CoefficientsType,Dynamic,1> col(Index i)
    {
-      EIGEN_STATIC_ASSERT((Options&RowMajor)==0,THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES);
+      EIGEN_STATIC_ASSERT((int(Options) & int(RowMajor)) == 0, THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES);
      Index start = 0;
      Index len = coeffs().rows();
      if (i<=supers())
@@ -90,7 +90,7 @@ class BandMatrixBase : public EigenBase<Derived>
    template<int Index> struct DiagonalIntReturnType {
      enum {
-        ReturnOpposite = (Options&SelfAdjoint) && (((Index)>0 && Supers==0) || ((Index)<0 && Subs==0)),
+        ReturnOpposite = (int(Options) & int(SelfAdjoint)) && (((Index) > 0 && Supers == 0) || ((Index) < 0 && Subs == 0)),
        Conjugate = ReturnOpposite && NumTraits<Scalar>::IsComplex,
        ActualIndex = ReturnOpposite ? -Index : Index,
        DiagonalSize = (RowsAtCompileTime==Dynamic || ColsAtCompileTime==Dynamic)
@@ -192,7 +192,7 @@ struct traits<BandMatrix<_Scalar,_Rows,_Cols,_Supers,_Subs,_Options> >
    Options = _Options,
    DataRowsAtCompileTime = ((Supers!=Dynamic) && (Subs!=Dynamic)) ? 1 + Supers + Subs : Dynamic
  };
-  typedef Matrix<Scalar,DataRowsAtCompileTime,ColsAtCompileTime,Options&RowMajor?RowMajor:ColMajor> CoefficientsType;
+  typedef Matrix<Scalar, DataRowsAtCompileTime, ColsAtCompileTime, int(Options) & int(RowMajor) ? RowMajor : ColMajor> CoefficientsType;
 };
 template<typename _Scalar, int Rows, int Cols, int Supers, int Subs, int Options>
@@ -211,16 +211,16 @@ class BandMatrix : public BandMatrixBase<BandMatrix<_Scalar,Rows,Cols,Supers,Sub
    }
    /** \returns the number of rows */
-    inline Index rows() const { return m_rows.value(); }
+    inline EIGEN_CONSTEXPR Index rows() const { return m_rows.value(); }
    /** \returns the number of columns */
-    inline Index cols() const { return m_coeffs.cols(); }
+    inline EIGEN_CONSTEXPR Index cols() const { return m_coeffs.cols(); }
    /** \returns the number of super diagonals */
-    inline Index supers() const { return m_supers.value(); }
+    inline EIGEN_CONSTEXPR Index supers() const { return m_supers.value(); }
    /** \returns the number of sub diagonals */
-    inline Index subs() const { return m_subs.value(); }
+    inline EIGEN_CONSTEXPR Index subs() const { return m_subs.value(); }
    inline const CoefficientsType& coeffs() const { return m_coeffs; }
    inline CoefficientsType& coeffs() { return m_coeffs; }
@@ -275,16 +275,16 @@ class BandMatrixWrapper : public BandMatrixBase<BandMatrixWrapper<_CoefficientsT
    }
    /** \returns the number of rows */
-    inline Index rows() const { return m_rows.value(); }
+    inline EIGEN_CONSTEXPR Index rows() const { return m_rows.value(); }
    /** \returns the number of columns */
-    inline Index cols() const { return m_coeffs.cols(); }
+    inline EIGEN_CONSTEXPR Index cols() const { return m_coeffs.cols(); }
    /** \returns the number of super diagonals */
-    inline Index supers() const { return m_supers.value(); }
+    inline EIGEN_CONSTEXPR Index supers() const { return m_supers.value(); }
    /** \returns the number of sub diagonals */
-    inline Index subs() const { return m_subs.value(); }
+    inline EIGEN_CONSTEXPR Index subs() const { return m_subs.value(); }
    inline const CoefficientsType& coeffs() const { return m_coeffs; }
--- a/Eigen/src/Core/Block.h
+++ b/Eigen/src/Core/Block.h
@@ -260,19 +260,19 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, bool H
    }
    template<int LoadMode>
-    inline PacketScalar packet(Index rowId, Index colId) const
+    EIGEN_DEVICE_FUNC inline PacketScalar packet(Index rowId, Index colId) const
    {
      return m_xpr.template packet<Unaligned>(rowId + m_startRow.value(), colId + m_startCol.value());
    }
    template<int LoadMode>
-    inline void writePacket(Index rowId, Index colId, const PacketScalar& val)
+    EIGEN_DEVICE_FUNC inline void writePacket(Index rowId, Index colId, const PacketScalar& val)
    {
      m_xpr.template writePacket<Unaligned>(rowId + m_startRow.value(), colId + m_startCol.value(), val);
    }
    template<int LoadMode>
-    inline PacketScalar packet(Index index) const
+    EIGEN_DEVICE_FUNC inline PacketScalar packet(Index index) const
    {
      return m_xpr.template packet<Unaligned>
              (m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
@@ -280,7 +280,7 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, bool H
    }
    template<int LoadMode>
-    inline void writePacket(Index index, const PacketScalar& val)
+    EIGEN_DEVICE_FUNC inline void writePacket(Index index, const PacketScalar& val)
    {
      m_xpr.template writePacket<Unaligned>
         (m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
@@ -303,14 +303,14 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, bool H
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    XprType& nestedExpression() { return m_xpr; }
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
-    StorageIndex startRow() const
+    StorageIndex startRow() const EIGEN_NOEXCEPT
    {
      return m_startRow.value();
    }
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
-    StorageIndex startCol() const
+    StorageIndex startCol() const EIGEN_NOEXCEPT
    {
      return m_startCol.value();
    }
@@ -334,6 +334,17 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>
    enum {
      XprTypeIsRowMajor = (int(traits<XprType>::Flags)&RowMajorBit) != 0
    };
    /** \internal Returns base+offset (unless base is null, in which case returns null).
      * Adding an offset to nullptr is undefined behavior, so we must avoid it.
      *
      * \param base   pointer to be advanced; may be null
      * \param offset number of \c Scalar elements to add to \a base
      * \returns \c NULL when \a base is \c NULL, otherwise \c base+offset
      */
    template <typename Scalar>
    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR EIGEN_ALWAYS_INLINE
    static Scalar* add_to_nullable_pointer(Scalar* base, Index offset)
    {
      // The null check comes first so the pointer arithmetic is only evaluated for a non-null base.
      return base != NULL ? base+offset : NULL;
    }
  public:
    typedef MapBase<BlockType> Base;
@@ -344,8 +355,9 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>
      */
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    BlockImpl_dense(XprType& xpr, Index i)
-      : Base(xpr.data() + i * (    ((BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) && (!XprTypeIsRowMajor)) 
+      : Base((BlockRows == 0 || BlockCols == 0) ? NULL : add_to_nullable_pointer(xpr.data(),
-                                || ((BlockRows==XprType::RowsAtCompileTime) && (BlockCols==1) && ( XprTypeIsRowMajor)) ? xpr.innerStride() : xpr.outerStride()),
+                 i * (    ((BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) && (!XprTypeIsRowMajor))
                       || ((BlockRows==XprType::RowsAtCompileTime) && (BlockCols==1) && ( XprTypeIsRowMajor)) ? xpr.innerStride() : xpr.outerStride())),
             BlockRows==1 ? 1 : xpr.rows(),
             BlockCols==1 ? 1 : xpr.cols()),
        m_xpr(xpr),
@@ -359,7 +371,8 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>
      */
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    BlockImpl_dense(XprType& xpr, Index startRow, Index startCol)
-      : Base(xpr.data()+xpr.innerStride()*(XprTypeIsRowMajor?startCol:startRow) + xpr.outerStride()*(XprTypeIsRowMajor?startRow:startCol)),
+      : Base((BlockRows == 0 || BlockCols == 0) ? NULL : add_to_nullable_pointer(xpr.data(),
                 xpr.innerStride()*(XprTypeIsRowMajor?startCol:startRow) + xpr.outerStride()*(XprTypeIsRowMajor?startRow:startCol))),
        m_xpr(xpr), m_startRow(startRow), m_startCol(startCol)
    {
      init();
@@ -371,14 +384,16 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>
    BlockImpl_dense(XprType& xpr,
          Index startRow, Index startCol,
          Index blockRows, Index blockCols)
-      : Base(xpr.data()+xpr.innerStride()*(XprTypeIsRowMajor?startCol:startRow) + xpr.outerStride()*(XprTypeIsRowMajor?startRow:startCol), blockRows, blockCols),
+      : Base((blockRows == 0 || blockCols == 0) ? NULL : add_to_nullable_pointer(xpr.data(),
                 xpr.innerStride()*(XprTypeIsRowMajor?startCol:startRow) + xpr.outerStride()*(XprTypeIsRowMajor?startRow:startCol)),
             blockRows, blockCols),
        m_xpr(xpr), m_startRow(startRow), m_startCol(startCol)
    {
      init();
    }
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const typename internal::remove_all<XprTypeNested>::type& nestedExpression() const
+    const typename internal::remove_all<XprTypeNested>::type& nestedExpression() const EIGEN_NOEXCEPT
    {
      return m_xpr;
    }
@@ -387,8 +402,8 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>
    XprType& nestedExpression() { return m_xpr; }
    /** \sa MapBase::innerStride() */
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
-    Index innerStride() const
+    Index innerStride() const EIGEN_NOEXCEPT
    {
      return internal::traits<BlockType>::HasSameStorageOrderAsXprType
             ? m_xpr.innerStride()
@@ -396,23 +411,19 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>
    }
    /** \sa MapBase::outerStride() */
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
-    Index outerStride() const
+    Index outerStride() const EIGEN_NOEXCEPT
    {
-      return m_outerStride;
+      return internal::traits<BlockType>::HasSameStorageOrderAsXprType
                    ? m_xpr.outerStride()
                    : m_xpr.innerStride();
    }
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
-    StorageIndex startRow() const
+    StorageIndex startRow() const EIGEN_NOEXCEPT { return m_startRow.value(); }
    {
      return m_startRow.value();
    }
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
-    StorageIndex startCol() const
+    StorageIndex startCol() const EIGEN_NOEXCEPT { return m_startCol.value(); }
    {
      return m_startCol.value();
    }
  #ifndef __SUNPRO_CC
  // FIXME sunstudio is not friendly with the above friend...
--- a/Eigen/src/Core/BooleanRedux.h
+++ b/Eigen/src/Core/BooleanRedux.h
@@ -14,56 +14,58 @@ namespace Eigen {
 namespace internal {
-template<typename Derived, int UnrollCount, int Rows>
+template<typename Derived, int UnrollCount, int InnerSize>
 struct all_unroller
 {
  enum {
-    col = (UnrollCount-1) / Rows,
+    IsRowMajor = (int(Derived::Flags) & int(RowMajor)),
-    row = (UnrollCount-1) % Rows
+    i = (UnrollCount-1) / InnerSize,
    j = (UnrollCount-1) % InnerSize
  };
-  static inline bool run(const Derived &mat)
+  EIGEN_DEVICE_FUNC static inline bool run(const Derived &mat)
  {
-    return all_unroller<Derived, UnrollCount-1, Rows>::run(mat) && mat.coeff(row, col);
+    return all_unroller<Derived, UnrollCount-1, InnerSize>::run(mat) && mat.coeff(IsRowMajor ? i : j, IsRowMajor ? j : i);
  }
 };
-template<typename Derived, int Rows>
+template<typename Derived, int InnerSize>
-struct all_unroller<Derived, 0, Rows>
+struct all_unroller<Derived, 0, InnerSize>
 {
-  static inline bool run(const Derived &/*mat*/) { return true; }
+  EIGEN_DEVICE_FUNC static inline bool run(const Derived &/*mat*/) { return true; }
 };
-template<typename Derived, int Rows>
+template<typename Derived, int InnerSize>
-struct all_unroller<Derived, Dynamic, Rows>
+struct all_unroller<Derived, Dynamic, InnerSize>
 {
-  static inline bool run(const Derived &) { return false; }
+  EIGEN_DEVICE_FUNC static inline bool run(const Derived &) { return false; }
 };
-template<typename Derived, int UnrollCount, int Rows>
+template<typename Derived, int UnrollCount, int InnerSize>
 struct any_unroller
 {
  enum {
-    col = (UnrollCount-1) / Rows,
+    IsRowMajor = (int(Derived::Flags) & int(RowMajor)),
-    row = (UnrollCount-1) % Rows
+    i = (UnrollCount-1) / InnerSize,
    j = (UnrollCount-1) % InnerSize
  };
-  static inline bool run(const Derived &mat)
+  EIGEN_DEVICE_FUNC static inline bool run(const Derived &mat)
  {
-    return any_unroller<Derived, UnrollCount-1, Rows>::run(mat) || mat.coeff(row, col);
+    return any_unroller<Derived, UnrollCount-1, InnerSize>::run(mat) || mat.coeff(IsRowMajor ? i : j, IsRowMajor ? j : i);
  }
 };
-template<typename Derived, int Rows>
+template<typename Derived, int InnerSize>
-struct any_unroller<Derived, 0, Rows>
+struct any_unroller<Derived, 0, InnerSize>
 {
-  static inline bool run(const Derived & /*mat*/) { return false; }
+  EIGEN_DEVICE_FUNC static inline bool run(const Derived & /*mat*/) { return false; }
 };
-template<typename Derived, int Rows>
+template<typename Derived, int InnerSize>
-struct any_unroller<Derived, Dynamic, Rows>
+struct any_unroller<Derived, Dynamic, InnerSize>
 {
-  static inline bool run(const Derived &) { return false; }
+  EIGEN_DEVICE_FUNC static inline bool run(const Derived &) { return false; }
 };
 } // end namespace internal
@@ -81,16 +83,16 @@ EIGEN_DEVICE_FUNC inline bool DenseBase<Derived>::all() const
  typedef internal::evaluator<Derived> Evaluator;
  enum {
    unroll = SizeAtCompileTime != Dynamic
-          && SizeAtCompileTime * (Evaluator::CoeffReadCost + NumTraits<Scalar>::AddCost) <= EIGEN_UNROLLING_LIMIT
+          && SizeAtCompileTime * (int(Evaluator::CoeffReadCost) + int(NumTraits<Scalar>::AddCost)) <= EIGEN_UNROLLING_LIMIT
  };
  Evaluator evaluator(derived());
  if(unroll)
-    return internal::all_unroller<Evaluator, unroll ? int(SizeAtCompileTime) : Dynamic, internal::traits<Derived>::RowsAtCompileTime>::run(evaluator);
+    return internal::all_unroller<Evaluator, unroll ? int(SizeAtCompileTime) : Dynamic, InnerSizeAtCompileTime>::run(evaluator);
  else
  {
-    for(Index j = 0; j < cols(); ++j)
+    for(Index i = 0; i < derived().outerSize(); ++i)
-      for(Index i = 0; i < rows(); ++i)
+      for(Index j = 0; j < derived().innerSize(); ++j)
-        if (!evaluator.coeff(i, j)) return false;
+        if (!evaluator.coeff(IsRowMajor ? i : j, IsRowMajor ? j : i)) return false;
    return true;
  }
 }
@@ -105,16 +107,16 @@ EIGEN_DEVICE_FUNC inline bool DenseBase<Derived>::any() const
  typedef internal::evaluator<Derived> Evaluator;
  enum {
    unroll = SizeAtCompileTime != Dynamic
-          && SizeAtCompileTime * (Evaluator::CoeffReadCost + NumTraits<Scalar>::AddCost) <= EIGEN_UNROLLING_LIMIT
+          && SizeAtCompileTime * (int(Evaluator::CoeffReadCost) + int(NumTraits<Scalar>::AddCost)) <= EIGEN_UNROLLING_LIMIT
  };
  Evaluator evaluator(derived());
  if(unroll)
-    return internal::any_unroller<Evaluator, unroll ? int(SizeAtCompileTime) : Dynamic, internal::traits<Derived>::RowsAtCompileTime>::run(evaluator);
+    return internal::any_unroller<Evaluator, unroll ? int(SizeAtCompileTime) : Dynamic, InnerSizeAtCompileTime>::run(evaluator);
  else
  {
-    for(Index j = 0; j < cols(); ++j)
+    for(Index i = 0; i < derived().outerSize(); ++i)
-      for(Index i = 0; i < rows(); ++i)
+      for(Index j = 0; j < derived().innerSize(); ++j)
-        if (evaluator.coeff(i, j)) return true;
+        if (evaluator.coeff(IsRowMajor ? i : j, IsRowMajor ? j : i)) return true;
    return false;
  }
 }
--- a/Eigen/src/Core/CommaInitializer.h
+++ b/Eigen/src/Core/CommaInitializer.h
@@ -33,6 +33,8 @@ struct CommaInitializer
  inline CommaInitializer(XprType& xpr, const Scalar& s)
    : m_xpr(xpr), m_row(0), m_col(1), m_currentBlockRows(1)
  {
    eigen_assert(m_xpr.rows() > 0 && m_xpr.cols() > 0
      && "Cannot comma-initialize a 0x0 matrix (operator<<)");
    m_xpr.coeffRef(0,0) = s;
  }
@@ -41,6 +43,8 @@ struct CommaInitializer
  inline CommaInitializer(XprType& xpr, const DenseBase<OtherDerived>& other)
    : m_xpr(xpr), m_row(0), m_col(other.cols()), m_currentBlockRows(other.rows())
  {
    eigen_assert(m_xpr.rows() >= other.rows() && m_xpr.cols() >= other.cols()
      && "Cannot comma-initialize a 0x0 matrix (operator<<)");
    m_xpr.block(0, 0, other.rows(), other.cols()) = other;
  }
--- a/Eigen/src/Core/CoreEvaluators.h
+++ b/Eigen/src/Core/CoreEvaluators.h
@@ -143,8 +143,8 @@ public:
 #endif
    eigen_internal_assert(outerStride==OuterStride);
  }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
-  Index outerStride() const { return OuterStride; }
+  Index outerStride() const EIGEN_NOEXCEPT { return OuterStride; }
  const Scalar *data;
 };
@@ -561,7 +561,7 @@ struct unary_evaluator<CwiseUnaryOp<UnaryOp, ArgType>, IndexBased >
  typedef CwiseUnaryOp<UnaryOp, ArgType> XprType;
  enum {
-    CoeffReadCost = evaluator<ArgType>::CoeffReadCost + functor_traits<UnaryOp>::Cost,
+    CoeffReadCost = int(evaluator<ArgType>::CoeffReadCost) + int(functor_traits<UnaryOp>::Cost),
    Flags = evaluator<ArgType>::Flags
          & (HereditaryBits | LinearAccessBit | (functor_traits<UnaryOp>::PacketAccess ? PacketAccessBit : 0)),
@@ -606,13 +606,13 @@ struct unary_evaluator<CwiseUnaryOp<UnaryOp, ArgType>, IndexBased >
 protected:
  // this helper permits to completely eliminate the functor if it is empty
-  class Data : private UnaryOp
+  struct Data
  {
  public:
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    Data(const XprType& xpr) : UnaryOp(xpr.functor()), argImpl(xpr.nestedExpression()) {}
+    Data(const XprType& xpr) : op(xpr.functor()), argImpl(xpr.nestedExpression()) {}
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const UnaryOp& func() const { return static_cast<const UnaryOp&>(*this); }
+    const UnaryOp& func() const { return op; }
    UnaryOp op;
    evaluator<ArgType> argImpl;
  };
@@ -639,7 +639,7 @@ struct ternary_evaluator<CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3>, IndexBased
  typedef CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3> XprType;
  enum {
-    CoeffReadCost = evaluator<Arg1>::CoeffReadCost + evaluator<Arg2>::CoeffReadCost + evaluator<Arg3>::CoeffReadCost + functor_traits<TernaryOp>::Cost,
+    CoeffReadCost = int(evaluator<Arg1>::CoeffReadCost) + int(evaluator<Arg2>::CoeffReadCost) + int(evaluator<Arg3>::CoeffReadCost) + int(functor_traits<TernaryOp>::Cost),
    Arg1Flags = evaluator<Arg1>::Flags,
    Arg2Flags = evaluator<Arg2>::Flags,
@@ -700,12 +700,13 @@ struct ternary_evaluator<CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3>, IndexBased
 protected:
  // this helper permits to completely eliminate the functor if it is empty
-  struct Data : private TernaryOp
+  struct Data
  {
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    Data(const XprType& xpr) : TernaryOp(xpr.functor()), arg1Impl(xpr.arg1()), arg2Impl(xpr.arg2()), arg3Impl(xpr.arg3()) {}
+    Data(const XprType& xpr) : op(xpr.functor()), arg1Impl(xpr.arg1()), arg2Impl(xpr.arg2()), arg3Impl(xpr.arg3()) {}
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const TernaryOp& func() const { return static_cast<const TernaryOp&>(*this); }
+    const TernaryOp& func() const { return op; }
    TernaryOp op;
    evaluator<Arg1> arg1Impl;
    evaluator<Arg2> arg2Impl;
    evaluator<Arg3> arg3Impl;
@@ -735,7 +736,7 @@ struct binary_evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs>, IndexBased, IndexBase
  typedef CwiseBinaryOp<BinaryOp, Lhs, Rhs> XprType;
  enum {
-    CoeffReadCost = evaluator<Lhs>::CoeffReadCost + evaluator<Rhs>::CoeffReadCost + functor_traits<BinaryOp>::Cost,
+    CoeffReadCost = int(evaluator<Lhs>::CoeffReadCost) + int(evaluator<Rhs>::CoeffReadCost) + int(functor_traits<BinaryOp>::Cost),
    LhsFlags = evaluator<Lhs>::Flags,
    RhsFlags = evaluator<Rhs>::Flags,
@@ -793,12 +794,13 @@ struct binary_evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs>, IndexBased, IndexBase
 protected:
  // this helper permits to completely eliminate the functor if it is empty
-  struct Data : private BinaryOp
+  struct Data
  {
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    Data(const XprType& xpr) : BinaryOp(xpr.functor()), lhsImpl(xpr.lhs()), rhsImpl(xpr.rhs()) {}
+    Data(const XprType& xpr) : op(xpr.functor()), lhsImpl(xpr.lhs()), rhsImpl(xpr.rhs()) {}
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const BinaryOp& func() const { return static_cast<const BinaryOp&>(*this); }
+    const BinaryOp& func() const { return op; }
    BinaryOp op;
    evaluator<Lhs> lhsImpl;
    evaluator<Rhs> rhsImpl;
  };
@@ -815,7 +817,7 @@ struct unary_evaluator<CwiseUnaryView<UnaryOp, ArgType>, IndexBased>
  typedef CwiseUnaryView<UnaryOp, ArgType> XprType;
  enum {
-    CoeffReadCost = evaluator<ArgType>::CoeffReadCost + functor_traits<UnaryOp>::Cost,
+    CoeffReadCost = int(evaluator<ArgType>::CoeffReadCost) + int(functor_traits<UnaryOp>::Cost),
    Flags = (evaluator<ArgType>::Flags & (HereditaryBits | LinearAccessBit | DirectAccessBit)),
@@ -858,12 +860,13 @@ struct unary_evaluator<CwiseUnaryView<UnaryOp, ArgType>, IndexBased>
 protected:
  // this helper permits to completely eliminate the functor if it is empty
-  struct Data : private UnaryOp
+  struct Data
  {
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    Data(const XprType& xpr) : UnaryOp(xpr.functor()), argImpl(xpr.nestedExpression()) {}
+    Data(const XprType& xpr) : op(xpr.functor()), argImpl(xpr.nestedExpression()) {}
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const UnaryOp& func() const { return static_cast<const UnaryOp&>(*this); }
+    const UnaryOp& func() const { return op; }
    UnaryOp op;
    evaluator<ArgType> argImpl;
  };
@@ -956,10 +959,14 @@ struct mapbase_evaluator : evaluator_base<Derived>
    internal::pstoret<Scalar, PacketType, StoreMode>(m_data + index * m_innerStride.value(), x);
  }
 protected:
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
-  Index rowStride() const { return XprType::IsRowMajor ? m_outerStride.value() : m_innerStride.value(); }
+  Index rowStride() const EIGEN_NOEXCEPT {
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    return XprType::IsRowMajor ? m_outerStride.value() : m_innerStride.value();
-  Index colStride() const { return XprType::IsRowMajor ? m_innerStride.value() : m_outerStride.value(); }
+  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
  Index colStride() const EIGEN_NOEXCEPT {
     return XprType::IsRowMajor ? m_innerStride.value() : m_outerStride.value();
  }
  PointerType m_data;
  const internal::variable_if_dynamic<Index, XprType::InnerStrideAtCompileTime> m_innerStride;
@@ -1648,8 +1655,10 @@ protected:
  const internal::variable_if_dynamicindex<Index, XprType::DiagIndex> m_index;
 private:
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowOffset() const { return m_index.value() > 0 ? 0 : -m_index.value(); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colOffset() const { return m_index.value() > 0 ? m_index.value() : 0; }
+  Index rowOffset() const { return m_index.value() > 0 ? 0 : -m_index.value(); }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
  Index colOffset() const { return m_index.value() > 0 ? m_index.value() : 0; }
 };
@@ -1686,12 +1695,12 @@ class EvalToTemp
    return m_arg;
  }
-  Index rows() const 
+  EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT
  {
    return m_arg.rows();
  }
-  Index cols() const 
+  EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT
  {
    return m_arg.cols();
  }
--- a/Eigen/src/Core/CwiseBinaryOp.h
+++ b/Eigen/src/Core/CwiseBinaryOp.h
@@ -102,7 +102,7 @@ class CwiseBinaryOp :
 #if EIGEN_COMP_MSVC && EIGEN_HAS_CXX11
    //Required for Visual Studio or the Copy constructor will probably not get inlined!
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    EIGEN_STRONG_INLINE
    CwiseBinaryOp(const CwiseBinaryOp<BinaryOp,LhsType,RhsType>&) = default;
 #endif
@@ -116,21 +116,15 @@ class CwiseBinaryOp :
      eigen_assert(aLhs.rows() == aRhs.rows() && aLhs.cols() == aRhs.cols());
    }
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
-    Index rows() const {
+    Index rows() const EIGEN_NOEXCEPT {
      // return the fixed size type if available to enable compile time optimizations
-      if (internal::traits<typename internal::remove_all<LhsNested>::type>::RowsAtCompileTime==Dynamic)
+      return internal::traits<typename internal::remove_all<LhsNested>::type>::RowsAtCompileTime==Dynamic ? m_rhs.rows() : m_lhs.rows();
        return m_rhs.rows();
      else
        return m_lhs.rows();
    }
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
-    Index cols() const {
+    Index cols() const EIGEN_NOEXCEPT {
      // return the fixed size type if available to enable compile time optimizations
-      if (internal::traits<typename internal::remove_all<LhsNested>::type>::ColsAtCompileTime==Dynamic)
+      return internal::traits<typename internal::remove_all<LhsNested>::type>::ColsAtCompileTime==Dynamic ? m_rhs.cols() : m_lhs.cols();
        return m_rhs.cols();
      else
        return m_lhs.cols();
    }
    /** \returns the left hand side nested expression */
--- a/Eigen/src/Core/CwiseNullaryOp.h
+++ b/Eigen/src/Core/CwiseNullaryOp.h
@@ -74,10 +74,10 @@ class CwiseNullaryOp : public internal::dense_xpr_base< CwiseNullaryOp<NullaryOp
            && (ColsAtCompileTime == Dynamic || ColsAtCompileTime == cols));
    }
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
-    EIGEN_STRONG_INLINE Index rows() const { return m_rows.value(); }
+    Index rows() const { return m_rows.value(); }
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
-    EIGEN_STRONG_INLINE Index cols() const { return m_cols.value(); }
+    Index cols() const { return m_cols.value(); }
    /** \returns the functor representing the nullary operation */
    EIGEN_DEVICE_FUNC
@@ -292,7 +292,7 @@ DenseBase<Derived>::LinSpaced(Index size, const Scalar& low, const Scalar& high)
 }
 /**
-  * \copydoc DenseBase::LinSpaced(Index, const Scalar&, const Scalar&)
+  * \copydoc DenseBase::LinSpaced(Index, const DenseBase::Scalar&, const DenseBase::Scalar&)
  * Special version for fixed size types which does not require the size parameter.
  */
 template<typename Derived>
@@ -383,6 +383,33 @@ PlainObjectBase<Derived>::setConstant(Index rows, Index cols, const Scalar& val)
  return setConstant(val);
 }
 /** Resizes to the given size, changing only the number of columns, and sets all
  * coefficients in this expression to the given value \a val. For the parameter
  * of type NoChange_t, just pass the special value \c NoChange.
  *
  * \sa MatrixBase::setConstant(const Scalar&), setConstant(Index,const Scalar&), class CwiseNullaryOp, MatrixBase::Constant(const Scalar&)
  */
 template<typename Derived>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
 PlainObjectBase<Derived>::setConstant(NoChange_t, Index cols, const Scalar& val)
 {
  return setConstant(rows(), cols, val);
 }
 /** Resizes to the given size, changing only the number of rows, and sets all
  * coefficients in this expression to the given value \a val. For the parameter
  * of type NoChange_t, just pass the special value \c NoChange.
  *
  * \sa MatrixBase::setConstant(const Scalar&), setConstant(Index,const Scalar&), class CwiseNullaryOp, MatrixBase::Constant(const Scalar&)
  */
 template<typename Derived>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
 PlainObjectBase<Derived>::setConstant(Index rows, NoChange_t, const Scalar& val)
 {
  return setConstant(rows, cols(), val);
 }
 /**
  * \brief Sets a linearly spaced vector.
  *
@@ -556,6 +583,32 @@ PlainObjectBase<Derived>::setZero(Index rows, Index cols)
  return setConstant(Scalar(0));
 }
 /** Resizes to the given size, changing only the number of columns, and sets all
  * coefficients in this expression to zero. For the parameter of type NoChange_t,
  * just pass the special value \c NoChange.
  *
  * \sa DenseBase::setZero(), setZero(Index), setZero(Index, Index), setZero(Index, NoChange_t), class CwiseNullaryOp, DenseBase::Zero()
  */
 template<typename Derived>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
 PlainObjectBase<Derived>::setZero(NoChange_t, Index cols)
 {
  return setZero(rows(), cols);
 }
 /** Resizes to the given size, changing only the number of rows, and sets all
  * coefficients in this expression to zero. For the parameter of type NoChange_t,
  * just pass the special value \c NoChange.
  *
  * \sa DenseBase::setZero(), setZero(Index), setZero(Index, Index), setZero(NoChange_t, Index), class CwiseNullaryOp, DenseBase::Zero()
  */
 template<typename Derived>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
 PlainObjectBase<Derived>::setZero(Index rows, NoChange_t)
 {
  return setZero(rows, cols());
 }
 // ones:
 /** \returns an expression of a matrix where all coefficients equal one.
@@ -682,6 +735,32 @@ PlainObjectBase<Derived>::setOnes(Index rows, Index cols)
  return setConstant(Scalar(1));
 }
 /** Resizes to the given size, changing only the number of rows, and sets all
  * coefficients in this expression to one. For the parameter of type NoChange_t,
  * just pass the special value \c NoChange.
  *
 * \sa MatrixBase::setOnes(), setOnes(Index), setOnes(Index, Index), setOnes(NoChange_t, Index), class CwiseNullaryOp, MatrixBase::Ones()
  */
 template<typename Derived>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
 PlainObjectBase<Derived>::setOnes(Index rows, NoChange_t)
 {
  return setOnes(rows, cols());
 }
 /** Resizes to the given size, changing only the number of columns, and sets all
  * coefficients in this expression to one. For the parameter of type NoChange_t,
  * just pass the special value \c NoChange.
  *
 * \sa MatrixBase::setOnes(), setOnes(Index), setOnes(Index, Index), setOnes(Index, NoChange_t), class CwiseNullaryOp, MatrixBase::Ones()
  */
 template<typename Derived>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
 PlainObjectBase<Derived>::setOnes(NoChange_t, Index cols)
 {
  return setOnes(rows(), cols);
 }
 // Identity:
 /** \returns an expression of the identity matrix (not necessarily square).
--- a/Eigen/src/Core/CwiseUnaryOp.h
+++ b/Eigen/src/Core/CwiseUnaryOp.h
@@ -65,10 +65,10 @@ class CwiseUnaryOp : public CwiseUnaryOpImpl<UnaryOp, XprType, typename internal
    explicit CwiseUnaryOp(const XprType& xpr, const UnaryOp& func = UnaryOp())
      : m_xpr(xpr), m_functor(func) {}
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
-    Index rows() const { return m_xpr.rows(); }
+    Index rows() const EIGEN_NOEXCEPT { return m_xpr.rows(); }
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
-    Index cols() const { return m_xpr.cols(); }
+    Index cols() const EIGEN_NOEXCEPT { return m_xpr.cols(); }
    /** \returns the functor representing the unary operation */
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
--- a/Eigen/src/Core/CwiseUnaryView.h
+++ b/Eigen/src/Core/CwiseUnaryView.h
@@ -64,23 +64,25 @@ class CwiseUnaryView : public CwiseUnaryViewImpl<ViewOp, MatrixType, typename in
    typedef typename internal::ref_selector<MatrixType>::non_const_type MatrixTypeNested;
    typedef typename internal::remove_all<MatrixType>::type NestedExpression;
-    explicit inline CwiseUnaryView(MatrixType& mat, const ViewOp& func = ViewOp())
+    explicit EIGEN_DEVICE_FUNC inline CwiseUnaryView(MatrixType& mat, const ViewOp& func = ViewOp())
      : m_matrix(mat), m_functor(func) {}
    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(CwiseUnaryView)
-    EIGEN_STRONG_INLINE Index rows() const { return m_matrix.rows(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
-    EIGEN_STRONG_INLINE Index cols() const { return m_matrix.cols(); }
+    Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows(); }
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
    Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); }
    /** \returns the functor representing unary operation */
-    const ViewOp& functor() const { return m_functor; }
+    EIGEN_DEVICE_FUNC const ViewOp& functor() const { return m_functor; }
    /** \returns the nested expression */
-    const typename internal::remove_all<MatrixTypeNested>::type&
+    EIGEN_DEVICE_FUNC const typename internal::remove_all<MatrixTypeNested>::type&
    nestedExpression() const { return m_matrix; }
    /** \returns the nested expression */
-    typename internal::remove_reference<MatrixTypeNested>::type&
+    EIGEN_DEVICE_FUNC typename internal::remove_reference<MatrixTypeNested>::type&
    nestedExpression() { return m_matrix; }
  protected:
@@ -112,15 +114,17 @@ class CwiseUnaryViewImpl<ViewOp,MatrixType,Dense>
    EIGEN_DEVICE_FUNC inline Scalar* data() { return &(this->coeffRef(0)); }
    EIGEN_DEVICE_FUNC inline const Scalar* data() const { return &(this->coeff(0)); }
-    EIGEN_DEVICE_FUNC inline Index innerStride() const
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index innerStride() const
    {
      return derived().nestedExpression().innerStride() * sizeof(typename internal::traits<MatrixType>::Scalar) / sizeof(Scalar);
    }
-    EIGEN_DEVICE_FUNC inline Index outerStride() const
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outerStride() const
    {
      return derived().nestedExpression().outerStride() * sizeof(typename internal::traits<MatrixType>::Scalar) / sizeof(Scalar);
    }
  protected:
    EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(CwiseUnaryViewImpl)
 };
 } // end namespace Eigen
--- a/Eigen/src/Core/DenseBase.h
+++ b/Eigen/src/Core/DenseBase.h
@@ -18,7 +18,7 @@ namespace internal {
 // The index type defined by EIGEN_DEFAULT_DENSE_INDEX_TYPE must be a signed type.
 // This dummy function simply aims at checking that at compile time.
 static inline void check_DenseIndex_is_signed() {
-  EIGEN_STATIC_ASSERT(NumTraits<DenseIndex>::IsSigned,THE_INDEX_TYPE_MUST_BE_A_SIGNED_TYPE); 
+  EIGEN_STATIC_ASSERT(NumTraits<DenseIndex>::IsSigned,THE_INDEX_TYPE_MUST_BE_A_SIGNED_TYPE)
 }
 } // end namespace internal
@@ -211,7 +211,7 @@ template<typename Derived> class DenseBase
    /** \returns the number of nonzero coefficients which is in practice the number
      * of stored coefficients. */
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
    inline Index nonZeros() const { return size(); }
    /** \returns the outer size.
@@ -219,7 +219,7 @@ template<typename Derived> class DenseBase
      * \note For a vector, this returns just 1. For a matrix (non-vector), this is the major dimension
      * with respect to the \ref TopicStorageOrders "storage order", i.e., the number of columns for a
      * column-major matrix, and the number of rows for a row-major matrix. */
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
    Index outerSize() const
    {
      return IsVectorAtCompileTime ? 1
@@ -231,7 +231,7 @@ template<typename Derived> class DenseBase
      * \note For a vector, this is just the size. For a matrix (non-vector), this is the minor dimension
      * with respect to the \ref TopicStorageOrders "storage order", i.e., the number of rows for a
      * column-major matrix, and the number of columns for a row-major matrix. */
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
    Index innerSize() const
    {
      return IsVectorAtCompileTime ? this->size()
@@ -324,9 +324,9 @@ template<typename Derived> class DenseBase
    typedef Transpose<Derived> TransposeReturnType;
    EIGEN_DEVICE_FUNC
    TransposeReturnType transpose();
-    typedef typename internal::add_const<Transpose<const Derived> >::type ConstTransposeReturnType;
+    typedef Transpose<const Derived> ConstTransposeReturnType;
    EIGEN_DEVICE_FUNC
-    ConstTransposeReturnType transpose() const;
+    const ConstTransposeReturnType transpose() const;
    EIGEN_DEVICE_FUNC
    void transposeInPlace();
@@ -449,18 +449,58 @@ template<typename Derived> class DenseBase
    EIGEN_DEVICE_FUNC Scalar prod() const;
    template<int NaNPropagation>
    EIGEN_DEVICE_FUNC typename internal::traits<Derived>::Scalar minCoeff() const;
    template<int NaNPropagation>
    EIGEN_DEVICE_FUNC typename internal::traits<Derived>::Scalar maxCoeff() const;
-    template<typename IndexType> EIGEN_DEVICE_FUNC
+
    // By default, the fastest version with undefined NaN propagation semantics is
    // used.
    // TODO(rmlarsen): Replace with default template argument when we move to
    // c++11 or beyond.
    EIGEN_DEVICE_FUNC inline typename internal::traits<Derived>::Scalar minCoeff() const {
      return minCoeff<PropagateFast>();
    }
    EIGEN_DEVICE_FUNC inline typename internal::traits<Derived>::Scalar maxCoeff() const {
      return maxCoeff<PropagateFast>();
    }
    template<int NaNPropagation, typename IndexType>
    EIGEN_DEVICE_FUNC
    typename internal::traits<Derived>::Scalar minCoeff(IndexType* row, IndexType* col) const;
-    template<typename IndexType> EIGEN_DEVICE_FUNC
+    template<int NaNPropagation, typename IndexType>
    EIGEN_DEVICE_FUNC
    typename internal::traits<Derived>::Scalar maxCoeff(IndexType* row, IndexType* col) const;
-    template<typename IndexType> EIGEN_DEVICE_FUNC
+    template<int NaNPropagation, typename IndexType>
    EIGEN_DEVICE_FUNC
    typename internal::traits<Derived>::Scalar minCoeff(IndexType* index) const;
-    template<typename IndexType> EIGEN_DEVICE_FUNC
+    template<int NaNPropagation, typename IndexType>
    EIGEN_DEVICE_FUNC
    typename internal::traits<Derived>::Scalar maxCoeff(IndexType* index) const;
    // TODO(rmlarsen): Replace these methods with a default template argument.
    template<typename IndexType>
    EIGEN_DEVICE_FUNC inline
    typename internal::traits<Derived>::Scalar minCoeff(IndexType* row, IndexType* col) const {
      return minCoeff<PropagateFast>(row, col);
    }
    template<typename IndexType>
    EIGEN_DEVICE_FUNC inline
    typename internal::traits<Derived>::Scalar maxCoeff(IndexType* row, IndexType* col) const {
      return maxCoeff<PropagateFast>(row, col);
    }
    template<typename IndexType>
     EIGEN_DEVICE_FUNC inline
    typename internal::traits<Derived>::Scalar minCoeff(IndexType* index) const {
      return minCoeff<PropagateFast>(index);
    }
    template<typename IndexType>
    EIGEN_DEVICE_FUNC inline
    typename internal::traits<Derived>::Scalar maxCoeff(IndexType* index) const {
      return maxCoeff<PropagateFast>(index);
    }
    template<typename BinaryOp>
    EIGEN_DEVICE_FUNC
    Scalar redux(const BinaryOp& func) const;
@@ -530,16 +570,16 @@ template<typename Derived> class DenseBase
    static const RandomReturnType Random();
    template<typename ThenDerived,typename ElseDerived>
-    const Select<Derived,ThenDerived,ElseDerived>
+    inline EIGEN_DEVICE_FUNC const Select<Derived,ThenDerived,ElseDerived>
    select(const DenseBase<ThenDerived>& thenMatrix,
           const DenseBase<ElseDerived>& elseMatrix) const;
    template<typename ThenDerived>
-    inline const Select<Derived,ThenDerived, typename ThenDerived::ConstantReturnType>
+    inline EIGEN_DEVICE_FUNC const Select<Derived,ThenDerived, typename ThenDerived::ConstantReturnType>
    select(const DenseBase<ThenDerived>& thenMatrix, const typename ThenDerived::Scalar& elseScalar) const;
    template<typename ElseDerived>
-    inline const Select<Derived, typename ElseDerived::ConstantReturnType, ElseDerived >
+    inline EIGEN_DEVICE_FUNC const Select<Derived, typename ElseDerived::ConstantReturnType, ElseDerived >
    select(const typename ElseDerived::Scalar& thenScalar, const DenseBase<ElseDerived>& elseMatrix) const;
    template<int p> RealScalar lpNorm() const;
@@ -636,11 +676,12 @@ template<typename Derived> class DenseBase
    }
  protected:
    EIGEN_DEFAULT_COPY_CONSTRUCTOR(DenseBase)
    /** Default constructor. Do nothing. */
    EIGEN_DEVICE_FUNC DenseBase()
    {
      /* Just checks for self-consistency of the flags.
-       * Only do it when debugging Eigen, as this borders on paranoiac and could slow compilation down
+       * Only do it when debugging Eigen, as this borders on paranoia and could slow compilation down
       */
 #ifdef EIGEN_INTERNAL_DEBUGGING
      EIGEN_STATIC_ASSERT((EIGEN_IMPLIES(MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1, int(IsRowMajor))
--- a/Eigen/src/Core/DenseCoeffsBase.h
+++ b/Eigen/src/Core/DenseCoeffsBase.h
@@ -495,7 +495,7 @@ class DenseCoeffsBase<Derived, DirectAccessors> : public DenseCoeffsBase<Derived
      *
      * \sa outerStride(), rowStride(), colStride()
      */
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
    inline Index innerStride() const
    {
      return derived().innerStride();
@@ -506,14 +506,14 @@ class DenseCoeffsBase<Derived, DirectAccessors> : public DenseCoeffsBase<Derived
      *
      * \sa innerStride(), rowStride(), colStride()
      */
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
    inline Index outerStride() const
    {
      return derived().outerStride();
    }
    // FIXME shall we remove it ?
-    inline Index stride() const
+    EIGEN_CONSTEXPR inline Index stride() const
    {
      return Derived::IsVectorAtCompileTime ? innerStride() : outerStride();
    }
@@ -522,7 +522,7 @@ class DenseCoeffsBase<Derived, DirectAccessors> : public DenseCoeffsBase<Derived
      *
      * \sa innerStride(), outerStride(), colStride()
      */
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
    inline Index rowStride() const
    {
      return Derived::IsRowMajor ? outerStride() : innerStride();
@@ -532,7 +532,7 @@ class DenseCoeffsBase<Derived, DirectAccessors> : public DenseCoeffsBase<Derived
      *
      * \sa innerStride(), outerStride(), rowStride()
      */
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
    inline Index colStride() const
    {
      return Derived::IsRowMajor ? innerStride() : outerStride();
@@ -570,8 +570,8 @@ class DenseCoeffsBase<Derived, DirectWriteAccessors>
      *
      * \sa outerStride(), rowStride(), colStride()
      */
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
-    inline Index innerStride() const
+    inline Index innerStride() const EIGEN_NOEXCEPT
    {
      return derived().innerStride();
    }
@@ -581,14 +581,14 @@ class DenseCoeffsBase<Derived, DirectWriteAccessors>
      *
      * \sa innerStride(), rowStride(), colStride()
      */
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
-    inline Index outerStride() const
+    inline Index outerStride() const EIGEN_NOEXCEPT
    {
      return derived().outerStride();
    }
    // FIXME shall we remove it ?
-    inline Index stride() const
+    EIGEN_CONSTEXPR inline Index stride() const EIGEN_NOEXCEPT
    {
      return Derived::IsVectorAtCompileTime ? innerStride() : outerStride();
    }
@@ -597,8 +597,8 @@ class DenseCoeffsBase<Derived, DirectWriteAccessors>
      *
      * \sa innerStride(), outerStride(), colStride()
      */
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
-    inline Index rowStride() const
+    inline Index rowStride() const EIGEN_NOEXCEPT
    {
      return Derived::IsRowMajor ? outerStride() : innerStride();
    }
@@ -607,8 +607,8 @@ class DenseCoeffsBase<Derived, DirectWriteAccessors>
      *
      * \sa innerStride(), outerStride(), rowStride()
      */
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
-    inline Index colStride() const
+    inline Index colStride() const EIGEN_NOEXCEPT
    {
      return Derived::IsRowMajor ? innerStride() : outerStride();
    }
@@ -619,7 +619,7 @@ namespace internal {
 template<int Alignment, typename Derived, bool JustReturnZero>
 struct first_aligned_impl
 {
-  static inline Index run(const Derived&)
+  static EIGEN_CONSTEXPR inline Index run(const Derived&) EIGEN_NOEXCEPT
  { return 0; }
 };
--- a/Eigen/src/Core/DenseStorage.h
+++ b/Eigen/src/Core/DenseStorage.h
@@ -163,6 +163,30 @@ struct plain_array<T, 0, MatrixOrArrayOptions, Alignment>
  EIGEN_DEVICE_FUNC plain_array(constructor_without_unaligned_array_assert) {}
 };
 struct plain_array_helper {
  template<typename T, int Size, int MatrixOrArrayOptions, int Alignment>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  static void copy(const plain_array<T, Size, MatrixOrArrayOptions, Alignment>& src, const Eigen::Index size,
                         plain_array<T, Size, MatrixOrArrayOptions, Alignment>& dst) {
    smart_copy(src.array, src.array + size, dst.array);
  }
  template<typename T, int Size, int MatrixOrArrayOptions, int Alignment>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  static void swap(plain_array<T, Size, MatrixOrArrayOptions, Alignment>& a, const Eigen::Index a_size,
                   plain_array<T, Size, MatrixOrArrayOptions, Alignment>& b, const Eigen::Index b_size) {
    if (a_size < b_size) {
      std::swap_ranges(b.array, b.array + a_size, a.array);
      smart_move(b.array + a_size, b.array + b_size, a.array + a_size);
    } else if (a_size > b_size) {
      std::swap_ranges(a.array, a.array + b_size, b.array);
      smart_move(a.array + b_size, a.array + a_size, b.array + b_size);
    } else {
      std::swap_ranges(a.array, a.array + a_size, b.array);
    }
  }
 };
 } // end namespace internal
 /** \internal
@@ -190,16 +214,41 @@ template<typename T, int Size, int _Rows, int _Cols, int _Options> class DenseSt
    EIGEN_DEVICE_FUNC
    explicit DenseStorage(internal::constructor_without_unaligned_array_assert)
      : m_data(internal::constructor_without_unaligned_array_assert()) {}
+#if !EIGEN_HAS_CXX11 || defined(EIGEN_DENSE_STORAGE_CTOR_PLUGIN)
    EIGEN_DEVICE_FUNC
    DenseStorage(const DenseStorage& other) : m_data(other.m_data) {
      EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = Size)
    }
+#else
+    EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage&) = default;
+#endif
+#if !EIGEN_HAS_CXX11
    EIGEN_DEVICE_FUNC
    DenseStorage& operator=(const DenseStorage& other)
    {
      if (this != &other) m_data = other.m_data;
      return *this;
    }
+#else
+    EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage&) = default;
+#endif
+#if EIGEN_HAS_RVALUE_REFERENCES
+#if !EIGEN_HAS_CXX11
+    EIGEN_DEVICE_FUNC DenseStorage(DenseStorage&& other) EIGEN_NOEXCEPT
+      : m_data(std::move(other.m_data))
+    {
+    }
+    EIGEN_DEVICE_FUNC DenseStorage& operator=(DenseStorage&& other) EIGEN_NOEXCEPT
+    {
+      if (this != &other)
+        m_data = std::move(other.m_data);
+      return *this;
+    }
+#else
+    EIGEN_DEVICE_FUNC DenseStorage(DenseStorage&&) = default;
+    EIGEN_DEVICE_FUNC DenseStorage& operator=(DenseStorage&&) = default;
+#endif
+#endif
    EIGEN_DEVICE_FUNC DenseStorage(Index size, Index rows, Index cols) {
      EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
      eigen_internal_assert(size==rows*cols && rows==_Rows && cols==_Cols);
@@ -210,8 +259,8 @@ template<typename T, int Size, int _Rows, int _Cols, int _Options> class DenseSt
    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) {
      numext::swap(m_data, other.m_data);
    }
-    EIGEN_DEVICE_FUNC static Index rows(void) {return _Rows;}
+    EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR Index rows(void) EIGEN_NOEXCEPT {return _Rows;}
-    EIGEN_DEVICE_FUNC static Index cols(void) {return _Cols;}
+    EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR Index cols(void) EIGEN_NOEXCEPT {return _Cols;}
    EIGEN_DEVICE_FUNC void conservativeResize(Index,Index,Index) {}
    EIGEN_DEVICE_FUNC void resize(Index,Index,Index) {}
    EIGEN_DEVICE_FUNC const T *data() const { return m_data.array; }
@@ -228,8 +277,8 @@ template<typename T, int _Rows, int _Cols, int _Options> class DenseStorage<T, 0
    EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage&) { return *this; }
    EIGEN_DEVICE_FUNC DenseStorage(Index,Index,Index) {}
    EIGEN_DEVICE_FUNC void swap(DenseStorage& ) {}
-    EIGEN_DEVICE_FUNC static Index rows(void) {return _Rows;}
+    EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR Index rows(void) EIGEN_NOEXCEPT {return _Rows;}
-    EIGEN_DEVICE_FUNC static Index cols(void) {return _Cols;}
+    EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR Index cols(void) EIGEN_NOEXCEPT {return _Cols;}
    EIGEN_DEVICE_FUNC void conservativeResize(Index,Index,Index) {}
    EIGEN_DEVICE_FUNC void resize(Index,Index,Index) {}
    EIGEN_DEVICE_FUNC const T *data() const { return 0; }
@@ -256,21 +305,25 @@ template<typename T, int Size, int _Options> class DenseStorage<T, Size, Dynamic
    EIGEN_DEVICE_FUNC DenseStorage() : m_rows(0), m_cols(0) {}
    EIGEN_DEVICE_FUNC explicit DenseStorage(internal::constructor_without_unaligned_array_assert)
      : m_data(internal::constructor_without_unaligned_array_assert()), m_rows(0), m_cols(0) {}
-    EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other) : m_data(other.m_data), m_rows(other.m_rows), m_cols(other.m_cols) {}
+    EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other)
+      : m_data(internal::constructor_without_unaligned_array_assert()), m_rows(other.m_rows), m_cols(other.m_cols)
+    {
+      internal::plain_array_helper::copy(other.m_data, m_rows * m_cols, m_data);
+    }
    EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other)
    {
      if (this != &other)
      {
-        m_data = other.m_data;
        m_rows = other.m_rows;
        m_cols = other.m_cols;
+        internal::plain_array_helper::copy(other.m_data, m_rows * m_cols, m_data);
      }
      return *this;
    }
    EIGEN_DEVICE_FUNC DenseStorage(Index, Index rows, Index cols) : m_rows(rows), m_cols(cols) {}
    EIGEN_DEVICE_FUNC void swap(DenseStorage& other)
    {
-      numext::swap(m_data,other.m_data);
+      internal::plain_array_helper::swap(m_data, m_rows * m_cols, other.m_data, other.m_rows * other.m_cols);
      numext::swap(m_rows,other.m_rows);
      numext::swap(m_cols,other.m_cols);
    }
@@ -291,24 +344,29 @@ template<typename T, int Size, int _Cols, int _Options> class DenseStorage<T, Si
    EIGEN_DEVICE_FUNC DenseStorage() : m_rows(0) {}
    EIGEN_DEVICE_FUNC explicit DenseStorage(internal::constructor_without_unaligned_array_assert)
      : m_data(internal::constructor_without_unaligned_array_assert()), m_rows(0) {}
-    EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other) : m_data(other.m_data), m_rows(other.m_rows) {}
+    EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other)
+      : m_data(internal::constructor_without_unaligned_array_assert()), m_rows(other.m_rows)
+    {
+      internal::plain_array_helper::copy(other.m_data, m_rows * _Cols, m_data);
+    }
    EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other)
    {
      if (this != &other)
      {
-        m_data = other.m_data;
        m_rows = other.m_rows;
+        internal::plain_array_helper::copy(other.m_data, m_rows * _Cols, m_data);
      }
      return *this;
    }
    EIGEN_DEVICE_FUNC DenseStorage(Index, Index rows, Index) : m_rows(rows) {}
    EIGEN_DEVICE_FUNC void swap(DenseStorage& other)
    { 
-      numext::swap(m_data,other.m_data);
+      internal::plain_array_helper::swap(m_data, m_rows * _Cols, other.m_data, other.m_rows * _Cols);
      numext::swap(m_rows, other.m_rows);
    }
-    EIGEN_DEVICE_FUNC Index rows(void) const {return m_rows;}
+    EIGEN_DEVICE_FUNC Index rows(void) const EIGEN_NOEXCEPT {return m_rows;}
-    EIGEN_DEVICE_FUNC Index cols(void) const {return _Cols;}
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols(void) const EIGEN_NOEXCEPT {return _Cols;}
    EIGEN_DEVICE_FUNC void conservativeResize(Index, Index rows, Index) { m_rows = rows; }
    EIGEN_DEVICE_FUNC void resize(Index, Index rows, Index) { m_rows = rows; }
    EIGEN_DEVICE_FUNC const T *data() const { return m_data.array; }
@@ -324,23 +382,27 @@ template<typename T, int Size, int _Rows, int _Options> class DenseStorage<T, Si
    EIGEN_DEVICE_FUNC DenseStorage() : m_cols(0) {}
    EIGEN_DEVICE_FUNC explicit DenseStorage(internal::constructor_without_unaligned_array_assert)
      : m_data(internal::constructor_without_unaligned_array_assert()), m_cols(0) {}
-    EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other) : m_data(other.m_data), m_cols(other.m_cols) {}
+    EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other) 
+      : m_data(internal::constructor_without_unaligned_array_assert()), m_cols(other.m_cols)
+    {
+      internal::plain_array_helper::copy(other.m_data, _Rows * m_cols, m_data);
+    }
    EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other)
    {
      if (this != &other)
      {
-        m_data = other.m_data;
        m_cols = other.m_cols;
+        internal::plain_array_helper::copy(other.m_data, _Rows * m_cols, m_data);
      }
      return *this;
    }
    EIGEN_DEVICE_FUNC DenseStorage(Index, Index, Index cols) : m_cols(cols) {}
    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) {
-      numext::swap(m_data,other.m_data);
+      internal::plain_array_helper::swap(m_data, _Rows * m_cols, other.m_data, _Rows * other.m_cols);
      numext::swap(m_cols, other.m_cols);
    }
-    EIGEN_DEVICE_FUNC Index rows(void) const {return _Rows;}
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows(void) const EIGEN_NOEXCEPT {return _Rows;}
-    EIGEN_DEVICE_FUNC Index cols(void) const {return m_cols;}
+    EIGEN_DEVICE_FUNC Index cols(void) const EIGEN_NOEXCEPT {return m_cols;}
    EIGEN_DEVICE_FUNC void conservativeResize(Index, Index, Index cols) { m_cols = cols; }
    EIGEN_DEVICE_FUNC void resize(Index, Index, Index cols) { m_cols = cols; }
    EIGEN_DEVICE_FUNC const T *data() const { return m_data.array; }
@@ -407,8 +469,8 @@ template<typename T, int _Options> class DenseStorage<T, Dynamic, Dynamic, Dynam
      numext::swap(m_rows,other.m_rows);
      numext::swap(m_cols,other.m_cols);
    }
-    EIGEN_DEVICE_FUNC Index rows(void) const {return m_rows;}
+    EIGEN_DEVICE_FUNC Index rows(void) const EIGEN_NOEXCEPT {return m_rows;}
-    EIGEN_DEVICE_FUNC Index cols(void) const {return m_cols;}
+    EIGEN_DEVICE_FUNC Index cols(void) const EIGEN_NOEXCEPT {return m_cols;}
    void conservativeResize(Index size, Index rows, Index cols)
    {
      m_data = internal::conditional_aligned_realloc_new_auto<T,(_Options&DontAlign)==0>(m_data, size, m_rows*m_cols);
@@ -485,8 +547,8 @@ template<typename T, int _Rows, int _Options> class DenseStorage<T, Dynamic, _Ro
      numext::swap(m_data,other.m_data);
      numext::swap(m_cols,other.m_cols);
    }
-    EIGEN_DEVICE_FUNC static Index rows(void) {return _Rows;}
+    EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR Index rows(void) EIGEN_NOEXCEPT {return _Rows;}
-    EIGEN_DEVICE_FUNC Index cols(void) const {return m_cols;}
+    EIGEN_DEVICE_FUNC Index cols(void) const EIGEN_NOEXCEPT {return m_cols;}
    EIGEN_DEVICE_FUNC void conservativeResize(Index size, Index, Index cols)
    {
      m_data = internal::conditional_aligned_realloc_new_auto<T,(_Options&DontAlign)==0>(m_data, size, _Rows*m_cols);
@@ -561,8 +623,8 @@ template<typename T, int _Cols, int _Options> class DenseStorage<T, Dynamic, Dyn
      numext::swap(m_data,other.m_data);
      numext::swap(m_rows,other.m_rows);
    }
-    EIGEN_DEVICE_FUNC Index rows(void) const {return m_rows;}
+    EIGEN_DEVICE_FUNC Index rows(void) const EIGEN_NOEXCEPT {return m_rows;}
-    EIGEN_DEVICE_FUNC static Index cols(void) {return _Cols;}
+    EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR Index cols(void) {return _Cols;}
    void conservativeResize(Index size, Index rows, Index)
    {
      m_data = internal::conditional_aligned_realloc_new_auto<T,(_Options&DontAlign)==0>(m_data, size, m_rows*_Cols);
--- a/Eigen/src/Core/Diagonal.h
+++ b/Eigen/src/Core/Diagonal.h
@@ -84,20 +84,16 @@ template<typename MatrixType, int _DiagIndex> class Diagonal
                               : numext::mini<Index>(m_matrix.rows(),m_matrix.cols()-m_index.value());
    }
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
-    inline Index cols() const { return 1; }
+    inline Index cols() const EIGEN_NOEXCEPT { return 1; }
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
-    inline Index innerStride() const
+    inline Index innerStride() const EIGEN_NOEXCEPT {
-    {
      return m_matrix.outerStride() + 1;
    }
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
-    inline Index outerStride() const
+    inline Index outerStride() const EIGEN_NOEXCEPT { return 0; }
-    {
-      return 0;
-    }
    typedef typename internal::conditional<
                       internal::is_lvalue<MatrixType>::value,
@@ -167,12 +163,12 @@ template<typename MatrixType, int _DiagIndex> class Diagonal
  private:
    // some compilers may fail to optimize std::max etc in case of compile-time constants...
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
-    EIGEN_STRONG_INLINE Index absDiagIndex() const { return m_index.value()>0 ? m_index.value() : -m_index.value(); }
+    Index absDiagIndex() const EIGEN_NOEXCEPT { return m_index.value()>0 ? m_index.value() : -m_index.value(); }
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
-    EIGEN_STRONG_INLINE Index rowOffset() const { return m_index.value()>0 ? 0 : -m_index.value(); }
+    Index rowOffset() const EIGEN_NOEXCEPT { return m_index.value()>0 ? 0 : -m_index.value(); }
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
-    EIGEN_STRONG_INLINE Index colOffset() const { return m_index.value()>0 ? m_index.value() : 0; }
+    Index colOffset() const EIGEN_NOEXCEPT { return m_index.value()>0 ? m_index.value() : 0; }
    // trigger a compile-time error if someone try to call packet
    template<int LoadMode> typename MatrixType::PacketReturnType packet(Index) const;
    template<int LoadMode> typename MatrixType::PacketReturnType packet(Index,Index) const;
@@ -195,7 +191,8 @@ MatrixBase<Derived>::diagonal()
 /** This is the const version of diagonal(). */
 template<typename Derived>
-EIGEN_DEVICE_FUNC inline typename MatrixBase<Derived>::ConstDiagonalReturnType
+EIGEN_DEVICE_FUNC inline
+const typename MatrixBase<Derived>::ConstDiagonalReturnType
 MatrixBase<Derived>::diagonal() const
 {
  return ConstDiagonalReturnType(derived());
@@ -213,18 +210,18 @@ MatrixBase<Derived>::diagonal() const
  *
  * \sa MatrixBase::diagonal(), class Diagonal */
 template<typename Derived>
-EIGEN_DEVICE_FUNC inline typename MatrixBase<Derived>::DiagonalDynamicIndexReturnType
+EIGEN_DEVICE_FUNC inline Diagonal<Derived, DynamicIndex>
 MatrixBase<Derived>::diagonal(Index index)
 {
-  return DiagonalDynamicIndexReturnType(derived(), index);
+  return Diagonal<Derived, DynamicIndex>(derived(), index);
 }
 /** This is the const version of diagonal(Index). */
 template<typename Derived>
-EIGEN_DEVICE_FUNC inline typename MatrixBase<Derived>::ConstDiagonalDynamicIndexReturnType
+EIGEN_DEVICE_FUNC inline const Diagonal<const Derived, DynamicIndex>
 MatrixBase<Derived>::diagonal(Index index) const
 {
-  return ConstDiagonalDynamicIndexReturnType(derived(), index);
+  return Diagonal<const Derived, DynamicIndex>(derived(), index);
 }
 /** \returns an expression of the \a DiagIndex-th sub or super diagonal of the matrix \c *this
@@ -241,20 +238,20 @@ MatrixBase<Derived>::diagonal(Index index) const
 template<typename Derived>
 template<int Index_>
 EIGEN_DEVICE_FUNC
-inline typename MatrixBase<Derived>::template DiagonalIndexReturnType<Index_>::Type
+inline Diagonal<Derived, Index_>
 MatrixBase<Derived>::diagonal()
 {
-  return typename DiagonalIndexReturnType<Index_>::Type(derived());
+  return Diagonal<Derived, Index_>(derived());
 }
 /** This is the const version of diagonal<int>(). */
 template<typename Derived>
 template<int Index_>
 EIGEN_DEVICE_FUNC
-inline typename MatrixBase<Derived>::template ConstDiagonalIndexReturnType<Index_>::Type
+inline const Diagonal<const Derived, Index_>
 MatrixBase<Derived>::diagonal() const
 {
-  return typename ConstDiagonalIndexReturnType<Index_>::Type(derived());
+  return  Diagonal<const Derived, Index_>(derived());
 }
 } // end namespace Eigen
--- a/Eigen/src/Core/Dot.h
+++ b/Eigen/src/Core/Dot.h
@@ -18,14 +18,9 @@ namespace internal {
 // with mismatched types, the compiler emits errors about failing to instantiate cwiseProduct BEFORE
 // looking at the static assertions. Thus this is a trick to get better compile errors.
 template<typename T, typename U,
-// the NeedToTranspose condition here is taken straight from Assign.h
+         bool NeedToTranspose = T::IsVectorAtCompileTime && U::IsVectorAtCompileTime &&
-         bool NeedToTranspose = T::IsVectorAtCompileTime
+                ((int(T::RowsAtCompileTime) == 1 && int(U::ColsAtCompileTime) == 1) ||
-                && U::IsVectorAtCompileTime
+                 (int(T::ColsAtCompileTime) == 1 && int(U::RowsAtCompileTime) == 1))>
-                && ((int(T::RowsAtCompileTime) == 1 && int(U::ColsAtCompileTime) == 1)
-                      |  // FIXME | instead of || to please GCC 4.4.0 stupid warning "suggest parentheses around &&".
-                         // revert to || as soon as not needed anymore.
-                    (int(T::ColsAtCompileTime) == 1 && int(U::RowsAtCompileTime) == 1))
->
 struct dot_nocheck
 {
  typedef scalar_conj_product_op<typename traits<T>::Scalar,typename traits<U>::Scalar> conj_prod;
@@ -86,7 +81,7 @@ MatrixBase<Derived>::dot(const MatrixBase<OtherDerived>& other) const
 //---------- implementation of L2 norm and related functions ----------
-/** \returns, for vectors, the squared \em l2 norm of \c *this, and for matrices the Frobenius norm.
+/** \returns, for vectors, the squared \em l2 norm of \c *this, and for matrices the squared Frobenius norm.
  * In both cases, it consists in the sum of the square of all the matrix entries.
  * For vectors, this is also equals to the dot product of \c *this with itself.
  *
@@ -207,7 +202,7 @@ struct lpNorm_selector
  EIGEN_DEVICE_FUNC
  static inline RealScalar run(const MatrixBase<Derived>& m)
  {
-    EIGEN_USING_STD_MATH(pow)
+    EIGEN_USING_STD(pow)
    return pow(m.cwiseAbs().array().pow(p).sum(), RealScalar(1)/p);
  }
 };
--- a/Eigen/src/Core/EigenBase.h
+++ b/Eigen/src/Core/EigenBase.h
@@ -56,15 +56,15 @@ template<typename Derived> struct EigenBase
  { return *static_cast<const Derived*>(this); }
  /** \returns the number of rows. \sa cols(), RowsAtCompileTime */
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
-  inline Index rows() const { return derived().rows(); }
+  inline Index rows() const EIGEN_NOEXCEPT { return derived().rows(); }
  /** \returns the number of columns. \sa rows(), ColsAtCompileTime*/
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
-  inline Index cols() const { return derived().cols(); }
+  inline Index cols() const EIGEN_NOEXCEPT { return derived().cols(); }
  /** \returns the number of coefficients, which is rows()*cols().
    * \sa rows(), cols(), SizeAtCompileTime. */
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
-  inline Index size() const { return rows() * cols(); }
+  inline Index size() const EIGEN_NOEXCEPT { return rows() * cols(); }
  /** \internal Don't use it, but do the equivalent: \code dst = *this; \endcode */
  template<typename Dest>
--- a/Eigen/src/Core/ForceAlignedAccess.h
+++ b/Eigen/src/Core/ForceAlignedAccess.h
@@ -41,10 +41,14 @@ template<typename ExpressionType> class ForceAlignedAccess
    EIGEN_DEVICE_FUNC explicit inline ForceAlignedAccess(const ExpressionType& matrix) : m_expression(matrix) {}
-    EIGEN_DEVICE_FUNC inline Index rows() const { return m_expression.rows(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
-    EIGEN_DEVICE_FUNC inline Index cols() const { return m_expression.cols(); }
+    inline Index rows() const EIGEN_NOEXCEPT { return m_expression.rows(); }
-    EIGEN_DEVICE_FUNC inline Index outerStride() const { return m_expression.outerStride(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
-    EIGEN_DEVICE_FUNC inline Index innerStride() const { return m_expression.innerStride(); }
+    inline Index cols() const EIGEN_NOEXCEPT { return m_expression.cols(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index outerStride() const EIGEN_NOEXCEPT { return m_expression.outerStride(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index innerStride() const EIGEN_NOEXCEPT { return m_expression.innerStride(); }
    EIGEN_DEVICE_FUNC inline const CoeffReturnType coeff(Index row, Index col) const
    {
--- a/Eigen/src/Core/GeneralProduct.h
+++ b/Eigen/src/Core/GeneralProduct.h
@@ -228,8 +228,7 @@ template<> struct gemv_dense_selector<OnTheRight,ColMajor,true>
    ActualLhsType actualLhs = LhsBlasTraits::extract(lhs);
    ActualRhsType actualRhs = RhsBlasTraits::extract(rhs);
-    ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(lhs)
+    ResScalar actualAlpha = combine_scalar_factors(alpha, lhs, rhs);
-                                  * RhsBlasTraits::extractScalarFactor(rhs);
    // make sure Dest is a compile-time vector type (bug 1166)
    typedef typename conditional<Dest::IsVectorAtCompileTime, Dest, typename Dest::ColXpr>::type ActualDest;
@@ -320,8 +319,7 @@ template<> struct gemv_dense_selector<OnTheRight,RowMajor,true>
    typename add_const<ActualLhsType>::type actualLhs = LhsBlasTraits::extract(lhs);
    typename add_const<ActualRhsType>::type actualRhs = RhsBlasTraits::extract(rhs);
-    ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(lhs)
+    ResScalar actualAlpha = combine_scalar_factors(alpha, lhs, rhs);
-                                  * RhsBlasTraits::extractScalarFactor(rhs);
    enum {
      // FIXME find a way to allow an inner stride on the result if packet_traits<Scalar>::size==1
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
--- a/Eigen/src/Core/GlobalFunctions.h
+++ b/Eigen/src/Core/GlobalFunctions.h
@@ -81,14 +81,16 @@ namespace Eigen
  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(expm1,scalar_expm1_op,exponential of a value minus 1,\sa ArrayBase::expm1)
  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log,scalar_log_op,natural logarithm,\sa Eigen::log10 DOXCOMMA ArrayBase::log)
  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log1p,scalar_log1p_op,natural logarithm of 1 plus the value,\sa ArrayBase::log1p)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log10,scalar_log10_op,base 10 logarithm,\sa Eigen::log DOXCOMMA ArrayBase::log)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log10,scalar_log10_op,base 10 logarithm,\sa Eigen::log DOXCOMMA ArrayBase::log10)
  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log2,scalar_log2_op,base 2 logarithm,\sa Eigen::log DOXCOMMA ArrayBase::log2)
  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(abs,scalar_abs_op,absolute value,\sa ArrayBase::abs DOXCOMMA MatrixBase::cwiseAbs)
  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(abs2,scalar_abs2_op,squared absolute value,\sa ArrayBase::abs2 DOXCOMMA MatrixBase::cwiseAbs2)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(arg,scalar_arg_op,complex argument,\sa ArrayBase::arg)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(arg,scalar_arg_op,complex argument,\sa ArrayBase::arg DOXCOMMA MatrixBase::cwiseArg)
  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sqrt,scalar_sqrt_op,square root,\sa ArrayBase::sqrt DOXCOMMA MatrixBase::cwiseSqrt)
  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(rsqrt,scalar_rsqrt_op,reciprocal square root,\sa ArrayBase::rsqrt)
  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(square,scalar_square_op,square (power 2),\sa Eigen::abs2 DOXCOMMA Eigen::pow DOXCOMMA ArrayBase::square)
  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(cube,scalar_cube_op,cube (power 3),\sa Eigen::pow DOXCOMMA ArrayBase::cube)
  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(rint,scalar_rint_op,nearest integer,\sa Eigen::floor DOXCOMMA Eigen::ceil DOXCOMMA ArrayBase::round)
  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(round,scalar_round_op,nearest integer,\sa Eigen::floor DOXCOMMA Eigen::ceil DOXCOMMA ArrayBase::round)
  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(floor,scalar_floor_op,nearest integer not greater than the giben value,\sa Eigen::ceil DOXCOMMA ArrayBase::floor)
  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(ceil,scalar_ceil_op,nearest integer not less than the giben value,\sa Eigen::floor DOXCOMMA ArrayBase::ceil)
--- a/Eigen/src/Core/IO.h
+++ b/Eigen/src/Core/IO.h
@@ -130,6 +130,9 @@ struct significant_decimals_impl
 template<typename Derived>
 std::ostream & print_matrix(std::ostream & s, const Derived& _m, const IOFormat& fmt)
 {
+  using internal::is_same;
+  using internal::conditional;
  if(_m.size() == 0)
  {
    s << fmt.matPrefix << fmt.matSuffix;
@@ -138,6 +141,22 @@ std::ostream & print_matrix(std::ostream & s, const Derived& _m, const IOFormat&
  typename Derived::Nested m = _m;
  typedef typename Derived::Scalar Scalar;
+  typedef typename
+      conditional<
+          is_same<Scalar, char>::value ||
+            is_same<Scalar, unsigned char>::value ||
+            is_same<Scalar, numext::int8_t>::value ||
+            is_same<Scalar, numext::uint8_t>::value,
+          int,
+          typename conditional<
+              is_same<Scalar, std::complex<char> >::value ||
+                is_same<Scalar, std::complex<unsigned char> >::value ||
+                is_same<Scalar, std::complex<numext::int8_t> >::value ||
+                is_same<Scalar, std::complex<numext::uint8_t> >::value,
+              std::complex<int>,
+              const Scalar&
+            >::type
+        >::type PrintType;
  Index width = 0;
@@ -174,7 +193,7 @@ std::ostream & print_matrix(std::ostream & s, const Derived& _m, const IOFormat&
      {
        std::stringstream sstr;
        sstr.copyfmt(s);
-        sstr << m.coeff(i,j);
+        sstr << static_cast<PrintType>(m.coeff(i,j));
        width = std::max<Index>(width, Index(sstr.str().length()));
      }
  }
@@ -190,7 +209,7 @@ std::ostream & print_matrix(std::ostream & s, const Derived& _m, const IOFormat&
      s.fill(fmt.fill);
      s.width(width);
    }
-    s << m.coeff(i, 0);
+    s << static_cast<PrintType>(m.coeff(i, 0));
    for(Index j = 1; j < m.cols(); ++j)
    {
      s << fmt.coeffSeparator;
@@ -198,7 +217,7 @@ std::ostream & print_matrix(std::ostream & s, const Derived& _m, const IOFormat&
        s.fill(fmt.fill);
        s.width(width);
      }
-      s << m.coeff(i, j);
+      s << static_cast<PrintType>(m.coeff(i, j));
    }
    s << fmt.rowSuffix;
    if( i < m.rows() - 1)
--- a/Eigen/src/Core/IndexedView.h
+++ b/Eigen/src/Core/IndexedView.h
@@ -54,7 +54,8 @@ struct traits<IndexedView<XprType, RowIndices, ColIndices> >
    DirectAccessMask = (int(InnerIncr)!=UndefinedIncr && int(OuterIncr)!=UndefinedIncr && InnerIncr>=0 && OuterIncr>=0) ? DirectAccessBit : 0,
    FlagsRowMajorBit = IsRowMajor ? RowMajorBit : 0,
    FlagsLvalueBit = is_lvalue<XprType>::value ? LvalueBit : 0,
-    Flags = (traits<XprType>::Flags & (HereditaryBits | DirectAccessMask)) | FlagsLvalueBit | FlagsRowMajorBit
+    FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1) ? LinearAccessBit : 0,
+    Flags = (traits<XprType>::Flags & (HereditaryBits | DirectAccessMask )) | FlagsLvalueBit | FlagsRowMajorBit | FlagsLinearAccessBit
  };
  typedef Block<XprType,RowsAtCompileTime,ColsAtCompileTime,IsInnerPannel> BlockType;
@@ -121,10 +122,10 @@ public:
  {}
  /** \returns number of rows */
-  Index rows() const { return internal::size(m_rowIndices); }
+  Index rows() const { return internal::index_list_size(m_rowIndices); }
  /** \returns number of columns */
-  Index cols() const { return internal::size(m_colIndices); }
+  Index cols() const { return internal::index_list_size(m_colIndices); }
  /** \returns the nested expression */
  const typename internal::remove_all<XprType>::type&
@@ -168,7 +169,11 @@ struct unary_evaluator<IndexedView<ArgType, RowIndices, ColIndices>, IndexBased>
  enum {
    CoeffReadCost = evaluator<ArgType>::CoeffReadCost /* TODO + cost of row/col index */,
-    Flags = (evaluator<ArgType>::Flags & (HereditaryBits /*| LinearAccessBit | DirectAccessBit*/)),
+    FlagsLinearAccessBit = (traits<XprType>::RowsAtCompileTime == 1 || traits<XprType>::ColsAtCompileTime == 1) ? LinearAccessBit : 0,
+    FlagsRowMajorBit = traits<XprType>::FlagsRowMajorBit, 
+    Flags = (evaluator<ArgType>::Flags & (HereditaryBits & ~RowMajorBit /*| LinearAccessBit | DirectAccessBit*/)) | FlagsLinearAccessBit | FlagsRowMajorBit,
    Alignment = 0
  };
@@ -184,15 +189,50 @@ struct unary_evaluator<IndexedView<ArgType, RowIndices, ColIndices>, IndexBased>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  CoeffReturnType coeff(Index row, Index col) const
  {
+    eigen_assert(m_xpr.rowIndices()[row] >= 0 && m_xpr.rowIndices()[row] < m_xpr.nestedExpression().rows()
+                 && m_xpr.colIndices()[col] >= 0 && m_xpr.colIndices()[col] < m_xpr.nestedExpression().cols());
    return m_argImpl.coeff(m_xpr.rowIndices()[row], m_xpr.colIndices()[col]);
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  Scalar& coeffRef(Index row, Index col)
  {
+    eigen_assert(m_xpr.rowIndices()[row] >= 0 && m_xpr.rowIndices()[row] < m_xpr.nestedExpression().rows()
+                 && m_xpr.colIndices()[col] >= 0 && m_xpr.colIndices()[col] < m_xpr.nestedExpression().cols());
    return m_argImpl.coeffRef(m_xpr.rowIndices()[row], m_xpr.colIndices()[col]);
  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Scalar& coeffRef(Index index)
+  {
+    EIGEN_STATIC_ASSERT_LVALUE(XprType)
+    Index row = XprType::RowsAtCompileTime == 1 ? 0 : index;
+    Index col = XprType::RowsAtCompileTime == 1 ? index : 0;
+    eigen_assert(m_xpr.rowIndices()[row] >= 0 && m_xpr.rowIndices()[row] < m_xpr.nestedExpression().rows()
+                 && m_xpr.colIndices()[col] >= 0 && m_xpr.colIndices()[col] < m_xpr.nestedExpression().cols());
+    return m_argImpl.coeffRef( m_xpr.rowIndices()[row], m_xpr.colIndices()[col]);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  const Scalar& coeffRef(Index index) const
+  {
+    Index row = XprType::RowsAtCompileTime == 1 ? 0 : index;
+    Index col = XprType::RowsAtCompileTime == 1 ? index : 0;
+    eigen_assert(m_xpr.rowIndices()[row] >= 0 && m_xpr.rowIndices()[row] < m_xpr.nestedExpression().rows()
+                 && m_xpr.colIndices()[col] >= 0 && m_xpr.colIndices()[col] < m_xpr.nestedExpression().cols());
+    return m_argImpl.coeffRef( m_xpr.rowIndices()[row], m_xpr.colIndices()[col]);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  const CoeffReturnType coeff(Index index) const
+  {
+    Index row = XprType::RowsAtCompileTime == 1 ? 0 : index;
+    Index col = XprType::RowsAtCompileTime == 1 ? index : 0;
+    eigen_assert(m_xpr.rowIndices()[row] >= 0 && m_xpr.rowIndices()[row] < m_xpr.nestedExpression().rows()
+                 && m_xpr.colIndices()[col] >= 0 && m_xpr.colIndices()[col] < m_xpr.nestedExpression().cols());
+    return m_argImpl.coeff( m_xpr.rowIndices()[row], m_xpr.colIndices()[col]);
+  }
 protected:
  evaluator<ArgType> m_argImpl;
--- a/Eigen/src/Core/Inverse.h
+++ b/Eigen/src/Core/Inverse.h
@@ -54,8 +54,8 @@ public:
    : m_xpr(xpr)
  {}
-  EIGEN_DEVICE_FUNC Index rows() const { return m_xpr.cols(); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR  Index rows() const EIGEN_NOEXCEPT { return m_xpr.cols(); }
-  EIGEN_DEVICE_FUNC Index cols() const { return m_xpr.rows(); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR  Index cols() const EIGEN_NOEXCEPT { return m_xpr.rows(); }
  EIGEN_DEVICE_FUNC const XprTypeNestedCleaned& nestedExpression() const { return m_xpr; }
--- a/Eigen/src/Core/Map.h
+++ b/Eigen/src/Core/Map.h
@@ -47,7 +47,7 @@ private:
  * \brief A matrix or vector expression mapping an existing array of data.
  *
  * \tparam PlainObjectType the equivalent matrix type of the mapped data
-  * \tparam MapOptions specifies the pointer alignment in bytes. It can be: \c #Aligned128, , \c #Aligned64, \c #Aligned32, \c #Aligned16, \c #Aligned8 or \c #Unaligned.
+  * \tparam MapOptions specifies the pointer alignment in bytes. It can be: \c #Aligned128, \c #Aligned64, \c #Aligned32, \c #Aligned16, \c #Aligned8 or \c #Unaligned.
  *                The default is \c #Unaligned.
  * \tparam StrideType optionally specifies strides. By default, Map assumes the memory layout
  *                   of an ordinary, contiguous array. This can be overridden by specifying strides.
@@ -104,13 +104,13 @@ template<typename PlainObjectType, int MapOptions, typename StrideType> class Ma
    EIGEN_DEVICE_FUNC
    inline PointerType cast_to_pointer_type(PointerArgType ptr) { return ptr; }
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
    inline Index innerStride() const
    {
      return StrideType::InnerStrideAtCompileTime != 0 ? m_stride.inner() : 1;
    }
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
    inline Index outerStride() const
    {
      return StrideType::OuterStrideAtCompileTime != 0 ? m_stride.outer()
--- a/Eigen/src/Core/MapBase.h
+++ b/Eigen/src/Core/MapBase.h
@@ -87,9 +87,11 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
    typedef typename Base::CoeffReturnType CoeffReturnType;
    /** \copydoc DenseBase::rows() */
-    EIGEN_DEVICE_FUNC inline Index rows() const { return m_rows.value(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index rows() const EIGEN_NOEXCEPT { return m_rows.value(); }
    /** \copydoc DenseBase::cols() */
-    EIGEN_DEVICE_FUNC inline Index cols() const { return m_cols.value(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index cols() const EIGEN_NOEXCEPT { return m_cols.value(); }
    /** Returns a pointer to the first coefficient of the matrix or vector.
      *
@@ -182,6 +184,8 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
    #endif
  protected:
    EIGEN_DEFAULT_COPY_CONSTRUCTOR(MapBase)
    EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(MapBase)
    template<typename T>
    EIGEN_DEVICE_FUNC
@@ -294,6 +298,9 @@ template<typename Derived> class MapBase<Derived, WriteAccessors>
    // In theory we could simply refer to Base:Base::operator=, but MSVC does not like Base::Base,
    // see bugs 821 and 920.
    using ReadOnlyMapBase::Base::operator=;
  protected:
    EIGEN_DEFAULT_COPY_CONSTRUCTOR(MapBase)
    EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(MapBase)
 };
 #undef EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS
--- a/Eigen/src/Core/MathFunctions.h
+++ b/Eigen/src/Core/MathFunctions.h
@@ -2,6 +2,7 @@
 // for linear algebra.
 //
 // Copyright (C) 2006-2010 Benoit Jacob <jacob.benoit.1@gmail.com>
 // Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -10,9 +11,11 @@
 #ifndef EIGEN_MATHFUNCTIONS_H
 #define EIGEN_MATHFUNCTIONS_H
 // source: http://www.geom.uiuc.edu/~huberty/math5337/groupe/digits.html
 // TODO this should better be moved to NumTraits
 // Source: WolframAlpha
 #define EIGEN_PI    3.141592653589793238462643383279502884197169399375105820974944592307816406L
 #define EIGEN_LOG2E 1.442695040888963407359924681001892137426645954152985934135449406931109219L
 #define EIGEN_LN2   0.693147180559945309417232121458176568075500134360255254120680009493393621L
 namespace Eigen {
@@ -212,12 +215,12 @@ struct imag_ref_default_impl
 template<typename Scalar>
 struct imag_ref_default_impl<Scalar, false>
 {
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
  static inline Scalar run(Scalar&)
  {
    return Scalar(0);
  }
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
  static inline const Scalar run(const Scalar&)
  {
    return Scalar(0);
@@ -258,19 +261,8 @@ struct conj_default_impl<Scalar,true>
  }
 };
-template<typename Scalar> struct conj_impl : conj_default_impl<Scalar> {};
+template<typename Scalar, bool IsComplex = NumTraits<Scalar>::IsComplex>
-
+struct conj_impl : conj_default_impl<Scalar, IsComplex> {};
 #if defined(EIGEN_GPU_COMPILE_PHASE)
 template<typename T>
 struct conj_impl<std::complex<T> >
 {
  EIGEN_DEVICE_FUNC
  static inline std::complex<T> run(const std::complex<T>& x)
  {
    return std::complex<T>(x.real(), -x.imag());
  }
 };
 #endif
 template<typename Scalar>
 struct conj_retval
@@ -321,6 +313,65 @@ struct abs2_retval
  typedef typename NumTraits<Scalar>::Real type;
 };
 /****************************************************************************
 * Implementation of sqrt/rsqrt                                             *
 ****************************************************************************/
 // Generic square-root implementation.
 // run(x) makes an unqualified call to sqrt after EIGEN_USING_STD(sqrt)
 // (presumably a using-declaration for std::sqrt -- confirm in Macros.h) and
 // returns the result as Scalar.
 template<typename Scalar>
 struct sqrt_impl
 {
  EIGEN_DEVICE_FUNC
  static EIGEN_ALWAYS_INLINE Scalar run(const Scalar& x)
  {
    EIGEN_USING_STD(sqrt);
    return sqrt(x);
  }
 };
 // Complex sqrt defined in MathFunctionsImpl.h.
 template<typename T> EIGEN_DEVICE_FUNC std::complex<T> complex_sqrt(const std::complex<T>& a_x);
 // Custom implementation is faster than `std::sqrt`, works on
 // GPU, and correctly handles special cases (unlike MSVC).
 // Specialization for std::complex<T>: forwards to complex_sqrt<T>.
 template<typename T>
 struct sqrt_impl<std::complex<T> >
 {
  EIGEN_DEVICE_FUNC
  static EIGEN_ALWAYS_INLINE std::complex<T> run(const std::complex<T>& x)
  {
    return complex_sqrt<T>(x);
  }
 };
 // Return-type trait for sqrt: the result type equals the argument type.
 template <typename Scalar>
 struct sqrt_retval {
  typedef Scalar type;
 };
 // Default implementation relies on numext::sqrt, at bottom of file.
 template<typename T>
 struct rsqrt_impl;
 // Complex rsqrt defined in MathFunctionsImpl.h.
 template<typename T> EIGEN_DEVICE_FUNC std::complex<T> complex_rsqrt(const std::complex<T>& a_x);
 // Specialization for std::complex<T>: forwards to complex_rsqrt<T>.
 template<typename T>
 struct rsqrt_impl<std::complex<T> >
 {
  EIGEN_DEVICE_FUNC
  static EIGEN_ALWAYS_INLINE std::complex<T> run(const std::complex<T>& x)
  {
    return complex_rsqrt<T>(x);
  }
 };
 // Return-type trait for rsqrt: the result type equals the argument type.
 template <typename Scalar>
 struct rsqrt_retval {
  typedef Scalar type;
 };
 /****************************************************************************
 * Implementation of norm1                                                *
 ****************************************************************************/
@@ -335,7 +386,7 @@ struct norm1_default_impl<Scalar,true>
  EIGEN_DEVICE_FUNC
  static inline RealScalar run(const Scalar& x)
  {
-    EIGEN_USING_STD_MATH(abs);
+    EIGEN_USING_STD(abs);
    return abs(x.real()) + abs(x.imag());
  }
 };
@@ -346,7 +397,7 @@ struct norm1_default_impl<Scalar, false>
  EIGEN_DEVICE_FUNC
  static inline Scalar run(const Scalar& x)
  {
-    EIGEN_USING_STD_MATH(abs);
+    EIGEN_USING_STD(abs);
    return abs(x);
  }
 };
@@ -376,7 +427,7 @@ struct hypot_retval
 * Implementation of cast                                                 *
 ****************************************************************************/
-template<typename OldType, typename NewType>
+template<typename OldType, typename NewType, typename EnableIf = void>
 struct cast_impl
 {
  EIGEN_DEVICE_FUNC
@@ -386,6 +437,22 @@ struct cast_impl
  }
 };
 // Casting from S -> Complex<T> leads to an implicit conversion from S to T,
 // generating warnings on clang.  Here we explicitly cast the real component.
 // Selected when a non-complex OldType is cast to a complex NewType.
 template <typename OldType, typename NewType>
 struct cast_impl<OldType, NewType,
                  typename internal::enable_if<!NumTraits<OldType>::IsComplex &&
                                               NumTraits<NewType>::IsComplex>::type> {
  EIGEN_DEVICE_FUNC
  static inline NewType run(const OldType& x) {
    // Convert to the complex type's real component type first, then build
    // the complex value from that, so no implicit narrowing is left over.
    typedef typename NumTraits<NewType>::Real TargetReal;
    return static_cast<NewType>(static_cast<TargetReal>(x));
  }
 };
 // here, for once, we're plainly returning NewType: we don't want cast to do weird things.
 template<typename OldType, typename NewType>
@@ -399,29 +466,59 @@ inline NewType cast(const OldType& x)
 * Implementation of round                                                   *
 ****************************************************************************/
 #if EIGEN_HAS_CXX11_MATH
 template<typename Scalar>
-  struct round_impl {
+struct round_impl
 {
  EIGEN_DEVICE_FUNC
  static inline Scalar run(const Scalar& x)
  {
    EIGEN_STATIC_ASSERT((!NumTraits<Scalar>::IsComplex), NUMERIC_TYPE_MUST_BE_REAL)
-      EIGEN_USING_STD_MATH(round);
+#if EIGEN_HAS_CXX11_MATH
-      return round(x);
+    EIGEN_USING_STD(round);
 #endif
    return Scalar(round(x));
  }
 };
 #if !EIGEN_HAS_CXX11_MATH
 #if EIGEN_HAS_C99_MATH
 // Use ::roundf for float.
 template<>
 struct round_impl<float> {
  EIGEN_DEVICE_FUNC
  static inline float run(const float& x)
  {
    // Call the global-namespace C roundf so the float overload is used directly.
    return ::roundf(x);
  }
 };
 #else
 template<typename Scalar>
-  struct round_impl
+struct round_using_floor_ceil_impl
 {
  EIGEN_DEVICE_FUNC
  static inline Scalar run(const Scalar& x)
  {
    EIGEN_STATIC_ASSERT((!NumTraits<Scalar>::IsComplex), NUMERIC_TYPE_MUST_BE_REAL)
-      EIGEN_USING_STD_MATH(floor);
+    // Without C99 round/roundf, resort to floor/ceil.
-      EIGEN_USING_STD_MATH(ceil);
+    EIGEN_USING_STD(floor);
-      return (x > Scalar(0)) ? floor(x + Scalar(0.5)) : ceil(x - Scalar(0.5));
+    EIGEN_USING_STD(ceil);
    // If not enough precision to resolve a decimal at all, return the input.
    // Otherwise, adding 0.5 can trigger an increment by 1.
    const Scalar limit = Scalar(1ull << (NumTraits<Scalar>::digits() - 1));
    if (x >= limit || x <= -limit) {
      return x;
    }
    return (x > Scalar(0)) ? Scalar(floor(x + Scalar(0.5))) : Scalar(ceil(x - Scalar(0.5)));
  }
 };
-#endif
+
 // In this preprocessor branch, float and double rounding inherit run() from
 // round_using_floor_ceil_impl.
 template<>
 struct round_impl<float> : round_using_floor_ceil_impl<float> {};
 template<>
 struct round_impl<double> : round_using_floor_ceil_impl<double> {};
 #endif // EIGEN_HAS_C99_MATH
 #endif // !EIGEN_HAS_CXX11_MATH
 template<typename Scalar>
 struct round_retval
@@ -429,22 +526,82 @@ struct round_retval
  typedef Scalar type;
 };
 /****************************************************************************
 * Implementation of rint                                                    *
 ****************************************************************************/
 // Generic rint implementation for real (non-complex) scalars.
 // The call to rint is unqualified; when EIGEN_HAS_CXX11_MATH is set,
 // EIGEN_USING_STD(rint) is issued first (presumably bringing std::rint into
 // scope -- confirm in Macros.h).
 template<typename Scalar>
 struct rint_impl {
  EIGEN_DEVICE_FUNC
  static inline Scalar run(const Scalar& x)
  {
    // Complex scalars are rejected at compile time.
    EIGEN_STATIC_ASSERT((!NumTraits<Scalar>::IsComplex), NUMERIC_TYPE_MUST_BE_REAL)
 #if EIGEN_HAS_CXX11_MATH
      EIGEN_USING_STD(rint);
 #endif
    return rint(x);
  }
 };
 #if !EIGEN_HAS_CXX11_MATH
 // double specialization used when EIGEN_HAS_CXX11_MATH is off: calls the
 // global-namespace C rint.
 template<>
 struct rint_impl<double> {
  EIGEN_DEVICE_FUNC
  static inline double run(const double& x)
  {
    return ::rint(x);
  }
 };
 // float specialization used when EIGEN_HAS_CXX11_MATH is off: calls the
 // global-namespace C rintf.
 template<>
 struct rint_impl<float> {
  EIGEN_DEVICE_FUNC
  static inline float run(const float& x)
  {
    return ::rintf(x);
  }
 };
 #endif
 // Return-type trait for rint: the result type equals the argument type.
 template <typename Scalar>
 struct rint_retval {
  typedef Scalar type;
 };
 /****************************************************************************
 * Implementation of arg                                                     *
 ****************************************************************************/
-#if EIGEN_HAS_CXX11_MATH
+// Visual Studio 2017 has a bug where arg(float) returns 0 for negative inputs.
 // This seems to be fixed in VS 2019.
 #if EIGEN_HAS_CXX11_MATH && (!EIGEN_COMP_MSVC || EIGEN_COMP_MSVC >= 1920)
 // std::arg is only defined for types of std::complex, or integer types or float/double/long double
 template<typename Scalar,
          bool HasStdImpl = NumTraits<Scalar>::IsComplex || is_integral<Scalar>::value
                            || is_same<Scalar, float>::value || is_same<Scalar, double>::value
                            || is_same<Scalar, long double>::value >
 struct arg_default_impl;
 template<typename Scalar>
-  struct arg_impl {
+struct arg_default_impl<Scalar, true> {
-    static inline Scalar run(const Scalar& x)
+  typedef typename NumTraits<Scalar>::Real RealScalar;
  EIGEN_DEVICE_FUNC
  static inline RealScalar run(const Scalar& x)
  {
-      #if defined(EIGEN_HIP_DEVICE_COMPILE)
+    // There is no official ::arg on device in CUDA/HIP, so we always need to use std::arg.
      // HIP does not seem to have a native device side implementation for the math routine "arg"
    using std::arg;
-      #else 		  
+    return static_cast<RealScalar>(arg(x));
-      EIGEN_USING_STD_MATH(arg);
+  }
-      #endif
+};
-      return arg(x);
+
 // Must be non-complex floating-point type (e.g. half/bfloat16).
 // Fallback for scalars without a std::arg overload: the phase angle of a real
 // number is pi when it is negative and 0 otherwise.
 template <typename Scalar>
 struct arg_default_impl<Scalar, false> {
  typedef typename NumTraits<Scalar>::Real RealScalar;
  EIGEN_DEVICE_FUNC
  static inline RealScalar run(const Scalar& x) {
    if (x < Scalar(0)) {
      return RealScalar(EIGEN_PI);
    }
    return RealScalar(0);
  }
 };
 #else
@@ -455,7 +612,8 @@ struct round_retval
  EIGEN_DEVICE_FUNC
  static inline RealScalar run(const Scalar& x)
  {
-      return (x < Scalar(0)) ? Scalar(EIGEN_PI) : Scalar(0); }
+    return (x < RealScalar(0)) ? RealScalar(EIGEN_PI) : RealScalar(0);
  }
 };
 template<typename Scalar>
@@ -465,13 +623,12 @@ struct round_retval
  EIGEN_DEVICE_FUNC
  static inline RealScalar run(const Scalar& x)
  {
-      EIGEN_USING_STD_MATH(arg);
+    EIGEN_USING_STD(arg);
    return arg(x);
  }
 };
  template<typename Scalar> struct arg_impl : arg_default_impl<Scalar> {};
 #endif
 template<typename Scalar> struct arg_impl : arg_default_impl<Scalar> {};
 template<typename Scalar>
 struct arg_retval
@@ -493,7 +650,7 @@ namespace std_fallback {
    EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
    typedef typename NumTraits<Scalar>::Real RealScalar;
-    EIGEN_USING_STD_MATH(exp);
+    EIGEN_USING_STD(exp);
    Scalar u = exp(x);
    if (numext::equal_strict(u, Scalar(1))) {
      return x;
@@ -503,7 +660,7 @@ namespace std_fallback {
      return RealScalar(-1);
    }
-    EIGEN_USING_STD_MATH(log);
+    EIGEN_USING_STD(log);
    Scalar logu = log(u);
    return numext::equal_strict(u, logu) ? u : (u - RealScalar(1)) * x / logu;
  }
@@ -523,22 +680,36 @@ struct expm1_impl {
  }
 };
 // Specialization for complex types that are not supported by std::expm1.
 // std::complex specialization: after asserting that RealScalar is not an
 // integer type, delegates to std_fallback::expm1.
 template <typename RealScalar>
 struct expm1_impl<std::complex<RealScalar> > {
  EIGEN_DEVICE_FUNC static inline std::complex<RealScalar> run(
      const std::complex<RealScalar>& x) {
    EIGEN_STATIC_ASSERT_NON_INTEGER(RealScalar)
    return std_fallback::expm1(x);
  }
 };
 // Return-type trait for expm1: the result type equals the argument type.
 template <typename Scalar>
 struct expm1_retval {
  typedef Scalar type;
 };
 /****************************************************************************
 * Implementation of log                                                     *
 ****************************************************************************/
 // Complex log defined in MathFunctionsImpl.h.
 template<typename T> EIGEN_DEVICE_FUNC std::complex<T> complex_log(const std::complex<T>& z);
 // Generic natural-logarithm implementation.
 // Makes an unqualified call to log after EIGEN_USING_STD(log) and casts the
 // result back to Scalar explicitly.
 template<typename Scalar>
 struct log_impl {
  EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x)
  {
    EIGEN_USING_STD(log);
    return static_cast<Scalar>(log(x));
  }
 };
 // Specialization for std::complex<Scalar>: forwards to complex_log.
 template<typename Scalar>
 struct log_impl<std::complex<Scalar> > {
  EIGEN_DEVICE_FUNC static inline std::complex<Scalar> run(const std::complex<Scalar>& z)
  {
    return complex_log(z);
  }
 };
 /****************************************************************************
 * Implementation of log1p                                                   *
 ****************************************************************************/
@@ -550,9 +721,9 @@ namespace std_fallback {
  EIGEN_DEVICE_FUNC inline Scalar log1p(const Scalar& x) {
    EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
    typedef typename NumTraits<Scalar>::Real RealScalar;
-    EIGEN_USING_STD_MATH(log);
+    EIGEN_USING_STD(log);
    Scalar x1p = RealScalar(1) + x;
-    Scalar log_1p = log(x1p);
+    Scalar log_1p = log_impl<Scalar>::run(x1p);
    const bool is_small = numext::equal_strict(x1p, Scalar(1));
    const bool is_inf = numext::equal_strict(x1p, log_1p);
    return (is_small || is_inf) ? x : x * (log_1p / (x1p - RealScalar(1)));
@@ -600,7 +771,7 @@ struct pow_impl
  typedef typename ScalarBinaryOpTraits<ScalarX,ScalarY,internal::scalar_pow_op<ScalarX,ScalarY> >::ReturnType result_type;
  static EIGEN_DEVICE_FUNC inline result_type run(const ScalarX& x, const ScalarY& y)
  {
-    EIGEN_USING_STD_MATH(pow);
+    EIGEN_USING_STD(pow);
    return pow(x, y);
  }
 };
@@ -706,13 +877,159 @@ struct meta_floor_log2<n, lower, upper, meta_floor_log2_bogus>
  // no value, error at compile time
 };
 // Portable leading/trailing zero-bit counters for unsigned integral types.
 // Both functions narrow down the position of the relevant set bit by
 // repeatedly halving a shift distance, and both return the full bit width of
 // BitsType when the input is zero.
 template <typename BitsType, typename EnableIf = void>
 struct count_bits_impl {
  // Returns the number of zero bits above the most significant set bit.
  static EIGEN_DEVICE_FUNC inline int clz(BitsType bits) {
    EIGEN_STATIC_ASSERT(
        is_integral<BitsType>::value && !NumTraits<BitsType>::IsSigned,
        THIS_TYPE_IS_NOT_SUPPORTED);
    // n starts at the bit width and is reduced as set bits are located.
    int n = CHAR_BIT * sizeof(BitsType);
    int shift = n / 2;
    while (bits > 0 && shift > 0) {
      // If anything remains after dropping the low `shift` bits, the top set
      // bit lies in the upper part: discount `shift` zeros and continue there.
      BitsType y = bits >> shift;
      if (y > 0) {
        n -= shift;
        bits = y;
      }
      shift /= 2;
    }
    // shift only reaches 0 when the loop ran on a non-zero input; the single
    // remaining set bit is accounted for here.
    if (shift == 0) {
      --n;
    }
    return n;
  }
  // Returns the number of zero bits below the least significant set bit.
  static EIGEN_DEVICE_FUNC inline int ctz(BitsType bits) {
    EIGEN_STATIC_ASSERT(
        is_integral<BitsType>::value && !NumTraits<BitsType>::IsSigned,
        THIS_TYPE_IS_NOT_SUPPORTED);
    int n = CHAR_BIT * sizeof(BitsType);
    int shift = n / 2;
    while (bits > 0 && shift > 0) {
      // Mirror image of clz: shift left and keep the result whenever set bits
      // survive the truncation back to BitsType.
      BitsType y = bits << shift;
      if (y > 0) {
        n -= shift;
        bits = y;
      }
      shift /= 2;
    }
    if (shift == 0) {
      --n;
    }
    return n;
  }
 };
 // Count leading zeros.
 template <typename BitsType>
 EIGEN_DEVICE_FUNC inline int clz(BitsType bits) {
  // Dispatch to whichever count_bits_impl specialization matches BitsType.
  typedef count_bits_impl<BitsType> Impl;
  return Impl::clz(bits);
 }
 // Count trailing zeros.
 template <typename BitsType>
 EIGEN_DEVICE_FUNC inline int ctz(BitsType bits) {
  // Dispatch to whichever count_bits_impl specialization matches BitsType.
  typedef count_bits_impl<BitsType> Impl;
  return Impl::ctz(bits);
 }
 #if EIGEN_COMP_GNUC || EIGEN_COMP_CLANG
 // Builtin-based specialization for types no wider than unsigned int.
 template <typename BitsType>
 struct count_bits_impl<BitsType, typename enable_if<sizeof(BitsType) <= sizeof(unsigned int)>::type> {
  // Bit width of BitsType; also the answer returned for a zero input.
  static const int kNumBits = static_cast<int>(sizeof(BitsType) * CHAR_BIT);
  static EIGEN_DEVICE_FUNC inline int clz(BitsType bits) {
    EIGEN_STATIC_ASSERT(is_integral<BitsType>::value, THIS_TYPE_IS_NOT_SUPPORTED);
    // Widening to unsigned int adds this many leading bits, which are
    // subtracted from the builtin's count.
    static const int kLeadingBitsOffset = (sizeof(unsigned int) - sizeof(BitsType)) * CHAR_BIT;
    // Zero is answered directly (GCC documents __builtin_clz(0) as undefined).
    return bits == 0 ? kNumBits : __builtin_clz(static_cast<unsigned int>(bits)) - kLeadingBitsOffset;
  }
  static EIGEN_DEVICE_FUNC inline int ctz(BitsType bits) {
    EIGEN_STATIC_ASSERT(is_integral<BitsType>::value, THIS_TYPE_IS_NOT_SUPPORTED);
    // Zero is answered directly rather than being passed to the builtin.
    return bits == 0 ? kNumBits : __builtin_ctz(static_cast<unsigned int>(bits));
  }
 };
 // Builtin-based specialization for types wider than unsigned int but no
 // wider than unsigned long.
 template <typename BitsType>
 struct count_bits_impl<
    BitsType, typename enable_if<sizeof(unsigned int) < sizeof(BitsType) && sizeof(BitsType) <= sizeof(unsigned long)>::type> {
  // Bit width of BitsType; also the answer returned for a zero input.
  static const int kNumBits = static_cast<int>(sizeof(BitsType) * CHAR_BIT);
  static EIGEN_DEVICE_FUNC inline int clz(BitsType bits) {
    EIGEN_STATIC_ASSERT(is_integral<BitsType>::value, THIS_TYPE_IS_NOT_SUPPORTED);
    // Leading bits added by widening to unsigned long, removed from the count.
    static const int kLeadingBitsOffset = (sizeof(unsigned long) - sizeof(BitsType)) * CHAR_BIT;
    return bits == 0 ? kNumBits : __builtin_clzl(static_cast<unsigned long>(bits)) - kLeadingBitsOffset;
  }
  static EIGEN_DEVICE_FUNC inline int ctz(BitsType bits) {
    EIGEN_STATIC_ASSERT(is_integral<BitsType>::value, THIS_TYPE_IS_NOT_SUPPORTED);
    return bits == 0 ? kNumBits : __builtin_ctzl(static_cast<unsigned long>(bits));
  }
 };
 // Builtin-based specialization for types wider than unsigned long but no
 // wider than unsigned long long.
 template <typename BitsType>
 struct count_bits_impl<BitsType, typename enable_if<sizeof(unsigned long) < sizeof(BitsType) &&
                                                   sizeof(BitsType) <= sizeof(unsigned long long)>::type> {
  // Bit width of BitsType; also the answer returned for a zero input.
  static const int kNumBits = static_cast<int>(sizeof(BitsType) * CHAR_BIT);
  static EIGEN_DEVICE_FUNC inline int clz(BitsType bits) {
    EIGEN_STATIC_ASSERT(is_integral<BitsType>::value, THIS_TYPE_IS_NOT_SUPPORTED);
    // Leading bits added by widening to unsigned long long, removed from the count.
    static const int kLeadingBitsOffset = (sizeof(unsigned long long) - sizeof(BitsType)) * CHAR_BIT;
    return bits == 0 ? kNumBits : __builtin_clzll(static_cast<unsigned long long>(bits)) - kLeadingBitsOffset;
  }
  static EIGEN_DEVICE_FUNC inline int ctz(BitsType bits) {
    EIGEN_STATIC_ASSERT(is_integral<BitsType>::value, THIS_TYPE_IS_NOT_SUPPORTED);
    return bits == 0 ? kNumBits : __builtin_ctzll(static_cast<unsigned long long>(bits));
  }
 };
 #elif EIGEN_COMP_MSVC
 // MSVC specialization for types no wider than unsigned long, built on the
 // _BitScanReverse/_BitScanForward intrinsics.
 template <typename BitsType>
 struct count_bits_impl<BitsType, typename enable_if<sizeof(BitsType) <= sizeof(unsigned long)>::type> {
  // Bit width of BitsType; also the answer returned for a zero input.
  static const int kNumBits = static_cast<int>(sizeof(BitsType) * CHAR_BIT);
  static EIGEN_DEVICE_FUNC inline int clz(BitsType bits) {
    EIGEN_STATIC_ASSERT(is_integral<BitsType>::value, THIS_TYPE_IS_NOT_SUPPORTED);
    // `out` receives the index of the highest set bit; it is only read when
    // bits is non-zero.
    unsigned long out;
    _BitScanReverse(&out, static_cast<unsigned long>(bits));
    return bits == 0 ? kNumBits : (kNumBits - 1) - static_cast<int>(out);
  }
  static EIGEN_DEVICE_FUNC inline int ctz(BitsType bits) {
    EIGEN_STATIC_ASSERT(is_integral<BitsType>::value, THIS_TYPE_IS_NOT_SUPPORTED);
    // `out` receives the index of the lowest set bit, which equals the number
    // of trailing zeros; it is only read when bits is non-zero.
    unsigned long out;
    _BitScanForward(&out, static_cast<unsigned long>(bits));
    return bits == 0 ? kNumBits : static_cast<int>(out);
  }
 };
 #ifdef _WIN64
 // MSVC specialization for types wider than unsigned long but no wider than
 // __int64, built on the 64-bit scan intrinsics.
 template <typename BitsType>
 struct count_bits_impl<
    BitsType, typename enable_if<sizeof(unsigned long) < sizeof(BitsType) && sizeof(BitsType) <= sizeof(__int64)>::type> {
  // Bit width of BitsType; also the answer returned for a zero input.
  static const int kNumBits = static_cast<int>(sizeof(BitsType) * CHAR_BIT);
  static EIGEN_DEVICE_FUNC inline int clz(BitsType bits) {
    EIGEN_STATIC_ASSERT(is_integral<BitsType>::value, THIS_TYPE_IS_NOT_SUPPORTED);
    // `out` receives the index of the highest set bit; it is only read when
    // bits is non-zero.
    unsigned long out;
    _BitScanReverse64(&out, static_cast<unsigned __int64>(bits));
    return bits == 0 ? kNumBits : (kNumBits - 1) - static_cast<int>(out);
  }
  static EIGEN_DEVICE_FUNC inline int ctz(BitsType bits) {
    EIGEN_STATIC_ASSERT(is_integral<BitsType>::value, THIS_TYPE_IS_NOT_SUPPORTED);
    // `out` receives the index of the lowest set bit; it is only read when
    // bits is non-zero.
    unsigned long out;
    _BitScanForward64(&out, static_cast<unsigned __int64>(bits));
    return bits == 0 ? kNumBits : static_cast<int>(out);
  }
 };
 #endif  // _WIN64
 #endif  // EIGEN_COMP_GNUC || EIGEN_COMP_CLANG
 template <typename Scalar>
-struct random_default_impl<Scalar, false, true>
+struct random_default_impl<Scalar, false, true> {
-{
+  static inline Scalar run(const Scalar& x, const Scalar& y) {
-  static inline Scalar run(const Scalar& x, const Scalar& y)
+    if (y <= x) return x;
  {
    if (y <= x)
      return x;
    // ScalarU is the unsigned counterpart of Scalar, possibly Scalar itself.
    typedef typename make_unsigned<Scalar>::type ScalarU;
    // ScalarX is the widest of ScalarU and unsigned int.
@@ -857,11 +1174,15 @@ template<typename T> EIGEN_DEVICE_FUNC bool isinf_msvc_helper(T x)
 }
 //MSVC defines a _isnan builtin function, but for double only
 // The overloads below pass float and long double arguments to _isnan as well.
 // The long double overloads are left out during a GPU compile phase.
 #ifndef EIGEN_GPU_COMPILE_PHASE
 EIGEN_DEVICE_FUNC inline bool isnan_impl(const long double& x) { return _isnan(x)!=0; }
 #endif
 EIGEN_DEVICE_FUNC inline bool isnan_impl(const double& x)      { return _isnan(x)!=0; }
 EIGEN_DEVICE_FUNC inline bool isnan_impl(const float& x)       { return _isnan(x)!=0; }
 // The isinf overloads all defer to isinf_msvc_helper.
 #ifndef EIGEN_GPU_COMPILE_PHASE
 EIGEN_DEVICE_FUNC inline bool isinf_impl(const long double& x) { return isinf_msvc_helper(x); }
 #endif
 EIGEN_DEVICE_FUNC inline bool isinf_impl(const double& x)      { return isinf_msvc_helper(x); }
 EIGEN_DEVICE_FUNC inline bool isinf_impl(const float& x)       { return isinf_msvc_helper(x); }
@@ -875,12 +1196,16 @@ EIGEN_DEVICE_FUNC inline bool isinf_impl(const float& x)       { return isinf_ms
  #define EIGEN_TMP_NOOPT_ATTRIB EIGEN_DEVICE_FUNC inline __attribute__((noinline,optimize("no-finite-math-only")))
 #endif
 // Explicit specializations of isnan_impl/isinf_impl that call the compiler
 // builtins and carry the EIGEN_TMP_NOOPT_ATTRIB attributes.
 // The long double variants are left out during a GPU compile phase.
 #ifndef EIGEN_GPU_COMPILE_PHASE
 template<> EIGEN_TMP_NOOPT_ATTRIB bool isnan_impl(const long double& x) { return __builtin_isnan(x); }
 #endif
 template<> EIGEN_TMP_NOOPT_ATTRIB bool isnan_impl(const double& x)      { return __builtin_isnan(x); }
 template<> EIGEN_TMP_NOOPT_ATTRIB bool isnan_impl(const float& x)       { return __builtin_isnan(x); }
 template<> EIGEN_TMP_NOOPT_ATTRIB bool isinf_impl(const double& x)      { return __builtin_isinf(x); }
 template<> EIGEN_TMP_NOOPT_ATTRIB bool isinf_impl(const float& x)       { return __builtin_isinf(x); }
 #ifndef EIGEN_GPU_COMPILE_PHASE
 template<> EIGEN_TMP_NOOPT_ATTRIB bool isinf_impl(const long double& x) { return __builtin_isinf(x); }
 #endif
 #undef EIGEN_TMP_NOOPT_ATTRIB
@@ -907,7 +1232,7 @@ template<typename T>
 EIGEN_DEVICE_FUNC
 EIGEN_ALWAYS_INLINE T mini(const T& x, const T& y)
 {
-  EIGEN_USING_STD_MATH(min);
+  EIGEN_USING_STD(min)
  return min EIGEN_NOT_A_MACRO (x,y);
 }
@@ -915,7 +1240,7 @@ template<typename T>
 EIGEN_DEVICE_FUNC
 EIGEN_ALWAYS_INLINE T maxi(const T& x, const T& y)
 {
-  EIGEN_USING_STD_MATH(max);
+  EIGEN_USING_STD(max)
  return max EIGEN_NOT_A_MACRO (x,y);
 }
 #else
@@ -937,6 +1262,8 @@ EIGEN_ALWAYS_INLINE double mini(const double& x, const double& y)
 {
  return fmin(x, y);
 }
 #ifndef EIGEN_GPU_COMPILE_PHASE
 template<>
 EIGEN_DEVICE_FUNC
 EIGEN_ALWAYS_INLINE long double mini(const long double& x, const long double& y)
@@ -948,6 +1275,7 @@ EIGEN_ALWAYS_INLINE long double mini(const long double& x, const long double& y)
  return fminl(x, y);
 #endif
 }
 #endif
 template<typename T>
 EIGEN_DEVICE_FUNC
@@ -967,6 +1295,7 @@ EIGEN_ALWAYS_INLINE double maxi(const double& x, const double& y)
 {
  return fmax(x, y);
 }
 #ifndef EIGEN_GPU_COMPILE_PHASE
 template<>
 EIGEN_DEVICE_FUNC
 EIGEN_ALWAYS_INLINE long double maxi(const long double& x, const long double& y)
@@ -979,6 +1308,7 @@ EIGEN_ALWAYS_INLINE long double maxi(const long double& x, const long double& y)
 #endif
 }
 #endif
 #endif
 #if defined(SYCL_DEVICE_ONLY)
@@ -1116,6 +1446,34 @@ inline EIGEN_MATHFUNC_RETVAL(abs2, Scalar) abs2(const Scalar& x)
 EIGEN_DEVICE_FUNC
 inline bool abs2(bool x) { return x; }
 // Absolute difference |x - y|, always subtracting the smaller operand from
 // the larger one.
 template <typename T>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T absdiff(const T& x, const T& y) {
  if (x > y) {
    return x - y;
  }
  return y - x;
 }
 // float specialization: absolute value of the difference via fabsf.
 template<>
 EIGEN_DEVICE_FUNC
 EIGEN_ALWAYS_INLINE float absdiff(const float& x, const float& y)
 {
  return fabsf(x - y);
 }
 // double specialization: absolute value of the difference via fabs.
 template<>
 EIGEN_DEVICE_FUNC
 EIGEN_ALWAYS_INLINE double absdiff(const double& x, const double& y)
 {
  return fabs(x - y);
 }
 // HIP and CUDA do not support long double.
 #ifndef EIGEN_GPU_COMPILE_PHASE
 // long double specialization: absolute value of the difference via fabsl.
 template<>
 EIGEN_DEVICE_FUNC
 EIGEN_ALWAYS_INLINE long double absdiff(const long double& x, const long double& y) {
  return fabsl(x - y);
 }
 #endif
 template<typename Scalar>
 EIGEN_DEVICE_FUNC
 inline EIGEN_MATHFUNC_RETVAL(norm1, Scalar) norm1(const Scalar& x)
@@ -1174,6 +1532,13 @@ SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE(isinf, isinf, bool)
 SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE(isfinite, isfinite, bool)
 #endif
 // Entry point for rint: forwards \a x to the run() of the implementation
 // selected by EIGEN_MATHFUNC_IMPL(rint, Scalar); the return type is given by
 // EIGEN_MATHFUNC_RETVAL(rint, Scalar).
 template<typename Scalar>
 EIGEN_DEVICE_FUNC
 inline EIGEN_MATHFUNC_RETVAL(rint, Scalar) rint(const Scalar& x)
 {
  return EIGEN_MATHFUNC_IMPL(rint, Scalar)::run(x);
 }
 template<typename Scalar>
 EIGEN_DEVICE_FUNC
 inline EIGEN_MATHFUNC_RETVAL(round, Scalar) round(const Scalar& x)
@@ -1189,7 +1554,7 @@ template<typename T>
 EIGEN_DEVICE_FUNC
 T (floor)(const T& x)
 {
-  EIGEN_USING_STD_MATH(floor);
+  EIGEN_USING_STD(floor)
  return floor(x);
 }
@@ -1209,7 +1574,7 @@ template<typename T>
 EIGEN_DEVICE_FUNC
 T (ceil)(const T& x)
 {
-  EIGEN_USING_STD_MATH(ceil);
+  EIGEN_USING_STD(ceil);
  return ceil(x);
 }
@@ -1250,23 +1615,34 @@ inline int log2(int x)
  *
  * Its usage is justified in performance-critical functions, like norm/normalize.
  */
-template<typename T>
+template<typename Scalar>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+EIGEN_DEVICE_FUNC
-T sqrt(const T &x)
+EIGEN_ALWAYS_INLINE EIGEN_MATHFUNC_RETVAL(sqrt, Scalar) sqrt(const Scalar& x)
 {
-  EIGEN_USING_STD_MATH(sqrt);
+  return EIGEN_MATHFUNC_IMPL(sqrt, Scalar)::run(x);
  return sqrt(x);
 }
 // Boolean specialization, avoids implicit float to bool conversion (-Wimplicit-conversion-floating-point-to-bool).
 // For bool the square root is the identity, so the argument is returned as is.
 template<>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_DEVICE_FUNC
 bool sqrt<bool>(const bool &x) { return x; }
 #if defined(SYCL_DEVICE_ONLY)
 SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(sqrt, sqrt)
 #endif
 /** \returns the reciprocal square root of \a x. **/
 template <typename T>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T rsqrt(const T& x) {
  // Dispatch to the implementation chosen by internal::rsqrt_impl for T.
  typedef internal::rsqrt_impl<T> Impl;
  return Impl::run(x);
 }
 template<typename T>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 T log(const T &x) {
-  EIGEN_USING_STD_MATH(log);
+  return internal::log_impl<T>::run(x);
  return log(x);
 }
 #if defined(SYCL_DEVICE_ONLY)
@@ -1286,7 +1662,7 @@ template<typename T>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 typename internal::enable_if<NumTraits<T>::IsSigned || NumTraits<T>::IsComplex,typename NumTraits<T>::Real>::type
 abs(const T &x) {
-  EIGEN_USING_STD_MATH(abs);
+  EIGEN_USING_STD(abs);
  return abs(x);
 }
@@ -1323,7 +1699,7 @@ double abs(const std::complex<double>& x) {
 template<typename T>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 T exp(const T &x) {
-  EIGEN_USING_STD_MATH(exp);
+  EIGEN_USING_STD(exp);
  return exp(x);
 }
@@ -1377,7 +1753,7 @@ double expm1(const double &x) { return ::expm1(x); }
 template<typename T>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 T cos(const T &x) {
-  EIGEN_USING_STD_MATH(cos);
+  EIGEN_USING_STD(cos);
  return cos(x);
 }
@@ -1396,7 +1772,7 @@ double cos(const double &x) { return ::cos(x); }
 template<typename T>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 T sin(const T &x) {
-  EIGEN_USING_STD_MATH(sin);
+  EIGEN_USING_STD(sin);
  return sin(x);
 }
@@ -1415,7 +1791,7 @@ double sin(const double &x) { return ::sin(x); }
 template<typename T>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 T tan(const T &x) {
-  EIGEN_USING_STD_MATH(tan);
+  EIGEN_USING_STD(tan);
  return tan(x);
 }
@@ -1434,7 +1810,7 @@ double tan(const double &x) { return ::tan(x); }
 template<typename T>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 T acos(const T &x) {
-  EIGEN_USING_STD_MATH(acos);
+  EIGEN_USING_STD(acos);
  return acos(x);
 }
@@ -1442,8 +1818,8 @@ T acos(const T &x) {
 template<typename T>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 T acosh(const T &x) {
-  EIGEN_USING_STD_MATH(acosh);
+  EIGEN_USING_STD(acosh);
-  return acosh(x);
+  return static_cast<T>(acosh(x));
 }
 #endif
@@ -1463,7 +1839,7 @@ double acos(const double &x) { return ::acos(x); }
 template<typename T>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 T asin(const T &x) {
-  EIGEN_USING_STD_MATH(asin);
+  EIGEN_USING_STD(asin);
  return asin(x);
 }
@@ -1471,8 +1847,8 @@ T asin(const T &x) {
 template<typename T>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 T asinh(const T &x) {
-  EIGEN_USING_STD_MATH(asinh);
+  EIGEN_USING_STD(asinh);
-  return asinh(x);
+  return static_cast<T>(asinh(x));
 }
 #endif
@@ -1492,16 +1868,16 @@ double asin(const double &x) { return ::asin(x); }
 template<typename T>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 T atan(const T &x) {
-  EIGEN_USING_STD_MATH(atan);
+  EIGEN_USING_STD(atan);
-  return atan(x);
+  return static_cast<T>(atan(x));
 }
 #if EIGEN_HAS_CXX11_MATH
 template<typename T>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 T atanh(const T &x) {
-  EIGEN_USING_STD_MATH(atanh);
+  EIGEN_USING_STD(atanh);
-  return atanh(x);
+  return static_cast<T>(atanh(x));
 }
 #endif
@@ -1522,8 +1898,8 @@ double atan(const double &x) { return ::atan(x); }
 template<typename T>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 T cosh(const T &x) {
-  EIGEN_USING_STD_MATH(cosh);
+  EIGEN_USING_STD(cosh);
-  return cosh(x);
+  return static_cast<T>(cosh(x));
 }
 #if defined(SYCL_DEVICE_ONLY)
@@ -1541,8 +1917,8 @@ double cosh(const double &x) { return ::cosh(x); }
 template<typename T>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 T sinh(const T &x) {
-  EIGEN_USING_STD_MATH(sinh);
+  EIGEN_USING_STD(sinh);
-  return sinh(x);
+  return static_cast<T>(sinh(x));
 }
 #if defined(SYCL_DEVICE_ONLY)
@@ -1560,7 +1936,7 @@ double sinh(const double &x) { return ::sinh(x); }
 template<typename T>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 T tanh(const T &x) {
-  EIGEN_USING_STD_MATH(tanh);
+  EIGEN_USING_STD(tanh);
  return tanh(x);
 }
@@ -1584,7 +1960,7 @@ double tanh(const double &x) { return ::tanh(x); }
 template <typename T>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 T fmod(const T& a, const T& b) {
-  EIGEN_USING_STD_MATH(fmod);
+  EIGEN_USING_STD(fmod);
  return fmod(a, b);
 }
@@ -1746,6 +2122,11 @@ template<> struct random_impl<bool>
  {
    return random<int>(0,1)==0 ? false : true;
  }
  // Draws an int through random<int>(a, b) and maps any non-zero draw to true.
  static inline bool run(const bool& a, const bool& b) {
    return random<int>(a, b) != 0;
  }
 };
 template<> struct scalar_fuzzy_impl<bool>
@@ -1772,6 +2153,57 @@ template<> struct scalar_fuzzy_impl<bool>
 };
 } // end namespace internal
 // Default implementations that rely on other numext implementations
 namespace internal {
 // Specialization for complex types that are not supported by std::expm1.
 // Closed-form expm1 for std::complex, built from the real-valued
 // numext::expm1 and numext::sin.
 template <typename RealScalar>
 struct expm1_impl<std::complex<RealScalar> > {
  EIGEN_DEVICE_FUNC static inline std::complex<RealScalar> run(
      const std::complex<RealScalar>& x) {
    EIGEN_STATIC_ASSERT_NON_INTEGER(RealScalar)
    RealScalar xr = x.real();
    RealScalar xi = x.imag();
    // With z = x + i * y:
    // expm1(z) = exp(z) - 1
    //          = exp(x +  i * y) - 1
    //          = exp(x) * (cos(y) + i * sin(y)) - 1
    //          = exp(x) * cos(y) - 1 + i * exp(x) * sin(y)
    // Imag(expm1(z)) = exp(x) * sin(y)
    // Real(expm1(z)) = exp(x) * cos(y) - 1
    //          = expm1(x) + exp(x) * (cos(y) - 1)
    //          = expm1(x) - exp(x) * (2 * sin(y / 2) ** 2)
    RealScalar erm1 = numext::expm1<RealScalar>(xr);
    // exp(x) is recovered from expm1(x) rather than computed separately.
    RealScalar er = erm1 + RealScalar(1.);
    RealScalar sin2 = numext::sin(xi / RealScalar(2.));
    sin2 = sin2 * sin2;
    RealScalar s = numext::sin(xi);
    RealScalar real_part = erm1 - RealScalar(2.) * er * sin2;
    return std::complex<RealScalar>(real_part, er * s);
  }
 };
 // Default reciprocal square root: one divided by numext::sqrt(x).
 template<typename T>
 struct rsqrt_impl {
  EIGEN_DEVICE_FUNC
  static EIGEN_ALWAYS_INLINE T run(const T& x) {
    return T(1)/numext::sqrt(x);
  }
 };
 #if defined(EIGEN_GPU_COMPILE_PHASE)
 // std::complex conjugate used during GPU compile phases: rebuilds the value
 // from numext::real(x) and the negated numext::imag(x).
 template<typename T>
 struct conj_impl<std::complex<T>, true>
 {
  EIGEN_DEVICE_FUNC
  static inline std::complex<T> run(const std::complex<T>& x)
  {
    return std::complex<T>(numext::real(x), -numext::imag(x));
  }
 };
 #endif
 } // end namespace internal
--- a/Eigen/src/Core/MathFunctionsImpl.h
+++ b/Eigen/src/Core/MathFunctionsImpl.h
@@ -17,19 +17,28 @@ namespace internal {
 /** \internal \returns the hyperbolic tan of \a a (coeff-wise)
    Doesn't do anything fancy, just a 13/6-degree rational interpolant which
-    is accurate up to a couple of ulp in the range [-9, 9], outside of which
+    is accurate up to a couple of ulps in the (approximate) range [-8, 8],
-    the tanh(x) = +/-1.
+    outside of which tanh(x) = +/-1 in single precision. The input is clamped
    to the range [-c, c]. The value c is chosen as the smallest value where
    the approximation evaluates to exactly 1. In the range [-0.0004, 0.0004]
    the approximation tanh(x) ~= x is used for better accuracy as x tends to zero.
    This implementation works on both scalars and packets.
 */
 template<typename T>
 T generic_fast_tanh_float(const T& a_x)
 {
-  // Clamp the inputs to the range [-9, 9] since anything outside
+  // Clamp the inputs to the range [-c, c]
-  // this range is +/-1.0f in single-precision.
+#ifdef EIGEN_VECTORIZE_FMA
-  const T plus_9 = pset1<T>(9.f);
+  const T plus_clamp = pset1<T>(7.99881172180175781f);
-  const T minus_9 = pset1<T>(-9.f);
+  const T minus_clamp = pset1<T>(-7.99881172180175781f);
-  const T x = pmax(pmin(a_x, plus_9), minus_9);
+#else
  const T plus_clamp = pset1<T>(7.90531110763549805f);
  const T minus_clamp = pset1<T>(-7.90531110763549805f);
 #endif
  const T tiny = pset1<T>(0.0004f);
  const T x = pmax(pmin(a_x, plus_clamp), minus_clamp);
  const T tiny_mask = pcmp_lt(pabs(a_x), tiny);
  // The monomial coefficients of the numerator polynomial (odd).
  const T alpha_1 = pset1<T>(4.89352455891786e-03f);
  const T alpha_3 = pset1<T>(6.37261928875436e-04f);
@@ -57,20 +66,26 @@ T generic_fast_tanh_float(const T& a_x)
  p = pmadd(x2, p, alpha_1);
  p = pmul(x, p);
-  // Evaluate the denominator polynomial p.
+  // Evaluate the denominator polynomial q.
  T q = pmadd(x2, beta_6, beta_4);
  q = pmadd(x2, q, beta_2);
  q = pmadd(x2, q, beta_0);
  // Divide the numerator by the denominator.
-  return pdiv(p, q);
+  return pselect(tiny_mask, x, pdiv(p, q));
 }
 template<typename RealScalar>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 RealScalar positive_real_hypot(const RealScalar& x, const RealScalar& y)
 {
-  EIGEN_USING_STD_MATH(sqrt);
+  // IEEE 754 / IEC 60559 special cases.
  if ((numext::isinf)(x) || (numext::isinf)(y))
    return NumTraits<RealScalar>::infinity();
  if ((numext::isnan)(x) || (numext::isnan)(y))
    return NumTraits<RealScalar>::quiet_NaN();
  EIGEN_USING_STD(sqrt);
  RealScalar p, qp;
  p = numext::maxi(x,y);
  if(p==RealScalar(0)) return RealScalar(0);
@@ -85,11 +100,99 @@ struct hypot_impl
  static EIGEN_DEVICE_FUNC
  inline RealScalar run(const Scalar& x, const Scalar& y)
  {
-    EIGEN_USING_STD_MATH(abs);
+    EIGEN_USING_STD(abs);
    return positive_real_hypot<RealScalar>(abs(x), abs(y));
  }
 };
 // Generic complex sqrt implementation that correctly handles corner cases
 // according to https://en.cppreference.com/w/cpp/numeric/complex/sqrt
 //
 // \param z the complex input x + i*y.
 // \returns the principal square root u + i*v of \a z.
 template<typename T>
 EIGEN_DEVICE_FUNC std::complex<T> complex_sqrt(const std::complex<T>& z) {
  // Computes the principal sqrt of the input.
  //
  // For a complex square root of the number x + i*y. We want to find real
  // numbers u and v such that
  //    (u + i*v)^2 = x + i*y  <=>
  //    u^2 - v^2 + i*2*u*v = x + i*y.
  // By equating the real and imaginary parts we get:
  //    u^2 - v^2 = x
  //    2*u*v = y.
  //
  // For x >= 0, this has the numerically stable solution
  //    u = sqrt(0.5 * (x + sqrt(x^2 + y^2)))
  //    v = y / (2 * u)
  // and for x < 0,
  //    v = sign(y) * sqrt(0.5 * (-x + sqrt(x^2 + y^2)))
  //    u = y / (2 * v)
  //
  // Letting w = sqrt(0.5 * (|x| + |z|)),
  //   if x == 0: u = w, v = sign(y) * w
  //   if x > 0:  u = w, v = y / (2 * w)
  //   if x < 0:  u = |y| / (2 * w), v = sign(y) * w
  const T x = numext::real(z);
  const T y = numext::imag(z);
  const T zero = T(0);
  // |z| is obtained through numext::hypot(x, y).
  const T w = numext::sqrt(T(0.5) * (numext::abs(x) + numext::hypot(x, y)));
  // The infinite-imaginary-part case is tested first, whatever the value of x,
  // and yields (+inf, y). The remaining branches follow the table above, with
  // sign(y) implemented as "y < 0 ? -w : w".
  return
    (numext::isinf)(y) ? std::complex<T>(NumTraits<T>::infinity(), y)
      : x == zero ? std::complex<T>(w, y < zero ? -w : w)
      : x > zero ? std::complex<T>(w, y / (2 * w))
      : std::complex<T>(numext::abs(y) / (2 * w), y < zero ? -w : w );
 }
 // Generic complex rsqrt implementation.
 //
 // \param z the complex input x + i*y.
 // \returns the principal reciprocal square root u + i*v of \a z.
 template<typename T>
 EIGEN_DEVICE_FUNC std::complex<T> complex_rsqrt(const std::complex<T>& z) {
  // Computes the principal reciprocal sqrt of the input.
  //
  // For a complex reciprocal square root of the number z = x + i*y. We want to
  // find real numbers u and v such that
  //    (u + i*v)^2 = 1 / (x + i*y)  <=>
  //    u^2 - v^2 + i*2*u*v = x/|z|^2 - i*y/|z|^2.
  // By equating the real and imaginary parts we get:
  //    u^2 - v^2 = x/|z|^2
  //    2*u*v = -y/|z|^2.
  //
  // For x >= 0, this has the numerically stable solution
  //    u = sqrt(0.5 * (x + |z|)) / |z|
  //    v = -y / (2 * u * |z|^2)
  // and for x < 0,
  //    v = -sign(y) * sqrt(0.5 * (-x + |z|)) / |z|
  //    u = -y / (2 * v * |z|^2)
  //
  // Letting w = sqrt(0.5 * (|x| + |z|)),
  //   if x == 0: u = w / |z|, v = -sign(y) * w / |z|
  //   if x > 0:  u = w / |z|, v = -y / (2 * w * |z|)
  //   if x < 0:  u = |y| / (2 * w * |z|), v = -sign(y) * w / |z|
  const T x = numext::real(z);
  const T y = numext::imag(z);
  const T zero = T(0);
  const T abs_z = numext::hypot(x, y);
  const T w = numext::sqrt(T(0.5) * (numext::abs(x) + abs_z));
  // w / |z|, shared by several branches below.
  const T woz = w / abs_z;
  // Corner cases consistent with 1/sqrt(z) on gcc/clang.
  // Order of the tests: |z| == 0 yields (+inf, NaN); otherwise an infinite
  // real or imaginary part yields (0, 0); the remaining branches follow the
  // table above, with -sign(y) implemented as "y < 0 ? woz : -woz".
  return
    abs_z == zero ? std::complex<T>(NumTraits<T>::infinity(), NumTraits<T>::quiet_NaN())
      : ((numext::isinf)(x) || (numext::isinf)(y)) ? std::complex<T>(zero, zero)
      : x == zero ? std::complex<T>(woz, y < zero ? woz : -woz)
      : x > zero ? std::complex<T>(woz, -y / (2 * w * abs_z))
      : std::complex<T>(numext::abs(y) / (2 * w * abs_z), y < zero ? woz : -woz );
 }
 // Generic complex logarithm.
 //
 // \param z the complex input.
 // \returns a complex number whose real part is numext::log(|z|) and whose
 //          imaginary part is atan2(imag(z), real(z)).
 template<typename T>
 EIGEN_DEVICE_FUNC std::complex<T> complex_log(const std::complex<T>& z) {
  // Computes complex log.
  // Modulus of z.
  T a = numext::abs(z);
  EIGEN_USING_STD(atan2);
  // Argument (phase angle) of z.
  T b = atan2(z.imag(), z.real());
  return std::complex<T>(numext::log(a), b);
 }
 } // end namespace internal
 } // end namespace Eigen
--- a/Eigen/src/Core/Matrix.h
+++ b/Eigen/src/Core/Matrix.h
@@ -225,8 +225,6 @@ class Matrix
      return Base::_set(other);
    }
    /* Here, doxygen failed to copy the brief information when using \copydoc */
    /**
      * \brief Copies the generic expression \a other into *this.
      * \copydetails DenseBase::operator=(const EigenBase<OtherDerived> &other)
@@ -278,13 +276,21 @@ class Matrix
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    Matrix& operator=(Matrix&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_assignable<Scalar>::value)
    {
-      other.swap(*this);
+      Base::operator=(std::move(other));
      return *this;
    }
 #endif
 #if EIGEN_HAS_CXX11
-    /** \copydoc PlainObjectBase(const Scalar&, const Scalar&, const Scalar&,  const Scalar&, const ArgTypes&... args)
+    /** \brief Construct a row or column vector with fixed size from an arbitrary number of coefficients. \cpp11
     *
     * \only_for_vectors
     *
     * This constructor is for 1D array or vectors with more than 4 coefficients.
     * There exists C++98 analogue constructors for fixed-size array/vector having 1, 2, 3, or 4 coefficients.
     *
     * \warning To construct a column (resp. row) vector of fixed length, the number of values passed to this
     * constructor must match the fixed number of rows (resp. columns) of \c *this.
     *
     * Example: \include Matrix_variadic_ctor_cxx11.cpp
     * Output: \verbinclude Matrix_variadic_ctor_cxx11.out
@@ -297,6 +303,8 @@ class Matrix
      : Base(a0, a1, a2, a3, args...) {}
    /** \brief Constructs a Matrix and initializes it from the coefficients given as initializer-lists grouped by row. \cpp11
      *
      * \anchor matrix_constructor_initializer_list
      *
      * In the general case, the constructor takes a list of rows, each row being represented as a list of coefficients:
      *
@@ -423,8 +431,10 @@ class Matrix
      : Base(other.derived())
    { }
-    EIGEN_DEVICE_FUNC inline Index innerStride() const { return 1; }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
-    EIGEN_DEVICE_FUNC inline Index outerStride() const { return this->innerSize(); }
+    inline Index innerStride() const EIGEN_NOEXCEPT { return 1; }
    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
    inline Index outerStride() const EIGEN_NOEXCEPT { return this->innerSize(); }
    /////////// Geometry module ///////////
@@ -478,16 +488,21 @@ class Matrix
 // Declares three typedefs for scalar type `Type` and dimension `Size`:
 //   Matrix<SizeSuffix><TypeSuffix>    -> Matrix<Type, Size, Size>
 //   Vector<SizeSuffix><TypeSuffix>    -> Matrix<Type, Size, 1>
 //   RowVector<SizeSuffix><TypeSuffix> -> Matrix<Type, 1, Size>
 // Each typedef carries doxygen markers placing it in the `matrixtypedefs` group.
 #define EIGEN_MAKE_TYPEDEFS(Type, TypeSuffix, Size, SizeSuffix)   \
 /** \ingroup matrixtypedefs */                                    \
 /** \brief \noop            */                                    \
 typedef Matrix<Type, Size, Size> Matrix##SizeSuffix##TypeSuffix;  \
 /** \ingroup matrixtypedefs */                                    \
 /** \brief \noop            */                                    \
 typedef Matrix<Type, Size, 1>    Vector##SizeSuffix##TypeSuffix;  \
 /** \ingroup matrixtypedefs */                                    \
 /** \brief \noop            */                                    \
 typedef Matrix<Type, 1, Size>    RowVector##SizeSuffix##TypeSuffix;
 // Declares the two partially-dynamic typedefs for scalar type `Type` and
 // fixed dimension `Size`:
 //   Matrix<Size>X<TypeSuffix> -> Matrix<Type, Size, Dynamic>
 //   MatrixX<Size><TypeSuffix> -> Matrix<Type, Dynamic, Size>
 #define EIGEN_MAKE_FIXED_TYPEDEFS(Type, TypeSuffix, Size)         \
 /** \ingroup matrixtypedefs */                                    \
 /** \brief \noop            */                                    \
 typedef Matrix<Type, Size, Dynamic> Matrix##Size##X##TypeSuffix;  \
 /** \ingroup matrixtypedefs */                                    \
 /** \brief \noop            */                                    \
 typedef Matrix<Type, Dynamic, Size> Matrix##X##Size##TypeSuffix;
 #define EIGEN_MAKE_TYPEDEFS_ALL_SIZES(Type, TypeSuffix) \
--- a/Eigen/src/Core/MatrixBase.h
+++ b/Eigen/src/Core/MatrixBase.h
@@ -206,28 +206,22 @@ template<typename Derived> class MatrixBase
    EIGEN_DEVICE_FUNC
    DiagonalReturnType diagonal();
-    typedef typename internal::add_const<Diagonal<const Derived> >::type ConstDiagonalReturnType;
+    typedef Diagonal<const Derived> ConstDiagonalReturnType;
    EIGEN_DEVICE_FUNC
-    ConstDiagonalReturnType diagonal() const;
+    const ConstDiagonalReturnType diagonal() const;
    template<int Index> struct DiagonalIndexReturnType { typedef Diagonal<Derived,Index> Type; };
    template<int Index> struct ConstDiagonalIndexReturnType { typedef const Diagonal<const Derived,Index> Type; };
    template<int Index>
    EIGEN_DEVICE_FUNC
-    typename DiagonalIndexReturnType<Index>::Type diagonal();
+    Diagonal<Derived, Index> diagonal();
    template<int Index>
    EIGEN_DEVICE_FUNC
-    typename ConstDiagonalIndexReturnType<Index>::Type diagonal() const;
+    const Diagonal<const Derived, Index> diagonal() const;
    typedef Diagonal<Derived,DynamicIndex> DiagonalDynamicIndexReturnType;
    typedef typename internal::add_const<Diagonal<const Derived,DynamicIndex> >::type ConstDiagonalDynamicIndexReturnType;
    EIGEN_DEVICE_FUNC
-    DiagonalDynamicIndexReturnType diagonal(Index index);
+    Diagonal<Derived, DynamicIndex> diagonal(Index index);
    EIGEN_DEVICE_FUNC
-    ConstDiagonalDynamicIndexReturnType diagonal(Index index) const;
+    const Diagonal<const Derived, DynamicIndex> diagonal(Index index) const;
    template<unsigned int Mode> struct TriangularViewReturnType { typedef TriangularView<Derived, Mode> Type; };
    template<unsigned int Mode> struct ConstTriangularViewReturnType { typedef const TriangularView<const Derived, Mode> Type; };
@@ -481,7 +475,8 @@ template<typename Derived> class MatrixBase
    EIGEN_MATRIX_FUNCTION_1(MatrixComplexPowerReturnValue, pow, power to \c p, const std::complex<RealScalar>& p)
  protected:
-    EIGEN_DEVICE_FUNC MatrixBase() : Base() {}
+    EIGEN_DEFAULT_COPY_CONSTRUCTOR(MatrixBase)
    EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(MatrixBase)
  private:
    EIGEN_DEVICE_FUNC explicit MatrixBase(int);
--- a/Eigen/src/Core/NestByValue.h
+++ b/Eigen/src/Core/NestByValue.h
@@ -45,8 +45,8 @@ template<typename ExpressionType> class NestByValue
    EIGEN_DEVICE_FUNC explicit inline NestByValue(const ExpressionType& matrix) : m_expression(matrix) {}
-    EIGEN_DEVICE_FUNC inline Index rows() const { return m_expression.rows(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return m_expression.rows(); }
-    EIGEN_DEVICE_FUNC inline Index cols() const { return m_expression.cols(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return m_expression.cols(); }
    EIGEN_DEVICE_FUNC operator const ExpressionType&() const { return m_expression; }
--- a/Eigen/src/Core/NumTraits.h
+++ b/Eigen/src/Core/NumTraits.h
@@ -21,14 +21,14 @@ template< typename T,
          bool is_integer = NumTraits<T>::IsInteger>
 struct default_digits10_impl
 {
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
  static int run() { return std::numeric_limits<T>::digits10; }
 };
 template<typename T>
 struct default_digits10_impl<T,false,false> // Floating point
 {
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
  static int run() {
    using std::log10;
    using std::ceil;
@@ -40,7 +40,7 @@ struct default_digits10_impl<T,false,false> // Floating point
 template<typename T>
 struct default_digits10_impl<T,false,true> // Integer
 {
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
  static int run() { return 0; }
 };
@@ -52,14 +52,14 @@ template< typename T,
          bool is_integer = NumTraits<T>::IsInteger>
 struct default_digits_impl
 {
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
  static int run() { return std::numeric_limits<T>::digits; }
 };
 template<typename T>
 struct default_digits_impl<T,false,false> // Floating point
 {
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
  static int run() {
    using std::log;
    using std::ceil;
@@ -71,12 +71,34 @@ struct default_digits_impl<T,false,false> // Floating point
 template<typename T>
 struct default_digits_impl<T,false,true> // Integer
 {
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
  static int run() { return 0; }
 };
 } // end namespace internal
 namespace numext {
 /** \internal bit-wise cast without changing the underlying bit representation.
   *
   * \tparam Tgt destination type; must have the same size as \c Src.
   * \tparam Src source type.
   * \param src the object whose bytes are copied.
   * \returns a \c Tgt whose object representation is a byte-for-byte copy of \a src.
   *
   * Size equality is enforced at compile time. When EIGEN_HAS_TYPE_TRAITS is set,
   * both types must additionally be trivially copyable and \c Tgt default constructible.
   */
 // TODO: Replace by std::bit_cast (available in C++20)
 template <typename Tgt, typename Src>
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Tgt bit_cast(const Src& src) {
 #if EIGEN_HAS_TYPE_TRAITS
  // The behaviour of memcpy is not specified for non-trivially copyable types
  EIGEN_STATIC_ASSERT(std::is_trivially_copyable<Src>::value, THIS_TYPE_IS_NOT_SUPPORTED);
  EIGEN_STATIC_ASSERT(std::is_trivially_copyable<Tgt>::value && std::is_default_constructible<Tgt>::value,
                      THIS_TYPE_IS_NOT_SUPPORTED);
 #endif
  EIGEN_STATIC_ASSERT(sizeof(Src) == sizeof(Tgt), THIS_TYPE_IS_NOT_SUPPORTED);
  // Default-construct the destination, then overwrite its bytes with those of src.
  Tgt tgt;
  EIGEN_USING_STD(memcpy)
  memcpy(&tgt, &src, sizeof(Tgt));
  return tgt;
 }
 }  // namespace numext
 // clang-format off
 /** \class NumTraits
  * \ingroup Core_Module
  *
@@ -88,36 +110,47 @@ struct default_digits_impl<T,false,true> // Integer
  *
  * The provided data consists of:
  * \li A typedef \c Real, giving the "real part" type of \a T. If \a T is already real,
-  *     then \c Real is just a typedef to \a T. If \a T is \c std::complex<U> then \c Real
+  *     then \c Real is just a typedef to \a T. If \a T is `std::complex<U>` then \c Real
  *     is a typedef to \a U.
  * \li A typedef \c NonInteger, giving the type that should be used for operations producing non-integral values,
  *     such as quotients, square roots, etc. If \a T is a floating-point type, then this typedef just gives
-  *     \a T again. Note however that many Eigen functions such as internal::sqrt simply refuse to
+  *     \a T again. Note however that many Eigen functions such as `internal::sqrt` simply refuse to
  *     take integers. Outside of a few cases, Eigen doesn't do automatic type promotion. Thus, this typedef is
  *     only intended as a helper for code that needs to explicitly promote types.
-  * \li A typedef \c Literal giving the type to use for numeric literals such as "2" or "0.5". For instance, for \c std::complex<U>, Literal is defined as \c U.
+  * \li A typedef \c Literal giving the type to use for numeric literals such as "2" or "0.5". For instance, for `std::complex<U>`,
  *     Literal is defined as \c U.
  *     Of course, this type must be fully compatible with \a T. In doubt, just use \a T here.
-  * \li A typedef \a Nested giving the type to use to nest a value inside of the expression tree. If you don't know what
+  * \li A typedef \c Nested giving the type to use to nest a value inside of the expression tree. If you don't know what
  *     this means, just use \a T here.
-  * \li An enum value \a IsComplex. It is equal to 1 if \a T is a \c std::complex
+  * \li An enum value \c IsComplex. It is equal to 1 if \a T is a \c std::complex
  *     type, and to 0 otherwise.
-  * \li An enum value \a IsInteger. It is equal to \c 1 if \a T is an integer type such as \c int,
+  * \li An enum value \c IsInteger. It is equal to \c 1 if \a T is an integer type such as \c int,
  *     and to \c 0 otherwise.
-  * \li Enum values ReadCost, AddCost and MulCost representing a rough estimate of the number of CPU cycles needed
+  * \li Enum values \c ReadCost, \c AddCost and \c MulCost representing a rough estimate of the number of CPU cycles needed
  *     by move / add / mul instructions respectively, assuming the data is already stored in CPU registers.
  *     Stay vague here. No need to do architecture-specific stuff. If you don't know what this means, just use \c Eigen::HugeCost.
-  * \li An enum value \a IsSigned. It is equal to \c 1 if \a T is a signed type and to 0 if \a T is unsigned.
+  * \li An enum value \c IsSigned. It is equal to \c 1 if \a T is a signed type and to 0 if \a T is unsigned.
-  * \li An enum value \a RequireInitialization. It is equal to \c 1 if the constructor of the numeric type \a T must
+  * \li An enum value \c RequireInitialization. It is equal to \c 1 if the constructor of the numeric type \a T must
  *     be called, and to 0 if it is safe not to call it. Default is 0 if \a T is an arithmetic type, and 1 otherwise.
-  * \li An epsilon() function which, unlike <a href="http://en.cppreference.com/w/cpp/types/numeric_limits/epsilon">std::numeric_limits::epsilon()</a>,
+  * \li An `epsilon()` function which, unlike <a href="http://en.cppreference.com/w/cpp/types/numeric_limits/epsilon">`std::numeric_limits::epsilon()`</a>,
-  *     it returns a \a Real instead of a \a T.
+  *     it returns a \c Real instead of a \a T.
-  * \li A dummy_precision() function returning a weak epsilon value. It is mainly used as a default
+  * \li A `dummy_precision()` function returning a weak epsilon value. It is mainly used as a default
  *     value by the fuzzy comparison operators.
-  * \li highest() and lowest() functions returning the highest and lowest possible values respectively.
+  * \li `highest()` and `lowest()` functions returning the highest and lowest possible values respectively.
-  * \li digits10() function returning the number of decimal digits that can be represented without change. This is
+  * \li `digits()` function returning the number of radix digits (non-sign digits for integers, mantissa for floating-point). This is
  *     the analogue of <a href="http://en.cppreference.com/w/cpp/types/numeric_limits/digits">std::numeric_limits<T>::digits</a>
  *     which is used as the default implementation if specialized.
  * \li `digits10()` function returning the number of decimal digits that can be represented without change. This is
  *     the analogue of <a href="http://en.cppreference.com/w/cpp/types/numeric_limits/digits10">std::numeric_limits<T>::digits10</a>
  *     which is used as the default implementation if specialized.
  * \li `min_exponent()` and `max_exponent()` functions returning the lowest and highest possible values, respectively,
  *     such that the radix raised to the power exponent-1 is a normalized floating-point number.  These are equivalent to
  *     <a href="http://en.cppreference.com/w/cpp/types/numeric_limits/min_exponent">`std::numeric_limits<T>::min_exponent`</a>/
  *     <a href="http://en.cppreference.com/w/cpp/types/numeric_limits/max_exponent">`std::numeric_limits<T>::max_exponent`</a>.
  * \li `infinity()` function returning a representation of positive infinity, if available.
  * \li `quiet_NaN()` function returning a non-signaling "not-a-number", if available.
  */
  // clang-format on
 template<typename T> struct GenericNumTraits
 {
@@ -140,49 +173,60 @@ template<typename T> struct GenericNumTraits
  typedef T Nested;
  typedef T Literal;
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
  static inline Real epsilon()
  {
    return numext::numeric_limits<T>::epsilon();
  }
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
  static inline int digits10()
  {
    return internal::default_digits10_impl<T>::run();
  }
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
  static inline int digits()
  {
    return internal::default_digits_impl<T>::run();
  }
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
  static inline int min_exponent()
  {
    return numext::numeric_limits<T>::min_exponent;
  }
  // \returns numext::numeric_limits<T>::max_exponent, forwarded unchanged.
  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
  static inline int max_exponent()
  {
    return numext::numeric_limits<T>::max_exponent;
  }
  // Generic default for the precision returned by dummy_precision().
  // \returns Real(0); specializations for floating-point types are expected
  // to provide their own value (see the note in the body).
  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
  static inline Real dummy_precision()
  {
    // make sure to override this for floating-point types
    return Real(0);
  }
-
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
  EIGEN_DEVICE_FUNC
  static inline T highest() {
    return (numext::numeric_limits<T>::max)();
  }
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
  static inline T lowest()  {
    return IsInteger ? (numext::numeric_limits<T>::min)()
                     : static_cast<T>(-(numext::numeric_limits<T>::max)());
  }
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
  static inline T infinity() {
    return numext::numeric_limits<T>::infinity();
  }
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
  static inline T quiet_NaN() {
    return numext::numeric_limits<T>::quiet_NaN();
  }
@@ -194,21 +238,35 @@ template<typename T> struct NumTraits : GenericNumTraits<T>
 template<> struct NumTraits<float>
  : GenericNumTraits<float>
 {
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
  static inline float dummy_precision() { return 1e-5f; }
 };
 template<> struct NumTraits<double> : GenericNumTraits<double>
 {
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
  static inline double dummy_precision() { return 1e-12; }
 };
 // GPU devices treat `long double` as `double`.
 #ifndef EIGEN_GPU_COMPILE_PHASE
 template<> struct NumTraits<long double>
  : GenericNumTraits<long double>
 {
-  static inline long double dummy_precision() { return 1e-15l; }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
  static inline long double dummy_precision() { return static_cast<long double>(1e-15l); }
 #if defined(EIGEN_ARCH_PPC) && (__LDBL_MANT_DIG__ == 106)
  // PowerPC double double causes issues with some values
  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
  static inline long double epsilon()
  {
    // 2^(-(__LDBL_MANT_DIG__)+1)
    return static_cast<long double>(2.4651903288156618919116517665087e-32l);
  }
 #endif
 };
 #endif
 template<typename _Real> struct NumTraits<std::complex<_Real> >
  : GenericNumTraits<std::complex<_Real> >
@@ -223,11 +281,11 @@ template<typename _Real> struct NumTraits<std::complex<_Real> >
    MulCost = 4 * NumTraits<Real>::MulCost + 2 * NumTraits<Real>::AddCost
  };
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
  static inline Real epsilon() { return NumTraits<Real>::epsilon(); }
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
  static inline Real dummy_precision() { return NumTraits<Real>::dummy_precision(); }
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
  static inline int digits10() { return NumTraits<Real>::digits10(); }
 };
@@ -247,16 +305,17 @@ struct NumTraits<Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols> >
    IsInteger = NumTraits<Scalar>::IsInteger,
    IsSigned  = NumTraits<Scalar>::IsSigned,
    RequireInitialization = 1,
-    ReadCost = ArrayType::SizeAtCompileTime==Dynamic ? HugeCost : ArrayType::SizeAtCompileTime * NumTraits<Scalar>::ReadCost,
+    ReadCost = ArrayType::SizeAtCompileTime==Dynamic ? HugeCost : ArrayType::SizeAtCompileTime * int(NumTraits<Scalar>::ReadCost),
-    AddCost  = ArrayType::SizeAtCompileTime==Dynamic ? HugeCost : ArrayType::SizeAtCompileTime * NumTraits<Scalar>::AddCost,
+    AddCost  = ArrayType::SizeAtCompileTime==Dynamic ? HugeCost : ArrayType::SizeAtCompileTime * int(NumTraits<Scalar>::AddCost),
-    MulCost  = ArrayType::SizeAtCompileTime==Dynamic ? HugeCost : ArrayType::SizeAtCompileTime * NumTraits<Scalar>::MulCost
+    MulCost  = ArrayType::SizeAtCompileTime==Dynamic ? HugeCost : ArrayType::SizeAtCompileTime * int(NumTraits<Scalar>::MulCost)
  };
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
  static inline RealScalar epsilon() { return NumTraits<RealScalar>::epsilon(); }
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
  static inline RealScalar dummy_precision() { return NumTraits<RealScalar>::dummy_precision(); }
  EIGEN_CONSTEXPR
  static inline int digits10() { return NumTraits<Scalar>::digits10(); }
 };
@@ -270,6 +329,7 @@ template<> struct NumTraits<std::string>
    MulCost  = HugeCost
  };
  EIGEN_CONSTEXPR
  static inline int digits10() { return 0; }
 private:
@@ -284,6 +344,8 @@ private:
 // Empty specialization for void to allow template specialization based on NumTraits<T>::Real with T==void and SFINAE.
 template<> struct NumTraits<void> {};
 // bool takes every trait from GenericNumTraits<bool> without overriding anything.
 template<> struct NumTraits<bool> : GenericNumTraits<bool> {};
 } // end namespace Eigen
 #endif // EIGEN_NUMTRAITS_H
--- a/Eigen/src/Core/PartialReduxEvaluator.h
+++ b/Eigen/src/Core/PartialReduxEvaluator.h
@@ -54,12 +54,17 @@ struct packetwise_redux_traits
 /* Value to be returned when size==0 , by default let's return 0 */
 template<typename PacketType,typename Func>
 EIGEN_DEVICE_FUNC
-PacketType packetwise_redux_empty_value(const Func& ) { return pset1<PacketType>(0); }
+PacketType packetwise_redux_empty_value(const Func& ) {
  const typename unpacket_traits<PacketType>::type zero(0);
  return pset1<PacketType>(zero);
 }
 /* For products the default is 1 */
 template<typename PacketType,typename Scalar>
 EIGEN_DEVICE_FUNC
-PacketType packetwise_redux_empty_value(const scalar_product_op<Scalar,Scalar>& ) { return pset1<PacketType>(1); }
+PacketType packetwise_redux_empty_value(const scalar_product_op<Scalar,Scalar>& ) {
  return pset1<PacketType>(Scalar(1));
 }
 /* Perform the actual reduction */
 template<typename Func, typename Evaluator,
@@ -145,7 +150,7 @@ struct evaluator<PartialReduxExpr<ArgType, MemberOp, Direction> >
  enum {
    CoeffReadCost = TraversalSize==Dynamic ? HugeCost
                  : TraversalSize==0 ? 1
-                  : TraversalSize * evaluator<ArgType>::CoeffReadCost + int(CostOpType::value),
+                  : int(TraversalSize) * int(evaluator<ArgType>::CoeffReadCost) + int(CostOpType::value),
    _ArgFlags = evaluator<ArgType>::Flags,
--- a/Eigen/src/Core/PlainObjectBase.h
+++ b/Eigen/src/Core/PlainObjectBase.h
@@ -13,10 +13,10 @@
 #if defined(EIGEN_INITIALIZE_MATRICES_BY_ZERO)
 # define EIGEN_INITIALIZE_COEFFS
-# define EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED for(int i=0;i<base().size();++i) coeffRef(i)=Scalar(0);
+# define EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED for(Index i=0;i<base().size();++i) coeffRef(i)=Scalar(0);
 #elif defined(EIGEN_INITIALIZE_MATRICES_BY_NAN)
 # define EIGEN_INITIALIZE_COEFFS
-# define EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED for(int i=0;i<base().size();++i) coeffRef(i)=std::numeric_limits<Scalar>::quiet_NaN();
+# define EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED for(Index i=0;i<base().size();++i) coeffRef(i)=std::numeric_limits<Scalar>::quiet_NaN();
 #else
 # undef EIGEN_INITIALIZE_COEFFS
 # define EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
@@ -118,16 +118,8 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
    using Base::IsVectorAtCompileTime;
    using Base::Flags;
    template<typename PlainObjectType, int MapOptions, typename StrideType> friend class Eigen::Map;
    friend  class Eigen::Map<Derived, Unaligned>;
    typedef Eigen::Map<Derived, Unaligned>  MapType;
    friend  class Eigen::Map<const Derived, Unaligned>;
    typedef const Eigen::Map<const Derived, Unaligned> ConstMapType;
 #if EIGEN_MAX_ALIGN_BYTES>0
    // for EIGEN_MAX_ALIGN_BYTES==0, AlignedMax==Unaligned, and many compilers generate warnings for friend-ing a class twice.
    friend  class Eigen::Map<Derived, AlignedMax>;
    friend  class Eigen::Map<const Derived, AlignedMax>;
 #endif
    typedef Eigen::Map<Derived, AlignedMax> AlignedMapType;
    typedef const Eigen::Map<const Derived, AlignedMax> ConstAlignedMapType;
    template<typename StrideType> struct StridedMapType { typedef Eigen::Map<Derived, Unaligned, StrideType> type; };
@@ -147,10 +139,10 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
    EIGEN_DEVICE_FUNC
    const Base& base() const { return *static_cast<const Base*>(this); }
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
-    EIGEN_STRONG_INLINE Index rows() const { return m_storage.rows(); }
+    Index rows() const EIGEN_NOEXCEPT { return m_storage.rows(); }
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
-    EIGEN_STRONG_INLINE Index cols() const { return m_storage.cols(); }
+    Index cols() const EIGEN_NOEXCEPT { return m_storage.cols(); }
    /** This is an overloaded version of DenseCoeffsBase<Derived,ReadOnlyAccessors>::coeff(Index,Index) const
      * provided to by-pass the creation of an evaluator of the expression, thus saving compilation efforts.
@@ -508,8 +500,8 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
    EIGEN_DEVICE_FUNC
    PlainObjectBase& operator=(PlainObjectBase&& other) EIGEN_NOEXCEPT
    {
-      using std::swap;
+      _check_template_params();
-      swap(m_storage, other.m_storage);
+      m_storage = std::move(other.m_storage);
      return *this;
    }
 #endif
@@ -548,7 +540,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
      m_storage.data()[1] = a1;
      m_storage.data()[2] = a2;
      m_storage.data()[3] = a3;
-      int i = 4;
+      Index i = 4;
      auto x = {(m_storage.data()[i++] = args, 0)...};
      static_cast<void>(x);
    }
@@ -717,18 +709,26 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
    using Base::setConstant;
    EIGEN_DEVICE_FUNC Derived& setConstant(Index size, const Scalar& val);
    EIGEN_DEVICE_FUNC Derived& setConstant(Index rows, Index cols, const Scalar& val);
    EIGEN_DEVICE_FUNC Derived& setConstant(NoChange_t, Index cols, const Scalar& val);
    EIGEN_DEVICE_FUNC Derived& setConstant(Index rows, NoChange_t, const Scalar& val);
    using Base::setZero;
    EIGEN_DEVICE_FUNC Derived& setZero(Index size);
    EIGEN_DEVICE_FUNC Derived& setZero(Index rows, Index cols);
    EIGEN_DEVICE_FUNC Derived& setZero(NoChange_t, Index cols);
    EIGEN_DEVICE_FUNC Derived& setZero(Index rows, NoChange_t);
    using Base::setOnes;
    EIGEN_DEVICE_FUNC Derived& setOnes(Index size);
    EIGEN_DEVICE_FUNC Derived& setOnes(Index rows, Index cols);
    EIGEN_DEVICE_FUNC Derived& setOnes(NoChange_t, Index cols);
    EIGEN_DEVICE_FUNC Derived& setOnes(Index rows, NoChange_t);
    using Base::setRandom;
    Derived& setRandom(Index size);
    Derived& setRandom(Index rows, Index cols);
    Derived& setRandom(NoChange_t, Index cols);
    Derived& setRandom(Index rows, NoChange_t);
    #ifdef EIGEN_PLAINOBJECTBASE_PLUGIN
    #include EIGEN_PLAINOBJECTBASE_PLUGIN
@@ -967,8 +967,8 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
    EIGEN_DEVICE_FUNC
    static EIGEN_STRONG_INLINE void _check_template_params()
    {
-      EIGEN_STATIC_ASSERT((EIGEN_IMPLIES(MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1, (Options&RowMajor)==RowMajor)
+      EIGEN_STATIC_ASSERT((EIGEN_IMPLIES(MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1, (int(Options)&RowMajor)==RowMajor)
-                        && EIGEN_IMPLIES(MaxColsAtCompileTime==1 && MaxRowsAtCompileTime!=1, (Options&RowMajor)==0)
+                        && EIGEN_IMPLIES(MaxColsAtCompileTime==1 && MaxRowsAtCompileTime!=1, (int(Options)&RowMajor)==0)
                        && ((RowsAtCompileTime == Dynamic) || (RowsAtCompileTime >= 0))
                        && ((ColsAtCompileTime == Dynamic) || (ColsAtCompileTime >= 0))
                        && ((MaxRowsAtCompileTime == Dynamic) || (MaxRowsAtCompileTime >= 0))
@@ -980,6 +980,17 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
    }
    enum { IsPlainObjectBase = 1 };
 #endif
  public:
    // These apparently need to be down here for nvcc+icc to prevent duplicate
    // Map symbol.
    template<typename PlainObjectType, int MapOptions, typename StrideType> friend class Eigen::Map;
    friend class Eigen::Map<Derived, Unaligned>;
    friend class Eigen::Map<const Derived, Unaligned>;
 #if EIGEN_MAX_ALIGN_BYTES>0
    // for EIGEN_MAX_ALIGN_BYTES==0, AlignedMax==Unaligned, and many compilers generate warnings for friend-ing a class twice.
    friend class Eigen::Map<Derived, AlignedMax>;
    friend class Eigen::Map<const Derived, AlignedMax>;
 #endif
 };
@@ -1008,7 +1019,7 @@ struct conservative_resize_like_impl
    else
    {
      // The storage order does not allow us to use reallocation.
-      typename Derived::PlainObject tmp(rows,cols);
+      Derived tmp(rows,cols);
      const Index common_rows = numext::mini(rows, _this.rows());
      const Index common_cols = numext::mini(cols, _this.cols());
      tmp.block(0,0,common_rows,common_cols) = _this.block(0,0,common_rows,common_cols);
@@ -1043,7 +1054,7 @@ struct conservative_resize_like_impl
    else
    {
      // The storage order does not allow us to use reallocation.
-      typename Derived::PlainObject tmp(other);
+      Derived tmp(other);
      const Index common_rows = numext::mini(tmp.rows(), _this.rows());
      const Index common_cols = numext::mini(tmp.cols(), _this.cols());
      tmp.block(0,0,common_rows,common_cols) = _this.block(0,0,common_rows,common_cols);
--- a/Eigen/src/Core/Product.h
+++ b/Eigen/src/Core/Product.h
@@ -98,10 +98,10 @@ class Product : public ProductImpl<_Lhs,_Rhs,Option,
        && "if you wanted a coeff-wise or a dot product use the respective explicit functions");
    }
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
-    Index rows() const { return m_lhs.rows(); }
+    Index rows() const EIGEN_NOEXCEPT { return m_lhs.rows(); }
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
-    Index cols() const { return m_rhs.cols(); }
+    Index cols() const EIGEN_NOEXCEPT { return m_rhs.cols(); }
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    const LhsNestedCleaned& lhs() const { return m_lhs; }
--- a/Eigen/src/Core/ProductEvaluators.h
+++ b/Eigen/src/Core/ProductEvaluators.h
@@ -375,6 +375,11 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemvProduct>
  template<typename Dest>
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
  {
    // Fallback to inner product if both the lhs and rhs are runtime vectors.
    if (lhs.rows() == 1 && rhs.cols() == 1) {
      dst.coeffRef(0,0) += alpha * lhs.row(0).conjugate().dot(rhs.col(0));
      return;
    }
    LhsNested actual_lhs(lhs);
    RhsNested actual_rhs(rhs);
    internal::gemv_dense_selector<Side,
@@ -436,8 +441,8 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,CoeffBasedProductMode>
    };
    // FIXME: in c++11 this should be auto, and extractScalarFactor should also return auto
    //        this is important for real*complex_mat
-    Scalar actualAlpha =    blas_traits<Lhs>::extractScalarFactor(lhs)
+    Scalar actualAlpha = combine_scalar_factors<Scalar>(lhs, rhs);
-                          * blas_traits<Rhs>::extractScalarFactor(rhs);
+
    eval_dynamic_impl(dst,
                      blas_traits<Lhs>::extract(lhs).template conjugateIf<ConjLhs>(),
                      blas_traits<Rhs>::extract(rhs).template conjugateIf<ConjRhs>(),
@@ -544,7 +549,7 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
    RhsCoeffReadCost = RhsEtorType::CoeffReadCost,
    CoeffReadCost = InnerSize==0 ? NumTraits<Scalar>::ReadCost
                  : InnerSize == Dynamic ? HugeCost
-                  : InnerSize * (NumTraits<Scalar>::MulCost + LhsCoeffReadCost + RhsCoeffReadCost)
+                    : InnerSize * (NumTraits<Scalar>::MulCost + int(LhsCoeffReadCost) + int(RhsCoeffReadCost))
                    + (InnerSize - 1) * NumTraits<Scalar>::AddCost,
    Unroll = CoeffReadCost <= EIGEN_UNROLLING_LIMIT,
@@ -571,7 +576,7 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
                    : (MaxColsAtCompileTime==1&&MaxRowsAtCompileTime!=1) ? 0
                    : (bool(RhsRowMajor) && !CanVectorizeLhs),
-    Flags = ((unsigned int)(LhsFlags | RhsFlags) & HereditaryBits & ~RowMajorBit)
+    Flags = ((int(LhsFlags) | int(RhsFlags)) & HereditaryBits & ~RowMajorBit)
          | (EvalToRowMajor ? RowMajorBit : 0)
          // TODO enable vectorization for mixed types
          | (SameType && (CanVectorizeLhs || CanVectorizeRhs) ? PacketAccessBit : 0)
@@ -592,8 +597,8 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
    CanVectorizeInner =    SameType
                        && LhsRowMajor
                        && (!RhsRowMajor)
-                        && (LhsFlags & RhsFlags & ActualPacketAccessBit)
+                        && (int(LhsFlags) & int(RhsFlags) & ActualPacketAccessBit)
-                        && (InnerSize % packet_traits<Scalar>::size == 0)
+                        && (int(InnerSize) % packet_traits<Scalar>::size == 0)
  };
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CoeffReturnType coeff(Index row, Index col) const
@@ -668,7 +673,7 @@ struct product_evaluator<Product<Lhs, Rhs, DefaultProduct>, LazyCoeffBasedProduc
 template<int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode>
 struct etor_product_packet_impl<RowMajor, UnrollingIndex, Lhs, Rhs, Packet, LoadMode>
 {
-  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet &res)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet &res)
  {
    etor_product_packet_impl<RowMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, innerDim, res);
    res =  pmadd(pset1<Packet>(lhs.coeff(row, Index(UnrollingIndex-1))), rhs.template packet<LoadMode,Packet>(Index(UnrollingIndex-1), col), res);
@@ -678,7 +683,7 @@ struct etor_product_packet_impl<RowMajor, UnrollingIndex, Lhs, Rhs, Packet, Load
 template<int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode>
 struct etor_product_packet_impl<ColMajor, UnrollingIndex, Lhs, Rhs, Packet, LoadMode>
 {
-  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet &res)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet &res)
  {
    etor_product_packet_impl<ColMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, innerDim, res);
    res =  pmadd(lhs.template packet<LoadMode,Packet>(row, Index(UnrollingIndex-1)), pset1<Packet>(rhs.coeff(Index(UnrollingIndex-1), col)), res);
@@ -688,7 +693,7 @@ struct etor_product_packet_impl<ColMajor, UnrollingIndex, Lhs, Rhs, Packet, Load
 template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
 struct etor_product_packet_impl<RowMajor, 1, Lhs, Rhs, Packet, LoadMode>
 {
-  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, Packet &res)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, Packet &res)
  {
    res = pmul(pset1<Packet>(lhs.coeff(row, Index(0))),rhs.template packet<LoadMode,Packet>(Index(0), col));
  }
@@ -697,7 +702,7 @@ struct etor_product_packet_impl<RowMajor, 1, Lhs, Rhs, Packet, LoadMode>
 template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
 struct etor_product_packet_impl<ColMajor, 1, Lhs, Rhs, Packet, LoadMode>
 {
-  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, Packet &res)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, Packet &res)
  {
    res = pmul(lhs.template packet<LoadMode,Packet>(row, Index(0)), pset1<Packet>(rhs.coeff(Index(0), col)));
  }
@@ -706,7 +711,7 @@ struct etor_product_packet_impl<ColMajor, 1, Lhs, Rhs, Packet, LoadMode>
 template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
 struct etor_product_packet_impl<RowMajor, 0, Lhs, Rhs, Packet, LoadMode>
 {
-  static EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, const Rhs& /*rhs*/, Index /*innerDim*/, Packet &res)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, const Rhs& /*rhs*/, Index /*innerDim*/, Packet &res)
  {
    res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
  }
@@ -715,7 +720,7 @@ struct etor_product_packet_impl<RowMajor, 0, Lhs, Rhs, Packet, LoadMode>
 template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
 struct etor_product_packet_impl<ColMajor, 0, Lhs, Rhs, Packet, LoadMode>
 {
-  static EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, const Rhs& /*rhs*/, Index /*innerDim*/, Packet &res)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, const Rhs& /*rhs*/, Index /*innerDim*/, Packet &res)
  {
    res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
  }
@@ -724,7 +729,7 @@ struct etor_product_packet_impl<ColMajor, 0, Lhs, Rhs, Packet, LoadMode>
 template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
 struct etor_product_packet_impl<RowMajor, Dynamic, Lhs, Rhs, Packet, LoadMode>
 {
-  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet& res)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet& res)
  {
    res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
    for(Index i = 0; i < innerDim; ++i)
@@ -735,7 +740,7 @@ struct etor_product_packet_impl<RowMajor, Dynamic, Lhs, Rhs, Packet, LoadMode>
 template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
 struct etor_product_packet_impl<ColMajor, Dynamic, Lhs, Rhs, Packet, LoadMode>
 {
-  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet& res)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet& res)
  {
    res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
    for(Index i = 0; i < innerDim; ++i)
@@ -826,7 +831,7 @@ struct diagonal_product_evaluator_base
   typedef typename ScalarBinaryOpTraits<typename MatrixType::Scalar, typename DiagonalType::Scalar>::ReturnType Scalar;
 public:
  enum {
-    CoeffReadCost = NumTraits<Scalar>::MulCost + evaluator<MatrixType>::CoeffReadCost + evaluator<DiagonalType>::CoeffReadCost,
+    CoeffReadCost = int(NumTraits<Scalar>::MulCost) + int(evaluator<MatrixType>::CoeffReadCost) + int(evaluator<DiagonalType>::CoeffReadCost),
    MatrixFlags = evaluator<MatrixType>::Flags,
    DiagFlags = evaluator<DiagonalType>::Flags,
@@ -854,7 +859,7 @@ public:
                      ||  (DiagonalType::SizeAtCompileTime==Dynamic && MatrixType::ColsAtCompileTime==1 && ProductOrder==OnTheRight)
  };
-  diagonal_product_evaluator_base(const MatrixType &mat, const DiagonalType &diag)
+  EIGEN_DEVICE_FUNC diagonal_product_evaluator_base(const MatrixType &mat, const DiagonalType &diag)
    : m_diagImpl(diag), m_matImpl(mat)
  {
    EIGEN_INTERNAL_CHECK_COST_VALUE(NumTraits<Scalar>::MulCost);
@@ -999,7 +1004,7 @@ struct permutation_matrix_product<ExpressionType, Side, Transposed, DenseShape>
    typedef typename remove_all<MatrixType>::type MatrixTypeCleaned;
    template<typename Dest, typename PermutationType>
-    static inline void run(Dest& dst, const PermutationType& perm, const ExpressionType& xpr)
+    static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Dest& dst, const PermutationType& perm, const ExpressionType& xpr)
    {
      MatrixType mat(xpr);
      const Index n = Side==OnTheLeft ? mat.rows() : mat.cols();
@@ -1053,7 +1058,7 @@ template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
 struct generic_product_impl<Lhs, Rhs, PermutationShape, MatrixShape, ProductTag>
 {
  template<typename Dest>
-  static void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs)
  {
    permutation_matrix_product<Rhs, OnTheLeft, false, MatrixShape>::run(dst, lhs, rhs);
  }
@@ -1063,7 +1068,7 @@ template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
 struct generic_product_impl<Lhs, Rhs, MatrixShape, PermutationShape, ProductTag>
 {
  template<typename Dest>
-  static void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs)
  {
    permutation_matrix_product<Lhs, OnTheRight, false, MatrixShape>::run(dst, rhs, lhs);
  }
@@ -1073,7 +1078,7 @@ template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
 struct generic_product_impl<Inverse<Lhs>, Rhs, PermutationShape, MatrixShape, ProductTag>
 {
  template<typename Dest>
-  static void evalTo(Dest& dst, const Inverse<Lhs>& lhs, const Rhs& rhs)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Inverse<Lhs>& lhs, const Rhs& rhs)
  {
    permutation_matrix_product<Rhs, OnTheLeft, true, MatrixShape>::run(dst, lhs.nestedExpression(), rhs);
  }
@@ -1083,7 +1088,7 @@ template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
 struct generic_product_impl<Lhs, Inverse<Rhs>, MatrixShape, PermutationShape, ProductTag>
 {
  template<typename Dest>
-  static void evalTo(Dest& dst, const Lhs& lhs, const Inverse<Rhs>& rhs)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Lhs& lhs, const Inverse<Rhs>& rhs)
  {
    permutation_matrix_product<Lhs, OnTheRight, true, MatrixShape>::run(dst, rhs.nestedExpression(), lhs);
  }
@@ -1107,7 +1112,7 @@ struct transposition_matrix_product
  typedef typename remove_all<MatrixType>::type MatrixTypeCleaned;
  template<typename Dest, typename TranspositionType>
-  static inline void run(Dest& dst, const TranspositionType& tr, const ExpressionType& xpr)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Dest& dst, const TranspositionType& tr, const ExpressionType& xpr)
  {
    MatrixType mat(xpr);
    typedef typename TranspositionType::StorageIndex StorageIndex;
@@ -1130,7 +1135,7 @@ template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
 struct generic_product_impl<Lhs, Rhs, TranspositionsShape, MatrixShape, ProductTag>
 {
  template<typename Dest>
-  static void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs)
  {
    transposition_matrix_product<Rhs, OnTheLeft, false, MatrixShape>::run(dst, lhs, rhs);
  }
@@ -1140,7 +1145,7 @@ template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
 struct generic_product_impl<Lhs, Rhs, MatrixShape, TranspositionsShape, ProductTag>
 {
  template<typename Dest>
-  static void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs)
  {
    transposition_matrix_product<Lhs, OnTheRight, false, MatrixShape>::run(dst, rhs, lhs);
  }
@@ -1151,7 +1156,7 @@ template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
 struct generic_product_impl<Transpose<Lhs>, Rhs, TranspositionsShape, MatrixShape, ProductTag>
 {
  template<typename Dest>
-  static void evalTo(Dest& dst, const Transpose<Lhs>& lhs, const Rhs& rhs)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Transpose<Lhs>& lhs, const Rhs& rhs)
  {
    transposition_matrix_product<Rhs, OnTheLeft, true, MatrixShape>::run(dst, lhs.nestedExpression(), rhs);
  }
@@ -1161,7 +1166,7 @@ template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
 struct generic_product_impl<Lhs, Transpose<Rhs>, MatrixShape, TranspositionsShape, ProductTag>
 {
  template<typename Dest>
-  static void evalTo(Dest& dst, const Lhs& lhs, const Transpose<Rhs>& rhs)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Lhs& lhs, const Transpose<Rhs>& rhs)
  {
    transposition_matrix_product<Lhs, OnTheRight, true, MatrixShape>::run(dst, rhs.nestedExpression(), lhs);
  }
--- a/Eigen/src/Core/Random.h
+++ b/Eigen/src/Core/Random.h
@@ -177,6 +177,42 @@ PlainObjectBase<Derived>::setRandom(Index rows, Index cols)
  return setRandom();
 }
 /** Resizes to the given size, changing only the number of columns, and sets all
  * coefficients in this expression to random values. For the parameter of type
  * NoChange_t, just pass the special value \c NoChange.
  *
  * Numbers are uniformly spread through their whole definition range for integer types,
  * and in the [-1:1] range for floating point scalar types.
  *
  * \not_reentrant
  *
  * \sa DenseBase::setRandom(), setRandom(Index), setRandom(Index, NoChange_t), class CwiseNullaryOp, DenseBase::Random()
  */
 template<typename Derived>
 EIGEN_STRONG_INLINE Derived&
 PlainObjectBase<Derived>::setRandom(NoChange_t, Index cols)
 {
  return setRandom(rows(), cols);
 }
 /** Resizes to the given size, changing only the number of rows, and sets all
  * coefficients in this expression to random values. For the parameter of type
  * NoChange_t, just pass the special value \c NoChange.
  *
  * Numbers are uniformly spread through their whole definition range for integer types,
  * and in the [-1:1] range for floating point scalar types.
  *
  * \not_reentrant
  *
  * \sa DenseBase::setRandom(), setRandom(Index), setRandom(NoChange_t, Index), class CwiseNullaryOp, DenseBase::Random()
  */
 template<typename Derived>
 EIGEN_STRONG_INLINE Derived&
 PlainObjectBase<Derived>::setRandom(Index rows, NoChange_t)
 {
  return setRandom(rows, cols());
 }
 } // end namespace Eigen
 #endif // EIGEN_RANDOM_H
--- a/Eigen/src/Core/Redux.h
+++ b/Eigen/src/Core/Redux.h
@@ -58,7 +58,7 @@ public:
 public:
  enum {
    Cost = Evaluator::SizeAtCompileTime == Dynamic ? HugeCost
-         : Evaluator::SizeAtCompileTime * Evaluator::CoeffReadCost + (Evaluator::SizeAtCompileTime-1) * functor_traits<Func>::Cost,
+         : int(Evaluator::SizeAtCompileTime) * int(Evaluator::CoeffReadCost) + (Evaluator::SizeAtCompileTime-1) * functor_traits<Func>::Cost,
    UnrollingLimit = EIGEN_UNROLLING_LIMIT * (int(Traversal) == int(DefaultTraversal) ? 1 : int(PacketSize))
  };
@@ -331,7 +331,7 @@ struct redux_impl<Func, Evaluator, LinearVectorizedTraversal, CompleteUnrolling>
  enum {
    PacketSize = redux_traits<Func, Evaluator>::PacketSize,
    Size = Evaluator::SizeAtCompileTime,
-    VectorizedSize = (Size / PacketSize) * PacketSize
+    VectorizedSize = (int(Size) / int(PacketSize)) * int(PacketSize)
  };
  template<typename XprType>
@@ -419,25 +419,33 @@ DenseBase<Derived>::redux(const Func& func) const
 }
 /** \returns the minimum of all coefficients of \c *this.
  * In case \c *this contains NaN, NaNPropagation determines the behavior:
  *   NaNPropagation == PropagateFast : undefined
  *   NaNPropagation == PropagateNaN : result is NaN
  *   NaNPropagation == PropagateNumbers : result is minimum of elements that are not NaN
 * \warning the matrix must not be empty, otherwise an assertion is triggered.
 * \warning with NaNPropagation == PropagateFast, the result is undefined if \c *this contains NaN.
  */
 template<typename Derived>
 template<int NaNPropagation>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::minCoeff() const
 {
-  return derived().redux(Eigen::internal::scalar_min_op<Scalar,Scalar>());
+  return derived().redux(Eigen::internal::scalar_min_op<Scalar,Scalar, NaNPropagation>());
 }
 /** \returns the maximum of all coefficients of \c *this. 
  * In case \c *this contains NaN, NaNPropagation determines the behavior:
  *   NaNPropagation == PropagateFast : undefined
  *   NaNPropagation == PropagateNaN : result is NaN
  *   NaNPropagation == PropagateNumbers : result is maximum of elements that are not NaN
 * \warning the matrix must not be empty, otherwise an assertion is triggered.
 * \warning with NaNPropagation == PropagateFast, the result is undefined if \c *this contains NaN.
  */
 template<typename Derived>
 template<int NaNPropagation>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::maxCoeff() const
 {
-  return derived().redux(Eigen::internal::scalar_max_op<Scalar,Scalar>());
+  return derived().redux(Eigen::internal::scalar_max_op<Scalar,Scalar, NaNPropagation>());
 }
 /** \returns the sum of all coefficients of \c *this
--- a/Eigen/src/Core/Ref.h
+++ b/Eigen/src/Core/Ref.h
@@ -67,12 +67,12 @@ public:
  typedef MapBase<Derived> Base;
  EIGEN_DENSE_PUBLIC_INTERFACE(RefBase)
-  EIGEN_DEVICE_FUNC inline Index innerStride() const
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index innerStride() const
  {
    return StrideType::InnerStrideAtCompileTime != 0 ? m_stride.inner() : 1;
  }
-  EIGEN_DEVICE_FUNC inline Index outerStride() const
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outerStride() const
  {
    return StrideType::OuterStrideAtCompileTime != 0 ? m_stride.outer()
         : IsVectorAtCompileTime ? this->size()
@@ -93,29 +93,115 @@ protected:
  typedef Stride<StrideType::OuterStrideAtCompileTime,StrideType::InnerStrideAtCompileTime> StrideBase;
-  template<typename Expression>
+  // Resolves inner stride if default 0.
-  EIGEN_DEVICE_FUNC void construct(Expression& expr)
+  static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index resolveInnerStride(Index inner) {
-  {
+    return inner == 0 ? 1 : inner;
-    EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(PlainObjectType,Expression);
+  }
  // Resolves outer stride if default 0.
  static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index resolveOuterStride(Index inner, Index outer, Index rows, Index cols, bool isVectorAtCompileTime, bool isRowMajor) {
    return outer == 0 ? isVectorAtCompileTime ? inner * rows * cols : isRowMajor ? inner * cols : inner * rows : outer;
  }
  // Returns true if construction is valid, false if there is a stride mismatch,
  // and fails if there is a size mismatch.
  template<typename Expression>
  EIGEN_DEVICE_FUNC bool construct(Expression& expr)
  {
    // Check matrix sizes.  If this is a compile-time vector, we do allow
    // implicitly transposing.
    EIGEN_STATIC_ASSERT(
      EIGEN_PREDICATE_SAME_MATRIX_SIZE(PlainObjectType, Expression)
      // If it is a vector, the transpose sizes might match.
      || ( PlainObjectType::IsVectorAtCompileTime
            && ((int(PlainObjectType::RowsAtCompileTime)==Eigen::Dynamic
              || int(Expression::ColsAtCompileTime)==Eigen::Dynamic
              || int(PlainObjectType::RowsAtCompileTime)==int(Expression::ColsAtCompileTime))
            &&  (int(PlainObjectType::ColsAtCompileTime)==Eigen::Dynamic
              || int(Expression::RowsAtCompileTime)==Eigen::Dynamic
              || int(PlainObjectType::ColsAtCompileTime)==int(Expression::RowsAtCompileTime)))),
      YOU_MIXED_MATRICES_OF_DIFFERENT_SIZES
    )
    // Determine runtime rows and columns.
    Index rows = expr.rows();
    Index cols = expr.cols();
    if(PlainObjectType::RowsAtCompileTime==1)
    {
      eigen_assert(expr.rows()==1 || expr.cols()==1);
-      ::new (static_cast<Base*>(this)) Base(expr.data(), 1, expr.size());
+      rows = 1;
      cols = expr.size();
    }
    else if(PlainObjectType::ColsAtCompileTime==1)
    {
      eigen_assert(expr.rows()==1 || expr.cols()==1);
-      ::new (static_cast<Base*>(this)) Base(expr.data(), expr.size(), 1);
+      rows = expr.size();
      cols = 1;
    }
-    else
+    // Verify that the sizes are valid.
-      ::new (static_cast<Base*>(this)) Base(expr.data(), expr.rows(), expr.cols());
+    eigen_assert(
      (PlainObjectType::RowsAtCompileTime == Dynamic) || (PlainObjectType::RowsAtCompileTime == rows));
    eigen_assert(
      (PlainObjectType::ColsAtCompileTime == Dynamic) || (PlainObjectType::ColsAtCompileTime == cols));
-    if(Expression::IsVectorAtCompileTime && (!PlainObjectType::IsVectorAtCompileTime) && ((Expression::Flags&RowMajorBit)!=(PlainObjectType::Flags&RowMajorBit)))
+
-      ::new (&m_stride) StrideBase(expr.innerStride(), StrideType::InnerStrideAtCompileTime==0?0:1);
+    // If this is a vector, we might be transposing, which means that stride should swap.
-    else
+    const bool transpose = PlainObjectType::IsVectorAtCompileTime && (rows != expr.rows());
-      ::new (&m_stride) StrideBase(StrideType::OuterStrideAtCompileTime==0?0:expr.outerStride(),
+    // If the storage format differs, we also need to swap the stride.
-                                   StrideType::InnerStrideAtCompileTime==0?0:expr.innerStride());    
+    const bool row_major = ((PlainObjectType::Flags)&RowMajorBit) != 0;
    const bool expr_row_major = (Expression::Flags&RowMajorBit) != 0;
    const bool storage_differs =  (row_major != expr_row_major);
    const bool swap_stride = (transpose != storage_differs);
    // Determine expr's actual strides, resolving any defaults if zero.
    const Index expr_inner_actual = resolveInnerStride(expr.innerStride());
    const Index expr_outer_actual = resolveOuterStride(expr_inner_actual,
                                                       expr.outerStride(),
                                                       expr.rows(),
                                                       expr.cols(),
                                                       Expression::IsVectorAtCompileTime != 0,
                                                       expr_row_major);
    // If this is a column-major row vector or row-major column vector, the inner-stride
    // is arbitrary, so set it to either the compile-time inner stride or 1.
    const bool row_vector = (rows == 1);
    const bool col_vector = (cols == 1);
    const Index inner_stride =
        ( (!row_major && row_vector) || (row_major && col_vector) ) ?
            ( StrideType::InnerStrideAtCompileTime > 0 ? Index(StrideType::InnerStrideAtCompileTime) : 1)
            : swap_stride ? expr_outer_actual : expr_inner_actual;
    // If this is a column-major column vector or row-major row vector, the outer-stride
    // is arbitrary, so set it to either the compile-time outer stride or vector size.
    const Index outer_stride =
      ( (!row_major && col_vector) || (row_major && row_vector) ) ?
          ( StrideType::OuterStrideAtCompileTime > 0 ? Index(StrideType::OuterStrideAtCompileTime) : rows * cols * inner_stride)
          : swap_stride ? expr_inner_actual : expr_outer_actual;
    // Check if given inner/outer strides are compatible with compile-time strides.
    const bool inner_valid = (StrideType::InnerStrideAtCompileTime == Dynamic)
        || (resolveInnerStride(Index(StrideType::InnerStrideAtCompileTime)) == inner_stride);
    if (!inner_valid) {
      return false;
    }
    const bool outer_valid = (StrideType::OuterStrideAtCompileTime == Dynamic)
        || (resolveOuterStride(
              inner_stride,
              Index(StrideType::OuterStrideAtCompileTime),
              rows, cols, PlainObjectType::IsVectorAtCompileTime != 0,
              row_major)
            == outer_stride);
    if (!outer_valid) {
      return false;
    }
    ::new (static_cast<Base*>(this)) Base(expr.data(), rows, cols);
    ::new (&m_stride) StrideBase(
      (StrideType::OuterStrideAtCompileTime == 0) ? 0 : outer_stride,
      (StrideType::InnerStrideAtCompileTime == 0) ? 0 : inner_stride );
    return true;
  }
  StrideBase m_stride;
@@ -212,7 +298,10 @@ template<typename PlainObjectType, int Options, typename StrideType> class Ref
                                 typename internal::enable_if<bool(Traits::template match<Derived>::MatchAtCompileTime),Derived>::type* = 0)
    {
      EIGEN_STATIC_ASSERT(bool(Traits::template match<Derived>::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH);
-      Base::construct(expr.derived());
+      // Construction must pass since we will not create temporary storage in the non-const case.
      const bool success = Base::construct(expr.derived());
      EIGEN_UNUSED_VARIABLE(success)
      eigen_assert(success);
    }
    template<typename Derived>
    EIGEN_DEVICE_FUNC inline Ref(const DenseBase<Derived>& expr,
@@ -223,10 +312,13 @@ template<typename PlainObjectType, int Options, typename StrideType> class Ref
    inline Ref(DenseBase<Derived>& expr)
    #endif
    {
-      EIGEN_STATIC_ASSERT(bool(internal::is_lvalue<Derived>::value), THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY);
+      EIGEN_STATIC_ASSERT((static_cast<bool>(internal::is_lvalue<Derived>::value)), THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY);
-      EIGEN_STATIC_ASSERT(bool(Traits::template match<Derived>::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH);
+      EIGEN_STATIC_ASSERT((static_cast<bool>(Traits::template match<Derived>::MatchAtCompileTime)), STORAGE_LAYOUT_DOES_NOT_MATCH);
      EIGEN_STATIC_ASSERT(!Derived::IsPlainObjectBase,THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY);
-      Base::construct(expr.const_cast_derived());
+      // Construction must pass since we will not create temporary storage in the non-const case.
      const bool success = Base::construct(expr.const_cast_derived());
      EIGEN_UNUSED_VARIABLE(success)
      eigen_assert(success);
    }
    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Ref)
@@ -267,7 +359,10 @@ template<typename TPlainObjectType, int Options, typename StrideType> class Ref<
    template<typename Expression>
    EIGEN_DEVICE_FUNC void construct(const Expression& expr,internal::true_type)
    {
-      Base::construct(expr);
+      // Check if we can use the underlying expr's storage directly, otherwise call the copy version.
      if (!Base::construct(expr)) {
        construct(expr, internal::false_type());
      }
    }
    template<typename Expression>
--- a/Eigen/src/Core/Replicate.h
+++ b/Eigen/src/Core/Replicate.h
@@ -88,9 +88,9 @@ template<typename MatrixType,int RowFactor,int ColFactor> class Replicate
                          THE_MATRIX_OR_EXPRESSION_THAT_YOU_PASSED_DOES_NOT_HAVE_THE_EXPECTED_TYPE)
    }
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
    inline Index rows() const { return m_matrix.rows() * m_rowFactor.value(); }
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
    inline Index cols() const { return m_matrix.cols() * m_colFactor.value(); }
    EIGEN_DEVICE_FUNC
--- a/Eigen/src/Core/Reshaped.h
+++ b/Eigen/src/Core/Reshaped.h
@@ -12,7 +12,6 @@
 #define EIGEN_RESHAPED_H
 namespace Eigen {
 namespace internal {
 /** \class Reshaped
  * \ingroup Core_Module
@@ -44,6 +43,8 @@ namespace internal {
  * \sa DenseBase::reshaped(NRowsType,NColsType)
  */
 namespace internal {
 template<typename XprType, int Rows, int Cols, int Order>
 struct traits<Reshaped<XprType, Rows, Cols, Order> > : traits<XprType>
 {
@@ -239,17 +240,17 @@ class ReshapedImpl_dense<XprType, Rows, Cols, Order, true>
    XprType& nestedExpression() { return m_xpr; }
    /** \sa MapBase::innerStride() */
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
    inline Index innerStride() const
    {
      return m_xpr.innerStride();
    }
    /** \sa MapBase::outerStride() */
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
    inline Index outerStride() const
    {
-      return ((Flags&RowMajorBit)==RowMajorBit) ? this->cols() : this->rows();
+      return (((Flags&RowMajorBit)==RowMajorBit) ? this->cols() : this->rows()) * m_xpr.innerStride();
    }
  protected:
--- a/Eigen/src/Core/ReturnByValue.h
+++ b/Eigen/src/Core/ReturnByValue.h
@@ -60,8 +60,10 @@ template<typename Derived> class ReturnByValue
    EIGEN_DEVICE_FUNC
    inline void evalTo(Dest& dst) const
    { static_cast<const Derived*>(this)->evalTo(dst); }
-    EIGEN_DEVICE_FUNC inline Index rows() const { return static_cast<const Derived*>(this)->rows(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
-    EIGEN_DEVICE_FUNC inline Index cols() const { return static_cast<const Derived*>(this)->cols(); }
+    inline Index rows() const EIGEN_NOEXCEPT { return static_cast<const Derived*>(this)->rows(); }
    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
    inline Index cols() const EIGEN_NOEXCEPT { return static_cast<const Derived*>(this)->cols(); }
 #ifndef EIGEN_PARSED_BY_DOXYGEN
 #define Unusable YOU_ARE_TRYING_TO_ACCESS_A_SINGLE_COEFFICIENT_IN_A_SPECIAL_EXPRESSION_WHERE_THAT_IS_NOT_ALLOWED_BECAUSE_THAT_WOULD_BE_INEFFICIENT
--- a/Eigen/src/Core/Reverse.h
+++ b/Eigen/src/Core/Reverse.h
@@ -89,8 +89,10 @@ template<typename MatrixType, int Direction> class Reverse
    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Reverse)
-    EIGEN_DEVICE_FUNC inline Index rows() const { return m_matrix.rows(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
-    EIGEN_DEVICE_FUNC inline Index cols() const { return m_matrix.cols(); }
+    inline Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows(); }
    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
    inline Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); }
    EIGEN_DEVICE_FUNC inline Index innerStride() const
    {
--- a/Eigen/src/Core/Select.h
+++ b/Eigen/src/Core/Select.h
@@ -67,8 +67,10 @@ class Select : public internal::dense_xpr_base< Select<ConditionMatrixType, Then
      eigen_assert(m_condition.cols() == m_then.cols() && m_condition.cols() == m_else.cols());
    }
-    inline EIGEN_DEVICE_FUNC Index rows() const { return m_condition.rows(); }
+    inline EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
-    inline EIGEN_DEVICE_FUNC Index cols() const { return m_condition.cols(); }
+    Index rows() const EIGEN_NOEXCEPT { return m_condition.rows(); }
    inline EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
    Index cols() const EIGEN_NOEXCEPT { return m_condition.cols(); }
    inline EIGEN_DEVICE_FUNC
    const Scalar coeff(Index i, Index j) const
@@ -120,7 +122,7 @@ class Select : public internal::dense_xpr_base< Select<ConditionMatrixType, Then
  */
 template<typename Derived>
 template<typename ThenDerived,typename ElseDerived>
-inline const Select<Derived,ThenDerived,ElseDerived>
+inline EIGEN_DEVICE_FUNC const Select<Derived,ThenDerived,ElseDerived>
 DenseBase<Derived>::select(const DenseBase<ThenDerived>& thenMatrix,
                            const DenseBase<ElseDerived>& elseMatrix) const
 {
@@ -134,7 +136,7 @@ DenseBase<Derived>::select(const DenseBase<ThenDerived>& thenMatrix,
  */
 template<typename Derived>
 template<typename ThenDerived>
-inline const Select<Derived,ThenDerived, typename ThenDerived::ConstantReturnType>
+inline EIGEN_DEVICE_FUNC const Select<Derived,ThenDerived, typename ThenDerived::ConstantReturnType>
 DenseBase<Derived>::select(const DenseBase<ThenDerived>& thenMatrix,
                           const typename ThenDerived::Scalar& elseScalar) const
 {
@@ -149,7 +151,7 @@ DenseBase<Derived>::select(const DenseBase<ThenDerived>& thenMatrix,
  */
 template<typename Derived>
 template<typename ElseDerived>
-inline const Select<Derived, typename ElseDerived::ConstantReturnType, ElseDerived >
+inline EIGEN_DEVICE_FUNC const Select<Derived, typename ElseDerived::ConstantReturnType, ElseDerived >
 DenseBase<Derived>::select(const typename ElseDerived::Scalar& thenScalar,
                           const DenseBase<ElseDerived>& elseMatrix) const
 {
--- a/Eigen/src/Core/SelfAdjointView.h
+++ b/Eigen/src/Core/SelfAdjointView.h
@@ -66,7 +66,7 @@ template<typename _MatrixType, unsigned int UpLo> class SelfAdjointView
    enum {
      Mode = internal::traits<SelfAdjointView>::Mode,
      Flags = internal::traits<SelfAdjointView>::Flags,
-      TransposeMode = ((Mode & Upper) ? Lower : 0) | ((Mode & Lower) ? Upper : 0)
+      TransposeMode = ((int(Mode) & int(Upper)) ? Lower : 0) | ((int(Mode) & int(Lower)) ? Upper : 0)
    };
    typedef typename MatrixType::PlainObject PlainObject;
@@ -76,14 +76,14 @@ template<typename _MatrixType, unsigned int UpLo> class SelfAdjointView
      EIGEN_STATIC_ASSERT(UpLo==Lower || UpLo==Upper,SELFADJOINTVIEW_ACCEPTS_UPPER_AND_LOWER_MODE_ONLY);
    }
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
-    inline Index rows() const { return m_matrix.rows(); }
+    inline Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows(); }
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
-    inline Index cols() const { return m_matrix.cols(); }
+    inline Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); }
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
-    inline Index outerStride() const { return m_matrix.outerStride(); }
+    inline Index outerStride() const EIGEN_NOEXCEPT { return m_matrix.outerStride(); }
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
-    inline Index innerStride() const { return m_matrix.innerStride(); }
+    inline Index innerStride() const EIGEN_NOEXCEPT { return m_matrix.innerStride(); }
    /** \sa MatrixBase::coeff()
      * \warning the coordinates must fit into the referenced triangular part
--- a/Eigen/src/Core/Solve.h
+++ b/Eigen/src/Core/Solve.h
@@ -69,8 +69,8 @@ public:
    : m_dec(dec), m_rhs(rhs)
  {}
-  EIGEN_DEVICE_FUNC Index rows() const { return m_dec.cols(); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_dec.cols(); }
-  EIGEN_DEVICE_FUNC Index cols() const { return m_rhs.cols(); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_rhs.cols(); }
  EIGEN_DEVICE_FUNC const Decomposition& dec() const { return m_dec; }
  EIGEN_DEVICE_FUNC const RhsType&       rhs() const { return m_rhs; }
--- a/Eigen/src/Core/SolveTriangular.h
+++ b/Eigen/src/Core/SolveTriangular.h
@@ -54,7 +54,7 @@ struct triangular_solver_selector<Lhs,Rhs,Side,Mode,NoUnrolling,1>
  typedef blas_traits<Lhs> LhsProductTraits;
  typedef typename LhsProductTraits::ExtractType ActualLhsType;
  typedef Map<Matrix<RhsScalar,Dynamic,1>, Aligned> MappedRhs;
-  static void run(const Lhs& lhs, Rhs& rhs)
+  static EIGEN_DEVICE_FUNC void run(const Lhs& lhs, Rhs& rhs)
  {
    ActualLhsType actualLhs = LhsProductTraits::extract(lhs);
@@ -85,7 +85,7 @@ struct triangular_solver_selector<Lhs,Rhs,Side,Mode,NoUnrolling,Dynamic>
  typedef blas_traits<Lhs> LhsProductTraits;
  typedef typename LhsProductTraits::DirectLinearAccessType ActualLhsType;
-  static void run(const Lhs& lhs, Rhs& rhs)
+  static EIGEN_DEVICE_FUNC void run(const Lhs& lhs, Rhs& rhs)
  {
    typename internal::add_const_on_value_type<ActualLhsType>::type actualLhs = LhsProductTraits::extract(lhs);
@@ -118,7 +118,7 @@ struct triangular_solver_unroller<Lhs,Rhs,Mode,LoopIndex,Size,false> {
    DiagIndex  = IsLower ? LoopIndex : Size - LoopIndex - 1,
    StartIndex = IsLower ? 0         : DiagIndex+1
  };
-  static void run(const Lhs& lhs, Rhs& rhs)
+  static EIGEN_DEVICE_FUNC void run(const Lhs& lhs, Rhs& rhs)
  {
    if (LoopIndex>0)
      rhs.coeffRef(DiagIndex) -= lhs.row(DiagIndex).template segment<LoopIndex>(StartIndex).transpose()
@@ -133,18 +133,18 @@ struct triangular_solver_unroller<Lhs,Rhs,Mode,LoopIndex,Size,false> {
 template<typename Lhs, typename Rhs, int Mode, int LoopIndex, int Size>
 struct triangular_solver_unroller<Lhs,Rhs,Mode,LoopIndex,Size,true> {
-  static void run(const Lhs&, Rhs&) {}
+  static EIGEN_DEVICE_FUNC void run(const Lhs&, Rhs&) {}
 };
 template<typename Lhs, typename Rhs, int Mode>
 struct triangular_solver_selector<Lhs,Rhs,OnTheLeft,Mode,CompleteUnrolling,1> {
-  static void run(const Lhs& lhs, Rhs& rhs)
+  static EIGEN_DEVICE_FUNC void run(const Lhs& lhs, Rhs& rhs)
  { triangular_solver_unroller<Lhs,Rhs,Mode,0,Rhs::SizeAtCompileTime>::run(lhs,rhs); }
 };
 template<typename Lhs, typename Rhs, int Mode>
 struct triangular_solver_selector<Lhs,Rhs,OnTheRight,Mode,CompleteUnrolling,1> {
-  static void run(const Lhs& lhs, Rhs& rhs)
+  static EIGEN_DEVICE_FUNC void run(const Lhs& lhs, Rhs& rhs)
  {
    Transpose<const Lhs> trLhs(lhs);
    Transpose<Rhs> trRhs(rhs);
@@ -168,7 +168,7 @@ EIGEN_DEVICE_FUNC void TriangularViewImpl<MatrixType,Mode,Dense>::solveInPlace(c
 {
  OtherDerived& other = _other.const_cast_derived();
  eigen_assert( derived().cols() == derived().rows() && ((Side==OnTheLeft && derived().cols() == other.rows()) || (Side==OnTheRight && derived().cols() == other.cols())) );
-  eigen_assert((!(Mode & ZeroDiag)) && bool(Mode & (Upper|Lower)));
+  eigen_assert((!(int(Mode) & int(ZeroDiag))) && bool(int(Mode) & (int(Upper) | int(Lower))));
  // If solving for a 0x0 matrix, nothing to do, simply return.
  if (derived().cols() == 0)
    return;
@@ -213,8 +213,8 @@ template<int Side, typename TriangularType, typename Rhs> struct triangular_solv
    : m_triangularMatrix(tri), m_rhs(rhs)
  {}
-  inline Index rows() const { return m_rhs.rows(); }
+  inline EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_rhs.rows(); }
-  inline Index cols() const { return m_rhs.cols(); }
+  inline EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_rhs.cols(); }
  template<typename Dest> inline void evalTo(Dest& dst) const
  {
--- a/Eigen/src/Core/SolverBase.h
+++ b/Eigen/src/Core/SolverBase.h
@@ -110,7 +110,7 @@ class SolverBase : public EigenBase<Derived>
    }
    /** \internal the return type of transpose() */
-    typedef typename internal::add_const<Transpose<const Derived> >::type ConstTransposeReturnType;
+    typedef Transpose<const Derived> ConstTransposeReturnType;
    /** \returns an expression of the transposed of the factored matrix.
      *
      * A typical usage is to solve for the transposed problem A^T x = b:
@@ -118,15 +118,15 @@ class SolverBase : public EigenBase<Derived>
      *
      * \sa adjoint(), solve()
      */
-    inline ConstTransposeReturnType transpose() const
+    inline const ConstTransposeReturnType transpose() const
    {
      return ConstTransposeReturnType(derived());
    }
    /** \internal the return type of adjoint() */
    typedef typename internal::conditional<NumTraits<Scalar>::IsComplex,
-                        CwiseUnaryOp<internal::scalar_conjugate_op<Scalar>, ConstTransposeReturnType>,
+               CwiseUnaryOp<internal::scalar_conjugate_op<Scalar>, const ConstTransposeReturnType>,
-                        ConstTransposeReturnType
+               const ConstTransposeReturnType
            >::type AdjointReturnType;
    /** \returns an expression of the adjoint of the factored matrix
      *
@@ -137,7 +137,7 @@ class SolverBase : public EigenBase<Derived>
      *
      * \sa transpose(), solve()
      */
-    inline AdjointReturnType adjoint() const
+    inline const AdjointReturnType adjoint() const
    {
      return AdjointReturnType(derived().transpose());
    }
--- a/Eigen/src/Core/StableNorm.h
+++ b/Eigen/src/Core/StableNorm.h
@@ -123,13 +123,7 @@ blueNorm_impl(const EigenBase<Derived>& _vec)
  using std::pow;
  using std::sqrt;
  using std::abs;
-  const Derived& vec(_vec.derived());
+
  static bool initialized = false;
  static RealScalar b1, b2, s1m, s2m, rbig, relerr;
  if(!initialized)
  {
    int ibeta, it, iemin, iemax, iexp;
    RealScalar eps;
  // This program calculates the machine-dependent constants
  // b1, b2, s1m, s2m, relerr, overfl
  // from the "basic" machine-dependent numbers
@@ -138,26 +132,19 @@ blueNorm_impl(const EigenBase<Derived>& _vec)
  // For portability, the PORT subprograms "i1mach" and "r1mach"
  // are used. For any specific computer, each of the assignment
  // statements can be replaced
-    ibeta = std::numeric_limits<RealScalar>::radix;                 // base for floating-point numbers
+  static const int ibeta = std::numeric_limits<RealScalar>::radix;  // base for floating-point numbers
-    it    = NumTraits<RealScalar>::digits();                        // number of base-beta digits in mantissa
+  static const int it    = NumTraits<RealScalar>::digits();  // number of base-beta digits in mantissa
-    iemin = std::numeric_limits<RealScalar>::min_exponent;          // minimum exponent
+  static const int iemin = NumTraits<RealScalar>::min_exponent();  // minimum exponent
-    iemax = std::numeric_limits<RealScalar>::max_exponent;          // maximum exponent
+  static const int iemax = NumTraits<RealScalar>::max_exponent();  // maximum exponent
-    rbig  = (std::numeric_limits<RealScalar>::max)();               // largest floating-point number
+  static const RealScalar rbig   = NumTraits<RealScalar>::highest();  // largest floating-point number
  static const RealScalar b1     = RealScalar(pow(RealScalar(ibeta),RealScalar(-((1-iemin)/2))));  // lower boundary of midrange
  static const RealScalar b2     = RealScalar(pow(RealScalar(ibeta),RealScalar((iemax + 1 - it)/2)));  // upper boundary of midrange
  static const RealScalar s1m    = RealScalar(pow(RealScalar(ibeta),RealScalar((2-iemin)/2)));  // scaling factor for lower range
  static const RealScalar s2m    = RealScalar(pow(RealScalar(ibeta),RealScalar(- ((iemax+it)/2))));  // scaling factor for upper range
  static const RealScalar eps    = RealScalar(pow(double(ibeta), 1-it));
  static const RealScalar relerr = sqrt(eps);  // tolerance for neglecting asml
-    iexp  = -((1-iemin)/2);
+  const Derived& vec(_vec.derived());
    b1    = RealScalar(pow(RealScalar(ibeta),RealScalar(iexp)));    // lower boundary of midrange
    iexp  = (iemax + 1 - it)/2;
    b2    = RealScalar(pow(RealScalar(ibeta),RealScalar(iexp)));    // upper boundary of midrange
    iexp  = (2-iemin)/2;
    s1m   = RealScalar(pow(RealScalar(ibeta),RealScalar(iexp)));    // scaling factor for lower range
    iexp  = - ((iemax+it)/2);
    s2m   = RealScalar(pow(RealScalar(ibeta),RealScalar(iexp)));    // scaling factor for upper range
    eps     = RealScalar(pow(double(ibeta), 1-it));
    relerr  = sqrt(eps);                                            // tolerance for neglecting asml
    initialized = true;
  }
  Index n = vec.size();
  RealScalar ab2 = b2 / RealScalar(n);
  RealScalar asml = RealScalar(0);
@@ -166,9 +153,9 @@ blueNorm_impl(const EigenBase<Derived>& _vec)
  for(Index j=0; j<vec.outerSize(); ++j)
  {
-    for(typename Derived::InnerIterator it(vec, j); it; ++it)
+    for(typename Derived::InnerIterator iter(vec, j); iter; ++iter)
    {
-      RealScalar ax = abs(it.value());
+      RealScalar ax = abs(iter.value());
      if(ax > ab2)     abig += numext::abs2(ax*s2m);
      else if(ax < b1) asml += numext::abs2(ax*s1m);
      else             amed += numext::abs2(ax);
--- a/Eigen/src/Core/StlIterators.h
+++ b/Eigen/src/Core/StlIterators.h
@@ -7,6 +7,9 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 #ifndef EIGEN_STLITERATORS_H
 #define EIGEN_STLITERATORS_H
 namespace Eigen {
 namespace internal {
@@ -30,10 +33,10 @@ public:
  typedef Index difference_type;
  typedef std::random_access_iterator_tag iterator_category;
-  indexed_based_stl_iterator_base() : mp_xpr(0), m_index(0) {}
+  indexed_based_stl_iterator_base() EIGEN_NO_THROW : mp_xpr(0), m_index(0) {}
-  indexed_based_stl_iterator_base(XprType& xpr, Index index) : mp_xpr(&xpr), m_index(index) {}
+  indexed_based_stl_iterator_base(XprType& xpr, Index index) EIGEN_NO_THROW : mp_xpr(&xpr), m_index(index) {}
-  indexed_based_stl_iterator_base(const non_const_iterator& other)
+  indexed_based_stl_iterator_base(const non_const_iterator& other) EIGEN_NO_THROW
    : mp_xpr(other.mp_xpr), m_index(other.m_index)
  {}
@@ -93,6 +96,85 @@ protected:
  Index m_index;
 };
 template<typename  Derived>
 class indexed_based_stl_reverse_iterator_base
 {
 protected:
  typedef indexed_based_stl_iterator_traits<Derived> traits;
  typedef typename traits::XprType XprType;
  typedef indexed_based_stl_reverse_iterator_base<typename traits::non_const_iterator> non_const_iterator;
  typedef indexed_based_stl_reverse_iterator_base<typename traits::const_iterator> const_iterator;
  typedef typename internal::conditional<internal::is_const<XprType>::value,non_const_iterator,const_iterator>::type other_iterator;
  // NOTE: in C++03 we cannot declare friend classes through typedefs because we need to write friend class:
  friend class indexed_based_stl_reverse_iterator_base<typename traits::const_iterator>;
  friend class indexed_based_stl_reverse_iterator_base<typename traits::non_const_iterator>;
 public:
  typedef Index difference_type;
  typedef std::random_access_iterator_tag iterator_category;
  indexed_based_stl_reverse_iterator_base() : mp_xpr(0), m_index(0) {}
  indexed_based_stl_reverse_iterator_base(XprType& xpr, Index index) : mp_xpr(&xpr), m_index(index) {}
  indexed_based_stl_reverse_iterator_base(const non_const_iterator& other)
    : mp_xpr(other.mp_xpr), m_index(other.m_index)
  {}
  indexed_based_stl_reverse_iterator_base& operator=(const non_const_iterator& other)
  {
    mp_xpr = other.mp_xpr;
    m_index = other.m_index;
    return *this;
  }
  Derived& operator++() { --m_index; return derived(); }
  Derived& operator--() { ++m_index; return derived(); }
  Derived operator++(int) { Derived prev(derived()); operator++(); return prev;}
  Derived operator--(int) { Derived prev(derived()); operator--(); return prev;}
  friend Derived operator+(const indexed_based_stl_reverse_iterator_base& a, Index b) { Derived ret(a.derived()); ret += b; return ret; }
  friend Derived operator-(const indexed_based_stl_reverse_iterator_base& a, Index b) { Derived ret(a.derived()); ret -= b; return ret; }
  friend Derived operator+(Index a, const indexed_based_stl_reverse_iterator_base& b) { Derived ret(b.derived()); ret += a; return ret; }
  friend Derived operator-(Index a, const indexed_based_stl_reverse_iterator_base& b) { Derived ret(b.derived()); ret -= a; return ret; }
  Derived& operator+=(Index b) { m_index -= b; return derived(); }
  Derived& operator-=(Index b) { m_index += b; return derived(); }
  difference_type operator-(const indexed_based_stl_reverse_iterator_base& other) const
  {
    eigen_assert(mp_xpr == other.mp_xpr);
    return other.m_index - m_index;
  }
  difference_type operator-(const other_iterator& other) const
  {
    eigen_assert(mp_xpr == other.mp_xpr);
    return other.m_index - m_index;
  }
  bool operator==(const indexed_based_stl_reverse_iterator_base& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index == other.m_index; }
  bool operator!=(const indexed_based_stl_reverse_iterator_base& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index != other.m_index; }
  bool operator< (const indexed_based_stl_reverse_iterator_base& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index >  other.m_index; }
  bool operator<=(const indexed_based_stl_reverse_iterator_base& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index >= other.m_index; }
  bool operator> (const indexed_based_stl_reverse_iterator_base& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index <  other.m_index; }
  bool operator>=(const indexed_based_stl_reverse_iterator_base& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index <= other.m_index; }
  bool operator==(const other_iterator& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index == other.m_index; }
  bool operator!=(const other_iterator& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index != other.m_index; }
  bool operator< (const other_iterator& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index >  other.m_index; }
  bool operator<=(const other_iterator& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index >= other.m_index; }
  bool operator> (const other_iterator& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index <  other.m_index; }
  bool operator>=(const other_iterator& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index <= other.m_index; }
 protected:
  Derived& derived() { return static_cast<Derived&>(*this); }
  const Derived& derived() const { return static_cast<const Derived&>(*this); }
  XprType *mp_xpr;
  Index m_index;
 };
 template<typename XprType>
 class pointer_based_stl_iterator
 {
@@ -111,17 +193,17 @@ public:
  typedef typename internal::conditional<bool(is_lvalue), value_type&, const value_type&>::type reference;
-  pointer_based_stl_iterator() : m_ptr(0) {}
+  pointer_based_stl_iterator() EIGEN_NO_THROW : m_ptr(0) {}
-  pointer_based_stl_iterator(XprType& xpr, Index index) : m_incr(xpr.innerStride())
+  pointer_based_stl_iterator(XprType& xpr, Index index) EIGEN_NO_THROW : m_incr(xpr.innerStride())
  {
    m_ptr = xpr.data() + index * m_incr.value();
  }
-  pointer_based_stl_iterator(const non_const_iterator& other)
+  pointer_based_stl_iterator(const non_const_iterator& other) EIGEN_NO_THROW
    : m_ptr(other.m_ptr), m_incr(other.m_incr)
  {}
-  pointer_based_stl_iterator& operator=(const non_const_iterator& other)
+  pointer_based_stl_iterator& operator=(const non_const_iterator& other) EIGEN_NO_THROW
  {
    m_ptr = other.m_ptr;
    m_incr.setValue(other.m_incr);
@@ -267,6 +349,54 @@ public:
  pointer   operator->()        const { return (*mp_xpr).template subVector<Direction>(m_index); }
 };
 // Iterator traits for subvector_stl_reverse_iterator: exposes the traversed
 // expression type together with the const and non-const iterator flavours.
 template<typename XprType_, DirectionType Direction>
 struct indexed_based_stl_iterator_traits<subvector_stl_reverse_iterator<XprType_,Direction> >
 {
  typedef XprType_ XprType;
  typedef subvector_stl_reverse_iterator<typename internal::add_const<XprType>::type, Direction> const_iterator;
  typedef subvector_stl_reverse_iterator<typename internal::remove_const<XprType>::type, Direction> non_const_iterator;
 };
 // Reverse STL-style iterator over the sub-vectors of an expression: the column
 // expressions when Direction==Vertical, the row expressions otherwise.
 // Dereferencing returns a sub-vector expression object by value.
 template<typename XprType, DirectionType Direction>
 class subvector_stl_reverse_iterator : public indexed_based_stl_reverse_iterator_base<subvector_stl_reverse_iterator<XprType,Direction> >
 {
 protected:
  enum { is_lvalue  = internal::is_lvalue<XprType>::value };
  typedef indexed_based_stl_reverse_iterator_base<subvector_stl_reverse_iterator> Base;
  using Base::m_index;
  using Base::mp_xpr;
  typedef typename internal::conditional<Direction==Vertical,typename XprType::ColXpr,typename XprType::RowXpr>::type SubVectorType;
  typedef typename internal::conditional<Direction==Vertical,typename XprType::ConstColXpr,typename XprType::ConstRowXpr>::type ConstSubVectorType;
 public:
  // Writable sub-vector type for lvalue expressions, the const one otherwise.
  typedef typename internal::conditional<bool(is_lvalue), SubVectorType, ConstSubVectorType>::type reference;
  typedef typename reference::PlainObject value_type;
 private:
  // Holder returned by operator->(): it stores a copy of the sub-vector
  // expression so that a pointer to it can be handed out.
  class subvector_stl_reverse_iterator_ptr
  {
  public:
      // Intentionally implicit: operator->() below relies on this conversion.
      subvector_stl_reverse_iterator_ptr(const reference &subvector) : m_subvector(subvector) {}
      reference* operator->() { return &m_subvector; }
  private:
      reference m_subvector;
  };
 public:
  typedef subvector_stl_reverse_iterator_ptr pointer;
  subvector_stl_reverse_iterator() : Base() {}
  subvector_stl_reverse_iterator(XprType& xpr, Index index) : Base(xpr,index) {}
  // Sub-vector at the current position.
  reference operator*()         const { return (*mp_xpr).template subVector<Direction>(m_index); }
  // it[i] must denote the same sub-vector as *(it + i). The base class
  // implements "+= i" as "m_index -= i", so that sub-vector is at m_index - i
  // (the former m_index + i walked in the wrong direction).
  reference operator[](Index i) const { return (*mp_xpr).template subVector<Direction>(m_index-i); }
  // Member access on the sub-vector at the current position, through the holder above.
  pointer   operator->()        const { return (*mp_xpr).template subVector<Direction>(m_index); }
 };
 } // namespace internal
@@ -329,3 +459,5 @@ inline typename DenseBase<Derived>::const_iterator DenseBase<Derived>::cend() co
 }
 } // namespace Eigen
 #endif // EIGEN_STLITERATORS_H
--- a/Eigen/src/Core/Stride.h
+++ b/Eigen/src/Core/Stride.h
@@ -38,6 +38,14 @@ namespace Eigen {
  * \include Map_general_stride.cpp
  * Output: \verbinclude Map_general_stride.out
  *
  * Both strides can be negative. However, a negative stride of -1 cannot be specified at compile time
  * because of the ambiguity with Dynamic which is defined to -1 (historically, negative strides were
  * not allowed).
  *
  * Note that for compile-time vectors (ColsAtCompileTime==1 or RowsAtCompileTime==1),
  * the inner stride is the pointer increment between two consecutive elements,
  * regardless of storage layout.
  *
  * \sa class InnerStride, class OuterStride, \ref TopicStorageOrders
  */
 template<int _OuterStrideAtCompileTime, int _InnerStrideAtCompileTime>
@@ -55,6 +63,8 @@ class Stride
    Stride()
      : m_outer(OuterStrideAtCompileTime), m_inner(InnerStrideAtCompileTime)
    {
      // FIXME: for Eigen 4 we should use DynamicIndex instead of Dynamic.
      // FIXME: for Eigen 4 we should also unify this API with fix<>
      eigen_assert(InnerStrideAtCompileTime != Dynamic && OuterStrideAtCompileTime != Dynamic);
    }
@@ -63,7 +73,6 @@ class Stride
    Stride(Index outerStride, Index innerStride)
      : m_outer(outerStride), m_inner(innerStride)
    {
      eigen_assert(innerStride>=0 && outerStride>=0);
    }
    /** Copy constructor */
@@ -73,10 +82,10 @@ class Stride
    {}
    /** \returns the outer stride */
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
    inline Index outer() const { return m_outer.value(); }
    /** \returns the inner stride */
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
    inline Index inner() const { return m_inner.value(); }
  protected:
--- a/Eigen/src/Core/Transpose.h
+++ b/Eigen/src/Core/Transpose.h
@@ -65,10 +65,10 @@ template<typename MatrixType> class Transpose
    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Transpose)
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
-    Index rows() const { return m_matrix.cols(); }
+    Index rows() const EIGEN_NOEXCEPT { return m_matrix.cols(); }
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
-    Index cols() const { return m_matrix.rows(); }
+    Index cols() const EIGEN_NOEXCEPT { return m_matrix.rows(); }
    /** \returns the nested expression */
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
@@ -153,6 +153,8 @@ template<typename MatrixType> class TransposeImpl<MatrixType,Dense>
    {
      return derived().nestedExpression().coeffRef(index);
    }
  protected:
    EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(TransposeImpl)
 };
 /** \returns an expression of the transpose of *this.
@@ -176,7 +178,7 @@ template<typename MatrixType> class TransposeImpl<MatrixType,Dense>
  * \sa transposeInPlace(), adjoint() */
 template<typename Derived>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-Transpose<Derived>
+typename DenseBase<Derived>::TransposeReturnType
 DenseBase<Derived>::transpose()
 {
  return TransposeReturnType(derived());
@@ -189,7 +191,7 @@ DenseBase<Derived>::transpose()
  * \sa transposeInPlace(), adjoint() */
 template<typename Derived>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-typename DenseBase<Derived>::ConstTransposeReturnType
+const typename DenseBase<Derived>::ConstTransposeReturnType
 DenseBase<Derived>::transpose() const
 {
  return ConstTransposeReturnType(derived());
@@ -241,7 +243,6 @@ struct inplace_transpose_selector<MatrixType,true,false> { // square matrix
  }
 };
 // TODO: vectorized path is currently limited to LargestPacketSize x LargestPacketSize cases only.
 template<typename MatrixType>
 struct inplace_transpose_selector<MatrixType,true,true> { // PacketSize x PacketSize
  static void run(MatrixType& m) {
@@ -258,16 +259,66 @@ struct inplace_transpose_selector<MatrixType,true,true> { // PacketSize x Packet
  }
 };
 template <typename MatrixType, Index Alignment>
 void BlockedInPlaceTranspose(MatrixType& m) {
  typedef typename MatrixType::Scalar Scalar;
  typedef typename internal::packet_traits<typename MatrixType::Scalar>::type Packet;
  const Index PacketSize = internal::packet_traits<Scalar>::size;
  eigen_assert(m.rows() == m.cols());
  int row_start = 0;
  for (; row_start + PacketSize <= m.rows(); row_start += PacketSize) {
    for (int col_start = row_start; col_start + PacketSize <= m.cols(); col_start += PacketSize) {
      PacketBlock<Packet> A;
      if (row_start == col_start) {
        for (Index i=0; i<PacketSize; ++i)
          A.packet[i] = m.template packetByOuterInner<Alignment>(row_start + i,col_start);
        internal::ptranspose(A);
        for (Index i=0; i<PacketSize; ++i)
          m.template writePacket<Alignment>(m.rowIndexByOuterInner(row_start + i, col_start), m.colIndexByOuterInner(row_start + i,col_start), A.packet[i]);
      } else {
        PacketBlock<Packet> B;
        for (Index i=0; i<PacketSize; ++i) {
          A.packet[i] = m.template packetByOuterInner<Alignment>(row_start + i,col_start);
          B.packet[i] = m.template packetByOuterInner<Alignment>(col_start + i, row_start);
        }
        internal::ptranspose(A);
        internal::ptranspose(B);
        for (Index i=0; i<PacketSize; ++i) {
          m.template writePacket<Alignment>(m.rowIndexByOuterInner(row_start + i, col_start), m.colIndexByOuterInner(row_start + i,col_start), B.packet[i]);
          m.template writePacket<Alignment>(m.rowIndexByOuterInner(col_start + i, row_start), m.colIndexByOuterInner(col_start + i,row_start), A.packet[i]);
        }
      }
    }
  }
  for (Index row = row_start; row < m.rows(); ++row) {
    m.matrix().row(row).head(row).swap(
        m.matrix().col(row).head(row).transpose());
  }
 }
 template<typename MatrixType,bool MatchPacketSize>
-struct inplace_transpose_selector<MatrixType,false,MatchPacketSize> { // non square matrix
+struct inplace_transpose_selector<MatrixType,false,MatchPacketSize> { // non square or dynamic matrix
  static void run(MatrixType& m) {
-    if (m.rows()==m.cols())
+    typedef typename MatrixType::Scalar Scalar;
-      m.matrix().template triangularView<StrictlyUpper>().swap(m.matrix().transpose().template triangularView<StrictlyUpper>());
+    if (m.rows() == m.cols()) {
      const Index PacketSize = internal::packet_traits<Scalar>::size;
      if (!NumTraits<Scalar>::IsComplex && m.rows() >= PacketSize) {
        if ((m.rows() % PacketSize) == 0)
          BlockedInPlaceTranspose<MatrixType,internal::evaluator<MatrixType>::Alignment>(m);
        else
          BlockedInPlaceTranspose<MatrixType,Unaligned>(m);
      }
      else {
        m.matrix().template triangularView<StrictlyUpper>().swap(m.matrix().transpose().template triangularView<StrictlyUpper>());
      }
    } else {
      m = m.transpose().eval();
    }
  }
 };
 } // end namespace internal
 /** This is the "in place" version of transpose(): it replaces \c *this by its own transpose.
--- a/Eigen/src/Core/Transpositions.h
+++ b/Eigen/src/Core/Transpositions.h
@@ -23,7 +23,9 @@ class TranspositionsBase
    typedef typename IndicesType::Scalar StorageIndex;
    typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
    EIGEN_DEVICE_FUNC
    Derived& derived() { return *static_cast<Derived*>(this); }
    EIGEN_DEVICE_FUNC
    const Derived& derived() const { return *static_cast<const Derived*>(this); }
    /** Copies the \a other transpositions into \c *this */
@@ -35,13 +37,17 @@ class TranspositionsBase
    }
    /** \returns the number of transpositions */
    EIGEN_DEVICE_FUNC
    Index size() const { return indices().size(); }
    /** \returns the number of rows of the equivalent permutation matrix */
    EIGEN_DEVICE_FUNC
    Index rows() const { return indices().size(); }
    /** \returns the number of columns of the equivalent permutation matrix */
    EIGEN_DEVICE_FUNC
    Index cols() const { return indices().size(); }
    /** Direct access to the underlying index vector */
    EIGEN_DEVICE_FUNC
    inline const StorageIndex& coeff(Index i) const { return indices().coeff(i); }
    /** Direct access to the underlying index vector */
    inline StorageIndex& coeffRef(Index i) { return indices().coeffRef(i); }
@@ -55,8 +61,10 @@ class TranspositionsBase
    inline StorageIndex& operator[](Index i) { return indices()(i); }
    /** const version of indices(). */
    EIGEN_DEVICE_FUNC
    const IndicesType& indices() const { return derived().indices(); }
    /** \returns a reference to the stored array representing the transpositions. */
    EIGEN_DEVICE_FUNC
    IndicesType& indices() { return derived().indices(); }
    /** Resizes to given size. */
@@ -178,8 +186,10 @@ class Transpositions : public TranspositionsBase<Transpositions<SizeAtCompileTim
    {}
    /** const version of indices(). */
    EIGEN_DEVICE_FUNC
    const IndicesType& indices() const { return m_indices; }
    /** \returns a reference to the stored array representing the transpositions. */
    EIGEN_DEVICE_FUNC
    IndicesType& indices() { return m_indices; }
  protected:
@@ -237,9 +247,11 @@ class Map<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,_StorageIndex>,P
    #endif
    /** const version of indices(). */
    EIGEN_DEVICE_FUNC
    const IndicesType& indices() const { return m_indices; }
    /** \returns a reference to the stored array representing the transpositions. */
    EIGEN_DEVICE_FUNC
    IndicesType& indices() { return m_indices; }
  protected:
@@ -279,9 +291,11 @@ class TranspositionsWrapper
    }
    /** const version of indices(). */
    EIGEN_DEVICE_FUNC
    const IndicesType& indices() const { return m_indices; }
    /** \returns a reference to the stored array representing the transpositions. */
    EIGEN_DEVICE_FUNC
    IndicesType& indices() { return m_indices; }
  protected:
@@ -335,9 +349,12 @@ class Transpose<TranspositionsBase<TranspositionsDerived> >
    explicit Transpose(const TranspositionType& t) : m_transpositions(t) {}
-    Index size() const { return m_transpositions.size(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
-    Index rows() const { return m_transpositions.size(); }
+    Index size() const EIGEN_NOEXCEPT { return m_transpositions.size(); }
-    Index cols() const { return m_transpositions.size(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
    Index rows() const EIGEN_NOEXCEPT { return m_transpositions.size(); }
    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
    Index cols() const EIGEN_NOEXCEPT { return m_transpositions.size(); }
    /** \returns the \a matrix with the inverse transpositions applied to the columns.
      */
@@ -357,6 +374,7 @@ class Transpose<TranspositionsBase<TranspositionsDerived> >
      return Product<Transpose, OtherDerived, AliasFreeProduct>(*this, matrix.derived());
    }
    EIGEN_DEVICE_FUNC
    const TranspositionType& nestedExpression() const { return m_transpositions; }
  protected:
--- a/Eigen/src/Core/TriangularMatrix.h
+++ b/Eigen/src/Core/TriangularMatrix.h
@@ -53,16 +53,16 @@ template<typename Derived> class TriangularBase : public EigenBase<Derived>
    typedef Derived const& Nested;
    EIGEN_DEVICE_FUNC
-    inline TriangularBase() { eigen_assert(!((Mode&UnitDiag) && (Mode&ZeroDiag))); }
+    inline TriangularBase() { eigen_assert(!((int(Mode) & int(UnitDiag)) && (int(Mode) & int(ZeroDiag)))); }
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
-    inline Index rows() const { return derived().rows(); }
+    inline Index rows() const EIGEN_NOEXCEPT { return derived().rows(); }
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
-    inline Index cols() const { return derived().cols(); }
+    inline Index cols() const EIGEN_NOEXCEPT { return derived().cols(); }
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
-    inline Index outerStride() const { return derived().outerStride(); }
+    inline Index outerStride() const EIGEN_NOEXCEPT { return derived().outerStride(); }
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
-    inline Index innerStride() const { return derived().innerStride(); }
+    inline Index innerStride() const EIGEN_NOEXCEPT { return derived().innerStride(); }
    // dummy resize function
    EIGEN_DEVICE_FUNC
@@ -100,12 +100,10 @@ template<typename Derived> class TriangularBase : public EigenBase<Derived>
      return coeffRef(row,col);
    }
    #ifndef EIGEN_PARSED_BY_DOXYGEN
    EIGEN_DEVICE_FUNC
    inline const Derived& derived() const { return *static_cast<const Derived*>(this); }
    EIGEN_DEVICE_FUNC
    inline Derived& derived() { return *static_cast<Derived*>(this); }
    #endif // not EIGEN_PARSED_BY_DOXYGEN
    template<typename DenseDerived>
    EIGEN_DEVICE_FUNC
@@ -219,16 +217,14 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularView
    explicit inline TriangularView(MatrixType& matrix) : m_matrix(matrix)
    {}
-    using Base::operator=;
+    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(TriangularView)
    TriangularView& operator=(const TriangularView &other)
    { return Base::operator=(other); }
    /** \copydoc EigenBase::rows() */
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
-    inline Index rows() const { return m_matrix.rows(); }
+    inline Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows(); }
    /** \copydoc EigenBase::cols() */
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
-    inline Index cols() const { return m_matrix.cols(); }
+    inline Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); }
    /** \returns a const reference to the nested expression */
    EIGEN_DEVICE_FUNC
@@ -444,7 +440,6 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_Mat
    EIGEN_DEVICE_FUNC
    TriangularViewType& operator=(const MatrixBase<OtherDerived>& other);
 #ifndef EIGEN_PARSED_BY_DOXYGEN
    EIGEN_DEVICE_FUNC
    TriangularViewType& operator=(const TriangularViewImpl& other)
    { return *this = other.derived().nestedExpression(); }
@@ -458,7 +453,6 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_Mat
    /** \deprecated */
    EIGEN_DEPRECATED EIGEN_DEVICE_FUNC
    void lazyAssign(const MatrixBase<OtherDerived>& other);
 #endif
    /** Efficient triangular matrix times vector/matrix product */
    template<typename OtherDerived>
@@ -526,11 +520,7 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_Mat
    /** Swaps the coefficients of the common triangular parts of two matrices */
    template<typename OtherDerived>
    EIGEN_DEVICE_FUNC
 #ifdef EIGEN_PARSED_BY_DOXYGEN
    void swap(TriangularBase<OtherDerived> &other)
 #else
    void swap(TriangularBase<OtherDerived> const & other)
 #endif
    {
      EIGEN_STATIC_ASSERT_LVALUE(OtherDerived);
      call_assignment(derived(), other.const_cast_derived(), internal::swap_assign_op<Scalar>());
@@ -555,8 +545,13 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_Mat
    }
    template <typename ProductType>
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TriangularViewType& _assignProduct(const ProductType& prod, const Scalar& alpha,
-    EIGEN_STRONG_INLINE TriangularViewType& _assignProduct(const ProductType& prod, const Scalar& alpha, bool beta);
+                                                                           bool beta);
  protected:
    EIGEN_DEFAULT_COPY_CONSTRUCTOR(TriangularViewImpl)
    EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(TriangularViewImpl)
 };
 /***************************************************************************
@@ -817,7 +812,7 @@ void call_triangular_assignment_loop(DstXprType& dst, const SrcXprType& src, con
  enum {
      unroll = DstXprType::SizeAtCompileTime != Dynamic
            && SrcEvaluatorType::CoeffReadCost < HugeCost
-            && DstXprType::SizeAtCompileTime * (DstEvaluatorType::CoeffReadCost+SrcEvaluatorType::CoeffReadCost) / 2 <= EIGEN_UNROLLING_LIMIT
+            && DstXprType::SizeAtCompileTime * (int(DstEvaluatorType::CoeffReadCost) + int(SrcEvaluatorType::CoeffReadCost)) / 2 <= EIGEN_UNROLLING_LIMIT
    };
  triangular_assignment_loop<Kernel, Mode, unroll ? int(DstXprType::SizeAtCompileTime) : Dynamic, SetOpposite>::run(kernel);
@@ -851,7 +846,7 @@ struct Assignment<DstXprType, SrcXprType, Functor, Triangular2Dense>
 {
  EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const Functor &func)
  {
-    call_triangular_assignment_loop<SrcXprType::Mode, (SrcXprType::Mode&SelfAdjoint)==0>(dst, src, func);  
+    call_triangular_assignment_loop<SrcXprType::Mode, (int(SrcXprType::Mode) & int(SelfAdjoint)) == 0>(dst, src, func);
  }
 };
@@ -949,7 +944,7 @@ template<typename DenseDerived>
 EIGEN_DEVICE_FUNC void TriangularBase<Derived>::evalToLazy(MatrixBase<DenseDerived> &other) const
 {
  other.derived().resize(this->rows(), this->cols());
-  internal::call_triangular_assignment_loop<Derived::Mode,(Derived::Mode&SelfAdjoint)==0 /* SetOpposite */>(other.derived(), derived().nestedExpression());
+  internal::call_triangular_assignment_loop<Derived::Mode, (int(Derived::Mode) & int(SelfAdjoint)) == 0 /* SetOpposite */>(other.derived(), derived().nestedExpression());
 }
 namespace internal {
@@ -966,7 +961,7 @@ struct Assignment<DstXprType, Product<Lhs,Rhs,DefaultProduct>, internal::assign_
    if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
      dst.resize(dstRows, dstCols);
-    dst._assignProduct(src, 1, 0);
+    dst._assignProduct(src, Scalar(1), false);
  }
 };
@@ -977,7 +972,7 @@ struct Assignment<DstXprType, Product<Lhs,Rhs,DefaultProduct>, internal::add_ass
  typedef Product<Lhs,Rhs,DefaultProduct> SrcXprType;
  static void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<Scalar,typename SrcXprType::Scalar> &)
  {
-    dst._assignProduct(src, 1, 1);
+    dst._assignProduct(src, Scalar(1), true);
  }
 };
@@ -988,7 +983,7 @@ struct Assignment<DstXprType, Product<Lhs,Rhs,DefaultProduct>, internal::sub_ass
  typedef Product<Lhs,Rhs,DefaultProduct> SrcXprType;
  static void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<Scalar,typename SrcXprType::Scalar> &)
  {
-    dst._assignProduct(src, -1, 1);
+    dst._assignProduct(src, Scalar(-1), true);
  }
 };
--- a/Eigen/src/Core/VectorwiseOp.h
+++ b/Eigen/src/Core/VectorwiseOp.h
@@ -65,10 +65,10 @@ class PartialReduxExpr : public internal::dense_xpr_base< PartialReduxExpr<Matri
    explicit PartialReduxExpr(const MatrixType& mat, const MemberOp& func = MemberOp())
      : m_matrix(mat), m_functor(func) {}
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
-    Index rows() const { return (Direction==Vertical   ? 1 : m_matrix.rows()); }
+    Index rows() const EIGEN_NOEXCEPT { return (Direction==Vertical   ? 1 : m_matrix.rows()); }
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
-    Index cols() const { return (Direction==Horizontal ? 1 : m_matrix.cols()); }
+    Index cols() const EIGEN_NOEXCEPT { return (Direction==Horizontal ? 1 : m_matrix.cols()); }
    EIGEN_DEVICE_FUNC
    typename MatrixType::Nested nestedExpression() const { return m_matrix; }
@@ -281,6 +281,8 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
    #else
    typedef internal::subvector_stl_iterator<ExpressionType,               DirectionType(Direction)> iterator;
    typedef internal::subvector_stl_iterator<const ExpressionType,         DirectionType(Direction)> const_iterator;
    typedef internal::subvector_stl_reverse_iterator<ExpressionType,       DirectionType(Direction)> reverse_iterator;
    typedef internal::subvector_stl_reverse_iterator<const ExpressionType, DirectionType(Direction)> const_reverse_iterator;
    #endif
    /** returns an iterator to the first row (rowwise) or column (colwise) of the nested expression.
@@ -292,6 +294,15 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
    /** const version of begin() */
    const_iterator          cbegin() const { return const_iterator(m_matrix, 0); }
    /** returns a reverse iterator to the last row (rowwise) or column (colwise) of the nested expression.
      * \sa rend(), crbegin()
      */
    reverse_iterator        rbegin()       { return reverse_iterator       (m_matrix, m_matrix.template subVectors<DirectionType(Direction)>()-1); }
 	/** const version of rbegin() */
    const_reverse_iterator  rbegin() const { return const_reverse_iterator (m_matrix, m_matrix.template subVectors<DirectionType(Direction)>()-1); }
 	/** const version of rbegin() */
 	const_reverse_iterator crbegin() const { return const_reverse_iterator (m_matrix, m_matrix.template subVectors<DirectionType(Direction)>()-1); }
    /** returns an iterator to the row (resp. column) following the last row (resp. column) of the nested expression
      * \sa begin(), cend()
      */
@@ -301,6 +312,15 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
    /** const version of end() */
    const_iterator          cend()  const  { return const_iterator(m_matrix, m_matrix.template subVectors<DirectionType(Direction)>()); }
    /** returns a reverse iterator to the row (resp. column) before the first row (resp. column) of the nested expression
      * \sa begin(), cend()
      */
    reverse_iterator        rend()         { return reverse_iterator       (m_matrix, -1); }
    /** const version of rend() */
    const_reverse_iterator  rend()  const  { return const_reverse_iterator (m_matrix, -1); }
    /** const version of rend() */
    const_reverse_iterator crend()  const  { return const_reverse_iterator (m_matrix, -1); }
    /** \returns a row or column vector expression of \c *this reduxed by \a func
      *
      * The template parameter \a BinaryOp is the type of the functor
@@ -719,6 +739,10 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
    EIGEN_DEVICE_FUNC
    const HNormalizedReturnType hnormalized() const;
 #   ifdef EIGEN_VECTORWISEOP_PLUGIN
 #     include EIGEN_VECTORWISEOP_PLUGIN
 #   endif
  protected:
    Index redux_length() const
    {
--- a/Eigen/src/Core/Visitor.h
+++ b/Eigen/src/Core/Visitor.h
@@ -79,9 +79,9 @@ public:
    CoeffReadCost = internal::evaluator<XprType>::CoeffReadCost
  };
-  EIGEN_DEVICE_FUNC Index rows() const { return m_xpr.rows(); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_xpr.rows(); }
-  EIGEN_DEVICE_FUNC Index cols() const { return m_xpr.cols(); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_xpr.cols(); }
-  EIGEN_DEVICE_FUNC Index size() const { return m_xpr.size(); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index size() const EIGEN_NOEXCEPT { return m_xpr.size(); }
  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
  { return m_evaluator.coeff(row, col); }
@@ -124,7 +124,7 @@ void DenseBase<Derived>::visit(Visitor& visitor) const
  enum {
    unroll =  SizeAtCompileTime != Dynamic
-           && SizeAtCompileTime * ThisEvaluator::CoeffReadCost + (SizeAtCompileTime-1) * internal::functor_traits<Visitor>::Cost <= EIGEN_UNROLLING_LIMIT
+           && SizeAtCompileTime * int(ThisEvaluator::CoeffReadCost) + (SizeAtCompileTime-1) * int(internal::functor_traits<Visitor>::Cost) <= EIGEN_UNROLLING_LIMIT
  };
  return internal::visitor_impl<Visitor, ThisEvaluator, unroll ? int(SizeAtCompileTime) : Dynamic>::run(thisEval, visitor);
 }
@@ -157,7 +157,7 @@ struct coeff_visitor
  *
  * \sa DenseBase::minCoeff(Index*, Index*)
  */
-template <typename Derived>
+template <typename Derived, int NaNPropagation>
 struct min_coeff_visitor : coeff_visitor<Derived>
 {
  typedef typename Derived::Scalar Scalar;
@@ -173,8 +173,40 @@ struct min_coeff_visitor : coeff_visitor<Derived>
  }
 };
-template<typename Scalar>
+template <typename Derived>
-struct functor_traits<min_coeff_visitor<Scalar> > {
+struct min_coeff_visitor<Derived, PropagateNumbers> : coeff_visitor<Derived>
 {
  typedef typename Derived::Scalar Scalar;
  EIGEN_DEVICE_FUNC
  void operator() (const Scalar& value, Index i, Index j)
  {
    if((numext::isnan)(this->res) || (!(numext::isnan)(value) && value < this->res))
    {
      this->res = value;
      this->row = i;
      this->col = j;
    }
  }
 };
 template <typename Derived>
 struct min_coeff_visitor<Derived, PropagateNaN> : coeff_visitor<Derived>
 {
  typedef typename Derived::Scalar Scalar;
  EIGEN_DEVICE_FUNC
  void operator() (const Scalar& value, Index i, Index j)
  {
    if((numext::isnan)(value) || value < this->res)
    {
      this->res = value;
      this->row = i;
      this->col = j;
    }
  }
 };
 template<typename Scalar, int NaNPropagation>
    struct functor_traits<min_coeff_visitor<Scalar, NaNPropagation> > {
  enum {
    Cost = NumTraits<Scalar>::AddCost
  };
@@ -185,7 +217,7 @@ struct functor_traits<min_coeff_visitor<Scalar> > {
  *
  * \sa DenseBase::maxCoeff(Index*, Index*)
  */
-template <typename Derived>
+template <typename Derived, int NaNPropagation>
 struct max_coeff_visitor : coeff_visitor<Derived>
 {
  typedef typename Derived::Scalar Scalar;
@@ -201,8 +233,40 @@ struct max_coeff_visitor : coeff_visitor<Derived>
  }
 };
-template<typename Scalar>
+template <typename Derived>
-struct functor_traits<max_coeff_visitor<Scalar> > {
+struct max_coeff_visitor<Derived, PropagateNumbers> : coeff_visitor<Derived>
 {
  typedef typename Derived::Scalar Scalar;
  EIGEN_DEVICE_FUNC
  void operator() (const Scalar& value, Index i, Index j)
  {
    if((numext::isnan)(this->res) || (!(numext::isnan)(value) && value > this->res))
    {
      this->res = value;
      this->row = i;
      this->col = j;
    }
  }
 };
 template <typename Derived>
 struct max_coeff_visitor<Derived, PropagateNaN> : coeff_visitor<Derived>
 {
  typedef typename Derived::Scalar Scalar;
  EIGEN_DEVICE_FUNC
  void operator() (const Scalar& value, Index i, Index j)
  {
    if((numext::isnan)(value) || value > this->res)
    {
      this->res = value;
      this->row = i;
      this->col = j;
    }
  }
 };
 template<typename Scalar, int NaNPropagation>
 struct functor_traits<max_coeff_visitor<Scalar, NaNPropagation> > {
  enum {
    Cost = NumTraits<Scalar>::AddCost
  };
@@ -213,21 +277,23 @@ struct functor_traits<max_coeff_visitor<Scalar> > {
 /** \fn DenseBase<Derived>::minCoeff(IndexType* rowId, IndexType* colId) const
  * \returns the minimum of all coefficients of *this and puts in *row and *col its location.
  *
  * In case \c *this contains NaN, NaNPropagation determines the behavior:
  *   NaNPropagation == PropagateFast : undefined
  *   NaNPropagation == PropagateNaN : result is NaN
  *   NaNPropagation == PropagateNumbers : result is minimum of elements that are not NaN
  * \warning the matrix must be not empty, otherwise an assertion is triggered.
  *
  * \warning the result is undefined if \c *this contains NaN.
  *
  * \sa DenseBase::minCoeff(Index*), DenseBase::maxCoeff(Index*,Index*), DenseBase::visit(), DenseBase::minCoeff()
  */
 template<typename Derived>
-template<typename IndexType>
+template<int NaNPropagation, typename IndexType>
 EIGEN_DEVICE_FUNC
 typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::minCoeff(IndexType* rowId, IndexType* colId) const
 {
  eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix");
-  internal::min_coeff_visitor<Derived> minVisitor;
+  internal::min_coeff_visitor<Derived, NaNPropagation> minVisitor;
  this->visit(minVisitor);
  *rowId = minVisitor.row;
  if (colId) *colId = minVisitor.col;
@@ -236,14 +302,16 @@ DenseBase<Derived>::minCoeff(IndexType* rowId, IndexType* colId) const
 /** \returns the minimum of all coefficients of *this and puts in *index its location.
  *
  * In case \c *this contains NaN, NaNPropagation determines the behavior:
  *   NaNPropagation == PropagateFast : undefined
  *   NaNPropagation == PropagateNaN : result is NaN
  *   NaNPropagation == PropagateNumbers : result is minimum of elements that are not NaN
  * \warning the matrix must be not empty, otherwise an assertion is triggered.
  *
  * \warning the result is undefined if \c *this contains NaN. 
  *
  * \sa DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::maxCoeff(IndexType*,IndexType*), DenseBase::visit(), DenseBase::minCoeff()
  */
 template<typename Derived>
-template<typename IndexType>
+template<int NaNPropagation, typename IndexType>
 EIGEN_DEVICE_FUNC
 typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::minCoeff(IndexType* index) const
@@ -251,7 +319,7 @@ DenseBase<Derived>::minCoeff(IndexType* index) const
  eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix");
  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  internal::min_coeff_visitor<Derived> minVisitor;
+      internal::min_coeff_visitor<Derived, NaNPropagation> minVisitor;
  this->visit(minVisitor);
  *index = IndexType((RowsAtCompileTime==1) ? minVisitor.col : minVisitor.row);
  return minVisitor.res;
@@ -260,21 +328,23 @@ DenseBase<Derived>::minCoeff(IndexType* index) const
 /** \fn DenseBase<Derived>::maxCoeff(IndexType* rowId, IndexType* colId) const
  * \returns the maximum of all coefficients of *this and puts in *row and *col its location.
  *
  * In case \c *this contains NaN, NaNPropagation determines the behavior:
  *   NaNPropagation == PropagateFast : undefined
  *   NaNPropagation == PropagateNaN : result is NaN
  *   NaNPropagation == PropagateNumbers : result is maximum of elements that are not NaN
  * \warning the matrix must be not empty, otherwise an assertion is triggered.
  *
  * \warning the result is undefined if \c *this contains NaN. 
  *
  * \sa DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::visit(), DenseBase::maxCoeff()
  */
 template<typename Derived>
-template<typename IndexType>
+template<int NaNPropagation, typename IndexType>
 EIGEN_DEVICE_FUNC
 typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::maxCoeff(IndexType* rowPtr, IndexType* colPtr) const
 {
  eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix");
-  internal::max_coeff_visitor<Derived> maxVisitor;
+  internal::max_coeff_visitor<Derived, NaNPropagation> maxVisitor;
  this->visit(maxVisitor);
  *rowPtr = maxVisitor.row;
  if (colPtr) *colPtr = maxVisitor.col;
@@ -283,14 +353,16 @@ DenseBase<Derived>::maxCoeff(IndexType* rowPtr, IndexType* colPtr) const
 /** \returns the maximum of all coefficients of *this and puts in *index its location.
  *
  * In case \c *this contains NaN, NaNPropagation determines the behavior:
  *   NaNPropagation == PropagateFast : undefined
  *   NaNPropagation == PropagateNaN : result is NaN
  *   NaNPropagation == PropagateNumbers : result is maximum of elements that are not NaN
  * \warning the matrix must be not empty, otherwise an assertion is triggered.
  *
  * \warning the result is undefined if \c *this contains NaN.
  *
  * \sa DenseBase::maxCoeff(IndexType*,IndexType*), DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::visit(), DenseBase::maxCoeff()
  */
 template<typename Derived>
-template<typename IndexType>
+template<int NaNPropagation, typename IndexType>
 EIGEN_DEVICE_FUNC
 typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::maxCoeff(IndexType* index) const
@@ -298,7 +370,7 @@ DenseBase<Derived>::maxCoeff(IndexType* index) const
  eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix");
  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  internal::max_coeff_visitor<Derived> maxVisitor;
+      internal::max_coeff_visitor<Derived, NaNPropagation> maxVisitor;
  this->visit(maxVisitor);
  *index = (RowsAtCompileTime==1) ? maxVisitor.col : maxVisitor.row;
  return maxVisitor.res;
--- a/Eigen/src/Core/arch/AVX/Complex.h
+++ b/Eigen/src/Core/arch/AVX/Complex.h
@@ -38,6 +38,7 @@ template<> struct packet_traits<std::complex<float> >  : default_packet_traits
    HasMul    = 1,
    HasDiv    = 1,
    HasNegate = 1,
    HasSqrt   = 1,
    HasAbs    = 0,
    HasAbs2   = 0,
    HasMin    = 0,
@@ -47,7 +48,18 @@ template<> struct packet_traits<std::complex<float> >  : default_packet_traits
 };
 #endif
-template<> struct unpacket_traits<Packet4cf> { typedef std::complex<float> type; enum {size=4, alignment=Aligned32, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet2cf half; };
+template<> struct unpacket_traits<Packet4cf> {
  typedef std::complex<float> type;
  typedef Packet2cf half;
  typedef Packet8f as_real;
  enum {
    size=4,
    alignment=Aligned32,
    vectorizable=true,
    masked_load_available=false,
    masked_store_available=false
  };
 };
 template<> EIGEN_STRONG_INLINE Packet4cf padd<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_add_ps(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet4cf psub<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_sub_ps(a.v,b.v)); }
@@ -76,7 +88,6 @@ EIGEN_STRONG_INLINE Packet4cf pcmp_eq(const Packet4cf& a, const Packet4cf& b) {
 }
 template<> EIGEN_STRONG_INLINE Packet4cf ptrue<Packet4cf>(const Packet4cf& a) { return Packet4cf(ptrue(Packet8f(a.v))); }
 template<> EIGEN_STRONG_INLINE Packet4cf pnot<Packet4cf>(const Packet4cf& a) { return Packet4cf(pnot(Packet8f(a.v))); }
 template<> EIGEN_STRONG_INLINE Packet4cf pand   <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_and_ps(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet4cf por    <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_or_ps(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet4cf pxor   <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_xor_ps(a.v,b.v)); }
@@ -88,7 +99,9 @@ template<> EIGEN_STRONG_INLINE Packet4cf ploadu<Packet4cf>(const std::complex<fl
 template<> EIGEN_STRONG_INLINE Packet4cf pset1<Packet4cf>(const std::complex<float>& from)
 {
-  return Packet4cf(_mm256_castpd_ps(_mm256_broadcast_sd((const double*)(const void*)&from)));
+  const float re = std::real(from);
  const float im = std::imag(from);
  return Packet4cf(_mm256_set_ps(im, re, im, re, im, re, im, re));
 }
 template<> EIGEN_STRONG_INLINE Packet4cf ploaddup<Packet4cf>(const std::complex<float>* from)
@@ -150,79 +163,18 @@ template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet4cf>(const Packe
                     Packet2cf(_mm256_extractf128_ps(a.v,1))));
 }
 template<> EIGEN_STRONG_INLINE Packet4cf preduxp<Packet4cf>(const Packet4cf* vecs)
 {
  Packet8f t0 = _mm256_shuffle_ps(vecs[0].v, vecs[0].v, _MM_SHUFFLE(3, 1, 2 ,0));
  Packet8f t1 = _mm256_shuffle_ps(vecs[1].v, vecs[1].v, _MM_SHUFFLE(3, 1, 2 ,0));
  t0 = _mm256_hadd_ps(t0,t1);
  Packet8f t2 = _mm256_shuffle_ps(vecs[2].v, vecs[2].v, _MM_SHUFFLE(3, 1, 2 ,0));
  Packet8f t3 = _mm256_shuffle_ps(vecs[3].v, vecs[3].v, _MM_SHUFFLE(3, 1, 2 ,0));
  t2 = _mm256_hadd_ps(t2,t3);
  t1 = _mm256_permute2f128_ps(t0,t2, 0 + (2<<4));
  t3 = _mm256_permute2f128_ps(t0,t2, 1 + (3<<4));
  return Packet4cf(_mm256_add_ps(t1,t3));
 }
 template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet4cf>(const Packet4cf& a)
 {
  return predux_mul(pmul(Packet2cf(_mm256_extractf128_ps(a.v, 0)),
                         Packet2cf(_mm256_extractf128_ps(a.v, 1))));
 }
 template<int Offset>
 struct palign_impl<Offset,Packet4cf>
 {
  static EIGEN_STRONG_INLINE void run(Packet4cf& first, const Packet4cf& second)
  {
    if (Offset==0) return;
    palign_impl<Offset*2,Packet8f>::run(first.v, second.v);
  }
 };
 template<> struct conj_helper<Packet4cf, Packet4cf, false,true>
 {
  EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const
  { return padd(pmul(x,y),c); }
  EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const
  {
    return internal::pmul(a, pconj(b));
  }
 };
 template<> struct conj_helper<Packet4cf, Packet4cf, true,false>
 {
  EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const
  { return padd(pmul(x,y),c); }
  EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const
  {
    return internal::pmul(pconj(a), b);
  }
 };
 template<> struct conj_helper<Packet4cf, Packet4cf, true,true>
 {
  EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const
  { return padd(pmul(x,y),c); }
  EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const
  {
    return pconj(internal::pmul(a, b));
  }
 };
 EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet4cf,Packet8f)
 template<> EIGEN_STRONG_INLINE Packet4cf pdiv<Packet4cf>(const Packet4cf& a, const Packet4cf& b)
 {
-  Packet4cf num = pmul(a, pconj(b));
+  return pdiv_complex(a, b);
  __m256 tmp = _mm256_mul_ps(b.v, b.v);
  __m256 tmp2    = _mm256_shuffle_ps(tmp,tmp,0xB1);
  __m256 denom = _mm256_add_ps(tmp, tmp2);
  return Packet4cf(_mm256_div_ps(num.v, denom));
 }
 template<> EIGEN_STRONG_INLINE Packet4cf pcplxflip<Packet4cf>(const Packet4cf& x)
@@ -254,6 +206,7 @@ template<> struct packet_traits<std::complex<double> >  : default_packet_traits
    HasMul    = 1,
    HasDiv    = 1,
    HasNegate = 1,
    HasSqrt   = 1,
    HasAbs    = 0,
    HasAbs2   = 0,
    HasMin    = 0,
@@ -263,7 +216,18 @@ template<> struct packet_traits<std::complex<double> >  : default_packet_traits
 };
 #endif
-template<> struct unpacket_traits<Packet2cd> { typedef std::complex<double> type; enum {size=2, alignment=Aligned32, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet1cd half; };
+template<> struct unpacket_traits<Packet2cd> {
  typedef std::complex<double> type;
  typedef Packet1cd half;
  typedef Packet4d as_real;
  enum {
    size=2,
    alignment=Aligned32,
    vectorizable=true,
    masked_load_available=false,
    masked_store_available=false
  };
 };
 template<> EIGEN_STRONG_INLINE Packet2cd padd<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_add_pd(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet2cd psub<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_sub_pd(a.v,b.v)); }
@@ -291,7 +255,6 @@ EIGEN_STRONG_INLINE Packet2cd pcmp_eq(const Packet2cd& a, const Packet2cd& b) {
 }
 template<> EIGEN_STRONG_INLINE Packet2cd ptrue<Packet2cd>(const Packet2cd& a) { return Packet2cd(ptrue(Packet4d(a.v))); }
 template<> EIGEN_STRONG_INLINE Packet2cd pnot<Packet2cd>(const Packet2cd& a) { return Packet2cd(pnot(Packet4d(a.v))); }
 template<> EIGEN_STRONG_INLINE Packet2cd pand   <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_and_pd(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet2cd por    <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_or_pd(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet2cd pxor   <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_xor_pd(a.v,b.v)); }
@@ -347,71 +310,17 @@ template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet2cd>(const Pack
                     Packet1cd(_mm256_extractf128_pd(a.v,1))));
 }
 template<> EIGEN_STRONG_INLINE Packet2cd preduxp<Packet2cd>(const Packet2cd* vecs)
 {
  Packet4d t0 = _mm256_permute2f128_pd(vecs[0].v,vecs[1].v, 0 + (2<<4));
  Packet4d t1 = _mm256_permute2f128_pd(vecs[0].v,vecs[1].v, 1 + (3<<4));
  return Packet2cd(_mm256_add_pd(t0,t1));
 }
 template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet2cd>(const Packet2cd& a)
 {
  return predux(pmul(Packet1cd(_mm256_extractf128_pd(a.v,0)),
                     Packet1cd(_mm256_extractf128_pd(a.v,1))));
 }
 template<int Offset>
 struct palign_impl<Offset,Packet2cd>
 {
  static EIGEN_STRONG_INLINE void run(Packet2cd& first, const Packet2cd& second)
  {
    if (Offset==0) return;
    palign_impl<Offset*2,Packet4d>::run(first.v, second.v);
  }
 };
 template<> struct conj_helper<Packet2cd, Packet2cd, false,true>
 {
  EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const
  { return padd(pmul(x,y),c); }
  EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const
  {
    return internal::pmul(a, pconj(b));
  }
 };
 template<> struct conj_helper<Packet2cd, Packet2cd, true,false>
 {
  EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const
  { return padd(pmul(x,y),c); }
  EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const
  {
    return internal::pmul(pconj(a), b);
  }
 };
 template<> struct conj_helper<Packet2cd, Packet2cd, true,true>
 {
  EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const
  { return padd(pmul(x,y),c); }
  EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const
  {
    return pconj(internal::pmul(a, b));
  }
 };
 EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cd,Packet4d)
 template<> EIGEN_STRONG_INLINE Packet2cd pdiv<Packet2cd>(const Packet2cd& a, const Packet2cd& b)
 {
-  Packet2cd num = pmul(a, pconj(b));
+  return pdiv_complex(a, b);
  __m256d tmp = _mm256_mul_pd(b.v, b.v);
  __m256d denom = _mm256_hadd_pd(tmp, tmp);
  return Packet2cd(_mm256_div_pd(num.v, denom));
 }
 template<> EIGEN_STRONG_INLINE Packet2cd pcplxflip<Packet2cd>(const Packet2cd& x)
@@ -444,24 +353,12 @@ ptranspose(PacketBlock<Packet2cd,2>& kernel) {
 kernel.packet[0].v = tmp;
 }
-template<> EIGEN_STRONG_INLINE Packet4cf pinsertfirst(const Packet4cf& a, std::complex<float> b)
+template<> EIGEN_STRONG_INLINE Packet2cd psqrt<Packet2cd>(const Packet2cd& a) {
-{
+  return psqrt_complex<Packet2cd>(a);
  return Packet4cf(_mm256_blend_ps(a.v,pset1<Packet4cf>(b).v,1|2));
 }
-template<> EIGEN_STRONG_INLINE Packet2cd pinsertfirst(const Packet2cd& a, std::complex<double> b)
+template<> EIGEN_STRONG_INLINE Packet4cf psqrt<Packet4cf>(const Packet4cf& a) {
-{
+  return psqrt_complex<Packet4cf>(a);
  return Packet2cd(_mm256_blend_pd(a.v,pset1<Packet2cd>(b).v,1|2));
 }
 template<> EIGEN_STRONG_INLINE Packet4cf pinsertlast(const Packet4cf& a, std::complex<float> b)
 {
  return Packet4cf(_mm256_blend_ps(a.v,pset1<Packet4cf>(b).v,(1<<7)|(1<<6)));
 }
 template<> EIGEN_STRONG_INLINE Packet2cd pinsertlast(const Packet2cd& a, std::complex<double> b)
 {
  return Packet2cd(_mm256_blend_pd(a.v,pset1<Packet2cd>(b).v,(1<<3)|(1<<2)));
 }
 } // end namespace internal
--- a/Eigen/src/Core/arch/AVX/MathFunctions.h
+++ b/Eigen/src/Core/arch/AVX/MathFunctions.h
@@ -36,6 +36,24 @@ plog<Packet8f>(const Packet8f& _x) {
  return plog_float(_x);
 }
 template <>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4d
 plog<Packet4d>(const Packet4d& _x) {
  return plog_double(_x);
 }
 template <>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
 plog2<Packet8f>(const Packet8f& _x) {
  return plog2_float(_x);
 }
 template <>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4d
 plog2<Packet4d>(const Packet4d& _x) {
  return plog2_double(_x);
 }
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet8f plog1p<Packet8f>(const Packet8f& _x) {
  return generic_plog1p(_x);
@@ -58,15 +76,15 @@ pexp<Packet8f>(const Packet8f& _x) {
 // Hyperbolic Tangent function.
 template <>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
-ptanh<Packet8f>(const Packet8f& x) {
+ptanh<Packet8f>(const Packet8f& _x) {
-  return internal::generic_fast_tanh_float(x);
+  return internal::generic_fast_tanh_float(_x);
 }
 // Exponential function for doubles.
 template <>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4d
-pexp<Packet4d>(const Packet4d& x) {
+pexp<Packet4d>(const Packet4d& _x) {
-  return pexp_double(x);
+  return pexp_double(_x);
 }
 // Functions for sqrt.
@@ -79,33 +97,36 @@ pexp<Packet4d>(const Packet4d& x) {
 // For detail see here: http://www.beyond3d.com/content/articles/8/
 #if EIGEN_FAST_MATH
 template <>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
-psqrt<Packet8f>(const Packet8f& _x) {
+Packet8f psqrt<Packet8f>(const Packet8f& _x) {
-  Packet8f half = pmul(_x, pset1<Packet8f>(.5f));
+  Packet8f minus_half_x = pmul(_x, pset1<Packet8f>(-0.5f));
-  Packet8f denormal_mask = _mm256_and_ps(
+  Packet8f denormal_mask = pandnot(
-      _mm256_cmp_ps(_x, pset1<Packet8f>((std::numeric_limits<float>::min)()),
+      pcmp_lt(_x, pset1<Packet8f>((std::numeric_limits<float>::min)())),
-                    _CMP_LT_OQ),
+      pcmp_lt(_x, pzero(_x)));
      _mm256_cmp_ps(_x, _mm256_setzero_ps(), _CMP_GE_OQ));
  // Compute approximate reciprocal sqrt.
  Packet8f x = _mm256_rsqrt_ps(_x);
  // Do a single step of Newton's iteration.
-  x = pmul(x, psub(pset1<Packet8f>(1.5f), pmul(half, pmul(x,x))));
+  x = pmul(x, pmadd(minus_half_x, pmul(x,x), pset1<Packet8f>(1.5f)));
  // Flush results for denormals to zero.
-  return _mm256_andnot_ps(denormal_mask, pmul(_x,x));
+  return pandnot(pmul(_x,x), denormal_mask);
 }
 #else
 template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet8f psqrt<Packet8f>(const Packet8f& x) {
  return _mm256_sqrt_ps(x);
 }
 #endif
 template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet4d psqrt<Packet4d>(const Packet4d& x) {
  return _mm256_sqrt_pd(x);
 }
 #if EIGEN_FAST_MATH
 #else
 template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet8f psqrt<Packet8f>(const Packet8f& _x) {
  return _mm256_sqrt_ps(_x);
 }
 #endif
 template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet4d psqrt<Packet4d>(const Packet4d& _x) {
  return _mm256_sqrt_pd(_x);
 }
 #if EIGEN_FAST_MATH
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet8f prsqrt<Packet8f>(const Packet8f& _x) {
  _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(inf, 0x7f800000);
@@ -140,18 +161,65 @@ Packet8f prsqrt<Packet8f>(const Packet8f& _x) {
 #else
 template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
-Packet8f prsqrt<Packet8f>(const Packet8f& x) {
+Packet8f prsqrt<Packet8f>(const Packet8f& _x) {
  _EIGEN_DECLARE_CONST_Packet8f(one, 1.0f);
-  return _mm256_div_ps(p8f_one, _mm256_sqrt_ps(x));
+  return _mm256_div_ps(p8f_one, _mm256_sqrt_ps(_x));
 }
 #endif
 template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
-Packet4d prsqrt<Packet4d>(const Packet4d& x) {
+Packet4d prsqrt<Packet4d>(const Packet4d& _x) {
  _EIGEN_DECLARE_CONST_Packet4d(one, 1.0);
-  return _mm256_div_pd(p4d_one, _mm256_sqrt_pd(x));
+  return _mm256_div_pd(p4d_one, _mm256_sqrt_pd(_x));
 }
 F16_PACKET_FUNCTION(Packet8f, Packet8h, psin)
 F16_PACKET_FUNCTION(Packet8f, Packet8h, pcos)
 F16_PACKET_FUNCTION(Packet8f, Packet8h, plog)
 F16_PACKET_FUNCTION(Packet8f, Packet8h, plog2)
 F16_PACKET_FUNCTION(Packet8f, Packet8h, plog1p)
 F16_PACKET_FUNCTION(Packet8f, Packet8h, pexpm1)
 F16_PACKET_FUNCTION(Packet8f, Packet8h, pexp)
 F16_PACKET_FUNCTION(Packet8f, Packet8h, ptanh)
 F16_PACKET_FUNCTION(Packet8f, Packet8h, psqrt)
 F16_PACKET_FUNCTION(Packet8f, Packet8h, prsqrt)
 template <>
 EIGEN_STRONG_INLINE Packet8h pfrexp(const Packet8h& a, Packet8h& exponent) {
  Packet8f fexponent;
  const Packet8h out = float2half(pfrexp<Packet8f>(half2float(a), fexponent));
  exponent = float2half(fexponent);
  return out;
 }
 template <>
 EIGEN_STRONG_INLINE Packet8h pldexp(const Packet8h& a, const Packet8h& exponent) {
  return float2half(pldexp<Packet8f>(half2float(a), half2float(exponent)));
 }
 BF16_PACKET_FUNCTION(Packet8f, Packet8bf, psin)
 BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pcos)
 BF16_PACKET_FUNCTION(Packet8f, Packet8bf, plog)
 BF16_PACKET_FUNCTION(Packet8f, Packet8bf, plog2)
 BF16_PACKET_FUNCTION(Packet8f, Packet8bf, plog1p)
 BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pexpm1)
 BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pexp)
 BF16_PACKET_FUNCTION(Packet8f, Packet8bf, ptanh)
 BF16_PACKET_FUNCTION(Packet8f, Packet8bf, psqrt)
 BF16_PACKET_FUNCTION(Packet8f, Packet8bf, prsqrt)
 template <>
 EIGEN_STRONG_INLINE Packet8bf pfrexp(const Packet8bf& a, Packet8bf& exponent) {
  Packet8f fexponent;
  const Packet8bf out = F32ToBf16(pfrexp<Packet8f>(Bf16ToF32(a), fexponent));
  exponent = F32ToBf16(fexponent);
  return out;
 }
 template <>
 EIGEN_STRONG_INLINE Packet8bf pldexp(const Packet8bf& a, const Packet8bf& exponent) {
  return F32ToBf16(pldexp<Packet8f>(Bf16ToF32(a), Bf16ToF32(exponent)));
 }
 }  // end namespace internal
--- a/Eigen/src/Core/arch/AVX/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX/PacketMath.h
--- a/Eigen/src/Core/arch/AVX/TypeCasting.h
+++ b/Eigen/src/Core/arch/AVX/TypeCasting.h
@@ -35,6 +35,46 @@ struct type_casting_traits<int, float> {
 };
 #ifndef EIGEN_VECTORIZE_AVX512
 template <>
 struct type_casting_traits<Eigen::half, float> {
  enum {
    VectorizedCast = 1,
    SrcCoeffRatio = 1,
    TgtCoeffRatio = 1
  };
 };
 template <>
 struct type_casting_traits<float, Eigen::half> {
  enum {
    VectorizedCast = 1,
    SrcCoeffRatio = 1,
    TgtCoeffRatio = 1
  };
 };
 template <>
 struct type_casting_traits<bfloat16, float> {
  enum {
    VectorizedCast = 1,
    SrcCoeffRatio = 1,
    TgtCoeffRatio = 1
  };
 };
 template <>
 struct type_casting_traits<float, bfloat16> {
  enum {
    VectorizedCast = 1,
    SrcCoeffRatio = 1,
    TgtCoeffRatio = 1
  };
 };
 #endif  // EIGEN_VECTORIZE_AVX512
 template<> EIGEN_STRONG_INLINE Packet8i pcast<Packet8f, Packet8i>(const Packet8f& a) {
  return _mm256_cvttps_epi32(a);
@@ -52,36 +92,22 @@ template<> EIGEN_STRONG_INLINE Packet8f preinterpret<Packet8f,Packet8i>(const Pa
  return _mm256_castsi256_ps(a);
 }
 #ifndef EIGEN_VECTORIZE_AVX512
 template <>
 struct type_casting_traits<Eigen::half, float> {
  enum {
    VectorizedCast = 1,
    SrcCoeffRatio = 1,
    TgtCoeffRatio = 1
  };
 };
 template<> EIGEN_STRONG_INLINE Packet8f pcast<Packet8h, Packet8f>(const Packet8h& a) {
  return half2float(a);
 }
-template <>
+template<> EIGEN_STRONG_INLINE Packet8f pcast<Packet8bf, Packet8f>(const Packet8bf& a) {
-struct type_casting_traits<float, Eigen::half> {
+  return Bf16ToF32(a);
-  enum {
+}
    VectorizedCast = 1,
    SrcCoeffRatio = 1,
    TgtCoeffRatio = 1
  };
 };
 #endif  // EIGEN_VECTORIZE_AVX512
 template<> EIGEN_STRONG_INLINE Packet8h pcast<Packet8f, Packet8h>(const Packet8f& a) {
  return float2half(a);
 }
 template<> EIGEN_STRONG_INLINE Packet8bf pcast<Packet8f, Packet8bf>(const Packet8f& a) {
  return F32ToBf16(a);
 }
 } // end namespace internal
 } // end namespace Eigen
--- a/Eigen/src/Core/arch/AVX512/Complex.h
+++ b/Eigen/src/Core/arch/AVX512/Complex.h
@@ -37,17 +37,19 @@ template<> struct packet_traits<std::complex<float> >  : default_packet_traits
    HasMul    = 1,
    HasDiv    = 1,
    HasNegate = 1,
    HasSqrt   = EIGEN_HAS_AVX512_MATH,
    HasAbs    = 0,
    HasAbs2   = 0,
    HasMin    = 0,
    HasMax    = 0,
-    HasSetLinear = 0,
+    HasSetLinear = 0
    HasReduxp = 0
  };
 };
 template<> struct unpacket_traits<Packet8cf> {
  typedef std::complex<float> type;
  typedef Packet4cf half;
  typedef Packet16f as_real;
  enum {
    size = 8,
    alignment=unpacket_traits<Packet16f>::alignment,
@@ -55,11 +57,9 @@ template<> struct unpacket_traits<Packet8cf> {
    masked_load_available=false,
    masked_store_available=false
  };
  typedef Packet4cf half;
 };
 template<> EIGEN_STRONG_INLINE Packet8cf ptrue<Packet8cf>(const Packet8cf& a) { return Packet8cf(ptrue(Packet16f(a.v))); }
 template<> EIGEN_STRONG_INLINE Packet8cf pnot<Packet8cf>(const Packet8cf& a) { return Packet8cf(pnot(Packet16f(a.v))); }
 template<> EIGEN_STRONG_INLINE Packet8cf padd<Packet8cf>(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(_mm512_add_ps(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet8cf psub<Packet8cf>(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(_mm512_sub_ps(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet8cf pnegate(const Packet8cf& a)
@@ -97,7 +97,9 @@ template<> EIGEN_STRONG_INLINE Packet8cf ploadu<Packet8cf>(const std::complex<fl
 template<> EIGEN_STRONG_INLINE Packet8cf pset1<Packet8cf>(const std::complex<float>& from)
 {
-  return Packet8cf(_mm512_castpd_ps(pload1<Packet8d>((const double*)(const void*)&from)));
+  const float re = std::real(from);
  const float im = std::imag(from);
  return Packet8cf(_mm512_set_ps(im, re, im, re, im, re, im, re, im, re, im, re, im, re, im, re));
 }
 template<> EIGEN_STRONG_INLINE Packet8cf ploaddup<Packet8cf>(const std::complex<float>* from)
@@ -153,58 +155,11 @@ EIGEN_STRONG_INLINE Packet4cf predux_half_dowto4<Packet8cf>(const Packet8cf& a)
  return Packet4cf(res);
 }
 template<int Offset>
 struct palign_impl<Offset,Packet8cf>
 {
  static EIGEN_STRONG_INLINE void run(Packet8cf& first, const Packet8cf& second)
  {
    if (Offset==0) return;
    palign_impl<Offset*2,Packet16f>::run(first.v, second.v);
  }
 };
 template<> struct conj_helper<Packet8cf, Packet8cf, false,true>
 {
  EIGEN_STRONG_INLINE Packet8cf pmadd(const Packet8cf& x, const Packet8cf& y, const Packet8cf& c) const
  { return padd(pmul(x,y),c); }
  EIGEN_STRONG_INLINE Packet8cf pmul(const Packet8cf& a, const Packet8cf& b) const
  {
    return internal::pmul(a, pconj(b));
  }
 };
 template<> struct conj_helper<Packet8cf, Packet8cf, true,false>
 {
  EIGEN_STRONG_INLINE Packet8cf pmadd(const Packet8cf& x, const Packet8cf& y, const Packet8cf& c) const
  { return padd(pmul(x,y),c); }
  EIGEN_STRONG_INLINE Packet8cf pmul(const Packet8cf& a, const Packet8cf& b) const
  {
    return internal::pmul(pconj(a), b);
  }
 };
 template<> struct conj_helper<Packet8cf, Packet8cf, true,true>
 {
  EIGEN_STRONG_INLINE Packet8cf pmadd(const Packet8cf& x, const Packet8cf& y, const Packet8cf& c) const
  { return padd(pmul(x,y),c); }
  EIGEN_STRONG_INLINE Packet8cf pmul(const Packet8cf& a, const Packet8cf& b) const
  {
    return pconj(internal::pmul(a, b));
  }
 };
 EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet8cf,Packet16f)
 template<> EIGEN_STRONG_INLINE Packet8cf pdiv<Packet8cf>(const Packet8cf& a, const Packet8cf& b)
 {
-  Packet8cf num = pmul(a, pconj(b));
+  return pdiv_complex(a, b);
  __m512 tmp = _mm512_mul_ps(b.v, b.v);
  __m512 tmp2    = _mm512_shuffle_ps(tmp,tmp,0xB1);
  __m512 denom = _mm512_add_ps(tmp, tmp2);
  return Packet8cf(_mm512_div_ps(num.v, denom));
 }
 template<> EIGEN_STRONG_INLINE Packet8cf pcplxflip<Packet8cf>(const Packet8cf& x)
@@ -235,17 +190,19 @@ template<> struct packet_traits<std::complex<double> >  : default_packet_traits
    HasMul    = 1,
    HasDiv    = 1,
    HasNegate = 1,
    HasSqrt   = EIGEN_HAS_AVX512_MATH,
    HasAbs    = 0,
    HasAbs2   = 0,
    HasMin    = 0,
    HasMax    = 0,
-    HasSetLinear = 0,
+    HasSetLinear = 0
    HasReduxp = 0
  };
 };
 template<> struct unpacket_traits<Packet4cd> {
  typedef std::complex<double> type;
  typedef Packet2cd half;
  typedef Packet8d as_real;
  enum {
    size = 4,
    alignment = unpacket_traits<Packet8d>::alignment,
@@ -253,7 +210,6 @@ template<> struct unpacket_traits<Packet4cd> {
    masked_load_available=false,
    masked_store_available=false
  };
  typedef Packet2cd half;
 };
 template<> EIGEN_STRONG_INLINE Packet4cd padd<Packet4cd>(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(_mm512_add_pd(a.v,b.v)); }
@@ -277,7 +233,6 @@ template<> EIGEN_STRONG_INLINE Packet4cd pmul<Packet4cd>(const Packet4cd& a, con
 }
 template<> EIGEN_STRONG_INLINE Packet4cd ptrue<Packet4cd>(const Packet4cd& a) { return Packet4cd(ptrue(Packet8d(a.v))); }
 template<> EIGEN_STRONG_INLINE Packet4cd pnot<Packet4cd>(const Packet4cd& a) { return Packet4cd(pnot(Packet8d(a.v))); }
 template<> EIGEN_STRONG_INLINE Packet4cd pand   <Packet4cd>(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(pand(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet4cd por    <Packet4cd>(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(por(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet4cd pxor   <Packet4cd>(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(pxor(a.v,b.v)); }
@@ -296,11 +251,7 @@ template<> EIGEN_STRONG_INLINE Packet4cd ploadu<Packet4cd>(const std::complex<do
 template<> EIGEN_STRONG_INLINE Packet4cd pset1<Packet4cd>(const std::complex<double>& from)
 {
  #ifdef EIGEN_VECTORIZE_AVX512DQ
  return Packet4cd(_mm512_broadcast_f64x2(pset1<Packet1cd>(from).v));
  #else
  return Packet4cd(_mm512_castps_pd(_mm512_broadcast_f32x4( _mm_castpd_ps(pset1<Packet1cd>(from).v))));
  #endif
 }
 template<> EIGEN_STRONG_INLINE Packet4cd ploaddup<Packet4cd>(const std::complex<double>* from) {
@@ -337,7 +288,7 @@ template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet4cd>(const Pack
 }
 template<> EIGEN_STRONG_INLINE Packet4cd preverse(const Packet4cd& a) {
-  return Packet4cd(_mm512_shuffle_f64x2(a.v, a.v, EIGEN_SSE_SHUFFLE_MASK(3,2,1,0)));
+  return Packet4cd(_mm512_shuffle_f64x2(a.v, a.v, (shuffle_mask<3,2,1,0>::mask)));
 }
 template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet4cd>(const Packet4cd& a)
@@ -352,57 +303,11 @@ template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet4cd>(const
                         Packet2cd(_mm512_extractf64x4_pd(a.v,1))));
 }
 template<int Offset>
 struct palign_impl<Offset,Packet4cd>
 {
  static EIGEN_STRONG_INLINE void run(Packet4cd& first, const Packet4cd& second)
  {
    if (Offset==0) return;
    palign_impl<Offset*2,Packet8d>::run(first.v, second.v);
  }
 };
 template<> struct conj_helper<Packet4cd, Packet4cd, false,true>
 {
  EIGEN_STRONG_INLINE Packet4cd pmadd(const Packet4cd& x, const Packet4cd& y, const Packet4cd& c) const
  { return padd(pmul(x,y),c); }
  EIGEN_STRONG_INLINE Packet4cd pmul(const Packet4cd& a, const Packet4cd& b) const
  {
    return internal::pmul(a, pconj(b));
  }
 };
 template<> struct conj_helper<Packet4cd, Packet4cd, true,false>
 {
  EIGEN_STRONG_INLINE Packet4cd pmadd(const Packet4cd& x, const Packet4cd& y, const Packet4cd& c) const
  { return padd(pmul(x,y),c); }
  EIGEN_STRONG_INLINE Packet4cd pmul(const Packet4cd& a, const Packet4cd& b) const
  {
    return internal::pmul(pconj(a), b);
  }
 };
 template<> struct conj_helper<Packet4cd, Packet4cd, true,true>
 {
  EIGEN_STRONG_INLINE Packet4cd pmadd(const Packet4cd& x, const Packet4cd& y, const Packet4cd& c) const
  { return padd(pmul(x,y),c); }
  EIGEN_STRONG_INLINE Packet4cd pmul(const Packet4cd& a, const Packet4cd& b) const
  {
    return pconj(internal::pmul(a, b));
  }
 };
 EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet4cd,Packet8d)
 template<> EIGEN_STRONG_INLINE Packet4cd pdiv<Packet4cd>(const Packet4cd& a, const Packet4cd& b)
 {
-  Packet4cd num = pmul(a, pconj(b));
+  return pdiv_complex(a, b);
  __m512d tmp = _mm512_mul_pd(b.v, b.v);
  __m512d denom =  padd(_mm512_permute_pd(tmp,0x55), tmp);
  return Packet4cd(_mm512_div_pd(num.v, denom));
 }
 template<> EIGEN_STRONG_INLINE Packet4cd pcplxflip<Packet4cd>(const Packet4cd& x)
@@ -450,43 +355,30 @@ ptranspose(PacketBlock<Packet8cf,8>& kernel) {
 EIGEN_DEVICE_FUNC inline void
 ptranspose(PacketBlock<Packet4cd,4>& kernel) {
-  __m512d T0 = _mm512_shuffle_f64x2(kernel.packet[0].v, kernel.packet[1].v, EIGEN_SSE_SHUFFLE_MASK(0,1,0,1)); // [a0 a1 b0 b1]
+  __m512d T0 = _mm512_shuffle_f64x2(kernel.packet[0].v, kernel.packet[1].v, (shuffle_mask<0,1,0,1>::mask)); // [a0 a1 b0 b1]
-  __m512d T1 = _mm512_shuffle_f64x2(kernel.packet[0].v, kernel.packet[1].v, EIGEN_SSE_SHUFFLE_MASK(2,3,2,3)); // [a2 a3 b2 b3]
+  __m512d T1 = _mm512_shuffle_f64x2(kernel.packet[0].v, kernel.packet[1].v, (shuffle_mask<2,3,2,3>::mask)); // [a2 a3 b2 b3]
-  __m512d T2 = _mm512_shuffle_f64x2(kernel.packet[2].v, kernel.packet[3].v, EIGEN_SSE_SHUFFLE_MASK(0,1,0,1)); // [c0 c1 d0 d1]
+  __m512d T2 = _mm512_shuffle_f64x2(kernel.packet[2].v, kernel.packet[3].v, (shuffle_mask<0,1,0,1>::mask)); // [c0 c1 d0 d1]
-  __m512d T3 = _mm512_shuffle_f64x2(kernel.packet[2].v, kernel.packet[3].v, EIGEN_SSE_SHUFFLE_MASK(2,3,2,3)); // [c2 c3 d2 d3]
+  __m512d T3 = _mm512_shuffle_f64x2(kernel.packet[2].v, kernel.packet[3].v, (shuffle_mask<2,3,2,3>::mask)); // [c2 c3 d2 d3]
-  kernel.packet[3] = Packet4cd(_mm512_shuffle_f64x2(T1, T3, EIGEN_SSE_SHUFFLE_MASK(1,3,1,3))); // [a3 b3 c3 d3]
+  kernel.packet[3] = Packet4cd(_mm512_shuffle_f64x2(T1, T3, (shuffle_mask<1,3,1,3>::mask))); // [a3 b3 c3 d3]
-  kernel.packet[2] = Packet4cd(_mm512_shuffle_f64x2(T1, T3, EIGEN_SSE_SHUFFLE_MASK(0,2,0,2))); // [a2 b2 c2 d2]
+  kernel.packet[2] = Packet4cd(_mm512_shuffle_f64x2(T1, T3, (shuffle_mask<0,2,0,2>::mask))); // [a2 b2 c2 d2]
-  kernel.packet[1] = Packet4cd(_mm512_shuffle_f64x2(T0, T2, EIGEN_SSE_SHUFFLE_MASK(1,3,1,3))); // [a1 b1 c1 d1]
+  kernel.packet[1] = Packet4cd(_mm512_shuffle_f64x2(T0, T2, (shuffle_mask<1,3,1,3>::mask))); // [a1 b1 c1 d1]
-  kernel.packet[0] = Packet4cd(_mm512_shuffle_f64x2(T0, T2, EIGEN_SSE_SHUFFLE_MASK(0,2,0,2))); // [a0 b0 c0 d0]
+  kernel.packet[0] = Packet4cd(_mm512_shuffle_f64x2(T0, T2, (shuffle_mask<0,2,0,2>::mask))); // [a0 b0 c0 d0]
 }
-template<> EIGEN_STRONG_INLINE Packet8cf pinsertfirst(const Packet8cf& a, std::complex<float> b)
+#if EIGEN_HAS_AVX512_MATH
-{
+
-  Packet2cf tmp = Packet2cf(_mm512_extractf32x4_ps(a.v,0));
+template<> EIGEN_STRONG_INLINE Packet4cd psqrt<Packet4cd>(const Packet4cd& a) {
-  tmp = pinsertfirst(tmp, b);
+  return psqrt_complex<Packet4cd>(a);
  return Packet8cf( _mm512_insertf32x4(a.v, tmp.v, 0) );
 }
-template<> EIGEN_STRONG_INLINE Packet4cd pinsertfirst(const Packet4cd& a, std::complex<double> b)
+template<> EIGEN_STRONG_INLINE Packet8cf psqrt<Packet8cf>(const Packet8cf& a) {
-{
+  return psqrt_complex<Packet8cf>(a);
  return Packet4cd(_mm512_castsi512_pd( _mm512_inserti32x4(_mm512_castpd_si512(a.v), _mm_castpd_si128(pset1<Packet1cd>(b).v), 0) ));
 }
-template<> EIGEN_STRONG_INLINE Packet8cf pinsertlast(const Packet8cf& a, std::complex<float> b)
+#endif
 {
  Packet2cf tmp = Packet2cf(_mm512_extractf32x4_ps(a.v,3) );
  tmp = pinsertlast(tmp, b);
  return Packet8cf( _mm512_insertf32x4(a.v, tmp.v, 3) );
 }
 template<> EIGEN_STRONG_INLINE Packet4cd pinsertlast(const Packet4cd& a, std::complex<double> b)
 {
  return Packet4cd(_mm512_castsi512_pd( _mm512_inserti32x4(_mm512_castpd_si512(a.v), _mm_castpd_si128(pset1<Packet1cd>(b).v), 3) ));
 }
 } // end namespace internal
 } // end namespace Eigen
 #endif // EIGEN_COMPLEX_AVX512_H
--- a/Eigen/src/Core/arch/AVX512/MathFunctions.h
+++ b/Eigen/src/Core/arch/AVX512/MathFunctions.h
@@ -14,8 +14,7 @@ namespace Eigen {
 namespace internal {
-// Disable the code for older versions of gcc that don't support many of the required avx512 instrinsics.
+#if EIGEN_HAS_AVX512_MATH
 #if EIGEN_GNUC_AT_LEAST(5, 3) || EIGEN_COMP_CLANG  || EIGEN_COMP_MSVC >= 1923
 #define _EIGEN_DECLARE_CONST_Packet16f(NAME, X) \
  const Packet16f p16f_##NAME = pset1<Packet16f>(X)
@@ -29,106 +28,41 @@ namespace internal {
 #define _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(NAME, X) \
  const Packet8d p8d_##NAME = _mm512_castsi512_pd(_mm512_set1_epi64(X))
-// Natural logarithm
+#define _EIGEN_DECLARE_CONST_Packet16bf(NAME, X) \
-// Computes log(x) as log(2^e * m) = C*e + log(m), where the constant C =log(2)
+  const Packet16bf p16bf_##NAME = pset1<Packet16bf>(X)
-// and m is in the range [sqrt(1/2),sqrt(2)). In this range, the logarithm can
+
-// be easily approximated by a polynomial centered on m=1 for stability.
+#define _EIGEN_DECLARE_CONST_Packet16bf_FROM_INT(NAME, X) \
-#if defined(EIGEN_VECTORIZE_AVX512DQ)
+  const Packet16bf p16bf_##NAME =  preinterpret<Packet16bf,Packet16i>(pset1<Packet16i>(X))
 template <>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
 plog<Packet16f>(const Packet16f& _x) {
-  Packet16f x = _x;
+  return plog_float(_x);
  _EIGEN_DECLARE_CONST_Packet16f(1, 1.0f);
  _EIGEN_DECLARE_CONST_Packet16f(half, 0.5f);
  _EIGEN_DECLARE_CONST_Packet16f(126f, 126.0f);
  _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(inv_mant_mask, ~0x7f800000);
  // The smallest non denormalized float number.
  _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(min_norm_pos, 0x00800000);
  _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(minus_inf, 0xff800000);
  _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(pos_inf, 0x7f800000);
  _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(nan, 0x7fc00000);
  // Polynomial coefficients.
  _EIGEN_DECLARE_CONST_Packet16f(cephes_SQRTHF, 0.707106781186547524f);
  _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p0, 7.0376836292E-2f);
  _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p1, -1.1514610310E-1f);
  _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p2, 1.1676998740E-1f);
  _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p3, -1.2420140846E-1f);
  _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p4, +1.4249322787E-1f);
  _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p5, -1.6668057665E-1f);
  _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p6, +2.0000714765E-1f);
  _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p7, -2.4999993993E-1f);
  _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p8, +3.3333331174E-1f);
  _EIGEN_DECLARE_CONST_Packet16f(cephes_log_q1, -2.12194440e-4f);
  _EIGEN_DECLARE_CONST_Packet16f(cephes_log_q2, 0.693359375f);
  // invalid_mask is set to true when x is NaN
  __mmask16 invalid_mask =  _mm512_cmp_ps_mask(x, _mm512_setzero_ps(), _CMP_NGE_UQ);
  __mmask16 iszero_mask  =  _mm512_cmp_ps_mask(x, _mm512_setzero_ps(), _CMP_EQ_OQ);
  // Truncate input values to the minimum positive normal.
  x = pmax(x, p16f_min_norm_pos);
  // Extract the shifted exponents.
  Packet16f emm0 = _mm512_cvtepi32_ps(_mm512_srli_epi32(preinterpret<Packet16i,Packet16f>(x), 23));
  Packet16f e = _mm512_sub_ps(emm0, p16f_126f);
  // Set the exponents to -1, i.e. x are in the range [0.5,1).
  x = _mm512_and_ps(x, p16f_inv_mant_mask);
  x = _mm512_or_ps(x, p16f_half);
  // part2: Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2))
  // and shift by -1. The values are then centered around 0, which improves
  // the stability of the polynomial evaluation.
  //   if( x < SQRTHF ) {
  //     e -= 1;
  //     x = x + x - 1.0;
  //   } else { x = x - 1.0; }
  __mmask16 mask = _mm512_cmp_ps_mask(x, p16f_cephes_SQRTHF, _CMP_LT_OQ);
  Packet16f tmp = _mm512_mask_blend_ps(mask, _mm512_setzero_ps(), x);
  x = psub(x, p16f_1);
  e = psub(e, _mm512_mask_blend_ps(mask, _mm512_setzero_ps(), p16f_1));
  x = padd(x, tmp);
  Packet16f x2 = pmul(x, x);
  Packet16f x3 = pmul(x2, x);
  // Evaluate the polynomial approximant of degree 8 in three parts, probably
  // to improve instruction-level parallelism.
  Packet16f y, y1, y2;
  y = pmadd(p16f_cephes_log_p0, x, p16f_cephes_log_p1);
  y1 = pmadd(p16f_cephes_log_p3, x, p16f_cephes_log_p4);
  y2 = pmadd(p16f_cephes_log_p6, x, p16f_cephes_log_p7);
  y = pmadd(y, x, p16f_cephes_log_p2);
  y1 = pmadd(y1, x, p16f_cephes_log_p5);
  y2 = pmadd(y2, x, p16f_cephes_log_p8);
  y = pmadd(y, x3, y1);
  y = pmadd(y, x3, y2);
  y = pmul(y, x3);
  // Add the logarithm of the exponent back to the result of the interpolation.
  y1 = pmul(e, p16f_cephes_log_q1);
  tmp = pmul(x2, p16f_half);
  y = padd(y, y1);
  x = psub(x, tmp);
  y2 = pmul(e, p16f_cephes_log_q2);
  x = padd(x, y);
  x = padd(x, y2);
  __mmask16 pos_inf_mask = _mm512_cmp_ps_mask(_x,p16f_pos_inf,_CMP_EQ_OQ);
  // Filter out invalid inputs, i.e.:
  //  - negative arg will be NAN,
  //  - 0 will be -INF.
  //  - +INF will be +INF
  return _mm512_mask_blend_ps(iszero_mask,
            _mm512_mask_blend_ps(invalid_mask,
              _mm512_mask_blend_ps(pos_inf_mask,x,p16f_pos_inf),
              p16f_nan),
            p16f_minus_inf);
 }
-#endif
+
 template <>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d
 plog<Packet8d>(const Packet8d& _x) {
  return plog_double(_x);
 }
 F16_PACKET_FUNCTION(Packet16f, Packet16h, plog)
 BF16_PACKET_FUNCTION(Packet16f, Packet16bf, plog)
 template <>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
 plog2<Packet16f>(const Packet16f& _x) {
  return plog2_float(_x);
 }
 template <>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d
 plog2<Packet8d>(const Packet8d& _x) {
  return plog2_double(_x);
 }
 F16_PACKET_FUNCTION(Packet16f, Packet16h, plog2)
 BF16_PACKET_FUNCTION(Packet16f, Packet16bf, plog2)
 // Exponential function. Works by writing "x = m*log(2) + r" where
 // "m = floor(x/log(2)+1/2)" and "r" is the remainder. The result is then
@@ -164,17 +98,17 @@ pexp<Packet16f>(const Packet16f& _x) {
  _EIGEN_DECLARE_CONST_Packet16f(nln2, -0.6931471805599453f);
  Packet16f r = _mm512_fmadd_ps(m, p16f_nln2, x);
  Packet16f r2 = pmul(r, r);
  Packet16f r3 = pmul(r2, r);
-  // TODO(gonnet): Split into odd/even polynomials and try to exploit
+  // Evaluate the polynomial approximant,improved by instruction-level parallelism.
-  //               instruction-level parallelism.
+  Packet16f y, y1, y2;
-  Packet16f y = p16f_cephes_exp_p0;
+  y  = pmadd(p16f_cephes_exp_p0, r, p16f_cephes_exp_p1);
-  y = pmadd(y, r, p16f_cephes_exp_p1);
+  y1 = pmadd(p16f_cephes_exp_p3, r, p16f_cephes_exp_p4);
  y2 = padd(r, p16f_1);
  y  = pmadd(y, r, p16f_cephes_exp_p2);
-  y = pmadd(y, r, p16f_cephes_exp_p3);
+  y1 = pmadd(y1, r, p16f_cephes_exp_p5);
-  y = pmadd(y, r, p16f_cephes_exp_p4);
+  y  = pmadd(y, r3, y1);
-  y = pmadd(y, r, p16f_cephes_exp_p5);
+  y  = pmadd(y, r2, y2);
  y = pmadd(y, r2, r);
  y = padd(y, p16f_1);
  // Build emm0 = 2^m.
  Packet16i emm0 = _mm512_cvttps_epi32(padd(m, p16f_127));
@@ -184,75 +118,40 @@ pexp<Packet16f>(const Packet16f& _x) {
  return pmax(pmul(y, _mm512_castsi512_ps(emm0)), _x);
 }
-/*template <>
+template <>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d
 pexp<Packet8d>(const Packet8d& _x) {
-  Packet8d x = _x;
+  return pexp_double(_x);
 }
-  _EIGEN_DECLARE_CONST_Packet8d(1, 1.0);
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pexp)
-  _EIGEN_DECLARE_CONST_Packet8d(2, 2.0);
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pexp)
-  _EIGEN_DECLARE_CONST_Packet8d(exp_hi, 709.437);
+template <>
-  _EIGEN_DECLARE_CONST_Packet8d(exp_lo, -709.436139303);
+EIGEN_STRONG_INLINE Packet16h pfrexp(const Packet16h& a, Packet16h& exponent) {
  Packet16f fexponent;
  const Packet16h out = float2half(pfrexp<Packet16f>(half2float(a), fexponent));
  exponent = float2half(fexponent);
  return out;
 }
-  _EIGEN_DECLARE_CONST_Packet8d(cephes_LOG2EF, 1.4426950408889634073599);
+template <>
 EIGEN_STRONG_INLINE Packet16h pldexp(const Packet16h& a, const Packet16h& exponent) {
  return float2half(pldexp<Packet16f>(half2float(a), half2float(exponent)));
 }
-  _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_p0, 1.26177193074810590878e-4);
+template <>
-  _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_p1, 3.02994407707441961300e-2);
+EIGEN_STRONG_INLINE Packet16bf pfrexp(const Packet16bf& a, Packet16bf& exponent) {
-  _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_p2, 9.99999999999999999910e-1);
+  Packet16f fexponent;
-
+  const Packet16bf out = F32ToBf16(pfrexp<Packet16f>(Bf16ToF32(a), fexponent));
-  _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_q0, 3.00198505138664455042e-6);
+  exponent = F32ToBf16(fexponent);
-  _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_q1, 2.52448340349684104192e-3);
+  return out;
-  _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_q2, 2.27265548208155028766e-1);
+}
  _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_q3, 2.00000000000000000009e0);
  _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_C1, 0.693145751953125);
  _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_C2, 1.42860682030941723212e-6);
  // clamp x
  x = pmax(pmin(x, p8d_exp_hi), p8d_exp_lo);
  // Express exp(x) as exp(g + n*log(2)).
  const Packet8d n =
      _mm512_mul_round_pd(p8d_cephes_LOG2EF, x, _MM_FROUND_TO_NEAREST_INT);
  // Get the remainder modulo log(2), i.e. the "g" described above. Subtract
  // n*log(2) out in two steps, i.e. n*C1 + n*C2, C1+C2=log2 to get the last
  // digits right.
  const Packet8d nC1 = pmul(n, p8d_cephes_exp_C1);
  const Packet8d nC2 = pmul(n, p8d_cephes_exp_C2);
  x = psub(x, nC1);
  x = psub(x, nC2);
  const Packet8d x2 = pmul(x, x);
  // Evaluate the numerator polynomial of the rational interpolant.
  Packet8d px = p8d_cephes_exp_p0;
  px = pmadd(px, x2, p8d_cephes_exp_p1);
  px = pmadd(px, x2, p8d_cephes_exp_p2);
  px = pmul(px, x);
  // Evaluate the denominator polynomial of the rational interpolant.
  Packet8d qx = p8d_cephes_exp_q0;
  qx = pmadd(qx, x2, p8d_cephes_exp_q1);
  qx = pmadd(qx, x2, p8d_cephes_exp_q2);
  qx = pmadd(qx, x2, p8d_cephes_exp_q3);
  // I don't really get this bit, copied from the SSE2 routines, so...
  // TODO(gonnet): Figure out what is going on here, perhaps find a better
  // rational interpolant?
  x = _mm512_div_pd(px, psub(qx, px));
  x = pmadd(p8d_2, x, p8d_1);
  // Build e=2^n.
  const Packet8d e = _mm512_castsi512_pd(_mm512_slli_epi64(
      _mm512_add_epi64(_mm512_cvtpd_epi64(n), _mm512_set1_epi64(1023)), 52));
  // Construct the result 2^n * exp(g) = e * x. The max is used to catch
  // non-finite values in the input.
  return pmax(pmul(x, e), _x);
  }*/
 template <>
 EIGEN_STRONG_INLINE Packet16bf pldexp(const Packet16bf& a, const Packet16bf& exponent) {
  return F32ToBf16(pldexp<Packet16f>(Bf16ToF32(a), Bf16ToF32(exponent)));
 }
 // Functions for sqrt.
 // The EIGEN_FAST_MATH version uses the _mm_rsqrt_ps approximation and one step
@@ -303,12 +202,16 @@ template <>
 EIGEN_STRONG_INLINE Packet16f psqrt<Packet16f>(const Packet16f& x) {
  return _mm512_sqrt_ps(x);
 }
 template <>
 EIGEN_STRONG_INLINE Packet8d psqrt<Packet8d>(const Packet8d& x) {
  return _mm512_sqrt_pd(x);
 }
 #endif
 F16_PACKET_FUNCTION(Packet16f, Packet16h, psqrt)
 BF16_PACKET_FUNCTION(Packet16f, Packet16bf, psqrt)
 // prsqrt for float.
 #if defined(EIGEN_VECTORIZE_AVX512ER)
@@ -316,7 +219,6 @@ template <>
 EIGEN_STRONG_INLINE Packet16f prsqrt<Packet16f>(const Packet16f& x) {
  return _mm512_rsqrt28_ps(x);
 }
 #elif EIGEN_FAST_MATH
 template <>
@@ -348,7 +250,6 @@ prsqrt<Packet16f>(const Packet16f& _x) {
  // return rsqrt(+inf) = 0, rsqrt(x) = NaN if x < 0, and rsqrt(0) = +inf.
  return _mm512_mask_blend_ps(not_finite_pos_mask, y_newton, y_approx);
 }
 #else
 template <>
@@ -356,9 +257,11 @@ EIGEN_STRONG_INLINE Packet16f prsqrt<Packet16f>(const Packet16f& x) {
  _EIGEN_DECLARE_CONST_Packet16f(one, 1.0f);
  return _mm512_div_ps(p16f_one, _mm512_sqrt_ps(x));
 }
 #endif
 F16_PACKET_FUNCTION(Packet16f, Packet16h, prsqrt)
 BF16_PACKET_FUNCTION(Packet16f, Packet16bf, prsqrt)
 // prsqrt for double.
 #if EIGEN_FAST_MATH
 template <>
@@ -406,19 +309,23 @@ EIGEN_STRONG_INLINE Packet8d prsqrt<Packet8d>(const Packet8d& x) {
 }
 #endif
 #if defined(EIGEN_VECTORIZE_AVX512DQ)
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet16f plog1p<Packet16f>(const Packet16f& _x) {
  return generic_plog1p(_x);
 }
 F16_PACKET_FUNCTION(Packet16f, Packet16h, plog1p)
 BF16_PACKET_FUNCTION(Packet16f, Packet16bf, plog1p)
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet16f pexpm1<Packet16f>(const Packet16f& _x) {
  return generic_expm1(_x);
 }
 #endif
-#endif
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pexpm1)
 BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pexpm1)
 #endif  // EIGEN_HAS_AVX512_MATH
 template <>
@@ -439,6 +346,14 @@ ptanh<Packet16f>(const Packet16f& _x) {
  return internal::generic_fast_tanh_float(_x);
 }
 F16_PACKET_FUNCTION(Packet16f, Packet16h, psin)
 F16_PACKET_FUNCTION(Packet16f, Packet16h, pcos)
 F16_PACKET_FUNCTION(Packet16f, Packet16h, ptanh)
 BF16_PACKET_FUNCTION(Packet16f, Packet16bf, psin)
 BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pcos)
 BF16_PACKET_FUNCTION(Packet16f, Packet16bf, ptanh)
 }  // end namespace internal
 }  // end namespace Eigen
--- a/Eigen/src/Core/arch/AVX512/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX512/PacketMath.h
--- a/Eigen/src/Core/arch/AVX512/TypeCasting.h
+++ b/Eigen/src/Core/arch/AVX512/TypeCasting.h
@@ -14,6 +14,22 @@ namespace Eigen {
 namespace internal {
 template<> EIGEN_STRONG_INLINE Packet16i pcast<Packet16f, Packet16i>(const Packet16f& a) {
  return _mm512_cvttps_epi32(a);
 }
 template<> EIGEN_STRONG_INLINE Packet16f pcast<Packet16i, Packet16f>(const Packet16i& a) {
  return _mm512_cvtepi32_ps(a);
 }
 template<> EIGEN_STRONG_INLINE Packet16i preinterpret<Packet16i, Packet16f>(const Packet16f& a) {
  return _mm512_castps_si512(a);
 }
 template<> EIGEN_STRONG_INLINE Packet16f preinterpret<Packet16f, Packet16i>(const Packet16i& a) {
  return _mm512_castsi512_ps(a);
 }
 template <>
 struct type_casting_traits<half, float> {
  enum {
@@ -40,6 +56,32 @@ template<> EIGEN_STRONG_INLINE Packet16h pcast<Packet16f, Packet16h>(const Packe
  return float2half(a);
 }
 template <>
 struct type_casting_traits<bfloat16, float> {
  enum {
    VectorizedCast = 1,
    SrcCoeffRatio = 1,
    TgtCoeffRatio = 1
  };
 };
 template<> EIGEN_STRONG_INLINE Packet16f pcast<Packet16bf, Packet16f>(const Packet16bf& a) {
  return Bf16ToF32(a);
 }
 template <>
 struct type_casting_traits<float, bfloat16> {
  enum {
    VectorizedCast = 1,
    SrcCoeffRatio = 1,
    TgtCoeffRatio = 1
  };
 };
 template<> EIGEN_STRONG_INLINE Packet16bf pcast<Packet16f, Packet16bf>(const Packet16f& a) {
  return F32ToBf16(a);
 }
 } // end namespace internal
 } // end namespace Eigen
--- a/Eigen/src/Core/arch/AltiVec/Complex.h
+++ b/Eigen/src/Core/arch/AltiVec/Complex.h
@@ -15,8 +15,10 @@ namespace Eigen {
 namespace internal {
-static Packet4ui  p4ui_CONJ_XOR = vec_mergeh((Packet4ui)p4i_ZERO, (Packet4ui)p4f_MZERO);//{ 0x00000000, 0x80000000, 0x00000000, 0x80000000 };
+inline Packet4ui  p4ui_CONJ_XOR() {
-#ifdef __VSX__
+  return vec_mergeh((Packet4ui)p4i_ZERO, (Packet4ui)p4f_MZERO);//{ 0x00000000, 0x80000000, 0x00000000, 0x80000000 };
 }
 #ifdef EIGEN_VECTORIZE_VSX
 #if defined(_BIG_ENDIAN)
 static Packet2ul  p2ul_CONJ_XOR1 = (Packet2ul) vec_sld((Packet4ui) p2d_MZERO, (Packet4ui) p2l_ZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 };
 static Packet2ul  p2ul_CONJ_XOR2 = (Packet2ul) vec_sld((Packet4ui) p2l_ZERO,  (Packet4ui) p2d_MZERO, 8);//{ 0x0000000000000000, 0x8000000000000000 };
@@ -29,8 +31,54 @@ static Packet2ul  p2ul_CONJ_XOR2 = (Packet2ul) vec_sld((Packet4ui) p2d_MZERO, (P
 //---------- float ----------
 struct Packet2cf
 {
-  EIGEN_STRONG_INLINE explicit Packet2cf() : v(p4f_ZERO) {}
+  EIGEN_STRONG_INLINE explicit Packet2cf() {}
  EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {}
  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b)
  {
    Packet4f v1, v2;
    // Permute and multiply the real parts of a and b
    v1 = vec_perm(a.v, a.v, p16uc_PSET32_WODD);
    // Get the imaginary parts of a
    v2 = vec_perm(a.v, a.v, p16uc_PSET32_WEVEN);
    // multiply a_re * b
    v1 = vec_madd(v1, b.v, p4f_ZERO);
    // multiply a_im * b and get the conjugate result
    v2 = vec_madd(v2, b.v, p4f_ZERO);
    v2 = reinterpret_cast<Packet4f>(pxor(v2, reinterpret_cast<Packet4f>(p4ui_CONJ_XOR())));
    // permute back to a proper order
    v2 = vec_perm(v2, v2, p16uc_COMPLEX32_REV);
    return Packet2cf(padd<Packet4f>(v1, v2));
  }
  EIGEN_STRONG_INLINE Packet2cf& operator*=(const Packet2cf& b) {
    v = pmul(Packet2cf(*this), b).v;
    return *this;
  }
  EIGEN_STRONG_INLINE Packet2cf operator*(const Packet2cf& b) const {
    return Packet2cf(*this) *= b;
  }
  EIGEN_STRONG_INLINE Packet2cf& operator+=(const Packet2cf& b) {
    v = padd(v, b.v);
    return *this;
  }
  EIGEN_STRONG_INLINE Packet2cf operator+(const Packet2cf& b) const {
    return Packet2cf(*this) += b;
  }
  EIGEN_STRONG_INLINE Packet2cf& operator-=(const Packet2cf& b) {
    v = psub(v, b.v);
    return *this;
  }
  EIGEN_STRONG_INLINE Packet2cf operator-(const Packet2cf& b) const {
    return Packet2cf(*this) -= b;
  }
  EIGEN_STRONG_INLINE Packet2cf operator-(void) const {
    return Packet2cf(-v);
  }
  Packet4f  v;
 };
@@ -38,6 +86,7 @@ template<> struct packet_traits<std::complex<float> >  : default_packet_traits
 {
  typedef Packet2cf type;
  typedef Packet2cf half;
  typedef Packet4f as_real;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
@@ -53,14 +102,15 @@ template<> struct packet_traits<std::complex<float> >  : default_packet_traits
    HasAbs2   = 0,
    HasMin    = 0,
    HasMax    = 0,
-#ifdef __VSX__
+    HasSqrt   = 1,
 #ifdef EIGEN_VECTORIZE_VSX
    HasBlend  = 1,
 #endif
    HasSetLinear = 0
  };
 };
-template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet2cf half; };
+template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet2cf half; typedef Packet4f as_real; };
 template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>&  from)
 {
@@ -80,6 +130,25 @@ template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<
 template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { pstore((float*)to, from.v); }
 template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { pstoreu((float*)to, from.v); }
 EIGEN_STRONG_INLINE Packet2cf pload2(const std::complex<float>& from0, const std::complex<float>& from1)
 {
  Packet4f res0, res1;
 #ifdef EIGEN_VECTORIZE_VSX
  __asm__ ("lxsdx %x0,%y1" : "=wa" (res0) : "Z" (from0));
  __asm__ ("lxsdx %x0,%y1" : "=wa" (res1) : "Z" (from1));
 #ifdef _BIG_ENDIAN
  __asm__ ("xxpermdi %x0, %x1, %x2, 0" : "=wa" (res0) : "wa" (res0), "wa" (res1));
 #else
  __asm__ ("xxpermdi %x0, %x2, %x1, 0" : "=wa" (res0) : "wa" (res0), "wa" (res1));
 #endif
 #else
  *reinterpret_cast<std::complex<float> *>(&res0) = from0;
  *reinterpret_cast<std::complex<float> *>(&res1) = from1;
  res0 = vec_perm(res0, res1, p16uc_TRANSPOSE64_HI);
 #endif
  return Packet2cf(res0);
 }
 template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, Index stride)
 {
  EIGEN_ALIGN16 std::complex<float> af[2];
@@ -98,26 +167,7 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf
 template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(a.v + b.v); }
 template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(a.v - b.v); }
 template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate(a.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { return Packet2cf(pxor<Packet4f>(a.v, reinterpret_cast<Packet4f>(p4ui_CONJ_XOR))); }
+template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { return Packet2cf(pxor<Packet4f>(a.v, reinterpret_cast<Packet4f>(p4ui_CONJ_XOR()))); }
 template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
 {
  Packet4f v1, v2;
  // Permute and multiply the real parts of a and b
  v1 = vec_perm(a.v, a.v, p16uc_PSET32_WODD);
  // Get the imaginary parts of a
  v2 = vec_perm(a.v, a.v, p16uc_PSET32_WEVEN);
  // multiply a_re * b 
  v1 = vec_madd(v1, b.v, p4f_ZERO);
  // multiply a_im * b and get the conjugate result
  v2 = vec_madd(v2, b.v, p4f_ZERO);
  v2 = reinterpret_cast<Packet4f>(pxor(v2, reinterpret_cast<Packet4f>(p4ui_CONJ_XOR)));
  // permute back to a proper order
  v2 = vec_perm(v2, v2, p16uc_COMPLEX32_REV);
  return Packet2cf(padd<Packet4f>(v1, v2));
 }
 template<> EIGEN_STRONG_INLINE Packet2cf pand   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pand<Packet4f>(a.v, b.v)); }
 template<> EIGEN_STRONG_INLINE Packet2cf por    <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(por<Packet4f>(a.v, b.v)); }
@@ -149,22 +199,6 @@ template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packe
  return pfirst<Packet2cf>(Packet2cf(b));
 }
 template<> EIGEN_STRONG_INLINE Packet2cf preduxp<Packet2cf>(const Packet2cf* vecs)
 {
  Packet4f b1, b2;
 #ifdef _BIG_ENDIAN  
  b1 = vec_sld(vecs[0].v, vecs[1].v, 8);
  b2 = vec_sld(vecs[1].v, vecs[0].v, 8);
 #else
  b1 = vec_sld(vecs[1].v, vecs[0].v, 8);
  b2 = vec_sld(vecs[0].v, vecs[1].v, 8);
 #endif
  b2 = vec_sld(b2, b2, 8);
  b2 = padd<Packet4f>(b1, b2);
  return Packet2cf(b2);
 }
 template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a)
 {
  Packet4f b;
@@ -175,63 +209,11 @@ template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const P
  return pfirst<Packet2cf>(prod);
 }
 template<int Offset>
 struct palign_impl<Offset,Packet2cf>
 {
  static EIGEN_STRONG_INLINE void run(Packet2cf& first, const Packet2cf& second)
  {
    if (Offset==1)
    {
 #ifdef _BIG_ENDIAN
      first.v = vec_sld(first.v, second.v, 8);
 #else
      first.v = vec_sld(second.v, first.v, 8);
 #endif
    }
  }
 };
 template<> struct conj_helper<Packet2cf, Packet2cf, false,true>
 {
  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
  { return padd(pmul(x,y),c); }
  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
  {
    return internal::pmul(a, pconj(b));
  }
 };
 template<> struct conj_helper<Packet2cf, Packet2cf, true,false>
 {
  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
  { return padd(pmul(x,y),c); }
  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
  {
    return internal::pmul(pconj(a), b);
  }
 };
 template<> struct conj_helper<Packet2cf, Packet2cf, true,true>
 {
  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
  { return padd(pmul(x,y),c); }
  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
  {
    return pconj(internal::pmul(a, b));
  }
 };
 EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f)
 template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
 {
-  // TODO optimize it for AltiVec
+  return pdiv_complex(a, b);
  Packet2cf res = conj_helper<Packet2cf,Packet2cf,false,true>().pmul(a, b);
  Packet4f s = pmul<Packet4f>(b.v, b.v);
  return Packet2cf(pdiv(res.v, padd<Packet4f>(s, vec_perm(s, s, p16uc_COMPLEX32_REV))));
 }
 template<> EIGEN_STRONG_INLINE Packet2cf pcplxflip<Packet2cf>(const Packet2cf& x)
@@ -251,77 +233,27 @@ template<> EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packe
  return Packet2cf(vec_and(eq, vec_perm(eq, eq, p16uc_COMPLEX32_REV)));
 }
-#ifdef __VSX__
+#ifdef EIGEN_VECTORIZE_VSX
 template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) {
  Packet2cf result;
  result.v = reinterpret_cast<Packet4f>(pblend<Packet2d>(ifPacket, reinterpret_cast<Packet2d>(thenPacket.v), reinterpret_cast<Packet2d>(elsePacket.v)));
  return result;
 }
 template<> EIGEN_STRONG_INLINE Packet2cf psqrt<Packet2cf>(const Packet2cf& a)
 {
  return psqrt_complex<Packet2cf>(a);
 }
 #endif
 //---------- double ----------
-#ifdef __VSX__
+#ifdef EIGEN_VECTORIZE_VSX
 struct Packet1cd
 {
  EIGEN_STRONG_INLINE Packet1cd() {}
  EIGEN_STRONG_INLINE explicit Packet1cd(const Packet2d& a) : v(a) {}
  Packet2d v;
 };
-template<> struct packet_traits<std::complex<double> >  : default_packet_traits
+  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b)
 {
  typedef Packet1cd type;
  typedef Packet1cd half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 0,
    size = 1,
    HasHalfPacket = 0,
    HasAdd    = 1,
    HasSub    = 1,
    HasMul    = 1,
    HasDiv    = 1,
    HasNegate = 1,
    HasAbs    = 0,
    HasAbs2   = 0,
    HasMin    = 0,
    HasMax    = 0,
    HasSetLinear = 0
  };
 };
 template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet1cd half; };
 template<> EIGEN_STRONG_INLINE Packet1cd pload <Packet1cd>(const std::complex<double>* from) { return Packet1cd(pload<Packet2d>((const double*)from)); }
 template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) { return Packet1cd(ploadu<Packet2d>((const double*)from)); }
 template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { pstore((double*)to, from.v); }
 template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { pstoreu((double*)to, from.v); }
 template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>&  from)
 { /* here we really have to use unaligned loads :( */ return ploadu<Packet1cd>(&from); }
 template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index stride)
 {
  EIGEN_ALIGN16 std::complex<double> af[2];
  af[0] = from[0*stride];
  af[1] = from[1*stride];
  return pload<Packet1cd>(af);
 }
 template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from, Index stride)
 {
  EIGEN_ALIGN16 std::complex<double> af[2];
  pstore<std::complex<double> >(af, from);
  to[0*stride] = af[0];
  to[1*stride] = af[1];
 }
 template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v + b.v); }
 template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v - b.v); }
 template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(Packet2d(a.v))); }
 template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { return Packet1cd(pxor(a.v, reinterpret_cast<Packet2d>(p2ul_CONJ_XOR2))); }
 template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
  {
    Packet2d a_re, a_im, v1, v2;
@@ -339,6 +271,84 @@ template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, con
    return Packet1cd(padd<Packet2d>(v1, v2));
  }
  EIGEN_STRONG_INLINE Packet1cd& operator*=(const Packet1cd& b) {
    v = pmul(Packet1cd(*this), b).v;
    return *this;
  }
  EIGEN_STRONG_INLINE Packet1cd operator*(const Packet1cd& b) const {
    return Packet1cd(*this) *= b;
  }
  EIGEN_STRONG_INLINE Packet1cd& operator+=(const Packet1cd& b) {
    v = padd(v, b.v);
    return *this;
  }
  EIGEN_STRONG_INLINE Packet1cd operator+(const Packet1cd& b) const {
    return Packet1cd(*this) += b;
  }
  EIGEN_STRONG_INLINE Packet1cd& operator-=(const Packet1cd& b) {
    v = psub(v, b.v);
    return *this;
  }
  EIGEN_STRONG_INLINE Packet1cd operator-(const Packet1cd& b) const {
    return Packet1cd(*this) -= b;
  }
  EIGEN_STRONG_INLINE Packet1cd operator-(void) const {
    return Packet1cd(-v);
  }
  Packet2d v;
 };
 template<> struct packet_traits<std::complex<double> >  : default_packet_traits
 {
  typedef Packet1cd type;
  typedef Packet1cd half;
  typedef Packet2d as_real;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 0,
    size = 1,
    HasHalfPacket = 0,
    HasAdd    = 1,
    HasSub    = 1,
    HasMul    = 1,
    HasDiv    = 1,
    HasNegate = 1,
    HasAbs    = 0,
    HasAbs2   = 0,
    HasMin    = 0,
    HasMax    = 0,
    HasSqrt   = 1,
    HasSetLinear = 0
  };
 };
 template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet1cd half; typedef Packet2d as_real; };
 template<> EIGEN_STRONG_INLINE Packet1cd pload <Packet1cd>(const std::complex<double>* from) { return Packet1cd(pload<Packet2d>((const double*)from)); }
 template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) { return Packet1cd(ploadu<Packet2d>((const double*)from)); }
 template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { pstore((double*)to, from.v); }
 template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { pstoreu((double*)to, from.v); }
 template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>&  from)
 { /* here we really have to use unaligned loads :( */ return ploadu<Packet1cd>(&from); }
 template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index)
 {
  return pload<Packet1cd>(from);
 }
 template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from, Index)
 {
  pstore<std::complex<double> >(to, from);
 }
 template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v + b.v); }
 template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v - b.v); }
 template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(Packet2d(a.v))); }
 template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { return Packet1cd(pxor(a.v, reinterpret_cast<Packet2d>(p2ul_CONJ_XOR2))); }
 template<> EIGEN_STRONG_INLINE Packet1cd pand   <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(pand(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet1cd por    <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(por(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet1cd pxor   <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(pxor(a.v,b.v)); }
@@ -359,61 +369,14 @@ template<> EIGEN_STRONG_INLINE std::complex<double>  pfirst<Packet1cd>(const Pac
 template<> EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { return a; }
 template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a) { return pfirst(a); }
 template<> EIGEN_STRONG_INLINE Packet1cd preduxp<Packet1cd>(const Packet1cd* vecs)        { return vecs[0]; }
 template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a) { return pfirst(a); }
 template<int Offset>
 struct palign_impl<Offset,Packet1cd>
 {
  static EIGEN_STRONG_INLINE void run(Packet1cd& /*first*/, const Packet1cd& /*second*/)
  {
    // FIXME is it sure we never have to align a Packet1cd?
    // Even though a std::complex<double> has 16 bytes, it is not necessarily aligned on a 16 bytes boundary...
  }
 };
 template<> struct conj_helper<Packet1cd, Packet1cd, false,true>
 {
  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
  { return padd(pmul(x,y),c); }
  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
  {
    return internal::pmul(a, pconj(b));
  }
 };
 template<> struct conj_helper<Packet1cd, Packet1cd, true,false>
 {
  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
  { return padd(pmul(x,y),c); }
  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
  {
    return internal::pmul(pconj(a), b);
  }
 };
 template<> struct conj_helper<Packet1cd, Packet1cd, true,true>
 {
  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
  { return padd(pmul(x,y),c); }
  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
  {
    return pconj(internal::pmul(a, b));
  }
 };
 EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d)
 template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
 {
-  // TODO optimize it for AltiVec
+  return pdiv_complex(a, b);
  Packet1cd res = conj_helper<Packet1cd,Packet1cd,false,true>().pmul(a,b);
  Packet2d s = pmul<Packet2d>(b.v, b.v);
  return Packet1cd(pdiv(res.v, padd<Packet2d>(s, vec_perm(s, s, p16uc_REVERSE64))));
 }
 EIGEN_STRONG_INLINE Packet1cd pcplxflip/*<Packet1cd>*/(const Packet1cd& x)
@@ -439,7 +402,12 @@ template<> EIGEN_STRONG_INLINE Packet1cd pcmp_eq(const Packet1cd& a, const Packe
  return Packet1cd(vec_and(eq, eq_swapped));
 }
-#endif // __VSX__
+template<> EIGEN_STRONG_INLINE Packet1cd psqrt<Packet1cd>(const Packet1cd& a)
 {
  return psqrt_complex<Packet1cd>(a);
 }
 #endif // EIGEN_VECTORIZE_VSX
 } // end namespace internal
 } // end namespace Eigen
--- a/Eigen/src/Core/arch/AltiVec/MathFunctions.h
+++ b/Eigen/src/Core/arch/AltiVec/MathFunctions.h
@@ -40,16 +40,14 @@ Packet4f pcos<Packet4f>(const Packet4f& _x)
  return pcos_float(_x);
 }
 #ifdef EIGEN_VECTORIZE_VSX
 #ifndef EIGEN_COMP_CLANG
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet4f prsqrt<Packet4f>(const Packet4f& x)
 {
  return  vec_rsqrt(x);
 }
 #endif
 #ifdef __VSX__
 #ifndef EIGEN_COMP_CLANG
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet2d prsqrt<Packet2d>(const Packet2d& x)
 {
@@ -57,7 +55,7 @@ Packet2d prsqrt<Packet2d>(const Packet2d& x)
 }
 #endif
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet4f psqrt<Packet4f>(const Packet4f& x)
 {
  return  vec_sqrt(x);
@@ -69,12 +67,43 @@ Packet2d psqrt<Packet2d>(const Packet2d& x)
  return  vec_sqrt(x);
 }
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+#if !EIGEN_COMP_CLANG
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet4f prsqrt<Packet4f>(const Packet4f& x)
 {
  return pset1<Packet4f>(1.0f) / psqrt<Packet4f>(x);
 //  vec_rsqrt returns different results from the generic version
 //  return  vec_rsqrt(x);
 }
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet2d prsqrt<Packet2d>(const Packet2d& x)
 {
  return pset1<Packet2d>(1.0) / psqrt<Packet2d>(x);
 //  vec_rsqrt returns different results from the generic version
 //  return  vec_rsqrt(x);
 }
 #endif
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet2d pexp<Packet2d>(const Packet2d& _x)
 {
  return pexp_double(_x);
 }
-#endif
+
 template<> EIGEN_STRONG_INLINE Packet8bf psqrt<Packet8bf> (const Packet8bf& a){
  BF16_TO_F32_UNARY_OP_WRAPPER(vec_sqrt, a);
 }
 template<> EIGEN_STRONG_INLINE Packet8bf prsqrt<Packet8bf> (const Packet8bf& a){
  BF16_TO_F32_UNARY_OP_WRAPPER(prsqrt<Packet4f>, a);
 }
 template<> EIGEN_STRONG_INLINE Packet8bf pexp<Packet8bf> (const Packet8bf& a){
  BF16_TO_F32_UNARY_OP_WRAPPER(pexp_float, a);
 }
 #endif  // EIGEN_VECTORIZE_VSX
 // Hyperbolic Tangent function.
 template <>
--- a/Eigen/src/Core/arch/AltiVec/MatrixProduct.h
+++ b/Eigen/src/Core/arch/AltiVec/MatrixProduct.h
--- a/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h
+++ b/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h
@@ -0,0 +1,159 @@
 //#define EIGEN_POWER_USE_PREFETCH  // Use prefetching in gemm routines
 #ifdef EIGEN_POWER_USE_PREFETCH
 #define EIGEN_POWER_PREFETCH(p)  prefetch(p)
 #else
 #define EIGEN_POWER_PREFETCH(p)
 #endif
 namespace Eigen {
 namespace internal {
 template<typename Scalar, typename Packet, typename DataMapper, typename Index, const Index accRows, const Index accCols>
 EIGEN_ALWAYS_INLINE void gemm_extra_row(
  const DataMapper& res,
  const Scalar* lhs_base,
  const Scalar* rhs_base,
  Index depth,
  Index strideA,
  Index offsetA,
  Index row,
  Index col,
  Index rows,
  Index cols,
  Index remaining_rows,
  const Packet& pAlpha,
  const Packet& pMask);
 template<typename Scalar, typename Packet, typename DataMapper, typename Index, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
 EIGEN_STRONG_INLINE void gemm_extra_cols(
  const DataMapper& res,
  const Scalar* blockA,
  const Scalar* blockB,
  Index depth,
  Index strideA,
  Index offsetA,
  Index strideB,
  Index offsetB,
  Index col,
  Index rows,
  Index cols,
  Index remaining_rows,
  const Packet& pAlpha,
  const Packet& pMask);
 template<typename Packet>
 EIGEN_ALWAYS_INLINE Packet bmask(const int remaining_rows);
 template<typename Scalar, typename Packet, typename Packetc, typename DataMapper, typename Index, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
 EIGEN_ALWAYS_INLINE void gemm_complex_extra_row(
  const DataMapper& res,
  const Scalar* lhs_base,
  const Scalar* rhs_base,
  Index depth,
  Index strideA,
  Index offsetA,
  Index strideB,
  Index row,
  Index col,
  Index rows,
  Index cols,
  Index remaining_rows,
  const Packet& pAlphaReal,
  const Packet& pAlphaImag,
  const Packet& pMask);
 template<typename Scalar, typename Packet, typename Packetc, typename DataMapper, typename Index, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
 EIGEN_STRONG_INLINE void gemm_complex_extra_cols(
  const DataMapper& res,
  const Scalar* blockA,
  const Scalar* blockB,
  Index depth,
  Index strideA,
  Index offsetA,
  Index strideB,
  Index offsetB,
  Index col,
  Index rows,
  Index cols,
  Index remaining_rows,
  const Packet& pAlphaReal,
  const Packet& pAlphaImag,
  const Packet& pMask);
 template<typename Scalar, typename Packet>
 EIGEN_ALWAYS_INLINE Packet ploadLhs(const Scalar* lhs);
 template<typename DataMapper, typename Packet, typename Index, const Index accCols, int StorageOrder, bool Complex, int N>
 EIGEN_ALWAYS_INLINE void bload(PacketBlock<Packet,N>& acc, const DataMapper& res, Index row, Index col);
 template<typename Packet, int N>
 EIGEN_ALWAYS_INLINE void bscale(PacketBlock<Packet,N>& acc, PacketBlock<Packet,N>& accZ, const Packet& pAlpha);
 template<typename Packet, int N>
 EIGEN_ALWAYS_INLINE void bscalec(PacketBlock<Packet,N>& aReal, PacketBlock<Packet,N>& aImag, const Packet& bReal, const Packet& bImag, PacketBlock<Packet,N>& cReal, PacketBlock<Packet,N>& cImag);
 // Grab two decoupled real/imaginary PacketBlocks and return two coupled (real/imaginary pairs) PacketBlocks.
 template<typename Packet, typename Packetc, int N>
 EIGEN_ALWAYS_INLINE void bcouple_common(PacketBlock<Packet,N>& taccReal, PacketBlock<Packet,N>& taccImag, PacketBlock<Packetc, N>& acc1, PacketBlock<Packetc, N>& acc2)
 {
  acc1.packet[0].v = vec_mergeh(taccReal.packet[0], taccImag.packet[0]);
  if (N > 1) {
    acc1.packet[1].v = vec_mergeh(taccReal.packet[1], taccImag.packet[1]);
  }
  if (N > 2) {
    acc1.packet[2].v = vec_mergeh(taccReal.packet[2], taccImag.packet[2]);
  }
  if (N > 3) {
    acc1.packet[3].v = vec_mergeh(taccReal.packet[3], taccImag.packet[3]);
  }
  acc2.packet[0].v = vec_mergel(taccReal.packet[0], taccImag.packet[0]);
  if (N > 1) {
    acc2.packet[1].v = vec_mergel(taccReal.packet[1], taccImag.packet[1]);
  }
  if (N > 2) {
    acc2.packet[2].v = vec_mergel(taccReal.packet[2], taccImag.packet[2]);
  }
  if (N > 3) {
    acc2.packet[3].v = vec_mergel(taccReal.packet[3], taccImag.packet[3]);
  }
 }
 template<typename Packet, typename Packetc, int N>
 EIGEN_ALWAYS_INLINE void bcouple(PacketBlock<Packet,N>& taccReal, PacketBlock<Packet,N>& taccImag, PacketBlock<Packetc,N*2>& tRes, PacketBlock<Packetc, N>& acc1, PacketBlock<Packetc, N>& acc2)
 {
  bcouple_common<Packet, Packetc, N>(taccReal, taccImag, acc1, acc2);
  acc1.packet[0] = padd<Packetc>(tRes.packet[0], acc1.packet[0]);
  if (N > 1) {
    acc1.packet[1] = padd<Packetc>(tRes.packet[1], acc1.packet[1]);
  }
  if (N > 2) {
    acc1.packet[2] = padd<Packetc>(tRes.packet[2], acc1.packet[2]);
  }
  if (N > 3) {
    acc1.packet[3] = padd<Packetc>(tRes.packet[3], acc1.packet[3]);
  }
  acc2.packet[0] = padd<Packetc>(tRes.packet[0+N], acc2.packet[0]);
  if (N > 1) {
    acc2.packet[1] = padd<Packetc>(tRes.packet[1+N], acc2.packet[1]);
  }
  if (N > 2) {
    acc2.packet[2] = padd<Packetc>(tRes.packet[2+N], acc2.packet[2]);
  }
  if (N > 3) {
    acc2.packet[3] = padd<Packetc>(tRes.packet[3+N], acc2.packet[3]);
  }
 }
 // This is necessary because ploadRhs for double returns a pair of vectors when MMA is enabled.
 template<typename Scalar, typename Packet>
 EIGEN_ALWAYS_INLINE Packet ploadRhs(const Scalar* rhs)
 {
  return ploadu<Packet>(rhs);
 }
 } // end namespace internal
 } // end namespace Eigen
--- a/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h
+++ b/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h
@@ -0,0 +1,627 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
 // Copyright (C) 2020 Everton Constantino (everton.constantino@ibm.com)
 // Copyright (C) 2021 Chip Kerchner (chip.kerchner@ibm.com)
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 #ifndef EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
 #define EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
 // If using dynamic dispatch, set the CPU target.
 #if defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH)
 #pragma GCC push_options
 #pragma GCC target("cpu=power10,htm")
 #endif
 #ifdef __has_builtin
 #if !__has_builtin(__builtin_vsx_assemble_pair)
 #define __builtin_vsx_assemble_pair __builtin_mma_assemble_pair
 #endif
 #endif
 namespace Eigen {
 namespace internal {
 // Resets the MMA accumulator pointed to by `acc` through __builtin_mma_xxsetaccz.
 // Scalar and Packet are not used by the body; callers pass them explicitly.
 template <typename Scalar, typename Packet>
 EIGEN_ALWAYS_INLINE void bsetzeroMMA(__vector_quad* acc) {
  __builtin_mma_xxsetaccz(acc);
 }
 // Flushes one MMA accumulator into the result mapper:
 //   1. disassemble *acc into four Packets,
 //   2. load the current four-packet block of `data` at (i, 0),
 //   3. combine both with bscale(..., alpha) and store the block back at (i, 0).
 // NOTE(review): bscale presumably computes dst += alpha * acc -- confirm against MatrixProductCommon.h.
 template <typename DataMapper, typename Index, typename Packet, const Index accCols>
 EIGEN_ALWAYS_INLINE void storeAccumulator(Index i, const DataMapper& data, const Packet& alpha,
                                           __vector_quad* acc) {
  PacketBlock<Packet, 4> accPackets;
  __builtin_mma_disassemble_acc(&accPackets.packet, acc);
  PacketBlock<Packet, 4> dst;
  bload<DataMapper, Packet, Index, accCols, ColMajor, false, 4>(dst, data, i, 0);
  bscale<Packet, 4>(dst, accPackets, alpha);
  data.template storePacketBlock<Packet, 4>(i, 0, dst);
 }
 // Flushes a pair of MMA accumulators (real and imaginary parts) into a complex result mapper.
 // Steps: disassemble both accumulators, load the eight complex packets currently stored at (i, 0), scale the
 // accumulators by (alphaReal, alphaImag) with bscalec, couple real/imag into complex packets and add the loaded
 // values with bcouple, then store the two halves at rows i and i + accColsC.
 template <typename DataMapper, typename Index, typename Packet, typename Packetc, const Index accColsC>
 EIGEN_ALWAYS_INLINE void storeComplexAccumulator(Index i, const DataMapper& data, const Packet& alphaReal,
                                                  const Packet& alphaImag, __vector_quad* accReal,
                                                  __vector_quad* accImag) {
  PacketBlock<Packet, 4> realPackets, imagPackets;
  __builtin_mma_disassemble_acc(&realPackets.packet, accReal);
  __builtin_mma_disassemble_acc(&imagPackets.packet, accImag);
  PacketBlock<Packetc, 8> loaded;
  bload<DataMapper, Packetc, Index, accColsC, ColMajor, true, 4>(loaded, data, i, 0);
  PacketBlock<Packet, 4> scaledReal, scaledImag;
  bscalec<Packet, 4>(realPackets, imagPackets, alphaReal, alphaImag, scaledReal, scaledImag);
  PacketBlock<Packetc, 4> upperHalf, lowerHalf;
  bcouple<Packet, Packetc, 4>(scaledReal, scaledImag, loaded, upperHalf, lowerHalf);
  data.template storePacketBlock<Packetc, 4>(i, 0, upperHalf);
  data.template storePacketBlock<Packetc, 4>(i + accColsC, 0, lowerHalf);
 }
 // Generic MMA outer-product accumulate; this primary overload handles float32 operands.
 // (Eigen still supports C++03, so default template arguments cannot be used to select the type.)
 // NegativeAccumulate == true selects __builtin_mma_xvf32gernp, otherwise __builtin_mma_xvf32gerpp.
 template <typename LhsPacket, typename RhsPacket, bool NegativeAccumulate>
 EIGEN_ALWAYS_INLINE void pgerMMA(__vector_quad* acc, const RhsPacket& a, const LhsPacket& b) {
  if (!NegativeAccumulate) {
    __builtin_mma_xvf32gerpp(acc, (__vector unsigned char)a, (__vector unsigned char)b);
  } else {
    __builtin_mma_xvf32gernp(acc, (__vector unsigned char)a, (__vector unsigned char)b);
  }
 }
 // float64 overload taking the first operand as a block of two Packet2d.
 // The block storage is reinterpreted as a __vector_pair and fed to the xvf64ger builtins
 // (gernp when NegativeAccumulate is true, gerpp otherwise).
 template <typename LhsPacket, typename RhsPacket, bool NegativeAccumulate>
 EIGEN_ALWAYS_INLINE void pgerMMA(__vector_quad* acc, const PacketBlock<Packet2d, 2>& a, const Packet2d& b) {
  __vector_pair* pairOperand = (__vector_pair*)(&a.packet[0]);
  if (!NegativeAccumulate) {
    __builtin_mma_xvf64gerpp(acc, *pairOperand, (__vector unsigned char)b);
  } else {
    __builtin_mma_xvf64gernp(acc, *pairOperand, (__vector unsigned char)b);
  }
 }
 // float64 overload taking the first operand directly as a __vector_pair.
 // NegativeAccumulate == true selects __builtin_mma_xvf64gernp, otherwise __builtin_mma_xvf64gerpp.
 template <typename LhsPacket, typename RhsPacket, bool NegativeAccumulate>
 EIGEN_ALWAYS_INLINE void pgerMMA(__vector_quad* acc, const __vector_pair& a, const Packet2d& b) {
  if (!NegativeAccumulate) {
    __builtin_mma_xvf64gerpp(acc, (__vector_pair)a, (__vector unsigned char)b);
  } else {
    __builtin_mma_xvf64gernp(acc, (__vector_pair)a, (__vector unsigned char)b);
  }
 }
 // Intentionally empty overload for the (__vector_pair, Packet4f) combination.
 // It performs no work and exists only so that code mentioning this combination compiles.
 template <typename LhsPacket, typename RhsPacket, bool NegativeAccumulate>
 EIGEN_ALWAYS_INLINE void pgerMMA(__vector_quad*, const __vector_pair&, const Packet4f&) {
  // Just for compilation
 }
 // Complex outer-product accumulate built from real pgerMMA calls.
 //   accReal always receives rhsV * lhsV.
 //   Real lhs:     accImag receives rhsVi * lhsV (negated when ConjugateRhs).
 //   Complex lhs:  when the rhs is complex too, accReal receives rhsVi * lhsVi (negated when
 //                 ConjugateLhs == ConjugateRhs) and accImag receives rhsVi * lhsV (negated when ConjugateRhs);
 //                 in all complex-lhs cases accImag finally receives rhsV * lhsVi (negated when ConjugateLhs).
 template <typename Scalar, typename Packet, typename RhsPacket, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal,
           bool RhsIsReal>
 EIGEN_ALWAYS_INLINE void pgercMMA(__vector_quad* accReal, __vector_quad* accImag, const Packet& lhsV,
                                   const Packet& lhsVi, const RhsPacket& rhsV, const RhsPacket& rhsVi) {
  pgerMMA<Packet, RhsPacket, false>(accReal, rhsV, lhsV);
  if (LhsIsReal) {
    // Real lhs: only the imaginary part of the rhs contributes to accImag.
    pgerMMA<Packet, RhsPacket, ConjugateRhs>(accImag, rhsVi, lhsV);
    return;
  }
  if (RhsIsReal) {
    EIGEN_UNUSED_VARIABLE(rhsVi);
  } else {
    pgerMMA<Packet, RhsPacket, ConjugateLhs == ConjugateRhs>(accReal, rhsVi, lhsVi);
    pgerMMA<Packet, RhsPacket, ConjugateRhs>(accImag, rhsVi, lhsV);
  }
  pgerMMA<Packet, RhsPacket, ConjugateLhs>(accImag, rhsV, lhsVi);
 }
 // Generic MMA right-hand-side load: stores ploadRhs<Scalar, Packet>(rhs) into rhsV.
 // The result is returned through an out-parameter because ploadRhs for double returns a pair of vectors
 // when MMA is enabled (see the specializations that follow).
 template <typename Scalar, typename Packet>
 EIGEN_ALWAYS_INLINE void ploadRhsMMA(const Scalar* rhs, Packet& rhsV) {
  rhsV = ploadRhs<Scalar, Packet>(rhs);
 }
 // double specialization producing a block of two Packet2d: packet[0] is loaded from rhs and packet[1] from
 // one Packet2d further on.
 template <>
 EIGEN_ALWAYS_INLINE void ploadRhsMMA<double, PacketBlock<Packet2d, 2> >(const double* rhs,
                                                                         PacketBlock<Packet2d, 2>& rhsV) {
  const Packet2d* rhsAsPackets = (const Packet2d*)rhs;
  rhsV.packet[0] = ploadRhs<double, Packet2d>((const double*)(rhsAsPackets));
  rhsV.packet[1] = ploadRhs<double, Packet2d>((const double*)(rhsAsPackets + 1));
 }
 // double specialization producing a __vector_pair.
 //  - LLVM builds: the pair is assembled from two Packet2d loads. Note the operand order: the packet located
 //    one Packet2d past `rhs` is passed first and the packet at `rhs` second.
 //    (__builtin_vsx_assemble_pair is mapped to __builtin_mma_assemble_pair at the top of this file when the
 //    former is not available.)
 //  - Other compilers: a single `lxvp` inline-asm instruction loads the pair from *rhs.
 template<>
 EIGEN_ALWAYS_INLINE void ploadRhsMMA<double, __vector_pair>(const double* rhs, __vector_pair& rhsV)
 {
 #if EIGEN_COMP_LLVM
  __builtin_vsx_assemble_pair(&rhsV,
    (__vector unsigned char)(ploadRhs<double, Packet2d>((const double *)(((Packet2d *)rhs) + 1))),
    (__vector unsigned char)(ploadRhs<double, Packet2d>((const double *)((Packet2d *)rhs      ))));
 #else
  __asm__ ("lxvp %x0,%1" : "=wa" (rhsV) : "Y" (*rhs));
 #endif
 }
 // Intentionally empty specialization for (const float*, __vector_pair&).
 // It loads nothing and exists only so that code mentioning this combination compiles.
 template <>
 EIGEN_ALWAYS_INLINE void ploadRhsMMA(const float*, __vector_pair&) {
  // Just for compilation
 }
 // Helper macros for the real-valued MMA micro-kernel (gemm_unrolled_MMA_iteration).
 // They rely on names that exist at the expansion site: unroll_factor, Scalar, Packet, RhsPacket, accRows,
 // accCols, rhs_ptr, lhs_base, strideA, row, res, pAlpha, lhs_ptr<k> and accZero<k>.
 //
 // PEEL_MMA loop factor: number of depth steps consumed by one iteration of the peeled main loop.
 #define PEEL_MMA 7
 // Expands func(k) for the eight slots k = 0..7; each expansion is guarded by `unroll_factor > k`.
 #define MICRO_MMA_UNROLL(func) \
  func(0) func(1) func(2) func(3) func(4) func(5) func(6) func(7)
 // Loads the next lhs packet of slot `iter` into lhsV<iter> and advances lhs_ptr<iter> by accCols.
 #define MICRO_MMA_LOAD_ONE(iter) \
  if (unroll_factor > iter) { \
    lhsV##iter = ploadLhs<Scalar, Packet>(lhs_ptr##iter); \
    lhs_ptr##iter += accCols; \
  } else { \
    EIGEN_UNUSED_VARIABLE(lhsV##iter); \
  }
 // Accumulates rhsV<peel> x lhsV<iter> into accZero<iter>.
 #define MICRO_MMA_WORK_ONE(iter, type, peel) \
  if (unroll_factor > iter) { \
    pgerMMA<Packet, type, false>(&accZero##iter, rhsV##peel, lhsV##iter); \
  }
 // One depth step `peel`: load the rhs at rhs_ptr + accRows * peel, load all active lhs packets (func2),
 // then run func for every slot. Steps with peel >= PEEL_MMA are disabled.
 #define MICRO_MMA_TYPE_PEEL(func, func2, type, peel) \
  if (PEEL_MMA > peel) { \
    Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4, lhsV5, lhsV6, lhsV7; \
    ploadRhsMMA<Scalar, type>(rhs_ptr + (accRows * peel), rhsV##peel); \
    MICRO_MMA_UNROLL(func2); \
    func(0,type,peel) func(1,type,peel) func(2,type,peel) func(3,type,peel) \
    func(4,type,peel) func(5,type,peel) func(6,type,peel) func(7,type,peel) \
  } else { \
    EIGEN_UNUSED_VARIABLE(rhsV##peel); \
  }
 // Declares rhsV0..rhsV7 and expands depth steps 0..7 (step 7 is inactive while PEEL_MMA is 7).
 #define MICRO_MMA_UNROLL_TYPE_PEEL(func, func2, type) \
  type rhsV0, rhsV1, rhsV2, rhsV3, rhsV4, rhsV5, rhsV6, rhsV7; \
  MICRO_MMA_TYPE_PEEL(func,func2,type,0); MICRO_MMA_TYPE_PEEL(func,func2,type,1); \
  MICRO_MMA_TYPE_PEEL(func,func2,type,2); MICRO_MMA_TYPE_PEEL(func,func2,type,3); \
  MICRO_MMA_TYPE_PEEL(func,func2,type,4); MICRO_MMA_TYPE_PEEL(func,func2,type,5); \
  MICRO_MMA_TYPE_PEEL(func,func2,type,6); MICRO_MMA_TYPE_PEEL(func,func2,type,7);
 // Single (non-peeled) depth step.
 #define MICRO_MMA_UNROLL_TYPE_ONE(func, func2, type) \
  type rhsV0; \
  MICRO_MMA_TYPE_PEEL(func,func2,type,0);
 // Peeled body: the rhs type is RhsPacket when sizeof(Scalar) == sizeof(float), otherwise __vector_pair;
 // afterwards rhs_ptr advances by PEEL_MMA depth steps.
 #define MICRO_MMA_ONE_PEEL \
  if (sizeof(Scalar) == sizeof(float)) { \
    MICRO_MMA_UNROLL_TYPE_PEEL(MICRO_MMA_WORK_ONE, MICRO_MMA_LOAD_ONE, RhsPacket); \
  } else { \
    MICRO_MMA_UNROLL_TYPE_PEEL(MICRO_MMA_WORK_ONE, MICRO_MMA_LOAD_ONE, __vector_pair); \
  } \
  rhs_ptr += (accRows * PEEL_MMA);
 // Remainder body: same type selection as above, one depth step, rhs_ptr advances by accRows.
 #define MICRO_MMA_ONE \
  if (sizeof(Scalar) == sizeof(float)) { \
    MICRO_MMA_UNROLL_TYPE_ONE(MICRO_MMA_WORK_ONE, MICRO_MMA_LOAD_ONE, RhsPacket); \
  } else { \
    MICRO_MMA_UNROLL_TYPE_ONE(MICRO_MMA_WORK_ONE, MICRO_MMA_LOAD_ONE, __vector_pair); \
  } \
  rhs_ptr += accRows;
 // Zeroes accumulator accZero<iter> for every active slot.
 #define MICRO_MMA_DST_PTR_ONE(iter) \
  if (unroll_factor > iter) { \
    bsetzeroMMA<Scalar, Packet>(&accZero##iter); \
  } else { \
    EIGEN_UNUSED_VARIABLE(accZero##iter); \
  }
 #define MICRO_MMA_DST_PTR MICRO_MMA_UNROLL(MICRO_MMA_DST_PTR_ONE)
 // Points lhs_ptr<iter> at lhs_base + ((row / accCols) + iter) * strideA * accCols for every active slot.
 #define MICRO_MMA_SRC_PTR_ONE(iter) \
  if (unroll_factor > iter) { \
    lhs_ptr##iter = lhs_base + ( (row/accCols) + iter )*strideA*accCols; \
  } else { \
    EIGEN_UNUSED_VARIABLE(lhs_ptr##iter); \
  }
 #define MICRO_MMA_SRC_PTR MICRO_MMA_UNROLL(MICRO_MMA_SRC_PTR_ONE)
 // Issues EIGEN_POWER_PREFETCH on the lhs pointer of every active slot.
 #define MICRO_MMA_PREFETCH_ONE(iter) \
  if (unroll_factor > iter) { \
    EIGEN_POWER_PREFETCH(lhs_ptr##iter); \
  }
 #define MICRO_MMA_PREFETCH MICRO_MMA_UNROLL(MICRO_MMA_PREFETCH_ONE)
 // Writes accZero<iter> to the result at row + iter * accCols through storeAccumulator.
 #define MICRO_MMA_STORE_ONE(iter) \
  if (unroll_factor > iter) { \
    storeAccumulator<DataMapper, Index, Packet, accCols>(row + iter*accCols, res, pAlpha, &accZero##iter); \
  }
 #define MICRO_MMA_STORE MICRO_MMA_UNROLL(MICRO_MMA_STORE_ONE)
 // Real-valued MMA micro-kernel: computes `unroll_factor` row blocks of accCols rows each, starting at `row`,
 // against the rhs panel at rhs_base, and adds the alpha-scaled result into `res`.
 //   res        result mapper for the current column panel
 //   lhs_base   packed lhs; row block k starts at lhs_base + ((row / accCols) + k) * strideA * accCols
 //   rhs_base   packed rhs panel, walked accRows scalars per depth step
 //   depth      number of depth (k) steps
 //   row        in/out: first row to process; advanced by unroll_factor * accCols on return
 //   pAlpha     scaling packet forwarded to storeAccumulator
 // The local variable names below are referenced by the MICRO_MMA_* macros and must not be renamed.
 template<int unroll_factor, typename Scalar, typename Packet, typename RhsPacket, typename DataMapper, typename Index, const Index accRows, const Index accCols>
 EIGEN_ALWAYS_INLINE void gemm_unrolled_MMA_iteration(
  const DataMapper& res,
  const Scalar* lhs_base,
  const Scalar* rhs_base,
  Index depth,
  Index strideA,
  Index& row,
  const Packet& pAlpha)
 {
  const Scalar* rhs_ptr = rhs_base;
  const Scalar* lhs_ptr0 = NULL, * lhs_ptr1 = NULL, * lhs_ptr2 = NULL, * lhs_ptr3 = NULL, * lhs_ptr4 = NULL, * lhs_ptr5 = NULL, * lhs_ptr6 = NULL, * lhs_ptr7 = NULL;
  __vector_quad accZero0, accZero1, accZero2, accZero3, accZero4, accZero5, accZero6, accZero7;
  // Set up the lhs pointers and zero the accumulators of the active slots.
  MICRO_MMA_SRC_PTR
  MICRO_MMA_DST_PTR
  Index k = 0;
  // Main loop: PEEL_MMA depth steps per iteration, with prefetch of the rhs and lhs streams.
  for(; k + PEEL_MMA <= depth; k+= PEEL_MMA)
  {
    EIGEN_POWER_PREFETCH(rhs_ptr);
    MICRO_MMA_PREFETCH
    MICRO_MMA_ONE_PEEL
  }
  // Remainder loop: one depth step at a time.
  for(; k < depth; k++)
  {
    MICRO_MMA_ONE
  }
  // Write the accumulators back and report the consumed rows to the caller.
  MICRO_MMA_STORE
  row += unroll_factor*accCols;
 }
 // Computes the result column panel that starts at column `col` (the caller advances `col` by accRows).
 // Full row blocks of accCols rows are processed MAX_MMA_UNROLL at a time; the remaining full blocks
 // (fewer than MAX_MMA_UNROLL) go through one specialised call chosen by the switch; finally the
 // `remaining_rows` partial block, if any, is delegated to gemm_extra_row.
 template <typename Scalar, typename Packet, typename RhsPacket, typename DataMapper, typename Index,
           const Index accRows, const Index accCols>
 EIGEN_ALWAYS_INLINE void gemmMMA_cols(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, Index depth,
                                       Index strideA, Index offsetA, Index strideB, Index offsetB, Index col,
                                       Index rows, Index cols, Index remaining_rows, const Packet& pAlpha,
                                       const Packet& pMask) {
  const DataMapper panelRes = res.getSubMapper(0, col);
  const Scalar* rhsPanel = blockB + col * strideB + accRows * offsetB;
  const Scalar* lhsPanel = blockA + accCols * offsetA;
  Index rowPos = 0;
 #define MAX_MMA_UNROLL 7
  // Each call advances rowPos by MAX_MMA_UNROLL * accCols.
  while (rowPos + MAX_MMA_UNROLL * accCols <= rows) {
    gemm_unrolled_MMA_iteration<MAX_MMA_UNROLL, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(
        panelRes, lhsPanel, rhsPanel, depth, strideA, rowPos, pAlpha);
  }
  // Fewer than MAX_MMA_UNROLL full blocks are left here, so `case MAX_MMA_UNROLL` is compiled out.
  switch ((rows - rowPos) / accCols) {
 #if MAX_MMA_UNROLL > 7
    case 7:
      gemm_unrolled_MMA_iteration<7, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(
          panelRes, lhsPanel, rhsPanel, depth, strideA, rowPos, pAlpha);
      break;
 #endif
 #if MAX_MMA_UNROLL > 6
    case 6:
      gemm_unrolled_MMA_iteration<6, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(
          panelRes, lhsPanel, rhsPanel, depth, strideA, rowPos, pAlpha);
      break;
 #endif
 #if MAX_MMA_UNROLL > 5
    case 5:
      gemm_unrolled_MMA_iteration<5, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(
          panelRes, lhsPanel, rhsPanel, depth, strideA, rowPos, pAlpha);
      break;
 #endif
 #if MAX_MMA_UNROLL > 4
    case 4:
      gemm_unrolled_MMA_iteration<4, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(
          panelRes, lhsPanel, rhsPanel, depth, strideA, rowPos, pAlpha);
      break;
 #endif
 #if MAX_MMA_UNROLL > 3
    case 3:
      gemm_unrolled_MMA_iteration<3, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(
          panelRes, lhsPanel, rhsPanel, depth, strideA, rowPos, pAlpha);
      break;
 #endif
 #if MAX_MMA_UNROLL > 2
    case 2:
      gemm_unrolled_MMA_iteration<2, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(
          panelRes, lhsPanel, rhsPanel, depth, strideA, rowPos, pAlpha);
      break;
 #endif
 #if MAX_MMA_UNROLL > 1
    case 1:
      gemm_unrolled_MMA_iteration<1, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(
          panelRes, lhsPanel, rhsPanel, depth, strideA, rowPos, pAlpha);
      break;
 #endif
    default:
      break;
  }
 #undef MAX_MMA_UNROLL
  // Partial row block: note that gemm_extra_row receives blockA, not the offset lhs pointer.
  if (remaining_rows > 0) {
    gemm_extra_row<Scalar, Packet, DataMapper, Index, accRows, accCols>(panelRes, blockA, rhsPanel, depth, strideA,
                                                                        offsetA, rowPos, col, rows, cols,
                                                                        remaining_rows, pAlpha, pMask);
  }
 }
 // Real-valued GEMM driver for the MMA kernels.
 // A stride of -1 means "use depth". Column panels of accRows columns are handled by gemmMMA_cols; the columns
 // left over (fewer than accRows) are handled by gemm_extra_cols.
 template <typename Scalar, typename Index, typename Packet, typename RhsPacket, typename DataMapper,
           const Index accRows, const Index accCols>
 void gemmMMA(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, Index rows, Index depth, Index cols,
              Scalar alpha, Index strideA, Index strideB, Index offsetA, Index offsetB) {
  // Rows that do not fill a whole accCols block.
  const Index tailRows = rows % accCols;
  if (strideA == -1) {
    strideA = depth;
  }
  if (strideB == -1) {
    strideB = depth;
  }
  const Packet alphaPacket = pset1<Packet>(alpha);
  const Packet tailMask = bmask<Packet>((const int)(tailRows));
  Index col = 0;
  while (col + accRows <= cols) {
    gemmMMA_cols<Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(
        res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, tailRows, alphaPacket,
        tailMask);
    col += accRows;
  }
  gemm_extra_cols<Scalar, Packet, DataMapper, Index, accCols>(res, blockA, blockB, depth, strideA, offsetA, strideB,
                                                              offsetB, col, rows, cols, tailRows, alphaPacket,
                                                              tailMask);
 }
 // Helper macros for the complex MMA micro-kernel (gemm_complex_unrolled_MMA_iteration).
 // They rely on names that exist at the expansion site: unroll_factor, Scalar, Packet, Packetc, RhsPacket,
 // accRows, accCols, LhsIsReal, RhsIsReal, ConjugateLhs, ConjugateRhs, rhs_ptr_real, rhs_ptr_imag, imag_delta,
 // lhs_base, strideA, row, res, pAlphaReal, pAlphaImag, lhs_ptr_real<k>, accReal<k> and accImag<k>.
 //
 // accColsC: half of accCols. advanceRows / advanceCols: 1 when the lhs / rhs is real, 2 otherwise.
 #define accColsC (accCols / 2)
 #define advanceRows ((LhsIsReal) ? 1 : 2)
 #define advanceCols ((RhsIsReal) ? 1 : 2)
 // PEEL_COMPLEX_MMA loop factor: number of depth steps consumed by one iteration of the peeled main loop.
 #define PEEL_COMPLEX_MMA 3
 // Expands func(k) for the four slots k = 0..3; each expansion is guarded by `unroll_factor > k`.
 #define MICRO_COMPLEX_MMA_UNROLL(func) \
  func(0) func(1) func(2) func(3)
 // Loads the real lhs packet of slot `iter` (and, for a complex lhs, the imaginary packet located imag_delta
 // scalars further on), then advances lhs_ptr_real<iter> by accCols.
 #define MICRO_COMPLEX_MMA_LOAD_ONE(iter) \
  if (unroll_factor > iter) { \
    lhsV##iter = ploadLhs<Scalar, Packet>(lhs_ptr_real##iter); \
    if(!LhsIsReal) { \
      lhsVi##iter = ploadLhs<Scalar, Packet>(lhs_ptr_real##iter + imag_delta); \
    } else { \
      EIGEN_UNUSED_VARIABLE(lhsVi##iter); \
    } \
    lhs_ptr_real##iter += accCols; \
  } else { \
    EIGEN_UNUSED_VARIABLE(lhsV##iter); \
    EIGEN_UNUSED_VARIABLE(lhsVi##iter); \
  }
 // Accumulates the complex product of slot `iter` and depth step `peel` into accReal<iter> / accImag<iter>.
 #define MICRO_COMPLEX_MMA_WORK_ONE(iter, type, peel) \
  if (unroll_factor > iter) { \
    pgercMMA<Scalar, Packet, type, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal##iter, &accImag##iter, lhsV##iter, lhsVi##iter, rhsV##peel, rhsVi##peel); \
  }
 // One depth step `peel`: load the real rhs (and the imaginary rhs unless RhsIsReal), load all active lhs
 // packets (func2), then run func for every slot. Steps with peel >= PEEL_COMPLEX_MMA are disabled.
 #define MICRO_COMPLEX_MMA_TYPE_PEEL(func, func2, type, peel) \
  if (PEEL_COMPLEX_MMA > peel) { \
    Packet lhsV0, lhsV1, lhsV2, lhsV3; \
    Packet lhsVi0, lhsVi1, lhsVi2, lhsVi3; \
    ploadRhsMMA<Scalar, type>(rhs_ptr_real + (accRows * peel), rhsV##peel); \
    if(!RhsIsReal) { \
      ploadRhsMMA<Scalar, type>(rhs_ptr_imag + (accRows * peel), rhsVi##peel); \
    } else { \
      EIGEN_UNUSED_VARIABLE(rhsVi##peel); \
    } \
    MICRO_COMPLEX_MMA_UNROLL(func2); \
    func(0,type,peel) func(1,type,peel) func(2,type,peel) func(3,type,peel) \
  } else { \
    EIGEN_UNUSED_VARIABLE(rhsV##peel); \
    EIGEN_UNUSED_VARIABLE(rhsVi##peel); \
  }
 // Declares rhsV0..3 / rhsVi0..3 and expands depth steps 0..3 (step 3 is inactive while PEEL_COMPLEX_MMA is 3).
 #define MICRO_COMPLEX_MMA_UNROLL_TYPE_PEEL(func, func2, type) \
  type rhsV0, rhsV1, rhsV2, rhsV3; \
  type rhsVi0, rhsVi1, rhsVi2, rhsVi3; \
  MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,0); MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,1); \
  MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,2); MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,3);
 // Single (non-peeled) depth step.
 #define MICRO_COMPLEX_MMA_UNROLL_TYPE_ONE(func, func2, type) \
  type rhsV0, rhsVi0; \
  MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,0);
 // Peeled body: the rhs type is RhsPacket when sizeof(Scalar) == sizeof(float), otherwise __vector_pair;
 // afterwards the rhs pointers advance by PEEL_COMPLEX_MMA depth steps.
 #define MICRO_COMPLEX_MMA_ONE_PEEL \
  if (sizeof(Scalar) == sizeof(float)) { \
    MICRO_COMPLEX_MMA_UNROLL_TYPE_PEEL(MICRO_COMPLEX_MMA_WORK_ONE, MICRO_COMPLEX_MMA_LOAD_ONE, RhsPacket); \
  } else { \
    MICRO_COMPLEX_MMA_UNROLL_TYPE_PEEL(MICRO_COMPLEX_MMA_WORK_ONE, MICRO_COMPLEX_MMA_LOAD_ONE, __vector_pair); \
  } \
  rhs_ptr_real += (accRows * PEEL_COMPLEX_MMA); \
  if(!RhsIsReal) rhs_ptr_imag += (accRows * PEEL_COMPLEX_MMA);
 // Remainder body: same type selection as above, one depth step, the rhs pointers advance by accRows.
 #define MICRO_COMPLEX_MMA_ONE \
  if (sizeof(Scalar) == sizeof(float)) { \
    MICRO_COMPLEX_MMA_UNROLL_TYPE_ONE(MICRO_COMPLEX_MMA_WORK_ONE, MICRO_COMPLEX_MMA_LOAD_ONE, RhsPacket); \
  } else { \
    MICRO_COMPLEX_MMA_UNROLL_TYPE_ONE(MICRO_COMPLEX_MMA_WORK_ONE, MICRO_COMPLEX_MMA_LOAD_ONE, __vector_pair); \
  } \
  rhs_ptr_real += accRows; \
  if(!RhsIsReal) rhs_ptr_imag += accRows;
 // Zeroes accReal<iter> and accImag<iter> for every active slot.
 #define MICRO_COMPLEX_MMA_DST_PTR_ONE(iter) \
  if (unroll_factor > iter) { \
    bsetzeroMMA<Scalar, Packet>(&accReal##iter); \
    bsetzeroMMA<Scalar, Packet>(&accImag##iter); \
  } else { \
    EIGEN_UNUSED_VARIABLE(accReal##iter); \
    EIGEN_UNUSED_VARIABLE(accImag##iter); \
  }
 #define MICRO_COMPLEX_MMA_DST_PTR MICRO_COMPLEX_MMA_UNROLL(MICRO_COMPLEX_MMA_DST_PTR_ONE)
 // Points lhs_ptr_real<iter> at
 // lhs_base + (((advanceRows * row) / accCols) + iter * advanceRows) * strideA * accCols for every active slot.
 #define MICRO_COMPLEX_MMA_SRC_PTR_ONE(iter) \
  if (unroll_factor > iter) { \
    lhs_ptr_real##iter = lhs_base + ( ((advanceRows*row)/accCols) + iter*advanceRows )*strideA*accCols; \
  } else { \
    EIGEN_UNUSED_VARIABLE(lhs_ptr_real##iter); \
  }
 #define MICRO_COMPLEX_MMA_SRC_PTR MICRO_COMPLEX_MMA_UNROLL(MICRO_COMPLEX_MMA_SRC_PTR_ONE)
 // Issues EIGEN_POWER_PREFETCH on the lhs pointer of every active slot.
 #define MICRO_COMPLEX_MMA_PREFETCH_ONE(iter) \
  if (unroll_factor > iter) { \
    EIGEN_POWER_PREFETCH(lhs_ptr_real##iter); \
  }
 #define MICRO_COMPLEX_MMA_PREFETCH MICRO_COMPLEX_MMA_UNROLL(MICRO_COMPLEX_MMA_PREFETCH_ONE)
 // Writes accReal<iter> / accImag<iter> to the result at row + iter * accCols through storeComplexAccumulator.
 #define MICRO_COMPLEX_MMA_STORE_ONE(iter) \
  if (unroll_factor > iter) { \
    storeComplexAccumulator<DataMapper, Index, Packet, Packetc, accColsC>(row + iter*accCols, res, pAlphaReal, pAlphaImag, &accReal##iter, &accImag##iter); \
  }
 #define MICRO_COMPLEX_MMA_STORE MICRO_COMPLEX_MMA_UNROLL(MICRO_COMPLEX_MMA_STORE_ONE)
 // Complex MMA micro-kernel: computes `unroll_factor` row blocks of accCols rows each, starting at `row`,
 // keeping separate real and imaginary accumulators, and adds the alpha-scaled result into `res`.
 //   res          result mapper for the current column panel
 //   lhs_base     packed lhs; the imaginary part of a block lies accCols * strideA scalars after its real part
 //   rhs_base     packed rhs panel; the imaginary part starts accRows * strideB scalars after the real part
 //   depth        number of depth (k) steps
 //   row          in/out: first row to process; advanced by unroll_factor * accCols on return
 //   pAlphaReal / pAlphaImag   scaling packets forwarded to storeComplexAccumulator
 // The local variable names below are referenced by the MICRO_COMPLEX_MMA_* macros and must not be renamed.
 template<int unroll_factor, typename Scalar, typename Packet, typename Packetc, typename RhsPacket, typename DataMapper, typename Index, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
 EIGEN_ALWAYS_INLINE void gemm_complex_unrolled_MMA_iteration(
  const DataMapper& res,
  const Scalar* lhs_base,
  const Scalar* rhs_base,
  Index depth,
  Index strideA,
  Index strideB,
  Index& row,
  const Packet& pAlphaReal,
  const Packet& pAlphaImag)
 {
  const Scalar* rhs_ptr_real = rhs_base;
  const Scalar* rhs_ptr_imag = NULL;
  // Offset from a real lhs packet to the matching imaginary packet.
  const Index imag_delta = accCols*strideA;
  if(!RhsIsReal) {
    rhs_ptr_imag = rhs_base + accRows*strideB;
  } else {
    EIGEN_UNUSED_VARIABLE(rhs_ptr_imag);
  }
  const Scalar* lhs_ptr_real0 = NULL, * lhs_ptr_real1 = NULL;
  const Scalar* lhs_ptr_real2 = NULL, * lhs_ptr_real3 = NULL;
  __vector_quad accReal0, accImag0, accReal1, accImag1, accReal2, accImag2, accReal3, accImag3;
  // Set up the lhs pointers and zero the accumulators of the active slots.
  MICRO_COMPLEX_MMA_SRC_PTR
  MICRO_COMPLEX_MMA_DST_PTR
  Index k = 0;
  // Main loop: PEEL_COMPLEX_MMA depth steps per iteration, with prefetch of the rhs and lhs streams.
  for(; k + PEEL_COMPLEX_MMA <= depth; k+= PEEL_COMPLEX_MMA)
  {
    EIGEN_POWER_PREFETCH(rhs_ptr_real);
    if(!RhsIsReal) {
      EIGEN_POWER_PREFETCH(rhs_ptr_imag);
    }
    MICRO_COMPLEX_MMA_PREFETCH
    MICRO_COMPLEX_MMA_ONE_PEEL
  }
  // Remainder loop: one depth step at a time.
  for(; k < depth; k++)
  {
    MICRO_COMPLEX_MMA_ONE
  }
  // Write the accumulators back and report the consumed rows to the caller.
  MICRO_COMPLEX_MMA_STORE
  row += unroll_factor*accCols;
 }
 // Computes the complex result column panel that starts at column `col` (the caller advances `col` by accRows).
 // Full row blocks of accCols rows are processed MAX_COMPLEX_MMA_UNROLL at a time; the remaining full blocks
 // go through one specialised call chosen by the switch; finally the `remaining_rows` partial block, if any,
 // is delegated to gemm_complex_extra_row.
 template <typename Scalar, typename Packet, typename Packetc, typename RhsPacket, typename DataMapper,
           typename Index, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs,
           bool LhsIsReal, bool RhsIsReal>
 EIGEN_ALWAYS_INLINE void gemmMMA_complex_cols(const DataMapper& res, const Scalar* blockA, const Scalar* blockB,
                                               Index depth, Index strideA, Index offsetA, Index strideB,
                                               Index offsetB, Index col, Index rows, Index cols,
                                               Index remaining_rows, const Packet& pAlphaReal,
                                               const Packet& pAlphaImag, const Packet& pMask) {
  const DataMapper panelRes = res.getSubMapper(0, col);
  // advanceCols is 2 for a complex rhs (real and imaginary parts are stored separately), 1 otherwise.
  const Scalar* rhsPanel = blockB + advanceCols * col * strideB + accRows * offsetB;
  const Scalar* lhsPanel = blockA + accCols * offsetA;
  Index rowPos = 0;
 #define MAX_COMPLEX_MMA_UNROLL 4
  // Each call advances rowPos by MAX_COMPLEX_MMA_UNROLL * accCols.
  while (rowPos + MAX_COMPLEX_MMA_UNROLL * accCols <= rows) {
    gemm_complex_unrolled_MMA_iteration<MAX_COMPLEX_MMA_UNROLL, Scalar, Packet, Packetc, RhsPacket, DataMapper,
                                        Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(
        panelRes, lhsPanel, rhsPanel, depth, strideA, strideB, rowPos, pAlphaReal, pAlphaImag);
  }
  // Fewer than MAX_COMPLEX_MMA_UNROLL full blocks are left here, so `case MAX_COMPLEX_MMA_UNROLL` is compiled out.
  switch ((rows - rowPos) / accCols) {
 #if MAX_COMPLEX_MMA_UNROLL > 4
    case 4:
      gemm_complex_unrolled_MMA_iteration<4, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols,
                                          ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(
          panelRes, lhsPanel, rhsPanel, depth, strideA, strideB, rowPos, pAlphaReal, pAlphaImag);
      break;
 #endif
 #if MAX_COMPLEX_MMA_UNROLL > 3
    case 3:
      gemm_complex_unrolled_MMA_iteration<3, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols,
                                          ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(
          panelRes, lhsPanel, rhsPanel, depth, strideA, strideB, rowPos, pAlphaReal, pAlphaImag);
      break;
 #endif
 #if MAX_COMPLEX_MMA_UNROLL > 2
    case 2:
      gemm_complex_unrolled_MMA_iteration<2, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols,
                                          ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(
          panelRes, lhsPanel, rhsPanel, depth, strideA, strideB, rowPos, pAlphaReal, pAlphaImag);
      break;
 #endif
 #if MAX_COMPLEX_MMA_UNROLL > 1
    case 1:
      gemm_complex_unrolled_MMA_iteration<1, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols,
                                          ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(
          panelRes, lhsPanel, rhsPanel, depth, strideA, strideB, rowPos, pAlphaReal, pAlphaImag);
      break;
 #endif
    default:
      break;
  }
 #undef MAX_COMPLEX_MMA_UNROLL
  // Partial row block: note that gemm_complex_extra_row receives blockA, not the offset lhs pointer.
  if (remaining_rows > 0) {
    gemm_complex_extra_row<Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs,
                           LhsIsReal, RhsIsReal>(panelRes, blockA, rhsPanel, depth, strideA, offsetA, strideB,
                                                 rowPos, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag,
                                                 pMask);
  }
 }
 // Complex GEMM driver for the MMA kernels.
 // The packed complex blocks are reinterpreted as arrays of the real type Scalar. A stride of -1 means
 // "use depth". Column panels of accRows columns are handled by gemmMMA_complex_cols; the columns left over
 // (fewer than accRows) are handled by gemm_complex_extra_cols.
 template <typename LhsScalar, typename RhsScalar, typename Scalarc, typename Scalar, typename Index,
           typename Packet, typename Packetc, typename RhsPacket, typename DataMapper, const Index accRows,
           const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
 void gemm_complexMMA(const DataMapper& res, const LhsScalar* blockAc, const RhsScalar* blockBc, Index rows,
                      Index depth, Index cols, Scalarc alpha, Index strideA, Index strideB, Index offsetA,
                      Index offsetB) {
  // Rows that do not fill a whole accCols block.
  const Index tailRows = rows % accCols;
  if (strideA == -1) {
    strideA = depth;
  }
  if (strideB == -1) {
    strideB = depth;
  }
  const Packet alphaRe = pset1<Packet>(alpha.real());
  const Packet alphaIm = pset1<Packet>(alpha.imag());
  const Packet tailMask = bmask<Packet>((const int)(tailRows));
  const Scalar* blockA = (const Scalar*)blockAc;
  const Scalar* blockB = (const Scalar*)blockBc;
  Index col = 0;
  while (col + accRows <= cols) {
    gemmMMA_complex_cols<Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs,
                         ConjugateRhs, LhsIsReal, RhsIsReal>(res, blockA, blockB, depth, strideA, offsetA, strideB,
                                                             offsetB, col, rows, cols, tailRows, alphaRe, alphaIm,
                                                             tailMask);
    col += accRows;
  }
  gemm_complex_extra_cols<Scalar, Packet, Packetc, DataMapper, Index, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal,
                          RhsIsReal>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols,
                                     tailRows, alphaRe, alphaIm, tailMask);
 }
 #undef accColsC
 #undef advanceRows
 #undef advanceCols
 } // end namespace internal
 } // end namespace Eigen
 #if defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH)
 #pragma GCC pop_options
 #endif
 #endif // EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
--- a/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.h
+++ b/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.h
--- a/Eigen/src/Core/arch/AltiVec/PacketMath.h
+++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h
--- a/Eigen/src/Core/arch/CUDA/Complex.h
+++ b/Eigen/src/Core/arch/CUDA/Complex.h
@@ -2,6 +2,7 @@
 // for linear algebra.
 //
 // Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
 // Copyright (C) 2021 C. Antonio Sanchez <cantonios@google.com>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -10,94 +11,259 @@
 #ifndef EIGEN_COMPLEX_CUDA_H
 #define EIGEN_COMPLEX_CUDA_H
-// clang-format off
+// Many std::complex methods such as operator+, operator-, operator* and
 // operator/ are not constexpr. Due to this, GCC and older versions of clang do
 // not treat them as device functions and thus Eigen functors making use of
 // these operators fail to compile. Here, we manually specialize these
 // operators and functors for complex types when building for CUDA to enable
 // their use on-device.
 //
 // NOTES:
 //  - Compound assignment operators +=,-=,*=,/=(Scalar) will not work on device,
 //    since they are already specialized in the standard. Using them will result
 //    in silent kernel failures.
 //  - Compiling with MSVC and using +=,-=,*=,/=(std::complex<Scalar>) will lead
 //    to duplicate definition errors, since these are already specialized in
 //    Visual Studio's <complex> header (contrary to the standard).  This is
 //    preferable to removing such definitions, which will lead to silent kernel
 //    failures.
 //  - Compiling with ICC requires defining _USE_COMPLEX_SPECIALIZATION_ prior
 //    to the first inclusion of <complex>.
 #if defined(EIGEN_CUDACC) && defined(EIGEN_GPU_COMPILE_PHASE)
 // ICC already specializes std::complex<float> and std::complex<double>
 // operators, preventing us from making them device functions here.
 // This will lead to silent runtime errors if the operators are used on device.
 //
 // To allow std::complex operator use on device, define _OVERRIDE_COMPLEX_SPECIALIZATION_
 // prior to first inclusion of <complex>.  This prevents ICC from adding
 // its own specializations, so our custom ones below can be used instead.
 #if !(defined(EIGEN_COMP_ICC) && defined(_USE_COMPLEX_SPECIALIZATION_))
 // Import Eigen's internal operator specializations.
 // Expands to using-declarations that bring the std::complex operators defined in
 // Eigen::complex_operator_detail (+, -, *, /, +=, -=, *=, /=, ==, !=) into the scope where the macro is used.
 #define EIGEN_USING_STD_COMPLEX_OPERATORS           \
  using Eigen::complex_operator_detail::operator+;  \
  using Eigen::complex_operator_detail::operator-;  \
  using Eigen::complex_operator_detail::operator*;  \
  using Eigen::complex_operator_detail::operator/;  \
  using Eigen::complex_operator_detail::operator+=; \
  using Eigen::complex_operator_detail::operator-=; \
  using Eigen::complex_operator_detail::operator*=; \
  using Eigen::complex_operator_detail::operator/=; \
  using Eigen::complex_operator_detail::operator==; \
  using Eigen::complex_operator_detail::operator!=;
 namespace Eigen {
-namespace internal {
+// Specialized std::complex overloads.
 namespace complex_operator_detail {
-#if defined(EIGEN_CUDACC) && defined(EIGEN_USE_GPU)
+template<typename T>
-
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-// Many std::complex methods such as operator+, operator-, operator* and
+std::complex<T> complex_multiply(const std::complex<T>& a, const std::complex<T>& b) {
 // operator/ are not constexpr. Due to this, clang does not treat them as device
 // functions and thus Eigen functors making use of these operators fail to
 // compile. Here, we manually specialize these functors for complex types when
 // building for CUDA to avoid non-constexpr methods.
 // Sum
 template<typename T> struct scalar_sum_op<const std::complex<T>, const std::complex<T> > : binary_op_base<const std::complex<T>, const std::complex<T> > {
  typedef typename std::complex<T> result_type;
  EIGEN_EMPTY_STRUCT_CTOR(scalar_sum_op)
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
    return std::complex<T>(numext::real(a) + numext::real(b),
                           numext::imag(a) + numext::imag(b));
  }
 };
 template<typename T> struct scalar_sum_op<std::complex<T>, std::complex<T> > : scalar_sum_op<const std::complex<T>, const std::complex<T> > {};
 // Difference
 template<typename T> struct scalar_difference_op<const std::complex<T>, const std::complex<T> >  : binary_op_base<const std::complex<T>, const std::complex<T> > {
  typedef typename std::complex<T> result_type;
  EIGEN_EMPTY_STRUCT_CTOR(scalar_difference_op)
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
    return std::complex<T>(numext::real(a) - numext::real(b),
                           numext::imag(a) - numext::imag(b));
  }
 };
 template<typename T> struct scalar_difference_op<std::complex<T>, std::complex<T> > : scalar_difference_op<const std::complex<T>, const std::complex<T> > {};
 // Product
 template<typename T> struct scalar_product_op<const std::complex<T>, const std::complex<T> >  : binary_op_base<const std::complex<T>, const std::complex<T> > {
  enum {
    Vectorizable = packet_traits<std::complex<T> >::HasMul
  };
  typedef typename std::complex<T> result_type;
  EIGEN_EMPTY_STRUCT_CTOR(scalar_product_op)
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
  const T a_real = numext::real(a);
  const T a_imag = numext::imag(a);
  const T b_real = numext::real(b);
  const T b_imag = numext::imag(b);
-    return std::complex<T>(a_real * b_real - a_imag * b_imag,
+  return std::complex<T>(
-                           a_real * b_imag + a_imag * b_real);
+      a_real * b_real - a_imag * b_imag,
      a_imag * b_real + a_real * b_imag);
 }
 };
-template<typename T> struct scalar_product_op<std::complex<T>, std::complex<T> > : scalar_product_op<const std::complex<T>, const std::complex<T> > {};
+template<typename T>
-
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-
+std::complex<T> complex_divide_fast(const std::complex<T>& a, const std::complex<T>& b) {
 // Quotient
 template<typename T> struct scalar_quotient_op<const std::complex<T>, const std::complex<T> > : binary_op_base<const std::complex<T>, const std::complex<T> > {
  enum {
    Vectorizable = packet_traits<std::complex<T> >::HasDiv
  };
  typedef typename std::complex<T> result_type;
  EIGEN_EMPTY_STRUCT_CTOR(scalar_quotient_op)
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
  const T a_real = numext::real(a);
  const T a_imag = numext::imag(a);
  const T b_real = numext::real(b);
  const T b_imag = numext::imag(b);
-    const T norm = T(1) / (b_real * b_real + b_imag * b_imag);
+  const T norm = (b_real * b_real + b_imag * b_imag);
-    return std::complex<T>((a_real * b_real + a_imag * b_imag) * norm,
+  return std::complex<T>((a_real * b_real + a_imag * b_imag) / norm,
-                           (a_imag * b_real - a_real * b_imag) * norm);
+                          (a_imag * b_real - a_real * b_imag) / norm);
 }
 };
-template<typename T> struct scalar_quotient_op<std::complex<T>, std::complex<T> > : scalar_quotient_op<const std::complex<T>, const std::complex<T> > {};
+template<typename T>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 std::complex<T> complex_divide_stable(const std::complex<T>& a, const std::complex<T>& b) {
  // Computes a / b with Smith's complex division (https://arxiv.org/pdf/1210.4539.pdf),
  // which guards against over/under-flow: numerator and denominator are both scaled by the ratio of the
  // smaller-magnitude component of b to the larger one, so b_real^2 + b_imag^2 is never formed directly.
  const T ar = numext::real(a);
  const T ai = numext::imag(a);
  const T br = numext::real(b);
  const T bi = numext::imag(b);
  // Exactly one of the two scales differs from 1.
  T rscale = T(1);
  T iscale = T(1);
  if (numext::abs(bi) <= numext::abs(br)) {
    iscale = bi / br;
  } else {
    rscale = br / bi;
  }
  const T denominator = br * rscale + bi * iscale;
  const T quotReal = (ar * rscale + ai * iscale) / denominator;
  const T quotImag = (ai * rscale - ar * iscale) / denominator;
  return std::complex<T>(quotReal, quotImag);
 }
 // Complex division entry point: picks the over/under-flow safe algorithm
 // unless EIGEN_FAST_MATH is enabled, in which case the cheaper direct
 // formula is used.
 template<typename T>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 std::complex<T> complex_divide(const std::complex<T>& a, const std::complex<T>& b) {
 #if !EIGEN_FAST_MATH
  return complex_divide_stable(a, b);
 #else
  return complex_divide_fast(a, b);
 #endif
 }
-} // end namespace internal
+// NOTE: We cannot specialize compound assignment operators with Scalar T,
 //         (i.e.  operator@=(const T&), for @=+,-,*,/)
 //       since they are already specialized for float/double/long double within
 //       the standard <complex> header. We also do not specialize the stream
 //       operators.
 // Generates, for a real scalar type T, free-function overloads of the
 // std::complex<T> operators:
 //   - unary + and -,
 //   - binary +, -, *, / for complex-complex, complex-scalar and
 //     scalar-complex operands,
 //   - compound assignments +=, -=, *=, /= taking a complex right-hand side,
 //   - == and != for complex-complex, complex-scalar and scalar-complex.
 // Complex-complex products and quotients are delegated to complex_multiply()
 // and complex_divide(); scalar / complex promotes the scalar to
 // std::complex<T>(a, 0) first. Everything else is computed component-wise
 // through numext::real/imag (numext::real_ref/imag_ref for in-place forms).
 #define EIGEN_CREATE_STD_COMPLEX_OPERATOR_SPECIALIZATIONS(T)                                    \
                                                                                                \
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
 std::complex<T> operator+(const std::complex<T>& a) { return a; }                               \
                                                                                                \
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
 std::complex<T> operator-(const std::complex<T>& a) {                                           \
  return std::complex<T>(-numext::real(a), -numext::imag(a));                                   \
 }                                                                                               \
                                                                                                \
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
 std::complex<T> operator+(const std::complex<T>& a, const std::complex<T>& b) {                 \
  return std::complex<T>(numext::real(a) + numext::real(b), numext::imag(a) + numext::imag(b)); \
 }                                                                                               \
                                                                                                \
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
 std::complex<T> operator+(const std::complex<T>& a, const T& b) {                               \
  return std::complex<T>(numext::real(a) + b, numext::imag(a));                                 \
 }                                                                                               \
                                                                                                \
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
 std::complex<T> operator+(const T& a, const std::complex<T>& b) {                               \
  return std::complex<T>(a + numext::real(b), numext::imag(b));                                 \
 }                                                                                               \
                                                                                                \
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
 std::complex<T> operator-(const std::complex<T>& a, const std::complex<T>& b) {                 \
  return std::complex<T>(numext::real(a) - numext::real(b), numext::imag(a) - numext::imag(b)); \
 }                                                                                               \
                                                                                                \
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
 std::complex<T> operator-(const std::complex<T>& a, const T& b) {                               \
  return std::complex<T>(numext::real(a) - b, numext::imag(a));                                 \
 }                                                                                               \
                                                                                                \
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
 std::complex<T> operator-(const T& a, const std::complex<T>& b) {                               \
  return std::complex<T>(a - numext::real(b), -numext::imag(b));                                \
 }                                                                                               \
                                                                                                \
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
 std::complex<T> operator*(const std::complex<T>& a, const std::complex<T>& b) {                 \
  return complex_multiply(a, b);                                                                \
 }                                                                                               \
                                                                                                \
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
 std::complex<T> operator*(const std::complex<T>& a, const T& b) {                               \
  return std::complex<T>(numext::real(a) * b, numext::imag(a) * b);                             \
 }                                                                                               \
                                                                                                \
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
 std::complex<T> operator*(const T& a, const std::complex<T>& b) {                               \
  return std::complex<T>(a * numext::real(b), a * numext::imag(b));                             \
 }                                                                                               \
                                                                                                \
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
 std::complex<T> operator/(const std::complex<T>& a, const std::complex<T>& b) {                 \
  return complex_divide(a, b);                                                                  \
 }                                                                                               \
                                                                                                \
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
 std::complex<T> operator/(const std::complex<T>& a, const T& b) {                               \
  return std::complex<T>(numext::real(a) / b, numext::imag(a) / b);                             \
 }                                                                                               \
                                                                                                \
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
 std::complex<T> operator/(const T& a, const std::complex<T>& b) {                               \
  return complex_divide(std::complex<T>(a, 0), b);                                              \
 }                                                                                               \
                                                                                                \
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
 std::complex<T>& operator+=(std::complex<T>& a, const std::complex<T>& b) {                     \
  numext::real_ref(a) += numext::real(b);                                                       \
  numext::imag_ref(a) += numext::imag(b);                                                       \
  return a;                                                                                     \
 }                                                                                               \
                                                                                                \
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
 std::complex<T>& operator-=(std::complex<T>& a, const std::complex<T>& b) {                     \
  numext::real_ref(a) -= numext::real(b);                                                       \
  numext::imag_ref(a) -= numext::imag(b);                                                       \
  return a;                                                                                     \
 }                                                                                               \
                                                                                                \
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
 std::complex<T>& operator*=(std::complex<T>& a, const std::complex<T>& b) {                     \
  a = complex_multiply(a, b);                                                                   \
  return a;                                                                                     \
 }                                                                                               \
                                                                                                \
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
 std::complex<T>& operator/=(std::complex<T>& a, const std::complex<T>& b) {                     \
  a = complex_divide(a, b);                                                                     \
  return  a;                                                                                    \
 }                                                                                               \
                                                                                                \
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
 bool operator==(const std::complex<T>& a, const std::complex<T>& b) {                           \
  return numext::real(a) == numext::real(b) && numext::imag(a) == numext::imag(b);              \
 }                                                                                               \
                                                                                                \
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
 bool operator==(const std::complex<T>& a, const T& b) {                                         \
  return numext::real(a) == b && numext::imag(a) == 0;                                          \
 }                                                                                               \
                                                                                                \
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
 bool operator==(const T& a, const std::complex<T>& b) {                                         \
  return a  == numext::real(b) && 0 == numext::imag(b);                                         \
 }                                                                                               \
                                                                                                \
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
 bool operator!=(const std::complex<T>& a, const std::complex<T>& b) {                           \
  return !(a == b);                                                                             \
 }                                                                                               \
                                                                                                \
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
 bool operator!=(const std::complex<T>& a, const T& b) {                                         \
  return !(a == b);                                                                             \
 }                                                                                               \
                                                                                                \
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
 bool operator!=(const T& a, const std::complex<T>& b) {                                         \
  return !(a == b);                                                                             \
 }
-} // end namespace Eigen
+// Do not specialize for long double, since that reduces to double on device.
 // Emit the operator overloads for std::complex<float> and
 // std::complex<double>.
 EIGEN_CREATE_STD_COMPLEX_OPERATOR_SPECIALIZATIONS(float)
 EIGEN_CREATE_STD_COMPLEX_OPERATOR_SPECIALIZATIONS(double)
 // The generator macro is only needed for the two expansions above.
 #undef EIGEN_CREATE_STD_COMPLEX_OPERATOR_SPECIALIZATIONS
 }  // namespace complex_operator_detail
 EIGEN_USING_STD_COMPLEX_OPERATORS
 namespace numext {
 EIGEN_USING_STD_COMPLEX_OPERATORS
 }  // namespace numext
 namespace internal {
 EIGEN_USING_STD_COMPLEX_OPERATORS
 }  // namespace internal
 }  // namespace Eigen
 #endif  // !(EIGEN_COMP_ICC && _USE_COMPLEX_SPECIALIZATION_)
 #endif  // EIGEN_CUDACC && EIGEN_GPU_COMPILE_PHASE
 #endif  // EIGEN_COMPLEX_CUDA_H
--- a/Eigen/src/Core/arch/Default/BFloat16.h
+++ b/Eigen/src/Core/arch/Default/BFloat16.h
@@ -0,0 +1,688 @@
 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #ifndef EIGEN_BFLOAT16_H
 #define EIGEN_BFLOAT16_H
 // Defines the explicit specialization METHOD<PACKET_BF16> of a unary packet
 // function by round-tripping through the float packet type: the argument is
 // widened with Bf16ToF32, METHOD<PACKET_F> is applied, and the result is
 // narrowed back with F32ToBf16.
 //   PACKET_F    - float packet type used for the actual computation.
 //   PACKET_BF16 - bfloat16 packet type being specialized.
 //   METHOD      - name of the packet function template to specialize.
 #define BF16_PACKET_FUNCTION(PACKET_F, PACKET_BF16, METHOD)         \
  template <>                                                       \
  EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED  \
  PACKET_BF16 METHOD<PACKET_BF16>(const PACKET_BF16& _x) {          \
    return F32ToBf16(METHOD<PACKET_F>(Bf16ToF32(_x)));              \
  }
 namespace Eigen {
 struct bfloat16;
 namespace bfloat16_impl {
 // Make our own __bfloat16_raw definition.
 struct __bfloat16_raw {
  // Raw 16-bit storage of the bfloat16 value.
  unsigned short value;

  // Zero-initializes the stored bits.
  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __bfloat16_raw() : value(0) {}
  // Wraps an existing bit pattern verbatim; no numeric conversion is done.
  explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __bfloat16_raw(unsigned short raw) : value(raw) {}
 };
 // Forward declarations of the conversion helpers used by the bfloat16
 // constructors and conversion operator below; the definitions follow later
 // in this header.

 // Wraps a 16-bit pattern into a __bfloat16_raw.
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __bfloat16_raw raw_uint16_to_bfloat16(unsigned short value);
 // float -> bfloat16 conversion with round-to-nearest-even. The template
 // argument states whether the caller guarantees that the argument is a
 // normal number, an infinity or zero (i.e. never NaN).
 template <bool AssumeArgumentIsNormalOrInfinityOrZero>
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __bfloat16_raw float_to_bfloat16_rtne(float ff);
 // Forward declarations of template specializations, to avoid Visual C++ 2019 errors, saying:
 // > error C2908: explicit specialization; 'float_to_bfloat16_rtne' has already been instantiated
 template <>
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __bfloat16_raw float_to_bfloat16_rtne<false>(float ff);
 template <>
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __bfloat16_raw float_to_bfloat16_rtne<true>(float ff);
 // bfloat16 -> float conversion.
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float bfloat16_to_float(__bfloat16_raw h);
 // Thin base class of Eigen::bfloat16 that inherits the raw 16-bit storage
 // and adds an implicit constructor from __bfloat16_raw.
 struct bfloat16_base : public __bfloat16_raw {
  // Relies on __bfloat16_raw's default constructor, which zeroes the bits.
  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bfloat16_base() {}
  // Copies the bit pattern of h unchanged.
  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bfloat16_base(const __bfloat16_raw& h) : __bfloat16_raw(h) {}
 };
 } // namespace bfloat16_impl
 // Class definition.
 struct bfloat16 : public bfloat16_impl::bfloat16_base {
  typedef bfloat16_impl::__bfloat16_raw __bfloat16_raw;
  // Default construction yields all-zero bits (see __bfloat16_raw).
  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bfloat16() {}
  // Implicit construction from a raw bit pattern; the bits are taken as-is.
  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bfloat16(const __bfloat16_raw& h) : bfloat16_impl::bfloat16_base(h) {}
  // true maps to the bit pattern 0x3f80 (the upper 16 bits of float 1.0f),
  // false maps to all-zero bits.
  explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bfloat16(bool b)
      : bfloat16_impl::bfloat16_base(bfloat16_impl::raw_uint16_to_bfloat16(b ? 0x3f80 : 0)) {}
  // Generic numeric conversion: val is cast to float and rounded to nearest
  // even. For integral T the specialization that skips the NaN check is
  // selected, since an integer converted to float is never NaN.
  template<class T>
  explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bfloat16(T val)
      : bfloat16_impl::bfloat16_base(bfloat16_impl::float_to_bfloat16_rtne<internal::is_integral<T>::value>(static_cast<float>(val))) {}
  // Conversion from float with full NaN handling.
  explicit EIGEN_DEVICE_FUNC bfloat16(float f)
      : bfloat16_impl::bfloat16_base(bfloat16_impl::float_to_bfloat16_rtne<false>(f)) {}
  // Following the convention of numpy, converting between complex and
  // float will lead to loss of imag value.
  template<typename RealScalar>
  explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bfloat16(const std::complex<RealScalar>& val)
      : bfloat16_impl::bfloat16_base(bfloat16_impl::float_to_bfloat16_rtne<false>(static_cast<float>(val.real()))) {}
  // Widening back to float places the 16 stored bits in the upper half of a
  // 32-bit float pattern (see bfloat16_to_float).
  EIGEN_DEVICE_FUNC operator float() const {  // NOLINT: Allow implicit conversion to float, because it is lossless.
    return bfloat16_impl::bfloat16_to_float(*this);
  }
 };
 } // namespace Eigen
 namespace std {
 // std::numeric_limits specialization for Eigen::bfloat16.
 //
 // bfloat16 shares float's sign bit and 8-bit exponent but keeps only the top
 // 7 explicit fraction bits, so the exponent-related traits are forwarded from
 // numeric_limits<float> while the precision-related ones are spelled out.
 // All value-returning members build their result from an explicit 16-bit
 // pattern through raw_uint16_to_bfloat16, so no numeric conversion is
 // involved.
 template<>
 struct numeric_limits<Eigen::bfloat16> {
  static const bool is_specialized = true;
  static const bool is_signed = true;
  static const bool is_integer = false;
  static const bool is_exact = false;
  static const bool has_infinity = true;
  static const bool has_quiet_NaN = true;
  static const bool has_signaling_NaN = true;
  // NOTE(review): denorm_min() below returns a subnormal bit pattern even
  // though has_denorm reports denorm_absent; left unchanged here to avoid
  // altering a trait callers may branch on -- confirm the intended value.
  static const float_denorm_style has_denorm = std::denorm_absent;
  static const bool has_denorm_loss = false;
  static const std::float_round_style round_style = numeric_limits<float>::round_style;
  static const bool is_iec559 = false;
  static const bool is_bounded = true;
  static const bool is_modulo = false;
  // 7 stored fraction bits plus the implicit leading one.
  static const int digits = 8;
  static const int digits10 = 2;
  static const int max_digits10 = 4;
  static const int radix = 2;
  static const int min_exponent = numeric_limits<float>::min_exponent;
  static const int min_exponent10 = numeric_limits<float>::min_exponent10;
  static const int max_exponent = numeric_limits<float>::max_exponent;
  static const int max_exponent10 = numeric_limits<float>::max_exponent10;
  static const bool traps = numeric_limits<float>::traps;
  static const bool tinyness_before = numeric_limits<float>::tinyness_before;
  // Smallest positive normal value.
  static Eigen::bfloat16 (min)() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x0080); }
  // Most negative finite value.
  static Eigen::bfloat16 lowest() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0xff7f); }
  // Largest finite value.
  static Eigen::bfloat16 (max)() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x7f7f); }
  // 2^-7: distance from 1.0 to the next representable value.
  static Eigen::bfloat16 epsilon() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x3c00); }
  // 0.5, given as its bit pattern. This must go through
  // raw_uint16_to_bfloat16: passing 0x3f00 to the bfloat16 constructor would
  // convert the integer 16128 numerically instead of reinterpreting the bits.
  static Eigen::bfloat16 round_error() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x3f00); }
  static Eigen::bfloat16 infinity() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x7f80); }
  static Eigen::bfloat16 quiet_NaN() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x7fc0); }
  static Eigen::bfloat16 signaling_NaN() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x7f81); }
  // Smallest positive subnormal bit pattern.
  static Eigen::bfloat16 denorm_min() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x0001); }
 };
 // If std::numeric_limits<T> is specialized, should also specialize
 // std::numeric_limits<const T>, std::numeric_limits<volatile T>, and
 // std::numeric_limits<const volatile T>
 // https://stackoverflow.com/a/16519653/
 // Each cv-qualified variant simply inherits every member from the
 // unqualified specialization.
 template<>
 struct numeric_limits<const Eigen::bfloat16> : numeric_limits<Eigen::bfloat16> {};
 template<>
 struct numeric_limits<volatile Eigen::bfloat16> : numeric_limits<Eigen::bfloat16> {};
 template<>
 struct numeric_limits<const volatile Eigen::bfloat16> : numeric_limits<Eigen::bfloat16> {};
 } // namespace std
 namespace Eigen {
 namespace bfloat16_impl {
 // We need to distinguish ‘clang as the CUDA compiler’ from ‘clang as the host compiler,
 // invoked by NVCC’ (e.g. on MacOS). The former needs to see both host and device implementation
 // of the functions, while the latter can only deal with one of them.
 #if !defined(EIGEN_HAS_NATIVE_BF16) || (EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC) // Emulate support for bfloat16 floats
 #if EIGEN_COMP_CLANG && defined(EIGEN_CUDACC)
 // We need to provide emulated *host-side* BF16 operators for clang.
 #pragma push_macro("EIGEN_DEVICE_FUNC")
 #undef EIGEN_DEVICE_FUNC
 #if defined(EIGEN_HAS_CUDA_BF16) && defined(EIGEN_HAS_NATIVE_BF16)
 #define EIGEN_DEVICE_FUNC __host__
 #else // both host and device need emulated ops.
 #define EIGEN_DEVICE_FUNC __host__ __device__
 #endif
 #endif
 // Definitions for CPUs, mostly working through conversion
 // to/from fp32.
 // Emulated binary arithmetic: both operands are widened to float, the
 // operation is carried out in float, and the result is rounded back to
 // bfloat16 by the float constructor.
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator + (const bfloat16& a, const bfloat16& b) {
  const float sum = static_cast<float>(a) + static_cast<float>(b);
  return bfloat16(sum);
 }
 // Mixed bfloat16 + int: the integer is converted to float first.
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator + (const bfloat16& a, const int& b) {
  const float sum = static_cast<float>(a) + static_cast<float>(b);
  return bfloat16(sum);
 }
 // Mixed int + bfloat16: the integer is converted to float first.
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator + (const int& a, const bfloat16& b) {
  const float sum = static_cast<float>(a) + static_cast<float>(b);
  return bfloat16(sum);
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator * (const bfloat16& a, const bfloat16& b) {
  const float product = static_cast<float>(a) * static_cast<float>(b);
  return bfloat16(product);
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator - (const bfloat16& a, const bfloat16& b) {
  const float difference = static_cast<float>(a) - static_cast<float>(b);
  return bfloat16(difference);
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator / (const bfloat16& a, const bfloat16& b) {
  const float quotient = static_cast<float>(a) / static_cast<float>(b);
  return bfloat16(quotient);
 }
 // Negation flips the sign bit (bit 15) directly on the stored pattern; no
 // round trip through float is needed.
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator - (const bfloat16& a) {
  bfloat16 negated;
  negated.value = a.value ^ 0x8000;
  return negated;
 }
 // Emulated compound assignments: compute in float, round back to bfloat16,
 // store into the left operand and return a reference to it.
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16& operator += (bfloat16& a, const bfloat16& b) {
  const float sum = static_cast<float>(a) + static_cast<float>(b);
  a = bfloat16(sum);
  return a;
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16& operator *= (bfloat16& a, const bfloat16& b) {
  const float product = static_cast<float>(a) * static_cast<float>(b);
  a = bfloat16(product);
  return a;
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16& operator -= (bfloat16& a, const bfloat16& b) {
  const float difference = static_cast<float>(a) - static_cast<float>(b);
  a = bfloat16(difference);
  return a;
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16& operator /= (bfloat16& a, const bfloat16& b) {
  const float quotient = static_cast<float>(a) / static_cast<float>(b);
  a = bfloat16(quotient);
  return a;
 }
 // Pre-increment: adds one in place. Note that the updated value is returned
 // by value, not by reference.
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator++(bfloat16& a) {
  const bfloat16 one(1);
  a += one;
  return a;
 }
 // Pre-decrement: subtracts one in place and returns the updated value by
 // value.
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator--(bfloat16& a) {
  const bfloat16 one(1);
  a -= one;
  return a;
 }
 // Post-increment: returns the value held before the increment.
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator++(bfloat16& a, int) {
  const bfloat16 previous = a;
  ++a;
  return previous;
 }
 // Post-decrement: returns the value held before the decrement.
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator--(bfloat16& a, int) {
  const bfloat16 previous = a;
  --a;
  return previous;
 }
 // Emulated comparisons: both operands are widened to float and compared
 // there. Equality and inequality go through the numext strict helpers.
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator == (const bfloat16& a, const bfloat16& b) {
  const float lhs = static_cast<float>(a);
  const float rhs = static_cast<float>(b);
  return numext::equal_strict(lhs, rhs);
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator != (const bfloat16& a, const bfloat16& b) {
  const float lhs = static_cast<float>(a);
  const float rhs = static_cast<float>(b);
  return numext::not_equal_strict(lhs, rhs);
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator < (const bfloat16& a, const bfloat16& b) {
  const float lhs = static_cast<float>(a);
  const float rhs = static_cast<float>(b);
  return lhs < rhs;
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator <= (const bfloat16& a, const bfloat16& b) {
  const float lhs = static_cast<float>(a);
  const float rhs = static_cast<float>(b);
  return lhs <= rhs;
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator > (const bfloat16& a, const bfloat16& b) {
  const float lhs = static_cast<float>(a);
  const float rhs = static_cast<float>(b);
  return lhs > rhs;
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator >= (const bfloat16& a, const bfloat16& b) {
  const float lhs = static_cast<float>(a);
  const float rhs = static_cast<float>(b);
  return lhs >= rhs;
 }
 #if EIGEN_COMP_CLANG && defined(EIGEN_CUDACC)
 #pragma pop_macro("EIGEN_DEVICE_FUNC")
 #endif
 #endif  // Emulate support for bfloat16 floats
 // Division by an index. Do it in full float precision to avoid accuracy
 // issues in converting the denominator to bfloat16.
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator / (const bfloat16& a, Index b) {
  // Both operands are widened to float so that the index is never narrowed
  // to bfloat16 before dividing.
  const float numerator = static_cast<float>(a);
  const float denominator = static_cast<float>(b);
  return bfloat16(numerator / denominator);
 }
 // Converts a float to bfloat16 by dropping the low 16 bits of its bit
 // pattern (no rounding). NaN inputs are mapped to a fixed quiet-NaN pattern
 // that keeps the input's sign.
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __bfloat16_raw truncate_to_bfloat16(const float v) {
  __bfloat16_raw truncated;
  if (Eigen::numext::isnan EIGEN_NOT_A_MACRO(v)) {
    if (std::signbit(v)) {
      truncated.value = 0xFFC0;
    } else {
      truncated.value = 0x7FC0;
    }
  } else {
    const numext::uint32_t bits = numext::bit_cast<numext::uint32_t>(v);
    truncated.value = static_cast<numext::uint16_t>(bits >> 16);
  }
  return truncated;
 }
 // Wraps a 16-bit pattern in a __bfloat16_raw without any numeric
 // conversion: the argument becomes the stored bits verbatim.
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __bfloat16_raw raw_uint16_to_bfloat16(numext::uint16_t value) {
  return __bfloat16_raw(value);
 }
 // Returns the stored 16-bit pattern of bf unchanged (the inverse of
 // raw_uint16_to_bfloat16).
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR numext::uint16_t raw_bfloat16_as_uint16(const __bfloat16_raw& bf) {
  return bf.value;
 }
 // float_to_bfloat16_rtne template specialization that does not make any
 // assumption about the value of its function argument (ff).
 template <>
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __bfloat16_raw float_to_bfloat16_rtne<false>(float ff) {
 #if (defined(EIGEN_HAS_CUDA_BF16) && defined(EIGEN_HAS_HIP_BF16))
  // Nothing to do here
  // NOTE(review): this branch leaves a non-void function without a return
  // statement; it is only compiled when both macros are defined at once --
  // confirm that this combination can never occur.
 #else
  __bfloat16_raw output;
  if (Eigen::numext::isnan EIGEN_NOT_A_MACRO(ff)) {
    // If the value is a NaN, squash it to a qNaN with msb of fraction set,
    // this makes sure after truncation we don't end up with an inf.
    //
    // qNaN magic: All exponent bits set + most significant bit of fraction
    // set. The sign of the input is preserved.
    output.value = std::signbit(ff) ? 0xFFC0: 0x7FC0;
  } else {
    // Fast rounding algorithm that rounds a half value to nearest even. This
    // reduces expected error when we convert a large number of floats. Here
    // is how it works:
    //
    // Definitions:
    // To convert a float 32 to bfloat16, a float 32 can be viewed as 32 bits
    // with the following tags:
    //
    // Sign |  Exp (8 bits) | Frac (23 bits)
    //  S     EEEEEEEE         FFFFFFLRTTTTTTTTTTTTTTT
    //
    //  S: Sign bit.
    //  E: Exponent bits.
    //  F: First 6 bits of fraction.
    //  L: Least significant bit of resulting bfloat16 if we truncate away the
    //  rest of the float32. This is also the 7th bit of fraction.
    //  R: Rounding bit, 8th bit of fraction.
    //  T: Sticky bits, rest of fraction, 15 bits.
    //
    // To round half to nearest even, there are three cases where we want to
    // round down (simply truncate away the rounding bit and sticky bits) and
    // two cases where we want to round up (truncate, then add one to the
    // result).
    //
    // The fast converting algorithm simply adds lsb (L) to 0x7fff (15 bits of
    // 1s) as the rounding bias, adds the rounding bias to the input, then
    // truncates the last 16 bits away.
    //
    // To understand how it works, we can analyze this algorithm case by case:
    //
    // 1. L = 0, R = 0:
    //   Expect: round down, this is less than half value.
    //
    //   Algorithm:
    //   - Rounding bias: 0x7fff + 0 = 0x7fff
    //   - Adding rounding bias to input may create a carry out of the T
    //   bits, depending on whether any of the T bits is set to 1.
    //   - R may be set to 1 if there is a carry.
    //   - L remains 0.
    //   - Note that this case also handles Inf and -Inf, where all fraction
    //   bits, including L, R and Ts are all 0. The output remains Inf after
    //   this algorithm.
    //
    // 2. L = 1, R = 0:
    //   Expect: round down, this is less than half value.
    //
    //   Algorithm:
    //   - Rounding bias: 0x7fff + 1 = 0x8000
    //   - Adding rounding bias to input doesn't change sticky bits but
    //   adds 1 to rounding bit.
    //   - L remains 1.
    //
    // 3. L = 0, R = 1, all of T are 0:
    //   Expect: round down, this is exactly at half, the result is already
    //   even (L=0).
    //
    //   Algorithm:
    //   - Rounding bias: 0x7fff + 0 = 0x7fff
    //   - Adding rounding bias to input sets all sticky bits to 1, but
    //   doesn't create a carry.
    //   - R remains 1.
    //   - L remains 0.
    //
    // 4. L = 1, R = 1:
    //   Expect: round up, this is at least half; at exactly half the result
    //   needs to be rounded to the next even number.
    //
    //   Algorithm:
    //   - Rounding bias: 0x7fff + 1 = 0x8000
    //   - Adding rounding bias to input doesn't change sticky bits, but
    //   creates a carry from rounding bit.
    //   - The carry sets L to 0, creates another carry bit and propagates
    //   forward to F bits.
    //   - If all the F bits are 1, a carry then propagates to the exponent
    //   bits, which then creates the minimum value with the next exponent
    //   value. Note that we won't have the case where exponents are all 1,
    //   since that's either a NaN (handled in the other if condition) or inf
    //   (handled in case 1).
    //
    // 5. L = 0, R = 1, any of T is 1:
    //   Expect: round up, this is greater than half.
    //
    //   Algorithm:
    //   - Rounding bias: 0x7fff + 0 = 0x7fff
    //   - Adding rounding bias to input creates a carry from sticky bits,
    //   sets rounding bit to 0, then creates another carry.
    //   - The second carry sets L to 1.
    //
    // Examples:
    //
    //  Exact half value that is already even:
    //    Input:
    //    Sign |  Exp (8 bit)     | Frac (first 7 bit) | Frac (last 16 bit)
    //     S     E E E E E E E E      F F F F F F L     RTTTTTTTTTTTTTTT
    //     0     0 0 0 0 0 0 0 0      0 0 0 0 0 1 0     1000000000000000
    //
    //     This falls into case 3. We truncate the rest of 16 bits and no
    //     carry is created into F and L:
    //
    //    Output:
    //    Sign |  Exp (8 bit)     | Frac (first 7 bit)
    //     S     E E E E E E E E      F F F F F F L
    //     0     0 0 0 0 0 0 0 0      0 0 0 0 0 1 0
    //
    //  Exact half value, round to next even number:
    //    Input:
    //    Sign |  Exp (8 bit)     | Frac (first 7 bit) | Frac (last 16 bit)
    //     S     E E E E E E E E      F F F F F F L     RTTTTTTTTTTTTTTT
    //     0     0 0 0 0 0 0 0 0      0 0 0 0 0 0 1     1000000000000000
    //
    //     This falls into case 4. We create a carry from R and T,
    //     which then propagates into L and F:
    //
    //    Output:
    //    Sign |  Exp (8 bit)     | Frac (first 7 bit)
    //     S     E E E E E E E E      F F F F F F L
    //     0     0 0 0 0 0 0 0 0      0 0 0 0 0 1 0
    //
    //
    //  Max denormal value round to min normal value:
    //    Input:
    //    Sign |  Exp (8 bit)     | Frac (first 7 bit) | Frac (last 16 bit)
    //     S     E E E E E E E E      F F F F F F L     RTTTTTTTTTTTTTTT
    //     0     0 0 0 0 0 0 0 0      1 1 1 1 1 1 1     1111111111111111
    //
    //     This falls into case 4. We create a carry from R and T,
    //     propagate into L and F, which then propagates into exponent
    //     bits:
    //
    //    Output:
    //    Sign |  Exp (8 bit)     | Frac (first 7 bit)
    //     S     E E E E E E E E      F F F F F F L
    //     0     0 0 0 0 0 0 0 1      0 0 0 0 0 0 0
    //
    //  Max normal value round to Inf:
    //    Input:
    //    Sign |  Exp (8 bit)     | Frac (first 7 bit) | Frac (last 16 bit)
    //     S     E E E E E E E E      F F F F F F L     RTTTTTTTTTTTTTTT
    //     0     1 1 1 1 1 1 1 0      1 1 1 1 1 1 1     1111111111111111
    //
    //     This falls into case 4. We create a carry from R and T,
    //     propagate into L and F, which then propagates into exponent
    //     bits:
    //
    //    Output:
    //    Sign |  Exp (8 bit)     | Frac (first 7 bit)
    //     S     E E E E E E E E      F F F F F F L
    //     0     1 1 1 1 1 1 1 1      0 0 0 0 0 0 0
    // At this point ff is known not to be NaN (it is finite or +/-infinity),
    // so the bias-and-truncate specialization can be applied directly.
    output = float_to_bfloat16_rtne<true>(ff);
  }
  return output;
 #endif
 }
 // Specialization of float_to_bfloat16_rtne for callers that guarantee ff is a
 // normal floating point number, +/-infinity, or zero. It is used to speed up
 // conversion from integer types to bfloat16.
 template <>
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __bfloat16_raw float_to_bfloat16_rtne<true>(float ff) {
 #if (defined(EIGEN_HAS_CUDA_BF16) && defined(EIGEN_HAS_HIP_BF16))
   // Nothing to do here
   // NOTE(review): this branch has no return statement; confirm it is never compiled.
 #else
   const numext::uint32_t bits = numext::bit_cast<numext::uint32_t>(ff);
   // Bias of 0x7fff, plus one more when the lowest kept bit (bit 16) is set, so
   // that an exact tie in the discarded low half rounds towards an even result.
   const numext::uint32_t rounded = bits + 0x7fff + ((bits >> 16) & 1);
   __bfloat16_raw result;
   // The bfloat16 payload is the upper half of the rounded 32-bit pattern.
   result.value = static_cast<numext::uint16_t>(rounded >> 16);
   return result;
 #endif
 }
 // Widens raw bfloat16 bits to float: the stored 16 bits become the upper half
 // of a 32-bit pattern (lower half zero), which is reinterpreted as a float.
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float bfloat16_to_float(__bfloat16_raw h) {
   const numext::uint32_t widened = static_cast<numext::uint32_t>(h.value) << 16;
   return numext::bit_cast<float>(widened);
 }
 // --- standard functions ---
 // True when a, converted to float, is classified as infinite by isinf.
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isinf)(const bfloat16& a) {
   EIGEN_USING_STD(isinf);
   const float widened = float(a);
   return (isinf)(widened);
 }
 // True when a, converted to float, is classified as NaN by isnan.
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isnan)(const bfloat16& a) {
   EIGEN_USING_STD(isnan);
   const float widened = float(a);
   return (isnan)(widened);
 }
 // A value is finite exactly when it is neither infinite nor NaN.
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isfinite)(const bfloat16& a) {
   return !((isinf EIGEN_NOT_A_MACRO (a)) || (isnan EIGEN_NOT_A_MACRO (a)));
 }
 // Absolute value computed directly on the bit pattern: masking with 0x7FFF
 // clears the top (sign) bit and leaves the remaining 15 bits untouched.
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 abs(const bfloat16& a) {
   bfloat16 magnitude;
   magnitude.value = a.value & 0x7FFF;
   return magnitude;
 }
 // exp(a): evaluated in float with ::expf, then converted back to bfloat16.
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 exp(const bfloat16& a) {
   const float arg = float(a);
   return bfloat16(::expf(arg));
 }
 // expm1(a): delegates to numext::expm1 on the float value of a and converts
 // the result back to bfloat16.
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 expm1(const bfloat16& a) {
   const float arg = float(a);
   return bfloat16(numext::expm1(arg));
 }
 // log(a): evaluated in float with ::logf, then converted back to bfloat16.
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 log(const bfloat16& a) {
   const float arg = float(a);
   return bfloat16(::logf(arg));
 }
 // log1p(a): delegates to numext::log1p on the float value of a and converts
 // the result back to bfloat16.
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 log1p(const bfloat16& a) {
   const float arg = float(a);
   return bfloat16(numext::log1p(arg));
 }
 // log10(a): evaluated in float with ::log10f, then converted back to bfloat16.
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 log10(const bfloat16& a) {
   const float arg = float(a);
   return bfloat16(::log10f(arg));
 }
 // log2(a): computed as EIGEN_LOG2E (cast to float) times the natural
 // logarithm of a, all in float, then converted back to bfloat16.
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 log2(const bfloat16& a) {
   const float natural_log = ::logf(float(a));
   return bfloat16(static_cast<float>(EIGEN_LOG2E) * natural_log);
 }
 // sqrt(a): evaluated in float with ::sqrtf, then converted back to bfloat16.
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 sqrt(const bfloat16& a) {
   const float arg = float(a);
   return bfloat16(::sqrtf(arg));
 }
 // pow(a, b): a raised to the power b, evaluated in float with ::powf and
 // converted back to bfloat16.
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 pow(const bfloat16& a, const bfloat16& b) {
   const float base = float(a);
   const float exponent = float(b);
   return bfloat16(::powf(base, exponent));
 }
 // sin(a): evaluated in float with ::sinf, then converted back to bfloat16.
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 sin(const bfloat16& a) {
   const float arg = float(a);
   return bfloat16(::sinf(arg));
 }
 // cos(a): evaluated in float with ::cosf, then converted back to bfloat16.
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 cos(const bfloat16& a) {
   const float arg = float(a);
   return bfloat16(::cosf(arg));
 }
 // tan(a): evaluated in float with ::tanf, then converted back to bfloat16.
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 tan(const bfloat16& a) {
   const float arg = float(a);
   return bfloat16(::tanf(arg));
 }
 // asin(a): evaluated in float with ::asinf, then converted back to bfloat16.
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 asin(const bfloat16& a) {
   const float arg = float(a);
   return bfloat16(::asinf(arg));
 }
 // acos(a): evaluated in float with ::acosf, then converted back to bfloat16.
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 acos(const bfloat16& a) {
   const float arg = float(a);
   return bfloat16(::acosf(arg));
 }
 // atan(a): evaluated in float with ::atanf, then converted back to bfloat16.
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 atan(const bfloat16& a) {
   const float arg = float(a);
   return bfloat16(::atanf(arg));
 }
 // sinh(a): evaluated in float with ::sinhf, then converted back to bfloat16.
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 sinh(const bfloat16& a) {
   const float arg = float(a);
   return bfloat16(::sinhf(arg));
 }
 // cosh(a): evaluated in float with ::coshf, then converted back to bfloat16.
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 cosh(const bfloat16& a) {
   const float arg = float(a);
   return bfloat16(::coshf(arg));
 }
 // tanh(a): evaluated in float with ::tanhf, then converted back to bfloat16.
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 tanh(const bfloat16& a) {
   const float arg = float(a);
   return bfloat16(::tanhf(arg));
 }
 #if EIGEN_HAS_CXX11_MATH
 // asinh(a): evaluated in float with ::asinhf, then converted back to bfloat16.
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 asinh(const bfloat16& a) {
   const float arg = float(a);
   return bfloat16(::asinhf(arg));
 }
 // acosh(a): evaluated in float with ::acoshf, then converted back to bfloat16.
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 acosh(const bfloat16& a) {
   const float arg = float(a);
   return bfloat16(::acoshf(arg));
 }
 // atanh(a): evaluated in float with ::atanhf, then converted back to bfloat16.
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 atanh(const bfloat16& a) {
   const float arg = float(a);
   return bfloat16(::atanhf(arg));
 }
 #endif
 // floor(a): evaluated in float with ::floorf, then converted back to bfloat16.
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 floor(const bfloat16& a) {
   const float arg = float(a);
   return bfloat16(::floorf(arg));
 }
 // ceil(a): evaluated in float with ::ceilf, then converted back to bfloat16.
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 ceil(const bfloat16& a) {
   const float arg = float(a);
   return bfloat16(::ceilf(arg));
 }
 // rint(a): evaluated in float with ::rintf, then converted back to bfloat16.
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 rint(const bfloat16& a) {
   const float arg = float(a);
   return bfloat16(::rintf(arg));
 }
 // round(a): evaluated in float with ::roundf, then converted back to bfloat16.
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 round(const bfloat16& a) {
   const float arg = float(a);
   return bfloat16(::roundf(arg));
 }
 // fmod(a, b): floating-point remainder of a / b, evaluated in float with
 // ::fmodf and converted back to bfloat16.
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 fmod(const bfloat16& a, const bfloat16& b) {
   const float numerator = float(a);
   const float denominator = float(b);
   return bfloat16(::fmodf(numerator, denominator));
 }
 // Returns b only when b compares strictly less than a (as floats); in every
 // other case, including when the comparison is false due to NaN, returns a.
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 (min)(const bfloat16& a, const bfloat16& b) {
   if (static_cast<float>(b) < static_cast<float>(a)) {
     return b;
   }
   return a;
 }
 // Returns b only when a compares strictly less than b (as floats); in every
 // other case, including when the comparison is false due to NaN, returns a.
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 (max)(const bfloat16& a, const bfloat16& b) {
   if (static_cast<float>(a) < static_cast<float>(b)) {
     return b;
   }
   return a;
 }
 // fmin(a, b): both operands are widened to float and passed to ::fminf; the
 // result is converted back to bfloat16.
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 fmin(const bfloat16& a, const bfloat16& b) {
   return bfloat16(::fminf(static_cast<float>(a), static_cast<float>(b)));
 }
 // fmax(a, b): both operands are widened to float and passed to ::fmaxf; the
 // result is converted back to bfloat16.
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 fmax(const bfloat16& a, const bfloat16& b) {
   return bfloat16(::fmaxf(static_cast<float>(a), static_cast<float>(b)));
 }
 #ifndef EIGEN_NO_IO
 // Streams a bfloat16 by printing its float conversion; returns the stream.
 EIGEN_ALWAYS_INLINE std::ostream& operator << (std::ostream& os, const bfloat16& v) {
   return os << static_cast<float>(v);
 }
 #endif
 } // namespace bfloat16_impl
 namespace internal {
 // Random bfloat16 generation built on std::rand.
 template<>
 struct random_default_impl<bfloat16, false, false>
 {
   // Returns x + (y - x) * t, where t = rand() / RAND_MAX is computed in float
   // and then converted to bfloat16.
   static inline bfloat16 run(const bfloat16& x, const bfloat16& y)
   {
     const bfloat16 unit_sample(float(std::rand()) / float(RAND_MAX));
     return x + (y-x) * unit_sample;
   }
   // Default range: calls the two-argument overload with bounds -1 and 1.
   static inline bfloat16 run()
   {
     const bfloat16 lower(-1.f);
     const bfloat16 upper(1.f);
     return run(lower, upper);
   }
 };
 // Marks bfloat16 as an arithmetic type for the internal is_arithmetic trait.
 template<>
 struct is_arithmetic<bfloat16> {
   enum { value = true };
 };
 } // namespace internal
 // Numeric traits for bfloat16. The constants below are spelled as raw 16-bit
 // patterns (1 sign bit, 8 exponent bits, 7 fraction bits).
 template<> struct NumTraits<Eigen::bfloat16>
     : GenericNumTraits<Eigen::bfloat16>
 {
   enum {
     IsSigned = true,
     IsInteger = false,
     IsComplex = false,
     RequireInitialization = false
   };
   // 0x3C00: biased exponent 120, zero fraction, i.e. 2^-7.
   EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::bfloat16 epsilon() {
     return bfloat16_impl::raw_uint16_to_bfloat16(0x3C00);
   }
   // 0x3D4D: bfloat16(5e-2f).
   EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::bfloat16 dummy_precision() {
     return bfloat16_impl::raw_uint16_to_bfloat16(0x3D4D);
   }
   // 0x7F7F: sign clear, largest non-all-ones exponent, all fraction bits set.
   EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::bfloat16 highest() {
     return bfloat16_impl::raw_uint16_to_bfloat16(0x7F7F);
   }
   // 0xFF7F: same pattern as highest() with the sign bit set.
   EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::bfloat16 lowest() {
     return bfloat16_impl::raw_uint16_to_bfloat16(0xFF7F);
   }
   // 0x7F80: exponent all ones, fraction zero (positive infinity).
   EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::bfloat16 infinity() {
     return bfloat16_impl::raw_uint16_to_bfloat16(0x7F80);
   }
   // 0x7FC0: exponent all ones, top fraction bit set (quiet NaN).
   EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::bfloat16 quiet_NaN() {
     return bfloat16_impl::raw_uint16_to_bfloat16(0x7FC0);
   }
 };
 } // namespace Eigen
 namespace Eigen {
 namespace numext {
 // numext::isnan specialization for bfloat16; forwards to bfloat16_impl::isnan.
 template<>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 bool (isnan)(const Eigen::bfloat16& h) {
   const bool is_nan = (bfloat16_impl::isnan)(h);
   return is_nan;
 }
 // numext::isinf specialization for bfloat16; forwards to bfloat16_impl::isinf.
 template<>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 bool (isinf)(const Eigen::bfloat16& h) {
   const bool is_inf = (bfloat16_impl::isinf)(h);
   return is_inf;
 }
 // numext::isfinite specialization for bfloat16; forwards to
 // bfloat16_impl::isfinite.
 template<>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 bool (isfinite)(const Eigen::bfloat16& h) {
   const bool is_finite = (bfloat16_impl::isfinite)(h);
   return is_finite;
 }
 // bit_cast from a 16-bit pattern to bfloat16: the bits are passed through
 // raw_uint16_to_bfloat16 and the result is used to construct the bfloat16.
 template <>
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bit_cast<Eigen::bfloat16, uint16_t>(const uint16_t& src) {
   Eigen::bfloat16 converted(Eigen::bfloat16_impl::raw_uint16_to_bfloat16(src));
   return converted;
 }
 // bit_cast from bfloat16 to its 16-bit pattern, obtained through
 // raw_bfloat16_as_uint16.
 template <>
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC uint16_t bit_cast<uint16_t, Eigen::bfloat16>(const Eigen::bfloat16& src) {
   const uint16_t bits = Eigen::bfloat16_impl::raw_bfloat16_as_uint16(src);
   return bits;
 }
 }  // namespace numext
 }  // namespace Eigen
 #if EIGEN_HAS_STD_HASH
 namespace std {
 // std::hash support for bfloat16: the hash is the value's 16-bit pattern
 // widened to std::size_t.
 template <>
 struct hash<Eigen::bfloat16> {
   EIGEN_STRONG_INLINE std::size_t operator()(const Eigen::bfloat16& a) const {
     const Eigen::numext::uint16_t bits = Eigen::numext::bit_cast<Eigen::numext::uint16_t>(a);
     return static_cast<std::size_t>(bits);
   }
 };
 } // namespace std
 #endif
 #endif // EIGEN_BFLOAT16_H
--- a/Eigen/src/Core/arch/Default/ConjHelper.h
+++ b/Eigen/src/Core/arch/Default/ConjHelper.h
@@ -12,18 +12,106 @@
 #define EIGEN_ARCH_CONJ_HELPER_H
 // Generates the two conj_helper specializations (both conjugation flags
 // false) for products that mix a real packet type PACKET_REAL with a complex
 // packet type PACKET_CPLX, one for each operand order. In both, pmul
 // multiplies the real packet with the complex packet's `.v` member and wraps
 // the result in PACKET_CPLX, and pmadd adds c to that product.
 #define EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(PACKET_CPLX, PACKET_REAL)      \
-  template<> struct conj_helper<PACKET_REAL, PACKET_CPLX, false,false> {                                          \
+  template <>                                                           \
-    EIGEN_STRONG_INLINE PACKET_CPLX pmadd(const PACKET_REAL& x, const PACKET_CPLX& y, const PACKET_CPLX& c) const \
+  struct conj_helper<PACKET_REAL, PACKET_CPLX, false, false> {          \
-    { return padd(c, pmul(x,y)); }                                                                                \
+    EIGEN_STRONG_INLINE PACKET_CPLX pmadd(const PACKET_REAL& x,         \
-    EIGEN_STRONG_INLINE PACKET_CPLX pmul(const PACKET_REAL& x, const PACKET_CPLX& y) const                        \
+                                          const PACKET_CPLX& y,         \
-    { return PACKET_CPLX(Eigen::internal::pmul<PACKET_REAL>(x, y.v)); }                                           \
+                                          const PACKET_CPLX& c) const { \
       return padd(c, this->pmul(x, y));                                 \
     }                                                                   \
     EIGEN_STRONG_INLINE PACKET_CPLX pmul(const PACKET_REAL& x,          \
                                          const PACKET_CPLX& y) const {  \
       return PACKET_CPLX(Eigen::internal::pmul<PACKET_REAL>(x, y.v));   \
     }                                                                   \
   };                                                                    \
                                                                         \
-  template<> struct conj_helper<PACKET_CPLX, PACKET_REAL, false,false> {                                          \
+  template <>                                                           \
-    EIGEN_STRONG_INLINE PACKET_CPLX pmadd(const PACKET_CPLX& x, const PACKET_REAL& y, const PACKET_CPLX& c) const \
+  struct conj_helper<PACKET_CPLX, PACKET_REAL, false, false> {          \
-    { return padd(c, pmul(x,y)); }                                                                                \
+    EIGEN_STRONG_INLINE PACKET_CPLX pmadd(const PACKET_CPLX& x,         \
-    EIGEN_STRONG_INLINE PACKET_CPLX pmul(const PACKET_CPLX& x, const PACKET_REAL& y) const                        \
+                                          const PACKET_REAL& y,         \
-    { return PACKET_CPLX(Eigen::internal::pmul<PACKET_REAL>(x.v, y)); }                                           \
+                                          const PACKET_CPLX& c) const { \
       return padd(c, this->pmul(x, y));                                 \
     }                                                                   \
     EIGEN_STRONG_INLINE PACKET_CPLX pmul(const PACKET_CPLX& x,          \
                                          const PACKET_REAL& y) const {  \
       return PACKET_CPLX(Eigen::internal::pmul<PACKET_REAL>(x.v, y));   \
     }                                                                   \
   };
 namespace Eigen {
 namespace internal {
 template<bool Conjugate> struct conj_if;
 // Conjugating variant of conj_if: both entry points return the conjugate of
 // their argument by value.
 template<>
 struct conj_if<true> {
   // Scalar entry point, implemented with numext::conj.
   template<typename T>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& x) const {
     return numext::conj(x);
   }
   // Packet entry point, implemented with internal::pconj.
   template<typename T>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T pconj(const T& x) const {
     return internal::pconj(x);
   }
 };
 // Non-conjugating variant of conj_if: both entry points hand back their
 // argument unchanged, by const reference.
 template<>
 struct conj_if<false> {
   // Scalar entry point: returns x itself.
   template<typename T>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& operator()(const T& x) const {
     return x;
   }
   // Packet entry point: returns x itself.
   template<typename T>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& pconj(const T& x) const {
     return x;
   }
 };
 // Generic implementation. It assumes scalar operands, since the packet
 // versions are provided by the specializations below. ConjLhs / ConjRhs
 // select whether the left / right operand is conjugated before multiplying.
 template<typename LhsType, typename RhsType, bool ConjLhs, bool ConjRhs>
 struct conj_helper {
   typedef typename ScalarBinaryOpTraits<LhsType, RhsType>::ReturnType ResultType;
   // Product of x and y, each passed through the matching conj_if first.
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType pmul(const LhsType& x, const RhsType& y) const {
     return conj_if<ConjLhs>()(x) * conj_if<ConjRhs>()(y);
   }
   // Multiply-add: pmul(x, y) + c.
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType pmadd(const LhsType& x, const RhsType& y,
                                                          const ResultType& c) const {
     return this->pmul(x, y) + c;
   }
 };
 // Scalar case in which both operands are conjugated.
 template<typename LhsScalar, typename RhsScalar>
 struct conj_helper<LhsScalar, RhsScalar, true, true> {
   typedef typename ScalarBinaryOpTraits<LhsScalar,RhsScalar>::ReturnType ResultType;
   // We save a conjugation by using the identity conj(a)*conj(b) = conj(a*b).
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType pmul(const LhsScalar& x, const RhsScalar& y) const {
     return numext::conj(x * y);
   }
   // Multiply-add: pmul(x, y) + c.
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType pmadd(const LhsScalar& x, const RhsScalar& y,
                                                          const ResultType& c) const {
     return this->pmul(x, y) + c;
   }
 };
 // Both operands have the same type: use packet operations. Each operand is
 // routed through conj_if<...>::pconj according to its conjugation flag.
 template<typename Packet, bool ConjLhs, bool ConjRhs>
 struct conj_helper<Packet, Packet, ConjLhs, ConjRhs>
 {
   typedef Packet ResultType;
   // Packet product of the (optionally conjugated) operands.
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pmul(const Packet& x, const Packet& y) const {
     return Eigen::internal::pmul(conj_if<ConjLhs>().pconj(x), conj_if<ConjRhs>().pconj(y));
   }
   // Packet multiply-add of the (optionally conjugated) operands with c.
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pmadd(const Packet& x, const Packet& y, const Packet& c) const {
     return Eigen::internal::pmadd(conj_if<ConjLhs>().pconj(x), conj_if<ConjRhs>().pconj(y), c);
   }
 };
 // Packet case in which both operands are conjugated.
 template<typename Packet>
 struct conj_helper<Packet, Packet, true, true>
 {
   typedef Packet ResultType;
   // We save a conjugation by using the identity conj(a)*conj(b) = conj(a*b).
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pmul(const Packet& x, const Packet& y) const {
     return pconj(Eigen::internal::pmul(x, y));
   }
   // Multiply-add on the individually conjugated operands, with c as addend.
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pmadd(const Packet& x, const Packet& y, const Packet& c) const {
     return Eigen::internal::pmadd(pconj(x), pconj(y), c);
   }
 };
 }  // namespace internal
 }  // namespace Eigen
 #endif  // EIGEN_ARCH_CONJ_HELPER_H
--- a/Show More
+++ b/Show More