Merged in ezhulenev/eigen-01 (pull request PR-769)

Capture TensorMap by value inside tensor expression AST
Merged in anshuljl/eigen-2/Anshul-Jaiswal/update-configurevectorizationh-to-not-op-1573079916090 (pull request PR-754)
2026-04-10 11:34:33 +08:00 · 2019-12-04 00:49:10 +00:00 · 2019-12-04 00:45:42 +00:00 · 2019-12-03 16:39:05 -08:00 · 2019-12-03 14:33:19 -08:00 · 2019-12-03 12:08:22 -08:00
1126 changed files with 139061 additions and 28586 deletions
--- a/.hgignore
+++ b/.hgignore
@@ -13,7 +13,7 @@ core
 core.*
 *.bak
 *~
-build*
+*build*
 *.moc.*
 *.moc
 ui_*
@@ -28,7 +28,11 @@ activity.png
 *.rej
 log
 patch
+*.patch
 a
 a.*
 lapack/testing
 lapack/reference
+.*project
+.settings
+Makefile
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,6 +1,6 @@
-project(Eigen)
+project(Eigen3)

-cmake_minimum_required(VERSION 2.8.4)
+cmake_minimum_required(VERSION 2.8.11)

 # guard against in-source builds

@@ -8,6 +8,12 @@ if(${CMAKE_SOURCE_DIR} STREQUAL ${CMAKE_BINARY_DIR})
  message(FATAL_ERROR "In-source builds not allowed. Please make a new directory (called a build directory) and run CMake from there. You may need to remove CMakeCache.txt. ")
 endif()

+
+# Alias Eigen_*_DIR to Eigen3_*_DIR:
+
+set(Eigen_SOURCE_DIR ${Eigen3_SOURCE_DIR})
+set(Eigen_BINARY_DIR ${Eigen3_BINARY_DIR})
+
 # guard against bad build-type strings

 if (NOT CMAKE_BUILD_TYPE)
@@ -23,7 +29,7 @@ endif()


 #############################################################################
-# retrieve version infomation                                               #
+# retrieve version information                                               #
 #############################################################################

 # automatically parse the version number
@@ -36,28 +42,59 @@ string(REGEX MATCH "define[ \t]+EIGEN_MINOR_VERSION[ \t]+([0-9]+)" _eigen_minor_
 set(EIGEN_MINOR_VERSION "${CMAKE_MATCH_1}")
 set(EIGEN_VERSION_NUMBER ${EIGEN_WORLD_VERSION}.${EIGEN_MAJOR_VERSION}.${EIGEN_MINOR_VERSION})

-# if the mercurial program is absent, this will leave the EIGEN_HG_CHANGESET string empty,
-# but won't stop CMake.
-execute_process(COMMAND hg tip -R ${CMAKE_SOURCE_DIR} OUTPUT_VARIABLE EIGEN_HGTIP_OUTPUT)
-execute_process(COMMAND hg branch -R ${CMAKE_SOURCE_DIR} OUTPUT_VARIABLE EIGEN_BRANCH_OUTPUT)
+# only query mercurial if we are in a mercurial clone
+if(IS_DIRECTORY ${CMAKE_SOURCE_DIR}/.hg)
+  # if the mercurial program is absent, this will leave the EIGEN_HG_CHANGESET string empty,
+  # but won't stop CMake.
+  execute_process(COMMAND hg tip -R ${CMAKE_SOURCE_DIR} OUTPUT_VARIABLE EIGEN_HGTIP_OUTPUT)
+  execute_process(COMMAND hg branch -R ${CMAKE_SOURCE_DIR} OUTPUT_VARIABLE EIGEN_BRANCH_OUTPUT)
+endif()

 # if this is the default (aka development) branch, extract the mercurial changeset number from the hg tip output...
 if(EIGEN_BRANCH_OUTPUT MATCHES "default")
 string(REGEX MATCH "^changeset: *[0-9]*:([0-9;a-f]+).*" EIGEN_HG_CHANGESET_MATCH "${EIGEN_HGTIP_OUTPUT}")
 set(EIGEN_HG_CHANGESET "${CMAKE_MATCH_1}")
-endif(EIGEN_BRANCH_OUTPUT MATCHES "default")
+endif()
 #...and show it next to the version number
 if(EIGEN_HG_CHANGESET)
  set(EIGEN_VERSION "${EIGEN_VERSION_NUMBER} (mercurial changeset ${EIGEN_HG_CHANGESET})")
-else(EIGEN_HG_CHANGESET)
+else()
  set(EIGEN_VERSION "${EIGEN_VERSION_NUMBER}")
-endif(EIGEN_HG_CHANGESET)
+endif()


 include(CheckCXXCompilerFlag)
+include(GNUInstallDirs)

 set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)

+
+option(EIGEN_TEST_CXX11 "Enable testing with C++11 and C++11 features (e.g. Tensor module)." OFF)
+
+
+macro(ei_add_cxx_compiler_flag FLAG)
+  string(REGEX REPLACE "-" "" SFLAG1 ${FLAG})
+  string(REGEX REPLACE "\\+" "p" SFLAG ${SFLAG1})
+  check_cxx_compiler_flag(${FLAG} COMPILER_SUPPORT_${SFLAG})
+  if(COMPILER_SUPPORT_${SFLAG})
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLAG}")
+  endif()
+endmacro()
+
+check_cxx_compiler_flag("-std=c++11" EIGEN_COMPILER_SUPPORT_CPP11)
+
+if(EIGEN_TEST_CXX11)
+  set(CMAKE_CXX_STANDARD 11)
+  set(CMAKE_CXX_EXTENSIONS OFF)
+  if(EIGEN_COMPILER_SUPPORT_CPP11)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
+  endif()
+else()
+  #set(CMAKE_CXX_STANDARD 03)
+  #set(CMAKE_CXX_EXTENSIONS OFF)
+  ei_add_cxx_compiler_flag("-std=c++03")
+endif()
+
 #############################################################################
 # find how to link to the standard libraries                                #
 #############################################################################
@@ -92,11 +129,13 @@ else()
 endif()

 option(EIGEN_BUILD_BTL "Build benchmark suite" OFF)
-if(NOT WIN32)
-  option(EIGEN_BUILD_PKGCONFIG "Build pkg-config .pc file for Eigen" ON)
-endif(NOT WIN32)

-set(CMAKE_INCLUDE_CURRENT_DIR ON)
+# Disable pkgconfig only for native Windows builds
+if(NOT WIN32 OR NOT CMAKE_HOST_SYSTEM_NAME MATCHES Windows)
+  option(EIGEN_BUILD_PKGCONFIG "Build pkg-config .pc file for Eigen" ON)
+endif()
+
+set(CMAKE_INCLUDE_CURRENT_DIR OFF)

 option(EIGEN_SPLIT_LARGE_TESTS "Split large tests into smaller executables" ON)

@@ -107,34 +146,20 @@ endif()

 set(EIGEN_TEST_MAX_SIZE "320" CACHE STRING "Maximal matrix/vector size, default is 320")

-macro(ei_add_cxx_compiler_flag FLAG)
-  string(REGEX REPLACE "-" "" SFLAG1 ${FLAG})
-  string(REGEX REPLACE "\\+" "p" SFLAG ${SFLAG1})
-  check_cxx_compiler_flag(${FLAG} COMPILER_SUPPORT_${SFLAG})
-  if(COMPILER_SUPPORT_${SFLAG})
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLAG}")
-  endif()
-endmacro(ei_add_cxx_compiler_flag)
-
 if(NOT MSVC)
  # We assume that other compilers are partly compatible with GNUCC
-  
-#  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fexceptions")
-  set(CMAKE_CXX_FLAGS_DEBUG "-g3")
-  set(CMAKE_CXX_FLAGS_RELEASE "-g0 -O2")
-  
-  # clang outputs some warnings for unknwon flags that are not caught by check_cxx_compiler_flag
+
+  # clang outputs some warnings for unknown flags that are not caught by check_cxx_compiler_flag
  # adding -Werror turns such warnings into errors
  check_cxx_compiler_flag("-Werror" COMPILER_SUPPORT_WERROR)
  if(COMPILER_SUPPORT_WERROR)
    set(CMAKE_REQUIRED_FLAGS "-Werror")
  endif()
-  
  ei_add_cxx_compiler_flag("-pedantic")
  ei_add_cxx_compiler_flag("-Wall")
  ei_add_cxx_compiler_flag("-Wextra")
  #ei_add_cxx_compiler_flag("-Weverything")              # clang
-  
+
  ei_add_cxx_compiler_flag("-Wundef")
  ei_add_cxx_compiler_flag("-Wcast-align")
  ei_add_cxx_compiler_flag("-Wchar-subscripts")
@@ -144,31 +169,30 @@ if(NOT MSVC)
  ei_add_cxx_compiler_flag("-Wwrite-strings")
  ei_add_cxx_compiler_flag("-Wformat-security")
  ei_add_cxx_compiler_flag("-Wshorten-64-to-32")
+  ei_add_cxx_compiler_flag("-Wlogical-op")
  ei_add_cxx_compiler_flag("-Wenum-conversion")
  ei_add_cxx_compiler_flag("-Wc++11-extensions")
-  
-  # -Wshadow is insanely too strict with gcc, hopefully it will become usable with gcc 6
-  # if(NOT CMAKE_COMPILER_IS_GNUCXX OR (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "5.0.0"))
-  if(NOT CMAKE_COMPILER_IS_GNUCXX)
-    ei_add_cxx_compiler_flag("-Wshadow")
-  endif()
-  
+  ei_add_cxx_compiler_flag("-Wdouble-promotion")
+#  ei_add_cxx_compiler_flag("-Wconversion")
+
+  ei_add_cxx_compiler_flag("-Wshadow")
+
  ei_add_cxx_compiler_flag("-Wno-psabi")
  ei_add_cxx_compiler_flag("-Wno-variadic-macros")
  ei_add_cxx_compiler_flag("-Wno-long-long")
-  
+
  ei_add_cxx_compiler_flag("-fno-check-new")
  ei_add_cxx_compiler_flag("-fno-common")
  ei_add_cxx_compiler_flag("-fstrict-aliasing")
  ei_add_cxx_compiler_flag("-wd981")                    # disable ICC's "operands are evaluated in unspecified order" remark
-  ei_add_cxx_compiler_flag("-wd2304")                   # disbale ICC's "warning #2304: non-explicit constructor with single argument may cause implicit type conversion" produced by -Wnon-virtual-dtor
-  
-  
+  ei_add_cxx_compiler_flag("-wd2304")                   # disable ICC's "warning #2304: non-explicit constructor with single argument may cause implicit type conversion" produced by -Wnon-virtual-dtor
+
+
  # The -ansi flag must be added last, otherwise it is also used as a linker flag by check_cxx_compiler_flag making it fails
  # Moreover we should not set both -strict-ansi and -ansi
  check_cxx_compiler_flag("-strict-ansi" COMPILER_SUPPORT_STRICTANSI)
  ei_add_cxx_compiler_flag("-Qunused-arguments")        # disable clang warning: argument unused during compilation: '-ansi'
-  
+
  if(COMPILER_SUPPORT_STRICTANSI)
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -strict-ansi")
  else()
@@ -179,7 +203,7 @@ if(NOT MSVC)
    ei_add_cxx_compiler_flag("-pie")
    ei_add_cxx_compiler_flag("-fPIE")
  endif()
-  
+
  set(CMAKE_REQUIRED_FLAGS "")

  option(EIGEN_TEST_SSE2 "Enable/Disable SSE2 in tests/examples" OFF)
@@ -224,6 +248,21 @@ if(NOT MSVC)
    message(STATUS "Enabling FMA in tests/examples")
  endif()

+  option(EIGEN_TEST_AVX512 "Enable/Disable AVX512 in tests/examples" OFF)
+  if(EIGEN_TEST_AVX512)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512f -mfma -DEIGEN_ENABLE_AVX512")
+    if (NOT "${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
+      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fabi-version=6")
+    endif()
+    message(STATUS "Enabling AVX512 in tests/examples")
+  endif()
+
+  option(EIGEN_TEST_F16C "Enable/Disable F16C in tests/examples" OFF)
+  if(EIGEN_TEST_F16C)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mf16c")
+    message(STATUS "Enabling F16C in tests/examples")
+  endif()
+
  option(EIGEN_TEST_ALTIVEC "Enable/Disable AltiVec in tests/examples" OFF)
  if(EIGEN_TEST_ALTIVEC)
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -maltivec -mabi=altivec")
@@ -236,6 +275,12 @@ if(NOT MSVC)
    message(STATUS "Enabling VSX in tests/examples")
  endif()

+  option(EIGEN_TEST_MSA "Enable/Disable MSA in tests/examples" OFF)
+  if(EIGEN_TEST_MSA)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mmsa")
+    message(STATUS "Enabling MSA in tests/examples")
+  endif()
+
  option(EIGEN_TEST_NEON "Enable/Disable Neon in tests/examples" OFF)
  if(EIGEN_TEST_NEON)
    if(EIGEN_TEST_FMA)
@@ -243,7 +288,7 @@ if(NOT MSVC)
    else()
      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfpu=neon")
    endif()
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfloat-abi=softfp")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfloat-abi=hard")
    message(STATUS "Enabling NEON in tests/examples")
  endif()

@@ -253,7 +298,17 @@ if(NOT MSVC)
    message(STATUS "Enabling NEON in tests/examples")
  endif()

+  option(EIGEN_TEST_Z13 "Enable/Disable S390X(zEC13) ZVECTOR in tests/examples" OFF)
+  if(EIGEN_TEST_Z13)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=z13 -mzvector")
+    message(STATUS "Enabling S390X(zEC13) ZVECTOR in tests/examples")
+  endif()

+  option(EIGEN_TEST_Z14 "Enable/Disable S390X(zEC14) ZVECTOR in tests/examples" OFF)
+  if(EIGEN_TEST_Z14)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=z14 -mzvector")
+    message(STATUS "Enabling S390X(zEC14) ZVECTOR in tests/examples")
+  endif()

  check_cxx_compiler_flag("-fopenmp" COMPILER_SUPPORT_OPENMP)
  if(COMPILER_SUPPORT_OPENMP)
@@ -264,7 +319,7 @@ if(NOT MSVC)
    endif()
  endif()

-else(NOT MSVC)
+else()

  # C4127 - conditional expression is constant
  # C4714 - marked as __forceinline not inlined (I failed to deactivate it selectively)
@@ -272,7 +327,7 @@ else(NOT MSVC)
  #         because we are oftentimes returning objects that have a destructor or may
  #         throw exceptions - in particular in the unit tests we are throwing extra many
  #         exceptions to cover indexing errors.
-  # C4505 - unreferenced local function has been removed (impossible to deactive selectively)
+  # C4505 - unreferenced local function has been removed (impossible to deactivate selectively)
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /EHsc /wd4127 /wd4505 /wd4714")

  # replace all /Wx by /W4
@@ -292,10 +347,23 @@ else(NOT MSVC)
    if(NOT CMAKE_CL_64)
      # arch is not supported on 64 bit systems, SSE is enabled automatically.
      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:SSE2")
-    endif(NOT CMAKE_CL_64)
+    endif()
    message(STATUS "Enabling SSE2 in tests/examples")
-  endif(EIGEN_TEST_SSE2)
-endif(NOT MSVC)
+  endif()
+
+  option(EIGEN_TEST_AVX "Enable/Disable AVX in tests/examples" OFF)
+  if(EIGEN_TEST_AVX)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX")
+    message(STATUS "Enabling AVX in tests/examples")
+  endif()
+
+  option(EIGEN_TEST_FMA "Enable/Disable FMA/AVX2 in tests/examples" OFF)
+  if(EIGEN_TEST_FMA AND NOT EIGEN_TEST_NEON)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2")
+    message(STATUS "Enabling FMA/AVX2 in tests/examples")
+  endif()
+
+endif()

 option(EIGEN_TEST_NO_EXPLICIT_VECTORIZATION "Disable explicit vectorization in tests/examples" OFF)
 option(EIGEN_TEST_X87 "Force using X87 instructions. Implies no vectorization." OFF)
@@ -337,28 +405,33 @@ if(EIGEN_TEST_NO_EXCEPTIONS)
  message(STATUS "Disabling exceptions in tests/examples")
 endif()

-option(EIGEN_TEST_CXX11 "Enable testing with C++11 and C++11 features (e.g. Tensor module)." OFF)
+set(EIGEN_CUDA_COMPUTE_ARCH 30 CACHE STRING "The CUDA compute architecture level to target when compiling CUDA code")

-include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR})
+include_directories(${CMAKE_CURRENT_SOURCE_DIR})

-# the user modifiable install path for header files
-set(EIGEN_INCLUDE_INSTALL_DIR ${EIGEN_INCLUDE_INSTALL_DIR} CACHE PATH "The directory where we install the header files (optional)")
-
-# set the internal install path for header files which depends on wether the user modifiable
-# EIGEN_INCLUDE_INSTALL_DIR has been set by the user or not.
+# Backward compatibility support for EIGEN_INCLUDE_INSTALL_DIR
 if(EIGEN_INCLUDE_INSTALL_DIR)
-  set(INCLUDE_INSTALL_DIR
-    ${EIGEN_INCLUDE_INSTALL_DIR}
-    CACHE INTERNAL
-    "The directory where we install the header files (internal)"
-  )
+  message(WARNING "EIGEN_INCLUDE_INSTALL_DIR is deprecated. Use INCLUDE_INSTALL_DIR instead.")
+endif()
+
+if(EIGEN_INCLUDE_INSTALL_DIR AND NOT INCLUDE_INSTALL_DIR)
+  set(INCLUDE_INSTALL_DIR ${EIGEN_INCLUDE_INSTALL_DIR}
+      CACHE PATH "The directory relative to CMAKE_INSTALL_PREFIX where Eigen header files are installed")
 else()
  set(INCLUDE_INSTALL_DIR
-    "${CMAKE_INSTALL_PREFIX}/include/eigen3"
-    CACHE INTERNAL
-    "The directory where we install the header files (internal)"
-  )
+      "${CMAKE_INSTALL_INCLUDEDIR}/eigen3"
+      CACHE PATH "The directory relative to CMAKE_INSTALL_PREFIX where Eigen header files are installed"
+      )
 endif()
+set(CMAKEPACKAGE_INSTALL_DIR
+    "${CMAKE_INSTALL_DATADIR}/eigen3/cmake"
+    CACHE PATH "The directory relative to CMAKE_INSTALL_PREFIX where Eigen3Config.cmake is installed"
+    )
+set(PKGCONFIG_INSTALL_DIR
+    "${CMAKE_INSTALL_DATADIR}/pkgconfig"
+    CACHE PATH "The directory relative to CMAKE_INSTALL_PREFIX where eigen3.pc is installed"
+    )
+

 # similar to set_target_properties but append the property instead of overwriting it
 macro(ei_add_target_property target prop value)
@@ -367,9 +440,9 @@ macro(ei_add_target_property target prop value)
  # if the property wasn't previously set, ${previous} is now "previous-NOTFOUND" which cmake allows catching with plain if()
  if(NOT previous)
    set(previous "")
-  endif(NOT previous)
+  endif()
  set_target_properties(${target} PROPERTIES ${prop} "${previous} ${value}")
-endmacro(ei_add_target_property)
+endmacro()

 install(FILES
  signature_of_eigen3_matrix_library
@@ -377,38 +450,27 @@ install(FILES
  )

 if(EIGEN_BUILD_PKGCONFIG)
-    SET(path_separator ":")
-    STRING(REPLACE ${path_separator} ";" pkg_config_libdir_search "$ENV{PKG_CONFIG_LIBDIR}")
-    message(STATUS "searching for 'pkgconfig' directory in PKG_CONFIG_LIBDIR ( $ENV{PKG_CONFIG_LIBDIR} ), ${CMAKE_INSTALL_PREFIX}/share, and ${CMAKE_INSTALL_PREFIX}/lib")
-    FIND_PATH(pkg_config_libdir pkgconfig ${pkg_config_libdir_search} ${CMAKE_INSTALL_PREFIX}/share ${CMAKE_INSTALL_PREFIX}/lib ${pkg_config_libdir_search})
-    if(pkg_config_libdir)
-        SET(pkg_config_install_dir ${pkg_config_libdir})
-        message(STATUS "found ${pkg_config_libdir}/pkgconfig" )
-    else(pkg_config_libdir)
-        SET(pkg_config_install_dir ${CMAKE_INSTALL_PREFIX}/share)
-        message(STATUS "pkgconfig not found; installing in ${pkg_config_install_dir}" )
-    endif(pkg_config_libdir)
-
-    configure_file(eigen3.pc.in eigen3.pc)
+    configure_file(eigen3.pc.in eigen3.pc @ONLY)
    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/eigen3.pc
-        DESTINATION ${pkg_config_install_dir}/pkgconfig
+        DESTINATION ${PKGCONFIG_INSTALL_DIR}
        )
-endif(EIGEN_BUILD_PKGCONFIG)
+endif()

-add_subdirectory(Eigen)
+install(DIRECTORY Eigen DESTINATION ${INCLUDE_INSTALL_DIR} COMPONENT Devel)

 add_subdirectory(doc EXCLUDE_FROM_ALL)

-include(EigenConfigureTesting)
+option(BUILD_TESTING "Enable creation of Eigen tests." ON)
+if(BUILD_TESTING)
+  include(EigenConfigureTesting)

-# fixme, not sure this line is still needed:
-enable_testing() # must be called from the root CMakeLists, see man page
+  if(EIGEN_LEAVE_TEST_IN_ALL_TARGET)
+    add_subdirectory(test) # can't do EXCLUDE_FROM_ALL here, breaks CTest
+  else()
+    add_subdirectory(test EXCLUDE_FROM_ALL)
+  endif()

-
-if(EIGEN_LEAVE_TEST_IN_ALL_TARGET)
-  add_subdirectory(test) # can't do EXCLUDE_FROM_ALL here, breaks CTest
-else()
-  add_subdirectory(test EXCLUDE_FROM_ALL)
+  add_subdirectory(failtest)
 endif()

 if(EIGEN_LEAVE_TEST_IN_ALL_TARGET)
@@ -419,6 +481,35 @@ else()
  add_subdirectory(lapack EXCLUDE_FROM_ALL)
 endif()

+# add SYCL
+option(EIGEN_TEST_SYCL "Add Sycl support." OFF)
+option(EIGEN_SYCL_TRISYCL "Use the triSYCL Sycl implementation (ComputeCPP by default)." OFF)
+if(EIGEN_TEST_SYCL)
+  set (CMAKE_MODULE_PATH "${CMAKE_ROOT}/Modules" "cmake/Modules/" "${CMAKE_MODULE_PATH}")
+  if(EIGEN_SYCL_TRISYCL)
+    message(STATUS "Using triSYCL")
+    include(FindTriSYCL)
+  else()
+    message(STATUS "Using ComputeCPP SYCL")
+    include(FindComputeCpp)
+    set(COMPUTECPP_DRIVER_DEFAULT_VALUE OFF)
+    if (NOT MSVC)
+      set(COMPUTECPP_DRIVER_DEFAULT_VALUE ON)
+    endif()
+    option(COMPUTECPP_USE_COMPILER_DRIVER
+      "Use ComputeCpp driver instead of a 2 steps compilation"
+      ${COMPUTECPP_DRIVER_DEFAULT_VALUE}
+    )
+  endif()
+  option(EIGEN_DONT_VECTORIZE_SYCL "Don't use vectorisation in the SYCL tests." OFF)
+  if(EIGEN_DONT_VECTORIZE_SYCL)
+    message(STATUS "Disabling SYCL vectorization in tests/examples")
+    # When disabling SYCL vectorization, also disable Eigen default vectorization
+    add_definitions(-DEIGEN_DONT_VECTORIZE=1)
+    add_definitions(-DEIGEN_DONT_VECTORIZE_SYCL=1)
+  endif()
+endif()
+
 add_subdirectory(unsupported)

 add_subdirectory(demos EXCLUDE_FROM_ALL)
@@ -429,40 +520,42 @@ add_subdirectory(scripts EXCLUDE_FROM_ALL)
 # TODO: consider also replacing EIGEN_BUILD_BTL by a custom target "make btl"?
 if(EIGEN_BUILD_BTL)
  add_subdirectory(bench/btl EXCLUDE_FROM_ALL)
-endif(EIGEN_BUILD_BTL)
+endif()

 if(NOT WIN32)
  add_subdirectory(bench/spbench EXCLUDE_FROM_ALL)
-endif(NOT WIN32)
+endif()

 configure_file(scripts/cdashtesting.cmake.in cdashtesting.cmake @ONLY)

-ei_testing_print_summary()
+if(BUILD_TESTING)
+  ei_testing_print_summary()
+endif()

 message(STATUS "")
 message(STATUS "Configured Eigen ${EIGEN_VERSION_NUMBER}")
 message(STATUS "")

-option(EIGEN_FAILTEST "Enable failtests." OFF)
-if(EIGEN_FAILTEST)
-  add_subdirectory(failtest)
-endif()
-
 string(TOLOWER "${CMAKE_GENERATOR}" cmake_generator_tolower)
 if(cmake_generator_tolower MATCHES "makefile")
  message(STATUS "Some things you can do now:")
  message(STATUS "--------------+--------------------------------------------------------------")
  message(STATUS "Command       |   Description")
  message(STATUS "--------------+--------------------------------------------------------------")
-  message(STATUS "make install  | Install to ${CMAKE_INSTALL_PREFIX}. To change that:")
-  message(STATUS "              |     cmake . -DCMAKE_INSTALL_PREFIX=yourpath")
-  message(STATUS "              |   Eigen headers will then be installed to:")
-  message(STATUS "              |     ${INCLUDE_INSTALL_DIR}")
-  message(STATUS "              |   To install Eigen headers to a separate location, do:")
-  message(STATUS "              |     cmake . -DEIGEN_INCLUDE_INSTALL_DIR=yourpath")
+  message(STATUS "make install  | Install Eigen. Headers will be installed to:")
+  message(STATUS "              |     <CMAKE_INSTALL_PREFIX>/<INCLUDE_INSTALL_DIR>")
+  message(STATUS "              |   Using the following values:")
+  message(STATUS "              |     CMAKE_INSTALL_PREFIX: ${CMAKE_INSTALL_PREFIX}")
+  message(STATUS "              |     INCLUDE_INSTALL_DIR:  ${INCLUDE_INSTALL_DIR}")
+  message(STATUS "              |   Change the install location of Eigen headers using:")
+  message(STATUS "              |     cmake . -DCMAKE_INSTALL_PREFIX=yourprefix")
+  message(STATUS "              |   Or:")
+  message(STATUS "              |     cmake . -DINCLUDE_INSTALL_DIR=yourdir")
  message(STATUS "make doc      | Generate the API documentation, requires Doxygen & LaTeX")
-  message(STATUS "make check    | Build and run the unit-tests. Read this page:")
-  message(STATUS "              |   http://eigen.tuxfamily.org/index.php?title=Tests")
+  if(BUILD_TESTING)
+    message(STATUS "make check    | Build and run the unit-tests. Read this page:")
+    message(STATUS "              |   http://eigen.tuxfamily.org/index.php?title=Tests")
+  endif()
  message(STATUS "make blas     | Build BLAS library (not the same thing as Eigen)")
  message(STATUS "make uninstall| Removes files installed by make install")
  message(STATUS "--------------+--------------------------------------------------------------")
@@ -473,33 +566,96 @@ endif()

 message(STATUS "")

-set ( EIGEN_CONFIG_CMAKE_PATH
-      lib${LIB_SUFFIX}/cmake/eigen3
-      CACHE PATH "The directory where the CMake files are installed"
-    )
-if ( NOT IS_ABSOLUTE EIGEN_CONFIG_CMAKE_PATH )
-  set ( EIGEN_CONFIG_CMAKE_PATH ${CMAKE_INSTALL_PREFIX}/${EIGEN_CONFIG_CMAKE_PATH} )
-endif ()

-set ( EIGEN_USE_FILE ${EIGEN_CONFIG_CMAKE_PATH}/UseEigen3.cmake )
 set ( EIGEN_VERSION_STRING ${EIGEN_VERSION_NUMBER} )
 set ( EIGEN_VERSION_MAJOR  ${EIGEN_WORLD_VERSION} )
 set ( EIGEN_VERSION_MINOR  ${EIGEN_MAJOR_VERSION} )
 set ( EIGEN_VERSION_PATCH  ${EIGEN_MINOR_VERSION} )
 set ( EIGEN_DEFINITIONS "")
-set ( EIGEN_INCLUDE_DIR ${INCLUDE_INSTALL_DIR} )
-set ( EIGEN_INCLUDE_DIRS ${EIGEN_INCLUDE_DIR} )
+set ( EIGEN_INCLUDE_DIR "${CMAKE_INSTALL_PREFIX}/${INCLUDE_INSTALL_DIR}" )
 set ( EIGEN_ROOT_DIR ${CMAKE_INSTALL_PREFIX} )

-configure_file ( ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Eigen3Config.cmake.in
-                 ${CMAKE_CURRENT_BINARY_DIR}/Eigen3Config.cmake
-                 @ONLY ESCAPE_QUOTES
-               )
+# Interface libraries require at least CMake 3.0
+if (NOT CMAKE_VERSION VERSION_LESS 3.0)
+  include (CMakePackageConfigHelpers)
+
+  # Imported target support
+  add_library (eigen INTERFACE)
+  add_library (Eigen3::Eigen ALIAS eigen)
+  target_compile_definitions (eigen INTERFACE ${EIGEN_DEFINITIONS})
+  target_include_directories (eigen INTERFACE
+    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
+    $<INSTALL_INTERFACE:${INCLUDE_INSTALL_DIR}>
+  )
+
+  # Export as title case Eigen
+  set_target_properties (eigen PROPERTIES EXPORT_NAME Eigen)
+
+  install (TARGETS eigen EXPORT Eigen3Targets)
+
+  configure_package_config_file (
+    ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Eigen3Config.cmake.in
+    ${CMAKE_CURRENT_BINARY_DIR}/Eigen3Config.cmake
+    PATH_VARS EIGEN_INCLUDE_DIR EIGEN_ROOT_DIR
+    INSTALL_DESTINATION ${CMAKEPACKAGE_INSTALL_DIR}
+    NO_CHECK_REQUIRED_COMPONENTS_MACRO # Eigen does not provide components
+  )
+  # Remove CMAKE_SIZEOF_VOID_P from Eigen3ConfigVersion.cmake since Eigen does
+  # not depend on architecture specific settings or libraries. More
+  # specifically, an Eigen3Config.cmake generated from a 64 bit target can be
+  # used for 32 bit targets as well (and vice versa).
+  set (_Eigen3_CMAKE_SIZEOF_VOID_P ${CMAKE_SIZEOF_VOID_P})
+  unset (CMAKE_SIZEOF_VOID_P)
+  write_basic_package_version_file (Eigen3ConfigVersion.cmake
+                                    VERSION ${EIGEN_VERSION_NUMBER}
+                                    COMPATIBILITY SameMajorVersion)
+  set (CMAKE_SIZEOF_VOID_P ${_Eigen3_CMAKE_SIZEOF_VOID_P})
+
+  # The Eigen target will be located in the Eigen3 namespace. Other CMake
+  # targets can refer to it using Eigen3::Eigen.
+  export (TARGETS eigen NAMESPACE Eigen3:: FILE Eigen3Targets.cmake)
+  # Export Eigen3 package to CMake registry such that it can be easily found by
+  # CMake even if it has not been installed to a standard directory.
+  export (PACKAGE Eigen3)
+
+  install (EXPORT Eigen3Targets NAMESPACE Eigen3:: DESTINATION ${CMAKEPACKAGE_INSTALL_DIR})
+
+else ()
+  # Fallback to legacy Eigen3Config.cmake without the imported target
+
+  # If CMakePackageConfigHelpers module is available (CMake >= 2.8.8)
+  # create a relocatable Config file, otherwise leave the hardcoded paths
+  include(CMakePackageConfigHelpers OPTIONAL RESULT_VARIABLE CPCH_PATH)
+
+  if(CPCH_PATH)
+    configure_package_config_file (
+      ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Eigen3ConfigLegacy.cmake.in
+      ${CMAKE_CURRENT_BINARY_DIR}/Eigen3Config.cmake
+      PATH_VARS EIGEN_INCLUDE_DIR EIGEN_ROOT_DIR
+      INSTALL_DESTINATION ${CMAKEPACKAGE_INSTALL_DIR}
+      NO_CHECK_REQUIRED_COMPONENTS_MACRO # Eigen does not provide components
+    )
+  else()
+    # The PACKAGE_* variables are defined by the configure_package_config_file
+    # but without it we define them manually to the hardcoded paths
+    set(PACKAGE_INIT "")
+    set(PACKAGE_EIGEN_INCLUDE_DIR ${EIGEN_INCLUDE_DIR})
+    set(PACKAGE_EIGEN_ROOT_DIR ${EIGEN_ROOT_DIR})
+    configure_file ( ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Eigen3ConfigLegacy.cmake.in
+                     ${CMAKE_CURRENT_BINARY_DIR}/Eigen3Config.cmake
+                     @ONLY ESCAPE_QUOTES )
+  endif()
+
+  write_basic_package_version_file( Eigen3ConfigVersion.cmake
+                                    VERSION ${EIGEN_VERSION_NUMBER}
+                                    COMPATIBILITY SameMajorVersion )
+
+endif ()

 install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/cmake/UseEigen3.cmake
                ${CMAKE_CURRENT_BINARY_DIR}/Eigen3Config.cmake
-          DESTINATION ${EIGEN_CONFIG_CMAKE_PATH}
-        )
+                ${CMAKE_CURRENT_BINARY_DIR}/Eigen3ConfigVersion.cmake
+          DESTINATION ${CMAKEPACKAGE_INSTALL_DIR} )

 # Add uninstall target
 add_custom_target ( uninstall
--- a/CTestConfig.cmake
+++ b/CTestConfig.cmake
@@ -2,16 +2,16 @@
 ## Then modify the CMakeLists.txt file in the root directory of your
 ## project to incorporate the testing dashboard.
 ## # The following are required to uses Dart and the Cdash dashboard
-##   ENABLE_TESTING()
-##   INCLUDE(CTest)
+##   enable_testing()
+##   include(CTest)
 set(CTEST_PROJECT_NAME "Eigen")
 set(CTEST_NIGHTLY_START_TIME "00:00:00 UTC")

 set(CTEST_DROP_METHOD "http")
-set(CTEST_DROP_SITE "manao.inria.fr")
-set(CTEST_DROP_LOCATION "/CDash/submit.php?project=Eigen")
+set(CTEST_DROP_SITE "my.cdash.org")
+set(CTEST_DROP_LOCATION "/submit.php?project=Eigen")
 set(CTEST_DROP_SITE_CDASH TRUE)
-set(CTEST_PROJECT_SUBPROJECTS
-Official
-Unsupported
-)
+#set(CTEST_PROJECT_SUBPROJECTS
+#Official
+#Unsupported
+#)
--- a/CTestCustom.cmake.in
+++ b/CTestCustom.cmake.in
@@ -1,3 +1,4 @@

 set(CTEST_CUSTOM_MAXIMUM_NUMBER_OF_WARNINGS "2000")
 set(CTEST_CUSTOM_MAXIMUM_NUMBER_OF_ERRORS   "2000")
+list(APPEND CTEST_CUSTOM_ERROR_EXCEPTION    @EIGEN_CTEST_ERROR_EXCEPTION@)
--- a/Eigen/CMakeLists.txt
+++ b/Eigen/CMakeLists.txt
@@ -1,19 +0,0 @@
-include(RegexUtils)
-test_escape_string_as_regex()
-
-file(GLOB Eigen_directory_files "*")
-
-escape_string_as_regex(ESCAPED_CMAKE_CURRENT_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}")
-
-foreach(f ${Eigen_directory_files})
-  if(NOT f MATCHES "\\.txt" AND NOT f MATCHES "${ESCAPED_CMAKE_CURRENT_SOURCE_DIR}/[.].+" AND NOT f MATCHES "${ESCAPED_CMAKE_CURRENT_SOURCE_DIR}/src")
-    list(APPEND Eigen_directory_files_to_install ${f})
-  endif()
-endforeach(f ${Eigen_directory_files})
-
-install(FILES
-  ${Eigen_directory_files_to_install}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen COMPONENT Devel
-  )
-
-add_subdirectory(src)
--- a/Eigen/Cholesky
+++ b/Eigen/Cholesky
@@ -1,7 +1,15 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_CHOLESKY_MODULE_H
 #define EIGEN_CHOLESKY_MODULE_H

 #include "Core"
+#include "Jacobi"

 #include "src/Core/util/DisableStupidWarnings.h"

@@ -24,7 +32,12 @@
 #include "src/Cholesky/LLT.h"
 #include "src/Cholesky/LDLT.h"
 #ifdef EIGEN_USE_LAPACKE
-#include "src/Cholesky/LLT_MKL.h"
+#ifdef EIGEN_USE_MKL
+#include "mkl_lapacke.h"
+#else
+#include "src/misc/lapacke.h"
+#endif
+#include "src/Cholesky/LLT_LAPACKE.h"
 #endif

 #include "src/Core/util/ReenableStupidWarnings.h"
--- a/Eigen/CholmodSupport
+++ b/Eigen/CholmodSupport
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_CHOLMODSUPPORT_MODULE_H
 #define EIGEN_CHOLMODSUPPORT_MODULE_H

@@ -12,7 +19,7 @@ extern "C" {
 /** \ingroup Support_modules
  * \defgroup CholmodSupport_Module CholmodSupport module
  *
-  * This module provides an interface to the Cholmod library which is part of the <a href="http://www.cise.ufl.edu/research/sparse/SuiteSparse/">suitesparse</a> package.
+  * This module provides an interface to the Cholmod library which is part of the <a href="http://www.suitesparse.com">suitesparse</a> package.
  * It provides the two following main factorization classes:
  * - class CholmodSupernodalLLT: a supernodal LLT Cholesky factorization.
  * - class CholmodDecomposiiton: a general L(D)LT Cholesky factorization with automatic or explicit runtime selection of the underlying factorization method (supernodal or simplicial).
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -14,53 +14,26 @@
 // first thing Eigen does: stop the compiler from committing suicide
 #include "src/Core/util/DisableStupidWarnings.h"

-// Handle NVCC/CUDA
-#ifdef __CUDACC__
-  // Do not try asserts on CUDA!
-  #ifndef EIGEN_NO_DEBUG
-  #define EIGEN_NO_DEBUG
-  #endif
+// then include this file where all our macros are defined. It's really important to do it first because
+// it's where we do all the compiler/OS/arch detections and define most defaults.
+#include "src/Core/util/Macros.h"

-  #ifdef EIGEN_INTERNAL_DEBUGGING
-  #undef EIGEN_INTERNAL_DEBUGGING
-  #endif
+// This detects SSE/AVX/NEON/etc. and configures alignment settings
+#include "src/Core/util/ConfigureVectorization.h"

-  // Do not try to vectorize on CUDA!
-  #ifndef EIGEN_DONT_VECTORIZE
-  #define EIGEN_DONT_VECTORIZE
-  #endif
-
-  #ifdef EIGEN_EXCEPTIONS
-  #undef EIGEN_EXCEPTIONS
-  #endif
-  
-  // All functions callable from CUDA code must be qualified with __device__
-  #define EIGEN_DEVICE_FUNC __host__ __device__
-  
-#else
-  #define EIGEN_DEVICE_FUNC
-  
+// We need cuda_runtime.h/hip_runtime.h to ensure that
+// the EIGEN_USING_STD_MATH macro works properly on the device side
+#if defined(EIGEN_CUDACC)
+  #include <cuda_runtime.h>
+#elif defined(EIGEN_HIPCC)
+  #include <hip/hip_runtime.h>
 #endif

-#if defined(__CUDA_ARCH__)
-  #define EIGEN_USING_STD_MATH(FUNC) using ::FUNC;
-#else
-  #define EIGEN_USING_STD_MATH(FUNC) using std::FUNC;
-#endif
-
-#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(__CUDA_ARCH__) && !defined(EIGEN_EXCEPTIONS)
-  #define EIGEN_EXCEPTIONS
-#endif

 #ifdef EIGEN_EXCEPTIONS
  #include <new>
 #endif

-// then include this file where all our macros are defined. It's really important to do it first because
-// it's where we do all the alignment settings (platform detection and honoring the user's will if he
-// defined e.g. EIGEN_DONT_ALIGN) so it needs to be done before we do anything with vectorization.
-#include "src/Core/util/Macros.h"
-
 // Disable the ipa-cp-clone optimization flag with MinGW 6.x or newer (enabled by default with -O3)
 // See http://eigen.tuxfamily.org/bz/show_bug.cgi?id=556 for details.
 #if EIGEN_COMP_MINGW && EIGEN_GNUC_AT_LEAST(4,6)
@@ -73,133 +46,9 @@
 // and inclusion of their respective header files
 #include "src/Core/util/MKL_support.h"

-// if alignment is disabled, then disable vectorization. Note: EIGEN_MAX_ALIGN_BYTES is the proper check, it takes into
-// account both the user's will (EIGEN_MAX_ALIGN_BYTES,EIGEN_DONT_ALIGN) and our own platform checks
-#if EIGEN_MAX_ALIGN_BYTES==0
-  #ifndef EIGEN_DONT_VECTORIZE
-    #define EIGEN_DONT_VECTORIZE
-  #endif
-#endif

-#if EIGEN_COMP_MSVC
-  #include <malloc.h> // for _aligned_malloc -- need it regardless of whether vectorization is enabled
-  #if (EIGEN_COMP_MSVC >= 1500) // 2008 or later
-    // Remember that usage of defined() in a #define is undefined by the standard.
-    // a user reported that in 64-bit mode, MSVC doesn't care to define _M_IX86_FP.
-    #if (defined(_M_IX86_FP) && (_M_IX86_FP >= 2)) || EIGEN_ARCH_x86_64
-      #define EIGEN_SSE2_ON_MSVC_2008_OR_LATER
-    #endif
-  #endif
-#else
-  // Remember that usage of defined() in a #define is undefined by the standard
-  #if (defined __SSE2__) && ( (!EIGEN_COMP_GNUC) || EIGEN_COMP_ICC || EIGEN_GNUC_AT_LEAST(4,2) )
-    #define EIGEN_SSE2_ON_NON_MSVC_BUT_NOT_OLD_GCC
-  #endif
-#endif
-
-#ifndef EIGEN_DONT_VECTORIZE
-
-  #if defined (EIGEN_SSE2_ON_NON_MSVC_BUT_NOT_OLD_GCC) || defined(EIGEN_SSE2_ON_MSVC_2008_OR_LATER)
-
-    // Defines symbols for compile-time detection of which instructions are
-    // used.
-    // EIGEN_VECTORIZE_YY is defined if and only if the instruction set YY is used
-    #define EIGEN_VECTORIZE
-    #define EIGEN_VECTORIZE_SSE
-    #define EIGEN_VECTORIZE_SSE2
-
-    // Detect sse3/ssse3/sse4:
-    // gcc and icc defines __SSE3__, ...
-    // there is no way to know about this on msvc. You can define EIGEN_VECTORIZE_SSE* if you
-    // want to force the use of those instructions with msvc.
-    #ifdef __SSE3__
-      #define EIGEN_VECTORIZE_SSE3
-    #endif
-    #ifdef __SSSE3__
-      #define EIGEN_VECTORIZE_SSSE3
-    #endif
-    #ifdef __SSE4_1__
-      #define EIGEN_VECTORIZE_SSE4_1
-    #endif
-    #ifdef __SSE4_2__
-      #define EIGEN_VECTORIZE_SSE4_2
-    #endif
-    #ifdef __AVX__
-      #define EIGEN_VECTORIZE_AVX
-      #define EIGEN_VECTORIZE_SSE3
-      #define EIGEN_VECTORIZE_SSSE3
-      #define EIGEN_VECTORIZE_SSE4_1
-      #define EIGEN_VECTORIZE_SSE4_2
-    #endif
-    #ifdef __AVX2__
-      #define EIGEN_VECTORIZE_AVX2
-    #endif
-    #ifdef __FMA__
-      #define EIGEN_VECTORIZE_FMA
-    #endif
-
-    // include files
-
-    // This extern "C" works around a MINGW-w64 compilation issue
-    // https://sourceforge.net/tracker/index.php?func=detail&aid=3018394&group_id=202880&atid=983354
-    // In essence, intrin.h is included by windows.h and also declares intrinsics (just as emmintrin.h etc. below do).
-    // However, intrin.h uses an extern "C" declaration, and g++ thus complains of duplicate declarations
-    // with conflicting linkage.  The linkage for intrinsics doesn't matter, but at that stage the compiler doesn't know;
-    // so, to avoid compile errors when windows.h is included after Eigen/Core, ensure intrinsics are extern "C" here too.
-    // notice that since these are C headers, the extern "C" is theoretically needed anyways.
-    extern "C" {
-      // In theory we should only include immintrin.h and not the other *mmintrin.h header files directly.
-      // Doing so triggers some issues with ICC. However old gcc versions seems to not have this file, thus:
-      #if EIGEN_COMP_ICC >= 1110
-        #include <immintrin.h>
-      #else
-        #include <emmintrin.h>
-        #include <xmmintrin.h>
-        #ifdef  EIGEN_VECTORIZE_SSE3
-        #include <pmmintrin.h>
-        #endif
-        #ifdef EIGEN_VECTORIZE_SSSE3
-        #include <tmmintrin.h>
-        #endif
-        #ifdef EIGEN_VECTORIZE_SSE4_1
-        #include <smmintrin.h>
-        #endif
-        #ifdef EIGEN_VECTORIZE_SSE4_2
-        #include <nmmintrin.h>
-        #endif
-        #ifdef EIGEN_VECTORIZE_AVX
-        #include <immintrin.h>
-        #endif
-      #endif
-    } // end extern "C"
-  #elif defined __VSX__
-    #define EIGEN_VECTORIZE
-    #define EIGEN_VECTORIZE_VSX
-    #include <altivec.h>
-    // We need to #undef all these ugly tokens defined in <altivec.h>
-    // => use __vector instead of vector
-    #undef bool
-    #undef vector
-    #undef pixel
-  #elif defined __ALTIVEC__
-    #define EIGEN_VECTORIZE
-    #define EIGEN_VECTORIZE_ALTIVEC
-    #include <altivec.h>
-    // We need to #undef all these ugly tokens defined in <altivec.h>
-    // => use __vector instead of vector
-    #undef bool
-    #undef vector
-    #undef pixel
-  #elif (defined  __ARM_NEON) || (defined __ARM_NEON__)
-    #define EIGEN_VECTORIZE
-    #define EIGEN_VECTORIZE_NEON
-    #include <arm_neon.h>
-  #endif
-#endif
-
-#if defined __CUDACC__
-  #define EIGEN_VECTORIZE_CUDA
-  #include <vector_types.h>
+#if defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)
+  #define EIGEN_HAS_GPU_FP16
 #endif

 #if (defined _OPENMP) && (!defined EIGEN_DONT_PARALLELIZE)
@@ -223,7 +72,9 @@
 #include <cmath>
 #include <cassert>
 #include <functional>
-#include <iosfwd>
+#ifndef EIGEN_NO_IO
+  #include <iosfwd>
+#endif
 #include <cstring>
 #include <string>
 #include <limits>
@@ -231,6 +82,15 @@
 // for min/max:
 #include <algorithm>

+#if EIGEN_HAS_CXX11
+#include <array>
+#endif
+
+// for std::is_nothrow_move_assignable
+#ifdef EIGEN_INCLUDE_TYPE_TRAITS
+#include <type_traits>
+#endif
+
 // for outputting debug info
 #ifdef EIGEN_DEBUG_ASSIGN
 #include <iostream>
@@ -241,46 +101,41 @@
  #include <intrin.h>
 #endif

-/** \brief Namespace containing all symbols from the %Eigen library. */
-namespace Eigen {
-
-inline static const char *SimdInstructionSetsInUse(void) {
-#if defined(EIGEN_VECTORIZE_AVX)
-  return "AVX SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
-#elif defined(EIGEN_VECTORIZE_SSE4_2)
-  return "SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
-#elif defined(EIGEN_VECTORIZE_SSE4_1)
-  return "SSE, SSE2, SSE3, SSSE3, SSE4.1";
-#elif defined(EIGEN_VECTORIZE_SSSE3)
-  return "SSE, SSE2, SSE3, SSSE3";
-#elif defined(EIGEN_VECTORIZE_SSE3)
-  return "SSE, SSE2, SSE3";
-#elif defined(EIGEN_VECTORIZE_SSE2)
-  return "SSE, SSE2";
-#elif defined(EIGEN_VECTORIZE_ALTIVEC)
-  return "AltiVec";
-#elif defined(EIGEN_VECTORIZE_VSX)
-  return "VSX";
-#elif defined(EIGEN_VECTORIZE_NEON)
-  return "ARM NEON";
-#else
-  return "None";
+#if defined(EIGEN_USE_SYCL)
+  #undef min
+  #undef max
+  #undef isnan
+  #undef isinf
+  #undef isfinite
+  #include <SYCL/sycl.hpp>
+  #include <map>
+  #include <memory>
+  #include <utility>
+  #include <thread>
+  #ifndef EIGEN_SYCL_LOCAL_THREAD_DIM0
+  #define EIGEN_SYCL_LOCAL_THREAD_DIM0 16
+  #endif
+  #ifndef EIGEN_SYCL_LOCAL_THREAD_DIM1
+  #define EIGEN_SYCL_LOCAL_THREAD_DIM1 16
+  #endif
 #endif
-}

-} // end namespace Eigen

 #if defined EIGEN2_SUPPORT_STAGE40_FULL_EIGEN3_STRICTNESS || defined EIGEN2_SUPPORT_STAGE30_FULL_EIGEN3_API || defined EIGEN2_SUPPORT_STAGE20_RESOLVE_API_CONFLICTS || defined EIGEN2_SUPPORT_STAGE10_FULL_EIGEN2_API || defined EIGEN2_SUPPORT
 // This will generate an error message:
 #error Eigen2-support is only available up to version 3.2. Please go to "http://eigen.tuxfamily.org/index.php?title=Eigen2" for further information
 #endif

-// we use size_t frequently and we'll never remember to prepend it with std:: everytime just to
+namespace Eigen {
+
+// we use size_t frequently and we'll never remember to prepend it with std:: every time just to
 // ensure QNX/QCC support
 using std::size_t;
-// gcc 4.6.0 wants std:: for ptrdiff_t 
+// gcc 4.6.0 wants std:: for ptrdiff_t
 using std::ptrdiff_t;

+}
+
 /** \defgroup Core_Module Core module
  * This is the main module of Eigen providing dense matrix and vector support
  * (both fixed and dynamic size) with all the features corresponding to a BLAS library
@@ -297,48 +152,105 @@ using std::ptrdiff_t;
 #include "src/Core/util/StaticAssert.h"
 #include "src/Core/util/XprHelper.h"
 #include "src/Core/util/Memory.h"
+#include "src/Core/util/IntegralConstant.h"
+#include "src/Core/util/SymbolicIndex.h"

 #include "src/Core/NumTraits.h"
 #include "src/Core/MathFunctions.h"
 #include "src/Core/GenericPacketMath.h"
+#include "src/Core/MathFunctionsImpl.h"
+#include "src/Core/arch/Default/ConjHelper.h"
+// Generic half float support
+#include "src/Core/arch/Default/Half.h"
+#include "src/Core/arch/Default/TypeCasting.h"
+#include "src/Core/arch/Default/GenericPacketMathFunctionsFwd.h"

-#if defined EIGEN_VECTORIZE_AVX
+#if defined EIGEN_VECTORIZE_AVX512
+  #include "src/Core/arch/SSE/PacketMath.h"
+  #include "src/Core/arch/SSE/TypeCasting.h"
+  #include "src/Core/arch/SSE/Complex.h"
+  #include "src/Core/arch/AVX/PacketMath.h"
+  #include "src/Core/arch/AVX/TypeCasting.h"
+  #include "src/Core/arch/AVX/Complex.h"
+  #include "src/Core/arch/AVX512/PacketMath.h"
+  #include "src/Core/arch/AVX512/TypeCasting.h"
+  #include "src/Core/arch/AVX512/Complex.h"
+  #include "src/Core/arch/SSE/MathFunctions.h"
+  #include "src/Core/arch/AVX/MathFunctions.h"
+  #include "src/Core/arch/AVX512/MathFunctions.h"
+#elif defined EIGEN_VECTORIZE_AVX
  // Use AVX for floats and doubles, SSE for integers
  #include "src/Core/arch/SSE/PacketMath.h"
+  #include "src/Core/arch/SSE/TypeCasting.h"
  #include "src/Core/arch/SSE/Complex.h"
-  #include "src/Core/arch/SSE/MathFunctions.h"
  #include "src/Core/arch/AVX/PacketMath.h"
-  #include "src/Core/arch/AVX/MathFunctions.h"
-  #include "src/Core/arch/AVX/Complex.h"
  #include "src/Core/arch/AVX/TypeCasting.h"
+  #include "src/Core/arch/AVX/Complex.h"
+  #include "src/Core/arch/SSE/MathFunctions.h"
+  #include "src/Core/arch/AVX/MathFunctions.h"
 #elif defined EIGEN_VECTORIZE_SSE
  #include "src/Core/arch/SSE/PacketMath.h"
+  #include "src/Core/arch/SSE/TypeCasting.h"
  #include "src/Core/arch/SSE/MathFunctions.h"
  #include "src/Core/arch/SSE/Complex.h"
-  #include "src/Core/arch/SSE/TypeCasting.h"
 #elif defined(EIGEN_VECTORIZE_ALTIVEC) || defined(EIGEN_VECTORIZE_VSX)
  #include "src/Core/arch/AltiVec/PacketMath.h"
  #include "src/Core/arch/AltiVec/MathFunctions.h"
  #include "src/Core/arch/AltiVec/Complex.h"
 #elif defined EIGEN_VECTORIZE_NEON
  #include "src/Core/arch/NEON/PacketMath.h"
+  #include "src/Core/arch/NEON/TypeCasting.h"
  #include "src/Core/arch/NEON/MathFunctions.h"
  #include "src/Core/arch/NEON/Complex.h"
+#elif defined EIGEN_VECTORIZE_ZVECTOR
+  #include "src/Core/arch/ZVector/PacketMath.h"
+  #include "src/Core/arch/ZVector/MathFunctions.h"
+  #include "src/Core/arch/ZVector/Complex.h"
+#elif defined EIGEN_VECTORIZE_MSA
+  #include "src/Core/arch/MSA/PacketMath.h"
+  #include "src/Core/arch/MSA/MathFunctions.h"
+  #include "src/Core/arch/MSA/Complex.h"
 #endif

-#if defined EIGEN_VECTORIZE_CUDA
-  #include "src/Core/arch/CUDA/PacketMath.h"
-  #include "src/Core/arch/CUDA/MathFunctions.h"
+#if defined EIGEN_VECTORIZE_GPU
+  #include "src/Core/arch/GPU/PacketMath.h"
+  #include "src/Core/arch/GPU/MathFunctions.h"
+  #include "src/Core/arch/GPU/TypeCasting.h"
+#endif
+
+#if defined(EIGEN_USE_SYCL)
+  #include "src/Core/arch/SYCL/SyclMemoryModel.h"
+  #include "src/Core/arch/SYCL/InteropHeaders.h"
+#if !defined(EIGEN_DONT_VECTORIZE_SYCL)
+  #include "src/Core/arch/SYCL/PacketMath.h"
+  #include "src/Core/arch/SYCL/MathFunctions.h"
+  #include "src/Core/arch/SYCL/TypeCasting.h"
+#endif
 #endif

 #include "src/Core/arch/Default/Settings.h"
+// This file provides generic implementations valid for scalar as well
+#include "src/Core/arch/Default/GenericPacketMathFunctions.h"

+#include "src/Core/functors/TernaryFunctors.h"
 #include "src/Core/functors/BinaryFunctors.h"
 #include "src/Core/functors/UnaryFunctors.h"
 #include "src/Core/functors/NullaryFunctors.h"
 #include "src/Core/functors/StlFunctors.h"
 #include "src/Core/functors/AssignmentFunctors.h"

+// Specialized functors to enable the processing of complex numbers
+// on CUDA devices
+#ifdef EIGEN_CUDACC
+#include "src/Core/arch/CUDA/Complex.h"
+#endif
+
+#include "src/Core/util/IndexedViewHelper.h"
+#include "src/Core/util/ReshapedHelper.h"
+#include "src/Core/ArithmeticSequence.h"
+#ifndef EIGEN_NO_IO
+  #include "src/Core/IO.h"
+#endif
 #include "src/Core/DenseCoeffsBase.h"
 #include "src/Core/DenseBase.h"
 #include "src/Core/MatrixBase.h"
@@ -365,6 +277,7 @@ using std::ptrdiff_t;
 #include "src/Core/PlainObjectBase.h"
 #include "src/Core/Matrix.h"
 #include "src/Core/Array.h"
+#include "src/Core/CwiseTernaryOp.h"
 #include "src/Core/CwiseBinaryOp.h"
 #include "src/Core/CwiseUnaryOp.h"
 #include "src/Core/CwiseNullaryOp.h"
@@ -378,21 +291,23 @@ using std::ptrdiff_t;
 #include "src/Core/Ref.h"
 #include "src/Core/Block.h"
 #include "src/Core/VectorBlock.h"
+#include "src/Core/IndexedView.h"
+#include "src/Core/Reshaped.h"
 #include "src/Core/Transpose.h"
 #include "src/Core/DiagonalMatrix.h"
 #include "src/Core/Diagonal.h"
 #include "src/Core/DiagonalProduct.h"
-#include "src/Core/PermutationMatrix.h"
-#include "src/Core/Transpositions.h"
 #include "src/Core/Redux.h"
 #include "src/Core/Visitor.h"
 #include "src/Core/Fuzzy.h"
-#include "src/Core/IO.h"
 #include "src/Core/Swap.h"
 #include "src/Core/CommaInitializer.h"
 #include "src/Core/GeneralProduct.h"
 #include "src/Core/Solve.h"
 #include "src/Core/Inverse.h"
+#include "src/Core/SolverBase.h"
+#include "src/Core/PermutationMatrix.h"
+#include "src/Core/Transpositions.h"
 #include "src/Core/TriangularMatrix.h"
 #include "src/Core/SelfAdjointView.h"
 #include "src/Core/products/GeneralBlockPanelKernel.h"
@@ -412,24 +327,27 @@ using std::ptrdiff_t;
 #include "src/Core/products/TriangularSolverVector.h"
 #include "src/Core/BandMatrix.h"
 #include "src/Core/CoreIterators.h"
+#include "src/Core/ConditionEstimator.h"

 #include "src/Core/BooleanRedux.h"
 #include "src/Core/Select.h"
 #include "src/Core/VectorwiseOp.h"
+#include "src/Core/PartialReduxEvaluator.h"
 #include "src/Core/Random.h"
 #include "src/Core/Replicate.h"
 #include "src/Core/Reverse.h"
 #include "src/Core/ArrayWrapper.h"
+#include "src/Core/StlIterators.h"

 #ifdef EIGEN_USE_BLAS
-#include "src/Core/products/GeneralMatrixMatrix_MKL.h"
-#include "src/Core/products/GeneralMatrixVector_MKL.h"
-#include "src/Core/products/GeneralMatrixMatrixTriangular_MKL.h"
-#include "src/Core/products/SelfadjointMatrixMatrix_MKL.h"
-#include "src/Core/products/SelfadjointMatrixVector_MKL.h"
-#include "src/Core/products/TriangularMatrixMatrix_MKL.h"
-#include "src/Core/products/TriangularMatrixVector_MKL.h"
-#include "src/Core/products/TriangularSolverMatrix_MKL.h"
+#include "src/Core/products/GeneralMatrixMatrix_BLAS.h"
+#include "src/Core/products/GeneralMatrixVector_BLAS.h"
+#include "src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h"
+#include "src/Core/products/SelfadjointMatrixMatrix_BLAS.h"
+#include "src/Core/products/SelfadjointMatrixVector_BLAS.h"
+#include "src/Core/products/TriangularMatrixMatrix_BLAS.h"
+#include "src/Core/products/TriangularMatrixVector_BLAS.h"
+#include "src/Core/products/TriangularSolverMatrix_BLAS.h"
 #endif // EIGEN_USE_BLAS

 #ifdef EIGEN_USE_MKL_VML
--- a/Eigen/Eigenvalues
+++ b/Eigen/Eigenvalues
@@ -1,16 +1,23 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_EIGENVALUES_MODULE_H
 #define EIGEN_EIGENVALUES_MODULE_H

 #include "Core"

-#include "src/Core/util/DisableStupidWarnings.h"
-
 #include "Cholesky"
 #include "Jacobi"
 #include "Householder"
 #include "LU"
 #include "Geometry"

+#include "src/Core/util/DisableStupidWarnings.h"
+
 /** \defgroup Eigenvalues_Module Eigenvalues module
  *
  *
@@ -25,6 +32,7 @@
  * \endcode
  */

+#include "src/misc/RealSvd2x2.h"
 #include "src/Eigenvalues/Tridiagonalization.h"
 #include "src/Eigenvalues/RealSchur.h"
 #include "src/Eigenvalues/EigenSolver.h"
@@ -37,9 +45,14 @@
 #include "src/Eigenvalues/GeneralizedEigenSolver.h"
 #include "src/Eigenvalues/MatrixBaseEigenvalues.h"
 #ifdef EIGEN_USE_LAPACKE
-#include "src/Eigenvalues/RealSchur_MKL.h"
-#include "src/Eigenvalues/ComplexSchur_MKL.h"
-#include "src/Eigenvalues/SelfAdjointEigenSolver_MKL.h"
+#ifdef EIGEN_USE_MKL
+#include "mkl_lapacke.h"
+#else
+#include "src/misc/lapacke.h"
+#endif
+#include "src/Eigenvalues/RealSchur_LAPACKE.h"
+#include "src/Eigenvalues/ComplexSchur_LAPACKE.h"
+#include "src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h"
 #endif

 #include "src/Core/util/ReenableStupidWarnings.h"
--- a/Eigen/Geometry
+++ b/Eigen/Geometry
@@ -1,25 +1,32 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_GEOMETRY_MODULE_H
 #define EIGEN_GEOMETRY_MODULE_H

 #include "Core"

-#include "src/Core/util/DisableStupidWarnings.h"
-
 #include "SVD"
 #include "LU"
 #include <limits>

+#include "src/Core/util/DisableStupidWarnings.h"
+
 /** \defgroup Geometry_Module Geometry module
-  *
-  *
  *
  * This module provides support for:
  *  - fixed-size homogeneous transformations
  *  - translation, scaling, 2D and 3D rotations
-  *  - quaternions
-  *  - \ref MatrixBase::cross() "cross product"
-  *  - \ref MatrixBase::unitOrthogonal() "orthognal vector generation"
-  *  - some linear components: parametrized-lines and hyperplanes
+  *  - \link Quaternion quaternions \endlink
+  *  - cross products (\ref MatrixBase::cross, \ref MatrixBase::cross3)
+  *  - orthogonal vector generation (\ref MatrixBase::unitOrthogonal)
+  *  - some linear components: \link ParametrizedLine parametrized-lines \endlink and \link Hyperplane hyperplanes \endlink
+  *  - \link AlignedBox axis aligned bounding boxes \endlink
+  *  - \link umeyama least-square transformation fitting \endlink
  *
  * \code
  * #include <Eigen/Geometry>
@@ -42,9 +49,8 @@
 #include "src/Geometry/AlignedBox.h"
 #include "src/Geometry/Umeyama.h"

-// Use the SSE optimized version whenever possible. At the moment the
-// SSE version doesn't compile when AVX is enabled
-#if defined EIGEN_VECTORIZE_SSE && !defined EIGEN_VECTORIZE_AVX
+// Use the SSE optimized version whenever possible.
+#if defined EIGEN_VECTORIZE_SSE
 #include "src/Geometry/arch/Geometry_SSE.h"
 #endif

@@ -52,4 +58,3 @@

 #endif // EIGEN_GEOMETRY_MODULE_H
 /* vim: set filetype=cpp et sw=2 ts=2 ai: */
-
--- a/Eigen/Householder
+++ b/Eigen/Householder
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_HOUSEHOLDER_MODULE_H
 #define EIGEN_HOUSEHOLDER_MODULE_H

--- a/Eigen/IterativeLinearSolvers
+++ b/Eigen/IterativeLinearSolvers
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_ITERATIVELINEARSOLVERS_MODULE_H
 #define EIGEN_ITERATIVELINEARSOLVERS_MODULE_H

@@ -34,6 +41,7 @@
 #include "src/IterativeLinearSolvers/LeastSquareConjugateGradient.h"
 #include "src/IterativeLinearSolvers/BiCGSTAB.h"
 #include "src/IterativeLinearSolvers/IncompleteLUT.h"
+#include "src/IterativeLinearSolvers/IncompleteCholesky.h"

 #include "src/Core/util/ReenableStupidWarnings.h"

--- a/Eigen/Jacobi
+++ b/Eigen/Jacobi
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_JACOBI_MODULE_H
 #define EIGEN_JACOBI_MODULE_H

--- a/Eigen/KLUSupport
+++ b/Eigen/KLUSupport
@@ -0,0 +1,41 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_KLUSUPPORT_MODULE_H
+#define EIGEN_KLUSUPPORT_MODULE_H
+
+#include <Eigen/SparseCore>
+
+#include <Eigen/src/Core/util/DisableStupidWarnings.h>
+
+extern "C" {
+#include <btf.h>
+#include <klu.h>
+   }
+
+/** \ingroup Support_modules
+  * \defgroup KLUSupport_Module KLUSupport module
+  *
+  * This module provides an interface to the KLU library which is part of the <a href="http://www.suitesparse.com">suitesparse</a> package.
+  * It provides the following factorization class:
+  * - class KLU: a sparse LU factorization, well-suited for circuit simulation.
+  *
+  * \code
+  * #include <Eigen/KLUSupport>
+  * \endcode
+  *
+  * In order to use this module, the klu and btf headers must be accessible from the include paths, and your binary must be linked to the klu library and its dependencies.
+  * The dependencies depend on how KLU has been compiled.
+  * For a cmake based project, you can use our FindKLU.cmake module to help you in this task.
+  *
+  */
+
+#include "src/KLUSupport/KLUSupport.h"
+
+#include <Eigen/src/Core/util/ReenableStupidWarnings.h>
+
+#endif // EIGEN_KLUSUPPORT_MODULE_H
--- a/Eigen/LU
+++ b/Eigen/LU
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_LU_MODULE_H
 #define EIGEN_LU_MODULE_H

@@ -21,7 +28,12 @@
 #include "src/LU/FullPivLU.h"
 #include "src/LU/PartialPivLU.h"
 #ifdef EIGEN_USE_LAPACKE
-#include "src/LU/PartialPivLU_MKL.h"
+#ifdef EIGEN_USE_MKL
+#include "mkl_lapacke.h"
+#else
+#include "src/misc/lapacke.h"
+#endif
+#include "src/LU/PartialPivLU_LAPACKE.h"
 #endif
 #include "src/LU/Determinant.h"
 #include "src/LU/InverseImpl.h"
--- a/Eigen/MetisSupport
+++ b/Eigen/MetisSupport
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_METISSUPPORT_MODULE_H
 #define EIGEN_METISSUPPORT_MODULE_H

--- a/Eigen/OrderingMethods
+++ b/Eigen/OrderingMethods
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_ORDERINGMETHODS_MODULE_H
 #define EIGEN_ORDERINGMETHODS_MODULE_H

@@ -56,10 +63,7 @@
  * \endcode
  */

-#ifndef EIGEN_MPL2_ONLY
 #include "src/OrderingMethods/Amd.h"
-#endif
-
 #include "src/OrderingMethods/Ordering.h"
 #include "src/Core/util/ReenableStupidWarnings.h"

--- a/Eigen/PaStiXSupport
+++ b/Eigen/PaStiXSupport
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_PASTIXSUPPORT_MODULE_H
 #define EIGEN_PASTIXSUPPORT_MODULE_H

@@ -5,7 +12,6 @@

 #include "src/Core/util/DisableStupidWarnings.h"

-#include <complex.h>
 extern "C" {
 #include <pastix_nompi.h>
 #include <pastix.h>
@@ -30,6 +36,7 @@ extern "C" {
  * \endcode
  *
  * In order to use this module, the PaSTiX headers must be accessible from the include paths, and your binary must be linked to the PaSTiX library and its dependencies.
+  * This wrapper requires PaStiX version 5.x compiled without MPI support.
  * The dependencies depend on how PaSTiX has been compiled.
  * For a cmake based project, you can use our FindPaSTiX.cmake module to help you in this task.
  *
--- a/Eigen/PardisoSupport
+++ b/Eigen/PardisoSupport
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_PARDISOSUPPORT_MODULE_H
 #define EIGEN_PARDISOSUPPORT_MODULE_H

@@ -7,8 +14,6 @@

 #include <mkl_pardiso.h>

-#include <unsupported/Eigen/SparseExtra>
-
 /** \ingroup Support_modules
  * \defgroup PardisoSupport_Module PardisoSupport module
  *
--- a/Eigen/QR
+++ b/Eigen/QR
@@ -1,14 +1,21 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_QR_MODULE_H
 #define EIGEN_QR_MODULE_H

 #include "Core"

-#include "src/Core/util/DisableStupidWarnings.h"
-
 #include "Cholesky"
 #include "Jacobi"
 #include "Householder"

+#include "src/Core/util/DisableStupidWarnings.h"
+
 /** \defgroup QR_Module QR module
  *
  *
@@ -27,9 +34,15 @@
 #include "src/QR/HouseholderQR.h"
 #include "src/QR/FullPivHouseholderQR.h"
 #include "src/QR/ColPivHouseholderQR.h"
+#include "src/QR/CompleteOrthogonalDecomposition.h"
 #ifdef EIGEN_USE_LAPACKE
-#include "src/QR/HouseholderQR_MKL.h"
-#include "src/QR/ColPivHouseholderQR_MKL.h"
+#ifdef EIGEN_USE_MKL
+#include "mkl_lapacke.h"
+#else
+#include "src/misc/lapacke.h"
+#endif
+#include "src/QR/HouseholderQR_LAPACKE.h"
+#include "src/QR/ColPivHouseholderQR_LAPACKE.h"
 #endif

 #include "src/Core/util/ReenableStupidWarnings.h"
--- a/Eigen/QtAlignedMalloc
+++ b/Eigen/QtAlignedMalloc
@@ -1,3 +1,9 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

 #ifndef EIGEN_QTMALLOC_MODULE_H
 #define EIGEN_QTMALLOC_MODULE_H
@@ -8,7 +14,7 @@

 #include "src/Core/util/DisableStupidWarnings.h"

-void *qMalloc(size_t size)
+void *qMalloc(std::size_t size)
 {
  return Eigen::internal::aligned_malloc(size);
 }
@@ -18,10 +24,10 @@ void qFree(void *ptr)
  Eigen::internal::aligned_free(ptr);
 }

-void *qRealloc(void *ptr, size_t size)
+void *qRealloc(void *ptr, std::size_t size)
 {
  void* newPtr = Eigen::internal::aligned_malloc(size);
-  memcpy(newPtr, ptr, size);
+  std::memcpy(newPtr, ptr, size);
  Eigen::internal::aligned_free(ptr);
  return newPtr;
 }
--- a/Eigen/SPQRSupport
+++ b/Eigen/SPQRSupport
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_SPQRSUPPORT_MODULE_H
 #define EIGEN_SPQRSUPPORT_MODULE_H

@@ -10,7 +17,7 @@
 /** \ingroup Support_modules
  * \defgroup SPQRSupport_Module SuiteSparseQR module
  * 
-  * This module provides an interface to the SPQR library, which is part of the <a href="http://www.cise.ufl.edu/research/sparse/SuiteSparse/">suitesparse</a> package.
+  * This module provides an interface to the SPQR library, which is part of the <a href="http://www.suitesparse.com">suitesparse</a> package.
  *
  * \code
  * #include <Eigen/SPQRSupport>
--- a/Eigen/SVD
+++ b/Eigen/SVD
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_SVD_MODULE_H
 #define EIGEN_SVD_MODULE_H

@@ -24,12 +31,18 @@
  * \endcode
  */

+#include "src/misc/RealSvd2x2.h"
 #include "src/SVD/UpperBidiagonalization.h"
 #include "src/SVD/SVDBase.h"
 #include "src/SVD/JacobiSVD.h"
 #include "src/SVD/BDCSVD.h"
 #if defined(EIGEN_USE_LAPACKE) && !defined(EIGEN_USE_LAPACKE_STRICT)
-#include "src/SVD/JacobiSVD_MKL.h"
+#ifdef EIGEN_USE_MKL
+#include "mkl_lapacke.h"
+#else
+#include "src/misc/lapacke.h"
+#endif
+#include "src/SVD/JacobiSVD_LAPACKE.h"
 #endif

 #include "src/Core/util/ReenableStupidWarnings.h"
--- a/Eigen/Sparse
+++ b/Eigen/Sparse
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_SPARSE_MODULE_H
 #define EIGEN_SPARSE_MODULE_H

--- a/Eigen/SparseCholesky
+++ b/Eigen/SparseCholesky
@@ -30,16 +30,8 @@
  * \endcode
  */

-#ifdef EIGEN_MPL2_ONLY
-#error The SparseCholesky module has nothing to offer in MPL2 only mode
-#endif
-
 #include "src/SparseCholesky/SimplicialCholesky.h"
-
-#ifndef EIGEN_MPL2_ONLY
 #include "src/SparseCholesky/SimplicialCholesky_impl.h"
-#endif
-
 #include "src/Core/util/ReenableStupidWarnings.h"

 #endif // EIGEN_SPARSECHOLESKY_MODULE_H
--- a/Eigen/SparseCore
+++ b/Eigen/SparseCore
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_SPARSECORE_MODULE_H
 #define EIGEN_SPARSECORE_MODULE_H

@@ -14,7 +21,7 @@
 /** 
  * \defgroup SparseCore_Module SparseCore module
  *
-  * This module provides a sparse matrix representation, and basic associatd matrix manipulations
+  * This module provides a sparse matrix representation, and basic associated matrix manipulations
  * and operations.
  *
  * See the \ref TutorialSparse "Sparse tutorial"
--- a/Eigen/SparseLU
+++ b/Eigen/SparseLU
@@ -23,6 +23,8 @@
 // Ordering interface
 #include "OrderingMethods"

+#include "src/Core/util/DisableStupidWarnings.h"
+
 #include "src/SparseLU/SparseLU_gemm_kernel.h"

 #include "src/SparseLU/SparseLU_Structs.h"
@@ -43,4 +45,6 @@
 #include "src/SparseLU/SparseLU_Utils.h"
 #include "src/SparseLU/SparseLU.h"

+#include "src/Core/util/ReenableStupidWarnings.h"
+
 #endif // EIGEN_SPARSELU_MODULE_H
--- a/Eigen/SparseQR
+++ b/Eigen/SparseQR
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_SPARSEQR_MODULE_H
 #define EIGEN_SPARSEQR_MODULE_H

@@ -21,7 +28,6 @@
  * 
  */

-#include "OrderingMethods"
 #include "src/SparseCore/SparseColEtree.h"
 #include "src/SparseQR/SparseQR.h"

--- a/Eigen/StdDeque
+++ b/Eigen/StdDeque
@@ -14,7 +14,7 @@
 #include "Core"
 #include <deque>

-#if EIGEN_COMP_MSVC && EIGEN_OS_WIN64 /* MSVC auto aligns in 64 bit builds */
+#if EIGEN_COMP_MSVC && EIGEN_OS_WIN64 && (EIGEN_MAX_STATIC_ALIGN_BYTES<=16) /* MSVC auto aligns up to 16 bytes in 64 bit builds */

 #define EIGEN_DEFINE_STL_DEQUE_SPECIALIZATION(...)

--- a/Eigen/StdList
+++ b/Eigen/StdList
@@ -13,7 +13,7 @@
 #include "Core"
 #include <list>

-#if EIGEN_COMP_MSVC && EIGEN_OS_WIN64 /* MSVC auto aligns in 64 bit builds */    
+#if EIGEN_COMP_MSVC && EIGEN_OS_WIN64 && (EIGEN_MAX_STATIC_ALIGN_BYTES<=16) /* MSVC auto aligns up to 16 bytes in 64 bit builds */

 #define EIGEN_DEFINE_STL_LIST_SPECIALIZATION(...)

--- a/Eigen/StdVector
+++ b/Eigen/StdVector
@@ -14,7 +14,7 @@
 #include "Core"
 #include <vector>

-#if EIGEN_COMP_MSVC && EIGEN_OS_WIN64 /* MSVC auto aligns in 64 bit builds */
+#if EIGEN_COMP_MSVC && EIGEN_OS_WIN64 && (EIGEN_MAX_STATIC_ALIGN_BYTES<=16) /* MSVC auto aligns up to 16 bytes in 64 bit builds */

 #define EIGEN_DEFINE_STL_VECTOR_SPECIALIZATION(...)

--- a/Eigen/SuperLUSupport
+++ b/Eigen/SuperLUSupport
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_SUPERLUSUPPORT_MODULE_H
 #define EIGEN_SUPERLUSUPPORT_MODULE_H

@@ -36,6 +43,8 @@ namespace Eigen { struct SluMatrix; }
  * - class SuperLU: a supernodal sequential LU factorization.
  * - class SuperILU: a supernodal sequential incomplete LU factorization (to be used as a preconditioner for iterative methods).
  *
+  * \warning This wrapper requires at least version 4.0 of SuperLU. The 3.x versions are not supported.
+  *
  * \warning When including this module, you have to use SUPERLU_EMPTY instead of EMPTY which is no longer defined because it is too polluting.
  *
  * \code
--- a/Eigen/UmfPackSupport
+++ b/Eigen/UmfPackSupport
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_UMFPACKSUPPORT_MODULE_H
 #define EIGEN_UMFPACKSUPPORT_MODULE_H

@@ -12,7 +19,7 @@ extern "C" {
 /** \ingroup Support_modules
  * \defgroup UmfPackSupport_Module UmfPackSupport module
  *
-  * This module provides an interface to the UmfPack library which is part of the <a href="http://www.cise.ufl.edu/research/sparse/SuiteSparse/">suitesparse</a> package.
+  * This module provides an interface to the UmfPack library which is part of the <a href="http://www.suitesparse.com">suitesparse</a> package.
  * It provides the following factorization class:
  * - class UmfPackLU: a multifrontal sequential LU factorization.
  *
--- a/Eigen/src/CMakeLists.txt
+++ b/Eigen/src/CMakeLists.txt
@@ -1,7 +0,0 @@
-file(GLOB Eigen_src_subdirectories "*")
-escape_string_as_regex(ESCAPED_CMAKE_CURRENT_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}")
-foreach(f ${Eigen_src_subdirectories})
-  if(NOT f MATCHES "\\.txt" AND NOT f MATCHES "${ESCAPED_CMAKE_CURRENT_SOURCE_DIR}/[.].+" )
-    add_subdirectory(${f})
-  endif()
-endforeach()
--- a/Eigen/src/Cholesky/CMakeLists.txt
+++ b/Eigen/src/Cholesky/CMakeLists.txt
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_Cholesky_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_Cholesky_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Cholesky COMPONENT Devel
-  )
--- a/Eigen/src/Cholesky/LDLT.h
+++ b/Eigen/src/Cholesky/LDLT.h
@@ -13,9 +13,18 @@
 #ifndef EIGEN_LDLT_H
 #define EIGEN_LDLT_H

-namespace Eigen { 
+namespace Eigen {

 namespace internal {
+  template<typename _MatrixType, int _UpLo> struct traits<LDLT<_MatrixType, _UpLo> >
+   : traits<_MatrixType>
+  {
+    typedef MatrixXpr XprKind;
+    typedef SolverStorage StorageKind;
+    typedef int StorageIndex;
+    enum { Flags = 0 };
+  };
+
  template<typename MatrixType, int UpLo> struct LDLT_Traits;

  // PositiveSemiDef means positive semi-definite and non-zero; same for NegativeSemiDef
@@ -28,8 +37,8 @@ namespace internal {
  *
  * \brief Robust Cholesky decomposition of a matrix with pivoting
  *
-  * \param MatrixType the type of the matrix of which to compute the LDL^T Cholesky decomposition
-  * \param UpLo the triangular part that will be used for the decompositon: Lower (default) or Upper.
+  * \tparam _MatrixType the type of the matrix of which to compute the LDL^T Cholesky decomposition
+  * \tparam _UpLo the triangular part that will be used for the decomposition: Lower (default) or Upper.
  *             The other triangular part won't be read.
  *
  * Perform a robust Cholesky decomposition of a positive semidefinite or negative semidefinite
@@ -43,25 +52,25 @@ namespace internal {
  * Remember that Cholesky decompositions are not rank-revealing. Also, do not use a Cholesky
  * decomposition to determine whether a system of equations has a solution.
  *
+  * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism.
+  * 
  * \sa MatrixBase::ldlt(), SelfAdjointView::ldlt(), class LLT
  */
 template<typename _MatrixType, int _UpLo> class LDLT
+        : public SolverBase<LDLT<_MatrixType, _UpLo> >
 {
  public:
    typedef _MatrixType MatrixType;
+    typedef SolverBase<LDLT> Base;
+    friend class SolverBase<LDLT>;
+
+    EIGEN_GENERIC_PUBLIC_INTERFACE(LDLT)
    enum {
-      RowsAtCompileTime = MatrixType::RowsAtCompileTime,
-      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
-      Options = MatrixType::Options & ~RowMajorBit, // these are the options for the TmpMatrixType, we need a ColMajor matrix here!
      MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
      MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime,
      UpLo = _UpLo
    };
-    typedef typename MatrixType::Scalar Scalar;
-    typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
-    typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
-    typedef typename MatrixType::StorageIndex StorageIndex;
-    typedef Matrix<Scalar, RowsAtCompileTime, 1, Options, MaxRowsAtCompileTime, 1> TmpMatrixType;
+    typedef Matrix<Scalar, RowsAtCompileTime, 1, 0, MaxRowsAtCompileTime, 1> TmpMatrixType;

    typedef Transpositions<RowsAtCompileTime, MaxRowsAtCompileTime> TranspositionType;
    typedef PermutationMatrix<RowsAtCompileTime, MaxRowsAtCompileTime> PermutationType;
@@ -73,11 +82,11 @@ template<typename _MatrixType, int _UpLo> class LDLT
      * The default constructor is useful in cases in which the user intends to
      * perform decompositions via LDLT::compute(const MatrixType&).
      */
-    LDLT() 
-      : m_matrix(), 
-        m_transpositions(), 
+    LDLT()
+      : m_matrix(),
+        m_transpositions(),
        m_sign(internal::ZeroSign),
-        m_isInitialized(false) 
+        m_isInitialized(false)
    {}

    /** \brief Default Constructor with memory preallocation
@@ -97,16 +106,35 @@ template<typename _MatrixType, int _UpLo> class LDLT
    /** \brief Constructor with decomposition
      *
      * This calculates the decomposition for the input \a matrix.
+      *
      * \sa LDLT(Index size)
      */
-    explicit LDLT(const MatrixType& matrix)
+    template<typename InputType>
+    explicit LDLT(const EigenBase<InputType>& matrix)
      : m_matrix(matrix.rows(), matrix.cols()),
        m_transpositions(matrix.rows()),
        m_temporary(matrix.rows()),
        m_sign(internal::ZeroSign),
        m_isInitialized(false)
    {
-      compute(matrix);
+      compute(matrix.derived());
+    }
+
+    /** \brief Constructs a LDLT factorization from a given matrix
+      *
+      * This overloaded constructor is provided for \link InplaceDecomposition inplace decomposition \endlink when \c MatrixType is an Eigen::Ref.
+      *
+      * \sa LDLT(const EigenBase&)
+      */
+    template<typename InputType>
+    explicit LDLT(EigenBase<InputType>& matrix)
+      : m_matrix(matrix.derived()),
+        m_transpositions(matrix.rows()),
+        m_temporary(matrix.rows()),
+        m_sign(internal::ZeroSign),
+        m_isInitialized(false)
+    {
+      compute(matrix.derived());
    }

    /** Clear any existing decomposition
@@ -160,6 +188,7 @@ template<typename _MatrixType, int _UpLo> class LDLT
      return m_sign == internal::NegativeSemiDef || m_sign == internal::ZeroSign;
    }

+    #ifdef EIGEN_PARSED_BY_DOXYGEN
    /** \returns a solution x of \f$ A x = b \f$ using the current decomposition of A.
      *
      * This function also supports in-place solves using the syntax <tt>x = decompositionObject.solve(x)</tt> .
@@ -167,7 +196,7 @@ template<typename _MatrixType, int _UpLo> class LDLT
      * \note_about_checking_solutions
      *
      * More precisely, this method solves \f$ A x = b \f$ using the decomposition \f$ A = P^T L D L^* P \f$
-      * by solving the systems \f$ P^T y_1 = b \f$, \f$ L y_2 = y_1 \f$, \f$ D y_3 = y_2 \f$, 
+      * by solving the systems \f$ P^T y_1 = b \f$, \f$ L y_2 = y_1 \f$, \f$ D y_3 = y_2 \f$,
      * \f$ L^* y_4 = y_3 \f$ and \f$ P x = y_4 \f$ in succession. If the matrix \f$ A \f$ is singular, then
      * \f$ D \f$ will also be singular (all the other matrices are invertible). In that case, the
      * least-square solution of \f$ D y_3 = y_2 \f$ is computed. This does not mean that this function
@@ -177,18 +206,23 @@ template<typename _MatrixType, int _UpLo> class LDLT
      */
    template<typename Rhs>
    inline const Solve<LDLT, Rhs>
-    solve(const MatrixBase<Rhs>& b) const
-    {
-      eigen_assert(m_isInitialized && "LDLT is not initialized.");
-      eigen_assert(m_matrix.rows()==b.rows()
-                && "LDLT::solve(): invalid number of rows of the right hand side matrix b");
-      return Solve<LDLT, Rhs>(*this, b.derived());
-    }
+    solve(const MatrixBase<Rhs>& b) const;
+    #endif

    template<typename Derived>
    bool solveInPlace(MatrixBase<Derived> &bAndX) const;

-    LDLT& compute(const MatrixType& matrix);
+    template<typename InputType>
+    LDLT& compute(const EigenBase<InputType>& matrix);
+
+    /** \returns an estimate of the reciprocal condition number of the matrix of
+     *  which \c *this is the LDLT decomposition.
+     */
+    RealScalar rcond() const
+    {
+      eigen_assert(m_isInitialized && "LDLT is not initialized.");
+      return internal::rcond_estimate_helper(m_l1_norm, *this);
+    }

    template <typename Derived>
    LDLT& rankUpdate(const MatrixBase<Derived>& w, const RealScalar& alpha=1);
@@ -205,28 +239,37 @@ template<typename _MatrixType, int _UpLo> class LDLT

    MatrixType reconstructedMatrix() const;

+    /** \returns the adjoint of \c *this, that is, a const reference to the decomposition itself as the underlying matrix is self-adjoint.
+      *
+      * This method is provided for compatibility with other matrix decompositions, thus enabling generic code such as:
+      * \code x = decomposition.adjoint().solve(b) \endcode
+      */
+    const LDLT& adjoint() const { return *this; };
+
    inline Index rows() const { return m_matrix.rows(); }
    inline Index cols() const { return m_matrix.cols(); }

    /** \brief Reports whether previous computation was successful.
      *
-      * \returns \c Success if computation was succesful,
-      *          \c NumericalIssue if the matrix.appears to be negative.
+      * \returns \c Success if computation was successful,
+      *          \c NumericalIssue if the factorization failed because of a zero pivot.
      */
    ComputationInfo info() const
    {
      eigen_assert(m_isInitialized && "LDLT is not initialized.");
-      return Success;
+      return m_info;
    }
-    
+
    #ifndef EIGEN_PARSED_BY_DOXYGEN
    template<typename RhsType, typename DstType>
-    EIGEN_DEVICE_FUNC
    void _solve_impl(const RhsType &rhs, DstType &dst) const;
+
+    template<bool Conjugate, typename RhsType, typename DstType>
+    void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const;
    #endif

  protected:
-    
+
    static void check_template_parameters()
    {
      EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
@@ -239,10 +282,12 @@ template<typename _MatrixType, int _UpLo> class LDLT
      * is not stored), and the diagonal entries correspond to D.
      */
    MatrixType m_matrix;
+    RealScalar m_l1_norm;
    TranspositionType m_transpositions;
    TmpMatrixType m_temporary;
    internal::SignMatrix m_sign;
    bool m_isInitialized;
+    ComputationInfo m_info;
 };

 namespace internal {
@@ -260,12 +305,15 @@ template<> struct ldlt_inplace<Lower>
    typedef typename TranspositionType::StorageIndex IndexType;
    eigen_assert(mat.rows()==mat.cols());
    const Index size = mat.rows();
+    bool found_zero_pivot = false;
+    bool ret = true;

    if (size <= 1)
    {
      transpositions.setIdentity();
-      if (numext::real(mat.coeff(0,0)) > 0) sign = PositiveSemiDef;
-      else if (numext::real(mat.coeff(0,0)) < 0) sign = NegativeSemiDef;
+      if(size==0) sign = ZeroSign;
+      else if (numext::real(mat.coeff(0,0)) > static_cast<RealScalar>(0) ) sign = PositiveSemiDef;
+      else if (numext::real(mat.coeff(0,0)) < static_cast<RealScalar>(0)) sign = NegativeSemiDef;
      else sign = ZeroSign;
      return true;
    }
@@ -312,26 +360,46 @@ template<> struct ldlt_inplace<Lower>
        if(rs>0)
          A21.noalias() -= A20 * temp.head(k);
      }
-      
+
      // In some previous versions of Eigen (e.g., 3.2.1), the scaling was omitted if the pivot
      // was smaller than the cutoff value. However, since LDLT is not rank-revealing
      // we should only make sure that we do not introduce INF or NaN values.
      // Remark that LAPACK also uses 0 as the cutoff value.
      RealScalar realAkk = numext::real(mat.coeffRef(k,k));
-      if((rs>0) && (abs(realAkk) > RealScalar(0)))
+      bool pivot_is_valid = (abs(realAkk) > RealScalar(0));
+
+      if(k==0 && !pivot_is_valid)
+      {
+        // The entire diagonal is zero, there is nothing more to do
+        // except filling the transpositions, and checking whether the matrix is zero.
+        sign = ZeroSign;
+        for(Index j = 0; j<size; ++j)
+        {
+          transpositions.coeffRef(j) = IndexType(j);
+          ret = ret && (mat.col(j).tail(size-j-1).array()==Scalar(0)).all();
+        }
+        return ret;
+      }
+
+      if((rs>0) && pivot_is_valid)
        A21 /= realAkk;
+      else if(rs>0)
+        ret = ret && (A21.array()==Scalar(0)).all();
+
+      if(found_zero_pivot && pivot_is_valid) ret = false; // factorization failed
+      else if(!pivot_is_valid) found_zero_pivot = true;

      if (sign == PositiveSemiDef) {
-        if (realAkk < 0) sign = Indefinite;
+        if (realAkk < static_cast<RealScalar>(0)) sign = Indefinite;
      } else if (sign == NegativeSemiDef) {
-        if (realAkk > 0) sign = Indefinite;
+        if (realAkk > static_cast<RealScalar>(0)) sign = Indefinite;
      } else if (sign == ZeroSign) {
-        if (realAkk > 0) sign = PositiveSemiDef;
-        else if (realAkk < 0) sign = NegativeSemiDef;
+        if (realAkk > static_cast<RealScalar>(0)) sign = PositiveSemiDef;
+        else if (realAkk < static_cast<RealScalar>(0)) sign = NegativeSemiDef;
      }
    }

-    return true;
+    return ret;
  }

  // Reference for the algorithm: Davis and Hager, "Multiple Rank
@@ -427,21 +495,35 @@ template<typename MatrixType> struct LDLT_Traits<MatrixType,Upper>
 /** Compute / recompute the LDLT decomposition A = L D L^* = U^* D U of \a matrix
  */
 template<typename MatrixType, int _UpLo>
-LDLT<MatrixType,_UpLo>& LDLT<MatrixType,_UpLo>::compute(const MatrixType& a)
+template<typename InputType>
+LDLT<MatrixType,_UpLo>& LDLT<MatrixType,_UpLo>::compute(const EigenBase<InputType>& a)
 {
  check_template_parameters();
-  
+
  eigen_assert(a.rows()==a.cols());
  const Index size = a.rows();

-  m_matrix = a;
+  m_matrix = a.derived();
+
+  // Compute matrix L1 norm = max abs column sum.
+  m_l1_norm = RealScalar(0);
+  // TODO move this code to SelfAdjointView
+  for (Index col = 0; col < size; ++col) {
+    RealScalar abs_col_sum;
+    if (_UpLo == Lower)
+      abs_col_sum = m_matrix.col(col).tail(size - col).template lpNorm<1>() + m_matrix.row(col).head(col).template lpNorm<1>();
+    else
+      abs_col_sum = m_matrix.col(col).head(col).template lpNorm<1>() + m_matrix.row(col).tail(size - col).template lpNorm<1>();
+    if (abs_col_sum > m_l1_norm)
+      m_l1_norm = abs_col_sum;
+  }

  m_transpositions.resize(size);
  m_isInitialized = false;
  m_temporary.resize(size);
  m_sign = internal::ZeroSign;

-  internal::ldlt_inplace<UpLo>::unblocked(m_matrix, m_transpositions, m_temporary, m_sign);
+  m_info = internal::ldlt_inplace<UpLo>::unblocked(m_matrix, m_transpositions, m_temporary, m_sign) ? Success : NumericalIssue;

  m_isInitialized = true;
  return *this;
@@ -463,7 +545,7 @@ LDLT<MatrixType,_UpLo>& LDLT<MatrixType,_UpLo>::rankUpdate(const MatrixBase<Deri
    eigen_assert(m_matrix.rows()==size);
  }
  else
-  {    
+  {
    m_matrix.resize(size,size);
    m_matrix.setZero();
    m_transpositions.resize(size);
@@ -484,25 +566,33 @@ template<typename _MatrixType, int _UpLo>
 template<typename RhsType, typename DstType>
 void LDLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) const
 {
-  eigen_assert(rhs.rows() == rows());
+  _solve_impl_transposed<true>(rhs, dst);
+}
+
+template<typename _MatrixType,int _UpLo>
+template<bool Conjugate, typename RhsType, typename DstType>
+void LDLT<_MatrixType,_UpLo>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const
+{
  // dst = P b
  dst = m_transpositions * rhs;

  // dst = L^-1 (P b)
-  matrixL().solveInPlace(dst);
+  // dst = L^-*T (P b)
+  matrixL().template conjugateIf<!Conjugate>().solveInPlace(dst);

-  // dst = D^-1 (L^-1 P b)
+  // dst = D^-* (L^-1 P b)
+  // dst = D^-1 (L^-*T P b)
  // more precisely, use pseudo-inverse of D (see bug 241)
  using std::abs;
  const typename Diagonal<const MatrixType>::RealReturnType vecD(vectorD());
-  // In some previous versions, tolerance was set to the max of 1/highest and the maximal diagonal entry * epsilon
-  // as motivated by LAPACK's xGELSS:
+  // In some previous versions, tolerance was set to the max of 1/highest (or rather numeric_limits::min())
+  // and the maximal diagonal entry * epsilon as motivated by LAPACK's xGELSS:
  // RealScalar tolerance = numext::maxi(vecD.array().abs().maxCoeff() * NumTraits<RealScalar>::epsilon(),RealScalar(1) / NumTraits<RealScalar>::highest());
  // However, LDLT is not rank revealing, and so adjusting the tolerance wrt to the highest
  // diagonal element is not well justified and leads to numerical issues in some cases.
  // Moreover, Lapack's xSYTRS routines use 0 for the tolerance.
-  RealScalar tolerance = RealScalar(1) / NumTraits<RealScalar>::highest();
-  
+  // Using numeric_limits::min() gives us more robustness to denormals.
+  RealScalar tolerance = (std::numeric_limits<RealScalar>::min)();
  for (Index i = 0; i < vecD.size(); ++i)
  {
    if(abs(vecD(i)) > tolerance)
@@ -511,10 +601,12 @@ void LDLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) cons
      dst.row(i).setZero();
  }

-  // dst = L^-T (D^-1 L^-1 P b)
-  matrixU().solveInPlace(dst);
+  // dst = L^-* (D^-* L^-1 P b)
+  // dst = L^-T (D^-1 L^-*T P b)
+  matrixL().transpose().template conjugateIf<Conjugate>().solveInPlace(dst);

-  // dst = P^-1 (L^-T D^-1 L^-1 P b) = A^-1 b
+  // dst = P^T (L^-* D^-* L^-1 P b) = A^-1 b
+  // dst = P^-T (L^-T D^-1 L^-*T P b) = A^-1 b
  dst = m_transpositions.transpose() * dst;
 }
 #endif
@@ -569,7 +661,6 @@ MatrixType LDLT<MatrixType,_UpLo>::reconstructedMatrix() const
  return res;
 }

-#ifndef __CUDACC__
 /** \cholesky_module
  * \returns the Cholesky decomposition with full pivoting without square root of \c *this
  * \sa MatrixBase::ldlt()
@@ -591,7 +682,6 @@ MatrixBase<Derived>::ldlt() const
 {
  return LDLT<PlainObject>(derived());
 }
-#endif // __CUDACC__

 } // end namespace Eigen

--- a/Eigen/src/Cholesky/LLT.h
+++ b/Eigen/src/Cholesky/LLT.h
@@ -10,9 +10,19 @@
 #ifndef EIGEN_LLT_H
 #define EIGEN_LLT_H

-namespace Eigen { 
+namespace Eigen {

 namespace internal{
+
+template<typename _MatrixType, int _UpLo> struct traits<LLT<_MatrixType, _UpLo> >
+ : traits<_MatrixType>
+{
+  typedef MatrixXpr XprKind;
+  typedef SolverStorage StorageKind;
+  typedef int StorageIndex;
+  enum { Flags = 0 };
+};
+
 template<typename MatrixType, int UpLo> struct LLT_Traits;
 }

@@ -22,9 +32,9 @@ template<typename MatrixType, int UpLo> struct LLT_Traits;
  *
  * \brief Standard Cholesky decomposition (LL^T) of a matrix and associated features
  *
-  * \param MatrixType the type of the matrix of which we are computing the LL^T Cholesky decomposition
-  * \param UpLo the triangular part that will be used for the decompositon: Lower (default) or Upper.
-  *             The other triangular part won't be read.
+  * \tparam _MatrixType the type of the matrix of which we are computing the LL^T Cholesky decomposition
+  * \tparam _UpLo the triangular part that will be used for the decomposition: Lower (default) or Upper.
+  *               The other triangular part won't be read.
  *
  * This class performs a LL^T Cholesky decomposition of a symmetric, positive definite
  * matrix A such that A = LL^* = U^*U, where L is lower triangular.
@@ -40,27 +50,31 @@ template<typename MatrixType, int UpLo> struct LLT_Traits;
  *
  * Example: \include LLT_example.cpp
  * Output: \verbinclude LLT_example.out
-  *    
+  *
+  * \b Performance: for best performance, it is recommended to use a column-major storage format
+  * with the Lower triangular part (the default), or, equivalently, a row-major storage format
+  * with the Upper triangular part. Otherwise, you might get a 20% slowdown for the full factorization
+  * step, and rank-updates can be up to 3 times slower.
+  *
+  * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism.
+  *
+  * Note that during the decomposition, only the lower (or upper, as defined by _UpLo) triangular part of A is considered.
+  * Therefore, the opposite strict triangular part does not have to store correct values.
+  *
  * \sa MatrixBase::llt(), SelfAdjointView::llt(), class LDLT
  */
- /* HEY THIS DOX IS DISABLED BECAUSE THERE's A BUG EITHER HERE OR IN LDLT ABOUT THAT (OR BOTH)
-  * Note that during the decomposition, only the upper triangular part of A is considered. Therefore,
-  * the strict lower part does not have to store correct values.
-  */
 template<typename _MatrixType, int _UpLo> class LLT
+        : public SolverBase<LLT<_MatrixType, _UpLo> >
 {
  public:
    typedef _MatrixType MatrixType;
+    typedef SolverBase<LLT> Base;
+    friend class SolverBase<LLT>;
+
+    EIGEN_GENERIC_PUBLIC_INTERFACE(LLT)
    enum {
-      RowsAtCompileTime = MatrixType::RowsAtCompileTime,
-      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
-      Options = MatrixType::Options,
      MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
    };
-    typedef typename MatrixType::Scalar Scalar;
-    typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
-    typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
-    typedef typename MatrixType::StorageIndex StorageIndex;

    enum {
      PacketSize = internal::packet_traits<Scalar>::size,
@@ -87,11 +101,27 @@ template<typename _MatrixType, int _UpLo> class LLT
    explicit LLT(Index size) : m_matrix(size, size),
                    m_isInitialized(false) {}

-    explicit LLT(const MatrixType& matrix)
+    template<typename InputType>
+    explicit LLT(const EigenBase<InputType>& matrix)
      : m_matrix(matrix.rows(), matrix.cols()),
        m_isInitialized(false)
    {
-      compute(matrix);
+      compute(matrix.derived());
+    }
+
+    /** \brief Constructs a LLT factorization from a given matrix
+      *
+      * This overloaded constructor is provided for \link InplaceDecomposition inplace decomposition \endlink when
+      * \c MatrixType is an Eigen::Ref.
+      *
+      * \sa LLT(const EigenBase&)
+      */
+    template<typename InputType>
+    explicit LLT(EigenBase<InputType>& matrix)
+      : m_matrix(matrix.derived()),
+        m_isInitialized(false)
+    {
+      compute(matrix.derived());
    }

    /** \returns a view of the upper triangular matrix U */
@@ -108,6 +138,7 @@ template<typename _MatrixType, int _UpLo> class LLT
      return Traits::getL(m_matrix);
    }

+    #ifdef EIGEN_PARSED_BY_DOXYGEN
    /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A.
      *
      * Since this LLT class assumes anyway that the matrix A is invertible, the solution
@@ -120,18 +151,24 @@ template<typename _MatrixType, int _UpLo> class LLT
      */
    template<typename Rhs>
    inline const Solve<LLT, Rhs>
-    solve(const MatrixBase<Rhs>& b) const
-    {
-      eigen_assert(m_isInitialized && "LLT is not initialized.");
-      eigen_assert(m_matrix.rows()==b.rows()
-                && "LLT::solve(): invalid number of rows of the right hand side matrix b");
-      return Solve<LLT, Rhs>(*this, b.derived());
-    }
+    solve(const MatrixBase<Rhs>& b) const;
+    #endif

    template<typename Derived>
-    void solveInPlace(MatrixBase<Derived> &bAndX) const;
+    void solveInPlace(const MatrixBase<Derived> &bAndX) const;

-    LLT& compute(const MatrixType& matrix);
+    template<typename InputType>
+    LLT& compute(const EigenBase<InputType>& matrix);
+
+    /** \returns an estimate of the reciprocal condition number of the matrix of
+      *  which \c *this is the Cholesky decomposition.
+      */
+    RealScalar rcond() const
+    {
+      eigen_assert(m_isInitialized && "LLT is not initialized.");
+      eigen_assert(m_info == Success && "LLT failed because matrix appears to be negative");
+      return internal::rcond_estimate_helper(m_l1_norm, *this);
+    }

    /** \returns the LLT decomposition matrix
      *
@@ -148,8 +185,8 @@ template<typename _MatrixType, int _UpLo> class LLT

    /** \brief Reports whether previous computation was successful.
      *
-      * \returns \c Success if computation was succesful,
-      *          \c NumericalIssue if the matrix.appears to be negative.
+      * \returns \c Success if computation was successful,
+      *          \c NumericalIssue if the matrix appears not to be positive definite.
      */
    ComputationInfo info() const
    {
@@ -157,30 +194,40 @@ template<typename _MatrixType, int _UpLo> class LLT
      return m_info;
    }

+    /** \returns the adjoint of \c *this, that is, a const reference to the decomposition itself as the underlying matrix is self-adjoint.
+      *
+      * This method is provided for compatibility with other matrix decompositions, thus enabling generic code such as:
+      * \code x = decomposition.adjoint().solve(b) \endcode
+      */
+    const LLT& adjoint() const { return *this; };
+
    inline Index rows() const { return m_matrix.rows(); }
    inline Index cols() const { return m_matrix.cols(); }

    template<typename VectorType>
-    LLT rankUpdate(const VectorType& vec, const RealScalar& sigma = 1);
-    
+    LLT & rankUpdate(const VectorType& vec, const RealScalar& sigma = 1);
+
    #ifndef EIGEN_PARSED_BY_DOXYGEN
    template<typename RhsType, typename DstType>
-    EIGEN_DEVICE_FUNC
    void _solve_impl(const RhsType &rhs, DstType &dst) const;
+
+    template<bool Conjugate, typename RhsType, typename DstType>
+    void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const;
    #endif

  protected:
-    
+
    static void check_template_parameters()
    {
      EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
    }
-    
+
    /** \internal
      * Used to compute and store L
      * The strict upper part is not used and even not initialized.
      */
    MatrixType m_matrix;
+    RealScalar m_l1_norm;
    bool m_isInitialized;
    ComputationInfo m_info;
 };
@@ -266,7 +313,7 @@ template<typename Scalar> struct llt_inplace<Scalar, Lower>
  static Index unblocked(MatrixType& mat)
  {
    using std::sqrt;
-    
+
    eigen_assert(mat.rows()==mat.cols());
    const Index size = mat.rows();
    for(Index k = 0; k < size; ++k)
@@ -283,7 +330,7 @@ template<typename Scalar> struct llt_inplace<Scalar, Lower>
        return k;
      mat.coeffRef(k,k) = x = sqrt(x);
      if (k>0 && rs>0) A21.noalias() -= A20 * A10.adjoint();
-      if (rs>0) A21 *= RealScalar(1)/x;
+      if (rs>0) A21 /= x;
    }
    return -1;
  }
@@ -315,7 +362,7 @@ template<typename Scalar> struct llt_inplace<Scalar, Lower>
      Index ret;
      if((ret=unblocked(A11))>=0) return k+ret;
      if(rs>0) A11.adjoint().template triangularView<Upper>().template solveInPlace<OnTheRight>(A21);
-      if(rs>0) A22.template selfadjointView<Lower>().rankUpdate(A21,-1); // bottleneck
+      if(rs>0) A22.template selfadjointView<Lower>().rankUpdate(A21,typename NumTraits<RealScalar>::Literal(-1)); // bottleneck
    }
    return -1;
  }
@@ -326,7 +373,7 @@ template<typename Scalar> struct llt_inplace<Scalar, Lower>
    return Eigen::internal::llt_rank_update_lower(mat, vec, sigma);
  }
 };
-  
+
 template<typename Scalar> struct llt_inplace<Scalar, Upper>
 {
  typedef typename NumTraits<Scalar>::Real RealScalar;
@@ -381,14 +428,29 @@ template<typename MatrixType> struct LLT_Traits<MatrixType,Upper>
  * Output: \verbinclude TutorialLinAlgComputeTwice.out
  */
 template<typename MatrixType, int _UpLo>
-LLT<MatrixType,_UpLo>& LLT<MatrixType,_UpLo>::compute(const MatrixType& a)
+template<typename InputType>
+LLT<MatrixType,_UpLo>& LLT<MatrixType,_UpLo>::compute(const EigenBase<InputType>& a)
 {
  check_template_parameters();
-  
+
  eigen_assert(a.rows()==a.cols());
  const Index size = a.rows();
  m_matrix.resize(size, size);
-  m_matrix = a;
+  if (!internal::is_same_dense(m_matrix, a.derived()))
+    m_matrix = a.derived();
+
+  // Compute matrix L1 norm = max abs column sum.
+  m_l1_norm = RealScalar(0);
+  // TODO move this code to SelfAdjointView
+  for (Index col = 0; col < size; ++col) {
+    RealScalar abs_col_sum;
+    if (_UpLo == Lower)
+      abs_col_sum = m_matrix.col(col).tail(size - col).template lpNorm<1>() + m_matrix.row(col).head(col).template lpNorm<1>();
+    else
+      abs_col_sum = m_matrix.col(col).head(col).template lpNorm<1>() + m_matrix.row(col).tail(size - col).template lpNorm<1>();
+    if (abs_col_sum > m_l1_norm)
+      m_l1_norm = abs_col_sum;
+  }

  m_isInitialized = true;
  bool ok = Traits::inplace_decomposition(m_matrix);
@@ -404,7 +466,7 @@ LLT<MatrixType,_UpLo>& LLT<MatrixType,_UpLo>::compute(const MatrixType& a)
  */
 template<typename _MatrixType, int _UpLo>
 template<typename VectorType>
-LLT<_MatrixType,_UpLo> LLT<_MatrixType,_UpLo>::rankUpdate(const VectorType& v, const RealScalar& sigma)
+LLT<_MatrixType,_UpLo> & LLT<_MatrixType,_UpLo>::rankUpdate(const VectorType& v, const RealScalar& sigma)
 {
  EIGEN_STATIC_ASSERT_VECTOR_ONLY(VectorType);
  eigen_assert(v.size()==m_matrix.cols());
@@ -416,33 +478,42 @@ LLT<_MatrixType,_UpLo> LLT<_MatrixType,_UpLo>::rankUpdate(const VectorType& v, c

  return *this;
 }
- 
+
 #ifndef EIGEN_PARSED_BY_DOXYGEN
 template<typename _MatrixType,int _UpLo>
 template<typename RhsType, typename DstType>
 void LLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) const
 {
-  dst = rhs;
-  solveInPlace(dst);
+  _solve_impl_transposed<true>(rhs, dst);
+}
+
+template<typename _MatrixType,int _UpLo>
+template<bool Conjugate, typename RhsType, typename DstType>
+void LLT<_MatrixType,_UpLo>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const
+{
+    dst = rhs;
+
+    matrixL().template conjugateIf<!Conjugate>().solveInPlace(dst);
+    matrixU().template conjugateIf<!Conjugate>().solveInPlace(dst);
 }
 #endif

 /** \internal use x = llt_object.solve(x);
-  * 
+  *
  * This is the \em in-place version of solve().
  *
  * \param bAndX represents both the right-hand side matrix b and result x.
  *
-  * \returns true always! If you need to check for existence of solutions, use another decomposition like LU, QR, or SVD.
+  * This version avoids a copy when the right hand side matrix b is not needed anymore.
  *
-  * This version avoids a copy when the right hand side matrix b is not
-  * needed anymore.
+  * \warning The parameter is only marked 'const' to make the C++ compiler accept a temporary expression here.
+  * This function will const_cast it, so constness isn't honored here.
  *
  * \sa LLT::solve(), MatrixBase::llt()
  */
 template<typename MatrixType, int _UpLo>
 template<typename Derived>
-void LLT<MatrixType,_UpLo>::solveInPlace(MatrixBase<Derived> &bAndX) const
+void LLT<MatrixType,_UpLo>::solveInPlace(const MatrixBase<Derived> &bAndX) const
 {
  eigen_assert(m_isInitialized && "LLT is not initialized.");
  eigen_assert(m_matrix.rows()==bAndX.rows());
@@ -460,7 +531,6 @@ MatrixType LLT<MatrixType,_UpLo>::reconstructedMatrix() const
  return matrixL() * matrixL().adjoint().toDenseMatrix();
 }

-#ifndef __CUDACC__
 /** \cholesky_module
  * \returns the LLT decomposition of \c *this
  * \sa SelfAdjointView::llt()
@@ -482,8 +552,7 @@ SelfAdjointView<MatrixType, UpLo>::llt() const
 {
  return LLT<PlainObject,UpLo>(m_matrix);
 }
-#endif // __CUDACC__
-  
+
 } // end namespace Eigen

 #endif // EIGEN_LLT_H
--- a/Eigen/src/Cholesky/LLT_LAPACKE.h
+++ b/Eigen/src/Cholesky/LLT_LAPACKE.h
@@ -25,25 +25,22 @@
 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 ********************************************************************************
- *   Content : Eigen bindings to Intel(R) MKL
+ *   Content : Eigen bindings to LAPACKe
 *     LLt decomposition based on LAPACKE_?potrf function.
 ********************************************************************************
 */

-#ifndef EIGEN_LLT_MKL_H
-#define EIGEN_LLT_MKL_H
-
-#include "Eigen/src/Core/util/MKL_support.h"
-#include <iostream>
+#ifndef EIGEN_LLT_LAPACKE_H
+#define EIGEN_LLT_LAPACKE_H

 namespace Eigen { 

 namespace internal {

-template<typename Scalar> struct mkl_llt;
+template<typename Scalar> struct lapacke_llt;

-#define EIGEN_MKL_LLT(EIGTYPE, MKLTYPE, MKLPREFIX) \
-template<> struct mkl_llt<EIGTYPE> \
+#define EIGEN_LAPACKE_LLT(EIGTYPE, BLASTYPE, LAPACKE_PREFIX) \
+template<> struct lapacke_llt<EIGTYPE> \
 { \
  template<typename MatrixType> \
  static inline Index potrf(MatrixType& m, char uplo) \
@@ -53,13 +50,13 @@ template<> struct mkl_llt<EIGTYPE> \
    EIGTYPE* a; \
    eigen_assert(m.rows()==m.cols()); \
    /* Set up parameters for ?potrf */ \
-    size = m.rows(); \
+    size = convert_index<lapack_int>(m.rows()); \
    StorageOrder = MatrixType::Flags&RowMajorBit?RowMajor:ColMajor; \
    matrix_order = StorageOrder==RowMajor ? LAPACK_ROW_MAJOR : LAPACK_COL_MAJOR; \
    a = &(m.coeffRef(0,0)); \
-    lda = m.outerStride(); \
+    lda = convert_index<lapack_int>(m.outerStride()); \
 \
-    info = LAPACKE_##MKLPREFIX##potrf( matrix_order, uplo, size, (MKLTYPE*)a, lda ); \
+    info = LAPACKE_##LAPACKE_PREFIX##potrf( matrix_order, uplo, size, (BLASTYPE*)a, lda ); \
    info = (info==0) ? -1 : info>0 ? info-1 : size; \
    return info; \
  } \
@@ -69,7 +66,7 @@ template<> struct llt_inplace<EIGTYPE, Lower> \
  template<typename MatrixType> \
  static Index blocked(MatrixType& m) \
  { \
-    return mkl_llt<EIGTYPE>::potrf(m, 'L'); \
+    return lapacke_llt<EIGTYPE>::potrf(m, 'L'); \
  } \
  template<typename MatrixType, typename VectorType> \
  static Index rankUpdate(MatrixType& mat, const VectorType& vec, const typename MatrixType::RealScalar& sigma) \
@@ -80,7 +77,7 @@ template<> struct llt_inplace<EIGTYPE, Upper> \
  template<typename MatrixType> \
  static Index blocked(MatrixType& m) \
  { \
-    return mkl_llt<EIGTYPE>::potrf(m, 'U'); \
+    return lapacke_llt<EIGTYPE>::potrf(m, 'U'); \
  } \
  template<typename MatrixType, typename VectorType> \
  static Index rankUpdate(MatrixType& mat, const VectorType& vec, const typename MatrixType::RealScalar& sigma) \
@@ -90,13 +87,13 @@ template<> struct llt_inplace<EIGTYPE, Upper> \
  } \
 };

-EIGEN_MKL_LLT(double, double, d)
-EIGEN_MKL_LLT(float, float, s)
-EIGEN_MKL_LLT(dcomplex, MKL_Complex16, z)
-EIGEN_MKL_LLT(scomplex, MKL_Complex8, c)
+EIGEN_LAPACKE_LLT(double, double, d)
+EIGEN_LAPACKE_LLT(float, float, s)
+EIGEN_LAPACKE_LLT(dcomplex, lapack_complex_double, z)
+EIGEN_LAPACKE_LLT(scomplex, lapack_complex_float, c)

 } // end namespace internal

 } // end namespace Eigen

-#endif // EIGEN_LLT_MKL_H
+#endif // EIGEN_LLT_LAPACKE_H
--- a/Eigen/src/CholmodSupport/CMakeLists.txt
+++ b/Eigen/src/CholmodSupport/CMakeLists.txt
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_CholmodSupport_SRCS "*.h")
-
-INSTALL(FILES 
-  ${Eigen_CholmodSupport_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/CholmodSupport COMPONENT Devel
-  )
--- a/Eigen/src/CholmodSupport/CholmodSupport.h
+++ b/Eigen/src/CholmodSupport/CholmodSupport.h
@@ -10,38 +10,44 @@
 #ifndef EIGEN_CHOLMODSUPPORT_H
 #define EIGEN_CHOLMODSUPPORT_H

-namespace Eigen { 
+namespace Eigen {

 namespace internal {

-template<typename Scalar, typename CholmodType>
-void cholmod_configure_matrix(CholmodType& mat)
-{
-  if (internal::is_same<Scalar,float>::value)
-  {
-    mat.xtype = CHOLMOD_REAL;
-    mat.dtype = CHOLMOD_SINGLE;
-  }
-  else if (internal::is_same<Scalar,double>::value)
-  {
+template<typename Scalar> struct cholmod_configure_matrix;
+
+template<> struct cholmod_configure_matrix<double> {
+  template<typename CholmodType>
+  static void run(CholmodType& mat) {
    mat.xtype = CHOLMOD_REAL;
    mat.dtype = CHOLMOD_DOUBLE;
  }
-  else if (internal::is_same<Scalar,std::complex<float> >::value)
-  {
-    mat.xtype = CHOLMOD_COMPLEX;
-    mat.dtype = CHOLMOD_SINGLE;
-  }
-  else if (internal::is_same<Scalar,std::complex<double> >::value)
-  {
+};
+
+template<> struct cholmod_configure_matrix<std::complex<double> > {
+  template<typename CholmodType>
+  static void run(CholmodType& mat) {
    mat.xtype = CHOLMOD_COMPLEX;
    mat.dtype = CHOLMOD_DOUBLE;
  }
-  else
-  {
-    eigen_assert(false && "Scalar type not supported by CHOLMOD");
-  }
-}
+};
+
+// Other scalar types are not yet supported by Cholmod
+// template<> struct cholmod_configure_matrix<float> {
+//   template<typename CholmodType>
+//   static void run(CholmodType& mat) {
+//     mat.xtype = CHOLMOD_REAL;
+//     mat.dtype = CHOLMOD_SINGLE;
+//   }
+// };
+//
+// template<> struct cholmod_configure_matrix<std::complex<float> > {
+//   template<typename CholmodType>
+//   static void run(CholmodType& mat) {
+//     mat.xtype = CHOLMOD_COMPLEX;
+//     mat.dtype = CHOLMOD_SINGLE;
+//   }
+// };

 } // namespace internal

@@ -49,11 +55,11 @@ void cholmod_configure_matrix(CholmodType& mat)
  * Note that the data are shared.
  */
 template<typename _Scalar, int _Options, typename _StorageIndex>
-cholmod_sparse viewAsCholmod(SparseMatrix<_Scalar,_Options,_StorageIndex>& mat)
+cholmod_sparse viewAsCholmod(Ref<SparseMatrix<_Scalar,_Options,_StorageIndex> > mat)
 {
  cholmod_sparse res;
  res.nzmax   = mat.nonZeros();
-  res.nrow    = mat.rows();;
+  res.nrow    = mat.rows();
  res.ncol    = mat.cols();
  res.p       = mat.outerIndexPtr();
  res.i       = mat.innerIndexPtr();
@@ -73,12 +79,12 @@ cholmod_sparse viewAsCholmod(SparseMatrix<_Scalar,_Options,_StorageIndex>& mat)

  res.dtype   = 0;
  res.stype   = -1;
-  
+
  if (internal::is_same<_StorageIndex,int>::value)
  {
    res.itype = CHOLMOD_INT;
  }
-  else if (internal::is_same<_StorageIndex,UF_long>::value)
+  else if (internal::is_same<_StorageIndex,SuiteSparse_long>::value)
  {
    res.itype = CHOLMOD_LONG;
  }
@@ -88,17 +94,24 @@ cholmod_sparse viewAsCholmod(SparseMatrix<_Scalar,_Options,_StorageIndex>& mat)
  }

  // setup res.xtype
-  internal::cholmod_configure_matrix<_Scalar>(res);
-  
+  internal::cholmod_configure_matrix<_Scalar>::run(res);
+
  res.stype = 0;
-  
+
  return res;
 }

 template<typename _Scalar, int _Options, typename _Index>
 const cholmod_sparse viewAsCholmod(const SparseMatrix<_Scalar,_Options,_Index>& mat)
 {
-  cholmod_sparse res = viewAsCholmod(mat.const_cast_derived());
+  cholmod_sparse res = viewAsCholmod(Ref<SparseMatrix<_Scalar,_Options,_Index> >(mat.const_cast_derived()));
+  return res;
+}
+
+template<typename _Scalar, int _Options, typename _Index>
+const cholmod_sparse viewAsCholmod(const SparseVector<_Scalar,_Options,_Index>& mat)
+{
+  cholmod_sparse res = viewAsCholmod(Ref<SparseMatrix<_Scalar,_Options,_Index> >(mat.const_cast_derived()));
  return res;
 }

@@ -107,10 +120,13 @@ const cholmod_sparse viewAsCholmod(const SparseMatrix<_Scalar,_Options,_Index>&
 template<typename _Scalar, int _Options, typename _Index, unsigned int UpLo>
 cholmod_sparse viewAsCholmod(const SparseSelfAdjointView<const SparseMatrix<_Scalar,_Options,_Index>, UpLo>& mat)
 {
-  cholmod_sparse res = viewAsCholmod(mat.matrix().const_cast_derived());
-  
+  cholmod_sparse res = viewAsCholmod(Ref<SparseMatrix<_Scalar,_Options,_Index> >(mat.matrix().const_cast_derived()));
+
  if(UpLo==Upper) res.stype =  1;
  if(UpLo==Lower) res.stype = -1;
+  // swap stype for rowmajor matrices (only works for real matrices)
+  EIGEN_STATIC_ASSERT((_Options & RowMajorBit) == 0 || NumTraits<_Scalar>::IsComplex == 0, THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES);
+  if(_Options & RowMajorBit) res.stype *=-1;

  return res;
 }
@@ -131,7 +147,7 @@ cholmod_dense viewAsCholmod(MatrixBase<Derived>& mat)
  res.x      = (void*)(mat.derived().data());
  res.z      = 0;

-  internal::cholmod_configure_matrix<Scalar>(res);
+  internal::cholmod_configure_matrix<Scalar>::run(res);

  return res;
 }
@@ -146,6 +162,44 @@ MappedSparseMatrix<Scalar,Flags,StorageIndex> viewAsEigen(cholmod_sparse& cm)
          static_cast<StorageIndex*>(cm.p), static_cast<StorageIndex*>(cm.i),static_cast<Scalar*>(cm.x) );
 }

+namespace internal {
+
+// template specializations for int and SuiteSparse_long that call the matching cholmod_* / cholmod_l_* function
+
+#define EIGEN_CHOLMOD_SPECIALIZE0(ret, name) \
+    template<typename _StorageIndex> inline ret cm_ ## name       (cholmod_common &Common) { return cholmod_ ## name   (&Common); } \
+    template<>                       inline ret cm_ ## name<SuiteSparse_long> (cholmod_common &Common) { return cholmod_l_ ## name (&Common); }
+
+#define EIGEN_CHOLMOD_SPECIALIZE1(ret, name, t1, a1) \
+    template<typename _StorageIndex> inline ret cm_ ## name       (t1& a1, cholmod_common &Common) { return cholmod_ ## name   (&a1, &Common); } \
+    template<>                       inline ret cm_ ## name<SuiteSparse_long> (t1& a1, cholmod_common &Common) { return cholmod_l_ ## name (&a1, &Common); }
+
+EIGEN_CHOLMOD_SPECIALIZE0(int, start)
+EIGEN_CHOLMOD_SPECIALIZE0(int, finish)
+
+EIGEN_CHOLMOD_SPECIALIZE1(int, free_factor, cholmod_factor*, L)
+EIGEN_CHOLMOD_SPECIALIZE1(int, free_dense,  cholmod_dense*,  X)
+EIGEN_CHOLMOD_SPECIALIZE1(int, free_sparse, cholmod_sparse*, A)
+
+EIGEN_CHOLMOD_SPECIALIZE1(cholmod_factor*, analyze, cholmod_sparse, A)
+
+template<typename _StorageIndex> inline cholmod_dense*  cm_solve         (int sys, cholmod_factor& L, cholmod_dense&  B, cholmod_common &Common) { return cholmod_solve     (sys, &L, &B, &Common); }
+template<>                       inline cholmod_dense*  cm_solve<SuiteSparse_long>   (int sys, cholmod_factor& L, cholmod_dense&  B, cholmod_common &Common) { return cholmod_l_solve   (sys, &L, &B, &Common); }
+
+template<typename _StorageIndex> inline cholmod_sparse* cm_spsolve       (int sys, cholmod_factor& L, cholmod_sparse& B, cholmod_common &Common) { return cholmod_spsolve   (sys, &L, &B, &Common); }
+template<>                       inline cholmod_sparse* cm_spsolve<SuiteSparse_long> (int sys, cholmod_factor& L, cholmod_sparse& B, cholmod_common &Common) { return cholmod_l_spsolve (sys, &L, &B, &Common); }
+
+template<typename _StorageIndex>
+inline int  cm_factorize_p       (cholmod_sparse*  A, double beta[2], _StorageIndex* fset, std::size_t fsize, cholmod_factor* L, cholmod_common &Common) { return cholmod_factorize_p   (A, beta, fset, fsize, L, &Common); }
+template<>
+inline int  cm_factorize_p<SuiteSparse_long> (cholmod_sparse*  A, double beta[2], SuiteSparse_long* fset,          std::size_t fsize, cholmod_factor* L, cholmod_common &Common) { return cholmod_l_factorize_p (A, beta, fset, fsize, L, &Common); }
+
+#undef EIGEN_CHOLMOD_SPECIALIZE0
+#undef EIGEN_CHOLMOD_SPECIALIZE1
+
+}  // namespace internal
+
+
 enum CholmodMode {
  CholmodAuto, CholmodSimplicialLLt, CholmodSupernodalLLt, CholmodLDLt
 };
@@ -170,37 +224,43 @@ class CholmodBase : public SparseSolverBase<Derived>
    typedef typename MatrixType::RealScalar RealScalar;
    typedef MatrixType CholMatrixType;
    typedef typename MatrixType::StorageIndex StorageIndex;
+    enum {
+      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
+      MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
+    };

  public:

    CholmodBase()
-      : m_cholmodFactor(0), m_info(Success)
+      : m_cholmodFactor(0), m_info(Success), m_factorizationIsOk(false), m_analysisIsOk(false)
    {
-      m_shiftOffset[0] = m_shiftOffset[1] = RealScalar(0.0);
-      cholmod_start(&m_cholmod);
+      EIGEN_STATIC_ASSERT((internal::is_same<double,RealScalar>::value), CHOLMOD_SUPPORTS_DOUBLE_PRECISION_ONLY);
+      m_shiftOffset[0] = m_shiftOffset[1] = 0.0;
+      internal::cm_start<StorageIndex>(m_cholmod);
    }

    explicit CholmodBase(const MatrixType& matrix)
-      : m_cholmodFactor(0), m_info(Success)
+      : m_cholmodFactor(0), m_info(Success), m_factorizationIsOk(false), m_analysisIsOk(false)
    {
-      m_shiftOffset[0] = m_shiftOffset[1] = RealScalar(0.0);
-      cholmod_start(&m_cholmod);
+      EIGEN_STATIC_ASSERT((internal::is_same<double,RealScalar>::value), CHOLMOD_SUPPORTS_DOUBLE_PRECISION_ONLY);
+      m_shiftOffset[0] = m_shiftOffset[1] = 0.0;
+      internal::cm_start<StorageIndex>(m_cholmod);
      compute(matrix);
    }

    ~CholmodBase()
    {
      if(m_cholmodFactor)
-        cholmod_free_factor(&m_cholmodFactor, &m_cholmod);
-      cholmod_finish(&m_cholmod);
+        internal::cm_free_factor<StorageIndex>(m_cholmodFactor, m_cholmod);
+      internal::cm_finish<StorageIndex>(m_cholmod);
    }
-    
+
    inline StorageIndex cols() const { return internal::convert_index<StorageIndex, Index>(m_cholmodFactor->n); }
    inline StorageIndex rows() const { return internal::convert_index<StorageIndex, Index>(m_cholmodFactor->n); }
-    
+
    /** \brief Reports whether previous computation was successful.
      *
-      * \returns \c Success if computation was succesful,
+      * \returns \c Success if computation was successful,
      *          \c NumericalIssue if the matrix.appears to be negative.
      */
    ComputationInfo info() const
@@ -216,29 +276,29 @@ class CholmodBase : public SparseSolverBase<Derived>
      factorize(matrix);
      return derived();
    }
-    
+
    /** Performs a symbolic decomposition on the sparsity pattern of \a matrix.
      *
      * This function is particularly useful when solving for several problems having the same structure.
-      * 
+      *
      * \sa factorize()
      */
    void analyzePattern(const MatrixType& matrix)
    {
      if(m_cholmodFactor)
      {
-        cholmod_free_factor(&m_cholmodFactor, &m_cholmod);
+        internal::cm_free_factor<StorageIndex>(m_cholmodFactor, m_cholmod);
        m_cholmodFactor = 0;
      }
      cholmod_sparse A = viewAsCholmod(matrix.template selfadjointView<UpLo>());
-      m_cholmodFactor = cholmod_analyze(&A, &m_cholmod);
-      
+      m_cholmodFactor = internal::cm_analyze<StorageIndex>(A, m_cholmod);
+
      this->m_isInitialized = true;
      this->m_info = Success;
      m_analysisIsOk = true;
      m_factorizationIsOk = false;
    }
-    
+
    /** Performs a numeric decomposition of \a matrix
      *
      * The given matrix must have the same sparsity pattern as the matrix on which the symbolic decomposition has been performed.
@@ -249,17 +309,17 @@ class CholmodBase : public SparseSolverBase<Derived>
    {
      eigen_assert(m_analysisIsOk && "You must first call analyzePattern()");
      cholmod_sparse A = viewAsCholmod(matrix.template selfadjointView<UpLo>());
-      cholmod_factorize_p(&A, m_shiftOffset, 0, 0, m_cholmodFactor, &m_cholmod);
-      
+      internal::cm_factorize_p<StorageIndex>(&A, m_shiftOffset, 0, 0, m_cholmodFactor, m_cholmod);
+
      // If the factorization failed, minor is the column at which it did. On success minor == n.
      this->m_info = (m_cholmodFactor->minor == m_cholmodFactor->n ? Success : NumericalIssue);
      m_factorizationIsOk = true;
    }
-    
+
    /** Returns a reference to the Cholmod's configuration structure to get a full control over the performed operations.
     *  See the Cholmod user guide for details. */
    cholmod_common& cholmod() { return m_cholmod; }
-    
+
    #ifndef EIGEN_PARSED_BY_DOXYGEN
    /** \internal */
    template<typename Rhs,typename Dest>
@@ -270,23 +330,25 @@ class CholmodBase : public SparseSolverBase<Derived>
      EIGEN_UNUSED_VARIABLE(size);
      eigen_assert(size==b.rows());

-      // note: cd stands for Cholmod Dense
-      Rhs& b_ref(b.const_cast_derived());
+      // Cholmod needs column-major storage without inner-stride, which corresponds to the default behavior of Ref.
+      Ref<const Matrix<typename Rhs::Scalar,Dynamic,Dynamic,ColMajor> > b_ref(b.derived());
+
      cholmod_dense b_cd = viewAsCholmod(b_ref);
-      cholmod_dense* x_cd = cholmod_solve(CHOLMOD_A, m_cholmodFactor, &b_cd, &m_cholmod);
+      cholmod_dense* x_cd = internal::cm_solve<StorageIndex>(CHOLMOD_A, *m_cholmodFactor, b_cd, m_cholmod);
      if(!x_cd)
      {
        this->m_info = NumericalIssue;
        return;
      }
      // TODO optimize this copy by swapping when possible (be careful with alignment, etc.)
+      // NOTE Actually, the copy can be avoided by calling cholmod_solve2 instead of cholmod_solve
      dest = Matrix<Scalar,Dest::RowsAtCompileTime,Dest::ColsAtCompileTime>::Map(reinterpret_cast<Scalar*>(x_cd->x),b.rows(),b.cols());
-      cholmod_free_dense(&x_cd, &m_cholmod);
+      internal::cm_free_dense<StorageIndex>(x_cd, m_cholmod);
    }
-    
+
    /** \internal */
-    template<typename RhsScalar, int RhsOptions, typename RhsIndex, typename DestScalar, int DestOptions, typename DestIndex>
-    void _solve_impl(const SparseMatrix<RhsScalar,RhsOptions,RhsIndex> &b, SparseMatrix<DestScalar,DestOptions,DestIndex> &dest) const
+    template<typename RhsDerived, typename DestDerived>
+    void _solve_impl(const SparseMatrixBase<RhsDerived> &b, SparseMatrixBase<DestDerived> &dest) const
    {
      eigen_assert(m_factorizationIsOk && "The decomposition is not in a valid state for solving, you must first call either compute() or symbolic()/numeric()");
      const Index size = m_cholmodFactor->n;
@@ -294,20 +356,22 @@ class CholmodBase : public SparseSolverBase<Derived>
      eigen_assert(size==b.rows());

      // note: cs stands for Cholmod Sparse
-      cholmod_sparse b_cs = viewAsCholmod(b);
-      cholmod_sparse* x_cs = cholmod_spsolve(CHOLMOD_A, m_cholmodFactor, &b_cs, &m_cholmod);
+      Ref<SparseMatrix<typename RhsDerived::Scalar,ColMajor,typename RhsDerived::StorageIndex> > b_ref(b.const_cast_derived());
+      cholmod_sparse b_cs = viewAsCholmod(b_ref);
+      cholmod_sparse* x_cs = internal::cm_spsolve<StorageIndex>(CHOLMOD_A, *m_cholmodFactor, b_cs, m_cholmod);
      if(!x_cs)
      {
        this->m_info = NumericalIssue;
        return;
      }
      // TODO optimize this copy by swapping when possible (be careful with alignment, etc.)
-      dest = viewAsEigen<DestScalar,DestOptions,DestIndex>(*x_cs);
-      cholmod_free_sparse(&x_cs, &m_cholmod);
+      // NOTE cholmod_spsolve in fact just calls the dense solver for blocks of 4 columns at a time (similar to Eigen's sparse solver)
+      dest.derived() = viewAsEigen<typename DestDerived::Scalar,ColMajor,typename DestDerived::StorageIndex>(*x_cs);
+      internal::cm_free_sparse<StorageIndex>(x_cs, m_cholmod);
    }
    #endif // EIGEN_PARSED_BY_DOXYGEN
-    
-    
+
+
    /** Sets the shift parameter that will be used to adjust the diagonal coefficients during the numerical factorization.
      *
      * During the numerical factorization, an offset term is added to the diagonal coefficients:\n
@@ -319,18 +383,69 @@ class CholmodBase : public SparseSolverBase<Derived>
      */
    Derived& setShift(const RealScalar& offset)
    {
-      m_shiftOffset[0] = offset;
+      m_shiftOffset[0] = double(offset);
      return derived();
    }
-    
+
+    /** \returns the determinant of the underlying matrix from the current factorization */
+    Scalar determinant() const
+    {
+      using std::exp;
+      return exp(logDeterminant());
+    }
+
+    /** \returns the log determinant of the underlying matrix from the current factorization */
+    Scalar logDeterminant() const
+    {
+      using std::log;
+      using numext::real;
+      eigen_assert(m_factorizationIsOk && "The decomposition is not in a valid state for solving, you must first call either compute() or symbolic()/numeric()");
+
+      RealScalar logDet = 0;
+      Scalar *x = static_cast<Scalar*>(m_cholmodFactor->x);
+      if (m_cholmodFactor->is_super)
+      {
+        // Supernodal factorization stored as a packed list of dense column-major blocks,
+        // as described by the following structure:
+
+        // super[k] == index of the first column of the k-th super node
+        StorageIndex *super = static_cast<StorageIndex*>(m_cholmodFactor->super);
+        // pi[k] == offset to the description of row indices
+        StorageIndex *pi = static_cast<StorageIndex*>(m_cholmodFactor->pi);
+        // px[k] == offset to the respective dense block
+        StorageIndex *px = static_cast<StorageIndex*>(m_cholmodFactor->px);
+
+        Index nb_super_nodes = m_cholmodFactor->nsuper;
+        for (Index k=0; k < nb_super_nodes; ++k)
+        {
+          StorageIndex ncols = super[k + 1] - super[k];
+          StorageIndex nrows = pi[k + 1] - pi[k];
+
+          Map<const Array<Scalar,1,Dynamic>, 0, InnerStride<> > sk(x + px[k], ncols, InnerStride<>(nrows+1));
+          logDet += sk.real().log().sum();
+        }
+      }
+      else
+      {
+        // Simplicial factorization stored as standard CSC matrix.
+        StorageIndex *p = static_cast<StorageIndex*>(m_cholmodFactor->p);
+        Index size = m_cholmodFactor->n;
+        for (Index k=0; k<size; ++k)
+          logDet += log(real( x[p[k]] ));
+      }
+      if (m_cholmodFactor->is_ll)
+        logDet *= 2.0;
+      return logDet;
+    };
+
    template<typename Stream>
    void dumpMemory(Stream& /*s*/)
    {}
-    
+
  protected:
    mutable cholmod_common m_cholmod;
    cholmod_factor* m_cholmodFactor;
-    RealScalar m_shiftOffset[2];
+    double m_shiftOffset[2];
    mutable ComputationInfo m_info;
    int m_factorizationIsOk;
    int m_analysisIsOk;
@@ -350,20 +465,24 @@ class CholmodBase : public SparseSolverBase<Derived>
  * \tparam _UpLo the triangular part that will be used for the computations. It can be Lower
  *               or Upper. Default is Lower.
  *
+  * \implsparsesolverconcept
+  *
  * This class supports all kind of SparseMatrix<>: row or column major; upper, lower, or both; compressed or non compressed.
  *
-  * \sa \ref TutorialSparseDirectSolvers, class CholmodSupernodalLLT, class SimplicialLLT
+  * \warning Only double precision real and complex scalar types are supported by Cholmod.
+  *
+  * \sa \ref TutorialSparseSolverConcept, class CholmodSupernodalLLT, class SimplicialLLT
  */
 template<typename _MatrixType, int _UpLo = Lower>
 class CholmodSimplicialLLT : public CholmodBase<_MatrixType, _UpLo, CholmodSimplicialLLT<_MatrixType, _UpLo> >
 {
    typedef CholmodBase<_MatrixType, _UpLo, CholmodSimplicialLLT> Base;
    using Base::m_cholmod;
-    
+
  public:
-    
+
    typedef _MatrixType MatrixType;
-    
+
    CholmodSimplicialLLT() : Base() { init(); }

    CholmodSimplicialLLT(const MatrixType& matrix) : Base()
@@ -397,20 +516,24 @@ class CholmodSimplicialLLT : public CholmodBase<_MatrixType, _UpLo, CholmodSimpl
  * \tparam _UpLo the triangular part that will be used for the computations. It can be Lower
  *               or Upper. Default is Lower.
  *
+  * \implsparsesolverconcept
+  *
  * This class supports all kind of SparseMatrix<>: row or column major; upper, lower, or both; compressed or non compressed.
  *
-  * \sa \ref TutorialSparseDirectSolvers, class CholmodSupernodalLLT, class SimplicialLDLT
+  * \warning Only double precision real and complex scalar types are supported by Cholmod.
+  *
+  * \sa \ref TutorialSparseSolverConcept, class CholmodSupernodalLLT, class SimplicialLDLT
  */
 template<typename _MatrixType, int _UpLo = Lower>
 class CholmodSimplicialLDLT : public CholmodBase<_MatrixType, _UpLo, CholmodSimplicialLDLT<_MatrixType, _UpLo> >
 {
    typedef CholmodBase<_MatrixType, _UpLo, CholmodSimplicialLDLT> Base;
    using Base::m_cholmod;
-    
+
  public:
-    
+
    typedef _MatrixType MatrixType;
-    
+
    CholmodSimplicialLDLT() : Base() { init(); }

    CholmodSimplicialLDLT(const MatrixType& matrix) : Base()
@@ -442,20 +565,24 @@ class CholmodSimplicialLDLT : public CholmodBase<_MatrixType, _UpLo, CholmodSimp
  * \tparam _UpLo the triangular part that will be used for the computations. It can be Lower
  *               or Upper. Default is Lower.
  *
+  * \implsparsesolverconcept
+  *
  * This class supports all kind of SparseMatrix<>: row or column major; upper, lower, or both; compressed or non compressed.
  *
-  * \sa \ref TutorialSparseDirectSolvers
+  * \warning Only double precision real and complex scalar types are supported by Cholmod.
+  *
+  * \sa \ref TutorialSparseSolverConcept
  */
 template<typename _MatrixType, int _UpLo = Lower>
 class CholmodSupernodalLLT : public CholmodBase<_MatrixType, _UpLo, CholmodSupernodalLLT<_MatrixType, _UpLo> >
 {
    typedef CholmodBase<_MatrixType, _UpLo, CholmodSupernodalLLT> Base;
    using Base::m_cholmod;
-    
+
  public:
-    
+
    typedef _MatrixType MatrixType;
-    
+
    CholmodSupernodalLLT() : Base() { init(); }

    CholmodSupernodalLLT(const MatrixType& matrix) : Base()
@@ -489,20 +616,24 @@ class CholmodSupernodalLLT : public CholmodBase<_MatrixType, _UpLo, CholmodSuper
  * \tparam _UpLo the triangular part that will be used for the computations. It can be Lower
  *               or Upper. Default is Lower.
  *
+  * \implsparsesolverconcept
+  *
  * This class supports all kind of SparseMatrix<>: row or column major; upper, lower, or both; compressed or non compressed.
  *
-  * \sa \ref TutorialSparseDirectSolvers
+  * \warning Only double precision real and complex scalar types are supported by Cholmod.
+  *
+  * \sa \ref TutorialSparseSolverConcept
  */
 template<typename _MatrixType, int _UpLo = Lower>
 class CholmodDecomposition : public CholmodBase<_MatrixType, _UpLo, CholmodDecomposition<_MatrixType, _UpLo> >
 {
    typedef CholmodBase<_MatrixType, _UpLo, CholmodDecomposition> Base;
    using Base::m_cholmod;
-    
+
  public:
-    
+
    typedef _MatrixType MatrixType;
-    
+
    CholmodDecomposition() : Base() { init(); }

    CholmodDecomposition(const MatrixType& matrix) : Base()
@@ -512,7 +643,7 @@ class CholmodDecomposition : public CholmodBase<_MatrixType, _UpLo, CholmodDecom
    }

    ~CholmodDecomposition() {}
-    
+
    void setMode(CholmodMode mode)
    {
      switch(mode)
--- a/Eigen/src/Core/ArithmeticSequence.h
+++ b/Eigen/src/Core/ArithmeticSequence.h
@@ -0,0 +1,413 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2017 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_ARITHMETIC_SEQUENCE_H
+#define EIGEN_ARITHMETIC_SEQUENCE_H
+
+namespace Eigen {
+
+namespace internal {
+
+#if (!EIGEN_HAS_CXX11) || !((!EIGEN_COMP_GNUC) || EIGEN_COMP_GNUC>=48)
+template<typename T> struct aseq_negate {};
+
+template<> struct aseq_negate<Index> {
+  typedef Index type;
+};
+
+template<int N> struct aseq_negate<FixedInt<N> > {
+  typedef FixedInt<-N> type;
+};
+
+// Compilation error in the following case:
+template<> struct aseq_negate<FixedInt<DynamicIndex> > {};
+
+template<typename FirstType,typename SizeType,typename IncrType,
+         bool FirstIsSymbolic=symbolic::is_symbolic<FirstType>::value,
+         bool SizeIsSymbolic =symbolic::is_symbolic<SizeType>::value>
+struct aseq_reverse_first_type {
+  typedef Index type;
+};
+
+template<typename FirstType,typename SizeType,typename IncrType>
+struct aseq_reverse_first_type<FirstType,SizeType,IncrType,true,true> {
+  typedef symbolic::AddExpr<FirstType,
+                            symbolic::ProductExpr<symbolic::AddExpr<SizeType,symbolic::ValueExpr<FixedInt<-1> > >,
+                                                  symbolic::ValueExpr<IncrType> >
+                           > type;
+};
+
+template<typename SizeType,typename IncrType,typename EnableIf = void>
+struct aseq_reverse_first_type_aux {
+  typedef Index type;
+};
+
+template<typename SizeType,typename IncrType>
+struct aseq_reverse_first_type_aux<SizeType,IncrType,typename internal::enable_if<bool((SizeType::value+IncrType::value)|0x1)>::type> {
+  typedef FixedInt<(SizeType::value-1)*IncrType::value> type;
+};
+
+template<typename FirstType,typename SizeType,typename IncrType>
+struct aseq_reverse_first_type<FirstType,SizeType,IncrType,true,false> {
+  typedef typename aseq_reverse_first_type_aux<SizeType,IncrType>::type Aux;
+  typedef symbolic::AddExpr<FirstType,symbolic::ValueExpr<Aux> > type;
+};
+
+template<typename FirstType,typename SizeType,typename IncrType>
+struct aseq_reverse_first_type<FirstType,SizeType,IncrType,false,true> {
+  typedef symbolic::AddExpr<symbolic::ProductExpr<symbolic::AddExpr<SizeType,symbolic::ValueExpr<FixedInt<-1> > >,
+                                                  symbolic::ValueExpr<IncrType> >,
+                            symbolic::ValueExpr<> > type;
+};
+#endif
+
+// Helper to cleanup the type of the increment:
+template<typename T> struct cleanup_seq_incr {
+  typedef typename cleanup_index_type<T,DynamicIndex>::type type;
+};
+
+}
+
+//--------------------------------------------------------------------------------
+// seq(first,last,incr) and seqN(first,size,incr)
+//--------------------------------------------------------------------------------
+
+template<typename FirstType=Index,typename SizeType=Index,typename IncrType=internal::FixedInt<1> >
+class ArithmeticSequence;
+
+template<typename FirstType,typename SizeType,typename IncrType>
+ArithmeticSequence<typename internal::cleanup_index_type<FirstType>::type,
+                   typename internal::cleanup_index_type<SizeType>::type,
+                   typename internal::cleanup_seq_incr<IncrType>::type >
+seqN(FirstType first, SizeType size, IncrType incr);
+
+/** \class ArithmeticSequence
+  * \ingroup Core_Module
+  *
+  * This class represents an arithmetic progression \f$ a_0, a_1, a_2, ..., a_{n-1}\f$ defined by
+  * its \em first value \f$ a_0 \f$, its \em size (aka length) \em n, and the \em increment (aka stride)
+  * that is equal to \f$ a_{i+1}-a_{i}\f$ for any \em i.
+  *
+  * It is internally used as the return type of the Eigen::seq and Eigen::seqN functions, and as the input arguments
+  * of DenseBase::operator()(const RowIndices&, const ColIndices&), and most of the time this is the
+  * only way it is used.
+  *
+  * \tparam FirstType type of the first element, usually an Index,
+  *                   but internally it can be a symbolic expression
+  * \tparam SizeType type representing the size of the sequence, usually an Index
+  *                  or a compile time integral constant. Internally, it can also be a symbolic expression
+  * \tparam IncrType type of the increment, can be a runtime Index, or a compile time integral constant (default is compile-time 1)
+  *
+  * \sa Eigen::seq, Eigen::seqN, DenseBase::operator()(const RowIndices&, const ColIndices&), class IndexedView
+  */
+template<typename FirstType,typename SizeType,typename IncrType>
+class ArithmeticSequence
+{
+public:
+  ArithmeticSequence(FirstType first, SizeType size) : m_first(first), m_size(size) {}
+  ArithmeticSequence(FirstType first, SizeType size, IncrType incr) : m_first(first), m_size(size), m_incr(incr) {}
+
+  enum {
+    SizeAtCompileTime = internal::get_fixed_value<SizeType>::value,
+    IncrAtCompileTime = internal::get_fixed_value<IncrType,DynamicIndex>::value
+  };
+
+  /** \returns the size, i.e., number of elements, of the sequence */
+  Index size()  const { return m_size; }
+
+  /** \returns the first element \f$ a_0 \f$ in the sequence */
+  Index first()  const { return m_first; }
+
+  /** \returns the value \f$ a_i \f$ at index \a i in the sequence. */
+  Index operator[](Index i) const { return m_first + i * m_incr; }
+
+  const FirstType& firstObject() const { return m_first; }
+  const SizeType&  sizeObject()  const { return m_size; }
+  const IncrType&  incrObject()  const { return m_incr; }
+
+protected:
+  FirstType m_first;
+  SizeType  m_size;
+  IncrType  m_incr;
+
+public:
+
+#if EIGEN_HAS_CXX11 && ((!EIGEN_COMP_GNUC) || EIGEN_COMP_GNUC>=48)
+  auto reverse() const -> decltype(Eigen::seqN(m_first+(m_size+fix<-1>())*m_incr,m_size,-m_incr)) {
+    return seqN(m_first+(m_size+fix<-1>())*m_incr,m_size,-m_incr);
+  }
+#else
+protected:
+  typedef typename internal::aseq_negate<IncrType>::type ReverseIncrType;
+  typedef typename internal::aseq_reverse_first_type<FirstType,SizeType,IncrType>::type ReverseFirstType;
+public:
+  ArithmeticSequence<ReverseFirstType,SizeType,ReverseIncrType>
+  reverse() const {
+    return seqN(m_first+(m_size+fix<-1>())*m_incr,m_size,-m_incr);
+  }
+#endif
+};
+
+/** \returns an ArithmeticSequence starting at \a first, of length \a size, and increment \a incr
+  *
+  * \sa seqN(FirstType,SizeType), seq(FirstType,LastType,IncrType) */
+template<typename FirstType,typename SizeType,typename IncrType>
+ArithmeticSequence<typename internal::cleanup_index_type<FirstType>::type,typename internal::cleanup_index_type<SizeType>::type,typename internal::cleanup_seq_incr<IncrType>::type >
+seqN(FirstType first, SizeType size, IncrType incr)  {
+  return ArithmeticSequence<typename internal::cleanup_index_type<FirstType>::type,typename internal::cleanup_index_type<SizeType>::type,typename internal::cleanup_seq_incr<IncrType>::type>(first,size,incr);
+}
+
+/** \returns an ArithmeticSequence starting at \a first, of length \a size, and unit increment
+  *
+  * \sa seqN(FirstType,SizeType,IncrType), seq(FirstType,LastType) */
+template<typename FirstType,typename SizeType>
+ArithmeticSequence<typename internal::cleanup_index_type<FirstType>::type,typename internal::cleanup_index_type<SizeType>::type >
+seqN(FirstType first, SizeType size)  {
+  return ArithmeticSequence<typename internal::cleanup_index_type<FirstType>::type,typename internal::cleanup_index_type<SizeType>::type>(first,size);
+}
+
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+
+/** \returns an ArithmeticSequence starting at \a f, up (or down) to \a l, and with positive (or negative) increment \a incr
+  *
+  * It is essentially an alias to:
+  * \code
+  * seqN(f, (l-f+incr)/incr, incr);
+  * \endcode
+  *
+  * \sa seqN(FirstType,SizeType,IncrType), seq(FirstType,LastType)
+  */
+template<typename FirstType,typename LastType, typename IncrType>
+auto seq(FirstType f, LastType l, IncrType incr);
+
+/** \returns an ArithmeticSequence starting at \a f, up (or down) to \a l, and unit increment
+  *
+  * It is essentially an alias to:
+  * \code
+  * seqN(f,l-f+1);
+  * \endcode
+  *
+  * \sa seqN(FirstType,SizeType), seq(FirstType,LastType,IncrType)
+  */
+template<typename FirstType,typename LastType>
+auto seq(FirstType f, LastType l);
+
+#else // EIGEN_PARSED_BY_DOXYGEN
+
+#if EIGEN_HAS_CXX11
+template<typename FirstType,typename LastType>
+auto seq(FirstType f, LastType l) -> decltype(seqN(typename internal::cleanup_index_type<FirstType>::type(f),
+                                                   (  typename internal::cleanup_index_type<LastType>::type(l)
+                                                    - typename internal::cleanup_index_type<FirstType>::type(f)+fix<1>())))
+{
+  return seqN(typename internal::cleanup_index_type<FirstType>::type(f),
+              (typename internal::cleanup_index_type<LastType>::type(l)
+               -typename internal::cleanup_index_type<FirstType>::type(f)+fix<1>()));
+}
+
+template<typename FirstType,typename LastType, typename IncrType>
+auto seq(FirstType f, LastType l, IncrType incr)
+  -> decltype(seqN(typename internal::cleanup_index_type<FirstType>::type(f),
+                   (   typename internal::cleanup_index_type<LastType>::type(l)
+                     - typename internal::cleanup_index_type<FirstType>::type(f)+typename internal::cleanup_seq_incr<IncrType>::type(incr)
+                   ) / typename internal::cleanup_seq_incr<IncrType>::type(incr),
+                   typename internal::cleanup_seq_incr<IncrType>::type(incr)))
+{
+  typedef typename internal::cleanup_seq_incr<IncrType>::type CleanedIncrType;
+  return seqN(typename internal::cleanup_index_type<FirstType>::type(f),
+              ( typename internal::cleanup_index_type<LastType>::type(l)
+               -typename internal::cleanup_index_type<FirstType>::type(f)+CleanedIncrType(incr)) / CleanedIncrType(incr),
+              CleanedIncrType(incr));
+}
+
+#else // EIGEN_HAS_CXX11
+
+template<typename FirstType,typename LastType>
+typename internal::enable_if<!(symbolic::is_symbolic<FirstType>::value || symbolic::is_symbolic<LastType>::value),
+                             ArithmeticSequence<typename internal::cleanup_index_type<FirstType>::type,Index> >::type
+seq(FirstType f, LastType l)
+{
+  return seqN(typename internal::cleanup_index_type<FirstType>::type(f),
+              Index((typename internal::cleanup_index_type<LastType>::type(l)-typename internal::cleanup_index_type<FirstType>::type(f)+fix<1>())));
+}
+
+template<typename FirstTypeDerived,typename LastType>
+typename internal::enable_if<!symbolic::is_symbolic<LastType>::value,
+    ArithmeticSequence<FirstTypeDerived, symbolic::AddExpr<symbolic::AddExpr<symbolic::NegateExpr<FirstTypeDerived>,symbolic::ValueExpr<> >,
+                                                            symbolic::ValueExpr<internal::FixedInt<1> > > > >::type
+seq(const symbolic::BaseExpr<FirstTypeDerived> &f, LastType l)
+{
+  return seqN(f.derived(),(typename internal::cleanup_index_type<LastType>::type(l)-f.derived()+fix<1>()));
+}
+
+template<typename FirstType,typename LastTypeDerived>
+typename internal::enable_if<!symbolic::is_symbolic<FirstType>::value,
+    ArithmeticSequence<typename internal::cleanup_index_type<FirstType>::type,
+                        symbolic::AddExpr<symbolic::AddExpr<LastTypeDerived,symbolic::ValueExpr<> >,
+                                          symbolic::ValueExpr<internal::FixedInt<1> > > > >::type
+seq(FirstType f, const symbolic::BaseExpr<LastTypeDerived> &l)
+{
+  return seqN(typename internal::cleanup_index_type<FirstType>::type(f),(l.derived()-typename internal::cleanup_index_type<FirstType>::type(f)+fix<1>()));
+}
+
+template<typename FirstTypeDerived,typename LastTypeDerived>
+ArithmeticSequence<FirstTypeDerived,
+                    symbolic::AddExpr<symbolic::AddExpr<LastTypeDerived,symbolic::NegateExpr<FirstTypeDerived> >,symbolic::ValueExpr<internal::FixedInt<1> > > >
+seq(const symbolic::BaseExpr<FirstTypeDerived> &f, const symbolic::BaseExpr<LastTypeDerived> &l)
+{
+  return seqN(f.derived(),(l.derived()-f.derived()+fix<1>()));
+}
+
+
+template<typename FirstType,typename LastType, typename IncrType>
+typename internal::enable_if<!(symbolic::is_symbolic<FirstType>::value || symbolic::is_symbolic<LastType>::value),
+    ArithmeticSequence<typename internal::cleanup_index_type<FirstType>::type,Index,typename internal::cleanup_seq_incr<IncrType>::type> >::type
+seq(FirstType f, LastType l, IncrType incr)
+{
+  typedef typename internal::cleanup_seq_incr<IncrType>::type CleanedIncrType;
+  return seqN(typename internal::cleanup_index_type<FirstType>::type(f),
+              Index((typename internal::cleanup_index_type<LastType>::type(l)-typename internal::cleanup_index_type<FirstType>::type(f)+CleanedIncrType(incr))/CleanedIncrType(incr)), incr);
+}
+
+template<typename FirstTypeDerived,typename LastType, typename IncrType>
+typename internal::enable_if<!symbolic::is_symbolic<LastType>::value,
+    ArithmeticSequence<FirstTypeDerived,
+                        symbolic::QuotientExpr<symbolic::AddExpr<symbolic::AddExpr<symbolic::NegateExpr<FirstTypeDerived>,
+                                                                                   symbolic::ValueExpr<> >,
+                                                                 symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >,
+                                              symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >,
+                        typename internal::cleanup_seq_incr<IncrType>::type> >::type
+seq(const symbolic::BaseExpr<FirstTypeDerived> &f, LastType l, IncrType incr)
+{
+  typedef typename internal::cleanup_seq_incr<IncrType>::type CleanedIncrType;
+  return seqN(f.derived(),(typename internal::cleanup_index_type<LastType>::type(l)-f.derived()+CleanedIncrType(incr))/CleanedIncrType(incr), incr);
+}
+
+template<typename FirstType,typename LastTypeDerived, typename IncrType>
+typename internal::enable_if<!symbolic::is_symbolic<FirstType>::value,
+    ArithmeticSequence<typename internal::cleanup_index_type<FirstType>::type,
+                        symbolic::QuotientExpr<symbolic::AddExpr<symbolic::AddExpr<LastTypeDerived,symbolic::ValueExpr<> >,
+                                                                 symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >,
+                                               symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >,
+                        typename internal::cleanup_seq_incr<IncrType>::type> >::type
+seq(FirstType f, const symbolic::BaseExpr<LastTypeDerived> &l, IncrType incr)
+{
+  typedef typename internal::cleanup_seq_incr<IncrType>::type CleanedIncrType;
+  return seqN(typename internal::cleanup_index_type<FirstType>::type(f),
+              (l.derived()-typename internal::cleanup_index_type<FirstType>::type(f)+CleanedIncrType(incr))/CleanedIncrType(incr), incr);
+}
+
+template<typename FirstTypeDerived,typename LastTypeDerived, typename IncrType>
+ArithmeticSequence<FirstTypeDerived,
+                    symbolic::QuotientExpr<symbolic::AddExpr<symbolic::AddExpr<LastTypeDerived,
+                                                                               symbolic::NegateExpr<FirstTypeDerived> >,
+                                                             symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >,
+                                          symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >,
+                    typename internal::cleanup_seq_incr<IncrType>::type>
+seq(const symbolic::BaseExpr<FirstTypeDerived> &f, const symbolic::BaseExpr<LastTypeDerived> &l, IncrType incr)
+{
+  typedef typename internal::cleanup_seq_incr<IncrType>::type CleanedIncrType;
+  return seqN(f.derived(),(l.derived()-f.derived()+CleanedIncrType(incr))/CleanedIncrType(incr), incr);
+}
+#endif // EIGEN_HAS_CXX11
+
+#endif // EIGEN_PARSED_BY_DOXYGEN
+
+
+#if EIGEN_HAS_CXX11 || defined(EIGEN_PARSED_BY_DOXYGEN)
+/** \cpp11
+  * \returns a symbolic ArithmeticSequence representing the last \a size elements with increment \a incr.
+  *
+  * It is a shortcut for: \code seqN(last-(size-fix<1>)*incr, size, incr) \endcode
+  * 
+  * \sa lastN(SizeType), seqN(FirstType,SizeType), seq(FirstType,LastType,IncrType) */
+template<typename SizeType,typename IncrType>
+auto lastN(SizeType size, IncrType incr)
+-> decltype(seqN(Eigen::last-(size-fix<1>())*incr, size, incr))
+{
+  return seqN(Eigen::last-(size-fix<1>())*incr, size, incr);
+}
+
+/** \cpp11
+  * \returns a symbolic ArithmeticSequence representing the last \a size elements with a unit increment.
+  *
+  *  It is a shortcut for: \code seq(last+fix<1>-size, last) \endcode
+  * 
+  * \sa lastN(SizeType,IncrType), seqN(FirstType,SizeType), seq(FirstType,LastType) */
+template<typename SizeType>
+auto lastN(SizeType size)
+-> decltype(seqN(Eigen::last+fix<1>()-size, size))
+{
+  return seqN(Eigen::last+fix<1>()-size, size);
+}
+#endif
+
+namespace internal {
+
+// Convert a symbolic span into a usable one (i.e., remove last/end "keywords")
+template<typename T>
+struct make_size_type {
+  typedef typename internal::conditional<symbolic::is_symbolic<T>::value, Index, T>::type type;
+};
+
+template<typename FirstType,typename SizeType,typename IncrType,int XprSize>
+struct IndexedViewCompatibleType<ArithmeticSequence<FirstType,SizeType,IncrType>, XprSize> {
+  typedef ArithmeticSequence<Index,typename make_size_type<SizeType>::type,IncrType> type;
+};
+
+template<typename FirstType,typename SizeType,typename IncrType>
+ArithmeticSequence<Index,typename make_size_type<SizeType>::type,IncrType>
+makeIndexedViewCompatible(const ArithmeticSequence<FirstType,SizeType,IncrType>& ids, Index size,SpecializedType) {
+  return ArithmeticSequence<Index,typename make_size_type<SizeType>::type,IncrType>(
+            eval_expr_given_size(ids.firstObject(),size),eval_expr_given_size(ids.sizeObject(),size),ids.incrObject());
+}
+
+template<typename FirstType,typename SizeType,typename IncrType>
+struct get_compile_time_incr<ArithmeticSequence<FirstType,SizeType,IncrType> > {
+  enum { value = get_fixed_value<IncrType,DynamicIndex>::value };
+};
+
+} // end namespace internal
+
+/** \namespace Eigen::indexing
+  * \ingroup Core_Module
+  * 
+  * The sole purpose of this namespace is to be able to import all functions
+  * and symbols that are expected to be used within operator() for indexing
+  * and slicing. If you already imported the whole Eigen namespace:
+  * \code using namespace Eigen; \endcode
+  * then you are already all set. Otherwise, if you don't want/cannot import
+  * the whole Eigen namespace, the following line:
+  * \code using namespace Eigen::indexing; \endcode
+  * is equivalent to:
+  * \code
+  using Eigen::all;
+  using Eigen::seq;
+  using Eigen::seqN;
+  using Eigen::lastN; // c++11 only
+  using Eigen::last;
+  using Eigen::lastp1;
+  using Eigen::fix;
+  \endcode
+  */
+namespace indexing {
+  using Eigen::all;
+  using Eigen::seq;
+  using Eigen::seqN;
+  #if EIGEN_HAS_CXX11
+  using Eigen::lastN;
+  #endif
+  using Eigen::last;
+  using Eigen::lastp1;
+  using Eigen::fix;
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_ARITHMETIC_SEQUENCE_H
--- a/Eigen/src/Core/Array.h
+++ b/Eigen/src/Core/Array.h
@@ -12,7 +12,16 @@

 namespace Eigen {

-/** \class Array 
+namespace internal {
+template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
+struct traits<Array<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> > : traits<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >
+{
+  typedef ArrayXpr XprKind;
+  typedef ArrayBase<Array<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> > XprBase;
+};
+}
+
+/** \class Array
  * \ingroup Core_Module
  *
  * \brief General-purpose arrays with easy API for coefficient-wise operations
@@ -26,21 +35,12 @@ namespace Eigen {
  *
  * See documentation of class Matrix for detailed information on the template parameters
  * storage layout.
-  * 
-  * This class can be extended with the help of the plugin mechanism described on the page
-  * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_ARRAY_PLUGIN.
  *
-  * \sa \ref TutorialArrayClass, \ref TopicClassHierarchy
+  * This class can be extended with the help of the plugin mechanism described on the page
+  * \ref TopicCustomizing_Plugins by defining the preprocessor symbol \c EIGEN_ARRAY_PLUGIN.
+  *
+  * \sa \blank \ref TutorialArrayClass, \ref TopicClassHierarchy
  */
-namespace internal {
-template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
-struct traits<Array<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> > : traits<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >
-{
-  typedef ArrayXpr XprKind;
-  typedef ArrayBase<Array<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> > XprBase;
-};
-}
-
 template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
 class Array
  : public PlainObjectBase<Array<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >
@@ -147,23 +147,60 @@ class Array
    }
 #endif

-#ifdef EIGEN_HAVE_RVALUE_REFERENCES
+#if EIGEN_HAS_RVALUE_REFERENCES
    EIGEN_DEVICE_FUNC
-    Array(Array&& other)
+    Array(Array&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_constructible<Scalar>::value)
      : Base(std::move(other))
    {
      Base::_check_template_params();
-      if (RowsAtCompileTime!=Dynamic && ColsAtCompileTime!=Dynamic)
-        Base::_set_noalias(other);
    }
    EIGEN_DEVICE_FUNC
-    Array& operator=(Array&& other)
+    Array& operator=(Array&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_assignable<Scalar>::value)
    {
      other.swap(*this);
      return *this;
    }
 #endif

+    #if EIGEN_HAS_CXX11
+    /** \copydoc PlainObjectBase(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args)
+     *
+     * Example: \include Array_variadic_ctor_cxx11.cpp
+     * Output: \verbinclude Array_variadic_ctor_cxx11.out
+     *
+     * \sa Array(const std::initializer_list<std::initializer_list<Scalar>>&)
+     * \sa Array(const Scalar&), Array(const Scalar&,const Scalar&)
+     */
+    template <typename... ArgTypes>
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args)
+      : Base(a0, a1, a2, a3, args...) {}
+
+    /** \brief Constructs an array and initializes it from the coefficients given as initializer-lists grouped by row. \cpp11
+      * 
+      * In the general case, the constructor takes a list of rows, each row being represented as a list of coefficients:
+      * 
+      * Example: \include Array_initializer_list_23_cxx11.cpp
+      * Output: \verbinclude Array_initializer_list_23_cxx11.out
+      * 
+      * Each of the inner initializer lists must contain the exact same number of elements, otherwise an assertion is triggered.
+      * 
+      * In the case of a compile-time column 1D array, implicit transposition from a single row is allowed.
+      * Therefore <code> Array<int,Dynamic,1>{{1,2,3,4,5}}</code> is legal and the more verbose syntax
+      * <code>Array<int,Dynamic,1>{{1},{2},{3},{4},{5}}</code> can be avoided:
+      * 
+      * Example: \include Array_initializer_list_vector_cxx11.cpp
+      * Output: \verbinclude Array_initializer_list_vector_cxx11.out
+      * 
+      * In the case of fixed-sized arrays, the initializer list sizes must exactly match the array sizes,
+      * and implicit transposition is allowed for compile-time 1D arrays only.
+      * 
+      * \sa  Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args)
+      */
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Array(const std::initializer_list<std::initializer_list<Scalar>>& list) : Base(list) {}
+    #endif // end EIGEN_HAS_CXX11
+
    #ifndef EIGEN_PARSED_BY_DOXYGEN
    template<typename T>
    EIGEN_DEVICE_FUNC
@@ -180,6 +217,7 @@ class Array
      Base::_check_template_params();
      this->template _init2<T0,T1>(val0, val1);
    }
+
    #else
    /** \brief Constructs a fixed-sized array initialized with coefficients starting at \a data */
    EIGEN_DEVICE_FUNC explicit Array(const Scalar *data);
@@ -191,7 +229,8 @@ class Array
      */
    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE explicit Array(Index dim);
-    /** constructs an initialized 1x1 Array with the given coefficient */
+    /** constructs an initialized 1x1 Array with the given coefficient
+      * \sa Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args) */
    Array(const Scalar& value);
    /** constructs an uninitialized array with \a rows rows and \a cols columns.
      *
@@ -199,11 +238,14 @@ class Array
      * it is redundant to pass these parameters, so one should use the default constructor
      * Array() instead. */
    Array(Index rows, Index cols);
-    /** constructs an initialized 2D vector with given coefficients */
+    /** constructs an initialized 2D vector with given coefficients
+      * \sa Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args) */
    Array(const Scalar& val0, const Scalar& val1);
-    #endif
+    #endif  // end EIGEN_PARSED_BY_DOXYGEN 

-    /** constructs an initialized 3D vector with given coefficients */
+    /** constructs an initialized 3D vector with given coefficients
+      * \sa Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args)
+      */
    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Array(const Scalar& val0, const Scalar& val1, const Scalar& val2)
    {
@@ -213,7 +255,9 @@ class Array
      m_storage.data()[1] = val1;
      m_storage.data()[2] = val2;
    }
-    /** constructs an initialized 4D vector with given coefficients */
+    /** constructs an initialized 4D vector with given coefficients
+      * \sa Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args)
+      */
    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Array(const Scalar& val0, const Scalar& val1, const Scalar& val2, const Scalar& val3)
    {
@@ -231,10 +275,16 @@ class Array
            : Base(other)
    { }

+  private:
+    struct PrivateType {};
+  public:
+
    /** \sa MatrixBase::operator=(const EigenBase<OtherDerived>&) */
    template<typename OtherDerived>
    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Array(const EigenBase<OtherDerived> &other)
+    EIGEN_STRONG_INLINE Array(const EigenBase<OtherDerived> &other,
+                              typename internal::enable_if<internal::is_convertible<typename OtherDerived::Scalar,Scalar>::value,
+                                                           PrivateType>::type = PrivateType())
      : Base(other.derived())
    { }

@@ -254,7 +304,7 @@ class Array
 /** \defgroup arraytypedefs Global array typedefs
  * \ingroup Core_Module
  *
-  * Eigen defines several typedef shortcuts for most common 1D and 2D array types.
+  * %Eigen defines several typedef shortcuts for most common 1D and 2D array types.
  *
  * The general patterns are the following:
  *
@@ -267,6 +317,12 @@ class Array
  * There are also \c ArraySizeType which are self-explanatory. For example, \c Array4cf is
  * a fixed-size 1D array of 4 complex floats.
  *
+  * With \cpp11, template aliases are also defined for common sizes.
+  * They follow the same pattern as above except that the scalar type suffix is replaced by a
+  * template parameter, i.e.:
+  *   - `ArrayRowsCols<Type>` where `Rows` and `Cols` can be \c 2,\c 3,\c 4, or \c X for fixed or dynamic size.
+  *   - `ArraySize<Type>` where `Size` can be \c 2,\c 3,\c 4 or \c X for fixed or dynamic size 1D arrays.
+  * 
  * \sa class Array
  */

@@ -299,9 +355,43 @@ EIGEN_MAKE_ARRAY_TYPEDEFS_ALL_SIZES(std::complex<double>, cd)

 #undef EIGEN_MAKE_ARRAY_TYPEDEFS_ALL_SIZES
 #undef EIGEN_MAKE_ARRAY_TYPEDEFS
+#undef EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS

-#undef EIGEN_MAKE_ARRAY_TYPEDEFS_LARGE
+#if EIGEN_HAS_CXX11

+#define EIGEN_MAKE_ARRAY_TYPEDEFS(Size, SizeSuffix)               \
+/** \ingroup arraytypedefs */                                     \
+/** \brief \cpp11 */                                              \
+template <typename Type>                                          \
+using Array##SizeSuffix##SizeSuffix = Array<Type, Size, Size>;    \
+/** \ingroup arraytypedefs */                                     \
+/** \brief \cpp11 */                                              \
+template <typename Type>                                          \
+using Array##SizeSuffix = Array<Type, Size, 1>; 
+
+#define EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS(Size)                     \
+/** \ingroup arraytypedefs */                                     \
+/** \brief \cpp11 */                                              \
+template <typename Type>                                          \
+using Array##Size##X = Array<Type, Size, Dynamic>;                \
+/** \ingroup arraytypedefs */                                     \
+/** \brief \cpp11 */                                              \
+template <typename Type>                                          \
+using Array##X##Size = Array<Type, Dynamic, Size>;
+
+EIGEN_MAKE_ARRAY_TYPEDEFS(2, 2)
+EIGEN_MAKE_ARRAY_TYPEDEFS(3, 3)
+EIGEN_MAKE_ARRAY_TYPEDEFS(4, 4)
+EIGEN_MAKE_ARRAY_TYPEDEFS(Dynamic, X)
+EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS(2)
+EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS(3)
+EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS(4)
+
+#undef EIGEN_MAKE_ARRAY_TYPEDEFS
+#undef EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS
+
+#endif // EIGEN_HAS_CXX11
+  
 #define EIGEN_USING_ARRAY_TYPEDEFS_FOR_TYPE_AND_SIZE(TypeSuffix, SizeSuffix) \
 using Eigen::Matrix##SizeSuffix##TypeSuffix; \
 using Eigen::Vector##SizeSuffix##TypeSuffix; \
--- a/Eigen/src/Core/ArrayBase.h
+++ b/Eigen/src/Core/ArrayBase.h
@@ -32,7 +32,7 @@ template<typename ExpressionType> class MatrixWrapper;
  * \tparam Derived is the derived type, e.g., an array or an expression type.
  *
  * This class can be extended with the help of the plugin mechanism described on the page
-  * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_ARRAYBASE_PLUGIN.
+  * \ref TopicCustomizing_Plugins by defining the preprocessor symbol \c EIGEN_ARRAYBASE_PLUGIN.
  *
  * \sa class MatrixBase, \ref TopicClassHierarchy
  */
@@ -46,9 +46,6 @@ template<typename Derived> class ArrayBase

    typedef ArrayBase Eigen_BaseClassForSpecializationOfGlobalMathFuncImpl;

-    using internal::special_scalar_op_base<Derived,typename internal::traits<Derived>::Scalar,
-                typename NumTraits<typename internal::traits<Derived>::Scalar>::Real>::operator*;
-
    typedef typename internal::traits<Derived>::StorageKind StorageKind;
    typedef typename internal::traits<Derived>::Scalar Scalar;
    typedef typename internal::packet_traits<Scalar>::type PacketScalar;
@@ -72,6 +69,7 @@ template<typename Derived> class ArrayBase
    using Base::coeff;
    using Base::coeffRef;
    using Base::lazyAssign;
+    using Base::operator-;
    using Base::operator=;
    using Base::operator+=;
    using Base::operator-=;
@@ -90,7 +88,7 @@ template<typename Derived> class ArrayBase
 #endif // not EIGEN_PARSED_BY_DOXYGEN

 #define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::ArrayBase
-#   include "../plugins/CommonCwiseUnaryOps.h"
+#define EIGEN_DOC_UNARY_ADDONS(X,Y)
 #   include "../plugins/MatrixCwiseUnaryOps.h"
 #   include "../plugins/ArrayCwiseUnaryOps.h"
 #   include "../plugins/CommonCwiseBinaryOps.h"
@@ -100,11 +98,12 @@ template<typename Derived> class ArrayBase
 #     include EIGEN_ARRAYBASE_PLUGIN
 #   endif
 #undef EIGEN_CURRENT_STORAGE_BASE_CLASS
+#undef EIGEN_DOC_UNARY_ADDONS

    /** Special case of the template operator=, in order to prevent the compiler
      * from generating a default operator= (issue hit with g++ 4.1)
      */
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    Derived& operator=(const ArrayBase& other)
    {
      internal::call_assignment(derived(), other.derived());
@@ -113,28 +112,28 @@ template<typename Derived> class ArrayBase
    
    /** Set all the entries to \a value.
      * \sa DenseBase::setConstant(), DenseBase::fill() */
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    Derived& operator=(const Scalar &value)
    { Base::setConstant(value); return derived(); }

-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    Derived& operator+=(const Scalar& scalar);
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    Derived& operator-=(const Scalar& scalar);

    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    Derived& operator+=(const ArrayBase<OtherDerived>& other);
    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    Derived& operator-=(const ArrayBase<OtherDerived>& other);

    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    Derived& operator*=(const ArrayBase<OtherDerived>& other);

    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    Derived& operator/=(const ArrayBase<OtherDerived>& other);

  public:
@@ -176,10 +175,10 @@ template<typename Derived> class ArrayBase
  */
 template<typename Derived>
 template<typename OtherDerived>
-EIGEN_STRONG_INLINE Derived &
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived &
 ArrayBase<Derived>::operator-=(const ArrayBase<OtherDerived> &other)
 {
-  call_assignment(derived(), other.derived(), internal::sub_assign_op<Scalar>());
+  call_assignment(derived(), other.derived(), internal::sub_assign_op<Scalar,typename OtherDerived::Scalar>());
  return derived();
 }

@@ -189,10 +188,10 @@ ArrayBase<Derived>::operator-=(const ArrayBase<OtherDerived> &other)
  */
 template<typename Derived>
 template<typename OtherDerived>
-EIGEN_STRONG_INLINE Derived &
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived &
 ArrayBase<Derived>::operator+=(const ArrayBase<OtherDerived>& other)
 {
-  call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar>());
+  call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar,typename OtherDerived::Scalar>());
  return derived();
 }

@@ -202,7 +201,7 @@ ArrayBase<Derived>::operator+=(const ArrayBase<OtherDerived>& other)
  */
 template<typename Derived>
 template<typename OtherDerived>
-EIGEN_STRONG_INLINE Derived &
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived &
 ArrayBase<Derived>::operator*=(const ArrayBase<OtherDerived>& other)
 {
  call_assignment(derived(), other.derived(), internal::mul_assign_op<Scalar,typename OtherDerived::Scalar>());
@@ -215,10 +214,10 @@ ArrayBase<Derived>::operator*=(const ArrayBase<OtherDerived>& other)
  */
 template<typename Derived>
 template<typename OtherDerived>
-EIGEN_STRONG_INLINE Derived &
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived &
 ArrayBase<Derived>::operator/=(const ArrayBase<OtherDerived>& other)
 {
-  call_assignment(derived(), other.derived(), internal::div_assign_op<Scalar>());
+  call_assignment(derived(), other.derived(), internal::div_assign_op<Scalar,typename OtherDerived::Scalar>());
  return derived();
 }

--- a/Eigen/src/Core/ArrayWrapper.h
+++ b/Eigen/src/Core/ArrayWrapper.h
@@ -32,7 +32,8 @@ struct traits<ArrayWrapper<ExpressionType> >
  // Let's remove NestByRefBit
  enum {
    Flags0 = traits<typename remove_all<typename ExpressionType::Nested>::type >::Flags,
-    Flags = Flags0 & ~NestByRefBit
+    LvalueBitFlag = is_lvalue<ExpressionType>::value ? LvalueBit : 0,
+    Flags = (Flags0 & ~(NestByRefBit | LvalueBit)) | LvalueBitFlag
  };
 };
 }
@@ -52,7 +53,9 @@ class ArrayWrapper : public ArrayBase<ArrayWrapper<ExpressionType> >
                       const Scalar
                     >::type ScalarWithConstIfNotLvalue;

-    typedef typename internal::ref_selector<ExpressionType>::type NestedExpressionType;
+    typedef typename internal::ref_selector<ExpressionType>::non_const_type NestedExpressionType;
+
+    using Base::coeffRef;

    EIGEN_DEVICE_FUNC
    explicit EIGEN_STRONG_INLINE ArrayWrapper(ExpressionType& matrix) : m_expression(matrix) {}
@@ -67,76 +70,28 @@ class ArrayWrapper : public ArrayBase<ArrayWrapper<ExpressionType> >
    inline Index innerStride() const { return m_expression.innerStride(); }

    EIGEN_DEVICE_FUNC
-    inline ScalarWithConstIfNotLvalue* data() { return m_expression.const_cast_derived().data(); }
+    inline ScalarWithConstIfNotLvalue* data() { return m_expression.data(); }
    EIGEN_DEVICE_FUNC
    inline const Scalar* data() const { return m_expression.data(); }

-    EIGEN_DEVICE_FUNC
-    inline CoeffReturnType coeff(Index rowId, Index colId) const
-    {
-      return m_expression.coeff(rowId, colId);
-    }
-
-    EIGEN_DEVICE_FUNC
-    inline Scalar& coeffRef(Index rowId, Index colId)
-    {
-      return m_expression.const_cast_derived().coeffRef(rowId, colId);
-    }
-
    EIGEN_DEVICE_FUNC
    inline const Scalar& coeffRef(Index rowId, Index colId) const
    {
-      return m_expression.const_cast_derived().coeffRef(rowId, colId);
-    }
-
-    EIGEN_DEVICE_FUNC
-    inline CoeffReturnType coeff(Index index) const
-    {
-      return m_expression.coeff(index);
-    }
-
-    EIGEN_DEVICE_FUNC
-    inline Scalar& coeffRef(Index index)
-    {
-      return m_expression.const_cast_derived().coeffRef(index);
+      return m_expression.coeffRef(rowId, colId);
    }

    EIGEN_DEVICE_FUNC
    inline const Scalar& coeffRef(Index index) const
    {
-      return m_expression.const_cast_derived().coeffRef(index);
-    }
-
-    template<int LoadMode>
-    inline const PacketScalar packet(Index rowId, Index colId) const
-    {
-      return m_expression.template packet<LoadMode>(rowId, colId);
-    }
-
-    template<int LoadMode>
-    inline void writePacket(Index rowId, Index colId, const PacketScalar& val)
-    {
-      m_expression.const_cast_derived().template writePacket<LoadMode>(rowId, colId, val);
-    }
-
-    template<int LoadMode>
-    inline const PacketScalar packet(Index index) const
-    {
-      return m_expression.template packet<LoadMode>(index);
-    }
-
-    template<int LoadMode>
-    inline void writePacket(Index index, const PacketScalar& val)
-    {
-      m_expression.const_cast_derived().template writePacket<LoadMode>(index, val);
+      return m_expression.coeffRef(index);
    }

    template<typename Dest>
    EIGEN_DEVICE_FUNC
    inline void evalTo(Dest& dst) const { dst = m_expression; }

-    const typename internal::remove_all<NestedExpressionType>::type& 
    EIGEN_DEVICE_FUNC
+    const typename internal::remove_all<NestedExpressionType>::type& 
    nestedExpression() const 
    {
      return m_expression;
@@ -145,11 +100,11 @@ class ArrayWrapper : public ArrayBase<ArrayWrapper<ExpressionType> >
    /** Forwards the resizing request to the nested expression
      * \sa DenseBase::resize(Index)  */
    EIGEN_DEVICE_FUNC
-    void resize(Index newSize) { m_expression.const_cast_derived().resize(newSize); }
+    void resize(Index newSize) { m_expression.resize(newSize); }
    /** Forwards the resizing request to the nested expression
      * \sa DenseBase::resize(Index,Index)*/
    EIGEN_DEVICE_FUNC
-    void resize(Index rows, Index cols) { m_expression.const_cast_derived().resize(rows,cols); }
+    void resize(Index rows, Index cols) { m_expression.resize(rows,cols); }

  protected:
    NestedExpressionType m_expression;
@@ -175,7 +130,8 @@ struct traits<MatrixWrapper<ExpressionType> >
  // Let's remove NestByRefBit
  enum {
    Flags0 = traits<typename remove_all<typename ExpressionType::Nested>::type >::Flags,
-    Flags = Flags0 & ~NestByRefBit
+    LvalueBitFlag = is_lvalue<ExpressionType>::value ? LvalueBit : 0,
+    Flags = (Flags0 & ~(NestByRefBit | LvalueBit)) | LvalueBitFlag
  };
 };
 }
@@ -195,7 +151,9 @@ class MatrixWrapper : public MatrixBase<MatrixWrapper<ExpressionType> >
                       const Scalar
                     >::type ScalarWithConstIfNotLvalue;

-    typedef typename internal::ref_selector<ExpressionType>::type NestedExpressionType;
+    typedef typename internal::ref_selector<ExpressionType>::non_const_type NestedExpressionType;
+
+    using Base::coeffRef;

    EIGEN_DEVICE_FUNC
    explicit inline MatrixWrapper(ExpressionType& matrix) : m_expression(matrix) {}
@@ -210,68 +168,20 @@ class MatrixWrapper : public MatrixBase<MatrixWrapper<ExpressionType> >
    inline Index innerStride() const { return m_expression.innerStride(); }

    EIGEN_DEVICE_FUNC
-    inline ScalarWithConstIfNotLvalue* data() { return m_expression.const_cast_derived().data(); }
+    inline ScalarWithConstIfNotLvalue* data() { return m_expression.data(); }
    EIGEN_DEVICE_FUNC
    inline const Scalar* data() const { return m_expression.data(); }

-    EIGEN_DEVICE_FUNC
-    inline CoeffReturnType coeff(Index rowId, Index colId) const
-    {
-      return m_expression.coeff(rowId, colId);
-    }
-
-    EIGEN_DEVICE_FUNC
-    inline Scalar& coeffRef(Index rowId, Index colId)
-    {
-      return m_expression.const_cast_derived().coeffRef(rowId, colId);
-    }
-
    EIGEN_DEVICE_FUNC
    inline const Scalar& coeffRef(Index rowId, Index colId) const
    {
      return m_expression.derived().coeffRef(rowId, colId);
    }

-    EIGEN_DEVICE_FUNC
-    inline CoeffReturnType coeff(Index index) const
-    {
-      return m_expression.coeff(index);
-    }
-
-    EIGEN_DEVICE_FUNC
-    inline Scalar& coeffRef(Index index)
-    {
-      return m_expression.const_cast_derived().coeffRef(index);
-    }
-
    EIGEN_DEVICE_FUNC
    inline const Scalar& coeffRef(Index index) const
    {
-      return m_expression.const_cast_derived().coeffRef(index);
-    }
-
-    template<int LoadMode>
-    inline const PacketScalar packet(Index rowId, Index colId) const
-    {
-      return m_expression.template packet<LoadMode>(rowId, colId);
-    }
-
-    template<int LoadMode>
-    inline void writePacket(Index rowId, Index colId, const PacketScalar& val)
-    {
-      m_expression.const_cast_derived().template writePacket<LoadMode>(rowId, colId, val);
-    }
-
-    template<int LoadMode>
-    inline const PacketScalar packet(Index index) const
-    {
-      return m_expression.template packet<LoadMode>(index);
-    }
-
-    template<int LoadMode>
-    inline void writePacket(Index index, const PacketScalar& val)
-    {
-      m_expression.const_cast_derived().template writePacket<LoadMode>(index, val);
+      return m_expression.coeffRef(index);
    }

    EIGEN_DEVICE_FUNC
@@ -284,11 +194,11 @@ class MatrixWrapper : public MatrixBase<MatrixWrapper<ExpressionType> >
    /** Forwards the resizing request to the nested expression
      * \sa DenseBase::resize(Index)  */
    EIGEN_DEVICE_FUNC
-    void resize(Index newSize) { m_expression.const_cast_derived().resize(newSize); }
+    void resize(Index newSize) { m_expression.resize(newSize); }
    /** Forwards the resizing request to the nested expression
      * \sa DenseBase::resize(Index,Index)*/
    EIGEN_DEVICE_FUNC
-    void resize(Index rows, Index cols) { m_expression.const_cast_derived().resize(rows,cols); }
+    void resize(Index rows, Index cols) { m_expression.resize(rows,cols); }

  protected:
    NestedExpressionType m_expression;
--- a/Eigen/src/Core/Assign.h
+++ b/Eigen/src/Core/Assign.h
@@ -16,7 +16,7 @@ namespace Eigen {

 template<typename Derived>
 template<typename OtherDerived>
-EIGEN_STRONG_INLINE Derived& DenseBase<Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>
  ::lazyAssign(const DenseBase<OtherDerived>& other)
 {
  enum{
--- a/Eigen/src/Core/AssignEvaluator.h
+++ b/Eigen/src/Core/AssignEvaluator.h
@@ -24,25 +24,22 @@ namespace internal {

 // copy_using_evaluator_traits is based on assign_traits

-template <typename DstEvaluator, typename SrcEvaluator, typename AssignFunc>
+template <typename DstEvaluator, typename SrcEvaluator, typename AssignFunc, int MaxPacketSize = -1>
 struct copy_using_evaluator_traits
 {
  typedef typename DstEvaluator::XprType Dst;
  typedef typename Dst::Scalar DstScalar;
-  // TODO distinguish between linear traversal and inner-traversals
-  typedef typename find_best_packet<DstScalar,Dst::SizeAtCompileTime>::type PacketType; 
  
  enum {
    DstFlags = DstEvaluator::Flags,
-    SrcFlags = SrcEvaluator::Flags,
-    RequiredAlignment = unpacket_traits<PacketType>::alignment
+    SrcFlags = SrcEvaluator::Flags
  };
  
 public:
  enum {
    DstAlignment = DstEvaluator::Alignment,
    SrcAlignment = SrcEvaluator::Alignment,
-    DstHasDirectAccess = DstFlags & DirectAccessBit,
+    DstHasDirectAccess = (DstFlags & DirectAccessBit) == DirectAccessBit,
    JointAlignment = EIGEN_PLAIN_ENUM_MIN(DstAlignment,SrcAlignment)
  };

@@ -54,34 +51,56 @@ private:
    InnerMaxSize = int(Dst::IsVectorAtCompileTime) ? int(Dst::MaxSizeAtCompileTime)
              : int(DstFlags)&RowMajorBit ? int(Dst::MaxColsAtCompileTime)
              : int(Dst::MaxRowsAtCompileTime),
-    MaxSizeAtCompileTime = Dst::SizeAtCompileTime,
-    PacketSize = unpacket_traits<PacketType>::size
+    RestrictedInnerSize = EIGEN_SIZE_MIN_PREFER_FIXED(InnerSize,MaxPacketSize),
+    RestrictedLinearSize = EIGEN_SIZE_MIN_PREFER_FIXED(Dst::SizeAtCompileTime,MaxPacketSize),
+    OuterStride = int(outer_stride_at_compile_time<Dst>::ret),
+    MaxSizeAtCompileTime = Dst::SizeAtCompileTime
  };

+  // TODO distinguish between linear traversal and inner-traversals
+  typedef typename find_best_packet<DstScalar,RestrictedLinearSize>::type LinearPacketType;
+  typedef typename find_best_packet<DstScalar,RestrictedInnerSize>::type InnerPacketType;
+
  enum {
-    DstIsRowMajor = DstFlags&RowMajorBit,
-    SrcIsRowMajor = SrcFlags&RowMajorBit,
-    StorageOrdersAgree = (int(DstIsRowMajor) == int(SrcIsRowMajor)),
-    MightVectorize = StorageOrdersAgree
-                  && (int(DstFlags) & int(SrcFlags) & ActualPacketAccessBit)
-                  && (functor_traits<AssignFunc>::PacketAccess),
-    MayInnerVectorize  = MightVectorize && int(InnerSize)!=Dynamic && int(InnerSize)%int(PacketSize)==0
-                       && int(JointAlignment)>=int(RequiredAlignment),
-    MayLinearize = StorageOrdersAgree && (int(DstFlags) & int(SrcFlags) & LinearAccessBit),
-    MayLinearVectorize = MightVectorize && MayLinearize && DstHasDirectAccess
-                       && ((int(DstAlignment)>=int(RequiredAlignment)) || MaxSizeAtCompileTime == Dynamic),
-      /* If the destination isn't aligned, we have to do runtime checks and we don't unroll,
-         so it's only good for large enough sizes. */
-    MaySliceVectorize  = MightVectorize && DstHasDirectAccess
-                       && (int(InnerMaxSize)==Dynamic || int(InnerMaxSize)>=3*PacketSize)
-      /* slice vectorization can be slow, so we only want it if the slices are big, which is
-         indicated by InnerMaxSize rather than InnerSize, think of the case of a dynamic block
-         in a fixed-size matrix */
+    LinearPacketSize = unpacket_traits<LinearPacketType>::size,
+    InnerPacketSize = unpacket_traits<InnerPacketType>::size
  };

 public:
  enum {
-    Traversal = int(MayInnerVectorize)   ? int(InnerVectorizedTraversal)
+    LinearRequiredAlignment = unpacket_traits<LinearPacketType>::alignment,
+    InnerRequiredAlignment = unpacket_traits<InnerPacketType>::alignment
+  };
+
+private:
+  enum {
+    DstIsRowMajor = DstFlags&RowMajorBit,
+    SrcIsRowMajor = SrcFlags&RowMajorBit,
+    StorageOrdersAgree = (int(DstIsRowMajor) == int(SrcIsRowMajor)),
+    MightVectorize = bool(StorageOrdersAgree)
+                  && (int(DstFlags) & int(SrcFlags) & ActualPacketAccessBit)
+                  && bool(functor_traits<AssignFunc>::PacketAccess),
+    MayInnerVectorize  = MightVectorize
+                       && int(InnerSize)!=Dynamic && int(InnerSize)%int(InnerPacketSize)==0
+                       && int(OuterStride)!=Dynamic && int(OuterStride)%int(InnerPacketSize)==0
+                       && (EIGEN_UNALIGNED_VECTORIZE  || int(JointAlignment)>=int(InnerRequiredAlignment)),
+    MayLinearize = bool(StorageOrdersAgree) && (int(DstFlags) & int(SrcFlags) & LinearAccessBit),
+    MayLinearVectorize = bool(MightVectorize) && bool(MayLinearize) && bool(DstHasDirectAccess)
+                       && (EIGEN_UNALIGNED_VECTORIZE || (int(DstAlignment)>=int(LinearRequiredAlignment)) || MaxSizeAtCompileTime == Dynamic),
+      /* If the destination isn't aligned, we have to do runtime checks and we don't unroll,
+         so it's only good for large enough sizes. */
+    MaySliceVectorize  = bool(MightVectorize) && bool(DstHasDirectAccess)
+                       && (int(InnerMaxSize)==Dynamic || int(InnerMaxSize)>=(EIGEN_UNALIGNED_VECTORIZE?InnerPacketSize:(3*InnerPacketSize)))
+      /* Slice vectorization can be slow, so we only want it if the slices are big, which is
+         indicated by InnerMaxSize rather than InnerSize (think of the case of a dynamic block
+         in a fixed-size matrix).
+         However, with EIGEN_UNALIGNED_VECTORIZE and unrolling, slice vectorization is still worth it. */
+  };
+
+public:
+  enum {
+    Traversal = (int(MayLinearVectorize) && (LinearPacketSize>InnerPacketSize)) ? int(LinearVectorizedTraversal)
+              : int(MayInnerVectorize)   ? int(InnerVectorizedTraversal)
              : int(MayLinearVectorize)  ? int(LinearVectorizedTraversal)
              : int(MaySliceVectorize)   ? int(SliceVectorizedTraversal)
              : int(MayLinearize)        ? int(LinearTraversal)
@@ -91,15 +110,18 @@ public:
              || int(Traversal) == SliceVectorizedTraversal
  };

+  typedef typename conditional<int(Traversal)==LinearVectorizedTraversal, LinearPacketType, InnerPacketType>::type PacketType;
+
 private:
  enum {
-    UnrollingLimit      = EIGEN_UNROLLING_LIMIT * (Vectorized ? int(PacketSize) : 1),
+    ActualPacketSize    = int(Traversal)==LinearVectorizedTraversal ? LinearPacketSize
+                        : Vectorized ? InnerPacketSize
+                        : 1,
+    UnrollingLimit      = EIGEN_UNROLLING_LIMIT * ActualPacketSize,
    MayUnrollCompletely = int(Dst::SizeAtCompileTime) != Dynamic
-                       && int(SrcEvaluator::CoeffReadCost) != Dynamic
-                       && int(Dst::SizeAtCompileTime) * int(SrcEvaluator::CoeffReadCost) <= int(UnrollingLimit),
+                       && int(Dst::SizeAtCompileTime) * (int(DstEvaluator::CoeffReadCost)+int(SrcEvaluator::CoeffReadCost)) <= int(UnrollingLimit),
    MayUnrollInner      = int(InnerSize) != Dynamic
-                       && int(SrcEvaluator::CoeffReadCost) != Dynamic
-                       && int(InnerSize) * int(SrcEvaluator::CoeffReadCost) <= int(UnrollingLimit)
+                       && int(InnerSize) * (int(DstEvaluator::CoeffReadCost)+int(SrcEvaluator::CoeffReadCost)) <= int(UnrollingLimit)
  };

 public:
@@ -111,11 +133,17 @@ public:
                                             : int(NoUnrolling)
                  )
              : int(Traversal) == int(LinearVectorizedTraversal)
-                ? ( bool(MayUnrollCompletely) && (int(DstAlignment)>=int(RequiredAlignment)) ? int(CompleteUnrolling)
-                                                                                             : int(NoUnrolling) )
+                ? ( bool(MayUnrollCompletely) && ( EIGEN_UNALIGNED_VECTORIZE || (int(DstAlignment)>=int(LinearRequiredAlignment)))
+                          ? int(CompleteUnrolling)
+                          : int(NoUnrolling) )
              : int(Traversal) == int(LinearTraversal)
                ? ( bool(MayUnrollCompletely) ? int(CompleteUnrolling) 
                                              : int(NoUnrolling) )
+#if EIGEN_UNALIGNED_VECTORIZE
+              : int(Traversal) == int(SliceVectorizedTraversal)
+                ? ( bool(MayUnrollInner) ? int(InnerUnrolling)
+                                         : int(NoUnrolling) )
+#endif
              : int(NoUnrolling)
  };

@@ -125,27 +153,33 @@ public:
    std::cerr << "DstXpr: " << typeid(typename DstEvaluator::XprType).name() << std::endl;
    std::cerr << "SrcXpr: " << typeid(typename SrcEvaluator::XprType).name() << std::endl;
    std::cerr.setf(std::ios::hex, std::ios::basefield);
-    EIGEN_DEBUG_VAR(DstFlags)
-    EIGEN_DEBUG_VAR(SrcFlags)
+    std::cerr << "DstFlags" << " = " << DstFlags << " (" << demangle_flags(DstFlags) << " )" << std::endl;
+    std::cerr << "SrcFlags" << " = " << SrcFlags << " (" << demangle_flags(SrcFlags) << " )" << std::endl;
    std::cerr.unsetf(std::ios::hex);
    EIGEN_DEBUG_VAR(DstAlignment)
    EIGEN_DEBUG_VAR(SrcAlignment)
-    EIGEN_DEBUG_VAR(RequiredAlignment)
+    EIGEN_DEBUG_VAR(LinearRequiredAlignment)
+    EIGEN_DEBUG_VAR(InnerRequiredAlignment)
    EIGEN_DEBUG_VAR(JointAlignment)
    EIGEN_DEBUG_VAR(InnerSize)
    EIGEN_DEBUG_VAR(InnerMaxSize)
-    EIGEN_DEBUG_VAR(PacketSize)
+    EIGEN_DEBUG_VAR(LinearPacketSize)
+    EIGEN_DEBUG_VAR(InnerPacketSize)
+    EIGEN_DEBUG_VAR(ActualPacketSize)
    EIGEN_DEBUG_VAR(StorageOrdersAgree)
    EIGEN_DEBUG_VAR(MightVectorize)
    EIGEN_DEBUG_VAR(MayLinearize)
    EIGEN_DEBUG_VAR(MayInnerVectorize)
    EIGEN_DEBUG_VAR(MayLinearVectorize)
    EIGEN_DEBUG_VAR(MaySliceVectorize)
-    EIGEN_DEBUG_VAR(Traversal)
+    std::cerr << "Traversal" << " = " << Traversal << " (" << demangle_traversal(Traversal) << ")" << std::endl;
+    EIGEN_DEBUG_VAR(SrcEvaluator::CoeffReadCost)
+    EIGEN_DEBUG_VAR(DstEvaluator::CoeffReadCost)
+    EIGEN_DEBUG_VAR(Dst::SizeAtCompileTime)
    EIGEN_DEBUG_VAR(UnrollingLimit)
    EIGEN_DEBUG_VAR(MayUnrollCompletely)
    EIGEN_DEBUG_VAR(MayUnrollInner)
-    EIGEN_DEBUG_VAR(Unrolling)
+    std::cerr << "Unrolling" << " = " << Unrolling << " (" << demangle_unrolling(Unrolling) << ")" << std::endl;
    std::cerr << std::endl;
  }
 #endif
@@ -235,12 +269,13 @@ struct copy_using_evaluator_innervec_CompleteUnrolling
  enum {
    outer = Index / DstXprType::InnerSizeAtCompileTime,
    inner = Index % DstXprType::InnerSizeAtCompileTime,
-    JointAlignment = Kernel::AssignmentTraits::JointAlignment
+    SrcAlignment = Kernel::AssignmentTraits::SrcAlignment,
+    DstAlignment = Kernel::AssignmentTraits::DstAlignment
  };

  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
  {
-    kernel.template assignPacketByOuterInner<Aligned, JointAlignment, PacketType>(outer, inner);
+    kernel.template assignPacketByOuterInner<DstAlignment, SrcAlignment, PacketType>(outer, inner);
    enum { NextIndex = Index + unpacket_traits<PacketType>::size };
    copy_using_evaluator_innervec_CompleteUnrolling<Kernel, NextIndex, Stop>::run(kernel);
  }
@@ -252,20 +287,20 @@ struct copy_using_evaluator_innervec_CompleteUnrolling<Kernel, Stop, Stop>
  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel&) { }
 };

-template<typename Kernel, int Index_, int Stop>
+template<typename Kernel, int Index_, int Stop, int SrcAlignment, int DstAlignment>
 struct copy_using_evaluator_innervec_InnerUnrolling
 {
  typedef typename Kernel::PacketType PacketType;
  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel, Index outer)
  {
-    kernel.template assignPacketByOuterInner<Aligned, Aligned, PacketType>(outer, Index_);
+    kernel.template assignPacketByOuterInner<DstAlignment, SrcAlignment, PacketType>(outer, Index_);
    enum { NextIndex = Index_ + unpacket_traits<PacketType>::size };
-    copy_using_evaluator_innervec_InnerUnrolling<Kernel, NextIndex, Stop>::run(kernel, outer);
+    copy_using_evaluator_innervec_InnerUnrolling<Kernel, NextIndex, Stop, SrcAlignment, DstAlignment>::run(kernel, outer);
  }
 };

-template<typename Kernel, int Stop>
-struct copy_using_evaluator_innervec_InnerUnrolling<Kernel, Stop, Stop>
+template<typename Kernel, int Stop, int SrcAlignment, int DstAlignment>
+struct copy_using_evaluator_innervec_InnerUnrolling<Kernel, Stop, Stop, SrcAlignment, DstAlignment>
 {
  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &, Index) { }
 };
@@ -288,7 +323,7 @@ struct dense_assignment_loop;
 template<typename Kernel>
 struct dense_assignment_loop<Kernel, DefaultTraversal, NoUnrolling>
 {
-  EIGEN_DEVICE_FUNC static void run(Kernel &kernel)
+  EIGEN_DEVICE_FUNC static void EIGEN_STRONG_INLINE run(Kernel &kernel)
  {
    for(Index outer = 0; outer < kernel.outerSize(); ++outer) {
      for(Index inner = 0; inner < kernel.innerSize(); ++inner) {
@@ -311,7 +346,6 @@ struct dense_assignment_loop<Kernel, DefaultTraversal, CompleteUnrolling>
 template<typename Kernel>
 struct dense_assignment_loop<Kernel, DefaultTraversal, InnerUnrolling>
 {
-  typedef typename Kernel::StorageIndex StorageIndex;
  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
  {
    typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
@@ -370,14 +404,14 @@ struct dense_assignment_loop<Kernel, LinearVectorizedTraversal, NoUnrolling>
    typedef typename Kernel::Scalar Scalar;
    typedef typename Kernel::PacketType PacketType;
    enum {
-      requestedAlignment = Kernel::AssignmentTraits::RequiredAlignment,
+      requestedAlignment = Kernel::AssignmentTraits::LinearRequiredAlignment,
      packetSize = unpacket_traits<PacketType>::size,
      dstIsAligned = int(Kernel::AssignmentTraits::DstAlignment)>=int(requestedAlignment),
      dstAlignment = packet_traits<Scalar>::AlignedOnScalar ? int(requestedAlignment)
                                                            : int(Kernel::AssignmentTraits::DstAlignment),
      srcAlignment = Kernel::AssignmentTraits::JointAlignment
    };
-    const Index alignedStart = dstIsAligned ? 0 : internal::first_aligned<requestedAlignment>(&kernel.dstEvaluator().coeffRef(0), size);
+    const Index alignedStart = dstIsAligned ? 0 : internal::first_aligned<requestedAlignment>(kernel.dstDataPtr(), size);
    const Index alignedEnd = alignedStart + ((size-alignedStart)/packetSize)*packetSize;

    unaligned_dense_assignment_loop<dstIsAligned!=0>::run(kernel, 0, alignedStart);
@@ -392,13 +426,13 @@ struct dense_assignment_loop<Kernel, LinearVectorizedTraversal, NoUnrolling>
 template<typename Kernel>
 struct dense_assignment_loop<Kernel, LinearVectorizedTraversal, CompleteUnrolling>
 {
-  typedef typename Kernel::StorageIndex StorageIndex;
  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
  {
    typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
+    typedef typename Kernel::PacketType PacketType;
    
    enum { size = DstXprType::SizeAtCompileTime,
-           packetSize = packet_traits<typename Kernel::Scalar>::size,
+           packetSize =unpacket_traits<PacketType>::size,
           alignedSize = (size/packetSize)*packetSize };

    copy_using_evaluator_innervec_CompleteUnrolling<Kernel, 0, alignedSize>::run(kernel);
@@ -414,14 +448,18 @@ template<typename Kernel>
 struct dense_assignment_loop<Kernel, InnerVectorizedTraversal, NoUnrolling>
 {
  typedef typename Kernel::PacketType PacketType;
-  EIGEN_DEVICE_FUNC static inline void run(Kernel &kernel)
+  enum {
+    SrcAlignment = Kernel::AssignmentTraits::SrcAlignment,
+    DstAlignment = Kernel::AssignmentTraits::DstAlignment
+  };
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
  {
    const Index innerSize = kernel.innerSize();
    const Index outerSize = kernel.outerSize();
    const Index packetSize = unpacket_traits<PacketType>::size;
    for(Index outer = 0; outer < outerSize; ++outer)
      for(Index inner = 0; inner < innerSize; inner+=packetSize)
-        kernel.template assignPacketByOuterInner<Aligned, Aligned, PacketType>(outer, inner);
+        kernel.template assignPacketByOuterInner<DstAlignment, SrcAlignment, PacketType>(outer, inner);
  }
 };

@@ -438,13 +476,14 @@ struct dense_assignment_loop<Kernel, InnerVectorizedTraversal, CompleteUnrolling
 template<typename Kernel>
 struct dense_assignment_loop<Kernel, InnerVectorizedTraversal, InnerUnrolling>
 {
-  typedef typename Kernel::StorageIndex StorageIndex;
  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
  {
    typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
+    typedef typename Kernel::AssignmentTraits Traits;
    const Index outerSize = kernel.outerSize();
    for(Index outer = 0; outer < outerSize; ++outer)
-      copy_using_evaluator_innervec_InnerUnrolling<Kernel, 0, DstXprType::InnerSizeAtCompileTime>::run(kernel, outer);
+      copy_using_evaluator_innervec_InnerUnrolling<Kernel, 0, DstXprType::InnerSizeAtCompileTime,
+                                                   Traits::SrcAlignment, Traits::DstAlignment>::run(kernel, outer);
  }
 };

@@ -455,7 +494,7 @@ struct dense_assignment_loop<Kernel, InnerVectorizedTraversal, InnerUnrolling>
 template<typename Kernel>
 struct dense_assignment_loop<Kernel, LinearTraversal, NoUnrolling>
 {
-  EIGEN_DEVICE_FUNC static inline void run(Kernel &kernel)
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
  {
    const Index size = kernel.size();
    for(Index i = 0; i < size; ++i)
@@ -480,22 +519,22 @@ struct dense_assignment_loop<Kernel, LinearTraversal, CompleteUnrolling>
 template<typename Kernel>
 struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, NoUnrolling>
 {
-  EIGEN_DEVICE_FUNC static inline void run(Kernel &kernel)
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
  {
    typedef typename Kernel::Scalar Scalar;
    typedef typename Kernel::PacketType PacketType;
    enum {
      packetSize = unpacket_traits<PacketType>::size,
-      requestedAlignment = int(Kernel::AssignmentTraits::RequiredAlignment),
+      requestedAlignment = int(Kernel::AssignmentTraits::InnerRequiredAlignment),
      alignable = packet_traits<Scalar>::AlignedOnScalar || int(Kernel::AssignmentTraits::DstAlignment)>=sizeof(Scalar),
      dstIsAligned = int(Kernel::AssignmentTraits::DstAlignment)>=int(requestedAlignment),
      dstAlignment = alignable ? int(requestedAlignment)
                               : int(Kernel::AssignmentTraits::DstAlignment)
    };
-    const Scalar *dst_ptr = &kernel.dstEvaluator().coeffRef(0,0);
-    if((!bool(dstIsAligned)) && (size_t(dst_ptr) % sizeof(Scalar))>0)
+    const Scalar *dst_ptr = kernel.dstDataPtr();
+    if((!bool(dstIsAligned)) && (UIntPtr(dst_ptr) % sizeof(Scalar))>0)
    {
-      // the pointer is not aligend-on scalar, so alignment is not possible
+      // the pointer is not aligned-on scalar, so alignment is not possible
      return dense_assignment_loop<Kernel,DefaultTraversal,NoUnrolling>::run(kernel);
    }
    const Index packetAlignedMask = packetSize - 1;
@@ -519,11 +558,34 @@ struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, NoUnrolling>
      for(Index inner = alignedEnd; inner<innerSize ; ++inner)
        kernel.assignCoeffByOuterInner(outer, inner);

-      alignedStart = std::min<Index>((alignedStart+alignedStep)%packetSize, innerSize);
+      alignedStart = numext::mini((alignedStart+alignedStep)%packetSize, innerSize);
    }
  }
 };

+#if EIGEN_UNALIGNED_VECTORIZE
+template<typename Kernel>
+struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, InnerUnrolling> // slice-vectorized traversal with the inner loop unrolled at compile time
+{
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel) // assigns every outer slice of the kernel: packets first, then the leftover tail
+  {
+    typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
+    typedef typename Kernel::PacketType PacketType;
+
+    enum { size = DstXprType::InnerSizeAtCompileTime, // compile-time inner dimension of the destination
+           packetSize =unpacket_traits<PacketType>::size, // packet width; also the inner-index step of the packet unroller
+           vectorizableSize = (size/packetSize)*packetSize }; // size rounded down to a multiple of packetSize
+
+    for(Index outer = 0; outer < kernel.outerSize(); ++outer) // runtime loop over outer slices; only the inner dimension is unrolled
+    {
+      copy_using_evaluator_innervec_InnerUnrolling<Kernel, 0, vectorizableSize, 0, 0>::run(kernel, outer); // packet assignments for inner indices [0, vectorizableSize); the two trailing 0s are SrcAlignment/DstAlignment (presumably Unaligned -- TODO confirm)
+      copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, vectorizableSize, size>::run(kernel, outer); // remaining indices [vectorizableSize, size); presumably coefficient-wise -- helper not visible here, verify
+    }
+  }
+};
+#endif
+
+
 /***************************************************************************
 * Part 4 : Generic dense assignment kernel
 ***************************************************************************/
@@ -545,12 +607,12 @@ public:
  typedef DstEvaluatorTypeT DstEvaluatorType;
  typedef SrcEvaluatorTypeT SrcEvaluatorType;
  typedef typename DstEvaluatorType::Scalar Scalar;
-  typedef typename DstEvaluatorType::StorageIndex StorageIndex;
  typedef copy_using_evaluator_traits<DstEvaluatorTypeT, SrcEvaluatorTypeT, Functor> AssignmentTraits;
  typedef typename AssignmentTraits::PacketType PacketType;
  
  
-  EIGEN_DEVICE_FUNC generic_dense_assignment_kernel(DstEvaluatorType &dst, const SrcEvaluatorType &src, const Functor &func, DstXprType& dstExpr)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  generic_dense_assignment_kernel(DstEvaluatorType &dst, const SrcEvaluatorType &src, const Functor &func, DstXprType& dstExpr)
    : m_dst(dst), m_src(src), m_functor(func), m_dstExpr(dstExpr)
  {
    #ifdef EIGEN_DEBUG_ASSIGN
@@ -565,26 +627,23 @@ public:
  EIGEN_DEVICE_FUNC Index cols() const        { return m_dstExpr.cols(); }
  EIGEN_DEVICE_FUNC Index outerStride() const { return m_dstExpr.outerStride(); }
  
-  // TODO get rid of this one:
-  EIGEN_DEVICE_FUNC DstXprType& dstExpression() const { return m_dstExpr; }
-  
  EIGEN_DEVICE_FUNC DstEvaluatorType& dstEvaluator() { return m_dst; }
  EIGEN_DEVICE_FUNC const SrcEvaluatorType& srcEvaluator() const { return m_src; }
  
  /// Assign src(row,col) to dst(row,col) through the assignment functor.
-  EIGEN_DEVICE_FUNC void assignCoeff(Index row, Index col)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Index row, Index col)
  {
    m_functor.assignCoeff(m_dst.coeffRef(row,col), m_src.coeff(row,col));
  }
  
  /// \sa assignCoeff(Index,Index)
-  EIGEN_DEVICE_FUNC void assignCoeff(Index index)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Index index)
  {
    m_functor.assignCoeff(m_dst.coeffRef(index), m_src.coeff(index));
  }
  
  /// \sa assignCoeff(Index,Index)
-  EIGEN_DEVICE_FUNC void assignCoeffByOuterInner(Index outer, Index inner)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeffByOuterInner(Index outer, Index inner)
  {
    Index row = rowIndexByOuterInner(outer, inner); 
    Index col = colIndexByOuterInner(outer, inner); 
@@ -593,26 +652,26 @@ public:
  
  
  template<int StoreMode, int LoadMode, typename PacketType>
-  EIGEN_DEVICE_FUNC void assignPacket(Index row, Index col)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPacket(Index row, Index col)
  {
    m_functor.template assignPacket<StoreMode>(&m_dst.coeffRef(row,col), m_src.template packet<LoadMode,PacketType>(row,col));
  }
  
  template<int StoreMode, int LoadMode, typename PacketType>
-  EIGEN_DEVICE_FUNC void assignPacket(Index index)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPacket(Index index)
  {
    m_functor.template assignPacket<StoreMode>(&m_dst.coeffRef(index), m_src.template packet<LoadMode,PacketType>(index));
  }
  
  template<int StoreMode, int LoadMode, typename PacketType>
-  EIGEN_DEVICE_FUNC void assignPacketByOuterInner(Index outer, Index inner)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPacketByOuterInner(Index outer, Index inner)
  {
    Index row = rowIndexByOuterInner(outer, inner); 
    Index col = colIndexByOuterInner(outer, inner);
    assignPacket<StoreMode,LoadMode,PacketType>(row, col);
  }
  
-  EIGEN_DEVICE_FUNC static Index rowIndexByOuterInner(Index outer, Index inner)
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Index rowIndexByOuterInner(Index outer, Index inner)
  {
    typedef typename DstEvaluatorType::ExpressionTraits Traits;
    return int(Traits::RowsAtCompileTime) == 1 ? 0
@@ -621,7 +680,7 @@ public:
      : inner;
  }

-  EIGEN_DEVICE_FUNC static Index colIndexByOuterInner(Index outer, Index inner)
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Index colIndexByOuterInner(Index outer, Index inner)
  {
    typedef typename DstEvaluatorType::ExpressionTraits Traits;
    return int(Traits::ColsAtCompileTime) == 1 ? 0
@@ -629,6 +688,11 @@ public:
      : int(DstEvaluatorType::Flags)&RowMajorBit ? inner
      : outer;
  }
+
+  EIGEN_DEVICE_FUNC const Scalar* dstDataPtr() const
+  {
+    return m_dstExpr.data();
+  }
  
 protected:
  DstEvaluatorType& m_dst;
@@ -638,31 +702,75 @@ protected:
  DstXprType& m_dstExpr;
 };

+// Special kernel used when computing small products whose operands have dynamic dimensions.  It ensures that the
+// PacketSize used is no larger than 4, thereby increasing the chance that vectorized instructions will be used
+// when computing the product.
+
+template<typename DstEvaluatorTypeT, typename SrcEvaluatorTypeT, typename Functor>
+class restricted_packet_dense_assignment_kernel : public generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT, Functor, BuiltIn>
+{
+protected:
+  typedef generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT, Functor, BuiltIn> Base;
+ public:
+    typedef typename Base::Scalar Scalar;
+    typedef typename Base::DstXprType DstXprType;
+    typedef copy_using_evaluator_traits<DstEvaluatorTypeT, SrcEvaluatorTypeT, Functor, 4> AssignmentTraits;
+    typedef typename AssignmentTraits::PacketType PacketType;
+    
+    EIGEN_DEVICE_FUNC restricted_packet_dense_assignment_kernel(DstEvaluatorTypeT &dst, const SrcEvaluatorTypeT &src, const Functor &func, DstXprType& dstExpr)
+    : Base(dst, src, func, dstExpr)
+  {
+  }
+ };
+ 
 /***************************************************************************
 * Part 5 : Entry point for dense rectangular assignment
 ***************************************************************************/

-template<typename DstXprType, typename SrcXprType, typename Functor>
-EIGEN_DEVICE_FUNC void call_dense_assignment_loop(const DstXprType& dst, const SrcXprType& src, const Functor &func)
+template<typename DstXprType,typename SrcXprType, typename Functor>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+void resize_if_allowed(DstXprType &dst, const SrcXprType& src, const Functor &/*func*/)
 {
+  EIGEN_ONLY_USED_FOR_DEBUG(dst);
+  EIGEN_ONLY_USED_FOR_DEBUG(src);
  eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
-  
+}
+
+template<typename DstXprType,typename SrcXprType, typename T1, typename T2>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+void resize_if_allowed(DstXprType &dst, const SrcXprType& src, const internal::assign_op<T1,T2> &/*func*/)
+{
+  Index dstRows = src.rows();
+  Index dstCols = src.cols();
+  if(((dst.rows()!=dstRows) || (dst.cols()!=dstCols)))
+    dst.resize(dstRows, dstCols);
+  eigen_assert(dst.rows() == dstRows && dst.cols() == dstCols);
+}
+
+template<typename DstXprType, typename SrcXprType, typename Functor>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(DstXprType& dst, const SrcXprType& src, const Functor &func)
+{
  typedef evaluator<DstXprType> DstEvaluatorType;
  typedef evaluator<SrcXprType> SrcEvaluatorType;

-  DstEvaluatorType dstEvaluator(dst);
  SrcEvaluatorType srcEvaluator(src);
+
+  // NOTE To properly handle A = (A*A.transpose())/s with A rectangular,
+  // we need to resize the destination after the source evaluator has been created.
+  resize_if_allowed(dst, src, func);
+
+  DstEvaluatorType dstEvaluator(dst);
    
  typedef generic_dense_assignment_kernel<DstEvaluatorType,SrcEvaluatorType,Functor> Kernel;
  Kernel kernel(dstEvaluator, srcEvaluator, func, dst.const_cast_derived());
-  
+
  dense_assignment_loop<Kernel>::run(kernel);
 }

 template<typename DstXprType, typename SrcXprType>
-EIGEN_DEVICE_FUNC void call_dense_assignment_loop(const DstXprType& dst, const SrcXprType& src)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(DstXprType& dst, const SrcXprType& src)
 {
-  call_dense_assignment_loop(dst, src, internal::assign_op<typename DstXprType::Scalar>());
+  call_dense_assignment_loop(dst, src, internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar>());
 }

 /***************************************************************************
@@ -674,7 +782,7 @@ EIGEN_DEVICE_FUNC void call_dense_assignment_loop(const DstXprType& dst, const S
 // AssignmentKind must define a Kind typedef.
 template<typename DstShape, typename SrcShape> struct AssignmentKind;

-// Assignement kind defined in this file:
+// Assignment kind defined in this file:
 struct Dense2Dense {};
 struct EigenBase2EigenBase {};

@@ -684,71 +792,64 @@ template<> struct AssignmentKind<DenseShape,DenseShape> { typedef Dense2Dense Ki
 // This is the main assignment class
 template< typename DstXprType, typename SrcXprType, typename Functor,
          typename Kind = typename AssignmentKind< typename evaluator_traits<DstXprType>::Shape , typename evaluator_traits<SrcXprType>::Shape >::Kind,
-          typename Scalar = typename DstXprType::Scalar>
+          typename EnableIf = void>
 struct Assignment;


-// The only purpose of this call_assignment() function is to deal with noalias() / AssumeAliasing and automatic transposition.
-// Indeed, I (Gael) think that this concept of AssumeAliasing was a mistake, and it makes thing quite complicated.
-// So this intermediate function removes everything related to AssumeAliasing such that Assignment
+// The only purpose of this call_assignment() function is to deal with noalias() / "assume-aliasing" and automatic transposition.
+// Indeed, I (Gael) think that this concept of "assume-aliasing" was a mistake, and it makes things quite complicated.
+// So this intermediate function removes everything related to "assume-aliasing" such that Assignment
 // does not has to bother about these annoying details.

 template<typename Dst, typename Src>
-EIGEN_DEVICE_FUNC void call_assignment(Dst& dst, const Src& src)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+void call_assignment(Dst& dst, const Src& src)
 {
-  call_assignment(dst, src, internal::assign_op<typename Dst::Scalar>());
+  call_assignment(dst, src, internal::assign_op<typename Dst::Scalar,typename Src::Scalar>());
 }
 template<typename Dst, typename Src>
-EIGEN_DEVICE_FUNC void call_assignment(const Dst& dst, const Src& src)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+void call_assignment(const Dst& dst, const Src& src)
 {
-  call_assignment(dst, src, internal::assign_op<typename Dst::Scalar>());
+  call_assignment(dst, src, internal::assign_op<typename Dst::Scalar,typename Src::Scalar>());
 }
                     
-// Deal with AssumeAliasing
+// Deal with "assume-aliasing"
 template<typename Dst, typename Src, typename Func>
-EIGEN_DEVICE_FUNC void call_assignment(Dst& dst, const Src& src, const Func& func, typename enable_if<evaluator_traits<Src>::AssumeAliasing==1, void*>::type = 0)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+void call_assignment(Dst& dst, const Src& src, const Func& func, typename enable_if< evaluator_assume_aliasing<Src>::value, void*>::type = 0)
 {
  typename plain_matrix_type<Src>::type tmp(src);
  call_assignment_no_alias(dst, tmp, func);
 }

 template<typename Dst, typename Src, typename Func>
-EIGEN_DEVICE_FUNC void call_assignment(Dst& dst, const Src& src, const Func& func, typename enable_if<evaluator_traits<Src>::AssumeAliasing==0, void*>::type = 0)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+void call_assignment(Dst& dst, const Src& src, const Func& func, typename enable_if<!evaluator_assume_aliasing<Src>::value, void*>::type = 0)
 {
  call_assignment_no_alias(dst, src, func);
 }

-// by-pass AssumeAliasing
-// FIXME the const version should probably not be needed
+// by-pass "assume-aliasing"
 // When there is no aliasing, we require that 'dst' has been properly resized
 template<typename Dst, template <typename> class StorageBase, typename Src, typename Func>
-EIGEN_DEVICE_FUNC void call_assignment(const NoAlias<Dst,StorageBase>& dst, const Src& src, const Func& func)
-{
-  call_assignment_no_alias(dst.expression(), src, func);
-}
-template<typename Dst, template <typename> class StorageBase, typename Src, typename Func>
-EIGEN_DEVICE_FUNC void call_assignment(NoAlias<Dst,StorageBase>& dst, const Src& src, const Func& func)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+void call_assignment(NoAlias<Dst,StorageBase>& dst, const Src& src, const Func& func)
 {
  call_assignment_no_alias(dst.expression(), src, func);
 }


 template<typename Dst, typename Src, typename Func>
-EIGEN_DEVICE_FUNC void call_assignment_no_alias(Dst& dst, const Src& src, const Func& func)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+void call_assignment_no_alias(Dst& dst, const Src& src, const Func& func)
 {
  enum {
-    NeedToTranspose = (  (int(Dst::RowsAtCompileTime) == 1 && int(Src::ColsAtCompileTime) == 1)
-                        |   // FIXME | instead of || to please GCC 4.4.0 stupid warning "suggest parentheses around &&".
-                                // revert to || as soon as not needed anymore.
-                         (int(Dst::ColsAtCompileTime) == 1 && int(Src::RowsAtCompileTime) == 1))
-                     && int(Dst::SizeAtCompileTime) != 1
+    NeedToTranspose = (    (int(Dst::RowsAtCompileTime) == 1 && int(Src::ColsAtCompileTime) == 1)
+                        || (int(Dst::ColsAtCompileTime) == 1 && int(Src::RowsAtCompileTime) == 1)
+                      ) && int(Dst::SizeAtCompileTime) != 1
  };

-  Index dstRows = NeedToTranspose ? src.cols() : src.rows();
-  Index dstCols = NeedToTranspose ? src.rows() : src.cols();
-  if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
-    dst.resize(dstRows, dstCols);
-  
  typedef typename internal::conditional<NeedToTranspose, Transpose<Dst>, Dst>::type ActualDstTypeCleaned;
  typedef typename internal::conditional<NeedToTranspose, Transpose<Dst>, Dst&>::type ActualDstType;
  ActualDstType actualDst(dst);
@@ -756,51 +857,68 @@ EIGEN_DEVICE_FUNC void call_assignment_no_alias(Dst& dst, const Src& src, const
  // TODO check whether this is the right place to perform these checks:
  EIGEN_STATIC_ASSERT_LVALUE(Dst)
  EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(ActualDstTypeCleaned,Src)
-
-  // TODO this line is commented to allow matrix = permutation
-  // Actually, the "Scalar" type for a permutation matrix does not really make sense,
-  // perhaps it could be void, and EIGEN_CHECK_BINARY_COMPATIBILIY could allow micing void with anything...?
-//   EIGEN_CHECK_BINARY_COMPATIBILIY(Func,typename ActualDstTypeCleaned::Scalar,typename Src::Scalar);
+  EIGEN_CHECK_BINARY_COMPATIBILIY(Func,typename ActualDstTypeCleaned::Scalar,typename Src::Scalar);
  
  Assignment<ActualDstTypeCleaned,Src,Func>::run(actualDst, src, func);
 }
-template<typename Dst, typename Src>
-EIGEN_DEVICE_FUNC void call_assignment_no_alias(Dst& dst, const Src& src)
+
+template<typename Dst, typename Src, typename Func>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+void call_restricted_packet_assignment_no_alias(Dst& dst, const Src& src, const Func& func)
 {
-  call_assignment_no_alias(dst, src, internal::assign_op<typename Dst::Scalar>());
+    typedef evaluator<Dst> DstEvaluatorType;
+    typedef evaluator<Src> SrcEvaluatorType;
+    typedef restricted_packet_dense_assignment_kernel<DstEvaluatorType,SrcEvaluatorType,Func> Kernel;
+
+    EIGEN_STATIC_ASSERT_LVALUE(Dst)
+    EIGEN_CHECK_BINARY_COMPATIBILIY(Func,typename Dst::Scalar,typename Src::Scalar);
+
+    SrcEvaluatorType srcEvaluator(src);
+    resize_if_allowed(dst, src, func);
+    
+    DstEvaluatorType dstEvaluator(dst);
+    Kernel kernel(dstEvaluator, srcEvaluator, func, dst.const_cast_derived());
+
+    dense_assignment_loop<Kernel>::run(kernel);
+}
+
+template<typename Dst, typename Src>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+void call_assignment_no_alias(Dst& dst, const Src& src)
+{
+  call_assignment_no_alias(dst, src, internal::assign_op<typename Dst::Scalar,typename Src::Scalar>());
 }

 template<typename Dst, typename Src, typename Func>
-EIGEN_DEVICE_FUNC void call_assignment_no_alias_no_transpose(Dst& dst, const Src& src, const Func& func)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+void call_assignment_no_alias_no_transpose(Dst& dst, const Src& src, const Func& func)
 {
-  Index dstRows = src.rows();
-  Index dstCols = src.cols();
-  if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
-    dst.resize(dstRows, dstCols);
-  
  // TODO check whether this is the right place to perform these checks:
  EIGEN_STATIC_ASSERT_LVALUE(Dst)
  EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Dst,Src)
-  
+  EIGEN_CHECK_BINARY_COMPATIBILIY(Func,typename Dst::Scalar,typename Src::Scalar);
+
  Assignment<Dst,Src,Func>::run(dst, src, func);
 }
 template<typename Dst, typename Src>
-EIGEN_DEVICE_FUNC void call_assignment_no_alias_no_transpose(Dst& dst, const Src& src)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+void call_assignment_no_alias_no_transpose(Dst& dst, const Src& src)
 {
-  call_assignment_no_alias_no_transpose(dst, src, internal::assign_op<typename Dst::Scalar>());
+  call_assignment_no_alias_no_transpose(dst, src, internal::assign_op<typename Dst::Scalar,typename Src::Scalar>());
 }

 // forward declaration
 template<typename Dst, typename Src> void check_for_aliasing(const Dst &dst, const Src &src);

 // Generic Dense to Dense assignment
-template< typename DstXprType, typename SrcXprType, typename Functor, typename Scalar>
-struct Assignment<DstXprType, SrcXprType, Functor, Dense2Dense, Scalar>
+// Note that the last template argument "Weak" is needed to make it possible to perform
+// both partial specialization+SFINAE without ambiguous specialization
+template< typename DstXprType, typename SrcXprType, typename Functor, typename Weak>
+struct Assignment<DstXprType, SrcXprType, Functor, Dense2Dense, Weak>
 {
-  EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const Functor &func)
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const Functor &func)
  {
-    eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
-    
 #ifndef EIGEN_NO_DEBUG
    internal::check_for_aliasing(dst, src);
 #endif
@@ -811,14 +929,50 @@ struct Assignment<DstXprType, SrcXprType, Functor, Dense2Dense, Scalar>

 // Generic assignment through evalTo.
 // TODO: not sure we have to keep that one, but it helps porting current code to new evaluator mechanism.
-template< typename DstXprType, typename SrcXprType, typename Functor, typename Scalar>
-struct Assignment<DstXprType, SrcXprType, Functor, EigenBase2EigenBase, Scalar>
+// Note that the last template argument "Weak" is needed to make it possible to perform
+// both partial specialization+SFINAE without ambiguous specialization
+template< typename DstXprType, typename SrcXprType, typename Functor, typename Weak>
+struct Assignment<DstXprType, SrcXprType, Functor, EigenBase2EigenBase, Weak>
 {
-  EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar> &/*func*/)
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
  {
+    Index dstRows = src.rows();
+    Index dstCols = src.cols();
+    if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
+      dst.resize(dstRows, dstCols);
+
    eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
    src.evalTo(dst);
  }
+
+  // NOTE The following two functions are templated to avoid their instantiation if not needed
+  //      This is needed because some expressions support evalTo only and/or have 'void' as scalar type.
+  template<typename SrcScalarType>
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<typename DstXprType::Scalar,SrcScalarType> &/*func*/)
+  {
+    Index dstRows = src.rows();
+    Index dstCols = src.cols();
+    if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
+      dst.resize(dstRows, dstCols);
+
+    eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
+    src.addTo(dst);
+  }
+
+  template<typename SrcScalarType>
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<typename DstXprType::Scalar,SrcScalarType> &/*func*/)
+  {
+    Index dstRows = src.rows();
+    Index dstCols = src.cols();
+    if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
+      dst.resize(dstRows, dstCols);
+
+    eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
+    src.subTo(dst);
+  }
 };

 } // namespace internal
--- a/Eigen/src/Core/Assign_MKL.h
+++ b/Eigen/src/Core/Assign_MKL.h
@@ -68,27 +68,28 @@ class vml_assign_traits

 #define EIGEN_PP_EXPAND(ARG) ARG
 #if !defined (EIGEN_FAST_MATH) || (EIGEN_FAST_MATH != 1)
-#define EIGEN_VMLMODE_EXPAND_LA , VML_HA
+#define EIGEN_VMLMODE_EXPAND_xLA , VML_HA
 #else
-#define EIGEN_VMLMODE_EXPAND_LA , VML_LA
+#define EIGEN_VMLMODE_EXPAND_xLA , VML_LA
 #endif

-#define EIGEN_VMLMODE_EXPAND__ 
+#define EIGEN_VMLMODE_EXPAND_x_

-#define EIGEN_VMLMODE_PREFIX_LA vm
-#define EIGEN_VMLMODE_PREFIX__  v
-#define EIGEN_VMLMODE_PREFIX(VMLMODE) EIGEN_CAT(EIGEN_VMLMODE_PREFIX_,VMLMODE)
+#define EIGEN_VMLMODE_PREFIX_xLA vm
+#define EIGEN_VMLMODE_PREFIX_x_  v
+#define EIGEN_VMLMODE_PREFIX(VMLMODE) EIGEN_CAT(EIGEN_VMLMODE_PREFIX_x,VMLMODE)

 #define EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, VMLOP, EIGENTYPE, VMLTYPE, VMLMODE)                                           \
  template< typename DstXprType, typename SrcXprNested>                                                                         \
-  struct Assignment<DstXprType, CwiseUnaryOp<scalar_##EIGENOP##_op<EIGENTYPE>, SrcXprNested>, assign_op<EIGENTYPE>,             \
-                   Dense2Dense, typename enable_if<vml_assign_traits<DstXprType,SrcXprNested>::EnableVml,EIGENTYPE>::type> {    \
+  struct Assignment<DstXprType, CwiseUnaryOp<scalar_##EIGENOP##_op<EIGENTYPE>, SrcXprNested>, assign_op<EIGENTYPE,EIGENTYPE>,   \
+                   Dense2Dense, typename enable_if<vml_assign_traits<DstXprType,SrcXprNested>::EnableVml>::type> {              \
    typedef CwiseUnaryOp<scalar_##EIGENOP##_op<EIGENTYPE>, SrcXprNested> SrcXprType;                                            \
-    static void run(DstXprType &dst, const SrcXprType &src, const assign_op<EIGENTYPE> &/*func*/) {                             \
+    static void run(DstXprType &dst, const SrcXprType &src, const assign_op<EIGENTYPE,EIGENTYPE> &func) {                       \
+      resize_if_allowed(dst, src, func);                                                                                        \
      eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());                                                       \
      if(vml_assign_traits<DstXprType,SrcXprNested>::Traversal==LinearTraversal) {                                              \
        VMLOP(dst.size(), (const VMLTYPE*)src.nestedExpression().data(),                                                        \
-              (VMLTYPE*)dst.data() EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_##VMLMODE) );                                           \
+              (VMLTYPE*)dst.data() EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_x##VMLMODE) );                                           \
      } else {                                                                                                                  \
        const Index outerSize = dst.outerSize();                                                                                \
        for(Index outer = 0; outer < outerSize; ++outer) {                                                                      \
@@ -96,7 +97,7 @@ class vml_assign_traits
                                                      &(src.nestedExpression().coeffRef(0, outer));                             \
          EIGENTYPE *dst_ptr = dst.IsRowMajor ? &(dst.coeffRef(outer,0)) : &(dst.coeffRef(0, outer));                           \
          VMLOP( dst.innerSize(), (const VMLTYPE*)src_ptr,                                                                      \
-                (VMLTYPE*)dst_ptr EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_##VMLMODE));                                             \
+                (VMLTYPE*)dst_ptr EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_x##VMLMODE));                                             \
        }                                                                                                                       \
      }                                                                                                                         \
    }                                                                                                                           \
@@ -138,25 +139,28 @@ EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(floor, Floor,  _)
 EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(ceil,  Ceil,   _)

 #define EIGEN_MKL_VML_DECLARE_POW_CALL(EIGENOP, VMLOP, EIGENTYPE, VMLTYPE, VMLMODE)                                           \
-  template< typename DstXprType, typename SrcXprNested>                                                                       \
-  struct Assignment<DstXprType, CwiseUnaryOp<scalar_##EIGENOP##_op<EIGENTYPE>, SrcXprNested>, assign_op<EIGENTYPE>,           \
-                   Dense2Dense, typename enable_if<vml_assign_traits<DstXprType,SrcXprNested>::EnableVml,EIGENTYPE>::type> {  \
-    typedef CwiseUnaryOp<scalar_##EIGENOP##_op<EIGENTYPE>, SrcXprNested> SrcXprType;                                          \
-    static void run(DstXprType &dst, const SrcXprType &src, const assign_op<EIGENTYPE> &/*func*/) {                           \
+  template< typename DstXprType, typename SrcXprNested, typename Plain>                                                       \
+  struct Assignment<DstXprType, CwiseBinaryOp<scalar_##EIGENOP##_op<EIGENTYPE,EIGENTYPE>, SrcXprNested,                       \
+                    const CwiseNullaryOp<internal::scalar_constant_op<EIGENTYPE>,Plain> >, assign_op<EIGENTYPE,EIGENTYPE>,    \
+                   Dense2Dense, typename enable_if<vml_assign_traits<DstXprType,SrcXprNested>::EnableVml>::type> {            \
+    typedef CwiseBinaryOp<scalar_##EIGENOP##_op<EIGENTYPE,EIGENTYPE>, SrcXprNested,                                           \
+                    const CwiseNullaryOp<internal::scalar_constant_op<EIGENTYPE>,Plain> > SrcXprType;                         \
+    static void run(DstXprType &dst, const SrcXprType &src, const assign_op<EIGENTYPE,EIGENTYPE> &func) {                     \
+      resize_if_allowed(dst, src, func);                                                                                      \
      eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());                                                     \
-      VMLTYPE exponent = reinterpret_cast<const VMLTYPE&>(src.functor().m_exponent);                                          \
+      VMLTYPE exponent = reinterpret_cast<const VMLTYPE&>(src.rhs().functor().m_other);                                       \
      if(vml_assign_traits<DstXprType,SrcXprNested>::Traversal==LinearTraversal)                                              \
      {                                                                                                                       \
-        VMLOP( dst.size(), (const VMLTYPE*)src.nestedExpression().data(), exponent,                                           \
-              (VMLTYPE*)dst.data() EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_##VMLMODE) );                                         \
+        VMLOP( dst.size(), (const VMLTYPE*)src.lhs().data(), exponent,                                                        \
+              (VMLTYPE*)dst.data() EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_x##VMLMODE) );                                         \
      } else {                                                                                                                \
        const Index outerSize = dst.outerSize();                                                                              \
        for(Index outer = 0; outer < outerSize; ++outer) {                                                                    \
-          const EIGENTYPE *src_ptr = src.IsRowMajor ? &(src.nestedExpression().coeffRef(outer,0)) :                           \
-                                                      &(src.nestedExpression().coeffRef(0, outer));                           \
+          const EIGENTYPE *src_ptr = src.IsRowMajor ? &(src.lhs().coeffRef(outer,0)) :                                        \
+                                                      &(src.lhs().coeffRef(0, outer));                                        \
          EIGENTYPE *dst_ptr = dst.IsRowMajor ? &(dst.coeffRef(outer,0)) : &(dst.coeffRef(0, outer));                         \
          VMLOP( dst.innerSize(), (const VMLTYPE*)src_ptr, exponent,                                                          \
-                 (VMLTYPE*)dst_ptr EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_##VMLMODE));                                          \
+                 (VMLTYPE*)dst_ptr EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_x##VMLMODE));                                          \
        }                                                                                                                     \
      }                                                                                                                       \
    }                                                                                                                         \
--- a/Eigen/src/Core/BandMatrix.h
+++ b/Eigen/src/Core/BandMatrix.h
@@ -161,15 +161,15 @@ class BandMatrixBase : public EigenBase<Derived>
  *
  * \brief Represents a rectangular matrix with a banded storage
  *
-  * \param _Scalar Numeric type, i.e. float, double, int
-  * \param Rows Number of rows, or \b Dynamic
-  * \param Cols Number of columns, or \b Dynamic
-  * \param Supers Number of super diagonal
-  * \param Subs Number of sub diagonal
-  * \param _Options A combination of either \b #RowMajor or \b #ColMajor, and of \b #SelfAdjoint
-  *                 The former controls \ref TopicStorageOrders "storage order", and defaults to
-  *                 column-major. The latter controls whether the matrix represents a selfadjoint 
-  *                 matrix in which case either Supers of Subs have to be null.
+  * \tparam _Scalar Numeric type, e.g. float, double, int
+  * \tparam _Rows Number of rows, or \b Dynamic
+  * \tparam _Cols Number of columns, or \b Dynamic
+  * \tparam _Supers Number of super diagonals
+  * \tparam _Subs Number of sub diagonals
+  * \tparam _Options A combination of either \b #RowMajor or \b #ColMajor, and of \b #SelfAdjoint
+  *                  The former controls \ref TopicStorageOrders "storage order", and defaults to
+  *                  column-major. The latter controls whether the matrix represents a selfadjoint
+  *                  matrix in which case either Supers or Subs has to be null.
  *
  * \sa class TridiagonalMatrix
  */
@@ -302,9 +302,9 @@ class BandMatrixWrapper : public BandMatrixBase<BandMatrixWrapper<_CoefficientsT
  *
  * \brief Represents a tridiagonal matrix with a compact banded storage
  *
-  * \param _Scalar Numeric type, i.e. float, double, int
-  * \param Size Number of rows and cols, or \b Dynamic
-  * \param _Options Can be 0 or \b SelfAdjoint
+  * \tparam Scalar Numeric type, e.g. float, double, int
+  * \tparam Size Number of rows and cols, or \b Dynamic
+  * \tparam Options Can be 0 or \b SelfAdjoint
  *
  * \sa class BandMatrix
  */
--- a/Eigen/src/Core/Block.h
+++ b/Eigen/src/Core/Block.h
@@ -13,41 +13,6 @@

 namespace Eigen { 

-/** \class Block
-  * \ingroup Core_Module
-  *
-  * \brief Expression of a fixed-size or dynamic-size block
-  *
-  * \param XprType the type of the expression in which we are taking a block
-  * \param BlockRows the number of rows of the block we are taking at compile time (optional)
-  * \param BlockCols the number of columns of the block we are taking at compile time (optional)
-  * \param InnerPanel is true, if the block maps to a set of rows of a row major matrix or
-  *        to set of columns of a column major matrix (optional). The parameter allows to determine
-  *        at compile time whether aligned access is possible on the block expression.
-  *
-  * This class represents an expression of either a fixed-size or dynamic-size block. It is the return
-  * type of DenseBase::block(Index,Index,Index,Index) and DenseBase::block<int,int>(Index,Index) and
-  * most of the time this is the only way it is used.
-  *
-  * However, if you want to directly maniputate block expressions,
-  * for instance if you want to write a function returning such an expression, you
-  * will need to use this class.
-  *
-  * Here is an example illustrating the dynamic case:
-  * \include class_Block.cpp
-  * Output: \verbinclude class_Block.out
-  *
-  * \note Even though this expression has dynamic size, in the case where \a XprType
-  * has fixed size, this expression inherits a fixed maximal size which means that evaluating
-  * it does not cause a dynamic memory allocation.
-  *
-  * Here is an example illustrating the fixed-size case:
-  * \include class_FixedBlock.cpp
-  * Output: \verbinclude class_FixedBlock.out
-  *
-  * \sa DenseBase::block(Index,Index,Index,Index), DenseBase::block(Index,Index), class VectorBlock
-  */
-
 namespace internal {
 template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel>
 struct traits<Block<XprType, BlockRows, BlockCols, InnerPanel> > : traits<XprType>
@@ -101,6 +66,40 @@ template<typename XprType, int BlockRows=Dynamic, int BlockCols=Dynamic, bool In

 template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, typename StorageKind> class BlockImpl;

+/** \class Block
+  * \ingroup Core_Module
+  *
+  * \brief Expression of a fixed-size or dynamic-size block
+  *
+  * \tparam XprType the type of the expression in which we are taking a block
+  * \tparam BlockRows the number of rows of the block we are taking at compile time (optional)
+  * \tparam BlockCols the number of columns of the block we are taking at compile time (optional)
+  * \tparam InnerPanel is true, if the block maps to a set of rows of a row major matrix or
+  *         to a set of columns of a column major matrix (optional). The parameter allows one to determine
+  *         at compile time whether aligned access is possible on the block expression.
+  *
+  * This class represents an expression of either a fixed-size or dynamic-size block. It is the return
+  * type of DenseBase::block(Index,Index,Index,Index) and DenseBase::block<int,int>(Index,Index) and
+  * most of the time this is the only way it is used.
+  *
+  * However, if you want to directly manipulate block expressions,
+  * for instance if you want to write a function returning such an expression, you
+  * will need to use this class.
+  *
+  * Here is an example illustrating the dynamic case:
+  * \include class_Block.cpp
+  * Output: \verbinclude class_Block.out
+  *
+  * \note Even though this expression has dynamic size, in the case where \a XprType
+  * has fixed size, this expression inherits a fixed maximal size which means that evaluating
+  * it does not cause a dynamic memory allocation.
+  *
+  * Here is an example illustrating the fixed-size case:
+  * \include class_FixedBlock.cpp
+  * Output: \verbinclude class_FixedBlock.out
+  *
+  * \sa DenseBase::block(Index,Index,Index,Index), DenseBase::block(Index,Index), class VectorBlock
+  */
 template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel> class Block
  : public BlockImpl<XprType, BlockRows, BlockCols, InnerPanel, typename internal::traits<XprType>::StorageKind>
 {
@@ -115,8 +114,8 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel> class
  
    /** Column or Row constructor
      */
-    EIGEN_DEVICE_FUNC
-    inline Block(XprType& xpr, Index i) : Impl(xpr,i)
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Block(XprType& xpr, Index i) : Impl(xpr,i)
    {
      eigen_assert( (i>=0) && (
          ((BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) && i<xpr.rows())
@@ -125,19 +124,19 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel> class

    /** Fixed-size constructor
      */
-    EIGEN_DEVICE_FUNC
-    inline Block(XprType& xpr, Index startRow, Index startCol)
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Block(XprType& xpr, Index startRow, Index startCol)
      : Impl(xpr, startRow, startCol)
    {
      EIGEN_STATIC_ASSERT(RowsAtCompileTime!=Dynamic && ColsAtCompileTime!=Dynamic,THIS_METHOD_IS_ONLY_FOR_FIXED_SIZE)
-      eigen_assert(startRow >= 0 && BlockRows >= 1 && startRow + BlockRows <= xpr.rows()
-             && startCol >= 0 && BlockCols >= 1 && startCol + BlockCols <= xpr.cols());
+      eigen_assert(startRow >= 0 && BlockRows >= 0 && startRow + BlockRows <= xpr.rows()
+             && startCol >= 0 && BlockCols >= 0 && startCol + BlockCols <= xpr.cols());
    }

    /** Dynamic-size constructor
      */
-    EIGEN_DEVICE_FUNC
-    inline Block(XprType& xpr,
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Block(XprType& xpr,
          Index startRow, Index startCol,
          Index blockRows, Index blockCols)
      : Impl(xpr, startRow, startCol, blockRows, blockCols)
@@ -160,10 +159,10 @@ class BlockImpl<XprType, BlockRows, BlockCols, InnerPanel, Dense>
  public:
    typedef Impl Base;
    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(BlockImpl)
-    EIGEN_DEVICE_FUNC inline BlockImpl(XprType& xpr, Index i) : Impl(xpr,i) {}
-    EIGEN_DEVICE_FUNC inline BlockImpl(XprType& xpr, Index startRow, Index startCol) : Impl(xpr, startRow, startCol) {}
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockImpl(XprType& xpr, Index i) : Impl(xpr,i) {}
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockImpl(XprType& xpr, Index startRow, Index startCol) : Impl(xpr, startRow, startCol) {}
    EIGEN_DEVICE_FUNC
-    inline BlockImpl(XprType& xpr, Index startRow, Index startCol, Index blockRows, Index blockCols)
+    EIGEN_STRONG_INLINE BlockImpl(XprType& xpr, Index startRow, Index startCol, Index blockRows, Index blockCols)
      : Impl(xpr, startRow, startCol, blockRows, blockCols) {}
 };

@@ -174,6 +173,7 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, bool H
  : public internal::dense_xpr_base<Block<XprType, BlockRows, BlockCols, InnerPanel> >::type
 {
    typedef Block<XprType, BlockRows, BlockCols, InnerPanel> BlockType;
+    typedef typename internal::ref_selector<XprType>::non_const_type XprTypeNested;
  public:

    typedef typename internal::dense_xpr_base<BlockType>::type Base;
@@ -222,15 +222,13 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, bool H
    inline Scalar& coeffRef(Index rowId, Index colId)
    {
      EIGEN_STATIC_ASSERT_LVALUE(XprType)
-      return m_xpr.const_cast_derived()
-               .coeffRef(rowId + m_startRow.value(), colId + m_startCol.value());
+      return m_xpr.coeffRef(rowId + m_startRow.value(), colId + m_startCol.value());
    }

    EIGEN_DEVICE_FUNC
    inline const Scalar& coeffRef(Index rowId, Index colId) const
    {
-      return m_xpr.derived()
-               .coeffRef(rowId + m_startRow.value(), colId + m_startCol.value());
+      return m_xpr.derived().coeffRef(rowId + m_startRow.value(), colId + m_startCol.value());
    }

    EIGEN_DEVICE_FUNC
@@ -243,39 +241,34 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, bool H
    inline Scalar& coeffRef(Index index)
    {
      EIGEN_STATIC_ASSERT_LVALUE(XprType)
-      return m_xpr.const_cast_derived()
-             .coeffRef(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
-                       m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
+      return m_xpr.coeffRef(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
+                            m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
    }

    EIGEN_DEVICE_FUNC
    inline const Scalar& coeffRef(Index index) const
    {
-      return m_xpr.const_cast_derived()
-             .coeffRef(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
-                       m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
+      return m_xpr.coeffRef(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
+                            m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
    }

    EIGEN_DEVICE_FUNC
    inline const CoeffReturnType coeff(Index index) const
    {
-      return m_xpr
-             .coeff(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
-                    m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
+      return m_xpr.coeff(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
+                         m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
    }

    template<int LoadMode>
    inline PacketScalar packet(Index rowId, Index colId) const
    {
-      return m_xpr.template packet<Unaligned>
-              (rowId + m_startRow.value(), colId + m_startCol.value());
+      return m_xpr.template packet<Unaligned>(rowId + m_startRow.value(), colId + m_startCol.value());
    }

    template<int LoadMode>
    inline void writePacket(Index rowId, Index colId, const PacketScalar& val)
    {
-      m_xpr.const_cast_derived().template writePacket<Unaligned>
-              (rowId + m_startRow.value(), colId + m_startCol.value(), val);
+      m_xpr.template writePacket<Unaligned>(rowId + m_startRow.value(), colId + m_startCol.value(), val);
    }

    template<int LoadMode>
@@ -289,7 +282,7 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, bool H
    template<int LoadMode>
    inline void writePacket(Index index, const PacketScalar& val)
    {
-      m_xpr.const_cast_derived().template writePacket<Unaligned>
+      m_xpr.template writePacket<Unaligned>
         (m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
          m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0), val);
    }
@@ -301,19 +294,22 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, bool H
    EIGEN_DEVICE_FUNC inline Index outerStride() const;
    #endif

-    EIGEN_DEVICE_FUNC
-    const typename internal::remove_all<typename XprType::Nested>::type& nestedExpression() const
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const typename internal::remove_all<XprTypeNested>::type& nestedExpression() const
    { 
      return m_xpr; 
    }
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    XprType& nestedExpression() { return m_xpr; }
      
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    StorageIndex startRow() const
    { 
      return m_startRow.value(); 
    }
      
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    StorageIndex startCol() const
    { 
      return m_startCol.value(); 
@@ -321,9 +317,9 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, bool H

  protected:

-    const typename XprType::Nested m_xpr;
-    const internal::variable_if_dynamic<StorageIndex, XprType::RowsAtCompileTime == 1 ? 0 : Dynamic> m_startRow;
-    const internal::variable_if_dynamic<StorageIndex, XprType::ColsAtCompileTime == 1 ? 0 : Dynamic> m_startCol;
+    XprTypeNested m_xpr;
+    const internal::variable_if_dynamic<StorageIndex, (XprType::RowsAtCompileTime == 1 && BlockRows==1) ? 0 : Dynamic> m_startRow;
+    const internal::variable_if_dynamic<StorageIndex, (XprType::ColsAtCompileTime == 1 && BlockCols==1) ? 0 : Dynamic> m_startCol;
    const internal::variable_if_dynamic<StorageIndex, RowsAtCompileTime> m_blockRows;
    const internal::variable_if_dynamic<StorageIndex, ColsAtCompileTime> m_blockCols;
 };
@@ -334,6 +330,7 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>
  : public MapBase<Block<XprType, BlockRows, BlockCols, InnerPanel> >
 {
    typedef Block<XprType, BlockRows, BlockCols, InnerPanel> BlockType;
+    typedef typename internal::ref_selector<XprType>::non_const_type XprTypeNested;
    enum {
      XprTypeIsRowMajor = (int(traits<XprType>::Flags)&RowMajorBit) != 0
    };
@@ -345,48 +342,53 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>

    /** Column or Row constructor
      */
-    EIGEN_DEVICE_FUNC
-    inline BlockImpl_dense(XprType& xpr, Index i)
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    BlockImpl_dense(XprType& xpr, Index i)
      : Base(xpr.data() + i * (    ((BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) && (!XprTypeIsRowMajor)) 
                                || ((BlockRows==XprType::RowsAtCompileTime) && (BlockCols==1) && ( XprTypeIsRowMajor)) ? xpr.innerStride() : xpr.outerStride()),
             BlockRows==1 ? 1 : xpr.rows(),
             BlockCols==1 ? 1 : xpr.cols()),
-        m_xpr(xpr)
+        m_xpr(xpr),
+        m_startRow( (BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) ? i : 0),
+        m_startCol( (BlockRows==XprType::RowsAtCompileTime) && (BlockCols==1) ? i : 0)
    {
      init();
    }

    /** Fixed-size constructor
      */
-    EIGEN_DEVICE_FUNC
-    inline BlockImpl_dense(XprType& xpr, Index startRow, Index startCol)
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    BlockImpl_dense(XprType& xpr, Index startRow, Index startCol)
      : Base(xpr.data()+xpr.innerStride()*(XprTypeIsRowMajor?startCol:startRow) + xpr.outerStride()*(XprTypeIsRowMajor?startRow:startCol)),
-        m_xpr(xpr)
+        m_xpr(xpr), m_startRow(startRow), m_startCol(startCol)
    {
      init();
    }

    /** Dynamic-size constructor
      */
-    EIGEN_DEVICE_FUNC
-    inline BlockImpl_dense(XprType& xpr,
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    BlockImpl_dense(XprType& xpr,
          Index startRow, Index startCol,
          Index blockRows, Index blockCols)
      : Base(xpr.data()+xpr.innerStride()*(XprTypeIsRowMajor?startCol:startRow) + xpr.outerStride()*(XprTypeIsRowMajor?startRow:startCol), blockRows, blockCols),
-        m_xpr(xpr)
+        m_xpr(xpr), m_startRow(startRow), m_startCol(startCol)
    {
      init();
    }

-    EIGEN_DEVICE_FUNC
-    const typename internal::remove_all<typename XprType::Nested>::type& nestedExpression() const
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const typename internal::remove_all<XprTypeNested>::type& nestedExpression() const
    { 
      return m_xpr; 
    }
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    XprType& nestedExpression() { return m_xpr; }
      
    /** \sa MapBase::innerStride() */
-    EIGEN_DEVICE_FUNC
-    inline Index innerStride() const
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Index innerStride() const
    {
      return internal::traits<BlockType>::HasSameStorageOrderAsXprType
             ? m_xpr.innerStride()
@@ -394,12 +396,24 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>
    }

    /** \sa MapBase::outerStride() */
-    EIGEN_DEVICE_FUNC
-    inline Index outerStride() const
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Index outerStride() const
    {
      return m_outerStride;
    }

+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    StorageIndex startRow() const
+    {
+      return m_startRow.value();
+    }
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    StorageIndex startCol() const
+    {
+      return m_startCol.value();
+    }
+
  #ifndef __SUNPRO_CC
  // FIXME sunstudio is not friendly with the above friend...
  // META-FIXME there is no 'friend' keyword around here. Is this obsolete?
@@ -408,8 +422,8 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>

    #ifndef EIGEN_PARSED_BY_DOXYGEN
    /** \internal used by allowAligned() */
-    EIGEN_DEVICE_FUNC
-    inline BlockImpl_dense(XprType& xpr, const Scalar* data, Index blockRows, Index blockCols)
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    BlockImpl_dense(XprType& xpr, const Scalar* data, Index blockRows, Index blockCols)
      : Base(data, blockRows, blockCols), m_xpr(xpr)
    {
      init();
@@ -417,7 +431,7 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>
    #endif

  protected:
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    void init()
    {
      m_outerStride = internal::traits<BlockType>::HasSameStorageOrderAsXprType
@@ -425,7 +439,9 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>
                    : m_xpr.innerStride();
    }

-    typename XprType::Nested m_xpr;
+    XprTypeNested m_xpr;
+    const internal::variable_if_dynamic<StorageIndex, (XprType::RowsAtCompileTime == 1 && BlockRows==1) ? 0 : Dynamic> m_startRow;
+    const internal::variable_if_dynamic<StorageIndex, (XprType::ColsAtCompileTime == 1 && BlockCols==1) ? 0 : Dynamic> m_startCol;
    Index m_outerStride;
 };

--- a/Eigen/src/Core/BooleanRedux.h
+++ b/Eigen/src/Core/BooleanRedux.h
@@ -14,56 +14,54 @@ namespace Eigen {

 namespace internal {

-template<typename Derived, int UnrollCount>
+template<typename Derived, int UnrollCount, int Rows>
 struct all_unroller
 {
-  typedef typename Derived::ExpressionTraits Traits;
  enum {
-    col = (UnrollCount-1) / Traits::RowsAtCompileTime,
-    row = (UnrollCount-1) % Traits::RowsAtCompileTime
+    col = (UnrollCount-1) / Rows,
+    row = (UnrollCount-1) % Rows
  };

  static inline bool run(const Derived &mat)
  {
-    return all_unroller<Derived, UnrollCount-1>::run(mat) && mat.coeff(row, col);
+    return all_unroller<Derived, UnrollCount-1, Rows>::run(mat) && mat.coeff(row, col);
  }
 };

-template<typename Derived>
-struct all_unroller<Derived, 0>
+template<typename Derived, int Rows>
+struct all_unroller<Derived, 0, Rows>
 {
  static inline bool run(const Derived &/*mat*/) { return true; }
 };

-template<typename Derived>
-struct all_unroller<Derived, Dynamic>
+template<typename Derived, int Rows>
+struct all_unroller<Derived, Dynamic, Rows>
 {
  static inline bool run(const Derived &) { return false; }
 };

-template<typename Derived, int UnrollCount>
+template<typename Derived, int UnrollCount, int Rows>
 struct any_unroller
 {
-  typedef typename Derived::ExpressionTraits Traits;
  enum {
-    col = (UnrollCount-1) / Traits::RowsAtCompileTime,
-    row = (UnrollCount-1) % Traits::RowsAtCompileTime
+    col = (UnrollCount-1) / Rows,
+    row = (UnrollCount-1) % Rows
  };
  
  static inline bool run(const Derived &mat)
  {
-    return any_unroller<Derived, UnrollCount-1>::run(mat) || mat.coeff(row, col);
+    return any_unroller<Derived, UnrollCount-1, Rows>::run(mat) || mat.coeff(row, col);
  }
 };

-template<typename Derived>
-struct any_unroller<Derived, 0>
+template<typename Derived, int Rows>
+struct any_unroller<Derived, 0, Rows>
 {
  static inline bool run(const Derived & /*mat*/) { return false; }
 };

-template<typename Derived>
-struct any_unroller<Derived, Dynamic>
+template<typename Derived, int Rows>
+struct any_unroller<Derived, Dynamic, Rows>
 {
  static inline bool run(const Derived &) { return false; }
 };
@@ -78,18 +76,16 @@ struct any_unroller<Derived, Dynamic>
  * \sa any(), Cwise::operator<()
  */
 template<typename Derived>
-inline bool DenseBase<Derived>::all() const
+EIGEN_DEVICE_FUNC inline bool DenseBase<Derived>::all() const
 {
  typedef internal::evaluator<Derived> Evaluator;
  enum {
    unroll = SizeAtCompileTime != Dynamic
-          && Evaluator::CoeffReadCost != Dynamic
-          && NumTraits<Scalar>::AddCost != Dynamic
          && SizeAtCompileTime * (Evaluator::CoeffReadCost + NumTraits<Scalar>::AddCost) <= EIGEN_UNROLLING_LIMIT
  };
  Evaluator evaluator(derived());
  if(unroll)
-    return internal::all_unroller<Evaluator, unroll ? int(SizeAtCompileTime) : Dynamic>::run(evaluator);
+    return internal::all_unroller<Evaluator, unroll ? int(SizeAtCompileTime) : Dynamic, internal::traits<Derived>::RowsAtCompileTime>::run(evaluator);
  else
  {
    for(Index j = 0; j < cols(); ++j)
@@ -104,18 +100,16 @@ inline bool DenseBase<Derived>::all() const
  * \sa all()
  */
 template<typename Derived>
-inline bool DenseBase<Derived>::any() const
+EIGEN_DEVICE_FUNC inline bool DenseBase<Derived>::any() const
 {
  typedef internal::evaluator<Derived> Evaluator;
  enum {
    unroll = SizeAtCompileTime != Dynamic
-          && Evaluator::CoeffReadCost != Dynamic
-          && NumTraits<Scalar>::AddCost != Dynamic
          && SizeAtCompileTime * (Evaluator::CoeffReadCost + NumTraits<Scalar>::AddCost) <= EIGEN_UNROLLING_LIMIT
  };
  Evaluator evaluator(derived());
  if(unroll)
-    return internal::any_unroller<Evaluator, unroll ? int(SizeAtCompileTime) : Dynamic>::run(evaluator);
+    return internal::any_unroller<Evaluator, unroll ? int(SizeAtCompileTime) : Dynamic, internal::traits<Derived>::RowsAtCompileTime>::run(evaluator);
  else
  {
    for(Index j = 0; j < cols(); ++j)
@@ -130,7 +124,7 @@ inline bool DenseBase<Derived>::any() const
  * \sa all(), any()
  */
 template<typename Derived>
-inline Eigen::Index DenseBase<Derived>::count() const
+EIGEN_DEVICE_FUNC inline Eigen::Index DenseBase<Derived>::count() const
 {
  return derived().template cast<bool>().template cast<Index>().sum();
 }
@@ -142,7 +136,11 @@ inline Eigen::Index DenseBase<Derived>::count() const
 template<typename Derived>
 inline bool DenseBase<Derived>::hasNaN() const
 {
+#if EIGEN_COMP_MSVC || (defined __FAST_MATH__)
+  return derived().array().isNaN().any();
+#else
  return !((derived().array()==derived().array()).all());
+#endif
 }

 /** \returns true if \c *this contains only finite numbers, i.e., no NaN and no +/-INF values.
@@ -152,7 +150,11 @@ inline bool DenseBase<Derived>::hasNaN() const
 template<typename Derived>
 inline bool DenseBase<Derived>::allFinite() const
 {
+#if EIGEN_COMP_MSVC || (defined __FAST_MATH__)
+  return derived().array().isFinite().all();
+#else
  return !((derived()-derived()).hasNaN());
+#endif
 }
    
 } // end namespace Eigen
--- a/Eigen/src/Core/CMakeLists.txt
+++ b/Eigen/src/Core/CMakeLists.txt
@@ -1,11 +0,0 @@
-FILE(GLOB Eigen_Core_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_Core_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Core COMPONENT Devel
-  )
-
-ADD_SUBDIRECTORY(products)
-ADD_SUBDIRECTORY(util)
-ADD_SUBDIRECTORY(arch)
-ADD_SUBDIRECTORY(functors)
--- a/Eigen/src/Core/CommaInitializer.h
+++ b/Eigen/src/Core/CommaInitializer.h
@@ -22,7 +22,7 @@ namespace Eigen {
  * the return type of MatrixBase::operator<<, and most of the time this is the only
  * way it is used.
  *
-  * \sa \ref MatrixBaseCommaInitRef "MatrixBase::operator<<", CommaInitializer::finished()
+  * \sa \blank \ref MatrixBaseCommaInitRef "MatrixBase::operator<<", CommaInitializer::finished()
  */
 template<typename XprType>
 struct CommaInitializer
@@ -80,9 +80,7 @@ struct CommaInitializer
  EIGEN_DEVICE_FUNC
  CommaInitializer& operator,(const DenseBase<OtherDerived>& other)
  {
-    if(other.cols()==0 || other.rows()==0)
-      return *this;
-    if (m_col==m_xpr.cols())
+    if (m_col==m_xpr.cols() && (other.cols()!=0 || other.rows()!=m_currentBlockRows))
    {
      m_row+=m_currentBlockRows;
      m_col = 0;
@@ -90,15 +88,11 @@ struct CommaInitializer
      eigen_assert(m_row+m_currentBlockRows<=m_xpr.rows()
        && "Too many rows passed to comma initializer (operator<<)");
    }
-    eigen_assert(m_col<m_xpr.cols()
+    eigen_assert((m_col + other.cols() <= m_xpr.cols())
      && "Too many coefficients passed to comma initializer (operator<<)");
    eigen_assert(m_currentBlockRows==other.rows());
-    if (OtherDerived::SizeAtCompileTime != Dynamic)
-      m_xpr.template block<OtherDerived::RowsAtCompileTime != Dynamic ? OtherDerived::RowsAtCompileTime : 1,
-                              OtherDerived::ColsAtCompileTime != Dynamic ? OtherDerived::ColsAtCompileTime : 1>
-                    (m_row, m_col) = other;
-    else
-      m_xpr.block(m_row, m_col, other.rows(), other.cols()) = other;
+    m_xpr.template block<OtherDerived::RowsAtCompileTime, OtherDerived::ColsAtCompileTime>
+                    (m_row, m_col, other.rows(), other.cols()) = other;
    m_col += other.cols();
    return *this;
  }
@@ -106,12 +100,10 @@ struct CommaInitializer
  EIGEN_DEVICE_FUNC
  inline ~CommaInitializer()
 #if defined VERIFY_RAISES_ASSERT && (!defined EIGEN_NO_ASSERTION_CHECKING) && defined EIGEN_EXCEPTIONS
-  throw(Eigen::eigen_assert_exception)
+  EIGEN_EXCEPTION_SPEC(Eigen::eigen_assert_exception)
 #endif
  {
-    eigen_assert((m_row+m_currentBlockRows) == m_xpr.rows()
-         && m_col == m_xpr.cols()
-         && "Too few coefficients passed to comma initializer (operator<<)");
+      finished();
  }

  /** \returns the built matrix once all its coefficients have been set.
@@ -122,7 +114,12 @@ struct CommaInitializer
    * \endcode
    */
  EIGEN_DEVICE_FUNC
-  inline XprType& finished() { return m_xpr; }
+  inline XprType& finished() {
+      eigen_assert(((m_row+m_currentBlockRows) == m_xpr.rows() || m_xpr.cols() == 0)
+           && m_col == m_xpr.cols()
+           && "Too few coefficients passed to comma initializer (operator<<)");
+      return m_xpr;
+  }

  XprType& m_xpr;           // target expression
  Index m_row;              // current row id
@@ -144,7 +141,7 @@ struct CommaInitializer
  * \sa CommaInitializer::finished(), class CommaInitializer
  */
 template<typename Derived>
-inline CommaInitializer<Derived> DenseBase<Derived>::operator<< (const Scalar& s)
+EIGEN_DEVICE_FUNC inline CommaInitializer<Derived> DenseBase<Derived>::operator<< (const Scalar& s)
 {
  return CommaInitializer<Derived>(*static_cast<Derived*>(this), s);
 }
@@ -152,7 +149,7 @@ inline CommaInitializer<Derived> DenseBase<Derived>::operator<< (const Scalar& s
 /** \sa operator<<(const Scalar&) */
 template<typename Derived>
 template<typename OtherDerived>
-inline CommaInitializer<Derived>
+EIGEN_DEVICE_FUNC inline CommaInitializer<Derived>
 DenseBase<Derived>::operator<<(const DenseBase<OtherDerived>& other)
 {
  return CommaInitializer<Derived>(*static_cast<Derived *>(this), other);
--- a/Eigen/src/Core/ConditionEstimator.h
+++ b/Eigen/src/Core/ConditionEstimator.h
@@ -0,0 +1,175 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Rasmus Munk Larsen (rmlarsen@google.com)
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CONDITIONESTIMATOR_H
+#define EIGEN_CONDITIONESTIMATOR_H
+
+namespace Eigen {
+
+namespace internal {
+
+template <typename Vector, typename RealVector, bool IsComplex>
+struct rcond_compute_sign {
+  static inline Vector run(const Vector& v) {
+    const RealVector v_abs = v.cwiseAbs();
+    return (v_abs.array() == static_cast<typename Vector::RealScalar>(0))
+            .select(Vector::Ones(v.size()), v.cwiseQuotient(v_abs));
+  }
+};
+
+// Partial specialization to avoid elementwise division for real vectors.
+template <typename Vector>
+struct rcond_compute_sign<Vector, Vector, false> {
+  static inline Vector run(const Vector& v) {
+    return (v.array() < static_cast<typename Vector::RealScalar>(0))
+           .select(-Vector::Ones(v.size()), Vector::Ones(v.size()));
+  }
+};
+
+/**
+  * \returns an estimate of ||inv(matrix)||_1 given a decomposition of
+  * \a matrix that implements .solve() and .adjoint().solve() methods.
+  *
+  * This function implements Algorithms 4.1 and 5.1 from
+  *   http://www.maths.manchester.ac.uk/~higham/narep/narep135.pdf
+  * which also forms the basis for the condition number estimators in
+  * LAPACK. Since at most 10 calls to the solve method of dec are
+  * performed, the total cost is O(dims^2), as opposed to O(dims^3)
+  * needed to compute the inverse matrix explicitly.
+  *
+  * The most common usage is in estimating the condition number
+  * ||matrix||_1 * ||inv(matrix)||_1. The first term ||matrix||_1 can be
+  * computed directly in O(n^2) operations.
+  *
+  * Supports the following decompositions: FullPivLU, PartialPivLU, LDLT, and
+  * LLT.
+  *
+  * \sa FullPivLU, PartialPivLU, LDLT, LLT.
+  */
+template <typename Decomposition>
+typename Decomposition::RealScalar rcond_invmatrix_L1_norm_estimate(const Decomposition& dec)
+{
+  typedef typename Decomposition::MatrixType MatrixType;
+  typedef typename Decomposition::Scalar Scalar;
+  typedef typename Decomposition::RealScalar RealScalar;
+  typedef typename internal::plain_col_type<MatrixType>::type Vector;
+  typedef typename internal::plain_col_type<MatrixType, RealScalar>::type RealVector;
+  const bool is_complex = (NumTraits<Scalar>::IsComplex != 0);
+
+  eigen_assert(dec.rows() == dec.cols());
+  const Index n = dec.rows();
+  if (n == 0)
+    return 0;
+
+  // Disable Index to float conversion warning
+#ifdef __INTEL_COMPILER
+  #pragma warning push
+  #pragma warning ( disable : 2259 )
+#endif
+  Vector v = dec.solve(Vector::Ones(n) / Scalar(n));
+#ifdef __INTEL_COMPILER
+  #pragma warning pop
+#endif
+
+  // lower_bound is a lower bound on
+  //   ||inv(matrix)||_1  = sup_v ||inv(matrix) v||_1 / ||v||_1
+  // and is the objective maximized by the ("super-") gradient ascent
+  // algorithm below.
+  RealScalar lower_bound = v.template lpNorm<1>();
+  if (n == 1)
+    return lower_bound;
+
+  // Gradient ascent algorithm follows: We know that the optimum is achieved at
+  // one of the simplices v = e_i, so in each iteration we follow a
+  // super-gradient to move towards the optimal one.
+  RealScalar old_lower_bound = lower_bound;
+  Vector sign_vector(n);
+  Vector old_sign_vector;
+  Index v_max_abs_index = -1;
+  Index old_v_max_abs_index = v_max_abs_index;
+  for (int k = 0; k < 4; ++k)
+  {
+    sign_vector = internal::rcond_compute_sign<Vector, RealVector, is_complex>::run(v);
+    if (k > 0 && !is_complex && sign_vector == old_sign_vector) {
+      // Break if the solution stagnated.
+      break;
+    }
+    // v_max_abs_index = argmax |real( inv(matrix)^T * sign_vector )|
+    v = dec.adjoint().solve(sign_vector);
+    v.real().cwiseAbs().maxCoeff(&v_max_abs_index);
+    if (v_max_abs_index == old_v_max_abs_index) {
+      // Break if the solution stagnated.
+      break;
+    }
+    // Move to the new simplex e_j, where j = v_max_abs_index.
+    v = dec.solve(Vector::Unit(n, v_max_abs_index));  // v = inv(matrix) * e_j.
+    lower_bound = v.template lpNorm<1>();
+    if (lower_bound <= old_lower_bound) {
+      // Break if the gradient step did not increase the lower_bound.
+      break;
+    }
+    if (!is_complex) {
+      old_sign_vector = sign_vector;
+    }
+    old_v_max_abs_index = v_max_abs_index;
+    old_lower_bound = lower_bound;
+  }
+  // The following calculates an independent estimate of ||inv(matrix)||_1 by
+  // applying inv(matrix) (via dec.solve) to a vector with entries of slowly
+  // increasing magnitude and alternating sign:
+  //   v_i = (-1)^{i} (1 + (i / (dim-1))), i = 0,...,dim-1.
+  // This improvement to Hager's algorithm above is due to Higham. It was
+  // added to make the algorithm more robust in certain corner cases where
+  // large elements in inv(matrix) might otherwise escape detection due to
+  // exact cancellation (especially when dec.solve and dec.adjoint().solve are
+  // a sequence of backsubstitutions and permutations), which could cause
+  // Hager's algorithm to vastly underestimate ||inv(matrix)||_1.
+  Scalar alternating_sign(RealScalar(1));
+  for (Index i = 0; i < n; ++i) {
+    // The static_cast is needed when Scalar is a complex and RealScalar implements expression templates
+    v[i] = alternating_sign * static_cast<RealScalar>(RealScalar(1) + (RealScalar(i) / (RealScalar(n - 1))));
+    alternating_sign = -alternating_sign;
+  }
+  v = dec.solve(v);
+  const RealScalar alternate_lower_bound = (2 * v.template lpNorm<1>()) / (3 * RealScalar(n));
+  return numext::maxi(lower_bound, alternate_lower_bound);
+}
+
+/** \brief Reciprocal condition number estimator.
+  *
+  * Computing a decomposition of a dense matrix takes O(n^3) operations, while
+  * this method estimates the condition number quickly and reliably in O(n^2)
+  * operations.
+  *
+  * \returns an estimate of the reciprocal condition number
+  * (1 / (||matrix||_1 * ||inv(matrix)||_1)) of matrix, given ||matrix||_1 and
+  * its decomposition. Supports the following decompositions: FullPivLU,
+  * PartialPivLU, LDLT, and LLT.
+  *
+  * \sa FullPivLU, PartialPivLU, LDLT, LLT.
+  */
+template <typename Decomposition>
+typename Decomposition::RealScalar
+rcond_estimate_helper(typename Decomposition::RealScalar matrix_norm, const Decomposition& dec)
+{
+  typedef typename Decomposition::RealScalar RealScalar;
+  eigen_assert(dec.rows() == dec.cols());
+  if (dec.rows() == 0)              return NumTraits<RealScalar>::infinity();
+  if (matrix_norm == RealScalar(0)) return RealScalar(0);
+  if (dec.rows() == 1)              return RealScalar(1);
+  const RealScalar inverse_matrix_norm = rcond_invmatrix_L1_norm_estimate(dec);
+  return (inverse_matrix_norm == RealScalar(0) ? RealScalar(0)
+                                               : (RealScalar(1) / inverse_matrix_norm) / matrix_norm);
+}
+
+}  // namespace internal
+
+}  // namespace Eigen
+
+#endif
--- a/Eigen/src/Core/CoreEvaluators.h
+++ b/Eigen/src/Core/CoreEvaluators.h
--- a/Eigen/src/Core/CoreIterators.h
+++ b/Eigen/src/Core/CoreIterators.h
@@ -48,6 +48,11 @@ public:
    * Explicit zeros are not skipped over. To skip explicit zeros, see class SparseView
    */
  EIGEN_STRONG_INLINE InnerIterator& operator++()   { m_iter.operator++(); return *this; }
+  EIGEN_STRONG_INLINE InnerIterator& operator+=(Index i) { m_iter.operator+=(i); return *this; } // forwards the advance by i to the wrapped m_iter
+  EIGEN_STRONG_INLINE InnerIterator operator+(Index i) // returns an advanced copy; *this itself is not modified
+  { InnerIterator result(*this); result+=i; return result; }
+    
+
  /// \returns the column or row index of the current coefficient.
  EIGEN_STRONG_INLINE Index index() const           { return m_iter.index(); }
  /// \returns the row index of the current coefficient.
--- a/Eigen/src/Core/CwiseBinaryOp.h
+++ b/Eigen/src/Core/CwiseBinaryOp.h
@@ -13,26 +13,6 @@

 namespace Eigen {

-/** \class CwiseBinaryOp
-  * \ingroup Core_Module
-  *
-  * \brief Generic expression where a coefficient-wise binary operator is applied to two expressions
-  *
-  * \param BinaryOp template functor implementing the operator
-  * \param Lhs the type of the left-hand side
-  * \param Rhs the type of the right-hand side
-  *
-  * This class represents an expression  where a coefficient-wise binary operator is applied to two expressions.
-  * It is the return type of binary operators, by which we mean only those binary operators where
-  * both the left-hand side and the right-hand side are Eigen expressions.
-  * For example, the return type of matrix1+matrix2 is a CwiseBinaryOp.
-  *
-  * Most of the time, this is the only way that it is used, so you typically don't have to name
-  * CwiseBinaryOp types explicitly.
-  *
-  * \sa MatrixBase::binaryExpr(const MatrixBase<OtherDerived> &,const CustomBinaryOp &) const, class CwiseUnaryOp, class CwiseNullaryOp
-  */
-
 namespace internal {
 template<typename BinaryOp, typename Lhs, typename Rhs>
 struct traits<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >
@@ -52,8 +32,8 @@ struct traits<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >
  // we still want to handle the case when the result type is different.
  typedef typename result_of<
                     BinaryOp(
-                       typename Lhs::Scalar,
-                       typename Rhs::Scalar
+                       const typename Lhs::Scalar&,
+                       const typename Rhs::Scalar&
                     )
                   >::type Scalar;
  typedef typename cwise_promote_storage_type<typename traits<Lhs>::StorageKind,
@@ -66,7 +46,7 @@ struct traits<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >
  typedef typename remove_reference<LhsNested>::type _LhsNested;
  typedef typename remove_reference<RhsNested>::type _RhsNested;
  enum {
-    Flags = _LhsNested::Flags & RowMajorBit
+    Flags = cwise_promote_storage_order<typename traits<Lhs>::StorageKind,typename traits<Rhs>::StorageKind,_LhsNested::Flags & RowMajorBit,_RhsNested::Flags & RowMajorBit>::value
  };
 };
 } // end namespace internal
@@ -74,6 +54,25 @@ struct traits<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >
 template<typename BinaryOp, typename Lhs, typename Rhs, typename StorageKind>
 class CwiseBinaryOpImpl;

+/** \class CwiseBinaryOp
+  * \ingroup Core_Module
+  *
+  * \brief Generic expression where a coefficient-wise binary operator is applied to two expressions
+  *
+  * \tparam BinaryOp template functor implementing the operator
+  * \tparam LhsType the type of the left-hand side
+  * \tparam RhsType the type of the right-hand side
+  *
+  * This class represents an expression where a coefficient-wise binary operator is applied to two expressions.
+  * It is the return type of binary operators, by which we mean only those binary operators where
+  * both the left-hand side and the right-hand side are Eigen expressions.
+  * For example, the return type of matrix1+matrix2 is a CwiseBinaryOp.
+  *
+  * Most of the time, this is the only way that it is used, so you typically don't have to name
+  * CwiseBinaryOp types explicitly.
+  *
+  * \sa MatrixBase::binaryExpr(const MatrixBase<OtherDerived> &,const CustomBinaryOp &) const, class CwiseUnaryOp, class CwiseNullaryOp
+  */
 template<typename BinaryOp, typename LhsType, typename RhsType>
 class CwiseBinaryOp : 
  public CwiseBinaryOpImpl<
@@ -85,6 +84,7 @@ class CwiseBinaryOp :
 {
  public:
    
+    typedef typename internal::remove_all<BinaryOp>::type Functor;
    typedef typename internal::remove_all<LhsType>::type Lhs;
    typedef typename internal::remove_all<RhsType>::type Rhs;

@@ -100,8 +100,14 @@ class CwiseBinaryOp :
    typedef typename internal::remove_reference<LhsNested>::type _LhsNested;
    typedef typename internal::remove_reference<RhsNested>::type _RhsNested;

-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE CwiseBinaryOp(const Lhs& aLhs, const Rhs& aRhs, const BinaryOp& func = BinaryOp())
+#if EIGEN_COMP_MSVC && EIGEN_HAS_CXX11
+    //Required for Visual Studio or the Copy constructor will probably not get inlined!
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    CwiseBinaryOp(const CwiseBinaryOp<BinaryOp,LhsType,RhsType>&) = default;
+#endif
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    CwiseBinaryOp(const Lhs& aLhs, const Rhs& aRhs, const BinaryOp& func = BinaryOp())
      : m_lhs(aLhs), m_rhs(aRhs), m_functor(func)
    {
      EIGEN_CHECK_BINARY_COMPATIBILIY(BinaryOp,typename Lhs::Scalar,typename Rhs::Scalar);
@@ -110,16 +116,16 @@ class CwiseBinaryOp :
      eigen_assert(aLhs.rows() == aRhs.rows() && aLhs.cols() == aRhs.cols());
    }

-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Index rows() const {
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Index rows() const {
      // return the fixed size type if available to enable compile time optimizations
      if (internal::traits<typename internal::remove_all<LhsNested>::type>::RowsAtCompileTime==Dynamic)
        return m_rhs.rows();
      else
        return m_lhs.rows();
    }
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Index cols() const {
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Index cols() const {
      // return the fixed size type if available to enable compile time optimizations
      if (internal::traits<typename internal::remove_all<LhsNested>::type>::ColsAtCompileTime==Dynamic)
        return m_rhs.cols();
@@ -128,13 +134,13 @@ class CwiseBinaryOp :
    }

    /** \returns the left hand side nested expression */
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    const _LhsNested& lhs() const { return m_lhs; }
    /** \returns the right hand side nested expression */
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    const _RhsNested& rhs() const { return m_rhs; }
    /** \returns the functor representing the binary operation */
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    const BinaryOp& functor() const { return m_functor; }

  protected:
@@ -158,10 +164,10 @@ public:
  */
 template<typename Derived>
 template<typename OtherDerived>
-EIGEN_STRONG_INLINE Derived &
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived &
 MatrixBase<Derived>::operator-=(const MatrixBase<OtherDerived> &other)
 {
-  call_assignment(derived(), other.derived(), internal::sub_assign_op<Scalar>());
+  call_assignment(derived(), other.derived(), internal::sub_assign_op<Scalar,typename OtherDerived::Scalar>());
  return derived();
 }

@@ -171,14 +177,13 @@ MatrixBase<Derived>::operator-=(const MatrixBase<OtherDerived> &other)
  */
 template<typename Derived>
 template<typename OtherDerived>
-EIGEN_STRONG_INLINE Derived &
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived &
 MatrixBase<Derived>::operator+=(const MatrixBase<OtherDerived>& other)
 {
-  call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar>());
+  call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar,typename OtherDerived::Scalar>());
  return derived();
 }

 } // end namespace Eigen

 #endif // EIGEN_CWISE_BINARY_OP_H
-
--- a/Eigen/src/Core/CwiseNullaryOp.h
+++ b/Eigen/src/Core/CwiseNullaryOp.h
@@ -12,24 +12,6 @@

 namespace Eigen {

-/** \class CwiseNullaryOp
-  * \ingroup Core_Module
-  *
-  * \brief Generic expression of a matrix where all coefficients are defined by a functor
-  *
-  * \param NullaryOp template functor implementing the operator
-  * \param PlainObjectType the underlying plain matrix/array type
-  *
-  * This class represents an expression of a generic nullary operator.
-  * It is the return type of the Ones(), Zero(), Constant(), Identity() and Random() methods,
-  * and most of the time this is the only way it is used.
-  *
-  * However, if you want to write a function returning such an expression, you
-  * will need to use this class.
-  *
-  * \sa class CwiseUnaryOp, class CwiseBinaryOp, DenseBase::NullaryExpr()
-  */
-
 namespace internal {
 template<typename NullaryOp, typename PlainObjectType>
 struct traits<CwiseNullaryOp<NullaryOp, PlainObjectType> > : traits<PlainObjectType>
@@ -38,8 +20,42 @@ struct traits<CwiseNullaryOp<NullaryOp, PlainObjectType> > : traits<PlainObjectT
    Flags = traits<PlainObjectType>::Flags & RowMajorBit
  };
 };
-}

+} // namespace internal
+
+/** \class CwiseNullaryOp
+  * \ingroup Core_Module
+  *
+  * \brief Generic expression of a matrix where all coefficients are defined by a functor
+  *
+  * \tparam NullaryOp template functor implementing the operator
+  * \tparam PlainObjectType the underlying plain matrix/array type
+  *
+  * This class represents an expression of a generic nullary operator.
+  * It is the return type of the Ones(), Zero(), Constant(), Identity() and Random() methods,
+  * and most of the time this is the only way it is used.
+  *
+  * However, if you want to write a function returning such an expression, you
+  * will need to use this class.
+  *
+  * The functor NullaryOp must expose one of the following methods:
+    <table class="manual">
+    <tr            ><td>\c operator()() </td><td>if the procedural generation does not depend on the coefficient entries (e.g., random numbers)</td></tr>
+    <tr class="alt"><td>\c operator()(Index i)</td><td>if the procedural generation makes sense for vectors only and depends on the coefficient index \c i (e.g., linspace) </td></tr>
+    <tr            ><td>\c operator()(Index i,Index j)</td><td>if the procedural generation depends on the matrix coordinates \c i, \c j (e.g., to generate a checkerboard with 0 and 1)</td></tr>
+    </table>
+  * It is also possible to expose the last two operators if the generation makes sense for matrices but can be optimized for vectors.
+  *
+  * See DenseBase::NullaryExpr(Index,const CustomNullaryOp&) for an example binding
+  * C++11 random number generators.
+  *
+  * A nullary expression can also be used to implement custom sophisticated matrix manipulations
+  * that cannot be covered by the existing set of natively supported matrix manipulations.
+  * See this \ref TopicCustomizing_NullaryExpr "page" for some examples and additional explanations
+  * on the behavior of CwiseNullaryOp.
+  *
+  * \sa class CwiseUnaryOp, class CwiseBinaryOp, DenseBase::NullaryExpr
+  */
 template<typename NullaryOp, typename PlainObjectType>
 class CwiseNullaryOp : public internal::dense_xpr_base< CwiseNullaryOp<NullaryOp, PlainObjectType> >::type, internal::no_assignment_operator
 {
@@ -63,30 +79,6 @@ class CwiseNullaryOp : public internal::dense_xpr_base< CwiseNullaryOp<NullaryOp
    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Index cols() const { return m_cols.value(); }

-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const Scalar coeff(Index rowId, Index colId) const
-    {
-      return m_functor(rowId, colId);
-    }
-
-    template<int LoadMode>
-    EIGEN_STRONG_INLINE PacketScalar packet(Index rowId, Index colId) const
-    {
-      return m_functor.packetOp(rowId, colId);
-    }
-
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const Scalar coeff(Index index) const
-    {
-      return m_functor(index);
-    }
-
-    template<int LoadMode>
-    EIGEN_STRONG_INLINE PacketScalar packet(Index index) const
-    {
-      return m_functor.packetOp(index);
-    }
-
    /** \returns the functor representing the nullary operation */
    EIGEN_DEVICE_FUNC
    const NullaryOp& functor() const { return m_functor; }
@@ -113,7 +105,12 @@ class CwiseNullaryOp : public internal::dense_xpr_base< CwiseNullaryOp<NullaryOp
  */
 template<typename Derived>
 template<typename CustomNullaryOp>
-EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+const CwiseNullaryOp<CustomNullaryOp,typename DenseBase<Derived>::PlainObject>
+#else
+const CwiseNullaryOp<CustomNullaryOp,PlainObject>
+#endif
 DenseBase<Derived>::NullaryExpr(Index rows, Index cols, const CustomNullaryOp& func)
 {
  return CwiseNullaryOp<CustomNullaryOp, PlainObject>(rows, cols, func);
@@ -139,7 +136,12 @@ DenseBase<Derived>::NullaryExpr(Index rows, Index cols, const CustomNullaryOp& f
  */
 template<typename Derived>
 template<typename CustomNullaryOp>
-EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
+#else
+const CwiseNullaryOp<CustomNullaryOp, PlainObject>
+#endif
 DenseBase<Derived>::NullaryExpr(Index size, const CustomNullaryOp& func)
 {
  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
@@ -158,7 +160,12 @@ DenseBase<Derived>::NullaryExpr(Index size, const CustomNullaryOp& func)
  */
 template<typename Derived>
 template<typename CustomNullaryOp>
-EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
+#else
+const CwiseNullaryOp<CustomNullaryOp, PlainObject>
+#endif
 DenseBase<Derived>::NullaryExpr(const CustomNullaryOp& func)
 {
  return CwiseNullaryOp<CustomNullaryOp, PlainObject>(RowsAtCompileTime, ColsAtCompileTime, func);
@@ -178,7 +185,7 @@ DenseBase<Derived>::NullaryExpr(const CustomNullaryOp& func)
  * \sa class CwiseNullaryOp
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
 DenseBase<Derived>::Constant(Index rows, Index cols, const Scalar& value)
 {
  return DenseBase<Derived>::NullaryExpr(rows, cols, internal::scalar_constant_op<Scalar>(value));
@@ -200,7 +207,7 @@ DenseBase<Derived>::Constant(Index rows, Index cols, const Scalar& value)
  * \sa class CwiseNullaryOp
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
 DenseBase<Derived>::Constant(Index size, const Scalar& value)
 {
  return DenseBase<Derived>::NullaryExpr(size, internal::scalar_constant_op<Scalar>(value));
@@ -216,53 +223,45 @@ DenseBase<Derived>::Constant(Index size, const Scalar& value)
  * \sa class CwiseNullaryOp
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
 DenseBase<Derived>::Constant(const Scalar& value)
 {
  EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived)
  return DenseBase<Derived>::NullaryExpr(RowsAtCompileTime, ColsAtCompileTime, internal::scalar_constant_op<Scalar>(value));
 }

-/**
-  * \brief Sets a linearly space vector.
-  *
-  * The function generates 'size' equally spaced values in the closed interval [low,high].
-  * This particular version of LinSpaced() uses sequential access, i.e. vector access is
-  * assumed to be a(0), a(1), ..., a(size). This assumption allows for better vectorization
-  * and yields faster code than the random access version.
-  *
-  * When size is set to 1, a vector of length 1 containing 'high' is returned.
+/** \deprecated because of accuracy loss. In Eigen 3.3, it is an alias for LinSpaced(Index,const Scalar&,const Scalar&)
  *
  * \only_for_vectors
  *
-  * Example: \include DenseBase_LinSpaced_seq.cpp
-  * Output: \verbinclude DenseBase_LinSpaced_seq.out
+  * Example: \include DenseBase_LinSpaced_seq_deprecated.cpp
+  * Output: \verbinclude DenseBase_LinSpaced_seq_deprecated.out
  *
-  * \sa setLinSpaced(Index,const Scalar&,const Scalar&), LinSpaced(Index,Scalar,Scalar), CwiseNullaryOp
+  * \sa LinSpaced(Index,const Scalar&, const Scalar&), setLinSpaced(Index,const Scalar&,const Scalar&)
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::SequentialLinSpacedReturnType
+EIGEN_DEPRECATED EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
 DenseBase<Derived>::LinSpaced(Sequential_t, Index size, const Scalar& low, const Scalar& high)
 {
  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return DenseBase<Derived>::NullaryExpr(size, internal::linspaced_op<Scalar,PacketScalar,false>(low,high,size));
+  return DenseBase<Derived>::NullaryExpr(size, internal::linspaced_op<Scalar>(low,high,size));
 }

-/**
-  * \copydoc DenseBase::LinSpaced(Sequential_t, Index, const Scalar&, const Scalar&)
-  * Special version for fixed size types which does not require the size parameter.
+/** \deprecated because of accuracy loss. In Eigen 3.3, it is an alias for LinSpaced(const Scalar&,const Scalar&)
+  *
+  * \sa LinSpaced(const Scalar&, const Scalar&)
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::SequentialLinSpacedReturnType
+EIGEN_DEPRECATED EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
 DenseBase<Derived>::LinSpaced(Sequential_t, const Scalar& low, const Scalar& high)
 {
  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
  EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived)
-  return DenseBase<Derived>::NullaryExpr(Derived::SizeAtCompileTime, internal::linspaced_op<Scalar,PacketScalar,false>(low,high,Derived::SizeAtCompileTime));
+  return DenseBase<Derived>::NullaryExpr(Derived::SizeAtCompileTime, internal::linspaced_op<Scalar>(low,high,Derived::SizeAtCompileTime));
 }

 /**
-  * \brief Sets a linearly space vector.
+  * \brief Sets a linearly spaced vector.
  *
  * The function generates 'size' equally spaced values in the closed interval [low,high].
  * When size is set to 1, a vector of length 1 containing 'high' is returned.
@@ -272,14 +271,24 @@ DenseBase<Derived>::LinSpaced(Sequential_t, const Scalar& low, const Scalar& hig
  * Example: \include DenseBase_LinSpaced.cpp
  * Output: \verbinclude DenseBase_LinSpaced.out
  *
-  * \sa setLinSpaced(Index,const Scalar&,const Scalar&), LinSpaced(Sequential_t,Index,const Scalar&,const Scalar&,Index), CwiseNullaryOp
+  * For integer scalar types, an even spacing is possible if and only if the length of the range,
+  * i.e., \c high-low is a scalar multiple of \c size-1, or if \c size is a scalar multiple of the
+  * number of values \c high-low+1 (meaning each value can be repeated the same number of times).
+  * If neither of these two conditions is satisfied, then \c high is lowered to the largest value
+  * satisfying one of these constraints.
+  * Here are some examples:
+  *
+  * Example: \include DenseBase_LinSpacedInt.cpp
+  * Output: \verbinclude DenseBase_LinSpacedInt.out
+  *
+  * \sa setLinSpaced(Index,const Scalar&,const Scalar&), CwiseNullaryOp
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
 DenseBase<Derived>::LinSpaced(Index size, const Scalar& low, const Scalar& high)
 {
  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return DenseBase<Derived>::NullaryExpr(size, internal::linspaced_op<Scalar,PacketScalar,true>(low,high,size));
+  return DenseBase<Derived>::NullaryExpr(size, internal::linspaced_op<Scalar>(low,high,size));
 }

 /**
@@ -287,17 +296,17 @@ DenseBase<Derived>::LinSpaced(Index size, const Scalar& low, const Scalar& high)
  * Special version for fixed size types which does not require the size parameter.
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
 DenseBase<Derived>::LinSpaced(const Scalar& low, const Scalar& high)
 {
  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
  EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived)
-  return DenseBase<Derived>::NullaryExpr(Derived::SizeAtCompileTime, internal::linspaced_op<Scalar,PacketScalar,true>(low,high,Derived::SizeAtCompileTime));
+  return DenseBase<Derived>::NullaryExpr(Derived::SizeAtCompileTime, internal::linspaced_op<Scalar>(low,high,Derived::SizeAtCompileTime));
 }

 /** \returns true if all coefficients in this matrix are approximately equal to \a val, to within precision \a prec */
 template<typename Derived>
-bool DenseBase<Derived>::isApproxToConstant
+EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isApproxToConstant
 (const Scalar& val, const RealScalar& prec) const
 {
  typename internal::nested_eval<Derived,1>::type self(derived());
@@ -312,7 +321,7 @@ bool DenseBase<Derived>::isApproxToConstant
  *
  * \returns true if all coefficients in this matrix are approximately equal to \a value, to within precision \a prec */
 template<typename Derived>
-bool DenseBase<Derived>::isConstant
+EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isConstant
 (const Scalar& val, const RealScalar& prec) const
 {
  return isApproxToConstant(val, prec);
@@ -323,22 +332,22 @@ bool DenseBase<Derived>::isConstant
  * \sa setConstant(), Constant(), class CwiseNullaryOp
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE void DenseBase<Derived>::fill(const Scalar& val)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void DenseBase<Derived>::fill(const Scalar& val)
 {
  setConstant(val);
 }

-/** Sets all coefficients in this expression to \a value.
+/** Sets all coefficients in this expression to value \a val.
  *
  * \sa fill(), setConstant(Index,const Scalar&), setConstant(Index,Index,const Scalar&), setZero(), setOnes(), Constant(), class CwiseNullaryOp, setZero(), setOnes()
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setConstant(const Scalar& val)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setConstant(const Scalar& val)
 {
  return derived() = Constant(rows(), cols(), val);
 }

-/** Resizes to the given \a size, and sets all coefficients in this expression to the given \a value.
+/** Resizes to the given \a size, and sets all coefficients in this expression to the given value \a val.
  *
  * \only_for_vectors
  *
@@ -348,14 +357,14 @@ EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setConstant(const Scalar& val)
  * \sa MatrixBase::setConstant(const Scalar&), setConstant(Index,Index,const Scalar&), class CwiseNullaryOp, MatrixBase::Constant(const Scalar&)
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived&
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
 PlainObjectBase<Derived>::setConstant(Index size, const Scalar& val)
 {
  resize(size);
  return setConstant(val);
 }

-/** Resizes to the given size, and sets all coefficients in this expression to the given \a value.
+/** Resizes to the given size, and sets all coefficients in this expression to the given value \a val.
  *
  * \param rows the new number of rows
  * \param cols the new number of columns
@@ -367,7 +376,7 @@ PlainObjectBase<Derived>::setConstant(Index size, const Scalar& val)
  * \sa MatrixBase::setConstant(const Scalar&), setConstant(Index,const Scalar&), class CwiseNullaryOp, MatrixBase::Constant(const Scalar&)
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived&
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
 PlainObjectBase<Derived>::setConstant(Index rows, Index cols, const Scalar& val)
 {
  resize(rows, cols);
@@ -375,7 +384,7 @@ PlainObjectBase<Derived>::setConstant(Index rows, Index cols, const Scalar& val)
 }

 /**
-  * \brief Sets a linearly space vector.
+  * \brief Sets a linearly spaced vector.
  *
  * The function generates 'size' equally spaced values in the closed interval [low,high].
  * When size is set to 1, a vector of length 1 containing 'high' is returned.
@@ -385,27 +394,33 @@ PlainObjectBase<Derived>::setConstant(Index rows, Index cols, const Scalar& val)
  * Example: \include DenseBase_setLinSpaced.cpp
  * Output: \verbinclude DenseBase_setLinSpaced.out
  *
-  * \sa CwiseNullaryOp
+  * For integer scalar types, do not miss the explanations on the definition
+  * of \link LinSpaced(Index,const Scalar&,const Scalar&) even spacing \endlink.
+  *
+  * \sa LinSpaced(Index,const Scalar&,const Scalar&), CwiseNullaryOp
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(Index newSize, const Scalar& low, const Scalar& high)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(Index newSize, const Scalar& low, const Scalar& high)
 {
  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return derived() = Derived::NullaryExpr(newSize, internal::linspaced_op<Scalar,PacketScalar,false>(low,high,newSize));
+  return derived() = Derived::NullaryExpr(newSize, internal::linspaced_op<Scalar>(low,high,newSize));
 }

 /**
-  * \brief Sets a linearly space vector.
+  * \brief Sets a linearly spaced vector.
  *
-  * The function fill *this with equally spaced values in the closed interval [low,high].
+  * The function fills \c *this with equally spaced values in the closed interval [low,high].
  * When size is set to 1, a vector of length 1 containing 'high' is returned.
  *
  * \only_for_vectors
  *
-  * \sa setLinSpaced(Index, const Scalar&, const Scalar&), CwiseNullaryOp
+  * For integer scalar types, do not miss the explanations on the definition
+  * of \link LinSpaced(Index,const Scalar&,const Scalar&) even spacing \endlink.
+  *
+  * \sa LinSpaced(Index,const Scalar&,const Scalar&), setLinSpaced(Index, const Scalar&, const Scalar&), CwiseNullaryOp
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(const Scalar& low, const Scalar& high)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(const Scalar& low, const Scalar& high)
 {
  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
  return setLinSpaced(size(), low, high);
@@ -428,7 +443,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(const Scalar& low,
  * \sa Zero(), Zero(Index)
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
 DenseBase<Derived>::Zero(Index rows, Index cols)
 {
  return Constant(rows, cols, Scalar(0));
@@ -451,7 +466,7 @@ DenseBase<Derived>::Zero(Index rows, Index cols)
  * \sa Zero(), Zero(Index,Index)
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
 DenseBase<Derived>::Zero(Index size)
 {
  return Constant(size, Scalar(0));
@@ -468,7 +483,7 @@ DenseBase<Derived>::Zero(Index size)
  * \sa Zero(Index), Zero(Index,Index)
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
 DenseBase<Derived>::Zero()
 {
  return Constant(Scalar(0));
@@ -483,7 +498,7 @@ DenseBase<Derived>::Zero()
  * \sa class CwiseNullaryOp, Zero()
  */
 template<typename Derived>
-bool DenseBase<Derived>::isZero(const RealScalar& prec) const
+EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isZero(const RealScalar& prec) const
 {
  typename internal::nested_eval<Derived,1>::type self(derived());
  for(Index j = 0; j < cols(); ++j)
@@ -501,7 +516,7 @@ bool DenseBase<Derived>::isZero(const RealScalar& prec) const
  * \sa class CwiseNullaryOp, Zero()
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setZero()
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setZero()
 {
  return setConstant(Scalar(0));
 }
@@ -516,7 +531,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setZero()
  * \sa DenseBase::setZero(), setZero(Index,Index), class CwiseNullaryOp, DenseBase::Zero()
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived&
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
 PlainObjectBase<Derived>::setZero(Index newSize)
 {
  resize(newSize);
@@ -534,7 +549,7 @@ PlainObjectBase<Derived>::setZero(Index newSize)
  * \sa DenseBase::setZero(), setZero(Index), class CwiseNullaryOp, DenseBase::Zero()
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived&
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
 PlainObjectBase<Derived>::setZero(Index rows, Index cols)
 {
  resize(rows, cols);
@@ -558,7 +573,7 @@ PlainObjectBase<Derived>::setZero(Index rows, Index cols)
  * \sa Ones(), Ones(Index), isOnes(), class Ones
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
 DenseBase<Derived>::Ones(Index rows, Index cols)
 {
  return Constant(rows, cols, Scalar(1));
@@ -581,7 +596,7 @@ DenseBase<Derived>::Ones(Index rows, Index cols)
  * \sa Ones(), Ones(Index,Index), isOnes(), class Ones
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
 DenseBase<Derived>::Ones(Index newSize)
 {
  return Constant(newSize, Scalar(1));
@@ -598,7 +613,7 @@ DenseBase<Derived>::Ones(Index newSize)
  * \sa Ones(Index), Ones(Index,Index), isOnes(), class Ones
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
 DenseBase<Derived>::Ones()
 {
  return Constant(Scalar(1));
@@ -613,7 +628,7 @@ DenseBase<Derived>::Ones()
  * \sa class CwiseNullaryOp, Ones()
  */
 template<typename Derived>
-bool DenseBase<Derived>::isOnes
+EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isOnes
 (const RealScalar& prec) const
 {
  return isApproxToConstant(Scalar(1), prec);
@@ -627,7 +642,7 @@ bool DenseBase<Derived>::isOnes
  * \sa class CwiseNullaryOp, Ones()
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setOnes()
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setOnes()
 {
  return setConstant(Scalar(1));
 }
@@ -642,7 +657,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setOnes()
  * \sa MatrixBase::setOnes(), setOnes(Index,Index), class CwiseNullaryOp, MatrixBase::Ones()
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived&
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
 PlainObjectBase<Derived>::setOnes(Index newSize)
 {
  resize(newSize);
@@ -660,7 +675,7 @@ PlainObjectBase<Derived>::setOnes(Index newSize)
  * \sa MatrixBase::setOnes(), setOnes(Index), class CwiseNullaryOp, MatrixBase::Ones()
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived&
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
 PlainObjectBase<Derived>::setOnes(Index rows, Index cols)
 {
  resize(rows, cols);
@@ -684,7 +699,7 @@ PlainObjectBase<Derived>::setOnes(Index rows, Index cols)
  * \sa Identity(), setIdentity(), isIdentity()
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::IdentityReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::IdentityReturnType
 MatrixBase<Derived>::Identity(Index rows, Index cols)
 {
  return DenseBase<Derived>::NullaryExpr(rows, cols, internal::scalar_identity_op<Scalar>());
@@ -701,7 +716,7 @@ MatrixBase<Derived>::Identity(Index rows, Index cols)
  * \sa Identity(Index,Index), setIdentity(), isIdentity()
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::IdentityReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::IdentityReturnType
 MatrixBase<Derived>::Identity()
 {
  EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived)
@@ -760,7 +775,7 @@ struct setIdentity_impl<Derived, true>
  static EIGEN_STRONG_INLINE Derived& run(Derived& m)
  {
    m.setZero();
-    const Index size = (std::min)(m.rows(), m.cols());
+    const Index size = numext::mini(m.rows(), m.cols());
    for(Index i = 0; i < size; ++i) m.coeffRef(i,i) = typename Derived::Scalar(1);
    return m;
  }
@@ -776,7 +791,7 @@ struct setIdentity_impl<Derived, true>
  * \sa class CwiseNullaryOp, Identity(), Identity(Index,Index), isIdentity()
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity()
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity()
 {
  return internal::setIdentity_impl<Derived>::run(derived());
 }
@@ -792,7 +807,7 @@ EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity()
  * \sa MatrixBase::setIdentity(), class CwiseNullaryOp, MatrixBase::Identity()
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity(Index rows, Index cols)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity(Index rows, Index cols)
 {
  derived().resize(rows, cols);
  return setIdentity();
@@ -805,7 +820,7 @@ EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity(Index rows, Index
  * \sa MatrixBase::Unit(Index), MatrixBase::UnitX(), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::Unit(Index newSize, Index i)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::Unit(Index newSize, Index i)
 {
  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
  return BasisReturnType(SquareMatrixType::Identity(newSize,newSize), i);
@@ -820,7 +835,7 @@ EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBa
  * \sa MatrixBase::Unit(Index,Index), MatrixBase::UnitX(), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::Unit(Index i)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::Unit(Index i)
 {
  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
  return BasisReturnType(SquareMatrixType::Identity(),i);
@@ -833,7 +848,7 @@ EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBa
  * \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitX()
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitX()
 { return Derived::Unit(0); }

 /** \returns an expression of the Y axis unit vector (0,1{,0}^*)
@@ -843,7 +858,7 @@ EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBa
  * \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitY()
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitY()
 { return Derived::Unit(1); }

 /** \returns an expression of the Z axis unit vector (0,0,1{,0}^*)
@@ -853,7 +868,7 @@ EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBa
  * \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitZ()
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitZ()
 { return Derived::Unit(2); }

 /** \returns an expression of the W axis unit vector (0,0,0,1)
@@ -863,9 +878,45 @@ EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBa
  * \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitW()
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitW()
 { return Derived::Unit(3); }

+/** \brief Set the coefficients of \c *this to the i-th unit (basis) vector
+  *
+  * \param i index of the unique coefficient to be set to 1; must satisfy 0 <= i < size()
+  *
+  * \only_for_vectors
+  *
+  * \sa MatrixBase::setIdentity(), class CwiseNullaryOp, MatrixBase::Unit(Index,Index)
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setUnit(Index i)
+{
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
+  eigen_assert(i>=0 && i<size()); // Index is signed: reject negative indices as well
+  derived().setZero();
+  derived().coeffRef(i) = Scalar(1);
+  return derived();
+}
+
+/** \brief Resizes to the given \a newSize, and writes the i-th unit (basis) vector into *this.
+  *
+  * \param newSize the new size of the vector
+  * \param i index of the unique coefficient to be set to 1; must satisfy 0 <= i < newSize
+  *
+  * \only_for_vectors
+  *
+  * \sa MatrixBase::setIdentity(), class CwiseNullaryOp, MatrixBase::Unit(Index,Index)
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setUnit(Index newSize, Index i)
+{
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
+  eigen_assert(i>=0 && i<newSize); // validate the index before touching the storage
+  derived().resize(newSize);
+  return setUnit(i);
+}
+
 } // end namespace Eigen

 #endif // EIGEN_CWISE_NULLARY_OP_H
--- a/Eigen/src/Core/CwiseTernaryOp.h
+++ b/Eigen/src/Core/CwiseTernaryOp.h
@@ -0,0 +1,197 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2016 Eugene Brevdo <ebrevdo@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CWISE_TERNARY_OP_H
+#define EIGEN_CWISE_TERNARY_OP_H
+
+namespace Eigen {
+
+namespace internal {
+template <typename TernaryOp, typename Arg1, typename Arg2, typename Arg3>
+struct traits<CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3> > {
+  // we must not inherit from traits<Arg1> since it has
+  // the potential to cause problems with MSVC
+  typedef typename remove_all<Arg1>::type Ancestor;
+  typedef typename traits<Ancestor>::XprKind XprKind;
+  enum {
+    RowsAtCompileTime = traits<Ancestor>::RowsAtCompileTime,
+    ColsAtCompileTime = traits<Ancestor>::ColsAtCompileTime,
+    MaxRowsAtCompileTime = traits<Ancestor>::MaxRowsAtCompileTime,
+    MaxColsAtCompileTime = traits<Ancestor>::MaxColsAtCompileTime
+  };
+
+  // even though we require Arg1, Arg2, and Arg3 to have the same scalar type
+  // (see CwiseTernaryOp constructor),
+  // we still want to handle the case when the result type is different.
+  typedef typename result_of<TernaryOp(
+      const typename Arg1::Scalar&, const typename Arg2::Scalar&,
+      const typename Arg3::Scalar&)>::type Scalar;
+
+  typedef typename internal::traits<Arg1>::StorageKind StorageKind;
+  typedef typename internal::traits<Arg1>::StorageIndex StorageIndex;
+
+  typedef typename Arg1::Nested Arg1Nested;
+  typedef typename Arg2::Nested Arg2Nested;
+  typedef typename Arg3::Nested Arg3Nested;
+  typedef typename remove_reference<Arg1Nested>::type _Arg1Nested;
+  typedef typename remove_reference<Arg2Nested>::type _Arg2Nested;
+  typedef typename remove_reference<Arg3Nested>::type _Arg3Nested;
+  enum { Flags = _Arg1Nested::Flags & RowMajorBit };
+};
+}  // end namespace internal
+
+template <typename TernaryOp, typename Arg1, typename Arg2, typename Arg3,
+          typename StorageKind>
+class CwiseTernaryOpImpl;
+
+/** \class CwiseTernaryOp
+  * \ingroup Core_Module
+  *
+  * \brief Generic expression where a coefficient-wise ternary operator is
+  * applied to three expressions
+  *
+  * \tparam TernaryOp template functor implementing the operator
+  * \tparam Arg1Type the type of the first argument
+  * \tparam Arg2Type the type of the second argument
+  * \tparam Arg3Type the type of the third argument
+  *
+  * This class represents an expression where a coefficient-wise ternary
+  * operator is applied to three expressions.
+  * It is the return type of ternary operators, by which we mean only those
+  * ternary operators where
+  * all three arguments are Eigen expressions.
+  * For example, the return type of betainc(matrix1, matrix2, matrix3) is a
+  * CwiseTernaryOp.
+  *
+  * Most of the time, this is the only way that it is used, so you typically
+  * don't have to name
+  * CwiseTernaryOp types explicitly.
+  *
+  * \sa MatrixBase::ternaryExpr(const MatrixBase<Argument2> &, const
+  * MatrixBase<Argument3> &, const CustomTernaryOp &) const, class CwiseBinaryOp,
+  * class CwiseUnaryOp, class CwiseNullaryOp
+  */
+template <typename TernaryOp, typename Arg1Type, typename Arg2Type,
+          typename Arg3Type>
+class CwiseTernaryOp : public CwiseTernaryOpImpl<
+                           TernaryOp, Arg1Type, Arg2Type, Arg3Type,
+                           typename internal::traits<Arg1Type>::StorageKind>,
+                       internal::no_assignment_operator
+{
+ public:
+  typedef typename internal::remove_all<Arg1Type>::type Arg1;
+  typedef typename internal::remove_all<Arg2Type>::type Arg2;
+  typedef typename internal::remove_all<Arg3Type>::type Arg3;
+
+  typedef typename CwiseTernaryOpImpl<
+      TernaryOp, Arg1Type, Arg2Type, Arg3Type,
+      typename internal::traits<Arg1Type>::StorageKind>::Base Base;
+  EIGEN_GENERIC_PUBLIC_INTERFACE(CwiseTernaryOp)
+
+  typedef typename internal::ref_selector<Arg1Type>::type Arg1Nested;
+  typedef typename internal::ref_selector<Arg2Type>::type Arg2Nested;
+  typedef typename internal::ref_selector<Arg3Type>::type Arg3Nested;
+  typedef typename internal::remove_reference<Arg1Nested>::type _Arg1Nested;
+  typedef typename internal::remove_reference<Arg2Nested>::type _Arg2Nested;
+  typedef typename internal::remove_reference<Arg3Nested>::type _Arg3Nested;
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE CwiseTernaryOp(const Arg1& a1, const Arg2& a2,
+                                     const Arg3& a3,
+                                     const TernaryOp& func = TernaryOp())
+      : m_arg1(a1), m_arg2(a2), m_arg3(a3), m_functor(func) {
+    // require the sizes to match
+    EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Arg1, Arg2)
+    EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Arg1, Arg3)
+
+    // The storage kinds should match
+    EIGEN_STATIC_ASSERT((internal::is_same<
+                         typename internal::traits<Arg1Type>::StorageKind,
+                         typename internal::traits<Arg2Type>::StorageKind>::value),
+                        STORAGE_KIND_MUST_MATCH)
+    EIGEN_STATIC_ASSERT((internal::is_same<
+                         typename internal::traits<Arg1Type>::StorageKind,
+                         typename internal::traits<Arg3Type>::StorageKind>::value),
+                        STORAGE_KIND_MUST_MATCH)
+
+    eigen_assert(a1.rows() == a2.rows() && a1.cols() == a2.cols() &&
+                 a1.rows() == a3.rows() && a1.cols() == a3.cols());
+  }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE Index rows() const {
+    // return the fixed size type if available to enable compile time
+    // optimizations
+    if (internal::traits<typename internal::remove_all<Arg1Nested>::type>::
+                RowsAtCompileTime == Dynamic &&
+        internal::traits<typename internal::remove_all<Arg2Nested>::type>::
+                RowsAtCompileTime == Dynamic)
+      return m_arg3.rows();
+    else if (internal::traits<typename internal::remove_all<Arg1Nested>::type>::
+                     RowsAtCompileTime == Dynamic &&
+             internal::traits<typename internal::remove_all<Arg3Nested>::type>::
+                     RowsAtCompileTime == Dynamic)
+      return m_arg2.rows();
+    else
+      return m_arg1.rows();
+  }
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE Index cols() const {
+    // return the fixed size type if available to enable compile time
+    // optimizations
+    if (internal::traits<typename internal::remove_all<Arg1Nested>::type>::
+                ColsAtCompileTime == Dynamic &&
+        internal::traits<typename internal::remove_all<Arg2Nested>::type>::
+                ColsAtCompileTime == Dynamic)
+      return m_arg3.cols();
+    else if (internal::traits<typename internal::remove_all<Arg1Nested>::type>::
+                     ColsAtCompileTime == Dynamic &&
+             internal::traits<typename internal::remove_all<Arg3Nested>::type>::
+                     ColsAtCompileTime == Dynamic)
+      return m_arg2.cols();
+    else
+      return m_arg1.cols();
+  }
+
+  /** \returns the first argument nested expression */
+  EIGEN_DEVICE_FUNC
+  const _Arg1Nested& arg1() const { return m_arg1; }
+  /** \returns the second argument nested expression */
+  EIGEN_DEVICE_FUNC
+  const _Arg2Nested& arg2() const { return m_arg2; }
+  /** \returns the third argument nested expression */
+  EIGEN_DEVICE_FUNC
+  const _Arg3Nested& arg3() const { return m_arg3; }
+  /** \returns the functor representing the ternary operation */
+  EIGEN_DEVICE_FUNC
+  const TernaryOp& functor() const { return m_functor; }
+
+ protected:
+  Arg1Nested m_arg1;
+  Arg2Nested m_arg2;
+  Arg3Nested m_arg3;
+  const TernaryOp m_functor;
+};
+
+// Generic API dispatcher
+template <typename TernaryOp, typename Arg1, typename Arg2, typename Arg3,
+          typename StorageKind>
+class CwiseTernaryOpImpl
+    : public internal::generic_xpr_base<
+          CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3> >::type {
+ public:
+  typedef typename internal::generic_xpr_base<
+      CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3> >::type Base;
+};
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_CWISE_TERNARY_OP_H
--- a/Eigen/src/Core/CwiseUnaryOp.h
+++ b/Eigen/src/Core/CwiseUnaryOp.h
@@ -13,33 +13,13 @@

 namespace Eigen { 

-/** \class CwiseUnaryOp
-  * \ingroup Core_Module
-  *
-  * \brief Generic expression where a coefficient-wise unary operator is applied to an expression
-  *
-  * \param UnaryOp template functor implementing the operator
-  * \param XprType the type of the expression to which we are applying the unary operator
-  *
-  * This class represents an expression where a unary operator is applied to an expression.
-  * It is the return type of all operations taking exactly 1 input expression, regardless of the
-  * presence of other inputs such as scalars. For example, the operator* in the expression 3*matrix
-  * is considered unary, because only the right-hand side is an expression, and its
-  * return type is a specialization of CwiseUnaryOp.
-  *
-  * Most of the time, this is the only way that it is used, so you typically don't have to name
-  * CwiseUnaryOp types explicitly.
-  *
-  * \sa MatrixBase::unaryExpr(const CustomUnaryOp &) const, class CwiseBinaryOp, class CwiseNullaryOp
-  */
-
 namespace internal {
 template<typename UnaryOp, typename XprType>
 struct traits<CwiseUnaryOp<UnaryOp, XprType> >
 : traits<XprType>
 {
  typedef typename result_of<
-                     UnaryOp(typename XprType::Scalar)
+                     UnaryOp(const typename XprType::Scalar&)
                   >::type Scalar;
  typedef typename XprType::Nested XprTypeNested;
  typedef typename remove_reference<XprTypeNested>::type _XprTypeNested;
@@ -52,6 +32,25 @@ struct traits<CwiseUnaryOp<UnaryOp, XprType> >
 template<typename UnaryOp, typename XprType, typename StorageKind>
 class CwiseUnaryOpImpl;

+/** \class CwiseUnaryOp
+  * \ingroup Core_Module
+  *
+  * \brief Generic expression where a coefficient-wise unary operator is applied to an expression
+  *
+  * \tparam UnaryOp template functor implementing the operator
+  * \tparam XprType the type of the expression to which we are applying the unary operator
+  *
+  * This class represents an expression where a unary operator is applied to an expression.
+  * It is the return type of all operations taking exactly 1 input expression, regardless of the
+  * presence of other inputs such as scalars. For example, the operator* in the expression 3*matrix
+  * is considered unary, because only the right-hand side is an expression, and its
+  * return type is a specialization of CwiseUnaryOp.
+  *
+  * Most of the time, this is the only way that it is used, so you typically don't have to name
+  * CwiseUnaryOp types explicitly.
+  *
+  * \sa MatrixBase::unaryExpr(const CustomUnaryOp &) const, class CwiseBinaryOp, class CwiseNullaryOp
+  */
 template<typename UnaryOp, typename XprType>
 class CwiseUnaryOp : public CwiseUnaryOpImpl<UnaryOp, XprType, typename internal::traits<XprType>::StorageKind>, internal::no_assignment_operator
 {
@@ -59,33 +58,34 @@ class CwiseUnaryOp : public CwiseUnaryOpImpl<UnaryOp, XprType, typename internal

    typedef typename CwiseUnaryOpImpl<UnaryOp, XprType,typename internal::traits<XprType>::StorageKind>::Base Base;
    EIGEN_GENERIC_PUBLIC_INTERFACE(CwiseUnaryOp)
+    typedef typename internal::ref_selector<XprType>::type XprTypeNested;
    typedef typename internal::remove_all<XprType>::type NestedExpression;

-    EIGEN_DEVICE_FUNC
-    explicit inline CwiseUnaryOp(const XprType& xpr, const UnaryOp& func = UnaryOp())
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    explicit CwiseUnaryOp(const XprType& xpr, const UnaryOp& func = UnaryOp())
      : m_xpr(xpr), m_functor(func) {}

-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Index rows() const { return m_xpr.rows(); }
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Index cols() const { return m_xpr.cols(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Index rows() const { return m_xpr.rows(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Index cols() const { return m_xpr.cols(); }

    /** \returns the functor representing the unary operation */
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    const UnaryOp& functor() const { return m_functor; }

    /** \returns the nested expression */
-    EIGEN_DEVICE_FUNC
-    const typename internal::remove_all<typename XprType::Nested>::type&
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const typename internal::remove_all<XprTypeNested>::type&
    nestedExpression() const { return m_xpr; }

    /** \returns the nested expression */
-    EIGEN_DEVICE_FUNC
-    typename internal::remove_all<typename XprType::Nested>::type&
-    nestedExpression() { return m_xpr.const_cast_derived(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    typename internal::remove_all<XprTypeNested>::type&
+    nestedExpression() { return m_xpr; }

  protected:
-    typename XprType::Nested m_xpr;
+    XprTypeNested m_xpr;
    const UnaryOp m_functor;
 };

--- a/Eigen/src/Core/CwiseUnaryView.h
+++ b/Eigen/src/Core/CwiseUnaryView.h
@@ -12,27 +12,13 @@

 namespace Eigen {

-/** \class CwiseUnaryView
-  * \ingroup Core_Module
-  *
-  * \brief Generic lvalue expression of a coefficient-wise unary operator of a matrix or a vector
-  *
-  * \param ViewOp template functor implementing the view
-  * \param MatrixType the type of the matrix we are applying the unary operator
-  *
-  * This class represents a lvalue expression of a generic unary view operator of a matrix or a vector.
-  * It is the return type of real() and imag(), and most of the time this is the only way it is used.
-  *
-  * \sa MatrixBase::unaryViewExpr(const CustomUnaryOp &) const, class CwiseUnaryOp
-  */
-
 namespace internal {
 template<typename ViewOp, typename MatrixType>
 struct traits<CwiseUnaryView<ViewOp, MatrixType> >
 : traits<MatrixType>
 {
  typedef typename result_of<
-                     ViewOp(typename traits<MatrixType>::Scalar)
+                     ViewOp(const typename traits<MatrixType>::Scalar&)
                   >::type Scalar;
  typedef typename MatrixType::Nested MatrixTypeNested;
  typedef typename remove_all<MatrixTypeNested>::type _MatrixTypeNested;
@@ -55,6 +41,19 @@ struct traits<CwiseUnaryView<ViewOp, MatrixType> >
 template<typename ViewOp, typename MatrixType, typename StorageKind>
 class CwiseUnaryViewImpl;

+/** \class CwiseUnaryView
+  * \ingroup Core_Module
+  *
+  * \brief Generic lvalue expression of a coefficient-wise unary operator of a matrix or a vector
+  *
+  * \tparam ViewOp template functor implementing the view
+  * \tparam MatrixType the type of the matrix we are applying the unary operator
+  *
+  * This class represents a lvalue expression of a generic unary view operator of a matrix or a vector.
+  * It is the return type of real() and imag(), and most of the time this is the only way it is used.
+  *
+  * \sa MatrixBase::unaryViewExpr(const CustomUnaryOp &) const, class CwiseUnaryOp
+  */
 template<typename ViewOp, typename MatrixType>
 class CwiseUnaryView : public CwiseUnaryViewImpl<ViewOp, MatrixType, typename internal::traits<MatrixType>::StorageKind>
 {
@@ -62,6 +61,7 @@ class CwiseUnaryView : public CwiseUnaryViewImpl<ViewOp, MatrixType, typename in

    typedef typename CwiseUnaryViewImpl<ViewOp, MatrixType,typename internal::traits<MatrixType>::StorageKind>::Base Base;
    EIGEN_GENERIC_PUBLIC_INTERFACE(CwiseUnaryView)
+    typedef typename internal::ref_selector<MatrixType>::non_const_type MatrixTypeNested;
    typedef typename internal::remove_all<MatrixType>::type NestedExpression;

    explicit inline CwiseUnaryView(MatrixType& mat, const ViewOp& func = ViewOp())
@@ -76,15 +76,15 @@ class CwiseUnaryView : public CwiseUnaryViewImpl<ViewOp, MatrixType, typename in
    const ViewOp& functor() const { return m_functor; }

    /** \returns the nested expression */
-    const typename internal::remove_all<typename MatrixType::Nested>::type&
+    const typename internal::remove_all<MatrixTypeNested>::type&
    nestedExpression() const { return m_matrix; }

    /** \returns the nested expression */
-    typename internal::remove_all<typename MatrixType::Nested>::type&
-    nestedExpression() { return m_matrix.const_cast_derived(); }
+    typename internal::remove_reference<MatrixTypeNested>::type&
+    nestedExpression() { return m_matrix; }

  protected:
-    typename internal::ref_selector<MatrixType>::type m_matrix;
+    MatrixTypeNested m_matrix;
    ViewOp m_functor;
 };

--- a/Eigen/src/Core/DenseBase.h
+++ b/Eigen/src/Core/DenseBase.h
@@ -34,24 +34,18 @@ static inline void check_DenseIndex_is_signed() {
  * \tparam Derived is the derived type, e.g., a matrix type or an expression.
  *
  * This class can be extended with the help of the plugin mechanism described on the page
-  * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_DENSEBASE_PLUGIN.
+  * \ref TopicCustomizing_Plugins by defining the preprocessor symbol \c EIGEN_DENSEBASE_PLUGIN.
  *
-  * \sa \ref TopicClassHierarchy
+  * \sa \blank \ref TopicClassHierarchy
  */
 template<typename Derived> class DenseBase
 #ifndef EIGEN_PARSED_BY_DOXYGEN
-  : public internal::special_scalar_op_base<Derived,typename internal::traits<Derived>::Scalar,
-                                     typename NumTraits<typename internal::traits<Derived>::Scalar>::Real>
+  : public DenseCoeffsBase<Derived, internal::accessors_level<Derived>::value>
 #else
-  : public DenseCoeffsBase<Derived>
+  : public DenseCoeffsBase<Derived,DirectWriteAccessors>
 #endif // not EIGEN_PARSED_BY_DOXYGEN
 {
  public:
-    using internal::special_scalar_op_base<Derived,typename internal::traits<Derived>::Scalar,
-                typename NumTraits<typename internal::traits<Derived>::Scalar>::Real>::operator*;
-    using internal::special_scalar_op_base<Derived,typename internal::traits<Derived>::Scalar,
-                typename NumTraits<typename internal::traits<Derived>::Scalar>::Real>::operator/;
-

    /** Inner iterator type to iterate over the coefficients of a row or column.
      * \sa class InnerIterator
@@ -64,7 +58,7 @@ template<typename Derived> class DenseBase
      * \brief The type used to store indices
      * \details This typedef is relevant for types that store multiple indices such as
      *          PermutationMatrix or Transpositions, otherwise it defaults to Eigen::Index
-      * \sa \ref TopicPreprocessorDirectives, Eigen::Index, SparseMatrixBase.
+      * \sa \blank \ref TopicPreprocessorDirectives, Eigen::Index, SparseMatrixBase.
     */
    typedef typename internal::traits<Derived>::StorageIndex StorageIndex;

@@ -77,9 +71,8 @@ template<typename Derived> class DenseBase
    typedef Scalar value_type;
    
    typedef typename NumTraits<Scalar>::Real RealScalar;
+    typedef DenseCoeffsBase<Derived, internal::accessors_level<Derived>::value> Base;

-    typedef internal::special_scalar_op_base<Derived,typename internal::traits<Derived>::Scalar,
-                      typename NumTraits<typename internal::traits<Derived>::Scalar>::Real> Base;
    using Base::derived;
    using Base::const_cast_derived;
    using Base::rows;
@@ -157,13 +150,18 @@ template<typename Derived> class DenseBase
          * \sa SizeAtCompileTime, MaxRowsAtCompileTime, MaxColsAtCompileTime
          */

-      IsVectorAtCompileTime = internal::traits<Derived>::MaxRowsAtCompileTime == 1
-                           || internal::traits<Derived>::MaxColsAtCompileTime == 1,
+      IsVectorAtCompileTime = internal::traits<Derived>::RowsAtCompileTime == 1
+                           || internal::traits<Derived>::ColsAtCompileTime == 1,
        /**< This is set to true if either the number of rows or the number of
          * columns is known at compile-time to be equal to 1. Indeed, in that case,
          * we are dealing with a column-vector (if there is only one column) or with
          * a row-vector (if there is only one row). */

+      NumDimensions = int(MaxSizeAtCompileTime) == 1 ? 0 : bool(IsVectorAtCompileTime) ? 1 : 2,
+        /**< This value is equal to Tensor::NumDimensions, i.e. 0 for scalars, 1 for vectors, 
+         * and 2 for matrices.
+         */
+
      Flags = internal::traits<Derived>::Flags,
        /**< This stores expression \ref flags flags which may or may not be inherited by new expressions
          * constructed from this one. See the \ref flags "list of flags".
@@ -215,10 +213,6 @@ template<typename Derived> class DenseBase
      * of stored coefficients. */
    EIGEN_DEVICE_FUNC
    inline Index nonZeros() const { return size(); }
-    /** \returns true if either the number of rows or the number of columns is equal to 1.
-      * In other words, this function returns
-      * \code rows()==1 || cols()==1 \endcode
-      * \sa rows(), cols(), IsVectorAtCompileTime. */

    /** \returns the outer size.
      *
@@ -271,10 +265,10 @@ template<typename Derived> class DenseBase
 #ifndef EIGEN_PARSED_BY_DOXYGEN
    /** \internal Represents a matrix with all coefficients equal to one another*/
    typedef CwiseNullaryOp<internal::scalar_constant_op<Scalar>,PlainObject> ConstantReturnType;
-    /** \internal Represents a vector with linearly spaced coefficients that allows sequential access only. */
-    typedef CwiseNullaryOp<internal::linspaced_op<Scalar,PacketScalar,false>,PlainObject> SequentialLinSpacedReturnType;
+    /** \internal \deprecated Represents a vector with linearly spaced coefficients that allows sequential access only. */
+    EIGEN_DEPRECATED typedef CwiseNullaryOp<internal::linspaced_op<Scalar>,PlainObject> SequentialLinSpacedReturnType;
    /** \internal Represents a vector with linearly spaced coefficients that allows random access. */
-    typedef CwiseNullaryOp<internal::linspaced_op<Scalar,PacketScalar,true>,PlainObject> RandomAccessLinSpacedReturnType;
+    typedef CwiseNullaryOp<internal::linspaced_op<Scalar>,PlainObject> RandomAccessLinSpacedReturnType;
    /** \internal the return type of MatrixBase::eigenvalues() */
    typedef Matrix<typename NumTraits<typename internal::traits<Derived>::Scalar>::Real, internal::traits<Derived>::ColsAtCompileTime, 1> EigenvaluesReturnType;

@@ -282,13 +276,13 @@ template<typename Derived> class DenseBase

    /** Copies \a other into *this. \returns a reference to *this. */
    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    Derived& operator=(const DenseBase<OtherDerived>& other);

    /** Special case of the template operator=, in order to prevent the compiler
      * from generating a default operator= (issue hit with g++ 4.1)
      */
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    Derived& operator=(const DenseBase& other);

    template<typename OtherDerived>
@@ -307,18 +301,18 @@ template<typename Derived> class DenseBase
    EIGEN_DEVICE_FUNC
    Derived& operator=(const ReturnByValue<OtherDerived>& func);

-    /** \ínternal
-      * Copies \a other into *this without evaluating other. \returns a reference to *this.
-      * \deprecated */
+    /** \internal
+      * Copies \a other into *this without evaluating other. \returns a reference to *this. */
    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
+    /** \deprecated */
+    EIGEN_DEPRECATED EIGEN_DEVICE_FUNC
    Derived& lazyAssign(const DenseBase<OtherDerived>& other);

    EIGEN_DEVICE_FUNC
    CommaInitializer<Derived> operator<< (const Scalar& s);

-    /** \deprecated it now returns \c *this */
    template<unsigned int Added,unsigned int Removed>
+    /** \deprecated it now returns \c *this */
    EIGEN_DEPRECATED
    const Derived& flagged() const
    { return derived(); }
@@ -343,12 +337,13 @@ template<typename Derived> class DenseBase
    EIGEN_DEVICE_FUNC static const ConstantReturnType
    Constant(const Scalar& value);

-    EIGEN_DEVICE_FUNC static const SequentialLinSpacedReturnType
+    EIGEN_DEPRECATED EIGEN_DEVICE_FUNC static const RandomAccessLinSpacedReturnType
    LinSpaced(Sequential_t, Index size, const Scalar& low, const Scalar& high);
+    EIGEN_DEPRECATED EIGEN_DEVICE_FUNC static const RandomAccessLinSpacedReturnType
+    LinSpaced(Sequential_t, const Scalar& low, const Scalar& high);
+
    EIGEN_DEVICE_FUNC static const RandomAccessLinSpacedReturnType
    LinSpaced(Index size, const Scalar& low, const Scalar& high);
-    EIGEN_DEVICE_FUNC static const SequentialLinSpacedReturnType
-    LinSpaced(Sequential_t, const Scalar& low, const Scalar& high);
    EIGEN_DEVICE_FUNC static const RandomAccessLinSpacedReturnType
    LinSpaced(const Scalar& low, const Scalar& high);

@@ -380,7 +375,7 @@ template<typename Derived> class DenseBase
    template<typename OtherDerived> EIGEN_DEVICE_FUNC
    bool isApprox(const DenseBase<OtherDerived>& other,
                  const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
-    EIGEN_DEVICE_FUNC 
+    EIGEN_DEVICE_FUNC
    bool isMuchSmallerThan(const RealScalar& other,
                           const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
    template<typename OtherDerived> EIGEN_DEVICE_FUNC
@@ -391,22 +386,22 @@ template<typename Derived> class DenseBase
    EIGEN_DEVICE_FUNC bool isConstant(const Scalar& value, const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
    EIGEN_DEVICE_FUNC bool isZero(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
    EIGEN_DEVICE_FUNC bool isOnes(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
-    
+
    inline bool hasNaN() const;
    inline bool allFinite() const;

-    EIGEN_DEVICE_FUNC
-    inline Derived& operator*=(const Scalar& other);
-    EIGEN_DEVICE_FUNC
-    inline Derived& operator/=(const Scalar& other);
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Derived& operator*=(const Scalar& other);
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Derived& operator/=(const Scalar& other);

    typedef typename internal::add_const_on_value_type<typename internal::eval<Derived>::type>::type EvalReturnType;
    /** \returns the matrix or vector obtained by evaluating this expression.
      *
      * Notice that in the case of a plain matrix or vector (not an expression) this function just returns
      * a const reference, in order to avoid a useless copy.
-      * 
-      * \warning Be carefull with eval() and the auto C++ keyword, as detailed in this \link TopicPitfalls_auto_keyword page \endlink.
+      *
+      * \warning Be careful with eval() and the auto C++ keyword, as detailed in this \link TopicPitfalls_auto_keyword page \endlink.
      */
    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE EvalReturnType eval() const
@@ -421,7 +416,7 @@ template<typename Derived> class DenseBase
      *
      */
    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    void swap(const DenseBase<OtherDerived>& other)
    {
      EIGEN_STATIC_ASSERT(!OtherDerived::IsPlainObjectBase,THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY);
@@ -433,7 +428,7 @@ template<typename Derived> class DenseBase
      *
      */
    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    void swap(PlainObjectBase<OtherDerived>& other)
    {
      eigen_assert(rows()==other.rows() && cols()==other.cols());
@@ -474,7 +469,17 @@ template<typename Derived> class DenseBase
    EIGEN_DEVICE_FUNC
    void visit(Visitor& func) const;

-    inline const WithFormat<Derived> format(const IOFormat& fmt) const;
+    /** \returns a WithFormat proxy object allowing to print a matrix with the given
+      * format \a fmt.
+      *
+      * See class IOFormat for some examples.
+      *
+      * \sa class IOFormat, class WithFormat
+      */
+    inline const WithFormat<Derived> format(const IOFormat& fmt) const
+    {
+      return WithFormat<Derived>(derived(), fmt);
+    }

    /** \returns the unique coefficient of a 1x1 expression */
    EIGEN_DEVICE_FUNC
@@ -485,16 +490,16 @@ template<typename Derived> class DenseBase
      return derived().coeff(0,0);
    }

-    bool all() const;
-    bool any() const;
-    Index count() const;
+    EIGEN_DEVICE_FUNC bool all() const;
+    EIGEN_DEVICE_FUNC bool any() const;
+    EIGEN_DEVICE_FUNC Index count() const;

    typedef VectorwiseOp<Derived, Horizontal> RowwiseReturnType;
    typedef const VectorwiseOp<const Derived, Horizontal> ConstRowwiseReturnType;
    typedef VectorwiseOp<Derived, Vertical> ColwiseReturnType;
    typedef const VectorwiseOp<const Derived, Vertical> ConstColwiseReturnType;

-    /** \returns a VectorwiseOp wrapper of *this providing additional partial reduction operations
+    /** \returns a VectorwiseOp wrapper of *this for broadcasting and partial reductions
    *
    * Example: \include MatrixBase_rowwise.cpp
    * Output: \verbinclude MatrixBase_rowwise.out
@@ -507,7 +512,7 @@ template<typename Derived> class DenseBase
    }
    EIGEN_DEVICE_FUNC RowwiseReturnType rowwise();

-    /** \returns a VectorwiseOp wrapper of *this providing additional partial reduction operations
+    /** \returns a VectorwiseOp wrapper of *this for broadcasting and partial reductions
    *
    * Example: \include MatrixBase_colwise.cpp
    * Output: \verbinclude MatrixBase_colwise.out
@@ -568,13 +573,59 @@ template<typename Derived> class DenseBase
    }
    EIGEN_DEVICE_FUNC void reverseInPlace();

+    #ifdef EIGEN_PARSED_BY_DOXYGEN
+    /** STL-like <a href="https://en.cppreference.com/w/cpp/named_req/RandomAccessIterator">RandomAccessIterator</a>
+      * iterator type as returned by the begin() and end() methods.
+      */
+    typedef random_access_iterator_type iterator;
+    /** This is the const version of iterator (aka read-only) */
+    typedef random_access_iterator_type const_iterator;
+    #else
+    typedef typename internal::conditional< (Flags&DirectAccessBit)==DirectAccessBit,
+                                            internal::pointer_based_stl_iterator<Derived>,
+                                            internal::generic_randaccess_stl_iterator<Derived>
+                                          >::type iterator_type;
+
+    typedef typename internal::conditional< (Flags&DirectAccessBit)==DirectAccessBit,
+                                            internal::pointer_based_stl_iterator<const Derived>,
+                                            internal::generic_randaccess_stl_iterator<const Derived>
+                                          >::type const_iterator_type;
+
+    // Stl-style iterators are supported only for vectors.
+
+    typedef typename internal::conditional< IsVectorAtCompileTime,
+                                            iterator_type,
+                                            void
+                                          >::type iterator;
+
+    typedef typename internal::conditional< IsVectorAtCompileTime,
+                                            const_iterator_type,
+                                            void
+                                          >::type const_iterator;
+    #endif
+
+    inline iterator begin();
+    inline const_iterator begin() const;
+    inline const_iterator cbegin() const;
+    inline iterator end();
+    inline const_iterator end() const;
+    inline const_iterator cend() const;
+
 #define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::DenseBase
+#define EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+#define EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(COND)
+#define EIGEN_DOC_UNARY_ADDONS(X,Y)
+#   include "../plugins/CommonCwiseUnaryOps.h"
 #   include "../plugins/BlockMethods.h"
+#   include "../plugins/IndexedViewMethods.h"
+#   include "../plugins/ReshapedMethods.h"
 #   ifdef EIGEN_DENSEBASE_PLUGIN
 #     include EIGEN_DENSEBASE_PLUGIN
 #   endif
 #undef EIGEN_CURRENT_STORAGE_BASE_CLASS
-
+#undef EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+#undef EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF
+#undef EIGEN_DOC_UNARY_ADDONS

    // disable the use of evalTo for dense objects with a nice compilation error
    template<typename Dest>
--- a/Eigen/src/Core/DenseCoeffsBase.h
+++ b/Eigen/src/Core/DenseCoeffsBase.h
@@ -22,7 +22,8 @@ template<typename T> struct add_const_on_value_type_if_arithmetic
 /** \brief Base class providing read-only coefficient access to matrices and arrays.
  * \ingroup Core_Module
  * \tparam Derived Type of the derived class
-  * \tparam #ReadOnlyAccessors Constant indicating read-only access
+  *
+  * \note #ReadOnlyAccessors Constant indicating read-only access
  *
  * This class defines the \c operator() \c const function and friends, which can be used to read specific
  * entries of a matrix or array.
@@ -138,6 +139,8 @@ class DenseCoeffsBase<Derived,ReadOnlyAccessors> : public EigenBase<Derived>
    EIGEN_STRONG_INLINE CoeffReturnType
    coeff(Index index) const
    {
+      EIGEN_STATIC_ASSERT(internal::evaluator<Derived>::Flags & LinearAccessBit,
+                          THIS_COEFFICIENT_ACCESSOR_TAKING_ONE_ACCESS_IS_ONLY_FOR_EXPRESSIONS_ALLOWING_LINEAR_ACCESS)
      eigen_internal_assert(index >= 0 && index < size());
      return internal::evaluator<Derived>(derived()).coeff(index);
    }
@@ -189,19 +192,31 @@ class DenseCoeffsBase<Derived,ReadOnlyAccessors> : public EigenBase<Derived>

    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE CoeffReturnType
-    y() const { return (*this)[1]; }
+    y() const
+    {
+      EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime==-1 || Derived::SizeAtCompileTime>=2, OUT_OF_RANGE_ACCESS);
+      return (*this)[1];
+    }

    /** equivalent to operator[](2).  */

    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE CoeffReturnType
-    z() const { return (*this)[2]; }
+    z() const
+    {
+      EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime==-1 || Derived::SizeAtCompileTime>=3, OUT_OF_RANGE_ACCESS);
+      return (*this)[2];
+    }

    /** equivalent to operator[](3).  */

    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE CoeffReturnType
-    w() const { return (*this)[3]; }
+    w() const
+    {
+      EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime==-1 || Derived::SizeAtCompileTime>=4, OUT_OF_RANGE_ACCESS);
+      return (*this)[3];
+    }

    /** \internal
      * \returns the packet of coefficients starting at the given row and column. It is your responsibility
@@ -243,6 +258,8 @@ class DenseCoeffsBase<Derived,ReadOnlyAccessors> : public EigenBase<Derived>
    template<int LoadMode>
    EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
    {
+      EIGEN_STATIC_ASSERT(internal::evaluator<Derived>::Flags & LinearAccessBit,
+                          THIS_COEFFICIENT_ACCESSOR_TAKING_ONE_ACCESS_IS_ONLY_FOR_EXPRESSIONS_ALLOWING_LINEAR_ACCESS)
      typedef typename internal::packet_traits<Scalar>::type DefaultPacketType;
      eigen_internal_assert(index >= 0 && index < size());
      return internal::evaluator<Derived>(derived()).template packet<LoadMode,DefaultPacketType>(index);
@@ -272,7 +289,8 @@ class DenseCoeffsBase<Derived,ReadOnlyAccessors> : public EigenBase<Derived>
 /** \brief Base class providing read/write coefficient access to matrices and arrays.
  * \ingroup Core_Module
  * \tparam Derived Type of the derived class
-  * \tparam #WriteAccessors Constant indicating read/write access
+  *
+  * \note #WriteAccessors Constant indicating read/write access
  *
  * This class defines the non-const \c operator() function and friends, which can be used to write specific
  * entries of a matrix or array. This class inherits DenseCoeffsBase<Derived, ReadOnlyAccessors> which
@@ -370,6 +388,8 @@ class DenseCoeffsBase<Derived, WriteAccessors> : public DenseCoeffsBase<Derived,
    EIGEN_STRONG_INLINE Scalar&
    coeffRef(Index index)
    {
+      EIGEN_STATIC_ASSERT(internal::evaluator<Derived>::Flags & LinearAccessBit,
+                          THIS_COEFFICIENT_ACCESSOR_TAKING_ONE_ACCESS_IS_ONLY_FOR_EXPRESSIONS_ALLOWING_LINEAR_ACCESS)
      eigen_internal_assert(index >= 0 && index < size());
      return internal::evaluator<Derived>(derived()).coeffRef(index);
    }
@@ -418,31 +438,44 @@ class DenseCoeffsBase<Derived, WriteAccessors> : public DenseCoeffsBase<Derived,

    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Scalar&
-    y() { return (*this)[1]; }
+    y()
+    {
+      EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime==-1 || Derived::SizeAtCompileTime>=2, OUT_OF_RANGE_ACCESS);
+      return (*this)[1];
+    }

    /** equivalent to operator[](2).  */

    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Scalar&
-    z() { return (*this)[2]; }
+    z()
+    {
+      EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime==-1 || Derived::SizeAtCompileTime>=3, OUT_OF_RANGE_ACCESS);
+      return (*this)[2];
+    }

    /** equivalent to operator[](3).  */

    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Scalar&
-    w() { return (*this)[3]; }
+    w()
+    {
+      EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime==-1 || Derived::SizeAtCompileTime>=4, OUT_OF_RANGE_ACCESS);
+      return (*this)[3];
+    }
 };

 /** \brief Base class providing direct read-only coefficient access to matrices and arrays.
  * \ingroup Core_Module
  * \tparam Derived Type of the derived class
-  * \tparam #DirectAccessors Constant indicating direct access
+  *
+  * \note #DirectAccessors Constant indicating direct access
  *
  * This class defines functions to work with strides which can be used to access entries directly. This class
  * inherits DenseCoeffsBase<Derived, ReadOnlyAccessors> which defines functions to access entries read-only using
  * \c operator() .
  *
-  * \sa \ref TopicClassHierarchy
+  * \sa \blank \ref TopicClassHierarchy
  */
 template<typename Derived>
 class DenseCoeffsBase<Derived, DirectAccessors> : public DenseCoeffsBase<Derived, ReadOnlyAccessors>
@@ -509,13 +542,14 @@ class DenseCoeffsBase<Derived, DirectAccessors> : public DenseCoeffsBase<Derived
 /** \brief Base class providing direct read/write coefficient access to matrices and arrays.
  * \ingroup Core_Module
  * \tparam Derived Type of the derived class
-  * \tparam #DirectWriteAccessors Constant indicating direct access
+  *
+  * \note #DirectWriteAccessors Constant indicating direct access
  *
  * This class defines functions to work with strides which can be used to access entries directly. This class
  * inherits DenseCoeffsBase<Derived, WriteAccessors> which defines functions to access entries read/write using
  * \c operator().
  *
-  * \sa \ref TopicClassHierarchy
+  * \sa \blank \ref TopicClassHierarchy
  */
 template<typename Derived>
 class DenseCoeffsBase<Derived, DirectWriteAccessors>
@@ -594,7 +628,7 @@ struct first_aligned_impl<Alignment, Derived, false>
 {
  static inline Index run(const Derived& m)
  {
-    return internal::first_aligned<Alignment>(&m.const_cast_derived().coeffRef(0,0), m.size());
+    return internal::first_aligned<Alignment>(m.data(), m.size());
  }
 };

@@ -617,7 +651,7 @@ static inline Index first_default_aligned(const DenseBase<Derived>& m)
 {
  typedef typename Derived::Scalar Scalar;
  typedef typename packet_traits<Scalar>::type DefaultPacketType;
-  return first_aligned<unpacket_traits<DefaultPacketType>::alignment>(m);
+  return internal::first_aligned<int(unpacket_traits<DefaultPacketType>::alignment),Derived>(m);
 }

 template<typename Derived, bool HasDirectAccess = has_direct_access<Derived>::ret>
--- a/Eigen/src/Core/DenseStorage.h
+++ b/Eigen/src/Core/DenseStorage.h
@@ -13,9 +13,9 @@
 #define EIGEN_MATRIXSTORAGE_H

 #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
-  #define EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN EIGEN_DENSE_STORAGE_CTOR_PLUGIN;
+  #define EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(X) X; EIGEN_DENSE_STORAGE_CTOR_PLUGIN;
 #else
-  #define EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
+  #define EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(X)
 #endif

 namespace Eigen {
@@ -61,19 +61,19 @@ struct plain_array
 #if defined(EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT)
  #define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(sizemask)
 #elif EIGEN_GNUC_AT_LEAST(4,7) 
-  // GCC 4.7 is too aggressive in its optimizations and remove the alignement test based on the fact the array is declared to be aligned.
+  // GCC 4.7 is too aggressive in its optimizations and removes the alignment test based on the fact that the array is declared to be aligned.
  // See this bug report: http://gcc.gnu.org/bugzilla/show_bug.cgi?id=53900
  // Hiding the origin of the array pointer behind a function argument seems to do the trick even if the function is inlined:
  template<typename PtrType>
  EIGEN_ALWAYS_INLINE PtrType eigen_unaligned_array_assert_workaround_gcc47(PtrType array) { return array; }
  #define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(sizemask) \
-    eigen_assert((reinterpret_cast<size_t>(eigen_unaligned_array_assert_workaround_gcc47(array)) & (sizemask)) == 0 \
+    eigen_assert((internal::UIntPtr(eigen_unaligned_array_assert_workaround_gcc47(array)) & (sizemask)) == 0 \
              && "this assertion is explained here: " \
              "http://eigen.tuxfamily.org/dox-devel/group__TopicUnalignedArrayAssert.html" \
              " **** READ THIS WEB PAGE !!! ****");
 #else
  #define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(sizemask) \
-    eigen_assert((reinterpret_cast<size_t>(array) & (sizemask)) == 0 \
+    eigen_assert((internal::UIntPtr(array) & (sizemask)) == 0 \
              && "this assertion is explained here: " \
              "http://eigen.tuxfamily.org/dox-devel/group__TopicUnalignedArrayAssert.html" \
              " **** READ THIS WEB PAGE !!! ****");
@@ -184,12 +184,16 @@ template<typename T, int Size, int _Rows, int _Cols, int _Options> class DenseSt
 {
    internal::plain_array<T,Size,_Options> m_data;
  public:
-    EIGEN_DEVICE_FUNC DenseStorage() {}
+    EIGEN_DEVICE_FUNC DenseStorage() {
+      EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = Size)
+    }
    EIGEN_DEVICE_FUNC
    explicit DenseStorage(internal::constructor_without_unaligned_array_assert)
      : m_data(internal::constructor_without_unaligned_array_assert()) {}
    EIGEN_DEVICE_FUNC 
-    DenseStorage(const DenseStorage& other) : m_data(other.m_data) {}
+    DenseStorage(const DenseStorage& other) : m_data(other.m_data) {
+      EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = Size)
+    }
    EIGEN_DEVICE_FUNC 
    DenseStorage& operator=(const DenseStorage& other)
    { 
@@ -197,13 +201,15 @@ template<typename T, int Size, int _Rows, int _Cols, int _Options> class DenseSt
      return *this; 
    }
    EIGEN_DEVICE_FUNC DenseStorage(Index size, Index rows, Index cols) {
-      EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
+      EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
      eigen_internal_assert(size==rows*cols && rows==_Rows && cols==_Cols);
      EIGEN_UNUSED_VARIABLE(size);
      EIGEN_UNUSED_VARIABLE(rows);
      EIGEN_UNUSED_VARIABLE(cols);
    }
-    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); }
+    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) {
+      numext::swap(m_data, other.m_data);
+    }
    EIGEN_DEVICE_FUNC static Index rows(void) {return _Rows;}
    EIGEN_DEVICE_FUNC static Index cols(void) {return _Cols;}
    EIGEN_DEVICE_FUNC void conservativeResize(Index,Index,Index) {}
@@ -263,7 +269,11 @@ template<typename T, int Size, int _Options> class DenseStorage<T, Size, Dynamic
    }
    EIGEN_DEVICE_FUNC DenseStorage(Index, Index rows, Index cols) : m_rows(rows), m_cols(cols) {}
    EIGEN_DEVICE_FUNC void swap(DenseStorage& other)
-    { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); std::swap(m_cols,other.m_cols); }
+    {
+      numext::swap(m_data,other.m_data);
+      numext::swap(m_rows,other.m_rows);
+      numext::swap(m_cols,other.m_cols);
+    }
    EIGEN_DEVICE_FUNC Index rows() const {return m_rows;}
    EIGEN_DEVICE_FUNC Index cols() const {return m_cols;}
    EIGEN_DEVICE_FUNC void conservativeResize(Index, Index rows, Index cols) { m_rows = rows; m_cols = cols; }
@@ -292,7 +302,11 @@ template<typename T, int Size, int _Cols, int _Options> class DenseStorage<T, Si
      return *this; 
    }
    EIGEN_DEVICE_FUNC DenseStorage(Index, Index rows, Index) : m_rows(rows) {}
-    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); }
+    EIGEN_DEVICE_FUNC void swap(DenseStorage& other)
+    {
+      numext::swap(m_data,other.m_data);
+      numext::swap(m_rows,other.m_rows);
+    }
    EIGEN_DEVICE_FUNC Index rows(void) const {return m_rows;}
    EIGEN_DEVICE_FUNC Index cols(void) const {return _Cols;}
    EIGEN_DEVICE_FUNC void conservativeResize(Index, Index rows, Index) { m_rows = rows; }
@@ -321,11 +335,14 @@ template<typename T, int Size, int _Rows, int _Options> class DenseStorage<T, Si
      return *this;
    }
    EIGEN_DEVICE_FUNC DenseStorage(Index, Index, Index cols) : m_cols(cols) {}
-    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_cols,other.m_cols); }
+    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) {
+      numext::swap(m_data,other.m_data);
+      numext::swap(m_cols,other.m_cols);
+    }
    EIGEN_DEVICE_FUNC Index rows(void) const {return _Rows;}
    EIGEN_DEVICE_FUNC Index cols(void) const {return m_cols;}
-    void conservativeResize(Index, Index, Index cols) { m_cols = cols; }
-    void resize(Index, Index, Index cols) { m_cols = cols; }
+    EIGEN_DEVICE_FUNC void conservativeResize(Index, Index, Index cols) { m_cols = cols; }
+    EIGEN_DEVICE_FUNC void resize(Index, Index, Index cols) { m_cols = cols; }
    EIGEN_DEVICE_FUNC const T *data() const { return m_data.array; }
    EIGEN_DEVICE_FUNC T *data() { return m_data.array; }
 };
@@ -343,7 +360,7 @@ template<typename T, int _Options> class DenseStorage<T, Dynamic, Dynamic, Dynam
    EIGEN_DEVICE_FUNC DenseStorage(Index size, Index rows, Index cols)
      : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size)), m_rows(rows), m_cols(cols)
    {
-      EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
+      EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
      eigen_internal_assert(size==rows*cols && rows>=0 && cols >=0);
    }
    EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other)
@@ -351,6 +368,7 @@ template<typename T, int _Options> class DenseStorage<T, Dynamic, Dynamic, Dynam
      , m_rows(other.m_rows)
      , m_cols(other.m_cols)
    {
+      EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = m_rows*m_cols)
      internal::smart_copy(other.m_data, other.m_data+other.m_rows*other.m_cols, m_data);
    }
    EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other)
@@ -362,9 +380,9 @@ template<typename T, int _Options> class DenseStorage<T, Dynamic, Dynamic, Dynam
      }
      return *this;
    }
-#ifdef EIGEN_HAVE_RVALUE_REFERENCES
+#if EIGEN_HAS_RVALUE_REFERENCES
    EIGEN_DEVICE_FUNC
-    DenseStorage(DenseStorage&& other)
+    DenseStorage(DenseStorage&& other) EIGEN_NOEXCEPT
      : m_data(std::move(other.m_data))
      , m_rows(std::move(other.m_rows))
      , m_cols(std::move(other.m_cols))
@@ -374,18 +392,21 @@ template<typename T, int _Options> class DenseStorage<T, Dynamic, Dynamic, Dynam
      other.m_cols = 0;
    }
    EIGEN_DEVICE_FUNC
-    DenseStorage& operator=(DenseStorage&& other)
+    DenseStorage& operator=(DenseStorage&& other) EIGEN_NOEXCEPT
    {
-      using std::swap;
-      swap(m_data, other.m_data);
-      swap(m_rows, other.m_rows);
-      swap(m_cols, other.m_cols);
+      numext::swap(m_data, other.m_data);
+      numext::swap(m_rows, other.m_rows);
+      numext::swap(m_cols, other.m_cols);
      return *this;
    }
 #endif
    EIGEN_DEVICE_FUNC ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, m_rows*m_cols); }
    EIGEN_DEVICE_FUNC void swap(DenseStorage& other)
-    { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); std::swap(m_cols,other.m_cols); }
+    {
+      numext::swap(m_data,other.m_data);
+      numext::swap(m_rows,other.m_rows);
+      numext::swap(m_cols,other.m_cols);
+    }
    EIGEN_DEVICE_FUNC Index rows(void) const {return m_rows;}
    EIGEN_DEVICE_FUNC Index cols(void) const {return m_cols;}
    void conservativeResize(Index size, Index rows, Index cols)
@@ -399,11 +420,11 @@ template<typename T, int _Options> class DenseStorage<T, Dynamic, Dynamic, Dynam
      if(size != m_rows*m_cols)
      {
        internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, m_rows*m_cols);
-        if (size)
+        if (size>0) // >0 and not simply !=0 to let the compiler know that size cannot be negative
          m_data = internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size);
        else
          m_data = 0;
-        EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
+        EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
      }
      m_rows = rows;
      m_cols = cols;
@@ -422,7 +443,7 @@ template<typename T, int _Rows, int _Options> class DenseStorage<T, Dynamic, _Ro
    explicit DenseStorage(internal::constructor_without_unaligned_array_assert) : m_data(0), m_cols(0) {}
    EIGEN_DEVICE_FUNC DenseStorage(Index size, Index rows, Index cols) : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size)), m_cols(cols)
    {
-      EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
+      EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
      eigen_internal_assert(size==rows*cols && rows==_Rows && cols >=0);
      EIGEN_UNUSED_VARIABLE(rows);
    }
@@ -430,6 +451,7 @@ template<typename T, int _Rows, int _Options> class DenseStorage<T, Dynamic, _Ro
      : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(_Rows*other.m_cols))
      , m_cols(other.m_cols)
    {
+      EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = m_cols*_Rows)
      internal::smart_copy(other.m_data, other.m_data+_Rows*m_cols, m_data);
    }
    EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other)
@@ -441,9 +463,9 @@ template<typename T, int _Rows, int _Options> class DenseStorage<T, Dynamic, _Ro
      }
      return *this;
    }    
-#ifdef EIGEN_HAVE_RVALUE_REFERENCES
+#if EIGEN_HAS_RVALUE_REFERENCES
    EIGEN_DEVICE_FUNC
-    DenseStorage(DenseStorage&& other)
+    DenseStorage(DenseStorage&& other) EIGEN_NOEXCEPT
      : m_data(std::move(other.m_data))
      , m_cols(std::move(other.m_cols))
    {
@@ -451,16 +473,18 @@ template<typename T, int _Rows, int _Options> class DenseStorage<T, Dynamic, _Ro
      other.m_cols = 0;
    }
    EIGEN_DEVICE_FUNC
-    DenseStorage& operator=(DenseStorage&& other)
+    DenseStorage& operator=(DenseStorage&& other) EIGEN_NOEXCEPT
    {
-      using std::swap;
-      swap(m_data, other.m_data);
-      swap(m_cols, other.m_cols);
+      numext::swap(m_data, other.m_data);
+      numext::swap(m_cols, other.m_cols);
      return *this;
    }
 #endif
    EIGEN_DEVICE_FUNC ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Rows*m_cols); }
-    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_cols,other.m_cols); }
+    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) {
+      numext::swap(m_data,other.m_data);
+      numext::swap(m_cols,other.m_cols);
+    }
    EIGEN_DEVICE_FUNC static Index rows(void) {return _Rows;}
    EIGEN_DEVICE_FUNC Index cols(void) const {return m_cols;}
    EIGEN_DEVICE_FUNC void conservativeResize(Index size, Index, Index cols)
@@ -473,11 +497,11 @@ template<typename T, int _Rows, int _Options> class DenseStorage<T, Dynamic, _Ro
      if(size != _Rows*m_cols)
      {
        internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Rows*m_cols);
-        if (size)
+        if (size>0) // >0 and not simply !=0 to let the compiler know that size cannot be negative
          m_data = internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size);
        else
          m_data = 0;
-        EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
+        EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
      }
      m_cols = cols;
    }
@@ -495,7 +519,7 @@ template<typename T, int _Cols, int _Options> class DenseStorage<T, Dynamic, Dyn
    explicit DenseStorage(internal::constructor_without_unaligned_array_assert) : m_data(0), m_rows(0) {}
    EIGEN_DEVICE_FUNC DenseStorage(Index size, Index rows, Index cols) : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size)), m_rows(rows)
    {
-      EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
+      EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
      eigen_internal_assert(size==rows*cols && rows>=0 && cols == _Cols);
      EIGEN_UNUSED_VARIABLE(cols);
    }
@@ -503,6 +527,7 @@ template<typename T, int _Cols, int _Options> class DenseStorage<T, Dynamic, Dyn
      : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(other.m_rows*_Cols))
      , m_rows(other.m_rows)
    {
+      EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = m_rows*_Cols)
      internal::smart_copy(other.m_data, other.m_data+other.m_rows*_Cols, m_data);
    }
    EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other)
@@ -514,9 +539,9 @@ template<typename T, int _Cols, int _Options> class DenseStorage<T, Dynamic, Dyn
      }
      return *this;
    }    
-#ifdef EIGEN_HAVE_RVALUE_REFERENCES
+#if EIGEN_HAS_RVALUE_REFERENCES
    EIGEN_DEVICE_FUNC
-    DenseStorage(DenseStorage&& other)
+    DenseStorage(DenseStorage&& other) EIGEN_NOEXCEPT
      : m_data(std::move(other.m_data))
      , m_rows(std::move(other.m_rows))
    {
@@ -524,16 +549,18 @@ template<typename T, int _Cols, int _Options> class DenseStorage<T, Dynamic, Dyn
      other.m_rows = 0;
    }
    EIGEN_DEVICE_FUNC
-    DenseStorage& operator=(DenseStorage&& other)
+    DenseStorage& operator=(DenseStorage&& other) EIGEN_NOEXCEPT
    {
-      using std::swap;
-      swap(m_data, other.m_data);
-      swap(m_rows, other.m_rows);
+      numext::swap(m_data, other.m_data);
+      numext::swap(m_rows, other.m_rows);
      return *this;
    }
 #endif
    EIGEN_DEVICE_FUNC ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Cols*m_rows); }
-    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); }
+    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) {
+      numext::swap(m_data,other.m_data);
+      numext::swap(m_rows,other.m_rows);
+    }
    EIGEN_DEVICE_FUNC Index rows(void) const {return m_rows;}
    EIGEN_DEVICE_FUNC static Index cols(void) {return _Cols;}
    void conservativeResize(Index size, Index rows, Index)
@@ -546,11 +573,11 @@ template<typename T, int _Cols, int _Options> class DenseStorage<T, Dynamic, Dyn
      if(size != m_rows*_Cols)
      {
        internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Cols*m_rows);
-        if (size)
+        if (size>0) // >0 and not simply !=0 to let the compiler know that size cannot be negative
          m_data = internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size);
        else
          m_data = 0;
-        EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
+        EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
      }
      m_rows = rows;
    }
--- a/Eigen/src/Core/Diagonal.h
+++ b/Eigen/src/Core/Diagonal.h
@@ -21,7 +21,7 @@ namespace Eigen {
  * \param MatrixType the type of the object in which we are taking a sub/main/super diagonal
  * \param DiagIndex the index of the sub/super diagonal. The default is 0 and it means the main diagonal.
  *              A positive value means a superdiagonal, a negative value means a subdiagonal.
-  *              You can also use Dynamic so the index can be set at runtime.
+  *              You can also use DynamicIndex so the index can be set at runtime.
  *
  * The matrix is not required to be square.
  *
@@ -70,7 +70,10 @@ template<typename MatrixType, int _DiagIndex> class Diagonal
    EIGEN_DENSE_PUBLIC_INTERFACE(Diagonal)

    EIGEN_DEVICE_FUNC
-    explicit inline Diagonal(MatrixType& matrix, Index a_index = DiagIndex) : m_matrix(matrix), m_index(a_index) {}
+    explicit inline Diagonal(MatrixType& matrix, Index a_index = DiagIndex) : m_matrix(matrix), m_index(a_index)
+    {
+      eigen_assert( a_index <= m_matrix.cols() && -a_index <= m_matrix.rows() );
+    }

    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Diagonal)

@@ -103,21 +106,21 @@ template<typename MatrixType, int _DiagIndex> class Diagonal
                     >::type ScalarWithConstIfNotLvalue;

    EIGEN_DEVICE_FUNC
-    inline ScalarWithConstIfNotLvalue* data() { return &(m_matrix.const_cast_derived().coeffRef(rowOffset(), colOffset())); }
+    inline ScalarWithConstIfNotLvalue* data() { return &(m_matrix.coeffRef(rowOffset(), colOffset())); }
    EIGEN_DEVICE_FUNC
-    inline const Scalar* data() const { return &(m_matrix.const_cast_derived().coeffRef(rowOffset(), colOffset())); }
+    inline const Scalar* data() const { return &(m_matrix.coeffRef(rowOffset(), colOffset())); }

    EIGEN_DEVICE_FUNC
    inline Scalar& coeffRef(Index row, Index)
    {
      EIGEN_STATIC_ASSERT_LVALUE(MatrixType)
-      return m_matrix.const_cast_derived().coeffRef(row+rowOffset(), row+colOffset());
+      return m_matrix.coeffRef(row+rowOffset(), row+colOffset());
    }

    EIGEN_DEVICE_FUNC
    inline const Scalar& coeffRef(Index row, Index) const
    {
-      return m_matrix.const_cast_derived().coeffRef(row+rowOffset(), row+colOffset());
+      return m_matrix.coeffRef(row+rowOffset(), row+colOffset());
    }

    EIGEN_DEVICE_FUNC
@@ -130,13 +133,13 @@ template<typename MatrixType, int _DiagIndex> class Diagonal
    inline Scalar& coeffRef(Index idx)
    {
      EIGEN_STATIC_ASSERT_LVALUE(MatrixType)
-      return m_matrix.const_cast_derived().coeffRef(idx+rowOffset(), idx+colOffset());
+      return m_matrix.coeffRef(idx+rowOffset(), idx+colOffset());
    }

    EIGEN_DEVICE_FUNC
    inline const Scalar& coeffRef(Index idx) const
    {
-      return m_matrix.const_cast_derived().coeffRef(idx+rowOffset(), idx+colOffset());
+      return m_matrix.coeffRef(idx+rowOffset(), idx+colOffset());
    }

    EIGEN_DEVICE_FUNC
@@ -159,7 +162,7 @@ template<typename MatrixType, int _DiagIndex> class Diagonal
    }

  protected:
-    typename MatrixType::Nested m_matrix;
+    typename internal::ref_selector<MatrixType>::non_const_type m_matrix;
    const internal::variable_if_dynamicindex<Index, DiagIndex> m_index;

  private:
@@ -184,7 +187,7 @@ template<typename MatrixType, int _DiagIndex> class Diagonal
  *
  * \sa class Diagonal */
 template<typename Derived>
-inline typename MatrixBase<Derived>::DiagonalReturnType
+EIGEN_DEVICE_FUNC inline typename MatrixBase<Derived>::DiagonalReturnType
 MatrixBase<Derived>::diagonal()
 {
  return DiagonalReturnType(derived());
@@ -192,7 +195,7 @@ MatrixBase<Derived>::diagonal()

 /** This is the const version of diagonal(). */
 template<typename Derived>
-inline typename MatrixBase<Derived>::ConstDiagonalReturnType
+EIGEN_DEVICE_FUNC inline typename MatrixBase<Derived>::ConstDiagonalReturnType
 MatrixBase<Derived>::diagonal() const
 {
  return ConstDiagonalReturnType(derived());
@@ -210,7 +213,7 @@ MatrixBase<Derived>::diagonal() const
  *
  * \sa MatrixBase::diagonal(), class Diagonal */
 template<typename Derived>
-inline typename MatrixBase<Derived>::DiagonalDynamicIndexReturnType
+EIGEN_DEVICE_FUNC inline typename MatrixBase<Derived>::DiagonalDynamicIndexReturnType
 MatrixBase<Derived>::diagonal(Index index)
 {
  return DiagonalDynamicIndexReturnType(derived(), index);
@@ -218,7 +221,7 @@ MatrixBase<Derived>::diagonal(Index index)

 /** This is the const version of diagonal(Index). */
 template<typename Derived>
-inline typename MatrixBase<Derived>::ConstDiagonalDynamicIndexReturnType
+EIGEN_DEVICE_FUNC inline typename MatrixBase<Derived>::ConstDiagonalDynamicIndexReturnType
 MatrixBase<Derived>::diagonal(Index index) const
 {
  return ConstDiagonalDynamicIndexReturnType(derived(), index);
@@ -237,6 +240,7 @@ MatrixBase<Derived>::diagonal(Index index) const
  * \sa MatrixBase::diagonal(), class Diagonal */
 template<typename Derived>
 template<int Index_>
+EIGEN_DEVICE_FUNC
 inline typename MatrixBase<Derived>::template DiagonalIndexReturnType<Index_>::Type
 MatrixBase<Derived>::diagonal()
 {
@@ -246,6 +250,7 @@ MatrixBase<Derived>::diagonal()
 /** This is the const version of diagonal<int>(). */
 template<typename Derived>
 template<int Index_>
+EIGEN_DEVICE_FUNC
 inline typename MatrixBase<Derived>::template ConstDiagonalIndexReturnType<Index_>::Type
 MatrixBase<Derived>::diagonal() const
 {
--- a/Eigen/src/Core/DiagonalMatrix.h
+++ b/Eigen/src/Core/DiagonalMatrix.h
@@ -44,7 +44,7 @@ class DiagonalBase : public EigenBase<Derived>

    EIGEN_DEVICE_FUNC
    DenseMatrixType toDenseMatrix() const { return derived(); }
-    
+
    EIGEN_DEVICE_FUNC
    inline const DiagonalVectorType& diagonal() const { return derived().diagonal(); }
    EIGEN_DEVICE_FUNC
@@ -71,18 +71,41 @@ class DiagonalBase : public EigenBase<Derived>
      return InverseReturnType(diagonal().cwiseInverse());
    }
    
-    typedef DiagonalWrapper<const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DiagonalVectorType> > ScalarMultipleReturnType;
    EIGEN_DEVICE_FUNC
-    inline const ScalarMultipleReturnType
+    inline const DiagonalWrapper<const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(DiagonalVectorType,Scalar,product) >
    operator*(const Scalar& scalar) const
    {
-      return ScalarMultipleReturnType(diagonal() * scalar);
+      return DiagonalWrapper<const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(DiagonalVectorType,Scalar,product) >(diagonal() * scalar);
    }
    EIGEN_DEVICE_FUNC
-    friend inline const ScalarMultipleReturnType
+    friend inline const DiagonalWrapper<const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar,DiagonalVectorType,product) >
    operator*(const Scalar& scalar, const DiagonalBase& other)
    {
-      return ScalarMultipleReturnType(other.diagonal() * scalar);
+      return DiagonalWrapper<const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar,DiagonalVectorType,product) >(scalar * other.diagonal());
+    }
+
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    #ifdef EIGEN_PARSED_BY_DOXYGEN
+    inline unspecified_expression_type
+    #else
+    inline const DiagonalWrapper<const EIGEN_CWISE_BINARY_RETURN_TYPE(DiagonalVectorType,typename OtherDerived::DiagonalVectorType,sum) >
+    #endif
+    operator+(const DiagonalBase<OtherDerived>& other) const
+    {
+      return (diagonal() + other.diagonal()).asDiagonal();
+    }
+
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    #ifdef EIGEN_PARSED_BY_DOXYGEN
+    inline unspecified_expression_type
+    #else
+    inline const DiagonalWrapper<const EIGEN_CWISE_BINARY_RETURN_TYPE(DiagonalVectorType,typename OtherDerived::DiagonalVectorType,difference) >
+    #endif
+    operator-(const DiagonalBase<OtherDerived>& other) const
+    {
+      return (diagonal() - other.diagonal()).asDiagonal();
    }
 };

@@ -155,6 +178,30 @@ class DiagonalMatrix
    EIGEN_DEVICE_FUNC
    inline DiagonalMatrix(const Scalar& x, const Scalar& y, const Scalar& z) : m_diagonal(x,y,z) {}

+    #if EIGEN_HAS_CXX11
+    /** \brief Construct a diagonal matrix with fixed size from an arbitrary number of coefficients. \cpp11
+      * 
+      * There exist C++98 analogue constructors for fixed-size diagonal matrices having 2 or 3 coefficients.
+      * 
+      * \warning To construct a diagonal matrix of fixed size, the number of values passed to this 
+      * constructor must match the fixed dimension of \c *this.
+      * 
+      * \sa DiagonalMatrix(const Scalar&, const Scalar&)
+      * \sa DiagonalMatrix(const Scalar&, const Scalar&, const Scalar&)
+      */
+    template <typename... ArgTypes>
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    DiagonalMatrix(const Scalar& a0, const Scalar& a1, const Scalar& a2, const ArgTypes&... args)
+      : m_diagonal(a0, a1, a2, args...) {}
+
+    /** \brief Constructs a DiagonalMatrix and initializes it by elements given by an initializer list of initializer
+      * lists \cpp11
+      */
+    EIGEN_DEVICE_FUNC
+    explicit EIGEN_STRONG_INLINE DiagonalMatrix(const std::initializer_list<std::initializer_list<Scalar>>& list)
+      : m_diagonal(list) {}
+    #endif  // EIGEN_HAS_CXX11
+
    /** Copy constructor. */
    template<typename OtherDerived>
    EIGEN_DEVICE_FUNC
@@ -274,7 +321,7 @@ class DiagonalWrapper
  * \sa class DiagonalWrapper, class DiagonalMatrix, diagonal(), isDiagonal()
  **/
 template<typename Derived>
-inline const DiagonalWrapper<const Derived>
+EIGEN_DEVICE_FUNC inline const DiagonalWrapper<const Derived>
 MatrixBase<Derived>::asDiagonal() const
 {
  return DiagonalWrapper<const Derived>(derived());
@@ -291,12 +338,11 @@ MatrixBase<Derived>::asDiagonal() const
 template<typename Derived>
 bool MatrixBase<Derived>::isDiagonal(const RealScalar& prec) const
 {
-  using std::abs;
  if(cols() != rows()) return false;
  RealScalar maxAbsOnDiagonal = static_cast<RealScalar>(-1);
  for(Index j = 0; j < cols(); ++j)
  {
-    RealScalar absOnDiagonal = abs(coeff(j,j));
+    RealScalar absOnDiagonal = numext::abs(coeff(j,j));
    if(absOnDiagonal > maxAbsOnDiagonal) maxAbsOnDiagonal = absOnDiagonal;
  }
  for(Index j = 0; j < cols(); ++j)
@@ -317,19 +363,24 @@ struct Diagonal2Dense {};
 template<> struct AssignmentKind<DenseShape,DiagonalShape> { typedef Diagonal2Dense Kind; };

 // Diagonal matrix to Dense assignment
-template< typename DstXprType, typename SrcXprType, typename Functor, typename Scalar>
-struct Assignment<DstXprType, SrcXprType, Functor, Diagonal2Dense, Scalar>
+template< typename DstXprType, typename SrcXprType, typename Functor>
+struct Assignment<DstXprType, SrcXprType, Functor, Diagonal2Dense>
 {
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar> &/*func*/)
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
  {
+    Index dstRows = src.rows();
+    Index dstCols = src.cols();
+    if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
+      dst.resize(dstRows, dstCols);
+    
    dst.setZero();
    dst.diagonal() = src.diagonal();
  }
  
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<typename DstXprType::Scalar> &/*func*/)
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
  { dst.diagonal() += src.diagonal(); }
  
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<typename DstXprType::Scalar> &/*func*/)
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
  { dst.diagonal() -= src.diagonal(); }
 };

--- a/Eigen/src/Core/DiagonalProduct.h
+++ b/Eigen/src/Core/DiagonalProduct.h
@@ -17,7 +17,7 @@ namespace Eigen {
  */
 template<typename Derived>
 template<typename DiagonalDerived>
-inline const Product<Derived, DiagonalDerived, LazyProduct>
+EIGEN_DEVICE_FUNC inline const Product<Derived, DiagonalDerived, LazyProduct>
 MatrixBase<Derived>::operator*(const DiagonalBase<DiagonalDerived> &a_diagonal) const
 {
  return Product<Derived, DiagonalDerived, LazyProduct>(derived(),a_diagonal.derived());
--- a/Eigen/src/Core/Dot.h
+++ b/Eigen/src/Core/Dot.h
@@ -28,28 +28,33 @@ template<typename T, typename U,
 >
 struct dot_nocheck
 {
-  typedef typename scalar_product_traits<typename traits<T>::Scalar,typename traits<U>::Scalar>::ReturnType ResScalar;
+  typedef scalar_conj_product_op<typename traits<T>::Scalar,typename traits<U>::Scalar> conj_prod;
+  typedef typename conj_prod::result_type ResScalar;
  EIGEN_DEVICE_FUNC
-  static inline ResScalar run(const MatrixBase<T>& a, const MatrixBase<U>& b)
+  EIGEN_STRONG_INLINE
+  static ResScalar run(const MatrixBase<T>& a, const MatrixBase<U>& b)
  {
-    return a.template binaryExpr<scalar_conj_product_op<typename traits<T>::Scalar,typename traits<U>::Scalar> >(b).sum();
+    return a.template binaryExpr<conj_prod>(b).sum();
  }
 };

 template<typename T, typename U>
 struct dot_nocheck<T, U, true>
 {
-  typedef typename scalar_product_traits<typename traits<T>::Scalar,typename traits<U>::Scalar>::ReturnType ResScalar;
+  typedef scalar_conj_product_op<typename traits<T>::Scalar,typename traits<U>::Scalar> conj_prod;
+  typedef typename conj_prod::result_type ResScalar;
  EIGEN_DEVICE_FUNC
-  static inline ResScalar run(const MatrixBase<T>& a, const MatrixBase<U>& b)
+  EIGEN_STRONG_INLINE
+  static ResScalar run(const MatrixBase<T>& a, const MatrixBase<U>& b)
  {
-    return a.transpose().template binaryExpr<scalar_conj_product_op<typename traits<T>::Scalar,typename traits<U>::Scalar> >(b).sum();
+    return a.transpose().template binaryExpr<conj_prod>(b).sum();
  }
 };

 } // end namespace internal

-/** \returns the dot product of *this with other.
+/** \fn MatrixBase::dot
+  * \returns the dot product of *this with other.
  *
  * \only_for_vectors
  *
@@ -62,15 +67,18 @@ struct dot_nocheck<T, U, true>
 template<typename Derived>
 template<typename OtherDerived>
 EIGEN_DEVICE_FUNC
-typename internal::scalar_product_traits<typename internal::traits<Derived>::Scalar,typename internal::traits<OtherDerived>::Scalar>::ReturnType
+EIGEN_STRONG_INLINE
+typename ScalarBinaryOpTraits<typename internal::traits<Derived>::Scalar,typename internal::traits<OtherDerived>::Scalar>::ReturnType
 MatrixBase<Derived>::dot(const MatrixBase<OtherDerived>& other) const
 {
  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
  EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
  EIGEN_STATIC_ASSERT_SAME_VECTOR_SIZE(Derived,OtherDerived)
+#if !(defined(EIGEN_NO_STATIC_ASSERT) && defined(EIGEN_NO_DEBUG))
  typedef internal::scalar_conj_product_op<Scalar,typename OtherDerived::Scalar> func;
  EIGEN_CHECK_BINARY_COMPATIBILIY(func,Scalar,typename OtherDerived::Scalar);
-
+#endif
+  
  eigen_assert(size() == other.size());

  return internal::dot_nocheck<Derived,OtherDerived>::run(*this, other);
@@ -82,10 +90,10 @@ MatrixBase<Derived>::dot(const MatrixBase<OtherDerived>& other) const
  * In both cases, it consists in the sum of the square of all the matrix entries.
  * For vectors, this is also equals to the dot product of \c *this with itself.
  *
-  * \sa dot(), norm()
+  * \sa dot(), norm(), lpNorm()
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE typename NumTraits<typename internal::traits<Derived>::Scalar>::Real MatrixBase<Derived>::squaredNorm() const
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NumTraits<typename internal::traits<Derived>::Scalar>::Real MatrixBase<Derived>::squaredNorm() const
 {
  return numext::real((*this).cwiseAbs2().sum());
 }
@@ -94,40 +102,98 @@ EIGEN_STRONG_INLINE typename NumTraits<typename internal::traits<Derived>::Scala
  * In both cases, it consists in the square root of the sum of the square of all the matrix entries.
  * For vectors, this is also equals to the square root of the dot product of \c *this with itself.
  *
-  * \sa dot(), squaredNorm()
+  * \sa lpNorm(), dot(), squaredNorm()
  */
 template<typename Derived>
-inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real MatrixBase<Derived>::norm() const
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NumTraits<typename internal::traits<Derived>::Scalar>::Real MatrixBase<Derived>::norm() const
 {
-  EIGEN_USING_STD_MATH(sqrt)
-  return sqrt(squaredNorm());
+  return numext::sqrt(squaredNorm());
 }

-/** \returns an expression of the quotient of *this by its own norm.
+/** \returns an expression of the quotient of \c *this by its own norm.
+  *
+  * \warning If the input vector is too small (i.e., this->norm()==0),
+  *          then this function returns a copy of the input.
  *
  * \only_for_vectors
  *
  * \sa norm(), normalize()
  */
 template<typename Derived>
-inline const typename MatrixBase<Derived>::PlainObject
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::PlainObject
 MatrixBase<Derived>::normalized() const
 {
  typedef typename internal::nested_eval<Derived,2>::type _Nested;
  _Nested n(derived());
-  return n / n.norm();
+  RealScalar z = n.squaredNorm();
+  // NOTE: after extensive benchmarking, this conditional does not impact performance, at least on recent x86 CPU
+  if(z>RealScalar(0))
+    return n / numext::sqrt(z);
+  else
+    return n;
 }

 /** Normalizes the vector, i.e. divides it by its own norm.
  *
  * \only_for_vectors
  *
+  * \warning If the input vector is too small (i.e., this->norm()==0), then \c *this is left unchanged.
+  *
  * \sa norm(), normalized()
  */
 template<typename Derived>
-inline void MatrixBase<Derived>::normalize()
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void MatrixBase<Derived>::normalize()
 {
-  *this /= norm();
+  RealScalar z = squaredNorm();
+  // NOTE: after extensive benchmarking, this conditional does not impact performance, at least on recent x86 CPU
+  if(z>RealScalar(0))
+    derived() /= numext::sqrt(z);
+}
+
+/** \returns an expression of the quotient of \c *this by its own norm while avoiding underflow and overflow.
+  *
+  * \only_for_vectors
+  *
+  * This method is analogous to the normalized() method, but it reduces the risk of
+  * underflow and overflow when computing the norm.
+  *
+  * \warning If the input vector is too small (i.e., this->norm()==0),
+  *          then this function returns a copy of the input.
+  *
+  * \sa stableNorm(), stableNormalize(), normalized()
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::PlainObject
+MatrixBase<Derived>::stableNormalized() const
+{
+  typedef typename internal::nested_eval<Derived,3>::type _Nested;
+  _Nested n(derived());
+  RealScalar w = n.cwiseAbs().maxCoeff();
+  RealScalar z = (n/w).squaredNorm();
+  if(z>RealScalar(0))
+    return n / (numext::sqrt(z)*w);
+  else
+    return n;
+}
+
+/** Normalizes the vector while avoiding underflow and overflow
+  *
+  * \only_for_vectors
+  *
+  * This method is analogous to the normalize() method, but it reduces the risk of
+  * underflow and overflow when computing the norm.
+  *
+  * \warning If the input vector is too small (i.e., this->norm()==0), then \c *this is left unchanged.
+  *
+  * \sa stableNorm(), stableNormalized(), normalize()
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void MatrixBase<Derived>::stableNormalize()
+{
+  RealScalar w = cwiseAbs().maxCoeff();
+  RealScalar z = (derived()/w).squaredNorm();
+  if(z>RealScalar(0))
+    derived() /= numext::sqrt(z)*w;
 }

 //---------- implementation of other norms ----------
@@ -169,24 +235,35 @@ struct lpNorm_selector<Derived, 2>
 template<typename Derived>
 struct lpNorm_selector<Derived, Infinity>
 {
+  typedef typename NumTraits<typename traits<Derived>::Scalar>::Real RealScalar;
  EIGEN_DEVICE_FUNC
-  static inline typename NumTraits<typename traits<Derived>::Scalar>::Real run(const MatrixBase<Derived>& m)
+  static inline RealScalar run(const MatrixBase<Derived>& m)
  {
+    if(Derived::SizeAtCompileTime==0 || (Derived::SizeAtCompileTime==Dynamic && m.size()==0))
+      return RealScalar(0);
    return m.cwiseAbs().maxCoeff();
  }
 };

 } // end namespace internal

-/** \returns the \f$ \ell^p \f$ norm of *this, that is, returns the p-th root of the sum of the p-th powers of the absolute values
-  *          of the coefficients of *this. If \a p is the special value \a Eigen::Infinity, this function returns the \f$ \ell^\infty \f$
-  *          norm, that is the maximum of the absolute values of the coefficients of *this.
+/** \returns the \b coefficient-wise \f$ \ell^p \f$ norm of \c *this, that is, returns the p-th root of the sum of the p-th powers of the absolute values
+  *          of the coefficients of \c *this. If \a p is the special value \a Eigen::Infinity, this function returns the \f$ \ell^\infty \f$
+  *          norm, that is the maximum of the absolute values of the coefficients of \c *this.
+  *
+  * In all cases, if \c *this is empty, then the value 0 is returned.
+  *
+  * \note For matrices, this function does not compute the <a href="https://en.wikipedia.org/wiki/Operator_norm">operator-norm</a>. That is, if \c *this is a matrix, then its coefficients are interpreted as a 1D vector. Nonetheless, you can easily compute the 1-norm and \f$\infty\f$-norm matrix operator norms using \link TutorialReductionsVisitorsBroadcastingReductionsNorm partial reductions \endlink.
  *
  * \sa norm()
  */
 template<typename Derived>
 template<int p>
-inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+EIGEN_DEVICE_FUNC inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real
+#else
+EIGEN_DEVICE_FUNC MatrixBase<Derived>::RealScalar
+#endif
 MatrixBase<Derived>::lpNorm() const
 {
  return internal::lpNorm_selector<Derived, p>::run(*this);
--- a/Eigen/src/Core/EigenBase.h
+++ b/Eigen/src/Core/EigenBase.h
@@ -14,6 +14,7 @@
 namespace Eigen {

 /** \class EigenBase
+  * \ingroup Core_Module
  * 
  * Common base class for all classes T such that MatrixBase has an operator=(T) and a constructor MatrixBase(T).
  *
@@ -23,7 +24,7 @@ namespace Eigen {
  *
  * Notice that this class is trivial, it is only used to disambiguate overloaded functions.
  *
-  * \sa \ref TopicClassHierarchy
+  * \sa \blank \ref TopicClassHierarchy
  */
 template<typename Derived> struct EigenBase
 {
@@ -31,8 +32,9 @@ template<typename Derived> struct EigenBase
  
  /** \brief The interface type of indices
    * \details To change this, \c \#define the preprocessor symbol \c EIGEN_DEFAULT_DENSE_INDEX_TYPE.
-    * \deprecated Since Eigen 3.3, its usage is deprecated. Use Eigen::Index instead.
    * \sa StorageIndex, \ref TopicPreprocessorDirectives.
+    * DEPRECATED: Since Eigen 3.3, its usage is deprecated. Use Eigen::Index instead.
+    * Deprecation is not marked with a doxygen comment because there are too many existing usages to add the deprecation attribute.
    */
  typedef Eigen::Index Index;

@@ -128,6 +130,7 @@ template<typename Derived> struct EigenBase
  */
 template<typename Derived>
 template<typename OtherDerived>
+EIGEN_DEVICE_FUNC
 Derived& DenseBase<Derived>::operator=(const EigenBase<OtherDerived> &other)
 {
  call_assignment(derived(), other.derived());
@@ -136,17 +139,19 @@ Derived& DenseBase<Derived>::operator=(const EigenBase<OtherDerived> &other)

 template<typename Derived>
 template<typename OtherDerived>
+EIGEN_DEVICE_FUNC
 Derived& DenseBase<Derived>::operator+=(const EigenBase<OtherDerived> &other)
 {
-  call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar>());
+  call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar,typename OtherDerived::Scalar>());
  return derived();
 }

 template<typename Derived>
 template<typename OtherDerived>
+EIGEN_DEVICE_FUNC
 Derived& DenseBase<Derived>::operator-=(const EigenBase<OtherDerived> &other)
 {
-  call_assignment(derived(), other.derived(), internal::sub_assign_op<Scalar>());
+  call_assignment(derived(), other.derived(), internal::sub_assign_op<Scalar,typename OtherDerived::Scalar>());
  return derived();
 }

--- a/Eigen/src/Core/Fuzzy.h
+++ b/Eigen/src/Core/Fuzzy.h
@@ -100,7 +100,7 @@ struct isMuchSmallerThan_scalar_selector<Derived, true>
  */
 template<typename Derived>
 template<typename OtherDerived>
-bool DenseBase<Derived>::isApprox(
+EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isApprox(
  const DenseBase<OtherDerived>& other,
  const RealScalar& prec
 ) const
@@ -122,7 +122,7 @@ bool DenseBase<Derived>::isApprox(
  * \sa isApprox(), isMuchSmallerThan(const DenseBase<OtherDerived>&, RealScalar) const
  */
 template<typename Derived>
-bool DenseBase<Derived>::isMuchSmallerThan(
+EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isMuchSmallerThan(
  const typename NumTraits<Scalar>::Real& other,
  const RealScalar& prec
 ) const
@@ -142,7 +142,7 @@ bool DenseBase<Derived>::isMuchSmallerThan(
  */
 template<typename Derived>
 template<typename OtherDerived>
-bool DenseBase<Derived>::isMuchSmallerThan(
+EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isMuchSmallerThan(
  const DenseBase<OtherDerived>& other,
  const RealScalar& prec
 ) const
--- a/Eigen/src/Core/GeneralProduct.h
+++ b/Eigen/src/Core/GeneralProduct.h
@@ -18,17 +18,33 @@ enum {
  Small = 3
 };

+// Define the threshold value to fallback from the generic matrix-matrix product
+// implementation (heavy) to the lightweight coeff-based product one.
+// See generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
+// in products/GeneralMatrixMatrix.h for more details.
+// TODO This threshold should also be used in the compile-time selector below.
+#ifndef EIGEN_GEMM_TO_COEFFBASED_THRESHOLD
+// This default value has been obtained on a Haswell architecture.
+#define EIGEN_GEMM_TO_COEFFBASED_THRESHOLD 20
+#endif
+
 namespace internal {

 template<int Rows, int Cols, int Depth> struct product_type_selector;

 template<int Size, int MaxSize> struct product_size_category
 {
-  enum { is_large = MaxSize == Dynamic ||
-                    Size >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD,
-         value = is_large  ? Large
-               : Size == 1 ? 1
-                           : Small
+  enum {
+    #ifndef EIGEN_GPU_COMPILE_PHASE
+    is_large = MaxSize == Dynamic ||
+               Size >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD ||
+               (Size==Dynamic && MaxSize>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD),
+    #else
+    is_large = 0,
+    #endif
+    value = is_large  ? Large
+          : Size == 1 ? 1
+                      : Small
  };
 };

@@ -76,37 +92,13 @@ public:
 #endif
 };

-// template<typename Lhs, typename Rhs> struct product_tag
-// {
-// private:
-//   
-//   typedef typename remove_all<Lhs>::type _Lhs;
-//   typedef typename remove_all<Rhs>::type _Rhs;
-//   enum {
-//     Rows  = _Lhs::RowsAtCompileTime,
-//     Cols  = _Rhs::ColsAtCompileTime,
-//     Depth = EIGEN_SIZE_MIN_PREFER_FIXED(_Lhs::ColsAtCompileTime, _Rhs::RowsAtCompileTime)
-//   };
-// 
-//   enum {
-//     rows_select = Rows==1 ? int(Rows) : int(Large),
-//     cols_select = Cols==1 ? int(Cols) : int(Large),
-//     depth_select = Depth==1 ? int(Depth) : int(Large)
-//   };
-//   typedef product_type_selector<rows_select, cols_select, depth_select> selector;
-// 
-// public:
-//   enum {
-//     ret = selector::ret
-//   };
-// 
-// };
-
 /* The following allows to select the kind of product at compile time
 * based on the three dimensions of the product.
 * This is a compile time mapping from {1,Small,Large}^3 -> {product types} */
 // FIXME I'm not sure the current mapping is the ideal one.
 template<int M, int N>  struct product_type_selector<M,N,1>              { enum { ret = OuterProduct }; };
+template<int M>         struct product_type_selector<M, 1, 1>            { enum { ret = LazyCoeffBasedProductMode }; };
+template<int N>         struct product_type_selector<1, N, 1>            { enum { ret = LazyCoeffBasedProductMode }; };
 template<int Depth>     struct product_type_selector<1,    1,    Depth>  { enum { ret = InnerProduct }; };
 template<>              struct product_type_selector<1,    1,    1>      { enum { ret = InnerProduct }; };
 template<>              struct product_type_selector<Small,1,    Small>  { enum { ret = CoeffBasedProductMode }; };
@@ -125,8 +117,8 @@ template<>              struct product_type_selector<Small,Small,Large>  { enum
 template<>              struct product_type_selector<Large,Small,Large>  { enum { ret = GemmProduct }; };
 template<>              struct product_type_selector<Small,Large,Large>  { enum { ret = GemmProduct }; };
 template<>              struct product_type_selector<Large,Large,Large>  { enum { ret = GemmProduct }; };
-template<>              struct product_type_selector<Large,Small,Small>  { enum { ret = GemmProduct }; };
-template<>              struct product_type_selector<Small,Large,Small>  { enum { ret = GemmProduct }; };
+template<>              struct product_type_selector<Large,Small,Small>  { enum { ret = CoeffBasedProductMode }; };
+template<>              struct product_type_selector<Small,Large,Small>  { enum { ret = CoeffBasedProductMode }; };
 template<>              struct product_type_selector<Large,Large,Small>  { enum { ret = GemmProduct }; };

 } // end namespace internal
@@ -160,7 +152,7 @@ template<>              struct product_type_selector<Large,Large,Small>  { enum
 namespace internal {

 template<int Side, int StorageOrder, bool BlasCompatible>
-struct gemv_dense_sense_selector;
+struct gemv_dense_selector;

 } // end namespace internal

@@ -171,32 +163,32 @@ template<typename Scalar,int Size,int MaxSize,bool Cond> struct gemv_static_vect
 template<typename Scalar,int Size,int MaxSize>
 struct gemv_static_vector_if<Scalar,Size,MaxSize,false>
 {
-  EIGEN_STRONG_INLINE  Scalar* data() { eigen_internal_assert(false && "should never be called"); return 0; }
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Scalar* data() { eigen_internal_assert(false && "should never be called"); return 0; }
 };

 template<typename Scalar,int Size>
 struct gemv_static_vector_if<Scalar,Size,Dynamic,true>
 {
-  EIGEN_STRONG_INLINE Scalar* data() { return 0; }
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Scalar* data() { return 0; }
 };

 template<typename Scalar,int Size,int MaxSize>
 struct gemv_static_vector_if<Scalar,Size,MaxSize,true>
 {
-  #if EIGEN_MAX_STATIC_ALIGN_BYTES!=0
-  internal::plain_array<Scalar,EIGEN_SIZE_MIN_PREFER_FIXED(Size,MaxSize),0> m_data;
-  EIGEN_STRONG_INLINE Scalar* data() { return m_data.array; }
-  #else
-  // Some architectures cannot align on the stack,
-  // => let's manually enforce alignment by allocating more data and return the address of the first aligned element.
  enum {
    ForceAlignment  = internal::packet_traits<Scalar>::Vectorizable,
    PacketSize      = internal::packet_traits<Scalar>::size
  };
-  internal::plain_array<Scalar,EIGEN_SIZE_MIN_PREFER_FIXED(Size,MaxSize)+(ForceAlignment?PacketSize:0),0> m_data;
+  #if EIGEN_MAX_STATIC_ALIGN_BYTES!=0
+  internal::plain_array<Scalar,EIGEN_SIZE_MIN_PREFER_FIXED(Size,MaxSize),0,EIGEN_PLAIN_ENUM_MIN(AlignedMax,PacketSize)> m_data;
+  EIGEN_STRONG_INLINE Scalar* data() { return m_data.array; }
+  #else
+  // Some architectures cannot align on the stack,
+  // => let's manually enforce alignment by allocating more data and return the address of the first aligned element.
+  internal::plain_array<Scalar,EIGEN_SIZE_MIN_PREFER_FIXED(Size,MaxSize)+(ForceAlignment?EIGEN_MAX_ALIGN_BYTES:0),0> m_data;
  EIGEN_STRONG_INLINE Scalar* data() {
    return ForceAlignment
-            ? reinterpret_cast<Scalar*>((reinterpret_cast<size_t>(m_data.array) & ~(size_t(EIGEN_MAX_ALIGN_BYTES-1))) + EIGEN_MAX_ALIGN_BYTES)
+            ? reinterpret_cast<Scalar*>((internal::UIntPtr(m_data.array) & ~(std::size_t(EIGEN_MAX_ALIGN_BYTES-1))) + EIGEN_MAX_ALIGN_BYTES)
            : m_data.array;
  }
  #endif
@@ -204,19 +196,19 @@ struct gemv_static_vector_if<Scalar,Size,MaxSize,true>

 // The vector is on the left => transposition
 template<int StorageOrder, bool BlasCompatible>
-struct gemv_dense_sense_selector<OnTheLeft,StorageOrder,BlasCompatible>
+struct gemv_dense_selector<OnTheLeft,StorageOrder,BlasCompatible>
 {
  template<typename Lhs, typename Rhs, typename Dest>
  static void run(const Lhs &lhs, const Rhs &rhs, Dest& dest, const typename Dest::Scalar& alpha)
  {
    Transpose<Dest> destT(dest);
    enum { OtherStorageOrder = StorageOrder == RowMajor ? ColMajor : RowMajor };
-    gemv_dense_sense_selector<OnTheRight,OtherStorageOrder,BlasCompatible>
+    gemv_dense_selector<OnTheRight,OtherStorageOrder,BlasCompatible>
      ::run(rhs.transpose(), lhs.transpose(), destT, alpha);
  }
 };

-template<> struct gemv_dense_sense_selector<OnTheRight,ColMajor,true>
+template<> struct gemv_dense_selector<OnTheRight,ColMajor,true>
 {
  template<typename Lhs, typename Rhs, typename Dest>
  static inline void run(const Lhs &lhs, const Rhs &rhs, Dest& dest, const typename Dest::Scalar& alpha)
@@ -231,7 +223,7 @@ template<> struct gemv_dense_sense_selector<OnTheRight,ColMajor,true>
    typedef internal::blas_traits<Rhs> RhsBlasTraits;
    typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
  
-    typedef Map<Matrix<ResScalar,Dynamic,1>, Aligned> MappedDest;
+    typedef Map<Matrix<ResScalar,Dynamic,1>, EIGEN_PLAIN_ENUM_MIN(AlignedMax,internal::packet_traits<ResScalar>::size)> MappedDest;

    ActualLhsType actualLhs = LhsBlasTraits::extract(lhs);
    ActualRhsType actualRhs = RhsBlasTraits::extract(rhs);
@@ -239,60 +231,78 @@ template<> struct gemv_dense_sense_selector<OnTheRight,ColMajor,true>
    ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(lhs)
                                  * RhsBlasTraits::extractScalarFactor(rhs);

+    // make sure Dest is a compile-time vector type (bug 1166)
+    typedef typename conditional<Dest::IsVectorAtCompileTime, Dest, typename Dest::ColXpr>::type ActualDest;
+
    enum {
      // FIXME find a way to allow an inner stride on the result if packet_traits<Scalar>::size==1
      // on, the other hand it is good for the cache to pack the vector anyways...
-      EvalToDestAtCompileTime = Dest::InnerStrideAtCompileTime==1,
+      EvalToDestAtCompileTime = (ActualDest::InnerStrideAtCompileTime==1),
      ComplexByReal = (NumTraits<LhsScalar>::IsComplex) && (!NumTraits<RhsScalar>::IsComplex),
-      MightCannotUseDest = (Dest::InnerStrideAtCompileTime!=1) || ComplexByReal
+      MightCannotUseDest = ((!EvalToDestAtCompileTime) || ComplexByReal) && (ActualDest::MaxSizeAtCompileTime!=0)
    };

-    gemv_static_vector_if<ResScalar,Dest::SizeAtCompileTime,Dest::MaxSizeAtCompileTime,MightCannotUseDest> static_dest;
-
-    const bool alphaIsCompatible = (!ComplexByReal) || (numext::imag(actualAlpha)==RealScalar(0));
-    const bool evalToDest = EvalToDestAtCompileTime && alphaIsCompatible;
-
-    RhsScalar compatibleAlpha = get_factor<ResScalar,RhsScalar>::run(actualAlpha);
-
-    ei_declare_aligned_stack_constructed_variable(ResScalar,actualDestPtr,dest.size(),
-                                                  evalToDest ? dest.data() : static_dest.data());
-
-    if(!evalToDest)
-    {
-      #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
-      Index size = dest.size();
-      EIGEN_DENSE_STORAGE_CTOR_PLUGIN
-      #endif
-      if(!alphaIsCompatible)
-      {
-        MappedDest(actualDestPtr, dest.size()).setZero();
-        compatibleAlpha = RhsScalar(1);
-      }
-      else
-        MappedDest(actualDestPtr, dest.size()) = dest;
-    }
-
    typedef const_blas_data_mapper<LhsScalar,Index,ColMajor> LhsMapper;
    typedef const_blas_data_mapper<RhsScalar,Index,RowMajor> RhsMapper;
-    general_matrix_vector_product
-        <Index,LhsScalar,LhsMapper,ColMajor,LhsBlasTraits::NeedToConjugate,RhsScalar,RhsMapper,RhsBlasTraits::NeedToConjugate>::run(
-        actualLhs.rows(), actualLhs.cols(),
-        LhsMapper(actualLhs.data(), actualLhs.outerStride()),
-        RhsMapper(actualRhs.data(), actualRhs.innerStride()),
-        actualDestPtr, 1,
-        compatibleAlpha);
+    RhsScalar compatibleAlpha = get_factor<ResScalar,RhsScalar>::run(actualAlpha);

-    if (!evalToDest)
+    if(!MightCannotUseDest)
    {
-      if(!alphaIsCompatible)
-        dest += actualAlpha * MappedDest(actualDestPtr, dest.size());
-      else
-        dest = MappedDest(actualDestPtr, dest.size());
+      // shortcut if we are sure to be able to use dest directly,
+      // this helps the compiler generate cleaner and more optimized code for the most common cases
+      general_matrix_vector_product
+          <Index,LhsScalar,LhsMapper,ColMajor,LhsBlasTraits::NeedToConjugate,RhsScalar,RhsMapper,RhsBlasTraits::NeedToConjugate>::run(
+          actualLhs.rows(), actualLhs.cols(),
+          LhsMapper(actualLhs.data(), actualLhs.outerStride()),
+          RhsMapper(actualRhs.data(), actualRhs.innerStride()),
+          dest.data(), 1,
+          compatibleAlpha);
+    }
+    else
+    {
+      gemv_static_vector_if<ResScalar,ActualDest::SizeAtCompileTime,ActualDest::MaxSizeAtCompileTime,MightCannotUseDest> static_dest;
+
+      const bool alphaIsCompatible = (!ComplexByReal) || (numext::imag(actualAlpha)==RealScalar(0));
+      const bool evalToDest = EvalToDestAtCompileTime && alphaIsCompatible;
+
+      ei_declare_aligned_stack_constructed_variable(ResScalar,actualDestPtr,dest.size(),
+                                                    evalToDest ? dest.data() : static_dest.data());
+
+      if(!evalToDest)
+      {
+        #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
+        Index size = dest.size();
+        EIGEN_DENSE_STORAGE_CTOR_PLUGIN
+        #endif
+        if(!alphaIsCompatible)
+        {
+          MappedDest(actualDestPtr, dest.size()).setZero();
+          compatibleAlpha = RhsScalar(1);
+        }
+        else
+          MappedDest(actualDestPtr, dest.size()) = dest;
+      }
+
+      general_matrix_vector_product
+          <Index,LhsScalar,LhsMapper,ColMajor,LhsBlasTraits::NeedToConjugate,RhsScalar,RhsMapper,RhsBlasTraits::NeedToConjugate>::run(
+          actualLhs.rows(), actualLhs.cols(),
+          LhsMapper(actualLhs.data(), actualLhs.outerStride()),
+          RhsMapper(actualRhs.data(), actualRhs.innerStride()),
+          actualDestPtr, 1,
+          compatibleAlpha);
+
+      if (!evalToDest)
+      {
+        if(!alphaIsCompatible)
+          dest.matrix() += actualAlpha * MappedDest(actualDestPtr, dest.size());
+        else
+          dest = MappedDest(actualDestPtr, dest.size());
+      }
    }
  }
 };

-template<> struct gemv_dense_sense_selector<OnTheRight,RowMajor,true>
+template<> struct gemv_dense_selector<OnTheRight,RowMajor,true>
 {
  template<typename Lhs, typename Rhs, typename Dest>
  static void run(const Lhs &lhs, const Rhs &rhs, Dest& dest, const typename Dest::Scalar& alpha)
@@ -316,7 +326,7 @@ template<> struct gemv_dense_sense_selector<OnTheRight,RowMajor,true>
    enum {
      // FIXME find a way to allow an inner stride on the result if packet_traits<Scalar>::size==1
      // on, the other hand it is good for the cache to pack the vector anyways...
-      DirectlyUseRhs = ActualRhsTypeCleaned::InnerStrideAtCompileTime==1
+      DirectlyUseRhs = ActualRhsTypeCleaned::InnerStrideAtCompileTime==1 || ActualRhsTypeCleaned::MaxSizeAtCompileTime==0
    };

    gemv_static_vector_if<RhsScalar,ActualRhsTypeCleaned::SizeAtCompileTime,ActualRhsTypeCleaned::MaxSizeAtCompileTime,!DirectlyUseRhs> static_rhs;
@@ -340,32 +350,35 @@ template<> struct gemv_dense_sense_selector<OnTheRight,RowMajor,true>
        actualLhs.rows(), actualLhs.cols(),
        LhsMapper(actualLhs.data(), actualLhs.outerStride()),
        RhsMapper(actualRhsPtr, 1),
-        dest.data(), dest.innerStride(),
+        dest.data(), dest.col(0).innerStride(), //NOTE  if dest is not a vector at compile-time, then dest.innerStride() might be wrong. (bug 1166)
        actualAlpha);
  }
 };

-template<> struct gemv_dense_sense_selector<OnTheRight,ColMajor,false>
+template<> struct gemv_dense_selector<OnTheRight,ColMajor,false>
 {
  template<typename Lhs, typename Rhs, typename Dest>
  static void run(const Lhs &lhs, const Rhs &rhs, Dest& dest, const typename Dest::Scalar& alpha)
  {
-    // TODO makes sure dest is sequentially stored in memory, otherwise use a temp
+    EIGEN_STATIC_ASSERT((!nested_eval<Lhs,1>::Evaluate),EIGEN_INTERNAL_COMPILATION_ERROR_OR_YOU_MADE_A_PROGRAMMING_MISTAKE);
+    // TODO if rhs is large enough it might be beneficial to make sure that dest is sequentially stored in memory, otherwise use a temp
+    typename nested_eval<Rhs,1>::type actual_rhs(rhs);
    const Index size = rhs.rows();
    for(Index k=0; k<size; ++k)
-      dest += (alpha*rhs.coeff(k)) * lhs.col(k);
+      dest += (alpha*actual_rhs.coeff(k)) * lhs.col(k);
  }
 };

-template<> struct gemv_dense_sense_selector<OnTheRight,RowMajor,false>
+template<> struct gemv_dense_selector<OnTheRight,RowMajor,false>
 {
  template<typename Lhs, typename Rhs, typename Dest>
  static void run(const Lhs &lhs, const Rhs &rhs, Dest& dest, const typename Dest::Scalar& alpha)
  {
-    // TODO makes sure rhs is sequentially stored in memory, otherwise use a temp
+    EIGEN_STATIC_ASSERT((!nested_eval<Lhs,1>::Evaluate),EIGEN_INTERNAL_COMPILATION_ERROR_OR_YOU_MADE_A_PROGRAMMING_MISTAKE);
+    typename nested_eval<Rhs,Lhs::RowsAtCompileTime>::type actual_rhs(rhs);
    const Index rows = dest.rows();
    for(Index i=0; i<rows; ++i)
-      dest.coeffRef(i) += alpha * (lhs.row(i).cwiseProduct(rhs.transpose())).sum();
+      dest.coeffRef(i) += alpha * (lhs.row(i).cwiseProduct(actual_rhs.transpose())).sum();
  }
 };

@@ -381,11 +394,10 @@ template<> struct gemv_dense_sense_selector<OnTheRight,RowMajor,false>
  *
  * \sa lazyProduct(), operator*=(const MatrixBase&), Cwise::operator*()
  */
-#ifndef __CUDACC__
-
 template<typename Derived>
 template<typename OtherDerived>
-inline const Product<Derived, OtherDerived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+const Product<Derived, OtherDerived>
 MatrixBase<Derived>::operator*(const MatrixBase<OtherDerived> &other) const
 {
  // A note regarding the function declaration: In MSVC, this function will sometimes
@@ -414,8 +426,6 @@ MatrixBase<Derived>::operator*(const MatrixBase<OtherDerived> &other) const
  return Product<Derived, OtherDerived>(derived(), other.derived());
 }

-#endif // __CUDACC__
-
 /** \returns an expression of the matrix product of \c *this and \a other without implicit evaluation.
  *
  * The returned product will behave like any other expressions: the coefficients of the product will be
@@ -429,6 +439,7 @@ MatrixBase<Derived>::operator*(const MatrixBase<OtherDerived> &other) const
  */
 template<typename Derived>
 template<typename OtherDerived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 const Product<Derived,OtherDerived,LazyProduct>
 MatrixBase<Derived>::lazyProduct(const MatrixBase<OtherDerived> &other) const
 {
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@@ -43,7 +43,7 @@ struct default_packet_traits
 {
  enum {
    HasHalfPacket = 0,
-    
+
    HasAdd    = 1,
    HasSub    = 1,
    HasMul    = 1,
@@ -56,13 +56,16 @@ struct default_packet_traits
    HasConj   = 1,
    HasSetLinear = 1,
    HasBlend  = 0,
+    HasReduxp = 1,

    HasDiv    = 0,
    HasSqrt   = 0,
    HasRsqrt  = 0,
    HasExp    = 0,
+    HasExpm1  = 0,
    HasLog    = 0,
-    HasLog10    = 0,
+    HasLog1p  = 0,
+    HasLog10  = 0,
    HasPow    = 0,

    HasSin    = 0,
@@ -71,13 +74,28 @@ struct default_packet_traits
    HasASin   = 0,
    HasACos   = 0,
    HasATan   = 0,
-    HasSinh    = 0,
-    HasCosh    = 0,
-    HasTanh    = 0,
+    HasSinh   = 0,
+    HasCosh   = 0,
+    HasTanh   = 0,
+    HasLGamma = 0,
+    HasDiGamma = 0,
+    HasZeta = 0,
+    HasPolygamma = 0,
+    HasErf = 0,
+    HasErfc = 0,
+    HasNdtri = 0,
+    HasBessel = 0,
+    HasIGamma = 0,
+    HasIGammaDerA = 0,
+    HasGammaSampleDerAlpha = 0,
+    HasIGammac = 0,
+    HasBetaInc = 0,

    HasRound  = 0,
    HasFloor  = 0,
-    HasCeil   = 0
+    HasCeil   = 0,
+
+    HasSign   = 0
  };
 };

@@ -128,16 +146,24 @@ pcast(const SrcPacket& a, const SrcPacket& /*b*/) {
  return static_cast<TgtPacket>(a);
 }

+template <typename SrcPacket, typename TgtPacket>
+EIGEN_DEVICE_FUNC inline TgtPacket
+pcast(const SrcPacket& a, const SrcPacket& /*b*/, const SrcPacket& /*c*/, const SrcPacket& /*d*/) {
+  return static_cast<TgtPacket>(a);
+}
+
+/** \internal \returns reinterpret_cast<Target>(a) */
+template <typename Target, typename Packet>
+EIGEN_DEVICE_FUNC inline Target
+preinterpret(const Packet& a); /* { return reinterpret_cast<const Target&>(a); } */

 /** \internal \returns a + b (coeff-wise) */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-padd(const Packet& a,
-        const Packet& b) { return a+b; }
+padd(const Packet& a, const Packet& b) { return a+b; }

 /** \internal \returns a - b (coeff-wise) */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-psub(const Packet& a,
-        const Packet& b) { return a-b; }
+psub(const Packet& a, const Packet& b) { return a-b; }

 /** \internal \returns -a (coeff-wise) */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
@@ -150,23 +176,19 @@ pconj(const Packet& a) { return numext::conj(a); }

 /** \internal \returns a * b (coeff-wise) */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pmul(const Packet& a,
-        const Packet& b) { return a*b; }
+pmul(const Packet& a, const Packet& b) { return a*b; }

 /** \internal \returns a / b (coeff-wise) */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pdiv(const Packet& a,
-        const Packet& b) { return a/b; }
+pdiv(const Packet& a, const Packet& b) { return a/b; }

 /** \internal \returns the min of \a a and \a b  (coeff-wise) */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pmin(const Packet& a,
-        const Packet& b) { return numext::mini(a, b); }
+pmin(const Packet& a, const Packet& b) { return numext::mini(a, b); }

 /** \internal \returns the max of \a a and \a b  (coeff-wise) */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pmax(const Packet& a,
-        const Packet& b) { return numext::maxi(a, b); }
+pmax(const Packet& a, const Packet& b) { return numext::maxi(a, b); }

 /** \internal \returns the absolute value of \a a */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
@@ -190,7 +212,101 @@ pxor(const Packet& a, const Packet& b) { return a ^ b; }

 /** \internal \returns the bitwise andnot of \a a and \a b */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pandnot(const Packet& a, const Packet& b) { return a & (!b); }
+pandnot(const Packet& a, const Packet& b) { return a & (~b); }
+
+/** \internal \returns ones */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+ptrue(const Packet& /*a*/) { Packet b; memset((void*)&b, 0xff, sizeof(b)); return b;}
+
+template <typename RealScalar>
+EIGEN_DEVICE_FUNC inline std::complex<RealScalar> ptrue(const std::complex<RealScalar>& /*a*/) {
+  RealScalar b;
+  b = ptrue(b);
+  return std::complex<RealScalar>(b, b);
+}
+
+/** \internal \returns the bitwise not of \a a */
+template <typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pnot(const Packet& a) { return pxor(ptrue(a), a);}
+
+/** \internal \returns \a a shifted by N bits to the right */
+template<int N> EIGEN_DEVICE_FUNC inline int
+pshiftright(const int& a) { return a >> N; }
+template<int N> EIGEN_DEVICE_FUNC inline long int
+pshiftright(const long int& a) { return a >> N; }
+
+/** \internal \returns \a a shifted by N bits to the left */
+template<int N> EIGEN_DEVICE_FUNC inline int
+pshiftleft(const int& a) { return a << N; }
+template<int N> EIGEN_DEVICE_FUNC inline long int
+pshiftleft(const long int& a) { return a << N; }
+
+/** \internal \returns the significand and exponent of the underlying floating point numbers
+  * See https://en.cppreference.com/w/cpp/numeric/math/frexp
+  */
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline Packet pfrexp(const Packet& a, Packet& exponent) {
+  int exp;
+  EIGEN_USING_STD_MATH(frexp);
+  Packet result = frexp(a, &exp);
+  exponent = static_cast<Packet>(exp);
+  return result;
+}
+
+/** \internal \returns a * 2^exponent
+  * See https://en.cppreference.com/w/cpp/numeric/math/ldexp
+  */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pldexp(const Packet &a, const Packet &exponent) {
+  EIGEN_USING_STD_MATH(ldexp);
+  return ldexp(a, static_cast<int>(exponent));
+}
+
+/** \internal \returns zeros */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pzero(const Packet& a) { return pxor(a,a); }
+
+template<> EIGEN_DEVICE_FUNC inline float pzero<float>(const float& a) {
+  EIGEN_UNUSED_VARIABLE(a);
+  return 0.f;
+}
+
+template<> EIGEN_DEVICE_FUNC inline double pzero<double>(const double& a) {
+  EIGEN_UNUSED_VARIABLE(a);
+  return 0.;
+}
+
+/** \internal \returns bits of \a a or \a b according to the input bit mask \a mask */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pselect(const Packet& mask, const Packet& a, const Packet& b) {
+  return por(pand(a,mask),pandnot(b,mask));
+}
+
+template<> EIGEN_DEVICE_FUNC inline float pselect<float>(
+    const float& mask, const float& a, const float&b) {
+  return numext::equal_strict(mask,0.f) ? b : a;
+}
+
+template<> EIGEN_DEVICE_FUNC inline double pselect<double>(
+    const double& mask, const double& a, const double& b) {
+  return numext::equal_strict(mask,0.) ? b : a;
+}
+
+/** \internal \returns a <= b as a bit mask */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pcmp_le(const Packet& a, const Packet& b)  { return a<=b ? ptrue(a) : pzero(a); }
+
+/** \internal \returns a < b as a bit mask */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pcmp_lt(const Packet& a, const Packet& b)  { return a<b ? ptrue(a) : pzero(a); }
+
+/** \internal \returns a == b as a bit mask */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pcmp_eq(const Packet& a, const Packet& b) { return a==b ? ptrue(a) : pzero(a); }
+
+/** \internal \returns a < b or a==NaN or b==NaN as a bit mask */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pcmp_lt_or_nan(const Packet& a, const Packet& b) { return pnot(pcmp_le(b,a)); } 

 /** \internal \returns a packet version of \a *from, from must be 16 bytes aligned */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
@@ -200,10 +316,22 @@ pload(const typename unpacket_traits<Packet>::type* from) { return *from; }
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 ploadu(const typename unpacket_traits<Packet>::type* from) { return *from; }

+/** \internal \returns a packet version of \a *from, (un-aligned masked load)
+ * There is no generic implementation. We only have implementations for specialized
+ * cases. Generic case should not be called.
+ */
+template<typename Packet> EIGEN_DEVICE_FUNC inline
+typename enable_if<unpacket_traits<Packet>::masked_load_available, Packet>::type
+ploadu(const typename unpacket_traits<Packet>::type* from, typename unpacket_traits<Packet>::mask_t umask);
+
 /** \internal \returns a packet with constant coefficients \a a, e.g.: (a,a,a,a) */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 pset1(const typename unpacket_traits<Packet>::type& a) { return a; }

+/** \internal \returns a packet with constant coefficients set from bits */
+template<typename Packet,typename BitsType> EIGEN_DEVICE_FUNC inline Packet
+pset1frombits(BitsType a);
+
 /** \internal \returns a packet with constant coefficients \a a[0], e.g.: (a[0],a[0],a[0],a[0]) */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 pload1(const typename unpacket_traits<Packet>::type  *a) { return pset1<Packet>(*a); }
@@ -213,7 +341,7 @@ pload1(const typename unpacket_traits<Packet>::type  *a) { return pset1<Packet>(
  * duplicated to form: {from[0],from[0],from[1],from[1],from[2],from[2],from[3],from[3]}
  * Currently, this function is only used for scalar * complex products.
  */
-template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet
 ploaddup(const typename unpacket_traits<Packet>::type* from) { return *from; }

 /** \internal \returns a packet with elements of \a *from quadrupled.
@@ -261,7 +389,7 @@ inline void pbroadcast2(const typename unpacket_traits<Packet>::type *a,
 }

 /** \internal \brief Returns a packet with coefficients (a,a+1,...,a+packet_size-1). */
-template<typename Packet> inline Packet
+template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet
 plset(const typename unpacket_traits<Packet>::type& a) { return a; }

 /** \internal copy the packet \a from to \a *to, \a to must be 16 bytes aligned */
@@ -272,6 +400,15 @@ template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstore(
 template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstoreu(Scalar* to, const Packet& from)
 {  (*to) = from; }

+/** \internal copy the packet \a from to \a *to, (un-aligned store with a mask)
+ * There is no generic implementation. We only have implementations for specialized
+ * cases. Generic case should not be called.
+ */
+template<typename Scalar, typename Packet>
+EIGEN_DEVICE_FUNC inline
+typename enable_if<unpacket_traits<Packet>::masked_store_available, void>::type
+pstoreu(Scalar* to, const Packet& from, typename unpacket_traits<Packet>::mask_t umask);
+
 template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline Packet pgather(const Scalar* from, Index /*stride*/)
 { return ploadu<Packet>(from); }

@@ -279,9 +416,11 @@ template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstoreu
 { pstore(to, from); }

 /** \internal tries to do cache prefetching of \a addr */
-template<typename Scalar> inline void prefetch(const Scalar* addr)
+template<typename Scalar> EIGEN_DEVICE_FUNC inline void prefetch(const Scalar* addr)
 {
-#ifdef __CUDA_ARCH__
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+  // do nothing
+#elif defined(EIGEN_CUDA_ARCH)
 #if defined(__LP64__)
  // 64-bit pointer operand constraint for inlined asm
  asm(" prefetch.L1 [ %1 ];" : "=l"(addr) : "l"(addr));
@@ -289,7 +428,7 @@ template<typename Scalar> inline void prefetch(const Scalar* addr)
  // 32-bit pointer operand constraint for inlined asm
  asm(" prefetch.L1 [ %1 ];" : "=r"(addr) : "r"(addr));
 #endif
-#elif !EIGEN_COMP_MSVC
+#elif (!EIGEN_COMP_MSVC) && (EIGEN_COMP_GNUC || EIGEN_COMP_CLANG || EIGEN_COMP_ICC)
  __builtin_prefetch(addr);
 #endif
 }
@@ -306,54 +445,56 @@ preduxp(const Packet* vecs) { return vecs[0]; }
 template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux(const Packet& a)
 { return a; }

-/** \internal \returns the sum of the elements of \a a by block of 4 elements.
+/** \internal \returns the sum of the elements of upper and lower half of \a a if \a a is larger than 4.
  * For a packet {a0, a1, a2, a3, a4, a5, a6, a7}, it returns a half packet {a0+a4, a1+a5, a2+a6, a3+a7}
  * For packet-size smaller or equal to 4, this boils down to a noop.
  */
 template<typename Packet> EIGEN_DEVICE_FUNC inline
 typename conditional<(unpacket_traits<Packet>::size%8)==0,typename unpacket_traits<Packet>::half,Packet>::type
-predux4(const Packet& a)
+predux_half_dowto4(const Packet& a)
 { return a; }

-/** \internal \returns the product of the elements of \a a*/
+/** \internal \returns the product of the elements of \a a */
 template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_mul(const Packet& a)
 { return a; }

-/** \internal \returns the min of the elements of \a a*/
+/** \internal \returns the min of the elements of \a a */
 template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_min(const Packet& a)
 { return a; }

-/** \internal \returns the max of the elements of \a a*/
+/** \internal \returns the max of the elements of \a a */
 template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_max(const Packet& a)
 { return a; }

+/** \internal \returns true if all coeffs of \a a mean "true"
+  * It is supposed to be called on values returned by pcmp_*.
+  */
+// not needed yet
+// template<typename Packet> EIGEN_DEVICE_FUNC inline bool predux_all(const Packet& a)
+// { return bool(a); }
+
+/** \internal \returns true if any coeff of \a a means "true"
+  * It is supposed to be called on values returned by pcmp_*.
+  */
+template<typename Packet> EIGEN_DEVICE_FUNC inline bool predux_any(const Packet& a)
+{
+  // Dirty but generic implementation where "true" is assumed to be non-zero and the same for all coefficients.
+  // It is expected that "true" is either:
+  //  - Scalar(1)
+  //  - bits full of ones (NaN for floats),
+  //  - or first bit equal to 1 (1 for ints, smallest denormal for floats).
+  // For all these cases, taking the sum is just fine, and this boils down to a no-op for scalars.
+  return bool(predux(a));
+}
+
 /** \internal \returns the reversed elements of \a a*/
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet preverse(const Packet& a)
 { return a; }

-template<size_t offset, typename Packet>
-struct protate_impl
-{
-  // Empty so attempts to use this unimplemented path will fail to compile.
-  // Only specializations of this template should be used.
-};
-
-/** \internal \returns a packet with the coefficients rotated to the right in little-endian convention,
-  * by the given offset, e.g. for offset == 1:
-  *     (packet[3], packet[2], packet[1], packet[0]) becomes (packet[0], packet[3], packet[2], packet[1])
-  */
-template<size_t offset, typename Packet> EIGEN_DEVICE_FUNC inline Packet protate(const Packet& a)
-{
-  return offset ? protate_impl<offset, Packet>::run(a) : a;
-}
-
 /** \internal \returns \a a with real and imaginary part flipped (for complex type only) */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pcplxflip(const Packet& a)
 {
-  // FIXME: uncomment the following in case we drop the internal imag and real functions.
-//   using std::imag;
-//   using std::real;
-  return Packet(imag(a),real(a));
+  return Packet(numext::imag(a),numext::real(a));
 }

 /**************************
@@ -362,55 +503,63 @@ template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pcplxflip(const Packet

 /** \internal \returns the sine of \a a (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet psin(const Packet& a) { using std::sin; return sin(a); }
+Packet psin(const Packet& a) { EIGEN_USING_STD_MATH(sin); return sin(a); }

 /** \internal \returns the cosine of \a a (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet pcos(const Packet& a) { using std::cos; return cos(a); }
+Packet pcos(const Packet& a) { EIGEN_USING_STD_MATH(cos); return cos(a); }

 /** \internal \returns the tan of \a a (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet ptan(const Packet& a) { using std::tan; return tan(a); }
+Packet ptan(const Packet& a) { EIGEN_USING_STD_MATH(tan); return tan(a); }

 /** \internal \returns the arc sine of \a a (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet pasin(const Packet& a) { using std::asin; return asin(a); }
+Packet pasin(const Packet& a) { EIGEN_USING_STD_MATH(asin); return asin(a); }

 /** \internal \returns the arc cosine of \a a (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet pacos(const Packet& a) { using std::acos; return acos(a); }
+Packet pacos(const Packet& a) { EIGEN_USING_STD_MATH(acos); return acos(a); }

 /** \internal \returns the arc tangent of \a a (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet patan(const Packet& a) { using std::atan; return atan(a); }
+Packet patan(const Packet& a) { EIGEN_USING_STD_MATH(atan); return atan(a); }

 /** \internal \returns the hyperbolic sine of \a a (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet psinh(const Packet& a) { using std::sinh; return sinh(a); }
+Packet psinh(const Packet& a) { EIGEN_USING_STD_MATH(sinh); return sinh(a); }

 /** \internal \returns the hyperbolic cosine of \a a (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet pcosh(const Packet& a) { using std::cosh; return cosh(a); }
+Packet pcosh(const Packet& a) { EIGEN_USING_STD_MATH(cosh); return cosh(a); }

 /** \internal \returns the hyperbolic tan of \a a (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet ptanh(const Packet& a) { using std::tanh; return tanh(a); }
+Packet ptanh(const Packet& a) { EIGEN_USING_STD_MATH(tanh); return tanh(a); }

 /** \internal \returns the exp of \a a (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet pexp(const Packet& a) { using std::exp; return exp(a); }
+Packet pexp(const Packet& a) { EIGEN_USING_STD_MATH(exp); return exp(a); }
+
+/** \internal \returns the expm1 of \a a (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pexpm1(const Packet& a) { return numext::expm1(a); }

 /** \internal \returns the log of \a a (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet plog(const Packet& a) { using std::log; return log(a); }
+Packet plog(const Packet& a) { EIGEN_USING_STD_MATH(log); return log(a); }
+
+/** \internal \returns the log1p of \a a (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet plog1p(const Packet& a) { return numext::log1p(a); }

 /** \internal \returns the log10 of \a a (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet plog10(const Packet& a) { using std::log10; return log10(a); }
+Packet plog10(const Packet& a) { EIGEN_USING_STD_MATH(log10); return log10(a); }

 /** \internal \returns the square-root of \a a (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet psqrt(const Packet& a) { using std::sqrt; return sqrt(a); }
+Packet psqrt(const Packet& a) { EIGEN_USING_STD_MATH(sqrt); return sqrt(a); }

 /** \internal \returns the reciprocal square-root of \a a (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
@@ -434,7 +583,7 @@ Packet pceil(const Packet& a) { using numext::ceil; return ceil(a); }
 * The following functions might not have to be overwritten for vectorized types
 ***************************************************************************/

-/** \internal copy a packet with constant coeficient \a a (e.g., [a,a,a,a]) to \a *to. \a to must be 16 bytes aligned */
+/** \internal copy a packet with constant coefficient \a a (e.g., [a,a,a,a]) to \a *to. \a to must be 16 bytes aligned */
 // NOTE: this function must really be templated on the packet type (think about different packet types for the same scalar type)
 template<typename Packet>
 inline void pstore1(typename unpacket_traits<Packet>::type* to, const typename unpacket_traits<Packet>::type& a)
@@ -477,7 +626,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret(Scalar* to, const Packet& fro
  * by the current computation.
  */
 template<typename Packet, int LoadMode>
-inline Packet ploadt_ro(const typename unpacket_traits<Packet>::type* from)
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet ploadt_ro(const typename unpacket_traits<Packet>::type* from)
 {
  return ploadt<Packet, LoadMode>(from);
 }
@@ -516,13 +665,13 @@ inline void palign(PacketType& first, const PacketType& second)
 ***************************************************************************/

 // Eigen+CUDA does not support complexes.
-#ifndef __CUDACC__
+#if !defined(EIGEN_GPUCC)

 template<> inline std::complex<float> pmul(const std::complex<float>& a, const std::complex<float>& b)
-{ return std::complex<float>(real(a)*real(b) - imag(a)*imag(b), imag(a)*real(b) + real(a)*imag(b)); }
+{ return std::complex<float>(a.real()*b.real() - a.imag()*b.imag(), a.imag()*b.real() + a.real()*b.imag()); }

 template<> inline std::complex<double> pmul(const std::complex<double>& a, const std::complex<double>& b)
-{ return std::complex<double>(real(a)*real(b) - imag(a)*imag(b), imag(a)*real(b) + real(a)*imag(b)); }
+{ return std::complex<double>(a.real()*b.real() - a.imag()*b.imag(), a.imag()*b.real() + a.real()*b.imag()); }

 #endif

@@ -553,6 +702,50 @@ pblend(const Selector<unpacket_traits<Packet>::size>& ifPacket, const Packet& th
  return ifPacket.select[0] ? thenPacket : elsePacket;
 }

+/** \internal \returns \a a with the first coefficient replaced by the scalar b */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pinsertfirst(const Packet& a, typename unpacket_traits<Packet>::type b)
+{
+  // Default implementation based on pblend.
+  // It must be specialized for higher performance.
+  Selector<unpacket_traits<Packet>::size> mask;
+  mask.select[0] = true;
+  // This for loop should be optimized away by the compiler.
+  for(Index i=1; i<unpacket_traits<Packet>::size; ++i)
+    mask.select[i] = false;
+  return pblend(mask, pset1<Packet>(b), a);
+}
+
+/** \internal \returns \a a with the last coefficient replaced by the scalar b */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pinsertlast(const Packet& a, typename unpacket_traits<Packet>::type b)
+{
+  // Default implementation based on pblend.
+  // It must be specialized for higher performance.
+  Selector<unpacket_traits<Packet>::size> mask;
+  // This for loop should be optimized away by the compiler.
+  for(Index i=0; i<unpacket_traits<Packet>::size-1; ++i)
+    mask.select[i] = false;
+  mask.select[unpacket_traits<Packet>::size-1] = true;
+  return pblend(mask, pset1<Packet>(b), a);
+}
+
+/***************************************************************************
+ * Some generic implementations to be used by implementors
+***************************************************************************/
+
+/** Default implementation of pfrexp for float.
+  * It is expected to be called by implementers of template<> pfrexp.
+  */
+template<typename Packet> EIGEN_STRONG_INLINE Packet
+pfrexp_float(const Packet& a, Packet& exponent);
+
+/** Default implementation of pldexp for float.
+  * It is expected to be called by implementers of template<> pldexp.
+  */
+template<typename Packet> EIGEN_STRONG_INLINE Packet
+pldexp_float(Packet a, Packet exponent);
+
 } // end namespace internal

 } // end namespace Eigen
--- a/Eigen/src/Core/GlobalFunctions.h
+++ b/Eigen/src/Core/GlobalFunctions.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2010-2012 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2010-2016 Gael Guennebaud <gael.guennebaud@inria.fr>
 // Copyright (C) 2010 Benoit Jacob <jacob.benoit.1@gmail.com>
 //
 // This Source Code Form is subject to the terms of the Mozilla
@@ -11,13 +11,30 @@
 #ifndef EIGEN_GLOBAL_FUNCTIONS_H
 #define EIGEN_GLOBAL_FUNCTIONS_H

-#define EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(NAME,FUNCTOR) \
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+
+#define EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(NAME,FUNCTOR,DOC_OP,DOC_DETAILS) \
+  /** \returns an expression of the coefficient-wise DOC_OP of \a x
+
+    DOC_DETAILS
+
+    \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_##NAME">Math functions</a>, class CwiseUnaryOp
+    */ \
+  template<typename Derived> \
+  inline const Eigen::CwiseUnaryOp<Eigen::internal::FUNCTOR<typename Derived::Scalar>, const Derived> \
+  NAME(const Eigen::ArrayBase<Derived>& x);
+
+#else
+
+#define EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(NAME,FUNCTOR,DOC_OP,DOC_DETAILS) \
  template<typename Derived> \
  inline const Eigen::CwiseUnaryOp<Eigen::internal::FUNCTOR<typename Derived::Scalar>, const Derived> \
  (NAME)(const Eigen::ArrayBase<Derived>& x) { \
    return Eigen::CwiseUnaryOp<Eigen::internal::FUNCTOR<typename Derived::Scalar>, const Derived>(x.derived()); \
  }

+#endif // EIGEN_PARSED_BY_DOXYGEN
+
 #define EIGEN_ARRAY_DECLARE_GLOBAL_EIGEN_UNARY(NAME,FUNCTOR) \
  \
  template<typename Derived> \
@@ -36,40 +53,77 @@

 namespace Eigen
 {
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(real,scalar_real_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(imag,scalar_imag_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(conj,scalar_conjugate_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(inverse,scalar_inverse_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sin,scalar_sin_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(cos,scalar_cos_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(tan,scalar_tan_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(atan,scalar_atan_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(asin,scalar_asin_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(acos,scalar_acos_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sinh,scalar_sinh_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(cosh,scalar_cosh_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(tanh,scalar_tanh_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(exp,scalar_exp_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log,scalar_log_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log10,scalar_log10_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(abs,scalar_abs_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(abs2,scalar_abs2_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(arg,scalar_arg_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sqrt,scalar_sqrt_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(square,scalar_square_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(cube,scalar_cube_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(round,scalar_round_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(floor,scalar_floor_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(ceil,scalar_ceil_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(isnan,scalar_isnan_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(isinf,scalar_isinf_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(isfinite,scalar_isfinite_op)
-  
-  template<typename Derived>
-  inline const Eigen::CwiseUnaryOp<Eigen::internal::scalar_pow_op<typename Derived::Scalar>, const Derived>
-  pow(const Eigen::ArrayBase<Derived>& x, const typename Derived::Scalar& exponent) {
-    return x.derived().pow(exponent);
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(real,scalar_real_op,real part,\sa ArrayBase::real)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(imag,scalar_imag_op,imaginary part,\sa ArrayBase::imag)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(conj,scalar_conjugate_op,complex conjugate,\sa ArrayBase::conjugate)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(inverse,scalar_inverse_op,inverse,\sa ArrayBase::inverse)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sin,scalar_sin_op,sine,\sa ArrayBase::sin)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(cos,scalar_cos_op,cosine,\sa ArrayBase::cos)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(tan,scalar_tan_op,tangent,\sa ArrayBase::tan)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(atan,scalar_atan_op,arc-tangent,\sa ArrayBase::atan)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(asin,scalar_asin_op,arc-sine,\sa ArrayBase::asin)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(acos,scalar_acos_op,arc-cosine,\sa ArrayBase::acos)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sinh,scalar_sinh_op,hyperbolic sine,\sa ArrayBase::sinh)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(cosh,scalar_cosh_op,hyperbolic cosine,\sa ArrayBase::cosh)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(tanh,scalar_tanh_op,hyperbolic tangent,\sa ArrayBase::tanh)
+#if EIGEN_HAS_CXX11_MATH
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(asinh,scalar_asinh_op,inverse hyperbolic sine,\sa ArrayBase::asinh)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(acosh,scalar_acosh_op,inverse hyperbolic cosine,\sa ArrayBase::acosh)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(atanh,scalar_atanh_op,inverse hyperbolic tangent,\sa ArrayBase::atanh)
+#endif
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(logistic,scalar_logistic_op,logistic function,\sa ArrayBase::logistic)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(lgamma,scalar_lgamma_op,natural logarithm of the gamma function,\sa ArrayBase::lgamma)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(digamma,scalar_digamma_op,derivative of lgamma,\sa ArrayBase::digamma)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(erf,scalar_erf_op,error function,\sa ArrayBase::erf)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(erfc,scalar_erfc_op,complementary error function,\sa ArrayBase::erfc)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(ndtri,scalar_ndtri_op,inverse normal distribution function,\sa ArrayBase::ndtri)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(exp,scalar_exp_op,exponential,\sa ArrayBase::exp)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(expm1,scalar_expm1_op,exponential of a value minus 1,\sa ArrayBase::expm1)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log,scalar_log_op,natural logarithm,\sa Eigen::log10 DOXCOMMA ArrayBase::log)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log1p,scalar_log1p_op,natural logarithm of 1 plus the value,\sa ArrayBase::log1p)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log10,scalar_log10_op,base 10 logarithm,\sa Eigen::log DOXCOMMA ArrayBase::log10)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(abs,scalar_abs_op,absolute value,\sa ArrayBase::abs DOXCOMMA MatrixBase::cwiseAbs)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(abs2,scalar_abs2_op,squared absolute value,\sa ArrayBase::abs2 DOXCOMMA MatrixBase::cwiseAbs2)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(arg,scalar_arg_op,complex argument,\sa ArrayBase::arg)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sqrt,scalar_sqrt_op,square root,\sa ArrayBase::sqrt DOXCOMMA MatrixBase::cwiseSqrt)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(rsqrt,scalar_rsqrt_op,reciprocal square root,\sa ArrayBase::rsqrt)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(square,scalar_square_op,square (power 2),\sa Eigen::abs2 DOXCOMMA Eigen::pow DOXCOMMA ArrayBase::square)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(cube,scalar_cube_op,cube (power 3),\sa Eigen::pow DOXCOMMA ArrayBase::cube)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(round,scalar_round_op,nearest integer,\sa Eigen::floor DOXCOMMA Eigen::ceil DOXCOMMA ArrayBase::round)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(floor,scalar_floor_op,nearest integer not greater than the given value,\sa Eigen::ceil DOXCOMMA ArrayBase::floor)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(ceil,scalar_ceil_op,nearest integer not less than the given value,\sa Eigen::floor DOXCOMMA ArrayBase::ceil)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(isnan,scalar_isnan_op,not-a-number test,\sa Eigen::isinf DOXCOMMA Eigen::isfinite DOXCOMMA ArrayBase::isnan)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(isinf,scalar_isinf_op,infinite value test,\sa Eigen::isnan DOXCOMMA Eigen::isfinite DOXCOMMA ArrayBase::isinf)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(isfinite,scalar_isfinite_op,finite value test,\sa Eigen::isinf DOXCOMMA Eigen::isnan DOXCOMMA ArrayBase::isfinite)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sign,scalar_sign_op,sign (or 0),\sa ArrayBase::sign)
+
+  /** \returns an expression of the coefficient-wise power of \a x to the given constant \a exponent.
+    *
+    * \tparam ScalarExponent is the scalar type of \a exponent. It must be compatible with the scalar type of the given expression (\c Derived::Scalar).
+    *
+    * \sa ArrayBase::pow()
+    *
+    * \relates ArrayBase
+    */
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+  template<typename Derived,typename ScalarExponent>
+  inline const CwiseBinaryOp<internal::scalar_pow_op<Derived::Scalar,ScalarExponent>,Derived,Constant<ScalarExponent> >
+  pow(const Eigen::ArrayBase<Derived>& x, const ScalarExponent& exponent);
+#else
+  template <typename Derived,typename ScalarExponent>
+  EIGEN_DEVICE_FUNC inline
+  EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(
+    const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,typename internal::promote_scalar_arg<typename Derived::Scalar
+                                                 EIGEN_COMMA ScalarExponent EIGEN_COMMA
+                                                 EIGEN_SCALAR_BINARY_SUPPORTED(pow,typename Derived::Scalar,ScalarExponent)>::type,pow))
+  pow(const Eigen::ArrayBase<Derived>& x, const ScalarExponent& exponent)
+  {
+    typedef typename internal::promote_scalar_arg<typename Derived::Scalar,ScalarExponent,
+                                                  EIGEN_SCALAR_BINARY_SUPPORTED(pow,typename Derived::Scalar,ScalarExponent)>::type PromotedExponent;
+    return EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,PromotedExponent,pow)(x.derived(),
+           typename internal::plain_constant_type<Derived,PromotedExponent>::type(x.derived().rows(), x.derived().cols(), internal::scalar_constant_op<PromotedExponent>(exponent)));
  }
+#endif

  /** \returns an expression of the coefficient-wise power of \a x to the given array of \a exponents.
    *
@@ -77,52 +131,53 @@ namespace Eigen
    *
    * Example: \include Cwise_array_power_array.cpp
    * Output: \verbinclude Cwise_array_power_array.out
-    * 
+    *
    * \sa ArrayBase::pow()
+    *
+    * \relates ArrayBase
    */
  template<typename Derived,typename ExponentDerived>
-  inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_binary_pow_op<typename Derived::Scalar, typename ExponentDerived::Scalar>, const Derived, const ExponentDerived>
-  pow(const Eigen::ArrayBase<Derived>& x, const Eigen::ArrayBase<ExponentDerived>& exponents) 
+  inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_pow_op<typename Derived::Scalar, typename ExponentDerived::Scalar>, const Derived, const ExponentDerived>
+  pow(const Eigen::ArrayBase<Derived>& x, const Eigen::ArrayBase<ExponentDerived>& exponents)
  {
-    return Eigen::CwiseBinaryOp<Eigen::internal::scalar_binary_pow_op<typename Derived::Scalar, typename ExponentDerived::Scalar>, const Derived, const ExponentDerived>(
+    return Eigen::CwiseBinaryOp<Eigen::internal::scalar_pow_op<typename Derived::Scalar, typename ExponentDerived::Scalar>, const Derived, const ExponentDerived>(
      x.derived(),
      exponents.derived()
    );
  }
-  
+
  /** \returns an expression of the coefficient-wise power of the scalar \a x to the given array of \a exponents.
    *
    * This function computes the coefficient-wise power between a scalar and an array of exponents.
-    * Beaware that the scalar type of the input scalar \a x and the exponents \a exponents must be the same.
+    *
+    * \tparam Scalar is the scalar type of \a x. It must be compatible with the scalar type of the given array expression (\c Derived::Scalar).
    *
    * Example: \include Cwise_scalar_power_array.cpp
    * Output: \verbinclude Cwise_scalar_power_array.out
-    * 
+    *
    * \sa ArrayBase::pow()
+    *
+    * \relates ArrayBase
    */
-  template<typename Derived>
-  inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_binary_pow_op<typename Derived::Scalar, typename Derived::Scalar>, const typename Derived::ConstantReturnType, const Derived>
-  pow(const typename Derived::Scalar& x, const Eigen::ArrayBase<Derived>& exponents) 
-  {
-    typename Derived::ConstantReturnType constant_x(exponents.rows(), exponents.cols(), x);
-    return Eigen::CwiseBinaryOp<Eigen::internal::scalar_binary_pow_op<typename Derived::Scalar, typename Derived::Scalar>, const typename Derived::ConstantReturnType, const Derived>(
-      constant_x,
-      exponents.derived()
-    );
-  }
-  
-  /**
-  * \brief Component-wise division of a scalar by array elements.
-  **/
-  template <typename Derived>
-  inline const Eigen::CwiseUnaryOp<Eigen::internal::scalar_inverse_mult_op<typename Derived::Scalar>, const Derived>
-    operator/(const typename Derived::Scalar& s, const Eigen::ArrayBase<Derived>& a)
-  {
-    return Eigen::CwiseUnaryOp<Eigen::internal::scalar_inverse_mult_op<typename Derived::Scalar>, const Derived>(
-      a.derived(),
-      Eigen::internal::scalar_inverse_mult_op<typename Derived::Scalar>(s)  
-    );
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+  template<typename Scalar,typename Derived>
+  inline const CwiseBinaryOp<internal::scalar_pow_op<Scalar,Derived::Scalar>,Constant<Scalar>,Derived>
+  pow(const Scalar& x,const Eigen::ArrayBase<Derived>& exponents);
+#else
+  template <typename Scalar, typename Derived>
+  EIGEN_DEVICE_FUNC inline
+  EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(
+    const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(typename internal::promote_scalar_arg<typename Derived::Scalar
+                                                 EIGEN_COMMA Scalar EIGEN_COMMA
+                                                 EIGEN_SCALAR_BINARY_SUPPORTED(pow,Scalar,typename Derived::Scalar)>::type,Derived,pow))
+  pow(const Scalar& x, const Eigen::ArrayBase<Derived>& exponents) {
+    typedef typename internal::promote_scalar_arg<typename Derived::Scalar,Scalar,
+                                                  EIGEN_SCALAR_BINARY_SUPPORTED(pow,Scalar,typename Derived::Scalar)>::type PromotedScalar;
+    return EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(PromotedScalar,Derived,pow)(
+           typename internal::plain_constant_type<Derived,PromotedScalar>::type(exponents.derived().rows(), exponents.derived().cols(), internal::scalar_constant_op<PromotedScalar>(x)), exponents.derived());
  }
+#endif
+

  namespace internal
  {
--- a/Eigen/src/Core/IO.h
+++ b/Eigen/src/Core/IO.h
@@ -41,6 +41,7 @@ std::ostream & print_matrix(std::ostream & s, const Derived& _m, const IOFormat&
  *  - \b rowSuffix string printed at the end of each row
  *  - \b matPrefix string printed at the beginning of the matrix
  *  - \b matSuffix string printed at the end of the matrix
+  *  - \b fill character printed to fill the empty space in aligned columns
  *
  * Example: \include IOFormat.cpp
  * Output: \verbinclude IOFormat.out
@@ -53,9 +54,9 @@ struct IOFormat
  IOFormat(int _precision = StreamPrecision, int _flags = 0,
    const std::string& _coeffSeparator = " ",
    const std::string& _rowSeparator = "\n", const std::string& _rowPrefix="", const std::string& _rowSuffix="",
-    const std::string& _matPrefix="", const std::string& _matSuffix="")
+    const std::string& _matPrefix="", const std::string& _matSuffix="", const char _fill=' ')
  : matPrefix(_matPrefix), matSuffix(_matSuffix), rowPrefix(_rowPrefix), rowSuffix(_rowSuffix), rowSeparator(_rowSeparator),
-    rowSpacer(""), coeffSeparator(_coeffSeparator), precision(_precision), flags(_flags)
+    rowSpacer(""), coeffSeparator(_coeffSeparator), fill(_fill), precision(_precision), flags(_flags)
  {
    // TODO check if rowPrefix, rowSuffix or rowSeparator contains a newline
    // don't add rowSpacer if columns are not to be aligned
@@ -71,6 +72,7 @@ struct IOFormat
  std::string matPrefix, matSuffix;
  std::string rowPrefix, rowSuffix, rowSeparator, rowSpacer;
  std::string coeffSeparator;
+  char fill;
  int precision;
  int flags;
 };
@@ -80,7 +82,7 @@ struct IOFormat
  *
  * \brief Pseudo expression providing matrix output with given format
  *
-  * \param ExpressionType the type of the object on which IO stream operations are performed
+  * \tparam ExpressionType the type of the object on which IO stream operations are performed
  *
  * This class represents an expression with stream operators controlled by a given IOFormat.
  * It is the return type of DenseBase::format()
@@ -105,51 +107,23 @@ class WithFormat
    }

  protected:
-    const typename ExpressionType::Nested m_matrix;
+    typename ExpressionType::Nested m_matrix;
    IOFormat m_format;
 };

-/** \returns a WithFormat proxy object allowing to print a matrix the with given
-  * format \a fmt.
-  *
-  * See class IOFormat for some examples.
-  *
-  * \sa class IOFormat, class WithFormat
-  */
-template<typename Derived>
-inline const WithFormat<Derived>
-DenseBase<Derived>::format(const IOFormat& fmt) const
-{
-  return WithFormat<Derived>(derived(), fmt);
-}
-
 namespace internal {

-template<typename Scalar, bool IsInteger>
-struct significant_decimals_default_impl
-{
-  typedef typename NumTraits<Scalar>::Real RealScalar;
-  static inline int run()
-  {
-    using std::ceil;
-    using std::log;
-    return cast<RealScalar,int>(ceil(-log(NumTraits<RealScalar>::epsilon())/log(RealScalar(10))));
-  }
-};
-
-template<typename Scalar>
-struct significant_decimals_default_impl<Scalar, true>
-{
-  static inline int run()
-  {
-    return 0;
-  }
-};
-
+// NOTE: This helper is kept for backward compatibility with previous code specializing
+//       this internal::significant_decimals_impl structure. In the future we should directly
+//       call digits10() which has been introduced in July 2016 in 3.3.
 template<typename Scalar>
 struct significant_decimals_impl
-  : significant_decimals_default_impl<Scalar, NumTraits<Scalar>::IsInteger>
-{};
+{
+  static inline int run()
+  {
+    return NumTraits<Scalar>::digits10();
+  }
+};

 /** \internal
  * print the matrix \a _m to the output stream \a s using the output format \a fmt */
@@ -204,18 +178,26 @@ std::ostream & print_matrix(std::ostream & s, const Derived& _m, const IOFormat&
        width = std::max<Index>(width, Index(sstr.str().length()));
      }
  }
+  std::streamsize old_width = s.width();
+  char old_fill_character = s.fill();
  s << fmt.matPrefix;
  for(Index i = 0; i < m.rows(); ++i)
  {
    if (i)
      s << fmt.rowSpacer;
    s << fmt.rowPrefix;
-    if(width) s.width(width);
+    if(width) {
+      s.fill(fmt.fill);
+      s.width(width);
+    }
    s << m.coeff(i, 0);
    for(Index j = 1; j < m.cols(); ++j)
    {
      s << fmt.coeffSeparator;
-      if (width) s.width(width);
+      if(width) {
+        s.fill(fmt.fill);
+        s.width(width);
+      }
      s << m.coeff(i, j);
    }
    s << fmt.rowSuffix;
@@ -224,6 +206,10 @@ std::ostream & print_matrix(std::ostream & s, const Derived& _m, const IOFormat&
  }
  s << fmt.matSuffix;
  if(explicit_precision) s.precision(old_precision);
+  if(width) {
+    s.fill(old_fill_character);
+    s.width(old_width);
+  }
  return s;
 }

--- a/Eigen/src/Core/IndexedView.h
+++ b/Eigen/src/Core/IndexedView.h
@@ -0,0 +1,207 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2017 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_INDEXED_VIEW_H
+#define EIGEN_INDEXED_VIEW_H
+
+namespace Eigen {
+
+namespace internal {
+
+template<typename XprType, typename RowIndices, typename ColIndices>
+struct traits<IndexedView<XprType, RowIndices, ColIndices> >
+ : traits<XprType>
+{
+  enum {
+    RowsAtCompileTime = int(array_size<RowIndices>::value),
+    ColsAtCompileTime = int(array_size<ColIndices>::value),
+    MaxRowsAtCompileTime = RowsAtCompileTime != Dynamic ? int(RowsAtCompileTime) : Dynamic,
+    MaxColsAtCompileTime = ColsAtCompileTime != Dynamic ? int(ColsAtCompileTime) : Dynamic,
+
+    XprTypeIsRowMajor = (int(traits<XprType>::Flags)&RowMajorBit) != 0,
+    IsRowMajor = (MaxRowsAtCompileTime==1&&MaxColsAtCompileTime!=1) ? 1
+               : (MaxColsAtCompileTime==1&&MaxRowsAtCompileTime!=1) ? 0
+               : XprTypeIsRowMajor,
+
+    RowIncr = int(get_compile_time_incr<RowIndices>::value),
+    ColIncr = int(get_compile_time_incr<ColIndices>::value),
+    InnerIncr = IsRowMajor ? ColIncr : RowIncr,
+    OuterIncr = IsRowMajor ? RowIncr : ColIncr,
+
+    HasSameStorageOrderAsXprType = (IsRowMajor == XprTypeIsRowMajor),
+    XprInnerStride = HasSameStorageOrderAsXprType ? int(inner_stride_at_compile_time<XprType>::ret) : int(outer_stride_at_compile_time<XprType>::ret),
+    XprOuterstride = HasSameStorageOrderAsXprType ? int(outer_stride_at_compile_time<XprType>::ret) : int(inner_stride_at_compile_time<XprType>::ret),
+
+    InnerSize = XprTypeIsRowMajor ? ColsAtCompileTime : RowsAtCompileTime,
+    IsBlockAlike = InnerIncr==1 && OuterIncr==1,
+    IsInnerPannel = HasSameStorageOrderAsXprType && is_same<AllRange<InnerSize>,typename conditional<XprTypeIsRowMajor,ColIndices,RowIndices>::type>::value,
+
+    InnerStrideAtCompileTime = InnerIncr<0 || InnerIncr==DynamicIndex || XprInnerStride==Dynamic ? Dynamic : XprInnerStride * InnerIncr,
+    OuterStrideAtCompileTime = OuterIncr<0 || OuterIncr==DynamicIndex || XprOuterstride==Dynamic ? Dynamic : XprOuterstride * OuterIncr,
+
+    ReturnAsScalar = is_same<RowIndices,SingleRange>::value && is_same<ColIndices,SingleRange>::value,
+    ReturnAsBlock = (!ReturnAsScalar) && IsBlockAlike,
+    ReturnAsIndexedView = (!ReturnAsScalar) && (!ReturnAsBlock),
+
+    // FIXME we deal with compile-time strides if and only if we have DirectAccessBit flag,
+    // but this is too strict regarding negative strides...
+    DirectAccessMask = (int(InnerIncr)!=UndefinedIncr && int(OuterIncr)!=UndefinedIncr && InnerIncr>=0 && OuterIncr>=0) ? DirectAccessBit : 0,
+    FlagsRowMajorBit = IsRowMajor ? RowMajorBit : 0,
+    FlagsLvalueBit = is_lvalue<XprType>::value ? LvalueBit : 0,
+    Flags = (traits<XprType>::Flags & (HereditaryBits | DirectAccessMask)) | FlagsLvalueBit | FlagsRowMajorBit
+  };
+
+  typedef Block<XprType,RowsAtCompileTime,ColsAtCompileTime,IsInnerPannel> BlockType;
+};
+
+}
+
+template<typename XprType, typename RowIndices, typename ColIndices, typename StorageKind>
+class IndexedViewImpl;
+
+
+/** \class IndexedView
+  * \ingroup Core_Module
+  *
+  * \brief Expression of a non-sequential sub-matrix defined by arbitrary sequences of row and column indices
+  *
+  * \tparam XprType the type of the expression in which we are taking the intersections of sub-rows and sub-columns
+  * \tparam RowIndices the type of the object defining the sequence of row indices
+  * \tparam ColIndices the type of the object defining the sequence of column indices
+  *
+  * This class represents an expression of a sub-matrix (or sub-vector) defined as the intersection
+  * of sub-sets of rows and columns, that are themselves defined by generic sequences of row indices \f$ \{r_0,r_1,..r_{m-1}\} \f$
+  * and column indices \f$ \{c_0,c_1,..c_{n-1} \}\f$. Let \f$ A \f$  be the nested matrix, then the resulting matrix \f$ B \f$ has \c m
+  * rows and \c n columns, and its entries are given by: \f$ B(i,j) = A(r_i,c_j) \f$.
+  *
+  * The \c RowIndices and \c ColIndices types must be compatible with the following API:
+  * \code
+  * <integral type> operator[](Index) const;
+  * Index size() const;
+  * \endcode
+  *
+  * Typical supported types thus include:
+  *  - std::vector<int>
+  *  - std::valarray<int>
+  *  - std::array<int,N>
+  *  - Plain C arrays: int[N]
+  *  - Eigen::ArrayXi
+  *  - decltype(ArrayXi::LinSpaced(...))
+  *  - Any view/expressions of the previous types
+  *  - Eigen::ArithmeticSequence
+  *  - Eigen::internal::AllRange      (helper for Eigen::all)
+  *  - Eigen::internal::SingleRange  (helper for single index)
+  *  - etc.
+  *
+  * In typical usages of %Eigen, this class should never be used directly. It is the return type of
+  * DenseBase::operator()(const RowIndices&, const ColIndices&).
+  *
+  * \sa class Block
+  */
+template<typename XprType, typename RowIndices, typename ColIndices>
+class IndexedView : public IndexedViewImpl<XprType, RowIndices, ColIndices, typename internal::traits<XprType>::StorageKind>
+{
+public:
+  typedef typename IndexedViewImpl<XprType, RowIndices, ColIndices, typename internal::traits<XprType>::StorageKind>::Base Base;
+  EIGEN_GENERIC_PUBLIC_INTERFACE(IndexedView)
+  EIGEN_INHERIT_ASSIGNMENT_OPERATORS(IndexedView)
+
+  typedef typename internal::ref_selector<XprType>::non_const_type MatrixTypeNested;
+  typedef typename internal::remove_all<XprType>::type NestedExpression;
+
+  template<typename T0, typename T1>
+  IndexedView(XprType& xpr, const T0& rowIndices, const T1& colIndices)
+    : m_xpr(xpr), m_rowIndices(rowIndices), m_colIndices(colIndices)
+  {}
+
+  /** \returns number of rows */
+  Index rows() const { return internal::size(m_rowIndices); }
+
+  /** \returns number of columns */
+  Index cols() const { return internal::size(m_colIndices); }
+
+  /** \returns the nested expression */
+  const typename internal::remove_all<XprType>::type&
+  nestedExpression() const { return m_xpr; }
+
+  /** \returns the nested expression */
+  typename internal::remove_reference<XprType>::type&
+  nestedExpression() { return m_xpr; }
+
+  /** \returns a const reference to the object storing/generating the row indices */
+  const RowIndices& rowIndices() const { return m_rowIndices; }
+
+  /** \returns a const reference to the object storing/generating the column indices */
+  const ColIndices& colIndices() const { return m_colIndices; }
+
+protected:
+  MatrixTypeNested m_xpr;
+  RowIndices m_rowIndices;
+  ColIndices m_colIndices;
+};
+
+
+// Generic API dispatcher
+template<typename XprType, typename RowIndices, typename ColIndices, typename StorageKind>
+class IndexedViewImpl
+  : public internal::generic_xpr_base<IndexedView<XprType, RowIndices, ColIndices> >::type
+{
+public:
+  typedef typename internal::generic_xpr_base<IndexedView<XprType, RowIndices, ColIndices> >::type Base;
+};
+
+namespace internal {
+
+
+template<typename ArgType, typename RowIndices, typename ColIndices>
+struct unary_evaluator<IndexedView<ArgType, RowIndices, ColIndices>, IndexBased>
+  : evaluator_base<IndexedView<ArgType, RowIndices, ColIndices> >
+{
+  typedef IndexedView<ArgType, RowIndices, ColIndices> XprType;
+
+  enum {
+    CoeffReadCost = evaluator<ArgType>::CoeffReadCost /* TODO + cost of row/col index */,
+
+    Flags = (evaluator<ArgType>::Flags & (HereditaryBits /*| LinearAccessBit | DirectAccessBit*/)),
+
+    Alignment = 0
+  };
+
+  EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& xpr) : m_argImpl(xpr.nestedExpression()), m_xpr(xpr)
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(Index row, Index col) const
+  {
+    return m_argImpl.coeff(m_xpr.rowIndices()[row], m_xpr.colIndices()[col]);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Scalar& coeffRef(Index row, Index col)
+  {
+    return m_argImpl.coeffRef(m_xpr.rowIndices()[row], m_xpr.colIndices()[col]);
+  }
+
+protected:
+
+  evaluator<ArgType> m_argImpl;
+  const XprType& m_xpr;
+
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_INDEXED_VIEW_H
--- a/Eigen/src/Core/Inverse.h
+++ b/Eigen/src/Core/Inverse.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2014 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2014-2019 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -12,8 +12,6 @@

 namespace Eigen { 

-// TODO move the general declaration in Core, and rename this file DenseInverseImpl.h, or something like this...
-
 template<typename XprType,typename StorageKind> class InverseImpl;

 namespace internal {
@@ -46,16 +44,18 @@ class Inverse : public InverseImpl<XprType,typename internal::traits<XprType>::S
 {
 public:
  typedef typename XprType::StorageIndex StorageIndex;
-  typedef typename XprType::PlainObject                       PlainObject;
+  typedef typename XprType::Scalar                            Scalar;
  typedef typename internal::ref_selector<XprType>::type      XprTypeNested;
  typedef typename internal::remove_all<XprTypeNested>::type  XprTypeNestedCleaned;
+  typedef typename internal::ref_selector<Inverse>::type Nested;
+  typedef typename internal::remove_all<XprType>::type NestedExpression;
  
-  explicit Inverse(const XprType &xpr)
+  explicit EIGEN_DEVICE_FUNC Inverse(const XprType &xpr)
    : m_xpr(xpr)
  {}
-  
-  EIGEN_DEVICE_FUNC Index rows() const { return m_xpr.rows(); }
-  EIGEN_DEVICE_FUNC Index cols() const { return m_xpr.cols(); }
+
+  EIGEN_DEVICE_FUNC Index rows() const { return m_xpr.cols(); }
+  EIGEN_DEVICE_FUNC Index cols() const { return m_xpr.rows(); }

  EIGEN_DEVICE_FUNC const XprTypeNestedCleaned& nestedExpression() const { return m_xpr; }

@@ -63,25 +63,16 @@ protected:
  XprTypeNested m_xpr;
 };

-/** \internal
-  * Specialization of the Inverse expression for dense expressions.
-  * Direct access to the coefficients are discared.
-  * FIXME this intermediate class is probably not needed anymore.
-  */
-template<typename XprType>
-class InverseImpl<XprType,Dense>
-  : public MatrixBase<Inverse<XprType> >
+// Generic API dispatcher
+template<typename XprType, typename StorageKind>
+class InverseImpl
+  : public internal::generic_xpr_base<Inverse<XprType> >::type
 {
-  typedef Inverse<XprType> Derived;
-  
 public:
-  
-  typedef MatrixBase<Derived> Base;
-  EIGEN_DENSE_PUBLIC_INTERFACE(Derived)
-  typedef typename internal::remove_all<XprType>::type NestedExpression;
-
+  typedef typename internal::generic_xpr_base<Inverse<XprType> >::type Base;
+  typedef typename XprType::Scalar Scalar;
 private:
-  
+
  Scalar coeff(Index row, Index col) const;
  Scalar coeff(Index i) const;
 };
--- a/Eigen/src/Core/Map.h
+++ b/Eigen/src/Core/Map.h
@@ -13,6 +13,34 @@

 namespace Eigen { 

+namespace internal {
+template<typename PlainObjectType, int MapOptions, typename StrideType>
+struct traits<Map<PlainObjectType, MapOptions, StrideType> >
+  : public traits<PlainObjectType>
+{
+  typedef traits<PlainObjectType> TraitsBase;
+  enum {
+    PlainObjectTypeInnerSize = ((traits<PlainObjectType>::Flags&RowMajorBit)==RowMajorBit)
+                             ? PlainObjectType::ColsAtCompileTime
+                             : PlainObjectType::RowsAtCompileTime,
+
+    InnerStrideAtCompileTime = StrideType::InnerStrideAtCompileTime == 0
+                             ? int(PlainObjectType::InnerStrideAtCompileTime)
+                             : int(StrideType::InnerStrideAtCompileTime),
+    OuterStrideAtCompileTime = StrideType::OuterStrideAtCompileTime == 0
+                             ? (InnerStrideAtCompileTime==Dynamic || PlainObjectTypeInnerSize==Dynamic
+                                ? Dynamic
+                                : int(InnerStrideAtCompileTime) * int(PlainObjectTypeInnerSize))
+                             : int(StrideType::OuterStrideAtCompileTime),
+    Alignment = int(MapOptions)&int(AlignedMask),
+    Flags0 = TraitsBase::Flags & (~NestByRefBit),
+    Flags = is_lvalue<PlainObjectType>::value ? int(Flags0) : (int(Flags0) & ~LvalueBit)
+  };
+private:
+  enum { Options }; // Expressions don't have Options
+};
+}
+
 /** \class Map
  * \ingroup Core_Module
  *
@@ -63,29 +91,6 @@ namespace Eigen {
  *
  * \sa PlainObjectBase::Map(), \ref TopicStorageOrders
  */
-
-namespace internal {
-template<typename PlainObjectType, int MapOptions, typename StrideType>
-struct traits<Map<PlainObjectType, MapOptions, StrideType> >
-  : public traits<PlainObjectType>
-{
-  typedef traits<PlainObjectType> TraitsBase;
-  enum {
-    InnerStrideAtCompileTime = StrideType::InnerStrideAtCompileTime == 0
-                             ? int(PlainObjectType::InnerStrideAtCompileTime)
-                             : int(StrideType::InnerStrideAtCompileTime),
-    OuterStrideAtCompileTime = StrideType::OuterStrideAtCompileTime == 0
-                             ? int(PlainObjectType::OuterStrideAtCompileTime)
-                             : int(StrideType::OuterStrideAtCompileTime),
-    Alignment = int(MapOptions)&int(AlignedMask),
-    Flags0 = TraitsBase::Flags & (~NestByRefBit),
-    Flags = is_lvalue<PlainObjectType>::value ? int(Flags0) : (int(Flags0) & ~LvalueBit)
-  };
-private:
-  enum { Options }; // Expressions don't have Options
-};
-}
-
 template<typename PlainObjectType, int MapOptions, typename StrideType> class Map
  : public MapBase<Map<PlainObjectType, MapOptions, StrideType> >
 {
@@ -109,9 +114,10 @@ template<typename PlainObjectType, int MapOptions, typename StrideType> class Ma
    inline Index outerStride() const
    {
      return StrideType::OuterStrideAtCompileTime != 0 ? m_stride.outer()
-           : IsVectorAtCompileTime ? this->size()
-           : int(Flags)&RowMajorBit ? this->cols()
-           : this->rows();
+           : internal::traits<Map>::OuterStrideAtCompileTime != Dynamic ? Index(internal::traits<Map>::OuterStrideAtCompileTime)
+           : IsVectorAtCompileTime ? (this->size() * innerStride())
+           : int(Flags)&RowMajorBit ? (this->cols() * innerStride())
+           : (this->rows() * innerStride());
    }

    /** Constructor in the fixed-size case.
--- a/Eigen/src/Core/MapBase.h
+++ b/Eigen/src/Core/MapBase.h
@@ -17,10 +17,20 @@

 namespace Eigen { 

-/** \class MapBase
-  * \ingroup Core_Module
+/** \ingroup Core_Module
  *
-  * \brief Base class for Map and Block expression with direct access
+  * \brief Base class for dense Map and Block expression with direct access
+  *
+  * This base class provides the const low-level accessors (e.g. coeff, coeffRef) of dense
+  * Map and Block objects with direct access.
+  * Typical users do not have to directly deal with this class.
+  *
+  * This class can be extended through the macro plugin \c EIGEN_MAPBASE_PLUGIN.
+  * See \link TopicCustomizing_Plugins customizing Eigen \endlink for details.
+  *
+  * The \c Derived class has to provide the following two methods describing the memory layout:
+  *  \code Index innerStride() const; \endcode
+  *  \code Index outerStride() const; \endcode
  *
  * \sa class Map, class Block
  */
@@ -33,6 +43,7 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
    enum {
      RowsAtCompileTime = internal::traits<Derived>::RowsAtCompileTime,
      ColsAtCompileTime = internal::traits<Derived>::ColsAtCompileTime,
+      InnerStrideAtCompileTime = internal::traits<Derived>::InnerStrideAtCompileTime,
      SizeAtCompileTime = Base::SizeAtCompileTime
    };

@@ -75,7 +86,9 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>

    typedef typename Base::CoeffReturnType CoeffReturnType;

+    /** \copydoc DenseBase::rows() */
    EIGEN_DEVICE_FUNC inline Index rows() const { return m_rows.value(); }
+    /** \copydoc DenseBase::cols() */
    EIGEN_DEVICE_FUNC inline Index cols() const { return m_cols.value(); }

    /** Returns a pointer to the first coefficient of the matrix or vector.
@@ -86,12 +99,14 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
      */
    EIGEN_DEVICE_FUNC inline const Scalar* data() const { return m_data; }

+    /** \copydoc PlainObjectBase::coeff(Index,Index) const */
    EIGEN_DEVICE_FUNC
    inline const Scalar& coeff(Index rowId, Index colId) const
    {
      return m_data[colId * colStride() + rowId * rowStride()];
    }

+    /** \copydoc PlainObjectBase::coeff(Index) const */
    EIGEN_DEVICE_FUNC
    inline const Scalar& coeff(Index index) const
    {
@@ -99,12 +114,14 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
      return m_data[index * innerStride()];
    }

+    /** \copydoc PlainObjectBase::coeffRef(Index,Index) const */
    EIGEN_DEVICE_FUNC
    inline const Scalar& coeffRef(Index rowId, Index colId) const
    {
      return this->m_data[colId * colStride() + rowId * rowStride()];
    }

+    /** \copydoc PlainObjectBase::coeffRef(Index) const */
    EIGEN_DEVICE_FUNC
    inline const Scalar& coeffRef(Index index) const
    {
@@ -112,6 +129,7 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
      return this->m_data[index * innerStride()];
    }

+    /** \internal */
    template<int LoadMode>
    inline PacketScalar packet(Index rowId, Index colId) const
    {
@@ -119,6 +137,7 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
               (m_data + (colId * colStride() + rowId * rowStride()));
    }

+    /** \internal */
    template<int LoadMode>
    inline PacketScalar packet(Index index) const
    {
@@ -126,13 +145,15 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
      return internal::ploadt<PacketScalar, LoadMode>(m_data + index * innerStride());
    }

+    /** \internal Constructor for fixed size matrices or vectors */
    EIGEN_DEVICE_FUNC
    explicit inline MapBase(PointerType dataPtr) : m_data(dataPtr), m_rows(RowsAtCompileTime), m_cols(ColsAtCompileTime)
    {
      EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived)
-      checkSanity();
+      checkSanity<Derived>();
    }

+    /** \internal Constructor for dynamically sized vectors */
    EIGEN_DEVICE_FUNC
    inline MapBase(PointerType dataPtr, Index vecSize)
            : m_data(dataPtr),
@@ -142,9 +163,10 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
      EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
      eigen_assert(vecSize >= 0);
      eigen_assert(dataPtr == 0 || SizeAtCompileTime == Dynamic || SizeAtCompileTime == vecSize);
-      checkSanity();
+      checkSanity<Derived>();
    }

+    /** \internal Constructor for dynamically sized matrices */
    EIGEN_DEVICE_FUNC
    inline MapBase(PointerType dataPtr, Index rows, Index cols)
            : m_data(dataPtr), m_rows(rows), m_cols(cols)
@@ -152,24 +174,48 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
      eigen_assert( (dataPtr == 0)
              || (   rows >= 0 && (RowsAtCompileTime == Dynamic || RowsAtCompileTime == rows)
                  && cols >= 0 && (ColsAtCompileTime == Dynamic || ColsAtCompileTime == cols)));
-      checkSanity();
+      checkSanity<Derived>();
    }

+    #ifdef EIGEN_MAPBASE_PLUGIN
+    #include EIGEN_MAPBASE_PLUGIN
+    #endif
+
  protected:

+    template<typename T>
    EIGEN_DEVICE_FUNC
-    void checkSanity() const
+    void checkSanity(typename internal::enable_if<(internal::traits<T>::Alignment>0),void*>::type = 0) const
    {
 #if EIGEN_MAX_ALIGN_BYTES>0
-      eigen_assert(((size_t(m_data) % EIGEN_PLAIN_ENUM_MAX(1,internal::traits<Derived>::Alignment)) == 0) && "data is not aligned");
+      // innerStride() is not set yet when this function is called, so we optimistically assume the lowest plausible value:
+      const Index minInnerStride = InnerStrideAtCompileTime == Dynamic ? 1 : Index(InnerStrideAtCompileTime);
+      EIGEN_ONLY_USED_FOR_DEBUG(minInnerStride);
+      eigen_assert((   ((internal::UIntPtr(m_data) % internal::traits<Derived>::Alignment) == 0)
+                    || (cols() * rows() * minInnerStride * sizeof(Scalar)) < internal::traits<Derived>::Alignment ) && "data is not aligned");
 #endif
    }

+    template<typename T>
+    EIGEN_DEVICE_FUNC
+    void checkSanity(typename internal::enable_if<internal::traits<T>::Alignment==0,void*>::type = 0) const
+    {}
+
    PointerType m_data;
    const internal::variable_if_dynamic<Index, RowsAtCompileTime> m_rows;
    const internal::variable_if_dynamic<Index, ColsAtCompileTime> m_cols;
 };

+/** \ingroup Core_Module
+  *
+  * \brief Base class for non-const dense Map and Block expression with direct access
+  *
+  * This base class provides the non-const low-level accessors (e.g. coeff and coeffRef) of
+  * dense Map and Block objects with direct access.
+  * It inherits MapBase<Derived, ReadOnlyAccessors> which defines the const variant for reading specific entries.
+  *
+  * \sa class Map, class Block
+  */
 template<typename Derived> class MapBase<Derived, WriteAccessors>
  : public MapBase<Derived, ReadOnlyAccessors>
 {
--- a/Eigen/src/Core/MathFunctions.h
+++ b/Eigen/src/Core/MathFunctions.h
--- a/Eigen/src/Core/MathFunctionsImpl.h
+++ b/Eigen/src/Core/MathFunctionsImpl.h
@@ -0,0 +1,97 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Pedro Gonnet (pedro.gonnet@gmail.com)
+// Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MATHFUNCTIONSIMPL_H
+#define EIGEN_MATHFUNCTIONSIMPL_H
+
+namespace Eigen {
+
+namespace internal {
+
+/** \internal \returns the hyperbolic tangent of \a a_x (coeff-wise).
+    Nothing fancy: a 13/6-degree rational interpolant which
+    is accurate up to a couple of ulp in the range [-9, 9], outside of which
+    tanh(x) = +/-1 in single precision.
+
+    This implementation works on both scalars and packets.
+*/
+template<typename T>
+T generic_fast_tanh_float(const T& a_x)
+{
+  // Clamp the inputs to the range [-9, 9] since anything outside
+  // this range is +/-1.0f in single-precision.
+  const T plus_9 = pset1<T>(9.f);
+  const T minus_9 = pset1<T>(-9.f);
+  const T x = pmax(pmin(a_x, plus_9), minus_9);
+  // The monomial coefficients of the numerator polynomial (odd).
+  const T alpha_1 = pset1<T>(4.89352455891786e-03f);
+  const T alpha_3 = pset1<T>(6.37261928875436e-04f);
+  const T alpha_5 = pset1<T>(1.48572235717979e-05f);
+  const T alpha_7 = pset1<T>(5.12229709037114e-08f);
+  const T alpha_9 = pset1<T>(-8.60467152213735e-11f);
+  const T alpha_11 = pset1<T>(2.00018790482477e-13f);
+  const T alpha_13 = pset1<T>(-2.76076847742355e-16f);
+
+  // The monomial coefficients of the denominator polynomial (even).
+  const T beta_0 = pset1<T>(4.89352518554385e-03f);
+  const T beta_2 = pset1<T>(2.26843463243900e-03f);
+  const T beta_4 = pset1<T>(1.18534705686654e-04f);
+  const T beta_6 = pset1<T>(1.19825839466702e-06f);
+
+  // Since the polynomials are odd/even, we need x^2.
+  const T x2 = pmul(x, x);
+
+  // Evaluate the numerator polynomial p (Horner scheme in x^2, then times x).
+  T p = pmadd(x2, alpha_13, alpha_11);
+  p = pmadd(x2, p, alpha_9);
+  p = pmadd(x2, p, alpha_7);
+  p = pmadd(x2, p, alpha_5);
+  p = pmadd(x2, p, alpha_3);
+  p = pmadd(x2, p, alpha_1);
+  p = pmul(x, p);
+
+  // Evaluate the denominator polynomial q (Horner scheme in x^2).
+  T q = pmadd(x2, beta_6, beta_4);
+  q = pmadd(x2, q, beta_2);
+  q = pmadd(x2, q, beta_0);
+
+  // Divide the numerator by the denominator.
+  return pdiv(p, q);
+}
+
+template<typename RealScalar>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+RealScalar positive_real_hypot(const RealScalar& x, const RealScalar& y)
+{
+  EIGEN_USING_STD_MATH(sqrt);
+  RealScalar p, qp;
+  p = numext::maxi(x,y);
+  if(p==RealScalar(0)) return RealScalar(0);
+  qp = numext::mini(y,x) / p;
+  return p * sqrt(RealScalar(1) + qp*qp);
+}
+
+template<typename Scalar>
+struct hypot_impl
+{
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  static EIGEN_DEVICE_FUNC
+  inline RealScalar run(const Scalar& x, const Scalar& y)
+  {
+    EIGEN_USING_STD_MATH(abs);
+    return positive_real_hypot<RealScalar>(abs(x), abs(y));
+  }
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_MATHFUNCTIONSIMPL_H
--- a/Eigen/src/Core/Matrix.h
+++ b/Eigen/src/Core/Matrix.h
@@ -13,6 +13,45 @@

 namespace Eigen {

+namespace internal {
+template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
+struct traits<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >
+{
+private:
+  enum { size = internal::size_at_compile_time<_Rows,_Cols>::ret };
+  typedef typename find_best_packet<_Scalar,size>::type PacketScalar;
+  enum {
+      row_major_bit = _Options&RowMajor ? RowMajorBit : 0,
+      is_dynamic_size_storage = _MaxRows==Dynamic || _MaxCols==Dynamic,
+      max_size = is_dynamic_size_storage ? Dynamic : _MaxRows*_MaxCols,
+      default_alignment = compute_default_alignment<_Scalar,max_size>::value,
+      actual_alignment = ((_Options&DontAlign)==0) ? default_alignment : 0,
+      required_alignment = unpacket_traits<PacketScalar>::alignment,
+      packet_access_bit = (packet_traits<_Scalar>::Vectorizable && (EIGEN_UNALIGNED_VECTORIZE || (actual_alignment>=required_alignment))) ? PacketAccessBit : 0
+    };
+    
+public:
+  typedef _Scalar Scalar;
+  typedef Dense StorageKind;
+  typedef Eigen::Index StorageIndex;
+  typedef MatrixXpr XprKind;
+  enum {
+    RowsAtCompileTime = _Rows,
+    ColsAtCompileTime = _Cols,
+    MaxRowsAtCompileTime = _MaxRows,
+    MaxColsAtCompileTime = _MaxCols,
+    Flags = compute_matrix_flags<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols>::ret,
+    Options = _Options,
+    InnerStrideAtCompileTime = 1,
+    OuterStrideAtCompileTime = (Options&RowMajor) ? ColsAtCompileTime : RowsAtCompileTime,
+    
+    // FIXME, the following flag is only used to define NeedsToAlign in PlainObjectBase
+    EvaluatorFlags = LinearAccessBit | DirectAccessBit | packet_access_bit | row_major_bit,
+    Alignment = actual_alignment
+  };
+};
+}
+
 /** \class Matrix
  * \ingroup Core_Module
  *
@@ -67,7 +106,7 @@ namespace Eigen {
  * \endcode
  *
  * This class can be extended with the help of the plugin mechanism described on the page
-  * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_MATRIX_PLUGIN.
+  * \ref TopicCustomizing_Plugins by defining the preprocessor symbol \c EIGEN_MATRIX_PLUGIN.
  *
  * <i><b>Some notes:</b></i>
  *
@@ -98,7 +137,7 @@ namespace Eigen {
  * </dl>
  *
  * <i><b>ABI and storage layout</b></i>
-  * 
+  *
  * The table below summarizes the ABI of some possible Matrix instances which is fixed thorough the lifetime of Eigen 3.
  * <table  class="manual">
  * <tr><th>Matrix type</th><th>Equivalent C structure</th></tr>
@@ -130,50 +169,11 @@ namespace Eigen {
  * </table>
  * Note that in this table Rows, Cols, MaxRows and MaxCols are all positive integers. A(S) is defined to the largest possible power-of-two
  * smaller to EIGEN_MAX_STATIC_ALIGN_BYTES.
-  * 
-  * \see MatrixBase for the majority of the API methods for matrices, \ref TopicClassHierarchy, 
-  * \ref TopicStorageOrders 
+  *
+  * \see MatrixBase for the majority of the API methods for matrices, \ref TopicClassHierarchy,
+  * \ref TopicStorageOrders
  */

-namespace internal {
-template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
-struct traits<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >
-{
-private:
-  enum { size = internal::size_at_compile_time<_Rows,_Cols>::ret };
-  typedef typename find_best_packet<_Scalar,size>::type PacketScalar;
-  enum {
-      row_major_bit = _Options&RowMajor ? RowMajorBit : 0,
-      is_dynamic_size_storage = _MaxRows==Dynamic || _MaxCols==Dynamic,
-      max_size = is_dynamic_size_storage ? Dynamic : _MaxRows*_MaxCols,
-      default_alignment = compute_default_alignment<_Scalar,max_size>::value,
-      actual_alignment = ((_Options&DontAlign)==0) ? default_alignment : 0,
-      required_alignment = unpacket_traits<PacketScalar>::alignment,
-      packet_access_bit = packet_traits<_Scalar>::Vectorizable && (actual_alignment>=required_alignment) ? PacketAccessBit : 0
-    };
-    
-public:
-  typedef _Scalar Scalar;
-  typedef Dense StorageKind;
-  typedef Eigen::Index StorageIndex;
-  typedef MatrixXpr XprKind;
-  enum {
-    RowsAtCompileTime = _Rows,
-    ColsAtCompileTime = _Cols,
-    MaxRowsAtCompileTime = _MaxRows,
-    MaxColsAtCompileTime = _MaxCols,
-    Flags = compute_matrix_flags<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols>::ret,
-    Options = _Options,
-    InnerStrideAtCompileTime = 1,
-    OuterStrideAtCompileTime = (Options&RowMajor) ? ColsAtCompileTime : RowsAtCompileTime,
-    
-    // FIXME, the following flag in only used to define NeedsToAlign in PlainObjectBase
-    EvaluatorFlags = LinearAccessBit | DirectAccessBit | packet_access_bit | row_major_bit,
-    Alignment = actual_alignment
-  };
-};
-}
-
 template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
 class Matrix
  : public PlainObjectBase<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >
@@ -255,55 +255,93 @@ class Matrix
      *
      * \sa resize(Index,Index)
      */
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Matrix() : Base()
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Matrix() : Base()
    {
      Base::_check_template_params();
      EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
    }

    // FIXME is it still needed
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    explicit Matrix(internal::constructor_without_unaligned_array_assert)
      : Base(internal::constructor_without_unaligned_array_assert())
    { Base::_check_template_params(); EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED }

-#ifdef EIGEN_HAVE_RVALUE_REFERENCES
-    EIGEN_DEVICE_FUNC
-    Matrix(Matrix&& other)
+#if EIGEN_HAS_RVALUE_REFERENCES
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Matrix(Matrix&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_constructible<Scalar>::value)
      : Base(std::move(other))
    {
      Base::_check_template_params();
-      if (RowsAtCompileTime!=Dynamic && ColsAtCompileTime!=Dynamic)
-        Base::_set_noalias(other);
    }
-    EIGEN_DEVICE_FUNC
-    Matrix& operator=(Matrix&& other)
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Matrix& operator=(Matrix&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_assignable<Scalar>::value)
    {
      other.swap(*this);
      return *this;
    }
 #endif

-    #ifndef EIGEN_PARSED_BY_DOXYGEN
+#if EIGEN_HAS_CXX11
+    /** \copydoc PlainObjectBase(const Scalar&, const Scalar&, const Scalar&,  const Scalar&, const ArgTypes&... args)
+     *
+     * Example: \include Matrix_variadic_ctor_cxx11.cpp
+     * Output: \verbinclude Matrix_variadic_ctor_cxx11.out
+     *
+     * \sa Matrix(const std::initializer_list<std::initializer_list<Scalar>>&)
+     */
+    template <typename... ArgTypes>
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Matrix(const Scalar& a0, const Scalar& a1, const Scalar& a2,  const Scalar& a3, const ArgTypes&... args)
+      : Base(a0, a1, a2, a3, args...) {}
+
+    /** \brief Constructs a Matrix and initializes it from the coefficients given as initializer-lists grouped by row. \cpp11
+      * 
+      * In the general case, the constructor takes a list of rows, each row being represented as a list of coefficients:
+      * 
+      * Example: \include Matrix_initializer_list_23_cxx11.cpp
+      * Output: \verbinclude Matrix_initializer_list_23_cxx11.out
+      * 
+      * Each of the inner initializer lists must contain the exact same number of elements, otherwise an assertion is triggered.
+      * 
+      * In the case of a compile-time column vector, implicit transposition from a single row is allowed.
+      * Therefore <code>VectorXd{{1,2,3,4,5}}</code> is legal and the more verbose syntax
+      * <code>RowVectorXd{{1},{2},{3},{4},{5}}</code> can be avoided:
+      * 
+      * Example: \include Matrix_initializer_list_vector_cxx11.cpp
+      * Output: \verbinclude Matrix_initializer_list_vector_cxx11.out
+      * 
+      * In the case of fixed-sized matrices, the initializer list sizes must exactly match the matrix sizes,
+      * and implicit transposition is allowed for compile-time vectors only.
+      * 
+      * \sa Matrix(const Scalar& a0, const Scalar& a1, const Scalar& a2,  const Scalar& a3, const ArgTypes&... args)
+      */
+    EIGEN_DEVICE_FUNC
+    explicit EIGEN_STRONG_INLINE Matrix(const std::initializer_list<std::initializer_list<Scalar>>& list) : Base(list) {}
+#endif // end EIGEN_HAS_CXX11
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN

    // This constructor is for both 1x1 matrices and dynamic vectors
    template<typename T>
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE explicit Matrix(const T& x)
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    explicit Matrix(const T& x)
    {
      Base::_check_template_params();
      Base::template _init1<T>(x);
    }

    template<typename T0, typename T1>
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Matrix(const T0& x, const T1& y)
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Matrix(const T0& x, const T1& y)
    {
      Base::_check_template_params();
      Base::template _init2<T0,T1>(x, y);
    }
-    #else
+
+
+#else
    /** \brief Constructs a fixed-sized matrix initialized with coefficients starting at \a data */
    EIGEN_DEVICE_FUNC
    explicit Matrix(const Scalar *data);
@@ -321,7 +359,8 @@ class Matrix
      * \c EIGEN_INITIALIZE_MATRICES_BY_{ZERO,\c NAN} macros (see \ref TopicPreprocessorDirectives).
      */
    EIGEN_STRONG_INLINE explicit Matrix(Index dim);
-    /** \brief Constructs an initialized 1x1 matrix with the given coefficient */
+    /** \brief Constructs an initialized 1x1 matrix with the given coefficient
+      * \sa Matrix(const Scalar&, const Scalar&, const Scalar&,  const Scalar&, const ArgTypes&...) */
    Matrix(const Scalar& x);
    /** \brief Constructs an uninitialized matrix with \a rows rows and \a cols columns.
      *
@@ -338,11 +377,14 @@ class Matrix
    EIGEN_DEVICE_FUNC
    Matrix(Index rows, Index cols);
    
-    /** \brief Constructs an initialized 2D vector with given coefficients */
+    /** \brief Constructs an initialized 2D vector with given coefficients
+      * \sa Matrix(const Scalar&, const Scalar&, const Scalar&,  const Scalar&, const ArgTypes&...) */
    Matrix(const Scalar& x, const Scalar& y);
-    #endif
+    #endif  // end EIGEN_PARSED_BY_DOXYGEN

-    /** \brief Constructs an initialized 3D vector with given coefficients */
+    /** \brief Constructs an initialized 3D vector with given coefficients
+      * \sa Matrix(const Scalar&, const Scalar&, const Scalar&,  const Scalar&, const ArgTypes&...)
+      */
    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Matrix(const Scalar& x, const Scalar& y, const Scalar& z)
    {
@@ -352,7 +394,9 @@ class Matrix
      m_storage.data()[1] = y;
      m_storage.data()[2] = z;
    }
-    /** \brief Constructs an initialized 4D vector with given coefficients */
+    /** \brief Constructs an initialized 4D vector with given coefficients
+      * \sa Matrix(const Scalar&, const Scalar&, const Scalar&,  const Scalar&, const ArgTypes&...)
+      */
    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Matrix(const Scalar& x, const Scalar& y, const Scalar& z, const Scalar& w)
    {
@@ -407,7 +451,7 @@ class Matrix
  *
  * \ingroup Core_Module
  *
-  * Eigen defines several typedef shortcuts for most common matrix and vector types.
+  * %Eigen defines several typedef shortcuts for most common matrix and vector types.
  *
  * The general patterns are the following:
  *
@@ -419,6 +463,15 @@ class Matrix
  *
  * There are also \c VectorSizeType and \c RowVectorSizeType which are self-explanatory. For example, \c Vector4cf is
  * a fixed-size vector of 4 complex floats.
+  * 
+  * With \cpp11, template aliases are also defined for common sizes.
+  * They follow the same pattern as above except that the scalar type suffix is replaced by a
+  * template parameter, i.e.:
+  *   - `MatrixSize<Type>` where `Size` can be \c 2,\c 3,\c 4 for fixed size square matrices or \c X for dynamic size.
+  *   - `MatrixXSize<Type>` and `MatrixSizeX<Type>` where `Size` can be \c 2,\c 3,\c 4 for hybrid dynamic/fixed matrices.
+  *   - `VectorSize<Type>` and `RowVectorSize<Type>` for column and row vectors.
+  * 
+  * With \cpp11, you can also use fully generic column and row vector types: `Vector<Type,Size>` and `RowVector<Type,Size>`.
  *
  * \sa class Matrix
  */
@@ -456,6 +509,55 @@ EIGEN_MAKE_TYPEDEFS_ALL_SIZES(std::complex<double>, cd)
 #undef EIGEN_MAKE_TYPEDEFS
 #undef EIGEN_MAKE_FIXED_TYPEDEFS

+#if EIGEN_HAS_CXX11
+
+#define EIGEN_MAKE_TYPEDEFS(Size, SizeSuffix)                     \
+/** \ingroup matrixtypedefs */                                    \
+/** \brief \cpp11 */                                              \
+template <typename Type>                                          \
+using Matrix##SizeSuffix = Matrix<Type, Size, Size>;              \
+/** \ingroup matrixtypedefs */                                    \
+/** \brief \cpp11 */                                              \
+template <typename Type>                                          \
+using Vector##SizeSuffix = Matrix<Type, Size, 1>;                 \
+/** \ingroup matrixtypedefs */                                    \
+/** \brief \cpp11 */                                              \
+template <typename Type>                                          \
+using RowVector##SizeSuffix = Matrix<Type, 1, Size>;
+
+#define EIGEN_MAKE_FIXED_TYPEDEFS(Size)                           \
+/** \ingroup matrixtypedefs */                                    \
+/** \brief \cpp11 */                                              \
+template <typename Type>                                          \
+using Matrix##Size##X = Matrix<Type, Size, Dynamic>;              \
+/** \ingroup matrixtypedefs */                                    \
+/** \brief \cpp11 */                                              \
+template <typename Type>                                          \
+using Matrix##X##Size = Matrix<Type, Dynamic, Size>;
+
+EIGEN_MAKE_TYPEDEFS(2, 2)
+EIGEN_MAKE_TYPEDEFS(3, 3)
+EIGEN_MAKE_TYPEDEFS(4, 4)
+EIGEN_MAKE_TYPEDEFS(Dynamic, X)
+EIGEN_MAKE_FIXED_TYPEDEFS(2)
+EIGEN_MAKE_FIXED_TYPEDEFS(3)
+EIGEN_MAKE_FIXED_TYPEDEFS(4)
+
+/** \ingroup matrixtypedefs
+  * \brief \cpp11 */
+template <typename Type, int Size>
+using Vector = Matrix<Type, Size, 1>;
+
+/** \ingroup matrixtypedefs
+  * \brief \cpp11 */
+template <typename Type, int Size>
+using RowVector = Matrix<Type, 1, Size>;
+
+#undef EIGEN_MAKE_TYPEDEFS
+#undef EIGEN_MAKE_FIXED_TYPEDEFS
+
+#endif // EIGEN_HAS_CXX11
+
 } // end namespace Eigen

 #endif // EIGEN_MATRIX_H
--- a/Eigen/src/Core/MatrixBase.h
+++ b/Eigen/src/Core/MatrixBase.h
@@ -41,9 +41,9 @@ namespace Eigen {
  * \endcode
  *
  * This class can be extended with the help of the plugin mechanism described on the page
-  * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_MATRIXBASE_PLUGIN.
+  * \ref TopicCustomizing_Plugins by defining the preprocessor symbol \c EIGEN_MATRIXBASE_PLUGIN.
  *
-  * \sa \ref TopicClassHierarchy
+  * \sa \blank \ref TopicClassHierarchy
  */
 template<typename Derived> class MatrixBase
  : public DenseBase<Derived>
@@ -66,7 +66,7 @@ template<typename Derived> class MatrixBase
    using Base::MaxSizeAtCompileTime;
    using Base::IsVectorAtCompileTime;
    using Base::Flags;
-    
+
    using Base::derived;
    using Base::const_cast_derived;
    using Base::rows;
@@ -76,12 +76,11 @@ template<typename Derived> class MatrixBase
    using Base::coeffRef;
    using Base::lazyAssign;
    using Base::eval;
+    using Base::operator-;
    using Base::operator+=;
    using Base::operator-=;
    using Base::operator*=;
    using Base::operator/=;
-    using Base::operator*;
-    using Base::operator/;

    typedef typename Base::CoeffReturnType CoeffReturnType;
    typedef typename Base::ConstTransposeReturnType ConstTransposeReturnType;
@@ -100,7 +99,7 @@ template<typename Derived> class MatrixBase
    /** \returns the size of the main diagonal, which is min(rows(),cols()).
      * \sa rows(), cols(), SizeAtCompileTime. */
    EIGEN_DEVICE_FUNC
-    inline Index diagonalSize() const { return (std::min)(rows(),cols()); }
+    inline Index diagonalSize() const { return (numext::mini)(rows(),cols()); }

    typedef typename Base::PlainObject PlainObject;

@@ -123,7 +122,7 @@ template<typename Derived> class MatrixBase
 #endif // not EIGEN_PARSED_BY_DOXYGEN

 #define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::MatrixBase
-#   include "../plugins/CommonCwiseUnaryOps.h"
+#define EIGEN_DOC_UNARY_ADDONS(X,Y)
 #   include "../plugins/CommonCwiseBinaryOps.h"
 #   include "../plugins/MatrixCwiseUnaryOps.h"
 #   include "../plugins/MatrixCwiseBinaryOps.h"
@@ -131,18 +130,19 @@ template<typename Derived> class MatrixBase
 #     include EIGEN_MATRIXBASE_PLUGIN
 #   endif
 #undef EIGEN_CURRENT_STORAGE_BASE_CLASS
+#undef EIGEN_DOC_UNARY_ADDONS

    /** Special case of the template operator=, in order to prevent the compiler
      * from generating a default operator= (issue hit with g++ 4.1)
      */
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    Derived& operator=(const MatrixBase& other);

    // We cannot inherit here via Base::operator= since it is causing
    // trouble with MSVC.

    template <typename OtherDerived>
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    Derived& operator=(const DenseBase<OtherDerived>& other);

    template <typename OtherDerived>
@@ -154,28 +154,19 @@ template<typename Derived> class MatrixBase
    Derived& operator=(const ReturnByValue<OtherDerived>& other);

    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    Derived& operator+=(const MatrixBase<OtherDerived>& other);
    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    Derived& operator-=(const MatrixBase<OtherDerived>& other);

-#ifdef __CUDACC__
    template<typename OtherDerived>
    EIGEN_DEVICE_FUNC
-    const Product<Derived,OtherDerived,LazyProduct>
-    operator*(const MatrixBase<OtherDerived> &other) const
-    { return this->lazyProduct(other); }
-#else
-
-    template<typename OtherDerived>
    const Product<Derived,OtherDerived>
    operator*(const MatrixBase<OtherDerived> &other) const;

-#endif
-
    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC 
+    EIGEN_DEVICE_FUNC
    const Product<Derived,OtherDerived,LazyProduct>
    lazyProduct(const MatrixBase<OtherDerived> &other) const;

@@ -195,7 +186,7 @@ template<typename Derived> class MatrixBase

    template<typename OtherDerived>
    EIGEN_DEVICE_FUNC
-    typename internal::scalar_product_traits<typename internal::traits<Derived>::Scalar,typename internal::traits<OtherDerived>::Scalar>::ReturnType
+    typename ScalarBinaryOpTraits<typename internal::traits<Derived>::Scalar,typename internal::traits<OtherDerived>::Scalar>::ReturnType
    dot(const MatrixBase<OtherDerived>& other) const;

    EIGEN_DEVICE_FUNC RealScalar squaredNorm() const;
@@ -204,7 +195,9 @@ template<typename Derived> class MatrixBase
    RealScalar blueNorm() const;
    RealScalar hypotNorm() const;
    EIGEN_DEVICE_FUNC const PlainObject normalized() const;
+    EIGEN_DEVICE_FUNC const PlainObject stableNormalized() const;
    EIGEN_DEVICE_FUNC void normalize();
+    EIGEN_DEVICE_FUNC void stableNormalize();

    EIGEN_DEVICE_FUNC const AdjointReturnType adjoint() const;
    EIGEN_DEVICE_FUNC void adjointInPlace();
@@ -212,7 +205,7 @@ template<typename Derived> class MatrixBase
    typedef Diagonal<Derived> DiagonalReturnType;
    EIGEN_DEVICE_FUNC
    DiagonalReturnType diagonal();
-    
+
    typedef typename internal::add_const<Diagonal<const Derived> >::type ConstDiagonalReturnType;
    EIGEN_DEVICE_FUNC
    ConstDiagonalReturnType diagonal() const;
@@ -220,14 +213,14 @@ template<typename Derived> class MatrixBase
    template<int Index> struct DiagonalIndexReturnType { typedef Diagonal<Derived,Index> Type; };
    template<int Index> struct ConstDiagonalIndexReturnType { typedef const Diagonal<const Derived,Index> Type; };

-    template<int Index> 
+    template<int Index>
    EIGEN_DEVICE_FUNC
    typename DiagonalIndexReturnType<Index>::Type diagonal();

    template<int Index>
    EIGEN_DEVICE_FUNC
    typename ConstDiagonalIndexReturnType<Index>::Type diagonal() const;
-    
+
    typedef Diagonal<Derived,DynamicIndex> DiagonalDynamicIndexReturnType;
    typedef typename internal::add_const<Diagonal<const Derived,DynamicIndex> >::type ConstDiagonalDynamicIndexReturnType;

@@ -249,7 +242,7 @@ template<typename Derived> class MatrixBase
    template<unsigned int UpLo> struct SelfAdjointViewReturnType { typedef SelfAdjointView<Derived, UpLo> Type; };
    template<unsigned int UpLo> struct ConstSelfAdjointViewReturnType { typedef const SelfAdjointView<const Derived, UpLo> Type; };

-    template<unsigned int UpLo> 
+    template<unsigned int UpLo>
    EIGEN_DEVICE_FUNC
    typename SelfAdjointViewReturnType<UpLo>::Type selfadjointView();
    template<unsigned int UpLo>
@@ -275,6 +268,8 @@ template<typename Derived> class MatrixBase
    Derived& setIdentity();
    EIGEN_DEVICE_FUNC
    Derived& setIdentity(Index rows, Index cols);
+    EIGEN_DEVICE_FUNC Derived& setUnit(Index i);
+    EIGEN_DEVICE_FUNC Derived& setUnit(Index newSize, Index i);

    bool isIdentity(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
    bool isDiagonal(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
@@ -292,7 +287,7 @@ template<typename Derived> class MatrixBase
      *          fuzzy comparison such as isApprox()
      * \sa isApprox(), operator!= */
    template<typename OtherDerived>
-    inline bool operator==(const MatrixBase<OtherDerived>& other) const
+    EIGEN_DEVICE_FUNC inline bool operator==(const MatrixBase<OtherDerived>& other) const
    { return cwiseEqual(other).all(); }

    /** \returns true if at least one pair of coefficients of \c *this and \a other are not exactly equal to each other.
@@ -300,10 +295,10 @@ template<typename Derived> class MatrixBase
      *          fuzzy comparison such as isApprox()
      * \sa isApprox(), operator== */
    template<typename OtherDerived>
-    inline bool operator!=(const MatrixBase<OtherDerived>& other) const
+    EIGEN_DEVICE_FUNC inline bool operator!=(const MatrixBase<OtherDerived>& other) const
    { return cwiseNotEqual(other).any(); }

-    NoAlias<Derived,Eigen::MatrixBase > noalias();
+    NoAlias<Derived,Eigen::MatrixBase > EIGEN_DEVICE_FUNC noalias();

    // TODO forceAlignedAccess is temporarily disabled
    // Need to find a nicer workaround.
@@ -328,100 +323,114 @@ template<typename Derived> class MatrixBase

 /////////// LU module ///////////

-    EIGEN_DEVICE_FUNC const FullPivLU<PlainObject> fullPivLu() const;
-    EIGEN_DEVICE_FUNC const PartialPivLU<PlainObject> partialPivLu() const;
+    inline const FullPivLU<PlainObject> fullPivLu() const;
+    inline const PartialPivLU<PlainObject> partialPivLu() const;

-    const PartialPivLU<PlainObject> lu() const;
+    inline const PartialPivLU<PlainObject> lu() const;

    EIGEN_DEVICE_FUNC
-    const Inverse<Derived> inverse() const;
-    
+    inline const Inverse<Derived> inverse() const;
+
    template<typename ResultType>
-    void computeInverseAndDetWithCheck(
+    inline void computeInverseAndDetWithCheck(
      ResultType& inverse,
      typename ResultType::Scalar& determinant,
      bool& invertible,
      const RealScalar& absDeterminantThreshold = NumTraits<Scalar>::dummy_precision()
    ) const;
+
    template<typename ResultType>
-    void computeInverseWithCheck(
+    inline void computeInverseWithCheck(
      ResultType& inverse,
      bool& invertible,
      const RealScalar& absDeterminantThreshold = NumTraits<Scalar>::dummy_precision()
    ) const;
+
+    EIGEN_DEVICE_FUNC
    Scalar determinant() const;

 /////////// Cholesky module ///////////

-    const LLT<PlainObject>  llt() const;
-    const LDLT<PlainObject> ldlt() const;
+    inline const LLT<PlainObject>  llt() const;
+    inline const LDLT<PlainObject> ldlt() const;

 /////////// QR module ///////////

-    const HouseholderQR<PlainObject> householderQr() const;
-    const ColPivHouseholderQR<PlainObject> colPivHouseholderQr() const;
-    const FullPivHouseholderQR<PlainObject> fullPivHouseholderQr() const;
+    inline const HouseholderQR<PlainObject> householderQr() const;
+    inline const ColPivHouseholderQR<PlainObject> colPivHouseholderQr() const;
+    inline const FullPivHouseholderQR<PlainObject> fullPivHouseholderQr() const;
+    inline const CompleteOrthogonalDecomposition<PlainObject> completeOrthogonalDecomposition() const;

-    EigenvaluesReturnType eigenvalues() const;
-    RealScalar operatorNorm() const;
+/////////// Eigenvalues module ///////////
+
+    inline EigenvaluesReturnType eigenvalues() const;
+    inline RealScalar operatorNorm() const;

 /////////// SVD module ///////////

-    JacobiSVD<PlainObject> jacobiSvd(unsigned int computationOptions = 0) const;
-    BDCSVD<PlainObject>    bdcSvd(unsigned int computationOptions = 0) const;
+    inline JacobiSVD<PlainObject> jacobiSvd(unsigned int computationOptions = 0) const;
+    inline BDCSVD<PlainObject>    bdcSvd(unsigned int computationOptions = 0) const;

 /////////// Geometry module ///////////

    #ifndef EIGEN_PARSED_BY_DOXYGEN
    /// \internal helper struct to form the return type of the cross product
    template<typename OtherDerived> struct cross_product_return_type {
-      typedef typename internal::scalar_product_traits<typename internal::traits<Derived>::Scalar,typename internal::traits<OtherDerived>::Scalar>::ReturnType Scalar;
+      typedef typename ScalarBinaryOpTraits<typename internal::traits<Derived>::Scalar,typename internal::traits<OtherDerived>::Scalar>::ReturnType Scalar;
      typedef Matrix<Scalar,MatrixBase::RowsAtCompileTime,MatrixBase::ColsAtCompileTime> type;
    };
    #endif // EIGEN_PARSED_BY_DOXYGEN
    template<typename OtherDerived>
    EIGEN_DEVICE_FUNC
-    typename cross_product_return_type<OtherDerived>::type
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+    inline typename cross_product_return_type<OtherDerived>::type
+#else
+    inline PlainObject
+#endif
    cross(const MatrixBase<OtherDerived>& other) const;
-    
+
    template<typename OtherDerived>
    EIGEN_DEVICE_FUNC
-    PlainObject cross3(const MatrixBase<OtherDerived>& other) const;
-    
+    inline PlainObject cross3(const MatrixBase<OtherDerived>& other) const;
+
    EIGEN_DEVICE_FUNC
-    PlainObject unitOrthogonal(void) const;
-    
-    Matrix<Scalar,3,1> eulerAngles(Index a0, Index a1, Index a2) const;
-    
-    ScalarMultipleReturnType operator*(const UniformScaling<Scalar>& s) const;
+    inline PlainObject unitOrthogonal(void) const;
+
+    EIGEN_DEVICE_FUNC
+    inline Matrix<Scalar,3,1> eulerAngles(Index a0, Index a1, Index a2) const;
+
    // put this as separate enum value to work around possible GCC 4.3 bug (?)
    enum { HomogeneousReturnTypeDirection = ColsAtCompileTime==1&&RowsAtCompileTime==1 ? ((internal::traits<Derived>::Flags&RowMajorBit)==RowMajorBit ? Horizontal : Vertical)
                                          : ColsAtCompileTime==1 ? Vertical : Horizontal };
    typedef Homogeneous<Derived, HomogeneousReturnTypeDirection> HomogeneousReturnType;
-    HomogeneousReturnType homogeneous() const;
-    
+    EIGEN_DEVICE_FUNC
+    inline HomogeneousReturnType homogeneous() const;
+
    enum {
      SizeMinusOne = SizeAtCompileTime==Dynamic ? Dynamic : SizeAtCompileTime-1
    };
    typedef Block<const Derived,
                  internal::traits<Derived>::ColsAtCompileTime==1 ? SizeMinusOne : 1,
                  internal::traits<Derived>::ColsAtCompileTime==1 ? 1 : SizeMinusOne> ConstStartMinusOne;
-    typedef CwiseUnaryOp<internal::scalar_quotient1_op<typename internal::traits<Derived>::Scalar>,
-                const ConstStartMinusOne > HNormalizedReturnType;
-
-    const HNormalizedReturnType hnormalized() const;
+    typedef EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(ConstStartMinusOne,Scalar,quotient) HNormalizedReturnType;
+    EIGEN_DEVICE_FUNC
+    inline const HNormalizedReturnType hnormalized() const;

 ////////// Householder module ///////////

+    EIGEN_DEVICE_FUNC
    void makeHouseholderInPlace(Scalar& tau, RealScalar& beta);
    template<typename EssentialPart>
+    EIGEN_DEVICE_FUNC
    void makeHouseholder(EssentialPart& essential,
                         Scalar& tau, RealScalar& beta) const;
    template<typename EssentialPart>
+    EIGEN_DEVICE_FUNC
    void applyHouseholderOnTheLeft(const EssentialPart& essential,
                                   const Scalar& tau,
                                   Scalar* workspace);
    template<typename EssentialPart>
+    EIGEN_DEVICE_FUNC
    void applyHouseholderOnTheRight(const EssentialPart& essential,
                                    const Scalar& tau,
                                    Scalar* workspace);
@@ -429,23 +438,47 @@ template<typename Derived> class MatrixBase
 ///////// Jacobi module /////////

    template<typename OtherScalar>
+    EIGEN_DEVICE_FUNC
    void applyOnTheLeft(Index p, Index q, const JacobiRotation<OtherScalar>& j);
    template<typename OtherScalar>
+    EIGEN_DEVICE_FUNC
    void applyOnTheRight(Index p, Index q, const JacobiRotation<OtherScalar>& j);

+///////// SparseCore module /////////
+
+    template<typename OtherDerived>
+    EIGEN_STRONG_INLINE const typename SparseMatrixBase<OtherDerived>::template CwiseProductDenseReturnType<Derived>::Type
+    cwiseProduct(const SparseMatrixBase<OtherDerived> &other) const
+    {
+      return other.cwiseProduct(derived());
+    }
+
 ///////// MatrixFunctions module /////////

    typedef typename internal::stem_function<Scalar>::type StemFunction;
-    const MatrixExponentialReturnValue<Derived> exp() const;
+#define EIGEN_MATRIX_FUNCTION(ReturnType, Name, Description) \
+    /** \returns an expression of the matrix Description of \c *this. \brief This function requires the <a href="unsupported/group__MatrixFunctions__Module.html"> unsupported MatrixFunctions module</a>. To compute the coefficient-wise Description use ArrayBase::##Name . */ \
+    const ReturnType<Derived> Name() const;
+#define EIGEN_MATRIX_FUNCTION_1(ReturnType, Name, Description, Argument) \
+    /** \returns an expression of the matrix Description of \c *this. \brief This function requires the <a href="unsupported/group__MatrixFunctions__Module.html"> unsupported MatrixFunctions module</a>. To compute the coefficient-wise Description use ArrayBase::##Name . */ \
+    const ReturnType<Derived> Name(Argument) const;
+
+    EIGEN_MATRIX_FUNCTION(MatrixExponentialReturnValue, exp, exponential)
+    /** \brief Helper function for the <a href="unsupported/group__MatrixFunctions__Module.html"> unsupported MatrixFunctions module</a>.*/
    const MatrixFunctionReturnValue<Derived> matrixFunction(StemFunction f) const;
-    const MatrixFunctionReturnValue<Derived> cosh() const;
-    const MatrixFunctionReturnValue<Derived> sinh() const;
-    const MatrixFunctionReturnValue<Derived> cos() const;
-    const MatrixFunctionReturnValue<Derived> sin() const;
-    const MatrixSquareRootReturnValue<Derived> sqrt() const;
-    const MatrixLogarithmReturnValue<Derived> log() const;
-    const MatrixPowerReturnValue<Derived> pow(const RealScalar& p) const;
-    const MatrixComplexPowerReturnValue<Derived> pow(const std::complex<RealScalar>& p) const;
+    EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, cosh, hyperbolic cosine)
+    EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, sinh, hyperbolic sine)
+#if EIGEN_HAS_CXX11_MATH
+    EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, atanh, inverse hyperbolic tangent)
+    EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, acosh, inverse hyperbolic cosine)
+    EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, asinh, inverse hyperbolic sine)
+#endif
+    EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, cos, cosine)
+    EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, sin, sine)
+    EIGEN_MATRIX_FUNCTION(MatrixSquareRootReturnValue, sqrt, square root)
+    EIGEN_MATRIX_FUNCTION(MatrixLogarithmReturnValue, log, logarithm)
+    EIGEN_MATRIX_FUNCTION_1(MatrixPowerReturnValue,        pow, power to \c p, const RealScalar& p)
+    EIGEN_MATRIX_FUNCTION_1(MatrixComplexPowerReturnValue, pow, power to \c p, const std::complex<RealScalar>& p)

  protected:
    EIGEN_DEVICE_FUNC MatrixBase() : Base() {}
--- a/Eigen/src/Core/NestByValue.h
+++ b/Eigen/src/Core/NestByValue.h
@@ -13,25 +13,28 @@

 namespace Eigen {

+namespace internal {
+template<typename ExpressionType>
+struct traits<NestByValue<ExpressionType> > : public traits<ExpressionType>
+{
+  enum {
+    Flags = traits<ExpressionType>::Flags & ~NestByRefBit
+  };
+};
+}
+
 /** \class NestByValue
  * \ingroup Core_Module
  *
  * \brief Expression which must be nested by value
  *
-  * \param ExpressionType the type of the object of which we are requiring nesting-by-value
+  * \tparam ExpressionType the type of the object of which we are requiring nesting-by-value
  *
  * This class is the return type of MatrixBase::nestByValue()
  * and most of the time this is the only way it is used.
  *
  * \sa MatrixBase::nestByValue()
  */
-
-namespace internal {
-template<typename ExpressionType>
-struct traits<NestByValue<ExpressionType> > : public traits<ExpressionType>
-{};
-}
-
 template<typename ExpressionType> class NestByValue
  : public internal::dense_xpr_base< NestByValue<ExpressionType> >::type
 {
@@ -44,55 +47,11 @@ template<typename ExpressionType> class NestByValue

    EIGEN_DEVICE_FUNC inline Index rows() const { return m_expression.rows(); }
    EIGEN_DEVICE_FUNC inline Index cols() const { return m_expression.cols(); }
-    EIGEN_DEVICE_FUNC inline Index outerStride() const { return m_expression.outerStride(); }
-    EIGEN_DEVICE_FUNC inline Index innerStride() const { return m_expression.innerStride(); }
-
-    EIGEN_DEVICE_FUNC inline const CoeffReturnType coeff(Index row, Index col) const
-    {
-      return m_expression.coeff(row, col);
-    }
-
-    EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index row, Index col)
-    {
-      return m_expression.const_cast_derived().coeffRef(row, col);
-    }
-
-    EIGEN_DEVICE_FUNC inline const CoeffReturnType coeff(Index index) const
-    {
-      return m_expression.coeff(index);
-    }
-
-    EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index index)
-    {
-      return m_expression.const_cast_derived().coeffRef(index);
-    }
-
-    template<int LoadMode>
-    inline const PacketScalar packet(Index row, Index col) const
-    {
-      return m_expression.template packet<LoadMode>(row, col);
-    }
-
-    template<int LoadMode>
-    inline void writePacket(Index row, Index col, const PacketScalar& x)
-    {
-      m_expression.const_cast_derived().template writePacket<LoadMode>(row, col, x);
-    }
-
-    template<int LoadMode>
-    inline const PacketScalar packet(Index index) const
-    {
-      return m_expression.template packet<LoadMode>(index);
-    }
-
-    template<int LoadMode>
-    inline void writePacket(Index index, const PacketScalar& x)
-    {
-      m_expression.const_cast_derived().template writePacket<LoadMode>(index, x);
-    }

    EIGEN_DEVICE_FUNC operator const ExpressionType&() const { return m_expression; }

+    EIGEN_DEVICE_FUNC const ExpressionType& nestedExpression() const { return m_expression; }
+
  protected:
    const ExpressionType m_expression;
 };
@@ -100,12 +59,27 @@ template<typename ExpressionType> class NestByValue
 /** \returns an expression of the temporary version of *this.
  */
 template<typename Derived>
-inline const NestByValue<Derived>
+EIGEN_DEVICE_FUNC inline const NestByValue<Derived>
 DenseBase<Derived>::nestByValue() const
 {
  return NestByValue<Derived>(derived());
 }

+namespace internal {
+
+// Evaluator of NestByValue -> forwards to the evaluator of the nested expression
+template<typename ArgType>
+struct evaluator<NestByValue<ArgType> >
+  : public evaluator<ArgType>
+{
+  typedef evaluator<ArgType> Base;
+
+  EIGEN_DEVICE_FUNC explicit evaluator(const NestByValue<ArgType>& xpr)
+    : Base(xpr.nestedExpression())
+  {}
+};
+}
+
 } // end namespace Eigen

 #endif // EIGEN_NESTBYVALUE_H
--- a/Eigen/src/Core/NoAlias.h
+++ b/Eigen/src/Core/NoAlias.h
@@ -17,7 +17,7 @@ namespace Eigen {
  *
  * \brief Pseudo expression providing an operator = assuming no aliasing
  *
-  * \param ExpressionType the type of the object on which to do the lazy assignment
+  * \tparam ExpressionType the type of the object on which to do the lazy assignment
  *
  * This class represents an expression with special assignment operators
  * assuming no aliasing between the target expression and the source expression.
@@ -33,13 +33,14 @@ class NoAlias
  public:
    typedef typename ExpressionType::Scalar Scalar;
    
+    EIGEN_DEVICE_FUNC
    explicit NoAlias(ExpressionType& expression) : m_expression(expression) {}
    
    template<typename OtherDerived>
    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE ExpressionType& operator=(const StorageBase<OtherDerived>& other)
    {
-      call_assignment_no_alias(m_expression, other.derived(), internal::assign_op<Scalar>());
+      call_assignment_no_alias(m_expression, other.derived(), internal::assign_op<Scalar,typename OtherDerived::Scalar>());
      return m_expression;
    }
    
@@ -47,7 +48,7 @@ class NoAlias
    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE ExpressionType& operator+=(const StorageBase<OtherDerived>& other)
    {
-      call_assignment_no_alias(m_expression, other.derived(), internal::add_assign_op<Scalar>());
+      call_assignment_no_alias(m_expression, other.derived(), internal::add_assign_op<Scalar,typename OtherDerived::Scalar>());
      return m_expression;
    }
    
@@ -55,7 +56,7 @@ class NoAlias
    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE ExpressionType& operator-=(const StorageBase<OtherDerived>& other)
    {
-      call_assignment_no_alias(m_expression, other.derived(), internal::sub_assign_op<Scalar>());
+      call_assignment_no_alias(m_expression, other.derived(), internal::sub_assign_op<Scalar,typename OtherDerived::Scalar>());
      return m_expression;
    }

@@ -74,10 +75,10 @@ class NoAlias
  *
  * More precisely, noalias() allows to bypass the EvalBeforeAssignBit flag.
  * Currently, even though several expressions may alias, only product
-  * expressions have this flag. Therefore, noalias() is only usefull when
+  * expressions have this flag. Therefore, noalias() is only useful when
  * the source expression contains a matrix product.
  *
-  * Here are some examples where noalias is usefull:
+  * Here are some examples where noalias is useful:
  * \code
  * D.noalias()  = A * B;
  * D.noalias() += A.transpose() * B;
@@ -98,7 +99,7 @@ class NoAlias
  * \sa class NoAlias
  */
 template<typename Derived>
-NoAlias<Derived,MatrixBase> MatrixBase<Derived>::noalias()
+NoAlias<Derived,MatrixBase> EIGEN_DEVICE_FUNC MatrixBase<Derived>::noalias()
 {
  return NoAlias<Derived, Eigen::MatrixBase >(derived());
 }
--- a/Eigen/src/Core/NumTraits.h
+++ b/Eigen/src/Core/NumTraits.h
@@ -12,24 +12,91 @@

 namespace Eigen {

+namespace internal {
+
+// default implementation of digits10(), based on numeric_limits if specialized,
+// 0 for integer types, and ceil(-log10(epsilon())) otherwise.
+template< typename T,
+          bool use_numeric_limits = std::numeric_limits<T>::is_specialized,
+          bool is_integer = NumTraits<T>::IsInteger>
+struct default_digits10_impl
+{
+  EIGEN_DEVICE_FUNC
+  static int run() { return std::numeric_limits<T>::digits10; }
+};
+
+template<typename T>
+struct default_digits10_impl<T,false,false> // Floating point
+{
+  EIGEN_DEVICE_FUNC
+  static int run() {
+    using std::log10;
+    using std::ceil;
+    typedef typename NumTraits<T>::Real Real;
+    return int(ceil(-log10(NumTraits<Real>::epsilon())));
+  }
+};
+
+template<typename T>
+struct default_digits10_impl<T,false,true> // Integer
+{
+  EIGEN_DEVICE_FUNC
+  static int run() { return 0; }
+};
+
+
+// default implementation of digits(), based on numeric_limits if specialized,
+// 0 for integer types, and ceil(-log2(epsilon())) otherwise.
+template< typename T,
+          bool use_numeric_limits = std::numeric_limits<T>::is_specialized,
+          bool is_integer = NumTraits<T>::IsInteger>
+struct default_digits_impl
+{
+  EIGEN_DEVICE_FUNC
+  static int run() { return std::numeric_limits<T>::digits; }
+};
+
+template<typename T>
+struct default_digits_impl<T,false,false> // Floating point
+{
+  EIGEN_DEVICE_FUNC
+  static int run() {
+    using std::log;
+    using std::ceil;
+    typedef typename NumTraits<T>::Real Real;
+    return int(ceil(-log(NumTraits<Real>::epsilon())/log(static_cast<Real>(2))));
+  }
+};
+
+template<typename T>
+struct default_digits_impl<T,false,true> // Integer
+{
+  EIGEN_DEVICE_FUNC
+  static int run() { return 0; }
+};
+
+} // end namespace internal
+
 /** \class NumTraits
  * \ingroup Core_Module
  *
  * \brief Holds information about the various numeric (i.e. scalar) types allowed by Eigen.
  *
-  * \param T the numeric type at hand
+  * \tparam T the numeric type at hand
  *
  * This class stores enums, typedefs and static methods giving information about a numeric type.
  *
  * The provided data consists of:
-  * \li A typedef \a Real, giving the "real part" type of \a T. If \a T is already real,
-  *     then \a Real is just a typedef to \a T. If \a T is \c std::complex<U> then \a Real
+  * \li A typedef \c Real, giving the "real part" type of \a T. If \a T is already real,
+  *     then \c Real is just a typedef to \a T. If \a T is \c std::complex<U> then \c Real
  *     is a typedef to \a U.
-  * \li A typedef \a NonInteger, giving the type that should be used for operations producing non-integral values,
+  * \li A typedef \c NonInteger, giving the type that should be used for operations producing non-integral values,
  *     such as quotients, square roots, etc. If \a T is a floating-point type, then this typedef just gives
  *     \a T again. Note however that many Eigen functions such as internal::sqrt simply refuse to
  *     take integers. Outside of a few cases, Eigen doesn't do automatic type promotion. Thus, this typedef is
  *     only intended as a helper for code that needs to explicitly promote types.
+  * \li A typedef \c Literal giving the type to use for numeric literals such as "2" or "0.5". For instance, for \c std::complex<U>, Literal is defined as \c U.
+  *     Of course, this type must be fully compatible with \a T. When in doubt, just use \a T here.
  * \li A typedef \a Nested giving the type to use to nest a value inside of the expression tree. If you don't know what
  *     this means, just use \a T here.
  * \li An enum value \a IsComplex. It is equal to 1 if \a T is a \c std::complex
@@ -38,14 +105,18 @@ namespace Eigen {
  *     and to \c 0 otherwise.
  * \li Enum values ReadCost, AddCost and MulCost representing a rough estimate of the number of CPU cycles needed
  *     to by move / add / mul instructions respectively, assuming the data is already stored in CPU registers.
-  *     Stay vague here. No need to do architecture-specific stuff.
+  *     Stay vague here. No need to do architecture-specific stuff. If you don't know what this means, just use \c Eigen::HugeCost.
  * \li An enum value \a IsSigned. It is equal to \c 1 if \a T is a signed type and to 0 if \a T is unsigned.
  * \li An enum value \a RequireInitialization. It is equal to \c 1 if the constructor of the numeric type \a T must
  *     be called, and to 0 if it is safe not to call it. Default is 0 if \a T is an arithmetic type, and 1 otherwise.
-  * \li An epsilon() function which, unlike std::numeric_limits::epsilon(), returns a \a Real instead of a \a T.
+  * \li An epsilon() function which, unlike <a href="http://en.cppreference.com/w/cpp/types/numeric_limits/epsilon">std::numeric_limits::epsilon()</a>,
+  *     returns a \a Real instead of a \a T.
  * \li A dummy_precision() function returning a weak epsilon value. It is mainly used as a default
  *     value by the fuzzy comparison operators.
  * \li highest() and lowest() functions returning the highest and lowest possible values respectively.
+  * \li digits10() function returning the number of decimal digits that can be represented without change. This is
+  *     the analogue of <a href="http://en.cppreference.com/w/cpp/types/numeric_limits/digits10">std::numeric_limits<T>::digits10</a>
+  *     which is used as the default implementation if specialized.
  */

 template<typename T> struct GenericNumTraits
@@ -67,16 +138,26 @@ template<typename T> struct GenericNumTraits
                     T
                   >::type NonInteger;
  typedef T Nested;
+  typedef T Literal;

  EIGEN_DEVICE_FUNC
  static inline Real epsilon()
  {
-    #if defined(__CUDA_ARCH__)
-    return internal::device::numeric_limits<T>::epsilon();
-    #else
-    return std::numeric_limits<T>::epsilon();
-    #endif
+    return numext::numeric_limits<T>::epsilon();
  }
+
+  EIGEN_DEVICE_FUNC
+  static inline int digits10()
+  {
+    return internal::default_digits10_impl<T>::run();
+  }
+
+  EIGEN_DEVICE_FUNC
+  static inline int digits()
+  {
+    return internal::default_digits_impl<T>::run();
+  }
+
  EIGEN_DEVICE_FUNC
  static inline Real dummy_precision()
  {
@@ -87,20 +168,23 @@ template<typename T> struct GenericNumTraits

  EIGEN_DEVICE_FUNC
  static inline T highest() {
-#if defined(__CUDA_ARCH__)
-    return (internal::device::numeric_limits<T>::max)();
-#else
-    return (std::numeric_limits<T>::max)();
-#endif
+    return (numext::numeric_limits<T>::max)();
  }

  EIGEN_DEVICE_FUNC
  static inline T lowest()  {
-#if defined(__CUDA_ARCH__)
-    return IsInteger ? (internal::device::numeric_limits<T>::min)() : (-(internal::device::numeric_limits<T>::max)());
-#else
-    return IsInteger ? (std::numeric_limits<T>::min)() : (-(std::numeric_limits<T>::max)());
-#endif
+    return IsInteger ? (numext::numeric_limits<T>::min)()
+                     : static_cast<T>(-(numext::numeric_limits<T>::max)());
+  }
+
+  EIGEN_DEVICE_FUNC
+  static inline T infinity() {
+    return numext::numeric_limits<T>::infinity();
+  }
+
+  EIGEN_DEVICE_FUNC
+  static inline T quiet_NaN() {
+    return numext::numeric_limits<T>::quiet_NaN();
  }
 };

@@ -130,6 +214,7 @@ template<typename _Real> struct NumTraits<std::complex<_Real> >
  : GenericNumTraits<std::complex<_Real> >
 {
  typedef _Real Real;
+  typedef typename NumTraits<_Real>::Literal Literal;
  enum {
    IsComplex = 1,
    RequireInitialization = NumTraits<_Real>::RequireInitialization,
@@ -138,8 +223,12 @@ template<typename _Real> struct NumTraits<std::complex<_Real> >
    MulCost = 4 * NumTraits<Real>::MulCost + 2 * NumTraits<Real>::AddCost
  };

+  EIGEN_DEVICE_FUNC
  static inline Real epsilon() { return NumTraits<Real>::epsilon(); }
+  EIGEN_DEVICE_FUNC
  static inline Real dummy_precision() { return NumTraits<Real>::dummy_precision(); }
+  EIGEN_DEVICE_FUNC
+  static inline int digits10() { return NumTraits<Real>::digits10(); }
 };

 template<typename Scalar, int Rows, int Cols, int Options, int MaxRows, int MaxCols>
@@ -151,21 +240,50 @@ struct NumTraits<Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols> >
  typedef typename NumTraits<Scalar>::NonInteger NonIntegerScalar;
  typedef Array<NonIntegerScalar, Rows, Cols, Options, MaxRows, MaxCols> NonInteger;
  typedef ArrayType & Nested;
-  
+  typedef typename NumTraits<Scalar>::Literal Literal;
+
  enum {
    IsComplex = NumTraits<Scalar>::IsComplex,
    IsInteger = NumTraits<Scalar>::IsInteger,
    IsSigned  = NumTraits<Scalar>::IsSigned,
    RequireInitialization = 1,
-    ReadCost = ArrayType::SizeAtCompileTime==Dynamic ? Dynamic : ArrayType::SizeAtCompileTime * NumTraits<Scalar>::ReadCost,
-    AddCost  = ArrayType::SizeAtCompileTime==Dynamic ? Dynamic : ArrayType::SizeAtCompileTime * NumTraits<Scalar>::AddCost,
-    MulCost  = ArrayType::SizeAtCompileTime==Dynamic ? Dynamic : ArrayType::SizeAtCompileTime * NumTraits<Scalar>::MulCost
+    ReadCost = ArrayType::SizeAtCompileTime==Dynamic ? HugeCost : ArrayType::SizeAtCompileTime * NumTraits<Scalar>::ReadCost,
+    AddCost  = ArrayType::SizeAtCompileTime==Dynamic ? HugeCost : ArrayType::SizeAtCompileTime * NumTraits<Scalar>::AddCost,
+    MulCost  = ArrayType::SizeAtCompileTime==Dynamic ? HugeCost : ArrayType::SizeAtCompileTime * NumTraits<Scalar>::MulCost
  };
-  
+
+  EIGEN_DEVICE_FUNC
  static inline RealScalar epsilon() { return NumTraits<RealScalar>::epsilon(); }
+  EIGEN_DEVICE_FUNC
  static inline RealScalar dummy_precision() { return NumTraits<RealScalar>::dummy_precision(); }
+
+  static inline int digits10() { return NumTraits<Scalar>::digits10(); }
 };

+template<> struct NumTraits<std::string>
+  : GenericNumTraits<std::string>
+{
+  enum {
+    RequireInitialization = 1,
+    ReadCost = HugeCost,
+    AddCost  = HugeCost,
+    MulCost  = HugeCost
+  };
+
+  static inline int digits10() { return 0; }
+
+private:
+  static inline std::string epsilon();
+  static inline std::string dummy_precision();
+  static inline std::string lowest();
+  static inline std::string highest();
+  static inline std::string infinity();
+  static inline std::string quiet_NaN();
+};
+
+// Empty specialization for void to allow template specialization based on NumTraits<T>::Real with T==void and SFINAE.
+template<> struct NumTraits<void> {};
+
 } // end namespace Eigen

 #endif // EIGEN_NUMTRAITS_H
--- a/Eigen/src/Core/PartialReduxEvaluator.h
+++ b/Eigen/src/Core/PartialReduxEvaluator.h
@@ -0,0 +1,232 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2011-2018 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PARTIALREDUX_H
+#define EIGEN_PARTIALREDUX_H
+
+namespace Eigen { 
+
+namespace internal {
+
+
+/***************************************************************************
+*
+* This file provides evaluators for partial reductions.
+* There are two modes:
+*
+*  - scalar path: simply calls the respective function on the column or row.
+*    -> nothing special here, all the tricky part is handled by the return
+*       types of VectorwiseOp's members. They embed the functor calling the
+*       respective DenseBase's member function.
+*
+*  - vectorized path: implements a packet-wise reduction followed by
+*    some (optional) processing of the outcome, e.g., division by n for mean.
+*
+* For the vectorized path let's observe that the packet-size and outer-unrolling
+* are both decided by the assignment logic. So all we have to do is to decide
+* on the inner unrolling.
+*
+* For the unrolling, we can reuse "internal::redux_vec_unroller" from Redux.h,
+* but we need to be careful to specify the correct increment.
+*
+***************************************************************************/
+
+
+/* logic deciding a strategy for unrolling of vectorized paths */
+template<typename Func, typename Evaluator>
+struct packetwise_redux_traits
+{
+  enum {
+    OuterSize = int(Evaluator::IsRowMajor) ? Evaluator::RowsAtCompileTime : Evaluator::ColsAtCompileTime,
+    Cost = OuterSize == Dynamic ? HugeCost
+         : OuterSize * Evaluator::CoeffReadCost + (OuterSize-1) * functor_traits<Func>::Cost,
+    Unrolling = Cost <= EIGEN_UNROLLING_LIMIT ? CompleteUnrolling : NoUnrolling
+  };
+
+};
+
+/* Value to be returned when size==0; by default let's return 0 */
+template<typename PacketType,typename Func>
+EIGEN_DEVICE_FUNC
+PacketType packetwise_redux_empty_value(const Func& ) { return pset1<PacketType>(0); }
+
+/* For products the default is 1 */
+template<typename PacketType,typename Scalar>
+EIGEN_DEVICE_FUNC
+PacketType packetwise_redux_empty_value(const scalar_product_op<Scalar,Scalar>& ) { return pset1<PacketType>(1); }
+
+/* Perform the actual reduction */
+template<typename Func, typename Evaluator,
+         int Unrolling = packetwise_redux_traits<Func, Evaluator>::Unrolling
+>
+struct packetwise_redux_impl;
+
+/* Perform the actual reduction with unrolling */
+template<typename Func, typename Evaluator>
+struct packetwise_redux_impl<Func, Evaluator, CompleteUnrolling>
+{
+  typedef redux_novec_unroller<Func,Evaluator, 0, Evaluator::SizeAtCompileTime> Base;
+  typedef typename Evaluator::Scalar Scalar;
+
+  template<typename PacketType>
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE
+  PacketType run(const Evaluator &eval, const Func& func, Index /*size*/)
+  {
+    return redux_vec_unroller<Func, Evaluator, 0, packetwise_redux_traits<Func, Evaluator>::OuterSize>::template run<PacketType>(eval,func);
+  }
+};
+
+/* Add a specialization of redux_vec_unroller for size==0 at compiletime.
+ * This specialization is not required for general reductions, which is
+ * why it is defined here.
+ */
+template<typename Func, typename Evaluator, int Start>
+struct redux_vec_unroller<Func, Evaluator, Start, 0>
+{
+  template<typename PacketType>
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE PacketType run(const Evaluator &, const Func& f)
+  {
+    return packetwise_redux_empty_value<PacketType>(f);
+  }
+};
+
+/* Perform the actual reduction for dynamic sizes */
+template<typename Func, typename Evaluator>
+struct packetwise_redux_impl<Func, Evaluator, NoUnrolling>
+{
+  typedef typename Evaluator::Scalar Scalar;
+  typedef typename redux_traits<Func, Evaluator>::PacketType PacketScalar;
+
+  template<typename PacketType>
+  EIGEN_DEVICE_FUNC
+  static PacketType run(const Evaluator &eval, const Func& func, Index size)
+  {
+    if(size==0)
+      return packetwise_redux_empty_value<PacketType>(func);
+    
+    const Index size4 = (size-1)&(~3);
+    PacketType p = eval.template packetByOuterInner<Unaligned,PacketType>(0,0);
+    Index i = 1;
+    // This loop is optimized for instruction pipelining:
+    // - each iteration generates two independent instructions
+    // - thanks to branch prediction and out-of-order execution we have independent instructions across loops
+    for(; i<size4; i+=4)
+      p = func.packetOp(p,
+            func.packetOp(
+              func.packetOp(eval.template packetByOuterInner<Unaligned,PacketType>(i+0,0),eval.template packetByOuterInner<Unaligned,PacketType>(i+1,0)),
+              func.packetOp(eval.template packetByOuterInner<Unaligned,PacketType>(i+2,0),eval.template packetByOuterInner<Unaligned,PacketType>(i+3,0))));
+    for(; i<size; ++i)
+      p = func.packetOp(p, eval.template packetByOuterInner<Unaligned,PacketType>(i,0));
+    return p;
+  }
+};
+
+template< typename ArgType, typename MemberOp, int Direction>
+struct evaluator<PartialReduxExpr<ArgType, MemberOp, Direction> >
+  : evaluator_base<PartialReduxExpr<ArgType, MemberOp, Direction> >
+{
+  typedef PartialReduxExpr<ArgType, MemberOp, Direction> XprType;
+  typedef typename internal::nested_eval<ArgType,1>::type ArgTypeNested;
+  typedef typename internal::add_const_on_value_type<ArgTypeNested>::type ConstArgTypeNested;
+  typedef typename internal::remove_all<ArgTypeNested>::type ArgTypeNestedCleaned;
+  typedef typename ArgType::Scalar InputScalar;
+  typedef typename XprType::Scalar Scalar;
+  enum {
+    TraversalSize = Direction==int(Vertical) ? int(ArgType::RowsAtCompileTime) :  int(ArgType::ColsAtCompileTime)
+  };
+  typedef typename MemberOp::template Cost<int(TraversalSize)> CostOpType;
+  enum {
+    CoeffReadCost = TraversalSize==Dynamic ? HugeCost
+                  : TraversalSize==0 ? 1
+                  : TraversalSize * evaluator<ArgType>::CoeffReadCost + int(CostOpType::value),
+    
+    _ArgFlags = evaluator<ArgType>::Flags,
+
+    _Vectorizable =  bool(int(_ArgFlags)&PacketAccessBit)
+                  && bool(MemberOp::Vectorizable)
+                  && (Direction==int(Vertical) ? bool(_ArgFlags&RowMajorBit) : (_ArgFlags&RowMajorBit)==0)
+                  && (TraversalSize!=0),
+                  
+    Flags = (traits<XprType>::Flags&RowMajorBit)
+          | (evaluator<ArgType>::Flags&(HereditaryBits&(~RowMajorBit)))
+          | (_Vectorizable ? PacketAccessBit : 0)
+          | LinearAccessBit,
+    
+    Alignment = 0 // FIXME this will need to be improved once PartialReduxExpr is vectorized
+  };
+
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType xpr)
+    : m_arg(xpr.nestedExpression()), m_functor(xpr.functor())
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(TraversalSize==Dynamic ? HugeCost : (TraversalSize==0 ? 1 : int(CostOpType::value)));
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  const Scalar coeff(Index i, Index j) const
+  {
+    return coeff(Direction==Vertical ? j : i);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  const Scalar coeff(Index index) const
+  {
+    return m_functor(m_arg.template subVector<DirectionType(Direction)>(index));
+  }
+
+  template<int LoadMode,typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  PacketType packet(Index i, Index j) const
+  {
+    return packet<LoadMode,PacketType>(Direction==Vertical ? j : i);
+  }
+  
+  template<int LoadMode,typename PacketType>
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
+  PacketType packet(Index idx) const
+  {
+    enum { PacketSize = internal::unpacket_traits<PacketType>::size };
+    typedef Block<const ArgTypeNestedCleaned,
+                  Direction==Vertical ? int(ArgType::RowsAtCompileTime) : int(PacketSize),
+                  Direction==Vertical ? int(PacketSize) : int(ArgType::ColsAtCompileTime),
+                  true /* InnerPanel */> PanelType;
+    
+    PanelType panel(m_arg,
+                    Direction==Vertical ? 0 : idx,
+                    Direction==Vertical ? idx : 0,
+                    Direction==Vertical ? m_arg.rows() : Index(PacketSize),
+                    Direction==Vertical ? Index(PacketSize) : m_arg.cols());
+
+    // FIXME
+    // See bug 1612: currently, if PacketSize==1 (i.e. complex<double> with 128-bit registers), the storage order of panel gets reversed
+    // and methods like packetByOuterInner do not make sense anymore in this context.
+    // So let's just bypass "vectorization" in this case:
+    if(PacketSize==1)
+      return internal::pset1<PacketType>(coeff(idx));
+    
+    typedef typename internal::redux_evaluator<PanelType> PanelEvaluator;
+    PanelEvaluator panel_eval(panel);
+    typedef typename MemberOp::BinaryOp BinaryOp;
+    PacketType p = internal::packetwise_redux_impl<BinaryOp,PanelEvaluator>::template run<PacketType>(panel_eval,m_functor.binaryFunc(),m_arg.outerSize());
+    return p;
+  }
+
+protected:
+  ConstArgTypeNested m_arg;
+  const MemberOp m_functor;
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_PARTIALREDUX_H
--- a/Eigen/src/Core/PermutationMatrix.h
+++ b/Eigen/src/Core/PermutationMatrix.h
@@ -2,7 +2,7 @@
 // for linear algebra.
 //
 // Copyright (C) 2009 Benoit Jacob <jacob.benoit.1@gmail.com>
-// Copyright (C) 2009-2011 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2009-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -13,15 +13,18 @@

 namespace Eigen { 

-// TODO: this does not seems to be needed at all:
-// template<int RowCol,typename IndicesType,typename MatrixType, typename StorageKind> class PermutedImpl;
+namespace internal {
+
+enum PermPermProduct_t {PermPermProduct};
+
+} // end namespace internal

 /** \class PermutationBase
  * \ingroup Core_Module
  *
  * \brief Base class for permutations
  *
-  * \param Derived the derived class
+  * \tparam Derived the derived class
  *
  * This class is the base class for all expressions representing a permutation matrix,
  * internally stored as a vector of integers.
@@ -39,13 +42,6 @@ namespace Eigen {
  *
  * \sa class PermutationMatrix, class PermutationWrapper
  */
-
-namespace internal {
-
-enum PermPermProduct_t {PermPermProduct};
-
-} // end namespace internal
-
 template<typename Derived>
 class PermutationBase : public EigenBase<Derived>
 {
@@ -67,8 +63,10 @@ class PermutationBase : public EigenBase<Derived>
            DenseMatrixType;
    typedef PermutationMatrix<IndicesType::SizeAtCompileTime,IndicesType::MaxSizeAtCompileTime,StorageIndex>
            PlainPermutationType;
+    typedef PlainPermutationType PlainObject;
    using Base::derived;
-    typedef Transpose<PermutationBase> TransposeReturnType;
+    typedef Inverse<Derived> InverseReturnType;
+    typedef void Scalar;
    #endif

    /** Copies the other permutation into *this */
@@ -89,25 +87,14 @@ class PermutationBase : public EigenBase<Derived>
      return derived();
    }

-    #ifndef EIGEN_PARSED_BY_DOXYGEN
-    /** This is a special case of the templated operator=. Its purpose is to
-      * prevent a default operator= from hiding the templated operator=.
-      */
-    Derived& operator=(const PermutationBase& other)
-    {
-      indices() = other.indices();
-      return derived();
-    }
-    #endif
-
    /** \returns the number of rows */
-    inline Index rows() const { return Index(indices().size()); }
+    inline EIGEN_DEVICE_FUNC Index rows() const { return Index(indices().size()); }

    /** \returns the number of columns */
-    inline Index cols() const { return Index(indices().size()); }
+    inline EIGEN_DEVICE_FUNC Index cols() const { return Index(indices().size()); }

    /** \returns the size of a side of the respective square matrix, i.e., the number of indices */
-    inline Index size() const { return Index(indices().size()); }
+    inline EIGEN_DEVICE_FUNC Index size() const { return Index(indices().size()); }

    #ifndef EIGEN_PARSED_BY_DOXYGEN
    template<typename DenseDerived>
@@ -193,16 +180,16 @@ class PermutationBase : public EigenBase<Derived>

    /** \returns the inverse permutation matrix.
      *
-      * \note \note_try_to_help_rvo
+      * \note \blank \note_try_to_help_rvo
      */
-    inline TransposeReturnType inverse() const
-    { return TransposeReturnType(derived()); }
+    inline InverseReturnType inverse() const
+    { return InverseReturnType(derived()); }
    /** \returns the tranpose permutation matrix.
      *
-      * \note \note_try_to_help_rvo
+      * \note \blank \note_try_to_help_rvo
      */
-    inline TransposeReturnType transpose() const
-    { return TransposeReturnType(derived()); }
+    inline InverseReturnType transpose() const
+    { return InverseReturnType(derived()); }

    /**** multiplication helpers to hopefully get RVO ****/

@@ -226,7 +213,7 @@ class PermutationBase : public EigenBase<Derived>

    /** \returns the product permutation matrix.
      *
-      * \note \note_try_to_help_rvo
+      * \note \blank \note_try_to_help_rvo
      */
    template<typename Other>
    inline PlainPermutationType operator*(const PermutationBase<Other>& other) const
@@ -234,18 +221,18 @@ class PermutationBase : public EigenBase<Derived>

    /** \returns the product of a permutation with another inverse permutation.
      *
-      * \note \note_try_to_help_rvo
+      * \note \blank \note_try_to_help_rvo
      */
    template<typename Other>
-    inline PlainPermutationType operator*(const Transpose<PermutationBase<Other> >& other) const
+    inline PlainPermutationType operator*(const InverseImpl<Other,PermutationStorage>& other) const
    { return PlainPermutationType(internal::PermPermProduct, *this, other.eval()); }

    /** \returns the product of an inverse permutation with another permutation.
      *
-      * \note \note_try_to_help_rvo
+      * \note \blank \note_try_to_help_rvo
      */
    template<typename Other> friend
-    inline PlainPermutationType operator*(const Transpose<PermutationBase<Other> >& other, const PermutationBase& perm)
+    inline PlainPermutationType operator*(const InverseImpl<Other, PermutationStorage>& other, const PermutationBase& perm)
    { return PlainPermutationType(internal::PermPermProduct, other.eval(), perm); }
    
    /** \returns the determinant of the permutation matrix, which is either 1 or -1 depending on the parity of the permutation.
@@ -281,20 +268,6 @@ class PermutationBase : public EigenBase<Derived>

 };

-/** \class PermutationMatrix
-  * \ingroup Core_Module
-  *
-  * \brief Permutation matrix
-  *
-  * \param SizeAtCompileTime the number of rows/cols, or Dynamic
-  * \param MaxSizeAtCompileTime the maximum number of rows/cols, or Dynamic. This optional parameter defaults to SizeAtCompileTime. Most of the time, you should not have to specify it.
-  * \param StorageIndex the integer type of the indices
-  *
-  * This class represents a permutation matrix, internally stored as a vector of integers.
-  *
-  * \sa class PermutationBase, class PermutationWrapper, class DiagonalMatrix
-  */
-
 namespace internal {
 template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename _StorageIndex>
 struct traits<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, _StorageIndex> >
@@ -303,9 +276,23 @@ struct traits<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, _Storag
  typedef PermutationStorage StorageKind;
  typedef Matrix<_StorageIndex, SizeAtCompileTime, 1, 0, MaxSizeAtCompileTime, 1> IndicesType;
  typedef _StorageIndex StorageIndex;
+  typedef void Scalar;
 };
 }

+/** \class PermutationMatrix
+  * \ingroup Core_Module
+  *
+  * \brief Permutation matrix
+  *
+  * \tparam SizeAtCompileTime the number of rows/cols, or Dynamic
+  * \tparam MaxSizeAtCompileTime the maximum number of rows/cols, or Dynamic. This optional parameter defaults to SizeAtCompileTime. Most of the time, you should not have to specify it.
+  * \tparam _StorageIndex the integer type of the indices
+  *
+  * This class represents a permutation matrix, internally stored as a vector of integers.
+  *
+  * \sa class PermutationBase, class PermutationWrapper, class DiagonalMatrix
+  */
 template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename _StorageIndex>
 class PermutationMatrix : public PermutationBase<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, _StorageIndex> >
 {
@@ -335,12 +322,6 @@ class PermutationMatrix : public PermutationBase<PermutationMatrix<SizeAtCompile
    inline PermutationMatrix(const PermutationBase<OtherDerived>& other)
      : m_indices(other.indices()) {}

-    #ifndef EIGEN_PARSED_BY_DOXYGEN
-    /** Standard copy constructor. Defined only to prevent a default copy constructor
-      * from hiding the other templated constructor */
-    inline PermutationMatrix(const PermutationMatrix& other) : m_indices(other.indices()) {}
-    #endif
-
    /** Generic constructor from expression of the indices. The indices
      * array has the meaning that the permutations sends each integer i to indices[i].
      *
@@ -375,17 +356,6 @@ class PermutationMatrix : public PermutationBase<PermutationMatrix<SizeAtCompile
      return Base::operator=(tr.derived());
    }

-    #ifndef EIGEN_PARSED_BY_DOXYGEN
-    /** This is a special case of the templated operator=. Its purpose is to
-      * prevent a default operator= from hiding the templated operator=.
-      */
-    PermutationMatrix& operator=(const PermutationMatrix& other)
-    {
-      m_indices = other.m_indices;
-      return *this;
-    }
-    #endif
-
    /** const version of indices(). */
    const IndicesType& indices() const { return m_indices; }
    /** \returns a reference to the stored array representing the permutation. */
@@ -396,13 +366,13 @@ class PermutationMatrix : public PermutationBase<PermutationMatrix<SizeAtCompile

 #ifndef EIGEN_PARSED_BY_DOXYGEN
    template<typename Other>
-    PermutationMatrix(const Transpose<PermutationBase<Other> >& other)
-      : m_indices(other.nestedExpression().size())
+    PermutationMatrix(const InverseImpl<Other,PermutationStorage>& other)
+      : m_indices(other.derived().nestedExpression().size())
    {
      eigen_internal_assert(m_indices.size() <= NumTraits<StorageIndex>::highest());
      StorageIndex end = StorageIndex(m_indices.size());
      for (StorageIndex i=0; i<end;++i)
-        m_indices.coeffRef(other.nestedExpression().indices().coeff(i)) = i;
+        m_indices.coeffRef(other.derived().nestedExpression().indices().coeff(i)) = i;
    }
    template<typename Lhs,typename Rhs>
    PermutationMatrix(internal::PermPermProduct_t, const Lhs& lhs, const Rhs& rhs)
@@ -426,6 +396,7 @@ struct traits<Map<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, _St
  typedef PermutationStorage StorageKind;
  typedef Map<const Matrix<_StorageIndex, SizeAtCompileTime, 1, 0, MaxSizeAtCompileTime, 1>, _PacketAccess> IndicesType;
  typedef _StorageIndex StorageIndex;
+  typedef void Scalar;
 };
 }

@@ -481,25 +452,13 @@ class Map<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, _StorageInd
    IndicesType m_indices;
 };

-/** \class PermutationWrapper
-  * \ingroup Core_Module
-  *
-  * \brief Class to view a vector of integers as a permutation matrix
-  *
-  * \param _IndicesType the type of the vector of integer (can be any compatible expression)
-  *
-  * This class allows to view any vector expression of integers as a permutation matrix.
-  *
-  * \sa class PermutationBase, class PermutationMatrix
-  */
-
 template<typename _IndicesType> class TranspositionsWrapper;
 namespace internal {
 template<typename _IndicesType>
 struct traits<PermutationWrapper<_IndicesType> >
 {
  typedef PermutationStorage StorageKind;
-  typedef typename _IndicesType::Scalar Scalar;
+  typedef void Scalar;
  typedef typename _IndicesType::Scalar StorageIndex;
  typedef _IndicesType IndicesType;
  enum {
@@ -512,6 +471,17 @@ struct traits<PermutationWrapper<_IndicesType> >
 };
 }

+/** \class PermutationWrapper
+  * \ingroup Core_Module
+  *
+  * \brief Class to view a vector of integers as a permutation matrix
+  *
+  * \tparam _IndicesType the type of the vector of integer (can be any compatible expression)
+  *
+  * This class allows to view any vector expression of integers as a permutation matrix.
+  *
+  * \sa class PermutationBase, class PermutationMatrix
+  */
 template<typename _IndicesType>
 class PermutationWrapper : public PermutationBase<PermutationWrapper<_IndicesType> >
 {
@@ -561,84 +531,61 @@ operator*(const PermutationBase<PermutationDerived> &permutation,
            (permutation.derived(), matrix.derived());
 }

-namespace internal {

-/* Template partial specialization for transposed/inverse permutations */
-
-template<typename Derived>
-struct traits<Transpose<PermutationBase<Derived> > >
- : traits<Derived>
-{};
-
-} // end namespace internal
-
-// TODO: the specificties should be handled by the evaluator,
-// at the very least we should only specialize TransposeImpl
-template<typename Derived>
-class Transpose<PermutationBase<Derived> >
-  : public EigenBase<Transpose<PermutationBase<Derived> > >
+template<typename PermutationType>
+class InverseImpl<PermutationType, PermutationStorage>
+  : public EigenBase<Inverse<PermutationType> >
 {
-    typedef Derived PermutationType;
-    typedef typename PermutationType::IndicesType IndicesType;
    typedef typename PermutationType::PlainPermutationType PlainPermutationType;
+    typedef internal::traits<PermutationType> PermTraits;
+  protected:
+    InverseImpl() {}
  public:
+    typedef Inverse<PermutationType> InverseType;
+    using EigenBase<Inverse<PermutationType> >::derived;

    #ifndef EIGEN_PARSED_BY_DOXYGEN
-    typedef internal::traits<PermutationType> Traits;
-    typedef typename Derived::DenseMatrixType DenseMatrixType;
+    typedef typename PermutationType::DenseMatrixType DenseMatrixType;
    enum {
-      Flags = Traits::Flags,
-      RowsAtCompileTime = Traits::RowsAtCompileTime,
-      ColsAtCompileTime = Traits::ColsAtCompileTime,
-      MaxRowsAtCompileTime = Traits::MaxRowsAtCompileTime,
-      MaxColsAtCompileTime = Traits::MaxColsAtCompileTime
+      RowsAtCompileTime = PermTraits::RowsAtCompileTime,
+      ColsAtCompileTime = PermTraits::ColsAtCompileTime,
+      MaxRowsAtCompileTime = PermTraits::MaxRowsAtCompileTime,
+      MaxColsAtCompileTime = PermTraits::MaxColsAtCompileTime
    };
-    typedef typename Traits::Scalar Scalar;
-    typedef typename Traits::StorageIndex StorageIndex;
    #endif

-    Transpose(const PermutationType& p) : m_permutation(p) {}
-
-    inline Index rows() const { return m_permutation.rows(); }
-    inline Index cols() const { return m_permutation.cols(); }
-
    #ifndef EIGEN_PARSED_BY_DOXYGEN
    template<typename DenseDerived>
    void evalTo(MatrixBase<DenseDerived>& other) const
    {
      other.setZero();
-      for (Index i=0; i<rows();++i)
-        other.coeffRef(i, m_permutation.indices().coeff(i)) = typename DenseDerived::Scalar(1);
+      for (Index i=0; i<derived().rows();++i)
+        other.coeffRef(i, derived().nestedExpression().indices().coeff(i)) = typename DenseDerived::Scalar(1);
    }
    #endif

    /** \return the equivalent permutation matrix */
-    PlainPermutationType eval() const { return *this; }
+    PlainPermutationType eval() const { return derived(); }

-    DenseMatrixType toDenseMatrix() const { return *this; }
+    DenseMatrixType toDenseMatrix() const { return derived(); }

    /** \returns the matrix with the inverse permutation applied to the columns.
      */
    template<typename OtherDerived> friend
-    const Product<OtherDerived, Transpose, AliasFreeProduct>
-    operator*(const MatrixBase<OtherDerived>& matrix, const Transpose& trPerm)
+    const Product<OtherDerived, InverseType, AliasFreeProduct>
+    operator*(const MatrixBase<OtherDerived>& matrix, const InverseType& trPerm)
    {
-      return Product<OtherDerived, Transpose, AliasFreeProduct>(matrix.derived(), trPerm.derived());
+      return Product<OtherDerived, InverseType, AliasFreeProduct>(matrix.derived(), trPerm.derived());
    }

    /** \returns the matrix with the inverse permutation applied to the rows.
      */
    template<typename OtherDerived>
-    const Product<Transpose, OtherDerived, AliasFreeProduct>
+    const Product<InverseType, OtherDerived, AliasFreeProduct>
    operator*(const MatrixBase<OtherDerived>& matrix) const
    {
-      return Product<Transpose, OtherDerived, AliasFreeProduct>(*this, matrix.derived());
+      return Product<InverseType, OtherDerived, AliasFreeProduct>(derived(), matrix.derived());
    }
-
-    const PermutationType& nestedExpression() const { return m_permutation; }
-
-  protected:
-    const PermutationType& m_permutation;
 };

 template<typename Derived>
--- a/Eigen/src/Core/PlainObjectBase.h
+++ b/Eigen/src/Core/PlainObjectBase.h
@@ -41,7 +41,7 @@ template<> struct check_rows_cols_for_overflow<Dynamic> {
  {
    // http://hg.mozilla.org/mozilla-central/file/6c8a909977d3/xpcom/ds/CheckedInt.h#l242
    // we assume Index is signed
-    Index max_index = (size_t(1) << (8 * sizeof(Index) - 1)) - 1; // assume Index is signed
+    Index max_index = (std::size_t(1) << (8 * sizeof(Index) - 1)) - 1; // assume Index is signed
    bool error = (rows == 0 || cols == 0) ? false
               : (rows > max_index / cols);
    if (error)
@@ -58,34 +58,41 @@ template<typename MatrixTypeA, typename MatrixTypeB, bool SwapPointers> struct m

 } // end namespace internal

+#ifdef EIGEN_PARSED_BY_DOXYGEN
+namespace doxygen {
+
+// This is a workaround for doxygen not being able to understand the inheritance logic
+// when it is hidden by the dense_xpr_base helper struct.
+// Moreover, doxygen fails to include members that are not documented in the declaration body of
+// MatrixBase if we inherit MatrixBase<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >,
+// this is why we simply inherit MatrixBase, though this does not make sense.
+
+/** This class is just a workaround for Doxygen and it does not actually exist. */
+template<typename Derived> struct dense_xpr_base_dispatcher;
+/** This class is just a workaround for Doxygen and it does not actually exist. */
+template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
+struct dense_xpr_base_dispatcher<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >
+    : public MatrixBase {};
+/** This class is just a workaround for Doxygen and it does not actually exist. */
+template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
+struct dense_xpr_base_dispatcher<Array<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >
+    : public ArrayBase {};
+
+} // namespace doxygen
+
 /** \class PlainObjectBase
+  * \ingroup Core_Module
  * \brief %Dense storage base class for matrices and arrays.
  *
  * This class can be extended with the help of the plugin mechanism described on the page
-  * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_PLAINOBJECTBASE_PLUGIN.
+  * \ref TopicCustomizing_Plugins by defining the preprocessor symbol \c EIGEN_PLAINOBJECTBASE_PLUGIN.
+  *
+  * \tparam Derived is the derived type, e.g., a Matrix or Array
  *
  * \sa \ref TopicClassHierarchy
  */
-#ifdef EIGEN_PARSED_BY_DOXYGEN
-namespace internal {
-
-// this is a workaround to doxygen not being able to understand the inheritance logic
-// when it is hidden by the dense_xpr_base helper struct.
-/** This class is just a workaround for Doxygen and it does not not actually exist. */
-template<typename Derived> struct dense_xpr_base_dispatcher_for_doxygen;// : public MatrixBase<Derived> {};
-/** This class is just a workaround for Doxygen and it does not not actually exist. */
-template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
-struct dense_xpr_base_dispatcher_for_doxygen<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >
-    : public MatrixBase<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> > {};
-/** This class is just a workaround for Doxygen and it does not not actually exist. */
-template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
-struct dense_xpr_base_dispatcher_for_doxygen<Array<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >
-    : public ArrayBase<Array<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> > {};
-
-} // namespace internal
-
 template<typename Derived>
-class PlainObjectBase : public internal::dense_xpr_base_dispatcher_for_doxygen<Derived>
+class PlainObjectBase : public doxygen::dense_xpr_base_dispatcher<Derived>
 #else
 template<typename Derived>
 class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
@@ -97,7 +104,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type

    typedef typename internal::traits<Derived>::StorageKind StorageKind;
    typedef typename internal::traits<Derived>::Scalar Scalar;
-    
+
    typedef typename internal::packet_traits<Scalar>::type PacketScalar;
    typedef typename NumTraits<Scalar>::Real RealScalar;
    typedef Derived DenseType;
@@ -145,6 +152,10 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Index cols() const { return m_storage.cols(); }

+    /** This is an overloaded version of DenseCoeffsBase<Derived,ReadOnlyAccessors>::coeff(Index,Index) const
+      * provided to by-pass the creation of an evaluator of the expression, thus saving compilation efforts.
+      *
+      * See DenseCoeffsBase<Derived,ReadOnlyAccessors>::coeff(Index,Index) const for details. */
    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE const Scalar& coeff(Index rowId, Index colId) const
    {
@@ -154,12 +165,20 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
        return m_storage.data()[rowId + colId * m_storage.rows()];
    }

+    /** This is an overloaded version of DenseCoeffsBase<Derived,ReadOnlyAccessors>::coeff(Index) const
+      * provided to by-pass the creation of an evaluator of the expression, thus saving compilation efforts.
+      *
+      * See DenseCoeffsBase<Derived,ReadOnlyAccessors>::coeff(Index) const for details. */
    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE const Scalar& coeff(Index index) const
    {
      return m_storage.data()[index];
    }

+    /** This is an overloaded version of DenseCoeffsBase<Derived,WriteAccessors>::coeffRef(Index,Index) const
+      * provided to by-pass the creation of an evaluator of the expression, thus saving compilation efforts.
+      *
+      * See DenseCoeffsBase<Derived,WriteAccessors>::coeffRef(Index,Index) const for details. */
    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Scalar& coeffRef(Index rowId, Index colId)
    {
@@ -169,12 +188,18 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
        return m_storage.data()[rowId + colId * m_storage.rows()];
    }

+    /** This is an overloaded version of DenseCoeffsBase<Derived,WriteAccessors>::coeffRef(Index) const
+      * provided to by-pass the creation of an evaluator of the expression, thus saving compilation efforts.
+      *
+      * See DenseCoeffsBase<Derived,WriteAccessors>::coeffRef(Index) const for details. */
    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Scalar& coeffRef(Index index)
    {
      return m_storage.data()[index];
    }

+    /** This is the const version of coeffRef(Index,Index), which is thus a synonym of coeff(Index,Index).
+      * It is provided for convenience. */
    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE const Scalar& coeffRef(Index rowId, Index colId) const
    {
@@ -184,6 +209,8 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
        return m_storage.data()[rowId + colId * m_storage.rows()];
    }

+    /** This is the const version of coeffRef(Index) which is thus synonym of coeff(Index).
+      * It is provided for convenience. */
    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE const Scalar& coeffRef(Index index) const
    {
@@ -263,7 +290,6 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
        m_storage.resize(size, rows, cols);
        if(size_changed) EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
      #else
-        internal::check_rows_cols_for_overflow<MaxSizeAtCompileTime>::run(rows, cols);
        m_storage.resize(rows*cols, rows, cols);
      #endif
    }
@@ -332,7 +358,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
      * remain row-vectors and vectors remain vectors.
      */
    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC 
+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE void resizeLike(const EigenBase<OtherDerived>& _other)
    {
      const OtherDerived& other = _other.derived();
@@ -357,7 +383,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
      * of rows and/or of columns, you can use conservativeResize(NoChange_t, Index) or
      * conservativeResize(Index, NoChange_t).
      *
-      * Matrices are resized relative to the top-left element. In case values need to be 
+      * Matrices are resized relative to the top-left element. In case values need to be
      * appended to the matrix they will be uninitialized.
      */
    EIGEN_DEVICE_FUNC
@@ -414,7 +440,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
      * of rows and/or of columns, you can use conservativeResize(NoChange_t, Index) or
      * conservativeResize(Index, NoChange_t).
      *
-      * Matrices are resized relative to the top-left element. In case values need to be 
+      * Matrices are resized relative to the top-left element. In case values need to be
      * appended to the matrix they will copied from \c other.
      */
    template<typename OtherDerived>
@@ -450,6 +476,10 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
      return Base::operator=(func);
    }

+    // Prevent user from trying to instantiate PlainObjectBase objects
+    // by making all its constructors protected. See bug 1074.
+  protected:
+
    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE PlainObjectBase() : m_storage()
    {
@@ -468,15 +498,15 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
    }
 #endif

-#ifdef EIGEN_HAVE_RVALUE_REFERENCES
+#if EIGEN_HAS_RVALUE_REFERENCES
    EIGEN_DEVICE_FUNC
-    PlainObjectBase(PlainObjectBase&& other)
+    PlainObjectBase(PlainObjectBase&& other) EIGEN_NOEXCEPT
      : m_storage( std::move(other.m_storage) )
    {
    }

    EIGEN_DEVICE_FUNC
-    PlainObjectBase& operator=(PlainObjectBase&& other)
+    PlainObjectBase& operator=(PlainObjectBase&& other) EIGEN_NOEXCEPT
    {
      using std::swap;
      swap(m_storage, other.m_storage);
@@ -496,17 +526,71 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
 //       EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
    }

-    /** \copydoc MatrixBase::operator=(const EigenBase<OtherDerived>&)
+    #if EIGEN_HAS_CXX11
+    /** \brief Construct a row or column vector with fixed size from an arbitrary number of coefficients. \cpp11
+      *
+      * \only_for_vectors
+      * 
+      * This constructor is for 1D array or vectors with more than 4 coefficients.
+      * There exists C++98 analogue constructors for fixed-size array/vector having 1, 2, 3, or 4 coefficients.
+      * 
+      * \warning To construct a column (resp. row) vector of fixed length, the number of values passed to this 
+      * constructor must match the fixed number of rows (resp. columns) of \c *this.
      */
-    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC 
-    EIGEN_STRONG_INLINE Derived& operator=(const EigenBase<OtherDerived> &other)
+    template <typename... ArgTypes>
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    PlainObjectBase(const Scalar& a0, const Scalar& a1, const Scalar& a2,  const Scalar& a3, const ArgTypes&... args)
+      : m_storage()
    {
-      _resize_to_match(other);
-      Base::operator=(other.derived());
-      return this->derived();
+      _check_template_params();
+      EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(PlainObjectBase, sizeof...(args) + 4);
+      m_storage.data()[0] = a0;
+      m_storage.data()[1] = a1;
+      m_storage.data()[2] = a2;
+      m_storage.data()[3] = a3;
+      int i = 4;
+      auto x = {(m_storage.data()[i++] = args, 0)...};
+      static_cast<void>(x);
    }

+    /** \brief Constructs a Matrix or Array and initializes it by elements given by an initializer list of initializer
+      * lists \cpp11
+      */
+    EIGEN_DEVICE_FUNC
+    explicit EIGEN_STRONG_INLINE PlainObjectBase(const std::initializer_list<std::initializer_list<Scalar>>& list)
+      : m_storage()
+    {
+      _check_template_params();
+
+      size_t list_size = 0;
+      if (list.begin() != list.end()) {
+        list_size = list.begin()->size();
+      }
+
+      // This is to allow syntax like VectorXi {{1, 2, 3, 4}}
+      if (ColsAtCompileTime == 1 && list.size() == 1) {
+        eigen_assert(list_size == static_cast<size_t>(RowsAtCompileTime) || RowsAtCompileTime == Dynamic);
+        resize(list_size, ColsAtCompileTime);
+        std::copy(list.begin()->begin(), list.begin()->end(), m_storage.data());
+      } else {
+        eigen_assert(list.size() == static_cast<size_t>(RowsAtCompileTime) || RowsAtCompileTime == Dynamic);
+        eigen_assert(list_size == static_cast<size_t>(ColsAtCompileTime) || ColsAtCompileTime == Dynamic);
+        resize(list.size(), list_size);
+       
+        Index row_index = 0;
+        for (const std::initializer_list<Scalar>& row : list) {
+          eigen_assert(list_size == row.size());
+          Index col_index = 0;
+          for (const Scalar& e : row) {
+            coeffRef(row_index, col_index) = e;
+            ++col_index;
+          }
+          ++row_index;
+        }
+      }
+    }
+    #endif  // end EIGEN_HAS_CXX11
+
    /** \sa PlainObjectBase::operator=(const EigenBase<OtherDerived>&) */
    template<typename OtherDerived>
    EIGEN_DEVICE_FUNC
@@ -520,7 +604,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type

    /** \sa PlainObjectBase::operator=(const EigenBase<OtherDerived>&) */
    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC 
+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE PlainObjectBase(const EigenBase<OtherDerived> &other)
      : m_storage()
    {
@@ -539,11 +623,29 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
      other.evalTo(this->derived());
    }

+  public:
+
+    /** \brief Copies the generic expression \a other into *this.
+      * \copydetails DenseBase::operator=(const EigenBase<OtherDerived> &other)
+      */
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Derived& operator=(const EigenBase<OtherDerived> &other)
+    {
+      _resize_to_match(other);
+      Base::operator=(other.derived());
+      return this->derived();
+    }
+
    /** \name Map
      * These are convenience functions returning Map objects. The Map() static functions return unaligned Map objects,
      * while the AlignedMap() functions return aligned Map objects and thus should be called only with 16-byte-aligned
      * \a data pointers.
      *
+      * Here is an example using strides:
+      * \include Matrix_Map_stride.cpp
+      * Output: \verbinclude Matrix_Map_stride.out
+      *
      * \see class Map
      */
    //@{
@@ -613,8 +715,8 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
    //@}

    using Base::setConstant;
-    EIGEN_DEVICE_FUNC Derived& setConstant(Index size, const Scalar& value);
-    EIGEN_DEVICE_FUNC Derived& setConstant(Index rows, Index cols, const Scalar& value);
+    EIGEN_DEVICE_FUNC Derived& setConstant(Index size, const Scalar& val);
+    EIGEN_DEVICE_FUNC Derived& setConstant(Index rows, Index cols, const Scalar& val);

    using Base::setZero;
    EIGEN_DEVICE_FUNC Derived& setZero(Index size);
@@ -641,7 +743,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
      * remain row-vectors and vectors remain vectors.
      */
    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC 
+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE void _resize_to_match(const EigenBase<OtherDerived>& other)
    {
      #ifdef EIGEN_NO_AUTOMATIC_RESIZING
@@ -668,10 +770,10 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
      *
      * \internal
      */
-    // aliasing is dealt once in internall::call_assignment
+    // aliasing is dealt once in internal::call_assignment
    // so at this stage we have to assume aliasing... and resising has to be done later.
    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC 
+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Derived& _set(const DenseBase<OtherDerived>& other)
    {
      internal::call_assignment(this->derived(), other.derived());
@@ -684,7 +786,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
      * \sa operator=(const MatrixBase<OtherDerived>&), _set()
      */
    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC 
+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Derived& _set_noalias(const DenseBase<OtherDerived>& other)
    {
      // I don't think we need this resize call since the lazyAssign will anyways resize
@@ -692,7 +794,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
      //_resize_to_match(other);
      // the 'false' below means to enforce lazy evaluation. We don't use lazyAssign() because
      // it wouldn't allow to copy a row-vector into a column-vector.
-      internal::call_assignment_no_alias(this->derived(), other.derived(), internal::assign_op<Scalar>());
+      internal::call_assignment_no_alias(this->derived(), other.derived(), internal::assign_op<Scalar,typename OtherDerived::Scalar>());
      return this->derived();
    }

@@ -700,23 +802,25 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE void _init2(Index rows, Index cols, typename internal::enable_if<Base::SizeAtCompileTime!=2,T0>::type* = 0)
    {
-      EIGEN_STATIC_ASSERT(bool(NumTraits<T0>::IsInteger) &&
-                          bool(NumTraits<T1>::IsInteger),
+      const bool t0_is_integer_alike = internal::is_valid_index_type<T0>::value;
+      const bool t1_is_integer_alike = internal::is_valid_index_type<T1>::value;
+      EIGEN_STATIC_ASSERT(t0_is_integer_alike &&
+                          t1_is_integer_alike,
                          FLOATING_POINT_ARGUMENT_PASSED__INTEGER_WAS_EXPECTED)
      resize(rows,cols);
    }
-    
+
    template<typename T0, typename T1>
-    EIGEN_DEVICE_FUNC 
-    EIGEN_STRONG_INLINE void _init2(const Scalar& val0, const Scalar& val1, typename internal::enable_if<Base::SizeAtCompileTime==2,T0>::type* = 0)
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE void _init2(const T0& val0, const T1& val1, typename internal::enable_if<Base::SizeAtCompileTime==2,T0>::type* = 0)
    {
      EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(PlainObjectBase, 2)
-      m_storage.data()[0] = val0;
-      m_storage.data()[1] = val1;
+      m_storage.data()[0] = Scalar(val0);
+      m_storage.data()[1] = Scalar(val1);
    }
-    
+
    template<typename T0, typename T1>
-    EIGEN_DEVICE_FUNC 
+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE void _init2(const Index& val0, const Index& val1,
                                    typename internal::enable_if<    (!internal::is_same<Index,Scalar>::value)
                                                                  && (internal::is_same<T0,Index>::value)
@@ -736,13 +840,14 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
                                                                              && ((!internal::is_same<typename internal::traits<Derived>::XprKind,ArrayXpr>::value || Base::SizeAtCompileTime==Dynamic)),T>::type* = 0)
    {
      // NOTE MSVC 2008 complains if we directly put bool(NumTraits<T>::IsInteger) as the EIGEN_STATIC_ASSERT argument.
-      const bool is_integer = NumTraits<T>::IsInteger;
-      EIGEN_STATIC_ASSERT(is_integer,
+      const bool is_integer_alike = internal::is_valid_index_type<T>::value;
+      EIGEN_UNUSED_VARIABLE(is_integer_alike);
+      EIGEN_STATIC_ASSERT(is_integer_alike,
                          FLOATING_POINT_ARGUMENT_PASSED__INTEGER_WAS_EXPECTED)
      resize(size);
    }
-    
-    // We have a 1x1 matrix/array => the argument is interpreted as the value of the unique coefficient (case where scalar type can be implicitely converted)
+
+    // We have a 1x1 matrix/array => the argument is interpreted as the value of the unique coefficient (case where scalar type can be implicitly converted)
    template<typename T>
    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE void _init1(const Scalar& val0, typename internal::enable_if<Base::SizeAtCompileTime==1 && internal::is_convertible<T, Scalar>::value,T>::type* = 0)
@@ -750,7 +855,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
      EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(PlainObjectBase, 1)
      m_storage.data()[0] = val0;
    }
-    
+
    // We have a 1x1 matrix/array => the argument is interpreted as the value of the unique coefficient (case where scalar type match the index type)
    template<typename T>
    EIGEN_DEVICE_FUNC
@@ -778,6 +883,13 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
      this->_set_noalias(other);
    }

+    // Initialize an arbitrary matrix from an object convertible to the Derived type.
+    template<typename T>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE void _init1(const Derived& other){
+      this->_set_noalias(other);
+    }
+
    // Initialize an arbitrary matrix from a generic Eigen expression
    template<typename T, typename OtherDerived>
    EIGEN_DEVICE_FUNC
@@ -799,8 +911,8 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
    {
      this->derived() = r;
    }
-    
-    // For fixed -size arrays:
+
+    // For fixed-size Array<Scalar,...>
    template<typename T>
    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE void _init1(const Scalar& val0,
@@ -811,7 +923,8 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
    {
      Base::setConstant(val0);
    }
-    
+
+    // For fixed-size Array<Index,...>
    template<typename T>
    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE void _init1(const Index& val0,
@@ -824,34 +937,34 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
    {
      Base::setConstant(val0);
    }
-    
+
    template<typename MatrixTypeA, typename MatrixTypeB, bool SwapPointers>
    friend struct internal::matrix_swap_impl;

  public:
-    
+
 #ifndef EIGEN_PARSED_BY_DOXYGEN
    /** \internal
      * \brief Override DenseBase::swap() since for dynamic-sized matrices
      * of same type it is enough to swap the data pointers.
      */
    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    void swap(DenseBase<OtherDerived> & other)
    {
      enum { SwapPointers = internal::is_same<Derived, OtherDerived>::value && Base::SizeAtCompileTime==Dynamic };
      internal::matrix_swap_impl<Derived, OtherDerived, bool(SwapPointers)>::run(this->derived(), other.derived());
    }
-    
+
    /** \internal
      * \brief const version forwarded to DenseBase::swap
      */
    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    void swap(DenseBase<OtherDerived> const & other)
    { Base::swap(other.derived()); }
-    
-    EIGEN_DEVICE_FUNC 
+
+    EIGEN_DEVICE_FUNC
    static EIGEN_STRONG_INLINE void _check_template_params()
    {
      EIGEN_STATIC_ASSERT((EIGEN_IMPLIES(MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1, (Options&RowMajor)==RowMajor)
@@ -875,13 +988,19 @@ namespace internal {
 template <typename Derived, typename OtherDerived, bool IsVector>
 struct conservative_resize_like_impl
 {
+  #if EIGEN_HAS_TYPE_TRAITS
+  static const bool IsRelocatable = std::is_trivially_copyable<typename Derived::Scalar>::value;
+  #else
+  static const bool IsRelocatable = !NumTraits<typename Derived::Scalar>::RequireInitialization;
+  #endif
  static void run(DenseBase<Derived>& _this, Index rows, Index cols)
  {
    if (_this.rows() == rows && _this.cols() == cols) return;
    EIGEN_STATIC_ASSERT_DYNAMIC_SIZE(Derived)

-    if ( ( Derived::IsRowMajor && _this.cols() == cols) || // row-major and we change only the number of rows
-         (!Derived::IsRowMajor && _this.rows() == rows) )  // column-major and we change only the number of columns
+    if ( IsRelocatable
+          && (( Derived::IsRowMajor && _this.cols() == cols) ||  // row-major and we change only the number of rows
+              (!Derived::IsRowMajor && _this.rows() == rows) ))  // column-major and we change only the number of columns
    {
      internal::check_rows_cols_for_overflow<Derived::MaxSizeAtCompileTime>::run(rows, cols);
      _this.derived().m_storage.conservativeResize(rows*cols,rows,cols);
@@ -890,8 +1009,8 @@ struct conservative_resize_like_impl
    {
      // The storage order does not allow us to use reallocation.
      typename Derived::PlainObject tmp(rows,cols);
-      const Index common_rows = (std::min)(rows, _this.rows());
-      const Index common_cols = (std::min)(cols, _this.cols());
+      const Index common_rows = numext::mini(rows, _this.rows());
+      const Index common_cols = numext::mini(cols, _this.cols());
      tmp.block(0,0,common_rows,common_cols) = _this.block(0,0,common_rows,common_cols);
      _this.derived().swap(tmp);
    }
@@ -909,8 +1028,9 @@ struct conservative_resize_like_impl
    EIGEN_STATIC_ASSERT_DYNAMIC_SIZE(Derived)
    EIGEN_STATIC_ASSERT_DYNAMIC_SIZE(OtherDerived)

-    if ( ( Derived::IsRowMajor && _this.cols() == other.cols()) || // row-major and we change only the number of rows
-         (!Derived::IsRowMajor && _this.rows() == other.rows()) )  // column-major and we change only the number of columns
+    if ( IsRelocatable &&
+          (( Derived::IsRowMajor && _this.cols() == other.cols()) ||  // row-major and we change only the number of rows
+           (!Derived::IsRowMajor && _this.rows() == other.rows()) ))  // column-major and we change only the number of columns
    {
      const Index new_rows = other.rows() - _this.rows();
      const Index new_cols = other.cols() - _this.cols();
@@ -924,8 +1044,8 @@ struct conservative_resize_like_impl
    {
      // The storage order does not allow us to use reallocation.
      typename Derived::PlainObject tmp(other);
-      const Index common_rows = (std::min)(tmp.rows(), _this.rows());
-      const Index common_cols = (std::min)(tmp.cols(), _this.cols());
+      const Index common_rows = numext::mini(tmp.rows(), _this.rows());
+      const Index common_cols = numext::mini(tmp.cols(), _this.cols());
      tmp.block(0,0,common_rows,common_cols) = _this.block(0,0,common_rows,common_cols);
      _this.derived().swap(tmp);
    }
@@ -938,13 +1058,18 @@ template <typename Derived, typename OtherDerived>
 struct conservative_resize_like_impl<Derived,OtherDerived,true>
  : conservative_resize_like_impl<Derived,OtherDerived,false>
 {
-  using conservative_resize_like_impl<Derived,OtherDerived,false>::run;
-  
+  typedef conservative_resize_like_impl<Derived,OtherDerived,false> Base;
+  using Base::run;
+  using Base::IsRelocatable;
+
  static void run(DenseBase<Derived>& _this, Index size)
  {
    const Index new_rows = Derived::RowsAtCompileTime==1 ? 1 : size;
    const Index new_cols = Derived::RowsAtCompileTime==1 ? size : 1;
-    _this.derived().m_storage.conservativeResize(size,new_rows,new_cols);
+    if(IsRelocatable)
+      _this.derived().m_storage.conservativeResize(size,new_rows,new_cols);
+    else
+      Base::run(_this.derived(), new_rows, new_cols);
  }

  static void run(DenseBase<Derived>& _this, const DenseBase<OtherDerived>& other)
@@ -955,7 +1080,10 @@ struct conservative_resize_like_impl<Derived,OtherDerived,true>

    const Index new_rows = Derived::RowsAtCompileTime==1 ? 1 : other.rows();
    const Index new_cols = Derived::RowsAtCompileTime==1 ? other.cols() : 1;
-    _this.derived().m_storage.conservativeResize(other.size(),new_rows,new_cols);
+    if(IsRelocatable)
+      _this.derived().m_storage.conservativeResize(other.size(),new_rows,new_cols);
+    else
+      Base::run(_this.derived(), new_rows, new_cols);

    if (num_new_elements > 0)
      _this.tail(num_new_elements) = other.tail(num_new_elements);
@@ -966,7 +1094,7 @@ template<typename MatrixTypeA, typename MatrixTypeB, bool SwapPointers>
 struct matrix_swap_impl
 {
  EIGEN_DEVICE_FUNC
-  static inline void run(MatrixTypeA& a, MatrixTypeB& b)
+  static EIGEN_STRONG_INLINE void run(MatrixTypeA& a, MatrixTypeB& b)
  {
    a.base().swap(b);
  }
--- a/Eigen/src/Core/Product.h
+++ b/Eigen/src/Core/Product.h
@@ -14,57 +14,8 @@ namespace Eigen {

 template<typename Lhs, typename Rhs, int Option, typename StorageKind> class ProductImpl;

-/** \class Product
-  * \ingroup Core_Module
-  *
-  * \brief Expression of the product of two arbitrary matrices or vectors
-  *
-  * \param Lhs the type of the left-hand side expression
-  * \param Rhs the type of the right-hand side expression
-  *
-  * This class represents an expression of the product of two arbitrary matrices.
-  * 
-  * The other template parameters are:
-  * \tparam Option     can be DefaultProduct, AliasFreeProduct, or LazyProduct
-  *
-  */
-
-
 namespace internal {

-// Determine the scalar of Product<Lhs, Rhs>. This is normally the same as Lhs::Scalar times
-// Rhs::Scalar, but product with permutation matrices inherit the scalar of the other factor.
-template<typename Lhs, typename Rhs, typename LhsShape = typename evaluator_traits<Lhs>::Shape, 
-         typename RhsShape = typename evaluator_traits<Rhs>::Shape >
-struct product_result_scalar
-{
-  typedef typename scalar_product_traits<typename Lhs::Scalar, typename Rhs::Scalar>::ReturnType Scalar;
-};
-
-template<typename Lhs, typename Rhs, typename RhsShape>
-struct product_result_scalar<Lhs, Rhs, PermutationShape, RhsShape>
-{
-  typedef typename Rhs::Scalar Scalar;
-};
-
-template<typename Lhs, typename Rhs, typename LhsShape>
-  struct product_result_scalar<Lhs, Rhs, LhsShape, PermutationShape>
-{
-  typedef typename Lhs::Scalar Scalar;
-};
-
-template<typename Lhs, typename Rhs, typename RhsShape>
-struct product_result_scalar<Lhs, Rhs, TranspositionsShape, RhsShape>
-{
-  typedef typename Rhs::Scalar Scalar;
-};
-
-template<typename Lhs, typename Rhs, typename LhsShape>
-  struct product_result_scalar<Lhs, Rhs, LhsShape, TranspositionsShape>
-{
-  typedef typename Lhs::Scalar Scalar;
-};
-
 template<typename Lhs, typename Rhs, int Option>
 struct traits<Product<Lhs, Rhs, Option> >
 {
@@ -75,7 +26,7 @@ struct traits<Product<Lhs, Rhs, Option> >
  
  typedef MatrixXpr XprKind;
  
-  typedef typename product_result_scalar<LhsCleaned,RhsCleaned>::Scalar Scalar;
+  typedef typename ScalarBinaryOpTraits<typename traits<LhsCleaned>::Scalar, typename traits<RhsCleaned>::Scalar>::ReturnType Scalar;
  typedef typename product_promote_storage_type<typename LhsTraits::StorageKind,
                                                typename RhsTraits::StorageKind,
                                                internal::product_type<Lhs,Rhs>::ret>::ret StorageKind;
@@ -102,7 +53,20 @@ struct traits<Product<Lhs, Rhs, Option> >

 } // end namespace internal

-
+/** \class Product
+  * \ingroup Core_Module
+  *
+  * \brief Expression of the product of two arbitrary matrices or vectors
+  *
+  * \tparam _Lhs the type of the left-hand side expression
+  * \tparam _Rhs the type of the right-hand side expression
+  *
+  * This class represents an expression of the product of two arbitrary matrices.
+  *
+  * The other template parameters are:
+  * \tparam Option     can be DefaultProduct, AliasFreeProduct, or LazyProduct
+  *
+  */
 template<typename _Lhs, typename _Rhs, int Option>
 class Product : public ProductImpl<_Lhs,_Rhs,Option,
                                   typename internal::product_promote_storage_type<typename internal::traits<_Lhs>::StorageKind,
@@ -126,18 +90,23 @@ class Product : public ProductImpl<_Lhs,_Rhs,Option,
    typedef typename internal::remove_all<LhsNested>::type LhsNestedCleaned;
    typedef typename internal::remove_all<RhsNested>::type RhsNestedCleaned;

-    EIGEN_DEVICE_FUNC Product(const Lhs& lhs, const Rhs& rhs) : m_lhs(lhs), m_rhs(rhs)
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Product(const Lhs& lhs, const Rhs& rhs) : m_lhs(lhs), m_rhs(rhs)
    {
      eigen_assert(lhs.cols() == rhs.rows()
        && "invalid matrix product"
        && "if you wanted a coeff-wise or a dot product use the respective explicit functions");
    }

-    EIGEN_DEVICE_FUNC inline Index rows() const { return m_lhs.rows(); }
-    EIGEN_DEVICE_FUNC inline Index cols() const { return m_rhs.cols(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Index rows() const { return m_lhs.rows(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Index cols() const { return m_rhs.cols(); }

-    EIGEN_DEVICE_FUNC const LhsNestedCleaned& lhs() const { return m_lhs; }
-    EIGEN_DEVICE_FUNC const RhsNestedCleaned& rhs() const { return m_rhs; }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const LhsNestedCleaned& lhs() const { return m_lhs; }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const RhsNestedCleaned& rhs() const { return m_rhs; }

  protected:

@@ -152,7 +121,7 @@ class dense_product_base
 : public internal::dense_xpr_base<Product<Lhs,Rhs,Option> >::type
 {};

-/** Convertion to scalar for inner-products */
+/** Conversion to scalar for inner-products */
 template<typename Lhs, typename Rhs, int Option>
 class dense_product_base<Lhs, Rhs, Option, InnerProduct>
 : public internal::dense_xpr_base<Product<Lhs,Rhs,Option> >::type
@@ -163,7 +132,7 @@ public:
  using Base::derived;
  typedef typename Base::Scalar Scalar;
  
-  operator const Scalar() const
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE operator const Scalar() const
  {
    return internal::evaluator<ProductXpr>(derived()).coeff(0,0);
  }
@@ -198,7 +167,7 @@ class ProductImpl<Lhs,Rhs,Option,Dense>
    
  public:
  
-    EIGEN_DEVICE_FUNC Scalar coeff(Index row, Index col) const
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar coeff(Index row, Index col) const
    {
      EIGEN_STATIC_ASSERT(EnableCoeff, THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS);
      eigen_assert( (Option==LazyProduct) || (this->rows() == 1 && this->cols() == 1) );
@@ -206,7 +175,7 @@ class ProductImpl<Lhs,Rhs,Option,Dense>
      return internal::evaluator<Derived>(derived()).coeff(row,col);
    }

-    EIGEN_DEVICE_FUNC Scalar coeff(Index i) const
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar coeff(Index i) const
    {
      EIGEN_STATIC_ASSERT(EnableCoeff, THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS);
      eigen_assert( (Option==LazyProduct) || (this->rows() == 1 && this->cols() == 1) );
@@ -217,29 +186,6 @@ class ProductImpl<Lhs,Rhs,Option,Dense>
  
 };

-/***************************************************************************
-* Implementation of matrix base methods
-***************************************************************************/
-
-
-/** \internal used to test the evaluator only
-  */
-template<typename Lhs,typename Rhs>
-const Product<Lhs,Rhs>
-prod(const Lhs& lhs, const Rhs& rhs)
-{
-  return Product<Lhs,Rhs>(lhs,rhs);
-}
-
-/** \internal used to test the evaluator only
-  */
-template<typename Lhs,typename Rhs>
-const Product<Lhs,Rhs,LazyProduct>
-lazyprod(const Lhs& lhs, const Rhs& rhs)
-{
-  return Product<Lhs,Rhs,LazyProduct>(lhs,rhs);
-}
-
 } // end namespace Eigen

 #endif // EIGEN_PRODUCT_H
--- a/Eigen/src/Core/ProductEvaluators.h
+++ b/Eigen/src/Core/ProductEvaluators.h
@@ -20,7 +20,7 @@ namespace internal {
 /** \internal
  * Evaluator of a product expression.
  * Since products require special treatments to handle all possible cases,
-  * we simply deffer the evaluation logic to a product_evaluator class
+  * we simply defer the evaluation logic to a product_evaluator class
  * which offers more partial specialization possibilities.
  * 
  * \sa class product_evaluator
@@ -32,20 +32,31 @@ struct evaluator<Product<Lhs, Rhs, Options> >
  typedef Product<Lhs, Rhs, Options> XprType;
  typedef product_evaluator<XprType> Base;
  
-  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) : Base(xpr) {}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& xpr) : Base(xpr) {}
 };
 
-// Catch scalar * ( A * B ) and transform it to (A*scalar) * B
+// Catch "scalar * ( A * B )" and transform it to "(A*scalar) * B"
 // TODO we should apply that rule only if that's really helpful
-template<typename Lhs, typename Rhs, typename Scalar>
-struct evaluator<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>,  const Product<Lhs, Rhs, DefaultProduct>  > > 
- : public evaluator<Product<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>,const Lhs>, Rhs, DefaultProduct> >
+template<typename Lhs, typename Rhs, typename Scalar1, typename Scalar2, typename Plain1>
+struct evaluator_assume_aliasing<CwiseBinaryOp<internal::scalar_product_op<Scalar1,Scalar2>,
+                                               const CwiseNullaryOp<internal::scalar_constant_op<Scalar1>, Plain1>,
+                                               const Product<Lhs, Rhs, DefaultProduct> > >
 {
-  typedef CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const Product<Lhs, Rhs, DefaultProduct> > XprType;
-  typedef evaluator<Product<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>,const Lhs>, Rhs, DefaultProduct> > Base;
-  
-  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr)
-    : Base(xpr.functor().m_other * xpr.nestedExpression().lhs() * xpr.nestedExpression().rhs())
+  static const bool value = true;
+};
+template<typename Lhs, typename Rhs, typename Scalar1, typename Scalar2, typename Plain1>
+struct evaluator<CwiseBinaryOp<internal::scalar_product_op<Scalar1,Scalar2>,
+                               const CwiseNullaryOp<internal::scalar_constant_op<Scalar1>, Plain1>,
+                               const Product<Lhs, Rhs, DefaultProduct> > >
+ : public evaluator<Product<EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar1,Lhs,product), Rhs, DefaultProduct> >
+{
+  typedef CwiseBinaryOp<internal::scalar_product_op<Scalar1,Scalar2>,
+                               const CwiseNullaryOp<internal::scalar_constant_op<Scalar1>, Plain1>,
+                               const Product<Lhs, Rhs, DefaultProduct> > XprType;
+  typedef evaluator<Product<EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar1,Lhs,product), Rhs, DefaultProduct> > Base;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& xpr)
+    : Base(xpr.lhs().functor().m_other * xpr.rhs().lhs() * xpr.rhs().rhs())
  {}
 };

@@ -57,7 +68,7 @@ struct evaluator<Diagonal<const Product<Lhs, Rhs, DefaultProduct>, DiagIndex> >
  typedef Diagonal<const Product<Lhs, Rhs, DefaultProduct>, DiagIndex> XprType;
  typedef evaluator<Diagonal<const Product<Lhs, Rhs, LazyProduct>, DiagIndex> > Base;
  
-  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& xpr)
    : Base(Diagonal<const Product<Lhs, Rhs, LazyProduct>, DiagIndex>(
        Product<Lhs, Rhs, LazyProduct>(xpr.nestedExpression().lhs(), xpr.nestedExpression().rhs()),
        xpr.index() ))
@@ -75,24 +86,14 @@ template< typename Lhs, typename Rhs,
 struct generic_product_impl;

 template<typename Lhs, typename Rhs>
-struct evaluator_traits<Product<Lhs, Rhs, DefaultProduct> > 
- : evaluator_traits_base<Product<Lhs, Rhs, DefaultProduct> >
-{
-  enum { AssumeAliasing = 1 };
-};
-
-template<typename Lhs, typename Rhs>
-struct evaluator_traits<Product<Lhs, Rhs, AliasFreeProduct> > 
- : evaluator_traits_base<Product<Lhs, Rhs, AliasFreeProduct> >
-{
-  enum { AssumeAliasing = 0 };
+struct evaluator_assume_aliasing<Product<Lhs, Rhs, DefaultProduct> > {
+  static const bool value = true;
 };

 // This is the default evaluator implementation for products:
 // It creates a temporary and call generic_product_impl
 template<typename Lhs, typename Rhs, int Options, int ProductTag, typename LhsShape, typename RhsShape>
-struct product_evaluator<Product<Lhs, Rhs, Options>, ProductTag, LhsShape, RhsShape, typename traits<Lhs>::Scalar, typename traits<Rhs>::Scalar,
-  EnableIf<(Options==DefaultProduct || Options==AliasFreeProduct)> >
+struct product_evaluator<Product<Lhs, Rhs, Options>, ProductTag, LhsShape, RhsShape>
  : public evaluator<typename Product<Lhs, Rhs, Options>::PlainObject>
 {
  typedef Product<Lhs, Rhs, Options> XprType;
@@ -102,7 +103,8 @@ struct product_evaluator<Product<Lhs, Rhs, Options>, ProductTag, LhsShape, RhsSh
    Flags = Base::Flags | EvalBeforeNestingBit
  };

-  EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit product_evaluator(const XprType& xpr)
    : m_result(xpr.rows(), xpr.cols())
  {
    ::new (static_cast<Base*>(this)) Base(m_result);
@@ -126,14 +128,22 @@ protected:
  PlainObject m_result;
 };

+// The following three shortcuts are enabled only if the scalar types match exactly.
+// TODO: we could enable them for different scalar types when the product is not vectorized.
+
 // Dense = Product
 template< typename DstXprType, typename Lhs, typename Rhs, int Options, typename Scalar>
-struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::assign_op<Scalar>, Dense2Dense,
-  typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct),Scalar>::type>
+struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::assign_op<Scalar,Scalar>, Dense2Dense,
+  typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type>
 {
  typedef Product<Lhs,Rhs,Options> SrcXprType;
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar> &)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &)
  {
+    Index dstRows = src.rows();
+    Index dstCols = src.cols();
+    if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
+      dst.resize(dstRows, dstCols);
    // FIXME shall we handle nested_eval here?
    generic_product_impl<Lhs, Rhs>::evalTo(dst, src.lhs(), src.rhs());
  }
@@ -141,12 +151,14 @@ struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::assign_op<Scal

 // Dense += Product
 template< typename DstXprType, typename Lhs, typename Rhs, int Options, typename Scalar>
-struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::add_assign_op<Scalar>, Dense2Dense,
-  typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct),Scalar>::type>
+struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::add_assign_op<Scalar,Scalar>, Dense2Dense,
+  typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type>
 {
  typedef Product<Lhs,Rhs,Options> SrcXprType;
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<Scalar> &)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<Scalar,Scalar> &)
  {
+    eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
    // FIXME shall we handle nested_eval here?
    generic_product_impl<Lhs, Rhs>::addTo(dst, src.lhs(), src.rhs());
  }
@@ -154,12 +166,14 @@ struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::add_assign_op<

 // Dense -= Product
 template< typename DstXprType, typename Lhs, typename Rhs, int Options, typename Scalar>
-struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::sub_assign_op<Scalar>, Dense2Dense,
-  typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct),Scalar>::type>
+struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::sub_assign_op<Scalar,Scalar>, Dense2Dense,
+  typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type>
 {
  typedef Product<Lhs,Rhs,Options> SrcXprType;
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<Scalar> &)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<Scalar,Scalar> &)
  {
+    eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
    // FIXME shall we handle nested_eval here?
    generic_product_impl<Lhs, Rhs>::subTo(dst, src.lhs(), src.rhs());
  }
@@ -169,37 +183,82 @@ struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::sub_assign_op<
 // Dense ?= scalar * Product
 // TODO we should apply that rule if that's really helpful
 // for instance, this is not good for inner products
-template< typename DstXprType, typename Lhs, typename Rhs, typename AssignFunc, typename Scalar, typename ScalarBis>
-struct Assignment<DstXprType, CwiseUnaryOp<internal::scalar_multiple_op<ScalarBis>,
-                                           const Product<Lhs,Rhs,DefaultProduct> >, AssignFunc, Dense2Dense, Scalar>
+template< typename DstXprType, typename Lhs, typename Rhs, typename AssignFunc, typename Scalar, typename ScalarBis, typename Plain>
+struct Assignment<DstXprType, CwiseBinaryOp<internal::scalar_product_op<ScalarBis,Scalar>, const CwiseNullaryOp<internal::scalar_constant_op<ScalarBis>,Plain>,
+                                           const Product<Lhs,Rhs,DefaultProduct> >, AssignFunc, Dense2Dense>
 {
-  typedef CwiseUnaryOp<internal::scalar_multiple_op<ScalarBis>,
-                       const Product<Lhs,Rhs,DefaultProduct> > SrcXprType;
-  static void run(DstXprType &dst, const SrcXprType &src, const AssignFunc& func)
+  typedef CwiseBinaryOp<internal::scalar_product_op<ScalarBis,Scalar>,
+                        const CwiseNullaryOp<internal::scalar_constant_op<ScalarBis>,Plain>,
+                        const Product<Lhs,Rhs,DefaultProduct> > SrcXprType;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  void run(DstXprType &dst, const SrcXprType &src, const AssignFunc& func)
  {
-    // TODO use operator* instead of prod() once we have made enough progress
-    call_assignment(dst.noalias(), prod(src.functor().m_other * src.nestedExpression().lhs(), src.nestedExpression().rhs()), func);
+    call_assignment_no_alias(dst, (src.lhs().functor().m_other * src.rhs().lhs())*src.rhs().rhs(), func);
  }
 };

+//----------------------------------------
+// Catch "Dense ?= xpr + Product<>" expression to save one temporary
+// FIXME we could probably enable these rules for any product, i.e., not only Dense and DefaultProduct
+
+template<typename OtherXpr, typename Lhs, typename Rhs>
+struct evaluator_assume_aliasing<CwiseBinaryOp<internal::scalar_sum_op<typename OtherXpr::Scalar,typename Product<Lhs,Rhs,DefaultProduct>::Scalar>, const OtherXpr,
+                                               const Product<Lhs,Rhs,DefaultProduct> >, DenseShape > {
+  static const bool value = true;
+};
+
+template<typename OtherXpr, typename Lhs, typename Rhs>
+struct evaluator_assume_aliasing<CwiseBinaryOp<internal::scalar_difference_op<typename OtherXpr::Scalar,typename Product<Lhs,Rhs,DefaultProduct>::Scalar>, const OtherXpr,
+                                               const Product<Lhs,Rhs,DefaultProduct> >, DenseShape > {
+  static const bool value = true;
+};
+
+template<typename DstXprType, typename OtherXpr, typename ProductType, typename Func1, typename Func2>
+struct assignment_from_xpr_op_product
+{
+  template<typename SrcXprType, typename InitialFunc>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  void run(DstXprType &dst, const SrcXprType &src, const InitialFunc& /*func*/)
+  {
+    call_assignment_no_alias(dst, src.lhs(), Func1());
+    call_assignment_no_alias(dst, src.rhs(), Func2());
+  }
+};
+
+#define EIGEN_CATCH_ASSIGN_XPR_OP_PRODUCT(ASSIGN_OP,BINOP,ASSIGN_OP2) \
+  template< typename DstXprType, typename OtherXpr, typename Lhs, typename Rhs, typename DstScalar, typename SrcScalar, typename OtherScalar,typename ProdScalar> \
+  struct Assignment<DstXprType, CwiseBinaryOp<internal::BINOP<OtherScalar,ProdScalar>, const OtherXpr, \
+                                            const Product<Lhs,Rhs,DefaultProduct> >, internal::ASSIGN_OP<DstScalar,SrcScalar>, Dense2Dense> \
+    : assignment_from_xpr_op_product<DstXprType, OtherXpr, Product<Lhs,Rhs,DefaultProduct>, internal::ASSIGN_OP<DstScalar,OtherScalar>, internal::ASSIGN_OP2<DstScalar,ProdScalar> > \
+  {}
+
+EIGEN_CATCH_ASSIGN_XPR_OP_PRODUCT(assign_op,    scalar_sum_op,add_assign_op);
+EIGEN_CATCH_ASSIGN_XPR_OP_PRODUCT(add_assign_op,scalar_sum_op,add_assign_op);
+EIGEN_CATCH_ASSIGN_XPR_OP_PRODUCT(sub_assign_op,scalar_sum_op,sub_assign_op);
+
+EIGEN_CATCH_ASSIGN_XPR_OP_PRODUCT(assign_op,    scalar_difference_op,sub_assign_op);
+EIGEN_CATCH_ASSIGN_XPR_OP_PRODUCT(add_assign_op,scalar_difference_op,sub_assign_op);
+EIGEN_CATCH_ASSIGN_XPR_OP_PRODUCT(sub_assign_op,scalar_difference_op,add_assign_op);
+
+//----------------------------------------

 template<typename Lhs, typename Rhs>
 struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,InnerProduct>
 {
  template<typename Dst>
-  static inline void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
  {
    dst.coeffRef(0,0) = (lhs.transpose().cwiseProduct(rhs)).sum();
  }
  
  template<typename Dst>
-  static inline void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
  {
    dst.coeffRef(0,0) += (lhs.transpose().cwiseProduct(rhs)).sum();
  }
  
  template<typename Dst>
-  static void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
  { dst.coeffRef(0,0) -= (lhs.transpose().cwiseProduct(rhs)).sum(); }
 };

@@ -210,70 +269,70 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,InnerProduct>

 // Column major result
 template<typename Dst, typename Lhs, typename Rhs, typename Func>
-EIGEN_DONT_INLINE void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const false_type&)
+void EIGEN_DEVICE_FUNC outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const false_type&)
 {
  evaluator<Rhs> rhsEval(rhs);
-  // FIXME make sure lhs is sequentially stored
+  ei_declare_local_nested_eval(Lhs,lhs,Rhs::SizeAtCompileTime,actual_lhs);
+  // FIXME if cols is large enough, then it might be useful to make sure that lhs is sequentially stored
  // FIXME not very good if rhs is real and lhs complex while alpha is real too
-  // FIXME we should probably build an evaluator for dst
  const Index cols = dst.cols();
  for (Index j=0; j<cols; ++j)
-    func(dst.col(j), rhsEval.coeff(0,j) * lhs);
+    func(dst.col(j), rhsEval.coeff(Index(0),j) * actual_lhs);
 }

 // Row major result
 template<typename Dst, typename Lhs, typename Rhs, typename Func>
-EIGEN_DONT_INLINE void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const true_type&)
+void EIGEN_DEVICE_FUNC outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const true_type&)
 {
  evaluator<Lhs> lhsEval(lhs);
-  // FIXME make sure rhs is sequentially stored
+  ei_declare_local_nested_eval(Rhs,rhs,Lhs::SizeAtCompileTime,actual_rhs);
+  // FIXME if rows is large enough, then it might be useful to make sure that rhs is sequentially stored
  // FIXME not very good if lhs is real and rhs complex while alpha is real too
-  // FIXME we should probably build an evaluator for dst
  const Index rows = dst.rows();
  for (Index i=0; i<rows; ++i)
-    func(dst.row(i), lhsEval.coeff(i,0) * rhs);
+    func(dst.row(i), lhsEval.coeff(i,Index(0)) * actual_rhs);
 }

 template<typename Lhs, typename Rhs>
 struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,OuterProduct>
 {
-  template<typename T> struct IsRowMajor : internal::conditional<(int(T::Flags)&RowMajorBit), internal::true_type, internal::false_type>::type {};
+  template<typename T> struct is_row_major : internal::conditional<(int(T::Flags)&RowMajorBit), internal::true_type, internal::false_type>::type {};
  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
  
  // TODO it would be nice to be able to exploit our *_assign_op functors for that purpose
-  struct set  { template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived()  = src; } };
-  struct add  { template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() += src; } };
-  struct sub  { template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() -= src; } };
+  struct set  { template<typename Dst, typename Src> EIGEN_DEVICE_FUNC void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived()  = src; } };
+  struct add  { template<typename Dst, typename Src> EIGEN_DEVICE_FUNC void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() += src; } };
+  struct sub  { template<typename Dst, typename Src> EIGEN_DEVICE_FUNC void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() -= src; } };
  struct adds {
    Scalar m_scale;
    explicit adds(const Scalar& s) : m_scale(s) {}
-    template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const {
+    template<typename Dst, typename Src> void EIGEN_DEVICE_FUNC operator()(const Dst& dst, const Src& src) const {
      dst.const_cast_derived() += m_scale * src;
    }
  };
  
  template<typename Dst>
-  static inline void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
  {
-    internal::outer_product_selector_run(dst, lhs, rhs, set(), IsRowMajor<Dst>());
+    internal::outer_product_selector_run(dst, lhs, rhs, set(), is_row_major<Dst>());
  }
  
  template<typename Dst>
-  static inline void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
  {
-    internal::outer_product_selector_run(dst, lhs, rhs, add(), IsRowMajor<Dst>());
+    internal::outer_product_selector_run(dst, lhs, rhs, add(), is_row_major<Dst>());
  }
  
  template<typename Dst>
-  static inline void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
  {
-    internal::outer_product_selector_run(dst, lhs, rhs, sub(), IsRowMajor<Dst>());
+    internal::outer_product_selector_run(dst, lhs, rhs, sub(), is_row_major<Dst>());
  }
  
  template<typename Dst>
-  static inline void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
  {
-    internal::outer_product_selector_run(dst, lhs, rhs, adds(alpha), IsRowMajor<Dst>());
+    internal::outer_product_selector_run(dst, lhs, rhs, adds(alpha), is_row_major<Dst>());
  }
  
 };
@@ -286,19 +345,19 @@ struct generic_product_impl_base
  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
  
  template<typename Dst>
-  static void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
  { dst.setZero(); scaleAndAddTo(dst, lhs, rhs, Scalar(1)); }

  template<typename Dst>
-  static void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
  { scaleAndAddTo(dst,lhs, rhs, Scalar(1)); }

  template<typename Dst>
-  static void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
  { scaleAndAddTo(dst, lhs, rhs, Scalar(-1)); }
  
  template<typename Dst>
-  static void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
  { Derived::scaleAndAddTo(dst,lhs,rhs,alpha); }

 };
@@ -307,17 +366,21 @@ template<typename Lhs, typename Rhs>
 struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemvProduct>
  : generic_product_impl_base<Lhs,Rhs,generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemvProduct> >
 {
+  typedef typename nested_eval<Lhs,1>::type LhsNested;
+  typedef typename nested_eval<Rhs,1>::type RhsNested;
  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
  enum { Side = Lhs::IsVectorAtCompileTime ? OnTheLeft : OnTheRight };
-  typedef typename internal::conditional<int(Side)==OnTheRight,Lhs,Rhs>::type MatrixType;
+  typedef typename internal::remove_all<typename internal::conditional<int(Side)==OnTheRight,LhsNested,RhsNested>::type>::type MatrixType;

  template<typename Dest>
-  static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
  {
-    internal::gemv_dense_sense_selector<Side,
+    LhsNested actual_lhs(lhs);
+    RhsNested actual_rhs(rhs);
+    internal::gemv_dense_selector<Side,
                            (int(MatrixType::Flags)&RowMajorBit) ? RowMajor : ColMajor,
                            bool(internal::blas_traits<MatrixType>::HasUsableDirectAccess)
-                           >::run(lhs, rhs, dst, alpha);
+                           >::run(actual_lhs, actual_rhs, dst, alpha);
  }
 };

@@ -327,30 +390,79 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,CoeffBasedProductMode>
  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
  
  template<typename Dst>
-  static inline void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
  {
-    // TODO: use the following instead of calling call_assignment, same for the other methods
-    // dst = lazyprod(lhs,rhs);
-    call_assignment(dst, lazyprod(lhs,rhs), internal::assign_op<Scalar>());
+    // Same as: dst.noalias() = lhs.lazyProduct(rhs);
+    // but easier on the compiler side
+    call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::assign_op<typename Dst::Scalar,Scalar>());
+  }
+
+  template<typename Dst>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  {
+    // dst.noalias() += lhs.lazyProduct(rhs);
+    call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::add_assign_op<typename Dst::Scalar,Scalar>());
  }
  
  template<typename Dst>
-  static inline void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
  {
-    // dst += lazyprod(lhs,rhs);
-    call_assignment(dst, lazyprod(lhs,rhs), internal::add_assign_op<Scalar>());
+    // dst.noalias() -= lhs.lazyProduct(rhs);
+    call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::sub_assign_op<typename Dst::Scalar,Scalar>());
  }
-  
-  template<typename Dst>
-  static inline void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+
+  // This is a special evaluation path called from generic_product_impl<...,GemmProduct> in file GeneralMatrixMatrix.h
+  // This variant tries to extract scalar multiples from both the LHS and RHS and factor them out. For instance:
+  //   dst {,+,-}= (s1*A)*(B*s2)
+  // will be rewritten as:
+  //   dst {,+,-}= (s1*s2) * (A.lazyProduct(B))
+  // There are at least four benefits of doing so:
+  //  1 - huge performance gain for heap-allocated matrix types as it saves costly allocations.
+  //  2 - it is faster than simply by-passing the heap allocation through stack allocation.
+  //  3 - it makes this fallback consistent with the heavy GEMM routine.
+  //  4 - it fully by-passes huge stack allocation attempts when multiplying huge fixed-size matrices.
+  //      (see https://stackoverflow.com/questions/54738495)
+  // For small fixed-size matrices, however, the gains are less obvious: it is sometimes x2 faster, but sometimes x3 slower,
+  // and the behavior also depends a lot on the compiler... This is why this rewriting strategy is currently
+  // enabled only when falling back from the main GEMM.
+  template<typename Dst, typename Func>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  void eval_dynamic(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Func &func)
  {
-    // dst -= lazyprod(lhs,rhs);
-    call_assignment(dst, lazyprod(lhs,rhs), internal::sub_assign_op<Scalar>());
+    enum {
+      HasScalarFactor = blas_traits<Lhs>::HasScalarFactor || blas_traits<Rhs>::HasScalarFactor,
+      ConjLhs = blas_traits<Lhs>::NeedToConjugate,
+      ConjRhs = blas_traits<Rhs>::NeedToConjugate
+    };
+    // FIXME: in c++11 this should be auto, and extractScalarFactor should also return auto
+    //        this is important for real*complex_mat
+    Scalar actualAlpha =    blas_traits<Lhs>::extractScalarFactor(lhs)
+                          * blas_traits<Rhs>::extractScalarFactor(rhs);
+    eval_dynamic_impl(dst,
+                      blas_traits<Lhs>::extract(lhs).template conjugateIf<ConjLhs>(),
+                      blas_traits<Rhs>::extract(rhs).template conjugateIf<ConjRhs>(),
+                      func,
+                      actualAlpha,
+                      typename conditional<HasScalarFactor,true_type,false_type>::type());
+  }
+
+protected:
+
+  template<typename Dst, typename LhsT, typename RhsT, typename Func, typename Scalar>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  void eval_dynamic_impl(Dst& dst, const LhsT& lhs, const RhsT& rhs, const Func &func, const Scalar&  s /* == 1 */, false_type)
+  {
+    EIGEN_UNUSED_VARIABLE(s);
+    eigen_internal_assert(s==Scalar(1));
+    call_restricted_packet_assignment_no_alias(dst, lhs.lazyProduct(rhs), func);
+  }
+
+  template<typename Dst, typename LhsT, typename RhsT, typename Func, typename Scalar>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  void eval_dynamic_impl(Dst& dst, const LhsT& lhs, const RhsT& rhs, const Func &func, const Scalar& s, true_type)
+  {
+    call_restricted_packet_assignment_no_alias(dst, s * lhs.lazyProduct(rhs), func);
  }
-  
-//   template<typename Dst>
-//   static inline void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
-//   { dst += alpha * lazyprod(lhs,rhs); }
 };

 // This specialization enforces the use of a coefficient-based evaluation strategy
@@ -371,23 +483,38 @@ template<int StorageOrder, int UnrollingIndex, typename Lhs, typename Rhs, typen
 struct etor_product_packet_impl;

 template<typename Lhs, typename Rhs, int ProductTag>
-struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape, DenseShape, typename Lhs::Scalar, typename Rhs::Scalar > 
+struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape, DenseShape>
    : evaluator_base<Product<Lhs, Rhs, LazyProduct> >
 {
  typedef Product<Lhs, Rhs, LazyProduct> XprType;
  typedef typename XprType::Scalar Scalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename XprType::PacketScalar PacketScalar;
-  typedef typename XprType::PacketReturnType PacketReturnType;

-  EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit product_evaluator(const XprType& xpr)
    : m_lhs(xpr.lhs()),
      m_rhs(xpr.rhs()),
      m_lhsImpl(m_lhs),     // FIXME the creation of the evaluator objects should result in a no-op, but check that!
      m_rhsImpl(m_rhs),     //       Moreover, they are only useful for the packet path, so we could completely disable them when not needed,
                            //       or perhaps declare them on the fly on the packet method... We have experiment to check what's best.
      m_innerDim(xpr.lhs().cols())
-  { }
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(NumTraits<Scalar>::MulCost);
+    EIGEN_INTERNAL_CHECK_COST_VALUE(NumTraits<Scalar>::AddCost);
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+#if 0
+    std::cerr << "LhsOuterStrideBytes=  " << LhsOuterStrideBytes << "\n";
+    std::cerr << "RhsOuterStrideBytes=  " << RhsOuterStrideBytes << "\n";
+    std::cerr << "LhsAlignment=         " << LhsAlignment << "\n";
+    std::cerr << "RhsAlignment=         " << RhsAlignment << "\n";
+    std::cerr << "CanVectorizeLhs=      " << CanVectorizeLhs << "\n";
+    std::cerr << "CanVectorizeRhs=      " << CanVectorizeRhs << "\n";
+    std::cerr << "CanVectorizeInner=    " << CanVectorizeInner << "\n";
+    std::cerr << "EvalToRowMajor=       " << EvalToRowMajor << "\n";
+    std::cerr << "Alignment=            " << Alignment << "\n";
+    std::cerr << "Flags=                " << Flags << "\n";
+#endif
+  }

  // Everything below here is taken from CoeffBasedProduct.h

@@ -399,74 +526,78 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,

  typedef evaluator<LhsNestedCleaned> LhsEtorType;
  typedef evaluator<RhsNestedCleaned> RhsEtorType;
-  
+
  enum {
    RowsAtCompileTime = LhsNestedCleaned::RowsAtCompileTime,
    ColsAtCompileTime = RhsNestedCleaned::ColsAtCompileTime,
    InnerSize = EIGEN_SIZE_MIN_PREFER_FIXED(LhsNestedCleaned::ColsAtCompileTime, RhsNestedCleaned::RowsAtCompileTime),
    MaxRowsAtCompileTime = LhsNestedCleaned::MaxRowsAtCompileTime,
-    MaxColsAtCompileTime = RhsNestedCleaned::MaxColsAtCompileTime,
+    MaxColsAtCompileTime = RhsNestedCleaned::MaxColsAtCompileTime
+  };
+
+  typedef typename find_best_packet<Scalar,RowsAtCompileTime>::type LhsVecPacketType;
+  typedef typename find_best_packet<Scalar,ColsAtCompileTime>::type RhsVecPacketType;
+
+  enum {
      
-    PacketSize = packet_traits<Scalar>::size,
-    
    LhsCoeffReadCost = LhsEtorType::CoeffReadCost,
    RhsCoeffReadCost = RhsEtorType::CoeffReadCost,
    CoeffReadCost = InnerSize==0 ? NumTraits<Scalar>::ReadCost
-                  : (InnerSize == Dynamic || LhsCoeffReadCost==Dynamic || RhsCoeffReadCost==Dynamic || NumTraits<Scalar>::AddCost==Dynamic || NumTraits<Scalar>::MulCost==Dynamic) ? Dynamic
+                  : InnerSize == Dynamic ? HugeCost
                  : InnerSize * (NumTraits<Scalar>::MulCost + LhsCoeffReadCost + RhsCoeffReadCost)
                    + (InnerSize - 1) * NumTraits<Scalar>::AddCost,

-    Unroll = CoeffReadCost != Dynamic && CoeffReadCost <= EIGEN_UNROLLING_LIMIT,
+    Unroll = CoeffReadCost <= EIGEN_UNROLLING_LIMIT,
    
    LhsFlags = LhsEtorType::Flags,
    RhsFlags = RhsEtorType::Flags,
    
-    LhsAlignment = LhsEtorType::Alignment,
-    RhsAlignment = RhsEtorType::Alignment,
-    
-    LhsIsAligned = int(LhsAlignment) >= int(unpacket_traits<PacketScalar>::alignment),
-    RhsIsAligned = int(RhsAlignment) >= int(unpacket_traits<PacketScalar>::alignment),
-    
    LhsRowMajor = LhsFlags & RowMajorBit,
    RhsRowMajor = RhsFlags & RowMajorBit,
+
+    LhsVecPacketSize = unpacket_traits<LhsVecPacketType>::size,
+    RhsVecPacketSize = unpacket_traits<RhsVecPacketType>::size,
+
+    // Here, we don't care about alignment larger than the usable packet size.
+    LhsAlignment = EIGEN_PLAIN_ENUM_MIN(LhsEtorType::Alignment,LhsVecPacketSize*int(sizeof(typename LhsNestedCleaned::Scalar))),
+    RhsAlignment = EIGEN_PLAIN_ENUM_MIN(RhsEtorType::Alignment,RhsVecPacketSize*int(sizeof(typename RhsNestedCleaned::Scalar))),
      
    SameType = is_same<typename LhsNestedCleaned::Scalar,typename RhsNestedCleaned::Scalar>::value,

-    CanVectorizeRhs = RhsRowMajor && (RhsFlags & PacketAccessBit)
-                    && (ColsAtCompileTime == Dynamic || ( (ColsAtCompileTime % PacketSize) == 0 && RhsIsAligned ) ),
-
-    CanVectorizeLhs = (!LhsRowMajor) && (LhsFlags & PacketAccessBit)
-                    && (RowsAtCompileTime == Dynamic || ( (RowsAtCompileTime % PacketSize) == 0 && LhsIsAligned ) ),
+    CanVectorizeRhs = bool(RhsRowMajor) && (RhsFlags & PacketAccessBit) && (ColsAtCompileTime!=1),
+    CanVectorizeLhs = (!LhsRowMajor) && (LhsFlags & PacketAccessBit) && (RowsAtCompileTime!=1),

    EvalToRowMajor = (MaxRowsAtCompileTime==1&&MaxColsAtCompileTime!=1) ? 1
                    : (MaxColsAtCompileTime==1&&MaxRowsAtCompileTime!=1) ? 0
-                    : (RhsRowMajor && !CanVectorizeLhs),
+                    : (bool(RhsRowMajor) && !CanVectorizeLhs),

    Flags = ((unsigned int)(LhsFlags | RhsFlags) & HereditaryBits & ~RowMajorBit)
          | (EvalToRowMajor ? RowMajorBit : 0)
          // TODO enable vectorization for mixed types
-          | (SameType && (CanVectorizeLhs || CanVectorizeRhs) ? PacketAccessBit : 0),
+          | (SameType && (CanVectorizeLhs || CanVectorizeRhs) ? PacketAccessBit : 0)
+          | (XprType::IsVectorAtCompileTime ? LinearAccessBit : 0),
          
-    Alignment = CanVectorizeLhs ? LhsAlignment
-              : CanVectorizeRhs ? RhsAlignment
+    LhsOuterStrideBytes = int(LhsNestedCleaned::OuterStrideAtCompileTime) * int(sizeof(typename LhsNestedCleaned::Scalar)),
+    RhsOuterStrideBytes = int(RhsNestedCleaned::OuterStrideAtCompileTime) * int(sizeof(typename RhsNestedCleaned::Scalar)),
+
+    Alignment = bool(CanVectorizeLhs) ? (LhsOuterStrideBytes<=0 || (int(LhsOuterStrideBytes) % EIGEN_PLAIN_ENUM_MAX(1,LhsAlignment))!=0 ? 0 : LhsAlignment)
+              : bool(CanVectorizeRhs) ? (RhsOuterStrideBytes<=0 || (int(RhsOuterStrideBytes) % EIGEN_PLAIN_ENUM_MAX(1,RhsAlignment))!=0 ? 0 : RhsAlignment)
              : 0,
-          
+
    /* CanVectorizeInner deserves special explanation. It does not affect the product flags. It is not used outside
-    * of Product. If the Product itself is not a packet-access expression, there is still a chance that the inner
-    * loop of the product might be vectorized. This is the meaning of CanVectorizeInner. Since it doesn't affect
-    * the Flags, it is safe to make this value depend on ActualPacketAccessBit, that doesn't affect the ABI.
-    */
+     * of Product. If the Product itself is not a packet-access expression, there is still a chance that the inner
+     * loop of the product might be vectorized. This is the meaning of CanVectorizeInner. Since it doesn't affect
+     * the Flags, it is safe to make this value depend on ActualPacketAccessBit, that doesn't affect the ABI.
+     */
    CanVectorizeInner =    SameType
                        && LhsRowMajor
                        && (!RhsRowMajor)
                        && (LhsFlags & RhsFlags & ActualPacketAccessBit)
-                        && (LhsIsAligned && RhsIsAligned)
                        && (InnerSize % packet_traits<Scalar>::size == 0)
  };
  
-  EIGEN_DEVICE_FUNC const CoeffReturnType coeff(Index row, Index col) const
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CoeffReturnType coeff(Index row, Index col) const
  {
-    // TODO check performance regression wrt to Eigen 3.2 which has special handling of this function
    return (m_lhs.row(row).transpose().cwiseProduct( m_rhs.col(col) )).sum();
  }

@@ -474,29 +605,38 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
   * which is why we don't set the LinearAccessBit.
   * TODO: this seems possible when the result is a vector
   */
-  EIGEN_DEVICE_FUNC const CoeffReturnType coeff(Index index) const
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  const CoeffReturnType coeff(Index index) const // single-index access: `index` is mapped to a (row,col) pair below
  {
-    const Index row = RowsAtCompileTime == 1 ? 0 : index;
-    const Index col = RowsAtCompileTime == 1 ? index : 0;
-    // TODO check performance regression wrt to Eigen 3.2 which has special handling of this function
+    const Index row = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime==1) ? 0 : index; // at most one row at compile time: row stays 0, otherwise index selects the row
+    const Index col = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime==1) ? index : 0; // at most one row at compile time: index selects the column, otherwise column stays 0
    return (m_lhs.row(row).transpose().cwiseProduct( m_rhs.col(col) )).sum();
  }

  template<int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  const PacketType packet(Index row, Index col) const
  {
    PacketType res;
-    typedef etor_product_packet_impl<Flags&RowMajorBit ? RowMajor : ColMajor,
-                                     Unroll ? InnerSize : Dynamic,
+    typedef etor_product_packet_impl<bool(int(Flags)&RowMajorBit) ? RowMajor : ColMajor, // pick the RowMajor or ColMajor implementation from this evaluator's Flags
+                                     Unroll ? int(InnerSize) : Dynamic, // compile-time unrolling over InnerSize when Unroll is set, otherwise the Dynamic (runtime loop) variant
                                     LhsEtorType, RhsEtorType, PacketType, LoadMode> PacketImpl;
-
    PacketImpl::run(row, col, m_lhsImpl, m_rhsImpl, m_innerDim, res);
    return res;
  }

+  template<int LoadMode, typename PacketType> // single-index packet access: maps `index` to (row,col) and forwards to the two-index overload
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  const PacketType packet(Index index) const
+  {
+    const Index row = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime==1) ? 0 : index; // at most one row at compile time: row stays 0, otherwise index selects the row
+    const Index col = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime==1) ? index : 0; // at most one row at compile time: index selects the column, otherwise column stays 0
+    return packet<LoadMode,PacketType>(row,col); // same LoadMode and PacketType, now addressed by (row,col)
+  }
+
 protected:
-  const LhsNested m_lhs;
-  const RhsNested m_rhs;
+  typename internal::add_const_on_value_type<LhsNested>::type m_lhs;
+  typename internal::add_const_on_value_type<RhsNested>::type m_rhs;
  
  LhsEtorType m_lhsImpl;
  RhsEtorType m_rhsImpl;
@@ -506,16 +646,17 @@ protected:
 };

 template<typename Lhs, typename Rhs>
-struct product_evaluator<Product<Lhs, Rhs, DefaultProduct>, LazyCoeffBasedProductMode, DenseShape, DenseShape, typename traits<Lhs>::Scalar, typename traits<Rhs>::Scalar > 
-  : product_evaluator<Product<Lhs, Rhs, LazyProduct>, CoeffBasedProductMode, DenseShape, DenseShape, typename traits<Lhs>::Scalar, typename traits<Rhs>::Scalar >
+struct product_evaluator<Product<Lhs, Rhs, DefaultProduct>, LazyCoeffBasedProductMode, DenseShape, DenseShape> // DefaultProduct in LazyCoeffBasedProductMode: implemented by inheriting the LazyProduct / CoeffBasedProductMode evaluator
+  : product_evaluator<Product<Lhs, Rhs, LazyProduct>, CoeffBasedProductMode, DenseShape, DenseShape>
 {
  typedef Product<Lhs, Rhs, DefaultProduct> XprType;
  typedef Product<Lhs, Rhs, LazyProduct> BaseProduct;
-  typedef product_evaluator<BaseProduct, CoeffBasedProductMode, DenseShape, DenseShape, typename Lhs::Scalar, typename Rhs::Scalar > Base;
+  typedef product_evaluator<BaseProduct, CoeffBasedProductMode, DenseShape, DenseShape> Base; // must name the same specialization as the base class listed above
  enum {
    Flags = Base::Flags | EvalBeforeNestingBit
  };
-  EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit product_evaluator(const XprType& xpr) // rebuilds the product as a LazyProduct over the same lhs/rhs and hands it to Base
    : Base(BaseProduct(xpr.lhs(),xpr.rhs()))
  {}
 };
@@ -530,7 +671,7 @@ struct etor_product_packet_impl<RowMajor, UnrollingIndex, Lhs, Rhs, Packet, Load
  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet &res)
  {
    etor_product_packet_impl<RowMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, innerDim, res);
-    res =  pmadd(pset1<Packet>(lhs.coeff(row, UnrollingIndex-1)), rhs.template packet<LoadMode,Packet>(UnrollingIndex-1, col), res);
+    res =  pmadd(pset1<Packet>(lhs.coeff(row, Index(UnrollingIndex-1))), rhs.template packet<LoadMode,Packet>(Index(UnrollingIndex-1), col), res); // folds inner term UnrollingIndex-1 into the partial result the recursive call above left in res; indices are passed explicitly as Index
  }
 };

@@ -540,7 +681,7 @@ struct etor_product_packet_impl<ColMajor, UnrollingIndex, Lhs, Rhs, Packet, Load
  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet &res)
  {
    etor_product_packet_impl<ColMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, innerDim, res);
-    res =  pmadd(lhs.template packet<LoadMode,Packet>(row, UnrollingIndex-1), pset1<Packet>(rhs.coeff(UnrollingIndex-1, col)), res);
+    res =  pmadd(lhs.template packet<LoadMode,Packet>(row, Index(UnrollingIndex-1)), pset1<Packet>(rhs.coeff(Index(UnrollingIndex-1), col)), res); // folds inner term UnrollingIndex-1 into the partial result the recursive call above left in res; here the packet is read from lhs and the scalar from rhs
  }
 };

@@ -549,7 +690,7 @@ struct etor_product_packet_impl<RowMajor, 1, Lhs, Rhs, Packet, LoadMode>
 {
  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, Packet &res)
  {
-    res = pmul(pset1<Packet>(lhs.coeff(row, 0)),rhs.template packet<LoadMode,Packet>(0, col));
+    res = pmul(pset1<Packet>(lhs.coeff(row, Index(0))),rhs.template packet<LoadMode,Packet>(Index(0), col)); // single inner term: res is assigned the product directly, nothing is accumulated and innerDim is ignored
  }
 };

@@ -558,7 +699,7 @@ struct etor_product_packet_impl<ColMajor, 1, Lhs, Rhs, Packet, LoadMode>
 {
  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, Packet &res)
  {
-    res = pmul(lhs.template packet<LoadMode,Packet>(row, 0), pset1<Packet>(rhs.coeff(0, col)));
+    res = pmul(lhs.template packet<LoadMode,Packet>(row, Index(0)), pset1<Packet>(rhs.coeff(Index(0), col))); // single inner term: res is assigned the product directly, nothing is accumulated and innerDim is ignored
  }
 };

@@ -567,7 +708,7 @@ struct etor_product_packet_impl<RowMajor, 0, Lhs, Rhs, Packet, LoadMode>
 {
  static EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, const Rhs& /*rhs*/, Index /*innerDim*/, Packet &res)
  {
-    res = pset1<Packet>(0);
+    res = pset1<Packet>(typename unpacket_traits<Packet>::type(0)); // no inner term: res is built from a zero explicitly converted to the packet's scalar type; all other arguments are unused
  }
 };

@@ -576,7 +717,7 @@ struct etor_product_packet_impl<ColMajor, 0, Lhs, Rhs, Packet, LoadMode>
 {
  static EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, const Rhs& /*rhs*/, Index /*innerDim*/, Packet &res)
  {
-    res = pset1<Packet>(0);
+    res = pset1<Packet>(typename unpacket_traits<Packet>::type(0)); // no inner term: res is built from a zero explicitly converted to the packet's scalar type; all other arguments are unused
  }
 };

@@ -585,7 +726,7 @@ struct etor_product_packet_impl<RowMajor, Dynamic, Lhs, Rhs, Packet, LoadMode>
 {
  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet& res)
  {
-    res = pset1<Packet>(0);
+    res = pset1<Packet>(typename unpacket_traits<Packet>::type(0)); // accumulator starts from a zero of the packet's scalar type; the loop below adds innerDim terms
    for(Index i = 0; i < innerDim; ++i)
      res =  pmadd(pset1<Packet>(lhs.coeff(row, i)), rhs.template packet<LoadMode,Packet>(i, col), res);
  }
@@ -596,7 +737,7 @@ struct etor_product_packet_impl<ColMajor, Dynamic, Lhs, Rhs, Packet, LoadMode>
 {
  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet& res)
  {
-    res = pset1<Packet>(0);
+    res = pset1<Packet>(typename unpacket_traits<Packet>::type(0)); // accumulator starts from a zero of the packet's scalar type; the loop below adds innerDim terms
    for(Index i = 0; i < innerDim; ++i)
      res =  pmadd(lhs.template packet<LoadMode,Packet>(row, i), pset1<Packet>(rhs.coeff(i, col)), res);
  }
@@ -653,7 +794,8 @@ struct generic_product_impl<Lhs,Rhs,SelfAdjointShape,DenseShape,ProductTag>
  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
  
  template<typename Dest>
-  static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
+  static EIGEN_DEVICE_FUNC
+  void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) // forwards dst, lhs.nestedExpression(), rhs and alpha to selfadjoint_product_impl::run
  {
    selfadjoint_product_impl<typename Lhs::MatrixType,Lhs::Mode,false,Rhs,0,Rhs::IsVectorAtCompileTime>::run(dst, lhs.nestedExpression(), rhs, alpha);
  }
@@ -681,33 +823,50 @@ template<typename MatrixType, typename DiagonalType, typename Derived, int Produ
 struct diagonal_product_evaluator_base
  : evaluator_base<Derived>
 {
-   typedef typename scalar_product_traits<typename MatrixType::Scalar, typename DiagonalType::Scalar>::ReturnType Scalar;
+   typedef typename ScalarBinaryOpTraits<typename MatrixType::Scalar, typename DiagonalType::Scalar>::ReturnType Scalar;
 public:
  enum {
    CoeffReadCost = NumTraits<Scalar>::MulCost + evaluator<MatrixType>::CoeffReadCost + evaluator<DiagonalType>::CoeffReadCost,
    
    MatrixFlags = evaluator<MatrixType>::Flags,
    DiagFlags = evaluator<DiagonalType>::Flags,
-    _StorageOrder = MatrixFlags & RowMajorBit ? RowMajor : ColMajor,
+    
+    _StorageOrder = (Derived::MaxRowsAtCompileTime==1 && Derived::MaxColsAtCompileTime!=1) ? RowMajor
+                  : (Derived::MaxColsAtCompileTime==1 && Derived::MaxRowsAtCompileTime!=1) ? ColMajor
+                  : MatrixFlags & RowMajorBit ? RowMajor : ColMajor,
+    _SameStorageOrder = _StorageOrder == (MatrixFlags & RowMajorBit ? RowMajor : ColMajor),
+
    _ScalarAccessOnDiag =  !((int(_StorageOrder) == ColMajor && int(ProductOrder) == OnTheLeft)
                           ||(int(_StorageOrder) == RowMajor && int(ProductOrder) == OnTheRight)),
    _SameTypes = is_same<typename MatrixType::Scalar, typename DiagonalType::Scalar>::value,
    // FIXME currently we need same types, but in the future the next rule should be the one
    //_Vectorizable = bool(int(MatrixFlags)&PacketAccessBit) && ((!_PacketOnDiag) || (_SameTypes && bool(int(DiagFlags)&PacketAccessBit))),
-    _Vectorizable = bool(int(MatrixFlags)&PacketAccessBit) && _SameTypes && (_ScalarAccessOnDiag || (bool(int(DiagFlags)&PacketAccessBit))),
+    _Vectorizable =   bool(int(MatrixFlags)&PacketAccessBit)
+                  &&  _SameTypes
+                  && (_SameStorageOrder || (MatrixFlags&LinearAccessBit)==LinearAccessBit)
+                  && (_ScalarAccessOnDiag || (bool(int(DiagFlags)&PacketAccessBit))),
    _LinearAccessMask = (MatrixType::RowsAtCompileTime==1 || MatrixType::ColsAtCompileTime==1) ? LinearAccessBit : 0,
    Flags = ((HereditaryBits|_LinearAccessMask) & (unsigned int)(MatrixFlags)) | (_Vectorizable ? PacketAccessBit : 0),
-    Alignment = evaluator<MatrixType>::Alignment
+    Alignment = evaluator<MatrixType>::Alignment,
+
+    AsScalarProduct =     (DiagonalType::SizeAtCompileTime==1)
+                      ||  (DiagonalType::SizeAtCompileTime==Dynamic && MatrixType::RowsAtCompileTime==1 && ProductOrder==OnTheLeft)
+                      ||  (DiagonalType::SizeAtCompileTime==Dynamic && MatrixType::ColsAtCompileTime==1 && ProductOrder==OnTheRight)
  };
  
  diagonal_product_evaluator_base(const MatrixType &mat, const DiagonalType &diag)
    : m_diagImpl(diag), m_matImpl(mat)
  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(NumTraits<Scalar>::MulCost);
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
  }
  
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index idx) const
  {
-    return m_diagImpl.coeff(idx) * m_matImpl.coeff(idx);
+    if(AsScalarProduct) // compile-time enum: diagonal of size 1, or dynamic-size diagonal against a single-row (OnTheLeft) / single-column (OnTheRight) matrix
+      return m_diagImpl.coeff(0) * m_matImpl.coeff(idx); // only diagonal entry 0 is read; idx addresses the matrix alone
+    else
+      return m_diagImpl.coeff(idx) * m_matImpl.coeff(idx); // diagonal and matrix are read at the same index
  }
  
 protected:
@@ -735,7 +894,7 @@ protected:

 // diagonal * dense
 template<typename Lhs, typename Rhs, int ProductKind, int ProductTag>
-struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DiagonalShape, DenseShape, typename Lhs::Scalar, typename Rhs::Scalar> 
+struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DiagonalShape, DenseShape>
  : diagonal_product_evaluator_base<Rhs, typename Lhs::DiagonalVectorType, Product<Lhs, Rhs, LazyProduct>, OnTheLeft>
 {
  typedef diagonal_product_evaluator_base<Rhs, typename Lhs::DiagonalVectorType, Product<Lhs, Rhs, LazyProduct>, OnTheLeft> Base;
@@ -746,10 +905,10 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DiagonalSha
  
  typedef Product<Lhs, Rhs, ProductKind> XprType;
  typedef typename XprType::PlainObject PlainObject;
+  typedef typename Lhs::DiagonalVectorType DiagonalType;
+
  
-  enum {
-    StorageOrder = int(Rhs::Flags) & RowMajorBit ? RowMajor : ColMajor
-  };
+  enum { StorageOrder = Base::_StorageOrder };

  EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr)
    : Base(xpr.rhs(), xpr.lhs().diagonal())
@@ -761,7 +920,7 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DiagonalSha
    return m_diagImpl.coeff(row) * m_matImpl.coeff(row, col);
  }
  
-#ifndef __CUDACC__
+#ifndef EIGEN_GPUCC
  template<int LoadMode,typename PacketType>
  EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const
  {
@@ -781,7 +940,7 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DiagonalSha

 // dense * diagonal
 template<typename Lhs, typename Rhs, int ProductKind, int ProductTag>
-struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DenseShape, DiagonalShape, typename Lhs::Scalar, typename Rhs::Scalar> 
+struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DenseShape, DiagonalShape>
  : diagonal_product_evaluator_base<Lhs, typename Rhs::DiagonalVectorType, Product<Lhs, Rhs, LazyProduct>, OnTheRight>
 {
  typedef diagonal_product_evaluator_base<Lhs, typename Rhs::DiagonalVectorType, Product<Lhs, Rhs, LazyProduct>, OnTheRight> Base;
@@ -793,7 +952,7 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DenseShape,
  typedef Product<Lhs, Rhs, ProductKind> XprType;
  typedef typename XprType::PlainObject PlainObject;
  
-  enum { StorageOrder = int(Lhs::Flags) & RowMajorBit ? RowMajor : ColMajor };
+  enum { StorageOrder = Base::_StorageOrder };

  EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr)
    : Base(xpr.lhs(), xpr.rhs().diagonal())
@@ -805,7 +964,7 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DenseShape,
    return m_matImpl.coeff(row, col) * m_diagImpl.coeff(col);
  }
  
-#ifndef __CUDACC__
+#ifndef EIGEN_GPUCC
  template<int LoadMode,typename PacketType>
  EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const
  {
@@ -911,20 +1070,20 @@ struct generic_product_impl<Lhs, Rhs, MatrixShape, PermutationShape, ProductTag>
 };

 template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
-struct generic_product_impl<Transpose<Lhs>, Rhs, PermutationShape, MatrixShape, ProductTag>
+struct generic_product_impl<Inverse<Lhs>, Rhs, PermutationShape, MatrixShape, ProductTag> // specialization for an Inverse<> wrapper with PermutationShape on the left-hand side
 {
  template<typename Dest>
-  static void evalTo(Dest& dst, const Transpose<Lhs>& lhs, const Rhs& rhs)
+  static void evalTo(Dest& dst, const Inverse<Lhs>& lhs, const Rhs& rhs) // unwraps the Inverse and passes its nested expression to permutation_matrix_product (OnTheLeft, third argument true)
  {
    permutation_matrix_product<Rhs, OnTheLeft, true, MatrixShape>::run(dst, lhs.nestedExpression(), rhs);
  }
 };

 template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
-struct generic_product_impl<Lhs, Transpose<Rhs>, MatrixShape, PermutationShape, ProductTag>
+struct generic_product_impl<Lhs, Inverse<Rhs>, MatrixShape, PermutationShape, ProductTag>
 {
  template<typename Dest>
-  static void evalTo(Dest& dst, const Lhs& lhs, const Transpose<Rhs>& rhs)
+  static void evalTo(Dest& dst, const Lhs& lhs, const Inverse<Rhs>& rhs)
  {
    permutation_matrix_product<Lhs, OnTheRight, true, MatrixShape>::run(dst, rhs.nestedExpression(), lhs);
  }
@@ -955,7 +1114,7 @@ struct transposition_matrix_product
    const Index size = tr.size();
    StorageIndex j = 0;

-    if(!(is_same<MatrixTypeCleaned,Dest>::value && extract_data(dst) == extract_data(mat)))
+    if(!is_same_dense(dst,mat))
      dst = mat;

    for(Index k=(Transposed?size-1:0) ; Transposed?k>=0:k<size ; Transposed?--k:++k)
--- a/Eigen/src/Core/Random.h
+++ b/Eigen/src/Core/Random.h
@@ -16,8 +16,7 @@ namespace internal {

 template<typename Scalar> struct scalar_random_op {
  EIGEN_EMPTY_STRUCT_CTOR(scalar_random_op)
-  template<typename Index>
-  inline const Scalar operator() (Index, Index = 0) const { return random<Scalar>(); }
+  inline const Scalar operator() () const { return random<Scalar>(); } // takes no arguments: each call returns a fresh random<Scalar>()
 };

 template<typename Scalar>
@@ -129,7 +128,7 @@ DenseBase<Derived>::Random()
  * \sa class CwiseNullaryOp, setRandom(Index), setRandom(Index,Index)
  */
 template<typename Derived>
-inline Derived& DenseBase<Derived>::setRandom()
+EIGEN_DEVICE_FUNC inline Derived& DenseBase<Derived>::setRandom() // assigns Random(rows(), cols()) to *this, keeping the current size, and returns *this
 {
  return *this = Random(rows(), cols());
 }
--- a/Eigen/src/Core/Redux.h
+++ b/Eigen/src/Core/Redux.h
@@ -23,22 +23,29 @@ namespace internal {
 * Part 1 : the logic deciding a strategy for vectorization and unrolling
 ***************************************************************************/

-template<typename Func, typename Derived>
+template<typename Func, typename Evaluator>
 struct redux_traits
 {
 public:
+    typedef typename find_best_packet<typename Evaluator::Scalar,Evaluator::SizeAtCompileTime>::type PacketType;
  enum {
-    PacketSize = packet_traits<typename Derived::Scalar>::size,
-    InnerMaxSize = int(Derived::IsRowMajor)
-                 ? Derived::MaxColsAtCompileTime
-                 : Derived::MaxRowsAtCompileTime
+    PacketSize = unpacket_traits<PacketType>::size,
+    InnerMaxSize = int(Evaluator::IsRowMajor)
+                 ? Evaluator::MaxColsAtCompileTime
+                 : Evaluator::MaxRowsAtCompileTime,
+    OuterMaxSize = int(Evaluator::IsRowMajor)
+                 ? Evaluator::MaxRowsAtCompileTime
+                 : Evaluator::MaxColsAtCompileTime,
+    SliceVectorizedWork = int(InnerMaxSize)==Dynamic ? Dynamic
+                        : int(OuterMaxSize)==Dynamic ? (int(InnerMaxSize)>=int(PacketSize) ? Dynamic : 0)
+                        : (int(InnerMaxSize)/int(PacketSize)) * int(OuterMaxSize)
  };

  enum {
-    MightVectorize = (int(Derived::Flags)&ActualPacketAccessBit)
+    MightVectorize = (int(Evaluator::Flags)&ActualPacketAccessBit)
                  && (functor_traits<Func>::PacketAccess),
-    MayLinearVectorize = MightVectorize && (int(Derived::Flags)&LinearAccessBit),
-    MaySliceVectorize  = MightVectorize && int(InnerMaxSize)>=3*PacketSize
+    MayLinearVectorize = bool(MightVectorize) && (int(Evaluator::Flags)&LinearAccessBit),
+    MaySliceVectorize  = bool(MightVectorize) && (int(SliceVectorizedWork)==Dynamic || int(SliceVectorizedWork)>=3)
  };

 public:
@@ -50,37 +57,33 @@ public:

 public:
  enum {
-    Cost = (  Derived::SizeAtCompileTime == Dynamic
-           || Derived::CoeffReadCost == Dynamic
-           || (Derived::SizeAtCompileTime!=1 && functor_traits<Func>::Cost == Dynamic)
-           ) ? Dynamic
-           : Derived::SizeAtCompileTime * Derived::CoeffReadCost
-               + (Derived::SizeAtCompileTime-1) * functor_traits<Func>::Cost,
+    Cost = Evaluator::SizeAtCompileTime == Dynamic ? HugeCost
+         : Evaluator::SizeAtCompileTime * Evaluator::CoeffReadCost + (Evaluator::SizeAtCompileTime-1) * functor_traits<Func>::Cost,
    UnrollingLimit = EIGEN_UNROLLING_LIMIT * (int(Traversal) == int(DefaultTraversal) ? 1 : int(PacketSize))
  };

 public:
  enum {
-    Unrolling = Cost != Dynamic && Cost <= UnrollingLimit
-              ? CompleteUnrolling
-              : NoUnrolling
+    Unrolling = Cost <= UnrollingLimit ? CompleteUnrolling : NoUnrolling
  };
  
 #ifdef EIGEN_DEBUG_ASSIGN
  static void debug()
  {
-    std::cerr << "Xpr: " << typeid(typename Derived::XprType).name() << std::endl;
+    std::cerr << "Xpr: " << typeid(typename Evaluator::XprType).name() << std::endl; // typeid name of the evaluator's expression type
    std::cerr.setf(std::ios::hex, std::ios::basefield);
-    EIGEN_DEBUG_VAR(Derived::Flags)
+    EIGEN_DEBUG_VAR(Evaluator::Flags) // emitted while std::cerr is in hex mode (set just above, unset just below)
    std::cerr.unsetf(std::ios::hex);
    EIGEN_DEBUG_VAR(InnerMaxSize)
+    EIGEN_DEBUG_VAR(OuterMaxSize)
+    EIGEN_DEBUG_VAR(SliceVectorizedWork)
    EIGEN_DEBUG_VAR(PacketSize)
    EIGEN_DEBUG_VAR(MightVectorize)
    EIGEN_DEBUG_VAR(MayLinearVectorize)
    EIGEN_DEBUG_VAR(MaySliceVectorize)
-    EIGEN_DEBUG_VAR(Traversal)
+    std::cerr << "Traversal" << " = " << Traversal << " (" << demangle_traversal(Traversal) << ")" << std::endl; // raw enum value followed by whatever demangle_traversal() returns for it
    EIGEN_DEBUG_VAR(UnrollingLimit)
-    EIGEN_DEBUG_VAR(Unrolling)
+    std::cerr << "Unrolling" << " = " << Unrolling << " (" << demangle_unrolling(Unrolling) << ")" << std::endl; // raw enum value followed by whatever demangle_unrolling() returns for it
    std::cerr << std::endl;
  }
 #endif
@@ -92,88 +95,86 @@ public:

 /*** no vectorization ***/

-template<typename Func, typename Derived, int Start, int Length>
+template<typename Func, typename Evaluator, int Start, int Length>
 struct redux_novec_unroller
 {
  enum {
    HalfLength = Length/2
  };

-  typedef typename Derived::Scalar Scalar;
+  typedef typename Evaluator::Scalar Scalar;

  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func)
+  static EIGEN_STRONG_INLINE Scalar run(const Evaluator &eval, const Func& func)
  {
-    return func(redux_novec_unroller<Func, Derived, Start, HalfLength>::run(mat,func),
-                redux_novec_unroller<Func, Derived, Start+HalfLength, Length-HalfLength>::run(mat,func));
+    return func(redux_novec_unroller<Func, Evaluator, Start, HalfLength>::run(eval,func),
+                redux_novec_unroller<Func, Evaluator, Start+HalfLength, Length-HalfLength>::run(eval,func));
  }
 };

-template<typename Func, typename Derived, int Start>
-struct redux_novec_unroller<Func, Derived, Start, 1>
+template<typename Func, typename Evaluator, int Start>
+struct redux_novec_unroller<Func, Evaluator, Start, 1>
 {
  enum {
-    outer = Start / Derived::InnerSizeAtCompileTime,
-    inner = Start % Derived::InnerSizeAtCompileTime
+    outer = Start / Evaluator::InnerSizeAtCompileTime,
+    inner = Start % Evaluator::InnerSizeAtCompileTime
  };

-  typedef typename Derived::Scalar Scalar;
+  typedef typename Evaluator::Scalar Scalar;

  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func&)
+  static EIGEN_STRONG_INLINE Scalar run(const Evaluator &eval, const Func&)
  {
-    return mat.coeffByOuterInner(outer, inner);
+    return eval.coeffByOuterInner(outer, inner);
  }
 };

 // This is actually dead code and will never be called. It is required
 // to prevent false warnings regarding failed inlining though
 // for 0 length run() will never be called at all.
-template<typename Func, typename Derived, int Start>
-struct redux_novec_unroller<Func, Derived, Start, 0>
+template<typename Func, typename Evaluator, int Start>
+struct redux_novec_unroller<Func, Evaluator, Start, 0>
 {
-  typedef typename Derived::Scalar Scalar;
+  typedef typename Evaluator::Scalar Scalar;
  EIGEN_DEVICE_FUNC 
-  static EIGEN_STRONG_INLINE Scalar run(const Derived&, const Func&) { return Scalar(); }
+  static EIGEN_STRONG_INLINE Scalar run(const Evaluator&, const Func&) { return Scalar(); }
 };

 /*** vectorization ***/

-template<typename Func, typename Derived, int Start, int Length>
+template<typename Func, typename Evaluator, int Start, int Length> // combines Length packets beginning at packet position Start by splitting the range in two and recursing
 struct redux_vec_unroller
 {
-  enum {
-    PacketSize = packet_traits<typename Derived::Scalar>::size,
-    HalfLength = Length/2
-  };
-
-  typedef typename Derived::Scalar Scalar;
-  typedef typename packet_traits<Scalar>::type PacketScalar;
-
-  static EIGEN_STRONG_INLINE PacketScalar run(const Derived &mat, const Func& func)
+  template<typename PacketType>
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE PacketType run(const Evaluator &eval, const Func& func) // the packet type is chosen by the caller through the template argument
  {
+    enum {
+      PacketSize = unpacket_traits<PacketType>::size,
+      HalfLength = Length/2 // length of the first sub-range; the second gets Length-HalfLength
+    };
+
    return func.packetOp(
-            redux_vec_unroller<Func, Derived, Start, HalfLength>::run(mat,func),
-            redux_vec_unroller<Func, Derived, Start+HalfLength, Length-HalfLength>::run(mat,func) );
+            redux_vec_unroller<Func, Evaluator, Start, HalfLength>::template run<PacketType>(eval,func), // first sub-range: HalfLength packets from Start
+            redux_vec_unroller<Func, Evaluator, Start+HalfLength, Length-HalfLength>::template run<PacketType>(eval,func) ); // second sub-range: the remaining packets
  }
 };

-template<typename Func, typename Derived, int Start>
-struct redux_vec_unroller<Func, Derived, Start, 1>
+template<typename Func, typename Evaluator, int Start>
+struct redux_vec_unroller<Func, Evaluator, Start, 1> // recursion leaf: a range of exactly one packet
 {
-  enum {
-    index = Start * packet_traits<typename Derived::Scalar>::size,
-    outer = index / int(Derived::InnerSizeAtCompileTime),
-    inner = index % int(Derived::InnerSizeAtCompileTime),
-    alignment = Derived::Alignment
-  };
-
-  typedef typename Derived::Scalar Scalar;
-  typedef typename packet_traits<Scalar>::type PacketScalar;
-
-  static EIGEN_STRONG_INLINE PacketScalar run(const Derived &mat, const Func&)
+  template<typename PacketType>
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE PacketType run(const Evaluator &eval, const Func&) // the functor is unused here: the leaf only loads a packet
  {
-    return mat.template packetByOuterInner<alignment,PacketScalar>(outer, inner);
+    enum {
+      PacketSize = unpacket_traits<PacketType>::size,
+      index = Start * PacketSize, // coefficient offset of packet number Start
+      outer = index / int(Evaluator::InnerSizeAtCompileTime), // offset split into an outer position ...
+      inner = index % int(Evaluator::InnerSizeAtCompileTime), // ... and an inner position, using the compile-time inner size
+      alignment = Evaluator::Alignment
+    };
+    return eval.template packetByOuterInner<alignment,PacketType>(outer, inner); // load the packet at (outer, inner) with the evaluator's alignment
  }
 };

@@ -181,53 +182,65 @@ struct redux_vec_unroller<Func, Derived, Start, 1>
 * Part 3 : implementation of all cases
 ***************************************************************************/

-template<typename Func, typename Derived,
-         int Traversal = redux_traits<Func, Derived>::Traversal,
-         int Unrolling = redux_traits<Func, Derived>::Unrolling
+template<typename Func, typename Evaluator,
+         int Traversal = redux_traits<Func, Evaluator>::Traversal,
+         int Unrolling = redux_traits<Func, Evaluator>::Unrolling
 >
 struct redux_impl;

-template<typename Func, typename Derived>
-struct redux_impl<Func, Derived, DefaultTraversal, NoUnrolling>
+template<typename Func, typename Evaluator>
+struct redux_impl<Func, Evaluator, DefaultTraversal, NoUnrolling> // coefficient-by-coefficient reduction with runtime loops
 {
-  typedef typename Derived::Scalar Scalar;
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func)
+  typedef typename Evaluator::Scalar Scalar;
+
+  template<typename XprType>
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE
+  Scalar run(const Evaluator &eval, const Func& func, const XprType& xpr) // coefficients are read from eval; sizes are read from xpr
  {
-    eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix");
+    eigen_assert(xpr.rows()>0 && xpr.cols()>0 && "you are using an empty matrix"); // coefficient (0,0) is read unconditionally below
    Scalar res;
-    res = mat.coeffByOuterInner(0, 0);
-    for(Index i = 1; i < mat.innerSize(); ++i)
-      res = func(res, mat.coeffByOuterInner(0, i));
-    for(Index i = 1; i < mat.outerSize(); ++i)
-      for(Index j = 0; j < mat.innerSize(); ++j)
-        res = func(res, mat.coeffByOuterInner(i, j));
+    res = eval.coeffByOuterInner(0, 0); // seed the accumulator with the first coefficient
+    for(Index i = 1; i < xpr.innerSize(); ++i) // rest of outer index 0, starting at inner index 1
+      res = func(res, eval.coeffByOuterInner(0, i));
+    for(Index i = 1; i < xpr.outerSize(); ++i) // remaining outer indices, each traversed in full
+      for(Index j = 0; j < xpr.innerSize(); ++j)
+        res = func(res, eval.coeffByOuterInner(i, j));
    return res;
  }
 };

-template<typename Func, typename Derived>
-struct redux_impl<Func,Derived, DefaultTraversal, CompleteUnrolling>
-  : public redux_novec_unroller<Func,Derived, 0, Derived::SizeAtCompileTime>
-{};
-
-template<typename Func, typename Derived>
-struct redux_impl<Func, Derived, LinearVectorizedTraversal, NoUnrolling>
+template<typename Func, typename Evaluator>
+struct redux_impl<Func,Evaluator, DefaultTraversal, CompleteUnrolling> // scalar reduction delegated to redux_novec_unroller over SizeAtCompileTime coefficients
+  : redux_novec_unroller<Func,Evaluator, 0, Evaluator::SizeAtCompileTime>
 {
-  typedef typename Derived::Scalar Scalar;
-  typedef typename packet_traits<Scalar>::type PacketScalar;
-
-  static Scalar run(const Derived &mat, const Func& func)
+  typedef redux_novec_unroller<Func,Evaluator, 0, Evaluator::SizeAtCompileTime> Base;
+  typedef typename Evaluator::Scalar Scalar;
+  template<typename XprType>
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE
+  Scalar run(const Evaluator &eval, const Func& func, const XprType& /*xpr*/) // three-argument form like the other redux_impl variants; the expression argument is unused
  {
-    const Index size = mat.size();
+    return Base::run(eval,func); // forward to the two-argument unroller entry point
+  }
+};
+
+template<typename Func, typename Evaluator>
+struct redux_impl<Func, Evaluator, LinearVectorizedTraversal, NoUnrolling>
+{
+  typedef typename Evaluator::Scalar Scalar;
+  typedef typename redux_traits<Func, Evaluator>::PacketType PacketScalar;
+
+  template<typename XprType>
+  static Scalar run(const Evaluator &eval, const Func& func, const XprType& xpr)
+  {
+    const Index size = xpr.size();
    
-    const Index packetSize = packet_traits<Scalar>::size;
+    const Index packetSize = redux_traits<Func, Evaluator>::PacketSize;
    const int packetAlignment = unpacket_traits<PacketScalar>::alignment;
    enum {
-      alignment0 = (bool(Derived::Flags & DirectAccessBit) && bool(packet_traits<Scalar>::AlignedOnScalar)) ? int(packetAlignment) : int(Unaligned),
-      alignment = EIGEN_PLAIN_ENUM_MAX(alignment0, Derived::Alignment)
+      alignment0 = (bool(Evaluator::Flags & DirectAccessBit) && bool(packet_traits<Scalar>::AlignedOnScalar)) ? int(packetAlignment) : int(Unaligned),
+      alignment = EIGEN_PLAIN_ENUM_MAX(alignment0, Evaluator::Alignment)
    };
-    const Index alignedStart = internal::first_default_aligned(mat.nestedExpression());
+    const Index alignedStart = internal::first_default_aligned(xpr);
    const Index alignedSize2 = ((size-alignedStart)/(2*packetSize))*(2*packetSize);
    const Index alignedSize = ((size-alignedStart)/(packetSize))*(packetSize);
    const Index alignedEnd2 = alignedStart + alignedSize2;
@@ -235,163 +248,143 @@ struct redux_impl<Func, Derived, LinearVectorizedTraversal, NoUnrolling>
    Scalar res;
    if(alignedSize)
    {
-      PacketScalar packet_res0 = mat.template packet<alignment,PacketScalar>(alignedStart);
+      PacketScalar packet_res0 = eval.template packet<alignment,PacketScalar>(alignedStart);
      if(alignedSize>packetSize) // we have at least two packets to partly unroll the loop
      {
-        PacketScalar packet_res1 = mat.template packet<alignment,PacketScalar>(alignedStart+packetSize);
+        PacketScalar packet_res1 = eval.template packet<alignment,PacketScalar>(alignedStart+packetSize);
        for(Index index = alignedStart + 2*packetSize; index < alignedEnd2; index += 2*packetSize)
        {
-          packet_res0 = func.packetOp(packet_res0, mat.template packet<alignment,PacketScalar>(index));
-          packet_res1 = func.packetOp(packet_res1, mat.template packet<alignment,PacketScalar>(index+packetSize));
+          packet_res0 = func.packetOp(packet_res0, eval.template packet<alignment,PacketScalar>(index));
+          packet_res1 = func.packetOp(packet_res1, eval.template packet<alignment,PacketScalar>(index+packetSize));
        }

        packet_res0 = func.packetOp(packet_res0,packet_res1);
        if(alignedEnd>alignedEnd2)
-          packet_res0 = func.packetOp(packet_res0, mat.template packet<alignment,PacketScalar>(alignedEnd2));
+          packet_res0 = func.packetOp(packet_res0, eval.template packet<alignment,PacketScalar>(alignedEnd2));
      }
      res = func.predux(packet_res0);

      for(Index index = 0; index < alignedStart; ++index)
-        res = func(res,mat.coeff(index));
+        res = func(res,eval.coeff(index));

      for(Index index = alignedEnd; index < size; ++index)
-        res = func(res,mat.coeff(index));
+        res = func(res,eval.coeff(index));
    }
    else // too small to vectorize anything.
         // since this is dynamic-size hence inefficient anyway for such small sizes, don't try to optimize.
    {
-      res = mat.coeff(0);
+      res = eval.coeff(0);
      for(Index index = 1; index < size; ++index)
-        res = func(res,mat.coeff(index));
+        res = func(res,eval.coeff(index));
    }

    return res;
  }
 };

-template<typename Func, typename Derived>
-struct redux_impl<Func, Derived, SliceVectorizedTraversal, NoUnrolling>
+// NOTE: for SliceVectorizedTraversal we simply bypass unrolling
+template<typename Func, typename Evaluator, int Unrolling>
+struct redux_impl<Func, Evaluator, SliceVectorizedTraversal, Unrolling>
 {
-  typedef typename Derived::Scalar Scalar;
-  typedef typename packet_traits<Scalar>::type PacketType;
+  typedef typename Evaluator::Scalar Scalar;
+  typedef typename redux_traits<Func, Evaluator>::PacketType PacketType;

-  EIGEN_DEVICE_FUNC static Scalar run(const Derived &mat, const Func& func)
+  template<typename XprType>
+  EIGEN_DEVICE_FUNC static Scalar run(const Evaluator &eval, const Func& func, const XprType& xpr)
  {
-    eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix");
-    const Index innerSize = mat.innerSize();
-    const Index outerSize = mat.outerSize();
+    eigen_assert(xpr.rows()>0 && xpr.cols()>0 && "you are using an empty matrix");
+    const Index innerSize = xpr.innerSize();
+    const Index outerSize = xpr.outerSize();
    enum {
-      packetSize = packet_traits<Scalar>::size
+      packetSize = redux_traits<Func, Evaluator>::PacketSize
    };
    const Index packetedInnerSize = ((innerSize)/packetSize)*packetSize;
    Scalar res;
    if(packetedInnerSize)
    {
-      PacketType packet_res = mat.template packet<Unaligned,PacketType>(0,0);
+      PacketType packet_res = eval.template packet<Unaligned,PacketType>(0,0);
      for(Index j=0; j<outerSize; ++j)
        for(Index i=(j==0?packetSize:0); i<packetedInnerSize; i+=Index(packetSize))
-          packet_res = func.packetOp(packet_res, mat.template packetByOuterInner<Unaligned,PacketType>(j,i));
+          packet_res = func.packetOp(packet_res, eval.template packetByOuterInner<Unaligned,PacketType>(j,i));

      res = func.predux(packet_res);
      for(Index j=0; j<outerSize; ++j)
        for(Index i=packetedInnerSize; i<innerSize; ++i)
-          res = func(res, mat.coeffByOuterInner(j,i));
+          res = func(res, eval.coeffByOuterInner(j,i));
    }
    else // too small to vectorize anything.
         // since this is dynamic-size hence inefficient anyway for such small sizes, don't try to optimize.
    {
-      res = redux_impl<Func, Derived, DefaultTraversal, NoUnrolling>::run(mat, func);
+      res = redux_impl<Func, Evaluator, DefaultTraversal, NoUnrolling>::run(eval, func, xpr);
    }

    return res;
  }
 };

-template<typename Func, typename Derived>
-struct redux_impl<Func, Derived, LinearVectorizedTraversal, CompleteUnrolling>
+template<typename Func, typename Evaluator>
+struct redux_impl<Func, Evaluator, LinearVectorizedTraversal, CompleteUnrolling>
 {
-  typedef typename Derived::Scalar Scalar;
-  typedef typename packet_traits<Scalar>::type PacketScalar;
+  typedef typename Evaluator::Scalar Scalar;
+
+  typedef typename redux_traits<Func, Evaluator>::PacketType PacketType;
  enum {
-    PacketSize = packet_traits<Scalar>::size,
-    Size = Derived::SizeAtCompileTime,
+    PacketSize = redux_traits<Func, Evaluator>::PacketSize,
+    Size = Evaluator::SizeAtCompileTime,
    VectorizedSize = (Size / PacketSize) * PacketSize
  };
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func)
+
+  template<typename XprType>
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE
+  Scalar run(const Evaluator &eval, const Func& func, const XprType &xpr)
  {
-    eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix");
+    EIGEN_ONLY_USED_FOR_DEBUG(xpr)
+    eigen_assert(xpr.rows()>0 && xpr.cols()>0 && "you are using an empty matrix");
    if (VectorizedSize > 0) {
-      Scalar res = func.predux(redux_vec_unroller<Func, Derived, 0, Size / PacketSize>::run(mat,func));
+      Scalar res = func.predux(redux_vec_unroller<Func, Evaluator, 0, Size / PacketSize>::template run<PacketType>(eval,func));
      if (VectorizedSize != Size)
-        res = func(res,redux_novec_unroller<Func, Derived, VectorizedSize, Size-VectorizedSize>::run(mat,func));
+        res = func(res,redux_novec_unroller<Func, Evaluator, VectorizedSize, Size-VectorizedSize>::run(eval,func));
      return res;
    }
    else {
-      return redux_novec_unroller<Func, Derived, 0, Size>::run(mat,func);
+      return redux_novec_unroller<Func, Evaluator, 0, Size>::run(eval,func);
    }
  }
 };

 // evaluator adaptor
 template<typename _XprType>
-class redux_evaluator
+class redux_evaluator : public internal::evaluator<_XprType>
 {
+  typedef internal::evaluator<_XprType> Base;
 public:
  typedef _XprType XprType;
-  EIGEN_DEVICE_FUNC explicit redux_evaluator(const XprType &xpr) : m_evaluator(xpr), m_xpr(xpr) {}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit redux_evaluator(const XprType &xpr) : Base(xpr) {}
  
  typedef typename XprType::Scalar Scalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename XprType::PacketScalar PacketScalar;
-  typedef typename XprType::PacketReturnType PacketReturnType;
  
  enum {
    MaxRowsAtCompileTime = XprType::MaxRowsAtCompileTime,
    MaxColsAtCompileTime = XprType::MaxColsAtCompileTime,
    // TODO we should not remove DirectAccessBit and rather find an elegant way to query the alignment offset at runtime from the evaluator
-    Flags = evaluator<XprType>::Flags & ~DirectAccessBit,
+    Flags = Base::Flags & ~DirectAccessBit,
    IsRowMajor = XprType::IsRowMajor,
    SizeAtCompileTime = XprType::SizeAtCompileTime,
-    InnerSizeAtCompileTime = XprType::InnerSizeAtCompileTime,
-    CoeffReadCost = evaluator<XprType>::CoeffReadCost,
-    Alignment = evaluator<XprType>::Alignment
+    InnerSizeAtCompileTime = XprType::InnerSizeAtCompileTime
  };
  
-  EIGEN_DEVICE_FUNC Index rows() const { return m_xpr.rows(); }
-  EIGEN_DEVICE_FUNC Index cols() const { return m_xpr.cols(); }
-  EIGEN_DEVICE_FUNC Index size() const { return m_xpr.size(); }
-  EIGEN_DEVICE_FUNC Index innerSize() const { return m_xpr.innerSize(); }
-  EIGEN_DEVICE_FUNC Index outerSize() const { return m_xpr.outerSize(); }
-
-  EIGEN_DEVICE_FUNC
-  CoeffReturnType coeff(Index row, Index col) const
-  { return m_evaluator.coeff(row, col); }
-
-  EIGEN_DEVICE_FUNC
-  CoeffReturnType coeff(Index index) const
-  { return m_evaluator.coeff(index); }
-
-  template<int LoadMode, typename PacketType>
-  PacketReturnType packet(Index row, Index col) const
-  { return m_evaluator.template packet<LoadMode,PacketType>(row, col); }
-
-  template<int LoadMode, typename PacketType>
-  PacketReturnType packet(Index index) const
-  { return m_evaluator.template packet<LoadMode,PacketType>(index); }
-  
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  CoeffReturnType coeffByOuterInner(Index outer, Index inner) const
-  { return m_evaluator.coeff(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }
+  { return Base::coeff(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }
  
  template<int LoadMode, typename PacketType>
-  PacketReturnType packetByOuterInner(Index outer, Index inner) const
-  { return m_evaluator.template packet<LoadMode,PacketType>(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  PacketType packetByOuterInner(Index outer, Index inner) const
+  { return Base::template packet<LoadMode,PacketType>(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }
  
-  const XprType & nestedExpression() const { return m_xpr; }
-  
-protected:
-  internal::evaluator<XprType> m_evaluator;
-  const XprType &m_xpr;
 };

 } // end namespace internal
@@ -406,62 +399,60 @@ protected:
  * The template parameter \a BinaryOp is the type of the functor \a func which must be
  * an associative operator. Both current C++98 and C++11 functor styles are handled.
  *
+  * \warning the matrix must not be empty, otherwise an assertion is triggered.
+  *
  * \sa DenseBase::sum(), DenseBase::minCoeff(), DenseBase::maxCoeff(), MatrixBase::colwise(), MatrixBase::rowwise()
  */
 template<typename Derived>
 template<typename Func>
-typename internal::traits<Derived>::Scalar
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::redux(const Func& func) const
 {
  eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix");
-  
-  // FIXME, eval_nest should be handled by redux_evaluator, however:
-  //  - it is currently difficult to provide the right Flags since they are still handled by the expressions
-  //  - handling it here might reduce the number of template instantiations
-//   typedef typename internal::nested_eval<Derived,1>::type ThisNested;
-//   typedef typename internal::remove_all<ThisNested>::type ThisNestedCleaned;
-//   typedef typename internal::redux_evaluator<ThisNestedCleaned> ThisEvaluator;
-//   
-//   ThisNested thisNested(derived());
-//   ThisEvaluator thisEval(thisNested);
-  
+
  typedef typename internal::redux_evaluator<Derived> ThisEvaluator;
  ThisEvaluator thisEval(derived());
-  
-  return internal::redux_impl<Func, ThisEvaluator>::run(thisEval, func);
+
+  // The initial expression is passed to the reducer as an additional argument instead of
+  // passing it as a member of redux_evaluator.
+  return internal::redux_impl<Func, ThisEvaluator>::run(thisEval, func, derived());
 }

 /** \returns the minimum of all coefficients of \c *this.
+  * \warning the matrix must not be empty, otherwise an assertion is triggered.
  * \warning the result is undefined if \c *this contains NaN.
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::minCoeff() const
 {
-  return derived().redux(Eigen::internal::scalar_min_op<Scalar>());
+  return derived().redux(Eigen::internal::scalar_min_op<Scalar,Scalar>());
 }

 /** \returns the maximum of all coefficients of \c *this.
+  * \warning the matrix must not be empty, otherwise an assertion is triggered.
  * \warning the result is undefined if \c *this contains NaN.
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::maxCoeff() const
 {
-  return derived().redux(Eigen::internal::scalar_max_op<Scalar>());
+  return derived().redux(Eigen::internal::scalar_max_op<Scalar,Scalar>());
 }

-/** \returns the sum of all coefficients of *this
+/** \returns the sum of all coefficients of \c *this
+  *
+  * If \c *this is empty, then the value 0 is returned.
  *
  * \sa trace(), prod(), mean()
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::sum() const
 {
  if(SizeAtCompileTime==0 || (SizeAtCompileTime==Dynamic && size()==0))
    return Scalar(0);
-  return derived().redux(Eigen::internal::scalar_sum_op<Scalar>());
+  return derived().redux(Eigen::internal::scalar_sum_op<Scalar,Scalar>());
 }

 /** \returns the mean of all coefficients of *this
@@ -469,10 +460,17 @@ DenseBase<Derived>::sum() const
 * \sa trace(), prod(), sum()
 */
 template<typename Derived>
-EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::mean() const
 {
-  return Scalar(derived().redux(Eigen::internal::scalar_sum_op<Scalar>())) / Scalar(this->size());
+#ifdef __INTEL_COMPILER
+  #pragma warning push
+  #pragma warning ( disable : 2259 )
+#endif
+  return Scalar(derived().redux(Eigen::internal::scalar_sum_op<Scalar,Scalar>())) / Scalar(this->size());
+#ifdef __INTEL_COMPILER
+  #pragma warning pop
+#endif
 }

 /** \returns the product of all coefficients of *this
@@ -483,7 +481,7 @@ DenseBase<Derived>::mean() const
  * \sa sum(), mean(), trace()
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::prod() const
 {
  if(SizeAtCompileTime==0 || (SizeAtCompileTime==Dynamic && size()==0))
@@ -498,7 +496,7 @@ DenseBase<Derived>::prod() const
  * \sa diagonal(), sum()
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
 MatrixBase<Derived>::trace() const
 {
  return derived().diagonal().sum();
--- a/Eigen/src/Core/Ref.h
+++ b/Eigen/src/Core/Ref.h
@@ -12,14 +12,123 @@

 namespace Eigen { 

+namespace internal {
+
+template<typename _PlainObjectType, int _Options, typename _StrideType>
+struct traits<Ref<_PlainObjectType, _Options, _StrideType> >
+  : public traits<Map<_PlainObjectType, _Options, _StrideType> >
+{
+  typedef _PlainObjectType PlainObjectType;
+  typedef _StrideType StrideType;
+  enum {
+    Options = _Options,
+    Flags = traits<Map<_PlainObjectType, _Options, _StrideType> >::Flags | NestByRefBit,
+    Alignment = traits<Map<_PlainObjectType, _Options, _StrideType> >::Alignment
+  };
+
+  template<typename Derived> struct match {
+    enum {
+      IsVectorAtCompileTime = PlainObjectType::IsVectorAtCompileTime || Derived::IsVectorAtCompileTime,
+      HasDirectAccess = internal::has_direct_access<Derived>::ret,
+      StorageOrderMatch = IsVectorAtCompileTime || ((PlainObjectType::Flags&RowMajorBit)==(Derived::Flags&RowMajorBit)),
+      InnerStrideMatch = int(StrideType::InnerStrideAtCompileTime)==int(Dynamic)
+                      || int(StrideType::InnerStrideAtCompileTime)==int(Derived::InnerStrideAtCompileTime)
+                      || (int(StrideType::InnerStrideAtCompileTime)==0 && int(Derived::InnerStrideAtCompileTime)==1),
+      OuterStrideMatch = IsVectorAtCompileTime
+                      || int(StrideType::OuterStrideAtCompileTime)==int(Dynamic) || int(StrideType::OuterStrideAtCompileTime)==int(Derived::OuterStrideAtCompileTime),
+      // NOTE, this indirection of evaluator<Derived>::Alignment is needed
+      // to work around a very strange bug in MSVC related to the instantiation
+      // of has_*ary_operator in evaluator<CwiseNullaryOp>.
+      // This line is surprisingly very sensitive. For instance, simply adding parentheses
+      // as "DerivedAlignment = (int(evaluator<Derived>::Alignment))," will make MSVC fail...
+      DerivedAlignment = int(evaluator<Derived>::Alignment),
+      AlignmentMatch = (int(traits<PlainObjectType>::Alignment)==int(Unaligned)) || (DerivedAlignment >= int(Alignment)), // FIXME the first condition is not very clear, it should be replaced by the required alignment
+      ScalarTypeMatch = internal::is_same<typename PlainObjectType::Scalar, typename Derived::Scalar>::value,
+      MatchAtCompileTime = HasDirectAccess && StorageOrderMatch && InnerStrideMatch && OuterStrideMatch && AlignmentMatch && ScalarTypeMatch
+    };
+    typedef typename internal::conditional<MatchAtCompileTime,internal::true_type,internal::false_type>::type type;
+  };
+  
+};
+
+template<typename Derived>
+struct traits<RefBase<Derived> > : public traits<Derived> {};
+
+}
+
+template<typename Derived> class RefBase
+ : public MapBase<Derived>
+{
+  typedef typename internal::traits<Derived>::PlainObjectType PlainObjectType;
+  typedef typename internal::traits<Derived>::StrideType StrideType;
+
+public:
+
+  typedef MapBase<Derived> Base;
+  EIGEN_DENSE_PUBLIC_INTERFACE(RefBase)
+
+  EIGEN_DEVICE_FUNC inline Index innerStride() const
+  {
+    return StrideType::InnerStrideAtCompileTime != 0 ? m_stride.inner() : 1;
+  }
+
+  EIGEN_DEVICE_FUNC inline Index outerStride() const
+  {
+    return StrideType::OuterStrideAtCompileTime != 0 ? m_stride.outer()
+         : IsVectorAtCompileTime ? this->size()
+         : int(Flags)&RowMajorBit ? this->cols()
+         : this->rows();
+  }
+
+  EIGEN_DEVICE_FUNC RefBase()
+    : Base(0,RowsAtCompileTime==Dynamic?0:RowsAtCompileTime,ColsAtCompileTime==Dynamic?0:ColsAtCompileTime),
+      // Stride<> does not allow default ctor for Dynamic strides, so let's initialize it with dummy values:
+      m_stride(StrideType::OuterStrideAtCompileTime==Dynamic?0:StrideType::OuterStrideAtCompileTime,
+               StrideType::InnerStrideAtCompileTime==Dynamic?0:StrideType::InnerStrideAtCompileTime)
+  {}
+  
+  EIGEN_INHERIT_ASSIGNMENT_OPERATORS(RefBase)
+
+protected:
+
+  typedef Stride<StrideType::OuterStrideAtCompileTime,StrideType::InnerStrideAtCompileTime> StrideBase;
+
+  template<typename Expression>
+  EIGEN_DEVICE_FUNC void construct(Expression& expr)
+  {
+    EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(PlainObjectType,Expression);
+
+    if(PlainObjectType::RowsAtCompileTime==1)
+    {
+      eigen_assert(expr.rows()==1 || expr.cols()==1);
+      ::new (static_cast<Base*>(this)) Base(expr.data(), 1, expr.size());
+    }
+    else if(PlainObjectType::ColsAtCompileTime==1)
+    {
+      eigen_assert(expr.rows()==1 || expr.cols()==1);
+      ::new (static_cast<Base*>(this)) Base(expr.data(), expr.size(), 1);
+    }
+    else
+      ::new (static_cast<Base*>(this)) Base(expr.data(), expr.rows(), expr.cols());
+    
+    if(Expression::IsVectorAtCompileTime && (!PlainObjectType::IsVectorAtCompileTime) && ((Expression::Flags&RowMajorBit)!=(PlainObjectType::Flags&RowMajorBit)))
+      ::new (&m_stride) StrideBase(expr.innerStride(), StrideType::InnerStrideAtCompileTime==0?0:1);
+    else
+      ::new (&m_stride) StrideBase(StrideType::OuterStrideAtCompileTime==0?0:expr.outerStride(),
+                                   StrideType::InnerStrideAtCompileTime==0?0:expr.innerStride());    
+  }
+
+  StrideBase m_stride;
+};
+
 /** \class Ref
  * \ingroup Core_Module
  *
  * \brief A matrix or vector expression mapping an existing expression
  *
  * \tparam PlainObjectType the equivalent matrix type of the mapped data
-  * \tparam MapOptions specifies the pointer alignment in bytes. It can be: \c #Aligned128, , \c #Aligned64, \c #Aligned32, \c #Aligned16, \c #Aligned8 or \c #Unaligned.
-  *                The default is \c #Unaligned.
+  * \tparam Options specifies the pointer alignment in bytes. It can be: \c #Aligned128, \c #Aligned64, \c #Aligned32, \c #Aligned16, \c #Aligned8 or \c #Unaligned.
+  *                 The default is \c #Unaligned.
  * \tparam StrideType optionally specifies strides. By default, Ref implies a contiguous storage along the inner dimension (inner stride==1),
  *                   but accepts a variable outer stride (leading dimension).
  *                   This can be overridden by specifying strides.
@@ -78,111 +187,11 @@ namespace Eigen {
  * void foo(const Ref<MatrixXf,0,Stride<> >& A) { foo_impl(A); }
  * \endcode
  *
+  * See also the following stackoverflow questions for further references:
+  *  - <a href="http://stackoverflow.com/questions/21132538/correct-usage-of-the-eigenref-class">Correct usage of the Eigen::Ref<> class</a>
  *
  * \sa PlainObjectBase::Map(), \ref TopicStorageOrders
  */
-
-namespace internal {
-
-template<typename _PlainObjectType, int _Options, typename _StrideType>
-struct traits<Ref<_PlainObjectType, _Options, _StrideType> >
-  : public traits<Map<_PlainObjectType, _Options, _StrideType> >
-{
-  typedef _PlainObjectType PlainObjectType;
-  typedef _StrideType StrideType;
-  enum {
-    Options = _Options,
-    Flags = traits<Map<_PlainObjectType, _Options, _StrideType> >::Flags | NestByRefBit,
-    Alignment = traits<Map<_PlainObjectType, _Options, _StrideType> >::Alignment
-  };
-
-  template<typename Derived> struct match {
-    enum {
-      HasDirectAccess = internal::has_direct_access<Derived>::ret,
-      StorageOrderMatch = PlainObjectType::IsVectorAtCompileTime || Derived::IsVectorAtCompileTime || ((PlainObjectType::Flags&RowMajorBit)==(Derived::Flags&RowMajorBit)),
-      InnerStrideMatch = int(StrideType::InnerStrideAtCompileTime)==int(Dynamic)
-                      || int(StrideType::InnerStrideAtCompileTime)==int(Derived::InnerStrideAtCompileTime)
-                      || (int(StrideType::InnerStrideAtCompileTime)==0 && int(Derived::InnerStrideAtCompileTime)==1),
-      OuterStrideMatch = Derived::IsVectorAtCompileTime
-                      || int(StrideType::OuterStrideAtCompileTime)==int(Dynamic) || int(StrideType::OuterStrideAtCompileTime)==int(Derived::OuterStrideAtCompileTime),
-      AlignmentMatch = (int(traits<PlainObjectType>::Alignment)==int(Unaligned)) || (int(evaluator<Derived>::Alignment) >= int(Alignment)), // FIXME the first condition is not very clear, it should be replaced by the required alignment
-      ScalarTypeMatch = internal::is_same<typename PlainObjectType::Scalar, typename Derived::Scalar>::value,
-      MatchAtCompileTime = HasDirectAccess && StorageOrderMatch && InnerStrideMatch && OuterStrideMatch && AlignmentMatch && ScalarTypeMatch
-    };
-    typedef typename internal::conditional<MatchAtCompileTime,internal::true_type,internal::false_type>::type type;
-  };
-  
-};
-
-template<typename Derived>
-struct traits<RefBase<Derived> > : public traits<Derived> {};
-
-}
-
-template<typename Derived> class RefBase
- : public MapBase<Derived>
-{
-  typedef typename internal::traits<Derived>::PlainObjectType PlainObjectType;
-  typedef typename internal::traits<Derived>::StrideType StrideType;
-
-public:
-
-  typedef MapBase<Derived> Base;
-  EIGEN_DENSE_PUBLIC_INTERFACE(RefBase)
-
-  EIGEN_DEVICE_FUNC inline Index innerStride() const
-  {
-    return StrideType::InnerStrideAtCompileTime != 0 ? m_stride.inner() : 1;
-  }
-
-  EIGEN_DEVICE_FUNC inline Index outerStride() const
-  {
-    return StrideType::OuterStrideAtCompileTime != 0 ? m_stride.outer()
-         : IsVectorAtCompileTime ? this->size()
-         : int(Flags)&RowMajorBit ? this->cols()
-         : this->rows();
-  }
-
-  EIGEN_DEVICE_FUNC RefBase()
-    : Base(0,RowsAtCompileTime==Dynamic?0:RowsAtCompileTime,ColsAtCompileTime==Dynamic?0:ColsAtCompileTime),
-      // Stride<> does not allow default ctor for Dynamic strides, so let' initialize it with dummy values:
-      m_stride(StrideType::OuterStrideAtCompileTime==Dynamic?0:StrideType::OuterStrideAtCompileTime,
-               StrideType::InnerStrideAtCompileTime==Dynamic?0:StrideType::InnerStrideAtCompileTime)
-  {}
-  
-  EIGEN_INHERIT_ASSIGNMENT_OPERATORS(RefBase)
-
-protected:
-
-  typedef Stride<StrideType::OuterStrideAtCompileTime,StrideType::InnerStrideAtCompileTime> StrideBase;
-
-  template<typename Expression>
-  EIGEN_DEVICE_FUNC void construct(Expression& expr)
-  {
-    if(PlainObjectType::RowsAtCompileTime==1)
-    {
-      eigen_assert(expr.rows()==1 || expr.cols()==1);
-      ::new (static_cast<Base*>(this)) Base(expr.data(), 1, expr.size());
-    }
-    else if(PlainObjectType::ColsAtCompileTime==1)
-    {
-      eigen_assert(expr.rows()==1 || expr.cols()==1);
-      ::new (static_cast<Base*>(this)) Base(expr.data(), expr.size(), 1);
-    }
-    else
-      ::new (static_cast<Base*>(this)) Base(expr.data(), expr.rows(), expr.cols());
-    
-    if(Expression::IsVectorAtCompileTime && (!PlainObjectType::IsVectorAtCompileTime) && ((Expression::Flags&RowMajorBit)!=(PlainObjectType::Flags&RowMajorBit)))
-      ::new (&m_stride) StrideBase(expr.innerStride(), StrideType::InnerStrideAtCompileTime==0?0:1);
-    else
-      ::new (&m_stride) StrideBase(StrideType::OuterStrideAtCompileTime==0?0:expr.outerStride(),
-                                   StrideType::InnerStrideAtCompileTime==0?0:expr.innerStride());    
-  }
-
-  StrideBase m_stride;
-};
-
-
 template<typename PlainObjectType, int Options, typename StrideType> class Ref
  : public RefBase<Ref<PlainObjectType, Options, StrideType> >
 {
@@ -209,6 +218,7 @@ template<typename PlainObjectType, int Options, typename StrideType> class Ref
    EIGEN_DEVICE_FUNC inline Ref(const DenseBase<Derived>& expr,
                                 typename internal::enable_if<bool(Traits::template match<Derived>::MatchAtCompileTime),Derived>::type* = 0)
    #else
+    /** Implicit constructor from any dense expression */
    template<typename Derived>
    inline Ref(DenseBase<Derived>& expr)
    #endif
@@ -263,7 +273,7 @@ template<typename TPlainObjectType, int Options, typename StrideType> class Ref<
    template<typename Expression>
    EIGEN_DEVICE_FUNC void construct(const Expression& expr, internal::false_type)
    {
-      internal::call_assignment_no_alias(m_object,expr,internal::assign_op<Scalar>());
+      internal::call_assignment_no_alias(m_object,expr,internal::assign_op<Scalar,Scalar>());
      Base::construct(m_object);
    }

--- a/Eigen/src/Core/Replicate.h
+++ b/Eigen/src/Core/Replicate.h
@@ -12,21 +12,6 @@

 namespace Eigen { 

-/**
-  * \class Replicate
-  * \ingroup Core_Module
-  *
-  * \brief Expression of the multiple replication of a matrix or vector
-  *
-  * \param MatrixType the type of the object we are replicating
-  *
-  * This class represents an expression of the multiple replication of a matrix or vector.
-  * It is the return type of DenseBase::replicate() and most of the time
-  * this is the only way it is used.
-  *
-  * \sa DenseBase::replicate()
-  */
-
 namespace internal {
 template<typename MatrixType,int RowFactor,int ColFactor>
 struct traits<Replicate<MatrixType,RowFactor,ColFactor> >
@@ -57,6 +42,22 @@ struct traits<Replicate<MatrixType,RowFactor,ColFactor> >
 };
 }

+/**
+  * \class Replicate
+  * \ingroup Core_Module
+  *
+  * \brief Expression of the multiple replication of a matrix or vector
+  *
+  * \tparam MatrixType the type of the object we are replicating
+  * \tparam RowFactor number of repetitions at compile time along the vertical direction, can be Dynamic.
+  * \tparam ColFactor number of repetitions at compile time along the horizontal direction, can be Dynamic.
+  *
+  * This class represents an expression of the multiple replication of a matrix or vector.
+  * It is the return type of DenseBase::replicate() and most of the time
+  * this is the only way it is used.
+  *
+  * \sa DenseBase::replicate()
+  */
 template<typename MatrixType,int RowFactor,int ColFactor> class Replicate
  : public internal::dense_xpr_base< Replicate<MatrixType,RowFactor,ColFactor> >::type
 {
@@ -114,7 +115,7 @@ template<typename MatrixType,int RowFactor,int ColFactor> class Replicate
  */
 template<typename Derived>
 template<int RowFactor, int ColFactor>
-const Replicate<Derived,RowFactor,ColFactor>
+EIGEN_DEVICE_FUNC const Replicate<Derived,RowFactor,ColFactor>
 DenseBase<Derived>::replicate() const
 {
  return Replicate<Derived,RowFactor,ColFactor>(derived());
@@ -129,7 +130,7 @@ DenseBase<Derived>::replicate() const
  * \sa VectorwiseOp::replicate(), DenseBase::replicate(), class Replicate
  */
 template<typename ExpressionType, int Direction>
-const typename VectorwiseOp<ExpressionType,Direction>::ReplicateReturnType
+EIGEN_DEVICE_FUNC const typename VectorwiseOp<ExpressionType,Direction>::ReplicateReturnType
 VectorwiseOp<ExpressionType,Direction>::replicate(Index factor) const
 {
  return typename VectorwiseOp<ExpressionType,Direction>::ReplicateReturnType
--- a/Eigen/src/Core/Reshaped.h
+++ b/Eigen/src/Core/Reshaped.h
@@ -0,0 +1,453 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2017 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2014 yoco <peter.xiau@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_RESHAPED_H
+#define EIGEN_RESHAPED_H
+
+namespace Eigen {
+namespace internal {
+
+/** \class Reshaped
+  * \ingroup Core_Module
+  *
+  * \brief Expression of a fixed-size or dynamic-size reshape
+  *
+  * \tparam XprType the type of the expression in which we are taking a reshape
+  * \tparam Rows the number of rows of the reshape we are taking at compile time (optional)
+  * \tparam Cols the number of columns of the reshape we are taking at compile time (optional)
+  * \tparam Order can be ColMajor or RowMajor, default is ColMajor.
+  *
+  * This class represents an expression of either a fixed-size or dynamic-size reshape.
+  * It is the return type of DenseBase::reshaped(NRowsType,NColsType) and
+  * most of the time this is the only way it is used.
+  *
+  * However, in C++98, if you want to directly manipulate reshaped expressions,
+  * for instance if you want to write a function returning such an expression, you
+  * will need to use this class. In C++11, it is advised to use the \em auto
+  * keyword for such use cases.
+  *
+  * Here is an example illustrating the dynamic case:
+  * \include class_Reshaped.cpp
+  * Output: \verbinclude class_Reshaped.out
+  *
+  * Here is an example illustrating the fixed-size case:
+  * \include class_FixedReshaped.cpp
+  * Output: \verbinclude class_FixedReshaped.out
+  *
+  * \sa DenseBase::reshaped(NRowsType,NColsType)
+  */
+
+template<typename XprType, int Rows, int Cols, int Order>
+struct traits<Reshaped<XprType, Rows, Cols, Order> > : traits<XprType>
+{
+  typedef typename traits<XprType>::Scalar Scalar;
+  typedef typename traits<XprType>::StorageKind StorageKind;
+  typedef typename traits<XprType>::XprKind XprKind;
+  enum{
+    MatrixRows = traits<XprType>::RowsAtCompileTime,
+    MatrixCols = traits<XprType>::ColsAtCompileTime,
+    RowsAtCompileTime = Rows,
+    ColsAtCompileTime = Cols,
+    MaxRowsAtCompileTime = Rows,
+    MaxColsAtCompileTime = Cols,
+    XpxStorageOrder = ((int(traits<XprType>::Flags) & RowMajorBit) == RowMajorBit) ? RowMajor : ColMajor, // storage order of the nested expression
+    ReshapedStorageOrder = (RowsAtCompileTime == 1 && ColsAtCompileTime != 1) ? RowMajor
+                         : (ColsAtCompileTime == 1 && RowsAtCompileTime != 1) ? ColMajor
+                         : XpxStorageOrder, // 1 x N is RowMajor, N x 1 is ColMajor, anything else follows the nested expression
+    HasSameStorageOrderAsXprType = (ReshapedStorageOrder == XpxStorageOrder),
+    InnerSize = (ReshapedStorageOrder==int(RowMajor)) ? int(ColsAtCompileTime) : int(RowsAtCompileTime),
+    InnerStrideAtCompileTime = HasSameStorageOrderAsXprType
+                             ? int(inner_stride_at_compile_time<XprType>::ret)
+                             : Dynamic,
+    OuterStrideAtCompileTime = Dynamic,
+    // Direct access requires a nested expression with direct access, Order equal to its storage order, and LinearAccessBit in its evaluator.
+    HasDirectAccess = internal::has_direct_access<XprType>::ret
+                    && (Order==int(XpxStorageOrder))
+                    && ((evaluator<XprType>::Flags&LinearAccessBit)==LinearAccessBit),
+    // PacketAccessBit is only let through (see Flags0) if the inner size is Dynamic or a multiple of the packet size, and the inner stride is 1.
+    MaskPacketAccessBit = (InnerSize == Dynamic || (InnerSize % packet_traits<Scalar>::size) == 0)
+                       && (InnerStrideAtCompileTime == 1)
+                        ? PacketAccessBit : 0,
+    //MaskAlignedBit = ((OuterStrideAtCompileTime!=Dynamic) && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % 16) == 0)) ? AlignedBit : 0,
+    FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1) ? LinearAccessBit : 0,
+    FlagsLvalueBit = is_lvalue<XprType>::value ? LvalueBit : 0,
+    FlagsRowMajorBit = (ReshapedStorageOrder==int(RowMajor)) ? RowMajorBit : 0,
+    FlagsDirectAccessBit = HasDirectAccess ? DirectAccessBit : 0,
+    Flags0 = traits<XprType>::Flags & ( (HereditaryBits & ~RowMajorBit) | MaskPacketAccessBit),
+    // Final flags: the bits inherited from the nested expression (minus its RowMajorBit) plus the reshape-specific bits computed above.
+    Flags = (Flags0 | FlagsLinearAccessBit | FlagsLvalueBit | FlagsRowMajorBit | FlagsDirectAccessBit)
+  };
+};
+
+template<typename XprType, int Rows, int Cols, int Order, bool HasDirectAccess> class ReshapedImpl_dense;
+
+} // end namespace internal
+
+template<typename XprType, int Rows, int Cols, int Order, typename StorageKind> class ReshapedImpl;
+
+template<typename XprType, int Rows, int Cols, int Order> class Reshaped
+  : public ReshapedImpl<XprType, Rows, Cols, Order, typename internal::traits<XprType>::StorageKind>
+{
+    typedef ReshapedImpl<XprType, Rows, Cols, Order, typename internal::traits<XprType>::StorageKind> Impl;
+  public:
+    //typedef typename Impl::Base Base;
+    typedef Impl Base;
+    EIGEN_GENERIC_PUBLIC_INTERFACE(Reshaped)
+    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Reshaped)
+
+    /** Fixed-size constructor: both \a Rows and \a Cols must be known at compile time (enforced by a static assertion).
+      * Asserts that Rows*Cols equals the number of coefficients of \a xpr. */
+    EIGEN_DEVICE_FUNC
+    inline Reshaped(XprType& xpr)
+      : Impl(xpr)
+    {
+      EIGEN_STATIC_ASSERT(RowsAtCompileTime!=Dynamic && ColsAtCompileTime!=Dynamic,THIS_METHOD_IS_ONLY_FOR_FIXED_SIZE)
+      eigen_assert(Rows * Cols == xpr.rows() * xpr.cols());
+    }
+
+    /** Dynamic-size constructor: views \a xpr as a \a reshapeRows x \a reshapeCols expression.
+      * Asserts that the run-time sizes agree with any fixed template dimension and that reshapeRows*reshapeCols equals the number of coefficients of \a xpr. */
+    EIGEN_DEVICE_FUNC
+    inline Reshaped(XprType& xpr,
+          Index reshapeRows, Index reshapeCols)
+      : Impl(xpr, reshapeRows, reshapeCols)
+    {
+      eigen_assert((RowsAtCompileTime==Dynamic || RowsAtCompileTime==reshapeRows)
+          && (ColsAtCompileTime==Dynamic || ColsAtCompileTime==reshapeCols));
+      eigen_assert(reshapeRows * reshapeCols == xpr.rows() * xpr.cols());
+    }
+};
+
+// The generic default implementation for dense reshape simply forwards to internal::ReshapedImpl_dense,
+// which must be specialized for direct and non-direct access...
+template<typename XprType, int Rows, int Cols, int Order>
+class ReshapedImpl<XprType, Rows, Cols, Order, Dense>
+  : public internal::ReshapedImpl_dense<XprType, Rows, Cols, Order,internal::traits<Reshaped<XprType,Rows,Cols,Order> >::HasDirectAccess>
+{
+    typedef internal::ReshapedImpl_dense<XprType, Rows, Cols, Order,internal::traits<Reshaped<XprType,Rows,Cols,Order> >::HasDirectAccess> Impl;
+  public:
+    typedef Impl Base;
+    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(ReshapedImpl)
+    EIGEN_DEVICE_FUNC inline ReshapedImpl(XprType& xpr) : Impl(xpr) {}
+    EIGEN_DEVICE_FUNC inline ReshapedImpl(XprType& xpr, Index reshapeRows, Index reshapeCols)
+      : Impl(xpr, reshapeRows, reshapeCols) {}
+};
+
+namespace internal {
+
+/** \internal Internal implementation of dense Reshaped in the general case. */
+template<typename XprType, int Rows, int Cols, int Order>
+class ReshapedImpl_dense<XprType,Rows,Cols,Order,false>
+  : public internal::dense_xpr_base<Reshaped<XprType, Rows, Cols, Order> >::type
+{
+    typedef Reshaped<XprType, Rows, Cols, Order> ReshapedType;
+  public:
+
+    typedef typename internal::dense_xpr_base<ReshapedType>::type Base;
+    EIGEN_DENSE_PUBLIC_INTERFACE(ReshapedType)
+    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(ReshapedImpl_dense)
+
+    typedef typename internal::ref_selector<XprType>::non_const_type MatrixTypeNested;
+    typedef typename internal::remove_all<XprType>::type NestedExpression;
+
+    class InnerIterator;
+
+    /** Fixed-size constructor
+      */
+    EIGEN_DEVICE_FUNC
+    inline ReshapedImpl_dense(XprType& xpr)
+      : m_xpr(xpr), m_rows(Rows), m_cols(Cols)
+    {}
+
+    /** Dynamic-size constructor
+      */
+    EIGEN_DEVICE_FUNC
+    inline ReshapedImpl_dense(XprType& xpr, Index nRows, Index nCols)
+      : m_xpr(xpr), m_rows(nRows), m_cols(nCols)
+    {}
+
+    EIGEN_DEVICE_FUNC Index rows() const { return m_rows; }
+    EIGEN_DEVICE_FUNC Index cols() const { return m_cols; }
+
+    #ifdef EIGEN_PARSED_BY_DOXYGEN
+    /** \sa MapBase::data() */
+    EIGEN_DEVICE_FUNC inline const Scalar* data() const;
+    EIGEN_DEVICE_FUNC inline Index innerStride() const;
+    EIGEN_DEVICE_FUNC inline Index outerStride() const;
+    #endif
+
+    /** \returns the nested expression */
+    EIGEN_DEVICE_FUNC
+    const typename internal::remove_all<XprType>::type&
+    nestedExpression() const { return m_xpr; }
+
+    /** \returns the nested expression */
+    EIGEN_DEVICE_FUNC
+    typename internal::remove_reference<XprType>::type&
+    nestedExpression() { return m_xpr; }
+
+  protected:
+
+    MatrixTypeNested m_xpr;
+    const internal::variable_if_dynamic<Index, Rows> m_rows;
+    const internal::variable_if_dynamic<Index, Cols> m_cols;
+};
+
+
+/** \internal Internal implementation of dense Reshaped in the direct access case. */
+template<typename XprType, int Rows, int Cols, int Order>
+class ReshapedImpl_dense<XprType, Rows, Cols, Order, true>
+  : public MapBase<Reshaped<XprType, Rows, Cols, Order> >
+{
+    typedef Reshaped<XprType, Rows, Cols, Order> ReshapedType;
+    typedef typename internal::ref_selector<XprType>::non_const_type XprTypeNested;
+  public:
+
+    typedef MapBase<ReshapedType> Base;
+    EIGEN_DENSE_PUBLIC_INTERFACE(ReshapedType)
+    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(ReshapedImpl_dense)
+
+    /** Fixed-size constructor
+      */
+    EIGEN_DEVICE_FUNC
+    inline ReshapedImpl_dense(XprType& xpr)
+      : Base(xpr.data()), m_xpr(xpr)
+    {}
+
+    /** Dynamic-size constructor
+      */
+    EIGEN_DEVICE_FUNC
+    inline ReshapedImpl_dense(XprType& xpr, Index nRows, Index nCols)
+      : Base(xpr.data(), nRows, nCols),
+        m_xpr(xpr)
+    {}
+
+    EIGEN_DEVICE_FUNC
+    const typename internal::remove_all<XprTypeNested>::type& nestedExpression() const
+    {
+      return m_xpr;
+    }
+
+    EIGEN_DEVICE_FUNC
+    XprType& nestedExpression() { return m_xpr; }
+
+    /** \sa MapBase::innerStride() */
+    EIGEN_DEVICE_FUNC
+    inline Index innerStride() const
+    {
+      return m_xpr.innerStride();
+    }
+
+    /** \sa MapBase::outerStride() */
+    EIGEN_DEVICE_FUNC
+    inline Index outerStride() const
+    {
+      return ((Flags&RowMajorBit)==RowMajorBit) ? this->cols() : this->rows();
+    }
+
+  protected:
+
+    XprTypeNested m_xpr;
+};
+
+// Evaluators
+template<typename ArgType, int Rows, int Cols, int Order, bool HasDirectAccess> struct reshaped_evaluator;
+
+template<typename ArgType, int Rows, int Cols, int Order>
+struct evaluator<Reshaped<ArgType, Rows, Cols, Order> >
+  : reshaped_evaluator<ArgType, Rows, Cols, Order, traits<Reshaped<ArgType,Rows,Cols,Order> >::HasDirectAccess>
+{
+  typedef Reshaped<ArgType, Rows, Cols, Order> XprType;
+  typedef typename XprType::Scalar Scalar;
+  // TODO: should check for smaller packet types
+  typedef typename packet_traits<Scalar>::type PacketScalar;
+
+  enum {
+    CoeffReadCost = evaluator<ArgType>::CoeffReadCost,
+    HasDirectAccess = traits<XprType>::HasDirectAccess,
+
+//     RowsAtCompileTime = traits<XprType>::RowsAtCompileTime,
+//     ColsAtCompileTime = traits<XprType>::ColsAtCompileTime,
+//     MaxRowsAtCompileTime = traits<XprType>::MaxRowsAtCompileTime,
+//     MaxColsAtCompileTime = traits<XprType>::MaxColsAtCompileTime,
+//
+//     InnerStrideAtCompileTime = traits<XprType>::HasSameStorageOrderAsXprType
+//                              ? int(inner_stride_at_compile_time<ArgType>::ret)
+//                              : Dynamic,
+//     OuterStrideAtCompileTime = Dynamic,
+
+    FlagsLinearAccessBit = (traits<XprType>::RowsAtCompileTime == 1 || traits<XprType>::ColsAtCompileTime == 1 || HasDirectAccess) ? LinearAccessBit : 0,
+    FlagsRowMajorBit = (traits<XprType>::ReshapedStorageOrder==int(RowMajor)) ? RowMajorBit : 0,
+    FlagsDirectAccessBit =  HasDirectAccess ? DirectAccessBit : 0,
+    Flags0 = evaluator<ArgType>::Flags & (HereditaryBits & ~RowMajorBit),
+    Flags = Flags0 | FlagsLinearAccessBit | FlagsRowMajorBit | FlagsDirectAccessBit,
+
+    PacketAlignment = unpacket_traits<PacketScalar>::alignment,
+    Alignment = evaluator<ArgType>::Alignment
+  };
+  typedef reshaped_evaluator<ArgType, Rows, Cols, Order, HasDirectAccess> reshaped_evaluator_type;
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) : reshaped_evaluator_type(xpr)
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+};
+
+template<typename ArgType, int Rows, int Cols, int Order>
+struct reshaped_evaluator<ArgType, Rows, Cols, Order, /* HasDirectAccess */ false>
+  : evaluator_base<Reshaped<ArgType, Rows, Cols, Order> >
+{
+  typedef Reshaped<ArgType, Rows, Cols, Order> XprType;
+
+  enum {
+    CoeffReadCost = evaluator<ArgType>::CoeffReadCost /* TODO + cost of index computations */,
+
+    Flags = (evaluator<ArgType>::Flags & (HereditaryBits /*| LinearAccessBit | DirectAccessBit*/)),
+
+    Alignment = 0
+  };
+
+  EIGEN_DEVICE_FUNC explicit reshaped_evaluator(const XprType& xpr) : m_argImpl(xpr.nestedExpression()), m_xpr(xpr)
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+
+  typedef std::pair<Index, Index> RowCol; // (row, col) coordinates in the nested expression
+  /** \internal Maps (rowId, colId) of the reshaped view to (row, col) of the nested expression; device-callable because the EIGEN_DEVICE_FUNC accessors below call it. */
+  EIGEN_DEVICE_FUNC inline RowCol index_remap(Index rowId, Index colId) const
+  {
+    if(Order==ColMajor)
+    {
+      const Index nth_elem_idx = colId * m_xpr.rows() + rowId; // column-major linear index in the reshaped view
+      return RowCol(nth_elem_idx % m_xpr.nestedExpression().rows(),
+                    nth_elem_idx / m_xpr.nestedExpression().rows());
+    }
+    else
+    {
+      const Index nth_elem_idx = colId + rowId * m_xpr.cols(); // row-major linear index in the reshaped view
+      return RowCol(nth_elem_idx / m_xpr.nestedExpression().cols(),
+                    nth_elem_idx % m_xpr.nestedExpression().cols());
+    }
+  }
+
+  EIGEN_DEVICE_FUNC
+  inline Scalar& coeffRef(Index rowId, Index colId)
+  {
+    EIGEN_STATIC_ASSERT_LVALUE(XprType)
+    const RowCol row_col = index_remap(rowId, colId);
+    return m_argImpl.coeffRef(row_col.first, row_col.second);
+  }
+
+  EIGEN_DEVICE_FUNC
+  inline const Scalar& coeffRef(Index rowId, Index colId) const
+  {
+    const RowCol row_col = index_remap(rowId, colId);
+    return m_argImpl.coeffRef(row_col.first, row_col.second);
+  }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE const CoeffReturnType coeff(Index rowId, Index colId) const
+  {
+    const RowCol row_col = index_remap(rowId, colId);
+    return m_argImpl.coeff(row_col.first, row_col.second);
+  }
+
+  EIGEN_DEVICE_FUNC
+  inline Scalar& coeffRef(Index index)
+  {
+    EIGEN_STATIC_ASSERT_LVALUE(XprType)
+    const RowCol row_col = index_remap(Rows == 1 ? 0 : index,
+                                       Rows == 1 ? index : 0);
+    return m_argImpl.coeffRef(row_col.first, row_col.second);
+
+  }
+
+  EIGEN_DEVICE_FUNC
+  inline const Scalar& coeffRef(Index index) const
+  {
+    const RowCol row_col = index_remap(Rows == 1 ? 0 : index,
+                                       Rows == 1 ? index : 0);
+    return m_argImpl.coeffRef(row_col.first, row_col.second);
+  }
+
+  EIGEN_DEVICE_FUNC
+  inline const CoeffReturnType coeff(Index index) const
+  {
+    const RowCol row_col = index_remap(Rows == 1 ? 0 : index,
+                                       Rows == 1 ? index : 0);
+    return m_argImpl.coeff(row_col.first, row_col.second);
+  }
+#if 0
+  EIGEN_DEVICE_FUNC
+  template<int LoadMode>
+  inline PacketScalar packet(Index rowId, Index colId) const
+  {
+    const RowCol row_col = index_remap(rowId, colId);
+    return m_argImpl.template packet<Unaligned>(row_col.first, row_col.second);
+
+  }
+
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC
+  inline void writePacket(Index rowId, Index colId, const PacketScalar& val)
+  {
+    const RowCol row_col = index_remap(rowId, colId);
+    m_argImpl.const_cast_derived().template writePacket<Unaligned>
+            (row_col.first, row_col.second, val);
+  }
+
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC
+  inline PacketScalar packet(Index index) const
+  {
+    const RowCol row_col = index_remap(RowsAtCompileTime == 1 ? 0 : index,
+                                        RowsAtCompileTime == 1 ? index : 0);
+    return m_argImpl.template packet<Unaligned>(row_col.first, row_col.second);
+  }
+
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC
+  inline void writePacket(Index index, const PacketScalar& val)
+  {
+    const RowCol row_col = index_remap(RowsAtCompileTime == 1 ? 0 : index,
+                                        RowsAtCompileTime == 1 ? index : 0);
+    return m_argImpl.template packet<Unaligned>(row_col.first, row_col.second, val);
+  }
+#endif
+protected:
+
+  evaluator<ArgType> m_argImpl;
+  const XprType& m_xpr;
+
+};
+
+template<typename ArgType, int Rows, int Cols, int Order>
+struct reshaped_evaluator<ArgType, Rows, Cols, Order, /* HasDirectAccess */ true>
+: mapbase_evaluator<Reshaped<ArgType, Rows, Cols, Order>,
+                      typename Reshaped<ArgType, Rows, Cols, Order>::PlainObject>
+{
+  typedef Reshaped<ArgType, Rows, Cols, Order> XprType;
+  typedef typename XprType::Scalar Scalar;
+
+  EIGEN_DEVICE_FUNC explicit reshaped_evaluator(const XprType& xpr)
+    : mapbase_evaluator<XprType, typename XprType::PlainObject>(xpr)
+  {
+    // TODO: for the 3.4 release, this should be turned to an internal assertion, but let's keep it as is for the beta lifetime
+    eigen_assert(((internal::UIntPtr(xpr.data()) % EIGEN_PLAIN_ENUM_MAX(1,evaluator<XprType>::Alignment)) == 0) && "data is not aligned");
+  }
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_RESHAPED_H
--- a/Eigen/src/Core/ReturnByValue.h
+++ b/Eigen/src/Core/ReturnByValue.h
@@ -13,11 +13,6 @@

 namespace Eigen {

-/** \class ReturnByValue
-  * \ingroup Core_Module
-  *
-  */
-
 namespace internal {

 template<typename Derived>
@@ -48,6 +43,10 @@ struct nested_eval<ReturnByValue<Derived>, n, PlainObject>

 } // end namespace internal

+/** \class ReturnByValue
+  * \ingroup Core_Module
+  *
+  */
 template<typename Derived> class ReturnByValue
  : public internal::dense_xpr_base< ReturnByValue<Derived> >::type, internal::no_assignment_operator
 {
@@ -80,7 +79,7 @@ template<typename Derived> class ReturnByValue

 template<typename Derived>
 template<typename OtherDerived>
-Derived& DenseBase<Derived>::operator=(const ReturnByValue<OtherDerived>& other)
+EIGEN_DEVICE_FUNC Derived& DenseBase<Derived>::operator=(const ReturnByValue<OtherDerived>& other)
 {
  other.evalTo(derived());
  return derived();
--- a/Eigen/src/Core/Reverse.h
+++ b/Eigen/src/Core/Reverse.h
@@ -14,20 +14,6 @@

 namespace Eigen { 

-/** \class Reverse
-  * \ingroup Core_Module
-  *
-  * \brief Expression of the reverse of a vector or matrix
-  *
-  * \param MatrixType the type of the object of which we are taking the reverse
-  *
-  * This class represents an expression of the reverse of a vector.
-  * It is the return type of MatrixBase::reverse() and VectorwiseOp::reverse()
-  * and most of the time this is the only way it is used.
-  *
-  * \sa MatrixBase::reverse(), VectorwiseOp::reverse()
-  */
-
 namespace internal {

 template<typename MatrixType, int Direction>
@@ -60,6 +46,20 @@ template<typename PacketType> struct reverse_packet_cond<PacketType,false>

 } // end namespace internal 

+/** \class Reverse
+  * \ingroup Core_Module
+  *
+  * \brief Expression of the reverse of a vector or matrix
+  *
+  * \tparam MatrixType the type of the object of which we are taking the reverse
+  * \tparam Direction defines the direction of the reverse operation, can be Vertical, Horizontal, or BothDirections
+  *
+  * This class represents an expression of the reverse of a vector or matrix.
+  * It is the return type of MatrixBase::reverse() and VectorwiseOp::reverse()
+  * and most of the time this is the only way it is used.
+  *
+  * \sa MatrixBase::reverse(), VectorwiseOp::reverse()
+  */
 template<typename MatrixType, int Direction> class Reverse
  : public internal::dense_xpr_base< Reverse<MatrixType, Direction> >::type
 {
@@ -114,7 +114,7 @@ template<typename MatrixType, int Direction> class Reverse
  *
  */
 template<typename Derived>
-inline typename DenseBase<Derived>::ReverseReturnType
+EIGEN_DEVICE_FUNC inline typename DenseBase<Derived>::ReverseReturnType
 DenseBase<Derived>::reverse()
 {
  return ReverseReturnType(derived());
@@ -136,7 +136,7 @@ DenseBase<Derived>::reverse()
  *
  * \sa VectorwiseOp::reverseInPlace(), reverse() */
 template<typename Derived>
-inline void DenseBase<Derived>::reverseInPlace()
+EIGEN_DEVICE_FUNC inline void DenseBase<Derived>::reverseInPlace()
 {
  if(cols()>rows())
  {
@@ -171,8 +171,10 @@ struct vectorwise_reverse_inplace_impl<Vertical>
  template<typename ExpressionType>
  static void run(ExpressionType &xpr)
  {
+    const int HalfAtCompileTime = ExpressionType::RowsAtCompileTime==Dynamic?Dynamic:ExpressionType::RowsAtCompileTime/2;
    Index half = xpr.rows()/2;
-    xpr.topRows(half).swap(xpr.bottomRows(half).colwise().reverse());
+    xpr.topRows(fix<HalfAtCompileTime>(half))
+       .swap(xpr.bottomRows(fix<HalfAtCompileTime>(half)).colwise().reverse());
  }
 };

@@ -182,8 +184,10 @@ struct vectorwise_reverse_inplace_impl<Horizontal>
  template<typename ExpressionType>
  static void run(ExpressionType &xpr)
  {
+    const int HalfAtCompileTime = ExpressionType::ColsAtCompileTime==Dynamic?Dynamic:ExpressionType::ColsAtCompileTime/2;
    Index half = xpr.cols()/2;
-    xpr.leftCols(half).swap(xpr.rightCols(half).rowwise().reverse());
+    xpr.leftCols(fix<HalfAtCompileTime>(half))
+       .swap(xpr.rightCols(fix<HalfAtCompileTime>(half)).rowwise().reverse());
  }
 };

@@ -201,9 +205,9 @@ struct vectorwise_reverse_inplace_impl<Horizontal>
  *
  * \sa DenseBase::reverseInPlace(), reverse() */
 template<typename ExpressionType, int Direction>
-void VectorwiseOp<ExpressionType,Direction>::reverseInPlace()
+EIGEN_DEVICE_FUNC void VectorwiseOp<ExpressionType,Direction>::reverseInPlace()
 {
-  internal::vectorwise_reverse_inplace_impl<Direction>::run(_expression().const_cast_derived());
+  internal::vectorwise_reverse_inplace_impl<Direction>::run(m_matrix);
 }

 } // end namespace Eigen
--- a/Eigen/src/Core/SelfAdjointView.h
+++ b/Eigen/src/Core/SelfAdjointView.h
@@ -32,7 +32,7 @@ namespace internal {
 template<typename MatrixType, unsigned int UpLo>
 struct traits<SelfAdjointView<MatrixType, UpLo> > : traits<MatrixType>
 {
-  typedef typename ref_selector<MatrixType>::type MatrixTypeNested;
+  typedef typename ref_selector<MatrixType>::non_const_type MatrixTypeNested;
  typedef typename remove_all<MatrixTypeNested>::type MatrixTypeNestedCleaned;
  typedef MatrixType ExpressionType;
  typedef typename MatrixType::PlainObject FullMatrixType;
@@ -45,7 +45,7 @@ struct traits<SelfAdjointView<MatrixType, UpLo> > : traits<MatrixType>
 };
 }

-// FIXME could also be called SelfAdjointWrapper to be consistent with DiagonalWrapper ??
+
 template<typename _MatrixType, unsigned int UpLo> class SelfAdjointView
  : public TriangularBase<SelfAdjointView<_MatrixType, UpLo> >
 {
@@ -55,20 +55,26 @@ template<typename _MatrixType, unsigned int UpLo> class SelfAdjointView
    typedef TriangularBase<SelfAdjointView> Base;
    typedef typename internal::traits<SelfAdjointView>::MatrixTypeNested MatrixTypeNested;
    typedef typename internal::traits<SelfAdjointView>::MatrixTypeNestedCleaned MatrixTypeNestedCleaned;
+    typedef MatrixTypeNestedCleaned NestedExpression;

    /** \brief The type of coefficients in this matrix */
    typedef typename internal::traits<SelfAdjointView>::Scalar Scalar; 
    typedef typename MatrixType::StorageIndex StorageIndex;
+    typedef typename internal::remove_all<typename MatrixType::ConjugateReturnType>::type MatrixConjugateReturnType;
+    typedef SelfAdjointView<typename internal::add_const<MatrixType>::type, UpLo> ConstSelfAdjointView;

    enum {
      Mode = internal::traits<SelfAdjointView>::Mode,
-      Flags = internal::traits<SelfAdjointView>::Flags
+      Flags = internal::traits<SelfAdjointView>::Flags,
+      TransposeMode = ((Mode & Upper) ? Lower : 0) | ((Mode & Lower) ? Upper : 0)
    };
    typedef typename MatrixType::PlainObject PlainObject;

    EIGEN_DEVICE_FUNC
    explicit inline SelfAdjointView(MatrixType& matrix) : m_matrix(matrix)
-    {}
+    {
+      EIGEN_STATIC_ASSERT(UpLo==Lower || UpLo==Upper,SELFADJOINTVIEW_ACCEPTS_UPPER_AND_LOWER_MODE_ONLY);
+    }

    EIGEN_DEVICE_FUNC
    inline Index rows() const { return m_matrix.rows(); }
@@ -97,7 +103,7 @@ template<typename _MatrixType, unsigned int UpLo> class SelfAdjointView
    {
      EIGEN_STATIC_ASSERT_LVALUE(SelfAdjointView);
      Base::check_coordinates_internal(row, col);
-      return m_matrix.const_cast_derived().coeffRef(row, col);
+      return m_matrix.coeffRef(row, col);
    }

    /** \internal */
@@ -107,7 +113,7 @@ template<typename _MatrixType, unsigned int UpLo> class SelfAdjointView
    EIGEN_DEVICE_FUNC
    const MatrixTypeNestedCleaned& nestedExpression() const { return m_matrix; }
    EIGEN_DEVICE_FUNC
-    MatrixTypeNestedCleaned& nestedExpression() { return *const_cast<MatrixTypeNestedCleaned*>(&m_matrix); }
+    MatrixTypeNestedCleaned& nestedExpression() { return m_matrix; }

    /** Efficient triangular matrix times vector/matrix product */
    template<typename OtherDerived>
@@ -128,7 +134,7 @@ template<typename _MatrixType, unsigned int UpLo> class SelfAdjointView
    }
    
    friend EIGEN_DEVICE_FUNC
-    const SelfAdjointView<const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>,MatrixType>,UpLo>
+    const SelfAdjointView<const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar,MatrixType,product),UpLo>
    operator*(const Scalar& s, const SelfAdjointView& mat)
    {
      return (s*mat.nestedExpression()).template selfadjointView<UpLo>();
@@ -162,6 +168,83 @@ template<typename _MatrixType, unsigned int UpLo> class SelfAdjointView
    EIGEN_DEVICE_FUNC
    SelfAdjointView& rankUpdate(const MatrixBase<DerivedU>& u, const Scalar& alpha = Scalar(1));

+    /** \returns an expression of a triangular view extracted from the current selfadjoint view of a given triangular part
+      *
+      * The parameter \a TriMode can have the following values: \c #Upper, \c #StrictlyUpper, \c #UnitUpper,
+      * \c #Lower, \c #StrictlyLower, \c #UnitLower.
+      *
+      * If \c TriMode references the same triangular part as \c *this, then this method simply returns a \c TriangularView of the nested expression,
+      * otherwise, the adjoint of the nested expression is taken first, thus returning a \c TriangularView<MatrixType::AdjointReturnType> object.
+      *
+      * \sa MatrixBase::triangularView(), class TriangularView
+      */
+    template<unsigned int TriMode>
+    EIGEN_DEVICE_FUNC
+    typename internal::conditional<(TriMode&(Upper|Lower))==(UpLo&(Upper|Lower)),
+                                   TriangularView<MatrixType,TriMode>,
+                                   TriangularView<typename MatrixType::AdjointReturnType,TriMode> >::type
+    triangularView() const
+    {
+      typename internal::conditional<(TriMode&(Upper|Lower))==(UpLo&(Upper|Lower)), MatrixType&, typename MatrixType::ConstTransposeReturnType>::type tmp1(m_matrix);
+      typename internal::conditional<(TriMode&(Upper|Lower))==(UpLo&(Upper|Lower)), MatrixType&, typename MatrixType::AdjointReturnType>::type tmp2(tmp1);
+      return typename internal::conditional<(TriMode&(Upper|Lower))==(UpLo&(Upper|Lower)),
+                                   TriangularView<MatrixType,TriMode>,
+                                   TriangularView<typename MatrixType::AdjointReturnType,TriMode> >::type(tmp2);
+    }
+
+    typedef SelfAdjointView<const MatrixConjugateReturnType,UpLo> ConjugateReturnType;
+    /** \sa MatrixBase::conjugate() const */
+    EIGEN_DEVICE_FUNC
+    inline const ConjugateReturnType conjugate() const
+    { return ConjugateReturnType(m_matrix.conjugate()); }
+
+    /** \returns an expression of the complex conjugate of \c *this if Cond==true,
+     *           returns \c *this otherwise.
+     */
+    template<bool Cond>
+    EIGEN_DEVICE_FUNC
+    inline typename internal::conditional<Cond,ConjugateReturnType,ConstSelfAdjointView>::type
+    conjugateIf() const
+    {
+      typedef typename internal::conditional<Cond,ConjugateReturnType,ConstSelfAdjointView>::type ReturnType;
+      return ReturnType(m_matrix.template conjugateIf<Cond>());
+    }
+
+    typedef SelfAdjointView<const typename MatrixType::AdjointReturnType,TransposeMode> AdjointReturnType;
+    /** \sa MatrixBase::adjoint() const */
+    EIGEN_DEVICE_FUNC
+    inline const AdjointReturnType adjoint() const
+    { return AdjointReturnType(m_matrix.adjoint()); }
+
+    typedef SelfAdjointView<typename MatrixType::TransposeReturnType,TransposeMode> TransposeReturnType;
+     /** \sa MatrixBase::transpose() */
+    EIGEN_DEVICE_FUNC
+    inline TransposeReturnType transpose()
+    {
+      EIGEN_STATIC_ASSERT_LVALUE(MatrixType)
+      typename MatrixType::TransposeReturnType tmp(m_matrix);
+      return TransposeReturnType(tmp);
+    }
+
+    typedef SelfAdjointView<const typename MatrixType::ConstTransposeReturnType,TransposeMode> ConstTransposeReturnType;
+    /** \sa MatrixBase::transpose() const */
+    EIGEN_DEVICE_FUNC
+    inline const ConstTransposeReturnType transpose() const
+    {
+      return ConstTransposeReturnType(m_matrix.transpose());
+    }
+
+    /** \returns a const expression of the main diagonal of the matrix \c *this
+      *
+      * This method simply returns the diagonal of the nested expression, thus by-passing the SelfAdjointView decorator.
+      *
+      * \sa MatrixBase::diagonal(), class Diagonal */
+    EIGEN_DEVICE_FUNC
+    typename MatrixType::ConstDiagonalReturnType diagonal() const
+    {
+      return typename MatrixType::ConstDiagonalReturnType(m_matrix);
+    }
+
 /////////// Cholesky module ///////////

    const LLT<PlainObject, UpLo> llt() const;
@@ -203,8 +286,6 @@ struct evaluator_traits<SelfAdjointView<MatrixType,Mode> >
 {
  typedef typename storage_kind_to_evaluator_kind<typename MatrixType::StorageKind>::Kind Kind;
  typedef SelfAdjointShape Shape;
-  
-  static const int AssumeAliasing = 0;
 };

 template<int UpLo, int SetOpposite, typename DstEvaluatorTypeT, typename SrcEvaluatorTypeT, typename Functor, int Version>
@@ -253,17 +334,27 @@ public:
 * Implementation of MatrixBase methods
 ***************************************************************************/

+/** This is the const version of MatrixBase::selfadjointView() */
 template<typename Derived>
 template<unsigned int UpLo>
-typename MatrixBase<Derived>::template ConstSelfAdjointViewReturnType<UpLo>::Type
+EIGEN_DEVICE_FUNC typename MatrixBase<Derived>::template ConstSelfAdjointViewReturnType<UpLo>::Type
 MatrixBase<Derived>::selfadjointView() const
 {
  return typename ConstSelfAdjointViewReturnType<UpLo>::Type(derived());
 }

+/** \returns an expression of a symmetric/self-adjoint view extracted from the upper or lower triangular part of the current matrix
+  *
+  * The parameter \a UpLo can be either \c #Upper or \c #Lower
+  *
+  * Example: \include MatrixBase_selfadjointView.cpp
+  * Output: \verbinclude MatrixBase_selfadjointView.out
+  *
+  * \sa class SelfAdjointView
+  */
 template<typename Derived>
 template<unsigned int UpLo>
-typename MatrixBase<Derived>::template SelfAdjointViewReturnType<UpLo>::Type
+EIGEN_DEVICE_FUNC typename MatrixBase<Derived>::template SelfAdjointViewReturnType<UpLo>::Type
 MatrixBase<Derived>::selfadjointView()
 {
  return typename SelfAdjointViewReturnType<UpLo>::Type(derived());
--- a/Eigen/src/Core/SelfCwiseBinaryOp.h
+++ b/Eigen/src/Core/SelfCwiseBinaryOp.h
@@ -12,35 +12,33 @@

 namespace Eigen { 

+// TODO generalize the scalar type of 'other'
+
 template<typename Derived>
-inline Derived& DenseBase<Derived>::operator*=(const Scalar& other)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator*=(const Scalar& other)
 {
-  typedef typename Derived::PlainObject PlainObject;
-  internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::mul_assign_op<Scalar>());
+  internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::mul_assign_op<Scalar,Scalar>());
  return derived();
 }

 template<typename Derived>
-inline Derived& ArrayBase<Derived>::operator+=(const Scalar& other)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& ArrayBase<Derived>::operator+=(const Scalar& other)
 {
-  typedef typename Derived::PlainObject PlainObject;
-  internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::add_assign_op<Scalar>());
+  internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::add_assign_op<Scalar,Scalar>());
  return derived();
 }

 template<typename Derived>
-inline Derived& ArrayBase<Derived>::operator-=(const Scalar& other)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& ArrayBase<Derived>::operator-=(const Scalar& other)
 {
-  typedef typename Derived::PlainObject PlainObject;
-  internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::sub_assign_op<Scalar>());
+  internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::sub_assign_op<Scalar,Scalar>());
  return derived();
 }

 template<typename Derived>
-inline Derived& DenseBase<Derived>::operator/=(const Scalar& other)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator/=(const Scalar& other)
 {
-  typedef typename Derived::PlainObject PlainObject;
-  internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::div_assign_op<Scalar>());
+  internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::div_assign_op<Scalar,Scalar>());
  return derived();
 }

--- a/Eigen/src/Core/Solve.h
+++ b/Eigen/src/Core/Solve.h
@@ -19,7 +19,7 @@ template<typename Decomposition, typename RhsType, typename StorageKind> class S
  *
  * \brief Pseudo expression representing a solving operation
  *
-  * \tparam Decomposition the type of the matrix or decomposion object
+  * \tparam Decomposition the type of the matrix or decomposition object
  * \tparam Rhstype the type of the right-hand side
  *
  * This class represents an expression of A.solve(B)
@@ -34,13 +34,12 @@ template<typename Decomposition, typename RhsType,typename StorageKind> struct s
 template<typename Decomposition, typename RhsType>
 struct solve_traits<Decomposition,RhsType,Dense>
 {
-  typedef typename Decomposition::MatrixType MatrixType;
-  typedef Matrix<typename RhsType::Scalar,
-                 MatrixType::ColsAtCompileTime,
+  typedef typename make_proper_matrix_type<typename RhsType::Scalar,
+                 Decomposition::ColsAtCompileTime,
                 RhsType::ColsAtCompileTime,
                 RhsType::PlainObject::Options,
-                 MatrixType::MaxColsAtCompileTime,
-                 RhsType::MaxColsAtCompileTime> PlainObject;  
+                 Decomposition::MaxColsAtCompileTime,
+                 RhsType::MaxColsAtCompileTime>::type PlainObject;
 };

 template<typename Decomposition, typename RhsType>
@@ -52,7 +51,7 @@ struct traits<Solve<Decomposition, RhsType> >
  typedef traits<PlainObject> BaseTraits;
  enum {
    Flags = BaseTraits::Flags & RowMajorBit,
-    CoeffReadCost = Dynamic
+    CoeffReadCost = HugeCost
  };
 };

@@ -118,6 +117,8 @@ struct evaluator<Solve<Decomposition,RhsType> >
  typedef Solve<Decomposition,RhsType> SolveType;
  typedef typename SolveType::PlainObject PlainObject;
  typedef evaluator<PlainObject> Base;
+
+  enum { Flags = Base::Flags | EvalBeforeNestingBit };
  
  EIGEN_DEVICE_FUNC explicit evaluator(const SolveType& solve)
    : m_result(solve.rows(), solve.cols())
@@ -133,17 +134,54 @@ protected:
 // Specialization for "dst = dec.solve(rhs)"
 // NOTE we need to specialize it for Dense2Dense to avoid ambiguous specialization error and a Sparse2Sparse specialization must exist somewhere
 template<typename DstXprType, typename DecType, typename RhsType, typename Scalar>
-struct Assignment<DstXprType, Solve<DecType,RhsType>, internal::assign_op<Scalar>, Dense2Dense, Scalar>
+struct Assignment<DstXprType, Solve<DecType,RhsType>, internal::assign_op<Scalar,Scalar>, Dense2Dense>
 {
  typedef Solve<DecType,RhsType> SrcXprType;
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar> &)
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &)
  {
-    // FIXME shall we resize dst here?
+    Index dstRows = src.rows();
+    Index dstCols = src.cols();
+    if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
+      dst.resize(dstRows, dstCols);
+
    src.dec()._solve_impl(src.rhs(), dst);
  }
 };

-} // end namepsace internal
+// Specialization for "dst = dec.transpose().solve(rhs)"
+template<typename DstXprType, typename DecType, typename RhsType, typename Scalar>
+struct Assignment<DstXprType, Solve<Transpose<const DecType>,RhsType>, internal::assign_op<Scalar,Scalar>, Dense2Dense>
+{
+  typedef Solve<Transpose<const DecType>,RhsType> SrcXprType;
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &)
+  {
+    Index dstRows = src.rows();
+    Index dstCols = src.cols();
+    if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
+      dst.resize(dstRows, dstCols);
+
+    src.dec().nestedExpression().template _solve_impl_transposed<false>(src.rhs(), dst);
+  }
+};
+
+// Specialization for "dst = dec.adjoint().solve(rhs)"
+template<typename DstXprType, typename DecType, typename RhsType, typename Scalar>
+struct Assignment<DstXprType, Solve<CwiseUnaryOp<internal::scalar_conjugate_op<typename DecType::Scalar>, const Transpose<const DecType> >,RhsType>,
+                  internal::assign_op<Scalar,Scalar>, Dense2Dense>
+{
+  typedef Solve<CwiseUnaryOp<internal::scalar_conjugate_op<typename DecType::Scalar>, const Transpose<const DecType> >,RhsType> SrcXprType;
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &)
+  {
+    Index dstRows = src.rows();
+    Index dstCols = src.cols();
+    if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
+      dst.resize(dstRows, dstCols);
+    
+    src.dec().nestedExpression().nestedExpression().template _solve_impl_transposed<true>(src.rhs(), dst);
+  }
+};
+
+} // end namespace internal

 } // end namespace Eigen

--- a/Eigen/src/Core/SolveTriangular.h
+++ b/Eigen/src/Core/SolveTriangular.h
@@ -19,7 +19,7 @@ namespace internal {
 template<typename LhsScalar, typename RhsScalar, typename Index, int Side, int Mode, bool Conjugate, int StorageOrder>
 struct triangular_solve_vector;

-template <typename Scalar, typename Index, int Side, int Mode, bool Conjugate, int TriStorageOrder, int OtherStorageOrder>
+template <typename Scalar, typename Index, int Side, int Mode, bool Conjugate, int TriStorageOrder, int OtherStorageOrder, int OtherInnerStride>
 struct triangular_solve_matrix;

 // small helper struct extracting some traits on the underlying solver operation
@@ -98,8 +98,8 @@ struct triangular_solver_selector<Lhs,Rhs,Side,Mode,NoUnrolling,Dynamic>
    BlockingType blocking(rhs.rows(), rhs.cols(), size, 1, false);

    triangular_solve_matrix<Scalar,Index,Side,Mode,LhsProductTraits::NeedToConjugate,(int(Lhs::Flags) & RowMajorBit) ? RowMajor : ColMajor,
-                               (Rhs::Flags&RowMajorBit) ? RowMajor : ColMajor>
-      ::run(size, othersize, &actualLhs.coeffRef(0,0), actualLhs.outerStride(), &rhs.coeffRef(0,0), rhs.outerStride(), blocking);
+                               (Rhs::Flags&RowMajorBit) ? RowMajor : ColMajor, Rhs::InnerStrideAtCompileTime>
+      ::run(size, othersize, &actualLhs.coeffRef(0,0), actualLhs.outerStride(), &rhs.coeffRef(0,0), rhs.innerStride(), rhs.outerStride(), blocking);
  }
 };

@@ -107,32 +107,32 @@ struct triangular_solver_selector<Lhs,Rhs,Side,Mode,NoUnrolling,Dynamic>
 * meta-unrolling implementation
 ***************************************************************************/

-template<typename Lhs, typename Rhs, int Mode, int Index, int Size,
-         bool Stop = Index==Size>
+template<typename Lhs, typename Rhs, int Mode, int LoopIndex, int Size,
+         bool Stop = LoopIndex==Size>
 struct triangular_solver_unroller;

-template<typename Lhs, typename Rhs, int Mode, int Index, int Size>
-struct triangular_solver_unroller<Lhs,Rhs,Mode,Index,Size,false> {
+template<typename Lhs, typename Rhs, int Mode, int LoopIndex, int Size>
+struct triangular_solver_unroller<Lhs,Rhs,Mode,LoopIndex,Size,false> {
  enum {
    IsLower = ((Mode&Lower)==Lower),
-    I = IsLower ? Index : Size - Index - 1,
-    S = IsLower ? 0     : I+1
+    DiagIndex  = IsLower ? LoopIndex : Size - LoopIndex - 1,
+    StartIndex = IsLower ? 0         : DiagIndex+1
  };
  static void run(const Lhs& lhs, Rhs& rhs)
  {
-    if (Index>0)
-      rhs.coeffRef(I) -= lhs.row(I).template segment<Index>(S).transpose()
-                         .cwiseProduct(rhs.template segment<Index>(S)).sum();
+    if (LoopIndex>0)
+      rhs.coeffRef(DiagIndex) -= lhs.row(DiagIndex).template segment<LoopIndex>(StartIndex).transpose()
+                                .cwiseProduct(rhs.template segment<LoopIndex>(StartIndex)).sum();

    if(!(Mode & UnitDiag))
-      rhs.coeffRef(I) /= lhs.coeff(I,I);
+      rhs.coeffRef(DiagIndex) /= lhs.coeff(DiagIndex,DiagIndex);

-    triangular_solver_unroller<Lhs,Rhs,Mode,Index+1,Size>::run(lhs,rhs);
+    triangular_solver_unroller<Lhs,Rhs,Mode,LoopIndex+1,Size>::run(lhs,rhs);
  }
 };

-template<typename Lhs, typename Rhs, int Mode, int Index, int Size>
-struct triangular_solver_unroller<Lhs,Rhs,Mode,Index,Size,true> {
+template<typename Lhs, typename Rhs, int Mode, int LoopIndex, int Size>
+struct triangular_solver_unroller<Lhs,Rhs,Mode,LoopIndex,Size,true> {
  static void run(const Lhs&, Rhs&) {}
 };

@@ -161,22 +161,19 @@ struct triangular_solver_selector<Lhs,Rhs,OnTheRight,Mode,CompleteUnrolling,1> {
 * TriangularView methods
 ***************************************************************************/

-/** "in-place" version of TriangularView::solve() where the result is written in \a other
-  *
-  * \warning The parameter is only marked 'const' to make the C++ compiler accept a temporary expression here.
-  * This function will const_cast it, so constness isn't honored here.
-  *
-  * See TriangularView:solve() for the details.
-  */
+#ifndef EIGEN_PARSED_BY_DOXYGEN
 template<typename MatrixType, unsigned int Mode>
 template<int Side, typename OtherDerived>
-void TriangularViewImpl<MatrixType,Mode,Dense>::solveInPlace(const MatrixBase<OtherDerived>& _other) const
+EIGEN_DEVICE_FUNC void TriangularViewImpl<MatrixType,Mode,Dense>::solveInPlace(const MatrixBase<OtherDerived>& _other) const
 {
  OtherDerived& other = _other.const_cast_derived();
  eigen_assert( derived().cols() == derived().rows() && ((Side==OnTheLeft && derived().cols() == other.rows()) || (Side==OnTheRight && derived().cols() == other.cols())) );
  eigen_assert((!(Mode & ZeroDiag)) && bool(Mode & (Upper|Lower)));
+  // If solving for a 0x0 matrix, nothing to do, simply return.
+  if (derived().cols() == 0)
+    return;

-  enum { copy = internal::traits<OtherDerived>::Flags & RowMajorBit  && OtherDerived::IsVectorAtCompileTime };
+  enum { copy = (internal::traits<OtherDerived>::Flags & RowMajorBit)  && OtherDerived::IsVectorAtCompileTime && OtherDerived::SizeAtCompileTime!=1};
  typedef typename internal::conditional<copy,
    typename internal::plain_matrix_type_column_major<OtherDerived>::type, OtherDerived&>::type OtherCopy;
  OtherCopy otherCopy(other);
@@ -188,27 +185,6 @@ void TriangularViewImpl<MatrixType,Mode,Dense>::solveInPlace(const MatrixBase<Ot
    other = otherCopy;
 }

-/** \returns the product of the inverse of \c *this with \a other, \a *this being triangular.
-  *
-  * This function computes the inverse-matrix matrix product inverse(\c *this) * \a other if
-  * \a Side==OnTheLeft (the default), or the right-inverse-multiply  \a other * inverse(\c *this) if
-  * \a Side==OnTheRight.
-  *
-  * The matrix \c *this must be triangular and invertible (i.e., all the coefficients of the
-  * diagonal must be non zero). It works as a forward (resp. backward) substitution if \c *this
-  * is an upper (resp. lower) triangular matrix.
-  *
-  * Example: \include Triangular_solve.cpp
-  * Output: \verbinclude Triangular_solve.out
-  *
-  * This function returns an expression of the inverse-multiply and can works in-place if it is assigned
-  * to the same matrix or vector \a other.
-  *
-  * For users coming from BLAS, this function (and more specifically solveInPlace()) offer
-  * all the operations supported by the \c *TRSV and \c *TRSM BLAS routines.
-  *
-  * \sa TriangularView::solveInPlace()
-  */
 template<typename Derived, unsigned int Mode>
 template<int Side, typename Other>
 const internal::triangular_solve_retval<Side,TriangularView<Derived,Mode>,Other>
@@ -216,6 +192,7 @@ TriangularViewImpl<Derived,Mode,Dense>::solve(const MatrixBase<Other>& other) co
 {
  return internal::triangular_solve_retval<Side,TriangularViewType,Other>(derived(), other.derived());
 }
+#endif

 namespace internal {

@@ -241,7 +218,7 @@ template<int Side, typename TriangularType, typename Rhs> struct triangular_solv

  template<typename Dest> inline void evalTo(Dest& dst) const
  {
-    if(!(is_same<RhsNestedCleaned,Dest>::value && extract_data(dst) == extract_data(m_rhs)))
+    if(!is_same_dense(dst,m_rhs))
      dst = m_rhs;
    m_triangularMatrix.template solveInPlace<Side>(dst);
  }
--- a/Show More
+++ b/Show More