bump to 3.2.9

Fix compilation issue if PastixSupport
merge
2026-04-10 11:34:33 +08:00 · 2016-07-18 16:28:24 +02:00 · 2016-07-18 14:55:06 +02:00 · 2016-07-18 14:51:53 +02:00 · 2016-07-18 14:51:44 +02:00 · 2016-07-18 13:59:43 +02:00
771 changed files with 36536 additions and 64174 deletions
--- a/.hgeol
+++ b/.hgeol
@@ -1,9 +1,6 @@
 [patterns]
-*.sh = LF
-*.MINPACK = CRLF
 scripts/*.in = LF
 debug/msvc/*.dat = CRLF
-debug/msvc/*.natvis = CRLF
 unsupported/test/mpreal/*.* = CRLF
 ** = native

--- a/.hgignore
+++ b/.hgignore
@@ -30,5 +30,3 @@ log
 patch
 a
 a.*
-lapack/testing
-lapack/reference
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,6 +1,5 @@
 project(Eigen)
-
-cmake_minimum_required(VERSION 2.8.4)
+cmake_minimum_required(VERSION 2.8.5)

 # guard against in-source builds

@@ -55,6 +54,7 @@ endif(EIGEN_HG_CHANGESET)


 include(CheckCXXCompilerFlag)
+include(GNUInstallDirs)

 set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)

@@ -108,8 +108,7 @@ endif()
 set(EIGEN_TEST_MAX_SIZE "320" CACHE STRING "Maximal matrix/vector size, default is 320")

 macro(ei_add_cxx_compiler_flag FLAG)
-  string(REGEX REPLACE "-" "" SFLAG1 ${FLAG})
-  string(REGEX REPLACE "\\+" "p" SFLAG ${SFLAG1})
+  string(REGEX REPLACE "-" "" SFLAG ${FLAG})
  check_cxx_compiler_flag(${FLAG} COMPILER_SUPPORT_${SFLAG})
  if(COMPILER_SUPPORT_${SFLAG})
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLAG}")
@@ -119,7 +118,7 @@ endmacro(ei_add_cxx_compiler_flag)
 if(NOT MSVC)
  # We assume that other compilers are partly compatible with GNUCC
  
-#  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fexceptions")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fexceptions")
  set(CMAKE_CXX_FLAGS_DEBUG "-g3")
  set(CMAKE_CXX_FLAGS_RELEASE "-g0 -O2")
  
@@ -143,9 +142,6 @@ if(NOT MSVC)
  ei_add_cxx_compiler_flag("-Wpointer-arith")
  ei_add_cxx_compiler_flag("-Wwrite-strings")
  ei_add_cxx_compiler_flag("-Wformat-security")
-  ei_add_cxx_compiler_flag("-Wshorten-64-to-32")
-  ei_add_cxx_compiler_flag("-Wenum-conversion")
-  ei_add_cxx_compiler_flag("-Wc++11-extensions")
  
  ei_add_cxx_compiler_flag("-Wno-psabi")
  ei_add_cxx_compiler_flag("-Wno-variadic-macros")
@@ -157,7 +153,6 @@ if(NOT MSVC)
  ei_add_cxx_compiler_flag("-wd981")                    # disable ICC's "operands are evaluated in unspecified order" remark
  ei_add_cxx_compiler_flag("-wd2304")                   # disbale ICC's "warning #2304: non-explicit constructor with single argument may cause implicit type conversion" produced by -Wnon-virtual-dtor
  
-  
  # The -ansi flag must be added last, otherwise it is also used as a linker flag by check_cxx_compiler_flag making it fails
  # Moreover we should not set both -strict-ansi and -ansi
  check_cxx_compiler_flag("-strict-ansi" COMPILER_SUPPORT_STRICTANSI)
@@ -201,44 +196,18 @@ if(NOT MSVC)
    message(STATUS "Enabling SSE4.2 in tests/examples")
  endif()

-  option(EIGEN_TEST_AVX "Enable/Disable AVX in tests/examples" OFF)
-  if(EIGEN_TEST_AVX)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx")
-    message(STATUS "Enabling AVX in tests/examples")
-  endif()
-
-  option(EIGEN_TEST_FMA "Enable/Disable FMA in tests/examples" OFF)
-  if(EIGEN_TEST_FMA)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfma")
-    message(STATUS "Enabling FMA in tests/examples")
-  endif()
-
  option(EIGEN_TEST_ALTIVEC "Enable/Disable AltiVec in tests/examples" OFF)
  if(EIGEN_TEST_ALTIVEC)
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -maltivec -mabi=altivec")
    message(STATUS "Enabling AltiVec in tests/examples")
  endif()

-  option(EIGEN_TEST_VSX "Enable/Disable VSX in tests/examples" OFF)
-  if(EIGEN_TEST_VSX)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m64 -mvsx")
-    message(STATUS "Enabling VSX in tests/examples")
-  endif()
-
  option(EIGEN_TEST_NEON "Enable/Disable Neon in tests/examples" OFF)
  if(EIGEN_TEST_NEON)
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfpu=neon -mcpu=cortex-a8")
    message(STATUS "Enabling NEON in tests/examples")
  endif()

-  option(EIGEN_TEST_NEON64 "Enable/Disable Neon in tests/examples" OFF)
-  if(EIGEN_TEST_NEON64)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
-    message(STATUS "Enabling NEON in tests/examples")
-  endif()
-
-
-
  check_cxx_compiler_flag("-fopenmp" COMPILER_SUPPORT_OPENMP)
  if(COMPILER_SUPPORT_OPENMP)
    option(EIGEN_TEST_OPENMP "Enable/Disable OpenMP in tests/examples" OFF)
@@ -315,35 +284,30 @@ if(EIGEN_TEST_NO_EXPLICIT_ALIGNMENT)
  message(STATUS "Disabling alignment in tests/examples")
 endif()

-option(EIGEN_TEST_NO_EXCEPTIONS "Disables C++ exceptions" OFF)
-if(EIGEN_TEST_NO_EXCEPTIONS)
-  ei_add_cxx_compiler_flag("-fno-exceptions")
-  message(STATUS "Disabling exceptions in tests/examples")
-endif()
-
 option(EIGEN_TEST_C++0x "Enables all C++0x features." OFF)

 include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR})

-# the user modifiable install path for header files
-set(EIGEN_INCLUDE_INSTALL_DIR ${EIGEN_INCLUDE_INSTALL_DIR} CACHE PATH "The directory where we install the header files (optional)")
-
-# set the internal install path for header files which depends on wether the user modifiable
-# EIGEN_INCLUDE_INSTALL_DIR has been set by the user or not.
-if(EIGEN_INCLUDE_INSTALL_DIR)
-  set(INCLUDE_INSTALL_DIR
-    ${EIGEN_INCLUDE_INSTALL_DIR}
-    CACHE INTERNAL
-    "The directory where we install the header files (internal)"
-  )
+# Backward compatibility support for EIGEN_INCLUDE_INSTALL_DIR
+if(EIGEN_INCLUDE_INSTALL_DIR AND NOT INCLUDE_INSTALL_DIR)
+  set(INCLUDE_INSTALL_DIR ${EIGEN_INCLUDE_INSTALL_DIR}
+      CACHE PATH "The directory relative to CMAKE_PREFIX_PATH where Eigen header files are installed")
 else()
  set(INCLUDE_INSTALL_DIR
-    "${CMAKE_INSTALL_PREFIX}/include/eigen3"
-    CACHE INTERNAL
-    "The directory where we install the header files (internal)"
-  )
+      "${CMAKE_INSTALL_INCLUDEDIR}/eigen3"
+      CACHE PATH "The directory relative to CMAKE_PREFIX_PATH where Eigen header files are installed"
+      )
 endif()

+set(CMAKEPACKAGE_INSTALL_DIR
+    "${CMAKE_INSTALL_LIBDIR}/cmake/eigen3"
+    CACHE PATH "The directory relative to CMAKE_PREFIX_PATH where Eigen3Config.cmake is installed"
+    )
+set(PKGCONFIG_INSTALL_DIR
+    "${CMAKE_INSTALL_DATADIR}/pkgconfig"
+    CACHE PATH "The directory relative to CMAKE_PREFIX_PATH where eigen3.pc is installed"
+    )
+
 # similar to set_target_properties but append the property instead of overwriting it
 macro(ei_add_target_property target prop value)

@@ -361,21 +325,9 @@ install(FILES
  )

 if(EIGEN_BUILD_PKGCONFIG)
-    SET(path_separator ":")
-    STRING(REPLACE ${path_separator} ";" pkg_config_libdir_search "$ENV{PKG_CONFIG_LIBDIR}")
-    message(STATUS "searching for 'pkgconfig' directory in PKG_CONFIG_LIBDIR ( $ENV{PKG_CONFIG_LIBDIR} ), ${CMAKE_INSTALL_PREFIX}/share, and ${CMAKE_INSTALL_PREFIX}/lib")
-    FIND_PATH(pkg_config_libdir pkgconfig ${pkg_config_libdir_search} ${CMAKE_INSTALL_PREFIX}/share ${CMAKE_INSTALL_PREFIX}/lib ${pkg_config_libdir_search})
-    if(pkg_config_libdir)
-        SET(pkg_config_install_dir ${pkg_config_libdir})
-        message(STATUS "found ${pkg_config_libdir}/pkgconfig" )
-    else(pkg_config_libdir)
-        SET(pkg_config_install_dir ${CMAKE_INSTALL_PREFIX}/share)
-        message(STATUS "pkgconfig not found; installing in ${pkg_config_install_dir}" )
-    endif(pkg_config_libdir)
-
-    configure_file(eigen3.pc.in eigen3.pc)
+    configure_file(eigen3.pc.in eigen3.pc @ONLY)
    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/eigen3.pc
-        DESTINATION ${pkg_config_install_dir}/pkgconfig
+        DESTINATION ${PKGCONFIG_INSTALL_DIR}
        )
 endif(EIGEN_BUILD_PKGCONFIG)

@@ -438,17 +390,19 @@ if(cmake_generator_tolower MATCHES "makefile")
  message(STATUS "--------------+--------------------------------------------------------------")
  message(STATUS "Command       |   Description")
  message(STATUS "--------------+--------------------------------------------------------------")
-  message(STATUS "make install  | Install to ${CMAKE_INSTALL_PREFIX}. To change that:")
-  message(STATUS "              |     cmake . -DCMAKE_INSTALL_PREFIX=yourpath")
-  message(STATUS "              |   Eigen headers will then be installed to:")
-  message(STATUS "              |     ${INCLUDE_INSTALL_DIR}")
-  message(STATUS "              |   To install Eigen headers to a separate location, do:")
-  message(STATUS "              |     cmake . -DEIGEN_INCLUDE_INSTALL_DIR=yourpath")
+  message(STATUS "make install  | Install Eigen. Headers will be installed to:")
+  message(STATUS "              |     <CMAKE_INSTALL_PREFIX>/<INCLUDE_INSTALL_DIR>")
+  message(STATUS "              |   Using the following values:")
+  message(STATUS "              |     CMAKE_INSTALL_PREFIX: ${CMAKE_INSTALL_PREFIX}")
+  message(STATUS "              |     INCLUDE_INSTALL_DIR:  ${INCLUDE_INSTALL_DIR}")
+  message(STATUS "              |   Change the install location of Eigen headers using:")
+  message(STATUS "              |     cmake . -DCMAKE_INSTALL_PREFIX=yourprefix")
+  message(STATUS "              |   Or:")
+  message(STATUS "              |     cmake . -DINCLUDE_INSTALL_DIR=yourdir")
  message(STATUS "make doc      | Generate the API documentation, requires Doxygen & LaTeX")
  message(STATUS "make check    | Build and run the unit-tests. Read this page:")
  message(STATUS "              |   http://eigen.tuxfamily.org/index.php?title=Tests")
  message(STATUS "make blas     | Build BLAS library (not the same thing as Eigen)")
-  message(STATUS "make uninstall| Removes files installed by make install")
  message(STATUS "--------------+--------------------------------------------------------------")
 else()
  message(STATUS "To build/run the unit tests, read this page:")
@@ -456,35 +410,3 @@ else()
 endif()

 message(STATUS "")
-
-set ( EIGEN_CONFIG_CMAKE_PATH
-      lib${LIB_SUFFIX}/cmake/eigen3
-      CACHE PATH "The directory where the CMake files are installed"
-    )
-if ( NOT IS_ABSOLUTE EIGEN_CONFIG_CMAKE_PATH )
-  set ( EIGEN_CONFIG_CMAKE_PATH ${CMAKE_INSTALL_PREFIX}/${EIGEN_CONFIG_CMAKE_PATH} )
-endif ()
-
-set ( EIGEN_USE_FILE ${EIGEN_CONFIG_CMAKE_PATH}/UseEigen3.cmake )
-set ( EIGEN_VERSION_STRING ${EIGEN_VERSION_NUMBER} )
-set ( EIGEN_VERSION_MAJOR  ${EIGEN_WORLD_VERSION} )
-set ( EIGEN_VERSION_MINOR  ${EIGEN_MAJOR_VERSION} )
-set ( EIGEN_VERSION_PATCH  ${EIGEN_MINOR_VERSION} )
-set ( EIGEN_DEFINITIONS "")
-set ( EIGEN_INCLUDE_DIR ${INCLUDE_INSTALL_DIR} )
-set ( EIGEN_INCLUDE_DIRS ${EIGEN_INCLUDE_DIR} )
-set ( EIGEN_ROOT_DIR ${CMAKE_INSTALL_PREFIX} )
-
-configure_file ( ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Eigen3Config.cmake.in
-                 ${CMAKE_CURRENT_BINARY_DIR}/Eigen3Config.cmake
-                 @ONLY ESCAPE_QUOTES
-               )
-
-install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/cmake/UseEigen3.cmake
-                ${CMAKE_CURRENT_BINARY_DIR}/Eigen3Config.cmake
-          DESTINATION ${EIGEN_CONFIG_CMAKE_PATH}
-        )
-
-# Add uninstall target
-add_custom_target ( uninstall
-    COMMAND ${CMAKE_COMMAND} -P ${CMAKE_CURRENT_SOURCE_DIR}/cmake/EigenUninstall.cmake)
--- a/CTestConfig.cmake
+++ b/CTestConfig.cmake
@@ -4,14 +4,10 @@
 ## # The following are required to uses Dart and the Cdash dashboard
 ##   ENABLE_TESTING()
 ##   INCLUDE(CTest)
-set(CTEST_PROJECT_NAME "Eigen")
+set(CTEST_PROJECT_NAME "Eigen3.2")
 set(CTEST_NIGHTLY_START_TIME "00:00:00 UTC")

 set(CTEST_DROP_METHOD "http")
 set(CTEST_DROP_SITE "manao.inria.fr")
-set(CTEST_DROP_LOCATION "/CDash/submit.php?project=Eigen")
+set(CTEST_DROP_LOCATION "/CDash/submit.php?project=Eigen3.2")
 set(CTEST_DROP_SITE_CDASH TRUE)
-set(CTEST_PROJECT_SUBPROJECTS
-Official
-Unsupported
-)
--- a/Eigen/Array
+++ b/Eigen/Array
@@ -0,0 +1,11 @@
+#ifndef EIGEN_ARRAY_MODULE_H
+#define EIGEN_ARRAY_MODULE_H
+
+// include Core first to handle Eigen2 support macros
+#include "Core"
+
+#ifndef EIGEN2_SUPPORT
+  #error The Eigen/Array header does no longer exist in Eigen3. All that functionality has moved to Eigen/Core.
+#endif
+
+#endif // EIGEN_ARRAY_MODULE_H
--- a/Eigen/Cholesky
+++ b/Eigen/Cholesky
@@ -10,17 +10,16 @@
  *
  *
  * This module provides two variants of the Cholesky decomposition for selfadjoint (hermitian) matrices.
-  * Those decompositions are also accessible via the following methods:
-  *  - MatrixBase::llt()
+  * Those decompositions are accessible via the following MatrixBase methods:
+  *  - MatrixBase::llt(),
  *  - MatrixBase::ldlt()
-  *  - SelfAdjointView::llt()
-  *  - SelfAdjointView::ldlt()
  *
  * \code
  * #include <Eigen/Cholesky>
  * \endcode
  */

+#include "src/misc/Solve.h"
 #include "src/Cholesky/LLT.h"
 #include "src/Cholesky/LDLT.h"
 #ifdef EIGEN_USE_LAPACKE
--- a/Eigen/CholmodSupport
+++ b/Eigen/CholmodSupport
@@ -12,7 +12,7 @@ extern "C" {
 /** \ingroup Support_modules
  * \defgroup CholmodSupport_Module CholmodSupport module
  *
-  * This module provides an interface to the Cholmod library which is part of the <a href="http://www.cise.ufl.edu/research/sparse/SuiteSparse/">suitesparse</a> package.
+  * This module provides an interface to the Cholmod library which is part of the <a href="http://www.suitesparse.com">suitesparse</a> package.
  * It provides the two following main factorization classes:
  * - class CholmodSupernodalLLT: a supernodal LLT Cholesky factorization.
  * - class CholmodDecomposiiton: a general L(D)LT Cholesky factorization with automatic or explicit runtime selection of the underlying factorization method (supernodal or simplicial).
@@ -33,8 +33,12 @@ extern "C" {
  *
  */

+#include "src/misc/Solve.h"
+#include "src/misc/SparseSolve.h"
+
 #include "src/CholmodSupport/CholmodSupport.h"

+
 #include "src/Core/util/ReenableStupidWarnings.h"

 #endif // EIGEN_CHOLMODSUPPORT_MODULE_H
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -14,42 +14,6 @@
 // first thing Eigen does: stop the compiler from committing suicide
 #include "src/Core/util/DisableStupidWarnings.h"

-// Handle NVCC/CUDA
-#ifdef __CUDACC__
-  // Do not try asserts on CUDA!
-  #ifndef EIGEN_NO_DEBUG
-  #define EIGEN_NO_DEBUG
-  #endif
-
-  #ifdef EIGEN_INTERNAL_DEBUGGING
-  #undef EIGEN_INTERNAL_DEBUGGING
-  #endif
-  
-  // Do not try to vectorize on CUDA!
-  #define EIGEN_DONT_VECTORIZE
-  
-  // All functions callable from CUDA code must be qualified with __device__
-  #define EIGEN_DEVICE_FUNC __host__ __device__
-  
-#else
-  #define EIGEN_DEVICE_FUNC
-  
-#endif
-
-#if defined(__CUDA_ARCH__)
-  #define EIGEN_USING_STD_MATH(FUNC) using ::FUNC;
-#else
-  #define EIGEN_USING_STD_MATH(FUNC) using std::FUNC;
-#endif
-
-#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(__CUDA_ARCH__) && !defined(EIGEN_EXCEPTIONS)
-  #define EIGEN_EXCEPTIONS
-#endif
-
-#ifdef EIGEN_EXCEPTIONS
-  #include <new>
-#endif
-
 // then include this file where all our macros are defined. It's really important to do it first because
 // it's where we do all the alignment settings (platform detection and honoring the user's will if he
 // defined e.g. EIGEN_DONT_ALIGN) so it needs to be done before we do anything with vectorization.
@@ -57,7 +21,7 @@

 // Disable the ipa-cp-clone optimization flag with MinGW 6.x or newer (enabled by default with -O3)
 // See http://eigen.tuxfamily.org/bz/show_bug.cgi?id=556 for details.
-#if EIGEN_COMP_MINGW && EIGEN_GNUC_AT_LEAST(4,6)
+#if defined(__MINGW32__) && EIGEN_GNUC_AT_LEAST(4,6)
  #pragma GCC optimize ("-fno-ipa-cp-clone")
 #endif

@@ -75,18 +39,18 @@
  #endif
 #endif

-#if EIGEN_COMP_MSVC
+#ifdef _MSC_VER
  #include <malloc.h> // for _aligned_malloc -- need it regardless of whether vectorization is enabled
-  #if (EIGEN_COMP_MSVC >= 1500) // 2008 or later
+  #if (_MSC_VER >= 1500) // 2008 or later
    // Remember that usage of defined() in a #define is undefined by the standard.
    // a user reported that in 64-bit mode, MSVC doesn't care to define _M_IX86_FP.
-    #if (defined(_M_IX86_FP) && (_M_IX86_FP >= 2)) || EIGEN_ARCH_x86_64
+    #if (defined(_M_IX86_FP) && (_M_IX86_FP >= 2)) || defined(_M_X64)
      #define EIGEN_SSE2_ON_MSVC_2008_OR_LATER
    #endif
  #endif
 #else
  // Remember that usage of defined() in a #define is undefined by the standard
-  #if (defined __SSE2__) && ( (!EIGEN_COMP_GNUC) || EIGEN_COMP_ICC || EIGEN_GNUC_AT_LEAST(4,2) )
+  #if (defined __SSE2__) && ( (!defined __GNUC__) || (defined __INTEL_COMPILER) || EIGEN_GNUC_AT_LEAST(4,2) )
    #define EIGEN_SSE2_ON_NON_MSVC_BUT_NOT_OLD_GCC
  #endif
 #endif
@@ -118,13 +82,6 @@
    #ifdef __SSE4_2__
      #define EIGEN_VECTORIZE_SSE4_2
    #endif
-    #ifdef __AVX__
-      #define EIGEN_VECTORIZE_AVX
-      #define EIGEN_VECTORIZE_SSE3
-      #define EIGEN_VECTORIZE_SSSE3
-      #define EIGEN_VECTORIZE_SSE4_1
-      #define EIGEN_VECTORIZE_SSE4_2
-    #endif

    // include files

@@ -138,7 +95,7 @@
    extern "C" {
      // In theory we should only include immintrin.h and not the other *mmintrin.h header files directly.
      // Doing so triggers some issues with ICC. However old gcc versions seems to not have this file, thus:
-      #if EIGEN_COMP_ICC >= 1110
+      #if defined(__INTEL_COMPILER) && __INTEL_COMPILER >= 1110
        #include <immintrin.h>
      #else
        #include <emmintrin.h>
@@ -155,20 +112,8 @@
        #ifdef EIGEN_VECTORIZE_SSE4_2
        #include <nmmintrin.h>
        #endif
-        #ifdef EIGEN_VECTORIZE_AVX
-        #include <immintrin.h>
-        #endif
      #endif
    } // end extern "C"
-  #elif defined __VSX__
-    #define EIGEN_VECTORIZE
-    #define EIGEN_VECTORIZE_VSX
-    #include <altivec.h>
-    // We need to #undef all these ugly tokens defined in <altivec.h>
-    // => use __vector instead of vector
-    #undef bool
-    #undef vector
-    #undef pixel
  #elif defined __ALTIVEC__
    #define EIGEN_VECTORIZE
    #define EIGEN_VECTORIZE_ALTIVEC
@@ -185,11 +130,6 @@
  #endif
 #endif

-#if defined __CUDACC__
-  #define EIGEN_VECTORIZE_CUDA
-  #include <vector_types.h>
-#endif
-
 #if (defined _OPENMP) && (!defined EIGEN_DONT_PARALLELIZE)
  #define EIGEN_HAS_OPENMP
 #endif
@@ -199,7 +139,7 @@
 #endif

 // MSVC for windows mobile does not have the errno.h file
-#if !(EIGEN_COMP_MSVC && EIGEN_OS_WINCE) && !EIGEN_COMP_ARM
+#if !(defined(_MSC_VER) && defined(_WIN32_WCE)) && !defined(__ARMCC_VERSION)
 #define EIGEN_HAS_ERRNO
 #endif

@@ -225,17 +165,23 @@
 #endif

 // required for __cpuid, needs to be included after cmath
-#if EIGEN_COMP_MSVC && EIGEN_ARCH_i386_OR_x86_64 && !EIGEN_OS_WINCE
+#if defined(_MSC_VER) && (defined(_M_IX86)||defined(_M_X64)) && (!defined(_WIN32_WCE))
  #include <intrin.h>
 #endif

+#if defined(_CPPUNWIND) || defined(__EXCEPTIONS)
+  #define EIGEN_EXCEPTIONS
+#endif
+
+#ifdef EIGEN_EXCEPTIONS
+  #include <new>
+#endif
+
 /** \brief Namespace containing all symbols from the %Eigen library. */
 namespace Eigen {

 inline static const char *SimdInstructionSetsInUse(void) {
-#if defined(EIGEN_VECTORIZE_AVX)
-  return "AVX SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
-#elif defined(EIGEN_VECTORIZE_SSE4_2)
+#if defined(EIGEN_VECTORIZE_SSE4_2)
  return "SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
 #elif defined(EIGEN_VECTORIZE_SSE4_1)
  return "SSE, SSE2, SSE3, SSSE3, SSE4.1";
@@ -247,8 +193,6 @@ inline static const char *SimdInstructionSetsInUse(void) {
  return "SSE, SSE2";
 #elif defined(EIGEN_VECTORIZE_ALTIVEC)
  return "AltiVec";
-#elif defined(EIGEN_VECTORIZE_VSX)
-  return "VSX";
 #elif defined(EIGEN_VECTORIZE_NEON)
  return "ARM NEON";
 #else
@@ -258,9 +202,34 @@ inline static const char *SimdInstructionSetsInUse(void) {

 } // end namespace Eigen

-#if defined EIGEN2_SUPPORT_STAGE40_FULL_EIGEN3_STRICTNESS || defined EIGEN2_SUPPORT_STAGE30_FULL_EIGEN3_API || defined EIGEN2_SUPPORT_STAGE20_RESOLVE_API_CONFLICTS || defined EIGEN2_SUPPORT_STAGE10_FULL_EIGEN2_API || defined EIGEN2_SUPPORT
-// This will generate an error message:
-#error Eigen2-support is only available up to version 3.2. Please go to "http://eigen.tuxfamily.org/index.php?title=Eigen2" for further information
+#define STAGE10_FULL_EIGEN2_API             10
+#define STAGE20_RESOLVE_API_CONFLICTS       20
+#define STAGE30_FULL_EIGEN3_API             30
+#define STAGE40_FULL_EIGEN3_STRICTNESS      40
+#define STAGE99_NO_EIGEN2_SUPPORT           99
+
+#if   defined EIGEN2_SUPPORT_STAGE40_FULL_EIGEN3_STRICTNESS
+  #define EIGEN2_SUPPORT
+  #define EIGEN2_SUPPORT_STAGE STAGE40_FULL_EIGEN3_STRICTNESS
+#elif defined EIGEN2_SUPPORT_STAGE30_FULL_EIGEN3_API
+  #define EIGEN2_SUPPORT
+  #define EIGEN2_SUPPORT_STAGE STAGE30_FULL_EIGEN3_API
+#elif defined EIGEN2_SUPPORT_STAGE20_RESOLVE_API_CONFLICTS
+  #define EIGEN2_SUPPORT
+  #define EIGEN2_SUPPORT_STAGE STAGE20_RESOLVE_API_CONFLICTS
+#elif defined EIGEN2_SUPPORT_STAGE10_FULL_EIGEN2_API
+  #define EIGEN2_SUPPORT
+  #define EIGEN2_SUPPORT_STAGE STAGE10_FULL_EIGEN2_API
+#elif defined EIGEN2_SUPPORT
+  // default to stage 3, that's what it's always meant
+  #define EIGEN2_SUPPORT_STAGE30_FULL_EIGEN3_API
+  #define EIGEN2_SUPPORT_STAGE STAGE30_FULL_EIGEN3_API
+#else
+  #define EIGEN2_SUPPORT_STAGE STAGE99_NO_EIGEN2_SUPPORT
+#endif
+
+#ifdef EIGEN2_SUPPORT
+#undef minor
 #endif

 // we use size_t frequently and we'll never remember to prepend it with std:: everytime just to
@@ -280,8 +249,8 @@ using std::ptrdiff_t;
  */

 #include "src/Core/util/Constants.h"
-#include "src/Core/util/Meta.h"
 #include "src/Core/util/ForwardDeclarations.h"
+#include "src/Core/util/Meta.h"
 #include "src/Core/util/StaticAssert.h"
 #include "src/Core/util/XprHelper.h"
 #include "src/Core/util/Memory.h"
@@ -290,61 +259,35 @@ using std::ptrdiff_t;
 #include "src/Core/MathFunctions.h"
 #include "src/Core/GenericPacketMath.h"

-#if defined EIGEN_VECTORIZE_AVX
-  // Use AVX for floats and doubles, SSE for integers
-  #include "src/Core/arch/SSE/PacketMath.h"
-  #include "src/Core/arch/SSE/Complex.h"
-  #include "src/Core/arch/AVX/PacketMath.h"
-  #include "src/Core/arch/AVX/MathFunctions.h"
-  #include "src/Core/arch/AVX/Complex.h"
-#elif defined EIGEN_VECTORIZE_SSE
+#if defined EIGEN_VECTORIZE_SSE
  #include "src/Core/arch/SSE/PacketMath.h"
  #include "src/Core/arch/SSE/MathFunctions.h"
  #include "src/Core/arch/SSE/Complex.h"
-#elif defined(EIGEN_VECTORIZE_ALTIVEC) || defined(EIGEN_VECTORIZE_VSX)
+#elif defined EIGEN_VECTORIZE_ALTIVEC
  #include "src/Core/arch/AltiVec/PacketMath.h"
  #include "src/Core/arch/AltiVec/Complex.h"
 #elif defined EIGEN_VECTORIZE_NEON
  #include "src/Core/arch/NEON/PacketMath.h"
-  #include "src/Core/arch/NEON/MathFunctions.h"
  #include "src/Core/arch/NEON/Complex.h"
 #endif

-#if defined EIGEN_VECTORIZE_CUDA
-  #include "src/Core/arch/CUDA/PacketMath.h"
-  #include "src/Core/arch/CUDA/MathFunctions.h"
-#endif
-
 #include "src/Core/arch/Default/Settings.h"

-#include "src/Core/functors/BinaryFunctors.h"
-#include "src/Core/functors/UnaryFunctors.h"
-#include "src/Core/functors/NullaryFunctors.h"
-#include "src/Core/functors/StlFunctors.h"
-#include "src/Core/functors/AssignmentFunctors.h"
-
+#include "src/Core/Functors.h"
 #include "src/Core/DenseCoeffsBase.h"
 #include "src/Core/DenseBase.h"
 #include "src/Core/MatrixBase.h"
 #include "src/Core/EigenBase.h"

-#include "src/Core/Product.h"
-#include "src/Core/CoreEvaluators.h"
-#include "src/Core/AssignEvaluator.h"
-
 #ifndef EIGEN_PARSED_BY_DOXYGEN // work around Doxygen bug triggered by Assign.h r814874
                                // at least confirmed with Doxygen 1.5.5 and 1.5.6
  #include "src/Core/Assign.h"
 #endif

-#include "src/Core/ArrayBase.h"
 #include "src/Core/util/BlasUtil.h"
 #include "src/Core/DenseStorage.h"
 #include "src/Core/NestByValue.h"
-
-// #include "src/Core/ForceAlignedAccess.h"
-// #include "src/Core/Flagged.h"
-
+#include "src/Core/ForceAlignedAccess.h"
 #include "src/Core/ReturnByValue.h"
 #include "src/Core/NoAlias.h"
 #include "src/Core/PlainObjectBase.h"
@@ -357,12 +300,12 @@ using std::ptrdiff_t;
 #include "src/Core/SelfCwiseBinaryOp.h"
 #include "src/Core/Dot.h"
 #include "src/Core/StableNorm.h"
-#include "src/Core/Stride.h"
 #include "src/Core/MapBase.h"
+#include "src/Core/Stride.h"
 #include "src/Core/Map.h"
-#include "src/Core/Ref.h"
 #include "src/Core/Block.h"
 #include "src/Core/VectorBlock.h"
+#include "src/Core/Ref.h"
 #include "src/Core/Transpose.h"
 #include "src/Core/DiagonalMatrix.h"
 #include "src/Core/Diagonal.h"
@@ -375,15 +318,14 @@ using std::ptrdiff_t;
 #include "src/Core/IO.h"
 #include "src/Core/Swap.h"
 #include "src/Core/CommaInitializer.h"
+#include "src/Core/Flagged.h"
 #include "src/Core/ProductBase.h"
 #include "src/Core/GeneralProduct.h"
-#include "src/Core/Solve.h"
-#include "src/Core/Inverse.h"
 #include "src/Core/TriangularMatrix.h"
 #include "src/Core/SelfAdjointView.h"
 #include "src/Core/products/GeneralBlockPanelKernel.h"
 #include "src/Core/products/Parallelizer.h"
-#include "src/Core/ProductEvaluators.h"
+#include "src/Core/products/CoeffBasedProduct.h"
 #include "src/Core/products/GeneralMatrixVector.h"
 #include "src/Core/products/GeneralMatrixMatrix.h"
 #include "src/Core/SolveTriangular.h"
@@ -405,6 +347,7 @@ using std::ptrdiff_t;
 #include "src/Core/Random.h"
 #include "src/Core/Replicate.h"
 #include "src/Core/Reverse.h"
+#include "src/Core/ArrayBase.h"
 #include "src/Core/ArrayWrapper.h"

 #ifdef EIGEN_USE_BLAS
@@ -426,4 +369,8 @@ using std::ptrdiff_t;

 #include "src/Core/util/ReenableStupidWarnings.h"

+#ifdef EIGEN2_SUPPORT
+#include "Eigen2Support"
+#endif
+
 #endif // EIGEN_CORE_H
--- a/Eigen/Eigen
+++ b/Eigen/Eigen
@@ -1,2 +1,2 @@
 #include "Dense"
-#include "Sparse"
+//#include "Sparse"
--- a/Eigen/Eigen2Support
+++ b/Eigen/Eigen2Support
@@ -0,0 +1,95 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN2SUPPORT_H
+#define EIGEN2SUPPORT_H
+
+#if (!defined(EIGEN2_SUPPORT)) || (!defined(EIGEN_CORE_H))
+#error Eigen2 support must be enabled by defining EIGEN2_SUPPORT before including any Eigen header
+#endif
+
+#ifndef EIGEN_NO_EIGEN2_DEPRECATED_WARNING
+
+#if defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__clang__)
+#warning "Eigen2 support is deprecated in Eigen 3.2.x and it will be removed in Eigen 3.3. (Define EIGEN_NO_EIGEN2_DEPRECATED_WARNING to disable this warning)"
+#else
+#pragma message ("Eigen2 support is deprecated in Eigen 3.2.x and it will be removed in Eigen 3.3. (Define EIGEN_NO_EIGEN2_DEPRECATED_WARNING to disable this warning)")
+#endif
+
+#endif // EIGEN_NO_EIGEN2_DEPRECATED_WARNING
+
+#include "src/Core/util/DisableStupidWarnings.h"
+
+/** \ingroup Support_modules
+  * \defgroup Eigen2Support_Module Eigen2 support module
+  *
+  * \warning Eigen2 support is deprecated in Eigen 3.2.x and it will be removed in Eigen 3.3.
+  *
+  * This module provides a couple of deprecated functions improving the compatibility with Eigen2.
+  * 
+  * To use it, define EIGEN2_SUPPORT before including any Eigen header
+  * \code
+  * #define EIGEN2_SUPPORT
+  * \endcode
+  *
+  */
+
+#include "src/Eigen2Support/Macros.h"
+#include "src/Eigen2Support/Memory.h"
+#include "src/Eigen2Support/Meta.h"
+#include "src/Eigen2Support/Lazy.h"
+#include "src/Eigen2Support/Cwise.h"
+#include "src/Eigen2Support/CwiseOperators.h"
+#include "src/Eigen2Support/TriangularSolver.h"
+#include "src/Eigen2Support/Block.h"
+#include "src/Eigen2Support/VectorBlock.h"
+#include "src/Eigen2Support/Minor.h"
+#include "src/Eigen2Support/MathFunctions.h"
+
+
+#include "src/Core/util/ReenableStupidWarnings.h"
+
+// Eigen2 used to include iostream
+#include<iostream>
+
+#define EIGEN_USING_MATRIX_TYPEDEFS_FOR_TYPE_AND_SIZE(TypeSuffix, SizeSuffix) \
+using Eigen::Matrix##SizeSuffix##TypeSuffix; \
+using Eigen::Vector##SizeSuffix##TypeSuffix; \
+using Eigen::RowVector##SizeSuffix##TypeSuffix;
+
+#define EIGEN_USING_MATRIX_TYPEDEFS_FOR_TYPE(TypeSuffix) \
+EIGEN_USING_MATRIX_TYPEDEFS_FOR_TYPE_AND_SIZE(TypeSuffix, 2) \
+EIGEN_USING_MATRIX_TYPEDEFS_FOR_TYPE_AND_SIZE(TypeSuffix, 3) \
+EIGEN_USING_MATRIX_TYPEDEFS_FOR_TYPE_AND_SIZE(TypeSuffix, 4) \
+EIGEN_USING_MATRIX_TYPEDEFS_FOR_TYPE_AND_SIZE(TypeSuffix, X) \
+
+#define EIGEN_USING_MATRIX_TYPEDEFS \
+EIGEN_USING_MATRIX_TYPEDEFS_FOR_TYPE(i) \
+EIGEN_USING_MATRIX_TYPEDEFS_FOR_TYPE(f) \
+EIGEN_USING_MATRIX_TYPEDEFS_FOR_TYPE(d) \
+EIGEN_USING_MATRIX_TYPEDEFS_FOR_TYPE(cf) \
+EIGEN_USING_MATRIX_TYPEDEFS_FOR_TYPE(cd)
+
+#define USING_PART_OF_NAMESPACE_EIGEN \
+EIGEN_USING_MATRIX_TYPEDEFS \
+using Eigen::Matrix; \
+using Eigen::MatrixBase; \
+using Eigen::ei_random; \
+using Eigen::ei_real; \
+using Eigen::ei_imag; \
+using Eigen::ei_conj; \
+using Eigen::ei_abs; \
+using Eigen::ei_abs2; \
+using Eigen::ei_sqrt; \
+using Eigen::ei_exp; \
+using Eigen::ei_log; \
+using Eigen::ei_sin; \
+using Eigen::ei_cos;
+
+#endif // EIGEN2SUPPORT_H
--- a/Eigen/Geometry
+++ b/Eigen/Geometry
@@ -33,23 +33,27 @@
 #include "src/Geometry/OrthoMethods.h"
 #include "src/Geometry/EulerAngles.h"

-#include "src/Geometry/Homogeneous.h"
-#include "src/Geometry/RotationBase.h"
-#include "src/Geometry/Rotation2D.h"
-#include "src/Geometry/Quaternion.h"
-#include "src/Geometry/AngleAxis.h"
-#include "src/Geometry/Transform.h"
-#include "src/Geometry/Translation.h"
-#include "src/Geometry/Scaling.h"
-#include "src/Geometry/Hyperplane.h"
-#include "src/Geometry/ParametrizedLine.h"
-#include "src/Geometry/AlignedBox.h"
-#include "src/Geometry/Umeyama.h"
+#if EIGEN2_SUPPORT_STAGE > STAGE20_RESOLVE_API_CONFLICTS
+  #include "src/Geometry/Homogeneous.h"
+  #include "src/Geometry/RotationBase.h"
+  #include "src/Geometry/Rotation2D.h"
+  #include "src/Geometry/Quaternion.h"
+  #include "src/Geometry/AngleAxis.h"
+  #include "src/Geometry/Transform.h"
+  #include "src/Geometry/Translation.h"
+  #include "src/Geometry/Scaling.h"
+  #include "src/Geometry/Hyperplane.h"
+  #include "src/Geometry/ParametrizedLine.h"
+  #include "src/Geometry/AlignedBox.h"
+  #include "src/Geometry/Umeyama.h"

-// Use the SSE optimized version whenever possible. At the moment the
-// SSE version doesn't compile when AVX is enabled
-#if defined EIGEN_VECTORIZE_SSE && !defined EIGEN_VECTORIZE_AVX
-#include "src/Geometry/arch/Geometry_SSE.h"
+  #if defined EIGEN_VECTORIZE_SSE
+    #include "src/Geometry/arch/Geometry_SSE.h"
+  #endif
+#endif
+
+#ifdef EIGEN2_SUPPORT
+#include "src/Eigen2Support/Geometry/All.h"
 #endif

 #include "src/Core/util/ReenableStupidWarnings.h"
--- a/Eigen/IterativeLinearSolvers
+++ b/Eigen/IterativeLinearSolvers
@@ -26,7 +26,9 @@
  * \endcode
  */

-#include "src/IterativeLinearSolvers/SolveWithGuess.h"
+#include "src/misc/Solve.h"
+#include "src/misc/SparseSolve.h"
+
 #include "src/IterativeLinearSolvers/IterativeSolverBase.h"
 #include "src/IterativeLinearSolvers/BasicPreconditioners.h"
 #include "src/IterativeLinearSolvers/ConjugateGradient.h"
--- a/Eigen/LU
+++ b/Eigen/LU
@@ -16,6 +16,7 @@
  * \endcode
  */

+#include "src/misc/Solve.h"
 #include "src/misc/Kernel.h"
 #include "src/misc/Image.h"
 #include "src/LU/FullPivLU.h"
@@ -24,14 +25,16 @@
 #include "src/LU/PartialPivLU_MKL.h"
 #endif
 #include "src/LU/Determinant.h"
-#include "src/LU/InverseImpl.h"
+#include "src/LU/Inverse.h"

-// Use the SSE optimized version whenever possible. At the moment the
-// SSE version doesn't compile when AVX is enabled
-#if defined EIGEN_VECTORIZE_SSE && !defined EIGEN_VECTORIZE_AVX
+#if defined EIGEN_VECTORIZE_SSE
  #include "src/LU/arch/Inverse_SSE.h"
 #endif

+#ifdef EIGEN2_SUPPORT
+  #include "src/Eigen2Support/LU.h"
+#endif
+
 #include "src/Core/util/ReenableStupidWarnings.h"

 #endif // EIGEN_LU_MODULE_H
--- a/Eigen/LeastSquares
+++ b/Eigen/LeastSquares
@@ -0,0 +1,32 @@
+#ifndef EIGEN_REGRESSION_MODULE_H
+#define EIGEN_REGRESSION_MODULE_H
+
+#ifndef EIGEN2_SUPPORT
+#error LeastSquares is only available in Eigen2 support mode (define EIGEN2_SUPPORT)
+#endif
+
+// exclude from normal eigen3-only documentation
+#ifdef EIGEN2_SUPPORT
+
+#include "Core"
+
+#include "src/Core/util/DisableStupidWarnings.h"
+
+#include "Eigenvalues"
+#include "Geometry"
+
+/** \defgroup LeastSquares_Module LeastSquares module
+  * This module provides linear regression and related features.
+  *
+  * \code
+  * #include <Eigen/LeastSquares>
+  * \endcode
+  */
+
+#include "src/Eigen2Support/LeastSquares.h"
+
+#include "src/Core/util/ReenableStupidWarnings.h"
+
+#endif // EIGEN2_SUPPORT
+
+#endif // EIGEN_REGRESSION_MODULE_H
--- a/Eigen/PaStiXSupport
+++ b/Eigen/PaStiXSupport
@@ -35,8 +35,12 @@ extern "C" {
  *
  */

+#include "src/misc/Solve.h"
+#include "src/misc/SparseSolve.h"
+
 #include "src/PaStiXSupport/PaStiXSupport.h"

+
 #include "src/Core/util/ReenableStupidWarnings.h"

 #endif // EIGEN_PASTIXSUPPORT_MODULE_H
--- a/Eigen/QR
+++ b/Eigen/QR
@@ -15,15 +15,14 @@
  *
  * This module provides various QR decompositions
  * This module also provides some MatrixBase methods, including:
-  *  - MatrixBase::householderQr()
-  *  - MatrixBase::colPivHouseholderQr()
-  *  - MatrixBase::fullPivHouseholderQr()
+  *  - MatrixBase::qr(),
  *
  * \code
  * #include <Eigen/QR>
  * \endcode
  */

+#include "src/misc/Solve.h"
 #include "src/QR/HouseholderQR.h"
 #include "src/QR/FullPivHouseholderQR.h"
 #include "src/QR/ColPivHouseholderQR.h"
@@ -32,7 +31,15 @@
 #include "src/QR/ColPivHouseholderQR_MKL.h"
 #endif

+#ifdef EIGEN2_SUPPORT
+#include "src/Eigen2Support/QR.h"
+#endif
+
 #include "src/Core/util/ReenableStupidWarnings.h"

+#ifdef EIGEN2_SUPPORT
+#include "Eigenvalues"
+#endif
+
 #endif // EIGEN_QR_MODULE_H
 /* vim: set filetype=cpp et sw=2 ts=2 ai: */
--- a/Eigen/SPQRSupport
+++ b/Eigen/SPQRSupport
@@ -10,7 +10,7 @@
 /** \ingroup Support_modules
  * \defgroup SPQRSupport_Module SuiteSparseQR module
  * 
-  * This module provides an interface to the SPQR library, which is part of the <a href="http://www.cise.ufl.edu/research/sparse/SuiteSparse/">suitesparse</a> package.
+  * This module provides an interface to the SPQR library, which is part of the <a href="http://www.suitesparse.com">suitesparse</a> package.
  *
  * \code
  * #include <Eigen/SPQRSupport>
@@ -21,6 +21,8 @@
  *
  */

+#include "src/misc/Solve.h"
+#include "src/misc/SparseSolve.h"
 #include "src/CholmodSupport/CholmodSupport.h"
 #include "src/SPQRSupport/SuiteSparseQRSupport.h"

--- a/Eigen/SVD
+++ b/Eigen/SVD
@@ -12,25 +12,24 @@
  *
  *
  * This module provides SVD decomposition for matrices (both real and complex).
-  * Two decomposition algorithms are provided:
-  *  - JacobiSVD implementing two-sided Jacobi iterations is numerically very accurate, fast for small matrices, but very slow for larger ones.
-  *  - BDCSVD implementing a recursive divide & conquer strategy on top of an upper-bidiagonalization which remains fast for large problems.
-  * These decompositions are accessible via the respective classes and following MatrixBase methods:
+  * This decomposition is accessible via the following MatrixBase method:
  *  - MatrixBase::jacobiSvd()
-  *  - MatrixBase::bdcSvd()
  *
  * \code
  * #include <Eigen/SVD>
  * \endcode
  */

-#include "src/SVD/UpperBidiagonalization.h"
-#include "src/SVD/SVDBase.h"
+#include "src/misc/Solve.h"
 #include "src/SVD/JacobiSVD.h"
-#include "src/SVD/BDCSVD.h"
 #if defined(EIGEN_USE_LAPACKE) && !defined(EIGEN_USE_LAPACKE_STRICT)
 #include "src/SVD/JacobiSVD_MKL.h"
 #endif
+#include "src/SVD/UpperBidiagonalization.h"
+
+#ifdef EIGEN2_SUPPORT
+#include "src/Eigen2Support/SVD.h"
+#endif

 #include "src/Core/util/ReenableStupidWarnings.h"

--- a/Eigen/SparseCholesky
+++ b/Eigen/SparseCholesky
@@ -34,6 +34,8 @@
 #error The SparseCholesky module has nothing to offer in MPL2 only mode
 #endif

+#include "src/misc/Solve.h"
+#include "src/misc/SparseSolve.h"
 #include "src/SparseCholesky/SimplicialCholesky.h"

 #ifndef EIGEN_MPL2_ONLY
--- a/Eigen/SparseCore
+++ b/Eigen/SparseCore
@@ -14,7 +14,7 @@
 /** 
  * \defgroup SparseCore_Module SparseCore module
  *
-  * This module provides a sparse matrix representation, and basic associatd matrix manipulations
+  * This module provides a sparse matrix representation, and basic associated matrix manipulations
  * and operations.
  *
  * See the \ref TutorialSparse "Sparse tutorial"
@@ -26,35 +26,37 @@
  * This module depends on: Core.
  */

+namespace Eigen {
+
+/** The type used to identify a general sparse storage. */
+struct Sparse {};
+
+}
+
 #include "src/SparseCore/SparseUtil.h"
 #include "src/SparseCore/SparseMatrixBase.h"
-#include "src/SparseCore/SparseAssign.h"
 #include "src/SparseCore/CompressedStorage.h"
 #include "src/SparseCore/AmbiVector.h"
-#include "src/SparseCore/SparseCompressedBase.h"
 #include "src/SparseCore/SparseMatrix.h"
-#include "src/SparseCore/SparseMap.h"
 #include "src/SparseCore/MappedSparseMatrix.h"
 #include "src/SparseCore/SparseVector.h"
-#include "src/SparseCore/SparseRef.h"
+#include "src/SparseCore/SparseBlock.h"
+#include "src/SparseCore/SparseTranspose.h"
 #include "src/SparseCore/SparseCwiseUnaryOp.h"
 #include "src/SparseCore/SparseCwiseBinaryOp.h"
-#include "src/SparseCore/SparseTranspose.h"
-#include "src/SparseCore/SparseBlock.h"
 #include "src/SparseCore/SparseDot.h"
+#include "src/SparseCore/SparsePermutation.h"
 #include "src/SparseCore/SparseRedux.h"
-#include "src/SparseCore/SparseView.h"
-#include "src/SparseCore/SparseDiagonalProduct.h"
+#include "src/SparseCore/SparseFuzzy.h"
 #include "src/SparseCore/ConservativeSparseSparseProduct.h"
 #include "src/SparseCore/SparseSparseProductWithPruning.h"
 #include "src/SparseCore/SparseProduct.h"
 #include "src/SparseCore/SparseDenseProduct.h"
-#include "src/SparseCore/SparseSelfAdjointView.h"
+#include "src/SparseCore/SparseDiagonalProduct.h"
 #include "src/SparseCore/SparseTriangularView.h"
+#include "src/SparseCore/SparseSelfAdjointView.h"
 #include "src/SparseCore/TriangularSolver.h"
-#include "src/SparseCore/SparsePermutation.h"
-#include "src/SparseCore/SparseFuzzy.h"
-#include "src/SparseCore/SparseSolverBase.h"
+#include "src/SparseCore/SparseView.h"

 #include "src/Core/util/ReenableStupidWarnings.h"

--- a/Eigen/SparseLU
+++ b/Eigen/SparseLU
@@ -20,6 +20,9 @@
  * Please, see the documentation of the SparseLU class for more details.
  */

+#include "src/misc/Solve.h"
+#include "src/misc/SparseSolve.h"
+
 // Ordering interface
 #include "OrderingMethods"

--- a/Eigen/SparseQR
+++ b/Eigen/SparseQR
@@ -21,6 +21,9 @@
  * 
  */

+#include "src/misc/Solve.h"
+#include "src/misc/SparseSolve.h"
+
 #include "OrderingMethods"
 #include "src/SparseCore/SparseColEtree.h"
 #include "src/SparseQR/SparseQR.h"
--- a/Eigen/StdDeque
+++ b/Eigen/StdDeque
@@ -14,7 +14,7 @@
 #include "Core"
 #include <deque>

-#if EIGEN_COMP_MSVC && EIGEN_OS_WIN64 /* MSVC auto aligns in 64 bit builds */
+#if (defined(_MSC_VER) && defined(_WIN64)) /* MSVC auto aligns in 64 bit builds */

 #define EIGEN_DEFINE_STL_DEQUE_SPECIALIZATION(...)

--- a/Eigen/StdList
+++ b/Eigen/StdList
@@ -13,7 +13,7 @@
 #include "Core"
 #include <list>

-#if EIGEN_COMP_MSVC && EIGEN_OS_WIN64 /* MSVC auto aligns in 64 bit builds */    
+#if (defined(_MSC_VER) && defined(_WIN64)) /* MSVC auto aligns in 64 bit builds */    

 #define EIGEN_DEFINE_STL_LIST_SPECIALIZATION(...)

--- a/Eigen/StdVector
+++ b/Eigen/StdVector
@@ -14,7 +14,7 @@
 #include "Core"
 #include <vector>

-#if EIGEN_COMP_MSVC && EIGEN_OS_WIN64 /* MSVC auto aligns in 64 bit builds */
+#if (defined(_MSC_VER) && defined(_WIN64)) /* MSVC auto aligns in 64 bit builds */

 #define EIGEN_DEFINE_STL_VECTOR_SPECIALIZATION(...)

--- a/Eigen/SuperLUSupport
+++ b/Eigen/SuperLUSupport
@@ -48,8 +48,12 @@ namespace Eigen { struct SluMatrix; }
  *
  */

+#include "src/misc/Solve.h"
+#include "src/misc/SparseSolve.h"
+
 #include "src/SuperLUSupport/SuperLUSupport.h"

+
 #include "src/Core/util/ReenableStupidWarnings.h"

 #endif // EIGEN_SUPERLUSUPPORT_MODULE_H
--- a/Eigen/UmfPackSupport
+++ b/Eigen/UmfPackSupport
@@ -12,7 +12,7 @@ extern "C" {
 /** \ingroup Support_modules
  * \defgroup UmfPackSupport_Module UmfPackSupport module
  *
-  * This module provides an interface to the UmfPack library which is part of the <a href="http://www.cise.ufl.edu/research/sparse/SuiteSparse/">suitesparse</a> package.
+  * This module provides an interface to the UmfPack library which is part of the <a href="http://www.suitesparse.com">suitesparse</a> package.
  * It provides the following factorization class:
  * - class UmfPackLU: a multifrontal sequential LU factorization.
  *
@@ -26,6 +26,9 @@ extern "C" {
  *
  */

+#include "src/misc/Solve.h"
+#include "src/misc/SparseSolve.h"
+
 #include "src/UmfPackSupport/UmfPackSupport.h"

 #include "src/Core/util/ReenableStupidWarnings.h"
--- a/Eigen/src/Cholesky/LDLT.h
+++ b/Eigen/src/Cholesky/LDLT.h
@@ -43,7 +43,7 @@ namespace internal {
  * Remember that Cholesky decompositions are not rank-revealing. Also, do not use a Cholesky
  * decomposition to determine whether a system of equations has a solution.
  *
-  * \sa MatrixBase::ldlt(), SelfAdjointView::ldlt(), class LLT
+  * \sa MatrixBase::ldlt(), class LLT
  */
 template<typename _MatrixType, int _UpLo> class LDLT
 {
@@ -59,8 +59,7 @@ template<typename _MatrixType, int _UpLo> class LDLT
    };
    typedef typename MatrixType::Scalar Scalar;
    typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
-    typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
-    typedef typename MatrixType::StorageIndex StorageIndex;
+    typedef typename MatrixType::Index Index;
    typedef Matrix<Scalar, RowsAtCompileTime, 1, Options, MaxRowsAtCompileTime, 1> TmpMatrixType;

    typedef Transpositions<RowsAtCompileTime, MaxRowsAtCompileTime> TranspositionType;
@@ -86,7 +85,7 @@ template<typename _MatrixType, int _UpLo> class LDLT
      * according to the specified problem \a size.
      * \sa LDLT()
      */
-    explicit LDLT(Index size)
+    LDLT(Index size)
      : m_matrix(size, size),
        m_transpositions(size),
        m_temporary(size),
@@ -99,7 +98,7 @@ template<typename _MatrixType, int _UpLo> class LDLT
      * This calculates the decomposition for the input \a matrix.
      * \sa LDLT(Index size)
      */
-    explicit LDLT(const MatrixType& matrix)
+    LDLT(const MatrixType& matrix)
      : m_matrix(matrix.rows(), matrix.cols()),
        m_transpositions(matrix.rows()),
        m_temporary(matrix.rows()),
@@ -152,6 +151,13 @@ template<typename _MatrixType, int _UpLo> class LDLT
      eigen_assert(m_isInitialized && "LDLT is not initialized.");
      return m_sign == internal::PositiveSemiDef || m_sign == internal::ZeroSign;
    }
+    
+    #ifdef EIGEN2_SUPPORT
+    inline bool isPositiveDefinite() const
+    {
+      return isPositive();
+    }
+    #endif

    /** \returns true if the matrix is negative (semidefinite) */
    inline bool isNegative(void) const
@@ -173,18 +179,27 @@ template<typename _MatrixType, int _UpLo> class LDLT
      * least-square solution of \f$ D y_3 = y_2 \f$ is computed. This does not mean that this function
      * computes the least-square solution of \f$ A x = b \f$ is \f$ A \f$ is singular.
      *
-      * \sa MatrixBase::ldlt(), SelfAdjointView::ldlt()
+      * \sa MatrixBase::ldlt()
      */
    template<typename Rhs>
-    inline const Solve<LDLT, Rhs>
+    inline const internal::solve_retval<LDLT, Rhs>
    solve(const MatrixBase<Rhs>& b) const
    {
      eigen_assert(m_isInitialized && "LDLT is not initialized.");
      eigen_assert(m_matrix.rows()==b.rows()
                && "LDLT::solve(): invalid number of rows of the right hand side matrix b");
-      return Solve<LDLT, Rhs>(*this, b.derived());
+      return internal::solve_retval<LDLT, Rhs>(*this, b.derived());
    }

+    #ifdef EIGEN2_SUPPORT
+    template<typename OtherDerived, typename ResultType>
+    bool solve(const MatrixBase<OtherDerived>& b, ResultType *result) const
+    {
+      *result = this->solve(b);
+      return true;
+    }
+    #endif
+
    template<typename Derived>
    bool solveInPlace(MatrixBase<Derived> &bAndX) const;

@@ -218,14 +233,13 @@ template<typename _MatrixType, int _UpLo> class LDLT
      eigen_assert(m_isInitialized && "LDLT is not initialized.");
      return Success;
    }
-    
-    #ifndef EIGEN_PARSED_BY_DOXYGEN
-    template<typename RhsType, typename DstType>
-    EIGEN_DEVICE_FUNC
-    void _solve_impl(const RhsType &rhs, DstType &dst) const;
-    #endif

  protected:
+    
+    static void check_template_parameters()
+    {
+      EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
+    }

    /** \internal
      * Used to compute and store the Cholesky decomposition A = L D L^* = U^* D U.
@@ -252,7 +266,7 @@ template<> struct ldlt_inplace<Lower>
    using std::abs;
    typedef typename MatrixType::Scalar Scalar;
    typedef typename MatrixType::RealScalar RealScalar;
-    typedef typename TranspositionType::StorageIndex IndexType;
+    typedef typename MatrixType::Index Index;
    eigen_assert(mat.rows()==mat.cols());
    const Index size = mat.rows();

@@ -272,7 +286,7 @@ template<> struct ldlt_inplace<Lower>
      mat.diagonal().tail(size-k).cwiseAbs().maxCoeff(&index_of_biggest_in_corner);
      index_of_biggest_in_corner += k;

-      transpositions.coeffRef(k) = IndexType(index_of_biggest_in_corner);
+      transpositions.coeffRef(k) = index_of_biggest_in_corner;
      if(k != index_of_biggest_in_corner)
      {
        // apply the transposition while taking care to consider only
@@ -281,7 +295,7 @@ template<> struct ldlt_inplace<Lower>
        mat.row(k).head(k).swap(mat.row(index_of_biggest_in_corner).head(k));
        mat.col(k).tail(s).swap(mat.col(index_of_biggest_in_corner).tail(s));
        std::swap(mat.coeffRef(k,k),mat.coeffRef(index_of_biggest_in_corner,index_of_biggest_in_corner));
-        for(Index i=k+1;i<index_of_biggest_in_corner;++i)
+        for(int i=k+1;i<index_of_biggest_in_corner;++i)
        {
          Scalar tmp = mat.coeffRef(i,k);
          mat.coeffRef(i,k) = numext::conj(mat.coeffRef(index_of_biggest_in_corner,i));
@@ -342,6 +356,7 @@ template<> struct ldlt_inplace<Lower>
    using numext::isfinite;
    typedef typename MatrixType::Scalar Scalar;
    typedef typename MatrixType::RealScalar RealScalar;
+    typedef typename MatrixType::Index Index;

    const Index size = mat.rows();
    eigen_assert(mat.cols() == size && w.size()==size);
@@ -405,16 +420,16 @@ template<typename MatrixType> struct LDLT_Traits<MatrixType,Lower>
 {
  typedef const TriangularView<const MatrixType, UnitLower> MatrixL;
  typedef const TriangularView<const typename MatrixType::AdjointReturnType, UnitUpper> MatrixU;
-  static inline MatrixL getL(const MatrixType& m) { return MatrixL(m); }
-  static inline MatrixU getU(const MatrixType& m) { return MatrixU(m.adjoint()); }
+  static inline MatrixL getL(const MatrixType& m) { return m; }
+  static inline MatrixU getU(const MatrixType& m) { return m.adjoint(); }
 };

 template<typename MatrixType> struct LDLT_Traits<MatrixType,Upper>
 {
  typedef const TriangularView<const typename MatrixType::AdjointReturnType, UnitLower> MatrixL;
  typedef const TriangularView<const MatrixType, UnitUpper> MatrixU;
-  static inline MatrixL getL(const MatrixType& m) { return MatrixL(m.adjoint()); }
-  static inline MatrixU getU(const MatrixType& m) { return MatrixU(m); }
+  static inline MatrixL getL(const MatrixType& m) { return m.adjoint(); }
+  static inline MatrixU getU(const MatrixType& m) { return m; }
 };

 } // end namespace internal
@@ -424,6 +439,8 @@ template<typename MatrixType> struct LDLT_Traits<MatrixType,Upper>
 template<typename MatrixType, int _UpLo>
 LDLT<MatrixType,_UpLo>& LDLT<MatrixType,_UpLo>::compute(const MatrixType& a)
 {
+  check_template_parameters();
+  
  eigen_assert(a.rows()==a.cols());
  const Index size = a.rows();

@@ -447,9 +464,8 @@ LDLT<MatrixType,_UpLo>& LDLT<MatrixType,_UpLo>::compute(const MatrixType& a)
  */
 template<typename MatrixType, int _UpLo>
 template<typename Derived>
-LDLT<MatrixType,_UpLo>& LDLT<MatrixType,_UpLo>::rankUpdate(const MatrixBase<Derived>& w, const typename NumTraits<typename MatrixType::Scalar>::Real& sigma)
+LDLT<MatrixType,_UpLo>& LDLT<MatrixType,_UpLo>::rankUpdate(const MatrixBase<Derived>& w, const typename LDLT<MatrixType,_UpLo>::RealScalar& sigma)
 {
-  typedef typename TranspositionType::StorageIndex IndexType;
  const Index size = w.rows();
  if (m_isInitialized)
  {
@@ -461,7 +477,7 @@ LDLT<MatrixType,_UpLo>& LDLT<MatrixType,_UpLo>::rankUpdate(const MatrixBase<Deri
    m_matrix.setZero();
    m_transpositions.resize(size);
    for (Index i = 0; i < size; i++)
-      m_transpositions.coeffRef(i) = IndexType(i);
+      m_transpositions.coeffRef(i) = i;
    m_temporary.resize(size);
    m_sign = sigma>=0 ? internal::PositiveSemiDef : internal::NegativeSemiDef;
    m_isInitialized = true;
@@ -472,45 +488,53 @@ LDLT<MatrixType,_UpLo>& LDLT<MatrixType,_UpLo>::rankUpdate(const MatrixBase<Deri
  return *this;
 }

-#ifndef EIGEN_PARSED_BY_DOXYGEN
-template<typename _MatrixType, int _UpLo>
-template<typename RhsType, typename DstType>
-void LDLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) const
+namespace internal {
+template<typename _MatrixType, int _UpLo, typename Rhs>
+struct solve_retval<LDLT<_MatrixType,_UpLo>, Rhs>
+  : solve_retval_base<LDLT<_MatrixType,_UpLo>, Rhs>
 {
-  eigen_assert(rhs.rows() == rows());
-  // dst = P b
-  dst = m_transpositions * rhs;
+  typedef LDLT<_MatrixType,_UpLo> LDLTType;
+  EIGEN_MAKE_SOLVE_HELPERS(LDLTType,Rhs)

-  // dst = L^-1 (P b)
-  matrixL().solveInPlace(dst);
-
-  // dst = D^-1 (L^-1 P b)
-  // more precisely, use pseudo-inverse of D (see bug 241)
-  using std::abs;
-  const typename Diagonal<const MatrixType>::RealReturnType vecD(vectorD());
-  // In some previous versions, tolerance was set to the max of 1/highest and the maximal diagonal entry * epsilon
-  // as motivated by LAPACK's xGELSS:
-  // RealScalar tolerance = numext::maxi(vectorD.array().abs().maxCoeff() *NumTraits<RealScalar>::epsilon(),RealScalar(1) / NumTraits<RealScalar>::highest());
-  // However, LDLT is not rank revealing, and so adjusting the tolerance wrt to the highest
-  // diagonal element is not well justified and to numerical issues in some cases.
-  // Moreover, Lapack's xSYTRS routines use 0 for the tolerance.
-  RealScalar tolerance = RealScalar(1) / NumTraits<RealScalar>::highest();
-  
-  for (Index i = 0; i < vecD.size(); ++i)
+  template<typename Dest> void evalTo(Dest& dst) const
  {
-    if(abs(vecD(i)) > tolerance)
-      dst.row(i) /= vecD(i);
-    else
-      dst.row(i).setZero();
+    eigen_assert(rhs().rows() == dec().matrixLDLT().rows());
+    // dst = P b
+    dst = dec().transpositionsP() * rhs();
+
+    // dst = L^-1 (P b)
+    dec().matrixL().solveInPlace(dst);
+
+    // dst = D^-1 (L^-1 P b)
+    // more precisely, use pseudo-inverse of D (see bug 241)
+    using std::abs;
+    using std::max;
+    typedef typename LDLTType::MatrixType MatrixType;
+    typedef typename LDLTType::RealScalar RealScalar;
+    const typename Diagonal<const MatrixType>::RealReturnType vectorD(dec().vectorD());
+    // In some previous versions, tolerance was set to the max of 1/highest and the maximal diagonal entry * epsilon
+    // as motivated by LAPACK's xGELSS:
+    // RealScalar tolerance = (max)(vectorD.array().abs().maxCoeff() *NumTraits<RealScalar>::epsilon(),RealScalar(1) / NumTraits<RealScalar>::highest());
+    // However, LDLT is not rank revealing, and so adjusting the tolerance wrt to the highest
+    // diagonal element is not well justified and to numerical issues in some cases.
+    // Moreover, Lapack's xSYTRS routines use 0 for the tolerance.
+    RealScalar tolerance = RealScalar(1) / NumTraits<RealScalar>::highest();
+    
+    for (Index i = 0; i < vectorD.size(); ++i) {
+      if(abs(vectorD(i)) > tolerance)
+        dst.row(i) /= vectorD(i);
+      else
+        dst.row(i).setZero();
+    }
+
+    // dst = L^-T (D^-1 L^-1 P b)
+    dec().matrixU().solveInPlace(dst);
+
+    // dst = P^-1 (L^-T D^-1 L^-1 P b) = A^-1 b
+    dst = dec().transpositionsP().transpose() * dst;
  }
-
-  // dst = L^-T (D^-1 L^-1 P b)
-  matrixU().solveInPlace(dst);
-
-  // dst = P^-1 (L^-T D^-1 L^-1 P b) = A^-1 b
-  dst = m_transpositions.transpose() * dst;
+};
 }
-#endif

 /** \internal use x = ldlt_object.solve(x);
  *
@@ -562,10 +586,8 @@ MatrixType LDLT<MatrixType,_UpLo>::reconstructedMatrix() const
  return res;
 }

-#ifndef __CUDACC__
 /** \cholesky_module
  * \returns the Cholesky decomposition with full pivoting without square root of \c *this
-  * \sa MatrixBase::ldlt()
  */
 template<typename MatrixType, unsigned int UpLo>
 inline const LDLT<typename SelfAdjointView<MatrixType, UpLo>::PlainObject, UpLo>
@@ -576,7 +598,6 @@ SelfAdjointView<MatrixType, UpLo>::ldlt() const

 /** \cholesky_module
  * \returns the Cholesky decomposition with full pivoting without square root of \c *this
-  * \sa SelfAdjointView::ldlt()
  */
 template<typename Derived>
 inline const LDLT<typename MatrixBase<Derived>::PlainObject>
@@ -584,7 +605,6 @@ MatrixBase<Derived>::ldlt() const
 {
  return LDLT<PlainObject>(derived());
 }
-#endif // __CUDACC__

 } // end namespace Eigen

--- a/Eigen/src/Cholesky/LLT.h
+++ b/Eigen/src/Cholesky/LLT.h
@@ -41,7 +41,7 @@ template<typename MatrixType, int UpLo> struct LLT_Traits;
  * Example: \include LLT_example.cpp
  * Output: \verbinclude LLT_example.out
  *    
-  * \sa MatrixBase::llt(), SelfAdjointView::llt(), class LDLT
+  * \sa MatrixBase::llt(), class LDLT
  */
 /* HEY THIS DOX IS DISABLED BECAUSE THERE's A BUG EITHER HERE OR IN LDLT ABOUT THAT (OR BOTH)
  * Note that during the decomposition, only the upper triangular part of A is considered. Therefore,
@@ -59,8 +59,7 @@ template<typename _MatrixType, int _UpLo> class LLT
    };
    typedef typename MatrixType::Scalar Scalar;
    typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
-    typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
-    typedef typename MatrixType::StorageIndex StorageIndex;
+    typedef typename MatrixType::Index Index;

    enum {
      PacketSize = internal::packet_traits<Scalar>::size,
@@ -84,10 +83,10 @@ template<typename _MatrixType, int _UpLo> class LLT
      * according to the specified problem \a size.
      * \sa LLT()
      */
-    explicit LLT(Index size) : m_matrix(size, size),
+    LLT(Index size) : m_matrix(size, size),
                    m_isInitialized(false) {}

-    explicit LLT(const MatrixType& matrix)
+    LLT(const MatrixType& matrix)
      : m_matrix(matrix.rows(), matrix.cols()),
        m_isInitialized(false)
    {
@@ -116,18 +115,29 @@ template<typename _MatrixType, int _UpLo> class LLT
      * Example: \include LLT_solve.cpp
      * Output: \verbinclude LLT_solve.out
      *
-      * \sa solveInPlace(), MatrixBase::llt(), SelfAdjointView::llt()
+      * \sa solveInPlace(), MatrixBase::llt()
      */
    template<typename Rhs>
-    inline const Solve<LLT, Rhs>
+    inline const internal::solve_retval<LLT, Rhs>
    solve(const MatrixBase<Rhs>& b) const
    {
      eigen_assert(m_isInitialized && "LLT is not initialized.");
      eigen_assert(m_matrix.rows()==b.rows()
                && "LLT::solve(): invalid number of rows of the right hand side matrix b");
-      return Solve<LLT, Rhs>(*this, b.derived());
+      return internal::solve_retval<LLT, Rhs>(*this, b.derived());
    }

+    #ifdef EIGEN2_SUPPORT
+    template<typename OtherDerived, typename ResultType>
+    bool solve(const MatrixBase<OtherDerived>& b, ResultType *result) const
+    {
+      *result = this->solve(b);
+      return true;
+    }
+    
+    bool isPositiveDefinite() const { return true; }
+    #endif
+
    template<typename Derived>
    void solveInPlace(MatrixBase<Derived> &bAndX) const;

@@ -162,14 +172,14 @@ template<typename _MatrixType, int _UpLo> class LLT

    template<typename VectorType>
    LLT rankUpdate(const VectorType& vec, const RealScalar& sigma = 1);
-    
-    #ifndef EIGEN_PARSED_BY_DOXYGEN
-    template<typename RhsType, typename DstType>
-    EIGEN_DEVICE_FUNC
-    void _solve_impl(const RhsType &rhs, DstType &dst) const;
-    #endif

  protected:
+    
+    static void check_template_parameters()
+    {
+      EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
+    }
+    
    /** \internal
      * Used to compute and store L
      * The strict upper part is not used and even not initialized.
@@ -184,11 +194,12 @@ namespace internal {
 template<typename Scalar, int UpLo> struct llt_inplace;

 template<typename MatrixType, typename VectorType>
-static Index llt_rank_update_lower(MatrixType& mat, const VectorType& vec, const typename MatrixType::RealScalar& sigma)
+static typename MatrixType::Index llt_rank_update_lower(MatrixType& mat, const VectorType& vec, const typename MatrixType::RealScalar& sigma)
 {
  using std::sqrt;
  typedef typename MatrixType::Scalar Scalar;
  typedef typename MatrixType::RealScalar RealScalar;
+  typedef typename MatrixType::Index Index;
  typedef typename MatrixType::ColXpr ColXpr;
  typedef typename internal::remove_all<ColXpr>::type ColXprCleaned;
  typedef typename ColXprCleaned::SegmentReturnType ColXprSegment;
@@ -257,9 +268,10 @@ template<typename Scalar> struct llt_inplace<Scalar, Lower>
 {
  typedef typename NumTraits<Scalar>::Real RealScalar;
  template<typename MatrixType>
-  static Index unblocked(MatrixType& mat)
+  static typename MatrixType::Index unblocked(MatrixType& mat)
  {
    using std::sqrt;
+    typedef typename MatrixType::Index Index;
    
    eigen_assert(mat.rows()==mat.cols());
    const Index size = mat.rows();
@@ -277,14 +289,15 @@ template<typename Scalar> struct llt_inplace<Scalar, Lower>
        return k;
      mat.coeffRef(k,k) = x = sqrt(x);
      if (k>0 && rs>0) A21.noalias() -= A20 * A10.adjoint();
-      if (rs>0) A21 *= RealScalar(1)/x;
+      if (rs>0) A21 /= x;
    }
    return -1;
  }

  template<typename MatrixType>
-  static Index blocked(MatrixType& m)
+  static typename MatrixType::Index blocked(MatrixType& m)
  {
+    typedef typename MatrixType::Index Index;
    eigen_assert(m.rows()==m.cols());
    Index size = m.rows();
    if(size<32)
@@ -315,7 +328,7 @@ template<typename Scalar> struct llt_inplace<Scalar, Lower>
  }

  template<typename MatrixType, typename VectorType>
-  static Index rankUpdate(MatrixType& mat, const VectorType& vec, const RealScalar& sigma)
+  static typename MatrixType::Index rankUpdate(MatrixType& mat, const VectorType& vec, const RealScalar& sigma)
  {
    return Eigen::internal::llt_rank_update_lower(mat, vec, sigma);
  }
@@ -326,19 +339,19 @@ template<typename Scalar> struct llt_inplace<Scalar, Upper>
  typedef typename NumTraits<Scalar>::Real RealScalar;

  template<typename MatrixType>
-  static EIGEN_STRONG_INLINE Index unblocked(MatrixType& mat)
+  static EIGEN_STRONG_INLINE typename MatrixType::Index unblocked(MatrixType& mat)
  {
    Transpose<MatrixType> matt(mat);
    return llt_inplace<Scalar, Lower>::unblocked(matt);
  }
  template<typename MatrixType>
-  static EIGEN_STRONG_INLINE Index blocked(MatrixType& mat)
+  static EIGEN_STRONG_INLINE typename MatrixType::Index blocked(MatrixType& mat)
  {
    Transpose<MatrixType> matt(mat);
    return llt_inplace<Scalar, Lower>::blocked(matt);
  }
  template<typename MatrixType, typename VectorType>
-  static Index rankUpdate(MatrixType& mat, const VectorType& vec, const RealScalar& sigma)
+  static typename MatrixType::Index rankUpdate(MatrixType& mat, const VectorType& vec, const RealScalar& sigma)
  {
    Transpose<MatrixType> matt(mat);
    return llt_inplace<Scalar, Lower>::rankUpdate(matt, vec.conjugate(), sigma);
@@ -349,8 +362,8 @@ template<typename MatrixType> struct LLT_Traits<MatrixType,Lower>
 {
  typedef const TriangularView<const MatrixType, Lower> MatrixL;
  typedef const TriangularView<const typename MatrixType::AdjointReturnType, Upper> MatrixU;
-  static inline MatrixL getL(const MatrixType& m) { return MatrixL(m); }
-  static inline MatrixU getU(const MatrixType& m) { return MatrixU(m.adjoint()); }
+  static inline MatrixL getL(const MatrixType& m) { return m; }
+  static inline MatrixU getU(const MatrixType& m) { return m.adjoint(); }
  static bool inplace_decomposition(MatrixType& m)
  { return llt_inplace<typename MatrixType::Scalar, Lower>::blocked(m)==-1; }
 };
@@ -359,8 +372,8 @@ template<typename MatrixType> struct LLT_Traits<MatrixType,Upper>
 {
  typedef const TriangularView<const typename MatrixType::AdjointReturnType, Lower> MatrixL;
  typedef const TriangularView<const MatrixType, Upper> MatrixU;
-  static inline MatrixL getL(const MatrixType& m) { return MatrixL(m.adjoint()); }
-  static inline MatrixU getU(const MatrixType& m) { return MatrixU(m); }
+  static inline MatrixL getL(const MatrixType& m) { return m.adjoint(); }
+  static inline MatrixU getU(const MatrixType& m) { return m; }
  static bool inplace_decomposition(MatrixType& m)
  { return llt_inplace<typename MatrixType::Scalar, Upper>::blocked(m)==-1; }
 };
@@ -377,6 +390,8 @@ template<typename MatrixType> struct LLT_Traits<MatrixType,Upper>
 template<typename MatrixType, int _UpLo>
 LLT<MatrixType,_UpLo>& LLT<MatrixType,_UpLo>::compute(const MatrixType& a)
 {
+  check_template_parameters();
+  
  eigen_assert(a.rows()==a.cols());
  const Index size = a.rows();
  m_matrix.resize(size, size);
@@ -408,16 +423,22 @@ LLT<_MatrixType,_UpLo> LLT<_MatrixType,_UpLo>::rankUpdate(const VectorType& v, c

  return *this;
 }
- 
-#ifndef EIGEN_PARSED_BY_DOXYGEN
-template<typename _MatrixType,int _UpLo>
-template<typename RhsType, typename DstType>
-void LLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) const
+    
+namespace internal {
+template<typename _MatrixType, int UpLo, typename Rhs>
+struct solve_retval<LLT<_MatrixType, UpLo>, Rhs>
+  : solve_retval_base<LLT<_MatrixType, UpLo>, Rhs>
 {
-  dst = rhs;
-  solveInPlace(dst);
+  typedef LLT<_MatrixType,UpLo> LLTType;
+  EIGEN_MAKE_SOLVE_HELPERS(LLTType,Rhs)
+
+  template<typename Dest> void evalTo(Dest& dst) const
+  {
+    dst = rhs();
+    dec().solveInPlace(dst);
+  }
+};
 }
-#endif

 /** \internal use x = llt_object.solve(x);
  * 
@@ -452,10 +473,8 @@ MatrixType LLT<MatrixType,_UpLo>::reconstructedMatrix() const
  return matrixL() * matrixL().adjoint().toDenseMatrix();
 }

-#ifndef __CUDACC__
 /** \cholesky_module
  * \returns the LLT decomposition of \c *this
-  * \sa SelfAdjointView::llt()
  */
 template<typename Derived>
 inline const LLT<typename MatrixBase<Derived>::PlainObject>
@@ -466,7 +485,6 @@ MatrixBase<Derived>::llt() const

 /** \cholesky_module
  * \returns the LLT decomposition of \c *this
-  * \sa SelfAdjointView::llt()
  */
 template<typename MatrixType, unsigned int UpLo>
 inline const LLT<typename SelfAdjointView<MatrixType, UpLo>::PlainObject, UpLo>
@@ -474,8 +492,7 @@ SelfAdjointView<MatrixType, UpLo>::llt() const
 {
  return LLT<PlainObject,UpLo>(m_matrix);
 }
-#endif // __CUDACC__
-  
+
 } // end namespace Eigen

 #endif // EIGEN_LLT_H
--- a/Eigen/src/Cholesky/LLT_MKL.h
+++ b/Eigen/src/Cholesky/LLT_MKL.h
@@ -46,7 +46,7 @@ template<typename Scalar> struct mkl_llt;
 template<> struct mkl_llt<EIGTYPE> \
 { \
  template<typename MatrixType> \
-  static inline Index potrf(MatrixType& m, char uplo) \
+  static inline typename MatrixType::Index potrf(MatrixType& m, char uplo) \
  { \
    lapack_int matrix_order; \
    lapack_int size, lda, info, StorageOrder; \
@@ -60,30 +60,30 @@ template<> struct mkl_llt<EIGTYPE> \
    lda = m.outerStride(); \
 \
    info = LAPACKE_##MKLPREFIX##potrf( matrix_order, uplo, size, (MKLTYPE*)a, lda ); \
-    info = (info==0) ? Success : NumericalIssue; \
+    info = (info==0) ? -1 : info>0 ? info-1 : size; \
    return info; \
  } \
 }; \
 template<> struct llt_inplace<EIGTYPE, Lower> \
 { \
  template<typename MatrixType> \
-  static Index blocked(MatrixType& m) \
+  static typename MatrixType::Index blocked(MatrixType& m) \
  { \
    return mkl_llt<EIGTYPE>::potrf(m, 'L'); \
  } \
  template<typename MatrixType, typename VectorType> \
-  static Index rankUpdate(MatrixType& mat, const VectorType& vec, const typename MatrixType::RealScalar& sigma) \
+  static typename MatrixType::Index rankUpdate(MatrixType& mat, const VectorType& vec, const typename MatrixType::RealScalar& sigma) \
  { return Eigen::internal::llt_rank_update_lower(mat, vec, sigma); } \
 }; \
 template<> struct llt_inplace<EIGTYPE, Upper> \
 { \
  template<typename MatrixType> \
-  static Index blocked(MatrixType& m) \
+  static typename MatrixType::Index blocked(MatrixType& m) \
  { \
    return mkl_llt<EIGTYPE>::potrf(m, 'U'); \
  } \
  template<typename MatrixType, typename VectorType> \
-  static Index rankUpdate(MatrixType& mat, const VectorType& vec, const typename MatrixType::RealScalar& sigma) \
+  static typename MatrixType::Index rankUpdate(MatrixType& mat, const VectorType& vec, const typename MatrixType::RealScalar& sigma) \
  { \
    Transpose<MatrixType> matt(mat); \
    return llt_inplace<EIGTYPE, Lower>::rankUpdate(matt, vec.conjugate(), sigma); \
--- a/Eigen/src/CholmodSupport/CholmodSupport.h
+++ b/Eigen/src/CholmodSupport/CholmodSupport.h
@@ -48,8 +48,8 @@ void cholmod_configure_matrix(CholmodType& mat)
 /** Wraps the Eigen sparse matrix \a mat into a Cholmod sparse matrix object.
  * Note that the data are shared.
  */
-template<typename _Scalar, int _Options, typename _StorageIndex>
-cholmod_sparse viewAsCholmod(SparseMatrix<_Scalar,_Options,_StorageIndex>& mat)
+template<typename _Scalar, int _Options, typename _Index>
+cholmod_sparse viewAsCholmod(SparseMatrix<_Scalar,_Options,_Index>& mat)
 {
  cholmod_sparse res;
  res.nzmax   = mat.nonZeros();
@@ -74,11 +74,11 @@ cholmod_sparse viewAsCholmod(SparseMatrix<_Scalar,_Options,_StorageIndex>& mat)
  res.dtype   = 0;
  res.stype   = -1;
  
-  if (internal::is_same<_StorageIndex,int>::value)
+  if (internal::is_same<_Index,int>::value)
  {
    res.itype = CHOLMOD_INT;
  }
-  else if (internal::is_same<_StorageIndex,UF_long>::value)
+  else if (internal::is_same<_Index,SuiteSparse_long>::value)
  {
    res.itype = CHOLMOD_LONG;
  }
@@ -105,7 +105,7 @@ const cholmod_sparse viewAsCholmod(const SparseMatrix<_Scalar,_Options,_Index>&
 /** Returns a view of the Eigen sparse matrix \a mat as Cholmod sparse matrix.
  * The data are not copied but shared. */
 template<typename _Scalar, int _Options, typename _Index, unsigned int UpLo>
-cholmod_sparse viewAsCholmod(const SparseSelfAdjointView<const SparseMatrix<_Scalar,_Options,_Index>, UpLo>& mat)
+cholmod_sparse viewAsCholmod(const SparseSelfAdjointView<SparseMatrix<_Scalar,_Options,_Index>, UpLo>& mat)
 {
  cholmod_sparse res = viewAsCholmod(mat.matrix().const_cast_derived());
  
@@ -138,12 +138,12 @@ cholmod_dense viewAsCholmod(MatrixBase<Derived>& mat)

 /** Returns a view of the Cholmod sparse matrix \a cm as an Eigen sparse matrix.
  * The data are not copied but shared. */
-template<typename Scalar, int Flags, typename StorageIndex>
-MappedSparseMatrix<Scalar,Flags,StorageIndex> viewAsEigen(cholmod_sparse& cm)
+template<typename Scalar, int Flags, typename Index>
+MappedSparseMatrix<Scalar,Flags,Index> viewAsEigen(cholmod_sparse& cm)
 {
-  return MappedSparseMatrix<Scalar,Flags,StorageIndex>
-         (cm.nrow, cm.ncol, static_cast<StorageIndex*>(cm.p)[cm.ncol],
-          static_cast<StorageIndex*>(cm.p), static_cast<StorageIndex*>(cm.i),static_cast<Scalar*>(cm.x) );
+  return MappedSparseMatrix<Scalar,Flags,Index>
+         (cm.nrow, cm.ncol, static_cast<Index*>(cm.p)[cm.ncol],
+          static_cast<Index*>(cm.p), static_cast<Index*>(cm.i),static_cast<Scalar*>(cm.x) );
 }

 enum CholmodMode {
@@ -157,31 +157,27 @@ enum CholmodMode {
  * \sa class CholmodSupernodalLLT, class CholmodSimplicialLDLT, class CholmodSimplicialLLT
  */
 template<typename _MatrixType, int _UpLo, typename Derived>
-class CholmodBase : public SparseSolverBase<Derived>
+class CholmodBase : internal::noncopyable
 {
-  protected:
-    typedef SparseSolverBase<Derived> Base;
-    using Base::derived;
-    using Base::m_isInitialized;
  public:
    typedef _MatrixType MatrixType;
    enum { UpLo = _UpLo };
    typedef typename MatrixType::Scalar Scalar;
    typedef typename MatrixType::RealScalar RealScalar;
    typedef MatrixType CholMatrixType;
-    typedef typename MatrixType::StorageIndex StorageIndex;
+    typedef typename MatrixType::Index Index;

  public:

    CholmodBase()
-      : m_cholmodFactor(0), m_info(Success)
+      : m_cholmodFactor(0), m_info(Success), m_isInitialized(false)
    {
      m_shiftOffset[0] = m_shiftOffset[1] = RealScalar(0.0);
      cholmod_start(&m_cholmod);
    }

-    explicit CholmodBase(const MatrixType& matrix)
-      : m_cholmodFactor(0), m_info(Success)
+    CholmodBase(const MatrixType& matrix)
+      : m_cholmodFactor(0), m_info(Success), m_isInitialized(false)
    {
      m_shiftOffset[0] = m_shiftOffset[1] = RealScalar(0.0);
      cholmod_start(&m_cholmod);
@@ -195,8 +191,11 @@ class CholmodBase : public SparseSolverBase<Derived>
      cholmod_finish(&m_cholmod);
    }
    
-    inline StorageIndex cols() const { return internal::convert_index<StorageIndex, Index>(m_cholmodFactor->n); }
-    inline StorageIndex rows() const { return internal::convert_index<StorageIndex, Index>(m_cholmodFactor->n); }
+    inline Index cols() const { return m_cholmodFactor->n; }
+    inline Index rows() const { return m_cholmodFactor->n; }
+    
+    Derived& derived() { return *static_cast<Derived*>(this); }
+    const Derived& derived() const { return *static_cast<const Derived*>(this); }
    
    /** \brief Reports whether previous computation was successful.
      *
@@ -217,6 +216,34 @@ class CholmodBase : public SparseSolverBase<Derived>
      return derived();
    }
    
+    /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A.
+      *
+      * \sa compute()
+      */
+    template<typename Rhs>
+    inline const internal::solve_retval<CholmodBase, Rhs>
+    solve(const MatrixBase<Rhs>& b) const
+    {
+      eigen_assert(m_isInitialized && "LLT is not initialized.");
+      eigen_assert(rows()==b.rows()
+                && "CholmodDecomposition::solve(): invalid number of rows of the right hand side matrix b");
+      return internal::solve_retval<CholmodBase, Rhs>(*this, b.derived());
+    }
+    
+    /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A.
+      *
+      * \sa compute()
+      */
+    template<typename Rhs>
+    inline const internal::sparse_solve_retval<CholmodBase, Rhs>
+    solve(const SparseMatrixBase<Rhs>& b) const
+    {
+      eigen_assert(m_isInitialized && "LLT is not initialized.");
+      eigen_assert(rows()==b.rows()
+                && "CholmodDecomposition::solve(): invalid number of rows of the right hand side matrix b");
+      return internal::sparse_solve_retval<CholmodBase, Rhs>(*this, b.derived());
+    }
+    
    /** Performs a symbolic decomposition on the sparsity pattern of \a matrix.
      *
      * This function is particularly useful when solving for several problems having the same structure.
@@ -263,7 +290,7 @@ class CholmodBase : public SparseSolverBase<Derived>
    #ifndef EIGEN_PARSED_BY_DOXYGEN
    /** \internal */
    template<typename Rhs,typename Dest>
-    void _solve_impl(const MatrixBase<Rhs> &b, MatrixBase<Dest> &dest) const
+    void _solve(const MatrixBase<Rhs> &b, MatrixBase<Dest> &dest) const
    {
      eigen_assert(m_factorizationIsOk && "The decomposition is not in a valid state for solving, you must first call either compute() or symbolic()/numeric()");
      const Index size = m_cholmodFactor->n;
@@ -285,7 +312,7 @@ class CholmodBase : public SparseSolverBase<Derived>
    
    /** \internal */
    template<typename RhsScalar, int RhsOptions, typename RhsIndex, typename DestScalar, int DestOptions, typename DestIndex>
-    void _solve_impl(const SparseMatrix<RhsScalar,RhsOptions,RhsIndex> &b, SparseMatrix<DestScalar,DestOptions,DestIndex> &dest) const
+    void _solve(const SparseMatrix<RhsScalar,RhsOptions,RhsIndex> &b, SparseMatrix<DestScalar,DestOptions,DestIndex> &dest) const
    {
      eigen_assert(m_factorizationIsOk && "The decomposition is not in a valid state for solving, you must first call either compute() or symbolic()/numeric()");
      const Index size = m_cholmodFactor->n;
@@ -330,6 +357,7 @@ class CholmodBase : public SparseSolverBase<Derived>
    cholmod_factor* m_cholmodFactor;
    RealScalar m_shiftOffset[2];
    mutable ComputationInfo m_info;
+    bool m_isInitialized;
    int m_factorizationIsOk;
    int m_analysisIsOk;
 };
@@ -367,7 +395,7 @@ class CholmodSimplicialLLT : public CholmodBase<_MatrixType, _UpLo, CholmodSimpl
    CholmodSimplicialLLT(const MatrixType& matrix) : Base()
    {
      init();
-      compute(matrix);
+      Base::compute(matrix);
    }

    ~CholmodSimplicialLLT() {}
@@ -414,7 +442,7 @@ class CholmodSimplicialLDLT : public CholmodBase<_MatrixType, _UpLo, CholmodSimp
    CholmodSimplicialLDLT(const MatrixType& matrix) : Base()
    {
      init();
-      compute(matrix);
+      Base::compute(matrix);
    }

    ~CholmodSimplicialLDLT() {}
@@ -459,7 +487,7 @@ class CholmodSupernodalLLT : public CholmodBase<_MatrixType, _UpLo, CholmodSuper
    CholmodSupernodalLLT(const MatrixType& matrix) : Base()
    {
      init();
-      compute(matrix);
+      Base::compute(matrix);
    }

    ~CholmodSupernodalLLT() {}
@@ -506,7 +534,7 @@ class CholmodDecomposition : public CholmodBase<_MatrixType, _UpLo, CholmodDecom
    CholmodDecomposition(const MatrixType& matrix) : Base()
    {
      init();
-      compute(matrix);
+      Base::compute(matrix);
    }

    ~CholmodDecomposition() {}
@@ -544,6 +572,36 @@ class CholmodDecomposition : public CholmodBase<_MatrixType, _UpLo, CholmodDecom
    }
 };

+namespace internal {
+  
+template<typename _MatrixType, int _UpLo, typename Derived, typename Rhs>
+struct solve_retval<CholmodBase<_MatrixType,_UpLo,Derived>, Rhs>
+  : solve_retval_base<CholmodBase<_MatrixType,_UpLo,Derived>, Rhs>
+{
+  typedef CholmodBase<_MatrixType,_UpLo,Derived> Dec;
+  EIGEN_MAKE_SOLVE_HELPERS(Dec,Rhs)
+
+  template<typename Dest> void evalTo(Dest& dst) const
+  {
+    dec()._solve(rhs(),dst);
+  }
+};
+
+template<typename _MatrixType, int _UpLo, typename Derived, typename Rhs>
+struct sparse_solve_retval<CholmodBase<_MatrixType,_UpLo,Derived>, Rhs>
+  : sparse_solve_retval_base<CholmodBase<_MatrixType,_UpLo,Derived>, Rhs>
+{
+  typedef CholmodBase<_MatrixType,_UpLo,Derived> Dec;
+  EIGEN_MAKE_SPARSE_SOLVE_HELPERS(Dec,Rhs)
+
+  template<typename Dest> void evalTo(Dest& dst) const
+  {
+    dec()._solve(rhs(),dst);
+  }
+};
+
+} // end namespace internal
+
 } // end namespace Eigen

 #endif // EIGEN_CHOLMODSUPPORT_H
--- a/Eigen/src/Core/Array.h
+++ b/Eigen/src/Core/Array.h
@@ -69,26 +69,10 @@ class Array
      * the usage of 'using'. This should be done only for operator=.
      */
    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Array& operator=(const EigenBase<OtherDerived> &other)
    {
      return Base::operator=(other);
    }
-    
-    /** Set all the entries to \a value.
-      * \sa DenseBase::setConstant(), DenseBase::fill()
-      */
-    /* This overload is needed because the usage of
-      *   using Base::operator=;
-      * fails on MSVC. Since the code below is working with GCC and MSVC, we skipped
-      * the usage of 'using'. This should be done only for operator=.
-      */
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Array& operator=(const Scalar &value)
-    {
-      Base::setConstant(value);
-      return *this;
-    }

    /** Copies the value of the expression \a other into \c *this with automatic resizing.
      *
@@ -100,7 +84,6 @@ class Array
      * remain row-vectors and vectors remain vectors.
      */
    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Array& operator=(const ArrayBase<OtherDerived>& other)
    {
      return Base::_set(other);
@@ -109,12 +92,11 @@ class Array
    /** This is a special case of the templated operator=. Its purpose is to
      * prevent a default operator= from hiding the templated operator=.
      */
-    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Array& operator=(const Array& other)
    {
      return Base::_set(other);
    }
-    
+
    /** Default constructor.
      *
      * For fixed-size matrices, does nothing.
@@ -125,7 +107,6 @@ class Array
      *
      * \sa resize(Index,Index)
      */
-    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Array() : Base()
    {
      Base::_check_template_params();
@@ -135,7 +116,6 @@ class Array
 #ifndef EIGEN_PARSED_BY_DOXYGEN
    // FIXME is it still needed ??
    /** \internal */
-    EIGEN_DEVICE_FUNC
    Array(internal::constructor_without_unaligned_array_assert)
      : Base(internal::constructor_without_unaligned_array_assert())
    {
@@ -159,47 +139,41 @@ class Array
    }
 #endif

-    #ifndef EIGEN_PARSED_BY_DOXYGEN
-    template<typename T>
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE explicit Array(const T& x)
+    /** Constructs a vector or row-vector with given dimension. \only_for_vectors
+      *
+      * Note that this is only useful for dynamic-size vectors. For fixed-size vectors,
+      * it is redundant to pass the dimension here, so it makes more sense to use the default
+      * constructor Matrix() instead.
+      */
+    EIGEN_STRONG_INLINE explicit Array(Index dim)
+      : Base(dim, RowsAtCompileTime == 1 ? 1 : dim, ColsAtCompileTime == 1 ? 1 : dim)
    {
      Base::_check_template_params();
-      Base::template _init1<T>(x);
+      EIGEN_STATIC_ASSERT_VECTOR_ONLY(Array)
+      eigen_assert(dim >= 0);
+      eigen_assert(SizeAtCompileTime == Dynamic || SizeAtCompileTime == dim);
+      EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
    }

+    #ifndef EIGEN_PARSED_BY_DOXYGEN
    template<typename T0, typename T1>
-    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Array(const T0& val0, const T1& val1)
    {
      Base::_check_template_params();
      this->template _init2<T0,T1>(val0, val1);
    }
    #else
-    /** \brief Constructs a fixed-sized array initialized with coefficients starting at \a data */
-    EIGEN_DEVICE_FUNC explicit Array(const Scalar *data);
-    /** Constructs a vector or row-vector with given dimension. \only_for_vectors
+    /** constructs an uninitialized matrix with \a rows rows and \a cols columns.
      *
-      * Note that this is only useful for dynamic-size vectors. For fixed-size vectors,
-      * it is redundant to pass the dimension here, so it makes more sense to use the default
-      * constructor Array() instead.
-      */
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE explicit Array(Index dim);
-    /** constructs an initialized 1x1 Array with the given coefficient */
-    Array(const Scalar& value);
-    /** constructs an uninitialized array with \a rows rows and \a cols columns.
-      *
-      * This is useful for dynamic-size arrays. For fixed-size arrays,
+      * This is useful for dynamic-size matrices. For fixed-size matrices,
      * it is redundant to pass these parameters, so one should use the default constructor
-      * Array() instead. */
+      * Matrix() instead. */
    Array(Index rows, Index cols);
    /** constructs an initialized 2D vector with given coefficients */
    Array(const Scalar& val0, const Scalar& val1);
    #endif

    /** constructs an initialized 3D vector with given coefficients */
-    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Array(const Scalar& val0, const Scalar& val1, const Scalar& val2)
    {
      Base::_check_template_params();
@@ -209,7 +183,6 @@ class Array
      m_storage.data()[2] = val2;
    }
    /** constructs an initialized 4D vector with given coefficients */
-    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Array(const Scalar& val0, const Scalar& val1, const Scalar& val2, const Scalar& val3)
    {
      Base::_check_template_params();
@@ -220,9 +193,10 @@ class Array
      m_storage.data()[3] = val3;
    }

+    explicit Array(const Scalar *data);
+
    /** Constructor copying the value of the expression \a other */
    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Array(const ArrayBase<OtherDerived>& other)
             : Base(other.rows() * other.cols(), other.rows(), other.cols())
    {
@@ -230,7 +204,6 @@ class Array
      Base::_set_noalias(other);
    }
    /** Copy constructor */
-    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Array(const Array& other)
            : Base(other.rows() * other.cols(), other.rows(), other.cols())
    {
@@ -239,7 +212,6 @@ class Array
    }
    /** Copy constructor with in-place evaluation */
    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Array(const ReturnByValue<OtherDerived>& other)
    {
      Base::_check_template_params();
@@ -249,7 +221,6 @@ class Array

    /** \sa MatrixBase::operator=(const EigenBase<OtherDerived>&) */
    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Array(const EigenBase<OtherDerived> &other)
      : Base(other.derived().rows() * other.derived().cols(), other.derived().rows(), other.derived().cols())
    {
@@ -258,8 +229,15 @@ class Array
      *this = other;
    }

-    EIGEN_DEVICE_FUNC inline Index innerStride() const { return 1; }
-    EIGEN_DEVICE_FUNC inline Index outerStride() const { return this->innerSize(); }
+    /** Override MatrixBase::swap() since for dynamic-sized matrices of same type it is enough to swap the
+      * data pointers.
+      */
+    template<typename OtherDerived>
+    void swap(ArrayBase<OtherDerived> const & other)
+    { this->_swap(other.derived()); }
+
+    inline Index innerStride() const { return 1; }
+    inline Index outerStride() const { return this->innerSize(); }

    #ifdef EIGEN_ARRAY_PLUGIN
    #include EIGEN_ARRAY_PLUGIN
--- a/Eigen/src/Core/ArrayBase.h
+++ b/Eigen/src/Core/ArrayBase.h
@@ -46,15 +46,14 @@ template<typename Derived> class ArrayBase

    typedef ArrayBase Eigen_BaseClassForSpecializationOfGlobalMathFuncImpl;

-    using internal::special_scalar_op_base<Derived,typename internal::traits<Derived>::Scalar,
-                typename NumTraits<typename internal::traits<Derived>::Scalar>::Real>::operator*;
-
    typedef typename internal::traits<Derived>::StorageKind StorageKind;
+    typedef typename internal::traits<Derived>::Index Index;
    typedef typename internal::traits<Derived>::Scalar Scalar;
    typedef typename internal::packet_traits<Scalar>::type PacketScalar;
    typedef typename NumTraits<Scalar>::Real RealScalar;

    typedef DenseBase<Derived> Base;
+    using Base::operator*;
    using Base::RowsAtCompileTime;
    using Base::ColsAtCompileTime;
    using Base::SizeAtCompileTime;
@@ -63,7 +62,8 @@ template<typename Derived> class ArrayBase
    using Base::MaxSizeAtCompileTime;
    using Base::IsVectorAtCompileTime;
    using Base::Flags;
-    
+    using Base::CoeffReadCost;
+
    using Base::derived;
    using Base::const_cast_derived;
    using Base::rows;
@@ -116,57 +116,40 @@ template<typename Derived> class ArrayBase
    /** Special case of the template operator=, in order to prevent the compiler
      * from generating a default operator= (issue hit with g++ 4.1)
      */
-    EIGEN_DEVICE_FUNC
    Derived& operator=(const ArrayBase& other)
    {
-      internal::call_assignment(derived(), other.derived());
-      return derived();
+      return internal::assign_selector<Derived,Derived>::run(derived(), other.derived());
    }
-    
-    /** Set all the entries to \a value.
-      * \sa DenseBase::setConstant(), DenseBase::fill() */
-    EIGEN_DEVICE_FUNC
-    Derived& operator=(const Scalar &value)
-    { Base::setConstant(value); return derived(); }

-    EIGEN_DEVICE_FUNC
-    Derived& operator+=(const Scalar& scalar);
-    EIGEN_DEVICE_FUNC
-    Derived& operator-=(const Scalar& scalar);
+    Derived& operator+=(const Scalar& scalar)
+    { return *this = derived() + scalar; }
+    Derived& operator-=(const Scalar& scalar)
+    { return *this = derived() - scalar; }

    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
    Derived& operator+=(const ArrayBase<OtherDerived>& other);
    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
    Derived& operator-=(const ArrayBase<OtherDerived>& other);

    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
    Derived& operator*=(const ArrayBase<OtherDerived>& other);

    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
    Derived& operator/=(const ArrayBase<OtherDerived>& other);

  public:
-    EIGEN_DEVICE_FUNC
    ArrayBase<Derived>& array() { return *this; }
-    EIGEN_DEVICE_FUNC
    const ArrayBase<Derived>& array() const { return *this; }

    /** \returns an \link Eigen::MatrixBase Matrix \endlink expression of this array
      * \sa MatrixBase::array() */
-    EIGEN_DEVICE_FUNC
-    MatrixWrapper<Derived> matrix() { return MatrixWrapper<Derived>(derived()); }
-    EIGEN_DEVICE_FUNC
-    const MatrixWrapper<const Derived> matrix() const { return MatrixWrapper<const Derived>(derived()); }
+    MatrixWrapper<Derived> matrix() { return derived(); }
+    const MatrixWrapper<const Derived> matrix() const { return derived(); }

 //     template<typename Dest>
 //     inline void evalTo(Dest& dst) const { dst = matrix(); }

  protected:
-    EIGEN_DEVICE_FUNC
    ArrayBase() : Base() {}

  private:
@@ -191,7 +174,8 @@ template<typename OtherDerived>
 EIGEN_STRONG_INLINE Derived &
 ArrayBase<Derived>::operator-=(const ArrayBase<OtherDerived> &other)
 {
-  call_assignment(derived(), other.derived(), internal::sub_assign_op<Scalar>());
+  SelfCwiseBinaryOp<internal::scalar_difference_op<Scalar>, Derived, OtherDerived> tmp(derived());
+  tmp = other.derived();
  return derived();
 }

@@ -204,7 +188,8 @@ template<typename OtherDerived>
 EIGEN_STRONG_INLINE Derived &
 ArrayBase<Derived>::operator+=(const ArrayBase<OtherDerived>& other)
 {
-  call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar>());
+  SelfCwiseBinaryOp<internal::scalar_sum_op<Scalar>, Derived, OtherDerived> tmp(derived());
+  tmp = other.derived();
  return derived();
 }

@@ -217,7 +202,8 @@ template<typename OtherDerived>
 EIGEN_STRONG_INLINE Derived &
 ArrayBase<Derived>::operator*=(const ArrayBase<OtherDerived>& other)
 {
-  call_assignment(derived(), other.derived(), internal::mul_assign_op<Scalar,typename OtherDerived::Scalar>());
+  SelfCwiseBinaryOp<internal::scalar_product_op<Scalar>, Derived, OtherDerived> tmp(derived());
+  tmp = other.derived();
  return derived();
 }

@@ -230,7 +216,8 @@ template<typename OtherDerived>
 EIGEN_STRONG_INLINE Derived &
 ArrayBase<Derived>::operator/=(const ArrayBase<OtherDerived>& other)
 {
-  call_assignment(derived(), other.derived(), internal::div_assign_op<Scalar>());
+  SelfCwiseBinaryOp<internal::scalar_quotient_op<Scalar>, Derived, OtherDerived> tmp(derived());
+  tmp = other.derived();
  return derived();
 }

--- a/Eigen/src/Core/ArrayWrapper.h
+++ b/Eigen/src/Core/ArrayWrapper.h
@@ -44,7 +44,6 @@ class ArrayWrapper : public ArrayBase<ArrayWrapper<ExpressionType> >
    typedef ArrayBase<ArrayWrapper> Base;
    EIGEN_DENSE_PUBLIC_INTERFACE(ArrayWrapper)
    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(ArrayWrapper)
-    typedef typename internal::remove_all<ExpressionType>::type NestedExpression;

    typedef typename internal::conditional<
                       internal::is_lvalue<ExpressionType>::value,
@@ -54,54 +53,41 @@ class ArrayWrapper : public ArrayBase<ArrayWrapper<ExpressionType> >

    typedef typename internal::nested<ExpressionType>::type NestedExpressionType;

-    EIGEN_DEVICE_FUNC
-    explicit EIGEN_STRONG_INLINE ArrayWrapper(ExpressionType& matrix) : m_expression(matrix) {}
+    inline ArrayWrapper(ExpressionType& matrix) : m_expression(matrix) {}

-    EIGEN_DEVICE_FUNC
    inline Index rows() const { return m_expression.rows(); }
-    EIGEN_DEVICE_FUNC
    inline Index cols() const { return m_expression.cols(); }
-    EIGEN_DEVICE_FUNC
    inline Index outerStride() const { return m_expression.outerStride(); }
-    EIGEN_DEVICE_FUNC
    inline Index innerStride() const { return m_expression.innerStride(); }

-    EIGEN_DEVICE_FUNC
    inline ScalarWithConstIfNotLvalue* data() { return m_expression.const_cast_derived().data(); }
-    EIGEN_DEVICE_FUNC
    inline const Scalar* data() const { return m_expression.data(); }

-    EIGEN_DEVICE_FUNC
    inline CoeffReturnType coeff(Index rowId, Index colId) const
    {
      return m_expression.coeff(rowId, colId);
    }

-    EIGEN_DEVICE_FUNC
    inline Scalar& coeffRef(Index rowId, Index colId)
    {
      return m_expression.const_cast_derived().coeffRef(rowId, colId);
    }

-    EIGEN_DEVICE_FUNC
    inline const Scalar& coeffRef(Index rowId, Index colId) const
    {
      return m_expression.const_cast_derived().coeffRef(rowId, colId);
    }

-    EIGEN_DEVICE_FUNC
    inline CoeffReturnType coeff(Index index) const
    {
      return m_expression.coeff(index);
    }

-    EIGEN_DEVICE_FUNC
    inline Scalar& coeffRef(Index index)
    {
      return m_expression.const_cast_derived().coeffRef(index);
    }

-    EIGEN_DEVICE_FUNC
    inline const Scalar& coeffRef(Index index) const
    {
      return m_expression.const_cast_derived().coeffRef(index);
@@ -132,11 +118,9 @@ class ArrayWrapper : public ArrayBase<ArrayWrapper<ExpressionType> >
    }

    template<typename Dest>
-    EIGEN_DEVICE_FUNC
    inline void evalTo(Dest& dst) const { dst = m_expression; }

    const typename internal::remove_all<NestedExpressionType>::type& 
-    EIGEN_DEVICE_FUNC
    nestedExpression() const 
    {
      return m_expression;
@@ -144,11 +128,9 @@ class ArrayWrapper : public ArrayBase<ArrayWrapper<ExpressionType> >

    /** Forwards the resizing request to the nested expression
      * \sa DenseBase::resize(Index)  */
-    EIGEN_DEVICE_FUNC
    void resize(Index newSize) { m_expression.const_cast_derived().resize(newSize); }
    /** Forwards the resizing request to the nested expression
      * \sa DenseBase::resize(Index,Index)*/
-    EIGEN_DEVICE_FUNC
    void resize(Index nbRows, Index nbCols) { m_expression.const_cast_derived().resize(nbRows,nbCols); }

  protected:
@@ -187,7 +169,6 @@ class MatrixWrapper : public MatrixBase<MatrixWrapper<ExpressionType> >
    typedef MatrixBase<MatrixWrapper<ExpressionType> > Base;
    EIGEN_DENSE_PUBLIC_INTERFACE(MatrixWrapper)
    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(MatrixWrapper)
-    typedef typename internal::remove_all<ExpressionType>::type NestedExpression;

    typedef typename internal::conditional<
                       internal::is_lvalue<ExpressionType>::value,
@@ -197,54 +178,41 @@ class MatrixWrapper : public MatrixBase<MatrixWrapper<ExpressionType> >

    typedef typename internal::nested<ExpressionType>::type NestedExpressionType;

-    EIGEN_DEVICE_FUNC
-    explicit inline MatrixWrapper(ExpressionType& a_matrix) : m_expression(a_matrix) {}
+    inline MatrixWrapper(ExpressionType& a_matrix) : m_expression(a_matrix) {}

-    EIGEN_DEVICE_FUNC
    inline Index rows() const { return m_expression.rows(); }
-    EIGEN_DEVICE_FUNC
    inline Index cols() const { return m_expression.cols(); }
-    EIGEN_DEVICE_FUNC
    inline Index outerStride() const { return m_expression.outerStride(); }
-    EIGEN_DEVICE_FUNC
    inline Index innerStride() const { return m_expression.innerStride(); }

-    EIGEN_DEVICE_FUNC
    inline ScalarWithConstIfNotLvalue* data() { return m_expression.const_cast_derived().data(); }
-    EIGEN_DEVICE_FUNC
    inline const Scalar* data() const { return m_expression.data(); }

-    EIGEN_DEVICE_FUNC
    inline CoeffReturnType coeff(Index rowId, Index colId) const
    {
      return m_expression.coeff(rowId, colId);
    }

-    EIGEN_DEVICE_FUNC
    inline Scalar& coeffRef(Index rowId, Index colId)
    {
      return m_expression.const_cast_derived().coeffRef(rowId, colId);
    }

-    EIGEN_DEVICE_FUNC
    inline const Scalar& coeffRef(Index rowId, Index colId) const
    {
      return m_expression.derived().coeffRef(rowId, colId);
    }

-    EIGEN_DEVICE_FUNC
    inline CoeffReturnType coeff(Index index) const
    {
      return m_expression.coeff(index);
    }

-    EIGEN_DEVICE_FUNC
    inline Scalar& coeffRef(Index index)
    {
      return m_expression.const_cast_derived().coeffRef(index);
    }

-    EIGEN_DEVICE_FUNC
    inline const Scalar& coeffRef(Index index) const
    {
      return m_expression.const_cast_derived().coeffRef(index);
@@ -274,7 +242,6 @@ class MatrixWrapper : public MatrixBase<MatrixWrapper<ExpressionType> >
      m_expression.const_cast_derived().template writePacket<LoadMode>(index, val);
    }

-    EIGEN_DEVICE_FUNC
    const typename internal::remove_all<NestedExpressionType>::type& 
    nestedExpression() const 
    {
@@ -283,11 +250,9 @@ class MatrixWrapper : public MatrixBase<MatrixWrapper<ExpressionType> >

    /** Forwards the resizing request to the nested expression
      * \sa DenseBase::resize(Index)  */
-    EIGEN_DEVICE_FUNC
    void resize(Index newSize) { m_expression.const_cast_derived().resize(newSize); }
    /** Forwards the resizing request to the nested expression
      * \sa DenseBase::resize(Index,Index)*/
-    EIGEN_DEVICE_FUNC
    void resize(Index nbRows, Index nbCols) { m_expression.const_cast_derived().resize(nbRows,nbCols); }

  protected:
--- a/Eigen/src/Core/Assign.h
+++ b/Eigen/src/Core/Assign.h
@@ -14,6 +14,478 @@

 namespace Eigen {

+namespace internal {
+
+/***************************************************************************
+* Part 1 : the logic deciding a strategy for traversal and unrolling       *
+***************************************************************************/
+
+template <typename Derived, typename OtherDerived>
+struct assign_traits
+{
+public:
+  enum {
+    DstIsAligned = Derived::Flags & AlignedBit,
+    DstHasDirectAccess = Derived::Flags & DirectAccessBit,
+    SrcIsAligned = OtherDerived::Flags & AlignedBit,
+    JointAlignment = bool(DstIsAligned) && bool(SrcIsAligned) ? Aligned : Unaligned
+  };
+
+private:
+  enum {
+    InnerSize = int(Derived::IsVectorAtCompileTime) ? int(Derived::SizeAtCompileTime)
+              : int(Derived::Flags)&RowMajorBit ? int(Derived::ColsAtCompileTime)
+              : int(Derived::RowsAtCompileTime),
+    InnerMaxSize = int(Derived::IsVectorAtCompileTime) ? int(Derived::MaxSizeAtCompileTime)
+              : int(Derived::Flags)&RowMajorBit ? int(Derived::MaxColsAtCompileTime)
+              : int(Derived::MaxRowsAtCompileTime),
+    MaxSizeAtCompileTime = Derived::SizeAtCompileTime,
+    PacketSize = packet_traits<typename Derived::Scalar>::size
+  };
+
+  enum {
+    StorageOrdersAgree = (int(Derived::IsRowMajor) == int(OtherDerived::IsRowMajor)),
+    MightVectorize = StorageOrdersAgree
+                  && (int(Derived::Flags) & int(OtherDerived::Flags) & ActualPacketAccessBit),
+    MayInnerVectorize  = MightVectorize && int(InnerSize)!=Dynamic && int(InnerSize)%int(PacketSize)==0
+                       && int(DstIsAligned) && int(SrcIsAligned),
+    MayLinearize = StorageOrdersAgree && (int(Derived::Flags) & int(OtherDerived::Flags) & LinearAccessBit),
+    MayLinearVectorize = MightVectorize && MayLinearize && DstHasDirectAccess
+                       && (DstIsAligned || MaxSizeAtCompileTime == Dynamic),
+      /* If the destination isn't aligned, we have to do runtime checks and we don't unroll,
+         so it's only good for large enough sizes. */
+    MaySliceVectorize  = MightVectorize && DstHasDirectAccess
+                       && (int(InnerMaxSize)==Dynamic || int(InnerMaxSize)>=3*PacketSize)
+      /* slice vectorization can be slow, so we only want it if the slices are big, which is
+         indicated by InnerMaxSize rather than InnerSize, think of the case of a dynamic block
+         in a fixed-size matrix */
+  };
+
+public:
+  enum {
+    Traversal = int(MayInnerVectorize)  ? int(InnerVectorizedTraversal)
+              : int(MayLinearVectorize) ? int(LinearVectorizedTraversal)
+              : int(MaySliceVectorize)  ? int(SliceVectorizedTraversal)
+              : int(MayLinearize)       ? int(LinearTraversal)
+                                        : int(DefaultTraversal),
+    Vectorized = int(Traversal) == InnerVectorizedTraversal
+              || int(Traversal) == LinearVectorizedTraversal
+              || int(Traversal) == SliceVectorizedTraversal
+  };
+
+private:
+  enum {
+    UnrollingLimit      = EIGEN_UNROLLING_LIMIT * (Vectorized ? int(PacketSize) : 1),
+    MayUnrollCompletely = int(Derived::SizeAtCompileTime) != Dynamic
+                       && int(OtherDerived::CoeffReadCost) != Dynamic
+                       && int(Derived::SizeAtCompileTime) * int(OtherDerived::CoeffReadCost) <= int(UnrollingLimit),
+    MayUnrollInner      = int(InnerSize) != Dynamic
+                       && int(OtherDerived::CoeffReadCost) != Dynamic
+                       && int(InnerSize) * int(OtherDerived::CoeffReadCost) <= int(UnrollingLimit)
+  };
+
+public:
+  enum {
+    Unrolling = (int(Traversal) == int(InnerVectorizedTraversal) || int(Traversal) == int(DefaultTraversal))
+                ? (
+                    int(MayUnrollCompletely) ? int(CompleteUnrolling)
+                  : int(MayUnrollInner)      ? int(InnerUnrolling)
+                                             : int(NoUnrolling)
+                  )
+              : int(Traversal) == int(LinearVectorizedTraversal)
+                ? ( bool(MayUnrollCompletely) && bool(DstIsAligned) ? int(CompleteUnrolling) : int(NoUnrolling) )
+              : int(Traversal) == int(LinearTraversal)
+                ? ( bool(MayUnrollCompletely) ? int(CompleteUnrolling) : int(NoUnrolling) )
+              : int(NoUnrolling)
+  };
+
+#ifdef EIGEN_DEBUG_ASSIGN
+  static void debug()
+  {
+    EIGEN_DEBUG_VAR(DstIsAligned)
+    EIGEN_DEBUG_VAR(SrcIsAligned)
+    EIGEN_DEBUG_VAR(JointAlignment)
+    EIGEN_DEBUG_VAR(InnerSize)
+    EIGEN_DEBUG_VAR(InnerMaxSize)
+    EIGEN_DEBUG_VAR(PacketSize)
+    EIGEN_DEBUG_VAR(StorageOrdersAgree)
+    EIGEN_DEBUG_VAR(MightVectorize)
+    EIGEN_DEBUG_VAR(MayLinearize)
+    EIGEN_DEBUG_VAR(MayInnerVectorize)
+    EIGEN_DEBUG_VAR(MayLinearVectorize)
+    EIGEN_DEBUG_VAR(MaySliceVectorize)
+    EIGEN_DEBUG_VAR(Traversal)
+    EIGEN_DEBUG_VAR(UnrollingLimit)
+    EIGEN_DEBUG_VAR(MayUnrollCompletely)
+    EIGEN_DEBUG_VAR(MayUnrollInner)
+    EIGEN_DEBUG_VAR(Unrolling)
+  }
+#endif
+};
+
+/***************************************************************************
+* Part 2 : meta-unrollers
+***************************************************************************/
+
+/************************
+*** Default traversal ***
+************************/
+
+template<typename Derived1, typename Derived2, int Index, int Stop>
+struct assign_DefaultTraversal_CompleteUnrolling
+{
+  enum {
+    outer = Index / Derived1::InnerSizeAtCompileTime,
+    inner = Index % Derived1::InnerSizeAtCompileTime
+  };
+
+  static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src)
+  {
+    dst.copyCoeffByOuterInner(outer, inner, src);
+    assign_DefaultTraversal_CompleteUnrolling<Derived1, Derived2, Index+1, Stop>::run(dst, src);
+  }
+};
+
+template<typename Derived1, typename Derived2, int Stop>
+struct assign_DefaultTraversal_CompleteUnrolling<Derived1, Derived2, Stop, Stop>
+{
+  static EIGEN_STRONG_INLINE void run(Derived1 &, const Derived2 &) {}
+};
+
+template<typename Derived1, typename Derived2, int Index, int Stop>
+struct assign_DefaultTraversal_InnerUnrolling
+{
+  static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src, typename Derived1::Index outer)
+  {
+    dst.copyCoeffByOuterInner(outer, Index, src);
+    assign_DefaultTraversal_InnerUnrolling<Derived1, Derived2, Index+1, Stop>::run(dst, src, outer);
+  }
+};
+
+template<typename Derived1, typename Derived2, int Stop>
+struct assign_DefaultTraversal_InnerUnrolling<Derived1, Derived2, Stop, Stop>
+{
+  static EIGEN_STRONG_INLINE void run(Derived1 &, const Derived2 &, typename Derived1::Index) {}
+};
+
+/***********************
+*** Linear traversal ***
+***********************/
+
+template<typename Derived1, typename Derived2, int Index, int Stop>
+struct assign_LinearTraversal_CompleteUnrolling
+{
+  static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src)
+  {
+    dst.copyCoeff(Index, src);
+    assign_LinearTraversal_CompleteUnrolling<Derived1, Derived2, Index+1, Stop>::run(dst, src);
+  }
+};
+
+template<typename Derived1, typename Derived2, int Stop>
+struct assign_LinearTraversal_CompleteUnrolling<Derived1, Derived2, Stop, Stop>
+{
+  static EIGEN_STRONG_INLINE void run(Derived1 &, const Derived2 &) {}
+};
+
+/**************************
+*** Inner vectorization ***
+**************************/
+
+template<typename Derived1, typename Derived2, int Index, int Stop>
+struct assign_innervec_CompleteUnrolling
+{
+  enum {
+    outer = Index / Derived1::InnerSizeAtCompileTime,
+    inner = Index % Derived1::InnerSizeAtCompileTime,
+    JointAlignment = assign_traits<Derived1,Derived2>::JointAlignment
+  };
+
+  static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src)
+  {
+    dst.template copyPacketByOuterInner<Derived2, Aligned, JointAlignment>(outer, inner, src);
+    assign_innervec_CompleteUnrolling<Derived1, Derived2,
+      Index+packet_traits<typename Derived1::Scalar>::size, Stop>::run(dst, src);
+  }
+};
+
+template<typename Derived1, typename Derived2, int Stop>
+struct assign_innervec_CompleteUnrolling<Derived1, Derived2, Stop, Stop>
+{
+  static EIGEN_STRONG_INLINE void run(Derived1 &, const Derived2 &) {}
+};
+
+template<typename Derived1, typename Derived2, int Index, int Stop>
+struct assign_innervec_InnerUnrolling
+{
+  static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src, typename Derived1::Index outer)
+  {
+    dst.template copyPacketByOuterInner<Derived2, Aligned, Aligned>(outer, Index, src);
+    assign_innervec_InnerUnrolling<Derived1, Derived2,
+      Index+packet_traits<typename Derived1::Scalar>::size, Stop>::run(dst, src, outer);
+  }
+};
+
+template<typename Derived1, typename Derived2, int Stop>
+struct assign_innervec_InnerUnrolling<Derived1, Derived2, Stop, Stop>
+{
+  static EIGEN_STRONG_INLINE void run(Derived1 &, const Derived2 &, typename Derived1::Index) {}
+};
+
+/***************************************************************************
+* Part 3 : implementation of all cases
+***************************************************************************/
+
+template<typename Derived1, typename Derived2,
+         int Traversal = assign_traits<Derived1, Derived2>::Traversal,
+         int Unrolling = assign_traits<Derived1, Derived2>::Unrolling,
+         int Version = Specialized>
+struct assign_impl;
+
+/************************
+*** Default traversal ***
+************************/
+
+template<typename Derived1, typename Derived2, int Unrolling, int Version>
+struct assign_impl<Derived1, Derived2, InvalidTraversal, Unrolling, Version>
+{
+  static inline void run(Derived1 &, const Derived2 &) { }
+};
+
+template<typename Derived1, typename Derived2, int Version>
+struct assign_impl<Derived1, Derived2, DefaultTraversal, NoUnrolling, Version>
+{
+  typedef typename Derived1::Index Index;
+  static inline void run(Derived1 &dst, const Derived2 &src)
+  {
+    const Index innerSize = dst.innerSize();
+    const Index outerSize = dst.outerSize();
+    for(Index outer = 0; outer < outerSize; ++outer)
+      for(Index inner = 0; inner < innerSize; ++inner)
+        dst.copyCoeffByOuterInner(outer, inner, src);
+  }
+};
+
+template<typename Derived1, typename Derived2, int Version>
+struct assign_impl<Derived1, Derived2, DefaultTraversal, CompleteUnrolling, Version>
+{
+  static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src)
+  {
+    assign_DefaultTraversal_CompleteUnrolling<Derived1, Derived2, 0, Derived1::SizeAtCompileTime>
+      ::run(dst, src);
+  }
+};
+
+template<typename Derived1, typename Derived2, int Version>
+struct assign_impl<Derived1, Derived2, DefaultTraversal, InnerUnrolling, Version>
+{
+  typedef typename Derived1::Index Index;
+  static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src)
+  {
+    const Index outerSize = dst.outerSize();
+    for(Index outer = 0; outer < outerSize; ++outer)
+      assign_DefaultTraversal_InnerUnrolling<Derived1, Derived2, 0, Derived1::InnerSizeAtCompileTime>
+        ::run(dst, src, outer);
+  }
+};
+
+/***********************
+*** Linear traversal ***
+***********************/
+
+template<typename Derived1, typename Derived2, int Version>
+struct assign_impl<Derived1, Derived2, LinearTraversal, NoUnrolling, Version>
+{
+  typedef typename Derived1::Index Index;
+  static inline void run(Derived1 &dst, const Derived2 &src)
+  {
+    const Index size = dst.size();
+    for(Index i = 0; i < size; ++i)
+      dst.copyCoeff(i, src);
+  }
+};
+
+template<typename Derived1, typename Derived2, int Version>
+struct assign_impl<Derived1, Derived2, LinearTraversal, CompleteUnrolling, Version>
+{
+  static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src)
+  {
+    assign_LinearTraversal_CompleteUnrolling<Derived1, Derived2, 0, Derived1::SizeAtCompileTime>
+      ::run(dst, src);
+  }
+};
+
+/**************************
+*** Inner vectorization ***
+**************************/
+
+template<typename Derived1, typename Derived2, int Version>
+struct assign_impl<Derived1, Derived2, InnerVectorizedTraversal, NoUnrolling, Version>
+{
+  typedef typename Derived1::Index Index;
+  static inline void run(Derived1 &dst, const Derived2 &src)
+  {
+    const Index innerSize = dst.innerSize();
+    const Index outerSize = dst.outerSize();
+    const Index packetSize = packet_traits<typename Derived1::Scalar>::size;
+    for(Index outer = 0; outer < outerSize; ++outer)
+      for(Index inner = 0; inner < innerSize; inner+=packetSize)
+        dst.template copyPacketByOuterInner<Derived2, Aligned, Aligned>(outer, inner, src);
+  }
+};
+
+template<typename Derived1, typename Derived2, int Version>
+struct assign_impl<Derived1, Derived2, InnerVectorizedTraversal, CompleteUnrolling, Version>
+{
+  static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src)
+  {
+    assign_innervec_CompleteUnrolling<Derived1, Derived2, 0, Derived1::SizeAtCompileTime>
+      ::run(dst, src);
+  }
+};
+
+template<typename Derived1, typename Derived2, int Version>
+struct assign_impl<Derived1, Derived2, InnerVectorizedTraversal, InnerUnrolling, Version>
+{
+  typedef typename Derived1::Index Index;
+  static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src)
+  {
+    const Index outerSize = dst.outerSize();
+    for(Index outer = 0; outer < outerSize; ++outer)
+      assign_innervec_InnerUnrolling<Derived1, Derived2, 0, Derived1::InnerSizeAtCompileTime>
+        ::run(dst, src, outer);
+  }
+};
+
+/***************************
+*** Linear vectorization ***
+***************************/
+
+template <bool IsAligned = false>
+struct unaligned_assign_impl
+{
+  template <typename Derived, typename OtherDerived>
+  static EIGEN_STRONG_INLINE void run(const Derived&, OtherDerived&, typename Derived::Index, typename Derived::Index) {}
+};
+
+template <>
+struct unaligned_assign_impl<false>
+{
+  // MSVC must not inline this functions. If it does, it fails to optimize the
+  // packet access path.
+#ifdef _MSC_VER
+  template <typename Derived, typename OtherDerived>
+  static EIGEN_DONT_INLINE void run(const Derived& src, OtherDerived& dst, typename Derived::Index start, typename Derived::Index end)
+#else
+  template <typename Derived, typename OtherDerived>
+  static EIGEN_STRONG_INLINE void run(const Derived& src, OtherDerived& dst, typename Derived::Index start, typename Derived::Index end)
+#endif
+  {
+    for (typename Derived::Index index = start; index < end; ++index)
+      dst.copyCoeff(index, src);
+  }
+};
+
+template<typename Derived1, typename Derived2, int Version>
+struct assign_impl<Derived1, Derived2, LinearVectorizedTraversal, NoUnrolling, Version>
+{
+  typedef typename Derived1::Index Index;
+  static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src)
+  {
+    const Index size = dst.size();
+    typedef packet_traits<typename Derived1::Scalar> PacketTraits;
+    enum {
+      packetSize = PacketTraits::size,
+      dstAlignment = PacketTraits::AlignedOnScalar ? Aligned : int(assign_traits<Derived1,Derived2>::DstIsAligned) ,
+      srcAlignment = assign_traits<Derived1,Derived2>::JointAlignment
+    };
+    const Index alignedStart = assign_traits<Derived1,Derived2>::DstIsAligned ? 0
+                             : internal::first_aligned(&dst.coeffRef(0), size);
+    const Index alignedEnd = alignedStart + ((size-alignedStart)/packetSize)*packetSize;
+
+    unaligned_assign_impl<assign_traits<Derived1,Derived2>::DstIsAligned!=0>::run(src,dst,0,alignedStart);
+
+    for(Index index = alignedStart; index < alignedEnd; index += packetSize)
+    {
+      dst.template copyPacket<Derived2, dstAlignment, srcAlignment>(index, src);
+    }
+
+    unaligned_assign_impl<>::run(src,dst,alignedEnd,size);
+  }
+};
+
+template<typename Derived1, typename Derived2, int Version>
+struct assign_impl<Derived1, Derived2, LinearVectorizedTraversal, CompleteUnrolling, Version>
+{
+  typedef typename Derived1::Index Index;
+  static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src)
+  {
+    enum { size = Derived1::SizeAtCompileTime,
+           packetSize = packet_traits<typename Derived1::Scalar>::size,
+           alignedSize = (size/packetSize)*packetSize };
+
+    assign_innervec_CompleteUnrolling<Derived1, Derived2, 0, alignedSize>::run(dst, src);
+    assign_DefaultTraversal_CompleteUnrolling<Derived1, Derived2, alignedSize, size>::run(dst, src);
+  }
+};
+
+/**************************
+*** Slice vectorization ***
+***************************/
+
+template<typename Derived1, typename Derived2, int Version>
+struct assign_impl<Derived1, Derived2, SliceVectorizedTraversal, NoUnrolling, Version>
+{
+  typedef typename Derived1::Index Index;
+  static inline void run(Derived1 &dst, const Derived2 &src)
+  {
+    typedef typename Derived1::Scalar Scalar;
+    typedef packet_traits<Scalar> PacketTraits;
+    enum {
+      packetSize = PacketTraits::size,
+      alignable = PacketTraits::AlignedOnScalar,
+      dstIsAligned = assign_traits<Derived1,Derived2>::DstIsAligned,
+      dstAlignment = alignable ? Aligned : int(dstIsAligned),
+      srcAlignment = assign_traits<Derived1,Derived2>::JointAlignment
+    };
+    const Scalar *dst_ptr = &dst.coeffRef(0,0);
+    if((!bool(dstIsAligned)) && (size_t(dst_ptr) % sizeof(Scalar))>0)
+    {
+      // the pointer is not aligend-on scalar, so alignment is not possible
+      return assign_impl<Derived1,Derived2,DefaultTraversal,NoUnrolling>::run(dst, src);
+    }
+    const Index packetAlignedMask = packetSize - 1;
+    const Index innerSize = dst.innerSize();
+    const Index outerSize = dst.outerSize();
+    const Index alignedStep = alignable ? (packetSize - dst.outerStride() % packetSize) & packetAlignedMask : 0;
+    Index alignedStart = ((!alignable) || bool(dstIsAligned)) ? 0 : internal::first_aligned(dst_ptr, innerSize);
+
+    for(Index outer = 0; outer < outerSize; ++outer)
+    {
+      const Index alignedEnd = alignedStart + ((innerSize-alignedStart) & ~packetAlignedMask);
+      // do the non-vectorizable part of the assignment
+      for(Index inner = 0; inner<alignedStart ; ++inner)
+        dst.copyCoeffByOuterInner(outer, inner, src);
+
+      // do the vectorizable part of the assignment
+      for(Index inner = alignedStart; inner<alignedEnd; inner+=packetSize)
+        dst.template copyPacketByOuterInner<Derived2, dstAlignment, Unaligned>(outer, inner, src);
+
+      // do the non-vectorizable part of the assignment
+      for(Index inner = alignedEnd; inner<innerSize ; ++inner)
+        dst.copyCoeffByOuterInner(outer, inner, src);
+
+      alignedStart = std::min<Index>((alignedStart+alignedStep)%packetSize, innerSize);
+    }
+  }
+};
+
+} // end namespace internal
+
+/***************************************************************************
+* Part 4 : implementation of DenseBase methods
+***************************************************************************/
+
 template<typename Derived>
 template<typename OtherDerived>
 EIGEN_STRONG_INLINE Derived& DenseBase<Derived>
@@ -27,62 +499,90 @@ EIGEN_STRONG_INLINE Derived& DenseBase<Derived>
  EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Derived,OtherDerived)
  EIGEN_STATIC_ASSERT(SameType,YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)

+#ifdef EIGEN_DEBUG_ASSIGN
+  internal::assign_traits<Derived, OtherDerived>::debug();
+#endif
  eigen_assert(rows() == other.rows() && cols() == other.cols());
-  internal::call_assignment_no_alias(derived(),other.derived());
-  
+  internal::assign_impl<Derived, OtherDerived, int(SameType) ? int(internal::assign_traits<Derived, OtherDerived>::Traversal)
+                                                       : int(InvalidTraversal)>::run(derived(),other.derived());
+#ifndef EIGEN_NO_DEBUG
+  checkTransposeAliasing(other.derived());
+#endif
  return derived();
 }

+namespace internal {
+
+template<typename Derived, typename OtherDerived,
+         bool EvalBeforeAssigning = (int(internal::traits<OtherDerived>::Flags) & EvalBeforeAssigningBit) != 0,
+         bool NeedToTranspose = ((int(Derived::RowsAtCompileTime) == 1 && int(OtherDerived::ColsAtCompileTime) == 1)
+                              |   // FIXME | instead of || to please GCC 4.4.0 stupid warning "suggest parentheses around &&".
+                                  // revert to || as soon as not needed anymore.
+                                  (int(Derived::ColsAtCompileTime) == 1 && int(OtherDerived::RowsAtCompileTime) == 1))
+                              && int(Derived::SizeAtCompileTime) != 1>
+struct assign_selector;
+
+template<typename Derived, typename OtherDerived>
+struct assign_selector<Derived,OtherDerived,false,false> {
+  static EIGEN_STRONG_INLINE Derived& run(Derived& dst, const OtherDerived& other) { return dst.lazyAssign(other.derived()); }
+  template<typename ActualDerived, typename ActualOtherDerived>
+  static EIGEN_STRONG_INLINE Derived& evalTo(ActualDerived& dst, const ActualOtherDerived& other) { other.evalTo(dst); return dst; }
+};
+template<typename Derived, typename OtherDerived>
+struct assign_selector<Derived,OtherDerived,true,false> {
+  static EIGEN_STRONG_INLINE Derived& run(Derived& dst, const OtherDerived& other) { return dst.lazyAssign(other.eval()); }
+};
+template<typename Derived, typename OtherDerived>
+struct assign_selector<Derived,OtherDerived,false,true> {
+  static EIGEN_STRONG_INLINE Derived& run(Derived& dst, const OtherDerived& other) { return dst.lazyAssign(other.transpose()); }
+  template<typename ActualDerived, typename ActualOtherDerived>
+  static EIGEN_STRONG_INLINE Derived& evalTo(ActualDerived& dst, const ActualOtherDerived& other) { Transpose<ActualDerived> dstTrans(dst); other.evalTo(dstTrans); return dst; }
+};
+template<typename Derived, typename OtherDerived>
+struct assign_selector<Derived,OtherDerived,true,true> {
+  static EIGEN_STRONG_INLINE Derived& run(Derived& dst, const OtherDerived& other) { return dst.lazyAssign(other.transpose().eval()); }
+};
+
+} // end namespace internal
+
 template<typename Derived>
 template<typename OtherDerived>
-EIGEN_DEVICE_FUNC
 EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator=(const DenseBase<OtherDerived>& other)
 {
-  internal::call_assignment(derived(), other.derived());
-  return derived();
+  return internal::assign_selector<Derived,OtherDerived>::run(derived(), other.derived());
 }

 template<typename Derived>
-EIGEN_DEVICE_FUNC
 EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator=(const DenseBase& other)
 {
-  internal::call_assignment(derived(), other.derived());
-  return derived();
+  return internal::assign_selector<Derived,Derived>::run(derived(), other.derived());
 }

 template<typename Derived>
-EIGEN_DEVICE_FUNC
 EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::operator=(const MatrixBase& other)
 {
-  internal::call_assignment(derived(), other.derived());
-  return derived();
+  return internal::assign_selector<Derived,Derived>::run(derived(), other.derived());
 }

 template<typename Derived>
 template <typename OtherDerived>
-EIGEN_DEVICE_FUNC
 EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::operator=(const DenseBase<OtherDerived>& other)
 {
-  internal::call_assignment(derived(), other.derived());
-  return derived();
+  return internal::assign_selector<Derived,OtherDerived>::run(derived(), other.derived());
 }

 template<typename Derived>
 template <typename OtherDerived>
-EIGEN_DEVICE_FUNC
 EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::operator=(const EigenBase<OtherDerived>& other)
 {
-  internal::call_assignment(derived(), other.derived());
-  return derived();
+  return internal::assign_selector<Derived,OtherDerived,false>::evalTo(derived(), other.derived());
 }

 template<typename Derived>
 template<typename OtherDerived>
-EIGEN_DEVICE_FUNC
 EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::operator=(const ReturnByValue<OtherDerived>& other)
 {
-  other.derived().evalTo(derived());
-  return derived();
+  return internal::assign_selector<Derived,OtherDerived,false>::evalTo(derived(), other.derived());
 }

 } // end namespace Eigen
--- a/Eigen/src/Core/AssignEvaluator.h
+++ b/Eigen/src/Core/AssignEvaluator.h
@@ -1,788 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2011 Benoit Jacob <jacob.benoit.1@gmail.com>
-// Copyright (C) 2011-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
-// Copyright (C) 2011-2012 Jitse Niesen <jitse@maths.leeds.ac.uk>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_ASSIGN_EVALUATOR_H
-#define EIGEN_ASSIGN_EVALUATOR_H
-
-namespace Eigen {
-
-// This implementation is based on Assign.h
-
-namespace internal {
-  
-/***************************************************************************
-* Part 1 : the logic deciding a strategy for traversal and unrolling       *
-***************************************************************************/
-
-// copy_using_evaluator_traits is based on assign_traits
-
-template <typename DstEvaluator, typename SrcEvaluator, typename AssignFunc>
-struct copy_using_evaluator_traits
-{
-  typedef typename DstEvaluator::XprType Dst;
-  
-  enum {
-    DstFlags = DstEvaluator::Flags,
-    SrcFlags = SrcEvaluator::Flags
-  };
-  
-public:
-  enum {
-    DstIsAligned = DstFlags & AlignedBit,
-    DstHasDirectAccess = DstFlags & DirectAccessBit,
-    SrcIsAligned = SrcFlags & AlignedBit,
-    JointAlignment = bool(DstIsAligned) && bool(SrcIsAligned) ? Aligned : Unaligned
-  };
-
-private:
-  enum {
-    InnerSize = int(Dst::IsVectorAtCompileTime) ? int(Dst::SizeAtCompileTime)
-              : int(DstFlags)&RowMajorBit ? int(Dst::ColsAtCompileTime)
-              : int(Dst::RowsAtCompileTime),
-    InnerMaxSize = int(Dst::IsVectorAtCompileTime) ? int(Dst::MaxSizeAtCompileTime)
-              : int(DstFlags)&RowMajorBit ? int(Dst::MaxColsAtCompileTime)
-              : int(Dst::MaxRowsAtCompileTime),
-    MaxSizeAtCompileTime = Dst::SizeAtCompileTime,
-    PacketSize = packet_traits<typename Dst::Scalar>::size
-  };
-
-  enum {
-    DstIsRowMajor = DstFlags&RowMajorBit,
-    SrcIsRowMajor = SrcFlags&RowMajorBit,
-    StorageOrdersAgree = (int(DstIsRowMajor) == int(SrcIsRowMajor)),
-    MightVectorize = StorageOrdersAgree
-                  && (int(DstFlags) & int(SrcFlags) & ActualPacketAccessBit)
-                  && (functor_traits<AssignFunc>::PacketAccess),
-    MayInnerVectorize  = MightVectorize && int(InnerSize)!=Dynamic && int(InnerSize)%int(PacketSize)==0
-                       && int(DstIsAligned) && int(SrcIsAligned),
-    MayLinearize = StorageOrdersAgree && (int(DstFlags) & int(SrcFlags) & LinearAccessBit),
-    MayLinearVectorize = MightVectorize && MayLinearize && DstHasDirectAccess
-                       && (DstIsAligned || MaxSizeAtCompileTime == Dynamic),
-      /* If the destination isn't aligned, we have to do runtime checks and we don't unroll,
-         so it's only good for large enough sizes. */
-    MaySliceVectorize  = MightVectorize && DstHasDirectAccess
-                       && (int(InnerMaxSize)==Dynamic || int(InnerMaxSize)>=3*PacketSize)
-      /* slice vectorization can be slow, so we only want it if the slices are big, which is
-         indicated by InnerMaxSize rather than InnerSize, think of the case of a dynamic block
-         in a fixed-size matrix */
-  };
-
-public:
-  enum {
-    Traversal = int(MayInnerVectorize)   ? int(InnerVectorizedTraversal)
-              : int(MayLinearVectorize)  ? int(LinearVectorizedTraversal)
-              : int(MaySliceVectorize)   ? int(SliceVectorizedTraversal)
-              : int(MayLinearize)        ? int(LinearTraversal)
-                                         : int(DefaultTraversal),
-    Vectorized = int(Traversal) == InnerVectorizedTraversal
-              || int(Traversal) == LinearVectorizedTraversal
-              || int(Traversal) == SliceVectorizedTraversal
-  };
-
-private:
-  enum {
-    UnrollingLimit      = EIGEN_UNROLLING_LIMIT * (Vectorized ? int(PacketSize) : 1),
-    MayUnrollCompletely = int(Dst::SizeAtCompileTime) != Dynamic
-                       && int(SrcEvaluator::CoeffReadCost) != Dynamic
-                       && int(Dst::SizeAtCompileTime) * int(SrcEvaluator::CoeffReadCost) <= int(UnrollingLimit),
-    MayUnrollInner      = int(InnerSize) != Dynamic
-                       && int(SrcEvaluator::CoeffReadCost) != Dynamic
-                       && int(InnerSize) * int(SrcEvaluator::CoeffReadCost) <= int(UnrollingLimit)
-  };
-
-public:
-  enum {
-    Unrolling = (int(Traversal) == int(InnerVectorizedTraversal) || int(Traversal) == int(DefaultTraversal))
-                ? (
-                    int(MayUnrollCompletely) ? int(CompleteUnrolling)
-                  : int(MayUnrollInner)      ? int(InnerUnrolling)
-                                             : int(NoUnrolling)
-                  )
-              : int(Traversal) == int(LinearVectorizedTraversal)
-                ? ( bool(MayUnrollCompletely) && bool(DstIsAligned) ? int(CompleteUnrolling) 
-                                                                    : int(NoUnrolling) )
-              : int(Traversal) == int(LinearTraversal)
-                ? ( bool(MayUnrollCompletely) ? int(CompleteUnrolling) 
-                                              : int(NoUnrolling) )
-              : int(NoUnrolling)
-  };
-
-#ifdef EIGEN_DEBUG_ASSIGN
-  static void debug()
-  {
-    std::cerr << "DstXpr: " << typeid(typename DstEvaluator::XprType).name() << std::endl;
-    std::cerr << "SrcXpr: " << typeid(typename SrcEvaluator::XprType).name() << std::endl;
-    std::cerr.setf(std::ios::hex, std::ios::basefield);
-    EIGEN_DEBUG_VAR(DstFlags)
-    EIGEN_DEBUG_VAR(SrcFlags)
-    std::cerr.unsetf(std::ios::hex);
-    EIGEN_DEBUG_VAR(DstIsAligned)
-    EIGEN_DEBUG_VAR(SrcIsAligned)
-    EIGEN_DEBUG_VAR(JointAlignment)
-    EIGEN_DEBUG_VAR(InnerSize)
-    EIGEN_DEBUG_VAR(InnerMaxSize)
-    EIGEN_DEBUG_VAR(PacketSize)
-    EIGEN_DEBUG_VAR(StorageOrdersAgree)
-    EIGEN_DEBUG_VAR(MightVectorize)
-    EIGEN_DEBUG_VAR(MayLinearize)
-    EIGEN_DEBUG_VAR(MayInnerVectorize)
-    EIGEN_DEBUG_VAR(MayLinearVectorize)
-    EIGEN_DEBUG_VAR(MaySliceVectorize)
-    EIGEN_DEBUG_VAR(Traversal)
-    EIGEN_DEBUG_VAR(UnrollingLimit)
-    EIGEN_DEBUG_VAR(MayUnrollCompletely)
-    EIGEN_DEBUG_VAR(MayUnrollInner)
-    EIGEN_DEBUG_VAR(Unrolling)
-    std::cerr << std::endl;
-  }
-#endif
-};
-
-/***************************************************************************
-* Part 2 : meta-unrollers
-***************************************************************************/
-
-/************************
-*** Default traversal ***
-************************/
-
-template<typename Kernel, int Index, int Stop>
-struct copy_using_evaluator_DefaultTraversal_CompleteUnrolling
-{
-  // FIXME: this is not very clean, perhaps this information should be provided by the kernel?
-  typedef typename Kernel::DstEvaluatorType DstEvaluatorType;
-  typedef typename DstEvaluatorType::XprType DstXprType;
-  
-  enum {
-    outer = Index / DstXprType::InnerSizeAtCompileTime,
-    inner = Index % DstXprType::InnerSizeAtCompileTime
-  };
-
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
-  {
-    kernel.assignCoeffByOuterInner(outer, inner);
-    copy_using_evaluator_DefaultTraversal_CompleteUnrolling<Kernel, Index+1, Stop>::run(kernel);
-  }
-};
-
-template<typename Kernel, int Stop>
-struct copy_using_evaluator_DefaultTraversal_CompleteUnrolling<Kernel, Stop, Stop>
-{
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel&) { }
-};
-
-template<typename Kernel, int Index_, int Stop>
-struct copy_using_evaluator_DefaultTraversal_InnerUnrolling
-{
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel, Index outer)
-  {
-    kernel.assignCoeffByOuterInner(outer, Index_);
-    copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, Index_+1, Stop>::run(kernel, outer);
-  }
-};
-
-template<typename Kernel, int Stop>
-struct copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, Stop, Stop>
-{
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel&, Index) { }
-};
-
-/***********************
-*** Linear traversal ***
-***********************/
-
-template<typename Kernel, int Index, int Stop>
-struct copy_using_evaluator_LinearTraversal_CompleteUnrolling
-{
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel& kernel)
-  {
-    kernel.assignCoeff(Index);
-    copy_using_evaluator_LinearTraversal_CompleteUnrolling<Kernel, Index+1, Stop>::run(kernel);
-  }
-};
-
-template<typename Kernel, int Stop>
-struct copy_using_evaluator_LinearTraversal_CompleteUnrolling<Kernel, Stop, Stop>
-{
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel&) { }
-};
-
-/**************************
-*** Inner vectorization ***
-**************************/
-
-template<typename Kernel, int Index, int Stop>
-struct copy_using_evaluator_innervec_CompleteUnrolling
-{
-  // FIXME: this is not very clean, perhaps this information should be provided by the kernel?
-  typedef typename Kernel::DstEvaluatorType DstEvaluatorType;
-  typedef typename DstEvaluatorType::XprType DstXprType;
-  
-  enum {
-    outer = Index / DstXprType::InnerSizeAtCompileTime,
-    inner = Index % DstXprType::InnerSizeAtCompileTime,
-    JointAlignment = Kernel::AssignmentTraits::JointAlignment
-  };
-
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
-  {
-    kernel.template assignPacketByOuterInner<Aligned, JointAlignment>(outer, inner);
-    enum { NextIndex = Index + packet_traits<typename DstXprType::Scalar>::size };
-    copy_using_evaluator_innervec_CompleteUnrolling<Kernel, NextIndex, Stop>::run(kernel);
-  }
-};
-
-template<typename Kernel, int Stop>
-struct copy_using_evaluator_innervec_CompleteUnrolling<Kernel, Stop, Stop>
-{
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel&) { }
-};
-
-template<typename Kernel, int Index_, int Stop>
-struct copy_using_evaluator_innervec_InnerUnrolling
-{
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel, Index outer)
-  {
-    kernel.template assignPacketByOuterInner<Aligned, Aligned>(outer, Index_);
-    enum { NextIndex = Index_ + packet_traits<typename Kernel::Scalar>::size };
-    copy_using_evaluator_innervec_InnerUnrolling<Kernel, NextIndex, Stop>::run(kernel, outer);
-  }
-};
-
-template<typename Kernel, int Stop>
-struct copy_using_evaluator_innervec_InnerUnrolling<Kernel, Stop, Stop>
-{
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &, Index) { }
-};
-
-/***************************************************************************
-* Part 3 : implementation of all cases
-***************************************************************************/
-
-// dense_assignment_loop is based on assign_impl
-
-template<typename Kernel,
-         int Traversal = Kernel::AssignmentTraits::Traversal,
-         int Unrolling = Kernel::AssignmentTraits::Unrolling>
-struct dense_assignment_loop;
-
-/************************
-*** Default traversal ***
-************************/
-
-template<typename Kernel>
-struct dense_assignment_loop<Kernel, DefaultTraversal, NoUnrolling>
-{
-  EIGEN_DEVICE_FUNC static void run(Kernel &kernel)
-  {
-    for(Index outer = 0; outer < kernel.outerSize(); ++outer) {
-      for(Index inner = 0; inner < kernel.innerSize(); ++inner) {
-        kernel.assignCoeffByOuterInner(outer, inner);
-      }
-    }
-  }
-};
-
-template<typename Kernel>
-struct dense_assignment_loop<Kernel, DefaultTraversal, CompleteUnrolling>
-{
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
-  {
-    typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
-    copy_using_evaluator_DefaultTraversal_CompleteUnrolling<Kernel, 0, DstXprType::SizeAtCompileTime>::run(kernel);
-  }
-};
-
-template<typename Kernel>
-struct dense_assignment_loop<Kernel, DefaultTraversal, InnerUnrolling>
-{
-  typedef typename Kernel::StorageIndex StorageIndex;
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
-  {
-    typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
-
-    const Index outerSize = kernel.outerSize();
-    for(Index outer = 0; outer < outerSize; ++outer)
-      copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, 0, DstXprType::InnerSizeAtCompileTime>::run(kernel, outer);
-  }
-};
-
-/***************************
-*** Linear vectorization ***
-***************************/
-
-
-// The goal of unaligned_dense_assignment_loop is simply to factorize the handling
-// of the non vectorizable beginning and ending parts
-
-template <bool IsAligned = false>
-struct unaligned_dense_assignment_loop
-{
-  // if IsAligned = true, then do nothing
-  template <typename Kernel>
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel&, Index, Index) {}
-};
-
-template <>
-struct unaligned_dense_assignment_loop<false>
-{
-  // MSVC must not inline this functions. If it does, it fails to optimize the
-  // packet access path.
-  // FIXME check which version exhibits this issue
-#if EIGEN_COMP_MSVC
-  template <typename Kernel>
-  static EIGEN_DONT_INLINE void run(Kernel &kernel,
-                                    Index start,
-                                    Index end)
-#else
-  template <typename Kernel>
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel,
-                                      Index start,
-                                      Index end)
-#endif
-  {
-    for (Index index = start; index < end; ++index)
-      kernel.assignCoeff(index);
-  }
-};
-
-template<typename Kernel>
-struct dense_assignment_loop<Kernel, LinearVectorizedTraversal, NoUnrolling>
-{
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
-  {
-    const Index size = kernel.size();
-    typedef packet_traits<typename Kernel::Scalar> PacketTraits;
-    enum {
-      packetSize = PacketTraits::size,
-      dstIsAligned = int(Kernel::AssignmentTraits::DstIsAligned),
-      dstAlignment = PacketTraits::AlignedOnScalar ? Aligned : dstIsAligned,
-      srcAlignment = Kernel::AssignmentTraits::JointAlignment
-    };
-    const Index alignedStart = dstIsAligned ? 0 : internal::first_aligned(&kernel.dstEvaluator().coeffRef(0), size);
-    const Index alignedEnd = alignedStart + ((size-alignedStart)/packetSize)*packetSize;
-
-    unaligned_dense_assignment_loop<dstIsAligned!=0>::run(kernel, 0, alignedStart);
-
-    for(Index index = alignedStart; index < alignedEnd; index += packetSize)
-      kernel.template assignPacket<dstAlignment, srcAlignment>(index);
-
-    unaligned_dense_assignment_loop<>::run(kernel, alignedEnd, size);
-  }
-};
-
-template<typename Kernel>
-struct dense_assignment_loop<Kernel, LinearVectorizedTraversal, CompleteUnrolling>
-{
-  typedef typename Kernel::StorageIndex StorageIndex;
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
-  {
-    typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
-    
-    enum { size = DstXprType::SizeAtCompileTime,
-           packetSize = packet_traits<typename Kernel::Scalar>::size,
-           alignedSize = (size/packetSize)*packetSize };
-
-    copy_using_evaluator_innervec_CompleteUnrolling<Kernel, 0, alignedSize>::run(kernel);
-    copy_using_evaluator_DefaultTraversal_CompleteUnrolling<Kernel, alignedSize, size>::run(kernel);
-  }
-};
-
-/**************************
-*** Inner vectorization ***
-**************************/
-
-template<typename Kernel>
-struct dense_assignment_loop<Kernel, InnerVectorizedTraversal, NoUnrolling>
-{
-  EIGEN_DEVICE_FUNC static inline void run(Kernel &kernel)
-  {
-    const Index innerSize = kernel.innerSize();
-    const Index outerSize = kernel.outerSize();
-    const Index packetSize = packet_traits<typename Kernel::Scalar>::size;
-    for(Index outer = 0; outer < outerSize; ++outer)
-      for(Index inner = 0; inner < innerSize; inner+=packetSize)
-        kernel.template assignPacketByOuterInner<Aligned, Aligned>(outer, inner);
-  }
-};
-
-template<typename Kernel>
-struct dense_assignment_loop<Kernel, InnerVectorizedTraversal, CompleteUnrolling>
-{
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
-  {
-    typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
-    copy_using_evaluator_innervec_CompleteUnrolling<Kernel, 0, DstXprType::SizeAtCompileTime>::run(kernel);
-  }
-};
-
-template<typename Kernel>
-struct dense_assignment_loop<Kernel, InnerVectorizedTraversal, InnerUnrolling>
-{
-  typedef typename Kernel::StorageIndex StorageIndex;
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
-  {
-    typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
-    const Index outerSize = kernel.outerSize();
-    for(Index outer = 0; outer < outerSize; ++outer)
-      copy_using_evaluator_innervec_InnerUnrolling<Kernel, 0, DstXprType::InnerSizeAtCompileTime>::run(kernel, outer);
-  }
-};
-
-/***********************
-*** Linear traversal ***
-***********************/
-
-template<typename Kernel>
-struct dense_assignment_loop<Kernel, LinearTraversal, NoUnrolling>
-{
-  EIGEN_DEVICE_FUNC static inline void run(Kernel &kernel)
-  {
-    const Index size = kernel.size();
-    for(Index i = 0; i < size; ++i)
-      kernel.assignCoeff(i);
-  }
-};
-
-template<typename Kernel>
-struct dense_assignment_loop<Kernel, LinearTraversal, CompleteUnrolling>
-{
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
-  {
-    typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
-    copy_using_evaluator_LinearTraversal_CompleteUnrolling<Kernel, 0, DstXprType::SizeAtCompileTime>::run(kernel);
-  }
-};
-
-/**************************
-*** Slice vectorization ***
-***************************/
-
-template<typename Kernel>
-struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, NoUnrolling>
-{
-  EIGEN_DEVICE_FUNC static inline void run(Kernel &kernel)
-  {
-    typedef packet_traits<typename Kernel::Scalar> PacketTraits;
-    enum {
-      packetSize = PacketTraits::size,
-      alignable = PacketTraits::AlignedOnScalar,
-      dstAlignment = alignable ? Aligned : int(Kernel::AssignmentTraits::DstIsAligned)
-    };
-    const Index packetAlignedMask = packetSize - 1;
-    const Index innerSize = kernel.innerSize();
-    const Index outerSize = kernel.outerSize();
-    const Index alignedStep = alignable ? (packetSize - kernel.outerStride() % packetSize) & packetAlignedMask : 0;
-    Index alignedStart = ((!alignable) || Kernel::AssignmentTraits::DstIsAligned) ? 0
-                       : internal::first_aligned(&kernel.dstEvaluator().coeffRef(0,0), innerSize);
-
-    for(Index outer = 0; outer < outerSize; ++outer)
-    {
-      const Index alignedEnd = alignedStart + ((innerSize-alignedStart) & ~packetAlignedMask);
-      // do the non-vectorizable part of the assignment
-      for(Index inner = 0; inner<alignedStart ; ++inner)
-        kernel.assignCoeffByOuterInner(outer, inner);
-
-      // do the vectorizable part of the assignment
-      for(Index inner = alignedStart; inner<alignedEnd; inner+=packetSize)
-        kernel.template assignPacketByOuterInner<dstAlignment, Unaligned>(outer, inner);
-
-      // do the non-vectorizable part of the assignment
-      for(Index inner = alignedEnd; inner<innerSize ; ++inner)
-        kernel.assignCoeffByOuterInner(outer, inner);
-
-      alignedStart = std::min<Index>((alignedStart+alignedStep)%packetSize, innerSize);
-    }
-  }
-};
-
-/***************************************************************************
-* Part 4 : Generic dense assignment kernel
-***************************************************************************/
-
-// This class generalize the assignment of a coefficient (or packet) from one dense evaluator
-// to another dense writable evaluator.
-// It is parametrized by the two evaluators, and the actual assignment functor.
-// This abstraction level permits to keep the evaluation loops as simple and as generic as possible.
-// One can customize the assignment using this generic dense_assignment_kernel with different
-// functors, or by completely overloading it, by-passing a functor.
-template<typename DstEvaluatorTypeT, typename SrcEvaluatorTypeT, typename Functor, int Version = Specialized>
-class generic_dense_assignment_kernel
-{
-protected:
-  typedef typename DstEvaluatorTypeT::XprType DstXprType;
-  typedef typename SrcEvaluatorTypeT::XprType SrcXprType;
-public:
-  
-  typedef DstEvaluatorTypeT DstEvaluatorType;
-  typedef SrcEvaluatorTypeT SrcEvaluatorType;
-  typedef typename DstEvaluatorType::Scalar Scalar;
-  typedef typename DstEvaluatorType::StorageIndex StorageIndex;
-  typedef copy_using_evaluator_traits<DstEvaluatorTypeT, SrcEvaluatorTypeT, Functor> AssignmentTraits;
-  
-  
-  EIGEN_DEVICE_FUNC generic_dense_assignment_kernel(DstEvaluatorType &dst, const SrcEvaluatorType &src, const Functor &func, DstXprType& dstExpr)
-    : m_dst(dst), m_src(src), m_functor(func), m_dstExpr(dstExpr)
-  {
-    #ifdef EIGEN_DEBUG_ASSIGN
-    AssignmentTraits::debug();
-    #endif
-  }
-  
-  EIGEN_DEVICE_FUNC Index size() const        { return m_dstExpr.size(); }
-  EIGEN_DEVICE_FUNC Index innerSize() const   { return m_dstExpr.innerSize(); }
-  EIGEN_DEVICE_FUNC Index outerSize() const   { return m_dstExpr.outerSize(); }
-  EIGEN_DEVICE_FUNC Index rows() const        { return m_dstExpr.rows(); }
-  EIGEN_DEVICE_FUNC Index cols() const        { return m_dstExpr.cols(); }
-  EIGEN_DEVICE_FUNC Index outerStride() const { return m_dstExpr.outerStride(); }
-  
-  // TODO get rid of this one:
-  EIGEN_DEVICE_FUNC DstXprType& dstExpression() const { return m_dstExpr; }
-  
-  EIGEN_DEVICE_FUNC DstEvaluatorType& dstEvaluator() { return m_dst; }
-  EIGEN_DEVICE_FUNC const SrcEvaluatorType& srcEvaluator() const { return m_src; }
-  
-  /// Assign src(row,col) to dst(row,col) through the assignment functor.
-  EIGEN_DEVICE_FUNC void assignCoeff(Index row, Index col)
-  {
-    m_functor.assignCoeff(m_dst.coeffRef(row,col), m_src.coeff(row,col));
-  }
-  
-  /// \sa assignCoeff(Index,Index)
-  EIGEN_DEVICE_FUNC void assignCoeff(Index index)
-  {
-    m_functor.assignCoeff(m_dst.coeffRef(index), m_src.coeff(index));
-  }
-  
-  /// \sa assignCoeff(Index,Index)
-  EIGEN_DEVICE_FUNC void assignCoeffByOuterInner(Index outer, Index inner)
-  {
-    Index row = rowIndexByOuterInner(outer, inner); 
-    Index col = colIndexByOuterInner(outer, inner); 
-    assignCoeff(row, col);
-  }
-  
-  
-  template<int StoreMode, int LoadMode>
-  EIGEN_DEVICE_FUNC void assignPacket(Index row, Index col)
-  {
-    m_functor.template assignPacket<StoreMode>(&m_dst.coeffRef(row,col), m_src.template packet<LoadMode>(row,col));
-  }
-  
-  template<int StoreMode, int LoadMode>
-  EIGEN_DEVICE_FUNC void assignPacket(Index index)
-  {
-    m_functor.template assignPacket<StoreMode>(&m_dst.coeffRef(index), m_src.template packet<LoadMode>(index));
-  }
-  
-  template<int StoreMode, int LoadMode>
-  EIGEN_DEVICE_FUNC void assignPacketByOuterInner(Index outer, Index inner)
-  {
-    Index row = rowIndexByOuterInner(outer, inner); 
-    Index col = colIndexByOuterInner(outer, inner);
-    assignPacket<StoreMode,LoadMode>(row, col);
-  }
-  
-  EIGEN_DEVICE_FUNC static Index rowIndexByOuterInner(Index outer, Index inner)
-  {
-    typedef typename DstEvaluatorType::ExpressionTraits Traits;
-    return int(Traits::RowsAtCompileTime) == 1 ? 0
-      : int(Traits::ColsAtCompileTime) == 1 ? inner
-      : int(DstEvaluatorType::Flags)&RowMajorBit ? outer
-      : inner;
-  }
-
-  EIGEN_DEVICE_FUNC static Index colIndexByOuterInner(Index outer, Index inner)
-  {
-    typedef typename DstEvaluatorType::ExpressionTraits Traits;
-    return int(Traits::ColsAtCompileTime) == 1 ? 0
-      : int(Traits::RowsAtCompileTime) == 1 ? inner
-      : int(DstEvaluatorType::Flags)&RowMajorBit ? inner
-      : outer;
-  }
-  
-protected:
-  DstEvaluatorType& m_dst;
-  const SrcEvaluatorType& m_src;
-  const Functor &m_functor;
-  // TODO find a way to avoid the needs of the original expression
-  DstXprType& m_dstExpr;
-};
-
-/***************************************************************************
-* Part 5 : Entry point for dense rectangular assignment
-***************************************************************************/
-
-template<typename DstXprType, typename SrcXprType, typename Functor>
-EIGEN_DEVICE_FUNC void call_dense_assignment_loop(const DstXprType& dst, const SrcXprType& src, const Functor &func)
-{
-  eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
-  
-  typedef typename evaluator<DstXprType>::type DstEvaluatorType;
-  typedef typename evaluator<SrcXprType>::type SrcEvaluatorType;
-
-  DstEvaluatorType dstEvaluator(dst);
-  SrcEvaluatorType srcEvaluator(src);
-    
-  typedef generic_dense_assignment_kernel<DstEvaluatorType,SrcEvaluatorType,Functor> Kernel;
-  Kernel kernel(dstEvaluator, srcEvaluator, func, dst.const_cast_derived());
-  
-  dense_assignment_loop<Kernel>::run(kernel);
-}
-
-template<typename DstXprType, typename SrcXprType>
-EIGEN_DEVICE_FUNC void call_dense_assignment_loop(const DstXprType& dst, const SrcXprType& src)
-{
-  call_dense_assignment_loop(dst, src, internal::assign_op<typename DstXprType::Scalar>());
-}
-
-/***************************************************************************
-* Part 6 : Generic assignment
-***************************************************************************/
-
-// Based on the respective shapes of the destination and source,
-// the class AssignmentKind determine the kind of assignment mechanism.
-// AssignmentKind must define a Kind typedef.
-template<typename DstShape, typename SrcShape> struct AssignmentKind;
-
-// Assignement kind defined in this file:
-struct Dense2Dense {};
-struct EigenBase2EigenBase {};
-
-template<typename,typename> struct AssignmentKind { typedef EigenBase2EigenBase Kind; };
-template<> struct AssignmentKind<DenseShape,DenseShape> { typedef Dense2Dense Kind; };
-    
-// This is the main assignment class
-template< typename DstXprType, typename SrcXprType, typename Functor,
-          typename Kind = typename AssignmentKind< typename evaluator_traits<DstXprType>::Shape , typename evaluator_traits<SrcXprType>::Shape >::Kind,
-          typename Scalar = typename DstXprType::Scalar>
-struct Assignment;
-
-
-// The only purpose of this call_assignment() function is to deal with noalias() / AssumeAliasing and automatic transposition.
-// Indeed, I (Gael) think that this concept of AssumeAliasing was a mistake, and it makes thing quite complicated.
-// So this intermediate function removes everything related to AssumeAliasing such that Assignment
-// does not has to bother about these annoying details.
-
-template<typename Dst, typename Src>
-EIGEN_DEVICE_FUNC void call_assignment(Dst& dst, const Src& src)
-{
-  call_assignment(dst, src, internal::assign_op<typename Dst::Scalar>());
-}
-template<typename Dst, typename Src>
-EIGEN_DEVICE_FUNC void call_assignment(const Dst& dst, const Src& src)
-{
-  call_assignment(dst, src, internal::assign_op<typename Dst::Scalar>());
-}
-                     
-// Deal with AssumeAliasing
-template<typename Dst, typename Src, typename Func>
-EIGEN_DEVICE_FUNC void call_assignment(Dst& dst, const Src& src, const Func& func, typename enable_if<evaluator_traits<Src>::AssumeAliasing==1, void*>::type = 0)
-{
-  typename plain_matrix_type<Src>::type tmp(src);
-  call_assignment_no_alias(dst, tmp, func);
-}
-
-template<typename Dst, typename Src, typename Func>
-EIGEN_DEVICE_FUNC void call_assignment(Dst& dst, const Src& src, const Func& func, typename enable_if<evaluator_traits<Src>::AssumeAliasing==0, void*>::type = 0)
-{
-  call_assignment_no_alias(dst, src, func);
-}
-
-// by-pass AssumeAliasing
-// FIXME the const version should probably not be needed
-// When there is no aliasing, we require that 'dst' has been properly resized
-template<typename Dst, template <typename> class StorageBase, typename Src, typename Func>
-EIGEN_DEVICE_FUNC void call_assignment(const NoAlias<Dst,StorageBase>& dst, const Src& src, const Func& func)
-{
-  call_assignment_no_alias(dst.expression(), src, func);
-}
-template<typename Dst, template <typename> class StorageBase, typename Src, typename Func>
-EIGEN_DEVICE_FUNC void call_assignment(NoAlias<Dst,StorageBase>& dst, const Src& src, const Func& func)
-{
-  call_assignment_no_alias(dst.expression(), src, func);
-}
-
-
-template<typename Dst, typename Src, typename Func>
-EIGEN_DEVICE_FUNC void call_assignment_no_alias(Dst& dst, const Src& src, const Func& func)
-{
-  enum {
-    NeedToTranspose = (  (int(Dst::RowsAtCompileTime) == 1 && int(Src::ColsAtCompileTime) == 1)
-                        |   // FIXME | instead of || to please GCC 4.4.0 stupid warning "suggest parentheses around &&".
-                                // revert to || as soon as not needed anymore.
-                         (int(Dst::ColsAtCompileTime) == 1 && int(Src::RowsAtCompileTime) == 1))
-                     && int(Dst::SizeAtCompileTime) != 1
-  };
-
-  Index dstRows = NeedToTranspose ? src.cols() : src.rows();
-  Index dstCols = NeedToTranspose ? src.rows() : src.cols();
-  if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
-    dst.resize(dstRows, dstCols);
-  
-  typedef typename internal::conditional<NeedToTranspose, Transpose<Dst>, Dst>::type ActualDstTypeCleaned;
-  typedef typename internal::conditional<NeedToTranspose, Transpose<Dst>, Dst&>::type ActualDstType;
-  ActualDstType actualDst(dst);
-  
-  // TODO check whether this is the right place to perform these checks:
-  EIGEN_STATIC_ASSERT_LVALUE(Dst)
-  EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(ActualDstTypeCleaned,Src)
-
-  // TODO this line is commented to allow matrix = permutation
-  // Actually, the "Scalar" type for a permutation matrix does not really make sense,
-  // perhaps it could be void, and EIGEN_CHECK_BINARY_COMPATIBILIY could allow micing void with anything...?
-//   EIGEN_CHECK_BINARY_COMPATIBILIY(Func,typename ActualDstTypeCleaned::Scalar,typename Src::Scalar);
-  
-  Assignment<ActualDstTypeCleaned,Src,Func>::run(actualDst, src, func);
-}
-template<typename Dst, typename Src>
-EIGEN_DEVICE_FUNC void call_assignment_no_alias(Dst& dst, const Src& src)
-{
-  call_assignment_no_alias(dst, src, internal::assign_op<typename Dst::Scalar>());
-}
-
-// forward declaration
-template<typename Dst, typename Src> void check_for_aliasing(const Dst &dst, const Src &src);
-
-// Generic Dense to Dense assignment
-template< typename DstXprType, typename SrcXprType, typename Functor, typename Scalar>
-struct Assignment<DstXprType, SrcXprType, Functor, Dense2Dense, Scalar>
-{
-  EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const Functor &func)
-  {
-    eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
-    
-#ifndef EIGEN_NO_DEBUG
-    internal::check_for_aliasing(dst, src);
-#endif
-    
-    call_dense_assignment_loop(dst, src, func);
-  }
-};
-
-// Generic assignment through evalTo.
-// TODO: not sure we have to keep that one, but it helps porting current code to new evaluator mechanism.
-template< typename DstXprType, typename SrcXprType, typename Functor, typename Scalar>
-struct Assignment<DstXprType, SrcXprType, Functor, EigenBase2EigenBase, Scalar>
-{
-  EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar> &/*func*/)
-  {
-    eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
-    
-    src.evalTo(dst);
-  }
-};
-
-} // namespace internal
-
-} // end namespace Eigen
-
-#endif // EIGEN_ASSIGN_EVALUATOR_H
--- a/Eigen/src/Core/Assign_MKL.h
+++ b/Eigen/src/Core/Assign_MKL.h
@@ -84,6 +84,7 @@ template<typename Derived1, typename Derived2, typename UnaryOp, int Traversal,
 struct vml_assign_impl<Derived1, Derived2, UnaryOp, Traversal, Unrolling, InnerVectorizedTraversal>
 {
  typedef typename Derived1::Scalar Scalar;
+  typedef typename Derived1::Index Index;
  static inline void run(Derived1& dst, const CwiseUnaryOp<UnaryOp, Derived2>& src)
  {
    // in case we want to (or have to) skip VML at runtime we can call:
@@ -201,7 +202,6 @@ EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(asin, Asin)
 EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(cos,  Cos)
 EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(acos, Acos)
 EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(tan,  Tan)
-EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(atan,  Atan)
 //EIGEN_MKL_VML_DECLARE_UNARY_CALLS(abs,  Abs)
 EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(exp,  Exp)
 EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(log,  Ln)
--- a/Eigen/src/Core/BandMatrix.h
+++ b/Eigen/src/Core/BandMatrix.h
@@ -32,7 +32,7 @@ class BandMatrixBase : public EigenBase<Derived>
    };
    typedef typename internal::traits<Derived>::Scalar Scalar;
    typedef Matrix<Scalar,RowsAtCompileTime,ColsAtCompileTime> DenseMatrixType;
-    typedef typename DenseMatrixType::StorageIndex StorageIndex;
+    typedef typename DenseMatrixType::Index Index;
    typedef typename internal::traits<Derived>::CoefficientsType CoefficientsType;
    typedef EigenBase<Derived> Base;

@@ -179,7 +179,7 @@ struct traits<BandMatrix<_Scalar,_Rows,_Cols,_Supers,_Subs,_Options> >
 {
  typedef _Scalar Scalar;
  typedef Dense StorageKind;
-  typedef Eigen::Index StorageIndex;
+  typedef DenseIndex Index;
  enum {
    CoeffReadCost = NumTraits<Scalar>::ReadCost,
    RowsAtCompileTime = _Rows,
@@ -201,10 +201,10 @@ class BandMatrix : public BandMatrixBase<BandMatrix<_Scalar,Rows,Cols,Supers,Sub
  public:

    typedef typename internal::traits<BandMatrix>::Scalar Scalar;
-    typedef typename internal::traits<BandMatrix>::StorageIndex StorageIndex;
+    typedef typename internal::traits<BandMatrix>::Index Index;
    typedef typename internal::traits<BandMatrix>::CoefficientsType CoefficientsType;

-    explicit inline BandMatrix(Index rows=Rows, Index cols=Cols, Index supers=Supers, Index subs=Subs)
+    inline BandMatrix(Index rows=Rows, Index cols=Cols, Index supers=Supers, Index subs=Subs)
      : m_coeffs(1+supers+subs,cols),
        m_rows(rows), m_supers(supers), m_subs(subs)
    {
@@ -241,7 +241,7 @@ struct traits<BandMatrixWrapper<_CoefficientsType,_Rows,_Cols,_Supers,_Subs,_Opt
 {
  typedef typename _CoefficientsType::Scalar Scalar;
  typedef typename _CoefficientsType::StorageKind StorageKind;
-  typedef typename _CoefficientsType::StorageIndex StorageIndex;
+  typedef typename _CoefficientsType::Index Index;
  enum {
    CoeffReadCost = internal::traits<_CoefficientsType>::CoeffReadCost,
    RowsAtCompileTime = _Rows,
@@ -264,9 +264,9 @@ class BandMatrixWrapper : public BandMatrixBase<BandMatrixWrapper<_CoefficientsT

    typedef typename internal::traits<BandMatrixWrapper>::Scalar Scalar;
    typedef typename internal::traits<BandMatrixWrapper>::CoefficientsType CoefficientsType;
-    typedef typename internal::traits<BandMatrixWrapper>::StorageIndex StorageIndex;
+    typedef typename internal::traits<BandMatrixWrapper>::Index Index;

-    explicit inline BandMatrixWrapper(const CoefficientsType& coeffs, Index rows=_Rows, Index cols=_Cols, Index supers=_Supers, Index subs=_Subs)
+    inline BandMatrixWrapper(const CoefficientsType& coeffs, Index rows=_Rows, Index cols=_Cols, Index supers=_Supers, Index subs=_Subs)
      : m_coeffs(coeffs),
        m_rows(rows), m_supers(supers), m_subs(subs)
    {
@@ -312,9 +312,9 @@ template<typename Scalar, int Size, int Options>
 class TridiagonalMatrix : public BandMatrix<Scalar,Size,Size,Options&SelfAdjoint?0:1,1,Options|RowMajor>
 {
    typedef BandMatrix<Scalar,Size,Size,Options&SelfAdjoint?0:1,1,Options|RowMajor> Base;
-    typedef typename Base::StorageIndex StorageIndex;
+    typedef typename Base::Index Index;
  public:
-    explicit TridiagonalMatrix(Index size = Size) : Base(size,size,Options&SelfAdjoint?0:1,1) {}
+    TridiagonalMatrix(Index size = Size) : Base(size,size,Options&SelfAdjoint?0:1,1) {}

    inline typename Base::template DiagonalIntReturnType<1>::Type super()
    { return Base::template diagonal<1>(); }
@@ -327,25 +327,6 @@ class TridiagonalMatrix : public BandMatrix<Scalar,Size,Size,Options&SelfAdjoint
  protected:
 };

-
-struct BandShape {};
-
-template<typename _Scalar, int _Rows, int _Cols, int _Supers, int _Subs, int _Options>
-struct evaluator_traits<BandMatrix<_Scalar,_Rows,_Cols,_Supers,_Subs,_Options> >
-  : public evaluator_traits_base<BandMatrix<_Scalar,_Rows,_Cols,_Supers,_Subs,_Options> >
-{
-  typedef BandShape Shape;
-};
-
-template<typename _CoefficientsType,int _Rows, int _Cols, int _Supers, int _Subs,int _Options>
-struct evaluator_traits<BandMatrixWrapper<_CoefficientsType,_Rows,_Cols,_Supers,_Subs,_Options> >
-  : public evaluator_traits_base<BandMatrixWrapper<_CoefficientsType,_Rows,_Cols,_Supers,_Subs,_Options> >
-{
-  typedef BandShape Shape;
-};
-
-template<> struct AssignmentKind<DenseShape,BandShape> { typedef EigenBase2EigenBase Kind; };
-
 } // end namespace internal

 } // end namespace Eigen
--- a/Eigen/src/Core/Block.h
+++ b/Eigen/src/Core/Block.h
@@ -21,9 +21,6 @@ namespace Eigen {
  * \param XprType the type of the expression in which we are taking a block
  * \param BlockRows the number of rows of the block we are taking at compile time (optional)
  * \param BlockCols the number of columns of the block we are taking at compile time (optional)
-  * \param InnerPanel is true, if the block maps to a set of rows of a row major matrix or
-  *        to set of columns of a column major matrix (optional). The parameter allows to determine
-  *        at compile time whether aligned access is possible on the block expression.
  *
  * This class represents an expression of either a fixed-size or dynamic-size block. It is the return
  * type of DenseBase::block(Index,Index,Index,Index) and DenseBase::block<int,int>(Index,Index) and
@@ -68,10 +65,10 @@ struct traits<Block<XprType, BlockRows, BlockCols, InnerPanel> > : traits<XprTyp
    MaxColsAtCompileTime = BlockCols==0 ? 0
                         : ColsAtCompileTime != Dynamic ? int(ColsAtCompileTime)
                         : int(traits<XprType>::MaxColsAtCompileTime),
-
    XprTypeIsRowMajor = (int(traits<XprType>::Flags)&RowMajorBit) != 0,
-    IsRowMajor = (MaxRowsAtCompileTime==1&&MaxColsAtCompileTime!=1) ? 1
-               : (MaxColsAtCompileTime==1&&MaxRowsAtCompileTime!=1) ? 0
+    IsDense = is_same<StorageKind,Dense>::value,
+    IsRowMajor = (IsDense&&MaxRowsAtCompileTime==1&&MaxColsAtCompileTime!=1) ? 1
+               : (IsDense&&MaxColsAtCompileTime==1&&MaxRowsAtCompileTime!=1) ? 0
               : XprTypeIsRowMajor,
    HasSameStorageOrderAsXprType = (IsRowMajor == XprTypeIsRowMajor),
    InnerSize = IsRowMajor ? int(ColsAtCompileTime) : int(RowsAtCompileTime),
@@ -81,14 +78,18 @@ struct traits<Block<XprType, BlockRows, BlockCols, InnerPanel> > : traits<XprTyp
    OuterStrideAtCompileTime = HasSameStorageOrderAsXprType
                             ? int(outer_stride_at_compile_time<XprType>::ret)
                             : int(inner_stride_at_compile_time<XprType>::ret),
-    // IsAligned is needed by MapBase's assertions
-    // We can sefely set it to false here. Internal alignment errors will be detected by an eigen_internal_assert in the respective evaluator
-    IsAligned = 0,
-    // FIXME, this traits is rather specialized for dense object and it needs to be cleaned further
+    MaskPacketAccessBit = (InnerSize == Dynamic || (InnerSize % packet_traits<Scalar>::size) == 0)
+                       && (InnerStrideAtCompileTime == 1)
+                        ? PacketAccessBit : 0,
+    MaskAlignedBit = (InnerPanel && (OuterStrideAtCompileTime!=Dynamic) && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % 16) == 0)) ? AlignedBit : 0,
+    FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1 || (InnerPanel && (traits<XprType>::Flags&LinearAccessBit))) ? LinearAccessBit : 0,
    FlagsLvalueBit = is_lvalue<XprType>::value ? LvalueBit : 0,
    FlagsRowMajorBit = IsRowMajor ? RowMajorBit : 0,
-    Flags = (traits<XprType>::Flags & (DirectAccessBit | (InnerPanel?CompressedAccessBit:0))) | FlagsLvalueBit | FlagsRowMajorBit
-    // FIXME DirectAccessBit should not be handled by expressions
+    Flags0 = traits<XprType>::Flags & ( (HereditaryBits & ~RowMajorBit) |
+                                        DirectAccessBit |
+                                        MaskPacketAccessBit |
+                                        MaskAlignedBit),
+    Flags = Flags0 | FlagsLinearAccessBit | FlagsLvalueBit | FlagsRowMajorBit
  };
 };

@@ -108,12 +109,9 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel> class
    typedef Impl Base;
    EIGEN_GENERIC_PUBLIC_INTERFACE(Block)
    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Block)
-    
-    typedef typename internal::remove_all<XprType>::type NestedExpression;
  
    /** Column or Row constructor
      */
-    EIGEN_DEVICE_FUNC
    inline Block(XprType& xpr, Index i) : Impl(xpr,i)
    {
      eigen_assert( (i>=0) && (
@@ -123,7 +121,6 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel> class

    /** Fixed-size constructor
      */
-    EIGEN_DEVICE_FUNC
    inline Block(XprType& xpr, Index a_startRow, Index a_startCol)
      : Impl(xpr, a_startRow, a_startCol)
    {
@@ -134,7 +131,6 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel> class

    /** Dynamic-size constructor
      */
-    EIGEN_DEVICE_FUNC
    inline Block(XprType& xpr,
          Index a_startRow, Index a_startCol,
          Index blockRows, Index blockCols)
@@ -154,13 +150,12 @@ class BlockImpl<XprType, BlockRows, BlockCols, InnerPanel, Dense>
  : public internal::BlockImpl_dense<XprType, BlockRows, BlockCols, InnerPanel>
 {
    typedef internal::BlockImpl_dense<XprType, BlockRows, BlockCols, InnerPanel> Impl;
-    typedef typename XprType::StorageIndex StorageIndex;
+    typedef typename XprType::Index Index;
  public:
    typedef Impl Base;
    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(BlockImpl)
-    EIGEN_DEVICE_FUNC inline BlockImpl(XprType& xpr, Index i) : Impl(xpr,i) {}
-    EIGEN_DEVICE_FUNC inline BlockImpl(XprType& xpr, Index a_startRow, Index a_startCol) : Impl(xpr, a_startRow, a_startCol) {}
-    EIGEN_DEVICE_FUNC
+    inline BlockImpl(XprType& xpr, Index i) : Impl(xpr,i) {}
+    inline BlockImpl(XprType& xpr, Index a_startRow, Index a_startCol) : Impl(xpr, a_startRow, a_startCol) {}
    inline BlockImpl(XprType& xpr, Index a_startRow, Index a_startCol, Index blockRows, Index blockCols)
      : Impl(xpr, a_startRow, a_startCol, blockRows, blockCols) {}
 };
@@ -178,11 +173,10 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, bool H
    EIGEN_DENSE_PUBLIC_INTERFACE(BlockType)
    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(BlockImpl_dense)

-    // class InnerIterator; // FIXME apparently never used
+    class InnerIterator;

    /** Column or Row constructor
      */
-    EIGEN_DEVICE_FUNC
    inline BlockImpl_dense(XprType& xpr, Index i)
      : m_xpr(xpr),
        // It is a row if and only if BlockRows==1 and BlockCols==XprType::ColsAtCompileTime,
@@ -197,7 +191,6 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, bool H

    /** Fixed-size constructor
      */
-    EIGEN_DEVICE_FUNC
    inline BlockImpl_dense(XprType& xpr, Index a_startRow, Index a_startCol)
      : m_xpr(xpr), m_startRow(a_startRow), m_startCol(a_startCol),
                    m_blockRows(BlockRows), m_blockCols(BlockCols)
@@ -205,7 +198,6 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, bool H

    /** Dynamic-size constructor
      */
-    EIGEN_DEVICE_FUNC
    inline BlockImpl_dense(XprType& xpr,
          Index a_startRow, Index a_startCol,
          Index blockRows, Index blockCols)
@@ -213,10 +205,9 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, bool H
                    m_blockRows(blockRows), m_blockCols(blockCols)
    {}

-    EIGEN_DEVICE_FUNC inline Index rows() const { return m_blockRows.value(); }
-    EIGEN_DEVICE_FUNC inline Index cols() const { return m_blockCols.value(); }
+    inline Index rows() const { return m_blockRows.value(); }
+    inline Index cols() const { return m_blockCols.value(); }

-    EIGEN_DEVICE_FUNC
    inline Scalar& coeffRef(Index rowId, Index colId)
    {
      EIGEN_STATIC_ASSERT_LVALUE(XprType)
@@ -224,20 +215,17 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, bool H
               .coeffRef(rowId + m_startRow.value(), colId + m_startCol.value());
    }

-    EIGEN_DEVICE_FUNC
    inline const Scalar& coeffRef(Index rowId, Index colId) const
    {
      return m_xpr.derived()
               .coeffRef(rowId + m_startRow.value(), colId + m_startCol.value());
    }

-    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE const CoeffReturnType coeff(Index rowId, Index colId) const
    {
      return m_xpr.coeff(rowId + m_startRow.value(), colId + m_startCol.value());
    }

-    EIGEN_DEVICE_FUNC
    inline Scalar& coeffRef(Index index)
    {
      EIGEN_STATIC_ASSERT_LVALUE(XprType)
@@ -246,7 +234,6 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, bool H
                       m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
    }

-    EIGEN_DEVICE_FUNC
    inline const Scalar& coeffRef(Index index) const
    {
      return m_xpr.const_cast_derived()
@@ -254,7 +241,6 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, bool H
                       m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
    }

-    EIGEN_DEVICE_FUNC
    inline const CoeffReturnType coeff(Index index) const
    {
      return m_xpr
@@ -294,25 +280,22 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, bool H

    #ifdef EIGEN_PARSED_BY_DOXYGEN
    /** \sa MapBase::data() */
-    EIGEN_DEVICE_FUNC inline const Scalar* data() const;
-    EIGEN_DEVICE_FUNC inline Index innerStride() const;
-    EIGEN_DEVICE_FUNC inline Index outerStride() const;
+    inline const Scalar* data() const;
+    inline Index innerStride() const;
+    inline Index outerStride() const;
    #endif

-    EIGEN_DEVICE_FUNC
-    const typename internal::remove_all<typename XprType::Nested>::type& nestedExpression() const
+    const typename internal::remove_all<typename XprType::Nested>::type& nestedExpression() const 
    { 
      return m_xpr; 
    }
      
-    EIGEN_DEVICE_FUNC
-    StorageIndex startRow() const
+    Index startRow() const 
    { 
      return m_startRow.value(); 
    }
      
-    EIGEN_DEVICE_FUNC
-    StorageIndex startCol() const
+    Index startCol() const 
    { 
      return m_startCol.value(); 
    }
@@ -320,10 +303,10 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, bool H
  protected:

    const typename XprType::Nested m_xpr;
-    const internal::variable_if_dynamic<StorageIndex, XprType::RowsAtCompileTime == 1 ? 0 : Dynamic> m_startRow;
-    const internal::variable_if_dynamic<StorageIndex, XprType::ColsAtCompileTime == 1 ? 0 : Dynamic> m_startCol;
-    const internal::variable_if_dynamic<StorageIndex, RowsAtCompileTime> m_blockRows;
-    const internal::variable_if_dynamic<StorageIndex, ColsAtCompileTime> m_blockCols;
+    const internal::variable_if_dynamic<Index, XprType::RowsAtCompileTime == 1 ? 0 : Dynamic> m_startRow;
+    const internal::variable_if_dynamic<Index, XprType::ColsAtCompileTime == 1 ? 0 : Dynamic> m_startCol;
+    const internal::variable_if_dynamic<Index, RowsAtCompileTime> m_blockRows;
+    const internal::variable_if_dynamic<Index, ColsAtCompileTime> m_blockCols;
 };

 /** \internal Internal implementation of dense Blocks in the direct access case.*/
@@ -332,9 +315,6 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>
  : public MapBase<Block<XprType, BlockRows, BlockCols, InnerPanel> >
 {
    typedef Block<XprType, BlockRows, BlockCols, InnerPanel> BlockType;
-    enum {
-      XprTypeIsRowMajor = (int(traits<XprType>::Flags)&RowMajorBit) != 0
-    };
  public:

    typedef MapBase<BlockType> Base;
@@ -343,10 +323,10 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>

    /** Column or Row constructor
      */
-    EIGEN_DEVICE_FUNC
    inline BlockImpl_dense(XprType& xpr, Index i)
-      : Base(xpr.data() + i * (    ((BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) && (!XprTypeIsRowMajor)) 
-                                || ((BlockRows==XprType::RowsAtCompileTime) && (BlockCols==1) && ( XprTypeIsRowMajor)) ? xpr.innerStride() : xpr.outerStride()),
+      : Base(internal::const_cast_ptr(&xpr.coeffRef(
+              (BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) ? i : 0,
+              (BlockRows==XprType::RowsAtCompileTime) && (BlockCols==1) ? i : 0)),
             BlockRows==1 ? 1 : xpr.rows(),
             BlockCols==1 ? 1 : xpr.cols()),
        m_xpr(xpr)
@@ -356,34 +336,29 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>

    /** Fixed-size constructor
      */
-    EIGEN_DEVICE_FUNC
    inline BlockImpl_dense(XprType& xpr, Index startRow, Index startCol)
-      : Base(xpr.data()+xpr.innerStride()*(XprTypeIsRowMajor?startCol:startRow) + xpr.outerStride()*(XprTypeIsRowMajor?startRow:startCol)),
-        m_xpr(xpr)
+      : Base(internal::const_cast_ptr(&xpr.coeffRef(startRow,startCol))), m_xpr(xpr)
    {
      init();
    }

    /** Dynamic-size constructor
      */
-    EIGEN_DEVICE_FUNC
    inline BlockImpl_dense(XprType& xpr,
          Index startRow, Index startCol,
          Index blockRows, Index blockCols)
-      : Base(xpr.data()+xpr.innerStride()*(XprTypeIsRowMajor?startCol:startRow) + xpr.outerStride()*(XprTypeIsRowMajor?startRow:startCol), blockRows, blockCols),
+      : Base(internal::const_cast_ptr(&xpr.coeffRef(startRow,startCol)), blockRows, blockCols),
        m_xpr(xpr)
    {
      init();
    }

-    EIGEN_DEVICE_FUNC
-    const typename internal::remove_all<typename XprType::Nested>::type& nestedExpression() const
+    const typename internal::remove_all<typename XprType::Nested>::type& nestedExpression() const 
    { 
      return m_xpr; 
    }
      
    /** \sa MapBase::innerStride() */
-    EIGEN_DEVICE_FUNC
    inline Index innerStride() const
    {
      return internal::traits<BlockType>::HasSameStorageOrderAsXprType
@@ -392,7 +367,6 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>
    }

    /** \sa MapBase::outerStride() */
-    EIGEN_DEVICE_FUNC
    inline Index outerStride() const
    {
      return m_outerStride;
@@ -406,7 +380,6 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>

    #ifndef EIGEN_PARSED_BY_DOXYGEN
    /** \internal used by allowAligned() */
-    EIGEN_DEVICE_FUNC
    inline BlockImpl_dense(XprType& xpr, const Scalar* data, Index blockRows, Index blockCols)
      : Base(data, blockRows, blockCols), m_xpr(xpr)
    {
@@ -415,7 +388,6 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>
    #endif

  protected:
-    EIGEN_DEVICE_FUNC
    void init()
    {
      m_outerStride = internal::traits<BlockType>::HasSameStorageOrderAsXprType
--- a/Eigen/src/Core/BooleanRedux.h
+++ b/Eigen/src/Core/BooleanRedux.h
@@ -17,10 +17,9 @@ namespace internal {
 template<typename Derived, int UnrollCount>
 struct all_unroller
 {
-  typedef typename Derived::ExpressionTraits Traits;
  enum {
-    col = (UnrollCount-1) / Traits::RowsAtCompileTime,
-    row = (UnrollCount-1) % Traits::RowsAtCompileTime
+    col = (UnrollCount-1) / Derived::RowsAtCompileTime,
+    row = (UnrollCount-1) % Derived::RowsAtCompileTime
  };

  static inline bool run(const Derived &mat)
@@ -44,12 +43,11 @@ struct all_unroller<Derived, Dynamic>
 template<typename Derived, int UnrollCount>
 struct any_unroller
 {
-  typedef typename Derived::ExpressionTraits Traits;
  enum {
-    col = (UnrollCount-1) / Traits::RowsAtCompileTime,
-    row = (UnrollCount-1) % Traits::RowsAtCompileTime
+    col = (UnrollCount-1) / Derived::RowsAtCompileTime,
+    row = (UnrollCount-1) % Derived::RowsAtCompileTime
  };
-  
+
  static inline bool run(const Derived &mat)
  {
    return any_unroller<Derived, UnrollCount-1>::run(mat) || mat.coeff(row, col);
@@ -80,21 +78,19 @@ struct any_unroller<Derived, Dynamic>
 template<typename Derived>
 inline bool DenseBase<Derived>::all() const
 {
-  typedef typename internal::evaluator<Derived>::type Evaluator;
  enum {
    unroll = SizeAtCompileTime != Dynamic
-          && Evaluator::CoeffReadCost != Dynamic
+          && CoeffReadCost != Dynamic
          && NumTraits<Scalar>::AddCost != Dynamic
-          && SizeAtCompileTime * (Evaluator::CoeffReadCost + NumTraits<Scalar>::AddCost) <= EIGEN_UNROLLING_LIMIT
+          && SizeAtCompileTime * (CoeffReadCost + NumTraits<Scalar>::AddCost) <= EIGEN_UNROLLING_LIMIT
  };
-  Evaluator evaluator(derived());
  if(unroll)
-    return internal::all_unroller<Evaluator, unroll ? int(SizeAtCompileTime) : Dynamic>::run(evaluator);
+    return internal::all_unroller<Derived, unroll ? int(SizeAtCompileTime) : Dynamic>::run(derived());
  else
  {
    for(Index j = 0; j < cols(); ++j)
      for(Index i = 0; i < rows(); ++i)
-        if (!evaluator.coeff(i, j)) return false;
+        if (!coeff(i, j)) return false;
    return true;
  }
 }
@@ -106,21 +102,19 @@ inline bool DenseBase<Derived>::all() const
 template<typename Derived>
 inline bool DenseBase<Derived>::any() const
 {
-  typedef typename internal::evaluator<Derived>::type Evaluator;
  enum {
    unroll = SizeAtCompileTime != Dynamic
-          && Evaluator::CoeffReadCost != Dynamic
+          && CoeffReadCost != Dynamic
          && NumTraits<Scalar>::AddCost != Dynamic
-          && SizeAtCompileTime * (Evaluator::CoeffReadCost + NumTraits<Scalar>::AddCost) <= EIGEN_UNROLLING_LIMIT
+          && SizeAtCompileTime * (CoeffReadCost + NumTraits<Scalar>::AddCost) <= EIGEN_UNROLLING_LIMIT
  };
-  Evaluator evaluator(derived());
  if(unroll)
-    return internal::any_unroller<Evaluator, unroll ? int(SizeAtCompileTime) : Dynamic>::run(evaluator);
+    return internal::any_unroller<Derived, unroll ? int(SizeAtCompileTime) : Dynamic>::run(derived());
  else
  {
    for(Index j = 0; j < cols(); ++j)
      for(Index i = 0; i < rows(); ++i)
-        if (evaluator.coeff(i, j)) return true;
+        if (coeff(i, j)) return true;
    return false;
  }
 }
@@ -130,7 +124,7 @@ inline bool DenseBase<Derived>::any() const
  * \sa all(), any()
  */
 template<typename Derived>
-inline Eigen::Index DenseBase<Derived>::count() const
+inline typename DenseBase<Derived>::Index DenseBase<Derived>::count() const
 {
  return derived().template cast<bool>().template cast<Index>().sum();
 }
--- a/Eigen/src/Core/CMakeLists.txt
+++ b/Eigen/src/Core/CMakeLists.txt
@@ -8,4 +8,3 @@ INSTALL(FILES
 ADD_SUBDIRECTORY(products)
 ADD_SUBDIRECTORY(util)
 ADD_SUBDIRECTORY(arch)
-ADD_SUBDIRECTORY(functors)
--- a/Eigen/src/Core/CommaInitializer.h
+++ b/Eigen/src/Core/CommaInitializer.h
@@ -28,8 +28,8 @@ template<typename XprType>
 struct CommaInitializer
 {
  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::Index Index;

-  EIGEN_DEVICE_FUNC
  inline CommaInitializer(XprType& xpr, const Scalar& s)
    : m_xpr(xpr), m_row(0), m_col(1), m_currentBlockRows(1)
  {
@@ -37,7 +37,6 @@ struct CommaInitializer
  }

  template<typename OtherDerived>
-  EIGEN_DEVICE_FUNC
  inline CommaInitializer(XprType& xpr, const DenseBase<OtherDerived>& other)
    : m_xpr(xpr), m_row(0), m_col(other.cols()), m_currentBlockRows(other.rows())
  {
@@ -47,7 +46,6 @@ struct CommaInitializer
  /* Copy/Move constructor which transfers ownership. This is crucial in 
   * absence of return value optimization to avoid assertions during destruction. */
  // FIXME in C++11 mode this could be replaced by a proper RValue constructor
-  EIGEN_DEVICE_FUNC
  inline CommaInitializer(const CommaInitializer& o)
  : m_xpr(o.m_xpr), m_row(o.m_row), m_col(o.m_col), m_currentBlockRows(o.m_currentBlockRows) {
    // Mark original object as finished. In absence of R-value references we need to const_cast:
@@ -57,7 +55,6 @@ struct CommaInitializer
  }

  /* inserts a scalar value in the target matrix */
-  EIGEN_DEVICE_FUNC
  CommaInitializer& operator,(const Scalar& s)
  {
    if (m_col==m_xpr.cols())
@@ -77,11 +74,13 @@ struct CommaInitializer

  /* inserts a matrix expression in the target matrix */
  template<typename OtherDerived>
-  EIGEN_DEVICE_FUNC
  CommaInitializer& operator,(const DenseBase<OtherDerived>& other)
  {
-    if(other.cols()==0 || other.rows()==0)
+    if(other.rows()==0)
+    {
+      m_col += other.cols();
      return *this;
+    }
    if (m_col==m_xpr.cols())
    {
      m_row+=m_currentBlockRows;
@@ -90,7 +89,7 @@ struct CommaInitializer
      eigen_assert(m_row+m_currentBlockRows<=m_xpr.rows()
        && "Too many rows passed to comma initializer (operator<<)");
    }
-    eigen_assert(m_col<m_xpr.cols()
+    eigen_assert((m_col<m_xpr.cols() || (m_xpr.cols()==0 && m_col==0))
      && "Too many coefficients passed to comma initializer (operator<<)");
    eigen_assert(m_currentBlockRows==other.rows());
    if (OtherDerived::SizeAtCompileTime != Dynamic)
@@ -103,7 +102,6 @@ struct CommaInitializer
    return *this;
  }

-  EIGEN_DEVICE_FUNC
  inline ~CommaInitializer()
  {
    eigen_assert((m_row+m_currentBlockRows) == m_xpr.rows()
@@ -118,10 +116,9 @@ struct CommaInitializer
    * quaternion.fromRotationMatrix((Matrix3f() << axis0, axis1, axis2).finished());
    * \endcode
    */
-  EIGEN_DEVICE_FUNC
  inline XprType& finished() { return m_xpr; }

-  XprType& m_xpr;           // target expression
+  XprType& m_xpr;   // target expression
  Index m_row;              // current row id
  Index m_col;              // current col id
  Index m_currentBlockRows; // current block height
--- a/Eigen/src/Core/CoreEvaluators.h
+++ b/Eigen/src/Core/CoreEvaluators.h
--- a/Eigen/src/Core/CoreIterators.h
+++ b/Eigen/src/Core/CoreIterators.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -15,113 +15,47 @@ namespace Eigen {
 /* This file contains the respective InnerIterator definition of the expressions defined in Eigen/Core
 */

-namespace internal {
-
-template<typename XprType, typename EvaluatorKind>
-class inner_iterator_selector;
-
-}
-
-/** \class InnerIterator
-  * \brief An InnerIterator allows to loop over the element of any matrix expression.
-  * 
-  * \warning To be used with care because an evaluator is constructed every time an InnerIterator iterator is constructed.
-  * 
-  * TODO: add a usage example
+/** \ingroup SparseCore_Module
+  * \class InnerIterator
+  * \brief An InnerIterator allows to loop over the element of a sparse (or dense) matrix or expression
+  *
+  * todo
  */
-template<typename XprType>
-class InnerIterator
+
+// generic version for dense matrix and expressions
+template<typename Derived> class DenseBase<Derived>::InnerIterator
 {
-protected:
-  typedef internal::inner_iterator_selector<XprType, typename internal::evaluator_traits<XprType>::Kind> IteratorType;
-  typedef typename internal::evaluator<XprType>::type EvaluatorType;
-  typedef typename internal::traits<XprType>::Scalar Scalar;
-public:
-  /** Construct an iterator over the \a outerId -th row or column of \a xpr */
-  InnerIterator(const XprType &xpr, const Index &outerId)
-    : m_eval(xpr), m_iter(m_eval, outerId, xpr.innerSize())
-  {}
-  
-  /// \returns the value of the current coefficient.
-  EIGEN_STRONG_INLINE Scalar value() const          { return m_iter.value(); }
-  /** Increment the iterator \c *this to the next non-zero coefficient.
-    * Explicit zeros are not skipped over. To skip explicit zeros, see class SparseView
-    */
-  EIGEN_STRONG_INLINE InnerIterator& operator++()   { m_iter.operator++(); return *this; }
-  /// \returns the column or row index of the current coefficient.
-  EIGEN_STRONG_INLINE Index index() const           { return m_iter.index(); }
-  /// \returns the row index of the current coefficient.
-  EIGEN_STRONG_INLINE Index row() const             { return m_iter.row(); }
-  /// \returns the column index of the current coefficient.
-  EIGEN_STRONG_INLINE Index col() const             { return m_iter.col(); }
-  /// \returns \c true if the iterator \c *this still references a valid coefficient.
-  EIGEN_STRONG_INLINE operator bool() const         { return m_iter; }
-  
-protected:
-  EvaluatorType m_eval;
-  IteratorType m_iter;
-private:
-  // If you get here, then you're not using the right InnerIterator type, e.g.:
-  //   SparseMatrix<double,RowMajor> A;
-  //   SparseMatrix<double>::InnerIterator it(A,0);
-  template<typename T> InnerIterator(const EigenBase<T>&,Index outer);
+  protected:
+    typedef typename Derived::Scalar Scalar;
+    typedef typename Derived::Index Index;
+
+    enum { IsRowMajor = (Derived::Flags&RowMajorBit)==RowMajorBit };
+  public:
+    EIGEN_STRONG_INLINE InnerIterator(const Derived& expr, Index outer)
+      : m_expression(expr), m_inner(0), m_outer(outer), m_end(expr.innerSize())
+    {}
+
+    EIGEN_STRONG_INLINE Scalar value() const
+    {
+      return (IsRowMajor) ? m_expression.coeff(m_outer, m_inner)
+                          : m_expression.coeff(m_inner, m_outer);
+    }
+
+    EIGEN_STRONG_INLINE InnerIterator& operator++() { m_inner++; return *this; }
+
+    EIGEN_STRONG_INLINE Index index() const { return m_inner; }
+    inline Index row() const { return IsRowMajor ? m_outer : index(); }
+    inline Index col() const { return IsRowMajor ? index() : m_outer; }
+
+    EIGEN_STRONG_INLINE operator bool() const { return m_inner < m_end && m_inner>=0; }
+
+  protected:
+    const Derived& m_expression;
+    Index m_inner;
+    const Index m_outer;
+    const Index m_end;
 };

-namespace internal {
-
-// Generic inner iterator implementation for dense objects
-template<typename XprType>
-class inner_iterator_selector<XprType, IndexBased>
-{
-protected:
-  typedef typename evaluator<XprType>::type EvaluatorType;
-  typedef typename traits<XprType>::Scalar Scalar;
-  enum { IsRowMajor = (XprType::Flags&RowMajorBit)==RowMajorBit };
-  
-public:
-  EIGEN_STRONG_INLINE inner_iterator_selector(const EvaluatorType &eval, const Index &outerId, const Index &innerSize)
-    : m_eval(eval), m_inner(0), m_outer(outerId), m_end(innerSize)
-  {}
-
-  EIGEN_STRONG_INLINE Scalar value() const
-  {
-    return (IsRowMajor) ? m_eval.coeff(m_outer, m_inner)
-                        : m_eval.coeff(m_inner, m_outer);
-  }
-
-  EIGEN_STRONG_INLINE inner_iterator_selector& operator++() { m_inner++; return *this; }
-
-  EIGEN_STRONG_INLINE Index index() const { return m_inner; }
-  inline Index row() const { return IsRowMajor ? m_outer : index(); }
-  inline Index col() const { return IsRowMajor ? index() : m_outer; }
-
-  EIGEN_STRONG_INLINE operator bool() const { return m_inner < m_end && m_inner>=0; }
-
-protected:
-  const EvaluatorType& m_eval;
-  Index m_inner;
-  const Index m_outer;
-  const Index m_end;
-};
-
-// For iterator-based evaluator, inner-iterator is already implemented as
-// evaluator<>::InnerIterator
-template<typename XprType>
-class inner_iterator_selector<XprType, IteratorBased>
- : public evaluator<XprType>::InnerIterator
-{
-protected:
-  typedef typename evaluator<XprType>::InnerIterator Base;
-  typedef typename evaluator<XprType>::type EvaluatorType;
-  
-public:
-  EIGEN_STRONG_INLINE inner_iterator_selector(const EvaluatorType &eval, const Index &outerId, const Index &/*innerSize*/)
-    : Base(eval, outerId)
-  {}  
-};
-
-} // end namespace internal
-
 } // end namespace Eigen

 #endif // EIGEN_COREITERATORS_H
--- a/Eigen/src/Core/CwiseBinaryOp.h
+++ b/Eigen/src/Core/CwiseBinaryOp.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
 // Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
 //
 // This Source Code Form is subject to the terms of the Mozilla
@@ -56,51 +56,73 @@ struct traits<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >
                       typename Rhs::Scalar
                     )
                   >::type Scalar;
-  typedef typename cwise_promote_storage_type<typename traits<Lhs>::StorageKind,
-                                              typename traits<Rhs>::StorageKind,
-                                              BinaryOp>::ret StorageKind;
-  typedef typename promote_index_type<typename traits<Lhs>::StorageIndex,
-                                      typename traits<Rhs>::StorageIndex>::type StorageIndex;
+  typedef typename promote_storage_type<typename traits<Lhs>::StorageKind,
+                                           typename traits<Rhs>::StorageKind>::ret StorageKind;
+  typedef typename promote_index_type<typename traits<Lhs>::Index,
+                                         typename traits<Rhs>::Index>::type Index;
  typedef typename Lhs::Nested LhsNested;
  typedef typename Rhs::Nested RhsNested;
  typedef typename remove_reference<LhsNested>::type _LhsNested;
  typedef typename remove_reference<RhsNested>::type _RhsNested;
  enum {
-    Flags = _LhsNested::Flags & RowMajorBit
+    LhsCoeffReadCost = _LhsNested::CoeffReadCost,
+    RhsCoeffReadCost = _RhsNested::CoeffReadCost,
+    LhsFlags = _LhsNested::Flags,
+    RhsFlags = _RhsNested::Flags,
+    SameType = is_same<typename _LhsNested::Scalar,typename _RhsNested::Scalar>::value,
+    StorageOrdersAgree = (int(Lhs::Flags)&RowMajorBit)==(int(Rhs::Flags)&RowMajorBit),
+    Flags0 = (int(LhsFlags) | int(RhsFlags)) & (
+        HereditaryBits
+      | (int(LhsFlags) & int(RhsFlags) &
+           ( AlignedBit
+           | (StorageOrdersAgree ? LinearAccessBit : 0)
+           | (functor_traits<BinaryOp>::PacketAccess && StorageOrdersAgree && SameType ? PacketAccessBit : 0)
+           )
+        )
+     ),
+    Flags = (Flags0 & ~RowMajorBit) | (LhsFlags & RowMajorBit),
+    Cost0 = EIGEN_ADD_COST(LhsCoeffReadCost,RhsCoeffReadCost),
+    CoeffReadCost = EIGEN_ADD_COST(Cost0,functor_traits<BinaryOp>::Cost)
  };
 };
 } // end namespace internal

+// we require Lhs and Rhs to have the same scalar type. Currently there is no example of a binary functor
+// that would take two operands of different types. If there were such an example, then this check should be
+// moved to the BinaryOp functors, on a per-case basis. This would however require a change in the BinaryOp functors, as
+// currently they take only one typename Scalar template parameter.
+// It is tempting to always allow mixing different types but remember that this is often impossible in the vectorized paths.
+// So allowing mixing different types gives very unexpected errors when enabling vectorization, when the user tries to
+// add together a float matrix and a double matrix.
+#define EIGEN_CHECK_BINARY_COMPATIBILIY(BINOP,LHS,RHS) \
+  EIGEN_STATIC_ASSERT((internal::functor_is_product_like<BINOP>::ret \
+                        ? int(internal::scalar_product_traits<LHS, RHS>::Defined) \
+                        : int(internal::is_same<LHS, RHS>::value)), \
+    YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
+
 template<typename BinaryOp, typename Lhs, typename Rhs, typename StorageKind>
 class CwiseBinaryOpImpl;

-template<typename BinaryOp, typename LhsType, typename RhsType>
-class CwiseBinaryOp : 
+template<typename BinaryOp, typename Lhs, typename Rhs>
+class CwiseBinaryOp : internal::no_assignment_operator,
  public CwiseBinaryOpImpl<
-          BinaryOp, LhsType, RhsType,
-          typename internal::cwise_promote_storage_type<typename internal::traits<LhsType>::StorageKind,
-                                                        typename internal::traits<RhsType>::StorageKind,
-                                                        BinaryOp>::ret>,
-  internal::no_assignment_operator
+          BinaryOp, Lhs, Rhs,
+          typename internal::promote_storage_type<typename internal::traits<Lhs>::StorageKind,
+                                           typename internal::traits<Rhs>::StorageKind>::ret>
 {
  public:
-    
-    typedef typename internal::remove_all<LhsType>::type Lhs;
-    typedef typename internal::remove_all<RhsType>::type Rhs;

    typedef typename CwiseBinaryOpImpl<
-        BinaryOp, LhsType, RhsType,
-        typename internal::cwise_promote_storage_type<typename internal::traits<LhsType>::StorageKind,
-                                                      typename internal::traits<Rhs>::StorageKind,
-                                                      BinaryOp>::ret>::Base Base;
+        BinaryOp, Lhs, Rhs,
+        typename internal::promote_storage_type<typename internal::traits<Lhs>::StorageKind,
+                                         typename internal::traits<Rhs>::StorageKind>::ret>::Base Base;
    EIGEN_GENERIC_PUBLIC_INTERFACE(CwiseBinaryOp)

-    typedef typename internal::nested<LhsType>::type LhsNested;
-    typedef typename internal::nested<RhsType>::type RhsNested;
+    typedef typename internal::nested<Lhs>::type LhsNested;
+    typedef typename internal::nested<Rhs>::type RhsNested;
    typedef typename internal::remove_reference<LhsNested>::type _LhsNested;
    typedef typename internal::remove_reference<RhsNested>::type _RhsNested;

-    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE CwiseBinaryOp(const Lhs& aLhs, const Rhs& aRhs, const BinaryOp& func = BinaryOp())
      : m_lhs(aLhs), m_rhs(aRhs), m_functor(func)
    {
@@ -110,7 +132,6 @@ class CwiseBinaryOp :
      eigen_assert(aLhs.rows() == aRhs.rows() && aLhs.cols() == aRhs.cols());
    }

-    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Index rows() const {
      // return the fixed size type if available to enable compile time optimizations
      if (internal::traits<typename internal::remove_all<LhsNested>::type>::RowsAtCompileTime==Dynamic)
@@ -118,7 +139,6 @@ class CwiseBinaryOp :
      else
        return m_lhs.rows();
    }
-    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Index cols() const {
      // return the fixed size type if available to enable compile time optimizations
      if (internal::traits<typename internal::remove_all<LhsNested>::type>::ColsAtCompileTime==Dynamic)
@@ -128,13 +148,10 @@ class CwiseBinaryOp :
    }

    /** \returns the left hand side nested expression */
-    EIGEN_DEVICE_FUNC
    const _LhsNested& lhs() const { return m_lhs; }
    /** \returns the right hand side nested expression */
-    EIGEN_DEVICE_FUNC
    const _RhsNested& rhs() const { return m_rhs; }
    /** \returns the functor representing the binary operation */
-    EIGEN_DEVICE_FUNC
    const BinaryOp& functor() const { return m_functor; }

  protected:
@@ -143,13 +160,41 @@ class CwiseBinaryOp :
    const BinaryOp m_functor;
 };

-// Generic API dispatcher
-template<typename BinaryOp, typename Lhs, typename Rhs, typename StorageKind>
-class CwiseBinaryOpImpl
-  : public internal::generic_xpr_base<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >::type
+template<typename BinaryOp, typename Lhs, typename Rhs>
+class CwiseBinaryOpImpl<BinaryOp, Lhs, Rhs, Dense>
+  : public internal::dense_xpr_base<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >::type
 {
-public:
-  typedef typename internal::generic_xpr_base<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >::type Base;
+    typedef CwiseBinaryOp<BinaryOp, Lhs, Rhs> Derived;
+  public:
+
+    typedef typename internal::dense_xpr_base<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >::type Base;
+    EIGEN_DENSE_PUBLIC_INTERFACE( Derived )
+
+    EIGEN_STRONG_INLINE const Scalar coeff(Index rowId, Index colId) const
+    {
+      return derived().functor()(derived().lhs().coeff(rowId, colId),
+                                 derived().rhs().coeff(rowId, colId));
+    }
+
+    template<int LoadMode>
+    EIGEN_STRONG_INLINE PacketScalar packet(Index rowId, Index colId) const
+    {
+      return derived().functor().packetOp(derived().lhs().template packet<LoadMode>(rowId, colId),
+                                          derived().rhs().template packet<LoadMode>(rowId, colId));
+    }
+
+    EIGEN_STRONG_INLINE const Scalar coeff(Index index) const
+    {
+      return derived().functor()(derived().lhs().coeff(index),
+                                 derived().rhs().coeff(index));
+    }
+
+    template<int LoadMode>
+    EIGEN_STRONG_INLINE PacketScalar packet(Index index) const
+    {
+      return derived().functor().packetOp(derived().lhs().template packet<LoadMode>(index),
+                                          derived().rhs().template packet<LoadMode>(index));
+    }
 };

 /** replaces \c *this by \c *this - \a other.
@@ -161,7 +206,8 @@ template<typename OtherDerived>
 EIGEN_STRONG_INLINE Derived &
 MatrixBase<Derived>::operator-=(const MatrixBase<OtherDerived> &other)
 {
-  call_assignment(derived(), other.derived(), internal::sub_assign_op<Scalar>());
+  SelfCwiseBinaryOp<internal::scalar_difference_op<Scalar>, Derived, OtherDerived> tmp(derived());
+  tmp = other.derived();
  return derived();
 }

@@ -174,11 +220,11 @@ template<typename OtherDerived>
 EIGEN_STRONG_INLINE Derived &
 MatrixBase<Derived>::operator+=(const MatrixBase<OtherDerived>& other)
 {
-  call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar>());
+  SelfCwiseBinaryOp<internal::scalar_sum_op<Scalar>, Derived, OtherDerived> tmp(derived());
+  tmp = other.derived();
  return derived();
 }

 } // end namespace Eigen

 #endif // EIGEN_CWISE_BINARY_OP_H
-
--- a/Eigen/src/Core/CwiseNullaryOp.h
+++ b/Eigen/src/Core/CwiseNullaryOp.h
@@ -35,20 +35,25 @@ template<typename NullaryOp, typename PlainObjectType>
 struct traits<CwiseNullaryOp<NullaryOp, PlainObjectType> > : traits<PlainObjectType>
 {
  enum {
-    Flags = traits<PlainObjectType>::Flags & RowMajorBit
+    Flags = (traits<PlainObjectType>::Flags
+      & (  HereditaryBits
+         | (functor_has_linear_access<NullaryOp>::ret ? LinearAccessBit : 0)
+         | (functor_traits<NullaryOp>::PacketAccess ? PacketAccessBit : 0)))
+      | (functor_traits<NullaryOp>::IsRepeatable ? 0 : EvalBeforeNestingBit),
+    CoeffReadCost = functor_traits<NullaryOp>::Cost
  };
 };
 }

 template<typename NullaryOp, typename PlainObjectType>
-class CwiseNullaryOp : public internal::dense_xpr_base< CwiseNullaryOp<NullaryOp, PlainObjectType> >::type, internal::no_assignment_operator
+class CwiseNullaryOp : internal::no_assignment_operator,
+  public internal::dense_xpr_base< CwiseNullaryOp<NullaryOp, PlainObjectType> >::type
 {
  public:

    typedef typename internal::dense_xpr_base<CwiseNullaryOp>::type Base;
    EIGEN_DENSE_PUBLIC_INTERFACE(CwiseNullaryOp)

-    EIGEN_DEVICE_FUNC
    CwiseNullaryOp(Index nbRows, Index nbCols, const NullaryOp& func = NullaryOp())
      : m_rows(nbRows), m_cols(nbCols), m_functor(func)
    {
@@ -58,12 +63,9 @@ class CwiseNullaryOp : public internal::dense_xpr_base< CwiseNullaryOp<NullaryOp
            && (ColsAtCompileTime == Dynamic || ColsAtCompileTime == nbCols));
    }

-    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Index rows() const { return m_rows.value(); }
-    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Index cols() const { return m_cols.value(); }

-    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE const Scalar coeff(Index rowId, Index colId) const
    {
      return m_functor(rowId, colId);
@@ -75,7 +77,6 @@ class CwiseNullaryOp : public internal::dense_xpr_base< CwiseNullaryOp<NullaryOp
      return m_functor.packetOp(rowId, colId);
    }

-    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE const Scalar coeff(Index index) const
    {
      return m_functor(index);
@@ -88,7 +89,6 @@ class CwiseNullaryOp : public internal::dense_xpr_base< CwiseNullaryOp<NullaryOp
    }

    /** \returns the functor representing the nullary operation */
-    EIGEN_DEVICE_FUNC
    const NullaryOp& functor() const { return m_functor; }

  protected:
@@ -132,9 +132,6 @@ DenseBase<Derived>::NullaryExpr(Index rows, Index cols, const CustomNullaryOp& f
  *
  * The template parameter \a CustomNullaryOp is the type of the functor.
  *
-  * Here is an example with C++11 random generators: \include random_cpp11.cpp
-  * Output: \verbinclude random_cpp11.out
-  * 
  * \sa class CwiseNullaryOp
  */
 template<typename Derived>
@@ -743,7 +740,6 @@ namespace internal {
 template<typename Derived, bool Big = (Derived::SizeAtCompileTime>=16)>
 struct setIdentity_impl
 {
-  EIGEN_DEVICE_FUNC
  static EIGEN_STRONG_INLINE Derived& run(Derived& m)
  {
    return m = Derived::Identity(m.rows(), m.cols());
@@ -753,7 +749,7 @@ struct setIdentity_impl
 template<typename Derived>
 struct setIdentity_impl<Derived, true>
 {
-  EIGEN_DEVICE_FUNC
+  typedef typename Derived::Index Index;
  static EIGEN_STRONG_INLINE Derived& run(Derived& m)
  {
    m.setZero();
--- a/Eigen/src/Core/CwiseUnaryOp.h
+++ b/Eigen/src/Core/CwiseUnaryOp.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
 // Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
 //
 // This Source Code Form is subject to the terms of the Mozilla
@@ -44,7 +44,10 @@ struct traits<CwiseUnaryOp<UnaryOp, XprType> >
  typedef typename XprType::Nested XprTypeNested;
  typedef typename remove_reference<XprTypeNested>::type _XprTypeNested;
  enum {
-    Flags = _XprTypeNested::Flags & RowMajorBit 
+    Flags = _XprTypeNested::Flags & (
+      HereditaryBits | LinearAccessBit | AlignedBit
+      | (functor_traits<UnaryOp>::PacketAccess ? PacketAccessBit : 0)),
+    CoeffReadCost = EIGEN_ADD_COST(_XprTypeNested::CoeffReadCost, functor_traits<UnaryOp>::Cost)
  };
 };
 }
@@ -53,34 +56,28 @@ template<typename UnaryOp, typename XprType, typename StorageKind>
 class CwiseUnaryOpImpl;

 template<typename UnaryOp, typename XprType>
-class CwiseUnaryOp : public CwiseUnaryOpImpl<UnaryOp, XprType, typename internal::traits<XprType>::StorageKind>, internal::no_assignment_operator
+class CwiseUnaryOp : internal::no_assignment_operator,
+  public CwiseUnaryOpImpl<UnaryOp, XprType, typename internal::traits<XprType>::StorageKind>
 {
  public:

    typedef typename CwiseUnaryOpImpl<UnaryOp, XprType,typename internal::traits<XprType>::StorageKind>::Base Base;
    EIGEN_GENERIC_PUBLIC_INTERFACE(CwiseUnaryOp)
-    typedef typename internal::remove_all<XprType>::type NestedExpression;

-    EIGEN_DEVICE_FUNC
-    explicit inline CwiseUnaryOp(const XprType& xpr, const UnaryOp& func = UnaryOp())
+    inline CwiseUnaryOp(const XprType& xpr, const UnaryOp& func = UnaryOp())
      : m_xpr(xpr), m_functor(func) {}

-    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Index rows() const { return m_xpr.rows(); }
-    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Index cols() const { return m_xpr.cols(); }

    /** \returns the functor representing the unary operation */
-    EIGEN_DEVICE_FUNC
    const UnaryOp& functor() const { return m_functor; }

    /** \returns the nested expression */
-    EIGEN_DEVICE_FUNC
    const typename internal::remove_all<typename XprType::Nested>::type&
    nestedExpression() const { return m_xpr; }

    /** \returns the nested expression */
-    EIGEN_DEVICE_FUNC
    typename internal::remove_all<typename XprType::Nested>::type&
    nestedExpression() { return m_xpr.const_cast_derived(); }

@@ -89,13 +86,39 @@ class CwiseUnaryOp : public CwiseUnaryOpImpl<UnaryOp, XprType, typename internal
    const UnaryOp m_functor;
 };

-// Generic API dispatcher
-template<typename UnaryOp, typename XprType, typename StorageKind>
-class CwiseUnaryOpImpl
-  : public internal::generic_xpr_base<CwiseUnaryOp<UnaryOp, XprType> >::type
+// This is the generic implementation for dense storage.
+// It can be used for any expression types implementing the dense concept.
+template<typename UnaryOp, typename XprType>
+class CwiseUnaryOpImpl<UnaryOp,XprType,Dense>
+  : public internal::dense_xpr_base<CwiseUnaryOp<UnaryOp, XprType> >::type
 {
-public:
-  typedef typename internal::generic_xpr_base<CwiseUnaryOp<UnaryOp, XprType> >::type Base;
+  public:
+
+    typedef CwiseUnaryOp<UnaryOp, XprType> Derived;
+    typedef typename internal::dense_xpr_base<CwiseUnaryOp<UnaryOp, XprType> >::type Base;
+    EIGEN_DENSE_PUBLIC_INTERFACE(Derived)
+
+    EIGEN_STRONG_INLINE const Scalar coeff(Index rowId, Index colId) const
+    {
+      return derived().functor()(derived().nestedExpression().coeff(rowId, colId));
+    }
+
+    template<int LoadMode>
+    EIGEN_STRONG_INLINE PacketScalar packet(Index rowId, Index colId) const
+    {
+      return derived().functor().packetOp(derived().nestedExpression().template packet<LoadMode>(rowId, colId));
+    }
+
+    EIGEN_STRONG_INLINE const Scalar coeff(Index index) const
+    {
+      return derived().functor()(derived().nestedExpression().coeff(index));
+    }
+
+    template<int LoadMode>
+    EIGEN_STRONG_INLINE PacketScalar packet(Index index) const
+    {
+      return derived().functor().packetOp(derived().nestedExpression().template packet<LoadMode>(index));
+    }
 };

 } // end namespace Eigen
--- a/Eigen/src/Core/CwiseUnaryView.h
+++ b/Eigen/src/Core/CwiseUnaryView.h
@@ -37,8 +37,8 @@ struct traits<CwiseUnaryView<ViewOp, MatrixType> >
  typedef typename MatrixType::Nested MatrixTypeNested;
  typedef typename remove_all<MatrixTypeNested>::type _MatrixTypeNested;
  enum {
-    FlagsLvalueBit = is_lvalue<MatrixType>::value ? LvalueBit : 0,
-    Flags = traits<_MatrixTypeNested>::Flags & (RowMajorBit | FlagsLvalueBit | DirectAccessBit), // FIXME DirectAccessBit should not be handled by expressions
+    Flags = (traits<_MatrixTypeNested>::Flags & (HereditaryBits | LvalueBit | LinearAccessBit | DirectAccessBit)),
+    CoeffReadCost = EIGEN_ADD_COST(traits<_MatrixTypeNested>::CoeffReadCost, functor_traits<ViewOp>::Cost),
    MatrixTypeInnerStride =  inner_stride_at_compile_time<MatrixType>::ret,
    // need to cast the sizeof's from size_t to int explicitly, otherwise:
    // "error: no integral type can represent all of the enumerator values
@@ -62,9 +62,8 @@ class CwiseUnaryView : public CwiseUnaryViewImpl<ViewOp, MatrixType, typename in

    typedef typename CwiseUnaryViewImpl<ViewOp, MatrixType,typename internal::traits<MatrixType>::StorageKind>::Base Base;
    EIGEN_GENERIC_PUBLIC_INTERFACE(CwiseUnaryView)
-    typedef typename internal::remove_all<MatrixType>::type NestedExpression;

-    explicit inline CwiseUnaryView(MatrixType& mat, const ViewOp& func = ViewOp())
+    inline CwiseUnaryView(const MatrixType& mat, const ViewOp& func = ViewOp())
      : m_matrix(mat), m_functor(func) {}

    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(CwiseUnaryView)
@@ -89,15 +88,6 @@ class CwiseUnaryView : public CwiseUnaryViewImpl<ViewOp, MatrixType, typename in
    ViewOp m_functor;
 };

-// Generic API dispatcher
-template<typename ViewOp, typename XprType, typename StorageKind>
-class CwiseUnaryViewImpl
-  : public internal::generic_xpr_base<CwiseUnaryView<ViewOp, XprType> >::type
-{
-public:
-  typedef typename internal::generic_xpr_base<CwiseUnaryView<ViewOp, XprType> >::type Base;
-};
-
 template<typename ViewOp, typename MatrixType>
 class CwiseUnaryViewImpl<ViewOp,MatrixType,Dense>
  : public internal::dense_xpr_base< CwiseUnaryView<ViewOp, MatrixType> >::type
@@ -110,18 +100,38 @@ class CwiseUnaryViewImpl<ViewOp,MatrixType,Dense>
    EIGEN_DENSE_PUBLIC_INTERFACE(Derived)
    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(CwiseUnaryViewImpl)
    
-    EIGEN_DEVICE_FUNC inline Scalar* data() { return &(this->coeffRef(0)); }
-    EIGEN_DEVICE_FUNC inline const Scalar* data() const { return &(this->coeff(0)); }
+    inline Scalar* data() { return &coeffRef(0); }
+    inline const Scalar* data() const { return &coeff(0); }

-    EIGEN_DEVICE_FUNC inline Index innerStride() const
+    inline Index innerStride() const
    {
      return derived().nestedExpression().innerStride() * sizeof(typename internal::traits<MatrixType>::Scalar) / sizeof(Scalar);
    }

-    EIGEN_DEVICE_FUNC inline Index outerStride() const
+    inline Index outerStride() const
    {
      return derived().nestedExpression().outerStride() * sizeof(typename internal::traits<MatrixType>::Scalar) / sizeof(Scalar);
    }
+
+    EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const
+    {
+      return derived().functor()(derived().nestedExpression().coeff(row, col));
+    }
+
+    EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+    {
+      return derived().functor()(derived().nestedExpression().coeff(index));
+    }
+
+    EIGEN_STRONG_INLINE Scalar& coeffRef(Index row, Index col)
+    {
+      return derived().functor()(const_cast_derived().nestedExpression().coeffRef(row, col));
+    }
+
+    EIGEN_STRONG_INLINE Scalar& coeffRef(Index index)
+    {
+      return derived().functor()(const_cast_derived().nestedExpression().coeffRef(index));
+    }
 };

 } // end namespace Eigen
--- a/Eigen/src/Core/DenseBase.h
+++ b/Eigen/src/Core/DenseBase.h
@@ -40,38 +40,31 @@ static inline void check_DenseIndex_is_signed() {
  */
 template<typename Derived> class DenseBase
 #ifndef EIGEN_PARSED_BY_DOXYGEN
-  : public internal::special_scalar_op_base<Derived,typename internal::traits<Derived>::Scalar,
-                                     typename NumTraits<typename internal::traits<Derived>::Scalar>::Real>
+  : public internal::special_scalar_op_base<Derived, typename internal::traits<Derived>::Scalar,
+                                            typename NumTraits<typename internal::traits<Derived>::Scalar>::Real,
+                                            DenseCoeffsBase<Derived> >
 #else
  : public DenseCoeffsBase<Derived>
 #endif // not EIGEN_PARSED_BY_DOXYGEN
 {
  public:
-    using internal::special_scalar_op_base<Derived,typename internal::traits<Derived>::Scalar,
-                typename NumTraits<typename internal::traits<Derived>::Scalar>::Real>::operator*;

-
-    /** Inner iterator type to iterate over the coefficients of a row or column.
-      * \sa class InnerIterator
-      */
-    typedef Eigen::InnerIterator<Derived> InnerIterator;
+    class InnerIterator;

    typedef typename internal::traits<Derived>::StorageKind StorageKind;

-    /**
-      * \brief The type used to store indices
-      * \details This typedef is relevant for types that store multiple indices such as
-      *          PermutationMatrix or Transpositions, otherwise it defaults to Eigen::Index
-      * \sa \ref TopicPreprocessorDirectives, Eigen::Index, SparseMatrixBase.
-     */
-    typedef typename internal::traits<Derived>::StorageIndex StorageIndex;
+    /** \brief The type of indices 
+      * \details To change this, \c \#define the preprocessor symbol \c EIGEN_DEFAULT_DENSE_INDEX_TYPE.
+      * \sa \ref TopicPreprocessorDirectives.
+      */
+    typedef typename internal::traits<Derived>::Index Index; 

    typedef typename internal::traits<Derived>::Scalar Scalar;
    typedef typename internal::packet_traits<Scalar>::type PacketScalar;
    typedef typename NumTraits<Scalar>::Real RealScalar;
+    typedef internal::special_scalar_op_base<Derived,Scalar,RealScalar, DenseCoeffsBase<Derived> > Base;

-    typedef internal::special_scalar_op_base<Derived,typename internal::traits<Derived>::Scalar,
-                      typename NumTraits<typename internal::traits<Derived>::Scalar>::Real> Base;
+    using Base::operator*;
    using Base::derived;
    using Base::const_cast_derived;
    using Base::rows;
@@ -81,6 +74,16 @@ template<typename Derived> class DenseBase
    using Base::colIndexByOuterInner;
    using Base::coeff;
    using Base::coeffByOuterInner;
+    using Base::packet;
+    using Base::packetByOuterInner;
+    using Base::writePacket;
+    using Base::writePacketByOuterInner;
+    using Base::coeffRef;
+    using Base::coeffRefByOuterInner;
+    using Base::copyCoeff;
+    using Base::copyCoeffByOuterInner;
+    using Base::copyPacket;
+    using Base::copyPacketByOuterInner;
    using Base::operator();
    using Base::operator[];
    using Base::x;
@@ -166,27 +169,26 @@ template<typename Derived> class DenseBase
      InnerSizeAtCompileTime = int(IsVectorAtCompileTime) ? int(SizeAtCompileTime)
                             : int(IsRowMajor) ? int(ColsAtCompileTime) : int(RowsAtCompileTime),

+      CoeffReadCost = internal::traits<Derived>::CoeffReadCost,
+        /**< This is a rough measure of how expensive it is to read one coefficient from
+          * this expression.
+          */
+
      InnerStrideAtCompileTime = internal::inner_stride_at_compile_time<Derived>::ret,
      OuterStrideAtCompileTime = internal::outer_stride_at_compile_time<Derived>::ret
    };

-    enum { IsPlainObjectBase = 0 };
+    enum { ThisConstantIsPrivateInPlainObjectBase };

    /** \returns the number of nonzero coefficients which is in practice the number
      * of stored coefficients. */
-    EIGEN_DEVICE_FUNC
    inline Index nonZeros() const { return size(); }
-    /** \returns true if either the number of rows or the number of columns is equal to 1.
-      * In other words, this function returns
-      * \code rows()==1 || cols()==1 \endcode
-      * \sa rows(), cols(), IsVectorAtCompileTime. */

    /** \returns the outer size.
      *
      * \note For a vector, this returns just 1. For a matrix (non-vector), this is the major dimension
      * with respect to the \ref TopicStorageOrders "storage order", i.e., the number of columns for a
      * column-major matrix, and the number of rows for a row-major matrix. */
-    EIGEN_DEVICE_FUNC
    Index outerSize() const
    {
      return IsVectorAtCompileTime ? 1
@@ -198,7 +200,6 @@ template<typename Derived> class DenseBase
      * \note For a vector, this is just the size. For a matrix (non-vector), this is the minor dimension
      * with respect to the \ref TopicStorageOrders "storage order", i.e., the number of rows for a 
      * column-major matrix, and the number of columns for a row-major matrix. */
-    EIGEN_DEVICE_FUNC
    Index innerSize() const
    {
      return IsVectorAtCompileTime ? this->size()
@@ -209,7 +210,6 @@ template<typename Derived> class DenseBase
      * Matrix::resize() and Array::resize(). The present method only asserts that the new size equals the old size, and does
      * nothing else.
      */
-    EIGEN_DEVICE_FUNC
    void resize(Index newSize)
    {
      EIGEN_ONLY_USED_FOR_DEBUG(newSize);
@@ -220,7 +220,6 @@ template<typename Derived> class DenseBase
      * Matrix::resize() and Array::resize(). The present method only asserts that the new size equals the old size, and does
      * nothing else.
      */
-    EIGEN_DEVICE_FUNC
    void resize(Index nbRows, Index nbCols)
    {
      EIGEN_ONLY_USED_FOR_DEBUG(nbRows);
@@ -244,123 +243,112 @@ template<typename Derived> class DenseBase

    /** Copies \a other into *this. \returns a reference to *this. */
    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
    Derived& operator=(const DenseBase<OtherDerived>& other);

    /** Special case of the template operator=, in order to prevent the compiler
      * from generating a default operator= (issue hit with g++ 4.1)
      */
-    EIGEN_DEVICE_FUNC
    Derived& operator=(const DenseBase& other);

    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
    Derived& operator=(const EigenBase<OtherDerived> &other);

    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
    Derived& operator+=(const EigenBase<OtherDerived> &other);

    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
    Derived& operator-=(const EigenBase<OtherDerived> &other);

    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
    Derived& operator=(const ReturnByValue<OtherDerived>& func);

-#ifndef EIGEN_PARSED_BY_DOXYGEN
-    /** Copies \a other into *this without evaluating other. \returns a reference to *this.
-      * \deprecated */
+    /** \internal Copies \a other into *this without evaluating other. \returns a reference to *this. */
    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
    Derived& lazyAssign(const DenseBase<OtherDerived>& other);
-#endif // not EIGEN_PARSED_BY_DOXYGEN

-    EIGEN_DEVICE_FUNC
+    /** \internal Evaluates \a other into *this. \returns a reference to *this. */
+    template<typename OtherDerived>
+    Derived& lazyAssign(const ReturnByValue<OtherDerived>& other);
+
    CommaInitializer<Derived> operator<< (const Scalar& s);

-    // TODO flagged is temporarly disabled. It seems useless now
    template<unsigned int Added,unsigned int Removed>
-    EIGEN_DEPRECATED
-    const Derived& flagged() const
-    { return derived(); }
+    const Flagged<Derived, Added, Removed> flagged() const;

    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
    CommaInitializer<Derived> operator<< (const DenseBase<OtherDerived>& other);

-    typedef Transpose<Derived> TransposeReturnType;
-    EIGEN_DEVICE_FUNC
-    TransposeReturnType transpose();
-    typedef typename internal::add_const<Transpose<const Derived> >::type ConstTransposeReturnType;
-    EIGEN_DEVICE_FUNC
+    Eigen::Transpose<Derived> transpose();
+	typedef typename internal::add_const<Transpose<const Derived> >::type ConstTransposeReturnType;
    ConstTransposeReturnType transpose() const;
-    EIGEN_DEVICE_FUNC
    void transposeInPlace();
+#ifndef EIGEN_NO_DEBUG
+  protected:
+    template<typename OtherDerived>
+    void checkTransposeAliasing(const OtherDerived& other) const;
+  public:
+#endif

-    EIGEN_DEVICE_FUNC static const ConstantReturnType
+
+    static const ConstantReturnType
    Constant(Index rows, Index cols, const Scalar& value);
-    EIGEN_DEVICE_FUNC static const ConstantReturnType
+    static const ConstantReturnType
    Constant(Index size, const Scalar& value);
-    EIGEN_DEVICE_FUNC static const ConstantReturnType
+    static const ConstantReturnType
    Constant(const Scalar& value);

-    EIGEN_DEVICE_FUNC static const SequentialLinSpacedReturnType
+    static const SequentialLinSpacedReturnType
    LinSpaced(Sequential_t, Index size, const Scalar& low, const Scalar& high);
-    EIGEN_DEVICE_FUNC static const RandomAccessLinSpacedReturnType
+    static const RandomAccessLinSpacedReturnType
    LinSpaced(Index size, const Scalar& low, const Scalar& high);
-    EIGEN_DEVICE_FUNC static const SequentialLinSpacedReturnType
+    static const SequentialLinSpacedReturnType
    LinSpaced(Sequential_t, const Scalar& low, const Scalar& high);
-    EIGEN_DEVICE_FUNC static const RandomAccessLinSpacedReturnType
+    static const RandomAccessLinSpacedReturnType
    LinSpaced(const Scalar& low, const Scalar& high);

-    template<typename CustomNullaryOp> EIGEN_DEVICE_FUNC
+    template<typename CustomNullaryOp>
    static const CwiseNullaryOp<CustomNullaryOp, Derived>
    NullaryExpr(Index rows, Index cols, const CustomNullaryOp& func);
-    template<typename CustomNullaryOp> EIGEN_DEVICE_FUNC
+    template<typename CustomNullaryOp>
    static const CwiseNullaryOp<CustomNullaryOp, Derived>
    NullaryExpr(Index size, const CustomNullaryOp& func);
-    template<typename CustomNullaryOp> EIGEN_DEVICE_FUNC
+    template<typename CustomNullaryOp>
    static const CwiseNullaryOp<CustomNullaryOp, Derived>
    NullaryExpr(const CustomNullaryOp& func);

-    EIGEN_DEVICE_FUNC static const ConstantReturnType Zero(Index rows, Index cols);
-    EIGEN_DEVICE_FUNC static const ConstantReturnType Zero(Index size);
-    EIGEN_DEVICE_FUNC static const ConstantReturnType Zero();
-    EIGEN_DEVICE_FUNC static const ConstantReturnType Ones(Index rows, Index cols);
-    EIGEN_DEVICE_FUNC static const ConstantReturnType Ones(Index size);
-    EIGEN_DEVICE_FUNC static const ConstantReturnType Ones();
+    static const ConstantReturnType Zero(Index rows, Index cols);
+    static const ConstantReturnType Zero(Index size);
+    static const ConstantReturnType Zero();
+    static const ConstantReturnType Ones(Index rows, Index cols);
+    static const ConstantReturnType Ones(Index size);
+    static const ConstantReturnType Ones();

-    EIGEN_DEVICE_FUNC void fill(const Scalar& value);
-    EIGEN_DEVICE_FUNC Derived& setConstant(const Scalar& value);
-    EIGEN_DEVICE_FUNC Derived& setLinSpaced(Index size, const Scalar& low, const Scalar& high);
-    EIGEN_DEVICE_FUNC Derived& setLinSpaced(const Scalar& low, const Scalar& high);
-    EIGEN_DEVICE_FUNC Derived& setZero();
-    EIGEN_DEVICE_FUNC Derived& setOnes();
-    EIGEN_DEVICE_FUNC Derived& setRandom();
+    void fill(const Scalar& value);
+    Derived& setConstant(const Scalar& value);
+    Derived& setLinSpaced(Index size, const Scalar& low, const Scalar& high);
+    Derived& setLinSpaced(const Scalar& low, const Scalar& high);
+    Derived& setZero();
+    Derived& setOnes();
+    Derived& setRandom();

-    template<typename OtherDerived> EIGEN_DEVICE_FUNC
+    template<typename OtherDerived>
    bool isApprox(const DenseBase<OtherDerived>& other,
                  const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
-    EIGEN_DEVICE_FUNC 
    bool isMuchSmallerThan(const RealScalar& other,
                           const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
-    template<typename OtherDerived> EIGEN_DEVICE_FUNC
+    template<typename OtherDerived>
    bool isMuchSmallerThan(const DenseBase<OtherDerived>& other,
                           const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;

-    EIGEN_DEVICE_FUNC bool isApproxToConstant(const Scalar& value, const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
-    EIGEN_DEVICE_FUNC bool isConstant(const Scalar& value, const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
-    EIGEN_DEVICE_FUNC bool isZero(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
-    EIGEN_DEVICE_FUNC bool isOnes(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
+    bool isApproxToConstant(const Scalar& value, const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
+    bool isConstant(const Scalar& value, const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
+    bool isZero(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
+    bool isOnes(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
    
    inline bool hasNaN() const;
    inline bool allFinite() const;

-    EIGEN_DEVICE_FUNC
    inline Derived& operator*=(const Scalar& other);
-    EIGEN_DEVICE_FUNC
    inline Derived& operator/=(const Scalar& other);

    typedef typename internal::add_const_on_value_type<typename internal::eval<Derived>::type>::type EvalReturnType;
@@ -369,7 +357,6 @@ template<typename Derived> class DenseBase
      * Notice that in the case of a plain matrix or vector (not an expression) this function just returns
      * a const reference, in order to avoid a useless copy.
      */
-    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE EvalReturnType eval() const
    {
      // Even though MSVC does not honor strong inlining when the return type
@@ -377,69 +364,61 @@ template<typename Derived> class DenseBase
      // size types on MSVC.
      return typename internal::eval<Derived>::type(derived());
    }
-    
+
    /** swaps *this with the expression \a other.
      *
      */
    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
-    void swap(const DenseBase<OtherDerived>& other)
+    void swap(const DenseBase<OtherDerived>& other,
+              int = OtherDerived::ThisConstantIsPrivateInPlainObjectBase)
    {
-      EIGEN_STATIC_ASSERT(!OtherDerived::IsPlainObjectBase,THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY);
-      eigen_assert(rows()==other.rows() && cols()==other.cols());
-      call_assignment(derived(), other.const_cast_derived(), internal::swap_assign_op<Scalar>());
+      SwapWrapper<Derived>(derived()).lazyAssign(other.derived());
    }

    /** swaps *this with the matrix or array \a other.
      *
      */
    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
    void swap(PlainObjectBase<OtherDerived>& other)
    {
-      eigen_assert(rows()==other.rows() && cols()==other.cols());
-      call_assignment(derived(), other.derived(), internal::swap_assign_op<Scalar>());
+      SwapWrapper<Derived>(derived()).lazyAssign(other.derived());
    }

-    EIGEN_DEVICE_FUNC inline const NestByValue<Derived> nestByValue() const;
-    EIGEN_DEVICE_FUNC inline const ForceAlignedAccess<Derived> forceAlignedAccess() const;
-    EIGEN_DEVICE_FUNC inline ForceAlignedAccess<Derived> forceAlignedAccess();
-    template<bool Enable> EIGEN_DEVICE_FUNC
-    inline const typename internal::conditional<Enable,ForceAlignedAccess<Derived>,Derived&>::type forceAlignedAccessIf() const;
-    template<bool Enable> EIGEN_DEVICE_FUNC
-    inline typename internal::conditional<Enable,ForceAlignedAccess<Derived>,Derived&>::type forceAlignedAccessIf();

-    EIGEN_DEVICE_FUNC Scalar sum() const;
-    EIGEN_DEVICE_FUNC Scalar mean() const;
-    EIGEN_DEVICE_FUNC Scalar trace() const;
+    inline const NestByValue<Derived> nestByValue() const;
+    inline const ForceAlignedAccess<Derived> forceAlignedAccess() const;
+    inline ForceAlignedAccess<Derived> forceAlignedAccess();
+    template<bool Enable> inline const typename internal::conditional<Enable,ForceAlignedAccess<Derived>,Derived&>::type forceAlignedAccessIf() const;
+    template<bool Enable> inline typename internal::conditional<Enable,ForceAlignedAccess<Derived>,Derived&>::type forceAlignedAccessIf();

-    EIGEN_DEVICE_FUNC Scalar prod() const;
+    Scalar sum() const;
+    Scalar mean() const;
+    Scalar trace() const;

-    EIGEN_DEVICE_FUNC typename internal::traits<Derived>::Scalar minCoeff() const;
-    EIGEN_DEVICE_FUNC typename internal::traits<Derived>::Scalar maxCoeff() const;
+    Scalar prod() const;

-    template<typename IndexType> EIGEN_DEVICE_FUNC
+    typename internal::traits<Derived>::Scalar minCoeff() const;
+    typename internal::traits<Derived>::Scalar maxCoeff() const;
+
+    template<typename IndexType>
    typename internal::traits<Derived>::Scalar minCoeff(IndexType* row, IndexType* col) const;
-    template<typename IndexType> EIGEN_DEVICE_FUNC
+    template<typename IndexType>
    typename internal::traits<Derived>::Scalar maxCoeff(IndexType* row, IndexType* col) const;
-    template<typename IndexType> EIGEN_DEVICE_FUNC
+    template<typename IndexType>
    typename internal::traits<Derived>::Scalar minCoeff(IndexType* index) const;
-    template<typename IndexType> EIGEN_DEVICE_FUNC
+    template<typename IndexType>
    typename internal::traits<Derived>::Scalar maxCoeff(IndexType* index) const;

    template<typename BinaryOp>
-    EIGEN_DEVICE_FUNC
    typename internal::result_of<BinaryOp(typename internal::traits<Derived>::Scalar)>::type
    redux(const BinaryOp& func) const;

    template<typename Visitor>
-    EIGEN_DEVICE_FUNC
    void visit(Visitor& func) const;

    inline const WithFormat<Derived> format(const IOFormat& fmt) const;

    /** \returns the unique coefficient of a 1x1 expression */
-    EIGEN_DEVICE_FUNC
    CoeffReturnType value() const
    {
      EIGEN_STATIC_ASSERT_SIZE_1x1(Derived)
@@ -447,8 +426,8 @@ template<typename Derived> class DenseBase
      return derived().coeff(0,0);
    }

-    bool all() const;
-    bool any() const;
+    bool all(void) const;
+    bool any(void) const;
    Index count() const;

    typedef VectorwiseOp<Derived, Horizontal> RowwiseReturnType;
@@ -481,8 +460,10 @@ template<typename Derived> class DenseBase
    template<int p> RealScalar lpNorm() const;

    template<int RowFactor, int ColFactor>
-    const Replicate<Derived,RowFactor,ColFactor> replicate() const;
-    const Replicate<Derived,Dynamic,Dynamic> replicate(Index rowFacor,Index colFactor) const;
+    inline const Replicate<Derived,RowFactor,ColFactor> replicate() const;
+    
+    typedef Replicate<Derived,Dynamic,Dynamic> ReplicateReturnType;
+    inline const ReplicateReturnType replicate(Index rowFacor,Index colFactor) const;

    typedef Reverse<Derived, BothDirections> ReverseReturnType;
    typedef const Reverse<const Derived, BothDirections> ConstReverseReturnType;
@@ -497,18 +478,27 @@ template<typename Derived> class DenseBase
 #   endif
 #undef EIGEN_CURRENT_STORAGE_BASE_CLASS

+#ifdef EIGEN2_SUPPORT
+
+    Block<Derived> corner(CornerType type, Index cRows, Index cCols);
+    const Block<Derived> corner(CornerType type, Index cRows, Index cCols) const;
+    template<int CRows, int CCols>
+    Block<Derived, CRows, CCols> corner(CornerType type);
+    template<int CRows, int CCols>
+    const Block<Derived, CRows, CCols> corner(CornerType type) const;
+
+#endif // EIGEN2_SUPPORT
+

    // disable the use of evalTo for dense objects with a nice compilation error
-    template<typename Dest>
-    EIGEN_DEVICE_FUNC
-    inline void evalTo(Dest& ) const
+    template<typename Dest> inline void evalTo(Dest& ) const
    {
      EIGEN_STATIC_ASSERT((internal::is_same<Dest,void>::value),THE_EVAL_EVALTO_FUNCTION_SHOULD_NEVER_BE_CALLED_FOR_DENSE_OBJECTS);
    }

  protected:
    /** Default constructor. Do nothing. */
-    EIGEN_DEVICE_FUNC DenseBase()
+    DenseBase()
    {
      /* Just checks for self-consistency of the flags.
       * Only do it when debugging Eigen, as this borders on paranoiac and could slow compilation down
@@ -521,9 +511,9 @@ template<typename Derived> class DenseBase
    }

  private:
-    EIGEN_DEVICE_FUNC explicit DenseBase(int);
-    EIGEN_DEVICE_FUNC DenseBase(int,int);
-    template<typename OtherDerived> EIGEN_DEVICE_FUNC explicit DenseBase(const DenseBase<OtherDerived>&);
+    explicit DenseBase(int);
+    DenseBase(int,int);
+    template<typename OtherDerived> explicit DenseBase(const DenseBase<OtherDerived>&);
 };

 } // end namespace Eigen
--- a/Eigen/src/Core/DenseCoeffsBase.h
+++ b/Eigen/src/Core/DenseCoeffsBase.h
@@ -35,6 +35,7 @@ class DenseCoeffsBase<Derived,ReadOnlyAccessors> : public EigenBase<Derived>
 {
  public:
    typedef typename internal::traits<Derived>::StorageKind StorageKind;
+    typedef typename internal::traits<Derived>::Index Index;
    typedef typename internal::traits<Derived>::Scalar Scalar;
    typedef typename internal::packet_traits<Scalar>::type PacketScalar;

@@ -60,7 +61,6 @@ class DenseCoeffsBase<Derived,ReadOnlyAccessors> : public EigenBase<Derived>
    using Base::size;
    using Base::derived;

-    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Index rowIndexByOuterInner(Index outer, Index inner) const
    {
      return int(Derived::RowsAtCompileTime) == 1 ? 0
@@ -69,7 +69,6 @@ class DenseCoeffsBase<Derived,ReadOnlyAccessors> : public EigenBase<Derived>
          : inner;
    }

-    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Index colIndexByOuterInner(Index outer, Index inner) const
    {
      return int(Derived::ColsAtCompileTime) == 1 ? 0
@@ -92,15 +91,13 @@ class DenseCoeffsBase<Derived,ReadOnlyAccessors> : public EigenBase<Derived>
      *
      * \sa operator()(Index,Index) const, coeffRef(Index,Index), coeff(Index) const
      */
-    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const
    {
      eigen_internal_assert(row >= 0 && row < rows()
-                         && col >= 0 && col < cols());
-      return typename internal::evaluator<Derived>::type(derived()).coeff(row,col);
+                        && col >= 0 && col < cols());
+      return derived().coeff(row, col);
    }

-    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE CoeffReturnType coeffByOuterInner(Index outer, Index inner) const
    {
      return coeff(rowIndexByOuterInner(outer, inner),
@@ -111,12 +108,11 @@ class DenseCoeffsBase<Derived,ReadOnlyAccessors> : public EigenBase<Derived>
      *
      * \sa operator()(Index,Index), operator[](Index)
      */
-    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE CoeffReturnType operator()(Index row, Index col) const
    {
      eigen_assert(row >= 0 && row < rows()
          && col >= 0 && col < cols());
-      return coeff(row, col);
+      return derived().coeff(row, col);
    }

    /** Short version: don't use this function, use
@@ -134,12 +130,11 @@ class DenseCoeffsBase<Derived,ReadOnlyAccessors> : public EigenBase<Derived>
      * \sa operator[](Index) const, coeffRef(Index), coeff(Index,Index) const
      */

-    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE CoeffReturnType
    coeff(Index index) const
    {
      eigen_internal_assert(index >= 0 && index < size());
-      return typename internal::evaluator<Derived>::type(derived()).coeff(index);
+      return derived().coeff(index);
    }


@@ -151,14 +146,15 @@ class DenseCoeffsBase<Derived,ReadOnlyAccessors> : public EigenBase<Derived>
      * z() const, w() const
      */

-    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE CoeffReturnType
    operator[](Index index) const
    {
+      #ifndef EIGEN2_SUPPORT
      EIGEN_STATIC_ASSERT(Derived::IsVectorAtCompileTime,
                          THE_BRACKET_OPERATOR_IS_ONLY_FOR_VECTORS__USE_THE_PARENTHESIS_OPERATOR_INSTEAD)
+      #endif
      eigen_assert(index >= 0 && index < size());
-      return coeff(index);
+      return derived().coeff(index);
    }

    /** \returns the coefficient at given index.
@@ -171,35 +167,30 @@ class DenseCoeffsBase<Derived,ReadOnlyAccessors> : public EigenBase<Derived>
      * z() const, w() const
      */

-    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE CoeffReturnType
    operator()(Index index) const
    {
      eigen_assert(index >= 0 && index < size());
-      return coeff(index);
+      return derived().coeff(index);
    }

    /** equivalent to operator[](0).  */

-    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE CoeffReturnType
    x() const { return (*this)[0]; }

    /** equivalent to operator[](1).  */

-    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE CoeffReturnType
    y() const { return (*this)[1]; }

    /** equivalent to operator[](2).  */

-    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE CoeffReturnType
    z() const { return (*this)[2]; }

    /** equivalent to operator[](3).  */

-    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE CoeffReturnType
    w() const { return (*this)[3]; }

@@ -216,8 +207,9 @@ class DenseCoeffsBase<Derived,ReadOnlyAccessors> : public EigenBase<Derived>
    template<int LoadMode>
    EIGEN_STRONG_INLINE PacketReturnType packet(Index row, Index col) const
    {
-      eigen_internal_assert(row >= 0 && row < rows() && col >= 0 && col < cols());
-      return typename internal::evaluator<Derived>::type(derived()).template packet<LoadMode>(row,col);
+      eigen_internal_assert(row >= 0 && row < rows()
+                      && col >= 0 && col < cols());
+      return derived().template packet<LoadMode>(row,col);
    }


@@ -243,7 +235,7 @@ class DenseCoeffsBase<Derived,ReadOnlyAccessors> : public EigenBase<Derived>
    EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
    {
      eigen_internal_assert(index >= 0 && index < size());
-      return typename internal::evaluator<Derived>::type(derived()).template packet<LoadMode>(index);
+      return derived().template packet<LoadMode>(index);
    }

  protected:
@@ -286,6 +278,7 @@ class DenseCoeffsBase<Derived, WriteAccessors> : public DenseCoeffsBase<Derived,
    typedef DenseCoeffsBase<Derived, ReadOnlyAccessors> Base;

    typedef typename internal::traits<Derived>::StorageKind StorageKind;
+    typedef typename internal::traits<Derived>::Index Index;
    typedef typename internal::traits<Derived>::Scalar Scalar;
    typedef typename internal::packet_traits<Scalar>::type PacketScalar;
    typedef typename NumTraits<Scalar>::Real RealScalar;
@@ -318,15 +311,13 @@ class DenseCoeffsBase<Derived, WriteAccessors> : public DenseCoeffsBase<Derived,
      *
      * \sa operator()(Index,Index), coeff(Index, Index) const, coeffRef(Index)
      */
-    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Scalar& coeffRef(Index row, Index col)
    {
      eigen_internal_assert(row >= 0 && row < rows()
-                         && col >= 0 && col < cols());
-      return typename internal::evaluator<Derived>::type(derived()).coeffRef(row,col);
+                        && col >= 0 && col < cols());
+      return derived().coeffRef(row, col);
    }

-    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Scalar&
    coeffRefByOuterInner(Index outer, Index inner)
    {
@@ -339,13 +330,12 @@ class DenseCoeffsBase<Derived, WriteAccessors> : public DenseCoeffsBase<Derived,
      * \sa operator[](Index)
      */

-    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Scalar&
    operator()(Index row, Index col)
    {
      eigen_assert(row >= 0 && row < rows()
          && col >= 0 && col < cols());
-      return coeffRef(row, col);
+      return derived().coeffRef(row, col);
    }


@@ -364,12 +354,11 @@ class DenseCoeffsBase<Derived, WriteAccessors> : public DenseCoeffsBase<Derived,
      * \sa operator[](Index), coeff(Index) const, coeffRef(Index,Index)
      */

-    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Scalar&
    coeffRef(Index index)
    {
      eigen_internal_assert(index >= 0 && index < size());
-      return typename internal::evaluator<Derived>::type(derived()).coeffRef(index);
+      return derived().coeffRef(index);
    }

    /** \returns a reference to the coefficient at given index.
@@ -379,14 +368,15 @@ class DenseCoeffsBase<Derived, WriteAccessors> : public DenseCoeffsBase<Derived,
      * \sa operator[](Index) const, operator()(Index,Index), x(), y(), z(), w()
      */

-    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Scalar&
    operator[](Index index)
    {
+      #ifndef EIGEN2_SUPPORT
      EIGEN_STATIC_ASSERT(Derived::IsVectorAtCompileTime,
                          THE_BRACKET_OPERATOR_IS_ONLY_FOR_VECTORS__USE_THE_PARENTHESIS_OPERATOR_INSTEAD)
+      #endif
      eigen_assert(index >= 0 && index < size());
-      return coeffRef(index);
+      return derived().coeffRef(index);
    }

    /** \returns a reference to the coefficient at given index.
@@ -398,37 +388,167 @@ class DenseCoeffsBase<Derived, WriteAccessors> : public DenseCoeffsBase<Derived,
      * \sa operator[](Index) const, operator()(Index,Index), x(), y(), z(), w()
      */

-    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Scalar&
    operator()(Index index)
    {
      eigen_assert(index >= 0 && index < size());
-      return coeffRef(index);
+      return derived().coeffRef(index);
    }

    /** equivalent to operator[](0).  */

-    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Scalar&
    x() { return (*this)[0]; }

    /** equivalent to operator[](1).  */

-    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Scalar&
    y() { return (*this)[1]; }

    /** equivalent to operator[](2).  */

-    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Scalar&
    z() { return (*this)[2]; }

    /** equivalent to operator[](3).  */

-    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Scalar&
    w() { return (*this)[3]; }
+
+    /** \internal
+      * Stores the given packet of coefficients, at the given row and column of this expression. It is your responsibility
+      * to ensure that a packet really starts there. This method is only available on expressions having the
+      * PacketAccessBit.
+      *
+      * The \a LoadMode parameter may have the value \a #Aligned or \a #Unaligned. Its effect is to select
+      * the appropriate vectorization instruction. Aligned access is faster, but is only possible for packets
+      * starting at an address which is a multiple of the packet size.
+      */
+
+    template<int StoreMode>
+    EIGEN_STRONG_INLINE void writePacket
+    (Index row, Index col, const typename internal::packet_traits<Scalar>::type& val)
+    {
+      eigen_internal_assert(row >= 0 && row < rows()
+                        && col >= 0 && col < cols());
+      derived().template writePacket<StoreMode>(row,col,val);
+    }
+
+
+    /** \internal */
+    template<int StoreMode>
+    EIGEN_STRONG_INLINE void writePacketByOuterInner
+    (Index outer, Index inner, const typename internal::packet_traits<Scalar>::type& val)
+    {
+      writePacket<StoreMode>(rowIndexByOuterInner(outer, inner),
+                            colIndexByOuterInner(outer, inner),
+                            val);
+    }
+
+    /** \internal
+      * Stores the given packet of coefficients, at the given index in this expression. It is your responsibility
+      * to ensure that a packet really starts there. This method is only available on expressions having the
+      * PacketAccessBit and the LinearAccessBit.
+      *
+      * The \a LoadMode parameter may have the value \a Aligned or \a Unaligned. Its effect is to select
+      * the appropriate vectorization instruction. Aligned access is faster, but is only possible for packets
+      * starting at an address which is a multiple of the packet size.
+      */
+    template<int StoreMode>
+    EIGEN_STRONG_INLINE void writePacket
+    (Index index, const typename internal::packet_traits<Scalar>::type& val)
+    {
+      eigen_internal_assert(index >= 0 && index < size());
+      derived().template writePacket<StoreMode>(index,val);
+    }
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+
+    /** \internal Copies the coefficient at position (row,col) of other into *this.
+      *
+      * This method is overridden in SwapWrapper, allowing swap() assignments to share 99% of their code
+      * with usual assignments.
+      *
+      * Outside of this internal usage, this method has probably no usefulness. It is hidden in the public API dox.
+      */
+
+    template<typename OtherDerived>
+    EIGEN_STRONG_INLINE void copyCoeff(Index row, Index col, const DenseBase<OtherDerived>& other)
+    {
+      eigen_internal_assert(row >= 0 && row < rows()
+                        && col >= 0 && col < cols());
+      derived().coeffRef(row, col) = other.derived().coeff(row, col);
+    }
+
+    /** \internal Copies the coefficient at the given index of other into *this.
+      *
+      * This method is overridden in SwapWrapper, allowing swap() assignments to share 99% of their code
+      * with usual assignments.
+      *
+      * Outside of this internal usage, this method has probably no usefulness. It is hidden in the public API dox.
+      */
+
+    template<typename OtherDerived>
+    EIGEN_STRONG_INLINE void copyCoeff(Index index, const DenseBase<OtherDerived>& other)
+    {
+      eigen_internal_assert(index >= 0 && index < size());
+      derived().coeffRef(index) = other.derived().coeff(index);
+    }
+
+
+    template<typename OtherDerived>
+    EIGEN_STRONG_INLINE void copyCoeffByOuterInner(Index outer, Index inner, const DenseBase<OtherDerived>& other)
+    {
+      const Index row = rowIndexByOuterInner(outer,inner);
+      const Index col = colIndexByOuterInner(outer,inner);
+      // derived() is important here: copyCoeff() may be reimplemented in Derived!
+      derived().copyCoeff(row, col, other);
+    }
+
+    /** \internal Copies the packet at position (row,col) of other into *this.
+      *
+      * This method is overridden in SwapWrapper, allowing swap() assignments to share 99% of their code
+      * with usual assignments.
+      *
+      * Outside of this internal usage, this method has probably no usefulness. It is hidden in the public API dox.
+      */
+
+    template<typename OtherDerived, int StoreMode, int LoadMode>
+    EIGEN_STRONG_INLINE void copyPacket(Index row, Index col, const DenseBase<OtherDerived>& other)
+    {
+      eigen_internal_assert(row >= 0 && row < rows()
+                        && col >= 0 && col < cols());
+      derived().template writePacket<StoreMode>(row, col,
+        other.derived().template packet<LoadMode>(row, col));
+    }
+
+    /** \internal Copies the packet at the given index of other into *this.
+      *
+      * This method is overridden in SwapWrapper, allowing swap() assignments to share 99% of their code
+      * with usual assignments.
+      *
+      * Outside of this internal usage, this method has probably no usefulness. It is hidden in the public API dox.
+      */
+
+    template<typename OtherDerived, int StoreMode, int LoadMode>
+    EIGEN_STRONG_INLINE void copyPacket(Index index, const DenseBase<OtherDerived>& other)
+    {
+      eigen_internal_assert(index >= 0 && index < size());
+      derived().template writePacket<StoreMode>(index,
+        other.derived().template packet<LoadMode>(index));
+    }
+
+    /** \internal */
+    template<typename OtherDerived, int StoreMode, int LoadMode>
+    EIGEN_STRONG_INLINE void copyPacketByOuterInner(Index outer, Index inner, const DenseBase<OtherDerived>& other)
+    {
+      const Index row = rowIndexByOuterInner(outer,inner);
+      const Index col = colIndexByOuterInner(outer,inner);
+      // derived() is important here: copyCoeff() may be reimplemented in Derived!
+      derived().template copyPacket< OtherDerived, StoreMode, LoadMode>(row, col, other);
+    }
+#endif
+
 };

 /** \brief Base class providing direct read-only coefficient access to matrices and arrays.
@@ -448,6 +568,7 @@ class DenseCoeffsBase<Derived, DirectAccessors> : public DenseCoeffsBase<Derived
  public:

    typedef DenseCoeffsBase<Derived, ReadOnlyAccessors> Base;
+    typedef typename internal::traits<Derived>::Index Index;
    typedef typename internal::traits<Derived>::Scalar Scalar;
    typedef typename NumTraits<Scalar>::Real RealScalar;

@@ -460,7 +581,6 @@ class DenseCoeffsBase<Derived, DirectAccessors> : public DenseCoeffsBase<Derived
      *
      * \sa outerStride(), rowStride(), colStride()
      */
-    EIGEN_DEVICE_FUNC
    inline Index innerStride() const
    {
      return derived().innerStride();
@@ -471,7 +591,6 @@ class DenseCoeffsBase<Derived, DirectAccessors> : public DenseCoeffsBase<Derived
      *
      * \sa innerStride(), rowStride(), colStride()
      */
-    EIGEN_DEVICE_FUNC
    inline Index outerStride() const
    {
      return derived().outerStride();
@@ -487,7 +606,6 @@ class DenseCoeffsBase<Derived, DirectAccessors> : public DenseCoeffsBase<Derived
      *
      * \sa innerStride(), outerStride(), colStride()
      */
-    EIGEN_DEVICE_FUNC
    inline Index rowStride() const
    {
      return Derived::IsRowMajor ? outerStride() : innerStride();
@@ -497,7 +615,6 @@ class DenseCoeffsBase<Derived, DirectAccessors> : public DenseCoeffsBase<Derived
      *
      * \sa innerStride(), outerStride(), rowStride()
      */
-    EIGEN_DEVICE_FUNC
    inline Index colStride() const
    {
      return Derived::IsRowMajor ? innerStride() : outerStride();
@@ -522,6 +639,7 @@ class DenseCoeffsBase<Derived, DirectWriteAccessors>
  public:

    typedef DenseCoeffsBase<Derived, WriteAccessors> Base;
+    typedef typename internal::traits<Derived>::Index Index;
    typedef typename internal::traits<Derived>::Scalar Scalar;
    typedef typename NumTraits<Scalar>::Real RealScalar;

@@ -534,7 +652,6 @@ class DenseCoeffsBase<Derived, DirectWriteAccessors>
      *
      * \sa outerStride(), rowStride(), colStride()
      */
-    EIGEN_DEVICE_FUNC
    inline Index innerStride() const
    {
      return derived().innerStride();
@@ -545,7 +662,6 @@ class DenseCoeffsBase<Derived, DirectWriteAccessors>
      *
      * \sa innerStride(), rowStride(), colStride()
      */
-    EIGEN_DEVICE_FUNC
    inline Index outerStride() const
    {
      return derived().outerStride();
@@ -561,7 +677,6 @@ class DenseCoeffsBase<Derived, DirectWriteAccessors>
      *
      * \sa innerStride(), outerStride(), colStride()
      */
-    EIGEN_DEVICE_FUNC
    inline Index rowStride() const
    {
      return Derived::IsRowMajor ? outerStride() : innerStride();
@@ -571,7 +686,6 @@ class DenseCoeffsBase<Derived, DirectWriteAccessors>
      *
      * \sa innerStride(), outerStride(), rowStride()
      */
-    EIGEN_DEVICE_FUNC
    inline Index colStride() const
    {
      return Derived::IsRowMajor ? innerStride() : outerStride();
@@ -583,14 +697,14 @@ namespace internal {
 template<typename Derived, bool JustReturnZero>
 struct first_aligned_impl
 {
-  static inline Index run(const Derived&)
+  static inline typename Derived::Index run(const Derived&)
  { return 0; }
 };

 template<typename Derived>
 struct first_aligned_impl<Derived, false>
 {
-  static inline Index run(const Derived& m)
+  static inline typename Derived::Index run(const Derived& m)
  {
    return internal::first_aligned(&m.const_cast_derived().coeffRef(0,0), m.size());
  }
@@ -602,7 +716,7 @@ struct first_aligned_impl<Derived, false>
  * documentation.
  */
 template<typename Derived>
-static inline Index first_aligned(const Derived& m)
+static inline typename Derived::Index first_aligned(const Derived& m)
 {
  return first_aligned_impl
          <Derived, (Derived::Flags & AlignedBit) || !(Derived::Flags & DirectAccessBit)>
--- a/Eigen/src/Core/DenseStorage.h
+++ b/Eigen/src/Core/DenseStorage.h
@@ -3,7 +3,7 @@
 //
 // Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
 // Copyright (C) 2006-2009 Benoit Jacob <jacob.benoit.1@gmail.com>
-// Copyright (C) 2010-2013 Hauke Heibel <hauke.heibel@gmail.com>
+// Copyright (C) 2010 Hauke Heibel <hauke.heibel@gmail.com>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -24,9 +24,7 @@ namespace internal {

 struct constructor_without_unaligned_array_assert {};

-template<typename T, int Size>
-EIGEN_DEVICE_FUNC
-void check_static_allocation_size()
+template<typename T, int Size> void check_static_allocation_size()
 {
  // if EIGEN_STACK_ALLOCATION_LIMIT is defined to 0, then no limit
  #if EIGEN_STACK_ALLOCATION_LIMIT
@@ -40,20 +38,18 @@ void check_static_allocation_size()
  */
 template <typename T, int Size, int MatrixOrArrayOptions,
          int Alignment = (MatrixOrArrayOptions&DontAlign) ? 0
-                        : (((Size*sizeof(T))%EIGEN_ALIGN_BYTES)==0) ? EIGEN_ALIGN_BYTES
+                        : (((Size*sizeof(T))%16)==0) ? 16
                        : 0 >
 struct plain_array
 {
  T array[Size];

-  EIGEN_DEVICE_FUNC
-  plain_array()
+  plain_array() 
  { 
    check_static_allocation_size<T,Size>();
  }

-  EIGEN_DEVICE_FUNC
-  plain_array(constructor_without_unaligned_array_assert)
+  plain_array(constructor_without_unaligned_array_assert) 
  { 
    check_static_allocation_size<T,Size>();
  }
@@ -68,31 +64,29 @@ struct plain_array
  template<typename PtrType>
  EIGEN_ALWAYS_INLINE PtrType eigen_unaligned_array_assert_workaround_gcc47(PtrType array) { return array; }
  #define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(sizemask) \
-    eigen_assert((reinterpret_cast<size_t>(eigen_unaligned_array_assert_workaround_gcc47(array)) & (sizemask)) == 0 \
+    eigen_assert((reinterpret_cast<size_t>(eigen_unaligned_array_assert_workaround_gcc47(array)) & sizemask) == 0 \
              && "this assertion is explained here: " \
              "http://eigen.tuxfamily.org/dox-devel/group__TopicUnalignedArrayAssert.html" \
              " **** READ THIS WEB PAGE !!! ****");
 #else
  #define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(sizemask) \
-    eigen_assert((reinterpret_cast<size_t>(array) & (sizemask)) == 0 \
+    eigen_assert((reinterpret_cast<size_t>(array) & sizemask) == 0 \
              && "this assertion is explained here: " \
              "http://eigen.tuxfamily.org/dox-devel/group__TopicUnalignedArrayAssert.html" \
              " **** READ THIS WEB PAGE !!! ****");
 #endif

 template <typename T, int Size, int MatrixOrArrayOptions>
-struct plain_array<T, Size, MatrixOrArrayOptions, EIGEN_ALIGN_BYTES>
+struct plain_array<T, Size, MatrixOrArrayOptions, 16>
 {
-  EIGEN_USER_ALIGN_DEFAULT T array[Size];
+  EIGEN_USER_ALIGN16 T array[Size];

-  EIGEN_DEVICE_FUNC
  plain_array() 
  { 
-    EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(EIGEN_ALIGN_BYTES-1);
+    EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(0xf);
    check_static_allocation_size<T,Size>();
  }

-  EIGEN_DEVICE_FUNC
  plain_array(constructor_without_unaligned_array_assert) 
  { 
    check_static_allocation_size<T,Size>();
@@ -102,9 +96,9 @@ struct plain_array<T, Size, MatrixOrArrayOptions, EIGEN_ALIGN_BYTES>
 template <typename T, int MatrixOrArrayOptions, int Alignment>
 struct plain_array<T, 0, MatrixOrArrayOptions, Alignment>
 {
-  EIGEN_USER_ALIGN_DEFAULT T array[1];
-  EIGEN_DEVICE_FUNC plain_array() {}
-  EIGEN_DEVICE_FUNC plain_array(constructor_without_unaligned_array_assert) {}
+  EIGEN_USER_ALIGN16 T array[1];
+  plain_array() {}
+  plain_array(constructor_without_unaligned_array_assert) {}
 };

 } // end namespace internal
@@ -128,44 +122,41 @@ template<typename T, int Size, int _Rows, int _Cols, int _Options> class DenseSt
 {
    internal::plain_array<T,Size,_Options> m_data;
  public:
-    EIGEN_DEVICE_FUNC DenseStorage() {}
-    EIGEN_DEVICE_FUNC
-    explicit DenseStorage(internal::constructor_without_unaligned_array_assert)
+    DenseStorage() {}
+    DenseStorage(internal::constructor_without_unaligned_array_assert)
      : m_data(internal::constructor_without_unaligned_array_assert()) {}
-    EIGEN_DEVICE_FUNC 
    DenseStorage(const DenseStorage& other) : m_data(other.m_data) {}
-    EIGEN_DEVICE_FUNC 
    DenseStorage& operator=(const DenseStorage& other)
-    { 
+    {
      if (this != &other) m_data = other.m_data;
-      return *this; 
+      return *this;
    }
-    EIGEN_DEVICE_FUNC DenseStorage(Index,Index,Index) {}
-    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); }
-    EIGEN_DEVICE_FUNC static Index rows(void) {return _Rows;}
-    EIGEN_DEVICE_FUNC static Index cols(void) {return _Cols;}
-    EIGEN_DEVICE_FUNC void conservativeResize(Index,Index,Index) {}
-    EIGEN_DEVICE_FUNC void resize(Index,Index,Index) {}
-    EIGEN_DEVICE_FUNC const T *data() const { return m_data.array; }
-    EIGEN_DEVICE_FUNC T *data() { return m_data.array; }
+    DenseStorage(DenseIndex,DenseIndex,DenseIndex) {}
+    void swap(DenseStorage& other) { std::swap(m_data,other.m_data); }
+    static DenseIndex rows(void) {return _Rows;}
+    static DenseIndex cols(void) {return _Cols;}
+    void conservativeResize(DenseIndex,DenseIndex,DenseIndex) {}
+    void resize(DenseIndex,DenseIndex,DenseIndex) {}
+    const T *data() const { return m_data.array; }
+    T *data() { return m_data.array; }
 };

 // null matrix
 template<typename T, int _Rows, int _Cols, int _Options> class DenseStorage<T, 0, _Rows, _Cols, _Options>
 {
  public:
-    EIGEN_DEVICE_FUNC DenseStorage() {}
-    EIGEN_DEVICE_FUNC explicit DenseStorage(internal::constructor_without_unaligned_array_assert) {}
-    EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage&) {}
-    EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage&) { return *this; }
-    EIGEN_DEVICE_FUNC DenseStorage(Index,Index,Index) {}
-    EIGEN_DEVICE_FUNC void swap(DenseStorage& ) {}
-    EIGEN_DEVICE_FUNC static Index rows(void) {return _Rows;}
-    EIGEN_DEVICE_FUNC static Index cols(void) {return _Cols;}
-    EIGEN_DEVICE_FUNC void conservativeResize(Index,Index,Index) {}
-    EIGEN_DEVICE_FUNC void resize(Index,Index,Index) {}
-    EIGEN_DEVICE_FUNC const T *data() const { return 0; }
-    EIGEN_DEVICE_FUNC T *data() { return 0; }
+    DenseStorage() {}
+    DenseStorage(internal::constructor_without_unaligned_array_assert) {}
+    DenseStorage(const DenseStorage&) {}
+    DenseStorage& operator=(const DenseStorage&) { return *this; }
+    DenseStorage(DenseIndex,DenseIndex,DenseIndex) {}
+    void swap(DenseStorage& ) {}
+    static DenseIndex rows(void) {return _Rows;}
+    static DenseIndex cols(void) {return _Cols;}
+    void conservativeResize(DenseIndex,DenseIndex,DenseIndex) {}
+    void resize(DenseIndex,DenseIndex,DenseIndex) {}
+    const T *data() const { return 0; }
+    T *data() { return 0; }
 };

 // more specializations for null matrices; these are necessary to resolve ambiguities
@@ -182,71 +173,71 @@ template<typename T, int _Cols, int _Options> class DenseStorage<T, 0, Dynamic,
 template<typename T, int Size, int _Options> class DenseStorage<T, Size, Dynamic, Dynamic, _Options>
 {
    internal::plain_array<T,Size,_Options> m_data;
-    Index m_rows;
-    Index m_cols;
+    DenseIndex m_rows;
+    DenseIndex m_cols;
  public:
-    EIGEN_DEVICE_FUNC DenseStorage() : m_rows(0), m_cols(0) {}
-    explicit DenseStorage(internal::constructor_without_unaligned_array_assert)
+    DenseStorage() : m_rows(0), m_cols(0) {}
+    DenseStorage(internal::constructor_without_unaligned_array_assert)
      : m_data(internal::constructor_without_unaligned_array_assert()), m_rows(0), m_cols(0) {}
    DenseStorage(const DenseStorage& other) : m_data(other.m_data), m_rows(other.m_rows), m_cols(other.m_cols) {}
-    DenseStorage& operator=(const DenseStorage& other) 
-    { 
+    DenseStorage& operator=(const DenseStorage& other)
+    {
      if (this != &other)
      {
        m_data = other.m_data;
        m_rows = other.m_rows;
        m_cols = other.m_cols;
      }
-      return *this; 
+      return *this;
    }
-    DenseStorage(Index, Index nbRows, Index nbCols) : m_rows(nbRows), m_cols(nbCols) {}
+    DenseStorage(DenseIndex, DenseIndex nbRows, DenseIndex nbCols) : m_rows(nbRows), m_cols(nbCols) {}
    void swap(DenseStorage& other)
    { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); std::swap(m_cols,other.m_cols); }
-    EIGEN_DEVICE_FUNC Index rows() const {return m_rows;}
-    EIGEN_DEVICE_FUNC Index cols() const {return m_cols;}
-    void conservativeResize(Index, Index nbRows, Index nbCols) { m_rows = nbRows; m_cols = nbCols; }
-    void resize(Index, Index nbRows, Index nbCols) { m_rows = nbRows; m_cols = nbCols; }
-    EIGEN_DEVICE_FUNC const T *data() const { return m_data.array; }
-    EIGEN_DEVICE_FUNC T *data() { return m_data.array; }
+    DenseIndex rows() const {return m_rows;}
+    DenseIndex cols() const {return m_cols;}
+    void conservativeResize(DenseIndex, DenseIndex nbRows, DenseIndex nbCols) { m_rows = nbRows; m_cols = nbCols; }
+    void resize(DenseIndex, DenseIndex nbRows, DenseIndex nbCols) { m_rows = nbRows; m_cols = nbCols; }
+    const T *data() const { return m_data.array; }
+    T *data() { return m_data.array; }
 };

 // dynamic-size matrix with fixed-size storage and fixed width
 template<typename T, int Size, int _Cols, int _Options> class DenseStorage<T, Size, Dynamic, _Cols, _Options>
 {
    internal::plain_array<T,Size,_Options> m_data;
-    Index m_rows;
+    DenseIndex m_rows;
  public:
-    EIGEN_DEVICE_FUNC DenseStorage() : m_rows(0) {}
-    explicit DenseStorage(internal::constructor_without_unaligned_array_assert)
+    DenseStorage() : m_rows(0) {}
+    DenseStorage(internal::constructor_without_unaligned_array_assert)
      : m_data(internal::constructor_without_unaligned_array_assert()), m_rows(0) {}
    DenseStorage(const DenseStorage& other) : m_data(other.m_data), m_rows(other.m_rows) {}
-    DenseStorage& operator=(const DenseStorage& other) 
+    DenseStorage& operator=(const DenseStorage& other)
    {
      if (this != &other)
      {
        m_data = other.m_data;
        m_rows = other.m_rows;
      }
-      return *this; 
+      return *this;
    }
-    DenseStorage(Index, Index nbRows, Index) : m_rows(nbRows) {}
+    DenseStorage(DenseIndex, DenseIndex nbRows, DenseIndex) : m_rows(nbRows) {}
    void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); }
-    EIGEN_DEVICE_FUNC Index rows(void) const {return m_rows;}
-    EIGEN_DEVICE_FUNC Index cols(void) const {return _Cols;}
-    void conservativeResize(Index, Index nbRows, Index) { m_rows = nbRows; }
-    void resize(Index, Index nbRows, Index) { m_rows = nbRows; }
-    EIGEN_DEVICE_FUNC const T *data() const { return m_data.array; }
-    EIGEN_DEVICE_FUNC T *data() { return m_data.array; }
+    DenseIndex rows(void) const {return m_rows;}
+    DenseIndex cols(void) const {return _Cols;}
+    void conservativeResize(DenseIndex, DenseIndex nbRows, DenseIndex) { m_rows = nbRows; }
+    void resize(DenseIndex, DenseIndex nbRows, DenseIndex) { m_rows = nbRows; }
+    const T *data() const { return m_data.array; }
+    T *data() { return m_data.array; }
 };

 // dynamic-size matrix with fixed-size storage and fixed height
 template<typename T, int Size, int _Rows, int _Options> class DenseStorage<T, Size, _Rows, Dynamic, _Options>
 {
    internal::plain_array<T,Size,_Options> m_data;
-    Index m_cols;
+    DenseIndex m_cols;
  public:
-    EIGEN_DEVICE_FUNC DenseStorage() : m_cols(0) {}
-    explicit DenseStorage(internal::constructor_without_unaligned_array_assert)
+    DenseStorage() : m_cols(0) {}
+    DenseStorage(internal::constructor_without_unaligned_array_assert)
      : m_data(internal::constructor_without_unaligned_array_assert()), m_cols(0) {}
    DenseStorage(const DenseStorage& other) : m_data(other.m_data), m_cols(other.m_cols) {}
    DenseStorage& operator=(const DenseStorage& other)
@@ -258,45 +249,29 @@ template<typename T, int Size, int _Rows, int _Options> class DenseStorage<T, Si
      }
      return *this;
    }
-    DenseStorage(Index, Index, Index nbCols) : m_cols(nbCols) {}
+    DenseStorage(DenseIndex, DenseIndex, DenseIndex nbCols) : m_cols(nbCols) {}
    void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_cols,other.m_cols); }
-    EIGEN_DEVICE_FUNC Index rows(void) const {return _Rows;}
-    EIGEN_DEVICE_FUNC Index cols(void) const {return m_cols;}
-    void conservativeResize(Index, Index, Index nbCols) { m_cols = nbCols; }
-    void resize(Index, Index, Index nbCols) { m_cols = nbCols; }
-    EIGEN_DEVICE_FUNC const T *data() const { return m_data.array; }
-    EIGEN_DEVICE_FUNC T *data() { return m_data.array; }
+    DenseIndex rows(void) const {return _Rows;}
+    DenseIndex cols(void) const {return m_cols;}
+    void conservativeResize(DenseIndex, DenseIndex, DenseIndex nbCols) { m_cols = nbCols; }
+    void resize(DenseIndex, DenseIndex, DenseIndex nbCols) { m_cols = nbCols; }
+    const T *data() const { return m_data.array; }
+    T *data() { return m_data.array; }
 };

 // purely dynamic matrix.
 template<typename T, int _Options> class DenseStorage<T, Dynamic, Dynamic, Dynamic, _Options>
 {
    T *m_data;
-    Index m_rows;
-    Index m_cols;
+    DenseIndex m_rows;
+    DenseIndex m_cols;
  public:
-    EIGEN_DEVICE_FUNC DenseStorage() : m_data(0), m_rows(0), m_cols(0) {}
-    explicit DenseStorage(internal::constructor_without_unaligned_array_assert)
+    DenseStorage() : m_data(0), m_rows(0), m_cols(0) {}
+    DenseStorage(internal::constructor_without_unaligned_array_assert)
       : m_data(0), m_rows(0), m_cols(0) {}
-    DenseStorage(Index size, Index nbRows, Index nbCols)
+    DenseStorage(DenseIndex size, DenseIndex nbRows, DenseIndex nbCols)
      : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size)), m_rows(nbRows), m_cols(nbCols)
    { EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN }
-    DenseStorage(const DenseStorage& other)
-      : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(other.m_rows*other.m_cols))
-      , m_rows(other.m_rows)
-      , m_cols(other.m_cols)
-    {
-      internal::smart_copy(other.m_data, other.m_data+other.m_rows*other.m_cols, m_data);
-    }
-    DenseStorage& operator=(const DenseStorage& other)
-    {
-      if (this != &other)
-      {
-        DenseStorage tmp(other);
-        this->swap(tmp);
-      }
-      return *this;
-    }
 #ifdef EIGEN_HAVE_RVALUE_REFERENCES
    DenseStorage(DenseStorage&& other)
      : m_data(std::move(other.m_data))
@@ -304,8 +279,6 @@ template<typename T, int _Options> class DenseStorage<T, Dynamic, Dynamic, Dynam
      , m_cols(std::move(other.m_cols))
    {
      other.m_data = nullptr;
-      other.m_rows = 0;
-      other.m_cols = 0;
    }
    DenseStorage& operator=(DenseStorage&& other)
    {
@@ -319,15 +292,15 @@ template<typename T, int _Options> class DenseStorage<T, Dynamic, Dynamic, Dynam
    ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, m_rows*m_cols); }
    void swap(DenseStorage& other)
    { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); std::swap(m_cols,other.m_cols); }
-    EIGEN_DEVICE_FUNC Index rows(void) const {return m_rows;}
-    EIGEN_DEVICE_FUNC Index cols(void) const {return m_cols;}
-    void conservativeResize(Index size, Index nbRows, Index nbCols)
+    DenseIndex rows(void) const {return m_rows;}
+    DenseIndex cols(void) const {return m_cols;}
+    void conservativeResize(DenseIndex size, DenseIndex nbRows, DenseIndex nbCols)
    {
      m_data = internal::conditional_aligned_realloc_new_auto<T,(_Options&DontAlign)==0>(m_data, size, m_rows*m_cols);
      m_rows = nbRows;
      m_cols = nbCols;
    }
-    void resize(Index size, Index nbRows, Index nbCols)
+    void resize(DenseIndex size, DenseIndex nbRows, DenseIndex nbCols)
    {
      if(size != m_rows*m_cols)
      {
@@ -341,42 +314,29 @@ template<typename T, int _Options> class DenseStorage<T, Dynamic, Dynamic, Dynam
      m_rows = nbRows;
      m_cols = nbCols;
    }
-    EIGEN_DEVICE_FUNC const T *data() const { return m_data; }
-    EIGEN_DEVICE_FUNC T *data() { return m_data; }
+    const T *data() const { return m_data; }
+    T *data() { return m_data; }
+  private:
+    DenseStorage(const DenseStorage&);
+    DenseStorage& operator=(const DenseStorage&);
 };

 // matrix with dynamic width and fixed height (so that matrix has dynamic size).
 template<typename T, int _Rows, int _Options> class DenseStorage<T, Dynamic, _Rows, Dynamic, _Options>
 {
    T *m_data;
-    Index m_cols;
+    DenseIndex m_cols;
  public:
-    EIGEN_DEVICE_FUNC DenseStorage() : m_data(0), m_cols(0) {}
-    explicit DenseStorage(internal::constructor_without_unaligned_array_assert) : m_data(0), m_cols(0) {}
-    DenseStorage(Index size, Index, Index nbCols) : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size)), m_cols(nbCols)
+    DenseStorage() : m_data(0), m_cols(0) {}
+    DenseStorage(internal::constructor_without_unaligned_array_assert) : m_data(0), m_cols(0) {}
+    DenseStorage(DenseIndex size, DenseIndex, DenseIndex nbCols) : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size)), m_cols(nbCols)
    { EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN }
-    DenseStorage(const DenseStorage& other)
-      : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(_Rows*other.m_cols))
-      , m_cols(other.m_cols)
-    {
-      internal::smart_copy(other.m_data, other.m_data+_Rows*m_cols, m_data);
-    }
-    DenseStorage& operator=(const DenseStorage& other)
-    {
-      if (this != &other)
-      {
-        DenseStorage tmp(other);
-        this->swap(tmp);
-      }
-      return *this;
-    }    
 #ifdef EIGEN_HAVE_RVALUE_REFERENCES
    DenseStorage(DenseStorage&& other)
      : m_data(std::move(other.m_data))
      , m_cols(std::move(other.m_cols))
    {
      other.m_data = nullptr;
-      other.m_cols = 0;
    }
    DenseStorage& operator=(DenseStorage&& other)
    {
@@ -388,14 +348,14 @@ template<typename T, int _Rows, int _Options> class DenseStorage<T, Dynamic, _Ro
 #endif
    ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Rows*m_cols); }
    void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_cols,other.m_cols); }
-    EIGEN_DEVICE_FUNC static Index rows(void) {return _Rows;}
-    EIGEN_DEVICE_FUNC Index cols(void) const {return m_cols;}
-    void conservativeResize(Index size, Index, Index nbCols)
+    static DenseIndex rows(void) {return _Rows;}
+    DenseIndex cols(void) const {return m_cols;}
+    void conservativeResize(DenseIndex size, DenseIndex, DenseIndex nbCols)
    {
      m_data = internal::conditional_aligned_realloc_new_auto<T,(_Options&DontAlign)==0>(m_data, size, _Rows*m_cols);
      m_cols = nbCols;
    }
-    EIGEN_STRONG_INLINE void resize(Index size, Index, Index nbCols)
+    EIGEN_STRONG_INLINE void resize(DenseIndex size, DenseIndex, DenseIndex nbCols)
    {
      if(size != _Rows*m_cols)
      {
@@ -408,42 +368,29 @@ template<typename T, int _Rows, int _Options> class DenseStorage<T, Dynamic, _Ro
      }
      m_cols = nbCols;
    }
-    EIGEN_DEVICE_FUNC const T *data() const { return m_data; }
-    EIGEN_DEVICE_FUNC T *data() { return m_data; }
+    const T *data() const { return m_data; }
+    T *data() { return m_data; }
+  private:
+    DenseStorage(const DenseStorage&);
+    DenseStorage& operator=(const DenseStorage&);
 };

 // matrix with dynamic height and fixed width (so that matrix has dynamic size).
 template<typename T, int _Cols, int _Options> class DenseStorage<T, Dynamic, Dynamic, _Cols, _Options>
 {
    T *m_data;
-    Index m_rows;
+    DenseIndex m_rows;
  public:
-    EIGEN_DEVICE_FUNC DenseStorage() : m_data(0), m_rows(0) {}
-    explicit DenseStorage(internal::constructor_without_unaligned_array_assert) : m_data(0), m_rows(0) {}
-    DenseStorage(Index size, Index nbRows, Index) : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size)), m_rows(nbRows)
+    DenseStorage() : m_data(0), m_rows(0) {}
+    DenseStorage(internal::constructor_without_unaligned_array_assert) : m_data(0), m_rows(0) {}
+    DenseStorage(DenseIndex size, DenseIndex nbRows, DenseIndex) : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size)), m_rows(nbRows)
    { EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN }
-    DenseStorage(const DenseStorage& other)
-      : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(other.m_rows*_Cols))
-      , m_rows(other.m_rows)
-    {
-      internal::smart_copy(other.m_data, other.m_data+other.m_rows*_Cols, m_data);
-    }
-    DenseStorage& operator=(const DenseStorage& other)
-    {
-      if (this != &other)
-      {
-        DenseStorage tmp(other);
-        this->swap(tmp);
-      }
-      return *this;
-    }    
 #ifdef EIGEN_HAVE_RVALUE_REFERENCES
    DenseStorage(DenseStorage&& other)
      : m_data(std::move(other.m_data))
      , m_rows(std::move(other.m_rows))
    {
      other.m_data = nullptr;
-      other.m_rows = 0;
    }
    DenseStorage& operator=(DenseStorage&& other)
    {
@@ -455,14 +402,14 @@ template<typename T, int _Cols, int _Options> class DenseStorage<T, Dynamic, Dyn
 #endif
    ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Cols*m_rows); }
    void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); }
-    EIGEN_DEVICE_FUNC Index rows(void) const {return m_rows;}
-    EIGEN_DEVICE_FUNC static Index cols(void) {return _Cols;}
-    void conservativeResize(Index size, Index nbRows, Index)
+    DenseIndex rows(void) const {return m_rows;}
+    static DenseIndex cols(void) {return _Cols;}
+    void conservativeResize(DenseIndex size, DenseIndex nbRows, DenseIndex)
    {
      m_data = internal::conditional_aligned_realloc_new_auto<T,(_Options&DontAlign)==0>(m_data, size, m_rows*_Cols);
      m_rows = nbRows;
    }
-    EIGEN_STRONG_INLINE void resize(Index size, Index nbRows, Index)
+    EIGEN_STRONG_INLINE void resize(DenseIndex size, DenseIndex nbRows, DenseIndex)
    {
      if(size != m_rows*_Cols)
      {
@@ -475,8 +422,11 @@ template<typename T, int _Cols, int _Options> class DenseStorage<T, Dynamic, Dyn
      }
      m_rows = nbRows;
    }
-    EIGEN_DEVICE_FUNC const T *data() const { return m_data; }
-    EIGEN_DEVICE_FUNC T *data() { return m_data; }
+    const T *data() const { return m_data; }
+    T *data() { return m_data; }
+  private:
+    DenseStorage(const DenseStorage&);
+    DenseStorage& operator=(const DenseStorage&);
 };

 } // end namespace Eigen
--- a/Eigen/src/Core/Diagonal.h
+++ b/Eigen/src/Core/Diagonal.h
@@ -52,7 +52,8 @@ struct traits<Diagonal<MatrixType,DiagIndex> >
                                                 MatrixType::MaxColsAtCompileTime - EIGEN_PLAIN_ENUM_MAX( DiagIndex, 0))),
    MaxColsAtCompileTime = 1,
    MaskLvalueBit = is_lvalue<MatrixType>::value ? LvalueBit : 0,
-    Flags = (unsigned int)_MatrixTypeNested::Flags & (RowMajorBit | MaskLvalueBit | DirectAccessBit) & ~RowMajorBit, // FIXME DirectAccessBit should not be handled by expressions
+    Flags = (unsigned int)_MatrixTypeNested::Flags & (HereditaryBits | LinearAccessBit | MaskLvalueBit | DirectAccessBit) & ~RowMajorBit,
+    CoeffReadCost = _MatrixTypeNested::CoeffReadCost,
    MatrixTypeOuterStride = outer_stride_at_compile_time<MatrixType>::ret,
    InnerStrideAtCompileTime = MatrixTypeOuterStride == Dynamic ? Dynamic : MatrixTypeOuterStride+1,
    OuterStrideAtCompileTime = 0
@@ -69,28 +70,20 @@ template<typename MatrixType, int _DiagIndex> class Diagonal
    typedef typename internal::dense_xpr_base<Diagonal>::type Base;
    EIGEN_DENSE_PUBLIC_INTERFACE(Diagonal)

-    EIGEN_DEVICE_FUNC
-    explicit inline Diagonal(MatrixType& matrix, Index a_index = DiagIndex) : m_matrix(matrix), m_index(a_index) {}
+    inline Diagonal(MatrixType& matrix, Index a_index = DiagIndex) : m_matrix(matrix), m_index(a_index) {}

    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Diagonal)

-    EIGEN_DEVICE_FUNC
    inline Index rows() const
-    {
-      return m_index.value()<0 ? numext::mini<Index>(m_matrix.cols(),m_matrix.rows()+m_index.value())
-                               : numext::mini<Index>(m_matrix.rows(),m_matrix.cols()-m_index.value());
-    }
+    { return m_index.value()<0 ? (std::min<Index>)(m_matrix.cols(),m_matrix.rows()+m_index.value()) : (std::min<Index>)(m_matrix.rows(),m_matrix.cols()-m_index.value()); }

-    EIGEN_DEVICE_FUNC
    inline Index cols() const { return 1; }

-    EIGEN_DEVICE_FUNC
    inline Index innerStride() const
    {
      return m_matrix.outerStride() + 1;
    }

-    EIGEN_DEVICE_FUNC
    inline Index outerStride() const
    {
      return 0;
@@ -102,58 +95,48 @@ template<typename MatrixType, int _DiagIndex> class Diagonal
                       const Scalar
                     >::type ScalarWithConstIfNotLvalue;

-    EIGEN_DEVICE_FUNC
    inline ScalarWithConstIfNotLvalue* data() { return &(m_matrix.const_cast_derived().coeffRef(rowOffset(), colOffset())); }
-    EIGEN_DEVICE_FUNC
    inline const Scalar* data() const { return &(m_matrix.const_cast_derived().coeffRef(rowOffset(), colOffset())); }

-    EIGEN_DEVICE_FUNC
    inline Scalar& coeffRef(Index row, Index)
    {
      EIGEN_STATIC_ASSERT_LVALUE(MatrixType)
      return m_matrix.const_cast_derived().coeffRef(row+rowOffset(), row+colOffset());
    }

-    EIGEN_DEVICE_FUNC
    inline const Scalar& coeffRef(Index row, Index) const
    {
      return m_matrix.const_cast_derived().coeffRef(row+rowOffset(), row+colOffset());
    }

-    EIGEN_DEVICE_FUNC
    inline CoeffReturnType coeff(Index row, Index) const
    {
      return m_matrix.coeff(row+rowOffset(), row+colOffset());
    }

-    EIGEN_DEVICE_FUNC
    inline Scalar& coeffRef(Index idx)
    {
      EIGEN_STATIC_ASSERT_LVALUE(MatrixType)
      return m_matrix.const_cast_derived().coeffRef(idx+rowOffset(), idx+colOffset());
    }

-    EIGEN_DEVICE_FUNC
    inline const Scalar& coeffRef(Index idx) const
    {
      return m_matrix.const_cast_derived().coeffRef(idx+rowOffset(), idx+colOffset());
    }

-    EIGEN_DEVICE_FUNC
    inline CoeffReturnType coeff(Index idx) const
    {
      return m_matrix.coeff(idx+rowOffset(), idx+colOffset());
    }

-    EIGEN_DEVICE_FUNC
-    inline const typename internal::remove_all<typename MatrixType::Nested>::type& 
+    const typename internal::remove_all<typename MatrixType::Nested>::type& 
    nestedExpression() const 
    {
      return m_matrix;
    }

-    EIGEN_DEVICE_FUNC
-    inline Index index() const
+    int index() const
    {
      return m_index.value();
    }
@@ -164,13 +147,10 @@ template<typename MatrixType, int _DiagIndex> class Diagonal

  private:
    // some compilers may fail to optimize std::max etc in case of compile-time constants...
-    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Index absDiagIndex() const { return m_index.value()>0 ? m_index.value() : -m_index.value(); }
-    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Index rowOffset() const { return m_index.value()>0 ? 0 : -m_index.value(); }
-    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Index colOffset() const { return m_index.value()>0 ? m_index.value() : 0; }
-    // trigger a compile time error is someone try to call packet
+    // triger a compile time error is someone try to call packet
    template<int LoadMode> typename MatrixType::PacketReturnType packet(Index) const;
    template<int LoadMode> typename MatrixType::PacketReturnType packet(Index,Index) const;
 };
@@ -187,7 +167,7 @@ template<typename Derived>
 inline typename MatrixBase<Derived>::DiagonalReturnType
 MatrixBase<Derived>::diagonal()
 {
-  return DiagonalReturnType(derived());
+  return derived();
 }

 /** This is the const version of diagonal(). */
@@ -236,20 +216,20 @@ MatrixBase<Derived>::diagonal(Index index) const
  *
  * \sa MatrixBase::diagonal(), class Diagonal */
 template<typename Derived>
-template<int Index_>
-inline typename MatrixBase<Derived>::template DiagonalIndexReturnType<Index_>::Type
+template<int Index>
+inline typename MatrixBase<Derived>::template DiagonalIndexReturnType<Index>::Type
 MatrixBase<Derived>::diagonal()
 {
-  return typename DiagonalIndexReturnType<Index_>::Type(derived());
+  return derived();
 }

 /** This is the const version of diagonal<int>(). */
 template<typename Derived>
-template<int Index_>
-inline typename MatrixBase<Derived>::template ConstDiagonalIndexReturnType<Index_>::Type
+template<int Index>
+inline typename MatrixBase<Derived>::template ConstDiagonalIndexReturnType<Index>::Type
 MatrixBase<Derived>::diagonal() const
 {
-  return typename ConstDiagonalIndexReturnType<Index_>::Type(derived());
+  return derived();
 }

 } // end namespace Eigen
--- a/Eigen/src/Core/DiagonalMatrix.h
+++ b/Eigen/src/Core/DiagonalMatrix.h
@@ -22,7 +22,7 @@ class DiagonalBase : public EigenBase<Derived>
    typedef typename DiagonalVectorType::Scalar Scalar;
    typedef typename DiagonalVectorType::RealScalar RealScalar;
    typedef typename internal::traits<Derived>::StorageKind StorageKind;
-    typedef typename internal::traits<Derived>::StorageIndex StorageIndex;
+    typedef typename internal::traits<Derived>::Index Index;

    enum {
      RowsAtCompileTime = DiagonalVectorType::SizeAtCompileTime,
@@ -30,62 +30,79 @@ class DiagonalBase : public EigenBase<Derived>
      MaxRowsAtCompileTime = DiagonalVectorType::MaxSizeAtCompileTime,
      MaxColsAtCompileTime = DiagonalVectorType::MaxSizeAtCompileTime,
      IsVectorAtCompileTime = 0,
-      Flags = NoPreferredStorageOrderBit
+      Flags = 0
    };

    typedef Matrix<Scalar, RowsAtCompileTime, ColsAtCompileTime, 0, MaxRowsAtCompileTime, MaxColsAtCompileTime> DenseMatrixType;
    typedef DenseMatrixType DenseType;
    typedef DiagonalMatrix<Scalar,DiagonalVectorType::SizeAtCompileTime,DiagonalVectorType::MaxSizeAtCompileTime> PlainObject;

-    EIGEN_DEVICE_FUNC
    inline const Derived& derived() const { return *static_cast<const Derived*>(this); }
-    EIGEN_DEVICE_FUNC
    inline Derived& derived() { return *static_cast<Derived*>(this); }

-    EIGEN_DEVICE_FUNC
    DenseMatrixType toDenseMatrix() const { return derived(); }
-    
-    EIGEN_DEVICE_FUNC
+    template<typename DenseDerived>
+    void evalTo(MatrixBase<DenseDerived> &other) const;
+    template<typename DenseDerived>
+    inline void addTo(MatrixBase<DenseDerived> &other) const
+    { other.diagonal() += diagonal(); }
+    template<typename DenseDerived>
+    inline void subTo(MatrixBase<DenseDerived> &other) const
+    { other.diagonal() -= diagonal(); }
+
    inline const DiagonalVectorType& diagonal() const { return derived().diagonal(); }
-    EIGEN_DEVICE_FUNC
    inline DiagonalVectorType& diagonal() { return derived().diagonal(); }

-    EIGEN_DEVICE_FUNC
    inline Index rows() const { return diagonal().size(); }
-    EIGEN_DEVICE_FUNC
    inline Index cols() const { return diagonal().size(); }

+    /** \returns the diagonal matrix product of \c *this by the matrix \a matrix.
+      */
    template<typename MatrixDerived>
-    EIGEN_DEVICE_FUNC
-    const Product<Derived,MatrixDerived,LazyProduct>
+    const DiagonalProduct<MatrixDerived, Derived, OnTheLeft>
    operator*(const MatrixBase<MatrixDerived> &matrix) const
    {
-      return Product<Derived, MatrixDerived, LazyProduct>(derived(),matrix.derived());
+      return DiagonalProduct<MatrixDerived, Derived, OnTheLeft>(matrix.derived(), derived());
    }

-    typedef DiagonalWrapper<const CwiseUnaryOp<internal::scalar_inverse_op<Scalar>, const DiagonalVectorType> > InverseReturnType;
-    EIGEN_DEVICE_FUNC
-    inline const InverseReturnType
+    inline const DiagonalWrapper<const CwiseUnaryOp<internal::scalar_inverse_op<Scalar>, const DiagonalVectorType> >
    inverse() const
    {
-      return InverseReturnType(diagonal().cwiseInverse());
+      return diagonal().cwiseInverse();
    }
    
-    typedef DiagonalWrapper<const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DiagonalVectorType> > ScalarMultipleReturnType;
-    EIGEN_DEVICE_FUNC
-    inline const ScalarMultipleReturnType
+    inline const DiagonalWrapper<const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DiagonalVectorType> >
    operator*(const Scalar& scalar) const
    {
-      return ScalarMultipleReturnType(diagonal() * scalar);
+      return diagonal() * scalar;
    }
-    EIGEN_DEVICE_FUNC
-    friend inline const ScalarMultipleReturnType
+    friend inline const DiagonalWrapper<const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DiagonalVectorType> >
    operator*(const Scalar& scalar, const DiagonalBase& other)
    {
-      return ScalarMultipleReturnType(other.diagonal() * scalar);
+      return other.diagonal() * scalar;
    }
+    
+    #ifdef EIGEN2_SUPPORT
+    template<typename OtherDerived>
+    bool isApprox(const DiagonalBase<OtherDerived>& other, typename NumTraits<Scalar>::Real precision = NumTraits<Scalar>::dummy_precision()) const
+    {
+      return diagonal().isApprox(other.diagonal(), precision);
+    }
+    template<typename OtherDerived>
+    bool isApprox(const MatrixBase<OtherDerived>& other, typename NumTraits<Scalar>::Real precision = NumTraits<Scalar>::dummy_precision()) const
+    {
+      return toDenseMatrix().isApprox(other, precision);
+    }
+    #endif
 };

+template<typename Derived>
+template<typename DenseDerived>
+inline void DiagonalBase<Derived>::evalTo(MatrixBase<DenseDerived> &other) const
+{
+  other.setZero();
+  other.diagonal() = diagonal();
+}
 #endif

 /** \class DiagonalMatrix
@@ -107,9 +124,10 @@ struct traits<DiagonalMatrix<_Scalar,SizeAtCompileTime,MaxSizeAtCompileTime> >
 : traits<Matrix<_Scalar,SizeAtCompileTime,SizeAtCompileTime,0,MaxSizeAtCompileTime,MaxSizeAtCompileTime> >
 {
  typedef Matrix<_Scalar,SizeAtCompileTime,1,0,MaxSizeAtCompileTime,1> DiagonalVectorType;
-  typedef DiagonalShape StorageKind;
+  typedef Dense StorageKind;
+  typedef DenseIndex Index;
  enum {
-    Flags = LvalueBit | NoPreferredStorageOrderBit
+    Flags = LvalueBit
  };
 };
 }
@@ -123,7 +141,7 @@ class DiagonalMatrix
    typedef const DiagonalMatrix& Nested;
    typedef _Scalar Scalar;
    typedef typename internal::traits<DiagonalMatrix>::StorageKind StorageKind;
-    typedef typename internal::traits<DiagonalMatrix>::StorageIndex StorageIndex;
+    typedef typename internal::traits<DiagonalMatrix>::Index Index;
    #endif

  protected:
@@ -133,31 +151,24 @@ class DiagonalMatrix
  public:

    /** const version of diagonal(). */
-    EIGEN_DEVICE_FUNC
    inline const DiagonalVectorType& diagonal() const { return m_diagonal; }
    /** \returns a reference to the stored vector of diagonal coefficients. */
-    EIGEN_DEVICE_FUNC
    inline DiagonalVectorType& diagonal() { return m_diagonal; }

    /** Default constructor without initialization */
-    EIGEN_DEVICE_FUNC
    inline DiagonalMatrix() {}

    /** Constructs a diagonal matrix with given dimension  */
-    EIGEN_DEVICE_FUNC
-    explicit inline DiagonalMatrix(Index dim) : m_diagonal(dim) {}
+    inline DiagonalMatrix(Index dim) : m_diagonal(dim) {}

    /** 2D constructor. */
-    EIGEN_DEVICE_FUNC
    inline DiagonalMatrix(const Scalar& x, const Scalar& y) : m_diagonal(x,y) {}

    /** 3D constructor. */
-    EIGEN_DEVICE_FUNC
    inline DiagonalMatrix(const Scalar& x, const Scalar& y, const Scalar& z) : m_diagonal(x,y,z) {}

    /** Copy constructor. */
    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
    inline DiagonalMatrix(const DiagonalBase<OtherDerived>& other) : m_diagonal(other.diagonal()) {}

    #ifndef EIGEN_PARSED_BY_DOXYGEN
@@ -167,13 +178,11 @@ class DiagonalMatrix

    /** generic constructor from expression of the diagonal coefficients */
    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
    explicit inline DiagonalMatrix(const MatrixBase<OtherDerived>& other) : m_diagonal(other)
    {}

    /** Copy operator. */
    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
    DiagonalMatrix& operator=(const DiagonalBase<OtherDerived>& other)
    {
      m_diagonal = other.diagonal();
@@ -184,7 +193,6 @@ class DiagonalMatrix
    /** This is a special case of the templated operator=. Its purpose is to
      * prevent a default operator= from hiding the templated operator=.
      */
-    EIGEN_DEVICE_FUNC
    DiagonalMatrix& operator=(const DiagonalMatrix& other)
    {
      m_diagonal = other.diagonal();
@@ -193,19 +201,14 @@ class DiagonalMatrix
    #endif

    /** Resizes to given size. */
-    EIGEN_DEVICE_FUNC
    inline void resize(Index size) { m_diagonal.resize(size); }
    /** Sets all coefficients to zero. */
-    EIGEN_DEVICE_FUNC
    inline void setZero() { m_diagonal.setZero(); }
    /** Resizes and sets all coefficients to zero. */
-    EIGEN_DEVICE_FUNC
    inline void setZero(Index size) { m_diagonal.setZero(size); }
    /** Sets this matrix to be the identity matrix of the current size. */
-    EIGEN_DEVICE_FUNC
    inline void setIdentity() { m_diagonal.setOnes(); }
    /** Sets this matrix to be the identity matrix of the given size. */
-    EIGEN_DEVICE_FUNC
    inline void setIdentity(Index size) { m_diagonal.setOnes(size); }
 };

@@ -229,15 +232,14 @@ struct traits<DiagonalWrapper<_DiagonalVectorType> >
 {
  typedef _DiagonalVectorType DiagonalVectorType;
  typedef typename DiagonalVectorType::Scalar Scalar;
-  typedef typename DiagonalVectorType::StorageIndex StorageIndex;
-  typedef DiagonalShape StorageKind;
-  typedef typename traits<DiagonalVectorType>::XprKind XprKind;
+  typedef typename DiagonalVectorType::Index Index;
+  typedef typename DiagonalVectorType::StorageKind StorageKind;
  enum {
    RowsAtCompileTime = DiagonalVectorType::SizeAtCompileTime,
    ColsAtCompileTime = DiagonalVectorType::SizeAtCompileTime,
-    MaxRowsAtCompileTime = DiagonalVectorType::MaxSizeAtCompileTime,
-    MaxColsAtCompileTime = DiagonalVectorType::MaxSizeAtCompileTime,
-    Flags =  (traits<DiagonalVectorType>::Flags & LvalueBit) | NoPreferredStorageOrderBit
+    MaxRowsAtCompileTime = DiagonalVectorType::SizeAtCompileTime,
+    MaxColsAtCompileTime = DiagonalVectorType::SizeAtCompileTime,
+    Flags =  traits<DiagonalVectorType>::Flags & LvalueBit
  };
 };
 }
@@ -253,11 +255,9 @@ class DiagonalWrapper
    #endif

    /** Constructor from expression of diagonal coefficients to wrap. */
-    EIGEN_DEVICE_FUNC
-    explicit inline DiagonalWrapper(DiagonalVectorType& a_diagonal) : m_diagonal(a_diagonal) {}
+    inline DiagonalWrapper(DiagonalVectorType& a_diagonal) : m_diagonal(a_diagonal) {}

    /** \returns a const reference to the wrapped expression of diagonal coefficients. */
-    EIGEN_DEVICE_FUNC
    const DiagonalVectorType& diagonal() const { return m_diagonal; }

  protected:
@@ -277,7 +277,7 @@ template<typename Derived>
 inline const DiagonalWrapper<const Derived>
 MatrixBase<Derived>::asDiagonal() const
 {
-  return DiagonalWrapper<const Derived>(derived());
+  return derived();
 }

 /** \returns true if *this is approximately equal to a diagonal matrix,
@@ -308,33 +308,6 @@ bool MatrixBase<Derived>::isDiagonal(const RealScalar& prec) const
  return true;
 }

-namespace internal {
-
-template<> struct storage_kind_to_shape<DiagonalShape> { typedef DiagonalShape Shape; };
-
-struct Diagonal2Dense {};
-
-template<> struct AssignmentKind<DenseShape,DiagonalShape> { typedef Diagonal2Dense Kind; };
-
-// Diagonal matrix to Dense assignment
-template< typename DstXprType, typename SrcXprType, typename Functor, typename Scalar>
-struct Assignment<DstXprType, SrcXprType, Functor, Diagonal2Dense, Scalar>
-{
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar> &/*func*/)
-  {
-    dst.setZero();
-    dst.diagonal() = src.diagonal();
-  }
-  
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<typename DstXprType::Scalar> &/*func*/)
-  { dst.diagonal() += src.diagonal(); }
-  
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<typename DstXprType::Scalar> &/*func*/)
-  { dst.diagonal() -= src.diagonal(); }
-};
-
-} // namespace internal
-
 } // end namespace Eigen

 #endif // EIGEN_DIAGONALMATRIX_H
--- a/Eigen/src/Core/DiagonalProduct.h
+++ b/Eigen/src/Core/DiagonalProduct.h
@@ -13,14 +13,117 @@

 namespace Eigen { 

+namespace internal {
+template<typename MatrixType, typename DiagonalType, int ProductOrder>
+struct traits<DiagonalProduct<MatrixType, DiagonalType, ProductOrder> >
+ : traits<MatrixType>
+{
+  typedef typename scalar_product_traits<typename MatrixType::Scalar, typename DiagonalType::Scalar>::ReturnType Scalar;
+  enum {
+    RowsAtCompileTime = MatrixType::RowsAtCompileTime,
+    ColsAtCompileTime = MatrixType::ColsAtCompileTime,
+    MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
+    MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime,
+
+    _StorageOrder = MatrixType::Flags & RowMajorBit ? RowMajor : ColMajor,
+    _ScalarAccessOnDiag =  !((int(_StorageOrder) == ColMajor && int(ProductOrder) == OnTheLeft)
+                          ||(int(_StorageOrder) == RowMajor && int(ProductOrder) == OnTheRight)),
+    _SameTypes = is_same<typename MatrixType::Scalar, typename DiagonalType::Scalar>::value,
+    // FIXME currently we need same types, but in the future the next rule should be the one
+    //_Vectorizable = bool(int(MatrixType::Flags)&PacketAccessBit) && ((!_PacketOnDiag) || (_SameTypes && bool(int(DiagonalType::DiagonalVectorType::Flags)&PacketAccessBit))),
+    _Vectorizable = bool(int(MatrixType::Flags)&PacketAccessBit) && _SameTypes && (_ScalarAccessOnDiag || (bool(int(DiagonalType::DiagonalVectorType::Flags)&PacketAccessBit))),
+    _LinearAccessMask = (RowsAtCompileTime==1 || ColsAtCompileTime==1) ? LinearAccessBit : 0,
+
+    Flags = ((HereditaryBits|_LinearAccessMask|AlignedBit) & (unsigned int)(MatrixType::Flags)) | (_Vectorizable ? PacketAccessBit : 0),//(int(MatrixType::Flags)&int(DiagonalType::DiagonalVectorType::Flags)&AlignedBit),
+    Cost0 = EIGEN_ADD_COST(NumTraits<Scalar>::MulCost, MatrixType::CoeffReadCost),
+    CoeffReadCost = EIGEN_ADD_COST(Cost0,DiagonalType::DiagonalVectorType::CoeffReadCost)
+  };
+};
+}
+
+template<typename MatrixType, typename DiagonalType, int ProductOrder>
+class DiagonalProduct : internal::no_assignment_operator,
+                        public MatrixBase<DiagonalProduct<MatrixType, DiagonalType, ProductOrder> >
+{
+  public:
+
+    typedef MatrixBase<DiagonalProduct> Base;
+    EIGEN_DENSE_PUBLIC_INTERFACE(DiagonalProduct)
+
+    inline DiagonalProduct(const MatrixType& matrix, const DiagonalType& diagonal)
+      : m_matrix(matrix), m_diagonal(diagonal)
+    {
+      eigen_assert(diagonal.diagonal().size() == (ProductOrder == OnTheLeft ? matrix.rows() : matrix.cols()));
+    }
+
+    EIGEN_STRONG_INLINE Index rows() const { return m_matrix.rows(); }
+    EIGEN_STRONG_INLINE Index cols() const { return m_matrix.cols(); }
+
+    EIGEN_STRONG_INLINE const Scalar coeff(Index row, Index col) const
+    {
+      return m_diagonal.diagonal().coeff(ProductOrder == OnTheLeft ? row : col) * m_matrix.coeff(row, col);
+    }
+    
+    EIGEN_STRONG_INLINE const Scalar coeff(Index idx) const
+    {
+      enum {
+        StorageOrder = int(MatrixType::Flags) & RowMajorBit ? RowMajor : ColMajor
+      };
+      return coeff(int(StorageOrder)==ColMajor?idx:0,int(StorageOrder)==ColMajor?0:idx);
+    }
+
+    template<int LoadMode>
+    EIGEN_STRONG_INLINE PacketScalar packet(Index row, Index col) const
+    {
+      enum {
+        StorageOrder = Flags & RowMajorBit ? RowMajor : ColMajor
+      };
+      const Index indexInDiagonalVector = ProductOrder == OnTheLeft ? row : col;
+      return packet_impl<LoadMode>(row,col,indexInDiagonalVector,typename internal::conditional<
+        ((int(StorageOrder) == RowMajor && int(ProductOrder) == OnTheLeft)
+       ||(int(StorageOrder) == ColMajor && int(ProductOrder) == OnTheRight)), internal::true_type, internal::false_type>::type());
+    }
+    
+    template<int LoadMode>
+    EIGEN_STRONG_INLINE PacketScalar packet(Index idx) const
+    {
+      enum {
+        StorageOrder = int(MatrixType::Flags) & RowMajorBit ? RowMajor : ColMajor
+      };
+      return packet<LoadMode>(int(StorageOrder)==ColMajor?idx:0,int(StorageOrder)==ColMajor?0:idx);
+    }
+
+  protected:
+    template<int LoadMode>
+    EIGEN_STRONG_INLINE PacketScalar packet_impl(Index row, Index col, Index id, internal::true_type) const
+    {
+      return internal::pmul(m_matrix.template packet<LoadMode>(row, col),
+                     internal::pset1<PacketScalar>(m_diagonal.diagonal().coeff(id)));
+    }
+
+    template<int LoadMode>
+    EIGEN_STRONG_INLINE PacketScalar packet_impl(Index row, Index col, Index id, internal::false_type) const
+    {
+      enum {
+        InnerSize = (MatrixType::Flags & RowMajorBit) ? MatrixType::ColsAtCompileTime : MatrixType::RowsAtCompileTime,
+        DiagonalVectorPacketLoadMode = (LoadMode == Aligned && (((InnerSize%16) == 0) || (int(DiagonalType::DiagonalVectorType::Flags)&AlignedBit)==AlignedBit) ? Aligned : Unaligned)
+      };
+      return internal::pmul(m_matrix.template packet<LoadMode>(row, col),
+                     m_diagonal.diagonal().template packet<DiagonalVectorPacketLoadMode>(id));
+    }
+
+    typename MatrixType::Nested m_matrix;
+    typename DiagonalType::Nested m_diagonal;
+};
+
 /** \returns the diagonal matrix product of \c *this by the diagonal matrix \a diagonal.
  */
 template<typename Derived>
 template<typename DiagonalDerived>
-inline const Product<Derived, DiagonalDerived, LazyProduct>
+inline const DiagonalProduct<Derived, DiagonalDerived, OnTheRight>
 MatrixBase<Derived>::operator*(const DiagonalBase<DiagonalDerived> &a_diagonal) const
 {
-  return Product<Derived, DiagonalDerived, LazyProduct>(derived(),a_diagonal.derived());
+  return DiagonalProduct<Derived, DiagonalDerived, OnTheRight>(derived(), a_diagonal.derived());
 }

 } // end namespace Eigen
--- a/Eigen/src/Core/Dot.h
+++ b/Eigen/src/Core/Dot.h
@@ -29,7 +29,6 @@ template<typename T, typename U,
 struct dot_nocheck
 {
  typedef typename scalar_product_traits<typename traits<T>::Scalar,typename traits<U>::Scalar>::ReturnType ResScalar;
-  EIGEN_DEVICE_FUNC
  static inline ResScalar run(const MatrixBase<T>& a, const MatrixBase<U>& b)
  {
    return a.template binaryExpr<scalar_conj_product_op<typename traits<T>::Scalar,typename traits<U>::Scalar> >(b).sum();
@@ -40,7 +39,6 @@ template<typename T, typename U>
 struct dot_nocheck<T, U, true>
 {
  typedef typename scalar_product_traits<typename traits<T>::Scalar,typename traits<U>::Scalar>::ReturnType ResScalar;
-  EIGEN_DEVICE_FUNC
  static inline ResScalar run(const MatrixBase<T>& a, const MatrixBase<U>& b)
  {
    return a.transpose().template binaryExpr<scalar_conj_product_op<typename traits<T>::Scalar,typename traits<U>::Scalar> >(b).sum();
@@ -61,8 +59,7 @@ struct dot_nocheck<T, U, true>
  */
 template<typename Derived>
 template<typename OtherDerived>
-EIGEN_DEVICE_FUNC
-typename internal::scalar_product_traits<typename internal::traits<Derived>::Scalar,typename internal::traits<OtherDerived>::Scalar>::ReturnType
+inline typename internal::scalar_product_traits<typename internal::traits<Derived>::Scalar,typename internal::traits<OtherDerived>::Scalar>::ReturnType
 MatrixBase<Derived>::dot(const MatrixBase<OtherDerived>& other) const
 {
  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
@@ -76,6 +73,34 @@ MatrixBase<Derived>::dot(const MatrixBase<OtherDerived>& other) const
  return internal::dot_nocheck<Derived,OtherDerived>::run(*this, other);
 }

+#ifdef EIGEN2_SUPPORT
+/** \returns the dot product of *this with other, with the Eigen2 convention that the dot product is linear in the first variable
+  * (conjugating the second variable). Of course this only makes a difference in the complex case.
+  *
+  * This method is only available in EIGEN2_SUPPORT mode.
+  *
+  * \only_for_vectors
+  *
+  * \sa dot()
+  */
+template<typename Derived>
+template<typename OtherDerived>
+typename internal::traits<Derived>::Scalar
+MatrixBase<Derived>::eigen2_dot(const MatrixBase<OtherDerived>& other) const
+{
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
+  EIGEN_STATIC_ASSERT_SAME_VECTOR_SIZE(Derived,OtherDerived)
+  EIGEN_STATIC_ASSERT((internal::is_same<Scalar, typename OtherDerived::Scalar>::value),
+    YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
+
+  eigen_assert(size() == other.size());
+
+  return internal::dot_nocheck<OtherDerived,Derived>::run(other,*this);
+}
+#endif
+
+
 //---------- implementation of L2 norm and related functions ----------

 /** \returns, for vectors, the squared \em l2 norm of \c *this, and for matrices the Frobenius norm.
@@ -113,7 +138,8 @@ template<typename Derived>
 inline const typename MatrixBase<Derived>::PlainObject
 MatrixBase<Derived>::normalized() const
 {
-  typedef typename internal::nested_eval<Derived,2>::type _Nested;
+  typedef typename internal::nested<Derived>::type Nested;
+  typedef typename internal::remove_reference<Nested>::type _Nested;
  _Nested n(derived());
  return n / n.norm();
 }
@@ -138,7 +164,6 @@ template<typename Derived, int p>
 struct lpNorm_selector
 {
  typedef typename NumTraits<typename traits<Derived>::Scalar>::Real RealScalar;
-  EIGEN_DEVICE_FUNC
  static inline RealScalar run(const MatrixBase<Derived>& m)
  {
    using std::pow;
@@ -149,7 +174,6 @@ struct lpNorm_selector
 template<typename Derived>
 struct lpNorm_selector<Derived, 1>
 {
-  EIGEN_DEVICE_FUNC
  static inline typename NumTraits<typename traits<Derived>::Scalar>::Real run(const MatrixBase<Derived>& m)
  {
    return m.cwiseAbs().sum();
@@ -159,7 +183,6 @@ struct lpNorm_selector<Derived, 1>
 template<typename Derived>
 struct lpNorm_selector<Derived, 2>
 {
-  EIGEN_DEVICE_FUNC
  static inline typename NumTraits<typename traits<Derived>::Scalar>::Real run(const MatrixBase<Derived>& m)
  {
    return m.norm();
@@ -169,7 +192,6 @@ struct lpNorm_selector<Derived, 2>
 template<typename Derived>
 struct lpNorm_selector<Derived, Infinity>
 {
-  EIGEN_DEVICE_FUNC
  static inline typename NumTraits<typename traits<Derived>::Scalar>::Real run(const MatrixBase<Derived>& m)
  {
    return m.cwiseAbs().maxCoeff();
@@ -205,8 +227,8 @@ template<typename OtherDerived>
 bool MatrixBase<Derived>::isOrthogonal
 (const MatrixBase<OtherDerived>& other, const RealScalar& prec) const
 {
-  typename internal::nested_eval<Derived,2>::type nested(derived());
-  typename internal::nested_eval<OtherDerived,2>::type otherNested(other.derived());
+  typename internal::nested<Derived,2>::type nested(derived());
+  typename internal::nested<OtherDerived,2>::type otherNested(other.derived());
  return numext::abs2(nested.dot(otherNested)) <= prec * prec * nested.squaredNorm() * otherNested.squaredNorm();
 }

--- a/Eigen/src/Core/EigenBase.h
+++ b/Eigen/src/Core/EigenBase.h
@@ -13,9 +13,7 @@

 namespace Eigen {

-/** \class EigenBase
-  * 
-  * Common base class for all classes T such that MatrixBase has an operator=(T) and a constructor MatrixBase(T).
+/** Common base class for all classes T such that MatrixBase has an operator=(T) and a constructor MatrixBase(T).
  *
  * In other words, an EigenBase object is an object that can be copied into a MatrixBase.
  *
@@ -28,52 +26,34 @@ namespace Eigen {
 template<typename Derived> struct EigenBase
 {
 //   typedef typename internal::plain_matrix_type<Derived>::type PlainObject;
-  
-  /** \brief The interface type of indices
-    * \details To change this, \c \#define the preprocessor symbol \c EIGEN_DEFAULT_DENSE_INDEX_TYPE.
-    * \deprecated Since Eigen 3.3, its usage is deprecated. Use Eigen::Index instead.
-    * \sa StorageIndex, \ref TopicPreprocessorDirectives.
-    */
-  typedef Eigen::Index Index;

-  // FIXME is it needed?
  typedef typename internal::traits<Derived>::StorageKind StorageKind;
+  typedef typename internal::traits<Derived>::Index Index;

  /** \returns a reference to the derived object */
-  EIGEN_DEVICE_FUNC
  Derived& derived() { return *static_cast<Derived*>(this); }
  /** \returns a const reference to the derived object */
-  EIGEN_DEVICE_FUNC
  const Derived& derived() const { return *static_cast<const Derived*>(this); }

-  EIGEN_DEVICE_FUNC
  inline Derived& const_cast_derived() const
  { return *static_cast<Derived*>(const_cast<EigenBase*>(this)); }
-  EIGEN_DEVICE_FUNC
  inline const Derived& const_derived() const
  { return *static_cast<const Derived*>(this); }

  /** \returns the number of rows. \sa cols(), RowsAtCompileTime */
-  EIGEN_DEVICE_FUNC
  inline Index rows() const { return derived().rows(); }
  /** \returns the number of columns. \sa rows(), ColsAtCompileTime*/
-  EIGEN_DEVICE_FUNC
  inline Index cols() const { return derived().cols(); }
  /** \returns the number of coefficients, which is rows()*cols().
    * \sa rows(), cols(), SizeAtCompileTime. */
-  EIGEN_DEVICE_FUNC
  inline Index size() const { return rows() * cols(); }

  /** \internal Don't use it, but do the equivalent: \code dst = *this; \endcode */
-  template<typename Dest>
-  EIGEN_DEVICE_FUNC
-  inline void evalTo(Dest& dst) const
+  template<typename Dest> inline void evalTo(Dest& dst) const
  { derived().evalTo(dst); }

  /** \internal Don't use it, but do the equivalent: \code dst += *this; \endcode */
-  template<typename Dest>
-  EIGEN_DEVICE_FUNC
-  inline void addTo(Dest& dst) const
+  template<typename Dest> inline void addTo(Dest& dst) const
  {
    // This is the default implementation,
    // derived class can reimplement it in a more optimized way.
@@ -83,9 +63,7 @@ template<typename Derived> struct EigenBase
  }

  /** \internal Don't use it, but do the equivalent: \code dst -= *this; \endcode */
-  template<typename Dest>
-  EIGEN_DEVICE_FUNC
-  inline void subTo(Dest& dst) const
+  template<typename Dest> inline void subTo(Dest& dst) const
  {
    // This is the default implementation,
    // derived class can reimplement it in a more optimized way.
@@ -95,8 +73,7 @@ template<typename Derived> struct EigenBase
  }

  /** \internal Don't use it, but do the equivalent: \code dst.applyOnTheRight(*this); \endcode */
-  template<typename Dest>
-  EIGEN_DEVICE_FUNC inline void applyThisOnTheRight(Dest& dst) const
+  template<typename Dest> inline void applyThisOnTheRight(Dest& dst) const
  {
    // This is the default implementation,
    // derived class can reimplement it in a more optimized way.
@@ -104,8 +81,7 @@ template<typename Derived> struct EigenBase
  }

  /** \internal Don't use it, but do the equivalent: \code dst.applyOnTheLeft(*this); \endcode */
-  template<typename Dest>
-  EIGEN_DEVICE_FUNC inline void applyThisOnTheLeft(Dest& dst) const
+  template<typename Dest> inline void applyThisOnTheLeft(Dest& dst) const
  {
    // This is the default implementation,
    // derived class can reimplement it in a more optimized way.
@@ -130,7 +106,7 @@ template<typename Derived>
 template<typename OtherDerived>
 Derived& DenseBase<Derived>::operator=(const EigenBase<OtherDerived> &other)
 {
-  call_assignment(derived(), other.derived());
+  other.derived().evalTo(derived());
  return derived();
 }

@@ -138,7 +114,7 @@ template<typename Derived>
 template<typename OtherDerived>
 Derived& DenseBase<Derived>::operator+=(const EigenBase<OtherDerived> &other)
 {
-  call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar>());
+  other.derived().addTo(derived());
  return derived();
 }

@@ -146,7 +122,7 @@ template<typename Derived>
 template<typename OtherDerived>
 Derived& DenseBase<Derived>::operator-=(const EigenBase<OtherDerived> &other)
 {
-  call_assignment(derived(), other.derived(), internal::sub_assign_op<Scalar>());
+  other.derived().subTo(derived());
  return derived();
 }

--- a/Eigen/src/Core/Flagged.h
+++ b/Eigen/src/Core/Flagged.h
@@ -48,39 +48,39 @@ template<typename ExpressionType, unsigned int Added, unsigned int Removed> clas
        ExpressionType, const ExpressionType&>::type ExpressionTypeNested;
    typedef typename ExpressionType::InnerIterator InnerIterator;

-    explicit inline Flagged(const ExpressionType& matrix) : m_matrix(matrix) {}
+    inline Flagged(const ExpressionType& matrix) : m_matrix(matrix) {}

-    EIGEN_DEVICE_FUNC inline Index rows() const { return m_matrix.rows(); }
-    EIGEN_DEVICE_FUNC inline Index cols() const { return m_matrix.cols(); }
-    EIGEN_DEVICE_FUNC inline Index outerStride() const { return m_matrix.outerStride(); }
-    EIGEN_DEVICE_FUNC inline Index innerStride() const { return m_matrix.innerStride(); }
+    inline Index rows() const { return m_matrix.rows(); }
+    inline Index cols() const { return m_matrix.cols(); }
+    inline Index outerStride() const { return m_matrix.outerStride(); }
+    inline Index innerStride() const { return m_matrix.innerStride(); }

-    EIGEN_DEVICE_FUNC inline CoeffReturnType coeff(Index row, Index col) const
+    inline CoeffReturnType coeff(Index row, Index col) const
    {
      return m_matrix.coeff(row, col);
    }

-    EIGEN_DEVICE_FUNC inline CoeffReturnType coeff(Index index) const
+    inline CoeffReturnType coeff(Index index) const
    {
      return m_matrix.coeff(index);
    }
    
-    EIGEN_DEVICE_FUNC inline const Scalar& coeffRef(Index row, Index col) const
+    inline const Scalar& coeffRef(Index row, Index col) const
    {
      return m_matrix.const_cast_derived().coeffRef(row, col);
    }

-    EIGEN_DEVICE_FUNC inline const Scalar& coeffRef(Index index) const
+    inline const Scalar& coeffRef(Index index) const
    {
      return m_matrix.const_cast_derived().coeffRef(index);
    }

-    EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index row, Index col)
+    inline Scalar& coeffRef(Index row, Index col)
    {
      return m_matrix.const_cast_derived().coeffRef(row, col);
    }

-    EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index index)
+    inline Scalar& coeffRef(Index index)
    {
      return m_matrix.const_cast_derived().coeffRef(index);
    }
@@ -109,13 +109,13 @@ template<typename ExpressionType, unsigned int Added, unsigned int Removed> clas
      m_matrix.const_cast_derived().template writePacket<LoadMode>(index, x);
    }

-    EIGEN_DEVICE_FUNC const ExpressionType& _expression() const { return m_matrix; }
+    const ExpressionType& _expression() const { return m_matrix; }

    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC typename ExpressionType::PlainObject solveTriangular(const MatrixBase<OtherDerived>& other) const;
+    typename ExpressionType::PlainObject solveTriangular(const MatrixBase<OtherDerived>& other) const;

    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC void solveTriangularInPlace(const MatrixBase<OtherDerived>& other) const;
+    void solveTriangularInPlace(const MatrixBase<OtherDerived>& other) const;

  protected:
    ExpressionTypeNested m_matrix;
@@ -132,7 +132,7 @@ template<unsigned int Added,unsigned int Removed>
 inline const Flagged<Derived, Added, Removed>
 DenseBase<Derived>::flagged() const
 {
-  return Flagged<Derived, Added, Removed>(derived());
+  return derived();
 }

 } // end namespace Eigen
--- a/Eigen/src/Core/ForceAlignedAccess.h
+++ b/Eigen/src/Core/ForceAlignedAccess.h
@@ -39,29 +39,29 @@ template<typename ExpressionType> class ForceAlignedAccess
    typedef typename internal::dense_xpr_base<ForceAlignedAccess>::type Base;
    EIGEN_DENSE_PUBLIC_INTERFACE(ForceAlignedAccess)

-    EIGEN_DEVICE_FUNC explicit inline ForceAlignedAccess(const ExpressionType& matrix) : m_expression(matrix) {}
+    inline ForceAlignedAccess(const ExpressionType& matrix) : m_expression(matrix) {}

-    EIGEN_DEVICE_FUNC inline Index rows() const { return m_expression.rows(); }
-    EIGEN_DEVICE_FUNC inline Index cols() const { return m_expression.cols(); }
-    EIGEN_DEVICE_FUNC inline Index outerStride() const { return m_expression.outerStride(); }
-    EIGEN_DEVICE_FUNC inline Index innerStride() const { return m_expression.innerStride(); }
+    inline Index rows() const { return m_expression.rows(); }
+    inline Index cols() const { return m_expression.cols(); }
+    inline Index outerStride() const { return m_expression.outerStride(); }
+    inline Index innerStride() const { return m_expression.innerStride(); }

-    EIGEN_DEVICE_FUNC inline const CoeffReturnType coeff(Index row, Index col) const
+    inline const CoeffReturnType coeff(Index row, Index col) const
    {
      return m_expression.coeff(row, col);
    }

-    EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index row, Index col)
+    inline Scalar& coeffRef(Index row, Index col)
    {
      return m_expression.const_cast_derived().coeffRef(row, col);
    }

-    EIGEN_DEVICE_FUNC inline const CoeffReturnType coeff(Index index) const
+    inline const CoeffReturnType coeff(Index index) const
    {
      return m_expression.coeff(index);
    }

-    EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index index)
+    inline Scalar& coeffRef(Index index)
    {
      return m_expression.const_cast_derived().coeffRef(index);
    }
@@ -90,7 +90,7 @@ template<typename ExpressionType> class ForceAlignedAccess
      m_expression.const_cast_derived().template writePacket<Aligned>(index, x);
    }

-    EIGEN_DEVICE_FUNC operator const ExpressionType&() const { return m_expression; }
+    operator const ExpressionType&() const { return m_expression; }

  protected:
    const ExpressionType& m_expression;
@@ -127,7 +127,7 @@ template<bool Enable>
 inline typename internal::add_const_on_value_type<typename internal::conditional<Enable,ForceAlignedAccess<Derived>,Derived&>::type>::type
 MatrixBase<Derived>::forceAlignedAccessIf() const
 {
-  return derived();  // FIXME This should not work but apparently is never used
+  return derived();
 }

 /** \returns an expression of *this with forced aligned access if \a Enable is true.
@@ -138,7 +138,7 @@ template<bool Enable>
 inline typename internal::conditional<Enable,ForceAlignedAccess<Derived>,Derived&>::type
 MatrixBase<Derived>::forceAlignedAccessIf()
 {
-  return derived();  // FIXME This should not work but apparently is never used
+  return derived();
 }

 } // end namespace Eigen
--- a/Eigen/src/Core/Functors.h
+++ b/Eigen/src/Core/Functors.h
--- a/Eigen/src/Core/Fuzzy.h
+++ b/Eigen/src/Core/Fuzzy.h
@@ -19,19 +19,18 @@ namespace internal
 template<typename Derived, typename OtherDerived, bool is_integer = NumTraits<typename Derived::Scalar>::IsInteger>
 struct isApprox_selector
 {
-  EIGEN_DEVICE_FUNC
  static bool run(const Derived& x, const OtherDerived& y, const typename Derived::RealScalar& prec)
  {
-    typename internal::nested_eval<Derived,2>::type nested(x);
-    typename internal::nested_eval<OtherDerived,2>::type otherNested(y);
-    return (nested - otherNested).cwiseAbs2().sum() <= prec * prec * numext::mini(nested.cwiseAbs2().sum(), otherNested.cwiseAbs2().sum());
+    using std::min;
+    typename internal::nested<Derived,2>::type nested(x);
+    typename internal::nested<OtherDerived,2>::type otherNested(y);
+    return (nested - otherNested).cwiseAbs2().sum() <= prec * prec * (min)(nested.cwiseAbs2().sum(), otherNested.cwiseAbs2().sum());
  }
 };

 template<typename Derived, typename OtherDerived>
 struct isApprox_selector<Derived, OtherDerived, true>
 {
-  EIGEN_DEVICE_FUNC
  static bool run(const Derived& x, const OtherDerived& y, const typename Derived::RealScalar&)
  {
    return x.matrix() == y.matrix();
@@ -41,7 +40,6 @@ struct isApprox_selector<Derived, OtherDerived, true>
 template<typename Derived, typename OtherDerived, bool is_integer = NumTraits<typename Derived::Scalar>::IsInteger>
 struct isMuchSmallerThan_object_selector
 {
-  EIGEN_DEVICE_FUNC
  static bool run(const Derived& x, const OtherDerived& y, const typename Derived::RealScalar& prec)
  {
    return x.cwiseAbs2().sum() <= numext::abs2(prec) * y.cwiseAbs2().sum();
@@ -51,7 +49,6 @@ struct isMuchSmallerThan_object_selector
 template<typename Derived, typename OtherDerived>
 struct isMuchSmallerThan_object_selector<Derived, OtherDerived, true>
 {
-  EIGEN_DEVICE_FUNC
  static bool run(const Derived& x, const OtherDerived&, const typename Derived::RealScalar&)
  {
    return x.matrix() == Derived::Zero(x.rows(), x.cols()).matrix();
@@ -61,7 +58,6 @@ struct isMuchSmallerThan_object_selector<Derived, OtherDerived, true>
 template<typename Derived, bool is_integer = NumTraits<typename Derived::Scalar>::IsInteger>
 struct isMuchSmallerThan_scalar_selector
 {
-  EIGEN_DEVICE_FUNC
  static bool run(const Derived& x, const typename Derived::RealScalar& y, const typename Derived::RealScalar& prec)
  {
    return x.cwiseAbs2().sum() <= numext::abs2(prec * y);
@@ -71,7 +67,6 @@ struct isMuchSmallerThan_scalar_selector
 template<typename Derived>
 struct isMuchSmallerThan_scalar_selector<Derived, true>
 {
-  EIGEN_DEVICE_FUNC
  static bool run(const Derived& x, const typename Derived::RealScalar&, const typename Derived::RealScalar&)
  {
    return x.matrix() == Derived::Zero(x.rows(), x.cols()).matrix();
--- a/Eigen/src/Core/GeneralProduct.h
+++ b/Eigen/src/Core/GeneralProduct.h
@@ -11,7 +11,29 @@
 #ifndef EIGEN_GENERAL_PRODUCT_H
 #define EIGEN_GENERAL_PRODUCT_H

-namespace Eigen {
+namespace Eigen { 
+
+/** \class GeneralProduct
+  * \ingroup Core_Module
+  *
+  * \brief Expression of the product of two general matrices or vectors
+  *
+  * \param LhsNested the type used to store the left-hand side
+  * \param RhsNested the type used to store the right-hand side
+  * \param ProductMode the type of the product
+  *
+  * This class represents an expression of the product of two general matrices.
+  * We call a general matrix, a dense matrix with full storage. For instance,
+  * This excludes triangular, selfadjoint, and sparse matrices.
+  * It is the return type of the operator* between general matrices. Its template
+  * arguments are determined automatically by ProductReturnType. Therefore,
+  * GeneralProduct should never be used direclty. To determine the result type of a
+  * function which involves a matrix product, use ProductReturnType::Type.
+  *
+  * \sa ProductReturnType, MatrixBase::operator*(const MatrixBase<OtherDerived>&)
+  */
+template<typename Lhs, typename Rhs, int ProductType = internal::product_type<Lhs,Rhs>::value>
+class GeneralProduct;

 enum {
  Large = 2,
@@ -37,14 +59,15 @@ template<typename Lhs, typename Rhs> struct product_type
  typedef typename remove_all<Lhs>::type _Lhs;
  typedef typename remove_all<Rhs>::type _Rhs;
  enum {
-    MaxRows = traits<_Lhs>::MaxRowsAtCompileTime,
-    Rows    = traits<_Lhs>::RowsAtCompileTime,
-    MaxCols = traits<_Rhs>::MaxColsAtCompileTime,
-    Cols    = traits<_Rhs>::ColsAtCompileTime,
-    MaxDepth = EIGEN_SIZE_MIN_PREFER_FIXED(traits<_Lhs>::MaxColsAtCompileTime,
-                                           traits<_Rhs>::MaxRowsAtCompileTime),
-    Depth = EIGEN_SIZE_MIN_PREFER_FIXED(traits<_Lhs>::ColsAtCompileTime,
-                                        traits<_Rhs>::RowsAtCompileTime)
+    MaxRows  = _Lhs::MaxRowsAtCompileTime,
+    Rows  = _Lhs::RowsAtCompileTime,
+    MaxCols  = _Rhs::MaxColsAtCompileTime,
+    Cols  = _Rhs::ColsAtCompileTime,
+    MaxDepth = EIGEN_SIZE_MIN_PREFER_FIXED(_Lhs::MaxColsAtCompileTime,
+                                           _Rhs::MaxRowsAtCompileTime),
+    Depth = EIGEN_SIZE_MIN_PREFER_FIXED(_Lhs::ColsAtCompileTime,
+                                        _Rhs::RowsAtCompileTime),
+    LargeThreshold = EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
  };

  // the splitting into different lines of code here, introducing the _select enums and the typedef below,
@@ -59,8 +82,7 @@ private:

 public:
  enum {
-    value = selector::ret,
-    ret = selector::ret
+    value = selector::ret
  };
 #ifdef EIGEN_DEBUG_PRODUCT
  static void debug()
@@ -76,31 +98,6 @@ public:
 #endif
 };

-// template<typename Lhs, typename Rhs> struct product_tag
-// {
-// private:
-//   
-//   typedef typename remove_all<Lhs>::type _Lhs;
-//   typedef typename remove_all<Rhs>::type _Rhs;
-//   enum {
-//     Rows  = _Lhs::RowsAtCompileTime,
-//     Cols  = _Rhs::ColsAtCompileTime,
-//     Depth = EIGEN_SIZE_MIN_PREFER_FIXED(_Lhs::ColsAtCompileTime, _Rhs::RowsAtCompileTime)
-//   };
-// 
-//   enum {
-//     rows_select = Rows==1 ? int(Rows) : int(Large),
-//     cols_select = Cols==1 ? int(Cols) : int(Large),
-//     depth_select = Depth==1 ? int(Depth) : int(Large)
-//   };
-//   typedef product_type_selector<rows_select, cols_select, depth_select> selector;
-// 
-// public:
-//   enum {
-//     ret = selector::ret
-//   };
-// 
-// };

 /* The following allows to select the kind of product at compile time
 * based on the three dimensions of the product.
@@ -131,6 +128,54 @@ template<>              struct product_type_selector<Large,Large,Small>  { enum

 } // end namespace internal

+/** \class ProductReturnType
+  * \ingroup Core_Module
+  *
+  * \brief Helper class to get the correct and optimized returned type of operator*
+  *
+  * \param Lhs the type of the left-hand side
+  * \param Rhs the type of the right-hand side
+  * \param ProductMode the type of the product (determined automatically by internal::product_mode)
+  *
+  * This class defines the typename Type representing the optimized product expression
+  * between two matrix expressions. In practice, using ProductReturnType<Lhs,Rhs>::Type
+  * is the recommended way to define the result type of a function returning an expression
+  * which involve a matrix product. The class Product should never be
+  * used directly.
+  *
+  * \sa class Product, MatrixBase::operator*(const MatrixBase<OtherDerived>&)
+  */
+template<typename Lhs, typename Rhs, int ProductType>
+struct ProductReturnType
+{
+  // TODO use the nested type to reduce instanciations ????
+//   typedef typename internal::nested<Lhs,Rhs::ColsAtCompileTime>::type LhsNested;
+//   typedef typename internal::nested<Rhs,Lhs::RowsAtCompileTime>::type RhsNested;
+
+  typedef GeneralProduct<Lhs/*Nested*/, Rhs/*Nested*/, ProductType> Type;
+};
+
+template<typename Lhs, typename Rhs>
+struct ProductReturnType<Lhs,Rhs,CoeffBasedProductMode>
+{
+  typedef typename internal::nested<Lhs, Rhs::ColsAtCompileTime, typename internal::plain_matrix_type<Lhs>::type >::type LhsNested;
+  typedef typename internal::nested<Rhs, Lhs::RowsAtCompileTime, typename internal::plain_matrix_type<Rhs>::type >::type RhsNested;
+  typedef CoeffBasedProduct<LhsNested, RhsNested, EvalBeforeAssigningBit | EvalBeforeNestingBit> Type;
+};
+
+template<typename Lhs, typename Rhs>
+struct ProductReturnType<Lhs,Rhs,LazyCoeffBasedProductMode>
+{
+  typedef typename internal::nested<Lhs, Rhs::ColsAtCompileTime, typename internal::plain_matrix_type<Lhs>::type >::type LhsNested;
+  typedef typename internal::nested<Rhs, Lhs::RowsAtCompileTime, typename internal::plain_matrix_type<Rhs>::type >::type RhsNested;
+  typedef CoeffBasedProduct<LhsNested, RhsNested, NestByRefBit> Type;
+};
+
+// this is a workaround for sun CC
+template<typename Lhs, typename Rhs>
+struct LazyProductReturnType : public ProductReturnType<Lhs,Rhs,LazyCoeffBasedProductMode>
+{};
+
 /***********************************************************************
 *  Implementation of Inner Vector Vector Product
 ***********************************************************************/
@@ -142,10 +187,114 @@ template<>              struct product_type_selector<Large,Large,Small>  { enum
 // product ends up to a row-vector times col-vector product... To tackle this use
 // case, we could have a specialization for Block<MatrixType,1,1> with: operator=(Scalar x);

+namespace internal {
+
+template<typename Lhs, typename Rhs>
+struct traits<GeneralProduct<Lhs,Rhs,InnerProduct> >
+ : traits<Matrix<typename scalar_product_traits<typename Lhs::Scalar, typename Rhs::Scalar>::ReturnType,1,1> >
+{};
+
+}
+
+template<typename Lhs, typename Rhs>
+class GeneralProduct<Lhs, Rhs, InnerProduct>
+  : internal::no_assignment_operator,
+    public Matrix<typename internal::scalar_product_traits<typename Lhs::Scalar, typename Rhs::Scalar>::ReturnType,1,1>
+{
+    typedef Matrix<typename internal::scalar_product_traits<typename Lhs::Scalar, typename Rhs::Scalar>::ReturnType,1,1> Base;
+  public:
+    GeneralProduct(const Lhs& lhs, const Rhs& rhs)
+    {
+      Base::coeffRef(0,0) = (lhs.transpose().cwiseProduct(rhs)).sum();
+    }
+
+    /** Convertion to scalar */
+    operator const typename Base::Scalar() const {
+      return Base::coeff(0,0);
+    }
+};
+
 /***********************************************************************
 *  Implementation of Outer Vector Vector Product
 ***********************************************************************/

+namespace internal {
+
+// Column major
+template<typename ProductType, typename Dest, typename Func>
+EIGEN_DONT_INLINE void outer_product_selector_run(const ProductType& prod, Dest& dest, const Func& func, const false_type&)
+{
+  typedef typename Dest::Index Index;
+  // FIXME make sure lhs is sequentially stored
+  // FIXME not very good if rhs is real and lhs complex while alpha is real too
+  const Index cols = dest.cols();
+  for (Index j=0; j<cols; ++j)
+    func(dest.col(j), prod.rhs().coeff(0,j) * prod.lhs());
+}
+
+// Row major
+template<typename ProductType, typename Dest, typename Func>
+EIGEN_DONT_INLINE void outer_product_selector_run(const ProductType& prod, Dest& dest, const Func& func, const true_type&) {
+  typedef typename Dest::Index Index;
+  // FIXME make sure rhs is sequentially stored
+  // FIXME not very good if lhs is real and rhs complex while alpha is real too
+  const Index rows = dest.rows();
+  for (Index i=0; i<rows; ++i)
+    func(dest.row(i), prod.lhs().coeff(i,0) * prod.rhs());
+}
+
+template<typename Lhs, typename Rhs>
+struct traits<GeneralProduct<Lhs,Rhs,OuterProduct> >
+ : traits<ProductBase<GeneralProduct<Lhs,Rhs,OuterProduct>, Lhs, Rhs> >
+{};
+
+}
+
+template<typename Lhs, typename Rhs>
+class GeneralProduct<Lhs, Rhs, OuterProduct>
+  : public ProductBase<GeneralProduct<Lhs,Rhs,OuterProduct>, Lhs, Rhs>
+{
+    template<typename T> struct is_row_major : internal::conditional<(int(T::Flags)&RowMajorBit), internal::true_type, internal::false_type>::type {};
+    
+  public:
+    EIGEN_PRODUCT_PUBLIC_INTERFACE(GeneralProduct)
+
+    GeneralProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs)
+    {
+    }
+    
+    struct set  { template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived()  = src; } };
+    struct add  { template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() += src; } };
+    struct sub  { template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() -= src; } };
+    struct adds {
+      Scalar m_scale;
+      adds(const Scalar& s) : m_scale(s) {}
+      template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const {
+        dst.const_cast_derived() += m_scale * src;
+      }
+    };
+    
+    template<typename Dest>
+    inline void evalTo(Dest& dest) const {
+      internal::outer_product_selector_run(*this, dest, set(), is_row_major<Dest>());
+    }
+    
+    template<typename Dest>
+    inline void addTo(Dest& dest) const {
+      internal::outer_product_selector_run(*this, dest, add(), is_row_major<Dest>());
+    }
+
+    template<typename Dest>
+    inline void subTo(Dest& dest) const {
+      internal::outer_product_selector_run(*this, dest, sub(), is_row_major<Dest>());
+    }
+
+    template<typename Dest> void scaleAndAddTo(Dest& dest, const Scalar& alpha) const
+    {
+      internal::outer_product_selector_run(*this, dest, adds(alpha), is_row_major<Dest>());
+    }
+};
+
 /***********************************************************************
 *  Implementation of General Matrix Vector Product
 ***********************************************************************/
@@ -159,13 +308,60 @@ template<>              struct product_type_selector<Large,Large,Small>  { enum
 */
 namespace internal {

+template<typename Lhs, typename Rhs>
+struct traits<GeneralProduct<Lhs,Rhs,GemvProduct> >
+ : traits<ProductBase<GeneralProduct<Lhs,Rhs,GemvProduct>, Lhs, Rhs> >
+{};
+
 template<int Side, int StorageOrder, bool BlasCompatible>
-struct gemv_dense_sense_selector;
+struct gemv_selector;

 } // end namespace internal

+template<typename Lhs, typename Rhs>
+class GeneralProduct<Lhs, Rhs, GemvProduct>
+  : public ProductBase<GeneralProduct<Lhs,Rhs,GemvProduct>, Lhs, Rhs>
+{
+  public:
+    EIGEN_PRODUCT_PUBLIC_INTERFACE(GeneralProduct)
+
+    typedef typename Lhs::Scalar LhsScalar;
+    typedef typename Rhs::Scalar RhsScalar;
+
+    GeneralProduct(const Lhs& a_lhs, const Rhs& a_rhs) : Base(a_lhs,a_rhs)
+    {
+//       EIGEN_STATIC_ASSERT((internal::is_same<typename Lhs::Scalar, typename Rhs::Scalar>::value),
+//         YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
+    }
+
+    enum { Side = Lhs::IsVectorAtCompileTime ? OnTheLeft : OnTheRight };
+    typedef typename internal::conditional<int(Side)==OnTheRight,_LhsNested,_RhsNested>::type MatrixType;
+
+    template<typename Dest> void scaleAndAddTo(Dest& dst, const Scalar& alpha) const
+    {
+      eigen_assert(m_lhs.rows() == dst.rows() && m_rhs.cols() == dst.cols());
+      internal::gemv_selector<Side,(int(MatrixType::Flags)&RowMajorBit) ? RowMajor : ColMajor,
+                       bool(internal::blas_traits<MatrixType>::HasUsableDirectAccess)>::run(*this, dst, alpha);
+    }
+};
+
 namespace internal {

+// The vector is on the left => transposition
+template<int StorageOrder, bool BlasCompatible>
+struct gemv_selector<OnTheLeft,StorageOrder,BlasCompatible>
+{
+  template<typename ProductType, typename Dest>
+  static void run(const ProductType& prod, Dest& dest, const typename ProductType::Scalar& alpha)
+  {
+    Transpose<Dest> destT(dest);
+    enum { OtherStorageOrder = StorageOrder == RowMajor ? ColMajor : RowMajor };
+    gemv_selector<OnTheRight,OtherStorageOrder,BlasCompatible>
+      ::run(GeneralProduct<Transpose<const typename ProductType::_RhsNested>,Transpose<const typename ProductType::_LhsNested>, GemvProduct>
+        (prod.rhs().transpose(), prod.lhs().transpose()), destT, alpha);
+  }
+};
+
 template<typename Scalar,int Size,int MaxSize,bool Cond> struct gemv_static_vector_if;

 template<typename Scalar,int Size,int MaxSize>
@@ -196,71 +392,59 @@ struct gemv_static_vector_if<Scalar,Size,MaxSize,true>
  internal::plain_array<Scalar,EIGEN_SIZE_MIN_PREFER_FIXED(Size,MaxSize)+(ForceAlignment?PacketSize:0),0> m_data;
  EIGEN_STRONG_INLINE Scalar* data() {
    return ForceAlignment
-            ? reinterpret_cast<Scalar*>((reinterpret_cast<size_t>(m_data.array) & ~(size_t(EIGEN_ALIGN_BYTES-1))) + EIGEN_ALIGN_BYTES)
+            ? reinterpret_cast<Scalar*>((reinterpret_cast<size_t>(m_data.array) & ~(size_t(15))) + 16)
            : m_data.array;
  }
  #endif
 };

-// The vector is on the left => transposition
-template<int StorageOrder, bool BlasCompatible>
-struct gemv_dense_sense_selector<OnTheLeft,StorageOrder,BlasCompatible>
+template<> struct gemv_selector<OnTheRight,ColMajor,true>
 {
-  template<typename Lhs, typename Rhs, typename Dest>
-  static void run(const Lhs &lhs, const Rhs &rhs, Dest& dest, const typename Dest::Scalar& alpha)
+  template<typename ProductType, typename Dest>
+  static inline void run(const ProductType& prod, Dest& dest, const typename ProductType::Scalar& alpha)
  {
-    Transpose<Dest> destT(dest);
-    enum { OtherStorageOrder = StorageOrder == RowMajor ? ColMajor : RowMajor };
-    gemv_dense_sense_selector<OnTheRight,OtherStorageOrder,BlasCompatible>
-      ::run(rhs.transpose(), lhs.transpose(), destT, alpha);
-  }
-};
-
-template<> struct gemv_dense_sense_selector<OnTheRight,ColMajor,true>
-{
-  template<typename Lhs, typename Rhs, typename Dest>
-  static inline void run(const Lhs &lhs, const Rhs &rhs, Dest& dest, const typename Dest::Scalar& alpha)
-  {
-    typedef typename Lhs::Scalar   LhsScalar;
-    typedef typename Rhs::Scalar   RhsScalar;
-    typedef typename Dest::Scalar  ResScalar;
-    typedef typename Dest::RealScalar  RealScalar;
-    
-    typedef internal::blas_traits<Lhs> LhsBlasTraits;
-    typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
-    typedef internal::blas_traits<Rhs> RhsBlasTraits;
-    typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
-  
+    typedef typename ProductType::Index Index;
+    typedef typename ProductType::LhsScalar   LhsScalar;
+    typedef typename ProductType::RhsScalar   RhsScalar;
+    typedef typename ProductType::Scalar      ResScalar;
+    typedef typename ProductType::RealScalar  RealScalar;
+    typedef typename ProductType::ActualLhsType ActualLhsType;
+    typedef typename ProductType::ActualRhsType ActualRhsType;
+    typedef typename ProductType::LhsBlasTraits LhsBlasTraits;
+    typedef typename ProductType::RhsBlasTraits RhsBlasTraits;
    typedef Map<Matrix<ResScalar,Dynamic,1>, Aligned> MappedDest;

-    ActualLhsType actualLhs = LhsBlasTraits::extract(lhs);
-    ActualRhsType actualRhs = RhsBlasTraits::extract(rhs);
+    ActualLhsType actualLhs = LhsBlasTraits::extract(prod.lhs());
+    ActualRhsType actualRhs = RhsBlasTraits::extract(prod.rhs());

-    ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(lhs)
-                                  * RhsBlasTraits::extractScalarFactor(rhs);
+    ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs())
+                                  * RhsBlasTraits::extractScalarFactor(prod.rhs());
+
+    // make sure Dest is a compile-time vector type (bug 1166)
+    typedef typename conditional<Dest::IsVectorAtCompileTime, Dest, typename Dest::ColXpr>::type ActualDest;

    enum {
      // FIXME find a way to allow an inner stride on the result if packet_traits<Scalar>::size==1
      // on, the other hand it is good for the cache to pack the vector anyways...
-      EvalToDestAtCompileTime = Dest::InnerStrideAtCompileTime==1,
+      EvalToDestAtCompileTime = (ActualDest::InnerStrideAtCompileTime==1),
      ComplexByReal = (NumTraits<LhsScalar>::IsComplex) && (!NumTraits<RhsScalar>::IsComplex),
-      MightCannotUseDest = (Dest::InnerStrideAtCompileTime!=1) || ComplexByReal
+      MightCannotUseDest = (ActualDest::InnerStrideAtCompileTime!=1) || ComplexByReal
    };

-    gemv_static_vector_if<ResScalar,Dest::SizeAtCompileTime,Dest::MaxSizeAtCompileTime,MightCannotUseDest> static_dest;
+    gemv_static_vector_if<ResScalar,ActualDest::SizeAtCompileTime,ActualDest::MaxSizeAtCompileTime,MightCannotUseDest> static_dest;

    bool alphaIsCompatible = (!ComplexByReal) || (numext::imag(actualAlpha)==RealScalar(0));
    bool evalToDest = EvalToDestAtCompileTime && alphaIsCompatible;
-
+    
    RhsScalar compatibleAlpha = get_factor<ResScalar,RhsScalar>::run(actualAlpha);

    ei_declare_aligned_stack_constructed_variable(ResScalar,actualDestPtr,dest.size(),
                                                  evalToDest ? dest.data() : static_dest.data());
-
+    
    if(!evalToDest)
    {
      #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
-      Index size = dest.size();
+      int size = dest.size();
      EIGEN_DENSE_STORAGE_CTOR_PLUGIN
      #endif
      if(!alphaIsCompatible)
@@ -272,13 +456,11 @@ template<> struct gemv_dense_sense_selector<OnTheRight,ColMajor,true>
        MappedDest(actualDestPtr, dest.size()) = dest;
    }

-    typedef const_blas_data_mapper<LhsScalar,Index,ColMajor> LhsMapper;
-    typedef const_blas_data_mapper<RhsScalar,Index,RowMajor> RhsMapper;
    general_matrix_vector_product
-        <Index,LhsScalar,LhsMapper,ColMajor,LhsBlasTraits::NeedToConjugate,RhsScalar,RhsMapper,RhsBlasTraits::NeedToConjugate>::run(
+      <Index,LhsScalar,ColMajor,LhsBlasTraits::NeedToConjugate,RhsScalar,RhsBlasTraits::NeedToConjugate>::run(
        actualLhs.rows(), actualLhs.cols(),
-        LhsMapper(actualLhs.data(), actualLhs.outerStride()),
-        RhsMapper(actualRhs.data(), actualRhs.innerStride()),
+        actualLhs.data(), actualLhs.outerStride(),
+        actualRhs.data(), actualRhs.innerStride(),
        actualDestPtr, 1,
        compatibleAlpha);

@@ -292,34 +474,34 @@ template<> struct gemv_dense_sense_selector<OnTheRight,ColMajor,true>
  }
 };

-template<> struct gemv_dense_sense_selector<OnTheRight,RowMajor,true>
+template<> struct gemv_selector<OnTheRight,RowMajor,true>
 {
-  template<typename Lhs, typename Rhs, typename Dest>
-  static void run(const Lhs &lhs, const Rhs &rhs, Dest& dest, const typename Dest::Scalar& alpha)
+  template<typename ProductType, typename Dest>
+  static void run(const ProductType& prod, Dest& dest, const typename ProductType::Scalar& alpha)
  {
-    typedef typename Lhs::Scalar   LhsScalar;
-    typedef typename Rhs::Scalar   RhsScalar;
-    typedef typename Dest::Scalar  ResScalar;
-    
-    typedef internal::blas_traits<Lhs> LhsBlasTraits;
-    typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
-    typedef internal::blas_traits<Rhs> RhsBlasTraits;
-    typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
-    typedef typename internal::remove_all<ActualRhsType>::type ActualRhsTypeCleaned;
+    typedef typename ProductType::LhsScalar LhsScalar;
+    typedef typename ProductType::RhsScalar RhsScalar;
+    typedef typename ProductType::Scalar    ResScalar;
+    typedef typename ProductType::Index Index;
+    typedef typename ProductType::ActualLhsType ActualLhsType;
+    typedef typename ProductType::ActualRhsType ActualRhsType;
+    typedef typename ProductType::_ActualRhsType _ActualRhsType;
+    typedef typename ProductType::LhsBlasTraits LhsBlasTraits;
+    typedef typename ProductType::RhsBlasTraits RhsBlasTraits;

-    typename add_const<ActualLhsType>::type actualLhs = LhsBlasTraits::extract(lhs);
-    typename add_const<ActualRhsType>::type actualRhs = RhsBlasTraits::extract(rhs);
+    typename add_const<ActualLhsType>::type actualLhs = LhsBlasTraits::extract(prod.lhs());
+    typename add_const<ActualRhsType>::type actualRhs = RhsBlasTraits::extract(prod.rhs());

-    ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(lhs)
-                                  * RhsBlasTraits::extractScalarFactor(rhs);
+    ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs())
+                                  * RhsBlasTraits::extractScalarFactor(prod.rhs());

    enum {
      // FIXME find a way to allow an inner stride on the result if packet_traits<Scalar>::size==1
      // on, the other hand it is good for the cache to pack the vector anyways...
-      DirectlyUseRhs = ActualRhsTypeCleaned::InnerStrideAtCompileTime==1
+      DirectlyUseRhs = _ActualRhsType::InnerStrideAtCompileTime==1
    };

-    gemv_static_vector_if<RhsScalar,ActualRhsTypeCleaned::SizeAtCompileTime,ActualRhsTypeCleaned::MaxSizeAtCompileTime,!DirectlyUseRhs> static_rhs;
+    gemv_static_vector_if<RhsScalar,_ActualRhsType::SizeAtCompileTime,_ActualRhsType::MaxSizeAtCompileTime,!DirectlyUseRhs> static_rhs;

    ei_declare_aligned_stack_constructed_variable(RhsScalar,actualRhsPtr,actualRhs.size(),
        DirectlyUseRhs ? const_cast<RhsScalar*>(actualRhs.data()) : static_rhs.data());
@@ -327,45 +509,45 @@ template<> struct gemv_dense_sense_selector<OnTheRight,RowMajor,true>
    if(!DirectlyUseRhs)
    {
      #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
-      Index size = actualRhs.size();
+      int size = actualRhs.size();
      EIGEN_DENSE_STORAGE_CTOR_PLUGIN
      #endif
-      Map<typename ActualRhsTypeCleaned::PlainObject>(actualRhsPtr, actualRhs.size()) = actualRhs;
+      Map<typename _ActualRhsType::PlainObject>(actualRhsPtr, actualRhs.size()) = actualRhs;
    }

-    typedef const_blas_data_mapper<LhsScalar,Index,RowMajor> LhsMapper;
-    typedef const_blas_data_mapper<RhsScalar,Index,ColMajor> RhsMapper;
    general_matrix_vector_product
-        <Index,LhsScalar,LhsMapper,RowMajor,LhsBlasTraits::NeedToConjugate,RhsScalar,RhsMapper,RhsBlasTraits::NeedToConjugate>::run(
+      <Index,LhsScalar,RowMajor,LhsBlasTraits::NeedToConjugate,RhsScalar,RhsBlasTraits::NeedToConjugate>::run(
        actualLhs.rows(), actualLhs.cols(),
-        LhsMapper(actualLhs.data(), actualLhs.outerStride()),
-        RhsMapper(actualRhsPtr, 1),
-        dest.data(), dest.innerStride(),
+        actualLhs.data(), actualLhs.outerStride(),
+        actualRhsPtr, 1,
+        dest.data(), dest.col(0).innerStride(), //NOTE  if dest is not a vector at compile-time, then dest.innerStride() might be wrong. (bug 1166)
        actualAlpha);
  }
 };

-template<> struct gemv_dense_sense_selector<OnTheRight,ColMajor,false>
+template<> struct gemv_selector<OnTheRight,ColMajor,false>
 {
-  template<typename Lhs, typename Rhs, typename Dest>
-  static void run(const Lhs &lhs, const Rhs &rhs, Dest& dest, const typename Dest::Scalar& alpha)
+  template<typename ProductType, typename Dest>
+  static void run(const ProductType& prod, Dest& dest, const typename ProductType::Scalar& alpha)
  {
+    typedef typename Dest::Index Index;
    // TODO makes sure dest is sequentially stored in memory, otherwise use a temp
-    const Index size = rhs.rows();
+    const Index size = prod.rhs().rows();
    for(Index k=0; k<size; ++k)
-      dest += (alpha*rhs.coeff(k)) * lhs.col(k);
+      dest += (alpha*prod.rhs().coeff(k)) * prod.lhs().col(k);
  }
 };

-template<> struct gemv_dense_sense_selector<OnTheRight,RowMajor,false>
+template<> struct gemv_selector<OnTheRight,RowMajor,false>
 {
-  template<typename Lhs, typename Rhs, typename Dest>
-  static void run(const Lhs &lhs, const Rhs &rhs, Dest& dest, const typename Dest::Scalar& alpha)
+  template<typename ProductType, typename Dest>
+  static void run(const ProductType& prod, Dest& dest, const typename ProductType::Scalar& alpha)
  {
+    typedef typename Dest::Index Index;
    // TODO makes sure rhs is sequentially stored in memory, otherwise use a temp
-    const Index rows = dest.rows();
+    const Index rows = prod.rows();
    for(Index i=0; i<rows; ++i)
-      dest.coeffRef(i) += alpha * (lhs.row(i).cwiseProduct(rhs.transpose())).sum();
+      dest.coeffRef(i) += alpha * (prod.lhs().row(i).cwiseProduct(prod.rhs().transpose())).sum();
  }
 };

@@ -381,11 +563,9 @@ template<> struct gemv_dense_sense_selector<OnTheRight,RowMajor,false>
  *
  * \sa lazyProduct(), operator*=(const MatrixBase&), Cwise::operator*()
  */
-#ifndef __CUDACC__
-
 template<typename Derived>
 template<typename OtherDerived>
-inline const Product<Derived, OtherDerived>
+inline const typename ProductReturnType<Derived, OtherDerived>::Type
 MatrixBase<Derived>::operator*(const MatrixBase<OtherDerived> &other) const
 {
  // A note regarding the function declaration: In MSVC, this function will sometimes
@@ -410,12 +590,9 @@ MatrixBase<Derived>::operator*(const MatrixBase<OtherDerived> &other) const
 #ifdef EIGEN_DEBUG_PRODUCT
  internal::product_type<Derived,OtherDerived>::debug();
 #endif
-
-  return Product<Derived, OtherDerived>(derived(), other.derived());
+  return typename ProductReturnType<Derived,OtherDerived>::Type(derived(), other.derived());
 }

-#endif // __CUDACC__
-
 /** \returns an expression of the matrix product of \c *this and \a other without implicit evaluation.
  *
  * The returned product will behave like any other expressions: the coefficients of the product will be
@@ -429,7 +606,7 @@ MatrixBase<Derived>::operator*(const MatrixBase<OtherDerived> &other) const
  */
 template<typename Derived>
 template<typename OtherDerived>
-const Product<Derived,OtherDerived,LazyProduct>
+const typename LazyProductReturnType<Derived,OtherDerived>::Type
 MatrixBase<Derived>::lazyProduct(const MatrixBase<OtherDerived> &other) const
 {
  enum {
@@ -448,7 +625,7 @@ MatrixBase<Derived>::lazyProduct(const MatrixBase<OtherDerived> &other) const
    INVALID_MATRIX_PRODUCT__IF_YOU_WANTED_A_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTION)
  EIGEN_STATIC_ASSERT(ProductIsValid || SameSizes, INVALID_MATRIX_PRODUCT)

-  return Product<Derived,OtherDerived,LazyProduct>(derived(), other.derived());
+  return typename LazyProductReturnType<Derived,OtherDerived>::Type(derived(), other.derived());
 }

 } // end namespace Eigen
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@@ -42,8 +42,6 @@ namespace internal {
 struct default_packet_traits
 {
  enum {
-    HasHalfPacket = 0,
-    
    HasAdd    = 1,
    HasSub    = 1,
    HasMul    = 1,
@@ -54,7 +52,6 @@ struct default_packet_traits
    HasMax    = 1,
    HasConj   = 1,
    HasSetLinear = 1,
-    HasBlend  = 0,

    HasDiv    = 0,
    HasSqrt   = 0,
@@ -74,12 +71,10 @@ struct default_packet_traits
 template<typename T> struct packet_traits : default_packet_traits
 {
  typedef T type;
-  typedef T half;
  enum {
    Vectorizable = 0,
    size = 1,
-    AlignedOnScalar = 0,
-    HasHalfPacket = 0
+    AlignedOnScalar = 0
  };
  enum {
    HasAdd    = 0,
@@ -95,201 +90,135 @@ template<typename T> struct packet_traits : default_packet_traits
  };
 };

-template<typename T> struct packet_traits<const T> : packet_traits<T> { };
-
 /** \internal \returns a + b (coeff-wise) */
-template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+template<typename Packet> inline Packet
 padd(const Packet& a,
        const Packet& b) { return a+b; }

 /** \internal \returns a - b (coeff-wise) */
-template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+template<typename Packet> inline Packet
 psub(const Packet& a,
        const Packet& b) { return a-b; }

 /** \internal \returns -a (coeff-wise) */
-template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+template<typename Packet> inline Packet
 pnegate(const Packet& a) { return -a; }

 /** \internal \returns conj(a) (coeff-wise) */
-
-template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+template<typename Packet> inline Packet
 pconj(const Packet& a) { return numext::conj(a); }

 /** \internal \returns a * b (coeff-wise) */
-template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+template<typename Packet> inline Packet
 pmul(const Packet& a,
        const Packet& b) { return a*b; }

 /** \internal \returns a / b (coeff-wise) */
-template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+template<typename Packet> inline Packet
 pdiv(const Packet& a,
        const Packet& b) { return a/b; }

 /** \internal \returns the min of \a a and \a b  (coeff-wise) */
-template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+template<typename Packet> inline Packet
 pmin(const Packet& a,
-        const Packet& b) { return numext::mini(a, b); }
+        const Packet& b) { using std::min; return (min)(a, b); }

 /** \internal \returns the max of \a a and \a b  (coeff-wise) */
-template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+template<typename Packet> inline Packet
 pmax(const Packet& a,
-        const Packet& b) { return numext::maxi(a, b); }
+        const Packet& b) { using std::max; return (max)(a, b); }

 /** \internal \returns the absolute value of \a a */
-template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+template<typename Packet> inline Packet
 pabs(const Packet& a) { using std::abs; return abs(a); }

 /** \internal \returns the bitwise and of \a a and \a b */
-template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+template<typename Packet> inline Packet
 pand(const Packet& a, const Packet& b) { return a & b; }

 /** \internal \returns the bitwise or of \a a and \a b */
-template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+template<typename Packet> inline Packet
 por(const Packet& a, const Packet& b) { return a | b; }

 /** \internal \returns the bitwise xor of \a a and \a b */
-template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+template<typename Packet> inline Packet
 pxor(const Packet& a, const Packet& b) { return a ^ b; }

 /** \internal \returns the bitwise andnot of \a a and \a b */
-template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+template<typename Packet> inline Packet
 pandnot(const Packet& a, const Packet& b) { return a & (!b); }

 /** \internal \returns a packet version of \a *from, from must be 16 bytes aligned */
-template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+template<typename Packet> inline Packet
 pload(const typename unpacket_traits<Packet>::type* from) { return *from; }

 /** \internal \returns a packet version of \a *from, (un-aligned load) */
-template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+template<typename Packet> inline Packet
 ploadu(const typename unpacket_traits<Packet>::type* from) { return *from; }

-/** \internal \returns a packet with constant coefficients \a a, e.g.: (a,a,a,a) */
-template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pset1(const typename unpacket_traits<Packet>::type& a) { return a; }
-
-/** \internal \returns a packet with constant coefficients \a a[0], e.g.: (a[0],a[0],a[0],a[0]) */
-template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pload1(const typename unpacket_traits<Packet>::type  *a) { return pset1<Packet>(*a); }
-
 /** \internal \returns a packet with elements of \a *from duplicated.
-  * For instance, for a packet of 8 elements, 4 scalars will be read from \a *from and
-  * duplicated to form: {from[0],from[0],from[1],from[1],from[2],from[2],from[3],from[3]}
+  * For instance, for a packet of 8 elements, 4 scalar will be read from \a *from and
+  * duplicated to form: {from[0],from[0],from[1],from[1],,from[2],from[2],,from[3],from[3]}
  * Currently, this function is only used for scalar * complex products.
-  */
-template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+ */
+template<typename Packet> inline Packet
 ploaddup(const typename unpacket_traits<Packet>::type* from) { return *from; }

-/** \internal \returns a packet with elements of \a *from quadrupled.
-  * For instance, for a packet of 8 elements, 2 scalars will be read from \a *from and
-  * replicated to form: {from[0],from[0],from[0],from[0],from[1],from[1],from[1],from[1]}
-  * Currently, this function is only used in matrix products.
-  * For packet-size smaller or equal to 4, this function is equivalent to pload1 
-  */
-template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-ploadquad(const typename unpacket_traits<Packet>::type* from)
-{ return pload1<Packet>(from); }
-
-/** \internal equivalent to
-  * \code
-  * a0 = pload1(a+0);
-  * a1 = pload1(a+1);
-  * a2 = pload1(a+2);
-  * a3 = pload1(a+3);
-  * \endcode
-  * \sa pset1, pload1, ploaddup, pbroadcast2
-  */
-template<typename Packet> EIGEN_DEVICE_FUNC
-inline void pbroadcast4(const typename unpacket_traits<Packet>::type *a,
-                        Packet& a0, Packet& a1, Packet& a2, Packet& a3)
-{
-  a0 = pload1<Packet>(a+0);
-  a1 = pload1<Packet>(a+1);
-  a2 = pload1<Packet>(a+2);
-  a3 = pload1<Packet>(a+3);
-}
-
-/** \internal equivalent to
-  * \code
-  * a0 = pload1(a+0);
-  * a1 = pload1(a+1);
-  * \endcode
-  * \sa pset1, pload1, ploaddup, pbroadcast4
-  */
-template<typename Packet> EIGEN_DEVICE_FUNC
-inline void pbroadcast2(const typename unpacket_traits<Packet>::type *a,
-                        Packet& a0, Packet& a1)
-{
-  a0 = pload1<Packet>(a+0);
-  a1 = pload1<Packet>(a+1);
-}
+/** \internal \returns a packet with constant coefficients \a a, e.g.: (a,a,a,a) */
+template<typename Packet> inline Packet
+pset1(const typename unpacket_traits<Packet>::type& a) { return a; }

 /** \internal \brief Returns a packet with coefficients (a,a+1,...,a+packet_size-1). */
 template<typename Scalar> inline typename packet_traits<Scalar>::type
 plset(const Scalar& a) { return a; }

 /** \internal copy the packet \a from to \a *to, \a to must be 16 bytes aligned */
-template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstore(Scalar* to, const Packet& from)
+template<typename Scalar, typename Packet> inline void pstore(Scalar* to, const Packet& from)
 { (*to) = from; }

 /** \internal copy the packet \a from to \a *to, (un-aligned store) */
-template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstoreu(Scalar* to, const Packet& from)
-{  (*to) = from; }
-
- template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline Packet pgather(const Scalar* from, Index /*stride*/)
- { return ploadu<Packet>(from); }
-
- template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pscatter(Scalar* to, const Packet& from, Index /*stride*/)
- { pstore(to, from); }
+template<typename Scalar, typename Packet> inline void pstoreu(Scalar* to, const Packet& from)
+{ (*to) = from; }

 /** \internal tries to do cache prefetching of \a addr */
 template<typename Scalar> inline void prefetch(const Scalar* addr)
 {
-#if !EIGEN_COMP_MSVC
-  __builtin_prefetch(addr);
+#if !defined(_MSC_VER)
+__builtin_prefetch(addr);
 #endif
 }

 /** \internal \returns the first element of a packet */
-template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type pfirst(const Packet& a)
+template<typename Packet> inline typename unpacket_traits<Packet>::type pfirst(const Packet& a)
 { return a; }

 /** \internal \returns a packet where the element i contains the sum of the packet of \a vec[i] */
-template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+template<typename Packet> inline Packet
 preduxp(const Packet* vecs) { return vecs[0]; }

 /** \internal \returns the sum of the elements of \a a*/
-template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux(const Packet& a)
-{ return a; }
-
-/** \internal \returns the sum of the elements of \a a by block of 4 elements.
-  * For a packet {a0, a1, a2, a3, a4, a5, a6, a7}, it returns a half packet {a0+a4, a1+a5, a2+a6, a3+a7}
-  * For packet-size smaller or equal to 4, this boils down to a noop.
-  */
-template<typename Packet> EIGEN_DEVICE_FUNC inline
-typename conditional<(unpacket_traits<Packet>::size%8)==0,typename unpacket_traits<Packet>::half,Packet>::type
-predux4(const Packet& a)
+template<typename Packet> inline typename unpacket_traits<Packet>::type predux(const Packet& a)
 { return a; }

 /** \internal \returns the product of the elements of \a a*/
-template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_mul(const Packet& a)
+template<typename Packet> inline typename unpacket_traits<Packet>::type predux_mul(const Packet& a)
 { return a; }

 /** \internal \returns the min of the elements of \a a*/
-template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_min(const Packet& a)
+template<typename Packet> inline typename unpacket_traits<Packet>::type predux_min(const Packet& a)
 { return a; }

 /** \internal \returns the max of the elements of \a a*/
-template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_max(const Packet& a)
+template<typename Packet> inline typename unpacket_traits<Packet>::type predux_max(const Packet& a)
 { return a; }

 /** \internal \returns the reversed elements of \a a*/
-template<typename Packet> EIGEN_DEVICE_FUNC inline Packet preverse(const Packet& a)
+template<typename Packet> inline Packet preverse(const Packet& a)
 { return a; }


 /** \internal \returns \a a with real and imaginary part flipped (for complex type only) */
-template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pcplxflip(const Packet& a)
+template<typename Packet> inline Packet pcplxflip(const Packet& a)
 {
  // FIXME: uncomment the following in case we drop the internal imag and real functions.
 //   using std::imag;
@@ -321,10 +250,6 @@ Packet pasin(const Packet& a) { using std::asin; return asin(a); }
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet pacos(const Packet& a) { using std::acos; return acos(a); }

-/** \internal \returns the atan of \a a (coeff-wise) */
-template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet patan(const Packet& a) { using std::atan; return atan(a); }
-
 /** \internal \returns the exp of \a a (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet pexp(const Packet& a) { using std::exp; return exp(a); }
@@ -350,7 +275,7 @@ inline void pstore1(typename unpacket_traits<Packet>::type* to, const typename u
 }

 /** \internal \returns a * b + c (coeff-wise) */
-template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+template<typename Packet> inline Packet
 pmadd(const Packet&  a,
         const Packet&  b,
         const Packet&  c)
@@ -359,7 +284,7 @@ pmadd(const Packet&  a,
 /** \internal \returns a packet version of \a *from.
  * If LoadMode equals #Aligned, \a from must be 16 bytes aligned */
 template<typename Packet, int LoadMode>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet ploadt(const typename unpacket_traits<Packet>::type* from)
+inline Packet ploadt(const typename unpacket_traits<Packet>::type* from)
 {
  if(LoadMode == Aligned)
    return pload<Packet>(from);
@@ -370,7 +295,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet ploadt(const typename unpacket_trai
 /** \internal copy the packet \a from to \a *to.
  * If StoreMode equals #Aligned, \a to must be 16 bytes aligned */
 template<typename Scalar, typename Packet, int LoadMode>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret(Scalar* to, const Packet& from)
+inline void pstoret(Scalar* to, const Packet& from)
 {
  if(LoadMode == Aligned)
    pstore(to, from);
@@ -378,17 +303,6 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret(Scalar* to, const Packet& fro
    pstoreu(to, from);
 }

-/** \internal \returns a packet version of \a *from.
-  * Unlike ploadt, ploadt_ro takes advantage of the read-only memory path on the
-  * hardware if available to speedup the loading of data that won't be modified
-  * by the current computation.
-  */
-template<typename Packet, int LoadMode>
-inline Packet ploadt_ro(const typename unpacket_traits<Packet>::type* from)
-{
-  return ploadt<Packet, LoadMode>(from);
-}
-
 /** \internal default implementation of palign() allowing partial specialization */
 template<int Offset,typename PacketType>
 struct palign_impl
@@ -422,46 +336,15 @@ inline void palign(PacketType& first, const PacketType& second)
 * Fast complex products (GCC generates a function call which is very slow)
 ***************************************************************************/

-// Eigen+CUDA does not support complexes.
-#ifndef __CUDACC__
-
 template<> inline std::complex<float> pmul(const std::complex<float>& a, const std::complex<float>& b)
 { return std::complex<float>(real(a)*real(b) - imag(a)*imag(b), imag(a)*real(b) + real(a)*imag(b)); }

 template<> inline std::complex<double> pmul(const std::complex<double>& a, const std::complex<double>& b)
 { return std::complex<double>(real(a)*real(b) - imag(a)*imag(b), imag(a)*real(b) + real(a)*imag(b)); }

-#endif
-
-
-/***************************************************************************
- * PacketBlock, that is a collection of N packets where the number of words
- * in the packet is a multiple of N.
-***************************************************************************/
-template <typename Packet,int N=unpacket_traits<Packet>::size> struct PacketBlock {
-  Packet packet[N];
-};
-
-template<typename Packet> EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet,1>& /*kernel*/) {
-  // Nothing to do in the scalar case, i.e. a 1x1 matrix.
-}
-
-/***************************************************************************
- * Selector, i.e. vector of N boolean values used to select (i.e. blend)
- * words from 2 packets.
-***************************************************************************/
-template <size_t N> struct Selector {
-  bool select[N];
-};
-
-template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pblend(const Selector<unpacket_traits<Packet>::size>& ifPacket, const Packet& thenPacket, const Packet& elsePacket) {
-  return ifPacket.select[0] ? thenPacket : elsePacket;
-}
-
 } // end namespace internal

 } // end namespace Eigen

 #endif // EIGEN_GENERIC_PACKET_MATH_H
+
--- a/Eigen/src/Core/GlobalFunctions.h
+++ b/Eigen/src/Core/GlobalFunctions.h
@@ -15,7 +15,7 @@
  template<typename Derived> \
  inline const Eigen::CwiseUnaryOp<Eigen::internal::FUNCTOR<typename Derived::Scalar>, const Derived> \
  NAME(const Eigen::ArrayBase<Derived>& x) { \
-    return Eigen::CwiseUnaryOp<Eigen::internal::FUNCTOR<typename Derived::Scalar>, const Derived>(x.derived()); \
+    return x.derived(); \
  }

 #define EIGEN_ARRAY_DECLARE_GLOBAL_EIGEN_UNARY(NAME,FUNCTOR) \
@@ -30,7 +30,7 @@
  { \
    static inline typename NAME##_retval<ArrayBase<Derived> >::type run(const Eigen::ArrayBase<Derived>& x) \
    { \
-      return typename NAME##_retval<ArrayBase<Derived> >::type(x.derived()); \
+      return x.derived(); \
    } \
  };

@@ -45,7 +45,6 @@ namespace Eigen
  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(asin,scalar_asin_op)
  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(acos,scalar_acos_op)
  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(tan,scalar_tan_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(atan,scalar_atan_op)
  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(exp,scalar_exp_op)
  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log,scalar_log_op)
  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(abs,scalar_abs_op)
--- a/Eigen/src/Core/IO.h
+++ b/Eigen/src/Core/IO.h
@@ -49,7 +49,7 @@ std::ostream & print_matrix(std::ostream & s, const Derived& _m, const IOFormat&
  */
 struct IOFormat
 {
-  /** Default constructor, see class IOFormat for the meaning of the parameters */
+  /** Default contructor, see class IOFormat for the meaning of the parameters */
  IOFormat(int _precision = StreamPrecision, int _flags = 0,
    const std::string& _coeffSeparator = " ",
    const std::string& _rowSeparator = "\n", const std::string& _rowPrefix="", const std::string& _rowSuffix="",
@@ -57,10 +57,6 @@ struct IOFormat
  : matPrefix(_matPrefix), matSuffix(_matSuffix), rowPrefix(_rowPrefix), rowSuffix(_rowSuffix), rowSeparator(_rowSeparator),
    rowSpacer(""), coeffSeparator(_coeffSeparator), precision(_precision), flags(_flags)
  {
-    // TODO check if rowPrefix, rowSuffix or rowSeparator contains a newline
-    // don't add rowSpacer if columns are not to be aligned
-    if((flags & DontAlignCols))
-      return;
    int i = int(matSuffix.length())-1;
    while (i>=0 && matSuffix[i]!='\n')
    {
@@ -164,6 +160,7 @@ std::ostream & print_matrix(std::ostream & s, const Derived& _m, const IOFormat&
  
  typename Derived::Nested m = _m;
  typedef typename Derived::Scalar Scalar;
+  typedef typename Derived::Index Index;

  Index width = 0;

--- a/Eigen/src/Core/Inverse.h
+++ b/Eigen/src/Core/Inverse.h
@@ -1,129 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Gael Guennebaud <gael.guennebaud@inria.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_INVERSE_H
-#define EIGEN_INVERSE_H
-
-namespace Eigen { 
-
-// TODO move the general declaration in Core, and rename this file DenseInverseImpl.h, or something like this...
-
-template<typename XprType,typename StorageKind> class InverseImpl;
-
-namespace internal {
-
-template<typename XprType>
-struct traits<Inverse<XprType> >
-  : traits<typename XprType::PlainObject>
-{
-  typedef typename XprType::PlainObject PlainObject;
-  typedef traits<PlainObject> BaseTraits;
-  enum {
-    Flags = BaseTraits::Flags & RowMajorBit
-  };
-};
-
-} // end namespace internal
-
-/** \class Inverse
-  *
-  * \brief Expression of the inverse of another expression
-  *
-  * \tparam XprType the type of the expression we are taking the inverse
-  *
-  * This class represents an abstract expression of A.inverse()
-  * and most of the time this is the only way it is used.
-  *
-  */
-template<typename XprType>
-class Inverse : public InverseImpl<XprType,typename internal::traits<XprType>::StorageKind>
-{
-public:
-  typedef typename XprType::StorageIndex StorageIndex;
-  typedef typename XprType::PlainObject                       PlainObject;
-  typedef typename internal::nested<XprType>::type            XprTypeNested;
-  typedef typename internal::remove_all<XprTypeNested>::type  XprTypeNestedCleaned;
-  
-  explicit Inverse(const XprType &xpr)
-    : m_xpr(xpr)
-  {}
-  
-  EIGEN_DEVICE_FUNC Index rows() const { return m_xpr.rows(); }
-  EIGEN_DEVICE_FUNC Index cols() const { return m_xpr.cols(); }
-
-  EIGEN_DEVICE_FUNC const XprTypeNestedCleaned& nestedExpression() const { return m_xpr; }
-
-protected:
-  XprTypeNested m_xpr;
-};
-
-/** \internal
-  * Specialization of the Inverse expression for dense expressions.
-  * Direct access to the coefficients are discared.
-  * FIXME this intermediate class is probably not needed anymore.
-  */
-template<typename XprType>
-class InverseImpl<XprType,Dense>
-  : public MatrixBase<Inverse<XprType> >
-{
-  typedef Inverse<XprType> Derived;
-  
-public:
-  
-  typedef MatrixBase<Derived> Base;
-  EIGEN_DENSE_PUBLIC_INTERFACE(Derived)
-  typedef typename internal::remove_all<XprType>::type NestedExpression;
-
-private:
-  
-  Scalar coeff(Index row, Index col) const;
-  Scalar coeff(Index i) const;
-};
-
-namespace internal {
-
-/** \internal
-  * \brief Default evaluator for Inverse expression.
-  * 
-  * This default evaluator for Inverse expression simply evaluate the inverse into a temporary
-  * by a call to internal::call_assignment_no_alias.
-  * Therefore, inverse implementers only have to specialize Assignment<Dst,Inverse<...>, ...> for
-  * there own nested expression.
-  *
-  * \sa class Inverse
-  */
-template<typename ArgType>
-struct unary_evaluator<Inverse<ArgType> >
-  : public evaluator<typename Inverse<ArgType>::PlainObject>::type
-{
-  typedef Inverse<ArgType> InverseType;
-  typedef typename InverseType::PlainObject PlainObject;
-  typedef typename evaluator<PlainObject>::type Base;
-  
-  typedef evaluator<InverseType> type;
-  typedef evaluator<InverseType> nestedType;
-  
-  enum { Flags = Base::Flags | EvalBeforeNestingBit };
-
-  unary_evaluator(const InverseType& inv_xpr)
-    : m_result(inv_xpr.rows(), inv_xpr.cols())
-  {
-    ::new (static_cast<Base*>(this)) Base(m_result);
-    internal::call_assignment_no_alias(m_result, inv_xpr);
-  }
-  
-protected:
-  PlainObject m_result;
-};
-  
-} // end namespace internal
-
-} // end namespace Eigen
-
-#endif // EIGEN_INVERSE_H
--- a/Eigen/src/Core/Map.h
+++ b/Eigen/src/Core/Map.h
@@ -70,6 +70,8 @@ struct traits<Map<PlainObjectType, MapOptions, StrideType> >
  : public traits<PlainObjectType>
 {
  typedef traits<PlainObjectType> TraitsBase;
+  typedef typename PlainObjectType::Index Index;
+  typedef typename PlainObjectType::Scalar Scalar;
  enum {
    InnerStrideAtCompileTime = StrideType::InnerStrideAtCompileTime == 0
                             ? int(PlainObjectType::InnerStrideAtCompileTime)
@@ -77,9 +79,22 @@ struct traits<Map<PlainObjectType, MapOptions, StrideType> >
    OuterStrideAtCompileTime = StrideType::OuterStrideAtCompileTime == 0
                             ? int(PlainObjectType::OuterStrideAtCompileTime)
                             : int(StrideType::OuterStrideAtCompileTime),
+    HasNoInnerStride = InnerStrideAtCompileTime == 1,
+    HasNoOuterStride = StrideType::OuterStrideAtCompileTime == 0,
+    HasNoStride = HasNoInnerStride && HasNoOuterStride,
    IsAligned = bool(EIGEN_ALIGN) && ((int(MapOptions)&Aligned)==Aligned),
+    IsDynamicSize = PlainObjectType::SizeAtCompileTime==Dynamic,
+    KeepsPacketAccess = bool(HasNoInnerStride)
+                        && ( bool(IsDynamicSize)
+                           || HasNoOuterStride
+                           || ( OuterStrideAtCompileTime!=Dynamic
+                           && ((static_cast<int>(sizeof(Scalar))*OuterStrideAtCompileTime)%16)==0 ) ),
    Flags0 = TraitsBase::Flags & (~NestByRefBit),
-    Flags = is_lvalue<PlainObjectType>::value ? int(Flags0) : (int(Flags0) & ~LvalueBit)
+    Flags1 = IsAligned ? (int(Flags0) | AlignedBit) : (int(Flags0) & ~AlignedBit),
+    Flags2 = (bool(HasNoStride) || bool(PlainObjectType::IsVectorAtCompileTime))
+           ? int(Flags1) : int(Flags1 & ~LinearAccessBit),
+    Flags3 = is_lvalue<PlainObjectType>::value ? int(Flags2) : (int(Flags2) & ~LvalueBit),
+    Flags = KeepsPacketAccess ? int(Flags3) : (int(Flags3) & ~PacketAccessBit)
  };
 private:
  enum { Options }; // Expressions don't have Options
@@ -95,17 +110,19 @@ template<typename PlainObjectType, int MapOptions, typename StrideType> class Ma
    EIGEN_DENSE_PUBLIC_INTERFACE(Map)

    typedef typename Base::PointerType PointerType;
+#if EIGEN2_SUPPORT_STAGE <= STAGE30_FULL_EIGEN3_API
+    typedef const Scalar* PointerArgType;
+    inline PointerType cast_to_pointer_type(PointerArgType ptr) { return const_cast<PointerType>(ptr); }
+#else
    typedef PointerType PointerArgType;
-    EIGEN_DEVICE_FUNC
    inline PointerType cast_to_pointer_type(PointerArgType ptr) { return ptr; }
+#endif

-    EIGEN_DEVICE_FUNC
    inline Index innerStride() const
    {
      return StrideType::InnerStrideAtCompileTime != 0 ? m_stride.inner() : 1;
    }

-    EIGEN_DEVICE_FUNC
    inline Index outerStride() const
    {
      return StrideType::OuterStrideAtCompileTime != 0 ? m_stride.outer()
@@ -119,8 +136,7 @@ template<typename PlainObjectType, int MapOptions, typename StrideType> class Ma
      * \param dataPtr pointer to the array to map
      * \param a_stride optional Stride object, passing the strides.
      */
-    EIGEN_DEVICE_FUNC
-    explicit inline Map(PointerArgType dataPtr, const StrideType& a_stride = StrideType())
+    inline Map(PointerArgType dataPtr, const StrideType& a_stride = StrideType())
      : Base(cast_to_pointer_type(dataPtr)), m_stride(a_stride)
    {
      PlainObjectType::Base::_check_template_params();
@@ -132,7 +148,6 @@ template<typename PlainObjectType, int MapOptions, typename StrideType> class Ma
      * \param a_size the size of the vector expression
      * \param a_stride optional Stride object, passing the strides.
      */
-    EIGEN_DEVICE_FUNC
    inline Map(PointerArgType dataPtr, Index a_size, const StrideType& a_stride = StrideType())
      : Base(cast_to_pointer_type(dataPtr), a_size), m_stride(a_stride)
    {
@@ -146,7 +161,6 @@ template<typename PlainObjectType, int MapOptions, typename StrideType> class Ma
      * \param nbCols the number of columns of the matrix expression
      * \param a_stride optional Stride object, passing the strides.
      */
-    EIGEN_DEVICE_FUNC
    inline Map(PointerArgType dataPtr, Index nbRows, Index nbCols, const StrideType& a_stride = StrideType())
      : Base(cast_to_pointer_type(dataPtr), nbRows, nbCols), m_stride(a_stride)
    {
@@ -159,6 +173,19 @@ template<typename PlainObjectType, int MapOptions, typename StrideType> class Ma
    StrideType m_stride;
 };

+template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
+inline Array<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols>
+  ::Array(const Scalar *data)
+{
+  this->_set_noalias(Eigen::Map<const Array>(data));
+}
+
+template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
+inline Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols>
+  ::Matrix(const Scalar *data)
+{
+  this->_set_noalias(Eigen::Map<const Matrix>(data));
+}

 } // end namespace Eigen

--- a/Eigen/src/Core/MapBase.h
+++ b/Eigen/src/Core/MapBase.h
@@ -12,7 +12,7 @@
 #define EIGEN_MAPBASE_H

 #define EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS(Derived) \
-      EIGEN_STATIC_ASSERT((int(internal::evaluator<Derived>::Flags) & LinearAccessBit) || Derived::IsVectorAtCompileTime, \
+      EIGEN_STATIC_ASSERT((int(internal::traits<Derived>::Flags) & LinearAccessBit) || Derived::IsVectorAtCompileTime, \
                          YOU_ARE_TRYING_TO_USE_AN_INDEX_BASED_ACCESSOR_ON_AN_EXPRESSION_THAT_DOES_NOT_SUPPORT_THAT)

 namespace Eigen { 
@@ -37,6 +37,7 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
    };

    typedef typename internal::traits<Derived>::StorageKind StorageKind;
+    typedef typename internal::traits<Derived>::Index Index;
    typedef typename internal::traits<Derived>::Scalar Scalar;
    typedef typename internal::packet_traits<Scalar>::type PacketScalar;
    typedef typename NumTraits<Scalar>::Real RealScalar;
@@ -75,8 +76,8 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>

    typedef typename Base::CoeffReturnType CoeffReturnType;

-    EIGEN_DEVICE_FUNC inline Index rows() const { return m_rows.value(); }
-    EIGEN_DEVICE_FUNC inline Index cols() const { return m_cols.value(); }
+    inline Index rows() const { return m_rows.value(); }
+    inline Index cols() const { return m_cols.value(); }

    /** Returns a pointer to the first coefficient of the matrix or vector.
      *
@@ -84,28 +85,24 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
      *
      * \sa innerStride(), outerStride()
      */
-    EIGEN_DEVICE_FUNC inline const Scalar* data() const { return m_data; }
+    inline const Scalar* data() const { return m_data; }

-    EIGEN_DEVICE_FUNC
    inline const Scalar& coeff(Index rowId, Index colId) const
    {
      return m_data[colId * colStride() + rowId * rowStride()];
    }

-    EIGEN_DEVICE_FUNC
    inline const Scalar& coeff(Index index) const
    {
      EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS(Derived)
      return m_data[index * innerStride()];
    }

-    EIGEN_DEVICE_FUNC
    inline const Scalar& coeffRef(Index rowId, Index colId) const
    {
      return this->m_data[colId * colStride() + rowId * rowStride()];
    }

-    EIGEN_DEVICE_FUNC
    inline const Scalar& coeffRef(Index index) const
    {
      EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS(Derived)
@@ -126,14 +123,12 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
      return internal::ploadt<PacketScalar, LoadMode>(m_data + index * innerStride());
    }

-    EIGEN_DEVICE_FUNC
    explicit inline MapBase(PointerType dataPtr) : m_data(dataPtr), m_rows(RowsAtCompileTime), m_cols(ColsAtCompileTime)
    {
      EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived)
      checkSanity();
    }

-    EIGEN_DEVICE_FUNC
    inline MapBase(PointerType dataPtr, Index vecSize)
            : m_data(dataPtr),
              m_rows(RowsAtCompileTime == Dynamic ? vecSize : Index(RowsAtCompileTime)),
@@ -145,7 +140,6 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
      checkSanity();
    }

-    EIGEN_DEVICE_FUNC
    inline MapBase(PointerType dataPtr, Index nbRows, Index nbCols)
            : m_data(dataPtr), m_rows(nbRows), m_cols(nbCols)
    {
@@ -155,12 +149,19 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
      checkSanity();
    }

+    #ifdef EIGEN_MAPBASE_PLUGIN
+    #include EIGEN_MAPBASE_PLUGIN
+    #endif
+
  protected:

-    EIGEN_DEVICE_FUNC
    void checkSanity() const
    {
-      eigen_assert(EIGEN_IMPLIES(internal::traits<Derived>::IsAligned, (size_t(m_data) % EIGEN_ALIGN_BYTES) == 0) && "data is not aligned");
+      EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(internal::traits<Derived>::Flags&PacketAccessBit,
+                                        internal::inner_stride_at_compile_time<Derived>::ret==1),
+                          PACKET_ACCESS_REQUIRES_TO_HAVE_INNER_STRIDE_FIXED_TO_1);
+      eigen_assert(EIGEN_IMPLIES(internal::traits<Derived>::Flags&AlignedBit, (size_t(m_data) % 16) == 0)
+                   && "input pointer is not aligned on a 16 byte boundary");
    }

    PointerType m_data;
@@ -178,7 +179,7 @@ template<typename Derived> class MapBase<Derived, WriteAccessors>

    typedef typename Base::Scalar Scalar;
    typedef typename Base::PacketScalar PacketScalar;
-    typedef typename Base::StorageIndex StorageIndex;
+    typedef typename Base::Index Index;
    typedef typename Base::PointerType PointerType;

    using Base::derived;
@@ -199,18 +200,14 @@ template<typename Derived> class MapBase<Derived, WriteAccessors>
                    const Scalar
                  >::type ScalarWithConstIfNotLvalue;

-    EIGEN_DEVICE_FUNC
    inline const Scalar* data() const { return this->m_data; }
-    EIGEN_DEVICE_FUNC
    inline ScalarWithConstIfNotLvalue* data() { return this->m_data; } // no const-cast here so non-const-correct code will give a compile error

-    EIGEN_DEVICE_FUNC
    inline ScalarWithConstIfNotLvalue& coeffRef(Index row, Index col)
    {
      return this->m_data[col * colStride() + row * rowStride()];
    }

-    EIGEN_DEVICE_FUNC
    inline ScalarWithConstIfNotLvalue& coeffRef(Index index)
    {
      EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS(Derived)
@@ -232,11 +229,10 @@ template<typename Derived> class MapBase<Derived, WriteAccessors>
                (this->m_data + index * innerStride(), val);
    }

-    EIGEN_DEVICE_FUNC explicit inline MapBase(PointerType dataPtr) : Base(dataPtr) {}
-    EIGEN_DEVICE_FUNC inline MapBase(PointerType dataPtr, Index vecSize) : Base(dataPtr, vecSize) {}
-    EIGEN_DEVICE_FUNC inline MapBase(PointerType dataPtr, Index nbRows, Index nbCols) : Base(dataPtr, nbRows, nbCols) {}
+    explicit inline MapBase(PointerType dataPtr) : Base(dataPtr) {}
+    inline MapBase(PointerType dataPtr, Index vecSize) : Base(dataPtr, vecSize) {}
+    inline MapBase(PointerType dataPtr, Index nbRows, Index nbCols) : Base(dataPtr, nbRows, nbCols) {}

-    EIGEN_DEVICE_FUNC
    Derived& operator=(const MapBase& other)
    {
      ReadOnlyMapBase::Base::operator=(other);
--- a/Eigen/src/Core/MathFunctions.h
+++ b/Eigen/src/Core/MathFunctions.h
@@ -12,15 +12,6 @@

 namespace Eigen {

-// On WINCE, std::abs is defined for int only, so let's defined our own overloads:
-// This issue has been confirmed with MSVC 2008 only, but the issue might exist for more recent versions too.
-#if EIGEN_OS_WINCE && EIGEN_COMP_MSVC && EIGEN_COMP_MSVC<=1500
-long        abs(long        x) { return (labs(x));  }
-double      abs(double      x) { return (fabs(x));  }
-float       abs(float       x) { return (fabsf(x)); }
-long double abs(long double x) { return (fabsl(x)); }
-#endif
-  
 namespace internal {

 /** \internal \struct global_math_functions_filtering_base
@@ -71,7 +62,6 @@ template<typename Scalar, bool IsComplex = NumTraits<Scalar>::IsComplex>
 struct real_default_impl
 {
  typedef typename NumTraits<Scalar>::Real RealScalar;
-  EIGEN_DEVICE_FUNC
  static inline RealScalar run(const Scalar& x)
  {
    return x;
@@ -82,7 +72,6 @@ template<typename Scalar>
 struct real_default_impl<Scalar,true>
 {
  typedef typename NumTraits<Scalar>::Real RealScalar;
-  EIGEN_DEVICE_FUNC
  static inline RealScalar run(const Scalar& x)
  {
    using std::real;
@@ -98,6 +87,7 @@ struct real_retval
  typedef typename NumTraits<Scalar>::Real type;
 };

+
 /****************************************************************************
 * Implementation of imag                                                 *
 ****************************************************************************/
@@ -106,7 +96,6 @@ template<typename Scalar, bool IsComplex = NumTraits<Scalar>::IsComplex>
 struct imag_default_impl
 {
  typedef typename NumTraits<Scalar>::Real RealScalar;
-  EIGEN_DEVICE_FUNC
  static inline RealScalar run(const Scalar&)
  {
    return RealScalar(0);
@@ -117,7 +106,6 @@ template<typename Scalar>
 struct imag_default_impl<Scalar,true>
 {
  typedef typename NumTraits<Scalar>::Real RealScalar;
-  EIGEN_DEVICE_FUNC
  static inline RealScalar run(const Scalar& x)
  {
    using std::imag;
@@ -141,12 +129,10 @@ template<typename Scalar>
 struct real_ref_impl
 {
  typedef typename NumTraits<Scalar>::Real RealScalar;
-  EIGEN_DEVICE_FUNC
  static inline RealScalar& run(Scalar& x)
  {
    return reinterpret_cast<RealScalar*>(&x)[0];
  }
-  EIGEN_DEVICE_FUNC
  static inline const RealScalar& run(const Scalar& x)
  {
    return reinterpret_cast<const RealScalar*>(&x)[0];
@@ -167,12 +153,10 @@ template<typename Scalar, bool IsComplex>
 struct imag_ref_default_impl
 {
  typedef typename NumTraits<Scalar>::Real RealScalar;
-  EIGEN_DEVICE_FUNC
  static inline RealScalar& run(Scalar& x)
  {
    return reinterpret_cast<RealScalar*>(&x)[1];
  }
-  EIGEN_DEVICE_FUNC
  static inline const RealScalar& run(const Scalar& x)
  {
    return reinterpret_cast<RealScalar*>(&x)[1];
@@ -182,12 +166,10 @@ struct imag_ref_default_impl
 template<typename Scalar>
 struct imag_ref_default_impl<Scalar, false>
 {
-  EIGEN_DEVICE_FUNC
  static inline Scalar run(Scalar&)
  {
    return Scalar(0);
  }
-  EIGEN_DEVICE_FUNC
  static inline const Scalar run(const Scalar&)
  {
    return Scalar(0);
@@ -210,7 +192,6 @@ struct imag_ref_retval
 template<typename Scalar, bool IsComplex = NumTraits<Scalar>::IsComplex>
 struct conj_impl
 {
-  EIGEN_DEVICE_FUNC
  static inline Scalar run(const Scalar& x)
  {
    return x;
@@ -220,7 +201,6 @@ struct conj_impl
 template<typename Scalar>
 struct conj_impl<Scalar,true>
 {
-  EIGEN_DEVICE_FUNC
  static inline Scalar run(const Scalar& x)
  {
    using std::conj;
@@ -238,27 +218,36 @@ struct conj_retval
 * Implementation of abs2                                                 *
 ****************************************************************************/

-template<typename Scalar>
-struct abs2_impl
+template<typename Scalar,bool IsComplex>
+struct abs2_impl_default
 {
  typedef typename NumTraits<Scalar>::Real RealScalar;
-  EIGEN_DEVICE_FUNC
  static inline RealScalar run(const Scalar& x)
  {
    return x*x;
  }
 };

-template<typename RealScalar>
-struct abs2_impl<std::complex<RealScalar> >
+template<typename Scalar>
+struct abs2_impl_default<Scalar, true> // IsComplex
 {
-  EIGEN_DEVICE_FUNC
-  static inline RealScalar run(const std::complex<RealScalar>& x)
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  static inline RealScalar run(const Scalar& x)
  {
    return real(x)*real(x) + imag(x)*imag(x);
  }
 };

+template<typename Scalar>
+struct abs2_impl
+{
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  static inline RealScalar run(const Scalar& x)
+  {
+    return abs2_impl_default<Scalar,NumTraits<Scalar>::IsComplex>::run(x);
+  }
+};
+
 template<typename Scalar>
 struct abs2_retval
 {
@@ -273,7 +262,6 @@ template<typename Scalar, bool IsComplex>
 struct norm1_default_impl
 {
  typedef typename NumTraits<Scalar>::Real RealScalar;
-  EIGEN_DEVICE_FUNC
  static inline RealScalar run(const Scalar& x)
  {
    using std::abs;
@@ -284,7 +272,6 @@ struct norm1_default_impl
 template<typename Scalar>
 struct norm1_default_impl<Scalar, false>
 {
-  EIGEN_DEVICE_FUNC
  static inline Scalar run(const Scalar& x)
  {
    using std::abs;
@@ -311,23 +298,16 @@ struct hypot_impl
  typedef typename NumTraits<Scalar>::Real RealScalar;
  static inline RealScalar run(const Scalar& x, const Scalar& y)
  {
-    EIGEN_USING_STD_MATH(max);
-    EIGEN_USING_STD_MATH(min);
+    using std::max;
+    using std::min;
    using std::abs;
    using std::sqrt;
    RealScalar _x = abs(x);
    RealScalar _y = abs(y);
-    Scalar p, qp;
-    if(_x>_y)
-    {
-      p = _x;
-      qp = _y / p;
-    }
-    else
-    {
-      p = _y;
-      qp = _x / p;
-    }
+    RealScalar p = (max)(_x, _y);
+    if(p==RealScalar(0)) return RealScalar(0);
+    RealScalar q = (min)(_x, _y);
+    RealScalar qp = q/p;
    return p * sqrt(RealScalar(1) + qp*qp);
  }
 };
@@ -360,31 +340,42 @@ inline NewType cast(const OldType& x)
 }

 /****************************************************************************
-* Implementation of logp1                                                *
+* Implementation of atanh2                                                *
 ****************************************************************************/

-template<typename Scalar>
-struct log1p_impl
+template<typename Scalar, bool IsInteger>
+struct atanh2_default_impl
 {
-  static inline Scalar run(const Scalar& x)
+  typedef Scalar retval;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  static inline Scalar run(const Scalar& x, const Scalar& y)
  {
-    EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
-    // Let's be conservative and enable the default C++11 implementation only if we are sure it exists
-    #if (__cplusplus >= 201103L) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_CLANG || EIGEN_COMP_MSVC || EIGEN_COMP_ICC)  \
-        && (EIGEN_ARCH_i386_OR_x86_64) && (EIGEN_OS_GNULINUX || EIGEN_OS_WIN_STRICT || EIGEN_OS_MAC)
-      using std::log1p;
-      return log1p(x);
-    #else
-      typedef typename NumTraits<Scalar>::Real RealScalar;
-      using std::log;
-      Scalar x1p = RealScalar(1) + x;
-      return ( x1p == Scalar(1) ) ? x : x * ( log(x1p) / (x1p - RealScalar(1)) );
-    #endif
+    using std::abs;
+    using std::log;
+    using std::sqrt;
+    Scalar z = x / y;
+    if (y == Scalar(0) || abs(z) > sqrt(NumTraits<RealScalar>::epsilon()))
+      return RealScalar(0.5) * log((y + x) / (y - x));
+    else
+      return z + z*z*z / RealScalar(3);
  }
 };

 template<typename Scalar>
-struct log1p_retval
+struct atanh2_default_impl<Scalar, true>
+{
+  static inline Scalar run(const Scalar&, const Scalar&)
+  {
+    EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
+    return Scalar(0);
+  }
+};
+
+template<typename Scalar>
+struct atanh2_impl : atanh2_default_impl<Scalar, NumTraits<Scalar>::IsInteger> {};
+
+template<typename Scalar>
+struct atanh2_retval
 {
  typedef Scalar type;
 };
@@ -572,102 +563,74 @@ inline EIGEN_MATHFUNC_RETVAL(random, Scalar) random()
 ****************************************************************************/

 namespace numext {
-  
-template<typename T>
-EIGEN_DEVICE_FUNC
-inline T mini(const T& x, const T& y)
-{
-  EIGEN_USING_STD_MATH(min);
-  return min EIGEN_NOT_A_MACRO (x,y);
-}
-
-template<typename T>
-EIGEN_DEVICE_FUNC
-inline T maxi(const T& x, const T& y)
-{
-  EIGEN_USING_STD_MATH(max);
-  return max EIGEN_NOT_A_MACRO (x,y);
-}

 template<typename Scalar>
-EIGEN_DEVICE_FUNC
 inline EIGEN_MATHFUNC_RETVAL(real, Scalar) real(const Scalar& x)
 {
  return EIGEN_MATHFUNC_IMPL(real, Scalar)::run(x);
 }  

 template<typename Scalar>
-EIGEN_DEVICE_FUNC
 inline typename internal::add_const_on_value_type< EIGEN_MATHFUNC_RETVAL(real_ref, Scalar) >::type real_ref(const Scalar& x)
 {
  return internal::real_ref_impl<Scalar>::run(x);
 }

 template<typename Scalar>
-EIGEN_DEVICE_FUNC
 inline EIGEN_MATHFUNC_RETVAL(real_ref, Scalar) real_ref(Scalar& x)
 {
  return EIGEN_MATHFUNC_IMPL(real_ref, Scalar)::run(x);
 }

 template<typename Scalar>
-EIGEN_DEVICE_FUNC
 inline EIGEN_MATHFUNC_RETVAL(imag, Scalar) imag(const Scalar& x)
 {
  return EIGEN_MATHFUNC_IMPL(imag, Scalar)::run(x);
 }

 template<typename Scalar>
-EIGEN_DEVICE_FUNC
 inline typename internal::add_const_on_value_type< EIGEN_MATHFUNC_RETVAL(imag_ref, Scalar) >::type imag_ref(const Scalar& x)
 {
  return internal::imag_ref_impl<Scalar>::run(x);
 }

 template<typename Scalar>
-EIGEN_DEVICE_FUNC
 inline EIGEN_MATHFUNC_RETVAL(imag_ref, Scalar) imag_ref(Scalar& x)
 {
  return EIGEN_MATHFUNC_IMPL(imag_ref, Scalar)::run(x);
 }

 template<typename Scalar>
-EIGEN_DEVICE_FUNC
 inline EIGEN_MATHFUNC_RETVAL(conj, Scalar) conj(const Scalar& x)
 {
  return EIGEN_MATHFUNC_IMPL(conj, Scalar)::run(x);
 }

 template<typename Scalar>
-EIGEN_DEVICE_FUNC
 inline EIGEN_MATHFUNC_RETVAL(abs2, Scalar) abs2(const Scalar& x)
 {
  return EIGEN_MATHFUNC_IMPL(abs2, Scalar)::run(x);
 }

 template<typename Scalar>
-EIGEN_DEVICE_FUNC
 inline EIGEN_MATHFUNC_RETVAL(norm1, Scalar) norm1(const Scalar& x)
 {
  return EIGEN_MATHFUNC_IMPL(norm1, Scalar)::run(x);
 }

 template<typename Scalar>
-EIGEN_DEVICE_FUNC
 inline EIGEN_MATHFUNC_RETVAL(hypot, Scalar) hypot(const Scalar& x, const Scalar& y)
 {
  return EIGEN_MATHFUNC_IMPL(hypot, Scalar)::run(x, y);
 }

 template<typename Scalar>
-EIGEN_DEVICE_FUNC
-inline EIGEN_MATHFUNC_RETVAL(log1p, Scalar) log1p(const Scalar& x)
+inline EIGEN_MATHFUNC_RETVAL(atanh2, Scalar) atanh2(const Scalar& x, const Scalar& y)
 {
-  return EIGEN_MATHFUNC_IMPL(log1p, Scalar)::run(x);
+  return EIGEN_MATHFUNC_IMPL(atanh2, Scalar)::run(x, y);
 }

 template<typename Scalar>
-EIGEN_DEVICE_FUNC
 inline EIGEN_MATHFUNC_RETVAL(pow, Scalar) pow(const Scalar& x, const Scalar& y)
 {
  return EIGEN_MATHFUNC_IMPL(pow, Scalar)::run(x, y);
@@ -675,37 +638,11 @@ inline EIGEN_MATHFUNC_RETVAL(pow, Scalar) pow(const Scalar& x, const Scalar& y)

 // std::isfinite is non standard, so let's define our own version,
 // even though it is not very efficient.
-template<typename T>
-EIGEN_DEVICE_FUNC
-bool (isfinite)(const T& x)
+template<typename T> bool (isfinite)(const T& x)
 {
  return x<NumTraits<T>::highest() && x>NumTraits<T>::lowest();
 }

-template<typename T>
-EIGEN_DEVICE_FUNC
-bool (isfinite)(const std::complex<T>& x)
-{
-  using std::real;
-  using std::imag;
-  return isfinite(real(x)) && isfinite(imag(x));
-}
-
-// Log base 2 for 32 bits positive integers.
-// Conveniently returns 0 for x==0.
-inline int log2(int x)
-{
-  eigen_assert(x>=0);
-  unsigned int v(x);
-  static const int table[32] = { 0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30, 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31 };
-  v |= v >> 1;
-  v |= v >> 2;
-  v |= v >> 4;
-  v |= v >> 8;
-  v |= v >> 16;
-  return table[(v * 0x07C4ACDDU) >> 27];
-}
-
 } // end namespace numext

 namespace internal {
@@ -723,20 +660,18 @@ template<typename Scalar>
 struct scalar_fuzzy_default_impl<Scalar, false, false>
 {
  typedef typename NumTraits<Scalar>::Real RealScalar;
-  template<typename OtherScalar> EIGEN_DEVICE_FUNC
+  template<typename OtherScalar>
  static inline bool isMuchSmallerThan(const Scalar& x, const OtherScalar& y, const RealScalar& prec)
  {
    using std::abs;
    return abs(x) <= abs(y) * prec;
  }
-  EIGEN_DEVICE_FUNC
  static inline bool isApprox(const Scalar& x, const Scalar& y, const RealScalar& prec)
  {
-    EIGEN_USING_STD_MATH(min);
+    using std::min;
    using std::abs;
    return abs(x - y) <= (min)(abs(x), abs(y)) * prec;
  }
-  EIGEN_DEVICE_FUNC
  static inline bool isApproxOrLessThan(const Scalar& x, const Scalar& y, const RealScalar& prec)
  {
    return x <= y || isApprox(x, y, prec);
@@ -747,17 +682,15 @@ template<typename Scalar>
 struct scalar_fuzzy_default_impl<Scalar, false, true>
 {
  typedef typename NumTraits<Scalar>::Real RealScalar;
-  template<typename OtherScalar> EIGEN_DEVICE_FUNC
+  template<typename OtherScalar>
  static inline bool isMuchSmallerThan(const Scalar& x, const Scalar&, const RealScalar&)
  {
    return x == Scalar(0);
  }
-  EIGEN_DEVICE_FUNC
  static inline bool isApprox(const Scalar& x, const Scalar& y, const RealScalar&)
  {
    return x == y;
  }
-  EIGEN_DEVICE_FUNC
  static inline bool isApproxOrLessThan(const Scalar& x, const Scalar& y, const RealScalar&)
  {
    return x <= y;
@@ -775,7 +708,7 @@ struct scalar_fuzzy_default_impl<Scalar, true, false>
  }
  static inline bool isApprox(const Scalar& x, const Scalar& y, const RealScalar& prec)
  {
-    EIGEN_USING_STD_MATH(min);
+    using std::min;
    return numext::abs2(x - y) <= (min)(numext::abs2(x), numext::abs2(y)) * prec * prec;
  }
 };
@@ -783,23 +716,23 @@ struct scalar_fuzzy_default_impl<Scalar, true, false>
 template<typename Scalar>
 struct scalar_fuzzy_impl : scalar_fuzzy_default_impl<Scalar, NumTraits<Scalar>::IsComplex, NumTraits<Scalar>::IsInteger> {};

-template<typename Scalar, typename OtherScalar> EIGEN_DEVICE_FUNC
+template<typename Scalar, typename OtherScalar>
 inline bool isMuchSmallerThan(const Scalar& x, const OtherScalar& y,
-                                   typename NumTraits<Scalar>::Real precision = NumTraits<Scalar>::dummy_precision())
+                              const typename NumTraits<Scalar>::Real &precision = NumTraits<Scalar>::dummy_precision())
 {
  return scalar_fuzzy_impl<Scalar>::template isMuchSmallerThan<OtherScalar>(x, y, precision);
 }

-template<typename Scalar> EIGEN_DEVICE_FUNC
+template<typename Scalar>
 inline bool isApprox(const Scalar& x, const Scalar& y,
-                          typename NumTraits<Scalar>::Real precision = NumTraits<Scalar>::dummy_precision())
+                     const typename NumTraits<Scalar>::Real &precision = NumTraits<Scalar>::dummy_precision())
 {
  return scalar_fuzzy_impl<Scalar>::isApprox(x, y, precision);
 }

-template<typename Scalar> EIGEN_DEVICE_FUNC
+template<typename Scalar>
 inline bool isApproxOrLessThan(const Scalar& x, const Scalar& y,
-                                    typename NumTraits<Scalar>::Real precision = NumTraits<Scalar>::dummy_precision())
+                               const typename NumTraits<Scalar>::Real &precision = NumTraits<Scalar>::dummy_precision())
 {
  return scalar_fuzzy_impl<Scalar>::isApproxOrLessThan(x, y, precision);
 }
@@ -820,19 +753,17 @@ template<> struct scalar_fuzzy_impl<bool>
 {
  typedef bool RealScalar;
  
-  template<typename OtherScalar> EIGEN_DEVICE_FUNC
+  template<typename OtherScalar>
  static inline bool isMuchSmallerThan(const bool& x, const bool&, const bool&)
  {
    return !x;
  }
  
-  EIGEN_DEVICE_FUNC
  static inline bool isApprox(bool x, bool y, bool)
  {
    return x == y;
  }

-  EIGEN_DEVICE_FUNC
  static inline bool isApproxOrLessThan(const bool& x, const bool& y, const bool&)
  {
    return (!x) || y;
--- a/Eigen/src/Core/Matrix.h
+++ b/Eigen/src/Core/Matrix.h
@@ -107,7 +107,7 @@ struct traits<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >
 {
  typedef _Scalar Scalar;
  typedef Dense StorageKind;
-  typedef Eigen::Index StorageIndex;
+  typedef DenseIndex Index;
  typedef MatrixXpr XprKind;
  enum {
    RowsAtCompileTime = _Rows,
@@ -115,8 +115,7 @@ struct traits<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >
    MaxRowsAtCompileTime = _MaxRows,
    MaxColsAtCompileTime = _MaxCols,
    Flags = compute_matrix_flags<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols>::ret,
-    // FIXME, the following flag in only used to define NeedsToAlign in PlainObjectBase
-    EvaluatorFlags = compute_matrix_evaluator_flags<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols>::ret,
+    CoeffReadCost = NumTraits<Scalar>::ReadCost,
    Options = _Options,
    InnerStrideAtCompileTime = 1,
    OuterStrideAtCompileTime = (Options&RowMajor) ? ColsAtCompileTime : RowsAtCompileTime
@@ -152,7 +151,6 @@ class Matrix
      *
      * \callgraph
      */
-    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Matrix& operator=(const Matrix& other)
    {
      return Base::_set(other);
@@ -169,7 +167,6 @@ class Matrix
      * remain row-vectors and vectors remain vectors.
      */
    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Matrix& operator=(const MatrixBase<OtherDerived>& other)
    {
      return Base::_set(other);
@@ -182,14 +179,12 @@ class Matrix
      * \copydetails DenseBase::operator=(const EigenBase<OtherDerived> &other)
      */
    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Matrix& operator=(const EigenBase<OtherDerived> &other)
    {
      return Base::operator=(other);
    }

    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Matrix& operator=(const ReturnByValue<OtherDerived>& func)
    {
      return Base::operator=(func);
@@ -205,7 +200,6 @@ class Matrix
      *
      * \sa resize(Index,Index)
      */
-    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Matrix() : Base()
    {
      Base::_check_template_params();
@@ -213,8 +207,7 @@ class Matrix
    }

    // FIXME is it still needed
-    EIGEN_DEVICE_FUNC
-    explicit Matrix(internal::constructor_without_unaligned_array_assert)
+    Matrix(internal::constructor_without_unaligned_array_assert)
      : Base(internal::constructor_without_unaligned_array_assert())
    { Base::_check_template_params(); EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED }

@@ -233,65 +226,41 @@ class Matrix
    }
 #endif

-    #ifndef EIGEN_PARSED_BY_DOXYGEN
-
-    // This constructor is for both 1x1 matrices and dynamic vectors
-    template<typename T>
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE explicit Matrix(const T& x)
+    /** \brief Constructs a vector or row-vector with given dimension. \only_for_vectors
+      *
+      * Note that this is only useful for dynamic-size vectors. For fixed-size vectors,
+      * it is redundant to pass the dimension here, so it makes more sense to use the default
+      * constructor Matrix() instead.
+      */
+    EIGEN_STRONG_INLINE explicit Matrix(Index dim)
+      : Base(dim, RowsAtCompileTime == 1 ? 1 : dim, ColsAtCompileTime == 1 ? 1 : dim)
    {
      Base::_check_template_params();
-      Base::template _init1<T>(x);
+      EIGEN_STATIC_ASSERT_VECTOR_ONLY(Matrix)
+      eigen_assert(dim >= 0);
+      eigen_assert(SizeAtCompileTime == Dynamic || SizeAtCompileTime == dim);
+      EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
    }

+    #ifndef EIGEN_PARSED_BY_DOXYGEN
    template<typename T0, typename T1>
-    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Matrix(const T0& x, const T1& y)
    {
      Base::_check_template_params();
      Base::template _init2<T0,T1>(x, y);
    }
    #else
-    /** \brief Constructs a fixed-sized matrix initialized with coefficients starting at \a data */
-    EIGEN_DEVICE_FUNC
-    explicit Matrix(const Scalar *data);
-
-    /** \brief Constructs a vector or row-vector with given dimension. \only_for_vectors
-      *
-      * This is useful for dynamic-size vectors. For fixed-size vectors,
-      * it is redundant to pass these parameters, so one should use the default constructor
-      * Matrix() instead.
-      * 
-      * \warning This constructor is disabled for fixed-size \c 1x1 matrices. For instance,
-      * calling Matrix<double,1,1>(1) will call the initialization constructor: Matrix(const Scalar&).
-      * For fixed-size \c 1x1 matrices it is thefore recommended to use the default
-      * constructor Matrix() instead, especilly when using one of the non standard
-      * \c EIGEN_INITIALIZE_MATRICES_BY_{ZERO,\c NAN} macros (see \ref TopicPreprocessorDirectives).
-      */
-    EIGEN_STRONG_INLINE explicit Matrix(Index dim);
-    /** \brief Constructs an initialized 1x1 matrix with the given coefficient */
-    Matrix(const Scalar& x);
    /** \brief Constructs an uninitialized matrix with \a rows rows and \a cols columns.
      *
      * This is useful for dynamic-size matrices. For fixed-size matrices,
      * it is redundant to pass these parameters, so one should use the default constructor
-      * Matrix() instead.
-      * 
-      * \warning This constructor is disabled for fixed-size \c 1x2 and \c 2x1 vectors. For instance,
-      * calling Matrix2f(2,1) will call the initialization constructor: Matrix(const Scalar& x, const Scalar& y).
-      * For fixed-size \c 1x2 or \c 2x1 vectors it is thefore recommended to use the default
-      * constructor Matrix() instead, especilly when using one of the non standard
-      * \c EIGEN_INITIALIZE_MATRICES_BY_{ZERO,\c NAN} macros (see \ref TopicPreprocessorDirectives).
-      */
-    EIGEN_DEVICE_FUNC
+      * Matrix() instead. */
    Matrix(Index rows, Index cols);
-    
    /** \brief Constructs an initialized 2D vector with given coefficients */
    Matrix(const Scalar& x, const Scalar& y);
    #endif

    /** \brief Constructs an initialized 3D vector with given coefficients */
-    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Matrix(const Scalar& x, const Scalar& y, const Scalar& z)
    {
      Base::_check_template_params();
@@ -301,7 +270,6 @@ class Matrix
      m_storage.data()[2] = z;
    }
    /** \brief Constructs an initialized 4D vector with given coefficients */
-    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Matrix(const Scalar& x, const Scalar& y, const Scalar& z, const Scalar& w)
    {
      Base::_check_template_params();
@@ -312,10 +280,10 @@ class Matrix
      m_storage.data()[3] = w;
    }

+    explicit Matrix(const Scalar *data);

    /** \brief Constructor copying the value of the expression \a other */
    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Matrix(const MatrixBase<OtherDerived>& other)
             : Base(other.rows() * other.cols(), other.rows(), other.cols())
    {
@@ -328,7 +296,6 @@ class Matrix
      Base::_set_noalias(other);
    }
    /** \brief Copy constructor */
-    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Matrix(const Matrix& other)
            : Base(other.rows() * other.cols(), other.rows(), other.cols())
    {
@@ -337,7 +304,6 @@ class Matrix
    }
    /** \brief Copy constructor with in-place evaluation */
    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Matrix(const ReturnByValue<OtherDerived>& other)
    {
      Base::_check_template_params();
@@ -349,7 +315,6 @@ class Matrix
      * \sa MatrixBase::operator=(const EigenBase<OtherDerived>&)
      */
    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Matrix(const EigenBase<OtherDerived> &other)
      : Base(other.derived().rows() * other.derived().cols(), other.derived().rows(), other.derived().cols())
    {
@@ -360,18 +325,31 @@ class Matrix
      *this = other;
    }

-    EIGEN_DEVICE_FUNC inline Index innerStride() const { return 1; }
-    EIGEN_DEVICE_FUNC inline Index outerStride() const { return this->innerSize(); }
+    /** \internal
+      * \brief Override MatrixBase::swap() since for dynamic-sized matrices
+      * of same type it is enough to swap the data pointers.
+      */
+    template<typename OtherDerived>
+    void swap(MatrixBase<OtherDerived> const & other)
+    { this->_swap(other.derived()); }
+
+    inline Index innerStride() const { return 1; }
+    inline Index outerStride() const { return this->innerSize(); }

    /////////// Geometry module ///////////

    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
    explicit Matrix(const RotationBase<OtherDerived,ColsAtCompileTime>& r);
    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
    Matrix& operator=(const RotationBase<OtherDerived,ColsAtCompileTime>& r);

+    #ifdef EIGEN2_SUPPORT
+    template<typename OtherDerived>
+    explicit Matrix(const eigen2_RotationBase<OtherDerived,ColsAtCompileTime>& r);
+    template<typename OtherDerived>
+    Matrix& operator=(const eigen2_RotationBase<OtherDerived,ColsAtCompileTime>& r);
+    #endif
+
    // allow to extend Matrix outside Eigen
    #ifdef EIGEN_MATRIX_PLUGIN
    #include EIGEN_MATRIX_PLUGIN
--- a/Eigen/src/Core/MatrixBase.h
+++ b/Eigen/src/Core/MatrixBase.h
@@ -52,7 +52,7 @@ template<typename Derived> class MatrixBase
 #ifndef EIGEN_PARSED_BY_DOXYGEN
    typedef MatrixBase StorageBaseType;
    typedef typename internal::traits<Derived>::StorageKind StorageKind;
-    typedef typename internal::traits<Derived>::StorageIndex StorageIndex;
+    typedef typename internal::traits<Derived>::Index Index;
    typedef typename internal::traits<Derived>::Scalar Scalar;
    typedef typename internal::packet_traits<Scalar>::type PacketScalar;
    typedef typename NumTraits<Scalar>::Real RealScalar;
@@ -66,7 +66,8 @@ template<typename Derived> class MatrixBase
    using Base::MaxSizeAtCompileTime;
    using Base::IsVectorAtCompileTime;
    using Base::Flags;
-    
+    using Base::CoeffReadCost;
+
    using Base::derived;
    using Base::const_cast_derived;
    using Base::rows;
@@ -80,7 +81,6 @@ template<typename Derived> class MatrixBase
    using Base::operator-=;
    using Base::operator*=;
    using Base::operator/=;
-    using Base::operator*;

    typedef typename Base::CoeffReturnType CoeffReturnType;
    typedef typename Base::ConstTransposeReturnType ConstTransposeReturnType;
@@ -98,7 +98,6 @@ template<typename Derived> class MatrixBase

    /** \returns the size of the main diagonal, which is min(rows(),cols()).
      * \sa rows(), cols(), SizeAtCompileTime. */
-    EIGEN_DEVICE_FUNC
    inline Index diagonalSize() const { return (std::min)(rows(),cols()); }

    /** \brief The plain matrix type corresponding to this expression.
@@ -146,54 +145,37 @@ template<typename Derived> class MatrixBase
    /** Special case of the template operator=, in order to prevent the compiler
      * from generating a default operator= (issue hit with g++ 4.1)
      */
-    EIGEN_DEVICE_FUNC
    Derived& operator=(const MatrixBase& other);

    // We cannot inherit here via Base::operator= since it is causing
    // trouble with MSVC.

    template <typename OtherDerived>
-    EIGEN_DEVICE_FUNC
    Derived& operator=(const DenseBase<OtherDerived>& other);

    template <typename OtherDerived>
-    EIGEN_DEVICE_FUNC
    Derived& operator=(const EigenBase<OtherDerived>& other);

    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
    Derived& operator=(const ReturnByValue<OtherDerived>& other);

-#ifndef EIGEN_PARSED_BY_DOXYGEN
    template<typename ProductDerived, typename Lhs, typename Rhs>
-    EIGEN_DEVICE_FUNC
    Derived& lazyAssign(const ProductBase<ProductDerived, Lhs,Rhs>& other);
-#endif // not EIGEN_PARSED_BY_DOXYGEN
+
+    template<typename MatrixPower, typename Lhs, typename Rhs>
+    Derived& lazyAssign(const MatrixPowerProduct<MatrixPower, Lhs,Rhs>& other);

    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
    Derived& operator+=(const MatrixBase<OtherDerived>& other);
    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
    Derived& operator-=(const MatrixBase<OtherDerived>& other);

-#ifdef __CUDACC__
    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
-    const Product<Derived,OtherDerived,LazyProduct>
-    operator*(const MatrixBase<OtherDerived> &other) const
-    { return this->lazyProduct(other); }
-#else
-
-    template<typename OtherDerived>
-    const Product<Derived,OtherDerived>
+    const typename ProductReturnType<Derived,OtherDerived>::Type
    operator*(const MatrixBase<OtherDerived> &other) const;

-#endif
-
    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC 
-    const Product<Derived,OtherDerived,LazyProduct>
+    const typename LazyProductReturnType<Derived,OtherDerived>::Type
    lazyProduct(const MatrixBase<OtherDerived> &other) const;

    template<typename OtherDerived>
@@ -206,91 +188,84 @@ template<typename Derived> class MatrixBase
    void applyOnTheRight(const EigenBase<OtherDerived>& other);

    template<typename DiagonalDerived>
-    EIGEN_DEVICE_FUNC
-    const Product<Derived, DiagonalDerived, LazyProduct>
+    const DiagonalProduct<Derived, DiagonalDerived, OnTheRight>
    operator*(const DiagonalBase<DiagonalDerived> &diagonal) const;

    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
    typename internal::scalar_product_traits<typename internal::traits<Derived>::Scalar,typename internal::traits<OtherDerived>::Scalar>::ReturnType
    dot(const MatrixBase<OtherDerived>& other) const;

-    EIGEN_DEVICE_FUNC RealScalar squaredNorm() const;
-    EIGEN_DEVICE_FUNC RealScalar norm() const;
+    #ifdef EIGEN2_SUPPORT
+      template<typename OtherDerived>
+      Scalar eigen2_dot(const MatrixBase<OtherDerived>& other) const;
+    #endif
+
+    RealScalar squaredNorm() const;
+    RealScalar norm() const;
    RealScalar stableNorm() const;
    RealScalar blueNorm() const;
    RealScalar hypotNorm() const;
-    EIGEN_DEVICE_FUNC const PlainObject normalized() const;
-    EIGEN_DEVICE_FUNC void normalize();
+    const PlainObject normalized() const;
+    void normalize();

-    EIGEN_DEVICE_FUNC const AdjointReturnType adjoint() const;
-    EIGEN_DEVICE_FUNC void adjointInPlace();
+    const AdjointReturnType adjoint() const;
+    void adjointInPlace();

    typedef Diagonal<Derived> DiagonalReturnType;
-    EIGEN_DEVICE_FUNC
    DiagonalReturnType diagonal();
-    
    typedef typename internal::add_const<Diagonal<const Derived> >::type ConstDiagonalReturnType;
-    EIGEN_DEVICE_FUNC
    ConstDiagonalReturnType diagonal() const;

    template<int Index> struct DiagonalIndexReturnType { typedef Diagonal<Derived,Index> Type; };
    template<int Index> struct ConstDiagonalIndexReturnType { typedef const Diagonal<const Derived,Index> Type; };

-    template<int Index> 
-    EIGEN_DEVICE_FUNC
-    typename DiagonalIndexReturnType<Index>::Type diagonal();
-
-    template<int Index>
-    EIGEN_DEVICE_FUNC
-    typename ConstDiagonalIndexReturnType<Index>::Type diagonal() const;
+    template<int Index> typename DiagonalIndexReturnType<Index>::Type diagonal();
+    template<int Index> typename ConstDiagonalIndexReturnType<Index>::Type diagonal() const;
    
    typedef Diagonal<Derived,DynamicIndex> DiagonalDynamicIndexReturnType;
    typedef typename internal::add_const<Diagonal<const Derived,DynamicIndex> >::type ConstDiagonalDynamicIndexReturnType;

-    EIGEN_DEVICE_FUNC
    DiagonalDynamicIndexReturnType diagonal(Index index);
-    EIGEN_DEVICE_FUNC
    ConstDiagonalDynamicIndexReturnType diagonal(Index index) const;

+    #ifdef EIGEN2_SUPPORT
+    template<unsigned int Mode> typename internal::eigen2_part_return_type<Derived, Mode>::type part();
+    template<unsigned int Mode> const typename internal::eigen2_part_return_type<Derived, Mode>::type part() const;
+    
+    // huuuge hack. make Eigen2's matrix.part<Diagonal>() work in eigen3. Problem: Diagonal is now a class template instead
+    // of an integer constant. Solution: overload the part() method template wrt template parameters list.
+    template<template<typename T, int N> class U>
+    const DiagonalWrapper<ConstDiagonalReturnType> part() const
+    { return diagonal().asDiagonal(); }
+    #endif // EIGEN2_SUPPORT
+
    template<unsigned int Mode> struct TriangularViewReturnType { typedef TriangularView<Derived, Mode> Type; };
    template<unsigned int Mode> struct ConstTriangularViewReturnType { typedef const TriangularView<const Derived, Mode> Type; };

-    template<unsigned int Mode>
-    EIGEN_DEVICE_FUNC
-    typename TriangularViewReturnType<Mode>::Type triangularView();
-    template<unsigned int Mode>
-    EIGEN_DEVICE_FUNC
-    typename ConstTriangularViewReturnType<Mode>::Type triangularView() const;
+    template<unsigned int Mode> typename TriangularViewReturnType<Mode>::Type triangularView();
+    template<unsigned int Mode> typename ConstTriangularViewReturnType<Mode>::Type triangularView() const;

    template<unsigned int UpLo> struct SelfAdjointViewReturnType { typedef SelfAdjointView<Derived, UpLo> Type; };
    template<unsigned int UpLo> struct ConstSelfAdjointViewReturnType { typedef const SelfAdjointView<const Derived, UpLo> Type; };

-    template<unsigned int UpLo> 
-    EIGEN_DEVICE_FUNC
-    typename SelfAdjointViewReturnType<UpLo>::Type selfadjointView();
-    template<unsigned int UpLo>
-    EIGEN_DEVICE_FUNC
-    typename ConstSelfAdjointViewReturnType<UpLo>::Type selfadjointView() const;
+    template<unsigned int UpLo> typename SelfAdjointViewReturnType<UpLo>::Type selfadjointView();
+    template<unsigned int UpLo> typename ConstSelfAdjointViewReturnType<UpLo>::Type selfadjointView() const;

    const SparseView<Derived> sparseView(const Scalar& m_reference = Scalar(0),
                                         const typename NumTraits<Scalar>::Real& m_epsilon = NumTraits<Scalar>::dummy_precision()) const;
-    EIGEN_DEVICE_FUNC static const IdentityReturnType Identity();
-    EIGEN_DEVICE_FUNC static const IdentityReturnType Identity(Index rows, Index cols);
-    EIGEN_DEVICE_FUNC static const BasisReturnType Unit(Index size, Index i);
-    EIGEN_DEVICE_FUNC static const BasisReturnType Unit(Index i);
-    EIGEN_DEVICE_FUNC static const BasisReturnType UnitX();
-    EIGEN_DEVICE_FUNC static const BasisReturnType UnitY();
-    EIGEN_DEVICE_FUNC static const BasisReturnType UnitZ();
-    EIGEN_DEVICE_FUNC static const BasisReturnType UnitW();
+    static const IdentityReturnType Identity();
+    static const IdentityReturnType Identity(Index rows, Index cols);
+    static const BasisReturnType Unit(Index size, Index i);
+    static const BasisReturnType Unit(Index i);
+    static const BasisReturnType UnitX();
+    static const BasisReturnType UnitY();
+    static const BasisReturnType UnitZ();
+    static const BasisReturnType UnitW();

-    EIGEN_DEVICE_FUNC
    const DiagonalWrapper<const Derived> asDiagonal() const;
    const PermutationWrapper<const Derived> asPermutation() const;

-    EIGEN_DEVICE_FUNC
    Derived& setIdentity();
-    EIGEN_DEVICE_FUNC
    Derived& setIdentity(Index rows, Index cols);

    bool isIdentity(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
@@ -322,37 +297,50 @@ template<typename Derived> class MatrixBase

    NoAlias<Derived,Eigen::MatrixBase > noalias();

-    // TODO forceAlignedAccess is temporarily disabled
-    // Need to find a nicer workaround.
-    inline const Derived& forceAlignedAccess() const { return derived(); }
-    inline Derived& forceAlignedAccess() { return derived(); }
-    template<bool Enable> inline const Derived& forceAlignedAccessIf() const { return derived(); }
-    template<bool Enable> inline Derived& forceAlignedAccessIf() { return derived(); }
+    inline const ForceAlignedAccess<Derived> forceAlignedAccess() const;
+    inline ForceAlignedAccess<Derived> forceAlignedAccess();
+    template<bool Enable> inline typename internal::add_const_on_value_type<typename internal::conditional<Enable,ForceAlignedAccess<Derived>,Derived&>::type>::type forceAlignedAccessIf() const;
+    template<bool Enable> inline typename internal::conditional<Enable,ForceAlignedAccess<Derived>,Derived&>::type forceAlignedAccessIf();

    Scalar trace() const;

-    template<int p> EIGEN_DEVICE_FUNC RealScalar lpNorm() const;
+/////////// Array module ///////////

-    EIGEN_DEVICE_FUNC MatrixBase<Derived>& matrix() { return *this; }
-    EIGEN_DEVICE_FUNC const MatrixBase<Derived>& matrix() const { return *this; }
+    template<int p> RealScalar lpNorm() const;
+
+    MatrixBase<Derived>& matrix() { return *this; }
+    const MatrixBase<Derived>& matrix() const { return *this; }

    /** \returns an \link Eigen::ArrayBase Array \endlink expression of this matrix
      * \sa ArrayBase::matrix() */
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ArrayWrapper<Derived> array() { return ArrayWrapper<Derived>(derived()); }
-    /** \returns a const \link Eigen::ArrayBase Array \endlink expression of this matrix
-      * \sa ArrayBase::matrix() */
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const ArrayWrapper<const Derived> array() const { return ArrayWrapper<const Derived>(derived()); }
+    ArrayWrapper<Derived> array() { return derived(); }
+    const ArrayWrapper<const Derived> array() const { return derived(); }

 /////////// LU module ///////////

-    EIGEN_DEVICE_FUNC const FullPivLU<PlainObject> fullPivLu() const;
-    EIGEN_DEVICE_FUNC const PartialPivLU<PlainObject> partialPivLu() const;
+    const FullPivLU<PlainObject> fullPivLu() const;
+    const PartialPivLU<PlainObject> partialPivLu() const;

+    #if EIGEN2_SUPPORT_STAGE < STAGE20_RESOLVE_API_CONFLICTS
+    const LU<PlainObject> lu() const;
+    #endif
+
+    #ifdef EIGEN2_SUPPORT
+    const LU<PlainObject> eigen2_lu() const;
+    #endif
+
+    #if EIGEN2_SUPPORT_STAGE > STAGE20_RESOLVE_API_CONFLICTS
    const PartialPivLU<PlainObject> lu() const;
-
-    EIGEN_DEVICE_FUNC
-    const Inverse<Derived> inverse() const;
+    #endif
    
+    #ifdef EIGEN2_SUPPORT
+    template<typename ResultType>
+    void computeInverse(MatrixBase<ResultType> *result) const {
+      *result = this->inverse();
+    }
+    #endif
+
+    const internal::inverse_impl<Derived> inverse() const;
    template<typename ResultType>
    void computeInverseAndDetWithCheck(
      ResultType& inverse,
@@ -378,6 +366,10 @@ template<typename Derived> class MatrixBase
    const HouseholderQR<PlainObject> householderQr() const;
    const ColPivHouseholderQR<PlainObject> colPivHouseholderQr() const;
    const FullPivHouseholderQR<PlainObject> fullPivHouseholderQr() const;
+    
+    #ifdef EIGEN2_SUPPORT
+    const QR<PlainObject> qr() const;
+    #endif

    EigenvaluesReturnType eigenvalues() const;
    RealScalar operatorNorm() const;
@@ -385,7 +377,10 @@ template<typename Derived> class MatrixBase
 /////////// SVD module ///////////

    JacobiSVD<PlainObject> jacobiSvd(unsigned int computationOptions = 0) const;
-    BDCSVD<PlainObject>    bdcSvd(unsigned int computationOptions = 0) const;
+
+    #ifdef EIGEN2_SUPPORT
+    SVD<PlainObject> svd() const;
+    #endif

 /////////// Geometry module ///////////

@@ -397,24 +392,20 @@ template<typename Derived> class MatrixBase
    };
    #endif // EIGEN_PARSED_BY_DOXYGEN
    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
    typename cross_product_return_type<OtherDerived>::type
    cross(const MatrixBase<OtherDerived>& other) const;
-    
    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
    PlainObject cross3(const MatrixBase<OtherDerived>& other) const;
-    
-    EIGEN_DEVICE_FUNC
    PlainObject unitOrthogonal(void) const;
-    
    Matrix<Scalar,3,1> eulerAngles(Index a0, Index a1, Index a2) const;
    
+    #if EIGEN2_SUPPORT_STAGE > STAGE20_RESOLVE_API_CONFLICTS
    ScalarMultipleReturnType operator*(const UniformScaling<Scalar>& s) const;
    // put this as separate enum value to work around possible GCC 4.3 bug (?)
    enum { HomogeneousReturnTypeDirection = ColsAtCompileTime==1?Vertical:Horizontal };
    typedef Homogeneous<Derived, HomogeneousReturnTypeDirection> HomogeneousReturnType;
    HomogeneousReturnType homogeneous() const;
+    #endif
    
    enum {
      SizeMinusOne = SizeAtCompileTime==Dynamic ? Dynamic : SizeAtCompileTime-1
@@ -449,6 +440,15 @@ template<typename Derived> class MatrixBase
    template<typename OtherScalar>
    void applyOnTheRight(Index p, Index q, const JacobiRotation<OtherScalar>& j);

+///////// SparseCore module /////////
+
+    template<typename OtherDerived>
+    EIGEN_STRONG_INLINE const typename SparseMatrixBase<OtherDerived>::template CwiseProductDenseReturnType<Derived>::Type
+    cwiseProduct(const SparseMatrixBase<OtherDerived> &other) const
+    {
+      return other.cwiseProduct(derived());
+    }
+
 ///////// MatrixFunctions module /////////

    typedef typename internal::stem_function<Scalar>::type StemFunction;
@@ -461,15 +461,49 @@ template<typename Derived> class MatrixBase
    const MatrixSquareRootReturnValue<Derived> sqrt() const;
    const MatrixLogarithmReturnValue<Derived> log() const;
    const MatrixPowerReturnValue<Derived> pow(const RealScalar& p) const;
-    const MatrixComplexPowerReturnValue<Derived> pow(const std::complex<RealScalar>& p) const;
+
+#ifdef EIGEN2_SUPPORT
+    template<typename ProductDerived, typename Lhs, typename Rhs>
+    Derived& operator+=(const Flagged<ProductBase<ProductDerived, Lhs,Rhs>, 0,
+                                      EvalBeforeAssigningBit>& other);
+
+    template<typename ProductDerived, typename Lhs, typename Rhs>
+    Derived& operator-=(const Flagged<ProductBase<ProductDerived, Lhs,Rhs>, 0,
+                                      EvalBeforeAssigningBit>& other);
+
+    /** \deprecated because .lazy() is deprecated
+      * Overloaded for cache friendly product evaluation */
+    template<typename OtherDerived>
+    Derived& lazyAssign(const Flagged<OtherDerived, 0, EvalBeforeAssigningBit>& other)
+    { return lazyAssign(other._expression()); }
+
+    template<unsigned int Added>
+    const Flagged<Derived, Added, 0> marked() const;
+    const Flagged<Derived, 0, EvalBeforeAssigningBit> lazy() const;
+
+    inline const Cwise<Derived> cwise() const;
+    inline Cwise<Derived> cwise();
+
+    VectorBlock<Derived> start(Index size);
+    const VectorBlock<const Derived> start(Index size) const;
+    VectorBlock<Derived> end(Index size);
+    const VectorBlock<const Derived> end(Index size) const;
+    template<int Size> VectorBlock<Derived,Size> start();
+    template<int Size> const VectorBlock<const Derived,Size> start() const;
+    template<int Size> VectorBlock<Derived,Size> end();
+    template<int Size> const VectorBlock<const Derived,Size> end() const;
+
+    Minor<Derived> minor(Index row, Index col);
+    const Minor<Derived> minor(Index row, Index col) const;
+#endif

  protected:
-    EIGEN_DEVICE_FUNC MatrixBase() : Base() {}
+    MatrixBase() : Base() {}

  private:
-    EIGEN_DEVICE_FUNC explicit MatrixBase(int);
-    EIGEN_DEVICE_FUNC MatrixBase(int,int);
-    template<typename OtherDerived> EIGEN_DEVICE_FUNC explicit MatrixBase(const MatrixBase<OtherDerived>&);
+    explicit MatrixBase(int);
+    MatrixBase(int,int);
+    template<typename OtherDerived> explicit MatrixBase(const MatrixBase<OtherDerived>&);
  protected:
    // mixing arrays and matrices is not legal
    template<typename OtherDerived> Derived& operator+=(const ArrayBase<OtherDerived>& )
--- a/Eigen/src/Core/NestByValue.h
+++ b/Eigen/src/Core/NestByValue.h
@@ -40,29 +40,29 @@ template<typename ExpressionType> class NestByValue
    typedef typename internal::dense_xpr_base<NestByValue>::type Base;
    EIGEN_DENSE_PUBLIC_INTERFACE(NestByValue)

-    EIGEN_DEVICE_FUNC explicit inline NestByValue(const ExpressionType& matrix) : m_expression(matrix) {}
+    inline NestByValue(const ExpressionType& matrix) : m_expression(matrix) {}

-    EIGEN_DEVICE_FUNC inline Index rows() const { return m_expression.rows(); }
-    EIGEN_DEVICE_FUNC inline Index cols() const { return m_expression.cols(); }
-    EIGEN_DEVICE_FUNC inline Index outerStride() const { return m_expression.outerStride(); }
-    EIGEN_DEVICE_FUNC inline Index innerStride() const { return m_expression.innerStride(); }
+    inline Index rows() const { return m_expression.rows(); }
+    inline Index cols() const { return m_expression.cols(); }
+    inline Index outerStride() const { return m_expression.outerStride(); }
+    inline Index innerStride() const { return m_expression.innerStride(); }

-    EIGEN_DEVICE_FUNC inline const CoeffReturnType coeff(Index row, Index col) const
+    inline const CoeffReturnType coeff(Index row, Index col) const
    {
      return m_expression.coeff(row, col);
    }

-    EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index row, Index col)
+    inline Scalar& coeffRef(Index row, Index col)
    {
      return m_expression.const_cast_derived().coeffRef(row, col);
    }

-    EIGEN_DEVICE_FUNC inline const CoeffReturnType coeff(Index index) const
+    inline const CoeffReturnType coeff(Index index) const
    {
      return m_expression.coeff(index);
    }

-    EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index index)
+    inline Scalar& coeffRef(Index index)
    {
      return m_expression.const_cast_derived().coeffRef(index);
    }
@@ -91,7 +91,7 @@ template<typename ExpressionType> class NestByValue
      m_expression.const_cast_derived().template writePacket<LoadMode>(index, x);
    }

-    EIGEN_DEVICE_FUNC operator const ExpressionType&() const { return m_expression; }
+    operator const ExpressionType&() const { return m_expression; }

  protected:
    const ExpressionType m_expression;
--- a/Eigen/src/Core/NoAlias.h
+++ b/Eigen/src/Core/NoAlias.h
@@ -30,36 +30,62 @@ namespace Eigen {
 template<typename ExpressionType, template <typename> class StorageBase>
 class NoAlias
 {
-  public:
    typedef typename ExpressionType::Scalar Scalar;
-    
-    explicit NoAlias(ExpressionType& expression) : m_expression(expression) {}
-    
+  public:
+    NoAlias(ExpressionType& expression) : m_expression(expression) {}
+
+    /** Behaves like MatrixBase::lazyAssign(other)
+      * \sa MatrixBase::lazyAssign() */
    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE ExpressionType& operator=(const StorageBase<OtherDerived>& other)
-    {
-      call_assignment_no_alias(m_expression, other.derived(), internal::assign_op<Scalar>());
-      return m_expression;
-    }
-    
+    { return internal::assign_selector<ExpressionType,OtherDerived,false>::run(m_expression,other.derived()); }
+
+    /** \sa MatrixBase::operator+= */
    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE ExpressionType& operator+=(const StorageBase<OtherDerived>& other)
    {
-      call_assignment_no_alias(m_expression, other.derived(), internal::add_assign_op<Scalar>());
-      return m_expression;
-    }
-    
-    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE ExpressionType& operator-=(const StorageBase<OtherDerived>& other)
-    {
-      call_assignment_no_alias(m_expression, other.derived(), internal::sub_assign_op<Scalar>());
+      typedef SelfCwiseBinaryOp<internal::scalar_sum_op<Scalar>, ExpressionType, OtherDerived> SelfAdder;
+      SelfAdder tmp(m_expression);
+      typedef typename internal::nested<OtherDerived>::type OtherDerivedNested;
+      typedef typename internal::remove_all<OtherDerivedNested>::type _OtherDerivedNested;
+      internal::assign_selector<SelfAdder,_OtherDerivedNested,false>::run(tmp,OtherDerivedNested(other.derived()));
      return m_expression;
    }

-    EIGEN_DEVICE_FUNC
+    /** \sa MatrixBase::operator-= */
+    template<typename OtherDerived>
+    EIGEN_STRONG_INLINE ExpressionType& operator-=(const StorageBase<OtherDerived>& other)
+    {
+      typedef SelfCwiseBinaryOp<internal::scalar_difference_op<Scalar>, ExpressionType, OtherDerived> SelfAdder;
+      SelfAdder tmp(m_expression);
+      typedef typename internal::nested<OtherDerived>::type OtherDerivedNested;
+      typedef typename internal::remove_all<OtherDerivedNested>::type _OtherDerivedNested;
+      internal::assign_selector<SelfAdder,_OtherDerivedNested,false>::run(tmp,OtherDerivedNested(other.derived()));
+      return m_expression;
+    }
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+    template<typename ProductDerived, typename Lhs, typename Rhs>
+    EIGEN_STRONG_INLINE ExpressionType& operator+=(const ProductBase<ProductDerived, Lhs,Rhs>& other)
+    { other.derived().addTo(m_expression); return m_expression; }
+
+    template<typename ProductDerived, typename Lhs, typename Rhs>
+    EIGEN_STRONG_INLINE ExpressionType& operator-=(const ProductBase<ProductDerived, Lhs,Rhs>& other)
+    { other.derived().subTo(m_expression); return m_expression; }
+
+    template<typename Lhs, typename Rhs, int NestingFlags>
+    EIGEN_STRONG_INLINE ExpressionType& operator+=(const CoeffBasedProduct<Lhs,Rhs,NestingFlags>& other)
+    { return m_expression.derived() += CoeffBasedProduct<Lhs,Rhs,NestByRefBit>(other.lhs(), other.rhs()); }
+
+    template<typename Lhs, typename Rhs, int NestingFlags>
+    EIGEN_STRONG_INLINE ExpressionType& operator-=(const CoeffBasedProduct<Lhs,Rhs,NestingFlags>& other)
+    { return m_expression.derived() -= CoeffBasedProduct<Lhs,Rhs,NestByRefBit>(other.lhs(), other.rhs()); }
+    
+    template<typename OtherDerived>
+    ExpressionType& operator=(const ReturnByValue<OtherDerived>& func)
+    { return m_expression = func; }
+#endif
+
    ExpressionType& expression() const
    {
      return m_expression;
@@ -100,7 +126,7 @@ class NoAlias
 template<typename Derived>
 NoAlias<Derived,MatrixBase> MatrixBase<Derived>::noalias()
 {
-  return NoAlias<Derived, Eigen::MatrixBase >(derived());
+  return derived();
 }

 } // end namespace Eigen
--- a/Eigen/src/Core/NumTraits.h
+++ b/Eigen/src/Core/NumTraits.h
@@ -68,16 +68,7 @@ template<typename T> struct GenericNumTraits
                   >::type NonInteger;
  typedef T Nested;

-  EIGEN_DEVICE_FUNC
-  static inline Real epsilon()
-  {
-    #if defined(__CUDA_ARCH__)
-    return internal::device::numeric_limits<T>::epsilon();
-    #else
-    return std::numeric_limits<T>::epsilon();
-    #endif
-  }
-  EIGEN_DEVICE_FUNC
+  static inline Real epsilon() { return std::numeric_limits<T>::epsilon(); }
  static inline Real dummy_precision()
  {
    // make sure to override this for floating-point types
@@ -85,6 +76,13 @@ template<typename T> struct GenericNumTraits
  }
  static inline T highest() { return (std::numeric_limits<T>::max)(); }
  static inline T lowest()  { return IsInteger ? (std::numeric_limits<T>::min)() : (-(std::numeric_limits<T>::max)()); }
+  
+#ifdef EIGEN2_SUPPORT
+  enum {
+    HasFloatingPoint = !IsInteger
+  };
+  typedef NonInteger FloatingPoint;
+#endif
 };

 template<typename T> struct NumTraits : GenericNumTraits<T>
@@ -93,13 +91,11 @@ template<typename T> struct NumTraits : GenericNumTraits<T>
 template<> struct NumTraits<float>
  : GenericNumTraits<float>
 {
-  EIGEN_DEVICE_FUNC
  static inline float dummy_precision() { return 1e-5f; }
 };

 template<> struct NumTraits<double> : GenericNumTraits<double>
 {
-  EIGEN_DEVICE_FUNC
  static inline double dummy_precision() { return 1e-12; }
 };

--- a/Eigen/src/Core/PermutationMatrix.h
+++ b/Eigen/src/Core/PermutationMatrix.h
@@ -13,8 +13,7 @@

 namespace Eigen { 

-// TODO: this does not seems to be needed at all:
-// template<int RowCol,typename IndicesType,typename MatrixType, typename StorageKind> class PermutedImpl;
+template<int RowCol,typename IndicesType,typename MatrixType, typename StorageKind> class PermutedImpl;

 /** \class PermutationBase
  * \ingroup Core_Module
@@ -61,18 +60,19 @@ class PermutationBase : public EigenBase<Derived>
    typedef typename Traits::IndicesType IndicesType;
    enum {
      Flags = Traits::Flags,
+      CoeffReadCost = Traits::CoeffReadCost,
      RowsAtCompileTime = Traits::RowsAtCompileTime,
      ColsAtCompileTime = Traits::ColsAtCompileTime,
      MaxRowsAtCompileTime = Traits::MaxRowsAtCompileTime,
      MaxColsAtCompileTime = Traits::MaxColsAtCompileTime
    };
-    typedef typename Traits::StorageIndex StorageIndex;
-    typedef Matrix<StorageIndex,RowsAtCompileTime,ColsAtCompileTime,0,MaxRowsAtCompileTime,MaxColsAtCompileTime>
+    typedef typename Traits::Scalar Scalar;
+    typedef typename Traits::Index Index;
+    typedef Matrix<Scalar,RowsAtCompileTime,ColsAtCompileTime,0,MaxRowsAtCompileTime,MaxColsAtCompileTime>
            DenseMatrixType;
-    typedef PermutationMatrix<IndicesType::SizeAtCompileTime,IndicesType::MaxSizeAtCompileTime,StorageIndex>
+    typedef PermutationMatrix<IndicesType::SizeAtCompileTime,IndicesType::MaxSizeAtCompileTime,Index>
            PlainPermutationType;
    using Base::derived;
-    typedef Transpose<PermutationBase> TransposeReturnType;
    #endif

    /** Copies the other permutation into *this */
@@ -118,7 +118,7 @@ class PermutationBase : public EigenBase<Derived>
    void evalTo(MatrixBase<DenseDerived>& other) const
    {
      other.setZero();
-      for (Index i=0; i<rows(); ++i)
+      for (int i=0; i<rows();++i)
        other.coeffRef(indices().coeff(i),i) = typename DenseDerived::Scalar(1);
    }
    #endif
@@ -147,8 +147,7 @@ class PermutationBase : public EigenBase<Derived>
    /** Sets *this to be the identity permutation matrix */
    void setIdentity()
    {
-      StorageIndex n = StorageIndex(size());
-      for(StorageIndex i = 0; i < n; ++i)
+      for(Index i = 0; i < size(); ++i)
        indices().coeffRef(i) = i;
    }

@@ -164,18 +163,18 @@ class PermutationBase : public EigenBase<Derived>
      *
      * \returns a reference to *this.
      *
-      * \warning This is much slower than applyTranspositionOnTheRight(Index,Index):
+      * \warning This is much slower than applyTranspositionOnTheRight(int,int):
      * this has linear complexity and requires a lot of branching.
      *
-      * \sa applyTranspositionOnTheRight(Index,Index)
+      * \sa applyTranspositionOnTheRight(int,int)
      */
    Derived& applyTranspositionOnTheLeft(Index i, Index j)
    {
      eigen_assert(i>=0 && j>=0 && i<size() && j<size());
      for(Index k = 0; k < size(); ++k)
      {
-        if(indices().coeff(k) == i) indices().coeffRef(k) = StorageIndex(j);
-        else if(indices().coeff(k) == j) indices().coeffRef(k) = StorageIndex(i);
+        if(indices().coeff(k) == i) indices().coeffRef(k) = j;
+        else if(indices().coeff(k) == j) indices().coeffRef(k) = i;
      }
      return derived();
    }
@@ -186,7 +185,7 @@ class PermutationBase : public EigenBase<Derived>
      *
      * This is a fast operation, it only consists in swapping two indices.
      *
-      * \sa applyTranspositionOnTheLeft(Index,Index)
+      * \sa applyTranspositionOnTheLeft(int,int)
      */
    Derived& applyTranspositionOnTheRight(Index i, Index j)
    {
@@ -199,14 +198,14 @@ class PermutationBase : public EigenBase<Derived>
      *
      * \note \note_try_to_help_rvo
      */
-    inline TransposeReturnType inverse() const
-    { return TransposeReturnType(derived()); }
+    inline Transpose<PermutationBase> inverse() const
+    { return derived(); }
    /** \returns the tranpose permutation matrix.
      *
      * \note \note_try_to_help_rvo
      */
-    inline TransposeReturnType transpose() const
-    { return TransposeReturnType(derived()); }
+    inline Transpose<PermutationBase> transpose() const
+    { return derived(); }

    /**** multiplication helpers to hopefully get RVO ****/

@@ -216,13 +215,13 @@ class PermutationBase : public EigenBase<Derived>
    template<typename OtherDerived>
    void assignTranspose(const PermutationBase<OtherDerived>& other)
    {
-      for (Index i=0; i<rows();++i) indices().coeffRef(other.indices().coeff(i)) = i;
+      for (int i=0; i<rows();++i) indices().coeffRef(other.indices().coeff(i)) = i;
    }
    template<typename Lhs,typename Rhs>
    void assignProduct(const Lhs& lhs, const Rhs& rhs)
    {
      eigen_assert(lhs.cols() == rhs.rows());
-      for (Index i=0; i<rows();++i) indices().coeffRef(i) = lhs.indices().coeff(rhs.indices().coeff(i));
+      for (int i=0; i<rows();++i) indices().coeffRef(i) = lhs.indices().coeff(rhs.indices().coeff(i));
    }
 #endif

@@ -292,7 +291,7 @@ class PermutationBase : public EigenBase<Derived>
  *
  * \param SizeAtCompileTime the number of rows/cols, or Dynamic
  * \param MaxSizeAtCompileTime the maximum number of rows/cols, or Dynamic. This optional parameter defaults to SizeAtCompileTime. Most of the time, you should not have to specify it.
-  * \param StorageIndex the integer type of the indices
+  * \param IndexType the interger type of the indices
  *
  * This class represents a permutation matrix, internally stored as a vector of integers.
  *
@@ -300,28 +299,24 @@ class PermutationBase : public EigenBase<Derived>
  */

 namespace internal {
-template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename _StorageIndex>
-struct traits<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, _StorageIndex> >
- : traits<Matrix<_StorageIndex,SizeAtCompileTime,SizeAtCompileTime,0,MaxSizeAtCompileTime,MaxSizeAtCompileTime> >
+template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename IndexType>
+struct traits<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, IndexType> >
+ : traits<Matrix<IndexType,SizeAtCompileTime,SizeAtCompileTime,0,MaxSizeAtCompileTime,MaxSizeAtCompileTime> >
 {
-  typedef PermutationStorage StorageKind;
-  typedef Matrix<_StorageIndex, SizeAtCompileTime, 1, 0, MaxSizeAtCompileTime, 1> IndicesType;
-  typedef _StorageIndex StorageIndex;
+  typedef IndexType Index;
+  typedef Matrix<IndexType, SizeAtCompileTime, 1, 0, MaxSizeAtCompileTime, 1> IndicesType;
 };
 }

-template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename _StorageIndex>
-class PermutationMatrix : public PermutationBase<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, _StorageIndex> >
+template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename IndexType>
+class PermutationMatrix : public PermutationBase<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, IndexType> >
 {
    typedef PermutationBase<PermutationMatrix> Base;
    typedef internal::traits<PermutationMatrix> Traits;
  public:

-    typedef const PermutationMatrix& Nested;
-
    #ifndef EIGEN_PARSED_BY_DOXYGEN
    typedef typename Traits::IndicesType IndicesType;
-    typedef typename Traits::StorageIndex StorageIndex;
    #endif

    inline PermutationMatrix()
@@ -329,10 +324,8 @@ class PermutationMatrix : public PermutationBase<PermutationMatrix<SizeAtCompile

    /** Constructs an uninitialized permutation matrix of given size.
      */
-    explicit inline PermutationMatrix(Index size) : m_indices(size)
-    {
-      eigen_internal_assert(size <= NumTraits<StorageIndex>::highest());
-    }
+    inline PermutationMatrix(int size) : m_indices(size)
+    {}

    /** Copy constructor. */
    template<typename OtherDerived>
@@ -403,10 +396,7 @@ class PermutationMatrix : public PermutationBase<PermutationMatrix<SizeAtCompile
    PermutationMatrix(const Transpose<PermutationBase<Other> >& other)
      : m_indices(other.nestedPermutation().size())
    {
-      eigen_internal_assert(m_indices.size() <= NumTraits<StorageIndex>::highest());
-      StorageIndex end = StorageIndex(m_indices.size());
-      for (StorageIndex i=0; i<end;++i)
-        m_indices.coeffRef(other.nestedPermutation().indices().coeff(i)) = i;
+      for (int i=0; i<m_indices.size();++i) m_indices.coeffRef(other.nestedPermutation().indices().coeff(i)) = i;
    }
    template<typename Lhs,typename Rhs>
    PermutationMatrix(internal::PermPermProduct_t, const Lhs& lhs, const Rhs& rhs)
@@ -423,19 +413,18 @@ class PermutationMatrix : public PermutationBase<PermutationMatrix<SizeAtCompile


 namespace internal {
-template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename _StorageIndex, int _PacketAccess>
-struct traits<Map<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, _StorageIndex>,_PacketAccess> >
- : traits<Matrix<_StorageIndex,SizeAtCompileTime,SizeAtCompileTime,0,MaxSizeAtCompileTime,MaxSizeAtCompileTime> >
+template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename IndexType, int _PacketAccess>
+struct traits<Map<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, IndexType>,_PacketAccess> >
+ : traits<Matrix<IndexType,SizeAtCompileTime,SizeAtCompileTime,0,MaxSizeAtCompileTime,MaxSizeAtCompileTime> >
 {
-  typedef PermutationStorage StorageKind;
-  typedef Map<const Matrix<_StorageIndex, SizeAtCompileTime, 1, 0, MaxSizeAtCompileTime, 1>, _PacketAccess> IndicesType;
-  typedef _StorageIndex StorageIndex;
+  typedef IndexType Index;
+  typedef Map<const Matrix<IndexType, SizeAtCompileTime, 1, 0, MaxSizeAtCompileTime, 1>, _PacketAccess> IndicesType;
 };
 }

-template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename _StorageIndex, int _PacketAccess>
-class Map<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, _StorageIndex>,_PacketAccess>
-  : public PermutationBase<Map<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, _StorageIndex>,_PacketAccess> >
+template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename IndexType, int _PacketAccess>
+class Map<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, IndexType>,_PacketAccess>
+  : public PermutationBase<Map<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, IndexType>,_PacketAccess> >
 {
    typedef PermutationBase<Map> Base;
    typedef internal::traits<Map> Traits;
@@ -443,14 +432,14 @@ class Map<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, _StorageInd

    #ifndef EIGEN_PARSED_BY_DOXYGEN
    typedef typename Traits::IndicesType IndicesType;
-    typedef typename IndicesType::Scalar StorageIndex;
+    typedef typename IndicesType::Scalar Index;
    #endif

-    inline Map(const StorageIndex* indicesPtr)
+    inline Map(const Index* indicesPtr)
      : m_indices(indicesPtr)
    {}

-    inline Map(const StorageIndex* indicesPtr, Index size)
+    inline Map(const Index* indicesPtr, Index size)
      : m_indices(indicesPtr,size)
    {}

@@ -497,6 +486,8 @@ class Map<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, _StorageInd
  * \sa class PermutationBase, class PermutationMatrix
  */

+struct PermutationStorage {};
+
 template<typename _IndicesType> class TranspositionsWrapper;
 namespace internal {
 template<typename _IndicesType>
@@ -504,14 +495,15 @@ struct traits<PermutationWrapper<_IndicesType> >
 {
  typedef PermutationStorage StorageKind;
  typedef typename _IndicesType::Scalar Scalar;
-  typedef typename _IndicesType::Scalar StorageIndex;
+  typedef typename _IndicesType::Scalar Index;
  typedef _IndicesType IndicesType;
  enum {
    RowsAtCompileTime = _IndicesType::SizeAtCompileTime,
    ColsAtCompileTime = _IndicesType::SizeAtCompileTime,
-    MaxRowsAtCompileTime = IndicesType::MaxSizeAtCompileTime,
-    MaxColsAtCompileTime = IndicesType::MaxSizeAtCompileTime,
-    Flags = 0
+    MaxRowsAtCompileTime = IndicesType::MaxRowsAtCompileTime,
+    MaxColsAtCompileTime = IndicesType::MaxColsAtCompileTime,
+    Flags = 0,
+    CoeffReadCost = _IndicesType::CoeffReadCost
  };
 };
 }
@@ -540,39 +532,35 @@ class PermutationWrapper : public PermutationBase<PermutationWrapper<_IndicesTyp
    typename IndicesType::Nested m_indices;
 };

-
-// TODO: Do we need to define these operator* functions? Would it be better to have them inherited
-// from MatrixBase?
-
 /** \returns the matrix with the permutation applied to the columns.
  */
-template<typename MatrixDerived, typename PermutationDerived>
-EIGEN_DEVICE_FUNC
-const Product<MatrixDerived, PermutationDerived, DefaultProduct>
-operator*(const MatrixBase<MatrixDerived> &matrix,
-          const PermutationBase<PermutationDerived>& permutation)
+template<typename Derived, typename PermutationDerived>
+inline const internal::permut_matrix_product_retval<PermutationDerived, Derived, OnTheRight>
+operator*(const MatrixBase<Derived>& matrix,
+          const PermutationBase<PermutationDerived> &permutation)
 {
-  return Product<MatrixDerived, PermutationDerived, DefaultProduct>
-            (matrix.derived(), permutation.derived());
+  return internal::permut_matrix_product_retval
+           <PermutationDerived, Derived, OnTheRight>
+           (permutation.derived(), matrix.derived());
 }

 /** \returns the matrix with the permutation applied to the rows.
  */
-template<typename PermutationDerived, typename MatrixDerived>
-EIGEN_DEVICE_FUNC
-const Product<PermutationDerived, MatrixDerived, DefaultProduct>
+template<typename Derived, typename PermutationDerived>
+inline const internal::permut_matrix_product_retval
+               <PermutationDerived, Derived, OnTheLeft>
 operator*(const PermutationBase<PermutationDerived> &permutation,
-          const MatrixBase<MatrixDerived>& matrix)
+          const MatrixBase<Derived>& matrix)
 {
-  return Product<PermutationDerived, MatrixDerived, DefaultProduct>
-            (permutation.derived(), matrix.derived());
+  return internal::permut_matrix_product_retval
+           <PermutationDerived, Derived, OnTheLeft>
+           (permutation.derived(), matrix.derived());
 }

 namespace internal {

 template<typename PermutationType, typename MatrixType, int Side, bool Transposed>
 struct traits<permut_matrix_product_retval<PermutationType, MatrixType, Side, Transposed> >
-  : traits<typename MatrixType::PlainObject>
 {
  typedef typename MatrixType::PlainObject ReturnType;
 };
@@ -582,7 +570,7 @@ struct permut_matrix_product_retval
 : public ReturnByValue<permut_matrix_product_retval<PermutationType, MatrixType, Side, Transposed> >
 {
    typedef typename remove_all<typename MatrixType::Nested>::type MatrixTypeNestedCleaned;
-    typedef typename MatrixType::StorageIndex StorageIndex;
+    typedef typename MatrixType::Index Index;

    permut_matrix_product_retval(const PermutationType& perm, const MatrixType& matrix)
      : m_permutation(perm), m_matrix(matrix)
@@ -596,7 +584,11 @@ struct permut_matrix_product_retval
      const Index n = Side==OnTheLeft ? rows() : cols();
      // FIXME we need an is_same for expression that is not sensitive to constness. For instance
      // is_same_xpr<Block<const Matrix>, Block<Matrix> >::value should be true.
-      if(is_same<MatrixTypeNestedCleaned,Dest>::value && extract_data(dst) == extract_data(m_matrix))
+      const typename Dest::Scalar *dst_data = internal::extract_data(dst);
+      if(    is_same<MatrixTypeNestedCleaned,Dest>::value
+          && blas_traits<MatrixTypeNestedCleaned>::HasUsableDirectAccess
+          && blas_traits<Dest>::HasUsableDirectAccess
+          && dst_data!=0 && dst_data == extract_data(m_matrix))
      {
        // apply the permutation inplace
        Matrix<bool,PermutationType::RowsAtCompileTime,1,0,PermutationType::MaxRowsAtCompileTime> mask(m_permutation.size());
@@ -625,7 +617,7 @@ struct permut_matrix_product_retval
      }
      else
      {
-        for(Index i = 0; i < n; ++i)
+        for(int i = 0; i < n; ++i)
        {
          Block<Dest, Side==OnTheLeft ? 1 : Dest::RowsAtCompileTime, Side==OnTheRight ? 1 : Dest::ColsAtCompileTime>
               (dst, ((Side==OnTheLeft) ^ Transposed) ? m_permutation.indices().coeff(i) : i)
@@ -652,8 +644,6 @@ struct traits<Transpose<PermutationBase<Derived> > >

 } // end namespace internal

-// TODO: the specificties should be handled by the evaluator,
-// at the very least we should only specialize TransposeImpl
 template<typename Derived>
 class Transpose<PermutationBase<Derived> >
  : public EigenBase<Transpose<PermutationBase<Derived> > >
@@ -668,26 +658,26 @@ class Transpose<PermutationBase<Derived> >
    typedef typename Derived::DenseMatrixType DenseMatrixType;
    enum {
      Flags = Traits::Flags,
+      CoeffReadCost = Traits::CoeffReadCost,
      RowsAtCompileTime = Traits::RowsAtCompileTime,
      ColsAtCompileTime = Traits::ColsAtCompileTime,
      MaxRowsAtCompileTime = Traits::MaxRowsAtCompileTime,
      MaxColsAtCompileTime = Traits::MaxColsAtCompileTime
    };
    typedef typename Traits::Scalar Scalar;
-    typedef typename Traits::StorageIndex StorageIndex;
    #endif

    Transpose(const PermutationType& p) : m_permutation(p) {}

-    inline Index rows() const { return m_permutation.rows(); }
-    inline Index cols() const { return m_permutation.cols(); }
+    inline int rows() const { return m_permutation.rows(); }
+    inline int cols() const { return m_permutation.cols(); }

    #ifndef EIGEN_PARSED_BY_DOXYGEN
    template<typename DenseDerived>
    void evalTo(MatrixBase<DenseDerived>& other) const
    {
      other.setZero();
-      for (Index i=0; i<rows();++i)
+      for (int i=0; i<rows();++i)
        other.coeffRef(i, m_permutation.indices().coeff(i)) = typename DenseDerived::Scalar(1);
    }
    #endif
@@ -700,19 +690,19 @@ class Transpose<PermutationBase<Derived> >
    /** \returns the matrix with the inverse permutation applied to the columns.
      */
    template<typename OtherDerived> friend
-    const Product<OtherDerived, Transpose, DefaultProduct>
+    inline const internal::permut_matrix_product_retval<PermutationType, OtherDerived, OnTheRight, true>
    operator*(const MatrixBase<OtherDerived>& matrix, const Transpose& trPerm)
    {
-      return Product<OtherDerived, Transpose, DefaultProduct>(matrix.derived(), trPerm.derived());
+      return internal::permut_matrix_product_retval<PermutationType, OtherDerived, OnTheRight, true>(trPerm.m_permutation, matrix.derived());
    }

    /** \returns the matrix with the inverse permutation applied to the rows.
      */
    template<typename OtherDerived>
-    const Product<Transpose, OtherDerived, DefaultProduct>
+    inline const internal::permut_matrix_product_retval<PermutationType, OtherDerived, OnTheLeft, true>
    operator*(const MatrixBase<OtherDerived>& matrix) const
    {
-      return Product<Transpose, OtherDerived, DefaultProduct>(*this, matrix.derived());
+      return internal::permut_matrix_product_retval<PermutationType, OtherDerived, OnTheLeft, true>(m_permutation, matrix.derived());
    }

    const PermutationType& nestedPermutation() const { return m_permutation; }
@@ -727,38 +717,6 @@ const PermutationWrapper<const Derived> MatrixBase<Derived>::asPermutation() con
  return derived();
 }

-namespace internal {
-  
-// TODO currently a permutation matrix expression has the form PermutationMatrix or PermutationWrapper
-//      or their transpose; in the future shape should be defined by the expression traits
-template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename IndexType>
-struct evaluator_traits<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, IndexType> >
-{
-  typedef typename storage_kind_to_evaluator_kind<Dense>::Kind Kind;
-  typedef PermutationShape Shape;
-  static const int AssumeAliasing = 0;
-};
-
-template<typename IndicesType>
-struct evaluator_traits<PermutationWrapper<IndicesType> >
-{
-  typedef typename storage_kind_to_evaluator_kind<Dense>::Kind Kind;
-  typedef PermutationShape Shape;
-  static const int AssumeAliasing = 0;
-};
-
-template<typename Derived>
-struct evaluator_traits<Transpose<PermutationBase<Derived> > >
-{
-  typedef typename storage_kind_to_evaluator_kind<Dense>::Kind Kind;
-  typedef PermutationShape Shape;
-  static const int AssumeAliasing = 0;
-};
-
-template<> struct AssignmentKind<DenseShape,PermutationShape> { typedef EigenBase2EigenBase Kind; };
-
-} // end namespace internal
-
 } // end namespace Eigen

 #endif // EIGEN_PERMUTATIONMATRIX_H
--- a/Eigen/src/Core/PlainObjectBase.h
+++ b/Eigen/src/Core/PlainObjectBase.h
@@ -28,7 +28,6 @@ namespace internal {

 template<int MaxSizeAtCompileTime> struct check_rows_cols_for_overflow {
  template<typename Index>
-  EIGEN_DEVICE_FUNC
  static EIGEN_ALWAYS_INLINE void run(Index, Index)
  {
  }
@@ -36,7 +35,6 @@ template<int MaxSizeAtCompileTime> struct check_rows_cols_for_overflow {

 template<> struct check_rows_cols_for_overflow<Dynamic> {
  template<typename Index>
-  EIGEN_DEVICE_FUNC
  static EIGEN_ALWAYS_INLINE void run(Index rows, Index cols)
  {
    // http://hg.mozilla.org/mozilla-central/file/6c8a909977d3/xpcom/ds/CheckedInt.h#l242
@@ -95,6 +93,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
    typedef typename internal::dense_xpr_base<Derived>::type Base;

    typedef typename internal::traits<Derived>::StorageKind StorageKind;
+    typedef typename internal::traits<Derived>::Index Index;
    typedef typename internal::traits<Derived>::Scalar Scalar;
    typedef typename internal::packet_traits<Scalar>::type PacketScalar;
    typedef typename NumTraits<Scalar>::Real RealScalar;
@@ -127,20 +126,15 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
    DenseStorage<Scalar, Base::MaxSizeAtCompileTime, Base::RowsAtCompileTime, Base::ColsAtCompileTime, Options> m_storage;

  public:
-    enum { NeedsToAlign = SizeAtCompileTime != Dynamic && (internal::traits<Derived>::EvaluatorFlags & AlignedBit) != 0 };
+    enum { NeedsToAlign = SizeAtCompileTime != Dynamic && (internal::traits<Derived>::Flags & AlignedBit) != 0 };
    EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(NeedsToAlign)

-    EIGEN_DEVICE_FUNC
    Base& base() { return *static_cast<Base*>(this); }
-    EIGEN_DEVICE_FUNC
    const Base& base() const { return *static_cast<const Base*>(this); }

-    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Index rows() const { return m_storage.rows(); }
-    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Index cols() const { return m_storage.cols(); }

-    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE const Scalar& coeff(Index rowId, Index colId) const
    {
      if(Flags & RowMajorBit)
@@ -149,13 +143,11 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
        return m_storage.data()[rowId + colId * m_storage.rows()];
    }

-    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE const Scalar& coeff(Index index) const
    {
      return m_storage.data()[index];
    }

-    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Scalar& coeffRef(Index rowId, Index colId)
    {
      if(Flags & RowMajorBit)
@@ -164,13 +156,11 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
        return m_storage.data()[rowId + colId * m_storage.rows()];
    }

-    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Scalar& coeffRef(Index index)
    {
      return m_storage.data()[index];
    }

-    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE const Scalar& coeffRef(Index rowId, Index colId) const
    {
      if(Flags & RowMajorBit)
@@ -179,7 +169,6 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
        return m_storage.data()[rowId + colId * m_storage.rows()];
    }

-    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE const Scalar& coeffRef(Index index) const
    {
      return m_storage.data()[index];
@@ -220,11 +209,11 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
    }

    /** \returns a const pointer to the data array of this matrix */
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar *data() const
+    EIGEN_STRONG_INLINE const Scalar *data() const
    { return m_storage.data(); }

    /** \returns a pointer to the data array of this matrix */
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar *data()
+    EIGEN_STRONG_INLINE Scalar *data()
    { return m_storage.data(); }

    /** Resizes \c *this to a \a rows x \a cols matrix.
@@ -243,7 +232,6 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
      *
      * \sa resize(Index) for vectors, resize(NoChange_t, Index), resize(Index, NoChange_t)
      */
-    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE void resize(Index nbRows, Index nbCols)
    {
      eigen_assert(   EIGEN_IMPLIES(RowsAtCompileTime!=Dynamic,nbRows==RowsAtCompileTime)
@@ -274,7 +262,6 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
      *
      * \sa resize(Index,Index), resize(NoChange_t, Index), resize(Index, NoChange_t)
      */
-    EIGEN_DEVICE_FUNC
    inline void resize(Index size)
    {
      EIGEN_STATIC_ASSERT_VECTOR_ONLY(PlainObjectBase)
@@ -299,7 +286,6 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
      *
      * \sa resize(Index,Index)
      */
-    EIGEN_DEVICE_FUNC
    inline void resize(NoChange_t, Index nbCols)
    {
      resize(rows(), nbCols);
@@ -313,7 +299,6 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
      *
      * \sa resize(Index,Index)
      */
-    EIGEN_DEVICE_FUNC
    inline void resize(Index nbRows, NoChange_t)
    {
      resize(nbRows, cols());
@@ -327,12 +312,11 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
      * remain row-vectors and vectors remain vectors.
      */
    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC 
    EIGEN_STRONG_INLINE void resizeLike(const EigenBase<OtherDerived>& _other)
    {
      const OtherDerived& other = _other.derived();
-      internal::check_rows_cols_for_overflow<MaxSizeAtCompileTime>::run(other.rows(), other.cols());
-      const Index othersize = other.rows()*other.cols();
+      internal::check_rows_cols_for_overflow<MaxSizeAtCompileTime>::run(Index(other.rows()), Index(other.cols()));
+      const Index othersize = Index(other.rows())*Index(other.cols());
      if(RowsAtCompileTime == 1)
      {
        eigen_assert(other.rows() == 1 || other.cols() == 1);
@@ -355,7 +339,6 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
      * Matrices are resized relative to the top-left element. In case values need to be 
      * appended to the matrix they will be uninitialized.
      */
-    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE void conservativeResize(Index nbRows, Index nbCols)
    {
      internal::conservative_resize_like_impl<Derived>::run(*this, nbRows, nbCols);
@@ -368,7 +351,6 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
      *
      * In case the matrix is growing, new rows will be uninitialized.
      */
-    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE void conservativeResize(Index nbRows, NoChange_t)
    {
      // Note: see the comment in conservativeResize(Index,Index)
@@ -382,7 +364,6 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
      *
      * In case the matrix is growing, new columns will be uninitialized.
      */
-    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE void conservativeResize(NoChange_t, Index nbCols)
    {
      // Note: see the comment in conservativeResize(Index,Index)
@@ -397,7 +378,6 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
      *
      * When values are appended, they will be uninitialized.
      */
-    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE void conservativeResize(Index size)
    {
      internal::conservative_resize_like_impl<Derived>::run(*this, size);
@@ -413,7 +393,6 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
      * appended to the matrix they will copied from \c other.
      */
    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE void conservativeResizeLike(const DenseBase<OtherDerived>& other)
    {
      internal::conservative_resize_like_impl<Derived,OtherDerived>::run(*this, other);
@@ -422,7 +401,6 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
    /** This is a special case of the templated operator=. Its purpose is to
      * prevent a default operator= from hiding the templated operator=.
      */
-    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Derived& operator=(const PlainObjectBase& other)
    {
      return _set(other);
@@ -430,7 +408,6 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type

    /** \sa MatrixBase::lazyAssign() */
    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Derived& lazyAssign(const DenseBase<OtherDerived>& other)
    {
      _resize_to_match(other);
@@ -438,14 +415,12 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
    }

    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Derived& operator=(const ReturnByValue<OtherDerived>& func)
    {
      resize(func.rows(), func.cols());
      return Base::operator=(func);
    }

-    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE PlainObjectBase() : m_storage()
    {
 //       _check_template_params();
@@ -455,8 +430,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
 #ifndef EIGEN_PARSED_BY_DOXYGEN
    // FIXME is it still needed ?
    /** \internal */
-    EIGEN_DEVICE_FUNC
-    explicit PlainObjectBase(internal::constructor_without_unaligned_array_assert)
+    PlainObjectBase(internal::constructor_without_unaligned_array_assert)
      : m_storage(internal::constructor_without_unaligned_array_assert())
    {
 //       _check_template_params(); EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
@@ -464,13 +438,11 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
 #endif

 #ifdef EIGEN_HAVE_RVALUE_REFERENCES
-    EIGEN_DEVICE_FUNC
    PlainObjectBase(PlainObjectBase&& other)
      : m_storage( std::move(other.m_storage) )
    {
    }

-    EIGEN_DEVICE_FUNC
    PlainObjectBase& operator=(PlainObjectBase&& other)
    {
      using std::swap;
@@ -479,7 +451,22 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
    }
 #endif

-    EIGEN_DEVICE_FUNC
+    /** Copy constructor */
+    EIGEN_STRONG_INLINE PlainObjectBase(const PlainObjectBase& other)
+      : m_storage()
+    {
+      _check_template_params();
+      lazyAssign(other);
+    }
+
+    template<typename OtherDerived>
+    EIGEN_STRONG_INLINE PlainObjectBase(const DenseBase<OtherDerived> &other)
+      : m_storage()
+    {
+      _check_template_params();
+      lazyAssign(other);
+    }
+
    EIGEN_STRONG_INLINE PlainObjectBase(Index a_size, Index nbRows, Index nbCols)
      : m_storage(a_size, nbRows, nbCols)
    {
@@ -490,7 +477,6 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
    /** \copydoc MatrixBase::operator=(const EigenBase<OtherDerived>&)
      */
    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC 
    EIGEN_STRONG_INLINE Derived& operator=(const EigenBase<OtherDerived> &other)
    {
      _resize_to_match(other);
@@ -500,9 +486,8 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type

    /** \sa MatrixBase::operator=(const EigenBase<OtherDerived>&) */
    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC 
    EIGEN_STRONG_INLINE PlainObjectBase(const EigenBase<OtherDerived> &other)
-      : m_storage(other.derived().rows() * other.derived().cols(), other.derived().rows(), other.derived().cols())
+      : m_storage(Index(other.derived().rows()) * Index(other.derived().cols()), other.derived().rows(), other.derived().cols())
    {
      _check_template_params();
      internal::check_rows_cols_for_overflow<MaxSizeAtCompileTime>::run(other.derived().rows(), other.derived().cols());
@@ -583,16 +568,16 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
    //@}

    using Base::setConstant;
-    EIGEN_DEVICE_FUNC Derived& setConstant(Index size, const Scalar& value);
-    EIGEN_DEVICE_FUNC Derived& setConstant(Index rows, Index cols, const Scalar& value);
+    Derived& setConstant(Index size, const Scalar& value);
+    Derived& setConstant(Index rows, Index cols, const Scalar& value);

    using Base::setZero;
-    EIGEN_DEVICE_FUNC Derived& setZero(Index size);
-    EIGEN_DEVICE_FUNC Derived& setZero(Index rows, Index cols);
+    Derived& setZero(Index size);
+    Derived& setZero(Index rows, Index cols);

    using Base::setOnes;
-    EIGEN_DEVICE_FUNC Derived& setOnes(Index size);
-    EIGEN_DEVICE_FUNC Derived& setOnes(Index rows, Index cols);
+    Derived& setOnes(Index size);
+    Derived& setOnes(Index rows, Index cols);

    using Base::setRandom;
    Derived& setRandom(Index size);
@@ -611,7 +596,6 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
      * remain row-vectors and vectors remain vectors.
      */
    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC 
    EIGEN_STRONG_INLINE void _resize_to_match(const EigenBase<OtherDerived>& other)
    {
      #ifdef EIGEN_NO_AUTOMATIC_RESIZING
@@ -619,6 +603,8 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
                 : (rows() == other.rows() && cols() == other.cols())))
        && "Size mismatch. Automatic resizing is disabled because EIGEN_NO_AUTOMATIC_RESIZING is defined");
      EIGEN_ONLY_USED_FOR_DEBUG(other);
+      if(this->size()==0)
+        resizeLike(other);
      #else
      resizeLike(other);
      #endif
@@ -638,23 +624,25 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
      *
      * \internal
      */
-    // aliasing is dealt once in internall::call_assignment
-    // so at this stage we have to assume aliasing... and resising has to be done later.
    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC 
    EIGEN_STRONG_INLINE Derived& _set(const DenseBase<OtherDerived>& other)
    {
-      internal::call_assignment(this->derived(), other.derived());
+      _set_selector(other.derived(), typename internal::conditional<static_cast<bool>(int(OtherDerived::Flags) & EvalBeforeAssigningBit), internal::true_type, internal::false_type>::type());
      return this->derived();
    }

+    template<typename OtherDerived>
+    EIGEN_STRONG_INLINE void _set_selector(const OtherDerived& other, const internal::true_type&) { _set_noalias(other.eval()); }
+
+    template<typename OtherDerived>
+    EIGEN_STRONG_INLINE void _set_selector(const OtherDerived& other, const internal::false_type&) { _set_noalias(other); }
+
    /** \internal Like _set() but additionally makes the assumption that no aliasing effect can happen (which
      * is the case when creating a new matrix) so one can enforce lazy evaluation.
      *
      * \sa operator=(const MatrixBase<OtherDerived>&), _set()
      */
    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC 
    EIGEN_STRONG_INLINE Derived& _set_noalias(const DenseBase<OtherDerived>& other)
    {
      // I don't think we need this resize call since the lazyAssign will anyways resize
@@ -662,12 +650,10 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
      //_resize_to_match(other);
      // the 'false' below means to enforce lazy evaluation. We don't use lazyAssign() because
      // it wouldn't allow to copy a row-vector into a column-vector.
-      internal::call_assignment_no_alias(this->derived(), other.derived(), internal::assign_op<Scalar>());
-      return this->derived();
+      return internal::assign_selector<Derived,OtherDerived,false>::run(this->derived(), other.derived());
    }

    template<typename T0, typename T1>
-    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE void _init2(Index nbRows, Index nbCols, typename internal::enable_if<Base::SizeAtCompileTime!=2,T0>::type* = 0)
    {
      EIGEN_STATIC_ASSERT(bool(NumTraits<T0>::IsInteger) &&
@@ -675,153 +661,29 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
                          FLOATING_POINT_ARGUMENT_PASSED__INTEGER_WAS_EXPECTED)
      resize(nbRows,nbCols);
    }
-    
    template<typename T0, typename T1>
-    EIGEN_DEVICE_FUNC 
    EIGEN_STRONG_INLINE void _init2(const Scalar& val0, const Scalar& val1, typename internal::enable_if<Base::SizeAtCompileTime==2,T0>::type* = 0)
    {
      EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(PlainObjectBase, 2)
      m_storage.data()[0] = val0;
      m_storage.data()[1] = val1;
    }
-    
-    template<typename T0, typename T1>
-    EIGEN_DEVICE_FUNC 
-    EIGEN_STRONG_INLINE void _init2(const Index& val0, const Index& val1,
-                                    typename internal::enable_if<    (!internal::is_same<Index,Scalar>::value)
-                                                                  && (internal::is_same<T0,Index>::value)
-                                                                  && (internal::is_same<T1,Index>::value)
-                                                                  && Base::SizeAtCompileTime==2,T1>::type* = 0)
-    {
-      EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(PlainObjectBase, 2)
-      m_storage.data()[0] = Scalar(val0);
-      m_storage.data()[1] = Scalar(val1);
-    }

-    // The argument is convertible to the Index type and we either have a non 1x1 Matrix, or a dynamic-sized Array,
-    // then the argument is meant to be the size of the object.
-    template<typename T>
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE void _init1(Index size, typename internal::enable_if<    (Base::SizeAtCompileTime!=1 || !internal::is_convertible<T, Scalar>::value)
-                                                                              && ((!internal::is_same<typename internal::traits<Derived>::XprKind,ArrayXpr>::value || Base::SizeAtCompileTime==Dynamic)),T>::type* = 0)
-    {
-      // NOTE MSVC 2008 complains if we directly put bool(NumTraits<T>::IsInteger) as the EIGEN_STATIC_ASSERT argument.
-      const bool is_integer = NumTraits<T>::IsInteger;
-      EIGEN_STATIC_ASSERT(is_integer,
-                          FLOATING_POINT_ARGUMENT_PASSED__INTEGER_WAS_EXPECTED)
-      resize(size);
-    }
-    
-    // We have a 1x1 matrix/array => the argument is interpreted as the value of the unique coefficient (case where scalar type can be implicitely converted)
-    template<typename T>
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE void _init1(const Scalar& val0, typename internal::enable_if<Base::SizeAtCompileTime==1 && internal::is_convertible<T, Scalar>::value,T>::type* = 0)
-    {
-      EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(PlainObjectBase, 1)
-      m_storage.data()[0] = val0;
-    }
-    
-    // We have a 1x1 matrix/array => the argument is interpreted as the value of the unique coefficient (case where scalar type match the index type)
-    template<typename T>
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE void _init1(const Index& val0,
-                                    typename internal::enable_if<    (!internal::is_same<Index,Scalar>::value)
-                                                                  && (internal::is_same<Index,T>::value)
-                                                                  && Base::SizeAtCompileTime==1
-                                                                  && internal::is_convertible<T, Scalar>::value,T*>::type* = 0)
-    {
-      EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(PlainObjectBase, 1)
-      m_storage.data()[0] = Scalar(val0);
-    }
-
-    // Initialize a fixed size matrix from a pointer to raw data
-    template<typename T>
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE void _init1(const Scalar* data){
-      this->_set_noalias(ConstMapType(data));
-    }
-
-    // Initialize an arbitrary matrix from a dense expression
-    template<typename T, typename OtherDerived>
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE void _init1(const DenseBase<OtherDerived>& other){
-      this->_set_noalias(other);
-    }
-
-    // Initialize an arbitrary matrix from a generic Eigen expression
-    template<typename T, typename OtherDerived>
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE void _init1(const EigenBase<OtherDerived>& other){
-      this->derived() = other;
-    }
-
-    template<typename T, typename OtherDerived>
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE void _init1(const ReturnByValue<OtherDerived>& other)
-    {
-      resize(other.rows(), other.cols());
-      other.evalTo(this->derived());
-    }
-
-    template<typename T, typename OtherDerived, int ColsAtCompileTime>
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE void _init1(const RotationBase<OtherDerived,ColsAtCompileTime>& r)
-    {
-      this->derived() = r;
-    }
-    
-    // For fixed -size arrays:
-    template<typename T>
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE void _init1(const Scalar& val0,
-                                    typename internal::enable_if<    Base::SizeAtCompileTime!=Dynamic
-                                                                  && Base::SizeAtCompileTime!=1
-                                                                  && internal::is_convertible<T, Scalar>::value
-                                                                  && internal::is_same<typename internal::traits<Derived>::XprKind,ArrayXpr>::value,T>::type* = 0)
-    {
-      Base::setConstant(val0);
-    }
-    
-    template<typename T>
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE void _init1(const Index& val0,
-                                    typename internal::enable_if<    (!internal::is_same<Index,Scalar>::value)
-                                                                  && (internal::is_same<Index,T>::value)
-                                                                  && Base::SizeAtCompileTime!=Dynamic
-                                                                  && Base::SizeAtCompileTime!=1
-                                                                  && internal::is_convertible<T, Scalar>::value
-                                                                  && internal::is_same<typename internal::traits<Derived>::XprKind,ArrayXpr>::value,T*>::type* = 0)
-    {
-      Base::setConstant(val0);
-    }
-    
    template<typename MatrixTypeA, typename MatrixTypeB, bool SwapPointers>
    friend struct internal::matrix_swap_impl;

-  public:
-    
-#ifndef EIGEN_PARSED_BY_DOXYGEN
-    /** \internal
-      * \brief Override DenseBase::swap() since for dynamic-sized matrices
-      * of same type it is enough to swap the data pointers.
+    /** \internal generic implementation of swap for dense storage since for dynamic-sized matrices of same type it is enough to swap the
+      * data pointers.
      */
    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
-    void swap(DenseBase<OtherDerived> & other)
+    void _swap(DenseBase<OtherDerived> const & other)
    {
      enum { SwapPointers = internal::is_same<Derived, OtherDerived>::value && Base::SizeAtCompileTime==Dynamic };
-      internal::matrix_swap_impl<Derived, OtherDerived, bool(SwapPointers)>::run(this->derived(), other.derived());
+      internal::matrix_swap_impl<Derived, OtherDerived, bool(SwapPointers)>::run(this->derived(), other.const_cast_derived());
    }
-    
-    /** \internal
-      * \brief const version forwarded to DenseBase::swap
-      */
-    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
-    void swap(DenseBase<OtherDerived> const & other)
-    { Base::swap(other.derived()); }
-    
-    EIGEN_DEVICE_FUNC 
+
+  public:
+#ifndef EIGEN_PARSED_BY_DOXYGEN
    static EIGEN_STRONG_INLINE void _check_template_params()
    {
      EIGEN_STATIC_ASSERT((EIGEN_IMPLIES(MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1, (Options&RowMajor)==RowMajor)
@@ -835,9 +697,10 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
                        && (Options & (DontAlign|RowMajor)) == Options),
        INVALID_MATRIX_TEMPLATE_PARAMETERS)
    }
-
-    enum { IsPlainObjectBase = 1 };
 #endif
+
+private:
+    enum { ThisConstantIsPrivateInPlainObjectBase };
 };

 namespace internal {
@@ -845,6 +708,7 @@ namespace internal {
 template <typename Derived, typename OtherDerived, bool IsVector>
 struct conservative_resize_like_impl
 {
+  typedef typename Derived::Index Index;
  static void run(DenseBase<Derived>& _this, Index rows, Index cols)
  {
    if (_this.rows() == rows && _this.cols() == cols) return;
@@ -910,6 +774,7 @@ struct conservative_resize_like_impl<Derived,OtherDerived,true>
 {
  using conservative_resize_like_impl<Derived,OtherDerived,false>::run;
  
+  typedef typename Derived::Index Index;
  static void run(DenseBase<Derived>& _this, Index size)
  {
    const Index new_rows = Derived::RowsAtCompileTime==1 ? 1 : size;
@@ -935,7 +800,6 @@ struct conservative_resize_like_impl<Derived,OtherDerived,true>
 template<typename MatrixTypeA, typename MatrixTypeB, bool SwapPointers>
 struct matrix_swap_impl
 {
-  EIGEN_DEVICE_FUNC
  static inline void run(MatrixTypeA& a, MatrixTypeB& b)
  {
    a.base().swap(b);
@@ -945,7 +809,6 @@ struct matrix_swap_impl
 template<typename MatrixTypeA, typename MatrixTypeB>
 struct matrix_swap_impl<MatrixTypeA, MatrixTypeB, true>
 {
-  EIGEN_DEVICE_FUNC
  static inline void run(MatrixTypeA& a, MatrixTypeB& b)
  {
    static_cast<typename MatrixTypeA::Base&>(a).m_storage.swap(static_cast<typename MatrixTypeB::Base&>(b).m_storage);
--- a/Eigen/src/Core/Product.h
+++ b/Eigen/src/Core/Product.h
@@ -1,232 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008-2011 Gael Guennebaud <gael.guennebaud@inria.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_PRODUCT_H
-#define EIGEN_PRODUCT_H
-
-namespace Eigen {
-
-template<typename Lhs, typename Rhs, int Option, typename StorageKind> class ProductImpl;
-
-/** \class Product
-  * \ingroup Core_Module
-  *
-  * \brief Expression of the product of two arbitrary matrices or vectors
-  *
-  * \param Lhs the type of the left-hand side expression
-  * \param Rhs the type of the right-hand side expression
-  *
-  * This class represents an expression of the product of two arbitrary matrices.
-  * 
-  * The other template parameters are:
-  * \tparam Option     can be DefaultProduct or LazyProduct
-  *
-  */
-
-
-namespace internal {
-
-// Determine the scalar of Product<Lhs, Rhs>. This is normally the same as Lhs::Scalar times
-// Rhs::Scalar, but product with permutation matrices inherit the scalar of the other factor.
-template<typename Lhs, typename Rhs, typename LhsShape = typename evaluator_traits<Lhs>::Shape, 
-         typename RhsShape = typename evaluator_traits<Rhs>::Shape >
-struct product_result_scalar
-{
-  typedef typename scalar_product_traits<typename Lhs::Scalar, typename Rhs::Scalar>::ReturnType Scalar;
-};
-
-template<typename Lhs, typename Rhs, typename RhsShape>
-struct product_result_scalar<Lhs, Rhs, PermutationShape, RhsShape>
-{
-  typedef typename Rhs::Scalar Scalar;
-};
-
-template<typename Lhs, typename Rhs, typename LhsShape>
-  struct product_result_scalar<Lhs, Rhs, LhsShape, PermutationShape>
-{
-  typedef typename Lhs::Scalar Scalar;
-};
-
-template<typename Lhs, typename Rhs, int Option>
-struct traits<Product<Lhs, Rhs, Option> >
-{
-  typedef typename remove_all<Lhs>::type LhsCleaned;
-  typedef typename remove_all<Rhs>::type RhsCleaned;
-  typedef traits<LhsCleaned> LhsTraits;
-  typedef traits<RhsCleaned> RhsTraits;
-  
-  typedef MatrixXpr XprKind;
-  
-  typedef typename product_result_scalar<LhsCleaned,RhsCleaned>::Scalar Scalar;
-  typedef typename product_promote_storage_type<typename LhsTraits::StorageKind,
-                                                typename RhsTraits::StorageKind,
-                                                internal::product_type<Lhs,Rhs>::ret>::ret StorageKind;
-  typedef typename promote_index_type<typename LhsTraits::StorageIndex,
-                                      typename RhsTraits::StorageIndex>::type StorageIndex;
-  
-  enum {
-    RowsAtCompileTime    = LhsTraits::RowsAtCompileTime,
-    ColsAtCompileTime    = RhsTraits::ColsAtCompileTime,
-    MaxRowsAtCompileTime = LhsTraits::MaxRowsAtCompileTime,
-    MaxColsAtCompileTime = RhsTraits::MaxColsAtCompileTime,
-    
-    // FIXME: only needed by GeneralMatrixMatrixTriangular
-    InnerSize = EIGEN_SIZE_MIN_PREFER_FIXED(LhsTraits::ColsAtCompileTime, RhsTraits::RowsAtCompileTime),
-    
-    // The storage order is somewhat arbitrary here. The correct one will be determined through the evaluator.
-    Flags = (   (MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1)
-             || ((LhsTraits::Flags&NoPreferredStorageOrderBit) && (RhsTraits::Flags&RowMajorBit))
-             || ((RhsTraits::Flags&NoPreferredStorageOrderBit) && (LhsTraits::Flags&RowMajorBit)) )
-          ? RowMajorBit : (MaxColsAtCompileTime==1 ? 0 : NoPreferredStorageOrderBit)
-  };
-};
-
-} // end namespace internal
-
-
-template<typename _Lhs, typename _Rhs, int Option>
-class Product : public ProductImpl<_Lhs,_Rhs,Option,
-                                   typename internal::product_promote_storage_type<typename internal::traits<_Lhs>::StorageKind,
-                                                                                   typename internal::traits<_Rhs>::StorageKind,
-                                                                                   internal::product_type<_Lhs,_Rhs>::ret>::ret>
-{
-  public:
-    
-    typedef _Lhs Lhs;
-    typedef _Rhs Rhs;
-    
-    typedef typename ProductImpl<
-        Lhs, Rhs, Option,
-        typename internal::product_promote_storage_type<typename internal::traits<Lhs>::StorageKind,
-                                                        typename internal::traits<Rhs>::StorageKind,
-                                                        internal::product_type<Lhs,Rhs>::ret>::ret>::Base Base;
-    EIGEN_GENERIC_PUBLIC_INTERFACE(Product)
-
-    typedef typename internal::nested<Lhs>::type LhsNested;
-    typedef typename internal::nested<Rhs>::type RhsNested;
-    typedef typename internal::remove_all<LhsNested>::type LhsNestedCleaned;
-    typedef typename internal::remove_all<RhsNested>::type RhsNestedCleaned;
-
-    EIGEN_DEVICE_FUNC Product(const Lhs& lhs, const Rhs& rhs) : m_lhs(lhs), m_rhs(rhs)
-    {
-      eigen_assert(lhs.cols() == rhs.rows()
-        && "invalid matrix product"
-        && "if you wanted a coeff-wise or a dot product use the respective explicit functions");
-    }
-
-    EIGEN_DEVICE_FUNC inline Index rows() const { return m_lhs.rows(); }
-    EIGEN_DEVICE_FUNC inline Index cols() const { return m_rhs.cols(); }
-
-    EIGEN_DEVICE_FUNC const LhsNestedCleaned& lhs() const { return m_lhs; }
-    EIGEN_DEVICE_FUNC const RhsNestedCleaned& rhs() const { return m_rhs; }
-
-  protected:
-
-    LhsNested m_lhs;
-    RhsNested m_rhs;
-};
-
-namespace internal {
-  
-template<typename Lhs, typename Rhs, int Option, int ProductTag = internal::product_type<Lhs,Rhs>::ret>
-class dense_product_base
- : public internal::dense_xpr_base<Product<Lhs,Rhs,Option> >::type
-{};
-
-/** Convertion to scalar for inner-products */
-template<typename Lhs, typename Rhs, int Option>
-class dense_product_base<Lhs, Rhs, Option, InnerProduct>
- : public internal::dense_xpr_base<Product<Lhs,Rhs,Option> >::type
-{
-  typedef Product<Lhs,Rhs,Option> ProductXpr;
-  typedef typename internal::dense_xpr_base<ProductXpr>::type Base;
-public:
-  using Base::derived;
-  typedef typename Base::Scalar Scalar;
-  
-  operator const Scalar() const
-  {
-    return typename internal::evaluator<ProductXpr>::type(derived()).coeff(0,0);
-  }
-};
-
-} // namespace internal
-
-// Generic API dispatcher
-template<typename Lhs, typename Rhs, int Option, typename StorageKind>
-class ProductImpl : public internal::generic_xpr_base<Product<Lhs,Rhs,Option>, MatrixXpr, StorageKind>::type
-{
-  public:
-    typedef typename internal::generic_xpr_base<Product<Lhs,Rhs,Option>, MatrixXpr, StorageKind>::type Base;
-};
-
-template<typename Lhs, typename Rhs, int Option>
-class ProductImpl<Lhs,Rhs,Option,Dense>
-  : public internal::dense_product_base<Lhs,Rhs,Option>
-{
-    typedef Product<Lhs, Rhs, Option> Derived;
-    
-  public:
-    
-    typedef typename internal::dense_product_base<Lhs, Rhs, Option> Base;
-    EIGEN_DENSE_PUBLIC_INTERFACE(Derived)
-  protected:
-    enum {
-      IsOneByOne = (RowsAtCompileTime == 1 || RowsAtCompileTime == Dynamic) && 
-                   (ColsAtCompileTime == 1 || ColsAtCompileTime == Dynamic),
-      EnableCoeff = IsOneByOne || Option==LazyProduct
-    };
-    
-  public:
-  
-    EIGEN_DEVICE_FUNC Scalar coeff(Index row, Index col) const
-    {
-      EIGEN_STATIC_ASSERT(EnableCoeff, THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS);
-      eigen_assert( (Option==LazyProduct) || (this->rows() == 1 && this->cols() == 1) );
-      
-      return typename internal::evaluator<Derived>::type(derived()).coeff(row,col);
-    }
-
-    EIGEN_DEVICE_FUNC Scalar coeff(Index i) const
-    {
-      EIGEN_STATIC_ASSERT(EnableCoeff, THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS);
-      eigen_assert( (Option==LazyProduct) || (this->rows() == 1 && this->cols() == 1) );
-      
-      return typename internal::evaluator<Derived>::type(derived()).coeff(i);
-    }
-    
-  
-};
-
-/***************************************************************************
-* Implementation of matrix base methods
-***************************************************************************/
-
-
-/** \internal used to test the evaluator only
-  */
-template<typename Lhs,typename Rhs>
-const Product<Lhs,Rhs>
-prod(const Lhs& lhs, const Rhs& rhs)
-{
-  return Product<Lhs,Rhs>(lhs,rhs);
-}
-
-/** \internal used to test the evaluator only
-  */
-template<typename Lhs,typename Rhs>
-const Product<Lhs,Rhs,LazyProduct>
-lazyprod(const Lhs& lhs, const Rhs& rhs)
-{
-  return Product<Lhs,Rhs,LazyProduct>(lhs,rhs);
-}
-
-} // end namespace Eigen
-
-#endif // EIGEN_PRODUCT_H
--- a/Eigen/src/Core/ProductBase.h
+++ b/Eigen/src/Core/ProductBase.h
@@ -12,6 +12,269 @@

 namespace Eigen { 

+/** \class ProductBase
+  * \ingroup Core_Module
+  *
+  */
+
+namespace internal {
+template<typename Derived, typename _Lhs, typename _Rhs>
+struct traits<ProductBase<Derived,_Lhs,_Rhs> >
+{
+  typedef MatrixXpr XprKind;
+  typedef typename remove_all<_Lhs>::type Lhs;
+  typedef typename remove_all<_Rhs>::type Rhs;
+  typedef typename scalar_product_traits<typename Lhs::Scalar, typename Rhs::Scalar>::ReturnType Scalar;
+  typedef typename promote_storage_type<typename traits<Lhs>::StorageKind,
+                                           typename traits<Rhs>::StorageKind>::ret StorageKind;
+  typedef typename promote_index_type<typename traits<Lhs>::Index,
+                                         typename traits<Rhs>::Index>::type Index;
+  enum {
+    RowsAtCompileTime = traits<Lhs>::RowsAtCompileTime,
+    ColsAtCompileTime = traits<Rhs>::ColsAtCompileTime,
+    MaxRowsAtCompileTime = traits<Lhs>::MaxRowsAtCompileTime,
+    MaxColsAtCompileTime = traits<Rhs>::MaxColsAtCompileTime,
+    Flags = (MaxRowsAtCompileTime==1 ? RowMajorBit : 0)
+          | EvalBeforeNestingBit | EvalBeforeAssigningBit | NestByRefBit,
+                  // Note that EvalBeforeNestingBit and NestByRefBit
+                  // are not used in practice because nested is overloaded for products
+    CoeffReadCost = 0 // FIXME why is it needed ?
+  };
+};
+}
+
+#define EIGEN_PRODUCT_PUBLIC_INTERFACE(Derived) \
+  typedef ProductBase<Derived, Lhs, Rhs > Base; \
+  EIGEN_DENSE_PUBLIC_INTERFACE(Derived) \
+  typedef typename Base::LhsNested LhsNested; \
+  typedef typename Base::_LhsNested _LhsNested; \
+  typedef typename Base::LhsBlasTraits LhsBlasTraits; \
+  typedef typename Base::ActualLhsType ActualLhsType; \
+  typedef typename Base::_ActualLhsType _ActualLhsType; \
+  typedef typename Base::RhsNested RhsNested; \
+  typedef typename Base::_RhsNested _RhsNested; \
+  typedef typename Base::RhsBlasTraits RhsBlasTraits; \
+  typedef typename Base::ActualRhsType ActualRhsType; \
+  typedef typename Base::_ActualRhsType _ActualRhsType; \
+  using Base::m_lhs; \
+  using Base::m_rhs;
+
+template<typename Derived, typename Lhs, typename Rhs>
+class ProductBase : public MatrixBase<Derived>
+{
+  public:
+    typedef MatrixBase<Derived> Base;
+    EIGEN_DENSE_PUBLIC_INTERFACE(ProductBase)
+    
+    typedef typename Lhs::Nested LhsNested;
+    typedef typename internal::remove_all<LhsNested>::type _LhsNested;
+    typedef internal::blas_traits<_LhsNested> LhsBlasTraits;
+    typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
+    typedef typename internal::remove_all<ActualLhsType>::type _ActualLhsType;
+    typedef typename internal::traits<Lhs>::Scalar LhsScalar;
+
+    typedef typename Rhs::Nested RhsNested;
+    typedef typename internal::remove_all<RhsNested>::type _RhsNested;
+    typedef internal::blas_traits<_RhsNested> RhsBlasTraits;
+    typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
+    typedef typename internal::remove_all<ActualRhsType>::type _ActualRhsType;
+    typedef typename internal::traits<Rhs>::Scalar RhsScalar;
+
+    // Diagonal of a product: no need to evaluate the arguments because they are going to be evaluated only once
+    typedef CoeffBasedProduct<LhsNested, RhsNested, 0> FullyLazyCoeffBaseProductType;
+
+  public:
+
+#ifndef EIGEN_NO_MALLOC
+    typedef typename Base::PlainObject BasePlainObject;
+    typedef Matrix<Scalar,RowsAtCompileTime==1?1:Dynamic,ColsAtCompileTime==1?1:Dynamic,BasePlainObject::Options> DynPlainObject;
+    typedef typename internal::conditional<(BasePlainObject::SizeAtCompileTime==Dynamic) || (BasePlainObject::SizeAtCompileTime*int(sizeof(Scalar)) < int(EIGEN_STACK_ALLOCATION_LIMIT)),
+                                           BasePlainObject, DynPlainObject>::type PlainObject;
+#else
+    typedef typename Base::PlainObject PlainObject;
+#endif
+
+    ProductBase(const Lhs& a_lhs, const Rhs& a_rhs)
+      : m_lhs(a_lhs), m_rhs(a_rhs)
+    {
+      eigen_assert(a_lhs.cols() == a_rhs.rows()
+        && "invalid matrix product"
+        && "if you wanted a coeff-wise or a dot product use the respective explicit functions");
+    }
+
+    inline Index rows() const { return m_lhs.rows(); }
+    inline Index cols() const { return m_rhs.cols(); }
+
+    template<typename Dest>
+    inline void evalTo(Dest& dst) const { dst.setZero(); scaleAndAddTo(dst,Scalar(1)); }
+
+    template<typename Dest>
+    inline void addTo(Dest& dst) const { scaleAndAddTo(dst,Scalar(1)); }
+
+    template<typename Dest>
+    inline void subTo(Dest& dst) const { scaleAndAddTo(dst,Scalar(-1)); }
+
+    template<typename Dest>
+    inline void scaleAndAddTo(Dest& dst, const Scalar& alpha) const { derived().scaleAndAddTo(dst,alpha); }
+
+    const _LhsNested& lhs() const { return m_lhs; }
+    const _RhsNested& rhs() const { return m_rhs; }
+
+    // Implicit conversion to the nested type (trigger the evaluation of the product)
+    operator const PlainObject& () const
+    {
+      m_result.resize(m_lhs.rows(), m_rhs.cols());
+      derived().evalTo(m_result);
+      return m_result;
+    }
+
+    const Diagonal<const FullyLazyCoeffBaseProductType,0> diagonal() const
+    { return FullyLazyCoeffBaseProductType(m_lhs, m_rhs); }
+
+    template<int Index>
+    const Diagonal<FullyLazyCoeffBaseProductType,Index> diagonal() const
+    { return FullyLazyCoeffBaseProductType(m_lhs, m_rhs); }
+
+    const Diagonal<FullyLazyCoeffBaseProductType,Dynamic> diagonal(Index index) const
+    { return FullyLazyCoeffBaseProductType(m_lhs, m_rhs).diagonal(index); }
+
+    // restrict coeff accessors to 1x1 expressions. No need to care about mutators here since this isnt a Lvalue expression
+    typename Base::CoeffReturnType coeff(Index row, Index col) const
+    {
+#ifdef EIGEN2_SUPPORT
+      return lhs().row(row).cwiseProduct(rhs().col(col).transpose()).sum();
+#else
+      EIGEN_STATIC_ASSERT_SIZE_1x1(Derived)
+      eigen_assert(this->rows() == 1 && this->cols() == 1);
+      Matrix<Scalar,1,1> result = *this;
+      return result.coeff(row,col);
+#endif
+    }
+
+    typename Base::CoeffReturnType coeff(Index i) const
+    {
+      EIGEN_STATIC_ASSERT_SIZE_1x1(Derived)
+      eigen_assert(this->rows() == 1 && this->cols() == 1);
+      Matrix<Scalar,1,1> result = *this;
+      return result.coeff(i);
+    }
+
+    const Scalar& coeffRef(Index row, Index col) const
+    {
+      EIGEN_STATIC_ASSERT_SIZE_1x1(Derived)
+      eigen_assert(this->rows() == 1 && this->cols() == 1);
+      return derived().coeffRef(row,col);
+    }
+
+    const Scalar& coeffRef(Index i) const
+    {
+      EIGEN_STATIC_ASSERT_SIZE_1x1(Derived)
+      eigen_assert(this->rows() == 1 && this->cols() == 1);
+      return derived().coeffRef(i);
+    }
+
+  protected:
+
+    LhsNested m_lhs;
+    RhsNested m_rhs;
+
+    mutable PlainObject m_result;
+};
+
+// here we need to overload the nested rule for products
+// such that the nested type is a const reference to a plain matrix
+namespace internal {
+template<typename Lhs, typename Rhs, int Mode, int N, typename PlainObject>
+struct nested<GeneralProduct<Lhs,Rhs,Mode>, N, PlainObject>
+{
+  typedef typename GeneralProduct<Lhs,Rhs,Mode>::PlainObject const& type;
+};
+template<typename Lhs, typename Rhs, int Mode, int N, typename PlainObject>
+struct nested<const GeneralProduct<Lhs,Rhs,Mode>, N, PlainObject>
+{
+  typedef typename GeneralProduct<Lhs,Rhs,Mode>::PlainObject const& type;
+};
+}
+
+template<typename NestedProduct>
+class ScaledProduct;
+
+// Note that these two operator* functions are not defined as member
+// functions of ProductBase, because, otherwise we would have to
+// define all overloads defined in MatrixBase. Furthermore, Using
+// "using Base::operator*" would not work with MSVC.
+//
+// Also note that here we accept any compatible scalar types
+template<typename Derived,typename Lhs,typename Rhs>
+const ScaledProduct<Derived>
+operator*(const ProductBase<Derived,Lhs,Rhs>& prod, const typename Derived::Scalar& x)
+{ return ScaledProduct<Derived>(prod.derived(), x); }
+
+template<typename Derived,typename Lhs,typename Rhs>
+typename internal::enable_if<!internal::is_same<typename Derived::Scalar,typename Derived::RealScalar>::value,
+                      const ScaledProduct<Derived> >::type
+operator*(const ProductBase<Derived,Lhs,Rhs>& prod, const typename Derived::RealScalar& x)
+{ return ScaledProduct<Derived>(prod.derived(), x); }
+
+
+template<typename Derived,typename Lhs,typename Rhs>
+const ScaledProduct<Derived>
+operator*(const typename Derived::Scalar& x,const ProductBase<Derived,Lhs,Rhs>& prod)
+{ return ScaledProduct<Derived>(prod.derived(), x); }
+
+template<typename Derived,typename Lhs,typename Rhs>
+typename internal::enable_if<!internal::is_same<typename Derived::Scalar,typename Derived::RealScalar>::value,
+                      const ScaledProduct<Derived> >::type
+operator*(const typename Derived::RealScalar& x,const ProductBase<Derived,Lhs,Rhs>& prod)
+{ return ScaledProduct<Derived>(prod.derived(), x); }
+
+namespace internal {
+template<typename NestedProduct>
+struct traits<ScaledProduct<NestedProduct> >
+ : traits<ProductBase<ScaledProduct<NestedProduct>,
+                         typename NestedProduct::_LhsNested,
+                         typename NestedProduct::_RhsNested> >
+{
+  typedef typename traits<NestedProduct>::StorageKind StorageKind;
+};
+}
+
+template<typename NestedProduct>
+class ScaledProduct
+  : public ProductBase<ScaledProduct<NestedProduct>,
+                       typename NestedProduct::_LhsNested,
+                       typename NestedProduct::_RhsNested>
+{
+  public:
+    typedef ProductBase<ScaledProduct<NestedProduct>,
+                       typename NestedProduct::_LhsNested,
+                       typename NestedProduct::_RhsNested> Base;
+    typedef typename Base::Scalar Scalar;
+    typedef typename Base::PlainObject PlainObject;
+//     EIGEN_PRODUCT_PUBLIC_INTERFACE(ScaledProduct)
+
+    ScaledProduct(const NestedProduct& prod, const Scalar& x)
+    : Base(prod.lhs(),prod.rhs()), m_prod(prod), m_alpha(x) {}
+
+    template<typename Dest>
+    inline void evalTo(Dest& dst) const { dst.setZero(); scaleAndAddTo(dst, Scalar(1)); }
+
+    template<typename Dest>
+    inline void addTo(Dest& dst) const { scaleAndAddTo(dst, Scalar(1)); }
+
+    template<typename Dest>
+    inline void subTo(Dest& dst) const { scaleAndAddTo(dst, Scalar(-1)); }
+
+    template<typename Dest>
+    inline void scaleAndAddTo(Dest& dst, const Scalar& a_alpha) const { m_prod.derived().scaleAndAddTo(dst,a_alpha * m_alpha); }
+
+    const Scalar& alpha() const { return m_alpha; }
+    
+  protected:
+    const NestedProduct& m_prod;
+    Scalar m_alpha;
+};
+
 /** \internal
  * Overloaded to perform an efficient C = (A*B).lazy() */
 template<typename Derived>
--- a/Eigen/src/Core/ProductEvaluators.h
+++ b/Eigen/src/Core/ProductEvaluators.h
@@ -1,861 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
-// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
-// Copyright (C) 2011 Jitse Niesen <jitse@maths.leeds.ac.uk>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-
-#ifndef EIGEN_PRODUCTEVALUATORS_H
-#define EIGEN_PRODUCTEVALUATORS_H
-
-namespace Eigen {
-  
-namespace internal {
-
-/** \internal
-  * Evaluator of a product expression.
-  * Since products require special treatments to handle all possible cases,
-  * we simply deffer the evaluation logic to a product_evaluator class
-  * which offers more partial specialization possibilities.
-  * 
-  * \sa class product_evaluator
-  */
-template<typename Lhs, typename Rhs, int Options>
-struct evaluator<Product<Lhs, Rhs, Options> > 
- : public product_evaluator<Product<Lhs, Rhs, Options> >
-{
-  typedef Product<Lhs, Rhs, Options> XprType;
-  typedef product_evaluator<XprType> Base;
-  
-  typedef evaluator type;
-  typedef evaluator nestedType;
-  
-  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) : Base(xpr) {}
-};
- 
-// Catch scalar * ( A * B ) and transform it to (A*scalar) * B
-// TODO we should apply that rule only if that's really helpful
-template<typename Lhs, typename Rhs, typename Scalar>
-struct evaluator<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>,  const Product<Lhs, Rhs, DefaultProduct>  > > 
- : public evaluator<Product<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>,const Lhs>, Rhs, DefaultProduct> >
-{
-  typedef CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const Product<Lhs, Rhs, DefaultProduct> > XprType;
-  typedef evaluator<Product<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>,const Lhs>, Rhs, DefaultProduct> > Base;
-  
-  typedef evaluator type;
-  typedef evaluator nestedType;
-  
-  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr)
-    : Base(xpr.functor().m_other * xpr.nestedExpression().lhs() * xpr.nestedExpression().rhs())
-  {}
-};
-
-
-template<typename Lhs, typename Rhs, int DiagIndex>
-struct evaluator<Diagonal<const Product<Lhs, Rhs, DefaultProduct>, DiagIndex> > 
- : public evaluator<Diagonal<const Product<Lhs, Rhs, LazyProduct>, DiagIndex> >
-{
-  typedef Diagonal<const Product<Lhs, Rhs, DefaultProduct>, DiagIndex> XprType;
-  typedef evaluator<Diagonal<const Product<Lhs, Rhs, LazyProduct>, DiagIndex> > Base;
-  
-  typedef evaluator type;
-  typedef evaluator nestedType;
-
-  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr)
-    : Base(Diagonal<const Product<Lhs, Rhs, LazyProduct>, DiagIndex>(
-        Product<Lhs, Rhs, LazyProduct>(xpr.nestedExpression().lhs(), xpr.nestedExpression().rhs()),
-        xpr.index() ))
-  {}
-};
-
-
-// Helper class to perform a matrix product with the destination at hand.
-// Depending on the sizes of the factors, there are different evaluation strategies
-// as controlled by internal::product_type.
-template< typename Lhs, typename Rhs,
-          typename LhsShape = typename evaluator_traits<Lhs>::Shape,
-          typename RhsShape = typename evaluator_traits<Rhs>::Shape,
-          int ProductType = internal::product_type<Lhs,Rhs>::value>
-struct generic_product_impl;
-
-template<typename Lhs, typename Rhs>
-struct evaluator_traits<Product<Lhs, Rhs, DefaultProduct> > 
- : evaluator_traits_base<Product<Lhs, Rhs, DefaultProduct> >
-{
-  enum { AssumeAliasing = 1 };
-};
-
-// This is the default evaluator implementation for products:
-// It creates a temporary and call generic_product_impl
-template<typename Lhs, typename Rhs, int ProductTag, typename LhsShape, typename RhsShape>
-struct product_evaluator<Product<Lhs, Rhs, DefaultProduct>, ProductTag, LhsShape, RhsShape, typename traits<Lhs>::Scalar, typename traits<Rhs>::Scalar> 
-  : public evaluator<typename Product<Lhs, Rhs, DefaultProduct>::PlainObject>::type
-{
-  typedef Product<Lhs, Rhs, DefaultProduct> XprType;
-  typedef typename XprType::PlainObject PlainObject;
-  typedef typename evaluator<PlainObject>::type Base;
-  enum {
-    Flags = Base::Flags | EvalBeforeNestingBit
-//     CoeffReadCost = 0 // FIXME why is it needed? (this was already the case before the evaluators, see traits<ProductBase>)
-  };
-
-  EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr)
-    : m_result(xpr.rows(), xpr.cols())
-  {
-    ::new (static_cast<Base*>(this)) Base(m_result);
-    
-// FIXME shall we handle nested_eval here?
-//     typedef typename internal::nested_eval<Lhs,Rhs::ColsAtCompileTime>::type LhsNested;
-//     typedef typename internal::nested_eval<Rhs,Lhs::RowsAtCompileTime>::type RhsNested;
-//     typedef typename internal::remove_all<LhsNested>::type LhsNestedCleaned;
-//     typedef typename internal::remove_all<RhsNested>::type RhsNestedCleaned;
-//     
-//     const LhsNested lhs(xpr.lhs());
-//     const RhsNested rhs(xpr.rhs());
-//   
-//     generic_product_impl<LhsNestedCleaned, RhsNestedCleaned>::evalTo(m_result, lhs, rhs);
-
-    generic_product_impl<Lhs, Rhs, LhsShape, RhsShape, ProductTag>::evalTo(m_result, xpr.lhs(), xpr.rhs());
-  }
-  
-protected:  
-  PlainObject m_result;
-};
-
-// Dense = Product
-template< typename DstXprType, typename Lhs, typename Rhs, typename Scalar>
-struct Assignment<DstXprType, Product<Lhs,Rhs,DefaultProduct>, internal::assign_op<Scalar>, Dense2Dense, Scalar>
-{
-  typedef Product<Lhs,Rhs,DefaultProduct> SrcXprType;
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar> &)
-  {
-    // FIXME shall we handle nested_eval here?
-    generic_product_impl<Lhs, Rhs>::evalTo(dst, src.lhs(), src.rhs());
-  }
-};
-
-// Dense += Product
-template< typename DstXprType, typename Lhs, typename Rhs, typename Scalar>
-struct Assignment<DstXprType, Product<Lhs,Rhs,DefaultProduct>, internal::add_assign_op<Scalar>, Dense2Dense, Scalar>
-{
-  typedef Product<Lhs,Rhs,DefaultProduct> SrcXprType;
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<Scalar> &)
-  {
-    // FIXME shall we handle nested_eval here?
-    generic_product_impl<Lhs, Rhs>::addTo(dst, src.lhs(), src.rhs());
-  }
-};
-
-// Dense -= Product
-template< typename DstXprType, typename Lhs, typename Rhs, typename Scalar>
-struct Assignment<DstXprType, Product<Lhs,Rhs,DefaultProduct>, internal::sub_assign_op<Scalar>, Dense2Dense, Scalar>
-{
-  typedef Product<Lhs,Rhs,DefaultProduct> SrcXprType;
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<Scalar> &)
-  {
-    // FIXME shall we handle nested_eval here?
-    generic_product_impl<Lhs, Rhs>::subTo(dst, src.lhs(), src.rhs());
-  }
-};
-
-
-// Dense ?= scalar * Product
-// TODO we should apply that rule if that's really helpful
-// for instance, this is not good for inner products
-template< typename DstXprType, typename Lhs, typename Rhs, typename AssignFunc, typename Scalar, typename ScalarBis>
-struct Assignment<DstXprType, CwiseUnaryOp<internal::scalar_multiple_op<ScalarBis>,
-                                           const Product<Lhs,Rhs,DefaultProduct> >, AssignFunc, Dense2Dense, Scalar>
-{
-  typedef CwiseUnaryOp<internal::scalar_multiple_op<ScalarBis>,
-                       const Product<Lhs,Rhs,DefaultProduct> > SrcXprType;
-  static void run(DstXprType &dst, const SrcXprType &src, const AssignFunc& func)
-  {
-    // TODO use operator* instead of prod() once we have made enough progress
-    call_assignment(dst.noalias(), prod(src.functor().m_other * src.nestedExpression().lhs(), src.nestedExpression().rhs()), func);
-  }
-};
-
-
-template<typename Lhs, typename Rhs>
-struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,InnerProduct>
-{
-  template<typename Dst>
-  static inline void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
-  {
-    dst.coeffRef(0,0) = (lhs.transpose().cwiseProduct(rhs)).sum();
-  }
-  
-  template<typename Dst>
-  static inline void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
-  {
-    dst.coeffRef(0,0) += (lhs.transpose().cwiseProduct(rhs)).sum();
-  }
-  
-  template<typename Dst>
-  static void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
-  { dst.coeffRef(0,0) -= (lhs.transpose().cwiseProduct(rhs)).sum(); }
-};
-
-
-/***********************************************************************
-*  Implementation of outer dense * dense vector product
-***********************************************************************/
-
-// Column major result
-template<typename Dst, typename Lhs, typename Rhs, typename Func>
-EIGEN_DONT_INLINE void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const false_type&)
-{
-  typename evaluator<Rhs>::type rhsEval(rhs);
-  // FIXME make sure lhs is sequentially stored
-  // FIXME not very good if rhs is real and lhs complex while alpha is real too
-  // FIXME we should probably build an evaluator for dst
-  const Index cols = dst.cols();
-  for (Index j=0; j<cols; ++j)
-    func(dst.col(j), rhsEval.coeff(0,j) * lhs);
-}
-
-// Row major result
-template<typename Dst, typename Lhs, typename Rhs, typename Func>
-EIGEN_DONT_INLINE void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const true_type&)
-{
-  typename evaluator<Lhs>::type lhsEval(lhs);
-  // FIXME make sure rhs is sequentially stored
-  // FIXME not very good if lhs is real and rhs complex while alpha is real too
-  // FIXME we should probably build an evaluator for dst
-  const Index rows = dst.rows();
-  for (Index i=0; i<rows; ++i)
-    func(dst.row(i), lhsEval.coeff(i,0) * rhs);
-}
-
-template<typename Lhs, typename Rhs>
-struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,OuterProduct>
-{
-  template<typename T> struct IsRowMajor : internal::conditional<(int(T::Flags)&RowMajorBit), internal::true_type, internal::false_type>::type {};
-  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
-  
-  // TODO it would be nice to be able to exploit our *_assign_op functors for that purpose
-  struct set  { template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived()  = src; } };
-  struct add  { template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() += src; } };
-  struct sub  { template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() -= src; } };
-  struct adds {
-    Scalar m_scale;
-    explicit adds(const Scalar& s) : m_scale(s) {}
-    template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const {
-      dst.const_cast_derived() += m_scale * src;
-    }
-  };
-  
-  template<typename Dst>
-  static inline void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
-  {
-    internal::outer_product_selector_run(dst, lhs, rhs, set(), IsRowMajor<Dst>());
-  }
-  
-  template<typename Dst>
-  static inline void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
-  {
-    internal::outer_product_selector_run(dst, lhs, rhs, add(), IsRowMajor<Dst>());
-  }
-  
-  template<typename Dst>
-  static inline void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
-  {
-    internal::outer_product_selector_run(dst, lhs, rhs, sub(), IsRowMajor<Dst>());
-  }
-  
-  template<typename Dst>
-  static inline void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
-  {
-    internal::outer_product_selector_run(dst, lhs, rhs, adds(alpha), IsRowMajor<Dst>());
-  }
-  
-};
-
-
-// This base class provides default implementations for evalTo, addTo, subTo, in terms of scaleAndAddTo
-template<typename Lhs, typename Rhs, typename Derived>
-struct generic_product_impl_base
-{
-  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
-  
-  template<typename Dst>
-  static void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
-  { dst.setZero(); scaleAndAddTo(dst, lhs, rhs, Scalar(1)); }
-
-  template<typename Dst>
-  static void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
-  { scaleAndAddTo(dst,lhs, rhs, Scalar(1)); }
-
-  template<typename Dst>
-  static void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
-  { scaleAndAddTo(dst, lhs, rhs, Scalar(-1)); }
-  
-  template<typename Dst>
-  static void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
-  { Derived::scaleAndAddTo(dst,lhs,rhs,alpha); }
-
-};
-
-template<typename Lhs, typename Rhs>
-struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemvProduct>
-  : generic_product_impl_base<Lhs,Rhs,generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemvProduct> >
-{
-  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
-  enum { Side = Lhs::IsVectorAtCompileTime ? OnTheLeft : OnTheRight };
-  typedef typename internal::conditional<int(Side)==OnTheRight,Lhs,Rhs>::type MatrixType;
-
-  template<typename Dest>
-  static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
-  {
-    internal::gemv_dense_sense_selector<Side,
-                            (int(MatrixType::Flags)&RowMajorBit) ? RowMajor : ColMajor,
-                            bool(internal::blas_traits<MatrixType>::HasUsableDirectAccess)
-                           >::run(lhs, rhs, dst, alpha);
-  }
-};
-
-template<typename Lhs, typename Rhs>
-struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,CoeffBasedProductMode> 
-{
-  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
-  
-  template<typename Dst>
-  static inline void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
-  {
-    // TODO: use the following instead of calling call_assignment, same for the other methods
-    // dst = lazyprod(lhs,rhs);
-    call_assignment(dst, lazyprod(lhs,rhs), internal::assign_op<Scalar>());
-  }
-  
-  template<typename Dst>
-  static inline void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
-  {
-    // dst += lazyprod(lhs,rhs);
-    call_assignment(dst, lazyprod(lhs,rhs), internal::add_assign_op<Scalar>());
-  }
-  
-  template<typename Dst>
-  static inline void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
-  {
-    // dst -= lazyprod(lhs,rhs);
-    call_assignment(dst, lazyprod(lhs,rhs), internal::sub_assign_op<Scalar>());
-  }
-  
-//   template<typename Dst>
-//   static inline void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
-//   { dst += alpha * lazyprod(lhs,rhs); }
-};
-
-// This specialization enforces the use of a coefficient-based evaluation strategy
-template<typename Lhs, typename Rhs>
-struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,LazyCoeffBasedProductMode>
-  : generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,CoeffBasedProductMode> {};
-
-// Case 2: Evaluate coeff by coeff
-//
-// This is mostly taken from CoeffBasedProduct.h
-// The main difference is that we add an extra argument to the etor_product_*_impl::run() function
-// for the inner dimension of the product, because evaluator object do not know their size.
-
-template<int Traversal, int UnrollingIndex, typename Lhs, typename Rhs, typename RetScalar>
-struct etor_product_coeff_impl;
-
-template<int StorageOrder, int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode>
-struct etor_product_packet_impl;
-
-template<typename Lhs, typename Rhs, int ProductTag>
-struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape, DenseShape, typename Lhs::Scalar, typename Rhs::Scalar > 
-    : evaluator_base<Product<Lhs, Rhs, LazyProduct> >
-{
-  typedef Product<Lhs, Rhs, LazyProduct> XprType;
-  typedef typename XprType::Scalar Scalar;
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename XprType::PacketScalar PacketScalar;
-  typedef typename XprType::PacketReturnType PacketReturnType;
-
-  EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr)
-    : m_lhs(xpr.lhs()),
-      m_rhs(xpr.rhs()),
-      m_lhsImpl(m_lhs),     // FIXME the creation of the evaluator objects should result in a no-op, but check that!
-      m_rhsImpl(m_rhs),     //       Moreover, they are only useful for the packet path, so we could completely disable them when not needed,
-                            //       or perhaps declare them on the fly on the packet method... We have experiment to check what's best.
-      m_innerDim(xpr.lhs().cols())
-  { }
-
-  // Everything below here is taken from CoeffBasedProduct.h
-
-  typedef typename internal::nested_eval<Lhs,Rhs::ColsAtCompileTime>::type LhsNested;
-  typedef typename internal::nested_eval<Rhs,Lhs::RowsAtCompileTime>::type RhsNested;
-  
-  typedef typename internal::remove_all<LhsNested>::type LhsNestedCleaned;
-  typedef typename internal::remove_all<RhsNested>::type RhsNestedCleaned;
-
-  typedef typename evaluator<LhsNestedCleaned>::type LhsEtorType;
-  typedef typename evaluator<RhsNestedCleaned>::type RhsEtorType;
-  
-  enum {
-    RowsAtCompileTime = LhsNestedCleaned::RowsAtCompileTime,
-    ColsAtCompileTime = RhsNestedCleaned::ColsAtCompileTime,
-    InnerSize = EIGEN_SIZE_MIN_PREFER_FIXED(LhsNestedCleaned::ColsAtCompileTime, RhsNestedCleaned::RowsAtCompileTime),
-    MaxRowsAtCompileTime = LhsNestedCleaned::MaxRowsAtCompileTime,
-    MaxColsAtCompileTime = RhsNestedCleaned::MaxColsAtCompileTime,
-      
-    PacketSize = packet_traits<Scalar>::size,
-    
-    LhsCoeffReadCost = LhsEtorType::CoeffReadCost,
-    RhsCoeffReadCost = RhsEtorType::CoeffReadCost,
-    CoeffReadCost = (InnerSize == Dynamic || LhsCoeffReadCost==Dynamic || RhsCoeffReadCost==Dynamic || NumTraits<Scalar>::AddCost==Dynamic || NumTraits<Scalar>::MulCost==Dynamic) ? Dynamic
-                  : InnerSize * (NumTraits<Scalar>::MulCost + LhsCoeffReadCost + RhsCoeffReadCost)
-                    + (InnerSize - 1) * NumTraits<Scalar>::AddCost,
-
-    Unroll = CoeffReadCost != Dynamic && CoeffReadCost <= EIGEN_UNROLLING_LIMIT,
-    
-    LhsFlags = LhsEtorType::Flags,
-    RhsFlags = RhsEtorType::Flags,
-    
-    LhsRowMajor = LhsFlags & RowMajorBit,
-    RhsRowMajor = RhsFlags & RowMajorBit,
-      
-    SameType = is_same<typename LhsNestedCleaned::Scalar,typename RhsNestedCleaned::Scalar>::value,
-
-    CanVectorizeRhs = RhsRowMajor && (RhsFlags & PacketAccessBit)
-                    && (ColsAtCompileTime == Dynamic
-                        || ( (ColsAtCompileTime % packet_traits<Scalar>::size) == 0
-                            && (RhsFlags&AlignedBit)
-                            )
-                        ),
-
-    CanVectorizeLhs = (!LhsRowMajor) && (LhsFlags & PacketAccessBit)
-                    && (RowsAtCompileTime == Dynamic
-                        || ( (RowsAtCompileTime % packet_traits<Scalar>::size) == 0
-                            && (LhsFlags&AlignedBit)
-                            )
-                        ),
-
-    EvalToRowMajor = (MaxRowsAtCompileTime==1&&MaxColsAtCompileTime!=1) ? 1
-                    : (MaxColsAtCompileTime==1&&MaxRowsAtCompileTime!=1) ? 0
-                    : (RhsRowMajor && !CanVectorizeLhs),
-
-    Flags = ((unsigned int)(LhsFlags | RhsFlags) & HereditaryBits & ~RowMajorBit)
-          | (EvalToRowMajor ? RowMajorBit : 0)
-          | (CanVectorizeLhs ? (LhsFlags & AlignedBit) : 0)
-          | (CanVectorizeRhs ? (RhsFlags & AlignedBit) : 0)
-          // TODO enable vectorization for mixed types
-          | (SameType && (CanVectorizeLhs || CanVectorizeRhs) ? PacketAccessBit : 0),
-          
-    /* CanVectorizeInner deserves special explanation. It does not affect the product flags. It is not used outside
-    * of Product. If the Product itself is not a packet-access expression, there is still a chance that the inner
-    * loop of the product might be vectorized. This is the meaning of CanVectorizeInner. Since it doesn't affect
-    * the Flags, it is safe to make this value depend on ActualPacketAccessBit, that doesn't affect the ABI.
-    */
-    CanVectorizeInner =    SameType
-                        && LhsRowMajor
-                        && (!RhsRowMajor)
-                        && (LhsFlags & RhsFlags & ActualPacketAccessBit)
-                        && (LhsFlags & RhsFlags & AlignedBit)
-                        && (InnerSize % packet_traits<Scalar>::size == 0)
-  };
-  
-  EIGEN_DEVICE_FUNC const CoeffReturnType coeff(Index row, Index col) const
-  {
-    // TODO check performance regression wrt to Eigen 3.2 which has special handling of this function
-    return (m_lhs.row(row).transpose().cwiseProduct( m_rhs.col(col) )).sum();
-  }
-
-  /* Allow index-based non-packet access. It is impossible though to allow index-based packed access,
-   * which is why we don't set the LinearAccessBit.
-   * TODO: this seems possible when the result is a vector
-   */
-  EIGEN_DEVICE_FUNC const CoeffReturnType coeff(Index index) const
-  {
-    const Index row = RowsAtCompileTime == 1 ? 0 : index;
-    const Index col = RowsAtCompileTime == 1 ? index : 0;
-    // TODO check performance regression wrt to Eigen 3.2 which has special handling of this function
-    return (m_lhs.row(row).transpose().cwiseProduct( m_rhs.col(col) )).sum();
-  }
-
-  template<int LoadMode>
-  const PacketReturnType packet(Index row, Index col) const
-  {
-    PacketScalar res;
-    typedef etor_product_packet_impl<Flags&RowMajorBit ? RowMajor : ColMajor,
-                                     Unroll ? InnerSize-1 : Dynamic,
-                                     LhsEtorType, RhsEtorType, PacketScalar, LoadMode> PacketImpl;
-
-    PacketImpl::run(row, col, m_lhsImpl, m_rhsImpl, m_innerDim, res);
-    return res;
-  }
-
-protected:
-  const LhsNested m_lhs;
-  const RhsNested m_rhs;
-  
-  LhsEtorType m_lhsImpl;
-  RhsEtorType m_rhsImpl;
-
-  // TODO: Get rid of m_innerDim if known at compile time
-  Index m_innerDim;
-};
-
-template<typename Lhs, typename Rhs>
-struct product_evaluator<Product<Lhs, Rhs, DefaultProduct>, LazyCoeffBasedProductMode, DenseShape, DenseShape, typename traits<Lhs>::Scalar, typename traits<Rhs>::Scalar > 
-  : product_evaluator<Product<Lhs, Rhs, LazyProduct>, CoeffBasedProductMode, DenseShape, DenseShape, typename traits<Lhs>::Scalar, typename traits<Rhs>::Scalar >
-{
-  typedef Product<Lhs, Rhs, DefaultProduct> XprType;
-  typedef Product<Lhs, Rhs, LazyProduct> BaseProduct;
-  typedef product_evaluator<BaseProduct, CoeffBasedProductMode, DenseShape, DenseShape, typename Lhs::Scalar, typename Rhs::Scalar > Base;
-  enum {
-    Flags = Base::Flags | EvalBeforeNestingBit
-  };
-  EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr)
-    : Base(BaseProduct(xpr.lhs(),xpr.rhs()))
-  {}
-};
-
-/****************************************
-*** Coeff based product, Packet path  ***
-****************************************/
-
-template<int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode>
-struct etor_product_packet_impl<RowMajor, UnrollingIndex, Lhs, Rhs, Packet, LoadMode>
-{
-  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet &res)
-  {
-    etor_product_packet_impl<RowMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, innerDim, res);
-    res =  pmadd(pset1<Packet>(lhs.coeff(row, UnrollingIndex)), rhs.template packet<LoadMode>(UnrollingIndex, col), res);
-  }
-};
-
-template<int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode>
-struct etor_product_packet_impl<ColMajor, UnrollingIndex, Lhs, Rhs, Packet, LoadMode>
-{
-  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet &res)
-  {
-    etor_product_packet_impl<ColMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, innerDim, res);
-    res =  pmadd(lhs.template packet<LoadMode>(row, UnrollingIndex), pset1<Packet>(rhs.coeff(UnrollingIndex, col)), res);
-  }
-};
-
-template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
-struct etor_product_packet_impl<RowMajor, 0, Lhs, Rhs, Packet, LoadMode>
-{
-  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, Packet &res)
-  {
-    res = pmul(pset1<Packet>(lhs.coeff(row, 0)),rhs.template packet<LoadMode>(0, col));
-  }
-};
-
-template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
-struct etor_product_packet_impl<ColMajor, 0, Lhs, Rhs, Packet, LoadMode>
-{
-  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, Packet &res)
-  {
-    res = pmul(lhs.template packet<LoadMode>(row, 0), pset1<Packet>(rhs.coeff(0, col)));
-  }
-};
-
-template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
-struct etor_product_packet_impl<RowMajor, Dynamic, Lhs, Rhs, Packet, LoadMode>
-{
-  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet& res)
-  {
-    eigen_assert(innerDim>0 && "you are using a non initialized matrix");
-    res = pmul(pset1<Packet>(lhs.coeff(row, 0)),rhs.template packet<LoadMode>(0, col));
-    for(Index i = 1; i < innerDim; ++i)
-      res =  pmadd(pset1<Packet>(lhs.coeff(row, i)), rhs.template packet<LoadMode>(i, col), res);
-  }
-};
-
-template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
-struct etor_product_packet_impl<ColMajor, Dynamic, Lhs, Rhs, Packet, LoadMode>
-{
-  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet& res)
-  {
-    eigen_assert(innerDim>0 && "you are using a non initialized matrix");
-    res = pmul(lhs.template packet<LoadMode>(row, 0), pset1<Packet>(rhs.coeff(0, col)));
-    for(Index i = 1; i < innerDim; ++i)
-      res =  pmadd(lhs.template packet<LoadMode>(row, i), pset1<Packet>(rhs.coeff(i, col)), res);
-  }
-};
-
-
-/***************************************************************************
-* Triangular products
-***************************************************************************/
-template<int Mode, bool LhsIsTriangular,
-         typename Lhs, bool LhsIsVector,
-         typename Rhs, bool RhsIsVector>
-struct triangular_product_impl;
-
-template<typename Lhs, typename Rhs, int ProductTag>
-struct generic_product_impl<Lhs,Rhs,TriangularShape,DenseShape,ProductTag>
-  : generic_product_impl_base<Lhs,Rhs,generic_product_impl<Lhs,Rhs,TriangularShape,DenseShape,ProductTag> >
-{
-  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
-  
-  template<typename Dest>
-  static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
-  {
-    triangular_product_impl<Lhs::Mode,true,typename Lhs::MatrixType,false,Rhs, Rhs::ColsAtCompileTime==1>
-        ::run(dst, lhs.nestedExpression(), rhs, alpha);
-  }
-};
-
-template<typename Lhs, typename Rhs, int ProductTag>
-struct generic_product_impl<Lhs,Rhs,DenseShape,TriangularShape,ProductTag>
-: generic_product_impl_base<Lhs,Rhs,generic_product_impl<Lhs,Rhs,DenseShape,TriangularShape,ProductTag> >
-{
-  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
-  
-  template<typename Dest>
-  static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
-  {
-    triangular_product_impl<Rhs::Mode,false,Lhs,Lhs::RowsAtCompileTime==1, typename Rhs::MatrixType, false>::run(dst, lhs, rhs.nestedExpression(), alpha);
-  }
-};
-
-
-/***************************************************************************
-* SelfAdjoint products
-***************************************************************************/
-template <typename Lhs, int LhsMode, bool LhsIsVector,
-          typename Rhs, int RhsMode, bool RhsIsVector>
-struct selfadjoint_product_impl;
-
-template<typename Lhs, typename Rhs, int ProductTag>
-struct generic_product_impl<Lhs,Rhs,SelfAdjointShape,DenseShape,ProductTag>
-  : generic_product_impl_base<Lhs,Rhs,generic_product_impl<Lhs,Rhs,SelfAdjointShape,DenseShape,ProductTag> >
-{
-  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
-  
-  template<typename Dest>
-  static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
-  {
-    selfadjoint_product_impl<typename Lhs::MatrixType,Lhs::Mode,false,Rhs,0,Rhs::IsVectorAtCompileTime>::run(dst, lhs.nestedExpression(), rhs, alpha);
-  }
-};
-
-template<typename Lhs, typename Rhs, int ProductTag>
-struct generic_product_impl<Lhs,Rhs,DenseShape,SelfAdjointShape,ProductTag>
-: generic_product_impl_base<Lhs,Rhs,generic_product_impl<Lhs,Rhs,DenseShape,SelfAdjointShape,ProductTag> >
-{
-  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
-  
-  template<typename Dest>
-  static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
-  {
-    selfadjoint_product_impl<Lhs,0,Lhs::IsVectorAtCompileTime,typename Rhs::MatrixType,Rhs::Mode,false>::run(dst, lhs, rhs.nestedExpression(), alpha);
-  }
-};
-
-
-/***************************************************************************
-* Diagonal products
-***************************************************************************/
-  
-template<typename MatrixType, typename DiagonalType, typename Derived, int ProductOrder>
-struct diagonal_product_evaluator_base
-  : evaluator_base<Derived>
-{
-   typedef typename scalar_product_traits<typename MatrixType::Scalar, typename DiagonalType::Scalar>::ReturnType Scalar;
-   typedef typename internal::packet_traits<Scalar>::type PacketScalar;
-public:
-  enum {
-    CoeffReadCost = NumTraits<Scalar>::MulCost + evaluator<MatrixType>::CoeffReadCost + evaluator<DiagonalType>::CoeffReadCost,
-    
-    MatrixFlags = evaluator<MatrixType>::Flags,
-    DiagFlags = evaluator<DiagonalType>::Flags,
-    _StorageOrder = MatrixFlags & RowMajorBit ? RowMajor : ColMajor,
-    _ScalarAccessOnDiag =  !((int(_StorageOrder) == ColMajor && int(ProductOrder) == OnTheLeft)
-                           ||(int(_StorageOrder) == RowMajor && int(ProductOrder) == OnTheRight)),
-    _SameTypes = is_same<typename MatrixType::Scalar, typename DiagonalType::Scalar>::value,
-    // FIXME currently we need same types, but in the future the next rule should be the one
-    //_Vectorizable = bool(int(MatrixFlags)&PacketAccessBit) && ((!_PacketOnDiag) || (_SameTypes && bool(int(DiagFlags)&PacketAccessBit))),
-    _Vectorizable = bool(int(MatrixFlags)&PacketAccessBit) && _SameTypes && (_ScalarAccessOnDiag || (bool(int(DiagFlags)&PacketAccessBit))),
-    _LinearAccessMask = (MatrixType::RowsAtCompileTime==1 || MatrixType::ColsAtCompileTime==1) ? LinearAccessBit : 0,
-    Flags = ((HereditaryBits|_LinearAccessMask) & (unsigned int)(MatrixFlags)) | (_Vectorizable ? PacketAccessBit : 0) | AlignedBit
-            //(int(MatrixFlags)&int(DiagFlags)&AlignedBit),
-  };
-  
-  diagonal_product_evaluator_base(const MatrixType &mat, const DiagonalType &diag)
-    : m_diagImpl(diag), m_matImpl(mat)
-  {
-  }
-  
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index idx) const
-  {
-    return m_diagImpl.coeff(idx) * m_matImpl.coeff(idx);
-  }
-  
-protected:
-  template<int LoadMode>
-  EIGEN_STRONG_INLINE PacketScalar packet_impl(Index row, Index col, Index id, internal::true_type) const
-  {
-    return internal::pmul(m_matImpl.template packet<LoadMode>(row, col),
-                          internal::pset1<PacketScalar>(m_diagImpl.coeff(id)));
-  }
-  
-  template<int LoadMode>
-  EIGEN_STRONG_INLINE PacketScalar packet_impl(Index row, Index col, Index id, internal::false_type) const
-  {
-    enum {
-      InnerSize = (MatrixType::Flags & RowMajorBit) ? MatrixType::ColsAtCompileTime : MatrixType::RowsAtCompileTime,
-      DiagonalPacketLoadMode = (LoadMode == Aligned && (((InnerSize%16) == 0) || (int(DiagFlags)&AlignedBit)==AlignedBit) ? Aligned : Unaligned)
-    };
-    return internal::pmul(m_matImpl.template packet<LoadMode>(row, col),
-                          m_diagImpl.template packet<DiagonalPacketLoadMode>(id));
-  }
-  
-  typename evaluator<DiagonalType>::nestedType m_diagImpl;
-  typename evaluator<MatrixType>::nestedType   m_matImpl;
-};
-
-// diagonal * dense
-template<typename Lhs, typename Rhs, int ProductKind, int ProductTag>
-struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DiagonalShape, DenseShape, typename Lhs::Scalar, typename Rhs::Scalar> 
-  : diagonal_product_evaluator_base<Rhs, typename Lhs::DiagonalVectorType, Product<Lhs, Rhs, LazyProduct>, OnTheLeft>
-{
-  typedef diagonal_product_evaluator_base<Rhs, typename Lhs::DiagonalVectorType, Product<Lhs, Rhs, LazyProduct>, OnTheLeft> Base;
-  using Base::m_diagImpl;
-  using Base::m_matImpl;
-  using Base::coeff;
-  using Base::packet_impl;
-  typedef typename Base::Scalar Scalar;
-  typedef typename Base::PacketScalar PacketScalar;
-  
-  typedef Product<Lhs, Rhs, ProductKind> XprType;
-  typedef typename XprType::PlainObject PlainObject;
-  
-  enum {
-    StorageOrder = int(Rhs::Flags) & RowMajorBit ? RowMajor : ColMajor
-  };
-
-  EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr)
-    : Base(xpr.rhs(), xpr.lhs().diagonal())
-  {
-  }
-  
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index row, Index col) const
-  {
-    return m_diagImpl.coeff(row) * m_matImpl.coeff(row, col);
-  }
-  
-#ifndef __CUDACC__
-  template<int LoadMode>
-  EIGEN_STRONG_INLINE PacketScalar packet(Index row, Index col) const
-  {
-    // NVCC complains about template keyword, so we disable this function in CUDA mode
-    return this->template packet_impl<LoadMode>(row,col, row,
-                                 typename internal::conditional<int(StorageOrder)==RowMajor, internal::true_type, internal::false_type>::type());
-  }
-  
-  template<int LoadMode>
-  EIGEN_STRONG_INLINE PacketScalar packet(Index idx) const
-  {
-    return packet<LoadMode>(int(StorageOrder)==ColMajor?idx:0,int(StorageOrder)==ColMajor?0:idx);
-  }
-#endif
-};
-
-// dense * diagonal
-template<typename Lhs, typename Rhs, int ProductKind, int ProductTag>
-struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DenseShape, DiagonalShape, typename Lhs::Scalar, typename Rhs::Scalar> 
-  : diagonal_product_evaluator_base<Lhs, typename Rhs::DiagonalVectorType, Product<Lhs, Rhs, LazyProduct>, OnTheRight>
-{
-  typedef diagonal_product_evaluator_base<Lhs, typename Rhs::DiagonalVectorType, Product<Lhs, Rhs, LazyProduct>, OnTheRight> Base;
-  using Base::m_diagImpl;
-  using Base::m_matImpl;
-  using Base::coeff;
-  using Base::packet_impl;
-  typedef typename Base::Scalar Scalar;
-  typedef typename Base::PacketScalar PacketScalar;
-  
-  typedef Product<Lhs, Rhs, ProductKind> XprType;
-  typedef typename XprType::PlainObject PlainObject;
-  
-  enum { StorageOrder = int(Lhs::Flags) & RowMajorBit ? RowMajor : ColMajor };
-
-  EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr)
-    : Base(xpr.lhs(), xpr.rhs().diagonal())
-  {
-  }
-  
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index row, Index col) const
-  {
-    return m_matImpl.coeff(row, col) * m_diagImpl.coeff(col);
-  }
-  
-#ifndef __CUDACC__
-  template<int LoadMode>
-  EIGEN_STRONG_INLINE PacketScalar packet(Index row, Index col) const
-  {
-    return this->template packet_impl<LoadMode>(row,col, col,
-                                 typename internal::conditional<int(StorageOrder)==ColMajor, internal::true_type, internal::false_type>::type());
-  }
-  
-  template<int LoadMode>
-  EIGEN_STRONG_INLINE PacketScalar packet(Index idx) const
-  {
-    return packet<LoadMode>(int(StorageOrder)==ColMajor?idx:0,int(StorageOrder)==ColMajor?0:idx);
-  }
-#endif
-};
-
-/***************************************************************************
-* Products with permutation matrices
-***************************************************************************/
-  
-template<typename Lhs, typename Rhs, int ProductTag>
-struct generic_product_impl<Lhs, Rhs, PermutationShape, DenseShape, ProductTag>
-{
-  template<typename Dest>
-  static void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs)
-  {
-    permut_matrix_product_retval<Lhs, Rhs, OnTheLeft, false> pmpr(lhs, rhs);
-    pmpr.evalTo(dst);
-  }
-};
-
-template<typename Lhs, typename Rhs, int ProductTag>
-struct generic_product_impl<Lhs, Rhs, DenseShape, PermutationShape, ProductTag>
-{
-  template<typename Dest>
-  static void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs)
-  {
-    permut_matrix_product_retval<Rhs, Lhs, OnTheRight, false> pmpr(rhs, lhs);
-    pmpr.evalTo(dst);
-  }
-};
-
-template<typename Lhs, typename Rhs, int ProductTag>
-struct generic_product_impl<Transpose<Lhs>, Rhs, PermutationShape, DenseShape, ProductTag>
-{
-  template<typename Dest>
-  static void evalTo(Dest& dst, const Transpose<Lhs>& lhs, const Rhs& rhs)
-  {
-    permut_matrix_product_retval<Lhs, Rhs, OnTheLeft, true> pmpr(lhs.nestedPermutation(), rhs);
-    pmpr.evalTo(dst);
-  }
-};
-
-template<typename Lhs, typename Rhs, int ProductTag>
-struct generic_product_impl<Lhs, Transpose<Rhs>, DenseShape, PermutationShape, ProductTag>
-{
-  template<typename Dest>
-  static void evalTo(Dest& dst, const Lhs& lhs, const Transpose<Rhs>& rhs)
-  {
-    permut_matrix_product_retval<Rhs, Lhs, OnTheRight, true> pmpr(rhs.nestedPermutation(), lhs);
-    pmpr.evalTo(dst);
-  }
-};
-
-} // end namespace internal
-
-} // end namespace Eigen
-
-#endif // EIGEN_PRODUCT_EVALUATORS_H
--- a/Eigen/src/Core/Random.h
+++ b/Eigen/src/Core/Random.h
@@ -28,18 +28,12 @@ struct functor_traits<scalar_random_op<Scalar> >

 /** \returns a random matrix expression
  *
-  * Numbers are uniformly spread through their whole definition range for integer types,
-  * and in the [-1:1] range for floating point scalar types.
-  * 
  * The parameters \a rows and \a cols are the number of rows and of columns of
  * the returned matrix. Must be compatible with this MatrixBase type.
  *
-  * \not_reentrant
-  * 
  * This variant is meant to be used for dynamic-size matrix types. For fixed-size types,
  * it is redundant to pass \a rows and \a cols as arguments, so Random() should be used
  * instead.
-  * 
  *
  * Example: \include MatrixBase_random_int_int.cpp
  * Output: \verbinclude MatrixBase_random_int_int.out
@@ -47,10 +41,8 @@ struct functor_traits<scalar_random_op<Scalar> >
  * This expression has the "evaluate before nesting" flag so that it will be evaluated into
  * a temporary matrix whenever it is nested in a larger expression. This prevents unexpected
  * behavior with expressions involving random matrices.
-  * 
-  * See DenseBase::NullaryExpr(Index, const CustomNullaryOp&) for an example using C++11 random generators.
  *
-  * \sa DenseBase::setRandom(), DenseBase::Random(Index), DenseBase::Random()
+  * \sa MatrixBase::setRandom(), MatrixBase::Random(Index), MatrixBase::Random()
  */
 template<typename Derived>
 inline const CwiseNullaryOp<internal::scalar_random_op<typename internal::traits<Derived>::Scalar>, Derived>
@@ -60,15 +52,11 @@ DenseBase<Derived>::Random(Index rows, Index cols)
 }

 /** \returns a random vector expression
-  *
-  * Numbers are uniformly spread through their whole definition range for integer types,
-  * and in the [-1:1] range for floating point scalar types.
  *
  * The parameter \a size is the size of the returned vector.
  * Must be compatible with this MatrixBase type.
  *
  * \only_for_vectors
-  * \not_reentrant
  *
  * This variant is meant to be used for dynamic-size vector types. For fixed-size types,
  * it is redundant to pass \a size as argument, so Random() should be used
@@ -81,7 +69,7 @@ DenseBase<Derived>::Random(Index rows, Index cols)
  * a temporary vector whenever it is nested in a larger expression. This prevents unexpected
  * behavior with expressions involving random matrices.
  *
-  * \sa DenseBase::setRandom(), DenseBase::Random(Index,Index), DenseBase::Random()
+  * \sa MatrixBase::setRandom(), MatrixBase::Random(Index,Index), MatrixBase::Random()
  */
 template<typename Derived>
 inline const CwiseNullaryOp<internal::scalar_random_op<typename internal::traits<Derived>::Scalar>, Derived>
@@ -92,9 +80,6 @@ DenseBase<Derived>::Random(Index size)

 /** \returns a fixed-size random matrix or vector expression
  *
-  * Numbers are uniformly spread through their whole definition range for integer types,
-  * and in the [-1:1] range for floating point scalar types.
-  * 
  * This variant is only for fixed-size MatrixBase types. For dynamic-size types, you
  * need to use the variants taking size arguments.
  *
@@ -104,10 +89,8 @@ DenseBase<Derived>::Random(Index size)
  * This expression has the "evaluate before nesting" flag so that it will be evaluated into
  * a temporary matrix whenever it is nested in a larger expression. This prevents unexpected
  * behavior with expressions involving random matrices.
-  * 
-  * \not_reentrant
  *
-  * \sa DenseBase::setRandom(), DenseBase::Random(Index,Index), DenseBase::Random(Index)
+  * \sa MatrixBase::setRandom(), MatrixBase::Random(Index,Index), MatrixBase::Random(Index)
  */
 template<typename Derived>
 inline const CwiseNullaryOp<internal::scalar_random_op<typename internal::traits<Derived>::Scalar>, Derived>
@@ -118,11 +101,6 @@ DenseBase<Derived>::Random()

 /** Sets all coefficients in this expression to random values.
  *
-  * Numbers are uniformly spread through their whole definition range for integer types,
-  * and in the [-1:1] range for floating point scalar types.
-  * 
-  * \not_reentrant
-  * 
  * Example: \include MatrixBase_setRandom.cpp
  * Output: \verbinclude MatrixBase_setRandom.out
  *
@@ -136,16 +114,12 @@ inline Derived& DenseBase<Derived>::setRandom()

 /** Resizes to the given \a newSize, and sets all coefficients in this expression to random values.
  *
-  * Numbers are uniformly spread through their whole definition range for integer types,
-  * and in the [-1:1] range for floating point scalar types.
-  * 
  * \only_for_vectors
-  * \not_reentrant
  *
  * Example: \include Matrix_setRandom_int.cpp
  * Output: \verbinclude Matrix_setRandom_int.out
  *
-  * \sa DenseBase::setRandom(), setRandom(Index,Index), class CwiseNullaryOp, DenseBase::Random()
+  * \sa MatrixBase::setRandom(), setRandom(Index,Index), class CwiseNullaryOp, MatrixBase::Random()
  */
 template<typename Derived>
 EIGEN_STRONG_INLINE Derived&
@@ -157,18 +131,13 @@ PlainObjectBase<Derived>::setRandom(Index newSize)

 /** Resizes to the given size, and sets all coefficients in this expression to random values.
  *
-  * Numbers are uniformly spread through their whole definition range for integer types,
-  * and in the [-1:1] range for floating point scalar types.
-  *
-  * \not_reentrant
-  * 
  * \param nbRows the new number of rows
  * \param nbCols the new number of columns
  *
  * Example: \include Matrix_setRandom_int_int.cpp
  * Output: \verbinclude Matrix_setRandom_int_int.out
  *
-  * \sa DenseBase::setRandom(), setRandom(Index), class CwiseNullaryOp, DenseBase::Random()
+  * \sa MatrixBase::setRandom(), setRandom(Index), class CwiseNullaryOp, MatrixBase::Random()
  */
 template<typename Derived>
 EIGEN_STRONG_INLINE Derived&
--- a/Eigen/src/Core/Redux.h
+++ b/Eigen/src/Core/Redux.h
@@ -65,25 +65,6 @@ public:
              ? CompleteUnrolling
              : NoUnrolling
  };
-  
-#ifdef EIGEN_DEBUG_ASSIGN
-  static void debug()
-  {
-    std::cerr << "Xpr: " << typeid(typename Derived::XprType).name() << std::endl;
-    std::cerr.setf(std::ios::hex, std::ios::basefield);
-    EIGEN_DEBUG_VAR(Derived::Flags)
-    std::cerr.unsetf(std::ios::hex);
-    EIGEN_DEBUG_VAR(InnerMaxSize)
-    EIGEN_DEBUG_VAR(PacketSize)
-    EIGEN_DEBUG_VAR(MightVectorize)
-    EIGEN_DEBUG_VAR(MayLinearVectorize)
-    EIGEN_DEBUG_VAR(MaySliceVectorize)
-    EIGEN_DEBUG_VAR(Traversal)
-    EIGEN_DEBUG_VAR(UnrollingLimit)
-    EIGEN_DEBUG_VAR(Unrolling)
-    std::cerr << std::endl;
-  }
-#endif
 };

 /***************************************************************************
@@ -101,7 +82,6 @@ struct redux_novec_unroller

  typedef typename Derived::Scalar Scalar;

-  EIGEN_DEVICE_FUNC
  static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func)
  {
    return func(redux_novec_unroller<Func, Derived, Start, HalfLength>::run(mat,func),
@@ -119,7 +99,6 @@ struct redux_novec_unroller<Func, Derived, Start, 1>

  typedef typename Derived::Scalar Scalar;

-  EIGEN_DEVICE_FUNC
  static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func&)
  {
    return mat.coeffByOuterInner(outer, inner);
@@ -133,7 +112,6 @@ template<typename Func, typename Derived, int Start>
 struct redux_novec_unroller<Func, Derived, Start, 0>
 {
  typedef typename Derived::Scalar Scalar;
-  EIGEN_DEVICE_FUNC 
  static EIGEN_STRONG_INLINE Scalar run(const Derived&, const Func&) { return Scalar(); }
 };

@@ -191,8 +169,8 @@ template<typename Func, typename Derived>
 struct redux_impl<Func, Derived, DefaultTraversal, NoUnrolling>
 {
  typedef typename Derived::Scalar Scalar;
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func)
+  typedef typename Derived::Index Index;
+  static EIGEN_STRONG_INLINE Scalar run(const Derived& mat, const Func& func)
  {
    eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix");
    Scalar res;
@@ -216,15 +194,16 @@ struct redux_impl<Func, Derived, LinearVectorizedTraversal, NoUnrolling>
 {
  typedef typename Derived::Scalar Scalar;
  typedef typename packet_traits<Scalar>::type PacketScalar;
+  typedef typename Derived::Index Index;

-  static Scalar run(const Derived &mat, const Func& func)
+  static Scalar run(const Derived& mat, const Func& func)
  {
    const Index size = mat.size();
-    
+    eigen_assert(size && "you are using an empty matrix");
    const Index packetSize = packet_traits<Scalar>::size;
    const Index alignedStart = internal::first_aligned(mat);
    enum {
-      alignment = (bool(Derived::Flags & DirectAccessBit) && bool(packet_traits<Scalar>::AlignedOnScalar)) || bool(Derived::Flags & AlignedBit)
+      alignment = bool(Derived::Flags & DirectAccessBit) || bool(Derived::Flags & AlignedBit)
                ? Aligned : Unaligned
    };
    const Index alignedSize2 = ((size-alignedStart)/(2*packetSize))*(2*packetSize);
@@ -268,13 +247,15 @@ struct redux_impl<Func, Derived, LinearVectorizedTraversal, NoUnrolling>
  }
 };

-template<typename Func, typename Derived>
-struct redux_impl<Func, Derived, SliceVectorizedTraversal, NoUnrolling>
+// NOTE: for SliceVectorizedTraversal we simply bypass unrolling
+template<typename Func, typename Derived, int Unrolling>
+struct redux_impl<Func, Derived, SliceVectorizedTraversal, Unrolling>
 {
  typedef typename Derived::Scalar Scalar;
  typedef typename packet_traits<Scalar>::type PacketScalar;
+  typedef typename Derived::Index Index;

-  EIGEN_DEVICE_FUNC static Scalar run(const Derived &mat, const Func& func)
+  static Scalar run(const Derived& mat, const Func& func)
  {
    eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix");
    const Index innerSize = mat.innerSize();
@@ -316,80 +297,16 @@ struct redux_impl<Func, Derived, LinearVectorizedTraversal, CompleteUnrolling>
    Size = Derived::SizeAtCompileTime,
    VectorizedSize = (Size / PacketSize) * PacketSize
  };
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func)
+  static EIGEN_STRONG_INLINE Scalar run(const Derived& mat, const Func& func)
  {
    eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix");
-    if (VectorizedSize > 0) {
-      Scalar res = func.predux(redux_vec_unroller<Func, Derived, 0, Size / PacketSize>::run(mat,func));
-      if (VectorizedSize != Size)
-        res = func(res,redux_novec_unroller<Func, Derived, VectorizedSize, Size-VectorizedSize>::run(mat,func));
-      return res;
-    }
-    else {
-      return redux_novec_unroller<Func, Derived, 0, Size>::run(mat,func);
-    }
+    Scalar res = func.predux(redux_vec_unroller<Func, Derived, 0, Size / PacketSize>::run(mat,func));
+    if (VectorizedSize != Size)
+      res = func(res,redux_novec_unroller<Func, Derived, VectorizedSize, Size-VectorizedSize>::run(mat,func));
+    return res;
  }
 };

-// evaluator adaptor
-template<typename _XprType>
-class redux_evaluator
-{
-public:
-  typedef _XprType XprType;
-  EIGEN_DEVICE_FUNC explicit redux_evaluator(const XprType &xpr) : m_evaluator(xpr), m_xpr(xpr) {}
-  
-  typedef typename XprType::Scalar Scalar;
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename XprType::PacketScalar PacketScalar;
-  typedef typename XprType::PacketReturnType PacketReturnType;
-  
-  enum {
-    MaxRowsAtCompileTime = XprType::MaxRowsAtCompileTime,
-    MaxColsAtCompileTime = XprType::MaxColsAtCompileTime,
-    // TODO we should not remove DirectAccessBit and rather find an elegant way to query the alignment offset at runtime from the evaluator
-    Flags = evaluator<XprType>::Flags & ~DirectAccessBit,
-    IsRowMajor = XprType::IsRowMajor,
-    SizeAtCompileTime = XprType::SizeAtCompileTime,
-    InnerSizeAtCompileTime = XprType::InnerSizeAtCompileTime,
-    CoeffReadCost = evaluator<XprType>::CoeffReadCost
-  };
-  
-  EIGEN_DEVICE_FUNC Index rows() const { return m_xpr.rows(); }
-  EIGEN_DEVICE_FUNC Index cols() const { return m_xpr.cols(); }
-  EIGEN_DEVICE_FUNC Index size() const { return m_xpr.size(); }
-  EIGEN_DEVICE_FUNC Index innerSize() const { return m_xpr.innerSize(); }
-  EIGEN_DEVICE_FUNC Index outerSize() const { return m_xpr.outerSize(); }
-
-  EIGEN_DEVICE_FUNC
-  CoeffReturnType coeff(Index row, Index col) const
-  { return m_evaluator.coeff(row, col); }
-
-  EIGEN_DEVICE_FUNC
-  CoeffReturnType coeff(Index index) const
-  { return m_evaluator.coeff(index); }
-
-  template<int LoadMode>
-  PacketReturnType packet(Index row, Index col) const
-  { return m_evaluator.template packet<LoadMode>(row, col); }
-
-  template<int LoadMode>
-  PacketReturnType packet(Index index) const
-  { return m_evaluator.template packet<LoadMode>(index); }
-  
-  EIGEN_DEVICE_FUNC
-  CoeffReturnType coeffByOuterInner(Index outer, Index inner) const
-  { return m_evaluator.coeff(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }
-  
-  template<int LoadMode>
-  PacketReturnType packetByOuterInner(Index outer, Index inner) const
-  { return m_evaluator.template packet<LoadMode>(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }
-  
-protected:
-  typename internal::evaluator<XprType>::nestedType m_evaluator;
-  const XprType &m_xpr;
-};
-
 } // end namespace internal

 /***************************************************************************
@@ -400,7 +317,7 @@ protected:
 /** \returns the result of a full redux operation on the whole matrix or vector using \a func
  *
  * The template parameter \a BinaryOp is the type of the functor \a func which must be
-  * an associative operator. Both current C++98 and C++11 functor styles are handled.
+  * an associative operator. Both current STL and TR1 functor styles are handled.
  *
  * \sa DenseBase::sum(), DenseBase::minCoeff(), DenseBase::maxCoeff(), MatrixBase::colwise(), MatrixBase::rowwise()
  */
@@ -409,22 +326,9 @@ template<typename Func>
 EIGEN_STRONG_INLINE typename internal::result_of<Func(typename internal::traits<Derived>::Scalar)>::type
 DenseBase<Derived>::redux(const Func& func) const
 {
-  eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix");
-  
-  // FIXME, eval_nest should be handled by redux_evaluator, however:
-  //  - it is currently difficult to provide the right Flags since they are still handled by the expressions
-  //  - handling it here might reduce the number of template instantiations
-//   typedef typename internal::nested_eval<Derived,1>::type ThisNested;
-//   typedef typename internal::remove_all<ThisNested>::type ThisNestedCleaned;
-//   typedef typename internal::redux_evaluator<ThisNestedCleaned> ThisEvaluator;
-//   
-//   ThisNested thisNested(derived());
-//   ThisEvaluator thisEval(thisNested);
-  
-  typedef typename internal::redux_evaluator<Derived> ThisEvaluator;
-  ThisEvaluator thisEval(derived());
-  
-  return internal::redux_impl<Func, ThisEvaluator>::run(thisEval, func);
+  typedef typename internal::remove_all<typename Derived::Nested>::type ThisNested;
+  return internal::redux_impl<Func, ThisNested>
+            ::run(derived(), func);
 }

 /** \returns the minimum of all coefficients of \c *this.
@@ -434,7 +338,7 @@ template<typename Derived>
 EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::minCoeff() const
 {
-  return derived().redux(Eigen::internal::scalar_min_op<Scalar>());
+  return this->redux(Eigen::internal::scalar_min_op<Scalar>());
 }

 /** \returns the maximum of all coefficients of \c *this.
@@ -444,7 +348,7 @@ template<typename Derived>
 EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::maxCoeff() const
 {
-  return derived().redux(Eigen::internal::scalar_max_op<Scalar>());
+  return this->redux(Eigen::internal::scalar_max_op<Scalar>());
 }

 /** \returns the sum of all coefficients of *this
@@ -457,7 +361,7 @@ DenseBase<Derived>::sum() const
 {
  if(SizeAtCompileTime==0 || (SizeAtCompileTime==Dynamic && size()==0))
    return Scalar(0);
-  return derived().redux(Eigen::internal::scalar_sum_op<Scalar>());
+  return this->redux(Eigen::internal::scalar_sum_op<Scalar>());
 }

 /** \returns the mean of all coefficients of *this
@@ -468,7 +372,7 @@ template<typename Derived>
 EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::mean() const
 {
-  return Scalar(derived().redux(Eigen::internal::scalar_sum_op<Scalar>())) / Scalar(this->size());
+  return Scalar(this->redux(Eigen::internal::scalar_sum_op<Scalar>())) / Scalar(this->size());
 }

 /** \returns the product of all coefficients of *this
@@ -484,7 +388,7 @@ DenseBase<Derived>::prod() const
 {
  if(SizeAtCompileTime==0 || (SizeAtCompileTime==Dynamic && size()==0))
    return Scalar(1);
-  return derived().redux(Eigen::internal::scalar_product_op<Scalar>());
+  return this->redux(Eigen::internal::scalar_product_op<Scalar>());
 }

 /** \returns the trace of \c *this, i.e. the sum of the coefficients on the main diagonal.
--- a/Eigen/src/Core/Ref.h
+++ b/Eigen/src/Core/Ref.h
@@ -12,20 +12,24 @@

 namespace Eigen { 

+template<typename Derived> class RefBase;
+template<typename PlainObjectType, int Options = 0,
+         typename StrideType = typename internal::conditional<PlainObjectType::IsVectorAtCompileTime,InnerStride<1>,OuterStride<> >::type > class Ref;
+
 /** \class Ref
  * \ingroup Core_Module
  *
-  * \brief A matrix or vector expression mapping an existing expression
+  * \brief A matrix or vector expression mapping an existing expressions
  *
  * \tparam PlainObjectType the equivalent matrix type of the mapped data
  * \tparam Options specifies whether the pointer is \c #Aligned, or \c #Unaligned.
  *                The default is \c #Unaligned.
  * \tparam StrideType optionally specifies strides. By default, Ref implies a contiguous storage along the inner dimension (inner stride==1),
-  *                   but accepts a variable outer stride (leading dimension).
+  *                   but accept a variable outer stride (leading dimension).
  *                   This can be overridden by specifying strides.
  *                   The type passed here must be a specialization of the Stride template, see examples below.
  *
-  * This class provides a way to write non-template functions taking Eigen objects as parameters while limiting the number of copies.
+  * This class permits to write non template functions taking Eigen's object as parameters while limiting the number of copies.
  * A Ref<> object can represent either a const expression or a l-value:
  * \code
  * // in-out argument:
@@ -35,10 +39,10 @@ namespace Eigen {
  * void foo2(const Ref<const VectorXf>& x);
  * \endcode
  *
-  * In the in-out case, the input argument must satisfy the constraints of the actual Ref<> type, otherwise a compilation issue will be triggered.
+  * In the in-out case, the input argument must satisfies the constraints of the actual Ref<> type, otherwise a compilation issue will be triggered.
  * By default, a Ref<VectorXf> can reference any dense vector expression of float having a contiguous memory layout.
-  * Likewise, a Ref<MatrixXf> can reference any column-major dense matrix expression of float whose column's elements are contiguously stored with
-  * the possibility to have a constant space in-between each column, i.e. the inner stride must be equal to 1, but the outer stride (or leading dimension)
+  * Likewise, a Ref<MatrixXf> can reference any column major dense matrix expression of float whose column's elements are contiguously stored with
+  * the possibility to have a constant space inbetween each column, i.e.: the inner stride mmust be equal to 1, but the outer-stride (or leading dimension),
  * can be greater than the number of rows.
  *
  * In the const case, if the input expression does not match the above requirement, then it is evaluated into a temporary before being passed to the function.
@@ -54,15 +58,15 @@ namespace Eigen {
  * foo2(A.col().segment(2,4)); // No temporary
  * \endcode
  *
-  * The range of inputs that can be referenced without temporary can be enlarged using the last two template parameters.
+  * The range of inputs that can be referenced without temporary can be enlarged using the last two template parameter.
  * Here is an example accepting an innerstride!=1:
  * \code
  * // in-out argument:
  * void foo3(Ref<VectorXf,0,InnerStride<> > x);
  * foo3(A.row());              // OK
  * \endcode
-  * The downside here is that the function foo3 might be significantly slower than foo1 because it won't be able to exploit vectorization, and will involve more
-  * expensive address computations even if the input is contiguously stored in memory. To overcome this issue, one might propose to overload internally calling a
+  * The downside here is that the function foo3 might be significantly slower than foo1 because it won't be able to exploit vectorization, and will involved more
+  * expensive address computations even if the input is contiguously stored in memory. To overcome this issue, one might propose to overloads internally calling a
  * template function, e.g.:
  * \code
  * // in the .h:
@@ -104,7 +108,8 @@ struct traits<Ref<_PlainObjectType, _Options, _StrideType> >
      OuterStrideMatch = Derived::IsVectorAtCompileTime
                      || int(StrideType::OuterStrideAtCompileTime)==int(Dynamic) || int(StrideType::OuterStrideAtCompileTime)==int(Derived::OuterStrideAtCompileTime),
      AlignmentMatch = (_Options!=Aligned) || ((PlainObjectType::Flags&AlignedBit)==0) || ((traits<Derived>::Flags&AlignedBit)==AlignedBit),
-      MatchAtCompileTime = HasDirectAccess && StorageOrderMatch && InnerStrideMatch && OuterStrideMatch && AlignmentMatch
+      ScalarTypeMatch = internal::is_same<typename PlainObjectType::Scalar, typename Derived::Scalar>::value,
+      MatchAtCompileTime = HasDirectAccess && StorageOrderMatch && InnerStrideMatch && OuterStrideMatch && AlignmentMatch && ScalarTypeMatch
    };
    typedef typename internal::conditional<MatchAtCompileTime,internal::true_type,internal::false_type>::type type;
  };
@@ -127,12 +132,12 @@ public:
  typedef MapBase<Derived> Base;
  EIGEN_DENSE_PUBLIC_INTERFACE(RefBase)

-  EIGEN_DEVICE_FUNC inline Index innerStride() const
+  inline Index innerStride() const
  {
    return StrideType::InnerStrideAtCompileTime != 0 ? m_stride.inner() : 1;
  }

-  EIGEN_DEVICE_FUNC inline Index outerStride() const
+  inline Index outerStride() const
  {
    return StrideType::OuterStrideAtCompileTime != 0 ? m_stride.outer()
         : IsVectorAtCompileTime ? this->size()
@@ -140,7 +145,7 @@ public:
         : this->rows();
  }

-  EIGEN_DEVICE_FUNC RefBase()
+  RefBase()
    : Base(0,RowsAtCompileTime==Dynamic?0:RowsAtCompileTime,ColsAtCompileTime==Dynamic?0:ColsAtCompileTime),
      // Stride<> does not allow default ctor for Dynamic strides, so let' initialize it with dummy values:
      m_stride(StrideType::OuterStrideAtCompileTime==Dynamic?0:StrideType::OuterStrideAtCompileTime,
@@ -154,7 +159,7 @@ protected:
  typedef Stride<StrideType::OuterStrideAtCompileTime,StrideType::InnerStrideAtCompileTime> StrideBase;

  template<typename Expression>
-  EIGEN_DEVICE_FUNC void construct(Expression& expr)
+  void construct(Expression& expr)
  {
    if(PlainObjectType::RowsAtCompileTime==1)
    {
@@ -183,9 +188,11 @@ protected:
 template<typename PlainObjectType, int Options, typename StrideType> class Ref
  : public RefBase<Ref<PlainObjectType, Options, StrideType> >
 {
+  private:
    typedef internal::traits<Ref> Traits;
    template<typename Derived>
-    EIGEN_DEVICE_FUNC inline Ref(const PlainObjectBase<Derived>& expr);
+    inline Ref(const PlainObjectBase<Derived>& expr,
+               typename internal::enable_if<bool(Traits::template match<Derived>::MatchAtCompileTime),Derived>::type* = 0);
  public:

    typedef RefBase<Ref> Base;
@@ -194,21 +201,23 @@ template<typename PlainObjectType, int Options, typename StrideType> class Ref

    #ifndef EIGEN_PARSED_BY_DOXYGEN
    template<typename Derived>
-    EIGEN_DEVICE_FUNC inline Ref(PlainObjectBase<Derived>& expr)
+    inline Ref(PlainObjectBase<Derived>& expr,
+               typename internal::enable_if<bool(Traits::template match<Derived>::MatchAtCompileTime),Derived>::type* = 0)
    {
-      EIGEN_STATIC_ASSERT(bool(Traits::template match<Derived>::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH);
+      EIGEN_STATIC_ASSERT(static_cast<bool>(Traits::template match<Derived>::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH);
      Base::construct(expr.derived());
    }
    template<typename Derived>
-    EIGEN_DEVICE_FUNC inline Ref(const DenseBase<Derived>& expr)
+    inline Ref(const DenseBase<Derived>& expr,
+               typename internal::enable_if<bool(Traits::template match<Derived>::MatchAtCompileTime),Derived>::type* = 0)
    #else
    template<typename Derived>
    inline Ref(DenseBase<Derived>& expr)
    #endif
    {
-      EIGEN_STATIC_ASSERT(bool(internal::is_lvalue<Derived>::value), THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY);
-      EIGEN_STATIC_ASSERT(bool(Traits::template match<Derived>::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH);
-      EIGEN_STATIC_ASSERT(!Derived::IsPlainObjectBase,THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY);
+      EIGEN_STATIC_ASSERT(static_cast<bool>(internal::is_lvalue<Derived>::value), THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY);
+      EIGEN_STATIC_ASSERT(static_cast<bool>(Traits::template match<Derived>::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH);
+      enum { THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY = Derived::ThisConstantIsPrivateInPlainObjectBase};
      Base::construct(expr.const_cast_derived());
    }

@@ -227,35 +236,36 @@ template<typename TPlainObjectType, int Options, typename StrideType> class Ref<
    EIGEN_DENSE_PUBLIC_INTERFACE(Ref)

    template<typename Derived>
-    EIGEN_DEVICE_FUNC inline Ref(const DenseBase<Derived>& expr)
+    inline Ref(const DenseBase<Derived>& expr,
+               typename internal::enable_if<bool(Traits::template match<Derived>::ScalarTypeMatch),Derived>::type* = 0)
    {
 //      std::cout << match_helper<Derived>::HasDirectAccess << "," << match_helper<Derived>::OuterStrideMatch << "," << match_helper<Derived>::InnerStrideMatch << "\n";
 //      std::cout << int(StrideType::OuterStrideAtCompileTime) << " - " << int(Derived::OuterStrideAtCompileTime) << "\n";
 //      std::cout << int(StrideType::InnerStrideAtCompileTime) << " - " << int(Derived::InnerStrideAtCompileTime) << "\n";
      construct(expr.derived(), typename Traits::template match<Derived>::type());
    }
-
-    EIGEN_DEVICE_FUNC inline Ref(const Ref& other) : Base(other) {
+    
+    inline Ref(const Ref& other) : Base(other) {
      // copy constructor shall not copy the m_object, to avoid unnecessary malloc and copy
    }

    template<typename OtherRef>
-    EIGEN_DEVICE_FUNC inline Ref(const RefBase<OtherRef>& other) {
+    inline Ref(const RefBase<OtherRef>& other) {
      construct(other.derived(), typename Traits::template match<OtherRef>::type());
    }

  protected:

    template<typename Expression>
-    EIGEN_DEVICE_FUNC void construct(const Expression& expr,internal::true_type)
+    void construct(const Expression& expr,internal::true_type)
    {
      Base::construct(expr);
    }

    template<typename Expression>
-    EIGEN_DEVICE_FUNC void construct(const Expression& expr, internal::false_type)
+    void construct(const Expression& expr, internal::false_type)
    {
-      internal::call_assignment_no_alias(m_object,expr,internal::assign_op<Scalar>());
+      m_object.lazyAssign(expr);
      Base::construct(m_object);
    }

--- a/Eigen/src/Core/Replicate.h
+++ b/Eigen/src/Core/Replicate.h
@@ -53,9 +53,8 @@ struct traits<Replicate<MatrixType,RowFactor,ColFactor> >
    IsRowMajor = MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1 ? 1
               : MaxColsAtCompileTime==1 && MaxRowsAtCompileTime!=1 ? 0
               : (MatrixType::Flags & RowMajorBit) ? 1 : 0,
-    
-    // FIXME enable DirectAccess with negative strides?
-    Flags = IsRowMajor ? RowMajorBit : 0
+    Flags = (_MatrixTypeNested::Flags & HereditaryBits & ~RowMajorBit) | (IsRowMajor ? RowMajorBit : 0),
+    CoeffReadCost = _MatrixTypeNested::CoeffReadCost
  };
 };
 }
@@ -69,7 +68,6 @@ template<typename MatrixType,int RowFactor,int ColFactor> class Replicate

    typedef typename internal::dense_xpr_base<Replicate>::type Base;
    EIGEN_DENSE_PUBLIC_INTERFACE(Replicate)
-    typedef typename internal::remove_all<MatrixType>::type NestedExpression;

    template<typename OriginalMatrixType>
    inline explicit Replicate(const OriginalMatrixType& a_matrix)
@@ -137,7 +135,7 @@ template<typename MatrixType,int RowFactor,int ColFactor> class Replicate
  */
 template<typename Derived>
 template<int RowFactor, int ColFactor>
-inline const Replicate<Derived,RowFactor,ColFactor>
+const Replicate<Derived,RowFactor,ColFactor>
 DenseBase<Derived>::replicate() const
 {
  return Replicate<Derived,RowFactor,ColFactor>(derived());
@@ -152,7 +150,7 @@ DenseBase<Derived>::replicate() const
  * \sa VectorwiseOp::replicate(), DenseBase::replicate<int,int>(), class Replicate
  */
 template<typename Derived>
-inline const Replicate<Derived,Dynamic,Dynamic>
+const typename DenseBase<Derived>::ReplicateReturnType
 DenseBase<Derived>::replicate(Index rowFactor,Index colFactor) const
 {
  return Replicate<Derived,Dynamic,Dynamic>(derived(),rowFactor,colFactor);
--- a/Eigen/src/Core/ReturnByValue.h
+++ b/Eigen/src/Core/ReturnByValue.h
@@ -38,10 +38,9 @@ struct traits<ReturnByValue<Derived> >
 * So internal::nested always gives the plain return matrix type.
 *
 * FIXME: I don't understand why we need this specialization: isn't this taken care of by the EvalBeforeNestingBit ??
- * Answer: EvalBeforeNestingBit should be deprecated since we have the evaluators
 */
 template<typename Derived,int n,typename PlainObject>
-struct nested_eval<ReturnByValue<Derived>, n, PlainObject>
+struct nested<ReturnByValue<Derived>, n, PlainObject>
 {
  typedef typename traits<Derived>::ReturnType type;
 };
@@ -49,7 +48,7 @@ struct nested_eval<ReturnByValue<Derived>, n, PlainObject>
 } // end namespace internal

 template<typename Derived> class ReturnByValue
-  : public internal::dense_xpr_base< ReturnByValue<Derived> >::type, internal::no_assignment_operator
+  : internal::no_assignment_operator, public internal::dense_xpr_base< ReturnByValue<Derived> >::type
 {
  public:
    typedef typename internal::traits<Derived>::ReturnType ReturnType;
@@ -58,11 +57,10 @@ template<typename Derived> class ReturnByValue
    EIGEN_DENSE_PUBLIC_INTERFACE(ReturnByValue)

    template<typename Dest>
-    EIGEN_DEVICE_FUNC
    inline void evalTo(Dest& dst) const
    { static_cast<const Derived*>(this)->evalTo(dst); }
-    EIGEN_DEVICE_FUNC inline Index rows() const { return static_cast<const Derived*>(this)->rows(); }
-    EIGEN_DEVICE_FUNC inline Index cols() const { return static_cast<const Derived*>(this)->cols(); }
+    inline Index rows() const { return static_cast<const Derived*>(this)->rows(); }
+    inline Index cols() const { return static_cast<const Derived*>(this)->cols(); }

 #ifndef EIGEN_PARSED_BY_DOXYGEN
 #define Unusable YOU_ARE_TRYING_TO_ACCESS_A_SINGLE_COEFFICIENT_IN_A_SPECIAL_EXPRESSION_WHERE_THAT_IS_NOT_ALLOWED_BECAUSE_THAT_WOULD_BE_INEFFICIENT
@@ -74,7 +72,8 @@ template<typename Derived> class ReturnByValue
    const Unusable& coeff(Index,Index) const { return *reinterpret_cast<const Unusable*>(this); }
    Unusable& coeffRef(Index) { return *reinterpret_cast<Unusable*>(this); }
    Unusable& coeffRef(Index,Index) { return *reinterpret_cast<Unusable*>(this); }
-#undef Unusable
+    template<int LoadMode>  Unusable& packet(Index) const;
+    template<int LoadMode>  Unusable& packet(Index, Index) const;
 #endif
 };

@@ -86,35 +85,14 @@ Derived& DenseBase<Derived>::operator=(const ReturnByValue<OtherDerived>& other)
  return derived();
 }

-namespace internal {
-
-// Expression is evaluated in a temporary; default implementation of Assignment is bypassed so that
-// when a ReturnByValue expression is assigned, the evaluator is not constructed.
-// TODO: Finalize port to new regime; ReturnByValue should not exist in the expression world
-  
 template<typename Derived>
-struct evaluator<ReturnByValue<Derived> >
-  : public evaluator<typename internal::traits<Derived>::ReturnType>::type
+template<typename OtherDerived>
+Derived& DenseBase<Derived>::lazyAssign(const ReturnByValue<OtherDerived>& other)
 {
-  typedef ReturnByValue<Derived> XprType;
-  typedef typename internal::traits<Derived>::ReturnType PlainObject;
-  typedef typename evaluator<PlainObject>::type Base;
-  
-  typedef evaluator type;
-  typedef evaluator nestedType;
+  other.evalTo(derived());
+  return derived();
+}

-  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr)
-    : m_result(xpr.rows(), xpr.cols())
-  {
-    ::new (static_cast<Base*>(this)) Base(m_result);
-    xpr.evalTo(m_result);
-  }
-
-protected:
-  PlainObject m_result;
-};
-
-} // end namespace internal

 } // end namespace Eigen

--- a/Eigen/src/Core/Reverse.h
+++ b/Eigen/src/Core/Reverse.h
@@ -44,7 +44,14 @@ struct traits<Reverse<MatrixType, Direction> >
    ColsAtCompileTime = MatrixType::ColsAtCompileTime,
    MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
    MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime,
-    Flags = _MatrixTypeNested::Flags & (RowMajorBit | LvalueBit)
+
+    // let's enable LinearAccess only with vectorization because of the product overhead
+    LinearAccess = ( (Direction==BothDirections) && (int(_MatrixTypeNested::Flags)&PacketAccessBit) )
+                 ? LinearAccessBit : 0,
+
+    Flags = int(_MatrixTypeNested::Flags) & (HereditaryBits | LvalueBit | PacketAccessBit | LinearAccess),
+
+    CoeffReadCost = _MatrixTypeNested::CoeffReadCost
  };
 };

@@ -67,7 +74,6 @@ template<typename MatrixType, int Direction> class Reverse

    typedef typename internal::dense_xpr_base<Reverse>::type Base;
    EIGEN_DENSE_PUBLIC_INTERFACE(Reverse)
-    typedef typename internal::remove_all<MatrixType>::type NestedExpression;
    using Base::IsRowMajor;

    // next line is necessary because otherwise const version of operator()
@@ -89,47 +95,47 @@ template<typename MatrixType, int Direction> class Reverse
    typedef internal::reverse_packet_cond<PacketScalar,ReversePacket> reverse_packet;
  public:

-    EIGEN_DEVICE_FUNC explicit inline Reverse(const MatrixType& matrix) : m_matrix(matrix) { }
+    inline Reverse(const MatrixType& matrix) : m_matrix(matrix) { }

    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Reverse)

-    EIGEN_DEVICE_FUNC inline Index rows() const { return m_matrix.rows(); }
-    EIGEN_DEVICE_FUNC inline Index cols() const { return m_matrix.cols(); }
+    inline Index rows() const { return m_matrix.rows(); }
+    inline Index cols() const { return m_matrix.cols(); }

-    EIGEN_DEVICE_FUNC inline Index innerStride() const
+    inline Index innerStride() const
    {
      return -m_matrix.innerStride();
    }

-    EIGEN_DEVICE_FUNC inline Scalar& operator()(Index row, Index col)
+    inline Scalar& operator()(Index row, Index col)
    {
      eigen_assert(row >= 0 && row < rows() && col >= 0 && col < cols());
      return coeffRef(row, col);
    }

-    EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index row, Index col)
+    inline Scalar& coeffRef(Index row, Index col)
    {
      return m_matrix.const_cast_derived().coeffRef(ReverseRow ? m_matrix.rows() - row - 1 : row,
                                                    ReverseCol ? m_matrix.cols() - col - 1 : col);
    }

-    EIGEN_DEVICE_FUNC inline CoeffReturnType coeff(Index row, Index col) const
+    inline CoeffReturnType coeff(Index row, Index col) const
    {
      return m_matrix.coeff(ReverseRow ? m_matrix.rows() - row - 1 : row,
                            ReverseCol ? m_matrix.cols() - col - 1 : col);
    }

-    EIGEN_DEVICE_FUNC inline CoeffReturnType coeff(Index index) const
+    inline CoeffReturnType coeff(Index index) const
    {
      return m_matrix.coeff(m_matrix.size() - index - 1);
    }

-    EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index index)
+    inline Scalar& coeffRef(Index index)
    {
      return m_matrix.const_cast_derived().coeffRef(m_matrix.size() - index - 1);
    }

-    EIGEN_DEVICE_FUNC inline Scalar& operator()(Index index)
+    inline Scalar& operator()(Index index)
    {
      eigen_assert(index >= 0 && index < m_matrix.size());
      return coeffRef(index);
@@ -164,7 +170,7 @@ template<typename MatrixType, int Direction> class Reverse
      m_matrix.const_cast_derived().template writePacket<LoadMode>(m_matrix.size() - index - PacketSize, internal::preverse(x));
    }

-    EIGEN_DEVICE_FUNC const typename internal::remove_all<typename MatrixType::Nested>::type&
+    const typename internal::remove_all<typename MatrixType::Nested>::type& 
    nestedExpression() const 
    {
      return m_matrix;
@@ -184,7 +190,7 @@ template<typename Derived>
 inline typename DenseBase<Derived>::ReverseReturnType
 DenseBase<Derived>::reverse()
 {
-  return ReverseReturnType(derived());
+  return derived();
 }

 /** This is the const version of reverse(). */
@@ -192,7 +198,7 @@ template<typename Derived>
 inline const typename DenseBase<Derived>::ConstReverseReturnType
 DenseBase<Derived>::reverse() const
 {
-  return ConstReverseReturnType(derived());
+  return derived();
 }

 /** This is the "in place" version of reverse: it reverses \c *this.
--- a/Eigen/src/Core/Select.h
+++ b/Eigen/src/Core/Select.h
@@ -43,21 +43,23 @@ struct traits<Select<ConditionMatrixType, ThenMatrixType, ElseMatrixType> >
    ColsAtCompileTime = ConditionMatrixType::ColsAtCompileTime,
    MaxRowsAtCompileTime = ConditionMatrixType::MaxRowsAtCompileTime,
    MaxColsAtCompileTime = ConditionMatrixType::MaxColsAtCompileTime,
-    Flags = (unsigned int)ThenMatrixType::Flags & ElseMatrixType::Flags & RowMajorBit
+    Flags = (unsigned int)ThenMatrixType::Flags & ElseMatrixType::Flags & HereditaryBits,
+    CoeffReadCost = traits<typename remove_all<ConditionMatrixNested>::type>::CoeffReadCost
+                  + EIGEN_SIZE_MAX(traits<typename remove_all<ThenMatrixNested>::type>::CoeffReadCost,
+                                   traits<typename remove_all<ElseMatrixNested>::type>::CoeffReadCost)
  };
 };
 }

 template<typename ConditionMatrixType, typename ThenMatrixType, typename ElseMatrixType>
-class Select : public internal::dense_xpr_base< Select<ConditionMatrixType, ThenMatrixType, ElseMatrixType> >::type,
-               internal::no_assignment_operator
+class Select : internal::no_assignment_operator,
+  public internal::dense_xpr_base< Select<ConditionMatrixType, ThenMatrixType, ElseMatrixType> >::type
 {
  public:

    typedef typename internal::dense_xpr_base<Select>::type Base;
    EIGEN_DENSE_PUBLIC_INTERFACE(Select)

-    inline EIGEN_DEVICE_FUNC
    Select(const ConditionMatrixType& a_conditionMatrix,
           const ThenMatrixType& a_thenMatrix,
           const ElseMatrixType& a_elseMatrix)
@@ -67,10 +69,9 @@ class Select : public internal::dense_xpr_base< Select<ConditionMatrixType, Then
      eigen_assert(m_condition.cols() == m_then.cols() && m_condition.cols() == m_else.cols());
    }

-    inline EIGEN_DEVICE_FUNC Index rows() const { return m_condition.rows(); }
-    inline EIGEN_DEVICE_FUNC Index cols() const { return m_condition.cols(); }
+    Index rows() const { return m_condition.rows(); }
+    Index cols() const { return m_condition.cols(); }

-    inline EIGEN_DEVICE_FUNC
    const Scalar coeff(Index i, Index j) const
    {
      if (m_condition.coeff(i,j))
@@ -79,7 +80,6 @@ class Select : public internal::dense_xpr_base< Select<ConditionMatrixType, Then
        return m_else.coeff(i,j);
    }

-    inline EIGEN_DEVICE_FUNC
    const Scalar coeff(Index i) const
    {
      if (m_condition.coeff(i))
@@ -88,17 +88,17 @@ class Select : public internal::dense_xpr_base< Select<ConditionMatrixType, Then
        return m_else.coeff(i);
    }

-    inline EIGEN_DEVICE_FUNC const ConditionMatrixType& conditionMatrix() const
+    const ConditionMatrixType& conditionMatrix() const
    {
      return m_condition;
    }

-    inline EIGEN_DEVICE_FUNC const ThenMatrixType& thenMatrix() const
+    const ThenMatrixType& thenMatrix() const
    {
      return m_then;
    }

-    inline EIGEN_DEVICE_FUNC const ElseMatrixType& elseMatrix() const
+    const ElseMatrixType& elseMatrix() const
    {
      return m_else;
    }
--- a/Eigen/src/Core/SelfAdjointView.h
+++ b/Eigen/src/Core/SelfAdjointView.h
@@ -35,54 +35,51 @@ struct traits<SelfAdjointView<MatrixType, UpLo> > : traits<MatrixType>
  typedef typename nested<MatrixType>::type MatrixTypeNested;
  typedef typename remove_all<MatrixTypeNested>::type MatrixTypeNestedCleaned;
  typedef MatrixType ExpressionType;
-  typedef typename MatrixType::PlainObject FullMatrixType;
+  typedef typename MatrixType::PlainObject DenseMatrixType;
  enum {
    Mode = UpLo | SelfAdjoint,
-    FlagsLvalueBit = is_lvalue<MatrixType>::value ? LvalueBit : 0,
-    Flags =  MatrixTypeNestedCleaned::Flags & (HereditaryBits|FlagsLvalueBit)
-           & (~(PacketAccessBit | DirectAccessBit | LinearAccessBit)) // FIXME these flags should be preserved
+    Flags =  MatrixTypeNestedCleaned::Flags & (HereditaryBits)
+           & (~(PacketAccessBit | DirectAccessBit | LinearAccessBit)), // FIXME these flags should be preserved
+    CoeffReadCost = MatrixTypeNestedCleaned::CoeffReadCost
  };
 };
 }

+template <typename Lhs, int LhsMode, bool LhsIsVector,
+          typename Rhs, int RhsMode, bool RhsIsVector>
+struct SelfadjointProductMatrix;
+
 // FIXME could also be called SelfAdjointWrapper to be consistent with DiagonalWrapper ??
-template<typename _MatrixType, unsigned int UpLo> class SelfAdjointView
-  : public TriangularBase<SelfAdjointView<_MatrixType, UpLo> >
+template<typename MatrixType, unsigned int UpLo> class SelfAdjointView
+  : public TriangularBase<SelfAdjointView<MatrixType, UpLo> >
 {
  public:

-    typedef _MatrixType MatrixType;
    typedef TriangularBase<SelfAdjointView> Base;
    typedef typename internal::traits<SelfAdjointView>::MatrixTypeNested MatrixTypeNested;
    typedef typename internal::traits<SelfAdjointView>::MatrixTypeNestedCleaned MatrixTypeNestedCleaned;

    /** \brief The type of coefficients in this matrix */
    typedef typename internal::traits<SelfAdjointView>::Scalar Scalar; 
-    typedef typename MatrixType::StorageIndex StorageIndex;
+
+    typedef typename MatrixType::Index Index;

    enum {
-      Mode = internal::traits<SelfAdjointView>::Mode,
-      Flags = internal::traits<SelfAdjointView>::Flags
+      Mode = internal::traits<SelfAdjointView>::Mode
    };
    typedef typename MatrixType::PlainObject PlainObject;

-    EIGEN_DEVICE_FUNC
-    explicit inline SelfAdjointView(MatrixType& matrix) : m_matrix(matrix)
+    inline SelfAdjointView(MatrixType& matrix) : m_matrix(matrix)
    {}

-    EIGEN_DEVICE_FUNC
    inline Index rows() const { return m_matrix.rows(); }
-    EIGEN_DEVICE_FUNC
    inline Index cols() const { return m_matrix.cols(); }
-    EIGEN_DEVICE_FUNC
    inline Index outerStride() const { return m_matrix.outerStride(); }
-    EIGEN_DEVICE_FUNC
    inline Index innerStride() const { return m_matrix.innerStride(); }

    /** \sa MatrixBase::coeff()
      * \warning the coordinates must fit into the referenced triangular part
      */
-    EIGEN_DEVICE_FUNC
    inline Scalar coeff(Index row, Index col) const
    {
      Base::check_coordinates_internal(row, col);
@@ -92,46 +89,36 @@ template<typename _MatrixType, unsigned int UpLo> class SelfAdjointView
    /** \sa MatrixBase::coeffRef()
      * \warning the coordinates must fit into the referenced triangular part
      */
-    EIGEN_DEVICE_FUNC
    inline Scalar& coeffRef(Index row, Index col)
    {
-      EIGEN_STATIC_ASSERT_LVALUE(SelfAdjointView);
      Base::check_coordinates_internal(row, col);
      return m_matrix.const_cast_derived().coeffRef(row, col);
    }

    /** \internal */
-    EIGEN_DEVICE_FUNC
    const MatrixTypeNestedCleaned& _expression() const { return m_matrix; }

-    EIGEN_DEVICE_FUNC
    const MatrixTypeNestedCleaned& nestedExpression() const { return m_matrix; }
-    EIGEN_DEVICE_FUNC
    MatrixTypeNestedCleaned& nestedExpression() { return *const_cast<MatrixTypeNestedCleaned*>(&m_matrix); }

-    /** Efficient triangular matrix times vector/matrix product */
+    /** Efficient self-adjoint matrix times vector/matrix product */
    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
-    const Product<SelfAdjointView,OtherDerived>
+    SelfadjointProductMatrix<MatrixType,Mode,false,OtherDerived,0,OtherDerived::IsVectorAtCompileTime>
    operator*(const MatrixBase<OtherDerived>& rhs) const
    {
-      return Product<SelfAdjointView,OtherDerived>(*this, rhs.derived());
+      return SelfadjointProductMatrix
+              <MatrixType,Mode,false,OtherDerived,0,OtherDerived::IsVectorAtCompileTime>
+              (m_matrix, rhs.derived());
    }

-    /** Efficient vector/matrix times triangular matrix product */
+    /** Efficient vector/matrix times self-adjoint matrix product */
    template<typename OtherDerived> friend
-    EIGEN_DEVICE_FUNC
-    const Product<OtherDerived,SelfAdjointView>
+    SelfadjointProductMatrix<OtherDerived,0,OtherDerived::IsVectorAtCompileTime,MatrixType,Mode,false>
    operator*(const MatrixBase<OtherDerived>& lhs, const SelfAdjointView& rhs)
    {
-      return Product<OtherDerived,SelfAdjointView>(lhs.derived(),rhs);
-    }
-    
-    friend EIGEN_DEVICE_FUNC
-    const SelfAdjointView<const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>,MatrixType>,UpLo>
-    operator*(const Scalar& s, const SelfAdjointView& mat)
-    {
-      return (s*mat.nestedExpression()).template selfadjointView<UpLo>();
+      return SelfadjointProductMatrix
+              <OtherDerived,0,OtherDerived::IsVectorAtCompileTime,MatrixType,Mode,false>
+              (lhs.derived(),rhs.m_matrix);
    }

    /** Perform a symmetric rank 2 update of the selfadjoint matrix \c *this:
@@ -145,7 +132,6 @@ template<typename _MatrixType, unsigned int UpLo> class SelfAdjointView
      * \sa rankUpdate(const MatrixBase<DerivedU>&, Scalar)
      */
    template<typename DerivedU, typename DerivedV>
-    EIGEN_DEVICE_FUNC
    SelfAdjointView& rankUpdate(const MatrixBase<DerivedU>& u, const MatrixBase<DerivedV>& v, const Scalar& alpha = Scalar(1));

    /** Perform a symmetric rank K update of the selfadjoint matrix \c *this:
@@ -159,7 +145,6 @@ template<typename _MatrixType, unsigned int UpLo> class SelfAdjointView
      * \sa rankUpdate(const MatrixBase<DerivedU>&, const MatrixBase<DerivedV>&, Scalar)
      */
    template<typename DerivedU>
-    EIGEN_DEVICE_FUNC
    SelfAdjointView& rankUpdate(const MatrixBase<DerivedU>& u, const Scalar& alpha = Scalar(1));

 /////////// Cholesky module ///////////
@@ -174,10 +159,31 @@ template<typename _MatrixType, unsigned int UpLo> class SelfAdjointView
    /** Return type of eigenvalues() */
    typedef Matrix<RealScalar, internal::traits<MatrixType>::ColsAtCompileTime, 1> EigenvaluesReturnType;

-    EIGEN_DEVICE_FUNC
    EigenvaluesReturnType eigenvalues() const;
-    EIGEN_DEVICE_FUNC
    RealScalar operatorNorm() const;
+    
+    #ifdef EIGEN2_SUPPORT
+    template<typename OtherDerived>
+    SelfAdjointView& operator=(const MatrixBase<OtherDerived>& other)
+    {
+      enum {
+        OtherPart = UpLo == Upper ? StrictlyLower : StrictlyUpper
+      };
+      m_matrix.const_cast_derived().template triangularView<UpLo>() = other;
+      m_matrix.const_cast_derived().template triangularView<OtherPart>() = other.adjoint();
+      return *this;
+    }
+    template<typename OtherMatrixType, unsigned int OtherMode>
+    SelfAdjointView& operator=(const TriangularView<OtherMatrixType, OtherMode>& other)
+    {
+      enum {
+        OtherPart = UpLo == Upper ? StrictlyLower : StrictlyUpper
+      };
+      m_matrix.const_cast_derived().template triangularView<UpLo>() = other.toDenseMatrix();
+      m_matrix.const_cast_derived().template triangularView<OtherPart>() = other.toDenseMatrix().adjoint();
+      return *this;
+    }
+    #endif

  protected:
    MatrixTypeNested m_matrix;
@@ -195,56 +201,90 @@ template<typename _MatrixType, unsigned int UpLo> class SelfAdjointView

 namespace internal {

-// TODO currently a selfadjoint expression has the form SelfAdjointView<.,.>
-//      in the future selfadjoint-ness should be defined by the expression traits
-//      such that Transpose<SelfAdjointView<.,.> > is valid. (currently TriangularBase::transpose() is overloaded to make it work)
-template<typename MatrixType, unsigned int Mode>
-struct evaluator_traits<SelfAdjointView<MatrixType,Mode> >
+template<typename Derived1, typename Derived2, int UnrollCount, bool ClearOpposite>
+struct triangular_assignment_selector<Derived1, Derived2, (SelfAdjoint|Upper), UnrollCount, ClearOpposite>
 {
-  typedef typename storage_kind_to_evaluator_kind<typename MatrixType::StorageKind>::Kind Kind;
-  typedef SelfAdjointShape Shape;
-  
-  static const int AssumeAliasing = 0;
+  enum {
+    col = (UnrollCount-1) / Derived1::RowsAtCompileTime,
+    row = (UnrollCount-1) % Derived1::RowsAtCompileTime
+  };
+
+  static inline void run(Derived1 &dst, const Derived2 &src)
+  {
+    triangular_assignment_selector<Derived1, Derived2, (SelfAdjoint|Upper), UnrollCount-1, ClearOpposite>::run(dst, src);
+
+    if(row == col)
+      dst.coeffRef(row, col) = numext::real(src.coeff(row, col));
+    else if(row < col)
+      dst.coeffRef(col, row) = numext::conj(dst.coeffRef(row, col) = src.coeff(row, col));
+  }
 };

-template<int UpLo, int SetOpposite, typename DstEvaluatorTypeT, typename SrcEvaluatorTypeT, typename Functor, int Version>
-class triangular_dense_assignment_kernel<UpLo,SelfAdjoint,SetOpposite,DstEvaluatorTypeT,SrcEvaluatorTypeT,Functor,Version>
-  : public generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT, Functor, Version>
+template<typename Derived1, typename Derived2, bool ClearOpposite>
+struct triangular_assignment_selector<Derived1, Derived2, SelfAdjoint|Upper, 0, ClearOpposite>
 {
-protected:
-  typedef generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT, Functor, Version> Base;
-  typedef typename Base::DstXprType DstXprType;
-  typedef typename Base::SrcXprType SrcXprType;
-  using Base::m_dst;
-  using Base::m_src;
-  using Base::m_functor;
-public:
-  
-  typedef typename Base::DstEvaluatorType DstEvaluatorType;
-  typedef typename Base::SrcEvaluatorType SrcEvaluatorType;
-  typedef typename Base::Scalar Scalar;
-  typedef typename Base::AssignmentTraits AssignmentTraits;
-  
-  
-  EIGEN_DEVICE_FUNC triangular_dense_assignment_kernel(DstEvaluatorType &dst, const SrcEvaluatorType &src, const Functor &func, DstXprType& dstExpr)
-    : Base(dst, src, func, dstExpr)
-  {}
-  
-  EIGEN_DEVICE_FUNC void assignCoeff(Index row, Index col)
+  static inline void run(Derived1 &, const Derived2 &) {}
+};
+
+template<typename Derived1, typename Derived2, int UnrollCount, bool ClearOpposite>
+struct triangular_assignment_selector<Derived1, Derived2, (SelfAdjoint|Lower), UnrollCount, ClearOpposite>
+{
+  enum {
+    col = (UnrollCount-1) / Derived1::RowsAtCompileTime,
+    row = (UnrollCount-1) % Derived1::RowsAtCompileTime
+  };
+
+  static inline void run(Derived1 &dst, const Derived2 &src)
  {
-    eigen_internal_assert(row!=col);
-    Scalar tmp = m_src.coeff(row,col);
-    m_functor.assignCoeff(m_dst.coeffRef(row,col), tmp);
-    m_functor.assignCoeff(m_dst.coeffRef(col,row), numext::conj(tmp));
+    triangular_assignment_selector<Derived1, Derived2, (SelfAdjoint|Lower), UnrollCount-1, ClearOpposite>::run(dst, src);
+
+    if(row == col)
+      dst.coeffRef(row, col) = numext::real(src.coeff(row, col));
+    else if(row > col)
+      dst.coeffRef(col, row) = numext::conj(dst.coeffRef(row, col) = src.coeff(row, col));
  }
-  
-  EIGEN_DEVICE_FUNC void assignDiagonalCoeff(Index id)
+};
+
+template<typename Derived1, typename Derived2, bool ClearOpposite>
+struct triangular_assignment_selector<Derived1, Derived2, SelfAdjoint|Lower, 0, ClearOpposite>
+{
+  static inline void run(Derived1 &, const Derived2 &) {}
+};
+
+template<typename Derived1, typename Derived2, bool ClearOpposite>
+struct triangular_assignment_selector<Derived1, Derived2, SelfAdjoint|Upper, Dynamic, ClearOpposite>
+{
+  typedef typename Derived1::Index Index;
+  static inline void run(Derived1 &dst, const Derived2 &src)
  {
-    Base::assignCoeff(id,id);
+    for(Index j = 0; j < dst.cols(); ++j)
+    {
+      for(Index i = 0; i < j; ++i)
+      {
+        dst.copyCoeff(i, j, src);
+        dst.coeffRef(j,i) = numext::conj(dst.coeff(i,j));
+      }
+      dst.copyCoeff(j, j, src);
+    }
+  }
+};
+
+template<typename Derived1, typename Derived2, bool ClearOpposite>
+struct triangular_assignment_selector<Derived1, Derived2, SelfAdjoint|Lower, Dynamic, ClearOpposite>
+{
+  static inline void run(Derived1 &dst, const Derived2 &src)
+  {
+  typedef typename Derived1::Index Index;
+    for(Index i = 0; i < dst.rows(); ++i)
+    {
+      for(Index j = 0; j < i; ++j)
+      {
+        dst.copyCoeff(i, j, src);
+        dst.coeffRef(j,i) = numext::conj(dst.coeff(i,j));
+      }
+      dst.copyCoeff(i, i, src);
+    }
  }
-  
-  EIGEN_DEVICE_FUNC void assignOppositeCoeff(Index, Index)
-  { eigen_internal_assert(false && "should never be called"); }
 };

 } // end namespace internal
@@ -258,7 +298,7 @@ template<unsigned int UpLo>
 typename MatrixBase<Derived>::template ConstSelfAdjointViewReturnType<UpLo>::Type
 MatrixBase<Derived>::selfadjointView() const
 {
-  return typename ConstSelfAdjointViewReturnType<UpLo>::Type(derived());
+  return derived();
 }

 template<typename Derived>
@@ -266,7 +306,7 @@ template<unsigned int UpLo>
 typename MatrixBase<Derived>::template SelfAdjointViewReturnType<UpLo>::Type
 MatrixBase<Derived>::selfadjointView()
 {
-  return typename SelfAdjointViewReturnType<UpLo>::Type(derived());
+  return derived();
 }

 } // end namespace Eigen
--- a/Eigen/src/Core/SelfCwiseBinaryOp.h
+++ b/Eigen/src/Core/SelfCwiseBinaryOp.h
@@ -12,27 +12,168 @@

 namespace Eigen { 

+/** \class SelfCwiseBinaryOp
+  * \ingroup Core_Module
+  *
+  * \internal
+  *
+  * \brief Internal helper class for optimizing operators like +=, -=
+  *
+  * This is a pseudo expression class re-implementing the copyCoeff/copyPacket
+  * method to directly performs a +=/-= operations in an optimal way. In particular,
+  * this allows to make sure that the input/output data are loaded only once using
+  * aligned packet loads.
+  *
+  * \sa class SwapWrapper for a similar trick.
+  */
+
+namespace internal {
+template<typename BinaryOp, typename Lhs, typename Rhs>
+struct traits<SelfCwiseBinaryOp<BinaryOp,Lhs,Rhs> >
+  : traits<CwiseBinaryOp<BinaryOp,Lhs,Rhs> >
+{
+  enum {
+    // Note that it is still a good idea to preserve the DirectAccessBit
+    // so that assign can correctly align the data.
+    Flags = traits<CwiseBinaryOp<BinaryOp,Lhs,Rhs> >::Flags | (Lhs::Flags&DirectAccessBit) | (Lhs::Flags&LvalueBit),
+    OuterStrideAtCompileTime = Lhs::OuterStrideAtCompileTime,
+    InnerStrideAtCompileTime = Lhs::InnerStrideAtCompileTime
+  };
+};
+}
+
+template<typename BinaryOp, typename Lhs, typename Rhs> class SelfCwiseBinaryOp
+  : public internal::dense_xpr_base< SelfCwiseBinaryOp<BinaryOp, Lhs, Rhs> >::type
+{
+  public:
+
+    typedef typename internal::dense_xpr_base<SelfCwiseBinaryOp>::type Base;
+    EIGEN_DENSE_PUBLIC_INTERFACE(SelfCwiseBinaryOp)
+
+    typedef typename internal::packet_traits<Scalar>::type Packet;
+
+    inline SelfCwiseBinaryOp(Lhs& xpr, const BinaryOp& func = BinaryOp()) : m_matrix(xpr), m_functor(func) {}
+
+    inline Index rows() const { return m_matrix.rows(); }
+    inline Index cols() const { return m_matrix.cols(); }
+    inline Index outerStride() const { return m_matrix.outerStride(); }
+    inline Index innerStride() const { return m_matrix.innerStride(); }
+    inline const Scalar* data() const { return m_matrix.data(); }
+
+    // note that this function is needed by assign to correctly align loads/stores
+    // TODO make Assign use .data()
+    inline Scalar& coeffRef(Index row, Index col)
+    {
+      EIGEN_STATIC_ASSERT_LVALUE(Lhs)
+      return m_matrix.const_cast_derived().coeffRef(row, col);
+    }
+    inline const Scalar& coeffRef(Index row, Index col) const
+    {
+      return m_matrix.coeffRef(row, col);
+    }
+
+    // note that this function is needed by assign to correctly align loads/stores
+    // TODO make Assign use .data()
+    inline Scalar& coeffRef(Index index)
+    {
+      EIGEN_STATIC_ASSERT_LVALUE(Lhs)
+      return m_matrix.const_cast_derived().coeffRef(index);
+    }
+    inline const Scalar& coeffRef(Index index) const
+    {
+      return m_matrix.const_cast_derived().coeffRef(index);
+    }
+
+    template<typename OtherDerived>
+    void copyCoeff(Index row, Index col, const DenseBase<OtherDerived>& other)
+    {
+      OtherDerived& _other = other.const_cast_derived();
+      eigen_internal_assert(row >= 0 && row < rows()
+                         && col >= 0 && col < cols());
+      Scalar& tmp = m_matrix.coeffRef(row,col);
+      tmp = m_functor(tmp, _other.coeff(row,col));
+    }
+
+    template<typename OtherDerived>
+    void copyCoeff(Index index, const DenseBase<OtherDerived>& other)
+    {
+      OtherDerived& _other = other.const_cast_derived();
+      eigen_internal_assert(index >= 0 && index < m_matrix.size());
+      Scalar& tmp = m_matrix.coeffRef(index);
+      tmp = m_functor(tmp, _other.coeff(index));
+    }
+
+    template<typename OtherDerived, int StoreMode, int LoadMode>
+    void copyPacket(Index row, Index col, const DenseBase<OtherDerived>& other)
+    {
+      OtherDerived& _other = other.const_cast_derived();
+      eigen_internal_assert(row >= 0 && row < rows()
+                        && col >= 0 && col < cols());
+      m_matrix.template writePacket<StoreMode>(row, col,
+        m_functor.packetOp(m_matrix.template packet<StoreMode>(row, col),_other.template packet<LoadMode>(row, col)) );
+    }
+
+    template<typename OtherDerived, int StoreMode, int LoadMode>
+    void copyPacket(Index index, const DenseBase<OtherDerived>& other)
+    {
+      OtherDerived& _other = other.const_cast_derived();
+      eigen_internal_assert(index >= 0 && index < m_matrix.size());
+      m_matrix.template writePacket<StoreMode>(index,
+        m_functor.packetOp(m_matrix.template packet<StoreMode>(index),_other.template packet<LoadMode>(index)) );
+    }
+
+    // reimplement lazyAssign to handle complex *= real
+    // see CwiseBinaryOp ctor for details
+    template<typename RhsDerived>
+    EIGEN_STRONG_INLINE SelfCwiseBinaryOp& lazyAssign(const DenseBase<RhsDerived>& rhs)
+    {
+      EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Lhs,RhsDerived)
+      EIGEN_CHECK_BINARY_COMPATIBILIY(BinaryOp,typename Lhs::Scalar,typename RhsDerived::Scalar);
+      
+    #ifdef EIGEN_DEBUG_ASSIGN
+      internal::assign_traits<SelfCwiseBinaryOp, RhsDerived>::debug();
+    #endif
+      eigen_assert(rows() == rhs.rows() && cols() == rhs.cols());
+      internal::assign_impl<SelfCwiseBinaryOp, RhsDerived>::run(*this,rhs.derived());
+    #ifndef EIGEN_NO_DEBUG
+      this->checkTransposeAliasing(rhs.derived());
+    #endif
+      return *this;
+    }
+    
+    // overloaded to honor evaluation of special matrices
+    // maybe another solution would be to not use SelfCwiseBinaryOp
+    // at first...
+    SelfCwiseBinaryOp& operator=(const Rhs& _rhs)
+    {
+      typename internal::nested<Rhs>::type rhs(_rhs);
+      return Base::operator=(rhs);
+    }
+
+    Lhs& expression() const 
+    { 
+      return m_matrix;
+    }
+
+    const BinaryOp& functor() const 
+    { 
+      return m_functor;
+    }
+
+  protected:
+    Lhs& m_matrix;
+    const BinaryOp& m_functor;
+
+  private:
+    SelfCwiseBinaryOp& operator=(const SelfCwiseBinaryOp&);
+};
+
 template<typename Derived>
 inline Derived& DenseBase<Derived>::operator*=(const Scalar& other)
 {
  typedef typename Derived::PlainObject PlainObject;
-  internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::mul_assign_op<Scalar>());
-  return derived();
-}
-
-template<typename Derived>
-inline Derived& ArrayBase<Derived>::operator+=(const Scalar& other)
-{
-  typedef typename Derived::PlainObject PlainObject;
-  internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::add_assign_op<Scalar>());
-  return derived();
-}
-
-template<typename Derived>
-inline Derived& ArrayBase<Derived>::operator-=(const Scalar& other)
-{
-  typedef typename Derived::PlainObject PlainObject;
-  internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::sub_assign_op<Scalar>());
+  SelfCwiseBinaryOp<internal::scalar_product_op<Scalar>, Derived, typename PlainObject::ConstantReturnType> tmp(derived());
+  tmp = PlainObject::Constant(rows(),cols(),other);
  return derived();
 }

@@ -40,7 +181,8 @@ template<typename Derived>
 inline Derived& DenseBase<Derived>::operator/=(const Scalar& other)
 {
  typedef typename Derived::PlainObject PlainObject;
-  internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::div_assign_op<Scalar>());
+  SelfCwiseBinaryOp<internal::scalar_quotient_op<Scalar>, Derived, typename PlainObject::ConstantReturnType> tmp(derived());
+  tmp = PlainObject::Constant(rows(),cols(), other);
  return derived();
 }

--- a/Eigen/src/Core/Solve.h
+++ b/Eigen/src/Core/Solve.h
@@ -1,153 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Gael Guennebaud <gael.guennebaud@inria.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_SOLVE_H
-#define EIGEN_SOLVE_H
-
-namespace Eigen {
-
-template<typename Decomposition, typename RhsType, typename StorageKind> class SolveImpl;
-  
-/** \class Solve
-  * \ingroup Core_Module
-  *
-  * \brief Pseudo expression representing a solving operation
-  *
-  * \tparam Decomposition the type of the matrix or decomposion object
-  * \tparam Rhstype the type of the right-hand side
-  *
-  * This class represents an expression of A.solve(B)
-  * and most of the time this is the only way it is used.
-  *
-  */
-namespace internal {
-
-// this solve_traits class permits to determine the evaluation type with respect to storage kind (Dense vs Sparse)
-template<typename Decomposition, typename RhsType,typename StorageKind> struct solve_traits;
-
-template<typename Decomposition, typename RhsType>
-struct solve_traits<Decomposition,RhsType,Dense>
-{
-  typedef typename Decomposition::MatrixType MatrixType;
-  typedef Matrix<typename RhsType::Scalar,
-                 MatrixType::ColsAtCompileTime,
-                 RhsType::ColsAtCompileTime,
-                 RhsType::PlainObject::Options,
-                 MatrixType::MaxColsAtCompileTime,
-                 RhsType::MaxColsAtCompileTime> PlainObject;  
-};
-
-template<typename Decomposition, typename RhsType>
-struct traits<Solve<Decomposition, RhsType> >
-  : traits<typename solve_traits<Decomposition,RhsType,typename internal::traits<RhsType>::StorageKind>::PlainObject>
-{
-  typedef typename solve_traits<Decomposition,RhsType,typename internal::traits<RhsType>::StorageKind>::PlainObject PlainObject;
-  typedef typename promote_index_type<typename Decomposition::StorageIndex, typename RhsType::StorageIndex>::type StorageIndex;
-  typedef traits<PlainObject> BaseTraits;
-  enum {
-    Flags = BaseTraits::Flags & RowMajorBit,
-    CoeffReadCost = Dynamic
-  };
-};
-
-}
-
-
-template<typename Decomposition, typename RhsType>
-class Solve : public SolveImpl<Decomposition,RhsType,typename internal::traits<RhsType>::StorageKind>
-{
-public:
-  typedef typename internal::traits<Solve>::PlainObject PlainObject;
-  typedef typename internal::traits<Solve>::StorageIndex StorageIndex;
-  
-  Solve(const Decomposition &dec, const RhsType &rhs)
-    : m_dec(dec), m_rhs(rhs)
-  {}
-  
-  EIGEN_DEVICE_FUNC Index rows() const { return m_dec.cols(); }
-  EIGEN_DEVICE_FUNC Index cols() const { return m_rhs.cols(); }
-
-  EIGEN_DEVICE_FUNC const Decomposition& dec() const { return m_dec; }
-  EIGEN_DEVICE_FUNC const RhsType&       rhs() const { return m_rhs; }
-
-protected:
-  const Decomposition &m_dec;
-  const RhsType       &m_rhs;
-};
-
-
-// Specialization of the Solve expression for dense results
-template<typename Decomposition, typename RhsType>
-class SolveImpl<Decomposition,RhsType,Dense>
-  : public MatrixBase<Solve<Decomposition,RhsType> >
-{
-  typedef Solve<Decomposition,RhsType> Derived;
-  
-public:
-  
-  typedef MatrixBase<Solve<Decomposition,RhsType> > Base;
-  EIGEN_DENSE_PUBLIC_INTERFACE(Derived)
-
-private:
-  
-  Scalar coeff(Index row, Index col) const;
-  Scalar coeff(Index i) const;
-};
-
-// Generic API dispatcher
-template<typename Decomposition, typename RhsType, typename StorageKind>
-class SolveImpl : public internal::generic_xpr_base<Solve<Decomposition,RhsType>, MatrixXpr, StorageKind>::type
-{
-  public:
-    typedef typename internal::generic_xpr_base<Solve<Decomposition,RhsType>, MatrixXpr, StorageKind>::type Base;
-};
-
-namespace internal {
-
-// Evaluator of Solve -> eval into a temporary
-template<typename Decomposition, typename RhsType>
-struct evaluator<Solve<Decomposition,RhsType> >
-  : public evaluator<typename Solve<Decomposition,RhsType>::PlainObject>::type
-{
-  typedef Solve<Decomposition,RhsType> SolveType;
-  typedef typename SolveType::PlainObject PlainObject;
-  typedef typename evaluator<PlainObject>::type Base;
-  
-  typedef evaluator type;
-  typedef evaluator nestedType;
-
-  EIGEN_DEVICE_FUNC explicit evaluator(const SolveType& solve)
-    : m_result(solve.rows(), solve.cols())
-  {
-    ::new (static_cast<Base*>(this)) Base(m_result);
-    solve.dec()._solve_impl(solve.rhs(), m_result);
-  }
-  
-protected:  
-  PlainObject m_result;
-};
-
-// Specialization for "dst = dec.solve(rhs)"
-// NOTE we need to specialize it for Dense2Dense to avoid ambiguous specialization error and a Sparse2Sparse specialization must exist somewhere
-template<typename DstXprType, typename DecType, typename RhsType, typename Scalar>
-struct Assignment<DstXprType, Solve<DecType,RhsType>, internal::assign_op<Scalar>, Dense2Dense, Scalar>
-{
-  typedef Solve<DecType,RhsType> SrcXprType;
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar> &)
-  {
-    // FIXME shall we resize dst here?
-    src.dec()._solve_impl(src.rhs(), dst);
-  }
-};
-
-} // end namepsace internal
-
-} // end namespace Eigen
-
-#endif // EIGEN_SOLVE_H
--- a/Eigen/src/Core/SolveTriangular.h
+++ b/Eigen/src/Core/SolveTriangular.h
@@ -68,7 +68,7 @@ struct triangular_solver_selector<Lhs,Rhs,Side,Mode,NoUnrolling,1>
    if(!useRhsDirectly)
      MappedRhs(actualRhs,rhs.size()) = rhs;

-    triangular_solve_vector<LhsScalar, RhsScalar, Index, Side, Mode, LhsProductTraits::NeedToConjugate,
+    triangular_solve_vector<LhsScalar, RhsScalar, typename Lhs::Index, Side, Mode, LhsProductTraits::NeedToConjugate,
                            (int(Lhs::Flags) & RowMajorBit) ? RowMajor : ColMajor>
      ::run(actualLhs.cols(), actualLhs.data(), actualLhs.outerStride(), actualRhs);

@@ -82,6 +82,7 @@ template<typename Lhs, typename Rhs, int Side, int Mode>
 struct triangular_solver_selector<Lhs,Rhs,Side,Mode,NoUnrolling,Dynamic>
 {
  typedef typename Rhs::Scalar Scalar;
+  typedef typename Rhs::Index Index;
  typedef blas_traits<Lhs> LhsProductTraits;
  typedef typename LhsProductTraits::DirectLinearAccessType ActualLhsType;

@@ -95,7 +96,7 @@ struct triangular_solver_selector<Lhs,Rhs,Side,Mode,NoUnrolling,Dynamic>
    typedef internal::gemm_blocking_space<(Rhs::Flags&RowMajorBit) ? RowMajor : ColMajor,Scalar,Scalar,
              Rhs::MaxRowsAtCompileTime, Rhs::MaxColsAtCompileTime, Lhs::MaxRowsAtCompileTime,4> BlockingType;

-    BlockingType blocking(rhs.rows(), rhs.cols(), size, 1, false);
+    BlockingType blocking(rhs.rows(), rhs.cols(), size);

    triangular_solve_matrix<Scalar,Index,Side,Mode,LhsProductTraits::NeedToConjugate,(int(Lhs::Flags) & RowMajorBit) ? RowMajor : ColMajor,
                               (Rhs::Flags&RowMajorBit) ? RowMajor : ColMajor>
@@ -115,17 +116,17 @@ template<typename Lhs, typename Rhs, int Mode, int Index, int Size>
 struct triangular_solver_unroller<Lhs,Rhs,Mode,Index,Size,false> {
  enum {
    IsLower = ((Mode&Lower)==Lower),
-    I = IsLower ? Index : Size - Index - 1,
-    S = IsLower ? 0     : I+1
+    RowIndex = IsLower ? Index : Size - Index - 1,
+    S = IsLower ? 0     : RowIndex+1
  };
  static void run(const Lhs& lhs, Rhs& rhs)
  {
    if (Index>0)
-      rhs.coeffRef(I) -= lhs.row(I).template segment<Index>(S).transpose()
+      rhs.coeffRef(RowIndex) -= lhs.row(RowIndex).template segment<Index>(S).transpose()
                         .cwiseProduct(rhs.template segment<Index>(S)).sum();

    if(!(Mode & UnitDiag))
-      rhs.coeffRef(I) /= lhs.coeff(I,I);
+      rhs.coeffRef(RowIndex) /= lhs.coeff(RowIndex,RowIndex);

    triangular_solver_unroller<Lhs,Rhs,Mode,Index+1,Size>::run(lhs,rhs);
  }
@@ -170,10 +171,10 @@ struct triangular_solver_selector<Lhs,Rhs,OnTheRight,Mode,CompleteUnrolling,1> {
  */
 template<typename MatrixType, unsigned int Mode>
 template<int Side, typename OtherDerived>
-void TriangularViewImpl<MatrixType,Mode,Dense>::solveInPlace(const MatrixBase<OtherDerived>& _other) const
+void TriangularView<MatrixType,Mode>::solveInPlace(const MatrixBase<OtherDerived>& _other) const
 {
  OtherDerived& other = _other.const_cast_derived();
-  eigen_assert( derived().cols() == derived().rows() && ((Side==OnTheLeft && derived().cols() == other.rows()) || (Side==OnTheRight && derived().cols() == other.cols())) );
+  eigen_assert( cols() == rows() && ((Side==OnTheLeft && cols() == other.rows()) || (Side==OnTheRight && cols() == other.cols())) );
  eigen_assert((!(Mode & ZeroDiag)) && bool(Mode & (Upper|Lower)));

  enum { copy = internal::traits<OtherDerived>::Flags & RowMajorBit  && OtherDerived::IsVectorAtCompileTime };
@@ -182,7 +183,7 @@ void TriangularViewImpl<MatrixType,Mode,Dense>::solveInPlace(const MatrixBase<Ot
  OtherCopy otherCopy(other);

  internal::triangular_solver_selector<MatrixType, typename internal::remove_reference<OtherCopy>::type,
-    Side, Mode>::run(derived().nestedExpression(), otherCopy);
+    Side, Mode>::run(nestedExpression(), otherCopy);

  if (copy)
    other = otherCopy;
@@ -212,9 +213,9 @@ void TriangularViewImpl<MatrixType,Mode,Dense>::solveInPlace(const MatrixBase<Ot
 template<typename Derived, unsigned int Mode>
 template<int Side, typename Other>
 const internal::triangular_solve_retval<Side,TriangularView<Derived,Mode>,Other>
-TriangularViewImpl<Derived,Mode,Dense>::solve(const MatrixBase<Other>& other) const
+TriangularView<Derived,Mode>::solve(const MatrixBase<Other>& other) const
 {
-  return internal::triangular_solve_retval<Side,TriangularViewType,Other>(derived(), other.derived());
+  return internal::triangular_solve_retval<Side,TriangularView,Other>(*this, other.derived());
 }

 namespace internal {
@@ -231,6 +232,7 @@ template<int Side, typename TriangularType, typename Rhs> struct triangular_solv
 {
  typedef typename remove_all<typename Rhs::Nested>::type RhsNestedCleaned;
  typedef ReturnByValue<triangular_solve_retval> Base;
+  typedef typename Base::Index Index;

  triangular_solve_retval(const TriangularType& tri, const Rhs& rhs)
    : m_triangularMatrix(tri), m_rhs(rhs)
@@ -241,7 +243,8 @@ template<int Side, typename TriangularType, typename Rhs> struct triangular_solv

  template<typename Dest> inline void evalTo(Dest& dst) const
  {
-    if(!(is_same<RhsNestedCleaned,Dest>::value && extract_data(dst) == extract_data(m_rhs)))
+    const typename Dest::Scalar *dst_data = internal::extract_data(dst);
+    if(!(is_same<RhsNestedCleaned,Dest>::value && dst_data!=0 && extract_data(dst) == extract_data(m_rhs)))
      dst = m_rhs;
    m_triangularMatrix.template solveInPlace<Side>(dst);
  }
--- a/Eigen/src/Core/StableNorm.h
+++ b/Eigen/src/Core/StableNorm.h
@@ -17,9 +17,10 @@ namespace internal {
 template<typename ExpressionType, typename Scalar>
 inline void stable_norm_kernel(const ExpressionType& bl, Scalar& ssq, Scalar& scale, Scalar& invScale)
 {
+  using std::max;
  Scalar maxCoeff = bl.cwiseAbs().maxCoeff();
  
-  if(maxCoeff>scale)
+  if (maxCoeff>scale)
  {
    ssq = ssq * numext::abs2(scale/maxCoeff);
    Scalar tmp = Scalar(1)/maxCoeff;
@@ -28,21 +29,12 @@ inline void stable_norm_kernel(const ExpressionType& bl, Scalar& ssq, Scalar& sc
      invScale = NumTraits<Scalar>::highest();
      scale = Scalar(1)/invScale;
    }
-    else if(maxCoeff>NumTraits<Scalar>::highest()) // we got a INF
-    {
-      invScale = Scalar(1);
-      scale = maxCoeff;
-    }
    else
    {
      scale = maxCoeff;
      invScale = tmp;
    }
  }
-  else if(maxCoeff!=maxCoeff) // we got a NaN
-  {
-    scale = maxCoeff;
-  }
  
  // TODO if the maxCoeff is much much smaller than the current scale,
  // then we can neglect this sub vector
@@ -55,12 +47,15 @@ inline typename NumTraits<typename traits<Derived>::Scalar>::Real
 blueNorm_impl(const EigenBase<Derived>& _vec)
 {
  typedef typename Derived::RealScalar RealScalar;  
+  typedef typename Derived::Index Index;
  using std::pow;
+  using std::min;
+  using std::max;
  using std::sqrt;
  using std::abs;
  const Derived& vec(_vec.derived());
  static bool initialized = false;
-  static RealScalar b1, b2, s1m, s2m, rbig, relerr;
+  static RealScalar b1, b2, s1m, s2m, overfl, rbig, relerr;
  if(!initialized)
  {
    int ibeta, it, iemin, iemax, iexp;
@@ -89,6 +84,7 @@ blueNorm_impl(const EigenBase<Derived>& _vec)
    iexp  = - ((iemax+it)/2);
    s2m   = RealScalar(pow(RealScalar(ibeta),RealScalar(iexp)));    // scaling factor for upper range

+    overfl  = rbig*s2m;                                             // overflow boundary for abig
    eps     = RealScalar(pow(double(ibeta), 1-it));
    relerr  = sqrt(eps);                                            // tolerance for neglecting asml
    initialized = true;
@@ -105,13 +101,13 @@ blueNorm_impl(const EigenBase<Derived>& _vec)
    else if(ax < b1) asml += numext::abs2(ax*s1m);
    else             amed += numext::abs2(ax);
  }
-  if(amed!=amed)
-    return amed;  // we got a NaN
  if(abig > RealScalar(0))
  {
    abig = sqrt(abig);
-    if(abig > rbig) // overflow, or *this contains INF values
-      return abig;  // return INF
+    if(abig > overfl)
+    {
+      return rbig;
+    }
    if(amed > RealScalar(0))
    {
      abig = abig/s2m;
@@ -132,8 +128,8 @@ blueNorm_impl(const EigenBase<Derived>& _vec)
  }
  else
    return sqrt(amed);
-  asml = numext::mini(abig, amed);
-  abig = numext::maxi(abig, amed);
+  asml = (min)(abig, amed);
+  abig = (max)(abig, amed);
  if(asml <= abig*relerr)
    return abig;
  else
@@ -156,6 +152,7 @@ template<typename Derived>
 inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real
 MatrixBase<Derived>::stableNorm() const
 {
+  using std::min;
  using std::sqrt;
  const Index blockSize = 4096;
  RealScalar scale(0);
@@ -169,7 +166,7 @@ MatrixBase<Derived>::stableNorm() const
  if (bi>0)
    internal::stable_norm_kernel(this->head(bi), ssq, scale, invScale);
  for (; bi<n; bi+=blockSize)
-    internal::stable_norm_kernel(this->segment(bi,numext::mini(blockSize, n - bi)).template forceAlignedAccessIf<Alignment>(), ssq, scale, invScale);
+    internal::stable_norm_kernel(this->segment(bi,(min)(blockSize, n - bi)).template forceAlignedAccessIf<Alignment>(), ssq, scale, invScale);
  return scale * sqrt(ssq);
 }

--- a/Eigen/src/Core/Stride.h
+++ b/Eigen/src/Core/Stride.h
@@ -44,14 +44,13 @@ template<int _OuterStrideAtCompileTime, int _InnerStrideAtCompileTime>
 class Stride
 {
  public:
-    typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
+    typedef DenseIndex Index;
    enum {
      InnerStrideAtCompileTime = _InnerStrideAtCompileTime,
      OuterStrideAtCompileTime = _OuterStrideAtCompileTime
    };

    /** Default constructor, for use when strides are fixed at compile time */
-    EIGEN_DEVICE_FUNC
    Stride()
      : m_outer(OuterStrideAtCompileTime), m_inner(InnerStrideAtCompileTime)
    {
@@ -59,7 +58,6 @@ class Stride
    }

    /** Constructor allowing to pass the strides at runtime */
-    EIGEN_DEVICE_FUNC
    Stride(Index outerStride, Index innerStride)
      : m_outer(outerStride), m_inner(innerStride)
    {
@@ -67,16 +65,13 @@ class Stride
    }

    /** Copy constructor */
-    EIGEN_DEVICE_FUNC
    Stride(const Stride& other)
      : m_outer(other.outer()), m_inner(other.inner())
    {}

    /** \returns the outer stride */
-    EIGEN_DEVICE_FUNC
    inline Index outer() const { return m_outer.value(); }
    /** \returns the inner stride */
-    EIGEN_DEVICE_FUNC
    inline Index inner() const { return m_inner.value(); }

  protected:
@@ -86,24 +81,26 @@ class Stride

 /** \brief Convenience specialization of Stride to specify only an inner stride
  * See class Map for some examples */
-template<int Value>
+template<int Value = Dynamic>
 class InnerStride : public Stride<0, Value>
 {
    typedef Stride<0, Value> Base;
  public:
-    EIGEN_DEVICE_FUNC InnerStride() : Base() {}
-    EIGEN_DEVICE_FUNC InnerStride(Index v) : Base(0, v) {} // FIXME making this explicit could break valid code
+    typedef DenseIndex Index;
+    InnerStride() : Base() {}
+    InnerStride(Index v) : Base(0, v) {}
 };

 /** \brief Convenience specialization of Stride to specify only an outer stride
  * See class Map for some examples */
-template<int Value>
+template<int Value = Dynamic>
 class OuterStride : public Stride<Value, 0>
 {
    typedef Stride<Value, 0> Base;
  public:
-    EIGEN_DEVICE_FUNC OuterStride() : Base() {}
-    EIGEN_DEVICE_FUNC OuterStride(Index v) : Base(v,0) {} // FIXME making this explicit could break valid code
+    typedef DenseIndex Index;
+    OuterStride() : Base() {}
+    OuterStride(Index v) : Base(v,0) {}
 };

 } // end namespace Eigen
--- a/Eigen/src/Core/Swap.h
+++ b/Eigen/src/Core/Swap.h
@@ -12,52 +12,114 @@

 namespace Eigen { 

+/** \class SwapWrapper
+  * \ingroup Core_Module
+  *
+  * \internal
+  *
+  * \brief Internal helper class for swapping two expressions
+  */
 namespace internal {
+template<typename ExpressionType>
+struct traits<SwapWrapper<ExpressionType> > : traits<ExpressionType> {};
+}

-// Overload default assignPacket behavior for swapping them
-template<typename DstEvaluatorTypeT, typename SrcEvaluatorTypeT>
-class generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT, swap_assign_op<typename DstEvaluatorTypeT::Scalar>, Specialized>
- : public generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT, swap_assign_op<typename DstEvaluatorTypeT::Scalar>, BuiltIn>
+template<typename ExpressionType> class SwapWrapper
+  : public internal::dense_xpr_base<SwapWrapper<ExpressionType> >::type
 {
-protected:
-  typedef generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT, swap_assign_op<typename DstEvaluatorTypeT::Scalar>, BuiltIn> Base;
-  typedef typename DstEvaluatorTypeT::PacketScalar PacketScalar;
-  using Base::m_dst;
-  using Base::m_src;
-  using Base::m_functor;
-  
-public:
-  typedef typename Base::Scalar Scalar;
-  typedef typename Base::DstXprType DstXprType;
-  typedef swap_assign_op<Scalar> Functor;
-  
-  EIGEN_DEVICE_FUNC generic_dense_assignment_kernel(DstEvaluatorTypeT &dst, const SrcEvaluatorTypeT &src, const Functor &func, DstXprType& dstExpr)
-    : Base(dst, src, func, dstExpr)
-  {}
-  
-  template<int StoreMode, int LoadMode>
-  void assignPacket(Index row, Index col)
-  {
-    m_functor.template swapPacket<StoreMode,LoadMode,PacketScalar>(&m_dst.coeffRef(row,col), &const_cast<SrcEvaluatorTypeT&>(m_src).coeffRef(row,col));
-  }
-  
-  template<int StoreMode, int LoadMode>
-  void assignPacket(Index index)
-  {
-    m_functor.template swapPacket<StoreMode,LoadMode,PacketScalar>(&m_dst.coeffRef(index), &const_cast<SrcEvaluatorTypeT&>(m_src).coeffRef(index));
-  }
-  
-  // TODO find a simple way not to have to copy/paste this function from generic_dense_assignment_kernel, by simple I mean no CRTP (Gael)
-  template<int StoreMode, int LoadMode>
-  void assignPacketByOuterInner(Index outer, Index inner)
-  {
-    Index row = Base::rowIndexByOuterInner(outer, inner); 
-    Index col = Base::colIndexByOuterInner(outer, inner);
-    assignPacket<StoreMode,LoadMode>(row, col);
-  }
-};
+  public:

-} // namespace internal
+    typedef typename internal::dense_xpr_base<SwapWrapper>::type Base;
+    EIGEN_DENSE_PUBLIC_INTERFACE(SwapWrapper)
+    typedef typename internal::packet_traits<Scalar>::type Packet;
+
+    inline SwapWrapper(ExpressionType& xpr) : m_expression(xpr) {}
+
+    inline Index rows() const { return m_expression.rows(); }
+    inline Index cols() const { return m_expression.cols(); }
+    inline Index outerStride() const { return m_expression.outerStride(); }
+    inline Index innerStride() const { return m_expression.innerStride(); }
+    
+    typedef typename internal::conditional<
+                       internal::is_lvalue<ExpressionType>::value,
+                       Scalar,
+                       const Scalar
+                     >::type ScalarWithConstIfNotLvalue;
+                     
+    inline ScalarWithConstIfNotLvalue* data() { return m_expression.data(); }
+    inline const Scalar* data() const { return m_expression.data(); }
+
+    inline Scalar& coeffRef(Index rowId, Index colId)
+    {
+      return m_expression.const_cast_derived().coeffRef(rowId, colId);
+    }
+
+    inline Scalar& coeffRef(Index index)
+    {
+      return m_expression.const_cast_derived().coeffRef(index);
+    }
+
+    inline Scalar& coeffRef(Index rowId, Index colId) const
+    {
+      return m_expression.coeffRef(rowId, colId);
+    }
+
+    inline Scalar& coeffRef(Index index) const
+    {
+      return m_expression.coeffRef(index);
+    }
+
+    template<typename OtherDerived>
+    void copyCoeff(Index rowId, Index colId, const DenseBase<OtherDerived>& other)
+    {
+      OtherDerived& _other = other.const_cast_derived();
+      eigen_internal_assert(rowId >= 0 && rowId < rows()
+                         && colId >= 0 && colId < cols());
+      Scalar tmp = m_expression.coeff(rowId, colId);
+      m_expression.coeffRef(rowId, colId) = _other.coeff(rowId, colId);
+      _other.coeffRef(rowId, colId) = tmp;
+    }
+
+    template<typename OtherDerived>
+    void copyCoeff(Index index, const DenseBase<OtherDerived>& other)
+    {
+      OtherDerived& _other = other.const_cast_derived();
+      eigen_internal_assert(index >= 0 && index < m_expression.size());
+      Scalar tmp = m_expression.coeff(index);
+      m_expression.coeffRef(index) = _other.coeff(index);
+      _other.coeffRef(index) = tmp;
+    }
+
+    template<typename OtherDerived, int StoreMode, int LoadMode>
+    void copyPacket(Index rowId, Index colId, const DenseBase<OtherDerived>& other)
+    {
+      OtherDerived& _other = other.const_cast_derived();
+      eigen_internal_assert(rowId >= 0 && rowId < rows()
+                        && colId >= 0 && colId < cols());
+      Packet tmp = m_expression.template packet<StoreMode>(rowId, colId);
+      m_expression.template writePacket<StoreMode>(rowId, colId,
+        _other.template packet<LoadMode>(rowId, colId)
+      );
+      _other.template writePacket<LoadMode>(rowId, colId, tmp);
+    }
+
+    template<typename OtherDerived, int StoreMode, int LoadMode>
+    void copyPacket(Index index, const DenseBase<OtherDerived>& other)
+    {
+      OtherDerived& _other = other.const_cast_derived();
+      eigen_internal_assert(index >= 0 && index < m_expression.size());
+      Packet tmp = m_expression.template packet<StoreMode>(index);
+      m_expression.template writePacket<StoreMode>(index,
+        _other.template packet<LoadMode>(index)
+      );
+      _other.template writePacket<LoadMode>(index, tmp);
+    }
+
+    ExpressionType& expression() const { return m_expression; }
+
+  protected:
+    ExpressionType& m_expression;
+};

 } // end namespace Eigen

--- a/Eigen/src/Core/Transpose.h
+++ b/Eigen/src/Core/Transpose.h
@@ -2,7 +2,7 @@
 // for linear algebra.
 //
 // Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
-// Copyright (C) 2009-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2009-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -29,10 +29,13 @@ namespace Eigen {

 namespace internal {
 template<typename MatrixType>
-struct traits<Transpose<MatrixType> > : public traits<MatrixType>
+struct traits<Transpose<MatrixType> > : traits<MatrixType>
 {
+  typedef typename MatrixType::Scalar Scalar;
  typedef typename nested<MatrixType>::type MatrixTypeNested;
  typedef typename remove_reference<MatrixTypeNested>::type MatrixTypeNestedPlain;
+  typedef typename traits<MatrixType>::StorageKind StorageKind;
+  typedef typename traits<MatrixType>::XprKind XprKind;
  enum {
    RowsAtCompileTime = MatrixType::ColsAtCompileTime,
    ColsAtCompileTime = MatrixType::RowsAtCompileTime,
@@ -42,6 +45,7 @@ struct traits<Transpose<MatrixType> > : public traits<MatrixType>
    Flags0 = MatrixTypeNestedPlain::Flags & ~(LvalueBit | NestByRefBit),
    Flags1 = Flags0 | FlagsLvalueBit,
    Flags = Flags1 ^ RowMajorBit,
+    CoeffReadCost = MatrixTypeNestedPlain::CoeffReadCost,
    InnerStrideAtCompileTime = inner_stride_at_compile_time<MatrixType>::ret,
    OuterStrideAtCompileTime = outer_stride_at_compile_time<MatrixType>::ret
  };
@@ -57,23 +61,19 @@ template<typename MatrixType> class Transpose

    typedef typename TransposeImpl<MatrixType,typename internal::traits<MatrixType>::StorageKind>::Base Base;
    EIGEN_GENERIC_PUBLIC_INTERFACE(Transpose)
-    typedef typename internal::remove_all<MatrixType>::type NestedExpression;

-    EIGEN_DEVICE_FUNC
-    explicit inline Transpose(MatrixType& a_matrix) : m_matrix(a_matrix) {}
+    inline Transpose(MatrixType& a_matrix) : m_matrix(a_matrix) {}

    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Transpose)

-    EIGEN_DEVICE_FUNC inline Index rows() const { return m_matrix.cols(); }
-    EIGEN_DEVICE_FUNC inline Index cols() const { return m_matrix.rows(); }
+    inline Index rows() const { return m_matrix.cols(); }
+    inline Index cols() const { return m_matrix.rows(); }

    /** \returns the nested expression */
-    EIGEN_DEVICE_FUNC
    const typename internal::remove_all<typename MatrixType::Nested>::type&
    nestedExpression() const { return m_matrix; }

    /** \returns the nested expression */
-    EIGEN_DEVICE_FUNC
    typename internal::remove_all<typename MatrixType::Nested>::type&
    nestedExpression() { return m_matrix.const_cast_derived(); }

@@ -97,27 +97,17 @@ struct TransposeImpl_base<MatrixType, false>

 } // end namespace internal

-// Generic API dispatcher
-template<typename XprType, typename StorageKind>
-class TransposeImpl
-  : public internal::generic_xpr_base<Transpose<XprType> >::type
-{
-public:
-  typedef typename internal::generic_xpr_base<Transpose<XprType> >::type Base;
-};
-
 template<typename MatrixType> class TransposeImpl<MatrixType,Dense>
  : public internal::TransposeImpl_base<MatrixType>::type
 {
  public:

    typedef typename internal::TransposeImpl_base<MatrixType>::type Base;
-    using Base::coeffRef;
    EIGEN_DENSE_PUBLIC_INTERFACE(Transpose<MatrixType>)
    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(TransposeImpl)

-    EIGEN_DEVICE_FUNC inline Index innerStride() const { return derived().nestedExpression().innerStride(); }
-    EIGEN_DEVICE_FUNC inline Index outerStride() const { return derived().nestedExpression().outerStride(); }
+    inline Index innerStride() const { return derived().nestedExpression().innerStride(); }
+    inline Index outerStride() const { return derived().nestedExpression().outerStride(); }

    typedef typename internal::conditional<
                       internal::is_lvalue<MatrixType>::value,
@@ -125,21 +115,64 @@ template<typename MatrixType> class TransposeImpl<MatrixType,Dense>
                       const Scalar
                     >::type ScalarWithConstIfNotLvalue;

-    EIGEN_DEVICE_FUNC inline ScalarWithConstIfNotLvalue* data() { return derived().nestedExpression().data(); }
-    EIGEN_DEVICE_FUNC inline const Scalar* data() const { return derived().nestedExpression().data(); }
+    inline ScalarWithConstIfNotLvalue* data() { return derived().nestedExpression().data(); }
+    inline const Scalar* data() const { return derived().nestedExpression().data(); }
+
+    inline ScalarWithConstIfNotLvalue& coeffRef(Index rowId, Index colId)
+    {
+      EIGEN_STATIC_ASSERT_LVALUE(MatrixType)
+      return derived().nestedExpression().const_cast_derived().coeffRef(colId, rowId);
+    }
+
+    inline ScalarWithConstIfNotLvalue& coeffRef(Index index)
+    {
+      EIGEN_STATIC_ASSERT_LVALUE(MatrixType)
+      return derived().nestedExpression().const_cast_derived().coeffRef(index);
+    }

-    // FIXME: shall we keep the const version of coeffRef?
-    EIGEN_DEVICE_FUNC
    inline const Scalar& coeffRef(Index rowId, Index colId) const
    {
      return derived().nestedExpression().coeffRef(colId, rowId);
    }

-    EIGEN_DEVICE_FUNC
    inline const Scalar& coeffRef(Index index) const
    {
      return derived().nestedExpression().coeffRef(index);
    }
+
+    inline CoeffReturnType coeff(Index rowId, Index colId) const
+    {
+      return derived().nestedExpression().coeff(colId, rowId);
+    }
+
+    inline CoeffReturnType coeff(Index index) const
+    {
+      return derived().nestedExpression().coeff(index);
+    }
+
+    template<int LoadMode>
+    inline const PacketScalar packet(Index rowId, Index colId) const
+    {
+      return derived().nestedExpression().template packet<LoadMode>(colId, rowId);
+    }
+
+    template<int LoadMode>
+    inline void writePacket(Index rowId, Index colId, const PacketScalar& x)
+    {
+      derived().nestedExpression().const_cast_derived().template writePacket<LoadMode>(colId, rowId, x);
+    }
+
+    template<int LoadMode>
+    inline const PacketScalar packet(Index index) const
+    {
+      return derived().nestedExpression().template packet<LoadMode>(index);
+    }
+
+    template<int LoadMode>
+    inline void writePacket(Index index, const PacketScalar& x)
+    {
+      derived().nestedExpression().const_cast_derived().template writePacket<LoadMode>(index, x);
+    }
 };

 /** \returns an expression of the transpose of *this.
@@ -165,7 +198,7 @@ template<typename Derived>
 inline Transpose<Derived>
 DenseBase<Derived>::transpose()
 {
-  return TransposeReturnType(derived());
+  return derived();
 }

 /** This is the const version of transpose().
@@ -203,7 +236,8 @@ template<typename Derived>
 inline const typename MatrixBase<Derived>::AdjointReturnType
 MatrixBase<Derived>::adjoint() const
 {
-  return AdjointReturnType(this->transpose());
+  return this->transpose(); // in the complex case, the .conjugate() is be implicit here
+                            // due to implicit conversion to return type
 }

 /***************************************************************************
@@ -213,38 +247,18 @@ MatrixBase<Derived>::adjoint() const
 namespace internal {

 template<typename MatrixType,
-  bool IsSquare = (MatrixType::RowsAtCompileTime == MatrixType::ColsAtCompileTime) && MatrixType::RowsAtCompileTime!=Dynamic,
-  bool MatchPacketSize =
-        (int(MatrixType::RowsAtCompileTime) == int(internal::packet_traits<typename MatrixType::Scalar>::size))
-    &&  (internal::evaluator<MatrixType>::Flags&PacketAccessBit) >
+  bool IsSquare = (MatrixType::RowsAtCompileTime == MatrixType::ColsAtCompileTime) && MatrixType::RowsAtCompileTime!=Dynamic>
 struct inplace_transpose_selector;

 template<typename MatrixType>
-struct inplace_transpose_selector<MatrixType,true,false> { // square matrix
+struct inplace_transpose_selector<MatrixType,true> { // square matrix
  static void run(MatrixType& m) {
    m.matrix().template triangularView<StrictlyUpper>().swap(m.matrix().transpose());
  }
 };

-// TODO: vectorized path is currently limited to LargestPacketSize x LargestPacketSize cases only.
 template<typename MatrixType>
-struct inplace_transpose_selector<MatrixType,true,true> { // PacketSize x PacketSize
-  static void run(MatrixType& m) {
-    typedef typename MatrixType::Scalar Scalar;
-    typedef typename internal::packet_traits<typename MatrixType::Scalar>::type Packet;
-    const Index PacketSize = internal::packet_traits<Scalar>::size;
-    const Index Alignment = internal::evaluator<MatrixType>::Flags&AlignedBit ? Aligned : Unaligned;
-    PacketBlock<Packet> A;
-    for (Index i=0; i<PacketSize; ++i)
-      A.packet[i] = m.template packetByOuterInner<Alignment>(i,0);
-    internal::ptranspose(A);
-    for (Index i=0; i<PacketSize; ++i)
-      m.template writePacket<Alignment>(m.rowIndexByOuterInner(i,0), m.colIndexByOuterInner(i,0), A.packet[i]);
-  }
-};
-
-template<typename MatrixType,bool MatchPacketSize>
-struct inplace_transpose_selector<MatrixType,false,MatchPacketSize> { // non square matrix
+struct inplace_transpose_selector<MatrixType,false> { // non square matrix
  static void run(MatrixType& m) {
    if (m.rows()==m.cols())
      m.matrix().template triangularView<StrictlyUpper>().swap(m.matrix().transpose());
@@ -317,11 +331,11 @@ inline void MatrixBase<Derived>::adjointInPlace()

 namespace internal {

-template<typename BinOp,typename NestedXpr,typename Rhs>
-struct blas_traits<SelfCwiseBinaryOp<BinOp,NestedXpr,Rhs> >
- : blas_traits<NestedXpr>
+template<typename BinOp,typename Xpr,typename Rhs>
+struct blas_traits<SelfCwiseBinaryOp<BinOp,Xpr,Rhs> >
+ : blas_traits<typename internal::remove_all<typename Xpr::Nested>::type>
 {
-  typedef SelfCwiseBinaryOp<BinOp,NestedXpr,Rhs> XprType;
+  typedef SelfCwiseBinaryOp<BinOp,Xpr,Rhs> XprType;
  static inline const XprType extract(const XprType& x) { return x; }
 };

@@ -378,7 +392,6 @@ struct checkTransposeAliasing_impl
                      ::run(extract_data(dst), other))
          && "aliasing detected during transposition, use transposeInPlace() "
             "or evaluate the rhs into a temporary using .eval()");
-
    }
 };

@@ -390,15 +403,15 @@ struct checkTransposeAliasing_impl<Derived, OtherDerived, false>
    }
 };

-template<typename Dst, typename Src>
-void check_for_aliasing(const Dst &dst, const Src &src)
-{
-  internal::checkTransposeAliasing_impl<Dst, Src>::run(dst, src);
-}
-
 } // end namespace internal

-#endif // EIGEN_NO_DEBUG
+template<typename Derived>
+template<typename OtherDerived>
+void DenseBase<Derived>::checkTransposeAliasing(const OtherDerived& other) const
+{
+    internal::checkTransposeAliasing_impl<Derived, OtherDerived>::run(derived(), other);
+}
+#endif

 } // end namespace Eigen

--- a/Eigen/src/Core/Transpositions.h
+++ b/Eigen/src/Core/Transpositions.h
@@ -53,8 +53,7 @@ class TranspositionsBase
  public:

    typedef typename Traits::IndicesType IndicesType;
-    typedef typename IndicesType::Scalar StorageIndex;
-    typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
+    typedef typename IndicesType::Scalar Index;

    Derived& derived() { return *static_cast<Derived*>(this); }
    const Derived& derived() const { return *static_cast<const Derived*>(this); }
@@ -82,17 +81,17 @@ class TranspositionsBase
    inline Index size() const { return indices().size(); }

    /** Direct access to the underlying index vector */
-    inline const StorageIndex& coeff(Index i) const { return indices().coeff(i); }
+    inline const Index& coeff(Index i) const { return indices().coeff(i); }
    /** Direct access to the underlying index vector */
-    inline StorageIndex& coeffRef(Index i) { return indices().coeffRef(i); }
+    inline Index& coeffRef(Index i) { return indices().coeffRef(i); }
    /** Direct access to the underlying index vector */
-    inline const StorageIndex& operator()(Index i) const { return indices()(i); }
+    inline const Index& operator()(Index i) const { return indices()(i); }
    /** Direct access to the underlying index vector */
-    inline StorageIndex& operator()(Index i) { return indices()(i); }
+    inline Index& operator()(Index i) { return indices()(i); }
    /** Direct access to the underlying index vector */
-    inline const StorageIndex& operator[](Index i) const { return indices()(i); }
+    inline const Index& operator[](Index i) const { return indices()(i); }
    /** Direct access to the underlying index vector */
-    inline StorageIndex& operator[](Index i) { return indices()(i); }
+    inline Index& operator[](Index i) { return indices()(i); }

    /** const version of indices(). */
    const IndicesType& indices() const { return derived().indices(); }
@@ -100,7 +99,7 @@ class TranspositionsBase
    IndicesType& indices() { return derived().indices(); }

    /** Resizes to given size. */
-    inline void resize(Index newSize)
+    inline void resize(int newSize)
    {
      indices().resize(newSize);
    }
@@ -108,7 +107,7 @@ class TranspositionsBase
    /** Sets \c *this to represents an identity transformation */
    void setIdentity()
    {
-      for(StorageIndex i = 0; i < indices().size(); ++i)
+      for(int i = 0; i < indices().size(); ++i)
        coeffRef(i) = i;
    }

@@ -145,23 +144,23 @@ class TranspositionsBase
 };

 namespace internal {
-template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename _StorageIndex>
-struct traits<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,_StorageIndex> >
+template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename IndexType>
+struct traits<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,IndexType> >
 {
-  typedef Matrix<_StorageIndex, SizeAtCompileTime, 1, 0, MaxSizeAtCompileTime, 1> IndicesType;
-  typedef _StorageIndex StorageIndex;
+  typedef IndexType Index;
+  typedef Matrix<Index, SizeAtCompileTime, 1, 0, MaxSizeAtCompileTime, 1> IndicesType;
 };
 }

-template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename _StorageIndex>
-class Transpositions : public TranspositionsBase<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,_StorageIndex> >
+template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename IndexType>
+class Transpositions : public TranspositionsBase<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,IndexType> >
 {
    typedef internal::traits<Transpositions> Traits;
  public:

    typedef TranspositionsBase<Transpositions> Base;
    typedef typename Traits::IndicesType IndicesType;
-    typedef typename IndicesType::Scalar StorageIndex;
+    typedef typename IndicesType::Scalar Index;

    inline Transpositions() {}

@@ -216,30 +215,30 @@ class Transpositions : public TranspositionsBase<Transpositions<SizeAtCompileTim


 namespace internal {
-template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename _StorageIndex, int _PacketAccess>
-struct traits<Map<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,_StorageIndex>,_PacketAccess> >
+template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename IndexType, int _PacketAccess>
+struct traits<Map<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,IndexType>,_PacketAccess> >
 {
-  typedef Map<const Matrix<_StorageIndex,SizeAtCompileTime,1,0,MaxSizeAtCompileTime,1>, _PacketAccess> IndicesType;
-  typedef _StorageIndex StorageIndex;
+  typedef IndexType Index;
+  typedef Map<const Matrix<Index,SizeAtCompileTime,1,0,MaxSizeAtCompileTime,1>, _PacketAccess> IndicesType;
 };
 }

-template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename _StorageIndex, int PacketAccess>
-class Map<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,_StorageIndex>,PacketAccess>
- : public TranspositionsBase<Map<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,_StorageIndex>,PacketAccess> >
+template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename IndexType, int PacketAccess>
+class Map<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,IndexType>,PacketAccess>
+ : public TranspositionsBase<Map<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,IndexType>,PacketAccess> >
 {
    typedef internal::traits<Map> Traits;
  public:

    typedef TranspositionsBase<Map> Base;
    typedef typename Traits::IndicesType IndicesType;
-    typedef typename IndicesType::Scalar StorageIndex;
+    typedef typename IndicesType::Scalar Index;

-    explicit inline Map(const StorageIndex* indicesPtr)
+    inline Map(const Index* indicesPtr)
      : m_indices(indicesPtr)
    {}

-    inline Map(const StorageIndex* indicesPtr, Index size)
+    inline Map(const Index* indicesPtr, Index size)
      : m_indices(indicesPtr,size)
    {}

@@ -276,7 +275,7 @@ namespace internal {
 template<typename _IndicesType>
 struct traits<TranspositionsWrapper<_IndicesType> >
 {
-  typedef typename _IndicesType::Scalar StorageIndex;
+  typedef typename _IndicesType::Scalar Index;
  typedef _IndicesType IndicesType;
 };
 }
@@ -290,9 +289,9 @@ class TranspositionsWrapper

    typedef TranspositionsBase<TranspositionsWrapper> Base;
    typedef typename Traits::IndicesType IndicesType;
-    typedef typename IndicesType::Scalar StorageIndex;
+    typedef typename IndicesType::Scalar Index;

-    explicit inline TranspositionsWrapper(IndicesType& a_indices)
+    inline TranspositionsWrapper(IndicesType& a_indices)
      : m_indices(a_indices)
    {}

@@ -363,25 +362,26 @@ struct transposition_matrix_product_retval
 : public ReturnByValue<transposition_matrix_product_retval<TranspositionType, MatrixType, Side, Transposed> >
 {
    typedef typename remove_all<typename MatrixType::Nested>::type MatrixTypeNestedCleaned;
-    typedef typename TranspositionType::StorageIndex StorageIndex;
+    typedef typename TranspositionType::Index Index;

    transposition_matrix_product_retval(const TranspositionType& tr, const MatrixType& matrix)
      : m_transpositions(tr), m_matrix(matrix)
    {}

-    inline Index rows() const { return m_matrix.rows(); }
-    inline Index cols() const { return m_matrix.cols(); }
+    inline int rows() const { return m_matrix.rows(); }
+    inline int cols() const { return m_matrix.cols(); }

    template<typename Dest> inline void evalTo(Dest& dst) const
    {
-      const Index size = m_transpositions.size();
-      StorageIndex j = 0;
+      const int size = m_transpositions.size();
+      Index j = 0;

-      if(!(is_same<MatrixTypeNestedCleaned,Dest>::value && extract_data(dst) == extract_data(m_matrix)))
+      const typename Dest::Scalar *dst_data = internal::extract_data(dst);
+      if(!(is_same<MatrixTypeNestedCleaned,Dest>::value && dst_data!=0 && dst_data == extract_data(m_matrix)))
        dst = m_matrix;

-      for(Index k=(Transposed?size-1:0) ; Transposed?k>=0:k<size ; Transposed?--k:++k)
-        if(Index(j=m_transpositions.coeff(k))!=k)
+      for(int k=(Transposed?size-1:0) ; Transposed?k>=0:k<size ; Transposed?--k:++k)
+        if((j=m_transpositions.coeff(k))!=k)
        {
          if(Side==OnTheLeft)
            dst.row(k).swap(dst.row(j));
@@ -406,7 +406,7 @@ class Transpose<TranspositionsBase<TranspositionsDerived> >
    typedef typename TranspositionType::IndicesType IndicesType;
  public:

-    explicit Transpose(const TranspositionType& t) : m_transpositions(t) {}
+    Transpose(const TranspositionType& t) : m_transpositions(t) {}

    inline int size() const { return m_transpositions.size(); }

--- a/Eigen/src/Core/TriangularMatrix.h
+++ b/Eigen/src/Core/TriangularMatrix.h
--- a/Eigen/src/Core/VectorBlock.h
+++ b/Eigen/src/Core/VectorBlock.h
@@ -72,7 +72,6 @@ template<typename VectorType, int Size> class VectorBlock

    /** Dynamic-size constructor
      */
-    EIGEN_DEVICE_FUNC
    inline VectorBlock(VectorType& vector, Index start, Index size)
      : Base(vector,
             IsColVector ? start : 0, IsColVector ? 0 : start,
@@ -83,7 +82,6 @@ template<typename VectorType, int Size> class VectorBlock

    /** Fixed-size constructor
      */
-    EIGEN_DEVICE_FUNC
    inline VectorBlock(VectorType& vector, Index start)
      : Base(vector, IsColVector ? start : 0, IsColVector ? 0 : start)
    {
--- a/Eigen/src/Core/VectorwiseOp.h
+++ b/Eigen/src/Core/VectorwiseOp.h
@@ -48,15 +48,25 @@ struct traits<PartialReduxExpr<MatrixType, MemberOp, Direction> >
    ColsAtCompileTime = Direction==Horizontal ? 1 : MatrixType::ColsAtCompileTime,
    MaxRowsAtCompileTime = Direction==Vertical   ? 1 : MatrixType::MaxRowsAtCompileTime,
    MaxColsAtCompileTime = Direction==Horizontal ? 1 : MatrixType::MaxColsAtCompileTime,
-    Flags = RowsAtCompileTime == 1 ? RowMajorBit : 0,
+    Flags0 = (unsigned int)_MatrixTypeNested::Flags & HereditaryBits,
+    Flags = (Flags0 & ~RowMajorBit) | (RowsAtCompileTime == 1 ? RowMajorBit : 0),
    TraversalSize = Direction==Vertical ? MatrixType::RowsAtCompileTime :  MatrixType::ColsAtCompileTime
  };
+  #if EIGEN_GNUC_AT_LEAST(3,4)
+  typedef typename MemberOp::template Cost<InputScalar,int(TraversalSize)> CostOpType;
+  #else
+  typedef typename MemberOp::template Cost<InputScalar,TraversalSize> CostOpType;
+  #endif
+  enum {
+    CoeffReadCost = TraversalSize==Dynamic ? Dynamic
+                  : TraversalSize * traits<_MatrixTypeNested>::CoeffReadCost + int(CostOpType::value)
+  };
 };
 }

 template< typename MatrixType, typename MemberOp, int Direction>
-class PartialReduxExpr : public internal::dense_xpr_base< PartialReduxExpr<MatrixType, MemberOp, Direction> >::type,
-                         internal::no_assignment_operator
+class PartialReduxExpr : internal::no_assignment_operator,
+  public internal::dense_xpr_base< PartialReduxExpr<MatrixType, MemberOp, Direction> >::type
 {
  public:

@@ -65,7 +75,7 @@ class PartialReduxExpr : public internal::dense_xpr_base< PartialReduxExpr<Matri
    typedef typename internal::traits<PartialReduxExpr>::MatrixTypeNested MatrixTypeNested;
    typedef typename internal::traits<PartialReduxExpr>::_MatrixTypeNested _MatrixTypeNested;

-    explicit PartialReduxExpr(const MatrixType& mat, const MemberOp& func = MemberOp())
+    PartialReduxExpr(const MatrixType& mat, const MemberOp& func = MemberOp())
      : m_matrix(mat), m_functor(func) {}

    Index rows() const { return (Direction==Vertical   ? 1 : m_matrix.rows()); }
@@ -128,7 +138,7 @@ struct member_redux {
                   >::type  result_type;
  template<typename _Scalar, int Size> struct Cost
  { enum { value = (Size-1) * functor_traits<BinaryOp>::Cost }; };
-  explicit member_redux(const BinaryOp func) : m_functor(func) {}
+  member_redux(const BinaryOp func) : m_functor(func) {}
  template<typename Derived>
  inline result_type operator()(const DenseBase<Derived>& mat) const
  { return mat.redux(m_functor); }
@@ -159,16 +169,16 @@ template<typename ExpressionType, int Direction> class VectorwiseOp

    typedef typename ExpressionType::Scalar Scalar;
    typedef typename ExpressionType::RealScalar RealScalar;
-    typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
+    typedef typename ExpressionType::Index Index;
    typedef typename internal::conditional<internal::must_nest_by_value<ExpressionType>::ret,
        ExpressionType, ExpressionType&>::type ExpressionTypeNested;
    typedef typename internal::remove_all<ExpressionTypeNested>::type ExpressionTypeNestedCleaned;

    template<template<typename _Scalar> class Functor,
-                      typename Scalar_=Scalar> struct ReturnType
+                      typename Scalar=typename internal::traits<ExpressionType>::Scalar> struct ReturnType
    {
      typedef PartialReduxExpr<ExpressionType,
-                               Functor<Scalar_>,
+                               Functor<Scalar>,
                               Direction
                              > Type;
    };
@@ -176,7 +186,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
    template<typename BinaryOp> struct ReduxReturnType
    {
      typedef PartialReduxExpr<ExpressionType,
-                               internal::member_redux<BinaryOp,Scalar>,
+                               internal::member_redux<BinaryOp,typename internal::traits<ExpressionType>::Scalar>,
                               Direction
                              > Type;
    };
@@ -249,7 +259,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp

  public:

-    explicit inline VectorwiseOp(ExpressionType& matrix) : m_matrix(matrix) {}
+    inline VectorwiseOp(ExpressionType& matrix) : m_matrix(matrix) {}

    /** \internal */
    inline const ExpressionType& _expression() const { return m_matrix; }
@@ -264,22 +274,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
    template<typename BinaryOp>
    const typename ReduxReturnType<BinaryOp>::Type
    redux(const BinaryOp& func = BinaryOp()) const
-    { return typename ReduxReturnType<BinaryOp>::Type(_expression(), internal::member_redux<BinaryOp,Scalar>(func)); }
-
-    typedef typename ReturnType<internal::member_minCoeff>::Type MinCoeffReturnType;
-    typedef typename ReturnType<internal::member_maxCoeff>::Type MaxCoeffReturnType;
-    typedef typename ReturnType<internal::member_squaredNorm,RealScalar>::Type SquaredNormReturnType;
-    typedef typename ReturnType<internal::member_norm,RealScalar>::Type NormReturnType;
-    typedef typename ReturnType<internal::member_blueNorm,RealScalar>::Type BlueNormReturnType;
-    typedef typename ReturnType<internal::member_stableNorm,RealScalar>::Type StableNormReturnType;
-    typedef typename ReturnType<internal::member_hypotNorm,RealScalar>::Type HypotNormReturnType;
-    typedef typename ReturnType<internal::member_sum>::Type SumReturnType;
-    typedef typename ReturnType<internal::member_mean>::Type MeanReturnType;
-    typedef typename ReturnType<internal::member_all>::Type AllReturnType;
-    typedef typename ReturnType<internal::member_any>::Type AnyReturnType;
-    typedef PartialReduxExpr<ExpressionType, internal::member_count<Index>, Direction> CountReturnType;
-    typedef typename ReturnType<internal::member_prod>::Type ProdReturnType;
-    typedef Reverse<ExpressionType, Direction> ReverseReturnType;
+    { return typename ReduxReturnType<BinaryOp>::Type(_expression(), func); }

    /** \returns a row (or column) vector expression of the smallest coefficient
      * of each column (or row) of the referenced expression.
@@ -290,8 +285,8 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
      * Output: \verbinclude PartialRedux_minCoeff.out
      *
      * \sa DenseBase::minCoeff() */
-    const MinCoeffReturnType minCoeff() const
-    { return MinCoeffReturnType(_expression()); }
+    const typename ReturnType<internal::member_minCoeff>::Type minCoeff() const
+    { return _expression(); }

    /** \returns a row (or column) vector expression of the largest coefficient
      * of each column (or row) of the referenced expression.
@@ -302,60 +297,55 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
      * Output: \verbinclude PartialRedux_maxCoeff.out
      *
      * \sa DenseBase::maxCoeff() */
-    const MaxCoeffReturnType maxCoeff() const
-    { return MaxCoeffReturnType(_expression()); }
+    const typename ReturnType<internal::member_maxCoeff>::Type maxCoeff() const
+    { return _expression(); }

    /** \returns a row (or column) vector expression of the squared norm
      * of each column (or row) of the referenced expression.
-      * This is a vector with real entries, even if the original matrix has complex entries.
      *
      * Example: \include PartialRedux_squaredNorm.cpp
      * Output: \verbinclude PartialRedux_squaredNorm.out
      *
      * \sa DenseBase::squaredNorm() */
-    const SquaredNormReturnType squaredNorm() const
-    { return SquaredNormReturnType(_expression()); }
+    const typename ReturnType<internal::member_squaredNorm,RealScalar>::Type squaredNorm() const
+    { return _expression(); }

    /** \returns a row (or column) vector expression of the norm
      * of each column (or row) of the referenced expression.
-      * This is a vector with real entries, even if the original matrix has complex entries.
      *
      * Example: \include PartialRedux_norm.cpp
      * Output: \verbinclude PartialRedux_norm.out
      *
      * \sa DenseBase::norm() */
-    const NormReturnType norm() const
-    { return NormReturnType(_expression()); }
+    const typename ReturnType<internal::member_norm,RealScalar>::Type norm() const
+    { return _expression(); }


    /** \returns a row (or column) vector expression of the norm
      * of each column (or row) of the referenced expression, using
-      * Blue's algorithm. 
-      * This is a vector with real entries, even if the original matrix has complex entries.
+      * blue's algorithm.
      *
      * \sa DenseBase::blueNorm() */
-    const BlueNormReturnType blueNorm() const
-    { return BlueNormReturnType(_expression()); }
+    const typename ReturnType<internal::member_blueNorm,RealScalar>::Type blueNorm() const
+    { return _expression(); }


    /** \returns a row (or column) vector expression of the norm
      * of each column (or row) of the referenced expression, avoiding
      * underflow and overflow.
-      * This is a vector with real entries, even if the original matrix has complex entries.
      *
      * \sa DenseBase::stableNorm() */
-    const StableNormReturnType stableNorm() const
-    { return StableNormReturnType(_expression()); }
+    const typename ReturnType<internal::member_stableNorm,RealScalar>::Type stableNorm() const
+    { return _expression(); }


    /** \returns a row (or column) vector expression of the norm
      * of each column (or row) of the referenced expression, avoiding
      * underflow and overflow using a concatenation of hypot() calls.
-      * This is a vector with real entries, even if the original matrix has complex entries.
      *
      * \sa DenseBase::hypotNorm() */
-    const HypotNormReturnType hypotNorm() const
-    { return HypotNormReturnType(_expression()); }
+    const typename ReturnType<internal::member_hypotNorm,RealScalar>::Type hypotNorm() const
+    { return _expression(); }

    /** \returns a row (or column) vector expression of the sum
      * of each column (or row) of the referenced expression.
@@ -364,43 +354,39 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
      * Output: \verbinclude PartialRedux_sum.out
      *
      * \sa DenseBase::sum() */
-    const SumReturnType sum() const
-    { return SumReturnType(_expression()); }
+    const typename ReturnType<internal::member_sum>::Type sum() const
+    { return _expression(); }

    /** \returns a row (or column) vector expression of the mean
    * of each column (or row) of the referenced expression.
    *
    * \sa DenseBase::mean() */
-    const MeanReturnType mean() const
-    { return MeanReturnType(_expression()); }
+    const typename ReturnType<internal::member_mean>::Type mean() const
+    { return _expression(); }

    /** \returns a row (or column) vector expression representing
      * whether \b all coefficients of each respective column (or row) are \c true.
-      * This expression can be assigned to a vector with entries of type \c bool.
      *
      * \sa DenseBase::all() */
-    const AllReturnType all() const
-    { return AllReturnType(_expression()); }
+    const typename ReturnType<internal::member_all>::Type all() const
+    { return _expression(); }

    /** \returns a row (or column) vector expression representing
      * whether \b at \b least one coefficient of each respective column (or row) is \c true.
-      * This expression can be assigned to a vector with entries of type \c bool.
      *
      * \sa DenseBase::any() */
-    const AnyReturnType any() const
-    { return Any(_expression()); }
+    const typename ReturnType<internal::member_any>::Type any() const
+    { return _expression(); }

    /** \returns a row (or column) vector expression representing
      * the number of \c true coefficients of each respective column (or row).
-      * This expression can be assigned to a vector whose entries have the same type as is used to
-      * index entries of the original matrix; for dense matrices, this is \c std::ptrdiff_t .
      *
      * Example: \include PartialRedux_count.cpp
      * Output: \verbinclude PartialRedux_count.out
      *
      * \sa DenseBase::count() */
-    const CountReturnType count() const
-    { return CountReturnType(_expression()); }
+    const PartialReduxExpr<ExpressionType, internal::member_count<Index>, Direction> count() const
+    { return _expression(); }

    /** \returns a row (or column) vector expression of the product
      * of each column (or row) of the referenced expression.
@@ -409,8 +395,8 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
      * Output: \verbinclude PartialRedux_prod.out
      *
      * \sa DenseBase::prod() */
-    const ProdReturnType prod() const
-    { return ProdReturnType(_expression()); }
+    const typename ReturnType<internal::member_prod>::Type prod() const
+    { return _expression(); }


    /** \returns a matrix expression
@@ -420,8 +406,8 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
      * Output: \verbinclude Vectorwise_reverse.out
      *
      * \sa DenseBase::reverse() */
-    const ReverseReturnType reverse() const
-    { return ReverseReturnType( _expression() ); }
+    const Reverse<ExpressionType, Direction> reverse() const
+    { return Reverse<ExpressionType, Direction>( _expression() ); }

    typedef Replicate<ExpressionType,Direction==Vertical?Dynamic:1,Direction==Horizontal?Dynamic:1> ReplicateReturnType;
    const ReplicateReturnType replicate(Index factor) const;
@@ -565,8 +551,9 @@ template<typename ExpressionType, int Direction> class VectorwiseOp

 /////////// Geometry module ///////////

-    typedef Homogeneous<ExpressionType,Direction> HomogeneousReturnType;
-    HomogeneousReturnType homogeneous() const;
+    #if EIGEN2_SUPPORT_STAGE > STAGE20_RESOLVE_API_CONFLICTS
+    Homogeneous<ExpressionType,Direction> homogeneous() const;
+    #endif

    typedef typename ExpressionType::PlainObject CrossReturnType;
    template<typename OtherDerived>
@@ -611,7 +598,7 @@ template<typename Derived>
 inline const typename DenseBase<Derived>::ConstColwiseReturnType
 DenseBase<Derived>::colwise() const
 {
-  return ConstColwiseReturnType(derived());
+  return derived();
 }

 /** \returns a writable VectorwiseOp wrapper of *this providing additional partial reduction operations
@@ -622,7 +609,7 @@ template<typename Derived>
 inline typename DenseBase<Derived>::ColwiseReturnType
 DenseBase<Derived>::colwise()
 {
-  return ColwiseReturnType(derived());
+  return derived();
 }

 /** \returns a VectorwiseOp wrapper of *this providing additional partial reduction operations
@@ -636,7 +623,7 @@ template<typename Derived>
 inline const typename DenseBase<Derived>::ConstRowwiseReturnType
 DenseBase<Derived>::rowwise() const
 {
-  return ConstRowwiseReturnType(derived());
+  return derived();
 }

 /** \returns a writable VectorwiseOp wrapper of *this providing additional partial reduction operations
@@ -647,7 +634,7 @@ template<typename Derived>
 inline typename DenseBase<Derived>::RowwiseReturnType
 DenseBase<Derived>::rowwise()
 {
-  return RowwiseReturnType(derived());
+  return derived();
 }

 } // end namespace Eigen
--- a/Eigen/src/Core/Visitor.h
+++ b/Eigen/src/Core/Visitor.h
@@ -41,6 +41,7 @@ struct visitor_impl<Visitor, Derived, 1>
 template<typename Visitor, typename Derived>
 struct visitor_impl<Visitor, Derived, Dynamic>
 {
+  typedef typename Derived::Index Index;
  static inline void run(const Derived& mat, Visitor& visitor)
  {
    visitor.init(mat.coeff(0,0), 0, 0);
@@ -52,32 +53,6 @@ struct visitor_impl<Visitor, Derived, Dynamic>
  }
 };

-// evaluator adaptor
-template<typename XprType>
-class visitor_evaluator
-{
-public:
-  explicit visitor_evaluator(const XprType &xpr) : m_evaluator(xpr), m_xpr(xpr) {}
-  
-  typedef typename XprType::Scalar Scalar;
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
-  
-  enum {
-    RowsAtCompileTime = XprType::RowsAtCompileTime,
-    CoeffReadCost = internal::evaluator<XprType>::CoeffReadCost
-  };
-  
-  Index rows() const { return m_xpr.rows(); }
-  Index cols() const { return m_xpr.cols(); }
-  Index size() const { return m_xpr.size(); }
-
-  CoeffReturnType coeff(Index row, Index col) const
-  { return m_evaluator.coeff(row, col); }
-  
-protected:
-  typename internal::evaluator<XprType>::nestedType m_evaluator;
-  const XprType &m_xpr;
-};
 } // end namespace internal

 /** Applies the visitor \a visitor to the whole coefficients of the matrix or vector.
@@ -101,17 +76,17 @@ template<typename Derived>
 template<typename Visitor>
 void DenseBase<Derived>::visit(Visitor& visitor) const
 {
-  typedef typename internal::visitor_evaluator<Derived> ThisEvaluator;
-  ThisEvaluator thisEval(derived());
-  
-  enum { unroll =   SizeAtCompileTime != Dynamic
-                &&  ThisEvaluator::CoeffReadCost != Dynamic
-                &&  (SizeAtCompileTime == 1 || internal::functor_traits<Visitor>::Cost != Dynamic)
-                &&  SizeAtCompileTime * ThisEvaluator::CoeffReadCost + (SizeAtCompileTime-1) * internal::functor_traits<Visitor>::Cost
-                <= EIGEN_UNROLLING_LIMIT };
-  return internal::visitor_impl<Visitor, ThisEvaluator,
+  typedef typename internal::remove_all<typename Derived::Nested>::type ThisNested;
+  typename Derived::Nested thisNested(derived());
+
+  enum { unroll = SizeAtCompileTime != Dynamic
+                   && CoeffReadCost != Dynamic
+                   && (SizeAtCompileTime == 1 || internal::functor_traits<Visitor>::Cost != Dynamic)
+                   && SizeAtCompileTime * CoeffReadCost + (SizeAtCompileTime-1) * internal::functor_traits<Visitor>::Cost
+                      <= EIGEN_UNROLLING_LIMIT };
+  return internal::visitor_impl<Visitor, ThisNested,
      unroll ? int(SizeAtCompileTime) : Dynamic
-    >::run(thisEval, visitor);
+    >::run(thisNested, visitor);
 }

 namespace internal {
@@ -122,6 +97,7 @@ namespace internal {
 template <typename Derived>
 struct coeff_visitor
 {
+  typedef typename Derived::Index Index;
  typedef typename Derived::Scalar Scalar;
  Index row, col;
  Scalar res;
@@ -141,6 +117,7 @@ struct coeff_visitor
 template <typename Derived>
 struct min_coeff_visitor : coeff_visitor<Derived>
 {
+  typedef typename Derived::Index Index;
  typedef typename Derived::Scalar Scalar;
  void operator() (const Scalar& value, Index i, Index j)
  {
@@ -168,6 +145,7 @@ struct functor_traits<min_coeff_visitor<Scalar> > {
 template <typename Derived>
 struct max_coeff_visitor : coeff_visitor<Derived>
 {
+  typedef typename Derived::Index Index;
  typedef typename Derived::Scalar Scalar;
  void operator() (const Scalar& value, Index i, Index j)
  {
@@ -219,7 +197,7 @@ DenseBase<Derived>::minCoeff(IndexType* index) const
  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
  internal::min_coeff_visitor<Derived> minVisitor;
  this->visit(minVisitor);
-  *index = IndexType((RowsAtCompileTime==1) ? minVisitor.col : minVisitor.row);
+  *index = (RowsAtCompileTime==1) ? minVisitor.col : minVisitor.row;
  return minVisitor.res;
 }

--- a/Eigen/src/Core/arch/AVX/CMakeLists.txt
+++ b/Eigen/src/Core/arch/AVX/CMakeLists.txt
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_Core_arch_AVX_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_Core_arch_AVX_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Core/arch/AVX COMPONENT Devel
-)
--- a/Eigen/src/Core/arch/AVX/Complex.h
+++ b/Eigen/src/Core/arch/AVX/Complex.h
@@ -1,463 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner (benoit.steiner.goog@gmail.com)
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_COMPLEX_AVX_H
-#define EIGEN_COMPLEX_AVX_H
-
-namespace Eigen {
-
-namespace internal {
-
-//---------- float ----------
-struct Packet4cf
-{
-  EIGEN_STRONG_INLINE Packet4cf() {}
-  EIGEN_STRONG_INLINE explicit Packet4cf(const __m256& a) : v(a) {}
-  __m256  v;
-};
-
-template<> struct packet_traits<std::complex<float> >  : default_packet_traits
-{
-  typedef Packet4cf type;
-  typedef Packet2cf half;
-  enum {
-    Vectorizable = 1,
-    AlignedOnScalar = 1,
-    size = 4,
-    HasHalfPacket = 1,
-
-    HasAdd    = 1,
-    HasSub    = 1,
-    HasMul    = 1,
-    HasDiv    = 1,
-    HasNegate = 1,
-    HasAbs    = 0,
-    HasAbs2   = 0,
-    HasMin    = 0,
-    HasMax    = 0,
-    HasSetLinear = 0
-  };
-};
-
-template<> struct unpacket_traits<Packet4cf> { typedef std::complex<float> type; enum {size=4}; typedef Packet2cf half; };
-
-template<> EIGEN_STRONG_INLINE Packet4cf padd<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_add_ps(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet4cf psub<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_sub_ps(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet4cf pnegate(const Packet4cf& a)
-{
-  return Packet4cf(pnegate(a.v));
-}
-template<> EIGEN_STRONG_INLINE Packet4cf pconj(const Packet4cf& a)
-{
-  const __m256 mask = _mm256_castsi256_ps(_mm256_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000));
-  return Packet4cf(_mm256_xor_ps(a.v,mask));
-}
-
-template<> EIGEN_STRONG_INLINE Packet4cf pmul<Packet4cf>(const Packet4cf& a, const Packet4cf& b)
-{
-  __m256 tmp1 = _mm256_mul_ps(_mm256_moveldup_ps(a.v), b.v);
-  __m256 tmp2 = _mm256_mul_ps(_mm256_movehdup_ps(a.v), _mm256_permute_ps(b.v, _MM_SHUFFLE(2,3,0,1)));
-  __m256 result = _mm256_addsub_ps(tmp1, tmp2);
-  return Packet4cf(result);
-}
-
-template<> EIGEN_STRONG_INLINE Packet4cf pand   <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_and_ps(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet4cf por    <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_or_ps(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet4cf pxor   <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_xor_ps(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet4cf pandnot<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_andnot_ps(a.v,b.v)); }
-
-template<> EIGEN_STRONG_INLINE Packet4cf pload <Packet4cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet4cf(pload<Packet8f>(&numext::real_ref(*from))); }
-template<> EIGEN_STRONG_INLINE Packet4cf ploadu<Packet4cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet4cf(ploadu<Packet8f>(&numext::real_ref(*from))); }
-
-
-template<> EIGEN_STRONG_INLINE Packet4cf pset1<Packet4cf>(const std::complex<float>& from)
-{
-  return Packet4cf(_mm256_castpd_ps(_mm256_broadcast_sd((const double*)(const void*)&from)));
-}
-
-template<> EIGEN_STRONG_INLINE Packet4cf ploaddup<Packet4cf>(const std::complex<float>* from)
-{
-  // FIXME The following might be optimized using _mm256_movedup_pd
-  Packet2cf a = ploaddup<Packet2cf>(from);
-  Packet2cf b = ploaddup<Packet2cf>(from+1);
-  return  Packet4cf(_mm256_insertf128_ps(_mm256_castps128_ps256(a.v), b.v, 1));
-}
-
-template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float>* to, const Packet4cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), from.v); }
-template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float>* to, const Packet4cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), from.v); }
-
-template<> EIGEN_DEVICE_FUNC inline Packet4cf pgather<std::complex<float>, Packet4cf>(const std::complex<float>* from, Index stride)
-{
-  return Packet4cf(_mm256_set_ps(std::imag(from[3*stride]), std::real(from[3*stride]),
-                                 std::imag(from[2*stride]), std::real(from[2*stride]),
-                                 std::imag(from[1*stride]), std::real(from[1*stride]),
-                                 std::imag(from[0*stride]), std::real(from[0*stride])));
-}
-
-template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet4cf>(std::complex<float>* to, const Packet4cf& from, Index stride)
-{
-  __m128 low = _mm256_extractf128_ps(from.v, 0);
-  to[stride*0] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(low, low, 0)),
-                                     _mm_cvtss_f32(_mm_shuffle_ps(low, low, 1)));
-  to[stride*1] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(low, low, 2)),
-                                     _mm_cvtss_f32(_mm_shuffle_ps(low, low, 3)));
-
-  __m128 high = _mm256_extractf128_ps(from.v, 1);
-  to[stride*2] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(high, high, 0)),
-                                     _mm_cvtss_f32(_mm_shuffle_ps(high, high, 1)));
-  to[stride*3] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(high, high, 2)),
-                                     _mm_cvtss_f32(_mm_shuffle_ps(high, high, 3)));
-
-}
-
-template<> EIGEN_STRONG_INLINE std::complex<float>  pfirst<Packet4cf>(const Packet4cf& a)
-{
-  return pfirst(Packet2cf(_mm256_castps256_ps128(a.v)));
-}
-
-template<> EIGEN_STRONG_INLINE Packet4cf preverse(const Packet4cf& a) {
-  __m128 low  = _mm256_extractf128_ps(a.v, 0);
-  __m128 high = _mm256_extractf128_ps(a.v, 1);
-  __m128d lowd  = _mm_castps_pd(low);
-  __m128d highd = _mm_castps_pd(high);
-  low  = _mm_castpd_ps(_mm_shuffle_pd(lowd,lowd,0x1));
-  high = _mm_castpd_ps(_mm_shuffle_pd(highd,highd,0x1));
-  __m256 result = _mm256_setzero_ps();
-  result = _mm256_insertf128_ps(result, low, 1);
-  result = _mm256_insertf128_ps(result, high, 0);
-  return Packet4cf(result);
-}
-
-template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet4cf>(const Packet4cf& a)
-{
-  return predux(padd(Packet2cf(_mm256_extractf128_ps(a.v,0)),
-                     Packet2cf(_mm256_extractf128_ps(a.v,1))));
-}
-
-template<> EIGEN_STRONG_INLINE Packet4cf preduxp<Packet4cf>(const Packet4cf* vecs)
-{
-  Packet8f t0 = _mm256_shuffle_ps(vecs[0].v, vecs[0].v, _MM_SHUFFLE(3, 1, 2 ,0));
-  Packet8f t1 = _mm256_shuffle_ps(vecs[1].v, vecs[1].v, _MM_SHUFFLE(3, 1, 2 ,0));
-  t0 = _mm256_hadd_ps(t0,t1);
-  Packet8f t2 = _mm256_shuffle_ps(vecs[2].v, vecs[2].v, _MM_SHUFFLE(3, 1, 2 ,0));
-  Packet8f t3 = _mm256_shuffle_ps(vecs[3].v, vecs[3].v, _MM_SHUFFLE(3, 1, 2 ,0));
-  t2 = _mm256_hadd_ps(t2,t3);
-  
-  t1 = _mm256_permute2f128_ps(t0,t2, 0 + (2<<4));
-  t3 = _mm256_permute2f128_ps(t0,t2, 1 + (3<<4));
-
-  return Packet4cf(_mm256_add_ps(t1,t3));
-}
-
-template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet4cf>(const Packet4cf& a)
-{
-  return predux_mul(pmul(Packet2cf(_mm256_extractf128_ps(a.v, 0)),
-                         Packet2cf(_mm256_extractf128_ps(a.v, 1))));
-}
-
-template<int Offset>
-struct palign_impl<Offset,Packet4cf>
-{
-  static EIGEN_STRONG_INLINE void run(Packet4cf& first, const Packet4cf& second)
-  {
-    if (Offset==0) return;
-    palign_impl<Offset*2,Packet8f>::run(first.v, second.v);
-  }
-};
-
-template<> struct conj_helper<Packet4cf, Packet4cf, false,true>
-{
-  EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const
-  { return padd(pmul(x,y),c); }
-
-  EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const
-  {
-    return internal::pmul(a, pconj(b));
-  }
-};
-
-template<> struct conj_helper<Packet4cf, Packet4cf, true,false>
-{
-  EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const
-  { return padd(pmul(x,y),c); }
-
-  EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const
-  {
-    return internal::pmul(pconj(a), b);
-  }
-};
-
-template<> struct conj_helper<Packet4cf, Packet4cf, true,true>
-{
-  EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const
-  { return padd(pmul(x,y),c); }
-
-  EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const
-  {
-    return pconj(internal::pmul(a, b));
-  }
-};
-
-template<> struct conj_helper<Packet8f, Packet4cf, false,false>
-{
-  EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet8f& x, const Packet4cf& y, const Packet4cf& c) const
-  { return padd(c, pmul(x,y)); }
-
-  EIGEN_STRONG_INLINE Packet4cf pmul(const Packet8f& x, const Packet4cf& y) const
-  { return Packet4cf(Eigen::internal::pmul(x, y.v)); }
-};
-
-template<> struct conj_helper<Packet4cf, Packet8f, false,false>
-{
-  EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet8f& y, const Packet4cf& c) const
-  { return padd(c, pmul(x,y)); }
-
-  EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& x, const Packet8f& y) const
-  { return Packet4cf(Eigen::internal::pmul(x.v, y)); }
-};
-
-template<> EIGEN_STRONG_INLINE Packet4cf pdiv<Packet4cf>(const Packet4cf& a, const Packet4cf& b)
-{
-  Packet4cf num = pmul(a, pconj(b));
-  __m256 tmp = _mm256_mul_ps(b.v, b.v);
-  __m256 tmp2    = _mm256_shuffle_ps(tmp,tmp,0xB1);
-  __m256 denom = _mm256_add_ps(tmp, tmp2);
-  return Packet4cf(_mm256_div_ps(num.v, denom));
-}
-
-template<> EIGEN_STRONG_INLINE Packet4cf pcplxflip<Packet4cf>(const Packet4cf& x)
-{
-  return Packet4cf(_mm256_shuffle_ps(x.v, x.v, _MM_SHUFFLE(2, 3, 0 ,1)));
-}
-
-//---------- double ----------
-struct Packet2cd
-{
-  EIGEN_STRONG_INLINE Packet2cd() {}
-  EIGEN_STRONG_INLINE explicit Packet2cd(const __m256d& a) : v(a) {}
-  __m256d  v;
-};
-
-template<> struct packet_traits<std::complex<double> >  : default_packet_traits
-{
-  typedef Packet2cd type;
-  typedef Packet1cd half;
-  enum {
-    Vectorizable = 1,
-    AlignedOnScalar = 0,
-    size = 2,
-    HasHalfPacket = 1,
-
-    HasAdd    = 1,
-    HasSub    = 1,
-    HasMul    = 1,
-    HasDiv    = 1,
-    HasNegate = 1,
-    HasAbs    = 0,
-    HasAbs2   = 0,
-    HasMin    = 0,
-    HasMax    = 0,
-    HasSetLinear = 0
-  };
-};
-
-template<> struct unpacket_traits<Packet2cd> { typedef std::complex<double> type; enum {size=2}; typedef Packet1cd half; };
-
-template<> EIGEN_STRONG_INLINE Packet2cd padd<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_add_pd(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cd psub<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_sub_pd(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cd pnegate(const Packet2cd& a) { return Packet2cd(pnegate(a.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cd pconj(const Packet2cd& a)
-{
-  const __m256d mask = _mm256_castsi256_pd(_mm256_set_epi32(0x80000000,0x0,0x0,0x0,0x80000000,0x0,0x0,0x0));
-  return Packet2cd(_mm256_xor_pd(a.v,mask));
-}
-
-template<> EIGEN_STRONG_INLINE Packet2cd pmul<Packet2cd>(const Packet2cd& a, const Packet2cd& b)
-{
-  __m256d tmp1 = _mm256_shuffle_pd(a.v,a.v,0x0);
-  __m256d even = _mm256_mul_pd(tmp1, b.v);
-  __m256d tmp2 = _mm256_shuffle_pd(a.v,a.v,0xF);
-  __m256d tmp3 = _mm256_shuffle_pd(b.v,b.v,0x5);
-  __m256d odd  = _mm256_mul_pd(tmp2, tmp3);
-  return Packet2cd(_mm256_addsub_pd(even, odd));
-}
-
-template<> EIGEN_STRONG_INLINE Packet2cd pand   <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_and_pd(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cd por    <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_or_pd(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cd pxor   <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_xor_pd(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cd pandnot<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_andnot_pd(a.v,b.v)); }
-
-template<> EIGEN_STRONG_INLINE Packet2cd pload <Packet2cd>(const std::complex<double>* from)
-{ EIGEN_DEBUG_ALIGNED_LOAD return Packet2cd(pload<Packet4d>((const double*)from)); }
-template<> EIGEN_STRONG_INLINE Packet2cd ploadu<Packet2cd>(const std::complex<double>* from)
-{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cd(ploadu<Packet4d>((const double*)from)); }
-
-template<> EIGEN_STRONG_INLINE Packet2cd pset1<Packet2cd>(const std::complex<double>& from)
-{
-  // in case casting to a __m128d* is really not safe, then we can still fallback to this version: (much slower though)
-//   return Packet2cd(_mm256_loadu2_m128d((const double*)&from,(const double*)&from));
-    return Packet2cd(_mm256_broadcast_pd((const __m128d*)(const void*)&from));
-}
-
-template<> EIGEN_STRONG_INLINE Packet2cd ploaddup<Packet2cd>(const std::complex<double>* from) { return pset1<Packet2cd>(*from); }
-
-template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> *   to, const Packet2cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); }
-template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> *   to, const Packet2cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); }
-
-template<> EIGEN_DEVICE_FUNC inline Packet2cd pgather<std::complex<double>, Packet2cd>(const std::complex<double>* from, Index stride)
-{
-  return Packet2cd(_mm256_set_pd(std::imag(from[1*stride]), std::real(from[1*stride]),
-				 std::imag(from[0*stride]), std::real(from[0*stride])));
-}
-
-template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet2cd>(std::complex<double>* to, const Packet2cd& from, Index stride)
-{
-  __m128d low = _mm256_extractf128_pd(from.v, 0);
-  to[stride*0] = std::complex<double>(_mm_cvtsd_f64(low), _mm_cvtsd_f64(_mm_shuffle_pd(low, low, 1)));
-  __m128d high = _mm256_extractf128_pd(from.v, 1);
-  to[stride*1] = std::complex<double>(_mm_cvtsd_f64(high), _mm_cvtsd_f64(_mm_shuffle_pd(high, high, 1)));
-}
-
-template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet2cd>(const Packet2cd& a)
-{
-  __m128d low = _mm256_extractf128_pd(a.v, 0);
-  EIGEN_ALIGN16 double res[2];
-  _mm_store_pd(res, low);
-  return std::complex<double>(res[0],res[1]);
-}
-
-template<> EIGEN_STRONG_INLINE Packet2cd preverse(const Packet2cd& a) {
-  __m256d result = _mm256_permute2f128_pd(a.v, a.v, 1);
-  return Packet2cd(result);
-}
-
-template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet2cd>(const Packet2cd& a)
-{
-  return predux(padd(Packet1cd(_mm256_extractf128_pd(a.v,0)),
-                     Packet1cd(_mm256_extractf128_pd(a.v,1))));
-}
-
-template<> EIGEN_STRONG_INLINE Packet2cd preduxp<Packet2cd>(const Packet2cd* vecs)
-{
-  Packet4d t0 = _mm256_permute2f128_pd(vecs[0].v,vecs[1].v, 0 + (2<<4));
-  Packet4d t1 = _mm256_permute2f128_pd(vecs[0].v,vecs[1].v, 1 + (3<<4));
-
-  return Packet2cd(_mm256_add_pd(t0,t1));
-}
-
-template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet2cd>(const Packet2cd& a)
-{
-  return predux(pmul(Packet1cd(_mm256_extractf128_pd(a.v,0)),
-                     Packet1cd(_mm256_extractf128_pd(a.v,1))));
-}
-
-template<int Offset>
-struct palign_impl<Offset,Packet2cd>
-{
-  static EIGEN_STRONG_INLINE void run(Packet2cd& first, const Packet2cd& second)
-  {
-    if (Offset==0) return;
-    palign_impl<Offset*2,Packet4d>::run(first.v, second.v);
-  }
-};
-
-template<> struct conj_helper<Packet2cd, Packet2cd, false,true>
-{
-  EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const
-  { return padd(pmul(x,y),c); }
-
-  EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const
-  {
-    return internal::pmul(a, pconj(b));
-  }
-};
-
-template<> struct conj_helper<Packet2cd, Packet2cd, true,false>
-{
-  EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const
-  { return padd(pmul(x,y),c); }
-
-  EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const
-  {
-    return internal::pmul(pconj(a), b);
-  }
-};
-
-template<> struct conj_helper<Packet2cd, Packet2cd, true,true>
-{
-  EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const
-  { return padd(pmul(x,y),c); }
-
-  EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const
-  {
-    return pconj(internal::pmul(a, b));
-  }
-};
-
-template<> struct conj_helper<Packet4d, Packet2cd, false,false>
-{
-  EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet4d& x, const Packet2cd& y, const Packet2cd& c) const
-  { return padd(c, pmul(x,y)); }
-
-  EIGEN_STRONG_INLINE Packet2cd pmul(const Packet4d& x, const Packet2cd& y) const
-  { return Packet2cd(Eigen::internal::pmul(x, y.v)); }
-};
-
-template<> struct conj_helper<Packet2cd, Packet4d, false,false>
-{
-  EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet4d& y, const Packet2cd& c) const
-  { return padd(c, pmul(x,y)); }
-
-  EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& x, const Packet4d& y) const
-  { return Packet2cd(Eigen::internal::pmul(x.v, y)); }
-};
-
-template<> EIGEN_STRONG_INLINE Packet2cd pdiv<Packet2cd>(const Packet2cd& a, const Packet2cd& b)
-{
-  Packet2cd num = pmul(a, pconj(b));
-  __m256d tmp = _mm256_mul_pd(b.v, b.v);
-  __m256d denom = _mm256_hadd_pd(tmp, tmp);
-  return Packet2cd(_mm256_div_pd(num.v, denom));
-}
-
-template<> EIGEN_STRONG_INLINE Packet2cd pcplxflip<Packet2cd>(const Packet2cd& x)
-{
-  return Packet2cd(_mm256_shuffle_pd(x.v, x.v, 0x5));
-}
-
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet4cf,4>& kernel) {
-  __m256d P0 = _mm256_castps_pd(kernel.packet[0].v);
-  __m256d P1 = _mm256_castps_pd(kernel.packet[1].v);
-  __m256d P2 = _mm256_castps_pd(kernel.packet[2].v);
-  __m256d P3 = _mm256_castps_pd(kernel.packet[3].v);
-
-  __m256d T0 = _mm256_shuffle_pd(P0, P1, 15);
-  __m256d T1 = _mm256_shuffle_pd(P0, P1, 0);
-  __m256d T2 = _mm256_shuffle_pd(P2, P3, 15);
-  __m256d T3 = _mm256_shuffle_pd(P2, P3, 0);
-
-  kernel.packet[1].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T0, T2, 32));
-  kernel.packet[3].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T0, T2, 49));
-  kernel.packet[0].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T1, T3, 32));
-  kernel.packet[2].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T1, T3, 49));
-}
-
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet2cd,2>& kernel) {
-  __m256d tmp = _mm256_permute2f128_pd(kernel.packet[0].v, kernel.packet[1].v, 0+(2<<4));
-  kernel.packet[1].v = _mm256_permute2f128_pd(kernel.packet[0].v, kernel.packet[1].v, 1+(3<<4));
- kernel.packet[0].v = tmp;
-}
-
-} // end namespace internal
-
-} // end namespace Eigen
-
-#endif // EIGEN_COMPLEX_AVX_H
--- a/Show More
+++ b/Show More